diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h index 175eb181c3..0e7ff481f3 100644 --- a/crmd/crmd_lrm.h +++ b/crmd/crmd_lrm.h @@ -1,162 +1,163 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include extern gboolean verify_stopped(enum crmd_fsa_state cur_state, int log_level); extern void lrm_clear_last_failure(const char *rsc_id, const char *node_name); void lrm_op_callback(lrmd_event_data_t * op); typedef struct resource_history_s { char *id; uint32_t last_callid; lrmd_rsc_info_t rsc; lrmd_event_data_t *last; lrmd_event_data_t *failed; GList *recurring_op_list; /* Resources must be stopped using the same * parameters they were started with. This hashtable * holds the parameters that should be used for the next stop * cmd on this resource. */ GHashTable *stop_params; } rsc_history_t; void history_free(gpointer data); /* TDOD - Replace this with lrmd_event_data_t */ struct recurring_op_s { int call_id; int interval; gboolean remove; gboolean cancelled; unsigned int start_time; char *rsc_id; char *op_type; char *op_key; char *user_data; GHashTable *params; }; typedef struct lrm_state_s { const char *node_name; /* reserved for lrm_state.c usage only */ void *conn; /* reserved for remote_lrmd_ra.c usage only */ void *remote_ra_data; GHashTable *resource_history; GHashTable *pending_ops; GHashTable *deletion_ops; GHashTable *rsc_info_cache; int num_lrm_register_fails; } lrm_state_t; struct pending_deletion_op_s { char *rsc; ha_msg_input_t *input; }; /*! * \brief Is this the local ipc connection to the lrmd */ gboolean lrm_state_is_local(lrm_state_t *lrm_state); /*! * \brief Clear all state information from a single state entry. * \note This does not close the lrmd connection */ void lrm_state_reset_tables(lrm_state_t * lrm_state); GList *lrm_state_get_list(void); /*! * \brief Initiate internal state tables */ gboolean lrm_state_init_local(void); /*! * \brief Destroy all state entries and internal state tables */ void lrm_state_destroy_all(void); /*! * \brief Create lrmd connection entry. */ lrm_state_t *lrm_state_create(const char *node_name); /*! * \brief Destroy lrmd connection keyed of node name */ void lrm_state_destroy(const char *node_name); /*! * \brief Find lrm_state data by node name */ lrm_state_t *lrm_state_find(const char *node_name); /*! * \brief Either find or create a new entry */ lrm_state_t *lrm_state_find_or_create(const char *node_name); /*! * The functions below are wrappers for the lrmd api calls the crmd * uses. These wrapper functions allow us to treat the crmd's remote * lrmd connection resources the same as regular resources. Internally * Regular resources go to the lrmd, and remote connection resources are * handled locally in the crmd. */ void lrm_state_disconnect(lrm_state_t * lrm_state); int lrm_state_ipc_connect(lrm_state_t * lrm_state); int lrm_state_remote_connect_async(lrm_state_t * lrm_state, const char *server, int port, int timeout); int lrm_state_is_connected(lrm_state_t * lrm_state); int lrm_state_poke_connection(lrm_state_t * lrm_state); int lrm_state_get_metadata(lrm_state_t * lrm_state, const char *class, const char *provider, const char *agent, char **output, enum lrmd_call_options options); int lrm_state_cancel(lrm_state_t * lrm_state, const char *rsc_id, const char *action, int interval); int lrm_state_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *action, const char *userdata, int interval, /* ms */ int timeout, /* ms */ int start_delay, /* ms */ lrmd_key_value_t * params); lrmd_rsc_info_t *lrm_state_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id, enum lrmd_call_options options); int lrm_state_register_rsc(lrm_state_t * lrm_state, const char *rsc_id, const char *class, const char *provider, const char *agent, enum lrmd_call_options options); int lrm_state_unregister_rsc(lrm_state_t * lrm_state, const char *rsc_id, enum lrmd_call_options options); /*! These functions are used to manage the remote lrmd connection resources */ void remote_lrm_op_callback(lrmd_event_data_t * op); gboolean is_remote_lrmd_ra(const char *agent, const char *provider, const char *id); lrmd_rsc_info_t *remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id); int remote_ra_cancel(lrm_state_t * lrm_state, const char *rsc_id, const char *action, int interval); int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *action, const char *userdata, int interval, /* ms */ int timeout, /* ms */ int start_delay, /* ms */ lrmd_key_value_t * params); void remote_ra_cleanup(lrm_state_t * lrm_state); +void remote_ra_fail(const char *node_name); gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending); diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c index ce54e507a8..b4aebd3e7b 100644 --- a/crmd/remote_lrmd_ra.c +++ b/crmd/remote_lrmd_ra.c @@ -1,1027 +1,1048 @@ /* * Copyright (C) 2013 David Vossel * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #define REMOTE_LRMD_RA "remote" /* The max start timeout before cmd retry */ #define MAX_START_TIMEOUT_MS 10000 typedef struct remote_ra_cmd_s { /*! the local node the cmd is issued from */ char *owner; /*! the remote node the cmd is executed on */ char *rsc_id; /*! the action to execute */ char *action; /*! some string the client wants us to give it back */ char *userdata; /*! start delay in ms */ int start_delay; /*! timer id used for start delay. */ int delay_id; /*! timeout in ms for cmd */ int timeout; int remaining_timeout; /*! recurring interval in ms */ int interval; /*! interval timer id */ int interval_id; int reported_success; int monitor_timeout_id; int takeover_timeout_id; /*! action parameters */ lrmd_key_value_t *params; /*! executed rc */ int rc; int op_status; int call_id; time_t start_time; gboolean cancel; } remote_ra_cmd_t; enum remote_migration_status { expect_takeover = 1, takeover_complete, }; typedef struct remote_ra_data_s { crm_trigger_t *work; remote_ra_cmd_t *cur_cmd; GList *cmds; GList *recurring_cmds; enum remote_migration_status migrate_status; gboolean active; } remote_ra_data_t; static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms); static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd); static GList *fail_all_monitor_cmds(GList * list); static void free_cmd(gpointer user_data) { remote_ra_cmd_t *cmd = user_data; if (!cmd) { return; } if (cmd->delay_id) { g_source_remove(cmd->delay_id); } if (cmd->interval_id) { g_source_remove(cmd->interval_id); } if (cmd->monitor_timeout_id) { g_source_remove(cmd->monitor_timeout_id); } if (cmd->takeover_timeout_id) { g_source_remove(cmd->takeover_timeout_id); } free(cmd->owner); free(cmd->rsc_id); free(cmd->action); free(cmd->userdata); lrmd_key_value_freeall(cmd->params); free(cmd); } static int generate_callid(void) { static int remote_ra_callid = 0; remote_ra_callid++; if (remote_ra_callid <= 0) { remote_ra_callid = 1; } return remote_ra_callid; } static gboolean recurring_helper(gpointer data) { remote_ra_cmd_t *cmd = data; lrm_state_t *connection_rsc = NULL; cmd->interval_id = 0; connection_rsc = lrm_state_find(cmd->rsc_id); if (connection_rsc && connection_rsc->remote_ra_data) { remote_ra_data_t *ra_data = connection_rsc->remote_ra_data; ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd); ra_data->cmds = g_list_append(ra_data->cmds, cmd); mainloop_set_trigger(ra_data->work); } return FALSE; } static gboolean start_delay_helper(gpointer data) { remote_ra_cmd_t *cmd = data; lrm_state_t *connection_rsc = NULL; cmd->delay_id = 0; connection_rsc = lrm_state_find(cmd->rsc_id); if (connection_rsc && connection_rsc->remote_ra_data) { remote_ra_data_t *ra_data = connection_rsc->remote_ra_data; mainloop_set_trigger(ra_data->work); } return FALSE; } /*! * \internal * \brief Handle cluster communication related to pacemaker_remote node joining * * \param[in] node_name Name of newly integrated pacemaker_remote node */ static void remote_node_up(const char *node_name) { int call_opt, call_id = 0; xmlNode *update, *state; crm_node_t *node; CRM_CHECK(node_name != NULL, return); crm_info("Announcing pacemaker_remote node %s", node_name); /* Clear node's operation history and transient attributes */ call_opt = crmd_cib_smart_opt(); erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt); erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt); /* Clear node's probed attribute */ update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE); /* Ensure node is in the remote peer cache with member status */ node = crm_remote_peer_get(node_name); CRM_CHECK(node != NULL, return); crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, 0); /* pacemaker_remote nodes don't participate in the membership layer, * so cluster nodes don't automatically get notified when they come and go. * We send a cluster message to the DC, and update the CIB node state entry, * so the DC will get it sooner (via message) or later (via CIB refresh), * and any other interested parties can query the CIB. */ send_remote_state_message(node_name, TRUE); update = create_xml_node(NULL, XML_CIB_TAG_STATUS); state = do_update_node_cib(node, node_update_cluster, update, __FUNCTION__); /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever * needs to be fenced, this flag will allow various actions to determine * whether the fencing has happened yet. */ crm_xml_add(state, XML_NODE_IS_FENCED, "0"); /* TODO: If the remote connection drops, and this (async) CIB update either * failed or has not yet completed, later actions could mistakenly think the * node has already been fenced (if the XML_NODE_IS_FENCED attribute was * previously set, because it won't have been cleared). This could prevent * actual fencing or allow recurring monitor failures to be cleared too * soon. Ideally, we wouldn't rely on the CIB for the fenced status. */ fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL); if (call_id < 0) { crm_perror(LOG_WARNING, "%s CIB node state setup", node_name); } free_xml(update); } /*! * \internal * \brief Handle cluster communication related to pacemaker_remote node leaving * * \param[in] node_name Name of lost node */ static void remote_node_down(const char *node_name) { xmlNode *update; int call_id = 0; int call_opt = crmd_cib_smart_opt(); crm_node_t *node; /* Clear all node attributes */ update_attrd_remote_node_removed(node_name, NULL); /* Ensure node is in the remote peer cache with lost state */ node = crm_remote_peer_get(node_name); CRM_CHECK(node != NULL, return); crm_update_peer_state(__FUNCTION__, node, CRM_NODE_LOST, 0); /* Notify DC */ send_remote_state_message(node_name, FALSE); /* Update CIB node state */ update = create_xml_node(NULL, XML_CIB_TAG_STATUS); do_update_node_cib(node, node_update_cluster, update, __FUNCTION__); fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL); if (call_id < 0) { crm_perror(LOG_ERR, "%s CIB node state update", node_name); } free_xml(update); } /*! * \internal * \brief Handle effects of a remote RA command on node state * * \param[in] cmd Completed remote RA command */ static void check_remote_node_state(remote_ra_cmd_t *cmd) { /* Only successful actions can change node state */ if (cmd->rc != PCMK_OCF_OK) { return; } if (safe_str_eq(cmd->action, "start")) { remote_node_up(cmd->rsc_id); } else if (safe_str_eq(cmd->action, "migrate_from")) { /* Ensure node is in this host's remote peer cache */ crm_node_t *node = crm_remote_peer_get(cmd->rsc_id); CRM_CHECK(node != NULL, return); crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, 0); } else if (safe_str_eq(cmd->action, "stop")) { lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id); remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL; if (ra_data) { if (ra_data->migrate_status != takeover_complete) { /* Stop means down if we didn't successfully migrate elsewhere */ remote_node_down(cmd->rsc_id); } else if (AM_I_DC == FALSE) { /* Only the connection host and DC track node state, * so if the connection migrated elsewhere and we aren't DC, * un-cache the node, so we don't have stale info */ crm_remote_peer_cache_remove(cmd->rsc_id); } } } } static void report_remote_ra_result(remote_ra_cmd_t * cmd) { lrmd_event_data_t op = { 0, }; check_remote_node_state(cmd); op.type = lrmd_event_exec_complete; op.rsc_id = cmd->rsc_id; op.op_type = cmd->action; op.user_data = cmd->userdata; op.timeout = cmd->timeout; op.interval = cmd->interval; op.rc = cmd->rc; op.op_status = cmd->op_status; op.t_run = cmd->start_time; op.t_rcchange = cmd->start_time; if (cmd->reported_success && cmd->rc != PCMK_OCF_OK) { op.t_rcchange = time(NULL); /* This edge case will likely never ever occur, but if it does the * result is that a failure will not be processed correctly. This is only * remotely possible because we are able to detect a connection resource's tcp * connection has failed at any moment after start has completed. The actual * recurring operation is just a connectivity ping. * * basically, we are not guaranteed that the first successful monitor op and * a subsequent failed monitor op will not occur in the same timestamp. We have to * make it look like the operations occurred at separate times though. */ if (op.t_rcchange == op.t_run) { op.t_rcchange++; } } if (cmd->params) { lrmd_key_value_t *tmp; op.params = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); for (tmp = cmd->params; tmp; tmp = tmp->next) { g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value)); } } op.call_id = cmd->call_id; op.remote_nodename = cmd->owner; lrm_op_callback(&op); if (op.params) { g_hash_table_destroy(op.params); } } static void update_remaining_timeout(remote_ra_cmd_t * cmd) { cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000; } static gboolean retry_start_cmd_cb(gpointer data) { lrm_state_t *lrm_state = data; remote_ra_data_t *ra_data = lrm_state->remote_ra_data; remote_ra_cmd_t *cmd = NULL; int rc = -1; if (!ra_data || !ra_data->cur_cmd) { return FALSE; } cmd = ra_data->cur_cmd; if (safe_str_neq(cmd->action, "start") && safe_str_neq(cmd->action, "migrate_from")) { return FALSE; } update_remaining_timeout(cmd); if (cmd->remaining_timeout > 0) { rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout); } if (rc != 0) { cmd->rc = PCMK_OCF_UNKNOWN_ERROR; cmd->op_status = PCMK_LRM_OP_ERROR; report_remote_ra_result(cmd); if (ra_data->cmds) { mainloop_set_trigger(ra_data->work); } ra_data->cur_cmd = NULL; free_cmd(cmd); } else { /* wait for connection event */ } return FALSE; } static gboolean connection_takeover_timeout_cb(gpointer data) { lrm_state_t *lrm_state = NULL; remote_ra_cmd_t *cmd = data; crm_info("takeover event timed out for node %s", cmd->rsc_id); cmd->takeover_timeout_id = 0; lrm_state = lrm_state_find(cmd->rsc_id); handle_remote_ra_stop(lrm_state, cmd); free_cmd(cmd); return FALSE; } static gboolean monitor_timeout_cb(gpointer data) { lrm_state_t *lrm_state = NULL; remote_ra_cmd_t *cmd = data; lrm_state = lrm_state_find(cmd->rsc_id); crm_info("Poke async response timed out for node %s (%p)", cmd->rsc_id, lrm_state); cmd->monitor_timeout_id = 0; cmd->op_status = PCMK_LRM_OP_TIMEOUT; cmd->rc = PCMK_OCF_UNKNOWN_ERROR; if (lrm_state && lrm_state->remote_ra_data) { remote_ra_data_t *ra_data = lrm_state->remote_ra_data; if (ra_data->cur_cmd == cmd) { ra_data->cur_cmd = NULL; } if (ra_data->cmds) { mainloop_set_trigger(ra_data->work); } } report_remote_ra_result(cmd); free_cmd(cmd); if(lrm_state) { lrm_state_disconnect(lrm_state); } return FALSE; } void remote_lrm_op_callback(lrmd_event_data_t * op) { gboolean cmd_handled = FALSE; lrm_state_t *lrm_state = NULL; remote_ra_data_t *ra_data = NULL; remote_ra_cmd_t *cmd = NULL; crm_debug("remote connection event - event_type:%s node:%s action:%s rc:%s op_status:%s", lrmd_event_type2str(op->type), op->remote_nodename, op->op_type ? op->op_type : "none", services_ocf_exitcode_str(op->rc), services_lrm_status_str(op->op_status)); lrm_state = lrm_state_find(op->remote_nodename); if (!lrm_state || !lrm_state->remote_ra_data) { crm_debug("lrm_state info not found for remote lrmd connection event"); return; } ra_data = lrm_state->remote_ra_data; /* Another client has connected to the remote daemon, * determine if this is expected. */ if (op->type == lrmd_event_new_client) { /* great, we new this was coming */ if (ra_data->migrate_status == expect_takeover) { ra_data->migrate_status = takeover_complete; } else { crm_err("Unexpected pacemaker_remote client takeover. Disconnecting"); lrm_state_disconnect(lrm_state); } return; } /* filter all EXEC events up */ if (op->type == lrmd_event_exec_complete) { if (ra_data->migrate_status == takeover_complete) { crm_debug("ignoring event, this connection is taken over by another node"); } else { lrm_op_callback(op); } return; } if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL) && (ra_data->active == TRUE)) { crm_err("Unexpected disconnect on remote-node %s", lrm_state->node_name); ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds); ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds); return; } if (!ra_data->cur_cmd) { crm_debug("no event to match"); return; } cmd = ra_data->cur_cmd; /* Start actions and migrate from actions complete after connection * comes back to us. */ if (op->type == lrmd_event_connect && (safe_str_eq(cmd->action, "start") || safe_str_eq(cmd->action, "migrate_from"))) { if (op->connection_rc < 0) { update_remaining_timeout(cmd); /* There isn't much of a reason to reschedule if the timeout is too small */ if (cmd->remaining_timeout > 3000) { crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout); g_timeout_add(1000, retry_start_cmd_cb, lrm_state); return; } else { crm_trace("can't reschedule start, remaining timeout too small %d", cmd->remaining_timeout); } cmd->op_status = PCMK_LRM_OP_TIMEOUT; cmd->rc = PCMK_OCF_UNKNOWN_ERROR; } else { lrm_state_reset_tables(lrm_state); cmd->rc = PCMK_OCF_OK; cmd->op_status = PCMK_LRM_OP_DONE; ra_data->active = TRUE; } crm_debug("remote lrmd connect event matched %s action. ", cmd->action); report_remote_ra_result(cmd); cmd_handled = TRUE; } else if (op->type == lrmd_event_poke && safe_str_eq(cmd->action, "monitor")) { if (cmd->monitor_timeout_id) { g_source_remove(cmd->monitor_timeout_id); cmd->monitor_timeout_id = 0; } /* Only report success the first time, after that only worry about failures. * For this function, if we get the poke pack, it is always a success. Pokes * only fail if the send fails, or the response times out. */ if (!cmd->reported_success) { cmd->rc = PCMK_OCF_OK; cmd->op_status = PCMK_LRM_OP_DONE; report_remote_ra_result(cmd); cmd->reported_success = 1; } crm_debug("remote lrmd poke event matched %s action. ", cmd->action); /* success, keep rescheduling if interval is present. */ if (cmd->interval && (cmd->cancel == FALSE)) { ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd); cmd->interval_id = g_timeout_add(cmd->interval, recurring_helper, cmd); cmd = NULL; /* prevent free */ } cmd_handled = TRUE; } else if (op->type == lrmd_event_disconnect && safe_str_eq(cmd->action, "monitor")) { if (ra_data->active == TRUE && (cmd->cancel == FALSE)) { cmd->rc = PCMK_OCF_UNKNOWN_ERROR; cmd->op_status = PCMK_LRM_OP_ERROR; report_remote_ra_result(cmd); crm_err("remote-node %s unexpectedly disconneced during monitor operation", lrm_state->node_name); } cmd_handled = TRUE; } else if (op->type == lrmd_event_new_client && safe_str_eq(cmd->action, "stop")) { handle_remote_ra_stop(lrm_state, cmd); cmd_handled = TRUE; } else { crm_debug("Event did not match %s action", ra_data->cur_cmd->action); } if (cmd_handled) { ra_data->cur_cmd = NULL; if (ra_data->cmds) { mainloop_set_trigger(ra_data->work); } free_cmd(cmd); } } static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd) { remote_ra_data_t *ra_data = NULL; CRM_ASSERT(lrm_state); ra_data = lrm_state->remote_ra_data; if (ra_data->migrate_status != takeover_complete) { /* delete pending ops when ever the remote connection is intentionally stopped */ g_hash_table_remove_all(lrm_state->pending_ops); } else { /* we no longer hold the history if this connection has been migrated */ lrm_state_reset_tables(lrm_state); } ra_data->active = FALSE; lrm_state_disconnect(lrm_state); cmd->rc = PCMK_OCF_OK; cmd->op_status = PCMK_LRM_OP_DONE; if (ra_data->cmds) { g_list_free_full(ra_data->cmds, free_cmd); } if (ra_data->recurring_cmds) { g_list_free_full(ra_data->recurring_cmds, free_cmd); } ra_data->cmds = NULL; ra_data->recurring_cmds = NULL; ra_data->cur_cmd = NULL; report_remote_ra_result(cmd); } static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms) { const char *server = NULL; lrmd_key_value_t *tmp = NULL; int port = 0; int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms; for (tmp = cmd->params; tmp; tmp = tmp->next) { if (safe_str_eq(tmp->key, "addr") || safe_str_eq(tmp->key, "server")) { server = tmp->value; } if (safe_str_eq(tmp->key, "port")) { port = atoi(tmp->value); } } return lrm_state_remote_connect_async(lrm_state, server, port, timeout_used); } static gboolean handle_remote_ra_exec(gpointer user_data) { int rc = 0; lrm_state_t *lrm_state = user_data; remote_ra_data_t *ra_data = lrm_state->remote_ra_data; remote_ra_cmd_t *cmd; GList *first = NULL; if (ra_data->cur_cmd) { /* still waiting on previous cmd */ return TRUE; } while (ra_data->cmds) { first = ra_data->cmds; cmd = first->data; if (cmd->delay_id) { /* still waiting for start delay timer to trip */ return TRUE; } ra_data->cmds = g_list_remove_link(ra_data->cmds, first); g_list_free_1(first); if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) { ra_data->migrate_status = 0; rc = handle_remote_ra_start(lrm_state, cmd, cmd->timeout); if (rc == 0) { /* take care of this later when we get async connection result */ crm_debug("began remote lrmd connect, waiting for connect event."); ra_data->cur_cmd = cmd; return TRUE; } else { crm_debug("connect failed, not expecting to match any connection event later"); cmd->rc = PCMK_OCF_UNKNOWN_ERROR; cmd->op_status = PCMK_LRM_OP_ERROR; } report_remote_ra_result(cmd); } else if (!strcmp(cmd->action, "monitor")) { if (lrm_state_is_connected(lrm_state) == TRUE) { rc = lrm_state_poke_connection(lrm_state); if (rc < 0) { cmd->rc = PCMK_OCF_UNKNOWN_ERROR; cmd->op_status = PCMK_LRM_OP_ERROR; } } else { rc = -1; cmd->op_status = PCMK_LRM_OP_DONE; cmd->rc = PCMK_OCF_NOT_RUNNING; } if (rc == 0) { crm_debug("poked remote lrmd at node %s, waiting for async response.", cmd->rsc_id); ra_data->cur_cmd = cmd; cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd); return TRUE; } report_remote_ra_result(cmd); } else if (!strcmp(cmd->action, "stop")) { if (ra_data->migrate_status == expect_takeover) { /* briefly wait on stop for the takeover event to occur. If the * takeover event does not occur during the wait period, that's fine. * It just means that the remote-node's lrm_status section is going to get * cleared which will require all the resources running in the remote-node * to be explicitly re-detected via probe actions. If the takeover does occur * successfully, then we can leave the status section intact. */ cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd); ra_data->cur_cmd = cmd; return TRUE; } handle_remote_ra_stop(lrm_state, cmd); } else if (!strcmp(cmd->action, "migrate_to")) { ra_data->migrate_status = expect_takeover; cmd->rc = PCMK_OCF_OK; cmd->op_status = PCMK_LRM_OP_DONE; report_remote_ra_result(cmd); } else if (!strcmp(cmd->action, "reload")) { /* reloads are a no-op right now, add logic here when they become important */ cmd->rc = PCMK_OCF_OK; cmd->op_status = PCMK_LRM_OP_DONE; report_remote_ra_result(cmd); } free_cmd(cmd); } return TRUE; } static void remote_ra_data_init(lrm_state_t * lrm_state) { remote_ra_data_t *ra_data = NULL; if (lrm_state->remote_ra_data) { return; } ra_data = calloc(1, sizeof(remote_ra_data_t)); ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state); lrm_state->remote_ra_data = ra_data; } void remote_ra_cleanup(lrm_state_t * lrm_state) { remote_ra_data_t *ra_data = lrm_state->remote_ra_data; if (!ra_data) { return; } if (ra_data->cmds) { g_list_free_full(ra_data->cmds, free_cmd); } if (ra_data->recurring_cmds) { g_list_free_full(ra_data->recurring_cmds, free_cmd); } mainloop_destroy_trigger(ra_data->work); free(ra_data); lrm_state->remote_ra_data = NULL; } gboolean is_remote_lrmd_ra(const char *agent, const char *provider, const char *id) { if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) { return TRUE; } if (id && lrm_state_find(id) && safe_str_neq(id, fsa_our_uname)) { return TRUE; } return FALSE; } lrmd_rsc_info_t * remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id) { lrmd_rsc_info_t *info = NULL; if ((lrm_state_find(rsc_id))) { info = calloc(1, sizeof(lrmd_rsc_info_t)); info->id = strdup(rsc_id); info->type = strdup(REMOTE_LRMD_RA); info->class = strdup("ocf"); info->provider = strdup("pacemaker"); } return info; } static gboolean is_remote_ra_supported_action(const char *action) { if (!action) { return FALSE; } else if (strcmp(action, "start") && strcmp(action, "stop") && strcmp(action, "reload") && strcmp(action, "migrate_to") && strcmp(action, "migrate_from") && strcmp(action, "monitor")) { return FALSE; } return TRUE; } static GList * fail_all_monitor_cmds(GList * list) { GList *rm_list = NULL; remote_ra_cmd_t *cmd = NULL; GListPtr gIter = NULL; for (gIter = list; gIter != NULL; gIter = gIter->next) { cmd = gIter->data; if (cmd->interval > 0 && safe_str_eq(cmd->action, "monitor")) { rm_list = g_list_append(rm_list, cmd); } } for (gIter = rm_list; gIter != NULL; gIter = gIter->next) { cmd = gIter->data; cmd->rc = PCMK_OCF_UNKNOWN_ERROR; cmd->op_status = PCMK_LRM_OP_ERROR; crm_trace("Pre-emptively failing %s %s (interval=%d, %s)", cmd->action, cmd->rsc_id, cmd->interval, cmd->userdata); report_remote_ra_result(cmd); list = g_list_remove(list, cmd); free_cmd(cmd); } /* frees only the list data, not the cmds */ g_list_free(rm_list); return list; } static GList * remove_cmd(GList * list, const char *action, int interval) { remote_ra_cmd_t *cmd = NULL; GListPtr gIter = NULL; for (gIter = list; gIter != NULL; gIter = gIter->next) { cmd = gIter->data; if (cmd->interval == interval && safe_str_eq(cmd->action, action)) { break; } cmd = NULL; } if (cmd) { list = g_list_remove(list, cmd); free_cmd(cmd); } return list; } int remote_ra_cancel(lrm_state_t * lrm_state, const char *rsc_id, const char *action, int interval) { lrm_state_t *connection_rsc = NULL; remote_ra_data_t *ra_data = NULL; connection_rsc = lrm_state_find(rsc_id); if (!connection_rsc || !connection_rsc->remote_ra_data) { return -EINVAL; } ra_data = connection_rsc->remote_ra_data; ra_data->cmds = remove_cmd(ra_data->cmds, action, interval); ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action, interval); if (ra_data->cur_cmd && (ra_data->cur_cmd->interval == interval) && (safe_str_eq(ra_data->cur_cmd->action, action))) { ra_data->cur_cmd->cancel = TRUE; } return 0; } static remote_ra_cmd_t * handle_dup_monitor(remote_ra_data_t *ra_data, int interval, const char *userdata) { GList *gIter = NULL; remote_ra_cmd_t *cmd = NULL; /* there are 3 places a potential duplicate monitor operation * could exist. * 1. recurring_cmds list. where the op is waiting for its next interval * 2. cmds list, where the op is queued to get executed immediately * 3. cur_cmd, which means the monitor op is in flight right now. */ if (interval == 0) { return NULL; } if (ra_data->cur_cmd && ra_data->cur_cmd->cancel == FALSE && ra_data->cur_cmd->interval == interval && safe_str_eq(ra_data->cur_cmd->action, "monitor")) { cmd = ra_data->cur_cmd; goto handle_dup; } for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) { cmd = gIter->data; if (cmd->interval == interval && safe_str_eq(cmd->action, "monitor")) { goto handle_dup; } } for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) { cmd = gIter->data; if (cmd->interval == interval && safe_str_eq(cmd->action, "monitor")) { goto handle_dup; } } return NULL; handle_dup: crm_trace("merging duplicate monitor cmd %s_monitor_%d", cmd->rsc_id, interval); /* update the userdata */ if (userdata) { free(cmd->userdata); cmd->userdata = strdup(userdata); } /* if we've already reported success, generate a new call id */ if (cmd->reported_success) { cmd->start_time = time(NULL); cmd->call_id = generate_callid(); cmd->reported_success = 0; } /* if we have an interval_id set, that means we are in the process of * waiting for this cmd's next interval. instead of waiting, cancel * the timer and execute the action immediately */ if (cmd->interval_id) { g_source_remove(cmd->interval_id); cmd->interval_id = 0; recurring_helper(cmd); } return cmd; } int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *action, const char *userdata, int interval, /* ms */ int timeout, /* ms */ int start_delay, /* ms */ lrmd_key_value_t * params) { int rc = 0; lrm_state_t *connection_rsc = NULL; remote_ra_cmd_t *cmd = NULL; remote_ra_data_t *ra_data = NULL; if (is_remote_ra_supported_action(action) == FALSE) { rc = -EINVAL; goto exec_done; } connection_rsc = lrm_state_find(rsc_id); if (!connection_rsc) { rc = -EINVAL; goto exec_done; } remote_ra_data_init(connection_rsc); ra_data = connection_rsc->remote_ra_data; cmd = handle_dup_monitor(ra_data, interval, userdata); if (cmd) { return cmd->call_id; } cmd = calloc(1, sizeof(remote_ra_cmd_t)); cmd->owner = strdup(lrm_state->node_name); cmd->rsc_id = strdup(rsc_id); cmd->action = strdup(action); cmd->userdata = strdup(userdata); cmd->interval = interval; cmd->timeout = timeout; cmd->start_delay = start_delay; cmd->params = params; cmd->start_time = time(NULL); cmd->call_id = generate_callid(); if (cmd->start_delay) { cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd); } ra_data->cmds = g_list_append(ra_data->cmds, cmd); mainloop_set_trigger(ra_data->work); return cmd->call_id; exec_done: lrmd_key_value_freeall(params); return rc; } + +/*! + * \internal + * \brief Immediately fail all monitors of a remote node, if proxied here + * + * \param[in] node_name Name of pacemaker_remote node + */ +void +remote_ra_fail(const char *node_name) +{ + lrm_state_t *lrm_state = lrm_state_find(node_name); + + if (lrm_state && lrm_state_is_connected(lrm_state)) { + remote_ra_data_t *ra_data = lrm_state->remote_ra_data; + + crm_info("Failing monitors on pacemaker_remote node %s", node_name); + ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds); + ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds); + } +} + diff --git a/crmd/te_utils.c b/crmd/te_utils.c index 3a9f491e4f..03e3b04f23 100644 --- a/crmd/te_utils.c +++ b/crmd/te_utils.c @@ -1,638 +1,646 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include +#include #include #include #include crm_trigger_t *stonith_reconnect = NULL; /* * stonith cleanup list * * If the DC is shot, proper notifications might not go out. * The stonith cleanup list allows the cluster to (re-)send * notifications once a new DC is elected. */ static GListPtr stonith_cleanup_list = NULL; /*! * \internal * \brief Add a node to the stonith cleanup list * * \param[in] target Name of node to add */ void add_stonith_cleanup(const char *target) { stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target)); } /*! * \internal * \brief Remove a node from the stonith cleanup list * * \param[in] Name of node to remove */ void remove_stonith_cleanup(const char *target) { GListPtr iter = stonith_cleanup_list; while (iter != NULL) { GListPtr tmp = iter; char *iter_name = tmp->data; iter = iter->next; if (safe_str_eq(target, iter_name)) { crm_trace("Removing %s from the cleanup list", iter_name); stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp); free(iter_name); } } } /*! * \internal * \brief Purge all entries from the stonith cleanup list */ void purge_stonith_cleanup() { if (stonith_cleanup_list) { GListPtr iter = NULL; for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { char *target = iter->data; crm_info("Purging %s from stonith cleanup list", target); free(target); } g_list_free(stonith_cleanup_list); stonith_cleanup_list = NULL; } } /*! * \internal * \brief Send stonith updates for all entries in cleanup list, then purge it */ void execute_stonith_cleanup() { GListPtr iter; for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { char *target = iter->data; crm_node_t *target_node = crm_get_peer(0, target); const char *uuid = crm_peer_uuid(target_node); crm_notice("Marking %s, target of a previous stonith action, as clean", target); send_stonith_update(NULL, target, uuid); free(target); } g_list_free(stonith_cleanup_list); stonith_cleanup_list = NULL; } /* end stonith cleanup list functions */ static gboolean fail_incompletable_stonith(crm_graph_t * graph) { GListPtr lpc = NULL; const char *task = NULL; xmlNode *last_action = NULL; if (graph == NULL) { return FALSE; } for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { GListPtr lpc2 = NULL; synapse_t *synapse = (synapse_t *) lpc->data; if (synapse->confirmed) { continue; } for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) { crm_action_t *action = (crm_action_t *) lpc2->data; if (action->type != action_type_crm || action->confirmed) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (task && safe_str_eq(task, CRM_OP_FENCE)) { action->failed = TRUE; last_action = action->xml; update_graph(graph, action); crm_notice("Failing action %d (%s): STONITHd terminated", action->id, ID(action->xml)); } } } if (last_action != NULL) { crm_warn("STONITHd failure resulted in un-runnable actions"); abort_transition(INFINITY, tg_restart, "Stonith failure", last_action); return TRUE; } return FALSE; } static void tengine_stonith_connection_destroy(stonith_t * st, stonith_event_t * e) { if (is_set(fsa_input_register, R_ST_REQUIRED)) { crm_crit("Fencing daemon connection failed"); mainloop_set_trigger(stonith_reconnect); } else { crm_info("Fencing daemon disconnected"); } /* cbchan will be garbage at this point, arrange for it to be reset */ if(stonith_api) { stonith_api->state = stonith_disconnected; } if (AM_I_DC) { fail_incompletable_stonith(transition_graph); trigger_graph(); } } #if SUPPORT_CMAN # include #endif char *te_client_id = NULL; #ifdef HAVE_SYS_REBOOT_H # include # include #endif static void tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event) { if(te_client_id == NULL) { te_client_id = crm_strdup_printf("%s.%d", crm_system_name, getpid()); } if (st_event == NULL) { crm_err("Notify data not found"); return; } crmd_notify_fencing_op(st_event); if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) { crm_notice("%s was successfully unfenced by %s (at the request of %s)", st_event->target, st_event->executioner ? st_event->executioner : "", st_event->origin); /* TODO: Hook up st_event->device */ return; } else if (safe_str_eq("on", st_event->action)) { crm_err("Unfencing of %s by %s failed: %s (%d)", st_event->target, st_event->executioner ? st_event->executioner : "", pcmk_strerror(st_event->result), st_event->result); return; } else if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) { crm_crit("We were allegedly just fenced by %s for %s!", st_event->executioner ? st_event->executioner : "", st_event->origin); /* Dumps blackbox if enabled */ qb_log_fini(); /* Try to get the above log message to disk - somehow */ /* Get out ASAP and do not come back up. * * Triggering a reboot is also not the worst idea either since * the rest of the cluster thinks we're safely down */ #ifdef RB_HALT_SYSTEM reboot(RB_HALT_SYSTEM); #endif /* * If reboot() fails or is not supported, coming back up will * probably lead to a situation where the other nodes set our * status to 'lost' because of the fencing callback and will * discard subsequent election votes with: * * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster) * * So just stay dead, something is seriously messed up anyway. * */ exit(100); /* None of our wrappers since we already called qb_log_fini() */ return; } if (st_event->result == pcmk_ok && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) { st_fail_count_reset(st_event->target); } crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s", st_event->target, st_event->result == pcmk_ok ? "" : " not", st_event->action, st_event->executioner ? st_event->executioner : "", st_event->origin, pcmk_strerror(st_event->result), st_event->id, st_event->client_origin ? st_event->client_origin : ""); #if SUPPORT_CMAN if (st_event->result == pcmk_ok && is_cman_cluster()) { int local_rc = 0; int confirm = 0; char *target_copy = strdup(st_event->target); /* In case fenced hasn't noticed yet * * Any fencing that has been inititated will be completed by way of the fence_pcmk redirect */ local_rc = fenced_external(target_copy); if (local_rc != 0) { crm_err("Could not notify CMAN that '%s' is now fenced: %d", st_event->target, local_rc); } else { crm_notice("Notified CMAN that '%s' is now fenced", st_event->target); } /* In case fenced is already trying to shoot it */ confirm = open("/var/run/cluster/fenced_override", O_NONBLOCK|O_WRONLY); if (confirm >= 0) { int ignore = 0; int len = strlen(target_copy); errno = 0; local_rc = write(confirm, target_copy, len); ignore = write(confirm, "\n", 1); if(ignore < 0 && errno == EBADF) { crm_trace("CMAN not expecting %s to be fenced (yet)", st_event->target); } else if (local_rc < len) { crm_perror(LOG_ERR, "Confirmation of CMAN fencing event for '%s' failed: %d", st_event->target, local_rc); } else { fsync(confirm); crm_notice("Confirmed CMAN fencing event for '%s'", st_event->target); } close(confirm); } free(target_copy); } #endif if (st_event->result == pcmk_ok) { crm_node_t *peer = crm_find_peer_full(0, st_event->target, CRM_GET_PEER_ANY); const char *uuid = NULL; gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname); if (peer == NULL) { return; } uuid = crm_peer_uuid(peer); crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); if(AM_I_DC) { /* The DC always sends updates */ send_stonith_update(NULL, st_event->target, uuid); if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) { /* Abort the current transition graph if it wasn't us * that invoked stonith to fence someone */ crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); } /* Assume it was our leader if we dont currently have one */ } else if (((fsa_our_dc == NULL) || safe_str_eq(fsa_our_dc, st_event->target)) && !is_set(peer->flags, crm_remote_node)) { crm_notice("Target %s our leader %s (recorded: %s)", fsa_our_dc ? "was" : "may have been", st_event->target, fsa_our_dc ? fsa_our_dc : ""); /* Given the CIB resyncing that occurs around elections, * have one node update the CIB now and, if the new DC is different, * have them do so too after the election */ if (we_are_executioner) { send_stonith_update(NULL, st_event->target, uuid); } add_stonith_cleanup(st_event->target); } + /* If the target is a remote node, and we host its connection, + * immediately fail all monitors so it can be recovered quickly. + */ + if (is_set(peer->flags, crm_remote_node)) { + remote_ra_fail(st_event->target); + } + crmd_peer_down(peer, TRUE); } } gboolean te_connect_stonith(gpointer user_data) { int lpc = 0; int rc = pcmk_ok; if (stonith_api == NULL) { stonith_api = stonith_api_new(); } if (stonith_api->state != stonith_disconnected) { crm_trace("Still connected"); return TRUE; } for (lpc = 0; lpc < 30; lpc++) { crm_debug("Attempting connection to fencing daemon..."); sleep(1); rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); if (rc == pcmk_ok) { break; } if (user_data != NULL) { if (is_set(fsa_input_register, R_ST_REQUIRED)) { crm_err("Sign-in failed: triggered a retry"); mainloop_set_trigger(stonith_reconnect); } else { crm_info("Sign-in failed, but no longer required"); } return TRUE; } crm_err("Sign-in failed: pausing and trying again in 2s..."); sleep(1); } CRM_CHECK(rc == pcmk_ok, return TRUE); /* If not, we failed 30 times... just get out */ stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT, tengine_stonith_connection_destroy); stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE, tengine_stonith_notify); crm_trace("Connected"); return TRUE; } gboolean stop_te_timer(crm_action_timer_t * timer) { const char *timer_desc = "action timer"; if (timer == NULL) { return FALSE; } if (timer->reason == timeout_abort) { timer_desc = "global timer"; crm_trace("Stopping %s", timer_desc); } if (timer->source_id != 0) { crm_trace("Stopping %s", timer_desc); g_source_remove(timer->source_id); timer->source_id = 0; } else { crm_trace("%s was already stopped", timer_desc); return FALSE; } return TRUE; } gboolean te_graph_trigger(gpointer user_data) { enum transition_status graph_rc = -1; if (transition_graph == NULL) { crm_debug("Nothing to do"); return TRUE; } crm_trace("Invoking graph %d in state %s", transition_graph->id, fsa_state2string(fsa_state)); switch (fsa_state) { case S_STARTING: case S_PENDING: case S_NOT_DC: case S_HALT: case S_ILLEGAL: case S_STOPPING: case S_TERMINATE: return TRUE; break; default: break; } if (transition_graph->complete == FALSE) { int limit = transition_graph->batch_limit; transition_graph->batch_limit = throttle_get_total_job_limit(limit); graph_rc = run_graph(transition_graph); transition_graph->batch_limit = limit; /* Restore the configured value */ /* significant overhead... */ /* print_graph(LOG_DEBUG_3, transition_graph); */ if (graph_rc == transition_active) { crm_trace("Transition not yet complete"); return TRUE; } else if (graph_rc == transition_pending) { crm_trace("Transition not yet complete - no actions fired"); return TRUE; } if (graph_rc != transition_complete) { crm_warn("Transition failed: %s", transition_status(graph_rc)); print_graph(LOG_NOTICE, transition_graph); } } crm_debug("Transition %d is now complete", transition_graph->id); transition_graph->complete = TRUE; notify_crmd(transition_graph); return TRUE; } void trigger_graph_processing(const char *fn, int line) { crm_trace("%s:%d - Triggered graph processing", fn, line); mainloop_set_trigger(transition_trigger); } void abort_transition_graph(int abort_priority, enum transition_action abort_action, const char *abort_text, xmlNode * reason, const char *fn, int line) { int add[] = { 0, 0, 0 }; int del[] = { 0, 0, 0 }; int level = LOG_INFO; xmlNode *diff = NULL; xmlNode *change = NULL; CRM_CHECK(transition_graph != NULL, return); switch (fsa_state) { case S_STARTING: case S_PENDING: case S_NOT_DC: case S_HALT: case S_ILLEGAL: case S_STOPPING: case S_TERMINATE: crm_info("Abort %s suppressed: state=%s (complete=%d)", abort_text, fsa_state2string(fsa_state), transition_graph->complete); return; default: break; } /* Make sure any queued calculations are discarded ASAP */ free(fsa_pe_ref); fsa_pe_ref = NULL; if (transition_graph->complete == FALSE) { if(update_abort_priority(transition_graph, abort_priority, abort_action, abort_text)) { level = LOG_NOTICE; } } if(reason) { xmlNode *search = NULL; for(search = reason; search; search = search->parent) { if (safe_str_eq(XML_TAG_DIFF, TYPE(search))) { diff = search; break; } } if(diff) { xml_patch_versions(diff, add, del); for(search = reason; search; search = search->parent) { if (safe_str_eq(XML_DIFF_CHANGE, TYPE(search))) { change = search; break; } } } } if(reason == NULL) { do_crm_log(level, "Transition aborted: %s (source=%s:%d, %d)", abort_text, fn, line, transition_graph->complete); } else if(change == NULL) { char *local_path = xml_get_path(reason); do_crm_log(level, "Transition aborted by %s.%s: %s (cib=%d.%d.%d, source=%s:%d, path=%s, %d)", TYPE(reason), ID(reason), abort_text, add[0], add[1], add[2], fn, line, local_path, transition_graph->complete); free(local_path); } else { const char *kind = NULL; const char *op = crm_element_value(change, XML_DIFF_OP); const char *path = crm_element_value(change, XML_DIFF_PATH); if(change == reason) { if(strcmp(op, "create") == 0) { reason = reason->children; } else if(strcmp(op, "modify") == 0) { reason = first_named_child(reason, XML_DIFF_RESULT); if(reason) { reason = reason->children; } } } kind = TYPE(reason); if(strcmp(op, "delete") == 0) { const char *shortpath = strrchr(path, '/'); do_crm_log(level, "Transition aborted by deletion of %s: %s (cib=%d.%d.%d, source=%s:%d, path=%s, %d)", shortpath?shortpath+1:path, abort_text, add[0], add[1], add[2], fn, line, path, transition_graph->complete); } else if (safe_str_eq(XML_CIB_TAG_NVPAIR, kind)) { do_crm_log(level, "Transition aborted by %s, %s=%s: %s (%s cib=%d.%d.%d, source=%s:%d, path=%s, %d)", crm_element_value(reason, XML_ATTR_ID), crm_element_value(reason, XML_NVPAIR_ATTR_NAME), crm_element_value(reason, XML_NVPAIR_ATTR_VALUE), abort_text, op, add[0], add[1], add[2], fn, line, path, transition_graph->complete); } else if (safe_str_eq(XML_LRM_TAG_RSC_OP, kind)) { const char *magic = crm_element_value(reason, XML_ATTR_TRANSITION_MAGIC); do_crm_log(level, "Transition aborted by %s '%s' on %s: %s (magic=%s, cib=%d.%d.%d, source=%s:%d, %d)", crm_element_value(reason, XML_LRM_ATTR_TASK_KEY), op, crm_element_value(reason, XML_LRM_ATTR_TARGET), abort_text, magic, add[0], add[1], add[2], fn, line, transition_graph->complete); } else if (safe_str_eq(XML_CIB_TAG_STATE, kind) || safe_str_eq(XML_CIB_TAG_NODE, kind)) { const char *uname = crm_peer_uname(ID(reason)); do_crm_log(level, "Transition aborted by %s '%s' on %s: %s (cib=%d.%d.%d, source=%s:%d, %d)", kind, op, uname ? uname : ID(reason), abort_text, add[0], add[1], add[2], fn, line, transition_graph->complete); } else { do_crm_log(level, "Transition aborted by %s.%s '%s': %s (cib=%d.%d.%d, source=%s:%d, path=%s, %d)", TYPE(reason), ID(reason), op?op:"change", abort_text, add[0], add[1], add[2], fn, line, path, transition_graph->complete); } } if (transition_graph->complete) { if (transition_timer->period_ms > 0) { crm_timer_stop(transition_timer); crm_timer_start(transition_timer); } else { register_fsa_input(C_FSA_INTERNAL, I_PE_CALC, NULL); } return; } mainloop_set_trigger(transition_trigger); }