diff --git a/crmd/callbacks.c b/crmd/callbacks.c index f88fc938b1..aee29fc23f 100644 --- a/crmd/callbacks.c +++ b/crmd/callbacks.c @@ -1,259 +1,262 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void crmd_ha_connection_destroy(gpointer user_data); /* From join_dc... */ extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); void crmd_ha_connection_destroy(gpointer user_data) { crm_trace("Invoked"); if (is_set(fsa_input_register, R_HA_DISCONNECTED)) { /* we signed out, so this is expected */ crm_info("Heartbeat disconnection complete"); return; } crm_crit("Lost connection to heartbeat service!"); register_fsa_input(C_HA_DISCONNECT, I_ERROR, NULL); trigger_fsa(fsa_source); } void crmd_ha_msg_filter(xmlNode * msg) { if (AM_I_DC) { const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); if (safe_str_eq(sys_from, CRM_SYSTEM_DC)) { const char *from = crm_element_value(msg, F_ORIG); if (safe_str_neq(from, fsa_our_uname)) { int level = LOG_INFO; const char *op = crm_element_value(msg, F_CRM_TASK); /* make sure the election happens NOW */ if (fsa_state != S_ELECTION) { ha_msg_input_t new_input; level = LOG_WARNING; new_input.msg = msg; register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, &new_input, __FUNCTION__); } do_crm_log(level, "Another DC detected: %s (op=%s)", from, op); goto done; } } } else { const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); if (safe_str_eq(sys_to, CRM_SYSTEM_DC)) { return; } } /* crm_log_xml_trace("HA[inbound]", msg); */ route_message(C_HA_MESSAGE, msg); done: trigger_fsa(fsa_source); } void peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { uint32_t old = 0; uint32_t changed = 0; bool appeared = FALSE; const char *status = NULL; set_bit(fsa_input_register, R_PEER_DATA); if (node->uname == NULL) { return; } switch (type) { case crm_status_uname: /* If we've never seen the node, then it also wont be in the status section */ crm_info("%s is now %s", node->uname, node->state); return; + case crm_status_rstate: + crm_info("Remote node %s is now %s (was %s)", node->uname, node->state, (const char *)data); + case crm_status_nstate: crm_info("%s is now %s (was %s)", node->uname, node->state, (const char *)data); if (safe_str_eq(data, node->state)) { /* State did not change */ return; } else if(safe_str_eq(CRM_NODE_MEMBER, node->state)) { appeared = TRUE; } break; case crm_status_processes: if (data) { old = *(const uint32_t *)data; changed = node->processes ^ old; } /* crmd_proc_update(node, proc_flags); */ status = (node->processes & proc_flags) ? ONLINESTATUS : OFFLINESTATUS; crm_info("Client %s/%s now has status [%s] (DC=%s)", node->uname, peer2text(proc_flags), status, AM_I_DC ? "true" : crm_str(fsa_our_dc)); if ((changed & proc_flags) == 0) { /* Peer process did not change */ crm_trace("No change %6x %6x %6x", old, node->processes, proc_flags); return; } else if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_trace("Not connected"); return; } else if (fsa_state == S_STOPPING) { crm_trace("Stopping"); return; } appeared = (node->processes & proc_flags) != 0; if (safe_str_eq(node->uname, fsa_our_uname) && (node->processes & proc_flags) == 0) { /* Did we get evicted? */ crm_notice("Our peer connection failed"); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ERROR, NULL); } else if (safe_str_eq(node->uname, fsa_our_dc) && crm_is_peer_active(node) == FALSE) { /* Did the DC leave us? */ crm_notice("Our peer on the DC is dead"); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); } break; } if (AM_I_DC) { xmlNode *update = NULL; gboolean alive = crm_is_peer_active(node); crm_action_t *down = match_down_event(0, node->uuid, NULL, appeared); crm_trace("Alive=%d, appear=%d, down=%p", alive, appeared, down); if (alive && type == crm_status_processes) { register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } if (down) { const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK); if (alive && safe_str_eq(task, CRM_OP_FENCE)) { crm_info("Node return implies stonith of %s (action %d) completed", node->uname, down->id); erase_status_tag(node->uname, XML_CIB_TAG_LRM, cib_scope_local); erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); /* down->confirmed = TRUE; Only stonith-ng returning should imply completion */ down->sent_update = TRUE; /* Prevent tengine_stonith_callback() from calling send_stonith_update() */ } else if (safe_str_eq(task, CRM_OP_FENCE)) { crm_trace("Waiting for stonithd to report the fencing of %s is complete", node->uname); /* via tengine_stonith_callback() */ } else if (alive == FALSE) { crm_notice("%s of %s (op %d) is complete", task, node->uname, down->id); /* down->confirmed = TRUE; Only stonith-ng returning should imply completion */ stop_te_timer(down->timer); crm_update_peer_join(__FUNCTION__, node, crm_join_none); crm_update_peer_expected(__FUNCTION__, node, CRMD_JOINSTATE_DOWN); check_join_state(fsa_state, __FUNCTION__); update_graph(transition_graph, down); trigger_graph(); } else { crm_trace("Other %p", down); } } else if (appeared == FALSE) { crm_notice("Stonith/shutdown of %s not matched", node->uname); crm_update_peer_join(__FUNCTION__, node, crm_join_none); check_join_state(fsa_state, __FUNCTION__); abort_transition(INFINITY, tg_restart, "Node failure", NULL); fail_incompletable_actions(transition_graph, node->uuid); } else { crm_trace("Other %p", down); } update = do_update_node_cib(node, node_update_peer, NULL, __FUNCTION__); fsa_cib_anon_update(XML_CIB_TAG_STATUS, update, cib_scope_local | cib_quorum_override | cib_can_create); free_xml(update); } trigger_fsa(fsa_source); } void crmd_cib_connection_destroy(gpointer user_data) { CRM_CHECK(user_data == fsa_cib_conn,;); crm_trace("Invoked"); trigger_fsa(fsa_source); fsa_cib_conn->state = cib_disconnected; if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Connection to the CIB terminated..."); return; } /* eventually this will trigger a reconnect, not a shutdown */ crm_err("Connection to the CIB terminated..."); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); clear_bit(fsa_input_register, R_CIB_CONNECTED); return; } gboolean crm_fsa_trigger(gpointer user_data) { crm_trace("Invoked (queue len: %d)", g_list_length(fsa_message_queue)); s_crmd_fsa(C_FSA_INTERNAL); crm_trace("Exited (queue len: %d)", g_list_length(fsa_message_queue)); return TRUE; } diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h index d0ca58cbd8..e877ad18ca 100644 --- a/crmd/crmd_lrm.h +++ b/crmd/crmd_lrm.h @@ -1,147 +1,151 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include + extern gboolean verify_stopped(enum crmd_fsa_state cur_state, int log_level); extern void lrm_clear_last_failure(const char *rsc_id, const char *node_name); void lrm_op_callback(lrmd_event_data_t * op); typedef struct resource_history_s { char *id; uint32_t last_callid; lrmd_rsc_info_t rsc; lrmd_event_data_t *last; lrmd_event_data_t *failed; GList *recurring_op_list; /* Resources must be stopped using the same * parameters they were started with. This hashtable * holds the parameters that should be used for the next stop * cmd on this resource. */ GHashTable *stop_params; } rsc_history_t; struct recurring_op_s { char *rsc_id; char *op_type; char *op_key; int call_id; int interval; gboolean remove; gboolean cancelled; }; typedef struct lrm_state_s { const char *node_name; /* reserved for lrm_state.c usage only */ void *conn; /* reserved for remote_lrmd_ra.c usage only */ void *remote_ra_data; GHashTable *resource_history; GHashTable *pending_ops; GHashTable *deletion_ops; int num_lrm_register_fails; } lrm_state_t; struct pending_deletion_op_s { char *rsc; ha_msg_input_t *input; }; xmlNode *do_lrm_query_internal(lrm_state_t * lrm_state, gboolean is_replace); /*! * \brief Clear all state information from a single state entry. * \note This does not close the lrmd connection */ void lrm_state_reset_tables(lrm_state_t * lrm_state); GList *lrm_state_get_list(void); /*! * \brief Initiate internal state tables */ gboolean lrm_state_init_local(void); /*! * \brief Destroy all state entries and internal state tables */ void lrm_state_destroy_all(void); /*! * \brief Create lrmd connection entry. */ lrm_state_t *lrm_state_create(const char *node_name); /*! * \brief Destroy lrmd connection keyed of node name */ void lrm_state_destroy(const char *node_name); /*! * \brief Find lrm_state data by node name */ lrm_state_t *lrm_state_find(const char *node_name); /*! * \brief Either find or create a new entry */ lrm_state_t *lrm_state_find_or_create(const char *node_name); /*! * The functions below are wrappers for the lrmd api calls the crmd * uses. These wrapper functions allow us to treat the crmd's remote * lrmd connection resources the same as regular resources. Internally * Regular resources go to the lrmd, and remote connection resources are * handled locally in the crmd. */ void lrm_state_disconnect(lrm_state_t * lrm_state); int lrm_state_ipc_connect(lrm_state_t * lrm_state); int lrm_state_remote_connect_async(lrm_state_t * lrm_state, const char *server, int port, int timeout); int lrm_state_is_connected(lrm_state_t * lrm_state); int lrm_state_poke_connection(lrm_state_t * lrm_state); int lrm_state_get_metadata(lrm_state_t * lrm_state, const char *class, const char *provider, const char *agent, char **output, enum lrmd_call_options options); int lrm_state_cancel(lrm_state_t * lrm_state, const char *rsc_id, const char *action, int interval); int lrm_state_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *action, const char *userdata, int interval, /* ms */ int timeout, /* ms */ int start_delay, /* ms */ lrmd_key_value_t * params); lrmd_rsc_info_t *lrm_state_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id, enum lrmd_call_options options); int lrm_state_register_rsc(lrm_state_t * lrm_state, const char *rsc_id, const char *class, const char *provider, const char *agent, enum lrmd_call_options options); int lrm_state_unregister_rsc(lrm_state_t * lrm_state, const char *rsc_id, enum lrmd_call_options options); /*! These functions are used to manage the remote lrmd connection resources */ void remote_lrm_op_callback(lrmd_event_data_t * op); gboolean is_remote_lrmd_ra(const char *agent, const char *provider, const char *id); lrmd_rsc_info_t *remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id); int remote_ra_cancel(lrm_state_t * lrm_state, const char *rsc_id, const char *action, int interval); int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *action, const char *userdata, int interval, /* ms */ int timeout, /* ms */ int start_delay, /* ms */ lrmd_key_value_t * params); void remote_ra_cleanup(lrm_state_t * lrm_state); + +xmlNode *simple_remote_node_status(const char *node_name, xmlNode *parent, const char *source); diff --git a/crmd/membership.c b/crmd/membership.c index 167f27ce86..d59c1c800d 100644 --- a/crmd/membership.c +++ b/crmd/membership.c @@ -1,311 +1,316 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* put these first so that uuid_t is defined without conflicts */ #include #include #include #include #include #include #include #include +#include #include #include #include #include gboolean membership_flux_hack = FALSE; void post_cache_update(int instance); int last_peer_update = 0; extern GHashTable *voted; extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); static void reap_dead_nodes(gpointer key, gpointer value, gpointer user_data) { crm_node_t *node = value; if (crm_is_peer_active(node) == FALSE) { crm_update_peer_join(__FUNCTION__, node, crm_join_none); if(node->uname) { if (voted != NULL) { g_hash_table_remove(voted, node->uname); } if (safe_str_eq(fsa_our_uname, node->uname)) { crm_err("We're not part of the cluster anymore"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); } else if (AM_I_DC == FALSE && safe_str_eq(node->uname, fsa_our_dc)) { crm_warn("Our DC node (%s) left the cluster", node->uname); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } } if (fsa_state == S_INTEGRATION || fsa_state == S_FINALIZE_JOIN) { check_join_state(fsa_state, __FUNCTION__); } fail_incompletable_actions(transition_graph, node->uuid); } } gboolean ever_had_quorum = FALSE; void post_cache_update(int instance) { xmlNode *no_op = NULL; crm_peer_seq = instance; crm_debug("Updated cache after membership event %d.", instance); g_hash_table_foreach(crm_peer_cache, reap_dead_nodes, NULL); set_bit(fsa_input_register, R_MEMBERSHIP); if (AM_I_DC) { populate_cib_nodes(node_update_quick | node_update_cluster | node_update_peer | node_update_expected, __FUNCTION__); } /* * If we lost nodes, we should re-check the election status * Safe to call outside of an election */ register_fsa_action(A_ELECTION_CHECK); /* Membership changed, remind everyone we're here. * This will aid detection of duplicate DCs */ no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD, AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL); send_cluster_message(NULL, crm_msg_crmd, no_op, FALSE); free_xml(no_op); } static void crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; last_peer_update = 0; if (rc == pcmk_ok) { crm_trace("Node update %d complete", call_id); } else if(call_id < pcmk_ok) { crm_err("Node update failed: %s (%d)", pcmk_strerror(call_id), call_id); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else { crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } xmlNode * do_update_node_cib(crm_node_t * node, int flags, xmlNode * parent, const char *source) { const char *value = NULL; xmlNode *node_state; + if (is_set(node->flags, crm_remote_node)) { + return simple_remote_node_status(node->uname, parent, source); + } + if (!node->state) { crm_info("Node update for %s cancelled: no state, not seen yet", node->uname); return NULL; } node_state = create_xml_node(parent, XML_CIB_TAG_STATE); set_uuid(node_state, XML_ATTR_UUID, node); if (crm_element_value(node_state, XML_ATTR_UUID) == NULL) { crm_info("Node update for %s cancelled: no id", node->uname); free_xml(node_state); return NULL; } crm_xml_add(node_state, XML_ATTR_UNAME, node->uname); if (flags & node_update_cluster) { if (safe_str_eq(node->state, CRM_NODE_ACTIVE)) { value = XML_BOOLEAN_YES; } else if (node->state) { value = XML_BOOLEAN_NO; } else { value = NULL; } crm_xml_add(node_state, XML_NODE_IN_CLUSTER, value); } if (flags & node_update_peer) { value = OFFLINESTATUS; if (node->processes & proc_flags) { value = ONLINESTATUS; } crm_xml_add(node_state, XML_NODE_IS_PEER, value); } if (flags & node_update_join) { if(node->join <= crm_join_none) { value = CRMD_JOINSTATE_DOWN; } else { value = CRMD_JOINSTATE_MEMBER; } crm_xml_add(node_state, XML_NODE_JOIN_STATE, value); } if (flags & node_update_expected) { crm_xml_add(node_state, XML_NODE_EXPECTED, node->expected); } crm_xml_add(node_state, XML_ATTR_ORIGIN, source); return node_state; } static void node_list_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if(call_id < pcmk_ok) { crm_err("Node list update failed: %s (%d)", pcmk_strerror(call_id), call_id); crm_log_xml_debug(msg, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } else if(rc < pcmk_ok) { crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void populate_cib_nodes(enum node_update_flags flags, const char *source) { int call_id = 0; gboolean from_hashtable = TRUE; int call_options = cib_scope_local | cib_quorum_override; xmlNode *node_list = create_xml_node(NULL, XML_CIB_TAG_NODES); #if SUPPORT_HEARTBEAT if (is_not_set(flags, node_update_quick) && is_heartbeat_cluster()) { from_hashtable = heartbeat_initialize_nodelist(fsa_cluster_conn, FALSE, node_list); } #endif #if SUPPORT_COROSYNC # if !SUPPORT_PLUGIN if (is_not_set(flags, node_update_quick) && is_corosync_cluster()) { from_hashtable = corosync_initialize_nodelist(NULL, FALSE, node_list); } # endif #endif if (from_hashtable) { GHashTableIter iter; crm_node_t *node = NULL; g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { xmlNode *new_node = NULL; crm_trace("Creating node entry for %s/%s", node->uname, node->uuid); if(node->uuid && node->uname) { /* We need both to be valid */ new_node = create_xml_node(node_list, XML_CIB_TAG_NODE); crm_xml_add(new_node, XML_ATTR_ID, node->uuid); crm_xml_add(new_node, XML_ATTR_UNAME, node->uname); } } } crm_trace("Populating section from %s", from_hashtable ? "hashtable" : "cluster"); fsa_cib_update(XML_CIB_TAG_NODES, node_list, call_options, call_id, NULL); fsa_register_cib_callback(call_id, FALSE, NULL, node_list_update_callback); free_xml(node_list); if (call_id >= pcmk_ok && crm_peer_cache != NULL && AM_I_DC) { /* * There is no need to update the local CIB with our values if * we've not seen valid membership data */ GHashTableIter iter; crm_node_t *node = NULL; node_list = create_xml_node(NULL, XML_CIB_TAG_STATUS); g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { do_update_node_cib(node, flags, node_list, source); } fsa_cib_update(XML_CIB_TAG_STATUS, node_list, call_options, call_id, NULL); fsa_register_cib_callback(call_id, FALSE, NULL, crmd_node_update_complete); last_peer_update = call_id; free_xml(node_list); } } static void cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if (rc == pcmk_ok) { crm_trace("Quorum update %d complete", call_id); } else { crm_err("Quorum update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void crm_update_quorum(gboolean quorum, gboolean force_update) { ever_had_quorum |= quorum; if (AM_I_DC && (force_update || fsa_has_quorum != quorum)) { int call_id = 0; xmlNode *update = NULL; int call_options = cib_scope_local | cib_quorum_override; update = create_xml_node(NULL, XML_TAG_CIB); crm_xml_add_int(update, XML_ATTR_HAVE_QUORUM, quorum); crm_xml_add(update, XML_ATTR_DC_UUID, fsa_our_uuid); fsa_cib_update(XML_TAG_CIB, update, call_options, call_id, NULL); crm_debug("Updating quorum status to %s (call=%d)", quorum ? "true" : "false", call_id); fsa_register_cib_callback(call_id, FALSE, NULL, cib_quorum_update_complete); free_xml(update); } fsa_has_quorum = quorum; } diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c index ec54fcb345..bb626c044c 100644 --- a/crmd/remote_lrmd_ra.c +++ b/crmd/remote_lrmd_ra.c @@ -1,686 +1,694 @@ /* * Copyright (C) 2013 David Vossel * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #define REMOTE_LRMD_RA "remote" /* The max start timeout before cmd retry */ #define MAX_START_TIMEOUT_MS 10000 typedef struct remote_ra_cmd_s { /*! the local node the cmd is issued from */ char *owner; /*! the remote node the cmd is executed on */ char *rsc_id; /*! the action to execute */ char *action; /*! some string the client wants us to give it back */ char *userdata; /*! start delay in ms */ int start_delay; /*! timer id used for start delay. */ int delay_id; /*! timeout in ms for cmd */ int timeout; int remaining_timeout; /*! recurring interval in ms */ int interval; /*! interval timer id */ int interval_id; int reported_success; int monitor_timeout_id; /*! action parameters */ lrmd_key_value_t *params; /*! executed rc */ int rc; int op_status; int call_id; time_t start_time; gboolean cancel; } remote_ra_cmd_t; typedef struct remote_ra_data_s { crm_trigger_t *work; remote_ra_cmd_t *cur_cmd; GList *cmds; GList *recurring_cmds; } remote_ra_data_t; static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms); static void free_cmd(gpointer user_data) { remote_ra_cmd_t *cmd = user_data; if (!cmd) { return; } if (cmd->delay_id) { g_source_remove(cmd->delay_id); } if (cmd->interval_id) { g_source_remove(cmd->interval_id); } if (cmd->monitor_timeout_id) { g_source_remove(cmd->monitor_timeout_id); } free(cmd->owner); free(cmd->rsc_id); free(cmd->action); free(cmd->userdata); lrmd_key_value_freeall(cmd->params); free(cmd); } static int generate_callid(void) { static int remote_ra_callid = 0; remote_ra_callid++; if (remote_ra_callid <= 0) { remote_ra_callid = 1; } return remote_ra_callid; } static gboolean recurring_helper(gpointer data) { remote_ra_cmd_t *cmd = data; lrm_state_t *connection_rsc = NULL; cmd->interval_id = 0; connection_rsc = lrm_state_find(cmd->rsc_id); if (connection_rsc && connection_rsc->remote_ra_data) { remote_ra_data_t *ra_data = connection_rsc->remote_ra_data; ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd); cmd->call_id = generate_callid(); ra_data->cmds = g_list_append(ra_data->cmds, cmd); mainloop_set_trigger(ra_data->work); } return FALSE; } static gboolean start_delay_helper(gpointer data) { remote_ra_cmd_t *cmd = data; lrm_state_t *connection_rsc = NULL; cmd->delay_id = 0; connection_rsc = lrm_state_find(cmd->rsc_id); if (connection_rsc && connection_rsc->remote_ra_data) { remote_ra_data_t *ra_data = connection_rsc->remote_ra_data; mainloop_set_trigger(ra_data->work); } return FALSE; } static void report_remote_ra_result(remote_ra_cmd_t * cmd) { lrmd_event_data_t op = { 0, }; op.type = lrmd_event_exec_complete; op.rsc_id = cmd->rsc_id; op.op_type = cmd->action; op.user_data = cmd->userdata; op.timeout = cmd->timeout; op.interval = cmd->interval; op.rc = cmd->rc; op.op_status = cmd->op_status; if (cmd->params) { lrmd_key_value_t *tmp; op.params = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); for (tmp = cmd->params; tmp; tmp = tmp->next) { g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value)); } } op.call_id = cmd->call_id; op.remote_nodename = cmd->owner; lrm_op_callback(&op); if (op.params) { g_hash_table_destroy(op.params); } } static void update_remaining_timeout(remote_ra_cmd_t * cmd) { cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000; } static gboolean retry_start_cmd_cb(gpointer data) { lrm_state_t *lrm_state = data; remote_ra_data_t *ra_data = lrm_state->remote_ra_data; remote_ra_cmd_t *cmd = NULL; int rc = -1; if (!ra_data || !ra_data->cur_cmd) { return FALSE; } cmd = ra_data->cur_cmd; if (safe_str_neq(cmd->action, "start")) { return FALSE; } update_remaining_timeout(cmd); if (cmd->remaining_timeout > 0) { rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout); } if (rc != 0) { cmd->rc = PCMK_OCF_UNKNOWN_ERROR; cmd->op_status = PCMK_LRM_OP_ERROR; report_remote_ra_result(cmd); if (ra_data->cmds) { mainloop_set_trigger(ra_data->work); } ra_data->cur_cmd = NULL; free_cmd(cmd); } else { /* wait for connection event */ } return FALSE; } static gboolean monitor_timeout_cb(gpointer data) { lrm_state_t *lrm_state = NULL; remote_ra_cmd_t *cmd = data; crm_debug("Poke async response timed out for node %s", cmd->rsc_id); cmd->monitor_timeout_id = 0; cmd->op_status = PCMK_LRM_OP_TIMEOUT; cmd->rc = PCMK_OCF_UNKNOWN_ERROR; lrm_state = lrm_state_find(cmd->rsc_id); if (lrm_state && lrm_state->remote_ra_data) { remote_ra_data_t *ra_data = lrm_state->remote_ra_data; if (ra_data->cur_cmd == cmd) { ra_data->cur_cmd = NULL; } if (ra_data->cmds) { mainloop_set_trigger(ra_data->work); } } report_remote_ra_result(cmd); free_cmd(cmd); return FALSE; } +xmlNode * +simple_remote_node_status(const char *node_name, xmlNode * parent, const char *source) +{ + xmlNode *state = create_xml_node(parent, XML_CIB_TAG_STATE); + + crm_xml_add(state, XML_NODE_IS_REMOTE, "true"); + crm_xml_add(state, XML_ATTR_UUID, node_name); + crm_xml_add(state, XML_ATTR_UNAME, node_name); + crm_xml_add(state, XML_ATTR_ORIGIN, source); + + return state; +} + static void -remote_init_cib_status(lrm_state_t *lrm_state) +remote_init_cib_status(const char *node_name) { - xmlNode *update = create_xml_node(NULL, XML_CIB_TAG_STATUS); - xmlNode *state = NULL; int call_id = 0; int call_opt = cib_quorum_override; + xmlNode *update = create_xml_node(NULL, XML_CIB_TAG_STATUS); + + simple_remote_node_status(node_name, update,__FUNCTION__); if (fsa_state == S_ELECTION || fsa_state == S_PENDING) { call_opt |= cib_scope_local; } - state = create_xml_node(update, XML_CIB_TAG_STATE); - - crm_xml_add(state, XML_NODE_IS_REMOTE, "true"); - crm_xml_add(state, XML_ATTR_UUID, lrm_state->node_name); - crm_xml_add(state, XML_ATTR_UNAME, lrm_state->node_name); - crm_xml_add(state, XML_ATTR_ORIGIN, __FUNCTION__); fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL); if (call_id != pcmk_ok) { - crm_debug("Failed to init status section for remote-node %s", lrm_state->node_name); + crm_debug("Failed to init status section for remote-node %s", node_name); } free_xml(update); } void remote_lrm_op_callback(lrmd_event_data_t * op) { lrm_state_t *lrm_state = NULL; remote_ra_data_t *ra_data = NULL; remote_ra_cmd_t *cmd = NULL; crm_debug("remote connection event - event_type:%s node:%s action:%s rc:%s op_status:%s", lrmd_event_type2str(op->type), op->remote_nodename, op->op_type ? op->op_type : "none", services_ocf_exitcode_str(op->rc), services_lrm_status_str(op->op_status)); /* filter all EXEC events up */ if (op->type == lrmd_event_exec_complete) { lrm_op_callback(op); return; } lrm_state = lrm_state_find(op->remote_nodename); if (!lrm_state || !lrm_state->remote_ra_data) { crm_debug("lrm_state info not found for remote lrmd connection event"); return; } ra_data = lrm_state->remote_ra_data; if (!ra_data->cur_cmd) { crm_debug("no event to match"); return; } cmd = ra_data->cur_cmd; /* Start actions and migrate from actions complete after connection * comes back to us. */ if (op->type == lrmd_event_connect && (safe_str_eq(cmd->action, "start") || safe_str_eq(cmd->action, "migrate_from"))) { if (op->connection_rc < 0) { update_remaining_timeout(cmd); /* There isn't much of a reason to reschedule if the timeout is too small */ if (cmd->remaining_timeout > 3000) { crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout); g_timeout_add(1000, retry_start_cmd_cb, lrm_state); return; } else { crm_trace("can't reschedule start, remaining timeout too small %d", cmd->remaining_timeout); } cmd->op_status = PCMK_LRM_OP_TIMEOUT; cmd->rc = PCMK_OCF_UNKNOWN_ERROR; } else { /* make sure we have a clean status section to start with */ - remote_init_cib_status(lrm_state); + remote_init_cib_status(lrm_state->node_name); cmd->rc = PCMK_OCF_OK; cmd->op_status = PCMK_LRM_OP_DONE; } crm_debug("remote lrmd connect event matched %s action. ", cmd->action); report_remote_ra_result(cmd); if (ra_data->cmds) { mainloop_set_trigger(ra_data->work); } ra_data->cur_cmd = NULL; free_cmd(cmd); return; } else if (op->type == lrmd_event_poke && safe_str_eq(cmd->action, "monitor")) { if (cmd->monitor_timeout_id) { g_source_remove(cmd->monitor_timeout_id); cmd->monitor_timeout_id = 0; } /* Only report success the first time, after that only worry about failures. * For this function, if we get the poke pack, it is always a success. Pokes * only fail if the send fails, or the response times out. */ if (!cmd->reported_success) { cmd->rc = PCMK_OCF_OK; cmd->op_status = PCMK_LRM_OP_DONE; report_remote_ra_result(cmd); cmd->reported_success = 1; } crm_debug("remote lrmd poke event matched %s action. ", cmd->action); /* success, keep rescheduling if interval is present. */ if (cmd->interval && (cmd->cancel == FALSE)) { ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd); cmd->interval_id = g_timeout_add(cmd->interval, recurring_helper, cmd); cmd = NULL; /* prevent free */ } if (ra_data->cmds) { mainloop_set_trigger(ra_data->work); } ra_data->cur_cmd = NULL; free_cmd(cmd); return; } crm_debug("Event did not match %s action", ra_data->cur_cmd->action); } static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms) { const char *server = NULL; lrmd_key_value_t *tmp = NULL; int port = 0; int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms; for (tmp = cmd->params; tmp; tmp = tmp->next) { const char *key = tmp->key; /* skip over 'remote-' prefix if it exists */ if (strstr(key, "remote-")) { key += 7; } if (safe_str_eq(tmp->key, "addr") || safe_str_eq(tmp->key, "server")) { server = tmp->value; } if (safe_str_eq(tmp->key, "port")) { port = atoi(tmp->value); } } return lrm_state_remote_connect_async(lrm_state, server, port, timeout_used); } static gboolean handle_remote_ra_exec(gpointer user_data) { int rc = 0; lrm_state_t *lrm_state = user_data; remote_ra_data_t *ra_data = lrm_state->remote_ra_data; remote_ra_cmd_t *cmd; GList *first = NULL; if (ra_data->cur_cmd) { /* still waiting on previous cmd */ return TRUE; } while (ra_data->cmds) { first = ra_data->cmds; cmd = first->data; if (cmd->delay_id) { /* still waiting for start delay timer to trip */ return TRUE; } ra_data->cmds = g_list_remove_link(ra_data->cmds, first); g_list_free_1(first); if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) { xmlNode *status = create_xml_node(NULL, XML_CIB_TAG_STATE); /* clear node status in cib */ crm_xml_add(status, XML_ATTR_ID, lrm_state->node_name); lrm_state_reset_tables(lrm_state); fsa_cib_delete(XML_CIB_TAG_STATUS, status, cib_quorum_override, rc, NULL); crm_info("Forced a remote LRM refresh before connection start: call=%d", rc); crm_log_xml_trace(status, "CLEAR LRM"); free_xml(status); rc = handle_remote_ra_start(lrm_state, cmd, cmd->timeout); if (rc == 0) { /* take care of this later when we get async connection result */ crm_debug("began remote lrmd connect, waiting for connect event."); ra_data->cur_cmd = cmd; return TRUE; } else { crm_debug("connect failed, not expecting to match any connection event later"); cmd->rc = PCMK_OCF_UNKNOWN_ERROR; cmd->op_status = PCMK_LRM_OP_ERROR; } report_remote_ra_result(cmd); } else if (!strcmp(cmd->action, "monitor")) { if (lrm_state_is_connected(lrm_state) == TRUE) { rc = lrm_state_poke_connection(lrm_state); if (rc < 0) { cmd->rc = PCMK_OCF_UNKNOWN_ERROR; cmd->op_status = PCMK_LRM_OP_ERROR; } } else { rc = -1; cmd->op_status = PCMK_LRM_OP_DONE; cmd->rc = PCMK_OCF_NOT_RUNNING; } if (rc == 0) { crm_debug("poked remote lrmd at node %s, waiting for async response.", cmd->rsc_id); ra_data->cur_cmd = cmd; cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd); return TRUE; } report_remote_ra_result(cmd); } else if (!strcmp(cmd->action, "stop")) { lrm_state_disconnect(lrm_state); cmd->rc = PCMK_OCF_OK; cmd->op_status = PCMK_LRM_OP_DONE; if (ra_data->cmds) { g_list_free_full(ra_data->cmds, free_cmd); } if (ra_data->recurring_cmds) { g_list_free_full(ra_data->recurring_cmds, free_cmd); } ra_data->cmds = NULL; ra_data->recurring_cmds = NULL; report_remote_ra_result(cmd); } else if (!strcmp(cmd->action, "migrate_to")) { /* no-op. */ cmd->rc = PCMK_OCF_OK; cmd->op_status = PCMK_LRM_OP_DONE; report_remote_ra_result(cmd); } free_cmd(cmd); } return TRUE; } static void remote_ra_data_init(lrm_state_t * lrm_state) { remote_ra_data_t *ra_data = NULL; if (lrm_state->remote_ra_data) { return; } ra_data = calloc(1, sizeof(remote_ra_data_t)); ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state); lrm_state->remote_ra_data = ra_data; } void remote_ra_cleanup(lrm_state_t * lrm_state) { remote_ra_data_t *ra_data = lrm_state->remote_ra_data; if (!ra_data) { return; } if (ra_data->cmds) { g_list_free_full(ra_data->cmds, free_cmd); } if (ra_data->recurring_cmds) { g_list_free_full(ra_data->recurring_cmds, free_cmd); } mainloop_destroy_trigger(ra_data->work); free(ra_data); lrm_state->remote_ra_data = NULL; } gboolean is_remote_lrmd_ra(const char *agent, const char *provider, const char *id) { if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) { return TRUE; } if (id && lrm_state_find(id)) { return TRUE; } return FALSE; } lrmd_rsc_info_t * remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id) { lrmd_rsc_info_t *info = NULL; if ((lrm_state_find(rsc_id))) { info = calloc(1, sizeof(lrmd_rsc_info_t)); info->id = strdup(rsc_id); info->type = strdup(REMOTE_LRMD_RA); info->class = strdup("ocf"); info->provider = strdup("pacemaker"); } return info; } static gboolean is_remote_ra_supported_action(const char *action) { if (!action) { return FALSE; } else if (strcmp(action, "start") && strcmp(action, "stop") && strcmp(action, "migrate_to") && strcmp(action, "migrate_from") && strcmp(action, "monitor")) { return FALSE; } return TRUE; } static GList * remove_cmd(GList * list, const char *action, int interval) { remote_ra_cmd_t *cmd = NULL; GListPtr gIter = NULL; for (gIter = list; gIter != NULL; gIter = gIter->next) { cmd = gIter->data; if (cmd->interval == interval && safe_str_eq(cmd->action, action)) { break; } cmd = NULL; } if (cmd) { list = g_list_remove(list, cmd); free_cmd(cmd); } return list; } int remote_ra_cancel(lrm_state_t * lrm_state, const char *rsc_id, const char *action, int interval) { lrm_state_t *connection_rsc = NULL; remote_ra_data_t *ra_data = NULL; connection_rsc = lrm_state_find(rsc_id); if (!connection_rsc || !connection_rsc->remote_ra_data) { return -EINVAL; } ra_data = connection_rsc->remote_ra_data; ra_data->cmds = remove_cmd(ra_data->cmds, action, interval); ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action, interval); if (ra_data->cur_cmd && (ra_data->cur_cmd->interval == interval) && (safe_str_eq(ra_data->cur_cmd->action, action))) { ra_data->cur_cmd->cancel = TRUE; } return 0; } int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *action, const char *userdata, int interval, /* ms */ int timeout, /* ms */ int start_delay, /* ms */ lrmd_key_value_t * params) { int rc = 0; lrm_state_t *connection_rsc = NULL; remote_ra_cmd_t *cmd = NULL; remote_ra_data_t *ra_data = NULL; if (is_remote_ra_supported_action(action) == FALSE) { rc = -EINVAL; goto exec_done; } connection_rsc = lrm_state_find(rsc_id); if (!connection_rsc) { rc = -EINVAL; goto exec_done; } remote_ra_data_init(connection_rsc); cmd = calloc(1, sizeof(remote_ra_cmd_t)); cmd->owner = strdup(lrm_state->node_name); cmd->rsc_id = strdup(rsc_id); cmd->action = strdup(action); cmd->userdata = strdup(userdata); cmd->interval = interval; cmd->timeout = timeout; cmd->start_delay = start_delay; cmd->params = params; cmd->start_time = time(NULL); cmd->call_id = generate_callid(); if (cmd->start_delay) { cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd); } ra_data = connection_rsc->remote_ra_data; ra_data->cmds = g_list_append(ra_data->cmds, cmd); mainloop_set_trigger(ra_data->work); return cmd->call_id; exec_done: lrmd_key_value_freeall(params); return rc; } diff --git a/crmd/te_actions.c b/crmd/te_actions.c index ed93eec912..07aaf0f180 100644 --- a/crmd/te_actions.c +++ b/crmd/te_actions.c @@ -1,545 +1,547 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include char *te_uuid = NULL; void send_rsc_command(crm_action_t * action); static void te_start_action_timer(crm_graph_t * graph, crm_action_t * action) { action->timer = calloc(1, sizeof(crm_action_timer_t)); action->timer->timeout = action->timeout; action->timer->reason = timeout_action; action->timer->action = action; action->timer->source_id = g_timeout_add(action->timer->timeout + graph->network_delay, action_timer_callback, (void *)action->timer); CRM_ASSERT(action->timer->source_id != 0); } static gboolean te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo) { crm_debug("Pseudo action %d fired and confirmed", pseudo->id); pseudo->confirmed = TRUE; update_graph(graph, pseudo); trigger_graph(); return TRUE; } void send_stonith_update(crm_action_t * action, const char *target, const char *uuid) { int rc = pcmk_ok; crm_node_t *peer = NULL; /* zero out the node-status & remove all LRM status info */ xmlNode *node_state = NULL; CRM_CHECK(target != NULL, return); CRM_CHECK(uuid != NULL, return); /* Make sure the membership and join caches are accurate */ peer = crm_get_peer_full(0, target, CRM_GET_PEER_CLUSTER | CRM_GET_PEER_REMOTE); + CRM_CHECK(peer != NULL, return); + if (peer->uuid == NULL) { crm_info("Recording uuid '%s' for node '%s'", uuid, target); peer->uuid = strdup(uuid); } crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL); crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0); crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN); crm_update_peer_join(__FUNCTION__, peer, crm_join_none); node_state = do_update_node_cib(peer, node_update_cluster | node_update_peer | node_update_join | node_update_expected, NULL, __FUNCTION__); /* Force our known ID */ crm_xml_add(node_state, XML_ATTR_UUID, uuid); rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state, cib_quorum_override | cib_scope_local | cib_can_create); /* Delay processing the trigger until the update completes */ crm_debug("Sending fencing update %d for %s", rc, target); fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated); /* Make sure it sticks */ /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */ erase_status_tag(target, XML_CIB_TAG_LRM, cib_scope_local); erase_status_tag(target, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); free_xml(node_state); return; } static gboolean te_fence_node(crm_graph_t * graph, crm_action_t * action) { int rc = 0; const char *id = NULL; const char *uuid = NULL; const char *target = NULL; const char *type = NULL; gboolean invalid_action = FALSE; enum stonith_call_options options = st_opt_none; id = ID(action->xml); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); type = crm_meta_value(action->params, "stonith_action"); CRM_CHECK(id != NULL, invalid_action = TRUE); CRM_CHECK(uuid != NULL, invalid_action = TRUE); CRM_CHECK(type != NULL, invalid_action = TRUE); CRM_CHECK(target != NULL, invalid_action = TRUE); if (invalid_action) { crm_log_xml_warn(action->xml, "BadAction"); return FALSE; } crm_notice("Executing %s fencing operation (%s) on %s (timeout=%d)", type, id, target, transition_graph->stonith_timeout); /* Passing NULL means block until we can connect... */ te_connect_stonith(NULL); if (crmd_join_phase_count(crm_join_confirmed) == 1) { options |= st_opt_allow_suicide; } rc = stonith_api->cmds->fence(stonith_api, options, target, type, transition_graph->stonith_timeout / 1000, 0); stonith_api->cmds->register_callback(stonith_api, rc, transition_graph->stonith_timeout / 1000, st_opt_timeout_updates, generate_transition_key(transition_graph->id, action->id, 0, te_uuid), "tengine_stonith_callback", tengine_stonith_callback); return TRUE; } static int get_target_rc(crm_action_t * action) { const char *target_rc_s = crm_meta_value(action->params, XML_ATTR_TE_TARGET_RC); if (target_rc_s != NULL) { return crm_parse_int(target_rc_s, "0"); } return 0; } static gboolean te_crm_command(crm_graph_t * graph, crm_action_t * action) { char *counter = NULL; xmlNode *cmd = NULL; gboolean is_local = FALSE; const char *id = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; const char *router_node = NULL; gboolean rc = TRUE; gboolean no_wait = FALSE; id = ID(action->xml); task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); if (!router_node) { router_node = on_node; } CRM_CHECK(on_node != NULL && strlen(on_node) != 0, crm_err("Corrupted command (id=%s) %s: no node", crm_str(id), crm_str(task)); return FALSE); crm_info("Executing crm-event (%s): %s on %s%s%s", crm_str(id), crm_str(task), on_node, is_local ? " (local)" : "", no_wait ? " - no waiting" : ""); if (safe_str_eq(router_node, fsa_our_uname)) { is_local = TRUE; } value = crm_meta_value(action->params, XML_ATTR_TE_NOWAIT); if (crm_is_true(value)) { no_wait = TRUE; } if (is_local && safe_str_eq(task, CRM_OP_SHUTDOWN)) { /* defer until everything else completes */ crm_info("crm-event (%s) is a local shutdown", crm_str(id)); graph->completion_action = tg_shutdown; graph->abort_reason = "local shutdown"; action->confirmed = TRUE; update_graph(graph, action); trigger_graph(); return TRUE; } else if (safe_str_eq(task, CRM_OP_SHUTDOWN)) { crm_node_t *peer = crm_get_peer(0, router_node); crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN); } cmd = create_request(task, action->xml, router_node, CRM_SYSTEM_CRMD, CRM_SYSTEM_TENGINE, NULL); counter = generate_transition_key(transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(cmd, XML_ATTR_TRANSITION_KEY, counter); rc = send_cluster_message(crm_get_peer(0, router_node), crm_msg_crmd, cmd, TRUE); free(counter); free_xml(cmd); if (rc == FALSE) { crm_err("Action %d failed: send", action->id); return FALSE; } else if (no_wait) { action->confirmed = TRUE; update_graph(graph, action); trigger_graph(); } else { if (action->timeout <= 0) { crm_err("Action %d: %s on %s had an invalid timeout (%dms). Using %dms instead", action->id, task, on_node, action->timeout, graph->network_delay); action->timeout = graph->network_delay; } te_start_action_timer(graph, action); } return TRUE; } gboolean cib_action_update(crm_action_t * action, int status, int op_rc) { lrmd_event_data_t *op = NULL; xmlNode *state = NULL; xmlNode *rsc = NULL; xmlNode *xml_op = NULL; xmlNode *action_rsc = NULL; int rc = pcmk_ok; const char *name = NULL; const char *value = NULL; const char *rsc_id = NULL; const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); const char *target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); int call_options = cib_quorum_override | cib_scope_local; int target_rc = get_target_rc(action); if (status == PCMK_LRM_OP_PENDING) { crm_debug("%s %d: Recording pending operation %s on %s", crm_element_name(action->xml), action->id, task_uuid, target); } else { crm_warn("%s %d: %s on %s timed out", crm_element_name(action->xml), action->id, task_uuid, target); } action_rsc = find_xml_node(action->xml, XML_CIB_TAG_RESOURCE, TRUE); if (action_rsc == NULL) { return FALSE; } rsc_id = ID(action_rsc); CRM_CHECK(rsc_id != NULL, crm_log_xml_err(action->xml, "Bad:action"); return FALSE); /* update the CIB */ state = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(state, XML_ATTR_UUID, target_uuid); crm_xml_add(state, XML_ATTR_UNAME, target); rsc = create_xml_node(state, XML_CIB_TAG_LRM); crm_xml_add(rsc, XML_ATTR_ID, target_uuid); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCES); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCE); crm_xml_add(rsc, XML_ATTR_ID, rsc_id); name = XML_ATTR_TYPE; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); name = XML_AGENT_ATTR_CLASS; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); name = XML_AGENT_ATTR_PROVIDER; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); op = convert_graph_action(NULL, action, status, op_rc); op->call_id = -1; op->user_data = generate_transition_key(transition_graph->id, action->id, target_rc, te_uuid); xml_op = create_operation_update(rsc, op, CRM_FEATURE_SET, target_rc, __FUNCTION__, LOG_INFO); lrmd_free_event(op); crm_trace("Updating CIB with \"%s\" (%s): %s %s on %s", status < 0 ? "new action" : XML_ATTR_TIMEOUT, crm_element_name(action->xml), crm_str(task), rsc_id, target); crm_log_xml_trace(xml_op, "Op"); rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options); crm_trace("Updating CIB with %s action %d: %s on %s (call_id=%d)", services_lrm_status_str(status), action->id, task_uuid, target, rc); fsa_register_cib_callback(rc, FALSE, NULL, cib_action_updated); free_xml(state); action->sent_update = TRUE; if (rc < pcmk_ok) { return FALSE; } return TRUE; } static gboolean te_rsc_command(crm_graph_t * graph, crm_action_t * action) { /* never overwrite stop actions in the CIB with * anything other than completed results * * Writing pending stops makes it look like the * resource is running again */ xmlNode *cmd = NULL; xmlNode *rsc_op = NULL; gboolean rc = TRUE; gboolean no_wait = FALSE; gboolean is_local = FALSE; char *counter = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; const char *router_node = NULL; const char *task_uuid = NULL; CRM_ASSERT(action != NULL); CRM_ASSERT(action->xml != NULL); action->executed = FALSE; on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); CRM_CHECK(on_node != NULL && strlen(on_node) != 0, crm_err("Corrupted command(id=%s) %s: no node", ID(action->xml), crm_str(task)); return FALSE); rsc_op = action->xml; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); router_node = crm_element_value(rsc_op, XML_LRM_ATTR_ROUTER_NODE); if (!router_node) { router_node = on_node; } counter = generate_transition_key(transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(rsc_op, XML_ATTR_TRANSITION_KEY, counter); if (safe_str_eq(router_node, fsa_our_uname)) { is_local = TRUE; } value = crm_meta_value(action->params, XML_ATTR_TE_NOWAIT); if (crm_is_true(value)) { no_wait = TRUE; } crm_notice("Initiating action %d: %s %s on %s%s%s", action->id, task, task_uuid, on_node, is_local ? " (local)" : "", no_wait ? " - no waiting" : ""); cmd = create_request(CRM_OP_INVOKE_LRM, rsc_op, router_node, CRM_SYSTEM_LRMD, CRM_SYSTEM_TENGINE, NULL); if (is_local) { /* shortcut local resource commands */ ha_msg_input_t data = { .msg = cmd, .xml = rsc_op, }; fsa_data_t msg = { .id = 0, .data = &data, .data_type = fsa_dt_ha_msg, .fsa_input = I_NULL, .fsa_cause = C_FSA_INTERNAL, .actions = A_LRM_INVOKE, .origin = __FUNCTION__, }; do_lrm_invoke(A_LRM_INVOKE, C_FSA_INTERNAL, fsa_state, I_NULL, &msg); } else { rc = send_cluster_message(crm_get_peer(0, router_node), crm_msg_lrmd, cmd, TRUE); } free(counter); free_xml(cmd); action->executed = TRUE; if (rc == FALSE) { crm_err("Action %d failed: send", action->id); return FALSE; } else if (no_wait) { crm_info("Action %d confirmed - no wait", action->id); action->confirmed = TRUE; update_graph(transition_graph, action); trigger_graph(); } else { if (action->timeout <= 0) { crm_err("Action %d: %s %s on %s had an invalid timeout (%dms). Using %dms instead", action->id, task, task_uuid, on_node, action->timeout, graph->network_delay); action->timeout = graph->network_delay; } te_start_action_timer(graph, action); } value = crm_meta_value(action->params, XML_OP_ATTR_PENDING); if (crm_is_true(value) && safe_str_neq(task, CRMD_ACTION_CANCEL) && safe_str_neq(task, CRMD_ACTION_DELETE)) { /* write a "pending" entry to the CIB, inhibit notification */ crm_debug("Recording pending op %s in the CIB", task_uuid); cib_action_update(action, PCMK_LRM_OP_PENDING, PCMK_OCF_UNKNOWN); } return TRUE; } crm_graph_functions_t te_graph_fns = { te_pseudo_action, te_rsc_command, te_crm_command, te_fence_node }; void notify_crmd(crm_graph_t * graph) { const char *type = "unknown"; enum crmd_fsa_input event = I_NULL; crm_debug("Processing transition completion in state %s", fsa_state2string(fsa_state)); CRM_CHECK(graph->complete, graph->complete = TRUE); switch (graph->completion_action) { case tg_stop: type = "stop"; /* fall through */ case tg_done: type = "done"; if (fsa_state == S_TRANSITION_ENGINE) { event = I_TE_SUCCESS; } break; case tg_restart: type = "restart"; if (fsa_state == S_TRANSITION_ENGINE) { if (too_many_st_failures() == FALSE) { if (transition_timer->period_ms > 0) { crm_timer_stop(transition_timer); crm_timer_start(transition_timer); } else { event = I_PE_CALC; } } else { event = I_TE_SUCCESS; } } else if (fsa_state == S_POLICY_ENGINE) { register_fsa_action(A_PE_INVOKE); } break; case tg_shutdown: type = "shutdown"; if (is_set(fsa_input_register, R_SHUTDOWN)) { event = I_STOP; } else { crm_err("We didn't ask to be shut down, yet our" " PE is telling us too."); event = I_TERMINATE; } } crm_debug("Transition %d status: %s - %s", graph->id, type, crm_str(graph->abort_reason)); graph->abort_reason = NULL; graph->completion_action = tg_done; clear_bit(fsa_input_register, R_IN_TRANSITION); if (event != I_NULL) { register_fsa_input(C_FSA_INTERNAL, event, NULL); } else if (fsa_source) { mainloop_set_trigger(fsa_source); } } diff --git a/crmd/te_events.c b/crmd/te_events.c index 78d00dde00..8174b98b54 100644 --- a/crmd/te_events.c +++ b/crmd/te_events.c @@ -1,549 +1,607 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include char *failed_stop_offset = NULL; char *failed_start_offset = NULL; int match_graph_event(int action_id, xmlNode * event, const char *event_node, int op_status, int op_rc, int target_rc); gboolean fail_incompletable_actions(crm_graph_t * graph, const char *down_node) { const char *target = NULL; xmlNode *last_action = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; if (graph == NULL || graph->complete) { return FALSE; } gIter = graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; if (synapse->confirmed) { continue; } gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; if (action->type == action_type_pseudo || action->confirmed) { continue; } else if (action->type == action_type_crm) { const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (safe_str_eq(task, CRM_OP_FENCE)) { continue; } } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if (safe_str_eq(target, down_node)) { action->failed = TRUE; synapse->failed = TRUE; last_action = action->xml; stop_te_timer(action->timer); update_graph(graph, action); if (synapse->executed) { crm_notice("Action %d (%s) was pending on %s (offline)", action->id, ID(action->xml), down_node); } else { crm_notice("Action %d (%s) is scheduled for %s (offline)", action->id, ID(action->xml), down_node); } } } } if (last_action != NULL) { crm_warn("Node %s shutdown resulted in un-runnable actions", down_node); abort_transition(INFINITY, tg_restart, "Node failure", last_action); return TRUE; } return FALSE; } static const char * get_uname_from_event(xmlNode * event) { xmlNode *node = event; while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) { node = node->parent; } CRM_CHECK(node != NULL, return NULL); return crm_element_value(node, XML_ATTR_UNAME); } static gboolean get_is_remote_from_event(xmlNode * event) { xmlNode *node = event; while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) { node = node->parent; } CRM_CHECK(node != NULL, return FALSE); return crm_element_value(node, XML_NODE_IS_REMOTE) ? TRUE : FALSE; } static gboolean update_failcount(xmlNode * event, const char *event_node_uuid, int rc, int target_rc, gboolean do_update) { int interval = 0; char *task = NULL; char *rsc_id = NULL; char *attr_name = NULL; const char *value = NULL; const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); const char *on_uname = get_uname_from_event(event); const char *origin = crm_element_value(event, XML_ATTR_ORIGIN); if (rc == 99) { /* this is an internal code for "we're busy, try again" */ return FALSE; } else if (rc == target_rc) { return FALSE; } if (safe_str_eq(origin, "build_active_RAs")) { crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh", id, rc, on_uname); return FALSE; } CRM_LOG_ASSERT(on_uname != NULL); if (on_uname == NULL) { return TRUE; } if (failed_stop_offset == NULL) { failed_stop_offset = strdup(INFINITY_S); } if (failed_start_offset == NULL) { failed_start_offset = strdup(INFINITY_S); } CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval), crm_err("Couldn't parse: %s", ID(event)); goto bail); CRM_CHECK(task != NULL, goto bail); CRM_CHECK(rsc_id != NULL, goto bail); if (do_update || interval > 0) { do_update = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_START)) { do_update = TRUE; value = failed_start_offset; } else if (safe_str_eq(task, CRMD_ACTION_STOP)) { do_update = TRUE; value = failed_stop_offset; } else if (safe_str_eq(task, CRMD_ACTION_STOP)) { do_update = TRUE; value = failed_stop_offset; } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { do_update = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) { do_update = TRUE; } if (value == NULL || safe_str_neq(value, INFINITY_S)) { value = XML_NVPAIR_ATTR_VALUE "++"; } if (do_update) { char *now = crm_itoa(time(NULL)); gboolean is_remote_node = get_is_remote_from_event(event); crm_warn("Updating failcount for %s on %s after failed %s:" " rc=%d (update=%s, time=%s)", rsc_id, on_uname, task, rc, value, now); attr_name = crm_concat("fail-count", rsc_id, '-'); update_attrd(on_uname, attr_name, value, NULL, is_remote_node); free(attr_name); attr_name = crm_concat("last-failure", rsc_id, '-'); update_attrd(on_uname, attr_name, now, NULL, is_remote_node); free(attr_name); free(now); } bail: free(rsc_id); free(task); return TRUE; } static int status_from_rc(crm_action_t * action, int orig_status, int rc, int target_rc) { int status = orig_status; if (target_rc == rc) { crm_trace("Target rc: == %d", rc); if (status != PCMK_LRM_OP_DONE) { crm_trace("Re-mapping op status to" " PCMK_LRM_OP_DONE for rc=%d", rc); status = PCMK_LRM_OP_DONE; } } else { status = PCMK_LRM_OP_ERROR; } /* 99 is the code we use for direct nack's */ if (rc != 99 && status != PCMK_LRM_OP_DONE) { const char *task, *uname; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); crm_warn("Action %d (%s) on %s failed (target: %d vs. rc: %d): %s", action->id, task, uname, target_rc, rc, services_lrm_status_str(status)); } return status; } +static void +process_remote_node_action(crm_action_t *action, xmlNode *event) +{ + xmlNode *child = NULL; + + /* The whole point of this function is to detect when a remote-node + * is integrated into the cluster, and abort the transition if that remote-node + * was fenced earlier in the transition. This allows a new transition to be + * generated so resources can be placed on the new node. + */ + + if (crm_remote_peer_cache_size() == 0) { + return; + } else if (action->type != action_type_rsc) { + return; + } else if (action->failed || action->confirmed == FALSE) { + return; + } else if (safe_str_neq(crm_element_value(action->xml, XML_LRM_ATTR_TASK), "start")) { + return; + } + + for (child = __xml_first_child(action->xml); child != NULL; child = __xml_next(child)) { + const char *provider; + const char *type; + const char *rsc; + crm_node_t *remote_peer; + + if (safe_str_neq(crm_element_name(child), XML_CIB_TAG_RESOURCE)) { + continue; + } + + provider = crm_element_value(child, XML_AGENT_ATTR_PROVIDER); + type = crm_element_value(child, XML_ATTR_TYPE); + rsc = ID(child); + + if (safe_str_neq(provider, "pacemaker") || safe_str_neq(type, "remote") || rsc == NULL) { + break; + } + + remote_peer = crm_get_peer_full(0, rsc, CRM_GET_PEER_REMOTE); + if (remote_peer == NULL) { + break; + } + + /* A remote node will be placed in the "lost" state after + * it has been successfully fenced. After successfully connecting + * to a remote-node after being fenced, we need to abort the transition + * so resources can be placed on the newly integrated remote-node */ + if (safe_str_eq(remote_peer->state, CRM_NODE_LOST)) { + abort_transition(INFINITY, tg_restart, "Remote-node re-discovered.", event); + } + + return; + } +} + /* * returns the ID of the action if a match is found * returns -1 if a match was not found * returns -2 if a match was found but the action failed (and was * not allowed to) */ int match_graph_event(int action_id, xmlNode * event, const char *event_node, int op_status, int op_rc, int target_rc) { const char *target = NULL; const char *allow_fail = NULL; const char *this_event = NULL; crm_action_t *action = NULL; action = get_action(action_id, FALSE); if (action == NULL) { return -1; } op_status = status_from_rc(action, op_status, op_rc, target_rc); if (op_status != PCMK_LRM_OP_DONE) { update_failcount(event, event_node, op_rc, target_rc, FALSE); } /* Process OP status */ switch (op_status) { case PCMK_LRM_OP_PENDING: crm_debug("Ignoring pending operation"); return action->id; break; case PCMK_LRM_OP_DONE: break; case PCMK_LRM_OP_ERROR: case PCMK_LRM_OP_TIMEOUT: case PCMK_LRM_OP_NOTSUPPORTED: action->failed = TRUE; break; case PCMK_LRM_OP_CANCELLED: /* do nothing?? */ crm_err("Dont know what to do for cancelled ops yet"); break; default: action->failed = TRUE; crm_err("Unsupported action result: %d", op_status); } /* stop this event's timer if it had one */ stop_te_timer(action->timer); action->confirmed = TRUE; update_graph(transition_graph, action); trigger_graph(); if (action->failed) { allow_fail = crm_meta_value(action->params, XML_ATTR_TE_ALLOWFAIL); if (crm_is_true(allow_fail)) { action->failed = FALSE; } } if (action->failed) { abort_transition(action->synapse->priority + 1, tg_restart, "Event failed", event); } this_event = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); crm_info("Action %s (%d) confirmed on %s (rc=%d)", crm_str(this_event), action->id, crm_str(target), op_status); + /* determine if this action affects a remote-node's online/offline status */ + process_remote_node_action(action, event); return action->id; } crm_action_t * get_action(int id, gboolean confirmed) { GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; if (action->id == id) { if (confirmed) { stop_te_timer(action->timer); action->confirmed = TRUE; } return action; } } } return NULL; } crm_action_t * get_cancel_action(const char *id, const char *node) { const char *task = NULL; const char *target = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (safe_str_neq(CRMD_ACTION_CANCEL, task)) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); if (safe_str_neq(task, id)) { continue; } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if (safe_str_neq(target, node)) { continue; } return action; } } return NULL; } crm_action_t * match_down_event(int id, const char *target, const char *filter, bool quiet) { const char *this_action = NULL; const char *this_node = NULL; crm_action_t *match = NULL; GListPtr gIter = NULL; GListPtr gIter2 = NULL; gIter = transition_graph->synapses; for (; gIter != NULL; gIter = gIter->next) { synapse_t *synapse = (synapse_t *) gIter->data; /* lookup event */ gIter2 = synapse->actions; for (; gIter2 != NULL; gIter2 = gIter2->next) { crm_action_t *action = (crm_action_t *) gIter2->data; if (id > 0 && action->id == id) { match = action; break; } this_action = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (action->type != action_type_crm) { continue; } else if (safe_str_eq(this_action, CRM_OP_LRM_REFRESH)) { continue; } else if (filter != NULL && safe_str_neq(this_action, filter)) { continue; } this_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); if (this_node == NULL) { crm_log_xml_err(action->xml, "No node uuid"); } if (safe_str_neq(this_node, target)) { crm_debug("Action %d : Node mismatch: %s", action->id, this_node); continue; } match = action; id = action->id; break; } if (match != NULL) { /* stop this event's timer if it had one */ break; } } if (match != NULL) { /* stop this event's timer if it had one */ crm_debug("Match found for action %d: %s on %s", id, crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target); } else if (id > 0) { crm_err("No match for action %d", id); } else if(quiet == FALSE) { crm_warn("No match for shutdown action on %s", target); } return match; } gboolean process_graph_event(xmlNode * event, const char *event_node) { int rc = -1; int status = -1; int callid = -1; int action = -1; int target_rc = -1; int transition_num = -1; char *update_te_uuid = NULL; gboolean stop_early = FALSE; gboolean passed = FALSE; const char *id = NULL; const char *desc = NULL; const char *magic = NULL; CRM_ASSERT(event != NULL); /* */ id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); crm_element_value_int(event, XML_LRM_ATTR_RC, &rc); crm_element_value_int(event, XML_LRM_ATTR_OPSTATUS, &status); crm_element_value_int(event, XML_LRM_ATTR_CALLID, &callid); magic = crm_element_value(event, XML_ATTR_TRANSITION_KEY); if (magic == NULL) { /* non-change */ return FALSE; } if (decode_transition_key(magic, &update_te_uuid, &transition_num, &action, &target_rc) == FALSE) { crm_err("Invalid event %s.%d detected: %s", id, callid, magic); abort_transition(INFINITY, tg_restart, "Bad event", event); return FALSE; } if (status == PCMK_LRM_OP_PENDING) { goto bail; } if (transition_num == -1) { desc = "initiated outside of the cluster"; abort_transition(INFINITY, tg_restart, "Unexpected event", event); } else if (action < 0 || crm_str_eq(update_te_uuid, te_uuid, TRUE) == FALSE) { desc = "initiated by a different node"; abort_transition(INFINITY, tg_restart, "Foreign event", event); stop_early = TRUE; /* This could be an lrm status refresh */ } else if (transition_graph->id != transition_num) { desc = "arrived really late"; abort_transition(INFINITY, tg_restart, "Old event", event); stop_early = TRUE; /* This could be an lrm status refresh */ } else if (transition_graph->complete) { desc = "arrived late"; abort_transition(INFINITY, tg_restart, "Inactive graph", event); } else if (match_graph_event(action, event, event_node, status, rc, target_rc) < 0) { desc = "unknown"; abort_transition(INFINITY, tg_restart, "Unknown event", event); } else if (rc == target_rc) { passed = TRUE; crm_trace("Processed update to %s: %s", id, magic); } if (passed == FALSE) { if (update_failcount(event, event_node, rc, target_rc, transition_num == -1)) { /* Turns out this wasn't an lrm status refresh update aferall */ stop_early = FALSE; desc = "failed"; } crm_info("Detected action (%d.%d) %s.%d=%s: %s", transition_num, action, id, callid, services_ocf_exitcode_str(rc), desc); } bail: free(update_te_uuid); return stop_early; } diff --git a/crmd/te_utils.c b/crmd/te_utils.c index 239af6335e..b3a543a77c 100644 --- a/crmd/te_utils.c +++ b/crmd/te_utils.c @@ -1,465 +1,464 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include crm_trigger_t *stonith_reconnect = NULL; GListPtr stonith_cleanup_list = NULL; static gboolean fail_incompletable_stonith(crm_graph_t * graph) { GListPtr lpc = NULL; const char *task = NULL; xmlNode *last_action = NULL; if (graph == NULL) { return FALSE; } for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { GListPtr lpc2 = NULL; synapse_t *synapse = (synapse_t *) lpc->data; if (synapse->confirmed) { continue; } for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) { crm_action_t *action = (crm_action_t *) lpc2->data; if (action->type != action_type_crm || action->confirmed) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (task && safe_str_eq(task, CRM_OP_FENCE)) { action->failed = TRUE; last_action = action->xml; update_graph(graph, action); crm_notice("Failing action %d (%s): STONITHd terminated", action->id, ID(action->xml)); } } } if (last_action != NULL) { crm_warn("STONITHd failure resulted in un-runnable actions"); abort_transition(INFINITY, tg_restart, "Stonith failure", last_action); return TRUE; } return FALSE; } static void tengine_stonith_connection_destroy(stonith_t * st, stonith_event_t * e) { if (is_set(fsa_input_register, R_ST_REQUIRED)) { crm_crit("Fencing daemon connection failed"); mainloop_set_trigger(stonith_reconnect); } else { crm_info("Fencing daemon disconnected"); } /* cbchan will be garbage at this point, arrange for it to be reset */ if(stonith_api) { stonith_api->state = stonith_disconnected; } if (AM_I_DC) { fail_incompletable_stonith(transition_graph); trigger_graph(); } } #if SUPPORT_CMAN # include #endif char *te_client_id = NULL; #ifdef HAVE_SYS_REBOOT_H # include # include #endif static void tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event) { if(te_client_id == NULL) { te_client_id = g_strdup_printf("%s.%d", crm_system_name, getpid()); } if (st_event == NULL) { crm_err("Notify data not found"); return; } if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) { crm_crit("We were alegedly just fenced by %s for %s with %s!", st_event->executioner, st_event->origin, st_event->device); /* Dumps blackbox if enabled */ qb_log_fini(); /* Try to get the above log message to disk - somehow */ /* Get out ASAP and do not come back up. * * Triggering a reboot is also not the worst idea either since * the rest of the cluster thinks we're safely down */ #ifdef RB_HALT_SYSTEM reboot(RB_HALT_SYSTEM); #endif /* * If reboot() fails or is not supported, coming back up will * probably lead to a situation where the other nodes set our * status to 'lost' because of the fencing callback and will * discard subsequent election votes with: * * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster) * * So just stay dead, something is seriously messed up anyway. * */ exit(100); /* None of our wrappers since we already called qb_log_fini() */ return; } if (st_event->result == pcmk_ok && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) { st_fail_count_reset(st_event->target); } crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s", st_event->target, st_event->result == pcmk_ok ? "" : " not", st_event->action, st_event->executioner ? st_event->executioner : "", st_event->origin, pcmk_strerror(st_event->result), st_event->id, st_event->client_origin ? st_event->client_origin : ""); #if SUPPORT_CMAN if (st_event->result == pcmk_ok && is_cman_cluster()) { int local_rc = 0; char *target_copy = strdup(st_event->target); /* In case fenced hasn't noticed yet * * Any fencing that has been inititated will be completed by way of the fence_pcmk redirect */ local_rc = fenced_external(target_copy); if (local_rc != 0) { crm_err("Could not notify CMAN that '%s' is now fenced: %d", st_event->target, local_rc); } else { crm_notice("Notified CMAN that '%s' is now fenced", st_event->target); } free(target_copy); } #endif if (st_event->result == pcmk_ok) { - crm_node_t *peer = crm_get_peer(0, st_event->target); + crm_node_t *peer = crm_get_peer_full(0, st_event->target, CRM_GET_PEER_REMOTE | CRM_GET_PEER_CLUSTER); const char *uuid = crm_peer_uuid(peer); gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname); crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); if(AM_I_DC) { /* The DC always sends updates */ send_stonith_update(NULL, st_event->target, uuid); if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) { /* Abort the current transition graph if it wasn't us * that invoked stonith to fence someone */ crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); } /* Assume it was our leader if we dont currently have one */ } else if (fsa_our_dc == NULL || safe_str_eq(fsa_our_dc, st_event->target)) { crm_notice("Target %s our leader %s (recorded: %s)", fsa_our_dc ? "was" : "may have been", st_event->target, fsa_our_dc ? fsa_our_dc : ""); /* Given the CIB resyncing that occurs around elections, * have one node update the CIB now and, if the new DC is different, * have them do so too after the election */ if (we_are_executioner) { send_stonith_update(NULL, st_event->target, uuid); } stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(st_event->target)); } - /* Everyone records them as safely down */ crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL); crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0); crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN); crm_update_peer_join(__FUNCTION__, peer, crm_join_none); } } gboolean te_connect_stonith(gpointer user_data) { int lpc = 0; int rc = pcmk_ok; if (stonith_api == NULL) { stonith_api = stonith_api_new(); } if (stonith_api->state != stonith_disconnected) { crm_trace("Still connected"); return TRUE; } for (lpc = 0; lpc < 30; lpc++) { crm_debug("Attempting connection to fencing daemon..."); sleep(1); rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); if (rc == pcmk_ok) { break; } if (user_data != NULL) { crm_err("Sign-in failed: triggered a retry"); mainloop_set_trigger(stonith_reconnect); return TRUE; } crm_err("Sign-in failed: pausing and trying again in 2s..."); sleep(1); } CRM_CHECK(rc == pcmk_ok, return TRUE); /* If not, we failed 30 times... just get out */ stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT, tengine_stonith_connection_destroy); stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE, tengine_stonith_notify); crm_trace("Connected"); return TRUE; } gboolean stop_te_timer(crm_action_timer_t * timer) { const char *timer_desc = "action timer"; if (timer == NULL) { return FALSE; } if (timer->reason == timeout_abort) { timer_desc = "global timer"; crm_trace("Stopping %s", timer_desc); } if (timer->source_id != 0) { crm_trace("Stopping %s", timer_desc); g_source_remove(timer->source_id); timer->source_id = 0; } else { crm_trace("%s was already stopped", timer_desc); return FALSE; } return TRUE; } gboolean te_graph_trigger(gpointer user_data) { enum transition_status graph_rc = -1; if (transition_graph == NULL) { crm_debug("Nothing to do"); return TRUE; } crm_trace("Invoking graph %d in state %s", transition_graph->id, fsa_state2string(fsa_state)); switch (fsa_state) { case S_STARTING: case S_PENDING: case S_NOT_DC: case S_HALT: case S_ILLEGAL: case S_STOPPING: case S_TERMINATE: return TRUE; break; default: break; } if (transition_graph->complete == FALSE) { graph_rc = run_graph(transition_graph); print_graph(LOG_DEBUG_3, transition_graph); if (graph_rc == transition_active) { crm_trace("Transition not yet complete"); return TRUE; } else if (graph_rc == transition_pending) { crm_trace("Transition not yet complete - no actions fired"); return TRUE; } if (graph_rc != transition_complete) { crm_warn("Transition failed: %s", transition_status(graph_rc)); print_graph(LOG_NOTICE, transition_graph); } } crm_debug("Transition %d is now complete", transition_graph->id); transition_graph->complete = TRUE; notify_crmd(transition_graph); return TRUE; } void trigger_graph_processing(const char *fn, int line) { crm_trace("%s:%d - Triggered graph processing", fn, line); mainloop_set_trigger(transition_trigger); } void abort_transition_graph(int abort_priority, enum transition_action abort_action, const char *abort_text, xmlNode * reason, const char *fn, int line) { const char *magic = NULL; CRM_CHECK(transition_graph != NULL, return); if (reason) { int diff_add_updates = 0; int diff_add_epoch = 0; int diff_add_admin_epoch = 0; int diff_del_updates = 0; int diff_del_epoch = 0; int diff_del_admin_epoch = 0; const char *uname = ""; xmlNode *search = reason; xmlNode *diff = get_xpath_object("//" F_CIB_UPDATE_RESULT "//diff", reason, LOG_DEBUG_2); magic = crm_element_value(reason, XML_ATTR_TRANSITION_MAGIC); while(search) { const char *kind = TYPE(search); if (safe_str_eq(XML_CIB_TAG_STATE, kind) || safe_str_eq(XML_CIB_TAG_NODE, kind)) { uname = crm_peer_uname(ID(search)); break; } search = search->parent; } if (diff) { cib_diff_version_details(diff, &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates, &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates); if (crm_str_eq(TYPE(reason), XML_CIB_TAG_NVPAIR, TRUE)) { crm_info ("%s:%d - Triggered transition abort (complete=%d, node=%s, tag=%s, id=%s, name=%s, value=%s, magic=%s, cib=%d.%d.%d) : %s", fn, line, transition_graph->complete, uname, TYPE(reason), ID(reason), NAME(reason), VALUE(reason), magic ? magic : "NA", diff_add_admin_epoch, diff_add_epoch, diff_add_updates, abort_text); } else { crm_info ("%s:%d - Triggered transition abort (complete=%d, node=%s, tag=%s, id=%s, magic=%s, cib=%d.%d.%d) : %s", fn, line, transition_graph->complete, uname, TYPE(reason), ID(reason), magic ? magic : "NA", diff_add_admin_epoch, diff_add_epoch, diff_add_updates, abort_text); } } else { crm_info ("%s:%d - Triggered transition abort (complete=%d, node=%s, tag=%s, id=%s, magic=%s) : %s", fn, line, transition_graph->complete, uname, TYPE(reason), ID(reason), magic ? magic : "NA", abort_text); } } else { crm_info("%s:%d - Triggered transition abort (complete=%d) : %s", fn, line, transition_graph->complete, abort_text); } switch (fsa_state) { case S_STARTING: case S_PENDING: case S_NOT_DC: case S_HALT: case S_ILLEGAL: case S_STOPPING: case S_TERMINATE: crm_info("Abort suppressed: state=%s (complete=%d)", fsa_state2string(fsa_state), transition_graph->complete); return; default: break; } if (magic == NULL && reason != NULL) { crm_log_xml_debug(reason, "Cause"); } /* Make sure any queued calculations are discarded ASAP */ free(fsa_pe_ref); fsa_pe_ref = NULL; if (transition_graph->complete) { if (transition_timer->period_ms > 0) { crm_timer_stop(transition_timer); crm_timer_start(transition_timer); } else { register_fsa_input(C_FSA_INTERNAL, I_PE_CALC, NULL); } return; } update_abort_priority(transition_graph, abort_priority, abort_action, abort_text); mainloop_set_trigger(transition_trigger); } diff --git a/include/crm/cluster.h b/include/crm/cluster.h index 001d8e4180..3a2c2f9be0 100644 --- a/include/crm/cluster.h +++ b/include/crm/cluster.h @@ -1,227 +1,230 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM_COMMON_CLUSTER__H # define CRM_COMMON_CLUSTER__H # include # include # if SUPPORT_HEARTBEAT # include # include # endif # if SUPPORT_COROSYNC # include # endif extern gboolean crm_have_quorum; extern GHashTable *crm_peer_cache; extern GHashTable *crm_remote_peer_cache; extern unsigned long long crm_peer_seq; # ifndef CRM_SERVICE # define CRM_SERVICE PCMK_SERVICE_ID # endif /* *INDENT-OFF* */ #define CRM_NODE_LOST "lost" #define CRM_NODE_MEMBER "member" #define CRM_NODE_ACTIVE CRM_NODE_MEMBER #define CRM_NODE_EVICTED "evicted" enum crm_join_phase { crm_join_nack = -1, crm_join_none = 0, crm_join_welcomed = 1, crm_join_integrated = 2, crm_join_finalized = 3, crm_join_confirmed = 4, }; enum crm_node_flags { /* node is not a cluster node and should not be considered for cluster membership */ crm_remote_node = 0x0001, /* This node is a remote node living within a container resource */ crm_remote_container = 0x0002, /* This node is a bare metal remote-node */ crm_remote_baremetal = 0x0004, }; /* *INDENT-ON* */ typedef struct crm_peer_node_s { uint32_t id; /* Only used by corosync derivatives */ uint64_t born; /* Only used by heartbeat and the legacy plugin */ uint64_t last_seen; uint64_t flags; /* Specified by crm_node_flags enum */ int32_t votes; /* Only used by the legacy plugin */ uint32_t processes; enum crm_join_phase join; char *uname; char *uuid; char *state; char *expected; char *addr; /* Only used by the legacy plugin */ char *version; /* Unused */ } crm_node_t; void crm_peer_init(void); void crm_peer_destroy(void); typedef struct crm_cluster_s { char *uuid; char *uname; uint32_t nodeid; void (*destroy) (gpointer); # if SUPPORT_HEARTBEAT ll_cluster_t *hb_conn; void (*hb_dispatch) (HA_Message * msg, void *private); # endif # if SUPPORT_COROSYNC struct cpg_name group; cpg_callbacks_t cpg; cpg_handle_t cpg_handle; # endif } crm_cluster_t; gboolean crm_cluster_connect(crm_cluster_t * cluster); void crm_cluster_disconnect(crm_cluster_t * cluster); /* *INDENT-OFF* */ enum crm_ais_msg_class { crm_class_cluster = 0, crm_class_members = 1, crm_class_notify = 2, crm_class_nodeid = 3, crm_class_rmpeer = 4, crm_class_quorum = 5, }; /* order here matters - its used to index into the crm_children array */ enum crm_ais_msg_types { crm_msg_none = 0, crm_msg_ais = 1, crm_msg_lrmd = 2, crm_msg_cib = 3, crm_msg_crmd = 4, crm_msg_attrd = 5, crm_msg_stonithd = 6, crm_msg_te = 7, crm_msg_pe = 8, crm_msg_stonith_ng = 9, }; /* used with crm_get_peer_full */ enum crm_get_peer_flags { CRM_GET_PEER_CLUSTER = 0x0001, CRM_GET_PEER_REMOTE = 0x0002, }; /* *INDENT-ON* */ gboolean send_cluster_message(crm_node_t * node, enum crm_ais_msg_types service, xmlNode * data, gboolean ordered); +int crm_remote_peer_cache_size(void); + /* Initialize and refresh the remote peer cache from a cib config */ void crm_remote_peer_cache_refresh(xmlNode *cib); /* allows filtering of remote and cluster nodes using crm_get_peer_flags */ crm_node_t *crm_get_peer_full(unsigned int id, const char *uname, int flags); /* only searches cluster nodes */ crm_node_t *crm_get_peer(unsigned int id, const char *uname); guint crm_active_peers(void); gboolean crm_is_peer_active(const crm_node_t * node); guint reap_crm_member(uint32_t id, const char *name); int crm_terminate_member(int nodeid, const char *uname, void *unused); int crm_terminate_member_no_mainloop(int nodeid, const char *uname, int *connection); # if SUPPORT_HEARTBEAT gboolean crm_is_heartbeat_peer_active(const crm_node_t * node); # endif # if SUPPORT_COROSYNC extern int ais_fd_sync; uint32_t get_local_nodeid(cpg_handle_t handle); gboolean cluster_connect_cpg(crm_cluster_t *cluster); void cluster_disconnect_cpg(crm_cluster_t * cluster); void pcmk_cpg_membership(cpg_handle_t handle, const struct cpg_name *groupName, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries); gboolean crm_is_corosync_peer_active(const crm_node_t * node); gboolean send_cluster_text(int class, const char *data, gboolean local, crm_node_t * node, enum crm_ais_msg_types dest); # endif const char *crm_peer_uuid(crm_node_t *node); const char *crm_peer_uname(const char *uuid); void set_uuid(xmlNode *xml, const char *attr, crm_node_t *node); enum crm_status_type { crm_status_uname, crm_status_nstate, crm_status_processes, + crm_status_rstate, /* remote node state */ }; enum crm_ais_msg_types text2msg_type(const char *text); void crm_set_status_callback(void (*dispatch) (enum crm_status_type, crm_node_t *, const void *)); /* *INDENT-OFF* */ enum cluster_type_e { pcmk_cluster_unknown = 0x0001, pcmk_cluster_invalid = 0x0002, pcmk_cluster_heartbeat = 0x0004, pcmk_cluster_classic_ais = 0x0010, pcmk_cluster_corosync = 0x0020, pcmk_cluster_cman = 0x0040, }; /* *INDENT-ON* */ enum cluster_type_e get_cluster_type(void); const char *name_for_cluster_type(enum cluster_type_e type); gboolean is_corosync_cluster(void); gboolean is_cman_cluster(void); gboolean is_openais_cluster(void); gboolean is_classic_ais_cluster(void); gboolean is_heartbeat_cluster(void); const char *get_local_node_name(void); char *get_node_name(uint32_t nodeid); # if SUPPORT_COROSYNC char *pcmk_message_common_cs(cpg_handle_t handle, uint32_t nodeid, uint32_t pid, void *msg, uint32_t *kind, const char **from); # endif #endif diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c index e8f9dbdc0c..6555661be3 100644 --- a/lib/cluster/membership.c +++ b/lib/cluster/membership.c @@ -1,590 +1,610 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #ifndef _GNU_SOURCE # define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include #include #include GHashTable *crm_peer_cache = NULL; GHashTable *crm_remote_peer_cache = NULL; unsigned long long crm_peer_seq = 0; gboolean crm_have_quorum = FALSE; +int +crm_remote_peer_cache_size(void) +{ + if (crm_remote_peer_cache == NULL) { + return 0; + } + return g_hash_table_size(crm_remote_peer_cache); +} + static void remote_cache_refresh_helper(xmlNode *cib, const char *xpath, const char *field, int flags) { const char *remote = NULL; crm_node_t *node = NULL; xmlXPathObjectPtr xpathObj = NULL; int max = 0; int lpc = 0; xpathObj = xpath_search(cib, xpath); max = numXpathResults(xpathObj); for (lpc = 0; lpc < max; lpc++) { xmlNode *xml = getXpathResult(xpathObj, lpc); CRM_CHECK(xml != NULL, continue); remote = crm_element_value(xml, field); if (remote) { crm_trace("added %s to remote cache", remote); node = calloc(1, sizeof(crm_node_t)); node->flags = flags; CRM_ASSERT(node); node->uname = strdup(remote); node->uuid = strdup(remote); + node->state = strdup(CRM_NODE_MEMBER); g_hash_table_replace(crm_remote_peer_cache, node->uname, node); } } freeXpathObject(xpathObj); } void crm_remote_peer_cache_refresh(xmlNode *cib) { const char *xpath = NULL; g_hash_table_remove_all(crm_remote_peer_cache); /* remote nodes associated with a cluster resource */ xpath = "//" XML_TAG_CIB "//" XML_CIB_TAG_CONFIGURATION "//" XML_CIB_TAG_RESOURCE "//" XML_TAG_META_SETS "//" XML_CIB_TAG_NVPAIR "[@name='remote-node']"; remote_cache_refresh_helper(cib, xpath, "value", crm_remote_node | crm_remote_container); /* baremetal nodes defined by connection resources*/ xpath = "//" XML_TAG_CIB "//" XML_CIB_TAG_CONFIGURATION "//" XML_CIB_TAG_RESOURCE "[@type='remote'][@provider='pacemaker']"; remote_cache_refresh_helper(cib, xpath, "id", crm_remote_node | crm_remote_baremetal); /* baremetal nodes we have seen in the config that may or may not have connection * resources associated with them anymore */ xpath = "//" XML_TAG_CIB "//" XML_CIB_TAG_STATUS "//" XML_CIB_TAG_STATE "[@remote_node='true']"; remote_cache_refresh_helper(cib, xpath, "id", crm_remote_node | crm_remote_baremetal); } gboolean crm_is_peer_active(const crm_node_t * node) { if(node == NULL) { return FALSE; } + + if (is_set(node->flags, crm_remote_node)) { + /* remote nodes are never considered active members. This + * guarantees they will never be considered for DC membership.*/ + return FALSE; + } #if SUPPORT_COROSYNC if (is_openais_cluster()) { return crm_is_corosync_peer_active(node); } #endif #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { return crm_is_heartbeat_peer_active(node); } #endif crm_err("Unhandled cluster type: %s", name_for_cluster_type(get_cluster_type())); return FALSE; } static gboolean crm_reap_dead_member(gpointer key, gpointer value, gpointer user_data) { crm_node_t *node = value; crm_node_t *search = user_data; if (search == NULL) { return FALSE; } else if (search->id && node->id != search->id) { return FALSE; } else if (search->id == 0 && safe_str_neq(node->uname, search->uname)) { return FALSE; } else if (crm_is_peer_active(value) == FALSE) { crm_notice("Removing %s/%u from the membership list", node->uname, node->id); return TRUE; } return FALSE; } guint reap_crm_member(uint32_t id, const char *name) { int matches = 0; crm_node_t search; if (crm_peer_cache == NULL) { crm_trace("Nothing to do, cache not initialized"); return 0; } search.id = id; search.uname = strdup(name); matches = g_hash_table_foreach_remove(crm_peer_cache, crm_reap_dead_member, &search); if(matches) { crm_notice("Purged %d peers with id=%u and/or uname=%s from the membership cache", matches, id, name); } else { crm_info("No peers with id=%u and/or uname=%s exist", id, name); } free(search.uname); return matches; } static void crm_count_peer(gpointer key, gpointer value, gpointer user_data) { guint *count = user_data; crm_node_t *node = value; if (crm_is_peer_active(node)) { *count = *count + 1; } } guint crm_active_peers(void) { guint count = 0; if (crm_peer_cache) { g_hash_table_foreach(crm_peer_cache, crm_count_peer, &count); } return count; } static void destroy_crm_node(gpointer data) { crm_node_t *node = data; crm_trace("Destroying entry for node %u", node->id); free(node->addr); free(node->uname); free(node->state); free(node->uuid); free(node->expected); free(node); } void crm_peer_init(void) { if (crm_peer_cache == NULL) { crm_peer_cache = g_hash_table_new_full(crm_str_hash, g_str_equal, free, destroy_crm_node); } if (crm_remote_peer_cache == NULL) { crm_remote_peer_cache = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, destroy_crm_node); } } void crm_peer_destroy(void) { if (crm_peer_cache != NULL) { crm_trace("Destroying peer cache with %d members", g_hash_table_size(crm_peer_cache)); g_hash_table_destroy(crm_peer_cache); crm_peer_cache = NULL; } if (crm_remote_peer_cache != NULL) { crm_trace("Destroying remote peer cache with %d members", g_hash_table_size(crm_remote_peer_cache)); g_hash_table_destroy(crm_remote_peer_cache); crm_remote_peer_cache = NULL; } } void (*crm_status_callback) (enum crm_status_type, crm_node_t *, const void *) = NULL; void crm_set_status_callback(void (*dispatch) (enum crm_status_type, crm_node_t *, const void *)) { crm_status_callback = dispatch; } static void crm_dump_peer_hash(int level, const char *caller) { GHashTableIter iter; const char *id = NULL; crm_node_t *node = NULL; g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, (gpointer *) &id, (gpointer *) &node)) { do_crm_log(level, "%s: Node %u/%s = %p - %s", caller, node->id, node->uname, node, id); } } static gboolean crm_hash_find_by_data(gpointer key, gpointer value, gpointer user_data) { if(value == user_data) { return TRUE; } return FALSE; } crm_node_t * crm_get_peer_full(unsigned int id, const char *uname, int flags) { crm_node_t *node = NULL; CRM_ASSERT(id > 0 || uname != NULL); crm_peer_init(); if (flags & CRM_GET_PEER_REMOTE) { node = g_hash_table_lookup(crm_remote_peer_cache, uname); } if (node == NULL && (flags & CRM_GET_PEER_CLUSTER)) { node = crm_get_peer(id, uname); } return node; } /* coverity[-alloc] Memory is referenced in one or both hashtables */ crm_node_t * crm_get_peer(unsigned int id, const char *uname) { GHashTableIter iter; crm_node_t *node = NULL; crm_node_t *by_id = NULL; crm_node_t *by_name = NULL; CRM_ASSERT(id > 0 || uname != NULL); crm_peer_init(); if (uname != NULL) { g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if(node->uname && strcasecmp(node->uname, uname) == 0) { crm_trace("Name match: %s = %p", node->uname, node); by_name = node; break; } } } if (id > 0) { g_hash_table_iter_init(&iter, crm_peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if(node->id == id) { crm_trace("ID match: %u = %p", node->id, node); by_id = node; break; } } } node = by_id; /* Good default */ if(by_id == by_name) { /* Nothing to do if they match (both NULL counts) */ crm_trace("Consistent: %p for %u/%s", by_id, id, uname); } else if(by_id == NULL && by_name) { crm_trace("Only one: %p for %u/%s", by_name, id, uname); if(id && by_name->id) { crm_dump_peer_hash(LOG_WARNING, __FUNCTION__); crm_crit("Node %u and %u share the same name '%s'", id, by_name->id, uname); node = NULL; /* Create a new one */ } else { node = by_name; } } else if(by_name == NULL && by_id) { crm_trace("Only one: %p for %u/%s", by_id, id, uname); if(uname && by_id->uname) { crm_dump_peer_hash(LOG_WARNING, __FUNCTION__); crm_crit("Node '%s' and '%s' share the same cluster nodeid %u: assuming '%s' is correct", uname, by_id->uname, id, uname); } } else if(uname && by_id->uname) { crm_warn("Node '%s' and '%s' share the same cluster nodeid: %u", by_id->uname, by_name->uname, id); } else if(id && by_name->id) { crm_warn("Node %u and %u share the same name: '%s'", by_id->id, by_name->id, uname); } else { /* Simple merge */ /* Only corosync based clusters use nodeid's * * The functions that call crm_update_peer_state() only know nodeid * so 'by_id' is authorative when merging * * Same for crm_update_peer_proc() */ crm_dump_peer_hash(LOG_DEBUG, __FUNCTION__); crm_info("Merging %p into %p", by_name, by_id); g_hash_table_foreach_remove(crm_peer_cache, crm_hash_find_by_data, by_name); } if (node == NULL) { char *uniqueid = crm_generate_uuid(); node = calloc(1, sizeof(crm_node_t)); CRM_ASSERT(node); crm_info("Created entry %s/%p for node %s/%u (%d total)", uniqueid, node, uname, id, 1 + g_hash_table_size(crm_peer_cache)); g_hash_table_replace(crm_peer_cache, uniqueid, node); } if(id > 0 && uname && (node->id == 0 || node->uname == NULL)) { crm_info("Node %u is now known as %s", id, uname); } if(id > 0 && node->id == 0) { node->id = id; } if(uname && node->uname == NULL) { int lpc, len = strlen(uname); for (lpc = 0; lpc < len; lpc++) { if (uname[lpc] >= 'A' && uname[lpc] <= 'Z') { crm_warn("Node names with capitals are discouraged, consider changing '%s' to something else", uname); break; } } node->uname = strdup(uname); if (crm_status_callback) { crm_status_callback(crm_status_uname, node, NULL); } } if(node->uuid == NULL) { const char *uuid = crm_peer_uuid(node); if (uuid) { crm_info("Node %u has uuid %s", id, uuid); } else { crm_info("Cannot obtain a UUID for node %d/%s", id, node->uname); } } return node; } crm_node_t * crm_update_peer(const char *source, unsigned int id, uint64_t born, uint64_t seen, int32_t votes, uint32_t children, const char *uuid, const char *uname, const char *addr, const char *state) { #if SUPPORT_PLUGIN gboolean addr_changed = FALSE; gboolean votes_changed = FALSE; #endif crm_node_t *node = NULL; id = get_corosync_id(id, uuid); node = crm_get_peer(id, uname); CRM_ASSERT(node != NULL); if (node->uuid == NULL) { if (is_openais_cluster()) { /* Yes, overrule whatever was passed in */ crm_peer_uuid(node); } else if (uuid != NULL) { node->uuid = strdup(uuid); } } if (children > 0) { crm_update_peer_proc(source, node, children, state); } if (state != NULL) { crm_update_peer_state(source, node, state, seen); } #if SUPPORT_HEARTBEAT if (born != 0) { node->born = born; } #endif #if SUPPORT_PLUGIN /* These were only used by the plugin */ if (born != 0) { node->born = born; } if (votes > 0 && node->votes != votes) { votes_changed = TRUE; node->votes = votes; } if (addr != NULL) { if (node->addr == NULL || crm_str_eq(node->addr, addr, FALSE) == FALSE) { addr_changed = TRUE; free(node->addr); node->addr = strdup(addr); } } if (addr_changed || votes_changed) { crm_info("%s: Node %s: id=%u state=%s addr=%s%s votes=%d%s born=" U64T " seen=" U64T " proc=%.32x", source, node->uname, node->id, node->state, node->addr, addr_changed ? " (new)" : "", node->votes, votes_changed ? " (new)" : "", node->born, node->last_seen, node->processes); } #endif return node; } void crm_update_peer_proc(const char *source, crm_node_t * node, uint32_t flag, const char *status) { uint32_t last = 0; gboolean changed = FALSE; CRM_CHECK(node != NULL, crm_err("%s: Could not set %s to %s for NULL", source, peer2text(flag), status); return); last = node->processes; if (status == NULL) { node->processes = flag; if (node->processes != last) { changed = TRUE; } } else if (safe_str_eq(status, ONLINESTATUS)) { if ((node->processes & flag) == 0) { set_bit(node->processes, flag); changed = TRUE; } #if SUPPORT_PLUGIN } else if (safe_str_eq(status, CRM_NODE_MEMBER)) { if (flag > 0 && node->processes != flag) { node->processes = flag; changed = TRUE; } #endif } else if (node->processes & flag) { clear_bit(node->processes, flag); changed = TRUE; } if (changed) { if (status == NULL && flag <= crm_proc_none) { crm_info("%s: Node %s[%u] - all processes are now offline", source, node->uname, node->id); } else { crm_info("%s: Node %s[%u] - %s is now %s", source, node->uname, node->id, peer2text(flag), status); } if (crm_status_callback) { crm_status_callback(crm_status_processes, node, &last); } } else { crm_trace("%s: Node %s[%u] - %s is unchanged (%s)", source, node->uname, node->id, peer2text(flag), status); } } void crm_update_peer_expected(const char *source, crm_node_t * node, const char *expected) { char *last = NULL; gboolean changed = FALSE; CRM_CHECK(node != NULL, crm_err("%s: Could not set 'expected' to %s", source, expected); return); last = node->expected; if (expected != NULL && safe_str_neq(node->expected, expected)) { node->expected = strdup(expected); changed = TRUE; } if (changed) { crm_info("%s: Node %s[%u] - expected state is now %s", source, node->uname, node->id, expected); free(last); } else { crm_trace("%s: Node %s[%u] - expected state is unchanged (%s)", source, node->uname, node->id, expected); } } void crm_update_peer_state(const char *source, crm_node_t * node, const char *state, int membership) { char *last = NULL; gboolean changed = FALSE; CRM_CHECK(node != NULL, crm_err("%s: Could not set 'state' to %s", source, state); return); last = node->state; if (state != NULL && safe_str_neq(node->state, state)) { node->state = strdup(state); changed = TRUE; } if (membership != 0 && safe_str_eq(node->state, CRM_NODE_MEMBER)) { node->last_seen = membership; } if (changed) { crm_notice("%s: Node %s[%u] - state is now %s (was %s)", source, node->uname, node->id, state, last); if (crm_status_callback) { - crm_status_callback(crm_status_nstate, node, last); + enum crm_status_type status_type = crm_status_nstate; + if (is_set(node->flags, crm_remote_node)) { + status_type = crm_status_rstate; + } + crm_status_callback(status_type, node, last); } free(last); } else { crm_trace("%s: Node %s[%u] - state is unchanged (%s)", source, node->uname, node->id, state); } } int crm_terminate_member(int nodeid, const char *uname, void *unused) { /* Always use the synchronous, non-mainloop version */ return stonith_api_kick(nodeid, uname, 120, TRUE); } int crm_terminate_member_no_mainloop(int nodeid, const char *uname, int *connection) { return stonith_api_kick(nodeid, uname, 120, TRUE); }