diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am index db45bcba4a..0a29925c0b 100644 --- a/daemons/controld/Makefile.am +++ b/daemons/controld/Makefile.am @@ -1,84 +1,85 @@ # # Copyright 2018-2021 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # include $(top_srcdir)/mk/common.mk include $(top_srcdir)/mk/man.mk halibdir = $(CRM_DAEMON_DIR) halib_PROGRAMS = pacemaker-controld noinst_HEADERS = controld_alerts.h \ controld_callbacks.h \ controld_fencing.h \ controld_fsa.h \ controld_lrm.h \ controld_matrix.h \ controld_membership.h \ controld_messages.h \ controld_metadata.h \ controld_throttle.h \ controld_timers.h \ controld_transition.h \ controld_utils.h \ pacemaker-controld.h pacemaker_controld_CFLAGS = $(CFLAGS_HARDENED_EXE) pacemaker_controld_LDFLAGS = $(LDFLAGS_HARDENED_EXE) pacemaker_controld_LDADD = $(top_builddir)/lib/fencing/libstonithd.la \ $(top_builddir)/lib/pacemaker/libpacemaker.la \ $(top_builddir)/lib/pengine/libpe_rules.la \ $(top_builddir)/lib/cib/libcib.la \ $(top_builddir)/lib/cluster/libcrmcluster.la \ $(top_builddir)/lib/common/libcrmcommon.la \ $(top_builddir)/lib/services/libcrmservice.la \ $(top_builddir)/lib/lrmd/liblrmd.la \ $(CLUSTERLIBS) pacemaker_controld_SOURCES = pacemaker-controld.c \ + controld_alerts.c \ controld_attrd.c \ controld_callbacks.c \ controld_based.c \ controld_control.c \ controld_corosync.c \ controld_election.c \ controld_execd.c \ controld_execd_state.c \ controld_fencing.c \ controld_fsa.c \ controld_join_client.c \ controld_join_dc.c \ controld_membership.c \ controld_messages.c \ controld_metadata.c \ controld_remote_ra.c \ controld_schedulerd.c \ controld_te_actions.c \ controld_te_callbacks.c \ controld_te_events.c \ controld_te_utils.c \ controld_throttle.c \ controld_timers.c \ controld_transition.c \ controld_utils.c if BUILD_XML_HELP man7_MANS = pacemaker-controld.7 endif CLEANFILES = $(man7_MANS) if BUILD_LEGACY_LINKS install-exec-hook: cd $(DESTDIR)$(CRM_DAEMON_DIR) && rm -f crmd && $(LN_S) pacemaker-controld crmd uninstall-hook: cd $(DESTDIR)$(CRM_DAEMON_DIR) && rm -f crmd endif diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c new file mode 100644 index 0000000000..2e0a67dba2 --- /dev/null +++ b/daemons/controld/controld_alerts.c @@ -0,0 +1,88 @@ +/* + * Copyright 2012-2021 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. + */ + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +static GList *crmd_alert_list = NULL; + +void +crmd_unpack_alerts(xmlNode *alerts) +{ + pe_free_alert_list(crmd_alert_list); + crmd_alert_list = pe_unpack_alerts(alerts); +} + +void +crmd_alert_node_event(crm_node_t *node) +{ + lrm_state_t *lrm_state; + + if (crmd_alert_list == NULL) { + return; + } + + lrm_state = lrm_state_find(fsa_our_uname); + if (lrm_state == NULL) { + return; + } + + lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, + node->uname, node->id, node->state); +} + +void +crmd_alert_fencing_op(stonith_event_t * e) +{ + char *desc; + lrm_state_t *lrm_state; + + if (crmd_alert_list == NULL) { + return; + } + + lrm_state = lrm_state_find(fsa_our_uname); + if (lrm_state == NULL) { + return; + } + + desc = stonith__event_description(e); + lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, + e->target, e->operation, desc, e->result); + free(desc); +} + +void +crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) +{ + lrm_state_t *lrm_state; + + if (crmd_alert_list == NULL) { + return; + } + + lrm_state = lrm_state_find(fsa_our_uname); + if (lrm_state == NULL) { + return; + } + + lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node, + op); +} diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c index 67c376a426..5dce6c6d59 100644 --- a/daemons/controld/controld_execd_state.c +++ b/daemons/controld/controld_execd_state.c @@ -1,854 +1,779 @@ /* * Copyright 2012-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include GHashTable *lrm_state_table = NULL; extern GHashTable *proxy_table; int lrmd_internal_proxy_send(lrmd_t * lrmd, xmlNode *msg); void lrmd_internal_set_proxy_callback(lrmd_t * lrmd, void *userdata, void (*callback)(lrmd_t *lrmd, void *userdata, xmlNode *msg)); static void free_rsc_info(gpointer value) { lrmd_rsc_info_t *rsc_info = value; lrmd_free_rsc_info(rsc_info); } static void free_deletion_op(gpointer value) { struct pending_deletion_op_s *op = value; free(op->rsc); delete_ha_msg_input(op->input); free(op); } static void free_recurring_op(gpointer value) { active_op_t *op = value; free(op->user_data); free(op->rsc_id); free(op->op_type); free(op->op_key); if (op->params) { g_hash_table_destroy(op->params); } free(op); } static gboolean fail_pending_op(gpointer key, gpointer value, gpointer user_data) { lrmd_event_data_t event = { 0, }; lrm_state_t *lrm_state = user_data; active_op_t *op = value; crm_trace("Pre-emptively failing " PCMK__OP_FMT " on %s (call=%s, %s)", op->rsc_id, op->op_type, op->interval_ms, lrm_state->node_name, (char*)key, op->user_data); event.type = lrmd_event_exec_complete; event.rsc_id = op->rsc_id; event.op_type = op->op_type; event.user_data = op->user_data; event.timeout = 0; event.interval_ms = op->interval_ms; lrmd__set_result(&event, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_NOT_CONNECTED, "Action was pending when executor connection was dropped"); event.t_run = (unsigned int) op->start_time; event.t_rcchange = (unsigned int) op->start_time; event.call_id = op->call_id; event.remote_nodename = lrm_state->node_name; event.params = op->params; process_lrm_event(lrm_state, &event, op, NULL); lrmd__reset_result(&event); return TRUE; } gboolean lrm_state_is_local(lrm_state_t *lrm_state) { if (lrm_state == NULL || fsa_our_uname == NULL) { return FALSE; } if (strcmp(lrm_state->node_name, fsa_our_uname) != 0) { return FALSE; } return TRUE; } lrm_state_t * lrm_state_create(const char *node_name) { lrm_state_t *state = NULL; if (!node_name) { crm_err("No node name given for lrm state object"); return NULL; } state = calloc(1, sizeof(lrm_state_t)); if (!state) { return NULL; } state->node_name = strdup(node_name); state->rsc_info_cache = pcmk__strkey_table(NULL, free_rsc_info); state->deletion_ops = pcmk__strkey_table(free, free_deletion_op); state->pending_ops = pcmk__strkey_table(free, free_recurring_op); state->resource_history = pcmk__strkey_table(NULL, history_free); state->metadata_cache = metadata_cache_new(); g_hash_table_insert(lrm_state_table, (char *)state->node_name, state); return state; } void lrm_state_destroy(const char *node_name) { g_hash_table_remove(lrm_state_table, node_name); } static gboolean remote_proxy_remove_by_node(gpointer key, gpointer value, gpointer user_data) { remote_proxy_t *proxy = value; const char *node_name = user_data; if (pcmk__str_eq(node_name, proxy->node_name, pcmk__str_casei)) { return TRUE; } return FALSE; } static void internal_lrm_state_destroy(gpointer data) { lrm_state_t *lrm_state = data; if (!lrm_state) { return; } crm_trace("Destroying proxy table %s with %d members", lrm_state->node_name, g_hash_table_size(proxy_table)); g_hash_table_foreach_remove(proxy_table, remote_proxy_remove_by_node, (char *) lrm_state->node_name); remote_ra_cleanup(lrm_state); lrmd_api_delete(lrm_state->conn); if (lrm_state->rsc_info_cache) { crm_trace("Destroying rsc info cache with %d members", g_hash_table_size(lrm_state->rsc_info_cache)); g_hash_table_destroy(lrm_state->rsc_info_cache); } if (lrm_state->resource_history) { crm_trace("Destroying history op cache with %d members", g_hash_table_size(lrm_state->resource_history)); g_hash_table_destroy(lrm_state->resource_history); } if (lrm_state->deletion_ops) { crm_trace("Destroying deletion op cache with %d members", g_hash_table_size(lrm_state->deletion_ops)); g_hash_table_destroy(lrm_state->deletion_ops); } if (lrm_state->pending_ops) { crm_trace("Destroying pending op cache with %d members", g_hash_table_size(lrm_state->pending_ops)); g_hash_table_destroy(lrm_state->pending_ops); } metadata_cache_free(lrm_state->metadata_cache); free((char *)lrm_state->node_name); free(lrm_state); } void lrm_state_reset_tables(lrm_state_t * lrm_state, gboolean reset_metadata) { if (lrm_state->resource_history) { crm_trace("Re-setting history op cache with %d members", g_hash_table_size(lrm_state->resource_history)); g_hash_table_remove_all(lrm_state->resource_history); } if (lrm_state->deletion_ops) { crm_trace("Re-setting deletion op cache with %d members", g_hash_table_size(lrm_state->deletion_ops)); g_hash_table_remove_all(lrm_state->deletion_ops); } if (lrm_state->pending_ops) { crm_trace("Re-setting pending op cache with %d members", g_hash_table_size(lrm_state->pending_ops)); g_hash_table_remove_all(lrm_state->pending_ops); } if (lrm_state->rsc_info_cache) { crm_trace("Re-setting rsc info cache with %d members", g_hash_table_size(lrm_state->rsc_info_cache)); g_hash_table_remove_all(lrm_state->rsc_info_cache); } if (reset_metadata) { metadata_cache_reset(lrm_state->metadata_cache); } } gboolean lrm_state_init_local(void) { if (lrm_state_table) { return TRUE; } lrm_state_table = pcmk__strikey_table(NULL, internal_lrm_state_destroy); if (!lrm_state_table) { return FALSE; } proxy_table = pcmk__strikey_table(NULL, remote_proxy_free); if (!proxy_table) { g_hash_table_destroy(lrm_state_table); lrm_state_table = NULL; return FALSE; } return TRUE; } void lrm_state_destroy_all(void) { if (lrm_state_table) { crm_trace("Destroying state table with %d members", g_hash_table_size(lrm_state_table)); g_hash_table_destroy(lrm_state_table); lrm_state_table = NULL; } if(proxy_table) { crm_trace("Destroying proxy table with %d members", g_hash_table_size(proxy_table)); g_hash_table_destroy(proxy_table); proxy_table = NULL; } } lrm_state_t * lrm_state_find(const char *node_name) { if (!node_name) { return NULL; } return g_hash_table_lookup(lrm_state_table, node_name); } lrm_state_t * lrm_state_find_or_create(const char *node_name) { lrm_state_t *lrm_state; lrm_state = g_hash_table_lookup(lrm_state_table, node_name); if (!lrm_state) { lrm_state = lrm_state_create(node_name); } return lrm_state; } GList * lrm_state_get_list(void) { return g_hash_table_get_values(lrm_state_table); } static remote_proxy_t * find_connected_proxy_by_node(const char * node_name) { GHashTableIter gIter; remote_proxy_t *proxy = NULL; CRM_CHECK(proxy_table != NULL, return NULL); g_hash_table_iter_init(&gIter, proxy_table); while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) &proxy)) { if (proxy->source && pcmk__str_eq(node_name, proxy->node_name, pcmk__str_casei)) { return proxy; } } return NULL; } static void remote_proxy_disconnect_by_node(const char * node_name) { remote_proxy_t *proxy = NULL; CRM_CHECK(proxy_table != NULL, return); while ((proxy = find_connected_proxy_by_node(node_name)) != NULL) { /* mainloop_del_ipc_client() eventually calls remote_proxy_disconnected() * , which removes the entry from proxy_table. * Do not do this in a g_hash_table_iter_next() loop. */ if (proxy->source) { mainloop_del_ipc_client(proxy->source); } } return; } void lrm_state_disconnect_only(lrm_state_t * lrm_state) { int removed = 0; if (!lrm_state->conn) { return; } crm_trace("Disconnecting %s", lrm_state->node_name); remote_proxy_disconnect_by_node(lrm_state->node_name); ((lrmd_t *) lrm_state->conn)->cmds->disconnect(lrm_state->conn); if (!pcmk_is_set(fsa_input_register, R_SHUTDOWN)) { removed = g_hash_table_foreach_remove(lrm_state->pending_ops, fail_pending_op, lrm_state); crm_trace("Synthesized %d operation failures for %s", removed, lrm_state->node_name); } } void lrm_state_disconnect(lrm_state_t * lrm_state) { if (!lrm_state->conn) { return; } lrm_state_disconnect_only(lrm_state); lrmd_api_delete(lrm_state->conn); lrm_state->conn = NULL; } int lrm_state_is_connected(lrm_state_t * lrm_state) { if (!lrm_state->conn) { return FALSE; } return ((lrmd_t *) lrm_state->conn)->cmds->is_connected(lrm_state->conn); } int lrm_state_poke_connection(lrm_state_t * lrm_state) { if (!lrm_state->conn) { return -ENOTCONN; } return ((lrmd_t *) lrm_state->conn)->cmds->poke_connection(lrm_state->conn); } // \return Standard Pacemaker return code int controld_connect_local_executor(lrm_state_t *lrm_state) { int rc = pcmk_rc_ok; if (lrm_state->conn == NULL) { lrmd_t *api = NULL; rc = lrmd__new(&api, NULL, NULL, 0); if (rc != pcmk_rc_ok) { return rc; } api->cmds->set_callback(api, lrm_op_callback); lrm_state->conn = api; } rc = ((lrmd_t *) lrm_state->conn)->cmds->connect(lrm_state->conn, CRM_SYSTEM_CRMD, NULL); rc = pcmk_legacy2rc(rc); if (rc == pcmk_rc_ok) { lrm_state->num_lrm_register_fails = 0; } else { lrm_state->num_lrm_register_fails++; } return rc; } static remote_proxy_t * crmd_remote_proxy_new(lrmd_t *lrmd, const char *node_name, const char *session_id, const char *channel) { struct ipc_client_callbacks proxy_callbacks = { .dispatch = remote_proxy_dispatch, .destroy = remote_proxy_disconnected }; remote_proxy_t *proxy = remote_proxy_new(lrmd, &proxy_callbacks, node_name, session_id, channel); return proxy; } gboolean crmd_is_proxy_session(const char *session) { return g_hash_table_lookup(proxy_table, session) ? TRUE : FALSE; } void crmd_proxy_send(const char *session, xmlNode *msg) { remote_proxy_t *proxy = g_hash_table_lookup(proxy_table, session); lrm_state_t *lrm_state = NULL; if (!proxy) { return; } crm_log_xml_trace(msg, "to-proxy"); lrm_state = lrm_state_find(proxy->node_name); if (lrm_state) { crm_trace("Sending event to %.8s on %s", proxy->session_id, proxy->node_name); remote_proxy_relay_event(proxy, msg); } } static void crmd_proxy_dispatch(const char *session, xmlNode *msg) { crm_trace("Processing proxied IPC message from session %s", session); crm_log_xml_trace(msg, "controller[inbound]"); crm_xml_add(msg, F_CRM_SYS_FROM, session); if (controld_authorize_ipc_message(msg, NULL, session)) { route_message(C_IPC_MESSAGE, msg); } trigger_fsa(); } static void remote_config_check(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc != pcmk_ok) { crm_err("Query resulted in an error: %s", pcmk_strerror(rc)); if (rc == -EACCES || rc == -pcmk_err_schema_validation) { crm_err("The cluster is mis-configured - shutting down and staying down"); } } else { lrmd_t * lrmd = (lrmd_t *)user_data; crm_time_t *now = crm_time_new(NULL); GHashTable *config_hash = pcmk__strkey_table(free, free); crm_debug("Call %d : Parsing CIB options", call_id); pe_unpack_nvpairs(output, output, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, now, NULL); /* Now send it to the remote peer */ lrmd__validate_remote_settings(lrmd, config_hash); g_hash_table_destroy(config_hash); crm_time_free(now); } } static void crmd_remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg) { lrm_state_t *lrm_state = userdata; const char *session = crm_element_value(msg, F_LRMD_IPC_SESSION); remote_proxy_t *proxy = g_hash_table_lookup(proxy_table, session); const char *op = crm_element_value(msg, F_LRMD_IPC_OP); if (pcmk__str_eq(op, LRMD_IPC_OP_NEW, pcmk__str_casei)) { const char *channel = crm_element_value(msg, F_LRMD_IPC_IPC_SERVER); proxy = crmd_remote_proxy_new(lrmd, lrm_state->node_name, session, channel); if (!remote_ra_controlling_guest(lrm_state)) { if (proxy != NULL) { /* Look up stonith-watchdog-timeout and send to the remote peer for validation */ int rc = fsa_cib_conn->cmds->query(fsa_cib_conn, XML_CIB_TAG_CRMCONFIG, NULL, cib_scope_local); fsa_cib_conn->cmds->register_callback_full(fsa_cib_conn, rc, 10, FALSE, lrmd, "remote_config_check", remote_config_check, NULL); } } else { crm_debug("Skipping remote_config_check for guest-nodes"); } } else if (pcmk__str_eq(op, LRMD_IPC_OP_SHUTDOWN_REQ, pcmk__str_casei)) { char *now_s = NULL; crm_notice("%s requested shutdown of its remote connection", lrm_state->node_name); if (!remote_ra_is_in_maintenance(lrm_state)) { now_s = pcmk__ttoa(time(NULL)); update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, TRUE); free(now_s); remote_proxy_ack_shutdown(lrmd); crm_warn("Reconnection attempts to %s may result in failures that must be cleared", lrm_state->node_name); } else { remote_proxy_nack_shutdown(lrmd); crm_notice("Remote resource for %s is not managed so no ordered shutdown happening", lrm_state->node_name); } return; } else if (pcmk__str_eq(op, LRMD_IPC_OP_REQUEST, pcmk__str_casei) && proxy && proxy->is_local) { /* This is for the controller, which we are, so don't try * to send to ourselves over IPC -- do it directly. */ int flags = 0; xmlNode *request = get_message_xml(msg, F_LRMD_IPC_MSG); CRM_CHECK(request != NULL, return); CRM_CHECK(lrm_state->node_name, return); crm_xml_add(request, XML_ACL_TAG_ROLE, "pacemaker-remote"); pcmk__update_acl_user(request, F_LRMD_IPC_USER, lrm_state->node_name); /* Pacemaker Remote nodes don't know their own names (as known to the * cluster). When getting a node info request with no name or ID, add * the name, so we don't return info for ourselves instead of the * Pacemaker Remote node. */ if (pcmk__str_eq(crm_element_value(request, F_CRM_TASK), CRM_OP_NODE_INFO, pcmk__str_casei)) { int node_id = 0; crm_element_value_int(request, XML_ATTR_ID, &node_id); if ((node_id <= 0) && (crm_element_value(request, XML_ATTR_UNAME) == NULL)) { crm_xml_add(request, XML_ATTR_UNAME, lrm_state->node_name); } } crmd_proxy_dispatch(session, request); crm_element_value_int(msg, F_LRMD_IPC_MSG_FLAGS, &flags); if (flags & crm_ipc_client_response) { int msg_id = 0; xmlNode *op_reply = create_xml_node(NULL, "ack"); crm_xml_add(op_reply, "function", __func__); crm_xml_add_int(op_reply, "line", __LINE__); crm_element_value_int(msg, F_LRMD_IPC_MSG_ID, &msg_id); remote_proxy_relay_response(proxy, op_reply, msg_id); free_xml(op_reply); } } else { remote_proxy_cb(lrmd, lrm_state->node_name, msg); } } // \return Standard Pacemaker return code int controld_connect_remote_executor(lrm_state_t *lrm_state, const char *server, int port, int timeout_ms) { int rc = pcmk_rc_ok; if (lrm_state->conn == NULL) { lrmd_t *api = NULL; rc = lrmd__new(&api, lrm_state->node_name, server, port); if (rc != pcmk_rc_ok) { crm_warn("Pacemaker Remote connection to %s:%s failed: %s " CRM_XS " rc=%d", server, port, pcmk_rc_str(rc), rc); return rc; } lrm_state->conn = api; api->cmds->set_callback(api, remote_lrm_op_callback); lrmd_internal_set_proxy_callback(api, lrm_state, crmd_remote_proxy_cb); } crm_trace("Initiating remote connection to %s:%d with timeout %dms", server, port, timeout_ms); rc = ((lrmd_t *) lrm_state->conn)->cmds->connect_async(lrm_state->conn, lrm_state->node_name, timeout_ms); if (rc == pcmk_ok) { lrm_state->num_lrm_register_fails = 0; } else { lrm_state->num_lrm_register_fails++; // Ignored for remote connections } return pcmk_legacy2rc(rc); } int lrm_state_get_metadata(lrm_state_t * lrm_state, const char *class, const char *provider, const char *agent, char **output, enum lrmd_call_options options) { lrmd_key_value_t *params = NULL; if (!lrm_state->conn) { return -ENOTCONN; } /* Add the node name to the environment, as is done with normal resource * action calls. Meta-data calls shouldn't need it, but some agents are * written with an ocf_local_nodename call at the beginning regardless of * action. Without the environment variable, the agent would try to contact * the controller to get the node name -- but the controller would be * blocking on the synchronous meta-data call. * * At this point, we have to assume that agents are unlikely to make other * calls that require the controller, such as crm_node --quorum or * --cluster-id. * * @TODO Make meta-data calls asynchronous. (This will be part of a larger * project to make meta-data calls via the executor rather than directly.) */ params = lrmd_key_value_add(params, CRM_META "_" XML_LRM_ATTR_TARGET, lrm_state->node_name); return ((lrmd_t *) lrm_state->conn)->cmds->get_metadata_params(lrm_state->conn, class, provider, agent, output, options, params); } int lrm_state_cancel(lrm_state_t *lrm_state, const char *rsc_id, const char *action, guint interval_ms) { if (!lrm_state->conn) { return -ENOTCONN; } /* Figure out a way to make this async? * NOTICE: Currently it's synced and directly acknowledged in do_lrm_invoke(). */ if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) { return remote_ra_cancel(lrm_state, rsc_id, action, interval_ms); } return ((lrmd_t *) lrm_state->conn)->cmds->cancel(lrm_state->conn, rsc_id, action, interval_ms); } lrmd_rsc_info_t * lrm_state_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id, enum lrmd_call_options options) { lrmd_rsc_info_t *rsc = NULL; if (!lrm_state->conn) { return NULL; } if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) { return remote_ra_get_rsc_info(lrm_state, rsc_id); } rsc = g_hash_table_lookup(lrm_state->rsc_info_cache, rsc_id); if (rsc == NULL) { /* only contact the lrmd if we don't already have a cached rsc info */ rsc = ((lrmd_t *) lrm_state->conn)->cmds->get_rsc_info(lrm_state->conn, rsc_id, options); if (rsc == NULL) { return NULL; } /* cache the result */ g_hash_table_insert(lrm_state->rsc_info_cache, rsc->id, rsc); } return lrmd_copy_rsc_info(rsc); } /*! * \internal * \brief Initiate a resource agent action * * \param[in] lrm_state Executor state object * \param[in] rsc_id ID of resource for action * \param[in] action Action to execute * \param[in] userdata String to copy and pass to execution callback * \param[in] interval_ms Action interval (in milliseconds) * \param[in] timeout_ms Action timeout (in milliseconds) * \param[in] start_delay_ms Delay (in milliseconds) before initiating action * \param[in] params Resource parameters * \param[out] call_id Where to store call ID on success * * \return Standard Pacemaker return code * \note This takes ownership of \p params, which should not be used or freed * after calling this function. */ int controld_execute_resource_agent(lrm_state_t *lrm_state, const char *rsc_id, const char *action, const char *userdata, guint interval_ms, int timeout_ms, int start_delay_ms, lrmd_key_value_t *params, int *call_id) { int rc = pcmk_rc_ok; if (lrm_state->conn == NULL) { lrmd_key_value_freeall(params); rc = ENOTCONN; } else if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) { rc = controld_execute_remote_agent(lrm_state, rsc_id, action, userdata, interval_ms, timeout_ms, start_delay_ms, params, call_id); } else { rc = ((lrmd_t *) lrm_state->conn)->cmds->exec(lrm_state->conn, rsc_id, action, userdata, interval_ms, timeout_ms, start_delay_ms, lrmd_opt_notify_changes_only, params); if (rc < 0) { rc = pcmk_legacy2rc(rc); } else { *call_id = rc; rc = pcmk_rc_ok; } } return rc; } int lrm_state_register_rsc(lrm_state_t * lrm_state, const char *rsc_id, const char *class, const char *provider, const char *agent, enum lrmd_call_options options) { lrmd_t *conn = (lrmd_t *) lrm_state->conn; if (conn == NULL) { return -ENOTCONN; } if (is_remote_lrmd_ra(agent, provider, NULL)) { return lrm_state_find_or_create(rsc_id)? pcmk_ok : -EINVAL; } /* @TODO Implement an asynchronous version of this (currently a blocking * call to the lrmd). */ return conn->cmds->register_rsc(lrm_state->conn, rsc_id, class, provider, agent, options); } int lrm_state_unregister_rsc(lrm_state_t * lrm_state, const char *rsc_id, enum lrmd_call_options options) { if (!lrm_state->conn) { return -ENOTCONN; } if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) { lrm_state_destroy(rsc_id); return pcmk_ok; } g_hash_table_remove(lrm_state->rsc_info_cache, rsc_id); /* @TODO Optimize this ... this function is a blocking round trip from * client to daemon. The controld_execd_state.c code path that uses this * function should always treat it as an async operation. The executor API * should make an async version available. */ return ((lrmd_t *) lrm_state->conn)->cmds->unregister_rsc(lrm_state->conn, rsc_id, options); } - -/* - * Functions for sending alerts via local executor connection - */ - -static GList *crmd_alert_list = NULL; - -void -crmd_unpack_alerts(xmlNode *alerts) -{ - pe_free_alert_list(crmd_alert_list); - crmd_alert_list = pe_unpack_alerts(alerts); -} - -void -crmd_alert_node_event(crm_node_t *node) -{ - lrm_state_t *lrm_state; - - if (crmd_alert_list == NULL) { - return; - } - - lrm_state = lrm_state_find(fsa_our_uname); - if (lrm_state == NULL) { - return; - } - - lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, - node->uname, node->id, node->state); -} - -void -crmd_alert_fencing_op(stonith_event_t * e) -{ - char *desc; - lrm_state_t *lrm_state; - - if (crmd_alert_list == NULL) { - return; - } - - lrm_state = lrm_state_find(fsa_our_uname); - if (lrm_state == NULL) { - return; - } - - desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", - e->action, e->target, - (e->executioner? e->executioner : ""), - e->client_origin, e->origin, - pcmk_strerror(e->result), e->id); - - lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, - e->target, e->operation, desc, e->result); - free(desc); -} - -void -crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) -{ - lrm_state_t *lrm_state; - - if (crmd_alert_list == NULL) { - return; - } - - lrm_state = lrm_state_find(fsa_our_uname); - if (lrm_state == NULL) { - return; - } - - lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node, - op); -} diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c index f8d2fc13f4..15954b2358 100644 --- a/daemons/controld/controld_fencing.c +++ b/daemons/controld/controld_fencing.c @@ -1,1026 +1,1059 @@ /* - * Copyright 2004-2021 the Pacemaker project contributors + * Copyright 2004-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include static void tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event); /* * stonith failure counting * * We don't want to get stuck in a permanent fencing loop. Keep track of the * number of fencing failures for each target node, and the most we'll restart a * transition for. */ struct st_fail_rec { int count; }; static bool fence_reaction_panic = FALSE; static unsigned long int stonith_max_attempts = 10; static GHashTable *stonith_failures = NULL; void update_stonith_max_attempts(const char *value) { stonith_max_attempts = char2score(value); if (stonith_max_attempts < 1UL) { stonith_max_attempts = 10UL; } } void set_fence_reaction(const char *reaction_s) { if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) { fence_reaction_panic = TRUE; } else { if (!pcmk__str_eq(reaction_s, "stop", pcmk__str_casei)) { crm_warn("Invalid value '%s' for %s, using 'stop'", reaction_s, XML_CONFIG_ATTR_FENCE_REACTION); } fence_reaction_panic = FALSE; } } static gboolean too_many_st_failures(const char *target) { GHashTableIter iter; const char *key = NULL; struct st_fail_rec *value = NULL; if (stonith_failures == NULL) { return FALSE; } if (target == NULL) { g_hash_table_iter_init(&iter, stonith_failures); while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) { if (value->count >= stonith_max_attempts) { target = (const char*)key; goto too_many; } } } else { value = g_hash_table_lookup(stonith_failures, target); if ((value != NULL) && (value->count >= stonith_max_attempts)) { goto too_many; } } return FALSE; too_many: crm_warn("Too many failures (%d) to fence %s, giving up", value->count, target); return TRUE; } /*! * \internal * \brief Reset a stonith fail count * * \param[in] target Name of node to reset, or NULL for all */ void st_fail_count_reset(const char *target) { if (stonith_failures == NULL) { return; } if (target) { struct st_fail_rec *rec = NULL; rec = g_hash_table_lookup(stonith_failures, target); if (rec) { rec->count = 0; } } else { GHashTableIter iter; const char *key = NULL; struct st_fail_rec *rec = NULL; g_hash_table_iter_init(&iter, stonith_failures); while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &rec)) { rec->count = 0; } } } static void st_fail_count_increment(const char *target) { struct st_fail_rec *rec = NULL; if (stonith_failures == NULL) { stonith_failures = pcmk__strkey_table(free, free); } rec = g_hash_table_lookup(stonith_failures, target); if (rec) { rec->count++; } else { rec = malloc(sizeof(struct st_fail_rec)); if(rec == NULL) { return; } rec->count = 1; g_hash_table_insert(stonith_failures, strdup(target), rec); } } /* end stonith fail count functions */ static void cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { if (rc < pcmk_ok) { crm_err("Fencing update %d for %s: failed - %s (%d)", call_id, (char *)user_data, pcmk_strerror(rc), rc); crm_log_xml_warn(msg, "Failed update"); abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL); } else { crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); } } static void send_stonith_update(crm_action_t *action, const char *target, const char *uuid) { int rc = pcmk_ok; crm_node_t *peer = NULL; /* We (usually) rely on the membership layer to do node_update_cluster, * and the peer status callback to do node_update_peer, because the node * might have already rejoined before we get the stonith result here. */ int flags = node_update_join | node_update_expected; /* zero out the node-status & remove all LRM status info */ xmlNode *node_state = NULL; CRM_CHECK(target != NULL, return); CRM_CHECK(uuid != NULL, return); /* Make sure the membership and join caches are accurate */ peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY); CRM_CHECK(peer != NULL, return); if (peer->state == NULL) { /* Usually, we rely on the membership layer to update the cluster state * in the CIB. However, if the node has never been seen, do it here, so * the node is not considered unclean. */ flags |= node_update_cluster; } if (peer->uuid == NULL) { crm_info("Recording uuid '%s' for node '%s'", uuid, target); peer->uuid = strdup(uuid); } crmd_peer_down(peer, TRUE); /* Generate a node state update for the CIB */ node_state = create_node_state_update(peer, flags, NULL, __func__); /* we have to mark whether or not remote nodes have already been fenced */ if (peer->flags & crm_remote_node) { char *now_s = pcmk__ttoa(time(NULL)); crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s); free(now_s); } /* Force our known ID */ crm_xml_add(node_state, XML_ATTR_UUID, uuid); rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state, cib_quorum_override | cib_scope_local | cib_can_create); /* Delay processing the trigger until the update completes */ crm_debug("Sending fencing update %d for %s", rc, target); fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated); /* Make sure it sticks */ /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */ controld_delete_node_state(peer->uname, controld_section_all, cib_scope_local); free_xml(node_state); return; } /*! * \internal * \brief Abort transition due to stonith failure * * \param[in] abort_action Whether to restart or stop transition * \param[in] target Don't restart if this (NULL for any) has too many failures * \param[in] reason Log this stonith action XML as abort reason (or NULL) */ static void abort_for_stonith_failure(enum transition_action abort_action, const char *target, xmlNode *reason) { /* If stonith repeatedly fails, we eventually give up on starting a new * transition for that reason. */ if ((abort_action != tg_stop) && too_many_st_failures(target)) { abort_action = tg_stop; } abort_transition(INFINITY, abort_action, "Stonith failed", reason); } /* * stonith cleanup list * * If the DC is shot, proper notifications might not go out. * The stonith cleanup list allows the cluster to (re-)send * notifications once a new DC is elected. */ static GList *stonith_cleanup_list = NULL; /*! * \internal * \brief Add a node to the stonith cleanup list * * \param[in] target Name of node to add */ void add_stonith_cleanup(const char *target) { stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target)); } /*! * \internal * \brief Remove a node from the stonith cleanup list * * \param[in] Name of node to remove */ void remove_stonith_cleanup(const char *target) { GList *iter = stonith_cleanup_list; while (iter != NULL) { GList *tmp = iter; char *iter_name = tmp->data; iter = iter->next; if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) { crm_trace("Removing %s from the cleanup list", iter_name); stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp); free(iter_name); } } } /*! * \internal * \brief Purge all entries from the stonith cleanup list */ void purge_stonith_cleanup() { if (stonith_cleanup_list) { GList *iter = NULL; for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { char *target = iter->data; crm_info("Purging %s from stonith cleanup list", target); free(target); } g_list_free(stonith_cleanup_list); stonith_cleanup_list = NULL; } } /*! * \internal * \brief Send stonith updates for all entries in cleanup list, then purge it */ void execute_stonith_cleanup() { GList *iter; for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { char *target = iter->data; crm_node_t *target_node = crm_get_peer(0, target); const char *uuid = crm_peer_uuid(target_node); crm_notice("Marking %s, target of a previous stonith action, as clean", target); send_stonith_update(NULL, target, uuid); free(target); } g_list_free(stonith_cleanup_list); stonith_cleanup_list = NULL; } /* end stonith cleanup list functions */ /* stonith API client * * Functions that need to interact directly with the fencer via its API */ static stonith_t *stonith_api = NULL; static crm_trigger_t *stonith_reconnect = NULL; static char *te_client_id = NULL; static gboolean fail_incompletable_stonith(crm_graph_t *graph) { GList *lpc = NULL; const char *task = NULL; xmlNode *last_action = NULL; if (graph == NULL) { return FALSE; } for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { GList *lpc2 = NULL; synapse_t *synapse = (synapse_t *) lpc->data; if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) { continue; } for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) { crm_action_t *action = (crm_action_t *) lpc2->data; if (action->type != action_type_crm || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) { continue; } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); if (task && pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) { crm__set_graph_action_flags(action, pcmk__graph_action_failed); last_action = action->xml; pcmk__update_graph(graph, action); crm_notice("Failing action %d (%s): fencer terminated", action->id, ID(action->xml)); } } } if (last_action != NULL) { crm_warn("Fencer failure resulted in unrunnable actions"); abort_for_stonith_failure(tg_restart, NULL, last_action); return TRUE; } return FALSE; } static void tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) { te_cleanup_stonith_history_sync(st, FALSE); if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) { crm_crit("Fencing daemon connection failed"); mainloop_set_trigger(stonith_reconnect); } else { crm_info("Fencing daemon disconnected"); } if (stonith_api) { /* the client API won't properly reconnect notifications * if they are still in the table - so remove them */ if (stonith_api->state != stonith_disconnected) { stonith_api->cmds->disconnect(st); } stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT); stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE); stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED); } if (AM_I_DC) { fail_incompletable_stonith(transition_graph); trigger_graph(); } } +/*! + * \internal + * \brief Handle an event notification from the fencing API + * + * \param[in] st Fencing API connection + * \param[in] event Fencing API event notification + */ static void -tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) +handle_fence_notification(stonith_t *st, stonith_event_t *event) { + bool succeeded = true; + const char *executioner = "the cluster"; + const char *client = "a client"; + const char *reason = NULL; + int exec_status; + if (te_client_id == NULL) { te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, (unsigned long) getpid()); } - if (st_event == NULL) { + if (event == NULL) { crm_err("Notify data not found"); return; } - crmd_alert_fencing_op(st_event); + if (event->executioner != NULL) { + executioner = event->executioner; + } + if (event->client_origin != NULL) { + client = event->client_origin; + } - if ((st_event->result == pcmk_ok) && pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { - crm_notice("%s was successfully unfenced by %s (at the request of %s)", - st_event->target, - st_event->executioner? st_event->executioner : "", - st_event->origin); - /* TODO: Hook up st_event->device */ - return; + exec_status = stonith__event_execution_status(event); + if ((stonith__event_exit_status(event) != CRM_EX_OK) + || (exec_status != PCMK_EXEC_DONE)) { + succeeded = false; + if (exec_status == PCMK_EXEC_DONE) { + exec_status = PCMK_EXEC_ERROR; + } + } + reason = stonith__event_exit_reason(event); - } else if (pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { - crm_err("Unfencing of %s by %s failed: %s (%d)", - st_event->target, - st_event->executioner? st_event->executioner : "", - pcmk_strerror(st_event->result), st_event->result); - return; + crmd_alert_fencing_op(event); - } else if ((st_event->result == pcmk_ok) - && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_none)) { + if (pcmk__str_eq("on", event->action, pcmk__str_none)) { + // Unfencing doesn't need special handling, just a log message + if (succeeded) { + crm_notice("%s was unfenced by %s at the request of %s@%s", + event->target, executioner, client, event->origin); + /* TODO: Hook up event->device */ + } else { + crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d", + event->target, executioner, + pcmk_exec_status_str(exec_status), + ((reason == NULL)? "" : ": "), + ((reason == NULL)? "" : reason), + stonith__event_exit_status(event)); + } + return; + } + if (succeeded + && pcmk__str_eq(event->target, fsa_our_uname, pcmk__str_casei)) { /* We were notified of our own fencing. Most likely, either fencing was * misconfigured, or fabric fencing that doesn't cut cluster * communication is in use. * * Either way, shutting down the local host is a good idea, to require * administrator intervention. Also, other nodes would otherwise likely * set our status to lost because of the fencing callback and discard * our subsequent election votes as "not part of our cluster". */ crm_crit("We were allegedly just fenced by %s for %s!", - st_event->executioner? st_event->executioner : "the cluster", - st_event->origin); /* Dumps blackbox if enabled */ + executioner, event->origin); // Dumps blackbox if enabled if (fence_reaction_panic) { pcmk__panic(__func__); } else { crm_exit(CRM_EX_FATAL); } - return; + return; // Should never get here } - /* Update the count of stonith failures for this target, in case we become + /* Update the count of fencing failures for this target, in case we become * DC later. The current DC has already updated its fail count in * tengine_stonith_callback(). */ - if (!AM_I_DC && pcmk__str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { - if (st_event->result == pcmk_ok) { - st_fail_count_reset(st_event->target); + if (!AM_I_DC + && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE, + pcmk__str_none)) { + + if (succeeded) { + st_fail_count_reset(event->target); } else { - st_fail_count_increment(st_event->target); + st_fail_count_increment(event->target); } } - crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " - CRM_XS " initiator=%s ref=%s", - st_event->target, st_event->result == pcmk_ok ? "" : " not", - st_event->action, - st_event->executioner ? st_event->executioner : "", - (st_event->client_origin? st_event->client_origin : ""), - pcmk_strerror(st_event->result), - st_event->origin, st_event->id); - - if (st_event->result == pcmk_ok) { - crm_node_t *peer = pcmk__search_known_node_cache(0, st_event->target, + crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: " + "%s%s%s%s " CRM_XS " event=%s", + event->target, (succeeded? "" : " not"), + event->action, executioner, client, event->origin, + (succeeded? "OK" : pcmk_exec_status_str(exec_status)), + ((reason == NULL)? "" : " ("), + ((reason == NULL)? "" : reason), + ((reason == NULL)? "" : ")"), + event->id); + + if (succeeded) { + crm_node_t *peer = pcmk__search_known_node_cache(0, event->target, CRM_GET_PEER_ANY); const char *uuid = NULL; - gboolean we_are_executioner = pcmk__str_eq(st_event->executioner, - fsa_our_uname, - pcmk__str_casei); if (peer == NULL) { return; } uuid = crm_peer_uuid(peer); - crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); - if(AM_I_DC) { + if (AM_I_DC) { /* The DC always sends updates */ - send_stonith_update(NULL, st_event->target, uuid); + send_stonith_update(NULL, event->target, uuid); /* @TODO Ideally, at this point, we'd check whether the fenced node * hosted any guest nodes, and call remote_node_down() for them. * Unfortunately, the controller doesn't have a simple, reliable way * to map hosts to guests. It might be possible to track this in the * peer cache via crm_remote_peer_cache_refresh(). For now, we rely * on the scheduler creating fence pseudo-events for the guests. */ - if (st_event->client_origin - && !pcmk__str_eq(st_event->client_origin, te_client_id, pcmk__str_casei)) { - - /* Abort the current transition graph if it wasn't us - * that invoked stonith to fence someone + if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) { + /* Abort the current transition if it wasn't the cluster that + * initiated fencing. */ - crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); - abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); + crm_info("External fencing operation from %s fenced %s", + client, event->target); + abort_transition(INFINITY, tg_restart, + "External Fencing Operation", NULL); } /* Assume it was our leader if we don't currently have one */ - } else if (pcmk__str_eq(fsa_our_dc, st_event->target, pcmk__str_null_matches | pcmk__str_casei) + } else if (pcmk__str_eq(fsa_our_dc, event->target, + pcmk__str_null_matches|pcmk__str_casei) && !pcmk_is_set(peer->flags, crm_remote_node)) { crm_notice("Fencing target %s %s our leader", - st_event->target, (fsa_our_dc? "was" : "may have been")); + event->target, (fsa_our_dc? "was" : "may have been")); /* Given the CIB resyncing that occurs around elections, * have one node update the CIB now and, if the new DC is different, * have them do so too after the election */ - if (we_are_executioner) { - send_stonith_update(NULL, st_event->target, uuid); + if (pcmk__str_eq(event->executioner, fsa_our_uname, + pcmk__str_casei)) { + send_stonith_update(NULL, event->target, uuid); } - add_stonith_cleanup(st_event->target); + add_stonith_cleanup(event->target); } /* If the target is a remote node, and we host its connection, * immediately fail all monitors so it can be recovered quickly. * The connection won't necessarily drop when a remote node is fenced, * so the failure might not otherwise be detected until the next poke. */ if (pcmk_is_set(peer->flags, crm_remote_node)) { - remote_ra_fail(st_event->target); + remote_ra_fail(event->target); } crmd_peer_down(peer, TRUE); } } /*! * \brief Connect to fencer * * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop * * \return TRUE * \note If user_data is NULL, this will wait 2s between attempts, for up to * 30 attempts, meaning the controller could be blocked as long as 58s. */ static gboolean te_connect_stonith(gpointer user_data) { int rc = pcmk_ok; if (stonith_api == NULL) { stonith_api = stonith_api_new(); if (stonith_api == NULL) { crm_err("Could not connect to fencer: API memory allocation failed"); return TRUE; } } if (stonith_api->state != stonith_disconnected) { crm_trace("Already connected to fencer, no need to retry"); return TRUE; } if (user_data == NULL) { // Blocking (retry failures now until successful) rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30); if (rc != pcmk_ok) { crm_err("Could not connect to fencer in 30 attempts: %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); } } else { // Non-blocking (retry failures later in main loop) rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); if (rc != pcmk_ok) { if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) { crm_notice("Fencer connection failed (will retry): %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); mainloop_set_trigger(stonith_reconnect); } else { crm_info("Fencer connection failed (ignoring because no longer required): %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); } return TRUE; } } if (rc == pcmk_ok) { stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT, tengine_stonith_connection_destroy); stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE, - tengine_stonith_notify); + handle_fence_notification); stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED, tengine_stonith_history_synced); te_trigger_stonith_history_sync(TRUE); crm_notice("Fencer successfully connected"); } return TRUE; } /*! \internal \brief Schedule fencer connection attempt in main loop */ void controld_trigger_fencer_connect() { if (stonith_reconnect == NULL) { stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, GINT_TO_POINTER(TRUE)); } controld_set_fsa_input_flags(R_ST_REQUIRED); mainloop_set_trigger(stonith_reconnect); } void controld_disconnect_fencer(bool destroy) { if (stonith_api) { // Prevent fencer connection from coming up again controld_clear_fsa_input_flags(R_ST_REQUIRED); if (stonith_api->state != stonith_disconnected) { stonith_api->cmds->disconnect(stonith_api); } stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT); stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE); stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED); } if (destroy) { if (stonith_api) { stonith_api->cmds->free(stonith_api); stonith_api = NULL; } if (stonith_reconnect) { mainloop_destroy_trigger(stonith_reconnect); stonith_reconnect = NULL; } if (te_client_id) { free(te_client_id); te_client_id = NULL; } } } static gboolean do_stonith_history_sync(gpointer user_data) { if (stonith_api && (stonith_api->state != stonith_disconnected)) { stonith_history_t *history = NULL; te_cleanup_stonith_history_sync(stonith_api, FALSE); stonith_api->cmds->history(stonith_api, st_opt_sync_call | st_opt_broadcast, NULL, &history, 5); stonith_history_free(history); return TRUE; } else { crm_info("Skip triggering stonith history-sync as stonith is disconnected"); return FALSE; } } static void tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) { char *uuid = NULL; int stonith_id = -1; int transition_id = -1; crm_action_t *action = NULL; const char *target = NULL; if ((data == NULL) || (data->userdata == NULL)) { crm_err("Ignoring fence operation %d result: " "No transition key given (bug?)", ((data == NULL)? -1 : data->call_id)); return; } if (!AM_I_DC) { const char *reason = stonith__exit_reason(data); if (reason == NULL) { reason = pcmk_exec_status_str(stonith__execution_status(data)); } crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s", data->call_id, stonith__exit_status(data), reason, (const char *) data->userdata); return; } CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id, &stonith_id, NULL), goto bail); if (transition_graph->complete || (stonith_id < 0) || !pcmk__str_eq(uuid, te_uuid, pcmk__str_none) || (transition_graph->id != transition_id)) { crm_info("Ignoring fence operation %d result: " "Not from current transition " CRM_XS " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)", data->call_id, pcmk__btoa(transition_graph->complete), stonith_id, uuid, te_uuid, transition_id, transition_graph->id); goto bail; } action = controld_get_action(stonith_id); if (action == NULL) { crm_err("Ignoring fence operation %d result: " "Action %d not found in transition graph (bug?) " CRM_XS " uuid=%s transition=%d", data->call_id, stonith_id, uuid, transition_id); goto bail; } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); if (target == NULL) { crm_err("Ignoring fence operation %d result: No target given (bug?)", data->call_id); goto bail; } stop_te_timer(action->timer); if (stonith__exit_status(data) == CRM_EX_OK) { const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); const char *op = crm_meta_value(action->params, "stonith_action"); crm_notice("Fence operation %d for %s passed", data->call_id, target); if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) { te_action_confirmed(action, NULL); if (pcmk__str_eq("on", op, pcmk__str_casei)) { const char *value = NULL; char *now = pcmk__ttoa(time(NULL)); gboolean is_remote_node = FALSE; /* This check is not 100% reliable, since this node is not * guaranteed to have the remote node cached. However, it * doesn't have to be reliable, since the attribute manager can * learn a node's "remoteness" by other means sooner or later. * This allows it to learn more quickly if this node does have * the information. */ if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) { is_remote_node = TRUE; } update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, is_remote_node); free(now); value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL); update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, is_remote_node); value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE); update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, is_remote_node); } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) { send_stonith_update(action, target, uuid); crm__set_graph_action_flags(action, pcmk__graph_action_sent_update); } } st_fail_count_reset(target); } else { enum transition_action abort_action = tg_restart; int status = stonith__execution_status(data); const char *reason = stonith__exit_reason(data); if (reason == NULL) { if (status == PCMK_EXEC_DONE) { reason = "Agent returned error"; } else { reason = pcmk_exec_status_str(status); } } crm__set_graph_action_flags(action, pcmk__graph_action_failed); /* If no fence devices were available, there's no use in immediately * checking again, so don't start a new transition in that case. */ if (status == PCMK_EXEC_NO_FENCE_DEVICE) { crm_warn("Fence operation %d for %s failed: %s " "(aborting transition and giving up for now)", data->call_id, target, reason); abort_action = tg_stop; } else { crm_notice("Fence operation %d for %s failed: %s " "(aborting transition)", data->call_id, target, reason); } /* Increment the fail count now, so abort_for_stonith_failure() can - * check it. Non-DC nodes will increment it in tengine_stonith_notify(). + * check it. Non-DC nodes will increment it in + * handle_fence_notification(). */ st_fail_count_increment(target); abort_for_stonith_failure(abort_action, target, NULL); } pcmk__update_graph(transition_graph, action); trigger_graph(); bail: free(data->userdata); free(uuid); return; } static int fence_with_delay(const char *target, const char *type, const char *delay) { uint32_t options = st_opt_none; // Group of enum stonith_call_options int timeout_sec = (int) (transition_graph->stonith_timeout / 1000); int delay_i; if (crmd_join_phase_count(crm_join_confirmed) == 1) { stonith__set_call_options(options, target, st_opt_allow_suicide); } pcmk__scan_min_int(delay, &delay_i, 0); return stonith_api->cmds->fence_with_delay(stonith_api, options, target, type, timeout_sec, 0, delay_i); } gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action) { int rc = 0; const char *id = NULL; const char *uuid = NULL; const char *target = NULL; const char *type = NULL; char *transition_key = NULL; const char *priority_delay = NULL; gboolean invalid_action = FALSE; id = ID(action->xml); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); type = crm_meta_value(action->params, "stonith_action"); CRM_CHECK(id != NULL, invalid_action = TRUE); CRM_CHECK(uuid != NULL, invalid_action = TRUE); CRM_CHECK(type != NULL, invalid_action = TRUE); CRM_CHECK(target != NULL, invalid_action = TRUE); if (invalid_action) { crm_log_xml_warn(action->xml, "BadAction"); return FALSE; } priority_delay = crm_meta_value(action->params, XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY); crm_notice("Requesting fencing (%s) of node %s " CRM_XS " action=%s timeout=%u%s%s", type, target, id, transition_graph->stonith_timeout, priority_delay ? " priority_delay=" : "", priority_delay ? priority_delay : ""); /* Passing NULL means block until we can connect... */ te_connect_stonith(NULL); rc = fence_with_delay(target, type, priority_delay); transition_key = pcmk__transition_key(transition_graph->id, action->id, 0, te_uuid), stonith_api->cmds->register_callback(stonith_api, rc, (int) (transition_graph->stonith_timeout / 1000), st_opt_timeout_updates, transition_key, "tengine_stonith_callback", tengine_stonith_callback); return TRUE; } bool controld_verify_stonith_watchdog_timeout(const char *value) { gboolean rv = TRUE; if (stonith_api && (stonith_api->state != stonith_disconnected) && stonith__watchdog_fencing_enabled_for_node_api(stonith_api, fsa_our_uname)) { rv = pcmk__valid_sbd_timeout(value); } return rv; } /* end stonith API client functions */ /* * stonith history synchronization * * Each node's fencer keeps track of a cluster-wide fencing history. When a node * joins or leaves, we need to synchronize the history across all nodes. */ static crm_trigger_t *stonith_history_sync_trigger = NULL; static mainloop_timer_t *stonith_history_sync_timer_short = NULL; static mainloop_timer_t *stonith_history_sync_timer_long = NULL; void te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers) { if (free_timers) { mainloop_timer_del(stonith_history_sync_timer_short); stonith_history_sync_timer_short = NULL; mainloop_timer_del(stonith_history_sync_timer_long); stonith_history_sync_timer_long = NULL; } else { mainloop_timer_stop(stonith_history_sync_timer_short); mainloop_timer_stop(stonith_history_sync_timer_long); } if (st) { st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED); } } static void tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event) { te_cleanup_stonith_history_sync(st, FALSE); crm_debug("Fence-history synced - cancel all timers"); } static gboolean stonith_history_sync_set_trigger(gpointer user_data) { mainloop_set_trigger(stonith_history_sync_trigger); return FALSE; } void te_trigger_stonith_history_sync(bool long_timeout) { /* trigger a sync in 5s to give more nodes the * chance to show up so that we don't create * unnecessary stonith-history-sync traffic * * the long timeout of 30s is there as a fallback * so that after a successful connection to fenced * we will wait for 30s for the DC to trigger a * history-sync * if this doesn't happen we trigger a sync locally * (e.g. fenced segfaults and is restarted by pacemakerd) */ /* as we are finally checking the stonith-connection * in do_stonith_history_sync we should be fine * leaving stonith_history_sync_time & stonith_history_sync_trigger * around */ if (stonith_history_sync_trigger == NULL) { stonith_history_sync_trigger = mainloop_add_trigger(G_PRIORITY_LOW, do_stonith_history_sync, NULL); } if (long_timeout) { if(stonith_history_sync_timer_long == NULL) { stonith_history_sync_timer_long = mainloop_timer_add("history_sync_long", 30000, FALSE, stonith_history_sync_set_trigger, NULL); } crm_info("Fence history will be synchronized cluster-wide within 30 seconds"); mainloop_timer_start(stonith_history_sync_timer_long); } else { if(stonith_history_sync_timer_short == NULL) { stonith_history_sync_timer_short = mainloop_timer_add("history_sync_short", 5000, FALSE, stonith_history_sync_set_trigger, NULL); } crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); mainloop_timer_start(stonith_history_sync_timer_short); } } /* end stonith history synchronization functions */ diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c index e222a59f9f..e3113452ef 100644 --- a/daemons/fenced/cts-fence-helper.c +++ b/daemons/fenced/cts-fence-helper.c @@ -1,682 +1,686 @@ /* * Copyright 2009-2022 the Pacemaker project contributors * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static GMainLoop *mainloop = NULL; static crm_trigger_t *trig = NULL; static int mainloop_iter = 0; static pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; typedef void (*mainloop_test_iteration_cb) (int check_event); #define MAINLOOP_DEFAULT_TIMEOUT 2 enum test_modes { test_standard = 0, // test using a specific developer environment test_passive, // watch notifications only test_api_sanity, // sanity-test stonith client API using fence_dummy test_api_mainloop, // sanity-test mainloop code with async responses }; static pcmk__cli_option_t long_options[] = { // long option, argument type, storage, short option, description, flags { "verbose", no_argument, NULL, 'V', NULL, pcmk__option_default }, { "version", no_argument, NULL, '$', NULL, pcmk__option_default }, { "help", no_argument, NULL, '?', NULL, pcmk__option_default }, { "passive", no_argument, NULL, 'p', NULL, pcmk__option_default }, { "api_test", no_argument, NULL, 't', NULL, pcmk__option_default }, { "mainloop_api_test", no_argument, NULL, 'm', NULL, pcmk__option_default }, { 0, 0, 0, 0 } }; static stonith_t *st = NULL; static struct pollfd pollfd; static const int st_opts = st_opt_sync_call; static int expected_notifications = 0; static int verbose = 0; static void mainloop_test_done(const char *origin, bool pass) { if (pass) { crm_info("SUCCESS - %s", origin); mainloop_iter++; mainloop_set_trigger(trig); result.execution_status = PCMK_EXEC_DONE; result.exit_status = CRM_EX_OK; } else { crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status, pcmk_exec_status_str(result.execution_status)); crm_exit(CRM_EX_ERROR); } } static void dispatch_helper(int timeout) { int rc; crm_debug("Looking for notification"); pollfd.events = POLLIN; while (true) { rc = poll(&pollfd, 1, timeout); /* wait 10 minutes, -1 forever */ if (rc > 0) { if (!stonith_dispatch(st)) { break; } } else { break; } } } static void st_callback(stonith_t * st, stonith_event_t * e) { if (st->state == stonith_disconnected) { crm_exit(CRM_EX_DISCONNECT); } - crm_notice("Operation %s requested by %s %s for peer %s. %s reported: %s (ref=%s)", - e->operation, e->origin, e->result == pcmk_ok ? "completed" : "failed", - e->target, e->executioner ? e->executioner : "", - pcmk_strerror(e->result), e->id); + crm_notice("Operation '%s' targeting %s by %s for %s: %s (exit=%d, ref=%s)", + ((e->operation == NULL)? "unknown" : e->operation), + ((e->target == NULL)? "no node" : e->target), + ((e->executioner == NULL)? "any node" : e->executioner), + ((e->origin == NULL)? "unknown client" : e->origin), + pcmk_exec_status_str(stonith__event_execution_status(e)), + stonith__event_exit_status(e), + ((e->id == NULL)? "none" : e->id)); if (expected_notifications) { expected_notifications--; } } static void st_global_callback(stonith_t * stonith, stonith_callback_data_t * data) { crm_notice("Call %d exited %d: %s (%s)", data->call_id, stonith__exit_status(data), stonith__execution_status(data), crm_str(stonith__exit_reason(data))); } static void passive_test(void) { int rc = 0; rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); if (rc != pcmk_ok) { stonith_api_delete(st); crm_exit(CRM_EX_DISCONNECT); } st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, st_callback); st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, st_callback); st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback); st->cmds->register_notification(st, STONITH_OP_DEVICE_DEL, st_callback); st->cmds->register_callback(st, 0, 120, st_opt_timeout_updates, NULL, "st_global_callback", st_global_callback); dispatch_helper(600 * 1000); } #define single_test(cmd, str, num_notifications, expected_rc) \ { \ int rc = 0; \ rc = cmd; \ expected_notifications = 0; \ if (num_notifications) { \ expected_notifications = num_notifications; \ dispatch_helper(500); \ } \ if (rc != expected_rc) { \ crm_err("FAILURE - expected rc %d != %d(%s) for cmd - %s", expected_rc, rc, pcmk_strerror(rc), str); \ crm_exit(CRM_EX_ERROR); \ } else if (expected_notifications) { \ crm_err("FAILURE - expected %d notifications, got only %d for cmd - %s", \ num_notifications, num_notifications - expected_notifications, str); \ crm_exit(CRM_EX_ERROR); \ } else { \ if (verbose) { \ crm_info("SUCCESS - %s: %d", str, rc); \ } else { \ crm_debug("SUCCESS - %s: %d", str, rc); \ } \ } \ }\ static void run_fence_failure_test(void) { stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "false_1_node1=1,2 false_1_node2=3,4"); params = stonith_key_value_add(params, "mode", "fail"); single_test(st-> cmds->register_device(st, st_opts, "test-id1", "stonith-ng", "fence_dummy", params), "Register device1 for failure test", 1, 0); single_test(st->cmds->fence(st, st_opts, "false_1_node2", "off", 3, 0), - "Fence failure results off", 1, -pcmk_err_generic); + "Fence failure results off", 1, -ENODATA); single_test(st->cmds->fence(st, st_opts, "false_1_node2", "reboot", 3, 0), - "Fence failure results reboot", 1, -pcmk_err_generic); + "Fence failure results reboot", 1, -ENODATA); single_test(st->cmds->remove_device(st, st_opts, "test-id1"), "Remove device1 for failure test", 1, 0); stonith_key_value_freeall(params, 1, 1); } static void run_fence_failure_rollover_test(void) { stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "false_1_node1=1,2 false_1_node2=3,4"); params = stonith_key_value_add(params, "mode", "fail"); single_test(st-> cmds->register_device(st, st_opts, "test-id1", "stonith-ng", "fence_dummy", params), "Register device1 for rollover test", 1, 0); stonith_key_value_freeall(params, 1, 1); params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "false_1_node1=1,2 false_1_node2=3,4"); params = stonith_key_value_add(params, "mode", "pass"); single_test(st-> cmds->register_device(st, st_opts, "test-id2", "stonith-ng", "fence_dummy", params), "Register device2 for rollover test", 1, 0); single_test(st->cmds->fence(st, st_opts, "false_1_node2", "off", 3, 0), "Fence rollover results off", 1, 0); /* Expect -ENODEV because fence_dummy requires 'on' to be executed on target */ single_test(st->cmds->fence(st, st_opts, "false_1_node2", "on", 3, 0), "Fence rollover results on", 1, -ENODEV); single_test(st->cmds->remove_device(st, st_opts, "test-id1"), "Remove device1 for rollover tests", 1, 0); single_test(st->cmds->remove_device(st, st_opts, "test-id2"), "Remove device2 for rollover tests", 1, 0); stonith_key_value_freeall(params, 1, 1); } static void run_standard_test(void) { stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "false_1_node1=1,2 false_1_node2=3,4"); params = stonith_key_value_add(params, "mode", "pass"); params = stonith_key_value_add(params, "mock_dynamic_hosts", "false_1_node1 false_1_node2"); single_test(st-> cmds->register_device(st, st_opts, "test-id", "stonith-ng", "fence_dummy", params), "Register", 1, 0); stonith_key_value_freeall(params, 1, 1); params = NULL; single_test(st->cmds->list(st, st_opts, "test-id", NULL, 1), "list", 1, 0); single_test(st->cmds->monitor(st, st_opts, "test-id", 1), "Monitor", 1, 0); single_test(st->cmds->status(st, st_opts, "test-id", "false_1_node2", 1), "Status false_1_node2", 1, 0); single_test(st->cmds->status(st, st_opts, "test-id", "false_1_node1", 1), "Status false_1_node1", 1, 0); single_test(st->cmds->fence(st, st_opts, "unknown-host", "off", 1, 0), "Fence unknown-host (expected failure)", 0, -ENODEV); single_test(st->cmds->fence(st, st_opts, "false_1_node1", "off", 1, 0), "Fence false_1_node1", 1, 0); /* Expect -ENODEV because fence_dummy requires 'on' to be executed on target */ single_test(st->cmds->fence(st, st_opts, "false_1_node1", "on", 1, 0), "Unfence false_1_node1", 1, -ENODEV); /* Confirm that an invalid level index is rejected */ single_test(st->cmds->register_level(st, st_opts, "node1", 999, params), "Attempt to register an invalid level index", 0, -EINVAL); single_test(st->cmds->remove_device(st, st_opts, "test-id"), "Remove test-id", 1, 0); stonith_key_value_freeall(params, 1, 1); } static void sanity_tests(void) { int rc = 0; rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); if (rc != pcmk_ok) { stonith_api_delete(st); crm_exit(CRM_EX_DISCONNECT); } st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, st_callback); st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, st_callback); st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback); st->cmds->register_notification(st, STONITH_OP_DEVICE_DEL, st_callback); st->cmds->register_callback(st, 0, 120, st_opt_timeout_updates, NULL, "st_global_callback", st_global_callback); crm_info("Starting API Sanity Tests"); run_standard_test(); run_fence_failure_test(); run_fence_failure_rollover_test(); crm_info("Sanity Tests Passed"); } static void standard_dev_test(void) { int rc = 0; char *tmp = NULL; stonith_key_value_t *params = NULL; rc = st->cmds->connect(st, crm_system_name, &pollfd.fd); if (rc != pcmk_ok) { stonith_api_delete(st); crm_exit(CRM_EX_DISCONNECT); } params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "some-host=pcmk-7 true_1_node1=3,4"); rc = st->cmds->register_device(st, st_opts, "test-id", "stonith-ng", "fence_xvm", params); crm_debug("Register: %d", rc); rc = st->cmds->list(st, st_opts, "test-id", &tmp, 10); crm_debug("List: %d output: %s", rc, tmp ? tmp : ""); rc = st->cmds->monitor(st, st_opts, "test-id", 10); crm_debug("Monitor: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "false_1_node2", 10); crm_debug("Status false_1_node2: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "false_1_node1", 10); crm_debug("Status false_1_node1: %d", rc); rc = st->cmds->fence(st, st_opts, "unknown-host", "off", 60, 0); crm_debug("Fence unknown-host: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "false_1_node1", 10); crm_debug("Status false_1_node1: %d", rc); rc = st->cmds->fence(st, st_opts, "false_1_node1", "off", 60, 0); crm_debug("Fence false_1_node1: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "false_1_node1", 10); crm_debug("Status false_1_node1: %d", rc); rc = st->cmds->fence(st, st_opts, "false_1_node1", "on", 10, 0); crm_debug("Unfence false_1_node1: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "false_1_node1", 10); crm_debug("Status false_1_node1: %d", rc); rc = st->cmds->fence(st, st_opts, "some-host", "off", 10, 0); crm_debug("Fence alias: %d", rc); rc = st->cmds->status(st, st_opts, "test-id", "some-host", 10); crm_debug("Status alias: %d", rc); rc = st->cmds->fence(st, st_opts, "false_1_node1", "on", 10, 0); crm_debug("Unfence false_1_node1: %d", rc); rc = st->cmds->remove_device(st, st_opts, "test-id"); crm_debug("Remove test-id: %d", rc); stonith_key_value_freeall(params, 1, 1); } static void iterate_mainloop_tests(gboolean event_ready); static void mainloop_callback(stonith_t * stonith, stonith_callback_data_t * data) { pcmk__set_result(&result, stonith__exit_status(data), stonith__execution_status(data), stonith__exit_reason(data)); iterate_mainloop_tests(TRUE); } static int register_callback_helper(int callid) { return st->cmds->register_callback(st, callid, MAINLOOP_DEFAULT_TIMEOUT, st_opt_timeout_updates, NULL, "callback", mainloop_callback); } static void test_async_fence_pass(int check_event) { int rc = 0; if (check_event) { mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); return; } rc = st->cmds->fence(st, 0, "true_1_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); if (rc < 0) { crm_err("fence failed with rc %d", rc); mainloop_test_done(__func__, false); } register_callback_helper(rc); /* wait for event */ } #define CUSTOM_TIMEOUT_ADDITION 10 static void test_async_fence_custom_timeout(int check_event) { int rc = 0; static time_t begin = 0; if (check_event) { uint32_t diff = (time(NULL) - begin); if (result.execution_status != PCMK_EXEC_TIMEOUT) { mainloop_test_done(__func__, false); } else if (diff < CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT) { crm_err ("Custom timeout test failed, callback expiration should be updated to %d, actual timeout was %d", CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT, diff); mainloop_test_done(__func__, false); } else { mainloop_test_done(__func__, true); } return; } begin = time(NULL); rc = st->cmds->fence(st, 0, "custom_timeout_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); if (rc < 0) { crm_err("fence failed with rc %d", rc); mainloop_test_done(__func__, false); } register_callback_helper(rc); /* wait for event */ } static void test_async_fence_timeout(int check_event) { int rc = 0; if (check_event) { mainloop_test_done(__func__, (result.execution_status == PCMK_EXEC_NO_FENCE_DEVICE)); return; } rc = st->cmds->fence(st, 0, "false_1_node2", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); if (rc < 0) { crm_err("fence failed with rc %d", rc); mainloop_test_done(__func__, false); } register_callback_helper(rc); /* wait for event */ } static void test_async_monitor(int check_event) { int rc = 0; if (check_event) { mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); return; } rc = st->cmds->monitor(st, 0, "false_1", MAINLOOP_DEFAULT_TIMEOUT); if (rc < 0) { crm_err("monitor failed with rc %d", rc); mainloop_test_done(__func__, false); } register_callback_helper(rc); /* wait for event */ } static void test_register_async_devices(int check_event) { char buf[16] = { 0, }; stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "false_1_node1=1,2"); params = stonith_key_value_add(params, "mode", "fail"); st->cmds->register_device(st, st_opts, "false_1", "stonith-ng", "fence_dummy", params); stonith_key_value_freeall(params, 1, 1); params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "true_1_node1=1,2"); params = stonith_key_value_add(params, "mode", "pass"); st->cmds->register_device(st, st_opts, "true_1", "stonith-ng", "fence_dummy", params); stonith_key_value_freeall(params, 1, 1); params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_MAP, "custom_timeout_node1=1,2"); params = stonith_key_value_add(params, "mode", "fail"); params = stonith_key_value_add(params, "delay", "1000"); snprintf(buf, sizeof(buf) - 1, "%d", MAINLOOP_DEFAULT_TIMEOUT + CUSTOM_TIMEOUT_ADDITION); params = stonith_key_value_add(params, "pcmk_off_timeout", buf); st->cmds->register_device(st, st_opts, "false_custom_timeout", "stonith-ng", "fence_dummy", params); stonith_key_value_freeall(params, 1, 1); mainloop_test_done(__func__, true); } static void try_mainloop_connect(int check_event) { int rc = stonith_api_connect_retry(st, crm_system_name, 10); if (rc == pcmk_ok) { mainloop_test_done(__func__, true); return; } crm_err("API CONNECTION FAILURE"); mainloop_test_done(__func__, false); } static void iterate_mainloop_tests(gboolean event_ready) { static mainloop_test_iteration_cb callbacks[] = { try_mainloop_connect, test_register_async_devices, test_async_monitor, test_async_fence_pass, test_async_fence_timeout, test_async_fence_custom_timeout, }; if (mainloop_iter == (sizeof(callbacks) / sizeof(mainloop_test_iteration_cb))) { /* all tests ran, everything passed */ crm_info("ALL MAINLOOP TESTS PASSED!"); crm_exit(CRM_EX_OK); } callbacks[mainloop_iter] (event_ready); } static gboolean trigger_iterate_mainloop_tests(gpointer user_data) { iterate_mainloop_tests(FALSE); return TRUE; } static void test_shutdown(int nsig) { int rc = 0; if (st) { rc = st->cmds->disconnect(st); crm_info("Disconnect: %d", rc); crm_debug("Destroy"); stonith_api_delete(st); } if (rc) { crm_exit(CRM_EX_ERROR); } } static void mainloop_tests(void) { trig = mainloop_add_trigger(G_PRIORITY_HIGH, trigger_iterate_mainloop_tests, NULL); mainloop_set_trigger(trig); mainloop_add_signal(SIGTERM, test_shutdown); crm_info("Starting"); mainloop = g_main_loop_new(NULL, FALSE); g_main_loop_run(mainloop); } int main(int argc, char **argv) { int argerr = 0; int flag; int option_index = 0; enum test_modes mode = test_standard; pcmk__cli_init_logging("cts-fence-helper", 0); pcmk__set_cli_options(NULL, " [options]", long_options, "inject commands into the Pacemaker fencer, " "and watch for events"); while (1) { flag = pcmk__next_cli_option(argc, argv, &option_index, NULL); if (flag == -1) { break; } switch (flag) { case 'V': verbose = 1; break; case '$': case '?': pcmk__cli_help(flag, CRM_EX_OK); break; case 'p': mode = test_passive; break; case 't': mode = test_api_sanity; break; case 'm': mode = test_api_mainloop; break; default: ++argerr; break; } } crm_log_init(NULL, LOG_INFO, TRUE, (verbose? TRUE : FALSE), argc, argv, FALSE); if (optind > argc) { ++argerr; } if (argerr) { pcmk__cli_help('?', CRM_EX_USAGE); } st = stonith_api_new(); if (st == NULL) { crm_err("Could not connect to fencer: API memory allocation failed"); crm_exit(CRM_EX_DISCONNECT); } switch (mode) { case test_standard: standard_dev_test(); break; case test_passive: passive_test(); break; case test_api_sanity: sanity_tests(); break; case test_api_mainloop: mainloop_tests(); break; } test_shutdown(0); return CRM_EX_OK; } diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h index eff689e59b..d2b49f831a 100644 --- a/include/crm/fencing/internal.h +++ b/include/crm/fencing/internal.h @@ -1,211 +1,217 @@ /* * Copyright 2011-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef STONITH_NG_INTERNAL__H # define STONITH_NG_INTERNAL__H # include # include # include # include # include enum st_device_flags { st_device_supports_list = 0x0001, st_device_supports_status = 0x0002, st_device_supports_reboot = 0x0004, st_device_supports_parameter_plug = 0x0008, st_device_supports_parameter_port = 0x0010, }; #define stonith__set_device_flags(device_flags, device_id, flags_to_set) do { \ device_flags = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \ "Fence device", device_id, \ (device_flags), (flags_to_set), \ #flags_to_set); \ } while (0) #define stonith__set_call_options(st_call_opts, call_for, flags_to_set) do { \ st_call_opts = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \ "Fencer call", (call_for), \ (st_call_opts), (flags_to_set), \ #flags_to_set); \ } while (0) #define stonith__clear_call_options(st_call_opts, call_for, flags_to_clear) do { \ st_call_opts = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \ "Fencer call", (call_for), \ (st_call_opts), (flags_to_clear), \ #flags_to_clear); \ } while (0) struct stonith_action_s; typedef struct stonith_action_s stonith_action_t; stonith_action_t *stonith_action_create(const char *agent, const char *_action, const char *victim, uint32_t victim_nodeid, int timeout, GHashTable * device_args, GHashTable * port_map, const char * host_arg); void stonith__destroy_action(stonith_action_t *action); pcmk__action_result_t *stonith__action_result(stonith_action_t *action); int stonith__result2rc(const pcmk__action_result_t *result); void stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result); void stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result); xmlNode *stonith__find_xe_with_result(xmlNode *xml); int stonith_action_execute_async(stonith_action_t * action, void *userdata, void (*done) (int pid, const pcmk__action_result_t *result, void *user_data), void (*fork_cb) (int pid, void *user_data)); xmlNode *create_level_registration_xml(const char *node, const char *pattern, const char *attr, const char *value, int level, stonith_key_value_t *device_list); xmlNode *create_device_registration_xml(const char *id, enum stonith_namespace namespace, const char *agent, stonith_key_value_t *params, const char *rsc_provides); void stonith__register_messages(pcmk__output_t *out); GList *stonith__parse_targets(const char *hosts); gboolean stonith__later_succeeded(stonith_history_t *event, stonith_history_t *top_history); stonith_history_t *stonith__sort_history(stonith_history_t *history); void stonith__device_parameter_flags(uint32_t *device_flags, const char *device_name, xmlNode *metadata); # define ST_LEVEL_MAX 10 # define F_STONITH_CLIENTID "st_clientid" # define F_STONITH_CALLOPTS "st_callopt" # define F_STONITH_CALLID "st_callid" # define F_STONITH_CALLDATA "st_calldata" # define F_STONITH_OPERATION "st_op" # define F_STONITH_TARGET "st_target" # define F_STONITH_REMOTE_OP_ID "st_remote_op" # define F_STONITH_REMOTE_OP_ID_RELAY "st_remote_op_relay" # define F_STONITH_RC "st_rc" # define F_STONITH_OUTPUT "st_output" /*! Timeout period per a device execution */ # define F_STONITH_TIMEOUT "st_timeout" # define F_STONITH_TOLERANCE "st_tolerance" # define F_STONITH_DELAY "st_delay" /*! Action specific timeout period returned in query of fencing devices. */ # define F_STONITH_ACTION_TIMEOUT "st_action_timeout" /*! Host in query result is not allowed to run this action */ # define F_STONITH_ACTION_DISALLOWED "st_action_disallowed" /*! Maximum of random fencing delay for a device */ # define F_STONITH_DELAY_MAX "st_delay_max" /*! Base delay used for a fencing delay */ # define F_STONITH_DELAY_BASE "st_delay_base" /*! Has this device been verified using a monitor type * operation (monitor, list, status) */ # define F_STONITH_DEVICE_VERIFIED "st_monitor_verified" /*! device is required for this action */ # define F_STONITH_DEVICE_REQUIRED "st_required" /*! number of available devices in query result */ # define F_STONITH_AVAILABLE_DEVICES "st-available-devices" # define F_STONITH_CALLBACK_TOKEN "st_async_id" # define F_STONITH_CLIENTNAME "st_clientname" # define F_STONITH_CLIENTNODE "st_clientnode" # define F_STONITH_NOTIFY_ACTIVATE "st_notify_activate" # define F_STONITH_NOTIFY_DEACTIVATE "st_notify_deactivate" # define F_STONITH_DELEGATE "st_delegate" /*! The node initiating the stonith operation. If an operation * is relayed, this is the last node the operation lands on. When * in standalone mode, origin is the client's id that originated the * operation. */ # define F_STONITH_ORIGIN "st_origin" # define F_STONITH_HISTORY_LIST "st_history" # define F_STONITH_DATE "st_date" # define F_STONITH_DATE_NSEC "st_date_nsec" # define F_STONITH_STATE "st_state" # define F_STONITH_ACTIVE "st_active" # define F_STONITH_DIFFERENTIAL "st_differential" # define F_STONITH_DEVICE "st_device_id" # define F_STONITH_ACTION "st_device_action" # define F_STONITH_MERGED "st_op_merged" # define T_STONITH_NG "stonith-ng" # define T_STONITH_REPLY "st-reply" /*! For async operations, an event from the server containing * the total amount of time the server is allowing for the operation * to take place is returned to the client. */ # define T_STONITH_TIMEOUT_VALUE "st-async-timeout-value" # define T_STONITH_NOTIFY "st_notify" # define STONITH_ATTR_ACTION_OP "action" # define STONITH_OP_EXEC "st_execute" # define STONITH_OP_TIMEOUT_UPDATE "st_timeout_update" # define STONITH_OP_QUERY "st_query" # define STONITH_OP_FENCE "st_fence" # define STONITH_OP_RELAY "st_relay" # define STONITH_OP_DEVICE_ADD "st_device_register" # define STONITH_OP_DEVICE_DEL "st_device_remove" # define STONITH_OP_FENCE_HISTORY "st_fence_history" # define STONITH_OP_LEVEL_ADD "st_level_add" # define STONITH_OP_LEVEL_DEL "st_level_remove" # define STONITH_WATCHDOG_AGENT "fence_watchdog" /* Don't change 2 below as it would break rolling upgrade */ # define STONITH_WATCHDOG_AGENT_INTERNAL "#watchdog" # define STONITH_WATCHDOG_ID "watchdog" /* Exported for crm_mon to reference */ int stonith__failed_history(pcmk__output_t *out, va_list args); int stonith__history(pcmk__output_t *out, va_list args); int stonith__full_history(pcmk__output_t *out, va_list args); int stonith__pending_actions(pcmk__output_t *out, va_list args); stonith_history_t *stonith__first_matching_event(stonith_history_t *history, bool (*matching_fn)(stonith_history_t *, void *), void *user_data); bool stonith__event_state_pending(stonith_history_t *history, void *user_data); bool stonith__event_state_eq(stonith_history_t *history, void *user_data); bool stonith__event_state_neq(stonith_history_t *history, void *user_data); int stonith__legacy2status(int rc); + int stonith__exit_status(stonith_callback_data_t *data); int stonith__execution_status(stonith_callback_data_t *data); const char *stonith__exit_reason(stonith_callback_data_t *data); +int stonith__event_exit_status(stonith_event_t *event); +int stonith__event_execution_status(stonith_event_t *event); +const char *stonith__event_exit_reason(stonith_event_t *event); +char *stonith__event_description(stonith_event_t *event); + /*! * \internal * \brief Is a fencing operation in pending state? * * \param[in] state State as enum op_state value * * \return A boolean */ static inline bool stonith__op_state_pending(enum op_state state) { return state != st_failed && state != st_done; } gboolean stonith__watchdog_fencing_enabled_for_node(const char *node); gboolean stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node); #endif diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c index 4823751267..b1de912b2a 100644 --- a/lib/fencing/st_client.c +++ b/lib/fencing/st_client.c @@ -1,2367 +1,2483 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fencing_private.h" CRM_TRACE_INIT_DATA(stonith); typedef struct stonith_private_s { char *token; crm_ipc_t *ipc; mainloop_io_t *source; GHashTable *stonith_op_callback_table; GList *notify_list; int notify_refcnt; bool notify_deletes; void (*op_callback) (stonith_t * st, stonith_callback_data_t * data); } stonith_private_t; typedef struct stonith_notify_client_s { const char *event; const char *obj_id; /* implement one day */ const char *obj_type; /* implement one day */ void (*notify) (stonith_t * st, stonith_event_t * e); bool delete; } stonith_notify_client_t; typedef struct stonith_callback_client_s { void (*callback) (stonith_t * st, stonith_callback_data_t * data); const char *id; void *user_data; gboolean only_success; gboolean allow_timeout_updates; struct timer_rec_s *timer; } stonith_callback_client_t; struct notify_blob_s { stonith_t *stonith; xmlNode *xml; }; struct timer_rec_s { int call_id; int timeout; guint ref; stonith_t *stonith; }; typedef int (*stonith_op_t) (const char *, int, const char *, xmlNode *, xmlNode *, xmlNode *, xmlNode **, xmlNode **); bool stonith_dispatch(stonith_t * st); xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, int call_options); static int stonith_send_command(stonith_t *stonith, const char *op, xmlNode *data, xmlNode **output_data, int call_options, int timeout); static void stonith_connection_destroy(gpointer user_data); static void stonith_send_notification(gpointer data, gpointer user_data); /*! * \brief Get agent namespace by name * * \param[in] namespace_s Name of namespace as string * * \return Namespace as enum value */ enum stonith_namespace stonith_text2namespace(const char *namespace_s) { if (pcmk__str_eq(namespace_s, "any", pcmk__str_null_matches)) { return st_namespace_any; } else if (!strcmp(namespace_s, "redhat") || !strcmp(namespace_s, "stonith-ng")) { return st_namespace_rhcs; } else if (!strcmp(namespace_s, "internal")) { return st_namespace_internal; } else if (!strcmp(namespace_s, "heartbeat")) { return st_namespace_lha; } return st_namespace_invalid; } /*! * \brief Get agent namespace name * * \param[in] namespace Namespace as enum value * * \return Namespace name as string */ const char * stonith_namespace2text(enum stonith_namespace st_namespace) { switch (st_namespace) { case st_namespace_any: return "any"; case st_namespace_rhcs: return "stonith-ng"; case st_namespace_internal: return "internal"; case st_namespace_lha: return "heartbeat"; default: break; } return "unsupported"; } /*! * \brief Determine namespace of a fence agent * * \param[in] agent Fence agent type * \param[in] namespace_s Name of agent namespace as string, if known * * \return Namespace of specified agent, as enum value */ enum stonith_namespace stonith_get_namespace(const char *agent, const char *namespace_s) { if (pcmk__str_eq(namespace_s, "internal", pcmk__str_casei)) { return st_namespace_internal; } if (stonith__agent_is_rhcs(agent)) { return st_namespace_rhcs; } #if HAVE_STONITH_STONITH_H if (stonith__agent_is_lha(agent)) { return st_namespace_lha; } #endif crm_err("Unknown fence agent: %s", agent); return st_namespace_invalid; } gboolean stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) { gboolean rv = FALSE; stonith_t *stonith_api = st?st:stonith_api_new(); char *list = NULL; if(stonith_api) { if (stonith_api->state == stonith_disconnected) { int rc = stonith_api->cmds->connect(stonith_api, "stonith-api", NULL); if (rc != pcmk_ok) { crm_err("Failed connecting to Stonith-API for watchdog-fencing-query."); } } if (stonith_api->state != stonith_disconnected) { /* caveat!!! * this might fail when when stonithd is just updating the device-list * probably something we should fix as well for other api-calls */ int rc = stonith_api->cmds->list(stonith_api, st_opt_sync_call, STONITH_WATCHDOG_ID, &list, 0); if ((rc != pcmk_ok) || (list == NULL)) { /* due to the race described above it can happen that * we drop in here - so as not to make remote nodes * panic on that answer */ crm_warn("watchdog-fencing-query failed"); } else if (list[0] == '\0') { rv = TRUE; } else { GList *targets = stonith__parse_targets(list); rv = pcmk__str_in_list(node, targets, pcmk__str_casei); g_list_free_full(targets, free); } free(list); if (!st) { /* if we're provided the api we still might have done the * connection - but let's assume the caller won't bother */ stonith_api->cmds->disconnect(stonith_api); } } if (!st) { stonith_api_delete(stonith_api); } } else { crm_err("Stonith-API for watchdog-fencing-query couldn't be created."); } crm_trace("Pacemaker assumes node %s %sto do watchdog-fencing.", node, rv?"":"not "); return rv; } gboolean stonith__watchdog_fencing_enabled_for_node(const char *node) { return stonith__watchdog_fencing_enabled_for_node_api(NULL, node); } /* when cycling through the list we don't want to delete items so just mark them and when we know nobody is using the list loop over it to remove the marked items */ static void foreach_notify_entry (stonith_private_t *private, GFunc func, gpointer user_data) { private->notify_refcnt++; g_list_foreach(private->notify_list, func, user_data); private->notify_refcnt--; if ((private->notify_refcnt == 0) && private->notify_deletes) { GList *list_item = private->notify_list; private->notify_deletes = FALSE; while (list_item != NULL) { stonith_notify_client_t *list_client = list_item->data; GList *next = g_list_next(list_item); if (list_client->delete) { free(list_client); private->notify_list = g_list_delete_link(private->notify_list, list_item); } list_item = next; } } } static void stonith_connection_destroy(gpointer user_data) { stonith_t *stonith = user_data; stonith_private_t *native = NULL; struct notify_blob_s blob; crm_trace("Sending destroyed notification"); blob.stonith = stonith; blob.xml = create_xml_node(NULL, "notify"); native = stonith->st_private; native->ipc = NULL; native->source = NULL; free(native->token); native->token = NULL; stonith->state = stonith_disconnected; crm_xml_add(blob.xml, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(blob.xml, F_SUBTYPE, T_STONITH_NOTIFY_DISCONNECT); foreach_notify_entry(native, stonith_send_notification, &blob); free_xml(blob.xml); } xmlNode * create_device_registration_xml(const char *id, enum stonith_namespace namespace, const char *agent, stonith_key_value_t *params, const char *rsc_provides) { xmlNode *data = create_xml_node(NULL, F_STONITH_DEVICE); xmlNode *args = create_xml_node(data, XML_TAG_ATTRS); #if HAVE_STONITH_STONITH_H if (namespace == st_namespace_any) { namespace = stonith_get_namespace(agent, NULL); } if (namespace == st_namespace_lha) { hash2field((gpointer) "plugin", (gpointer) agent, args); agent = "fence_legacy"; } #endif crm_xml_add(data, XML_ATTR_ID, id); crm_xml_add(data, F_STONITH_ORIGIN, __func__); crm_xml_add(data, "agent", agent); if ((namespace != st_namespace_any) && (namespace != st_namespace_invalid)) { crm_xml_add(data, "namespace", stonith_namespace2text(namespace)); } if (rsc_provides) { crm_xml_add(data, "rsc_provides", rsc_provides); } for (; params; params = params->next) { hash2field((gpointer) params->key, (gpointer) params->value, args); } return data; } static int stonith_api_register_device(stonith_t * st, int call_options, const char *id, const char *namespace, const char *agent, stonith_key_value_t * params) { int rc = 0; xmlNode *data = NULL; data = create_device_registration_xml(id, stonith_text2namespace(namespace), agent, params, NULL); rc = stonith_send_command(st, STONITH_OP_DEVICE_ADD, data, NULL, call_options, 0); free_xml(data); return rc; } static int stonith_api_remove_device(stonith_t * st, int call_options, const char *name) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, F_STONITH_DEVICE); crm_xml_add(data, F_STONITH_ORIGIN, __func__); crm_xml_add(data, XML_ATTR_ID, name); rc = stonith_send_command(st, STONITH_OP_DEVICE_DEL, data, NULL, call_options, 0); free_xml(data); return rc; } static int stonith_api_remove_level_full(stonith_t *st, int options, const char *node, const char *pattern, const char *attr, const char *value, int level) { int rc = 0; xmlNode *data = NULL; CRM_CHECK(node || pattern || (attr && value), return -EINVAL); data = create_xml_node(NULL, XML_TAG_FENCING_LEVEL); crm_xml_add(data, F_STONITH_ORIGIN, __func__); if (node) { crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); } else if (pattern) { crm_xml_add(data, XML_ATTR_STONITH_TARGET_PATTERN, pattern); } else { crm_xml_add(data, XML_ATTR_STONITH_TARGET_ATTRIBUTE, attr); crm_xml_add(data, XML_ATTR_STONITH_TARGET_VALUE, value); } crm_xml_add_int(data, XML_ATTR_STONITH_INDEX, level); rc = stonith_send_command(st, STONITH_OP_LEVEL_DEL, data, NULL, options, 0); free_xml(data); return rc; } static int stonith_api_remove_level(stonith_t * st, int options, const char *node, int level) { return stonith_api_remove_level_full(st, options, node, NULL, NULL, NULL, level); } /*! * \internal * \brief Create XML for fence topology level registration request * * \param[in] node If not NULL, target level by this node name * \param[in] pattern If not NULL, target by node name using this regex * \param[in] attr If not NULL, target by this node attribute * \param[in] value If not NULL, target by this node attribute value * \param[in] level Index number of level to register * \param[in] device_list List of devices in level * * \return Newly allocated XML tree on success, NULL otherwise * * \note The caller should set only one of node, pattern or attr/value. */ xmlNode * create_level_registration_xml(const char *node, const char *pattern, const char *attr, const char *value, int level, stonith_key_value_t *device_list) { size_t len = 0; char *list = NULL; xmlNode *data; CRM_CHECK(node || pattern || (attr && value), return NULL); data = create_xml_node(NULL, XML_TAG_FENCING_LEVEL); CRM_CHECK(data, return NULL); crm_xml_add(data, F_STONITH_ORIGIN, __func__); crm_xml_add_int(data, XML_ATTR_ID, level); crm_xml_add_int(data, XML_ATTR_STONITH_INDEX, level); if (node) { crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); } else if (pattern) { crm_xml_add(data, XML_ATTR_STONITH_TARGET_PATTERN, pattern); } else { crm_xml_add(data, XML_ATTR_STONITH_TARGET_ATTRIBUTE, attr); crm_xml_add(data, XML_ATTR_STONITH_TARGET_VALUE, value); } // cppcheck seems not to understand the abort logic behind pcmk__realloc // cppcheck-suppress memleak for (; device_list; device_list = device_list->next) { pcmk__add_separated_word(&list, &len, device_list->value, ","); } crm_xml_add(data, XML_ATTR_STONITH_DEVICES, list); free(list); return data; } static int stonith_api_register_level_full(stonith_t * st, int options, const char *node, const char *pattern, const char *attr, const char *value, int level, stonith_key_value_t *device_list) { int rc = 0; xmlNode *data = create_level_registration_xml(node, pattern, attr, value, level, device_list); CRM_CHECK(data != NULL, return -EINVAL); rc = stonith_send_command(st, STONITH_OP_LEVEL_ADD, data, NULL, options, 0); free_xml(data); return rc; } static int stonith_api_register_level(stonith_t * st, int options, const char *node, int level, stonith_key_value_t * device_list) { return stonith_api_register_level_full(st, options, node, NULL, NULL, NULL, level, device_list); } static int stonith_api_device_list(stonith_t * stonith, int call_options, const char *namespace, stonith_key_value_t ** devices, int timeout) { int count = 0; enum stonith_namespace ns = stonith_text2namespace(namespace); if (devices == NULL) { crm_err("Parameter error: stonith_api_device_list"); return -EFAULT; } #if HAVE_STONITH_STONITH_H // Include Linux-HA agents if requested if ((ns == st_namespace_any) || (ns == st_namespace_lha)) { count += stonith__list_lha_agents(devices); } #endif // Include Red Hat agents if requested if ((ns == st_namespace_any) || (ns == st_namespace_rhcs)) { count += stonith__list_rhcs_agents(devices); } return count; } static int stonith_api_device_metadata(stonith_t * stonith, int call_options, const char *agent, const char *namespace, char **output, int timeout) { /* By executing meta-data directly, we can get it from stonith_admin when * the cluster is not running, which is important for higher-level tools. */ enum stonith_namespace ns = stonith_get_namespace(agent, namespace); crm_trace("Looking up metadata for %s agent %s", stonith_namespace2text(ns), agent); switch (ns) { case st_namespace_rhcs: return stonith__rhcs_metadata(agent, timeout, output); #if HAVE_STONITH_STONITH_H case st_namespace_lha: return stonith__lha_metadata(agent, timeout, output); #endif default: crm_err("Can't get fence agent '%s' meta-data: No such agent", agent); break; } return -ENODEV; } static int stonith_api_query(stonith_t * stonith, int call_options, const char *target, stonith_key_value_t ** devices, int timeout) { int rc = 0, lpc = 0, max = 0; xmlNode *data = NULL; xmlNode *output = NULL; xmlXPathObjectPtr xpathObj = NULL; CRM_CHECK(devices != NULL, return -EINVAL); data = create_xml_node(NULL, F_STONITH_DEVICE); crm_xml_add(data, F_STONITH_ORIGIN, __func__); crm_xml_add(data, F_STONITH_TARGET, target); crm_xml_add(data, F_STONITH_ACTION, "off"); rc = stonith_send_command(stonith, STONITH_OP_QUERY, data, &output, call_options, timeout); if (rc < 0) { return rc; } xpathObj = xpath_search(output, "//@agent"); if (xpathObj) { max = numXpathResults(xpathObj); for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); CRM_LOG_ASSERT(match != NULL); if(match != NULL) { xmlChar *match_path = xmlGetNodePath(match); crm_info("%s[%d] = %s", "//@agent", lpc, match_path); free(match_path); *devices = stonith_key_value_add(*devices, NULL, crm_element_value(match, XML_ATTR_ID)); } } freeXpathObject(xpathObj); } free_xml(output); free_xml(data); return max; } static int stonith_api_call(stonith_t * stonith, int call_options, const char *id, const char *action, const char *victim, int timeout, xmlNode ** output) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, F_STONITH_DEVICE); crm_xml_add(data, F_STONITH_ORIGIN, __func__); crm_xml_add(data, F_STONITH_DEVICE, id); crm_xml_add(data, F_STONITH_ACTION, action); crm_xml_add(data, F_STONITH_TARGET, victim); rc = stonith_send_command(stonith, STONITH_OP_EXEC, data, output, call_options, timeout); free_xml(data); return rc; } static int stonith_api_list(stonith_t * stonith, int call_options, const char *id, char **list_info, int timeout) { int rc; xmlNode *output = NULL; rc = stonith_api_call(stonith, call_options, id, "list", NULL, timeout, &output); if (output && list_info) { const char *list_str; list_str = crm_element_value(output, F_STONITH_OUTPUT); if (list_str) { *list_info = strdup(list_str); } } if (output) { free_xml(output); } return rc; } static int stonith_api_monitor(stonith_t * stonith, int call_options, const char *id, int timeout) { return stonith_api_call(stonith, call_options, id, "monitor", NULL, timeout, NULL); } static int stonith_api_status(stonith_t * stonith, int call_options, const char *id, const char *port, int timeout) { return stonith_api_call(stonith, call_options, id, "status", port, timeout, NULL); } static int stonith_api_fence_with_delay(stonith_t * stonith, int call_options, const char *node, const char *action, int timeout, int tolerance, int delay) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, __func__); crm_xml_add(data, F_STONITH_TARGET, node); crm_xml_add(data, F_STONITH_ACTION, action); crm_xml_add_int(data, F_STONITH_TIMEOUT, timeout); crm_xml_add_int(data, F_STONITH_TOLERANCE, tolerance); crm_xml_add_int(data, F_STONITH_DELAY, delay); rc = stonith_send_command(stonith, STONITH_OP_FENCE, data, NULL, call_options, timeout); free_xml(data); return rc; } static int stonith_api_fence(stonith_t * stonith, int call_options, const char *node, const char *action, int timeout, int tolerance) { return stonith_api_fence_with_delay(stonith, call_options, node, action, timeout, tolerance, 0); } static int stonith_api_confirm(stonith_t * stonith, int call_options, const char *target) { stonith__set_call_options(call_options, target, st_opt_manual_ack); return stonith_api_fence(stonith, call_options, target, "off", 0, 0); } static int stonith_api_history(stonith_t * stonith, int call_options, const char *node, stonith_history_t ** history, int timeout) { int rc = 0; xmlNode *data = NULL; xmlNode *output = NULL; stonith_history_t *last = NULL; *history = NULL; if (node) { data = create_xml_node(NULL, __func__); crm_xml_add(data, F_STONITH_TARGET, node); } stonith__set_call_options(call_options, node, st_opt_sync_call); rc = stonith_send_command(stonith, STONITH_OP_FENCE_HISTORY, data, &output, call_options, timeout); free_xml(data); if (rc == 0) { xmlNode *op = NULL; xmlNode *reply = get_xpath_object("//" F_STONITH_HISTORY_LIST, output, LOG_NEVER); for (op = pcmk__xml_first_child(reply); op != NULL; op = pcmk__xml_next(op)) { stonith_history_t *kvp; long long completed; long long completed_nsec = 0L; kvp = calloc(1, sizeof(stonith_history_t)); kvp->target = crm_element_value_copy(op, F_STONITH_TARGET); kvp->action = crm_element_value_copy(op, F_STONITH_ACTION); kvp->origin = crm_element_value_copy(op, F_STONITH_ORIGIN); kvp->delegate = crm_element_value_copy(op, F_STONITH_DELEGATE); kvp->client = crm_element_value_copy(op, F_STONITH_CLIENTNAME); crm_element_value_ll(op, F_STONITH_DATE, &completed); kvp->completed = (time_t) completed; crm_element_value_ll(op, F_STONITH_DATE_NSEC, &completed_nsec); kvp->completed_nsec = completed_nsec; crm_element_value_int(op, F_STONITH_STATE, &kvp->state); if (last) { last->next = kvp; } else { *history = kvp; } last = kvp; } } free_xml(output); return rc; } void stonith_history_free(stonith_history_t *history) { stonith_history_t *hp, *hp_old; for (hp = history; hp; hp_old = hp, hp = hp->next, free(hp_old)) { free(hp->target); free(hp->action); free(hp->origin); free(hp->delegate); free(hp->client); } } static gint stonithlib_GCompareFunc(gconstpointer a, gconstpointer b) { int rc = 0; const stonith_notify_client_t *a_client = a; const stonith_notify_client_t *b_client = b; if (a_client->delete || b_client->delete) { /* make entries marked for deletion not findable */ return -1; } CRM_CHECK(a_client->event != NULL && b_client->event != NULL, return 0); rc = strcmp(a_client->event, b_client->event); if (rc == 0) { if (a_client->notify == NULL || b_client->notify == NULL) { return 0; } else if (a_client->notify == b_client->notify) { return 0; } else if (((long)a_client->notify) < ((long)b_client->notify)) { crm_err("callbacks for %s are not equal: %p vs. %p", a_client->event, a_client->notify, b_client->notify); return -1; } crm_err("callbacks for %s are not equal: %p vs. %p", a_client->event, a_client->notify, b_client->notify); return 1; } return rc; } xmlNode * stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, int call_options) { xmlNode *op_msg = create_xml_node(NULL, "stonith_command"); CRM_CHECK(op_msg != NULL, return NULL); CRM_CHECK(token != NULL, return NULL); crm_xml_add(op_msg, F_XML_TAGNAME, "stonith_command"); crm_xml_add(op_msg, F_TYPE, T_STONITH_NG); crm_xml_add(op_msg, F_STONITH_CALLBACK_TOKEN, token); crm_xml_add(op_msg, F_STONITH_OPERATION, op); crm_xml_add_int(op_msg, F_STONITH_CALLID, call_id); crm_trace("Sending call options: %.8lx, %d", (long)call_options, call_options); crm_xml_add_int(op_msg, F_STONITH_CALLOPTS, call_options); if (data != NULL) { add_message_xml(op_msg, F_STONITH_CALLDATA, data); } return op_msg; } static void stonith_destroy_op_callback(gpointer data) { stonith_callback_client_t *blob = data; if (blob->timer && blob->timer->ref > 0) { g_source_remove(blob->timer->ref); } free(blob->timer); free(blob); } static int stonith_api_signoff(stonith_t * stonith) { stonith_private_t *native = stonith->st_private; crm_debug("Disconnecting from the fencer"); if (native->source != NULL) { /* Attached to mainloop */ mainloop_del_ipc_client(native->source); native->source = NULL; native->ipc = NULL; } else if (native->ipc) { /* Not attached to mainloop */ crm_ipc_t *ipc = native->ipc; native->ipc = NULL; crm_ipc_close(ipc); crm_ipc_destroy(ipc); } free(native->token); native->token = NULL; stonith->state = stonith_disconnected; return pcmk_ok; } static int stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks) { stonith_private_t *private = stonith->st_private; if (all_callbacks) { private->op_callback = NULL; g_hash_table_destroy(private->stonith_op_callback_table); private->stonith_op_callback_table = pcmk__intkey_table(stonith_destroy_op_callback); } else if (call_id == 0) { private->op_callback = NULL; } else { pcmk__intkey_table_remove(private->stonith_op_callback_table, call_id); } return pcmk_ok; } /*! * \internal * \brief Invoke a (single) specified fence action callback * * \param[in] st Fencer API connection * \param[in] call_id If positive, call ID of completed fence action, otherwise * legacy return code for early action failure * \param[in] result Full result for action * \param[in] userdata User data to pass to callback * \param[in] callback Fence action callback to invoke */ static void invoke_fence_action_callback(stonith_t *st, int call_id, pcmk__action_result_t *result, void *userdata, void (*callback) (stonith_t *st, stonith_callback_data_t *data)) { stonith_callback_data_t data = { 0, }; data.call_id = call_id; data.rc = pcmk_rc2legacy(stonith__result2rc(result)); data.userdata = userdata; data.opaque = (void *) result; callback(st, &data); } /*! * \internal * \brief Invoke any callbacks registered for a specified fence action result * * Given a fence action result from the fencer, invoke any callback registered * for that action, as well as any global callback registered. * * \param[in] st Fencer API connection * \param[in] msg If non-NULL, fencer reply * \param[in] call_id If \p msg is NULL, call ID of action that timed out */ static void invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) { stonith_private_t *private = NULL; stonith_callback_client_t *cb_info = NULL; pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; CRM_CHECK(stonith != NULL, return); CRM_CHECK(stonith->st_private != NULL, return); private = stonith->st_private; if (msg == NULL) { // Fencer didn't reply in time pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, "Timeout waiting for reply from fencer"); CRM_LOG_ASSERT(call_id > 0); } else { // We have the fencer reply if ((crm_element_value_int(msg, F_STONITH_CALLID, &call_id) != 0) || (call_id <= 0)) { crm_log_xml_warn(msg, "Bad fencer reply"); } stonith__xe_get_result(msg, &result); } if (call_id > 0) { cb_info = pcmk__intkey_table_lookup(private->stonith_op_callback_table, call_id); } if ((cb_info != NULL) && (cb_info->callback != NULL) && (pcmk__result_ok(&result) || !(cb_info->only_success))) { crm_trace("Invoking callback %s for call %d", crm_str(cb_info->id), call_id); invoke_fence_action_callback(stonith, call_id, &result, cb_info->user_data, cb_info->callback); } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) { crm_warn("Fencing action without registered callback failed: %d (%s%s%s)", result.exit_status, pcmk_exec_status_str(result.execution_status), ((result.exit_reason == NULL)? "" : ": "), ((result.exit_reason == NULL)? "" : result.exit_reason)); crm_log_xml_debug(msg, "Failed fence update"); } if (private->op_callback != NULL) { crm_trace("Invoking global callback for call %d", call_id); invoke_fence_action_callback(stonith, call_id, &result, NULL, private->op_callback); } if (cb_info != NULL) { stonith_api_del_callback(stonith, call_id, FALSE); } pcmk__reset_result(&result); } static gboolean stonith_async_timeout_handler(gpointer data) { struct timer_rec_s *timer = data; crm_err("Async call %d timed out after %dms", timer->call_id, timer->timeout); invoke_registered_callbacks(timer->stonith, NULL, timer->call_id); /* Always return TRUE, never remove the handler * We do that in stonith_del_callback() */ return TRUE; } static void set_callback_timeout(stonith_callback_client_t * callback, stonith_t * stonith, int call_id, int timeout) { struct timer_rec_s *async_timer = callback->timer; if (timeout <= 0) { return; } if (!async_timer) { async_timer = calloc(1, sizeof(struct timer_rec_s)); callback->timer = async_timer; } async_timer->stonith = stonith; async_timer->call_id = call_id; /* Allow a fair bit of grace to allow the server to tell us of a timeout * This is only a fallback */ async_timer->timeout = (timeout + 60) * 1000; if (async_timer->ref) { g_source_remove(async_timer->ref); } async_timer->ref = g_timeout_add(async_timer->timeout, stonith_async_timeout_handler, async_timer); } static void update_callback_timeout(int call_id, int timeout, stonith_t * st) { stonith_callback_client_t *callback = NULL; stonith_private_t *private = st->st_private; callback = pcmk__intkey_table_lookup(private->stonith_op_callback_table, call_id); if (!callback || !callback->allow_timeout_updates) { return; } set_callback_timeout(callback, st, call_id, timeout); } static int stonith_dispatch_internal(const char *buffer, ssize_t length, gpointer userdata) { const char *type = NULL; struct notify_blob_s blob; stonith_t *st = userdata; stonith_private_t *private = NULL; CRM_ASSERT(st != NULL); private = st->st_private; blob.stonith = st; blob.xml = string2xml(buffer); if (blob.xml == NULL) { crm_warn("Received malformed message from fencer: %s", buffer); return 0; } /* do callbacks */ type = crm_element_value(blob.xml, F_TYPE); crm_trace("Activating %s callbacks...", type); if (pcmk__str_eq(type, T_STONITH_NG, pcmk__str_casei)) { invoke_registered_callbacks(st, blob.xml, 0); } else if (pcmk__str_eq(type, T_STONITH_NOTIFY, pcmk__str_casei)) { foreach_notify_entry(private, stonith_send_notification, &blob); } else if (pcmk__str_eq(type, T_STONITH_TIMEOUT_VALUE, pcmk__str_casei)) { int call_id = 0; int timeout = 0; crm_element_value_int(blob.xml, F_STONITH_TIMEOUT, &timeout); crm_element_value_int(blob.xml, F_STONITH_CALLID, &call_id); update_callback_timeout(call_id, timeout, st); } else { crm_err("Unknown message type: %s", type); crm_log_xml_warn(blob.xml, "BadReply"); } free_xml(blob.xml); return 1; } static int stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) { int rc = pcmk_ok; stonith_private_t *native = NULL; const char *display_name = name? name : "client"; struct ipc_client_callbacks st_callbacks = { .dispatch = stonith_dispatch_internal, .destroy = stonith_connection_destroy }; CRM_CHECK(stonith != NULL, return -EINVAL); native = stonith->st_private; CRM_ASSERT(native != NULL); crm_debug("Attempting fencer connection by %s with%s mainloop", display_name, (stonith_fd? "out" : "")); stonith->state = stonith_connected_command; if (stonith_fd) { /* No mainloop */ native->ipc = crm_ipc_new("stonith-ng", 0); if (native->ipc && crm_ipc_connect(native->ipc)) { *stonith_fd = crm_ipc_get_fd(native->ipc); } else if (native->ipc) { crm_ipc_close(native->ipc); crm_ipc_destroy(native->ipc); native->ipc = NULL; } } else { /* With mainloop */ native->source = mainloop_add_ipc_client("stonith-ng", G_PRIORITY_MEDIUM, 0, stonith, &st_callbacks); native->ipc = mainloop_get_ipc_client(native->source); } if (native->ipc == NULL) { rc = -ENOTCONN; } else { xmlNode *reply = NULL; xmlNode *hello = create_xml_node(NULL, "stonith_command"); crm_xml_add(hello, F_TYPE, T_STONITH_NG); crm_xml_add(hello, F_STONITH_OPERATION, CRM_OP_REGISTER); crm_xml_add(hello, F_STONITH_CLIENTNAME, name); rc = crm_ipc_send(native->ipc, hello, crm_ipc_client_response, -1, &reply); if (rc < 0) { crm_debug("Couldn't register with the fencer: %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); rc = -ECOMM; } else if (reply == NULL) { crm_debug("Couldn't register with the fencer: no reply"); rc = -EPROTO; } else { const char *msg_type = crm_element_value(reply, F_STONITH_OPERATION); native->token = crm_element_value_copy(reply, F_STONITH_CLIENTID); if (!pcmk__str_eq(msg_type, CRM_OP_REGISTER, pcmk__str_casei)) { crm_debug("Couldn't register with the fencer: invalid reply type '%s'", (msg_type? msg_type : "(missing)")); crm_log_xml_debug(reply, "Invalid fencer reply"); rc = -EPROTO; } else if (native->token == NULL) { crm_debug("Couldn't register with the fencer: no token in reply"); crm_log_xml_debug(reply, "Invalid fencer reply"); rc = -EPROTO; } else { #if HAVE_MSGFROMIPC_TIMEOUT stonith->call_timeout = PCMK__IPC_TIMEOUT; #endif crm_debug("Connection to fencer by %s succeeded (registration token: %s)", display_name, native->token); rc = pcmk_ok; } } free_xml(reply); free_xml(hello); } if (rc != pcmk_ok) { crm_debug("Connection attempt to fencer by %s failed: %s " CRM_XS " rc=%d", display_name, pcmk_strerror(rc), rc); stonith->cmds->disconnect(stonith); } return rc; } static int stonith_set_notification(stonith_t * stonith, const char *callback, int enabled) { int rc = pcmk_ok; xmlNode *notify_msg = create_xml_node(NULL, __func__); stonith_private_t *native = stonith->st_private; if (stonith->state != stonith_disconnected) { crm_xml_add(notify_msg, F_STONITH_OPERATION, T_STONITH_NOTIFY); if (enabled) { crm_xml_add(notify_msg, F_STONITH_NOTIFY_ACTIVATE, callback); } else { crm_xml_add(notify_msg, F_STONITH_NOTIFY_DEACTIVATE, callback); } rc = crm_ipc_send(native->ipc, notify_msg, crm_ipc_client_response, -1, NULL); if (rc < 0) { crm_perror(LOG_DEBUG, "Couldn't register for fencing notifications: %d", rc); rc = -ECOMM; } else { rc = pcmk_ok; } } free_xml(notify_msg); return rc; } static int stonith_api_add_notification(stonith_t * stonith, const char *event, void (*callback) (stonith_t * stonith, stonith_event_t * e)) { GList *list_item = NULL; stonith_notify_client_t *new_client = NULL; stonith_private_t *private = NULL; private = stonith->st_private; crm_trace("Adding callback for %s events (%d)", event, g_list_length(private->notify_list)); new_client = calloc(1, sizeof(stonith_notify_client_t)); new_client->event = event; new_client->notify = callback; list_item = g_list_find_custom(private->notify_list, new_client, stonithlib_GCompareFunc); if (list_item != NULL) { crm_warn("Callback already present"); free(new_client); return -ENOTUNIQ; } else { private->notify_list = g_list_append(private->notify_list, new_client); stonith_set_notification(stonith, event, 1); crm_trace("Callback added (%d)", g_list_length(private->notify_list)); } return pcmk_ok; } static int stonith_api_del_notification(stonith_t * stonith, const char *event) { GList *list_item = NULL; stonith_notify_client_t *new_client = NULL; stonith_private_t *private = NULL; crm_debug("Removing callback for %s events", event); private = stonith->st_private; new_client = calloc(1, sizeof(stonith_notify_client_t)); new_client->event = event; new_client->notify = NULL; list_item = g_list_find_custom(private->notify_list, new_client, stonithlib_GCompareFunc); stonith_set_notification(stonith, event, 0); if (list_item != NULL) { stonith_notify_client_t *list_client = list_item->data; if (private->notify_refcnt) { list_client->delete = TRUE; private->notify_deletes = TRUE; } else { private->notify_list = g_list_remove(private->notify_list, list_client); free(list_client); } crm_trace("Removed callback"); } else { crm_trace("Callback not present"); } free(new_client); return pcmk_ok; } static int stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int options, void *user_data, const char *callback_name, void (*callback) (stonith_t * st, stonith_callback_data_t * data)) { stonith_callback_client_t *blob = NULL; stonith_private_t *private = NULL; CRM_CHECK(stonith != NULL, return -EINVAL); CRM_CHECK(stonith->st_private != NULL, return -EINVAL); private = stonith->st_private; if (call_id == 0) { // Add global callback private->op_callback = callback; } else if (call_id < 0) { // Call failed immediately, so call callback now if (!(options & st_opt_report_only_success)) { pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id)); pcmk__set_result(&result, CRM_EX_ERROR, stonith__legacy2status(call_id), NULL); invoke_fence_action_callback(stonith, call_id, &result, user_data, callback); } else { crm_warn("Fencer call failed: %s", pcmk_strerror(call_id)); } return FALSE; } blob = calloc(1, sizeof(stonith_callback_client_t)); blob->id = callback_name; blob->only_success = (options & st_opt_report_only_success) ? TRUE : FALSE; blob->user_data = user_data; blob->callback = callback; blob->allow_timeout_updates = (options & st_opt_timeout_updates) ? TRUE : FALSE; if (timeout > 0) { set_callback_timeout(blob, stonith, call_id, timeout); } pcmk__intkey_table_insert(private->stonith_op_callback_table, call_id, blob); crm_trace("Added callback to %s for call %d", callback_name, call_id); return TRUE; } static void stonith_dump_pending_op(gpointer key, gpointer value, gpointer user_data) { int call = GPOINTER_TO_INT(key); stonith_callback_client_t *blob = value; crm_debug("Call %d (%s): pending", call, crm_str(blob->id)); } void stonith_dump_pending_callbacks(stonith_t * stonith) { stonith_private_t *private = stonith->st_private; if (private->stonith_op_callback_table == NULL) { return; } return g_hash_table_foreach(private->stonith_op_callback_table, stonith_dump_pending_op, NULL); } +/*! + * \internal + * \brief Get the data section of a fencer notification + * + * \param[in] msg Notification XML + * \param[in] ntype Notification type + */ +static xmlNode * +get_event_data_xml(xmlNode *msg, const char *ntype) +{ + char *data_addr = crm_strdup_printf("//%s", ntype); + xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); + + free(data_addr); + return data; +} + /* */ static stonith_event_t * -xml_to_event(xmlNode * msg) +xml_to_event(xmlNode *msg, pcmk__action_result_t *result) { stonith_event_t *event = calloc(1, sizeof(stonith_event_t)); const char *ntype = crm_element_value(msg, F_SUBTYPE); - char *data_addr = crm_strdup_printf("//%s", ntype); - xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); + + CRM_ASSERT((event != NULL) && (result != NULL)); crm_log_xml_trace(msg, "stonith_notify"); - crm_element_value_int(msg, F_STONITH_RC, &(event->result)); + // All notification types have the operation result + event->opaque = result; + stonith__xe_get_result(msg, result); + // @COMPAT The API originally provided the result as a legacy return code + event->result = pcmk_rc2legacy(stonith__result2rc(result)); + + // Fence notifications have additional information if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { - event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); + xmlNode *data = get_event_data_xml(msg, ntype); - if (data) { + if (data == NULL) { + crm_err("No data for %s event", ntype); + crm_log_xml_notice(msg, "BadEvent"); + } else { event->origin = crm_element_value_copy(data, F_STONITH_ORIGIN); event->action = crm_element_value_copy(data, F_STONITH_ACTION); event->target = crm_element_value_copy(data, F_STONITH_TARGET); event->executioner = crm_element_value_copy(data, F_STONITH_DELEGATE); event->id = crm_element_value_copy(data, F_STONITH_REMOTE_OP_ID); event->client_origin = crm_element_value_copy(data, F_STONITH_CLIENTNAME); event->device = crm_element_value_copy(data, F_STONITH_DEVICE); - - } else { - crm_err("No data for %s event", ntype); - crm_log_xml_notice(msg, "BadEvent"); } + event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); } - free(data_addr); return event; } static void event_free(stonith_event_t * event) { free(event->id); free(event->type); free(event->message); free(event->operation); free(event->origin); free(event->action); free(event->target); free(event->executioner); free(event->device); free(event->client_origin); + pcmk__reset_result((pcmk__action_result_t *) (event->opaque)); free(event); } static void stonith_send_notification(gpointer data, gpointer user_data) { struct notify_blob_s *blob = user_data; stonith_notify_client_t *entry = data; stonith_event_t *st_event = NULL; const char *event = NULL; + pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; if (blob->xml == NULL) { crm_warn("Skipping callback - NULL message"); return; } event = crm_element_value(blob->xml, F_SUBTYPE); if (entry == NULL) { crm_warn("Skipping callback - NULL callback client"); return; } else if (entry->delete) { crm_trace("Skipping callback - marked for deletion"); return; } else if (entry->notify == NULL) { crm_warn("Skipping callback - NULL callback"); return; } else if (!pcmk__str_eq(entry->event, event, pcmk__str_casei)) { crm_trace("Skipping callback - event mismatch %p/%s vs. %s", entry, entry->event, event); return; } - st_event = xml_to_event(blob->xml); + st_event = xml_to_event(blob->xml, &result); crm_trace("Invoking callback for %p/%s event...", entry, event); entry->notify(blob->stonith, st_event); crm_trace("Callback invoked..."); event_free(st_event); } /*! * \internal * \brief Create and send an API request * * \param[in] stonith Stonith connection * \param[in] op API operation to request * \param[in] data Data to attach to request * \param[out] output_data If not NULL, will be set to reply if synchronous * \param[in] call_options Bitmask of stonith_call_options to use * \param[in] timeout Error if not completed within this many seconds * * \return pcmk_ok (for synchronous requests) or positive call ID * (for asynchronous requests) on success, -errno otherwise */ static int stonith_send_command(stonith_t * stonith, const char *op, xmlNode * data, xmlNode ** output_data, int call_options, int timeout) { int rc = 0; int reply_id = -1; xmlNode *op_msg = NULL; xmlNode *op_reply = NULL; stonith_private_t *native = NULL; CRM_ASSERT(stonith && stonith->st_private && op); native = stonith->st_private; if (output_data != NULL) { *output_data = NULL; } if ((stonith->state == stonith_disconnected) || (native->token == NULL)) { return -ENOTCONN; } /* Increment the call ID, which must be positive to avoid conflicting with * error codes. This shouldn't be a problem unless the client mucked with * it or the counter wrapped around. */ stonith->call_id++; if (stonith->call_id < 1) { stonith->call_id = 1; } op_msg = stonith_create_op(stonith->call_id, native->token, op, data, call_options); if (op_msg == NULL) { return -EINVAL; } crm_xml_add_int(op_msg, F_STONITH_TIMEOUT, timeout); crm_trace("Sending %s message to fencer with timeout %ds", op, timeout); if (data) { const char *delay_s = crm_element_value(data, F_STONITH_DELAY); if (delay_s) { crm_xml_add(op_msg, F_STONITH_DELAY, delay_s); } } { enum crm_ipc_flags ipc_flags = crm_ipc_flags_none; if (call_options & st_opt_sync_call) { pcmk__set_ipc_flags(ipc_flags, "stonith command", crm_ipc_client_response); } rc = crm_ipc_send(native->ipc, op_msg, ipc_flags, 1000 * (timeout + 60), &op_reply); } free_xml(op_msg); if (rc < 0) { crm_perror(LOG_ERR, "Couldn't perform %s operation (timeout=%ds): %d", op, timeout, rc); rc = -ECOMM; goto done; } crm_log_xml_trace(op_reply, "Reply"); if (!(call_options & st_opt_sync_call)) { crm_trace("Async call %d, returning", stonith->call_id); free_xml(op_reply); return stonith->call_id; } rc = pcmk_ok; crm_element_value_int(op_reply, F_STONITH_CALLID, &reply_id); if (reply_id == stonith->call_id) { + pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; + crm_trace("Synchronous reply %d received", reply_id); - if (crm_element_value_int(op_reply, F_STONITH_RC, &rc) != 0) { - rc = -ENOMSG; - } + stonith__xe_get_result(op_reply, &result); + rc = pcmk_rc2legacy(stonith__result2rc(&result)); + pcmk__reset_result(&result); if ((call_options & st_opt_discard_reply) || output_data == NULL) { crm_trace("Discarding reply"); } else { *output_data = op_reply; op_reply = NULL; /* Prevent subsequent free */ } } else if (reply_id <= 0) { crm_err("Received bad reply: No id set"); crm_log_xml_err(op_reply, "Bad reply"); free_xml(op_reply); rc = -ENOMSG; } else { crm_err("Received bad reply: %d (wanted %d)", reply_id, stonith->call_id); crm_log_xml_err(op_reply, "Old reply"); free_xml(op_reply); rc = -ENOMSG; } done: if (crm_ipc_connected(native->ipc) == FALSE) { crm_err("Fencer disconnected"); free(native->token); native->token = NULL; stonith->state = stonith_disconnected; } free_xml(op_reply); return rc; } /* Not used with mainloop */ bool stonith_dispatch(stonith_t * st) { gboolean stay_connected = TRUE; stonith_private_t *private = NULL; CRM_ASSERT(st != NULL); private = st->st_private; while (crm_ipc_ready(private->ipc)) { if (crm_ipc_read(private->ipc) > 0) { const char *msg = crm_ipc_buffer(private->ipc); stonith_dispatch_internal(msg, strlen(msg), st); } if (crm_ipc_connected(private->ipc) == FALSE) { crm_err("Connection closed"); stay_connected = FALSE; } } return stay_connected; } static int stonith_api_free(stonith_t * stonith) { int rc = pcmk_ok; crm_trace("Destroying %p", stonith); if (stonith->state != stonith_disconnected) { crm_trace("Disconnecting %p first", stonith); rc = stonith->cmds->disconnect(stonith); } if (stonith->state == stonith_disconnected) { stonith_private_t *private = stonith->st_private; crm_trace("Removing %d callbacks", g_hash_table_size(private->stonith_op_callback_table)); g_hash_table_destroy(private->stonith_op_callback_table); crm_trace("Destroying %d notification clients", g_list_length(private->notify_list)); g_list_free_full(private->notify_list, free); free(stonith->st_private); free(stonith->cmds); free(stonith); } else { crm_err("Not free'ing active connection: %s (%d)", pcmk_strerror(rc), rc); } return rc; } void stonith_api_delete(stonith_t * stonith) { crm_trace("Destroying %p", stonith); if(stonith) { stonith->cmds->free(stonith); } } static int stonith_api_validate(stonith_t *st, int call_options, const char *rsc_id, const char *namespace_s, const char *agent, stonith_key_value_t *params, int timeout, char **output, char **error_output) { /* Validation should be done directly via the agent, so we can get it from * stonith_admin when the cluster is not running, which is important for * higher-level tools. */ int rc = pcmk_ok; /* Use a dummy node name in case the agent requires a target. We assume the * actual target doesn't matter for validation purposes (if in practice, * that is incorrect, we will need to allow the caller to pass the target). */ const char *target = "node1"; const char *host_arg = NULL; GHashTable *params_table = pcmk__strkey_table(free, free); // Convert parameter list to a hash table for (; params; params = params->next) { if (pcmk__str_eq(params->key, PCMK_STONITH_HOST_ARGUMENT, pcmk__str_casei)) { host_arg = params->value; } if (!pcmk_stonith_param(params->key)) { g_hash_table_insert(params_table, strdup(params->key), strdup(params->value)); } } #if SUPPORT_CIBSECRETS rc = pcmk__substitute_secrets(rsc_id, params_table); if (rc != pcmk_rc_ok) { crm_warn("Could not replace secret parameters for validation of %s: %s", agent, pcmk_rc_str(rc)); // rc is standard return value, don't return it in this function } #endif if (output) { *output = NULL; } if (error_output) { *error_output = NULL; } switch (stonith_get_namespace(agent, namespace_s)) { case st_namespace_rhcs: rc = stonith__rhcs_validate(st, call_options, target, agent, params_table, host_arg, timeout, output, error_output); break; #if HAVE_STONITH_STONITH_H case st_namespace_lha: rc = stonith__lha_validate(st, call_options, target, agent, params_table, timeout, output, error_output); break; #endif default: rc = -EINVAL; errno = EINVAL; crm_perror(LOG_ERR, "Agent %s not found or does not support validation", agent); break; } g_hash_table_destroy(params_table); return rc; } stonith_t * stonith_api_new(void) { stonith_t *new_stonith = NULL; stonith_private_t *private = NULL; new_stonith = calloc(1, sizeof(stonith_t)); if (new_stonith == NULL) { return NULL; } private = calloc(1, sizeof(stonith_private_t)); if (private == NULL) { free(new_stonith); return NULL; } new_stonith->st_private = private; private->stonith_op_callback_table = pcmk__intkey_table(stonith_destroy_op_callback); private->notify_list = NULL; private->notify_refcnt = 0; private->notify_deletes = FALSE; new_stonith->call_id = 1; new_stonith->state = stonith_disconnected; new_stonith->cmds = calloc(1, sizeof(stonith_api_operations_t)); if (new_stonith->cmds == NULL) { free(new_stonith->st_private); free(new_stonith); return NULL; } /* *INDENT-OFF* */ new_stonith->cmds->free = stonith_api_free; new_stonith->cmds->connect = stonith_api_signon; new_stonith->cmds->disconnect = stonith_api_signoff; new_stonith->cmds->list = stonith_api_list; new_stonith->cmds->monitor = stonith_api_monitor; new_stonith->cmds->status = stonith_api_status; new_stonith->cmds->fence = stonith_api_fence; new_stonith->cmds->fence_with_delay = stonith_api_fence_with_delay; new_stonith->cmds->confirm = stonith_api_confirm; new_stonith->cmds->history = stonith_api_history; new_stonith->cmds->list_agents = stonith_api_device_list; new_stonith->cmds->metadata = stonith_api_device_metadata; new_stonith->cmds->query = stonith_api_query; new_stonith->cmds->remove_device = stonith_api_remove_device; new_stonith->cmds->register_device = stonith_api_register_device; new_stonith->cmds->remove_level = stonith_api_remove_level; new_stonith->cmds->remove_level_full = stonith_api_remove_level_full; new_stonith->cmds->register_level = stonith_api_register_level; new_stonith->cmds->register_level_full = stonith_api_register_level_full; new_stonith->cmds->remove_callback = stonith_api_del_callback; new_stonith->cmds->register_callback = stonith_api_add_callback; new_stonith->cmds->remove_notification = stonith_api_del_notification; new_stonith->cmds->register_notification = stonith_api_add_notification; new_stonith->cmds->validate = stonith_api_validate; /* *INDENT-ON* */ return new_stonith; } /*! * \brief Make a blocking connection attempt to the fencer * * \param[in,out] st Fencer API object * \param[in] name Client name to use with fencer * \param[in] max_attempts Return error if this many attempts fail * * \return pcmk_ok on success, result of last attempt otherwise */ int stonith_api_connect_retry(stonith_t *st, const char *name, int max_attempts) { int rc = -EINVAL; // if max_attempts is not positive for (int attempt = 1; attempt <= max_attempts; attempt++) { rc = st->cmds->connect(st, name, NULL); if (rc == pcmk_ok) { return pcmk_ok; } else if (attempt < max_attempts) { crm_notice("Fencer connection attempt %d of %d failed (retrying in 2s): %s " CRM_XS " rc=%d", attempt, max_attempts, pcmk_strerror(rc), rc); sleep(2); } } crm_notice("Could not connect to fencer: %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); return rc; } stonith_key_value_t * stonith_key_value_add(stonith_key_value_t * head, const char *key, const char *value) { stonith_key_value_t *p, *end; p = calloc(1, sizeof(stonith_key_value_t)); if (key) { p->key = strdup(key); } if (value) { p->value = strdup(value); } end = head; while (end && end->next) { end = end->next; } if (end) { end->next = p; } else { head = p; } return head; } void stonith_key_value_freeall(stonith_key_value_t * head, int keys, int values) { stonith_key_value_t *p; while (head) { p = head->next; if (keys) { free(head->key); } if (values) { free(head->value); } free(head); head = p; } } #define api_log_open() openlog("stonith-api", LOG_CONS | LOG_NDELAY | LOG_PID, LOG_DAEMON) #define api_log(level, fmt, args...) syslog(level, "%s: "fmt, __func__, args) int stonith_api_kick(uint32_t nodeid, const char *uname, int timeout, bool off) { int rc = pcmk_ok; stonith_t *st = stonith_api_new(); const char *action = off? "off" : "reboot"; api_log_open(); if (st == NULL) { api_log(LOG_ERR, "API initialization failed, could not kick (%s) node %u/%s", action, nodeid, uname); return -EPROTO; } rc = st->cmds->connect(st, "stonith-api", NULL); if (rc != pcmk_ok) { api_log(LOG_ERR, "Connection failed, could not kick (%s) node %u/%s : %s (%d)", action, nodeid, uname, pcmk_strerror(rc), rc); } else { char *name = (uname == NULL)? pcmk__itoa(nodeid) : strdup(uname); int opts = 0; stonith__set_call_options(opts, name, st_opt_sync_call|st_opt_allow_suicide); if ((uname == NULL) && (nodeid > 0)) { stonith__set_call_options(opts, name, st_opt_cs_nodeid); } rc = st->cmds->fence(st, opts, name, action, timeout, 0); free(name); if (rc != pcmk_ok) { api_log(LOG_ERR, "Could not kick (%s) node %u/%s : %s (%d)", action, nodeid, uname, pcmk_strerror(rc), rc); } else { api_log(LOG_NOTICE, "Node %u/%s kicked: %s", nodeid, uname, action); } } stonith_api_delete(st); return rc; } time_t stonith_api_time(uint32_t nodeid, const char *uname, bool in_progress) { int rc = pcmk_ok; time_t when = 0; stonith_t *st = stonith_api_new(); stonith_history_t *history = NULL, *hp = NULL; if (st == NULL) { api_log(LOG_ERR, "Could not retrieve fence history for %u/%s: " "API initialization failed", nodeid, uname); return when; } rc = st->cmds->connect(st, "stonith-api", NULL); if (rc != pcmk_ok) { api_log(LOG_NOTICE, "Connection failed: %s (%d)", pcmk_strerror(rc), rc); } else { int entries = 0; int progress = 0; int completed = 0; int opts = 0; char *name = (uname == NULL)? pcmk__itoa(nodeid) : strdup(uname); stonith__set_call_options(opts, name, st_opt_sync_call); if ((uname == NULL) && (nodeid > 0)) { stonith__set_call_options(opts, name, st_opt_cs_nodeid); } rc = st->cmds->history(st, opts, name, &history, 120); free(name); for (hp = history; hp; hp = hp->next) { entries++; if (in_progress) { progress++; if (hp->state != st_done && hp->state != st_failed) { when = time(NULL); } } else if (hp->state == st_done) { completed++; if (hp->completed > when) { when = hp->completed; } } } stonith_history_free(history); if(rc == pcmk_ok) { api_log(LOG_INFO, "Found %d entries for %u/%s: %d in progress, %d completed", entries, nodeid, uname, progress, completed); } else { api_log(LOG_ERR, "Could not retrieve fence history for %u/%s: %s (%d)", nodeid, uname, pcmk_strerror(rc), rc); } } stonith_api_delete(st); if(when) { api_log(LOG_INFO, "Node %u/%s last kicked at: %ld", nodeid, uname, (long int)when); } return when; } bool stonith_agent_exists(const char *agent, int timeout) { stonith_t *st = NULL; stonith_key_value_t *devices = NULL; stonith_key_value_t *dIter = NULL; bool rc = FALSE; if (agent == NULL) { return rc; } st = stonith_api_new(); if (st == NULL) { crm_err("Could not list fence agents: API memory allocation failed"); return FALSE; } st->cmds->list_agents(st, st_opt_sync_call, NULL, &devices, timeout == 0 ? 120 : timeout); for (dIter = devices; dIter != NULL; dIter = dIter->next) { if (pcmk__str_eq(dIter->value, agent, pcmk__str_none)) { rc = TRUE; break; } } stonith_key_value_freeall(devices, 1, 1); stonith_api_delete(st); return rc; } const char * stonith_action_str(const char *action) { if (action == NULL) { return "fencing"; } else if (!strcmp(action, "on")) { return "unfencing"; } else if (!strcmp(action, "off")) { return "turning off"; } else { return action; } } /*! * \internal * \brief Parse a target name from one line of a target list string * * \param[in] line One line of a target list string * \param[in] len String length of line * \param[in,out] output List to add newly allocated target name to */ static void parse_list_line(const char *line, int len, GList **output) { size_t i = 0; size_t entry_start = 0; /* Skip complaints about additional parameters device doesn't understand * * @TODO Document or eliminate the implied restriction of target names */ if (strstr(line, "invalid") || strstr(line, "variable")) { crm_debug("Skipping list output line: %s", line); return; } // Process line content, character by character for (i = 0; i <= len; i++) { if (isspace(line[i]) || (line[i] == ',') || (line[i] == ';') || (line[i] == '\0')) { // We've found a separator (i.e. the end of an entry) int rc = 0; char *entry = NULL; if (i == entry_start) { // Skip leading and sequential separators entry_start = i + 1; continue; } entry = calloc(i - entry_start + 1, sizeof(char)); CRM_ASSERT(entry != NULL); /* Read entry, stopping at first separator * * @TODO Document or eliminate these character restrictions */ rc = sscanf(line + entry_start, "%[a-zA-Z0-9_-.]", entry); if (rc != 1) { crm_warn("Could not parse list output entry: %s " CRM_XS " entry_start=%d position=%d", line + entry_start, entry_start, i); free(entry); } else if (pcmk__strcase_any_of(entry, "on", "off", NULL)) { /* Some agents print the target status in the list output, * though none are known now (the separate list-status command * is used for this, but it can also print "UNKNOWN"). To handle * this possibility, skip such entries. * * @TODO Document or eliminate the implied restriction of target * names. */ free(entry); } else { // We have a valid entry *output = g_list_append(*output, entry); } entry_start = i + 1; } } } /*! * \internal * \brief Parse a list of targets from a string * * \param[in] list_output Target list as a string * * \return List of target names * \note The target list string format is flexible, to allow for user-specified * lists such pcmk_host_list and the output of an agent's list action * (whether direct or via the API, which escapes newlines). There may be * multiple lines, separated by either a newline or an escaped newline * (backslash n). Each line may have one or more target names, separated * by any combination of whitespace, commas, and semi-colons. Lines * containing "invalid" or "variable" will be ignored entirely. Target * names "on" or "off" (case-insensitive) will be ignored. Target names * may contain only alphanumeric characters, underbars (_), dashes (-), * and dots (.) (if any other character occurs in the name, it and all * subsequent characters in the name will be ignored). * \note The caller is responsible for freeing the result with * g_list_free_full(result, free). */ GList * stonith__parse_targets(const char *target_spec) { GList *targets = NULL; if (target_spec != NULL) { size_t out_len = strlen(target_spec); size_t line_start = 0; // Starting index of line being processed for (size_t i = 0; i <= out_len; ++i) { if ((target_spec[i] == '\n') || (target_spec[i] == '\0') || ((target_spec[i] == '\\') && (target_spec[i + 1] == 'n'))) { // We've reached the end of one line of output int len = i - line_start; if (len > 0) { char *line = strndup(target_spec + line_start, len); line[len] = '\0'; // Because it might be a newline parse_list_line(line, len, &targets); free(line); } if (target_spec[i] == '\\') { ++i; // backslash-n takes up two positions } line_start = i + 1; } } } return targets; } /*! * \internal * \brief Determine if a later stonith event succeeded. * * \note Before calling this function, use stonith__sort_history() to sort the * top_history argument. */ gboolean stonith__later_succeeded(stonith_history_t *event, stonith_history_t *top_history) { gboolean ret = FALSE; for (stonith_history_t *prev_hp = top_history; prev_hp; prev_hp = prev_hp->next) { if (prev_hp == event) { break; } if ((prev_hp->state == st_done) && pcmk__str_eq(event->target, prev_hp->target, pcmk__str_casei) && pcmk__str_eq(event->action, prev_hp->action, pcmk__str_casei) && pcmk__str_eq(event->delegate, prev_hp->delegate, pcmk__str_casei) && ((event->completed < prev_hp->completed) || ((event->completed == prev_hp->completed) && (event->completed_nsec < prev_hp->completed_nsec)))) { ret = TRUE; break; } } return ret; } /*! * \internal * \brief Sort the stonith-history * sort by competed most current on the top * pending actions lacking a completed-stamp are gathered at the top * * \param[in] history List of stonith actions * */ stonith_history_t * stonith__sort_history(stonith_history_t *history) { stonith_history_t *new = NULL, *pending = NULL, *hp, *np, *tmp; for (hp = history; hp; ) { tmp = hp->next; if ((hp->state == st_done) || (hp->state == st_failed)) { /* sort into new */ if ((!new) || (hp->completed > new->completed) || ((hp->completed == new->completed) && (hp->completed_nsec > new->completed_nsec))) { hp->next = new; new = hp; } else { np = new; do { if ((!np->next) || (hp->completed > np->next->completed) || ((hp->completed == np->next->completed) && (hp->completed_nsec > np->next->completed_nsec))) { hp->next = np->next; np->next = hp; break; } np = np->next; } while (1); } } else { /* put into pending */ hp->next = pending; pending = hp; } hp = tmp; } /* pending actions don't have a completed-stamp so make them go front */ if (pending) { stonith_history_t *last_pending = pending; while (last_pending->next) { last_pending = last_pending->next; } last_pending->next = new; new = pending; } return new; } /*! * \brief Return string equivalent of an operation state value * * \param[in] state Fencing operation state value * * \return Human-friendly string equivalent of state */ const char * stonith_op_state_str(enum op_state state) { switch (state) { case st_query: return "querying"; case st_exec: return "executing"; case st_done: return "completed"; case st_duplicate: return "duplicate"; case st_failed: return "failed"; } return "unknown"; } stonith_history_t * stonith__first_matching_event(stonith_history_t *history, bool (*matching_fn)(stonith_history_t *, void *), void *user_data) { for (stonith_history_t *hp = history; hp; hp = hp->next) { if (matching_fn(hp, user_data)) { return hp; } } return NULL; } bool stonith__event_state_pending(stonith_history_t *history, void *user_data) { return history->state != st_failed && history->state != st_done; } bool stonith__event_state_eq(stonith_history_t *history, void *user_data) { return history->state == GPOINTER_TO_INT(user_data); } bool stonith__event_state_neq(stonith_history_t *history, void *user_data) { return history->state != GPOINTER_TO_INT(user_data); } void stonith__device_parameter_flags(uint32_t *device_flags, const char *device_name, xmlNode *metadata) { xmlXPathObjectPtr xpath = NULL; int max = 0; int lpc = 0; CRM_CHECK((device_flags != NULL) && (metadata != NULL), return); xpath = xpath_search(metadata, "//parameter"); max = numXpathResults(xpath); if (max <= 0) { freeXpathObject(xpath); return; } for (lpc = 0; lpc < max; lpc++) { const char *parameter = NULL; xmlNode *match = getXpathResult(xpath, lpc); CRM_LOG_ASSERT(match != NULL); if (match == NULL) { continue; } parameter = crm_element_value(match, "name"); if (pcmk__str_eq(parameter, "plug", pcmk__str_casei)) { stonith__set_device_flags(*device_flags, device_name, st_device_supports_parameter_plug); } else if (pcmk__str_eq(parameter, "port", pcmk__str_casei)) { stonith__set_device_flags(*device_flags, device_name, st_device_supports_parameter_port); } } freeXpathObject(xpath); } /*! * \internal * \brief Return the exit status from an async action callback * * \param[in] data Callback data * * \return Exit status from callback data */ int stonith__exit_status(stonith_callback_data_t *data) { if ((data == NULL) || (data->opaque == NULL)) { return CRM_EX_ERROR; } return ((pcmk__action_result_t *) data->opaque)->exit_status; } /*! * \internal * \brief Return the execution status from an async action callback * * \param[in] data Callback data * * \return Execution status from callback data */ int stonith__execution_status(stonith_callback_data_t *data) { if ((data == NULL) || (data->opaque == NULL)) { return PCMK_EXEC_UNKNOWN; } return ((pcmk__action_result_t *) data->opaque)->execution_status; } /*! * \internal * \brief Return the exit reason from an async action callback * * \param[in] data Callback data * * \return Exit reason from callback data */ const char * stonith__exit_reason(stonith_callback_data_t *data) { if ((data == NULL) || (data->opaque == NULL)) { return NULL; } return ((pcmk__action_result_t *) data->opaque)->exit_reason; } +/*! + * \internal + * \brief Return the exit status from an event notification + * + * \param[in] event Event + * + * \return Exit status from event + */ +int +stonith__event_exit_status(stonith_event_t *event) +{ + if ((event == NULL) || (event->opaque == NULL)) { + return CRM_EX_ERROR; + } + return ((pcmk__action_result_t *) event->opaque)->exit_status; +} + +/*! + * \internal + * \brief Return the execution status from an event notification + * + * \param[in] event Event + * + * \return Execution status from event + */ +int +stonith__event_execution_status(stonith_event_t *event) +{ + if ((event == NULL) || (event->opaque == NULL)) { + return PCMK_EXEC_UNKNOWN; + } + return ((pcmk__action_result_t *) event->opaque)->execution_status; +} + +/*! + * \internal + * \brief Return the exit reason from an event notification + * + * \param[in] event Event + * + * \return Exit reason from event + */ +const char * +stonith__event_exit_reason(stonith_event_t *event) +{ + if ((event == NULL) || (event->opaque == NULL)) { + return NULL; + } + return ((pcmk__action_result_t *) event->opaque)->exit_reason; +} + +/*! + * \internal + * \brief Return a human-friendly description of a fencing event + * + * \param[in] event Event to describe + * + * \return Newly allocated string with description of \p event + * \note The caller is responsible for freeing the return value. + * This function asserts on memory errors and never returns NULL. + * \note This currently is useful only for events of type + * T_STONITH_NOTIFY_FENCE. + */ +char * +stonith__event_description(stonith_event_t *event) +{ + const char *reason; + const char *status; + + if (stonith__event_execution_status(event) != PCMK_EXEC_DONE) { + status = pcmk_exec_status_str(stonith__event_execution_status(event)); + } else if (stonith__event_exit_status(event) != CRM_EX_OK) { + status = pcmk_exec_status_str(PCMK_EXEC_ERROR); + } else { + status = crm_exit_str(CRM_EX_OK); + } + reason = stonith__event_exit_reason(event); + + return crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s%s%s%s (ref=%s)", + event->action, event->target, + (event->executioner? event->executioner : "the cluster"), + (event->client_origin? event->client_origin : "a client"), + event->origin, status, + ((reason == NULL)? "" : " ("), + ((reason == NULL)? "" : reason), + ((reason == NULL)? "" : ")"), + event->id); +} + + // Deprecated functions kept only for backward API compatibility // LCOV_EXCL_START const char *get_stonith_provider(const char *agent, const char *provider); const char * get_stonith_provider(const char *agent, const char *provider) { return stonith_namespace2text(stonith_get_namespace(agent, provider)); } // LCOV_EXCL_STOP // End deprecated API diff --git a/tools/crm_mon.c b/tools/crm_mon.c index a6c459aaf7..e7b4fe2847 100644 --- a/tools/crm_mon.c +++ b/tools/crm_mon.c @@ -1,2409 +1,2408 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // pcmk__ends_with_ext() #include #include #include #include #include #include #include #include #include #include #include #include #include #include "crm_mon.h" #define SUMMARY "Provides a summary of cluster's current state.\n\n" \ "Outputs varying levels of detail in a number of different formats." /* * Definitions indicating which items to print */ static unsigned int show; static unsigned int show_opts = pcmk_show_pending; /* * Definitions indicating how to output */ static mon_output_format_t output_format = mon_output_unset; /* other globals */ static GIOChannel *io_channel = NULL; static GMainLoop *mainloop = NULL; static guint reconnect_timer = 0; static mainloop_timer_t *refresh_timer = NULL; static pe_working_set_t *mon_data_set = NULL; static cib_t *cib = NULL; static stonith_t *st = NULL; static xmlNode *current_cib = NULL; static GError *error = NULL; static pcmk__common_args_t *args = NULL; static pcmk__output_t *out = NULL; static GOptionContext *context = NULL; static gchar **processed_args = NULL; static time_t last_refresh = 0; volatile crm_trigger_t *refresh_trigger = NULL; static gboolean fence_history = FALSE; static gboolean has_warnings = FALSE; static gboolean on_remote_node = FALSE; static gboolean use_cib_native = FALSE; int interactive_fence_level = 0; static pcmk__supported_format_t formats[] = { #if CURSES_ENABLED CRM_MON_SUPPORTED_FORMAT_CURSES, #endif PCMK__SUPPORTED_FORMAT_HTML, PCMK__SUPPORTED_FORMAT_NONE, PCMK__SUPPORTED_FORMAT_TEXT, PCMK__SUPPORTED_FORMAT_XML, { NULL, NULL, NULL } }; /* Define exit codes for monitoring-compatible output * For nagios plugins, the possibilities are * OK=0, WARN=1, CRIT=2, and UNKNOWN=3 */ #define MON_STATUS_WARN CRM_EX_ERROR #define MON_STATUS_CRIT CRM_EX_INVALID_PARAM #define MON_STATUS_UNKNOWN CRM_EX_UNIMPLEMENT_FEATURE #define RECONNECT_MSECS 5000 struct { guint reconnect_ms; gboolean daemonize; gboolean fence_connect; gboolean one_shot; gboolean print_pending; gboolean show_bans; gboolean watch_fencing; char *pid_file; char *external_agent; char *external_recipient; char *neg_location_prefix; char *only_node; char *only_rsc; GSList *user_includes_excludes; GSList *includes_excludes; } options = { .fence_connect = TRUE, .reconnect_ms = RECONNECT_MSECS }; static void clean_up_fencing_connection(void); static crm_exit_t clean_up(crm_exit_t exit_code); static void crm_diff_update(const char *event, xmlNode * msg); static void handle_connection_failures(int rc); static int mon_refresh_display(gpointer user_data); static int cib_connect(gboolean full); static int fencing_connect(void); static int pacemakerd_status(void); static void mon_st_callback_event(stonith_t * st, stonith_event_t * e); static void mon_st_callback_display(stonith_t * st, stonith_event_t * e); static void refresh_after_event(gboolean data_updated, gboolean enforce); static unsigned int all_includes(mon_output_format_t fmt) { if (fmt == mon_output_monitor || fmt == mon_output_plain || fmt == mon_output_console) { return ~pcmk_section_options; } else { return pcmk_section_all; } } static unsigned int default_includes(mon_output_format_t fmt) { switch (fmt) { case mon_output_monitor: case mon_output_plain: case mon_output_console: return pcmk_section_summary | pcmk_section_nodes | pcmk_section_resources | pcmk_section_failures; case mon_output_xml: case mon_output_legacy_xml: return all_includes(fmt); case mon_output_html: case mon_output_cgi: return pcmk_section_summary | pcmk_section_nodes | pcmk_section_resources | pcmk_section_failures; default: return 0; } } struct { const char *name; unsigned int bit; } sections[] = { { "attributes", pcmk_section_attributes }, { "bans", pcmk_section_bans }, { "counts", pcmk_section_counts }, { "dc", pcmk_section_dc }, { "failcounts", pcmk_section_failcounts }, { "failures", pcmk_section_failures }, { "fencing", pcmk_section_fencing_all }, { "fencing-failed", pcmk_section_fence_failed }, { "fencing-pending", pcmk_section_fence_pending }, { "fencing-succeeded", pcmk_section_fence_worked }, { "maint-mode", pcmk_section_maint_mode }, { "nodes", pcmk_section_nodes }, { "operations", pcmk_section_operations }, { "options", pcmk_section_options }, { "resources", pcmk_section_resources }, { "stack", pcmk_section_stack }, { "summary", pcmk_section_summary }, { "tickets", pcmk_section_tickets }, { "times", pcmk_section_times }, { NULL } }; static unsigned int find_section_bit(const char *name) { for (int i = 0; sections[i].name != NULL; i++) { if (pcmk__str_eq(sections[i].name, name, pcmk__str_casei)) { return sections[i].bit; } } return 0; } static gboolean apply_exclude(const gchar *excludes, GError **error) { char **parts = NULL; gboolean result = TRUE; parts = g_strsplit(excludes, ",", 0); for (char **s = parts; *s != NULL; s++) { unsigned int bit = find_section_bit(*s); if (pcmk__str_eq(*s, "all", pcmk__str_none)) { show = 0; } else if (pcmk__str_eq(*s, "none", pcmk__str_none)) { show = all_includes(output_format); } else if (bit != 0) { show &= ~bit; } else { g_set_error(error, PCMK__EXITC_ERROR, CRM_EX_USAGE, "--exclude options: all, attributes, bans, counts, dc, " "failcounts, failures, fencing, fencing-failed, " "fencing-pending, fencing-succeeded, maint-mode, nodes, " "none, operations, options, resources, stack, summary, " "tickets, times"); result = FALSE; break; } } g_strfreev(parts); return result; } static gboolean apply_include(const gchar *includes, GError **error) { char **parts = NULL; gboolean result = TRUE; parts = g_strsplit(includes, ",", 0); for (char **s = parts; *s != NULL; s++) { unsigned int bit = find_section_bit(*s); if (pcmk__str_eq(*s, "all", pcmk__str_none)) { show = all_includes(output_format); } else if (pcmk__starts_with(*s, "bans")) { show |= pcmk_section_bans; if (options.neg_location_prefix != NULL) { free(options.neg_location_prefix); options.neg_location_prefix = NULL; } if (strlen(*s) > 4 && (*s)[4] == ':') { options.neg_location_prefix = strdup(*s+5); } } else if (pcmk__str_any_of(*s, "default", "defaults", NULL)) { show |= default_includes(output_format); } else if (pcmk__str_eq(*s, "none", pcmk__str_none)) { show = 0; } else if (bit != 0) { show |= bit; } else { g_set_error(error, PCMK__EXITC_ERROR, CRM_EX_USAGE, "--include options: all, attributes, bans[:PREFIX], counts, dc, " "default, failcounts, failures, fencing, fencing-failed, " "fencing-pending, fencing-succeeded, maint-mode, nodes, none, " "operations, options, resources, stack, summary, tickets, times"); result = FALSE; break; } } g_strfreev(parts); return result; } static gboolean apply_include_exclude(GSList *lst, GError **error) { gboolean rc = TRUE; GSList *node = lst; while (node != NULL) { char *s = node->data; if (pcmk__starts_with(s, "--include=")) { rc = apply_include(s+10, error); } else if (pcmk__starts_with(s, "-I=")) { rc = apply_include(s+3, error); } else if (pcmk__starts_with(s, "--exclude=")) { rc = apply_exclude(s+10, error); } else if (pcmk__starts_with(s, "-U=")) { rc = apply_exclude(s+3, error); } if (rc != TRUE) { break; } node = node->next; } return rc; } static gboolean user_include_exclude_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { char *s = crm_strdup_printf("%s=%s", option_name, optarg); options.user_includes_excludes = g_slist_append(options.user_includes_excludes, s); return TRUE; } static gboolean include_exclude_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { char *s = crm_strdup_printf("%s=%s", option_name, optarg); options.includes_excludes = g_slist_append(options.includes_excludes, s); return TRUE; } static gboolean as_cgi_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { if (args->output_ty != NULL) { free(args->output_ty); } args->output_ty = strdup("html"); output_format = mon_output_cgi; options.one_shot = TRUE; return TRUE; } static gboolean as_html_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { if (args->output_ty != NULL) { free(args->output_ty); } if (args->output_dest != NULL) { free(args->output_dest); args->output_dest = NULL; } if (optarg != NULL) { args->output_dest = strdup(optarg); } args->output_ty = strdup("html"); output_format = mon_output_html; umask(S_IWGRP | S_IWOTH); return TRUE; } static gboolean as_simple_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { if (args->output_ty != NULL) { free(args->output_ty); } args->output_ty = strdup("text"); output_format = mon_output_monitor; options.one_shot = TRUE; return TRUE; } static gboolean as_xml_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { if (args->output_ty != NULL) { free(args->output_ty); } args->output_ty = strdup("xml"); output_format = mon_output_legacy_xml; return TRUE; } static gboolean fence_history_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { if (optarg == NULL) { interactive_fence_level = 2; } else { pcmk__scan_min_int(optarg, &interactive_fence_level, 0); } switch (interactive_fence_level) { case 3: options.fence_connect = TRUE; fence_history = TRUE; return include_exclude_cb("--include", "fencing", data, err); case 2: options.fence_connect = TRUE; fence_history = TRUE; return include_exclude_cb("--include", "fencing", data, err); case 1: options.fence_connect = TRUE; fence_history = TRUE; return include_exclude_cb("--include", "fencing-failed,fencing-pending", data, err); case 0: options.fence_connect = FALSE; fence_history = FALSE; return include_exclude_cb("--exclude", "fencing", data, err); default: g_set_error(err, PCMK__EXITC_ERROR, CRM_EX_INVALID_PARAM, "Fence history must be 0-3"); return FALSE; } } static gboolean group_by_node_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { show_opts |= pcmk_show_rscs_by_node; return TRUE; } static gboolean hide_headers_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { return user_include_exclude_cb("--exclude", "summary", data, err); } static gboolean inactive_resources_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { show_opts |= pcmk_show_inactive_rscs; return TRUE; } static gboolean no_curses_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { output_format = mon_output_plain; return TRUE; } static gboolean print_brief_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { show_opts |= pcmk_show_brief; return TRUE; } static gboolean print_detail_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { show_opts |= pcmk_show_details; return TRUE; } static gboolean print_timing_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { show_opts |= pcmk_show_timing; return user_include_exclude_cb("--include", "operations", data, err); } static gboolean reconnect_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { int rc = crm_get_msec(optarg); if (rc == -1) { g_set_error(err, PCMK__EXITC_ERROR, CRM_EX_INVALID_PARAM, "Invalid value for -i: %s", optarg); return FALSE; } else { options.reconnect_ms = crm_parse_interval_spec(optarg); } return TRUE; } static gboolean show_attributes_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { return user_include_exclude_cb("--include", "attributes", data, err); } static gboolean show_bans_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { if (optarg != NULL) { char *s = crm_strdup_printf("bans:%s", optarg); gboolean rc = user_include_exclude_cb("--include", s, data, err); free(s); return rc; } else { return user_include_exclude_cb("--include", "bans", data, err); } } static gboolean show_failcounts_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { return user_include_exclude_cb("--include", "failcounts", data, err); } static gboolean show_operations_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { return user_include_exclude_cb("--include", "failcounts,operations", data, err); } static gboolean show_tickets_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { return user_include_exclude_cb("--include", "tickets", data, err); } static gboolean use_cib_file_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { setenv("CIB_file", optarg, 1); options.one_shot = TRUE; return TRUE; } #define INDENT " " /* *INDENT-OFF* */ static GOptionEntry addl_entries[] = { { "interval", 'i', 0, G_OPTION_ARG_CALLBACK, reconnect_cb, "Update frequency (default is 5 seconds)", "TIMESPEC" }, { "one-shot", '1', 0, G_OPTION_ARG_NONE, &options.one_shot, "Display the cluster status once on the console and exit", NULL }, { "daemonize", 'd', 0, G_OPTION_ARG_NONE, &options.daemonize, "Run in the background as a daemon.\n" INDENT "Requires at least one of --output-to and --external-agent.", NULL }, { "pid-file", 'p', 0, G_OPTION_ARG_FILENAME, &options.pid_file, "(Advanced) Daemon pid file location", "FILE" }, { "external-agent", 'E', 0, G_OPTION_ARG_FILENAME, &options.external_agent, "A program to run when resource operations take place", "FILE" }, { "external-recipient", 'e', 0, G_OPTION_ARG_STRING, &options.external_recipient, "A recipient for your program (assuming you want the program to send something to someone).", "RCPT" }, { "watch-fencing", 'W', 0, G_OPTION_ARG_NONE, &options.watch_fencing, "Listen for fencing events. For use with --external-agent.", NULL }, { "xml-file", 'x', G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_CALLBACK, use_cib_file_cb, NULL, NULL }, { NULL } }; static GOptionEntry display_entries[] = { { "include", 'I', 0, G_OPTION_ARG_CALLBACK, user_include_exclude_cb, "A list of sections to include in the output.\n" INDENT "See `Output Control` help for more information.", "SECTION(s)" }, { "exclude", 'U', 0, G_OPTION_ARG_CALLBACK, user_include_exclude_cb, "A list of sections to exclude from the output.\n" INDENT "See `Output Control` help for more information.", "SECTION(s)" }, { "node", 0, 0, G_OPTION_ARG_STRING, &options.only_node, "When displaying information about nodes, show only what's related to the given\n" INDENT "node, or to all nodes tagged with the given tag", "NODE" }, { "resource", 0, 0, G_OPTION_ARG_STRING, &options.only_rsc, "When displaying information about resources, show only what's related to the given\n" INDENT "resource, or to all resources tagged with the given tag", "RSC" }, { "group-by-node", 'n', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, group_by_node_cb, "Group resources by node", NULL }, { "inactive", 'r', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, inactive_resources_cb, "Display inactive resources", NULL }, { "failcounts", 'f', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, show_failcounts_cb, "Display resource fail counts", NULL }, { "operations", 'o', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, show_operations_cb, "Display resource operation history", NULL }, { "timing-details", 't', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, print_timing_cb, "Display resource operation history with timing details", NULL }, { "tickets", 'c', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, show_tickets_cb, "Display cluster tickets", NULL }, { "fence-history", 'm', G_OPTION_FLAG_OPTIONAL_ARG, G_OPTION_ARG_CALLBACK, fence_history_cb, "Show fence history:\n" INDENT "0=off, 1=failures and pending (default without option),\n" INDENT "2=add successes (default without value for option),\n" INDENT "3=show full history without reduction to most recent of each flavor", "LEVEL" }, { "neg-locations", 'L', G_OPTION_FLAG_OPTIONAL_ARG, G_OPTION_ARG_CALLBACK, show_bans_cb, "Display negative location constraints [optionally filtered by id prefix]", NULL }, { "show-node-attributes", 'A', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, show_attributes_cb, "Display node attributes", NULL }, { "hide-headers", 'D', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, hide_headers_cb, "Hide all headers", NULL }, { "show-detail", 'R', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, print_detail_cb, "Show more details (node IDs, individual clone instances)", NULL }, { "brief", 'b', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, print_brief_cb, "Brief output", NULL }, { "pending", 'j', G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, &options.print_pending, "Display pending state if 'record-pending' is enabled", NULL }, { "simple-status", 's', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, as_simple_cb, "Display the cluster status once as a simple one line output (suitable for nagios)", NULL }, { NULL } }; static GOptionEntry deprecated_entries[] = { { "as-html", 'h', G_OPTION_FLAG_FILENAME, G_OPTION_ARG_CALLBACK, as_html_cb, "Write cluster status to the named HTML file.\n" INDENT "Use --output-as=html --output-to=FILE instead.", "FILE" }, { "as-xml", 'X', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, as_xml_cb, "Write cluster status as XML to stdout. This will enable one-shot mode.\n" INDENT "Use --output-as=xml instead.", NULL }, { "disable-ncurses", 'N', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, no_curses_cb, "Disable the use of ncurses.\n" INDENT "Use --output-as=text instead.", NULL }, { "web-cgi", 'w', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, as_cgi_cb, "Web mode with output suitable for CGI (preselected when run as *.cgi).\n" INDENT "Use --output-as=html --html-cgi instead.", NULL }, { NULL } }; /* *INDENT-ON* */ /* Reconnect to the CIB and fencing agent after reconnect_ms has passed. This sounds * like it would be more broadly useful, but only ever happens after a disconnect via * mon_cib_connection_destroy. */ static gboolean reconnect_after_timeout(gpointer data) { #if CURSES_ENABLED if (output_format == mon_output_console) { clear(); refresh(); } #endif out->info(out, "Reconnecting..."); if (pacemakerd_status() == pcmk_rc_ok) { fencing_connect(); if (cib_connect(TRUE) == pcmk_rc_ok) { /* trigger redrawing the screen (needs reconnect_timer == 0) */ reconnect_timer = 0; refresh_after_event(FALSE, TRUE); return G_SOURCE_REMOVE; } } reconnect_timer = g_timeout_add(options.reconnect_ms, reconnect_after_timeout, NULL); return G_SOURCE_REMOVE; } /* Called from various places when we are disconnected from the CIB or from the * fencing agent. If the CIB connection is still valid, this function will also * attempt to sign off and reconnect. */ static void mon_cib_connection_destroy(gpointer user_data) { out->info(out, "Connection to the cluster-daemons terminated"); if (refresh_timer != NULL) { /* we'll trigger a refresh after reconnect */ mainloop_timer_stop(refresh_timer); } if (reconnect_timer) { /* we'll trigger a new reconnect-timeout at the end */ g_source_remove(reconnect_timer); reconnect_timer = 0; } if (st) { /* the client API won't properly reconnect notifications * if they are still in the table - so remove them */ clean_up_fencing_connection(); } if (cib) { cib->cmds->signoff(cib); reconnect_timer = g_timeout_add(options.reconnect_ms, reconnect_after_timeout, NULL); } return; } /* Signal handler installed into the mainloop for normal program shutdown */ static void mon_shutdown(int nsig) { clean_up(CRM_EX_OK); } #if CURSES_ENABLED static volatile sighandler_t ncurses_winch_handler; /* Signal handler installed the regular way (not into the main loop) for when * the screen is resized. Commonly, this happens when running in an xterm and * the user changes its size. */ static void mon_winresize(int nsig) { static int not_done; int lines = 0, cols = 0; if (!not_done++) { if (ncurses_winch_handler) /* the original ncurses WINCH signal handler does the * magic of retrieving the new window size; * otherwise, we'd have to use ioctl or tgetent */ (*ncurses_winch_handler) (SIGWINCH); getmaxyx(stdscr, lines, cols); resizeterm(lines, cols); /* Alert the mainloop code we'd like the refresh_trigger to run next * time the mainloop gets around to checking. */ mainloop_set_trigger((crm_trigger_t *) refresh_trigger); } not_done--; } #endif static int fencing_connect(void) { int rc = pcmk_ok; if (options.fence_connect && st == NULL) { st = stonith_api_new(); } if (!options.fence_connect || st == NULL || st->state != stonith_disconnected) { return rc; } rc = st->cmds->connect(st, crm_system_name, NULL); if (rc == pcmk_ok) { crm_trace("Setting up stonith callbacks"); if (options.watch_fencing) { st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, mon_st_callback_event); st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, mon_st_callback_event); } else { st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, mon_st_callback_display); st->cmds->register_notification(st, T_STONITH_NOTIFY_HISTORY, mon_st_callback_display); } } else { clean_up_fencing_connection(); } return rc; } static int cib_connect(gboolean full) { int rc = pcmk_rc_ok; CRM_CHECK(cib != NULL, return EINVAL); if (cib->state == cib_connected_query || cib->state == cib_connected_command) { return rc; } crm_trace("Connecting to the CIB"); rc = pcmk_legacy2rc(cib->cmds->signon(cib, crm_system_name, cib_query)); if (rc != pcmk_rc_ok) { out->err(out, "Could not connect to the CIB: %s", pcmk_rc_str(rc)); return rc; } #if CURSES_ENABLED /* just show this if refresh is gonna remove all traces */ if (output_format == mon_output_console) { out->info(out,"Waiting for CIB ..."); } #endif rc = pcmk_legacy2rc(cib->cmds->query(cib, NULL, ¤t_cib, cib_scope_local | cib_sync_call)); if (rc == pcmk_rc_ok && full) { rc = pcmk_legacy2rc(cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy)); if (rc == EPROTONOSUPPORT) { out->err(out, "Notification setup not supported, won't be " "able to reconnect after failure"); if (output_format == mon_output_console) { sleep(2); } rc = pcmk_rc_ok; } if (rc == pcmk_rc_ok) { cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); rc = pcmk_legacy2rc(cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update)); } if (rc != pcmk_rc_ok) { out->err(out, "Notification setup failed, could not monitor CIB actions"); cib__clean_up_connection(&cib); clean_up_fencing_connection(); } } return rc; } /* This is used to set up the fencing options after the interactive UI has been stared. * fence_history_cb can't be used because it builds up a list of includes/excludes that * then have to be processed with apply_include_exclude and that could affect other * things. */ static void set_fencing_options(int level) { switch (level) { case 3: options.fence_connect = TRUE; fence_history = TRUE; show |= pcmk_section_fencing_all; break; case 2: options.fence_connect = TRUE; fence_history = TRUE; show |= pcmk_section_fencing_all; break; case 1: options.fence_connect = TRUE; fence_history = TRUE; show |= pcmk_section_fence_failed | pcmk_section_fence_pending; break; default: interactive_fence_level = 0; options.fence_connect = FALSE; fence_history = FALSE; show &= ~pcmk_section_fencing_all; break; } } /* Before trying to connect to fencer or cib check for state of pacemakerd - just no sense in trying till pacemakerd has taken care of starting all the sub-processes Only noteworthy thing to show here is when pacemakerd is waiting for startup-trigger from SBD. */ static void pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api, enum pcmk_ipc_event event_type, crm_exit_t status, void *event_data, void *user_data) { pcmk_pacemakerd_api_reply_t *reply = event_data; enum pcmk_pacemakerd_state *state = (enum pcmk_pacemakerd_state *) user_data; /* we are just interested in the latest reply */ *state = pcmk_pacemakerd_state_invalid; switch (event_type) { case pcmk_ipc_event_reply: break; default: return; } if (status != CRM_EX_OK) { out->err(out, "Bad reply from pacemakerd: %s", crm_exit_str(status)); return; } if (reply->reply_type != pcmk_pacemakerd_reply_ping) { out->err(out, "Unknown reply type %d from pacemakerd", reply->reply_type); } else { if ((reply->data.ping.last_good != (time_t) 0) && (reply->data.ping.status == pcmk_rc_ok)) { *state = reply->data.ping.state; } } } static int pacemakerd_status(void) { int rc = pcmk_rc_ok; pcmk_ipc_api_t *pacemakerd_api = NULL; enum pcmk_pacemakerd_state state = pcmk_pacemakerd_state_invalid; if (!use_cib_native) { /* we don't need fully functional pacemakerd otherwise */ return rc; } if (cib != NULL && (cib->state == cib_connected_query || cib->state == cib_connected_command)) { /* As long as we have a cib-connection let's go with * that to fetch further cluster-status and avoid * unnecessary pings to pacemakerd. * If cluster is going down and fencer is down already * this will lead to a silently failing fencer reconnect. * On cluster startup we shouldn't see this situation * as first we do is wait for pacemakerd to report all * daemons running. */ return rc; } rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd); if (pacemakerd_api == NULL) { out->err(out, "Could not connect to pacemakerd: %s", pcmk_rc_str(rc)); /* this is unrecoverable so return with rc we have */ return rc; } pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, (void *) &state); rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_poll); switch (rc) { case pcmk_rc_ok: rc = pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name); if (rc == pcmk_rc_ok) { rc = pcmk_poll_ipc(pacemakerd_api, options.reconnect_ms/2); if (rc == pcmk_rc_ok) { pcmk_dispatch_ipc(pacemakerd_api); rc = ENOTCONN; if ((output_format == mon_output_console) || (output_format == mon_output_plain)) { switch (state) { case pcmk_pacemakerd_state_running: rc = pcmk_rc_ok; break; case pcmk_pacemakerd_state_starting_daemons: out->info(out,"Pacemaker daemons starting ..."); break; case pcmk_pacemakerd_state_wait_for_ping: out->info(out,"Waiting for startup-trigger from SBD ..."); break; case pcmk_pacemakerd_state_shutting_down: out->info(out,"Pacemaker daemons shutting down ..."); /* try our luck maybe CIB is still accessible */ rc = pcmk_rc_ok; break; case pcmk_pacemakerd_state_shutdown_complete: /* assuming pacemakerd doesn't dispatch any pings after entering * that state unless it is waiting for SBD */ out->info(out,"Pacemaker daemons shut down - reporting to SBD ..."); break; default: break; } } else { switch (state) { case pcmk_pacemakerd_state_running: rc = pcmk_rc_ok; break; case pcmk_pacemakerd_state_shutting_down: /* try our luck maybe CIB is still accessible */ rc = pcmk_rc_ok; break; default: break; } } } } break; case EREMOTEIO: rc = pcmk_rc_ok; on_remote_node = TRUE; #if CURSES_ENABLED /* just show this if refresh is gonna remove all traces */ if (output_format == mon_output_console) { out->info(out, "Running on remote-node waiting to be connected by cluster ..."); } #endif break; default: break; } pcmk_free_ipc_api(pacemakerd_api); /* returning with ENOTCONN triggers a retry */ return (rc == pcmk_rc_ok)?rc:ENOTCONN; } #if CURSES_ENABLED static const char * get_option_desc(char c) { const char *desc = "No help available"; for (GOptionEntry *entry = display_entries; entry != NULL; entry++) { if (entry->short_name == c) { desc = entry->description; break; } } return desc; } #define print_option_help(out, option, condition) \ curses_formatted_printf(out, "%c %c: \t%s\n", ((condition)? '*': ' '), option, get_option_desc(option)); /* This function is called from the main loop when there is something to be read * on stdin, like an interactive user's keystroke. All it does is read the keystroke, * set flags (or show the page showing which keystrokes are valid), and redraw the * screen. It does not do anything with connections to the CIB or fencing agent * agent what would happen in mon_refresh_display. */ static gboolean detect_user_input(GIOChannel *channel, GIOCondition condition, gpointer user_data) { int c; gboolean config_mode = FALSE; while (1) { /* Get user input */ c = getchar(); switch (c) { case 'm': interactive_fence_level++; if (interactive_fence_level > 3) { interactive_fence_level = 0; } set_fencing_options(interactive_fence_level); break; case 'c': show ^= pcmk_section_tickets; break; case 'f': show ^= pcmk_section_failcounts; break; case 'n': show_opts ^= pcmk_show_rscs_by_node; break; case 'o': show ^= pcmk_section_operations; if (!pcmk_is_set(show, pcmk_section_operations)) { show_opts &= ~pcmk_show_timing; } break; case 'r': show_opts ^= pcmk_show_inactive_rscs; break; case 'R': show_opts ^= pcmk_show_details; #ifdef PCMK__COMPAT_2_0 // Keep failed action output the same as 2.0.x show_opts |= pcmk_show_failed_detail; #endif break; case 't': show_opts ^= pcmk_show_timing; if (pcmk_is_set(show_opts, pcmk_show_timing)) { show |= pcmk_section_operations; } break; case 'A': show ^= pcmk_section_attributes; break; case 'L': show ^= pcmk_section_bans; break; case 'D': /* If any header is shown, clear them all, otherwise set them all */ if (pcmk_any_flags_set(show, pcmk_section_summary)) { show &= ~pcmk_section_summary; } else { show |= pcmk_section_summary; } /* Regardless, we don't show options in console mode. */ show &= ~pcmk_section_options; break; case 'b': show_opts ^= pcmk_show_brief; break; case 'j': show_opts ^= pcmk_show_pending; break; case '?': config_mode = TRUE; break; default: /* All other keys just redraw the screen. */ goto refresh; } if (!config_mode) goto refresh; blank_screen(); curses_formatted_printf(out, "%s", "Display option change mode\n"); print_option_help(out, 'c', pcmk_is_set(show, pcmk_section_tickets)); print_option_help(out, 'f', pcmk_is_set(show, pcmk_section_failcounts)); print_option_help(out, 'n', pcmk_is_set(show_opts, pcmk_show_rscs_by_node)); print_option_help(out, 'o', pcmk_is_set(show, pcmk_section_operations)); print_option_help(out, 'r', pcmk_is_set(show_opts, pcmk_show_inactive_rscs)); print_option_help(out, 't', pcmk_is_set(show_opts, pcmk_show_timing)); print_option_help(out, 'A', pcmk_is_set(show, pcmk_section_attributes)); print_option_help(out, 'L', pcmk_is_set(show, pcmk_section_bans)); print_option_help(out, 'D', !pcmk_is_set(show, pcmk_section_summary)); print_option_help(out, 'R', pcmk_any_flags_set(show_opts, pcmk_show_details)); print_option_help(out, 'b', pcmk_is_set(show_opts, pcmk_show_brief)); print_option_help(out, 'j', pcmk_is_set(show_opts, pcmk_show_pending)); curses_formatted_printf(out, "%d m: \t%s\n", interactive_fence_level, get_option_desc('m')); curses_formatted_printf(out, "%s", "\nToggle fields via field letter, type any other key to return\n"); } refresh: refresh_after_event(FALSE, TRUE); return TRUE; } #endif // Basically crm_signal_handler(SIGCHLD, SIG_IGN) plus the SA_NOCLDWAIT flag static void avoid_zombies(void) { struct sigaction sa; memset(&sa, 0, sizeof(struct sigaction)); if (sigemptyset(&sa.sa_mask) < 0) { crm_warn("Cannot avoid zombies: %s", pcmk_strerror(errno)); return; } sa.sa_handler = SIG_IGN; sa.sa_flags = SA_RESTART|SA_NOCLDWAIT; if (sigaction(SIGCHLD, &sa, NULL) < 0) { crm_warn("Cannot avoid zombies: %s", pcmk_strerror(errno)); } } static GOptionContext * build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { GOptionContext *context = NULL; GOptionEntry extra_prog_entries[] = { { "quiet", 'Q', 0, G_OPTION_ARG_NONE, &(args->quiet), "Be less descriptive in output.", NULL }, { NULL } }; const char *description = "Notes:\n\n" "If this program is called as crm_mon.cgi, --output-as=html --html-cgi will\n" "automatically be added to the command line arguments.\n\n" "Time Specification:\n\n" "The TIMESPEC in any command line option can be specified in many different\n" "formats. It can be just an integer number of seconds, a number plus units\n" "(ms/msec/us/usec/s/sec/m/min/h/hr), or an ISO 8601 period specification.\n\n" "Output Control:\n\n" "By default, a certain list of sections are written to the output destination.\n" "The default varies based on the output format - XML includes everything, while\n" "other output formats will display less. This list can be modified with the\n" "--include and --exclude command line options. Each option may be given multiple\n" "times on the command line, and each can give a comma-separated list of sections.\n" "The options are applied to the default set, from left to right as seen on the\n" "command line. For a list of valid sections, pass --include=list or --exclude=list.\n\n" "Interactive Use:\n\n" "When run interactively, crm_mon can be told to hide and display various sections\n" "of output. To see a help screen explaining the options, hit '?'. Any key stroke\n" "aside from those listed will cause the screen to refresh.\n\n" "Examples:\n\n" "Display the cluster status on the console with updates as they occur:\n\n" "\tcrm_mon\n\n" "Display the cluster status on the console just once then exit:\n\n" "\tcrm_mon -1\n\n" "Display your cluster status, group resources by node, and include inactive resources in the list:\n\n" "\tcrm_mon --group-by-node --inactive\n\n" "Start crm_mon as a background daemon and have it write the cluster status to an HTML file:\n\n" "\tcrm_mon --daemonize --output-as html --output-to /path/to/docroot/filename.html\n\n" "Start crm_mon and export the current cluster status as XML to stdout, then exit:\n\n" "\tcrm_mon --output-as xml\n\n"; #if CURSES_ENABLED context = pcmk__build_arg_context(args, "console (default), html, text, xml", group, NULL); #else context = pcmk__build_arg_context(args, "text (default), html, xml", group, NULL); #endif pcmk__add_main_args(context, extra_prog_entries); g_option_context_set_description(context, description); pcmk__add_arg_group(context, "display", "Display Options:", "Show display options", display_entries); pcmk__add_arg_group(context, "additional", "Additional Options:", "Show additional options", addl_entries); pcmk__add_arg_group(context, "deprecated", "Deprecated Options:", "Show deprecated options", deprecated_entries); return context; } /* If certain format options were specified, we want to set some extra * options. We can just process these like they were given on the * command line. */ static void add_output_args(void) { GError *err = NULL; if (output_format == mon_output_plain) { if (!pcmk__force_args(context, &err, "%s --text-fancy", g_get_prgname())) { g_propagate_error(&error, err); clean_up(CRM_EX_USAGE); } } else if (output_format == mon_output_cgi) { if (!pcmk__force_args(context, &err, "%s --html-cgi", g_get_prgname())) { g_propagate_error(&error, err); clean_up(CRM_EX_USAGE); } } else if (output_format == mon_output_xml) { if (!pcmk__force_args(context, &err, "%s --xml-simple-list --xml-substitute", g_get_prgname())) { g_propagate_error(&error, err); clean_up(CRM_EX_USAGE); } } else if (output_format == mon_output_legacy_xml) { output_format = mon_output_xml; if (!pcmk__force_args(context, &err, "%s --xml-legacy --xml-substitute", g_get_prgname())) { g_propagate_error(&error, err); clean_up(CRM_EX_USAGE); } } } /* Which output format to use could come from two places: The --as-xml * style arguments we gave in deprecated_entries above, or the formatted output * arguments added by pcmk__register_formats. If the latter were used, * output_format will be mon_output_unset. * * Call the callbacks as if those older style arguments were provided so * the various things they do get done. */ static void reconcile_output_format(pcmk__common_args_t *args) { gboolean retval = TRUE; GError *err = NULL; if (output_format != mon_output_unset) { return; } if (pcmk__str_eq(args->output_ty, "html", pcmk__str_casei)) { char *dest = NULL; if (args->output_dest != NULL) { dest = strdup(args->output_dest); } retval = as_html_cb("h", dest, NULL, &err); free(dest); } else if (pcmk__str_eq(args->output_ty, "text", pcmk__str_casei)) { retval = no_curses_cb("N", NULL, NULL, &err); } else if (pcmk__str_eq(args->output_ty, "xml", pcmk__str_casei)) { if (args->output_ty != NULL) { free(args->output_ty); } args->output_ty = strdup("xml"); output_format = mon_output_xml; } else if (options.one_shot) { if (args->output_ty != NULL) { free(args->output_ty); } args->output_ty = strdup("text"); output_format = mon_output_plain; } else { /* Neither old nor new arguments were given, so set the default. */ if (args->output_ty != NULL) { free(args->output_ty); } args->output_ty = strdup("console"); output_format = mon_output_console; } if (!retval) { g_propagate_error(&error, err); clean_up(CRM_EX_USAGE); } } static void handle_connection_failures(int rc) { if (rc == pcmk_rc_ok) { return; } if (output_format == mon_output_monitor) { g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "CLUSTER CRIT: Connection to cluster failed: %s", pcmk_rc_str(rc)); rc = MON_STATUS_CRIT; } else if (rc == ENOTCONN) { if (on_remote_node) { g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "Error: remote-node not connected to cluster"); } else { g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "Error: cluster is not available on this node"); } rc = pcmk_rc2exitc(rc); } else { g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "Connection to cluster failed: %s", pcmk_rc_str(rc)); rc = pcmk_rc2exitc(rc); } clean_up(rc); } static void one_shot(void) { int rc; rc = pacemakerd_status(); if (rc == pcmk_rc_ok) { fencing_connect(); rc = cib_connect(FALSE); } if (rc == pcmk_rc_ok) { mon_refresh_display(NULL); } else { handle_connection_failures(rc); } clean_up(CRM_EX_OK); } int main(int argc, char **argv) { int rc = pcmk_ok; GOptionGroup *output_group = NULL; args = pcmk__new_common_args(SUMMARY); context = build_arg_context(args, &output_group); pcmk__register_formats(output_group, formats); options.pid_file = strdup("/tmp/ClusterMon.pid"); pcmk__cli_init_logging("crm_mon", 0); // Avoid needing to wait for subprocesses forked for -E/--external-agent avoid_zombies(); if (pcmk__ends_with_ext(argv[0], ".cgi")) { output_format = mon_output_cgi; options.one_shot = TRUE; } processed_args = pcmk__cmdline_preproc(argv, "ehimpxEILU"); fence_history_cb("--fence-history", "1", NULL, NULL); /* Set an HTML title regardless of what format we will eventually use. This can't * be done in add_output_args. That function is called after command line * arguments are processed in the next block, which means it'll override whatever * title the user provides. Doing this here means the user can give their own * title on the command line. */ if (!pcmk__force_args(context, &error, "%s --html-title \"Cluster Status\"", g_get_prgname())) { return clean_up(CRM_EX_USAGE); } if (!g_option_context_parse_strv(context, &processed_args, &error)) { return clean_up(CRM_EX_USAGE); } for (int i = 0; i < args->verbosity; i++) { crm_bump_log_level(argc, argv); } if (!args->version) { if (args->quiet) { include_exclude_cb("--exclude", "times", NULL, NULL); } if (options.watch_fencing) { fence_history_cb("--fence-history", "0", NULL, NULL); options.fence_connect = TRUE; } /* create the cib-object early to be able to do further * decisions based on the cib-source */ cib = cib_new(); if (cib == NULL) { rc = -EINVAL; } else { switch (cib->variant) { case cib_native: /* cib & fencing - everything available */ use_cib_native = TRUE; break; case cib_file: /* Don't try to connect to fencing as we * either don't have a running cluster or * the fencing-information would possibly * not match the cib data from a file. * As we don't expect cib-updates coming * in enforce one-shot. */ fence_history_cb("--fence-history", "0", NULL, NULL); options.one_shot = TRUE; break; case cib_remote: /* updates coming in but no fencing */ fence_history_cb("--fence-history", "0", NULL, NULL); break; case cib_undefined: case cib_database: default: /* something is odd */ rc = -EINVAL; break; } } if (options.one_shot) { if (output_format == mon_output_console) { output_format = mon_output_plain; } } else if (options.daemonize) { if ((output_format == mon_output_console) || (args->output_dest == NULL)) { output_format = mon_output_none; } crm_enable_stderr(FALSE); if (pcmk__str_eq(args->output_dest, "-", pcmk__str_null_matches | pcmk__str_casei) && !options.external_agent) { g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_USAGE, "--daemonize requires at least one of --output-to and --external-agent"); return clean_up(CRM_EX_USAGE); } if (cib) { /* to be on the safe side don't have cib-object around * when we are forking */ cib_delete(cib); cib = NULL; pcmk__daemonize(crm_system_name, options.pid_file); cib = cib_new(); if (cib == NULL) { rc = -EINVAL; } /* otherwise assume we've got the same cib-object we've just destroyed * in our parent */ } } else if (output_format == mon_output_console) { #if CURSES_ENABLED crm_enable_stderr(FALSE); #else options.one_shot = TRUE; output_format = mon_output_plain; printf("Defaulting to one-shot mode\n"); printf("You need to have curses available at compile time to enable console mode\n"); #endif } } if (rc != pcmk_ok) { // Shouldn't really be possible g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "Invalid CIB source"); return clean_up(CRM_EX_ERROR); } reconcile_output_format(args); add_output_args(); if (args->version && output_format == mon_output_console) { /* Use the text output format here if we are in curses mode but were given * --version. Displaying version information uses printf, and then we * immediately exit. We don't want to initialize curses for that. */ rc = pcmk__output_new(&out, "text", args->output_dest, argv); } else { rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); } if (rc != pcmk_rc_ok) { g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_ERROR, "Error creating output format %s: %s", args->output_ty, pcmk_rc_str(rc)); return clean_up(CRM_EX_ERROR); } /* output_format MUST NOT BE CHANGED AFTER THIS POINT. */ show = default_includes(output_format); /* Apply --include/--exclude flags we used internally. There's no error reporting * here because this would be a programming error. */ apply_include_exclude(options.includes_excludes, &error); /* And now apply any --include/--exclude flags the user gave on the command line. * These are done in a separate pass from the internal ones because we want to * make sure whatever the user specifies overrides whatever we do. */ if (!apply_include_exclude(options.user_includes_excludes, &error)) { return clean_up(CRM_EX_USAGE); } /* Sync up the initial value of interactive_fence_level with whatever was set with * --include/--exclude= options. */ if (pcmk_all_flags_set(show, pcmk_section_fencing_all)) { interactive_fence_level = 3; } else if (pcmk_is_set(show, pcmk_section_fence_worked)) { interactive_fence_level = 2; } else if (pcmk_any_flags_set(show, pcmk_section_fence_failed | pcmk_section_fence_pending)) { interactive_fence_level = 1; } else { interactive_fence_level = 0; } pcmk__register_lib_messages(out); crm_mon_register_messages(out); pe__register_messages(out); stonith__register_messages(out); if (args->version) { out->version(out, false); return clean_up(CRM_EX_OK); } /* Extra sanity checks when in CGI mode */ if (output_format == mon_output_cgi) { if (cib->variant == cib_file) { g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_USAGE, "CGI mode used with CIB file"); return clean_up(CRM_EX_USAGE); } else if (options.external_agent != NULL) { g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_USAGE, "CGI mode cannot be used with --external-agent"); return clean_up(CRM_EX_USAGE); } else if (options.daemonize == TRUE) { g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_USAGE, "CGI mode cannot be used with -d"); return clean_up(CRM_EX_USAGE); } } if (output_format == mon_output_xml || output_format == mon_output_legacy_xml) { show_opts |= pcmk_show_inactive_rscs | pcmk_show_timing; if (!options.daemonize) { options.one_shot = TRUE; } } if ((output_format == mon_output_html || output_format == mon_output_cgi) && out->dest != stdout) { pcmk__html_add_header("meta", "http-equiv", "refresh", "content", pcmk__itoa(options.reconnect_ms / 1000), NULL); } #ifdef PCMK__COMPAT_2_0 // Keep failed action output the same as 2.0.x show_opts |= pcmk_show_failed_detail; #endif crm_info("Starting %s", crm_system_name); cib__set_output(cib, out); if (options.one_shot) { one_shot(); } do { out->info(out,"Waiting until cluster is available on this node ..."); rc = pacemakerd_status(); if (rc == pcmk_rc_ok) { fencing_connect(); rc = cib_connect(TRUE); } if (rc != pcmk_rc_ok) { pcmk__sleep_ms(options.reconnect_ms); #if CURSES_ENABLED if (output_format == mon_output_console) { clear(); refresh(); } #endif } else if (output_format == mon_output_html && out->dest != stdout) { printf("Writing html to %s ...\n", args->output_dest); } } while (rc == ENOTCONN); handle_connection_failures(rc); set_fencing_options(interactive_fence_level); mon_refresh_display(NULL); mainloop = g_main_loop_new(NULL, FALSE); mainloop_add_signal(SIGTERM, mon_shutdown); mainloop_add_signal(SIGINT, mon_shutdown); #if CURSES_ENABLED if (output_format == mon_output_console) { ncurses_winch_handler = crm_signal_handler(SIGWINCH, mon_winresize); if (ncurses_winch_handler == SIG_DFL || ncurses_winch_handler == SIG_IGN || ncurses_winch_handler == SIG_ERR) ncurses_winch_handler = NULL; io_channel = g_io_channel_unix_new(STDIN_FILENO); g_io_add_watch(io_channel, G_IO_IN, detect_user_input, NULL); } #endif /* When refresh_trigger->trigger is set to TRUE, call mon_refresh_display. In * this file, that is anywhere mainloop_set_trigger is called. */ refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_display, NULL); g_main_loop_run(mainloop); g_main_loop_unref(mainloop); if (io_channel != NULL) { g_io_channel_shutdown(io_channel, TRUE, NULL); } crm_info("Exiting %s", crm_system_name); return clean_up(CRM_EX_OK); } /*! * \internal * \brief Print one-line status suitable for use with monitoring software * * \param[in] data_set Working set of CIB state * * \note This function's output (and the return code when the program exits) * should conform to https://www.monitoring-plugins.org/doc/guidelines.html */ static void print_simple_status(pcmk__output_t *out, pe_working_set_t * data_set) { GList *gIter = NULL; int nodes_online = 0; int nodes_standby = 0; int nodes_maintenance = 0; char *offline_nodes = NULL; size_t offline_nodes_len = 0; gboolean no_dc = FALSE; gboolean offline = FALSE; if (data_set->dc_node == NULL) { has_warnings = TRUE; no_dc = TRUE; } for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; if (node->details->standby && node->details->online) { nodes_standby++; } else if (node->details->maintenance && node->details->online) { nodes_maintenance++; } else if (node->details->online) { nodes_online++; } else { char *s = crm_strdup_printf("offline node: %s", node->details->uname); /* coverity[leaked_storage] False positive */ pcmk__add_word(&offline_nodes, &offline_nodes_len, s); free(s); has_warnings = TRUE; offline = TRUE; } } if (has_warnings) { out->info(out, "CLUSTER WARN: %s%s%s", no_dc ? "No DC" : "", no_dc && offline ? ", " : "", (offline? offline_nodes : "")); free(offline_nodes); } else { char *nodes_standby_s = NULL; char *nodes_maint_s = NULL; if (nodes_standby > 0) { nodes_standby_s = crm_strdup_printf(", %d standby node%s", nodes_standby, pcmk__plural_s(nodes_standby)); } if (nodes_maintenance > 0) { nodes_maint_s = crm_strdup_printf(", %d maintenance node%s", nodes_maintenance, pcmk__plural_s(nodes_maintenance)); } out->info(out, "CLUSTER OK: %d node%s online%s%s, " "%d resource instance%s configured", nodes_online, pcmk__plural_s(nodes_online), nodes_standby_s != NULL ? nodes_standby_s : "", nodes_maint_s != NULL ? nodes_maint_s : "", data_set->ninstances, pcmk__plural_s(data_set->ninstances)); free(nodes_standby_s); free(nodes_maint_s); } /* coverity[leaked_storage] False positive */ } static int send_custom_trap(const char *node, const char *rsc, const char *task, int target_rc, int rc, int status, const char *desc) { pid_t pid; /*setenv needs chars, these are ints */ char *rc_s = pcmk__itoa(rc); char *status_s = pcmk__itoa(status); char *target_rc_s = pcmk__itoa(target_rc); crm_debug("Sending external notification to '%s' via '%s'", options.external_recipient, options.external_agent); if(rsc) { setenv("CRM_notify_rsc", rsc, 1); } if (options.external_recipient) { setenv("CRM_notify_recipient", options.external_recipient, 1); } setenv("CRM_notify_node", node, 1); setenv("CRM_notify_task", task, 1); setenv("CRM_notify_desc", desc, 1); setenv("CRM_notify_rc", rc_s, 1); setenv("CRM_notify_target_rc", target_rc_s, 1); setenv("CRM_notify_status", status_s, 1); pid = fork(); if (pid == -1) { crm_perror(LOG_ERR, "notification fork() failed."); } if (pid == 0) { /* crm_debug("notification: I am the child. Executing the nofitication program."); */ execl(options.external_agent, options.external_agent, NULL); exit(CRM_EX_ERROR); } crm_trace("Finished running custom notification program '%s'.", options.external_agent); free(target_rc_s); free(status_s); free(rc_s); return 0; } static void handle_rsc_op(xmlNode * xml, const char *node_id) { int rc = -1; int status = -1; int target_rc = -1; gboolean notify = TRUE; char *rsc = NULL; char *task = NULL; const char *desc = NULL; const char *magic = NULL; const char *id = NULL; const char *node = NULL; xmlNode *n = xml; xmlNode * rsc_op = xml; if(strcmp((const char*)xml->name, XML_LRM_TAG_RSC_OP) != 0) { xmlNode *cIter; for(cIter = xml->children; cIter; cIter = cIter->next) { handle_rsc_op(cIter, node_id); } return; } id = crm_element_value(rsc_op, XML_LRM_ATTR_TASK_KEY); if (id == NULL) { /* Compatibility with <= 1.1.5 */ id = ID(rsc_op); } magic = crm_element_value(rsc_op, XML_ATTR_TRANSITION_MAGIC); if (magic == NULL) { /* non-change */ return; } if (!decode_transition_magic(magic, NULL, NULL, NULL, &status, &rc, &target_rc)) { crm_err("Invalid event %s detected for %s", magic, id); return; } if (parse_op_key(id, &rsc, &task, NULL) == FALSE) { crm_err("Invalid event detected for %s", id); goto bail; } node = crm_element_value(rsc_op, XML_LRM_ATTR_TARGET); while (n != NULL && !pcmk__str_eq(XML_CIB_TAG_STATE, TYPE(n), pcmk__str_casei)) { n = n->parent; } if(node == NULL && n) { node = crm_element_value(n, XML_ATTR_UNAME); } if (node == NULL && n) { node = ID(n); } if (node == NULL) { node = node_id; } if (node == NULL) { crm_err("No node detected for event %s (%s)", magic, id); goto bail; } /* look up where we expected it to be? */ desc = pcmk_strerror(pcmk_ok); if ((status == PCMK_EXEC_DONE) && (target_rc == rc)) { crm_notice("%s of %s on %s completed: %s", task, rsc, node, desc); if (rc == PCMK_OCF_NOT_RUNNING) { notify = FALSE; } } else if (status == PCMK_EXEC_DONE) { desc = services_ocf_exitcode_str(rc); crm_warn("%s of %s on %s failed: %s", task, rsc, node, desc); } else { desc = pcmk_exec_status_str(status); crm_warn("%s of %s on %s failed: %s", task, rsc, node, desc); } if (notify && options.external_agent) { send_custom_trap(node, rsc, task, target_rc, rc, status, desc); } bail: free(rsc); free(task); } /* This function is just a wrapper around mainloop_set_trigger so that it can be * called from a mainloop directly. It's simply another way of ensuring the screen * gets redrawn. */ static gboolean mon_trigger_refresh(gpointer user_data) { mainloop_set_trigger((crm_trigger_t *) refresh_trigger); return FALSE; } static void crm_diff_update_v2(const char *event, xmlNode * msg) { xmlNode *change = NULL; xmlNode *diff = get_message_xml(msg, F_CIB_UPDATE_RESULT); for (change = pcmk__xml_first_child(diff); change != NULL; change = pcmk__xml_next(change)) { const char *name = NULL; const char *op = crm_element_value(change, XML_DIFF_OP); const char *xpath = crm_element_value(change, XML_DIFF_PATH); xmlNode *match = NULL; const char *node = NULL; if(op == NULL) { continue; } else if(strcmp(op, "create") == 0) { match = change->children; } else if(strcmp(op, "move") == 0) { continue; } else if(strcmp(op, "delete") == 0) { continue; } else if(strcmp(op, "modify") == 0) { match = first_named_child(change, XML_DIFF_RESULT); if(match) { match = match->children; } } if(match) { name = (const char *)match->name; } crm_trace("Handling %s operation for %s %p, %s", op, xpath, match, name); if(xpath == NULL) { /* Version field, ignore */ } else if(name == NULL) { crm_debug("No result for %s operation to %s", op, xpath); CRM_ASSERT(strcmp(op, "delete") == 0 || strcmp(op, "move") == 0); } else if(strcmp(name, XML_TAG_CIB) == 0) { xmlNode *state = NULL; xmlNode *status = first_named_child(match, XML_CIB_TAG_STATUS); for (state = pcmk__xe_first_child(status); state != NULL; state = pcmk__xe_next(state)) { node = crm_element_value(state, XML_ATTR_UNAME); if (node == NULL) { node = ID(state); } handle_rsc_op(state, node); } } else if(strcmp(name, XML_CIB_TAG_STATUS) == 0) { xmlNode *state = NULL; for (state = pcmk__xe_first_child(match); state != NULL; state = pcmk__xe_next(state)) { node = crm_element_value(state, XML_ATTR_UNAME); if (node == NULL) { node = ID(state); } handle_rsc_op(state, node); } } else if(strcmp(name, XML_CIB_TAG_STATE) == 0) { node = crm_element_value(match, XML_ATTR_UNAME); if (node == NULL) { node = ID(match); } handle_rsc_op(match, node); } else if(strcmp(name, XML_CIB_TAG_LRM) == 0) { node = ID(match); handle_rsc_op(match, node); } else if(strcmp(name, XML_LRM_TAG_RESOURCES) == 0) { char *local_node = pcmk__xpath_node_id(xpath, "lrm"); handle_rsc_op(match, local_node); free(local_node); } else if(strcmp(name, XML_LRM_TAG_RESOURCE) == 0) { char *local_node = pcmk__xpath_node_id(xpath, "lrm"); handle_rsc_op(match, local_node); free(local_node); } else if(strcmp(name, XML_LRM_TAG_RSC_OP) == 0) { char *local_node = pcmk__xpath_node_id(xpath, "lrm"); handle_rsc_op(match, local_node); free(local_node); } else { crm_trace("Ignoring %s operation for %s %p, %s", op, xpath, match, name); } } } static void crm_diff_update_v1(const char *event, xmlNode * msg) { /* Process operation updates */ xmlXPathObject *xpathObj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_LRM_TAG_RSC_OP); int lpc = 0, max = numXpathResults(xpathObj); for (lpc = 0; lpc < max; lpc++) { xmlNode *rsc_op = getXpathResult(xpathObj, lpc); handle_rsc_op(rsc_op, NULL); } freeXpathObject(xpathObj); } static void crm_diff_update(const char *event, xmlNode * msg) { int rc = -1; static bool stale = FALSE; gboolean cib_updated = FALSE; xmlNode *diff = get_message_xml(msg, F_CIB_UPDATE_RESULT); out->progress(out, false); if (current_cib != NULL) { rc = xml_apply_patchset(current_cib, diff, TRUE); switch (rc) { case -pcmk_err_diff_resync: case -pcmk_err_diff_failed: crm_notice("[%s] Patch aborted: %s (%d)", event, pcmk_strerror(rc), rc); free_xml(current_cib); current_cib = NULL; break; case pcmk_ok: cib_updated = TRUE; break; default: crm_notice("[%s] ABORTED: %s (%d)", event, pcmk_strerror(rc), rc); free_xml(current_cib); current_cib = NULL; } } if (current_cib == NULL) { crm_trace("Re-requesting the full cib"); cib->cmds->query(cib, NULL, ¤t_cib, cib_scope_local | cib_sync_call); } if (options.external_agent) { int format = 0; crm_element_value_int(diff, "format", &format); switch(format) { case 1: crm_diff_update_v1(event, msg); break; case 2: crm_diff_update_v2(event, msg); break; default: crm_err("Unknown patch format: %d", format); } } if (current_cib == NULL) { if(!stale) { out->info(out, "--- Stale data ---"); } stale = TRUE; return; } stale = FALSE; refresh_after_event(cib_updated, FALSE); } static int get_fencing_history(stonith_history_t **stonith_history) { int rc = 0; while (fence_history) { if (st != NULL) { rc = st->cmds->history(st, st_opt_sync_call, NULL, stonith_history, 120); if (rc == 0) { *stonith_history = stonith__sort_history(*stonith_history); if (!pcmk_all_flags_set(show, pcmk_section_fencing_all) && (output_format != mon_output_xml)) { *stonith_history = pcmk__reduce_fence_history(*stonith_history); } break; /* all other cases are errors */ } } else { rc = ENOTCONN; break; } } return rc; } static int mon_refresh_display(gpointer user_data) { xmlNode *cib_copy = copy_xml(current_cib); stonith_history_t *stonith_history = NULL; int history_rc = 0; GList *unames = NULL; GList *resources = NULL; last_refresh = time(NULL); if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) { cib__clean_up_connection(&cib); out->err(out, "Upgrade failed: %s", pcmk_strerror(-pcmk_err_schema_validation)); clean_up(CRM_EX_CONFIG); return 0; } /* get the stonith-history if there is evidence we need it */ history_rc = get_fencing_history(&stonith_history); if (mon_data_set == NULL) { mon_data_set = pe_new_working_set(); CRM_ASSERT(mon_data_set != NULL); } pe__set_working_set_flags(mon_data_set, pe_flag_no_compat); mon_data_set->input = cib_copy; mon_data_set->priv = out; cluster_status(mon_data_set); /* Unpack constraints if any section will need them * (tickets may be referenced in constraints but not granted yet, * and bans need negative location constraints) */ if (pcmk_is_set(show, pcmk_section_bans) || pcmk_is_set(show, pcmk_section_tickets)) { pcmk__unpack_constraints(mon_data_set); } if (options.daemonize) { out->reset(out); } unames = pe__build_node_name_list(mon_data_set, options.only_node); resources = pe__build_rsc_list(mon_data_set, options.only_rsc); /* Always print DC if NULL. */ if (mon_data_set->dc_node == NULL) { show |= pcmk_section_dc; } switch (output_format) { case mon_output_html: case mon_output_cgi: if (out->message(out, "cluster-status", mon_data_set, crm_errno2exit(history_rc), stonith_history, fence_history, show, show_opts, options.neg_location_prefix, unames, resources) != 0) { g_set_error(&error, PCMK__EXITC_ERROR, CRM_EX_CANTCREAT, "Critical: Unable to output html file"); clean_up(CRM_EX_CANTCREAT); return 0; } break; case mon_output_monitor: print_simple_status(out, mon_data_set); if (has_warnings) { clean_up(MON_STATUS_WARN); return FALSE; } break; case mon_output_unset: case mon_output_none: break; default: out->message(out, "cluster-status", mon_data_set, crm_errno2exit(history_rc), stonith_history, fence_history, show, show_opts, options.neg_location_prefix, unames, resources); break; } if (options.daemonize) { out->finish(out, CRM_EX_OK, true, NULL); } g_list_free_full(unames, free); g_list_free_full(resources, free); stonith_history_free(stonith_history); stonith_history = NULL; pe_reset_working_set(mon_data_set); return 1; } /* This function is called for fencing events (see fencing_connect for which ones) when * --watch-fencing is used on the command line. */ static void mon_st_callback_event(stonith_t * st, stonith_event_t * e) { if (st->state == stonith_disconnected) { /* disconnect cib as well and have everything reconnect */ mon_cib_connection_destroy(NULL); } else if (options.external_agent) { - char *desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)", - e->operation, e->origin, e->target, pcmk_strerror(e->result), - e->id); + char *desc = stonith__event_description(e); + send_custom_trap(e->target, NULL, e->operation, pcmk_ok, e->result, 0, desc); free(desc); } } /* Cause the screen to be redrawn (via mainloop_set_trigger) when various conditions are met: * * - If the last update occurred more than reconnect_ms ago (defaults to 5s, but * can be changed via the -i command line option), or * - After every 10 CIB updates, or * - If it's been 2s since the last update * * This function sounds like it would be more broadly useful, but it is only called when a * fencing event is received or a CIB diff occurrs. */ static void refresh_after_event(gboolean data_updated, gboolean enforce) { static int updates = 0; time_t now = time(NULL); if (data_updated) { updates++; } if(refresh_timer == NULL) { refresh_timer = mainloop_timer_add("refresh", 2000, FALSE, mon_trigger_refresh, NULL); } if (reconnect_timer > 0) { /* we will receive a refresh request after successful reconnect */ mainloop_timer_stop(refresh_timer); return; } /* as we're not handling initial failure of fencer-connection as * fatal give it a retry here * not getting here if cib-reconnection is already on the way */ fencing_connect(); if (enforce || ((now - last_refresh) > (options.reconnect_ms / 1000)) || updates >= 10) { mainloop_set_trigger((crm_trigger_t *) refresh_trigger); mainloop_timer_stop(refresh_timer); updates = 0; } else { mainloop_timer_start(refresh_timer); } } /* This function is called for fencing events (see fencing_connect for which ones) when * --watch-fencing is NOT used on the command line. */ static void mon_st_callback_display(stonith_t * st, stonith_event_t * e) { if (st->state == stonith_disconnected) { /* disconnect cib as well and have everything reconnect */ mon_cib_connection_destroy(NULL); } else { out->progress(out, false); refresh_after_event(TRUE, FALSE); } } static void clean_up_fencing_connection(void) { if (st == NULL) { return; } if (st->state != stonith_disconnected) { st->cmds->remove_notification(st, T_STONITH_NOTIFY_DISCONNECT); st->cmds->remove_notification(st, T_STONITH_NOTIFY_FENCE); st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY); st->cmds->disconnect(st); } stonith_api_delete(st); st = NULL; } /* * De-init ncurses, disconnect from the CIB manager, disconnect fencing, * deallocate memory and show usage-message if requested. * * We don't actually return, but nominally returning crm_exit_t allows a usage * like "return clean_up(exit_code);" which helps static analysis understand the * code flow. */ static crm_exit_t clean_up(crm_exit_t exit_code) { /* Quitting crm_mon is much more complicated than it ought to be. */ /* (1) Close connections, free things, etc. */ cib__clean_up_connection(&cib); clean_up_fencing_connection(); free(options.neg_location_prefix); free(options.only_node); free(options.only_rsc); free(options.pid_file); g_slist_free_full(options.includes_excludes, free); pe_free_working_set(mon_data_set); mon_data_set = NULL; g_strfreev(processed_args); /* (2) If this is abnormal termination and we're in curses mode, shut down * curses first. Any messages displayed to the screen before curses is shut * down will be lost because doing the shut down will also restore the * screen to whatever it looked like before crm_mon was started. */ if ((error != NULL || exit_code == CRM_EX_USAGE) && output_format == mon_output_console) { out->finish(out, exit_code, false, NULL); pcmk__output_free(out); out = NULL; } /* (3) If this is a command line usage related failure, print the usage * message. */ if (exit_code == CRM_EX_USAGE && (output_format == mon_output_console || output_format == mon_output_plain)) { char *help = g_option_context_get_help(context, TRUE, NULL); fprintf(stderr, "%s", help); g_free(help); } pcmk__free_arg_context(context); /* (4) If this is any kind of error, print the error out and exit. Make * sure to handle situations both before and after formatted output is * set up. We want errors to appear formatted if at all possible. */ if (error != NULL) { if (out != NULL) { out->err(out, "%s: %s", g_get_prgname(), error->message); out->finish(out, exit_code, true, NULL); pcmk__output_free(out); } else { fprintf(stderr, "%s: %s\n", g_get_prgname(), error->message); } g_clear_error(&error); crm_exit(exit_code); } /* (5) Print formatted output to the screen if we made it far enough in * crm_mon to be able to do so. */ if (out != NULL) { if (!options.daemonize) { out->finish(out, exit_code, true, NULL); } pcmk__output_free(out); pcmk__unregister_formats(); } crm_exit(exit_code); }