diff --git a/daemons/fenced/fenced_scheduler.c b/daemons/fenced/fenced_scheduler.c index 609a506a8e..3e9aa32e13 100644 --- a/daemons/fenced/fenced_scheduler.c +++ b/daemons/fenced/fenced_scheduler.c @@ -1,14 +1,71 @@ /* * Copyright 2009-2023 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include +#include +#include + #include +#include + +#include +#include pcmk_scheduler_t *fenced_data_set = NULL; + +/*! + * \internal + * \brief Initialize scheduler data for fencer purposes + * + * \return Standard Pacemaker return code + */ +int +fenced_scheduler_init(void) +{ + pcmk__output_t *logger = NULL; + int rc = pcmk__log_output_new(&logger); + + if (rc != pcmk_rc_ok) { + return rc; + } + + fenced_data_set = pe_new_working_set(); + if (fenced_data_set == NULL) { + pcmk__output_free(logger); + return ENOMEM; + } + + pe__register_messages(logger); + pcmk__register_lib_messages(logger); + pcmk__output_set_log_level(logger, LOG_TRACE); + fenced_data_set->priv = logger; + + return pcmk_rc_ok; +} + +/*! + * \internal + * \brief Free all scheduler-related resources + */ +void +fenced_scheduler_cleanup(void) +{ + if (fenced_data_set != NULL) { + pcmk__output_t *logger = fenced_data_set->priv; + + if (logger != NULL) { + logger->finish(logger, CRM_EX_OK, true, NULL); + pcmk__output_free(logger); + fenced_data_set->priv = NULL; + } + pe_free_working_set(fenced_data_set); + fenced_data_set = NULL; + } +} diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c index d352555162..7c69fb8a10 100644 --- a/daemons/fenced/pacemaker-fenced.c +++ b/daemons/fenced/pacemaker-fenced.c @@ -1,915 +1,898 @@ /* * Copyright 2009-2023 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include // PRIu32, PRIx32 #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include #include #define SUMMARY "daemon for executing fencing devices in a Pacemaker cluster" -extern pcmk_scheduler_t *fenced_data_set; - char *stonith_our_uname = NULL; long stonith_watchdog_timeout_ms = 0; GList *stonith_watchdog_targets = NULL; static GMainLoop *mainloop = NULL; gboolean stand_alone = FALSE; gboolean stonith_shutdown_flag = FALSE; static qb_ipcs_service_t *ipcs = NULL; -static pcmk__output_t *logger_out = NULL; static pcmk__output_t *out = NULL; pcmk__supported_format_t formats[] = { PCMK__SUPPORTED_FORMAT_NONE, PCMK__SUPPORTED_FORMAT_TEXT, PCMK__SUPPORTED_FORMAT_XML, { NULL, NULL, NULL } }; static struct { bool no_cib_connect; gchar **log_files; } options; crm_exit_t exit_code = CRM_EX_OK; static void stonith_cleanup(void); static int32_t st_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) { if (stonith_shutdown_flag) { crm_info("Ignoring new client [%d] during shutdown", pcmk__client_pid(c)); return -EPERM; } if (pcmk__new_client(c, uid, gid) == NULL) { return -EIO; } return 0; } /* Exit code means? */ static int32_t st_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; int call_options = 0; xmlNode *request = NULL; pcmk__client_t *c = pcmk__find_client(qbc); const char *op = NULL; if (c == NULL) { crm_info("Invalid client: %p", qbc); return 0; } request = pcmk__client_data2xml(c, data, &id, &flags); if (request == NULL) { pcmk__ipc_send_ack(c, id, flags, "nack", NULL, CRM_EX_PROTOCOL); return 0; } op = crm_element_value(request, F_CRM_TASK); if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { crm_xml_add(request, F_TYPE, T_STONITH_NG); crm_xml_add(request, F_STONITH_OPERATION, op); crm_xml_add(request, F_STONITH_CLIENTID, c->id); crm_xml_add(request, F_STONITH_CLIENTNAME, pcmk__client_name(c)); crm_xml_add(request, F_STONITH_CLIENTNODE, stonith_our_uname); send_cluster_message(NULL, crm_msg_stonith_ng, request, FALSE); free_xml(request); return 0; } if (c->name == NULL) { const char *value = crm_element_value(request, F_STONITH_CLIENTNAME); if (value == NULL) { value = "unknown"; } c->name = crm_strdup_printf("%s.%u", value, c->pid); } crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); crm_trace("Flags %#08" PRIx32 "/%#08x for command %" PRIu32 " from client %s", flags, call_options, id, pcmk__client_name(c)); if (pcmk_is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(flags & crm_ipc_client_response); CRM_LOG_ASSERT(c->request_id == 0); /* This means the client has two synchronous events in-flight */ c->request_id = id; /* Reply only to the last one */ } crm_xml_add(request, F_STONITH_CLIENTID, c->id); crm_xml_add(request, F_STONITH_CLIENTNAME, pcmk__client_name(c)); crm_xml_add(request, F_STONITH_CLIENTNODE, stonith_our_uname); crm_log_xml_trace(request, "ipc-received"); stonith_command(c, id, flags, request, NULL); free_xml(request); return 0; } /* Error code means? */ static int32_t st_ipc_closed(qb_ipcs_connection_t * c) { pcmk__client_t *client = pcmk__find_client(c); if (client == NULL) { return 0; } crm_trace("Connection %p closed", c); pcmk__free_client(client); /* 0 means: yes, go ahead and destroy the connection */ return 0; } static void st_ipc_destroy(qb_ipcs_connection_t * c) { crm_trace("Connection %p destroyed", c); st_ipc_closed(c); } static void stonith_peer_callback(xmlNode * msg, void *private_data) { const char *remote_peer = crm_element_value(msg, F_ORIG); const char *op = crm_element_value(msg, F_STONITH_OPERATION); if (pcmk__str_eq(op, "poke", pcmk__str_none)) { return; } crm_log_xml_trace(msg, "Peer[inbound]"); stonith_command(NULL, 0, 0, msg, remote_peer); } #if SUPPORT_COROSYNC static void stonith_peer_ais_callback(cpg_handle_t handle, const struct cpg_name *groupName, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { uint32_t kind = 0; xmlNode *xml = NULL; const char *from = NULL; char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from); if(data == NULL) { return; } if (kind == crm_class_cluster) { xml = string2xml(data); if (xml == NULL) { crm_err("Invalid XML: '%.120s'", data); free(data); return; } crm_xml_add(xml, F_ORIG, from); /* crm_xml_add_int(xml, F_SEQ, wrapper->id); */ stonith_peer_callback(xml, NULL); } free_xml(xml); free(data); return; } static void stonith_peer_cs_destroy(gpointer user_data) { crm_crit("Lost connection to cluster layer, shutting down"); stonith_shutdown(0); } #endif void do_local_reply(const xmlNode *notify_src, pcmk__client_t *client, int call_options) { /* send callback to originating child */ int local_rc = pcmk_rc_ok; int rid = 0; uint32_t ipc_flags = crm_ipc_server_event; if (pcmk_is_set(call_options, st_opt_sync_call)) { CRM_LOG_ASSERT(client->request_id); rid = client->request_id; client->request_id = 0; ipc_flags = crm_ipc_flags_none; } local_rc = pcmk__ipc_send_xml(client, rid, notify_src, ipc_flags); if (local_rc == pcmk_rc_ok) { crm_trace("Sent response %d to client %s", rid, pcmk__client_name(client)); } else { crm_warn("%synchronous reply to client %s failed: %s", (pcmk_is_set(call_options, st_opt_sync_call)? "S" : "As"), pcmk__client_name(client), pcmk_rc_str(local_rc)); } } uint64_t get_stonith_flag(const char *name) { if (pcmk__str_eq(name, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { return st_callback_notify_fence; } else if (pcmk__str_eq(name, STONITH_OP_DEVICE_ADD, pcmk__str_casei)) { return st_callback_device_add; } else if (pcmk__str_eq(name, STONITH_OP_DEVICE_DEL, pcmk__str_casei)) { return st_callback_device_del; } else if (pcmk__str_eq(name, T_STONITH_NOTIFY_HISTORY, pcmk__str_casei)) { return st_callback_notify_history; } else if (pcmk__str_eq(name, T_STONITH_NOTIFY_HISTORY_SYNCED, pcmk__str_casei)) { return st_callback_notify_history_synced; } return st_callback_unknown; } static void stonith_notify_client(gpointer key, gpointer value, gpointer user_data) { const xmlNode *update_msg = user_data; pcmk__client_t *client = value; const char *type = NULL; CRM_CHECK(client != NULL, return); CRM_CHECK(update_msg != NULL, return); type = crm_element_value(update_msg, F_SUBTYPE); CRM_CHECK(type != NULL, crm_log_xml_err(update_msg, "notify"); return); if (client->ipcs == NULL) { crm_trace("Skipping client with NULL channel"); return; } if (pcmk_is_set(client->flags, get_stonith_flag(type))) { int rc = pcmk__ipc_send_xml(client, 0, update_msg, crm_ipc_server_event); if (rc != pcmk_rc_ok) { crm_warn("%s notification of client %s failed: %s " CRM_XS " id=%.8s rc=%d", type, pcmk__client_name(client), pcmk_rc_str(rc), client->id, rc); } else { crm_trace("Sent %s notification to client %s", type, pcmk__client_name(client)); } } } void do_stonith_async_timeout_update(const char *client_id, const char *call_id, int timeout) { pcmk__client_t *client = NULL; xmlNode *notify_data = NULL; if (!timeout || !call_id || !client_id) { return; } client = pcmk__find_client_by_id(client_id); if (!client) { return; } notify_data = create_xml_node(NULL, T_STONITH_TIMEOUT_VALUE); crm_xml_add(notify_data, F_TYPE, T_STONITH_TIMEOUT_VALUE); crm_xml_add(notify_data, F_STONITH_CALLID, call_id); crm_xml_add_int(notify_data, F_STONITH_TIMEOUT, timeout); crm_trace("timeout update is %d for client %s and call id %s", timeout, client_id, call_id); if (client) { pcmk__ipc_send_xml(client, 0, notify_data, crm_ipc_server_event); } free_xml(notify_data); } /*! * \internal * \brief Notify relevant IPC clients of a fencing operation result * * \param[in] type Notification type * \param[in] result Result of fencing operation (assume success if NULL) * \param[in] data If not NULL, add to notification as call data */ void fenced_send_notification(const char *type, const pcmk__action_result_t *result, xmlNode *data) { /* TODO: Standardize the contents of data */ xmlNode *update_msg = create_xml_node(NULL, "notify"); CRM_LOG_ASSERT(type != NULL); crm_xml_add(update_msg, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(update_msg, F_SUBTYPE, type); crm_xml_add(update_msg, F_STONITH_OPERATION, type); stonith__xe_set_result(update_msg, result); if (data != NULL) { add_message_xml(update_msg, F_STONITH_CALLDATA, data); } crm_trace("Notifying clients"); pcmk__foreach_ipc_client(stonith_notify_client, update_msg); free_xml(update_msg); crm_trace("Notify complete"); } /*! * \internal * \brief Send notifications for a configuration change to subscribed clients * * \param[in] op Notification type (STONITH_OP_DEVICE_ADD, * STONITH_OP_DEVICE_DEL, STONITH_OP_LEVEL_ADD, or * STONITH_OP_LEVEL_DEL) * \param[in] result Operation result * \param[in] desc Description of what changed * \param[in] active Current number of devices or topologies in use */ static void send_config_notification(const char *op, const pcmk__action_result_t *result, const char *desc, int active) { xmlNode *notify_data = create_xml_node(NULL, op); CRM_CHECK(notify_data != NULL, return); crm_xml_add(notify_data, F_STONITH_DEVICE, desc); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); fenced_send_notification(op, result, notify_data); free_xml(notify_data); } /*! * \internal * \brief Send notifications for a device change to subscribed clients * * \param[in] op Notification type (STONITH_OP_DEVICE_ADD or * STONITH_OP_DEVICE_DEL) * \param[in] result Operation result * \param[in] desc ID of device that changed */ void fenced_send_device_notification(const char *op, const pcmk__action_result_t *result, const char *desc) { send_config_notification(op, result, desc, g_hash_table_size(device_list)); } /*! * \internal * \brief Send notifications for a topology level change to subscribed clients * * \param[in] op Notification type (STONITH_OP_LEVEL_ADD or * STONITH_OP_LEVEL_DEL) * \param[in] result Operation result * \param[in] desc String representation of level ([]) */ void fenced_send_level_notification(const char *op, const pcmk__action_result_t *result, const char *desc) { send_config_notification(op, result, desc, g_hash_table_size(topology)); } /*! * \internal * \brief Check whether a node does watchdog-fencing * * \param[in] node Name of node to check * * \return TRUE if node found in stonith_watchdog_targets * or stonith_watchdog_targets is empty indicating * all nodes are doing watchdog-fencing */ gboolean node_does_watchdog_fencing(const char *node) { return ((stonith_watchdog_targets == NULL) || pcmk__str_in_list(node, stonith_watchdog_targets, pcmk__str_casei)); } void stonith_shutdown(int nsig) { crm_info("Terminating with %d clients", pcmk__ipc_client_count()); stonith_shutdown_flag = TRUE; if (mainloop != NULL && g_main_loop_is_running(mainloop)) { g_main_loop_quit(mainloop); } } static void stonith_cleanup(void) { fenced_cib_cleanup(); if (ipcs) { qb_ipcs_destroy(ipcs); } crm_peer_destroy(); pcmk__client_cleanup(); free_stonith_remote_op_list(); free_topology_list(); free_device_list(); free_metadata_cache(); fenced_unregister_handlers(); free(stonith_our_uname); stonith_our_uname = NULL; } static gboolean stand_alone_cpg_cb(const gchar *option_name, const gchar *optarg, gpointer data, GError **error) { stand_alone = FALSE; options.no_cib_connect = true; return TRUE; } struct qb_ipcs_service_handlers ipc_callbacks = { .connection_accept = st_ipc_accept, .connection_created = NULL, .msg_process = st_ipc_dispatch, .connection_closed = st_ipc_closed, .connection_destroyed = st_ipc_destroy }; /*! * \internal * \brief Callback for peer status changes * * \param[in] type What changed * \param[in] node What peer had the change * \param[in] data Previous value of what changed */ static void st_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { if ((type != crm_status_processes) && !pcmk_is_set(node->flags, crm_remote_node)) { /* * This is a hack until we can send to a nodeid and/or we fix node name lookups * These messages are ignored in stonith_peer_callback() */ xmlNode *query = create_xml_node(NULL, "stonith_command"); crm_xml_add(query, F_XML_TAGNAME, "stonith_command"); crm_xml_add(query, F_TYPE, T_STONITH_NG); crm_xml_add(query, F_STONITH_OPERATION, "poke"); crm_debug("Broadcasting our uname because of node %u", node->id); send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE); free_xml(query); } } static pcmk__cluster_option_t fencer_options[] = { /* name, old name, type, allowed values, * default value, validator, * short description, * long description */ { PCMK_STONITH_HOST_ARGUMENT, NULL, "string", NULL, "port", NULL, N_("Advanced use only: An alternate parameter to supply instead of 'port'"), N_("some devices do not support the " "standard 'port' parameter or may provide additional ones. Use " "this to specify an alternate, device-specific, parameter " "that should indicate the machine to be fenced. A value of " "none can be used to tell the cluster not to supply any " "additional parameters.") }, { PCMK_STONITH_HOST_MAP,NULL, "string", NULL, "", NULL, N_("A mapping of host names to ports numbers for devices that do not support host names."), N_("Eg. node1:1;node2:2,3 would tell the cluster to use port 1 for node1 and ports 2 and 3 for node2") }, { PCMK_STONITH_HOST_LIST,NULL, "string", NULL, "", NULL, N_("Eg. node1,node2,node3"), N_("A list of machines controlled by " "this device (Optional unless pcmk_host_list=static-list)") }, { PCMK_STONITH_HOST_CHECK,NULL, "string", NULL, "dynamic-list", NULL, N_("How to determine which machines are controlled by the device."), N_("Allowed values: dynamic-list " "(query the device via the 'list' command), static-list " "(check the pcmk_host_list attribute), status " "(query the device via the 'status' command), " "none (assume every device can fence every " "machine)") }, { PCMK_STONITH_DELAY_MAX,NULL, "time", NULL, "0s", NULL, N_("Enable a base delay for fencing actions and specify base delay value."), N_("Enable a delay of no more than the " "time specified before executing fencing actions. Pacemaker " "derives the overall delay by taking the value of " "pcmk_delay_base and adding a random delay value such " "that the sum is kept below this maximum.") }, { PCMK_STONITH_DELAY_BASE,NULL, "string", NULL, "0s", NULL, N_("Enable a base delay for " "fencing actions and specify base delay value."), N_("This enables a static delay for " "fencing actions, which can help avoid \"death matches\" where " "two nodes try to fence each other at the same time. If " "pcmk_delay_max is also used, a random delay will be " "added such that the total delay is kept below that value." "This can be set to a single time value to apply to any node " "targeted by this device (useful if a separate device is " "configured for each target), or to a node map (for example, " "\"node1:1s;node2:5\") to set a different value per target.") }, { PCMK_STONITH_ACTION_LIMIT,NULL, "integer", NULL, "1", NULL, N_("The maximum number of actions can be performed in parallel on this device"), N_("Cluster property concurrent-fencing=true needs to be configured first." "Then use this to specify the maximum number of actions can be performed in parallel on this device. -1 is unlimited.") }, { "pcmk_reboot_action", NULL, "string", NULL, PCMK_ACTION_REBOOT, NULL, N_("Advanced use only: An alternate command to run instead of 'reboot'"), N_("Some devices do not support the standard commands or may provide additional ones.\n" "Use this to specify an alternate, device-specific, command that implements the \'reboot\' action.") }, { "pcmk_reboot_timeout",NULL, "time", NULL, "60s", NULL, N_("Advanced use only: Specify an alternate timeout to use for reboot actions instead of stonith-timeout"), N_("Some devices need much more/less time to complete than normal." "Use this to specify an alternate, device-specific, timeout for \'reboot\' actions.") }, { "pcmk_reboot_retries",NULL, "integer", NULL, "2", NULL, N_("Advanced use only: The maximum number of times to retry the 'reboot' command within the timeout period"), N_("Some devices do not support multiple connections." " Operations may 'fail' if the device is busy with another task so Pacemaker will automatically retry the operation, if there is time remaining." " Use this option to alter the number of times Pacemaker retries \'reboot\' actions before giving up.") }, { "pcmk_off_action", NULL, "string", NULL, PCMK_ACTION_OFF, NULL, N_("Advanced use only: An alternate command to run instead of \'off\'"), N_("Some devices do not support the standard commands or may provide additional ones." "Use this to specify an alternate, device-specific, command that implements the \'off\' action.") }, { "pcmk_off_timeout",NULL, "time", NULL, "60s", NULL, N_("Advanced use only: Specify an alternate timeout to use for off actions instead of stonith-timeout"), N_("Some devices need much more/less time to complete than normal." "Use this to specify an alternate, device-specific, timeout for \'off\' actions.") }, { "pcmk_off_retries",NULL, "integer", NULL, "2", NULL, N_("Advanced use only: The maximum number of times to retry the 'off' command within the timeout period"), N_("Some devices do not support multiple connections." " Operations may 'fail' if the device is busy with another task so Pacemaker will automatically retry the operation, if there is time remaining." " Use this option to alter the number of times Pacemaker retries \'off\' actions before giving up.") }, { "pcmk_on_action", NULL, "string", NULL, PCMK_ACTION_ON, NULL, N_("Advanced use only: An alternate command to run instead of 'on'"), N_("Some devices do not support the standard commands or may provide additional ones." "Use this to specify an alternate, device-specific, command that implements the \'on\' action.") }, { "pcmk_on_timeout",NULL, "time", NULL, "60s", NULL, N_("Advanced use only: Specify an alternate timeout to use for on actions instead of stonith-timeout"), N_("Some devices need much more/less time to complete than normal." "Use this to specify an alternate, device-specific, timeout for \'on\' actions.") }, { "pcmk_on_retries",NULL, "integer", NULL, "2", NULL, N_("Advanced use only: The maximum number of times to retry the 'on' command within the timeout period"), N_("Some devices do not support multiple connections." " Operations may 'fail' if the device is busy with another task so Pacemaker will automatically retry the operation, if there is time remaining." " Use this option to alter the number of times Pacemaker retries \'on\' actions before giving up.") }, { "pcmk_list_action",NULL, "string", NULL, PCMK_ACTION_LIST, NULL, N_("Advanced use only: An alternate command to run instead of \'list\'"), N_("Some devices do not support the standard commands or may provide additional ones." "Use this to specify an alternate, device-specific, command that implements the \'list\' action.") }, { "pcmk_list_timeout",NULL, "time", NULL, "60s", NULL, N_("Advanced use only: Specify an alternate timeout to use for list actions instead of stonith-timeout"), N_("Some devices need much more/less time to complete than normal." "Use this to specify an alternate, device-specific, timeout for \'list\' actions.") }, { "pcmk_list_retries",NULL, "integer", NULL, "2", NULL, N_("Advanced use only: The maximum number of times to retry the \'list\' command within the timeout period"), N_("Some devices do not support multiple connections." " Operations may 'fail' if the device is busy with another task so Pacemaker will automatically retry the operation, if there is time remaining." " Use this option to alter the number of times Pacemaker retries \'list\' actions before giving up.") }, { "pcmk_monitor_action", NULL, "string", NULL, PCMK_ACTION_MONITOR, NULL, N_("Advanced use only: An alternate command to run instead of \'monitor\'"), N_("Some devices do not support the standard commands or may provide additional ones." "Use this to specify an alternate, device-specific, command that implements the \'monitor\' action.") }, { "pcmk_monitor_timeout",NULL, "time", NULL, "60s", NULL, N_("Advanced use only: Specify an alternate timeout to use for monitor actions instead of stonith-timeout"), N_("Some devices need much more/less time to complete than normal.\n" "Use this to specify an alternate, device-specific, timeout for \'monitor\' actions.") }, { "pcmk_monitor_retries",NULL, "integer", NULL, "2", NULL, N_("Advanced use only: The maximum number of times to retry the \'monitor\' command within the timeout period"), N_("Some devices do not support multiple connections." " Operations may 'fail' if the device is busy with another task so Pacemaker will automatically retry the operation, if there is time remaining." " Use this option to alter the number of times Pacemaker retries \'monitor\' actions before giving up.") }, { "pcmk_status_action", NULL, "string", NULL, PCMK_ACTION_STATUS, NULL, N_("Advanced use only: An alternate command to run instead of \'status\'"), N_("Some devices do not support the standard commands or may provide additional ones." "Use this to specify an alternate, device-specific, command that implements the \'status\' action.") }, { "pcmk_status_timeout",NULL, "time", NULL, "60s", NULL, N_("Advanced use only: Specify an alternate timeout to use for status actions instead of stonith-timeout"), N_("Some devices need much more/less time to complete than normal." "Use this to specify an alternate, device-specific, timeout for \'status\' actions.") }, { "pcmk_status_retries",NULL, "integer", NULL, "2", NULL, N_("Advanced use only: The maximum number of times to retry the \'status\' command within the timeout period"), N_("Some devices do not support multiple connections." " Operations may 'fail' if the device is busy with another task so Pacemaker will automatically retry the operation, if there is time remaining." " Use this option to alter the number of times Pacemaker retries \'status\' actions before giving up.") }, }; void fencer_metadata(void) { const char *desc_short = N_("Instance attributes available for all " "\"stonith\"-class resources"); const char *desc_long = N_("Instance attributes available for all \"stonith\"-" "class resources and used by Pacemaker's fence " "daemon, formerly known as stonithd"); gchar *s = pcmk__format_option_metadata("pacemaker-fenced", desc_short, desc_long, fencer_options, PCMK__NELEM(fencer_options)); printf("%s", s); g_free(s); } static GOptionEntry entries[] = { { "stand-alone", 's', G_OPTION_FLAG_NONE, G_OPTION_ARG_NONE, &stand_alone, N_("Deprecated (will be removed in a future release)"), NULL }, { "stand-alone-w-cpg", 'c', G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, stand_alone_cpg_cb, N_("Intended for use in regression testing only"), NULL }, { "logfile", 'l', G_OPTION_FLAG_NONE, G_OPTION_ARG_FILENAME_ARRAY, &options.log_files, N_("Send logs to the additional named logfile"), NULL }, { NULL } }; static GOptionContext * build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { GOptionContext *context = NULL; context = pcmk__build_arg_context(args, "text (default), xml", group, "[metadata]"); pcmk__add_main_args(context, entries); return context; } int main(int argc, char **argv) { int rc = pcmk_rc_ok; crm_cluster_t *cluster = NULL; crm_ipc_t *old_instance = NULL; GError *error = NULL; GOptionGroup *output_group = NULL; pcmk__common_args_t *args = pcmk__new_common_args(SUMMARY); gchar **processed_args = pcmk__cmdline_preproc(argv, "l"); GOptionContext *context = build_arg_context(args, &output_group); crm_log_preinit(NULL, argc, argv); pcmk__register_formats(output_group, formats); if (!g_option_context_parse_strv(context, &processed_args, &error)) { exit_code = CRM_EX_USAGE; goto done; } rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); if (rc != pcmk_rc_ok) { exit_code = CRM_EX_ERROR; g_set_error(&error, PCMK__EXITC_ERROR, exit_code, "Error creating output format %s: %s", args->output_ty, pcmk_rc_str(rc)); goto done; } if (args->version) { out->version(out, false); goto done; } if ((g_strv_length(processed_args) >= 2) && pcmk__str_eq(processed_args[1], "metadata", pcmk__str_none)) { fencer_metadata(); goto done; } // Open additional log files pcmk__add_logfiles(options.log_files, out); crm_log_init(NULL, LOG_INFO + args->verbosity, TRUE, (args->verbosity > 0), argc, argv, FALSE); crm_notice("Starting Pacemaker fencer"); old_instance = crm_ipc_new("stonith-ng", 0); if (old_instance == NULL) { /* crm_ipc_new() will have already logged an error message with * crm_err() */ exit_code = CRM_EX_FATAL; goto done; } if (pcmk__connect_generic_ipc(old_instance) == pcmk_rc_ok) { // IPC endpoint already up crm_ipc_close(old_instance); crm_ipc_destroy(old_instance); crm_err("pacemaker-fenced is already active, aborting startup"); goto done; } else { // Not up or not authentic, we'll proceed either way crm_ipc_destroy(old_instance); old_instance = NULL; } mainloop_add_signal(SIGTERM, stonith_shutdown); crm_peer_init(); - fenced_data_set = pe_new_working_set(); - CRM_ASSERT(fenced_data_set != NULL); + rc = fenced_scheduler_init(); + if (rc != pcmk_rc_ok) { + exit_code = CRM_EX_FATAL; + g_set_error(&error, PCMK__EXITC_ERROR, exit_code, + "Error initializing scheduler data: %s", pcmk_rc_str(rc)); + goto done; + } cluster = pcmk_cluster_new(); if (!stand_alone) { #if SUPPORT_COROSYNC if (is_corosync_cluster()) { cluster->destroy = stonith_peer_cs_destroy; cluster->cpg.cpg_deliver_fn = stonith_peer_ais_callback; cluster->cpg.cpg_confchg_fn = pcmk_cpg_membership; } #endif // SUPPORT_COROSYNC crm_set_status_callback(&st_peer_update_callback); if (crm_cluster_connect(cluster) == FALSE) { exit_code = CRM_EX_FATAL; crm_crit("Cannot sign in to the cluster... terminating"); goto done; } pcmk__str_update(&stonith_our_uname, cluster->uname); if (!options.no_cib_connect) { setup_cib(); } } else { pcmk__str_update(&stonith_our_uname, "localhost"); crm_warn("Stand-alone mode is deprecated and will be removed " "in a future release"); } init_device_list(); init_topology_list(); pcmk__serve_fenced_ipc(&ipcs, &ipc_callbacks); - rc = pcmk__log_output_new(&logger_out); - if (rc != pcmk_rc_ok) { - exit_code = CRM_EX_FATAL; - g_set_error(&error, PCMK__EXITC_ERROR, exit_code, - "Error creating output format log: %s", pcmk_rc_str(rc)); - goto done; - } - pe__register_messages(logger_out); - pcmk__register_lib_messages(logger_out); - pcmk__output_set_log_level(logger_out, LOG_TRACE); - fenced_data_set->priv = logger_out; - // Create the mainloop and run it... mainloop = g_main_loop_new(NULL, FALSE); crm_notice("Pacemaker fencer successfully started and accepting connections"); g_main_loop_run(mainloop); done: g_strfreev(processed_args); pcmk__free_arg_context(context); g_strfreev(options.log_files); stonith_cleanup(); pcmk_cluster_free(cluster); - pe_free_working_set(fenced_data_set); + fenced_scheduler_cleanup(); pcmk__output_and_clear_error(&error, out); - if (logger_out != NULL) { - logger_out->finish(logger_out, exit_code, true, NULL); - pcmk__output_free(logger_out); - } - if (out != NULL) { out->finish(out, exit_code, true, NULL); pcmk__output_free(out); } pcmk__unregister_formats(); crm_exit(exit_code); } diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h index bf0d5d37dc..c2acf184bd 100644 --- a/daemons/fenced/pacemaker-fenced.h +++ b/daemons/fenced/pacemaker-fenced.h @@ -1,328 +1,331 @@ /* * Copyright 2009-2023 the Pacemaker project contributors * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include // uint32_t, uint64_t #include #include #include #include /*! * \internal * \brief Check whether target has already been fenced recently * * \param[in] tolerance Number of seconds to look back in time * \param[in] target Name of node to search for * \param[in] action Action we want to match * * \return TRUE if an equivalent fencing operation took place in the last * \p tolerance seconds, FALSE otherwise */ gboolean stonith_check_fence_tolerance(int tolerance, const char *target, const char *action); typedef struct stonith_device_s { char *id; char *agent; char *namespace; /*! list of actions that must execute on the target node. Used for unfencing */ GString *on_target_actions; GList *targets; time_t targets_age; gboolean has_attr_map; // Whether target's nodeid should be passed as a parameter to the agent gboolean include_nodeid; /* whether the cluster should automatically unfence nodes with the device */ gboolean automatic_unfencing; guint priority; uint32_t flags; // Group of enum st_device_flags GHashTable *params; GHashTable *aliases; GList *pending_ops; mainloop_timer_t *timer; crm_trigger_t *work; xmlNode *agent_metadata; /*! A verified device is one that has contacted the * agent successfully to perform a monitor operation */ gboolean verified; gboolean cib_registered; gboolean api_registered; gboolean dirty; } stonith_device_t; /* These values are used to index certain arrays by "phase". Usually an * operation has only one "phase", so phase is always zero. However, some * reboots are remapped to "off" then "on", in which case "reboot" will be * phase 0, "off" will be phase 1 and "on" will be phase 2. */ enum st_remap_phase { st_phase_requested = 0, st_phase_off = 1, st_phase_on = 2, st_phase_max = 3 }; typedef struct remote_fencing_op_s { /* The unique id associated with this operation */ char *id; /*! The node this operation will fence */ char *target; /*! The fencing action to perform on the target. (reboot, on, off) */ char *action; /*! When was the fencing action recorded (seconds since epoch) */ time_t created; /*! Marks if the final notifications have been sent to local stonith clients. */ gboolean notify_sent; /*! The number of query replies received */ guint replies; /*! The number of query replies expected */ guint replies_expected; /*! Does this node own control of this operation */ gboolean owner; /*! After query is complete, This the high level timer that expires the entire operation */ guint op_timer_total; /*! This timer expires the current fencing request. Many fencing * requests may exist in a single operation */ guint op_timer_one; /*! This timer expires the query request sent out to determine * what nodes are contain what devices, and who those devices can fence */ guint query_timer; /*! This is the default timeout to use for each fencing device if no * custom timeout is received in the query. */ gint base_timeout; /*! This is the calculated total timeout an operation can take before * expiring. This is calculated by adding together all the timeout * values associated with the devices this fencing operation may call */ gint total_timeout; /*! * Fencing delay (in seconds) requested by API client (used by controller to * implement priority-fencing-delay). A value of -1 means disable all * configured delays. */ int client_delay; /*! Delegate is the node being asked to perform a fencing action * on behalf of the node that owns the remote operation. Some operations * will involve multiple delegates. This value represents the final delegate * that is used. */ char *delegate; /*! The point at which the remote operation completed */ time_t completed; //! Group of enum stonith_call_options associated with this operation uint32_t call_options; /*! The current state of the remote operation. This indicates * what stage the op is in, query, exec, done, duplicate, failed. */ enum op_state state; /*! The node that owns the remote operation */ char *originator; /*! The local client id that initiated the fencing request */ char *client_id; /*! The client's call_id that initiated the fencing request */ int client_callid; /*! The name of client that initiated the fencing request */ char *client_name; /*! List of the received query results for all the nodes in the cpg group */ GList *query_results; /*! The original request that initiated the remote stonith operation */ xmlNode *request; /*! The current topology level being executed */ guint level; /*! The current operation phase being executed */ enum st_remap_phase phase; /*! Devices with automatic unfencing (always run if "on" requested, never if remapped) */ GList *automatic_list; /*! List of all devices at the currently executing topology level */ GList *devices_list; /*! Current entry in the topology device list */ GList *devices; /*! List of duplicate operations attached to this operation. Once this operation * completes, the duplicate operations will be closed out as well. */ GList *duplicates; /*! The point at which the remote operation completed(nsec) */ long long completed_nsec; /*! The (potentially intermediate) result of the operation */ pcmk__action_result_t result; } remote_fencing_op_t; void fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged); // Fencer-specific client flags enum st_client_flags { st_callback_unknown = UINT64_C(0), st_callback_notify_fence = (UINT64_C(1) << 0), st_callback_device_add = (UINT64_C(1) << 2), st_callback_device_del = (UINT64_C(1) << 4), st_callback_notify_history = (UINT64_C(1) << 5), st_callback_notify_history_synced = (UINT64_C(1) << 6) }; // How the user specified the target of a topology level enum fenced_target_by { fenced_target_by_unknown = -1, // Invalid or not yet parsed fenced_target_by_name, // By target name fenced_target_by_pattern, // By a pattern matching target names fenced_target_by_attribute, // By a node attribute/value on target }; /* * Complex fencing requirements are specified via fencing topologies. * A topology consists of levels; each level is a list of fencing devices. * Topologies are stored in a hash table by node name. When a node needs to be * fenced, if it has an entry in the topology table, the levels are tried * sequentially, and the devices in each level are tried sequentially. * Fencing is considered successful as soon as any level succeeds; * a level is considered successful if all its devices succeed. * Essentially, all devices at a given level are "and-ed" and the * levels are "or-ed". * * This structure is used for the topology table entries. * Topology levels start from 1, so levels[0] is unused and always NULL. */ typedef struct stonith_topology_s { enum fenced_target_by kind; // How target was specified /*! Node name regex or attribute name=value for which topology applies */ char *target; char *target_value; char *target_pattern; char *target_attribute; /*! Names of fencing devices at each topology level */ GList *levels[ST_LEVEL_MAX]; } stonith_topology_t; void stonith_shutdown(int nsig); void init_device_list(void); void free_device_list(void); void init_topology_list(void); void free_topology_list(void); void free_stonith_remote_op_list(void); void init_stonith_remote_op_hash_table(GHashTable **table); void free_metadata_cache(void); void fenced_unregister_handlers(void); uint64_t get_stonith_flag(const char *name); void stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags, xmlNode *op_request, const char *remote_peer); int stonith_device_register(xmlNode *msg, gboolean from_cib); void stonith_device_remove(const char *id, bool from_cib); char *stonith_level_key(const xmlNode *msg, enum fenced_target_by); void fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result); void fenced_unregister_level(xmlNode *msg, char **desc, pcmk__action_result_t *result); stonith_topology_t *find_topology_for_host(const char *host); void do_local_reply(const xmlNode *notify_src, pcmk__client_t *client, int call_options); xmlNode *fenced_construct_reply(const xmlNode *request, xmlNode *data, const pcmk__action_result_t *result); void do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); void fenced_send_notification(const char *type, const pcmk__action_result_t *result, xmlNode *data); void fenced_send_device_notification(const char *op, const pcmk__action_result_t *result, const char *desc); void fenced_send_level_notification(const char *op, const pcmk__action_result_t *result, const char *desc); remote_fencing_op_t *initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request, gboolean manual_ack); void fenced_process_fencing_reply(xmlNode *msg); int process_remote_stonith_query(xmlNode * msg); void *create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer); void stonith_fence_history(xmlNode *msg, xmlNode **output, const char *remote_peer, int options); void stonith_fence_history_trim(void); bool fencing_peer_active(crm_node_t *peer); void set_fencing_completed(remote_fencing_op_t * op); int fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg); void fencer_metadata(void); const char *fenced_device_reboot_action(const char *device_id); bool fenced_device_supports_on(const char *device_id); gboolean node_has_attr(const char *node, const char *name, const char *value); gboolean node_does_watchdog_fencing(const char *node); void fencing_topology_init(void); void setup_cib(void); void fenced_cib_cleanup(void); +int fenced_scheduler_init(void); +void fenced_scheduler_cleanup(void); + static inline void fenced_set_protocol_error(pcmk__action_result_t *result) { pcmk__set_result(result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, "Fencer API request missing required information (bug?)"); } /*! * \internal * \brief Get the device flag to use with a given action when searching devices * * \param[in] action Action to check * * \return st_device_supports_on if \p action is "on", otherwise * st_device_supports_none */ static inline uint32_t fenced_support_flag(const char *action) { if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) { return st_device_supports_on; } return st_device_supports_none; } extern char *stonith_our_uname; extern gboolean stand_alone; extern GHashTable *device_list; extern GHashTable *topology; extern long stonith_watchdog_timeout_ms; extern GList *stonith_watchdog_targets; extern GHashTable *stonith_remote_op_list; extern crm_exit_t exit_code; extern gboolean stonith_shutdown_flag;