diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c index 4a8e931c58..0ed6a10479 100644 --- a/daemons/fenced/fenced_remote.c +++ b/daemons/fenced/fenced_remote.c @@ -1,2599 +1,2599 @@ /* * Copyright 2009-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TIMEOUT_MULTIPLY_FACTOR 1.2 /* When one fencer queries its peers for devices able to handle a fencing * request, each peer will reply with a list of such devices available to it. * Each reply will be parsed into a peer_device_info_t, with each device's * information kept in a device_properties_t. */ typedef struct device_properties_s { /* Whether access to this device has been verified */ gboolean verified; /* The remaining members are indexed by the operation's "phase" */ /* Whether this device has been executed in each phase */ gboolean executed[st_phase_max]; /* Whether this device is disallowed from executing in each phase */ gboolean disallowed[st_phase_max]; /* Action-specific timeout for each phase */ int custom_action_timeout[st_phase_max]; /* Action-specific maximum random delay for each phase */ int delay_max[st_phase_max]; /* Action-specific base delay for each phase */ int delay_base[st_phase_max]; /* Group of enum st_device_flags */ uint32_t device_support_flags; } device_properties_t; typedef struct { /* Name of peer that sent this result */ char *host; /* Only try peers for non-topology based operations once */ gboolean tried; /* Number of entries in the devices table */ int ndevices; /* Devices available to this host that are capable of fencing the target */ GHashTable *devices; } peer_device_info_t; GHashTable *stonith_remote_op_list = NULL; extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, int call_options); static void request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer); static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup); static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); static int get_op_total_timeout(const remote_fencing_op_t *op, const peer_device_info_t *chosen_peer); static gint sort_strings(gconstpointer a, gconstpointer b) { return strcmp(a, b); } static void free_remote_query(gpointer data) { if (data != NULL) { peer_device_info_t *peer = data; g_hash_table_destroy(peer->devices); free(peer->host); free(peer); } } void free_stonith_remote_op_list(void) { if (stonith_remote_op_list != NULL) { g_hash_table_destroy(stonith_remote_op_list); stonith_remote_op_list = NULL; } } struct peer_count_data { const remote_fencing_op_t *op; gboolean verified_only; uint32_t support_action_only; int count; }; /*! * \internal * \brief Increment a counter if a device has not been executed yet * * \param[in] key Device ID (ignored) * \param[in] value Device properties * \param[in,out] user_data Peer count data */ static void count_peer_device(gpointer key, gpointer value, gpointer user_data) { device_properties_t *props = (device_properties_t*)value; struct peer_count_data *data = user_data; if (!props->executed[data->op->phase] && (!data->verified_only || props->verified) && ((data->support_action_only == st_device_supports_none) || pcmk_is_set(props->device_support_flags, data->support_action_only))) { ++(data->count); } } /*! * \internal * \brief Check the number of available devices in a peer's query results * * \param[in] op Operation that results are for * \param[in] peer Peer to count * \param[in] verified_only Whether to count only verified devices * \param[in] support_action_only Whether to count only devices that support action * * \return Number of devices available to peer that were not already executed */ static int count_peer_devices(const remote_fencing_op_t *op, const peer_device_info_t *peer, gboolean verified_only, uint32_t support_on_action_only) { struct peer_count_data data; data.op = op; data.verified_only = verified_only; data.support_action_only = support_on_action_only; data.count = 0; if (peer) { g_hash_table_foreach(peer->devices, count_peer_device, &data); } return data.count; } /*! * \internal * \brief Search for a device in a query result * * \param[in] op Operation that result is for * \param[in] peer Query result for a peer * \param[in] device Device ID to search for * * \return Device properties if found, NULL otherwise */ static device_properties_t * find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer, const char *device, uint32_t support_action_only) { device_properties_t *props = g_hash_table_lookup(peer->devices, device); if (props && support_action_only != st_device_supports_none && !pcmk_is_set(props->device_support_flags, support_action_only)) { return NULL; } return (props && !props->executed[op->phase] && !props->disallowed[op->phase])? props : NULL; } /*! * \internal * \brief Find a device in a peer's device list and mark it as executed * * \param[in] op Operation that peer result is for * \param[in,out] peer Peer with results to search * \param[in] device ID of device to mark as done * \param[in] verified_devices_only Only consider verified devices * * \return TRUE if device was found and marked, FALSE otherwise */ static gboolean grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer, const char *device, gboolean verified_devices_only) { device_properties_t *props = find_peer_device(op, peer, device, fenced_support_flag(op->action)); if ((props == NULL) || (verified_devices_only && !props->verified)) { return FALSE; } crm_trace("Removing %s from %s (%d remaining)", device, peer->host, count_peer_devices(op, peer, FALSE, st_device_supports_none)); props->executed[op->phase] = TRUE; return TRUE; } static void clear_remote_op_timers(remote_fencing_op_t * op) { if (op->query_timer) { g_source_remove(op->query_timer); op->query_timer = 0; } if (op->op_timer_total) { g_source_remove(op->op_timer_total); op->op_timer_total = 0; } if (op->op_timer_one) { g_source_remove(op->op_timer_one); op->op_timer_one = 0; } } static void free_remote_op(gpointer data) { remote_fencing_op_t *op = data; crm_log_xml_debug(op->request, "Destroying"); clear_remote_op_timers(op); free(op->id); free(op->action); free(op->delegate); free(op->target); free(op->client_id); free(op->client_name); free(op->originator); if (op->query_results) { g_list_free_full(op->query_results, free_remote_query); } if (op->request) { pcmk__xml_free(op->request); op->request = NULL; } if (op->devices_list) { g_list_free_full(op->devices_list, free); op->devices_list = NULL; } g_list_free_full(op->automatic_list, free); g_list_free(op->duplicates); pcmk__reset_result(&op->result); free(op); } void init_stonith_remote_op_hash_table(GHashTable **table) { if (*table == NULL) { *table = pcmk__strkey_table(NULL, free_remote_op); } } /*! * \internal * \brief Return an operation's originally requested action (before any remap) * * \param[in] op Operation to check * * \return Operation's original action */ static const char * op_requested_action(const remote_fencing_op_t *op) { return ((op->phase > st_phase_requested)? PCMK_ACTION_REBOOT : op->action); } /*! * \internal * \brief Remap a "reboot" operation to the "off" phase * * \param[in,out] op Operation to remap */ static void op_phase_off(remote_fencing_op_t *op) { crm_info("Remapping multiple-device reboot targeting %s to 'off' " QB_XS " id=%.8s", op->target, op->id); op->phase = st_phase_off; /* Happily, "off" and "on" are shorter than "reboot", so we can reuse the * memory allocation at each phase. */ strcpy(op->action, PCMK_ACTION_OFF); } /*! * \internal * \brief Advance a remapped reboot operation to the "on" phase * * \param[in,out] op Operation to remap */ static void op_phase_on(remote_fencing_op_t *op) { GList *iter = NULL; crm_info("Remapped 'off' targeting %s complete, " "remapping to 'on' for %s " QB_XS " id=%.8s", op->target, op->client_name, op->id); op->phase = st_phase_on; strcpy(op->action, PCMK_ACTION_ON); /* Skip devices with automatic unfencing, because the cluster will handle it * when the node rejoins. */ for (iter = op->automatic_list; iter != NULL; iter = iter->next) { GList *match = g_list_find_custom(op->devices_list, iter->data, sort_strings); if (match) { op->devices_list = g_list_remove(op->devices_list, match->data); } } g_list_free_full(op->automatic_list, free); op->automatic_list = NULL; /* Rewind device list pointer */ op->devices = op->devices_list; } /*! * \internal * \brief Reset a remapped reboot operation * * \param[in,out] op Operation to reset */ static void undo_op_remap(remote_fencing_op_t *op) { if (op->phase > 0) { crm_info("Undoing remap of reboot targeting %s for %s " QB_XS " id=%.8s", op->target, op->client_name, op->id); op->phase = st_phase_requested; strcpy(op->action, PCMK_ACTION_REBOOT); } } /*! * \internal * \brief Create notification data XML for a fencing operation result * * \param[in,out] parent Parent XML element for newly created element * \param[in] op Fencer operation that completed * * \return Newly created XML to add as notification data * \note The caller is responsible for freeing the result. */ static xmlNode * fencing_result2xml(xmlNode *parent, const remote_fencing_op_t *op) { xmlNode *notify_data = pcmk__xe_create(parent, PCMK__XE_ST_NOTIFY_FENCE); crm_xml_add_int(notify_data, PCMK_XA_STATE, op->state); crm_xml_add(notify_data, PCMK__XA_ST_TARGET, op->target); crm_xml_add(notify_data, PCMK__XA_ST_DEVICE_ACTION, op->action); crm_xml_add(notify_data, PCMK__XA_ST_DELEGATE, op->delegate); crm_xml_add(notify_data, PCMK__XA_ST_REMOTE_OP, op->id); crm_xml_add(notify_data, PCMK__XA_ST_ORIGIN, op->originator); crm_xml_add(notify_data, PCMK__XA_ST_CLIENTID, op->client_id); crm_xml_add(notify_data, PCMK__XA_ST_CLIENTNAME, op->client_name); return notify_data; } /*! * \internal * \brief Broadcast a fence result notification to all CPG peers * * \param[in] op Fencer operation that completed * \param[in] op_merged Whether this operation is a duplicate of another */ void fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged) { static int count = 0; xmlNode *bcast = pcmk__xe_create(NULL, PCMK__XE_ST_REPLY); xmlNode *wrapper = NULL; xmlNode *notify_data = NULL; count++; crm_trace("Broadcasting result to peers"); crm_xml_add(bcast, PCMK__XA_T, PCMK__VALUE_ST_NOTIFY); crm_xml_add(bcast, PCMK__XA_SUBT, PCMK__VALUE_BROADCAST); crm_xml_add(bcast, PCMK__XA_ST_OP, STONITH_OP_NOTIFY); crm_xml_add_int(bcast, PCMK_XA_COUNT, count); if (op_merged) { pcmk__xe_set_bool_attr(bcast, PCMK__XA_ST_OP_MERGED, true); } wrapper = pcmk__xe_create(bcast, PCMK__XE_ST_CALLDATA); notify_data = fencing_result2xml(wrapper, op); stonith__xe_set_result(notify_data, &op->result); pcmk__cluster_send_message(NULL, pcmk_ipc_fenced, bcast); pcmk__xml_free(bcast); return; } /*! * \internal * \brief Reply to a local request originator and notify all subscribed clients * * \param[in,out] op Fencer operation that completed * \param[in,out] data Top-level XML to add notification to */ static void handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data) { xmlNode *notify_data = NULL; xmlNode *reply = NULL; pcmk__client_t *client = NULL; if (op->notify_sent == TRUE) { /* nothing to do */ return; } /* Do notification with a clean data object */ crm_xml_add_int(data, PCMK_XA_STATE, op->state); crm_xml_add(data, PCMK__XA_ST_TARGET, op->target); crm_xml_add(data, PCMK__XA_ST_OP, op->action); reply = fenced_construct_reply(op->request, data, &op->result); crm_xml_add(reply, PCMK__XA_ST_DELEGATE, op->delegate); /* Send fencing OP reply to local client that initiated fencing */ client = pcmk__find_client_by_id(op->client_id); if (client == NULL) { crm_trace("Skipping reply to %s: no longer a client", op->client_id); } else { do_local_reply(reply, client, op->call_options); } /* bcast to all local clients that the fencing operation happend */ notify_data = fencing_result2xml(NULL, op); fenced_send_notification(PCMK__VALUE_ST_NOTIFY_FENCE, &op->result, notify_data); pcmk__xml_free(notify_data); fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL); /* mark this op as having notify's already sent */ op->notify_sent = TRUE; pcmk__xml_free(reply); } /*! * \internal * \brief Finalize all duplicates of a given fencer operation * * \param[in,out] op Fencer operation that completed * \param[in,out] data Top-level XML to add notification to */ static void finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data) { for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) { remote_fencing_op_t *other = iter->data; if (other->state == st_duplicate) { other->state = op->state; crm_debug("Performing duplicate notification for %s@%s: %s " QB_XS " id=%.8s", other->client_name, other->originator, pcmk_exec_status_str(op->result.execution_status), other->id); pcmk__copy_result(&op->result, &other->result); finalize_op(other, data, true); } else { // Possible if (for example) it timed out already crm_err("Skipping duplicate notification for %s@%s " QB_XS " state=%s id=%.8s", other->client_name, other->originator, stonith_op_state_str(other->state), other->id); } } } static char * delegate_from_xml(xmlNode *xml) { xmlNode *match = get_xpath_object("//@" PCMK__XA_ST_DELEGATE, xml, LOG_NEVER); if (match == NULL) { return crm_element_value_copy(xml, PCMK__XA_SRC); } else { return crm_element_value_copy(match, PCMK__XA_ST_DELEGATE); } } /*! * \internal * \brief Finalize a peer fencing operation * * Clean up after a fencing operation completes. This function has two code * paths: the executioner uses it to broadcast the result to CPG peers, and then * each peer (including the executioner) uses it to process that broadcast and * notify its IPC clients of the result. * * \param[in,out] op Fencer operation that completed * \param[in,out] data If not NULL, XML reply of last delegated operation * \param[in] dup Whether this operation is a duplicate of another * (in which case, do not broadcast the result) * * \note The operation result should be set before calling this function. */ static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) { int level = LOG_ERR; const char *subt = NULL; xmlNode *local_data = NULL; gboolean op_merged = FALSE; CRM_CHECK((op != NULL), return); // This is a no-op if timers have already been cleared clear_remote_op_timers(op); if (op->notify_sent) { // Most likely, this is a timed-out action that eventually completed crm_notice("Operation '%s'%s%s by %s for %s@%s%s: " "Result arrived too late " QB_XS " id=%.8s", op->action, (op->target? " targeting " : ""), (op->target? op->target : ""), (op->delegate? op->delegate : "unknown node"), op->client_name, op->originator, (op_merged? " (merged)" : ""), op->id); return; } set_fencing_completed(op); undo_op_remap(op); if (data == NULL) { data = pcmk__xe_create(NULL, "remote-op"); local_data = data; } else if (op->delegate == NULL) { switch (op->result.execution_status) { case PCMK_EXEC_NO_FENCE_DEVICE: break; case PCMK_EXEC_INVALID: if (op->result.exit_status != CRM_EX_EXPIRED) { op->delegate = delegate_from_xml(data); } break; default: op->delegate = delegate_from_xml(data); break; } } if (dup || (crm_element_value(data, PCMK__XA_ST_OP_MERGED) != NULL)) { op_merged = true; } /* Tell everyone the operation is done, we will continue * with doing the local notifications once we receive * the broadcast back. */ subt = crm_element_value(data, PCMK__XA_SUBT); if (!dup && !pcmk__str_eq(subt, PCMK__VALUE_BROADCAST, pcmk__str_none)) { /* Defer notification until the bcast message arrives */ fenced_broadcast_op_result(op, op_merged); pcmk__xml_free(local_data); return; } if (pcmk__result_ok(&op->result) || dup || !pcmk__str_eq(op->originator, fenced_get_local_node(), pcmk__str_casei)) { level = LOG_NOTICE; } do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) " QB_XS " id=%.8s", op->action, (op->target? " targeting " : ""), (op->target? op->target : ""), (op->delegate? op->delegate : "unknown node"), op->client_name, op->originator, (op_merged? " (merged)" : ""), crm_exit_str(op->result.exit_status), pcmk_exec_status_str(op->result.execution_status), ((op->result.exit_reason == NULL)? "" : ": "), ((op->result.exit_reason == NULL)? "" : op->result.exit_reason), op->id); handle_local_reply_and_notify(op, data); if (!dup) { finalize_op_duplicates(op, data); } /* Free non-essential parts of the record * Keep the record around so we can query the history */ if (op->query_results) { g_list_free_full(op->query_results, free_remote_query); op->query_results = NULL; } if (op->request) { pcmk__xml_free(op->request); op->request = NULL; } pcmk__xml_free(local_data); } /*! * \internal * \brief Finalize a watchdog fencer op after the waiting time expires * * \param[in,out] userdata Fencer operation that completed * * \return G_SOURCE_REMOVE (which tells glib not to restart timer) */ static gboolean remote_op_watchdog_done(gpointer userdata) { remote_fencing_op_t *op = userdata; op->op_timer_one = 0; crm_notice("Self-fencing (%s) by %s for %s assumed complete " QB_XS " id=%.8s", op->action, op->target, op->client_name, op->id); op->state = st_done; pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); finalize_op(op, NULL, false); return G_SOURCE_REMOVE; } static gboolean remote_op_timeout_one(gpointer userdata) { remote_fencing_op_t *op = userdata; op->op_timer_one = 0; crm_notice("Peer's '%s' action targeting %s for client %s timed out " QB_XS " id=%.8s", op->action, op->target, op->client_name, op->id); pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, "Peer did not return fence result within timeout"); // The requested delay has been applied for the first device if (op->client_delay > 0) { op->client_delay = 0; crm_trace("Try another device for '%s' action targeting %s " "for client %s without delay " QB_XS " id=%.8s", op->action, op->target, op->client_name, op->id); } // Try another device, if appropriate request_peer_fencing(op, NULL); return G_SOURCE_REMOVE; } /*! * \internal * \brief Finalize a remote fencer operation that timed out * * \param[in,out] op Fencer operation that timed out * \param[in] reason Readable description of what step timed out */ static void finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) { crm_debug("Action '%s' targeting %s for client %s timed out " QB_XS " id=%.8s", op->action, op->target, op->client_name, op->id); if (op->phase == st_phase_on) { /* A remapped reboot operation timed out in the "on" phase, but the * "off" phase completed successfully, so quit trying any further * devices, and return success. */ op->state = st_done; pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); } else { op->state = st_failed; pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); } finalize_op(op, NULL, false); } /*! * \internal * \brief Finalize a remote fencer operation that timed out * * \param[in,out] userdata Fencer operation that timed out * * \return G_SOURCE_REMOVE (which tells glib not to restart timer) */ static gboolean remote_op_timeout(gpointer userdata) { remote_fencing_op_t *op = userdata; op->op_timer_total = 0; if (op->state == st_done) { crm_debug("Action '%s' targeting %s for client %s already completed " QB_XS " id=%.8s", op->action, op->target, op->client_name, op->id); } else { finalize_timed_out_op(userdata, "Fencing did not complete within a " "total timeout based on the " "configured timeout and retries for " "any devices attempted"); } return G_SOURCE_REMOVE; } static gboolean remote_op_query_timeout(gpointer data) { remote_fencing_op_t *op = data; op->query_timer = 0; if (op->state == st_done) { crm_debug("Operation %.8s targeting %s already completed", op->id, op->target); } else if (op->state == st_exec) { crm_debug("Operation %.8s targeting %s already in progress", op->id, op->target); } else if (op->query_results) { // Query succeeded, so attempt the actual fencing crm_debug("Query %.8s targeting %s complete (state=%s)", op->id, op->target, stonith_op_state_str(op->state)); request_peer_fencing(op, NULL); } else { crm_debug("Query %.8s targeting %s timed out (state=%s)", op->id, op->target, stonith_op_state_str(op->state)); finalize_timed_out_op(op, "No capable peers replied to device query " "within timeout"); } return G_SOURCE_REMOVE; } static gboolean topology_is_empty(stonith_topology_t *tp) { int i; if (tp == NULL) { return TRUE; } for (i = 0; i < ST__LEVEL_COUNT; i++) { if (tp->levels[i] != NULL) { return FALSE; } } return TRUE; } /*! * \internal * \brief Add a device to an operation's automatic unfencing list * * \param[in,out] op Operation to modify * \param[in] device Device ID to add */ static void add_required_device(remote_fencing_op_t *op, const char *device) { GList *match = g_list_find_custom(op->automatic_list, device, sort_strings); if (!match) { op->automatic_list = g_list_prepend(op->automatic_list, pcmk__str_copy(device)); } } /*! * \internal * \brief Remove a device from the automatic unfencing list * * \param[in,out] op Operation to modify * \param[in] device Device ID to remove */ static void remove_required_device(remote_fencing_op_t *op, const char *device) { GList *match = g_list_find_custom(op->automatic_list, device, sort_strings); if (match) { op->automatic_list = g_list_remove(op->automatic_list, match->data); } } /* deep copy the device list */ static void set_op_device_list(remote_fencing_op_t * op, GList *devices) { GList *lpc = NULL; if (op->devices_list) { g_list_free_full(op->devices_list, free); op->devices_list = NULL; } for (lpc = devices; lpc != NULL; lpc = lpc->next) { const char *device = lpc->data; op->devices_list = g_list_append(op->devices_list, pcmk__str_copy(device)); } op->devices = op->devices_list; } /*! * \internal * \brief Check whether a node matches a topology target * * \param[in] tp Topology table entry to check * \param[in] node Name of node to check * * \return TRUE if node matches topology target */ static gboolean topology_matches(const stonith_topology_t *tp, const char *node) { regex_t r_patt; CRM_CHECK(node && tp && tp->target, return FALSE); switch (tp->kind) { case fenced_target_by_attribute: /* This level targets by attribute, so tp->target is a NAME=VALUE pair * of a permanent attribute applied to targeted nodes. The test below * relies on the locally cached copy of the CIB, so if fencing needs to * be done before the initial CIB is received or after a malformed CIB * is received, then the topology will be unable to be used. */ if (node_has_attr(node, tp->target_attribute, tp->target_value)) { crm_notice("Matched %s with %s by attribute", node, tp->target); return TRUE; } break; case fenced_target_by_pattern: /* This level targets node names matching a pattern, so tp->target * (and tp->target_pattern) is a regular expression. */ if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) { crm_info("Bad regex '%s' for fencing level", tp->target); } else { int status = regexec(&r_patt, node, 0, NULL, 0); regfree(&r_patt); if (status == 0) { crm_notice("Matched %s with %s by name", node, tp->target); return TRUE; } } break; case fenced_target_by_name: crm_trace("Testing %s against %s", node, tp->target); return pcmk__str_eq(tp->target, node, pcmk__str_casei); default: break; } crm_trace("No match for %s with %s", node, tp->target); return FALSE; } stonith_topology_t * find_topology_for_host(const char *host) { GHashTableIter tIter; stonith_topology_t *tp = g_hash_table_lookup(topology, host); if(tp != NULL) { crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology)); return tp; } g_hash_table_iter_init(&tIter, topology); while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) { if (topology_matches(tp, host)) { crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology)); return tp; } } crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology)); return NULL; } /*! * \internal * \brief Set fencing operation's device list to target's next topology level * * \param[in,out] op Remote fencing operation to modify * \param[in] empty_ok If true, an operation without a target (i.e. * queries) or a target without a topology will get a * pcmk_rc_ok return value instead of ENODEV * * \return Standard Pacemaker return value */ static int advance_topology_level(remote_fencing_op_t *op, bool empty_ok) { stonith_topology_t *tp = NULL; if (op->target) { tp = find_topology_for_host(op->target); } if (topology_is_empty(tp)) { return empty_ok? pcmk_rc_ok : ENODEV; } CRM_ASSERT(tp->levels != NULL); stonith__set_call_options(op->call_options, op->id, st_opt_topology); /* This is a new level, so undo any remapping left over from previous */ undo_op_remap(op); do { op->level++; } while (op->level < ST__LEVEL_COUNT && tp->levels[op->level] == NULL); if (op->level < ST__LEVEL_COUNT) { crm_trace("Attempting fencing level %d targeting %s (%d devices) " "for client %s@%s (id=%.8s)", op->level, op->target, g_list_length(tp->levels[op->level]), op->client_name, op->originator, op->id); set_op_device_list(op, tp->levels[op->level]); // The requested delay has been applied for the first fencing level if ((op->level > 1) && (op->client_delay > 0)) { op->client_delay = 0; } if ((g_list_next(op->devices_list) != NULL) && pcmk__str_eq(op->action, PCMK_ACTION_REBOOT, pcmk__str_none)) { /* A reboot has been requested for a topology level with multiple * devices. Instead of rebooting the devices sequentially, we will * turn them all off, then turn them all on again. (Think about * switched power outlets for redundant power supplies.) */ op_phase_off(op); } return pcmk_rc_ok; } crm_info("All %sfencing options targeting %s for client %s@%s failed " QB_XS " id=%.8s", (stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"", op->target, op->client_name, op->originator, op->id); return ENODEV; } /*! * \internal * \brief If fencing operation is a duplicate, merge it into the other one * * \param[in,out] op Fencing operation to check */ static void merge_duplicates(remote_fencing_op_t *op) { GHashTableIter iter; remote_fencing_op_t *other = NULL; time_t now = time(NULL); g_hash_table_iter_init(&iter, stonith_remote_op_list); while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) { const char *other_action = op_requested_action(other); pcmk__node_status_t *node = NULL; if (!strcmp(op->id, other->id)) { continue; // Don't compare against self } if (other->state > st_exec) { crm_trace("%.8s not duplicate of %.8s: not in progress", op->id, other->id); continue; } if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) { crm_trace("%.8s not duplicate of %.8s: node %s vs. %s", op->id, other->id, op->target, other->target); continue; } if (!pcmk__str_eq(op->action, other_action, pcmk__str_none)) { crm_trace("%.8s not duplicate of %.8s: action %s vs. %s", op->id, other->id, op->action, other_action); continue; } if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) { crm_trace("%.8s not duplicate of %.8s: same client %s", op->id, other->id, op->client_name); continue; } if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) { crm_trace("%.8s not duplicate of %.8s: suicide for %s", op->id, other->id, other->target); continue; } node = pcmk__get_node(0, other->originator, NULL, pcmk__node_search_cluster_member); if (!fencing_peer_active(node)) { crm_notice("Failing action '%s' targeting %s originating from " "client %s@%s: Originator is dead " QB_XS " id=%.8s", other->action, other->target, other->client_name, other->originator, other->id); crm_trace("%.8s not duplicate of %.8s: originator dead", op->id, other->id); other->state = st_failed; continue; } if ((other->total_timeout > 0) && (now > (other->total_timeout + other->created))) { crm_trace("%.8s not duplicate of %.8s: old (%lld vs. %lld + %ds)", op->id, other->id, (long long)now, (long long)other->created, other->total_timeout); continue; } /* There is another in-flight request to fence the same host * Piggyback on that instead. If it fails, so do we. */ other->duplicates = g_list_append(other->duplicates, op); if (other->total_timeout == 0) { other->total_timeout = op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL); crm_trace("Best guess as to timeout used for %.8s: %ds", other->id, other->total_timeout); } crm_notice("Merging fencing action '%s' targeting %s originating from " "client %s with identical request from %s@%s " QB_XS " original=%.8s duplicate=%.8s total_timeout=%ds", op->action, op->target, op->client_name, other->client_name, other->originator, op->id, other->id, other->total_timeout); report_timeout_period(op, other->total_timeout); op->state = st_duplicate; } } static uint32_t fencing_active_peers(void) { uint32_t count = 0; pcmk__node_status_t *entry = NULL; GHashTableIter gIter; g_hash_table_iter_init(&gIter, pcmk__peer_cache); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { if(fencing_peer_active(entry)) { count++; } } return count; } /*! * \internal * \brief Process a manual confirmation of a pending fence action * * \param[in] client IPC client that sent confirmation * \param[in,out] msg Request XML with manual confirmation * * \return Standard Pacemaker return code */ int fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg) { remote_fencing_op_t *op = NULL; xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, msg, LOG_ERR); CRM_CHECK(dev != NULL, return EPROTO); crm_notice("Received manual confirmation that %s has been fenced", pcmk__s(crm_element_value(dev, PCMK__XA_ST_TARGET), "unknown target")); op = initiate_remote_stonith_op(client, msg, TRUE); if (op == NULL) { return EPROTO; } op->state = st_done; set_fencing_completed(op); op->delegate = pcmk__str_copy("a human"); // For the fencer's purposes, the fencing operation is done pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); finalize_op(op, msg, false); /* For the requester's purposes, the operation is still pending. The * actual result will be sent asynchronously via the operation's done_cb(). */ return EINPROGRESS; } /*! * \internal * \brief Create a new remote stonith operation * * \param[in] client ID of local stonith client that initiated the operation * \param[in] request The request from the client that started the operation * \param[in] peer TRUE if this operation is owned by another stonith peer * (an operation owned by one peer is stored on all peers, * but only the owner executes it; all nodes get the results * once the owner finishes execution) */ void * create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer) { remote_fencing_op_t *op = NULL; xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, request, LOG_NEVER); int call_options = 0; const char *operation = NULL; init_stonith_remote_op_hash_table(&stonith_remote_op_list); /* If this operation is owned by another node, check to make * sure we haven't already created this operation. */ if (peer && dev) { const char *op_id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP); CRM_CHECK(op_id != NULL, return NULL); op = g_hash_table_lookup(stonith_remote_op_list, op_id); if (op) { crm_debug("Reusing existing remote fencing op %.8s for %s", op_id, ((client == NULL)? "unknown client" : client)); return op; } } op = pcmk__assert_alloc(1, sizeof(remote_fencing_op_t)); crm_element_value_int(request, PCMK__XA_ST_TIMEOUT, &(op->base_timeout)); // Value -1 means disable any static/random fencing delays crm_element_value_int(request, PCMK__XA_ST_DELAY, &(op->client_delay)); if (peer && dev) { op->id = crm_element_value_copy(dev, PCMK__XA_ST_REMOTE_OP); } else { op->id = crm_generate_uuid(); } g_hash_table_replace(stonith_remote_op_list, op->id, op); op->state = st_query; op->replies_expected = fencing_active_peers(); op->action = crm_element_value_copy(dev, PCMK__XA_ST_DEVICE_ACTION); /* The node initiating the stonith operation. If an operation is relayed, * this is the last node the operation lands on. When in standalone mode, * origin is the ID of the client that originated the operation. * * Or may be the name of the function that created the operation. */ op->originator = crm_element_value_copy(dev, PCMK__XA_ST_ORIGIN); if (op->originator == NULL) { /* Local or relayed request */ op->originator = pcmk__str_copy(fenced_get_local_node()); } // Delegate may not be set op->delegate = crm_element_value_copy(dev, PCMK__XA_ST_DELEGATE); op->created = time(NULL); CRM_LOG_ASSERT(client != NULL); op->client_id = pcmk__str_copy(client); /* For a RELAY operation, set fenced on the client. */ operation = crm_element_value(request, PCMK__XA_ST_OP); if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) { op->client_name = crm_strdup_printf("%s.%lu", crm_system_name, (unsigned long) getpid()); } else { op->client_name = crm_element_value_copy(request, PCMK__XA_ST_CLIENTNAME); } op->target = crm_element_value_copy(dev, PCMK__XA_ST_TARGET); // @TODO Figure out how to avoid copying XML here op->request = pcmk__xml_copy(NULL, request); crm_element_value_int(request, PCMK__XA_ST_CALLOPT, &call_options); op->call_options = call_options; crm_element_value_int(request, PCMK__XA_ST_CALLID, &(op->client_callid)); crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, " "base timeout %ds, %u %s expected)", (peer && dev)? "Recorded" : "Generated", op->id, op->action, op->target, op->client_name, op->base_timeout, op->replies_expected, pcmk__plural_alt(op->replies_expected, "reply", "replies")); if (op->call_options & st_opt_cs_nodeid) { int nodeid; pcmk__node_status_t *node = NULL; pcmk__scan_min_int(op->target, &nodeid, 0); node = pcmk__search_node_caches(nodeid, NULL, pcmk__node_search_any |pcmk__node_search_cluster_cib); /* Ensure the conversion only happens once */ stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid); if ((node != NULL) && (node->name != NULL)) { pcmk__str_update(&(op->target), node->name); } else { crm_warn("Could not expand nodeid '%s' into a host name", op->target); } } /* check to see if this is a duplicate operation of another in-flight operation */ merge_duplicates(op); if (op->state != st_duplicate) { /* kick history readers */ fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL); } /* safe to trim as long as that doesn't touch pending ops */ stonith_fence_history_trim(); return op; } /*! * \internal * \brief Create a peer fencing operation from a request, and initiate it * * \param[in] client IPC client that made request (NULL to get from request) * \param[in] request Request XML * \param[in] manual_ack Whether this is a manual action confirmation * * \return Newly created operation on success, otherwise NULL */ remote_fencing_op_t * initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request, gboolean manual_ack) { int query_timeout = 0; xmlNode *query = NULL; const char *client_id = NULL; remote_fencing_op_t *op = NULL; const char *relay_op_id = NULL; const char *operation = NULL; if (client) { client_id = client->id; } else { client_id = crm_element_value(request, PCMK__XA_ST_CLIENTID); } CRM_LOG_ASSERT(client_id != NULL); op = create_remote_stonith_op(client_id, request, FALSE); op->owner = TRUE; if (manual_ack) { return op; } CRM_CHECK(op->action, return NULL); if (advance_topology_level(op, true) != pcmk_rc_ok) { op->state = st_failed; } switch (op->state) { case st_failed: // advance_topology_level() exhausted levels pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR, "All topology levels failed"); crm_warn("Could not request peer fencing (%s) targeting %s " QB_XS " id=%.8s", op->action, op->target, op->id); finalize_op(op, NULL, false); return op; case st_duplicate: crm_info("Requesting peer fencing (%s) targeting %s (duplicate) " QB_XS " id=%.8s", op->action, op->target, op->id); return op; default: crm_notice("Requesting peer fencing (%s) targeting %s " QB_XS " id=%.8s state=%s base_timeout=%ds", op->action, op->target, op->id, stonith_op_state_str(op->state), op->base_timeout); } query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY, NULL, op->call_options); crm_xml_add(query, PCMK__XA_ST_REMOTE_OP, op->id); crm_xml_add(query, PCMK__XA_ST_TARGET, op->target); crm_xml_add(query, PCMK__XA_ST_DEVICE_ACTION, op_requested_action(op)); crm_xml_add(query, PCMK__XA_ST_ORIGIN, op->originator); crm_xml_add(query, PCMK__XA_ST_CLIENTID, op->client_id); crm_xml_add(query, PCMK__XA_ST_CLIENTNAME, op->client_name); crm_xml_add_int(query, PCMK__XA_ST_TIMEOUT, op->base_timeout); /* In case of RELAY operation, RELAY information is added to the query to delete the original operation of RELAY. */ operation = crm_element_value(request, PCMK__XA_ST_OP); if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) { relay_op_id = crm_element_value(request, PCMK__XA_ST_REMOTE_OP); if (relay_op_id) { crm_xml_add(query, PCMK__XA_ST_REMOTE_OP_RELAY, relay_op_id); } } pcmk__cluster_send_message(NULL, pcmk_ipc_fenced, query); pcmk__xml_free(query); query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR; op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op); return op; } enum find_best_peer_options { /*! Skip checking the target peer for capable fencing devices */ FIND_PEER_SKIP_TARGET = 0x0001, /*! Only check the target peer for capable fencing devices */ FIND_PEER_TARGET_ONLY = 0x0002, /*! Skip peers and devices that are not verified */ FIND_PEER_VERIFIED_ONLY = 0x0004, }; static bool is_watchdog_fencing(const remote_fencing_op_t *op, const char *device) { return (stonith_watchdog_timeout_ms > 0 // Only an explicit mismatch is considered not a watchdog fencing. && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_null_matches) && pcmk__is_fencing_action(op->action) && node_does_watchdog_fencing(op->target)); } static peer_device_info_t * find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options) { GList *iter = NULL; gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE; if (!device && pcmk_is_set(op->call_options, st_opt_topology)) { return NULL; } for (iter = op->query_results; iter != NULL; iter = iter->next) { peer_device_info_t *peer = iter->data; crm_trace("Testing result from %s targeting %s with %d device%s: %d %x", peer->host, op->target, peer->ndevices, pcmk__plural_s(peer->ndevices), peer->tried, options); if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) { continue; } if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) { continue; } if (pcmk_is_set(op->call_options, st_opt_topology)) { if (grab_peer_device(op, peer, device, verified_devices_only)) { return peer; } } else if (!peer->tried && count_peer_devices(op, peer, verified_devices_only, fenced_support_flag(op->action))) { /* No topology: Use the current best peer */ crm_trace("Simple fencing"); return peer; } } return NULL; } static peer_device_info_t * stonith_choose_peer(remote_fencing_op_t * op) { const char *device = NULL; peer_device_info_t *peer = NULL; uint32_t active = fencing_active_peers(); do { if (op->devices) { device = op->devices->data; crm_trace("Checking for someone to fence (%s) %s using %s", op->action, op->target, device); } else { crm_trace("Checking for someone to fence (%s) %s", op->action, op->target); } /* Best choice is a peer other than the target with verified access */ peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY); if (peer) { crm_trace("Found verified peer %s for %s", peer->host, device?device:""); return peer; } if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) { crm_trace("Waiting before looking for unverified devices to fence %s", op->target); return NULL; } /* If no other peer has verified access, next best is unverified access */ peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET); if (peer) { crm_trace("Found best unverified peer %s", peer->host); return peer; } /* If no other peer can do it, last option is self-fencing * (which is never allowed for the "on" phase of a remapped reboot) */ if (op->phase != st_phase_on) { peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY); if (peer) { crm_trace("%s will fence itself", peer->host); return peer; } } /* Try the next fencing level if there is one (unless we're in the "on" * phase of a remapped "reboot", because we ignore errors in that case) */ } while ((op->phase != st_phase_on) && pcmk_is_set(op->call_options, st_opt_topology) && (advance_topology_level(op, false) == pcmk_rc_ok)); /* With a simple watchdog fencing configuration without a topology, * "device" is NULL here. Consider it should be done with watchdog fencing. */ if (is_watchdog_fencing(op, device)) { crm_info("Couldn't contact watchdog-fencing target-node (%s)", op->target); /* check_watchdog_fencing_and_wait will log additional info */ } else { crm_notice("Couldn't find anyone to fence (%s) %s using %s", op->action, op->target, (device? device : "any device")); } return NULL; } static int valid_fencing_timeout(int specified_timeout, bool action_specific, const remote_fencing_op_t *op, const char *device) { int timeout = specified_timeout; if (!is_watchdog_fencing(op, device)) { return timeout; } timeout = (int) QB_MIN(QB_MAX(specified_timeout, stonith_watchdog_timeout_ms / 1000), INT_MAX); if (timeout > specified_timeout) { if (action_specific) { crm_warn("pcmk_%s_timeout %ds for %s is too short (must be >= " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds " "instead", op->action, specified_timeout, device? device : "watchdog", timeout, timeout); } else { crm_warn("Fencing timeout %ds is too short (must be >= " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds " "instead", specified_timeout, timeout, timeout); } } return timeout; } static int get_device_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer, const char *device, bool with_delay) { int timeout = op->base_timeout; device_properties_t *props; timeout = valid_fencing_timeout(op->base_timeout, false, op, device); if (!peer || !device) { return timeout; } props = g_hash_table_lookup(peer->devices, device); if (!props) { return timeout; } if (props->custom_action_timeout[op->phase]) { timeout = valid_fencing_timeout(props->custom_action_timeout[op->phase], true, op, device); } // op->client_delay < 0 means disable any static/random fencing delays if (with_delay && (op->client_delay >= 0)) { // delay_base is eventually limited by delay_max timeout += (props->delay_max[op->phase] > 0 ? props->delay_max[op->phase] : props->delay_base[op->phase]); } return timeout; } struct timeout_data { const remote_fencing_op_t *op; const peer_device_info_t *peer; int total_timeout; }; /*! * \internal * \brief Add timeout to a total if device has not been executed yet * * \param[in] key GHashTable key (device ID) * \param[in] value GHashTable value (device properties) * \param[in,out] user_data Timeout data */ static void add_device_timeout(gpointer key, gpointer value, gpointer user_data) { const char *device_id = key; device_properties_t *props = value; struct timeout_data *timeout = user_data; if (!props->executed[timeout->op->phase] && !props->disallowed[timeout->op->phase]) { timeout->total_timeout += get_device_timeout(timeout->op, timeout->peer, device_id, true); } } static int get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer) { struct timeout_data timeout; timeout.op = op; timeout.peer = peer; timeout.total_timeout = 0; g_hash_table_foreach(peer->devices, add_device_timeout, &timeout); return (timeout.total_timeout? timeout.total_timeout : op->base_timeout); } static int get_op_total_timeout(const remote_fencing_op_t *op, const peer_device_info_t *chosen_peer) { long long total_timeout = 0; stonith_topology_t *tp = find_topology_for_host(op->target); if (pcmk_is_set(op->call_options, st_opt_topology) && tp) { int i; GList *device_list = NULL; GList *iter = NULL; GList *auto_list = NULL; if (pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none) && (op->automatic_list != NULL)) { auto_list = g_list_copy(op->automatic_list); } /* Yep, this looks scary, nested loops all over the place. * Here is what is going on. * Loop1: Iterate through fencing levels. * Loop2: If a fencing level has devices, loop through each device * Loop3: For each device in a fencing level, see what peer owns it * and what that peer has reported the timeout is for the device. */ for (i = 0; i < ST__LEVEL_COUNT; i++) { if (!tp->levels[i]) { continue; } for (device_list = tp->levels[i]; device_list; device_list = device_list->next) { bool found = false; for (iter = op->query_results; iter != NULL; iter = iter->next) { const peer_device_info_t *peer = iter->data; if (auto_list) { GList *match = g_list_find_custom(auto_list, device_list->data, sort_strings); if (match) { auto_list = g_list_remove(auto_list, match->data); } } if (find_peer_device(op, peer, device_list->data, fenced_support_flag(op->action))) { total_timeout += get_device_timeout(op, peer, device_list->data, true); found = true; break; } } /* End Loop3: match device with peer that owns device, find device's timeout period */ /* in case of watchdog-device we add the timeout to the budget if didn't get a reply */ if (!found && is_watchdog_fencing(op, device_list->data)) { total_timeout += stonith_watchdog_timeout_ms / 1000; } } /* End Loop2: iterate through devices at a specific level */ } /*End Loop1: iterate through fencing levels */ //Add only exists automatic_list device timeout if (auto_list) { for (iter = auto_list; iter != NULL; iter = iter->next) { GList *iter2 = NULL; for (iter2 = op->query_results; iter2 != NULL; iter = iter2->next) { peer_device_info_t *peer = iter2->data; if (find_peer_device(op, peer, iter->data, st_device_supports_on)) { total_timeout += get_device_timeout(op, peer, iter->data, true); break; } } } } g_list_free(auto_list); } else if (chosen_peer) { total_timeout = get_peer_timeout(op, chosen_peer); } else { total_timeout = valid_fencing_timeout(op->base_timeout, false, op, NULL); } if (total_timeout <= 0) { total_timeout = op->base_timeout; } /* Take any requested fencing delay into account to prevent it from eating * up the total timeout. */ if (op->client_delay > 0) { total_timeout += op->client_delay; } return (int) QB_MIN(total_timeout, INT_MAX); } static void report_timeout_period(remote_fencing_op_t * op, int op_timeout) { GList *iter = NULL; xmlNode *update = NULL; const char *client_node = NULL; const char *client_id = NULL; const char *call_id = NULL; if (op->call_options & st_opt_sync_call) { /* There is no reason to report the timeout for a synchronous call. It * is impossible to use the reported timeout to do anything when the client * is blocking for the response. This update is only important for * async calls that require a callback to report the results in. */ return; } else if (!op->request) { return; } crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id); client_node = crm_element_value(op->request, PCMK__XA_ST_CLIENTNODE); call_id = crm_element_value(op->request, PCMK__XA_ST_CALLID); client_id = crm_element_value(op->request, PCMK__XA_ST_CLIENTID); if (!client_node || !call_id || !client_id) { return; } if (pcmk__str_eq(client_node, fenced_get_local_node(), pcmk__str_casei)) { // Client is connected to this node, so send update directly to them do_stonith_async_timeout_update(client_id, call_id, op_timeout); return; } /* The client is connected to another node, relay this update to them */ update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0); crm_xml_add(update, PCMK__XA_ST_REMOTE_OP, op->id); crm_xml_add(update, PCMK__XA_ST_CLIENTID, client_id); crm_xml_add(update, PCMK__XA_ST_CALLID, call_id); crm_xml_add_int(update, PCMK__XA_ST_TIMEOUT, op_timeout); pcmk__cluster_send_message(pcmk__get_node(0, client_node, NULL, pcmk__node_search_cluster_member), pcmk_ipc_fenced, update); pcmk__xml_free(update); for (iter = op->duplicates; iter != NULL; iter = iter->next) { remote_fencing_op_t *dup = iter->data; crm_trace("Reporting timeout for duplicate %.8s to client %s", dup->id, dup->client_name); report_timeout_period(iter->data, op_timeout); } } /*! * \internal * \brief Advance an operation to the next device in its topology * * \param[in,out] op Fencer operation to advance * \param[in] device ID of device that just completed * \param[in,out] msg If not NULL, XML reply of last delegated operation */ static void advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, xmlNode *msg) { /* Advance to the next device at this topology level, if any */ if (op->devices) { op->devices = op->devices->next; } /* Handle automatic unfencing if an "on" action was requested */ if ((op->phase == st_phase_requested) && pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)) { /* If the device we just executed was required, it's not anymore */ remove_required_device(op, device); /* If there are no more devices at this topology level, run through any * remaining devices with automatic unfencing */ if (op->devices == NULL) { op->devices = op->automatic_list; } } if ((op->devices == NULL) && (op->phase == st_phase_off)) { /* We're done with this level and with required devices, but we had * remapped "reboot" to "off", so start over with "on". If any devices * need to be turned back on, op->devices will be non-NULL after this. */ op_phase_on(op); } // This function is only called if the previous device succeeded pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); if (op->devices) { /* Necessary devices remain, so execute the next one */ crm_trace("Next targeting %s on behalf of %s@%s", op->target, op->client_name, op->originator); // The requested delay has been applied for the first device if (op->client_delay > 0) { op->client_delay = 0; } request_peer_fencing(op, NULL); } else { /* We're done with all devices and phases, so finalize operation */ crm_trace("Marking complex fencing op targeting %s as complete", op->target); op->state = st_done; finalize_op(op, msg, false); } } static gboolean check_watchdog_fencing_and_wait(remote_fencing_op_t * op) { if (node_does_watchdog_fencing(op->target)) { guint timeout_ms = QB_MIN(stonith_watchdog_timeout_ms, UINT_MAX); crm_notice("Waiting %s for %s to self-fence (%s) for " "client %s " QB_XS " id=%.8s", pcmk__readable_interval(timeout_ms), op->target, op->action, op->client_name, op->id); if (op->op_timer_one) { g_source_remove(op->op_timer_one); } op->op_timer_one = g_timeout_add(timeout_ms, remote_op_watchdog_done, op); return TRUE; } else { crm_debug("Skipping fallback to watchdog-fencing as %s is " "not in host-list", op->target); } return FALSE; } /*! * \internal * \brief Ask a peer to execute a fencing operation * * \param[in,out] op Fencing operation to be executed * \param[in,out] peer If NULL or topology is in use, choose best peer to * execute the fencing, otherwise use this peer */ static void request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer) { const char *device = NULL; int timeout; CRM_CHECK(op != NULL, return); crm_trace("Action %.8s targeting %s for %s is %s", op->id, op->target, op->client_name, stonith_op_state_str(op->state)); if ((op->phase == st_phase_on) && (op->devices != NULL)) { /* We are in the "on" phase of a remapped topology reboot. If this * device has pcmk_reboot_action="off", or doesn't support the "on" * action, skip it. * * We can't check device properties at this point because we haven't * chosen a peer for this stage yet. Instead, we check the local node's * knowledge about the device. If different versions of the fence agent * are installed on different nodes, there's a chance this could be * mistaken, but the worst that could happen is we don't try turning the * node back on when we should. */ device = op->devices->data; if (pcmk__str_eq(fenced_device_reboot_action(device), PCMK_ACTION_OFF, pcmk__str_none)) { crm_info("Not turning %s back on using %s because the device is " "configured to stay off (pcmk_reboot_action='off')", op->target, device); advance_topology_device_in_level(op, device, NULL); return; } if (!fenced_device_supports_on(device)) { crm_info("Not turning %s back on using %s because the agent " "doesn't support 'on'", op->target, device); advance_topology_device_in_level(op, device, NULL); return; } } timeout = op->base_timeout; if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) { peer = stonith_choose_peer(op); } if (!op->op_timer_total) { op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer); op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op); report_timeout_period(op, op->total_timeout); - crm_info("Total timeout set to %ds for peer's fencing targeting %s for %s" - QB_XS "id=%.8s", + crm_info("Total timeout set to %ds for peer's fencing targeting %s for %s " + QB_XS " id=%.8s", op->total_timeout, op->target, op->client_name, op->id); } if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) { /* Ignore the caller's peer preference if topology is in use, because * that peer might not have access to the required device. With * topology, stonith_choose_peer() removes the device from further * consideration, so the timeout must be calculated beforehand. * * @TODO Basing the total timeout on the caller's preferred peer (above) * is less than ideal. */ peer = stonith_choose_peer(op); device = op->devices->data; /* Fencing timeout sent to peer takes no delay into account. * The peer will add a dedicated timer for any delay upon * schedule_stonith_command(). */ timeout = get_device_timeout(op, peer, device, false); } if (peer) { int timeout_one = 0; xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0); const pcmk__node_status_t *peer_node = pcmk__get_node(0, peer->host, NULL, pcmk__node_search_cluster_member); if (op->client_delay > 0) { /* Take requested fencing delay into account to prevent it from * eating up the timeout. */ timeout_one = TIMEOUT_MULTIPLY_FACTOR * op->client_delay; } crm_xml_add(remote_op, PCMK__XA_ST_REMOTE_OP, op->id); crm_xml_add(remote_op, PCMK__XA_ST_TARGET, op->target); crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ACTION, op->action); crm_xml_add(remote_op, PCMK__XA_ST_ORIGIN, op->originator); crm_xml_add(remote_op, PCMK__XA_ST_CLIENTID, op->client_id); crm_xml_add(remote_op, PCMK__XA_ST_CLIENTNAME, op->client_name); crm_xml_add_int(remote_op, PCMK__XA_ST_TIMEOUT, timeout); crm_xml_add_int(remote_op, PCMK__XA_ST_CALLOPT, op->call_options); crm_xml_add_int(remote_op, PCMK__XA_ST_DELAY, op->client_delay); if (device) { timeout_one += TIMEOUT_MULTIPLY_FACTOR * get_device_timeout(op, peer, device, true); crm_notice("Requesting that %s perform '%s' action targeting %s " "using %s " QB_XS " for client %s (%ds)", peer->host, op->action, op->target, device, op->client_name, timeout_one); crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ID, device); } else { timeout_one += TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer); crm_notice("Requesting that %s perform '%s' action targeting %s " QB_XS " for client %s (%ds, %s)", peer->host, op->action, op->target, op->client_name, timeout_one, pcmk__readable_interval(stonith_watchdog_timeout_ms)); } op->state = st_exec; if (op->op_timer_one) { g_source_remove(op->op_timer_one); op->op_timer_one = 0; } if (!is_watchdog_fencing(op, device) || !check_watchdog_fencing_and_wait(op)) { /* Some thoughts about self-fencing cases reaching this point: - Actually check in check_watchdog_fencing_and_wait shouldn't fail if STONITH_WATCHDOG_ID is chosen as fencing-device and it being present implies watchdog-fencing is enabled anyway - If watchdog-fencing is disabled either in general or for a specific target - detected in check_watchdog_fencing_and_wait - for some other kind of self-fencing we can't expect a success answer but timeout is fine if the node doesn't come back in between - Delicate might be the case where we have watchdog-fencing enabled for a node but the watchdog-fencing-device isn't explicitly chosen for suicide. Local pe-execution in sbd may detect the node as unclean and lead to timely suicide. Otherwise the selection of PCMK_OPT_STONITH_WATCHDOG_TIMEOUT at least is questionable. */ /* coming here we're not waiting for watchdog timeout - thus engage timer with timout evaluated before */ op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op); } pcmk__cluster_send_message(peer_node, pcmk_ipc_fenced, remote_op); peer->tried = TRUE; pcmk__xml_free(remote_op); return; } else if (op->phase == st_phase_on) { /* A remapped "on" cannot be executed, but the node was already * turned off successfully, so ignore the error and continue. */ crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s " "after successful 'off'", device, op->target); advance_topology_device_in_level(op, device, NULL); return; } else if (op->owner == FALSE) { crm_err("Fencing (%s) targeting %s for client %s is not ours to control", op->action, op->target, op->client_name); } else if (op->query_timer == 0) { /* We've exhausted all available peers */ crm_info("No remaining peers capable of fencing (%s) %s for client %s " QB_XS " state=%s", op->action, op->target, op->client_name, stonith_op_state_str(op->state)); CRM_CHECK(op->state < st_done, return); finalize_timed_out_op(op, "All nodes failed, or are unable, to " "fence target"); } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { /* if the operation never left the query state, * but we have all the expected replies, then no devices * are available to execute the fencing operation. */ if (is_watchdog_fencing(op, device) && check_watchdog_fencing_and_wait(op)) { /* Consider a watchdog fencing targeting an offline node executing * once it starts waiting for the target to self-fence. So that when * the query timer pops, remote_op_query_timeout() considers the * fencing already in progress. */ op->state = st_exec; return; } if (op->state == st_query) { crm_info("No peers (out of %d) have devices capable of fencing " "(%s) %s for client %s " QB_XS " state=%s", op->replies, op->action, op->target, op->client_name, stonith_op_state_str(op->state)); pcmk__reset_result(&op->result); pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, NULL); } else { if (pcmk_is_set(op->call_options, st_opt_topology)) { pcmk__reset_result(&op->result); pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, NULL); } /* ... else use existing result from previous failed attempt * (topology is not in use, and no devices remain to be attempted). * Overwriting the result with PCMK_EXEC_NO_FENCE_DEVICE would * prevent finalize_op() from setting the correct delegate if * needed. */ crm_info("No peers (out of %d) are capable of fencing (%s) %s " "for client %s " QB_XS " state=%s", op->replies, op->action, op->target, op->client_name, stonith_op_state_str(op->state)); } op->state = st_failed; finalize_op(op, NULL, false); } else { crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " "for client %s " QB_XS " id=%.8s", op->action, op->target, (device? " using " : ""), (device? device : ""), op->client_name, op->id); } } /*! * \internal * \brief Comparison function for sorting query results * * \param[in] a GList item to compare * \param[in] b GList item to compare * * \return Per the glib documentation, "a negative integer if the first value * comes before the second, 0 if they are equal, or a positive integer * if the first value comes after the second." */ static gint sort_peers(gconstpointer a, gconstpointer b) { const peer_device_info_t *peer_a = a; const peer_device_info_t *peer_b = b; return (peer_b->ndevices - peer_a->ndevices); } /*! * \internal * \brief Determine if all the devices in the topology are found or not * * \param[in] op Fencing operation with topology to check */ static gboolean all_topology_devices_found(const remote_fencing_op_t *op) { GList *device = NULL; GList *iter = NULL; device_properties_t *match = NULL; stonith_topology_t *tp = NULL; gboolean skip_target = FALSE; int i; tp = find_topology_for_host(op->target); if (!tp) { return FALSE; } if (pcmk__is_fencing_action(op->action)) { /* Don't count the devices on the target node if we are killing * the target node. */ skip_target = TRUE; } for (i = 0; i < ST__LEVEL_COUNT; i++) { for (device = tp->levels[i]; device; device = device->next) { match = NULL; for (iter = op->query_results; iter && !match; iter = iter->next) { peer_device_info_t *peer = iter->data; if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) { continue; } match = find_peer_device(op, peer, device->data, st_device_supports_none); } if (!match) { return FALSE; } } } return TRUE; } /*! * \internal * \brief Parse action-specific device properties from XML * * \param[in] xml XML element containing the properties * \param[in] peer Name of peer that sent XML (for logs) * \param[in] device Device ID (for logs) * \param[in] action Action the properties relate to (for logs) * \param[in,out] op Fencing operation that properties are being parsed for * \param[in] phase Phase the properties relate to * \param[in,out] props Device properties to update */ static void parse_action_specific(const xmlNode *xml, const char *peer, const char *device, const char *action, remote_fencing_op_t *op, enum st_remap_phase phase, device_properties_t *props) { props->custom_action_timeout[phase] = 0; crm_element_value_int(xml, PCMK__XA_ST_ACTION_TIMEOUT, &props->custom_action_timeout[phase]); if (props->custom_action_timeout[phase]) { crm_trace("Peer %s with device %s returned %s action timeout %ds", peer, device, action, props->custom_action_timeout[phase]); } props->delay_max[phase] = 0; crm_element_value_int(xml, PCMK__XA_ST_DELAY_MAX, &props->delay_max[phase]); if (props->delay_max[phase]) { crm_trace("Peer %s with device %s returned maximum of random delay %ds for %s", peer, device, props->delay_max[phase], action); } props->delay_base[phase] = 0; crm_element_value_int(xml, PCMK__XA_ST_DELAY_BASE, &props->delay_base[phase]); if (props->delay_base[phase]) { crm_trace("Peer %s with device %s returned base delay %ds for %s", peer, device, props->delay_base[phase], action); } /* Handle devices with automatic unfencing */ if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) { int required = 0; crm_element_value_int(xml, PCMK__XA_ST_REQUIRED, &required); if (required) { crm_trace("Peer %s requires device %s to execute for action %s", peer, device, action); add_required_device(op, device); } } /* If a reboot is remapped to off+on, it's possible that a node is allowed * to perform one action but not another. */ if (pcmk__xe_attr_is_true(xml, PCMK__XA_ST_ACTION_DISALLOWED)) { props->disallowed[phase] = TRUE; crm_trace("Peer %s is disallowed from executing %s for device %s", peer, action, device); } } /*! * \internal * \brief Parse one device's properties from peer's XML query reply * * \param[in] xml XML node containing device properties * \param[in,out] op Operation that query and reply relate to * \param[in,out] peer Peer's device information * \param[in] device ID of device being parsed */ static void add_device_properties(const xmlNode *xml, remote_fencing_op_t *op, peer_device_info_t *peer, const char *device) { xmlNode *child; int verified = 0; device_properties_t *props = pcmk__assert_alloc(1, sizeof(device_properties_t)); int flags = st_device_supports_on; /* Old nodes that don't set the flag assume they support the on action */ /* Add a new entry to this peer's devices list */ g_hash_table_insert(peer->devices, pcmk__str_copy(device), props); /* Peers with verified (monitored) access will be preferred */ crm_element_value_int(xml, PCMK__XA_ST_MONITOR_VERIFIED, &verified); if (verified) { crm_trace("Peer %s has confirmed a verified device %s", peer->host, device); props->verified = TRUE; } crm_element_value_int(xml, PCMK__XA_ST_DEVICE_SUPPORT_FLAGS, &flags); props->device_support_flags = flags; /* Parse action-specific device properties */ parse_action_specific(xml, peer->host, device, op_requested_action(op), op, st_phase_requested, props); for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL; child = pcmk__xe_next(child)) { /* Replies for "reboot" operations will include the action-specific * values for "off" and "on" in child elements, just in case the reboot * winds up getting remapped. */ if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_OFF, pcmk__str_none)) { parse_action_specific(child, peer->host, device, PCMK_ACTION_OFF, op, st_phase_off, props); } else if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_ON, pcmk__str_none)) { parse_action_specific(child, peer->host, device, PCMK_ACTION_ON, op, st_phase_on, props); } } } /*! * \internal * \brief Parse a peer's XML query reply and add it to operation's results * * \param[in,out] op Operation that query and reply relate to * \param[in] host Name of peer that sent this reply * \param[in] ndevices Number of devices expected in reply * \param[in] xml XML node containing device list * * \return Newly allocated result structure with parsed reply */ static peer_device_info_t * add_result(remote_fencing_op_t *op, const char *host, int ndevices, const xmlNode *xml) { peer_device_info_t *peer = pcmk__assert_alloc(1, sizeof(peer_device_info_t)); xmlNode *child; peer->host = pcmk__str_copy(host); peer->devices = pcmk__strkey_table(free, free); /* Each child element describes one capable device available to the peer */ for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL; child = pcmk__xe_next(child)) { const char *device = pcmk__xe_id(child); if (device) { add_device_properties(child, op, peer, device); } } peer->ndevices = g_hash_table_size(peer->devices); CRM_CHECK(ndevices == peer->ndevices, crm_err("Query claimed to have %d device%s but %d found", ndevices, pcmk__plural_s(ndevices), peer->ndevices)); op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers); return peer; } /*! * \internal * \brief Handle a peer's reply to our fencing query * * Parse a query result from XML and store it in the remote operation * table, and when enough replies have been received, issue a fencing request. * * \param[in] msg XML reply received * * \return pcmk_ok on success, -errno on error * * \note See initiate_remote_stonith_op() for how the XML query was initially * formed, and stonith_query() for how the peer formed its XML reply. */ int process_remote_stonith_query(xmlNode *msg) { int ndevices = 0; gboolean host_is_target = FALSE; gboolean have_all_replies = FALSE; const char *id = NULL; const char *host = NULL; remote_fencing_op_t *op = NULL; peer_device_info_t *peer = NULL; uint32_t replies_expected; xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_REMOTE_OP, msg, LOG_ERR); CRM_CHECK(dev != NULL, return -EPROTO); id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP); CRM_CHECK(id != NULL, return -EPROTO); dev = get_xpath_object("//@" PCMK__XA_ST_AVAILABLE_DEVICES, msg, LOG_ERR); CRM_CHECK(dev != NULL, return -EPROTO); crm_element_value_int(dev, PCMK__XA_ST_AVAILABLE_DEVICES, &ndevices); op = g_hash_table_lookup(stonith_remote_op_list, id); if (op == NULL) { crm_debug("Received query reply for unknown or expired operation %s", id); return -EOPNOTSUPP; } replies_expected = fencing_active_peers(); if (op->replies_expected < replies_expected) { replies_expected = op->replies_expected; } if ((++op->replies >= replies_expected) && (op->state == st_query)) { have_all_replies = TRUE; } host = crm_element_value(msg, PCMK__XA_SRC); host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei); crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s", op->replies, replies_expected, host, op->target, op->action, ndevices, pcmk__plural_s(ndevices), id); if (ndevices > 0) { peer = add_result(op, host, ndevices, dev); } pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); if (pcmk_is_set(op->call_options, st_opt_topology)) { /* If we start the fencing before all the topology results are in, * it is possible fencing levels will be skipped because of the missing * query results. */ if (op->state == st_query && all_topology_devices_found(op)) { /* All the query results are in for the topology, start the fencing ops. */ crm_trace("All topology devices found"); request_peer_fencing(op, peer); } else if (have_all_replies) { crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", replies_expected, op->replies); request_peer_fencing(op, NULL); } } else if (op->state == st_query) { int nverified = count_peer_devices(op, peer, TRUE, fenced_support_flag(op->action)); /* We have a result for a non-topology fencing op that looks promising, * go ahead and start fencing before query timeout */ if ((peer != NULL) && !host_is_target && nverified) { /* we have a verified device living on a peer that is not the target */ crm_trace("Found %d verified device%s", nverified, pcmk__plural_s(nverified)); request_peer_fencing(op, peer); } else if (have_all_replies) { crm_info("All query replies have arrived, continuing (%d expected/%d received) ", replies_expected, op->replies); request_peer_fencing(op, NULL); } else { crm_trace("Waiting for more peer results before launching fencing operation"); } } else if ((peer != NULL) && (op->state == st_done)) { crm_info("Discarding query result from %s (%d device%s): " "Operation is %s", peer->host, peer->ndevices, pcmk__plural_s(peer->ndevices), stonith_op_state_str(op->state)); } return pcmk_ok; } /*! * \internal * \brief Handle a peer's reply to a fencing request * * Parse a fencing reply from XML, and either finalize the operation * or attempt another device as appropriate. * * \param[in] msg XML reply received */ void fenced_process_fencing_reply(xmlNode *msg) { const char *id = NULL; const char *device = NULL; remote_fencing_op_t *op = NULL; xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_REMOTE_OP, msg, LOG_ERR); pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; CRM_CHECK(dev != NULL, return); id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP); CRM_CHECK(id != NULL, return); dev = stonith__find_xe_with_result(msg); CRM_CHECK(dev != NULL, return); stonith__xe_get_result(dev, &result); device = crm_element_value(dev, PCMK__XA_ST_DEVICE_ID); if (stonith_remote_op_list) { op = g_hash_table_lookup(stonith_remote_op_list, id); } if ((op == NULL) && pcmk__result_ok(&result)) { /* Record successful fencing operations */ const char *client_id = crm_element_value(dev, PCMK__XA_ST_CLIENTID); op = create_remote_stonith_op(client_id, dev, TRUE); } if (op == NULL) { /* Could be for an event that began before we started */ /* TODO: Record the op for later querying */ crm_info("Received peer result of unknown or expired operation %s", id); pcmk__reset_result(&result); return; } pcmk__reset_result(&op->result); op->result = result; // The operation takes ownership of the result if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { crm_err("Received outdated reply for device %s (instead of %s) to " "fence (%s) %s. Operation already timed out at peer level.", device, (const char *) op->devices->data, op->action, op->target); return; } if (pcmk__str_eq(crm_element_value(msg, PCMK__XA_SUBT), PCMK__VALUE_BROADCAST, pcmk__str_none)) { if (pcmk__result_ok(&op->result)) { op->state = st_done; } else { op->state = st_failed; } finalize_op(op, msg, false); return; } else if (!pcmk__str_eq(op->originator, fenced_get_local_node(), pcmk__str_casei)) { /* If this isn't a remote level broadcast, and we are not the * originator of the operation, we should not be receiving this msg. */ crm_err("Received non-broadcast fencing result for operation %.8s " "we do not own (device %s targeting %s)", op->id, device, op->target); return; } if (pcmk_is_set(op->call_options, st_opt_topology)) { const char *device = NULL; const char *reason = op->result.exit_reason; /* We own the op, and it is complete. broadcast the result to all nodes * and notify our local clients. */ if (op->state == st_done) { finalize_op(op, msg, false); return; } device = crm_element_value(msg, PCMK__XA_ST_DEVICE_ID); if ((op->phase == 2) && !pcmk__result_ok(&op->result)) { /* A remapped "on" failed, but the node was already turned off * successfully, so ignore the error and continue. */ crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s " "after successful 'off'", device, pcmk_exec_status_str(op->result.execution_status), (reason == NULL)? "" : ": ", (reason == NULL)? "" : reason, op->target); pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); } else { crm_notice("Action '%s' targeting %s%s%s on behalf of %s@%s: " "%s%s%s%s", op->action, op->target, ((device == NULL)? "" : " using "), ((device == NULL)? "" : device), op->client_name, op->originator, pcmk_exec_status_str(op->result.execution_status), (reason == NULL)? "" : " (", (reason == NULL)? "" : reason, (reason == NULL)? "" : ")"); } if (pcmk__result_ok(&op->result)) { /* An operation completed successfully. Try another device if * necessary, otherwise mark the operation as done. */ advance_topology_device_in_level(op, device, msg); return; } else { /* This device failed, time to try another topology level. If no other * levels are available, mark this operation as failed and report results. */ if (advance_topology_level(op, false) != pcmk_rc_ok) { op->state = st_failed; finalize_op(op, msg, false); return; } } } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) { op->state = st_done; finalize_op(op, msg, false); return; } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT) && (op->devices == NULL)) { /* If the operation timed out don't bother retrying other peers. */ op->state = st_failed; finalize_op(op, msg, false); return; } else { /* fall-through and attempt other fencing action using another peer */ } /* Retry on failure */ crm_trace("Next for %s on behalf of %s@%s (result was: %s)", op->target, op->originator, op->client_name, pcmk_exec_status_str(op->result.execution_status)); request_peer_fencing(op, NULL); } gboolean stonith_check_fence_tolerance(int tolerance, const char *target, const char *action) { GHashTableIter iter; time_t now = time(NULL); remote_fencing_op_t *rop = NULL; if (tolerance <= 0 || !stonith_remote_op_list || target == NULL || action == NULL) { return FALSE; } g_hash_table_iter_init(&iter, stonith_remote_op_list); while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) { if (strcmp(rop->target, target) != 0) { continue; } else if (rop->state != st_done) { continue; /* We don't have to worry about remapped reboots here * because if state is done, any remapping has been undone */ } else if (strcmp(rop->action, action) != 0) { continue; } else if ((rop->completed + tolerance) < now) { continue; } crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s", target, action, tolerance, rop->delegate, rop->originator); return TRUE; } return FALSE; } diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c index 813941e6b4..4a04da22bc 100644 --- a/lib/cluster/membership.c +++ b/lib/cluster/membership.c @@ -1,1501 +1,1501 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include // PRIu32 #include // bool #include #include #include #include #include #include #include #include #include #include #include #include "crmcluster_private.h" /* The peer cache remembers cluster nodes that have been seen. This is managed * mostly automatically by libcrmcluster, based on cluster membership events. * * Because cluster nodes can have conflicting names or UUIDs, the hash table key * is a uniquely generated ID. * * @TODO Move caches to pcmk_cluster_t */ GHashTable *pcmk__peer_cache = NULL; /* The remote peer cache tracks pacemaker_remote nodes. While the * value has the same type as the peer cache's, it is tracked separately for * three reasons: pacemaker_remote nodes can't have conflicting names or UUIDs, * so the name (which is also the UUID) is used as the hash table key; there * is no equivalent of membership events, so management is not automatic; and * most users of the peer cache need to exclude pacemaker_remote nodes. * * @TODO That said, using a single cache would be more logical and less * error-prone, so it would be a good idea to merge them one day. * * libcrmcluster provides two avenues for populating the cache: * pcmk__cluster_lookup_remote_node() and pcmk__cluster_forget_remote_node() * directly manage it, while refresh_remote_nodes() populates it via the CIB. * * @TODO Move caches to pcmk_cluster_t */ GHashTable *pcmk__remote_peer_cache = NULL; /* * The CIB cluster node cache tracks cluster nodes that have been seen in * the CIB. It is useful mainly when a caller needs to know about a node that * may no longer be in the membership, but doesn't want to add the node to the * main peer cache tables. */ static GHashTable *cluster_node_cib_cache = NULL; static bool autoreap = true; static bool has_quorum = false; // Flag setting and clearing for pcmk__node_status_t:flags #define set_peer_flags(peer, flags_to_set) do { \ (peer)->flags = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \ "Peer", (peer)->name, \ (peer)->flags, (flags_to_set), \ #flags_to_set); \ } while (0) #define clear_peer_flags(peer, flags_to_clear) do { \ (peer)->flags = pcmk__clear_flags_as(__func__, __LINE__, \ LOG_TRACE, \ "Peer", (peer)->name, \ (peer)->flags, (flags_to_clear), \ #flags_to_clear); \ } while (0) static void update_peer_uname(pcmk__node_status_t *node, const char *uname); static pcmk__node_status_t *find_cib_cluster_node(const char *id, const char *uname); /*! * \internal * \brief Check whether the cluster currently has quorum * * \return \c true if the cluster has quorum, or \c false otherwise */ bool pcmk__cluster_has_quorum(void) { return has_quorum; } /*! * \internal * \brief Set whether the cluster currently has quorum * * \param[in] quorate \c true if the cluster has quorum, or \c false otherwise */ void pcmk__cluster_set_quorum(bool quorate) { has_quorum = quorate; } /*! * \internal * \brief Get the number of Pacemaker Remote nodes that have been seen * * \return Number of cached Pacemaker Remote nodes */ unsigned int pcmk__cluster_num_remote_nodes(void) { if (pcmk__remote_peer_cache == NULL) { return 0U; } return g_hash_table_size(pcmk__remote_peer_cache); } /*! * \internal * \brief Get a remote node cache entry, creating it if necessary * * \param[in] node_name Name of remote node * * \return Cache entry for node on success, or \c NULL (and set \c errno) * otherwise * * \note When creating a new entry, this will leave the node state undetermined. * The caller should also call \c pcmk__update_peer_state() if the state * is known. * \note Because this can add and remove cache entries, callers should not * assume any previously obtained cache entry pointers remain valid. */ pcmk__node_status_t * pcmk__cluster_lookup_remote_node(const char *node_name) { pcmk__node_status_t *node = NULL; char *node_name_copy = NULL; if (node_name == NULL) { errno = EINVAL; return NULL; } /* It's theoretically possible that the node was added to the cluster peer * cache before it was known to be a Pacemaker Remote node. Remove that * entry unless it has a node ID, which means the name actually is * associated with a cluster node. (@TODO return an error in that case?) */ node = pcmk__search_node_caches(0, node_name, pcmk__node_search_cluster_member); if ((node != NULL) && (node->xml_id == NULL)) { /* node_name could be a pointer into the cache entry being removed, so * reassign it to a copy before the original gets freed */ node_name_copy = strdup(node_name); if (node_name_copy == NULL) { errno = ENOMEM; return NULL; } node_name = node_name_copy; pcmk__cluster_forget_cluster_node(0, node_name); } /* Return existing cache entry if one exists */ node = g_hash_table_lookup(pcmk__remote_peer_cache, node_name); if (node) { free(node_name_copy); return node; } /* Allocate a new entry */ node = calloc(1, sizeof(pcmk__node_status_t)); if (node == NULL) { free(node_name_copy); return NULL; } /* Populate the essential information */ set_peer_flags(node, pcmk__node_status_remote); node->xml_id = strdup(node_name); if (node->xml_id == NULL) { free(node); errno = ENOMEM; free(node_name_copy); return NULL; } /* Add the new entry to the cache */ g_hash_table_replace(pcmk__remote_peer_cache, node->xml_id, node); crm_trace("added %s to remote cache", node_name); /* Update the entry's uname, ensuring peer status callbacks are called */ update_peer_uname(node, node_name); free(node_name_copy); return node; } /*! * \internal * \brief Remove a node from the Pacemaker Remote node cache * * \param[in] node_name Name of node to remove from cache * * \note The caller must be careful not to use \p node_name after calling this * function if it might be a pointer into the cache entry being removed. */ void pcmk__cluster_forget_remote_node(const char *node_name) { /* Do a lookup first, because node_name could be a pointer within the entry * being removed -- we can't log it *after* removing it. */ if (g_hash_table_lookup(pcmk__remote_peer_cache, node_name) != NULL) { crm_trace("Removing %s from Pacemaker Remote node cache", node_name); g_hash_table_remove(pcmk__remote_peer_cache, node_name); } } /*! * \internal * \brief Return node status based on a CIB status entry * * \param[in] node_state XML of node state * * \return \c PCMK_VALUE_MEMBER if \c PCMK__XA_IN_CCM is true in * \c PCMK__XE_NODE_STATE, or \c PCMK__VALUE_LOST otherwise */ static const char * remote_state_from_cib(const xmlNode *node_state) { bool in_ccm = false; if ((pcmk__xe_get_bool_attr(node_state, PCMK__XA_IN_CCM, &in_ccm) == pcmk_rc_ok) && in_ccm) { return PCMK_VALUE_MEMBER; } return PCMK__VALUE_LOST; } /* user data for looping through remote node xpath searches */ struct refresh_data { const char *field; /* XML attribute to check for node name */ gboolean has_state; /* whether to update node state based on XML */ }; /*! * \internal * \brief Process one pacemaker_remote node xpath search result * * \param[in] result XML search result * \param[in] user_data what to look for in the XML */ static void remote_cache_refresh_helper(xmlNode *result, void *user_data) { const struct refresh_data *data = user_data; const char *remote = crm_element_value(result, data->field); const char *state = NULL; pcmk__node_status_t *node; CRM_CHECK(remote != NULL, return); /* Determine node's state, if the result has it */ if (data->has_state) { state = remote_state_from_cib(result); } /* Check whether cache already has entry for node */ node = g_hash_table_lookup(pcmk__remote_peer_cache, remote); if (node == NULL) { /* Node is not in cache, so add a new entry for it */ node = pcmk__cluster_lookup_remote_node(remote); CRM_ASSERT(node); if (state) { pcmk__update_peer_state(__func__, node, state, 0); } } else if (pcmk_is_set(node->flags, pcmk__node_status_dirty)) { /* Node is in cache and hasn't been updated already, so mark it clean */ clear_peer_flags(node, pcmk__node_status_dirty); if (state) { pcmk__update_peer_state(__func__, node, state, 0); } } } static void mark_dirty(gpointer key, gpointer value, gpointer user_data) { set_peer_flags((pcmk__node_status_t *) value, pcmk__node_status_dirty); } static gboolean is_dirty(gpointer key, gpointer value, gpointer user_data) { const pcmk__node_status_t *node = value; return pcmk_is_set(node->flags, pcmk__node_status_dirty); } /*! * \internal * \brief Repopulate the remote node cache based on CIB XML * * \param[in] cib CIB XML to parse */ static void refresh_remote_nodes(xmlNode *cib) { struct refresh_data data; pcmk__cluster_init_node_caches(); /* First, we mark all existing cache entries as dirty, * so that later we can remove any that weren't in the CIB. * We don't empty the cache, because we need to detect changes in state. */ g_hash_table_foreach(pcmk__remote_peer_cache, mark_dirty, NULL); /* Look for guest nodes and remote nodes in the status section */ data.field = PCMK_XA_ID; data.has_state = TRUE; crm_foreach_xpath_result(cib, PCMK__XP_REMOTE_NODE_STATUS, remote_cache_refresh_helper, &data); /* Look for guest nodes and remote nodes in the configuration section, * because they may have just been added and not have a status entry yet. * In that case, the cached node state will be left NULL, so that the * peer status callback isn't called until we're sure the node started * successfully. */ data.field = PCMK_XA_VALUE; data.has_state = FALSE; crm_foreach_xpath_result(cib, PCMK__XP_GUEST_NODE_CONFIG, remote_cache_refresh_helper, &data); data.field = PCMK_XA_ID; data.has_state = FALSE; crm_foreach_xpath_result(cib, PCMK__XP_REMOTE_NODE_CONFIG, remote_cache_refresh_helper, &data); /* Remove all old cache entries that weren't seen in the CIB */ g_hash_table_foreach_remove(pcmk__remote_peer_cache, is_dirty, NULL); } /*! * \internal * \brief Check whether a node is an active cluster node * * Remote nodes are never considered active. This guarantees that they can never * become DC. * * \param[in] node Node to check * * \return \c true if the node is an active cluster node, or \c false otherwise */ bool pcmk__cluster_is_node_active(const pcmk__node_status_t *node) { const enum pcmk_cluster_layer cluster_layer = pcmk_get_cluster_layer(); if ((node == NULL) || pcmk_is_set(node->flags, pcmk__node_status_remote)) { return false; } switch (cluster_layer) { case pcmk_cluster_layer_corosync: #if SUPPORT_COROSYNC return pcmk__corosync_is_peer_active(node); #else break; #endif // SUPPORT_COROSYNC default: break; } crm_err("Unhandled cluster layer: %s", pcmk_cluster_layer_text(cluster_layer)); return false; } /*! * \internal * \brief Check if a node's entry should be removed from the cluster node cache * * A node should be removed from the cache if it's inactive and matches another * \c pcmk__node_status_t (the search object). The node is considered a * mismatch if any of the following are true: * * The search object is \c NULL. * * The search object has an ID set and the cached node's ID does not match it. * * The search object does not have an ID set, and the cached node's name does * not match the search node's name. (If both names are \c NULL, it's a * match.) * * Otherwise, the node is considered a match. * * Note that if the search object has both an ID and a name set, the name is * ignored for matching purposes. * * \param[in] key Ignored * \param[in] value \c pcmk__node_status_t object from cluster node cache * \param[in] user_data \c pcmk__node_status_t object to match against (search * object) * * \return \c TRUE if the node entry should be removed from \c pcmk__peer_cache, * or \c FALSE otherwise */ static gboolean should_forget_cluster_node(gpointer key, gpointer value, gpointer user_data) { pcmk__node_status_t *node = value; pcmk__node_status_t *search = user_data; if (search == NULL) { return FALSE; } if ((search->cluster_layer_id != 0) && (node->cluster_layer_id != search->cluster_layer_id)) { return FALSE; } if ((search->cluster_layer_id == 0) && !pcmk__str_eq(node->name, search->name, pcmk__str_casei)) { // @TODO Consider name even if ID is set? return FALSE; } if (pcmk__cluster_is_node_active(value)) { return FALSE; } crm_info("Removing node with name %s and cluster layer ID " PRIu32 " from membership cache", pcmk__s(node->name, "(unknown)"), node->cluster_layer_id); return TRUE; } /*! * \internal * \brief Remove one or more inactive nodes from the cluster node cache * * All inactive nodes matching \p id and \p node_name as described in * \c should_forget_cluster_node documentation are removed from the cache. * * If \p id is 0 and \p node_name is \c NULL, all inactive nodes are removed * from the cache regardless of ID and name. This differs from clearing the * cache, in that entries for active nodes are preserved. * * \param[in] id ID of node to remove from cache (0 to ignore) * \param[in] node_name Name of node to remove from cache (ignored if \p id is * nonzero) * * \note \p node_name is not modified directly, but it will be freed if it's a * pointer into a cache entry that is removed. */ void pcmk__cluster_forget_cluster_node(uint32_t id, const char *node_name) { pcmk__node_status_t search = { 0, }; char *criterion = NULL; // For logging guint matches = 0; if (pcmk__peer_cache == NULL) { crm_trace("Membership cache not initialized, ignoring removal request"); return; } search.cluster_layer_id = id; search.name = pcmk__str_copy(node_name); // May log after original freed if (id > 0) { criterion = crm_strdup_printf("cluster layer ID %" PRIu32, id); } else if (node_name != NULL) { criterion = crm_strdup_printf("name %s", node_name); } matches = g_hash_table_foreach_remove(pcmk__peer_cache, should_forget_cluster_node, &search); if (matches > 0) { if (criterion != NULL) { crm_notice("Removed %u inactive node%s with %s from the membership " "cache", matches, pcmk__plural_s(matches), criterion); } else { crm_notice("Removed all (%u) inactive cluster nodes from the " "membership cache", matches); } } else { crm_info("No inactive cluster nodes%s%s to remove from the membership " "cache", ((criterion != NULL)? " with " : ""), pcmk__s(criterion, "")); } free(search.name); free(criterion); } static void count_peer(gpointer key, gpointer value, gpointer user_data) { unsigned int *count = user_data; pcmk__node_status_t *node = value; if (pcmk__cluster_is_node_active(node)) { *count = *count + 1; } } /*! * \internal * \brief Get the number of active cluster nodes that have been seen * * Remote nodes are never considered active. This guarantees that they can never * become DC. * * \return Number of active nodes in the cluster node cache */ unsigned int pcmk__cluster_num_active_nodes(void) { unsigned int count = 0; if (pcmk__peer_cache != NULL) { g_hash_table_foreach(pcmk__peer_cache, count_peer, &count); } return count; } static void destroy_crm_node(gpointer data) { pcmk__node_status_t *node = data; crm_trace("Destroying entry for node %" PRIu32 ": %s", node->cluster_layer_id, node->name); free(node->name); free(node->state); free(node->xml_id); free(node->user_data); free(node->expected); free(node->conn_host); free(node); } /*! * \internal * \brief Initialize node caches */ void pcmk__cluster_init_node_caches(void) { if (pcmk__peer_cache == NULL) { pcmk__peer_cache = pcmk__strikey_table(free, destroy_crm_node); } if (pcmk__remote_peer_cache == NULL) { pcmk__remote_peer_cache = pcmk__strikey_table(NULL, destroy_crm_node); } if (cluster_node_cib_cache == NULL) { cluster_node_cib_cache = pcmk__strikey_table(free, destroy_crm_node); } } /*! * \internal * \brief Initialize node caches */ void pcmk__cluster_destroy_node_caches(void) { if (pcmk__peer_cache != NULL) { crm_trace("Destroying peer cache with %d members", g_hash_table_size(pcmk__peer_cache)); g_hash_table_destroy(pcmk__peer_cache); pcmk__peer_cache = NULL; } if (pcmk__remote_peer_cache != NULL) { crm_trace("Destroying remote peer cache with %d members", pcmk__cluster_num_remote_nodes()); g_hash_table_destroy(pcmk__remote_peer_cache); pcmk__remote_peer_cache = NULL; } if (cluster_node_cib_cache != NULL) { crm_trace("Destroying configured cluster node cache with %d members", g_hash_table_size(cluster_node_cib_cache)); g_hash_table_destroy(cluster_node_cib_cache); cluster_node_cib_cache = NULL; } } static void (*peer_status_callback)(enum pcmk__node_update, pcmk__node_status_t *, const void *) = NULL; /*! * \internal * \brief Set a client function that will be called after peer status changes * * \param[in] dispatch Pointer to function to use as callback * * \note Client callbacks should do only client-specific handling. Callbacks * must not add or remove entries in the peer caches. */ void pcmk__cluster_set_status_callback(void (*dispatch)(enum pcmk__node_update, pcmk__node_status_t *, const void *)) { // @TODO Improve documentation of peer_status_callback peer_status_callback = dispatch; } /*! * \internal * \brief Tell the library whether to automatically reap lost nodes * * If \c true (the default), calling \c crm_update_peer_proc() will also update * the peer state to \c PCMK_VALUE_MEMBER or \c PCMK__VALUE_LOST, and updating * the peer state will reap peers whose state changes to anything other than * \c PCMK_VALUE_MEMBER. * * Callers should leave this enabled unless they plan to manage the cache * separately on their own. * * \param[in] enable \c true to enable automatic reaping, \c false to disable */ void pcmk__cluster_set_autoreap(bool enable) { autoreap = enable; } static void dump_peer_hash(int level, const char *caller) { GHashTableIter iter; const char *id = NULL; pcmk__node_status_t *node = NULL; g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, (gpointer *) &id, (gpointer *) &node)) { do_crm_log(level, "%s: Node %" PRIu32 "/%s = %p - %s", caller, node->cluster_layer_id, node->name, node, id); } } static gboolean hash_find_by_data(gpointer key, gpointer value, gpointer user_data) { return value == user_data; } /*! * \internal * \brief Search cluster member node cache * * \param[in] id If not 0, cluster node ID to search for * \param[in] uname If not NULL, node name to search for * \param[in] uuid If not NULL while id is 0, node UUID instead of cluster * node ID to search for * * \return Cluster node cache entry if found, otherwise NULL */ static pcmk__node_status_t * search_cluster_member_cache(unsigned int id, const char *uname, const char *uuid) { GHashTableIter iter; pcmk__node_status_t *node = NULL; pcmk__node_status_t *by_id = NULL; pcmk__node_status_t *by_name = NULL; CRM_ASSERT(id > 0 || uname != NULL); pcmk__cluster_init_node_caches(); if (uname != NULL) { g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->name, uname, pcmk__str_casei)) { crm_trace("Name match: %s", node->name); by_name = node; break; } } } if (id > 0) { g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (node->cluster_layer_id == id) { crm_trace("ID match: %" PRIu32, node->cluster_layer_id); by_id = node; break; } } } else if (uuid != NULL) { g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->xml_id, uuid, pcmk__str_casei)) { crm_trace("UUID match: %s", node->xml_id); by_id = node; break; } } } node = by_id; /* Good default */ if(by_id == by_name) { /* Nothing to do if they match (both NULL counts) */ crm_trace("Consistent: %p for %u/%s", by_id, id, uname); } else if(by_id == NULL && by_name) { crm_trace("Only one: %p for %u/%s", by_name, id, uname); if (id && by_name->cluster_layer_id) { dump_peer_hash(LOG_WARNING, __func__); crm_crit("Nodes %u and %" PRIu32 " share the same name '%s'", id, by_name->cluster_layer_id, uname); node = NULL; /* Create a new one */ } else { node = by_name; } } else if(by_name == NULL && by_id) { crm_trace("Only one: %p for %u/%s", by_id, id, uname); if ((uname != NULL) && (by_id->name != NULL)) { dump_peer_hash(LOG_WARNING, __func__); crm_crit("Nodes '%s' and '%s' share the same cluster nodeid %u: " "assuming '%s' is correct", uname, by_id->name, id, uname); } } else if ((uname != NULL) && (by_id->name != NULL)) { if (pcmk__str_eq(uname, by_id->name, pcmk__str_casei)) { crm_notice("Node '%s' has changed its cluster layer ID " "from %" PRIu32 " to %" PRIu32, by_id->name, by_name->cluster_layer_id, by_id->cluster_layer_id); g_hash_table_foreach_remove(pcmk__peer_cache, hash_find_by_data, by_name); } else { crm_warn("Nodes '%s' and '%s' share the same cluster nodeid: %u %s", by_id->name, by_name->name, id, uname); dump_peer_hash(LOG_INFO, __func__); crm_abort(__FILE__, __func__, __LINE__, "member weirdness", TRUE, TRUE); } } else if ((id > 0) && (by_name->cluster_layer_id > 0)) { crm_warn("Nodes %" PRIu32 " and %" PRIu32 " share the same name: '%s'", by_id->cluster_layer_id, by_name->cluster_layer_id, uname); } else { /* Simple merge */ /* Only corosync-based clusters use node IDs. The functions that call * pcmk__update_peer_state() and crm_update_peer_proc() only know * nodeid, so 'by_id' is authoritative when merging. */ dump_peer_hash(LOG_DEBUG, __func__); crm_info("Merging %p into %p", by_name, by_id); g_hash_table_foreach_remove(pcmk__peer_cache, hash_find_by_data, by_name); } return node; } /*! * \internal * \brief Search caches for a node (cluster or Pacemaker Remote) * * \param[in] id If not 0, cluster node ID to search for * \param[in] uname If not NULL, node name to search for * \param[in] flags Group of enum pcmk__node_search_flags * * \return Node cache entry if found, otherwise NULL */ pcmk__node_status_t * pcmk__search_node_caches(unsigned int id, const char *uname, uint32_t flags) { pcmk__node_status_t *node = NULL; CRM_ASSERT(id > 0 || uname != NULL); pcmk__cluster_init_node_caches(); if ((uname != NULL) && pcmk_is_set(flags, pcmk__node_search_remote)) { node = g_hash_table_lookup(pcmk__remote_peer_cache, uname); } if ((node == NULL) && pcmk_is_set(flags, pcmk__node_search_cluster_member)) { node = search_cluster_member_cache(id, uname, NULL); } if ((node == NULL) && pcmk_is_set(flags, pcmk__node_search_cluster_cib)) { char *id_str = (id == 0)? NULL : crm_strdup_printf("%u", id); node = find_cib_cluster_node(id_str, uname); free(id_str); } return node; } /*! * \internal * \brief Purge a node from cache (both cluster and Pacemaker Remote) * * \param[in] node_name If not NULL, purge only nodes with this name * \param[in] node_id If not 0, purge cluster nodes only if they have this ID * * \note If \p node_name is NULL and \p node_id is 0, no nodes will be purged. * If \p node_name is not NULL and \p node_id is not 0, Pacemaker Remote * nodes that match \p node_name will be purged, and cluster nodes that * match both \p node_name and \p node_id will be purged. * \note The caller must be careful not to use \p node_name after calling this * function if it might be a pointer into a cache entry being removed. */ void pcmk__purge_node_from_cache(const char *node_name, uint32_t node_id) { char *node_name_copy = NULL; if ((node_name == NULL) && (node_id == 0U)) { return; } // Purge from Pacemaker Remote node cache if ((node_name != NULL) && (g_hash_table_lookup(pcmk__remote_peer_cache, node_name) != NULL)) { /* node_name could be a pointer into the cache entry being purged, * so reassign it to a copy before the original gets freed */ node_name_copy = pcmk__str_copy(node_name); node_name = node_name_copy; crm_trace("Purging %s from Pacemaker Remote node cache", node_name); g_hash_table_remove(pcmk__remote_peer_cache, node_name); } pcmk__cluster_forget_cluster_node(node_id, node_name); free(node_name_copy); } #if SUPPORT_COROSYNC static guint remove_conflicting_peer(pcmk__node_status_t *node) { int matches = 0; GHashTableIter iter; pcmk__node_status_t *existing_node = NULL; if ((node->cluster_layer_id == 0) || (node->name == NULL)) { return 0; } if (!pcmk__corosync_has_nodelist()) { return 0; } g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &existing_node)) { if ((existing_node->cluster_layer_id > 0) && (existing_node->cluster_layer_id != node->cluster_layer_id) && pcmk__str_eq(existing_node->name, node->name, pcmk__str_casei)) { if (pcmk__cluster_is_node_active(existing_node)) { continue; } crm_warn("Removing cached offline node %" PRIu32 "/%s which has " "conflicting name with %" PRIu32, existing_node->cluster_layer_id, existing_node->name, node->cluster_layer_id); g_hash_table_iter_remove(&iter); matches++; } } return matches; } #endif /*! * \internal * \brief Get a cluster node cache entry, possibly creating one if not found * * If \c pcmk__node_search_cluster_member is set in \p flags, the return value * is guaranteed not to be \c NULL. A new cache entry is created if one does not * already exist. * * \param[in] id If not 0, cluster node ID to search for * \param[in] uname If not NULL, node name to search for * \param[in] uuid If not NULL while id is 0, node UUID instead of cluster * node ID to search for * \param[in] flags Group of enum pcmk__node_search_flags * * \return (Possibly newly created) cluster node cache entry */ /* coverity[-alloc] Memory is referenced in one or both hashtables */ pcmk__node_status_t * pcmk__get_node(unsigned int id, const char *uname, const char *uuid, uint32_t flags) { pcmk__node_status_t *node = NULL; char *uname_lookup = NULL; CRM_ASSERT(id > 0 || uname != NULL); pcmk__cluster_init_node_caches(); // Check the Pacemaker Remote node cache first if (pcmk_is_set(flags, pcmk__node_search_remote)) { node = g_hash_table_lookup(pcmk__remote_peer_cache, uname); if (node != NULL) { return node; } } if (!pcmk_is_set(flags, pcmk__node_search_cluster_member)) { return NULL; } node = search_cluster_member_cache(id, uname, uuid); /* if uname wasn't provided, and find_peer did not turn up a uname based on id. * we need to do a lookup of the node name using the id in the cluster membership. */ if ((uname == NULL) && ((node == NULL) || (node->name == NULL))) { uname_lookup = pcmk__cluster_node_name(id); } if (uname_lookup) { uname = uname_lookup; crm_trace("Inferred a name of '%s' for node %u", uname, id); /* try to turn up the node one more time now that we know the uname. */ if (node == NULL) { node = search_cluster_member_cache(id, uname, uuid); } } if (node == NULL) { char *uniqueid = crm_generate_uuid(); node = pcmk__assert_alloc(1, sizeof(pcmk__node_status_t)); crm_info("Created entry %s/%p for node %s/%u (%d total)", uniqueid, node, uname, id, 1 + g_hash_table_size(pcmk__peer_cache)); g_hash_table_replace(pcmk__peer_cache, uniqueid, node); } if ((id > 0) && (uname != NULL) && ((node->cluster_layer_id == 0) || (node->name == NULL))) { crm_info("Node %u is now known as %s", id, uname); } if ((id > 0) && (node->cluster_layer_id == 0)) { node->cluster_layer_id = id; } if ((uname != NULL) && (node->name == NULL)) { update_peer_uname(node, uname); } if (node->xml_id == NULL) { if (uuid == NULL) { uuid = pcmk__cluster_node_uuid(node); } if (uuid) { crm_info("Node %u has uuid %s", id, uuid); } else { crm_info("Cannot obtain a UUID for node %u/%s", id, node->name); } } free(uname_lookup); return node; } /*! * \internal * \brief Update a node's uname * * \param[in,out] node Node object to update * \param[in] uname New name to set * * \note This function should not be called within a peer cache iteration, * because in some cases it can remove conflicting cache entries, * which would invalidate the iterator. */ static void update_peer_uname(pcmk__node_status_t *node, const char *uname) { CRM_CHECK(uname != NULL, crm_err("Bug: can't update node name without name"); return); CRM_CHECK(node != NULL, crm_err("Bug: can't update node name to %s without node", uname); return); if (pcmk__str_eq(uname, node->name, pcmk__str_casei)) { crm_debug("Node name '%s' did not change", uname); return; } for (const char *c = uname; *c; ++c) { if ((*c >= 'A') && (*c <= 'Z')) { crm_warn("Node names with capitals are discouraged, consider changing '%s'", uname); break; } } pcmk__str_update(&node->name, uname); if (peer_status_callback != NULL) { peer_status_callback(pcmk__node_update_name, node, NULL); } #if SUPPORT_COROSYNC if ((pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync) && !pcmk_is_set(node->flags, pcmk__node_status_remote)) { remove_conflicting_peer(node); } #endif } /*! * \internal * \brief Get log-friendly string equivalent of a process flag * * \param[in] proc Process flag * * \return Log-friendly string equivalent of \p proc */ static inline const char * proc2text(enum crm_proc_flag proc) { const char *text = "unknown"; switch (proc) { case crm_proc_none: text = "none"; break; case crm_proc_cpg: text = "corosync-cpg"; break; } return text; } /*! * \internal * \brief Update a node's process information (and potentially state) * * \param[in] source Caller's function name (for log messages) * \param[in,out] node Node object to update * \param[in] flag Bitmask of new process information * \param[in] status node status (online, offline, etc.) * * \return NULL if any node was reaped from peer caches, value of node otherwise * * \note If this function returns NULL, the supplied node object was likely * freed and should not be used again. This function should not be * called within a cache iteration if reaping is possible, otherwise * reaping could invalidate the iterator. */ pcmk__node_status_t * crm_update_peer_proc(const char *source, pcmk__node_status_t *node, uint32_t flag, const char *status) { uint32_t last = 0; gboolean changed = FALSE; CRM_CHECK(node != NULL, crm_err("%s: Could not set %s to %s for NULL", source, proc2text(flag), status); return NULL); /* Pacemaker doesn't spawn processes on remote nodes */ if (pcmk_is_set(node->flags, pcmk__node_status_remote)) { return node; } last = node->processes; if (status == NULL) { node->processes = flag; if (node->processes != last) { changed = TRUE; } } else if (pcmk__str_eq(status, PCMK_VALUE_ONLINE, pcmk__str_casei)) { if ((node->processes & flag) != flag) { node->processes = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Peer process", node->name, node->processes, flag, "processes"); changed = TRUE; } } else if (node->processes & flag) { node->processes = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Peer process", node->name, node->processes, flag, "processes"); changed = TRUE; } if (changed) { if (status == NULL && flag <= crm_proc_none) { crm_info("%s: Node %s[%" PRIu32 "] - all processes are now offline", source, node->name, node->cluster_layer_id); } else { crm_info("%s: Node %s[%" PRIu32 "] - %s is now %s", source, node->name, node->cluster_layer_id, proc2text(flag), status); } if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { node->when_online = time(NULL); } else { node->when_online = 0; } /* Call the client callback first, then update the peer state, * in case the node will be reaped */ if (peer_status_callback != NULL) { peer_status_callback(pcmk__node_update_processes, node, &last); } /* The client callback shouldn't touch the peer caches, * but as a safety net, bail if the peer cache was destroyed. */ if (pcmk__peer_cache == NULL) { return NULL; } if (autoreap) { const char *peer_state = NULL; if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { peer_state = PCMK_VALUE_MEMBER; } else { peer_state = PCMK__VALUE_LOST; } node = pcmk__update_peer_state(__func__, node, peer_state, 0); } } else { crm_trace("%s: Node %s[%" PRIu32 "] - %s is unchanged (%s)", source, node->name, node->cluster_layer_id, proc2text(flag), status); } return node; } /*! * \internal * \brief Update a cluster node cache entry's expected join state * * \param[in] source Caller's function name (for logging) * \param[in,out] node Node to update * \param[in] expected Node's new join state */ void pcmk__update_peer_expected(const char *source, pcmk__node_status_t *node, const char *expected) { char *last = NULL; gboolean changed = FALSE; CRM_CHECK(node != NULL, crm_err("%s: Could not set 'expected' to %s", source, expected); return); /* Remote nodes don't participate in joins */ if (pcmk_is_set(node->flags, pcmk__node_status_remote)) { return; } last = node->expected; if (expected != NULL && !pcmk__str_eq(node->expected, expected, pcmk__str_casei)) { node->expected = strdup(expected); changed = TRUE; } if (changed) { crm_info("%s: Node %s[%" PRIu32 "] - expected state is now %s (was %s)", source, node->name, node->cluster_layer_id, expected, last); free(last); } else { crm_trace("%s: Node %s[%" PRIu32 "] - expected state is unchanged (%s)", source, node->name, node->cluster_layer_id, expected); } } /*! * \internal * \brief Update a node's state and membership information * * \param[in] source Caller's function name (for log messages) * \param[in,out] node Node object to update * \param[in] state Node's new state * \param[in] membership Node's new membership ID * \param[in,out] iter If not NULL, pointer to node's peer cache iterator * * \return NULL if any node was reaped, value of node otherwise * * \note If this function returns NULL, the supplied node object was likely * freed and should not be used again. This function may be called from * within a peer cache iteration if the iterator is supplied. */ static pcmk__node_status_t * update_peer_state_iter(const char *source, pcmk__node_status_t *node, const char *state, uint64_t membership, GHashTableIter *iter) { gboolean is_member; CRM_CHECK(node != NULL, - crm_err("Could not set state for unknown host to %s" + crm_err("Could not set state for unknown host to %s " QB_XS " source=%s", state, source); return NULL); is_member = pcmk__str_eq(state, PCMK_VALUE_MEMBER, pcmk__str_none); if (is_member) { node->when_lost = 0; if (membership) { node->membership_id = membership; } } if (state && !pcmk__str_eq(node->state, state, pcmk__str_casei)) { char *last = node->state; if (is_member) { node->when_member = time(NULL); } else { node->when_member = 0; } node->state = strdup(state); crm_notice("Node %s state is now %s " QB_XS " nodeid=%" PRIu32 " previous=%s source=%s", node->name, state, node->cluster_layer_id, pcmk__s(last, "unknown"), source); if (peer_status_callback != NULL) { peer_status_callback(pcmk__node_update_state, node, last); } free(last); if (autoreap && !is_member && !pcmk_is_set(node->flags, pcmk__node_status_remote)) { /* We only autoreap from the peer cache, not the remote peer cache, * because the latter should be managed only by * refresh_remote_nodes(). */ if(iter) { crm_notice("Purged 1 peer with cluster layer ID=" PRIu32 "and/or name=%s from the membership cache", node->cluster_layer_id, node->name); g_hash_table_iter_remove(iter); } else { pcmk__cluster_forget_cluster_node(node->cluster_layer_id, node->name); } node = NULL; } } else { crm_trace("Node %s state is unchanged (%s) " QB_XS " nodeid=%" PRIu32 " source=%s", node->name, state, node->cluster_layer_id, source); } return node; } /*! * \brief Update a node's state and membership information * * \param[in] source Caller's function name (for log messages) * \param[in,out] node Node object to update * \param[in] state Node's new state * \param[in] membership Node's new membership ID * * \return NULL if any node was reaped, value of node otherwise * * \note If this function returns NULL, the supplied node object was likely * freed and should not be used again. This function should not be * called within a cache iteration if reaping is possible, * otherwise reaping could invalidate the iterator. */ pcmk__node_status_t * pcmk__update_peer_state(const char *source, pcmk__node_status_t *node, const char *state, uint64_t membership) { return update_peer_state_iter(source, node, state, membership, NULL); } /*! * \internal * \brief Reap all nodes from cache whose membership information does not match * * \param[in] membership Membership ID of nodes to keep */ void pcmk__reap_unseen_nodes(uint64_t membership) { GHashTableIter iter; pcmk__node_status_t *node = NULL; crm_trace("Reaping unseen nodes..."); g_hash_table_iter_init(&iter, pcmk__peer_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *)&node)) { if (node->membership_id != membership) { if (node->state) { /* Calling update_peer_state_iter() allows us to remove the node * from pcmk__peer_cache without invalidating our iterator */ update_peer_state_iter(__func__, node, PCMK__VALUE_LOST, membership, &iter); } else { crm_info("State of node %s[%" PRIu32 "] is still unknown", node->name, node->cluster_layer_id); } } } } static pcmk__node_status_t * find_cib_cluster_node(const char *id, const char *uname) { GHashTableIter iter; pcmk__node_status_t *node = NULL; pcmk__node_status_t *by_id = NULL; pcmk__node_status_t *by_name = NULL; if (uname) { g_hash_table_iter_init(&iter, cluster_node_cib_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->name, uname, pcmk__str_casei)) { crm_trace("Name match: %s = %p", node->name, node); by_name = node; break; } } } if (id) { g_hash_table_iter_init(&iter, cluster_node_cib_cache); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { if (pcmk__str_eq(node->xml_id, id, pcmk__str_casei)) { crm_trace("ID match: %s= %p", id, node); by_id = node; break; } } } node = by_id; /* Good default */ if (by_id == by_name) { /* Nothing to do if they match (both NULL counts) */ crm_trace("Consistent: %p for %s/%s", by_id, id, uname); } else if (by_id == NULL && by_name) { crm_trace("Only one: %p for %s/%s", by_name, id, uname); if (id) { node = NULL; } else { node = by_name; } } else if (by_name == NULL && by_id) { crm_trace("Only one: %p for %s/%s", by_id, id, uname); if (uname) { node = NULL; } } else if ((uname != NULL) && (by_id->name != NULL) && pcmk__str_eq(uname, by_id->name, pcmk__str_casei)) { /* Multiple nodes have the same uname in the CIB. * Return by_id. */ } else if ((id != NULL) && (by_name->xml_id != NULL) && pcmk__str_eq(id, by_name->xml_id, pcmk__str_casei)) { /* Multiple nodes have the same id in the CIB. * Return by_name. */ node = by_name; } else { node = NULL; } if (node == NULL) { crm_debug("Couldn't find node%s%s%s%s", id? " " : "", id? id : "", uname? " with name " : "", uname? uname : ""); } return node; } static void cluster_node_cib_cache_refresh_helper(xmlNode *xml_node, void *user_data) { const char *id = crm_element_value(xml_node, PCMK_XA_ID); const char *uname = crm_element_value(xml_node, PCMK_XA_UNAME); pcmk__node_status_t * node = NULL; CRM_CHECK(id != NULL && uname !=NULL, return); node = find_cib_cluster_node(id, uname); if (node == NULL) { char *uniqueid = crm_generate_uuid(); node = pcmk__assert_alloc(1, sizeof(pcmk__node_status_t)); node->name = pcmk__str_copy(uname); node->xml_id = pcmk__str_copy(id); g_hash_table_replace(cluster_node_cib_cache, uniqueid, node); } else if (pcmk_is_set(node->flags, pcmk__node_status_dirty)) { pcmk__str_update(&node->name, uname); /* Node is in cache and hasn't been updated already, so mark it clean */ clear_peer_flags(node, pcmk__node_status_dirty); } } static void refresh_cluster_node_cib_cache(xmlNode *cib) { pcmk__cluster_init_node_caches(); g_hash_table_foreach(cluster_node_cib_cache, mark_dirty, NULL); crm_foreach_xpath_result(cib, PCMK__XP_MEMBER_NODE_CONFIG, cluster_node_cib_cache_refresh_helper, NULL); // Remove all old cache entries that weren't seen in the CIB g_hash_table_foreach_remove(cluster_node_cib_cache, is_dirty, NULL); } void pcmk__refresh_node_caches_from_cib(xmlNode *cib) { refresh_remote_nodes(cib); refresh_cluster_node_cib_cache(cib); } // Deprecated functions kept only for backward API compatibility // LCOV_EXCL_START #include void crm_peer_init(void) { pcmk__cluster_init_node_caches(); } // LCOV_EXCL_STOP // End deprecated API diff --git a/lib/common/mainloop.c b/lib/common/mainloop.c index e53b0b1c1b..487a4265bd 100644 --- a/lib/common/mainloop.c +++ b/lib/common/mainloop.c @@ -1,1463 +1,1463 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include struct mainloop_child_s { pid_t pid; char *desc; unsigned timerid; gboolean timeout; void *privatedata; enum mainloop_child_flags flags; /* Called when a process dies */ void (*callback) (mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode); }; struct trigger_s { GSource source; gboolean running; gboolean trigger; void *user_data; guint id; }; struct mainloop_timer_s { guint id; guint period_ms; bool repeat; char *name; GSourceFunc cb; void *userdata; }; static gboolean crm_trigger_prepare(GSource * source, gint * timeout) { crm_trigger_t *trig = (crm_trigger_t *) source; /* cluster-glue's FD and IPC related sources make use of * g_source_add_poll() but do not set a timeout in their prepare * functions * * This means mainloop's poll() will block until an event for one * of these sources occurs - any /other/ type of source, such as * this one or g_idle_*, that doesn't use g_source_add_poll() is * S-O-L and won't be processed until there is something fd-based * happens. * * Luckily the timeout we can set here affects all sources and * puts an upper limit on how long poll() can take. * * So unconditionally set a small-ish timeout, not too small that * we're in constant motion, which will act as an upper bound on * how long the signal handling might be delayed for. */ *timeout = 500; /* Timeout in ms */ return trig->trigger; } static gboolean crm_trigger_check(GSource * source) { crm_trigger_t *trig = (crm_trigger_t *) source; return trig->trigger; } /*! * \internal * \brief GSource dispatch function for crm_trigger_t * * \param[in] source crm_trigger_t being dispatched * \param[in] callback Callback passed at source creation * \param[in,out] userdata User data passed at source creation * * \return G_SOURCE_REMOVE to remove source, G_SOURCE_CONTINUE to keep it */ static gboolean crm_trigger_dispatch(GSource *source, GSourceFunc callback, gpointer userdata) { gboolean rc = G_SOURCE_CONTINUE; crm_trigger_t *trig = (crm_trigger_t *) source; if (trig->running) { /* Wait until the existing job is complete before starting the next one */ return G_SOURCE_CONTINUE; } trig->trigger = FALSE; if (callback) { int callback_rc = callback(trig->user_data); if (callback_rc < 0) { crm_trace("Trigger handler %p not yet complete", trig); trig->running = TRUE; } else if (callback_rc == 0) { rc = G_SOURCE_REMOVE; } } return rc; } static void crm_trigger_finalize(GSource * source) { crm_trace("Trigger %p destroyed", source); } static GSourceFuncs crm_trigger_funcs = { crm_trigger_prepare, crm_trigger_check, crm_trigger_dispatch, crm_trigger_finalize, }; static crm_trigger_t * mainloop_setup_trigger(GSource * source, int priority, int (*dispatch) (gpointer user_data), gpointer userdata) { crm_trigger_t *trigger = NULL; trigger = (crm_trigger_t *) source; trigger->id = 0; trigger->trigger = FALSE; trigger->user_data = userdata; if (dispatch) { g_source_set_callback(source, dispatch, trigger, NULL); } g_source_set_priority(source, priority); g_source_set_can_recurse(source, FALSE); trigger->id = g_source_attach(source, NULL); return trigger; } void mainloop_trigger_complete(crm_trigger_t * trig) { crm_trace("Trigger handler %p complete", trig); trig->running = FALSE; } /*! * \brief Create a trigger to be used as a mainloop source * * \param[in] priority Relative priority of source (lower number is higher priority) * \param[in] dispatch Trigger dispatch function (should return 0 to remove the * trigger from the mainloop, -1 if the trigger should be * kept but the job is still running and not complete, and * 1 if the trigger should be kept and the job is complete) * \param[in] userdata Pointer to pass to \p dispatch * * \return Newly allocated mainloop source for trigger */ crm_trigger_t * mainloop_add_trigger(int priority, int (*dispatch) (gpointer user_data), gpointer userdata) { GSource *source = NULL; CRM_ASSERT(sizeof(crm_trigger_t) > sizeof(GSource)); source = g_source_new(&crm_trigger_funcs, sizeof(crm_trigger_t)); return mainloop_setup_trigger(source, priority, dispatch, userdata); } void mainloop_set_trigger(crm_trigger_t * source) { if(source) { source->trigger = TRUE; } } gboolean mainloop_destroy_trigger(crm_trigger_t * source) { GSource *gs = NULL; if(source == NULL) { return TRUE; } gs = (GSource *)source; g_source_destroy(gs); /* Remove from mainloop, ref_count-- */ g_source_unref(gs); /* The caller no longer carries a reference to source * * At this point the source should be free'd, * unless we're currently processing said * source, in which case mainloop holds an * additional reference and it will be free'd * once our processing completes */ return TRUE; } // Define a custom glib source for signal handling // Data structure for custom glib source typedef struct signal_s { crm_trigger_t trigger; // trigger that invoked source (must be first) void (*handler) (int sig); // signal handler int signal; // signal that was received } crm_signal_t; // Table to associate signal handlers with signal numbers static crm_signal_t *crm_signals[NSIG]; /*! * \internal * \brief Dispatch an event from custom glib source for signals * * Given an signal event, clear the event trigger and call any registered * signal handler. * * \param[in] source glib source that triggered this dispatch * \param[in] callback (ignored) * \param[in] userdata (ignored) */ static gboolean crm_signal_dispatch(GSource *source, GSourceFunc callback, gpointer userdata) { crm_signal_t *sig = (crm_signal_t *) source; if(sig->signal != SIGCHLD) { crm_notice("Caught '%s' signal " QB_XS " %d (%s handler)", strsignal(sig->signal), sig->signal, (sig->handler? "invoking" : "no")); } sig->trigger.trigger = FALSE; if (sig->handler) { sig->handler(sig->signal); } return TRUE; } /*! * \internal * \brief Handle a signal by setting a trigger for signal source * * \param[in] sig Signal number that was received * * \note This is the true signal handler for the mainloop signal source, and * must be async-safe. */ static void mainloop_signal_handler(int sig) { if (sig > 0 && sig < NSIG && crm_signals[sig] != NULL) { mainloop_set_trigger((crm_trigger_t *) crm_signals[sig]); } } // Functions implementing our custom glib source for signal handling static GSourceFuncs crm_signal_funcs = { crm_trigger_prepare, crm_trigger_check, crm_signal_dispatch, crm_trigger_finalize, }; /*! * \internal * \brief Set a true signal handler * * signal()-like interface to sigaction() * * \param[in] sig Signal number to register handler for * \param[in] dispatch Signal handler * * \return The previous value of the signal handler, or SIG_ERR on error * \note The dispatch function must be async-safe. */ sighandler_t crm_signal_handler(int sig, sighandler_t dispatch) { sigset_t mask; struct sigaction sa; struct sigaction old; if (sigemptyset(&mask) < 0) { crm_err("Could not set handler for signal %d: %s", sig, pcmk_rc_str(errno)); return SIG_ERR; } memset(&sa, 0, sizeof(struct sigaction)); sa.sa_handler = dispatch; sa.sa_flags = SA_RESTART; sa.sa_mask = mask; if (sigaction(sig, &sa, &old) < 0) { crm_err("Could not set handler for signal %d: %s", sig, pcmk_rc_str(errno)); return SIG_ERR; } return old.sa_handler; } static void mainloop_destroy_signal_entry(int sig) { crm_signal_t *tmp = crm_signals[sig]; crm_signals[sig] = NULL; crm_trace("Destroying signal %d", sig); mainloop_destroy_trigger((crm_trigger_t *) tmp); } /*! * \internal * \brief Add a signal handler to a mainloop * * \param[in] sig Signal number to handle * \param[in] dispatch Signal handler function * * \note The true signal handler merely sets a mainloop trigger to call this * dispatch function via the mainloop. Therefore, the dispatch function * does not need to be async-safe. */ gboolean mainloop_add_signal(int sig, void (*dispatch) (int sig)) { GSource *source = NULL; int priority = G_PRIORITY_HIGH - 1; if (sig == SIGTERM) { /* TERM is higher priority than other signals, * signals are higher priority than other ipc. * Yes, minus: smaller is "higher" */ priority--; } if (sig >= NSIG || sig < 0) { crm_err("Signal %d is out of range", sig); return FALSE; } else if (crm_signals[sig] != NULL && crm_signals[sig]->handler == dispatch) { crm_trace("Signal handler for %d is already installed", sig); return TRUE; } else if (crm_signals[sig] != NULL) { crm_err("Different signal handler for %d is already installed", sig); return FALSE; } CRM_ASSERT(sizeof(crm_signal_t) > sizeof(GSource)); source = g_source_new(&crm_signal_funcs, sizeof(crm_signal_t)); crm_signals[sig] = (crm_signal_t *) mainloop_setup_trigger(source, priority, NULL, NULL); CRM_ASSERT(crm_signals[sig] != NULL); crm_signals[sig]->handler = dispatch; crm_signals[sig]->signal = sig; if (crm_signal_handler(sig, mainloop_signal_handler) == SIG_ERR) { mainloop_destroy_signal_entry(sig); return FALSE; } return TRUE; } gboolean mainloop_destroy_signal(int sig) { if (sig >= NSIG || sig < 0) { crm_err("Signal %d is out of range", sig); return FALSE; } else if (crm_signal_handler(sig, NULL) == SIG_ERR) { crm_perror(LOG_ERR, "Could not uninstall signal handler for signal %d", sig); return FALSE; } else if (crm_signals[sig] == NULL) { return TRUE; } mainloop_destroy_signal_entry(sig); return TRUE; } static qb_array_t *gio_map = NULL; void mainloop_cleanup(void) { if (gio_map) { qb_array_free(gio_map); } for (int sig = 0; sig < NSIG; ++sig) { mainloop_destroy_signal_entry(sig); } } /* * libqb... */ struct gio_to_qb_poll { int32_t is_used; guint source; int32_t events; void *data; qb_ipcs_dispatch_fn_t fn; enum qb_loop_priority p; }; static gboolean gio_read_socket(GIOChannel * gio, GIOCondition condition, gpointer data) { struct gio_to_qb_poll *adaptor = (struct gio_to_qb_poll *)data; gint fd = g_io_channel_unix_get_fd(gio); crm_trace("%p.%d %d", data, fd, condition); /* if this assert get's hit, then there is a race condition between * when we destroy a fd and when mainloop actually gives it up */ CRM_ASSERT(adaptor->is_used > 0); return (adaptor->fn(fd, condition, adaptor->data) == 0); } static void gio_poll_destroy(gpointer data) { struct gio_to_qb_poll *adaptor = (struct gio_to_qb_poll *)data; adaptor->is_used--; CRM_ASSERT(adaptor->is_used >= 0); if (adaptor->is_used == 0) { crm_trace("Marking adaptor %p unused", adaptor); adaptor->source = 0; } } /*! * \internal * \brief Convert libqb's poll priority into GLib's one * * \param[in] prio libqb's poll priority (#QB_LOOP_MED assumed as fallback) * * \return best matching GLib's priority */ static gint conv_prio_libqb2glib(enum qb_loop_priority prio) { switch (prio) { case QB_LOOP_LOW: return G_PRIORITY_LOW; case QB_LOOP_HIGH: return G_PRIORITY_HIGH; default: return G_PRIORITY_DEFAULT; // QB_LOOP_MED } } /*! * \internal * \brief Convert libqb's poll priority to rate limiting spec * * \param[in] prio libqb's poll priority (#QB_LOOP_MED assumed as fallback) * * \return best matching rate limiting spec * \note This is the inverse of libqb's qb_ipcs_request_rate_limit(). */ static enum qb_ipcs_rate_limit conv_libqb_prio2ratelimit(enum qb_loop_priority prio) { switch (prio) { case QB_LOOP_LOW: return QB_IPCS_RATE_SLOW; case QB_LOOP_HIGH: return QB_IPCS_RATE_FAST; default: return QB_IPCS_RATE_NORMAL; // QB_LOOP_MED } } static int32_t gio_poll_dispatch_update(enum qb_loop_priority p, int32_t fd, int32_t evts, void *data, qb_ipcs_dispatch_fn_t fn, int32_t add) { struct gio_to_qb_poll *adaptor; GIOChannel *channel; int32_t res = 0; res = qb_array_index(gio_map, fd, (void **)&adaptor); if (res < 0) { crm_err("Array lookup failed for fd=%d: %d", fd, res); return res; } crm_trace("Adding fd=%d to mainloop as adaptor %p", fd, adaptor); if (add && adaptor->source) { crm_err("Adaptor for descriptor %d is still in-use", fd); return -EEXIST; } if (!add && !adaptor->is_used) { crm_err("Adaptor for descriptor %d is not in-use", fd); return -ENOENT; } /* channel is created with ref_count = 1 */ channel = g_io_channel_unix_new(fd); if (!channel) { crm_err("No memory left to add fd=%d", fd); return -ENOMEM; } if (adaptor->source) { g_source_remove(adaptor->source); adaptor->source = 0; } /* Because unlike the poll() API, glib doesn't tell us about HUPs by default */ evts |= (G_IO_HUP | G_IO_NVAL | G_IO_ERR); adaptor->fn = fn; adaptor->events = evts; adaptor->data = data; adaptor->p = p; adaptor->is_used++; adaptor->source = g_io_add_watch_full(channel, conv_prio_libqb2glib(p), evts, gio_read_socket, adaptor, gio_poll_destroy); /* Now that mainloop now holds a reference to channel, * thanks to g_io_add_watch_full(), drop ours from g_io_channel_unix_new(). * * This means that channel will be free'd by: * g_main_context_dispatch() * -> g_source_destroy_internal() * -> g_source_callback_unref() * shortly after gio_poll_destroy() completes */ g_io_channel_unref(channel); crm_trace("Added to mainloop with gsource id=%d", adaptor->source); if (adaptor->source > 0) { return 0; } return -EINVAL; } static int32_t gio_poll_dispatch_add(enum qb_loop_priority p, int32_t fd, int32_t evts, void *data, qb_ipcs_dispatch_fn_t fn) { return gio_poll_dispatch_update(p, fd, evts, data, fn, QB_TRUE); } static int32_t gio_poll_dispatch_mod(enum qb_loop_priority p, int32_t fd, int32_t evts, void *data, qb_ipcs_dispatch_fn_t fn) { return gio_poll_dispatch_update(p, fd, evts, data, fn, QB_FALSE); } static int32_t gio_poll_dispatch_del(int32_t fd) { struct gio_to_qb_poll *adaptor; crm_trace("Looking for fd=%d", fd); if (qb_array_index(gio_map, fd, (void **)&adaptor) == 0) { if (adaptor->source) { g_source_remove(adaptor->source); adaptor->source = 0; } } return 0; } struct qb_ipcs_poll_handlers gio_poll_funcs = { .job_add = NULL, .dispatch_add = gio_poll_dispatch_add, .dispatch_mod = gio_poll_dispatch_mod, .dispatch_del = gio_poll_dispatch_del, }; static enum qb_ipc_type pick_ipc_type(enum qb_ipc_type requested) { const char *env = pcmk__env_option(PCMK__ENV_IPC_TYPE); if (env && strcmp("shared-mem", env) == 0) { return QB_IPC_SHM; } else if (env && strcmp("socket", env) == 0) { return QB_IPC_SOCKET; } else if (env && strcmp("posix", env) == 0) { return QB_IPC_POSIX_MQ; } else if (env && strcmp("sysv", env) == 0) { return QB_IPC_SYSV_MQ; } else if (requested == QB_IPC_NATIVE) { /* We prefer shared memory because the server never blocks on * send. If part of a message fits into the socket, libqb * needs to block until the remainder can be sent also. * Otherwise the client will wait forever for the remaining * bytes. */ return QB_IPC_SHM; } return requested; } qb_ipcs_service_t * mainloop_add_ipc_server(const char *name, enum qb_ipc_type type, struct qb_ipcs_service_handlers *callbacks) { return mainloop_add_ipc_server_with_prio(name, type, callbacks, QB_LOOP_MED); } qb_ipcs_service_t * mainloop_add_ipc_server_with_prio(const char *name, enum qb_ipc_type type, struct qb_ipcs_service_handlers *callbacks, enum qb_loop_priority prio) { int rc = 0; qb_ipcs_service_t *server = NULL; if (gio_map == NULL) { gio_map = qb_array_create_2(64, sizeof(struct gio_to_qb_poll), 1); } server = qb_ipcs_create(name, 0, pick_ipc_type(type), callbacks); if (server == NULL) { crm_err("Could not create %s IPC server: %s (%d)", name, pcmk_rc_str(errno), errno); return NULL; } if (prio != QB_LOOP_MED) { qb_ipcs_request_rate_limit(server, conv_libqb_prio2ratelimit(prio)); } /* All clients should use at least ipc_buffer_max as their buffer size */ qb_ipcs_enforce_buffer_size(server, crm_ipc_default_buffer_size()); qb_ipcs_poll_handlers_set(server, &gio_poll_funcs); rc = qb_ipcs_run(server); if (rc < 0) { crm_err("Could not start %s IPC server: %s (%d)", name, pcmk_strerror(rc), rc); return NULL; // qb_ipcs_run() destroys server on failure } return server; } void mainloop_del_ipc_server(qb_ipcs_service_t * server) { if (server) { qb_ipcs_destroy(server); } } struct mainloop_io_s { char *name; void *userdata; int fd; guint source; crm_ipc_t *ipc; GIOChannel *channel; int (*dispatch_fn_ipc) (const char *buffer, ssize_t length, gpointer userdata); int (*dispatch_fn_io) (gpointer userdata); void (*destroy_fn) (gpointer userdata); }; /*! * \internal * \brief I/O watch callback function (GIOFunc) * * \param[in] gio I/O channel being watched * \param[in] condition I/O condition satisfied * \param[in] data User data passed when source was created * * \return G_SOURCE_REMOVE to remove source, G_SOURCE_CONTINUE to keep it */ static gboolean mainloop_gio_callback(GIOChannel *gio, GIOCondition condition, gpointer data) { gboolean rc = G_SOURCE_CONTINUE; mainloop_io_t *client = data; CRM_ASSERT(client->fd == g_io_channel_unix_get_fd(gio)); if (condition & G_IO_IN) { if (client->ipc) { long read_rc = 0L; int max = 10; do { read_rc = crm_ipc_read(client->ipc); if (read_rc <= 0) { crm_trace("Could not read IPC message from %s: %s (%ld)", client->name, pcmk_strerror(read_rc), read_rc); } else if (client->dispatch_fn_ipc) { const char *buffer = crm_ipc_buffer(client->ipc); crm_trace("New %ld-byte IPC message from %s " "after I/O condition %d", read_rc, client->name, (int) condition); if (client->dispatch_fn_ipc(buffer, read_rc, client->userdata) < 0) { crm_trace("Connection to %s no longer required", client->name); rc = G_SOURCE_REMOVE; } } } while ((rc == G_SOURCE_CONTINUE) && (read_rc > 0) && --max > 0); } else { crm_trace("New I/O event for %s after I/O condition %d", client->name, (int) condition); if (client->dispatch_fn_io) { if (client->dispatch_fn_io(client->userdata) < 0) { crm_trace("Connection to %s no longer required", client->name); rc = G_SOURCE_REMOVE; } } } } if (client->ipc && !crm_ipc_connected(client->ipc)) { - crm_err("Connection to %s closed " QB_XS "client=%p condition=%d", + crm_err("Connection to %s closed " QB_XS " client=%p condition=%d", client->name, client, condition); rc = G_SOURCE_REMOVE; } else if (condition & (G_IO_HUP | G_IO_NVAL | G_IO_ERR)) { crm_trace("The connection %s[%p] has been closed (I/O condition=%d)", client->name, client, condition); rc = G_SOURCE_REMOVE; } else if ((condition & G_IO_IN) == 0) { /* #define GLIB_SYSDEF_POLLIN =1 #define GLIB_SYSDEF_POLLPRI =2 #define GLIB_SYSDEF_POLLOUT =4 #define GLIB_SYSDEF_POLLERR =8 #define GLIB_SYSDEF_POLLHUP =16 #define GLIB_SYSDEF_POLLNVAL =32 typedef enum { G_IO_IN GLIB_SYSDEF_POLLIN, G_IO_OUT GLIB_SYSDEF_POLLOUT, G_IO_PRI GLIB_SYSDEF_POLLPRI, G_IO_ERR GLIB_SYSDEF_POLLERR, G_IO_HUP GLIB_SYSDEF_POLLHUP, G_IO_NVAL GLIB_SYSDEF_POLLNVAL } GIOCondition; A bitwise combination representing a condition to watch for on an event source. G_IO_IN There is data to read. G_IO_OUT Data can be written (without blocking). G_IO_PRI There is urgent data to read. G_IO_ERR Error condition. G_IO_HUP Hung up (the connection has been broken, usually for pipes and sockets). G_IO_NVAL Invalid request. The file descriptor is not open. */ crm_err("Strange condition: %d", condition); } /* G_SOURCE_REMOVE results in mainloop_gio_destroy() being called * just before the source is removed from mainloop */ return rc; } static void mainloop_gio_destroy(gpointer c) { mainloop_io_t *client = c; char *c_name = strdup(client->name); /* client->source is valid but about to be destroyed (ref_count == 0) in gmain.c * client->channel will still have ref_count > 0... should be == 1 */ crm_trace("Destroying client %s[%p]", c_name, c); if (client->ipc) { crm_ipc_close(client->ipc); } if (client->destroy_fn) { void (*destroy_fn) (gpointer userdata) = client->destroy_fn; client->destroy_fn = NULL; destroy_fn(client->userdata); } if (client->ipc) { crm_ipc_t *ipc = client->ipc; client->ipc = NULL; crm_ipc_destroy(ipc); } crm_trace("Destroyed client %s[%p]", c_name, c); free(client->name); client->name = NULL; free(client); free(c_name); } /*! * \brief Connect to IPC and add it as a main loop source * * \param[in,out] ipc IPC connection to add * \param[in] priority Event source priority to use for connection * \param[in] userdata Data to register with callbacks * \param[in] callbacks Dispatch and destroy callbacks for connection * \param[out] source Newly allocated event source * * \return Standard Pacemaker return code * * \note On failure, the caller is still responsible for ipc. On success, the * caller should call mainloop_del_ipc_client() when source is no longer * needed, which will lead to the disconnection of the IPC later in the * main loop if it is connected. However the IPC disconnects, * mainloop_gio_destroy() will free ipc and source after calling the * destroy callback. */ int pcmk__add_mainloop_ipc(crm_ipc_t *ipc, int priority, void *userdata, const struct ipc_client_callbacks *callbacks, mainloop_io_t **source) { int rc = pcmk_rc_ok; int fd = -1; const char *ipc_name = NULL; CRM_CHECK((ipc != NULL) && (callbacks != NULL), return EINVAL); ipc_name = pcmk__s(crm_ipc_name(ipc), "Pacemaker"); rc = pcmk__connect_generic_ipc(ipc); if (rc != pcmk_rc_ok) { crm_debug("Connection to %s failed: %s", ipc_name, pcmk_rc_str(rc)); return rc; } rc = pcmk__ipc_fd(ipc, &fd); if (rc != pcmk_rc_ok) { crm_debug("Could not obtain file descriptor for %s IPC: %s", ipc_name, pcmk_rc_str(rc)); crm_ipc_close(ipc); return rc; } *source = mainloop_add_fd(ipc_name, priority, fd, userdata, NULL); if (*source == NULL) { rc = errno; crm_ipc_close(ipc); return rc; } (*source)->ipc = ipc; (*source)->destroy_fn = callbacks->destroy; (*source)->dispatch_fn_ipc = callbacks->dispatch; return pcmk_rc_ok; } /*! * \brief Get period for mainloop timer * * \param[in] timer Timer * * \return Period in ms */ guint pcmk__mainloop_timer_get_period(const mainloop_timer_t *timer) { if (timer) { return timer->period_ms; } return 0; } mainloop_io_t * mainloop_add_ipc_client(const char *name, int priority, size_t max_size, void *userdata, struct ipc_client_callbacks *callbacks) { crm_ipc_t *ipc = crm_ipc_new(name, max_size); mainloop_io_t *source = NULL; int rc = pcmk__add_mainloop_ipc(ipc, priority, userdata, callbacks, &source); if (rc != pcmk_rc_ok) { if (crm_log_level == LOG_STDOUT) { fprintf(stderr, "Connection to %s failed: %s", name, pcmk_rc_str(rc)); } crm_ipc_destroy(ipc); if (rc > 0) { errno = rc; } else { errno = ENOTCONN; } return NULL; } return source; } void mainloop_del_ipc_client(mainloop_io_t * client) { mainloop_del_fd(client); } crm_ipc_t * mainloop_get_ipc_client(mainloop_io_t * client) { if (client) { return client->ipc; } return NULL; } mainloop_io_t * mainloop_add_fd(const char *name, int priority, int fd, void *userdata, struct mainloop_fd_callbacks * callbacks) { mainloop_io_t *client = NULL; if (fd >= 0) { client = calloc(1, sizeof(mainloop_io_t)); if (client == NULL) { return NULL; } client->name = strdup(name); client->userdata = userdata; if (callbacks) { client->destroy_fn = callbacks->destroy; client->dispatch_fn_io = callbacks->dispatch; } client->fd = fd; client->channel = g_io_channel_unix_new(fd); client->source = g_io_add_watch_full(client->channel, priority, (G_IO_IN | G_IO_HUP | G_IO_NVAL | G_IO_ERR), mainloop_gio_callback, client, mainloop_gio_destroy); /* Now that mainloop now holds a reference to channel, * thanks to g_io_add_watch_full(), drop ours from g_io_channel_unix_new(). * * This means that channel will be free'd by: * g_main_context_dispatch() or g_source_remove() * -> g_source_destroy_internal() * -> g_source_callback_unref() * shortly after mainloop_gio_destroy() completes */ g_io_channel_unref(client->channel); crm_trace("Added connection %d for %s[%p].%d", client->source, client->name, client, fd); } else { errno = EINVAL; } return client; } void mainloop_del_fd(mainloop_io_t * client) { if (client != NULL) { crm_trace("Removing client %s[%p]", client->name, client); if (client->source) { /* Results in mainloop_gio_destroy() being called just * before the source is removed from mainloop */ g_source_remove(client->source); } } } static GList *child_list = NULL; pid_t mainloop_child_pid(mainloop_child_t * child) { return child->pid; } const char * mainloop_child_name(mainloop_child_t * child) { return child->desc; } int mainloop_child_timeout(mainloop_child_t * child) { return child->timeout; } void * mainloop_child_userdata(mainloop_child_t * child) { return child->privatedata; } void mainloop_clear_child_userdata(mainloop_child_t * child) { child->privatedata = NULL; } /* good function name */ static void child_free(mainloop_child_t *child) { if (child->timerid != 0) { crm_trace("Removing timer %d", child->timerid); g_source_remove(child->timerid); child->timerid = 0; } free(child->desc); free(child); } /* terrible function name */ static int child_kill_helper(mainloop_child_t *child) { int rc; if (child->flags & mainloop_leave_pid_group) { crm_debug("Kill pid %d only. leave group intact.", child->pid); rc = kill(child->pid, SIGKILL); } else { crm_debug("Kill pid %d's group", child->pid); rc = kill(-child->pid, SIGKILL); } if (rc < 0) { if (errno != ESRCH) { crm_perror(LOG_ERR, "kill(%d, KILL) failed", child->pid); } return -errno; } return 0; } static gboolean child_timeout_callback(gpointer p) { mainloop_child_t *child = p; int rc = 0; child->timerid = 0; if (child->timeout) { crm_warn("%s process (PID %d) will not die!", child->desc, (int)child->pid); return FALSE; } rc = child_kill_helper(child); if (rc == -ESRCH) { /* Nothing left to do. pid doesn't exist */ return FALSE; } child->timeout = TRUE; crm_debug("%s process (PID %d) timed out", child->desc, (int)child->pid); child->timerid = g_timeout_add(5000, child_timeout_callback, child); return FALSE; } static bool child_waitpid(mainloop_child_t *child, int flags) { int rc = 0; int core = 0; int signo = 0; int status = 0; int exitcode = 0; bool callback_needed = true; rc = waitpid(child->pid, &status, flags); if (rc == 0) { // WNOHANG in flags, and child status is not available crm_trace("Child process %d (%s) still active", child->pid, child->desc); callback_needed = false; } else if (rc != child->pid) { /* According to POSIX, possible conditions: * - child->pid was non-positive (process group or any child), * and rc is specific child * - errno ECHILD (pid does not exist or is not child) * - errno EINVAL (invalid flags) * - errno EINTR (caller interrupted by signal) * * @TODO Handle these cases more specifically. */ signo = SIGCHLD; exitcode = 1; crm_notice("Wait for child process %d (%s) interrupted: %s", child->pid, child->desc, pcmk_rc_str(errno)); } else if (WIFEXITED(status)) { exitcode = WEXITSTATUS(status); crm_trace("Child process %d (%s) exited with status %d", child->pid, child->desc, exitcode); } else if (WIFSIGNALED(status)) { signo = WTERMSIG(status); crm_trace("Child process %d (%s) exited with signal %d (%s)", child->pid, child->desc, signo, strsignal(signo)); #ifdef WCOREDUMP // AIX, SunOS, maybe others } else if (WCOREDUMP(status)) { core = 1; crm_err("Child process %d (%s) dumped core", child->pid, child->desc); #endif } else { // flags must contain WUNTRACED and/or WCONTINUED to reach this crm_trace("Child process %d (%s) stopped or continued", child->pid, child->desc); callback_needed = false; } if (callback_needed && child->callback) { child->callback(child, child->pid, core, signo, exitcode); } return callback_needed; } static void child_death_dispatch(int signal) { for (GList *iter = child_list; iter; ) { GList *saved = iter; mainloop_child_t *child = iter->data; iter = iter->next; if (child_waitpid(child, WNOHANG)) { crm_trace("Removing completed process %d from child list", child->pid); child_list = g_list_remove_link(child_list, saved); g_list_free(saved); child_free(child); } } } static gboolean child_signal_init(gpointer p) { crm_trace("Installed SIGCHLD handler"); /* Do NOT use g_child_watch_add() and friends, they rely on pthreads */ mainloop_add_signal(SIGCHLD, child_death_dispatch); /* In case they terminated before the signal handler was installed */ child_death_dispatch(SIGCHLD); return FALSE; } gboolean mainloop_child_kill(pid_t pid) { GList *iter; mainloop_child_t *child = NULL; mainloop_child_t *match = NULL; /* It is impossible to block SIGKILL, this allows us to * call waitpid without WNOHANG flag.*/ int waitflags = 0, rc = 0; for (iter = child_list; iter != NULL && match == NULL; iter = iter->next) { child = iter->data; if (pid == child->pid) { match = child; } } if (match == NULL) { return FALSE; } rc = child_kill_helper(match); if(rc == -ESRCH) { /* It's gone, but hasn't shown up in waitpid() yet. Wait until we get * SIGCHLD and let handler clean it up as normal (so we get the correct * return code/status). The blocking alternative would be to call * child_waitpid(match, 0). */ crm_trace("Waiting for signal that child process %d completed", match->pid); return TRUE; } else if(rc != 0) { /* If KILL for some other reason set the WNOHANG flag since we * can't be certain what happened. */ waitflags = WNOHANG; } if (!child_waitpid(match, waitflags)) { /* not much we can do if this occurs */ return FALSE; } child_list = g_list_remove(child_list, match); child_free(match); return TRUE; } /* Create/Log a new tracked process * To track a process group, use -pid * * @TODO Using a non-positive pid (i.e. any child, or process group) would * likely not be useful since we will free the child after the first * completed process. */ void mainloop_child_add_with_flags(pid_t pid, int timeout, const char *desc, void *privatedata, enum mainloop_child_flags flags, void (*callback) (mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode)) { static bool need_init = TRUE; mainloop_child_t *child = pcmk__assert_alloc(1, sizeof(mainloop_child_t)); child->pid = pid; child->timerid = 0; child->timeout = FALSE; child->privatedata = privatedata; child->callback = callback; child->flags = flags; child->desc = pcmk__str_copy(desc); if (timeout) { child->timerid = g_timeout_add(timeout, child_timeout_callback, child); } child_list = g_list_append(child_list, child); if(need_init) { need_init = FALSE; /* SIGCHLD processing has to be invoked from mainloop. * We do not want it to be possible to both add a child pid * to mainloop, and have the pid's exit callback invoked within * the same callstack. */ g_timeout_add(1, child_signal_init, NULL); } } void mainloop_child_add(pid_t pid, int timeout, const char *desc, void *privatedata, void (*callback) (mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode)) { mainloop_child_add_with_flags(pid, timeout, desc, privatedata, 0, callback); } static gboolean mainloop_timer_cb(gpointer user_data) { int id = 0; bool repeat = FALSE; struct mainloop_timer_s *t = user_data; CRM_ASSERT(t != NULL); id = t->id; t->id = 0; /* Ensure it's unset during callbacks so that * mainloop_timer_running() works as expected */ if(t->cb) { crm_trace("Invoking callbacks for timer %s", t->name); repeat = t->repeat; if(t->cb(t->userdata) == FALSE) { crm_trace("Timer %s complete", t->name); repeat = FALSE; } } if(repeat) { /* Restore if repeating */ t->id = id; } return repeat; } bool mainloop_timer_running(mainloop_timer_t *t) { if(t && t->id != 0) { return TRUE; } return FALSE; } void mainloop_timer_start(mainloop_timer_t *t) { mainloop_timer_stop(t); if(t && t->period_ms > 0) { crm_trace("Starting timer %s", t->name); t->id = g_timeout_add(t->period_ms, mainloop_timer_cb, t); } } void mainloop_timer_stop(mainloop_timer_t *t) { if(t && t->id != 0) { crm_trace("Stopping timer %s", t->name); g_source_remove(t->id); t->id = 0; } } guint mainloop_timer_set_period(mainloop_timer_t *t, guint period_ms) { guint last = 0; if(t) { last = t->period_ms; t->period_ms = period_ms; } if(t && t->id != 0 && last != t->period_ms) { mainloop_timer_start(t); } return last; } mainloop_timer_t * mainloop_timer_add(const char *name, guint period_ms, bool repeat, GSourceFunc cb, void *userdata) { mainloop_timer_t *t = pcmk__assert_alloc(1, sizeof(mainloop_timer_t)); if (name != NULL) { t->name = crm_strdup_printf("%s-%u-%d", name, period_ms, repeat); } else { t->name = crm_strdup_printf("%p-%u-%d", t, period_ms, repeat); } t->id = 0; t->period_ms = period_ms; t->repeat = repeat; t->cb = cb; t->userdata = userdata; crm_trace("Created timer %s with %p %p", t->name, userdata, t->userdata); return t; } void mainloop_timer_del(mainloop_timer_t *t) { if(t) { crm_trace("Destroying timer %s", t->name); mainloop_timer_stop(t); free(t->name); free(t); } } /* * Helpers to make sure certain events aren't lost at shutdown */ static gboolean drain_timeout_cb(gpointer user_data) { bool *timeout_popped = (bool*) user_data; *timeout_popped = TRUE; return FALSE; } /*! * \brief Drain some remaining main loop events then quit it * * \param[in,out] mloop Main loop to drain and quit * \param[in] n Drain up to this many pending events */ void pcmk_quit_main_loop(GMainLoop *mloop, unsigned int n) { if ((mloop != NULL) && g_main_loop_is_running(mloop)) { GMainContext *ctx = g_main_loop_get_context(mloop); /* Drain up to n events in case some memory clean-up is pending * (helpful to reduce noise in valgrind output). */ for (int i = 0; (i < n) && g_main_context_pending(ctx); ++i) { g_main_context_dispatch(ctx); } g_main_loop_quit(mloop); } } /*! * \brief Process main loop events while a certain condition is met * * \param[in,out] mloop Main loop to process * \param[in] timer_ms Don't process longer than this amount of time * \param[in] check Function that returns true if events should be * processed * * \note This function is intended to be called at shutdown if certain important * events should not be missed. The caller would likely quit the main loop * or exit after calling this function. The check() function will be * passed the remaining timeout in milliseconds. */ void pcmk_drain_main_loop(GMainLoop *mloop, guint timer_ms, bool (*check)(guint)) { bool timeout_popped = FALSE; guint timer = 0; GMainContext *ctx = NULL; CRM_CHECK(mloop && check, return); ctx = g_main_loop_get_context(mloop); if (ctx) { time_t start_time = time(NULL); timer = g_timeout_add(timer_ms, drain_timeout_cb, &timeout_popped); while (!timeout_popped && check(timer_ms - (time(NULL) - start_time) * 1000)) { g_main_context_iteration(ctx, TRUE); } } if (!timeout_popped && (timer > 0)) { g_source_remove(timer); } } diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c index 70f15d8d0a..459ae29787 100644 --- a/lib/fencing/st_rhcs.c +++ b/lib/fencing/st_rhcs.c @@ -1,330 +1,330 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include "fencing_private.h" #define RH_STONITH_PREFIX "fence_" /*! * \internal * \brief Add available RHCS-compatible agents to a list * * \param[in,out] List to add to * * \return Number of agents added */ int stonith__list_rhcs_agents(stonith_key_value_t **devices) { // Essentially: ls -1 @sbin_dir@/fence_* int count = 0, i; struct dirent **namelist; const int file_num = scandir(PCMK__FENCE_BINDIR, &namelist, 0, alphasort); #if _POSIX_C_SOURCE < 200809L && !(defined(O_SEARCH) || defined(O_PATH)) char buffer[FILENAME_MAX + 1]; #elif defined(O_SEARCH) const int dirfd = open(PCMK__FENCE_BINDIR, O_SEARCH); #else const int dirfd = open(PCMK__FENCE_BINDIR, O_PATH); #endif for (i = 0; i < file_num; i++) { struct stat prop; if (pcmk__starts_with(namelist[i]->d_name, RH_STONITH_PREFIX)) { #if _POSIX_C_SOURCE < 200809L && !(defined(O_SEARCH) || defined(O_PATH)) snprintf(buffer, sizeof(buffer), "%s/%s", PCMK__FENCE_BINDIR, namelist[i]->d_name); if (stat(buffer, &prop) == 0 && S_ISREG(prop.st_mode)) { #else if (dirfd == -1) { if (i == 0) { - crm_notice("Problem with listing %s directory" - QB_XS "errno=%d", RH_STONITH_PREFIX, errno); + crm_notice("Problem with listing %s directory " + QB_XS " errno=%d", RH_STONITH_PREFIX, errno); } free(namelist[i]); continue; } /* note: we can possibly prevent following symlinks here, which may be a good idea, but fall on the nose when these agents are moved elsewhere & linked back */ if (fstatat(dirfd, namelist[i]->d_name, &prop, 0) == 0 && S_ISREG(prop.st_mode)) { #endif *devices = stonith_key_value_add(*devices, NULL, namelist[i]->d_name); count++; } } free(namelist[i]); } if (file_num > 0) { free(namelist); } #if _POSIX_C_SOURCE >= 200809L || defined(O_SEARCH) || defined(O_PATH) if (dirfd >= 0) { close(dirfd); } #endif return count; } static void stonith_rhcs_parameter_not_required(xmlNode *metadata, const char *parameter) { char *xpath = NULL; xmlXPathObject *xpathObj = NULL; CRM_CHECK(metadata != NULL, return); CRM_CHECK(parameter != NULL, return); xpath = crm_strdup_printf("//" PCMK_XE_PARAMETER "[@" PCMK_XA_NAME "='%s']", parameter); /* Fudge metadata so that the parameter isn't required in config * Pacemaker handles and adds it */ xpathObj = xpath_search(metadata, xpath); if (numXpathResults(xpathObj) > 0) { xmlNode *tmp = getXpathResult(xpathObj, 0); crm_xml_add(tmp, "required", "0"); } freeXpathObject(xpathObj); free(xpath); } /*! * \brief Execute RHCS-compatible agent's metadata action * * \param[in] agent Agent to execute * \param[in] timeout_sec Action timeout * \param[out] metadata Where to store output xmlNode (or NULL to ignore) */ static int stonith__rhcs_get_metadata(const char *agent, int timeout_sec, xmlNode **metadata) { xmlNode *xml = NULL; xmlNode *actions = NULL; xmlXPathObject *xpathObj = NULL; stonith_action_t *action = stonith__action_create(agent, PCMK_ACTION_METADATA, NULL, 0, timeout_sec, NULL, NULL, NULL); int rc = stonith__execute(action); pcmk__action_result_t *result = stonith__action_result(action); if (result == NULL) { if (rc < 0) { crm_warn("Could not execute metadata action for %s: %s " QB_XS " rc=%d", agent, pcmk_strerror(rc), rc); } stonith__destroy_action(action); return rc; } if (result->execution_status != PCMK_EXEC_DONE) { crm_warn("Could not execute metadata action for %s: %s", agent, pcmk_exec_status_str(result->execution_status)); rc = pcmk_rc2legacy(stonith__result2rc(result)); stonith__destroy_action(action); return rc; } if (!pcmk__result_ok(result)) { crm_warn("Metadata action for %s returned error code %d", agent, result->exit_status); rc = pcmk_rc2legacy(stonith__result2rc(result)); stonith__destroy_action(action); return rc; } if (result->action_stdout == NULL) { crm_warn("Metadata action for %s returned no data", agent); stonith__destroy_action(action); return -ENODATA; } xml = pcmk__xml_parse(result->action_stdout); stonith__destroy_action(action); if (xml == NULL) { crm_warn("Metadata for %s is invalid", agent); return -pcmk_err_schema_validation; } xpathObj = xpath_search(xml, "//" PCMK_XE_ACTIONS); if (numXpathResults(xpathObj) > 0) { actions = getXpathResult(xpathObj, 0); } freeXpathObject(xpathObj); // Add start and stop (implemented by pacemaker, not agent) to meta-data xpathObj = xpath_search(xml, "//" PCMK_XE_ACTION "[@" PCMK_XA_NAME "='" PCMK_ACTION_STOP "']"); if (numXpathResults(xpathObj) <= 0) { xmlNode *tmp = NULL; const char *timeout_str = NULL; timeout_str = pcmk__readable_interval(PCMK_DEFAULT_ACTION_TIMEOUT_MS); tmp = pcmk__xe_create(actions, PCMK_XE_ACTION); crm_xml_add(tmp, PCMK_XA_NAME, PCMK_ACTION_STOP); crm_xml_add(tmp, PCMK_META_TIMEOUT, timeout_str); tmp = pcmk__xe_create(actions, PCMK_XE_ACTION); crm_xml_add(tmp, PCMK_XA_NAME, PCMK_ACTION_START); crm_xml_add(tmp, PCMK_META_TIMEOUT, timeout_str); } freeXpathObject(xpathObj); // Fudge metadata so parameters are not required in config (pacemaker adds them) stonith_rhcs_parameter_not_required(xml, STONITH_ATTR_ACTION_OP); stonith_rhcs_parameter_not_required(xml, "plug"); stonith_rhcs_parameter_not_required(xml, "port"); if (metadata) { *metadata = xml; } else { pcmk__xml_free(xml); } return pcmk_ok; } /*! * \brief Retrieve metadata for RHCS-compatible fence agent * * \param[in] agent Agent to execute * \param[in] timeout_sec Action timeout * \param[out] output Where to store action output (or NULL to ignore) */ int stonith__rhcs_metadata(const char *agent, int timeout_sec, char **output) { GString *buffer = NULL; xmlNode *xml = NULL; int rc = stonith__rhcs_get_metadata(agent, timeout_sec, &xml); if (rc != pcmk_ok) { goto done; } buffer = g_string_sized_new(1024); pcmk__xml_string(xml, pcmk__xml_fmt_pretty|pcmk__xml_fmt_text, buffer, 0); if (pcmk__str_empty(buffer->str)) { rc = -pcmk_err_schema_validation; goto done; } if (output != NULL) { pcmk__str_update(output, buffer->str); } done: if (buffer != NULL) { g_string_free(buffer, TRUE); } pcmk__xml_free(xml); return rc; } bool stonith__agent_is_rhcs(const char *agent) { struct stat prop; char *buffer = crm_strdup_printf(PCMK__FENCE_BINDIR "/%s", agent); int rc = stat(buffer, &prop); free(buffer); return (rc >= 0) && S_ISREG(prop.st_mode); } int stonith__rhcs_validate(stonith_t *st, int call_options, const char *target, const char *agent, GHashTable *params, const char * host_arg, int timeout, char **output, char **error_output) { int rc = pcmk_ok; int remaining_timeout = timeout; xmlNode *metadata = NULL; stonith_action_t *action = NULL; pcmk__action_result_t *result = NULL; if (host_arg == NULL) { time_t start_time = time(NULL); rc = stonith__rhcs_get_metadata(agent, remaining_timeout, &metadata); if (rc == pcmk_ok) { uint32_t device_flags = 0; stonith__device_parameter_flags(&device_flags, agent, metadata); if (pcmk_is_set(device_flags, st_device_supports_parameter_port)) { host_arg = "port"; } else if (pcmk_is_set(device_flags, st_device_supports_parameter_plug)) { host_arg = "plug"; } } pcmk__xml_free(metadata); remaining_timeout -= time(NULL) - start_time; if (rc == -ETIME || remaining_timeout <= 0 ) { return -ETIME; } } else if (pcmk__str_eq(host_arg, PCMK_VALUE_NONE, pcmk__str_casei)) { host_arg = NULL; } action = stonith__action_create(agent, PCMK_ACTION_VALIDATE_ALL, target, 0, remaining_timeout, params, NULL, host_arg); rc = stonith__execute(action); result = stonith__action_result(action); if (result != NULL) { rc = pcmk_rc2legacy(stonith__result2rc(result)); // Take ownership of output so stonith__destroy_action() doesn't free it if (output != NULL) { *output = result->action_stdout; result->action_stdout = NULL; } if (error_output != NULL) { *error_output = result->action_stderr; result->action_stderr = NULL; } } stonith__destroy_action(action); return rc; } diff --git a/lib/pacemaker/pcmk_sched_resource.c b/lib/pacemaker/pcmk_sched_resource.c index d016857b6c..7b008d3cf9 100644 --- a/lib/pacemaker/pcmk_sched_resource.c +++ b/lib/pacemaker/pcmk_sched_resource.c @@ -1,800 +1,800 @@ /* * Copyright 2014-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include "libpacemaker_private.h" // Resource assignment methods by resource variant static pcmk__assignment_methods_t assignment_methods[] = { { pcmk__primitive_assign, pcmk__primitive_create_actions, pcmk__probe_rsc_on_node, pcmk__primitive_internal_constraints, pcmk__primitive_apply_coloc_score, pcmk__colocated_resources, pcmk__with_primitive_colocations, pcmk__primitive_with_colocations, pcmk__add_colocated_node_scores, pcmk__apply_location, pcmk__primitive_action_flags, pcmk__update_ordered_actions, pcmk__output_resource_actions, pcmk__add_rsc_actions_to_graph, pcmk__primitive_add_graph_meta, pcmk__primitive_add_utilization, pcmk__primitive_shutdown_lock, }, { pcmk__group_assign, pcmk__group_create_actions, pcmk__probe_rsc_on_node, pcmk__group_internal_constraints, pcmk__group_apply_coloc_score, pcmk__group_colocated_resources, pcmk__with_group_colocations, pcmk__group_with_colocations, pcmk__group_add_colocated_node_scores, pcmk__group_apply_location, pcmk__group_action_flags, pcmk__group_update_ordered_actions, pcmk__output_resource_actions, pcmk__add_rsc_actions_to_graph, pcmk__noop_add_graph_meta, pcmk__group_add_utilization, pcmk__group_shutdown_lock, }, { pcmk__clone_assign, pcmk__clone_create_actions, pcmk__clone_create_probe, pcmk__clone_internal_constraints, pcmk__clone_apply_coloc_score, pcmk__colocated_resources, pcmk__with_clone_colocations, pcmk__clone_with_colocations, pcmk__add_colocated_node_scores, pcmk__clone_apply_location, pcmk__clone_action_flags, pcmk__instance_update_ordered_actions, pcmk__output_resource_actions, pcmk__clone_add_actions_to_graph, pcmk__clone_add_graph_meta, pcmk__clone_add_utilization, pcmk__clone_shutdown_lock, }, { pcmk__bundle_assign, pcmk__bundle_create_actions, pcmk__bundle_create_probe, pcmk__bundle_internal_constraints, pcmk__bundle_apply_coloc_score, pcmk__colocated_resources, pcmk__with_bundle_colocations, pcmk__bundle_with_colocations, pcmk__add_colocated_node_scores, pcmk__bundle_apply_location, pcmk__bundle_action_flags, pcmk__instance_update_ordered_actions, pcmk__output_bundle_actions, pcmk__bundle_add_actions_to_graph, pcmk__noop_add_graph_meta, pcmk__bundle_add_utilization, pcmk__bundle_shutdown_lock, } }; /*! * \internal * \brief Check whether a resource's agent standard, provider, or type changed * * \param[in,out] rsc Resource to check * \param[in,out] node Node needing unfencing if agent changed * \param[in] rsc_entry XML with previously known agent information * \param[in] active_on_node Whether \p rsc is active on \p node * * \return true if agent for \p rsc changed, otherwise false */ bool pcmk__rsc_agent_changed(pcmk_resource_t *rsc, pcmk_node_t *node, const xmlNode *rsc_entry, bool active_on_node) { bool changed = false; const char *attr_list[] = { PCMK_XA_TYPE, PCMK_XA_CLASS, PCMK_XA_PROVIDER, }; for (int i = 0; i < PCMK__NELEM(attr_list); i++) { const char *value = crm_element_value(rsc->priv->xml, attr_list[i]); const char *old_value = crm_element_value(rsc_entry, attr_list[i]); if (!pcmk__str_eq(value, old_value, pcmk__str_none)) { changed = true; trigger_unfencing(rsc, node, "Device definition changed", NULL, rsc->priv->scheduler); if (active_on_node) { crm_notice("Forcing restart of %s on %s " "because %s changed from '%s' to '%s'", rsc->id, pcmk__node_name(node), attr_list[i], pcmk__s(old_value, ""), pcmk__s(value, "")); } } } if (changed && active_on_node) { // Make sure the resource is restarted custom_action(rsc, stop_key(rsc), PCMK_ACTION_STOP, node, FALSE, rsc->priv->scheduler); pcmk__set_rsc_flags(rsc, pcmk__rsc_start_pending); } return changed; } /*! * \internal * \brief Add resource (and any matching children) to list if it matches ID * * \param[in] result List to add resource to * \param[in] rsc Resource to check * \param[in] id ID to match * * \return (Possibly new) head of list */ static GList * add_rsc_if_matching(GList *result, pcmk_resource_t *rsc, const char *id) { if (pcmk__str_eq(id, rsc->id, pcmk__str_none) || pcmk__str_eq(id, rsc->priv->history_id, pcmk__str_none)) { result = g_list_prepend(result, rsc); } for (GList *iter = rsc->priv->children; iter != NULL; iter = iter->next) { pcmk_resource_t *child = (pcmk_resource_t *) iter->data; result = add_rsc_if_matching(result, child, id); } return result; } /*! * \internal * \brief Find all resources matching a given ID by either ID or clone name * * \param[in] id Resource ID to check * \param[in] scheduler Scheduler data * * \return List of all resources that match \p id * \note The caller is responsible for freeing the return value with * g_list_free(). */ GList * pcmk__rscs_matching_id(const char *id, const pcmk_scheduler_t *scheduler) { GList *result = NULL; CRM_CHECK((id != NULL) && (scheduler != NULL), return NULL); for (GList *iter = scheduler->priv->resources; iter != NULL; iter = iter->next) { result = add_rsc_if_matching(result, (pcmk_resource_t *) iter->data, id); } return result; } /*! * \internal * \brief Set the variant-appropriate assignment methods for a resource * * \param[in,out] data Resource to set assignment methods for * \param[in] user_data Ignored */ static void set_assignment_methods_for_rsc(gpointer data, gpointer user_data) { pcmk_resource_t *rsc = data; rsc->priv->cmds = &assignment_methods[rsc->priv->variant]; g_list_foreach(rsc->priv->children, set_assignment_methods_for_rsc, NULL); } /*! * \internal * \brief Set the variant-appropriate assignment methods for all resources * * \param[in,out] scheduler Scheduler data */ void pcmk__set_assignment_methods(pcmk_scheduler_t *scheduler) { g_list_foreach(scheduler->priv->resources, set_assignment_methods_for_rsc, NULL); } /*! * \internal * \brief Wrapper for colocated_resources() method for readability * * \param[in] rsc Resource to add to colocated list * \param[in] orig_rsc Resource originally requested * \param[in,out] list Pointer to list to add to * * \return (Possibly new) head of list */ static inline void add_colocated_resources(const pcmk_resource_t *rsc, const pcmk_resource_t *orig_rsc, GList **list) { *list = rsc->priv->cmds->colocated_resources(rsc, orig_rsc, *list); } // Shared implementation of pcmk__assignment_methods_t:colocated_resources() GList * pcmk__colocated_resources(const pcmk_resource_t *rsc, const pcmk_resource_t *orig_rsc, GList *colocated_rscs) { const GList *iter = NULL; GList *colocations = NULL; if (orig_rsc == NULL) { orig_rsc = rsc; } if ((rsc == NULL) || (g_list_find(colocated_rscs, rsc) != NULL)) { return colocated_rscs; } pcmk__rsc_trace(orig_rsc, "%s is in colocation chain with %s", rsc->id, orig_rsc->id); colocated_rscs = g_list_prepend(colocated_rscs, (gpointer) rsc); // Follow colocations where this resource is the dependent resource colocations = pcmk__this_with_colocations(rsc); for (iter = colocations; iter != NULL; iter = iter->next) { const pcmk__colocation_t *constraint = iter->data; const pcmk_resource_t *primary = constraint->primary; if (primary == orig_rsc) { continue; // Break colocation loop } if ((constraint->score == PCMK_SCORE_INFINITY) && (pcmk__colocation_affects(rsc, primary, constraint, true) == pcmk__coloc_affects_location)) { add_colocated_resources(primary, orig_rsc, &colocated_rscs); } } g_list_free(colocations); // Follow colocations where this resource is the primary resource colocations = pcmk__with_this_colocations(rsc); for (iter = colocations; iter != NULL; iter = iter->next) { const pcmk__colocation_t *constraint = iter->data; const pcmk_resource_t *dependent = constraint->dependent; if (dependent == orig_rsc) { continue; // Break colocation loop } if (pcmk__is_clone(rsc) && !pcmk__is_clone(dependent)) { continue; // We can't be sure whether dependent will be colocated } if ((constraint->score == PCMK_SCORE_INFINITY) && (pcmk__colocation_affects(dependent, rsc, constraint, true) == pcmk__coloc_affects_location)) { add_colocated_resources(dependent, orig_rsc, &colocated_rscs); } } g_list_free(colocations); return colocated_rscs; } // No-op function for variants that don't need to implement add_graph_meta() void pcmk__noop_add_graph_meta(const pcmk_resource_t *rsc, xmlNode *xml) { } /*! * \internal * \brief Output a summary of scheduled actions for a resource * * \param[in,out] rsc Resource to output actions for */ void pcmk__output_resource_actions(pcmk_resource_t *rsc) { pcmk_node_t *next = NULL; pcmk_node_t *current = NULL; pcmk__output_t *out = NULL; CRM_ASSERT(rsc != NULL); out = rsc->priv->scheduler->priv->out; if (rsc->priv->children != NULL) { for (GList *iter = rsc->priv->children; iter != NULL; iter = iter->next) { pcmk_resource_t *child = (pcmk_resource_t *) iter->data; child->priv->cmds->output_actions(child); } return; } next = rsc->priv->assigned_node; if (rsc->priv->active_nodes != NULL) { current = pcmk__current_node(rsc); if (rsc->priv->orig_role == pcmk_role_stopped) { /* This can occur when resources are being recovered because * the current role can change in pcmk__primitive_create_actions() */ rsc->priv->orig_role = pcmk_role_started; } } if ((current == NULL) && pcmk_is_set(rsc->flags, pcmk__rsc_removed)) { /* Don't log stopped orphans */ return; } out->message(out, "rsc-action", rsc, current, next); } /*! * \internal * \brief Add a resource to a node's list of assigned resources * * \param[in,out] node Node to add resource to * \param[in] rsc Resource to add */ static inline void add_assigned_resource(pcmk_node_t *node, pcmk_resource_t *rsc) { node->priv->assigned_resources = g_list_prepend(node->priv->assigned_resources, rsc); } /*! * \internal * \brief Assign a specified resource (of any variant) to a node * * Assign a specified resource and its children (if any) to a specified node, if * the node can run the resource (or unconditionally, if \p force is true). Mark * the resources as no longer provisional. * * If a resource can't be assigned (or \p node is \c NULL), unassign any * previous assignment. If \p stop_if_fail is \c true, set next role to stopped * and update any existing actions scheduled for the resource. * * \param[in,out] rsc Resource to assign * \param[in,out] node Node to assign \p rsc to * \param[in] force If true, assign to \p node even if unavailable * \param[in] stop_if_fail If \c true and either \p rsc can't be assigned * or \p chosen is \c NULL, set next role to * stopped and update existing actions (if \p rsc * is not a primitive, this applies to its * primitive descendants instead) * * \return \c true if the assignment of \p rsc changed, or \c false otherwise * * \note Assigning a resource to the NULL node using this function is different * from calling pcmk__unassign_resource(), in that it may also update any * actions created for the resource. * \note The \c pcmk__assignment_methods_t:assign() method is preferred, unless * a resource should be assigned to the \c NULL node or every resource in * a tree should be assigned to the same node. * \note If \p stop_if_fail is \c false, then \c pcmk__unassign_resource() can * completely undo the assignment. A successful assignment can be either * undone or left alone as final. A failed assignment has the same effect * as calling pcmk__unassign_resource(); there are no side effects on * roles or actions. */ bool pcmk__assign_resource(pcmk_resource_t *rsc, pcmk_node_t *node, bool force, bool stop_if_fail) { bool changed = false; pcmk_scheduler_t *scheduler = NULL; CRM_ASSERT(rsc != NULL); scheduler = rsc->priv->scheduler; if (rsc->priv->children != NULL) { for (GList *iter = rsc->priv->children; iter != NULL; iter = iter->next) { pcmk_resource_t *child_rsc = iter->data; changed |= pcmk__assign_resource(child_rsc, node, force, stop_if_fail); } return changed; } // Assigning a primitive if (!force && (node != NULL) && ((node->assign->score < 0) // Allow graph to assume that guest node connections will come up || (!pcmk__node_available(node, true, false) && !pcmk__is_guest_or_bundle_node(node)))) { pcmk__rsc_debug(rsc, "All nodes for resource %s are unavailable, unclean or " "shutting down (%s can%s run resources, with score %s)", rsc->id, pcmk__node_name(node), (pcmk__node_available(node, true, false)? "" : "not"), pcmk_readable_score(node->assign->score)); if (stop_if_fail) { pe__set_next_role(rsc, pcmk_role_stopped, "node availability"); } node = NULL; } if (rsc->priv->assigned_node != NULL) { changed = !pcmk__same_node(rsc->priv->assigned_node, node); } else { changed = (node != NULL); } pcmk__unassign_resource(rsc); pcmk__clear_rsc_flags(rsc, pcmk__rsc_unassigned); if (node == NULL) { char *rc_stopped = NULL; pcmk__rsc_debug(rsc, "Could not assign %s to a node", rsc->id); if (!stop_if_fail) { return changed; } pe__set_next_role(rsc, pcmk_role_stopped, "unable to assign"); for (GList *iter = rsc->priv->actions; iter != NULL; iter = iter->next) { pcmk_action_t *op = (pcmk_action_t *) iter->data; pcmk__rsc_debug(rsc, "Updating %s for %s assignment failure", op->uuid, rsc->id); if (pcmk__str_eq(op->task, PCMK_ACTION_STOP, pcmk__str_none)) { pcmk__clear_action_flags(op, pcmk__action_optional); } else if (pcmk__str_eq(op->task, PCMK_ACTION_START, pcmk__str_none)) { pcmk__clear_action_flags(op, pcmk__action_runnable); } else { // Cancel recurring actions, unless for stopped state const char *interval_ms_s = NULL; const char *target_rc_s = NULL; interval_ms_s = g_hash_table_lookup(op->meta, PCMK_META_INTERVAL); target_rc_s = g_hash_table_lookup(op->meta, PCMK__META_OP_TARGET_RC); if (rc_stopped == NULL) { rc_stopped = pcmk__itoa(PCMK_OCF_NOT_RUNNING); } if (!pcmk__str_eq(interval_ms_s, "0", pcmk__str_null_matches) && !pcmk__str_eq(rc_stopped, target_rc_s, pcmk__str_none)) { pcmk__clear_action_flags(op, pcmk__action_runnable); } } } free(rc_stopped); return changed; } pcmk__rsc_debug(rsc, "Assigning %s to %s", rsc->id, pcmk__node_name(node)); rsc->priv->assigned_node = pe__copy_node(node); add_assigned_resource(node, rsc); node->priv->num_resources++; node->assign->count++; pcmk__consume_node_capacity(node->priv->utilization, rsc); if (pcmk_is_set(scheduler->flags, pcmk__sched_show_utilization)) { pcmk__output_t *out = scheduler->priv->out; out->message(out, "resource-util", rsc, node, __func__); } return changed; } /*! * \internal * \brief Remove any node assignment from a specified resource and its children * * If a specified resource has been assigned to a node, remove that assignment * and mark the resource as provisional again. * * \param[in,out] rsc Resource to unassign * * \note This function is called recursively on \p rsc and its children. */ void pcmk__unassign_resource(pcmk_resource_t *rsc) { pcmk_node_t *old = rsc->priv->assigned_node; if (old == NULL) { crm_info("Unassigning %s", rsc->id); } else { crm_info("Unassigning %s from %s", rsc->id, pcmk__node_name(old)); } pcmk__set_rsc_flags(rsc, pcmk__rsc_unassigned); if (rsc->priv->children == NULL) { if (old == NULL) { return; } rsc->priv->assigned_node = NULL; /* We're going to free the pcmk_node_t, but its details member is shared * and will remain, so update that appropriately first. */ old->priv->assigned_resources = g_list_remove(old->priv->assigned_resources, rsc); old->priv->num_resources--; pcmk__release_node_capacity(old->priv->utilization, rsc); free(old); return; } for (GList *iter = rsc->priv->children; iter != NULL; iter = iter->next) { pcmk__unassign_resource((pcmk_resource_t *) iter->data); } } /*! * \internal * \brief Check whether a resource has reached its migration threshold on a node * * \param[in,out] rsc Resource to check * \param[in] node Node to check * \param[out] failed If threshold has been reached, this will be set to * resource that failed (possibly a parent of \p rsc) * * \return true if the migration threshold has been reached, false otherwise */ bool pcmk__threshold_reached(pcmk_resource_t *rsc, const pcmk_node_t *node, pcmk_resource_t **failed) { int fail_count, remaining_tries; pcmk_resource_t *rsc_to_ban = rsc; // Migration threshold of 0 means never force away if (rsc->priv->ban_after_failures == 0) { return false; } // If we're ignoring failures, also ignore the migration threshold if (pcmk_is_set(rsc->flags, pcmk__rsc_ignore_failure)) { return false; } // If there are no failures, there's no need to force away fail_count = pe_get_failcount(node, rsc, NULL, pcmk__fc_effective|pcmk__fc_launched, NULL); if (fail_count <= 0) { return false; } // If failed resource is anonymous clone instance, we'll force clone away if (!pcmk_is_set(rsc->flags, pcmk__rsc_unique)) { rsc_to_ban = uber_parent(rsc); } // How many more times recovery will be tried on this node remaining_tries = rsc->priv->ban_after_failures - fail_count; if (remaining_tries <= 0) { pcmk__sched_warn(rsc->priv->scheduler, "%s cannot run on %s due to reaching migration " - "threshold (clean up resource to allow again)" + "threshold (clean up resource to allow again) " QB_XS " failures=%d " PCMK_META_MIGRATION_THRESHOLD "=%d", rsc_to_ban->id, pcmk__node_name(node), fail_count, rsc->priv->ban_after_failures); if (failed != NULL) { *failed = rsc_to_ban; } return true; } crm_info("%s can fail %d more time%s on " "%s before reaching migration threshold (%d)", rsc_to_ban->id, remaining_tries, pcmk__plural_s(remaining_tries), pcmk__node_name(node), rsc->priv->ban_after_failures); return false; } /*! * \internal * \brief Get a node's score * * \param[in] node Node with ID to check * \param[in] nodes List of nodes to look for \p node score in * * \return Node's score, or -INFINITY if not found */ static int get_node_score(const pcmk_node_t *node, GHashTable *nodes) { pcmk_node_t *found_node = NULL; if ((node != NULL) && (nodes != NULL)) { found_node = g_hash_table_lookup(nodes, node->priv->id); } if (found_node == NULL) { return -PCMK_SCORE_INFINITY; } return found_node->assign->score; } /*! * \internal * \brief Compare two resources according to which should be assigned first * * \param[in] a First resource to compare * \param[in] b Second resource to compare * \param[in] data Sorted list of all nodes in cluster * * \return -1 if \p a should be assigned before \b, 0 if they are equal, * or +1 if \p a should be assigned after \b */ static gint cmp_resources(gconstpointer a, gconstpointer b, gpointer data) { /* GLib insists that this function require gconstpointer arguments, but we * make a small, temporary change to each argument (setting the * pe_rsc_merging flag) during comparison */ pcmk_resource_t *resource1 = (pcmk_resource_t *) a; pcmk_resource_t *resource2 = (pcmk_resource_t *) b; const GList *nodes = data; int rc = 0; int r1_score = -PCMK_SCORE_INFINITY; int r2_score = -PCMK_SCORE_INFINITY; pcmk_node_t *r1_node = NULL; pcmk_node_t *r2_node = NULL; GHashTable *r1_nodes = NULL; GHashTable *r2_nodes = NULL; const char *reason = NULL; // Resources with highest priority should be assigned first reason = "priority"; r1_score = resource1->priv->priority; r2_score = resource2->priv->priority; if (r1_score > r2_score) { rc = -1; goto done; } if (r1_score < r2_score) { rc = 1; goto done; } // We need nodes to make any other useful comparisons reason = "no node list"; if (nodes == NULL) { goto done; } // Calculate and log node scores resource1->priv->cmds->add_colocated_node_scores(resource1, NULL, resource1->id, &r1_nodes, NULL, 1, pcmk__coloc_select_this_with); resource2->priv->cmds->add_colocated_node_scores(resource2, NULL, resource2->id, &r2_nodes, NULL, 1, pcmk__coloc_select_this_with); pe__show_node_scores(true, NULL, resource1->id, r1_nodes, resource1->priv->scheduler); pe__show_node_scores(true, NULL, resource2->id, r2_nodes, resource2->priv->scheduler); // The resource with highest score on its current node goes first reason = "current location"; if (resource1->priv->active_nodes != NULL) { r1_node = pcmk__current_node(resource1); } if (resource2->priv->active_nodes != NULL) { r2_node = pcmk__current_node(resource2); } r1_score = get_node_score(r1_node, r1_nodes); r2_score = get_node_score(r2_node, r2_nodes); if (r1_score > r2_score) { rc = -1; goto done; } if (r1_score < r2_score) { rc = 1; goto done; } // Otherwise a higher score on any node will do reason = "score"; for (const GList *iter = nodes; iter != NULL; iter = iter->next) { const pcmk_node_t *node = (const pcmk_node_t *) iter->data; r1_score = get_node_score(node, r1_nodes); r2_score = get_node_score(node, r2_nodes); if (r1_score > r2_score) { rc = -1; goto done; } if (r1_score < r2_score) { rc = 1; goto done; } } done: crm_trace("%s (%d)%s%s %c %s (%d)%s%s: %s", resource1->id, r1_score, ((r1_node == NULL)? "" : " on "), ((r1_node == NULL)? "" : r1_node->priv->id), ((rc < 0)? '>' : ((rc > 0)? '<' : '=')), resource2->id, r2_score, ((r2_node == NULL)? "" : " on "), ((r2_node == NULL)? "" : r2_node->priv->id), reason); if (r1_nodes != NULL) { g_hash_table_destroy(r1_nodes); } if (r2_nodes != NULL) { g_hash_table_destroy(r2_nodes); } return rc; } /*! * \internal * \brief Sort resources in the order they should be assigned to nodes * * \param[in,out] scheduler Scheduler data */ void pcmk__sort_resources(pcmk_scheduler_t *scheduler) { GList *nodes = g_list_copy(scheduler->nodes); nodes = pcmk__sort_nodes(nodes, NULL); scheduler->priv->resources = g_list_sort_with_data(scheduler->priv->resources, cmp_resources, nodes); g_list_free(nodes); }