diff --git a/include/pcmki/pcmki_sched_utils.h b/include/pcmki/pcmki_sched_utils.h index 1de341f20e..d7ea160bb0 100644 --- a/include/pcmki/pcmki_sched_utils.h +++ b/include/pcmki/pcmki_sched_utils.h @@ -1,103 +1,106 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef PENGINE_AUTILS__H # define PENGINE_AUTILS__H #include // bool #include // GList, GHashTable, gboolean, guint #include // lrmd_event_data_t #include // cib_t #include #include #include /* Constraint helper functions */ pcmk__colocation_t *invert_constraint(pcmk__colocation_t *constraint); pe__location_t *copy_constraint(pe__location_t *constraint); pe__location_t *rsc2node_new(const char *id, pe_resource_t *rsc, int weight, const char *discovery_mode, pe_node_t *node, pe_working_set_t *data_set); void pcmk__new_colocation(const char *id, const char *node_attr, int score, pe_resource_t *rsc_lh, pe_resource_t *rsc_rh, const char *state_lh, const char *state_rh, bool influence, pe_working_set_t *data_set); extern gboolean rsc_ticket_new(const char *id, pe_resource_t * rsc_lh, pe_ticket_t * ticket, const char *state_lh, const char *loss_policy, pe_working_set_t * data_set); GHashTable *pcmk__copy_node_table(GHashTable *nodes); GList *pcmk__copy_node_list(const GList *list, bool reset); GList *sort_nodes_by_weight(GList *nodes, pe_node_t *active_node, pe_working_set_t *data_set); extern gboolean can_run_resources(const pe_node_t * node); extern gboolean native_assign_node(pe_resource_t *rsc, pe_node_t *chosen, gboolean force); void native_deallocate(pe_resource_t * rsc); extern void log_action(unsigned int log_level, const char *pre_text, pe_action_t * action, gboolean details); gboolean can_run_any(GHashTable * nodes); pe_resource_t *find_compatible_child(pe_resource_t *local_child, pe_resource_t *rsc, enum rsc_role_e filter, gboolean current, pe_working_set_t *data_set); pe_resource_t *find_compatible_child_by_node(pe_resource_t * local_child, pe_node_t * local_node, pe_resource_t * rsc, enum rsc_role_e filter, gboolean current); gboolean is_child_compatible(pe_resource_t *child_rsc, pe_node_t * local_node, enum rsc_role_e filter, gboolean current); bool assign_node(pe_resource_t * rsc, pe_node_t * node, gboolean force); enum pe_action_flags summary_action_flags(pe_action_t * action, GList *children, pe_node_t * node); enum action_tasks clone_child_action(pe_action_t * action); int copies_per_node(pe_resource_t * rsc); enum filter_colocation_res { influence_nothing = 0, influence_rsc_location, influence_rsc_priority, }; extern enum filter_colocation_res filter_colocation_constraint(pe_resource_t * rsc_lh, pe_resource_t * rsc_rh, pcmk__colocation_t *constraint, gboolean preview); extern int compare_capacity(const pe_node_t * node1, const pe_node_t * node2); extern void calculate_utilization(GHashTable * current_utilization, GHashTable * utilization, gboolean plus); extern void process_utilization(pe_resource_t * rsc, pe_node_t ** prefer, pe_working_set_t * data_set); pe_action_t *create_pseudo_resource_op(pe_resource_t * rsc, const char *task, bool optional, bool runnable, pe_working_set_t *data_set); pe_action_t *pe_cancel_op(pe_resource_t *rsc, const char *name, guint interval_ms, pe_node_t *node, pe_working_set_t *data_set); pe_action_t *sched_shutdown_op(pe_node_t *node, pe_working_set_t *data_set); xmlNode *pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *event, const char *caller_version, int target_rc, const char *node, const char *origin, int level); # define LOAD_STOPPED "load_stopped" void modify_configuration( pe_working_set_t * data_set, cib_t *cib, const char *quorum, const char *watchdog, GList *node_up, GList *node_down, GList *node_fail, GList *op_inject, GList *ticket_grant, GList *ticket_revoke, GList *ticket_standby, GList *ticket_activate); int run_simulation(pe_working_set_t * data_set, cib_t *cib, GList *op_fail_list); pcmk__output_t *pcmk__new_logger(void); +pe_resource_t *pcmk__threshold_reached(pe_resource_t *rsc, pe_node_t *node, + pe_working_set_t *data_set); + #endif diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c index ad395a19e8..f3b638b1f0 100644 --- a/lib/pacemaker/pcmk_sched_allocate.c +++ b/lib/pacemaker/pcmk_sched_allocate.c @@ -1,3043 +1,2995 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include CRM_TRACE_INIT_DATA(pacemaker); extern bool pcmk__is_daemon; void set_alloc_actions(pe_working_set_t * data_set); extern void ReloadRsc(pe_resource_t * rsc, pe_node_t *node, pe_working_set_t * data_set); extern gboolean DeleteRsc(pe_resource_t * rsc, pe_node_t * node, gboolean optional, pe_working_set_t * data_set); static void apply_remote_node_ordering(pe_working_set_t *data_set); static enum remote_connection_state get_remote_node_state(pe_node_t *node); enum remote_connection_state { remote_state_unknown = 0, remote_state_alive = 1, remote_state_resting = 2, remote_state_failed = 3, remote_state_stopped = 4 }; static const char * state2text(enum remote_connection_state state) { switch (state) { case remote_state_unknown: return "unknown"; case remote_state_alive: return "alive"; case remote_state_resting: return "resting"; case remote_state_failed: return "failed"; case remote_state_stopped: return "stopped"; } return "impossible"; } resource_alloc_functions_t resource_class_alloc_functions[] = { { pcmk__native_merge_weights, pcmk__native_allocate, native_create_actions, native_create_probe, native_internal_constraints, native_rsc_colocation_lh, native_rsc_colocation_rh, native_rsc_location, native_action_flags, native_update_actions, native_expand, native_append_meta, }, { pcmk__group_merge_weights, pcmk__group_allocate, group_create_actions, native_create_probe, group_internal_constraints, group_rsc_colocation_lh, group_rsc_colocation_rh, group_rsc_location, group_action_flags, group_update_actions, group_expand, group_append_meta, }, { pcmk__native_merge_weights, pcmk__clone_allocate, clone_create_actions, clone_create_probe, clone_internal_constraints, clone_rsc_colocation_lh, clone_rsc_colocation_rh, clone_rsc_location, clone_action_flags, pcmk__multi_update_actions, clone_expand, clone_append_meta, }, { pcmk__native_merge_weights, pcmk__bundle_allocate, pcmk__bundle_create_actions, pcmk__bundle_create_probe, pcmk__bundle_internal_constraints, pcmk__bundle_rsc_colocation_lh, pcmk__bundle_rsc_colocation_rh, pcmk__bundle_rsc_location, pcmk__bundle_action_flags, pcmk__multi_update_actions, pcmk__bundle_expand, pcmk__bundle_append_meta, } }; static gboolean check_rsc_parameters(pe_resource_t * rsc, pe_node_t * node, xmlNode * rsc_entry, gboolean active_here, pe_working_set_t * data_set) { int attr_lpc = 0; gboolean force_restart = FALSE; gboolean delete_resource = FALSE; gboolean changed = FALSE; const char *value = NULL; const char *old_value = NULL; const char *attr_list[] = { XML_ATTR_TYPE, XML_AGENT_ATTR_CLASS, XML_AGENT_ATTR_PROVIDER }; for (; attr_lpc < PCMK__NELEM(attr_list); attr_lpc++) { value = crm_element_value(rsc->xml, attr_list[attr_lpc]); old_value = crm_element_value(rsc_entry, attr_list[attr_lpc]); if (value == old_value /* i.e. NULL */ || pcmk__str_eq(value, old_value, pcmk__str_none)) { continue; } changed = TRUE; trigger_unfencing(rsc, node, "Device definition changed", NULL, data_set); if (active_here) { force_restart = TRUE; crm_notice("Forcing restart of %s on %s, %s changed: %s -> %s", rsc->id, node->details->uname, attr_list[attr_lpc], crm_str(old_value), crm_str(value)); } } if (force_restart) { /* make sure the restart happens */ stop_action(rsc, node, FALSE); pe__set_resource_flags(rsc, pe_rsc_start_pending); delete_resource = TRUE; } else if (changed) { delete_resource = TRUE; } return delete_resource; } static void CancelXmlOp(pe_resource_t * rsc, xmlNode * xml_op, pe_node_t * active_node, const char *reason, pe_working_set_t * data_set) { guint interval_ms = 0; pe_action_t *cancel = NULL; const char *task = NULL; const char *call_id = NULL; CRM_CHECK(xml_op != NULL, return); CRM_CHECK(active_node != NULL, return); task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); call_id = crm_element_value(xml_op, XML_LRM_ATTR_CALLID); crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); crm_info("Action " PCMK__OP_FMT " on %s will be stopped: %s", rsc->id, task, interval_ms, active_node->details->uname, (reason? reason : "unknown")); cancel = pe_cancel_op(rsc, task, interval_ms, active_node, data_set); add_hash_param(cancel->meta, XML_LRM_ATTR_CALLID, call_id); custom_action_order(rsc, stop_key(rsc), NULL, rsc, NULL, cancel, pe_order_optional, data_set); } static gboolean check_action_definition(pe_resource_t * rsc, pe_node_t * active_node, xmlNode * xml_op, pe_working_set_t * data_set) { char *key = NULL; guint interval_ms = 0; const op_digest_cache_t *digest_data = NULL; gboolean did_change = FALSE; const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); const char *digest_secure = NULL; CRM_CHECK(active_node != NULL, return FALSE); crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); if (interval_ms > 0) { xmlNode *op_match = NULL; /* we need to reconstruct the key because of the way we used to construct resource IDs */ key = pcmk__op_key(rsc->id, task, interval_ms); pe_rsc_trace(rsc, "Checking parameters for %s", key); op_match = find_rsc_op_entry(rsc, key); if ((op_match == NULL) && pcmk_is_set(data_set->flags, pe_flag_stop_action_orphans)) { CancelXmlOp(rsc, xml_op, active_node, "orphan", data_set); free(key); return TRUE; } else if (op_match == NULL) { pe_rsc_debug(rsc, "Orphan action detected: %s on %s", key, active_node->details->uname); free(key); return TRUE; } free(key); key = NULL; } crm_trace("Testing " PCMK__OP_FMT " on %s", rsc->id, task, interval_ms, active_node->details->uname); if ((interval_ms == 0) && pcmk__str_eq(task, RSC_STATUS, pcmk__str_casei)) { /* Reload based on the start action not a probe */ task = RSC_START; } else if ((interval_ms == 0) && pcmk__str_eq(task, RSC_MIGRATED, pcmk__str_casei)) { /* Reload based on the start action not a migrate */ task = RSC_START; } else if ((interval_ms == 0) && pcmk__str_eq(task, RSC_PROMOTE, pcmk__str_casei)) { /* Reload based on the start action not a promote */ task = RSC_START; } digest_data = rsc_action_digest_cmp(rsc, xml_op, active_node, data_set); if (pcmk_is_set(data_set->flags, pe_flag_sanitized)) { digest_secure = crm_element_value(xml_op, XML_LRM_ATTR_SECURE_DIGEST); } if(digest_data->rc != RSC_DIGEST_MATCH && digest_secure && digest_data->digest_secure_calc && strcmp(digest_data->digest_secure_calc, digest_secure) == 0) { if (!pcmk__is_daemon && data_set->priv != NULL) { pcmk__output_t *out = data_set->priv; out->info(out, "Only 'private' parameters to " PCMK__OP_FMT " on %s changed: %s", rsc->id, task, interval_ms, active_node->details->uname, crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC)); } } else if (digest_data->rc == RSC_DIGEST_RESTART) { /* Changes that force a restart */ pe_action_t *required = NULL; did_change = TRUE; key = pcmk__op_key(rsc->id, task, interval_ms); crm_log_xml_info(digest_data->params_restart, "params:restart"); required = custom_action(rsc, key, task, NULL, FALSE, TRUE, data_set); pe_action_set_reason(required, "resource definition change", true); trigger_unfencing(rsc, active_node, "Device parameters changed", NULL, data_set); } else if ((digest_data->rc == RSC_DIGEST_ALL) || (digest_data->rc == RSC_DIGEST_UNKNOWN)) { // Changes that can potentially be handled by an agent reload const char *digest_restart = crm_element_value(xml_op, XML_LRM_ATTR_RESTART_DIGEST); did_change = TRUE; trigger_unfencing(rsc, active_node, "Device parameters changed (reload)", NULL, data_set); crm_log_xml_info(digest_data->params_all, "params:reload"); key = pcmk__op_key(rsc->id, task, interval_ms); if (interval_ms > 0) { pe_action_t *op = NULL; #if 0 /* Always reload/restart the entire resource */ ReloadRsc(rsc, active_node, data_set); #else /* Re-sending the recurring op is sufficient - the old one will be cancelled automatically */ op = custom_action(rsc, key, task, active_node, TRUE, TRUE, data_set); pe__set_action_flags(op, pe_action_reschedule); #endif } else if (digest_restart) { pe_rsc_trace(rsc, "Reloading '%s' action for resource %s", task, rsc->id); /* Reload this resource */ ReloadRsc(rsc, active_node, data_set); free(key); } else { pe_action_t *required = NULL; pe_rsc_trace(rsc, "Resource %s doesn't support agent reloads", rsc->id); /* Re-send the start/demote/promote op * Recurring ops will be detected independently */ required = custom_action(rsc, key, task, NULL, FALSE, TRUE, data_set); pe_action_set_reason(required, "resource definition change", true); } } return did_change; } /*! * \internal * \brief Do deferred action checks after allocation * * \param[in] data_set Working set for cluster */ static void check_params(pe_resource_t *rsc, pe_node_t *node, xmlNode *rsc_op, enum pe_check_parameters check, pe_working_set_t *data_set) { const char *reason = NULL; op_digest_cache_t *digest_data = NULL; switch (check) { case pe_check_active: if (check_action_definition(rsc, node, rsc_op, data_set) && pe_get_failcount(node, rsc, NULL, pe_fc_effective, NULL, data_set)) { reason = "action definition changed"; } break; case pe_check_last_failure: digest_data = rsc_action_digest_cmp(rsc, rsc_op, node, data_set); switch (digest_data->rc) { case RSC_DIGEST_UNKNOWN: crm_trace("Resource %s history entry %s on %s has no digest to compare", rsc->id, ID(rsc_op), node->details->id); break; case RSC_DIGEST_MATCH: break; default: reason = "resource parameters have changed"; break; } break; } if (reason) { pe__clear_failcount(rsc, node, reason, data_set); } } static void check_actions_for(xmlNode * rsc_entry, pe_resource_t * rsc, pe_node_t * node, pe_working_set_t * data_set) { GList *gIter = NULL; int offset = -1; int stop_index = 0; int start_index = 0; const char *task = NULL; xmlNode *rsc_op = NULL; GList *op_list = NULL; GList *sorted_op_list = NULL; CRM_CHECK(node != NULL, return); if (pcmk_is_set(rsc->flags, pe_rsc_orphan)) { pe_resource_t *parent = uber_parent(rsc); if(parent == NULL || pe_rsc_is_clone(parent) == FALSE || pcmk_is_set(parent->flags, pe_rsc_unique)) { pe_rsc_trace(rsc, "Skipping param check for %s and deleting: orphan", rsc->id); DeleteRsc(rsc, node, FALSE, data_set); } else { pe_rsc_trace(rsc, "Skipping param check for %s (orphan clone)", rsc->id); } return; } else if (pe_find_node_id(rsc->running_on, node->details->id) == NULL) { if (check_rsc_parameters(rsc, node, rsc_entry, FALSE, data_set)) { DeleteRsc(rsc, node, FALSE, data_set); } pe_rsc_trace(rsc, "Skipping param check for %s: no longer active on %s", rsc->id, node->details->uname); return; } pe_rsc_trace(rsc, "Processing %s on %s", rsc->id, node->details->uname); if (check_rsc_parameters(rsc, node, rsc_entry, TRUE, data_set)) { DeleteRsc(rsc, node, FALSE, data_set); } for (rsc_op = pcmk__xe_first_child(rsc_entry); rsc_op != NULL; rsc_op = pcmk__xe_next(rsc_op)) { if (pcmk__str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP, pcmk__str_none)) { op_list = g_list_prepend(op_list, rsc_op); } } sorted_op_list = g_list_sort(op_list, sort_op_by_callid); calculate_active_ops(sorted_op_list, &start_index, &stop_index); for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; guint interval_ms = 0; offset++; if (start_index < stop_index) { /* stopped */ continue; } else if (offset < start_index) { /* action occurred prior to a start */ continue; } task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); crm_element_value_ms(rsc_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); if ((interval_ms > 0) && (pcmk_is_set(rsc->flags, pe_rsc_maintenance) || node->details->maintenance)) { // Maintenance mode cancels recurring operations CancelXmlOp(rsc, rsc_op, node, "maintenance mode", data_set); } else if ((interval_ms > 0) || pcmk__strcase_any_of(task, RSC_STATUS, RSC_START, RSC_PROMOTE, RSC_MIGRATED, NULL)) { /* If a resource operation failed, and the operation's definition * has changed, clear any fail count so they can be retried fresh. */ if (pe__bundle_needs_remote_name(rsc, data_set)) { /* We haven't allocated resources to nodes yet, so if the * REMOTE_CONTAINER_HACK is used, we may calculate the digest * based on the literal "#uname" value rather than the properly * substituted value. That would mistakenly make the action * definition appear to have been changed. Defer the check until * later in this case. */ pe__add_param_check(rsc_op, rsc, node, pe_check_active, data_set); } else if (check_action_definition(rsc, node, rsc_op, data_set) && pe_get_failcount(node, rsc, NULL, pe_fc_effective, NULL, data_set)) { pe__clear_failcount(rsc, node, "action definition changed", data_set); } } } g_list_free(sorted_op_list); } static GList * find_rsc_list(GList *result, pe_resource_t * rsc, const char *id, gboolean renamed_clones, gboolean partial, pe_working_set_t * data_set) { GList *gIter = NULL; gboolean match = FALSE; if (id == NULL) { return NULL; } if (rsc == NULL) { if (data_set == NULL) { return NULL; } for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *child = (pe_resource_t *) gIter->data; result = find_rsc_list(result, child, id, renamed_clones, partial, NULL); } return result; } if (partial) { if (strstr(rsc->id, id)) { match = TRUE; } else if (renamed_clones && rsc->clone_name && strstr(rsc->clone_name, id)) { match = TRUE; } } else { if (strcmp(rsc->id, id) == 0) { match = TRUE; } else if (renamed_clones && rsc->clone_name && strcmp(rsc->clone_name, id) == 0) { match = TRUE; } } if (match) { result = g_list_prepend(result, rsc); } if (rsc->children) { gIter = rsc->children; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *child = (pe_resource_t *) gIter->data; result = find_rsc_list(result, child, id, renamed_clones, partial, NULL); } } return result; } static void check_actions(pe_working_set_t * data_set) { const char *id = NULL; pe_node_t *node = NULL; xmlNode *lrm_rscs = NULL; xmlNode *status = get_object_root(XML_CIB_TAG_STATUS, data_set->input); xmlNode *node_state = NULL; for (node_state = pcmk__xe_first_child(status); node_state != NULL; node_state = pcmk__xe_next(node_state)) { if (pcmk__str_eq((const char *)node_state->name, XML_CIB_TAG_STATE, pcmk__str_none)) { id = crm_element_value(node_state, XML_ATTR_ID); lrm_rscs = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE); lrm_rscs = find_xml_node(lrm_rscs, XML_LRM_TAG_RESOURCES, FALSE); node = pe_find_node_id(data_set->nodes, id); if (node == NULL) { continue; /* Still need to check actions for a maintenance node to cancel existing monitor operations */ } else if (can_run_resources(node) == FALSE && node->details->maintenance == FALSE) { crm_trace("Skipping param check for %s: can't run resources", node->details->uname); continue; } crm_trace("Processing node %s", node->details->uname); if (node->details->online || pcmk_is_set(data_set->flags, pe_flag_stonith_enabled)) { xmlNode *rsc_entry = NULL; for (rsc_entry = pcmk__xe_first_child(lrm_rscs); rsc_entry != NULL; rsc_entry = pcmk__xe_next(rsc_entry)) { if (pcmk__str_eq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE, pcmk__str_none)) { if (xml_has_children(rsc_entry)) { GList *gIter = NULL; GList *result = NULL; const char *rsc_id = ID(rsc_entry); CRM_CHECK(rsc_id != NULL, return); result = find_rsc_list(NULL, NULL, rsc_id, TRUE, FALSE, data_set); for (gIter = result; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; if (rsc->variant != pe_native) { continue; } check_actions_for(rsc_entry, rsc, node, data_set); } g_list_free(result); } } } } } } } static void apply_placement_constraints(pe_working_set_t * data_set) { for (GList *gIter = data_set->placement_constraints; gIter != NULL; gIter = gIter->next) { pe__location_t *cons = gIter->data; cons->rsc_lh->cmds->rsc_location(cons->rsc_lh, cons); } } static gboolean failcount_clear_action_exists(pe_node_t * node, pe_resource_t * rsc) { gboolean rc = FALSE; GList *list = pe__resource_actions(rsc, node, CRM_OP_CLEAR_FAILCOUNT, TRUE); if (list) { rc = TRUE; } g_list_free(list); return rc; } -/*! - * \internal - * \brief Force resource away if failures hit migration threshold - * - * \param[in,out] rsc Resource to check for failures - * \param[in,out] node Node to check for failures - * \param[in,out] data_set Cluster working set to update - */ -static void -check_migration_threshold(pe_resource_t *rsc, pe_node_t *node, - pe_working_set_t *data_set) -{ - int fail_count, countdown; - pe_resource_t *failed; - - /* Migration threshold of 0 means never force away */ - if (rsc->migration_threshold == 0) { - return; - } - - // If we're ignoring failures, also ignore the migration threshold - if (pcmk_is_set(rsc->flags, pe_rsc_failure_ignored)) { - return; - } - - /* If there are no failures, there's no need to force away */ - fail_count = pe_get_failcount(node, rsc, NULL, - pe_fc_effective|pe_fc_fillers, NULL, - data_set); - if (fail_count <= 0) { - return; - } - - /* How many more times recovery will be tried on this node */ - countdown = QB_MAX(rsc->migration_threshold - fail_count, 0); - - /* If failed resource has a parent, we'll force the parent away */ - failed = rsc; - if (!pcmk_is_set(rsc->flags, pe_rsc_unique)) { - failed = uber_parent(rsc); - } - - if (countdown == 0) { - resource_location(failed, node, -INFINITY, "__fail_limit__", data_set); - crm_warn("Forcing %s away from %s after %d failures (max=%d)", - failed->id, node->details->uname, fail_count, - rsc->migration_threshold); - } else { - crm_info("%s can fail %d more times on %s before being forced off", - failed->id, countdown, node->details->uname); - } -} - static void common_apply_stickiness(pe_resource_t * rsc, pe_node_t * node, pe_working_set_t * data_set) { if (rsc->children) { GList *gIter = rsc->children; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *child_rsc = (pe_resource_t *) gIter->data; common_apply_stickiness(child_rsc, node, data_set); } return; } if (pcmk_is_set(rsc->flags, pe_rsc_managed) && rsc->stickiness != 0 && pcmk__list_of_1(rsc->running_on)) { pe_node_t *current = pe_find_node_id(rsc->running_on, node->details->id); pe_node_t *match = pe_hash_table_lookup(rsc->allowed_nodes, node->details->id); if (current == NULL) { } else if ((match != NULL) || pcmk_is_set(data_set->flags, pe_flag_symmetric_cluster)) { pe_resource_t *sticky_rsc = rsc; resource_location(sticky_rsc, node, rsc->stickiness, "stickiness", data_set); pe_rsc_debug(sticky_rsc, "Resource %s: preferring current location" " (node=%s, weight=%d)", sticky_rsc->id, node->details->uname, rsc->stickiness); } else { GHashTableIter iter; pe_node_t *nIter = NULL; pe_rsc_debug(rsc, "Ignoring stickiness for %s: the cluster is asymmetric" " and node %s is not explicitly allowed", rsc->id, node->details->uname); g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&nIter)) { crm_err("%s[%s] = %d", rsc->id, nIter->details->uname, nIter->weight); } } } /* Check the migration threshold only if a failcount clear action * has not already been placed for this resource on the node. * There is no sense in potentially forcing the resource from this * node if the failcount is being reset anyway. * * @TODO A clear_failcount operation can be scheduled in stage4() via * check_actions_for(), or in stage5() via check_params(). This runs in * stage2(), so it cannot detect those, meaning we might check the migration * threshold when we shouldn't -- worst case, we stop or move the resource, * then move it back next transition. */ if (failcount_clear_action_exists(node, rsc) == FALSE) { - check_migration_threshold(rsc, node, data_set); + pe_resource_t *failed = pcmk__threshold_reached(rsc, node, data_set); + + if (failed != NULL) { + resource_location(failed, node, -INFINITY, "__fail_limit__", + data_set); + } } } void complex_set_cmds(pe_resource_t * rsc) { GList *gIter = rsc->children; rsc->cmds = &resource_class_alloc_functions[rsc->variant]; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *child_rsc = (pe_resource_t *) gIter->data; complex_set_cmds(child_rsc); } } void set_alloc_actions(pe_working_set_t * data_set) { GList *gIter = data_set->resources; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; complex_set_cmds(rsc); } } static void calculate_system_health(gpointer gKey, gpointer gValue, gpointer user_data) { const char *key = (const char *)gKey; const char *value = (const char *)gValue; int *system_health = (int *)user_data; if (!gKey || !gValue || !user_data) { return; } if (pcmk__starts_with(key, "#health")) { int score; /* Convert the value into an integer */ score = char2score(value); /* Add it to the running total */ *system_health = pe__add_scores(score, *system_health); } } static gboolean apply_system_health(pe_working_set_t * data_set) { GList *gIter = NULL; const char *health_strategy = pe_pref(data_set->config_hash, "node-health-strategy"); int base_health = 0; if (pcmk__str_eq(health_strategy, "none", pcmk__str_null_matches | pcmk__str_casei)) { /* Prevent any accidental health -> score translation */ pcmk__score_red = 0; pcmk__score_yellow = 0; pcmk__score_green = 0; return TRUE; } else if (pcmk__str_eq(health_strategy, "migrate-on-red", pcmk__str_casei)) { /* Resources on nodes which have health values of red are * weighted away from that node. */ pcmk__score_red = -INFINITY; pcmk__score_yellow = 0; pcmk__score_green = 0; } else if (pcmk__str_eq(health_strategy, "only-green", pcmk__str_casei)) { /* Resources on nodes which have health values of red or yellow * are forced away from that node. */ pcmk__score_red = -INFINITY; pcmk__score_yellow = -INFINITY; pcmk__score_green = 0; } else if (pcmk__str_eq(health_strategy, "progressive", pcmk__str_casei)) { /* Same as the above, but use the r/y/g scores provided by the user * Defaults are provided by the pe_prefs table * Also, custom health "base score" can be used */ base_health = char2score(pe_pref(data_set->config_hash, "node-health-base")); } else if (pcmk__str_eq(health_strategy, "custom", pcmk__str_casei)) { /* Requires the admin to configure the rsc_location constaints for * processing the stored health scores */ /* TODO: Check for the existence of appropriate node health constraints */ return TRUE; } else { crm_err("Unknown node health strategy: %s", health_strategy); return FALSE; } crm_info("Applying automated node health strategy: %s", health_strategy); for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { int system_health = base_health; pe_node_t *node = (pe_node_t *) gIter->data; /* Search through the node hash table for system health entries. */ g_hash_table_foreach(node->details->attrs, calculate_system_health, &system_health); crm_info(" Node %s has an combined system health of %d", node->details->uname, system_health); /* If the health is non-zero, then create a new rsc2node so that the * weight will be added later on. */ if (system_health != 0) { GList *gIter2 = data_set->resources; for (; gIter2 != NULL; gIter2 = gIter2->next) { pe_resource_t *rsc = (pe_resource_t *) gIter2->data; rsc2node_new(health_strategy, rsc, system_health, NULL, node, data_set); } } } return TRUE; } gboolean stage0(pe_working_set_t * data_set) { xmlNode *cib_constraints = get_object_root(XML_CIB_TAG_CONSTRAINTS, data_set->input); if (data_set->input == NULL) { return FALSE; } if (!pcmk_is_set(data_set->flags, pe_flag_have_status)) { crm_trace("Calculating status"); cluster_status(data_set); } set_alloc_actions(data_set); apply_system_health(data_set); unpack_constraints(cib_constraints, data_set); return TRUE; } /* * Check nodes for resources started outside of the LRM */ gboolean probe_resources(pe_working_set_t * data_set) { pe_action_t *probe_node_complete = NULL; for (GList *gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; const char *probed = pe_node_attribute_raw(node, CRM_OP_PROBED); if (node->details->online == FALSE) { if (pe__is_remote_node(node) && node->details->remote_rsc && (get_remote_node_state(node) == remote_state_failed)) { pe_fence_node(data_set, node, "the connection is unrecoverable", FALSE); } continue; } else if (node->details->unclean) { continue; } else if (node->details->rsc_discovery_enabled == FALSE) { /* resource discovery is disabled for this node */ continue; } if (probed != NULL && crm_is_true(probed) == FALSE) { pe_action_t *probe_op = custom_action(NULL, crm_strdup_printf("%s-%s", CRM_OP_REPROBE, node->details->uname), CRM_OP_REPROBE, node, FALSE, TRUE, data_set); add_hash_param(probe_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); continue; } for (GList *gIter2 = data_set->resources; gIter2 != NULL; gIter2 = gIter2->next) { pe_resource_t *rsc = (pe_resource_t *) gIter2->data; rsc->cmds->create_probe(rsc, node, probe_node_complete, FALSE, data_set); } } return TRUE; } static void rsc_discover_filter(pe_resource_t *rsc, pe_node_t *node) { pe_resource_t *top = uber_parent(rsc); pe_node_t *match; if (rsc->exclusive_discover == FALSE && top->exclusive_discover == FALSE) { return; } g_list_foreach(rsc->children, (GFunc) rsc_discover_filter, node); match = g_hash_table_lookup(rsc->allowed_nodes, node->details->id); if (match && match->rsc_discover_mode != pe_discover_exclusive) { match->weight = -INFINITY; } } static time_t shutdown_time(pe_node_t *node, pe_working_set_t *data_set) { const char *shutdown = pe_node_attribute_raw(node, XML_CIB_ATTR_SHUTDOWN); time_t result = 0; if (shutdown) { long long result_ll; if (pcmk__scan_ll(shutdown, &result_ll, 0LL) == pcmk_rc_ok) { result = (time_t) result_ll; } } return result? result : get_effective_time(data_set); } static void apply_shutdown_lock(pe_resource_t *rsc, pe_working_set_t *data_set) { const char *class; // Only primitives and (uncloned) groups may be locked if (rsc->variant == pe_group) { g_list_foreach(rsc->children, (GFunc) apply_shutdown_lock, data_set); } else if (rsc->variant != pe_native) { return; } // Fence devices and remote connections can't be locked class = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); if (pcmk__str_eq(class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_null_matches) || pe__resource_is_remote_conn(rsc, data_set)) { return; } if (rsc->lock_node != NULL) { // The lock was obtained from resource history if (rsc->running_on != NULL) { /* The resource was started elsewhere even though it is now * considered locked. This shouldn't be possible, but as a * failsafe, we don't want to disturb the resource now. */ pe_rsc_info(rsc, "Cancelling shutdown lock because %s is already active", rsc->id); pe__clear_resource_history(rsc, rsc->lock_node, data_set); rsc->lock_node = NULL; rsc->lock_time = 0; } // Only a resource active on exactly one node can be locked } else if (pcmk__list_of_1(rsc->running_on)) { pe_node_t *node = rsc->running_on->data; if (node->details->shutdown) { if (node->details->unclean) { pe_rsc_debug(rsc, "Not locking %s to unclean %s for shutdown", rsc->id, node->details->uname); } else { rsc->lock_node = node; rsc->lock_time = shutdown_time(node, data_set); } } } if (rsc->lock_node == NULL) { // No lock needed return; } if (data_set->shutdown_lock > 0) { time_t lock_expiration = rsc->lock_time + data_set->shutdown_lock; pe_rsc_info(rsc, "Locking %s to %s due to shutdown (expires @%lld)", rsc->id, rsc->lock_node->details->uname, (long long) lock_expiration); pe__update_recheck_time(++lock_expiration, data_set); } else { pe_rsc_info(rsc, "Locking %s to %s due to shutdown", rsc->id, rsc->lock_node->details->uname); } // If resource is locked to one node, ban it from all other nodes for (GList *item = data_set->nodes; item != NULL; item = item->next) { pe_node_t *node = item->data; if (strcmp(node->details->uname, rsc->lock_node->details->uname)) { resource_location(rsc, node, -CRM_SCORE_INFINITY, XML_CONFIG_ATTR_SHUTDOWN_LOCK, data_set); } } } /* * \internal * \brief Stage 2 of cluster status: apply node-specific criteria * * Count known nodes, and apply location constraints, stickiness, and exclusive * resource discovery. */ gboolean stage2(pe_working_set_t * data_set) { GList *gIter = NULL; if (pcmk_is_set(data_set->flags, pe_flag_shutdown_lock)) { g_list_foreach(data_set->resources, (GFunc) apply_shutdown_lock, data_set); } if (!pcmk_is_set(data_set->flags, pe_flag_no_compat)) { // @COMPAT API backward compatibility for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; if (node && (node->weight >= 0) && node->details->online && (node->details->type != node_ping)) { data_set->max_valid_nodes++; } } } apply_placement_constraints(data_set); gIter = data_set->nodes; for (; gIter != NULL; gIter = gIter->next) { GList *gIter2 = NULL; pe_node_t *node = (pe_node_t *) gIter->data; gIter2 = data_set->resources; for (; gIter2 != NULL; gIter2 = gIter2->next) { pe_resource_t *rsc = (pe_resource_t *) gIter2->data; common_apply_stickiness(rsc, node, data_set); rsc_discover_filter(rsc, node); } } return TRUE; } /* * Create internal resource constraints before allocation */ gboolean stage3(pe_working_set_t * data_set) { GList *gIter = data_set->resources; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; rsc->cmds->internal_constraints(rsc, data_set); } return TRUE; } /* * Check for orphaned or redefined actions */ gboolean stage4(pe_working_set_t * data_set) { check_actions(data_set); return TRUE; } static void * convert_const_pointer(const void *ptr) { /* Worst function ever */ return (void *)ptr; } static gint sort_rsc_process_order(gconstpointer a, gconstpointer b, gpointer data) { int rc = 0; int r1_weight = -INFINITY; int r2_weight = -INFINITY; const char *reason = "existence"; GList *nodes = (GList *) data; const pe_resource_t *resource1 = a; const pe_resource_t *resource2 = b; pe_node_t *r1_node = NULL; pe_node_t *r2_node = NULL; GList *gIter = NULL; GHashTable *r1_nodes = NULL; GHashTable *r2_nodes = NULL; reason = "priority"; r1_weight = resource1->priority; r2_weight = resource2->priority; if (r1_weight > r2_weight) { rc = -1; goto done; } if (r1_weight < r2_weight) { rc = 1; goto done; } reason = "no node list"; if (nodes == NULL) { goto done; } r1_nodes = pcmk__native_merge_weights(convert_const_pointer(resource1), resource1->id, NULL, NULL, 1, pe_weights_forward | pe_weights_init); pe__show_node_weights(true, NULL, resource1->id, r1_nodes, resource1->cluster); r2_nodes = pcmk__native_merge_weights(convert_const_pointer(resource2), resource2->id, NULL, NULL, 1, pe_weights_forward | pe_weights_init); pe__show_node_weights(true, NULL, resource2->id, r2_nodes, resource2->cluster); /* Current location score */ reason = "current location"; r1_weight = -INFINITY; r2_weight = -INFINITY; if (resource1->running_on) { r1_node = pe__current_node(resource1); r1_node = g_hash_table_lookup(r1_nodes, r1_node->details->id); if (r1_node != NULL) { r1_weight = r1_node->weight; } } if (resource2->running_on) { r2_node = pe__current_node(resource2); r2_node = g_hash_table_lookup(r2_nodes, r2_node->details->id); if (r2_node != NULL) { r2_weight = r2_node->weight; } } if (r1_weight > r2_weight) { rc = -1; goto done; } if (r1_weight < r2_weight) { rc = 1; goto done; } reason = "score"; for (gIter = nodes; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; r1_node = NULL; r2_node = NULL; r1_weight = -INFINITY; if (r1_nodes) { r1_node = g_hash_table_lookup(r1_nodes, node->details->id); } if (r1_node) { r1_weight = r1_node->weight; } r2_weight = -INFINITY; if (r2_nodes) { r2_node = g_hash_table_lookup(r2_nodes, node->details->id); } if (r2_node) { r2_weight = r2_node->weight; } if (r1_weight > r2_weight) { rc = -1; goto done; } if (r1_weight < r2_weight) { rc = 1; goto done; } } done: crm_trace("%s (%d) on %s %c %s (%d) on %s: %s", resource1->id, r1_weight, r1_node ? r1_node->details->id : "n/a", rc < 0 ? '>' : rc > 0 ? '<' : '=', resource2->id, r2_weight, r2_node ? r2_node->details->id : "n/a", reason); if (r1_nodes) { g_hash_table_destroy(r1_nodes); } if (r2_nodes) { g_hash_table_destroy(r2_nodes); } return rc; } static void allocate_resources(pe_working_set_t * data_set) { GList *gIter = NULL; if (pcmk_is_set(data_set->flags, pe_flag_have_remote_nodes)) { /* Allocate remote connection resources first (which will also allocate * any colocation dependencies). If the connection is migrating, always * prefer the partial migration target. */ for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; if (rsc->is_remote_node == FALSE) { continue; } pe_rsc_trace(rsc, "Allocating remote connection resource '%s'", rsc->id); rsc->cmds->allocate(rsc, rsc->partial_migration_target, data_set); } } /* now do the rest of the resources */ for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; if (rsc->is_remote_node == TRUE) { continue; } pe_rsc_trace(rsc, "Allocating %s resource '%s'", crm_element_name(rsc->xml), rsc->id); rsc->cmds->allocate(rsc, NULL, data_set); } } /* We always use pe_order_preserve with these convenience functions to exempt * internally generated constraints from the prohibition of user constraints * involving remote connection resources. * * The start ordering additionally uses pe_order_runnable_left so that the * specified action is not runnable if the start is not runnable. */ static inline void order_start_then_action(pe_resource_t *lh_rsc, pe_action_t *rh_action, enum pe_ordering extra, pe_working_set_t *data_set) { if (lh_rsc && rh_action && data_set) { custom_action_order(lh_rsc, start_key(lh_rsc), NULL, rh_action->rsc, NULL, rh_action, pe_order_preserve | pe_order_runnable_left | extra, data_set); } } static inline void order_action_then_stop(pe_action_t *lh_action, pe_resource_t *rh_rsc, enum pe_ordering extra, pe_working_set_t *data_set) { if (lh_action && rh_rsc && data_set) { custom_action_order(lh_action->rsc, NULL, lh_action, rh_rsc, stop_key(rh_rsc), NULL, pe_order_preserve | extra, data_set); } } // Clear fail counts for orphaned rsc on all online nodes static void cleanup_orphans(pe_resource_t * rsc, pe_working_set_t * data_set) { GList *gIter = NULL; for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; if (node->details->online && pe_get_failcount(node, rsc, NULL, pe_fc_effective, NULL, data_set)) { pe_action_t *clear_op = NULL; clear_op = pe__clear_failcount(rsc, node, "it is orphaned", data_set); /* We can't use order_action_then_stop() here because its * pe_order_preserve breaks things */ custom_action_order(clear_op->rsc, NULL, clear_op, rsc, stop_key(rsc), NULL, pe_order_optional, data_set); } } } gboolean stage5(pe_working_set_t * data_set) { pcmk__output_t *out = data_set->priv; GList *gIter = NULL; if (!pcmk__str_eq(data_set->placement_strategy, "default", pcmk__str_casei)) { GList *nodes = g_list_copy(data_set->nodes); nodes = sort_nodes_by_weight(nodes, NULL, data_set); data_set->resources = g_list_sort_with_data(data_set->resources, sort_rsc_process_order, nodes); g_list_free(nodes); } gIter = data_set->nodes; for (; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; if (pcmk_is_set(data_set->flags, pe_flag_show_utilization)) { out->message(out, "node-capacity", node, "Original"); } } crm_trace("Allocating services"); /* Take (next) highest resource, assign it and create its actions */ allocate_resources(data_set); gIter = data_set->nodes; for (; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; if (pcmk_is_set(data_set->flags, pe_flag_show_utilization)) { out->message(out, "node-capacity", node, "Remaining"); } } // Process deferred action checks pe__foreach_param_check(data_set, check_params); pe__free_param_checks(data_set); if (pcmk_is_set(data_set->flags, pe_flag_startup_probes)) { crm_trace("Calculating needed probes"); /* This code probably needs optimization * ptest -x with 100 nodes, 100 clones and clone-max=100: With probes: ptest[14781]: 2010/09/27_17:56:46 notice: TRACE: do_calculations: pengine.c:258 Calculate cluster status ptest[14781]: 2010/09/27_17:56:46 notice: TRACE: do_calculations: pengine.c:278 Applying placement constraints ptest[14781]: 2010/09/27_17:56:47 notice: TRACE: do_calculations: pengine.c:285 Create internal constraints ptest[14781]: 2010/09/27_17:56:47 notice: TRACE: do_calculations: pengine.c:292 Check actions ptest[14781]: 2010/09/27_17:56:48 notice: TRACE: do_calculations: pengine.c:299 Allocate resources ptest[14781]: 2010/09/27_17:56:48 notice: TRACE: stage5: allocate.c:881 Allocating services ptest[14781]: 2010/09/27_17:56:49 notice: TRACE: stage5: allocate.c:894 Calculating needed probes ptest[14781]: 2010/09/27_17:56:51 notice: TRACE: stage5: allocate.c:899 Creating actions ptest[14781]: 2010/09/27_17:56:52 notice: TRACE: stage5: allocate.c:905 Creating done ptest[14781]: 2010/09/27_17:56:52 notice: TRACE: do_calculations: pengine.c:306 Processing fencing and shutdown cases ptest[14781]: 2010/09/27_17:56:52 notice: TRACE: do_calculations: pengine.c:313 Applying ordering constraints 36s ptest[14781]: 2010/09/27_17:57:28 notice: TRACE: do_calculations: pengine.c:320 Create transition graph Without probes: ptest[14637]: 2010/09/27_17:56:21 notice: TRACE: do_calculations: pengine.c:258 Calculate cluster status ptest[14637]: 2010/09/27_17:56:22 notice: TRACE: do_calculations: pengine.c:278 Applying placement constraints ptest[14637]: 2010/09/27_17:56:22 notice: TRACE: do_calculations: pengine.c:285 Create internal constraints ptest[14637]: 2010/09/27_17:56:22 notice: TRACE: do_calculations: pengine.c:292 Check actions ptest[14637]: 2010/09/27_17:56:23 notice: TRACE: do_calculations: pengine.c:299 Allocate resources ptest[14637]: 2010/09/27_17:56:23 notice: TRACE: stage5: allocate.c:881 Allocating services ptest[14637]: 2010/09/27_17:56:24 notice: TRACE: stage5: allocate.c:899 Creating actions ptest[14637]: 2010/09/27_17:56:25 notice: TRACE: stage5: allocate.c:905 Creating done ptest[14637]: 2010/09/27_17:56:25 notice: TRACE: do_calculations: pengine.c:306 Processing fencing and shutdown cases ptest[14637]: 2010/09/27_17:56:25 notice: TRACE: do_calculations: pengine.c:313 Applying ordering constraints ptest[14637]: 2010/09/27_17:56:25 notice: TRACE: do_calculations: pengine.c:320 Create transition graph */ probe_resources(data_set); } crm_trace("Handle orphans"); if (pcmk_is_set(data_set->flags, pe_flag_stop_rsc_orphans)) { for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; /* There's no need to recurse into rsc->children because those * should just be unallocated clone instances. */ if (pcmk_is_set(rsc->flags, pe_rsc_orphan)) { cleanup_orphans(rsc, data_set); } } } crm_trace("Creating actions"); for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; rsc->cmds->create_actions(rsc, data_set); } crm_trace("Creating done"); return TRUE; } static gboolean is_managed(const pe_resource_t * rsc) { GList *gIter = rsc->children; if (pcmk_is_set(rsc->flags, pe_rsc_managed)) { return TRUE; } for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *child_rsc = (pe_resource_t *) gIter->data; if (is_managed(child_rsc)) { return TRUE; } } return FALSE; } static gboolean any_managed_resources(pe_working_set_t * data_set) { GList *gIter = data_set->resources; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; if (is_managed(rsc)) { return TRUE; } } return FALSE; } /*! * \internal * \brief Create pseudo-op for guest node fence, and order relative to it * * \param[in] node Guest node to fence * \param[in] data_set Working set of CIB state */ static void fence_guest(pe_node_t *node, pe_working_set_t *data_set) { pe_resource_t *container = node->details->remote_rsc->container; pe_action_t *stop = NULL; pe_action_t *stonith_op = NULL; /* The fence action is just a label; we don't do anything differently for * off vs. reboot. We specify it explicitly, rather than let it default to * cluster's default action, because we are not _initiating_ fencing -- we * are creating a pseudo-event to describe fencing that is already occurring * by other means (container recovery). */ const char *fence_action = "off"; /* Check whether guest's container resource has any explicit stop or * start (the stop may be implied by fencing of the guest's host). */ if (container) { stop = find_first_action(container->actions, NULL, CRMD_ACTION_STOP, NULL); if (find_first_action(container->actions, NULL, CRMD_ACTION_START, NULL)) { fence_action = "reboot"; } } /* Create a fence pseudo-event, so we have an event to order actions * against, and the controller can always detect it. */ stonith_op = pe_fence_op(node, fence_action, FALSE, "guest is unclean", FALSE, data_set); pe__set_action_flags(stonith_op, pe_action_pseudo|pe_action_runnable); /* We want to imply stops/demotes after the guest is stopped, not wait until * it is restarted, so we always order pseudo-fencing after stop, not start * (even though start might be closer to what is done for a real reboot). */ if ((stop != NULL) && pcmk_is_set(stop->flags, pe_action_pseudo)) { pe_action_t *parent_stonith_op = pe_fence_op(stop->node, NULL, FALSE, NULL, FALSE, data_set); crm_info("Implying guest node %s is down (action %d) after %s fencing", node->details->uname, stonith_op->id, stop->node->details->uname); order_actions(parent_stonith_op, stonith_op, pe_order_runnable_left|pe_order_implies_then); } else if (stop) { order_actions(stop, stonith_op, pe_order_runnable_left|pe_order_implies_then); crm_info("Implying guest node %s is down (action %d) " "after container %s is stopped (action %d)", node->details->uname, stonith_op->id, container->id, stop->id); } else { /* If we're fencing the guest node but there's no stop for the guest * resource, we must think the guest is already stopped. However, we may * think so because its resource history was just cleaned. To avoid * unnecessarily considering the guest node down if it's really up, * order the pseudo-fencing after any stop of the connection resource, * which will be ordered after any container (re-)probe. */ stop = find_first_action(node->details->remote_rsc->actions, NULL, RSC_STOP, NULL); if (stop) { order_actions(stop, stonith_op, pe_order_optional); crm_info("Implying guest node %s is down (action %d) " "after connection is stopped (action %d)", node->details->uname, stonith_op->id, stop->id); } else { /* Not sure why we're fencing, but everything must already be * cleanly stopped. */ crm_info("Implying guest node %s is down (action %d) ", node->details->uname, stonith_op->id); } } /* Order/imply other actions relative to pseudo-fence as with real fence */ pcmk__order_vs_fence(stonith_op, data_set); } /* * Create dependencies for stonith and shutdown operations */ gboolean stage6(pe_working_set_t * data_set) { pe_action_t *dc_down = NULL; pe_action_t *stonith_op = NULL; gboolean integrity_lost = FALSE; gboolean need_stonith = TRUE; GList *gIter; GList *stonith_ops = NULL; GList *shutdown_ops = NULL; /* Remote ordering constraints need to happen prior to calculating fencing * because it is one more place we will mark the node as dirty. * * A nice side effect of doing them early is that apply_*_ordering() can be * simpler because pe_fence_node() has already done some of the work. */ crm_trace("Creating remote ordering constraints"); apply_remote_node_ordering(data_set); crm_trace("Processing fencing and shutdown cases"); if (any_managed_resources(data_set) == FALSE) { crm_notice("Delaying fencing operations until there are resources to manage"); need_stonith = FALSE; } /* Check each node for stonith/shutdown */ for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; /* Guest nodes are "fenced" by recovering their container resource, * so handle them separately. */ if (pe__is_guest_node(node)) { if (node->details->remote_requires_reset && need_stonith && pe_can_fence(data_set, node)) { fence_guest(node, data_set); } continue; } stonith_op = NULL; if (node->details->unclean && need_stonith && pe_can_fence(data_set, node)) { stonith_op = pe_fence_op(node, NULL, FALSE, "node is unclean", FALSE, data_set); pe_warn("Scheduling Node %s for STONITH", node->details->uname); pcmk__order_vs_fence(stonith_op, data_set); if (node->details->is_dc) { // Remember if the DC is being fenced dc_down = stonith_op; } else { if (!pcmk_is_set(data_set->flags, pe_flag_concurrent_fencing) && (stonith_ops != NULL)) { /* Concurrent fencing is disabled, so order each non-DC * fencing in a chain. If there is any DC fencing or * shutdown, it will be ordered after the last action in the * chain later. */ order_actions((pe_action_t *) stonith_ops->data, stonith_op, pe_order_optional); } // Remember all non-DC fencing actions in a separate list stonith_ops = g_list_prepend(stonith_ops, stonith_op); } } else if (node->details->online && node->details->shutdown && /* TODO define what a shutdown op means for a remote node. * For now we do not send shutdown operations for remote nodes, but * if we can come up with a good use for this in the future, we will. */ pe__is_guest_or_remote_node(node) == FALSE) { pe_action_t *down_op = sched_shutdown_op(node, data_set); if (node->details->is_dc) { // Remember if the DC is being shut down dc_down = down_op; } else { // Remember non-DC shutdowns for later ordering shutdown_ops = g_list_prepend(shutdown_ops, down_op); } } if (node->details->unclean && stonith_op == NULL) { integrity_lost = TRUE; pe_warn("Node %s is unclean!", node->details->uname); } } if (integrity_lost) { if (!pcmk_is_set(data_set->flags, pe_flag_stonith_enabled)) { pe_warn("YOUR RESOURCES ARE NOW LIKELY COMPROMISED"); pe_err("ENABLE STONITH TO KEEP YOUR RESOURCES SAFE"); } else if (!pcmk_is_set(data_set->flags, pe_flag_have_quorum)) { crm_notice("Cannot fence unclean nodes until quorum is" " attained (or no-quorum-policy is set to ignore)"); } } if (dc_down != NULL) { /* Order any non-DC shutdowns before any DC shutdown, to avoid repeated * DC elections. However, we don't want to order non-DC shutdowns before * a DC *fencing*, because even though we don't want a node that's * shutting down to become DC, the DC fencing could be ordered before a * clone stop that's also ordered before the shutdowns, thus leading to * a graph loop. */ if (pcmk__str_eq(dc_down->task, CRM_OP_SHUTDOWN, pcmk__str_casei)) { for (gIter = shutdown_ops; gIter != NULL; gIter = gIter->next) { pe_action_t *node_stop = (pe_action_t *) gIter->data; crm_debug("Ordering shutdown on %s before %s on DC %s", node_stop->node->details->uname, dc_down->task, dc_down->node->details->uname); order_actions(node_stop, dc_down, pe_order_optional); } } // Order any non-DC fencing before any DC fencing or shutdown if (pcmk_is_set(data_set->flags, pe_flag_concurrent_fencing)) { /* With concurrent fencing, order each non-DC fencing action * separately before any DC fencing or shutdown. */ for (gIter = stonith_ops; gIter != NULL; gIter = gIter->next) { order_actions((pe_action_t *) gIter->data, dc_down, pe_order_optional); } } else if (stonith_ops) { /* Without concurrent fencing, the non-DC fencing actions are * already ordered relative to each other, so we just need to order * the DC fencing after the last action in the chain (which is the * first item in the list). */ order_actions((pe_action_t *) stonith_ops->data, dc_down, pe_order_optional); } } g_list_free(stonith_ops); g_list_free(shutdown_ops); return TRUE; } /* * Determine the sets of independent actions and the correct order for the * actions in each set. * * Mark dependencies of un-runnable actions un-runnable * */ static GList * find_actions_by_task(GList *actions, pe_resource_t * rsc, const char *original_key) { GList *list = NULL; list = find_actions(actions, original_key, NULL); if (list == NULL) { /* we're potentially searching a child of the original resource */ char *key = NULL; char *task = NULL; guint interval_ms = 0; if (parse_op_key(original_key, NULL, &task, &interval_ms)) { key = pcmk__op_key(rsc->id, task, interval_ms); list = find_actions(actions, key, NULL); } else { crm_err("search key: %s", original_key); } free(key); free(task); } return list; } static void rsc_order_then(pe_action_t *lh_action, pe_resource_t *rsc, pe__ordering_t *order) { GList *gIter = NULL; GList *rh_actions = NULL; pe_action_t *rh_action = NULL; enum pe_ordering type; CRM_CHECK(rsc != NULL, return); CRM_CHECK(order != NULL, return); type = order->type; rh_action = order->rh_action; crm_trace("Applying ordering constraint %d (then: %s)", order->id, rsc->id); if (rh_action != NULL) { rh_actions = g_list_prepend(NULL, rh_action); } else if (rsc != NULL) { rh_actions = find_actions_by_task(rsc->actions, rsc, order->rh_action_task); } if (rh_actions == NULL) { pe_rsc_trace(rsc, "Ignoring constraint %d: then (%s for %s) not found", order->id, order->rh_action_task, rsc->id); return; } if ((lh_action != NULL) && (lh_action->rsc == rsc) && pcmk_is_set(lh_action->flags, pe_action_dangle)) { pe_rsc_trace(rsc, "Detected dangling operation %s -> %s", lh_action->uuid, order->rh_action_task); pe__clear_order_flags(type, pe_order_implies_then); } gIter = rh_actions; for (; gIter != NULL; gIter = gIter->next) { pe_action_t *rh_action_iter = (pe_action_t *) gIter->data; if (lh_action) { order_actions(lh_action, rh_action_iter, type); } else if (type & pe_order_implies_then) { pe__clear_action_flags(rh_action_iter, pe_action_runnable); crm_warn("Unrunnable %s 0x%.6x", rh_action_iter->uuid, type); } else { crm_warn("neither %s 0x%.6x", rh_action_iter->uuid, type); } } g_list_free(rh_actions); } static void rsc_order_first(pe_resource_t *lh_rsc, pe__ordering_t *order, pe_working_set_t *data_set) { GList *lh_actions = NULL; pe_action_t *lh_action = order->lh_action; pe_resource_t *rh_rsc = order->rh_rsc; CRM_ASSERT(lh_rsc != NULL); pe_rsc_trace(lh_rsc, "Applying ordering constraint %d (first: %s)", order->id, lh_rsc->id); if (lh_action != NULL) { lh_actions = g_list_prepend(NULL, lh_action); } else { lh_actions = find_actions_by_task(lh_rsc->actions, lh_rsc, order->lh_action_task); } if ((lh_actions == NULL) && (lh_rsc == rh_rsc)) { pe_rsc_trace(lh_rsc, "Ignoring constraint %d: first (%s for %s) not found", order->id, order->lh_action_task, lh_rsc->id); } else if (lh_actions == NULL) { char *key = NULL; char *op_type = NULL; guint interval_ms = 0; parse_op_key(order->lh_action_task, NULL, &op_type, &interval_ms); key = pcmk__op_key(lh_rsc->id, op_type, interval_ms); if (lh_rsc->fns->state(lh_rsc, TRUE) == RSC_ROLE_STOPPED && pcmk__str_eq(op_type, RSC_STOP, pcmk__str_casei)) { free(key); pe_rsc_trace(lh_rsc, "Ignoring constraint %d: first (%s for %s) not found", order->id, order->lh_action_task, lh_rsc->id); } else if ((lh_rsc->fns->state(lh_rsc, TRUE) == RSC_ROLE_UNPROMOTED) && pcmk__str_eq(op_type, RSC_DEMOTE, pcmk__str_casei)) { free(key); pe_rsc_trace(lh_rsc, "Ignoring constraint %d: first (%s for %s) not found", order->id, order->lh_action_task, lh_rsc->id); } else { pe_rsc_trace(lh_rsc, "Creating first (%s for %s) for constraint %d ", order->lh_action_task, lh_rsc->id, order->id); lh_action = custom_action(lh_rsc, key, op_type, NULL, TRUE, TRUE, data_set); lh_actions = g_list_prepend(NULL, lh_action); } free(op_type); } if (rh_rsc == NULL) { if (order->rh_action == NULL) { pe_rsc_trace(lh_rsc, "Ignoring constraint %d: then not found", order->id); return; } rh_rsc = order->rh_action->rsc; } for (GList *gIter = lh_actions; gIter != NULL; gIter = gIter->next) { lh_action = (pe_action_t *) gIter->data; if (rh_rsc == NULL) { order_actions(lh_action, order->rh_action, order->type); } else { rsc_order_then(lh_action, rh_rsc, order); } } g_list_free(lh_actions); } extern void update_colo_start_chain(pe_action_t *action, pe_working_set_t *data_set); static int is_recurring_action(pe_action_t *action) { guint interval_ms; if (pcmk__guint_from_hash(action->meta, XML_LRM_ATTR_INTERVAL_MS, 0, &interval_ms) != pcmk_rc_ok) { return 0; } return (interval_ms > 0); } static void apply_container_ordering(pe_action_t *action, pe_working_set_t *data_set) { /* VMs are also classified as containers for these purposes... in * that they both involve a 'thing' running on a real or remote * cluster node. * * This allows us to be smarter about the type and extent of * recovery actions required in various scenarios */ pe_resource_t *remote_rsc = NULL; pe_resource_t *container = NULL; enum action_tasks task = text2task(action->task); CRM_ASSERT(action->rsc); CRM_ASSERT(action->node); CRM_ASSERT(pe__is_guest_or_remote_node(action->node)); remote_rsc = action->node->details->remote_rsc; CRM_ASSERT(remote_rsc); container = remote_rsc->container; CRM_ASSERT(container); if (pcmk_is_set(container->flags, pe_rsc_failed)) { pe_fence_node(data_set, action->node, "container failed", FALSE); } crm_trace("Order %s action %s relative to %s%s for %s%s", action->task, action->uuid, pcmk_is_set(remote_rsc->flags, pe_rsc_failed)? "failed " : "", remote_rsc->id, pcmk_is_set(container->flags, pe_rsc_failed)? "failed " : "", container->id); if (pcmk__strcase_any_of(action->task, CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, NULL)) { /* Migration ops map to "no_action", but we need to apply the same * ordering as for stop or demote (see get_router_node()). */ task = stop_rsc; } switch (task) { case start_rsc: case action_promote: /* Force resource recovery if the container is recovered */ order_start_then_action(container, action, pe_order_implies_then, data_set); /* Wait for the connection resource to be up too */ order_start_then_action(remote_rsc, action, pe_order_none, data_set); break; case stop_rsc: case action_demote: if (pcmk_is_set(container->flags, pe_rsc_failed)) { /* When the container representing a guest node fails, any stop * or demote actions for resources running on the guest node * are implied by the container stopping. This is similar to * how fencing operations work for cluster nodes and remote * nodes. */ } else { /* Ensure the operation happens before the connection is brought * down. * * If we really wanted to, we could order these after the * connection start, IFF the container's current role was * stopped (otherwise we re-introduce an ordering loop when the * connection is restarting). */ order_action_then_stop(action, remote_rsc, pe_order_none, data_set); } break; default: /* Wait for the connection resource to be up */ if (is_recurring_action(action)) { /* In case we ever get the recovery logic wrong, force * recurring monitors to be restarted, even if just * the connection was re-established */ if(task != no_action) { order_start_then_action(remote_rsc, action, pe_order_implies_then, data_set); } } else { order_start_then_action(remote_rsc, action, pe_order_none, data_set); } break; } } static enum remote_connection_state get_remote_node_state(pe_node_t *node) { pe_resource_t *remote_rsc = NULL; pe_node_t *cluster_node = NULL; CRM_ASSERT(node); remote_rsc = node->details->remote_rsc; CRM_ASSERT(remote_rsc); cluster_node = pe__current_node(remote_rsc); /* If the cluster node the remote connection resource resides on * is unclean or went offline, we can't process any operations * on that remote node until after it starts elsewhere. */ if(remote_rsc->next_role == RSC_ROLE_STOPPED || remote_rsc->allocated_to == NULL) { /* The connection resource is not going to run anywhere */ if (cluster_node && cluster_node->details->unclean) { /* The remote connection is failed because its resource is on a * failed node and can't be recovered elsewhere, so we must fence. */ return remote_state_failed; } if (!pcmk_is_set(remote_rsc->flags, pe_rsc_failed)) { /* Connection resource is cleanly stopped */ return remote_state_stopped; } /* Connection resource is failed */ if ((remote_rsc->next_role == RSC_ROLE_STOPPED) && remote_rsc->remote_reconnect_ms && node->details->remote_was_fenced && !pe__shutdown_requested(node)) { /* We won't know whether the connection is recoverable until the * reconnect interval expires and we reattempt connection. */ return remote_state_unknown; } /* The remote connection is in a failed state. If there are any * resources known to be active on it (stop) or in an unknown state * (probe), we must assume the worst and fence it. */ return remote_state_failed; } else if (cluster_node == NULL) { /* Connection is recoverable but not currently running anywhere, see if we can recover it first */ return remote_state_unknown; } else if(cluster_node->details->unclean == TRUE || cluster_node->details->online == FALSE) { /* Connection is running on a dead node, see if we can recover it first */ return remote_state_resting; } else if (pcmk__list_of_multiple(remote_rsc->running_on) && remote_rsc->partial_migration_source && remote_rsc->partial_migration_target) { /* We're in the middle of migrating a connection resource, * wait until after the resource migrates before performing * any actions. */ return remote_state_resting; } return remote_state_alive; } /*! * \internal * \brief Order actions on remote node relative to actions for the connection */ static void apply_remote_ordering(pe_action_t *action, pe_working_set_t *data_set) { pe_resource_t *remote_rsc = NULL; enum action_tasks task = text2task(action->task); enum remote_connection_state state = get_remote_node_state(action->node); enum pe_ordering order_opts = pe_order_none; if (action->rsc == NULL) { return; } CRM_ASSERT(action->node); CRM_ASSERT(pe__is_guest_or_remote_node(action->node)); remote_rsc = action->node->details->remote_rsc; CRM_ASSERT(remote_rsc); crm_trace("Order %s action %s relative to %s%s (state: %s)", action->task, action->uuid, pcmk_is_set(remote_rsc->flags, pe_rsc_failed)? "failed " : "", remote_rsc->id, state2text(state)); if (pcmk__strcase_any_of(action->task, CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, NULL)) { /* Migration ops map to "no_action", but we need to apply the same * ordering as for stop or demote (see get_router_node()). */ task = stop_rsc; } switch (task) { case start_rsc: case action_promote: order_opts = pe_order_none; if (state == remote_state_failed) { /* Force recovery, by making this action required */ pe__set_order_flags(order_opts, pe_order_implies_then); } /* Ensure connection is up before running this action */ order_start_then_action(remote_rsc, action, order_opts, data_set); break; case stop_rsc: if(state == remote_state_alive) { order_action_then_stop(action, remote_rsc, pe_order_implies_first, data_set); } else if(state == remote_state_failed) { /* The resource is active on the node, but since we don't have a * valid connection, the only way to stop the resource is by * fencing the node. There is no need to order the stop relative * to the remote connection, since the stop will become implied * by the fencing. */ pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable", FALSE); } else if(remote_rsc->next_role == RSC_ROLE_STOPPED) { /* State must be remote_state_unknown or remote_state_stopped. * Since the connection is not coming back up in this * transition, stop this resource first. */ order_action_then_stop(action, remote_rsc, pe_order_implies_first, data_set); } else { /* The connection is going to be started somewhere else, so * stop this resource after that completes. */ order_start_then_action(remote_rsc, action, pe_order_none, data_set); } break; case action_demote: /* Only order this demote relative to the connection start if the * connection isn't being torn down. Otherwise, the demote would be * blocked because the connection start would not be allowed. */ if(state == remote_state_resting || state == remote_state_unknown) { order_start_then_action(remote_rsc, action, pe_order_none, data_set); } /* Otherwise we can rely on the stop ordering */ break; default: /* Wait for the connection resource to be up */ if (is_recurring_action(action)) { /* In case we ever get the recovery logic wrong, force * recurring monitors to be restarted, even if just * the connection was re-established */ order_start_then_action(remote_rsc, action, pe_order_implies_then, data_set); } else { pe_node_t *cluster_node = pe__current_node(remote_rsc); if(task == monitor_rsc && state == remote_state_failed) { /* We would only be here if we do not know the * state of the resource on the remote node. * Since we have no way to find out, it is * necessary to fence the node. */ pe_fence_node(data_set, action->node, "resources are in an unknown state and the connection is unrecoverable", FALSE); } if(cluster_node && state == remote_state_stopped) { /* The connection is currently up, but is going * down permanently. * * Make sure we check services are actually * stopped _before_ we let the connection get * closed */ order_action_then_stop(action, remote_rsc, pe_order_runnable_left, data_set); } else { order_start_then_action(remote_rsc, action, pe_order_none, data_set); } } break; } } static void apply_remote_node_ordering(pe_working_set_t *data_set) { if (!pcmk_is_set(data_set->flags, pe_flag_have_remote_nodes)) { return; } for (GList *gIter = data_set->actions; gIter != NULL; gIter = gIter->next) { pe_action_t *action = (pe_action_t *) gIter->data; pe_resource_t *remote = NULL; // We are only interested in resource actions if (action->rsc == NULL) { continue; } /* Special case: If we are clearing the failcount of an actual * remote connection resource, then make sure this happens before * any start of the resource in this transition. */ if (action->rsc->is_remote_node && pcmk__str_eq(action->task, CRM_OP_CLEAR_FAILCOUNT, pcmk__str_casei)) { custom_action_order(action->rsc, NULL, action, action->rsc, pcmk__op_key(action->rsc->id, RSC_START, 0), NULL, pe_order_optional, data_set); continue; } // We are only interested in actions allocated to a node if (action->node == NULL) { continue; } if (!pe__is_guest_or_remote_node(action->node)) { continue; } /* We are only interested in real actions. * * @TODO This is probably wrong; pseudo-actions might be converted to * real actions and vice versa later in update_actions() at the end of * stage7(). */ if (pcmk_is_set(action->flags, pe_action_pseudo)) { continue; } remote = action->node->details->remote_rsc; if (remote == NULL) { // Orphaned continue; } /* Another special case: if a resource is moving to a Pacemaker Remote * node, order the stop on the original node after any start of the * remote connection. This ensures that if the connection fails to * start, we leave the resource running on the original node. */ if (pcmk__str_eq(action->task, RSC_START, pcmk__str_casei)) { for (GList *item = action->rsc->actions; item != NULL; item = item->next) { pe_action_t *rsc_action = item->data; if ((rsc_action->node->details != action->node->details) && pcmk__str_eq(rsc_action->task, RSC_STOP, pcmk__str_casei)) { custom_action_order(remote, start_key(remote), NULL, action->rsc, NULL, rsc_action, pe_order_optional, data_set); } } } /* The action occurs across a remote connection, so create * ordering constraints that guarantee the action occurs while the node * is active (after start, before stop ... things like that). * * This is somewhat brittle in that we need to make sure the results of * this ordering are compatible with the result of get_router_node(). * It would probably be better to add XML_LRM_ATTR_ROUTER_NODE as part * of this logic rather than action2xml(). */ if (remote->container) { crm_trace("Container ordering for %s", action->uuid); apply_container_ordering(action, data_set); } else { crm_trace("Remote ordering for %s", action->uuid); apply_remote_ordering(action, data_set); } } } static gboolean order_first_probe_unneeded(pe_action_t * probe, pe_action_t * rh_action) { /* No need to probe the resource on the node that is being * unfenced. Otherwise it might introduce transition loop * since probe will be performed after the node is * unfenced. */ if (pcmk__str_eq(rh_action->task, CRM_OP_FENCE, pcmk__str_casei) && probe->node && rh_action->node && probe->node->details == rh_action->node->details) { const char *op = g_hash_table_lookup(rh_action->meta, "stonith_action"); if (pcmk__str_eq(op, "on", pcmk__str_casei)) { return TRUE; } } // Shutdown waits for probe to complete only if it's on the same node if ((pcmk__str_eq(rh_action->task, CRM_OP_SHUTDOWN, pcmk__str_casei)) && probe->node && rh_action->node && probe->node->details != rh_action->node->details) { return TRUE; } return FALSE; } static void order_first_probes_imply_stops(pe_working_set_t * data_set) { GList *gIter = NULL; for (gIter = data_set->ordering_constraints; gIter != NULL; gIter = gIter->next) { pe__ordering_t *order = gIter->data; enum pe_ordering order_type = pe_order_optional; pe_resource_t *lh_rsc = order->lh_rsc; pe_resource_t *rh_rsc = order->rh_rsc; pe_action_t *lh_action = order->lh_action; pe_action_t *rh_action = order->rh_action; const char *lh_action_task = order->lh_action_task; const char *rh_action_task = order->rh_action_task; GList *probes = NULL; GList *rh_actions = NULL; GList *pIter = NULL; if (lh_rsc == NULL) { continue; } else if (rh_rsc && lh_rsc == rh_rsc) { continue; } if (lh_action == NULL && lh_action_task == NULL) { continue; } if (rh_action == NULL && rh_action_task == NULL) { continue; } /* Technically probe is expected to return "not running", which could be * the alternative of stop action if the status of the resource is * unknown yet. */ if (lh_action && !pcmk__str_eq(lh_action->task, RSC_STOP, pcmk__str_casei)) { continue; } else if (lh_action == NULL && lh_action_task && !pcmk__ends_with(lh_action_task, "_" RSC_STOP "_0")) { continue; } /* Do not probe the resource inside of a stopping container. Otherwise * it might introduce transition loop since probe will be performed * after the container starts again. */ if (rh_rsc && lh_rsc->container == rh_rsc) { if (rh_action && pcmk__str_eq(rh_action->task, RSC_STOP, pcmk__str_casei)) { continue; } else if (rh_action == NULL && rh_action_task && pcmk__ends_with(rh_action_task,"_" RSC_STOP "_0")) { continue; } } if (order->type == pe_order_none) { continue; } // Preserve the order options for future filtering if (pcmk_is_set(order->type, pe_order_apply_first_non_migratable)) { pe__set_order_flags(order_type, pe_order_apply_first_non_migratable); } if (pcmk_is_set(order->type, pe_order_same_node)) { pe__set_order_flags(order_type, pe_order_same_node); } // Keep the order types for future filtering if (order->type == pe_order_anti_colocation || order->type == pe_order_load) { order_type = order->type; } probes = pe__resource_actions(lh_rsc, NULL, RSC_STATUS, FALSE); if (probes == NULL) { continue; } if (rh_action) { rh_actions = g_list_prepend(rh_actions, rh_action); } else if (rh_rsc && rh_action_task) { rh_actions = find_actions(rh_rsc->actions, rh_action_task, NULL); } if (rh_actions == NULL) { g_list_free(probes); continue; } crm_trace("Processing for LH probe based on ordering constraint %s -> %s" " (id=%d, type=%.6x)", lh_action ? lh_action->uuid : lh_action_task, rh_action ? rh_action->uuid : rh_action_task, order->id, order->type); for (pIter = probes; pIter != NULL; pIter = pIter->next) { pe_action_t *probe = (pe_action_t *) pIter->data; GList *rIter = NULL; for (rIter = rh_actions; rIter != NULL; rIter = rIter->next) { pe_action_t *rh_action_iter = (pe_action_t *) rIter->data; if (order_first_probe_unneeded(probe, rh_action_iter)) { continue; } order_actions(probe, rh_action_iter, order_type); } } g_list_free(rh_actions); g_list_free(probes); } } static void order_first_probe_then_restart_repromote(pe_action_t * probe, pe_action_t * after, pe_working_set_t * data_set) { GList *gIter = NULL; bool interleave = FALSE; pe_resource_t *compatible_rsc = NULL; if (probe == NULL || probe->rsc == NULL || probe->rsc->variant != pe_native) { return; } if (after == NULL // Avoid running into any possible loop || pcmk_is_set(after->flags, pe_action_tracking)) { return; } if (!pcmk__str_eq(probe->task, RSC_STATUS, pcmk__str_casei)) { return; } pe__set_action_flags(after, pe_action_tracking); crm_trace("Processing based on %s %s -> %s %s", probe->uuid, probe->node ? probe->node->details->uname: "", after->uuid, after->node ? after->node->details->uname : ""); if (after->rsc /* Better not build a dependency directly with a clone/group. * We are going to proceed through the ordering chain and build * dependencies with its children. */ && after->rsc->variant == pe_native && probe->rsc != after->rsc) { GList *then_actions = NULL; enum pe_ordering probe_order_type = pe_order_optional; if (pcmk__str_eq(after->task, RSC_START, pcmk__str_casei)) { then_actions = pe__resource_actions(after->rsc, NULL, RSC_STOP, FALSE); } else if (pcmk__str_eq(after->task, RSC_PROMOTE, pcmk__str_casei)) { then_actions = pe__resource_actions(after->rsc, NULL, RSC_DEMOTE, FALSE); } for (gIter = then_actions; gIter != NULL; gIter = gIter->next) { pe_action_t *then = (pe_action_t *) gIter->data; // Skip any pseudo action which for example is implied by fencing if (pcmk_is_set(then->flags, pe_action_pseudo)) { continue; } order_actions(probe, then, probe_order_type); } g_list_free(then_actions); } if (after->rsc && after->rsc->variant > pe_group) { const char *interleave_s = g_hash_table_lookup(after->rsc->meta, XML_RSC_ATTR_INTERLEAVE); interleave = crm_is_true(interleave_s); if (interleave) { /* For an interleaved clone, we should build a dependency only * with the relevant clone child. */ compatible_rsc = find_compatible_child(probe->rsc, after->rsc, RSC_ROLE_UNKNOWN, FALSE, data_set); } } for (gIter = after->actions_after; gIter != NULL; gIter = gIter->next) { pe_action_wrapper_t *after_wrapper = (pe_action_wrapper_t *) gIter->data; /* pe_order_implies_then is the reason why a required A.start * implies/enforces B.start to be required too, which is the cause of * B.restart/re-promote. * * Not sure about pe_order_implies_then_on_node though. It's now only * used for unfencing case, which tends to introduce transition * loops... */ if (!pcmk_is_set(after_wrapper->type, pe_order_implies_then)) { /* The order type between a group/clone and its child such as * B.start-> B_child.start is: * pe_order_implies_first_printed | pe_order_runnable_left * * Proceed through the ordering chain and build dependencies with * its children. */ if (after->rsc == NULL || after->rsc->variant < pe_group || probe->rsc->parent == after->rsc || after_wrapper->action->rsc == NULL || after_wrapper->action->rsc->variant > pe_group || after->rsc != after_wrapper->action->rsc->parent) { continue; } /* Proceed to the children of a group or a non-interleaved clone. * For an interleaved clone, proceed only to the relevant child. */ if (after->rsc->variant > pe_group && interleave == TRUE && (compatible_rsc == NULL || compatible_rsc != after_wrapper->action->rsc)) { continue; } } crm_trace("Proceeding through %s %s -> %s %s (type=0x%.6x)", after->uuid, after->node ? after->node->details->uname: "", after_wrapper->action->uuid, after_wrapper->action->node ? after_wrapper->action->node->details->uname : "", after_wrapper->type); order_first_probe_then_restart_repromote(probe, after_wrapper->action, data_set); } } static void clear_actions_tracking_flag(pe_working_set_t * data_set) { GList *gIter = NULL; for (gIter = data_set->actions; gIter != NULL; gIter = gIter->next) { pe_action_t *action = (pe_action_t *) gIter->data; if (pcmk_is_set(action->flags, pe_action_tracking)) { pe__clear_action_flags(action, pe_action_tracking); } } } static void order_first_rsc_probes(pe_resource_t * rsc, pe_working_set_t * data_set) { GList *gIter = NULL; GList *probes = NULL; g_list_foreach(rsc->children, (GFunc) order_first_rsc_probes, data_set); if (rsc->variant != pe_native) { return; } probes = pe__resource_actions(rsc, NULL, RSC_STATUS, FALSE); for (gIter = probes; gIter != NULL; gIter= gIter->next) { pe_action_t *probe = (pe_action_t *) gIter->data; GList *aIter = NULL; for (aIter = probe->actions_after; aIter != NULL; aIter = aIter->next) { pe_action_wrapper_t *after_wrapper = (pe_action_wrapper_t *) aIter->data; order_first_probe_then_restart_repromote(probe, after_wrapper->action, data_set); clear_actions_tracking_flag(data_set); } } g_list_free(probes); } static void order_first_probes(pe_working_set_t * data_set) { GList *gIter = NULL; for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; order_first_rsc_probes(rsc, data_set); } order_first_probes_imply_stops(data_set); } static void order_then_probes(pe_working_set_t * data_set) { #if 0 GList *gIter = NULL; for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; /* Given "A then B", we would prefer to wait for A to be * started before probing B. * * If A was a filesystem on which the binaries and data for B * lived, it would have been useful if the author of B's agent * could assume that A is running before B.monitor will be * called. * * However we can't _only_ probe once A is running, otherwise * we'd not detect the state of B if A could not be started * for some reason. * * In practice however, we cannot even do an opportunistic * version of this because B may be moving: * * B.probe -> B.start * B.probe -> B.stop * B.stop -> B.start * A.stop -> A.start * A.start -> B.probe * * So far so good, but if we add the result of this code: * * B.stop -> A.stop * * Then we get a loop: * * B.probe -> B.stop -> A.stop -> A.start -> B.probe * * We could kill the 'B.probe -> B.stop' dependency, but that * could mean stopping B "too" soon, because B.start must wait * for the probes to complete. * * Another option is to allow it only if A is a non-unique * clone with clone-max == node-max (since we'll never be * moving it). However, we could still be stopping one * instance at the same time as starting another. * The complexity of checking for allowed conditions combined * with the ever narrowing usecase suggests that this code * should remain disabled until someone gets smarter. */ pe_action_t *start = NULL; GList *actions = NULL; GList *probes = NULL; actions = pe__resource_actions(rsc, NULL, RSC_START, FALSE); if (actions) { start = actions->data; g_list_free(actions); } if(start == NULL) { crm_err("No start action for %s", rsc->id); continue; } probes = pe__resource_actions(rsc, NULL, RSC_STATUS, FALSE); for (actions = start->actions_before; actions != NULL; actions = actions->next) { pe_action_wrapper_t *before = (pe_action_wrapper_t *) actions->data; GList *pIter = NULL; pe_action_t *first = before->action; pe_resource_t *first_rsc = first->rsc; if(first->required_runnable_before) { GList *clone_actions = NULL; for (clone_actions = first->actions_before; clone_actions != NULL; clone_actions = clone_actions->next) { before = (pe_action_wrapper_t *) clone_actions->data; crm_trace("Testing %s -> %s (%p) for %s", first->uuid, before->action->uuid, before->action->rsc, start->uuid); CRM_ASSERT(before->action->rsc); first_rsc = before->action->rsc; break; } } else if(!pcmk__str_eq(first->task, RSC_START, pcmk__str_casei)) { crm_trace("Not a start op %s for %s", first->uuid, start->uuid); } if(first_rsc == NULL) { continue; } else if(uber_parent(first_rsc) == uber_parent(start->rsc)) { crm_trace("Same parent %s for %s", first_rsc->id, start->uuid); continue; } else if(FALSE && pe_rsc_is_clone(uber_parent(first_rsc)) == FALSE) { crm_trace("Not a clone %s for %s", first_rsc->id, start->uuid); continue; } crm_err("Applying %s before %s %d", first->uuid, start->uuid, uber_parent(first_rsc)->variant); for (pIter = probes; pIter != NULL; pIter = pIter->next) { pe_action_t *probe = (pe_action_t *) pIter->data; crm_err("Ordering %s before %s", first->uuid, probe->uuid); order_actions(first, probe, pe_order_optional); } } } #endif } static void order_probes(pe_working_set_t * data_set) { order_first_probes(data_set); order_then_probes(data_set); } gboolean stage7(pe_working_set_t * data_set) { pcmk__output_t *prev_out = data_set->priv; pcmk__output_t *out = NULL; GList *gIter = NULL; crm_trace("Applying ordering constraints"); /* Don't ask me why, but apparently they need to be processed in * the order they were created in... go figure * * Also g_list_append() has horrendous performance characteristics * So we need to use g_list_prepend() and then reverse the list here */ data_set->ordering_constraints = g_list_reverse(data_set->ordering_constraints); for (gIter = data_set->ordering_constraints; gIter != NULL; gIter = gIter->next) { pe__ordering_t *order = gIter->data; pe_resource_t *rsc = order->lh_rsc; if (rsc != NULL) { rsc_order_first(rsc, order, data_set); continue; } rsc = order->rh_rsc; if (rsc != NULL) { rsc_order_then(order->lh_action, rsc, order); } else { crm_trace("Applying ordering constraint %d (non-resource actions)", order->id); order_actions(order->lh_action, order->rh_action, order->type); } } g_list_foreach(data_set->actions, (GFunc) update_colo_start_chain, data_set); crm_trace("Ordering probes"); order_probes(data_set); crm_trace("Updating %d actions", g_list_length(data_set->actions)); g_list_foreach(data_set->actions, (GFunc) update_action, data_set); // Check for invalid orderings for (gIter = data_set->actions; gIter != NULL; gIter = gIter->next) { pe_action_t *action = (pe_action_t *) gIter->data; pe_action_wrapper_t *input = NULL; for (GList *input_iter = action->actions_before; input_iter != NULL; input_iter = input_iter->next) { input = (pe_action_wrapper_t *) input_iter->data; if (pcmk__ordering_is_invalid(action, input)) { input->type = pe_order_none; } } } /* stage7 only ever outputs to the log, so ignore whatever output object was * previously set and just log instead. */ out = pcmk__new_logger(); if (out == NULL) { return FALSE; } pcmk__output_set_log_level(out, LOG_NOTICE); data_set->priv = out; out->begin_list(out, NULL, NULL, "Actions"); LogNodeActions(data_set); g_list_foreach(data_set->resources, (GFunc) LogActions, data_set); out->end_list(out); out->finish(out, CRM_EX_OK, true, NULL); pcmk__output_free(out); data_set->priv = prev_out; return TRUE; } static int transition_id = -1; /*! * \internal * \brief Log a message after calculating a transition * * \param[in] filename Where transition input is stored */ void pcmk__log_transition_summary(const char *filename) { if (was_processing_error) { crm_err("Calculated transition %d (with errors)%s%s", transition_id, (filename == NULL)? "" : ", saving inputs in ", (filename == NULL)? "" : filename); } else if (was_processing_warning) { crm_warn("Calculated transition %d (with warnings)%s%s", transition_id, (filename == NULL)? "" : ", saving inputs in ", (filename == NULL)? "" : filename); } else { crm_notice("Calculated transition %d%s%s", transition_id, (filename == NULL)? "" : ", saving inputs in ", (filename == NULL)? "" : filename); } if (crm_config_error) { crm_notice("Configuration errors found during scheduler processing," " please run \"crm_verify -L\" to identify issues"); } } /* * Create a dependency graph to send to the transitioner (via the controller) */ gboolean stage8(pe_working_set_t * data_set) { GList *gIter = NULL; const char *value = NULL; long long limit = 0LL; transition_id++; crm_trace("Creating transition graph %d.", transition_id); data_set->graph = create_xml_node(NULL, XML_TAG_GRAPH); value = pe_pref(data_set->config_hash, "cluster-delay"); crm_xml_add(data_set->graph, "cluster-delay", value); value = pe_pref(data_set->config_hash, "stonith-timeout"); crm_xml_add(data_set->graph, "stonith-timeout", value); crm_xml_add(data_set->graph, "failed-stop-offset", "INFINITY"); if (pcmk_is_set(data_set->flags, pe_flag_start_failure_fatal)) { crm_xml_add(data_set->graph, "failed-start-offset", "INFINITY"); } else { crm_xml_add(data_set->graph, "failed-start-offset", "1"); } value = pe_pref(data_set->config_hash, "batch-limit"); crm_xml_add(data_set->graph, "batch-limit", value); crm_xml_add_int(data_set->graph, "transition_id", transition_id); value = pe_pref(data_set->config_hash, "migration-limit"); if ((pcmk__scan_ll(value, &limit, 0LL) == pcmk_rc_ok) && (limit > 0)) { crm_xml_add(data_set->graph, "migration-limit", value); } if (data_set->recheck_by > 0) { char *recheck_epoch = NULL; recheck_epoch = crm_strdup_printf("%llu", (long long) data_set->recheck_by); crm_xml_add(data_set->graph, "recheck-by", recheck_epoch); free(recheck_epoch); } /* errors... slist_iter(action, pe_action_t, action_list, lpc, if(action->optional == FALSE && action->runnable == FALSE) { print_action("Ignoring", action, TRUE); } ); */ /* The following code will de-duplicate action inputs, so nothing past this * should rely on the action input type flags retaining their original * values. */ gIter = data_set->resources; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; pe_rsc_trace(rsc, "processing actions for rsc=%s", rsc->id); rsc->cmds->expand(rsc, data_set); } crm_log_xml_trace(data_set->graph, "created resource-driven action list"); /* pseudo action to distribute list of nodes with maintenance state update */ add_maintenance_update(data_set); /* catch any non-resource specific actions */ crm_trace("processing non-resource actions"); gIter = data_set->actions; for (; gIter != NULL; gIter = gIter->next) { pe_action_t *action = (pe_action_t *) gIter->data; if (action->rsc && action->node && action->node->details->shutdown && !pcmk_is_set(action->rsc->flags, pe_rsc_maintenance) && !pcmk_any_flags_set(action->flags, pe_action_optional|pe_action_runnable) && pcmk__str_eq(action->task, RSC_STOP, pcmk__str_none) ) { /* Eventually we should just ignore the 'fence' case * But for now it's the best way to detect (in CTS) when * CIB resource updates are being lost */ if (pcmk_is_set(data_set->flags, pe_flag_have_quorum) || data_set->no_quorum_policy == no_quorum_ignore) { crm_crit("Cannot %s node '%s' because of %s:%s%s (%s)", action->node->details->unclean ? "fence" : "shut down", action->node->details->uname, action->rsc->id, pcmk_is_set(action->rsc->flags, pe_rsc_managed)? " blocked" : " unmanaged", pcmk_is_set(action->rsc->flags, pe_rsc_failed)? " failed" : "", action->uuid); } } graph_element_from_action(action, data_set); } crm_log_xml_trace(data_set->graph, "created generic action list"); crm_trace("Created transition graph %d.", transition_id); return TRUE; } void LogNodeActions(pe_working_set_t * data_set) { pcmk__output_t *out = data_set->priv; GList *gIter = NULL; for (gIter = data_set->actions; gIter != NULL; gIter = gIter->next) { char *node_name = NULL; char *task = NULL; pe_action_t *action = (pe_action_t *) gIter->data; if (action->rsc != NULL) { continue; } else if (pcmk_is_set(action->flags, pe_action_optional)) { continue; } if (pe__is_guest_node(action->node)) { node_name = crm_strdup_printf("%s (resource: %s)", action->node->details->uname, action->node->details->remote_rsc->container->id); } else if(action->node) { node_name = crm_strdup_printf("%s", action->node->details->uname); } if (pcmk__str_eq(action->task, CRM_OP_SHUTDOWN, pcmk__str_casei)) { task = strdup("Shutdown"); } else if (pcmk__str_eq(action->task, CRM_OP_FENCE, pcmk__str_casei)) { const char *op = g_hash_table_lookup(action->meta, "stonith_action"); task = crm_strdup_printf("Fence (%s)", op); } out->message(out, "node-action", task, node_name, action->reason); free(node_name); free(task); } } diff --git a/lib/pacemaker/pcmk_sched_bundle.c b/lib/pacemaker/pcmk_sched_bundle.c index 64c4094a87..f8b4ea7a44 100644 --- a/lib/pacemaker/pcmk_sched_bundle.c +++ b/lib/pacemaker/pcmk_sched_bundle.c @@ -1,1103 +1,1064 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #define PE__VARIANT_BUNDLE 1 #include static bool is_bundle_node(pe__bundle_variant_data_t *data, pe_node_t *node) { for (GList *gIter = data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; if (node->details == replica->node->details) { return TRUE; } } return FALSE; } gint sort_clone_instance(gconstpointer a, gconstpointer b, gpointer data_set); void distribute_children(pe_resource_t *rsc, GList *children, GList *nodes, int max, int per_host_max, pe_working_set_t * data_set); static GList * get_container_list(pe_resource_t *rsc) { GList *containers = NULL; if (rsc->variant == pe_container) { pe__bundle_variant_data_t *data = NULL; get_bundle_variant_data(data, rsc); for (GList *gIter = data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; containers = g_list_append(containers, replica->container); } } return containers; } static inline GList * get_containers_or_children(pe_resource_t *rsc) { return (rsc->variant == pe_container)? get_container_list(rsc) : rsc->children; } -static bool -migration_threshold_reached(pe_resource_t *rsc, pe_node_t *node, - pe_working_set_t *data_set) -{ - int fail_count, countdown; - - /* Migration threshold of 0 means never force away */ - if (rsc->migration_threshold == 0) { - return FALSE; - } - - // If we're ignoring failures, also ignore the migration threshold - if (pcmk_is_set(rsc->flags, pe_rsc_failure_ignored)) { - return FALSE; - } - - /* If there are no failures, there's no need to force away */ - fail_count = pe_get_failcount(node, rsc, NULL, - pe_fc_effective|pe_fc_fillers, NULL, - data_set); - if (fail_count <= 0) { - return FALSE; - } - - /* How many more times recovery will be tried on this node */ - countdown = QB_MAX(rsc->migration_threshold - fail_count, 0); - - if (countdown == 0) { - crm_warn("Forcing %s away from %s after %d failures (max=%d)", - rsc->id, node->details->uname, fail_count, - rsc->migration_threshold); - return TRUE; - } - - crm_info("%s can fail %d more times on %s before being forced off", - rsc->id, countdown, node->details->uname); - return FALSE; -} - pe_node_t * pcmk__bundle_allocate(pe_resource_t *rsc, pe_node_t *prefer, pe_working_set_t *data_set) { GList *containers = NULL; GList *nodes = NULL; pe__bundle_variant_data_t *bundle_data = NULL; CRM_CHECK(rsc != NULL, return NULL); get_bundle_variant_data(bundle_data, rsc); pe__set_resource_flags(rsc, pe_rsc_allocating); containers = get_container_list(rsc); pe__show_node_weights(!pcmk_is_set(data_set->flags, pe_flag_show_scores), rsc, __func__, rsc->allowed_nodes, data_set); nodes = g_hash_table_get_values(rsc->allowed_nodes); nodes = sort_nodes_by_weight(nodes, NULL, data_set); containers = g_list_sort_with_data(containers, sort_clone_instance, data_set); distribute_children(rsc, containers, nodes, bundle_data->nreplicas, bundle_data->nreplicas_per_host, data_set); g_list_free(nodes); g_list_free(containers); for (GList *gIter = bundle_data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; pe_node_t *container_host = NULL; CRM_ASSERT(replica); if (replica->ip) { pe_rsc_trace(rsc, "Allocating bundle %s IP %s", rsc->id, replica->ip->id); replica->ip->cmds->allocate(replica->ip, prefer, data_set); } container_host = replica->container->allocated_to; if (replica->remote && pe__is_guest_or_remote_node(container_host)) { /* We need 'nested' connection resources to be on the same * host because pacemaker-remoted only supports a single * active connection */ pcmk__new_colocation("child-remote-with-docker-remote", NULL, INFINITY, replica->remote, container_host->details->remote_rsc, NULL, NULL, true, data_set); } if (replica->remote) { pe_rsc_trace(rsc, "Allocating bundle %s connection %s", rsc->id, replica->remote->id); replica->remote->cmds->allocate(replica->remote, prefer, data_set); } // Explicitly allocate replicas' children before bundle child if (replica->child) { pe_node_t *node = NULL; GHashTableIter iter; g_hash_table_iter_init(&iter, replica->child->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & node)) { if (node->details != replica->node->details) { node->weight = -INFINITY; - } else if (!migration_threshold_reached(replica->child, node, - data_set)) { + } else if (pcmk__threshold_reached(replica->child, node, + data_set) == NULL) { node->weight = INFINITY; } } pe__set_resource_flags(replica->child->parent, pe_rsc_allocating); pe_rsc_trace(rsc, "Allocating bundle %s replica child %s", rsc->id, replica->child->id); replica->child->cmds->allocate(replica->child, replica->node, data_set); pe__clear_resource_flags(replica->child->parent, pe_rsc_allocating); } } if (bundle_data->child) { pe_node_t *node = NULL; GHashTableIter iter; g_hash_table_iter_init(&iter, bundle_data->child->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & node)) { if (is_bundle_node(bundle_data, node)) { node->weight = 0; } else { node->weight = -INFINITY; } } pe_rsc_trace(rsc, "Allocating bundle %s child %s", rsc->id, bundle_data->child->id); bundle_data->child->cmds->allocate(bundle_data->child, prefer, data_set); } pe__clear_resource_flags(rsc, pe_rsc_allocating|pe_rsc_provisional); return NULL; } void pcmk__bundle_create_actions(pe_resource_t *rsc, pe_working_set_t *data_set) { pe_action_t *action = NULL; GList *containers = NULL; pe__bundle_variant_data_t *bundle_data = NULL; CRM_CHECK(rsc != NULL, return); containers = get_container_list(rsc); get_bundle_variant_data(bundle_data, rsc); for (GList *gIter = bundle_data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; CRM_ASSERT(replica); if (replica->ip) { replica->ip->cmds->create_actions(replica->ip, data_set); } if (replica->container) { replica->container->cmds->create_actions(replica->container, data_set); } if (replica->remote) { replica->remote->cmds->create_actions(replica->remote, data_set); } } clone_create_pseudo_actions(rsc, containers, NULL, NULL, data_set); if (bundle_data->child) { bundle_data->child->cmds->create_actions(bundle_data->child, data_set); if (pcmk_is_set(bundle_data->child->flags, pe_rsc_promotable)) { /* promote */ create_pseudo_resource_op(rsc, RSC_PROMOTE, TRUE, TRUE, data_set); action = create_pseudo_resource_op(rsc, RSC_PROMOTED, TRUE, TRUE, data_set); action->priority = INFINITY; /* demote */ create_pseudo_resource_op(rsc, RSC_DEMOTE, TRUE, TRUE, data_set); action = create_pseudo_resource_op(rsc, RSC_DEMOTED, TRUE, TRUE, data_set); action->priority = INFINITY; } } g_list_free(containers); } void pcmk__bundle_internal_constraints(pe_resource_t *rsc, pe_working_set_t *data_set) { pe__bundle_variant_data_t *bundle_data = NULL; CRM_CHECK(rsc != NULL, return); get_bundle_variant_data(bundle_data, rsc); if (bundle_data->child) { new_rsc_order(rsc, RSC_START, bundle_data->child, RSC_START, pe_order_implies_first_printed, data_set); new_rsc_order(rsc, RSC_STOP, bundle_data->child, RSC_STOP, pe_order_implies_first_printed, data_set); if (bundle_data->child->children) { new_rsc_order(bundle_data->child, RSC_STARTED, rsc, RSC_STARTED, pe_order_implies_then_printed, data_set); new_rsc_order(bundle_data->child, RSC_STOPPED, rsc, RSC_STOPPED, pe_order_implies_then_printed, data_set); } else { new_rsc_order(bundle_data->child, RSC_START, rsc, RSC_STARTED, pe_order_implies_then_printed, data_set); new_rsc_order(bundle_data->child, RSC_STOP, rsc, RSC_STOPPED, pe_order_implies_then_printed, data_set); } } for (GList *gIter = bundle_data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; CRM_ASSERT(replica); CRM_ASSERT(replica->container); replica->container->cmds->internal_constraints(replica->container, data_set); order_start_start(rsc, replica->container, pe_order_runnable_left|pe_order_implies_first_printed); if (replica->child) { order_stop_stop(rsc, replica->child, pe_order_implies_first_printed); } order_stop_stop(rsc, replica->container, pe_order_implies_first_printed); new_rsc_order(replica->container, RSC_START, rsc, RSC_STARTED, pe_order_implies_then_printed, data_set); new_rsc_order(replica->container, RSC_STOP, rsc, RSC_STOPPED, pe_order_implies_then_printed, data_set); if (replica->ip) { replica->ip->cmds->internal_constraints(replica->ip, data_set); // Start ip then container new_rsc_order(replica->ip, RSC_START, replica->container, RSC_START, pe_order_runnable_left|pe_order_preserve, data_set); new_rsc_order(replica->container, RSC_STOP, replica->ip, RSC_STOP, pe_order_implies_first|pe_order_preserve, data_set); pcmk__new_colocation("ip-with-docker", NULL, INFINITY, replica->ip, replica->container, NULL, NULL, true, data_set); } if (replica->remote) { /* This handles ordering and colocating remote relative to container * (via "resource-with-container"). Since IP is also ordered and * colocated relative to the container, we don't need to do anything * explicit here with IP. */ replica->remote->cmds->internal_constraints(replica->remote, data_set); } if (replica->child) { CRM_ASSERT(replica->remote); // "Start remote then child" is implicit in scheduler's remote logic } } if (bundle_data->child) { bundle_data->child->cmds->internal_constraints(bundle_data->child, data_set); if (pcmk_is_set(bundle_data->child->flags, pe_rsc_promotable)) { promote_demote_constraints(rsc, data_set); /* child demoted before global demoted */ new_rsc_order(bundle_data->child, RSC_DEMOTED, rsc, RSC_DEMOTED, pe_order_implies_then_printed, data_set); /* global demote before child demote */ new_rsc_order(rsc, RSC_DEMOTE, bundle_data->child, RSC_DEMOTE, pe_order_implies_first_printed, data_set); /* child promoted before global promoted */ new_rsc_order(bundle_data->child, RSC_PROMOTED, rsc, RSC_PROMOTED, pe_order_implies_then_printed, data_set); /* global promote before child promote */ new_rsc_order(rsc, RSC_PROMOTE, bundle_data->child, RSC_PROMOTE, pe_order_implies_first_printed, data_set); } } else { // int type = pe_order_optional | pe_order_implies_then | pe_order_restart; // custom_action_order(rsc, pcmk__op_key(rsc->id, RSC_STOP, 0), NULL, // rsc, pcmk__op_key(rsc->id, RSC_START, 0), NULL, pe_order_optional, data_set); } } static pe_resource_t * compatible_replica_for_node(pe_resource_t *rsc_lh, pe_node_t *candidate, pe_resource_t *rsc, enum rsc_role_e filter, gboolean current) { pe__bundle_variant_data_t *bundle_data = NULL; CRM_CHECK(candidate != NULL, return NULL); get_bundle_variant_data(bundle_data, rsc); crm_trace("Looking for compatible child from %s for %s on %s", rsc_lh->id, rsc->id, candidate->details->uname); for (GList *gIter = bundle_data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; if (is_child_compatible(replica->container, candidate, filter, current)) { crm_trace("Pairing %s with %s on %s", rsc_lh->id, replica->container->id, candidate->details->uname); return replica->container; } } crm_trace("Can't pair %s with %s", rsc_lh->id, rsc->id); return NULL; } static pe_resource_t * compatible_replica(pe_resource_t *rsc_lh, pe_resource_t *rsc, enum rsc_role_e filter, gboolean current, pe_working_set_t *data_set) { GList *scratch = NULL; pe_resource_t *pair = NULL; pe_node_t *active_node_lh = NULL; active_node_lh = rsc_lh->fns->location(rsc_lh, NULL, current); if (active_node_lh) { return compatible_replica_for_node(rsc_lh, active_node_lh, rsc, filter, current); } scratch = g_hash_table_get_values(rsc_lh->allowed_nodes); scratch = sort_nodes_by_weight(scratch, NULL, data_set); for (GList *gIter = scratch; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; pair = compatible_replica_for_node(rsc_lh, node, rsc, filter, current); if (pair) { goto done; } } pe_rsc_debug(rsc, "Can't pair %s with %s", rsc_lh->id, (rsc? rsc->id : "none")); done: g_list_free(scratch); return pair; } void pcmk__bundle_rsc_colocation_lh(pe_resource_t *rsc, pe_resource_t *rsc_rh, pcmk__colocation_t *constraint, pe_working_set_t *data_set) { /* -- Never called -- * * Instead we add the colocation constraints to the child and call from there */ CRM_ASSERT(FALSE); } int copies_per_node(pe_resource_t * rsc) { /* Strictly speaking, there should be a 'copies_per_node' addition * to the resource function table and each case would be a * function. However that would be serious overkill to return an * int. In fact, it seems to me that both function tables * could/should be replaced by resources.{c,h} full of * rsc_{some_operation} functions containing a switch as below * which calls out to functions named {variant}_{some_operation} * as needed. */ switch(rsc->variant) { case pe_unknown: return 0; case pe_native: case pe_group: return 1; case pe_clone: { const char *max_clones_node = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_INCARNATION_NODEMAX); if (max_clones_node == NULL) { return 1; } else { int max_i; pcmk__scan_min_int(max_clones_node, &max_i, 0); return max_i; } } case pe_container: { pe__bundle_variant_data_t *data = NULL; get_bundle_variant_data(data, rsc); return data->nreplicas_per_host; } } return 0; } void pcmk__bundle_rsc_colocation_rh(pe_resource_t *rsc_lh, pe_resource_t *rsc, pcmk__colocation_t *constraint, pe_working_set_t *data_set) { GList *allocated_rhs = NULL; pe__bundle_variant_data_t *bundle_data = NULL; CRM_CHECK(constraint != NULL, return); CRM_CHECK(rsc_lh != NULL, pe_err("rsc_lh was NULL for %s", constraint->id); return); CRM_CHECK(rsc != NULL, pe_err("rsc was NULL for %s", constraint->id); return); CRM_ASSERT(rsc_lh->variant == pe_native); if (pcmk_is_set(rsc->flags, pe_rsc_provisional)) { pe_rsc_trace(rsc, "%s is still provisional", rsc->id); return; } else if(constraint->rsc_lh->variant > pe_group) { pe_resource_t *rh_child = compatible_replica(rsc_lh, rsc, RSC_ROLE_UNKNOWN, FALSE, data_set); if (rh_child) { pe_rsc_debug(rsc, "Pairing %s with %s", rsc_lh->id, rh_child->id); rsc_lh->cmds->rsc_colocation_lh(rsc_lh, rh_child, constraint, data_set); } else if (constraint->score >= INFINITY) { crm_notice("Cannot pair %s with instance of %s", rsc_lh->id, rsc->id); assign_node(rsc_lh, NULL, TRUE); } else { pe_rsc_debug(rsc, "Cannot pair %s with instance of %s", rsc_lh->id, rsc->id); } return; } get_bundle_variant_data(bundle_data, rsc); pe_rsc_trace(rsc, "Processing constraint %s: %s -> %s %d", constraint->id, rsc_lh->id, rsc->id, constraint->score); for (GList *gIter = bundle_data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; if (constraint->score < INFINITY) { replica->container->cmds->rsc_colocation_rh(rsc_lh, replica->container, constraint, data_set); } else { pe_node_t *chosen = replica->container->fns->location(replica->container, NULL, FALSE); if ((chosen == NULL) || is_set_recursive(replica->container, pe_rsc_block, TRUE)) { continue; } if ((constraint->role_rh >= RSC_ROLE_PROMOTED) && (replica->child == NULL)) { continue; } if ((constraint->role_rh >= RSC_ROLE_PROMOTED) && (replica->child->next_role < RSC_ROLE_PROMOTED)) { continue; } pe_rsc_trace(rsc, "Allowing %s: %s %d", constraint->id, chosen->details->uname, chosen->weight); allocated_rhs = g_list_prepend(allocated_rhs, chosen); } } if (constraint->score >= INFINITY) { node_list_exclude(rsc_lh->allowed_nodes, allocated_rhs, FALSE); } g_list_free(allocated_rhs); } enum pe_action_flags pcmk__bundle_action_flags(pe_action_t *action, pe_node_t *node) { GList *containers = NULL; enum pe_action_flags flags = 0; pe__bundle_variant_data_t *data = NULL; get_bundle_variant_data(data, action->rsc); if(data->child) { enum action_tasks task = get_complex_task(data->child, action->task, TRUE); switch(task) { case no_action: case action_notify: case action_notified: case action_promote: case action_promoted: case action_demote: case action_demoted: return summary_action_flags(action, data->child->children, node); default: break; } } containers = get_container_list(action->rsc); flags = summary_action_flags(action, containers, node); g_list_free(containers); return flags; } pe_resource_t * find_compatible_child_by_node(pe_resource_t * local_child, pe_node_t * local_node, pe_resource_t * rsc, enum rsc_role_e filter, gboolean current) { GList *gIter = NULL; GList *children = NULL; if (local_node == NULL) { crm_err("Can't colocate unrunnable child %s with %s", local_child->id, rsc->id); return NULL; } crm_trace("Looking for compatible child from %s for %s on %s", local_child->id, rsc->id, local_node->details->uname); children = get_containers_or_children(rsc); for (gIter = children; gIter != NULL; gIter = gIter->next) { pe_resource_t *child_rsc = (pe_resource_t *) gIter->data; if(is_child_compatible(child_rsc, local_node, filter, current)) { crm_trace("Pairing %s with %s on %s", local_child->id, child_rsc->id, local_node->details->uname); return child_rsc; } } crm_trace("Can't pair %s with %s", local_child->id, rsc->id); if(children != rsc->children) { g_list_free(children); } return NULL; } static pe__bundle_replica_t * replica_for_container(pe_resource_t *rsc, pe_resource_t *container, pe_node_t *node) { if (rsc->variant == pe_container) { pe__bundle_variant_data_t *data = NULL; get_bundle_variant_data(data, rsc); for (GList *gIter = data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; if (replica->child && (container == replica->container) && (node->details == replica->node->details)) { return replica; } } } return NULL; } static enum pe_graph_flags multi_update_interleave_actions(pe_action_t *first, pe_action_t *then, pe_node_t *node, enum pe_action_flags flags, enum pe_action_flags filter, enum pe_ordering type, pe_working_set_t *data_set) { GList *gIter = NULL; GList *children = NULL; gboolean current = FALSE; enum pe_graph_flags changed = pe_graph_none; /* Fix this - lazy */ if (pcmk__ends_with(first->uuid, "_stopped_0") || pcmk__ends_with(first->uuid, "_demoted_0")) { current = TRUE; } children = get_containers_or_children(then->rsc); for (gIter = children; gIter != NULL; gIter = gIter->next) { pe_resource_t *then_child = gIter->data; pe_resource_t *first_child = find_compatible_child(then_child, first->rsc, RSC_ROLE_UNKNOWN, current, data_set); if (first_child == NULL && current) { crm_trace("Ignore"); } else if (first_child == NULL) { crm_debug("No match found for %s (%d / %s / %s)", then_child->id, current, first->uuid, then->uuid); /* Me no like this hack - but what else can we do? * * If there is no-one active or about to be active * on the same node as then_child, then they must * not be allowed to start */ if (type & (pe_order_runnable_left | pe_order_implies_then) /* Mandatory */ ) { pe_rsc_info(then->rsc, "Inhibiting %s from being active", then_child->id); if(assign_node(then_child, NULL, TRUE)) { pe__set_graph_flags(changed, first, pe_graph_updated_then); } } } else { pe_action_t *first_action = NULL; pe_action_t *then_action = NULL; enum action_tasks task = clone_child_action(first); const char *first_task = task2text(task); pe__bundle_replica_t *first_replica = NULL; pe__bundle_replica_t *then_replica = NULL; first_replica = replica_for_container(first->rsc, first_child, node); if (strstr(first->task, "stop") && first_replica && first_replica->child) { /* Except for 'stopped' we should be looking at the * in-container resource, actions for the child will * happen later and are therefor more likely to align * with the user's intent. */ first_action = find_first_action(first_replica->child->actions, NULL, task2text(task), node); } else { first_action = find_first_action(first_child->actions, NULL, task2text(task), node); } then_replica = replica_for_container(then->rsc, then_child, node); if (strstr(then->task, "mote") && then_replica && then_replica->child) { /* Promote/demote actions will never be found for the * container resource, look in the child instead * * Alternatively treat: * 'XXXX then promote YYYY' as 'XXXX then start container for YYYY', and * 'demote XXXX then stop YYYY' as 'stop container for XXXX then stop YYYY' */ then_action = find_first_action(then_replica->child->actions, NULL, then->task, node); } else { then_action = find_first_action(then_child->actions, NULL, then->task, node); } if (first_action == NULL) { if (!pcmk_is_set(first_child->flags, pe_rsc_orphan) && !pcmk__str_any_of(first_task, RSC_STOP, RSC_DEMOTE, NULL)) { crm_err("Internal error: No action found for %s in %s (first)", first_task, first_child->id); } else { crm_trace("No action found for %s in %s%s (first)", first_task, first_child->id, pcmk_is_set(first_child->flags, pe_rsc_orphan)? " (ORPHAN)" : ""); } continue; } /* We're only interested if 'then' is neither stopping nor being demoted */ if (then_action == NULL) { if (!pcmk_is_set(then_child->flags, pe_rsc_orphan) && !pcmk__str_any_of(then->task, RSC_STOP, RSC_DEMOTE, NULL)) { crm_err("Internal error: No action found for %s in %s (then)", then->task, then_child->id); } else { crm_trace("No action found for %s in %s%s (then)", then->task, then_child->id, pcmk_is_set(then_child->flags, pe_rsc_orphan)? " (ORPHAN)" : ""); } continue; } if (order_actions(first_action, then_action, type)) { crm_debug("Created constraint for %s (%d) -> %s (%d) %.6x", first_action->uuid, pcmk_is_set(first_action->flags, pe_action_optional), then_action->uuid, pcmk_is_set(then_action->flags, pe_action_optional), type); pe__set_graph_flags(changed, first, pe_graph_updated_first|pe_graph_updated_then); } if(first_action && then_action) { changed |= then_child->cmds->update_actions(first_action, then_action, node, first_child->cmds->action_flags(first_action, node), filter, type, data_set); } else { crm_err("Nothing found either for %s (%p) or %s (%p) %s", first_child->id, first_action, then_child->id, then_action, task2text(task)); } } } if(children != then->rsc->children) { g_list_free(children); } return changed; } static bool can_interleave_actions(pe_action_t *first, pe_action_t *then) { bool interleave = FALSE; pe_resource_t *rsc = NULL; const char *interleave_s = NULL; if(first->rsc == NULL || then->rsc == NULL) { crm_trace("Not interleaving %s with %s (both must be resources)", first->uuid, then->uuid); return FALSE; } else if(first->rsc == then->rsc) { crm_trace("Not interleaving %s with %s (must belong to different resources)", first->uuid, then->uuid); return FALSE; } else if(first->rsc->variant < pe_clone || then->rsc->variant < pe_clone) { crm_trace("Not interleaving %s with %s (both sides must be clones or bundles)", first->uuid, then->uuid); return FALSE; } if (pcmk__ends_with(then->uuid, "_stop_0") || pcmk__ends_with(then->uuid, "_demote_0")) { rsc = first->rsc; } else { rsc = then->rsc; } interleave_s = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_INTERLEAVE); interleave = crm_is_true(interleave_s); crm_trace("Interleave %s -> %s: %s (based on %s)", first->uuid, then->uuid, interleave ? "yes" : "no", rsc->id); return interleave; } enum pe_graph_flags pcmk__multi_update_actions(pe_action_t *first, pe_action_t *then, pe_node_t *node, enum pe_action_flags flags, enum pe_action_flags filter, enum pe_ordering type, pe_working_set_t *data_set) { enum pe_graph_flags changed = pe_graph_none; crm_trace("%s -> %s", first->uuid, then->uuid); if(can_interleave_actions(first, then)) { changed = multi_update_interleave_actions(first, then, node, flags, filter, type, data_set); } else if(then->rsc) { GList *gIter = NULL; GList *children = NULL; // Handle the 'primitive' ordering case changed |= native_update_actions(first, then, node, flags, filter, type, data_set); // Now any children (or containers in the case of a bundle) children = get_containers_or_children(then->rsc); for (gIter = children; gIter != NULL; gIter = gIter->next) { pe_resource_t *then_child = (pe_resource_t *) gIter->data; enum pe_graph_flags then_child_changed = pe_graph_none; pe_action_t *then_child_action = find_first_action(then_child->actions, NULL, then->task, node); if (then_child_action) { enum pe_action_flags then_child_flags = then_child->cmds->action_flags(then_child_action, node); if (pcmk_is_set(then_child_flags, pe_action_runnable)) { then_child_changed |= then_child->cmds->update_actions(first, then_child_action, node, flags, filter, type, data_set); } changed |= then_child_changed; if (then_child_changed & pe_graph_updated_then) { for (GList *lpc = then_child_action->actions_after; lpc != NULL; lpc = lpc->next) { pe_action_wrapper_t *next = (pe_action_wrapper_t *) lpc->data; update_action(next->action, data_set); } } } } if(children != then->rsc->children) { g_list_free(children); } } return changed; } void pcmk__bundle_rsc_location(pe_resource_t *rsc, pe__location_t *constraint) { pe__bundle_variant_data_t *bundle_data = NULL; get_bundle_variant_data(bundle_data, rsc); native_rsc_location(rsc, constraint); for (GList *gIter = bundle_data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; if (replica->container) { replica->container->cmds->rsc_location(replica->container, constraint); } if (replica->ip) { replica->ip->cmds->rsc_location(replica->ip, constraint); } } if (bundle_data->child && ((constraint->role_filter == RSC_ROLE_UNPROMOTED) || (constraint->role_filter == RSC_ROLE_PROMOTED))) { bundle_data->child->cmds->rsc_location(bundle_data->child, constraint); bundle_data->child->rsc_location = g_list_prepend(bundle_data->child->rsc_location, constraint); } } void pcmk__bundle_expand(pe_resource_t *rsc, pe_working_set_t * data_set) { pe__bundle_variant_data_t *bundle_data = NULL; CRM_CHECK(rsc != NULL, return); get_bundle_variant_data(bundle_data, rsc); if (bundle_data->child) { bundle_data->child->cmds->expand(bundle_data->child, data_set); } for (GList *gIter = bundle_data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; CRM_ASSERT(replica); if (replica->remote && replica->container && pe__bundle_needs_remote_name(replica->remote, data_set)) { /* REMOTE_CONTAINER_HACK: Allow remote nodes to run containers that * run pacemaker-remoted inside, without needing a separate IP for * the container. This is done by configuring the inner remote's * connection host as the magic string "#uname", then * replacing it with the underlying host when needed. */ xmlNode *nvpair = get_xpath_object("//nvpair[@name='" XML_RSC_ATTR_REMOTE_RA_ADDR "']", replica->remote->xml, LOG_ERR); const char *calculated_addr = NULL; // Replace the value in replica->remote->xml (if appropriate) calculated_addr = pe__add_bundle_remote_name(replica->remote, data_set, nvpair, "value"); if (calculated_addr) { /* Since this is for the bundle as a resource, and not any * particular action, replace the value in the default * parameters (not evaluated for node). action2xml() will grab * it from there to replace it in node-evaluated parameters. */ GHashTable *params = pe_rsc_params(replica->remote, NULL, data_set); crm_trace("Set address for bundle connection %s to bundle host %s", replica->remote->id, calculated_addr); g_hash_table_replace(params, strdup(XML_RSC_ATTR_REMOTE_RA_ADDR), strdup(calculated_addr)); } else { /* The only way to get here is if the remote connection is * neither currently running nor scheduled to run. That means we * won't be doing any operations that require addr (only start * requires it; we additionally use it to compare digests when * unpacking status, promote, and migrate_from history, but * that's already happened by this point). */ crm_info("Unable to determine address for bundle %s remote connection", rsc->id); } } if (replica->ip) { replica->ip->cmds->expand(replica->ip, data_set); } if (replica->container) { replica->container->cmds->expand(replica->container, data_set); } if (replica->remote) { replica->remote->cmds->expand(replica->remote, data_set); } } } gboolean pcmk__bundle_create_probe(pe_resource_t *rsc, pe_node_t *node, pe_action_t *complete, gboolean force, pe_working_set_t * data_set) { bool any_created = FALSE; pe__bundle_variant_data_t *bundle_data = NULL; CRM_CHECK(rsc != NULL, return FALSE); get_bundle_variant_data(bundle_data, rsc); for (GList *gIter = bundle_data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; CRM_ASSERT(replica); if (replica->ip) { any_created |= replica->ip->cmds->create_probe(replica->ip, node, complete, force, data_set); } if (replica->child && (node->details == replica->node->details)) { any_created |= replica->child->cmds->create_probe(replica->child, node, complete, force, data_set); } if (replica->container) { bool created = replica->container->cmds->create_probe(replica->container, node, complete, force, data_set); if(created) { any_created = TRUE; /* If we're limited to one replica per host (due to * the lack of an IP range probably), then we don't * want any of our peer containers starting until * we've established that no other copies are already * running. * * Partly this is to ensure that nreplicas_per_host is * observed, but also to ensure that the containers * don't fail to start because the necessary port * mappings (which won't include an IP for uniqueness) * are already taken */ for (GList *tIter = bundle_data->replicas; tIter && (bundle_data->nreplicas_per_host == 1); tIter = tIter->next) { pe__bundle_replica_t *other = tIter->data; if ((other != replica) && (other != NULL) && (other->container != NULL)) { custom_action_order(replica->container, pcmk__op_key(replica->container->id, RSC_STATUS, 0), NULL, other->container, pcmk__op_key(other->container->id, RSC_START, 0), NULL, pe_order_optional|pe_order_same_node, data_set); } } } } if (replica->container && replica->remote && replica->remote->cmds->create_probe(replica->remote, node, complete, force, data_set)) { /* Do not probe the remote resource until we know where the * container is running. This is required for REMOTE_CONTAINER_HACK * to correctly probe remote resources. */ char *probe_uuid = pcmk__op_key(replica->remote->id, RSC_STATUS, 0); pe_action_t *probe = find_first_action(replica->remote->actions, probe_uuid, NULL, node); free(probe_uuid); if (probe) { any_created = TRUE; crm_trace("Ordering %s probe on %s", replica->remote->id, node->details->uname); custom_action_order(replica->container, pcmk__op_key(replica->container->id, RSC_START, 0), NULL, replica->remote, NULL, probe, pe_order_probe, data_set); } } } return any_created; } void pcmk__bundle_append_meta(pe_resource_t *rsc, xmlNode *xml) { } void pcmk__bundle_log_actions(pe_resource_t *rsc, pe_working_set_t *data_set) { pe__bundle_variant_data_t *bundle_data = NULL; CRM_CHECK(rsc != NULL, return); get_bundle_variant_data(bundle_data, rsc); for (GList *gIter = bundle_data->replicas; gIter != NULL; gIter = gIter->next) { pe__bundle_replica_t *replica = gIter->data; CRM_ASSERT(replica); if (replica->ip) { LogActions(replica->ip, data_set); } if (replica->container) { LogActions(replica->container, data_set); } if (replica->remote) { LogActions(replica->remote, data_set); } if (replica->child) { LogActions(replica->child, data_set); } } } diff --git a/lib/pacemaker/pcmk_sched_utils.c b/lib/pacemaker/pcmk_sched_utils.c index 85249f8608..b5ec9ede86 100644 --- a/lib/pacemaker/pcmk_sched_utils.c +++ b/lib/pacemaker/pcmk_sched_utils.c @@ -1,789 +1,845 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include // lrmd_event_data_t #include #include pe__location_t * rsc2node_new(const char *id, pe_resource_t *rsc, int node_weight, const char *discover_mode, pe_node_t *foo_node, pe_working_set_t *data_set) { pe__location_t *new_con = NULL; if (rsc == NULL || id == NULL) { pe_err("Invalid constraint %s for rsc=%p", crm_str(id), rsc); return NULL; } else if (foo_node == NULL) { CRM_CHECK(node_weight == 0, return NULL); } new_con = calloc(1, sizeof(pe__location_t)); if (new_con != NULL) { new_con->id = strdup(id); new_con->rsc_lh = rsc; new_con->node_list_rh = NULL; new_con->role_filter = RSC_ROLE_UNKNOWN; if (pcmk__str_eq(discover_mode, "always", pcmk__str_null_matches | pcmk__str_casei)) { new_con->discover_mode = pe_discover_always; } else if (pcmk__str_eq(discover_mode, "never", pcmk__str_casei)) { new_con->discover_mode = pe_discover_never; } else if (pcmk__str_eq(discover_mode, "exclusive", pcmk__str_casei)) { new_con->discover_mode = pe_discover_exclusive; rsc->exclusive_discover = TRUE; } else { pe_err("Invalid %s value %s in location constraint", XML_LOCATION_ATTR_DISCOVERY, discover_mode); } if (foo_node != NULL) { pe_node_t *copy = pe__copy_node(foo_node); copy->weight = node_weight; new_con->node_list_rh = g_list_prepend(NULL, copy); } data_set->placement_constraints = g_list_prepend(data_set->placement_constraints, new_con); rsc->rsc_location = g_list_prepend(rsc->rsc_location, new_con); } return new_con; } gboolean can_run_resources(const pe_node_t * node) { if (node == NULL) { return FALSE; } #if 0 if (node->weight < 0) { return FALSE; } #endif if (node->details->online == FALSE || node->details->shutdown || node->details->unclean || node->details->standby || node->details->maintenance) { crm_trace("%s: online=%d, unclean=%d, standby=%d, maintenance=%d", node->details->uname, node->details->online, node->details->unclean, node->details->standby, node->details->maintenance); return FALSE; } return TRUE; } /*! * \internal * \brief Copy a hash table of node objects * * \param[in] nodes Hash table to copy * * \return New copy of nodes (or NULL if nodes is NULL) */ GHashTable * pcmk__copy_node_table(GHashTable *nodes) { GHashTable *new_table = NULL; GHashTableIter iter; pe_node_t *node = NULL; if (nodes == NULL) { return NULL; } new_table = pcmk__strkey_table(NULL, free); g_hash_table_iter_init(&iter, nodes); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { pe_node_t *new_node = pe__copy_node(node); g_hash_table_insert(new_table, (gpointer) new_node->details->id, new_node); } return new_table; } /*! * \internal * \brief Copy a list of node objects * * \param[in] list List to copy * \param[in] reset Set copies' scores to 0 * * \return New list of shallow copies of nodes in original list */ GList * pcmk__copy_node_list(const GList *list, bool reset) { GList *result = NULL; for (const GList *gIter = list; gIter != NULL; gIter = gIter->next) { pe_node_t *new_node = NULL; pe_node_t *this_node = (pe_node_t *) gIter->data; new_node = pe__copy_node(this_node); if (reset) { new_node->weight = 0; } result = g_list_prepend(result, new_node); } return result; } struct node_weight_s { pe_node_t *active; pe_working_set_t *data_set; }; /* return -1 if 'a' is more preferred * return 1 if 'b' is more preferred */ static gint sort_node_weight(gconstpointer a, gconstpointer b, gpointer data) { const pe_node_t *node1 = (const pe_node_t *)a; const pe_node_t *node2 = (const pe_node_t *)b; struct node_weight_s *nw = data; int node1_weight = 0; int node2_weight = 0; int result = 0; if (a == NULL) { return 1; } if (b == NULL) { return -1; } node1_weight = node1->weight; node2_weight = node2->weight; if (can_run_resources(node1) == FALSE) { node1_weight = -INFINITY; } if (can_run_resources(node2) == FALSE) { node2_weight = -INFINITY; } if (node1_weight > node2_weight) { crm_trace("%s (%d) > %s (%d) : weight", node1->details->uname, node1_weight, node2->details->uname, node2_weight); return -1; } if (node1_weight < node2_weight) { crm_trace("%s (%d) < %s (%d) : weight", node1->details->uname, node1_weight, node2->details->uname, node2_weight); return 1; } crm_trace("%s (%d) == %s (%d) : weight", node1->details->uname, node1_weight, node2->details->uname, node2_weight); if (pcmk__str_eq(nw->data_set->placement_strategy, "minimal", pcmk__str_casei)) { goto equal; } if (pcmk__str_eq(nw->data_set->placement_strategy, "balanced", pcmk__str_casei)) { result = compare_capacity(node1, node2); if (result < 0) { crm_trace("%s > %s : capacity (%d)", node1->details->uname, node2->details->uname, result); return -1; } else if (result > 0) { crm_trace("%s < %s : capacity (%d)", node1->details->uname, node2->details->uname, result); return 1; } } /* now try to balance resources across the cluster */ if (node1->details->num_resources < node2->details->num_resources) { crm_trace("%s (%d) > %s (%d) : resources", node1->details->uname, node1->details->num_resources, node2->details->uname, node2->details->num_resources); return -1; } else if (node1->details->num_resources > node2->details->num_resources) { crm_trace("%s (%d) < %s (%d) : resources", node1->details->uname, node1->details->num_resources, node2->details->uname, node2->details->num_resources); return 1; } if (nw->active && nw->active->details == node1->details) { crm_trace("%s (%d) > %s (%d) : active", node1->details->uname, node1->details->num_resources, node2->details->uname, node2->details->num_resources); return -1; } else if (nw->active && nw->active->details == node2->details) { crm_trace("%s (%d) < %s (%d) : active", node1->details->uname, node1->details->num_resources, node2->details->uname, node2->details->num_resources); return 1; } equal: crm_trace("%s = %s", node1->details->uname, node2->details->uname); return strcmp(node1->details->uname, node2->details->uname); } GList * sort_nodes_by_weight(GList *nodes, pe_node_t *active_node, pe_working_set_t *data_set) { struct node_weight_s nw = { active_node, data_set }; return g_list_sort_with_data(nodes, sort_node_weight, &nw); } void native_deallocate(pe_resource_t * rsc) { if (rsc->allocated_to) { pe_node_t *old = rsc->allocated_to; crm_info("Deallocating %s from %s", rsc->id, old->details->uname); pe__set_resource_flags(rsc, pe_rsc_provisional); rsc->allocated_to = NULL; old->details->allocated_rsc = g_list_remove(old->details->allocated_rsc, rsc); old->details->num_resources--; /* old->count--; */ calculate_utilization(old->details->utilization, rsc->utilization, TRUE); free(old); } } gboolean native_assign_node(pe_resource_t *rsc, pe_node_t *chosen, gboolean force) { pcmk__output_t *out = rsc->cluster->priv; CRM_ASSERT(rsc->variant == pe_native); if (force == FALSE && chosen != NULL) { bool unset = FALSE; if(chosen->weight < 0) { unset = TRUE; // Allow the graph to assume that the remote resource will come up } else if (!can_run_resources(chosen) && !pe__is_guest_node(chosen)) { unset = TRUE; } if(unset) { crm_debug("All nodes for resource %s are unavailable" ", unclean or shutting down (%s: %d, %d)", rsc->id, chosen->details->uname, can_run_resources(chosen), chosen->weight); pe__set_next_role(rsc, RSC_ROLE_STOPPED, "node availability"); chosen = NULL; } } /* todo: update the old node for each resource to reflect its * new resource count */ native_deallocate(rsc); pe__clear_resource_flags(rsc, pe_rsc_provisional); if (chosen == NULL) { GList *gIter = NULL; char *rc_inactive = pcmk__itoa(PCMK_OCF_NOT_RUNNING); crm_debug("Could not allocate a node for %s", rsc->id); pe__set_next_role(rsc, RSC_ROLE_STOPPED, "unable to allocate"); for (gIter = rsc->actions; gIter != NULL; gIter = gIter->next) { pe_action_t *op = (pe_action_t *) gIter->data; const char *interval_ms_s = g_hash_table_lookup(op->meta, XML_LRM_ATTR_INTERVAL_MS); crm_debug("Processing %s", op->uuid); if(pcmk__str_eq(RSC_STOP, op->task, pcmk__str_casei)) { pe__clear_action_flags(op, pe_action_optional); } else if(pcmk__str_eq(RSC_START, op->task, pcmk__str_casei)) { pe__clear_action_flags(op, pe_action_runnable); //pe__set_resource_flags(rsc, pe_rsc_block); } else if (interval_ms_s && !pcmk__str_eq(interval_ms_s, "0", pcmk__str_casei)) { if(pcmk__str_eq(rc_inactive, g_hash_table_lookup(op->meta, XML_ATTR_TE_TARGET_RC), pcmk__str_casei)) { /* This is a recurring monitor for the stopped state, leave it alone */ } else { /* Normal monitor operation, cancel it */ pe__clear_action_flags(op, pe_action_runnable); } } } free(rc_inactive); return FALSE; } crm_debug("Assigning %s to %s", chosen->details->uname, rsc->id); rsc->allocated_to = pe__copy_node(chosen); chosen->details->allocated_rsc = g_list_prepend(chosen->details->allocated_rsc, rsc); chosen->details->num_resources++; chosen->count++; calculate_utilization(chosen->details->utilization, rsc->utilization, FALSE); if (pcmk_is_set(rsc->cluster->flags, pe_flag_show_utilization)) { out->message(out, "resource-util", rsc, chosen, __func__); } return TRUE; } void log_action(unsigned int log_level, const char *pre_text, pe_action_t * action, gboolean details) { const char *node_uname = NULL; const char *node_uuid = NULL; const char *desc = NULL; if (action == NULL) { crm_trace("%s%s: ", pre_text == NULL ? "" : pre_text, pre_text == NULL ? "" : ": "); return; } if (pcmk_is_set(action->flags, pe_action_pseudo)) { node_uname = NULL; node_uuid = NULL; } else if (action->node != NULL) { node_uname = action->node->details->uname; node_uuid = action->node->details->id; } else { node_uname = ""; node_uuid = NULL; } switch (text2task(action->task)) { case stonith_node: case shutdown_crm: if (pcmk_is_set(action->flags, pe_action_pseudo)) { desc = "Pseudo "; } else if (pcmk_is_set(action->flags, pe_action_optional)) { desc = "Optional "; } else if (!pcmk_is_set(action->flags, pe_action_runnable)) { desc = "!!Non-Startable!! "; } else if (pcmk_is_set(action->flags, pe_action_processed)) { desc = ""; } else { desc = "(Provisional) "; } crm_trace("%s%s%sAction %d: %s%s%s%s%s%s", ((pre_text == NULL)? "" : pre_text), ((pre_text == NULL)? "" : ": "), desc, action->id, action->uuid, (node_uname? "\ton " : ""), (node_uname? node_uname : ""), (node_uuid? "\t\t(" : ""), (node_uuid? node_uuid : ""), (node_uuid? ")" : "")); break; default: if (pcmk_is_set(action->flags, pe_action_optional)) { desc = "Optional "; } else if (pcmk_is_set(action->flags, pe_action_pseudo)) { desc = "Pseudo "; } else if (!pcmk_is_set(action->flags, pe_action_runnable)) { desc = "!!Non-Startable!! "; } else if (pcmk_is_set(action->flags, pe_action_processed)) { desc = ""; } else { desc = "(Provisional) "; } crm_trace("%s%s%sAction %d: %s %s%s%s%s%s%s", ((pre_text == NULL)? "" : pre_text), ((pre_text == NULL)? "" : ": "), desc, action->id, action->uuid, (action->rsc? action->rsc->id : ""), (node_uname? "\ton " : ""), (node_uname? node_uname : ""), (node_uuid? "\t\t(" : ""), (node_uuid? node_uuid : ""), (node_uuid? ")" : "")); break; } if (details) { GList *gIter = NULL; crm_trace("\t\t====== Preceding Actions"); gIter = action->actions_before; for (; gIter != NULL; gIter = gIter->next) { pe_action_wrapper_t *other = (pe_action_wrapper_t *) gIter->data; log_action(log_level + 1, "\t\t", other->action, FALSE); } crm_trace("\t\t====== Subsequent Actions"); gIter = action->actions_after; for (; gIter != NULL; gIter = gIter->next) { pe_action_wrapper_t *other = (pe_action_wrapper_t *) gIter->data; log_action(log_level + 1, "\t\t", other->action, FALSE); } crm_trace("\t\t====== End"); } else { crm_trace("\t\t(before=%d, after=%d)", g_list_length(action->actions_before), g_list_length(action->actions_after)); } } gboolean can_run_any(GHashTable * nodes) { GHashTableIter iter; pe_node_t *node = NULL; if (nodes == NULL) { return FALSE; } g_hash_table_iter_init(&iter, nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { if (can_run_resources(node) && node->weight >= 0) { return TRUE; } } return FALSE; } pe_action_t * create_pseudo_resource_op(pe_resource_t * rsc, const char *task, bool optional, bool runnable, pe_working_set_t *data_set) { pe_action_t *action = custom_action(rsc, pcmk__op_key(rsc->id, task, 0), task, NULL, optional, TRUE, data_set); pe__set_action_flags(action, pe_action_pseudo); if(runnable) { pe__set_action_flags(action, pe_action_runnable); } return action; } /*! * \internal * \brief Create an executor cancel op * * \param[in] rsc Resource of action to cancel * \param[in] task Name of action to cancel * \param[in] interval_ms Interval of action to cancel * \param[in] node Node of action to cancel * \param[in] data_set Working set of cluster * * \return Created op */ pe_action_t * pe_cancel_op(pe_resource_t *rsc, const char *task, guint interval_ms, pe_node_t *node, pe_working_set_t *data_set) { pe_action_t *cancel_op; char *interval_ms_s = crm_strdup_printf("%u", interval_ms); // @TODO dangerous if possible to schedule another action with this key char *key = pcmk__op_key(rsc->id, task, interval_ms); cancel_op = custom_action(rsc, key, RSC_CANCEL, node, FALSE, TRUE, data_set); free(cancel_op->task); cancel_op->task = strdup(RSC_CANCEL); free(cancel_op->cancel_task); cancel_op->cancel_task = strdup(task); add_hash_param(cancel_op->meta, XML_LRM_ATTR_TASK, task); add_hash_param(cancel_op->meta, XML_LRM_ATTR_INTERVAL_MS, interval_ms_s); free(interval_ms_s); return cancel_op; } /*! * \internal * \brief Create a shutdown op for a scheduler transition * * \param[in] node Node being shut down * \param[in] data_set Working set of cluster * * \return Created op */ pe_action_t * sched_shutdown_op(pe_node_t *node, pe_working_set_t *data_set) { char *shutdown_id = crm_strdup_printf("%s-%s", CRM_OP_SHUTDOWN, node->details->uname); pe_action_t *shutdown_op = custom_action(NULL, shutdown_id, CRM_OP_SHUTDOWN, node, FALSE, TRUE, data_set); crm_notice("Scheduling shutdown of node %s", node->details->uname); shutdown_constraints(node, shutdown_op, data_set); add_hash_param(shutdown_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); return shutdown_op; } static char * generate_transition_magic(const char *transition_key, int op_status, int op_rc) { CRM_CHECK(transition_key != NULL, return NULL); return crm_strdup_printf("%d:%d;%s", op_status, op_rc, transition_key); } static void append_digest(lrmd_event_data_t *op, xmlNode *update, const char *version, const char *magic, int level) { /* this will enable us to later determine that the * resource's parameters have changed and we should force * a restart */ char *digest = NULL; xmlNode *args_xml = NULL; if (op->params == NULL) { return; } args_xml = create_xml_node(NULL, XML_TAG_PARAMS); g_hash_table_foreach(op->params, hash2field, args_xml); pcmk__filter_op_for_digest(args_xml); digest = calculate_operation_digest(args_xml, version); #if 0 if (level < get_crm_log_level() && op->interval_ms == 0 && pcmk__str_eq(op->op_type, CRMD_ACTION_START, pcmk__str_none)) { char *digest_source = dump_xml_unformatted(args_xml); do_crm_log(level, "Calculated digest %s for %s (%s). Source: %s\n", digest, ID(update), magic, digest_source); free(digest_source); } #endif crm_xml_add(update, XML_LRM_ATTR_OP_DIGEST, digest); free_xml(args_xml); free(digest); } #define FAKE_TE_ID "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" /*! * \internal * \brief Create XML for resource operation history update * * \param[in,out] parent Parent XML node to add to * \param[in,out] op Operation event data * \param[in] caller_version DC feature set * \param[in] target_rc Expected result of operation * \param[in] node Name of node on which operation was performed * \param[in] origin Arbitrary description of update source * \param[in] level A log message will be logged at this level * * \return Newly created XML node for history update */ xmlNode * pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op, const char *caller_version, int target_rc, const char *node, const char *origin, int level) { char *key = NULL; char *magic = NULL; char *op_id = NULL; char *op_id_additional = NULL; char *local_user_data = NULL; const char *exit_reason = NULL; xmlNode *xml_op = NULL; const char *task = NULL; CRM_CHECK(op != NULL, return NULL); do_crm_log(level, "%s: Updating resource %s after %s op %s (interval=%u)", origin, op->rsc_id, op->op_type, services_lrm_status_str(op->op_status), op->interval_ms); crm_trace("DC version: %s", caller_version); task = op->op_type; /* Record a successful agent reload as a start, and a failed one as a * monitor, to make life easier for the scheduler when determining the * current state. * * @COMPAT We should check "reload" here only if the operation was for a * pre-OCF-1.1 resource agent, but we don't know that here, and we should * only ever get results for actions scheduled by us, so we can reasonably * assume any "reload" is actually a pre-1.1 agent reload. */ if (pcmk__str_any_of(task, CRMD_ACTION_RELOAD, CRMD_ACTION_RELOAD_AGENT, NULL)) { if (op->op_status == PCMK_LRM_OP_DONE) { task = CRMD_ACTION_START; } else { task = CRMD_ACTION_STATUS; } } key = pcmk__op_key(op->rsc_id, task, op->interval_ms); if (pcmk__str_eq(task, CRMD_ACTION_NOTIFY, pcmk__str_none)) { const char *n_type = crm_meta_value(op->params, "notify_type"); const char *n_task = crm_meta_value(op->params, "notify_operation"); CRM_LOG_ASSERT(n_type != NULL); CRM_LOG_ASSERT(n_task != NULL); op_id = pcmk__notify_key(op->rsc_id, n_type, n_task); if (op->op_status != PCMK_LRM_OP_PENDING) { /* Ignore notify errors. * * @TODO It might be better to keep the correct result here, and * ignore it in process_graph_event(). */ op->op_status = PCMK_LRM_OP_DONE; op->rc = 0; } } else if (did_rsc_op_fail(op, target_rc)) { op_id = pcmk__op_key(op->rsc_id, "last_failure", 0); if (op->interval_ms == 0) { // Ensure 'last' gets updated, in case record-pending is true op_id_additional = pcmk__op_key(op->rsc_id, "last", 0); } exit_reason = op->exit_reason; } else if (op->interval_ms > 0) { op_id = strdup(key); } else { op_id = pcmk__op_key(op->rsc_id, "last", 0); } again: xml_op = pcmk__xe_match(parent, XML_LRM_TAG_RSC_OP, XML_ATTR_ID, op_id); if (xml_op == NULL) { xml_op = create_xml_node(parent, XML_LRM_TAG_RSC_OP); } if (op->user_data == NULL) { crm_debug("Generating fake transition key for: " PCMK__OP_FMT " %d from %s", op->rsc_id, op->op_type, op->interval_ms, op->call_id, origin); local_user_data = pcmk__transition_key(-1, op->call_id, target_rc, FAKE_TE_ID); op->user_data = local_user_data; } if(magic == NULL) { magic = generate_transition_magic(op->user_data, op->op_status, op->rc); } crm_xml_add(xml_op, XML_ATTR_ID, op_id); crm_xml_add(xml_op, XML_LRM_ATTR_TASK_KEY, key); crm_xml_add(xml_op, XML_LRM_ATTR_TASK, task); crm_xml_add(xml_op, XML_ATTR_ORIGIN, origin); crm_xml_add(xml_op, XML_ATTR_CRM_VERSION, caller_version); crm_xml_add(xml_op, XML_ATTR_TRANSITION_KEY, op->user_data); crm_xml_add(xml_op, XML_ATTR_TRANSITION_MAGIC, magic); crm_xml_add(xml_op, XML_LRM_ATTR_EXIT_REASON, exit_reason == NULL ? "" : exit_reason); crm_xml_add(xml_op, XML_LRM_ATTR_TARGET, node); /* For context during triage */ crm_xml_add_int(xml_op, XML_LRM_ATTR_CALLID, op->call_id); crm_xml_add_int(xml_op, XML_LRM_ATTR_RC, op->rc); crm_xml_add_int(xml_op, XML_LRM_ATTR_OPSTATUS, op->op_status); crm_xml_add_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, op->interval_ms); if (compare_version("2.1", caller_version) <= 0) { if (op->t_run || op->t_rcchange || op->exec_time || op->queue_time) { crm_trace("Timing data (" PCMK__OP_FMT "): last=%u change=%u exec=%u queue=%u", op->rsc_id, op->op_type, op->interval_ms, op->t_run, op->t_rcchange, op->exec_time, op->queue_time); if ((op->interval_ms != 0) && (op->t_rcchange != 0)) { // Recurring ops may have changed rc after initial run crm_xml_add_ll(xml_op, XML_RSC_OP_LAST_CHANGE, (long long) op->t_rcchange); } else { crm_xml_add_ll(xml_op, XML_RSC_OP_LAST_CHANGE, (long long) op->t_run); } crm_xml_add_int(xml_op, XML_RSC_OP_T_EXEC, op->exec_time); crm_xml_add_int(xml_op, XML_RSC_OP_T_QUEUE, op->queue_time); } } if (pcmk__str_any_of(op->op_type, CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, NULL)) { /* * Record migrate_source and migrate_target always for migrate ops. */ const char *name = XML_LRM_ATTR_MIGRATE_SOURCE; crm_xml_add(xml_op, name, crm_meta_value(op->params, name)); name = XML_LRM_ATTR_MIGRATE_TARGET; crm_xml_add(xml_op, name, crm_meta_value(op->params, name)); } append_digest(op, xml_op, caller_version, magic, LOG_DEBUG); if (op_id_additional) { free(op_id); op_id = op_id_additional; op_id_additional = NULL; goto again; } if (local_user_data) { free(local_user_data); op->user_data = NULL; } free(magic); free(op_id); free(key); return xml_op; } pcmk__output_t * pcmk__new_logger(void) { int rc = pcmk_rc_ok; pcmk__output_t *out = NULL; const char* argv[] = { "", NULL }; pcmk__supported_format_t formats[] = { PCMK__SUPPORTED_FORMAT_LOG, { NULL, NULL, NULL } }; pcmk__register_formats(NULL, formats); rc = pcmk__output_new(&out, "log", NULL, (char**)argv); if ((rc != pcmk_rc_ok) || (out == NULL)) { crm_err("Can't log resource details due to internal error: %s\n", pcmk_rc_str(rc)); return NULL; } pe__register_messages(out); pcmk__register_lib_messages(out); return out; } + +/*! + * \internal + * \brief Check whether a resource has reached its migration threshold on a node + * + * \param[in] rsc Resource to check + * \param[in] node Node to check + * \param[in] data_set Cluster working set + * + * \return Failed resource (possibly a parent of \p rsc) if migration threshold + * has been reached, or NULL otherwise + */ +pe_resource_t * +pcmk__threshold_reached(pe_resource_t *rsc, pe_node_t *node, + pe_working_set_t *data_set) +{ + int fail_count, remaining_tries; + pe_resource_t *failed = rsc; + + // Migration threshold of 0 means never force away + if (rsc->migration_threshold == 0) { + return NULL; + } + + // If we're ignoring failures, also ignore the migration threshold + if (pcmk_is_set(rsc->flags, pe_rsc_failure_ignored)) { + return NULL; + } + + // If there are no failures, there's no need to force away + fail_count = pe_get_failcount(node, rsc, NULL, + pe_fc_effective|pe_fc_fillers, NULL, + data_set); + if (fail_count <= 0) { + return NULL; + } + + // If failed resource is anonymous clone instance, we'll force clone away + if (!pcmk_is_set(rsc->flags, pe_rsc_unique)) { + failed = uber_parent(rsc); + } + + // How many more times recovery will be tried on this node + remaining_tries = rsc->migration_threshold - fail_count; + + if (remaining_tries <= 0) { + crm_warn("Forcing %s away from %s after %d failures (max=%d)", + failed->id, node->details->uname, fail_count, + rsc->migration_threshold); + return failed; + } + + crm_info("%s can fail %d more times on %s before being forced off", + failed->id, remaining_tries, node->details->uname); + return NULL; +}