diff --git a/include/pcmki/pcmki_scheduler.h b/include/pcmki/pcmki_scheduler.h index 1712cd46bc..b6c52767ca 100644 --- a/include/pcmki/pcmki_scheduler.h +++ b/include/pcmki/pcmki_scheduler.h @@ -1,102 +1,101 @@ /* * Copyright 2014-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef PENGINE__H # define PENGINE__H typedef struct rsc_ticket_s rsc_ticket_t; # include # include # include # include # include # include # include enum pe_weights { pe_weights_none = 0x0, pe_weights_init = 0x1, pe_weights_forward = 0x4, pe_weights_positive = 0x8, pe_weights_rollback = 0x10, }; typedef struct { const char *id; const char *node_attribute; pe_resource_t *dependent; // The resource being colocated pe_resource_t *primary; // The resource the dependent is colocated with int dependent_role; // Colocation applies only if dependent has this role int primary_role; // Colocation applies only if primary has this role int score; bool influence; // Whether dependent influences active primary placement } pcmk__colocation_t; enum loss_ticket_policy_e { loss_ticket_stop, loss_ticket_demote, loss_ticket_fence, loss_ticket_freeze }; struct rsc_ticket_s { const char *id; pe_resource_t *rsc_lh; pe_ticket_t *ticket; enum loss_ticket_policy_e loss_policy; int role_lh; }; extern gboolean stage0(pe_working_set_t * data_set); -extern gboolean probe_resources(pe_working_set_t * data_set); extern gboolean stage2(pe_working_set_t * data_set); extern gboolean stage4(pe_working_set_t * data_set); extern gboolean stage5(pe_working_set_t * data_set); extern gboolean stage6(pe_working_set_t * data_set); void pcmk__unpack_constraints(pe_working_set_t *data_set); extern void add_maintenance_update(pe_working_set_t *data_set); xmlNode *pcmk__schedule_actions(pe_working_set_t *data_set, xmlNode *xml_input, 
crm_time_t *now); extern const char *transition_idle_timeout; /*! * \internal * \brief Check whether colocation's left-hand preferences should be considered * * \param[in] colocation Colocation constraint * \param[in] rsc Right-hand instance (normally this will be * colocation->primary, which NULL will be treated as, * but for clones or bundles with multiple instances * this can be a particular instance) * * \return true if colocation influence should be effective, otherwise false */ static inline bool pcmk__colocation_has_influence(const pcmk__colocation_t *colocation, const pe_resource_t *rsc) { if (rsc == NULL) { rsc = colocation->primary; } /* The left hand of a colocation influences the right hand's location * if the influence option is true, or the right hand is not yet active. */ return colocation->influence || (rsc->running_on == NULL); } #endif diff --git a/lib/pacemaker/Makefile.am b/lib/pacemaker/Makefile.am index 1f691aec3e..9403814b5b 100644 --- a/lib/pacemaker/Makefile.am +++ b/lib/pacemaker/Makefile.am @@ -1,62 +1,63 @@ # # Copyright 2004-2021 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. 
# include $(top_srcdir)/mk/common.mk AM_CPPFLAGS += -I$(top_builddir) -I$(top_srcdir) noinst_HEADERS = libpacemaker_private.h ## libraries lib_LTLIBRARIES = libpacemaker.la ## SOURCES libpacemaker_la_LDFLAGS = -version-info 4:0:3 libpacemaker_la_CFLAGS = $(CFLAGS_HARDENED_LIB) libpacemaker_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) libpacemaker_la_LIBADD = $(top_builddir)/lib/pengine/libpe_status.la \ $(top_builddir)/lib/cib/libcib.la \ $(top_builddir)/lib/lrmd/liblrmd.la \ $(top_builddir)/lib/common/libcrmcommon.la # -L$(top_builddir)/lib/pils -lpils -export-dynamic -module -avoid-version # Use += rather than backslashed continuation lines for parsing by bumplibs libpacemaker_la_SOURCES = libpacemaker_la_SOURCES += pcmk_acl.c libpacemaker_la_SOURCES += pcmk_cluster_queries.c libpacemaker_la_SOURCES += pcmk_fence.c libpacemaker_la_SOURCES += pcmk_graph_consumer.c libpacemaker_la_SOURCES += pcmk_graph_logging.c libpacemaker_la_SOURCES += pcmk_graph_producer.c libpacemaker_la_SOURCES += pcmk_output.c libpacemaker_la_SOURCES += pcmk_output_utils.c libpacemaker_la_SOURCES += pcmk_resource.c libpacemaker_la_SOURCES += pcmk_sched_actions.c libpacemaker_la_SOURCES += pcmk_sched_allocate.c libpacemaker_la_SOURCES += pcmk_sched_bundle.c libpacemaker_la_SOURCES += pcmk_sched_clone.c libpacemaker_la_SOURCES += pcmk_sched_colocation.c libpacemaker_la_SOURCES += pcmk_sched_constraints.c libpacemaker_la_SOURCES += pcmk_sched_fencing.c libpacemaker_la_SOURCES += pcmk_sched_group.c libpacemaker_la_SOURCES += pcmk_sched_location.c libpacemaker_la_SOURCES += pcmk_sched_messages.c libpacemaker_la_SOURCES += pcmk_sched_native.c libpacemaker_la_SOURCES += pcmk_sched_nodes.c libpacemaker_la_SOURCES += pcmk_sched_notif.c libpacemaker_la_SOURCES += pcmk_sched_ordering.c +libpacemaker_la_SOURCES += pcmk_sched_probes.c libpacemaker_la_SOURCES += pcmk_sched_promotable.c libpacemaker_la_SOURCES += pcmk_sched_remote.c libpacemaker_la_SOURCES += pcmk_sched_resource.c libpacemaker_la_SOURCES += 
pcmk_sched_tickets.c libpacemaker_la_SOURCES += pcmk_sched_transition.c libpacemaker_la_SOURCES += pcmk_sched_utilization.c libpacemaker_la_SOURCES += pcmk_simulate.c diff --git a/lib/pacemaker/libpacemaker_private.h b/lib/pacemaker/libpacemaker_private.h index fcd98cad58..e07561bd64 100644 --- a/lib/pacemaker/libpacemaker_private.h +++ b/lib/pacemaker/libpacemaker_private.h @@ -1,266 +1,273 @@ /* * Copyright 2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef PCMK__LIBPACEMAKER_PRIVATE__H # define PCMK__LIBPACEMAKER_PRIVATE__H /* This header is for the sole use of libpacemaker, so that functions can be * declared with G_GNUC_INTERNAL for efficiency. */ #include // pe_action_t, pe_node_t, pe_working_set_t // Actions G_GNUC_INTERNAL void pcmk__update_action_for_orderings(pe_action_t *action, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__log_action(const char *pre_text, pe_action_t *action, bool details); G_GNUC_INTERNAL pe_action_t *pcmk__new_rsc_pseudo_action(pe_resource_t *rsc, const char *task, bool optional, bool runnable); G_GNUC_INTERNAL pe_action_t *pcmk__new_cancel_action(pe_resource_t *rsc, const char *name, guint interval_ms, pe_node_t *node); G_GNUC_INTERNAL pe_action_t *pcmk__new_shutdown_action(pe_node_t *node, pe_working_set_t *data_set); G_GNUC_INTERNAL bool pcmk__action_locks_rsc_to_node(const pe_action_t *action); G_GNUC_INTERNAL void pcmk__deduplicate_action_inputs(pe_action_t *action); G_GNUC_INTERNAL void pcmk__output_actions(pe_working_set_t *data_set); // Producing transition graphs (pcmk_graph_producer.c) G_GNUC_INTERNAL bool pcmk__graph_has_loop(pe_action_t *init_action, pe_action_t *action, pe_action_wrapper_t *input); G_GNUC_INTERNAL void pcmk__add_action_to_graph(pe_action_t *action, pe_working_set_t *data_set); G_GNUC_INTERNAL 
void pcmk__create_graph(pe_working_set_t *data_set); // Fencing (pcmk_sched_fencing.c) G_GNUC_INTERNAL void pcmk__order_vs_fence(pe_action_t *stonith_op, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__order_vs_unfence(pe_resource_t *rsc, pe_node_t *node, pe_action_t *action, enum pe_ordering order, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__fence_guest(pe_node_t *node, pe_working_set_t *data_set); G_GNUC_INTERNAL bool pcmk__node_unfenced(pe_node_t *node); G_GNUC_INTERNAL bool pcmk__is_unfence_device(const pe_resource_t *rsc, const pe_working_set_t *data_set); G_GNUC_INTERNAL pe_resource_t *pcmk__find_constraint_resource(GList *rsc_list, const char *id); G_GNUC_INTERNAL xmlNode *pcmk__expand_tags_in_sets(xmlNode *xml_obj, pe_working_set_t *data_set); G_GNUC_INTERNAL bool pcmk__valid_resource_or_tag(pe_working_set_t *data_set, const char *id, pe_resource_t **rsc, pe_tag_t **tag); G_GNUC_INTERNAL bool pcmk__tag_to_set(xmlNode *xml_obj, xmlNode **rsc_set, const char *attr, bool convert_rsc, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__create_internal_constraints(pe_working_set_t *data_set); // Location constraints G_GNUC_INTERNAL void pcmk__unpack_location(xmlNode *xml_obj, pe_working_set_t *data_set); G_GNUC_INTERNAL pe__location_t *pcmk__new_location(const char *id, pe_resource_t *rsc, int node_weight, const char *discover_mode, pe_node_t *foo_node, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__apply_locations(pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__apply_location(pe__location_t *constraint, pe_resource_t *rsc); // Colocation constraints enum pcmk__coloc_affects { pcmk__coloc_affects_nothing = 0, pcmk__coloc_affects_location, pcmk__coloc_affects_role, }; G_GNUC_INTERNAL enum pcmk__coloc_affects pcmk__colocation_affects(pe_resource_t *dependent, pe_resource_t *primary, pcmk__colocation_t *constraint, bool preview); G_GNUC_INTERNAL void pcmk__apply_coloc_to_weights(pe_resource_t *dependent, pe_resource_t 
*primary, pcmk__colocation_t *constraint); G_GNUC_INTERNAL void pcmk__apply_coloc_to_priority(pe_resource_t *dependent, pe_resource_t *primary, pcmk__colocation_t *constraint); G_GNUC_INTERNAL void pcmk__unpack_colocation(xmlNode *xml_obj, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__new_colocation(const char *id, const char *node_attr, int score, pe_resource_t *dependent, pe_resource_t *primary, const char *dependent_role, const char *primary_role, bool influence, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__block_colocated_starts(pe_action_t *action, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__new_ordering(pe_resource_t *lh_rsc, char *lh_task, pe_action_t *lh_action, pe_resource_t *rh_rsc, char *rh_task, pe_action_t *rh_action, enum pe_ordering type, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__unpack_ordering(xmlNode *xml_obj, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__disable_invalid_orderings(pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__order_stops_before_shutdown(pe_node_t *node, pe_action_t *shutdown_op, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__apply_orderings(pe_working_set_t *data_set); /*! 
* \internal * \brief Create a new ordering between two resource actions * * \param[in] lh_rsc Resource for 'first' action * \param[in] rh_rsc Resource for 'then' action * \param[in] lh_task Action key for 'first' action * \param[in] rh_task Action key for 'then' action * \param[in] flags Bitmask of enum pe_ordering flags * \param[in] data_set Cluster working set to add ordering to */ #define pcmk__order_resource_actions(lh_rsc, lh_task, rh_rsc, rh_task, \ flags, data_set) \ pcmk__new_ordering((lh_rsc), pcmk__op_key((lh_rsc)->id, (lh_task), 0), \ NULL, \ (rh_rsc), pcmk__op_key((rh_rsc)->id, (rh_task), 0), \ NULL, (flags), (data_set)) #define pcmk__order_starts(rsc1, rsc2, type, data_set) \ pcmk__order_resource_actions((rsc1), CRMD_ACTION_START, \ (rsc2), CRMD_ACTION_START, (type), (data_set)) #define pcmk__order_stops(rsc1, rsc2, type, data_set) \ pcmk__order_resource_actions((rsc1), CRMD_ACTION_STOP, \ (rsc2), CRMD_ACTION_STOP, (type), (data_set)) G_GNUC_INTERNAL void pcmk__unpack_rsc_ticket(xmlNode *xml_obj, pe_working_set_t *data_set); -G_GNUC_INTERNAL -void pcmk__order_probes(pe_working_set_t *data_set); - G_GNUC_INTERNAL bool pcmk__is_failed_remote_node(pe_node_t *node); G_GNUC_INTERNAL void pcmk__order_remote_connection_actions(pe_working_set_t *data_set); G_GNUC_INTERNAL bool pcmk__rsc_corresponds_to_guest(pe_resource_t *rsc, pe_node_t *node); G_GNUC_INTERNAL pe_node_t *pcmk__connection_host_for_action(pe_action_t *action); G_GNUC_INTERNAL void pcmk__substitute_remote_addr(pe_resource_t *rsc, GHashTable *params, pe_working_set_t *data_set); G_GNUC_INTERNAL void pcmk__add_bundle_meta_to_xml(xmlNode *args_xml, pe_action_t *action); // Groups (pcmk_sched_group.c) G_GNUC_INTERNAL GList *pcmk__group_colocated_resources(pe_resource_t *rsc, pe_resource_t *orig_rsc, GList *colocated_rscs); // Bundles (pcmk_sched_bundle.c) G_GNUC_INTERNAL void pcmk__output_bundle_actions(pe_resource_t *rsc); // Functions applying to more than one variant (pcmk_sched_resource.c) 
G_GNUC_INTERNAL GList *pcmk__colocated_resources(pe_resource_t *rsc, pe_resource_t *orig_rsc, GList *colocated_rscs); G_GNUC_INTERNAL void pcmk__output_resource_actions(pe_resource_t *rsc); G_GNUC_INTERNAL bool pcmk__assign_primitive(pe_resource_t *rsc, pe_node_t *chosen, bool force); G_GNUC_INTERNAL bool pcmk__assign_resource(pe_resource_t *rsc, pe_node_t *node, bool force); G_GNUC_INTERNAL void pcmk__unassign_resource(pe_resource_t *rsc); G_GNUC_INTERNAL bool pcmk__threshold_reached(pe_resource_t *rsc, pe_node_t *node, pe_working_set_t *data_set, pe_resource_t **failed); + +// Functions related to probes (pcmk_sched_probes.c) + +G_GNUC_INTERNAL +void pcmk__order_probes(pe_working_set_t *data_set); + +G_GNUC_INTERNAL +void pcmk__schedule_probes(pe_working_set_t *data_set); + + #endif // PCMK__LIBPACEMAKER_PRIVATE__H diff --git a/lib/pacemaker/pcmk_sched_allocate.c b/lib/pacemaker/pcmk_sched_allocate.c index 6603f6246b..9ee94ac74e 100644 --- a/lib/pacemaker/pcmk_sched_allocate.c +++ b/lib/pacemaker/pcmk_sched_allocate.c @@ -1,1973 +1,1425 @@ /* * Copyright 2004-2021 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. 
*/ #include -#include - #include #include #include #include #include #include #include #include "libpacemaker_private.h" CRM_TRACE_INIT_DATA(pacemaker); extern bool pcmk__is_daemon; void set_alloc_actions(pe_working_set_t * data_set); extern void ReloadRsc(pe_resource_t * rsc, pe_node_t *node, pe_working_set_t * data_set); extern gboolean DeleteRsc(pe_resource_t * rsc, pe_node_t * node, gboolean optional, pe_working_set_t * data_set); /* Method tables, one entry per resource variant: complex_set_cmds() selects an entry using rsc->variant as the array index. NOTE(review): the entry order presumably mirrors the order of the resource variant enum -- confirm against its declaration before reordering or adding entries. */ resource_alloc_functions_t resource_class_alloc_functions[] = { /* entry 0: native_* methods */ { pcmk__native_merge_weights, pcmk__native_allocate, native_create_actions, native_create_probe, native_internal_constraints, native_rsc_colocation_lh, native_rsc_colocation_rh, pcmk__colocated_resources, native_rsc_location, native_action_flags, native_update_actions, pcmk__output_resource_actions, native_expand, native_append_meta, }, /* entry 1: group_* methods (probe creation reuses native_create_probe) */ { pcmk__group_merge_weights, pcmk__group_allocate, group_create_actions, native_create_probe, group_internal_constraints, group_rsc_colocation_lh, group_rsc_colocation_rh, pcmk__group_colocated_resources, group_rsc_location, group_action_flags, group_update_actions, pcmk__output_resource_actions, group_expand, group_append_meta, }, /* entry 2: clone_* methods (weight merging reuses pcmk__native_merge_weights) */ { pcmk__native_merge_weights, pcmk__clone_allocate, clone_create_actions, clone_create_probe, clone_internal_constraints, clone_rsc_colocation_lh, clone_rsc_colocation_rh, pcmk__colocated_resources, clone_rsc_location, clone_action_flags, pcmk__multi_update_actions, pcmk__output_resource_actions, clone_expand, clone_append_meta, }, /* entry 3: pcmk__bundle_* methods (weight merging reuses pcmk__native_merge_weights) */ { pcmk__native_merge_weights, pcmk__bundle_allocate, pcmk__bundle_create_actions, pcmk__bundle_create_probe, pcmk__bundle_internal_constraints, pcmk__bundle_rsc_colocation_lh, pcmk__bundle_rsc_colocation_rh, pcmk__colocated_resources, pcmk__bundle_rsc_location, pcmk__bundle_action_flags, pcmk__multi_update_actions, pcmk__output_bundle_actions, pcmk__bundle_expand, pcmk__bundle_append_meta, } }; static gboolean check_rsc_parameters(pe_resource_t * rsc, pe_node_t * node, 
xmlNode * rsc_entry, gboolean active_here, pe_working_set_t * data_set) { int attr_lpc = 0; gboolean force_restart = FALSE; gboolean delete_resource = FALSE; gboolean changed = FALSE; const char *value = NULL; const char *old_value = NULL; const char *attr_list[] = { XML_ATTR_TYPE, XML_AGENT_ATTR_CLASS, XML_AGENT_ATTR_PROVIDER }; for (; attr_lpc < PCMK__NELEM(attr_list); attr_lpc++) { value = crm_element_value(rsc->xml, attr_list[attr_lpc]); old_value = crm_element_value(rsc_entry, attr_list[attr_lpc]); if (value == old_value /* i.e. NULL */ || pcmk__str_eq(value, old_value, pcmk__str_none)) { continue; } changed = TRUE; trigger_unfencing(rsc, node, "Device definition changed", NULL, data_set); if (active_here) { force_restart = TRUE; crm_notice("Forcing restart of %s on %s, %s changed: %s -> %s", rsc->id, node->details->uname, attr_list[attr_lpc], crm_str(old_value), crm_str(value)); } } if (force_restart) { /* make sure the restart happens */ stop_action(rsc, node, FALSE); pe__set_resource_flags(rsc, pe_rsc_start_pending); delete_resource = TRUE; } else if (changed) { delete_resource = TRUE; } return delete_resource; } static void CancelXmlOp(pe_resource_t * rsc, xmlNode * xml_op, pe_node_t * active_node, const char *reason, pe_working_set_t * data_set) { guint interval_ms = 0; pe_action_t *cancel = NULL; const char *task = NULL; const char *call_id = NULL; CRM_CHECK(xml_op != NULL, return); CRM_CHECK(active_node != NULL, return); task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); call_id = crm_element_value(xml_op, XML_LRM_ATTR_CALLID); crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); crm_info("Action " PCMK__OP_FMT " on %s will be stopped: %s", rsc->id, task, interval_ms, active_node->details->uname, (reason? 
reason : "unknown")); cancel = pcmk__new_cancel_action(rsc, task, interval_ms, active_node); add_hash_param(cancel->meta, XML_LRM_ATTR_CALLID, call_id); pcmk__new_ordering(rsc, stop_key(rsc), NULL, rsc, NULL, cancel, pe_order_optional, data_set); } static gboolean check_action_definition(pe_resource_t * rsc, pe_node_t * active_node, xmlNode * xml_op, pe_working_set_t * data_set) { char *key = NULL; guint interval_ms = 0; const op_digest_cache_t *digest_data = NULL; gboolean did_change = FALSE; const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); const char *digest_secure = NULL; CRM_CHECK(active_node != NULL, return FALSE); crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); if (interval_ms > 0) { xmlNode *op_match = NULL; /* we need to reconstruct the key because of the way we used to construct resource IDs */ key = pcmk__op_key(rsc->id, task, interval_ms); pe_rsc_trace(rsc, "Checking parameters for %s", key); op_match = find_rsc_op_entry(rsc, key); if ((op_match == NULL) && pcmk_is_set(data_set->flags, pe_flag_stop_action_orphans)) { CancelXmlOp(rsc, xml_op, active_node, "orphan", data_set); free(key); return TRUE; } else if (op_match == NULL) { pe_rsc_debug(rsc, "Orphan action detected: %s on %s", key, active_node->details->uname); free(key); return TRUE; } free(key); key = NULL; } crm_trace("Testing " PCMK__OP_FMT " on %s", rsc->id, task, interval_ms, active_node->details->uname); if ((interval_ms == 0) && pcmk__str_eq(task, RSC_STATUS, pcmk__str_casei)) { /* Reload based on the start action not a probe */ task = RSC_START; } else if ((interval_ms == 0) && pcmk__str_eq(task, RSC_MIGRATED, pcmk__str_casei)) { /* Reload based on the start action not a migrate */ task = RSC_START; } else if ((interval_ms == 0) && pcmk__str_eq(task, RSC_PROMOTE, pcmk__str_casei)) { /* Reload based on the start action not a promote */ task = RSC_START; } digest_data = rsc_action_digest_cmp(rsc, xml_op, active_node, data_set); if 
(pcmk_is_set(data_set->flags, pe_flag_sanitized)) { digest_secure = crm_element_value(xml_op, XML_LRM_ATTR_SECURE_DIGEST); } if(digest_data->rc != RSC_DIGEST_MATCH && digest_secure && digest_data->digest_secure_calc && strcmp(digest_data->digest_secure_calc, digest_secure) == 0) { if (!pcmk__is_daemon && data_set->priv != NULL) { pcmk__output_t *out = data_set->priv; out->info(out, "Only 'private' parameters to " PCMK__OP_FMT " on %s changed: %s", rsc->id, task, interval_ms, active_node->details->uname, crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC)); } } else if (digest_data->rc == RSC_DIGEST_RESTART) { /* Changes that force a restart */ pe_action_t *required = NULL; did_change = TRUE; key = pcmk__op_key(rsc->id, task, interval_ms); crm_log_xml_info(digest_data->params_restart, "params:restart"); required = custom_action(rsc, key, task, NULL, FALSE, TRUE, data_set); pe_action_set_reason(required, "resource definition change", true); trigger_unfencing(rsc, active_node, "Device parameters changed", NULL, data_set); } else if ((digest_data->rc == RSC_DIGEST_ALL) || (digest_data->rc == RSC_DIGEST_UNKNOWN)) { // Changes that can potentially be handled by an agent reload const char *digest_restart = crm_element_value(xml_op, XML_LRM_ATTR_RESTART_DIGEST); did_change = TRUE; trigger_unfencing(rsc, active_node, "Device parameters changed (reload)", NULL, data_set); crm_log_xml_info(digest_data->params_all, "params:reload"); key = pcmk__op_key(rsc->id, task, interval_ms); if (interval_ms > 0) { pe_action_t *op = NULL; #if 0 /* Always reload/restart the entire resource */ ReloadRsc(rsc, active_node, data_set); #else /* Re-sending the recurring op is sufficient - the old one will be cancelled automatically */ op = custom_action(rsc, key, task, active_node, TRUE, TRUE, data_set); pe__set_action_flags(op, pe_action_reschedule); #endif } else if (digest_restart) { pe_rsc_trace(rsc, "Reloading '%s' action for resource %s", task, rsc->id); /* Reload this resource */ 
ReloadRsc(rsc, active_node, data_set); free(key); } else { pe_action_t *required = NULL; pe_rsc_trace(rsc, "Resource %s doesn't support agent reloads", rsc->id); /* Re-send the start/demote/promote op * Recurring ops will be detected independently */ required = custom_action(rsc, key, task, NULL, FALSE, TRUE, data_set); pe_action_set_reason(required, "resource definition change", true); } } return did_change; } /*! * \internal * \brief Do deferred action checks after allocation * * \param[in] data_set Working set for cluster */ static void check_params(pe_resource_t *rsc, pe_node_t *node, xmlNode *rsc_op, enum pe_check_parameters check, pe_working_set_t *data_set) { const char *reason = NULL; op_digest_cache_t *digest_data = NULL; switch (check) { case pe_check_active: if (check_action_definition(rsc, node, rsc_op, data_set) && pe_get_failcount(node, rsc, NULL, pe_fc_effective, NULL, data_set)) { reason = "action definition changed"; } break; case pe_check_last_failure: digest_data = rsc_action_digest_cmp(rsc, rsc_op, node, data_set); switch (digest_data->rc) { case RSC_DIGEST_UNKNOWN: crm_trace("Resource %s history entry %s on %s has no digest to compare", rsc->id, ID(rsc_op), node->details->id); break; case RSC_DIGEST_MATCH: break; default: reason = "resource parameters have changed"; break; } break; } if (reason) { pe__clear_failcount(rsc, node, reason, data_set); } } static void check_actions_for(xmlNode * rsc_entry, pe_resource_t * rsc, pe_node_t * node, pe_working_set_t * data_set) { GList *gIter = NULL; int offset = -1; int stop_index = 0; int start_index = 0; const char *task = NULL; xmlNode *rsc_op = NULL; GList *op_list = NULL; GList *sorted_op_list = NULL; CRM_CHECK(node != NULL, return); if (pcmk_is_set(rsc->flags, pe_rsc_orphan)) { pe_resource_t *parent = uber_parent(rsc); if(parent == NULL || pe_rsc_is_clone(parent) == FALSE || pcmk_is_set(parent->flags, pe_rsc_unique)) { pe_rsc_trace(rsc, "Skipping param check for %s and deleting: orphan", rsc->id); 
DeleteRsc(rsc, node, FALSE, data_set); } else { pe_rsc_trace(rsc, "Skipping param check for %s (orphan clone)", rsc->id); } return; } else if (pe_find_node_id(rsc->running_on, node->details->id) == NULL) { if (check_rsc_parameters(rsc, node, rsc_entry, FALSE, data_set)) { DeleteRsc(rsc, node, FALSE, data_set); } pe_rsc_trace(rsc, "Skipping param check for %s: no longer active on %s", rsc->id, node->details->uname); return; } pe_rsc_trace(rsc, "Processing %s on %s", rsc->id, node->details->uname); if (check_rsc_parameters(rsc, node, rsc_entry, TRUE, data_set)) { DeleteRsc(rsc, node, FALSE, data_set); } for (rsc_op = pcmk__xe_first_child(rsc_entry); rsc_op != NULL; rsc_op = pcmk__xe_next(rsc_op)) { if (pcmk__str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP, pcmk__str_none)) { op_list = g_list_prepend(op_list, rsc_op); } } sorted_op_list = g_list_sort(op_list, sort_op_by_callid); calculate_active_ops(sorted_op_list, &start_index, &stop_index); for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; guint interval_ms = 0; offset++; if (start_index < stop_index) { /* stopped */ continue; } else if (offset < start_index) { /* action occurred prior to a start */ continue; } task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); crm_element_value_ms(rsc_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); if ((interval_ms > 0) && (pcmk_is_set(rsc->flags, pe_rsc_maintenance) || node->details->maintenance)) { // Maintenance mode cancels recurring operations CancelXmlOp(rsc, rsc_op, node, "maintenance mode", data_set); } else if ((interval_ms > 0) || pcmk__strcase_any_of(task, RSC_STATUS, RSC_START, RSC_PROMOTE, RSC_MIGRATED, NULL)) { /* If a resource operation failed, and the operation's definition * has changed, clear any fail count so they can be retried fresh. 
*/ if (pe__bundle_needs_remote_name(rsc, data_set)) { /* We haven't allocated resources to nodes yet, so if the * REMOTE_CONTAINER_HACK is used, we may calculate the digest * based on the literal "#uname" value rather than the properly * substituted value. That would mistakenly make the action * definition appear to have been changed. Defer the check until * later in this case. */ pe__add_param_check(rsc_op, rsc, node, pe_check_active, data_set); } else if (check_action_definition(rsc, node, rsc_op, data_set) && pe_get_failcount(node, rsc, NULL, pe_fc_effective, NULL, data_set)) { pe__clear_failcount(rsc, node, "action definition changed", data_set); } } } g_list_free(sorted_op_list); } static GList * find_rsc_list(GList *result, pe_resource_t * rsc, const char *id, gboolean renamed_clones, gboolean partial, pe_working_set_t * data_set) { GList *gIter = NULL; gboolean match = FALSE; if (id == NULL) { return NULL; } if (rsc == NULL) { if (data_set == NULL) { return NULL; } for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *child = (pe_resource_t *) gIter->data; result = find_rsc_list(result, child, id, renamed_clones, partial, NULL); } return result; } if (partial) { if (strstr(rsc->id, id)) { match = TRUE; } else if (renamed_clones && rsc->clone_name && strstr(rsc->clone_name, id)) { match = TRUE; } } else { if (strcmp(rsc->id, id) == 0) { match = TRUE; } else if (renamed_clones && rsc->clone_name && strcmp(rsc->clone_name, id) == 0) { match = TRUE; } } if (match) { result = g_list_prepend(result, rsc); } if (rsc->children) { gIter = rsc->children; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *child = (pe_resource_t *) gIter->data; result = find_rsc_list(result, child, id, renamed_clones, partial, NULL); } } return result; } static void check_actions(pe_working_set_t * data_set) { const char *id = NULL; pe_node_t *node = NULL; xmlNode *lrm_rscs = NULL; xmlNode *status = pcmk_find_cib_element(data_set->input, 
XML_CIB_TAG_STATUS); xmlNode *node_state = NULL; for (node_state = pcmk__xe_first_child(status); node_state != NULL; node_state = pcmk__xe_next(node_state)) { if (pcmk__str_eq((const char *)node_state->name, XML_CIB_TAG_STATE, pcmk__str_none)) { id = crm_element_value(node_state, XML_ATTR_ID); lrm_rscs = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE); lrm_rscs = find_xml_node(lrm_rscs, XML_LRM_TAG_RESOURCES, FALSE); node = pe_find_node_id(data_set->nodes, id); if (node == NULL) { continue; /* Still need to check actions for a maintenance node to cancel existing monitor operations */ } else if (!pcmk__node_available(node) && !node->details->maintenance) { crm_trace("Skipping param check for %s: can't run resources", node->details->uname); continue; } crm_trace("Processing node %s", node->details->uname); if (node->details->online || pcmk_is_set(data_set->flags, pe_flag_stonith_enabled)) { xmlNode *rsc_entry = NULL; for (rsc_entry = pcmk__xe_first_child(lrm_rscs); rsc_entry != NULL; rsc_entry = pcmk__xe_next(rsc_entry)) { if (pcmk__str_eq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE, pcmk__str_none)) { if (xml_has_children(rsc_entry)) { GList *gIter = NULL; GList *result = NULL; const char *rsc_id = ID(rsc_entry); CRM_CHECK(rsc_id != NULL, return); result = find_rsc_list(NULL, NULL, rsc_id, TRUE, FALSE, data_set); for (gIter = result; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; if (rsc->variant != pe_native) { continue; } check_actions_for(rsc_entry, rsc, node, data_set); } g_list_free(result); } } } } } } }

/*!
 * \internal
 * \brief Check whether a fail-count clearing action is already present
 *
 * \param[in] node  Node to look for the action on
 * \param[in] rsc   Resource to look for the action for
 *
 * \return TRUE if pe__resource_actions() finds at least one
 *         CRM_OP_CLEAR_FAILCOUNT action for \p rsc on \p node, otherwise FALSE
 */
static gboolean
failcount_clear_action_exists(pe_node_t * node, pe_resource_t * rsc)
{
    GList *clear_ops = pe__resource_actions(rsc, node, CRM_OP_CLEAR_FAILCOUNT,
                                            TRUE);
    gboolean found = (clear_ops != NULL);

    // Only the list itself is released here, not the actions it points to
    g_list_free(clear_ops);
    return found;
}

static void common_apply_stickiness(pe_resource_t * rsc, pe_node_t * node, pe_working_set_t * data_set) { if (rsc->children) { GList *gIter = rsc->children; for (; gIter 
!= NULL; gIter = gIter->next) { pe_resource_t *child_rsc = (pe_resource_t *) gIter->data; common_apply_stickiness(child_rsc, node, data_set); } return; } if (pcmk_is_set(rsc->flags, pe_rsc_managed) && rsc->stickiness != 0 && pcmk__list_of_1(rsc->running_on)) { pe_node_t *current = pe_find_node_id(rsc->running_on, node->details->id); pe_node_t *match = pe_hash_table_lookup(rsc->allowed_nodes, node->details->id); if (current == NULL) { } else if ((match != NULL) || pcmk_is_set(data_set->flags, pe_flag_symmetric_cluster)) { pe_resource_t *sticky_rsc = rsc; resource_location(sticky_rsc, node, rsc->stickiness, "stickiness", data_set); pe_rsc_debug(sticky_rsc, "Resource %s: preferring current location" " (node=%s, weight=%d)", sticky_rsc->id, node->details->uname, rsc->stickiness); } else { GHashTableIter iter; pe_node_t *nIter = NULL; pe_rsc_debug(rsc, "Ignoring stickiness for %s: the cluster is asymmetric" " and node %s is not explicitly allowed", rsc->id, node->details->uname); g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&nIter)) { crm_err("%s[%s] = %d", rsc->id, nIter->details->uname, nIter->weight); } } } /* Check the migration threshold only if a failcount clear action * has not already been placed for this resource on the node. * There is no sense in potentially forcing the resource from this * node if the failcount is being reset anyway. * * @TODO A clear_failcount operation can be scheduled in stage4() via * check_actions_for(), or in stage5() via check_params(). This runs in * stage2(), so it cannot detect those, meaning we might check the migration * threshold when we shouldn't -- worst case, we stop or move the resource, * then move it back next transition. 
*/ if (failcount_clear_action_exists(node, rsc) == FALSE) { pe_resource_t *failed = NULL; if (pcmk__threshold_reached(rsc, node, data_set, &failed)) { resource_location(failed, node, -INFINITY, "__fail_limit__", data_set); } } } void complex_set_cmds(pe_resource_t * rsc) { GList *gIter = rsc->children; rsc->cmds = &resource_class_alloc_functions[rsc->variant]; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *child_rsc = (pe_resource_t *) gIter->data; complex_set_cmds(child_rsc); } } void set_alloc_actions(pe_working_set_t * data_set) { GList *gIter = data_set->resources; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; complex_set_cmds(rsc); } } static void calculate_system_health(gpointer gKey, gpointer gValue, gpointer user_data) { const char *key = (const char *)gKey; const char *value = (const char *)gValue; int *system_health = (int *)user_data; if (!gKey || !gValue || !user_data) { return; } if (pcmk__starts_with(key, "#health")) { int score; /* Convert the value into an integer */ score = char2score(value); /* Add it to the running total */ *system_health = pe__add_scores(score, *system_health); } } static gboolean apply_system_health(pe_working_set_t * data_set) { GList *gIter = NULL; const char *health_strategy = pe_pref(data_set->config_hash, "node-health-strategy"); int base_health = 0; if (pcmk__str_eq(health_strategy, "none", pcmk__str_null_matches | pcmk__str_casei)) { /* Prevent any accidental health -> score translation */ pcmk__score_red = 0; pcmk__score_yellow = 0; pcmk__score_green = 0; return TRUE; } else if (pcmk__str_eq(health_strategy, "migrate-on-red", pcmk__str_casei)) { /* Resources on nodes which have health values of red are * weighted away from that node. 
*/ pcmk__score_red = -INFINITY; pcmk__score_yellow = 0; pcmk__score_green = 0; } else if (pcmk__str_eq(health_strategy, "only-green", pcmk__str_casei)) { /* Resources on nodes which have health values of red or yellow * are forced away from that node. */ pcmk__score_red = -INFINITY; pcmk__score_yellow = -INFINITY; pcmk__score_green = 0; } else if (pcmk__str_eq(health_strategy, "progressive", pcmk__str_casei)) { /* Same as the above, but use the r/y/g scores provided by the user * Defaults are provided by the pe_prefs table * Also, custom health "base score" can be used */ base_health = char2score(pe_pref(data_set->config_hash, "node-health-base")); } else if (pcmk__str_eq(health_strategy, "custom", pcmk__str_casei)) { /* Requires the admin to configure the rsc_location constaints for * processing the stored health scores */ /* TODO: Check for the existence of appropriate node health constraints */ return TRUE; } else { crm_err("Unknown node health strategy: %s", health_strategy); return FALSE; } crm_info("Applying automated node health strategy: %s", health_strategy); for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { int system_health = base_health; pe_node_t *node = (pe_node_t *) gIter->data; /* Search through the node hash table for system health entries. */ g_hash_table_foreach(node->details->attrs, calculate_system_health, &system_health); crm_info(" Node %s has an combined system health of %d", node->details->uname, system_health); /* If the health is non-zero, then create a new location constraint so * that the weight will be added later on. 
*/ if (system_health != 0) { GList *gIter2 = data_set->resources; for (; gIter2 != NULL; gIter2 = gIter2->next) { pe_resource_t *rsc = (pe_resource_t *) gIter2->data; pcmk__new_location(health_strategy, rsc, system_health, NULL, node, data_set); } } } return TRUE; } gboolean stage0(pe_working_set_t * data_set) { if (data_set->input == NULL) { return FALSE; } if (!pcmk_is_set(data_set->flags, pe_flag_have_status)) { crm_trace("Calculating status"); cluster_status(data_set); } set_alloc_actions(data_set); apply_system_health(data_set); pcmk__unpack_constraints(data_set); return TRUE; } -/* - * Check nodes for resources started outside of the LRM - */ -gboolean -probe_resources(pe_working_set_t * data_set) -{ - for (GList *gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { - pe_node_t *node = (pe_node_t *) gIter->data; - const char *probed = NULL; - - if (node->details->online == FALSE) { - - if (pcmk__is_failed_remote_node(node)) { - pe_fence_node(data_set, node, "the connection is unrecoverable", FALSE); - } - continue; - - } else if (node->details->unclean) { - continue; - - } else if (node->details->rsc_discovery_enabled == FALSE) { - /* resource discovery is disabled for this node */ - continue; - } - - /* This is no longer needed for live clusters, since the probe_complete - * node attribute will never be in the CIB. However this is still useful - * for processing old saved CIBs (< 1.1.14), including the - * reprobe-target_rc regression test. 
- */ - probed = pe_node_attribute_raw(node, CRM_OP_PROBED); - if (probed != NULL && crm_is_true(probed) == FALSE) { - pe_action_t *probe_op = custom_action(NULL, crm_strdup_printf("%s-%s", CRM_OP_REPROBE, node->details->uname), - CRM_OP_REPROBE, node, FALSE, TRUE, data_set); - - add_hash_param(probe_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); - continue; - } - - for (GList *gIter2 = data_set->resources; gIter2 != NULL; gIter2 = gIter2->next) { - pe_resource_t *rsc = (pe_resource_t *) gIter2->data; - - rsc->cmds->create_probe(rsc, node, NULL, FALSE, data_set); - } - } - return TRUE; -} - static void rsc_discover_filter(pe_resource_t *rsc, pe_node_t *node) { pe_resource_t *top = uber_parent(rsc); pe_node_t *match; if (rsc->exclusive_discover == FALSE && top->exclusive_discover == FALSE) { return; } g_list_foreach(rsc->children, (GFunc) rsc_discover_filter, node); match = g_hash_table_lookup(rsc->allowed_nodes, node->details->id); if (match && match->rsc_discover_mode != pe_discover_exclusive) { match->weight = -INFINITY; } } static time_t shutdown_time(pe_node_t *node, pe_working_set_t *data_set) { const char *shutdown = pe_node_attribute_raw(node, XML_CIB_ATTR_SHUTDOWN); time_t result = 0; if (shutdown) { long long result_ll; if (pcmk__scan_ll(shutdown, &result_ll, 0LL) == pcmk_rc_ok) { result = (time_t) result_ll; } } return result? 
result : get_effective_time(data_set); } static void apply_shutdown_lock(pe_resource_t *rsc, pe_working_set_t *data_set) { const char *class; // Only primitives and (uncloned) groups may be locked if (rsc->variant == pe_group) { g_list_foreach(rsc->children, (GFunc) apply_shutdown_lock, data_set); } else if (rsc->variant != pe_native) { return; } // Fence devices and remote connections can't be locked class = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); if (pcmk__str_eq(class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_null_matches) || pe__resource_is_remote_conn(rsc, data_set)) { return; } if (rsc->lock_node != NULL) { // The lock was obtained from resource history if (rsc->running_on != NULL) { /* The resource was started elsewhere even though it is now * considered locked. This shouldn't be possible, but as a * failsafe, we don't want to disturb the resource now. */ pe_rsc_info(rsc, "Cancelling shutdown lock because %s is already active", rsc->id); pe__clear_resource_history(rsc, rsc->lock_node, data_set); rsc->lock_node = NULL; rsc->lock_time = 0; } // Only a resource active on exactly one node can be locked } else if (pcmk__list_of_1(rsc->running_on)) { pe_node_t *node = rsc->running_on->data; if (node->details->shutdown) { if (node->details->unclean) { pe_rsc_debug(rsc, "Not locking %s to unclean %s for shutdown", rsc->id, node->details->uname); } else { rsc->lock_node = node; rsc->lock_time = shutdown_time(node, data_set); } } } if (rsc->lock_node == NULL) { // No lock needed return; } if (data_set->shutdown_lock > 0) { time_t lock_expiration = rsc->lock_time + data_set->shutdown_lock; pe_rsc_info(rsc, "Locking %s to %s due to shutdown (expires @%lld)", rsc->id, rsc->lock_node->details->uname, (long long) lock_expiration); pe__update_recheck_time(++lock_expiration, data_set); } else { pe_rsc_info(rsc, "Locking %s to %s due to shutdown", rsc->id, rsc->lock_node->details->uname); } // If resource is locked to one node, ban it from all other nodes for (GList 
*item = data_set->nodes; item != NULL; item = item->next) { pe_node_t *node = item->data; if (strcmp(node->details->uname, rsc->lock_node->details->uname)) { resource_location(rsc, node, -CRM_SCORE_INFINITY, XML_CONFIG_ATTR_SHUTDOWN_LOCK, data_set); } } } /* * \internal * \brief Stage 2 of cluster status: apply node-specific criteria * * Count known nodes, and apply location constraints, stickiness, and exclusive * resource discovery. */ gboolean stage2(pe_working_set_t * data_set) { GList *gIter = NULL; if (pcmk_is_set(data_set->flags, pe_flag_shutdown_lock)) { g_list_foreach(data_set->resources, (GFunc) apply_shutdown_lock, data_set); } if (!pcmk_is_set(data_set->flags, pe_flag_no_compat)) { // @COMPAT API backward compatibility for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; if (node && (node->weight >= 0) && node->details->online && (node->details->type != node_ping)) { data_set->max_valid_nodes++; } } } pcmk__apply_locations(data_set); gIter = data_set->nodes; for (; gIter != NULL; gIter = gIter->next) { GList *gIter2 = NULL; pe_node_t *node = (pe_node_t *) gIter->data; gIter2 = data_set->resources; for (; gIter2 != NULL; gIter2 = gIter2->next) { pe_resource_t *rsc = (pe_resource_t *) gIter2->data; common_apply_stickiness(rsc, node, data_set); rsc_discover_filter(rsc, node); } } return TRUE; } /* * Check for orphaned or redefined actions */ gboolean stage4(pe_working_set_t * data_set) { check_actions(data_set); return TRUE; } static void * convert_const_pointer(const void *ptr) { /* Worst function ever */ return (void *)ptr; } static gint sort_rsc_process_order(gconstpointer a, gconstpointer b, gpointer data) { int rc = 0; int r1_weight = -INFINITY; int r2_weight = -INFINITY; const char *reason = "existence"; GList *nodes = (GList *) data; const pe_resource_t *resource1 = a; const pe_resource_t *resource2 = b; pe_node_t *r1_node = NULL; pe_node_t *r2_node = NULL; GList *gIter = NULL; GHashTable 
*r1_nodes = NULL; GHashTable *r2_nodes = NULL; reason = "priority"; r1_weight = resource1->priority; r2_weight = resource2->priority; if (r1_weight > r2_weight) { rc = -1; goto done; } if (r1_weight < r2_weight) { rc = 1; goto done; } reason = "no node list"; if (nodes == NULL) { goto done; } r1_nodes = pcmk__native_merge_weights(convert_const_pointer(resource1), resource1->id, NULL, NULL, 1, pe_weights_forward | pe_weights_init); pe__show_node_weights(true, NULL, resource1->id, r1_nodes, resource1->cluster); r2_nodes = pcmk__native_merge_weights(convert_const_pointer(resource2), resource2->id, NULL, NULL, 1, pe_weights_forward | pe_weights_init); pe__show_node_weights(true, NULL, resource2->id, r2_nodes, resource2->cluster); /* Current location score */ reason = "current location"; r1_weight = -INFINITY; r2_weight = -INFINITY; if (resource1->running_on) { r1_node = pe__current_node(resource1); r1_node = g_hash_table_lookup(r1_nodes, r1_node->details->id); if (r1_node != NULL) { r1_weight = r1_node->weight; } } if (resource2->running_on) { r2_node = pe__current_node(resource2); r2_node = g_hash_table_lookup(r2_nodes, r2_node->details->id); if (r2_node != NULL) { r2_weight = r2_node->weight; } } if (r1_weight > r2_weight) { rc = -1; goto done; } if (r1_weight < r2_weight) { rc = 1; goto done; } reason = "score"; for (gIter = nodes; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; r1_node = NULL; r2_node = NULL; r1_weight = -INFINITY; if (r1_nodes) { r1_node = g_hash_table_lookup(r1_nodes, node->details->id); } if (r1_node) { r1_weight = r1_node->weight; } r2_weight = -INFINITY; if (r2_nodes) { r2_node = g_hash_table_lookup(r2_nodes, node->details->id); } if (r2_node) { r2_weight = r2_node->weight; } if (r1_weight > r2_weight) { rc = -1; goto done; } if (r1_weight < r2_weight) { rc = 1; goto done; } } done: crm_trace("%s (%d) on %s %c %s (%d) on %s: %s", resource1->id, r1_weight, r1_node ? r1_node->details->id : "n/a", rc < 0 ? 
'>' : rc > 0 ? '<' : '=', resource2->id, r2_weight, r2_node ? r2_node->details->id : "n/a", reason); if (r1_nodes) { g_hash_table_destroy(r1_nodes); } if (r2_nodes) { g_hash_table_destroy(r2_nodes); } return rc; } static void allocate_resources(pe_working_set_t * data_set) { GList *gIter = NULL; if (pcmk_is_set(data_set->flags, pe_flag_have_remote_nodes)) { /* Allocate remote connection resources first (which will also allocate * any colocation dependencies). If the connection is migrating, always * prefer the partial migration target. */ for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; if (rsc->is_remote_node == FALSE) { continue; } pe_rsc_trace(rsc, "Allocating remote connection resource '%s'", rsc->id); rsc->cmds->allocate(rsc, rsc->partial_migration_target, data_set); } } /* now do the rest of the resources */ for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; if (rsc->is_remote_node == TRUE) { continue; } pe_rsc_trace(rsc, "Allocating %s resource '%s'", crm_element_name(rsc->xml), rsc->id); rsc->cmds->allocate(rsc, NULL, data_set); } } // Clear fail counts for orphaned rsc on all online nodes static void cleanup_orphans(pe_resource_t * rsc, pe_working_set_t * data_set) { GList *gIter = NULL; for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; if (node->details->online && pe_get_failcount(node, rsc, NULL, pe_fc_effective, NULL, data_set)) { pe_action_t *clear_op = NULL; clear_op = pe__clear_failcount(rsc, node, "it is orphaned", data_set); /* We can't use order_action_then_stop() here because its * pe_order_preserve breaks things */ pcmk__new_ordering(clear_op->rsc, NULL, clear_op, rsc, stop_key(rsc), NULL, pe_order_optional, data_set); } } } gboolean stage5(pe_working_set_t * data_set) { pcmk__output_t *out = data_set->priv; GList *gIter = NULL; if 
(!pcmk__str_eq(data_set->placement_strategy, "default", pcmk__str_casei)) { GList *nodes = g_list_copy(data_set->nodes); nodes = pcmk__sort_nodes(nodes, NULL, data_set); data_set->resources = g_list_sort_with_data(data_set->resources, sort_rsc_process_order, nodes); g_list_free(nodes); } gIter = data_set->nodes; for (; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; if (pcmk_is_set(data_set->flags, pe_flag_show_utilization)) { out->message(out, "node-capacity", node, "Original"); } } crm_trace("Allocating services"); /* Take (next) highest resource, assign it and create its actions */ allocate_resources(data_set); gIter = data_set->nodes; for (; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; if (pcmk_is_set(data_set->flags, pe_flag_show_utilization)) { out->message(out, "node-capacity", node, "Remaining"); } } // Process deferred action checks pe__foreach_param_check(data_set, check_params); pe__free_param_checks(data_set); if (pcmk_is_set(data_set->flags, pe_flag_startup_probes)) { crm_trace("Calculating needed probes"); - /* This code probably needs optimization - * ptest -x with 100 nodes, 100 clones and clone-max=100: - - With probes: - - ptest[14781]: 2010/09/27_17:56:46 notice: TRACE: do_calculations: pengine.c:258 Calculate cluster status - ptest[14781]: 2010/09/27_17:56:46 notice: TRACE: do_calculations: pengine.c:278 Applying placement constraints - ptest[14781]: 2010/09/27_17:56:47 notice: TRACE: do_calculations: pengine.c:285 Create internal constraints - ptest[14781]: 2010/09/27_17:56:47 notice: TRACE: do_calculations: pengine.c:292 Check actions - ptest[14781]: 2010/09/27_17:56:48 notice: TRACE: do_calculations: pengine.c:299 Allocate resources - ptest[14781]: 2010/09/27_17:56:48 notice: TRACE: stage5: allocate.c:881 Allocating services - ptest[14781]: 2010/09/27_17:56:49 notice: TRACE: stage5: allocate.c:894 Calculating needed probes - ptest[14781]: 2010/09/27_17:56:51 notice: 
TRACE: stage5: allocate.c:899 Creating actions - ptest[14781]: 2010/09/27_17:56:52 notice: TRACE: stage5: allocate.c:905 Creating done - ptest[14781]: 2010/09/27_17:56:52 notice: TRACE: do_calculations: pengine.c:306 Processing fencing and shutdown cases - ptest[14781]: 2010/09/27_17:56:52 notice: TRACE: do_calculations: pengine.c:313 Applying ordering constraints - 36s - ptest[14781]: 2010/09/27_17:57:28 notice: TRACE: do_calculations: pengine.c:320 Create transition graph - - Without probes: - - ptest[14637]: 2010/09/27_17:56:21 notice: TRACE: do_calculations: pengine.c:258 Calculate cluster status - ptest[14637]: 2010/09/27_17:56:22 notice: TRACE: do_calculations: pengine.c:278 Applying placement constraints - ptest[14637]: 2010/09/27_17:56:22 notice: TRACE: do_calculations: pengine.c:285 Create internal constraints - ptest[14637]: 2010/09/27_17:56:22 notice: TRACE: do_calculations: pengine.c:292 Check actions - ptest[14637]: 2010/09/27_17:56:23 notice: TRACE: do_calculations: pengine.c:299 Allocate resources - ptest[14637]: 2010/09/27_17:56:23 notice: TRACE: stage5: allocate.c:881 Allocating services - ptest[14637]: 2010/09/27_17:56:24 notice: TRACE: stage5: allocate.c:899 Creating actions - ptest[14637]: 2010/09/27_17:56:25 notice: TRACE: stage5: allocate.c:905 Creating done - ptest[14637]: 2010/09/27_17:56:25 notice: TRACE: do_calculations: pengine.c:306 Processing fencing and shutdown cases - ptest[14637]: 2010/09/27_17:56:25 notice: TRACE: do_calculations: pengine.c:313 Applying ordering constraints - ptest[14637]: 2010/09/27_17:56:25 notice: TRACE: do_calculations: pengine.c:320 Create transition graph - */ - - probe_resources(data_set); + pcmk__schedule_probes(data_set); } crm_trace("Handle orphans"); if (pcmk_is_set(data_set->flags, pe_flag_stop_rsc_orphans)) { for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; /* There's no need to recurse into rsc->children because those * should 
just be unallocated clone instances. */ if (pcmk_is_set(rsc->flags, pe_rsc_orphan)) { cleanup_orphans(rsc, data_set); } } } crm_trace("Creating actions"); for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; rsc->cmds->create_actions(rsc, data_set); } crm_trace("Creating done"); return TRUE; } static gboolean is_managed(const pe_resource_t * rsc) { GList *gIter = rsc->children; if (pcmk_is_set(rsc->flags, pe_rsc_managed)) { return TRUE; } for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *child_rsc = (pe_resource_t *) gIter->data; if (is_managed(child_rsc)) { return TRUE; } } return FALSE; } static gboolean any_managed_resources(pe_working_set_t * data_set) { GList *gIter = data_set->resources; for (; gIter != NULL; gIter = gIter->next) { pe_resource_t *rsc = (pe_resource_t *) gIter->data; if (is_managed(rsc)) { return TRUE; } } return FALSE; } /* * Create dependencies for stonith and shutdown operations */ gboolean stage6(pe_working_set_t * data_set) { pe_action_t *dc_down = NULL; pe_action_t *stonith_op = NULL; gboolean integrity_lost = FALSE; gboolean need_stonith = TRUE; GList *gIter; GList *stonith_ops = NULL; GList *shutdown_ops = NULL; /* Remote ordering constraints need to happen prior to calculating fencing * because it is one more place we can mark nodes as needing fencing. */ pcmk__order_remote_connection_actions(data_set); crm_trace("Processing fencing and shutdown cases"); if (any_managed_resources(data_set) == FALSE) { crm_notice("Delaying fencing operations until there are resources to manage"); need_stonith = FALSE; } /* Check each node for stonith/shutdown */ for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { pe_node_t *node = (pe_node_t *) gIter->data; /* Guest nodes are "fenced" by recovering their container resource, * so handle them separately. 
*/ if (pe__is_guest_node(node)) { if (node->details->remote_requires_reset && need_stonith && pe_can_fence(data_set, node)) { pcmk__fence_guest(node, data_set); } continue; } stonith_op = NULL; if (node->details->unclean && need_stonith && pe_can_fence(data_set, node)) { stonith_op = pe_fence_op(node, NULL, FALSE, "node is unclean", FALSE, data_set); pe_warn("Scheduling Node %s for STONITH", node->details->uname); pcmk__order_vs_fence(stonith_op, data_set); if (node->details->is_dc) { // Remember if the DC is being fenced dc_down = stonith_op; } else { if (!pcmk_is_set(data_set->flags, pe_flag_concurrent_fencing) && (stonith_ops != NULL)) { /* Concurrent fencing is disabled, so order each non-DC * fencing in a chain. If there is any DC fencing or * shutdown, it will be ordered after the last action in the * chain later. */ order_actions((pe_action_t *) stonith_ops->data, stonith_op, pe_order_optional); } // Remember all non-DC fencing actions in a separate list stonith_ops = g_list_prepend(stonith_ops, stonith_op); } } else if (node->details->online && node->details->shutdown && /* TODO define what a shutdown op means for a remote node. * For now we do not send shutdown operations for remote nodes, but * if we can come up with a good use for this in the future, we will. 
*/ pe__is_guest_or_remote_node(node) == FALSE) { pe_action_t *down_op = pcmk__new_shutdown_action(node, data_set); if (node->details->is_dc) { // Remember if the DC is being shut down dc_down = down_op; } else { // Remember non-DC shutdowns for later ordering shutdown_ops = g_list_prepend(shutdown_ops, down_op); } } if (node->details->unclean && stonith_op == NULL) { integrity_lost = TRUE; pe_warn("Node %s is unclean!", node->details->uname); } } if (integrity_lost) { if (!pcmk_is_set(data_set->flags, pe_flag_stonith_enabled)) { pe_warn("YOUR RESOURCES ARE NOW LIKELY COMPROMISED"); pe_err("ENABLE STONITH TO KEEP YOUR RESOURCES SAFE"); } else if (!pcmk_is_set(data_set->flags, pe_flag_have_quorum)) { crm_notice("Cannot fence unclean nodes until quorum is" " attained (or no-quorum-policy is set to ignore)"); } } if (dc_down != NULL) { /* Order any non-DC shutdowns before any DC shutdown, to avoid repeated * DC elections. However, we don't want to order non-DC shutdowns before * a DC *fencing*, because even though we don't want a node that's * shutting down to become DC, the DC fencing could be ordered before a * clone stop that's also ordered before the shutdowns, thus leading to * a graph loop. */ if (pcmk__str_eq(dc_down->task, CRM_OP_SHUTDOWN, pcmk__str_casei)) { for (gIter = shutdown_ops; gIter != NULL; gIter = gIter->next) { pe_action_t *node_stop = (pe_action_t *) gIter->data; crm_debug("Ordering shutdown on %s before %s on DC %s", node_stop->node->details->uname, dc_down->task, dc_down->node->details->uname); order_actions(node_stop, dc_down, pe_order_optional); } } // Order any non-DC fencing before any DC fencing or shutdown if (pcmk_is_set(data_set->flags, pe_flag_concurrent_fencing)) { /* With concurrent fencing, order each non-DC fencing action * separately before any DC fencing or shutdown. 
*/ for (gIter = stonith_ops; gIter != NULL; gIter = gIter->next) { order_actions((pe_action_t *) gIter->data, dc_down, pe_order_optional); } } else if (stonith_ops) { /* Without concurrent fencing, the non-DC fencing actions are * already ordered relative to each other, so we just need to order * the DC fencing after the last action in the chain (which is the * first item in the list). */ order_actions((pe_action_t *) stonith_ops->data, dc_down, pe_order_optional); } } g_list_free(stonith_ops); g_list_free(shutdown_ops); return TRUE; } - -static gboolean -order_first_probe_unneeded(pe_action_t * probe, pe_action_t * rh_action) -{ - /* No need to probe the resource on the node that is being - * unfenced. Otherwise it might introduce transition loop - * since probe will be performed after the node is - * unfenced. - */ - if (pcmk__str_eq(rh_action->task, CRM_OP_FENCE, pcmk__str_casei) - && probe->node && rh_action->node - && probe->node->details == rh_action->node->details) { - const char *op = g_hash_table_lookup(rh_action->meta, "stonith_action"); - - if (pcmk__str_eq(op, "on", pcmk__str_casei)) { - return TRUE; - } - } - - // Shutdown waits for probe to complete only if it's on the same node - if ((pcmk__str_eq(rh_action->task, CRM_OP_SHUTDOWN, pcmk__str_casei)) - && probe->node && rh_action->node - && probe->node->details != rh_action->node->details) { - return TRUE; - } - return FALSE; -} - -static void -order_first_probes_imply_stops(pe_working_set_t * data_set) -{ - GList *gIter = NULL; - - for (gIter = data_set->ordering_constraints; gIter != NULL; gIter = gIter->next) { - pe__ordering_t *order = gIter->data; - enum pe_ordering order_type = pe_order_optional; - - pe_resource_t *lh_rsc = order->lh_rsc; - pe_resource_t *rh_rsc = order->rh_rsc; - pe_action_t *lh_action = order->lh_action; - pe_action_t *rh_action = order->rh_action; - const char *lh_action_task = order->lh_action_task; - const char *rh_action_task = order->rh_action_task; - - GList *probes = 
NULL; - GList *rh_actions = NULL; - - GList *pIter = NULL; - - if (lh_rsc == NULL) { - continue; - - } else if (rh_rsc && lh_rsc == rh_rsc) { - continue; - } - - if (lh_action == NULL && lh_action_task == NULL) { - continue; - } - - if (rh_action == NULL && rh_action_task == NULL) { - continue; - } - - /* Technically probe is expected to return "not running", which could be - * the alternative of stop action if the status of the resource is - * unknown yet. - */ - if (lh_action && !pcmk__str_eq(lh_action->task, RSC_STOP, pcmk__str_casei)) { - continue; - - } else if (lh_action == NULL - && lh_action_task - && !pcmk__ends_with(lh_action_task, "_" RSC_STOP "_0")) { - continue; - } - - /* Do not probe the resource inside of a stopping container. Otherwise - * it might introduce transition loop since probe will be performed - * after the container starts again. - */ - if (rh_rsc && lh_rsc->container == rh_rsc) { - if (rh_action && pcmk__str_eq(rh_action->task, RSC_STOP, pcmk__str_casei)) { - continue; - - } else if (rh_action == NULL && rh_action_task - && pcmk__ends_with(rh_action_task,"_" RSC_STOP "_0")) { - continue; - } - } - - if (order->type == pe_order_none) { - continue; - } - - // Preserve the order options for future filtering - if (pcmk_is_set(order->type, pe_order_apply_first_non_migratable)) { - pe__set_order_flags(order_type, - pe_order_apply_first_non_migratable); - } - - if (pcmk_is_set(order->type, pe_order_same_node)) { - pe__set_order_flags(order_type, pe_order_same_node); - } - - // Keep the order types for future filtering - if (order->type == pe_order_anti_colocation - || order->type == pe_order_load) { - order_type = order->type; - } - - probes = pe__resource_actions(lh_rsc, NULL, RSC_STATUS, FALSE); - if (probes == NULL) { - continue; - } - - if (rh_action) { - rh_actions = g_list_prepend(rh_actions, rh_action); - - } else if (rh_rsc && rh_action_task) { - rh_actions = find_actions(rh_rsc->actions, rh_action_task, NULL); - } - - if (rh_actions 
== NULL) { - g_list_free(probes); - continue; - } - - crm_trace("Processing for LH probe based on ordering constraint %s -> %s" - " (id=%d, type=%.6x)", - lh_action ? lh_action->uuid : lh_action_task, - rh_action ? rh_action->uuid : rh_action_task, - order->id, order->type); - - for (pIter = probes; pIter != NULL; pIter = pIter->next) { - pe_action_t *probe = (pe_action_t *) pIter->data; - GList *rIter = NULL; - - for (rIter = rh_actions; rIter != NULL; rIter = rIter->next) { - pe_action_t *rh_action_iter = (pe_action_t *) rIter->data; - - if (order_first_probe_unneeded(probe, rh_action_iter)) { - continue; - } - order_actions(probe, rh_action_iter, order_type); - } - } - - g_list_free(rh_actions); - g_list_free(probes); - } -} - -static void -order_first_probe_then_restart_repromote(pe_action_t * probe, - pe_action_t * after, - pe_working_set_t * data_set) -{ - GList *gIter = NULL; - bool interleave = FALSE; - pe_resource_t *compatible_rsc = NULL; - - if (probe == NULL - || probe->rsc == NULL - || probe->rsc->variant != pe_native) { - return; - } - - if (after == NULL - // Avoid running into any possible loop - || pcmk_is_set(after->flags, pe_action_tracking)) { - return; - } - - if (!pcmk__str_eq(probe->task, RSC_STATUS, pcmk__str_casei)) { - return; - } - - pe__set_action_flags(after, pe_action_tracking); - - crm_trace("Processing based on %s %s -> %s %s", - probe->uuid, - probe->node ? probe->node->details->uname: "", - after->uuid, - after->node ? after->node->details->uname : ""); - - if (after->rsc - /* Better not build a dependency directly with a clone/group. - * We are going to proceed through the ordering chain and build - * dependencies with its children. 
- */ - && after->rsc->variant == pe_native - && probe->rsc != after->rsc) { - - GList *then_actions = NULL; - enum pe_ordering probe_order_type = pe_order_optional; - - if (pcmk__str_eq(after->task, RSC_START, pcmk__str_casei)) { - then_actions = pe__resource_actions(after->rsc, NULL, RSC_STOP, FALSE); - - } else if (pcmk__str_eq(after->task, RSC_PROMOTE, pcmk__str_casei)) { - then_actions = pe__resource_actions(after->rsc, NULL, RSC_DEMOTE, FALSE); - } - - for (gIter = then_actions; gIter != NULL; gIter = gIter->next) { - pe_action_t *then = (pe_action_t *) gIter->data; - - // Skip any pseudo action which for example is implied by fencing - if (pcmk_is_set(then->flags, pe_action_pseudo)) { - continue; - } - - order_actions(probe, then, probe_order_type); - } - g_list_free(then_actions); - } - - if (after->rsc - && after->rsc->variant > pe_group) { - const char *interleave_s = g_hash_table_lookup(after->rsc->meta, - XML_RSC_ATTR_INTERLEAVE); - - interleave = crm_is_true(interleave_s); - - if (interleave) { - /* For an interleaved clone, we should build a dependency only - * with the relevant clone child. - */ - compatible_rsc = find_compatible_child(probe->rsc, - after->rsc, - RSC_ROLE_UNKNOWN, - FALSE, data_set); - } - } - - for (gIter = after->actions_after; gIter != NULL; gIter = gIter->next) { - pe_action_wrapper_t *after_wrapper = (pe_action_wrapper_t *) gIter->data; - /* pe_order_implies_then is the reason why a required A.start - * implies/enforces B.start to be required too, which is the cause of - * B.restart/re-promote. - * - * Not sure about pe_order_implies_then_on_node though. It's now only - * used for unfencing case, which tends to introduce transition - * loops... 
- */ - - if (!pcmk_is_set(after_wrapper->type, pe_order_implies_then)) { - /* The order type between a group/clone and its child such as - * B.start-> B_child.start is: - * pe_order_implies_first_printed | pe_order_runnable_left - * - * Proceed through the ordering chain and build dependencies with - * its children. - */ - if (after->rsc == NULL - || after->rsc->variant < pe_group - || probe->rsc->parent == after->rsc - || after_wrapper->action->rsc == NULL - || after_wrapper->action->rsc->variant > pe_group - || after->rsc != after_wrapper->action->rsc->parent) { - continue; - } - - /* Proceed to the children of a group or a non-interleaved clone. - * For an interleaved clone, proceed only to the relevant child. - */ - if (after->rsc->variant > pe_group - && interleave == TRUE - && (compatible_rsc == NULL - || compatible_rsc != after_wrapper->action->rsc)) { - continue; - } - } - - crm_trace("Proceeding through %s %s -> %s %s (type=%#.6x)", - after->uuid, - after->node ? after->node->details->uname: "", - after_wrapper->action->uuid, - after_wrapper->action->node ? 
after_wrapper->action->node->details->uname : "", - after_wrapper->type); - - order_first_probe_then_restart_repromote(probe, after_wrapper->action, data_set); - } -} - -static void clear_actions_tracking_flag(pe_working_set_t * data_set) -{ - GList *gIter = NULL; - - for (gIter = data_set->actions; gIter != NULL; gIter = gIter->next) { - pe_action_t *action = (pe_action_t *) gIter->data; - - if (pcmk_is_set(action->flags, pe_action_tracking)) { - pe__clear_action_flags(action, pe_action_tracking); - } - } -} - -static void -order_first_rsc_probes(pe_resource_t * rsc, pe_working_set_t * data_set) -{ - GList *gIter = NULL; - GList *probes = NULL; - - g_list_foreach(rsc->children, (GFunc) order_first_rsc_probes, data_set); - - if (rsc->variant != pe_native) { - return; - } - - probes = pe__resource_actions(rsc, NULL, RSC_STATUS, FALSE); - - for (gIter = probes; gIter != NULL; gIter= gIter->next) { - pe_action_t *probe = (pe_action_t *) gIter->data; - GList *aIter = NULL; - - for (aIter = probe->actions_after; aIter != NULL; aIter = aIter->next) { - pe_action_wrapper_t *after_wrapper = (pe_action_wrapper_t *) aIter->data; - - order_first_probe_then_restart_repromote(probe, after_wrapper->action, data_set); - clear_actions_tracking_flag(data_set); - } - } - - g_list_free(probes); -} - -static void -order_first_probes(pe_working_set_t * data_set) -{ - GList *gIter = NULL; - - for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { - pe_resource_t *rsc = (pe_resource_t *) gIter->data; - - order_first_rsc_probes(rsc, data_set); - } - - order_first_probes_imply_stops(data_set); -} - -static void -order_then_probes(pe_working_set_t * data_set) -{ -#if 0 - GList *gIter = NULL; - - for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { - pe_resource_t *rsc = (pe_resource_t *) gIter->data; - - /* Given "A then B", we would prefer to wait for A to be - * started before probing B. 
- * - * If A was a filesystem on which the binaries and data for B - * lived, it would have been useful if the author of B's agent - * could assume that A is running before B.monitor will be - * called. - * - * However we can't _only_ probe once A is running, otherwise - * we'd not detect the state of B if A could not be started - * for some reason. - * - * In practice however, we cannot even do an opportunistic - * version of this because B may be moving: - * - * B.probe -> B.start - * B.probe -> B.stop - * B.stop -> B.start - * A.stop -> A.start - * A.start -> B.probe - * - * So far so good, but if we add the result of this code: - * - * B.stop -> A.stop - * - * Then we get a loop: - * - * B.probe -> B.stop -> A.stop -> A.start -> B.probe - * - * We could kill the 'B.probe -> B.stop' dependency, but that - * could mean stopping B "too" soon, because B.start must wait - * for the probes to complete. - * - * Another option is to allow it only if A is a non-unique - * clone with clone-max == node-max (since we'll never be - * moving it). However, we could still be stopping one - * instance at the same time as starting another. - - * The complexity of checking for allowed conditions combined - * with the ever narrowing usecase suggests that this code - * should remain disabled until someone gets smarter. 
- */ - pe_action_t *start = NULL; - GList *actions = NULL; - GList *probes = NULL; - - actions = pe__resource_actions(rsc, NULL, RSC_START, FALSE); - - if (actions) { - start = actions->data; - g_list_free(actions); - } - - if(start == NULL) { - crm_err("No start action for %s", rsc->id); - continue; - } - - probes = pe__resource_actions(rsc, NULL, RSC_STATUS, FALSE); - - for (actions = start->actions_before; actions != NULL; actions = actions->next) { - pe_action_wrapper_t *before = (pe_action_wrapper_t *) actions->data; - - GList *pIter = NULL; - pe_action_t *first = before->action; - pe_resource_t *first_rsc = first->rsc; - - if(first->required_runnable_before) { - GList *clone_actions = NULL; - for (clone_actions = first->actions_before; clone_actions != NULL; clone_actions = clone_actions->next) { - before = (pe_action_wrapper_t *) clone_actions->data; - - crm_trace("Testing %s -> %s (%p) for %s", first->uuid, before->action->uuid, before->action->rsc, start->uuid); - - CRM_ASSERT(before->action->rsc); - first_rsc = before->action->rsc; - break; - } - - } else if(!pcmk__str_eq(first->task, RSC_START, pcmk__str_casei)) { - crm_trace("Not a start op %s for %s", first->uuid, start->uuid); - } - - if(first_rsc == NULL) { - continue; - - } else if(uber_parent(first_rsc) == uber_parent(start->rsc)) { - crm_trace("Same parent %s for %s", first_rsc->id, start->uuid); - continue; - - } else if(FALSE && pe_rsc_is_clone(uber_parent(first_rsc)) == FALSE) { - crm_trace("Not a clone %s for %s", first_rsc->id, start->uuid); - continue; - } - - crm_err("Applying %s before %s %d", first->uuid, start->uuid, uber_parent(first_rsc)->variant); - - for (pIter = probes; pIter != NULL; pIter = pIter->next) { - pe_action_t *probe = (pe_action_t *) pIter->data; - - crm_err("Ordering %s before %s", first->uuid, probe->uuid); - order_actions(first, probe, pe_order_optional); - } - } - } -#endif -} - -void -pcmk__order_probes(pe_working_set_t *data_set) -{ - order_first_probes(data_set); 
-    order_then_probes(data_set);
-}
diff --git a/lib/pacemaker/pcmk_sched_probes.c b/lib/pacemaker/pcmk_sched_probes.c
new file mode 100644
index 0000000000..1e18584619
--- /dev/null
+++ b/lib/pacemaker/pcmk_sched_probes.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright 2004-2022 the Pacemaker project contributors
+ *
+ * The version control history for this file may have further details.
+ *
+ * This source code is licensed under the GNU General Public License version 2
+ * or later (GPLv2+) WITHOUT ANY WARRANTY.
+ */
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include "libpacemaker_private.h"
+
+/*!
+ * \internal
+ * \brief Check whether a probe should be ordered before another action
+ *
+ * \param[in] probe  Probe action to check
+ * \param[in] then   Other action to check
+ *
+ * \return true if \p probe should be ordered before \p then, otherwise false
+ */
+static bool
+probe_needed_before_action(pe_action_t *probe, pe_action_t *then)
+{
+    // Probes on a node are performed after unfencing it, not before
+    if (pcmk__str_eq(then->task, CRM_OP_FENCE, pcmk__str_casei)
+        && (probe->node != NULL) && (then->node != NULL)
+        && (probe->node->details == then->node->details)) {
+        const char *op = g_hash_table_lookup(then->meta, "stonith_action");
+
+        if (pcmk__str_eq(op, "on", pcmk__str_casei)) {
+            return false;
+        }
+    }
+
+    // Probes must precede a node's shutdown, but only probes on that same node
+    if (pcmk__str_eq(then->task, CRM_OP_SHUTDOWN, pcmk__str_none)
+        && (probe->node != NULL) && (then->node != NULL)
+        && (probe->node->details != then->node->details)) {
+        return false;
+    }
+
+    // Otherwise probes should always be done before any other action
+    return true;
+}
+
+/*!
+ * \internal
+ * \brief Add implicit "probe then X" orderings for "stop then X" orderings
+ *
+ * If the state of a resource is not known yet, a probe will be scheduled,
+ * expecting a "not running" result. If the probe fails, a stop will not be
+ * scheduled until the next transition. Thus, if there are ordering constraints
+ * like "stop this resource then do something else that's not for the same
+ * resource", add implicit "probe this resource then do something" equivalents
+ * so the relation is upheld until we know whether a stop is needed.
+ *
+ * \param[in] data_set  Cluster working set
+ */
+static void
+add_probe_orderings_for_stops(pe_working_set_t *data_set)
+{
+    for (GList *iter = data_set->ordering_constraints; iter != NULL;
+         iter = iter->next) {
+
+        pe__ordering_t *order = iter->data;
+        enum pe_ordering order_type = pe_order_optional;
+        GList *probes = NULL;
+        GList *then_actions = NULL;
+
+        // Skip disabled orderings
+        if (order->type == pe_order_none) {
+            continue;
+        }
+
+        // Skip non-resource orderings, and orderings for the same resource
+        if ((order->lh_rsc == NULL) || (order->lh_rsc == order->rh_rsc)) {
+            continue;
+        }
+
+        // Skip invalid orderings (shouldn't be possible)
+        if (((order->lh_action == NULL) && (order->lh_action_task == NULL)) ||
+            ((order->rh_action == NULL) && (order->rh_action_task == NULL))) {
+            continue;
+        }
+
+        // Skip orderings for first actions other than stop
+        if ((order->lh_action != NULL)
+            && !pcmk__str_eq(order->lh_action->task, RSC_STOP, pcmk__str_none)) {
+            continue;
+        } else if ((order->lh_action == NULL)
+                   && !pcmk__ends_with(order->lh_action_task, "_" RSC_STOP "_0")) {
+            continue;
+        }
+
+        /* Do not imply a probe ordering for a resource inside of a stopping
+         * container. Otherwise, it might introduce a transition loop, since a
+         * probe could be scheduled after the container starts again.
+         */
+        if ((order->rh_rsc != NULL)
+            && (order->lh_rsc->container == order->rh_rsc)) {
+
+            if ((order->rh_action != NULL)
+                && pcmk__str_eq(order->rh_action->task, RSC_STOP,
+                                pcmk__str_none)) {
+                continue;
+            } else if ((order->rh_action == NULL)
+                       && pcmk__ends_with(order->rh_action_task,
+                                          "_" RSC_STOP "_0")) {
+                continue;
+            }
+        }
+
+        // Preserve certain order options for future filtering
+        if (pcmk_is_set(order->type, pe_order_apply_first_non_migratable)) {
+            pe__set_order_flags(order_type,
+                                pe_order_apply_first_non_migratable);
+        }
+        if (pcmk_is_set(order->type, pe_order_same_node)) {
+            pe__set_order_flags(order_type, pe_order_same_node);
+        }
+
+        // Preserve certain order types for future filtering
+        if ((order->type == pe_order_anti_colocation)
+            || (order->type == pe_order_load)) {
+            order_type = order->type;
+        }
+
+        // List all scheduled probes for the first resource
+        probes = pe__resource_actions(order->lh_rsc, NULL, RSC_STATUS, FALSE);
+        if (probes == NULL) { // There aren't any
+            continue;
+        }
+
+        // List all relevant "then" actions
+        if (order->rh_action != NULL) {
+            then_actions = g_list_prepend(NULL, order->rh_action);
+
+        } else if (order->rh_rsc != NULL) {
+            then_actions = find_actions(order->rh_rsc->actions,
+                                        order->rh_action_task, NULL);
+            if (then_actions == NULL) { // There aren't any
+                g_list_free(probes);
+                continue;
+            }
+        }
+
+        crm_trace("Implying 'probe then' orderings for '%s then %s' "
+                  "(id=%d, type=%.6x)",
+                  order->lh_action? order->lh_action->uuid : order->lh_action_task,
+                  order->rh_action? order->rh_action->uuid : order->rh_action_task,
+                  order->id, order->type);
+
+        for (GList *probe_iter = probes; probe_iter != NULL;
+             probe_iter = probe_iter->next) {
+
+            pe_action_t *probe = (pe_action_t *) probe_iter->data;
+
+            for (GList *then_iter = then_actions; then_iter != NULL;
+                 then_iter = then_iter->next) {
+
+                pe_action_t *then = (pe_action_t *) then_iter->data;
+
+                if (probe_needed_before_action(probe, then)) {
+                    order_actions(probe, then, order_type);
+                }
+            }
+        }
+
+        g_list_free(then_actions);
+        g_list_free(probes);
+    }
+}
+
+/*!
+ * \internal
+ * \brief Order probes before restarts and re-promotes
+ *
+ * If a given ordering is a "probe then start" or "probe then promote" ordering,
+ * add an implicit "probe then stop/demote" ordering in case the action is part
+ * of a restart/re-promote, and do the same recursively for all actions ordered
+ * after the "then" action.
+ *
+ * \param[in]     probe     Probe as 'first' action in an ordering
+ * \param[in,out] after     'then' action in the ordering (gets tracking flag)
+ * \param[in]     data_set  Cluster working set
+ */
+static void
+add_restart_orderings_for_probe(pe_action_t *probe, pe_action_t *after,
+                                pe_working_set_t *data_set)
+{
+    GList *iter = NULL;
+    bool interleave = false;
+    pe_resource_t *compatible_rsc = NULL;
+
+    // Validate that this is a resource probe followed by some action
+    if ((after == NULL) || (probe == NULL) || (probe->rsc == NULL)
+        || (probe->rsc->variant != pe_native)
+        || !pcmk__str_eq(probe->task, RSC_STATUS, pcmk__str_casei)) {
+        return;
+    }
+
+    // Avoid running into any possible loop
+    if (pcmk_is_set(after->flags, pe_action_tracking)) {
+        return;
+    }
+    pe__set_action_flags(after, pe_action_tracking);
+
+    crm_trace("Adding probe restart orderings for '%s@%s then %s@%s'",
+              probe->uuid,
+              ((probe->node == NULL)? "" : probe->node->details->uname),
+              after->uuid,
+              ((after->node == NULL)? "" : after->node->details->uname));
+
+    /* Add restart orderings if "then" is for a different primitive.
+     * Orderings for collective resources will be added later.
+     */
+    if ((after->rsc != NULL) && (after->rsc->variant == pe_native)
+        && (probe->rsc != after->rsc)) {
+
+        GList *then_actions = NULL;
+
+        if (pcmk__str_eq(after->task, RSC_START, pcmk__str_casei)) {
+            then_actions = pe__resource_actions(after->rsc, NULL, RSC_STOP,
+                                                FALSE);
+
+        } else if (pcmk__str_eq(after->task, RSC_PROMOTE, pcmk__str_casei)) {
+            then_actions = pe__resource_actions(after->rsc, NULL,
+                                                RSC_DEMOTE, FALSE);
+        }
+
+        for (iter = then_actions; iter != NULL; iter = iter->next) {
+            pe_action_t *then = (pe_action_t *) iter->data;
+
+            // Skip pseudo-actions (for example, those implied by fencing)
+            if (!pcmk_is_set(then->flags, pe_action_pseudo)) {
+                order_actions(probe, then, pe_order_optional);
+            }
+        }
+        g_list_free(then_actions);
+    }
+
+    /* Detect whether "then" is an interleaved clone action. For these, we want
+     * to add orderings only for the relevant instance.
+     */
+    if ((after->rsc != NULL)
+        && (after->rsc->variant > pe_group)) {
+        const char *interleave_s = g_hash_table_lookup(after->rsc->meta,
+                                                       XML_RSC_ATTR_INTERLEAVE);
+
+        interleave = crm_is_true(interleave_s);
+        if (interleave) {
+            compatible_rsc = find_compatible_child(probe->rsc,
+                                                   after->rsc,
+                                                   RSC_ROLE_UNKNOWN,
+                                                   FALSE, data_set);
+        }
+    }
+
+    /* Now recursively do the same for all actions ordered after "then". This
+     * also handles collective resources since the collective action will be
+     * ordered before its individual instances' actions.
+     */
+    for (iter = after->actions_after; iter != NULL; iter = iter->next) {
+        pe_action_wrapper_t *after_wrapper = (pe_action_wrapper_t *) iter->data;
+
+        /* pe_order_implies_then is the reason why a required A.start
+         * implies/enforces B.start to be required too, which is the cause of
+         * B.restart/re-promote.
+         *
+         * Not sure about pe_order_implies_then_on_node though. It's now only
+         * used for unfencing case, which tends to introduce transition
+         * loops...
+         */
+        if (!pcmk_is_set(after_wrapper->type, pe_order_implies_then)) {
+            /* The order type between a group/clone and its child such as
+             * B.start-> B_child.start is:
+             * pe_order_implies_first_printed | pe_order_runnable_left
+             *
+             * Proceed through the ordering chain and build dependencies with
+             * its children.
+             */
+            if ((after->rsc == NULL)
+                || (after->rsc->variant < pe_group)
+                || (probe->rsc->parent == after->rsc)
+                || (after_wrapper->action->rsc == NULL)
+                || (after_wrapper->action->rsc->variant > pe_group)
+                || (after->rsc != after_wrapper->action->rsc->parent)) {
+                continue;
+            }
+
+            /* Proceed to the children of a group or a non-interleaved clone.
+             * For an interleaved clone, proceed only to the relevant child.
+             */
+            if ((after->rsc->variant > pe_group) && interleave
+                && ((compatible_rsc == NULL)
+                    || (compatible_rsc != after_wrapper->action->rsc))) {
+                continue;
+            }
+        }
+
+        crm_trace("Recursively adding probe restart orderings for "
+                  "'%s@%s then %s@%s' (type=%#.6x)",
+                  after->uuid,
+                  ((after->node == NULL)? "" : after->node->details->uname),
+                  after_wrapper->action->uuid,
+                  ((after_wrapper->action->node == NULL)? "" : after_wrapper->action->node->details->uname),
+                  after_wrapper->type);
+
+        add_restart_orderings_for_probe(probe, after_wrapper->action, data_set);
+    }
+}
+
+/*!
+ * \internal
+ * \brief Clear the tracking flag on all scheduled actions
+ *
+ * \param[in,out] data_set  Cluster working set
+ */
+static void
+clear_actions_tracking_flag(pe_working_set_t *data_set)
+{
+    GList *gIter = NULL;
+
+    for (gIter = data_set->actions; gIter != NULL; gIter = gIter->next) {
+        pe_action_t *action = (pe_action_t *) gIter->data;
+
+        pe__clear_action_flags(action, pe_action_tracking);
+    }
+}
+
+/*!
+ * \internal
+ * \brief Add restart orderings for any scheduled probes for a given resource
+ *
+ * \param[in] rsc       Resource whose probes should be ordered
+ * \param[in] data_set  Cluster working set
+ */
+static void
+add_restart_orderings_for_rsc(pe_resource_t *rsc, pe_working_set_t *data_set)
+{
+    GList *probes = NULL;
+
+    // For collective resources, order each instance recursively
+    if (rsc->variant != pe_native) {
+        for (GList *iter = rsc->children; iter != NULL; iter = iter->next) {
+            add_restart_orderings_for_rsc((pe_resource_t *) iter->data,
+                                          data_set);
+        }
+        return;
+    }
+
+    // Find all probes for given resource
+    probes = pe__resource_actions(rsc, NULL, RSC_STATUS, FALSE);
+
+    // Add probe restart orderings for each probe found
+    for (GList *iter = probes; iter != NULL; iter = iter->next) {
+        pe_action_t *probe = (pe_action_t *) iter->data;
+
+        for (GList *then_iter = probe->actions_after; then_iter != NULL;
+             then_iter = then_iter->next) {
+
+            pe_action_wrapper_t *then = (pe_action_wrapper_t *) then_iter->data;
+
+            add_restart_orderings_for_probe(probe, then->action, data_set);
+            clear_actions_tracking_flag(data_set);
+        }
+    }
+
+    g_list_free(probes);
+}
+
+/*!
+ * \internal
+ * \brief Add "A then probe B" orderings for "A then B" orderings
+ *
+ * \param[in] data_set  Cluster working set
+ *
+ * \note This function is currently disabled (see next comment).
+ */
+static void
+order_then_probes(pe_working_set_t *data_set)
+{
+#if 0
+    /* Given an ordering "A then B", we would prefer to wait for A to be started
+     * before probing B.
+     *
+     * For example, if A is a filesystem which B can't even run without, it
+     * would be helpful if the author of B's agent could assume that A is
+     * running before B.monitor will be called.
+     *
+     * However, we can't _only_ probe after A is running, otherwise we wouldn't
+     * detect the state of B if A could not be started. We can't even do an
+     * opportunistic version of this, because B may be moving:
+     *
+     *   A.stop -> A.start -> B.probe -> B.stop -> B.start
+     *
+     * and if we add B.stop -> A.stop here, we get a loop:
+     *
+     *   A.stop -> A.start -> B.probe -> B.stop -> A.stop
+     *
+     * We could kill the "B.probe -> B.stop" dependency, but that could mean
+     * stopping B "too" soon, because B.start must wait for the probe, and
+     * we don't want to stop B if we can't start it.
+     *
+     * We could add the ordering only if A is an anonymous clone with
+     * clone-max == node-max (since we'll never be moving it). However, we could
+     * still be stopping one instance at the same time as starting another.
+     *
+     * The complexity of checking for allowed conditions combined with the ever
+     * narrowing use case suggests that this code should remain disabled until
+     * someone gets smarter.
+     */
+    for (GList *iter = data_set->resources; iter != NULL; iter = iter->next) {
+        pe_resource_t *rsc = (pe_resource_t *) iter->data;
+
+        pe_action_t *start = NULL;
+        GList *actions = NULL;
+        GList *probes = NULL;
+
+        actions = pe__resource_actions(rsc, NULL, RSC_START, FALSE);
+
+        if (actions) {
+            start = actions->data;
+            g_list_free(actions);
+        }
+
+        if (start == NULL) {
+            crm_err("No start action for %s", rsc->id);
+            continue;
+        }
+
+        probes = pe__resource_actions(rsc, NULL, RSC_STATUS, FALSE);
+
+        for (actions = start->actions_before; actions != NULL;
+             actions = actions->next) {
+
+            pe_action_wrapper_t *before = (pe_action_wrapper_t *) actions->data;
+
+            pe_action_t *first = before->action;
+            pe_resource_t *first_rsc = first->rsc;
+
+            if (first->required_runnable_before) {
+                for (GList *clone_actions = first->actions_before;
+                     clone_actions != NULL;
+                     clone_actions = clone_actions->next) {
+
+                    before = (pe_action_wrapper_t *) clone_actions->data;
+
+                    crm_trace("Testing '%s then %s' for %s",
+                              first->uuid, before->action->uuid, start->uuid);
+
+                    CRM_ASSERT(before->action->rsc != NULL);
+                    first_rsc = before->action->rsc;
+                    break;
+                }
+
+            } else if (!pcmk__str_eq(first->task, RSC_START, pcmk__str_none)) {
+                crm_trace("Not a start op %s for %s", first->uuid, start->uuid);
+            }
+
+            if (first_rsc == NULL) {
+                continue;
+
+            } else if (uber_parent(first_rsc) == uber_parent(start->rsc)) {
+                crm_trace("Same parent %s for %s", first_rsc->id, start->uuid);
+                continue;
+
+            } else if (!pe_rsc_is_clone(uber_parent(first_rsc))) {
+                crm_trace("Not a clone %s for %s", first_rsc->id, start->uuid);
+                continue;
+            }
+
+            crm_err("Applying %s before %s %d", first->uuid, start->uuid,
+                    uber_parent(first_rsc)->variant);
+
+            for (GList *probe_iter = probes; probe_iter != NULL;
+                 probe_iter = probe_iter->next) {
+
+                pe_action_t *probe = (pe_action_t *) probe_iter->data;
+
+                crm_err("Ordering %s before %s", first->uuid, probe->uuid);
+                order_actions(first, probe, pe_order_optional);
+            }
+        }
+        g_list_free(probes);
+    }
+#endif
+}
+
+/*!
+ * \internal
+ * \brief Add implicit orderings involving probes
+ *
+ * \param[in] data_set  Cluster working set
+ */
+void
+pcmk__order_probes(pe_working_set_t *data_set)
+{
+    // Add orderings for "probe then X"
+    for (GList *iter = data_set->resources; iter != NULL; iter = iter->next) {
+        add_restart_orderings_for_rsc((pe_resource_t *) iter->data, data_set);
+    }
+    add_probe_orderings_for_stops(data_set);
+
+    order_then_probes(data_set);
+}
+
+/*!
+ * \internal
+ * \brief Schedule any probes needed
+ *
+ * \param[in] data_set  Cluster working set
+ *
+ * \note This may also schedule fencing of failed remote nodes.
+ */
+void
+pcmk__schedule_probes(pe_working_set_t *data_set)
+{
+    // Schedule probes on each node in the cluster as needed
+    for (GList *iter = data_set->nodes; iter != NULL; iter = iter->next) {
+        pe_node_t *node = (pe_node_t *) iter->data;
+        const char *probed = NULL;
+
+        if (!node->details->online) { // Don't probe offline nodes
+            if (pcmk__is_failed_remote_node(node)) {
+                pe_fence_node(data_set, node,
+                              "the connection is unrecoverable", FALSE);
+            }
+            continue;
+
+        } else if (node->details->unclean) { // ... or nodes that need fencing
+            continue;
+
+        } else if (!node->details->rsc_discovery_enabled) {
+            // The user requested that probes not be done on this node
+            continue;
+        }
+
+        /* This is no longer needed for live clusters, since the probe_complete
+         * node attribute will never be in the CIB. However this is still useful
+         * for processing old saved CIBs (< 1.1.14), including the
+         * reprobe-target_rc regression test.
+         */
+        probed = pe_node_attribute_raw(node, CRM_OP_PROBED);
+        if ((probed != NULL) && !crm_is_true(probed)) {
+            pe_action_t *probe_op = NULL;
+
+            probe_op = custom_action(NULL,
+                                     crm_strdup_printf("%s-%s", CRM_OP_REPROBE,
+                                                       node->details->uname),
+                                     CRM_OP_REPROBE, node, FALSE, TRUE,
+                                     data_set);
+            add_hash_param(probe_op->meta, XML_ATTR_TE_NOWAIT,
+                           XML_BOOLEAN_TRUE);
+            continue;
+        }
+
+        // Probe each resource in the cluster on this node, as needed
+        for (GList *rsc_iter = data_set->resources; rsc_iter != NULL;
+             rsc_iter = rsc_iter->next) {
+            pe_resource_t *rsc = (pe_resource_t *) rsc_iter->data;
+
+            rsc->cmds->create_probe(rsc, node, NULL, FALSE, data_set);
+        }
+    }
+}