diff --git a/include/crm/msg_xml.h b/include/crm/msg_xml.h index d8a95d6b4b..cbcae78dac 100644 --- a/include/crm/msg_xml.h +++ b/include/crm/msg_xml.h @@ -1,459 +1,460 @@ /* * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef PCMK__CRM_MSG_XML__H # define PCMK__CRM_MSG_XML__H # include #if !defined(PCMK_ALLOW_DEPRECATED) || (PCMK_ALLOW_DEPRECATED == 1) #include #endif #ifdef __cplusplus extern "C" { #endif /* This file defines constants for various XML syntax (mainly element and * attribute names). * * For consistency, new constants should start with "PCMK_", followed by "XE" * for XML element names, "XA" for XML attribute names, and "META" for meta * attribute names. Old names that don't follow this policy should eventually be * deprecated and replaced with names that do. */ /* * XML elements */ #define PCMK_XE_DATE_EXPRESSION "date_expression" #define PCMK_XE_OP_EXPRESSION "op_expression" /* This has been deprecated as a CIB element (an alias for with * "promotable" set to "true") since 2.0.0. */ #define PCMK_XE_PROMOTABLE_LEGACY "master" #define PCMK_XE_RSC_EXPRESSION "rsc_expression" /* * XML attributes */ #define PCMK_XA_ADMIN_EPOCH "admin_epoch" #define PCMK_XA_CIB_LAST_WRITTEN "cib-last-written" #define PCMK_XA_CRM_DEBUG_ORIGIN "crm-debug-origin" #define PCMK_XA_CRM_FEATURE_SET "crm_feature_set" #define PCMK_XA_CRM_TIMESTAMP "crm-timestamp" #define PCMK_XA_DESCRIPTION "description" #define PCMK_XA_EPOCH "epoch" #define PCMK_XA_FORMAT "format" #define PCMK_XA_HAVE_QUORUM "have-quorum" #define PCMK_XA_ID "id" #define PCMK_XA_NO_QUORUM_PANIC "no-quorum-panic" #define PCMK_XA_NUM_UPDATES "num_updates" #define PCMK_XA_VALIDATE_WITH "validate-with" #define PCMK_XA_VERSION "version" /* * Meta attributes */ #define PCMK_META_CLONE_MAX "clone-max" #define PCMK_META_CLONE_MIN "clone-min" #define PCMK_META_CLONE_NODE_MAX "clone-node-max" #define PCMK_META_CONTAINER_ATTR_TARGET "container-attribute-target" #define PCMK_META_ENABLED "enabled" #define PCMK_META_FAILURE_TIMEOUT "failure-timeout" +#define PCMK_META_INTERLEAVE "interleave" #define PCMK_META_MIGRATION_THRESHOLD "migration-threshold" #define PCMK_META_ORDERED "ordered" #define PCMK_META_PROMOTED_MAX "promoted-max" #define PCMK_META_PROMOTED_NODE_MAX "promoted-node-max" /* * Older constants that don't follow current naming */ # ifndef F_ORIG # define F_ORIG "src" # endif # ifndef F_SEQ # define F_SEQ "seq" # endif # ifndef F_SUBTYPE # define F_SUBTYPE "subt" # endif # ifndef F_TYPE # define F_TYPE "t" # endif # ifndef F_CLIENTNAME # define F_CLIENTNAME "cn" # endif # ifndef F_XML_TAGNAME # define F_XML_TAGNAME "__name__" # endif # ifndef T_CRM # define T_CRM "crmd" # endif # ifndef T_ATTRD # define T_ATTRD "attrd" # endif # define CIB_OPTIONS_FIRST "cib-bootstrap-options" # define F_CRM_DATA "crm_xml" # define F_CRM_TASK "crm_task" # define F_CRM_HOST_TO "crm_host_to" # define F_CRM_MSG_TYPE F_SUBTYPE # define F_CRM_SYS_TO "crm_sys_to" # define F_CRM_SYS_FROM "crm_sys_from" # define F_CRM_HOST_FROM F_ORIG # define F_CRM_REFERENCE XML_ATTR_REFERENCE # define F_CRM_VERSION PCMK_XA_VERSION # define F_CRM_ORIGIN "origin" # define F_CRM_USER "crm_user" # define F_CRM_JOIN_ID "join_id" # define F_CRM_DC_LEAVING "dc-leaving" # define F_CRM_ELECTION_ID "election-id" # define F_CRM_ELECTION_AGE_S "election-age-sec" # define 
F_CRM_ELECTION_AGE_US "election-age-nano-sec" # define F_CRM_ELECTION_OWNER "election-owner" # define F_CRM_TGRAPH "crm-tgraph-file" # define F_CRM_TGRAPH_INPUT "crm-tgraph-in" # define F_CRM_THROTTLE_MODE "crm-limit-mode" # define F_CRM_THROTTLE_MAX "crm-limit-max" /*---- Common tags/attrs */ # define XML_DIFF_MARKER "__crm_diff_marker__" # define XML_TAG_CIB "cib" # define XML_TAG_FAILED "failed" # define XML_ATTR_TIMEOUT "timeout" # define XML_ATTR_NAME "name" # define XML_ATTR_IDREF "id-ref" # define XML_ATTR_ID_LONG "long-id" # define XML_ATTR_TYPE "type" # define XML_ATTR_OP "op" # define XML_ATTR_DC_UUID "dc-uuid" # define XML_ATTR_UPDATE_ORIG "update-origin" # define XML_ATTR_UPDATE_CLIENT "update-client" # define XML_ATTR_UPDATE_USER "update-user" # define XML_BOOLEAN_TRUE "true" # define XML_BOOLEAN_FALSE "false" # define XML_BOOLEAN_YES XML_BOOLEAN_TRUE # define XML_BOOLEAN_NO XML_BOOLEAN_FALSE # define XML_TAG_OPTIONS "options" /*---- top level tags/attrs */ # define XML_ATTR_REQUEST "request" # define XML_ATTR_RESPONSE "response" # define XML_ATTR_UNAME "uname" # define XML_ATTR_REFERENCE "reference" # define XML_CRM_TAG_PING "ping_response" # define XML_PING_ATTR_STATUS "result" # define XML_PING_ATTR_SYSFROM "crm_subsystem" # define XML_PING_ATTR_CRMDSTATE "crmd_state" # define XML_PING_ATTR_PACEMAKERDSTATE "pacemakerd_state" # define XML_PING_ATTR_PACEMAKERDSTATE_INIT "init" # define XML_PING_ATTR_PACEMAKERDSTATE_STARTINGDAEMONS "starting_daemons" # define XML_PING_ATTR_PACEMAKERDSTATE_WAITPING "wait_for_ping" # define XML_PING_ATTR_PACEMAKERDSTATE_RUNNING "running" # define XML_PING_ATTR_PACEMAKERDSTATE_SHUTTINGDOWN "shutting_down" # define XML_PING_ATTR_PACEMAKERDSTATE_SHUTDOWNCOMPLETE "shutdown_complete" # define XML_PING_ATTR_PACEMAKERDSTATE_REMOTE "remote" # define XML_FAIL_TAG_CIB "failed_update" # define XML_FAILCIB_ATTR_OBJTYPE "object_type" # define XML_FAILCIB_ATTR_OP "operation" # define XML_FAILCIB_ATTR_REASON "reason" /*---- CIB specific tags/attrs */ # define XML_CIB_TAG_SECTION_ALL "all" # define XML_CIB_TAG_CONFIGURATION "configuration" # define XML_CIB_TAG_STATUS "status" # define XML_CIB_TAG_RESOURCES "resources" # define XML_CIB_TAG_NODES "nodes" # define XML_CIB_TAG_CONSTRAINTS "constraints" # define XML_CIB_TAG_CRMCONFIG "crm_config" # define XML_CIB_TAG_OPCONFIG "op_defaults" # define XML_CIB_TAG_RSCCONFIG "rsc_defaults" # define XML_CIB_TAG_ACLS "acls" # define XML_CIB_TAG_ALERTS "alerts" # define XML_CIB_TAG_ALERT "alert" # define XML_CIB_TAG_ALERT_RECIPIENT "recipient" # define XML_CIB_TAG_ALERT_SELECT "select" # define XML_CIB_TAG_ALERT_ATTRIBUTES "select_attributes" # define XML_CIB_TAG_ALERT_FENCING "select_fencing" # define XML_CIB_TAG_ALERT_NODES "select_nodes" # define XML_CIB_TAG_ALERT_RESOURCES "select_resources" # define XML_CIB_TAG_ALERT_ATTR "attribute" # define XML_CIB_TAG_STATE "node_state" # define XML_CIB_TAG_NODE "node" # define XML_CIB_TAG_NVPAIR "nvpair" # define XML_CIB_TAG_PROPSET "cluster_property_set" # define XML_TAG_ATTR_SETS "instance_attributes" # define XML_TAG_META_SETS "meta_attributes" # define XML_TAG_ATTRS "attributes" # define XML_TAG_PARAMS "parameters" # define XML_TAG_PARAM "param" # define XML_TAG_UTILIZATION "utilization" # define XML_TAG_RESOURCE_REF "resource_ref" # define XML_CIB_TAG_RESOURCE "primitive" # define XML_CIB_TAG_GROUP "group" # define XML_CIB_TAG_INCARNATION "clone" # define XML_CIB_TAG_CONTAINER "bundle" # define XML_CIB_TAG_RSC_TEMPLATE "template" # define XML_RSC_ATTR_INTERLEAVE "interleave" # 
define XML_RSC_ATTR_INCARNATION "clone" # define XML_RSC_ATTR_PROMOTABLE "promotable" # define XML_RSC_ATTR_MANAGED "is-managed" # define XML_RSC_ATTR_TARGET_ROLE "target-role" # define XML_RSC_ATTR_UNIQUE "globally-unique" # define XML_RSC_ATTR_NOTIFY "notify" # define XML_RSC_ATTR_STICKINESS "resource-stickiness" # define XML_RSC_ATTR_MULTIPLE "multiple-active" # define XML_RSC_ATTR_REQUIRES "requires" # define XML_RSC_ATTR_CONTAINER "container" # define XML_RSC_ATTR_INTERNAL_RSC "internal_rsc" # define XML_RSC_ATTR_MAINTENANCE "maintenance" # define XML_RSC_ATTR_REMOTE_NODE "remote-node" # define XML_RSC_ATTR_CLEAR_OP "clear_failure_op" # define XML_RSC_ATTR_CLEAR_INTERVAL "clear_failure_interval" # define XML_RSC_ATTR_REMOTE_RA_ADDR "addr" # define XML_RSC_ATTR_REMOTE_RA_SERVER "server" # define XML_RSC_ATTR_REMOTE_RA_PORT "port" # define XML_RSC_ATTR_CRITICAL "critical" # define XML_REMOTE_ATTR_RECONNECT_INTERVAL "reconnect_interval" # define XML_OP_ATTR_ON_FAIL "on-fail" # define XML_OP_ATTR_START_DELAY "start-delay" # define XML_OP_ATTR_ALLOW_MIGRATE "allow-migrate" # define XML_OP_ATTR_ORIGIN "interval-origin" # define XML_OP_ATTR_PENDING "record-pending" # define XML_OP_ATTR_DIGESTS_ALL "digests-all" # define XML_OP_ATTR_DIGESTS_SECURE "digests-secure" # define XML_CIB_TAG_LRM "lrm" # define XML_LRM_TAG_RESOURCES "lrm_resources" # define XML_LRM_TAG_RESOURCE "lrm_resource" # define XML_LRM_TAG_RSC_OP "lrm_rsc_op" # define XML_AGENT_ATTR_CLASS "class" # define XML_AGENT_ATTR_PROVIDER "provider" //! \deprecated Do not use (will be removed in a future release) # define XML_CIB_ATTR_REPLACE "replace" # define XML_CIB_ATTR_PRIORITY "priority" # define XML_NODE_IS_REMOTE "remote_node" # define XML_NODE_IS_FENCED "node_fenced" # define XML_NODE_IS_MAINTENANCE "node_in_maintenance" # define XML_CIB_ATTR_SHUTDOWN "shutdown" /* Aside from being an old name for the executor, LRM is a misnomer here because * the controller and scheduler use these to track actions, which are not always * executor operations. */ // XML attribute that takes interval specification (user-facing configuration) # define XML_LRM_ATTR_INTERVAL "interval" // XML attribute that takes interval in milliseconds (daemon APIs) // (identical value as above, but different constant allows clearer code intent) # define XML_LRM_ATTR_INTERVAL_MS XML_LRM_ATTR_INTERVAL # define XML_LRM_ATTR_TASK "operation" # define XML_LRM_ATTR_TASK_KEY "operation_key" # define XML_LRM_ATTR_TARGET "on_node" # define XML_LRM_ATTR_TARGET_UUID "on_node_uuid" /*! Actions to be executed on Pacemaker Remote nodes are routed through the * controller on the cluster node hosting the remote connection. That cluster * node is considered the router node for the action. 
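/* Illustrative sketch (not part of this patch): reading an action's router
 * node from transition-graph XML, per the routing note above. The helper name
 * and "action_xml" are hypothetical; the fallback to the action's target node
 * is an assumption about typical use, not taken from this diff.
 */
static const char *
example_router_node(const xmlNode *action_xml)
{
    const char *router = crm_element_value(action_xml, XML_LRM_ATTR_ROUTER_NODE);

    if (router == NULL) {
        // No routing needed: the action executes directly on its target node
        router = crm_element_value(action_xml, XML_LRM_ATTR_TARGET);
    }
    return router;
}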
*/ # define XML_LRM_ATTR_ROUTER_NODE "router_node" # define XML_LRM_ATTR_RSCID "rsc-id" # define XML_LRM_ATTR_OPSTATUS "op-status" # define XML_LRM_ATTR_RC "rc-code" # define XML_LRM_ATTR_CALLID "call-id" # define XML_LRM_ATTR_OP_DIGEST "op-digest" # define XML_LRM_ATTR_OP_RESTART "op-force-restart" # define XML_LRM_ATTR_OP_SECURE "op-secure-params" # define XML_LRM_ATTR_RESTART_DIGEST "op-restart-digest" # define XML_LRM_ATTR_SECURE_DIGEST "op-secure-digest" # define XML_LRM_ATTR_EXIT_REASON "exit-reason" # define XML_RSC_OP_LAST_CHANGE "last-rc-change" # define XML_RSC_OP_T_EXEC "exec-time" # define XML_RSC_OP_T_QUEUE "queue-time" # define XML_LRM_ATTR_MIGRATE_SOURCE "migrate_source" # define XML_LRM_ATTR_MIGRATE_TARGET "migrate_target" # define XML_TAG_GRAPH "transition_graph" # define XML_GRAPH_TAG_RSC_OP "rsc_op" # define XML_GRAPH_TAG_PSEUDO_EVENT "pseudo_event" # define XML_GRAPH_TAG_CRM_EVENT "crm_event" # define XML_GRAPH_TAG_DOWNED "downed" # define XML_GRAPH_TAG_MAINTENANCE "maintenance" # define XML_TAG_RULE "rule" # define XML_RULE_ATTR_SCORE "score" # define XML_RULE_ATTR_SCORE_ATTRIBUTE "score-attribute" # define XML_RULE_ATTR_ROLE "role" # define XML_RULE_ATTR_BOOLEAN_OP "boolean-op" # define XML_TAG_EXPRESSION "expression" # define XML_EXPR_ATTR_ATTRIBUTE "attribute" # define XML_EXPR_ATTR_OPERATION "operation" # define XML_EXPR_ATTR_VALUE "value" # define XML_EXPR_ATTR_TYPE "type" # define XML_EXPR_ATTR_VALUE_SOURCE "value-source" # define XML_CONS_TAG_RSC_DEPEND "rsc_colocation" # define XML_CONS_TAG_RSC_ORDER "rsc_order" # define XML_CONS_TAG_RSC_LOCATION "rsc_location" # define XML_CONS_TAG_RSC_TICKET "rsc_ticket" # define XML_CONS_TAG_RSC_SET "resource_set" # define XML_CONS_ATTR_SYMMETRICAL "symmetrical" # define XML_LOCATION_ATTR_DISCOVERY "resource-discovery" # define XML_COLOC_ATTR_SOURCE "rsc" # define XML_COLOC_ATTR_SOURCE_ROLE "rsc-role" # define XML_COLOC_ATTR_TARGET "with-rsc" # define XML_COLOC_ATTR_TARGET_ROLE "with-rsc-role" # define XML_COLOC_ATTR_NODE_ATTR "node-attribute" # define XML_COLOC_ATTR_INFLUENCE "influence" //! \deprecated Deprecated since 2.1.5 # define XML_COLOC_ATTR_SOURCE_INSTANCE "rsc-instance" //! \deprecated Deprecated since 2.1.5 # define XML_COLOC_ATTR_TARGET_INSTANCE "with-rsc-instance" # define XML_LOC_ATTR_SOURCE "rsc" # define XML_LOC_ATTR_SOURCE_PATTERN "rsc-pattern" # define XML_ORDER_ATTR_FIRST "first" # define XML_ORDER_ATTR_THEN "then" # define XML_ORDER_ATTR_FIRST_ACTION "first-action" # define XML_ORDER_ATTR_THEN_ACTION "then-action" # define XML_ORDER_ATTR_KIND "kind" //! \deprecated Deprecated since 2.1.5 # define XML_ORDER_ATTR_FIRST_INSTANCE "first-instance" //! \deprecated Deprecated since 2.1.5 # define XML_ORDER_ATTR_THEN_INSTANCE "then-instance" # define XML_TICKET_ATTR_TICKET "ticket" # define XML_TICKET_ATTR_LOSS_POLICY "loss-policy" # define XML_NVPAIR_ATTR_NAME "name" # define XML_NVPAIR_ATTR_VALUE "value" # define XML_NODE_ATTR_RSC_DISCOVERY "resource-discovery-enabled" # define XML_ALERT_ATTR_PATH "path" # define XML_ALERT_ATTR_TIMEOUT "timeout" # define XML_ALERT_ATTR_TSTAMP_FORMAT "timestamp-format" # define XML_ALERT_ATTR_REC_VALUE "value" # define XML_CIB_TAG_GENERATION_TUPPLE "generation_tuple" # define XML_ATTR_TRANSITION_MAGIC "transition-magic" # define XML_ATTR_TRANSITION_KEY "transition-key" # define XML_ATTR_TE_NOWAIT "op_no_wait" # define XML_ATTR_TE_TARGET_RC "op_target_rc" # define XML_TAG_TRANSIENT_NODEATTRS "transient_attributes" //! 
\deprecated Do not use (will be removed in a future release) # define XML_TAG_DIFF_ADDED "diff-added" //! \deprecated Do not use (will be removed in a future release) # define XML_TAG_DIFF_REMOVED "diff-removed" # define XML_ACL_TAG_USER "acl_target" # define XML_ACL_TAG_USERv1 "acl_user" # define XML_ACL_TAG_GROUP "acl_group" # define XML_ACL_TAG_ROLE "acl_role" # define XML_ACL_TAG_PERMISSION "acl_permission" # define XML_ACL_TAG_ROLE_REF "role" # define XML_ACL_TAG_ROLE_REFv1 "role_ref" # define XML_ACL_ATTR_KIND "kind" # define XML_ACL_TAG_READ "read" # define XML_ACL_TAG_WRITE "write" # define XML_ACL_TAG_DENY "deny" # define XML_ACL_ATTR_REF "reference" # define XML_ACL_ATTR_REFv1 "ref" # define XML_ACL_ATTR_TAG "object-type" # define XML_ACL_ATTR_TAGv1 "tag" # define XML_ACL_ATTR_XPATH "xpath" # define XML_ACL_ATTR_ATTRIBUTE "attribute" # define XML_CIB_TAG_TICKETS "tickets" # define XML_CIB_TAG_TICKET_STATE "ticket_state" # define XML_CIB_TAG_TAGS "tags" # define XML_CIB_TAG_TAG "tag" # define XML_CIB_TAG_OBJ_REF "obj_ref" # define XML_TAG_FENCING_TOPOLOGY "fencing-topology" # define XML_TAG_FENCING_LEVEL "fencing-level" # define XML_ATTR_STONITH_INDEX "index" # define XML_ATTR_STONITH_TARGET "target" # define XML_ATTR_STONITH_TARGET_VALUE "target-value" # define XML_ATTR_STONITH_TARGET_PATTERN "target-pattern" # define XML_ATTR_STONITH_TARGET_ATTRIBUTE "target-attribute" # define XML_ATTR_STONITH_DEVICES "devices" # define XML_TAG_DIFF "diff" # define XML_DIFF_VERSION "version" # define XML_DIFF_VSOURCE "source" # define XML_DIFF_VTARGET "target" # define XML_DIFF_CHANGE "change" # define XML_DIFF_LIST "change-list" # define XML_DIFF_ATTR "change-attr" # define XML_DIFF_RESULT "change-result" # define XML_DIFF_OP "operation" # define XML_DIFF_PATH "path" # define XML_DIFF_POSITION "position" # define ID(x) crm_element_value(x, PCMK_XA_ID) #ifdef __cplusplus } #endif #endif diff --git a/lib/pacemaker/pcmk_sched_clone.c b/lib/pacemaker/pcmk_sched_clone.c index 8f26d8e871..39036de5fd 100644 --- a/lib/pacemaker/pcmk_sched_clone.c +++ b/lib/pacemaker/pcmk_sched_clone.c @@ -1,709 +1,709 @@ /* - * Copyright 2004-2023 the Pacemaker project contributors + * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include "libpacemaker_private.h" /*! * \internal * \brief Assign a clone resource's instances to nodes * * \param[in,out] rsc Clone resource to assign * \param[in] prefer Node to prefer, if all else is equal * \param[in] stop_if_fail If \c true and a primitive descendant of \p rsc * can't be assigned to a node, set the * descendant's next role to stopped and update * existing actions * * \return NULL (clones are not assigned to a single node) * * \note If \p stop_if_fail is \c false, then \c pcmk__unassign_resource() can * completely undo the assignment. A successful assignment can be either * undone or left alone as final. A failed assignment has the same effect * as calling pcmk__unassign_resource(); there are no side effects on * roles or actions. 
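/* Illustrative sketch (not part of this patch) of the reversible-assignment
 * contract described in the note above: with stop_if_fail=false, a tentative
 * assignment can be fully undone with pcmk__unassign_resource(). The helper
 * and its arguments are hypothetical.
 */
static bool
example_tentative_assign(pcmk_resource_t *instance, const pcmk_node_t *prefer)
{
    // stop_if_fail=false keeps the assignment fully reversible
    pcmk_node_t *chosen = instance->cmds->assign(instance, prefer, false);

    if (!pe__same_node(chosen, prefer)) {
        pcmk__unassign_resource(instance);  // roll the tentative assignment back
        return false;
    }
    return true;
}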
*/ pcmk_node_t * pcmk__clone_assign(pcmk_resource_t *rsc, const pcmk_node_t *prefer, bool stop_if_fail) { GList *colocations = NULL; CRM_ASSERT(pe_rsc_is_clone(rsc)); if (!pcmk_is_set(rsc->flags, pcmk_rsc_unassigned)) { return NULL; // Assignment has already been done } // Detect assignment loops if (pcmk_is_set(rsc->flags, pcmk_rsc_assigning)) { pcmk__rsc_debug(rsc, "Breaking assignment loop involving %s", rsc->id); return NULL; } pe__set_resource_flags(rsc, pcmk_rsc_assigning); // If this clone is promotable, consider nodes' promotion scores if (pcmk_is_set(rsc->flags, pcmk_rsc_promotable)) { pcmk__add_promotion_scores(rsc); } // If this clone is colocated with any other resources, assign those first colocations = pcmk__this_with_colocations(rsc); for (GList *iter = colocations; iter != NULL; iter = iter->next) { pcmk__colocation_t *constraint = (pcmk__colocation_t *) iter->data; pcmk__rsc_trace(rsc, "%s: Assigning colocation %s primary %s first", rsc->id, constraint->id, constraint->primary->id); constraint->primary->cmds->assign(constraint->primary, prefer, stop_if_fail); } g_list_free(colocations); // If any resources are colocated with this one, consider their preferences colocations = pcmk__with_this_colocations(rsc); g_list_foreach(colocations, pcmk__add_dependent_scores, rsc); g_list_free(colocations); pe__show_node_scores(!pcmk_is_set(rsc->cluster->flags, pcmk_sched_output_scores), rsc, __func__, rsc->allowed_nodes, rsc->cluster); rsc->children = g_list_sort(rsc->children, pcmk__cmp_instance); pcmk__assign_instances(rsc, rsc->children, pe__clone_max(rsc), pe__clone_node_max(rsc)); if (pcmk_is_set(rsc->flags, pcmk_rsc_promotable)) { pcmk__set_instance_roles(rsc); } pe__clear_resource_flags(rsc, pcmk_rsc_unassigned|pcmk_rsc_assigning); pcmk__rsc_trace(rsc, "Assigned clone %s", rsc->id); return NULL; } /*! * \internal * \brief Create all actions needed for a given clone resource * * \param[in,out] rsc Clone resource to create actions for */ void pcmk__clone_create_actions(pcmk_resource_t *rsc) { CRM_ASSERT(pe_rsc_is_clone(rsc)); pcmk__rsc_trace(rsc, "Creating actions for clone %s", rsc->id); pcmk__create_instance_actions(rsc, rsc->children); if (pcmk_is_set(rsc->flags, pcmk_rsc_promotable)) { pcmk__create_promotable_actions(rsc); } } /*! * \internal * \brief Create implicit constraints needed for a clone resource * * \param[in,out] rsc Clone resource to create implicit constraints for */ void pcmk__clone_internal_constraints(pcmk_resource_t *rsc) { bool ordered = false; CRM_ASSERT(pe_rsc_is_clone(rsc)); pcmk__rsc_trace(rsc, "Creating internal constraints for clone %s", rsc->id); // Restart ordering: Stop -> stopped -> start -> started pcmk__order_resource_actions(rsc, PCMK_ACTION_STOPPED, rsc, PCMK_ACTION_START, pcmk__ar_ordered); pcmk__order_resource_actions(rsc, PCMK_ACTION_START, rsc, PCMK_ACTION_RUNNING, pcmk__ar_unrunnable_first_blocks); pcmk__order_resource_actions(rsc, PCMK_ACTION_STOP, rsc, PCMK_ACTION_STOPPED, pcmk__ar_unrunnable_first_blocks); // Demoted -> stop and started -> promote if (pcmk_is_set(rsc->flags, pcmk_rsc_promotable)) { pcmk__order_resource_actions(rsc, PCMK_ACTION_DEMOTED, rsc, PCMK_ACTION_STOP, pcmk__ar_ordered); pcmk__order_resource_actions(rsc, PCMK_ACTION_RUNNING, rsc, PCMK_ACTION_PROMOTE, pcmk__ar_unrunnable_first_blocks); } ordered = pe__clone_is_ordered(rsc); if (ordered) { /* Ordered clone instances must start and stop by instance number. The * instances might have been previously shuffled for assignment or * promotion purposes, so re-sort them. 
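/* Illustrative note (not part of this patch): for an ordered clone with three
 * instances, the pairwise orderings created in the loop below collapse to
 *
 *   rsc:0 start -> rsc:1 start -> rsc:2 start
 *   rsc:2 stop  -> rsc:1 stop  -> rsc:0 stop
 *
 * that is, starts proceed by ascending instance number and stops in reverse.
 * Ordered behavior is enabled with the clone's PCMK_META_ORDERED ("ordered")
 * meta-attribute, as reported here by pe__clone_is_ordered().
 */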
*/ rsc->children = g_list_sort(rsc->children, pcmk__cmp_instance_number); } for (GList *iter = rsc->children; iter != NULL; iter = iter->next) { pcmk_resource_t *instance = (pcmk_resource_t *) iter->data; instance->cmds->internal_constraints(instance); // Start clone -> start instance -> clone started pcmk__order_starts(rsc, instance, pcmk__ar_unrunnable_first_blocks |pcmk__ar_then_implies_first_graphed); pcmk__order_resource_actions(instance, PCMK_ACTION_START, rsc, PCMK_ACTION_RUNNING, pcmk__ar_first_implies_then_graphed); // Stop clone -> stop instance -> clone stopped pcmk__order_stops(rsc, instance, pcmk__ar_then_implies_first_graphed); pcmk__order_resource_actions(instance, PCMK_ACTION_STOP, rsc, PCMK_ACTION_STOPPED, pcmk__ar_first_implies_then_graphed); /* Instances of ordered clones must be started and stopped by instance * number. Since only some instances may be starting or stopping, order * each instance relative to every later instance. */ if (ordered) { for (GList *later = iter->next; later != NULL; later = later->next) { pcmk__order_starts(instance, (pcmk_resource_t *) later->data, pcmk__ar_ordered); pcmk__order_stops((pcmk_resource_t *) later->data, instance, pcmk__ar_ordered); } } } if (pcmk_is_set(rsc->flags, pcmk_rsc_promotable)) { pcmk__order_promotable_instances(rsc); } } /*! * \internal * \brief Check whether colocated resources can be interleaved * * \param[in] colocation Colocation constraint with clone as primary * * \return true if colocated resources can be interleaved, otherwise false */ static bool can_interleave(const pcmk__colocation_t *colocation) { const pcmk_resource_t *dependent = colocation->dependent; // Only colocations between clone or bundle resources use interleaving if (dependent->variant <= pcmk_rsc_variant_group) { return false; } // Only the dependent needs to be marked for interleaving if (!crm_is_true(g_hash_table_lookup(dependent->meta, - XML_RSC_ATTR_INTERLEAVE))) { + PCMK_META_INTERLEAVE))) { return false; } /* @TODO Do we actually care about multiple primary instances sharing a * dependent instance? */ if (dependent->fns->max_per_node(dependent) != colocation->primary->fns->max_per_node(colocation->primary)) { pcmk__config_err("Cannot interleave %s and %s because they do not " "support the same number of instances per node", dependent->id, colocation->primary->id); return false; } return true; } /*! * \internal * \brief Apply a colocation's score to node scores or resource priority * * Given a colocation constraint, apply its score to the dependent's * allowed node scores (if we are still placing resources) or priority (if * we are choosing promotable clone instance roles). * * \param[in,out] dependent Dependent resource in colocation * \param[in] primary Primary resource in colocation * \param[in] colocation Colocation constraint to apply * \param[in] for_dependent true if called on behalf of dependent */ void pcmk__clone_apply_coloc_score(pcmk_resource_t *dependent, const pcmk_resource_t *primary, const pcmk__colocation_t *colocation, bool for_dependent) { const GList *iter = NULL; /* This should never be called for the clone itself as a dependent. Instead, * we add its colocation constraints to its instances and call the * apply_coloc_score() method for the instances as dependents. 
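/* Illustrative note (not part of this patch): interleaving is requested on
 * the *dependent* clone via its "interleave" meta-attribute, which
 * can_interleave() now looks up with PCMK_META_INTERLEAVE. A hypothetical
 * CIB fragment:
 *
 *   <clone id="dependent-clone">
 *     <meta_attributes id="dependent-clone-meta">
 *       <nvpair id="dependent-clone-interleave"
 *               name="interleave" value="true"/>
 *     </meta_attributes>
 *     <primitive id="dependent-rsc" ... />
 *   </clone>
 *
 * With this set, each dependent instance is colocated with a compatible
 * instance of the primary clone rather than with the primary as a whole.
 */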
*/ CRM_ASSERT(!for_dependent); CRM_ASSERT((colocation != NULL) && pe_rsc_is_clone(primary) && (dependent != NULL) && (dependent->variant == pcmk_rsc_variant_primitive)); if (pcmk_is_set(primary->flags, pcmk_rsc_unassigned)) { pcmk__rsc_trace(primary, "Delaying processing colocation %s " "because cloned primary %s is still provisional", colocation->id, primary->id); return; } pcmk__rsc_trace(primary, "Processing colocation %s (%s with clone %s @%s)", colocation->id, dependent->id, primary->id, pcmk_readable_score(colocation->score)); // Apply role-specific colocations if (pcmk_is_set(primary->flags, pcmk_rsc_promotable) && (colocation->primary_role != pcmk_role_unknown)) { if (pcmk_is_set(dependent->flags, pcmk_rsc_unassigned)) { // We're assigning the dependent to a node pcmk__update_dependent_with_promotable(primary, dependent, colocation); return; } if (colocation->dependent_role == pcmk_role_promoted) { // We're choosing a role for the dependent pcmk__update_promotable_dependent_priority(primary, dependent, colocation); return; } } // Apply interleaved colocations if (can_interleave(colocation)) { const pcmk_resource_t *primary_instance = NULL; primary_instance = pcmk__find_compatible_instance(dependent, primary, pcmk_role_unknown, false); if (primary_instance != NULL) { pcmk__rsc_debug(primary, "Interleaving %s with %s", dependent->id, primary_instance->id); dependent->cmds->apply_coloc_score(dependent, primary_instance, colocation, true); } else if (colocation->score >= INFINITY) { crm_notice("%s cannot run because it cannot interleave with " "any instance of %s", dependent->id, primary->id); pcmk__assign_resource(dependent, NULL, true, true); } else { pcmk__rsc_debug(primary, "%s will not colocate with %s " "because no instance can interleave with it", dependent->id, primary->id); } return; } // Apply mandatory colocations if (colocation->score >= INFINITY) { GList *primary_nodes = NULL; // Dependent can run only where primary will have unblocked instances for (iter = primary->children; iter != NULL; iter = iter->next) { const pcmk_resource_t *instance = iter->data; pcmk_node_t *chosen = instance->fns->location(instance, NULL, 0); if ((chosen != NULL) && !is_set_recursive(instance, pcmk_rsc_blocked, TRUE)) { pcmk__rsc_trace(primary, "Allowing %s: %s %d", colocation->id, pe__node_name(chosen), chosen->weight); primary_nodes = g_list_prepend(primary_nodes, chosen); } } pcmk__colocation_intersect_nodes(dependent, primary, colocation, primary_nodes, false); g_list_free(primary_nodes); return; } // Apply optional colocations for (iter = primary->children; iter != NULL; iter = iter->next) { const pcmk_resource_t *instance = iter->data; instance->cmds->apply_coloc_score(dependent, instance, colocation, false); } } // Clone implementation of pcmk_assignment_methods_t:with_this_colocations() void pcmk__with_clone_colocations(const pcmk_resource_t *rsc, const pcmk_resource_t *orig_rsc, GList **list) { CRM_CHECK((rsc != NULL) && (orig_rsc != NULL) && (list != NULL), return); pcmk__add_with_this_list(list, rsc->rsc_cons_lhs, orig_rsc); if (rsc->parent != NULL) { rsc->parent->cmds->with_this_colocations(rsc->parent, orig_rsc, list); } } // Clone implementation of pcmk_assignment_methods_t:this_with_colocations() void pcmk__clone_with_colocations(const pcmk_resource_t *rsc, const pcmk_resource_t *orig_rsc, GList **list) { CRM_CHECK((rsc != NULL) && (orig_rsc != NULL) && (list != NULL), return); pcmk__add_this_with_list(list, rsc->rsc_cons, orig_rsc); if (rsc->parent != NULL) { 
rsc->parent->cmds->this_with_colocations(rsc->parent, orig_rsc, list); } } /*! * \internal * \brief Return action flags for a given clone resource action * * \param[in,out] action Action to get flags for * \param[in] node If not NULL, limit effects to this node * * \return Flags appropriate to \p action on \p node */ uint32_t pcmk__clone_action_flags(pcmk_action_t *action, const pcmk_node_t *node) { CRM_ASSERT((action != NULL) && pe_rsc_is_clone(action->rsc)); return pcmk__collective_action_flags(action, action->rsc->children, node); } /*! * \internal * \brief Apply a location constraint to a clone resource's allowed node scores * * \param[in,out] rsc Clone resource to apply constraint to * \param[in,out] location Location constraint to apply */ void pcmk__clone_apply_location(pcmk_resource_t *rsc, pcmk__location_t *location) { CRM_CHECK((location != NULL) && pe_rsc_is_clone(rsc), return); pcmk__apply_location(rsc, location); for (GList *iter = rsc->children; iter != NULL; iter = iter->next) { pcmk_resource_t *instance = (pcmk_resource_t *) iter->data; instance->cmds->apply_location(instance, location); } } // GFunc wrapper for calling the action_flags() resource method static void call_action_flags(gpointer data, gpointer user_data) { pcmk_resource_t *rsc = user_data; rsc->cmds->action_flags((pcmk_action_t *) data, NULL); } /*! * \internal * \brief Add a clone resource's actions to the transition graph * * \param[in,out] rsc Resource whose actions should be added */ void pcmk__clone_add_actions_to_graph(pcmk_resource_t *rsc) { CRM_ASSERT(pe_rsc_is_clone(rsc)); g_list_foreach(rsc->actions, call_action_flags, rsc); pe__create_clone_notifications(rsc); for (GList *iter = rsc->children; iter != NULL; iter = iter->next) { pcmk_resource_t *child_rsc = (pcmk_resource_t *) iter->data; child_rsc->cmds->add_actions_to_graph(child_rsc); } pcmk__add_rsc_actions_to_graph(rsc); pe__free_clone_notification_data(rsc); } /*! * \internal * \brief Check whether a resource or any children have been probed on a node * * \param[in] rsc Resource to check * \param[in] node Node to check * * \return true if \p node is in the known_on table of \p rsc or any of its * children, otherwise false */ static bool rsc_probed_on(const pcmk_resource_t *rsc, const pcmk_node_t *node) { if (rsc->children != NULL) { for (GList *child_iter = rsc->children; child_iter != NULL; child_iter = child_iter->next) { pcmk_resource_t *child = (pcmk_resource_t *) child_iter->data; if (rsc_probed_on(child, node)) { return true; } } return false; } if (rsc->known_on != NULL) { GHashTableIter iter; pcmk_node_t *known_node = NULL; g_hash_table_iter_init(&iter, rsc->known_on); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &known_node)) { if (pe__same_node(node, known_node)) { return true; } } } return false; } /*! * \internal * \brief Find clone instance that has been probed on given node * * \param[in] clone Clone resource to check * \param[in] node Node to check * * \return Instance of \p clone that has been probed on \p node if any, * otherwise NULL */ static pcmk_resource_t * find_probed_instance_on(const pcmk_resource_t *clone, const pcmk_node_t *node) { for (GList *iter = clone->children; iter != NULL; iter = iter->next) { pcmk_resource_t *instance = (pcmk_resource_t *) iter->data; if (rsc_probed_on(instance, node)) { return instance; } } return NULL; } /*! 
* \internal * \brief Probe an anonymous clone on a node * * \param[in,out] clone Anonymous clone to probe * \param[in,out] node Node to probe \p clone on */ static bool probe_anonymous_clone(pcmk_resource_t *clone, pcmk_node_t *node) { // Check whether we already probed an instance on this node pcmk_resource_t *child = find_probed_instance_on(clone, node); // Otherwise, check if we plan to start an instance on this node for (GList *iter = clone->children; (iter != NULL) && (child == NULL); iter = iter->next) { pcmk_resource_t *instance = (pcmk_resource_t *) iter->data; const pcmk_node_t *instance_node = NULL; instance_node = instance->fns->location(instance, NULL, 0); if (pe__same_node(instance_node, node)) { child = instance; } } // Otherwise, use the first clone instance if (child == NULL) { child = clone->children->data; } // Anonymous clones only need to probe a single instance return child->cmds->create_probe(child, node); } /*! * \internal * \brief Schedule any probes needed for a resource on a node * * \param[in,out] rsc Resource to create probe for * \param[in,out] node Node to create probe on * * \return true if any probe was created, otherwise false */ bool pcmk__clone_create_probe(pcmk_resource_t *rsc, pcmk_node_t *node) { CRM_ASSERT((node != NULL) && pe_rsc_is_clone(rsc)); if (rsc->exclusive_discover) { /* The clone is configured to be probed only where a location constraint * exists with resource-discovery set to exclusive. * * This check is not strictly necessary here since the instance's * create_probe() method would also check, but doing it here is more * efficient (especially for unique clones with a large number of * instances), and affects the CRM_meta_notify_available_uname variable * passed with notify actions. */ pcmk_node_t *allowed = g_hash_table_lookup(rsc->allowed_nodes, node->details->id); if ((allowed == NULL) || (allowed->rsc_discover_mode != pcmk_probe_exclusive)) { /* This node is not marked for resource discovery. Remove it from * allowed_nodes so that notifications contain only nodes that the * clone can possibly run on. */ pcmk__rsc_trace(rsc, "Skipping probe for %s on %s because resource has " "exclusive discovery but is not allowed on node", rsc->id, pe__node_name(node)); g_hash_table_remove(rsc->allowed_nodes, node->details->id); return false; } } rsc->children = g_list_sort(rsc->children, pcmk__cmp_instance_number); if (pcmk_is_set(rsc->flags, pcmk_rsc_unique)) { return pcmk__probe_resource_list(rsc->children, node); } else { return probe_anonymous_clone(rsc, node); } } /*! * \internal * \brief Add meta-attributes relevant to transition graph actions to XML * * Add clone-specific meta-attributes needed for transition graph actions. 
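/* Illustrative sketch (not part of this patch): graph actions carry clone
 * meta-attributes under crm_meta_name()-translated names (assumed here to be
 * the "CRM_meta_" form seen by resource agents). "action_xml" and the value 3
 * are hypothetical; the pattern mirrors pcmk__clone_add_graph_meta() below.
 */
static void
example_add_clone_max(xmlNode *action_xml)
{
    char *name = crm_meta_name(PCMK_META_CLONE_MAX);    // e.g. "CRM_meta_clone_max"

    crm_xml_add_int(action_xml, name, 3);
    free(name);
}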
* * \param[in] rsc Clone resource whose meta-attributes should be added * \param[in,out] xml Transition graph action attributes XML to add to */ void pcmk__clone_add_graph_meta(const pcmk_resource_t *rsc, xmlNode *xml) { char *name = NULL; CRM_ASSERT(pe_rsc_is_clone(rsc) && (xml != NULL)); name = crm_meta_name(XML_RSC_ATTR_UNIQUE); crm_xml_add(xml, name, pe__rsc_bool_str(rsc, pcmk_rsc_unique)); free(name); name = crm_meta_name(XML_RSC_ATTR_NOTIFY); crm_xml_add(xml, name, pe__rsc_bool_str(rsc, pcmk_rsc_notify)); free(name); name = crm_meta_name(PCMK_META_CLONE_MAX); crm_xml_add_int(xml, name, pe__clone_max(rsc)); free(name); name = crm_meta_name(PCMK_META_CLONE_NODE_MAX); crm_xml_add_int(xml, name, pe__clone_node_max(rsc)); free(name); if (pcmk_is_set(rsc->flags, pcmk_rsc_promotable)) { int promoted_max = pe__clone_promoted_max(rsc); int promoted_node_max = pe__clone_promoted_node_max(rsc); name = crm_meta_name(PCMK_META_PROMOTED_MAX); crm_xml_add_int(xml, name, promoted_max); free(name); name = crm_meta_name(PCMK_META_PROMOTED_NODE_MAX); crm_xml_add_int(xml, name, promoted_node_max); free(name); /* @COMPAT Maintain backward compatibility with resource agents that * expect the old names (deprecated since 2.0.0). */ name = crm_meta_name(PCMK__META_PROMOTED_MAX_LEGACY); crm_xml_add_int(xml, name, promoted_max); free(name); name = crm_meta_name(PCMK__META_PROMOTED_NODE_MAX_LEGACY); crm_xml_add_int(xml, name, promoted_node_max); free(name); } } // Clone implementation of pcmk_assignment_methods_t:add_utilization() void pcmk__clone_add_utilization(const pcmk_resource_t *rsc, const pcmk_resource_t *orig_rsc, GList *all_rscs, GHashTable *utilization) { bool existing = false; pcmk_resource_t *child = NULL; CRM_ASSERT(pe_rsc_is_clone(rsc) && (orig_rsc != NULL) && (utilization != NULL)); if (!pcmk_is_set(rsc->flags, pcmk_rsc_unassigned)) { return; } // Look for any child already existing in the list for (GList *iter = rsc->children; iter != NULL; iter = iter->next) { child = (pcmk_resource_t *) iter->data; if (g_list_find(all_rscs, child)) { existing = true; // Keep checking remaining children } else { // If this is a clone of a group, look for group's members for (GList *member_iter = child->children; member_iter != NULL; member_iter = member_iter->next) { pcmk_resource_t *member = (pcmk_resource_t *) member_iter->data; if (g_list_find(all_rscs, member) != NULL) { // Add *child's* utilization, not group member's child->cmds->add_utilization(child, orig_rsc, all_rscs, utilization); existing = true; break; } } } } if (!existing && (rsc->children != NULL)) { // If nothing was found, still add first child's utilization child = (pcmk_resource_t *) rsc->children->data; child->cmds->add_utilization(child, orig_rsc, all_rscs, utilization); } } // Clone implementation of pcmk_assignment_methods_t:shutdown_lock() void pcmk__clone_shutdown_lock(pcmk_resource_t *rsc) { CRM_ASSERT(pe_rsc_is_clone(rsc)); return; // Clones currently don't support shutdown locks } diff --git a/lib/pacemaker/pcmk_sched_instances.c b/lib/pacemaker/pcmk_sched_instances.c index 4521abe07d..afacded067 100644 --- a/lib/pacemaker/pcmk_sched_instances.c +++ b/lib/pacemaker/pcmk_sched_instances.c @@ -1,1687 +1,1687 @@ /* - * Copyright 2004-2023 the Pacemaker project contributors + * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. 
*/ /* This file is intended for code usable with both clone instances and bundle * replica containers. */ #include #include #include #include "libpacemaker_private.h" /*! * \internal * \brief Check whether a node is allowed to run an instance * * \param[in] instance Clone instance or bundle container to check * \param[in] node Node to check * \param[in] max_per_node Maximum number of instances allowed to run on a node * * \return true if \p node is allowed to run \p instance, otherwise false */ static bool can_run_instance(const pcmk_resource_t *instance, const pcmk_node_t *node, int max_per_node) { pcmk_node_t *allowed_node = NULL; if (pcmk_is_set(instance->flags, pcmk_rsc_removed)) { pcmk__rsc_trace(instance, "%s cannot run on %s: orphaned", instance->id, pe__node_name(node)); return false; } if (!pcmk__node_available(node, false, false)) { pcmk__rsc_trace(instance, "%s cannot run on %s: node cannot run resources", instance->id, pe__node_name(node)); return false; } allowed_node = pcmk__top_allowed_node(instance, node); if (allowed_node == NULL) { crm_warn("%s cannot run on %s: node not allowed", instance->id, pe__node_name(node)); return false; } if (allowed_node->weight < 0) { pcmk__rsc_trace(instance, "%s cannot run on %s: parent score is %s there", instance->id, pe__node_name(node), pcmk_readable_score(allowed_node->weight)); return false; } if (allowed_node->count >= max_per_node) { pcmk__rsc_trace(instance, "%s cannot run on %s: node already has %d instance%s", instance->id, pe__node_name(node), max_per_node, pcmk__plural_s(max_per_node)); return false; } pcmk__rsc_trace(instance, "%s can run on %s (%d already running)", instance->id, pe__node_name(node), allowed_node->count); return true; } /*! * \internal * \brief Ban a clone instance or bundle replica from unavailable allowed nodes * * \param[in,out] instance Clone instance or bundle replica to ban * \param[in] max_per_node Maximum instances allowed to run on a node */ static void ban_unavailable_allowed_nodes(pcmk_resource_t *instance, int max_per_node) { if (instance->allowed_nodes != NULL) { GHashTableIter iter; pcmk_node_t *node = NULL; g_hash_table_iter_init(&iter, instance->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **) &node)) { if (!can_run_instance(instance, node, max_per_node)) { pcmk__rsc_trace(instance, "Banning %s from unavailable node %s", instance->id, pe__node_name(node)); node->weight = -INFINITY; for (GList *child_iter = instance->children; child_iter != NULL; child_iter = child_iter->next) { pcmk_resource_t *child = child_iter->data; pcmk_node_t *child_node = NULL; child_node = g_hash_table_lookup(child->allowed_nodes, node->details->id); if (child_node != NULL) { pcmk__rsc_trace(instance, "Banning %s child %s " "from unavailable node %s", instance->id, child->id, pe__node_name(node)); child_node->weight = -INFINITY; } } } } } } /*! * \internal * \brief Create a hash table with a single node in it * * \param[in] node Node to copy into new table * * \return Newly created hash table containing a copy of \p node * \note The caller is responsible for freeing the result with * g_hash_table_destroy(). */ static GHashTable * new_node_table(pcmk_node_t *node) { GHashTable *table = pcmk__strkey_table(NULL, free); node = pe__copy_node(node); g_hash_table_insert(table, (gpointer) node->details->id, node); return table; } /*! 
* \internal * \brief Apply a resource's parent's colocation scores to a node table * * \param[in] rsc Resource whose colocations should be applied * \param[in,out] nodes Node table to apply colocations to */ static void apply_parent_colocations(const pcmk_resource_t *rsc, GHashTable **nodes) { GList *colocations = pcmk__this_with_colocations(rsc); for (const GList *iter = colocations; iter != NULL; iter = iter->next) { const pcmk__colocation_t *colocation = iter->data; pcmk_resource_t *other = colocation->primary; float factor = colocation->score / (float) INFINITY; other->cmds->add_colocated_node_scores(other, rsc, rsc->id, nodes, colocation, factor, pcmk__coloc_select_default); } g_list_free(colocations); colocations = pcmk__with_this_colocations(rsc); for (const GList *iter = colocations; iter != NULL; iter = iter->next) { const pcmk__colocation_t *colocation = iter->data; pcmk_resource_t *other = colocation->dependent; float factor = colocation->score / (float) INFINITY; if (!pcmk__colocation_has_influence(colocation, rsc)) { continue; } other->cmds->add_colocated_node_scores(other, rsc, rsc->id, nodes, colocation, factor, pcmk__coloc_select_nonnegative); } g_list_free(colocations); } /*! * \internal * \brief Compare clone or bundle instances based on colocation scores * * Determine the relative order in which two clone or bundle instances should be * assigned to nodes, considering the scores of colocation constraints directly * or indirectly involving them. * * \param[in] instance1 First instance to compare * \param[in] instance2 Second instance to compare * * \return A negative number if \p instance1 should be assigned first, * a positive number if \p instance2 should be assigned first, * or 0 if assignment order doesn't matter */ static int cmp_instance_by_colocation(const pcmk_resource_t *instance1, const pcmk_resource_t *instance2) { int rc = 0; pcmk_node_t *node1 = NULL; pcmk_node_t *node2 = NULL; pcmk_node_t *current_node1 = pe__current_node(instance1); pcmk_node_t *current_node2 = pe__current_node(instance2); GHashTable *colocated_scores1 = NULL; GHashTable *colocated_scores2 = NULL; CRM_ASSERT((instance1 != NULL) && (instance1->parent != NULL) && (instance2 != NULL) && (instance2->parent != NULL) && (current_node1 != NULL) && (current_node2 != NULL)); // Create node tables initialized with each node colocated_scores1 = new_node_table(current_node1); colocated_scores2 = new_node_table(current_node2); // Apply parental colocations apply_parent_colocations(instance1, &colocated_scores1); apply_parent_colocations(instance2, &colocated_scores2); // Find original nodes again, with scores updated for colocations node1 = g_hash_table_lookup(colocated_scores1, current_node1->details->id); node2 = g_hash_table_lookup(colocated_scores2, current_node2->details->id); // Compare nodes by updated scores if (node1->weight < node2->weight) { crm_trace("Assign %s (%d on %s) after %s (%d on %s)", instance1->id, node1->weight, pe__node_name(node1), instance2->id, node2->weight, pe__node_name(node2)); rc = 1; } else if (node1->weight > node2->weight) { crm_trace("Assign %s (%d on %s) before %s (%d on %s)", instance1->id, node1->weight, pe__node_name(node1), instance2->id, node2->weight, pe__node_name(node2)); rc = -1; } g_hash_table_destroy(colocated_scores1); g_hash_table_destroy(colocated_scores2); return rc; } /*! 
* \internal * \brief Check whether a resource or any of its children are failed * * \param[in] rsc Resource to check * * \return true if \p rsc or any of its children are failed, otherwise false */ static bool did_fail(const pcmk_resource_t *rsc) { if (pcmk_is_set(rsc->flags, pcmk_rsc_failed)) { return true; } for (GList *iter = rsc->children; iter != NULL; iter = iter->next) { if (did_fail((const pcmk_resource_t *) iter->data)) { return true; } } return false; } /*! * \internal * \brief Check whether a node is allowed to run a resource * * \param[in] rsc Resource to check * \param[in,out] node Node to check (will be set NULL if not allowed) * * \return true if *node is either NULL or allowed for \p rsc, otherwise false */ static bool node_is_allowed(const pcmk_resource_t *rsc, pcmk_node_t **node) { if (*node != NULL) { pcmk_node_t *allowed = g_hash_table_lookup(rsc->allowed_nodes, (*node)->details->id); if ((allowed == NULL) || (allowed->weight < 0)) { pcmk__rsc_trace(rsc, "%s: current location (%s) is unavailable", rsc->id, pe__node_name(*node)); *node = NULL; return false; } } return true; } /*! * \internal * \brief Compare two clone or bundle instances' instance numbers * * \param[in] a First instance to compare * \param[in] b Second instance to compare * * \return A negative number if \p a's instance number is lower, * a positive number if \p b's instance number is lower, * or 0 if their instance numbers are the same */ gint pcmk__cmp_instance_number(gconstpointer a, gconstpointer b) { const pcmk_resource_t *instance1 = (const pcmk_resource_t *) a; const pcmk_resource_t *instance2 = (const pcmk_resource_t *) b; char *div1 = NULL; char *div2 = NULL; CRM_ASSERT((instance1 != NULL) && (instance2 != NULL)); // Clone numbers are after a colon, bundle numbers after a dash div1 = strrchr(instance1->id, ':'); if (div1 == NULL) { div1 = strrchr(instance1->id, '-'); } div2 = strrchr(instance2->id, ':'); if (div2 == NULL) { div2 = strrchr(instance2->id, '-'); } CRM_ASSERT((div1 != NULL) && (div2 != NULL)); return (gint) (strtol(div1 + 1, NULL, 10) - strtol(div2 + 1, NULL, 10)); } /*! 
* \internal * \brief Compare clone or bundle instances according to assignment order * * Compare two clone or bundle instances according to the order they should be * assigned to nodes, preferring (in order): * * - Active instance that is less multiply active * - Instance that is not active on a disallowed node * - Instance with higher configured priority * - Active instance whose current node can run resources * - Active instance whose parent is allowed on current node * - Active instance whose current node has fewer other instances * - Active instance * - Instance that isn't failed * - Instance whose colocations result in higher score on current node * - Instance with lower ID in lexicographic order * * \param[in] a First instance to compare * \param[in] b Second instance to compare * * \return A negative number if \p a should be assigned first, * a positive number if \p b should be assigned first, * or 0 if assignment order doesn't matter */ gint pcmk__cmp_instance(gconstpointer a, gconstpointer b) { int rc = 0; pcmk_node_t *node1 = NULL; pcmk_node_t *node2 = NULL; unsigned int nnodes1 = 0; unsigned int nnodes2 = 0; bool can1 = true; bool can2 = true; const pcmk_resource_t *instance1 = (const pcmk_resource_t *) a; const pcmk_resource_t *instance2 = (const pcmk_resource_t *) b; CRM_ASSERT((instance1 != NULL) && (instance2 != NULL)); node1 = instance1->fns->active_node(instance1, &nnodes1, NULL); node2 = instance2->fns->active_node(instance2, &nnodes2, NULL); /* If both instances are running and at least one is multiply * active, prefer instance that's running on fewer nodes. */ if ((nnodes1 > 0) && (nnodes2 > 0)) { if (nnodes1 < nnodes2) { crm_trace("Assign %s (active on %d) before %s (active on %d): " "less multiply active", instance1->id, nnodes1, instance2->id, nnodes2); return -1; } else if (nnodes1 > nnodes2) { crm_trace("Assign %s (active on %d) after %s (active on %d): " "more multiply active", instance1->id, nnodes1, instance2->id, nnodes2); return 1; } } /* An instance that is either inactive or active on an allowed node is * preferred over an instance that is active on a no-longer-allowed node. */ can1 = node_is_allowed(instance1, &node1); can2 = node_is_allowed(instance2, &node2); if (can1 && !can2) { crm_trace("Assign %s before %s: not active on a disallowed node", instance1->id, instance2->id); return -1; } else if (!can1 && can2) { crm_trace("Assign %s after %s: active on a disallowed node", instance1->id, instance2->id); return 1; } // Prefer instance with higher configured priority if (instance1->priority > instance2->priority) { crm_trace("Assign %s before %s: priority (%d > %d)", instance1->id, instance2->id, instance1->priority, instance2->priority); return -1; } else if (instance1->priority < instance2->priority) { crm_trace("Assign %s after %s: priority (%d < %d)", instance1->id, instance2->id, instance1->priority, instance2->priority); return 1; } // Prefer active instance if ((node1 == NULL) && (node2 == NULL)) { crm_trace("No assignment preference for %s vs. 
%s: inactive", instance1->id, instance2->id); return 0; } else if (node1 == NULL) { crm_trace("Assign %s after %s: active", instance1->id, instance2->id); return 1; } else if (node2 == NULL) { crm_trace("Assign %s before %s: active", instance1->id, instance2->id); return -1; } // Prefer instance whose current node can run resources can1 = pcmk__node_available(node1, false, false); can2 = pcmk__node_available(node2, false, false); if (can1 && !can2) { crm_trace("Assign %s before %s: current node can run resources", instance1->id, instance2->id); return -1; } else if (!can1 && can2) { crm_trace("Assign %s after %s: current node can't run resources", instance1->id, instance2->id); return 1; } // Prefer instance whose parent is allowed to run on instance's current node node1 = pcmk__top_allowed_node(instance1, node1); node2 = pcmk__top_allowed_node(instance2, node2); if ((node1 == NULL) && (node2 == NULL)) { crm_trace("No assignment preference for %s vs. %s: " "parent not allowed on either instance's current node", instance1->id, instance2->id); return 0; } else if (node1 == NULL) { crm_trace("Assign %s after %s: parent not allowed on current node", instance1->id, instance2->id); return 1; } else if (node2 == NULL) { crm_trace("Assign %s before %s: parent allowed on current node", instance1->id, instance2->id); return -1; } // Prefer instance whose current node is running fewer other instances if (node1->count < node2->count) { crm_trace("Assign %s before %s: fewer active instances on current node", instance1->id, instance2->id); return -1; } else if (node1->count > node2->count) { crm_trace("Assign %s after %s: more active instances on current node", instance1->id, instance2->id); return 1; } // Prefer instance that isn't failed can1 = did_fail(instance1); can2 = did_fail(instance2); if (!can1 && can2) { crm_trace("Assign %s before %s: not failed", instance1->id, instance2->id); return -1; } else if (can1 && !can2) { crm_trace("Assign %s after %s: failed", instance1->id, instance2->id); return 1; } // Prefer instance with higher cumulative colocation score on current node rc = cmp_instance_by_colocation(instance1, instance2); if (rc != 0) { return rc; } // Prefer instance with lower instance number rc = pcmk__cmp_instance_number(instance1, instance2); if (rc < 0) { crm_trace("Assign %s before %s: instance number", instance1->id, instance2->id); } else if (rc > 0) { crm_trace("Assign %s after %s: instance number", instance1->id, instance2->id); } else { crm_trace("No assignment preference for %s vs. %s", instance1->id, instance2->id); } return rc; } /*! * \internal * \brief Increment the parent's instance count after assigning an instance * * An instance's parent tracks how many instances have been assigned to each * node via its pcmk_node_t:count member. After assigning an instance to a node, * find the corresponding node in the parent's allowed table and increment it. * * \param[in,out] instance Instance whose parent to update * \param[in] assigned_to Node to which the instance was assigned */ static void increment_parent_count(pcmk_resource_t *instance, const pcmk_node_t *assigned_to) { pcmk_node_t *allowed = NULL; if (assigned_to == NULL) { return; } allowed = pcmk__top_allowed_node(instance, assigned_to); if (allowed == NULL) { /* The instance is allowed on the node, but its parent isn't. This * shouldn't be possible if the resource is managed, and we won't be * able to limit the number of instances assigned to the node. 
*/ CRM_LOG_ASSERT(!pcmk_is_set(instance->flags, pcmk_rsc_managed)); } else { allowed->count++; } } /*! * \internal * \brief Assign an instance to a node * * \param[in,out] instance Clone instance or bundle replica container * \param[in] prefer If not NULL, attempt early assignment to this * node, if still the best choice; otherwise, * perform final assignment * \param[in] max_per_node Assign at most this many instances to one node * * \return Node to which \p instance is assigned */ static const pcmk_node_t * assign_instance(pcmk_resource_t *instance, const pcmk_node_t *prefer, int max_per_node) { pcmk_node_t *chosen = NULL; pcmk__rsc_trace(instance, "Assigning %s (preferring %s)", instance->id, ((prefer == NULL)? "no node" : prefer->details->uname)); if (pcmk_is_set(instance->flags, pcmk_rsc_assigning)) { pcmk__rsc_debug(instance, "Assignment loop detected involving %s colocations", instance->id); return NULL; } ban_unavailable_allowed_nodes(instance, max_per_node); // Failed early assignments are reversible (stop_if_fail=false) chosen = instance->cmds->assign(instance, prefer, (prefer == NULL)); increment_parent_count(instance, chosen); return chosen; } /*! * \internal * \brief Try to assign an instance to its current node early * * \param[in] rsc Clone or bundle being assigned (for logs only) * \param[in] instance Clone instance or bundle replica container * \param[in] current Instance's current node * \param[in] max_per_node Maximum number of instances per node * \param[in] available Number of instances still available for assignment * * \return \c true if \p instance was successfully assigned to its current node, * or \c false otherwise */ static bool assign_instance_early(const pcmk_resource_t *rsc, pcmk_resource_t *instance, const pcmk_node_t *current, int max_per_node, int available) { const pcmk_node_t *chosen = NULL; int reserved = 0; pcmk_resource_t *parent = instance->parent; GHashTable *allowed_orig = NULL; GHashTable *allowed_orig_parent = parent->allowed_nodes; const pcmk_node_t *allowed_node = NULL; pcmk__rsc_trace(instance, "Trying to assign %s to its current node %s", instance->id, pe__node_name(current)); allowed_node = g_hash_table_lookup(instance->allowed_nodes, current->details->id); if (!pcmk__node_available(allowed_node, true, false)) { pcmk__rsc_info(instance, "Not assigning %s to current node %s: unavailable", instance->id, pe__node_name(current)); return false; } /* On each iteration, if instance gets assigned to a node other than its * current one, we reserve one instance for the chosen node, unassign * instance, restore instance's original node tables, and try again. This * way, instances are proportionally assigned to nodes based on preferences, * but shuffling of specific instances is minimized. If a node will be * assigned instances at all, it preferentially receives instances that are * currently active there. * * parent->allowed_nodes tracks the number of instances assigned to each * node. If a node already has max_per_node instances assigned, * ban_unavailable_allowed_nodes() marks it as unavailable. * * In the end, we restore the original parent->allowed_nodes to undo the * changes to counts during tentative assignments. If we successfully * assigned instance to its current node, we increment that node's counter. 
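/* Illustrative walk-through (not part of this patch), assuming clone-node-max
 * is 1, two instances are still available for assignment, and this instance
 * is currently active on node1:
 *
 *   pass 1: node2 scores higher, so assign_instance() chooses node2; one
 *           instance is reserved for node2 (filling its per-node cap) and the
 *           node scores are rolled back
 *   pass 2: node2 is now full, so the instance lands on its current node1 and
 *           the loop ends
 *
 * If other nodes keep absorbing all remaining reservations instead, the
 * instance is not assigned early and is left to the regular assignment pass.
 */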
*/ // Back up the allowed node tables of instance and its children recursively pcmk__copy_node_tables(instance, &allowed_orig); // Update instances-per-node counts in a scratch table parent->allowed_nodes = pcmk__copy_node_table(parent->allowed_nodes); while (reserved < available) { chosen = assign_instance(instance, current, max_per_node); if (pe__same_node(chosen, current)) { // Successfully assigned to current node break; } // Assignment updates scores, so restore to original state pcmk__rsc_debug(instance, "Rolling back node scores for %s", instance->id); pcmk__restore_node_tables(instance, allowed_orig); if (chosen == NULL) { // Assignment failed, so give up pcmk__rsc_info(instance, "Not assigning %s to current node %s: unavailable", instance->id, pe__node_name(current)); pe__set_resource_flags(instance, pcmk_rsc_unassigned); break; } // We prefer more strongly to assign an instance to the chosen node pcmk__rsc_debug(instance, "Not assigning %s to current node %s: %s is better", instance->id, pe__node_name(current), pe__node_name(chosen)); // Reserve one instance for the chosen node and try again if (++reserved >= available) { pcmk__rsc_info(instance, "Not assigning %s to current node %s: " "other assignments are more important", instance->id, pe__node_name(current)); } else { pcmk__rsc_debug(instance, "Reserved an instance of %s for %s. Retrying " "assignment of %s to %s", rsc->id, pe__node_name(chosen), instance->id, pe__node_name(current)); } // Clear this assignment (frees chosen); leave instance counts in parent pcmk__unassign_resource(instance); chosen = NULL; } g_hash_table_destroy(allowed_orig); // Restore original instances-per-node counts g_hash_table_destroy(parent->allowed_nodes); parent->allowed_nodes = allowed_orig_parent; if (chosen == NULL) { // Couldn't assign instance to current node return false; } pcmk__rsc_trace(instance, "Assigned %s to current node %s", instance->id, pe__node_name(current)); increment_parent_count(instance, chosen); return true; } /*! * \internal * \brief Reset the node counts of a resource's allowed nodes to zero * * \param[in,out] rsc Resource to reset * * \return Number of nodes that are available to run resources */ static unsigned int reset_allowed_node_counts(pcmk_resource_t *rsc) { unsigned int available_nodes = 0; pcmk_node_t *node = NULL; GHashTableIter iter; g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { node->count = 0; if (pcmk__node_available(node, false, false)) { available_nodes++; } } return available_nodes; } /*! 
* \internal * \brief Check whether an instance has a preferred node * * \param[in] instance Clone instance or bundle replica container * \param[in] optimal_per_node Optimal number of instances per node * * \return Instance's current node if still available, otherwise NULL */ static const pcmk_node_t * preferred_node(const pcmk_resource_t *instance, int optimal_per_node) { const pcmk_node_t *node = NULL; const pcmk_node_t *parent_node = NULL; // Check whether instance is active, healthy, and not yet assigned if ((instance->running_on == NULL) || !pcmk_is_set(instance->flags, pcmk_rsc_unassigned) || pcmk_is_set(instance->flags, pcmk_rsc_failed)) { return NULL; } // Check whether instance's current node can run resources node = pe__current_node(instance); if (!pcmk__node_available(node, true, false)) { pcmk__rsc_trace(instance, "Not assigning %s to %s early (unavailable)", instance->id, pe__node_name(node)); return NULL; } // Check whether node already has optimal number of instances assigned parent_node = pcmk__top_allowed_node(instance, node); if ((parent_node != NULL) && (parent_node->count >= optimal_per_node)) { pcmk__rsc_trace(instance, "Not assigning %s to %s early " "(optimal instances already assigned)", instance->id, pe__node_name(node)); return NULL; } return node; } /*! * \internal * \brief Assign collective instances to nodes * * \param[in,out] collective Clone or bundle resource being assigned * \param[in,out] instances List of clone instances or bundle containers * \param[in] max_total Maximum instances to assign in total * \param[in] max_per_node Maximum instances to assign to any one node */ void pcmk__assign_instances(pcmk_resource_t *collective, GList *instances, int max_total, int max_per_node) { // Reuse node count to track number of assigned instances unsigned int available_nodes = reset_allowed_node_counts(collective); int optimal_per_node = 0; int assigned = 0; GList *iter = NULL; pcmk_resource_t *instance = NULL; const pcmk_node_t *current = NULL; if (available_nodes > 0) { optimal_per_node = max_total / available_nodes; } if (optimal_per_node < 1) { optimal_per_node = 1; } pcmk__rsc_debug(collective, "Assigning up to %d %s instance%s to up to %u node%s " "(at most %d per host, %d optimal)", max_total, collective->id, pcmk__plural_s(max_total), available_nodes, pcmk__plural_s(available_nodes), max_per_node, optimal_per_node); // Assign as many instances as possible to their current location for (iter = instances; (iter != NULL) && (assigned < max_total); iter = iter->next) { int available = max_total - assigned; instance = iter->data; if (!pcmk_is_set(instance->flags, pcmk_rsc_unassigned)) { continue; // Already assigned } current = preferred_node(instance, optimal_per_node); if ((current != NULL) && assign_instance_early(collective, instance, current, max_per_node, available)) { assigned++; } } pcmk__rsc_trace(collective, "Assigned %d of %d instance%s to current node", assigned, max_total, pcmk__plural_s(max_total)); for (iter = instances; iter != NULL; iter = iter->next) { instance = (pcmk_resource_t *) iter->data; if (!pcmk_is_set(instance->flags, pcmk_rsc_unassigned)) { continue; // Already assigned } if (instance->running_on != NULL) { current = pe__current_node(instance); if (pcmk__top_allowed_node(instance, current) == NULL) { const char *unmanaged = ""; if (!pcmk_is_set(instance->flags, pcmk_rsc_managed)) { unmanaged = "Unmanaged resource "; } crm_notice("%s%s is running on %s which is no longer allowed", unmanaged, instance->id, pe__node_name(current)); } } 
if (assigned >= max_total) { pcmk__rsc_debug(collective, "Not assigning %s because maximum %d instances " "already assigned", instance->id, max_total); resource_location(instance, NULL, -INFINITY, "collective_limit_reached", collective->cluster); } else if (assign_instance(instance, NULL, max_per_node) != NULL) { assigned++; } } pcmk__rsc_debug(collective, "Assigned %d of %d possible instance%s of %s", assigned, max_total, pcmk__plural_s(max_total), collective->id); } enum instance_state { instance_starting = (1 << 0), instance_stopping = (1 << 1), /* This indicates that some instance is restarting. It's not the same as * instance_starting|instance_stopping, which would indicate that some * instance is starting, and some instance (not necessarily the same one) is * stopping. */ instance_restarting = (1 << 2), instance_active = (1 << 3), instance_all = instance_starting|instance_stopping |instance_restarting|instance_active, }; /*! * \internal * \brief Check whether an instance is active, starting, and/or stopping * * \param[in] instance Clone instance or bundle replica container * \param[in,out] state Whether any instance is starting, stopping, etc. */ static void check_instance_state(const pcmk_resource_t *instance, uint32_t *state) { const GList *iter = NULL; uint32_t instance_state = 0; // State of just this instance // No need to check further if all conditions have already been detected if (pcmk_all_flags_set(*state, instance_all)) { return; } // If instance is a collective (a cloned group), check its children instead if (instance->variant > pcmk_rsc_variant_primitive) { for (iter = instance->children; (iter != NULL) && !pcmk_all_flags_set(*state, instance_all); iter = iter->next) { check_instance_state((const pcmk_resource_t *) iter->data, state); } return; } // If we get here, instance is a primitive if (instance->running_on != NULL) { instance_state |= instance_active; } // Check each of the instance's actions for runnable start or stop for (iter = instance->actions; (iter != NULL) && !pcmk_all_flags_set(instance_state, instance_starting |instance_stopping); iter = iter->next) { const pcmk_action_t *action = (const pcmk_action_t *) iter->data; const bool optional = pcmk_is_set(action->flags, pcmk_action_optional); if (pcmk__str_eq(PCMK_ACTION_START, action->task, pcmk__str_none)) { if (!optional && pcmk_is_set(action->flags, pcmk_action_runnable)) { pcmk__rsc_trace(instance, "Instance is starting due to %s", action->uuid); instance_state |= instance_starting; } else { pcmk__rsc_trace(instance, "%s doesn't affect %s state (%s)", action->uuid, instance->id, (optional? "optional" : "unrunnable")); } } else if (pcmk__str_eq(PCMK_ACTION_STOP, action->task, pcmk__str_none)) { /* Only stop actions can be pseudo-actions for primitives. That * indicates that the node they are on is being fenced, so the stop * is implied rather than actually executed. */ if (!optional && pcmk_any_flags_set(action->flags, pcmk_action_pseudo |pcmk_action_runnable)) { pcmk__rsc_trace(instance, "Instance is stopping due to %s", action->uuid); instance_state |= instance_stopping; } else { pcmk__rsc_trace(instance, "%s doesn't affect %s state (%s)", action->uuid, instance->id, (optional? "optional" : "unrunnable")); } } } if (pcmk_all_flags_set(instance_state, instance_starting|instance_stopping)) { instance_state |= instance_restarting; } *state |= instance_state; } /*! 
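check_instance_state() above folds what it learns about each instance into a small set of bit flags and stops scanning once every flag has been detected. A minimal sketch of that early-exit accumulation, using hypothetical flags and structs in place of the scheduler's action lists; note that the restarting flag is set only when one and the same instance both starts and stops, which is why the summary below ends up as starting|stopping|active but not restarting.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum state_flags {
    state_starting   = (1 << 0),
    state_stopping   = (1 << 1),
    state_restarting = (1 << 2),    // same instance both starts and stops
    state_active     = (1 << 3),
    state_all        = state_starting|state_stopping
                       |state_restarting|state_active,
};

struct fake_instance {
    bool active;
    bool has_runnable_start;
    bool has_runnable_stop;
};

static void
check_one(const struct fake_instance *inst, uint32_t *state)
{
    uint32_t inst_state = 0;

    if ((*state & state_all) == state_all) {
        return;                     // everything already detected, stop early
    }
    if (inst->active) {
        inst_state |= state_active;
    }
    if (inst->has_runnable_start) {
        inst_state |= state_starting;
    }
    if (inst->has_runnable_stop) {
        inst_state |= state_stopping;
    }
    if ((inst_state & (state_starting|state_stopping))
        == (state_starting|state_stopping)) {
        inst_state |= state_restarting;     // this one instance restarts
    }
    *state |= inst_state;
}

int
main(void)
{
    struct fake_instance instances[] = {
        { true,  false, true  },    // active and stopping
        { false, true,  false },    // inactive and starting
    };
    uint32_t state = 0;

    for (int i = 0; i < 2; i++) {
        check_one(&instances[i], &state);
    }
    printf("collective state flags: 0x%x\n", state);
    return 0;
}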
* \internal * \brief Create actions for collective resource instances * * \param[in,out] collective Clone or bundle resource to create actions for * \param[in,out] instances List of clone instances or bundle containers */ void pcmk__create_instance_actions(pcmk_resource_t *collective, GList *instances) { uint32_t state = 0; pcmk_action_t *stop = NULL; pcmk_action_t *stopped = NULL; pcmk_action_t *start = NULL; pcmk_action_t *started = NULL; pcmk__rsc_trace(collective, "Creating collective instance actions for %s", collective->id); // Create actions for each instance appropriate to its variant for (GList *iter = instances; iter != NULL; iter = iter->next) { pcmk_resource_t *instance = (pcmk_resource_t *) iter->data; instance->cmds->create_actions(instance); check_instance_state(instance, &state); } // Create pseudo-actions for rsc start and started start = pe__new_rsc_pseudo_action(collective, PCMK_ACTION_START, !pcmk_is_set(state, instance_starting), true); started = pe__new_rsc_pseudo_action(collective, PCMK_ACTION_RUNNING, !pcmk_is_set(state, instance_starting), false); started->priority = INFINITY; if (pcmk_any_flags_set(state, instance_active|instance_starting)) { pe__set_action_flags(started, pcmk_action_runnable); } // Create pseudo-actions for rsc stop and stopped stop = pe__new_rsc_pseudo_action(collective, PCMK_ACTION_STOP, !pcmk_is_set(state, instance_stopping), true); stopped = pe__new_rsc_pseudo_action(collective, PCMK_ACTION_STOPPED, !pcmk_is_set(state, instance_stopping), true); stopped->priority = INFINITY; if (!pcmk_is_set(state, instance_restarting)) { pe__set_action_flags(stop, pcmk_action_migratable); } if (collective->variant == pcmk_rsc_variant_clone) { pe__create_clone_notif_pseudo_ops(collective, start, started, stop, stopped); } } /*! * \internal * \brief Get a list of clone instances or bundle replica containers * * \param[in] rsc Clone or bundle resource * * \return Clone instances if \p rsc is a clone, or a newly created list of * \p rsc's replica containers if \p rsc is a bundle * \note The caller must call free_instance_list() on the result when the list * is no longer needed. */ static inline GList * get_instance_list(const pcmk_resource_t *rsc) { if (rsc->variant == pcmk_rsc_variant_bundle) { return pe__bundle_containers(rsc); } else { return rsc->children; } } /*! * \internal * \brief Free any memory created by get_instance_list() * * \param[in] rsc Clone or bundle resource passed to get_instance_list() * \param[in,out] list Return value of get_instance_list() for \p rsc */ static inline void free_instance_list(const pcmk_resource_t *rsc, GList *list) { if (list != rsc->children) { g_list_free(list); } } /*! 
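Because get_instance_list() returns a borrowed list for clones but a newly built one for bundles, every caller has to pair it with free_instance_list(). The conditional-ownership idiom, sketched with GLib and hypothetical names (struct container, get_items(), free_items()); only the list shell is ever freed, never the elements.

#include <glib.h>
#include <stdio.h>

struct container {
    GList *children;        // owned by the container
    gboolean wraps_copies;  // if TRUE, callers get a newly built list
};

static GList *
get_items(const struct container *c)
{
    if (c->wraps_copies) {
        return g_list_copy(c->children);    // caller must free the list shell
    }
    return c->children;                     // borrowed; caller must not free
}

static void
free_items(const struct container *c, GList *items)
{
    if (items != c->children) {
        g_list_free(items);                 // frees only the list, not the data
    }
}

int
main(void)
{
    struct container c = { NULL, TRUE };

    c.children = g_list_append(c.children, (gpointer) "child-a");
    c.children = g_list_append(c.children, (gpointer) "child-b");

    GList *items = get_items(&c);

    for (GList *iter = items; iter != NULL; iter = iter->next) {
        printf("%s\n", (const char *) iter->data);
    }
    free_items(&c, items);

    g_list_free(c.children);
    return 0;
}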
* \internal * \brief Check whether an instance is compatible with a role and node * * \param[in] instance Clone instance or bundle replica container * \param[in] node Instance must match this node * \param[in] role If not pcmk_role_unknown, instance must match this role * \param[in] current If true, compare instance's original node and role, * otherwise compare assigned next node and role * * \return true if \p instance is compatible with \p node and \p role, * otherwise false */ bool pcmk__instance_matches(const pcmk_resource_t *instance, const pcmk_node_t *node, enum rsc_role_e role, bool current) { pcmk_node_t *instance_node = NULL; CRM_CHECK((instance != NULL) && (node != NULL), return false); if ((role != pcmk_role_unknown) && (role != instance->fns->state(instance, current))) { pcmk__rsc_trace(instance, "%s is not a compatible instance (role is not %s)", instance->id, role2text(role)); return false; } if (!is_set_recursive(instance, pcmk_rsc_blocked, true)) { // We only want instances that haven't failed instance_node = instance->fns->location(instance, NULL, current); } if (instance_node == NULL) { pcmk__rsc_trace(instance, "%s is not a compatible instance " "(not assigned to a node)", instance->id); return false; } if (!pe__same_node(instance_node, node)) { pcmk__rsc_trace(instance, "%s is not a compatible instance " "(assigned to %s not %s)", instance->id, pe__node_name(instance_node), pe__node_name(node)); return false; } return true; } /*! * \internal * \brief Find an instance that matches a given resource by node and role * * \param[in] match_rsc Resource that instance must match (for logging only) * \param[in] rsc Clone or bundle resource to check for matching instance * \param[in] node Instance must match this node * \param[in] role If not pcmk_role_unknown, instance must match this role * \param[in] current If true, compare instance's original node and role, * otherwise compare assigned next node and role * * \return \p rsc instance matching \p node and \p role if any, otherwise NULL */ static pcmk_resource_t * find_compatible_instance_on_node(const pcmk_resource_t *match_rsc, const pcmk_resource_t *rsc, const pcmk_node_t *node, enum rsc_role_e role, bool current) { GList *instances = NULL; instances = get_instance_list(rsc); for (GList *iter = instances; iter != NULL; iter = iter->next) { pcmk_resource_t *instance = (pcmk_resource_t *) iter->data; if (pcmk__instance_matches(instance, node, role, current)) { pcmk__rsc_trace(match_rsc, "Found %s %s instance %s compatible with %s on %s", role == pcmk_role_unknown? "matching" : role2text(role), rsc->id, instance->id, match_rsc->id, pe__node_name(node)); free_instance_list(rsc, instances); // Only frees list, not contents return instance; } } free_instance_list(rsc, instances); pcmk__rsc_trace(match_rsc, "No %s %s instance found compatible with %s on %s", ((role == pcmk_role_unknown)? "matching" : role2text(role)), rsc->id, match_rsc->id, pe__node_name(node)); return NULL; } /*! 
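pcmk__instance_matches() is essentially a predicate, and find_compatible_instance_on_node() a linear scan that returns the first instance satisfying it. A self-contained sketch of the same shape, with hypothetical types and a plain string comparison standing in for pe__same_node() and the role lookup:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

enum role { role_unknown, role_started, role_promoted };

struct instance {
    const char *id;
    const char *node;   // node the instance is (or will be) on, or NULL
    enum role role;
};

static bool
instance_matches(const struct instance *inst, const char *node, enum role role)
{
    if ((role != role_unknown) && (role != inst->role)) {
        return false;                       // wrong role
    }
    if ((inst->node == NULL) || (strcmp(inst->node, node) != 0)) {
        return false;                       // not assigned to requested node
    }
    return true;
}

static const struct instance *
find_compatible_on_node(const struct instance *instances, size_t n_instances,
                        const char *node, enum role role)
{
    for (size_t i = 0; i < n_instances; i++) {
        if (instance_matches(&instances[i], node, role)) {
            return &instances[i];           // first match wins
        }
    }
    return NULL;                            // no compatible instance
}

int
main(void)
{
    const struct instance instances[] = {
        { "rsc:0", "node1", role_started  },
        { "rsc:1", "node2", role_promoted },
    };
    const struct instance *match = find_compatible_on_node(instances, 2,
                                                           "node2",
                                                           role_promoted);

    printf("match: %s\n", (match == NULL)? "none" : match->id);
    return 0;
}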
* \internal * \brief Find a clone instance or bundle container compatible with a resource * * \param[in] match_rsc Resource that instance must match * \param[in] rsc Clone or bundle resource to check for matching instance * \param[in] role If not pcmk_role_unknown, instance must match this role * \param[in] current If true, compare instance's original node and role, * otherwise compare assigned next node and role * * \return Compatible (by \p role and \p match_rsc location) instance of \p rsc * if any, otherwise NULL */ pcmk_resource_t * pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc, const pcmk_resource_t *rsc, enum rsc_role_e role, bool current) { pcmk_resource_t *instance = NULL; GList *nodes = NULL; const pcmk_node_t *node = NULL; // If match_rsc has a node, check only that node node = match_rsc->fns->location(match_rsc, NULL, current); if (node != NULL) { return find_compatible_instance_on_node(match_rsc, rsc, node, role, current); } // Otherwise check for an instance matching any of match_rsc's allowed nodes nodes = pcmk__sort_nodes(g_hash_table_get_values(match_rsc->allowed_nodes), NULL); for (GList *iter = nodes; (iter != NULL) && (instance == NULL); iter = iter->next) { instance = find_compatible_instance_on_node(match_rsc, rsc, (pcmk_node_t *) iter->data, role, current); } if (instance == NULL) { pcmk__rsc_debug(rsc, "No %s instance found compatible with %s", rsc->id, match_rsc->id); } g_list_free(nodes); return instance; } /*! * \internal * \brief Unassign an instance if mandatory ordering has no interleave match * * \param[in] first 'First' action in an ordering * \param[in] then 'Then' action in an ordering * \param[in,out] then_instance 'Then' instance that has no interleave match * \param[in] type Group of enum pcmk__action_relation_flags * \param[in] current If true, "then" action is stopped or demoted * * \return true if \p then_instance was unassigned, otherwise false */ static bool unassign_if_mandatory(const pcmk_action_t *first, const pcmk_action_t *then, pcmk_resource_t *then_instance, uint32_t type, bool current) { // Allow "then" instance to go down even without an interleave match if (current) { pcmk__rsc_trace(then->rsc, "%s has no instance to order before stopping " "or demoting %s", first->rsc->id, then_instance->id); /* If the "first" action must be runnable, but there is no "first" * instance, the "then" instance must not be allowed to come up. */ } else if (pcmk_any_flags_set(type, pcmk__ar_unrunnable_first_blocks |pcmk__ar_first_implies_then)) { pcmk__rsc_info(then->rsc, "Inhibiting %s from being active " "because there is no %s instance to interleave", then_instance->id, first->rsc->id); return pcmk__assign_resource(then_instance, NULL, true, true); } return false; } /*! 
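pcmk__find_compatible_instance() searches in two stages: if the resource to match already has a location, only that node is checked; otherwise its candidate nodes are tried in preference order and the first hit wins. A simplified sketch of that lookup order with hypothetical types (the real code sorts the allowed-node table with pcmk__sort_nodes() before iterating):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct assigned_instance {
    const char *id;
    const char *node;   // node this instance is assigned to, or NULL
};

static const struct assigned_instance *
find_on_node(const struct assigned_instance *instances, size_t n,
             const char *node)
{
    for (size_t i = 0; i < n; i++) {
        if ((instances[i].node != NULL)
            && (strcmp(instances[i].node, node) == 0)) {
            return &instances[i];
        }
    }
    return NULL;
}

/* If the matching resource's node is known, check only it; otherwise try the
 * candidate nodes in preference order. */
static const struct assigned_instance *
find_compatible(const struct assigned_instance *instances, size_t n,
                const char *assigned_node,
                const char *const *candidates, size_t n_candidates)
{
    if (assigned_node != NULL) {
        return find_on_node(instances, n, assigned_node);
    }
    for (size_t i = 0; i < n_candidates; i++) {
        const struct assigned_instance *match = find_on_node(instances, n,
                                                             candidates[i]);

        if (match != NULL) {
            return match;
        }
    }
    return NULL;
}

int
main(void)
{
    const struct assigned_instance instances[] = {
        { "rsc:0", "node1" },
        { "rsc:1", "node3" },
    };
    const char *const candidates[] = { "node2", "node3" };
    const struct assigned_instance *match = find_compatible(instances, 2, NULL,
                                                            candidates, 2);

    printf("compatible instance: %s\n", (match == NULL)? "none" : match->id);
    return 0;
}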
* \internal * \brief Find first matching action for a clone instance or bundle container * * \param[in] action Action in an interleaved ordering * \param[in] instance Clone instance or bundle container being interleaved * \param[in] action_name Action to look for * \param[in] node If not NULL, require action to be on this node * \param[in] for_first If true, \p instance is the 'first' resource in the * ordering, otherwise it is the 'then' resource * * \return First action for \p instance (or in some cases if \p instance is a * bundle container, its containerized resource) that matches * \p action_name and \p node if any, otherwise NULL */ static pcmk_action_t * find_instance_action(const pcmk_action_t *action, const pcmk_resource_t *instance, const char *action_name, const pcmk_node_t *node, bool for_first) { const pcmk_resource_t *rsc = NULL; pcmk_action_t *matching_action = NULL; /* If instance is a bundle container, sometimes we should interleave the * action for the container itself, and sometimes for the containerized * resource. * * For example, given "start bundle A then bundle B", B likely requires the * service inside A's container to be active, rather than just the * container, so we should interleave the action for A's containerized * resource. On the other hand, it's possible B's container itself requires * something from A, so we should interleave the action for B's container. * * Essentially, for 'first', we should use the containerized resource for * everything except stop, and for 'then', we should use the container for * everything except promote and demote (which can only be performed on the * containerized resource). */ if ((for_first && !pcmk__str_any_of(action->task, PCMK_ACTION_STOP, PCMK_ACTION_STOPPED, NULL)) || (!for_first && pcmk__str_any_of(action->task, PCMK_ACTION_PROMOTE, PCMK_ACTION_PROMOTED, PCMK_ACTION_DEMOTE, PCMK_ACTION_DEMOTED, NULL))) { rsc = pe__get_rsc_in_container(instance); } if (rsc == NULL) { rsc = instance; // No containerized resource, use instance itself } else { node = NULL; // Containerized actions are on bundle-created guest } matching_action = find_first_action(rsc->actions, NULL, action_name, node); if (matching_action != NULL) { return matching_action; } if (pcmk_is_set(instance->flags, pcmk_rsc_removed) || pcmk__str_any_of(action_name, PCMK_ACTION_STOP, PCMK_ACTION_DEMOTE, NULL)) { crm_trace("No %s action found for %s%s", action_name, pcmk_is_set(instance->flags, pcmk_rsc_removed)? "orphan " : "", instance->id); } else { crm_err("No %s action found for %s to interleave (bug?)", action_name, instance->id); } return NULL; } /*! * \internal * \brief Get the original action name of a bundle or clone action * * Given an action for a bundle or clone, get the original action name, * mapping notify to the action being notified, and if the instances are * primitives, mapping completion actions to the action that was completed * (for example, stopped to stop). 
* * \param[in] action Clone or bundle action to check * * \return Original action name for \p action */ static const char * orig_action_name(const pcmk_action_t *action) { // Any instance will do const pcmk_resource_t *instance = action->rsc->children->data; char *action_type = NULL; const char *action_name = action->task; enum action_tasks orig_task = pcmk_action_unspecified; if (pcmk__strcase_any_of(action->task, PCMK_ACTION_NOTIFY, PCMK_ACTION_NOTIFIED, NULL)) { // action->uuid is RSC_(confirmed-){pre,post}_notify_ACTION_INTERVAL CRM_CHECK(parse_op_key(action->uuid, NULL, &action_type, NULL), return task2text(pcmk_action_unspecified)); action_name = strstr(action_type, "_notify_"); CRM_CHECK(action_name != NULL, return task2text(pcmk_action_unspecified)); action_name += strlen("_notify_"); } orig_task = get_complex_task(instance, action_name); free(action_type); return task2text(orig_task); } /*! * \internal * \brief Update two interleaved actions according to an ordering between them * * Given information about an ordering of two interleaved actions, update the * actions' flags (and runnable_before members if appropriate) as appropriate * for the ordering. Effects may cascade to other orderings involving the * actions as well. * * \param[in,out] first 'First' action in an ordering * \param[in,out] then 'Then' action in an ordering * \param[in] node If not NULL, limit scope of ordering to this node * \param[in] filter Action flags to limit scope of certain updates (may * include pcmk_action_optional to affect only * mandatory actions, and pcmk_action_runnable to * affect only runnable actions) * \param[in] type Group of enum pcmk__action_relation_flags to apply * * \return Group of enum pcmk__updated flags indicating what was updated */ static uint32_t update_interleaved_actions(pcmk_action_t *first, pcmk_action_t *then, const pcmk_node_t *node, uint32_t filter, uint32_t type) { GList *instances = NULL; uint32_t changed = pcmk__updated_none; const char *orig_first_task = orig_action_name(first); // Stops and demotes must be interleaved with instance on current node bool current = pcmk__ends_with(first->uuid, "_" PCMK_ACTION_STOPPED "_0") || pcmk__ends_with(first->uuid, "_" PCMK_ACTION_DEMOTED "_0"); // Update the specified actions for each "then" instance individually instances = get_instance_list(then->rsc); for (GList *iter = instances; iter != NULL; iter = iter->next) { pcmk_resource_t *first_instance = NULL; pcmk_resource_t *then_instance = iter->data; pcmk_action_t *first_action = NULL; pcmk_action_t *then_action = NULL; // Find a "first" instance to interleave with this "then" instance first_instance = pcmk__find_compatible_instance(then_instance, first->rsc, pcmk_role_unknown, current); if (first_instance == NULL) { // No instance can be interleaved if (unassign_if_mandatory(first, then, then_instance, type, current)) { pcmk__set_updated_flags(changed, first, pcmk__updated_then); } continue; } first_action = find_instance_action(first, first_instance, orig_first_task, node, true); if (first_action == NULL) { continue; } then_action = find_instance_action(then, then_instance, then->task, node, false); if (then_action == NULL) { continue; } if (order_actions(first_action, then_action, type)) { pcmk__set_updated_flags(changed, first, pcmk__updated_first|pcmk__updated_then); } changed |= then_instance->cmds->update_ordered_actions( first_action, then_action, node, first_instance->cmds->action_flags(first_action, node), filter, type, then->rsc->cluster); } 
free_instance_list(then->rsc, instances); return changed; } /*! * \internal * \brief Check whether two actions in an ordering can be interleaved * * \param[in] first 'First' action in the ordering * \param[in] then 'Then' action in the ordering * * \return true if \p first and \p then can be interleaved, otherwise false */ static bool can_interleave_actions(const pcmk_action_t *first, const pcmk_action_t *then) { bool interleave = false; pcmk_resource_t *rsc = NULL; if ((first->rsc == NULL) || (then->rsc == NULL)) { crm_trace("Not interleaving %s with %s: not resource actions", first->uuid, then->uuid); return false; } if (first->rsc == then->rsc) { crm_trace("Not interleaving %s with %s: same resource", first->uuid, then->uuid); return false; } if ((first->rsc->variant < pcmk_rsc_variant_clone) || (then->rsc->variant < pcmk_rsc_variant_clone)) { crm_trace("Not interleaving %s with %s: not clones or bundles", first->uuid, then->uuid); return false; } if (pcmk__ends_with(then->uuid, "_stop_0") || pcmk__ends_with(then->uuid, "_demote_0")) { rsc = first->rsc; } else { rsc = then->rsc; } interleave = crm_is_true(g_hash_table_lookup(rsc->meta, - XML_RSC_ATTR_INTERLEAVE)); + PCMK_META_INTERLEAVE)); pcmk__rsc_trace(rsc, "'%s then %s' will %sbe interleaved (based on %s)", first->uuid, then->uuid, (interleave? "" : "not "), rsc->id); return interleave; } /*! * \internal * \brief Update non-interleaved instance actions according to an ordering * * Given information about an ordering of two non-interleaved actions, update * the actions' flags (and runnable_before members if appropriate) as * appropriate for the ordering. Effects may cascade to other orderings * involving the actions as well. * * \param[in,out] instance Clone instance or bundle container * \param[in,out] first "First" action in ordering * \param[in] then "Then" action in ordering (for \p instance's parent) * \param[in] node If not NULL, limit scope of ordering to this node * \param[in] flags Action flags for \p first for ordering purposes * \param[in] filter Action flags to limit scope of certain updates (may * include pcmk_action_optional to affect only * mandatory actions, and pcmk_action_runnable to * affect only runnable actions) * \param[in] type Group of enum pcmk__action_relation_flags to apply * * \return Group of enum pcmk__updated flags indicating what was updated */ static uint32_t update_noninterleaved_actions(pcmk_resource_t *instance, pcmk_action_t *first, const pcmk_action_t *then, const pcmk_node_t *node, uint32_t flags, uint32_t filter, uint32_t type) { pcmk_action_t *instance_action = NULL; uint32_t instance_flags = 0; uint32_t changed = pcmk__updated_none; // Check whether instance has an equivalent of "then" action instance_action = find_first_action(instance->actions, NULL, then->task, node); if (instance_action == NULL) { return changed; } // Check whether action is runnable instance_flags = instance->cmds->action_flags(instance_action, node); if (!pcmk_is_set(instance_flags, pcmk_action_runnable)) { return changed; } // If so, update actions for the instance changed = instance->cmds->update_ordered_actions(first, instance_action, node, flags, filter, type, instance->cluster); // Propagate any changes to later actions if (pcmk_is_set(changed, pcmk__updated_then)) { for (GList *after_iter = instance_action->actions_after; after_iter != NULL; after_iter = after_iter->next) { pcmk__related_action_t *after = after_iter->data; pcmk__update_action_for_orderings(after->action, instance->cluster); } } return changed; } 
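can_interleave_actions() above ultimately comes down to reading one meta-attribute: the interleave value stored as a string in a resource's meta table, taken from the 'first' resource when the 'then' action is a stop or demote and from the 'then' resource otherwise. A minimal GLib sketch of that lookup; parse_true() is a simplified stand-in for crm_is_true(), and an absent or non-true value means "do not interleave".

#include <glib.h>
#include <stdbool.h>
#include <stdio.h>
#include <strings.h>

#define META_INTERLEAVE "interleave"

/* Simplified stand-in for crm_is_true() */
static bool
parse_true(const char *value)
{
    return (value != NULL)
           && ((strcasecmp(value, "true") == 0)
               || (strcasecmp(value, "yes") == 0)
               || (strcasecmp(value, "on") == 0)
               || (strcasecmp(value, "1") == 0));
}

int
main(void)
{
    GHashTable *meta = g_hash_table_new(g_str_hash, g_str_equal);
    bool interleave = false;

    // Meta-attributes are stored as name/value strings
    g_hash_table_insert(meta, (gpointer) META_INTERLEAVE, (gpointer) "true");

    interleave = parse_true(g_hash_table_lookup(meta, META_INTERLEAVE));
    printf("interleave: %s\n", interleave? "yes" : "no");

    g_hash_table_destroy(meta);
    return 0;
}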
/*! * \internal * \brief Update two actions according to an ordering between them * * Given information about an ordering of two clone or bundle actions, update * the actions' flags (and runnable_before members if appropriate) as * appropriate for the ordering. Effects may cascade to other orderings * involving the actions as well. * * \param[in,out] first 'First' action in an ordering * \param[in,out] then 'Then' action in an ordering * \param[in] node If not NULL, limit scope of ordering to this node * (only used when interleaving instances) * \param[in] flags Action flags for \p first for ordering purposes * \param[in] filter Action flags to limit scope of certain updates (may * include pcmk_action_optional to affect only * mandatory actions, and pcmk_action_runnable to * affect only runnable actions) * \param[in] type Group of enum pcmk__action_relation_flags to apply * \param[in,out] scheduler Scheduler data * * \return Group of enum pcmk__updated flags indicating what was updated */ uint32_t pcmk__instance_update_ordered_actions(pcmk_action_t *first, pcmk_action_t *then, const pcmk_node_t *node, uint32_t flags, uint32_t filter, uint32_t type, pcmk_scheduler_t *scheduler) { CRM_ASSERT((first != NULL) && (then != NULL) && (scheduler != NULL)); if (then->rsc == NULL) { return pcmk__updated_none; } else if (can_interleave_actions(first, then)) { return update_interleaved_actions(first, then, node, filter, type); } else { uint32_t changed = pcmk__updated_none; GList *instances = get_instance_list(then->rsc); // Update actions for the clone or bundle resource itself changed |= pcmk__update_ordered_actions(first, then, node, flags, filter, type, scheduler); // Update the 'then' clone instances or bundle containers individually for (GList *iter = instances; iter != NULL; iter = iter->next) { pcmk_resource_t *instance = iter->data; changed |= update_noninterleaved_actions(instance, first, then, node, flags, filter, type); } free_instance_list(then->rsc, instances); return changed; } } #define pe__clear_action_summary_flags(flags, action, flag) do { \ flags = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \ "Action summary", action->rsc->id, \ flags, flag, #flag); \ } while (0) /*! 
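The pe__clear_action_summary_flags macro above is wrapped in do { ... } while (0), the standard idiom for multi-statement macros. A generic illustration of why the wrapper matters (CLEAR_FLAG_GOOD and CLEAR_FLAG_BAD are made-up names): the wrapped form expands to a single statement and so composes safely with an unbraced if/else, whereas the unwrapped form would leave the else dangling and fail to compile.

#include <stdio.h>

// Without the wrapper, only the first statement would be guarded by an "if"
#define CLEAR_FLAG_BAD(flags, flag) \
    (flags) &= ~(flag); printf("cleared 0x%x\n", (unsigned int) (flag));

// With the wrapper, the expansion is one statement, so if/else works as expected
#define CLEAR_FLAG_GOOD(flags, flag) do {                       \
        (flags) &= ~(flag);                                     \
        printf("cleared 0x%x\n", (unsigned int) (flag));        \
    } while (0)

int
main(void)
{
    unsigned int flags = 0x3;

    /* Replacing CLEAR_FLAG_GOOD with CLEAR_FLAG_BAD here would not compile:
     * the macro would expand to two statements and orphan the "else". */
    if (flags & 0x1)
        CLEAR_FLAG_GOOD(flags, 0x1);
    else
        printf("flag 0x1 was not set\n");

    printf("flags now 0x%x\n", flags);
    return 0;
}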
* \internal * \brief Return action flags for a given clone or bundle action * * \param[in,out] action Action for a clone or bundle * \param[in] instances Clone instances or bundle containers * \param[in] node If not NULL, limit effects to this node * * \return Flags appropriate to \p action on \p node */ uint32_t pcmk__collective_action_flags(pcmk_action_t *action, const GList *instances, const pcmk_node_t *node) { bool any_runnable = false; const char *action_name = orig_action_name(action); // Set original assumptions (optional and runnable may be cleared below) uint32_t flags = pcmk_action_optional |pcmk_action_runnable |pcmk_action_pseudo; for (const GList *iter = instances; iter != NULL; iter = iter->next) { const pcmk_resource_t *instance = iter->data; const pcmk_node_t *instance_node = NULL; pcmk_action_t *instance_action = NULL; uint32_t instance_flags; // Node is relevant only to primitive instances if (instance->variant == pcmk_rsc_variant_primitive) { instance_node = node; } instance_action = find_first_action(instance->actions, NULL, action_name, instance_node); if (instance_action == NULL) { pcmk__rsc_trace(action->rsc, "%s has no %s action on %s", instance->id, action_name, pe__node_name(node)); continue; } pcmk__rsc_trace(action->rsc, "%s has %s for %s on %s", instance->id, instance_action->uuid, action_name, pe__node_name(node)); instance_flags = instance->cmds->action_flags(instance_action, node); // If any instance action is mandatory, so is the collective action if (pcmk_is_set(flags, pcmk_action_optional) && !pcmk_is_set(instance_flags, pcmk_action_optional)) { pcmk__rsc_trace(instance, "%s is mandatory because %s is", action->uuid, instance_action->uuid); pe__clear_action_summary_flags(flags, action, pcmk_action_optional); pe__clear_action_flags(action, pcmk_action_optional); } // If any instance action is runnable, so is the collective action if (pcmk_is_set(instance_flags, pcmk_action_runnable)) { any_runnable = true; } } if (!any_runnable) { pcmk__rsc_trace(action->rsc, "%s is not runnable because no instance can run %s", action->uuid, action_name); pe__clear_action_summary_flags(flags, action, pcmk_action_runnable); if (node == NULL) { pe__clear_action_flags(action, pcmk_action_runnable); } } return flags; } diff --git a/lib/pacemaker/pcmk_sched_probes.c b/lib/pacemaker/pcmk_sched_probes.c index 4df13d98d8..b3f2b57cc7 100644 --- a/lib/pacemaker/pcmk_sched_probes.c +++ b/lib/pacemaker/pcmk_sched_probes.c @@ -1,903 +1,903 @@ /* - * Copyright 2004-2023 the Pacemaker project contributors + * Copyright 2004-2024 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include "libpacemaker_private.h" /*! 
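Before moving into the probe scheduler, the flag-summary logic in pcmk__collective_action_flags() above deserves a small sketch: the collective action starts out optional and runnable, loses "optional" as soon as any instance action is mandatory, and keeps "runnable" only if at least one instance action is runnable. The flags and per-instance data below are hypothetical.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum action_flags {
    action_optional = (1 << 0),
    action_runnable = (1 << 1),
};

static uint32_t
summarize(const uint32_t *instance_flags, size_t n)
{
    uint32_t flags = action_optional|action_runnable;  // initial assumptions
    bool any_runnable = false;

    for (size_t i = 0; i < n; i++) {
        if ((instance_flags[i] & action_optional) == 0) {
            flags &= ~action_optional;  // any mandatory instance action makes
        }                               // the collective action mandatory
        if (instance_flags[i] & action_runnable) {
            any_runnable = true;
        }
    }
    if (!any_runnable) {
        flags &= ~action_runnable;      // no instance can actually run it
    }
    return flags;
}

int
main(void)
{
    const uint32_t instance_flags[] = {
        action_optional,                // optional but not runnable
        action_runnable,                // mandatory and runnable
    };
    uint32_t flags = summarize(instance_flags, 2);

    printf("optional=%d runnable=%d\n",
           (flags & action_optional) != 0, (flags & action_runnable) != 0);
    return 0;
}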
* \internal * \brief Add the expected result to a newly created probe * * \param[in,out] probe Probe action to add expected result to * \param[in] rsc Resource that probe is for * \param[in] node Node that probe will run on */ static void add_expected_result(pcmk_action_t *probe, const pcmk_resource_t *rsc, const pcmk_node_t *node) { // Check whether resource is currently active on node pcmk_node_t *running = pe_find_node_id(rsc->running_on, node->details->id); // The expected result is what we think the resource's current state is if (running == NULL) { pe__add_action_expected_result(probe, CRM_EX_NOT_RUNNING); } else if (rsc->role == pcmk_role_promoted) { pe__add_action_expected_result(probe, CRM_EX_PROMOTED); } } /*! * \internal * \brief Create any needed probes on a node for a list of resources * * \param[in,out] rscs List of resources to create probes for * \param[in,out] node Node to create probes on * * \return true if any probe was created, otherwise false */ bool pcmk__probe_resource_list(GList *rscs, pcmk_node_t *node) { bool any_created = false; for (GList *iter = rscs; iter != NULL; iter = iter->next) { pcmk_resource_t *rsc = (pcmk_resource_t *) iter->data; if (rsc->cmds->create_probe(rsc, node)) { any_created = true; } } return any_created; } /*! * \internal * \brief Order one resource's start after another's start-up probe * * \param[in,out] rsc1 Resource that might get start-up probe * \param[in] rsc2 Resource that might be started */ static void probe_then_start(pcmk_resource_t *rsc1, pcmk_resource_t *rsc2) { if ((rsc1->allocated_to != NULL) && (g_hash_table_lookup(rsc1->known_on, rsc1->allocated_to->details->id) == NULL)) { pcmk__new_ordering(rsc1, pcmk__op_key(rsc1->id, PCMK_ACTION_MONITOR, 0), NULL, rsc2, pcmk__op_key(rsc2->id, PCMK_ACTION_START, 0), NULL, pcmk__ar_ordered, rsc1->cluster); } } /*! * \internal * \brief Check whether a guest resource will stop * * \param[in] node Guest node to check * * \return true if guest resource will likely stop, otherwise false */ static bool guest_resource_will_stop(const pcmk_node_t *node) { const pcmk_resource_t *guest_rsc = node->details->remote_rsc->container; /* Ideally, we'd check whether the guest has a required stop, but that * information doesn't exist yet, so approximate it ... */ return node->details->remote_requires_reset || node->details->unclean || pcmk_is_set(guest_rsc->flags, pcmk_rsc_failed) || (guest_rsc->next_role == pcmk_role_stopped) // Guest is moving || ((guest_rsc->role > pcmk_role_stopped) && (guest_rsc->allocated_to != NULL) && (pe_find_node(guest_rsc->running_on, guest_rsc->allocated_to->details->uname) == NULL)); } /*! * \internal * \brief Create a probe action for a resource on a node * * \param[in,out] rsc Resource to create probe for * \param[in,out] node Node to create probe on * * \return Newly created probe action */ static pcmk_action_t * probe_action(pcmk_resource_t *rsc, pcmk_node_t *node) { pcmk_action_t *probe = NULL; char *key = pcmk__op_key(rsc->id, PCMK_ACTION_MONITOR, 0); crm_debug("Scheduling probe of %s %s on %s", role2text(rsc->role), rsc->id, pe__node_name(node)); probe = custom_action(rsc, key, PCMK_ACTION_MONITOR, node, FALSE, rsc->cluster); pe__clear_action_flags(probe, pcmk_action_optional); pcmk__order_vs_unfence(rsc, node, probe, pcmk__ar_ordered); add_expected_result(probe, rsc, node); return probe; } /*!
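add_expected_result() above encodes the idea that a probe is expected to report whatever we already believe the resource's current state to be, so a matching result means nothing changed. A simplified sketch of that mapping; the numeric values follow the OCF convention that the CRM_EX_* codes mirror (7 for not running, 8 for running promoted), and unlike the real function this sketch also returns an explicit value for the plain running case.

#include <stdbool.h>
#include <stdio.h>

enum probe_result {
    result_ok          = 0,   // running unpromoted (CRM_EX_OK)
    result_not_running = 7,   // stopped (CRM_EX_NOT_RUNNING)
    result_promoted    = 8,   // running in the promoted role (CRM_EX_PROMOTED)
};

static enum probe_result
expected_probe_result(bool running_here, bool promoted_here)
{
    if (!running_here) {
        return result_not_running;
    }
    if (promoted_here) {
        return result_promoted;
    }
    return result_ok;   // the real function adds no explicit expectation here
}

int
main(void)
{
    printf("expected result for a promoted instance: %d\n",
           expected_probe_result(true, true));
    return 0;
}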
* \internal * \brief Create probes for a resource on a node, if needed * * \param[in,out] rsc Resource to create probe for * \param[in,out] node Node to create probe on * * \return true if any probe was created, otherwise false */ bool pcmk__probe_rsc_on_node(pcmk_resource_t *rsc, pcmk_node_t *node) { uint32_t flags = pcmk__ar_ordered; pcmk_action_t *probe = NULL; pcmk_node_t *allowed = NULL; pcmk_resource_t *top = uber_parent(rsc); const char *reason = NULL; CRM_ASSERT((rsc != NULL) && (node != NULL)); if (!pcmk_is_set(rsc->cluster->flags, pcmk_sched_probe_resources)) { reason = "start-up probes are disabled"; goto no_probe; } if (pe__is_guest_or_remote_node(node)) { const char *class = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); if (pcmk__str_eq(class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_none)) { reason = "Pacemaker Remote nodes cannot run stonith agents"; goto no_probe; } else if (pe__is_guest_node(node) && pe__resource_contains_guest_node(rsc->cluster, rsc)) { reason = "guest nodes cannot run resources containing guest nodes"; goto no_probe; } else if (rsc->is_remote_node) { reason = "Pacemaker Remote nodes cannot host remote connections"; goto no_probe; } } // If this is a collective resource, probes are created for its children if (rsc->children != NULL) { return pcmk__probe_resource_list(rsc->children, node); } if ((rsc->container != NULL) && !rsc->is_remote_node) { reason = "resource is inside a container"; goto no_probe; } else if (pcmk_is_set(rsc->flags, pcmk_rsc_removed)) { reason = "resource is orphaned"; goto no_probe; } else if (g_hash_table_lookup(rsc->known_on, node->details->id) != NULL) { reason = "resource state is already known"; goto no_probe; } allowed = g_hash_table_lookup(rsc->allowed_nodes, node->details->id); if (rsc->exclusive_discover || top->exclusive_discover) { // Exclusive discovery is enabled ... if (allowed == NULL) { // ... but this node is not allowed to run the resource reason = "resource has exclusive discovery but is not allowed " "on node"; goto no_probe; } else if (allowed->rsc_discover_mode != pcmk_probe_exclusive) { // ... but no constraint marks this node for discovery of resource reason = "resource has exclusive discovery but is not enabled " "on node"; goto no_probe; } } if (allowed == NULL) { allowed = node; } if (allowed->rsc_discover_mode == pcmk_probe_never) { reason = "node has discovery disabled"; goto no_probe; } if (pe__is_guest_node(node)) { pcmk_resource_t *guest = node->details->remote_rsc->container; if (guest->role == pcmk_role_stopped) { // The guest is stopped, so we know no resource is active there reason = "node's guest is stopped"; probe_then_start(guest, top); goto no_probe; } else if (guest_resource_will_stop(node)) { reason = "node's guest will stop"; // Order resource start after guest stop (in case it's restarting) pcmk__new_ordering(guest, pcmk__op_key(guest->id, PCMK_ACTION_STOP, 0), NULL, top, pcmk__op_key(top->id, PCMK_ACTION_START, 0), NULL, pcmk__ar_ordered, rsc->cluster); goto no_probe; } } // We've eliminated all cases where a probe is not needed, so now it is probe = probe_action(rsc, node); /* Below, we will order the probe relative to start or reload. If this is a * clone instance, the start or reload is for the entire clone rather than * just the instance. Otherwise, the start or reload is for the resource * itself.
*/ if (!pe_rsc_is_clone(top)) { top = rsc; } /* Prevent a start if the resource can't be probed, but don't cause the * resource or entire clone to stop if already active. */ if (!pcmk_is_set(probe->flags, pcmk_action_runnable) && (top->running_on == NULL)) { pe__set_order_flags(flags, pcmk__ar_unrunnable_first_blocks); } // Start or reload after probing the resource pcmk__new_ordering(rsc, NULL, probe, top, pcmk__op_key(top->id, PCMK_ACTION_START, 0), NULL, flags, rsc->cluster); pcmk__new_ordering(rsc, NULL, probe, top, reload_key(rsc), NULL, pcmk__ar_ordered, rsc->cluster); return true; no_probe: pcmk__rsc_trace(rsc, "Skipping probe for %s on %s because %s", rsc->id, node->details->id, reason); return false; } /*! * \internal * \brief Check whether a probe should be ordered before another action * * \param[in] probe Probe action to check * \param[in] then Other action to check * * \return true if \p probe should be ordered before \p then, otherwise false */ static bool probe_needed_before_action(const pcmk_action_t *probe, const pcmk_action_t *then) { // Probes on a node are performed after unfencing it, not before if (pcmk__str_eq(then->task, PCMK_ACTION_STONITH, pcmk__str_none) && pe__same_node(probe->node, then->node)) { const char *op = g_hash_table_lookup(then->meta, "stonith_action"); if (pcmk__str_eq(op, PCMK_ACTION_ON, pcmk__str_casei)) { return false; } } // Probes should be done on a node before shutting it down if (pcmk__str_eq(then->task, PCMK_ACTION_DO_SHUTDOWN, pcmk__str_none) && (probe->node != NULL) && (then->node != NULL) && !pe__same_node(probe->node, then->node)) { return false; } // Otherwise probes should always be done before any other action return true; } /*! * \internal * \brief Add implicit "probe then X" orderings for "stop then X" orderings * * If the state of a resource is not known yet, a probe will be scheduled, * expecting a "not running" result. If the probe fails, a stop will not be * scheduled until the next transition. Thus, if there are ordering constraints * like "stop this resource then do something else that's not for the same * resource", add implicit "probe this resource then do something" equivalents * so the relation is upheld until we know whether a stop is needed. * * \param[in,out] scheduler Scheduler data */ static void add_probe_orderings_for_stops(pcmk_scheduler_t *scheduler) { for (GList *iter = scheduler->ordering_constraints; iter != NULL; iter = iter->next) { pcmk__action_relation_t *order = iter->data; uint32_t order_flags = pcmk__ar_ordered; GList *probes = NULL; GList *then_actions = NULL; pcmk_action_t *first = NULL; pcmk_action_t *then = NULL; // Skip disabled orderings if (order->flags == pcmk__ar_none) { continue; } // Skip non-resource orderings, and orderings for the same resource if ((order->rsc1 == NULL) || (order->rsc1 == order->rsc2)) { continue; } // Skip invalid orderings (shouldn't be possible) first = order->action1; then = order->action2; if (((first == NULL) && (order->task1 == NULL)) || ((then == NULL) && (order->task2 == NULL))) { continue; } // Skip orderings for first actions other than stop if ((first != NULL) && !pcmk__str_eq(first->task, PCMK_ACTION_STOP, pcmk__str_none)) { continue; } else if ((first == NULL) && !pcmk__ends_with(order->task1, "_" PCMK_ACTION_STOP "_0")) { continue; } /* Do not imply a probe ordering for a resource inside of a stopping * container. Otherwise, it might introduce a transition loop, since a * probe could be scheduled after the container starts again. 
*/ if ((order->rsc2 != NULL) && (order->rsc1->container == order->rsc2)) { if ((then != NULL) && pcmk__str_eq(then->task, PCMK_ACTION_STOP, pcmk__str_none)) { continue; } else if ((then == NULL) && pcmk__ends_with(order->task2, "_" PCMK_ACTION_STOP "_0")) { continue; } } // Preserve certain order options for future filtering if (pcmk_is_set(order->flags, pcmk__ar_if_first_unmigratable)) { pe__set_order_flags(order_flags, pcmk__ar_if_first_unmigratable); } if (pcmk_is_set(order->flags, pcmk__ar_if_on_same_node)) { pe__set_order_flags(order_flags, pcmk__ar_if_on_same_node); } // Preserve certain order types for future filtering if ((order->flags == pcmk__ar_if_required_on_same_node) || (order->flags == pcmk__ar_if_on_same_node_or_target)) { order_flags = order->flags; } // List all scheduled probes for the first resource probes = pe__resource_actions(order->rsc1, NULL, PCMK_ACTION_MONITOR, FALSE); if (probes == NULL) { // There aren't any continue; } // List all relevant "then" actions if (then != NULL) { then_actions = g_list_prepend(NULL, then); } else if (order->rsc2 != NULL) { then_actions = find_actions(order->rsc2->actions, order->task2, NULL); if (then_actions == NULL) { // There aren't any g_list_free(probes); continue; } } crm_trace("Implying 'probe then' orderings for '%s then %s' " "(id=%d, type=%.6x)", ((first == NULL)? order->task1 : first->uuid), ((then == NULL)? order->task2 : then->uuid), order->id, order->flags); for (GList *probe_iter = probes; probe_iter != NULL; probe_iter = probe_iter->next) { pcmk_action_t *probe = (pcmk_action_t *) probe_iter->data; for (GList *then_iter = then_actions; then_iter != NULL; then_iter = then_iter->next) { pcmk_action_t *then = (pcmk_action_t *) then_iter->data; if (probe_needed_before_action(probe, then)) { order_actions(probe, then, order_flags); } } } g_list_free(then_actions); g_list_free(probes); } } /*! * \internal * \brief Add necessary orderings between a probe and starts of clone instances, * in addition to the ordering with the parent resource added upon creating * the probe. * * \param[in,out] probe Probe as 'first' action in an ordering * \param[in,out] after 'then' action wrapper in the ordering */ static void add_start_orderings_for_probe(pcmk_action_t *probe, pcmk__related_action_t *after) { uint32_t flags = pcmk__ar_ordered|pcmk__ar_unrunnable_first_blocks; /* Although the ordering between the probe of the clone instance and the * start of its parent has been added in pcmk__probe_rsc_on_node(), we * avoided enforcing `pcmk__ar_unrunnable_first_blocks` order type for that * as long as any of the clone instances are running to prevent them from * being unexpectedly stopped. * * On the other hand, we still need to prevent any inactive instances from * starting unless the probe is runnable so that we don't risk starting too * many instances before we know the state on all nodes. */ if ((after->action->rsc->variant <= pcmk_rsc_variant_group) || pcmk_is_set(probe->flags, pcmk_action_runnable) // The order type is already enforced for its parent.
|| pcmk_is_set(after->type, pcmk__ar_unrunnable_first_blocks) || (pe__const_top_resource(probe->rsc, false) != after->action->rsc) || !pcmk__str_eq(after->action->task, PCMK_ACTION_START, pcmk__str_none)) { return; } crm_trace("Adding probe start orderings for 'unrunnable %s@%s " "then instances of %s@%s'", probe->uuid, pe__node_name(probe->node), after->action->uuid, pe__node_name(after->action->node)); for (GList *then_iter = after->action->actions_after; then_iter != NULL; then_iter = then_iter->next) { pcmk__related_action_t *then = then_iter->data; if (then->action->rsc->running_on || (pe__const_top_resource(then->action->rsc, false) != after->action->rsc) || !pcmk__str_eq(then->action->task, PCMK_ACTION_START, pcmk__str_none)) { continue; } crm_trace("Adding probe start ordering for 'unrunnable %s@%s " "then %s@%s' (type=%#.6x)", probe->uuid, pe__node_name(probe->node), then->action->uuid, pe__node_name(then->action->node), flags); /* Prevent the instance from starting if the instance can't, but don't * cause any other instances to stop if already active. */ order_actions(probe, then->action, flags); } return; } /*! * \internal * \brief Order probes before restarts and re-promotes * * If a given ordering is a "probe then start" or "probe then promote" ordering, * add an implicit "probe then stop/demote" ordering in case the action is part * of a restart/re-promote, and do the same recursively for all actions ordered * after the "then" action. * * \param[in,out] probe Probe as 'first' action in an ordering * \param[in,out] after 'then' action in the ordering */ static void add_restart_orderings_for_probe(pcmk_action_t *probe, pcmk_action_t *after) { GList *iter = NULL; bool interleave = false; pcmk_resource_t *compatible_rsc = NULL; // Validate that this is a resource probe followed by some action if ((after == NULL) || (probe == NULL) || (probe->rsc == NULL) || (probe->rsc->variant != pcmk_rsc_variant_primitive) || !pcmk__str_eq(probe->task, PCMK_ACTION_MONITOR, pcmk__str_none)) { return; } // Avoid running into any possible loop if (pcmk_is_set(after->flags, pcmk_action_detect_loop)) { return; } pe__set_action_flags(after, pcmk_action_detect_loop); crm_trace("Adding probe restart orderings for '%s@%s then %s@%s'", probe->uuid, pe__node_name(probe->node), after->uuid, pe__node_name(after->node)); /* Add restart orderings if "then" is for a different primitive. * Orderings for collective resources will be added later. */ if ((after->rsc != NULL) && (after->rsc->variant == pcmk_rsc_variant_primitive) && (probe->rsc != after->rsc)) { GList *then_actions = NULL; if (pcmk__str_eq(after->task, PCMK_ACTION_START, pcmk__str_none)) { then_actions = pe__resource_actions(after->rsc, NULL, PCMK_ACTION_STOP, FALSE); } else if (pcmk__str_eq(after->task, PCMK_ACTION_PROMOTE, pcmk__str_none)) { then_actions = pe__resource_actions(after->rsc, NULL, PCMK_ACTION_DEMOTE, FALSE); } for (iter = then_actions; iter != NULL; iter = iter->next) { pcmk_action_t *then = (pcmk_action_t *) iter->data; // Skip pseudo-actions (for example, those implied by fencing) if (!pcmk_is_set(then->flags, pcmk_action_pseudo)) { order_actions(probe, then, pcmk__ar_ordered); } } g_list_free(then_actions); } /* Detect whether "then" is an interleaved clone action. For these, we want * to add orderings only for the relevant instance.
*/ if ((after->rsc != NULL) && (after->rsc->variant > pcmk_rsc_variant_group)) { const char *interleave_s = g_hash_table_lookup(after->rsc->meta, - XML_RSC_ATTR_INTERLEAVE); + PCMK_META_INTERLEAVE); interleave = crm_is_true(interleave_s); if (interleave) { compatible_rsc = pcmk__find_compatible_instance(probe->rsc, after->rsc, pcmk_role_unknown, false); } } /* Now recursively do the same for all actions ordered after "then". This * also handles collective resources since the collective action will be * ordered before its individual instances' actions. */ for (iter = after->actions_after; iter != NULL; iter = iter->next) { pcmk__related_action_t *after_wrapper = iter->data; /* pcmk__ar_first_implies_then is the reason why a required A.start * implies/enforces B.start to be required too, which is the cause of * B.restart/re-promote. * * Not sure about pcmk__ar_first_implies_same_node_then though. It's now * only used for unfencing case, which tends to introduce transition * loops... */ if (!pcmk_is_set(after_wrapper->type, pcmk__ar_first_implies_then)) { /* The order type between a group/clone and its child such as * B.start-> B_child.start is: * pcmk__ar_then_implies_first_graphed * |pcmk__ar_unrunnable_first_blocks * * Proceed through the ordering chain and build dependencies with * its children. */ if ((after->rsc == NULL) || (after->rsc->variant < pcmk_rsc_variant_group) || (probe->rsc->parent == after->rsc) || (after_wrapper->action->rsc == NULL) || (after_wrapper->action->rsc->variant > pcmk_rsc_variant_group) || (after->rsc != after_wrapper->action->rsc->parent)) { continue; } /* Proceed to the children of a group or a non-interleaved clone. * For an interleaved clone, proceed only to the relevant child. */ if ((after->rsc->variant > pcmk_rsc_variant_group) && interleave && ((compatible_rsc == NULL) || (compatible_rsc != after_wrapper->action->rsc))) { continue; } } crm_trace("Recursively adding probe restart orderings for " "'%s@%s then %s@%s' (type=%#.6x)", after->uuid, pe__node_name(after->node), after_wrapper->action->uuid, pe__node_name(after_wrapper->action->node), after_wrapper->type); add_restart_orderings_for_probe(probe, after_wrapper->action); } } /*! * \internal * \brief Clear the tracking flag on all scheduled actions * * \param[in,out] scheduler Scheduler data */ static void clear_actions_tracking_flag(pcmk_scheduler_t *scheduler) { for (GList *iter = scheduler->actions; iter != NULL; iter = iter->next) { pcmk_action_t *action = iter->data; pe__clear_action_flags(action, pcmk_action_detect_loop); } } /*! 
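add_restart_orderings_for_probe() above recurses through the ordering graph and relies on the pcmk_action_detect_loop flag to avoid revisiting an action, with clear_actions_tracking_flag() resetting the marks so the next probe starts fresh. A minimal sketch of that visited-flag traversal over a hypothetical adjacency matrix (the cycle between actions 1 and 2 is handled without recursing forever):

#include <stdbool.h>
#include <stdio.h>

#define N_ACTIONS 4

// Hypothetical ordering graph: 0 -> 1, 1 -> 2, 2 -> 1 (a cycle), 2 -> 3
static const int edges[N_ACTIONS][N_ACTIONS] = {
    { 0, 1, 0, 0 },
    { 0, 0, 1, 0 },
    { 0, 1, 0, 1 },
    { 0, 0, 0, 0 },
};

static bool visited[N_ACTIONS];     // like the pcmk_action_detect_loop flag

static void
visit(int action)
{
    if (visited[action]) {
        return;                     // avoid running into a possible loop
    }
    visited[action] = true;
    printf("ordering probe before action %d\n", action);

    for (int next = 0; next < N_ACTIONS; next++) {
        if (edges[action][next]) {
            visit(next);
        }
    }
}

static void
clear_tracking_flags(void)
{
    for (int i = 0; i < N_ACTIONS; i++) {
        visited[i] = false;         // reset before the next traversal
    }
}

int
main(void)
{
    visit(0);
    clear_tracking_flags();
    return 0;
}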
* \internal * \brief Add start and restart orderings for probes scheduled for a resource * * \param[in,out] data Resource whose probes should be ordered * \param[in] user_data Unused */ static void add_start_restart_orderings_for_rsc(gpointer data, gpointer user_data) { pcmk_resource_t *rsc = data; GList *probes = NULL; // For collective resources, order each instance recursively if (rsc->variant != pcmk_rsc_variant_primitive) { g_list_foreach(rsc->children, add_start_restart_orderings_for_rsc, NULL); return; } // Find all probes for given resource probes = pe__resource_actions(rsc, NULL, PCMK_ACTION_MONITOR, FALSE); // Add probe restart orderings for each probe found for (GList *iter = probes; iter != NULL; iter = iter->next) { pcmk_action_t *probe = (pcmk_action_t *) iter->data; for (GList *then_iter = probe->actions_after; then_iter != NULL; then_iter = then_iter->next) { pcmk__related_action_t *then = then_iter->data; add_start_orderings_for_probe(probe, then); add_restart_orderings_for_probe(probe, then->action); clear_actions_tracking_flag(rsc->cluster); } } g_list_free(probes); } /*! * \internal * \brief Add "A then probe B" orderings for "A then B" orderings * * \param[in,out] scheduler Scheduler data * * \note This function is currently disabled (see next comment). */ static void order_then_probes(pcmk_scheduler_t *scheduler) { #if 0 /* Given an ordering "A then B", we would prefer to wait for A to be started * before probing B. * * For example, if A is a filesystem which B can't even run without, it * would be helpful if the author of B's agent could assume that A is * running before B.monitor will be called. * * However, we can't _only_ probe after A is running, otherwise we wouldn't * detect the state of B if A could not be started. We can't even do an * opportunistic version of this, because B may be moving: * * A.stop -> A.start -> B.probe -> B.stop -> B.start * * and if we add B.stop -> A.stop here, we get a loop: * * A.stop -> A.start -> B.probe -> B.stop -> A.stop * * We could kill the "B.probe -> B.stop" dependency, but that could mean * stopping B "too" soon, because B.start must wait for the probe, and * we don't want to stop B if we can't start it. * * We could add the ordering only if A is an anonymous clone with * clone-max == node-max (since we'll never be moving it). However, we could * still be stopping one instance at the same time as starting another. * * The complexity of checking for allowed conditions combined with the ever * narrowing use case suggests that this code should remain disabled until * someone gets smarter. 
*/ for (GList *iter = scheduler->resources; iter != NULL; iter = iter->next) { pcmk_resource_t *rsc = (pcmk_resource_t *) iter->data; pcmk_action_t *start = NULL; GList *actions = NULL; GList *probes = NULL; actions = pe__resource_actions(rsc, NULL, PCMK_ACTION_START, FALSE); if (actions) { start = actions->data; g_list_free(actions); } if (start == NULL) { crm_debug("No start action for %s", rsc->id); continue; } probes = pe__resource_actions(rsc, NULL, PCMK_ACTION_MONITOR, FALSE); for (actions = start->actions_before; actions != NULL; actions = actions->next) { pcmk__related_action_t *before = actions->data; pcmk_action_t *first = before->action; pcmk_resource_t *first_rsc = first->rsc; if (first->required_runnable_before) { for (GList *clone_actions = first->actions_before; clone_actions != NULL; clone_actions = clone_actions->next) { before = clone_actions->data; crm_trace("Testing '%s then %s' for %s", first->uuid, before->action->uuid, start->uuid); CRM_ASSERT(before->action->rsc != NULL); first_rsc = before->action->rsc; break; } } else if (!pcmk__str_eq(first->task, PCMK_ACTION_START, pcmk__str_none)) { crm_trace("Not a start op %s for %s", first->uuid, start->uuid); } if (first_rsc == NULL) { continue; } else if (pe__const_top_resource(first_rsc, false) == pe__const_top_resource(start->rsc, false)) { crm_trace("Same parent %s for %s", first_rsc->id, start->uuid); continue; } else if (!pe_rsc_is_clone(pe__const_top_resource(first_rsc, false))) { crm_trace("Not a clone %s for %s", first_rsc->id, start->uuid); continue; } crm_debug("Applying %s before %s %d", first->uuid, start->uuid, pe__const_top_resource(first_rsc, false)->variant); for (GList *probe_iter = probes; probe_iter != NULL; probe_iter = probe_iter->next) { pcmk_action_t *probe = (pcmk_action_t *) probe_iter->data; crm_debug("Ordering %s before %s", first->uuid, probe->uuid); order_actions(first, probe, pcmk__ar_ordered); } } } #endif } void pcmk__order_probes(pcmk_scheduler_t *scheduler) { // Add orderings for "probe then X" g_list_foreach(scheduler->resources, add_start_restart_orderings_for_rsc, NULL); add_probe_orderings_for_stops(scheduler); order_then_probes(scheduler); } /*! * \internal * \brief Schedule any probes needed * * \param[in,out] scheduler Scheduler data * * \note This may also schedule fencing of failed remote nodes. */ void pcmk__schedule_probes(pcmk_scheduler_t *scheduler) { // Schedule probes on each node in the cluster as needed for (GList *iter = scheduler->nodes; iter != NULL; iter = iter->next) { pcmk_node_t *node = (pcmk_node_t *) iter->data; const char *probed = NULL; if (!node->details->online) { // Don't probe offline nodes if (pcmk__is_failed_remote_node(node)) { pe_fence_node(scheduler, node, "the connection is unrecoverable", FALSE); } continue; } else if (node->details->unclean) { // ... or nodes that need fencing continue; } else if (!node->details->rsc_discovery_enabled) { // The user requested that probes not be done on this node continue; } /* This is no longer needed for live clusters, since the probe_complete * node attribute will never be in the CIB. However this is still useful * for processing old saved CIBs (< 1.1.14), including the * reprobe-target_rc regression test. 
*/ probed = pe_node_attribute_raw(node, CRM_OP_PROBED); if (probed != NULL && crm_is_true(probed) == FALSE) { pcmk_action_t *probe_op = NULL; probe_op = custom_action(NULL, crm_strdup_printf("%s-%s", CRM_OP_REPROBE, node->details->uname), CRM_OP_REPROBE, node, FALSE, scheduler); add_hash_param(probe_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); continue; } // Probe each resource in the cluster on this node, as needed pcmk__probe_resource_list(scheduler->resources, node); } }
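Finally, pcmk__schedule_probes() above is mostly a filter over the node list: offline nodes (which may instead need fencing), unclean nodes, and nodes with resource discovery disabled are skipped, and every resource is probed on whatever remains. A standalone sketch of that filter with hypothetical node data:

#include <stdbool.h>
#include <stdio.h>

struct node {
    const char *name;
    bool online;
    bool unclean;
    bool discovery_enabled;
};

static void
schedule_probes(const struct node *nodes, int n_nodes)
{
    for (int i = 0; i < n_nodes; i++) {
        const struct node *node = &nodes[i];

        if (!node->online) {
            continue;       // never probe offline nodes (may need fencing)
        }
        if (node->unclean) {
            continue;       // nodes awaiting fencing are not probed either
        }
        if (!node->discovery_enabled) {
            continue;       // the user disabled probes on this node
        }
        printf("would probe all resources on %s\n", node->name);
    }
}

int
main(void)
{
    const struct node nodes[] = {
        { "node1", true,  false, true },
        { "node2", false, false, true },
        { "node3", true,  true,  true },
    };

    schedule_probes(nodes, 3);
    return 0;
}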