diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in
index 4481de4444..2d61df24cb 100644
--- a/cts/cts-scheduler.in
+++ b/cts/cts-scheduler.in
@@ -1,1613 +1,1614 @@
#!@PYTHON@
""" Regression tests for Pacemaker's scheduler
"""
__copyright__ = "Copyright 2004-2022 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import io
import os
import re
import sys
import stat
import shlex
import shutil
import argparse
import subprocess
import platform
import tempfile
DESC = """Regression tests for Pacemaker's scheduler"""
# Each entry in TESTS is a group of tests, where each test consists of a
# test base name, test description, and additional test arguments.
# Test groups will be separated by newlines in output.
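# For example (taken from the groups below), an entry may list just a base
# name and description, or add an optional list of extra command-line
# arguments for that test run:
#   [ "simple1", "Offline" ]
#   [ "date-1", "Dates", [ "-t", "2005-020" ] ]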
TESTS = [
[
[ "simple1", "Offline" ],
[ "simple2", "Start" ],
[ "simple3", "Start 2" ],
[ "simple4", "Start Failed" ],
[ "simple6", "Stop Start" ],
[ "simple7", "Shutdown" ],
#[ "simple8", "Stonith" ],
#[ "simple9", "Lower version" ],
#[ "simple10", "Higher version" ],
[ "simple11", "Priority (ne)" ],
[ "simple12", "Priority (eq)" ],
[ "simple8", "Stickiness" ],
],
[
[ "group1", "Group" ],
[ "group2", "Group + Native" ],
[ "group3", "Group + Group" ],
[ "group4", "Group + Native (nothing)" ],
[ "group5", "Group + Native (move)" ],
[ "group6", "Group + Group (move)" ],
[ "group7", "Group colocation" ],
[ "group13", "Group colocation (cant run)" ],
[ "group8", "Group anti-colocation" ],
[ "group9", "Group recovery" ],
[ "group10", "Group partial recovery" ],
[ "group11", "Group target_role" ],
[ "group14", "Group stop (graph terminated)" ],
[ "group15", "Negative group colocation" ],
[ "bug-1573", "Partial stop of a group with two children" ],
[ "bug-1718", "Mandatory group ordering - Stop group_FUN" ],
[ "bug-lf-2613", "Move group on failure" ],
[ "bug-lf-2619", "Move group on clone failure" ],
[ "group-fail", "Ensure stop order is preserved for partially active groups" ],
[ "group-unmanaged", "No need to restart r115 because r114 is unmanaged" ],
[ "group-unmanaged-stopped", "Make sure r115 is stopped when r114 fails" ],
[ "group-dependents", "Account for the location preferences of things colocated with a group" ],
[ "group-stop-ordering", "Ensure blocked group member stop does not force other member stops" ],
[ "colocate-unmanaged-group", "Respect mandatory colocations even if earlier group member is unmanaged" ],
],
[
[ "rsc_dep1", "Must not" ],
[ "rsc_dep3", "Must" ],
[ "rsc_dep5", "Must not 3" ],
[ "rsc_dep7", "Must 3" ],
[ "rsc_dep10", "Must (but cant)" ],
[ "rsc_dep2", "Must (running)" ],
[ "rsc_dep8", "Must (running : alt)" ],
[ "rsc_dep4", "Must (running + move)" ],
[ "asymmetric", "Asymmetric - require explicit location constraints" ],
],
[
[ "orphan-0", "Orphan ignore" ],
[ "orphan-1", "Orphan stop" ],
[ "orphan-2", "Orphan stop, remove failcount" ],
],
[
[ "params-0", "Params: No change" ],
[ "params-1", "Params: Changed" ],
[ "params-2", "Params: Resource definition" ],
[ "params-3", "Params: Restart instead of reload if start pending" ],
[ "params-4", "Params: Reload" ],
[ "params-5", "Params: Restart based on probe digest" ],
[ "novell-251689", "Resource definition change + target_role=stopped" ],
[ "bug-lf-2106", "Restart all anonymous clone instances after config change" ],
[ "params-6", "Params: Detect reload in previously migrated resource" ],
[ "nvpair-id-ref", "Support id-ref in nvpair with optional name" ],
[ "not-reschedule-unneeded-monitor",
"Do not reschedule unneeded monitors while resource definitions have changed" ],
[ "reload-becomes-restart", "Cancel reload if restart becomes required" ],
+ [ "restart-with-extra-op-params", "Restart if with extra operation parameters upon changes of any" ],
],
[
[ "target-0", "Target Role : baseline" ],
[ "target-1", "Target Role : promoted" ],
[ "target-2", "Target Role : invalid" ],
],
[
[ "base-score", "Set a node's default score for all nodes" ],
],
[
[ "date-1", "Dates", [ "-t", "2005-020" ] ],
[ "date-2", "Date Spec - Pass", [ "-t", "2005-020T12:30" ] ],
[ "date-3", "Date Spec - Fail", [ "-t", "2005-020T11:30" ] ],
[ "origin", "Timing of recurring operations", [ "-t", "2014-05-07 00:28:00" ] ],
[ "probe-0", "Probe (anon clone)" ],
[ "probe-1", "Pending Probe" ],
[ "probe-2", "Correctly re-probe cloned groups" ],
[ "probe-3", "Probe (pending node)" ],
[ "probe-4", "Probe (pending node + stopped resource)" ],
[ "probe-pending-node", "Probe (pending node + unmanaged resource)" ],
[ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ],
[ "failed-probe-clone", "Maskable vs. unmaskable probe failures on cloned resources" ],
[ "expired-failed-probe-primitive", "Maskable, expired probe failure on primitive resources" ],
[ "standby", "Standby" ],
[ "comments", "Comments" ],
],
[
[ "one-or-more-0", "Everything starts" ],
[ "one-or-more-1", "Nothing starts because of A" ],
[ "one-or-more-2", "D can start because of C" ],
[ "one-or-more-3", "D cannot start because of B and C" ],
[ "one-or-more-4", "D cannot start because of target-role" ],
[ "one-or-more-5", "Start A and F even though C and D are stopped" ],
[ "one-or-more-6", "Leave A running even though B is stopped" ],
[ "one-or-more-7", "Leave A running even though C is stopped" ],
[ "bug-5140-require-all-false", "Allow basegrp:0 to stop" ],
[ "clone-require-all-1", "clone B starts node 3 and 4" ],
[ "clone-require-all-2", "clone B remains stopped everywhere" ],
[ "clone-require-all-3", "clone B stops everywhere because A stops everywhere" ],
[ "clone-require-all-4", "clone B remains on node 3 and 4 with only one instance of A remaining" ],
[ "clone-require-all-5", "clone B starts on node 1 3 and 4" ],
[ "clone-require-all-6", "clone B remains active after shutting down instances of A" ],
[ "clone-require-all-7",
"clone A and B both start at the same time. all instances of A start before B" ],
[ "clone-require-all-no-interleave-1", "C starts everywhere after A and B" ],
[ "clone-require-all-no-interleave-2",
"C starts on nodes 1, 2, and 4 with only one active instance of B" ],
[ "clone-require-all-no-interleave-3",
"C remains active when instance of B is stopped on one node and started on another" ],
[ "one-or-more-unrunnable-instances", "Avoid dependencies on instances that won't ever be started" ],
],
[
[ "location-date-rules-1", "Use location constraints with ineffective date-based rules" ],
[ "location-date-rules-2", "Use location constraints with effective date-based rules" ],
[ "nvpair-date-rules-1", "Use nvpair blocks with a variety of date-based rules" ],
[ "value-source", "Use location constraints with node attribute expressions using value-source" ],
[ "rule-dbl-as-auto-number-match",
"Floating-point rule values default to number comparison: match" ],
[ "rule-dbl-as-auto-number-no-match",
"Floating-point rule values default to number comparison: no "
"match" ],
[ "rule-dbl-as-integer-match",
"Floating-point rule values set to integer comparison: match" ],
[ "rule-dbl-as-integer-no-match",
"Floating-point rule values set to integer comparison: no match" ],
[ "rule-dbl-as-number-match",
"Floating-point rule values set to number comparison: match" ],
[ "rule-dbl-as-number-no-match",
"Floating-point rule values set to number comparison: no match" ],
[ "rule-dbl-parse-fail-default-str-match",
"Floating-point rule values fail to parse, default to string "
"comparison: match" ],
[ "rule-dbl-parse-fail-default-str-no-match",
"Floating-point rule values fail to parse, default to string "
"comparison: no match" ],
[ "rule-int-as-auto-integer-match",
"Integer rule values default to integer comparison: match" ],
[ "rule-int-as-auto-integer-no-match",
"Integer rule values default to integer comparison: no match" ],
[ "rule-int-as-integer-match",
"Integer rule values set to integer comparison: match" ],
[ "rule-int-as-integer-no-match",
"Integer rule values set to integer comparison: no match" ],
[ "rule-int-as-number-match",
"Integer rule values set to number comparison: match" ],
[ "rule-int-as-number-no-match",
"Integer rule values set to number comparison: no match" ],
[ "rule-int-parse-fail-default-str-match",
"Integer rule values fail to parse, default to string "
"comparison: match" ],
[ "rule-int-parse-fail-default-str-no-match",
"Integer rule values fail to parse, default to string "
"comparison: no match" ],
],
[
[ "order1", "Order start 1" ],
[ "order2", "Order start 2" ],
[ "order3", "Order stop" ],
[ "order4", "Order (multiple)" ],
[ "order5", "Order (move)" ],
[ "order6", "Order (move w/ restart)" ],
[ "order7", "Order (mandatory)" ],
[ "order-optional", "Order (score=0)" ],
[ "order-required", "Order (score=INFINITY)" ],
[ "bug-lf-2171", "Prevent group start when clone is stopped" ],
[ "order-clone", "Clone ordering should be able to prevent startup of dependent clones" ],
[ "order-sets", "Ordering for resource sets" ],
[ "order-serialize", "Serialize resources without inhibiting migration" ],
[ "order-serialize-set", "Serialize a set of resources without inhibiting migration" ],
[ "clone-order-primitive", "Order clone start after a primitive" ],
[ "clone-order-16instances", "Verify ordering of 16 cloned resources" ],
[ "order-optional-keyword", "Order (optional keyword)" ],
[ "order-mandatory", "Order (mandatory keyword)" ],
[ "bug-lf-2493",
"Don't imply colocation requirements when applying ordering constraints with clones" ],
[ "ordered-set-basic-startup", "Constraint set with default order settings" ],
[ "ordered-set-natural", "Allow natural set ordering" ],
[ "order-wrong-kind", "Order (error)" ],
],
[
[ "coloc-loop", "Colocation - loop" ],
[ "coloc-many-one", "Colocation - many-to-one" ],
[ "coloc-list", "Colocation - many-to-one with list" ],
[ "coloc-group", "Colocation - groups" ],
[ "coloc-unpromoted-anti", "Anti-colocation with unpromoted shouldn't prevent promoted colocation" ],
[ "coloc-attr", "Colocation based on node attributes" ],
[ "coloc-negative-group", "Negative colocation with a group" ],
[ "coloc-intra-set", "Intra-set colocation" ],
[ "bug-lf-2435", "Colocation sets with a negative score" ],
[ "coloc-clone-stays-active",
"Ensure clones don't get stopped/demoted because a dependent must stop" ],
[ "coloc_fp_logic", "Verify floating point calculations in colocation are working" ],
[ "colo_promoted_w_native",
"cl#5070 - Verify promotion order is affected when colocating promoted with primitive" ],
[ "colo_unpromoted_w_native",
"cl#5070 - Verify promotion order is affected when colocating unpromoted with primitive" ],
[ "anti-colocation-order",
"cl#5187 - Prevent resources in an anti-colocation from even temporarily running on a same node" ],
[ "anti-colocation-promoted", "Organize order of actions for promoted resources in anti-colocations" ],
[ "anti-colocation-unpromoted", "Organize order of actions for unpromoted resources in anti-colocations" ],
[ "enforce-colo1", "Always enforce B with A INFINITY" ],
[ "complex_enforce_colo", "Always enforce B with A INFINITY. (make sure heat-engine stops)" ],
[ "coloc-dependee-should-stay", "Stickiness outweighs group colocation" ],
[ "coloc-dependee-should-move", "Group colocation outweighs stickiness" ],
[ "colocation-influence", "Respect colocation influence" ],
],
[
[ "rsc-sets-seq-true", "Resource Sets - sequential=false" ],
[ "rsc-sets-seq-false", "Resource Sets - sequential=true" ],
[ "rsc-sets-clone", "Resource Sets - Clone" ],
[ "rsc-sets-promoted", "Resource Sets - Promoted" ],
[ "rsc-sets-clone-1", "Resource Sets - Clone (lf#2404)" ],
],
[
[ "attrs1", "string: eq (and)" ],
[ "attrs2", "string: lt / gt (and)" ],
[ "attrs3", "string: ne (or)" ],
[ "attrs4", "string: exists" ],
[ "attrs5", "string: not_exists" ],
[ "attrs6", "is_dc: true" ],
[ "attrs7", "is_dc: false" ],
[ "attrs8", "score_attribute" ],
[ "per-node-attrs", "Per node resource parameters" ],
],
[
[ "mon-rsc-1", "Schedule Monitor - start" ],
[ "mon-rsc-2", "Schedule Monitor - move" ],
[ "mon-rsc-3", "Schedule Monitor - pending start" ],
[ "mon-rsc-4", "Schedule Monitor - move/pending start" ],
],
[
[ "rec-rsc-0", "Resource Recover - no start" ],
[ "rec-rsc-1", "Resource Recover - start" ],
[ "rec-rsc-2", "Resource Recover - monitor" ],
[ "rec-rsc-3", "Resource Recover - stop - ignore" ],
[ "rec-rsc-4", "Resource Recover - stop - block" ],
[ "rec-rsc-5", "Resource Recover - stop - fence" ],
[ "rec-rsc-6", "Resource Recover - multiple - restart" ],
[ "rec-rsc-7", "Resource Recover - multiple - stop" ],
[ "rec-rsc-8", "Resource Recover - multiple - block" ],
[ "rec-rsc-9", "Resource Recover - group/group" ],
[ "stop-unexpected", "Recover multiply active group with stop_unexpected" ],
[ "stop-unexpected-2", "Resource multiply active primitve with stop_unexpected" ],
[ "monitor-recovery", "on-fail=block + resource recovery detected by recurring monitor" ],
[ "stop-failure-no-quorum", "Stop failure without quorum" ],
[ "stop-failure-no-fencing", "Stop failure without fencing available" ],
[ "stop-failure-with-fencing", "Stop failure with fencing available" ],
[ "multiple-active-block-group", "Support of multiple-active=block for resource groups" ],
[ "multiple-monitor-one-failed",
"Consider resource failed if any of the configured monitor operations failed" ],
],
[
[ "quorum-1", "No quorum - ignore" ],
[ "quorum-2", "No quorum - freeze" ],
[ "quorum-3", "No quorum - stop" ],
[ "quorum-4", "No quorum - start anyway" ],
[ "quorum-5", "No quorum - start anyway (group)" ],
[ "quorum-6", "No quorum - start anyway (clone)" ],
[ "bug-cl-5212", "No promotion with no-quorum-policy=freeze" ],
[ "suicide-needed-inquorate", "no-quorum-policy=suicide: suicide necessary" ],
[ "suicide-not-needed-initial-quorum",
"no-quorum-policy=suicide: suicide not necessary at initial quorum" ],
[ "suicide-not-needed-never-quorate",
"no-quorum-policy=suicide: suicide not necessary if never quorate" ],
[ "suicide-not-needed-quorate", "no-quorum-policy=suicide: suicide necessary if quorate" ],
],
[
[ "rec-node-1", "Node Recover - Startup - no fence" ],
[ "rec-node-2", "Node Recover - Startup - fence" ],
[ "rec-node-3", "Node Recover - HA down - no fence" ],
[ "rec-node-4", "Node Recover - HA down - fence" ],
[ "rec-node-5", "Node Recover - CRM down - no fence" ],
[ "rec-node-6", "Node Recover - CRM down - fence" ],
[ "rec-node-7", "Node Recover - no quorum - ignore" ],
[ "rec-node-8", "Node Recover - no quorum - freeze" ],
[ "rec-node-9", "Node Recover - no quorum - stop" ],
[ "rec-node-10", "Node Recover - no quorum - stop w/fence" ],
[ "rec-node-11", "Node Recover - CRM down w/ group - fence" ],
[ "rec-node-12", "Node Recover - nothing active - fence" ],
[ "rec-node-13", "Node Recover - failed resource + shutdown - fence" ],
[ "rec-node-15", "Node Recover - unknown lrm section" ],
[ "rec-node-14", "Serialize all stonith's" ],
],
[
[ "multi1", "Multiple Active (stop/start)" ],
],
[
[ "migrate-begin", "Normal migration" ],
[ "migrate-success", "Completed migration" ],
[ "migrate-partial-1", "Completed migration, missing stop on source" ],
[ "migrate-partial-2", "Successful migrate_to only" ],
[ "migrate-partial-3", "Successful migrate_to only, target down" ],
[ "migrate-partial-4", "Migrate from the correct host after migrate_to+migrate_from" ],
[ "bug-5186-partial-migrate", "Handle partial migration when src node loses membership" ],
[ "migrate-fail-2", "Failed migrate_from" ],
[ "migrate-fail-3", "Failed migrate_from + stop on source" ],
[ "migrate-fail-4",
"Failed migrate_from + stop on target - ideally we wouldn't need to re-stop on target" ],
[ "migrate-fail-5", "Failed migrate_from + stop on source and target" ],
[ "migrate-fail-6", "Failed migrate_to" ],
[ "migrate-fail-7", "Failed migrate_to + stop on source" ],
[ "migrate-fail-8",
"Failed migrate_to + stop on target - ideally we wouldn't need to re-stop on target" ],
[ "migrate-fail-9", "Failed migrate_to + stop on source and target" ],
[ "migration-ping-pong", "Old migrate_to failure + successful migrate_from on same node" ],
[ "migrate-stop", "Migration in a stopping stack" ],
[ "migrate-start", "Migration in a starting stack" ],
[ "migrate-stop_start", "Migration in a restarting stack" ],
[ "migrate-stop-complex", "Migration in a complex stopping stack" ],
[ "migrate-start-complex", "Migration in a complex starting stack" ],
[ "migrate-stop-start-complex", "Migration in a complex moving stack" ],
[ "migrate-shutdown", "Order the post-migration 'stop' before node shutdown" ],
[ "migrate-1", "Migrate (migrate)" ],
[ "migrate-2", "Migrate (stable)" ],
[ "migrate-3", "Migrate (failed migrate_to)" ],
[ "migrate-4", "Migrate (failed migrate_from)" ],
[ "novell-252693", "Migration in a stopping stack" ],
[ "novell-252693-2", "Migration in a starting stack" ],
[ "novell-252693-3", "Non-Migration in a starting and stopping stack" ],
[ "bug-1820", "Migration in a group" ],
[ "bug-1820-1", "Non-migration in a group" ],
[ "migrate-5", "Primitive migration with a clone" ],
[ "migrate-fencing", "Migration after Fencing" ],
[ "migrate-both-vms", "Migrate two VMs that have no colocation" ],
[ "migration-behind-migrating-remote", "Migrate resource behind migrating remote connection" ],
[ "1-a-then-bm-move-b", "Advanced migrate logic. A then B. migrate B" ],
[ "2-am-then-b-move-a", "Advanced migrate logic, A then B, migrate A without stopping B" ],
[ "3-am-then-bm-both-migrate", "Advanced migrate logic. A then B. migrate both" ],
[ "4-am-then-bm-b-not-migratable", "Advanced migrate logic, A then B, B not migratable" ],
[ "5-am-then-bm-a-not-migratable", "Advanced migrate logic. A then B. move both, a not migratable" ],
[ "6-migrate-group", "Advanced migrate logic, migrate a group" ],
[ "7-migrate-group-one-unmigratable",
"Advanced migrate logic, migrate group mixed with allow-migrate true/false" ],
[ "8-am-then-bm-a-migrating-b-stopping",
"Advanced migrate logic, A then B, A migrating, B stopping" ],
[ "9-am-then-bm-b-migrating-a-stopping",
"Advanced migrate logic, A then B, B migrate, A stopping" ],
[ "10-a-then-bm-b-move-a-clone",
"Advanced migrate logic, A clone then B, migrate B while stopping A" ],
[ "11-a-then-bm-b-move-a-clone-starting",
"Advanced migrate logic, A clone then B, B moving while A is start/stopping" ],
[ "a-promote-then-b-migrate", "A promote then B start. migrate B" ],
[ "a-demote-then-b-migrate", "A demote then B stop. migrate B" ],
[ "probe-target-of-failed-migrate_to-1", "Failed migrate_to, target rejoins" ],
[ "probe-target-of-failed-migrate_to-2", "Failed migrate_to, target rejoined and probed" ],
[ "partial-live-migration-multiple-active", "Prevent running on multiple nodes due to partial live migration" ],
# @TODO: If pacemaker implements versioned attributes, uncomment this test
#[ "migrate-versioned", "Disable migration for versioned resources" ],
[ "bug-lf-2422", "Dependency on partially active group - stop ocfs:*" ],
],
[
[ "clone-anon-probe-1", "Probe the correct (anonymous) clone instance for each node" ],
[ "clone-anon-probe-2", "Avoid needless re-probing of anonymous clones" ],
[ "clone-anon-failcount", "Merge failcounts for anonymous clones" ],
[ "force-anon-clone-max", "Update clone-max properly when forcing a clone to be anonymous" ],
[ "anon-instance-pending", "Assign anonymous clone instance numbers properly when action pending" ],
[ "inc0", "Incarnation start" ],
[ "inc1", "Incarnation start order" ],
[ "inc2", "Incarnation silent restart, stop, move" ],
[ "inc3", "Inter-incarnation ordering, silent restart, stop, move" ],
[ "inc4", "Inter-incarnation ordering, silent restart, stop, move (ordered)" ],
[ "inc5", "Inter-incarnation ordering, silent restart, stop, move (restart 1)" ],
[ "inc6", "Inter-incarnation ordering, silent restart, stop, move (restart 2)" ],
[ "inc7", "Clone colocation" ],
[ "inc8", "Clone anti-colocation" ],
[ "inc9", "Non-unique clone" ],
[ "inc10", "Non-unique clone (stop)" ],
[ "inc11", "Primitive colocation with clones" ],
[ "inc12", "Clone shutdown" ],
[ "cloned-group", "Make sure only the correct number of cloned groups are started" ],
[ "cloned-group-stop", "Ensure stopping qpidd also stops glance and cinder" ],
[ "clone-no-shuffle", "Don't prioritize allocation of instances that must be moved" ],
[ "clone-max-zero", "Orphan processing with clone-max=0" ],
[ "clone-anon-dup",
"Bug LF#2087 - Correctly parse the state of anonymous clones that are active more than once per node" ],
[ "bug-lf-2160", "Don't shuffle clones due to colocation" ],
[ "bug-lf-2213", "clone-node-max enforcement for cloned groups" ],
[ "bug-lf-2153", "Clone ordering constraints" ],
[ "bug-lf-2361", "Ensure clones observe mandatory ordering constraints if the LHS is unrunnable" ],
[ "bug-lf-2317", "Avoid needless restart of primitive depending on a clone" ],
[ "clone-colocate-instance-1", "Colocation with a specific clone instance (negative example)" ],
[ "clone-colocate-instance-2", "Colocation with a specific clone instance" ],
[ "clone-order-instance", "Ordering with specific clone instances" ],
[ "bug-lf-2453", "Enforce mandatory clone ordering without colocation" ],
[ "bug-lf-2508", "Correctly reconstruct the status of anonymous cloned groups" ],
[ "bug-lf-2544", "Balanced clone placement" ],
[ "bug-lf-2445", "Redistribute clones with node-max > 1 and stickiness = 0" ],
[ "bug-lf-2574", "Avoid clone shuffle" ],
[ "bug-lf-2581", "Avoid group restart due to unrelated clone (re)start" ],
[ "bug-cl-5168", "Don't shuffle clones" ],
[ "bug-cl-5170", "Prevent clone from starting with on-fail=block" ],
[ "clone-fail-block-colocation", "Move colocated group when failed clone has on-fail=block" ],
[ "clone-interleave-1",
"Clone-3 cannot start on pcmk-1 due to interleaved ordering (no colocation)" ],
[ "clone-interleave-2", "Clone-3 must stop on pcmk-1 due to interleaved ordering (no colocation)" ],
[ "clone-interleave-3",
"Clone-3 must be recovered on pcmk-1 due to interleaved ordering (no colocation)" ],
[ "rebalance-unique-clones", "Rebalance unique clone instances with no stickiness" ],
[ "clone-requires-quorum-recovery", "Clone with requires=quorum on failed node needing recovery" ],
[ "clone-requires-quorum",
"Clone with requires=quorum with presumed-inactive instance on failed node" ],
],
[
[ "cloned_start_one", "order first clone then clone... first clone_min=2" ],
[ "cloned_start_two", "order first clone then clone... first clone_min=2" ],
[ "cloned_stop_one", "order first clone then clone... first clone_min=2" ],
[ "cloned_stop_two", "order first clone then clone... first clone_min=2" ],
[ "clone_min_interleave_start_one",
"order first clone then clone... first clone_min=2 and then has interleave=true" ],
[ "clone_min_interleave_start_two",
"order first clone then clone... first clone_min=2 and then has interleave=true" ],
[ "clone_min_interleave_stop_one",
"order first clone then clone... first clone_min=2 and then has interleave=true" ],
[ "clone_min_interleave_stop_two",
"order first clone then clone... first clone_min=2 and then has interleave=true" ],
[ "clone_min_start_one", "order first clone then primitive... first clone_min=2" ],
[ "clone_min_start_two", "order first clone then primitive... first clone_min=2" ],
[ "clone_min_stop_all", "order first clone then primitive... first clone_min=2" ],
[ "clone_min_stop_one", "order first clone then primitive... first clone_min=2" ],
[ "clone_min_stop_two", "order first clone then primitive... first clone_min=2" ],
],
[
[ "unfence-startup", "Clean unfencing" ],
[ "unfence-definition", "Unfencing when the agent changes" ],
[ "unfence-parameters", "Unfencing when the agent parameters changes" ],
[ "unfence-device", "Unfencing when a cluster has only fence devices" ],
],
[
[ "promoted-0", "Stopped -> Unpromoted" ],
[ "promoted-1", "Stopped -> Promote" ],
[ "promoted-2", "Stopped -> Promote : notify" ],
[ "promoted-3", "Stopped -> Promote : promoted location" ],
[ "promoted-4", "Started -> Promote : promoted location" ],
[ "promoted-5", "Promoted -> Promoted" ],
[ "promoted-6", "Promoted -> Promoted (2)" ],
[ "promoted-7", "Promoted -> Fenced" ],
[ "promoted-8", "Promoted -> Fenced -> Moved" ],
[ "promoted-9", "Stopped + Promotable + No quorum" ],
[ "promoted-10", "Stopped -> Promotable : notify with monitor" ],
[ "promoted-11", "Stopped -> Promote : colocation" ],
[ "novell-239082", "Demote/Promote ordering" ],
[ "novell-239087", "Stable promoted placement" ],
[ "promoted-12", "Promotion based solely on rsc_location constraints" ],
[ "promoted-13", "Include preferences of colocated resources when placing promoted" ],
[ "promoted-demote", "Ordering when actions depends on demoting an unpromoted resource" ],
[ "promoted-ordering", "Prevent resources from starting that need a promoted" ],
[ "bug-1765", "Verify promoted-with-promoted colocation does not stop unpromoted instances" ],
[ "promoted-group", "Promotion of cloned groups" ],
[ "bug-lf-1852", "Don't shuffle promotable instances unnecessarily" ],
[ "promoted-failed-demote", "Don't retry failed demote actions" ],
[ "promoted-failed-demote-2", "Don't retry failed demote actions (notify=false)" ],
[ "promoted-depend",
"Ensure resources that depend on promoted instance don't get allocated until that does" ],
[ "promoted-reattach", "Re-attach to a running promoted" ],
[ "promoted-allow-start", "Don't include promoted score if it would prevent allocation" ],
[ "promoted-colocation",
"Allow promoted instances placemaker to be influenced by colocation constraints" ],
[ "promoted-pseudo", "Make sure promote/demote pseudo actions are created correctly" ],
[ "promoted-role", "Prevent target-role from promoting more than promoted-max instances" ],
[ "bug-lf-2358", "Anti-colocation of promoted instances" ],
[ "promoted-promotion-constraint", "Mandatory promoted colocation constraints" ],
[ "unmanaged-promoted", "Ensure role is preserved for unmanaged resources" ],
[ "promoted-unmanaged-monitor", "Start correct monitor for unmanaged promoted instances" ],
[ "promoted-demote-2", "Demote does not clear past failure" ],
[ "promoted-move", "Move promoted based on failure of colocated group" ],
[ "promoted-probed-score", "Observe the promotion score of probed resources" ],
[ "colocation_constraint_stops_promoted",
"cl#5054 - Ensure promoted is demoted when stopped by colocation constraint" ],
[ "colocation_constraint_stops_unpromoted",
"cl#5054 - Ensure unpromoted is not demoted when stopped by colocation constraint" ],
[ "order_constraint_stops_promoted",
"cl#5054 - Ensure promoted is demoted when stopped by order constraint" ],
[ "order_constraint_stops_unpromoted",
"cl#5054 - Ensure unpromoted is not demoted when stopped by order constraint" ],
[ "promoted_monitor_restart", "cl#5072 - Ensure promoted monitor operation will start after promotion" ],
[ "bug-rh-880249", "Handle replacement of an m/s resource with a primitive" ],
[ "bug-5143-ms-shuffle", "Prevent promoted instance shuffling due to promotion score" ],
[ "promoted-demote-block", "Block promotion if demote fails with on-fail=block" ],
[ "promoted-dependent-ban",
"Don't stop instances from being active because a dependent is banned from that host" ],
[ "promoted-stop", "Stop instances due to location constraint with role=Started" ],
[ "promoted-partially-demoted-group", "Allow partially demoted group to finish demoting" ],
[ "bug-cl-5213", "Ensure role colocation with -INFINITY is enforced" ],
[ "bug-cl-5219", "Allow unrelated resources with a common colocation target to remain promoted" ],
[ "promoted-asymmetrical-order",
"Fix the behaviors of multi-state resources with asymmetrical ordering" ],
[ "promoted-notify", "Promotion with notifications" ],
[ "promoted-score-startup", "Use permanent promoted scores without LRM history" ],
[ "failed-demote-recovery", "Recover resource in unpromoted role after demote fails" ],
[ "failed-demote-recovery-promoted", "Recover resource in promoted role after demote fails" ],
[ "on_fail_demote1", "Recovery with on-fail=\"demote\" on healthy cluster, remote, guest, and bundle nodes" ],
[ "on_fail_demote2", "Recovery with on-fail=\"demote\" with promotion on different node" ],
[ "on_fail_demote3", "Recovery with on-fail=\"demote\" with no promotion" ],
[ "on_fail_demote4", "Recovery with on-fail=\"demote\" on failed cluster, remote, guest, and bundle nodes" ],
[ "no_quorum_demote", "Promotable demotion and primitive stop with no-quorum-policy=\"demote\"" ],
[ "no-promote-on-unrunnable-guest", "Don't select bundle instance for promotion when container can't run" ],
],
[
[ "history-1", "Correctly parse stateful-1 resource state" ],
],
[
[ "managed-0", "Managed (reference)" ],
[ "managed-1", "Not managed - down" ],
[ "managed-2", "Not managed - up" ],
[ "bug-5028", "Shutdown should block if anything depends on an unmanaged resource" ],
[ "bug-5028-detach", "Ensure detach still works" ],
[ "bug-5028-bottom",
"Ensure shutdown still blocks if the blocked resource is at the bottom of the stack" ],
[ "unmanaged-stop-1",
"cl#5155 - Block the stop of resources if any depending resource is unmanaged" ],
[ "unmanaged-stop-2",
"cl#5155 - Block the stop of resources if the first resource in a mandatory stop order is unmanaged" ],
[ "unmanaged-stop-3",
"cl#5155 - Block the stop of resources if any depending resource in a group is unmanaged" ],
[ "unmanaged-stop-4",
"cl#5155 - Block the stop of resources if any depending resource in the middle of a group is unmanaged" ],
[ "unmanaged-block-restart",
"Block restart of resources if any dependent resource in a group is unmanaged" ],
],
[
[ "interleave-0", "Interleave (reference)" ],
[ "interleave-1", "coloc - not interleaved" ],
[ "interleave-2", "coloc - interleaved" ],
[ "interleave-3", "coloc - interleaved (2)" ],
[ "interleave-pseudo-stop", "Interleaved clone during stonith" ],
[ "interleave-stop", "Interleaved clone during stop" ],
[ "interleave-restart", "Interleaved clone during dependency restart" ],
],
[
[ "notify-0", "Notify reference" ],
[ "notify-1", "Notify simple" ],
[ "notify-2", "Notify simple, confirm" ],
[ "notify-3", "Notify move, confirm" ],
[ "novell-239079", "Notification priority" ],
#[ "notify-2", "Notify - 764" ],
[ "notifs-for-unrunnable", "Don't schedule notifications for an unrunnable action" ],
[ "route-remote-notify", "Route remote notify actions through correct cluster node" ],
[ "notify-behind-stopping-remote", "Don't schedule notifications behind stopped remote" ],
],
[
[ "594", "OSDL #594 - Unrunnable actions scheduled in transition" ],
[ "662", "OSDL #662 - Two resources start on one node when incarnation_node_max = 1" ],
[ "696", "OSDL #696 - CRM starts stonith RA without monitor" ],
[ "726", "OSDL #726 - Attempting to schedule rsc_posic041_monitor_5000 _after_ a stop" ],
[ "735", "OSDL #735 - Correctly detect that rsc_hadev1 is stopped on hadev3" ],
[ "764", "OSDL #764 - Missing monitor op for DoFencing:child_DoFencing:1" ],
[ "797", "OSDL #797 - Assert triggered: task_id_i > max_call_id" ],
[ "829", "OSDL #829" ],
[ "994",
"OSDL #994 - Stopping the last resource in a resource group causes the entire group to be restarted" ],
[ "994-2", "OSDL #994 - with a dependent resource" ],
[ "1360", "OSDL #1360 - Clone stickiness" ],
[ "1484", "OSDL #1484 - on_fail=stop" ],
[ "1494", "OSDL #1494 - Clone stability" ],
[ "unrunnable-1", "Unrunnable" ],
[ "unrunnable-2", "Unrunnable 2" ],
[ "stonith-0", "Stonith loop - 1" ],
[ "stonith-1", "Stonith loop - 2" ],
[ "stonith-2", "Stonith loop - 3" ],
[ "stonith-3", "Stonith startup" ],
[ "stonith-4", "Stonith node state" ],
[ "dc-fence-ordering", "DC needs fencing while other nodes are shutting down" ],
[ "bug-1572-1", "Recovery of groups depending on promotable role" ],
[ "bug-1572-2", "Recovery of groups depending on promotable role when promoted is not re-promoted" ],
[ "bug-1685", "Depends-on-promoted ordering" ],
[ "bug-1822", "Don't promote partially active groups" ],
[ "bug-pm-11", "New resource added to a m/s group" ],
[ "bug-pm-12", "Recover only the failed portion of a cloned group" ],
[ "bug-n-387749", "Don't shuffle clone instances" ],
[ "bug-n-385265",
"Don't ignore the failure stickiness of group children - resource_idvscommon should stay stopped" ],
[ "bug-n-385265-2",
"Ensure groups are migrated instead of remaining partially active on the current node" ],
[ "bug-lf-1920", "Correctly handle probes that find active resources" ],
[ "bnc-515172", "Location constraint with multiple expressions" ],
[ "colocate-primitive-with-clone", "Optional colocation with a clone" ],
[ "use-after-free-merge", "Use-after-free in native_merge_weights" ],
[ "bug-lf-2551", "STONITH ordering for stop" ],
[ "bug-lf-2606", "Stonith implies demote" ],
[ "bug-lf-2474", "Ensure resource op timeout takes precedence over op_defaults" ],
[ "bug-suse-707150", "Prevent vm-01 from starting due to colocation/ordering" ],
[ "bug-5014-A-start-B-start", "Verify when A starts B starts using symmetrical=false" ],
[ "bug-5014-A-stop-B-started",
"Verify when A stops B does not stop if it has already started using symmetric=false" ],
[ "bug-5014-A-stopped-B-stopped",
"Verify when A is stopped and B has not started, B does not start before A using symmetric=false" ],
[ "bug-5014-CthenAthenB-C-stopped",
"Verify when C then A is symmetrical=true, A then B is symmetric=false, and C is stopped that nothing starts" ],
[ "bug-5014-CLONE-A-start-B-start",
"Verify when A starts B starts using clone resources with symmetric=false" ],
[ "bug-5014-CLONE-A-stop-B-started",
"Verify when A stops B does not stop if it has already started using clone resources with symmetric=false" ],
[ "bug-5014-GROUP-A-start-B-start",
"Verify when A starts B starts when using group resources with symmetric=false" ],
[ "bug-5014-GROUP-A-stopped-B-started",
"Verify when A stops B does not stop if it has already started using group resources with symmetric=false" ],
[ "bug-5014-GROUP-A-stopped-B-stopped",
"Verify when A is stopped and B has not started, B does not start before A using group resources with symmetric=false" ],
[ "bug-5014-ordered-set-symmetrical-false",
"Verify ordered sets work with symmetrical=false" ],
[ "bug-5014-ordered-set-symmetrical-true",
"Verify ordered sets work with symmetrical=true" ],
[ "clbz5007-promotable-colocation",
"Verify use of colocation scores other than INFINITY and -INFINITY work on multi-state resources" ],
[ "bug-5038", "Prevent restart of anonymous clones when clone-max decreases" ],
[ "bug-5025-1", "Automatically clean up failcount after resource config change with reload" ],
[ "bug-5025-2", "Make sure clear failcount action isn't set when config does not change" ],
[ "bug-5025-3", "Automatically clean up failcount after resource config change with restart" ],
[ "bug-5025-4", "Clear failcount when last failure is a start op and rsc attributes changed" ],
[ "failcount", "Ensure failcounts are correctly expired" ],
[ "failcount-block", "Ensure failcounts are not expired when on-fail=block is present" ],
[ "per-op-failcount", "Ensure per-operation failcount is handled and not passed to fence agent" ],
[ "on-fail-ignore", "Ensure on-fail=ignore works even beyond migration-threshold" ],
[ "monitor-onfail-restart", "bug-5058 - Monitor failure with on-fail set to restart" ],
[ "monitor-onfail-stop", "bug-5058 - Monitor failure wiht on-fail set to stop" ],
[ "bug-5059", "No need to restart p_stateful1:*" ],
[ "bug-5069-op-enabled", "Test on-fail=ignore with failure when monitor is enabled" ],
[ "bug-5069-op-disabled", "Test on-fail-ignore with failure when monitor is disabled" ],
[ "obsolete-lrm-resource", "cl#5115 - Do not use obsolete lrm_resource sections" ],
[ "expire-non-blocked-failure",
"Ignore failure-timeout only if the failed operation has on-fail=block" ],
[ "asymmetrical-order-move", "Respect asymmetrical ordering when trying to move resources" ],
[ "asymmetrical-order-restart", "Respect asymmetrical ordering when restarting dependent resource" ],
[ "start-then-stop-with-unfence", "Avoid graph loop with start-then-stop constraint plus unfencing" ],
[ "order-expired-failure", "Order failcount cleanup after remote fencing" ],
[ "ignore_stonith_rsc_order1",
"cl#5056- Ignore order constraint between stonith and non-stonith rsc" ],
[ "ignore_stonith_rsc_order2",
"cl#5056- Ignore order constraint with group rsc containing mixed stonith and non-stonith" ],
[ "ignore_stonith_rsc_order3", "cl#5056- Ignore order constraint, stonith clone and mixed group" ],
[ "ignore_stonith_rsc_order4",
"cl#5056- Ignore order constraint, stonith clone and clone with nested mixed group" ],
[ "honor_stonith_rsc_order1",
"cl#5056- Honor order constraint, stonith clone and pure stonith group(single rsc)" ],
[ "honor_stonith_rsc_order2",
"cl#5056- Honor order constraint, stonith clone and pure stonith group(multiple rsc)" ],
[ "honor_stonith_rsc_order3",
"cl#5056- Honor order constraint, stonith clones with nested pure stonith group" ],
[ "honor_stonith_rsc_order4",
"cl#5056- Honor order constraint, between two native stonith rscs" ],
[ "multiply-active-stonith", "Multiply active stonith" ],
[ "probe-timeout", "cl#5099 - Default probe timeout" ],
[ "order-first-probes",
"cl#5301 - respect order constraints when relevant resources are being probed" ],
[ "concurrent-fencing", "Allow performing fencing operations in parallel" ],
[ "priority-fencing-delay", "Delay fencing targeting the more significant node" ],
],
[
[ "systemhealth1", "System Health () #1" ],
[ "systemhealth2", "System Health () #2" ],
[ "systemhealth3", "System Health () #3" ],
[ "systemhealthn1", "System Health (None) #1" ],
[ "systemhealthn2", "System Health (None) #2" ],
[ "systemhealthn3", "System Health (None) #3" ],
[ "systemhealthm1", "System Health (Migrate On Red) #1" ],
[ "systemhealthm2", "System Health (Migrate On Red) #2" ],
[ "systemhealthm3", "System Health (Migrate On Red) #3" ],
[ "systemhealtho1", "System Health (Only Green) #1" ],
[ "systemhealtho2", "System Health (Only Green) #2" ],
[ "systemhealtho3", "System Health (Only Green) #3" ],
[ "systemhealthp1", "System Health (Progessive) #1" ],
[ "systemhealthp2", "System Health (Progessive) #2" ],
[ "systemhealthp3", "System Health (Progessive) #3" ],
[ "allow-unhealthy-nodes", "System Health (migrate-on-red + allow-unhealth-nodes)" ],
],
[
[ "utilization", "Placement Strategy - utilization" ],
[ "minimal", "Placement Strategy - minimal" ],
[ "balanced", "Placement Strategy - balanced" ],
],
[
[ "placement-stickiness", "Optimized Placement Strategy - stickiness" ],
[ "placement-priority", "Optimized Placement Strategy - priority" ],
[ "placement-location", "Optimized Placement Strategy - location" ],
[ "placement-capacity", "Optimized Placement Strategy - capacity" ],
],
[
[ "utilization-order1", "Utilization Order - Simple" ],
[ "utilization-order2", "Utilization Order - Complex" ],
[ "utilization-order3", "Utilization Order - Migrate" ],
[ "utilization-order4", "Utilization Order - Live Migration (bnc#695440)" ],
[ "utilization-complex", "Utilization with complex relationships" ],
[ "utilization-shuffle",
"Don't displace prmExPostgreSQLDB2 on act2, Start prmExPostgreSQLDB1 on act3" ],
[ "load-stopped-loop", "Avoid transition loop due to load_stopped (cl#5044)" ],
[ "load-stopped-loop-2",
"cl#5235 - Prevent graph loops that can be introduced by load_stopped -> migrate_to ordering" ],
],
[
[ "colocated-utilization-primitive-1", "Colocated Utilization - Primitive" ],
[ "colocated-utilization-primitive-2", "Colocated Utilization - Choose the most capable node" ],
[ "colocated-utilization-group", "Colocated Utilization - Group" ],
[ "colocated-utilization-clone", "Colocated Utilization - Clone" ],
[ "utilization-check-allowed-nodes",
"Only check the capacities of the nodes that can run the resource" ],
],
[
[ "reprobe-target_rc", "Ensure correct target_rc for reprobe of inactive resources" ],
[ "node-maintenance-1", "cl#5128 - Node maintenance" ],
[ "node-maintenance-2", "cl#5128 - Node maintenance (coming out of maintenance mode)" ],
[ "shutdown-maintenance-node", "Do not fence a maintenance node if it shuts down cleanly" ],
[ "rsc-maintenance", "Per-resource maintenance" ],
],
[
[ "not-installed-agent", "The resource agent is missing" ],
[ "not-installed-tools", "Something the resource agent needs is missing" ],
],
[
[ "stopped-monitor-00", "Stopped Monitor - initial start" ],
[ "stopped-monitor-01", "Stopped Monitor - failed started" ],
[ "stopped-monitor-02", "Stopped Monitor - started multi-up" ],
[ "stopped-monitor-03", "Stopped Monitor - stop started" ],
[ "stopped-monitor-04", "Stopped Monitor - failed stop" ],
[ "stopped-monitor-05", "Stopped Monitor - start unmanaged" ],
[ "stopped-monitor-06", "Stopped Monitor - unmanaged multi-up" ],
[ "stopped-monitor-07", "Stopped Monitor - start unmanaged multi-up" ],
[ "stopped-monitor-08", "Stopped Monitor - migrate" ],
[ "stopped-monitor-09", "Stopped Monitor - unmanage started" ],
[ "stopped-monitor-10", "Stopped Monitor - unmanaged started multi-up" ],
[ "stopped-monitor-11", "Stopped Monitor - stop unmanaged started" ],
[ "stopped-monitor-12", "Stopped Monitor - unmanaged started multi-up (target-role=Stopped)" ],
[ "stopped-monitor-20", "Stopped Monitor - initial stop" ],
[ "stopped-monitor-21", "Stopped Monitor - stopped single-up" ],
[ "stopped-monitor-22", "Stopped Monitor - stopped multi-up" ],
[ "stopped-monitor-23", "Stopped Monitor - start stopped" ],
[ "stopped-monitor-24", "Stopped Monitor - unmanage stopped" ],
[ "stopped-monitor-25", "Stopped Monitor - unmanaged stopped multi-up" ],
[ "stopped-monitor-26", "Stopped Monitor - start unmanaged stopped" ],
[ "stopped-monitor-27", "Stopped Monitor - unmanaged stopped multi-up (target-role=Started)" ],
[ "stopped-monitor-30", "Stopped Monitor - new node started" ],
[ "stopped-monitor-31", "Stopped Monitor - new node stopped" ],
],
[
# This is a combo test to check:
# - probe timeout defaults to the minimum-interval monitor's
# - duplicate recurring operations are ignored
# - if timeout spec is bad, the default timeout is used
# - failure is blocked with on-fail=block even if ISO8601 interval is specified
# - started/stopped role monitors are started/stopped on right nodes
[ "intervals", "Recurring monitor interval handling" ],
],
[
[ "ticket-primitive-1", "Ticket - Primitive (loss-policy=stop, initial)" ],
[ "ticket-primitive-2", "Ticket - Primitive (loss-policy=stop, granted)" ],
[ "ticket-primitive-3", "Ticket - Primitive (loss-policy-stop, revoked)" ],
[ "ticket-primitive-4", "Ticket - Primitive (loss-policy=demote, initial)" ],
[ "ticket-primitive-5", "Ticket - Primitive (loss-policy=demote, granted)" ],
[ "ticket-primitive-6", "Ticket - Primitive (loss-policy=demote, revoked)" ],
[ "ticket-primitive-7", "Ticket - Primitive (loss-policy=fence, initial)" ],
[ "ticket-primitive-8", "Ticket - Primitive (loss-policy=fence, granted)" ],
[ "ticket-primitive-9", "Ticket - Primitive (loss-policy=fence, revoked)" ],
[ "ticket-primitive-10", "Ticket - Primitive (loss-policy=freeze, initial)" ],
[ "ticket-primitive-11", "Ticket - Primitive (loss-policy=freeze, granted)" ],
[ "ticket-primitive-12", "Ticket - Primitive (loss-policy=freeze, revoked)" ],
[ "ticket-primitive-13", "Ticket - Primitive (loss-policy=stop, standby, granted)" ],
[ "ticket-primitive-14", "Ticket - Primitive (loss-policy=stop, granted, standby)" ],
[ "ticket-primitive-15", "Ticket - Primitive (loss-policy=stop, standby, revoked)" ],
[ "ticket-primitive-16", "Ticket - Primitive (loss-policy=demote, standby, granted)" ],
[ "ticket-primitive-17", "Ticket - Primitive (loss-policy=demote, granted, standby)" ],
[ "ticket-primitive-18", "Ticket - Primitive (loss-policy=demote, standby, revoked)" ],
[ "ticket-primitive-19", "Ticket - Primitive (loss-policy=fence, standby, granted)" ],
[ "ticket-primitive-20", "Ticket - Primitive (loss-policy=fence, granted, standby)" ],
[ "ticket-primitive-21", "Ticket - Primitive (loss-policy=fence, standby, revoked)" ],
[ "ticket-primitive-22", "Ticket - Primitive (loss-policy=freeze, standby, granted)" ],
[ "ticket-primitive-23", "Ticket - Primitive (loss-policy=freeze, granted, standby)" ],
[ "ticket-primitive-24", "Ticket - Primitive (loss-policy=freeze, standby, revoked)" ],
],
[
[ "ticket-group-1", "Ticket - Group (loss-policy=stop, initial)" ],
[ "ticket-group-2", "Ticket - Group (loss-policy=stop, granted)" ],
[ "ticket-group-3", "Ticket - Group (loss-policy-stop, revoked)" ],
[ "ticket-group-4", "Ticket - Group (loss-policy=demote, initial)" ],
[ "ticket-group-5", "Ticket - Group (loss-policy=demote, granted)" ],
[ "ticket-group-6", "Ticket - Group (loss-policy=demote, revoked)" ],
[ "ticket-group-7", "Ticket - Group (loss-policy=fence, initial)" ],
[ "ticket-group-8", "Ticket - Group (loss-policy=fence, granted)" ],
[ "ticket-group-9", "Ticket - Group (loss-policy=fence, revoked)" ],
[ "ticket-group-10", "Ticket - Group (loss-policy=freeze, initial)" ],
[ "ticket-group-11", "Ticket - Group (loss-policy=freeze, granted)" ],
[ "ticket-group-12", "Ticket - Group (loss-policy=freeze, revoked)" ],
[ "ticket-group-13", "Ticket - Group (loss-policy=stop, standby, granted)" ],
[ "ticket-group-14", "Ticket - Group (loss-policy=stop, granted, standby)" ],
[ "ticket-group-15", "Ticket - Group (loss-policy=stop, standby, revoked)" ],
[ "ticket-group-16", "Ticket - Group (loss-policy=demote, standby, granted)" ],
[ "ticket-group-17", "Ticket - Group (loss-policy=demote, granted, standby)" ],
[ "ticket-group-18", "Ticket - Group (loss-policy=demote, standby, revoked)" ],
[ "ticket-group-19", "Ticket - Group (loss-policy=fence, standby, granted)" ],
[ "ticket-group-20", "Ticket - Group (loss-policy=fence, granted, standby)" ],
[ "ticket-group-21", "Ticket - Group (loss-policy=fence, standby, revoked)" ],
[ "ticket-group-22", "Ticket - Group (loss-policy=freeze, standby, granted)" ],
[ "ticket-group-23", "Ticket - Group (loss-policy=freeze, granted, standby)" ],
[ "ticket-group-24", "Ticket - Group (loss-policy=freeze, standby, revoked)" ],
],
[
[ "ticket-clone-1", "Ticket - Clone (loss-policy=stop, initial)" ],
[ "ticket-clone-2", "Ticket - Clone (loss-policy=stop, granted)" ],
[ "ticket-clone-3", "Ticket - Clone (loss-policy-stop, revoked)" ],
[ "ticket-clone-4", "Ticket - Clone (loss-policy=demote, initial)" ],
[ "ticket-clone-5", "Ticket - Clone (loss-policy=demote, granted)" ],
[ "ticket-clone-6", "Ticket - Clone (loss-policy=demote, revoked)" ],
[ "ticket-clone-7", "Ticket - Clone (loss-policy=fence, initial)" ],
[ "ticket-clone-8", "Ticket - Clone (loss-policy=fence, granted)" ],
[ "ticket-clone-9", "Ticket - Clone (loss-policy=fence, revoked)" ],
[ "ticket-clone-10", "Ticket - Clone (loss-policy=freeze, initial)" ],
[ "ticket-clone-11", "Ticket - Clone (loss-policy=freeze, granted)" ],
[ "ticket-clone-12", "Ticket - Clone (loss-policy=freeze, revoked)" ],
[ "ticket-clone-13", "Ticket - Clone (loss-policy=stop, standby, granted)" ],
[ "ticket-clone-14", "Ticket - Clone (loss-policy=stop, granted, standby)" ],
[ "ticket-clone-15", "Ticket - Clone (loss-policy=stop, standby, revoked)" ],
[ "ticket-clone-16", "Ticket - Clone (loss-policy=demote, standby, granted)" ],
[ "ticket-clone-17", "Ticket - Clone (loss-policy=demote, granted, standby)" ],
[ "ticket-clone-18", "Ticket - Clone (loss-policy=demote, standby, revoked)" ],
[ "ticket-clone-19", "Ticket - Clone (loss-policy=fence, standby, granted)" ],
[ "ticket-clone-20", "Ticket - Clone (loss-policy=fence, granted, standby)" ],
[ "ticket-clone-21", "Ticket - Clone (loss-policy=fence, standby, revoked)" ],
[ "ticket-clone-22", "Ticket - Clone (loss-policy=freeze, standby, granted)" ],
[ "ticket-clone-23", "Ticket - Clone (loss-policy=freeze, granted, standby)" ],
[ "ticket-clone-24", "Ticket - Clone (loss-policy=freeze, standby, revoked)" ],
],
[
[ "ticket-promoted-1", "Ticket - Promoted (loss-policy=stop, initial)" ],
[ "ticket-promoted-2", "Ticket - Promoted (loss-policy=stop, granted)" ],
[ "ticket-promoted-3", "Ticket - Promoted (loss-policy-stop, revoked)" ],
[ "ticket-promoted-4", "Ticket - Promoted (loss-policy=demote, initial)" ],
[ "ticket-promoted-5", "Ticket - Promoted (loss-policy=demote, granted)" ],
[ "ticket-promoted-6", "Ticket - Promoted (loss-policy=demote, revoked)" ],
[ "ticket-promoted-7", "Ticket - Promoted (loss-policy=fence, initial)" ],
[ "ticket-promoted-8", "Ticket - Promoted (loss-policy=fence, granted)" ],
[ "ticket-promoted-9", "Ticket - Promoted (loss-policy=fence, revoked)" ],
[ "ticket-promoted-10", "Ticket - Promoted (loss-policy=freeze, initial)" ],
[ "ticket-promoted-11", "Ticket - Promoted (loss-policy=freeze, granted)" ],
[ "ticket-promoted-12", "Ticket - Promoted (loss-policy=freeze, revoked)" ],
[ "ticket-promoted-13", "Ticket - Promoted (loss-policy=stop, standby, granted)" ],
[ "ticket-promoted-14", "Ticket - Promoted (loss-policy=stop, granted, standby)" ],
[ "ticket-promoted-15", "Ticket - Promoted (loss-policy=stop, standby, revoked)" ],
[ "ticket-promoted-16", "Ticket - Promoted (loss-policy=demote, standby, granted)" ],
[ "ticket-promoted-17", "Ticket - Promoted (loss-policy=demote, granted, standby)" ],
[ "ticket-promoted-18", "Ticket - Promoted (loss-policy=demote, standby, revoked)" ],
[ "ticket-promoted-19", "Ticket - Promoted (loss-policy=fence, standby, granted)" ],
[ "ticket-promoted-20", "Ticket - Promoted (loss-policy=fence, granted, standby)" ],
[ "ticket-promoted-21", "Ticket - Promoted (loss-policy=fence, standby, revoked)" ],
[ "ticket-promoted-22", "Ticket - Promoted (loss-policy=freeze, standby, granted)" ],
[ "ticket-promoted-23", "Ticket - Promoted (loss-policy=freeze, granted, standby)" ],
[ "ticket-promoted-24", "Ticket - Promoted (loss-policy=freeze, standby, revoked)" ],
],
[
[ "ticket-rsc-sets-1", "Ticket - Resource sets (1 ticket, initial)" ],
[ "ticket-rsc-sets-2", "Ticket - Resource sets (1 ticket, granted)" ],
[ "ticket-rsc-sets-3", "Ticket - Resource sets (1 ticket, revoked)" ],
[ "ticket-rsc-sets-4", "Ticket - Resource sets (2 tickets, initial)" ],
[ "ticket-rsc-sets-5", "Ticket - Resource sets (2 tickets, granted)" ],
[ "ticket-rsc-sets-6", "Ticket - Resource sets (2 tickets, granted)" ],
[ "ticket-rsc-sets-7", "Ticket - Resource sets (2 tickets, revoked)" ],
[ "ticket-rsc-sets-8", "Ticket - Resource sets (1 ticket, standby, granted)" ],
[ "ticket-rsc-sets-9", "Ticket - Resource sets (1 ticket, granted, standby)" ],
[ "ticket-rsc-sets-10", "Ticket - Resource sets (1 ticket, standby, revoked)" ],
[ "ticket-rsc-sets-11", "Ticket - Resource sets (2 tickets, standby, granted)" ],
[ "ticket-rsc-sets-12", "Ticket - Resource sets (2 tickets, standby, granted)" ],
[ "ticket-rsc-sets-13", "Ticket - Resource sets (2 tickets, granted, standby)" ],
[ "ticket-rsc-sets-14", "Ticket - Resource sets (2 tickets, standby, revoked)" ],
[ "cluster-specific-params", "Cluster-specific instance attributes based on rules" ],
[ "site-specific-params", "Site-specific instance attributes based on rules" ],
],
[
[ "template-1", "Template - 1" ],
[ "template-2", "Template - 2" ],
[ "template-3", "Template - 3 (merge operations)" ],
[ "template-coloc-1", "Template - Colocation 1" ],
[ "template-coloc-2", "Template - Colocation 2" ],
[ "template-coloc-3", "Template - Colocation 3" ],
[ "template-order-1", "Template - Order 1" ],
[ "template-order-2", "Template - Order 2" ],
[ "template-order-3", "Template - Order 3" ],
[ "template-ticket", "Template - Ticket" ],
[ "template-rsc-sets-1", "Template - Resource Sets 1" ],
[ "template-rsc-sets-2", "Template - Resource Sets 2" ],
[ "template-rsc-sets-3", "Template - Resource Sets 3" ],
[ "template-rsc-sets-4", "Template - Resource Sets 4" ],
[ "template-clone-primitive", "Cloned primitive from template" ],
[ "template-clone-group", "Cloned group from template" ],
[ "location-sets-templates", "Resource sets and templates - Location" ],
[ "tags-coloc-order-1", "Tags - Colocation and Order (Simple)" ],
[ "tags-coloc-order-2", "Tags - Colocation and Order (Resource Sets with Templates)" ],
[ "tags-location", "Tags - Location" ],
[ "tags-ticket", "Tags - Ticket" ],
],
[
[ "container-1", "Container - initial" ],
[ "container-2", "Container - monitor failed" ],
[ "container-3", "Container - stop failed" ],
[ "container-4", "Container - reached migration-threshold" ],
[ "container-group-1", "Container in group - initial" ],
[ "container-group-2", "Container in group - monitor failed" ],
[ "container-group-3", "Container in group - stop failed" ],
[ "container-group-4", "Container in group - reached migration-threshold" ],
[ "container-is-remote-node", "Place resource within container when container is remote-node" ],
[ "bug-rh-1097457", "Kill user defined container/contents ordering" ],
[ "bug-cl-5247", "Graph loop when recovering m/s resource in a container" ],
[ "bundle-order-startup", "Bundle startup ordering" ],
[ "bundle-order-partial-start",
"Bundle startup ordering when some dependencies are already running" ],
[ "bundle-order-partial-start-2",
"Bundle startup ordering when some dependencies and the container are already running" ],
[ "bundle-order-stop", "Bundle stop ordering" ],
[ "bundle-order-partial-stop", "Bundle startup ordering when some dependencies are already stopped" ],
[ "bundle-order-stop-on-remote", "Stop nested resource after bringing up the connection" ],
[ "bundle-order-startup-clone", "Prevent startup because bundle isn't promoted" ],
[ "bundle-order-startup-clone-2", "Bundle startup with clones" ],
[ "bundle-order-stop-clone", "Stop bundle because clone is stopping" ],
[ "bundle-nested-colocation", "Colocation of nested connection resources" ],
[ "bundle-order-fencing",
"Order pseudo bundle fencing after parent node fencing if both are happening" ],
[ "bundle-probe-order-1", "order 1" ],
[ "bundle-probe-order-2", "order 2" ],
[ "bundle-probe-order-3", "order 3" ],
[ "bundle-probe-remotes", "Ensure remotes get probed too" ],
[ "bundle-replicas-change", "Change bundle from 1 replica to multiple" ],
[ "bundle-connection-with-container", "Don't move a container due to connection preferences" ],
[ "nested-remote-recovery", "Recover bundle's container hosted on remote node" ],
],
[
[ "whitebox-fail1", "Fail whitebox container rsc" ],
[ "whitebox-fail2", "Fail cluster connection to guest node" ],
[ "whitebox-fail3", "Failed containers should not run nested on remote nodes" ],
[ "whitebox-start", "Start whitebox container with resources assigned to it" ],
[ "whitebox-stop", "Stop whitebox container with resources assigned to it" ],
[ "whitebox-move", "Move whitebox container with resources assigned to it" ],
[ "whitebox-asymmetric", "Verify connection rsc opts-in based on container resource" ],
[ "whitebox-ms-ordering", "Verify promote/demote can not occur before connection is established" ],
[ "whitebox-ms-ordering-move", "Stop/Start cycle within a moving container" ],
[ "whitebox-orphaned", "Properly shutdown orphaned whitebox container" ],
[ "whitebox-orphan-ms", "Properly tear down orphan ms resources on remote-nodes" ],
[ "whitebox-unexpectedly-running", "Recover container nodes the cluster did not start" ],
[ "whitebox-migrate1", "Migrate both container and connection resource" ],
[ "whitebox-imply-stop-on-fence",
"imply stop action on container node rsc when host node is fenced" ],
[ "whitebox-nested-group", "Verify guest remote-node works nested in a group" ],
[ "guest-node-host-dies", "Verify guest node is recovered if host goes away" ],
[ "guest-node-cleanup", "Order guest node connection recovery after container probe" ],
[ "guest-host-not-fenceable", "Actions on guest node are unrunnable if host is unclean and cannot be fenced" ],
],
[
[ "remote-startup-probes", "Baremetal remote-node startup probes" ],
[ "remote-startup", "Startup a newly discovered remote-nodes with no status" ],
[ "remote-fence-unclean", "Fence unclean baremetal remote-node" ],
[ "remote-fence-unclean2",
"Fence baremetal remote-node after cluster node fails and connection can not be recovered" ],
[ "remote-fence-unclean-3", "Probe failed remote nodes (triggers fencing)" ],
[ "remote-move", "Move remote-node connection resource" ],
[ "remote-disable", "Disable a baremetal remote-node" ],
[ "remote-probe-disable", "Probe then stop a baremetal remote-node" ],
[ "remote-orphaned", "Properly shutdown orphaned connection resource" ],
[ "remote-orphaned2",
"verify we can handle orphaned remote connections with active resources on the remote" ],
[ "remote-recover", "Recover connection resource after cluster-node fails" ],
[ "remote-stale-node-entry",
"Make sure we properly handle leftover remote-node entries in the node section" ],
[ "remote-partial-migrate",
"Make sure partial migrations are handled before ops on the remote node" ],
[ "remote-partial-migrate2",
"Make sure partial migration target is prefered for remote connection" ],
[ "remote-recover-fail", "Make sure start failure causes fencing if rsc are active on remote" ],
[ "remote-start-fail",
"Make sure a start failure does not result in fencing if no active resources are on remote" ],
[ "remote-unclean2",
"Make monitor failure always results in fencing, even if no rsc are active on remote" ],
[ "remote-fence-before-reconnect", "Fence before clearing recurring monitor failure" ],
[ "remote-recovery", "Recover remote connections before attempting demotion" ],
[ "remote-recover-connection", "Optimistically recovery of only the connection" ],
[ "remote-recover-all", "Fencing when the connection has no home" ],
[ "remote-recover-no-resources", "Fencing when the connection has no home and no active resources" ],
[ "remote-recover-unknown",
"Fencing when the connection has no home and the remote has no operation history" ],
[ "remote-reconnect-delay", "Waiting for remote reconnect interval to expire" ],
[ "remote-connection-unrecoverable",
"Remote connection host must be fenced, with connection unrecoverable" ],
[ "remote-connection-shutdown", "Remote connection shutdown" ],
[ "cancel-behind-moving-remote",
"Route recurring monitor cancellations through original node of a moving remote connection" ],
],
[
[ "resource-discovery", "Exercises resource-discovery location constraint option" ],
[ "rsc-discovery-per-node", "Disable resource discovery per node" ],
[ "shutdown-lock", "Ensure shutdown lock works properly" ],
[ "shutdown-lock-expiration", "Ensure shutdown lock expiration works properly" ],
],
[
[ "op-defaults", "Test op_defaults conditional expressions" ],
[ "op-defaults-2", "Test op_defaults AND'ed conditional expressions" ],
[ "op-defaults-3", "Test op_defaults precedence" ],
[ "rsc-defaults", "Test rsc_defaults conditional expressions" ],
[ "rsc-defaults-2", "Test rsc_defaults conditional expressions without type" ],
],
[ [ "stop-all-resources", "Test stop-all-resources=true "],
],
[ [ "ocf_degraded-remap-ocf_ok", "Test degraded remapped to OK" ],
[ "ocf_degraded_promoted-remap-ocf_ok", "Test degraded promoted remapped to OK"],
],
# @TODO: If pacemaker implements versioned attributes, uncomment these tests
#[
# [ "versioned-resources", "Start resources with #ra-version rules" ],
# [ "restart-versioned", "Restart resources on #ra-version change" ],
# [ "reload-versioned", "Reload resources on #ra-version change" ],
#],
#[
# [ "versioned-operations-1", "Use #ra-version to configure operations of native resources" ],
# [ "versioned-operations-2", "Use #ra-version to configure operations of stonith resources" ],
# [ "versioned-operations-3", "Use #ra-version to configure operations of promotable resources" ],
# [ "versioned-operations-4", "Use #ra-version to configure operations of groups of the resources" ],
#],
]
TESTS_64BIT = [
[
[ "year-2038", "Check handling of timestamps beyond 2038-01-19 03:14:08 UTC" ],
],
]
# Constants substituted in the build process
class BuildVars(object):
SBINDIR = "@sbindir@"
BUILDDIR = "@abs_top_builddir@"
CRM_SCHEMA_DIRECTORY = "@CRM_SCHEMA_DIRECTORY@"
# These values must be kept in sync with crm_exit_t
class CrmExit(object):
OK = 0
ERROR = 1
NOT_INSTALLED = 5
NOINPUT = 66
CANTCREAT = 73
def is_executable(path):
""" Check whether a file at a given path is executable. """
try:
return os.stat(path)[stat.ST_MODE] & stat.S_IXUSR
except OSError:
return False
def diff(file1, file2, **kwargs):
""" Call diff on two files """
return subprocess.call([ "diff", "-u", "-N", "--ignore-all-space",
"--ignore-blank-lines", file1, file2 ], **kwargs)
def sort_file(filename):
""" Sort a file alphabetically """
with io.open(filename, "rt") as f:
lines = sorted(f)
with io.open(filename, "wt") as f:
f.writelines(lines)
def remove_files(filenames):
""" Remove a list of files """
for filename in filenames:
try:
os.remove(filename)
except OSError:
pass
def normalize(filename):
""" Remove text from a file that isn't important for comparison """
if not hasattr(normalize, "patterns"):
normalize.patterns = [
re.compile(r'crm_feature_set="[^"]*"'),
re.compile(r'batch-limit="[0-9]*"')
]
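# These attribute values can differ between Pacemaker versions and test runs,
# so strip them out before comparing generated output with stored expected output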
if os.path.isfile(filename):
with io.open(filename, "rt") as f:
lines = f.readlines()
with io.open(filename, "wt") as f:
for line in lines:
for pattern in normalize.patterns:
line = pattern.sub("", line)
f.write(line)
def cat(filename, dest=sys.stdout):
""" Copy a file to a destination file descriptor """
with io.open(filename, "rt") as f:
shutil.copyfileobj(f, dest)
class CtsScheduler(object):
""" Regression tests for Pacemaker's scheduler """
def _parse_args(self, argv):
""" Parse command-line arguments """
parser = argparse.ArgumentParser(description=DESC)
parser.add_argument('-V', '--verbose', action='count',
help='Display any differences from expected output')
parser.add_argument('--run', metavar='TEST',
help=('Run only single specified test (any further '
'arguments will be passed to crm_simulate)'))
parser.add_argument('--update', action='store_true',
help='Update expected results with actual results')
parser.add_argument('-b', '--binary', metavar='PATH',
help='Specify path to crm_simulate')
parser.add_argument('-i', '--io-dir', metavar='PATH',
help='Specify path to regression test data directory')
parser.add_argument('-o', '--out-dir', metavar='PATH',
help='Specify where intermediate and output files should go')
parser.add_argument('-v', '--valgrind', action='store_true',
help='Run all commands under valgrind')
parser.add_argument('--valgrind-dhat', action='store_true',
help='Run all commands under valgrind with heap analyzer')
parser.add_argument('--valgrind-skip-output', action='store_true',
help='If running under valgrind, do not display output')
parser.add_argument('--testcmd-options', metavar='OPTIONS', default='',
help='Additional options for command under test')
# argparse can't handle "everything after --run TEST", so grab that
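# (e.g. "cts-scheduler --run group1 -VVV" is split so that
# argv == [..., "--run", "group1"] and single_test_args == ["-VVV"])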
self.single_test_args = []
narg = 0
for arg in argv:
narg = narg + 1
if arg == '--run':
(argv, self.single_test_args) = (argv[:narg+1], argv[narg+1:])
break
self.args = parser.parse_args(argv[1:])
def _error(self, s):
print(" * ERROR: %s" % s)
def _failed(self, s):
print(" * FAILED: %s" % s)
def _get_valgrind_cmd(self):
""" Return command arguments needed (or not) to run valgrind """
if self.args.valgrind:
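# Make GLib's slice allocator fall back to plain malloc so valgrind
# can track its allocations accurately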
os.environ['G_SLICE'] = "always-malloc"
return [
"valgrind",
"-q",
"--gen-suppressions=all",
"--time-stamp=yes",
"--trace-children=no",
"--show-reachable=no",
"--leak-check=full",
"--num-callers=20",
"--suppressions=%s/valgrind-pcmk.suppressions" % (self.test_home)
]
if self.args.valgrind_dhat:
os.environ['G_SLICE'] = "always-malloc"
return [
"valgrind",
"--tool=exp-dhat",
"--time-stamp=yes",
"--trace-children=no",
"--show-top-n=100",
"--num-callers=4"
]
return []
def _get_simulator_cmd(self):
""" Locate the simulation binary """
if self.args.binary is None:
self.args.binary = BuildVars.BUILDDIR + "/tools/crm_simulate"
if not is_executable(self.args.binary):
self.args.binary = BuildVars.SBINDIR + "/crm_simulate"
if not is_executable(self.args.binary):
# @TODO it would be more pythonic to raise an exception
self._error("Test binary " + self.args.binary + " not found")
sys.exit(CrmExit.NOT_INSTALLED)
return [ self.args.binary ] + shlex.split(self.args.testcmd_options)
def set_schema_env(self):
""" Ensure schema directory environment variable is set, if possible """
try:
return os.environ['PCMK_schema_directory']
except KeyError:
for d in [ os.path.join(BuildVars.BUILDDIR, "xml"),
BuildVars.CRM_SCHEMA_DIRECTORY ]:
if os.path.isdir(d):
os.environ['PCMK_schema_directory'] = d
return d
return None
def __init__(self, argv=sys.argv):
# Ensure all command output is in portable locale for comparison
os.environ['LC_ALL'] = "C"
self._parse_args(argv)
# Where this executable lives
self.test_home = os.path.dirname(os.path.realpath(argv[0]))
# Where test data resides
if self.args.io_dir is None:
self.args.io_dir = os.path.join(self.test_home, "scheduler")
self.xml_input_dir = os.path.join(self.args.io_dir, "xml")
self.expected_dir = os.path.join(self.args.io_dir, "exp")
self.dot_expected_dir = os.path.join(self.args.io_dir, "dot")
self.scores_dir = os.path.join(self.args.io_dir, "scores")
self.summary_dir = os.path.join(self.args.io_dir, "summary")
self.stderr_expected_dir = os.path.join(self.args.io_dir, "stderr")
# Create a temporary directory to store diff file
self.failed_dir = tempfile.mkdtemp(prefix='cts-scheduler_')
# Where to store generated files
if self.args.out_dir is None:
self.args.out_dir = self.args.io_dir
self.failed_filename = os.path.join(self.failed_dir, "test-output.diff")
else:
self.failed_filename = os.path.join(self.args.out_dir, "test-output.diff")
os.environ['CIB_shadow_dir'] = self.args.out_dir
self.failed_file = None
self.outfile_out_dir = os.path.join(self.args.out_dir, "out")
self.dot_out_dir = os.path.join(self.args.out_dir, "dot")
self.scores_out_dir = os.path.join(self.args.out_dir, "scores")
self.summary_out_dir = os.path.join(self.args.out_dir, "summary")
self.stderr_out_dir = os.path.join(self.args.out_dir, "stderr")
self.valgrind_out_dir = os.path.join(self.args.out_dir, "valgrind")
# Single test mode (if requested)
try:
# User can give test base name or file name of a test input
self.args.run = os.path.splitext(os.path.basename(self.args.run))[0]
except (AttributeError, TypeError):
pass # --run was not specified
self.set_schema_env()
# Arguments needed (or not) to run commands
self.valgrind_args = self._get_valgrind_cmd()
self.simulate_args = self._get_simulator_cmd()
# Test counters
self.num_failed = 0
self.num_tests = 0
# Ensure that the main output directory exists
# We don't want to create it with os.makedirs below
if not os.path.isdir(self.args.out_dir):
self._error("Output directory missing; can't create output files")
sys.exit(CrmExit.CANTCREAT)
# Create output subdirectories if they don't exist
try:
os.makedirs(self.outfile_out_dir, 0o755, True)
os.makedirs(self.dot_out_dir, 0o755, True)
os.makedirs(self.scores_out_dir, 0o755, True)
os.makedirs(self.summary_out_dir, 0o755, True)
os.makedirs(self.stderr_out_dir, 0o755, True)
if self.valgrind_args:
os.makedirs(self.valgrind_out_dir, 0o755, True)
except OSError as ex:
self._error("Unable to create output subdirectory: %s" % ex)
remove_files([
self.outfile_out_dir,
self.dot_out_dir,
self.scores_out_dir,
self.summary_out_dir,
self.stderr_out_dir,
])
sys.exit(CrmExit.CANTCREAT)
def _compare_files(self, filename1, filename2):
""" Add any file differences to failed results """
if diff(filename1, filename2, stdout=subprocess.DEVNULL) != 0:
diff(filename1, filename2, stdout=self.failed_file, stderr=subprocess.DEVNULL)
self.failed_file.write("\n")
return True
return False
def run_one(self, test_name, test_desc, test_args=[]):
""" Run one scheduler test """
print(" Test %-25s %s" % ((test_name + ":"), test_desc))
did_fail = False
self.num_tests = self.num_tests + 1
# Test inputs
input_filename = os.path.join(
self.xml_input_dir, "%s.xml" % test_name)
expected_filename = os.path.join(
self.expected_dir, "%s.exp" % test_name)
dot_expected_filename = os.path.join(
self.dot_expected_dir, "%s.dot" % test_name)
scores_filename = os.path.join(
self.scores_dir, "%s.scores" % test_name)
summary_filename = os.path.join(
self.summary_dir, "%s.summary" % test_name)
stderr_expected_filename = os.path.join(
self.stderr_expected_dir, "%s.stderr" % test_name)
# (Intermediate) test outputs
output_filename = os.path.join(
self.outfile_out_dir, "%s.out" % test_name)
dot_output_filename = os.path.join(
self.dot_out_dir, "%s.dot.pe" % test_name)
score_output_filename = os.path.join(
self.scores_out_dir, "%s.scores.pe" % test_name)
summary_output_filename = os.path.join(
self.summary_out_dir, "%s.summary.pe" % test_name)
stderr_output_filename = os.path.join(
self.stderr_out_dir, "%s.stderr.pe" % test_name)
valgrind_output_filename = os.path.join(
self.valgrind_out_dir, "%s.valgrind" % test_name)
# Common arguments for running test
test_cmd = []
if self.valgrind_args:
test_cmd = self.valgrind_args + [ "--log-file=%s" % valgrind_output_filename ]
test_cmd = test_cmd + self.simulate_args
# @TODO It would be more pythonic to raise exceptions for errors,
# then perhaps it would be nice to make a single-test class
# Ensure necessary test inputs exist
if not os.path.isfile(input_filename):
self._error("No input")
self.num_failed = self.num_failed + 1
return CrmExit.NOINPUT
if not self.args.update and not os.path.isfile(expected_filename):
self._error("no stored output")
return CrmExit.NOINPUT
# Run simulation to generate summary output
if self.args.run: # Single test mode
test_cmd_full = test_cmd + [ '-x', input_filename, '-S' ] + test_args
print(" ".join(test_cmd_full))
else:
# @TODO Why isn't test_args added here?
test_cmd_full = test_cmd + [ '-x', input_filename, '-S' ]
with io.open(summary_output_filename, "wt") as f:
simulation = subprocess.Popen(test_cmd_full, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
env=os.environ)
# This makes diff happy regardless of --enable-compat-2.0.
# Use sed -E to make Linux and BSD special characters more compatible.
sed = subprocess.Popen(["sed", "-E",
"-e", "s/ocf::/ocf:/g",
"-e", r"s/Masters:/Promoted:/",
"-e", r"s/Slaves:/Unpromoted:/",
"-e", r"s/ Master( |\[|$)/ Promoted\1/",
"-e", r"s/ Slave / Unpromoted /",
], stdin=simulation.stdout, stdout=f,
stderr=subprocess.STDOUT)
simulation.stdout.close()
sed.communicate()
if self.args.run:
cat(summary_output_filename)
# Re-run simulation to generate dot, graph, and scores
test_cmd_full = test_cmd + [
'-x', input_filename,
'-D', dot_output_filename,
'-G', output_filename,
'-sSQ' ] + test_args
with io.open(stderr_output_filename, "wt") as f_stderr, \
io.open(score_output_filename, "wt") as f_score:
rc = subprocess.call(test_cmd_full, stdout=f_score, stderr=f_stderr, env=os.environ)
# Check for test command failure
if rc != CrmExit.OK:
self._failed("Test returned: %d" % rc)
did_fail = True
print(" ".join(test_cmd_full))
# Check for valgrind errors
if self.valgrind_args and not self.args.valgrind_skip_output:
if os.stat(valgrind_output_filename).st_size > 0:
self._failed("Valgrind reported errors")
did_fail = True
cat(valgrind_output_filename)
remove_files([ valgrind_output_filename ])
# Check for core dump
if os.path.isfile("core"):
self._failed("Core-file detected: core." + test_name)
did_fail = True
os.rename("core", "%s/core.%s" % (self.test_home, test_name))
# Check any stderr output
if os.path.isfile(stderr_expected_filename):
if self._compare_files(stderr_expected_filename, stderr_output_filename):
self._failed("stderr changed")
did_fail = True
elif os.stat(stderr_output_filename).st_size > 0:
self._failed("Output was written to stderr")
did_fail = True
cat(stderr_output_filename)
remove_files([ stderr_output_filename ])
# Check whether output graph exists, and normalize it
if (not os.path.isfile(output_filename)
or os.stat(output_filename).st_size == 0):
self._error("No graph produced")
did_fail = True
self.num_failed = self.num_failed + 1
remove_files([ output_filename ])
return CrmExit.ERROR
normalize(output_filename)
# Check whether dot output exists, and sort it
if (not os.path.isfile(dot_output_filename) or
os.stat(dot_output_filename).st_size == 0):
self._error("No dot-file summary produced")
did_fail = True
self.num_failed = self.num_failed + 1
remove_files([ dot_output_filename, output_filename ])
return CrmExit.ERROR
with io.open(dot_output_filename, "rt") as f:
first_line = f.readline() # "digraph" line with opening brace
lines = f.readlines()
last_line = lines[-1] # closing brace
del lines[-1]
lines = sorted(set(lines)) # unique sort
with io.open(dot_output_filename, "wt") as f:
f.write(first_line)
f.writelines(lines)
f.write(last_line)
# Check whether score output exists, and sort it
if (not os.path.isfile(score_output_filename)
or os.stat(score_output_filename).st_size == 0):
self._error("No allocation scores produced")
did_fail = True
self.num_failed = self.num_failed + 1
remove_files([ score_output_filename, output_filename ])
return CrmExit.ERROR
else:
sort_file(score_output_filename)
if self.args.update:
shutil.copyfile(output_filename, expected_filename)
shutil.copyfile(dot_output_filename, dot_expected_filename)
shutil.copyfile(score_output_filename, scores_filename)
shutil.copyfile(summary_output_filename, summary_filename)
print(" Updated expected outputs")
if self._compare_files(summary_filename, summary_output_filename):
self._failed("summary changed")
did_fail = True
if self._compare_files(dot_expected_filename, dot_output_filename):
self._failed("dot-file summary changed")
did_fail = True
else:
remove_files([ dot_output_filename ])
if self._compare_files(expected_filename, output_filename):
self._failed("xml-file changed")
did_fail = True
if self._compare_files(scores_filename, score_output_filename):
self._failed("scores-file changed")
did_fail = True
remove_files([ output_filename,
score_output_filename,
summary_output_filename])
if did_fail:
self.num_failed = self.num_failed + 1
return CrmExit.ERROR
return CrmExit.OK
def run_all(self):
""" Run all defined tests """
if platform.architecture()[0] == "64bit":
TESTS.extend(TESTS_64BIT)
for group in TESTS:
for test in group:
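# A test entry may optionally carry extra crm_simulate arguments as a third element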
try:
args = test[2]
except IndexError:
args = []
self.run_one(test[0], test[1], args)
print()
def _print_summary(self):
""" Print a summary of parameters for this test run """
print("Test home is:\t" + self.test_home)
print("Test binary is:\t" + self.args.binary)
if 'PCMK_schema_directory' in os.environ:
print("Schema home is:\t" + os.environ['PCMK_schema_directory'])
if self.valgrind_args != []:
print("Activating memory testing with valgrind")
print()
def _test_results(self):
if self.num_failed == 0:
shutil.rmtree(self.failed_dir)
return CrmExit.OK
if os.path.isfile(self.failed_filename) and os.stat(self.failed_filename).st_size != 0:
if self.args.verbose:
self._error("Results of %d failed tests (out of %d):" %
(self.num_failed, self.num_tests))
cat(self.failed_filename)
else:
self._error("Results of %d failed tests (out of %d) are in %s" %
(self.num_failed, self.num_tests, self.failed_filename))
self._error("Use -V to display them after running the tests")
else:
self._error("%d (of %d) tests failed (no diff results)" %
(self.num_failed, self.num_tests))
if os.path.isfile(self.failed_filename):
shutil.rmtree(self.failed_dir)
return CrmExit.ERROR
def run(self):
""" Run test(s) as specified """
self._print_summary()
# Zero out the error log
self.failed_file = io.open(self.failed_filename, "wt")
if self.args.run is None:
print("Performing the following tests from " + self.args.io_dir)
print()
self.run_all()
print()
self.failed_file.close()
rc = self._test_results()
else:
rc = self.run_one(self.args.run, "Single shot", self.single_test_args)
self.failed_file.close()
if self.num_failed > 0:
print("\nFailures:\nThese have also been written to: " + self.failed_filename + "\n")
cat(self.failed_filename)
shutil.rmtree(self.failed_dir)
return rc
if __name__ == "__main__":
sys.exit(CtsScheduler().run())
# vim: set filetype=python expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=120:
diff --git a/cts/scheduler/dot/restart-with-extra-op-params.dot b/cts/scheduler/dot/restart-with-extra-op-params.dot
new file mode 100644
index 0000000000..e13a1bf84c
--- /dev/null
+++ b/cts/scheduler/dot/restart-with-extra-op-params.dot
@@ -0,0 +1,7 @@
+ digraph "g" {
+"dummy1_monitor_10000 node2" [ style=bold color="green" fontcolor="black"]
+"dummy1_start_0 node2" -> "dummy1_monitor_10000 node2" [ style = bold]
+"dummy1_start_0 node2" [ style=bold color="green" fontcolor="black"]
+"dummy1_stop_0 node2" -> "dummy1_start_0 node2" [ style = bold]
+"dummy1_stop_0 node2" [ style=bold color="green" fontcolor="black"]
+}
diff --git a/cts/scheduler/exp/restart-with-extra-op-params.exp b/cts/scheduler/exp/restart-with-extra-op-params.exp
new file mode 100644
index 0000000000..6d119b66ec
--- /dev/null
+++ b/cts/scheduler/exp/restart-with-extra-op-params.exp
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/cts/scheduler/scores/restart-with-extra-op-params.scores b/cts/scheduler/scores/restart-with-extra-op-params.scores
new file mode 100644
index 0000000000..c28b31aca0
--- /dev/null
+++ b/cts/scheduler/scores/restart-with-extra-op-params.scores
@@ -0,0 +1,5 @@
+
+pcmk__native_allocate: Fencing allocation score on node1: 0
+pcmk__native_allocate: Fencing allocation score on node2: 0
+pcmk__native_allocate: dummy1 allocation score on node1: 0
+pcmk__native_allocate: dummy1 allocation score on node2: 0
diff --git a/cts/scheduler/summary/restart-with-extra-op-params.summary b/cts/scheduler/summary/restart-with-extra-op-params.summary
new file mode 100644
index 0000000000..d80e0d9d87
--- /dev/null
+++ b/cts/scheduler/summary/restart-with-extra-op-params.summary
@@ -0,0 +1,25 @@
+Using the original execution date of: 2021-03-31 14:58:18Z
+Current cluster status:
+ * Node List:
+ * Online: [ node1 node2 ]
+
+ * Full List of Resources:
+ * Fencing (stonith:fence_dummy): Started node1
+ * dummy1 (ocf:pacemaker:Dummy): Started node2
+
+Transition Summary:
+ * Restart dummy1 ( node2 ) due to resource definition change
+
+Executing Cluster Transition:
+ * Resource action: dummy1 stop on node2
+ * Resource action: dummy1 start on node2
+ * Resource action: dummy1 monitor=10000 on node2
+Using the original execution date of: 2021-03-31 14:58:18Z
+
+Revised Cluster Status:
+ * Node List:
+ * Online: [ node1 node2 ]
+
+ * Full List of Resources:
+ * Fencing (stonith:fence_dummy): Started node1
+ * dummy1 (ocf:pacemaker:Dummy): Started node2
diff --git a/cts/scheduler/xml/restart-with-extra-op-params.xml b/cts/scheduler/xml/restart-with-extra-op-params.xml
new file mode 100644
index 0000000000..92f79d2b4e
--- /dev/null
+++ b/cts/scheduler/xml/restart-with-extra-op-params.xml
@@ -0,0 +1,66 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index c5c7dade8d..cd61026153 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1,2930 +1,2943 @@
/*
* Copyright 2004-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include
#include
#include
#include
#include
#include
#include // lrmd_event_data_t, lrmd_rsc_info_t, etc.
#include
#include
#include
#include
#include
#include
#include
#define START_DELAY_THRESHOLD 5 * 60 * 1000
#define MAX_LRM_REG_FAILS 30
struct delete_event_s {
int rc;
const char *rsc;
lrm_state_t *lrm_state;
};
extern pcmk__output_t *logger_out;
static gboolean is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id);
static gboolean build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list);
static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer user_data);
static lrmd_event_data_t *construct_op(lrm_state_t * lrm_state, xmlNode * rsc_op,
const char *rsc_id, const char *operation);
static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
const char *operation, xmlNode *msg);
static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state,
int log_level);
static int do_update_resource(const char *node_name, lrmd_rsc_info_t *rsc,
lrmd_event_data_t *op, time_t lock_time);
static void
lrm_connection_destroy(void)
{
if (pcmk_is_set(fsa_input_register, R_LRM_CONNECTED)) {
crm_crit("Connection to executor failed");
register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
controld_clear_fsa_input_flags(R_LRM_CONNECTED);
} else {
crm_info("Disconnected from executor");
}
}
static char *
make_stop_id(const char *rsc, int call_id)
{
return crm_strdup_printf("%s:%d", rsc, call_id);
}
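/* GHFunc: copy a parameter into user_data unless its key is a meta-attribute */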
static void
copy_instance_keys(gpointer key, gpointer value, gpointer user_data)
{
if (strstr(key, CRM_META "_") == NULL) {
g_hash_table_replace(user_data, strdup((const char *)key), strdup((const char *)value));
}
}
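/* GHFunc: copy a parameter into user_data only if its key is a meta-attribute */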
static void
copy_meta_keys(gpointer key, gpointer value, gpointer user_data)
{
if (strstr(key, CRM_META "_") != NULL) {
g_hash_table_replace(user_data, strdup((const char *)key), strdup((const char *)value));
}
}
/*!
* \internal
* \brief Remove a recurring operation from a resource's history
*
* \param[in,out] history Resource history to modify
* \param[in] op Operation to remove
*
* \return TRUE if the operation was found and removed, FALSE otherwise
*/
static gboolean
history_remove_recurring_op(rsc_history_t *history, const lrmd_event_data_t *op)
{
GList *iter;
for (iter = history->recurring_op_list; iter != NULL; iter = iter->next) {
lrmd_event_data_t *existing = iter->data;
if ((op->interval_ms == existing->interval_ms)
&& pcmk__str_eq(op->rsc_id, existing->rsc_id, pcmk__str_none)
&& pcmk__str_eq(op->op_type, existing->op_type, pcmk__str_casei)) {
history->recurring_op_list = g_list_delete_link(history->recurring_op_list, iter);
lrmd_free_event(existing);
return TRUE;
}
}
return FALSE;
}
/*!
* \internal
* \brief Free all recurring operations in resource history
*
* \param[in,out] history Resource history to modify
*/
static void
history_free_recurring_ops(rsc_history_t *history)
{
GList *iter;
for (iter = history->recurring_op_list; iter != NULL; iter = iter->next) {
lrmd_free_event(iter->data);
}
g_list_free(history->recurring_op_list);
history->recurring_op_list = NULL;
}
/*!
* \internal
* \brief Free resource history
*
* \param[in,out] history Resource history to free
*/
void
history_free(gpointer data)
{
rsc_history_t *history = (rsc_history_t*)data;
if (history->stop_params) {
g_hash_table_destroy(history->stop_params);
}
/* Don't need to free history->rsc.id because it's set to history->id */
free(history->rsc.type);
free(history->rsc.standard);
free(history->rsc.provider);
lrmd_free_event(history->failed);
lrmd_free_event(history->last);
free(history->id);
history_free_recurring_ops(history);
free(history);
}
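/*!
 * \internal
 * \brief Update the in-memory resource history with an operation result
 *
 * \param[in,out] lrm_state  Executor state whose history should be updated
 * \param[in]     rsc        Resource the operation is for (NULL if unknown)
 * \param[in]     op         Operation result to record
 */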
static void
update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op)
{
int target_rc = 0;
rsc_history_t *entry = NULL;
if (op->rsc_deleted) {
crm_debug("Purged history for '%s' after %s", op->rsc_id, op->op_type);
controld_delete_resource_history(op->rsc_id, lrm_state->node_name,
NULL, crmd_cib_smart_opt());
return;
}
if (pcmk__str_eq(op->op_type, RSC_NOTIFY, pcmk__str_casei)) {
return;
}
crm_debug("Updating history for '%s' with %s op", op->rsc_id, op->op_type);
entry = g_hash_table_lookup(lrm_state->resource_history, op->rsc_id);
if (entry == NULL && rsc) {
entry = calloc(1, sizeof(rsc_history_t));
entry->id = strdup(op->rsc_id);
g_hash_table_insert(lrm_state->resource_history, entry->id, entry);
entry->rsc.id = entry->id;
entry->rsc.type = strdup(rsc->type);
entry->rsc.standard = strdup(rsc->standard);
pcmk__str_update(&entry->rsc.provider, rsc->provider);
} else if (entry == NULL) {
crm_info("Resource %s no longer exists, not updating cache", op->rsc_id);
return;
}
entry->last_callid = op->call_id;
target_rc = rsc_op_expected_rc(op);
if (op->op_status == PCMK_EXEC_CANCELLED) {
if (op->interval_ms > 0) {
crm_trace("Removing cancelled recurring op: " PCMK__OP_FMT,
op->rsc_id, op->op_type, op->interval_ms);
history_remove_recurring_op(entry, op);
return;
} else {
crm_trace("Skipping " PCMK__OP_FMT " rc=%d, status=%d",
op->rsc_id, op->op_type, op->interval_ms, op->rc,
op->op_status);
}
} else if (did_rsc_op_fail(op, target_rc)) {
/* Store failed monitors here, otherwise the block below will cause them
* to be forgotten when a stop happens.
*/
if (entry->failed) {
lrmd_free_event(entry->failed);
}
entry->failed = lrmd_copy_event(op);
} else if (op->interval_ms == 0) {
if (entry->last) {
lrmd_free_event(entry->last);
}
entry->last = lrmd_copy_event(op);
if (op->params && pcmk__strcase_any_of(op->op_type, CRMD_ACTION_START,
CRMD_ACTION_RELOAD,
CRMD_ACTION_RELOAD_AGENT,
CRMD_ACTION_STATUS, NULL)) {
if (entry->stop_params) {
g_hash_table_destroy(entry->stop_params);
}
entry->stop_params = pcmk__strkey_table(free, free);
g_hash_table_foreach(op->params, copy_instance_keys, entry->stop_params);
}
}
if (op->interval_ms > 0) {
/* Ensure there are no duplicates */
history_remove_recurring_op(entry, op);
crm_trace("Adding recurring op: " PCMK__OP_FMT,
op->rsc_id, op->op_type, op->interval_ms);
entry->recurring_op_list = g_list_prepend(entry->recurring_op_list, lrmd_copy_event(op));
} else if (entry->recurring_op_list && !pcmk__str_eq(op->op_type, RSC_STATUS, pcmk__str_casei)) {
crm_trace("Dropping %d recurring ops because of: " PCMK__OP_FMT,
g_list_length(entry->recurring_op_list), op->rsc_id,
op->op_type, op->interval_ms);
history_free_recurring_ops(entry);
}
}
/*!
* \internal
* \brief Send a direct OK ack for a resource task
*
* \param[in] lrm_state LRM connection
* \param[in] input Input message being ack'ed
* \param[in] rsc_id ID of affected resource
* \param[in] rsc Affected resource (if available)
* \param[in] task Operation task being ack'ed
* \param[in] ack_host Name of host to send ack to
* \param[in] ack_sys IPC system name to ack
*/
static void
send_task_ok_ack(lrm_state_t *lrm_state, ha_msg_input_t *input,
const char *rsc_id, lrmd_rsc_info_t *rsc, const char *task,
const char *ack_host, const char *ack_sys)
{
lrmd_event_data_t *op = construct_op(lrm_state, input->xml, rsc_id, task);
lrmd__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
controld_ack_event_directly(ack_host, ack_sys, rsc, op, rsc_id);
lrmd_free_event(op);
}
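/*!
 * \internal
 * \brief Get the name of the node that an executor event is for
 *
 * \param[in] op  Executor event
 *
 * \return Remote node name if the event is for a remote node, otherwise the
 *         local node name
 */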
static inline const char *
op_node_name(lrmd_event_data_t *op)
{
return op->remote_nodename? op->remote_nodename : fsa_our_uname;
}
void
lrm_op_callback(lrmd_event_data_t * op)
{
CRM_CHECK(op != NULL, return);
switch (op->type) {
case lrmd_event_disconnect:
if (op->remote_nodename == NULL) {
/* If this is the local executor IPC connection, set the right
* bits in the controller when the connection goes down.
*/
lrm_connection_destroy();
}
break;
case lrmd_event_exec_complete:
{
lrm_state_t *lrm_state = lrm_state_find(op_node_name(op));
CRM_ASSERT(lrm_state != NULL);
process_lrm_event(lrm_state, op, NULL, NULL);
}
break;
default:
break;
}
}
static void
try_local_executor_connect(long long action, fsa_data_t *msg_data,
lrm_state_t *lrm_state)
{
int rc = pcmk_rc_ok;
crm_debug("Connecting to the local executor");
// If we can connect, great
rc = controld_connect_local_executor(lrm_state);
if (rc == pcmk_rc_ok) {
controld_set_fsa_input_flags(R_LRM_CONNECTED);
crm_info("Connection to the local executor established");
return;
}
// Otherwise, if we can try again, set a timer to do so
if (lrm_state->num_lrm_register_fails < MAX_LRM_REG_FAILS) {
crm_warn("Failed to connect to the local executor %d time%s "
"(%d max): %s", lrm_state->num_lrm_register_fails,
pcmk__plural_s(lrm_state->num_lrm_register_fails),
MAX_LRM_REG_FAILS, pcmk_rc_str(rc));
controld_start_timer(wait_timer);
crmd_fsa_stall(FALSE);
return;
}
// Otherwise give up
crm_err("Failed to connect to the executor the max allowed "
"%d time%s: %s", lrm_state->num_lrm_register_fails,
pcmk__plural_s(lrm_state->num_lrm_register_fails),
pcmk_rc_str(rc));
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
/* A_LRM_CONNECT */
void
do_lrm_control(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
/* This only pertains to local executor connections. Remote connections are
* handled as resources within the scheduler. Connecting and disconnecting
* from remote executor instances is handled differently.
*/
lrm_state_t *lrm_state = NULL;
if(fsa_our_uname == NULL) {
return; /* Nothing to do */
}
lrm_state = lrm_state_find_or_create(fsa_our_uname);
if (lrm_state == NULL) {
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
return;
}
if (action & A_LRM_DISCONNECT) {
if (lrm_state_verify_stopped(lrm_state, cur_state, LOG_INFO) == FALSE) {
if (action == A_LRM_DISCONNECT) {
crmd_fsa_stall(FALSE);
return;
}
}
controld_clear_fsa_input_flags(R_LRM_CONNECTED);
crm_info("Disconnecting from the executor");
lrm_state_disconnect(lrm_state);
lrm_state_reset_tables(lrm_state, FALSE);
crm_notice("Disconnected from the executor");
}
if (action & A_LRM_CONNECT) {
try_local_executor_connect(action, msg_data, lrm_state);
}
if (action & ~(A_LRM_CONNECT | A_LRM_DISCONNECT)) {
crm_err("Unexpected action %s in %s", fsa_action2string(action),
__func__);
}
}
static gboolean
lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state, int log_level)
{
int counter = 0;
gboolean rc = TRUE;
const char *when = "lrm disconnect";
GHashTableIter gIter;
const char *key = NULL;
rsc_history_t *entry = NULL;
active_op_t *pending = NULL;
crm_debug("Checking for active resources before exit");
if (cur_state == S_TERMINATE) {
log_level = LOG_ERR;
when = "shutdown";
} else if (pcmk_is_set(fsa_input_register, R_SHUTDOWN)) {
when = "shutdown... waiting";
}
if (lrm_state->pending_ops && lrm_state_is_connected(lrm_state) == TRUE) {
guint removed = g_hash_table_foreach_remove(
lrm_state->pending_ops, stop_recurring_actions, lrm_state);
guint nremaining = g_hash_table_size(lrm_state->pending_ops);
if (removed || nremaining) {
crm_notice("Stopped %u recurring operation%s at %s (%u remaining)",
removed, pcmk__plural_s(removed), when, nremaining);
}
}
if (lrm_state->pending_ops) {
g_hash_table_iter_init(&gIter, lrm_state->pending_ops);
while (g_hash_table_iter_next(&gIter, NULL, (void **)&pending)) {
/* Ignore recurring actions in the shutdown calculations */
if (pending->interval_ms == 0) {
counter++;
}
}
}
if (counter > 0) {
do_crm_log(log_level, "%d pending executor operation%s at %s",
counter, pcmk__plural_s(counter), when);
if ((cur_state == S_TERMINATE)
|| !pcmk_is_set(fsa_input_register, R_SENT_RSC_STOP)) {
g_hash_table_iter_init(&gIter, lrm_state->pending_ops);
while (g_hash_table_iter_next(&gIter, (gpointer*)&key, (gpointer*)&pending)) {
do_crm_log(log_level, "Pending action: %s (%s)", key, pending->op_key);
}
} else {
rc = FALSE;
}
return rc;
}
if (lrm_state->resource_history == NULL) {
return rc;
}
if (pcmk_is_set(fsa_input_register, R_SHUTDOWN)) {
/* At this point we're not waiting, we're just shutting down */
when = "shutdown";
}
counter = 0;
g_hash_table_iter_init(&gIter, lrm_state->resource_history);
while (g_hash_table_iter_next(&gIter, NULL, (gpointer*)&entry)) {
if (is_rsc_active(lrm_state, entry->id) == FALSE) {
continue;
}
counter++;
if (log_level == LOG_ERR) {
crm_info("Found %s active at %s", entry->id, when);
} else {
crm_trace("Found %s active at %s", entry->id, when);
}
if (lrm_state->pending_ops) {
GHashTableIter hIter;
g_hash_table_iter_init(&hIter, lrm_state->pending_ops);
while (g_hash_table_iter_next(&hIter, (gpointer*)&key, (gpointer*)&pending)) {
if (pcmk__str_eq(entry->id, pending->rsc_id, pcmk__str_none)) {
crm_notice("%sction %s (%s) incomplete at %s",
pending->interval_ms == 0 ? "A" : "Recurring a",
key, pending->op_key, when);
}
}
}
}
if (counter) {
crm_err("%d resource%s active at %s",
counter, (counter == 1)? " was" : "s were", when);
}
return rc;
}
/*!
* \internal
* \brief Build XML and string of parameters meeting some criteria, for digest
*
* \param[in] op Executor event with parameter table to use
* \param[in] metadata Parsed meta-data for executed resource agent
* \param[in] param_type Flag used for selection criteria
* \param[out] result Will be set to newly created XML with selected
* parameters as attributes
*
* \return Newly allocated space-separated string of parameter names
* \note Selection criteria varies by param_type: for the restart digest, we
* want parameters that are *not* marked reloadable (OCF 1.1) or that
* *are* marked unique (pre-1.1), for both string and XML results; for the
* secure digest, we want parameters that *are* marked private for the
* string, but parameters that are *not* marked private for the XML.
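* \note For example (hypothetical parameter names): with "user" not marked
*       private and "passwd" marked private, the ra_param_private string
*       result would contain passwd, while the XML result would contain
*       only user.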
* \note It is the caller's responsibility to free the string return value with
* free() and the XML result with free_xml().
*/
static char *
build_parameter_list(const lrmd_event_data_t *op,
const struct ra_metadata_s *metadata,
enum ra_param_flags_e param_type, xmlNode **result)
{
char *list = NULL;
size_t len = 0;
*result = create_xml_node(NULL, XML_TAG_PARAMS);
+ /* Consider all parameters except private ones, to be consistent with
+ * what the scheduler does in calculate_secure_digest().
+ */
+ if (param_type == ra_param_private
+ && compare_version(fsa_our_dc_version, "3.16.0") >= 0) {
+ g_hash_table_foreach(op->params, hash2field, *result);
+ pcmk__filter_op_for_digest(*result);
+ }
+
for (GList *iter = metadata->ra_params; iter != NULL; iter = iter->next) {
struct ra_param_s *param = (struct ra_param_s *) iter->data;
bool accept_for_list = false;
bool accept_for_xml = false;
switch (param_type) {
case ra_param_reloadable:
accept_for_list = !pcmk_is_set(param->rap_flags, param_type);
accept_for_xml = accept_for_list;
break;
case ra_param_unique:
accept_for_list = pcmk_is_set(param->rap_flags, param_type);
accept_for_xml = accept_for_list;
break;
case ra_param_private:
accept_for_list = pcmk_is_set(param->rap_flags, param_type);
accept_for_xml = !accept_for_list;
break;
}
if (accept_for_list) {
crm_trace("Attr %s is %s", param->rap_name, ra_param_flag2text(param_type));
if (list == NULL) {
// We will later search for " WORD ", so start list with a space
pcmk__add_word(&list, &len, " ");
}
pcmk__add_word(&list, &len, param->rap_name);
} else {
crm_trace("Rejecting %s for %s", param->rap_name, ra_param_flag2text(param_type));
}
if (accept_for_xml) {
const char *v = g_hash_table_lookup(op->params, param->rap_name);
if (v != NULL) {
crm_trace("Adding attr %s=%s to the xml result", param->rap_name, v);
crm_xml_add(*result, param->rap_name, v);
}
+
+ } else {
+ crm_trace("Removing attr %s from the xml result", param->rap_name);
+ xml_remove_prop(*result, param->rap_name);
}
}
if (list != NULL) {
// We will later search for " WORD ", so end list with a space
pcmk__add_word(&list, &len, " ");
}
return list;
}
static void
append_restart_list(lrmd_event_data_t *op, struct ra_metadata_s *metadata,
xmlNode *update, const char *version)
{
char *list = NULL;
char *digest = NULL;
xmlNode *restart = NULL;
CRM_LOG_ASSERT(op->params != NULL);
if (op->interval_ms > 0) {
/* monitors are not reloadable */
return;
}
if (pcmk_is_set(metadata->ra_flags, ra_supports_reload_agent)) {
// Add parameters not marked reloadable to the "op-force-restart" list
list = build_parameter_list(op, metadata, ra_param_reloadable,
&restart);
} else if (pcmk_is_set(metadata->ra_flags, ra_supports_legacy_reload)) {
/* @COMPAT pre-OCF-1.1 resource agents
*
* Before OCF 1.1, Pacemaker abused "unique=0" to indicate
* reloadability. Add any parameters with unique="1" to the
* "op-force-restart" list.
*/
list = build_parameter_list(op, metadata, ra_param_unique, &restart);
} else {
// Resource does not support agent reloads
return;
}
digest = calculate_operation_digest(restart, version);
/* Add "op-force-restart" and "op-restart-digest" to indicate the resource supports reload,
* no matter if it actually supports any parameters with unique="1"). */
crm_xml_add(update, XML_LRM_ATTR_OP_RESTART, list? list: "");
crm_xml_add(update, XML_LRM_ATTR_RESTART_DIGEST, digest);
crm_trace("%s: %s, %s", op->rsc_id, digest, list);
crm_log_xml_trace(restart, "restart digest source");
free_xml(restart);
free(digest);
free(list);
}
static void
append_secure_list(lrmd_event_data_t *op, struct ra_metadata_s *metadata,
xmlNode *update, const char *version)
{
char *list = NULL;
char *digest = NULL;
xmlNode *secure = NULL;
CRM_LOG_ASSERT(op->params != NULL);
/*
* To keep XML_LRM_ATTR_OP_SECURE short, we want it to contain the
* secure parameters but XML_LRM_ATTR_SECURE_DIGEST to be based on
* the insecure ones
*/
list = build_parameter_list(op, metadata, ra_param_private, &secure);
if (list != NULL) {
digest = calculate_operation_digest(secure, version);
crm_xml_add(update, XML_LRM_ATTR_OP_SECURE, list);
crm_xml_add(update, XML_LRM_ATTR_SECURE_DIGEST, digest);
crm_trace("%s: %s, %s", op->rsc_id, digest, list);
crm_log_xml_trace(secure, "secure digest source");
} else {
crm_trace("%s: no secure parameters", op->rsc_id);
}
free_xml(secure);
free(digest);
free(list);
}
static gboolean
build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op,
const char *node_name, const char *src)
{
int target_rc = 0;
xmlNode *xml_op = NULL;
struct ra_metadata_s *metadata = NULL;
const char *caller_version = NULL;
lrm_state_t *lrm_state = NULL;
uint32_t metadata_source = controld_metadata_from_agent;
if (op == NULL) {
return FALSE;
}
target_rc = rsc_op_expected_rc(op);
/* there is a small risk in formerly mixed clusters that it will
* be sub-optimal.
*
* however with our upgrade policy, the update we send should
* still be completely supported anyway
*/
caller_version = g_hash_table_lookup(op->params, XML_ATTR_CRM_VERSION);
CRM_LOG_ASSERT(caller_version != NULL);
if(caller_version == NULL) {
caller_version = CRM_FEATURE_SET;
}
xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc,
fsa_our_uname, src);
if (xml_op == NULL) {
return TRUE;
}
if ((rsc == NULL) || (op->params == NULL)
|| !crm_op_needs_metadata(rsc->standard, op->op_type)) {
crm_trace("No digests needed for %s action on %s (params=%p rsc=%p)",
op->op_type, op->rsc_id, op->params, rsc);
return TRUE;
}
lrm_state = lrm_state_find(node_name);
if (lrm_state == NULL) {
crm_warn("Cannot calculate digests for operation " PCMK__OP_FMT
" because we have no connection to executor for %s",
op->rsc_id, op->op_type, op->interval_ms, node_name);
return TRUE;
}
/* Getting meta-data from cache is OK unless this is a successful start
* action -- always refresh from the agent for those, in case the
* resource agent was updated.
*
* @TODO Only refresh the meta-data after starts if the agent actually
* changed (using something like inotify, or a hash or modification time of
* the agent executable).
*/
if ((op->op_status != PCMK_EXEC_DONE) || (op->rc != target_rc)
|| !pcmk__str_eq(op->op_type, CRMD_ACTION_START, pcmk__str_none)) {
metadata_source |= controld_metadata_from_cache;
}
metadata = controld_get_rsc_metadata(lrm_state, rsc, metadata_source);
if (metadata == NULL) {
return TRUE;
}
#if ENABLE_VERSIONED_ATTRS
crm_xml_add(xml_op, XML_ATTR_RA_VERSION, metadata->ra_version);
#endif
crm_trace("Including additional digests for %s:%s:%s",
rsc->standard, rsc->provider, rsc->type);
append_restart_list(op, metadata, xml_op, caller_version);
append_secure_list(op, metadata, xml_op, caller_version);
return TRUE;
}
static gboolean
is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id)
{
rsc_history_t *entry = NULL;
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
if (entry == NULL || entry->last == NULL) {
return FALSE;
}
crm_trace("Processing %s: %s.%d=%d", rsc_id, entry->last->op_type,
entry->last->interval_ms, entry->last->rc);
if (entry->last->rc == PCMK_OCF_OK && pcmk__str_eq(entry->last->op_type, CRMD_ACTION_STOP, pcmk__str_casei)) {
return FALSE;
} else if (entry->last->rc == PCMK_OCF_OK
&& pcmk__str_eq(entry->last->op_type, CRMD_ACTION_MIGRATE, pcmk__str_casei)) {
// A stricter check is too complex ... leave that to the scheduler
return FALSE;
} else if (entry->last->rc == PCMK_OCF_NOT_RUNNING) {
return FALSE;
} else if ((entry->last->interval_ms == 0)
&& (entry->last->rc == PCMK_OCF_NOT_CONFIGURED)) {
/* Badly configured resources can't be reliably stopped */
return FALSE;
}
return TRUE;
}
static gboolean
build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list)
{
GHashTableIter iter;
rsc_history_t *entry = NULL;
g_hash_table_iter_init(&iter, lrm_state->resource_history);
while (g_hash_table_iter_next(&iter, NULL, (void **)&entry)) {
GList *gIter = NULL;
xmlNode *xml_rsc = create_xml_node(rsc_list, XML_LRM_TAG_RESOURCE);
crm_xml_add(xml_rsc, XML_ATTR_ID, entry->id);
crm_xml_add(xml_rsc, XML_ATTR_TYPE, entry->rsc.type);
crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, entry->rsc.standard);
crm_xml_add(xml_rsc, XML_AGENT_ATTR_PROVIDER, entry->rsc.provider);
if (entry->last && entry->last->params) {
const char *container = g_hash_table_lookup(entry->last->params, CRM_META"_"XML_RSC_ATTR_CONTAINER);
if (container) {
crm_trace("Resource %s is a part of container resource %s", entry->id, container);
crm_xml_add(xml_rsc, XML_RSC_ATTR_CONTAINER, container);
}
}
build_operation_update(xml_rsc, &(entry->rsc), entry->failed, lrm_state->node_name,
__func__);
build_operation_update(xml_rsc, &(entry->rsc), entry->last, lrm_state->node_name,
__func__);
for (gIter = entry->recurring_op_list; gIter != NULL; gIter = gIter->next) {
build_operation_update(xml_rsc, &(entry->rsc), gIter->data, lrm_state->node_name,
__func__);
}
}
return FALSE;
}
static xmlNode *
do_lrm_query_internal(lrm_state_t *lrm_state, int update_flags)
{
xmlNode *xml_state = NULL;
xmlNode *xml_data = NULL;
xmlNode *rsc_list = NULL;
crm_node_t *peer = NULL;
peer = crm_get_peer_full(0, lrm_state->node_name, CRM_GET_PEER_ANY);
CRM_CHECK(peer != NULL, return NULL);
xml_state = create_node_state_update(peer, update_flags, NULL,
__func__);
if (xml_state == NULL) {
return NULL;
}
xml_data = create_xml_node(xml_state, XML_CIB_TAG_LRM);
crm_xml_add(xml_data, XML_ATTR_ID, peer->uuid);
rsc_list = create_xml_node(xml_data, XML_LRM_TAG_RESOURCES);
/* Build a list of active (not always running) resources */
build_active_RAs(lrm_state, rsc_list);
crm_log_xml_trace(xml_state, "Current executor state");
return xml_state;
}
xmlNode *
controld_query_executor_state(const char *node_name)
{
lrm_state_t *lrm_state = lrm_state_find(node_name);
if (!lrm_state) {
crm_err("Could not find executor state for node %s", node_name);
return NULL;
}
return do_lrm_query_internal(lrm_state,
node_update_cluster|node_update_peer);
}
/*!
* \internal
* \brief Map standard Pacemaker return code to operation status and OCF code
*
* \param[out] event Executor event whose status and return code should be set
* \param[in] rc Standard Pacemaker return code
*/
void
controld_rc2event(lrmd_event_data_t *event, int rc)
{
/* This is called for cleanup requests from controller peers/clients, not
* for resource actions, so no exit reason is needed.
*/
switch (rc) {
case pcmk_rc_ok:
lrmd__set_result(event, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
break;
case EACCES:
lrmd__set_result(event, PCMK_OCF_INSUFFICIENT_PRIV,
PCMK_EXEC_ERROR, NULL);
break;
default:
lrmd__set_result(event, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR,
NULL);
break;
}
}
/*!
* \internal
* \brief Trigger a new transition after CIB status was deleted
*
* If a CIB status delete was not expected (as part of the transition graph),
* trigger a new transition by updating the (arbitrary) "last-lrm-refresh"
* cluster property.
*
* \param[in] from_sys IPC name that requested the delete
* \param[in] rsc_id Resource whose status was deleted (for logging only)
*/
void
controld_trigger_delete_refresh(const char *from_sys, const char *rsc_id)
{
if (!pcmk__str_eq(from_sys, CRM_SYSTEM_TENGINE, pcmk__str_casei)) {
char *now_s = crm_strdup_printf("%lld", (long long) time(NULL));
crm_debug("Triggering a refresh after %s cleaned %s", from_sys, rsc_id);
cib__update_node_attr(logger_out, fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG,
NULL, NULL, NULL, NULL, "last-lrm-refresh", now_s, NULL, NULL);
free(now_s);
}
}
static void
notify_deleted(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_id, int rc)
{
lrmd_event_data_t *op = NULL;
const char *from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM);
const char *from_host = crm_element_value(input->msg, F_CRM_HOST_FROM);
crm_info("Notifying %s on %s that %s was%s deleted",
from_sys, (from_host? from_host : "localhost"), rsc_id,
((rc == pcmk_ok)? "" : " not"));
op = construct_op(lrm_state, input->xml, rsc_id, CRMD_ACTION_DELETE);
controld_rc2event(op, pcmk_legacy2rc(rc));
controld_ack_event_directly(from_host, from_sys, NULL, op, rsc_id);
lrmd_free_event(op);
controld_trigger_delete_refresh(from_sys, rsc_id);
}
static gboolean
lrm_remove_deleted_rsc(gpointer key, gpointer value, gpointer user_data)
{
struct delete_event_s *event = user_data;
struct pending_deletion_op_s *op = value;
if (pcmk__str_eq(event->rsc, op->rsc, pcmk__str_none)) {
notify_deleted(event->lrm_state, op->input, event->rsc, event->rc);
return TRUE;
}
return FALSE;
}
static gboolean
lrm_remove_deleted_op(gpointer key, gpointer value, gpointer user_data)
{
const char *rsc = user_data;
active_op_t *pending = value;
if (pcmk__str_eq(rsc, pending->rsc_id, pcmk__str_none)) {
crm_info("Removing op %s:%d for deleted resource %s",
pending->op_key, pending->call_id, rsc);
return TRUE;
}
return FALSE;
}
static void
delete_rsc_entry(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_id,
GHashTableIter * rsc_gIter, int rc, const char *user_name)
{
struct delete_event_s event;
CRM_CHECK(rsc_id != NULL, return);
if (rc == pcmk_ok) {
char *rsc_id_copy = strdup(rsc_id);
if (rsc_gIter) {
g_hash_table_iter_remove(rsc_gIter);
} else {
g_hash_table_remove(lrm_state->resource_history, rsc_id_copy);
}
controld_delete_resource_history(rsc_id_copy, lrm_state->node_name,
user_name, crmd_cib_smart_opt());
g_hash_table_foreach_remove(lrm_state->pending_ops, lrm_remove_deleted_op, rsc_id_copy);
free(rsc_id_copy);
}
if (input) {
notify_deleted(lrm_state, input, rsc_id, rc);
}
event.rc = rc;
event.rsc = rsc_id;
event.lrm_state = lrm_state;
g_hash_table_foreach_remove(lrm_state->deletion_ops, lrm_remove_deleted_rsc, &event);
}
/*!
* \internal
* \brief Erase an LRM history entry from the CIB, given the operation data
*
* \param[in] lrm_state LRM state of the desired node
* \param[in] op Operation whose history should be deleted
*/
static void
erase_lrm_history_by_op(lrm_state_t *lrm_state, lrmd_event_data_t *op)
{
xmlNode *xml_top = NULL;
CRM_CHECK(op != NULL, return);
xml_top = create_xml_node(NULL, XML_LRM_TAG_RSC_OP);
crm_xml_add_int(xml_top, XML_LRM_ATTR_CALLID, op->call_id);
crm_xml_add(xml_top, XML_ATTR_TRANSITION_KEY, op->user_data);
if (op->interval_ms > 0) {
char *op_id = pcmk__op_key(op->rsc_id, op->op_type, op->interval_ms);
/* Avoid deleting last_failure too (if it was a result of this recurring op failing) */
crm_xml_add(xml_top, XML_ATTR_ID, op_id);
free(op_id);
}
crm_debug("Erasing resource operation history for " PCMK__OP_FMT " (call=%d)",
op->rsc_id, op->op_type, op->interval_ms, op->call_id);
fsa_cib_conn->cmds->remove(fsa_cib_conn, XML_CIB_TAG_STATUS, xml_top,
cib_quorum_override);
crm_log_xml_trace(xml_top, "op:cancel");
free_xml(xml_top);
}
/* Define xpath to find LRM resource history entry by node and resource */
#define XPATH_HISTORY \
"/" XML_TAG_CIB "/" XML_CIB_TAG_STATUS \
"/" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']" \
"/" XML_CIB_TAG_LRM "/" XML_LRM_TAG_RESOURCES \
"/" XML_LRM_TAG_RESOURCE "[@" XML_ATTR_ID "='%s']" \
"/" XML_LRM_TAG_RSC_OP
/* ... and also by operation key */
#define XPATH_HISTORY_ID XPATH_HISTORY \
"[@" XML_ATTR_ID "='%s']"
/* ... and also by operation key and operation call ID */
#define XPATH_HISTORY_CALL XPATH_HISTORY \
"[@" XML_ATTR_ID "='%s' and @" XML_LRM_ATTR_CALLID "='%d']"
/* ... and also by operation key and original operation key */
#define XPATH_HISTORY_ORIG XPATH_HISTORY \
"[@" XML_ATTR_ID "='%s' and @" XML_LRM_ATTR_TASK_KEY "='%s']"
/*!
* \internal
* \brief Erase an LRM history entry from the CIB, given operation identifiers
*
* \param[in] lrm_state LRM state of the node to clear history for
* \param[in] rsc_id Name of resource to clear history for
* \param[in] key Operation key of operation to clear history for
* \param[in] orig_op If specified, delete only if it has this original op
* \param[in] call_id If specified, delete entry only if it has this call ID
*/
static void
erase_lrm_history_by_id(lrm_state_t *lrm_state, const char *rsc_id,
const char *key, const char *orig_op, int call_id)
{
char *op_xpath = NULL;
CRM_CHECK((rsc_id != NULL) && (key != NULL), return);
if (call_id > 0) {
op_xpath = crm_strdup_printf(XPATH_HISTORY_CALL,
lrm_state->node_name, rsc_id, key,
call_id);
} else if (orig_op) {
op_xpath = crm_strdup_printf(XPATH_HISTORY_ORIG,
lrm_state->node_name, rsc_id, key,
orig_op);
} else {
op_xpath = crm_strdup_printf(XPATH_HISTORY_ID,
lrm_state->node_name, rsc_id, key);
}
crm_debug("Erasing resource operation history for %s on %s (call=%d)",
key, rsc_id, call_id);
fsa_cib_conn->cmds->remove(fsa_cib_conn, op_xpath, NULL,
cib_quorum_override | cib_xpath);
free(op_xpath);
}
static inline gboolean
last_failed_matches_op(rsc_history_t *entry, const char *op, guint interval_ms)
{
if (entry == NULL) {
return FALSE;
}
if (op == NULL) {
return TRUE;
}
return (pcmk__str_eq(op, entry->failed->op_type, pcmk__str_casei)
&& (interval_ms == entry->failed->interval_ms));
}
/*!
* \internal
* \brief Clear a resource's last failure
*
* Erase a resource's last failure on a particular node from both the
* LRM resource history in the CIB, and the resource history remembered
* for the LRM state.
*
* \param[in] rsc_id Resource name
* \param[in] node_name Node name
* \param[in] operation If specified, only clear if matching this operation
* \param[in] interval_ms If operation is specified, it has this interval
*/
void
lrm_clear_last_failure(const char *rsc_id, const char *node_name,
const char *operation, guint interval_ms)
{
char *op_key = NULL;
char *orig_op_key = NULL;
lrm_state_t *lrm_state = NULL;
lrm_state = lrm_state_find(node_name);
if (lrm_state == NULL) {
return;
}
/* Erase from CIB */
op_key = pcmk__op_key(rsc_id, "last_failure", 0);
if (operation) {
orig_op_key = pcmk__op_key(rsc_id, operation, interval_ms);
}
erase_lrm_history_by_id(lrm_state, rsc_id, op_key, orig_op_key, 0);
free(op_key);
free(orig_op_key);
/* Remove from memory */
if (lrm_state->resource_history) {
rsc_history_t *entry = g_hash_table_lookup(lrm_state->resource_history,
rsc_id);
if (last_failed_matches_op(entry, operation, interval_ms)) {
lrmd_free_event(entry->failed);
entry->failed = NULL;
}
}
}
/* Returns: gboolean - cancellation is in progress */
static gboolean
cancel_op(lrm_state_t * lrm_state, const char *rsc_id, const char *key, int op, gboolean remove)
{
int rc = pcmk_ok;
char *local_key = NULL;
active_op_t *pending = NULL;
CRM_CHECK(op != 0, return FALSE);
CRM_CHECK(rsc_id != NULL, return FALSE);
if (key == NULL) {
local_key = make_stop_id(rsc_id, op);
key = local_key;
}
pending = g_hash_table_lookup(lrm_state->pending_ops, key);
if (pending) {
if (remove && !pcmk_is_set(pending->flags, active_op_remove)) {
controld_set_active_op_flags(pending, active_op_remove);
crm_debug("Scheduling %s for removal", key);
}
if (pcmk_is_set(pending->flags, active_op_cancelled)) {
crm_debug("Operation %s already cancelled", key);
free(local_key);
return FALSE;
}
controld_set_active_op_flags(pending, active_op_cancelled);
} else {
crm_info("No pending op found for %s", key);
free(local_key);
return FALSE;
}
crm_debug("Cancelling op %d for %s (%s)", op, rsc_id, key);
rc = lrm_state_cancel(lrm_state, pending->rsc_id, pending->op_type,
pending->interval_ms);
if (rc == pcmk_ok) {
crm_debug("Op %d for %s (%s): cancelled", op, rsc_id, key);
free(local_key);
return TRUE;
}
crm_debug("Op %d for %s (%s): Nothing to cancel", op, rsc_id, key);
/* The caller needs to make sure the entry is
* removed from the pending_ops list
*
* Usually by returning TRUE inside the worker function
* supplied to g_hash_table_foreach_remove()
*
* Not removing the entry from pending_ops will block
* the node from shutting down
*/
free(local_key);
return FALSE;
}
struct cancel_data {
gboolean done;
gboolean remove;
const char *key;
lrmd_rsc_info_t *rsc;
lrm_state_t *lrm_state;
};
static gboolean
cancel_action_by_key(gpointer key, gpointer value, gpointer user_data)
{
gboolean remove = FALSE;
struct cancel_data *data = user_data;
active_op_t *op = value;
if (pcmk__str_eq(op->op_key, data->key, pcmk__str_none)) {
data->done = TRUE;
remove = !cancel_op(data->lrm_state, data->rsc->id, key, op->call_id, data->remove);
}
return remove;
}
static gboolean
cancel_op_key(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *key, gboolean remove)
{
guint removed = 0;
struct cancel_data data;
CRM_CHECK(rsc != NULL, return FALSE);
CRM_CHECK(key != NULL, return FALSE);
data.key = key;
data.rsc = rsc;
data.done = FALSE;
data.remove = remove;
data.lrm_state = lrm_state;
removed = g_hash_table_foreach_remove(lrm_state->pending_ops, cancel_action_by_key, &data);
crm_trace("Removed %u op cache entries, new size: %u",
removed, g_hash_table_size(lrm_state->pending_ops));
return data.done;
}
/*!
* \internal
* \brief Retrieve resource information from LRM
*
* \param[in] lrm_state LRM connection to use
* \param[in] rsc_xml XML containing resource configuration
* \param[in] do_create If true, register resource with LRM if not already
* \param[out] rsc_info Where to store resource information obtained from LRM
*
* \retval pcmk_ok Success (and rsc_info holds newly allocated result)
* \retval -EINVAL Required information is missing from arguments
* \retval -ENOTCONN No active connection to LRM
* \retval -ENODEV Resource not found
* \retval -errno Error communicating with executor when registering resource
*
* \note Caller is responsible for freeing result on success.
*/
static int
get_lrm_resource(lrm_state_t *lrm_state, xmlNode *rsc_xml, gboolean do_create,
lrmd_rsc_info_t **rsc_info)
{
const char *id = ID(rsc_xml);
CRM_CHECK(lrm_state && rsc_xml && rsc_info, return -EINVAL);
CRM_CHECK(id, return -EINVAL);
if (lrm_state_is_connected(lrm_state) == FALSE) {
return -ENOTCONN;
}
crm_trace("Retrieving resource information for %s from the executor", id);
*rsc_info = lrm_state_get_rsc_info(lrm_state, id, 0);
// If resource isn't known by ID, try clone name, if provided
if (!*rsc_info) {
const char *long_id = crm_element_value(rsc_xml, XML_ATTR_ID_LONG);
if (long_id) {
*rsc_info = lrm_state_get_rsc_info(lrm_state, long_id, 0);
}
}
if ((*rsc_info == NULL) && do_create) {
const char *class = crm_element_value(rsc_xml, XML_AGENT_ATTR_CLASS);
const char *provider = crm_element_value(rsc_xml, XML_AGENT_ATTR_PROVIDER);
const char *type = crm_element_value(rsc_xml, XML_ATTR_TYPE);
int rc;
crm_trace("Registering resource %s with the executor", id);
rc = lrm_state_register_rsc(lrm_state, id, class, provider, type,
lrmd_opt_drop_recurring);
if (rc != pcmk_ok) {
fsa_data_t *msg_data = NULL;
crm_err("Could not register resource %s with the executor on %s: %s "
CRM_XS " rc=%d",
id, lrm_state->node_name, pcmk_strerror(rc), rc);
/* Register this as an internal error if this involves the local
* executor. Otherwise, we're likely dealing with an unresponsive
* remote node, which is not an FSA failure.
*/
if (lrm_state_is_local(lrm_state) == TRUE) {
register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
}
return rc;
}
*rsc_info = lrm_state_get_rsc_info(lrm_state, id, 0);
}
return *rsc_info? pcmk_ok : -ENODEV;
}
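/*!
 * \internal
 * \brief Unregister a resource from the executor and delete its history entry
 *
 * \param[in] lrm_state   Executor state that the resource is registered with
 * \param[in] id          ID of resource to delete
 * \param[in] rsc         Executor information about the resource (if any)
 * \param[in] gIter       Resource history iterator (if deleting during iteration)
 * \param[in] sys         IPC name of requesting subsystem (for logging)
 * \param[in] user        Name of requesting user (if any)
 * \param[in] request     Original request (if any), saved if deletion is pending
 * \param[in] unregister  If TRUE, also unregister the resource from the executor
 */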
static void
delete_resource(lrm_state_t * lrm_state,
const char *id,
lrmd_rsc_info_t * rsc,
GHashTableIter * gIter,
const char *sys,
const char *user,
ha_msg_input_t * request,
gboolean unregister)
{
int rc = pcmk_ok;
crm_info("Removing resource %s from executor for %s%s%s",
id, sys, (user? " as " : ""), (user? user : ""));
if (rsc && unregister) {
rc = lrm_state_unregister_rsc(lrm_state, id, 0);
}
if (rc == pcmk_ok) {
crm_trace("Resource %s deleted from executor", id);
} else if (rc == -EINPROGRESS) {
crm_info("Deletion of resource '%s' from executor is pending", id);
if (request) {
struct pending_deletion_op_s *op = NULL;
char *ref = crm_element_value_copy(request->msg, XML_ATTR_REFERENCE);
op = calloc(1, sizeof(struct pending_deletion_op_s));
op->rsc = strdup(rsc->id);
op->input = copy_ha_msg_input(request);
g_hash_table_insert(lrm_state->deletion_ops, ref, op);
}
return;
} else {
crm_warn("Could not delete '%s' from executor for %s%s%s: %s "
CRM_XS " rc=%d", id, sys, (user? " as " : ""),
(user? user : ""), pcmk_strerror(rc), rc);
}
delete_rsc_entry(lrm_state, request, id, gIter, rc, user);
}
static int
get_fake_call_id(lrm_state_t *lrm_state, const char *rsc_id)
{
int call_id = 999999999;
rsc_history_t *entry = NULL;
if(lrm_state) {
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
}
    /* Make sure the call ID is greater than that of the last successful
     * operation; otherwise the failure would not allow recovery of the
     * resource, because it could appear to have occurred before the
     * successful start. */
if (entry) {
call_id = entry->last_callid + 1;
}
if (call_id < 0) {
call_id = 1;
}
return call_id;
}
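/*!
 * \internal
 * \brief Fill in an executor event with a synthesized result
 *
 * \param[in]     lrm_state    Executor state (used to choose a credible call ID)
 * \param[in,out] op           Executor event to update
 * \param[in]     op_status    Execution status to set
 * \param[in]     op_exitcode  Exit code to set
 * \param[in]     exit_reason  Human-friendly exit reason to set (if any)
 */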
static void
fake_op_status(lrm_state_t *lrm_state, lrmd_event_data_t *op, int op_status,
enum ocf_exitcode op_exitcode, const char *exit_reason)
{
op->call_id = get_fake_call_id(lrm_state, op->rsc_id);
op->t_run = time(NULL);
op->t_rcchange = op->t_run;
lrmd__set_result(op, op_exitcode, op_status, exit_reason);
}
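/*!
 * \internal
 * \brief Clear a node's entire resource history so everything gets reprobed
 *
 * \param[in] lrm_state       Executor state for the node to reprobe
 * \param[in] from_sys        IPC name of requesting subsystem
 * \param[in] from_host       Name of requesting host
 * \param[in] user_name       Name of requesting user (if any)
 * \param[in] is_remote_node  If TRUE, node is a Pacemaker Remote node
 */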
static void
force_reprobe(lrm_state_t *lrm_state, const char *from_sys,
const char *from_host, const char *user_name,
gboolean is_remote_node)
{
GHashTableIter gIter;
rsc_history_t *entry = NULL;
crm_info("Clearing resource history on node %s", lrm_state->node_name);
g_hash_table_iter_init(&gIter, lrm_state->resource_history);
while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
        /* Only unregister the resource during a reprobe if it is not a remote
         * connection resource; otherwise, unregistering the connection would
         * terminate remote node membership.
         */
gboolean unregister = TRUE;
if (is_remote_lrmd_ra(NULL, NULL, entry->id)) {
lrm_state_t *remote_lrm_state = lrm_state_find(entry->id);
if (remote_lrm_state) {
/* when forcing a reprobe, make sure to clear remote node before
* clearing the remote node's connection resource */
force_reprobe(remote_lrm_state, from_sys, from_host, user_name, TRUE);
}
unregister = FALSE;
}
delete_resource(lrm_state, entry->id, &entry->rsc, &gIter, from_sys,
user_name, NULL, unregister);
}
/* Now delete the copy in the CIB */
controld_delete_node_state(lrm_state->node_name, controld_section_lrm,
cib_scope_local);
// @COMPAT DCs < 1.1.14 need this deleted (in case it was explicitly false)
update_attrd(lrm_state->node_name, CRM_OP_PROBED, NULL, user_name, is_remote_node);
}
/*!
* \internal
* \brief Fail a requested action without actually executing it
*
* For an action that can't be executed, process it similarly to an actual
* execution result, with specified error status (except for notify actions,
* which will always be treated as successful).
*
* \param[in] lrm_state Executor connection that action is for
* \param[in] action Action XML from request
* \param[in] rc Desired return code to use
* \param[in] op_status Desired operation status to use
* \param[in] exit_reason Human-friendly detail, if error
*/
static void
synthesize_lrmd_failure(lrm_state_t *lrm_state, xmlNode *action,
int op_status, enum ocf_exitcode rc,
const char *exit_reason)
{
lrmd_event_data_t *op = NULL;
const char *operation = crm_element_value(action, XML_LRM_ATTR_TASK);
const char *target_node = crm_element_value(action, XML_LRM_ATTR_TARGET);
xmlNode *xml_rsc = find_xml_node(action, XML_CIB_TAG_RESOURCE, TRUE);
if ((xml_rsc == NULL) || (ID(xml_rsc) == NULL)) {
/* @TODO Should we do something else, like direct ack? */
crm_info("Can't fake %s failure (%d) on %s without resource configuration",
crm_element_value(action, XML_LRM_ATTR_TASK_KEY), rc,
target_node);
return;
} else if(operation == NULL) {
/* This probably came from crm_resource -C, nothing to do */
crm_info("Can't fake %s failure (%d) on %s without operation",
ID(xml_rsc), rc, target_node);
return;
}
op = construct_op(lrm_state, action, ID(xml_rsc), operation);
if (pcmk__str_eq(operation, RSC_NOTIFY, pcmk__str_casei)) { // Notifications can't fail
fake_op_status(lrm_state, op, PCMK_EXEC_DONE, PCMK_OCF_OK, NULL);
} else {
fake_op_status(lrm_state, op, op_status, rc, exit_reason);
}
crm_info("Faking " PCMK__OP_FMT " result (%d) on %s",
op->rsc_id, op->op_type, op->interval_ms, op->rc, target_node);
// Process the result as if it came from the LRM
process_lrm_event(lrm_state, op, NULL, action);
lrmd_free_event(op);
}
/*!
* \internal
* \brief Get target of an LRM operation
*
* \param[in] xml LRM operation data XML
*
* \return LRM operation target node name (local node or Pacemaker Remote node)
*/
static const char *
lrm_op_target(xmlNode *xml)
{
const char *target = NULL;
if (xml) {
target = crm_element_value(xml, XML_LRM_ATTR_TARGET);
}
if (target == NULL) {
target = fsa_our_uname;
}
return target;
}
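/*!
 * \internal
 * \brief Fail a resource on request, by synthesizing a failed monitor result
 *
 * \param[in] xml        Request XML containing the resource to fail
 * \param[in] lrm_state  Executor state for the resource
 * \param[in] user_name  Name of requesting user (unprivileged users are refused)
 * \param[in] from_host  Name of requesting host (for direct acknowledgement)
 * \param[in] from_sys   IPC name of requesting subsystem (for direct acknowledgement)
 */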
static void
fail_lrm_resource(xmlNode *xml, lrm_state_t *lrm_state, const char *user_name,
const char *from_host, const char *from_sys)
{
lrmd_event_data_t *op = NULL;
lrmd_rsc_info_t *rsc = NULL;
xmlNode *xml_rsc = find_xml_node(xml, XML_CIB_TAG_RESOURCE, TRUE);
CRM_CHECK(xml_rsc != NULL, return);
/* The executor simply executes operations and reports the results, without
* any concept of success or failure, so to fail a resource, we must fake
* what a failure looks like.
*
* To do this, we create a fake executor operation event for the resource,
* and pass that event to the executor client callback so it will be
* processed as if it came from the executor.
*/
op = construct_op(lrm_state, xml, ID(xml_rsc), "asyncmon");
free((char*) op->user_data);
op->user_data = NULL;
op->interval_ms = 0;
if (user_name && !pcmk__is_privileged(user_name)) {
crm_err("%s does not have permission to fail %s", user_name, ID(xml_rsc));
fake_op_status(lrm_state, op, PCMK_EXEC_ERROR,
PCMK_OCF_INSUFFICIENT_PRIV,
"Unprivileged user cannot fail resources");
controld_ack_event_directly(from_host, from_sys, NULL, op, ID(xml_rsc));
lrmd_free_event(op);
return;
}
if (get_lrm_resource(lrm_state, xml_rsc, TRUE, &rsc) == pcmk_ok) {
crm_info("Failing resource %s...", rsc->id);
fake_op_status(lrm_state, op, PCMK_EXEC_DONE, PCMK_OCF_UNKNOWN_ERROR,
"Simulated failure");
process_lrm_event(lrm_state, op, NULL, xml);
op->rc = PCMK_OCF_OK; // The request to fail the resource succeeded
lrmd_free_rsc_info(rsc);
} else {
crm_info("Cannot find/create resource in order to fail it...");
crm_log_xml_warn(xml, "bad input");
fake_op_status(lrm_state, op, PCMK_EXEC_ERROR, PCMK_OCF_UNKNOWN_ERROR,
"Cannot fail unknown resource");
}
controld_ack_event_directly(from_host, from_sys, NULL, op, ID(xml_rsc));
lrmd_free_event(op);
}
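/*!
 * \internal
 * \brief Handle a request to refresh the resource history in the CIB
 *
 * \param[in] lrm_state  Executor state whose history should be written out
 * \param[in] user_name  Name of requesting user
 * \param[in] from_host  Name of requesting host (for acknowledgement)
 * \param[in] from_sys   IPC name of requesting subsystem (for acknowledgement)
 */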
static void
handle_refresh_op(lrm_state_t *lrm_state, const char *user_name,
const char *from_host, const char *from_sys)
{
int rc = pcmk_ok;
xmlNode *fragment = do_lrm_query_internal(lrm_state, node_update_all);
fsa_cib_update(XML_CIB_TAG_STATUS, fragment, cib_quorum_override, rc, user_name);
crm_info("Forced a local resource history refresh: call=%d", rc);
if (!pcmk__str_eq(CRM_SYSTEM_CRMD, from_sys, pcmk__str_casei)) {
xmlNode *reply = create_request(CRM_OP_INVOKE_LRM, fragment, from_host,
from_sys, CRM_SYSTEM_LRMD,
fsa_our_uuid);
crm_debug("ACK'ing refresh from %s (%s)", from_sys, from_host);
if (relay_message(reply, TRUE) == FALSE) {
crm_log_xml_err(reply, "Unable to route reply");
}
free_xml(reply);
}
free_xml(fragment);
}
static void
handle_query_op(xmlNode *msg, lrm_state_t *lrm_state)
{
xmlNode *data = do_lrm_query_internal(lrm_state, node_update_all);
xmlNode *reply = create_reply(msg, data);
if (relay_message(reply, TRUE) == FALSE) {
crm_err("Unable to route reply");
crm_log_xml_err(reply, "reply");
}
free_xml(reply);
free_xml(data);
}
static void
handle_reprobe_op(lrm_state_t *lrm_state, const char *from_sys,
const char *from_host, const char *user_name,
gboolean is_remote_node)
{
crm_notice("Forcing the status of all resources to be redetected");
force_reprobe(lrm_state, from_sys, from_host, user_name, is_remote_node);
if (!pcmk__strcase_any_of(from_sys, CRM_SYSTEM_PENGINE, CRM_SYSTEM_TENGINE, NULL)) {
xmlNode *reply = create_request(CRM_OP_INVOKE_LRM, NULL, from_host,
from_sys, CRM_SYSTEM_LRMD,
fsa_our_uuid);
crm_debug("ACK'ing re-probe from %s (%s)", from_sys, from_host);
if (relay_message(reply, TRUE) == FALSE) {
crm_log_xml_err(reply, "Unable to route reply");
}
free_xml(reply);
}
}
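/*!
 * \internal
 * \brief Handle a request to cancel a recurring resource operation
 *
 * \param[in] input      Request to process
 * \param[in] lrm_state  Executor state for the resource
 * \param[in] rsc        Executor information about the resource
 * \param[in] from_host  Name of requesting host
 * \param[in] from_sys   IPC name of requesting subsystem
 *
 * \return TRUE if the request could be processed, otherwise FALSE
 */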
static bool do_lrm_cancel(ha_msg_input_t *input, lrm_state_t *lrm_state,
lrmd_rsc_info_t *rsc, const char *from_host, const char *from_sys)
{
char *op_key = NULL;
char *meta_key = NULL;
int call = 0;
const char *call_id = NULL;
const char *op_task = NULL;
guint interval_ms = 0;
gboolean in_progress = FALSE;
xmlNode *params = find_xml_node(input->xml, XML_TAG_ATTRS, TRUE);
CRM_CHECK(params != NULL, return FALSE);
meta_key = crm_meta_name(XML_LRM_ATTR_TASK);
op_task = crm_element_value(params, meta_key);
free(meta_key);
CRM_CHECK(op_task != NULL, return FALSE);
meta_key = crm_meta_name(XML_LRM_ATTR_INTERVAL_MS);
if (crm_element_value_ms(params, meta_key, &interval_ms) != pcmk_ok) {
free(meta_key);
return FALSE;
}
free(meta_key);
op_key = pcmk__op_key(rsc->id, op_task, interval_ms);
meta_key = crm_meta_name(XML_LRM_ATTR_CALLID);
call_id = crm_element_value(params, meta_key);
free(meta_key);
crm_debug("Scheduler requested op %s (call=%s) be cancelled",
op_key, (call_id? call_id : "NA"));
pcmk__scan_min_int(call_id, &call, 0);
if (call == 0) {
// Normal case when the scheduler cancels a recurring op
in_progress = cancel_op_key(lrm_state, rsc, op_key, TRUE);
} else {
// Normal case when the scheduler cancels an orphan op
in_progress = cancel_op(lrm_state, rsc->id, NULL, call, TRUE);
}
// Acknowledge cancellation operation if for a remote connection resource
if (!in_progress || is_remote_lrmd_ra(NULL, NULL, rsc->id)) {
char *op_id = make_stop_id(rsc->id, call);
if (is_remote_lrmd_ra(NULL, NULL, rsc->id) == FALSE) {
crm_info("Nothing known about operation %d for %s", call, op_key);
}
erase_lrm_history_by_id(lrm_state, rsc->id, op_key, NULL, call);
send_task_ok_ack(lrm_state, input, rsc->id, rsc, op_task,
from_host, from_sys);
/* needed at least for cancellation of a remote operation */
g_hash_table_remove(lrm_state->pending_ops, op_id);
free(op_id);
} else {
/* No ack is needed since abcdaa8, but peers with older versions
* in a rolling upgrade need one. We didn't bump the feature set
* at that commit, so we can only compare against the previous
* CRM version (3.0.8). If any peers have feature set 3.0.9 but
* not abcdaa8, they will time out waiting for the ack (no
* released versions of Pacemaker are affected).
*/
const char *peer_version = crm_element_value(params, XML_ATTR_CRM_VERSION);
if (compare_version(peer_version, "3.0.8") <= 0) {
crm_info("Sending compatibility ack for %s cancellation to %s (CRM version %s)",
op_key, from_host, peer_version);
send_task_ok_ack(lrm_state, input, rsc->id, rsc, op_task,
from_host, from_sys);
}
}
free(op_key);
return TRUE;
}
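/*!
 * \internal
 * \brief Handle a request to delete a resource from the executor and the CIB
 *
 * \param[in] input           Request to process
 * \param[in] lrm_state       Executor state for the resource
 * \param[in] rsc             Executor information about the resource
 * \param[in] from_sys        IPC name of requesting subsystem
 * \param[in] from_host       Name of requesting host
 * \param[in] crm_rsc_delete  TRUE if request came from a client such as crm_resource
 * \param[in] user_name       Name of requesting user
 */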
static void
do_lrm_delete(ha_msg_input_t *input, lrm_state_t *lrm_state,
lrmd_rsc_info_t *rsc, const char *from_sys, const char *from_host,
bool crm_rsc_delete, const char *user_name)
{
gboolean unregister = TRUE;
int cib_rc = controld_delete_resource_history(rsc->id, lrm_state->node_name,
user_name,
cib_dryrun|cib_sync_call);
if (cib_rc != pcmk_rc_ok) {
lrmd_event_data_t *op = NULL;
op = construct_op(lrm_state, input->xml, rsc->id, CRMD_ACTION_DELETE);
/* These are resource clean-ups, not actions, so no exit reason is
* needed.
*/
lrmd__set_result(op, pcmk_rc2ocf(cib_rc), PCMK_EXEC_ERROR, NULL);
controld_ack_event_directly(from_host, from_sys, NULL, op, rsc->id);
lrmd_free_event(op);
return;
}
if (crm_rsc_delete && is_remote_lrmd_ra(NULL, NULL, rsc->id)) {
unregister = FALSE;
}
delete_resource(lrm_state, rsc->id, rsc, NULL, from_sys,
user_name, input, unregister);
}
/* A_LRM_INVOKE */
void
do_lrm_invoke(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
lrm_state_t *lrm_state = NULL;
const char *crm_op = NULL;
const char *from_sys = NULL;
const char *from_host = NULL;
const char *operation = NULL;
ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
const char *user_name = NULL;
const char *target_node = NULL;
gboolean is_remote_node = FALSE;
bool crm_rsc_delete = FALSE;
target_node = lrm_op_target(input->xml);
is_remote_node = !pcmk__str_eq(target_node, fsa_our_uname,
pcmk__str_casei);
lrm_state = lrm_state_find(target_node);
if ((lrm_state == NULL) && is_remote_node) {
crm_err("Failing action because local node has never had connection to remote node %s",
target_node);
synthesize_lrmd_failure(NULL, input->xml, PCMK_EXEC_NOT_CONNECTED,
PCMK_OCF_UNKNOWN_ERROR,
"Local node has no connection to remote");
return;
}
CRM_ASSERT(lrm_state != NULL);
user_name = pcmk__update_acl_user(input->msg, F_CRM_USER, NULL);
crm_op = crm_element_value(input->msg, F_CRM_TASK);
from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM);
if (!pcmk__str_eq(from_sys, CRM_SYSTEM_TENGINE, pcmk__str_none)) {
from_host = crm_element_value(input->msg, F_CRM_HOST_FROM);
}
if (pcmk__str_eq(crm_op, CRM_OP_LRM_DELETE, pcmk__str_none)) {
if (!pcmk__str_eq(from_sys, CRM_SYSTEM_TENGINE, pcmk__str_none)) {
crm_rsc_delete = TRUE; // from crm_resource
}
operation = CRMD_ACTION_DELETE;
} else if (input->xml != NULL) {
operation = crm_element_value(input->xml, XML_LRM_ATTR_TASK);
}
CRM_CHECK(!pcmk__str_empty(crm_op) || !pcmk__str_empty(operation), return);
crm_trace("'%s' execution request from %s as %s user",
pcmk__s(crm_op, operation),
pcmk__s(from_sys, "unknown subsystem"),
pcmk__s(user_name, "current"));
if (pcmk__str_eq(crm_op, CRM_OP_LRM_FAIL, pcmk__str_none)) {
fail_lrm_resource(input->xml, lrm_state, user_name, from_host,
from_sys);
} else if (pcmk__str_eq(crm_op, CRM_OP_LRM_REFRESH, pcmk__str_none)) {
handle_refresh_op(lrm_state, user_name, from_host, from_sys);
} else if (pcmk__str_eq(crm_op, CRM_OP_LRM_QUERY, pcmk__str_none)) {
handle_query_op(input->msg, lrm_state);
// @COMPAT DCs <1.1.14 in a rolling upgrade might schedule this op
} else if (pcmk__str_eq(operation, CRM_OP_PROBED, pcmk__str_none)) {
update_attrd(lrm_state->node_name, CRM_OP_PROBED, XML_BOOLEAN_TRUE,
user_name, is_remote_node);
} else if (pcmk__str_eq(crm_op, CRM_OP_REPROBE, pcmk__str_none)
|| pcmk__str_eq(operation, CRM_OP_REPROBE, pcmk__str_none)) {
handle_reprobe_op(lrm_state, from_sys, from_host, user_name,
is_remote_node);
} else if (operation != NULL) {
lrmd_rsc_info_t *rsc = NULL;
xmlNode *xml_rsc = find_xml_node(input->xml, XML_CIB_TAG_RESOURCE, TRUE);
gboolean create_rsc = !pcmk__str_eq(operation, CRMD_ACTION_DELETE,
pcmk__str_none);
int rc;
// We can't return anything meaningful without a resource ID
CRM_CHECK(xml_rsc && ID(xml_rsc), return);
rc = get_lrm_resource(lrm_state, xml_rsc, create_rsc, &rsc);
if (rc == -ENOTCONN) {
synthesize_lrmd_failure(lrm_state, input->xml,
PCMK_EXEC_NOT_CONNECTED,
PCMK_OCF_UNKNOWN_ERROR,
"Not connected to remote executor");
return;
} else if ((rc < 0) && !create_rsc) {
/* Delete of malformed or nonexistent resource
* (deleting something that does not exist is a success)
*/
crm_notice("Not registering resource '%s' for a %s event "
CRM_XS " get-rc=%d (%s) transition-key=%s",
ID(xml_rsc), operation,
rc, pcmk_strerror(rc), ID(input->xml));
delete_rsc_entry(lrm_state, input, ID(xml_rsc), NULL, pcmk_ok,
user_name);
return;
} else if (rc == -EINVAL) {
// Resource operation on malformed resource
crm_err("Invalid resource definition for %s", ID(xml_rsc));
crm_log_xml_warn(input->msg, "invalid resource");
synthesize_lrmd_failure(lrm_state, input->xml, PCMK_EXEC_ERROR,
PCMK_OCF_NOT_CONFIGURED, // fatal error
"Invalid resource definition");
return;
} else if (rc < 0) {
// Error communicating with the executor
crm_err("Could not register resource '%s' with executor: %s "
CRM_XS " rc=%d",
ID(xml_rsc), pcmk_strerror(rc), rc);
crm_log_xml_warn(input->msg, "failed registration");
synthesize_lrmd_failure(lrm_state, input->xml, PCMK_EXEC_ERROR,
PCMK_OCF_INVALID_PARAM, // hard error
"Could not register resource with executor");
return;
}
if (pcmk__str_eq(operation, CRMD_ACTION_CANCEL, pcmk__str_none)) {
if (!do_lrm_cancel(input, lrm_state, rsc, from_host, from_sys)) {
crm_log_xml_warn(input->xml, "Bad command");
}
} else if (pcmk__str_eq(operation, CRMD_ACTION_DELETE, pcmk__str_none)) {
do_lrm_delete(input, lrm_state, rsc, from_sys, from_host,
crm_rsc_delete, user_name);
} else if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD,
CRMD_ACTION_RELOAD_AGENT, NULL)) {
/* Pre-2.1.0 DCs will schedule reload actions only, and 2.1.0+ DCs
* will schedule reload-agent actions only. In either case, we need
* to map that to whatever the resource agent actually supports.
* Default to the OCF 1.1 name.
*/
struct ra_metadata_s *md = NULL;
const char *reload_name = CRMD_ACTION_RELOAD_AGENT;
md = controld_get_rsc_metadata(lrm_state, rsc,
controld_metadata_from_cache);
if ((md != NULL)
&& pcmk_is_set(md->ra_flags, ra_supports_legacy_reload)) {
reload_name = CRMD_ACTION_RELOAD;
}
do_lrm_rsc_op(lrm_state, rsc, reload_name, input->xml);
} else {
do_lrm_rsc_op(lrm_state, rsc, operation, input->xml);
}
lrmd_free_rsc_info(rsc);
} else {
crm_err("Invalid execution request: unknown command '%s' (bug?)",
crm_op);
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
}
#if ENABLE_VERSIONED_ATTRS
static void
resolve_versioned_parameters(lrm_state_t *lrm_state, lrmd_event_data_t *op,
                             const char *rsc_id, const xmlNode *rsc_op,
                             GHashTable *params)
{
/* Resource info *should* already be cached, so we don't get
* executor call */
lrmd_rsc_info_t *rsc = lrm_state_get_rsc_info(lrm_state, rsc_id, 0);
struct ra_metadata_s *metadata;
metadata = controld_get_rsc_metadata(lrm_state, rsc,
controld_metadata_from_cache);
if (metadata) {
xmlNode *versioned_attrs = NULL;
GHashTable *hash = NULL;
char *key = NULL;
char *value = NULL;
GHashTableIter iter;
versioned_attrs = first_named_child(rsc_op, XML_TAG_OP_VER_ATTRS);
hash = pe_unpack_versioned_parameters(versioned_attrs, metadata->ra_version);
g_hash_table_iter_init(&iter, hash);
while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
g_hash_table_iter_steal(&iter);
g_hash_table_replace(params, key, value);
}
g_hash_table_destroy(hash);
versioned_attrs = first_named_child(rsc_op, XML_TAG_OP_VER_META);
hash = pe_unpack_versioned_parameters(versioned_attrs, metadata->ra_version);
g_hash_table_iter_init(&iter, hash);
while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
g_hash_table_replace(params, crm_meta_name(key), strdup(value));
if (pcmk__str_eq(key, XML_ATTR_TIMEOUT, pcmk__str_casei)) {
pcmk__scan_min_int(value, &op->timeout, 0);
} else if (pcmk__str_eq(key, XML_OP_ATTR_START_DELAY, pcmk__str_casei)) {
pcmk__scan_min_int(value, &op->start_delay, 0);
}
}
g_hash_table_destroy(hash);
versioned_attrs = first_named_child(rsc_op, XML_TAG_RSC_VER_ATTRS);
hash = pe_unpack_versioned_parameters(versioned_attrs, metadata->ra_version);
g_hash_table_iter_init(&iter, hash);
while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
g_hash_table_iter_steal(&iter);
g_hash_table_replace(params, key, value);
}
g_hash_table_destroy(hash);
}
lrmd_free_rsc_info(rsc);
}
#endif
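/*!
 * \internal
 * \brief Create a new executor event for a requested resource action
 *
 * \param[in] lrm_state  Executor state (for resource history lookups)
 * \param[in] rsc_op     XML of requested action (NULL only for the stop-all case)
 * \param[in] rsc_id     ID of resource the action is for
 * \param[in] operation  Name of requested action
 *
 * \return Newly created executor event
 * \note The caller is responsible for freeing the result with lrmd_free_event().
 */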
static lrmd_event_data_t *
construct_op(lrm_state_t *lrm_state, xmlNode *rsc_op, const char *rsc_id,
const char *operation)
{
lrmd_event_data_t *op = NULL;
const char *op_delay = NULL;
const char *op_timeout = NULL;
GHashTable *params = NULL;
xmlNode *primitive = NULL;
const char *class = NULL;
const char *transition = NULL;
CRM_ASSERT(rsc_id && operation);
op = lrmd_new_event(rsc_id, operation, 0);
op->type = lrmd_event_exec_complete;
op->timeout = 0;
op->start_delay = 0;
lrmd__set_result(op, PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, NULL);
if (rsc_op == NULL) {
CRM_LOG_ASSERT(pcmk__str_eq(CRMD_ACTION_STOP, operation, pcmk__str_casei));
op->user_data = NULL;
        /* This is the stop_all_resources() case: by definition there is no DC
         * (or it would be shutting us down), so use our own CRM feature set
         * version here.
         */
op->params = pcmk__strkey_table(free, free);
g_hash_table_insert(op->params, strdup(XML_ATTR_CRM_VERSION), strdup(CRM_FEATURE_SET));
crm_trace("Constructed %s op for %s", operation, rsc_id);
return op;
}
params = xml2list(rsc_op);
g_hash_table_remove(params, CRM_META "_op_target_rc");
op_delay = crm_meta_value(params, XML_OP_ATTR_START_DELAY);
pcmk__scan_min_int(op_delay, &op->start_delay, 0);
op_timeout = crm_meta_value(params, XML_ATTR_TIMEOUT);
pcmk__scan_min_int(op_timeout, &op->timeout, 0);
if (pcmk__guint_from_hash(params, CRM_META "_" XML_LRM_ATTR_INTERVAL_MS, 0,
&(op->interval_ms)) != pcmk_rc_ok) {
op->interval_ms = 0;
}
/* Use pcmk_monitor_timeout instead of meta timeout for stonith
recurring monitor, if set */
primitive = find_xml_node(rsc_op, XML_CIB_TAG_RESOURCE, FALSE);
class = crm_element_value(primitive, XML_AGENT_ATTR_CLASS);
if (pcmk_is_set(pcmk_get_ra_caps(class), pcmk_ra_cap_fence_params)
&& pcmk__str_eq(operation, CRMD_ACTION_STATUS, pcmk__str_casei)
&& (op->interval_ms > 0)) {
op_timeout = g_hash_table_lookup(params, "pcmk_monitor_timeout");
if (op_timeout != NULL) {
op->timeout = crm_get_msec(op_timeout);
}
}
#if ENABLE_VERSIONED_ATTRS
    if (lrm_state && !is_remote_lrmd_ra(NULL, NULL, rsc_id)
        && !pcmk__strcase_any_of(operation, CRMD_ACTION_METADATA,
                                 CRMD_ACTION_DELETE, NULL)) {
        resolve_versioned_parameters(lrm_state, op, rsc_id, rsc_op, params);
}
#endif
if (!pcmk__str_eq(operation, RSC_STOP, pcmk__str_casei)) {
op->params = params;
} else {
rsc_history_t *entry = NULL;
if (lrm_state) {
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
}
/* If we do not have stop parameters cached, use
* whatever we are given */
if (!entry || !entry->stop_params) {
op->params = params;
} else {
/* Copy the cached parameter list so that we stop the resource
* with the old attributes, not the new ones */
op->params = pcmk__strkey_table(free, free);
g_hash_table_foreach(params, copy_meta_keys, op->params);
g_hash_table_foreach(entry->stop_params, copy_instance_keys, op->params);
g_hash_table_destroy(params);
params = NULL;
}
}
    /* Sanity-check the timeout and start delay */
if (op->timeout <= 0) {
op->timeout = op->interval_ms;
}
if (op->start_delay < 0) {
op->start_delay = 0;
}
transition = crm_element_value(rsc_op, XML_ATTR_TRANSITION_KEY);
CRM_CHECK(transition != NULL, return op);
op->user_data = strdup(transition);
if (op->interval_ms != 0) {
if (pcmk__strcase_any_of(operation, CRMD_ACTION_START, CRMD_ACTION_STOP, NULL)) {
crm_err("Start and Stop actions cannot have an interval: %u",
op->interval_ms);
op->interval_ms = 0;
}
}
crm_trace("Constructed %s op for %s: interval=%u",
operation, rsc_id, op->interval_ms);
return op;
}
/*!
* \internal
* \brief Send a (synthesized) event result
*
* Reply with a synthesized event result directly, as opposed to going through
* the executor.
*
* \param[in] to_host Host to send result to
* \param[in] to_sys IPC name to send result to (NULL for transition engine)
* \param[in] rsc Type information about resource the result is for
* \param[in] op Event with result to send
* \param[in] rsc_id ID of resource the result is for
*/
void
controld_ack_event_directly(const char *to_host, const char *to_sys,
lrmd_rsc_info_t *rsc, lrmd_event_data_t *op,
const char *rsc_id)
{
xmlNode *reply = NULL;
xmlNode *update, *iter;
crm_node_t *peer = NULL;
CRM_CHECK(op != NULL, return);
if (op->rsc_id == NULL) {
CRM_ASSERT(rsc_id != NULL);
op->rsc_id = strdup(rsc_id);
}
if (to_sys == NULL) {
to_sys = CRM_SYSTEM_TENGINE;
}
peer = crm_get_peer(0, fsa_our_uname);
update = create_node_state_update(peer, node_update_none, NULL,
__func__);
iter = create_xml_node(update, XML_CIB_TAG_LRM);
crm_xml_add(iter, XML_ATTR_ID, fsa_our_uuid);
iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES);
iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE);
crm_xml_add(iter, XML_ATTR_ID, op->rsc_id);
build_operation_update(iter, rsc, op, fsa_our_uname, __func__);
reply = create_request(CRM_OP_INVOKE_LRM, update, to_host, to_sys, CRM_SYSTEM_LRMD, NULL);
crm_log_xml_trace(update, "[direct ACK]");
crm_debug("ACK'ing resource op " PCMK__OP_FMT " from %s: %s",
op->rsc_id, op->op_type, op->interval_ms, op->user_data,
crm_element_value(reply, XML_ATTR_REFERENCE));
if (relay_message(reply, TRUE) == FALSE) {
crm_log_xml_err(reply, "Unable to route reply");
}
free_xml(update);
free_xml(reply);
}
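/*!
 * \internal
 * \brief Check whether all resources have been stopped on all executor connections
 *
 * \param[in] cur_state  Current controller FSA state
 * \param[in] log_level  Log level to use when reporting still-active resources
 *
 * \return TRUE if every executor connection verified as stopped, otherwise FALSE
 */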
gboolean
verify_stopped(enum crmd_fsa_state cur_state, int log_level)
{
gboolean res = TRUE;
GList *lrm_state_list = lrm_state_get_list();
GList *state_entry;
for (state_entry = lrm_state_list; state_entry != NULL; state_entry = state_entry->next) {
lrm_state_t *lrm_state = state_entry->data;
if (!lrm_state_verify_stopped(lrm_state, cur_state, log_level)) {
/* keep iterating through all even when false is returned */
res = FALSE;
}
}
controld_set_fsa_input_flags(R_SENT_RSC_STOP);
g_list_free(lrm_state_list); lrm_state_list = NULL;
return res;
}
struct stop_recurring_action_s {
lrmd_rsc_info_t *rsc;
lrm_state_t *lrm_state;
};
static gboolean
stop_recurring_action_by_rsc(gpointer key, gpointer value, gpointer user_data)
{
gboolean remove = FALSE;
struct stop_recurring_action_s *event = user_data;
active_op_t *op = value;
if ((op->interval_ms != 0)
&& pcmk__str_eq(op->rsc_id, event->rsc->id, pcmk__str_none)) {
crm_debug("Cancelling op %d for %s (%s)", op->call_id, op->rsc_id, (char*)key);
remove = !cancel_op(event->lrm_state, event->rsc->id, key, op->call_id, FALSE);
}
return remove;
}
static gboolean
stop_recurring_actions(gpointer key, gpointer value, gpointer user_data)
{
gboolean remove = FALSE;
lrm_state_t *lrm_state = user_data;
active_op_t *op = value;
if (op->interval_ms != 0) {
crm_info("Cancelling op %d for %s (%s)", op->call_id, op->rsc_id,
(const char *) key);
remove = !cancel_op(lrm_state, op->rsc_id, key, op->call_id, FALSE);
}
return remove;
}
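/*!
 * \internal
 * \brief Record an action as pending in the CIB, if appropriate
 *
 * \param[in] node_name  Name of node that the action is pending on
 * \param[in] rsc        Executor information about the action's resource
 * \param[in] op         Executor event for the pending action
 */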
static void
record_pending_op(const char *node_name, lrmd_rsc_info_t *rsc, lrmd_event_data_t *op)
{
const char *record_pending = NULL;
CRM_CHECK(node_name != NULL, return);
CRM_CHECK(rsc != NULL, return);
CRM_CHECK(op != NULL, return);
// Never record certain operation types as pending
if ((op->op_type == NULL) || (op->params == NULL)
|| !controld_action_is_recordable(op->op_type)) {
return;
}
    // Check whether the operation should be recorded as pending (defaults to true)
record_pending = crm_meta_value(op->params, XML_OP_ATTR_PENDING);
if (record_pending && !crm_is_true(record_pending)) {
return;
}
op->call_id = -1;
lrmd__set_result(op, PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, NULL);
op->t_run = time(NULL);
op->t_rcchange = op->t_run;
/* write a "pending" entry to the CIB, inhibit notification */
crm_debug("Recording pending op " PCMK__OP_FMT " on %s in the CIB",
op->rsc_id, op->op_type, op->interval_ms, node_name);
do_update_resource(node_name, rsc, op, 0);
}
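/*!
 * \internal
 * \brief Ask the executor to perform a resource action
 *
 * \param[in] lrm_state  Executor state to perform the action with
 * \param[in] rsc        Executor information about the action's resource
 * \param[in] operation  Name of action to perform
 * \param[in] msg        Original request XML (if any)
 */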
static void
do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
const char *operation, xmlNode *msg)
{
int rc;
int call_id = 0;
char *op_id = NULL;
lrmd_event_data_t *op = NULL;
lrmd_key_value_t *params = NULL;
fsa_data_t *msg_data = NULL;
const char *transition = NULL;
gboolean stop_recurring = FALSE;
const char *nack_reason = NULL;
CRM_CHECK(rsc != NULL, return);
CRM_CHECK(operation != NULL, return);
if (msg != NULL) {
transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY);
if (transition == NULL) {
crm_log_xml_err(msg, "Missing transition number");
}
}
op = construct_op(lrm_state, msg, rsc->id, operation);
CRM_CHECK(op != NULL, return);
if (is_remote_lrmd_ra(NULL, NULL, rsc->id)
&& (op->interval_ms == 0)
&& strcmp(operation, CRMD_ACTION_MIGRATE) == 0) {
        /* Pacemaker Remote connections are a special case. We never want to
         * stop monitoring a connection resource until the entire migration has
         * completed; if the connection is unexpectedly severed, even during a
         * migration, that is an event we must detect.
         */
stop_recurring = FALSE;
} else if ((op->interval_ms == 0)
&& strcmp(operation, CRMD_ACTION_STATUS) != 0
&& strcmp(operation, CRMD_ACTION_NOTIFY) != 0) {
/* stop any previous monitor operations before changing the resource state */
stop_recurring = TRUE;
}
if (stop_recurring == TRUE) {
guint removed = 0;
struct stop_recurring_action_s data;
data.rsc = rsc;
data.lrm_state = lrm_state;
removed = g_hash_table_foreach_remove(
lrm_state->pending_ops, stop_recurring_action_by_rsc, &data);
if (removed) {
crm_debug("Stopped %u recurring operation%s in preparation for "
PCMK__OP_FMT, removed, pcmk__plural_s(removed),
rsc->id, operation, op->interval_ms);
}
}
/* now do the op */
crm_notice("Requesting local execution of %s operation for %s on %s "
CRM_XS " transition_key=%s op_key=" PCMK__OP_FMT,
crm_action_str(op->op_type, op->interval_ms), rsc->id, lrm_state->node_name,
transition, rsc->id, operation, op->interval_ms);
if (pcmk_is_set(fsa_input_register, R_SHUTDOWN)
&& pcmk__str_eq(operation, RSC_START, pcmk__str_casei)) {
register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL);
nack_reason = "Not attempting start due to shutdown in progress";
} else if (fsa_state != S_NOT_DC
&& fsa_state != S_POLICY_ENGINE /* Recalculating */
&& fsa_state != S_TRANSITION_ENGINE
&& !pcmk__str_eq(operation, CRMD_ACTION_STOP, pcmk__str_casei)) {
nack_reason = "Controller cannot attempt actions at this time";
}
if (nack_reason != NULL) {
crm_notice("Discarding attempt to perform action %s on %s in state %s (shutdown=%s)",
operation, rsc->id, fsa_state2string(fsa_state),
pcmk__btoa(pcmk_is_set(fsa_input_register, R_SHUTDOWN)));
lrmd__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_INVALID,
nack_reason);
controld_ack_event_directly(NULL, NULL, rsc, op, rsc->id);
lrmd_free_event(op);
free(op_id);
return;
}
record_pending_op(lrm_state->node_name, rsc, op);
op_id = pcmk__op_key(rsc->id, op->op_type, op->interval_ms);
if (op->interval_ms > 0) {
/* cancel it so we can then restart it without conflict */
cancel_op_key(lrm_state, rsc, op_id, FALSE);
}
if (op->params) {
char *key = NULL;
char *value = NULL;
GHashTableIter iter;
g_hash_table_iter_init(&iter, op->params);
while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) {
params = lrmd_key_value_add(params, key, value);
}
}
rc = controld_execute_resource_agent(lrm_state, rsc->id, op->op_type,
op->user_data, op->interval_ms,
op->timeout, op->start_delay, params,
&call_id);
if (rc == pcmk_rc_ok) {
/* record all operations so we can wait
* for them to complete during shutdown
*/
char *call_id_s = make_stop_id(rsc->id, call_id);
active_op_t *pending = NULL;
pending = calloc(1, sizeof(active_op_t));
crm_trace("Recording pending op: %d - %s %s", call_id, op_id, call_id_s);
pending->call_id = call_id;
pending->interval_ms = op->interval_ms;
pending->op_type = strdup(operation);
pending->op_key = strdup(op_id);
pending->rsc_id = strdup(rsc->id);
pending->start_time = time(NULL);
pcmk__str_update(&pending->user_data, op->user_data);
if (crm_element_value_epoch(msg, XML_CONFIG_ATTR_SHUTDOWN_LOCK,
&(pending->lock_time)) != pcmk_ok) {
pending->lock_time = 0;
}
g_hash_table_replace(lrm_state->pending_ops, call_id_s, pending);
if ((op->interval_ms > 0)
&& (op->start_delay > START_DELAY_THRESHOLD)) {
int target_rc = PCMK_OCF_OK;
crm_info("Faking confirmation of %s: execution postponed for over 5 minutes", op_id);
decode_transition_key(op->user_data, NULL, NULL, NULL, &target_rc);
lrmd__set_result(op, target_rc, PCMK_EXEC_DONE, NULL);
controld_ack_event_directly(NULL, NULL, rsc, op, rsc->id);
}
pending->params = op->params;
op->params = NULL;
} else if (lrm_state_is_local(lrm_state)) {
crm_err("Could not initiate %s action for resource %s locally: %s "
CRM_XS " rc=%d", operation, rsc->id, pcmk_rc_str(rc), rc);
fake_op_status(lrm_state, op, PCMK_EXEC_NOT_CONNECTED,
PCMK_OCF_UNKNOWN_ERROR, pcmk_rc_str(rc));
process_lrm_event(lrm_state, op, NULL, NULL);
register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
} else {
crm_err("Could not initiate %s action for resource %s remotely on %s: "
"%s " CRM_XS " rc=%d",
operation, rsc->id, lrm_state->node_name, pcmk_rc_str(rc), rc);
fake_op_status(lrm_state, op, PCMK_EXEC_NOT_CONNECTED,
PCMK_OCF_UNKNOWN_ERROR, pcmk_rc_str(rc));
process_lrm_event(lrm_state, op, NULL, NULL);
}
free(op_id);
lrmd_free_event(op);
}
int last_resource_update = 0;
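// Handle the result of an asynchronous resource history update in the CIB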
static void
cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
switch (rc) {
case pcmk_ok:
case -pcmk_err_diff_failed:
case -pcmk_err_diff_resync:
crm_trace("Resource update %d complete: rc=%d", call_id, rc);
break;
default:
crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc));
}
if (call_id == last_resource_update) {
last_resource_update = 0;
trigger_fsa();
}
}
/* Only successful stops, and probes that found the resource inactive, get locks
* recorded in the history. This ensures the resource stays locked to the node
* until it is active there again after the node comes back up.
*/
static bool
should_preserve_lock(lrmd_event_data_t *op)
{
if (!controld_shutdown_lock_enabled) {
return false;
}
if (!strcmp(op->op_type, RSC_STOP) && (op->rc == PCMK_OCF_OK)) {
return true;
}
if (!strcmp(op->op_type, RSC_STATUS) && (op->rc == PCMK_OCF_NOT_RUNNING)) {
return true;
}
return false;
}
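/*!
 * \internal
 * \brief Write the result of a resource action to the CIB status section
 *
 * \param[in] node_name  Name of node that the action was performed on
 * \param[in] rsc        Executor information about the action's resource (if any)
 * \param[in] op         Executor event with the action result
 * \param[in] lock_time  Shutdown lock time to record with the result, or 0
 *
 * \return Call ID of the asynchronous CIB update (or a negative error code)
 */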
static int
do_update_resource(const char *node_name, lrmd_rsc_info_t *rsc,
lrmd_event_data_t *op, time_t lock_time)
{
int rc = pcmk_ok;
xmlNode *update, *iter = NULL;
int call_opt = crmd_cib_smart_opt();
const char *uuid = NULL;
CRM_CHECK(op != NULL, return 0);
iter = create_xml_node(iter, XML_CIB_TAG_STATUS);
update = iter;
iter = create_xml_node(iter, XML_CIB_TAG_STATE);
if (pcmk__str_eq(node_name, fsa_our_uname, pcmk__str_casei)) {
uuid = fsa_our_uuid;
} else {
/* remote nodes uuid and uname are equal */
uuid = node_name;
pcmk__xe_set_bool_attr(iter, XML_NODE_IS_REMOTE, true);
}
CRM_LOG_ASSERT(uuid != NULL);
if(uuid == NULL) {
rc = -EINVAL;
goto done;
}
crm_xml_add(iter, XML_ATTR_UUID, uuid);
crm_xml_add(iter, XML_ATTR_UNAME, node_name);
crm_xml_add(iter, XML_ATTR_ORIGIN, __func__);
iter = create_xml_node(iter, XML_CIB_TAG_LRM);
crm_xml_add(iter, XML_ATTR_ID, uuid);
iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES);
iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE);
crm_xml_add(iter, XML_ATTR_ID, op->rsc_id);
build_operation_update(iter, rsc, op, node_name, __func__);
if (rsc) {
const char *container = NULL;
crm_xml_add(iter, XML_ATTR_TYPE, rsc->type);
crm_xml_add(iter, XML_AGENT_ATTR_CLASS, rsc->standard);
crm_xml_add(iter, XML_AGENT_ATTR_PROVIDER, rsc->provider);
if (lock_time != 0) {
/* Actions on a locked resource should either preserve the lock by
* recording it with the action result, or clear it.
*/
if (!should_preserve_lock(op)) {
lock_time = 0;
}
crm_xml_add_ll(iter, XML_CONFIG_ATTR_SHUTDOWN_LOCK,
(long long) lock_time);
}
if (op->params) {
container = g_hash_table_lookup(op->params, CRM_META"_"XML_RSC_ATTR_CONTAINER);
}
if (container) {
crm_trace("Resource %s is a part of container resource %s", op->rsc_id, container);
crm_xml_add(iter, XML_RSC_ATTR_CONTAINER, container);
}
} else {
crm_warn("Resource %s no longer exists in the executor", op->rsc_id);
controld_ack_event_directly(NULL, NULL, rsc, op, op->rsc_id);
goto cleanup;
}
crm_log_xml_trace(update, __func__);
/* make it an asynchronous call and be done with it
*
* Best case:
* the resource state will be discovered during
* the next signup or election.
*
* Bad case:
* we are shutting down and there is no DC at the time,
* but then why were we shutting down then anyway?
* (probably because of an internal error)
*
* Worst case:
* we get shot for having resources "running" that really weren't
*
* the alternative however means blocking here for too long, which
* isn't acceptable
*/
fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, rc, NULL);
if (rc > 0) {
last_resource_update = rc;
}
done:
/* the return code is a call number, not an error code */
crm_trace("Sent resource state update message: %d for %s=%u on %s",
rc, op->op_type, op->interval_ms, op->rsc_id);
fsa_register_cib_callback(rc, FALSE, NULL, cib_rsc_callback);
cleanup:
free_xml(update);
return rc;
}
void
do_lrm_event(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t * msg_data)
{
CRM_CHECK(FALSE, return);
}
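/*!
 * \internal
 * \brief Replace escaped newline sequences in a string with real newlines
 *
 * \param[in] string  String to unescape
 *
 * \return Newly allocated copy of string with escapes replaced, or NULL
 * \note The caller is responsible for freeing the result.
 */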
static char *
unescape_newlines(const char *string)
{
char *pch = NULL;
char *ret = NULL;
static const char *escaped_newline = "\\n";
if (!string) {
return NULL;
}
ret = strdup(string);
pch = strstr(ret, escaped_newline);
while (pch != NULL) {
/* Replace newline escape pattern with actual newline (and a space so we
* don't have to shuffle the rest of the buffer)
*/
pch[0] = '\n';
pch[1] = ' ';
pch = strstr(pch, escaped_newline);
}
return ret;
}
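/*!
 * \internal
 * \brief Check whether a resource's last recorded failure matches an operation
 *
 * \param[in] lrm_state    Executor state containing the resource history
 * \param[in] rsc_id       ID of resource to check
 * \param[in] op_type      Name of operation to match against the last failure
 * \param[in] interval_ms  Interval (in milliseconds) to match against the last failure
 *
 * \return TRUE if the last recorded failure matches, otherwise FALSE
 */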
static bool
did_lrm_rsc_op_fail(lrm_state_t *lrm_state, const char * rsc_id,
const char * op_type, guint interval_ms)
{
rsc_history_t *entry = NULL;
CRM_CHECK(lrm_state != NULL, return FALSE);
CRM_CHECK(rsc_id != NULL, return FALSE);
CRM_CHECK(op_type != NULL, return FALSE);
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
if (entry == NULL || entry->failed == NULL) {
return FALSE;
}
if (pcmk__str_eq(entry->failed->rsc_id, rsc_id, pcmk__str_none)
&& pcmk__str_eq(entry->failed->op_type, op_type, pcmk__str_casei)
&& entry->failed->interval_ms == interval_ms) {
return TRUE;
}
return FALSE;
}
/*!
* \internal
* \brief Log the result of an executor action (actual or synthesized)
*
* \param[in] op Executor action to log result for
* \param[in] op_key Operation key for action
* \param[in] node_name Name of node action was performed on, if known
* \param[in] update_id Call ID for CIB update (or 0 if none)
* \param[in] confirmed Whether to log that graph action was confirmed
*/
static void
log_executor_event(lrmd_event_data_t *op, const char *op_key,
const char *node_name, int update_id, gboolean confirmed)
{
int log_level = LOG_ERR;
GString *str = g_string_sized_new(100); // reasonable starting size
g_string_printf(str, "Result of %s operation for %s",
crm_action_str(op->op_type, op->interval_ms), op->rsc_id);
if (node_name != NULL) {
g_string_append_printf(str, " on %s", node_name);
}
switch (op->op_status) {
case PCMK_EXEC_DONE:
log_level = LOG_NOTICE;
g_string_append_printf(str, ": %s",
services_ocf_exitcode_str(op->rc));
break;
case PCMK_EXEC_TIMEOUT:
g_string_append_printf(str, ": %s after %s",
pcmk_exec_status_str(op->op_status),
pcmk__readable_interval(op->timeout));
break;
case PCMK_EXEC_CANCELLED:
log_level = LOG_INFO;
// Fall through
default:
g_string_append_printf(str, ": %s",
pcmk_exec_status_str(op->op_status));
}
if ((op->exit_reason != NULL)
&& ((op->op_status != PCMK_EXEC_DONE) || (op->rc != PCMK_OCF_OK))) {
g_string_append_printf(str, " (%s)", op->exit_reason);
}
g_string_append(str, " " CRM_XS);
if (update_id != 0) {
g_string_append_printf(str, " CIB update %d,", update_id);
}
g_string_append_printf(str, " graph action %sconfirmed; call=%d key=%s",
(confirmed? "" : "un"), op->call_id, op_key);
if (op->op_status == PCMK_EXEC_DONE) {
g_string_append_printf(str, " rc=%d", op->rc);
}
do_crm_log(log_level, "%s", str->str);
g_string_free(str, TRUE);
/* The services library has already logged the output at info or debug
* level, so just raise to notice if it looks like a failure.
*/
if ((op->output != NULL) && (op->rc != PCMK_OCF_OK)) {
char *prefix = crm_strdup_printf(PCMK__OP_FMT "@%s output",
op->rsc_id, op->op_type,
op->interval_ms, node_name);
crm_log_output(LOG_NOTICE, prefix, op->output);
free(prefix);
}
}
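/*!
 * \internal
 * \brief Process an executor event (actual or synthesized)
 *
 * \param[in] lrm_state   Executor state for the event (if known)
 * \param[in] op          Executor event to process
 * \param[in] pending     Pending operation that the event completes (if known)
 * \param[in] action_xml  Action XML from the request, for synthesized events
 */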
void
process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
active_op_t *pending, xmlNode *action_xml)
{
char *op_id = NULL;
char *op_key = NULL;
int update_id = 0;
gboolean remove = FALSE;
gboolean removed = FALSE;
bool need_direct_ack = FALSE;
lrmd_rsc_info_t *rsc = NULL;
const char *node_name = NULL;
CRM_CHECK(op != NULL, return);
CRM_CHECK(op->rsc_id != NULL, return);
// Remap new status codes for older DCs
if (compare_version(fsa_our_dc_version, "3.2.0") < 0) {
switch (op->op_status) {
case PCMK_EXEC_NOT_CONNECTED:
lrmd__set_result(op, PCMK_OCF_CONNECTION_DIED,
PCMK_EXEC_ERROR, op->exit_reason);
break;
case PCMK_EXEC_INVALID:
lrmd__set_result(op, CRM_DIRECT_NACK_RC, PCMK_EXEC_ERROR,
op->exit_reason);
break;
default:
break;
}
}
op_id = make_stop_id(op->rsc_id, op->call_id);
op_key = pcmk__op_key(op->rsc_id, op->op_type, op->interval_ms);
// Get resource info if available (from executor state or action XML)
if (lrm_state) {
rsc = lrm_state_get_rsc_info(lrm_state, op->rsc_id, 0);
}
if ((rsc == NULL) && action_xml) {
xmlNode *xml = find_xml_node(action_xml, XML_CIB_TAG_RESOURCE, TRUE);
const char *standard = crm_element_value(xml, XML_AGENT_ATTR_CLASS);
const char *provider = crm_element_value(xml, XML_AGENT_ATTR_PROVIDER);
const char *type = crm_element_value(xml, XML_ATTR_TYPE);
if (standard && type) {
crm_info("%s agent information not cached, using %s%s%s:%s from action XML",
op->rsc_id, standard,
(provider? ":" : ""), (provider? provider : ""), type);
rsc = lrmd_new_rsc_info(op->rsc_id, standard, provider, type);
} else {
crm_err("Can't process %s result because %s agent information not cached or in XML",
op_key, op->rsc_id);
}
}
// Get node name if available (from executor state or action XML)
if (lrm_state) {
node_name = lrm_state->node_name;
} else if (action_xml) {
node_name = crm_element_value(action_xml, XML_LRM_ATTR_TARGET);
}
if(pending == NULL) {
remove = TRUE;
if (lrm_state) {
pending = g_hash_table_lookup(lrm_state->pending_ops, op_id);
}
}
if (op->op_status == PCMK_EXEC_ERROR) {
switch(op->rc) {
case PCMK_OCF_NOT_RUNNING:
case PCMK_OCF_RUNNING_PROMOTED:
case PCMK_OCF_DEGRADED:
case PCMK_OCF_DEGRADED_PROMOTED:
// Leave it to the TE/scheduler to decide if this is an error
op->op_status = PCMK_EXEC_DONE;
break;
default:
/* Nothing to do */
break;
}
}
if (op->op_status != PCMK_EXEC_CANCELLED) {
/* We might not record the result, so directly acknowledge it to the
* originator instead, so it doesn't time out waiting for the result
* (especially important if part of a transition).
*/
need_direct_ack = TRUE;
if (controld_action_is_recordable(op->op_type)) {
if (node_name && rsc) {
// We should record the result, and happily, we can
update_id = do_update_resource(node_name, rsc, op,
pending? pending->lock_time : 0);
need_direct_ack = FALSE;
} else if (op->rsc_deleted) {
/* We shouldn't record the result (likely the resource was
* refreshed, cleaned, or removed while this operation was
* in flight).
*/
crm_notice("Not recording %s result in CIB because "
"resource information was removed since it was initiated",
op_key);
} else {
/* This shouldn't be possible; the executor didn't consider the
* resource deleted, but we couldn't find resource or node
* information.
*/
crm_err("Unable to record %s result in CIB: %s", op_key,
(node_name? "No resource information" : "No node name"));
}
}
} else if (op->interval_ms == 0) {
/* A non-recurring operation was cancelled. Most likely, the
* never-initiated action was removed from the executor's pending
* operations list upon resource removal.
*/
need_direct_ack = TRUE;
} else if (pending == NULL) {
/* This recurring operation was cancelled, but was not pending. No
* transition actions are waiting on it, nothing needs to be done.
*/
} else if (op->user_data == NULL) {
/* This recurring operation was cancelled and pending, but we don't
* have a transition key. This should never happen.
*/
crm_err("Recurring operation %s was cancelled without transition information",
op_key);
} else if (pcmk_is_set(pending->flags, active_op_remove)) {
/* This recurring operation was cancelled (by us) and pending, and we
* have been waiting for it to finish.
*/
if (lrm_state) {
erase_lrm_history_by_op(lrm_state, op);
}
        /* If the recurring operation had failed, the lrm_rsc_op is recorded as
         * "last_failure", which erase_lrm_history_by_op() deliberately does not
         * erase from the CIB. In that case, the cancel action would never be
         * confirmed by the DC via process_op_deletion(), and the cluster
         * transition would get stuck waiting for the remaining action timer to
         * time out.
         *
         * Directly acknowledge the cancel operation in this case.
         */
if (did_lrm_rsc_op_fail(lrm_state, pending->rsc_id,
pending->op_type, pending->interval_ms)) {
need_direct_ack = TRUE;
}
} else if (op->rsc_deleted) {
/* This recurring operation was cancelled (but not by us, and the
* executor does not have resource information, likely due to resource
* cleanup, refresh, or removal) and pending.
*/
crm_debug("Recurring op %s was cancelled due to resource deletion",
op_key);
need_direct_ack = TRUE;
} else {
/* This recurring operation was cancelled (but not by us, likely by the
* executor before stopping the resource) and pending. We don't need to
* do anything special.
*/
}
if (need_direct_ack) {
controld_ack_event_directly(NULL, NULL, NULL, op, op->rsc_id);
}
if(remove == FALSE) {
/* The caller will do this afterwards, but keep the logging consistent */
removed = TRUE;
} else if (lrm_state && ((op->interval_ms == 0)
|| (op->op_status == PCMK_EXEC_CANCELLED))) {
gboolean found = g_hash_table_remove(lrm_state->pending_ops, op_id);
if (op->interval_ms != 0) {
removed = TRUE;
} else if (found) {
removed = TRUE;
crm_trace("Op %s (call=%d, stop-id=%s, remaining=%u): Confirmed",
op_key, op->call_id, op_id,
g_hash_table_size(lrm_state->pending_ops));
}
}
log_executor_event(op, op_key, node_name, update_id, removed);
if (lrm_state) {
if (!pcmk__str_eq(op->op_type, RSC_METADATA, pcmk__str_casei)) {
crmd_alert_resource_op(lrm_state->node_name, op);
} else if (rsc && (op->rc == PCMK_OCF_OK)) {
char *metadata = unescape_newlines(op->output);
metadata_cache_update(lrm_state->metadata_cache, rsc, metadata);
free(metadata);
}
}
if (op->rsc_deleted) {
crm_info("Deletion of resource '%s' complete after %s", op->rsc_id, op_key);
if (lrm_state) {
delete_rsc_entry(lrm_state, NULL, op->rsc_id, NULL, pcmk_ok, NULL);
}
}
/* If a shutdown was escalated while operations were pending,
* then the FSA will be stalled right now... allow it to continue
*/
mainloop_set_trigger(fsa_source);
if (lrm_state && rsc) {
update_history_cache(lrm_state, rsc, op);
}
lrmd_free_rsc_info(rsc);
free(op_key);
free(op_id);
}
diff --git a/include/crm/crm.h b/include/crm/crm.h
index 47528dcae9..9aa8ef7397 100644
--- a/include/crm/crm.h
+++ b/include/crm/crm.h
@@ -1,241 +1,241 @@
/*
* Copyright 2004-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#ifndef PCMK__CRM_CRM__H
# define PCMK__CRM_CRM__H
# include
# include
# include
# include
# include
# include
#ifdef __cplusplus
extern "C" {
#endif
/**
* \file
* \brief A dumping ground
* \ingroup core
*/
#ifndef PCMK_ALLOW_DEPRECATED
/*!
* \brief Allow use of deprecated Pacemaker APIs
*
* By default, external code using Pacemaker headers is allowed to use
* deprecated Pacemaker APIs. If PCMK_ALLOW_DEPRECATED is defined to 0 before
* including any Pacemaker headers, deprecated APIs will be unusable. It is
* strongly recommended to leave this unchanged for production and release
* builds, to avoid breakage when users upgrade to new Pacemaker releases that
* deprecate more APIs. This should be defined to 0 only for development and
* testing builds when desiring to check for usage of currently deprecated APIs.
*/
#define PCMK_ALLOW_DEPRECATED 1
#endif
/*!
* The CRM feature set assists with compatibility in mixed-version clusters.
* The major version number increases when nodes with different versions
* would not work (rolling upgrades are not allowed). The minor version
* number increases when mixed-version clusters are allowed only during
* rolling upgrades (a node with the oldest feature set will be elected DC). The
* minor-minor version number is ignored, but allows resource agents to detect
* cluster support for various features.
*
* The feature set also affects the processing of old saved CIBs (such as for
* many scheduler regression tests).
*
* Particular feature points currently tested by Pacemaker code:
*
* >2.1: Operation updates include timing data
* >=3.0.5: XML v2 digests are created
* >=3.0.8: Peers do not need acks for cancellations
* >=3.0.9: DC will send its own shutdown request to all peers
* XML v2 patchsets are created by default
* >=3.0.13: Fail counts include operation name and interval
* >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED
*/
-# define CRM_FEATURE_SET "3.15.1"
+# define CRM_FEATURE_SET "3.16.0"
/* Pacemaker's CPG protocols use fixed-width binary fields for the sender and
* recipient of a CPG message. This imposes an arbitrary limit on cluster node
* names.
*/
//! \brief Maximum length of a Corosync cluster node name (in bytes)
#define MAX_NAME 256
# define CRM_META "CRM_meta"
extern char *crm_system_name;
/* *INDENT-OFF* */
// How we represent "infinite" scores
# define CRM_SCORE_INFINITY 1000000
# define CRM_INFINITY_S "INFINITY"
# define CRM_PLUS_INFINITY_S "+" CRM_INFINITY_S
# define CRM_MINUS_INFINITY_S "-" CRM_INFINITY_S
/* @COMPAT API < 2.0.0 Deprecated "infinity" aliases
*
* INFINITY might be defined elsewhere (e.g. math.h), so undefine it first.
* This, of course, complicates any attempt to use the other definition in any
* code that includes this header.
*/
# undef INFINITY
# define INFINITY_S "INFINITY"
# define MINUS_INFINITY_S "-INFINITY"
# define INFINITY 1000000
/* Sub-systems */
# define CRM_SYSTEM_DC "dc"
# define CRM_SYSTEM_DCIB "dcib"
/* The master CIB */
# define CRM_SYSTEM_CIB "cib"
# define CRM_SYSTEM_CRMD "crmd"
# define CRM_SYSTEM_LRMD "lrmd"
# define CRM_SYSTEM_PENGINE "pengine"
# define CRM_SYSTEM_TENGINE "tengine"
# define CRM_SYSTEM_STONITHD "stonithd"
# define CRM_SYSTEM_MCP "pacemakerd"
// Names of internally generated node attributes
# define CRM_ATTR_UNAME "#uname"
# define CRM_ATTR_ID "#id"
# define CRM_ATTR_KIND "#kind"
# define CRM_ATTR_ROLE "#role"
# define CRM_ATTR_IS_DC "#is_dc"
# define CRM_ATTR_CLUSTER_NAME "#cluster-name"
# define CRM_ATTR_SITE_NAME "#site-name"
# define CRM_ATTR_UNFENCED "#node-unfenced"
# define CRM_ATTR_DIGESTS_ALL "#digests-all"
# define CRM_ATTR_DIGESTS_SECURE "#digests-secure"
# define CRM_ATTR_RA_VERSION "#ra-version"
# define CRM_ATTR_PROTOCOL "#attrd-protocol"
# define CRM_ATTR_FEATURE_SET "#feature-set"
/* Valid operations */
# define CRM_OP_NOOP "noop"
# define CRM_OP_JOIN_ANNOUNCE "join_announce"
# define CRM_OP_JOIN_OFFER "join_offer"
# define CRM_OP_JOIN_REQUEST "join_request"
# define CRM_OP_JOIN_ACKNAK "join_ack_nack"
# define CRM_OP_JOIN_CONFIRM "join_confirm"
# define CRM_OP_PING "ping"
# define CRM_OP_NODE_INFO "node-info"
# define CRM_OP_THROTTLE "throttle"
# define CRM_OP_VOTE "vote"
# define CRM_OP_NOVOTE "no-vote"
# define CRM_OP_HELLO "hello"
# define CRM_OP_PECALC "pe_calc"
# define CRM_OP_QUIT "quit"
# define CRM_OP_LOCAL_SHUTDOWN "start_shutdown"
# define CRM_OP_SHUTDOWN_REQ "req_shutdown"
# define CRM_OP_SHUTDOWN "do_shutdown"
# define CRM_OP_FENCE "stonith"
# define CRM_OP_REGISTER "register"
# define CRM_OP_IPC_FWD "ipc_fwd"
# define CRM_OP_INVOKE_LRM "lrm_invoke"
# define CRM_OP_LRM_REFRESH "lrm_refresh" /* Deprecated */
# define CRM_OP_LRM_QUERY "lrm_query"
# define CRM_OP_LRM_DELETE "lrm_delete"
# define CRM_OP_LRM_FAIL "lrm_fail"
# define CRM_OP_PROBED "probe_complete"
# define CRM_OP_REPROBE "probe_again"
# define CRM_OP_CLEAR_FAILCOUNT "clear_failcount"
# define CRM_OP_REMOTE_STATE "remote_state"
# define CRM_OP_RELAXED_SET "one-or-more"
# define CRM_OP_RELAXED_CLONE "clone-one-or-more"
# define CRM_OP_RM_NODE_CACHE "rm_node_cache"
# define CRM_OP_MAINTENANCE_NODES "maintenance_nodes"
/* Possible cluster membership states */
# define CRMD_JOINSTATE_DOWN "down"
# define CRMD_JOINSTATE_PENDING "pending"
# define CRMD_JOINSTATE_MEMBER "member"
# define CRMD_JOINSTATE_NACK "banned"
# define CRMD_ACTION_DELETE "delete"
# define CRMD_ACTION_CANCEL "cancel"
# define CRMD_ACTION_RELOAD "reload"
# define CRMD_ACTION_RELOAD_AGENT "reload-agent"
# define CRMD_ACTION_MIGRATE "migrate_to"
# define CRMD_ACTION_MIGRATED "migrate_from"
# define CRMD_ACTION_START "start"
# define CRMD_ACTION_STARTED "running"
# define CRMD_ACTION_STOP "stop"
# define CRMD_ACTION_STOPPED "stopped"
# define CRMD_ACTION_PROMOTE "promote"
# define CRMD_ACTION_PROMOTED "promoted"
# define CRMD_ACTION_DEMOTE "demote"
# define CRMD_ACTION_DEMOTED "demoted"
# define CRMD_ACTION_NOTIFY "notify"
# define CRMD_ACTION_NOTIFIED "notified"
# define CRMD_ACTION_STATUS "monitor"
# define CRMD_ACTION_METADATA "meta-data"
# define CRMD_METADATA_CALL_TIMEOUT 30000
/* short names */
# define RSC_DELETE CRMD_ACTION_DELETE
# define RSC_CANCEL CRMD_ACTION_CANCEL
# define RSC_MIGRATE CRMD_ACTION_MIGRATE
# define RSC_MIGRATED CRMD_ACTION_MIGRATED
# define RSC_START CRMD_ACTION_START
# define RSC_STARTED CRMD_ACTION_STARTED
# define RSC_STOP CRMD_ACTION_STOP
# define RSC_STOPPED CRMD_ACTION_STOPPED
# define RSC_PROMOTE CRMD_ACTION_PROMOTE
# define RSC_PROMOTED CRMD_ACTION_PROMOTED
# define RSC_DEMOTE CRMD_ACTION_DEMOTE
# define RSC_DEMOTED CRMD_ACTION_DEMOTED
# define RSC_NOTIFY CRMD_ACTION_NOTIFY
# define RSC_NOTIFIED CRMD_ACTION_NOTIFIED
# define RSC_STATUS CRMD_ACTION_STATUS
# define RSC_METADATA CRMD_ACTION_METADATA
/* *INDENT-ON* */
# include
# include
# include
static inline const char *
crm_action_str(const char *task, guint interval_ms) {
if ((task != NULL) && (interval_ms == 0)
&& (strcasecmp(task, RSC_STATUS) == 0)) {
return "probe";
}
return task;
}
#if !defined(PCMK_ALLOW_DEPRECATED) || (PCMK_ALLOW_DEPRECATED == 1)
#include
#endif
#ifdef __cplusplus
}
#endif
#endif
diff --git a/lib/pacemaker/pcmk_resource.c b/lib/pacemaker/pcmk_resource.c
index 90739165a5..159ba8e1dc 100644
--- a/lib/pacemaker/pcmk_resource.c
+++ b/lib/pacemaker/pcmk_resource.c
@@ -1,138 +1,172 @@
/*
* Copyright 2021-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
// Search path for resource operation history (takes node name and resource ID)
#define XPATH_OP_HISTORY "//" XML_CIB_TAG_STATUS \
"/" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']" \
"/" XML_CIB_TAG_LRM "/" XML_LRM_TAG_RESOURCES \
"/" XML_LRM_TAG_RESOURCE "[@" XML_ATTR_ID "='%s']"
static xmlNode *
best_op(pe_resource_t *rsc, pe_node_t *node, pe_working_set_t *data_set)
{
char *xpath = NULL;
xmlNode *history = NULL;
xmlNode *best = NULL;
+ bool best_effective_op = false;
+ guint best_interval = 0;
+ bool best_failure = false;
+ const char *best_digest = NULL;
// Find node's resource history
xpath = crm_strdup_printf(XPATH_OP_HISTORY, node->details->uname, rsc->id);
history = get_xpath_object(xpath, data_set->input, LOG_NEVER);
free(xpath);
// Examine each history entry
for (xmlNode *lrm_rsc_op = first_named_child(history, XML_LRM_TAG_RSC_OP);
lrm_rsc_op != NULL; lrm_rsc_op = crm_next_same_xml(lrm_rsc_op)) {
const char *digest = crm_element_value(lrm_rsc_op,
XML_LRM_ATTR_RESTART_DIGEST);
guint interval_ms = 0;
+ const char *task = crm_element_value(lrm_rsc_op, XML_LRM_ATTR_TASK);
+ bool effective_op = false;
+ bool failure = pcmk__ends_with(ID(lrm_rsc_op), "_last_failure_0");
+
crm_element_value_ms(lrm_rsc_op, XML_LRM_ATTR_INTERVAL, &interval_ms);
+ effective_op = interval_ms == 0
+ && pcmk__strcase_any_of(task, RSC_STATUS,
+ RSC_START, RSC_PROMOTE,
+ RSC_MIGRATED, NULL);
- if (pcmk__ends_with(ID(lrm_rsc_op), "_last_failure_0")
- || (interval_ms != 0)) {
+ if (best == NULL) {
+ goto is_best;
+ }
- // Only use last failure or recurring op if nothing else available
- if (best == NULL) {
- best = lrm_rsc_op;
+ if (best_effective_op) {
+ // Do not use an ineffective op if there's an effective one.
+ if (!effective_op) {
+ continue;
}
+ // Do not use an ineffective non-recurring op if there's a recurring one.
+ } else if (best_interval != 0
+ && !effective_op
+ && interval_ms == 0) {
+ continue;
+ }
+
+ // Do not use last failure if there's a successful one.
+ if (!best_failure && failure) {
continue;
}
- best = lrm_rsc_op;
- if (digest != NULL) {
- // Any non-recurring action with a restart digest is sufficient
- break;
+ // Do not use an op without a restart digest if there's one with.
+ if (best_digest != NULL && digest == NULL) {
+ continue;
}
+
+ // Do not use an older op if there's a newer one.
+ if (pe__is_newer_op(best, lrm_rsc_op, true) > 0) {
+ continue;
+ }
+
+is_best:
+ best = lrm_rsc_op;
+ best_effective_op = effective_op;
+ best_interval = interval_ms;
+ best_failure = failure;
+ best_digest = digest;
}
return best;
}
/*!
* \internal
* \brief Calculate and output resource operation digests
*
* \param[in] out Output object
* \param[in] rsc Resource to calculate digests for
* \param[in] node Node whose operation history should be used
* \param[in] overrides Hash table of configuration parameters to override
*
* \return Standard Pacemaker return code
*/
int
pcmk__resource_digests(pcmk__output_t *out, pe_resource_t *rsc,
pe_node_t *node, GHashTable *overrides)
{
const char *task = NULL;
xmlNode *xml_op = NULL;
op_digest_cache_t *digests = NULL;
guint interval_ms = 0;
int rc = pcmk_rc_ok;
if ((out == NULL) || (rsc == NULL) || (node == NULL)) {
return EINVAL;
}
if (rsc->variant != pe_native) {
// Only primitives get operation digests
return EOPNOTSUPP;
}
// Find XML of operation history to use
xml_op = best_op(rsc, node, rsc->cluster);
// Generate an operation key
if (xml_op != NULL) {
task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms);
}
if (task == NULL) { // Assume start if no history is available
task = RSC_START;
interval_ms = 0;
}
// Calculate and show digests
digests = pe__calculate_digests(rsc, task, &interval_ms, node, xml_op,
overrides, true, rsc->cluster);
rc = out->message(out, "digests", rsc, node, task, interval_ms, digests);
pe__free_digests(digests);
return rc;
}
int
pcmk_resource_digests(xmlNodePtr *xml, pe_resource_t *rsc,
pe_node_t *node, GHashTable *overrides,
pe_working_set_t *data_set)
{
pcmk__output_t *out = NULL;
int rc = pcmk_rc_ok;
rc = pcmk__xml_output_new(&out, xml);
if (rc != pcmk_rc_ok) {
return rc;
}
pcmk__register_lib_messages(out);
rc = pcmk__resource_digests(out, rsc, node, overrides);
pcmk__xml_output_finish(out, xml);
return rc;
}
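/* Illustrative usage of the public wrapper above (a sketch only; rsc, node,
 * and data_set are assumed to come from an already-unpacked CIB):
 *
 *   xmlNode *digests_xml = NULL;
 *   int rc = pcmk_resource_digests(&digests_xml, rsc, node, NULL, data_set);
 *
 *   if (rc == pcmk_rc_ok) {
 *       // digests_xml now holds the "digests" message produced by
 *       // pcmk__resource_digests() for rsc on node
 *   }
 *   free_xml(digests_xml);
 */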
diff --git a/lib/pengine/pe_digest.c b/lib/pengine/pe_digest.c
index cb58397218..d0238950d3 100644
--- a/lib/pengine/pe_digest.c
+++ b/lib/pengine/pe_digest.c
@@ -1,580 +1,637 @@
/*
* Copyright 2004-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include "pe_status_private.h"
extern bool pcmk__is_daemon;
/*!
* \internal
* \brief Free an operation digest cache entry
*
* \param[in] ptr Pointer to cache entry to free
*
* \note The argument is a gpointer so this can be used as a hash table
* free function.
*/
void
pe__free_digests(gpointer ptr)
{
op_digest_cache_t *data = ptr;
if (data != NULL) {
free_xml(data->params_all);
free_xml(data->params_secure);
free_xml(data->params_restart);
free(data->digest_all_calc);
free(data->digest_restart_calc);
free(data->digest_secure_calc);
free(data);
}
}
-// Return true if XML attribute name is substring of a given string
+// Return true if XML attribute name is not substring of a given string
static bool
-attr_in_string(xmlAttrPtr a, void *user_data)
+attr_not_in_string(xmlAttrPtr a, void *user_data)
{
bool filter = false;
char *name = crm_strdup_printf(" %s ", (const char *) a->name);
if (strstr((const char *) user_data, name) == NULL) {
crm_trace("Filtering %s (not found in '%s')",
(const char *) a->name, (const char *) user_data);
filter = true;
}
free(name);
return filter;
}
-// Return true if XML attribute name is not substring of a given string
+// Return true if XML attribute name is substring of a given string
static bool
-attr_not_in_string(xmlAttrPtr a, void *user_data)
+attr_in_string(xmlAttrPtr a, void *user_data)
{
bool filter = false;
char *name = crm_strdup_printf(" %s ", (const char *) a->name);
if (strstr((const char *) user_data, name) != NULL) {
crm_trace("Filtering %s (found in '%s')",
(const char *) a->name, (const char *) user_data);
filter = true;
}
free(name);
return filter;
}
#if ENABLE_VERSIONED_ATTRS
static void
append_versioned_params(xmlNode *versioned_params, const char *ra_version, xmlNode *params)
{
GHashTable *hash = pe_unpack_versioned_parameters(versioned_params, ra_version);
char *key = NULL;
char *value = NULL;
GHashTableIter iter;
g_hash_table_iter_init(&iter, hash);
while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
crm_xml_add(params, key, value);
}
g_hash_table_destroy(hash);
}
static void
append_all_versioned_params(pe_resource_t *rsc, pe_node_t *node,
pe_action_t *action, xmlNode *xml_op,
pe_working_set_t *data_set)
{
const char *ra_version = NULL;
xmlNode *local_versioned_params = NULL;
pe_rsc_action_details_t *details = pe_rsc_action_details(action);
local_versioned_params = create_xml_node(NULL, XML_TAG_RSC_VER_ATTRS);
pe_get_versioned_attributes(local_versioned_params, rsc, node, data_set);
if (xml_op != NULL) {
ra_version = crm_element_value(xml_op, XML_ATTR_RA_VERSION);
}
append_versioned_params(local_versioned_params, ra_version,
data->params_all);
append_versioned_params(rsc->versioned_parameters, ra_version,
data->params_all);
append_versioned_params(details->versioned_parameters, ra_version,
data->params_all);
}
#endif
/*!
* \internal
* \brief Add digest of all parameters to a digest cache entry
*
* \param[out] data Digest cache entry to modify
* \param[in] rsc Resource that action was for
* \param[in] node Node action was performed on
* \param[in] params Resource parameters evaluated for node
* \param[in] task Name of action performed
* \param[in,out] interval_ms Action's interval (will be reset if in overrides)
* \param[in] xml_op XML of operation in CIB status (if available)
* \param[in] op_version CRM feature set to use for digest calculation
* \param[in] overrides Key/value table to override resource parameters
* \param[in] data_set Cluster working set
*/
static void
calculate_main_digest(op_digest_cache_t *data, pe_resource_t *rsc,
pe_node_t *node, GHashTable *params,
const char *task, guint *interval_ms,
xmlNode *xml_op, const char *op_version,
GHashTable *overrides, pe_working_set_t *data_set)
{
pe_action_t *action = NULL;
data->params_all = create_xml_node(NULL, XML_TAG_PARAMS);
/* REMOTE_CONTAINER_HACK: Allow Pacemaker Remote nodes to run containers
* that themselves are Pacemaker Remote nodes
*/
if (pe__add_bundle_remote_name(rsc, data_set, data->params_all,
XML_RSC_ATTR_REMOTE_RA_ADDR)) {
crm_trace("Set address for bundle connection %s (on %s)",
rsc->id, node->details->uname);
}
// If interval was overridden, reset it
if (overrides != NULL) {
const char *interval_s = g_hash_table_lookup(overrides, CRM_META "_"
XML_LRM_ATTR_INTERVAL);
if (interval_s != NULL) {
long long value_ll;
if ((pcmk__scan_ll(interval_s, &value_ll, 0LL) == pcmk_rc_ok)
&& (value_ll >= 0) && (value_ll <= G_MAXUINT)) {
*interval_ms = (guint) value_ll;
}
}
}
action = custom_action(rsc, pcmk__op_key(rsc->id, task, *interval_ms),
task, node, TRUE, FALSE, data_set);
if (overrides != NULL) {
g_hash_table_foreach(overrides, hash2field, data->params_all);
}
g_hash_table_foreach(params, hash2field, data->params_all);
g_hash_table_foreach(action->extra, hash2field, data->params_all);
g_hash_table_foreach(action->meta, hash2metafield, data->params_all);
#if ENABLE_VERSIONED_ATTRS
append_all_versioned_params(rsc, node, action, xml_op, data_set);
#endif
pcmk__filter_op_for_digest(data->params_all);
+ /* For a non-recurring operation with extra parameters configured, enforce
+ * a restart rather than a reload-agent whenever the all-parameters digest
+ * does not match, even if the restart digest matches. This ensures that any
+ * changes to the extra parameters are applied for this specific operation,
+ * and that the digests calculated for the resulting lrm_rsc_op are correct.
+ * Flag the implied rc as RSC_DIGEST_RESTART here; it only takes effect if
+ * the all-parameters digest turns out not to match.
+ */
+ if (*interval_ms == 0
+ && g_hash_table_size(action->extra) > 0) {
+ data->rc = RSC_DIGEST_RESTART;
+ }
+
pe_free_action(action);
data->digest_all_calc = calculate_operation_digest(data->params_all,
op_version);
}
// Return true if XML attribute name is a Pacemaker-defined fencing parameter
static bool
is_fence_param(xmlAttrPtr attr, void *user_data)
{
return pcmk_stonith_param((const char *) attr->name);
}
/*!
* \internal
* \brief Add secure digest to a digest cache entry
*
* \param[out] data Digest cache entry to modify
* \param[in] rsc Resource that action was for
* \param[in] params Resource parameters evaluated for node
* \param[in] xml_op XML of operation in CIB status (if available)
* \param[in] op_version CRM feature set to use for digest calculation
* \param[in] overrides Key/value hash table to override resource parameters
*/
static void
calculate_secure_digest(op_digest_cache_t *data, pe_resource_t *rsc,
GHashTable *params, xmlNode *xml_op,
const char *op_version, GHashTable *overrides)
{
const char *class = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS);
const char *secure_list = NULL;
+ bool old_version = (compare_version(op_version, "3.16.0") < 0);
if (xml_op == NULL) {
secure_list = " passwd password user ";
} else {
secure_list = crm_element_value(xml_op, XML_LRM_ATTR_OP_SECURE);
}
- data->params_secure = create_xml_node(NULL, XML_TAG_PARAMS);
- if (overrides != NULL) {
- g_hash_table_foreach(overrides, hash2field, data->params_secure);
+ if (old_version) {
+ data->params_secure = create_xml_node(NULL, XML_TAG_PARAMS);
+ if (overrides != NULL) {
+ g_hash_table_foreach(overrides, hash2field, data->params_secure);
+ }
+
+ g_hash_table_foreach(params, hash2field, data->params_secure);
+
+ } else {
+ // Start with a copy of all parameters
+ data->params_secure = copy_xml(data->params_all);
}
- g_hash_table_foreach(params, hash2field, data->params_secure);
if (secure_list != NULL) {
- pcmk__xe_remove_matching_attrs(data->params_secure, attr_not_in_string,
+ pcmk__xe_remove_matching_attrs(data->params_secure, attr_in_string,
(void *) secure_list);
}
- if (pcmk_is_set(pcmk_get_ra_caps(class),
- pcmk_ra_cap_fence_params)) {
+ if (old_version
+ && pcmk_is_set(pcmk_get_ra_caps(class),
+ pcmk_ra_cap_fence_params)) {
/* For stonith resources, Pacemaker adds special parameters,
- * but these are not listed in fence agent meta-data, so the
- * controller will not hash them. That means we have to filter
- * them out before calculating our hash for comparison.
+ * but these are not listed in fence agent meta-data, so an older DC
+ * will not hash them. That means we have to filter them out before
+ * calculating our hash for comparison.
*/
pcmk__xe_remove_matching_attrs(data->params_secure, is_fence_param,
NULL);
}
pcmk__filter_op_for_digest(data->params_secure);
/* CRM_meta_timeout *should* be part of a digest for recurring operations.
- * However, currently the controller does not add timeout to secure digests,
- * because it only includes parameters declared by the resource agent.
+ * However, an older DC does not add the timeout to secure digests,
+ * because it only includes parameters declared by the resource agent.
* Remove any timeout that made it this far, to match.
- *
- * @TODO Update the controller to add the timeout (which will require
- * bumping the feature set and checking that here).
*/
- xml_remove_prop(data->params_secure, CRM_META "_" XML_ATTR_TIMEOUT);
+ if (old_version) {
+ xml_remove_prop(data->params_secure, CRM_META "_" XML_ATTR_TIMEOUT);
+ }
data->digest_secure_calc = calculate_operation_digest(data->params_secure,
op_version);
}
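/* A note on the version gate above (descriptive only): op_version is the CRM
 * feature set recorded with the operation, or a fallback taken from the CIB
 * or the local CRM_FEATURE_SET. Before feature set 3.16.0 the controller
 * hashed only agent-declared parameters, so the secure digest is rebuilt from
 * the raw parameters, and the fencing-specific parameters and CRM_meta_timeout
 * are stripped to match. From 3.16.0 on, the digest starts from params_all
 * (which already includes overrides, action extras, and meta-attributes) and
 * only the attributes named in the entry's XML_LRM_ATTR_OP_SECURE list are
 * removed.
 */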
/*!
* \internal
* \brief Add restart digest to a digest cache entry
*
* \param[out] data Digest cache entry to modify
* \param[in] xml_op XML of operation in CIB status (if available)
* \param[in] op_version CRM feature set to use for digest calculation
*
* \note This function doesn't need to handle overrides because it starts with
* data->params_all, which already has overrides applied.
*/
static void
calculate_restart_digest(op_digest_cache_t *data, xmlNode *xml_op,
const char *op_version)
{
const char *value = NULL;
// We must have XML of resource operation history
if (xml_op == NULL) {
return;
}
// And the history must have a restart digest to compare against
if (crm_element_value(xml_op, XML_LRM_ATTR_RESTART_DIGEST) == NULL) {
return;
}
// Start with a copy of all parameters
data->params_restart = copy_xml(data->params_all);
// Then filter out reloadable parameters, if any
value = crm_element_value(xml_op, XML_LRM_ATTR_OP_RESTART);
if (value != NULL) {
- pcmk__xe_remove_matching_attrs(data->params_restart, attr_in_string,
+ pcmk__xe_remove_matching_attrs(data->params_restart, attr_not_in_string,
(void *) value);
}
value = crm_element_value(xml_op, XML_ATTR_CRM_VERSION);
data->digest_restart_calc = calculate_operation_digest(data->params_restart,
value);
}
/*!
* \internal
* \brief Create a new digest cache entry with calculated digests
*
* \param[in] rsc Resource that action was for
* \param[in] task Name of action performed
* \param[in,out] interval_ms Action's interval (will be reset if in overrides)
* \param[in] node Node action was performed on
* \param[in] xml_op XML of operation in CIB status (if available)
* \param[in] overrides Key/value table to override resource parameters
* \param[in] calc_secure Whether to calculate secure digest
* \param[in] data_set Cluster working set
*
* \return Pointer to new digest cache entry (or NULL on memory error)
* \note It is the caller's responsibility to free the result using
* pe__free_digests().
*/
op_digest_cache_t *
pe__calculate_digests(pe_resource_t *rsc, const char *task, guint *interval_ms,
pe_node_t *node, xmlNode *xml_op, GHashTable *overrides,
bool calc_secure, pe_working_set_t *data_set)
{
op_digest_cache_t *data = calloc(1, sizeof(op_digest_cache_t));
- const char *op_version = CRM_FEATURE_SET;
+ const char *op_version = NULL;
GHashTable *params = NULL;
if (data == NULL) {
return NULL;
}
+
+ data->rc = RSC_DIGEST_MATCH;
+
if (xml_op != NULL) {
op_version = crm_element_value(xml_op, XML_ATTR_CRM_VERSION);
}
+ if (op_version == NULL && data_set != NULL && data_set->input != NULL) {
+ op_version = crm_element_value(data_set->input, XML_ATTR_CRM_VERSION);
+ }
+
+ if (op_version == NULL) {
+ op_version = CRM_FEATURE_SET;
+ }
+
params = pe_rsc_params(rsc, node, data_set);
calculate_main_digest(data, rsc, node, params, task, interval_ms, xml_op,
op_version, overrides, data_set);
if (calc_secure) {
calculate_secure_digest(data, rsc, params, xml_op, op_version,
overrides);
}
calculate_restart_digest(data, xml_op, op_version);
return data;
}
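/* Illustrative call sequence (a sketch; rsc, node, and data_set are assumed
 * to come from an unpacked working set, and no history XML is available):
 *
 *   guint interval_ms = 0;
 *   op_digest_cache_t *digests = pe__calculate_digests(rsc, RSC_START,
 *                                                      &interval_ms, node,
 *                                                      NULL, NULL, true,
 *                                                      data_set);
 *
 *   // digests->digest_all_calc and digests->digest_secure_calc are now
 *   // available; digests->digest_restart_calc stays NULL here because no
 *   // xml_op was supplied
 *   pe__free_digests(digests);
 */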
/*!
* \internal
* \brief Calculate action digests and store in node's digest cache
*
* \param[in] rsc Resource that action was for
* \param[in] task Name of action performed
* \param[in] interval_ms Action's interval
* \param[in] node Node action was performed on
* \param[in] xml_op XML of operation in CIB status (if available)
* \param[in] calc_secure Whether to calculate secure digest
* \param[in] data_set Cluster working set
*
* \return Pointer to node's digest cache entry
*/
static op_digest_cache_t *
rsc_action_digest(pe_resource_t *rsc, const char *task, guint interval_ms,
pe_node_t *node, xmlNode *xml_op, bool calc_secure,
pe_working_set_t *data_set)
{
op_digest_cache_t *data = NULL;
char *key = pcmk__op_key(rsc->id, task, interval_ms);
data = g_hash_table_lookup(node->details->digest_cache, key);
if (data == NULL) {
data = pe__calculate_digests(rsc, task, &interval_ms, node, xml_op,
NULL, calc_secure, data_set);
CRM_ASSERT(data != NULL);
g_hash_table_insert(node->details->digest_cache, strdup(key), data);
}
free(key);
return data;
}
/*!
* \internal
* \brief Calculate operation digests and compare against an XML history entry
*
* \param[in] rsc Resource to check
* \param[in] xml_op Resource history XML
* \param[in] node Node to use for digest calculation
* \param[in] data_set Cluster working set
*
* \return Pointer to node's digest cache entry, with comparison result set
*/
op_digest_cache_t *
rsc_action_digest_cmp(pe_resource_t * rsc, xmlNode * xml_op, pe_node_t * node,
pe_working_set_t * data_set)
{
op_digest_cache_t *data = NULL;
guint interval_ms = 0;
const char *op_version;
const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
const char *digest_all;
const char *digest_restart;
CRM_ASSERT(node != NULL);
op_version = crm_element_value(xml_op, XML_ATTR_CRM_VERSION);
digest_all = crm_element_value(xml_op, XML_LRM_ATTR_OP_DIGEST);
digest_restart = crm_element_value(xml_op, XML_LRM_ATTR_RESTART_DIGEST);
crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms);
data = rsc_action_digest(rsc, task, interval_ms, node, xml_op,
pcmk_is_set(data_set->flags, pe_flag_sanitized),
data_set);
- data->rc = RSC_DIGEST_MATCH;
if (digest_restart && data->digest_restart_calc && strcmp(data->digest_restart_calc, digest_restart) != 0) {
pe_rsc_info(rsc, "Parameters to %ums-interval %s action for %s on %s "
"changed: hash was %s vs. now %s (restart:%s) %s",
interval_ms, task, rsc->id, node->details->uname,
pcmk__s(digest_restart, "missing"),
data->digest_restart_calc,
op_version,
crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC));
data->rc = RSC_DIGEST_RESTART;
} else if (digest_all == NULL) {
/* it is unknown what the previous op digest was */
data->rc = RSC_DIGEST_UNKNOWN;
} else if (strcmp(digest_all, data->digest_all_calc) != 0) {
- pe_rsc_info(rsc, "Parameters to %ums-interval %s action for %s on %s "
- "changed: hash was %s vs. now %s (%s:%s) %s",
- interval_ms, task, rsc->id, node->details->uname,
- pcmk__s(digest_all, "missing"), data->digest_all_calc,
- (interval_ms > 0)? "reschedule" : "reload",
- op_version,
- crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC));
- data->rc = RSC_DIGEST_ALL;
+ /* For a non-recurring operation with extra parameters configured, enforce
+ * a restart rather than a reload-agent when the all-parameters digest does
+ * not match, even if the restart digest matches. This ensures that any
+ * changes to the extra parameters are applied for this specific operation,
+ * and that the digests calculated for the resulting lrm_rsc_op are correct.
+ * Preserve the RSC_DIGEST_RESTART rc implied by calculate_main_digest()
+ * in this case.
+ */
+ if (interval_ms == 0
+ && data->rc == RSC_DIGEST_RESTART) {
+ pe_rsc_info(rsc, "Parameters containing extra ones to %ums-interval"
+ " %s action for %s on %s "
+ "changed: hash was %s vs. now %s (restart:%s) %s",
+ interval_ms, task, rsc->id, node->details->uname,
+ pcmk__s(digest_all, "missing"), data->digest_all_calc,
+ op_version,
+ crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC));
+
+ } else {
+ pe_rsc_info(rsc, "Parameters to %ums-interval %s action for %s on %s "
+ "changed: hash was %s vs. now %s (%s:%s) %s",
+ interval_ms, task, rsc->id, node->details->uname,
+ pcmk__s(digest_all, "missing"), data->digest_all_calc,
+ (interval_ms > 0)? "reschedule" : "reload",
+ op_version,
+ crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC));
+ data->rc = RSC_DIGEST_ALL;
+ }
+
+ } else {
+ data->rc = RSC_DIGEST_MATCH;
}
return data;
}
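/* Descriptive summary of the comparison result above: data->rc ends up as
 * RSC_DIGEST_RESTART if the recorded restart digest no longer matches, or
 * (new with this change) if the all-parameters digest of a non-recurring
 * operation with extra parameters no longer matches; RSC_DIGEST_UNKNOWN if
 * the history entry has no all-parameters digest to compare against;
 * RSC_DIGEST_ALL if only the all-parameters digest differs; and
 * RSC_DIGEST_MATCH otherwise.
 */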
/*!
* \internal
* \brief Create an unfencing summary for use in special node attribute
*
* Create a string combining a fence device's resource ID, agent type, and
* parameter digest (whether for all parameters or just non-private parameters).
* This can be stored in a special node attribute, allowing us to detect changes
* in either the agent type or parameters, to know whether unfencing must be
* redone or can be safely skipped when the device's history is cleaned.
*
* \param[in] rsc_id Fence device resource ID
* \param[in] agent_type Fence device agent
* \param[in] param_digest Fence device parameter digest
*
* \return Newly allocated string with unfencing digest
* \note The caller is responsible for freeing the result.
*/
static inline char *
create_unfencing_summary(const char *rsc_id, const char *agent_type,
const char *param_digest)
{
return crm_strdup_printf("%s:%s:%s", rsc_id, agent_type, param_digest);
}
/*!
* \internal
* \brief Check whether a node can skip unfencing
*
* Check whether a fence device's current definition matches a node's
* stored summary of when it was last unfenced by the device.
*
* \param[in] rsc_id Fence device's resource ID
* \param[in] agent Fence device's agent type
* \param[in] digest_calc Fence device's current parameter digest
* \param[in] node_summary Value of node's special unfencing node attribute
* (a comma-separated list of unfencing summaries for
* all devices that have unfenced this node)
*
* \return TRUE if digest matches, FALSE otherwise
*/
static bool
unfencing_digest_matches(const char *rsc_id, const char *agent,
const char *digest_calc, const char *node_summary)
{
bool matches = FALSE;
if (rsc_id && agent && digest_calc && node_summary) {
char *search_secure = create_unfencing_summary(rsc_id, agent,
digest_calc);
/* The digest was calculated including the device ID and agent,
* so there is no risk of collision using strstr().
*/
matches = (strstr(node_summary, search_secure) != NULL);
crm_trace("Calculated unfencing digest '%s' %sfound in '%s'",
search_secure, matches? "" : "not ", node_summary);
free(search_secure);
}
return matches;
}
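/* Illustrative value of the unfencing node attribute (hypothetical device IDs
 * and agents, digests elided): a comma-separated list with one summary per
 * device that has unfenced the node, each built by create_unfencing_summary():
 *
 *   "fence-ipmi-node1:fence_ipmilan:<digest1>,fence-sbd:fence_sbd:<digest2>"
 */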
/* Magic string to use as action name for digest cache entries used for
* unfencing checks. This is not a real action name (i.e. "on"), so
* pcmk__check_action_config() won't confuse these entries with real actions.
*/
#define STONITH_DIGEST_TASK "stonith-on"
/*!
* \internal
* \brief Calculate fence device digests and digest comparison result
*
* \param[in] rsc Fence device resource
* \param[in] agent Fence device's agent type
* \param[in] node Node with digest cache to use
* \param[in] data_set Cluster working set
*
* \return Node's digest cache entry
*/
op_digest_cache_t *
pe__compare_fencing_digest(pe_resource_t *rsc, const char *agent,
pe_node_t *node, pe_working_set_t *data_set)
{
const char *node_summary = NULL;
// Calculate device's current parameter digests
op_digest_cache_t *data = rsc_action_digest(rsc, STONITH_DIGEST_TASK, 0U,
node, NULL, TRUE, data_set);
// Check whether node has special unfencing summary node attribute
node_summary = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_ALL);
if (node_summary == NULL) {
data->rc = RSC_DIGEST_UNKNOWN;
return data;
}
// Check whether full parameter digest matches
if (unfencing_digest_matches(rsc->id, agent, data->digest_all_calc,
node_summary)) {
data->rc = RSC_DIGEST_MATCH;
return data;
}
// Check whether secure parameter digest matches
node_summary = pe_node_attribute_raw(node, CRM_ATTR_DIGESTS_SECURE);
if (unfencing_digest_matches(rsc->id, agent, data->digest_secure_calc,
node_summary)) {
data->rc = RSC_DIGEST_MATCH;
if (!pcmk__is_daemon && data_set->priv != NULL) {
pcmk__output_t *out = data_set->priv;
out->info(out, "Only 'private' parameters to %s "
"for unfencing %s changed", rsc->id,
node->details->uname);
}
return data;
}
// Parameters don't match
data->rc = RSC_DIGEST_ALL;
if (pcmk_is_set(data_set->flags, pe_flag_sanitized) && data->digest_secure_calc) {
if (data_set->priv != NULL) {
pcmk__output_t *out = data_set->priv;
char *digest = create_unfencing_summary(rsc->id, agent,
data->digest_secure_calc);
out->info(out, "Parameters to %s for unfencing "
"%s changed, try '%s'", rsc->id,
node->details->uname, digest);
free(digest);
} else if (!pcmk__is_daemon) {
char *digest = create_unfencing_summary(rsc->id, agent,
data->digest_secure_calc);
printf("Parameters to %s for unfencing %s changed, try '%s'\n",
rsc->id, node->details->uname, digest);
free(digest);
}
}
return data;
}