diff --git a/BasicSanity.sh b/BasicSanity.sh index faf65f3245..d00f337e64 100755 --- a/BasicSanity.sh +++ b/BasicSanity.sh @@ -1,97 +1,107 @@ #!/bin/bash test_home=`dirname $0` valgrind="" verbose="" tests="" if [ "$test_home" = "." ]; then test_home="$PWD" fi function info() { printf "$*\n" } function error() { printf " * ERROR: $*\n" } +function run_as_root() { + CMD="$1" + shift + ARGS="$@" + + # Test might not be executable if run from source directory + chmod a+x $CMD + + CMD="$CMD $ARGS $verbose" + + if [ $EUID -eq 0 ]; then + $CMD + + elif [ -z $TRAVIS ]; then + # sudo doesn't work in builtbot, su doesn't work in travis + echo "Enter the root password..." + su root -c "$CMD" + + else + echo "Enter the root password if prompted..." + sudo -- $CMD + fi +} + info "Test home is:\t$test_home" while true ; do case "$1" in - all) tests="pengine cli lrmd fencing"; shift;; + all) tests="$tests pengine cli lrmd fencing"; shift;; pengine|lrmd|pacemaker_remote|fencing|cli) tests="$tests $1"; shift;; -V|--verbose) verbose="-V"; shift;; -v|--valgrind) valgrind="-v"; shift;; --) shift ; break ;; "") break;; *) echo "unknown option: $1"; exit 1;; esac done if [ -z "$tests" ]; then tests="pengine cli lrmd" fi failed="" for t in $tests; do info "Executing the $t regression tests" info "============================================================" if [ -e $test_home/$t/regression.py ]; then - # Fencing, lrmd need root access - chmod a+x $test_home/$t/regression.py - echo "Enter the root password..." - # sudo doesn't work in builtbot, su doesn't work in travis - if [ x$TRAVIS = x ]; then - su root -c "$test_home/$t/regression.py $verbose" - else - sudo -- $test_home/$t/regression.py $verbose - fi + # fencing, lrmd need root access + run_as_root $test_home/$t/regression.py rc=$? elif [ $t == "pacemaker_remote" ] && [ -e $test_home/lrmd/regression.py ]; then # pacemaker_remote - chmod a+x $test_home/lrmd/regression.py - echo "Enter the root password..." - # sudo doesn't work in builtbot, su doesn't work in travis - if [ x$TRAVIS = x ]; then - su root -c "$test_home/$t/regression.py -R $verbose" - else - sudo -- $test_home/$t/regression.py -R $verbose - fi + run_as_root $test_home/lrmd/regression.py -R rc=$? - elif [ -e $test_home/$t ]; then + elif [ -e $test_home/$t/regression.sh ]; then # pengine, cli $test_home/$t/regression.sh $verbose $valgrind rc=$? elif [ $t = cli -a -e $test_home/tools ]; then - # Running cli tests from the source tree + # cli when run from the source tree t=tools $test_home/$t/regression.sh $verbose $valgrind rc=$? else error "Cannot find $t test in $test_home" rc=1 fi if [ $rc != 0 ]; then info "$t regression tests failed: $rc" failed="$failed $t" fi info "============================================================" info "" info "" done if [ -z "$failed" ]; then exit 0 fi error "regression tests for $failed failed" exit 1 diff --git a/ChangeLog b/ChangeLog index cb4307d8f1..03aa1932ad 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,2308 +1,2318 @@ +* Thu Jan 07 2016 Ken Gaillot Pacemaker-1.1.14-rc5-1 +- Update source tarball to revision: 3453803 +- Changesets: 22 +- Diff: 25 files changed, 80 insertions(+), 80 deletions(-) + +- Changes since Pacemaker-1.1.14-rc4 + + update SNMP MIB + + fix various misspellings in log messages + + fencing: use multiple topology levels properly + * Wed Dec 23 2015 Ken Gaillot Pacemaker-1.1.14-rc4-1 - Update source tarball to revision: 5f8a298 - Changesets: 18 - Diff: 27 files changed, 56 insertions(+), 53 deletions(-) - Changes since Pacemaker-1.1.14-rc3 + improved support for building on FreeBSD + libcrmcommon: when caching attrd connection, cache connection flags too * Mon Dec 14 2015 Ken Gaillot Pacemaker-1.1.14-rc3-1 - Update source tarball to revision: 97ada31 - Changesets: 7 - Diff: 2 files changed, 7 insertions(+), 2 deletions(-) - Changes since Pacemaker-1.1.14-rc2 + stonithd: fix issue where deleting a fence device attribute can delete the device * Tue Dec 08 2015 Ken Gaillot Pacemaker-1.1.14-rc2-1 - Update source tarball to revision: ce830a7 - Changesets: 1 - Diff: 1 file changed, 2 insertions(+) - Changes since Pacemaker-1.1.14-rc1 + crmd: ensure compilation works with built-in notifications disabled * Tue Dec 08 2015 Ken Gaillot Pacemaker-1.1.14-rc1-1 - Update source tarball to revision: 7cd6dcf - Changesets: 656 - Diff: 169 files changed, 13014 insertions(+), 7579 deletions(-) - Features added since Pacemaker-1.1.13 + crm_resource: Indicate common reasons why a resource may not start after a cleanup + crm_resource: New --force-promote and --force-demote options for debugging + fencing: Support targeting fencing topologies by node name pattern or node attribute + fencing: Remap sequential topology reboots to all-off-then-all-on + pengine: Allow guest remote nodes using containers/vms to be nested in a group resource + pengine: Allow resources to start and stop as soon as their state is known on all nodes + pengine: Include a list of all and available nodes with clone notifications + pengine: Addition of the clone resource clone-min metadata option + pengine: Support of multiple-active=block for resource groups + remote: reconnect_interval option for remote nodes to delay reconnect after fence - Changes since Pacemaker-1.1.13 + fix multiple memory issues (leaks, use-after-free, double free, use-of-NULL) in components and tools + cib: Do not terminate due to badly behaving clients + cman: handle corosync-invented node names of the form Node{id} for peers not in its node list + controld: replace bashism + crm_node: Display node state with -l and quorum status with -q, if available + crmd: resources would sometimes be restarted when only non-unique parameters changed + crmd: fence remote node after connection failure only once + crmd: handle resources named the same as cluster nodes + crmd: Pre-emptively fail in-flight actions when lrmd connections fail + crmd: Record actions in the CIB as failed if we cannot execute them + crm_report: Enable password sanitizing by default + crm_report: Allow log file discovery to be disabled + crm_resource: Allow the resource configuration to be modified for --force-{check,start,..} calls + crm_resource: Compensate for -C and -p being called with the child resource for clones + crm_resource: Correctly clean up all children for anonymous cloned groups + crm_resource: Correctly clean up failcounts for inactive anonymous clones + crm_resource: Correctly observe --force when deleting and updating attributes + crm_shadow: Fix "crm_shadow --diff" + crm_simulate: Prevent segfault on arches with 64bit time_t + fencing: ensure "required"/"automatic" only apply to "on" actions + fencing: Return a provider for the internal fencing agent "#watchdog" instead of logging an error + fencing: ignore stderr output of fence agents (often used for debug messages) + libcib: potential user input overflow + libcluster: overhaul peer cache management + log: make syslog less noisy + lrmd: cancel currently pending STONITH op if stonithd connection is lost + lrmd: Finalize all pending and recurring operations when cleaning up a resource + pengine: Bug cl#5247 - Imply resources running on a container are stopped when the container is stopped + pengine: cl#5235 - Prevent graph loops that can be introduced by "load_stopped -> migrate_to" ordering + pengine: Correctly bypass fencing for resources that do not require it + pengine: do not timeout remote node recurring monitor op failure until after fencing + pengine: Ensure recurring monitor operations are cancelled when clone instances are de-allocated + pengine: fixes segfault in pengine when fencing remote node + pengine: properly handle blocked clone actions + pengine: ensure failed actions that occurred in node shutdown are displayed + remote: Correctly display the usage of the ocf:pacemaker:remote resource agent + remote: do not fail operations because of a migration + remote: enable reloads for select remote connection options + resources: allow for top output with or without percent sign in HealthCPU + resources: Prevent an error message on stopping "Dummy" resource + systemd: Prevent segfault when logging failed operations + systemd: Reconnect to System DBus if the connection is closed + systemd: set systemd resources' timeout values higher than systemd's own default + tools: Do not send command lines to syslog + upstart: Ensure pending structs are correctly unreferenced * Wed Jun 24 2015 Andrew Beekhof Pacemaker-1.1.13-1 - Update source tarball to revision: 2a1847e - Changesets: 750 - Diff: 156 files changed, 11323 insertions(+), 3725 deletions(-) - Features added since Pacemaker-1.1.12 + Allow fail-counts to be removed en-mass when the new attrd is in operation + attrd supports private attributes (not written to CIB) + crmd: Ensure a watchdog device is in use if stonith-watchdog-timeout is configured + crmd: If configured, trigger the watchdog immediately if we loose quorum and no-quorum-policy=suicide + crm_diff: Support generating a difference without versions details if --no-version/-u is supplied + crm_resource: Implement an intelligent restart capability + Fencing: Advertise the watchdog device for fencing operations + Fencing: Allow the cluster to recover resources if the watchdog is in use + fencing: cl#5134 - Support random fencing delay to avoid double fencing + mcp: Allow orphan children to initiate node panic via SIGQUIT + mcp: Turn on sbd integration if pacemakerd finds it running + mcp: Two new error codes that result in machine reset or power off + Officially support the resource-discovery attribute for location constraints + PE: Allow natural ordering of colocation sets + PE: Support non-actionable degraded mode for OCF + pengine: cl#5207 - Display "UNCLEAN" for resources running on unclean offline nodes + remote: pcmk remote client tool for use with container wrapper script + Support machine panics for some kinds of errors (via sbd if available) + tools: add crm_resource --wait option + tools: attrd_updater supports --query and --all options + tools: attrd_updater: Allow attributes to be set for other nodes - Changes since Pacemaker-1.1.12 + pengine: exclusive discovery implies rsc is only allowed on exclusive subset of nodes + acl: Correctly implement the 'reference' acl directive + acl: Do not delay evaluation of added nodes in some situations + attrd: b22b1fe did uuid test too early + attrd: Clean out the node cache when requested by the admin + attrd: fixes double free in attrd legacy + attrd: properly write attributes for peers once uuid is discovered + attrd: refresh should force an immediate write-out of all attributes + attrd: Simplify how node deletions happen + Bug rhbz#1067544 - Tools: Correctly handle --ban, --move and --locate for master/slave groups + Bug rhbz#1181824 - Ensure the DC can be reliably fenced + cib: Ability to upgrade cib validation schema in legacy mode + cib: Always generate digests for cib diffs in legacy mode + cib: assignment where comparison intended + cib: Avoid nodeid conflicts we don't care about + cib: Correctly add "update-origin", "update-client" and "update-user" attributes for cib + cib: Correctly set up signal handlers + cib: Correctly track node state + cib: Do not update on disk backups if we're just querying them + cib: Enable cib legacy mode for plugin-based clusters + cib: Ensure file-based backends treat '-o section' consistently with the native backend + cib: Ensure upgrade operations from a non-DC get an acknowledgement + cib: No need to enforce cib digests for v2 diffs in legacy mode + cib: Revert d153b86 to instantly get cib synchronized in legacy mode + cib: tls sock cleanup for remote cib connections + cli: Ensure subsequent unknown long options are correctly detected + cluster: Invoke crm_remove_conflicting_peer() only when the new node's uname is being assigned in the node cache + common: Increment current and age for lib common as a result of APIs being added + corosync: Bug cl#5232 - Somewhat gracefully handle nodes with invalid UUIDs + corosync: Avoid unnecessary repeated CMAP API calls + crmd/pengine: handle on-fail=ignore properly + crmd: Add "on_node" attribute for *_last_failure_0 lrm resource operations + crmd: All peers need to track node shutdown requests + crmd: Cached copies of transient attributes cease to be valid once a node leaves the membership + crmd: Correctly add the local option that validates against schema for pengine to calculate + crmd: Disable debug logging that results in significant overhead + crmd: do not remove connection resources during re-probe + crmd: don't update fail count twice for same failure + crmd: Ensure remote connection resources timeout properly during 'migrate_from' action + crmd: Ensure throttle_mode() does something on Linux + crmd: Fixes crash when remote connection migration fails + crmd: gracefully handle remote node disconnects during op execution + crmd: Handle remote connection failures while executing ops on remote connection + crmd: include remote nodes when forcing cluster wide resource reprobe + crmd: never stop recurring monitor ops for pcmk remote during incomplete migration + crmd: Prevent the old version of DC from being fenced when it shuts down for rolling-upgrade + crmd: Prevent use-of-NULL during reprobe + crmd: properly update job limit for baremetal remote-nodes + crmd: Remote-node throttle jobs count towards cluster-node hosting conneciton rsc + crmd: Reset stonith failcount to recover transitioner when the node rejoins + crmd: resolves memory leak in crmd. + crmd: respect start-failure-is-fatal even for artifically injected events + crmd: Wait for all pending operations to complete before poking the policy engine + crmd: When container's host is fenced, cancel in-flight operations + crm_attribute: Correctly update config options when -o crm_config is specified + crm_failcount: Better error reporting when no resource is specified + crm_mon: add exit reason to resource failure output + crm_mon: Fill CRM_notify_node in traps with node's uname rather than node's id if possible + crm_mon: Repair notification delivery when the v2 patch format is in use + crm_node: Correctly remove nodes from the CIB by nodeid + crm_report: More patterns for finding logs on non-DC nodes + crm_resource: Allow resource restart operations to be node specific + crm_resource: avoid deletion of lrm cache on node with resource discovery disabled. + crm_resource: Calculate how long to wait for a restart based on the resource timeouts + crm_resource: Clean up memory in --restart error paths + crm_resource: Display the locations of all anonymous clone children when supplying the children's common ID + crm_resource: Ensure --restart sets/clears meta attributes + crm_resource: Ensure fail-counts are purged when we redetect the state of all resources + crm_resource: Implement --timeout for resource restart operations + crm_resource: Include group members when calculating the next timeout + crm_resource: Memory leak in error paths + crm_resource: Prevent use-after-free + crm_resource: Repair regression test outputs + crm_resource: Use-after-free when restarting a resource + dbus: ref count leaks + dbus: Ensure both the read and write queues get dispatched + dbus: Fail gracefully if malloc fails + dbus: handle dispatch queue when multiple replies need to be processed + dbus: Notice when dbus connections get disabled + dbus: Remove double-free introduced while trying to make coverity shut up + ensure if B is colocated with A, B can never run without A + fence_legacy: Avoid passing 'port' to cluster-glue agents + fencing: Allow nodes to be purged from the member cache + fencing: Correctly make args for fencing agents + fencing: Correctly wait for self-fencing to occur when the watchdog is in use + fencing: Ensure the hostlist parameter is set for watchdog agents + fencing: Force 'stonith-ng' as the system name + fencing: Gracefully handle invalid metadata from agents + fencing: If configured, wait stonith-watchdog-timer seconds for self-fencing to complete + fencing: Reject actions for devices that haven't been explicitly registered yet + ipc: properly allocate server enforced buffer size on client + ipc: use server enforced buffer during ipc client send + lrmd, services: interpret LSB status codes properly + lrmd: add back support for class heartbeat agents + lrmd: cancel pending async connection during disconnect + lrmd: enable ipc proxy for docker-wrapper privileged mode + lrmd: fix rescheduling of systemd monitor op during start + lrmd: Handle systemd reporting 'done' before a resource is actually stopped + lrmd: Hint to child processes that using sd_notify is not required + lrmd: Log with the correct personality + lrmd: Prevent glib assert triggered by timers being removed from mainloop more than once + lrmd: report original timeout when systemd operation completes + lrmd: store failed operation exit reason in cib + mainloop: resolves race condition mainloop poll involving modification of ipc connections + make targetted reprobe for remote node work, crm_resource -C -N + mcp: Allow a configurable delay when debugging shutdown issues + mcp: Avoid requiring 'export' for SYS-V sysconfig options + Membership: Detect and resolve nodes that change their ID + pacemakerd: resolves memory leak of xml structure in pacemakerd + pengine: ability to launch resources in isolated containers + pengine: add #kind=remote for baremetal remote-nodes + pengine: allow baremetal remote-nodes to recover without requiring fencing when cluster-node fails + pengine: allow remote-nodes to be placed in maintenance mode + pengine: Avoid trailing whitespaces when printing resource state + pengine: cl#5130 - Choose nodes capable of running all the colocated utilization resources + pengine: cl#5130 - Only check the capacities of the nodes that are allowed to run the resource + pengine: Correctly compare feature set to determine how to unpack meta attributes + pengine: disable migrations for resources with isolation containers + pengine: disable reloading of resources within isolated container wrappers + pengine: Do not aggregate children in a pending state into the started/stopped/etc lists + pengine: Do not record duplicate copies of the failed actions + pengine: Do not reschedule monitors that are no longer needed while resource definitions have changed + pengine: Fence baremetal remote when recurring monitor op fails + pengine: Fix colocation with unmanaged resources + pengine: Fix the behaviors of multi-state resources with asymmetrical ordering + pengine: fixes pengine crash with orphaned remote node connection resource + pengine: fixes segfault caused by malformed log warning + pengine: handle cloned isolated resources in a sane way + pengine: handle isolated resource scenario, cloned group of isolated resources + pengine: Handle ordering between stateful and migratable resources + pengine: imply stop in container node resources when host node is fenced + pengine: only fence baremetal remote when connection can fails or can not be recovered + pengine: only kill process group on timeout when on-fail does not equal block. + pengine: per-node control over resource discovery + pengine: prefer migration target for remote node connections + pengine: prevent disabling rsc discovery per node in certain situations + pengine: Prevent use-after-free in sort_rsc_process_order() + pengine: properly handle ordering during remote connection partial migration + pengine: properly recover remote-nodes when cluster-node proxy goes offline + pengine: remove unnecessary whitespace from notify environment variables + pengine: require-all feature for ordered clones + pengine: Resolve memory leaks + pengine: resource discovery mode for location constraints + pengine: restart master instances on instance attribute changes + pengine: Turn off legacy unpacking of resource options into the meta hashtable + pengine: Watchdog integration is sufficient for fencing + Perform systemd reloads asynchronously + ping: Correctly advertise multiplier default + Prefer to inherit the watchdog timeout from SBD + properly record stop args after reload + provide fake meta data for ra class heartbeat + remote: report timestamps for remote connection resource operations + remote: Treat recv msg timeout as a disconnect + service: Prevent potential use-of-NULL in metadata lookups + solaris: Allow compilation when dirent.d_type is not available + solaris: Correctly replace the linux swab functions + solaris: Disable throttling since /proc doesn't exist + stonith-ng: Correctly observe the watchdog completion timeout + stonith-ng: Correctly track node state + stonith-ng: Reset mainloop source IDs after removing them + systemd: Correctly handle long running stop actions + systemd: Ensure failed monitor operations always return + systemd: Ensure we don't call dbus_message_unref() with NULL + systemd: fix crash caused when canceling in-flight operation + systemd: Kindly ask dbus NOT to kill the process if the dbus connection fails + systemd: Perform actions asynchronously + systemd: Perform monitor operations without blocking + systemd: Tell systemd not to take DBus down from underneath us + systemd: Trick systemd into not stopping our services before us during shutdown + tools: Improve crm_mon output with certain option combinations + upstart: Monitor actions always return 'ok' or 'not running' + upstart: Perform more parts of monitor operations without blocking + xml: add 'require-all' to xml schema for constraints + xml: cl#5231 - Unset the deleted attributes in the resulting diffs + xml: Clone the latest constraint schema in preparation for changes" + xml: Correctly create v1 patchsets when deleting attributes + xml: Do not change the ordering of properties when applying v1 cib diffs + xml: Do not dump deleted attributes + xml: Do not prune leaves from v1 cib diffs that are being created with digests + xml: Ensure ACLs are reapplied before calculating what a replace operation changed + xml: Fix upgrade-1.3.xsl to correctly transform ACL rules with "attribute" + xml: Prevent assert errors in crm_element_value() on applying a patch without version information + xml: Prevent potential use-of-NULL * Tue Jul 22 2014 Andrew Beekhof Pacemaker-1.1.12-1 - Update source tarball to revision: 93a037d - Changesets: 795 - Diff: 195 files changed, 13772 insertions(+), 6176 deletions(-) - Features added since Pacemaker-1.1.11 + Changes to the ACL schema to support nodes and unix groups + cib: Check ACLs prior to making the update instead of parsing the diff afterwards + cib: Default ACL support to on + cib: Enable the more efficient xml patchset format + cib: Implement zero-copy status update + cib: Send all r/w operations via the cluster connection and have all nodes process them + crmd: Set "cluster-name" property to corosync's "cluster_name" by default for corosync-2 + crm_mon: Display brief output if "-b/--brief" is supplied or 'b' is toggled + crm_report: Allow ssh alternatives to be used + crm_ticket: Support multiple modifications for a ticket in an atomic operation + extra: Add logrotate configuration file for /var/log/pacemaker.log + Fencing: Add the ability to call stonith_api_time() from stonith_admin + logging: daemons always get a log file, unless explicitly set to configured 'none' + logging: allows the user to specify a log level that is output to syslog + PE: Automatically re-unfence a node if the fencing device definition changes + pengine: cl#5174 - Allow resource sets and templates for location constraints + pengine: Support cib object tags + pengine: Support cluster-specific instance attributes based on rules + pengine: Support id-ref in nvpair with optional "name" + pengine: Support per-resource maintenance mode + pengine: Support site-specific instance attributes based on rules + tools: Allow crm_shadow to create older configuration versions + tools: Display pending state in crm_mon/crm_resource/crm_simulate if --pending/-j is supplied (cl#5178) + xml: Add the ability to have lightweight schema revisions + xml: Enable resource sets in location constraints for 1.2 schema + xml: Support resources that require unfencing - Changes since Pacemaker-1.1.11 + acl: Authenticate pacemaker-remote requests with the node name as the client + acl: Read access must be explicitly granted + attrd: Ensure attribute dampening is always observed + attrd: Remove offline nodes from node cache for "peer-remove" requests + Bug cl#5055 - Improved migration support. + Bug cl#5184 - Ensure pending probes that ultimately fail are correctly updated + Bug cl#5196 - pengine: Check values after expanding templates + Bug cl#5212 - Do not promote instances when quorum is lots and no-quorum-policy=freeze + Bug cl#5213 - Ensure role colocation with -INFINITY is enforced + Bug cl#5213 - Limit the scope of the previous commit to the masters role + Bug cl#5219 - pengine: Allow unrelated resources with a common colocation target to remain promoted + Bug cl#5222 - cib: Repair rolling update capability + Bug cl#5222 - Enable legacy mode whenever a broadcast update is detected + Bug rhbz#1036631 - Stop members of cloned groups when dependencies are stopped + Bug rhbz#1054307 - cname pattern match should be more restrictive in init script + Bug rhbz#1057697 - Use native DBus library for systemd/upstart support to avoid problematic use of threads + Bug rhbz#1097457 - Limit the scope of the previous fix and include a helpful comment + Bug rhbz#1097457 - Prevent invalid transition when resource are ordered to start after the container they're started in + cib: allow setting permanent remote-node attributes + cib: Auto-detect which patchset format to use + cib: Determine the best value of validate-with if one is not supplied + cib: Do not disable cib disk writes if on-disk cib is corrupt + cib: Ensure 'cibadmin -R/--replace' commands get replies + cib: Erasing the cib is an admin action, bump the admin_epoch instead + cib: Fix remote cib based on TLS + cib: Ingore patch failures if we already have their contents + cib: Validate that everyone still sees the same configuration once all updates have completed + cibadmin: Allow priviliged clients to perform tasks as unpriviliged users + cibadmin: Remove dangerous commands that exposed unnecessary implementation internal details + cluster: Fix segfault on removing a node + cluster: Prevent search of unames from attempting to create node entries for unknown nodes + cluster: Remove unknown offline nodes with conflicting unames from node cache + controld: Do not consider the dlm up until the address list is present + controld: handling startup fencing within the controld agent, not the dlm + controld: Return OCF_ERR_INSTALLED instead of OCF_NOT_INSTALLED + crmd: Ack pending operations that were cancelled due to rsc deletion + crmd: Actions can only be executed if their pre-requisits completed successfully + crmd: avoid double free caused by nested hash table removal + crmd: Avoid spamming the cib by triggering a transition only once per non-status change + crmd: Correctly react to successful unfencing operations + crmd: Correctly recognise operation cancellations we initiated + crmd: Do not erase the status section for unfenced nodes + crmd: Do not overwrite existing node state when fencing completes + crmd: Do not start timers for already completed operations + crmd: Ensure crm_config options are re-read on updates + crmd: Fenced nodes that return prior to an election do not need to have their status section reset + crmd: make lrm_state hash table not case sensitive + crmd: make node_state erase correctly + crmd: Only write fence_averride if open() returns a positive file descriptor + crmd: Prevent manual fencing confirmations from attempting to create node entries for unknown nodes + crmd: Prevent SIGPIPE when notifying CMAN about fencing operations + crmd: Remove state of unknown nodes with conflicting unames from CIB + crmd: Remove unknown nodes with conflicting unames from CIB + crmd: Report unsuccessful unfencing operations + crm_diff: Allow the generation of xml patchsets without digests + crm_mon: Allow the file created by --as-html to be world readable + crm_mon: Ensure resource attributes have been unpacked before displaying connectivity data + crm_node: Only remove the named resource from the cib + crm_report: Gracefully handle rediculously large logfiles + crm_report: Only gather dlm data if dlm_controld is running + crm_resource: Gracefully handle -EACCESS when querying the cib + crm_verify: Perform a full set of calculations whenever the status section is present + fencing: Advertise support for reboot/on/off in the metadata for legacy agents + fencing: Automatically switch from 'list' to 'status' to 'static-list' if those actions are not advertised in the metadata + fencing: Cache metadata lookups to avoid repeated blocking during device registration + fencing: Correctly record which peer performed the fencing operation + fencing: default to 'off' when agent does not advertise 'reboot' in metadata + fencing: Do not unregister/register all stonith devices on every resource agent change + fencing: Execute all required fencing devices regardless of what topology level they are at + fencing: Fence using all required devices + fencing: Pass the correct options when looking up the history by node name + fencing: Update stonith device list only if stonith is enabled + get_cluster_type: failing concurrent tool invocations on heartbeat + ignore SIGPIPE when gnutls is in use + iso8601: Different logic is needed when logging and calculating durations + iso8601: Fix memory leak in duration calculation + Logging: Bootstrap daemon logging before processing arguments but configure it afterwards + lrmd: Cancel recurring operations before stop action is executed + lrmd: Expose logging variables expected by OCF agents + lrmd: Handle systemd reporting 'done' before a resource is actually stopped/started + lrmd: Merge duplicate recurring monitor operations + lrmd: Prevent OCF agents from logging to random files due to "value" of setenv() being NULL + lrmd: Provide stderr output from agents if available, otherwise fall back to stdout + mainloop: Better handle the killing of processes in the act of exiting + mainloop: Canceling in-flight operations should not fail if child process has already exited. + mainloop: Fixes use after free in process monitor code + mcp: Tell systemd not to respawn us if we exit with rc=100 + membership: Avoid duplicate peer entries in the peer cache + pengine: Allow container nodes to migrate with connection resource + pengine: avoid assert by searching for stop action on correct node during LogActions + pengine: Block restart of resources if any dependent resource in a group is unmanaged + pengine: cl#5186 - Avoid running rsc on two nodes when node is fenced during migration + pengine: cl#5187 - Prevent resources in an anti-colocation from even temporarily running on a same node + pengine: cl#5200 - Before migrating utilization-using resources to a node, take off the load that will no longer run there if it's not introducing transition loop + pengine: Correctly handle origin offsets in the future + pengine: Correctly observe requires=nothing + pengine: Default sequential to TRUE for resource sets for consistency with colocation sets + pengine: Delay unfencing until after we know the state of all resources that require unfencing + pengine: Do not initiate fencing for unclean nodes when fencing is disabled + pengine: Ensure instance numbers are preserved for cloned templates + pengine: Ensure unfencing only happens once, even if the transition is interrupted + pengine: Fencing devices default to only requiring quorum in order to start + pengine: fixes invalid transition caused by clones with more than 10 instances + pengine: Force record pending for migrate_to actions + pengine: handles edge case where container order constraints are not honored during migration + pengine: Ignore failure-timeout only if the failed operation has on-fail="block" + pengine: Mark unrunnable stop actions as "blocked" and show the correct current locations + pengine: Memory leaks + pengine: properly handle fencing of container remote-nodes when the container is orphaned + pengine: properly place resource within a container when container is a remote-node. + pengine: Unfencing is based on device probes, there is no need to unfence when normal resources are found active + pengine: Use "#cluster-name" in rules for setting cluster-specific instance attributes + pengine: Use "#site-name" in rules for setting site-specific instance attributes + remote: Allow baremetal remote-node connection resources to migrate + remote: clear remote-node status correctly + remote: Enable migration support for baremetal connection resources by default + remote: Handle request/response ipc proxy correctly + services: Correctly reset the nice value for lrmd's children + services: Do not allow duplicate recurring op entries + services: Do not block synced service executions + services: Fixes segfault associated with cancelling in-flight recurring operations. + services: Remove cancelled recurring ops from internal lists as early as possible + services: Remove file descriptors from mainloop as soon as we have drained them + services: Reset the scheduling policy and priority for lrmd's children without replying on SCHED_RESET_ON_FORK + services_action_cancel: Interpret return code from mainloop_child_kill() correctly + stonith_admin: Ensure pointers passed to sscanf() are properly initialized + stonith_api_time_helper now returns when the most recent fencing operation completed + systemd: Prevent use-of-NULL when determining if an agent exists + systemd: Try to handle dbus actions that complete prior to configuring a callback + Tools: Non-daemons shouldn't abort just because xml parsing failed + Upstart: Allow comilation with glib versions older than 2.28 + Upstart: Do not attempt upstart jobs if we cannot connect to dbus + When data was old, it fixed so that the newest cib might not be acquired. + xml: Check all available schemas when doing upgrades + xml: Correctly determine the lowest allowed schema version + xml: Correctly enforce ACLs after a replace operation + xml: Correctly infer attribute changes after a replace operation + xml: Create the correct diff when only part of a document is changed + xml: Detect attribute ordering changes + xml: Detect content that is added and removed in the same update + xml: Do not prune meaningful leaves from v1 patchsets + xml: Empty patchsets are considered to have applied cleanly + xml: Ensure patches always have version details set + xml: Find the minimal set of changes when part of a document is replaced + xml: If validate-with is missing, we find the most recent schema that accepts it and go from there + xml: Introduce a 'move' primitive for v2 patch sets + xml: Preserve the attribute order in the patch for subsequent digest validation + xml: Resolve memory leak when logging xml blobs + xml: Update xml validation to allow '' * Thu Feb 13 2014 David Vossel Pacemaker-1.1.11-1 - Update source tarball to revision: 33f9d09 - Changesets: 462 - Diff: 147 files changed, 6810 insertions(+), 4057 deletions(-) - Features added since Pacemaker-1.1.10 + attrd: A truly atomic version of attrd for use where CPG is used for cluster communication + cib: Allow values to be added/updated and removed in a single update + cib: Support XML comments in diffs + Core: Allow blackbox logging to be disabled with SIGUSR2 + crmd: Do not block on proxied calls from pacemaker_remoted + crmd: Enable cluster-wide throttling when the cib heavily exceeds its target load + crmd: Make the per-node action limit directly configurable in the CIB + crmd: Slow down recovery on nodes with IO load + crmd: Track CPU usage on cluster nodes and slow down recovery on nodes with high CPU/IO load + crm_mon: add --hide-headers option to hide all headers + crm_node: Display partition output in sorted order + crm_report: Collect logs directly from journald if available + Fencing: On timeout, clean up the agent's entire process group + Fencing: Support agents that need the host to be unfenced at startup + ipc: Raise the default buffer size to 128k + PE: Add a special attribute for distinguishing between real nodes and containers in constraint rules + PE: Allow location constraints to take a regex pattern to match against resource IDs + pengine: Distinguish between the agent being missing and something the agent needs being missing + remote: Properly version the remote connection protocol - Changes since Pacemaker-1.1.10 + Bug rhbz#1011618 - Consistently use 'Slave' as the role for unpromoted master/slave resources + Bug rhbz#1057697 - Use native DBus library for systemd and upstart support to avoid problematic use of threads + attrd: Any variable called 'cluster' makes the daemon crash before reaching main() + attrd: Avoid infinite write loop for unknown peers + attrd: Drop all attributes for peers that left the cluster + attrd: Give remote-nodes ability to set attributes with attrd + attrd: Prevent inflation of attribute dampen intervals + attrd: Support SI units for attribute dampening + Bug cl#5171 - pengine: Don't prevent clones from running due to dependent resources + Bug cl#5179 - Corosync: Attempt to retrieve a peer's node name if it is not already known + Bug cl#5181 - corosync: Ensure node IDs are written to the CIB as unsigned integers + Bug rhbz#902407 - crm_resource: Handle --ban for master/slave resources as advertised + cib: Correctly check for archived configuration files + cib: Correctly log short-form xml diffs + cib: Fix remote cib based on TLS + cibadmin: Report errors during sign-off + cli: Do not enabled blackbox for cli tools + cluster: Fix segfault on removing a node + cman: Do not start pacemaker if cman startup fails + cman: Start clvmd and friends from the init script if enabled + Command-line tools should stop after an assertion failure + controld: Use the correct variant of dlm_controld for corosync-2 clusters + cpg: Correctly set the group name length + cpg: Ensure the CPG group is always null-terminated + cpg: Only process one message at a time to allow other priority jobs to be performed + crmd: Correctly observe the configured batch-limit + crmd: Correctly update expected state when the previous DC shuts down + crmd: Correcty update the history cache when recurring ops change their return code + crmd: Don't add node_state to cib, if we have not seen or fenced this node yet + crmd: don't segfault on shutdown when using heartbeat + crmd: Prevent recurring monitors being cancelled due to notify operations + crmd: Reliably detect and act on reprobe operations from the policy engine + crmd: When a peer expectedly shuts down, record the new join and expected states into the cib + crmd: When the DC gracefully shuts down, record the new expected state into the cib + crm_attribute: Do not swallow hostname lookup failures + crm_mon: Do not display duplicates of failed actions + crm_mon: Reduce flickering in interactive mode + crm_resource: Observe --master modifier for --move + crm_resource: Provide a meaningful error if --master is used for primitives and groups + fencing: Allow fencing for node after topology entries are deleted + fencing: Apply correct score to the resource of group + fencing: Ignore changes to non-fencing resources + fencing: Observe pcmk_host_list during automatic unfencing + fencing: Put all fencing agent processes into their own process group + fencing: Wait until all possible replies are recieved before continuing with unverified devices + ipc: Compress msgs based on client's actual max send size + ipc: Have the ipc server enforce a minimum buffer size all clients must use. + iso8601: Prevent dates from jumping backwards a day in some timezones + lrmd: Correctly calculate metadata for the 'service' class + lrmd: Correctly cancel monitor actions for lsb/systemd/service resources on cleaning up + mcp: Remove LSB hints that instruct chkconfig to start pacemaker at boot time + mcp: Some distros complain when LSB scripts do not include Default-Start/Stop directives + pengine: Allow fencing of baremetal remote nodes + pengine: cl#5186 - Avoid running rsc on two nodes when node is fenced during migration + pengine: Correctly account for the location preferences of things colocated with a group + pengine: Correctly handle demotion of grouped masters that are partially demoted + pengine: Disable container node probes due to constraint conflicts + pengine: Do not allow colocation with blocked clone instances + pengine: Do not re-allocate clone instances that are blocked in the Stopped state + pengine: Do not restart resources that depend on unmanaged resources + pengine: Force record pending for migrate_to actions + pengine: Location constraints with role=Started should prevent masters from running at all + pengine: Order demote/promote of resources on remote nodes to happen only once the connection is up + pengine: Properly handle orphaned multistate resources living on remote-nodes + pengine: Properly shutdown orphaned remote connection resources + pengine: Recover unexpectedly running container nodes. + remote: Add support for ipv6 into pacemaker_remote daemon + remote: Handle endian changes between client and server and improve forward compatibility + services: Fixes segfault associated with cancelling in-flight recurring operations. + services: Reset the scheduling policy and priority for lrmd's children without replying on SCHED_RESET_ON_FORK * Fri Jul 26 2013 Andrew Beekhof Pacemaker-1.1.10-1 - Update source tarball to revision: ab2e209 - Changesets: 602 - Diff: 143 files changed, 8162 insertions(+), 5159 deletions(-) - Features added since Pacemaker-1.1.9 + Core: Convert all exit codes to positive errno values + crm_error: Add the ability to list and print error symbols + crm_resource: Allow individual resources to be reprobed + crm_resource: Allow options to be set recursively + crm_resource: Implement --ban for moving resources away from nodes and --clear (replaces --unmove) + crm_resource: Support OCF tracing when using --force-(check|start|stop) + PE: Allow active nodes in our current membership to be fenced without quorum + PE: Suppress meaningless IDs when displaying anonymous clone status + Turn off auto-respawning of systemd services when the cluster starts them + Bug cl#5128 - pengine: Support maintenance mode for a single node - Changes since Pacemaker-1.1.9 + crmd: cib: stonithd: Memory leaks resolved and improved use of glib reference counting + attrd: Fixes deleted attributes during dc election + Bug cf#5153 - Correctly display clone failcounts in crm_mon + Bug cl#5133 - pengine: Correctly observe on-fail=block for failed demote operation + Bug cl#5148 - legacy: Correctly remove a node that used to have a different nodeid + Bug cl#5151 - Ensure node names are consistently compared without case + Bug cl#5152 - crmd: Correctly clean up fenced nodes during membership changes + Bug cl#5154 - Do not expire failures when on-fail=block is present + Bug cl#5155 - pengine: Block the stop of resources if any depending resource is unmanaged + Bug cl#5157 - Allow migration in the absence of some colocation constraints + Bug cl#5161 - crmd: Prevent memory leak in operation cache + Bug cl#5164 - crmd: Fixes crash when using pacemaker-remote + Bug cl#5164 - pengine: Fixes segfault when calculating transition with remote-nodes. + Bug cl#5167 - crm_mon: Only print "stopped" node list for incomplete clone sets + Bug cl#5168 - Prevent clones from being bounced around the cluster due to location constraints + Bug cl#5170 - Correctly support on-fail=block for clones + cib: Correctly read back archived configurations if the primary is corrupted + cib: The result is not valid when diffs fail to apply cleanly for CLI tools + cib: Restore the ability to embed comments in the configuration + cluster: Detect and warn about node names with capitals + cman: Do not pretend we know the state of nodes we've never seen + cman: Do not unconditionally start cman if it is already running + cman: Support non-blocking CPG calls + Core: Ensure the blackbox is saved on abnormal program termination + corosync: Detect the loss of members for which we only know the nodeid + corosync: Do not pretend we know the state of nodes we've never seen + corosync: Ensure removed peers are erased from all caches + corosync: Nodes that can persist in sending CPG messages must be alive afterall + crmd: Do not get stuck in S_POLICY_ENGINE if a node we couldn't fence returns + crmd: Do not update fail-count and last-failure for old failures + crmd: Ensure all membership operations can complete while trying to cancel a transition + crmd: Ensure operations for cleaned up resources don't block recovery + crmd: Ensure we return to a stable state if there have been too many fencing failures + crmd: Initiate node shutdown if another node claims to have successfully fenced us + crmd: Prevent messages for remote crmd clients from being relayed to wrong daemons + crmd: Properly handle recurring monitor operations for remote-node agent + crmd: Store last-run and last-rc-change for all operations + crm_mon: Ensure stale pid files are updated when a new process is started + crm_report: Correctly collect logs when 'uname -n' reports fully qualified names + fencing: Fail the operation once all peers have been exhausted + fencing: Restore the ability to manually confirm that fencing completed + ipc: Allow unpriviliged clients to clean up after server failures + ipc: Restore the ability for members of the haclient group to connect to the cluster + legacy: Support "crm_node --remove" with a node name for corosync plugin (bnc#805278) + lrmd: Default to the upstream location for resource agent scratch directory + lrmd: Pass errors from lsb metadata generation back to the caller + pengine: Correctly handle resources that recover before we operate on them + pengine: Delete the old resource state on every node whenever the resource type is changed + pengine: Detect constraints with inappropriate actions (ie. promote for a clone) + pengine: Ensure per-node resource parameters are used during probes + pengine: If fencing is unavailable or disabled, block further recovery for resources that fail to stop + pengine: Implement the rest of get_timet_now() and rename to get_effective_time + pengine: Re-initiate _active_ recurring monitors that previously failed but have timed out + remote: Workaround for inconsistent tls handshake behavior between gnutls versions + systemd: Ensure we get shut down correctly by systemd + systemd: Reload systemd after adding/removing override files for cluster services + xml: Check for and replace non-printing characters with their octal equivalent while exporting xml text + xml: Prevent lockups by setting a more reliable buffer allocation strategy * Fri Mar 08 2013 Andrew Beekhof Pacemaker-1.1.9-1 - Update source tarball to revision: 7e42d77 - Statistics: Changesets: 731 Diff: 1301 files changed, 92909 insertions(+), 57455 deletions(-) - Features added in Pacemaker-1.1.9 + corosync: Allow cman and corosync 2.0 nodes to use a name other than uname() + corosync: Use queues to avoid blocking when sending CPG messages + ipc: Compress messages that exceed the configured IPC message limit + ipc: Use queues to prevent slow clients from blocking the server + ipc: Use shared memory by default + lrmd: Support nagios remote monitoring + lrmd: Pacemaker Remote Daemon for extending pacemaker functionality outside corosync cluster. + pengine: Check for master/slave resources that are not OCF agents + pengine: Support a 'requires' resource meta-attribute for controlling whether it needs quorum, fencing or nothing + pengine: Support for resource container + pengine: Support resources that require unfencing before start - Changes since Pacemaker-1.1.8 + attrd: Correctly handle deletion of non-existant attributes + Bug cl#5135 - Improved detection of the active cluster type + Bug rhbz#913093 - Use crm_node instead of uname + cib: Avoid use-after-free by correctly support cib_no_children for non-xpath queries + cib: Correctly process XML diff's involving element removal + cib: Performance improvements for non-DC nodes + cib: Prevent error message by correctly handling peer replies + cib: Prevent ordering changes when applying xml diffs + cib: Remove text nodes from cib replace operations + cluster: Detect node name collisions in corosync + cluster: Preserve corosync membership state when matching node name/id entries + cman: Force fenced to terminate on shutdown + cman: Ignore qdisk 'nodes' + core: Drop per-user core directories + corosync: Avoid errors when closing failed connections + corosync: Ensure peer state is preserved when matching names to nodeids + corosync: Clean up CMAP connections after querying node name + corosync: Correctly detect corosync 2.0 clusters even if we don't have permission to access it + crmd: Bug cl#5144 - Do not updated the expected status of failed nodes + crmd: Correctly determin if cluster disconnection was abnormal + crmd: Correctly relay messages for remote clients (bnc#805626, bnc#804704) + crmd: Correctly stall the FSA when waiting for additional inputs + crmd: Detect and recover when we are evicted from CPG + crmd: Differentiate between a node that is up and coming up in peer_update_callback() + crmd: Have cib operation timeouts scale with node count + crmd: Improved continue/wait logic in do_dc_join_finalize() + crmd: Prevent election storms caused by getrusage() values being too close + crmd: Prevent timeouts when performing pacemaker level membership negotiation + crmd: Prevent use-after-free of fsa_message_queue during exit + crmd: Store all current actions when stalling the FSA + crm_mon: Do not try to render a blank cib and indicate the previous output is now stale + crm_mon: Fixes crm_mon crash when using snmp traps. + crm_mon: Look for the correct error codes when applying configuration updates + crm_report: Ensure policy engine logs are found + crm_report: Fix node list detection + crm_resource: Have crm_resource generate a valid transition key when sending resource commands to the crmd + date/time: Bug cl#5118 - Correctly convert seconds-since-epoch to the current time + fencing: Attempt to provide more information that just 'generic error' for failed actions + fencing: Correctly record completed but previously unknown fencing operations + fencing: Correctly terminate when all device options have been exhausted + fencing: cov#739453 - String not null terminated + fencing: Do not merge new fencing requests with stale ones from dead nodes + fencing: Do not start fencing until entire device topology is found or query results timeout. + fencing: Do not wait for the query timeout if all replies have arrived + fencing: Fix passing of parameters from CMAN containing '=' + fencing: Fix non-comparison when sorting devices by priority + fencing: On failure, only try a topology device once from the remote level. + fencing: Only try peers for non-topology based operations once + fencing: Retry stonith device for duration of action's timeout period. + heartbeat: Remove incorrect assert during cluster connect + ipc: Bug cl#5110 - Prevent 100% CPU usage when looking for synchronous replies + ipc: Use 50k as the default compression threshold + legacy: Prevent assertion failure on routing ais messages (bnc#805626) + legacy: Re-enable logging from the pacemaker plugin + legacy: Relax the 'active' check for plugin based clusters to avoid false negatives + legacy: Skip peer process check if the process list is empty in crm_is_corosync_peer_active() + mcp: Only define HA_DEBUGLOG to avoid agent calls to ocf_log printing everything twice + mcp: Re-attach to existing pacemaker components when mcp fails + pengine: Any location constraint for the slave role applies to all roles + pengine: Avoid leaking memory when cleaning up failcounts and using containers + pengine: Bug cl#5101 - Ensure stop order is preserved for partially active groups + pengine: Bug cl#5140 - Allow set members to be stopped when the subseqent set has require-all=false + pengine: Bug cl#5143 - Prevent shuffling of anonymous master/slave instances + pengine: Bug rhbz#880249 - Ensure orphan masters are demoted before being stopped + pengine: Bug rhbz#880249 - Teach the PE how to recover masters into primitives + pengine: cl#5025 - Automatically clear failcount for start/monitor failures after resource parameters change + pengine: cl#5099 - Probe operation uses the timeout value from the minimum interval monitor by default (#bnc776386) + pengine: cl#5111 - When clone/master child rsc has on-fail=stop, insure all children stop on failure. + pengine: cl#5142 - Do not delete orphaned children of an anonymous clone + pengine: Correctly unpack active anonymous clones + pengine: Ensure previous migrations are closed out before attempting another one + pengine: Introducing the whitebox container resources feature + pengine: Prevent double-free for cloned primitive from template + pengine: Process rsc_ticket dependencies earlier for correctly allocating resources (bnc#802307) + pengine: Remove special cases for fencing resources + pengine: rhbz#902459 - Remove rsc node status for orphan resources + systemd: Gracefully handle unexpected DBus return types + Replace the use of the insecure mktemp(3) with mkstemp(3) * Thu Sep 20 2012 Andrew Beekhof Pacemaker-1.1.8-1 - Update source tarball to revision: 1a5341f - Statistics: Changesets: 1019 Diff: 2107 files changed, 117258 insertions(+), 73606 deletions(-) - All APIs have been cleaned up and reduced to essentials - Pacemaker now includes a replacement lrmd that supports systemd and upstart agents - Config and state files (cib.xml, PE inputs and core files) have moved to new locations - The crm shell has become a separate project and no longer included with Pacemaker - All daemons/tools now have a unified set of error codes based on errno.h (see crm_error) - Changes since Pacemaker-1.1.7 + Core: Bug cl#5032 - Rewrite the iso8601 date handling code + Core: Correctly extract the version details from a diff + Core: Log blackbox contents, if enabled, when an error occurs + Core: Only LOG_NOTICE and higher are sent to syslog + Core: Replace use of IPC from clplumbing with IPC from libqb + Core: SIGUSR1 now enables blackbox logging, SIGTRAP to write out + Core: Support a blackbox for additional logging detail after crashes/errors + Promote support for advanced fencing logic to the stable schema + Promote support for node starting scores to the stable schema + Promote support for service and systemd to the stable schema + attrd: Differentiate between updating all our attributes and everybody updating all theirs too + attrd: Have single-shot clients wait for an ack before disconnecting + cib: cl#5026 - Synced cib updates should not return until the cpg broadcast is complete. + corosync: Detect when the first corosync has not yet formed and handle it gracefully + corosync: Obtain a full list of configured nodes, including their names, when we connect to the quorum API + corosync: Obtain a node name from DNS if one was not already known + corosync: Populate the cib nodelist from corosync if available + corosync: Use the CFG API and DNS to determine node names if not configured in corosync.conf + crmd: Block after 10 failed fencing attempts for a node + crmd: cl#5051 - Fixes file leak in pe ipc connection initialization. + crmd: cl#5053 - Fixes fail-count not being updated properly. + crmd: cl#5057 - Restart sub-systems correctly (bnc#755671) + crmd: cl#5068 - Fixes crm_node -R option so it works with corosync 2.0 + crmd: Correctly re-establish failed attrd connections + crmd: Detect when the quorum API isn't configured for corosync 2.0 + crmd: Do not overwrite any configured node type (eg. quorum node) + crmd: Enable use of new lrmd daemon and client library in crmd. + crmd: Overhaul the way node state is recorded and updated in the CIB + fencing: Bug rhbz#853537 - Prevent use-of-NULL when the cib libraries are not available + fencing: cl#5073 - Add 'off' as an valid value for stonith-action option. + fencing: cl#5092 - Always timeout stonith operations if timeout period expires. + fencing: cl#5093 - Stonith per device timeout option + fencing: Clean up if we detect a failed connection + fencing: Delegate complex self fencing requests - we wont be around to see it to completion + fencing: Ensure all peers are notified of complex fencing op completion + fencing: Fix passing of fence_legacy parameters containing '=' + fencing: Gracefully handle metadata requests for unknown agents + fencing: Return cached dynamic target list for busy devices. + fencing: rhbz#801355 - Abort transition on DC when external fencing operation is detected + fencing: rhbz#801355 - Merge fence requests for identical operations already in progress. + fencing: rhbz#801355 - Report fencing operations external of pacemaker to cib + fencing: Specify the action to perform using action= instead of the older option= + fencing: Stop building fake metadata for broken agents + fencing: Tolerate agents that report empty metadata in the admin tool + mcp: Correctly retry the connection to corosync on failure + mcp: Do not shut down IPC until the last client exits + mcp: Prevent use-after-free when running against corosync 1.x + pengine: Bug cl#5059 - Use the correct action's status when calculating required actions for interleaved clones + pengine: Bypass online/offline checking resource detection for ping/quorum nodes + pengine: cl#5044 - migrate_to no longer requires load_stopped for avoiding possible transition loop + pengine: cl#5069 - Honor 'on-fail=ignore' even when operation is disabled. + pengine: cl#5070 - Allow influence of promotion score when multistate rsc is left hand of colocation + pengine: cl#5072 - Fixes monitor op stopping after rsc promotion. + pengine: cl#5072 - Fixes pengine regression test failures + pengine: Correctly set the status for nodes not intended to run Pacemaker + pengine: Do not append instance numbers to anonymous clones + pengine: Fix failcount expiration + pengine: Fix memory leaks found by valgrind + pengine: Fix use-after-free and use-of-NULL errors detected by coverity + pengine: Fixes use of colocation scores other than +/- INFINITY + pengine: Improve detection of rejoining nodes + pengine: Prevent use-of-NULL when tracing is enabled + pengine: Stonith resources are allowed to start even if their probes haven't completed on partially active nodes + services: New class called 'service' which expands to the correct (LSB/systemd/upstart) standard + services: Support Asynchronous systemd/upstart actions + Tools: crm_shadow - Bug cl#5062 - Correctly set argv[0] when forking a shell process + Tools: crm_report: Always include system logs (if we can find them) * Wed Mar 28 2012 Andrew Beekhof Pacemaker-1.1.7-1 - Update source tarball to revision: bc7ff2c - Statistics: Changesets: 513 Diff: 1171 files changed, 90472 insertions(+), 19368 deletions(-) - Changes since Pacemaker-1.1.6.1 + ais: Prepare for corosync versions using IPC from libqb + cib: Correctly shutdown in the presence of peers without relying on timers + cib: Don't halt disk writes if the previous digest is missing + cib: Determine when there are no peers to respond to our shutdown request and exit + cib: Ensure no additional messages are processed after we begin terminating + Cluster: Hook up the callbacks to the corosync quorum notifications + Core: basename() may modify its input, do not pass in a constant + Core: Bug cl#5016 - Prevent failures in recurring ops from being lost + Core: Bug rhbz#800054 - Correctly retrieve heartbeat uuids + Core: Correctly determine when an XML file should be decompressed + Core: Correctly track the length of a string without reading from uninitialzied memory (valgrind) + Core: Ensure signals are handled eventually in the absense of timer sources or IPC messages + Core: Prevent use-of-NULL in crm_update_peer() + Core: Strip text nodes from on disk xml files + Core: Support libqb for logging + corosync: Consistently set the correct uuid with get_node_uuid() + Corosync: Correctly disconnect from corosync variants + Corosync: Correctly extract the node id from membership udpates + corosync: Correctly infer lost members from the quorum API + Corosync: Default to using the nodeid as the node's uuid (instead of uname) + corosync: Ensure we catch nodes that leave the membership, even if the ringid doesn't change + corosync: Hook up CPG membership + corosync: Relax a development assert and gracefully handle the error condition + corosync: Remove deprecated member of the CFG API + corosync: Treat CS_ERR_QUEUE_FULL the same as CS_ERR_TRY_AGAIN + corosync: Unset the process list when nodes dissappear on us + crmd: Also purge fencing results when we enter S_NOT_DC + crmd: Bug cl#5015 - Remove the failed operation as well as the resulting fail-count and last-failure attributes + crmd: Correctly determine when a node can suicide with fencing + crmd: Election - perform the age comparison only once + crmd: Fast-track shutdown if we couldn't request it via attrd + crmd: Leave it up to the PE to decide which ops can/cannot be reload + crmd: Prevent use-after-free when calling delete_resource due to CRM_OP_REPROBE + crmd: Supply format arguments in the correct order + fencing: Add missing format parameter + fencing: Add the fencing topology section to the 1.1 configuration schema + fencing: fence_legacy - Drop spurilous host argument from status query + fencing: fence_legacy - Ensure port is available as an environment variable when calling monitor + fencing: fence_pcmk - don't block if nothing is specified on stdin + fencing: Fix log format error + fencing: Fix segfault caused by passing garbage to dlsym() + fencing: Fix use-of-NULL in process_remote_stonith_query() + fencing: Fix use-of-NULL when listing installed devices + fencing: Implement support for advanced fencing topologies: eg. kdump || (network && disk) || power + fencing: More gracefully handle failed 'list' operations for devices that only support a single connection + fencing: Prevent duplicate free when listing devices + fencing: Prevent uninitialized pointers being passed to free + fencing: Prevent use-after-free, we may need the query result for subsequent operations + fencing: Provide enough data to construct an entry in the node's fencing history + fencing: Standardize on /one/ method for clients to request members be fenced + fencing: Supress errors when listing all registered devices + mcp: corosync_cfg_state_track was removed from the corosync API, luckily we didnt use it for anything + mcp: Do not specify a WorkingDirectory in the systemd unit file - startup fails if its not available + mcp: Set the HA_quorum_type env variable consistently with our corosync plugin + mcp: Shut down if one of our child processes can/should not be respawned + pengine: Bug cl#5000 - Ensure ordering is preserved when depending on partial sets + pengine: Bug cl#5028 - Unmanaged services should block shutdown unless in maintenance mode + pengine: Bug cl#5038 - Prevent restart of anonymous clones when clone-max decreases + pengine: Bug cl#5007 - Fixes use of colocation constraints with multi-state resources + pengine: Bug cl#5014 - Prevent asymmetrical order constraints from causing resource stops + pengine: Bug cl#5000 - Implements ability to create rsc_order constraint sets such that A can start after B or C has started. + pengine: Correctly migrate a resource that has just migrated + pengine: Correct return from error path + pengine: Detect reloads of previously migrated resources + pengine: Ensure post-migration stop actions occur before node shutdown + pengine: Log as loudly as possible when we cannot shut down a cluster node + pengine: Reload of a resource no longer causes a restart of dependent resources + pengine: Support limiting the number of concurrent live migrations + pengine: Support referencing templates in constraints + pengine: Support of referencing resource templates in resource sets + pengine: Support to make tickets standby for relinquishing tickets gracefully + stonith: A "start" operation of a stonith resource does a "monitor" on the device beyond registering it + stonith: Bug rhbz#745526 - Ensure stonith_admin actually gets called by fence_pcmk + Stonith: Ensure all nodes receive and deliver notifications of the manual override + stonith: Fix the stonith timeout issue (cl#5009, bnc#727498) + Stonith: Implement a manual override for when nodes are known to be safely off + Tools: Bug cl#5003 - Prevent use-after-free in crm_simlate + Tools: crm_mon - Support to display tickets (based on Yuusuke Iida's work) + Tools: crm_simulate - Support to grant/revoke/standby/activate tickets from the new ticket state section + Tools: Implement crm_node functionality for native corosync + Fix a number of potential problems reported by coverity * Wed Aug 31 2011 Andrew Beekhof 1.1.6-1 - Update source tarball to revision: 676e5f25aa46 tip - Statistics: Changesets: 376 Diff: 1761 files changed, 36259 insertions(+), 140578 deletions(-) - Changes since Pacemaker-1.1.5 + ais: check for retryable errors when dispatching AIS messages + ais: Correctly disconnect from Corosync and Cman based clusters + ais: Followup to previous patch - Ensure we drain the corosync queue of messages when Glib tells us there is input + ais: Handle IPC error before checking for NULL data (bnc#702907) + cib: Check the validation version before adding the originator details of a CIB change + cib: Remove disconnected remote connections from mainloop + cman: Correctly override existing fenced operations + cman: Dequeue all the cman emitted events and not only the first one leaving the others in the event's queue. + cman: Don't call fenced_join and fenced_leave when notifying cman of a fencing event. + cman: We need to run the crmd as root for CMAN so that we can ACK fencing operations + Core: Cancelled and pending operations do not count as failed + Core: Ensure there is sufficient space for EOS when building short-form option strings + Core: Fix variable expansion in pkg-config files + Core: Partial revert of accidental commit in previous patch + Core: Use dlopen to load heartbeat libraries on-demand + crmd: Bug lf#2509 - Watch for config option changes from the CIB even if we're not the DC + crmd: Bug lf#2528 - Introduce a slight delay when creating a transition to allow attrd time to perform its updates + crmd: Bug lf#2559 - Fail actions that were scheduled for a failed/fenced node + crmd: Bug lf#2584 - Allow nodes to fence themselves if they're the last one standing + crmd: Bug lf#2632 - Correctly handle nodes that return faster than stonith + crmd: Cancel timers for actions that were pending on dead nodes + crmd: Catch fence operations that claim to succeed but did not really + crmd: Do not wait for actions that were pending on dead nodes + crmd: Ensure we do not attempt to perform action on failed nodes + crmd: Prevent use-of-NULL by g_hash_table_iter_next() + crmd: Recurring actions shouldn't cause the last non-recurring action to be forgotten + crmd: Store only the last and last failed operation in the CIB + mcp: dirname() modifies the input path - pass in a copy of the logfile path + mcp: Enable stack detection logic instead of forcing 'corosync' + mcp: Fix spelling mistake in systemd service script that prevents shutdown + mcp: Shut down if corosync becomes unavailable + mcp: systemd control file is now functional + pengine: Before migrating an utilization-using resource to a node, take off the load which will no longer run there (lf#2599, bnc#695440) + pengine: Before migrating an utilization-using resource to a node, take off the load which will no longer run there (regression tests) (lf#2599, bnc#695440) + pengine: Bug lf#2574 - Prevent shuffling by choosing the correct clone instance to stop + pengine: Bug lf#2575 - Use uname for migration variables, id is a UUID on heartbeat + pengine: Bug lf#2581 - Avoid group restart when clone (re)starts on an unrelated node + pengine: Bug lf#2613, lf#2619 - Group migration after failures and non-default utilization policies + pengine: Bug suse#707150 - Prevent services being active if dependencies on clones are not satisfied + pengine: Correctly recognise which recurring operations are currently active + pengine: Demote from Master does not clear previous errors + pengine: Ensure restarts due to definition changes cause the start action to be re-issued not probes + pengine: Ensure role is preserved for unmanaged resources + pengine: Ensure unmanaged resources have the correct role set so the correct monitor operation is chosen + pengine: Fix memory leak for re-allocated resources reported by valgrind + pengine: Implement cluster ticket and deadman + pengine: Implement resource template + pengine: Correctly determine the state of multi-state resources with a partial operation history + pengine: Only allocate master/slave resources once + pengine: Partial revert of 'Minor code cleanup CS: cf6bca32376c On: 2011-08-15' + pengine: Resolve memory leak reported by valgrind + pengine: Restore the ability to save inputs to disk + Shell: implement -w,--wait option to wait for the transition to finish + Shell: repair template list command + Shell: set of commands to examine logs, reports, etc + Stonith: Consolidate pcmk_host_map into run_stonith_agent so that it is applied consistently + Stonith: Deprecate pcmk_arg_map for the saner pcmk_host_argument + Stonith: Fix use-of-NULL by g_hash_table_lookup + Stonith: Improved pcmk_host_map parsing + Stonith: Prevent use-of-NULL by g_hash_table_lookup + Stonith: Prevent use-of-NULL when no Linux-HA stonith agents are present + stonith: Add missing entries to stonith_error2string() + Stonith: Correctly finish sending agent options if the initial write is interrupted + stonith: Correctly handle synchronous calls + stonith: Coverity - Correctly construct result list for the query API call + stonith: Coverity - Remove badly constructed memory allocation from the query API call + stonith: Ensure completed operations are recorded as such in the history + Stonith: Ensure device parameters are passed to the daemon during registration + stonith: Fix use-of-NULL in stonith_api_device_list() + stonith: stonith_admin - Prevent use of uninitialized pointer by --history command + Tools: Bug lf#2528 - Make progress when attrd_updater is called repeatedly within the dampen interval but with the same value + Tools: crm_report - Correctly extract data from the local node + Tools: crm_report - Remove newlines when detecting the node list + Tools: crm_report - Repair the ability to extract data from the local machine + Tools: crm_report - Report on all detected backtraces * Fri Feb 11 2011 Andrew Beekhof 1.1.5-1 - Update source tarball to revision: baad6636a053 - Statistics: Changesets: 184 Diff: 605 files changed, 46103 insertions(+), 26417 deletions(-) - Changes since Pacemaker-1.1.4 + Add the ability to delegate sub-sections of the cluster to non-root users via ACLs Needs to be enabled at compile time, not enabled by default. + ais: Bug lf#2550 - Report failed processes immediately + Core: Prevent recently introduced use-after-free in replace_xml_child() + Core: Reinstate the logic that skips past non-XML_ELEMENT_NODE children + Core: Remove extra calls to xmlCleanupParser resulting in use-after-free + Core: Repair reference to child-of-child after removal of xml_child_iter_filter from get_message_xml() + crmd: Bug lf#2545 - Ensure notify variables are accurate for stop operations + crmd: Cancel recurring operations while we're still connected to the lrmd + crmd: Reschedule the PE_START action if its not already running when we try to use it + crmd: Update failcount for failed promote and demote operations + pengine: Bug lf#2445 - Avoid relying on stickness for stable clone placement + pengine: Bug lf#2445 - Do not override configured clone stickiness values + pengine: Bug lf#2493 - Don't imply colocation requirements when applying ordering constraints with clones + pengine: Bug lf#2495 - Prevent segfault by validating the contents of ordering sets + pengine: Bug lf#2508 - Correctly reconstruct the status of anonymous cloned groups + pengine: Bug lf#2518 - Avoid spamming the logs with errors for orphan resources + pengine: Bug lf#2544 - Prevent unstable clone placement by factoring in the current node's score before all others + pengine: Bug lf#2554 - target-role alone is not sufficient to promote resources + pengine: Correct target_rc for probes of inactive resources (fix regression introduced by cs:ac3f03006e95) + pengine: Ensure that fencing has completed for stop actions on stonith-dependent resources (lf#2551) + pengine: Only update the node's promotion score if the resource is active there + pengine: Only use the promotion score from the current clone instance + pengine: Prevent use-of-NULL resulting from variable shadowing spotted by Coverity + pengine: Prevent use-of-NULL when there is status for an undefined node + pengine: Prevet use-after-free resulting from unintended recursion when chosing a node to promote master/slave resources + Shell: don't create empty optional sections (bnc#665131) + Stonith: Teach stonith_admin to automagically obtain the current node attributes for the target from the CIB + tools: Bug lf#2527 - Prevent use-of-NULL in crm_simulate + Tools: Prevent crm_resource commands from being lost due to the use of cib_scope_local * Wed Oct 20 2010 Andrew Beekhof 1.1.4-1 - Update source tarball to revision: 75406c3eb2c1 tip - Statistics: Changesets: 169 Diff: 772 files changed, 56172 insertions(+), 39309 deletions(-) - Changes since Pacemaker-1.1.3 + Italian translation of Clusters from Scratch + Significant performance enhancements to the Policy Engine and CIB + cib: Bug lf#2506 - Don't remove client's when notifications fail, they might just be too big + cib: Drop invalid/failed connections from the client hashtable + cib: Ensure all diffs sent to peers have sufficient ordering information + cib: Ensure non-change diffs can preserve the ordering on the other side + cib: Fix the feature set check + cib: Include version information on our synthesised diffs when nothing changed + cib: Optimize the way we detect group/set ordering changes - 15% speedup + cib: Prevent false detection of config updates with the new diff format + cib: Reduce unnecessary copying when comparing xml objects + cib: Repair the processing of updates sent from peer nodes + cib: Revert part of a recent commit that purged still valid connections + cib: The feature set version check is only valid if the current value is non-NULL + Core: Actually removing diff markers is necessary + Core: Bug lf#2506 - Drop the compression limit because Heartbeat's IPC code sucks + Core: Cache Relax-NG schemas - profiling indicates many cycles are wasted needlessly re-parsing them + Core: Correctly compare against crm_log_level in the logging macros + Core: Correctly extract the version details from a diff + Core: Correctly hook up the RNG schema cache + Core: Correctly use lazy_xml_sort() for v2 digests + Core: Don't compress large payload elements unless we're approaching message limits + Core: Don't insert empty ID tags when applying diffs + Core: Enable the improve v2 digests + Core: Ensure ordering is preserved when applying diffs + Core: Fix the CRM_CHECK macro + Core: Modify the v2 digest algorithm so that some fields are sorted + Core: Prevent use-after-free when creating a CIB update for a timed out action + Core: Prevent use-of-NULL when cleaning up RelaxNG data structures + Core: Provide significant performance improvements by implementing versioned diffs and digests + crmd: All pending operations should be recorded, even recurring ones with high start delays + crmd: Don't abort transitions when probes are completed on a node + crmd: Don't hide stop events that time out - allowing faster recovery in the presence of overloaded hosts + crmd: Ensure the CIB is always writable on the DC by removing a timing hole + crmd: Include the correct transition details for timed out operations + crmd: Prevent use of NULL by making copies of the operation's hash table + crmd: There's no need to check the cib version from the 'added' part of diff updates + crmd: Use the supplied timeout for stop actions + mcp: Ensure valgrind is able to log its output somewhere + mcp: Use 99/01 for the start/stop sequence to avoid problems with services (such as libvirtd) started by init - Patch from Vladislav Bogdanov + pengine: Ensure fencing of the DC preceeds the STONITH_DONE operation + pengine: Fix memory leak introduced as part of the conversion to GHashTables + pengine: Fix memory leak when processing completed migration actions + pengine: Fix typo leading to use-of-NULL in the new ordering code + pengine: Free memory in recently introduced helper function + pengine: lf#2478 - Implement improved handling and recovery of atomic resource migrations + pengine: Obtain massive speedup by prepending to the list of ordering constraints (which can grow quite large) + pengine: Optimize the logic for deciding which non-grouped anonymous clone instances to probe for + pengine: Prevent clones from being stopped because resources colocated with them cannot be active + pengine: Try to ensure atomic migration ops occur within a single transition + pengine: Use hashtables instead of linked lists for performance sensitive datastructures + pengine: Use the original digest algorithm for parameter lists + stonith: cleanup children on timeout in fence_legacy + Stonith: Fix two memory leaks + Tools: crm_shadow - Avoid replacing the entire configuration (including status) * Tue Sep 21 2010 Andrew Beekhof 1.1.3-1 - Update source tarball to revision: e3bb31c56244 tip - Statistics: Changesets: 352 Diff: 481 files changed, 14130 insertions(+), 11156 deletions(-) - Changes since Pacemaker-1.1.2.1 + ais: Bug lf#2401 - Improved processing when the peer crmd processes join/leave + ais: Correct the logic for conecting to plugin based clusters + ais: Do not supply a process list in mcp-mode + ais: Drop support for whitetank in the 1.1 release series + ais: Get an initial dump of the node membership when connecting to quorum-based clusters + ais: Guard against saturated cpg connections + ais: Handle CS_ERR_TRY_AGAIN in more cases + ais: Move the code for finding uid before the fork so that the child does no logging + ais: Never allow quorum plugins to affect connection to the pacemaker plugin + ais: Sign everyone up for peer process updates, not just the crmd + ais: The cluster type needs to be set before initializing classic openais connections + cib: Also free query result for xpath operations that return more than one hit + cib: Attempt to resolve memory corruption when forking a child to write the cib to disk + cib: Correctly free memory when writing out the cib to disk + cib: Fix the application of unversioned diffs + cib: Remove old developmental error logging + cib: Restructure the 'valid peer' check for deciding which instructions to ignore + cman: Correctly process membership/quorum changes from the pcmk plugin. Allow other message types through untouched + cman: Filter directed messages not intended for us + cman: Grab the initial membership when we connect + cman: Keep the list of peer processes up-to-date + cman: Make sure our common hooks are called after a cman membership update + cman: Make sure we can compile without cman present + cman: Populate sender details for cpg messages + cman: Update the ringid for cman based clusters + Core: Correctly unpack HA_Messages containing multiple entries with the same name + Core: crm_count_member() should only track nodes that have the full stack up + Core: New developmental logging system inspired by the kernel and a PoC from Lars Ellenberg + crmd: All nodes should see status updates, not just he DC + crmd: Allow non-DC nodes to clear failcounts too + crmd: Base DC election on process relative uptime + crmd: Bug lf#2439 - cancel_op() can also return HA_RSCBUSY + crmd: Bug lf#2439 - Handle asynchronous notification of resource deletion events + crmd: Bug lf#2458 - Ensure stop actions always have the relevant resource attributes + crmd: Disable age as a criteria for cman based clusters, its not reliable enough + crmd: Ensure we activate the DC timer if we detect an alternate DC + crmd: Factor the nanosecond component of process uptime in elections + crmd: Fix assertion failure when performing async resource failures + crmd: Fix handling of async resource deletion results + crmd: Include the action for crm graph operations + crmd: Make sure the membership cache is accurate after a sucessful fencing operation + crmd: Make sure we always poke the FSA after a transition to clear any TE_HALT actions + crmd: Offer crm-level membership once the peer starts the crmd process + crmd: Only need to request quorum update for plugin based clusters + crmd: Prevent assertion failure for stop actions resulting from cs: 3c0bc17c6daf + crmd: Prevent everyone from loosing DC elections by correctly initializing all relevant variables + crmd: Prevent segmentation fault + crmd: several fixes for async resource delete (thanks to beekhof) + crmd: Use the correct define/size for lrm resource IDs + Introduce two new cluster types 'cman' and 'corosync', replaces 'quorum_provider' concept + mcp: Add missing headers when built without heartbeat support + mcp: Correctly initialize the string containing the list of active daemons + mcp: Fix macro expansion in init script + mcp: Fix the expansion of the pid file in the init script + mcp: Handle CS_ERR_TRY_AGAIN when connecting to libcfg + mcp: Make sure we can compile the mcp without cman present + mcp: New master control process for (re)spawning pacemaker daemons + mcp: Read config early so we can re-initialize logging asap if daemonizing + mcp: Rename the mcp binary to pacemakerd and create a 'pacemaker' init script + mcp: Resend our process list after every CPG change + mcp: Tell chkconfig we need to shut down early on + pengine: Avoid creating invalid ordering constraints for probes that are not needed + pengine: Bug lf#1959 - Fail unmanaged resources should not prevent other services from shutting down + pengine: Bug lf#2422 - Ordering dependencies on partially active groups not observed properly + pengine: Bug lf#2424 - Use notify oepration definition if it exists in the configuration + pengine: Bug lf#2433 - No services should be stopped until probes finish + pengine: Bug lf#2453 - Enforce clone ordering in the absense of colocation constraints + pengine: Bug lf#2476 - Repair on-fail=block for groups and primitive resources + pengine: Correctly detect when there is a real failcount that expired and needs to be cleared + pengine: Correctly handle pseudo action creation + pengine: Correctly order clone startup after group/clone start + pengine: Correct use-after-free introduced in the prior patch + pengine: Do not demote resources because something that requires it can not run + pengine: Fix colocation for interleaved clones + pengine: Fix colocation with partially active groups + pengine: Fix potential use-after-free defect from coverity + pengine: Fix previous merge + pengine: Fix use-after-free in order_actions() reported by valgrind + pengine: Make the current data set a global variable so it does not need to be passed around everywhere + pengine: Prevent endless loop when looking for operation definitions in the configuration + pengine: Prevent segfault by ensuring the arguments to do_calculations() are initialized + pengine: Rewrite the ordering constraint logic to be simplicity, clarity and maintainability + pengine: Wait until stonith is available, do not fall back to shutdown for nodes requesting termination + Resolve coverity RESOURCE_LEAK defects + Shell: Complete the transition to using crm_attribute instead of crm_failcount and crm_standby + stonith: Advertise stonith-ng options in the metadata + stonith: Bug lf#2461 - Prevent segfault by not looking up operations if the hashtable has not been initialized yet + stonith: Bug lf#2473 - Add the timeout at the top level where the daemon is looking for it + Stonith: Bug lf#2473 - Ensure stonith operations complete within the timeout and are terminated if they run too long + stonith: Bug lf#2473 - Ensure timeouts are included for fencing operations + stonith: Bug lf#2473 - Gracefully handle remote operations that arrive late (after we have done notifications) + stonith: Correctly parse pcmk_host_list parameters that appear on a single line + stonith: Map poweron/poweroff back to on/off expected by the stonith tool from cluster-glue + stonith: pass the configuration to the stonith program via environment variables (bnc#620781) + Stonith: Use the timeout specified by the user + Support starting plugin-based Pacemaker clusters with the MCP as well + Tools: Bug lf#2456 - Fix assertion failure in crm_resource + tools: crm_node - Repair the ability to connect to openais based clusters + tools: crm_node - Use the correct short option for --cman + tools: crm_report - corosync.conf wont necessarily contain the text 'pacemaker' anymore + Tools: crm_simulate - Fix use-after-free in when terminating + tools: crm_simulate - Resolve coverity USE_AFTER_FREE defect + Tools: Drop the 'pingd' daemon and resource agent in favor of ocf:pacemaker:ping + Tools: Fix recently introduced use-of-NULL + Tools: Fix use-after-free defects from coverity * Wed May 12 2010 Andrew Beekhof 1.1.2-1 - Update source tarball to revision: c25c972a25cc tip - Statistics: Changesets: 339 Diff: 708 files changed, 37918 insertions(+), 10584 deletions(-) - Changes since Pacemaker-1.1.1 + ais: Do not count votes from offline nodes and calculate current votes before sending quorum data + ais: Ensure the list of active processes sent to clients is always up-to-date + ais: Look for the correct conf variable for turning on file logging + ais: Need to find a better and thread-safe way to set core_uses_pid. Disable for now. + ais: Use the threadsafe version of getpwnam + Core: Bump the feature set due to the new failcount expiry feature + Core: fix memory leaks exposed by valgrind + Core: Bug lf#2414 - Prevent use-after-free reported by valgrind when doing xpath based deletions + crmd: Bug lf#2414 - Prevent use-after-free of the PE connection after it dies + crmd: Bug lf#2414 - Prevent use-after-free of the stonith-ng connection + crmd: Bug lf#2401 - Improved detection of partially active peers + crmd: Bug lf#2379 - Ensure the cluster terminates when the PE is not available + crmd: Do not allow the target_rc to be misused by resource agents + crmd: Do not ignore action timeouts based on FSA state + crmd: Ensure we dont get stuck in S_PENDING if we loose an election to someone that never talks to us again + crmd: Fix memory leaks exposed by valgrind + crmd: Remove race condition that could lead to multiple instances of a clone being active on a machine + crmd: Send erase_status_tag() calls to the local CIB when the DC is fenced, since there is no DC to accept them + crmd: Use global fencing notifications to prevent secondary fencing operations of the DC + pengine: Bug lf#2317 - Avoid needless restart of primitive depending on a clone + pengine: Bug lf#2361 - Ensure clones observe mandatory ordering constraints if the LHS is unrunnable + pengine: Bug lf#2383 - Combine failcounts for all instances of an anonymous clone on a host + pengine: Bug lf#2384 - Fix intra-set colocation and ordering + pengine: Bug lf#2403 - Enforce mandatory promotion (colocation) constraints + pengine: Bug lf#2412 - Correctly find clone instances by their prefix + pengine: Do not be so quick to pull the trigger on nodes that are coming up + pengine: Fix memory leaks exposed by valgrind + pengine: Rewrite native_merge_weights() to avoid Fix use-after-free + Shell: Bug bnc#590035 - always reload status if working with the cluster + Shell: Bug bnc#592762 - Default to using the status section from the live CIB + Shell: Bug lf#2315 - edit multiple meta_attributes sets in resource management + Shell: Bug lf#2221 - enable comments + Shell: Bug bnc#580492 - implement new cibstatus interface and commands + Shell: Bug bnc#585471 - new cibstatus import command + Shell: check timeouts also against the default-action-timeout property + Shell: new configure filter command + Tools: crm_mon - fix memory leaks exposed by valgrind * Tue Feb 16 2010 Andrew Beekhof - 1.1.1-1 - First public release of Pacemaker 1.1 - Package reference documentation in a doc subpackage - Move cts into a subpackage so that it can be easily consumed by others - Update source tarball to revision: 17d9cd4ee29f + New stonith daemon that supports global notifications + Service placement influenced by the physical resources + A new tool for simulating failures and the cluster’s reaction to them + Ability to serialize an otherwise unrelated a set of resource actions (eg. Xen migrations) * Mon Jan 18 2010 Andrew Beekhof - 1.0.7-1 - Update source tarball to revision: 2eed906f43e9 (stable-1.0) tip - Statistics: Changesets: 193 Diff: 220 files changed, 15933 insertions(+), 8782 deletions(-) - Changes since 1.0.5-4 + pengine: Bug 2213 - Ensure groups process location constraints so that clone-node-max works for cloned groups + pengine: Bug lf#2153 - non-clones should not restart when clones stop/start on other nodes + pengine: Bug lf#2209 - Clone ordering should be able to prevent startup of dependent clones + pengine: Bug lf#2216 - Correctly identify the state of anonymous clones when deciding when to probe + pengine: Bug lf#2225 - Operations that require fencing should wait for 'stonith_complete' not 'all_stopped'. + pengine: Bug lf#2225 - Prevent clone peers from stopping while another is instance is (potentially) being fenced + pengine: Correctly anti-colocate with a group + pengine: Correctly unpack ordering constraints for resource sets to avoid graph loops + Tools: crm: load help from crm_cli.txt + Tools: crm: resource sets (bnc#550923) + Tools: crm: support for comments (LF 2221) + Tools: crm: support for description attribute in resources/operations (bnc#548690) + Tools: hb2openais: add EVMS2 CSM processing (and other changes) (bnc#548093) + Tools: hb2openais: do not allow empty rules, clones, or groups (LF 2215) + Tools: hb2openais: refuse to convert pure EVMS volumes + cib: Ensure the loop for login message terminates + cib: Finally fix reliability of receiving large messages over remote plaintext connections + cib: Fix remote notifications + cib: For remote connections, default to CRM_DAEMON_USER since thats the only one that the cib can validate the password for using PAM + cib: Remote plaintext - Retry sending parts of the message that did not fit the first time + crmd: Ensure batch-limit is correctly enforced + crmd: Ensure we have the latest status after a transition abort + (bnc#547579,547582): Tools: crm: status section editing support + shell: Add allow-migrate as allowed meta-attribute (bnc#539968) + Medium: Build: Do not automatically add -L/lib, it could cause 64-bit arches to break + Medium: pengine: Bug lf#2206 - rsc_order constraints always use score at the top level + Medium: pengine: Only complain about target-role=master for non m/s resources + Medium: pengine: Prevent non-multistate resources from being promoted through target-role + Medium: pengine: Provide a default action for resource-set ordering + Medium: pengine: Silently fix requires=fencing for stonith resources so that it can be set in op_defaults + Medium: Tools: Bug lf#2286 - Allow the shell to accept template parameters on the command line + Medium: Tools: Bug lf#2307 - Provide a way to determin the nodeid of past cluster members + Medium: Tools: crm: add update method to template apply (LF 2289) + Medium: Tools: crm: direct RA interface for ocf class resource agents (LF 2270) + Medium: Tools: crm: direct RA interface for stonith class resource agents (LF 2270) + Medium: Tools: crm: do not add score which does not exist + Medium: Tools: crm: do not consider warnings as errors (LF 2274) + Medium: Tools: crm: do not remove sets which contain id-ref attribute (LF 2304) + Medium: Tools: crm: drop empty attributes elements + Medium: Tools: crm: exclude locations when testing for pathological constraints (LF 2300) + Medium: Tools: crm: fix exit code on single shot commands + Medium: Tools: crm: fix node delete (LF 2305) + Medium: Tools: crm: implement -F (--force) option + Medium: Tools: crm: rename status to cibstatus (LF 2236) + Medium: Tools: crm: revisit configure commit + Medium: Tools: crm: stay in crm if user specified level only (LF 2286) + Medium: Tools: crm: verify changes on exit from the configure level + Medium: ais: Some clients such as gfs_controld want a cluster name, allow one to be specified in corosync.conf + Medium: cib: Clean up logic for receiving remote messages + Medium: cib: Create valid notification control messages + Medium: cib: Indicate where the remote connection came from + Medium: cib: Send password prompt to stderr so that stdout can be redirected + Medium: cts: Fix rsh handling when stdout is not required + Medium: doc: Fill in the section on removing a node from an AIS-based cluster + Medium: doc: Update the docs to reflect the 0.6/1.0 rolling upgrade problem + Medium: doc: Use Publican for docbook based documentation + Medium: fencing: stonithd: add metadata for stonithd instance attributes (and support in the shell) + Medium: fencing: stonithd: ignore case when comparing host names (LF 2292) + Medium: tools: Make crm_mon functional with remote connections + Medium: xml: Add stopped as a supported role for operations + Medium: xml: Bug bnc#552713 - Treat node unames as text fields not IDs + Medium: xml: Bug lf#2215 - Create an always-true expression for empty rules when upgrading from 0.6 * Thu Oct 29 2009 Andrew Beekhof - 1.0.5-4 - Include the fixes from CoroSync integration testing - Move the resource templates - they are not documentation - Ensure documentation is placed in a standard location - Exclude documentation that is included elsewhere in the package - Update the tarball from upstream to version ee19d8e83c2a + cib: Correctly clean up when both plaintext and tls remote ports are requested + pengine: Bug bnc#515172 - Provide better defaults for lt(e) and gt(e) comparisions + pengine: Bug lf#2197 - Allow master instances placemaker to be influenced by colocation constraints + pengine: Make sure promote/demote pseudo actions are created correctly + pengine: Prevent target-role from promoting more than master-max instances + ais: Bug lf#2199 - Prevent expected-quorum-votes from being populated with garbage + ais: Prevent deadlock - dont try to release IPC message if the connection failed + cib: For validation errors, send back the full CIB so the client can display the errors + cib: Prevent use-after-free for remote plaintext connections + crmd: Bug lf#2201 - Prevent use-of-NULL when running heartbeat * Wed Oct 13 2009 Andrew Beekhof - 1.0.5-3 - Update the tarball from upstream to version 38cd629e5c3c + Core: Bug lf#2169 - Allow dtd/schema validation to be disabled + pengine: Bug lf#2106 - Not all anonymous clone children are restarted after configuration change + pengine: Bug lf#2170 - stop-all-resources option had no effect + pengine: Bug lf#2171 - Prevent groups from starting if they depend on a complex resource which can not + pengine: Disable resource management if stonith-enabled=true and no stonith resources are defined + pengine: do not include master score if it would prevent allocation + ais: Avoid excessive load by checking for dead children every 1s (instead of 100ms) + ais: Bug rh#525589 - Prevent shutdown deadlocks when running on CoroSync + ais: Gracefully handle changes to the AIS nodeid + crmd: Bug bnc#527530 - Wait for the transition to complete before leaving S_TRANSITION_ENGINE + crmd: Prevent use-after-free with LOG_DEBUG_3 + Medium: xml: Mask the "symmetrical" attribute on rsc_colocation constraints (bnc#540672) + Medium (bnc#520707): Tools: crm: new templates ocfs2 and clvm + Medium: Build: Invert the disable ais/heartbeat logic so that --without (ais|heartbeat) is available to rpmbuild + Medium: pengine: Bug lf#2178 - Indicate unmanaged clones + Medium: pengine: Bug lf#2180 - Include node information for all failed ops + Medium: pengine: Bug lf#2189 - Incorrect error message when unpacking simple ordering constraint + Medium: pengine: Correctly log resources that would like to start but can not + Medium: pengine: Stop ptest from logging to syslog + Medium: ais: Include version details in plugin name + Medium: crmd: Requery the resource metadata after every start operation * Fri Aug 21 2009 Tomas Mraz - 1.0.5-2.1 - rebuilt with new openssl * Wed Aug 19 2009 Andrew Beekhof - 1.0.5-2 - Add versioned perl dependency as specified by https://fedoraproject.org/wiki/Packaging/Perl#Packages_that_link_to_libperl - No longer remove RPATH data, it prevents us finding libperl.so and no other libraries were being hardcoded - Compile in support for heartbeat - Conditionally add heartbeat-devel and corosynclib-devel to the -devel requirements depending on which stacks are supported * Mon Aug 17 2009 Andrew Beekhof - 1.0.5-1 - Add dependency on resource-agents - Use the version of the configure macro that supplies --prefix, --libdir, etc - Update the tarball from upstream to version 462f1569a437 (Pacemaker 1.0.5 final) + Tools: crm_resource - Advertise --move instead of --migrate + Medium: Extra: New node connectivity RA that uses system ping and attrd_updater + Medium: crmd: Note that dc-deadtime can be used to mask the brokeness of some switches * Tue Aug 11 2009 Ville Skyttä - 1.0.5-0.7.c9120a53a6ae.hg - Use bzipped upstream tarball. * Wed Jul 29 2009 Andrew Beekhof - 1.0.5-0.6.c9120a53a6ae.hg - Add back missing build auto* dependencies - Minor cleanups to the install directive * Tue Jul 28 2009 Andrew Beekhof - 1.0.5-0.5.c9120a53a6ae.hg - Add a leading zero to the revision when alphatag is used * Tue Jul 28 2009 Andrew Beekhof - 1.0.5-0.4.c9120a53a6ae.hg - Incorporate the feedback from the cluster-glue review - Realistically, the version is a 1.0.5 pre-release - Use the global directive instead of define for variables - Use the haclient/hacluster group/user instead of daemon - Use the _configure macro - Fix install dependencies * Fri Jul 24 2009 Andrew Beekhof - 1.0.4-3 - Initial Fedora checkin - Include an AUTHORS and license file in each package - Change the library package name to pacemaker-libs to be more Fedora compliant - Remove execute permissions from xml related files - Reference the new cluster-glue devel package name - Update the tarball from upstream to version c9120a53a6ae + pengine: Only prevent migration if the clone dependency is stopping/starting on the target node + pengine: Bug 2160 - Don't shuffle clones due to colocation + pengine: New implementation of the resource migration (not stop/start) logic + Medium: Tools: crm_resource - Prevent use-of-NULL by requiring a resource name for the -A and -a options + Medium: pengine: Prevent use-of-NULL in find_first_action() * Tue Jul 14 2009 Andrew Beekhof - 1.0.4-2 - Reference authors from the project AUTHORS file instead of listing in description - Change Source0 to reference the Mercurial repo - Cleaned up the summaries and descriptions - Incorporate the results of Fedora package self-review * Thu Jun 04 2009 Andrew Beekhof - 1.0.4-1 - Update source tarball to revision: 1d87d3e0fc7f (stable-1.0) - Statistics: Changesets: 209 Diff: 266 files changed, 12010 insertions(+), 8276 deletions(-) - Changes since Pacemaker-1.0.3 + (bnc#488291): ais: do not rely on byte endianness on ptr cast + (bnc#507255): Tools: crm: delete rsc/op_defaults (these meta_attributes are killing me) + (bnc#507255): Tools: crm: import properly rsc/op_defaults + (LF 2114): Tools: crm: add support for operation instance attributes + ais: Bug lf#2126 - Messages replies cannot be routed to transient clients + ais: Fix compilation for the latest Corosync API (v1719) + attrd: Do not perform all updates as complete refreshes + cib: Fix huge memory leak affecting heartbeat-based clusters + Core: Allow xpath queries to match attributes + Core: Generate the help text directly from a tool options struct + Core: Handle differences in 0.6 messaging format + crmd: Bug lf#2120 - All transient node attribute updates need to go via attrd + crmd: Correctly calculate how long an FSA action took to avoid spamming the logs with errors + crmd: Fix another large memory leak affecting Heartbeat based clusters + lha: Restore compatability with older versions + pengine: Bug bnc#495687 - Filesystem is not notified of successful STONITH under some conditions + pengine: Make running a cluster with STONITH enabled but no STONITH resources an error and provide details on resolutions + pengine: Prevent use-ofNULL when using resource ordering sets + pengine: Provide inter-notification ordering guarantees + pengine: Rewrite the notification code to be understanable and extendable + Tools: attrd - Prevent race condition resulting in the cluster forgetting the node wishes to shut down + Tools: crm: regression tests + Tools: crm_mon - Fix smtp notifications + Tools: crm_resource - Repair the ability to query meta attributes + Low Build: Bug lf#2105 - Debian package should contain pacemaker doc and crm templates + Medium (bnc#507255): Tools: crm: handle empty rsc/op_defaults properly + Medium (bnc#507255): Tools: crm: use the right obj_type when creating objects from xml nodes + Medium (LF 2107): Tools: crm: revisit exit codes in configure + Medium: cib: Do not bother validating updates that only affect the status section + Medium: Core: Include supported stacks in version information + Medium: crmd: Record in the CIB, the cluster infrastructure being used + Medium: cts: Do not combine crm_standby arguments - the wrapper can not process them + Medium: cts: Fix the CIBAusdit class + Medium: Extra: Refresh showscores script from Dominik + Medium: pengine: Build a statically linked version of ptest + Medium: pengine: Correctly log the actions for resources that are being recovered + Medium: pengine: Correctly log the occurance of promotion events + Medium: pengine: Implememt node health based on a patch from Mark Hamzy + Medium: Tools: Add examples to help text outputs + Medium: Tools: crm: catch syntax errors for configure load + Medium: Tools: crm: implement erasing nodes in configure erase + Medium: Tools: crm: work with parents only when managing xml objects + Medium: Tools: crm_mon - Add option to run custom notification program on resource operations (Patch by Dominik Klein) + Medium: Tools: crm_resource - Allow --cleanup to function on complex resources and cluster-wide + Medium: Tools: haresource2cib.py - Patch from horms to fix conversion error + Medium: Tools: Include stack information in crm_mon output + Medium: Tools: Two new options (--stack,--constraints) to crm_resource for querying how a resource is configured * Wed Apr 08 2009 Andrew Beekhof - 1.0.3-1 - Update source tarball to revision: b133b3f19797 (stable-1.0) tip - Statistics: Changesets: 383 Diff: 329 files changed, 15471 insertions(+), 15119 deletions(-) - Changes since Pacemaker-1.0.2 + Added tag SLE11-HAE-GMC for changeset 9196be9830c2 + ais plugin: Fix quorum calculation (bnc#487003) + ais: Another memory fix leak in error path + ais: Bug bnc#482847, bnc#482905 - Force a clean exit of OpenAIS once Pacemaker has finished unloading + ais: Bug bnc#486858 - Fix update_member() to prevent spamming clients with membership events containing no changes + ais: Centralize all quorum calculations in the ais plugin and allow expected votes to be configured int he cib + ais: Correctly handle a return value of zero from openais_dispatch_recv() + ais: Disable logging to a file + ais: Fix memory leak in error path + ais: IPC messages are only in scope until a response is sent + All signal handlers used with CL_SIGNAL() need to be as minimal as possible + cib: Bug bnc#482885 - Simplify CIB disk-writes to prevent data loss. Required a change to the backup filename format + cib: crmd: Revert part of 9782ab035003. Complex shutdown routines need G_main_add_SignalHandler to avoid race coditions + crm: Avoid infinite loop during crm configure edit (bnc#480327) + crmd: Avoid a race condition by waiting for the attrd update to trigger a transition automatically + crmd: Bug bnc#480977 - Prevent extra, partial, shutdown when a node restarts too quickly + crmd: Bug bnc#480977 - Prevent extra, partial, shutdown when a node restarts too quickly (verified) + crmd: Bug bnc#489063 - Ensure the DC is always unset after we 'loose' an election + crmd: Bug BSC#479543 - Correctly find the migration source for timed out migrate_from actions + crmd: Call crm_peer_init() before we start the FSA - prevents a race condition when used with Heartbeat + crmd: Erasing the status section should not be forced to the local node + crmd: Fix memory leak in cib notication processing code + crmd: Fix memory leak in transition graph processing + crmd: Fix memory leaks found by valgrind + crmd: More memory leaks fixes found by valgrind + fencing: stonithd: is_heartbeat_cluster is a no-no if there is no heartbeat support + pengine: Bug bnc#466788 - Exclude nodes that can not run resources + pengine: Bug bnc#466788 - Make colocation based on node attributes work + pengine: Bug BNC#478687 - Do not crash when clone-max is 0 + pengine: Bug bnc#488721 - Fix id-ref expansion for clones, the doc-root for clone children is not the cib root + pengine: Bug bnc#490418 - Correctly determine node state for nodes wishing to be terminated + pengine: Bug LF#2087 - Correctly parse the state of anonymous clones that have multiple instances on a given node + pengine: Bug lf#2089 - Meta attributes are not inherited by clone children + pengine: Bug lf#2091 - Correctly restart modified resources that were found active by a probe + pengine: Bug lf#2094 - Fix probe ordering for cloned groups + pengine: Bug LF:2075 - Fix large pingd memory leaks + pengine: Correctly attach orphaned clone children to their parent + pengine: Correctly handle terminate node attributes that are set to the output from time() + pengine: Ensure orphaned clone members are hooked up to the parent when clone-max=0 + pengine: Fix memory leak in LogActions + pengine: Fix the determination of whether a group is active + pengine: Look up the correct promotion preference for anonymous masters + pengine: Simplify handling of start failures by changing the default migration-threshold to INFINITY + pengine: The ordered option for clones no longer causes extra start/stop operations + RA: Bug bnc#490641 - Shut down dlm_controld with -TERM instead of -KILL + RA: pingd: Set default ping interval to 1 instead of 0 seconds + Resources: pingd - Correctly tell the ping daemon to shut down + Tools: Bug bnc#483365 - Ensure the command from cluster_test includes a value for --log-facility + Tools: cli: fix and improve delete command + Tools: crm: add and implement templates + Tools: crm: add support for command aliases and some common commands (i.e. cd,exit) + Tools: crm: create top configuration nodes if they are missing + Tools: crm: fix parsing attributes for rules (broken by the previous changeset) + Tools: crm: new ra set of commands + Tools: crm: resource agents information management + Tools: crm: rsc/op_defaults + Tools: crm: support for no value attribute in nvpairs + Tools: crm: the new configure monitor command + Tools: crm: the new configure node command + Tools: crm_mon - Prevent use-of-NULL when summarizing an orphan + Tools: hb2openais: create clvmd clone for respawn evmsd in ha.cf + Tools: hb2openais: fix a serious recursion bug in xml node processing + Tools: hb2openais: fix ocfs2 processing + Tools: pingd - prevent double free of getaddrinfo() output in error path + Tools: The default re-ping interval for pingd should be 1s not 1ms + Medium (bnc#479049): Tools: crm: add validation of resource type for the configure primitive command + Medium (bnc#479050): Tools: crm: add help for RA parameters in tab completion + Medium (bnc#479050): Tools: crm: add tab completion for primitive params/meta/op + Medium (bnc#479050): Tools: crm: reimplement cluster properties completion + Medium (bnc#486968): Tools: crm: listnodes function requires no parameters (do not mix completion with other stuff) + Medium: ais: Remove the ugly hack for dampening AIS membership changes + Medium: cib: Fix memory leaks by using mainloop_add_signal + Medium: cib: Move more logging to the debug level (was info) + Medium: cib: Overhaul the processing of synchronous replies + Medium: Core: Add library functions for instructing the cluster to terminate nodes + Medium: crmd: Add new expected-quorum-votes option + Medium: crmd: Allow up to 5 retires when an attrd update fails + Medium: crmd: Automatically detect and use new values for crm_config options + Medium: crmd: Bug bnc#490426 - Escalated shutdowns stall when there are pending resource operations + Medium: crmd: Clean up and optimize the DC election algorithm + Medium: crmd: Fix memory leak in shutdown + Medium: crmd: Fix memory leaks spotted by Valgrind + Medium: crmd: Ingore join messages from hosts other than our DC + Medium: crmd: Limit the scope of resource updates to the status section + Medium: crmd: Prevent the crmd from being respawned if its told to shut down when it did not ask to be + Medium: crmd: Re-check the election status after membership events + Medium: crmd: Send resource updates via the local CIB during elections + Medium: pengine: Bug bnc#491441 - crm_mon does not display operations returning 'uninstalled' correctly + Medium: pengine: Bug lf#2101 - For location constraints, role=Slave is equivalent to role=Started + Medium: pengine: Clean up the API - removed ->children() and renamed ->find_child() to fine_rsc() + Medium: pengine: Compress the display of healthy anonymous clones + Medium: pengine: Correctly log the actions for resources that are being recovered + Medium: pengine: Determin a promotion score for complex resources + Medium: pengine: Ensure clones always have a value for globally-unique + Medium: pengine: Prevent orphan clones from being allocated + Medium: RA: controld: Return proper exit code for stop op. + Medium: Tools: Bug bnc#482558 - Fix logging test in cluster_test + Medium: Tools: Bug bnc#482828 - Fix quoting in cluster_test logging setup + Medium: Tools: Bug bnc#482840 - Include directory path to CTSlab.py + Medium: Tools: crm: add more user input checks + Medium: Tools: crm: do not check resource status of we are working with a shadow + Medium: Tools: crm: fix id-refs and allow reference to top objects (i.e. primitive) + Medium: Tools: crm: ignore comments in the CIB + Medium: Tools: crm: multiple column output would not work with small lists + Medium: Tools: crm: refuse to delete running resources + Medium: Tools: crm: rudimentary if-else for templates + Medium: Tools: crm: Start/stop clones via target-role. + Medium: Tools: crm_mon - Compress the node status for healthy and offline nodes + Medium: Tools: crm_shadow - Return 0/cib_ok when --create-empty succeeds + Medium: Tools: crm_shadow - Support -e, the short form of --create-empty + Medium: Tools: Make attrd quieter + Medium: Tools: pingd - Avoid using various clplumbing functions as they seem to leak + Medium: Tools: Reduce pingd logging * Mon Feb 16 2009 Andrew Beekhof - 1.0.2-1 - Update source tarball to revision: d232d19daeb9 (stable-1.0) tip - Statistics: Changesets: 441 Diff: 639 files changed, 20871 insertions(+), 21594 deletions(-) - Changes since Pacemaker-1.0.1 + (bnc#450815): Tools: crm cli: do not generate id for the operations tag + ais: Add support for the new AIS IPC layer + ais: Always set header.error to the correct default: SA_AIS_OK + ais: Bug BNC#456243 - Ensure the membership cache always contains an entry for the local node + ais: Bug BNC:456208 - Prevent deadlocks by not logging in the child process before exec() + ais: By default, disable supprt for the WIP openais IPC patch + ais: Detect and handle situations where ais and the crm disagree on the node name + ais: Ensure crm_peer_seq is updated after a membership update + ais: Make sure all IPC header fields are set to sane defaults + ais: Repair and streamline service load now that whitetank startup functions correctly + build: create and install doc files + cib: Allow clients without mainloop to connect to the cib + cib: CID:18 - Fix use-of-NULL in cib_perform_op + cib: CID:18 - Repair errors introduced in b5a18704477b - Fix use-of-NULL in cib_perform_op + cib: Ensure diffs contain the correct values of admin_epoch + cib: Fix four moderately sized memory leaks detected by Valgrind + Core: CID:10 - Prevent indexing into an array of schemas with a negative value + Core: CID:13 - Fix memory leak in log_data_element + Core: CID:15 - Fix memory leak in crm_get_peer + Core: CID:6 - Fix use-of-NULL in copy_ha_msg_input + Core: Fix crash in the membership code preventing node shutdown + Core: Fix more memory leaks foudn by valgrind + Core: Prevent unterminated strings after decompression + crmd: Bug BNC:467995 - Delay marking STONITH operations complete until STONITH tells us so + crmd: Bug LF:1962 - Do not NACK peers because they are not (yet) in our membership. Just ignore them. + crmd: Bug LF:2010 - Ensure fencing cib updates create the node_state entry if needed to preent re-fencing during cluster startup + crmd: Correctly handle reconnections to attrd + crmd: Ensure updates for lost migrate operations indicate which node it tried to migrating to + crmd: If there are no nodes to finalize, start an election. + crmd: If there are no nodes to welcome, start an election. + crmd: Prevent node attribute loss by detecting attrd disconnections immediately + crmd: Prevent node re-probe loops by ensuring manditory actions always complete + pengine: Bug 2005 - Fix startup ordering of cloned stonith groups + pengine: Bug 2006 - Correctly reprobe cloned groups + pengine: Bug BNC:465484 - Fix the no-quorum-policy=suicide option + pengine: Bug LF:1996 - Correctly process disabled monitor operations + pengine: CID:19 - Fix use-of-NULL in determine_online_status + pengine: Clones now default to globally-unique=false + pengine: Correctly calculate the number of available nodes for the clone to use + pengine: Only shoot online nodes with no-quorum-policy=suicide + pengine: Prevent on-fail settings being ignored after a resource is successfully stopped + pengine: Prevent use-of-NULL for failed migrate actions in process_rsc_state() + pengine: Remove an optimization for the terminate node attribute that caused the cluster to block indefinitly + pengine: Repar the ability to colocate based on node attributes other than uname + pengine: Start the correct monitor operation for unmanaged masters + stonith: CID:3 - Fix another case of exceptionally poor error handling by the original stonith developers + stonith: CID:5 - Checking for NULL and then dereferencing it anyway is an interesting approach to error handling + stonithd: Sending IPC to the cluster is a privileged operation + stonithd: wrong checks for shmid (0 is a valid id) + Tools: attrd - Correctly determine when an attribute has stopped changing and should be committed to the CIB + Tools: Bug 2003 - pingd does not correctly detect failures when the interface is down + Tools: Bug 2003 - pingd does not correctly handle node-down events on multi-NIC systems + Tools: Bug 2021 - pingd does not detect sequence wrapping correctly, incorrectly reports nodes offline + Tools: Bug BNC:468066 - Do not use the result of uname() when its no longer in scope + Tools: Bug BNC:473265 - crm_resource -L dumps core + Tools: Bug LF:2001 - Transient node attributes should be set via attrd + Tools: Bug LF:2036 - crm_resource cannot set/get parameters for cloned resources + Tools: Bug LF:2046 - Node attribute updates are lost because attrd can take too long to start + Tools: Cause the correct clone instance to be failed with crm_resource -F + Tools: cluster_test - Allow the user to select a stack and fix CTS invocation + Tools: crm cli: allow rename only if the resource is stopped + Tools: crm cli: catch system errors on file operations + Tools: crm cli: completion for ids in configure + Tools: crm cli: drop '-rsc' from attributes for order constraint + Tools: crm cli: exit with an appropriate exit code + Tools: crm cli: fix wrong order of action and resource in order constraint + Tools: crm cli: fox wrong exit code + Tools: crm cli: improve handling of cib attributes + Tools: crm cli: new command: configure rename + Tools: crm cli: new command: configure upgrade + Tools: crm cli: new command: node delete + Tools: crm cli: prevent key errors on missing cib attributes + Tools: crm cli: print long help for help topics + Tools: crm cli: return on syntax error when parsing score + Tools: crm cli: rsc_location can be without nvpairs + Tools: crm cli: short node preference location constraint + Tools: crm cli: sometimes, on errors, level would change on single shot use + Tools: crm cli: syntax: drop a bunch of commas (remains of help tables conversion) + Tools: crm cli: verify user input for sanity + Tools: crm: find expressions within rules (do not always skip xml nodes due to used id) + Tools: crm_master should not define a set id now that attrd is used. Defining one can break lookups + Tools: crm_mon Use the OID assigned to the project by IANA for SNMP traps + Medium (bnc#445622): Tools: crm cli: improve the node show command and drop node status + Medium (LF 2009): stonithd: improve timeouts for remote fencing + Medium: ais: Allow dead peers to be removed from membership calculations + Medium: ais: Pass node deletion events on to clients + Medium: ais: Sanitize ipc usage + Medium: ais: Supply the node uname in addtion to the id + Medium: Build: Clean up configure to ensure NON_FATAL_CFLAGS is consistent with CFLAGS (ie. includes -g) + Medium: Build: Install cluster_test + Medium: Build: Use more restrictive CFLAGS and fix the resulting errors + Medium: cib: CID:20 - Fix potential use-after-free in cib_native_signon + Medium: Core: Bug BNC:474727 - Set a maximum time to wait for IPC messages + Medium: Core: CID:12 - Fix memory leak in decode_transition_magic error path + Medium: Core: CID:14 - Fix memory leak in calculate_xml_digest error path + Medium: Core: CID:16 - Fix memory leak in date_to_string error path + Medium: Core: Try to track down the cause of XML parsing errors + Medium: crmd: Bug BNC:472473 - Do not wait excessive amounts of time for lost actions + Medium: crmd: Bug BNC:472473 - Reduce the transition timeout to action_timeout+network_delay + Medium: crmd: Do not fast-track the processing of LRM refreshes when there are pending actions. + Medium: crmd: do_dc_join_filter_offer - Check the 'join' message is for the current instance before deciding to NACK peers + Medium: crmd: Find option values without having to do a config upgrade + Medium: crmd: Implement shutdown using a transient node attribute + Medium: crmd: Update the crmd options to use dashes instead of underscores + Medium: cts: Add 'cluster reattach' to the suite of automated regression tests + Medium: cts: cluster_test - Make some usability enhancements + Medium: CTS: cluster_test - suggest a valid port number + Medium: CTS: Fix python import order + Medium: cts: Implement an automated SplitBrain test + Medium: CTS: Remove references to deleted classes + Medium: Extra: Resources - Use HA_VARRUN instead of HA_RSCTMP for state files as Heartbeat removes HA_RSCTMP at startup + Medium: HB: Bug 1933 - Fake crmd_client_status_callback() calls because HB does not provide them for already running processes + Medium: pengine: CID:17 - Fix memory leak in find_actions_by_task error path + Medium: pengine: CID:7,8 - Prevent hypothetical use-of-NULL in LogActions + Medium: pengine: Defer logging the actions performed on a resource until we have processed ordering constraints + Medium: pengine: Remove the symmetrical attribute of colocation constraints + Medium: Resources: pingd - fix the meta defaults + Medium: Resources: Stateful - Add missing meta defaults + Medium: stonithd: exit if we the pid file cannot be locked + Medium: Tools: Allow attrd clients to specify the ID the attribute should be created with + Medium: Tools: attrd - Allow attribute updates to be performed from a hosts peer + Medium: Tools: Bug LF:1994 - Clean up crm_verify return codes + Medium: Tools: Change the pingd defaults to ping hosts once every second (instead of 5 times every 10 seconds) + Medium: Tools: cibmin - Detect resource operations with a view to providing email/snmp/cim notification + Medium: Tools: crm cli: add back symmetrical for order constraints + Medium: Tools: crm cli: generate role in location when converting from xml + Medium: Tools: crm cli: handle shlex exceptions + Medium: Tools: crm cli: keep order of help topics + Medium: Tools: crm cli: refine completion for ids in configure + Medium: Tools: crm cli: replace inf with INFINITY + Medium: Tools: crm cli: streamline cib load and parsing + Medium: Tools: crm cli: supply provider only for ocf class primitives + Medium: Tools: crm_mon - Add support for sending mail notifications of resource events + Medium: Tools: crm_mon - Include the DC version in status summary + Medium: Tools: crm_mon - Sanitize startup and option processing + Medium: Tools: crm_mon - switch to event-driven updates and add support for sending snmp traps + Medium: Tools: crm_shadow - Replace the --locate option with the saner --edit + Medium: Tools: hb2openais: do not remove Evmsd resources, but replace them with clvmd + Medium: Tools: hb2openais: replace crmadmin with crm_mon + Medium: Tools: hb2openais: replace the lsb class with ocf for o2cb + Medium: Tools: hb2openais: reuse code + Medium: Tools: LF:2029 - Display an error if crm_resource is used to reset the operation history of non-primitive resources + Medium: Tools: Make pingd resilient to attrd failures + Medium: Tools: pingd - fix the command line switches + Medium: Tools: Rename ccm_tool to crm_node * Tue Nov 18 2008 Andrew Beekhof - 1.0.1-1 - Update source tarball to revision: 6fc5ce8302ab (stable-1.0) tip - Statistics: Changesets: 170 Diff: 816 files changed, 7633 insertions(+), 6286 deletions(-) - Changes since Pacemaker-1.0.1 + ais: Allow the crmd to get callbacks whenever a node state changes + ais: Create an option for starting the mgmtd daemon automatically + ais: Ensure HA_RSCTMP exists for use by resource agents + ais: Hook up the openais.conf config logging options + ais: Zero out the PID of disconnecting clients + cib: Ensure global updates cause a disk write when appropriate + Core: Add an extra snaity check to getXpathResults() to prevent segfaults + Core: Do not redefine __FUNCTION__ unnecessarily + Core: Repair the ability to have comments in the configuration + crmd: Bug:1975 - crmd should wait indefinitely for stonith operations to complete + crmd: Ensure PE processing does not occur for all error cases in do_pe_invoke_callback + crmd: Requests to the CIB should cause any prior PE calculations to be ignored + heartbeat: Wait for membership 'up' events before removing stale node status data + pengine: Bug LF:1988 - Ensure recurring operations always have the correct target-rc set + pengine: Bug LF:1988 - For unmanaged resources we need to skip the usual can_run_resources() checks + pengine: Ensure the terminate node attribute is handled correctly + pengine: Fix optional colocation + pengine: Improve up the detection of 'new' nodes joining the cluster + pengine: Prevent assert failures in master_color() by ensuring unmanaged masters are always reallocated to their current location + Tools: crm cli: parser: return False on syntax error and None for comments + Tools: crm cli: unify template and edit commands + Tools: crm_shadow - Show more line number information after validation failures + Tools: hb2openais: add option to upgrade the CIB to v3.0 + Tools: hb2openais: add U option to getopts and update usage + Tools: hb2openais: backup improved and multiple fixes + Tools: hb2openais: fix class/provider reversal + Tools: hb2openais: fix testing + Tools: hb2openais: move the CIB update to the end + Tools: hb2openais: update logging and set logfile appropriately + Tools: LF:1969 - Attrd never sets any properties in the cib + Tools: Make attrd functional on OpenAIS + Medium: ais: Hook up the options for specifying the expected number of nodes and total quorum votes + Medium: ais: Look for pacemaker options inside the service block with 'name: pacemaker' instead of creating an addtional configuration block + Medium: ais: Provide better feedback when nodes change nodeids (in openais.conf) + Medium: cib: Always store cib contents on disk with num_updates=0 + Medium: cib: Ensure remote access ports are cleaned up on shutdown + Medium: crmd: Detect deleted resource operations automatically + Medium: crmd: Erase a nodes resource operations and transient attributes after a successful STONITH + Medium: crmd: Find a more appropriate place to update quorum and refresh attrd attributes + Medium: crmd: Fix the handling of unexpected PE exits to ensure the current CIB is stored + Medium: crmd: Fix the recording of pending operations in the CIB + Medium: crmd: Initiate an attrd refresh _after_ the status section has been fully repopulated + Medium: crmd: Only the DC should update quorum in an openais cluster + Medium: Ensure meta attributes are used consistantly + Medium: pengine: Allow group and clone level resource attributes + Medium: pengine: Bug N:437719 - Ensure scores from colocated resources count when allocating groups + Medium: pengine: Prevent lsb scripts from being used in globally unique clones + Medium: pengine: Make a best-effort guess at a migration threshold for people with 0.6 configs + Medium: Resources: controld - ensure we are part of a clone with globally_unique=false + Medium: Tools: attrd - Automatically refresh all attributes after a CIB replace operation + Medium: Tools: Bug LF:1985 - crm_mon - Correctly process failed cib queries to allow reconnection after cluster restarts + Medium: Tools: Bug LF:1987 - crm_verify incorrectly warns of configuration upgrades for the most recent version + Medium: Tools: crm (bnc#441028): check for key error in attributes management + Medium: Tools: crm_mon - display the meaning of the operation rc code instead of the status + Medium: Tools: crm_mon - Fix the display of timing data + Medium: Tools: crm_verify - check that we are being asked to validate a complete config + Medium: xml: Relax the restriction on the contents of rsc_locaiton.node * Thu Oct 16 2008 Andrew Beekhof - 1.0.0-1 - Update source tarball to revision: 388654dfef8f tip - Statistics: Changesets: 261 Diff: 3021 files changed, 244985 insertions(+), 111596 deletions(-) - Changes since f805e1b30103 + add the crm cli program + ais: Move the service id definition to a common location and make sure it is always used + build: rename hb2openais.sh to .in and replace paths with vars + cib: Implement --create for crm_shadow + cib: Remove dead files + Core: Allow the expected number of quorum votes to be configrable + Core: cl_malloc and friends were removed from Heartbeat + Core: Only call xmlCleanupParser() if we parsed anything. Doing so unconditionally seems to cause a segfault + hb2openais.sh: improve pingd handling; several bugs fixed + hb2openais: fix clone creation; replace EVMS strings + new hb2openais.sh conversion script + pengine: Bug LF:1950 - Ensure the current values for all notification variables are always set (even if empty) + pengine: Bug LF:1955 - Ensure unmanaged masters are unconditionally repromoted to ensure they are monitored correctly. + pengine: Bug LF:1955 - Fix another case of filtering causing unmanaged master failures + pengine: Bug LF:1955 - Umanaged mode prevents master resources from being allocated correctly + pengine: Bug N:420538 - Anit-colocation caused a positive node preference + pengine: Correctly handle unmanaged resources to prevent them from being started elsewhere + pengine: crm_resource - Fix the --migrate command + pengine: MAke stonith-enabled default to true and warn if no STONITH resources are found + pengine: Make sure orphaned clone children are created correctly + pengine: Monitors for unmanaged resources do not need to wait for start/promote/demote actions to complete + stonithd (LF 1951): fix remote stonith operations + stonithd: fix handling of timeouts + stonithd: fix logic for stonith resource priorities + stonithd: implement the fence-timeout instance attribute + stonithd: initialize value before reading fence-timeout + stonithd: set timeouts for fencing ops to the timeout of the start op + stonithd: stonith rsc priorities (new feature) + Tools: Add hb2openais - a tool for upgrading a Heartbeat cluster to use OpenAIS instead + Tools: crm_verify - clean up the upgrade logic to prevent crash on invalid configurations + Tools: Make pingd functional on Linux + Update version numbers for 1.0 candidates + Medium: ais: Add support for a synchronous call to retrieve the nodes nodeid + Medium: ais: Use the agreed service number + Medium: Build: Reliably detect heartbeat libraries during configure + Medium: Build: Supply prototypes for libreplace functions when needed + Medium: Build: Teach configure how to find corosync + Medium: Core: Provide better feedback if Pacemaker is started by a stack it does not support + Medium: crmd: Avoid calling GHashTable functions with NULL + Medium: crmd: Delay raising I_ERROR when the PE exits until we have had a chance to save the current CIB + Medium: crmd: Hook up the stonith-timeout option to stonithd + Medium: crmd: Prevent potential use-of-NULL in global_timer_callback + Medium: crmd: Rationalize the logging of graph aborts + Medium: pengine: Add a stonith_timeout option and remove new options that are better set in rsc_defaults + Medium: pengine: Allow external entities to ask for a node to be shot by creating a terminate=true transient node attribute + Medium: pengine: Bug LF:1950 - Notifications do not contain all documented resource state fields + Medium: pengine: Bug N:417585 - Do not restart group children whos individual score drops below zero + Medium: pengine: Detect clients that disconnect before receiving their reply + Medium: pengine: Implement a true maintenance mode + Medium: pengine: Implement on-fail=standby for NTT. Derived from a patch by Satomi TANIGUCHI + Medium: pengine: Print the correct message when stonith is disabled + Medium: pengine: ptest - check the input is valid before proceeding + Medium: pengine: Revert group stickiness to the 'old way' + Medium: pengine: Use the correct attribute for action 'requires' (was prereq) + Medium: stonithd: Fix compilation without full heartbeat install + Medium: stonithd: exit with better code on empty host list + Medium: tools: Add a new regression test for CLI tools + Medium: tools: crm_resource - return with non-zero when a resource migration command is invalid + Medium: tools: crm_shadow - Allow the admin to start with an empty CIB (and no cluster connection) + Medium: xml: pacemaker-0.7 is now an alias for the 1.0 schema * Mon Sep 22 2008 Andrew Beekhof - 0.7.3-1 - Update source tarball to revision: 33e677ab7764+ tip - Statistics: Changesets: 133 Diff: 89 files changed, 7492 insertions(+), 1125 deletions(-) - Changes since f805e1b30103 + Tools: add the crm cli program + Core: cl_malloc and friends were removed from Heartbeat + Core: Only call xmlCleanupParser() if we parsed anything. Doing so unconditionally seems to cause a segfault + new hb2openais.sh conversion script + pengine: Bug LF:1950 - Ensure the current values for all notification variables are always set (even if empty) + pengine: Bug LF:1955 - Ensure unmanaged masters are unconditionally repromoted to ensure they are monitored correctly. + pengine: Bug LF:1955 - Fix another case of filtering causing unmanaged master failures + pengine: Bug LF:1955 - Umanaged mode prevents master resources from being allocated correctly + pengine: Bug N:420538 - Anit-colocation caused a positive node preference + pengine: Correctly handle unmanaged resources to prevent them from being started elsewhere + pengine: crm_resource - Fix the --migrate command + pengine: MAke stonith-enabled default to true and warn if no STONITH resources are found + pengine: Make sure orphaned clone children are created correctly + pengine: Monitors for unmanaged resources do not need to wait for start/promote/demote actions to complete + stonithd (LF 1951): fix remote stonith operations + Tools: crm_verify - clean up the upgrade logic to prevent crash on invalid configurations + Medium: ais: Add support for a synchronous call to retrieve the nodes nodeid + Medium: ais: Use the agreed service number + Medium: pengine: Allow external entities to ask for a node to be shot by creating a terminate=true transient node attribute + Medium: pengine: Bug LF:1950 - Notifications do not contain all documented resource state fields + Medium: pengine: Bug N:417585 - Do not restart group children whos individual score drops below zero + Medium: pengine: Implement a true maintenance mode + Medium: pengine: Print the correct message when stonith is disabled + Medium: stonithd: exit with better code on empty host list + Medium: xml: pacemaker-0.7 is now an alias for the 1.0 schema * Wed Aug 20 2008 Andrew Beekhof - 0.7.1-1 - Update source tarball to revision: f805e1b30103+ tip - Statistics: Changesets: 184 Diff: 513 files changed, 43408 insertions(+), 43783 deletions(-) - Changes since 0.7.0-19 + Fix compilation when GNUTLS isn't found + admin: Fix use-after-free in crm_mon + Build: Remove testing code that prevented heartbeat-only builds + cib: Use single quotes so that the xpath queries for nvpairs will succeed + crmd: Always connect to stonithd when the TE starts and ensure we notice if it dies + crmd: Correctly handle a dead PE process + crmd: Make sure async-failures cause the failcount to be incremented + pengine: Bug LF:1941 - Handle failed clone instance probes when clone-max < #nodes + pengine: Parse resource ordering sets correctly + pengine: Prevent use-of-NULL - order->rsc_rh will not always be non-NULL + pengine: Unpack colocation sets correctly + Tools: crm_mon - Prevent use-of-NULL for orphaned resources + Medium: ais: Add support for a synchronous call to retrieve the nodes nodeid + Medium: ais: Allow transient clients to receive membership updates + Medium: ais: Avoid double-free in error path + Medium: ais: Include in the mebership nodes for which we have not determined their hostname + Medium: ais: Spawn the PE from the ais plugin instead of the crmd + Medium: cib: By default, new configurations use the latest schema + Medium: cib: Clean up the CIB if it was already disconnected + Medium: cib: Only increment num_updates if something actually changed + Medium: cib: Prevent use-after-free in client after abnormal termination of the CIB + Medium: Core: Fix memory leak in xpath searches + Medium: Core: Get more details regarding parser errors + Medium: Core: Repair expand_plus_plus - do not call char2score on unexpanded values + Medium: Core: Switch to the libxml2 parser - its significantly faster + Medium: Core: Use a libxml2 library function for xml -> text conversion + Medium: crmd: Asynchronous failure actions have no parameters + Medium: crmd: Avoid calling glib functions with NULL + Medium: crmd: Do not allow an election to promote a node from S_STARTING + Medium: crmd: Do not vote if we have not completed the local startup + Medium: crmd: Fix te_update_diff() now that get_object_root() functions differently + Medium: crmd: Fix the lrmd xpath expressions to not contain quotes + Medium: crmd: If we get a join offer during an election, better restart the election + Medium: crmd: No further processing is needed when using the LRMs API call for failing resources + Medium: crmd: Only update have-quorum if the value changed + Medium: crmd: Repair the input validation logic in do_te_invoke + Medium: cts: CIBs can no longer contain comments + Medium: cts: Enable a bunch of tests that were incorrectly disabled + Medium: cts: The libxml2 parser wont allow v1 resources to use integers as parameter names + Medium: Do not use the cluster UID and GID directly. Look them up based on the configured value of HA_CCMUSER + Medium: Fix compilation when heartbeat is not supported + Medium: pengine: Allow groups to be involved in optional ordering constraints + Medium: pengine: Allow sets of operations to be reused by multiple resources + Medium: pengine: Bug LF:1941 - Mark extra clone instances as orphans and do not show inactive ones + Medium: pengine: Determin the correct migration-threshold during resource expansion + Medium: pengine: Implement no-quorum-policy=suicide (FATE #303619) + Medium: pengine: Clean up resources after stopping old copies of the PE + Medium: pengine: Teach the PE how to stop old copies of itself + Medium: Tools: Backport hb_report updates + Medium: Tools: cib_shadow - On create, spawn a new shell with CIB_shadow and PS1 set accordingly + Medium: Tools: Rename cib_shadow to crm_shadow * Fri Jul 18 2008 Andrew Beekhof - 0.7.0-19 - Update source tarball to revision: 007c3a1c50f5 (unstable) tip - Statistics: Changesets: 108 Diff: 216 files changed, 4632 insertions(+), 4173 deletions(-) - Changes added since unstable-0.7 + admin: Fix use-after-free in crm_mon + ais: Change the tag for the ais plugin to "pacemaker" (used in openais.conf) + ais: Log terminated processes as an error + cib: Performance - Reorganize things to avoid calculating the XML diff twice + pengine: Bug LF:1941 - Handle failed clone instance probes when clone-max < #nodes + pengine: Fix memory leak in action2xml + pengine: Make OCF_ERR_ARGS a node-level error rather than a cluster-level one + pengine: Properly handle clones that are not installed on all nodes + Medium: admin: cibadmin - Show any validation errors if the upgrade failed + Medium: admin: cib_shadow - Implement --locate to display the underlying filename + Medium: admin: cib_shadow - Implement a --diff option + Medium: admin: cib_shadow - Implement a --switch option + Medium: admin: crm_resource - create more compact constraints that do not use lifetime (which is deprecated) + Medium: ais: Approximate born_on for OpenAIS based clusters + Medium: cib: Remove do_id_check, it is a poor substitute for ID validation by a schema + Medium: cib: Skip construction of pre-notify messages if no-one wants one + Medium: Core: Attempt to streamline some key functions to increase performance + Medium: Core: Clean up XML parser after validation + Medium: crmd: Detect and optimize the CRMs behavior when processing diffs of an LRM refresh + Medium: Fix memory leaks when resetting the name of an XML object + Medium: pengine: Prefer the current location if it is one of a group of nodes with the same (highest) score * Wed Jun 25 2008 Andrew Beekhof - 0.7.0-1 - Update source tarball to revision: bde0c7db74fb tip - Statistics: Changesets: 439 Diff: 676 files changed, 41310 insertions(+), 52071 deletions(-) - Changes added since stable-0.6 + A new tool for setting up and invoking CTS + Admin: All tools now use --node (-N) for specifying node unames + Admin: All tools now use --xml-file (-x) and --xml-text (-X) for specifying where to find XML blobs + cib: Cleanup the API - remove redundant input fields + cib: Implement CIB_shadow - a facility for making and testing changes before uploading them to the cluster + cib: Make registering per-op callbacks an API call and renamed (for clarity) the API call for requesting notifications + Core: Add a facility for automatically upgrading old configurations + Core: Adopt libxml2 as the XML processing library - all external clients need to be recompiled + Core: Allow sending TLS messages larger than the MTU + Core: Fix parsing of time-only ISO dates + Core: Smarter handling of XML values containing quotes + Core: XML memory corruption - catch, and handle, cases where we are overwriting an attribute value with itself + Core: The xml ID type does not allow UUIDs that start with a number + Core: Implement XPath based versions of query/delete/replace/modify + Core: Remove some HA2.0.(3,4) compatability code + crmd: Overhaul the detection of nodes that are starting vs. failed + pengine: Bug LF:1459 - Allow failures to expire + pengine: Have the PE do non-persistent configuration upgrades before performing calculations + pengine: Replace failure-stickiness with a simple 'migration-threshold' + tengine: Simplify the design by folding the tengine process into the crmd + Medium: Admin: Bug LF:1438 - Allow the list of all/active resource operations to be queried by crm_resource + Medium: Admin: Bug LF:1708 - crm_resource should print a warning if an attribute is already set as a meta attribute + Medium: Admin: Bug LF:1883 - crm_mon should display fail-count and operation history + Medium: Admin: Bug LF:1883 - crm_mon should display operation timing data + Medium: Admin: Bug N:371785 - crm_resource -C does not also clean up fail-count attributes + Medium: Admin: crm_mon - include timing data for failed actions + Medium: ais: Read options from the environment since objdb is not completely usable yet + Medium: cib: Add sections for op_defaults and rsc_defaults + Medium: cib: Better matching notification callbacks (for detecting duplicates and removal) + Medium: cib: Bug LF:1348 - Allow rules and attribute sets to be referenced for use in other objects + Medium: cib: BUG LF:1918 - By default, all cib calls now timeout after 30s + Medium: cib: Detect updates that decrease the version tuple + Medium: cib: Implement a client-side operation timeout - Requires LHA update + Medium: cib: Implement callbacks and async notifications for remote connections + Medium: cib: Make cib->cmds->update() an alias for modify at the API level (also implemented in cibadmin) + Medium: cib: Mark the CIB as disconnected if the IPC connection is terminated + Medium: cib: New call option 'cib_can_create' which can be passed to modify actions - allows the object to be created if it does not exist yet + Medium: cib: Reimplement get|set|delete attributes using XPath + Medium: cib: Remove some useless parts of the API + Medium: cib: Remove the 'attributes' scaffolding from the new format + Medium: cib: Implement the ability for clients to connect to remote servers + Medium: Core: Add support for validating xml against RelaxNG schemas + Medium: Core: Allow more than one item to be modified/deleted in XPath based operations + Medium: Core: Fix the sort_pairs function for creating sorted xml objects + Medium: Core: iso8601 - Implement subtract_duration and fix subtract_time + Medium: Core: Reduce the amount of xml copying occuring + Medium: Core: Support value='value+=N' XML updates (in addtion to value='value++') + Medium: crmd: Add support for lrm_ops->fail_rsc if its available + Medium: crmd: HB - watch link status for node leaving events + Medium: crmd: Bug LF:1924 - Improved handling of lrmd disconnects and shutdowns + Medium: crmd: Do not wait for actions with a start_delay over 5 minutes. Confirm them immediately + Medium: pengine: Bug LF:1328 - Do not fencing nodes in clusters without managed resources + Medium: pengine: Bug LF:1461 - Give transient node attributes (in ) preference over persistent ones (in ) + Medium: pengine: Bug LF:1884, Bug LF:1885 - Implement N:M ordering and colocation constraints + Medium: pengine: Bug LF:1886 - Create a resource and operation 'defaults' config section + Medium: pengine: Bug LF:1892 - Allow recurring actions to be triggered at known times + Medium: pengine: Bug LF:1926 - Probes should complete before stop actions are invoked + Medium: pengine: Fix the standby when its set as a transient attribute + Medium: pengine: Implement a global 'stop-all-resources' option + Medium: pengine: Implement cibpipe, a tool for performing/simulating config changes "offline" + Medium: pengine: We do not allow colocation with specific clone instances + Medium: Tools: pingd - Implement a stack-independent version of pingd + Medium: xml: Ship an xslt for upgrading from 0.6 to 0.7 * Thu Jun 19 2008 Andrew Beekhof - 0.6.5-1 - Update source tarball to revision: b9fe723d1ac5 tip - Statistics: Changesets: 48 Diff: 37 files changed, 1204 insertions(+), 234 deletions(-) - Changes since Pacemaker-0.6.4 + Admin: Repair the ability to delete failcounts + ais: Audit IPC handling between the AIS plugin and CRM processes + ais: Have the plugin create needed /var/lib directories + ais: Make sure the sync and async connections are assigned correctly (not swapped) + cib: Correctly detect configuration changes - num_updates does not count + pengine: Apply stickiness values to the whole group, not the individual resources + pengine: Bug N:385265 - Ensure groups are migrated instead of remaining partially active on the current node + pengine: Bug N:396293 - Enforce manditory group restarts due to ordering constraints + pengine: Correctly recover master instances found active on more than one node + pengine: Fix memory leaks reported by Valgrind + Medium: Admin: crm_mon - Misc improvements from Satomi Taniguchi + Medium: Bug LF:1900 - Resource stickiness should not allow placement in asynchronous clusters + Medium: crmd: Ensure joins are completed promptly when a node taking part dies + Medium: pengine: Avoid clone instance shuffling in more cases + Medium: pengine: Bug LF:1906 - Remove an optimization in native_merge_weights() causing group scores to behave eratically + Medium: pengine: Make use of target_rc data to correctly process resource operations + Medium: pengine: Prevent a possible use of NULL in sort_clone_instance() + Medium: tengine: Include target rc in the transition key - used to correctly determin operation failure * Thu May 22 2008 Andrew Beekhof - 0.6.4-1 - Update source tarball to revision: 226d8e356924 tip - Statistics: Changesets: 55 Diff: 199 files changed, 7103 insertions(+), 12378 deletions(-) - Changes since Pacemaker-0.6.3 + crmd: Bug LF:1881 LF:1882 - Overhaul the logic for operation cancelation and deletion + crmd: Bug LF:1894 - Make sure cancelled recurring operations are cleaned out from the CIB + pengine: Bug N:387749 - Colocation with clones causes unnecessary clone instance shuffling + pengine: Ensure 'master' monitor actions are cancelled _before_ we demote the resource + pengine: Fix assert failure leading to core dump - make sure variable is properly initialized + pengine: Make sure 'slave' monitoring happens after the resource has been demoted + pengine: Prevent failure stickiness underflows (where too many failures become a _positive_ preference) + Medium: Admin: crm_mon - Only complain if the output file could not be opened + Medium: Common: filter_action_parameters - enable legacy handling only for older versions + Medium: pengine: Bug N:385265 - The failure stickiness of group children is ignored until it reaches -INFINITY + Medium: pengine: Implement master and clone colocation by exlcuding nodes rather than setting ones score to INFINITY (similar to cs: 756afc42dc51) + Medium: tengine: Bug LF:1875 - Correctly find actions to cancel when their node leaves the cluster * Wed Apr 23 2008 Andrew Beekhof - 0.6.3-1 - Update source tarball to revision: fd8904c9bc67 tip - Statistics: Changesets: 117 Diff: 354 files changed, 19094 insertions(+), 11338 deletions(-) - Changes since Pacemaker-0.6.2 + Admin: Bug LF:1848 - crm_resource - Pass set name and id to delete_resource_attr() in the correct order + Build: SNMP has been moved to the management/pygui project + crmd: Bug LF1837 - Unmanaged resources prevent crmd from shutting down + crmd: Prevent use-after-free in lrm interface code (Patch based on work by Keisuke MORI) + pengine: Allow the cluster to make progress by not retrying failed demote actions + pengine: Anti-colocation with slave should not prevent master colocation + pengine: Bug LF 1768 - Wait more often for STONITH ops to complete before starting resources + pengine: Bug LF1836 - Allow is-managed-default=false to be overridden by individual resources + pengine: Bug LF185 - Prevent pointless master/slave instance shuffling by ignoring the master-pref of stopped instances + pengine: Bug N-191176 - Implement interleaved ordering for clone-to-clone scenarios + pengine: Bug N-347004 - Ensure clone notifications are always sent when an instance is stopped/started + pengine: Bug N-347004 - Include notification ordering is correct for interleaved clones + pengine: Bug PM-11 - Directly link probe_complete to starting clone instances + pengine: Bug PM1 - Fix setting failcounts when applied to complex resources + pengine: Bug PM12, LF1648 - Extensive revision of group ordering + pengine: Bug PM7 - Ensure masters are always demoted before they are stopped + pengine: Create probes after allocation to allow smarter handling of anonymous clones + pengine: Do not prioritize clone instances that must be moved + pengine: Fix error in previous commit that allowed more than the required number of masters to be promoted + pengine: Group start ordering fixes + pengine: Implement promote/demote ordering for cloned groups + tengine: Repair failcount updates + tengine: Use the correct offset when updating failcount + Medium: Admin: Add a summary output that can be easily parsed by CTS for audit purposes + Medium: Build: Make configure fail if bz2 or libxml2 are not present + Medium: Build: Re-instate a better default for LCRSODIR + Medium: CIB: Bug LF-1861 - Filter irrelvant error status from synchronous CIB clients + Medium: Core: Bug 1849 - Invalid conversion of ordinal leap year to gregorian date + Medium: Core: Drop compataibility code for 2.0.4 and 2.0.5 clusters + Medium: crmd: Bug LF-1860 - Automatically cancel recurring ops before demote and promote operations (not only stops) + Medium: crmd: Save the current CIB contents if we detect the PE crashed + Medium: pengine: Bug LF:1866 - Fix version check when applying compatability handling for failed start operations + Medium: pengine: Bug LF:1866 - Restore the ability to have start failures not be fatal + Medium: pengine: Bug PM1 - Failcount applies to all instances of non-unique clone + Medium: pengine: Correctly set the state of partially active master/slave groups + Medium: pengine: Do not claim to be stopping an already stopped orphan + Medium: pengine: Ensure implies_left ordering constraints are always effective + Medium: pengine: Indicate each resources 'promotion' score + Medium: pengine: Prevent a possible use-of-NULL + Medium: pengine: Reprocess the current action if it changed (so that any prior dependencies are updated) + Medium: tengine: Bug LF-1859 - Wait for fail-count updates to complete before terminating the transition + Medium: tengine: Bug LF:1859 - Do not abort graphs due to our own failcount updates + Medium: tengine: Bug LF:1859 - Prevent the TE from interupting itself * Thu Feb 14 2008 Andrew Beekhof - 0.6.2-1 - Update source tarball to revision: 28b1a8c1868b tip - Statistics: Changesets: 11 Diff: 7 files changed, 58 insertions(+), 18 deletions(-) - Changes since Pacemaker-0.6.1 + haresources2cib.py: set default-action-timeout to the default (20s) + haresources2cib.py: update ra parameters lists + Medium: SNMP: Allow the snmp subagent to be built (patch from MATSUDA, Daiki) + Medium: Tools: Make sure the autoconf variables in haresources2cib are expanded * Tue Feb 12 2008 Andrew Beekhof - 0.6.1-1 - Update source tarball to revision: e7152d1be933 tip - Statistics: Changesets: 25 Diff: 37 files changed, 1323 insertions(+), 227 deletions(-) - Changes since Pacemaker-0.6.0 + CIB: Ensure changes to top-level attributes (like admin_epoch) cause a disk write + CIB: Ensure the archived file hits the disk before returning + CIB: Repair the ability to do 'atomic increment' updates (value="value++") + crmd: Bug #7 - Connecting to the crmd immediately after startup causes use-of-NULL + Medium: CIB: Mask cib_diff_resync results from the caller - they do not need to know + Medium: crmd: Delay starting the IPC server until we are fully functional + Medium: CTS: Fix the startup patterns + Medium: pengine: Bug 1820 - Allow the first resource in a group to be migrated + Medium: pengine: Bug 1820 - Check the colocation dependencies of resources to be migrated * Mon Jan 14 2008 Andrew Beekhof - 0.6.0-1 - This is the first release of the Pacemaker Cluster Resource Manager formerly part of Heartbeat. - For those looking for the GUI, mgmtd, CIM or TSA components, they are now found in the new pacemaker-pygui project. Build dependencies prevent them from being included in Heartbeat (since the built-in CRM is no longer supported) and, being non-core components, are not included with Pacemaker. - Update source tarball to revision: c94b92d550cf - Statistics: Changesets: 347 Diff: 2272 files changed, 132508 insertions(+), 305991 deletions(-) - Test hardware: + 6-node vmware cluster (sles10-sp1/256Mb/vmware stonith) on a single host (opensuse10.3/2Gb/2.66Ghz Quad Core2) + 7-node EMC Centera cluster (sles10/512Mb/2Ghz Xeon/ssh stonith) - Notes: Heartbeat Stack + All testing was performed with STONITH enabled + The CRM was enabled using the "crm respawn" directive - Notes: OpenAIS Stack + This release contains a preview of support for the OpenAIS cluster stack + The current release of the OpenAIS project is missing two important patches that we require. OpenAIS packages containing these patches are available for most major distributions at: http://download.opensuse.org/repositories/server:/ha-clustering + The OpenAIS stack is not currently recommended for use in clusters that have shared data as STONITH support is not yet implimented + pingd is not yet available for use with the OpenAIS stack + 3 significant OpenAIS issues were found during testing of 4 and 6 node clusters. We are activly working together with the OpenAIS project to get these resolved. - Pending bugs encountered during testing: + OpenAIS #1736 - Openais membership took 20s to stabilize + Heartbeat #1750 - ipc_bufpool_update: magic number in head does not match + OpenAIS #1793 - Assertion failure in memb_state_gather_enter() + OpenAIS #1796 - Cluster message corruption - Changes since Heartbeat-2.1.2-24 + Add OpenAIS support + Admin: crm_uuid - Look in the right place for Heartbeat UUID files + admin: Exit and indicate a problem if the crmd exits while crmadmin is performing a query + cib: Fix CIB_OP_UPDATE calls that modify the whole CIB + cib: Fix compilation when supporting the heartbeat stack + cib: Fix memory leaks caused by the switch to get_message_xml() + cib: HA_VALGRIND_ENABLED needs to be set _and_ set to 1|yes|true + cib: Use get_message_xml() in preference to cl_get_struct() + cib: Use the return value from call to write() in cib_send_plaintext() + Core: ccm nodes can legitimately have a node id of 0 + Core: Fix peer-process tracking for the Heartbeat stack + Core: Heartbeat does not send status notifications for nodes that were already part of the cluster. Fake them instead + CRM: Add children to HA_Messages such that the field name matches F_XML_TAGNAME + crm: Adopt a more flexible appraoch to enabling Valgrind + crm: Fix compilation when bzip2 is not installed + CRM: Future-proof get_message_xml() + crmd: Filter election responses based on time not FSA state + crmd: Handle all possible peer states in crmd_ha_status_callback() + crmd: Make sure the current date/time is set - prevents use-of-NULL when evaluating rules + crmd: Relax an assertion regrading ccm membership instances + crmd: Use (node->processes&crm_proc_ais) to accurately update the CIB after replace operations + crmd: Heartbeat: Accurately record peer client status + pengine: Bug 1777 - Allow colocation with a resource in the Stopped state + pengine: Bug 1822 - Prevent use-of-NULL in PromoteRsc() + pengine: Implement three recovery policies based on op_status and op_rc + pengine: Parse fail-count correctly (it may be set to ININFITY) + pengine: Prevent graph-loop when stonith agents need to be moved around before a STONITH op + pengine: Prevent graph-loops when two operations have the same name+interval + tengine: Cancel active timers when destroying graphs + tengine: Ensure failcount is set correctly for failed stops/starts + tengine: Update failcount for oeprations that time out + Medium: admin: Prevent hang in crm_mon -1 when there is no cib connection - Patch from Junko IKEDA + Medium: cib: Require --force|-f when performing potentially dangerous commands with cibadmin + Medium: cib: Tweak the shutdown code + Medium: Common: Only count peer processes of active nodes + Medium: Core: Create generic cluster sign-in method + Medium: core: Fix compilation when Heartbeat support is disabled + Medium: Core: General cleanup for supporting two stacks + Medium: Core: iso6601 - Support parsing of time-only strings + Medium: core: Isolate more code that is only needed when SUPPORT_HEARTBEAT is enabled + Medium: crm: Improved logging of errors in the XML parser + Medium: crmd: Fix potential use-of-NULL in string comparison + Medium: crmd: Reimpliment syncronizing of CIB queries and updates when invoking the PE + Medium: crm_mon: Indicate when a node is both in standby mode and offline + Medium: pengine: Bug 1822 - Do not try an promote groups if not all of it is active + Medium: pengine: on_fail=nothing is an alias for 'ignore' not 'restart' + Medium: pengine: Prevent a potential use-of-NULL in cron_range_satisfied() + snmp subagent: fix a problem on displaying an unmanaged group + snmp subagent: use the syslog setting + snmp: v2 support (thanks to Keisuke MORI) + snmp_subagent - made it not complain about some things if shutting down diff --git a/cts/CTStests.py b/cts/CTStests.py index 7599bf3943..cc2decea4f 100644 --- a/cts/CTStests.py +++ b/cts/CTStests.py @@ -1,3159 +1,3159 @@ '''CTS: Cluster Testing System: Tests module There are a few things we want to do here: ''' __copyright__ = ''' Copyright (C) 2000, 2001 Alan Robertson Licensed under the GNU GPL. Add RecourceRecover testcase Zhao Kai ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. # # SPECIAL NOTE: # # Tests may NOT implement any cluster-manager-specific code in them. # EXTEND the ClusterManager object to provide the base capabilities # the test needs if you need to do something that the current CM classes # do not. Otherwise you screw up the whole point of the object structure # in CTS. # # Thank you. # import time, os, re, string, subprocess, tempfile from stat import * from cts import CTS from cts.CTSaudits import * from cts.CTSvars import * from cts.patterns import PatternSelector from cts.logging import LogFactory from cts.remote import RemoteFactory from cts.watcher import LogWatcher from cts.environment import EnvFactory AllTestClasses = [ ] class CTSTest: ''' A Cluster test. We implement the basic set of properties and behaviors for a generic cluster test. Cluster tests track their own statistics. We keep each of the kinds of counts we track as separate {name,value} pairs. ''' def __init__(self, cm): #self.name="the unnamed test" self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} # if not issubclass(cm.__class__, ClusterManager): # raise ValueError("Must be a ClusterManager object") self.CM = cm self.Env = EnvFactory().getInstance() self.rsh = RemoteFactory().getInstance() self.logger = LogFactory() self.templates = PatternSelector(cm["Name"]) self.Audits = [] self.timeout = 120 self.passed = 1 self.is_loop = 0 self.is_unsafe = 0 self.is_docker_unsafe = 0 self.is_experimental = 0 self.is_container = 0 self.is_valgrind = 0 self.benchmark = 0 # which tests to benchmark self.timer = {} # timers def log(self, args): self.logger.log(args) def debug(self, args): self.logger.debug(args) def has_key(self, key): return key in self.Stats def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): if str(key) == "0": raise ValueError("Bad call to 'foo in X', should reference 'foo in X.Stats' instead") if key in self.Stats: return self.Stats[key] return None def log_mark(self, msg): self.debug("MARK: test %s %s %d" % (self.name,msg,time.time())) return def get_timer(self,key = "test"): try: return self.timer[key] except: return 0 def set_timer(self,key = "test"): self.timer[key] = time.time() return self.timer[key] def log_timer(self,key = "test"): elapsed = 0 if key in self.timer: elapsed = time.time() - self.timer[key] s = key == "test" and self.name or "%s:%s" % (self.name,key) self.debug("%s runtime: %.2f" % (s, elapsed)) del self.timer[key] return elapsed def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not name in self.Stats: self.Stats[name] = 0 self.Stats[name] = self.Stats[name]+1 # Reset the test passed boolean if name == "calls": self.passed = 1 def failure(self, reason="none"): '''Increment the failure count''' self.passed = 0 self.incr("failure") self.logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason) return None def success(self): '''Increment the success count''' self.incr("success") return 1 def skipped(self): '''Increment the skipped count''' self.incr("skipped") return 1 def __call__(self, node): '''Perform the given test''' raise ValueError("Abstract Class member (__call__)") self.incr("calls") return self.failure() def audit(self): passed = 1 if len(self.Audits) > 0: for audit in self.Audits: if not audit(): self.logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name())) self.incr("auditfail") passed = 0 return passed def setup(self, node): '''Setup the given test''' return self.success() def teardown(self, node): '''Tear down the given test''' return self.success() def create_watch(self, patterns, timeout, name=None): if not name: name = self.name return LogWatcher(self.Env["LogFileName"], patterns, name, timeout, kind=self.Env["LogWatcher"], hosts=self.Env["nodes"]) def local_badnews(self, prefix, watch, local_ignore=[]): errcount = 0 if not prefix: prefix = "LocalBadNews:" ignorelist = [] ignorelist.append(" CTS: ") ignorelist.append(prefix) ignorelist.extend(local_ignore) while errcount < 100: match = watch.look(0) if match: add_err = 1 for ignore in ignorelist: if add_err == 1 and re.search(ignore, match): add_err = 0 if add_err == 1: self.logger.log(prefix + " " + match) errcount = errcount + 1 else: break else: self.logger.log("Too many errors!") watch.end() return errcount def is_applicable(self): return self.is_applicable_common() def is_applicable_common(self): '''Return TRUE if we are applicable in the current test configuration''' #raise ValueError("Abstract Class member (is_applicable)") if self.is_loop and not self.Env["loop-tests"]: return 0 elif self.is_unsafe and not self.Env["unsafe-tests"]: return 0 elif self.is_valgrind and not self.Env["valgrind-tests"]: return 0 elif self.is_experimental and not self.Env["experimental-tests"]: return 0 elif self.is_docker_unsafe and self.Env["docker"]: return 0 elif self.is_container and not self.Env["container-tests"]: return 0 elif self.Env["benchmark"] and self.benchmark == 0: return 0 return 1 def find_ocfs2_resources(self, node): self.r_o2cb = None self.r_ocfs2 = [] (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rtype == "o2cb" and r.parent != "NA": self.debug("Found o2cb: %s" % self.r_o2cb) self.r_o2cb = r.parent if re.search("^Constraint", line): c = AuditConstraint(self.CM, line) if c.type == "rsc_colocation" and c.target == self.r_o2cb: self.r_ocfs2.append(c.rsc) self.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2)) return len(self.r_ocfs2) def canrunnow(self, node): '''Return TRUE if we can meaningfully run right now''' return 1 def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [] class StopTest(CTSTest): '''Stop (deactivate) the cluster manager on a node''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "Stop" def __call__(self, node): '''Perform the 'stop' test. ''' self.incr("calls") if self.CM.ShouldBeStatus[node] != "up": return self.skipped() patterns = [] # Technically we should always be able to notice ourselves stopping patterns.append(self.templates["Pat:We_stopped"] % node) #if self.Env["use_logd"]: # patterns.append(self.templates["Pat:Logd_stopped"] % node) # Any active node needs to notice this one left # NOTE: This wont work if we have multiple partitions for other in self.Env["nodes"]: if self.CM.ShouldBeStatus[other] == "up" and other != node: patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node))) #self.debug("Checking %s will notice %s left"%(other, node)) watch = self.create_watch(patterns, self.Env["DeadTime"]) watch.setwatch() if node == self.CM.OurNode: self.incr("us") else: if self.CM.upcount() <= 1: self.incr("all") else: self.incr("them") self.CM.StopaCM(node) watch_result = watch.lookforall() failreason = None UnmatchedList = "||" if watch.unmatched: (rc, output) = self.rsh(node, "/bin/ps axf", None) for line in output: self.debug(line) (rc, output) = self.rsh(node, "/usr/sbin/dlm_tool dump", None) for line in output: self.debug(line) for regex in watch.unmatched: self.logger.log ("ERROR: Shutdown pattern not found: %s" % (regex)) UnmatchedList += regex + "||"; failreason = "Missing shutdown pattern" self.CM.cluster_stable(self.Env["DeadTime"]) if not watch.unmatched or self.CM.upcount() == 0: return self.success() if len(watch.unmatched) >= self.CM.upcount(): return self.failure("no match against (%s)" % UnmatchedList) if failreason == None: return self.success() else: return self.failure(failreason) # # We don't register StopTest because it's better when called by # another test... # class StartTest(CTSTest): '''Start (activate) the cluster manager on a node''' def __init__(self, cm, debug=None): CTSTest.__init__(self,cm) self.name = "start" self.debug = debug def __call__(self, node): '''Perform the 'start' test. ''' self.incr("calls") if self.CM.upcount() == 0: self.incr("us") else: self.incr("them") if self.CM.ShouldBeStatus[node] != "down": return self.skipped() elif self.CM.StartaCM(node): return self.success() else: return self.failure("Startup %s on node %s failed" % (self.Env["Name"], node)) # # We don't register StartTest because it's better when called by # another test... # class FlipTest(CTSTest): '''If it's running, stop it. If it's stopped start it. Overthrow the status quo... ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Flip" self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, node): '''Perform the 'Flip' test. ''' self.incr("calls") if self.CM.ShouldBeStatus[node] == "up": self.incr("stopped") ret = self.stop(node) type = "up->down" # Give the cluster time to recognize it's gone... time.sleep(self.Env["StableTime"]) elif self.CM.ShouldBeStatus[node] == "down": self.incr("started") ret = self.start(node) type = "down->up" else: return self.skipped() self.incr(type) if ret: return self.success() else: return self.failure("%s failure" % type) # Register FlipTest as a good test to run AllTestClasses.append(FlipTest) class RestartTest(CTSTest): '''Stop and restart a node''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Restart" self.start = StartTest(cm) self.stop = StopTest(cm) self.benchmark = 1 def __call__(self, node): '''Perform the 'restart' test. ''' self.incr("calls") self.incr("node:" + node) ret1 = 1 if self.CM.StataCM(node): self.incr("WasStopped") if not self.start(node): return self.failure("start (setup) failure: "+node) self.set_timer() if not self.stop(node): return self.failure("stop failure: "+node) if not self.start(node): return self.failure("start failure: "+node) return self.success() # Register RestartTest as a good test to run AllTestClasses.append(RestartTest) class StonithdTest(CTSTest): def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "Stonithd" self.startall = SimulStartLite(cm) self.benchmark = 1 def __call__(self, node): self.incr("calls") if len(self.Env["nodes"]) < 2: return self.skipped() ret = self.startall(None) if not ret: return self.failure("Setup failed") is_dc = self.CM.is_node_dc(node) watchpats = [] watchpats.append(self.templates["Pat:FenceOpOK"] % node) watchpats.append(self.templates["Pat:NodeFenced"] % node) if self.Env["at-boot"] == 0: self.debug("Expecting %s to stay down" % node) self.CM.ShouldBeStatus[node] = "down" else: self.debug("Expecting %s to come up again %d" % (node, self.Env["at-boot"])) watchpats.append("%s.* S_STARTING -> S_PENDING" % node) watchpats.append("%s.* S_PENDING -> S_NOT_DC" % node) watch = self.create_watch(watchpats, 30 + self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"]) watch.setwatch() origin = self.Env.RandomGen.choice(self.Env["nodes"]) rc = self.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node) if rc == 194: # 194 - 256 = -62 = Timer expired # # Look for the patterns, usually this means the required # device was running on the node to be fenced - or that # the required devices were in the process of being loaded # and/or moved # # Effectively the node committed suicide so there will be # no confirmation, but pacemaker should be watching and # fence the node again self.logger.log("Fencing command on %s to fence %s timed out" % (origin, node)) elif origin != node and rc != 0: self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.debug("Waiting STONITHd node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600) self.logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc)) elif origin == node and rc != 255: # 255 == broken pipe, ie. the node was fenced as expected self.logger.log("Locally originated fencing returned %d" % rc) self.set_timer("fence") matched = watch.lookforall() self.log_timer("fence") self.set_timer("reform") if watch.unmatched: self.logger.log("Patterns not found: " + repr(watch.unmatched)) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.debug("Waiting STONITHd node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600) self.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self.CM.cluster_stable(self.Env["StartTime"]) if not matched: return self.failure("Didn't find all expected patterns") elif not is_stable: return self.failure("Cluster did not become stable") self.log_timer("reform") return self.success() def errorstoignore(self): return [ self.templates["Pat:Fencing_start"] % ".*", self.templates["Pat:Fencing_ok"] % ".*", r"error.*: Resource .*stonith::.* is active on 2 nodes attempting recovery", r"error.*: Operation reboot of .*by .* for stonith_admin.*: Timer expired", ] def is_applicable(self): if not self.is_applicable_common(): return 0 if "DoFencing" in self.Env.keys(): return self.Env["DoFencing"] return 1 AllTestClasses.append(StonithdTest) class StartOnebyOne(CTSTest): '''Start all the nodes ~ one by one''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "StartOnebyOne" self.stopall = SimulStopLite(cm) self.start = StartTest(cm) self.ns = CTS.NodeStatus(cm.Env) def __call__(self, dummy): '''Perform the 'StartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Test setup failed") failed = [] self.set_timer() for node in self.Env["nodes"]: if not self.start(node): failed.append(node) if len(failed) > 0: return self.failure("Some node failed to start: " + repr(failed)) return self.success() # Register StartOnebyOne as a good test to run AllTestClasses.append(StartOnebyOne) class SimulStart(CTSTest): '''Start all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SimulStart" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'SimulStart' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Setup failed") self.CM.clear_all_caches() if not self.startall(None): return self.failure("Startall failed") return self.success() # Register SimulStart as a good test to run AllTestClasses.append(SimulStart) class SimulStop(CTSTest): '''Stop all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SimulStop" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) def __call__(self, dummy): '''Perform the 'SimulStop' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.stopall(None): return self.failure("Stopall failed") return self.success() # Register SimulStop as a good test to run AllTestClasses.append(SimulStop) class StopOnebyOne(CTSTest): '''Stop all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "StopOnebyOne" self.startall = SimulStartLite(cm) self.stop = StopTest(cm) def __call__(self, dummy): '''Perform the 'StopOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") failed = [] self.set_timer() for node in self.Env["nodes"]: if not self.stop(node): failed.append(node) if len(failed) > 0: return self.failure("Some node failed to stop: " + repr(failed)) self.CM.clear_all_caches() return self.success() # Register StopOnebyOne as a good test to run AllTestClasses.append(StopOnebyOne) class RestartOnebyOne(CTSTest): '''Restart all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "RestartOnebyOne" self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'RestartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") did_fail = [] self.set_timer() self.restart = RestartTest(self.CM) for node in self.Env["nodes"]: if not self.restart(node): did_fail.append(node) if did_fail: return self.failure("Could not restart %d nodes: %s" % (len(did_fail), repr(did_fail))) return self.success() # Register StopOnebyOne as a good test to run AllTestClasses.append(RestartOnebyOne) class PartialStart(CTSTest): '''Start a node - but tell it to stop before it finishes starting up''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "PartialStart" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) self.stop = StopTest(cm) #self.is_unsafe = 1 def __call__(self, node): '''Perform the 'PartialStart' test. ''' self.incr("calls") ret = self.stopall(None) if not ret: return self.failure("Setup failed") # FIXME! This should use the CM class to get the pattern # then it would be applicable in general watchpats = [] watchpats.append("crmd.*Connecting to cluster infrastructure") watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) watch.setwatch() self.CM.StartaCMnoBlock(node) ret = watch.lookforall() if not ret: self.logger.log("Patterns not found: " + repr(watch.unmatched)) return self.failure("Setup of %s failed" % node) ret = self.stop(node) if not ret: return self.failure("%s did not stop in time" % node) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' # We might do some fencing in the 2-node case if we make it up far enough return [ """Executing reboot fencing operation""", ] # Register StopOnebyOne as a good test to run AllTestClasses.append(PartialStart) class StandbyTest(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Standby" self.benchmark = 1 self.start = StartTest(cm) self.startall = SimulStartLite(cm) # make sure the node is active # set the node to standby mode # check resources, none resource should be running on the node # set the node to active mode # check resouces, resources should have been migrated back (SHOULD THEY?) def __call__(self, node): self.incr("calls") ret = self.startall(None) if not ret: return self.failure("Start all nodes failed") self.debug("Make sure node %s is active" % node) if self.CM.StandbyStatus(node) != "off": if not self.CM.SetStandbyMode(node, "off"): return self.failure("can't set node %s to active mode" % node) self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "off": return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) self.debug("Getting resources running on node %s" % node) rsc_on_node = self.CM.active_resources(node) watchpats = [] watchpats.append(r"State transition .* -> S_POLICY_ENGINE") watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) watch.setwatch() self.debug("Setting node %s to standby mode" % node) if not self.CM.SetStandbyMode(node, "on"): return self.failure("can't set node %s to standby mode" % node) self.set_timer("on") ret = watch.lookforall() if not ret: self.logger.log("Patterns not found: " + repr(watch.unmatched)) self.CM.SetStandbyMode(node, "off") return self.failure("cluster didn't react to standby change on %s" % node) self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "on": return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status)) self.log_timer("on") self.debug("Checking resources") bad_run = self.CM.active_resources(node) if len(bad_run) > 0: rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run))) self.debug("Setting node %s to active mode" % node) self.CM.SetStandbyMode(node, "off") return rc self.debug("Setting node %s to active mode" % node) if not self.CM.SetStandbyMode(node, "off"): return self.failure("can't set node %s to active mode" % node) self.set_timer("off") self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "off": return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) self.log_timer("off") return self.success() AllTestClasses.append(StandbyTest) class ValgrindTest(CTSTest): '''Check for memory leaks''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Valgrind" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) self.is_valgrind = 1 self.is_loop = 1 def setup(self, node): self.incr("calls") ret = self.stopall(None) if not ret: return self.failure("Stop all nodes failed") # Enable valgrind self.logger.logPat = "/tmp/%s-*.valgrind" % self.name self.Env["valgrind-prefix"] = self.name self.rsh(node, "rm -f %s" % self.logger.logPat, None) ret = self.startall(None) if not ret: return self.failure("Start all nodes failed") for node in self.Env["nodes"]: (rc, output) = self.rsh(node, "ps u --ppid `pidofproc aisexec`", None) for line in output: self.debug(line) return self.success() def teardown(self, node): # Disable valgrind self.Env["valgrind-prefix"] = None # Return all nodes to normal ret = self.stopall(None) if not ret: return self.failure("Stop all nodes failed") return self.success() def find_leaks(self): # Check for leaks leaked = [] self.stop = StopTest(self.CM) for node in self.Env["nodes"]: (rc, ps_out) = self.rsh(node, "ps u --ppid `pidofproc aisexec`", None) rc = self.stop(node) if not rc: self.failure("Couldn't shut down %s" % node) rc = self.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e (ERROR|error).*SUMMARY:.*[1-9].*errors %s" % self.logger.logPat, 0) if rc != 1: leaked.append(node) self.failure("Valgrind errors detected on %s" % node) for line in ps_out: self.logger.log(line) (rc, output) = self.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logger.logPat, None) for line in output: self.logger.log(line) (rc, output) = self.rsh(node, "cat %s" % self.logger.logPat, None) for line in output: self.debug(line) self.rsh(node, "rm -f %s" % self.logger.logPat, None) return leaked def __call__(self, node): leaked = self.find_leaks() if len(leaked) > 0: return self.failure("Nodes %s leaked" % repr(leaked)) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"cib.*: \*\*\*\*\*\*\*\*\*\*\*\*\*", r"cib.*: .* avoid confusing Valgrind", r"HA_VALGRIND_ENABLED", ] class StandbyLoopTest(ValgrindTest): '''Check for memory leaks by putting a node in and out of standby for an hour''' def __init__(self, cm): ValgrindTest.__init__(self,cm) self.name = "StandbyLoop" def __call__(self, node): lpc = 0 delay = 2 failed = 0 done = time.time() + self.Env["loop-minutes"] * 60 while time.time() <= done and not failed: lpc = lpc + 1 time.sleep(delay) if not self.CM.SetStandbyMode(node, "on"): self.failure("can't set node %s to standby mode" % node) failed = lpc time.sleep(delay) if not self.CM.SetStandbyMode(node, "off"): self.failure("can't set node %s to active mode" % node) failed = lpc leaked = self.find_leaks() if failed: return self.failure("Iteration %d failed" % failed) elif len(leaked) > 0: return self.failure("Nodes %s leaked" % repr(leaked)) return self.success() AllTestClasses.append(StandbyLoopTest) class BandwidthTest(CTSTest): # Tests should not be cluster-manager-specific # If you need to find out cluster manager configuration to do this, then # it should be added to the generic cluster manager API. '''Test the bandwidth which heartbeat uses''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "Bandwidth" self.start = StartTest(cm) self.__setitem__("min",0) self.__setitem__("max",0) self.__setitem__("totalbandwidth",0) (handle, self.tempfile) = tempfile.mkstemp(".cts") os.close(handle) self.startall = SimulStartLite(cm) def __call__(self, node): '''Perform the Bandwidth test''' self.incr("calls") if self.CM.upcount() < 1: return self.skipped() Path = self.CM.InternalCommConfig() if "ip" not in Path["mediatype"]: return self.skipped() port = Path["port"][0] port = int(port) ret = self.startall(None) if not ret: return self.failure("Test setup failed") time.sleep(5) # We get extra messages right after startup. fstmpfile = "/var/run/band_estimate" dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \ % (port, fstmpfile) rc = self.rsh(node, dumpcmd) if rc == 0: farfile = "root@%s:%s" % (node, fstmpfile) self.rsh.cp(farfile, self.tempfile) Bandwidth = self.countbandwidth(self.tempfile) if not Bandwidth: self.logger.log("Could not compute bandwidth.") return self.success() intband = int(Bandwidth + 0.5) self.logger.log("...bandwidth: %d bits/sec" % intband) self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth if self.Stats["min"] == 0: self.Stats["min"] = Bandwidth if Bandwidth > self.Stats["max"]: self.Stats["max"] = Bandwidth if Bandwidth < self.Stats["min"]: self.Stats["min"] = Bandwidth self.rsh(node, "rm -f %s" % fstmpfile) os.unlink(self.tempfile) return self.success() else: return self.failure("no response from tcpdump command [%d]!" % rc) def countbandwidth(self, file): fp = open(file, "r") fp.seek(0) count = 0 sum = 0 while 1: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count = count + 1 linesplit = string.split(line," ") for j in range(len(linesplit)-1): if linesplit[j] == "udp": break if linesplit[j] == "length:": break try: sum = sum + int(linesplit[j+1]) except ValueError: self.logger.log("Invalid tcpdump line: %s" % line) return None T1 = linesplit[0] timesplit = string.split(T1,":") time2split = string.split(timesplit[2],".") time1 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001 break while count < 100: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count = count+1 linessplit = string.split(line," ") for j in range(len(linessplit)-1): if linessplit[j] == "udp": break if linesplit[j] == "length:": break try: sum = int(linessplit[j+1]) + sum except ValueError: self.logger.log("Invalid tcpdump line: %s" % line) return None T2 = linessplit[0] timesplit = string.split(T2,":") time2split = string.split(timesplit[2],".") time2 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001 time = time2-time1 if (time <= 0): return 0 return (sum*8)/time def is_applicable(self): '''BandwidthTest never applicable''' return 0 AllTestClasses.append(BandwidthTest) ################################################################### class MaintenanceMode(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "MaintenanceMode" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.max = 30 #self.is_unsafe = 1 self.benchmark = 1 self.action = "asyncmon" self.interval = 0 self.rid = "maintenanceDummy" def toggleMaintenanceMode(self, node, action): pats = [] pats.append(self.templates["Pat:DC_IDLE"]) # fail the resource right after turning Maintenance mode on # verify it is not recovered until maintenance mode is turned off if action == "On": pats.append(r"pengine.*:\s+warning:.*Processing failed op %s for %s on" % (self.action, self.rid)) else: pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0")) pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "start_0")) watch = self.create_watch(pats, 60) watch.setwatch() self.debug("Turning maintenance mode %s" % action) self.rsh(node, self.templates["MaintenanceMode%s" % (action)]) if (action == "On"): self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node)) self.set_timer("recover%s" % (action)) watch.lookforall() self.log_timer("recover%s" % (action)) if watch.unmatched: self.debug("Failed to find patterns when turning maintenance mode %s" % action) return repr(watch.unmatched) return "" def insertMaintenanceDummy(self, node): pats = [] pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % (self.rid, "start_0"))) watch = self.create_watch(pats, 60) watch.setwatch() self.CM.AddDummyRsc(node, self.rid) self.set_timer("addDummy") watch.lookforall() self.log_timer("addDummy") if watch.unmatched: self.debug("Failed to find patterns when adding maintenance dummy resource") return repr(watch.unmatched) return "" def removeMaintenanceDummy(self, node): pats = [] pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0")) watch = self.create_watch(pats, 60) watch.setwatch() self.CM.RemoveDummyRsc(node, self.rid) self.set_timer("removeDummy") watch.lookforall() self.log_timer("removeDummy") if watch.unmatched: self.debug("Failed to find patterns when removing maintenance dummy resource") return repr(watch.unmatched) return "" def managedRscList(self, node): rscList = [] (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self.CM, line) if tmp.managed(): rscList.append(tmp.id) return rscList def verifyResources(self, node, rscList, managed): managedList = list(rscList) managed_str = "managed" if not managed: managed_str = "unmanaged" (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self.CM, line) if managed and not tmp.managed(): continue elif not managed and tmp.managed(): continue elif managedList.count(tmp.id): managedList.remove(tmp.id) if len(managedList) == 0: self.debug("Found all %s resources on %s" % (managed_str, node)) return True self.logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList)) return False def __call__(self, node): '''Perform the 'MaintenanceMode' test. ''' self.incr("calls") verify_managed = False verify_unmanaged = False failPat = "" ret = self.startall(None) if not ret: return self.failure("Setup failed") # get a list of all the managed resources. We use this list # after enabling maintenance mode to verify all managed resources # become un-managed. After maintenance mode is turned off, we use # this list to verify all the resources become managed again. managedResources = self.managedRscList(node) if len(managedResources) == 0: self.logger.log("No managed resources on %s" % node) return self.skipped() # insert a fake resource we can fail during maintenance mode # so we can verify recovery does not take place until after maintenance # mode is disabled. failPat = failPat + self.insertMaintenanceDummy(node) # toggle maintenance mode ON, then fail dummy resource. failPat = failPat + self.toggleMaintenanceMode(node, "On") # verify all the resources are now unmanaged if self.verifyResources(node, managedResources, False): verify_unmanaged = True # Toggle maintenance mode OFF, verify dummy is recovered. failPat = failPat + self.toggleMaintenanceMode(node, "Off") # verify all the resources are now managed again if self.verifyResources(node, managedResources, True): verify_managed = True # Remove our maintenance dummy resource. failPat = failPat + self.removeMaintenanceDummy(node) self.CM.cluster_stable() if failPat != "": return self.failure("Unmatched patterns: %s" % (failPat)) elif verify_unmanaged is False: return self.failure("Failed to verify resources became unmanaged during maintenance mode") elif verify_managed is False: return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode") return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"Updating failcount for %s" % self.rid, r"pengine.*: Recover %s\s*\(.*\)" % self.rid, r"Unknown operation: fail", r"(ERROR|error): sending stonithRA op to stonithd failed.", self.templates["Pat:RscOpOK"] % (self.rid, ("%s_%d" % (self.action, self.interval))), r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval), ] AllTestClasses.append(MaintenanceMode) class ResourceRecover(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "ResourceRecover" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.max = 30 self.rid = None self.rid_alt = None #self.is_unsafe = 1 self.benchmark = 1 # these are the values used for the new LRM API call self.action = "asyncmon" self.interval = 0 def __call__(self, node): '''Perform the 'ResourceRecover' test. ''' self.incr("calls") ret = self.startall(None) if not ret: return self.failure("Setup failed") resourcelist = self.CM.active_resources(node) # if there are no resourcelist, return directly if len(resourcelist) == 0: self.logger.log("No active resources on %s" % node) return self.skipped() self.rid = self.Env.RandomGen.choice(resourcelist) self.rid_alt = self.rid rsc = None (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self.CM, line) if tmp.id == self.rid: rsc = tmp # Handle anonymous clones that get renamed self.rid = rsc.clone_id break if not rsc: return self.failure("Could not find %s in the resource list" % self.rid) self.debug("Shooting %s aka. %s" % (rsc.clone_id, rsc.id)) pats = [] pats.append(r"pengine.*:\s+warning:.*Processing failed op %s for (%s|%s) on" % (self.action, rsc.id, rsc.clone_id)) if rsc.managed(): pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0")) if rsc.unique(): pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "start_0")) else: # Anonymous clones may get restarted with a different clone number pats.append(self.templates["Pat:RscOpOK"] % (".*", "start_0")) watch = self.create_watch(pats, 60) watch.setwatch() self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node)) self.set_timer("recover") watch.lookforall() self.log_timer("recover") self.CM.cluster_stable() recovered = self.CM.ResourceLocation(self.rid) if watch.unmatched: return self.failure("Patterns not found: %s" % repr(watch.unmatched)) elif rsc.unique() and len(recovered) > 1: return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered))) elif len(recovered) > 0: self.debug("%s is running on: %s" % (self.rid, repr(recovered))) elif rsc.managed(): return self.failure("%s was not recovered and is inactive" % self.rid) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"Updating failcount for %s" % self.rid, r"pengine.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt), r"Unknown operation: fail", r"(ERROR|error): sending stonithRA op to stonithd failed.", self.templates["Pat:RscOpOK"] % (self.rid, ("%s_%d" % (self.action, self.interval))), r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval), ] AllTestClasses.append(ResourceRecover) class ComponentFail(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "ComponentFail" # TODO make this work correctly in docker. self.is_docker_unsafe = 1 self.startall = SimulStartLite(cm) self.complist = cm.Components() self.patterns = [] self.okerrpatterns = [] self.is_unsafe = 1 def __call__(self, node): '''Perform the 'ComponentFail' test. ''' self.incr("calls") self.patterns = [] self.okerrpatterns = [] # start all nodes ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.CM.cluster_stable(self.Env["StableTime"]): return self.failure("Setup failed - unstable") node_is_dc = self.CM.is_node_dc(node, None) # select a component to kill chosen = self.Env.RandomGen.choice(self.complist) while chosen.dc_only == 1 and node_is_dc == 0: chosen = self.Env.RandomGen.choice(self.complist) self.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot)) self.incr(chosen.name) if chosen.name != "aisexec" and chosen.name != "corosync": if self.Env["Name"] != "crm-lha" or chosen.name != "pengine": self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name)) self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name)) self.patterns.extend(chosen.pats) if node_is_dc: self.patterns.extend(chosen.dc_pats) # In an ideal world, this next stuff should be in the "chosen" object as a member function if self.Env["Name"] == "crm-lha" and chosen.triggersreboot: # Make sure the node goes down and then comes back up if it should reboot... for other in self.Env["nodes"]: if other != node: self.patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node))) self.patterns.append(self.templates["Pat:Slave_started"] % node) self.patterns.append(self.templates["Pat:Local_started"] % node) if chosen.dc_only: # Sometimes these will be in the log, and sometimes they won't... self.okerrpatterns.append("%s .*Process %s:.* exited" % (node, chosen.name)) self.okerrpatterns.append("%s .*I_ERROR.*crmdManagedChildDied" % node) self.okerrpatterns.append("%s .*The %s subsystem terminated unexpectedly" % (node, chosen.name)) self.okerrpatterns.append("(ERROR|error): Client .* exited with return code") else: # Sometimes this won't be in the log... self.okerrpatterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name)) self.okerrpatterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name)) self.okerrpatterns.append(self.templates["Pat:ChildExit"]) if chosen.name == "stonith": # Ignore actions for STONITH resources (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rclass == "stonith": self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id) # supply a copy so self.patterns doesn't end up empty tmpPats = [] tmpPats.extend(self.patterns) self.patterns.extend(chosen.badnews_ignore) # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status stonithPats = [] stonithPats.append(self.templates["Pat:Fencing_ok"] % node) stonith = self.create_watch(stonithPats, 0) stonith.setwatch() # set the watch for stable watch = self.create_watch( tmpPats, self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"]) watch.setwatch() # kill the component chosen.kill(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.debug("Waiting for any STONITHd node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600) self.debug("Waiting for the cluster to re-stabilize with all nodes") self.CM.cluster_stable(self.Env["StartTime"]) self.debug("Checking if %s was shot" % node) shot = stonith.look(60) if shot: self.debug("Found: " + repr(shot)) self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) if self.Env["at-boot"] == 0: self.CM.ShouldBeStatus[node] = "down" # If fencing occurred, chances are many (if not all) the expected logs # will not be sent - or will be lost when the node reboots return self.success() # check for logs indicating a graceful recovery matched = watch.lookforall(allow_multiple_matches=1) if watch.unmatched: self.logger.log("Patterns not found: " + repr(watch.unmatched)) self.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self.CM.cluster_stable(self.Env["StartTime"]) if not matched: return self.failure("Didn't find all expected %s patterns" % chosen.name) elif not is_stable: return self.failure("Cluster did not become stable after killing %s" % chosen.name) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' # Note that okerrpatterns refers to the last time we ran this test # The good news is that this works fine for us... self.okerrpatterns.extend(self.patterns) return self.okerrpatterns AllTestClasses.append(ComponentFail) class SplitBrainTest(CTSTest): '''It is used to test split-brain. when the path between the two nodes break check the two nodes both take over the resource''' def __init__(self,cm): CTSTest.__init__(self,cm) self.name = "SplitBrain" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.is_experimental = 1 def isolate_partition(self, partition): other_nodes = [] other_nodes.extend(self.Env["nodes"]) for node in partition: try: other_nodes.remove(node) except ValueError: self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]) + " from " +repr(partition)) if len(other_nodes) == 0: return 1 self.debug("Creating partition: " + repr(partition)) self.debug("Everyone else: " + repr(other_nodes)) for node in partition: if not self.CM.isolate_node(node, other_nodes): self.logger.log("Could not isolate %s" % node) return 0 return 1 def heal_partition(self, partition): other_nodes = [] other_nodes.extend(self.Env["nodes"]) for node in partition: try: other_nodes.remove(node) except ValueError: self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"])) if len(other_nodes) == 0: return 1 self.debug("Healing partition: " + repr(partition)) self.debug("Everyone else: " + repr(other_nodes)) for node in partition: self.CM.unisolate_node(node, other_nodes) def __call__(self, node): '''Perform split-brain test''' self.incr("calls") self.passed = 1 partitions = {} ret = self.startall(None) if not ret: return self.failure("Setup failed") while 1: # Retry until we get multiple partitions partitions = {} p_max = len(self.Env["nodes"]) for node in self.Env["nodes"]: p = self.Env.RandomGen.randint(1, p_max) if not p in partitions: partitions[p] = [] partitions[p].append(node) p_max = len(partitions.keys()) if p_max > 1: break # else, try again self.debug("Created %d partitions" % p_max) for key in list(partitions.keys()): self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key])) # Disabling STONITH to reduce test complexity for now self.rsh(node, "crm_attribute -V -n stonith-enabled -v false") for key in list(partitions.keys()): self.isolate_partition(partitions[key]) count = 30 while count > 0: if len(self.CM.find_partitions()) != p_max: time.sleep(10) else: break else: self.failure("Expected partitions were not created") # Target number of partitions formed - wait for stability if not self.CM.cluster_stable(): self.failure("Partitioned cluster not stable") # Now audit the cluster state self.CM.partitions_expected = p_max if not self.audit(): self.failure("Audits failed") self.CM.partitions_expected = 1 # And heal them again for key in list(partitions.keys()): self.heal_partition(partitions[key]) # Wait for a single partition to form count = 30 while count > 0: if len(self.CM.find_partitions()) != 1: time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not reform") # Wait for it to have the right number of members count = 30 while count > 0: members = [] partitions = self.CM.find_partitions() if len(partitions) > 0: members = partitions[0].split() if len(members) != len(self.Env["nodes"]): time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not completely reform") # Wait up to 20 minutes - the delay is more preferable than # trying to continue with in a messed up state if not self.CM.cluster_stable(1200): self.failure("Reformed cluster not stable") if self.Env["continue"] == 1: answer = "Y" else: try: answer = raw_input('Continue? [nY]') except EOFError, e: answer = "n" if answer and answer == "n": raise ValueError("Reformed cluster not stable") # Turn fencing back on if self.Env["DoFencing"]: self.rsh(node, "crm_attribute -V -D -n stonith-enabled") self.CM.cluster_stable() if self.passed: return self.success() return self.failure("See previous errors") def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [ r"Another DC detected:", r"(ERROR|error).*: .*Application of an update diff failed", r"crmd.*:.*not in our membership list", r"CRIT:.*node.*returning after partition", ] def is_applicable(self): if not self.is_applicable_common(): return 0 return len(self.Env["nodes"]) > 2 AllTestClasses.append(SplitBrainTest) class Reattach(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "Reattach" self.startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) self.is_unsafe = 0 # Handled by canrunnow() def _is_managed(self, node): is_managed = self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -Q -G -d true", 1) is_managed = is_managed[:-1] # Strip off the newline return is_managed == "true" def _set_unmanaged(self, node): self.debug("Disable resource management") self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false") def _set_managed(self, node): self.debug("Re-enable resource management") self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D") def setup(self, node): attempt = 0 if not self.startall(None): return None # Make sure we are really _really_ stable and that all # resources, including those that depend on transient node # attributes, are started while not self.CM.cluster_stable(double_check=True): if attempt < 5: attempt += 1 self.debug("Not stable yet, re-testing") else: self.logger.log("Cluster is not stable") return None return 1 def teardown(self, node): # Make sure 'node' is up start = StartTest(self.CM) start(node) if not self._is_managed(node): self.logger.log("Attempting to re-enable resource management on %s" % node) self._set_managed(node) self.CM.cluster_stable() if not self._is_managed(node): self.logger.log("Could not re-enable resource management") return 0 return 1 def canrunnow(self, node): '''Return TRUE if we can meaningfully run right now''' if self.find_ocfs2_resources(node): self.logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present") return 0 return 1 def __call__(self, node): self.incr("calls") pats = [] # Conveniently, pengine will display this message when disabling management, # even if fencing is not enabled, so we can rely on it. managed = self.create_watch(["Delaying fencing operations"], 60) managed.setwatch() self._set_unmanaged(node) if not managed.lookforall(): self.logger.log("Patterns not found: " + repr(managed.unmatched)) return self.failure("Resource management not disabled") pats = [] pats.append(self.templates["Pat:RscOpOK"] % (".*", "start")) pats.append(self.templates["Pat:RscOpOK"] % (".*", "stop")) pats.append(self.templates["Pat:RscOpOK"] % (".*", "promote")) pats.append(self.templates["Pat:RscOpOK"] % (".*", "demote")) pats.append(self.templates["Pat:RscOpOK"] % (".*", "migrate")) watch = self.create_watch(pats, 60, "ShutdownActivity") watch.setwatch() self.debug("Shutting down the cluster") ret = self.stopall(None) if not ret: self._set_managed(node) return self.failure("Couldn't shut down the cluster") self.debug("Bringing the cluster back up") ret = self.startall(None) time.sleep(5) # allow ping to update the CIB if not ret: self._set_managed(node) return self.failure("Couldn't restart the cluster") if self.local_badnews("ResourceActivity:", watch): self._set_managed(node) return self.failure("Resources stopped or started during cluster restart") watch = self.create_watch(pats, 60, "StartupActivity") watch.setwatch() # Re-enable resource management (and verify it happened). self._set_managed(node) self.CM.cluster_stable() if not self._is_managed(node): return self.failure("Could not re-enable resource management") # Ignore actions for STONITH resources ignore = [] (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rclass == "stonith": self.debug("Ignoring start actions for %s" % r.id) ignore.append(self.templates["Pat:RscOpOK"] % (r.id, "start_0")) if self.local_badnews("ResourceActivity:", watch, ignore): return self.failure("Resources stopped or started after resource management was re-enabled") return ret def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"resources were active at shutdown", ] def is_applicable(self): if self.Env["Name"] == "crm-lha": return None return 1 AllTestClasses.append(Reattach) class SpecialTest1(CTSTest): '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SpecialTest1" self.startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) def __call__(self, node): '''Perform the 'SpecialTest1' test for Andrew. ''' self.incr("calls") # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Could not stop all nodes") # Test config recovery when the other nodes come up self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*") # Start the selected node ret = self.restart1(node) if not ret: return self.failure("Could not start "+node) # Start all remaining nodes ret = self.startall(None) if not ret: return self.failure("Could not start the remaining nodes") return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' # Errors that occur as a result of the CIB being wiped return [ r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed", r"error.*: Resource start-up disabled since no STONITH resources have been defined", r"error.*: Either configure some or disable STONITH with the stonith-enabled option", r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity", ] AllTestClasses.append(SpecialTest1) class HAETest(CTSTest): '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "HAETest" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) self.is_loop = 1 def setup(self, node): # Start all remaining nodes ret = self.startall(None) if not ret: return self.failure("Couldn't start all nodes") return self.success() def teardown(self, node): # Stop everything ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") return self.success() def wait_on_state(self, node, resource, expected_clones, attempts=240): while attempts > 0: active = 0 (rc, lines) = self.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None) # Hack until crm_resource does the right thing if rc == 0 and lines: active = len(lines) if len(lines) == expected_clones: return 1 elif rc == 1: self.debug("Resource %s is still inactive" % resource) elif rc == 234: self.logger.log("Unknown resource %s" % resource) return 0 elif rc == 246: self.logger.log("Cluster is inactive") return 0 elif rc != 0: self.logger.log("Call to crm_resource failed, rc=%d" % rc) return 0 else: self.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones)) attempts -= 1 time.sleep(1) return 0 def find_dlm(self, node): self.r_dlm = None (rc, lines) = self.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rtype == "controld" and r.parent != "NA": self.debug("Found dlm: %s" % self.r_dlm) self.r_dlm = r.parent return 1 return 0 def find_hae_resources(self, node): self.r_dlm = None self.r_o2cb = None self.r_ocfs2 = [] if self.find_dlm(node): self.find_ocfs2_resources(node) def is_applicable(self): if not self.is_applicable_common(): return 0 if self.Env["Schema"] == "hae": return 1 return None class HAERoleTest(HAETest): def __init__(self, cm): '''Lars' mount/unmount test for the HA extension. ''' HAETest.__init__(self,cm) self.name = "HAERoleTest" def change_state(self, node, resource, target): rc = self.rsh(node, "crm_resource -V -r %s -p target-role -v %s --meta" % (resource, target)) return rc def __call__(self, node): self.incr("calls") lpc = 0 failed = 0 delay = 2 done = time.time() + self.Env["loop-minutes"]*60 self.find_hae_resources(node) clone_max = len(self.Env["nodes"]) while time.time() <= done and not failed: lpc = lpc + 1 self.change_state(node, self.r_dlm, "Stopped") if not self.wait_on_state(node, self.r_dlm, 0): self.failure("%s did not go down correctly" % self.r_dlm) failed = lpc self.change_state(node, self.r_dlm, "Started") if not self.wait_on_state(node, self.r_dlm, clone_max): self.failure("%s did not come up correctly" % self.r_dlm) failed = lpc if not self.wait_on_state(node, self.r_o2cb, clone_max): self.failure("%s did not come up correctly" % self.r_o2cb) failed = lpc for fs in self.r_ocfs2: if not self.wait_on_state(node, fs, clone_max): self.failure("%s did not come up correctly" % fs) failed = lpc if failed: return self.failure("iteration %d failed" % failed) return self.success() AllTestClasses.append(HAERoleTest) class HAEStandbyTest(HAETest): '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): HAETest.__init__(self,cm) self.name = "HAEStandbyTest" def change_state(self, node, resource, target): rc = self.rsh(node, "crm_standby -V -l reboot -v %s" % (target)) return rc def __call__(self, node): self.incr("calls") lpc = 0 failed = 0 done = time.time() + self.Env["loop-minutes"]*60 self.find_hae_resources(node) clone_max = len(self.Env["nodes"]) while time.time() <= done and not failed: lpc = lpc + 1 self.change_state(node, self.r_dlm, "true") if not self.wait_on_state(node, self.r_dlm, clone_max-1): self.failure("%s did not go down correctly" % self.r_dlm) failed = lpc self.change_state(node, self.r_dlm, "false") if not self.wait_on_state(node, self.r_dlm, clone_max): self.failure("%s did not come up correctly" % self.r_dlm) failed = lpc if not self.wait_on_state(node, self.r_o2cb, clone_max): self.failure("%s did not come up correctly" % self.r_o2cb) failed = lpc for fs in self.r_ocfs2: if not self.wait_on_state(node, fs, clone_max): self.failure("%s did not come up correctly" % fs) failed = lpc if failed: return self.failure("iteration %d failed" % failed) return self.success() AllTestClasses.append(HAEStandbyTest) class NearQuorumPointTest(CTSTest): ''' This test brings larger clusters near the quorum point (50%). In addition, it will test doing starts and stops at the same time. Here is how I think it should work: - loop over the nodes and decide randomly which will be up and which will be down Use a 50% probability for each of up/down. - figure out what to do to get into that state from the current state - in parallel, bring up those going up and bring those going down. ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "NearQuorumPoint" def __call__(self, dummy): '''Perform the 'NearQuorumPoint' test. ''' self.incr("calls") startset = [] stopset = [] stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint") #decide what to do with each node for node in self.Env["nodes"]: action = self.Env.RandomGen.choice(["start","stop"]) #action = self.Env.RandomGen.choice(["start","stop","no change"]) if action == "start" : startset.append(node) elif action == "stop" : stopset.append(node) self.debug("start nodes:" + repr(startset)) self.debug("stop nodes:" + repr(stopset)) #add search patterns watchpats = [ ] for node in stopset: if self.CM.ShouldBeStatus[node] == "up": watchpats.append(self.templates["Pat:We_stopped"] % node) for node in startset: if self.CM.ShouldBeStatus[node] == "down": #watchpats.append(self.templates["Pat:Slave_started"] % node) watchpats.append(self.templates["Pat:Local_started"] % node) else: for stopping in stopset: if self.CM.ShouldBeStatus[stopping] == "up": watchpats.append(self.templates["Pat:They_stopped"] % (node, self.CM.key_for_node(stopping))) if len(watchpats) == 0: return self.skipped() if len(startset) != 0: watchpats.append(self.templates["Pat:DC_IDLE"]) watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) watch.setwatch() #begin actions for node in stopset: if self.CM.ShouldBeStatus[node] == "up": self.CM.StopaCMnoBlock(node) for node in startset: if self.CM.ShouldBeStatus[node] == "down": self.CM.StartaCMnoBlock(node) #get the result if watch.lookforall(): self.CM.cluster_stable() self.CM.fencing_cleanup("NearQuorumPoint", stonith) return self.success() self.logger.log("Warn: Patterns not found: " + repr(watch.unmatched)) #get the "bad" nodes upnodes = [] for node in stopset: if self.CM.StataCM(node) == 1: upnodes.append(node) downnodes = [] for node in startset: if self.CM.StataCM(node) == 0: downnodes.append(node) self.CM.fencing_cleanup("NearQuorumPoint", stonith) if upnodes == [] and downnodes == []: self.CM.cluster_stable() # Make sure they're completely down with no residule for node in stopset: self.rsh(node, self.templates["StopCmd"]) return self.success() if len(upnodes) > 0: self.logger.log("Warn: Unstoppable nodes: " + repr(upnodes)) if len(downnodes) > 0: self.logger.log("Warn: Unstartable nodes: " + repr(downnodes)) return self.failure() def is_applicable(self): if self.Env["Name"] == "crm-cman": return None return 1 AllTestClasses.append(NearQuorumPointTest) class RollingUpgradeTest(CTSTest): '''Perform a rolling upgrade of the cluster''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "RollingUpgrade" self.start = StartTest(cm) self.stop = StopTest(cm) self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) def setup(self, node): # Start all remaining nodes ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") for node in self.Env["nodes"]: if not self.downgrade(node, None): return self.failure("Couldn't downgrade %s" % node) ret = self.startall(None) if not ret: return self.failure("Couldn't start all nodes") return self.success() def teardown(self, node): # Stop everything ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") for node in self.Env["nodes"]: if not self.upgrade(node, None): return self.failure("Couldn't upgrade %s" % node) return self.success() def install(self, node, version, start=1, flags="--force"): target_dir = "/tmp/rpm-%s" % version src_dir = "%s/%s" % (self.Env["rpm-dir"], version) self.logger.log("Installing %s on %s with %s" % (version, node, flags)) if not self.stop(node): return self.failure("stop failure: "+node) rc = self.rsh(node, "mkdir -p %s" % target_dir) rc = self.rsh(node, "rm -f %s/*.rpm" % target_dir) (rc, lines) = self.rsh(node, "ls -1 %s/*.rpm" % src_dir, None) for line in lines: line = line[:-1] rc = self.rsh.cp("%s" % (line), "%s:%s/" % (node, target_dir)) rc = self.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir)) if start and not self.start(node): return self.failure("start failure: "+node) return self.success() def upgrade(self, node, start=1): return self.install(node, self.Env["current-version"], start) def downgrade(self, node, start=1): return self.install(node, self.Env["previous-version"], start, "--force --nodeps") def __call__(self, node): '''Perform the 'Rolling Upgrade' test. ''' self.incr("calls") for node in self.Env["nodes"]: if self.upgrade(node): return self.failure("Couldn't upgrade %s" % node) self.CM.cluster_stable() return self.success() def is_applicable(self): if not self.is_applicable_common(): return None if not "rpm-dir" in self.Env.keys(): return None if not "current-version" in self.Env.keys(): return None if not "previous-version" in self.Env.keys(): return None return 1 # Register RestartTest as a good test to run AllTestClasses.append(RollingUpgradeTest) class BSC_AddResource(CTSTest): '''Add a resource to the cluster''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "AddResource" self.resource_offset = 0 self.cib_cmd = """cibadmin -C -o %s -X '%s' """ def __call__(self, node): self.incr("calls") self.resource_offset = self.resource_offset + 1 r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset) start_pat = "crmd.*%s_start_0.*confirmed.*ok" patterns = [] patterns.append(start_pat % r_id) watch = self.create_watch(patterns, self.Env["DeadTime"]) watch.setwatch() ip = self.NextIP() if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip): return self.failure("Make resource %s failed" % r_id) failed = 0 watch_result = watch.lookforall() if watch.unmatched: for regex in watch.unmatched: self.logger.log ("Warn: Pattern not found: %s" % (regex)) failed = 1 if failed: return self.failure("Resource pattern(s) not found") if not self.CM.cluster_stable(self.Env["DeadTime"]): return self.failure("Unstable cluster") return self.success() def NextIP(self): ip = self.Env["IPBase"] if ":" in ip: fields = ip.rpartition(":") fields[2] = str(hex(int(fields[2], 16)+1)) print(str(hex(int(f[2], 16)+1))) else: fields = ip.rpartition('.') fields[2] = str(int(fields[2])+1) ip = fields[0] + fields[1] + fields[3]; self.Env["IPBase"] = ip return ip.strip() def make_ip_resource(self, node, id, rclass, type, ip): self.logger.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node)) rsc_xml=""" """ % (id, rclass, type, id, id, ip) node_constraint = """ """ % (id, id, id, id, node) rc = 0 (rc, lines) = self.rsh(node, self.cib_cmd % ("constraints", node_constraint), None) if rc != 0: self.logger.log("Constraint creation failed: %d" % rc) return None (rc, lines) = self.rsh(node, self.cib_cmd % ("resources", rsc_xml), None) if rc != 0: self.logger.log("Resource creation failed: %d" % rc) return None return 1 def is_applicable(self): if self.Env["DoBSC"]: return 1 return None AllTestClasses.append(BSC_AddResource) class SimulStopLite(CTSTest): '''Stop any active nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SimulStopLite" def __call__(self, dummy): '''Perform the 'SimulStopLite' setup work. ''' self.incr("calls") self.debug("Setup: " + self.name) # We ignore the "node" parameter... watchpats = [ ] for node in self.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "up": self.incr("WasStarted") watchpats.append(self.templates["Pat:We_stopped"] % node) #if self.Env["use_logd"]: # watchpats.append(self.templates["Pat:Logd_stopped"] % node) if len(watchpats) == 0: self.CM.clear_all_caches() return self.success() # Stop all the nodes - at about the same time... watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) watch.setwatch() self.set_timer() for node in self.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "up": self.CM.StopaCMnoBlock(node) if watch.lookforall(): self.CM.clear_all_caches() # Make sure they're completely down with no residule for node in self.Env["nodes"]: self.rsh(node, self.templates["StopCmd"]) return self.success() did_fail = 0 up_nodes = [] for node in self.Env["nodes"]: if self.CM.StataCM(node) == 1: did_fail = 1 up_nodes.append(node) if did_fail: return self.failure("Active nodes exist: " + repr(up_nodes)) self.logger.log("Warn: All nodes stopped but CTS didnt detect: " + repr(watch.unmatched)) self.CM.clear_all_caches() return self.failure("Missing log message: "+repr(watch.unmatched)) def is_applicable(self): '''SimulStopLite is a setup test and never applicable''' return 0 class SimulStartLite(CTSTest): '''Start any stopped nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "SimulStartLite" def __call__(self, dummy): '''Perform the 'SimulStartList' setup work. ''' self.incr("calls") self.debug("Setup: " + self.name) # We ignore the "node" parameter... node_list = [] for node in self.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "down": self.incr("WasStopped") node_list.append(node) self.set_timer() while len(node_list) > 0: # Repeat until all nodes come up watchpats = [ ] uppat = self.templates["Pat:Slave_started"] if self.CM.upcount() == 0: uppat = self.templates["Pat:Local_started"] watchpats.append(self.templates["Pat:DC_IDLE"]) for node in node_list: watchpats.append(uppat % node) watchpats.append(self.templates["Pat:InfraUp"] % node) watchpats.append(self.templates["Pat:PacemakerUp"] % node) # Start all the nodes - at about the same time... watch = self.create_watch(watchpats, self.Env["DeadTime"]+10) watch.setwatch() stonith = self.CM.prepare_fencing_watcher(self.name) for node in node_list: self.CM.StartaCMnoBlock(node) watch.lookforall() node_list = self.CM.fencing_cleanup(self.name, stonith) if node_list == None: return self.failure("Cluster did not stabilize") # Remove node_list messages from watch.unmatched for node in node_list: self.logger.debug("Dealing with stonith operations for %s" % repr(node_list)) if watch.unmatched: try: watch.unmatched.remove(uppat % node) except: self.debug("Already matched: %s" % (uppat % node)) try: watch.unmatched.remove(self.templates["Pat:InfraUp"] % node) except: self.debug("Already matched: %s" % (self.templates["Pat:InfraUp"] % node)) try: watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node) except: self.debug("Already matched: %s" % (self.templates["Pat:PacemakerUp"] % node)) if watch.unmatched: for regex in watch.unmatched: self.logger.log ("Warn: Startup pattern not found: %s" %(regex)) if not self.CM.cluster_stable(): return self.failure("Cluster did not stabilize") did_fail = 0 unstable = [] for node in self.Env["nodes"]: if self.CM.StataCM(node) == 0: did_fail = 1 unstable.append(node) if did_fail: return self.failure("Unstarted nodes exist: " + repr(unstable)) unstable = [] for node in self.Env["nodes"]: if not self.CM.node_stable(node): did_fail = 1 unstable.append(node) if did_fail: return self.failure("Unstable cluster nodes exist: " + repr(unstable)) return self.success() def is_applicable(self): '''SimulStartLite is a setup test and never applicable''' return 0 def TestList(cm, audits): result = [] for testclass in AllTestClasses: bound_test = testclass(cm) if bound_test.is_applicable(): bound_test.Audits = audits result.append(bound_test) return result class RemoteLXC(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = "RemoteLXC" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.num_containers = 2 self.is_container = 1 self.is_docker_unsafe = 1 self.failed = 0 self.fail_string = "" def start_lxc_simple(self, node): # restore any artifacts laying around from a previous test. self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -R &>/dev/null") # generate the containers, put them in the config, add some resources to them pats = [ ] watch = self.create_watch(pats, 120) watch.setwatch() pats.append(self.templates["Pat:RscOpOK"] % ("lxc1", "start_0")) pats.append(self.templates["Pat:RscOpOK"] % ("lxc2", "start_0")) pats.append(self.templates["Pat:RscOpOK"] % ("lxc-ms", "start_0")) pats.append(self.templates["Pat:RscOpOK"] % ("lxc-ms", "promote_0")) self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers) self.set_timer("remoteSimpleInit") watch.lookforall() self.log_timer("remoteSimpleInit") if watch.unmatched: self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched)) self.failed = 1 def cleanup_lxc_simple(self, node): pats = [ ] # if the test failed, attempt to clean up the cib and libvirt environment # as best as possible if self.failed == 1: # restore libvirt and cib self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -R &>/dev/null") self.rsh(node, "crm_resource -C -r container1 &>/dev/null") self.rsh(node, "crm_resource -C -r container2 &>/dev/null") self.rsh(node, "crm_resource -C -r lxc1 &>/dev/null") self.rsh(node, "crm_resource -C -r lxc2 &>/dev/null") self.rsh(node, "crm_resource -C -r lxc-ms &>/dev/null") time.sleep(20) return watch = self.create_watch(pats, 120) watch.setwatch() pats.append(self.templates["Pat:RscOpOK"] % ("container1", "stop_0")) pats.append(self.templates["Pat:RscOpOK"] % ("container2", "stop_0")) self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null") self.set_timer("remoteSimpleCleanup") watch.lookforall() self.log_timer("remoteSimpleCleanup") if watch.unmatched: self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched)) self.failed = 1 # cleanup libvirt self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -R &>/dev/null") def __call__(self, node): '''Perform the 'RemoteLXC' test. ''' self.incr("calls") ret = self.startall(None) if not ret: return self.failure("Setup failed, start all nodes failed.") rc = self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null") if rc == 1: self.log("Environment test for lxc support failed.") return self.skipped() self.start_lxc_simple(node) self.cleanup_lxc_simple(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.failed == 1: return self.failure(self.fail_string) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ r"Updating failcount for ping", r"pengine.*: Recover (ping|lxc-ms|container)\s*\(.*\)", # The orphaned lxc-ms resource causes an expected transition error # that is a result of the pengine not having knowledge that the # ms resource used to be a clone. As a result it looks like that # resource is running in multiple locations when it shouldn't... But in # this instance we know why this error is occurring and that it is expected. r"Calculated Transition .* /var/lib/pacemaker/pengine/pe-error", r"Resource lxc-ms .* is active on 2 nodes attempting recovery", r"Unknown operation: fail", r"(ERROR|error): sending stonithRA op to stonithd failed.", ] AllTestClasses.append(RemoteLXC) class RemoteDriver(CTSTest): def __init__(self, cm): CTSTest.__init__(self,cm) self.name = self.__class__.__name__ self.is_docker_unsafe = 1 self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.stop = StopTest(cm) self.remote_rsc = "remote-rsc" self.cib_cmd = """cibadmin -C -o %s -X '%s' """ self.reset() def reset(self): self.pcmk_started = 0 self.failed = False self.fail_string = "" self.remote_node_added = 0 self.remote_rsc_added = 0 self.remote_use_reconnect_interval = self.Env.RandomGen.choice([True,False]) def fail(self, msg): """ Mark test as failed. """ self.failed = True # Always log the failure. self.logger.log(msg) # Use first failure as test status, as it's likely to be most useful. if not self.fail_string: self.fail_string = msg def get_othernode(self, node): for othernode in self.Env["nodes"]: if othernode == node: # we don't want to try and use the cib that we just shutdown. # find a cluster node that is not our soon to be remote-node. continue else: return othernode def del_rsc(self, node, rsc): othernode = self.get_othernode(node) rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc)) if rc != 0: self.fail("Removal of resource '%s' failed" % rsc) def add_rsc(self, node, rsc_xml): othernode = self.get_othernode(node) rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml)) if rc != 0: self.fail("resource creation failed") def add_primitive_rsc(self, node): rsc_xml = """ """ % (self.remote_rsc) self.add_rsc(node, rsc_xml) if not self.failed: self.remote_rsc_added = 1 def add_connection_rsc(self, node): if self.remote_use_reconnect_interval: # use reconnect interval and make sure to set cluster-recheck-interval as well. rsc_xml = """ """ % (self.remote_node, node) self.rsh(self.get_othernode(node), self.templates["SetCheckInterval"] % ("45s")) else: # not using reconnect interval rsc_xml = """ """ % (self.remote_node, node) self.add_rsc(node, rsc_xml) if not self.failed: self.remote_node_added = 1 def stop_pcmk_remote(self, node): # disable pcmk remote for i in range(10): rc = self.rsh(node, "service pacemaker_remote stop") if rc != 0: time.sleep(6) else: break def start_pcmk_remote(self, node): for i in range(10): rc = self.rsh(node, "service pacemaker_remote start") if rc != 0: time.sleep(6) else: self.pcmk_started = 1 break def start_metal(self, node): pcmk_started = 0 # make sure the resource doesn't already exist for some reason self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_rsc)) self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_node)) if not self.stop(node): self.fail("Failed to shutdown cluster node %s" % node) return self.start_pcmk_remote(node) if self.pcmk_started == 0: self.fail("Failed to start pacemaker_remote on node %s" % node) return # convert node to baremetal node now that it has shutdow the cluster stack pats = [ ] watch = self.create_watch(pats, 120) watch.setwatch() pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "start")) pats.append(self.templates["Pat:DC_IDLE"]) self.add_connection_rsc(node) self.set_timer("remoteMetalInit") watch.lookforall() self.log_timer("remoteMetalInit") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) def migrate_connection(self, node): if self.failed: return pats = [ ] pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "migrate_to")) pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "migrate_from")) pats.append(self.templates["Pat:DC_IDLE"]) watch = self.create_watch(pats, 120) watch.setwatch() (rc, lines) = self.rsh(node, "crm_resource -M -r %s" % (self.remote_node), None) if rc != 0: self.fail("failed to move remote node connection resource") return self.set_timer("remoteMetalMigrate") watch.lookforall() self.log_timer("remoteMetalMigrate") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) return def fail_rsc(self, node): if self.failed: return watchpats = [ ] watchpats.append(self.templates["Pat:RscRemoteOpOK"] % (self.remote_rsc, "stop", self.remote_node)) watchpats.append(self.templates["Pat:RscRemoteOpOK"] % (self.remote_rsc, "start", self.remote_node)) watchpats.append(self.templates["Pat:DC_IDLE"]) watch = self.create_watch(watchpats, 120) watch.setwatch() self.debug("causing dummy rsc to fail.") rc = self.rsh(node, "rm -f /var/run/resource-agents/Dummy*") self.set_timer("remoteRscFail") watch.lookforall() self.log_timer("remoteRscFail") if watch.unmatched: self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched) def fail_connection(self, node): if self.failed: return watchpats = [ ] watchpats.append(self.templates["Pat:FenceOpOK"] % self.remote_node) watchpats.append(self.templates["Pat:NodeFenced"] % self.remote_node) watch = self.create_watch(watchpats, 120) watch.setwatch() # force stop the pcmk remote daemon. this will result in fencing self.debug("Force stopped active remote node") self.stop_pcmk_remote(node) self.debug("Waiting for remote node to be fenced.") self.set_timer("remoteMetalFence") watch.lookforall() self.log_timer("remoteMetalFence") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) return self.debug("Waiting for the remote node to come back up") self.CM.ns.WaitForNodeToComeUp(node, 120); pats = [ ] watch = self.create_watch(pats, 240) watch.setwatch() pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "start")) if self.remote_rsc_added == 1: pats.append(self.templates["Pat:RscRemoteOpOK"] % (self.remote_rsc, "start", self.remote_node)) # start the remote node again watch it integrate back into cluster. self.start_pcmk_remote(node) if self.pcmk_started == 0: self.fail("Failed to start pacemaker_remote on node %s" % node) return self.debug("Waiting for remote node to rejoin cluster after being fenced.") self.set_timer("remoteMetalRestart") watch.lookforall() self.log_timer("remoteMetalRestart") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) return def add_dummy_rsc(self, node): if self.failed: return # verify we can put a resource on the remote node pats = [ ] watch = self.create_watch(pats, 120) watch.setwatch() pats.append(self.templates["Pat:RscRemoteOpOK"] % (self.remote_rsc, "start", self.remote_node)) pats.append(self.templates["Pat:DC_IDLE"]) # Add a resource that must live on remote-node self.add_primitive_rsc(node) # force that rsc to prefer the remote node. (rc, line) = self.CM.rsh(node, "crm_resource -M -r %s -N %s -f" % (self.remote_rsc, self.remote_node), None) if rc != 0: self.fail("Failed to place remote resource on remote node.") return self.set_timer("remoteMetalRsc") watch.lookforall() self.log_timer("remoteMetalRsc") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) def test_attributes(self, node): if self.failed: return # This verifies permanent attributes can be set on a remote-node. It also # verifies the remote-node can edit it's own cib node section remotely. (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % (self.remote_node), None) if rc != 0: self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line)) return (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -Q -N %s" % (self.remote_node), None) if rc != 0: self.fail("Failed to get remote-node attribute") return (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % (self.remote_node), None) if rc != 0: self.fail("Failed to delete remote-node attribute") return def cleanup_metal(self, node): if self.pcmk_started == 0: return pats = [ ] watch = self.create_watch(pats, 120) watch.setwatch() if self.remote_rsc_added == 1: pats.append(self.templates["Pat:RscOpOK"] % (self.remote_rsc, "stop")) if self.remote_node_added == 1: pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "stop")) self.set_timer("remoteMetalCleanup") if self.remote_use_reconnect_interval: self.debug("Cleaning up re-check interval") self.rsh(self.get_othernode(node), self.templates["ClearCheckInterval"]) if self.remote_rsc_added == 1: # Remove dummy resource added for remote node tests self.debug("Cleaning up dummy rsc put on remote node") self.rsh(node, "crm_resource -U -r %s" % self.remote_rsc) self.del_rsc(node, self.remote_rsc) if self.remote_node_added == 1: # Remove remote node's connection resource self.debug("Cleaning up remote node connection resource") self.rsh(node, "crm_resource -U -r %s" % (self.remote_node)) self.del_rsc(node, self.remote_node) watch.lookforall() self.log_timer("remoteMetalCleanup") if watch.unmatched: self.fail("Unmatched patterns: %s" % watch.unmatched) self.stop_pcmk_remote(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.remote_node_added == 1: # Remove remote node itself self.debug("Cleaning up node entry for remote node") self.rsh(self.get_othernode(node), "crm_node --force --remove %s" % self.remote_node) def setup_env(self, node): self.remote_node = "remote_%s" % (node) # we are assuming if all nodes have a key, that it is # the right key... If any node doesn't have a remote # key, we regenerate it everywhere. if self.rsh.exists_on_all("/etc/pacemaker/authkey", self.Env["nodes"]): return # create key locally (handle, keyfile) = tempfile.mkstemp(".cts") os.close(handle) devnull = open(os.devnull, 'wb') subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"], stdout=devnull, stderr=devnull) devnull.close() # sync key throughout the cluster for node in self.Env["nodes"]: self.rsh(node, "mkdir -p --mode=0750 /etc/pacemaker") self.rsh.cp(keyfile, "root@%s:/etc/pacemaker/authkey" % node) self.rsh(node, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey") self.rsh(node, "chmod 0640 /etc/pacemaker/authkey") os.unlink(keyfile) def is_applicable(self): if not self.is_applicable_common(): return False for node in self.Env["nodes"]: rc = self.rsh(node, "type pacemaker_remoted >/dev/null 2>&1") if rc != 0: return False return True def start_new_test(self, node): self.incr("calls") self.reset() ret = self.startall(None) if not ret: return self.failure("Setup failed, start all nodes failed.") self.setup_env(node) self.start_metal(node) self.add_dummy_rsc(node) def __call__(self, node): return self.failure("This base class is not meant to be called directly.") def errorstoignore(self): '''Return list of errors which should be ignored''' return [ """is running on remote.*which isn't allowed""", """Connection terminated""", """Failed to send remote""", ] # RemoteDriver is just a base class for other tests, so it is not added to AllTestClasses class RemoteBasic(RemoteDriver): def __call__(self, node): '''Perform the 'RemoteBaremetal' test. ''' self.start_new_test(node) self.test_attributes(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() AllTestClasses.append(RemoteBasic) class RemoteStonithd(RemoteDriver): def __call__(self, node): '''Perform the 'RemoteStonithd' test. ''' self.start_new_test(node) self.fail_connection(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() def is_applicable(self): if not RemoteDriver.is_applicable(self): return False if "DoFencing" in self.Env.keys(): return self.Env["DoFencing"] return True def errorstoignore(self): ignore_pats = [ r"Unexpected disconnect on remote-node", - r"crmd.*: error.*: Operation remote_.*_monitor", - r"pengine.*: Recover remote_.*\s*\(.*\)", + r"crmd.*:\s+error.*: Operation remote_.*_monitor", + r"pengine.*:\s+Recover remote_.*\s*\(.*\)", r"Calculated Transition .* /var/lib/pacemaker/pengine/pe-error", r"error.*: Resource .*ocf::.* is active on 2 nodes attempting recovery", ] ignore_pats.extend(RemoteDriver.errorstoignore(self)) return ignore_pats AllTestClasses.append(RemoteStonithd) class RemoteMigrate(RemoteDriver): def __call__(self, node): '''Perform the 'RemoteMigrate' test. ''' self.start_new_test(node) self.migrate_connection(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() AllTestClasses.append(RemoteMigrate) class RemoteRscFailure(RemoteDriver): def __call__(self, node): '''Perform the 'RemoteRscFailure' test. ''' self.start_new_test(node) # This is an important step. We are migrating the connection # before failing the resource. This verifies that the migration # has properly maintained control over the remote-node. self.migrate_connection(node) self.fail_rsc(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self.CM.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() def errorstoignore(self): ignore_pats = [ r"pengine.*: Recover remote-rsc\s*\(.*\)", ] ignore_pats.extend(RemoteDriver.errorstoignore(self)) return ignore_pats AllTestClasses.append(RemoteRscFailure) # vim:ts=4:sw=4:et: