Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py
index 0ccc9c7a0d..bdd996b4d6 100644
--- a/cts/lab/CTStests.py
+++ b/cts/lab/CTStests.py
@@ -1,608 +1,466 @@
""" Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
"""
__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
#
# SPECIAL NOTE:
#
# Tests may NOT implement any cluster-manager-specific code in them.
# EXTEND the ClusterManager object to provide the base capabilities
# the test needs if you need to do something that the current CM classes
# do not. Otherwise you screw up the whole point of the object structure
# in CTS.
#
# Thank you.
#
import re
import time
from stat import *
from pacemaker import BuildOptions
from pacemaker._cts.CTS import NodeStatus
from pacemaker._cts.audits import AuditResource
from pacemaker._cts.tests import *
from pacemaker._cts.timer import Timer
# Registry of test classes, in registration order.  Classes defined further
# down in this file append themselves after their definition.
AllTestClasses = [
    FlipTest,
    RestartTest,
    StonithdTest,
    StartOnebyOne,
    SimulStart,
    SimulStop,
    StopOnebyOne,
    RestartOnebyOne,
    PartialStart,
    StandbyTest,
    MaintenanceMode,
    ResourceRecover,
    ComponentFail,
    SplitBrainTest,
]
-
-
-class Reattach(CTSTest):
- def __init__(self, cm):
- CTSTest.__init__(self,cm)
- self.name = "Reattach"
- self._startall = SimulStartLite(cm)
- self.restart1 = RestartTest(cm)
- self.stopall = SimulStopLite(cm)
- self.is_unsafe = False
-
- def _is_managed(self, node):
- (_, is_managed) = self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1)
- is_managed = is_managed[0].strip()
- return is_managed == "true"
-
- def _set_unmanaged(self, node):
- self.debug("Disable resource management")
- self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")
-
- def _set_managed(self, node):
- self.debug("Re-enable resource management")
- self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")
-
- def setup(self, node):
- attempt = 0
- if not self._startall(None):
- return None
-
- # Make sure we are really _really_ stable and that all
- # resources, including those that depend on transient node
- # attributes, are started
- while not self._cm.cluster_stable(double_check=True):
- if attempt < 5:
- attempt += 1
- self.debug("Not stable yet, re-testing")
- else:
- self._logger.log("Cluster is not stable")
- return None
-
- return 1
-
- def teardown(self, node):
-
- # Make sure 'node' is up
- start = StartTest(self._cm)
- start(node)
-
- if not self._is_managed(node):
- self._logger.log("Attempting to re-enable resource management on %s" % node)
- self._set_managed(node)
- self._cm.cluster_stable()
- if not self._is_managed(node):
- self._logger.log("Could not re-enable resource management")
- return 0
-
- return 1
-
- def can_run_now(self, node):
- """ Return True if we can meaningfully run right now"""
- if self._find_ocfs2_resources(node):
- self._logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present")
- return False
-
- return True
-
- def __call__(self, node):
- self.incr("calls")
-
- pats = []
- # Conveniently, the scheduler will display this message when disabling
- # management, even if fencing is not enabled, so we can rely on it.
- managed = self.create_watch(["No fencing will be done"], 60)
- managed.set_watch()
-
- self._set_unmanaged(node)
-
- if not managed.look_for_all():
- self._logger.log("Patterns not found: " + repr(managed.unmatched))
- return self.failure("Resource management not disabled")
-
- pats = []
- pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
- pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*"))
- pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*"))
- pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*"))
- pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*"))
-
- watch = self.create_watch(pats, 60, "ShutdownActivity")
- watch.set_watch()
-
- self.debug("Shutting down the cluster")
- ret = self.stopall(None)
- if not ret:
- self._set_managed(node)
- return self.failure("Couldn't shut down the cluster")
-
- self.debug("Bringing the cluster back up")
- ret = self._startall(None)
- time.sleep(5) # allow ping to update the CIB
- if not ret:
- self._set_managed(node)
- return self.failure("Couldn't restart the cluster")
-
- if self.local_badnews("ResourceActivity:", watch):
- self._set_managed(node)
- return self.failure("Resources stopped or started during cluster restart")
-
- watch = self.create_watch(pats, 60, "StartupActivity")
- watch.set_watch()
-
- # Re-enable resource management (and verify it happened).
- self._set_managed(node)
- self._cm.cluster_stable()
- if not self._is_managed(node):
- return self.failure("Could not re-enable resource management")
-
- # Ignore actions for STONITH resources
- ignore = []
- (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
- for line in lines:
- if re.search("^Resource", line):
- r = AuditResource(self._cm, line)
- if r.rclass == "stonith":
-
- self.debug("Ignoring start actions for %s" % r.id)
- ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))
-
- if self.local_badnews("ResourceActivity:", watch, ignore):
- return self.failure("Resources stopped or started after resource management was re-enabled")
-
- return ret
-
- @property
- def errors_to_ignore(self):
- """ Return list of errors which should be ignored """
-
- return [ r"resource( was|s were) active at shutdown" ]
-
- def is_applicable(self):
- return True
-
AllTestClasses.append(Reattach)
class SpecialTest1(CTSTest):
    """Custom test meant to provoke quorum failure issues.

    The whole cluster is stopped, the CIB files on the chosen node are
    removed, that node is restarted on its own, and finally the remaining
    nodes are started.
    """

    def __init__(self, cm):
        """Create the test and the helper tests it drives.

        Arguments:
        cm -- cluster manager object, handed to CTSTest and to each helper test
        """
        super().__init__(cm)
        self.name = "SpecialTest1"
        self._startall = SimulStartLite(cm)
        self.restart1 = RestartTest(cm)
        self.stopall = SimulStopLite(cm)

    def __call__(self, node):
        """Perform the 'SpecialTest1' test on `node`.

        Returns the result of self.success(), or of self.failure() with a
        message naming the step that did not work.
        """
        self.incr("calls")

        # Take every node down first
        if not self.stopall(None):
            return self.failure("Could not stop all nodes")

        # Remove the on-disk CIB so that config recovery is exercised when
        # the other nodes come up
        wipe_cib = "rm -f " + BuildOptions.CIB_DIR + "/cib*"
        self._rsh(node, wipe_cib)

        # Bring the selected node back on its own ...
        if not self.restart1(node):
            return self.failure("Could not start "+node)

        # ... and then everybody else
        if not self._startall(None):
            return self.failure("Could not start the remaining nodes")

        return self.success()

    @property
    def errors_to_ignore(self):
        """Return list of errors which should be ignored."""
        # All of these are fallout from the CIB having been wiped
        cib_wipe_fallout = [
            r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed",
            r"error.*: Resource start-up disabled since no STONITH resources have been defined",
            r"error.*: Either configure some or disable STONITH with the stonith-enabled option",
            r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity",
        ]
        return cib_wipe_fallout


AllTestClasses.append(SpecialTest1)
class NearQuorumPointTest(CTSTest):
    """Bring larger clusters near the quorum point (50%).

    Starts and stops are issued at the same time:
    - every node is randomly put into either a "start" set or a "stop" set
    - nodes whose expected status differs from their set are started or
      stopped without blocking
    - the logs are then watched for the matching start/stop patterns
    """

    def __init__(self, cm):
        """Create the test; `cm` is the cluster manager object."""
        super().__init__(cm)
        self.name = "NearQuorumPoint"

    def __call__(self, dummy):
        """Perform the 'NearQuorumPoint' test.

        `dummy` is not used; nodes are taken from self._env["nodes"].
        Returns the result of self.skipped() when there is nothing to watch
        for, otherwise of self.success() or self.failure().
        """
        self.incr("calls")

        to_start = []
        to_stop = []
        fence_watch = self._cm.prepare_fencing_watcher("NearQuorumPoint")

        # Decide what to do with each node
        buckets = {"start": to_start, "stop": to_stop}
        for node in self._env["nodes"]:
            decision = self._env.random_gen.choice(["start", "stop"])
            buckets[decision].append(node)

        self.debug("start nodes:" + repr(to_start))
        self.debug("stop nodes:" + repr(to_stop))

        # Build the search patterns.  Only nodes expected to be up can be
        # seen stopping.
        going_down = [n for n in to_stop if self._cm.ShouldBeStatus[n] == "up"]
        watchpats = [self.templates["Pat:We_stopped"] % n for n in going_down]

        for node in to_start:
            if self._cm.ShouldBeStatus[node] == "down":
                watchpats.append(self.templates["Pat:Local_started"] % node)
                continue

            # This node stays up, so it should notice each departing peer
            watchpats.extend(
                self.templates["Pat:They_stopped"] % (node, self._cm.key_for_node(peer))
                for peer in going_down
            )

        if not watchpats:
            return self.skipped()

        if to_start:
            watchpats.append(self.templates["Pat:DC_IDLE"])

        watch = self.create_watch(watchpats, self._env["DeadTime"]+10)
        watch.set_watch()

        # Begin actions: request the stops and starts without waiting
        for node in to_stop:
            if self._cm.ShouldBeStatus[node] == "up":
                self._cm.StopaCMnoBlock(node)

        for node in to_start:
            if self._cm.ShouldBeStatus[node] == "down":
                self._cm.StartaCMnoBlock(node)

        # Get the result
        if watch.look_for_all():
            self._cm.cluster_stable()
            self._cm.fencing_cleanup("NearQuorumPoint", fence_watch)
            return self.success()

        self._logger.log("Warn: Patterns not found: " + repr(watch.unmatched))

        # Collect the "bad" nodes, i.e. those reported below as
        # unstoppable or unstartable
        still_up = [n for n in to_stop if self._cm.StataCM(n) == 1]
        still_down = [n for n in to_start if self._cm.StataCM(n) == 0]

        self._cm.fencing_cleanup("NearQuorumPoint", fence_watch)

        if still_up or still_down:
            if still_up:
                self._logger.log("Warn: Unstoppable nodes: " + repr(still_up))

            if still_down:
                self._logger.log("Warn: Unstartable nodes: " + repr(still_down))

            return self.failure()

        self._cm.cluster_stable()

        # Make sure they're completely down with no residue
        for node in to_stop:
            self._rsh(node, self.templates["StopCmd"])

        return self.success()

    def is_applicable(self):
        """Return True; this test is always applicable."""
        return True


AllTestClasses.append(NearQuorumPointTest)
def TestList(cm, audits):
    """Instantiate every registered test class and keep the applicable ones.

    Arguments:
    cm     -- cluster manager object passed to each test class constructor
    audits -- stored on every returned test as its `audits` attribute

    Returns a list of test instances, in AllTestClasses order, whose
    is_applicable() returned a true value.
    """
    applicable = []

    # The generator keeps construction and the applicability check
    # interleaved, one class at a time
    for candidate in (testclass(cm) for testclass in AllTestClasses):
        if not candidate.is_applicable():
            continue

        candidate.audits = audits
        applicable.append(candidate)

    return applicable
class RemoteLXC(CTSTest):
    """Generate LXC containers with lxc_autogen.sh, watch them start, then clean up."""

    def __init__(self, cm):
        """Create the test; `cm` is the cluster manager object."""
        super().__init__(cm)
        self.name = "RemoteLXC"
        self._start = StartTest(cm)
        self._startall = SimulStartLite(cm)
        self.num_containers = 2
        self.is_container = True
        self.fail_string = ""
        # NOTE(review): self.failed is read by cleanup_lxc_simple() and
        # __call__() but is only assigned on failure paths in this class;
        # presumably CTSTest.__init__ initialises it -- verify.

    def start_lxc_simple(self, node):
        """Generate the containers from `node` and wait for the expected start operations.

        If any pattern stays unmatched, it is recorded in self.fail_string
        and self.failed is set.
        """
        # restore any artifacts laying around from a previous test.
        self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")

        # The watch is created and armed while the list is still empty; the
        # patterns are then added to that same list object.  Keep this order.
        expected = []
        watch = self.create_watch(expected, 120)
        watch.set_watch()

        for op, rsc in (("start", "lxc1"), ("start", "lxc2"),
                        ("start", "lxc-ms"), ("promote", "lxc-ms")):
            expected.append(self.templates["Pat:RscOpOK"] % (op, rsc))

        # generate the containers, put them in the config, add some resources to them
        self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers)

        with Timer(self._logger, self.name, "remoteSimpleInit"):
            watch.look_for_all()

        if watch.unmatched:
            self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
            self.failed = True

    def cleanup_lxc_simple(self, node):
        """Remove the containers again, or just restore the environment after a failure."""
        expected = []

        # if the test failed, attempt to clean up the cib and libvirt environment
        # as best as possible
        if self.failed:
            # restore libvirt and cib
            self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
            return

        # Same pattern as in start_lxc_simple(): arm the watch first, then
        # fill the list it was given
        watch = self.create_watch(expected, 120)
        watch.set_watch()

        for rsc in ("container1", "container2"):
            expected.append(self.templates["Pat:RscOpOK"] % ("stop", rsc))

        self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null")

        with Timer(self._logger, self.name, "remoteSimpleCleanup"):
            watch.look_for_all()

        if watch.unmatched:
            self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
            self.failed = True

        # cleanup libvirt
        self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")

    def __call__(self, node):
        """Perform the 'RemoteLXC' test on `node`.

        Returns the result of self.skipped() when the lxc environment check
        exits with 1, otherwise of self.success() or self.failure().
        """
        self.incr("calls")

        if not self._startall(None):
            return self.failure("Setup failed, start all nodes failed.")

        (rc, _output) = self._rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null")
        if rc == 1:
            self.log("Environment test for lxc support failed.")
            return self.skipped()

        self.start_lxc_simple(node)
        self.cleanup_lxc_simple(node)

        self.debug("Waiting for the cluster to recover")
        self._cm.cluster_stable()

        return self.failure(self.fail_string) if self.failed else self.success()

    @property
    def errors_to_ignore(self):
        """Return list of errors which should be ignored."""
        expected_noise = [
            r"Updating failcount for ping",
            r"schedulerd.*: Recover\s+(ping|lxc-ms|container)\s+\(.*\)",
            # The orphaned lxc-ms resource causes an expected transition error
            # that is a result of the scheduler not having knowledge that the
            # promotable resource used to be a clone. As a result, it looks like that
            # resource is running in multiple locations when it shouldn't... But in
            # this instance we know why this error is occurring and that it is expected.
            r"Calculated [Tt]ransition .*pe-error",
            r"Resource lxc-ms .* is active on 2 nodes attempting recovery",
            r"Unknown operation: fail",
            r"VirtualDomain.*ERROR: Unable to determine emulator",
        ]
        return expected_noise


AllTestClasses.append(RemoteLXC)
class RemoteBasic(RemoteDriver):
    """Remote test that runs the driver's test_attributes() step and then cleans up."""

    def __init__(self, cm):
        """Create the test; `cm` is the cluster manager object."""
        super().__init__(cm)
        self.name = "RemoteBasic"

    def __call__(self, node):
        """Perform the 'RemoteBasic' test on `node`.

        Returns the result of self.failure(self.fail_string) if the test
        could not be started or self.failed is set afterwards, otherwise
        the result of self.success().
        """
        if not self.start_new_test(node):
            return self.failure(self.fail_string)

        self.test_attributes(node)
        self.cleanup_metal(node)

        self.debug("Waiting for the cluster to recover")
        self._cm.cluster_stable()

        return self.failure(self.fail_string) if self.failed else self.success()


AllTestClasses.append(RemoteBasic)
class RemoteStonithd(RemoteDriver):
    """Remote test that runs the driver's fail_connection() step and then cleans up."""

    def __init__(self, cm):
        """Create the test; `cm` is the cluster manager object."""
        super().__init__(cm)
        self.name = "RemoteStonithd"

    def __call__(self, node):
        """Perform the 'RemoteStonithd' test on `node`.

        Returns the result of self.failure(self.fail_string) if the test
        could not be started or self.failed is set afterwards, otherwise
        the result of self.success().
        """
        if not self.start_new_test(node):
            return self.failure(self.fail_string)

        self.fail_connection(node)
        self.cleanup_metal(node)

        self.debug("Waiting for the cluster to recover")
        self._cm.cluster_stable()

        return self.failure(self.fail_string) if self.failed else self.success()

    def is_applicable(self):
        """Return whether this test applies.

        False when the driver itself is not applicable; otherwise the value
        of the "DoFencing" environment entry if one is present, else True.
        """
        if not RemoteDriver.is_applicable(self):
            return False

        return self._env["DoFencing"] if "DoFencing" in self._env else True

    @property
    def errors_to_ignore(self):
        """Return list of errors which should be ignored."""
        own_patterns = [
            r"Lost connection to Pacemaker Remote node",
            r"Software caused connection abort",
            r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
            r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
            r"schedulerd.*:\s+Recover\s+remote-.*\s+\(.*\)",
            r"error: Result of monitor operation for .* on remote-.*: Internal communication failure",
        ]
        # The driver's own patterns come after the ones specific to this test
        return own_patterns + super().errors_to_ignore


AllTestClasses.append(RemoteStonithd)
class RemoteMigrate(RemoteDriver):
    """Remote test that runs the driver's migrate_connection() step and then cleans up."""

    def __init__(self, cm):
        """Create the test; `cm` is the cluster manager object."""
        RemoteDriver.__init__(self, cm)
        self.name = "RemoteMigrate"

    def __call__(self, node):
        """Perform the 'RemoteMigrate' test on `node`.

        Returns the result of self.failure(self.fail_string) if the test
        could not be started or self.failed is set afterwards, otherwise
        the result of self.success().
        """
        if not self.start_new_test(node):
            return self.failure(self.fail_string)

        self.migrate_connection(node)
        self.cleanup_metal(node)

        self.debug("Waiting for the cluster to recover")
        self._cm.cluster_stable()

        if self.failed:
            return self.failure(self.fail_string)

        return self.success()

    def is_applicable(self):
        """Return True if the driver is applicable and there are at least three nodes.

        A bool is returned (rather than 0/1) to match the other tests'
        is_applicable() methods; truthiness is unchanged.
        """
        if not RemoteDriver.is_applicable(self):
            return False

        # This test requires at least three nodes: one to convert to a
        # remote node, one to host the connection originally, and one
        # to migrate the connection to.
        return len(self._env["nodes"]) >= 3


AllTestClasses.append(RemoteMigrate)
class RemoteRscFailure(RemoteDriver):
    """Remote test that migrates the connection, then runs the driver's fail_rsc() step."""

    def __init__(self, cm):
        """Create the test; `cm` is the cluster manager object."""
        RemoteDriver.__init__(self, cm)
        self.name = "RemoteRscFailure"

    def __call__(self, node):
        """Perform the 'RemoteRscFailure' test on `node`.

        Returns the result of self.failure(self.fail_string) if the test
        could not be started or self.failed is set afterwards, otherwise
        the result of self.success().
        """
        if not self.start_new_test(node):
            return self.failure(self.fail_string)

        # This is an important step. We are migrating the connection
        # before failing the resource. This verifies that the migration
        # has properly maintained control over the remote-node.
        self.migrate_connection(node)

        self.fail_rsc(node)
        self.cleanup_metal(node)

        self.debug("Waiting for the cluster to recover")
        self._cm.cluster_stable()

        if self.failed:
            return self.failure(self.fail_string)

        return self.success()

    @property
    def errors_to_ignore(self):
        """Return list of errors which should be ignored."""
        return [
            r"schedulerd.*: Recover\s+remote-rsc\s+\(.*\)",
            r"Dummy.*: No process state file found",
        ] + super().errors_to_ignore

    def is_applicable(self):
        """Return True if the driver is applicable and there are at least three nodes.

        A bool is returned (rather than 0/1) to match the other tests'
        is_applicable() methods; truthiness is unchanged.
        """
        if not RemoteDriver.is_applicable(self):
            return False

        # This test requires at least three nodes: one to convert to a
        # remote node, one to host the connection originally, and one
        # to migrate the connection to.
        return len(self._env["nodes"]) >= 3


AllTestClasses.append(RemoteRscFailure)
# vim:ts=4:sw=4:et:
diff --git a/python/pacemaker/_cts/tests/Makefile.am b/python/pacemaker/_cts/tests/Makefile.am
index 5ebd0eb90c..cd21152e2b 100644
--- a/python/pacemaker/_cts/tests/Makefile.am
+++ b/python/pacemaker/_cts/tests/Makefile.am
@@ -1,33 +1,34 @@
#
# Copyright 2023 the Pacemaker project contributors
#
# The version control history for this file may have further details.
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
MAINTAINERCLEANFILES = Makefile.in
pkgpythondir = $(pythondir)/$(PACKAGE)/_cts/tests
# stoponebyone.py is listed because __init__.py imports
# pacemaker._cts.tests.stoponebyone.
pkgpython_PYTHON = __init__.py \
componentfail.py \
ctstest.py \
fliptest.py \
maintenancemode.py \
partialstart.py \
+ reattach.py \
remotedriver.py \
resourcerecover.py \
restarttest.py \
restartonebyone.py \
simulstart.py \
simulstop.py \
simulstartlite.py \
simulstoplite.py \
splitbraintest.py \
standbytest.py \
startonebyone.py \
starttest.py \
stonithdtest.py \
stoponebyone.py \
stoptest.py
diff --git a/python/pacemaker/_cts/tests/__init__.py b/python/pacemaker/_cts/tests/__init__.py
index dd7ee6cec3..ed2739b7d2 100644
--- a/python/pacemaker/_cts/tests/__init__.py
+++ b/python/pacemaker/_cts/tests/__init__.py
@@ -1,27 +1,28 @@
"""
Test classes for the `pacemaker._cts` package.
"""
__copyright__ = "Copyright 2023 the Pacemaker project contributors"
__license__ = "GNU Lesser General Public License version 2.1 or later (LGPLv2.1+)"
from pacemaker._cts.tests.componentfail import ComponentFail
from pacemaker._cts.tests.ctstest import CTSTest
from pacemaker._cts.tests.fliptest import FlipTest
from pacemaker._cts.tests.maintenancemode import MaintenanceMode
from pacemaker._cts.tests.partialstart import PartialStart
+from pacemaker._cts.tests.reattach import Reattach
from pacemaker._cts.tests.restartonebyone import RestartOnebyOne
from pacemaker._cts.tests.resourcerecover import ResourceRecover
from pacemaker._cts.tests.restarttest import RestartTest
from pacemaker._cts.tests.remotedriver import RemoteDriver
from pacemaker._cts.tests.simulstart import SimulStart
from pacemaker._cts.tests.simulstop import SimulStop
from pacemaker._cts.tests.simulstartlite import SimulStartLite
from pacemaker._cts.tests.simulstoplite import SimulStopLite
from pacemaker._cts.tests.splitbraintest import SplitBrainTest
from pacemaker._cts.tests.standbytest import StandbyTest
from pacemaker._cts.tests.starttest import StartTest
from pacemaker._cts.tests.startonebyone import StartOnebyOne
from pacemaker._cts.tests.stonithdtest import StonithdTest
from pacemaker._cts.tests.stoponebyone import StopOnebyOne
from pacemaker._cts.tests.stoptest import StopTest
diff --git a/python/pacemaker/_cts/tests/reattach.py b/python/pacemaker/_cts/tests/reattach.py
new file mode 100644
index 0000000000..a9550e4dff
--- /dev/null
+++ b/python/pacemaker/_cts/tests/reattach.py
@@ -0,0 +1,157 @@
+""" Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
+"""
+
+__all__ = ["Reattach"]
+__copyright__ = "Copyright 2000-2023 the Pacemaker project contributors"
+__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
+
+import re
+import time
+
+from pacemaker._cts.audits import AuditResource
+from pacemaker._cts.tests.ctstest import CTSTest
+from pacemaker._cts.tests.restarttest import RestartTest
+from pacemaker._cts.tests.simulstartlite import SimulStartLite
+from pacemaker._cts.tests.simulstoplite import SimulStopLite
+from pacemaker._cts.tests.starttest import StartTest
+
+
+class Reattach(CTSTest):
+ def __init__(self, cm):
+ CTSTest.__init__(self,cm)
+ self.name = "Reattach"
+ self._startall = SimulStartLite(cm)
+ self.restart1 = RestartTest(cm)
+ self.stopall = SimulStopLite(cm)
+ self.is_unsafe = False
+
+ def _is_managed(self, node):
+ (_, is_managed) = self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1)
+ is_managed = is_managed[0].strip()
+ return is_managed == "true"
+
+ def _set_unmanaged(self, node):
+ self.debug("Disable resource management")
+ self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")
+
+ def _set_managed(self, node):
+ self.debug("Re-enable resource management")
+ self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")
+
+ def setup(self, node):
+ attempt = 0
+ if not self._startall(None):
+ return None
+
+ # Make sure we are really _really_ stable and that all
+ # resources, including those that depend on transient node
+ # attributes, are started
+ while not self._cm.cluster_stable(double_check=True):
+ if attempt < 5:
+ attempt += 1
+ self.debug("Not stable yet, re-testing")
+ else:
+ self._logger.log("Cluster is not stable")
+ return None
+
+ return 1
+
+ def teardown(self, node):
+
+ # Make sure 'node' is up
+ start = StartTest(self._cm)
+ start(node)
+
+ if not self._is_managed(node):
+ self._logger.log("Attempting to re-enable resource management on %s" % node)
+ self._set_managed(node)
+ self._cm.cluster_stable()
+ if not self._is_managed(node):
+ self._logger.log("Could not re-enable resource management")
+ return 0
+
+ return 1
+
+ def can_run_now(self, node):
+ """ Return True if we can meaningfully run right now"""
+ if self._find_ocfs2_resources(node):
+ self._logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present")
+ return False
+
+ return True
+
+ def __call__(self, node):
+ self.incr("calls")
+
+ pats = []
+ # Conveniently, the scheduler will display this message when disabling
+ # management, even if fencing is not enabled, so we can rely on it.
+ managed = self.create_watch(["No fencing will be done"], 60)
+ managed.set_watch()
+
+ self._set_unmanaged(node)
+
+ if not managed.look_for_all():
+ self._logger.log("Patterns not found: " + repr(managed.unmatched))
+ return self.failure("Resource management not disabled")
+
+ pats = []
+ pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
+ pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*"))
+ pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*"))
+ pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*"))
+ pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*"))
+
+ watch = self.create_watch(pats, 60, "ShutdownActivity")
+ watch.set_watch()
+
+ self.debug("Shutting down the cluster")
+ ret = self.stopall(None)
+ if not ret:
+ self._set_managed(node)
+ return self.failure("Couldn't shut down the cluster")
+
+ self.debug("Bringing the cluster back up")
+ ret = self._startall(None)
+ time.sleep(5) # allow ping to update the CIB
+ if not ret:
+ self._set_managed(node)
+ return self.failure("Couldn't restart the cluster")
+
+ if self.local_badnews("ResourceActivity:", watch):
+ self._set_managed(node)
+ return self.failure("Resources stopped or started during cluster restart")
+
+ watch = self.create_watch(pats, 60, "StartupActivity")
+ watch.set_watch()
+
+ # Re-enable resource management (and verify it happened).
+ self._set_managed(node)
+ self._cm.cluster_stable()
+ if not self._is_managed(node):
+ return self.failure("Could not re-enable resource management")
+
+ # Ignore actions for STONITH resources
+ ignore = []
+ (_, lines) = self._rsh(node, "crm_resource -c", verbose=1)
+ for line in lines:
+ if re.search("^Resource", line):
+ r = AuditResource(self._cm, line)
+ if r.rclass == "stonith":
+
+ self.debug("Ignoring start actions for %s" % r.id)
+ ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))
+
+ if self.local_badnews("ResourceActivity:", watch, ignore):
+ return self.failure("Resources stopped or started after resource management was re-enabled")
+
+ return ret
+
+ @property
+ def errors_to_ignore(self):
+ """ Return list of errors which should be ignored """
+
+ return [ r"resource( was|s were) active at shutdown" ]
+
+ def is_applicable(self):
+ return True

File Metadata

Mime Type
text/x-diff
Expires
Mon, Apr 21, 6:31 PM (23 h, 7 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1665229
Default Alt Text
(29 KB)

Event Timeline