diff --git a/.gitignore b/.gitignore index a3eb175569..5411b37939 100644 --- a/.gitignore +++ b/.gitignore @@ -1,354 +1,353 @@ # -# Copyright 2011-2024 the Pacemaker project contributors +# Copyright 2011-2025 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # # Common conventions for files that should be ignored *~ *.bz2 *.diff *.orig *.patch *.rej *.sed *.swp *.tar.gz *.tgz \#* .\#* logs # libtool artifacts *.la *.lo .libs libltdl libtool libtool.m4 ltdl.m4 /m4/argz.m4 /m4/ltargz.m4 /m4/ltoptions.m4 /m4/ltsugar.m4 /m4/ltversion.m4 /m4/lt~obsolete.m4 # autotools artifacts .deps .dirstamp Makefile Makefile.in aclocal.m4 autoconf autoheader autom4te.cache/ automake /confdefs.h config.log config.status configure /conftest* # gettext artifacts /ABOUT-NLS /m4/codeset.m4 /m4/fcntl-o.m4 /m4/gettext.m4 /m4/glibc2.m4 /m4/glibc21.m4 /m4/iconv.m4 /m4/intdiv0.m4 /m4/intl.m4 /m4/intldir.m4 /m4/intlmacosx.m4 /m4/intmax.m4 /m4/inttypes-pri.m4 /m4/inttypes_h.m4 /m4/lcmessage.m4 /m4/lib-ld.m4 /m4/lib-link.m4 /m4/lib-prefix.m4 /m4/lock.m4 /m4/longlong.m4 /m4/nls.m4 /m4/po.m4 /m4/printf-posix.m4 /m4/progtest.m4 /m4/size_max.m4 /m4/stdint_h.m4 /m4/threadlib.m4 /m4/uintmax_t.m4 /m4/visibility.m4 /m4/wchar_t.m4 /m4/wint_t.m4 /m4/xsize.m4 /po/*.gmo /po/*.header /po/*.pot /po/*.sin /po/Makefile.in.in /po/Makevars.template /po/POTFILES /po/Rules-quot /po/stamp-po # configure targets /agents/ocf/ClusterMon /agents/ocf/Dummy /agents/ocf/HealthCPU /agents/ocf/HealthIOWait /agents/ocf/HealthSMART /agents/ocf/Stateful /agents/ocf/SysInfo /agents/ocf/attribute /agents/ocf/controld /agents/ocf/ifspeed /agents/ocf/ping /agents/ocf/remote /agents/stonith/fence_legacy /agents/stonith/fence_watchdog /cts/benchmark/clubench /cts/cluster_test -/cts/cts /cts/cts-attrd /cts/cts-cli /cts/cts-exec /cts/cts-fencing /cts/cts-lab 
/cts/cts-regression /cts/cts-scheduler /cts/cts-schemas /cts/lab/CTS.py /cts/support/LSBDummy /cts/support/cts-support /cts/support/fence_dummy /cts/support/pacemaker-cts-dummyd /cts/support/pacemaker-cts-dummyd@.service /daemons/execd/pacemaker_remote /daemons/execd/pacemaker_remote.service /daemons/fenced/fence_legacy /daemons/fenced/fence_watchdog /daemons/pacemakerd/pacemaker.service /doc/Doxyfile /etc/init.d/pacemaker /etc/logrotate.d/pacemaker /etc/sysconfig/pacemaker /include/config.h /include/config.h.in /include/crm_config.h /maint/bumplibs /python/pacemaker/buildoptions.py /python/setup.py -/tools/cluster-clean -/tools/cluster-helper -/tools/cluster-init /tools/crm_error /tools/crm_failcount /tools/crm_master /tools/crm_mon.service /tools/crm_report /tools/crm_rule /tools/crm_standby /tools/pcmk_simtimes /tools/report.collector /tools/report.common /xml/rng-helper # Compiled targets and intermediary files *.o *.pc *.pyc /daemons/attrd/pacemaker-attrd /daemons/based/pacemaker-based /daemons/controld/pacemaker-controld /daemons/execd/cts-exec-helper /daemons/execd/pacemaker-execd /daemons/execd/pacemaker-remoted /daemons/fenced/cts-fence-helper /daemons/fenced/pacemaker-fenced /daemons/pacemakerd/pacemakerd /daemons/schedulerd/pacemaker-schedulerd /devel/scratch /lib/gnu/stdalign.h /tools/attrd_updater /tools/cibadmin /tools/crmadmin /tools/crm_attribute /tools/crm_diff /tools/crm_mon /tools/crm_node /tools/crm_resource /tools/crm_shadow /tools/crm_simulate /tools/crm_ticket /tools/crm_verify /tools/iso8601 /tools/stonith_admin # Generated XML schema files /xml/crm_mon.rng /xml/pacemaker*.rng /xml/versions.rng /xml/api/api-result*.rng # Working directories for make dist and make export /pacemaker-[a-f0-9][a-f0-9][a-f0-9][a-f0-9][a-f0-9][a-f0-9][a-f0-9] # Documentation build targets and intermediary files *.7 *.7.xml *.7.html *.8 *.8.xml *.8.html GPATH GRTAGS GTAGS TAGS /daemons/fenced/pacemaker-fenced.xml /daemons/schedulerd/pacemaker-schedulerd.xml 
/doc/HTML /doc/abi/ /doc/abi-check /doc/api/ /doc/crm_fencing.html /doc/sphinx/*/_build /doc/sphinx/*/conf.py /doc/sphinx/*/generated /doc/sphinx/build-[0-9]*.txt # Test artifacts (from unit tests, regression tests, static analysis, etc.) *.coverity *.gcda *.gcno coverity-* pacemaker_*.info /coverage /cppcheck.out /cts/scheduler/bug-rh-1097457.log /cts/scheduler/bug-rh-1097457.trs /cts/scheduler/shadow.* /cts/schemas/test-*/ref/*.up* /cts/schemas/test-*/ref.err/*.up.err* /cts/test-suite.log /lib/*/fuzzers/*/*_fuzzer /lib/*/tests/*/*.log /lib/*/tests/*/*_test /lib/*/tests/*/*.trs /lib/common/tests/schemas/schemas /test/_test_file.c # Packaging artifacts *.rpm /pacemaker.spec /rpm/[A-LN-Z]* /rpm/build.counter /rpm/mock # Project maintainer artifacts /maint/gnulib /maint/mocked/based /maint/testcc_helper.cc /maint/testcc_*_h # Formerly built files (helps when jumping back and forth in checkout) /.ABI-build /Doxyfile /HTML /abi_dumps /abi-check /agents/ocf/o2cb /build.counter /compat_reports /compile /cts/.regression.failed.diff /attrd /cib /config.guess /config.sub /coverage.sh /crmd +/cts/cts /cts/CTS.py /cts/CTSlab.py /cts/CTSvars.py /cts/HBDummy /cts/LSBDummy /cts/OCFIPraTest.py /cts/cts-coverage /cts/cts-log-watcher /cts/cts-support /cts/fence_dummy /cts/lab/CTSlab.py /cts/lab/CTSvars.py /cts/lab/OCFIPraTest.py /cts/lab/cluster_test /cts/lab/cts /cts/lab/cts-log-watcher /cts/lxc_autogen.sh /cts/pacemaker-cts-dummyd /cts/pacemaker-cts-dummyd@.service /daemons/based/cibmon /daemons/fenced/fence_legacy /daemons/fenced/fence_watchdog /daemons/pacemakerd/pacemaker /daemons/pacemakerd/pacemaker.combined.upstart /daemons/pacemakerd/pacemaker.upstart /depcomp /doc/*.build /doc/*/en-US/Ap-*.xml /doc/*/en-US/Ch-*.xml /doc/*/publican.cfg /doc/*/publish /doc/*/tmp/** /doc/.ABI-build /doc/Clusters_from_Scratch.txt /doc/Pacemaker_Explained.txt /doc/abi_dumps /doc/acls.html /doc/compat_reports /doc/publican-catalog* /doc/shared/en-US/*.xml /doc/shared/en-US/images/pcmk-*.png 
/doc/shared/en-US/images/Policy-Engine-*.png /extra/*/* /fencing /include/stamp-* /install-sh /lib/common/md5.c /lib/common/tests/flags/pcmk__clear_flags_as /lib/common/tests/flags/pcmk__set_flags_as /lib/common/tests/flags/pcmk_all_flags_set /lib/common/tests/flags/pcmk_any_flags_set /lib/common/tests/operations/parse_op_key /lib/common/tests/strings/pcmk__btoa /lib/common/tests/strings/pcmk__parse_ll_range /lib/common/tests/strings/pcmk__scan_double /lib/common/tests/strings/pcmk__str_any_of /lib/common/tests/strings/pcmk__strcmp /lib/common/tests/strings/pcmk__char_in_any_str /lib/common/tests/utils/pcmk_str_is_infinity /lib/common/tests/utils/pcmk_str_is_minus_infinity /lib/gnu/libgnu.a /lib/pengine/tests/rules/ /lrmd /ltmain.sh /mcp /missing /mock /pacemaker-*.spec /pengine /py-compile /scratch +/test-driver /tools/cibsecret +/tools/cluster-clean +/tools/cluster-helper /tools/cluster-init /tools/crm_mon.upstart -/test-driver /xml/assets /xml/crm.dtd /xml/version-diff.sh ylwrap pacemaker.sysusers_* diff --git a/INSTALL.md b/INSTALL.md index bfb5acdb7c..0e34179e7a 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,82 +1,83 @@ # How to Install Pacemaker ## Build Dependencies | Version | Fedora-based | Suse-based | Debian-based | |:---------------:|:------------------:|:------------------:|:--------------:| | 1.13 or later | automake | automake | automake | | 2.64 or later | autoconf | autoconf | autoconf | | | libtool | libtool | libtool | | | libtool-ltdl-devel | | libltdl-dev | | | libuuid-devel | libuuid-devel | uuid-dev | | 0.28 or later | pkgconfig | pkgconfig | pkg-config | | 2.42.0 or later | glib2-devel | glib2-devel | libglib2.0-dev | | 2.9.2 or later | libxml2-devel | libxml2-devel | libxml2-dev | | | libxslt-devel | libxslt-devel | libxslt-dev | | | bzip2-devel | libbz2-devel | libbz2-dev | | 1.0.1 or later | libqb-devel | libqb-devel | libqb-dev | | 3.6 or later | python3 | python3 | python3 | | 0.18 or later | gettext-devel | gettext-tools | gettext | | 
0.18 or later | | | autopoint | | 3.4.6 or later | gnutls-devel | libgnutls-devel | libgnutls-dev | Also: * make must be GNU (or compatible) (setting MAKE=gmake might also work but is untested) * GNU (or compatible) getopt must be somewhere on the PATH ### Cluster Stack Dependencies *Only corosync is currently supported* | Version | Fedora-based | Suse-based | Debian-based | |:---------------:|:------------------:|:------------------:|:--------------:| | 2.0.0 or later | corosynclib | libcorosync | corosync | | 2.0.0 or later | corosynclib-devel | libcorosync-devel | | | | | | libcfg-dev | | | | | libcpg-dev | | | | | libcmap-dev | | | | | libquorum-dev | ### Optional Build Dependencies | Feature Enabled | Version | Fedora-based | Suse-based | Debian-based | |:-----------------------------------------------:|:--------------:|:-----------------------:|:-----------------------:|:-----------------------:| | encrypted remote CIB admin | | pam-devel | pam-devel | libpam0g-dev | | interactive crm_mon | | ncurses-devel | ncurses-devel | ncurses-dev | | systemd support | | systemd-devel | systemd-devel | libsystemd-dev | | systemd resource support | 1.5.12 or later| dbus-devel | dbus-devel | libdbus-1-dev | | Linux-HA style fencing agents | | cluster-glue-libs-devel | libglue-devel | cluster-glue-dev | | documentation | | help2man | help2man | help2man | | documentation | | docbook-style-xsl | docbook-xsl-stylesheets | docbook-xsl | | documentation | | python3-sphinx | python3-sphinx | python3-sphinx | | documentation (PDF) | | latexmk texlive texlive-capt-of texlive-collection-xetex texlive-fncychap texlive-framed texlive-multirow texlive-needspace texlive-tabulary texlive-titlesec texlive-threeparttable texlive-upquote texlive-wrapfig texlive-xetex | texlive texlive-latex | texlive texlive-latex-extra | | annotated source code as HTML via "make global" | | global | global | global | | RPM packages via "make rpm" | 4.14 or later | rpm | rpm | (n/a) | | unit tests | 1.1.0 
or later | libcmocka-devel | libcmocka-devel | libcmocka-dev | ## Optional Testing Dependencies -* procps and psmisc (if running cts-exec, cts-fencing, or CTS lab) -* valgrind (if running valgrind tests in cts-cli, cts-scheduler, or CTS lab) +* procps and psmisc (if running CTS lab) +* valgrind (if running valgrind tests in cts-cli or cts-scheduler, or if + running Pacemaker daemons under valgrind) +* python3-psutil (if running any CTS tests) * python3-dateutil and python3-systemd (if running CTS lab on cluster nodes running systemd) * nmap (if not specifying an IP address base) -* oprofile (if running CTS lab profiling tests) * dlm (to log DLM debugging info after CTS lab tests) * xmllint (to validate tool output in cts-cli) ## Simple Install $ make && sudo make install If GNU make is not your default make, use "gmake" instead. ## Detailed Install First, browse the build options that are available: $ ./autogen.sh $ ./configure --help Re-run ./configure with any options you want, then proceed with the simple method. diff --git a/configure.ac b/configure.ac index 87cb9bc005..df8e1bfdb6 100644 --- a/configure.ac +++ b/configure.ac @@ -1,2139 +1,2136 @@ dnl dnl autoconf for Pacemaker dnl dnl Copyright 2009-2025 the Pacemaker project contributors dnl dnl The version control history for this file may have further details. dnl dnl This source code is licensed under the GNU General Public License version 2 dnl or later (GPLv2+) WITHOUT ANY WARRANTY. dnl ============================================== dnl Bootstrap autotools dnl ============================================== # Require a minimum version of autoconf itself AC_PREREQ(2.64) dnl AC_CONFIG_MACRO_DIR is deprecated as of autoconf 2.70 (2020-12-08). dnl Once we can require that version, we can simplify this, and no longer dnl need ACLOCAL_AMFLAGS in Makefile.am. 
m4_ifdef([AC_CONFIG_MACRO_DIRS],
         [AC_CONFIG_MACRO_DIRS([m4])],
         [AC_CONFIG_MACRO_DIR([m4])])

m4_include([m4/version.m4])
AC_INIT([pacemaker], VERSION_NUMBER, [users@clusterlabs.org], [pacemaker],
        PCMK_URL)

LT_CONFIG_LTDL_DIR([libltdl])
AC_CONFIG_AUX_DIR([libltdl/config])

dnl Where #defines that autoconf makes (e.g. HAVE_whatever) go
dnl
dnl include/config.h
dnl   - Internal API
dnl   - Contains all defines
dnl   - include/config.h.in is generated automatically by autoheader
dnl   - Not to be included in any header files except crm_internal.h
dnl     (which is also not to be included in any other header files)
dnl
dnl include/crm_config.h
dnl   - External API
dnl   - Contains a subset of defines
dnl   - include/crm_config.h.in is manually edited to select the subset
dnl   - Should not include HAVE_* defines
dnl   - Safe to include anywhere
AC_CONFIG_HEADERS([include/config.h include/crm_config.h])

dnl 1.13:           minimum automake version required
dnl foreign:        don't require GNU-standard top-level files
dnl tar-ustar:      use (older) POSIX variant of generated tar rather than v7
dnl subdir-objects: keep .o's with their .c's (no-op in 2.0+)
AM_INIT_AUTOMAKE([1.13 foreign tar-ustar subdir-objects])

dnl Require minimum version of pkg-config
PKG_PROG_PKG_CONFIG(0.28)
AS_IF([test x"${PKG_CONFIG}" != x""], [],
      [AC_MSG_FAILURE([Could not find required build tool pkg-config (0.28 or later)])])
PKG_INSTALLDIR
PKG_NOARCH_INSTALLDIR


dnl ==============================================
dnl Compiler checks and helpers
dnl ==============================================

dnl A particular compiler can be forced by setting the CC environment variable
AC_PROG_CC

dnl C++ is needed only to run maintainer utilities, not to build
AC_PROG_CXX

dnl Use at least C99 if possible (automatic for autoconf >= 2.70)
m4_version_prereq([2.70], [:], [AC_PROG_CC_STDC])

# cc_supports_flag FLAG...
#  Return success (0) if the C compiler accepts the given flag(s), nonzero
#  otherwise. The flags are test-compiled with -Werror prepended, using a
#  function-local CFLAGS. The result is also left in the global RC.
#
#  All arguments are joined with $* (not $@) so that several flags remain a
#  single word in the assignment and in the progress message; this matches
#  how cc_temp_flags builds CFLAGS.
cc_supports_flag() {
    local CFLAGS="-Werror $*"
    AC_MSG_CHECKING([whether $CC supports $*])
    AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ ]], [[ ]])],
                      [RC=0; AC_MSG_RESULT([yes])],
                      [RC=1; AC_MSG_RESULT([no])])
    return $RC
}

# cc_temp_flags FLAGS...
#  Use the given flags for subsequent C compilation. These can be reverted to
#  what was used previously with cc_restore_flags. This allows certain tests to
#  use specific flags without affecting anything else.
#  Only one previous value is remembered (in ac_save_CFLAGS).
cc_temp_flags() {
    ac_save_CFLAGS="$CFLAGS"
    CFLAGS="$*"
}

# cc_restore_flags
#  Restore C compiler flags to what they were before the last cc_temp_flags
#  call.
cc_restore_flags() {
    CFLAGS=$ac_save_CFLAGS
}

dnl NOTE(review): enable_fatal_warnings, DISABLED, REQUIRED and OPTIONAL are
dnl assigned further down in this file (in the "Define configure options"
dnl section); confirm that this check is meant to run before those assignments.

# Check for fatal warning support
AS_IF([test $enable_fatal_warnings -ne $DISABLED dnl
       && test x"$GCC" = x"yes" && cc_supports_flag -Werror],
      [WERROR="-Werror"],
      [
          WERROR=""
          AS_CASE([$enable_fatal_warnings],
                  [$REQUIRED], [AC_MSG_ERROR([Compiler does not support fatal warnings])],
                  [$OPTIONAL], [enable_fatal_warnings=$DISABLED])
      ])


dnl ==============================================
dnl Linker checks
dnl ==============================================

# Check whether linker supports --enable-new-dtags to use RUNPATH instead of
# RPATH. It is necessary to do this before libtool does linker detection.
# See also: https://github.com/kronosnet/kronosnet/issues/107
AX_CHECK_LINK_FLAG([-Wl,--enable-new-dtags],
                   [AM_LDFLAGS=-Wl,--enable-new-dtags],
                   [AC_MSG_ERROR(["Linker support for --enable-new-dtags is required"])])
AC_SUBST([AM_LDFLAGS])

# Let libtool's linker detection see AM_LDFLAGS, then put LDFLAGS back
saved_LDFLAGS="$LDFLAGS"
LDFLAGS="$AM_LDFLAGS $LDFLAGS"
LT_INIT([dlopen])
LDFLAGS="$saved_LDFLAGS"
LTDL_INIT([convenience])


dnl ==============================================
dnl Define configure options
dnl ==============================================

# yes_no_try
#  Map a yes/no/try user selection to $REQUIRED for yes, $DISABLED for no, and
#  $OPTIONAL for try.
DISABLED=0
REQUIRED=1
OPTIONAL=2

# Arguments: the user's selection, then the default to use when the selection
# is empty. The result is delivered as the function's return status, so
# callers read it from $? immediately afterwards. Matching ignores case.
# Any unrecognized value aborts configure.
yes_no_try() {
    local value="${1:-$2}"

    AS_CASE(["$(echo "$value" | tr '[A-Z]' '[a-z]')"],
            [0|no|false|disable], [return $DISABLED],
            [1|yes|true|enable], [return $REQUIRED],
            [try|check], [return $OPTIONAL]
    )
    AC_MSG_ERROR([Invalid option value "$value"])
}

#
# Fix the defaults of certain built-in variables so they can be used in the
# defaults for our custom arguments
#

AC_MSG_NOTICE([Sanitizing prefix: ${prefix}])
AS_IF([test x"$prefix" = x"NONE"],
      [
          prefix=/usr
          dnl Fix default variables - "prefix" variable if not specified
          AS_IF([test x"$localstatedir" = x"\${prefix}/var"],
                [localstatedir="/var"])
          AS_IF([test x"$sysconfdir" = x"\${prefix}/etc"],
                [sysconfdir="/etc"])
      ])

AC_MSG_NOTICE([Sanitizing exec_prefix: ${exec_prefix}])
AS_CASE([$exec_prefix],
        [prefix|NONE], [exec_prefix=$prefix])

AC_MSG_NOTICE([Sanitizing libdir: ${libdir}])
AS_CASE([$libdir],
        [prefix|NONE], [
            AC_MSG_CHECKING([which lib directory to use])
            dnl The first existing candidate wins; lib64 is tried before lib
            for aDir in lib64 lib; do
                trydir="${exec_prefix}/${aDir}"
                AS_IF([test -d ${trydir}],
                      [
                          libdir=${trydir}
                          break
                      ])
            done
            AC_MSG_RESULT([$libdir])
        ])

# Start a list of optional features this build supports
PCMK_FEATURES=""

dnl This section should include only the definition of configure script
dnl options and determining their values. Processing should be done later when
dnl possible, other than what's needed to determine values and defaults.

dnl Per the autoconf docs, --enable-*/--disable-* options should control
dnl features inherent to Pacemaker, while --with-*/--without-* options should
dnl control the use of external software. However, --enable-*/--disable-* may
dnl implicitly require additional external dependencies, and
dnl --with-*/--without-* may implicitly enable or disable features, so the
dnl line is blurry.
dnl
dnl We also use --with-* options for custom file, directory, and path
dnl locations, since autoconf does not provide an option type for those.
dnl --enable-* options: build process AC_ARG_ENABLE([quiet], [AS_HELP_STRING([--enable-quiet], [suppress make output unless there is an error @<:@no@:>@])] ) yes_no_try "$enable_quiet" "no" enable_quiet=$? AC_ARG_ENABLE([fatal-warnings], [AS_HELP_STRING([--enable-fatal-warnings], [enable pedantic and fatal warnings for gcc @<:@try@:>@])], ) yes_no_try "$enable_fatal_warnings" "try" enable_fatal_warnings=$? AC_ARG_ENABLE([hardening], [AS_HELP_STRING([--enable-hardening], [harden the resulting executables/libraries @<:@try@:>@])] ) yes_no_try "$enable_hardening" "try" enable_hardening=$? dnl --enable-* options: features within Pacemaker dnl @COMPAT This should be --with-systemd AC_ARG_ENABLE([systemd], [AS_HELP_STRING([--enable-systemd], [enable support for managing resources via systemd @<:@try@:>@])] ) yes_no_try "$enable_systemd" "try" enable_systemd=$? AC_ARG_ENABLE([deprecated-libs], [AS_HELP_STRING([--enable-deprecated-libs], [Build and install deprecated C libraries @<:@yes@:>@])] ) yes_no_try "$enable_deprecated_libs" "yes" enable_deprecated_libs=$? AM_CONDITIONAL([BUILD_DEPRECATED_LIBS], [test $enable_deprecated_libs -ne $DISABLED]) # AM_GNU_GETTEXT calls AM_NLS which defines the nls option, but it defaults # to enabled. We override the definition of AM_NLS to flip the default and mark # it as experimental in the help text. 
AC_DEFUN([AM_NLS], [AC_MSG_CHECKING([whether NLS is requested]) AC_ARG_ENABLE([nls], [AS_HELP_STRING([--enable-nls], [use Native Language Support (experimental)])], USE_NLS=$enableval, USE_NLS=no) AC_MSG_RESULT([$USE_NLS]) AC_SUBST([USE_NLS])] ) AM_GNU_GETTEXT([external]) AM_GNU_GETTEXT_VERSION([0.18]) dnl --with-* options: external software support, and custom values dnl This argument is defined via an M4 macro so default can be a variable AC_DEFUN([VERSION_ARG], [AC_ARG_WITH([version], [AS_HELP_STRING([--with-version=VERSION], [override package version @<:@$1@:>@])], [ PACEMAKER_VERSION="$withval" ], [ PACEMAKER_VERSION="$PACKAGE_VERSION" ])] ) VERSION_ARG(VERSION_NUMBER) CRM_DAEMON_USER="" AC_ARG_WITH([daemon-user], [AS_HELP_STRING([--with-daemon-user=USER], [user to run unprivileged Pacemaker daemons as (advanced option: changing this may break other cluster components unless similarly configured) @<:@hacluster@:>@])], [ CRM_DAEMON_USER="$withval" ] ) AS_IF([test x"${CRM_DAEMON_USER}" = x""], [CRM_DAEMON_USER="hacluster"]) CRM_DAEMON_GROUP="" AC_ARG_WITH([daemon-group], [AS_HELP_STRING([--with-daemon-group=GROUP], [group to run unprivileged Pacemaker daemons as (advanced option: changing this may break other cluster components unless similarly configured) @<:@haclient@:>@])], [ CRM_DAEMON_GROUP="$withval" ] ) AS_IF([test x"${CRM_DAEMON_GROUP}" = x""], [CRM_DAEMON_GROUP="haclient"]) BUG_URL="" AC_ARG_WITH([bug-url], [AS_HELP_STRING([--with-bug-url=DIR], m4_normalize([ address where users should submit bug reports @<:@https://bugs.clusterlabs.org/enter_bug.cgi?product=Pacemaker@:>@]))], [ BUG_URL="$withval" ] ) AS_IF([test x"${BUG_URL}" = x""], [BUG_URL="https://bugs.clusterlabs.org/enter_bug.cgi?product=Pacemaker"]) dnl @COMPAT This should be --enable-cibsecrets option AC_ARG_WITH([cibsecrets], [AS_HELP_STRING([--with-cibsecrets], [support separate file for CIB secrets @<:@no@:>@])] ) yes_no_try "$with_cibsecrets" "no" with_cibsecrets=$? 
PCMK__GNUTLS_PRIORITIES="NORMAL" AC_ARG_WITH([gnutls-priorities], [AS_HELP_STRING([--with-gnutls-priorities], [default GnuTLS cipher priorities @<:@NORMAL@:>@])], [ test x"$withval" = x"no" || PCMK__GNUTLS_PRIORITIES="$withval" ] ) AC_ARG_WITH([concurrent-fencing-default], [AS_HELP_STRING([--with-concurrent-fencing-default], m4_normalize([ default value for concurrent-fencing cluster option (deprecated) @<:@true@:>@]))], ) AS_CASE([$with_concurrent_fencing_default], [""], [with_concurrent_fencing_default="true"], [true], [], [false], [PCMK_FEATURES="$PCMK_FEATURES concurrent-fencing-default-false"], [AC_MSG_ERROR([Invalid value "$with_concurrent_fencing_default" for --with-concurrent-fencing-default])] ) AC_ARG_WITH([sbd-sync-default], [AS_HELP_STRING([--with-sbd-sync-default], m4_normalize([ default value used by sbd if SBD_SYNC_RESOURCE_STARTUP environment variable is not set @<:@false@:>@]))], ) AS_CASE([$with_sbd_sync_default], [""], [with_sbd_sync_default=false], [false], [], [true], [PCMK_FEATURES="$PCMK_FEATURES default-sbd-sync"], [AC_MSG_ERROR([Invalid value "$with_sbd_sync_default" for --with-sbd-sync-default])] ) AC_ARG_WITH([resource-stickiness-default], [AS_HELP_STRING([--with-resource-stickiness-default], [If positive, value to add to new CIBs as explicit resource default for resource-stickiness @<:@0@:>@])], ) errmsg="Invalid value \"$with_resource_stickiness_default\" for --with-resource-stickiness-default" AS_CASE([$with_resource_stickiness_default], [0|""], [with_resource_stickiness_default="0"], [*[[!0-9]]*], [AC_MSG_ERROR([$errmsg])], [PCMK_FEATURES="$PCMK_FEATURES default-resource-stickiness"] ) AC_ARG_WITH([corosync], [AS_HELP_STRING([--with-corosync], [support the Corosync messaging and membership layer @<:@try@:>@])] ) yes_no_try "$with_corosync" "try" with_corosync=$? 
dnl Get default from Corosync if possible PKG_CHECK_VAR([PCMK__COROSYNC_CONF], [corosync], [corosysconfdir], [PCMK__COROSYNC_CONF="$PCMK__COROSYNC_CONF/corosync.conf"], [PCMK__COROSYNC_CONF="${sysconfdir}/corosync/corosync.conf"]) AC_ARG_WITH([corosync-conf], [AS_HELP_STRING([--with-corosync-conf], m4_normalize([ location of Corosync configuration file @<:@value from Corosync package if available otherwise SYSCONFDIR/corosync/corosync.conf@:>@]))], [ PCMK__COROSYNC_CONF="$withval" ] ) dnl --with-* options: directory locations INITDIR="" AC_ARG_WITH([initdir], [AS_HELP_STRING([--with-initdir=DIR], m4_normalize([ directory for lsb resources (init scripts), or "try" to check for common locations, or "no" to disable] @<:@try@:>@))], [ INITDIR="$withval" ] ) AS_IF([test x"$INITDIR" = x""], [INITDIR="try"]) systemdsystemunitdir="${systemdsystemunitdir-}" AC_ARG_WITH([systemdsystemunitdir], [AS_HELP_STRING([--with-systemdsystemunitdir=DIR], [directory for systemd unit files (advanced option: must match what systemd uses)])], [ systemdsystemunitdir="$withval" ] ) CONFIGDIR="" AC_ARG_WITH([configdir], [AS_HELP_STRING([--with-configdir=DIR], [directory for Pacemaker configuration file @<:@SYSCONFDIR/sysconfig@:>@])], [ CONFIGDIR="$withval" ] ) dnl --runstatedir is available as of autoconf 2.70 (2020-12-08). When users dnl have an older version, they can use our --with-runstatedir. 
pcmk_runstatedir="" AC_ARG_WITH([runstatedir], [AS_HELP_STRING([--with-runstatedir=DIR], [modifiable per-process data @<:@LOCALSTATEDIR/run@:>@ (ignored if --runstatedir is available)])], [ pcmk_runstatedir="$withval" ] ) CRM_LOG_DIR="" AC_ARG_WITH([logdir], [AS_HELP_STRING([--with-logdir=DIR], [directory for Pacemaker log file @<:@LOCALSTATEDIR/log/pacemaker@:>@])], [ CRM_LOG_DIR="$withval" ] ) CRM_BUNDLE_DIR="" AC_ARG_WITH([bundledir], [AS_HELP_STRING([--with-bundledir=DIR], [directory for Pacemaker bundle logs @<:@LOCALSTATEDIR/log/pacemaker/bundles@:>@])], [ CRM_BUNDLE_DIR="$withval" ] ) dnl Get default from resource-agents if possible. Otherwise, the default uses dnl /usr/lib rather than libdir because it's determined by the OCF project and dnl not Pacemaker. Even if a user wants to install Pacemaker to /usr/local or dnl such, the OCF agents will be expected in their usual location. However, we dnl do give the user the option to override it. PKG_CHECK_VAR([PCMK_OCF_ROOT], [resource-agents], [ocfrootdir], [], [PCMK_OCF_ROOT="/usr/lib/ocf"]) AC_ARG_WITH([ocfdir], [AS_HELP_STRING([--with-ocfdir=DIR], m4_normalize([ OCF resource agent root directory (advanced option: changing this may break other cluster components unless similarly configured) @<:@value from resource-agents package if available otherwise /usr/lib/ocf@:>@]))], [ PCMK_OCF_ROOT="$withval" ] ) dnl Get default from resource-agents if possible PKG_CHECK_VAR([PCMK__OCF_RA_PATH], [resource-agents], [ocfrapath], [], [PCMK__OCF_RA_PATH="$PCMK_OCF_ROOT/resource.d"]) AC_ARG_WITH([ocfrapath], [AS_HELP_STRING([--with-ocfrapath=DIR], m4_normalize([ OCF resource agent directories (colon-separated) to search @<:@value from resource-agents package if available otherwise OCFDIR/resource.d@:>@]))], [ PCMK__OCF_RA_PATH="$withval" ] ) OCF_RA_INSTALL_DIR="$PCMK_OCF_ROOT/resource.d" AC_ARG_WITH([ocfrainstalldir], [AS_HELP_STRING([--with-ocfrainstalldir=DIR], m4_normalize([ OCF installation directory for Pacemakers 
resource agents @<:@OCFDIR/resource.d@:>@]))], [ OCF_RA_INSTALL_DIR="$withval" ] ) dnl Get default from fence-agents if available PKG_CHECK_VAR([FA_PREFIX], [fence-agents], [prefix], [PCMK__FENCE_BINDIR="${FA_PREFIX}/sbin"], [PCMK__FENCE_BINDIR="$sbindir"]) AC_ARG_WITH([fence-bindir], [AS_HELP_STRING([--with-fence-bindir=DIR], m4_normalize([ directory for executable fence agents @<:@value from fence-agents package if available otherwise SBINDIR@:>@]))], [ PCMK__FENCE_BINDIR="$withval" ] ) dnl --with-* options: non-production testing AC_ARG_WITH([profiling], [AS_HELP_STRING([--with-profiling], [disable optimizations, for effective profiling @<:@no@:>@])] ) yes_no_try "$with_profiling" "no" with_profiling=$? AC_ARG_WITH([coverage], [AS_HELP_STRING([--with-coverage], [disable optimizations, for effective profiling and coverage testing @<:@no@:>@])] ) yes_no_try "$with_coverage" "no" with_coverage=$? AC_DEFINE_UNQUOTED([PCMK__WITH_COVERAGE], [$with_coverage], [Build with code coverage]) AM_CONDITIONAL([BUILD_COVERAGE], [test $with_coverage -ne $DISABLED]) AC_ARG_WITH([sanitizers], [AS_HELP_STRING([--with-sanitizers=...,...], [enable SANitizer build, do *NOT* use for production. Only ASAN/UBSAN/TSAN are currently supported])], [ SANITIZERS="$withval" ], [ SANITIZERS="" ]) dnl Environment variable options AC_ARG_VAR([CFLAGS_HARDENED_LIB], [extra C compiler flags for hardened libraries]) AC_ARG_VAR([LDFLAGS_HARDENED_LIB], [extra linker flags for hardened libraries]) AC_ARG_VAR([CFLAGS_HARDENED_EXE], [extra C compiler flags for hardened executables]) AC_ARG_VAR([LDFLAGS_HARDENED_EXE], [extra linker flags for hardened executables]) dnl ============================================== dnl Locate essential tools dnl ============================================== PATH="$PATH:/sbin:/usr/sbin:/usr/local/sbin:/usr/local/bin" export PATH dnl Pacemaker's executable python scripts will invoke the python specified by dnl configure's PYTHON variable. 
If not specified, AM_PATH_PYTHON will check a dnl built-in list with (unversioned) "python" having precedence. To configure dnl Pacemaker to use a specific python interpreter version, define PYTHON dnl when calling configure, for example: ./configure PYTHON=/usr/bin/python3.6 dnl If PYTHON was specified, ensure it is an absolute path AS_IF([test x"${PYTHON}" != x""], [AC_PATH_PROG([PYTHON], [$PYTHON])]) dnl Require a minimum Python version AM_PATH_PYTHON([3.6]) AC_PROG_LN_S AC_PROG_MKDIR_P AC_PATH_PROG([GIT], [git], [false]) dnl Bash is needed for building man pages and running regression tests. dnl We set "BASH_PATH" because "BASH" is already an environment variable. REQUIRE_PROG([BASH_PATH], [bash]) AC_PATH_PROGS(PCMK__VALGRIND_EXEC, valgrind, /usr/bin/valgrind) AC_DEFINE_UNQUOTED(PCMK__VALGRIND_EXEC, "$PCMK__VALGRIND_EXEC", Valgrind command) dnl ============================================== dnl Package and schema versioning dnl ============================================== # Redefine PACKAGE_VERSION and VERSION according to PACEMAKER_VERSION in case # the user used --with-version. Unfortunately, this can only affect the # substitution variables and later uses in this file, not the config.h # constants, so we have to be careful to use only PACEMAKER_VERSION in C code. 
PACKAGE_VERSION=$PACEMAKER_VERSION
VERSION=$PACEMAKER_VERSION

AC_DEFINE_UNQUOTED(PACEMAKER_VERSION, "$VERSION",
                   [Version number of this Pacemaker build])

AC_MSG_CHECKING([build version])
AS_IF([test "$GIT" != "false" && test -d .git],
      [
          BUILD_VERSION=$("$GIT" log --pretty="format:%h" -n 1)
          AC_MSG_RESULT([$BUILD_VERSION (git hash)])
      ],
      [
          # The current directory name makes a reasonable default
          # Most generated archives will include the hash or tag
          BASE=$(basename "$PWD")
          BUILD_VERSION=$(echo "$BASE" | sed 's:.*[[Pp]]acemaker-::')
          AC_MSG_RESULT([$BUILD_VERSION (directory name)])
      ])

AC_DEFINE_UNQUOTED(BUILD_VERSION, "$BUILD_VERSION", Build version)
AC_SUBST(BUILD_VERSION)

# schema_files SCHEMA_DIR
#  List all manually edited RNG schemas (as opposed to auto-generated via make)
#  in the given directory. Use git if available to list managed RNGs, in case
#  there are leftover schema files from an earlier build of a different
#  version. Otherwise, check all RNGs.
#
#  Output: one file name per line on stdout.
#
#  The bracket expression in the grep pattern is written with doubled
#  brackets because it sits inside a macro argument, where m4 removes one
#  level of brackets; single brackets would be stripped and the exclusion of
#  generated schemas would never match.
schema_files() {
    local files

    # An empty result (git missing, not a checkout, or nothing tracked)
    # selects the directory-listing fallback below.
    files="$("$GIT" ls-files "$1"/*.rng 2>/dev/null)"

    AS_IF([test x"$files" = x""],
          [
              files="$(ls -1 "$1"/*.rng | grep -E -v                           \
                       '/(pacemaker|api-result|crm_mon|versions)[[^/]]*\.rng')"
          ])
    echo "$files"
}

# latest_schema_version SCHEMA_DIR
#  Determine highest RNG version in the given schema directory.
#  The version is taken from the "-VERSION.rng" suffix of each file name
#  reported by schema_files, and the highest one is printed on stdout.
latest_schema_version() {
    schema_files "$1" | sed -n -e 's/^.*-\([[0-9]][[0-9.]]*\)\.rng$/\1/p' dnl
                      | sort -V | tail -1
}

# schemas_for_make
#  Like schema_files, but suitable for use in make variables.
schemas_for_make() {
    local file

    # Each entry is prefixed with a literal make reference to top_srcdir and
    # followed by a space, all on one output line.
    for file in $(schema_files "$1")
    do
        AS_ECHO_N(["\$(top_srcdir)/$file "])
    done
}

# Detect highest API schema version
API_VERSION=$(latest_schema_version "xml/api")
AC_DEFINE_UNQUOTED([PCMK__API_VERSION], ["$API_VERSION"],
                   [Highest API schema version])

# Detect highest CIB schema version
CIB_VERSION=$(latest_schema_version "xml")
AC_SUBST(CIB_VERSION)

# Re-run configure at next make if schema files change, to re-detect versions
cib_schemas="$(schemas_for_make "xml")"
api_schemas="$(schemas_for_make "xml/api")"
CONFIG_STATUS_DEPENDENCIES="$cib_schemas $api_schemas"
AC_SUBST(CONFIG_STATUS_DEPENDENCIES)


dnl ==============================================
dnl Process simple options
dnl ==============================================

AS_IF([test x"$enable_nls" = x"yes"], [PCMK_FEATURES="$PCMK_FEATURES nls"])

AS_IF([test x"$with_concurrent_fencing_default" = x"true"],
      [PCMK__CONCURRENT_FENCING_DEFAULT_TRUE="1"],
      [PCMK__CONCURRENT_FENCING_DEFAULT_TRUE="0"])
AC_DEFINE_UNQUOTED([PCMK__CONCURRENT_FENCING_DEFAULT_TRUE],
                   [$PCMK__CONCURRENT_FENCING_DEFAULT_TRUE],
                   [Whether concurrent-fencing cluster option default is true])

AC_DEFINE_UNQUOTED([PCMK__SBD_SYNC_DEFAULT], [$with_sbd_sync_default],
                   [Default value for SBD_SYNC_RESOURCE_STARTUP environment variable])

AC_DEFINE_UNQUOTED([PCMK__RESOURCE_STICKINESS_DEFAULT],
                   [$with_resource_stickiness_default],
                   [Default value for resource-stickiness resource meta-attribute])

AS_IF([test x"${PCMK__GNUTLS_PRIORITIES}" != x""], [],
      [AC_MSG_ERROR([--with-gnutls-priorities value must not be empty])])
AC_DEFINE_UNQUOTED([PCMK__GNUTLS_PRIORITIES], ["$PCMK__GNUTLS_PRIORITIES"],
                   [GnuTLS cipher priorities])
AC_SUBST(PCMK__GNUTLS_PRIORITIES)

AC_SUBST(BUG_URL)
AC_DEFINE_UNQUOTED([PCMK__BUG_URL], ["$BUG_URL"],
                   [Where bugs should be reported])

AC_DEFINE_UNQUOTED([CRM_DAEMON_USER], ["$CRM_DAEMON_USER"],
                   [User to run Pacemaker daemons as])
AC_SUBST(CRM_DAEMON_USER)

AC_DEFINE_UNQUOTED([CRM_DAEMON_GROUP], ["$CRM_DAEMON_GROUP"],
                   [Group to run Pacemaker daemons as])
AC_SUBST(CRM_DAEMON_GROUP)


dnl ==============================================
dnl Process file paths
dnl ==============================================

# expand_path_option VARNAME [DEFAULT]
#  VARNAME is the *name* of a shell variable holding a file path. Any
#  variable references in its value are expanded; if the result is empty,
#  the (expanded) DEFAULT is used instead. The final value must start with
#  "/" and is assigned back to VARNAME; otherwise configure aborts.
#
#  Works through the global scratch variables ac_path_varname and
#  ac_path_value. The eval statements are order-sensitive; keep them as is.
expand_path_option() {
    # The first argument is the variable *name* (not value)
    ac_path_varname="$1"

    # Get the original value of the variable
    ac_path_value=$(eval echo "\${${ac_path_varname}}")

    # Expand any literal variable expressions in the value so that we don't
    # end up with something like '${prefix}' in #defines etc.
    #
    # Autoconf deliberately leaves values unexpanded to allow overriding
    # the configure script choices in make commands (for example,
    # "make exec_prefix=/foo install"). No longer being able to do this seems
    # like no great loss.
    eval ac_path_value=$(eval echo "${ac_path_value}")

    # Use (expanded) default if necessary
    AS_IF([test x"${ac_path_value}" = x""],
          [eval ac_path_value=$(eval echo "$2")])

    # Require a full path
    AS_CASE(["$ac_path_value"],
            [/*], [eval ${ac_path_varname}="$ac_path_value"],
            [*], [AC_MSG_ERROR([$ac_path_varname value "$ac_path_value" is not a full path])]
    )
}

dnl Expand values of autoconf-provided directory options
expand_path_option prefix
expand_path_option exec_prefix
expand_path_option bindir
expand_path_option sbindir
expand_path_option libexecdir
expand_path_option datarootdir
expand_path_option datadir
expand_path_option sysconfdir
expand_path_option sharedstatedir
expand_path_option localstatedir
expand_path_option libdir
expand_path_option includedir
expand_path_option oldincludedir
expand_path_option infodir
expand_path_option mandir

AC_DEFUN([AC_DATAROOTDIR_CHECKED])

dnl Expand values of custom directory options

dnl With "try", the first existing candidate directory is used; if none
dnl exists, INITDIR ends up as "no"
AS_IF([test x"$INITDIR" = x"try"],
      [
          AC_MSG_CHECKING([for an init directory])
          INITDIR=no
          for initdir in /etc/init.d /etc/rc.d/init.d /sbin/init.d \
              /usr/local/etc/rc.d /etc/rc.d ${sysconfdir}/init.d
          do
              AS_IF([test -d $initdir],
                    [
                        INITDIR=$initdir
                        break
                    ])
          done
          AC_MSG_RESULT([$INITDIR])
      ])

support_lsb=$DISABLED
AM_CONDITIONAL([BUILD_LSB], [test x"${INITDIR}" != x"no"])
AM_COND_IF([BUILD_LSB],
           [
               support_lsb=$REQUIRED
               expand_path_option INITDIR
               PCMK_FEATURES="$PCMK_FEATURES lsb"
           ],
           [
               INITDIR=""
           ])
AC_SUBST(INITDIR)
AC_DEFINE_UNQUOTED([PCMK__ENABLE_LSB], [$support_lsb],
                   [Whether to support LSB resource agents])
AC_DEFINE_UNQUOTED([PCMK__LSB_INIT_DIR], ["$INITDIR"],
                   [Location for LSB init scripts])

expand_path_option localedir "${datadir}/locale"
AC_DEFINE_UNQUOTED([PCMK__LOCALE_DIR],["$localedir"],
                   [Base directory for message catalogs])

dnl Fall back to our own --with-runstatedir value when runstatedir is empty
AS_IF([test x"${runstatedir}" = x""], [runstatedir="${pcmk_runstatedir}"])
expand_path_option runstatedir "${localstatedir}/run"
AC_DEFINE_UNQUOTED([PCMK__RUN_DIR], ["$runstatedir"],
                   [Location for modifiable per-process data])
AC_SUBST(runstatedir)

expand_path_option docdir "${datadir}/doc/${PACKAGE}-${VERSION}"
AC_SUBST(docdir)

expand_path_option CONFIGDIR "${sysconfdir}/sysconfig"
AC_SUBST(CONFIGDIR)

expand_path_option PCMK__COROSYNC_CONF "${sysconfdir}/corosync/corosync.conf"
AC_SUBST(PCMK__COROSYNC_CONF)

expand_path_option CRM_LOG_DIR "${localstatedir}/log/pacemaker"
AC_DEFINE_UNQUOTED([CRM_LOG_DIR], ["$CRM_LOG_DIR"],
                   [Location for Pacemaker log file])
AC_SUBST(CRM_LOG_DIR)

expand_path_option CRM_BUNDLE_DIR "${localstatedir}/log/pacemaker/bundles"
AC_DEFINE_UNQUOTED([CRM_BUNDLE_DIR], ["$CRM_BUNDLE_DIR"],
                   [Location for Pacemaker bundle logs])
AC_SUBST(CRM_BUNDLE_DIR)

expand_path_option PCMK__FENCE_BINDIR
AC_SUBST(PCMK__FENCE_BINDIR)
AC_DEFINE_UNQUOTED([PCMK__FENCE_BINDIR], ["$PCMK__FENCE_BINDIR"],
                   [Location for executable fence agents])

expand_path_option PCMK_OCF_ROOT
AC_SUBST(PCMK_OCF_ROOT)
AC_DEFINE_UNQUOTED([PCMK_OCF_ROOT], ["$PCMK_OCF_ROOT"],
                   [OCF root directory for resource agents and libraries])

expand_path_option PCMK__OCF_RA_PATH
# Export the resource agent search path both as a substitution variable and
# as a C preprocessor define
AC_SUBST(PCMK__OCF_RA_PATH)
AC_DEFINE_UNQUOTED([PCMK__OCF_RA_PATH], ["$PCMK__OCF_RA_PATH"],
                   [OCF directories to search for resource agents ])

expand_path_option OCF_RA_INSTALL_DIR
AC_SUBST(OCF_RA_INSTALL_DIR)

# Derived paths
#
# Each path below is composed from a base directory variable plus a fixed
# suffix, then exported as a C preprocessor define and as a substitution
# variable.

PCMK_SCHEMA_DIR="${datadir}/pacemaker"
AC_DEFINE_UNQUOTED([PCMK_SCHEMA_DIR], ["$PCMK_SCHEMA_DIR"],
                   [Location for the Pacemaker Relax-NG Schema])
AC_SUBST(PCMK_SCHEMA_DIR)

PCMK__REMOTE_SCHEMA_DIR="${localstatedir}/lib/pacemaker/schemas"
AC_DEFINE_UNQUOTED([PCMK__REMOTE_SCHEMA_DIR], ["$PCMK__REMOTE_SCHEMA_DIR"],
                   [Location to store Relax-NG Schema files on remote nodes])
AC_SUBST(PCMK__REMOTE_SCHEMA_DIR)

CRM_CORE_DIR="${localstatedir}/lib/pacemaker/cores"
AC_DEFINE_UNQUOTED([CRM_CORE_DIR], ["$CRM_CORE_DIR"],
                   [Directory Pacemaker daemons should change to (without systemd, core files will go here)])
AC_SUBST(CRM_CORE_DIR)

PCMK__PERSISTENT_DATA_DIR="${localstatedir}/lib/pacemaker"
AC_DEFINE_UNQUOTED([PCMK__PERSISTENT_DATA_DIR], ["$PCMK__PERSISTENT_DATA_DIR"],
                   [Location to store directory produced by Pacemaker daemons])
AC_SUBST(PCMK__PERSISTENT_DATA_DIR)

CRM_BLACKBOX_DIR="${localstatedir}/lib/pacemaker/blackbox"
AC_DEFINE_UNQUOTED([CRM_BLACKBOX_DIR], ["$CRM_BLACKBOX_DIR"],
                   [Where to keep blackbox dumps])
AC_SUBST(CRM_BLACKBOX_DIR)

PCMK_SCHEDULER_INPUT_DIR="${localstatedir}/lib/pacemaker/pengine"
AC_DEFINE_UNQUOTED([PCMK_SCHEDULER_INPUT_DIR], ["$PCMK_SCHEDULER_INPUT_DIR"],
                   [Where to keep scheduler outputs])
AC_SUBST(PCMK_SCHEDULER_INPUT_DIR)

CRM_CONFIG_DIR="${localstatedir}/lib/pacemaker/cib"
AC_DEFINE_UNQUOTED([CRM_CONFIG_DIR], ["$CRM_CONFIG_DIR"],
                   [Where to keep configuration files])
AC_SUBST(CRM_CONFIG_DIR)

CRM_DAEMON_DIR="${libexecdir}/pacemaker"
AC_DEFINE_UNQUOTED([CRM_DAEMON_DIR], ["$CRM_DAEMON_DIR"],
                   [Location for Pacemaker daemons])
AC_SUBST(CRM_DAEMON_DIR)

CRM_STATE_DIR="${runstatedir}/crm"
AC_DEFINE_UNQUOTED([CRM_STATE_DIR], ["$CRM_STATE_DIR"],
                   [Where to keep state files and sockets])
AC_SUBST(CRM_STATE_DIR)
PCMK__OCF_TMP_DIR="${runstatedir}/resource-agents"
AC_DEFINE_UNQUOTED([PCMK__OCF_TMP_DIR], ["$PCMK__OCF_TMP_DIR"],
                   [Where resource agents should keep state files])
AC_SUBST(PCMK__OCF_TMP_DIR)

PACEMAKER_CONFIG_DIR="${sysconfdir}/pacemaker"
AC_DEFINE_UNQUOTED([PACEMAKER_CONFIG_DIR], ["$PACEMAKER_CONFIG_DIR"],
                   [Where to keep configuration files like authkey])
AC_SUBST(PACEMAKER_CONFIG_DIR)

# Fedora >=42 makes /usr/sbin a symlink to /usr/bin. It updates the RPM macros
# to set _sbindir to "${_exec_prefix}/bin", the same value as _bindir.
# Previously it was set to "${_exec_prefix}/sbin". (Note that because of the
# symlink, paths beginning with /usr/sbin remain valid.)
#
# This causes problems with bundle resources. Pacemaker automatically generates
# a configuration for the bundle's container resource. If the bundle contains a
# primitive and the container's run-command attribute is unset, the generated
# container resource has its run_cmd attribute set to
# SBIN_DIR "/" PCMK__SERVER_REMOTED, which is intended as a reasonable default.
# If SBIN_DIR becomes "/usr/bin" instead of "/usr/sbin", at least two problems
# can occur:
# 1. The container resource's digest changes compared to the digest in the
#    resource history entry. Pacemaker interprets this as a configuration
#    change and restarts the container resource.
# 2. If the container is running a different OS distro or an older version of
#    Fedora, then the new /usr/bin/pacemaker-remoted path may be invalid; the
#    executable was installed at /usr/sbin/pacemaker-remoted, which is NOT a
#    symlink to /usr/bin path. In this case, the container fails to start.
#
# We override the value only for the SBIN_DIR constant, which is used only for
# the sbd path and the default pacemaker-remoted path. There is no need to
# override sbindir, which would affect installation directories.
#
# There is no more specific way than the below, to detect whether the build
# system has this /usr/sbin vs. /usr/bin change in effect. Thus corner cases are
# possible when sbindir/bindir are manually specified or in distros with
# atypical defaults.
#
# At time of writing, autoconf is unchanged. However, we perform the override
# here instead of in the spec file, in case autoconf changes in the future.
#
# Note that other distros (for example, RHEL) are likely to incorporate these
# changes in the future.
#
# References:
# * https://fedoraproject.org/wiki/Changes/Unify_bin_and_sbin
# * https://bodhi.fedoraproject.org/updates/FEDORA-2025-da0a082e66
# * https://discussion.fedoraproject.org/t/144562
AS_IF([test x"$sbindir" = x"$bindir" \
       && test x"$sbindir" = x"${exec_prefix}/bin"],
      [SBIN_DIR="${exec_prefix}/sbin"],
      [SBIN_DIR="$sbindir"])
AC_DEFINE_UNQUOTED([SBIN_DIR], ["$SBIN_DIR"], [Location for system binaries])

# Warn about any directories that don't exist (which may be OK)
for j in prefix exec_prefix bindir sbindir libexecdir datadir sysconfdir \
    sharedstatedir localstatedir libdir includedir oldincludedir infodir \
    mandir INITDIR docdir CONFIGDIR localedir SBIN_DIR
do
    # j holds a variable name; the eval looks up that variable's value
    dirname=$(eval echo '${'${j}'}')
    AS_IF([test -n "$dirname" && test ! -d "$dirname"],
          [AC_MSG_WARN([$j directory ($dirname) does not exist (yet)])])
done


dnl ===============================================
dnl General Processing
dnl ===============================================

# Pick the first available way of identifying the peer of a Unix socket.
# us_auth stays empty until one method has been detected.
us_auth=
AC_CHECK_HEADER([sys/socket.h], [
    AC_CHECK_DECL([SO_PEERCRED], [
        # Linux
        AC_CHECK_TYPE([struct ucred], [
            us_auth=peercred_ucred;
            AC_DEFINE([HAVE_UCRED], [1],
                      [Define if Unix socket auth method is getsockopt(s, SO_PEERCRED, &ucred, ...)])
        ], [
            # OpenBSD
            AC_CHECK_TYPE([struct sockpeercred], [
                us_auth=localpeercred_sockepeercred;
                AC_DEFINE([HAVE_SOCKPEERCRED], [1],
                          [Define if Unix socket auth method is getsockopt(s, SO_PEERCRED, &sockpeercred, ...)])
            ], [], [[#include <sys/socket.h>]])
        ], [[#define _GNU_SOURCE
             #include <sys/socket.h>]])
    ], [], [[#include <sys/socket.h>]])
])

AS_IF([test -z "${us_auth}"], [
    # FreeBSD
    AC_CHECK_DECL([getpeereid], [
        us_auth=getpeereid;
        AC_DEFINE([HAVE_GETPEEREID], [1],
                  [Define if Unix socket auth method is getpeereid(s, &uid, &gid)])
    ], [
        # Solaris/OpenIndiana
        AC_CHECK_DECL([getpeerucred], [
            us_auth=getpeerucred;
            AC_DEFINE([HAVE_GETPEERUCRED], [1],
                      [Define if Unix socket auth method is getpeerucred(s, &ucred)])
        ], [
            AC_MSG_FAILURE([No way to authenticate a Unix socket peer])
        ], [[#include <ucred.h>]])
    ])
])

dnl OS-based decision-making is poor autotools practice; feature-based
dnl mechanisms are strongly preferred. Keep this section to a bare minimum;
dnl regard as a "necessary evil".
dnl Set host_os and host_cpu
AC_CANONICAL_HOST

INIT_EXT=""
PROCFS=0
dnl Solaris and some *BSD versions support procfs but not files we need
AS_CASE(["$host_os"],
        [*bsd*], [INIT_EXT=".sh"],
        [*linux*], [PROCFS=1],
        [darwin*], [
            LIBS="$LIBS -L${prefix}/lib"
            CFLAGS="$CFLAGS -I${prefix}/include"
        ])

AC_SUBST(INIT_EXT)
AM_CONDITIONAL([SUPPORT_PROCFS], [test $PROCFS -eq 1])
AC_DEFINE_UNQUOTED([HAVE_LINUX_PROCFS], [$PROCFS],
                   [Define to 1 if procfs is supported])

dnl On 64-bit PowerPC with GCC, add -m64 unless CFLAGS already mentions
dnl powerpc64
AS_CASE(["$host_cpu"],
        [ppc64|powerpc64], [
            AS_CASE([$CFLAGS],
                    [*powerpc64*], [],
                    [*], [AS_IF([test x"$GCC" = x"yes"], [CFLAGS="$CFLAGS -m64"])
                    ])
        ])


dnl ==============================================
dnl Documentation build dependencies and checks
dnl ==============================================

AC_PATH_PROG([HELP2MAN], [help2man])
AC_PATH_PROG([SPHINX], [sphinx-build])
AC_PATH_PROG([XSLTPROC], [xsltproc])
AC_PATH_PROG([XMLCATALOG], [xmlcatalog])

AM_CONDITIONAL(BUILD_HELP, test x"${HELP2MAN}" != x"")
AS_IF([test x"${HELP2MAN}" != x""],
      [PCMK_FEATURES="$PCMK_FEATURES generated-manpages"])

MANPAGE_XSLT=""
AS_IF([test x"${XSLTPROC}" != x""],
      [
          AC_MSG_CHECKING([for DocBook-to-manpage transform])
          # first try to figure out correct template using xmlcatalog query,
          # resort to extensive (semi-deterministic) file search if that fails
          DOCBOOK_XSL_URI='http://docbook.sourceforge.net/release/xsl/current'
          DOCBOOK_XSL_PATH='manpages/docbook.xsl'
          # Only query the catalog when xmlcatalog was found; with an empty
          # XMLCATALOG the shell would try to run an empty command name
          AS_IF([test x"${XMLCATALOG}" != x""],
                [
                    MANPAGE_XSLT=$(${XMLCATALOG} "" ${DOCBOOK_XSL_URI}/${DOCBOOK_XSL_PATH} \
                                   | sed -n 's|^file://||p;q')
                ])
          AS_IF([test x"${MANPAGE_XSLT}" = x""],
                [
                    DIRS=$(find "${datadir}" -name $(basename $(dirname ${DOCBOOK_XSL_PATH})) \
                           -type d 2>/dev/null | LC_ALL=C sort)
                    XSLT=$(basename ${DOCBOOK_XSL_PATH})
                    for d in ${DIRS}
                    do
                        AS_IF([test -f "${d}/${XSLT}"],
                              [
                                  MANPAGE_XSLT="${d}/${XSLT}"
                                  break
                              ])
                    done
                ])
          dnl Report the result only when the matching "checking" message
          dnl above was printed
          AC_MSG_RESULT([$MANPAGE_XSLT])
      ])
AC_SUBST(MANPAGE_XSLT)
AM_CONDITIONAL(BUILD_XML_HELP, test x"${MANPAGE_XSLT}" != x"")
AS_IF([test x"${MANPAGE_XSLT}" != x""],
      [PCMK_FEATURES="$PCMK_FEATURES agent-manpages"])

AM_CONDITIONAL([BUILD_SPHINX_DOCS], [test x"${SPHINX}" != x""])
AM_COND_IF([BUILD_SPHINX_DOCS], [PCMK_FEATURES="$PCMK_FEATURES books"])

dnl Pacemaker's shell scripts (and thus man page builders) rely on GNU getopt
AC_MSG_CHECKING([for GNU-compatible getopt])
IFS_orig=$IFS
IFS=:
for PATH_DIR in $PATH
do
    IFS=$IFS_orig
    GETOPT_PATH="${PATH_DIR}/getopt"
    AS_IF([test -f "$GETOPT_PATH" && test -x "$GETOPT_PATH"],
          [
              # Exit status 4 from the -T self-test is what the check below
              # accepts. The path is quoted so that a PATH entry containing
              # whitespace still runs the intended file.
              "$GETOPT_PATH" -T >/dev/null 2>/dev/null
              AS_IF([test $? -eq 4], [break])
          ])
    GETOPT_PATH=""
done
IFS=$IFS_orig
AS_IF([test -n "$GETOPT_PATH"], [AC_MSG_RESULT([$GETOPT_PATH])],
      [
          AC_MSG_RESULT([no])
          AC_MSG_ERROR([Could not find required build tool GNU-compatible getopt])
      ])
AC_SUBST([GETOPT_PATH])


dnl ===============================================
dnl Libraries
dnl ===============================================
AC_SEARCH_LIBS([socket], [socket])

dnl Search with an empty LIBS so that whatever the search adds can be captured
dnl in a dedicated variable, then put the saved LIBS back
save_LIBS="$LIBS"
DL_LIBS=""
LIBS=""
AC_SEARCH_LIBS([dlopen], [dl],
               [test "$ac_cv_search_dlopen" = "none required" || DL_LIBS="$LIBS"])
AC_SUBST(DL_LIBS)
LIBS="$save_LIBS"

save_LIBS="$LIBS"
PAM_LIBS=""
LIBS=""
AC_SEARCH_LIBS([pam_start], [pam],
               [test "$ac_cv_search_pam_start" = "none required" || PAM_LIBS="$LIBS"])
AC_SUBST(PAM_LIBS)
LIBS="$save_LIBS"

PKG_CHECK_MODULES([UUID], [uuid],
                  [CPPFLAGS="${CPPFLAGS} ${UUID_CFLAGS}"
                   LIBS="${LIBS} ${UUID_LIBS}"])

# Require minimum glib version
PKG_CHECK_MODULES([GLIB], [glib-2.0 >= 2.42.0],
                  [CPPFLAGS="${CPPFLAGS} ${GLIB_CFLAGS}"
                   LIBS="${LIBS} ${GLIB_LIBS}"])

# Check whether high-resolution sleep function is available
AC_CHECK_FUNCS([nanosleep usleep])

PKG_CHECK_MODULES(LIBXML2, [libxml-2.0 >= 2.9.2],
                  [CPPFLAGS="${CPPFLAGS} ${LIBXML2_CFLAGS}"
                   LIBS="${LIBS} ${LIBXML2_LIBS}"])

AC_PATH_PROGS(XMLLINT_PATH, xmllint, /usr/bin/xmllint)
AC_DEFINE_UNQUOTED(XMLLINT_PATH, "$XMLLINT_PATH", xmllint command)

REQUIRE_LIB([xslt], [xsltApplyStylesheet])

dnl ========================================================================
dnl Headers
dnl ========================================================================

# Some distributions insert #warnings into deprecated headers. If we will
# enable fatal warnings for the build, then enable them for the header checks
# as well, otherwise the build could fail even though the header check
# succeeds. (We should probably be doing this in more places.)
cc_temp_flags "$CFLAGS $WERROR"

# Optional headers (inclusion of these should be conditional in C code)
AC_CHECK_HEADERS([sys/signalfd.h])
AC_CHECK_HEADERS([uuid/uuid.h])
AC_CHECK_HEADERS([security/pam_appl.h pam/pam_appl.h])

dnl The PAM library is located with AC_SEARCH_LIBS above, which records its
dnl outcome in ac_cv_search_pam_start; the value is "no" when pam_start was
dnl not found. Report the feature when the function and a header both exist.
AS_IF([test x"$ac_cv_search_pam_start" != x"" \
       && test x"$ac_cv_search_pam_start" != x"no"],
      [AS_IF([test x"$ac_cv_header_security_pam_appl_h" = x"yes" \
              || test x"$ac_cv_header_pam_pam_appl_h" = x"yes"],
             [PCMK_FEATURES="$PCMK_FEATURES pam"])])

# Required headers
REQUIRE_HEADER([arpa/inet.h])
REQUIRE_HEADER([ctype.h])
REQUIRE_HEADER([dirent.h])
REQUIRE_HEADER([dlfcn.h])
REQUIRE_HEADER([errno.h])
REQUIRE_HEADER([fcntl.h])
REQUIRE_HEADER([float.h])
REQUIRE_HEADER([glib.h])
REQUIRE_HEADER([grp.h])
REQUIRE_HEADER([inttypes.h])
REQUIRE_HEADER([libgen.h])
REQUIRE_HEADER([limits.h])
REQUIRE_HEADER([locale.h])
REQUIRE_HEADER([netdb.h])
REQUIRE_HEADER([netinet/in.h])
REQUIRE_HEADER([netinet/ip.h], [
    #include <sys/types.h>
    #include <netinet/in.h>
])
REQUIRE_HEADER([netinet/tcp.h])
REQUIRE_HEADER([pwd.h])
REQUIRE_HEADER([regex.h])
REQUIRE_HEADER([sched.h])
REQUIRE_HEADER([signal.h])
REQUIRE_HEADER([stdarg.h])
REQUIRE_HEADER([stdbool.h])
REQUIRE_HEADER([stdint.h])
REQUIRE_HEADER([stdio.h])
REQUIRE_HEADER([stdlib.h])
REQUIRE_HEADER([string.h])
REQUIRE_HEADER([strings.h])
REQUIRE_HEADER([sys/ioctl.h])
REQUIRE_HEADER([sys/param.h])
REQUIRE_HEADER([sys/reboot.h])
REQUIRE_HEADER([sys/resource.h])
REQUIRE_HEADER([sys/socket.h])
REQUIRE_HEADER([sys/stat.h])
REQUIRE_HEADER([sys/time.h])
REQUIRE_HEADER([sys/types.h])
REQUIRE_HEADER([sys/uio.h])
REQUIRE_HEADER([sys/utsname.h])
REQUIRE_HEADER([sys/wait.h])
REQUIRE_HEADER([termios.h])
REQUIRE_HEADER([time.h])
REQUIRE_HEADER([unistd.h])
REQUIRE_HEADER([libxml/xpath.h])
REQUIRE_HEADER([libxslt/xslt.h])

cc_restore_flags

dnl ========================================================================
dnl Generic declarations
dnl ========================================================================

AC_CHECK_DECLS([CLOCK_MONOTONIC], [PCMK_FEATURES="$PCMK_FEATURES monotonic"], [], [[
    #include <time.h>
]])

dnl ========================================================================
dnl Unit test declarations
dnl ========================================================================

dnl cmocka.h expects these standard headers to be included before it
AC_CHECK_DECLS([assert_float_equal], [], [], [[
    #include <stdarg.h>
    #include <stddef.h>
    #include <setjmp.h>
    #include <cmocka.h>
]])

dnl ========================================================================
dnl Byte size
dnl ========================================================================

# Compile-time assert hack
# https://jonjagger.blogspot.com/2017/07/compile-time-assertions-in-c.html
#
# If CHAR_BIT is not 8, the second case label evaluates to 0 and duplicates
# the first one, so the program fails to compile.
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <limits.h>]],
                                   [[
                                       switch (0) {
                                           case 0:
                                           case (CHAR_BIT == 8):
                                               break;
                                       }
                                   ]])],
                  [],
                  [AC_MSG_FAILURE(m4_normalize([Pacemaker is not supported on
                                                platforms where char is not 8
                                                bits]))])

dnl ========================================================================
dnl Structures
dnl ========================================================================

AC_CHECK_MEMBERS([struct tm.tm_gmtoff],,,[[#include <time.h>]])

dnl ========================================================================
dnl Functions
dnl ========================================================================

REQUIRE_FUNC([alphasort])
REQUIRE_FUNC([getopt])
REQUIRE_FUNC([scandir])
REQUIRE_FUNC([sched_getscheduler])
REQUIRE_FUNC([setenv])
REQUIRE_FUNC([strndup])
REQUIRE_FUNC([strnlen])
REQUIRE_FUNC([unsetenv])
REQUIRE_FUNC([uuid_unparse])
REQUIRE_FUNC([vasprintf])

AC_CHECK_FUNCS([strchrnul])

AC_CHECK_FUNCS([fopen64])
AM_CONDITIONAL([WRAPPABLE_FOPEN64], [test x"$ac_cv_func_fopen64" = x"yes"])

dnl The test program exits 0 only when strerror returns a non-NULL string for
dnl an invalid error number. A failed run and cross-compilation both abort.
AC_MSG_CHECKING([whether strerror always returns non-NULL])
AC_RUN_IFELSE([AC_LANG_PROGRAM([[
        #include <stdio.h>
        #include <string.h>
    ]], [[
        return strerror(-1) == NULL;
    ]])],
    [AC_MSG_RESULT([yes])],
    [AC_MSG_ERROR([strerror() is not C99-compliant])],
    [AC_MSG_ERROR([strerror() is not C99-compliant])])

dnl The test program exits 0 only when sscanf converts exactly one item using
dnl the "m" modifier. A failed run and cross-compilation both count as "no".
AC_RUN_IFELSE([AC_LANG_PROGRAM([[#include <stdio.h>]], [[
        const char *s = "some-command-line-arg";
        char *name = NULL;
        int n = sscanf(s, "%ms", &name);
        return n != 1;
    ]])],
    [have_sscanf_m="yes"],
    [have_sscanf_m="no"],
    [have_sscanf_m="no"])
AS_IF([test x"$have_sscanf_m" = x"yes"],
      [AC_DEFINE([HAVE_SSCANF_M], [1],
                 [Define to 1 if sscanf %m modifier is available])])

dnl ========================================================================
dnl   bzip2
dnl ========================================================================
REQUIRE_HEADER([bzlib.h])
REQUIRE_LIB([bz2], [BZ2_bzBuffToBuffCompress])

dnl ========================================================================
dnl sighandler_t is missing from Illumos, Solaris11 systems
dnl ========================================================================

AC_MSG_CHECKING([for sighandler_t])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <signal.h>]], [[sighandler_t *f;]])],
                  [
                      AC_MSG_RESULT([yes])
                      AC_DEFINE([HAVE_SIGHANDLER_T], [1],
                                [Define to 1 if sighandler_t is available])
                  ],
                  [AC_MSG_RESULT([no])])

dnl ========================================================================
dnl   ncurses
dnl ========================================================================
dnl
dnl A few OSes (e.g. Linux) deliver a default "ncurses" alongside "curses".
dnl Many non-Linux deliver "curses"; sites may add "ncurses".
dnl
dnl However, the source-code recommendation for both is to #include "curses.h"
dnl (i.e. "ncurses" still wants the include to be simple, no-'n', "curses.h").
dnl
dnl ncurses takes precedence.
dnl
AC_CHECK_HEADERS([curses.h curses/curses.h ncurses.h ncurses/ncurses.h])

dnl Search with an empty LIBS so that the curses library flag can be captured
dnl separately. found_curses is set to 1 whenever printw was found at all.
save_LIBS="$LIBS"
found_curses=0
CURSES_LIBS=""
LIBS=""
AC_SEARCH_LIBS([printw], [ncurses curses],
               [test "$ac_cv_search_printw" = "none required" || CURSES_LIBS="$LIBS"
                found_curses=1],
               [found_curses=0])
LIBS="$save_LIBS"

dnl Check for printw() prototype compatibility
AS_IF([test $found_curses -eq 1 && cc_supports_flag -Wcast-qual],
      [
          ac_save_LIBS="$LIBS"
          LIBS="$CURSES_LIBS"

          # avoid broken test because of hardened build environment in Fedora 23+
          # - https://fedoraproject.org/wiki/Changes/Harden_All_Packages
          # - https://bugzilla.redhat.com/1297985
          AS_IF([cc_supports_flag -fPIC],
                [cc_temp_flags "-Wcast-qual $WERROR -fPIC"],
                [cc_temp_flags "-Wcast-qual $WERROR"])

          AC_MSG_CHECKING([whether curses library is compatible])
          AC_LINK_IFELSE(
              [AC_LANG_PROGRAM([
#if defined(HAVE_NCURSES_H)
#  include <ncurses.h>
#elif defined(HAVE_NCURSES_NCURSES_H)
#  include <ncurses/ncurses.h>
#elif defined(HAVE_CURSES_H)
#  include <curses.h>
#elif defined(HAVE_CURSES_CURSES_H)
#  include <curses/curses.h>
#endif
                  ],
                  [printw((const char *)"Test");]
              )],
              [AC_MSG_RESULT([yes])
               PCMK_FEATURES="$PCMK_FEATURES ncurses"
              ],
              [
                  found_curses=0
                  CURSES_LIBS=""
                  AC_MSG_RESULT([no])
                  AC_MSG_WARN(m4_normalize([Disabling curses because the printw()
                                            function of your (n)curses library is old.
                                            If you wish to enable curses, update to a
                                            newer version (ncurses 5.4 or later is
                                            recommended, available from
                                            https://invisible-island.net/ncurses/)
                                            ]))
              ]
          )

          LIBS="$ac_save_LIBS"
          cc_restore_flags
      ])

AC_DEFINE_UNQUOTED([PCMK__ENABLE_CURSES], [$found_curses], [have ncurses library])
AC_SUBST(CURSES_LIBS)

dnl ========================================================================
dnl    Profiling and GProf
dnl ========================================================================

CFLAGS_ORIG="$CFLAGS"
dnl Requesting coverage also turns on profiling
AS_IF([test $with_coverage -ne $DISABLED],
      [
          with_profiling=$REQUIRED
          PCMK_FEATURES="$PCMK_FEATURES coverage"
          CFLAGS="$CFLAGS -fprofile-arcs -ftest-coverage"
          dnl During linking, make sure to specify -lgcov or -coverage
      ]
)

AS_IF([test $with_profiling -ne $DISABLED],
      [
          with_profiling=$REQUIRED
          PCMK_FEATURES="$PCMK_FEATURES profile"

          dnl Disable various compiler optimizations
          CFLAGS="$CFLAGS -fno-omit-frame-pointer -fno-inline -fno-builtin"
          dnl CFLAGS="$CFLAGS -fno-inline-functions"
          dnl CFLAGS="$CFLAGS -fno-default-inline"
          dnl CFLAGS="$CFLAGS -fno-inline-functions-called-once"
          dnl CFLAGS="$CFLAGS -fno-optimize-sibling-calls"

          dnl Turn off optimization so tools can get accurate line numbers
          CFLAGS=`echo $CFLAGS | sed \
                  -e 's/-O.\ //g' \
                  -e 's/-Wp,-D_FORTIFY_SOURCE=.\ //g' \
                  -e 's/-D_FORTIFY_SOURCE=.\ //g'`
          CFLAGS="$CFLAGS -O0 -g3 -gdwarf-2"

          AC_MSG_NOTICE([CFLAGS before adding profiling options: $CFLAGS_ORIG])
          AC_MSG_NOTICE([CFLAGS after: $CFLAGS])
      ]
)
AM_CONDITIONAL([BUILD_PROFILING], [test "$with_profiling" = "$REQUIRED"])

dnl ========================================================================
dnl    Cluster infrastructure - LibQB
dnl ========================================================================

PKG_CHECK_MODULES([libqb], [libqb >= 1.0.1])
CPPFLAGS="$libqb_CFLAGS $CPPFLAGS"
LIBS="$libqb_LIBS $LIBS"

dnl libqb 2.0.5+ (2022-03)
AC_CHECK_FUNCS([qb_ipcc_connect_async])

dnl libqb 2.0.2+ (2020-10)
AC_CHECK_FUNCS([qb_ipcc_auth_get])

dnl libqb 2.0.0+ (2020-05)
dnl also defines QB_FEATURE_LOG_HIRES_TIMESTAMPS
CHECK_ENUM_VALUE([qb/qblog.h],[qb_log_conf],[QB_LOG_CONF_MAX_LINE_LEN])
CHECK_ENUM_VALUE([qb/qblog.h],[qb_log_conf],[QB_LOG_CONF_ELLIPSIS])

dnl Support Linux-HA fence agents if available
AS_IF([test x"$cross_compiling" != x"yes"],
      [CPPFLAGS="$CPPFLAGS -I${prefix}/include/heartbeat"])
AC_CHECK_HEADERS([stonith/stonith.h],
                 [
                     save_LIBS="$LIBS"
                     STONITH_LIBS=""
                     LIBS=""
                     AC_SEARCH_LIBS([PILLoadPlugin], [pils],
                                    [test "$ac_cv_search_PILLoadPlugin" = "none required" || STONITH_LIBS="$LIBS"])
                     LIBS=""
                     AC_SEARCH_LIBS([G_main_add_IPC_Channel], [plumb],
                                    [test "$ac_cv_search_G_main_add_IPC_Channel" = "none required" || STONITH_LIBS="$STONITH_LIBS $LIBS"])
                     AC_SUBST(STONITH_LIBS)
                     LIBS="$save_LIBS"
                     PCMK_FEATURES="$PCMK_FEATURES lha"
                 ])
AM_CONDITIONAL([BUILD_LHA_SUPPORT], [test x"$ac_cv_header_stonith_stonith_h" = x"yes"])

dnl ===============================
dnl Detect DBus and systemd support
dnl ===============================

HAVE_dbus=0
PC_NAME_DBUS=""
PKG_CHECK_MODULES([DBUS],[dbus-1 >= 1.5.12],
                  [
                      HAVE_dbus=1
                      PC_NAME_DBUS="dbus-1"
                      CPPFLAGS="${CPPFLAGS} ${DBUS_CFLAGS}"
                  ],[])
AC_DEFINE_UNQUOTED(HAVE_DBUS, $HAVE_dbus, Support dbus)
AC_SUBST(PC_NAME_DBUS)

# Look up the systemd unit directory via pkg-config. The return status is 0
# only when a non-empty systemdsystemunitdir was obtained.
check_systemdsystemunitdir() {
    AC_MSG_CHECKING([which system unit file directory to use])
    PKG_CHECK_VAR([systemdsystemunitdir], [systemd], [systemdsystemunitdir])
    AC_MSG_RESULT([${systemdsystemunitdir}])
    test x"$systemdsystemunitdir" != x""
    return $?
}

AS_CASE([$enable_systemd],
        [$REQUIRED], [
            AS_IF([test $HAVE_dbus = 0],
                  [AC_MSG_FAILURE([Cannot support systemd resources without DBus])])
            AS_IF([test "$ac_cv_have_decl_CLOCK_MONOTONIC" = "no"],
                  [AC_MSG_FAILURE([Cannot support systemd resources without monotonic clock])])
            AS_IF([check_systemdsystemunitdir], [],
                  [AC_MSG_FAILURE([Cannot support systemd resources without systemdsystemunitdir])])
        ],
        [$OPTIONAL], [
            AS_IF([test $HAVE_dbus = 0 \
                   || test x"$ac_cv_have_decl_CLOCK_MONOTONIC" = x"no"],
                  [enable_systemd=$DISABLED],
                  [
                      AC_MSG_CHECKING([for systemd version (using dbus-send)])
                      ret=$({ dbus-send --system --print-reply \
                                  --dest=org.freedesktop.systemd1 \
                                  /org/freedesktop/systemd1 \
                                  org.freedesktop.DBus.Properties.Get \
                                  string:org.freedesktop.systemd1.Manager \
                                  string:Version 2>/dev/null \
                              || echo "version unavailable"; } | tail -n1)
                      # sanitize output a bit (interested just in value, not type),
                      # ret is intentionally unenquoted so as to normalize whitespace
                      ret=$(echo ${ret} | cut -d' ' -f2-)
                      AC_MSG_RESULT([${ret}])
                      AS_IF([test x"$ret" != x"unavailable" \
                             || systemctl --version 2>/dev/null | grep -q systemd],
                            [
                                AS_IF([check_systemdsystemunitdir],
                                      [enable_systemd=$REQUIRED],
                                      [enable_systemd=$DISABLED])
                            ],
                            [enable_systemd=$DISABLED]
                      )
                  ])
        ],
)

AC_MSG_CHECKING([whether to enable support for managing resources via systemd])
AS_IF([test $enable_systemd -eq $DISABLED], [AC_MSG_RESULT([no])],
      [
          AC_MSG_RESULT([yes])
          PCMK_FEATURES="$PCMK_FEATURES systemd"
      ]
)
AC_SUBST([systemdsystemunitdir])
AC_DEFINE_UNQUOTED([SUPPORT_SYSTEMD], [$enable_systemd],
                   [Support systemd resources])
AM_CONDITIONAL([BUILD_SYSTEMD], [test $enable_systemd = $REQUIRED])
AC_SUBST(SUPPORT_SYSTEMD)

STACKS=""
CLUSTERLIBS=""
PC_NAME_CLUSTER=""

dnl ========================================================================
dnl Detect support for "service" alias
dnl ========================================================================

dnl The alias is enabled when LSB or systemd support is being built
PCMK__ENABLE_SERVICE=$DISABLED
AM_COND_IF([BUILD_LSB], [PCMK__ENABLE_SERVICE=$REQUIRED])
AM_COND_IF([BUILD_SYSTEMD], [PCMK__ENABLE_SERVICE=$REQUIRED])
AS_IF([test $PCMK__ENABLE_SERVICE -ne $DISABLED],
      [PCMK_FEATURES="$PCMK_FEATURES service"])
AC_SUBST(PCMK__ENABLE_SERVICE)
AC_DEFINE_UNQUOTED([PCMK__ENABLE_SERVICE], [$PCMK__ENABLE_SERVICE],
                   [Whether "service" is supported as an agent standard])

dnl ========================================================================
dnl    Cluster stack - Corosync
dnl ========================================================================

COROSYNC_LIBS=""

dnl Each pattern and each body is a separate, comma-delimited argument.
dnl Without the comma after the first body, the optional pattern would be
dnl glued onto that body and the optional body would become the default.
AS_CASE([$with_corosync],
        [$REQUIRED], [
            # These will be fatal if unavailable
            PKG_CHECK_MODULES([cpg], [libcpg])
            PKG_CHECK_MODULES([cfg], [libcfg])
            PKG_CHECK_MODULES([cmap], [libcmap])
            PKG_CHECK_MODULES([quorum], [libquorum])
            PKG_CHECK_MODULES([libcorosync_common], [libcorosync_common])
        ],
        [$OPTIONAL], [
            PKG_CHECK_MODULES([cpg], [libcpg], [], [with_corosync=$DISABLED])
            PKG_CHECK_MODULES([cfg], [libcfg], [], [with_corosync=$DISABLED])
            PKG_CHECK_MODULES([cmap], [libcmap], [], [with_corosync=$DISABLED])
            PKG_CHECK_MODULES([quorum], [libquorum], [], [with_corosync=$DISABLED])
            PKG_CHECK_MODULES([libcorosync_common], [libcorosync_common], [], [with_corosync=$DISABLED])
            AS_IF([test $with_corosync -ne $DISABLED], [with_corosync=$REQUIRED])
        ]
)

AS_IF([test $with_corosync -ne $DISABLED],
      [
          AC_MSG_CHECKING([for Corosync 2 or later])
          AC_MSG_RESULT([yes])
          CFLAGS="$CFLAGS $libqb_CFLAGS $cpg_CFLAGS $cfg_CFLAGS $cmap_CFLAGS $quorum_CFLAGS $libcorosync_common_CFLAGS"
          CPPFLAGS="$CPPFLAGS `$PKG_CONFIG --cflags-only-I corosync`"
          COROSYNC_LIBS="$COROSYNC_LIBS $cpg_LIBS $cfg_LIBS $cmap_LIBS $quorum_LIBS $libcorosync_common_LIBS"
          CLUSTERLIBS="$CLUSTERLIBS $COROSYNC_LIBS"
          dnl Append to the existing list; the variable is PC_NAME_CLUSTER
          PC_NAME_CLUSTER="$PC_NAME_CLUSTER libcfg libcmap libcorosync_common libcpg libquorum"
          STACKS="$STACKS corosync-ge-2"

          dnl Shutdown tracking added (back) to corosync Jan 2021
          saved_LIBS="$LIBS"
          LIBS="$LIBS $COROSYNC_LIBS"
          AC_CHECK_FUNCS([corosync_cfg_trackstart])
          LIBS="$saved_LIBS"
      ]
)
# Export the final Corosync decision to C code and to the makefiles
AC_DEFINE_UNQUOTED([SUPPORT_COROSYNC], [$with_corosync],
                   [Support the Corosync messaging and membership layer])
AM_CONDITIONAL([BUILD_CS_SUPPORT], [test $with_corosync -eq $REQUIRED])
AC_SUBST([SUPPORT_COROSYNC])

dnl
dnl    Cluster stack - Sanity
dnl

# Abort when the list of supported cluster stacks is empty
AS_IF([test x"$STACKS" != x""], [AC_MSG_NOTICE([Supported stacks:${STACKS}])],
      [AC_MSG_FAILURE([At least one cluster stack must be supported])])

# STACKS entries carry their own leading space, so no separator is added
PCMK_FEATURES="${PCMK_FEATURES}${STACKS}"

AC_SUBST(CLUSTERLIBS)
AC_SUBST(PC_NAME_CLUSTER)

dnl ========================================================================
dnl    CIB secrets
dnl ========================================================================

# Any setting other than disabled is normalized to required. The secrets
# directory is defined and substituted only in that case.
AS_IF([test $with_cibsecrets -ne $DISABLED],
      [
          with_cibsecrets=$REQUIRED
          PCMK_FEATURES="$PCMK_FEATURES cibsecrets"
          PCMK__CIB_SECRETS_DIR="${localstatedir}/lib/pacemaker/lrm/secrets"
          AC_DEFINE_UNQUOTED([PCMK__CIB_SECRETS_DIR], ["$PCMK__CIB_SECRETS_DIR"],
                             [Location for CIB secrets])
          AC_SUBST([PCMK__CIB_SECRETS_DIR])
      ]
)
AC_DEFINE_UNQUOTED([PCMK__ENABLE_CIBSECRETS], [$with_cibsecrets],
                   [Support CIB secrets])
AM_CONDITIONAL([BUILD_CIBSECRETS], [test $with_cibsecrets -eq $REQUIRED])

dnl ========================================================================
dnl    GnuTLS
dnl ========================================================================

# On success, add the GnuTLS compiler and linker flags to the global settings
PKG_CHECK_MODULES(GNUTLS, [gnutls >= 3.4.6],
                  [CPPFLAGS="${CPPFLAGS} ${GNUTLS_CFLAGS}"
                   LIBS="${LIBS} ${GNUTLS_LIBS}"])

# --- ASAN/UBSAN/TSAN (see man gcc) ---
# when using SANitizers, we need to pass the -fsanitize..
# to both CFLAGS and LDFLAGS. The CFLAGS/LDFLAGS must be
# specified as first in the list or there will be runtime
# issues (for example user has to LD_PRELOAD asan for it to work
# properly).
AS_IF([test -n "${SANITIZERS}"], [ SANITIZERS=$(echo $SANITIZERS | sed -e 's/,/ /g') for SANITIZER in $SANITIZERS do AS_CASE([$SANITIZER], [asan|ASAN], [ SANITIZERS_CFLAGS="$SANITIZERS_CFLAGS -fsanitize=address" SANITIZERS_LDFLAGS="$SANITIZERS_LDFLAGS -fsanitize=address -lasan" PCMK_FEATURES="$PCMK_FEATURES asan" REQUIRE_LIB([asan],[main]) ], [ubsan|UBSAN], [ SANITIZERS_CFLAGS="$SANITIZERS_CFLAGS -fsanitize=undefined" SANITIZERS_LDFLAGS="$SANITIZERS_LDFLAGS -fsanitize=undefined -lubsan" PCMK_FEATURES="$PCMK_FEATURES ubsan" REQUIRE_LIB([ubsan],[main]) ], [tsan|TSAN], [ SANITIZERS_CFLAGS="$SANITIZERS_CFLAGS -fsanitize=thread" SANITIZERS_LDFLAGS="$SANITIZERS_LDFLAGS -fsanitize=thread -ltsan" PCMK_FEATURES="$PCMK_FEATURES tsan" REQUIRE_LIB([tsan],[main]) ]) done ]) dnl ======================================================================== dnl Compiler flags dnl ======================================================================== dnl Make sure that CFLAGS is not exported. If the user did dnl not have CFLAGS in their environment then this should have dnl no effect. However if CFLAGS was exported from the user's dnl environment, then the new CFLAGS will also be exported dnl to sub processes. AS_IF([export | fgrep " CFLAGS=" > /dev/null], [ SAVED_CFLAGS="$CFLAGS" unset CFLAGS CFLAGS="$SAVED_CFLAGS" unset SAVED_CFLAGS ]) CC_EXTRAS="" AS_IF([test x"$GCC" != x"yes"], [CFLAGS="$CFLAGS -g"], [ CFLAGS="$CFLAGS -ggdb" dnl When we don't have diagnostic push / pull, we can't explicitly disable dnl checking for nonliteral formats in the places where they occur on purpose dnl thus we disable nonliteral format checking globally as we are aborting dnl on warnings. 
dnl what makes the things really ugly is that nonliteral format checking is dnl obviously available as an extra switch in very modern gcc but for older dnl gcc this is part of -Wformat=2 dnl so if we have push/pull we can enable -Wformat=2 -Wformat-nonliteral dnl if we don't have push/pull but -Wformat-nonliteral we can enable -Wformat=2 dnl otherwise none of both gcc_diagnostic_push_pull=no cc_temp_flags "$CFLAGS $WERROR" AC_MSG_CHECKING([for gcc diagnostic push / pull]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #pragma GCC diagnostic push #pragma GCC diagnostic pop ]])], [ AC_MSG_RESULT([yes]) gcc_diagnostic_push_pull=yes ], AC_MSG_RESULT([no])) cc_restore_flags AS_IF([cc_supports_flag "-Wformat-nonliteral"], [gcc_format_nonliteral=yes], [gcc_format_nonliteral=no]) # We had to eliminate -Wnested-externs because of libtool changes # Make sure to order options so that the former stand for prerequisites # of the latter (e.g., -Wformat-nonliteral requires -Wformat). EXTRA_FLAGS="-fgnu89-inline" EXTRA_FLAGS="$EXTRA_FLAGS -Wall" EXTRA_FLAGS="$EXTRA_FLAGS -Waggregate-return" EXTRA_FLAGS="$EXTRA_FLAGS -Wbad-function-cast" EXTRA_FLAGS="$EXTRA_FLAGS -Wcast-align" EXTRA_FLAGS="$EXTRA_FLAGS -Wdeclaration-after-statement" EXTRA_FLAGS="$EXTRA_FLAGS -Wendif-labels" EXTRA_FLAGS="$EXTRA_FLAGS -Wfloat-equal" EXTRA_FLAGS="$EXTRA_FLAGS -Wformat-security" EXTRA_FLAGS="$EXTRA_FLAGS -Wimplicit-fallthrough" EXTRA_FLAGS="$EXTRA_FLAGS -Wmissing-prototypes" EXTRA_FLAGS="$EXTRA_FLAGS -Wmissing-declarations" EXTRA_FLAGS="$EXTRA_FLAGS -Wnested-externs" EXTRA_FLAGS="$EXTRA_FLAGS -Wno-long-long" EXTRA_FLAGS="$EXTRA_FLAGS -Wno-strict-aliasing" EXTRA_FLAGS="$EXTRA_FLAGS -Wpointer-arith" EXTRA_FLAGS="$EXTRA_FLAGS -Wstrict-prototypes" EXTRA_FLAGS="$EXTRA_FLAGS -Wwrite-strings" EXTRA_FLAGS="$EXTRA_FLAGS -Wunused-but-set-variable" EXTRA_FLAGS="$EXTRA_FLAGS -Wunsigned-char" AS_IF([test x"$gcc_diagnostic_push_pull" = x"yes"], [ AC_DEFINE([HAVE_FORMAT_NONLITERAL], [], [gcc can complain about nonliterals in 
format]) EXTRA_FLAGS="$EXTRA_FLAGS -Wformat=2 -Wformat-nonliteral" ], [test x"$gcc_format_nonliteral" = x"yes"], [EXTRA_FLAGS="$EXTRA_FLAGS -Wformat=2"]) # Additional warnings it might be nice to enable one day # -Wshadow # -Wunreachable-code for j in $EXTRA_FLAGS do AS_IF([cc_supports_flag $CC_EXTRAS $j], [CC_EXTRAS="$CC_EXTRAS $j"]) done AC_MSG_NOTICE([Using additional gcc flags: ${CC_EXTRAS}]) ]) dnl dnl Hardening flags dnl dnl The prime control of whether to apply (targeted) hardening build flags and dnl which ones is --{enable,disable}-hardening option passed to ./configure: dnl dnl --enable-hardening=try (default): dnl depending on whether any of CFLAGS_HARDENED_EXE, LDFLAGS_HARDENED_EXE, dnl CFLAGS_HARDENED_LIB or LDFLAGS_HARDENED_LIB environment variables dnl (see below) is set and non-null, all these custom flags (even if not dnl set) are used as are, otherwise the best effort is made to offer dnl reasonably strong hardening in several categories (RELRO, PIE, dnl "bind now", stack protector) according to what the selected toolchain dnl can offer dnl dnl --enable-hardening: dnl same effect as --enable-hardening=try when the environment variables dnl in question are suppressed dnl dnl --disable-hardening: dnl do not apply any targeted hardening measures at all dnl dnl The user-injected environment variables that regulate the hardening in dnl default case are as follows: dnl dnl * CFLAGS_HARDENED_EXE, LDFLAGS_HARDENED_EXE dnl compiler and linker flags (respectively) for daemon programs dnl (pacemakerd, pacemaker-attrd, pacemaker-controld, pacemaker-execd, dnl pacemaker-based, pacemaker-fenced, pacemaker-remoted, dnl pacemaker-schedulerd) dnl dnl * CFLAGS_HARDENED_LIB, LDFLAGS_HARDENED_LIB dnl compiler and linker flags (respectively) for libraries linked dnl with the daemon programs dnl dnl Note that these are purposedly targeted variables (addressing particular dnl targets all over the scattered Makefiles) and have no effect outside of dnl the predestined 
scope (e.g., CLI utilities). For a global reach, dnl use CFLAGS, LDFLAGS, etc. as usual. dnl dnl For guidance on the suitable flags consult, for instance: dnl https://fedoraproject.org/wiki/Changes/Harden_All_Packages#Detailed_Harden_Flags_Description dnl https://owasp.org/index.php/C-Based_Toolchain_Hardening#GCC.2FBinutils dnl AS_IF([test $enable_hardening -eq $OPTIONAL], [ AS_IF([test "$(env | grep -Ec '^(C|LD)FLAGS_HARDENED_(EXE|LIB)=.')" = 0], [enable_hardening=$REQUIRED], [AC_MSG_NOTICE([Hardening: using custom flags from environment])] ) ], [ unset CFLAGS_HARDENED_EXE unset CFLAGS_HARDENED_LIB unset LDFLAGS_HARDENED_EXE unset LDFLAGS_HARDENED_LIB ] ) AS_CASE([$enable_hardening], [$DISABLED], [AC_MSG_NOTICE([Hardening: explicitly disabled])], [$REQUIRED], [ CFLAGS_HARDENED_EXE= CFLAGS_HARDENED_LIB= LDFLAGS_HARDENED_EXE= LDFLAGS_HARDENED_LIB= relro=0 pie=0 bindnow=0 stackprot="none" # daemons incl. libs: partial RELRO flag="-Wl,-z,relro" CC_CHECK_LDFLAGS(["${flag}"], [ LDFLAGS_HARDENED_EXE="${LDFLAGS_HARDENED_EXE} ${flag}" LDFLAGS_HARDENED_LIB="${LDFLAGS_HARDENED_LIB} ${flag}" relro=1 ]) # daemons: PIE for both CFLAGS and LDFLAGS AS_IF([cc_supports_flag -fPIE], [ flag="-pie" CC_CHECK_LDFLAGS(["${flag}"], [ CFLAGS_HARDENED_EXE="${CFLAGS_HARDENED_EXE} -fPIE" LDFLAGS_HARDENED_EXE="${LDFLAGS_HARDENED_EXE} ${flag}" pie=1 ]) ] ) # daemons incl. 
libs: full RELRO if sensible + as-needed linking # so as to possibly mitigate startup performance # hit caused by excessive linking with unneeded # libraries AS_IF([test "${relro}" = 1 && test "${pie}" = 1], [ flag="-Wl,-z,now" CC_CHECK_LDFLAGS(["${flag}"], [ LDFLAGS_HARDENED_EXE="${LDFLAGS_HARDENED_EXE} ${flag}" LDFLAGS_HARDENED_LIB="${LDFLAGS_HARDENED_LIB} ${flag}" bindnow=1 ]) ] ) AS_IF([test "${bindnow}" = 1], [ flag="-Wl,--as-needed" CC_CHECK_LDFLAGS(["${flag}"], [ LDFLAGS_HARDENED_EXE="${LDFLAGS_HARDENED_EXE} ${flag}" LDFLAGS_HARDENED_LIB="${LDFLAGS_HARDENED_LIB} ${flag}" ]) ]) # universal: prefer strong > all > default stack protector if possible flag= AS_IF([cc_supports_flag -fstack-protector-strong], [ flag="-fstack-protector-strong" stackprot="strong" ], [cc_supports_flag -fstack-protector-all], [ flag="-fstack-protector-all" stackprot="all" ], [cc_supports_flag -fstack-protector], [ flag="-fstack-protector" stackprot="default" ] ) AS_IF([test -n "${flag}"], [CC_EXTRAS="${CC_EXTRAS} ${flag}"]) # universal: enable stack clash protection if possible AS_IF([cc_supports_flag -fstack-clash-protection], [ CC_EXTRAS="${CC_EXTRAS} -fstack-clash-protection" AS_IF([test "${stackprot}" = "none"], [stackprot="clash-only"], [stackprot="${stackprot}+clash"] ) ] ) # Log a summary AS_IF([test "${relro}" = 1 || test "${pie}" = 1 || test x"${stackprot}" != x"none"], [AC_MSG_NOTICE(m4_normalize([Hardening: relro=${relro} pie=${pie} bindnow=${bindnow} stackprot=${stackprot}])) ], [AC_MSG_WARN([Hardening: no suitable features in the toolchain detected])] ) ], ) CFLAGS="$SANITIZERS_CFLAGS $CFLAGS $CC_EXTRAS" LDFLAGS="$SANITIZERS_LDFLAGS $LDFLAGS" CFLAGS_HARDENED_EXE="$SANITIZERS_CFLAGS $CFLAGS_HARDENED_EXE" LDFLAGS_HARDENED_EXE="$SANITIZERS_LDFLAGS $LDFLAGS_HARDENED_EXE" NON_FATAL_CFLAGS="$CFLAGS" AC_SUBST(NON_FATAL_CFLAGS) dnl dnl We reset CFLAGS to include our warnings *after* all function dnl checking goes on, so that our warning flags don't keep the dnl AC_*FUNCS() calls 
above from working. In particular, -Werror will dnl *always* cause us troubles if we set it before here. dnl dnl AS_IF([test $enable_fatal_warnings -ne $DISABLED], [ AC_MSG_NOTICE([Enabling fatal compiler warnings]) CFLAGS="$CFLAGS $WERROR" ]) AC_SUBST(CFLAGS) dnl This is useful for use in Makefiles that need to remove one specific flag CFLAGS_COPY="$CFLAGS" AC_SUBST(CFLAGS_COPY) AC_SUBST(LIBADD_DL) dnl extra flags for dynamic linking libraries AC_SUBST(LOCALE) dnl Options for cleaning up the compiler output AS_IF([test $enable_quiet -ne $DISABLED], [ AC_MSG_NOTICE([Suppressing make details]) QUIET_LIBTOOL_OPTS="--silent" QUIET_MAKE_OPTS="-s" # POSIX compliant ], [ QUIET_LIBTOOL_OPTS="" QUIET_MAKE_OPTS="" ] ) dnl Put the above variables to use LIBTOOL="${LIBTOOL} --tag=CC \$(QUIET_LIBTOOL_OPTS)" MAKEFLAGS="${MAKEFLAGS} ${QUIET_MAKE_OPTS}" # Make features list available (sorted alphabetically, without leading space) PCMK_FEATURES=`echo "$PCMK_FEATURES" | sed -e 's/^ //' -e 's/ /\n/g' | sort | xargs` AC_DEFINE_UNQUOTED(CRM_FEATURES, "$PCMK_FEATURES", Set of enabled features) AC_SUBST(PCMK_FEATURES) AC_SUBST(CC) AC_SUBST(MAKEFLAGS) AC_SUBST(LIBTOOL) AC_SUBST(QUIET_LIBTOOL_OPTS) dnl Files we output that need to be executable CONFIG_FILES_EXEC([agents/ocf/ClusterMon], [agents/ocf/Dummy], [agents/ocf/HealthCPU], [agents/ocf/HealthIOWait], [agents/ocf/HealthSMART], [agents/ocf/Stateful], [agents/ocf/SysInfo], [agents/ocf/attribute], [agents/ocf/controld], [agents/ocf/ifspeed], [agents/ocf/ping], [agents/ocf/remote], [agents/stonith/fence_legacy], [agents/stonith/fence_watchdog], [cts/cluster_test], - [cts/cts], [cts/cts-attrd], [cts/cts-cli], [cts/cts-exec], [cts/cts-fencing], [cts/cts-lab], [cts/cts-regression], [cts/cts-scheduler], [cts/cts-schemas], [cts/benchmark/clubench], [cts/support/LSBDummy], [cts/support/cts-support], [cts/support/fence_dummy], [cts/support/pacemaker-cts-dummyd], [doc/abi-check], [maint/bumplibs], - [tools/cluster-clean], - 
[tools/cluster-helper], [tools/crm_failcount], [tools/crm_master], [tools/crm_report], [tools/crm_standby], [tools/pcmk_simtimes], [xml/rng-helper]) dnl Other files we output AC_CONFIG_FILES(Makefile \ agents/Makefile \ agents/alerts/Makefile \ agents/ocf/Makefile \ agents/stonith/Makefile \ cts/Makefile \ cts/benchmark/Makefile \ cts/scheduler/Makefile \ cts/scheduler/dot/Makefile \ cts/scheduler/exp/Makefile \ cts/scheduler/scores/Makefile \ cts/scheduler/stderr/Makefile \ cts/scheduler/summary/Makefile \ cts/scheduler/xml/Makefile \ cts/support/Makefile \ cts/support/pacemaker-cts-dummyd@.service \ daemons/Makefile \ daemons/attrd/Makefile \ daemons/based/Makefile \ daemons/controld/Makefile \ daemons/execd/Makefile \ daemons/execd/pacemaker_remote \ daemons/execd/pacemaker_remote.service \ daemons/fenced/Makefile \ daemons/pacemakerd/Makefile \ daemons/pacemakerd/pacemaker.service \ daemons/schedulerd/Makefile \ devel/Makefile \ doc/Doxyfile \ doc/Makefile \ doc/sphinx/Makefile \ etc/Makefile \ etc/init.d/pacemaker \ etc/logrotate.d/pacemaker \ etc/sysconfig/pacemaker \ include/Makefile \ include/crm/Makefile \ include/crm/cib/Makefile \ include/crm/common/Makefile \ include/crm/cluster/Makefile \ include/crm/fencing/Makefile \ include/crm/pengine/Makefile \ include/pcmki/Makefile \ lib/Makefile \ lib/cib/Makefile \ lib/cluster/Makefile \ lib/cluster/tests/Makefile \ lib/cluster/tests/cluster/Makefile \ lib/cluster/tests/cpg/Makefile \ lib/common/Makefile \ lib/common/tests/Makefile \ lib/common/tests/acl/Makefile \ lib/common/tests/actions/Makefile \ lib/common/tests/agents/Makefile \ lib/common/tests/cmdline/Makefile \ lib/common/tests/digest/Makefile \ lib/common/tests/flags/Makefile \ lib/common/tests/health/Makefile \ lib/common/tests/io/Makefile \ lib/common/tests/iso8601/Makefile \ lib/common/tests/lists/Makefile \ lib/common/tests/messages/Makefile \ lib/common/tests/nodes/Makefile \ lib/common/tests/nvpair/Makefile \ lib/common/tests/options/Makefile \ 
lib/common/tests/output/Makefile \ lib/common/tests/patchset/Makefile \ lib/common/tests/probes/Makefile \ lib/common/tests/procfs/Makefile \ lib/common/tests/resources/Makefile \ lib/common/tests/results/Makefile \ lib/common/tests/rules/Makefile \ lib/common/tests/scheduler/Makefile \ lib/common/tests/schemas/Makefile \ lib/common/tests/scores/Makefile \ lib/common/tests/strings/Makefile \ lib/common/tests/utils/Makefile \ lib/common/tests/xml/Makefile \ lib/common/tests/xml_comment/Makefile \ lib/common/tests/xml_element/Makefile \ lib/common/tests/xml_idref/Makefile \ lib/common/tests/xpath/Makefile \ lib/fencing/Makefile \ lib/libpacemaker.pc \ lib/lrmd/Makefile \ lib/pacemaker/Makefile \ lib/pacemaker/tests/Makefile \ lib/pacemaker/tests/pcmk_resource/Makefile \ lib/pacemaker/tests/pcmk_ticket/Makefile \ lib/pacemaker.pc \ lib/pacemaker-cib.pc \ lib/pacemaker-cluster.pc \ lib/pacemaker-fencing.pc \ lib/pacemaker-lrmd.pc \ lib/pacemaker-service.pc \ lib/pacemaker-pe_rules.pc \ lib/pacemaker-pe_status.pc \ lib/pengine/Makefile \ lib/pengine/tests/Makefile \ lib/pengine/tests/native/Makefile \ lib/pengine/tests/status/Makefile \ lib/pengine/tests/unpack/Makefile \ lib/pengine/tests/utils/Makefile \ lib/services/Makefile \ maint/Makefile \ po/Makefile.in \ python/Makefile \ python/setup.py \ python/pacemaker/Makefile \ python/pacemaker/_cts/Makefile \ python/pacemaker/_cts/tests/Makefile \ python/pacemaker/buildoptions.py \ python/tests/Makefile \ rpm/Makefile \ tests/Makefile \ tools/Makefile \ tools/crm_mon.service \ tools/report.collector \ tools/report.common \ xml/Makefile \ xml/pacemaker-schemas.pc \ ) dnl Now process the entire list of files added by previous dnl calls to AC_CONFIG_FILES() AC_OUTPUT() dnl ***************** dnl Configure summary dnl ***************** AC_MSG_NOTICE([]) AC_MSG_NOTICE([$PACKAGE configuration:]) AC_MSG_NOTICE([ Version = ${VERSION} (Build: $BUILD_VERSION)]) AC_MSG_NOTICE([ Features = ${PCMK_FEATURES}]) AC_MSG_NOTICE([]) 
AC_MSG_NOTICE([ Prefix = ${prefix}]) AC_MSG_NOTICE([ Executables = ${sbindir}]) AC_MSG_NOTICE([ Man pages = ${mandir}]) AC_MSG_NOTICE([ Libraries = ${libdir}]) AC_MSG_NOTICE([ Header files = ${includedir}]) AC_MSG_NOTICE([ Arch-independent files = ${datadir}]) AC_MSG_NOTICE([ State information = ${localstatedir}]) AC_MSG_NOTICE([ System configuration = ${sysconfdir}]) AC_MSG_NOTICE([ OCF agents = ${PCMK_OCF_ROOT}]) AM_COND_IF([BUILD_LSB], [AC_MSG_NOTICE([ LSB agents = ${INITDIR}])]) AC_MSG_NOTICE([]) AC_MSG_NOTICE([ HA group name = ${CRM_DAEMON_GROUP}]) AC_MSG_NOTICE([ HA user name = ${CRM_DAEMON_USER}]) AC_MSG_NOTICE([]) AC_MSG_NOTICE([ CFLAGS = ${CFLAGS}]) AC_MSG_NOTICE([ CFLAGS_HARDENED_EXE = ${CFLAGS_HARDENED_EXE}]) AC_MSG_NOTICE([ CFLAGS_HARDENED_LIB = ${CFLAGS_HARDENED_LIB}]) AC_MSG_NOTICE([ LDFLAGS_HARDENED_EXE = ${LDFLAGS_HARDENED_EXE}]) AC_MSG_NOTICE([ LDFLAGS_HARDENED_LIB = ${LDFLAGS_HARDENED_LIB}]) AC_MSG_NOTICE([ Libraries = ${LIBS}]) AC_MSG_NOTICE([ Stack Libraries = ${CLUSTERLIBS}]) AC_MSG_NOTICE([ Unix socket auth method = ${us_auth}]) diff --git a/cts/Makefile.am b/cts/Makefile.am index 3ad7d204fe..af61cc72fc 100644 --- a/cts/Makefile.am +++ b/cts/Makefile.am @@ -1,80 +1,77 @@ # # Copyright 2001-2025 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # include $(top_srcdir)/mk/common.mk include $(top_srcdir)/mk/python.mk # Test commands and globally applicable test files should be in $(testdir), # and command-specific test data should be in a command-specific subdirectory. 
testdir = $(datadir)/$(PACKAGE)/tests test_SCRIPTS = cts-attrd \ cts-cli \ cts-exec \ cts-fencing \ cts-lab \ cts-regression \ cts-scheduler \ cts-schemas dist_test_DATA = README.md \ valgrind-pcmk.suppressions clidir = $(testdir)/cli dist_cli_DATA = $(wildcard cli/*.xml cli/*.exp) -ctsdir = $(datadir)/$(PACKAGE)/tests/cts -cts_SCRIPTS = cts - schemasdir = $(testdir)/schemas dist_schemas_DATA = $(wildcard schemas/*/ref/*.ref*) \ $(wildcard schemas/*/ref.err/*.ref.err*) \ $(wildcard schemas/*/xml/*.xml) noinst_SCRIPTS = cluster_test .PHONY: scheduler-list scheduler-list: @for T in "$(srcdir)"/scheduler/xml/*.xml; do \ echo $$(basename $$T .xml); \ done CLEANFILES = $(builddir)/.regression.failed.diff .PHONY: clean-local clean-local: rm -f scheduler/*/*.pe rm -f schemas/*/*.up{,.err} SUBDIRS = benchmark \ scheduler \ support .PHONY: cts-support-install cts-support-install: $(MAKE) $(AM_MAKEFLAGS) -C support cts-support $(builddir)/support/cts-support install .PHONY: cts-support-uninstall cts-support-uninstall: $(MAKE) $(AM_MAKEFLAGS) -C support cts-support $(builddir)/support/cts-support uninstall # Everything listed here is a python script, typically generated from a .in file # (though that is not a requirement). We want to run pylint on all of these # things after they've been built. # FIXME: When cts-schemas is converted to python, this can be removed because # it will duplicate test_SCRIPTS above. python_files = cts-attrd \ cts-cli \ cts-exec \ cts-fencing \ cts-lab \ cts-regression \ cts-scheduler PYCHECKFILES ?= $(python_files) diff --git a/cts/README.md b/cts/README.md index 999131dc26..03dcf18e38 100644 --- a/cts/README.md +++ b/cts/README.md @@ -1,315 +1,304 @@ # Pacemaker Cluster Test Suite (CTS) The Cluster Test Suite (CTS) refers to all Pacemaker testing code that can be run in an installed environment. (Pacemaker also has unit tests that must be run from a source distribution.) 
CTS includes: * Regression tests: These test specific Pacemaker components individually (no integration tests). The primary front end is cts-regression in this directory. Run it with the --help option to see its usage. cts-regression is a wrapper for individual component regression tests also in this directory (cts-cli, cts-exec, cts-fencing, and cts-scheduler). The CLI and scheduler regression tests can also be run from a source distribution. The other regression tests can only run in an installed environment, and the cluster should not be running on the node running these tests. * The CTS lab: This is a cluster exerciser for intensively testing the behavior of an entire working cluster. It is primarily for developers and packagers of the Pacemaker source code, but it can be useful for users who wish to see how their cluster will react to various situations. Most of the lab code is in the Pacemaker Python module. The front end, cts-lab, is in this directory. The CTS lab runs a randomized series of predefined tests on the cluster. It can be run against a pre-existing cluster configuration or overwrite the existing configuration with a test configuration. * Helpers: Some of the component regression tests and the CTS lab require certain helpers to be installed as root. These include a dummy LSB init script, dummy systemd service, etc. In a source distribution, the source for these is in cts/support. The tests will install these as needed and uninstall them when done. This means that the cluster configuration created by the CTS lab will generate failures if started manually after the lab exits. However, the helper installer can be run manually to make the configuration usable, if you want to do your own further testing with it: /usr/libexec/pacemaker/cts-support install As you might expect, you can also remove the helpers with: /usr/libexec/pacemaker/cts-support uninstall (The actual directory location may vary depending on how Pacemaker was built.) 
* Cluster benchmark: The benchmark subdirectory of this directory contains some cluster test environment benchmarking code. It is not particularly useful for end users. * Valgrind suppressions: When memory-testing Pacemaker code with valgrind, various bugs in non-Pacemaker libraries and such can clutter the results. The valgrind-pcmk.suppressions file in this directory can be used with valgrind's --suppressions option to eliminate many of these. ## Using the CTS lab ### Requirements * Three or more machines (one test exerciser and at least two cluster nodes). * The test cluster nodes should be on the same subnet and have journalling filesystems (ext4, xfs, etc.) for all of their filesystems other than /boot. You also need a number of free IP addresses on that subnet if you intend to test IP address takeover. * The test exerciser machine doesn't need to be on the same subnet as the test cluster machines. Minimal demands are made on the exerciser; it just has to stay up during the tests. * Tracking problems is easier if all machines' clocks are closely synchronized. NTP does this automatically, but you can do it by hand if you want. * The account on the exerciser used to run the CTS lab (which does not need to be root) must be able to ssh as root to the cluster nodes without a password challenge. See the Mini-HOWTO at the end of this file for details about how to configure ssh for this. * The exerciser needs to be able to resolve all cluster node names, whether by DNS or /etc/hosts. * CTS is not guaranteed to run on all platforms that Pacemaker itself does. It calls commands such as service that may not be provided by all OSes. ### Preparation * Install Pacemaker, including the testing code, on all machines. The testing code must be the same version as the rest of Pacemaker, and the Pacemaker version must be the same on the exerciser and all cluster nodes. 
You can install from source, although many distributions package the testing code (named pacemaker-cts or similar). Typically, everything needed by the CTS lab is installed in /usr/share/pacemaker/tests/cts. * Configure the cluster layer (Corosync) on the cluster machines (*not* the exerciser), and verify it works. Node names used in the cluster configuration *must* match the hosts' names as returned by `uname -n`; they do not have to match the machines' fully qualified domain names. * Optionally, configure the exerciser as a log aggregator, using something like `rsyslog` log forwarding. If aggregation is detected, the exerciser will look for new messages locally instead of requesting them repeatedly from cluster nodes. * Currently, `/var/log/messages` on the exerciser is the only supported log destination. Further, if it's specified explicitly on the command line as the log file, then CTS lab will not check for aggregation. * CTS lab does not currently detect systemd journal log aggregation. * Optionally, if the lab nodes use the systemd journal for logs, create /etc/systemd/journald.conf.d/cts-lab.conf on each with `RateLimitIntervalSec=0` or `RateLimitBurst=0`, to avoid issues with log detection. ### Run The primary interface to the CTS lab is the cts-lab executable: /usr/share/pacemaker/tests/cts-lab [options] (The actual directory location may vary depending on how Pacemaker was built.) 
As part of the options, specify the cluster nodes with --nodes, for example: --nodes "pcmk-1 pcmk-2 pcmk-3" Most people will want to save the output to a file, for example: --outputfile ~/cts.log Unless you want to test a pre-existing cluster configuration, you also want (*warning*: with these options, any existing configuration will be lost): --clobber-cib --populate-resources You can test floating IP addresses (*not* already used by any host), one per cluster node, by specifying the first, for example: --test-ip-base 192.168.9.100 Configure some sort of fencing, for example to use fence\_xvm: - --stonith xvm + --fencing-agent fence_xvm Putting all the above together, a command line might look like: /usr/share/pacemaker/tests/cts-lab --nodes "pcmk-1 pcmk-2 pcmk-3" \ --outputfile ~/cts.log --clobber-cib --populate-resources \ - --test-ip-base 192.168.9.100 --stonith xvm 50 + --test-ip-base 192.168.9.100 --fencing-agent fence_xvm 50 For more options, run with the --help option. -There are also a couple of wrappers for cts-lab that some users may find more -convenient: cts, which is typically installed in the same place as the rest of -the testing code; and cluster\_test, which is in the source directory and -typically not installed. - -To extract the result of a particular test, run: - - crm_report -T $test +There is also a wrapper for cts-lab that some users may find more convenient: +cluster\_test, which is in the source directory and typically not installed. ### Optional: Memory testing Pacemaker has various options for testing memory management. On cluster nodes, Pacemaker components use various environment variables to control these options. How these variables are set varies by OS, but usually they are set in a file such as /etc/sysconfig/pacemaker or /etc/default/pacemaker. Valgrind is a program for detecting memory management problems such as use-after-free errors. 
If you have valgrind installed, you can enable it by setting the following environment variables on all cluster nodes: PCMK_valgrind_enabled=pacemaker-attrd,pacemaker-based,pacemaker-controld,pacemaker-execd,pacemaker-fenced,pacemaker-schedulerd VALGRIND_OPTS="--leak-check=full --trace-children=no --num-callers=25 --log-file=/var/lib/pacemaker/valgrind-%p --suppressions=/usr/share/pacemaker/tests/valgrind-pcmk.suppressions --gen-suppressions=all" -If running the CTS lab with valgrind enabled on the cluster nodes, add these -options to cts-lab: - - --valgrind-procs "pacemaker-attrd pacemaker-based pacemaker-controld pacemaker-execd pacemaker-schedulerd pacemaker-fenced" - These options should only be set while specifically testing memory management, because they may slow down the cluster significantly, and they will disable writes to the CIB. If desired, you can enable valgrind on a subset of pacemaker components rather than all of them as listed above. Valgrind will put a text file for each process in the location specified by valgrind's --log-file option. See https://www.valgrind.org/docs/manual/mc-manual.html for explanations of the messages valgrind generates. Separately, if you are using the GNU C library, the G\_SLICE, MALLOC\_PERTURB\_, and MALLOC\_CHECK\_ environment variables can be set to affect the library's memory management functions. When using valgrind, G\_SLICE should be set to "always-malloc", which helps valgrind track memory by always using the malloc() and free() routines directly. When not using valgrind, G\_SLICE can be left unset, or set to "debug-blocks", which enables the C library to catch many memory errors but may impact performance. If the MALLOC\_PERTURB\_ environment variable is set to an 8-bit integer, the C library will initialize all newly allocated bytes of memory to the integer value, and will set all newly freed bytes of memory to the bitwise inverse of the integer value. 
This helps catch uses of uninitialized or freed memory blocks that might otherwise go unnoticed. Example: MALLOC_PERTURB_=221 If the MALLOC\_CHECK\_ environment variable is set, the C library will check for certain heap corruption errors. The most useful value in testing is 3, which will cause the library to print a message to stderr and abort execution. Example: MALLOC_CHECK_=3 Valgrind should be enabled for either all nodes or none when used with the CTS lab, but the C library variables may be set differently on different nodes. ### Optional: Remote node testing If the pacemaker-remoted daemon is installed on all cluster nodes, the CTS lab will enable remote node tests. The remote node tests choose a random node, stop the cluster on it, start pacemaker-remoted on it, and add an ocf:pacemaker:remote resource to turn it into a remote node. When the test is done, the lab will turn the node back into a cluster node. To avoid conflicts, the lab will rename the node, prefixing the original node name with "remote-". For example, "pcmk-1" will become "remote-pcmk-1". These names do not need to be resolvable. The name change may require special fencing configuration, if the fence agent expects the node name to be the same as its hostname. A common approach is to specify the "remote-" names in pcmk\_host\_list. If you use pcmk\_host\_list=all, the lab will expand that to all cluster nodes and their "remote-" names. You may additionally need a pcmk\_host\_map argument to map the "remote-" names to the hostnames. Example: - --stonith xvm --stonith-args \ - pcmk_host_list=all,pcmk_host_map=remote-pcmk-1:pcmk-1;remote-pcmk-2:pcmk-2 + --fencing-agent fence_xvm --fencing-params \ + 'pcmk_host_list=all pcmk_host_map=remote-pcmk-1:pcmk-1;remote-pcmk-2:pcmk-2' ### Optional: Remote node testing with valgrind When running the remote node tests, the Pacemaker components on the *cluster* nodes can be run under valgrind as described in the "Memory testing" section. 
However, pacemaker-remoted cannot be run under valgrind that way, because it is started by the OS's regular boot system and not by Pacemaker. Details vary by system, but the goal is to set the VALGRIND\_OPTS environment variable and then start pacemaker-remoted by prefixing it with the path to valgrind. The init script and systemd service file provided with pacemaker-remoted will load the pacemaker environment variables from the same location used by other Pacemaker components, so VALGRIND\_OPTS will be set correctly if using one of those. For an OS using systemd, you can override the ExecStart parameter to run valgrind. For example: mkdir /etc/systemd/system/pacemaker_remote.service.d cat >/etc/systemd/system/pacemaker_remote.service.d/valgrind.conf <&2 } usage() { echo "usage: $0 " echo " dir: working directory (with the control file)" exit 0 } [ $# -eq 0 ] && usage WORKDIR=$1 test -d "$WORKDIR" || usage CTSCTRL=~/.cts CTRL=$WORKDIR/control CSV=$WORKDIR/bench.csv STATS=$WORKDIR/bench.stats test -f $CTRL && . $CTRL @datadir@/@PACKAGE@/tests/cts/cluster_test 500 || { msg "cluster_test failed" exit 1 } test -f $CTSCTRL || { msg no CTS control file $CTSCTRL exit 1 } . 
$CTSCTRL -: ${CTS_logfacility:=local7} -: ${CTS_stack:=corosync} : ${CTS_logfile:="@CRM_LOG_DIR@/ha-log-bench"} : ${CTS_adv:="--schema pacemaker-1.2 --clobber-cib -r"} : ${RUNS:=3} : ${CTSTESTS:="--benchmark"} : ${CTSDIR:="@datadir@/@PACKAGE@/tests/cts"} : ${CTS_node_list:=""} -: ${CTS_boot:=""} : ${CTS_stonith:=""} : ${CTS_stonith_args:=""} [ -n "$CTS_node_list" ] || { msg no node list specified exit 1 } -case "$CTS_stack" in -corosync) CRM_REPORT_OPTS="--corosync";; -*) msg "$CTS_stack: cluster stack not recognized"; exit 1;; -esac - -CTSOPTS="--stack $CTS_stack --at-boot $CTS_boot $CTS_adv" -CTSOPTS="$CTSOPTS --facility $CTS_logfacility --logfile $CTS_logfile" +CTSOPTS="$CTS_adv --logfile $CTS_logfile" if [ "x$CTS_stonith" != "x" ]; then - CTSOPTS="$CTSOPTS --stonith-type $CTS_stonith" + CTSOPTS="$CTSOPTS --fencing-agent $CTS_stonith" [ "x$CTS_stonith_args" != "x" ] && - CTSOPTS="$CTSOPTS --stonith-params \"$CTS_stonith_args\"" + CTSOPTS="$CTSOPTS --fencing-params \"$CTS_stonith_args\"" else - CTSOPTS="$CTSOPTS --stonith 0" + CTSOPTS="$CTSOPTS --disable-fencing" fi CTSOPTS="$CTSOPTS $CTSTESTS" fibonacci() { F_LIMIT=$1 F_N=2 F_N_PREV=1 while [ $F_N -le $F_LIMIT ]; do echo $F_N F_N_TMP=$F_N F_N=$((F_N+F_N_PREV)) F_N_PREV=$F_N_TMP done [ $F_N_PREV -ne $F_LIMIT ] && echo $F_LIMIT } [ "$SERIES" ] || SERIES=$(fibonacci "$(echo $CTS_node_list | wc -w)") get_nodes() { GN_C_NODES=$(echo $CTS_node_list | awk -v n="$1" ' { for( i=1; i<=NF; i++ ) node[cnt++]=$i } END{for( i=0; i "$RC_ODIR/ctsrun.out" 2>&1 & ctspid=$! tail -f "$RC_ODIR/ctsrun.out" & tailpid=$! 
wait $ctspid kill $tailpid >/dev/null 2>&1 } bench_re='CTS:.*runtime:' diginfo() { DI_CTS_DIR="$1" DI_S="$2" filter="$3" ( cd "$DI_CTS_DIR" || return for r in [0-9]*.tar.bz2; do tar xjf $r DI_D=$(basename "$r" .tar.bz2) for DI_V in $(grep "$bench_re" "$DI_D/ha-log.txt" | eval "$filter"); do DI_S="$DI_S,$DI_V" done rm -r "$DI_D" done echo $DI_S ) } printheader() { diginfo $1 "" "awk '{print \$(NF-2)}'" } printstats() { diginfo $1 "$clusize" "awk '{print \$(NF)}'" } printmedians() { PM_F="$1" PM_S="$clusize" PM_MIDDLE=$((RUNS/2 + 1)) set $(head -1 "$PM_F" | sed 's/,/ /g') PM_COLS=$# for PM_I in $(seq 2 $PM_COLS); do PM_V=$(awk -v i=$PM_I -F, '{print $i}' < $PM_F | sort -n | head -$PM_MIDDLE | tail -1) PM_S="$PM_S,$PM_V" done echo $PM_S } rm -f $CSV tmpf=`mktemp` test -f "$tmpf" || { msg "can't create temporary file" exit 1 } trap "rm -f $tmpf" 0 for clusize in $SERIES; do nodes=`get_nodes $clusize` outdir=$WORKDIR/$clusize rm -rf $outdir mkdir -p $outdir rm -f $tmpf node_cleanup for i in `seq $RUNS`; do true > $CTS_logfile mkdir -p $outdir/$i runcts $outdir/$i mkreports $outdir/$i printstats $outdir/$i >> $tmpf done [ -f "$CSV" ] || printheader $outdir/1 > $CSV printmedians $tmpf >> $CSV cat $tmpf >> $STATS msg "Statistics for $clusize-node cluster saved" done msg "Tests done for series $SERIES, output in $CSV and $STATS" # vim: set filetype=sh: diff --git a/cts/cluster_test.in b/cts/cluster_test.in index 9dcc64612a..324fc55183 100755 --- a/cts/cluster_test.in +++ b/cts/cluster_test.in @@ -1,177 +1,148 @@ #!@BASH_PATH@ # -# Copyright 2008-2020 the Pacemaker project contributors +# Copyright 2008-2025 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # if [ -e ~/.cts ]; then . 
~/.cts fi anyAsked=0 [ $# -lt 1 ] || CTS_numtests=$1 die() { echo "$@"; exit 1; } if [ -z "$CTS_asked_once" ]; then anyAsked=1 echo "This script should only be executed on the test exerciser." echo "The test exerciser will remotely execute the actions required by the" echo "tests and should not be part of the cluster itself." read -p "Is this host intended to be the test exerciser? (yN) " doUnderstand [ "$doUnderstand" = "y" ] \ || die "This script must be executed on the test exerciser" fi if [ -z "$CTS_node_list" ]; then anyAsked=1 read -p "Please list your cluster nodes (eg. node1 node2 node3): " CTS_node_list else echo "Beginning test of cluster: $CTS_node_list" fi -if [ -z "$CTS_stack" ]; then - anyAsked=1 - read -p "Which cluster stack are you using? ([corosync]): " CTS_stack - [ -n "$CTS_stack" ] || CTS_stack=corosync -else - echo "Using the $CTS_stack cluster stack" -fi - [ "${CTS_node_list}" = "${CTS_node_list/$HOSTNAME/}" ] \ || die "This script must be executed on the test exerciser, and the test exerciser cannot be part of the cluster" printf "+ Bootstrapping ssh... " if [ -z "$SSH_AUTH_SOCK" ]; then printf "\n + Initializing SSH " eval "$(ssh-agent)" echo " + Adding identities..." ssh-add rc=$? if [ $rc -ne 0 ]; then echo " -- No identities added" printf "\nThe ability to open key-based 'ssh' connections (as the user 'root') is required to use CTS.\n" read -p " - Do you want this program to help you create one? (yN) " auto_fix if [ "$auto_fix" = "y" ]; then ssh-keygen -t dsa ssh-add else die "Please run 'ssh-keygen -t dsa' to create a new key" fi fi else echo "OK" fi test_ok=1 printf "+ Testing ssh configuration... " for n in $CTS_node_list; do ssh -l root -o PasswordAuthentication=no -o ConnectTimeout=5 "$n" /bin/true rc=$? 
if [ $rc -ne 0 ]; then echo " - connection to $n failed" test_ok=0 fi done if [ $test_ok -eq 0 ]; then printf "\nThe ability to open key-based 'ssh' connections (as the user 'root') is required to use CTS.\n" read -p " - Do you want this program to help you with such a setup? (yN) " auto_fix if [ "$auto_fix" = "y" ]; then # XXX are we picking the most suitable identity? privKey=$(ssh-add -L | head -n1 | cut -d" " -f3) sshCopyIdOpts="-o User=root" [ -z "$privKey" ] || sshCopyIdOpts+=" -i \"${privKey}.pub\"" for n in $CTS_node_list; do eval "ssh-copy-id $sshCopyIdOpts \"${n}\"" \ || die "Attempt to 'ssh-copy-id $sshCopyIdOpts \"$n\"' failed" done else die "Please install one of your SSH public keys to root's account on all cluster nodes" fi fi echo "OK" if [ -z "$CTS_logfile" ]; then anyAsked=1 read -p " + Where does/should syslog store logs from remote hosts? (/var/log/messages) " CTS_logfile [ -n "$CTS_logfile" ] || CTS_logfile=/var/log/messages fi [ -e "$CTS_logfile" ] || die "$CTS_logfile doesn't exist" -if [ -z "$CTS_logfacility" ]; then - anyAsked=1 - read -p " + Which log facility does the cluster use? (daemon) " CTS_logfacility - [ -n "$CTS_logfacility" ] || CTS_logfacility=daemon -fi - -if [ -z "$CTS_boot" ]; then - read -p "+ Is the cluster software started automatically when a node boots? [yN] " CTS_boot - if [ -z "$CTS_boot" ]; then - CTS_boot=0 - else - case $CTS_boot in - 1|y|Y) CTS_boot=1;; - *) CTS_boot=0;; - esac - fi -fi - if [ -z "$CTS_numtests" ]; then read -p "+ How many test iterations should be performed? (500) " CTS_numtests [ -n "$CTS_numtests" ] || CTS_numtests=500 fi if [ -z "$CTS_asked_once" ]; then anyAsked=1 read -p "+ What type of STONITH agent do you use? (none) " CTS_stonith [ -z "$CTS_stonith" ] \ || read -p "+ List any STONITH agent parameters (eq. device_host=switch.power.com): " CTS_stonith_args [ -n "$CTS_adv" ] \ || read -p "+ (Advanced) Any extra CTS parameters? 
(none) " CTS_adv fi [ $anyAsked -eq 0 ] \ || read -p "+ Save values to ~/.cts for next time? (yN) " doSave if [ "$doSave" = "y" ]; then cat > ~/.cts <<-EOF # CTS Test data - CTS_stack="$CTS_stack" CTS_node_list="$CTS_node_list" CTS_logfile="$CTS_logfile" CTS_logport="$CTS_logport" - CTS_logfacility="$CTS_logfacility" CTS_asked_once=1 CTS_adv="$CTS_adv" CTS_stonith="$CTS_stonith" CTS_stonith_args="$CTS_stonith_args" - CTS_boot="$CTS_boot" EOF fi cts_extra="" if [ -n "$CTS_stonith" ]; then - cts_extra="$cts_extra --stonith-type $CTS_stonith" + cts_extra="$cts_extra --fencing-agent $CTS_stonith" [ -z "$CTS_stonith_args" ] \ - || cts_extra="$cts_extra --stonith-params \"$CTS_stonith_args\"" + || cts_extra="$cts_extra --fencing-params \"$CTS_stonith_args\"" else - cts_extra="$cts_extra --stonith 0" + cts_extra="$cts_extra --disable-fencing" echo " - Testing a cluster without STONITH is like a blunt pencil... pointless" fi printf "\nAll set to go for %d iterations!\n" "$CTS_numtests" [ $anyAsked -ne 0 ] \ || echo "+ To use a different configuration, remove ~/.cts and re-run cts (or edit it manually)." 
echo Now paste the following command into this shell: -echo "@PYTHON@ `dirname "$0"`/cts-lab -L \"$CTS_logfile\" --syslog-facility \"$CTS_logfacility\" --no-unsafe-tests --stack \"$CTS_stack\" $CTS_adv --at-boot \"$CTS_boot\" $cts_extra \"$CTS_numtests\" --nodes \"$CTS_node_list\"" +echo "@PYTHON@ `dirname "$0"`/cts-lab -L \"$CTS_logfile\" --no-unsafe-tests $CTS_adv $cts_extra \"$CTS_numtests\" --nodes \"$CTS_node_list\"" # vim: set filetype=sh: diff --git a/cts/cts-lab.in b/cts/cts-lab.in index a909ebad01..bd161282b6 100644 --- a/cts/cts-lab.in +++ b/cts/cts-lab.in @@ -1,136 +1,129 @@ #!@PYTHON@ """Command-line interface to Pacemaker's Cluster Test Suite (CTS).""" # pylint doesn't like the module name "cts-lab" which is an invalid complaint for this file # This also disables various other invalid names - it thinks scenario and match are constants # that should have all caps names, and that cm and n are too short. # pylint: disable=invalid-name __copyright__ = "Copyright 2001-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import signal import sys from pacemaker._cts.CTS import CtsLab from pacemaker._cts.cmcorosync import Corosync2 from pacemaker._cts.audits import audit_list from pacemaker._cts.logging import LogFactory from pacemaker._cts.scenarios import AllOnce, Boot, BootCluster, LeaveBooted, RandomTests, Sequence from pacemaker._cts.tests import test_list # These are globals so they can be used by the signal handler. 
scenario = None LogFactory().add_stderr() def sig_handler(signum, _frame): """Handle the given signal number.""" LogFactory().log(f"Interrupted by signal {signum}") if scenario: scenario.summarize() if signum == 15: if scenario: scenario.teardown() sys.exit(1) def plural_s(n): """Return a string suffix depending on whether or not n is > 1.""" if n == 1: return "" return "S" if __name__ == '__main__': environment = CtsLab(sys.argv[1:]) iters = environment["iterations"] tests = [] # Set the signal handler signal.signal(15, sig_handler) signal.signal(10, sig_handler) - # Create the Cluster Manager object - cm = None - - if environment["Stack"] == "corosync 2+": - cm = Corosync2() - else: - LogFactory().log(f"Unknown stack: {environment['stack']}") - sys.exit(1) + # Create the Cluster Manager object. + # Currently Corosync2 is the only available cluster manager. + cm = Corosync2() if environment["TruncateLog"]: if environment["OutputFile"] is None: LogFactory().log("Ignoring truncate request because no output file specified") else: LogFactory().log(f"Truncating {environment['OutputFile']}") with open(environment["OutputFile"], "w", encoding="utf-8") as outputfile: outputfile.truncate(0) audits = audit_list(cm) if environment["ListTests"]: tests = test_list(cm, audits) LogFactory().log(f"Total {len(tests)} tests") for test in tests: LogFactory().log(test.name) sys.exit(0) elif len(environment["tests"]) == 0: tests = test_list(cm, audits) else: chosen = environment["tests"] for test_case in chosen: match = None for test in test_list(cm, audits): if test.name == test_case: match = test if not match: LogFactory().log("--choose: No applicable/valid tests chosen") sys.exit(1) else: tests.append(match) # Scenario selection if environment["scenario"] == "all-once": iters = len(tests) scenario = AllOnce(cm, [BootCluster(cm, environment)], audits, tests) elif environment["scenario"] == "sequence": scenario = Sequence(cm, [BootCluster(cm, environment)], audits, tests) elif 
environment["scenario"] == "boot": scenario = Boot(cm, [LeaveBooted(cm, environment)], audits, []) else: scenario = RandomTests(cm, [BootCluster(cm, environment)], audits, tests) LogFactory().log(f">>>>>>>>>>>>>>>> BEGINNING {iters!r} TEST{plural_s(iters)}") - LogFactory().log(f"Stack: {environment['Stack']} ({environment['Name']})") LogFactory().log(f"Schema: {environment['Schema']}") LogFactory().log(f"Scenario: {scenario.__doc__}") LogFactory().log(f"CTS Exerciser: {environment['cts-exerciser']}") LogFactory().log(f"CTS Logfile: {environment['OutputFile']}") - LogFactory().log(f"Random Seed: {environment['RandSeed']}") if "syslogd" in environment: - LogFactory().log(f"Syslog variant: {environment['syslogd'].strip()}") + LogFactory().log(f"Syslog variant: {environment['syslogd']}") LogFactory().log(f"System log files: {environment['LogFileName']}") if "IPBase" in environment: LogFactory().log(f"Base IP for resources: {environment['IPBase']}") LogFactory().log(f"Cluster starts at boot: {environment['at-boot']}") environment.dump() rc = environment.run(scenario, iters) sys.exit(rc) # vim: set filetype=python: diff --git a/cts/cts.in b/cts/cts.in deleted file mode 100755 index 20dcb1554f..0000000000 --- a/cts/cts.in +++ /dev/null @@ -1,406 +0,0 @@ -#!@BASH_PATH@ -# -# Copyright 2012-2023 the Pacemaker project contributors -# -# The version control history for this file may have further details. -# -# This source code is licensed under the GNU General Public License version 2 -# or later (GPLv2+) WITHOUT ANY WARRANTY. -# - -# e.g. 
/etc/sysconfig or /etc/default -CONFIG_DIR=@CONFIGDIR@ - -cts_root=`dirname $0` - -logfile=0 -summary=0 -verbose=0 -watch=0 -saved=0 -tests="" - -install=0 -clean=0 -kill=0 -run=0 -boot=0 -target=rhel-7 -cmd="" -trace="" - -custom_log="" -patterns="-e CTS:" - -function sed_in_place_remotely() { - cluster-helper -g $cluster_name -- cp -p "\"$1\"" "\"$1.sed\"" \&\& sed -e "\"$2\"" "\"$1\"" \> "\"$1.sed\"" \&\& mv "\"$1.sed\"" "\"$1\"" -} - - -helpmsg=$(cat </dev/null -if [ $? != 0 ]; then - echo $0 needs the cluster-helper script to be in your path - exit 1 -fi - -which cluster-clean &>/dev/null -if [ $? != 0 ]; then - echo $0 needs the cluster-clean script to be in your path - exit 1 -fi - -if [ "x$cluster_name" = x ] || [ "x$cluster_name" = xpick ]; then - clusters=`ls -1 ~/.dsh/group/[a-z]+[0-9] | sed s/.*group.// | tr '\n' ' ' ` - - echo "custom) interactively define a cluster" - for i in $clusters; do - echo "$i) `cluster-helper --list short -g $i`" - done - - read -p "Choose a cluster [custom]: " cluster_name - echo -fi - -if [ -z $cluster_name ]; then - cluster_name=custom -fi - - -case $cluster_name in - custom) - read -p "Cluster name: " cluster_name - read -p "Cluster hosts: " cluster_hosts - read -p "Cluster log file: " cluster_log - cluster-helper add -g "$cluster_name" -w "$cluster_hosts" - ;; - *) - cluster_hosts=`cluster-helper --list short -g $cluster_name` - cluster_log=~/cluster-$cluster_name.log - ;; -esac - -# NOTES ABOUT THESE AWESOME REGULAR EXPRESSIONS: -# -# * We can't assume GNU sed. Unfortunately, + and * are GNU extensions. Thus, -# we have to use {1,} for + and {0,} for *. -# * You don't need to add an extra set of escaped quotes around the sed expression -# arguments here - sed_in_place_remotely will do that for you. -# * Only literal quotes need the triple backslashes. All other special characters -# are fine with just a single one. -# * sed needs a LOT of characters escaped - \, {, }, (, ), and | at least. 
- -if [ x$cmd != x ]; then - config="${CONFIG_DIR}/pacemaker" - case $cmd in - trace-ls|tls) - cluster-helper -g $cluster_name -- grep "^[[:space:]]*PCMK_trace_functions" $config - ;; - trace-add|tadd) - echo "Adding $trace to PCMK_trace_functions" - # Note that this only works if there's already a PCMK_trace_functions line. - # If there isn't one, create it with trace-set first. - # - # Match optional whitespace; then PCMK_trace_functions; then an equals - # surrounded by optional whitespace; then an optional quote; then whatever - # else (presumably, this is the list of previously traced functions with - # an optional trailing quote). Replace the entire line with - # PCMK_trace_functions=, - sed_in_place_remotely "$config" "s/^[ \t]\{0,\}PCMK_trace_functions[ \t]\{0,\}=[ \t]\{0,\}\(\\\"\{0,1\}\)\(.\{1,\}\)/PCMK_trace_functions=\1$trace,\2/" - ;; - trace-rm|trm) - echo "Removing $trace from PCMK_trace_functions" - # A bunch of simple regexes are easier to follow than one giant one. - # Look for $trace in the following places on any line containing - # PCMK_trace_functions near the beginning: - # - # (1) At the start of a list - - # Match one of a leading quote, or an equals followed by optional - # whitespace; then $trace; then a comma. Replace $trace with whatever - # came before it. - # (2) In the middle of a list - - # Match a comma; then $trace; then a comma. Replace $trace with a - # single comma. - # (3) At the end of a list - - # Match a comma; then $trace; then one of a quote, whitespace, or - # the EOL. Replace $trace with whatever came after it. - # (4) All by itself - - # Match one of a leading quote, whitespace, or equals followed by - # optional whitespace; then $trace; then one of a trailing quote, - # whitespace, or the EOL. Replace $trace with whatever came before - # and after it. 
- sed_in_place_remotely "$config" "/^[ \t]\{0,\}PCMK_trace_functions/ { \ - s/\(\\\"\|=\|[ \t]\{1,\}\)$trace,/\1/ ; \ - s/,$trace,/,/ ; \ - s/,$trace\(\\\"\|[ \t]\{1,\}\|$\)/\1/ ; \ - s/\(\\\"\|[ \t]\{1,\}\|=[ \t]\{0,\}\)$trace\(\\\"\|[ \t]\{1,\}\|$\)/\1\2/ }" - ;; - trace-set|tset) - echo "Setting PCMK_trace_functions to '$trace'" - # Do this in two separate sed commands: - # - # (1) Unconditionally remove any existing PCMK_trace_functions= lines. - # (2) Add a new line with $trace after the example line, which therefore - # must exist. Note that GNU sed would support "a PCMK_trace_functions=$trace", - # but that's an extension. For all other seds, we have to put the - # command and the text on separate lines. - sed_in_place_remotely "$config" "/^[ \t]*PCMK_trace_functions/ d ; /^# Example: PCMK_trace_functions/ a\\\ -PCMK_trace_functions=\\\"$trace\\\"" - ;; - esac - exit 0 -fi - -if [ $run = 1 ]; then - install=1 - clean=1 -fi - -if [ $clean = 1 ]; then - rm -f $cluster_log - cluster-clean -g $cluster_name --kill -elif [ $kill = 1 ]; then - cluster-clean -g $cluster_name --kill-only - exit 0 -fi - -if [ $install = 1 ]; then - cluster-helper -g $cluster_name -- yum install -y pacemaker pacemaker-debuginfo pacemaker-cts libqb libqb-debuginfo -fi - -if [ $boot = 1 ]; then - $cts_root/cts-lab -r -c -g $cluster_name --boot - rc=$? - if [ $rc = 0 ]; then - echo "The cluster is ready..." - fi - exit $rc - -elif [ $run = 1 ]; then - $cts_root/cts-lab -r -c -g $cluster_name 500 "$@" - exit $? - -elif [ $clean = 1 ]; then - exit 0 -fi - -screen -ls | grep cts-$cluster_name &>/dev/null -active=$? - -if [ ! 
-z $custom_log ]; then - cluster_log=$custom_log -fi - -if [ "x$tests" != x ] && [ "x$tests" != "x " ]; then - for t in $tests; do - echo "crm_report --cts-log $cluster_log -d -T $t" - crm_report --cts-log $cluster_log -d -T $t - done - -elif [ $logfile = 1 ]; then - echo $cluster_log - -elif [ $summary = 1 ]; then - files=$cluster_log - if [ $saved = 1 ]; then - files=`ls -1tr ~/CTS-*/cluster-log.txt` - fi - for f in $files; do - echo $f - case $verbose in - 0) - cat -n $f | grep $patterns | grep -v "CTS: debug:" - ;; - 1) - cat -n $f | grep $patterns | grep -v "CTS:.* cmd:" - ;; - *) - cat -n $f | grep $patterns - ;; - esac - echo "" - done - -elif [ $watch = 1 ]; then - case $verbose in - 0) - tail -F $cluster_log | grep $patterns | grep -v "CTS: debug:" - ;; - 1) - tail -F $cluster_log | grep $patterns | grep -v "CTS:.* cmd:" - ;; - *) - tail -F $cluster_log | grep $patterns - ;; - esac - -elif [ $active = 0 ]; then - screen -x cts-$cluster_name - -else - touch $cluster_log - export cluster_name cluster_hosts cluster_log - screen -S cts-$cluster_name bash -fi - -# vim: set filetype=sh: diff --git a/cts/support/cts-support.in b/cts/support/cts-support.in index 60cc300621..5ac8d6002a 100644 --- a/cts/support/cts-support.in +++ b/cts/support/cts-support.in @@ -1,241 +1,241 @@ #!@PYTHON@ """Manage support files for Pacemaker CTS.""" # pylint doesn't like the module name "cts-attrd" which is an invalid complaint for this file # but probably something we want to continue warning about elsewhere # pylint: disable=invalid-name # pacemaker imports need to come after we modify sys.path, which pylint will complain about. # pylint: disable=wrong-import-position # We access various private members several places in this file, so disable this warning # file-wide. 
# pylint: disable=protected-access __copyright__ = "Copyright 2024 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import argparse import fcntl import os import shutil import subprocess import sys # These imports allow running from a source checkout after running `make`. # Note that while this doesn't necessarily mean it will successfully run tests, # but being able to see --help output can be useful. if os.path.exists("@abs_top_srcdir@/python"): sys.path.insert(0, "@abs_top_srcdir@/python") # pylint: disable=comparison-of-constants,comparison-with-itself,condition-evals-to-constant if os.path.exists("@abs_top_builddir@/python") and "@abs_top_builddir@" != "@abs_top_srcdir@": sys.path.insert(0, "@abs_top_builddir@/python") from pacemaker.buildoptions import BuildOptions from pacemaker.exitstatus import ExitStatus COROSYNC_RUNTIME_CONF = "cts.conf" COROSYNC_RUNTIME_UNIT = "corosync.service.d" DUMMY_DAEMON = "pacemaker-cts-dummyd" DUMMY_DAEMON_UNIT = "pacemaker-cts-dummyd@.service" FENCE_DUMMY = "fence_dummy" FENCE_DUMMY_ALIASES = ["auto_unfence", "no_reboot", "no_on", "no_nodeid"] LSB_DUMMY = "LSBDummy" def daemon_reload(): """Reload the systemd daemon.""" try: subprocess.call(["systemctl", "daemon-reload"]) except subprocess.SubprocessError: pass def install(src, destdir, mode=0o755): """Install a file to a given directory with the given mode.""" destfile = "%s/%s" % (destdir, os.path.basename(src)) shutil.copyfile(src, destfile) os.chmod(destfile, mode) def makedirs_if_missing(path): """If the directory path doesn't exist, create it.""" if os.path.exists(path): return os.makedirs(path) def cmd_install(src): """Install support files needed by Pacemaker CTS.""" cmd_uninstall() if not os.path.exists(src): sys.exit(ExitStatus.ERROR) os.chdir(src) if os.path.exists(BuildOptions.UNIT_DIR): print("Installing %s ..." 
% DUMMY_DAEMON) d = "%s/pacemaker" % BuildOptions.LIBEXEC_DIR makedirs_if_missing(d) install(DUMMY_DAEMON, d) print("Installing %s ..." % DUMMY_DAEMON_UNIT) install(DUMMY_DAEMON_UNIT, BuildOptions.UNIT_DIR) daemon_reload() runtime_unit_dir = "%s/systemd/system" % BuildOptions.RUNTIME_STATE_DIR if os.path.exists(runtime_unit_dir): unit_dir = "%s/%s" % (runtime_unit_dir, COROSYNC_RUNTIME_UNIT) print("Installing %s to %s ..." % (COROSYNC_RUNTIME_CONF, unit_dir)) makedirs_if_missing(unit_dir) install(COROSYNC_RUNTIME_CONF, unit_dir, 0o644) daemon_reload() print("Installing %s to %s ..." % (FENCE_DUMMY, BuildOptions._FENCE_BINDIR)) makedirs_if_missing(BuildOptions._FENCE_BINDIR) install(FENCE_DUMMY, BuildOptions._FENCE_BINDIR) for alias in FENCE_DUMMY_ALIASES: print("Installing fence_dummy_%s to %s ..." % (alias, BuildOptions._FENCE_BINDIR)) try: os.symlink(FENCE_DUMMY, "%s/fence_dummy_%s" % (BuildOptions._FENCE_BINDIR, alias)) except OSError: sys.exit(ExitStatus.ERROR) if BuildOptions.INIT_DIR is not None: print("Installing %s to %s ..." % (LSB_DUMMY, BuildOptions.INIT_DIR)) makedirs_if_missing(BuildOptions.INIT_DIR) install(LSB_DUMMY, BuildOptions.INIT_DIR) def cmd_uninstall(): """Remove support files needed by Pacemaker CTS.""" dummy_unit_file = "%s/%s" % (BuildOptions.UNIT_DIR, DUMMY_DAEMON_UNIT) if os.path.exists(dummy_unit_file): print("Removing %s ..." % dummy_unit_file) os.remove(dummy_unit_file) daemon_reload() corosync_runtime_dir = "%s/systemd/system/%s" % (BuildOptions.RUNTIME_STATE_DIR, COROSYNC_RUNTIME_UNIT) if os.path.exists(corosync_runtime_dir): print("Removing %s ..." % corosync_runtime_dir) shutil.rmtree(corosync_runtime_dir) daemon_reload() for f in ["%s/pacemaker/%s" % (BuildOptions.LIBEXEC_DIR, DUMMY_DAEMON), "%s/%s" % (BuildOptions._FENCE_BINDIR, FENCE_DUMMY), "%s/%s" % (BuildOptions.INIT_DIR, LSB_DUMMY)]: if not os.path.exists(f): continue print("Removing %s ..." 
% f) os.remove(f) for alias in FENCE_DUMMY_ALIASES: f = "%s/fence_dummy_%s" % (BuildOptions._FENCE_BINDIR, alias) if not os.path.exists(f) and not os.path.islink(f): continue print("Removing %s ..." % f) os.remove(f) def cmd_watch(filename, limit, offset, prefix): """Watch a log file.""" if not os.access(filename, os.R_OK): print("%sLast read: %d, limit=%d, count=%d - unreadable" % (prefix, 0, limit, 0)) sys.exit(ExitStatus.ERROR) with open(filename, "r", encoding="utf-8") as logfile: logfile.seek(0, os.SEEK_END) newsize = logfile.tell() if offset != 'EOF': offset = int(offset) if newsize >= offset: logfile.seek(offset) else: print("%sFile truncated from %d to %d" % (prefix, offset, newsize)) if (newsize * 1.05) < offset: logfile.seek(0) # Don't block when we reach EOF fcntl.fcntl(logfile.fileno(), fcntl.F_SETFL, os.O_NONBLOCK) count = 0 while True: if logfile.tell() >= newsize: break if limit and count >= limit: break line = logfile.readline() if not line: break print(line.strip()) count += 1 print("%sLast read: %d, limit=%d, count=%d" % (prefix, logfile.tell(), limit, count)) def build_options(): """Handle command line arguments.""" # Create the top-level parser parser = argparse.ArgumentParser(description="Support tool for CTS") subparsers = parser.add_subparsers(dest="subparser_name") # Create the parser for the "install" command subparsers.add_parser("install", help="Install support files") # Create the parser for the "uninstall" command subparsers.add_parser("uninstall", help="Remove support files") # Create the parser for the "watch" command watch_parser = subparsers.add_parser("watch", help="Remote log watcher") watch_parser.add_argument("-f", "--filename", default="/var/log/messages", help="File to watch") watch_parser.add_argument("-l", "--limit", type=int, default=0, help="Maximum number of lines to read") watch_parser.add_argument("-o", "--offset", default=0, help="Which line number to start reading from") watch_parser.add_argument("-p", "--prefix", 
default="", help="String to add to the beginning of each line") args = parser.parse_args() return args if __name__ == "__main__": opts = build_options() if os.geteuid() != 0: print("This command must be run as root") sys.exit(ExitStatus.ERROR) # If the install directory doesn't exist, assume we're in a build directory. - data_dir = "%s/pacemaker/tests/cts" % BuildOptions.DATA_DIR + data_dir = f"{BuildOptions.DATA_DIR}/pacemaker/tests/cts" if not os.path.exists(data_dir): - data_dir = "%s/pacemaker/tests/cts" % BuildOptions._BUILD_DIR + data_dir = f"{BuildOptions._BUILD_DIR}/cts/support" if opts.subparser_name == "install": cmd_install(data_dir) if opts.subparser_name == "uninstall": cmd_uninstall() if opts.subparser_name == "watch": cmd_watch(opts.filename, opts.limit, opts.offset, opts.prefix) # vim: set filetype=python: diff --git a/python/pacemaker/_cts/CTS.py b/python/pacemaker/_cts/CTS.py index d09314a3e1..192618e60d 100644 --- a/python/pacemaker/_cts/CTS.py +++ b/python/pacemaker/_cts/CTS.py @@ -1,230 +1,243 @@ """Main classes for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["CtsLab", "NodeStatus", "Process"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import sys import time import traceback from pacemaker.exitstatus import ExitStatus from pacemaker._cts.environment import EnvFactory from pacemaker._cts.input import should_continue from pacemaker._cts.logging import LogFactory from pacemaker._cts.remote import RemoteFactory class CtsLab: """ A class that defines the Lab Environment for the Cluster Test System. It defines those things which are expected to change from test environment to test environment for the same cluster manager. This is where you define the set of nodes that are in your test lab, what kind of reset mechanism you use, etc. 
All this data is stored as key/value pairs in an Environment instance constructed from arguments passed to this class. The CTS code ignores names it doesn't know about or need. Individual tests have access to this information, and it is perfectly acceptable to provide hints, tweaks, fine-tuning directions, or other information to the tests through this mechanism. """ def __init__(self, args=None): """ Create a new CtsLab instance. This class can be treated kind of like a dictionary due to the presence of typical dict functions like __contains__, __getitem__, and __setitem__. However, it is not a dictionary so do not rely on standard dictionary behavior. Arguments: args -- A list of command line parameters, minus the program name. """ self._env = EnvFactory().getInstance(args) self._logger = LogFactory() def dump(self): """Print the current environment.""" self._env.dump() def __contains__(self, key): """Return True if the given environment key exists.""" # pylint gets confused because of EnvFactory here. # pylint: disable=unsupported-membership-test return key in self._env def __getitem__(self, key): """Return the given environment key, or raise KeyError if it does not exist.""" # Throughout this file, pylint has trouble understanding that EnvFactory # and RemoteFactory are singleton instances that can be treated as callable # and subscriptable objects. Various warnings are disabled because of this. # See also a comment about self._rsh in environment.py. # pylint: disable=unsubscriptable-object return self._env[key] def __setitem__(self, key, value): """Set the given environment key to the given value, overriding any previous value.""" # pylint: disable=unsupported-assignment-operation self._env[key] = value def run(self, scenario, iterations): """ Run the given scenario the given number of times. Returns ExitStatus.OK on success, or ExitStatus.ERROR on error. 
""" if not scenario: self._logger.log("No scenario was defined") return ExitStatus.ERROR self._logger.log("Cluster nodes: ") # pylint: disable=unsubscriptable-object for node in self._env["nodes"]: self._logger.log(f" * {node}") if not scenario.setup(): return ExitStatus.ERROR # We want to alert on any exceptions caused by running a scenario, so # here it's okay to disable the pylint warning. # pylint: disable=bare-except try: scenario.run(iterations) except: # noqa: E722 self._logger.log(f"Exception by {sys.exc_info()[0]}") self._logger.traceback(traceback) scenario.summarize() scenario.teardown() return ExitStatus.ERROR scenario.teardown() scenario.summarize() if scenario.stats["failure"] > 0: return ExitStatus.ERROR if scenario.stats["success"] != iterations: self._logger.log("No failure count but success != requested iterations") return ExitStatus.ERROR return ExitStatus.OK class NodeStatus: """ A class for querying the status of cluster nodes. Are nodes up? Do they respond to SSH connections? """ def __init__(self, env): """ Create a new NodeStatus instance. Arguments: env -- An Environment instance """ self._env = env def _node_booted(self, node): """Return True if the given node is booted (responds to pings).""" # pylint: disable=not-callable (rc, _) = RemoteFactory().getInstance()("localhost", f"ping -nq -c1 -w1 {node}", verbose=0) return rc == 0 def _sshd_up(self, node): """Return true if sshd responds on the given node.""" # pylint: disable=not-callable (rc, _) = RemoteFactory().getInstance()(node, "true", verbose=0) return rc == 0 def wait_for_node(self, node, timeout=300): """ Wait for a node to become available. Should the timeout be reached, the user will be given a choice whether to continue or not. If not, ValueError will be raised. Returns True when the node is available, or False if the timeout is reached. 
""" initial_timeout = timeout anytimeouts = False while timeout > 0: if self._node_booted(node) and self._sshd_up(node): if anytimeouts: # Fudge to wait for the system to finish coming up time.sleep(30) LogFactory().debug(f"Node {node} now up") return True time.sleep(30) if not anytimeouts: LogFactory().debug(f"Waiting for node {node} to come up") anytimeouts = True timeout -= 1 LogFactory().log(f"{node} did not come up within {initial_timeout} tries") if not should_continue(self._env["continue"]): raise ValueError(f"{node} did not come up within {initial_timeout} tries") return False def wait_for_all_nodes(self, nodes, timeout=300): """Return True when all nodes come up, or False if the timeout is reached.""" for node in nodes: if not self.wait_for_node(node, timeout): return False return True class Process: """A class for managing a Pacemaker daemon.""" # pylint: disable=invalid-name - def __init__(self, cm, name, dc_only=False, pats=None, dc_pats=None, - badnews_ignore=None): + def __init__(self, cm, name, pats=None, badnews_ignore=None): """ Create a new Process instance. Arguments: cm -- A ClusterManager instance name -- The command being run - dc_only -- Should this daemon be killed only on the DC? 
pats -- Regexes we expect to find in log files - dc_pats -- Additional DC-specific regexes we expect to find - in log files badnews_ignore -- Regexes for lines in the log that can be ignored """ self._cm = cm self.badnews_ignore = badnews_ignore - self.dc_only = dc_only - self.dc_pats = dc_pats self.name = name self.pats = pats if self.badnews_ignore is None: self.badnews_ignore = [] - if self.dc_pats is None: - self.dc_pats = [] - if self.pats is None: self.pats = [] - def kill(self, node): - """Kill the instance of this process running on the given node.""" - (rc, _) = self._cm.rsh(node, f"killall -9 {self.name}") - + def signal(self, sig, node): + """Send a signal to the instance of this process running on the given node.""" + # Using psutil would be nice but we need a shell command line. + + # Word boundaries. It's not clear how portable \<, \>, \b, and \W are. + non_word_char = "[^_[:alnum:]]" + word_begin = f"(^|{non_word_char})" + word_end = f"($|{non_word_char})" + + # Match this process, possibly running under valgrind + search_re = f"({word_begin}valgrind )?.*{word_begin}{self.name}{word_end}" + + if sig in ["SIGKILL", "KILL", 9, "SIGTERM", "TERM", 15]: + (rc, _) = self._cm.rsh(node, f"pgrep --full '{search_re}'") + if rc == 1: + # No matching process, so nothing to kill/terminate + return + if rc != 0: + # 2 or 3: Syntax error or fatal error (like out of memory) + self._cm.log(f"ERROR: pgrep for {self.name} failed on node {node}") + return + + # 0: One or more processes were successfully signaled. + # 1: No processes matched or none of them could be signalled. + # This is why we check for no matching process above. 
+ (rc, _) = self._cm.rsh(node, f"pkill --signal {sig} --full '{search_re}'") if rc != 0: - self._cm.log(f"ERROR: Kill {self.name} failed on node {node}") + self._cm.log(f"ERROR: Sending signal {sig} to {self.name} failed on node {node}") diff --git a/python/pacemaker/_cts/audits.py b/python/pacemaker/_cts/audits.py index d140a15a6e..95165d48d0 100644 --- a/python/pacemaker/_cts/audits.py +++ b/python/pacemaker/_cts/audits.py @@ -1,1052 +1,1044 @@ """Auditing classes for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["AuditConstraint", "AuditResource", "ClusterAudit", "audit_list"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re import time import uuid from pacemaker.buildoptions import BuildOptions from pacemaker._cts.input import should_continue from pacemaker._cts.watcher import LogKind, LogWatcher class ClusterAudit: """ The base class for various kinds of auditors. Specific audit implementations should be built on top of this one. Audits can do all kinds of checks on the system. The basic interface for callers is the `__call__` method, which returns True if the audit passes and False if it fails. """ def __init__(self, cm): """ Create a new ClusterAudit instance. Arguments: cm -- A ClusterManager instance """ # pylint: disable=invalid-name self._cm = cm self.name = None def __call__(self): """Perform the audit action.""" raise NotImplementedError def is_applicable(self): """ Return True if this audit is applicable in the current test configuration. This method must be implemented by all subclasses. """ raise NotImplementedError def log(self, args): """Log a message.""" self._cm.log(f"audit: {args}") def debug(self, args): """Log a debug message.""" self._cm.debug(f"audit: {args}") class LogAudit(ClusterAudit): """ Audit each cluster node to verify that some logging system is usable. 
This is done by logging a unique test message and then verifying that we can read back that test message using logging tools. """ def __init__(self, cm): """ Create a new LogAudit instance. Arguments: cm -- A ClusterManager instance """ ClusterAudit.__init__(self, cm) self.name = "LogAudit" def _restart_cluster_logging(self, nodes=None): """Restart logging on the given nodes, or all if none are given.""" if not nodes: nodes = self._cm.env["nodes"] self._cm.debug(f"Restarting logging on: {nodes!r}") for node in nodes: if self._cm.env["have_systemd"]: (rc, _) = self._cm.rsh(node, "systemctl stop systemd-journald.socket") if rc != 0: self._cm.log(f"ERROR: Cannot stop 'systemd-journald' on {node}") (rc, _) = self._cm.rsh(node, "systemctl start systemd-journald.service") if rc != 0: self._cm.log(f"ERROR: Cannot start 'systemd-journald' on {node}") if "syslogd" in self._cm.env: (rc, _) = self._cm.rsh(node, f"service {self._cm.env['syslogd']} restart") if rc != 0: self._cm.log(f"""ERROR: Cannot restart '{self._cm.env["syslogd"]}' on {node}""") def _create_watcher(self, patterns, kind): """Create a new LogWatcher instance for the given patterns.""" watch = LogWatcher(self._cm.env["LogFileName"], patterns, self._cm.env["nodes"], kind, "LogAudit", 5, silent=True) watch.set_watch() return watch def _test_logging(self): """Perform the log audit.""" patterns = [] prefix = "Test message from" suffix = str(uuid.uuid4()) watch = {} for node in self._cm.env["nodes"]: # Look for the node name in two places to make sure # that syslog is logging with the correct hostname m = re.search("^([^.]+).*", node) if m: simple = m.group(1) else: simple = node patterns.append(f"{simple}.*{prefix} {node} {suffix}") watch_pref = self._cm.env["log_kind"] if watch_pref is None: kinds = [LogKind.LOCAL_FILE] if self._cm.env["have_systemd"]: kinds.append(LogKind.JOURNAL) kinds.append(LogKind.REMOTE_FILE) for k in kinds: watch[k] = self._create_watcher(patterns, k) self._cm.log(f"Logging test message 
with identifier {suffix}") else: watch[watch_pref] = self._create_watcher(patterns, watch_pref) for node in self._cm.env["nodes"]: - cmd = f"logger -p {self._cm.env['SyslogFacility']}.info {prefix} {node} {suffix}" + cmd = f"logger -p {self._cm.env['syslog_facility']}.info {prefix} {node} {suffix}" (rc, _) = self._cm.rsh(node, cmd, synchronous=False, verbose=0) if rc != 0: self._cm.log(f"ERROR: Cannot execute remote command [{cmd}] on {node}") for k, w in watch.items(): if watch_pref is None: self._cm.log(f"Checking for test message in {k} logs") w.look_for_all(silent=True) if not w.unmatched: if watch_pref is None: self._cm.log(f"Found test message in {k} logs") self._cm.env["log_kind"] = k return True for regex in w.unmatched: self._cm.log(f"Test message [{regex}] not found in {w.kind} logs") return False def __call__(self): """Perform the audit action.""" max_attempts = 3 attempt = 0 passed = True self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"]) while attempt <= max_attempts and not self._test_logging(): attempt += 1 self._restart_cluster_logging() time.sleep(60 * attempt) if attempt > max_attempts: self._cm.log("ERROR: Cluster logging unrecoverable.") passed = False return passed def is_applicable(self): """Return True if this audit is applicable in the current test configuration.""" if self._cm.env["LogAuditDisabled"]: return False return True class DiskAudit(ClusterAudit): """ Audit disk usage on cluster nodes. Verify that there is enough free space left on whichever mounted file system holds the logs. Warn on: less than 100 MB or 10% of free space Error on: less than 10 MB or 5% of free space """ def __init__(self, cm): """ Create a new DiskAudit instance. 
Arguments: cm -- A ClusterManager instance """ ClusterAudit.__init__(self, cm) self.name = "DiskspaceAudit" def __call__(self): """Perform the audit action.""" passed = True # @TODO Use directory of PCMK_logfile if set on host dfcmd = "df -BM %s | tail -1 | awk '{print $(NF-1)\" \"$(NF-2)}' | tr -d 'M%%'" % BuildOptions.LOG_DIR self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"]) for node in self._cm.env["nodes"]: (_, dfout) = self._cm.rsh(node, dfcmd, verbose=1) if not dfout: self._cm.log(f"ERROR: Cannot execute remote df command [{dfcmd}] on {node}") continue dfout = dfout[0].strip() try: (used, remain) = dfout.split() used_percent = int(used) remaining_mb = int(remain) except (ValueError, TypeError): self._cm.log(f"Warning: df output '{dfout}' from {node} was invalid [{used}, {remain}]") else: if remaining_mb < 10 or used_percent > 95: self._cm.log(f"CRIT: Out of log disk space on {node} ({used_percent}% / {remaining_mb}MB)") passed = False if not should_continue(self._cm.env): raise ValueError(f"Disk full on {node}") elif remaining_mb < 100 or used_percent > 90: self._cm.log(f"WARN: Low on log disk space ({remaining_mb}MB) on {node}") return passed def is_applicable(self): """Return True if this audit is applicable in the current test configuration.""" return True class FileAudit(ClusterAudit): """ Audit the filesystem looking for various failure conditions. Check for: * The presence of core dumps from corosync or Pacemaker daemons * Stale IPC files """ def __init__(self, cm): """ Create a new FileAudit instance. 
Arguments: cm -- A ClusterManager instance """ ClusterAudit.__init__(self, cm) self.known = [] self.name = "FileAudit" def _output_has_core(self, output, node): """Check output for any lines that would indicate the presence of a core dump.""" found = False for line in output: line = line.strip() if line in self.known: continue found = True self.known.append(line) self._cm.log(f"Warning: core file on {node}: {line}") return found def _find_core_with_coredumpctl(self, node): """Use coredumpctl to find core dumps on the given node.""" (_, lsout) = self._cm.rsh(node, "coredumpctl --no-legend --no-pager") return self._output_has_core(lsout, node) def _find_core_on_fs(self, node, paths): """Check for core dumps on the given node, under any of the given paths.""" (_, lsout) = self._cm.rsh(node, f"ls -al {' '.join(paths)} | grep core.[0-9]", verbose=1) return self._output_has_core(lsout, node) def __call__(self): """Perform the audit action.""" passed = True self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"]) for node in self._cm.env["nodes"]: found = False # If systemd is present, first see if coredumpctl logged any core dumps. if self._cm.env["have_systemd"]: found = self._find_core_with_coredumpctl(node) if found: passed = False # If we didn't find any core dumps, it's for one of three reasons: # (1) Nothing crashed # (2) systemd is not present # (3) systemd is present but coredumpctl is not enabled # # To handle the last two cases, check the other filesystem locations. 
if not found: found = self._find_core_on_fs(node, ["/var/lib/pacemaker/cores/*", "/var/lib/corosync"]) if found: passed = False if self._cm.expected_status.get(node) == "down": clean = False (_, lsout) = self._cm.rsh(node, "ls -al /dev/shm | grep qb-", verbose=1) for line in lsout: passed = False clean = True self._cm.log(f"Warning: Stale IPC file on {node}: {line}") if clean: (_, lsout) = self._cm.rsh(node, "ps axf | grep -e pacemaker -e corosync", verbose=1) for line in lsout: self._cm.debug(f"ps[{node}]: {line}") self._cm.rsh(node, "rm -rf /dev/shm/qb-*") else: self._cm.debug(f"Skipping {node}") return passed def is_applicable(self): """Return True if this audit is applicable in the current test configuration.""" return True class AuditResource: """A base class for storing information about a cluster resource.""" def __init__(self, cm, line): """ Create a new AuditResource instance. Arguments: cm -- A ClusterManager instance line -- One line of output from `crm_resource` describing a single resource """ # pylint: disable=invalid-name fields = line.split() self._cm = cm self.line = line self.type = fields[1] self.id = fields[2] self.clone_id = fields[3] self.parent = fields[4] self.rprovider = fields[5] self.rclass = fields[6] self.rtype = fields[7] self.host = fields[8] self.needs_quorum = fields[9] self.flags = int(fields[10]) self.flags_s = fields[11] if self.parent == "NA": self.parent = None @property def unique(self): """Return True if this resource is unique.""" return self.flags & 0x20 @property def orphan(self): """Return True if this resource is an orphan.""" return self.flags & 0x01 @property def managed(self): """Return True if this resource is managed by the cluster.""" return self.flags & 0x02 class AuditConstraint: """A base class for storing information about a cluster constraint.""" def __init__(self, cm, line): """ Create a new AuditConstraint instance. 
Arguments: cm -- A ClusterManager instance line -- One line of output from `crm_resource` describing a single constraint """ # pylint: disable=invalid-name fields = line.split() self._cm = cm self.line = line self.type = fields[1] self.id = fields[2] self.rsc = fields[3] self.target = fields[4] self.score = fields[5] self.rsc_role = fields[6] self.target_role = fields[7] if self.rsc_role == "NA": self.rsc_role = None if self.target_role == "NA": self.target_role = None class PrimitiveAudit(ClusterAudit): """ Audit primitive resources to verify a variety of conditions. Check that: * Resources are active and managed only when expected * Resources are active on the expected cluster node * Resources are not orphaned """ def __init__(self, cm): """ Create a new PrimitiveAudit instance. Arguments: cm -- A ClusterManager instance """ ClusterAudit.__init__(self, cm) self.name = "PrimitiveAudit" self._active_nodes = [] self._constraints = [] self._inactive_nodes = [] self._resources = [] self._target = None def _audit_resource(self, resource, quorum): """Perform the audit of a single resource.""" rc = True active = self._cm.resource_location(resource.id) if len(active) == 1: if quorum: self.debug(f"Resource {resource.id} active on {active!r}") elif resource.needs_quorum == 1: self._cm.log(f"Resource {resource.id} active without quorum: {active!r}") rc = False elif not resource.managed: self._cm.log(f"Resource {resource.id} not managed. 
Active on {active!r}") elif not resource.unique: # TODO: Figure out a clever way to actually audit these resource types if len(active) > 1: self.debug(f"Non-unique resource {resource.id} is active on: {active!r}") else: self.debug(f"Non-unique resource {resource.id} is not active") elif len(active) > 1: self._cm.log(f"Resource {resource.id} is active multiple times: {active!r}") rc = False elif resource.orphan: self.debug(f"Resource {resource.id} is an inactive orphan") elif not self._inactive_nodes: self._cm.log(f"WARN: Resource {resource.id} not served anywhere") rc = False - elif self._cm.env["warn-inactive"]: - if quorum or not resource.needs_quorum: - self._cm.log(f"WARN: Resource {resource.id} not served anywhere " - f"(Inactive nodes: {self._inactive_nodes!r})") - else: - self.debug(f"Resource {resource.id} not served anywhere " - f"(Inactive nodes: {self._inactive_nodes!r})") - elif quorum or not resource.needs_quorum: self.debug(f"Resource {resource.id} not served anywhere " f"(Inactive nodes: {self._inactive_nodes!r})") return rc def _setup(self): """ Verify cluster nodes are active. Collect resource and colocation information used for performing the audit. 
""" for node in self._cm.env["nodes"]: if self._cm.expected_status[node] == "up": self._active_nodes.append(node) else: self._inactive_nodes.append(node) for node in self._cm.env["nodes"]: if self._target is None and self._cm.expected_status[node] == "up": self._target = node if not self._target: # TODO: In Pacemaker 1.0 clusters we'll be able to run crm_resource # with CIB_file=/path/to/cib.xml even when the cluster isn't running self.debug(f"No nodes active - skipping {self.name}") return False (_, lines) = self._cm.rsh(self._target, "crm_resource --list-cts", verbose=1) for line in lines: if re.search("^Resource", line): self._resources.append(AuditResource(self._cm, line)) elif re.search("^Constraint", line): self._constraints.append(AuditConstraint(self._cm, line)) else: self._cm.log(f"Unknown entry: {line}") return True def __call__(self): """Perform the audit action.""" passed = True if not self._setup(): return passed + primitives = [r for r in self._resources if r.type == "primitive"] quorum = self._cm.has_quorum(None) - for resource in self._resources: - if resource.type == "primitive" and not self._audit_resource(resource, quorum): + + for primitive in primitives: + if not self._audit_resource(primitive, quorum): passed = False return passed def is_applicable(self): """Return True if this audit is applicable in the current test configuration.""" # @TODO Due to long-ago refactoring, this name test would never match, # so this audit (and those derived from it) would never run. # Uncommenting the next lines fixes the name test, but that then # exposes pre-existing bugs that need to be fixed. - # if self._cm["Name"] == "crm-corosync": + # if self._cm.name == "crm-corosync": # return True return False class GroupAudit(PrimitiveAudit): """ Audit group resources. Check that: * Each of its child primitive resources is active on the expected cluster node """ def __init__(self, cm): """ Create a new GroupAudit instance. 
Arguments: cm -- A ClusterManager instance """ PrimitiveAudit.__init__(self, cm) self.name = "GroupAudit" def __call__(self): passed = True if not self._setup(): return passed - for group in self._resources: - if group.type != "group": - continue + groups = [r for r in self._resources if r.type == "group"] + for group in groups: first_match = True group_location = None + children = [r for r in self._resources if r.parent == group.id] - for child in self._resources: - if child.parent != group.id: - continue - + for child in children: nodes = self._cm.resource_location(child.id) if first_match and len(nodes) > 0: group_location = nodes[0] first_match = False if len(nodes) > 1: passed = False self._cm.log(f"Child {child.id} of {group.id} is active more than once: {nodes!r}") elif not nodes: # Groups are allowed to be partially active # However we do need to make sure later children aren't running group_location = None self.debug(f"Child {child.id} of {group.id} is stopped") elif nodes[0] != group_location: passed = False self._cm.log(f"Child {child.id} of {group.id} is active on the wrong " f"node ({nodes[0]}) expected {group_location}") else: self.debug(f"Child {child.id} of {group.id} is active on {nodes[0]}") return passed class CloneAudit(PrimitiveAudit): """ Audit clone resources. NOTE: Currently, this class does not perform any actual audit functions. """ def __init__(self, cm): """ Create a new CloneAudit instance. 
Arguments: cm -- A ClusterManager instance """ PrimitiveAudit.__init__(self, cm) self.name = "CloneAudit" def __call__(self): passed = True if not self._setup(): return passed - for clone in self._resources: - if clone.type != "clone": - continue + clones = [r for r in self._resources if r.type == "clone"] + + for clone in clones: + children = [r for r in self._resources + if r.parent == clone.id and r.type == "primitive"] - for child in self._resources: - if child.parent == clone.id and child.type == "primitive": - self.debug(f"Checking child {child.id} of {clone.id}...") - # Check max and node_max - # Obtain with: - # crm_resource -g clone_max --meta -r child.id - # crm_resource -g clone_node_max --meta -r child.id + for child in children: + self.debug(f"Checking child {child.id} of {clone.id}...") + # Check max and node_max + # Obtain with: + # crm_resource -g clone_max --meta -r child.id + # crm_resource -g clone_node_max --meta -r child.id return passed class ColocationAudit(PrimitiveAudit): """ Audit cluster resources. Check that: * Resources are colocated with the expected resource """ def __init__(self, cm): """ Create a new ColocationAudit instance. 
Arguments: cm -- A ClusterManager instance """ PrimitiveAudit.__init__(self, cm) self.name = "ColocationAudit" def _crm_location(self, resource): """Return a list of cluster nodes where a given resource is running.""" (rc, lines) = self._cm.rsh(self._target, f"crm_resource --locate -r {resource} -Q", verbose=1) hosts = [] if rc == 0: for line in lines: fields = line.split() hosts.append(fields[0]) return hosts def __call__(self): passed = True if not self._setup(): return passed for coloc in self._constraints: if coloc.type != "rsc_colocation": continue source = self._crm_location(coloc.rsc) target = self._crm_location(coloc.target) if not source: self.debug(f"Colocation audit ({coloc.id}): {coloc.rsc} not running") else: for node in source: if node not in target: passed = False self._cm.log(f"Colocation audit ({coloc.id}): {coloc.rsc} running " f"on {node} (not in {target!r})") else: self.debug(f"Colocation audit ({coloc.id}): {coloc.rsc} running " f"on {node} (in {target!r})") return passed class ControllerStateAudit(ClusterAudit): """Verify active and inactive resources.""" def __init__(self, cm): """ Create a new ControllerStateAudit instance. 
Arguments: cm -- A ClusterManager instance """ ClusterAudit.__init__(self, cm) self.name = "ControllerStateAudit" def __call__(self): passed = True up_are_down = 0 down_are_up = 0 unstable_list = [] for node in self._cm.env["nodes"]: should_be = self._cm.expected_status[node] rc = self._cm.test_node_cm(node) if rc > 0: if should_be == "down": down_are_up += 1 if rc == 1: unstable_list.append(node) elif should_be == "up": up_are_down += 1 if len(unstable_list) > 0: passed = False self._cm.log(f"Cluster is not stable: {len(unstable_list)} (of " f"{self._cm.upcount()}): {unstable_list!r}") if up_are_down > 0: passed = False self._cm.log(f"{up_are_down} (of {len(self._cm.env['nodes'])}) nodes " "expected to be up were down.") if down_are_up > 0: passed = False self._cm.log(f"{down_are_up} (of {len(self._cm.env['nodes'])}) nodes " "expected to be down were up.") return passed def is_applicable(self): """Return True if this audit is applicable in the current test configuration.""" # @TODO Due to long-ago refactoring, this name test would never match, # so this audit (and those derived from it) would never run. # Uncommenting the next lines fixes the name test, but that then # exposes pre-existing bugs that need to be fixed. - # if self._cm["Name"] == "crm-corosync": + # if self._cm.name == "crm-corosync": # return True return False class CIBAudit(ClusterAudit): """Audit the CIB by verifying that it is identical across cluster nodes.""" def __init__(self, cm): """ Create a new CIBAudit instance. 
Arguments: cm -- A ClusterManager instance """ ClusterAudit.__init__(self, cm) self.name = "CibAudit" def __call__(self): passed = True ccm_partitions = self._cm.find_partitions() if not ccm_partitions: self.debug("\tNo partitions to audit") return passed for partition in ccm_partitions: self.debug(f"\tAuditing CIB consistency for: {partition}") if self._audit_cib_contents(partition) == 0: passed = False return passed def _audit_cib_contents(self, hostlist): """Perform the CIB audit on the given hosts.""" passed = True node0 = None node0_xml = None partition_hosts = hostlist.split() for node in partition_hosts: node_xml = self._store_remote_cib(node, node0) if node_xml is None: self._cm.log(f"Could not perform audit: No configuration from {node}") passed = False elif node0 is None: node0 = node node0_xml = node_xml elif node0_xml is None: self._cm.log(f"Could not perform audit: No configuration from {node0}") passed = False else: (rc, result) = self._cm.rsh( node0, f"crm_diff -VV -cf --new {node_xml} --original {node0_xml}", verbose=1) if rc != 0: self._cm.log(f"Diff between {node0_xml} and {node_xml} failed: {rc}") passed = False for line in result: if not re.search("", line): passed = False self.debug(f"CibDiff[{node0}-{node}]: {line}") else: self.debug(f"CibDiff[{node0}-{node}] Ignoring: {line}") return passed def _store_remote_cib(self, node, target): """ Store a copy of the given node's CIB on the given target node. If no target is given, store the CIB on the given node. 
""" filename = f"/tmp/ctsaudit.{node}.xml" if not target: target = node - (rc, lines) = self._cm.rsh(node, self._cm["CibQuery"], verbose=1) + (rc, lines) = self._cm.rsh(node, self._cm.templates["CibQuery"], verbose=1) if rc != 0: self._cm.log("Could not retrieve configuration") return None self._cm.rsh("localhost", f"rm -f {filename}") for line in lines: self._cm.rsh("localhost", f"echo \'{line[:-1]}\' >> {filename}", verbose=0) if self._cm.rsh.copy(filename, f"root@{target}:{filename}", silent=True) != 0: self._cm.log("Could not store configuration") return None return filename def is_applicable(self): """Return True if this audit is applicable in the current test configuration.""" # @TODO Due to long-ago refactoring, this name test would never match, # so this audit (and those derived from it) would never run. # Uncommenting the next lines fixes the name test, but that then # exposes pre-existing bugs that need to be fixed. - # if self._cm["Name"] == "crm-corosync": + # if self._cm.name == "crm-corosync": # return True return False class PartitionAudit(ClusterAudit): """ Audit each partition in a cluster to verify a variety of conditions. Check that: * The number of partitions and the nodes in each is as expected * Each node is active when it should be active and inactive when it should be inactive * The status and epoch of each node is as expected * A partition has quorum * A partition has a DC when expected """ def __init__(self, cm): """ Create a new PartitionAudit instance. 
Arguments: cm -- A ClusterManager instance """ ClusterAudit.__init__(self, cm) self.name = "PartitionAudit" self._node_epoch = {} self._node_state = {} self._node_quorum = {} def __call__(self): passed = True ccm_partitions = self._cm.find_partitions() if not ccm_partitions: return passed self._cm.cluster_stable(double_check=True) if len(ccm_partitions) != self._cm.partitions_expected: self._cm.log(f"ERROR: {len(ccm_partitions)} cluster partitions detected:") passed = False for partition in ccm_partitions: self._cm.log(f"\t {partition}") for partition in ccm_partitions: if self._audit_partition(partition) == 0: passed = False return passed def _trim_string(self, avalue): """Remove the last character from a multi-character string.""" if not avalue: return None if len(avalue) > 1: return avalue[:-1] return avalue def _trim2int(self, avalue): """Remove the last character from a multi-character string and convert the result to an int.""" trimmed = self._trim_string(avalue) if trimmed: return int(trimmed) return None def _audit_partition(self, partition): """Perform the audit of a single partition.""" passed = True dc_found = [] dc_allowed_list = [] lowest_epoch = None node_list = partition.split() self.debug(f"Auditing partition: {partition}") for node in node_list: if self._cm.expected_status[node] != "up": self._cm.log(f"Warn: Node {node} appeared out of nowhere") self._cm.expected_status[node] = "up" # not in itself a reason to fail the audit (not what we're # checking for in this audit) - (_, out) = self._cm.rsh(node, self._cm["StatusCmd"] % node, verbose=1) + (_, out) = self._cm.rsh(node, self._cm.templates["StatusCmd"] % node, verbose=1) self._node_state[node] = out[0].strip() - (_, out) = self._cm.rsh(node, self._cm["EpochCmd"], verbose=1) + (_, out) = self._cm.rsh(node, self._cm.templates["EpochCmd"], verbose=1) self._node_epoch[node] = out[0].strip() - (_, out) = self._cm.rsh(node, self._cm["QuorumCmd"], verbose=1) + (_, out) = self._cm.rsh(node, 
self._cm.templates["QuorumCmd"], verbose=1) self._node_quorum[node] = out[0].strip() self.debug(f"Node {node}: {self._node_state[node]} - {self._node_epoch[node]} - {self._node_quorum[node]}.") self._node_state[node] = self._trim_string(self._node_state[node]) self._node_epoch[node] = self._trim2int(self._node_epoch[node]) self._node_quorum[node] = self._trim_string(self._node_quorum[node]) if not self._node_epoch[node]: self._cm.log(f"Warn: Node {node} disappeared: can't determine epoch") self._cm.expected_status[node] = "down" # not in itself a reason to fail the audit (not what we're # checking for in this audit) elif lowest_epoch is None or self._node_epoch[node] < lowest_epoch: lowest_epoch = self._node_epoch[node] if not lowest_epoch: self._cm.log(f"Lowest epoch not determined in {partition}") passed = False for node in node_list: if self._cm.expected_status[node] != "up": continue if self._cm.is_node_dc(node, self._node_state[node]): dc_found.append(node) if self._node_epoch[node] == lowest_epoch: self.debug(f"{node}: OK") elif not self._node_epoch[node]: self.debug(f"Check on {node} ignored: no node epoch") elif not lowest_epoch: self.debug(f"Check on {node} ignored: no lowest epoch") else: self._cm.log(f"DC {node} is not the oldest node " f"({self._node_epoch[node]} vs. 
{lowest_epoch})") passed = False if not dc_found: self._cm.log(f"DC not found on any of the {len(dc_allowed_list)} allowed " f"nodes: {dc_allowed_list} (of {node_list})") elif len(dc_found) > 1: self._cm.log(f"{len(dc_found)} DCs ({dc_found}) found in cluster partition: {node_list}") passed = False if not passed: for node in node_list: if self._cm.expected_status[node] == "up": self._cm.log(f"epoch {self._node_epoch[node]} : {self._node_state[node]}") return passed def is_applicable(self): """Return True if this audit is applicable in the current test configuration.""" # @TODO Due to long-ago refactoring, this name test would never match, # so this audit (and those derived from it) would never run. # Uncommenting the next lines fixes the name test, but that then # exposes pre-existing bugs that need to be fixed. - # if self._cm["Name"] == "crm-corosync": + # if self._cm.name == "crm-corosync": # return True return False # pylint: disable=invalid-name def audit_list(cm): """Return a list of instances of applicable audits that can be performed.""" result = [] for auditclass in [DiskAudit, FileAudit, LogAudit, ControllerStateAudit, PartitionAudit, PrimitiveAudit, GroupAudit, CloneAudit, ColocationAudit, CIBAudit]: a = auditclass(cm) if a.is_applicable(): result.append(a) return result diff --git a/python/pacemaker/_cts/cib.py b/python/pacemaker/_cts/cib.py index 3f0ed8327b..1dccd56c96 100644 --- a/python/pacemaker/_cts/cib.py +++ b/python/pacemaker/_cts/cib.py @@ -1,408 +1,403 @@ """CIB generator for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["ConfigFactory"] __copyright__ = "Copyright 2008-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import warnings import tempfile from pacemaker.buildoptions import BuildOptions from pacemaker._cts.cibxml import Alerts, Clone, Expression, FencingTopology, Group, Nodes, OpDefaults, Option, Resource, Rule from pacemaker._cts.network import 
next_ip class CIB: """A class for generating, representing, and installing a CIB file onto cluster nodes.""" def __init__(self, cm, version, factory, tmpfile=None): """ Create a new CIB instance. Arguments: cm -- A ClusterManager instance version -- The schema syntax version factory -- A ConfigFactory instance tmpfile -- Where to store the CIB, or None to use a new tempfile """ # pylint: disable=invalid-name self._cib = None self._cm = cm self._counter = 1 self._factory = factory self._num_nodes = 0 self.version = version if not tmpfile: warnings.filterwarnings("ignore") # pylint: disable=consider-using-with f = tempfile.NamedTemporaryFile(delete=True) f.close() tmpfile = f.name warnings.resetwarnings() self._factory.tmpfile = tmpfile def _show(self): """Query a cluster node for its generated CIB; log and return the result.""" output = "" (_, result) = self._factory.rsh(self._factory.target, f"HOME=/root CIB_file={self._factory.tmpfile} cibadmin -Q", verbose=1) for line in result: output += line self._factory.debug(f"Generated Config: {line}") return output def new_ip(self, name=None): """Generate an IP resource for the next available IP address, optionally specifying the resource's name.""" - if self._cm.env["IPagent"] == "IPaddr2": - ip = next_ip(self._cm.env["IPBase"]) - if not name: - if ":" in ip: - (_, _, suffix) = ip.rpartition(":") - name = f"r{suffix}" - else: - name = f"r{ip}" - - r = Resource(self._factory, name, self._cm.env["IPagent"], "ocf") - r["ip"] = ip - + ip = next_ip(self._cm.env["IPBase"]) + if not name: if ":" in ip: - r["cidr_netmask"] = "64" - r["nic"] = "eth0" + (_, _, suffix) = ip.rpartition(":") + name = f"r{suffix}" else: - r["cidr_netmask"] = "32" + name = f"r{ip}" - else: - if not name: - name = f"r{self._cm.env['IPagent']}{self._counter}" - self._counter += 1 + r = Resource(self._factory, name, "IPaddr2", "ocf") + r["ip"] = ip - r = Resource(self._factory, name, self._cm.env["IPagent"], "ocf") + if ":" in ip: + r["cidr_netmask"] = 
"64" + r["nic"] = "eth0" + else: + r["cidr_netmask"] = "32" r.add_op("monitor", "5s") return r def get_node_id(self, node_name): """Check the cluster configuration for the node ID for the given node_name.""" # We can't account for every possible configuration, # so we only return a node ID if: # * The node is specified in /etc/corosync/corosync.conf # with "ring0_addr:" equal to node_name and "nodeid:" # explicitly specified. # In all other cases, we return 0. node_id = 0 # awkward command: use } as record separator # so each corosync.conf "object" is one record; # match the "node {" record that has "ring0_addr: node_name"; # then print the substring of that record after "nodeid:" awk = r"""awk -v RS="}" """ \ r"""'/^(\s*nodelist\s*{)?\s*node\s*{.*(ring0_addr|name):\s*%s(\s+|$)/""" \ r"""{gsub(/.*nodeid:\s*/,"");gsub(/\s+.*$/,"");print}' %s""" \ % (node_name, BuildOptions.COROSYNC_CONFIG_FILE) (rc, output) = self._factory.rsh(self._factory.target, awk, verbose=1) if rc == 0 and len(output) == 1: try: node_id = int(output[0]) except ValueError: node_id = 0 return node_id def install(self, target): """Generate a CIB file and install it to the given cluster node.""" old = self._factory.tmpfile # Force a rebuild self._cib = None self._factory.tmpfile = f"{BuildOptions.CIB_DIR}/cib.xml" self.contents(target) self._factory.rsh(self._factory.target, f"chown {BuildOptions.DAEMON_USER} {self._factory.tmpfile}") self._factory.tmpfile = old def contents(self, target): """Generate a complete CIB file.""" + # pylint: disable=too-many-locals # fencing resource if self._cib: return self._cib if target: self._factory.target = target self._factory.rsh(self._factory.target, f"HOME=/root cibadmin --empty {self.version} > {self._factory.tmpfile}") self._num_nodes = len(self._cm.env["nodes"]) no_quorum = "stop" if self._num_nodes < 3: no_quorum = "ignore" self._factory.log(f"Cluster only has {self._num_nodes} nodes, configuring: no-quorum-policy=ignore") # We don't need a nodes section 
unless we add attributes stn = None # Fencing resource # Define first so that the shell doesn't reject every update - if self._cm.env["DoFencing"]: + if self._cm.env["fencing_enabled"]: # Define the "real" fencing device - st = Resource(self._factory, "Fencing", self._cm.env["stonith-type"], "stonith") + st = Resource(self._factory, "Fencing", self._cm.env["fencing_agent"], "stonith") # Set a threshold for unreliable stonith devices such as the vmware one st.add_meta("migration-threshold", "5") st.add_op("monitor", "120s", timeout="120s") st.add_op("stop", "0", timeout="60s") st.add_op("start", "0", timeout="60s") # For remote node tests, a cluster node is stopped and brought back up # as a remote node with the name "remote-OLDNAME". To allow fencing # devices to fence these nodes, create a list of all possible node names. all_node_names = [prefix + n for n in self._cm.env["nodes"] for prefix in ('', 'remote-')] # Add all parameters specified by user - entries = self._cm.env["stonith-params"].split(',') - for entry in entries: + for param in self._cm.env["fencing_params"]: try: - (name, value) = entry.split('=', 1) + (name, value) = param.split('=', 1) except ValueError: - print(f"Warning: skipping invalid fencing parameter: {entry}") + print(f"Warning: skipping invalid fencing parameter: {param}") continue # Allow user to specify "all" as the node list, and expand it here if name in ["hostlist", "pcmk_host_list"] and value == "all": value = ' '.join(all_node_names) st[name] = value st.commit() # Test advanced fencing logic stf_nodes = [] stt_nodes = [] attr_nodes = {} # Create the levels stl = FencingTopology(self._factory) for node in self._cm.env["nodes"]: # Remote node tests will rename the node remote_node = f"remote-{node}" # Randomly assign node to a fencing method - ftype = self._cm.env.random_gen.choice(["levels-and", "levels-or ", "broadcast "]) + # @TODO What does "broadcast" do, if anything? 
+ types = ["levels-and", "levels-or", "broadcast"] + width = max(len(t) for t in types) + ftype = self._cm.env.random_gen.choice(types) # For levels-and, randomly choose targeting by node name or attribute by = "" if ftype == "levels-and": node_id = self.get_node_id(node) if node_id == 0 or self._cm.env.random_gen.choice([True, False]): by = " (by name)" else: attr_nodes[node] = node_id by = " (by attribute)" - self._cm.log(f" - Using {ftype} fencing for node: {node}{by}") + self._cm.log(f" - Using {ftype:{width}} fencing for node: {node}{by}") if ftype == "levels-and": # If targeting by name, add a topology level for this node if node not in attr_nodes: stl.level(1, node, "FencingPass,Fencing") # Always target remote nodes by name, otherwise we would need to add # an attribute to the remote node only during remote tests (we don't # want nonexistent remote nodes showing up in the non-remote tests). # That complexity is not worth the effort. stl.level(1, remote_node, "FencingPass,Fencing") # Add the node (and its remote equivalent) to the list of levels-and nodes. stt_nodes.extend([node, remote_node]) - elif ftype == "levels-or ": + elif ftype == "levels-or": for n in [node, remote_node]: stl.level(1, n, "FencingFail") stl.level(2, n, "Fencing") stf_nodes.extend([node, remote_node]) # If any levels-and nodes were targeted by attribute, # create the attributes and a level for the attribute. 
if attr_nodes: stn = Nodes(self._factory) for (node_name, node_id) in attr_nodes.items(): stn.add_node(node_name, node_id, {"cts-fencing": "levels-and"}) stl.level(1, None, "FencingPass,Fencing", "cts-fencing", "levels-and") # Create a Dummy agent that always passes for levels-and if stt_nodes: stt = Resource(self._factory, "FencingPass", "fence_dummy", "stonith") stt["pcmk_host_list"] = " ".join(stt_nodes) # Wait this many seconds before doing anything, handy for letting disks get flushed too stt["random_sleep_range"] = "30" stt["mode"] = "pass" stt.commit() # Create a Dummy agent that always fails for levels-or if stf_nodes: stf = Resource(self._factory, "FencingFail", "fence_dummy", "stonith") stf["pcmk_host_list"] = " ".join(stf_nodes) # Wait this many seconds before doing anything, handy for letting disks get flushed too stf["random_sleep_range"] = "30" stf["mode"] = "fail" stf.commit() # Now commit the levels themselves stl.commit() o = Option(self._factory) - o["stonith-enabled"] = self._cm.env["DoFencing"] + o["stonith-enabled"] = self._cm.env["fencing_enabled"] o["start-failure-is-fatal"] = "false" o["pe-input-series-max"] = "5000" o["shutdown-escalation"] = "5min" o["batch-limit"] = "10" o["dc-deadtime"] = "5s" o["no-quorum-policy"] = no_quorum o.commit() o = OpDefaults(self._factory) o["timeout"] = "90s" o.commit() # Commit the nodes section if we defined one if stn is not None: stn.commit() # Add an alerts section if possible if self._factory.rsh.exists_on_all(self._cm.env["notification-agent"], self._cm.env["nodes"]): alerts = Alerts(self._factory) alerts.add_alert(self._cm.env["notification-agent"], self._cm.env["notification-recipient"]) alerts.commit() # Add resources? 
- if self._cm.env["CIBResource"]: + if self._cm.env["create_resources"]: self.add_resources() # generate cib self._cib = self._show() if self._factory.tmpfile != f"{BuildOptions.CIB_DIR}/cib.xml": self._factory.rsh(self._factory.target, f"rm -f {self._factory.tmpfile}") return self._cib def add_resources(self): """Add various resources and their constraints to the CIB.""" # Per-node resources for node in self._cm.env["nodes"]: name = f"rsc_{node}" r = self.new_ip(name) r.prefer(node, "100") r.commit() # Migrator # Make this slightly sticky (since we have no other location constraints) to avoid relocation during Reattach m = Resource(self._factory, "migrator", "Dummy", "ocf", "pacemaker") m["passwd"] = "whatever" m.add_meta("resource-stickiness", "1") m.add_meta("allow-migrate", "1") m.add_op("monitor", "P10S") m.commit() # Ping the test exerciser p = Resource(self._factory, "ping-1", "ping", "ocf", "pacemaker") p.add_op("monitor", "60s") p["host_list"] = self._cm.env["cts-exerciser"] p["name"] = "connected" p["debug"] = "true" c = Clone(self._factory, "Connectivity", p) c["globally-unique"] = "false" c.commit() # promotable clone resource s = Resource(self._factory, "stateful-1", "Stateful", "ocf", "pacemaker") s.add_op("monitor", "15s", timeout="60s") s.add_op("monitor", "16s", timeout="60s", role="Promoted") ms = Clone(self._factory, "promotable-1", s) ms["promotable"] = "true" ms["clone-max"] = self._num_nodes ms["clone-node-max"] = 1 ms["promoted-max"] = 1 ms["promoted-node-max"] = 1 # Require connectivity to run the promotable clone r = Rule(self._factory, "connected", "-INFINITY", op="or") r.add_child(Expression(self._factory, "m1-connected-1", "connected", "lt", "1")) r.add_child(Expression(self._factory, "m1-connected-2", "connected", "not_defined", None)) ms.prefer("connected", rule=r) ms.commit() # Group Resource g = Group(self._factory, "group-1") g.add_child(self.new_ip()) if self._cm.env["have_systemd"]: sysd = Resource(self._factory, "petulant", 
"pacemaker-cts-dummyd@10", "service") sysd.add_op("monitor", "P10S") g.add_child(sysd) else: g.add_child(self.new_ip()) g.add_child(self.new_ip()) # Make group depend on the promotable clone g.after("promotable-1", first="promote", then="start") g.colocate("promotable-1", "INFINITY", withrole="Promoted") g.commit() # LSB resource dependent on group-1 if BuildOptions.INIT_DIR is not None: lsb = Resource(self._factory, "lsb-dummy", "LSBDummy", "lsb") lsb.add_op("monitor", "5s") lsb.after("group-1") lsb.colocate("group-1") lsb.commit() class ConfigFactory: """Singleton to generate a CIB file for the environment's schema version.""" def __init__(self, cm): """ Create a new ConfigFactory instance. Arguments: cm -- A ClusterManager instance """ # pylint: disable=invalid-name self._cm = cm self.rsh = self._cm.rsh if not self._cm.env["ListTests"]: self.target = self._cm.env["nodes"][0] self.tmpfile = None def log(self, args): """Log a message.""" self._cm.log(f"cib: {args}") def debug(self, args): """Log a debug message.""" self._cm.debug(f"cib: {args}") def create_config(self, name=f"pacemaker-{BuildOptions.CIB_SCHEMA_VERSION}"): """Return a CIB object for the given schema version.""" return CIB(self._cm, name, self) diff --git a/python/pacemaker/_cts/clustermanager.py b/python/pacemaker/_cts/clustermanager.py index fc4753e70d..f521e1f746 100644 --- a/python/pacemaker/_cts/clustermanager.py +++ b/python/pacemaker/_cts/clustermanager.py @@ -1,899 +1,817 @@ """ClusterManager class for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["ClusterManager"] __copyright__ = """Copyright 2000-2025 the Pacemaker project contributors. Certain portions by Huang Zhen are copyright 2004 International Business Machines. 
The version control history for this file may have further details.""" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import os import re import time from collections import UserDict from pacemaker.buildoptions import BuildOptions from pacemaker.exitstatus import ExitStatus from pacemaker._cts.CTS import NodeStatus from pacemaker._cts.audits import AuditResource from pacemaker._cts.cib import ConfigFactory from pacemaker._cts.environment import EnvFactory from pacemaker._cts.logging import LogFactory from pacemaker._cts.patterns import PatternSelector from pacemaker._cts.remote import RemoteFactory from pacemaker._cts.watcher import LogWatcher # pylint doesn't understand that self._rsh is callable (it stores the # singleton instance of RemoteExec, as returned by the getInstance method # of RemoteFactory). # @TODO See if type annotations fix this. # I think we could also fix this by getting rid of the getInstance methods, # but that's a project for another day. For now, just disable the warning. # pylint: disable=not-callable # ClusterManager has a lot of methods. # pylint: disable=too-many-public-methods class ClusterManager(UserDict): """ An abstract base class for managing the cluster. This class implements high-level operations on the cluster and/or its cluster managers. Actual cluster-specific management classes should be subclassed from this one. Among other things, this class tracks the state every node is expected to be in. """ - def _final_conditions(self): - """Check all keys to make sure they have a non-None value.""" - for (key, val) in self._data.items(): - if val is None: - raise ValueError(f"Improper derivation: self[{key}] must be overridden by subclass.") - def __init__(self): - """ - Create a new ClusterManager instance. - - This class can be treated kind of like a dictionary due to the process - of certain dict functions like __getitem__ and __setitem__. 
This is - because it contains a lot of name/value pairs. However, it is not - actually a dictionary so do not rely on standard dictionary behavior. - """ + """Create a new ClusterManager instance.""" # Eventually, ClusterManager should not be a UserDict subclass. Until # that point... # pylint: disable=super-init-not-called self.__instance_errors_to_ignore = [] self._cib_installed = False - self._data = {} self._logger = LogFactory() self.env = EnvFactory().getInstance() self.expected_status = {} self.name = self.env["Name"] # pylint: disable=invalid-name self.ns = NodeStatus(self.env) self.our_node = os.uname()[1].lower() self.partitions_expected = 1 self.rsh = RemoteFactory().getInstance() - self.templates = PatternSelector(self.env["Name"]) - - self._final_conditions() + self.templates = PatternSelector(self.name) self._cib_factory = ConfigFactory(self) self._cib = self._cib_factory.create_config(self.env["Schema"]) self._cib_sync = {} - def __getitem__(self, key): - """ - Return the given key, checking for it in several places. - - If key is "Name", return the name of the cluster manager. If the key - was previously added to the dictionary via __setitem__, return that. - Otherwise, return the template pattern for the key. - - This method should not be used and may be removed in the future. - """ - if key == "Name": - return self.name - - print(f"FIXME: Getting {key} from {self!r}") - if key in self._data: - return self._data[key] - - return self.templates.get_patterns(key) - - def __setitem__(self, key, value): - """ - Set the given key to the given value, overriding any previous value. - - This method should not be used and may be removed in the future. 
- """ - print(f"FIXME: Setting {key}={value} on {self!r}") - self._data[key] = value - def clear_instance_errors_to_ignore(self): """Reset instance-specific errors to ignore on each iteration.""" self.__instance_errors_to_ignore = [] @property def instance_errors_to_ignore(self): """Return a list of known errors that should be ignored for a specific test instance.""" return self.__instance_errors_to_ignore @property def errors_to_ignore(self): """Return a list of known error messages that should be ignored.""" return self.templates.get_patterns("BadNewsIgnore") def log(self, args): """Log a message.""" self._logger.log(args) def debug(self, args): """Log a debug message.""" self._logger.debug(args) def upcount(self): """Return how many nodes are up.""" count = 0 for node in self.env["nodes"]: if self.expected_status[node] == "up": count += 1 return count def install_support(self, command="install"): """ Install or uninstall the CTS support files. This includes various init scripts and data, daemons, fencing agents, etc. 
""" for node in self.env["nodes"]: self.rsh(node, f"{BuildOptions.DAEMON_DIR}/cts-support {command}") def prepare_fencing_watcher(self): """Return a LogWatcher object that watches for fencing log messages.""" # If we don't have quorum now but get it as a result of starting this node, # then a bunch of nodes might get fenced if self.has_quorum(None): self.debug("Have quorum") return None if not self.templates["Pat:Fencing_start"]: print("No start pattern") return None if not self.templates["Pat:Fencing_ok"]: print("No ok pattern") return None stonith = None stonith_pats = [] for peer in self.env["nodes"]: if self.expected_status[peer] == "up": continue stonith_pats.extend([ self.templates["Pat:Fencing_ok"] % peer, self.templates["Pat:Fencing_start"] % peer, ]) stonith = LogWatcher(self.env["LogFileName"], stonith_pats, self.env["nodes"], self.env["log_kind"], "StartupFencing", 0) stonith.set_watch() return stonith def fencing_cleanup(self, node, stonith): """Wait for a previously fenced node to return to the cluster.""" peer_list = [] peer_state = {} self.debug(f"Looking for nodes that were fenced as a result of {node} starting") # If we just started a node, we may now have quorum (and permission to fence) if not stonith: self.debug("Nothing to do") return peer_list q = self.has_quorum(None) if not q and len(self.env["nodes"]) > 2: # We didn't gain quorum - we shouldn't have shot anyone self.debug(f"Quorum: {q} Len: {len(self.env['nodes'])}") return peer_list for n in self.env["nodes"]: peer_state[n] = "unknown" # Now see if any states need to be updated self.debug(f"looking for: {stonith.regexes!r}") shot = stonith.look(0) while shot: self.debug(f"Found: {shot!r}") del stonith.regexes[stonith.whichmatch] # Extract node name for n in self.env["nodes"]: if re.search(self.templates["Pat:Fencing_ok"] % n, shot): peer = n peer_state[peer] = "complete" self.__instance_errors_to_ignore.append(self.templates["Pat:Fencing_ok"] % peer) elif peer_state[n] != "complete" and 
re.search(self.templates["Pat:Fencing_start"] % n, shot): # TODO: Correctly detect multiple fencing operations for the same host peer = n peer_state[peer] = "in-progress" self.__instance_errors_to_ignore.append(self.templates["Pat:Fencing_start"] % peer) if not peer: self._logger.log(f"ERROR: Unknown stonith match: {shot!r}") elif peer not in peer_list: self.debug(f"Found peer: {peer}") peer_list.append(peer) # Get the next one shot = stonith.look(60) for peer in peer_list: self.debug(f" Peer {peer} was fenced as a result of {node} starting: {peer_state[peer]}") if self.env["at-boot"]: self.expected_status[peer] = "up" else: self.expected_status[peer] = "down" if peer_state[peer] == "in-progress": # Wait for any in-progress operations to complete shot = stonith.look(60) while stonith.regexes and shot: self.debug(f"Found: {shot!r}") del stonith.regexes[stonith.whichmatch] shot = stonith.look(60) # Now make sure the node is alive too - self.ns.wait_for_node(peer, self.env["DeadTime"]) + self.ns.wait_for_node(peer, self.env["dead_time"]) # Poll until it comes up if self.env["at-boot"]: if not self.stat_cm(peer): - time.sleep(self.env["StartTime"]) + time.sleep(self.env["start_time"]) if not self.stat_cm(peer): self._logger.log(f"ERROR: Peer {peer} failed to restart after being fenced") return None return peer_list + def _install_config(self, node): + """Remove and re-install the CIB on the first node in the cluster.""" + if not self.ns.wait_for_node(node): + self.log(f"Node {node} is not up.") + return + + if node in self._cib_sync or not self.env["overwrite_cib"]: + return + + self._cib_sync[node] = True + self.rsh(node, f"rm -f {BuildOptions.CIB_DIR}/cib*") + + # Only install the CIB on the first node, all the other ones will pick it up from there + if self._cib_installed: + return + + self._cib_installed = True + if self.env["CIBfilename"]: + self.log(f"Installing CIB ({self.env['CIBfilename']}) on node {node}") + + rc = self.rsh.copy(self.env["CIBfilename"], 
"root@" + (self.templates["CIBfile"] % node)) + + if rc != 0: + raise ValueError(f"Can not scp file to {node} {rc}") + + else: + self.log(f"Installing Generated CIB on node {node}") + self._cib.install(node) + + self.rsh(node, f"chown {BuildOptions.DAEMON_USER} {BuildOptions.CIB_DIR}/cib.xml") + def start_cm(self, node, verbose=False): """Start up the cluster manager on a given node.""" - if verbose: - self._logger.log(f"Starting {self.templates['Name']} on node {node}") - else: - self.debug(f"Starting {self.templates['Name']} on node {node}") + log_fn = self._logger.log if verbose else self.debug + log_fn(f"Starting {self.name} on node {node}") if node not in self.expected_status: self.expected_status[node] = "down" if self.expected_status[node] != "down": return True # Technically we should always be able to notice ourselves starting patterns = [ self.templates["Pat:Local_started"] % node, ] if self.upcount() == 0: patterns.append(self.templates["Pat:DC_started"] % node) else: patterns.append(self.templates["Pat:NonDC_started"] % node) watch = LogWatcher(self.env["LogFileName"], patterns, self.env["nodes"], self.env["log_kind"], - "StartaCM", self.env["StartTime"] + 10) + "StartaCM", self.env["start_time"] + 10) - self.install_config(node) + self._install_config(node) self.expected_status[node] = "any" - if self.stat_cm(node) and self.cluster_stable(self.env["DeadTime"]): + if self.stat_cm(node) and self.cluster_stable(self.env["dead_time"]): self._logger.log(f"{node} was already started") return True stonith = self.prepare_fencing_watcher() watch.set_watch() (rc, _) = self.rsh(node, self.templates["StartCmd"]) if rc != 0: self._logger.log(f"Warn: Start command failed on node {node}") self.fencing_cleanup(node, stonith) return False self.expected_status[node] = "up" watch_result = watch.look_for_all() if watch.unmatched: for regex in watch.unmatched: self._logger.log(f"Warn: Startup pattern not found: {regex}") - if watch_result and 
self.cluster_stable(self.env["DeadTime"]): + if watch_result and self.cluster_stable(self.env["dead_time"]): self.fencing_cleanup(node, stonith) return True - if self.stat_cm(node) and self.cluster_stable(self.env["DeadTime"]): + if self.stat_cm(node) and self.cluster_stable(self.env["dead_time"]): self.fencing_cleanup(node, stonith) return True self._logger.log(f"Warn: Start failed for node {node}") return False def start_cm_async(self, node, verbose=False): """Start up the cluster manager on a given node without blocking.""" - if verbose: - self._logger.log(f"Starting {self['Name']} on node {node}") - else: - self.debug(f"Starting {self['Name']} on node {node}") + log_fn = self._logger.log if verbose else self.debug + log_fn(f"Starting {self.name} on node {node}") - self.install_config(node) + self._install_config(node) self.rsh(node, self.templates["StartCmd"], synchronous=False) self.expected_status[node] = "up" def stop_cm(self, node, verbose=False, force=False): """Stop the cluster manager on a given node.""" - if verbose: - self._logger.log(f"Stopping {self['Name']} on node {node}") - else: - self.debug(f"Stopping {self['Name']} on node {node}") + log_fn = self._logger.log if verbose else self.debug + log_fn(f"Stopping {self.name} on node {node}") if self.expected_status[node] != "up" and not force: return True (rc, _) = self.rsh(node, self.templates["StopCmd"]) if rc == 0: # Make sure we can continue even if corosync leaks self.expected_status[node] = "down" - self.cluster_stable(self.env["DeadTime"]) + self.cluster_stable(self.env["dead_time"]) return True - self._logger.log(f"ERROR: Could not stop {self['Name']} on node {node}") + self._logger.log(f"ERROR: Could not stop {self.name} on node {node}") return False def stop_cm_async(self, node): """Stop the cluster manager on a given node without blocking.""" - self.debug(f"Stopping {self['Name']} on node {node}") + self.debug(f"Stopping {self.name} on node {node}") self.rsh(node, self.templates["StopCmd"], 
synchronous=False) self.expected_status[node] = "down" def startall(self, nodelist=None, verbose=False, quick=False): """Start the cluster manager on every node in the cluster, or on every node in nodelist.""" if not nodelist: nodelist = self.env["nodes"] for node in nodelist: if self.expected_status[node] == "down": self.ns.wait_for_all_nodes(nodelist, 300) if not quick: # This is used for "basic sanity checks", so only start one node ... return self.start_cm(nodelist[0], verbose=verbose) # Approximation of SimulStartList for --boot watchpats = [ self.templates["Pat:DC_IDLE"], ] for node in nodelist: watchpats.extend([ self.templates["Pat:InfraUp"] % node, self.templates["Pat:PacemakerUp"] % node, self.templates["Pat:Local_started"] % node, self.templates["Pat:They_up"] % (nodelist[0], node), ]) # Start all the nodes - at about the same time... watch = LogWatcher(self.env["LogFileName"], watchpats, self.env["nodes"], self.env["log_kind"], "fast-start", - self.env["DeadTime"] + 10) + self.env["dead_time"] + 10) watch.set_watch() if not self.start_cm(nodelist[0], verbose=verbose): return False for node in nodelist: self.start_cm_async(node, verbose=verbose) watch.look_for_all() if watch.unmatched: for regex in watch.unmatched: self._logger.log(f"Warn: Startup pattern not found: {regex}") if not self.cluster_stable(): self._logger.log("Cluster did not stabilize") return False return True def stopall(self, nodelist=None, verbose=False, force=False): """Stop the cluster manager on every node in the cluster, or on every node in nodelist.""" ret = True if not nodelist: nodelist = self.env["nodes"] for node in self.env["nodes"]: if self.expected_status[node] == "up" or force: if not self.stop_cm(node, verbose=verbose, force=force): ret = False return ret def statall(self, nodelist=None): """Return the status of the cluster manager on every node in the cluster, or on every node in nodelist.""" result = {} if not nodelist: nodelist = self.env["nodes"] for node in nodelist: 
if self.stat_cm(node): result[node] = "up" else: result[node] = "down" return result def isolate_node(self, target, nodes=None): """Break communication between the target node and all other nodes in the cluster, or nodes.""" if not nodes: nodes = self.env["nodes"] for node in nodes: if node == target: continue (rc, _) = self.rsh(target, self.templates["BreakCommCmd"] % node) if rc != 0: self._logger.log(f"Could not break the communication between {target} and {node}: {rc}") return False self.debug(f"Communication cut between {target} and {node}") return True def unisolate_node(self, target, nodes=None): """Re-establish communication between the target node and all other nodes in the cluster, or nodes.""" if not nodes: nodes = self.env["nodes"] for node in nodes: if node == target: continue # Limit the amount of time we have asynchronous connectivity for # Restore both sides as simultaneously as possible self.rsh(target, self.templates["FixCommCmd"] % node, synchronous=False) self.rsh(node, self.templates["FixCommCmd"] % target, synchronous=False) self.debug(f"Communication restored between {target} and {node}") - def oprofile_start(self, node=None): - """Start profiling on the given node, or all nodes in the cluster.""" - if not node: - for n in self.env["oprofile"]: - self.oprofile_start(n) - - elif node in self.env["oprofile"]: - self.debug(f"Enabling oprofile on {node}") - self.rsh(node, "opcontrol --init") - self.rsh(node, "opcontrol --setup --no-vmlinux --separate=lib --callgraph=20 --image=all") - self.rsh(node, "opcontrol --start") - self.rsh(node, "opcontrol --reset") - - def oprofile_save(self, test, node=None): - """Save profiling data and restart profiling on the given node, or all nodes in the cluster.""" - if not node: - for n in self.env["oprofile"]: - self.oprofile_save(test, n) - - elif node in self.env["oprofile"]: - self.rsh(node, "opcontrol --dump") - self.rsh(node, f"opcontrol --save=cts.{test}") - # Read back with: opreport -l session:cts.0 
image:/c* - self.oprofile_stop(node) - self.oprofile_start(node) - - def oprofile_stop(self, node=None): - """ - Start profiling on the given node, or all nodes in the cluster. - - This does not save profiling data, so call oprofile_save first if needed. - """ - if not node: - for n in self.env["oprofile"]: - self.oprofile_stop(n) - - elif node in self.env["oprofile"]: - self.debug(f"Stopping oprofile on {node}") - self.rsh(node, "opcontrol --reset") - self.rsh(node, "opcontrol --shutdown 2>&1 > /dev/null") - - def install_config(self, node): - """Remove and re-install the CIB on the first node in the cluster.""" - if not self.ns.wait_for_node(node): - self.log(f"Node {node} is not up.") - return - - if node in self._cib_sync or not self.env["ClobberCIB"]: - return - - self._cib_sync[node] = True - self.rsh(node, f"rm -f {BuildOptions.CIB_DIR}/cib*") - - # Only install the CIB on the first node, all the other ones will pick it up from there - if self._cib_installed: - return - - self._cib_installed = True - if self.env["CIBfilename"]: - self.log(f"Installing CIB ({self.env['CIBfilename']}) on node {node}") - - rc = self.rsh.copy(self.env["CIBfilename"], "root@" + (self.templates["CIBfile"] % node)) - - if rc != 0: - raise ValueError(f"Can not scp file to {node} {rc}") - - else: - self.log(f"Installing Generated CIB on node {node}") - self._cib.install(node) - - self.rsh(node, f"chown {BuildOptions.DAEMON_USER} {BuildOptions.CIB_DIR}/cib.xml") - def prepare(self): """ Finish initialization. Clear out the expected status and record the current status of every node in the cluster. """ self.partitions_expected = 1 for node in self.env["nodes"]: self.expected_status[node] = "" - if self.env["experimental-tests"]: - self.unisolate_node(node) + # This used to be conditional on whether SplitBrainTest was + # allowed to run. SplitBrainTest is supposed to unisolate + # the nodes, so this shouldn't be necessary here. 
However, + # SplitBrainTest was flagged as "experimental" from 2009 to + # 2025 and wasn't allowed to run by default. So uncomment + # this if problems emerge. + # + # @COMPAT Delete this comment if no problems emerge after a + # while. + # + # self.unisolate_node(node) self.stat_cm(node) def test_node_cm(self, node): """ Check the status of a given node. Returns 0 if the node is down, 1 if the node is up but unstable, and 2 if the node is up and stable. """ watchpats = [ "Current ping state: (S_IDLE|S_NOT_DC)", self.templates["Pat:NonDC_started"] % node, self.templates["Pat:DC_started"] % node, ] idle_watch = LogWatcher(self.env["LogFileName"], watchpats, [node], self.env["log_kind"], "ClusterIdle") idle_watch.set_watch() (_, out) = self.rsh(node, self.templates["StatusCmd"] % node, verbose=1) if not out: out = "" else: out = out[0].strip() self.debug(f"Node {node} status: '{out}'") if out.find('ok') < 0: if self.expected_status[node] == "up": self.log(f"Node status for {node} is down but we think it should be {self.expected_status[node]}") self.expected_status[node] = "down" return 0 if self.expected_status[node] == "down": self.log(f"Node status for {node} is up but we think it should be {self.expected_status[node]}: {out}") self.expected_status[node] = "up" # check the output first - because syslog-ng loses messages if out.find('S_NOT_DC') != -1: # Up and stable return 2 if out.find('S_IDLE') != -1: # Up and stable return 2 # fall back to syslog-ng and wait if not idle_watch.look(): # just up self.debug(f"Warn: Node {node} is unstable: {out}") return 1 # Up and stable return 2 def stat_cm(self, node): """Report the status of the cluster manager on a given node.""" return self.test_node_cm(node) > 0 # Being up and being stable is not the same question... 
def node_stable(self, node): """Return whether or not the given node is stable.""" if self.test_node_cm(node) == 2: return True self.log(f"Warn: Node {node} not stable") return False - def partition_stable(self, nodes, timeout=None): + def _partition_stable(self, nodes, timeout=None): """Return whether or not all nodes in the given partition are stable.""" watchpats = [ "Current ping state: S_IDLE", self.templates["Pat:DC_IDLE"], ] self.debug("Waiting for cluster stability...") if timeout is None: - timeout = self.env["DeadTime"] + timeout = self.env["dead_time"] if len(nodes) < 3: self.debug("Cluster is inactive") return True idle_watch = LogWatcher(self.env["LogFileName"], watchpats, nodes.split(), self.env["log_kind"], "ClusterStable", timeout) idle_watch.set_watch() for node in nodes.split(): # have each node dump its current state self.rsh(node, self.templates["StatusCmd"] % node, verbose=1) ret = idle_watch.look() while ret: self.debug(ret) for node in nodes.split(): if re.search(node, ret): return True ret = idle_watch.look() self.debug(f"Warn: Partition {nodes!r} not IDLE after {timeout}s") return False def cluster_stable(self, timeout=None, double_check=False): """Return whether or not all nodes in the cluster are stable.""" partitions = self.find_partitions() for partition in partitions: - if not self.partition_stable(partition, timeout): + if not self._partition_stable(partition, timeout): return False if not double_check: return True # Make sure we are really stable and that all resources, # including those that depend on transient node attributes, # are started if they were going to be time.sleep(5) for partition in partitions: - if not self.partition_stable(partition, timeout): + if not self._partition_stable(partition, timeout): return False return True def is_node_dc(self, node, status_line=None): """ Return whether or not the given node is the cluster DC. Check the given status_line, or query the cluster if None. 
""" if not status_line: (_, out) = self.rsh(node, self.templates["StatusCmd"] % node, verbose=1) if out: status_line = out[0].strip() if not status_line: return False if status_line.find('S_IDLE') != -1: return True if status_line.find('S_INTEGRATION') != -1: return True if status_line.find('S_FINALIZE_JOIN') != -1: return True if status_line.find('S_POLICY_ENGINE') != -1: return True if status_line.find('S_TRANSITION_ENGINE') != -1: return True return False def active_resources(self, node): """Return a list of primitive resources active on the given node.""" (_, output) = self.rsh(node, "crm_resource -c", verbose=1) resources = [] for line in output: if not re.search("^Resource", line): continue tmp = AuditResource(self, line) if tmp.type == "primitive" and tmp.host == node: resources.append(tmp.id) return resources def resource_location(self, rid): """Return a list of nodes on which the given resource is running.""" resource_nodes = [] for node in self.env["nodes"]: if self.expected_status[node] != "up": continue cmd = self.templates["RscRunning"] % rid (rc, lines) = self.rsh(node, cmd) if rc == 127: self.log(f"Command '{cmd}' failed. Binary or pacemaker-cts package not installed?") for line in lines: self.log(f"Output: {line} ") elif rc == 0: resource_nodes.append(node) return resource_nodes def find_partitions(self): """ Return a list of all partitions in the cluster. Each element of the list is itself a list of all active nodes in that partition. """ ccm_partitions = [] for node in self.env["nodes"]: if self.expected_status[node] != "up": self.debug(f"Node {node} is down... 
skipping") continue (_, out) = self.rsh(node, self.templates["PartitionCmd"], verbose=1) if not out: self.log(f"no partition details for {node}") continue partition = out[0].strip() if len(partition) <= 2: self.log(f"bad partition details for {node}") continue nodes = partition.split() nodes.sort() partition = ' '.join(nodes) found = 0 for a_partition in ccm_partitions: if partition == a_partition: found = 1 if found == 0: self.debug(f"Adding partition from {node}: {partition}") ccm_partitions.append(partition) else: self.debug(f"Partition '{partition}' from {node} is consistent with existing entries") self.debug(f"Found partitions: {ccm_partitions!r}") return ccm_partitions def has_quorum(self, node_list): """Return whether or not the cluster has quorum.""" # If we are auditing a partition, then one side will # have quorum and the other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... assume all nodes if not node_list: node_list = self.env["nodes"] for node in node_list: if self.expected_status[node] != "up": continue (rc, quorum) = self.rsh(node, self.templates["QuorumCmd"], verbose=1) if rc != ExitStatus.OK: self.debug(f"WARN: Quorum check on {node} returned error ({rc})") continue quorum = quorum[0].strip() if quorum.find("1") != -1: return True if quorum.find("0") != -1: return False self.debug(f"WARN: Unexpected quorum test result from {node}:{quorum}") return False @property def components(self): """ Return a list of all patterns that should be ignored for the cluster's components. This must be provided by all subclasses. 
""" raise NotImplementedError def in_standby_mode(self, node): """Return whether or not the node is in Standby.""" (_, out) = self.rsh(node, self.templates["StandbyQueryCmd"] % node, verbose=1) if not out: return False out = out[0].strip() self.debug(f"Standby result: {out}") return out == "on" def set_standby_mode(self, node, status): """ Set node to Standby if status is True, or Active if status is False. Return whether the node is now in the requested status. """ current_status = self.in_standby_mode(node) if current_status == status: return True if status: cmd = self.templates["StandbyCmd"] % (node, "on") else: cmd = self.templates["StandbyCmd"] % (node, "off") (rc, _) = self.rsh(node, cmd) return rc == 0 def add_dummy_rsc(self, node, rid): """Add a dummy resource with the given ID to the given node.""" rsc_xml = f""" ' '""" constraint_xml = f""" ' '""" self.rsh(node, self.templates['CibAddXml'] % rsc_xml) self.rsh(node, self.templates['CibAddXml'] % constraint_xml) def remove_dummy_rsc(self, node, rid): """Remove the previously added dummy resource given by rid on the given node.""" constraint = f"\"//rsc_location[@rsc='{rid}']\"" rsc = f"\"//primitive[@id='{rid}']\"" self.rsh(node, self.templates['CibDelXpath'] % constraint) self.rsh(node, self.templates['CibDelXpath'] % rsc) diff --git a/python/pacemaker/_cts/cmcorosync.py b/python/pacemaker/_cts/cmcorosync.py index b753c36e9a..3206ce6952 100644 --- a/python/pacemaker/_cts/cmcorosync.py +++ b/python/pacemaker/_cts/cmcorosync.py @@ -1,75 +1,29 @@ """Corosync-specific class for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["Corosync2"] __copyright__ = "Copyright 2007-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.CTS import Process from pacemaker._cts.clustermanager import ClusterManager -from pacemaker._cts.patterns import PatternSelector - -# Throughout this file, pylint has trouble understanding 
that EnvFactory -# is a singleton instance that can be treated as a subscriptable object. -# Various warnings are disabled because of this. See also a comment about -# self._rsh in environment.py. -# pylint: disable=unsubscriptable-object class Corosync2(ClusterManager): """A subclass of ClusterManager specialized to handle corosync2 and later based clusters.""" - def __init__(self): - """Create a new Corosync2 instance.""" - ClusterManager.__init__(self) - - self._fullcomplist = {} - self.templates = PatternSelector(self.name) - @property def components(self): """Return a list of patterns that should be ignored for the cluster's components.""" - complist = [] - - if not self._fullcomplist: - common_ignore = self.templates.get_component("common-ignore") - - daemons = [ - "pacemaker-based", - "pacemaker-controld", - "pacemaker-attrd", - "pacemaker-execd", - "pacemaker-fenced" - ] - for c in daemons: - badnews = self.templates.get_component(f"{c}-ignore") + common_ignore - proc = Process(self, c, pats=self.templates.get_component(c), - badnews_ignore=badnews) - self._fullcomplist[c] = proc - - # the scheduler uses dc_pats instead of pats - badnews = self.templates.get_component("pacemaker-schedulerd-ignore") + common_ignore - proc = Process(self, "pacemaker-schedulerd", - dc_pats=self.templates.get_component("pacemaker-schedulerd"), - badnews_ignore=badnews) - self._fullcomplist["pacemaker-schedulerd"] = proc - - # add (or replace) extra components - badnews = self.templates.get_component("corosync-ignore") + common_ignore - proc = Process(self, "corosync", pats=self.templates.get_component("corosync"), - badnews_ignore=badnews) - self._fullcomplist["corosync"] = proc - - # Processes running under valgrind can't be shot with "killall -9 processname", - # so don't include them in the returned list - vgrind = self.env["valgrind-procs"].split() - for (key, val) in self._fullcomplist.items(): - if key in vgrind: - self.log(f"Filtering {key} from the component list as it 
is being profiled by valgrind") - continue - - if key == "pacemaker-fenced" and not self.env["DoFencing"]: - continue - - complist.append(val) - - return complist + comps = [ + "corosync", + "pacemaker-attrd", + "pacemaker-based", + "pacemaker-controld", + "pacemaker-execd", + "pacemaker-fenced" + ] + return [ + Process(self, c, pats=self.templates.get_component(c), + badnews_ignore=self.templates.get_component(f"{c}-ignore")) + for c in comps + ] diff --git a/python/pacemaker/_cts/corosync.py b/python/pacemaker/_cts/corosync.py index beb574d2b8..d0e680319f 100644 --- a/python/pacemaker/_cts/corosync.py +++ b/python/pacemaker/_cts/corosync.py @@ -1,186 +1,188 @@ """A module providing functions for manipulating corosync.""" __all__ = ["Corosync", "localname"] -__copyright__ = "Copyright 2009-2024 the Pacemaker project contributors" +__copyright__ = "Copyright 2009-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+)" import os import shutil import subprocess import tempfile import time from pacemaker.buildoptions import BuildOptions from pacemaker._cts.environment import EnvFactory from pacemaker._cts.process import killall, stdout_from_command AUTOGEN_COROSYNC_TEMPLATE = """ totem { version: 2 cluster_name: %s crypto_cipher: none crypto_hash: none transport: udp } nodelist { node { nodeid: 1 name: %s ring0_addr: 127.0.0.1 } } logging { debug: off to_syslog: no to_stderr: no to_logfile: yes logfile: %s } """ def corosync_cfg_exists(): """Return whether the corosync config file exists.""" return os.path.exists(BuildOptions.COROSYNC_CONFIG_FILE) def corosync_log_file(cfgfile): """Return the path to the corosync log file, or None.""" with open(cfgfile, "r", encoding="utf-8") as f: for line in f.readlines(): # "to_logfile:" could also be in the config file, so check for a # slash to make sure it's a path we're looking at. 
if "logfile: /" in line: return line.split()[-1] return None def generate_corosync_cfg(logdir, cluster_name, node_name): """ Generate a corosync config file. If there's a corosync config file already installed on the system, move it to a temporary location and return that temporary name. Otherwise, return None. """ retval = None if corosync_cfg_exists(): # pylint: disable=consider-using-with config_dir = os.path.dirname(BuildOptions.COROSYNC_CONFIG_FILE) f = tempfile.NamedTemporaryFile(dir=config_dir, prefix="corosync.conf-") f.close() shutil.move(BuildOptions.COROSYNC_CONFIG_FILE, f.name) retval = f.name logfile = os.path.join(logdir, "corosync.log") with open(BuildOptions.COROSYNC_CONFIG_FILE, "w", encoding="utf-8") as corosync_cfg: corosync_cfg.write(AUTOGEN_COROSYNC_TEMPLATE % (cluster_name, node_name, logfile)) return retval def localname(): """Return the uname of the local host.""" our_uname = stdout_from_command(["uname", "-n"]) if our_uname: our_uname = our_uname[0] else: our_uname = "localhost" return our_uname class Corosync: """A class for managing corosync processes and config files.""" def __init__(self, verbose, logdir, cluster_name): """ Create a new Corosync instance. 
Arguments: verbose -- Whether to print the corosync log file logdir -- The base directory under which to store log files cluster_name -- The name of the cluster """ self.verbose = verbose self.logdir = logdir self.cluster_name = cluster_name - # The Corosync class doesn't use self._env._nodes, but the + # The Corosync class doesn't use self._env["nodes"], but the # "--nodes" argument is required to be present and nonempty self._env = EnvFactory().getInstance(args=["--nodes", "localhost"]) self._existing_cfg_file = None def _ready(self, logfile, timeout=10): """Return whether corosync is ready.""" i = 0 while i < timeout: with open(logfile, "r", encoding="utf-8") as corosync_log: for line in corosync_log.readlines(): if line.endswith("ready to provide service.\n"): # Even once the line is in the log file, we may still need to wait just # a little bit longer before corosync is really ready to go. time.sleep(1) return time.sleep(1) i += 1 raise TimeoutError def start(self, kill_first=False, timeout=10): """ Start the corosync process. Arguments: kill_first -- Whether to kill any pre-existing corosync processes before starting a new one timeout -- If corosync does not start within this many seconds, raise TimeoutError """ if kill_first: killall(["corosync"]) self._existing_cfg_file = generate_corosync_cfg(self.logdir, self.cluster_name, localname()) logfile = corosync_log_file(BuildOptions.COROSYNC_CONFIG_FILE) + # pylint doesn't understand that self._env is subscriptable. 
+ # pylint: disable=unsubscriptable-object if self._env["have_systemd"]: cmd = ["systemctl", "start", "corosync.service"] else: cmd = ["corosync"] if self.verbose: print("Starting corosync") with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as test: test.wait() # Wait for corosync to be ready before returning self._ready(logfile, timeout=timeout) def stop(self): """Stop the corosync process.""" killall(["corosync"]) if self.verbose: print("Corosync output") logfile = corosync_log_file(BuildOptions.COROSYNC_CONFIG_FILE) with open(logfile, "r", encoding="utf-8") as corosync_log: for line in corosync_log.readlines(): print(line.strip()) os.remove(BuildOptions.COROSYNC_CONFIG_FILE) # If there was a previous corosync config file, move it back into place if self._existing_cfg_file: shutil.move(self._existing_cfg_file, BuildOptions.COROSYNC_CONFIG_FILE) diff --git a/python/pacemaker/_cts/environment.py b/python/pacemaker/_cts/environment.py index d87fe93ecb..da939803a6 100644 --- a/python/pacemaker/_cts/environment.py +++ b/python/pacemaker/_cts/environment.py @@ -1,611 +1,441 @@ """Test environment classes for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["EnvFactory", "set_cts_path"] __copyright__ = "Copyright 2014-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import argparse from contextlib import suppress from glob import glob import os import random import shlex import socket import sys -import time from pacemaker.buildoptions import BuildOptions from pacemaker._cts.logging import LogFactory from pacemaker._cts.remote import RemoteFactory from pacemaker._cts.watcher import LogKind class Environment: """ A class for managing the CTS environment. This consists largely of processing and storing command line parameters. 
""" # pylint doesn't understand that self._rsh is callable (it stores the # singleton instance of RemoteExec, as returned by the getInstance method # of RemoteFactory). # @TODO See if type annotations fix this. # I think we could also fix this by getting rid of the getInstance methods, # but that's a project for another day. For now, just disable the warning. # pylint: disable=not-callable def __init__(self, args): """ Create a new Environment instance. This class can be treated kind of like a dictionary due to the presence of typical dict functions like __contains__, __getitem__, and __setitem__. However, it is not a dictionary so do not rely on standard dictionary behavior. Arguments: args -- A list of command line parameters, minus the program name. If None, sys.argv will be used. """ self.data = {} - self._nodes = [] # Set some defaults before processing command line arguments. These are # either not set by any command line parameter, or they need a default # that can't be set in add_argument. - self["DeadTime"] = 300 - self["StartTime"] = 300 - self["StableTime"] = 30 - self["tests"] = [] - self["IPagent"] = "IPaddr2" - self["DoFencing"] = True - self["CIBResource"] = False + self["dead_time"] = 300 self["log_kind"] = None self["scenario"] = "random" + self["stable_time"] = 30 + self["start_time"] = 300 + self["syslog_facility"] = "daemon" + self["tests"] = [] + + # Hard-coded since there is only one supported cluster manager/stack + self["Name"] = "crm-corosync" + self["Stack"] = "corosync 2+" self.random_gen = random.Random() self._logger = LogFactory() self._rsh = RemoteFactory().getInstance() - self._target = "localhost" - self._seed_random() self._parse_args(args) if not self["ListTests"]: self._validate() self._discover() - def _seed_random(self, seed=None): - """ - Initialize the random number generator. - - Arguments: - seed -- Use this to see the random number generator, or use the - current time if None. 
- """ - if not seed: - seed = int(time.time()) - - self["RandSeed"] = seed - self.random_gen.seed(str(seed)) - def dump(self): """Print the current environment.""" for key in sorted(self.data.keys()): self._logger.debug(f"{f'Environment[{key}]':35}: {str(self[key])}") def __contains__(self, key): """Return True if the given key exists in the environment.""" - if key == "nodes": - return True - return key in self.data def __getitem__(self, key): """Return the given environment key, or None if it does not exist.""" - if key == "nodes": - return self._nodes - - if key == "Name": - return self._get_stack_short() - return self.data.get(key) def __setitem__(self, key, value): """Set the given environment key to the given value, overriding any previous value.""" - if key == "Stack": - self._set_stack(value) - - elif key == "nodes": - self._nodes = [] + if key == "nodes": + self.data["nodes"] = [] for node in value: + node = node.strip() + # I don't think I need the IP address, etc. but this validates # the node name against /etc/hosts and/or DNS, so it's a # GoodThing(tm). try: - n = node.strip() # @TODO This only handles IPv4, use getaddrinfo() instead # (here and in _discover()) - socket.gethostbyname_ex(n) - self._nodes.append(n) + socket.gethostbyname_ex(node) + self.data["nodes"].append(node) except socket.herror: self._logger.log(f"{node} not found in DNS... 
aborting") raise else: self.data[key] = value def random_node(self): """Choose a random node from the cluster.""" return self.random_gen.choice(self["nodes"]) - def _set_stack(self, name): - """Normalize the given cluster stack name.""" - if name in ["corosync", "cs", "mcp"]: - self.data["Stack"] = "corosync 2+" - - else: - raise ValueError(f"Unknown stack: {name}") - - def _get_stack_short(self): - """Return the short name for the currently set cluster stack.""" - if "Stack" not in self.data: - return "unknown" - - if self.data["Stack"] == "corosync 2+": - return "crm-corosync" - - LogFactory().log(f"Unknown stack: {self['stack']}") - raise ValueError(f"Unknown stack: {self['stack']}") - - def _detect_systemd(self): + def _detect_systemd(self, node): """Detect whether systemd is in use on the target node.""" if "have_systemd" not in self.data: - (rc, _) = self._rsh(self._target, "systemctl list-units", verbose=0) + (rc, _) = self._rsh(node, "systemctl list-units", verbose=0) self["have_systemd"] = rc == 0 - def _detect_syslog(self): + def _detect_syslog(self, node): """Detect the syslog variant in use on the target node (if any).""" if "syslogd" in self.data: return if self["have_systemd"]: # Systemd - (_, lines) = self._rsh(self._target, r"systemctl list-units | grep syslog.*\.service.*active.*running | sed 's:.service.*::'", verbose=1) + (_, lines) = self._rsh(node, r"systemctl list-units | grep syslog.*\.service.*active.*running | sed 's:.service.*::'", verbose=1) else: # SYS-V - (_, lines) = self._rsh(self._target, "chkconfig --list | grep syslog.*on | awk '{print $1}' | head -n 1", verbose=1) + (_, lines) = self._rsh(node, "chkconfig --list | grep syslog.*on | awk '{print $1}' | head -n 1", verbose=1) with suppress(IndexError): self["syslogd"] = lines[0].strip() def disable_service(self, node, service): """Disable the given service on the given node.""" if self["have_systemd"]: # Systemd (rc, _) = self._rsh(node, f"systemctl disable {service}") return rc # 
SYS-V (rc, _) = self._rsh(node, f"chkconfig {service} off") return rc def enable_service(self, node, service): """Enable the given service on the given node.""" if self["have_systemd"]: # Systemd (rc, _) = self._rsh(node, f"systemctl enable {service}") return rc # SYS-V (rc, _) = self._rsh(node, f"chkconfig {service} on") return rc def service_is_enabled(self, node, service): """Return True if the given service is enabled on the given node.""" if self["have_systemd"]: # Systemd # With "systemctl is-enabled", we should check if the service is # explicitly "enabled" instead of the return code. For example it returns # 0 if the service is "static" or "indirect", but they don't really count # as "enabled". (rc, _) = self._rsh(node, f"systemctl is-enabled {service} | grep enabled") return rc == 0 # SYS-V (rc, _) = self._rsh(node, f"chkconfig --list | grep -e {service}.*on") return rc == 0 - def _detect_at_boot(self): + def _detect_at_boot(self, node): """Detect if the cluster starts at boot.""" - if "at-boot" not in self.data: - self["at-boot"] = self.service_is_enabled(self._target, "corosync") \ - or self.service_is_enabled(self._target, "pacemaker") + self["at-boot"] = any(self.service_is_enabled(node, service) + for service in ("pacemaker", "corosync")) - def _detect_ip_offset(self): + def _detect_ip_offset(self, node): """Detect the offset for IPaddr resources.""" - if self["CIBResource"] and "IPBase" not in self.data: - (_, lines) = self._rsh(self._target, "ip addr | grep inet | grep -v -e link -e inet6 -e '/32' -e ' lo' | awk '{print $2}'", verbose=0) + if self["create_resources"] and "IPBase" not in self.data: + (_, lines) = self._rsh(node, "ip addr | grep inet | grep -v -e link -e inet6 -e '/32' -e ' lo' | awk '{print $2}'", verbose=0) network = lines[0].strip() - (_, lines) = self._rsh(self._target, "nmap -sn -n %s | grep 'scan report' | awk '{print $NF}' | sed 's:(::' | sed 's:)::' | sort -V | tail -n 1" % network, verbose=0) + (_, lines) = self._rsh(node, 
"nmap -sn -n %s | grep 'scan report' | awk '{print $NF}' | sed 's:(::' | sed 's:)::' | sort -V | tail -n 1" % network, verbose=0) try: self["IPBase"] = lines[0].strip() except (IndexError, TypeError): self["IPBase"] = None if not self["IPBase"]: self["IPBase"] = " fe80::1234:56:7890:1000" self._logger.log("Could not determine an offset for IPaddr resources. Perhaps nmap is not installed on the nodes.") self._logger.log(f"""Defaulting to '{self["IPBase"]}', use --test-ip-base to override""") return - # pylint thinks self["IPBase"] is a list, not a string, which causes it - # to error out because a list doesn't have split(). - # pylint: disable=no-member last_part = self["IPBase"].split('.')[3] - if int(last_part) >= 240: self._logger.log(f"Could not determine an offset for IPaddr resources. Upper bound is too high: {self['IPBase']} {last_part}") self["IPBase"] = " fe80::1234:56:7890:1000" self._logger.log(f"""Defaulting to '{self["IPBase"]}', use --test-ip-base to override""") def _validate(self): """Check that we were given all required command line parameters.""" if not self["nodes"]: raise ValueError("No nodes specified!") def _discover(self): """Probe cluster nodes to figure out how to log and manage services.""" - self._target = random.Random().choice(self["nodes"]) - exerciser = socket.gethostname() # Use the IP where possible to avoid name lookup failures for ip in socket.gethostbyname_ex(exerciser)[2]: if ip != "127.0.0.1": exerciser = ip break self["cts-exerciser"] = exerciser - self._detect_systemd() - self._detect_syslog() - self._detect_at_boot() - self._detect_ip_offset() + node = self["nodes"][0] + self._detect_systemd(node) + self._detect_syslog(node) + self._detect_at_boot(node) + self._detect_ip_offset(node) def _parse_args(self, argv): """ Parse and validate command line parameters. Set the appropriate values in the environment dictionary. If argv is None, use sys.argv instead. 
""" if not argv: argv = sys.argv[1:] - parser = argparse.ArgumentParser(epilog=f"{sys.argv[0]} -g virt1 -r --stonith ssh --schema pacemaker-2.0 500") + parser = argparse.ArgumentParser() grp1 = parser.add_argument_group("Common options") - grp1.add_argument("-g", "--dsh-group", "--group", - metavar="GROUP", dest="group", - help="Use the nodes listed in the named DSH group (~/.dsh/groups/$name)") grp1.add_argument("--benchmark", action="store_true", help="Add timing information") grp1.add_argument("--list", "--list-tests", action="store_true", dest="list_tests", help="List the valid tests") grp1.add_argument("--nodes", default="", metavar="NODES", help="List of cluster nodes separated by whitespace") - grp1.add_argument("--stack", - default="corosync", - metavar="STACK", - help="Which cluster stack is installed") grp2 = parser.add_argument_group("Options that CTS will usually auto-detect correctly") grp2.add_argument("-L", "--logfile", metavar="PATH", help="Where to look for logs from cluster nodes (or 'journal' for systemd journal)") - grp2.add_argument("--at-boot", "--cluster-starts-at-boot", - choices=["1", "0", "yes", "no"], - help="Does the cluster software start at boot time?") - grp2.add_argument("--facility", "--syslog-facility", - default="daemon", - metavar="NAME", - help="Which syslog facility to log to") grp2.add_argument("--ip", "--test-ip-base", metavar="IP", help="Offset for generated IP address resources") grp3 = parser.add_argument_group("Options for release testing") grp3.add_argument("-r", "--populate-resources", action="store_true", help="Generate a sample configuration") grp3.add_argument("--choose", metavar="NAME", help="Run only the named tests, separated by whitespace") - grp3.add_argument("--fencing", "--stonith", - choices=["1", "0", "yes", "no", "lha", "openstack", "rhcs", "rhevm", "scsi", "ssh", "virt", "xvm"], - default="1", - help="What fencing agent to use") + grp3.add_argument("--disable-fencing", + action="store_false", + 
dest="fencing_enabled", + help="Whether to disable fencing") + grp3.add_argument("--fencing-agent", + metavar="AGENT", + default="external/ssh", + help="Agent to use for a fencing resource") + grp3.add_argument("--fencing-params", + metavar="PARAMS", + help="Parameters for the fencing resource (as NAME=VALUE), separated by whitespace") grp3.add_argument("--once", action="store_true", help="Run all valid tests once") grp4 = parser.add_argument_group("Additional (less common) options") grp4.add_argument("-c", "--clobber-cib", action="store_true", help="Erase any existing configuration") grp4.add_argument("-y", "--yes", action="store_true", dest="always_continue", help="Continue to run whenever prompted") grp4.add_argument("--boot", action="store_true", help="") grp4.add_argument("--cib-filename", metavar="PATH", help="Install the given CIB file to the cluster") - grp4.add_argument("--experimental-tests", - action="store_true", - help="Include experimental tests") - grp4.add_argument("--loop-minutes", - type=int, default=60, - help="") - grp4.add_argument("--no-loop-tests", - action="store_true", - help="Don't run looping/time-based tests") grp4.add_argument("--no-unsafe-tests", - action="store_true", + action="store_false", + dest="unsafe_tests", help="Don't run tests that are unsafe for use with ocfs2/drbd") grp4.add_argument("--notification-agent", metavar="PATH", default="/var/lib/pacemaker/notify.sh", help="Script to configure for Pacemaker alerts") grp4.add_argument("--notification-recipient", metavar="R", default="/var/lib/pacemaker/notify.log", help="Recipient to pass to alert script") - grp4.add_argument("--oprofile", - default="", - metavar="NODES", - help="List of cluster nodes to run oprofile on") grp4.add_argument("--outputfile", metavar="PATH", help="Location to write logs to") - grp4.add_argument("--qarsh", - action="store_true", - help="Use QARSH to access nodes instead of SSH") grp4.add_argument("--schema", metavar="SCHEMA", 
default=f"pacemaker-{BuildOptions.CIB_SCHEMA_VERSION}", help="Create a CIB conforming to the given schema") grp4.add_argument("--seed", metavar="SEED", help="Use the given string as the random number seed") - grp4.add_argument("--set", - action="append", - metavar="ARG", - default=[], - help="Set key=value pairs (can be specified multiple times)") - grp4.add_argument("--stonith-args", - metavar="ARGS", - default="hostlist=all,livedangerously=yes", - help="") - grp4.add_argument("--stonith-type", - metavar="TYPE", - default="external/ssh", - help="") grp4.add_argument("--trunc", action="store_true", dest="truncate", help="Truncate log file before starting") - grp4.add_argument("--valgrind-procs", - metavar="PROCS", - default="pacemaker-attrd pacemaker-based pacemaker-controld pacemaker-execd pacemaker-fenced pacemaker-schedulerd", - help="Run valgrind against the given space-separated list of processes") - grp4.add_argument("--warn-inactive", - action="store_true", - help="Warn if a resource is assigned to an inactive node") parser.add_argument("iterations", nargs='?', type=int, default=1, help="Number of tests to run") args = parser.parse_args(args=argv) # Set values on this object based on what happened with command line # processing. This has to be done in several blocks. # These values can always be set. Most get a default from the add_argument # calls, they only do one thing, and they do not have any side effects. 
self["CIBfilename"] = args.cib_filename if args.cib_filename else None - self["ClobberCIB"] = args.clobber_cib + self["create_resources"] = bool(args.ip or args.populate_resources) + self["fencing_agent"] = args.fencing_agent + self["fencing_enabled"] = args.fencing_enabled + self["fencing_params"] = shlex.split(args.fencing_params) self["ListTests"] = args.list_tests + self["overwrite_cib"] = any([args.clobber_cib, args.ip, args.populate_resources]) self["Schema"] = args.schema - self["Stack"] = args.stack - self["SyslogFacility"] = args.facility self["TruncateLog"] = args.truncate - self["at-boot"] = args.at_boot in ["1", "yes"] self["benchmark"] = args.benchmark self["continue"] = args.always_continue - self["experimental-tests"] = args.experimental_tests self["iterations"] = args.iterations - self["loop-minutes"] = args.loop_minutes - self["loop-tests"] = not args.no_loop_tests self["nodes"] = shlex.split(args.nodes) self["notification-agent"] = args.notification_agent self["notification-recipient"] = args.notification_recipient - self["oprofile"] = shlex.split(args.oprofile) - self["stonith-params"] = args.stonith_args - self["stonith-type"] = args.stonith_type - self["unsafe-tests"] = not args.no_unsafe_tests - self["valgrind-procs"] = args.valgrind_procs - self["warn-inactive"] = args.warn_inactive - - # Nodes and groups are mutually exclusive. Additionally, --group does - # more than just set a value. Here, set nodes first and then if a group - # is specified, override the previous nodes value. 
- if args.group: - self["OutputFile"] = f"{os.environ['HOME']}/cluster-{args.dsh_group}.log" - LogFactory().add_file(self["OutputFile"], "CTS") - - dsh_file = f"{os.environ['HOME']}/.dsh/group/{args.dsh_group}" - - if os.path.isfile(dsh_file): - self["nodes"] = [] - - with open(dsh_file, "r", encoding="utf-8") as f: - for line in f: - stripped = line.strip() - - if not stripped.startswith('#'): - self["nodes"].append(stripped) - else: - print(f"Unknown DSH group: {args.dsh_group}") + self["unsafe-tests"] = args.unsafe_tests # Everything else either can't have a default set in an add_argument # call (likely because we don't want to always have a value set for it) # or it does something fancier than just set a single value. However, # order does not matter for these as long as the user doesn't provide # conflicting arguments on the command line. So just do Everything # alphabetically. if args.boot: self["scenario"] = "boot" if args.choose: self["scenario"] = "sequence" self["tests"].extend(shlex.split(args.choose)) self["iterations"] = len(self["tests"]) - if args.fencing in ["0", "no"]: - self["DoFencing"] = False - - elif args.fencing in ["rhcs", "virt", "xvm"]: - self["stonith-type"] = "fence_xvm" - - elif args.fencing == "scsi": - self["stonith-type"] = "fence_scsi" - - elif args.fencing in ["lha", "ssh"]: - self["stonith-params"] = "hostlist=all,livedangerously=yes" - self["stonith-type"] = "external/ssh" - - elif args.fencing == "openstack": - self["stonith-type"] = "fence_openstack" - - print("Obtaining OpenStack credentials from the current environment") - region = os.environ['OS_REGION_NAME'] - tenant = os.environ['OS_TENANT_NAME'] - auth = os.environ['OS_AUTH_URL'] - user = os.environ['OS_USERNAME'] - password = os.environ['OS_PASSWORD'] - - self["stonith-params"] = f"region={region},tenant={tenant},auth={auth},user={user},password={password}" - - elif args.fencing == "rhevm": - self["stonith-type"] = "fence_rhevm" - - print("Obtaining RHEV-M credentials 
from the current environment") - user = os.environ['RHEVM_USERNAME'] - password = os.environ['RHEVM_PASSWORD'] - server = os.environ['RHEVM_SERVER'] - port = os.environ['RHEVM_PORT'] - - self["stonith-params"] = f"login={user},passwd={password},ipaddr={server},ipport={port},ssl=1,shell_timeout=10" - if args.ip: - self["CIBResource"] = True - self["ClobberCIB"] = True self["IPBase"] = args.ip if args.logfile == "journal": self["LogAuditDisabled"] = True self["log_kind"] = LogKind.JOURNAL elif args.logfile: self["LogAuditDisabled"] = True self["LogFileName"] = args.logfile self["log_kind"] = LogKind.REMOTE_FILE else: # We can't set this as the default on the parser.add_argument call # for this option because then args.logfile will be set, which means # the above branch will be taken and those other values will also be # set. self["LogFileName"] = "/var/log/messages" if args.once: self["scenario"] = "all-once" if args.outputfile: self["OutputFile"] = args.outputfile LogFactory().add_file(self["OutputFile"]) - if args.populate_resources: - self["CIBResource"] = True - self["ClobberCIB"] = True - - if args.qarsh: - self._rsh.enable_qarsh() - - for kv in args.set: - (name, value) = kv.split("=") - self[name] = value - print(f"Setting {name} = {value}") + self.random_gen.seed(args.seed) class EnvFactory: """A class for constructing a singleton instance of an Environment object.""" instance = None # pylint: disable=invalid-name def getInstance(self, args=None): """ Return the previously created instance of Environment. If no instance exists, create a new instance and return that. 
""" if not EnvFactory.instance: EnvFactory.instance = Environment(args) return EnvFactory.instance def set_cts_path(extra=None): """Set the PATH environment variable appropriately for the tests.""" new_path = os.environ['PATH'] # Add any search paths given on the command line if extra is not None: for p in extra: new_path = f"{p}:{new_path}" cwd = os.getcwd() if os.path.exists(f"{cwd}/cts/cts-attrd.in"): # pylint: disable=protected-access print(f"Running tests from the source tree: {BuildOptions._BUILD_DIR}") for d in glob(f"{BuildOptions._BUILD_DIR}/daemons/*/"): new_path = f"{d}:{new_path}" new_path = f"{BuildOptions._BUILD_DIR}/tools:{new_path}" new_path = f"{BuildOptions._BUILD_DIR}/cts/support:{new_path}" print(f"Using local schemas from: {cwd}/xml") os.environ["PCMK_schema_directory"] = f"{cwd}/xml" else: print(f"Running tests from the install tree: {BuildOptions.DAEMON_DIR} (not {cwd})") new_path = f"{BuildOptions.DAEMON_DIR}:{new_path}" os.environ["PCMK_schema_directory"] = BuildOptions.SCHEMA_DIR print(f'Using PATH="{new_path}"') os.environ['PATH'] = new_path diff --git a/python/pacemaker/_cts/logging.py b/python/pacemaker/_cts/logging.py index 4548ce9a0b..29f5432248 100644 --- a/python/pacemaker/_cts/logging.py +++ b/python/pacemaker/_cts/logging.py @@ -1,118 +1,113 @@ """Logging classes for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["LogFactory"] __copyright__ = "Copyright 2014-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import os import sys import time class Logger: """Abstract class to use as parent for CTS logging classes.""" TimeFormat = "%b %d %H:%M:%S\t" - def __init__(self, filename=None, tag=None): + def __init__(self, filename=None): # Whether this logger should print debug messages self._debug_target = True self._logfile = filename - if tag: - self._source = f"{tag}: " - else: - self._source = "" - def __call__(self, lines): """Log specified 
messages.""" raise ValueError("Abstract class member (__call__)") def write(self, line): """Log a single line excluding trailing whitespace.""" return self(line.rstrip()) def writelines(self, lines): """Log a series of lines excluding trailing whitespace.""" for line in lines: self.write(line) @property def is_debug_target(self): """Return True if this logger should receive debug messages.""" return self._debug_target class StdErrLog(Logger): """Class to log to standard error.""" - def __init__(self, filename, tag): - Logger.__init__(self, filename, tag) + def __init__(self, filename): + Logger.__init__(self, filename) self._debug_target = False def __call__(self, lines): """Log specified lines to stderr.""" timestamp = time.strftime(Logger.TimeFormat, time.localtime(time.time())) if isinstance(lines, str): lines = [lines] for line in lines: print(f"{timestamp}{line}", file=sys.__stderr__) sys.__stderr__.flush() class FileLog(Logger): """Class to log to a file.""" - def __init__(self, filename, tag): - Logger.__init__(self, filename, tag) + def __init__(self, filename): + Logger.__init__(self, filename) self._hostname = os.uname()[1] def __call__(self, lines): """Log specified lines to the file.""" with open(self._logfile, "at", encoding="utf-8") as logf: timestamp = time.strftime(Logger.TimeFormat, time.localtime(time.time())) if isinstance(lines, str): lines = [lines] for line in lines: - print(f"{timestamp}{self._hostname} {self._source}{line}", file=logf) + print(f"{timestamp}{self._hostname} {line}", file=logf) class LogFactory: """Singleton to log messages to various destinations.""" log_methods = [] have_stderr = False - def add_file(self, filename, tag=None): + def add_file(self, filename): """When logging messages, log them to specified file.""" if filename: - LogFactory.log_methods.append(FileLog(filename, tag)) + LogFactory.log_methods.append(FileLog(filename)) def add_stderr(self): """When logging messages, log them to standard error.""" if not 
LogFactory.have_stderr: LogFactory.have_stderr = True - LogFactory.log_methods.append(StdErrLog(None, None)) + LogFactory.log_methods.append(StdErrLog(None)) def log(self, args): """Log a message (to all configured log destinations).""" for logfn in LogFactory.log_methods: logfn(args.strip()) def debug(self, args): """Log a debug message (to all configured log destinations).""" for logfn in LogFactory.log_methods: if logfn.is_debug_target: logfn(f"debug: {args.strip()}") def traceback(self, traceback): """Log a stack trace (to all configured log destinations).""" for logfn in LogFactory.log_methods: traceback.print_exc(50, logfn) diff --git a/python/pacemaker/_cts/patterns.py b/python/pacemaker/_cts/patterns.py index adfc61e3e7..8bbf4e24f1 100644 --- a/python/pacemaker/_cts/patterns.py +++ b/python/pacemaker/_cts/patterns.py @@ -1,403 +1,386 @@ """Pattern-holding classes for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["PatternSelector"] __copyright__ = "Copyright 2008-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+)" -import argparse - from pacemaker.buildoptions import BuildOptions class BasePatterns: """ The base class for holding a stack-specific set of command and log file/stdout patterns. Stack-specific classes need to be built on top of this one. """ def __init__(self): """Create a new BasePatterns instance which holds a very minimal set of basic patterns.""" self._bad_news = [] self._components = {} self._name = "crm-base" self._ignore = [ "avoid confusing Valgrind", # Logging bug in some versions of libvirtd r"libvirtd.*: internal error: Failed to parse PCI config address", # pcs can log this when node is fenced, but fencing is OK in some # tests (and we will catch it in pacemaker logs when not OK) r"pcs.daemon:No response from: .* request: get_configs, error:", # This is overbroad, but there's no way to say that only certain # transition errors are acceptable. 
We have to rely on causes of a # transition error logging their own error message, which should # always be the case. r"pacemaker-schedulerd.* Calculated transition .*/pe-error", # This message comes up periodically but doesn't actually seem to # be related to any specific test failure, so just ignore it. r"pacemaker-based.* Local CIB .* differs from", ] self._commands = { "StatusCmd": "crmadmin -t 60 -S %s 2>/dev/null", "CibQuery": "cibadmin -Q", "CibAddXml": "cibadmin --modify -c --xml-text %s", "CibDelXpath": "cibadmin --delete --xpath %s", "RscRunning": BuildOptions.DAEMON_DIR + "/cts-exec-helper -R -r %s", "CIBfile": "%s:" + BuildOptions.CIB_DIR + "/cib.xml", "TmpDir": "/tmp", "BreakCommCmd": "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1", "FixCommCmd": "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1", "MaintenanceModeOn": "cibadmin --modify -c --xml-text ''", "MaintenanceModeOff": "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"", "StandbyCmd": "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null", "StandbyQueryCmd": "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null", } self._search = { "Pat:DC_IDLE": r"pacemaker-controld.*State transition.*-> S_IDLE", # This won't work if we have multiple partitions "Pat:Local_started": r"%s\W.*controller successfully started", "Pat:NonDC_started": r"%s\W.*State transition.*-> S_NOT_DC", "Pat:DC_started": r"%s\W.*State transition.*-> S_IDLE", "Pat:We_stopped": r"%s\W.*OVERRIDE THIS PATTERN", "Pat:They_stopped": r"%s\W.*LOST:.* %s ", "Pat:They_dead": r"node %s.*: is dead", "Pat:They_up": r"%s %s\W.*OVERRIDE THIS PATTERN", "Pat:TransitionComplete": "Transition status: Complete: complete", "Pat:Fencing_start": r"Requesting peer fencing .* targeting %s", "Pat:Fencing_ok": r"pacemaker-fenced.*:\s*Operation .* targeting %s by .* for .*@.*: OK", "Pat:Fencing_recover": r"pacemaker-schedulerd.*: Recover\s+%s", "Pat:Fencing_active": r"stonith resource .* is active on 2 nodes (attempting 
recovery)", "Pat:Fencing_probe": r"pacemaker-controld.* Result of probe operation for %s on .*: Error", "Pat:RscOpOK": r"pacemaker-controld.*:\s+Result of %s operation for %s.*: (0 \()?OK", "Pat:RscOpFail": r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of %s ", "Pat:CloneOpFail": r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of (%s|%s) ", "Pat:RscRemoteOpOK": r"pacemaker-controld.*:\s+Result of %s operation for %s on %s: (0 \()?OK", "Pat:NodeFenced": r"pacemaker-controld.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK", } def get_component(self, key): """ Return the patterns for a single component as a list, given by key. This is typically the name of some subprogram (pacemaker-based, pacemaker-fenced, etc.) or various special purpose keys. If key is unknown, return an empty list. """ if key in self._components: return self._components[key] print(f"Unknown component '{key}' for {self._name}") return [] def get_patterns(self, key): """ Return various patterns supported by this object, given by key. Depending on the key, this could either be a list or a hash. If key is unknown, return None. """ if key == "BadNews": return self._bad_news if key == "BadNewsIgnore": return self._ignore if key == "Commands": return self._commands if key == "Search": return self._search if key == "Components": return self._components print(f"Unknown pattern '{key}' for {self._name}") return None def __getitem__(self, key): - if key == "Name": - return self._name if key in self._commands: return self._commands[key] if key in self._search: return self._search[key] print(f"Unknown template '{key}' for {self._name}") return None class Corosync2Patterns(BasePatterns): """Patterns for Corosync version 2 cluster manager class.""" + # @FIXME Some of the templates here look like they start with + # incorrect daemon names. Also, many of them aren't Corosync- + # specific and should probably go in BasePatterns. 
+ def __init__(self): BasePatterns.__init__(self) self._name = "crm-corosync" self._commands.update({ "StartCmd": "service corosync start && service pacemaker start", "StopCmd": "service pacemaker stop; [ ! -e /usr/sbin/pacemaker-remoted ] || service pacemaker_remote stop; service corosync stop", "EpochCmd": "crm_node -e", "QuorumCmd": "crm_node -q", "PartitionCmd": "crm_node -p", }) self._search.update({ # Close enough ... "Corosync Cluster Engine exiting normally" isn't # printed reliably. "Pat:We_stopped": r"%s\W.*Unloading all Corosync service engines", "Pat:They_stopped": r"%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:They_dead": r"pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:They_up": r"\W%s\W.*pacemaker-controld.*Node %s state is now member", "Pat:ChildExit": r"\[[0-9]+\] exited with status [0-9]+ \(", # "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes() "Pat:ChildKilled": r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated( with signal 9|$)", "Pat:ChildRespawn": r"%s\W.*pacemakerd.*Respawning subdaemon %s after unexpected exit", "Pat:InfraUp": r"%s\W.*corosync.*Initializing transport", "Pat:PacemakerUp": r"%s\W.*pacemakerd.*Starting Pacemaker", }) self._ignore += [ r"crm_mon:", r"crmadmin:", r"update_trace_data", r"async_notify:.*strange, client not found", r"Parse error: Ignoring unknown option .*nodename", r"error.*: Operation 'reboot' .* using FencingFail returned ", r"getinfo response error: 1$", r"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE", r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* (failed|closed)", ] self._bad_news = [ r"[^(]error:", r"crit:", r"ERROR:", r"CRIT:", r"Shutting down...NOW", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r"(pacemakerd|pacemaker-execd|pacemaker-controld):.*, exiting", r"schedulerd.*Attempting recovery of resource", r"is 
taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"Attempting to schedule .* after a stop", r"Resource .* was active at shutdown", r"duplicate entries for call_id", r"Search terminated:", r":global_timer_callback", r"Faking parameter digest creation", r"Parameters to .* action changed:", r"Parameters to .* changed", r"pacemakerd.*\[[0-9]+\] terminated( with signal|$)", r"pacemakerd.*\[[0-9]+\] .* will now be killed", r"pacemaker-schedulerd.*Recover\s+.*\(.* -\> .*\)", r"rsyslogd.* lost .* due to rate-limiting", r"Peer is not part of our cluster", r"We appear to be in an election loop", r"Unknown node -> we will not deliver message", r"(Blackbox dump requested|Problem detected)", r"pacemakerd.*Could not connect to Cluster Configuration Database API", r"Receiving messages from a node we think is dead", r"share the same cluster nodeid", r"share the same name", r"pacemaker-controld:.*Transition failed: terminated", r"Local CIB .* differs from .*:", r"warn.*:\s*Continuing but .* will NOT be used", r"warn.*:\s*Cluster configuration file .* is corrupt", r"Election storm", r"stalled the FSA with pending inputs", ] - self._components["common-ignore"] = [ + components_common_ignore = [ r"Pending action:", r"resource( was|s were) active at shutdown", r"pending LRM operations at shutdown", r"Lost connection to the CIB manager", r"pacemaker-controld.*:\s*Action A_RECOVER .* not supported", r"pacemaker-controld.*:\s*Exiting now due to errors", r".*:\s*Requesting fencing \([^)]+\) targeting node ", r"(Blackbox dump requested|Problem detected)", ] - self._components["corosync-ignore"] = [ + self._components["corosync-ignore"] = components_common_ignore + [ r"Could not connect to Corosync CFG: CS_ERR_LIBRARY", r"error:.*Connection to the CPG API failed: Library error", r"\[[0-9]+\] exited with status [0-9]+ \(", r"\[[0-9]+\] terminated with signal 15", r"pacemaker-based.*error:.*Corosync connection lost", 
r"pacemaker-fenced.*error:.*Corosync connection terminated", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state", r"pacemaker-controld.*error:.*Could not recover from internal error", r"error:.*Connection to cib_(shm|rw).* (failed|closed)", r"error:.*cib_(shm|rw) IPC provider disconnected while waiting", r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"error: Lost fencer connection", ] self._components["corosync"] = [ # We expect each daemon to lose its cluster connection. # However, if the CIB manager loses its connection first, # it's possible for another daemon to lose that connection and # exit before losing the cluster connection. r"pacemakerd.*:\s*warning:.*Lost connection to cluster layer", r"pacemaker-attrd.*:\s*(crit|error):.*Lost connection to (Corosync process group|the CIB manager)", r"pacemaker-based.*:\s*crit:.*Exiting immediately after losing connection to cluster layer", r"pacemaker-controld.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"pacemaker-fenced.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"schedulerd.*Scheduling node .* for fencing", r"pacemaker-controld.*:\s*Peer .* was terminated \(.*\) by .* on behalf of .*:\s*OK", ] self._components["pacemaker-based"] = [ r"pacemakerd.* pacemaker-attrd\[[0-9]+\] exited with status 102", r"pacemakerd.* pacemaker-controld\[[0-9]+\] exited with status 1", r"pacemakerd.* Respawning subdaemon pacemaker-attrd after unexpected exit", r"pacemakerd.* Respawning subdaemon pacemaker-based after unexpected exit", r"pacemakerd.* Respawning subdaemon pacemaker-controld after unexpected exit", r"pacemakerd.* Respawning subdaemon pacemaker-fenced after unexpected exit", r"pacemaker-.* Connection to cib_.* (failed|closed)", r"pacemaker-attrd.*:.*Lost connection to the CIB manager", r"pacemaker-controld.*:.*Lost connection to the CIB manager", 
r"pacemaker-controld.*I_ERROR.*handle_cib_disconnect", r"pacemaker-controld.* State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", ] - self._components["pacemaker-based-ignore"] = [ + self._components["pacemaker-based-ignore"] = components_common_ignore + [ r"pacemaker-execd.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)", r"pacemaker-controld.*:Could not connect to attrd: Connection refused", ] self._components["pacemaker-execd"] = [ r"pacemaker-controld.*Lost connection to local executor", r"pacemaker-controld.*I_ERROR.*lrm_connection_destroy", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", r"pacemakerd.*pacemaker-controld\[[0-9]+\] exited with status 1", r"pacemakerd.* Respawning subdaemon pacemaker-execd after unexpected exit", r"pacemakerd.* Respawning subdaemon pacemaker-controld after unexpected exit", ] - self._components["pacemaker-execd-ignore"] = [ + self._components["pacemaker-execd-ignore"] = components_common_ignore + [ r"pacemaker-(attrd|controld).*Connection to lrmd.* (failed|closed)", r"pacemaker-(attrd|controld).*Could not execute alert", ] self._components["pacemaker-controld"] = [ r"State transition .* -> S_IDLE", ] - self._components["pacemaker-controld-ignore"] = [] + self._components["pacemaker-controld-ignore"] = components_common_ignore self._components["pacemaker-attrd"] = [] - self._components["pacemaker-attrd-ignore"] = [ + self._components["pacemaker-attrd-ignore"] = components_common_ignore + [ r"pacemaker-controld.*Connection to attrd (IPC failed|closed)", ] self._components["pacemaker-schedulerd"] = [ r"State transition .* S_RECOVERY", r"pacemakerd.* Respawning subdaemon pacemaker-controld 
after unexpected exit", r"pacemaker-controld\[[0-9]+\] exited with status 1 \(", r"pacemaker-controld.*Lost connection to the scheduler", r"pacemaker-controld.*I_ERROR.*save_cib_contents", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", ] - self._components["pacemaker-schedulerd-ignore"] = [ + self._components["pacemaker-schedulerd-ignore"] = components_common_ignore + [ r"Connection to pengine.* (failed|closed)", ] self._components["pacemaker-fenced"] = [ r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"Lost fencer connection", r"pacemaker-controld.*Fencer successfully connected", ] - self._components["pacemaker-fenced-ignore"] = [ + self._components["pacemaker-fenced-ignore"] = components_common_ignore + [ r"(error|warning):.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"error:.*Lost fencer connection", r"error:.*Fencer connection failed \(will retry\)", r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error \(Lost connection to fencer\)", ] - self._components["pacemaker-fenced-ignore"].extend(self._components["common-ignore"]) - patternVariants = { "crm-base": BasePatterns, "crm-corosync": Corosync2Patterns } class PatternSelector: """Choose from among several Pattern objects and return the information from that object.""" def __init__(self, name="crm-corosync"): """ Create a new PatternSelector object. Instantiate whatever class is given by name. Defaults to Corosync2Patterns for "crm-corosync" or None. While other objects could be supported in the future, only this and the base object are supported at this time. """ self._name = name # If no name was given, use the default. Otherwise, look up the appropriate # class in patternVariants, instantiate it, and use that. 
if not name: self._base = Corosync2Patterns() else: self._base = patternVariants[name]() - def get_patterns(self, kind): - """Call get_patterns on the previously instantiated pattern object.""" - return self._base.get_patterns(kind) - - def get_template(self, key): + def __getitem__(self, key): """ Return a single pattern from the previously instantiated pattern object. If no pattern exists for the given key, return None. """ return self._base[key] + def get_patterns(self, kind): + """Call get_patterns on the previously instantiated pattern object.""" + return self._base.get_patterns(kind) + def get_component(self, kind): """Call get_component on the previously instantiated pattern object.""" return self._base.get_component(kind) - - def __getitem__(self, key): - """Return the pattern for the given key, or None if it does not exist.""" - return self.get_template(key) - - -# PYTHONPATH=python python python/pacemaker/_cts/patterns.py -k crm-corosync -t StartCmd -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("-k", "--kind", metavar="KIND") - parser.add_argument("-t", "--template", metavar="TEMPLATE") - - args = parser.parse_args() - - print(PatternSelector(args.kind)[args.template]) diff --git a/python/pacemaker/_cts/remote.py b/python/pacemaker/_cts/remote.py index 4745ca2777..9605a139c1 100644 --- a/python/pacemaker/_cts/remote.py +++ b/python/pacemaker/_cts/remote.py @@ -1,290 +1,282 @@ """Remote command runner for Pacemaker's Cluster Test Suite (CTS).""" __all__ = ["RemoteExec", "RemoteFactory"] __copyright__ = "Copyright 2014-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re import os from subprocess import Popen, PIPE from threading import Thread from pacemaker._cts.logging import LogFactory def convert2string(lines): """ Convert byte strings to UTF-8 strings. Lists of byte strings are converted to a list of UTF-8 strings. 
All other text formats are passed through. """ if isinstance(lines, bytes): return lines.decode("utf-8") if isinstance(lines, list): lst = [] for line in lines: if isinstance(line, bytes): line = line.decode("utf-8") lst.append(line) return lst return lines class AsyncCmd(Thread): """A class for doing the hard work of running a command on another machine.""" def __init__(self, node, command, proc=None, delegate=None): """ Create a new AsyncCmd instance. Arguments: node -- The remote machine to run on command -- The ssh command string to use for remote execution proc -- If not None, a process object previously created with Popen. Instead of spawning a new process, we will then wait on this process to finish and handle its output. delegate -- When the command completes, call the async_complete method on this object """ self._command = command self._delegate = delegate self._logger = LogFactory() self._node = node self._proc = proc Thread.__init__(self) def run(self): """Run the previously instantiated AsyncCmd object.""" out = None err = None if not self._proc: # pylint: disable=consider-using-with self._proc = Popen(self._command, stdout=PIPE, stderr=PIPE, close_fds=True, shell=True) self._logger.debug(f"cmd: async: target={self._node}, pid={self._proc.pid}: {self._command}") self._proc.wait() if self._delegate: self._logger.debug(f"cmd: pid {self._proc.pid} returned {self._proc.returncode} to {self._delegate!r}") else: self._logger.debug(f"cmd: pid {self._proc.pid} returned {self._proc.returncode}") if self._proc.stderr: err = self._proc.stderr.readlines() self._proc.stderr.close() for line in err: self._logger.debug(f"cmd: stderr[{self._proc.pid}]: {line}") err = convert2string(err) if self._proc.stdout: out = self._proc.stdout.readlines() self._proc.stdout.close() out = convert2string(out) if self._delegate: self._delegate.async_complete(self._proc.pid, self._proc.returncode, out, err) class RemoteExec: """ An abstract class for remote execution. 
It runs a command on another machine using ssh and scp. """ def __init__(self, command, cp_command, silent=False): """ Create a new RemoteExec instance. Arguments: command -- The ssh command string to use for remote execution cp_command -- The scp command string to use for copying files silent -- Should we log command status? """ self._command = command self._cp_command = cp_command self._logger = LogFactory() self._silent = silent self._our_node = os.uname()[1].lower() def _fixcmd(self, cmd): """Perform shell escapes on certain characters in the input cmd string.""" return re.sub("\'", "'\\''", cmd) def _cmd(self, args): """Given a list of arguments, return the string that will be run on the remote system.""" sysname = args[0] command = args[1] if sysname is None or sysname.lower() in [self._our_node, "localhost"]: ret = command else: ret = f"{self._command} {sysname} '{self._fixcmd(command)}'" return ret def _log(self, args): """Log a message.""" if not self._silent: self._logger.log(args) def _debug(self, args): """Log a message at the debug level.""" if not self._silent: self._logger.debug(args) def call_async(self, node, command, delegate=None): """ Run the given command on the given remote system and do not wait for it to complete. Arguments: node -- The remote machine to run on command -- The command to run, as a string delegate -- When the command completes, call the async_complete method on this object Returns the running process object. """ aproc = AsyncCmd(node, self._cmd([node, command]), delegate=delegate) aproc.start() return aproc def __call__(self, node, command, synchronous=True, verbose=2): """ Run the given command on the given remote system. If you call this class like a function, this is what gets called. It's approximately the same as a system() call on the remote machine. Arguments: node -- The remote machine to run on command -- The command to run, as a string synchronous -- Should we wait for the command to complete? 
verbose -- If 0, do not log anything. If 1, log the command and its return code but not its output. If 2, additionally log command output. Returns a tuple of (return code, command output). """ rc = 0 result = None # pylint: disable=consider-using-with proc = Popen(self._cmd([node, command]), stdout=PIPE, stderr=PIPE, close_fds=True, shell=True) if not synchronous and proc.pid > 0 and not self._silent: aproc = AsyncCmd(node, command, proc=proc) aproc.start() return (rc, result) if proc.stdout: result = proc.stdout.readlines() proc.stdout.close() else: self._log("No stdout stream") rc = proc.wait() if verbose > 0: self._debug(f"cmd: target={node}, rc={rc}: {command}") result = convert2string(result) if proc.stderr: errors = proc.stderr.readlines() proc.stderr.close() for err in errors: self._debug(f"cmd: stderr: {err}") if verbose == 2: for line in result: self._debug(f"cmd: stdout: {line}") return (rc, result) def copy(self, source, target, silent=False): """ Perform a copy of the source file to the remote target. This function uses the cp_command provided when the RemoteExec object was created. Returns the return code of the cp_command. 
""" # @TODO Use subprocess module with argument array instead # (self._cp_command should be an array too) cmd = f"{self._cp_command} '{source}' '{target}'" rc = os.system(cmd) if not silent: self._debug(f"cmd: rc={rc}: {cmd}") return rc def exists_on_all(self, filename, hosts): """Return True if specified file exists on all specified hosts.""" for host in hosts: (rc, _) = self(host, f"test -r {filename}") if rc != 0: return False return True def exists_on_none(self, filename, hosts): """Return True if specified file does not exist on any specified host.""" for host in hosts: (rc, _) = self(host, f"test -r {filename}") if rc == 0: return False return True class RemoteFactory: """A class for constructing a singleton instance of a RemoteExec object.""" # Class variables # -n: no stdin, -x: no X11, # -o ServerAliveInterval=5: disconnect after 3*5s if the server # stops responding command = ("ssh -l root -n -x -o ServerAliveInterval=5 " "-o ConnectTimeout=10 -o TCPKeepAlive=yes " "-o ServerAliveCountMax=3 ") # -B: batch mode, -q: no stats (quiet) cp_command = "scp -B -q" instance = None # pylint: disable=invalid-name def getInstance(self): """ Return the previously created instance of RemoteExec. If no instance exists, create one and then return that. 
""" if not RemoteFactory.instance: RemoteFactory.instance = RemoteExec(RemoteFactory.command, RemoteFactory.cp_command, False) return RemoteFactory.instance - - def enable_qarsh(self): - """Enable the QA remote shell.""" - # http://nstraz.wordpress.com/2008/12/03/introducing-qarsh/ - print("Using QARSH for connections to cluster nodes") - - RemoteFactory.command = "qarsh -t 300 -l root" - RemoteFactory.cp_command = "qacp -q" diff --git a/python/pacemaker/_cts/scenarios.py b/python/pacemaker/_cts/scenarios.py index 07f2b38805..767ffa13f1 100644 --- a/python/pacemaker/_cts/scenarios.py +++ b/python/pacemaker/_cts/scenarios.py @@ -1,421 +1,409 @@ """Test scenario classes for Pacemaker's Cluster Test Suite (CTS).""" __all__ = [ "AllOnce", "Boot", "BootCluster", "LeaveBooted", "RandomTests", "Sequence", ] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re import time from pacemaker._cts.audits import ClusterAudit from pacemaker._cts.input import should_continue from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.watcher import LogWatcher class ScenarioComponent: """ The base class for all scenario components. A scenario component is one single step in a scenario. Each component is basically just a setup and teardown method. """ def __init__(self, cm, env): """ Create a new ScenarioComponent instance. Arguments: cm -- A ClusterManager instance env -- An Environment instance """ # pylint: disable=invalid-name self._cm = cm self._env = env def is_applicable(self): """ Return True if this component is applicable in the given Environment. This method must be provided by all subclasses. """ raise NotImplementedError def setup(self): """ Set up the component, returning True on success. This method must be provided by all subclasses. """ raise NotImplementedError def teardown(self): """ Tear down the given component. 
This method must be provided by all subclasses. """ raise NotImplementedError class Scenario: """ The base class for scenarios. A scenario is an ordered list of ScenarioComponent objects. A scenario proceeds by setting up all its components in sequence, running a list of tests and audits, and then tearing down its components in reverse. """ def __init__(self, cm, components, audits, tests): """ Create a new Scenario instance. Arguments: cm -- A ClusterManager instance components -- A list of ScenarioComponents comprising this Scenario audits -- A list of ClusterAudits that will be performed as part of this Scenario tests -- A list of CTSTests that will be run """ # pylint: disable=invalid-name self.stats = { "success": 0, "failure": 0, "BadNews": 0, "skipped": 0 } self.tests = tests self._audits = audits self._bad_news = None self._cm = cm self._components = components for comp in components: if not issubclass(comp.__class__, ScenarioComponent): raise ValueError("Init value must be subclass of ScenarioComponent") for audit in audits: if not issubclass(audit.__class__, ClusterAudit): raise ValueError("Init value must be subclass of ClusterAudit") for test in tests: if not issubclass(test.__class__, CTSTest): raise ValueError("Init value must be a subclass of CTSTest") def is_applicable(self): """Return True if all ScenarioComponents are applicable.""" for comp in self._components: if not comp.is_applicable(): return False return True def setup(self): """ Set up the scenario, returning True on success. If setup fails at some point, tear down those components that did successfully set up. 
""" self._cm.prepare() self.audit() # Also detects remote/local log config self._cm.ns.wait_for_all_nodes(self._cm.env["nodes"]) self.audit() self._cm.install_support() self._bad_news = LogWatcher(self._cm.env["LogFileName"], self._cm.templates.get_patterns("BadNews"), self._cm.env["nodes"], self._cm.env["log_kind"], "BadNews", 0) self._bad_news.set_watch() # Call after we've figured out what type of log watching to do in LogAudit j = 0 while j < len(self._components): if not self._components[j].setup(): # OOPS! We failed. Tear partial setups down. self.audit() self._cm.log("Tearing down partial setup") self.teardown(j) return False j += 1 self.audit() return True def teardown(self, n_components=None): """ Tear down the scenario in the reverse order it was set up. If n_components is not None, only tear down that many components. """ if not n_components: n_components = len(self._components) - 1 j = n_components while j >= 0: self._components[j].teardown() j -= 1 self.audit() self._cm.install_support("uninstall") def incr(self, name): """Increment the given stats key.""" if name not in self.stats: self.stats[name] = 0 self.stats[name] += 1 def run(self, iterations): - """Run all tests in the scenario the given number of times.""" - self._cm.oprofile_start() - - try: - self._run_loop(iterations) - self._cm.oprofile_stop() - except: # noqa: E722 - self._cm.oprofile_stop() - raise - - def _run_loop(self, iterations): """Run all the tests the given number of times.""" raise NotImplementedError def run_test(self, test, testcount): """ Run the given test. testcount is the number of tests (including this one) that have been run across all iterations. 
""" nodechoice = self._cm.env.random_node() ret = True did_run = False self._cm.clear_instance_errors_to_ignore() self._cm.log(f"Running test {test.name:<22} {f'({nodechoice})':<15} [{testcount:>3}]") starttime = test.set_timer() if not test.setup(nodechoice): self._cm.log("Setup failed") ret = False else: did_run = True ret = test(nodechoice) if not test.teardown(nodechoice): self._cm.log("Teardown failed") if not should_continue(self._cm.env): raise ValueError(f"Teardown of {test.name} on {nodechoice} failed") ret = False stoptime = time.time() - self._cm.oprofile_save(testcount) elapsed_time = stoptime - starttime test_time = stoptime - test.get_timer() if "min_time" not in test.stats: test.stats["elapsed_time"] = elapsed_time test.stats["min_time"] = test_time test.stats["max_time"] = test_time else: test.stats["elapsed_time"] += elapsed_time if test_time < test.stats["min_time"]: test.stats["min_time"] = test_time if test_time > test.stats["max_time"]: test.stats["max_time"] = test_time if ret: self.incr("success") test.log_timer() else: self.incr("failure") self._cm.statall() did_run = True # Force the test count to be incremented anyway so test extraction works self.audit(test.errors_to_ignore) return did_run def summarize(self): """Output scenario results.""" self._cm.log("****************") self._cm.log("Overall Results:%r" % self.stats) self._cm.log("****************") stat_filter = { "calls": 0, "failure": 0, "skipped": 0, "auditfail": 0, } self._cm.log("Test Summary") for test in self.tests: for key in stat_filter: stat_filter[key] = test.stats[key] self._cm.log(f"{f'Test {test.name}':<25} {stat_filter!r}") self._cm.debug("Detailed Results") for test in self.tests: self._cm.debug(f"{f'Test {test.name}: ':<25} {test.stats!r}") self._cm.log("<<<<<<<<<<<<<<<< TESTS COMPLETED") def audit(self, local_ignore=None): """ Perform all scenario audits and log results. 
If there are too many failures, prompt the user to confirm that the scenario should continue running. """ errcount = 0 ignorelist = ["CTS:"] if local_ignore: ignorelist.extend(local_ignore) ignorelist.extend(self._cm.errors_to_ignore) ignorelist.extend(self._cm.instance_errors_to_ignore) # This makes sure everything is stabilized before starting... failed = 0 for audit in self._audits: if not audit(): self._cm.log(f"Audit {audit.name} FAILED.") failed += 1 else: self._cm.debug(f"Audit {audit.name} passed.") while errcount < 1000: match = None if self._bad_news: match = self._bad_news.look(0) if match: add_err = True for ignore in ignorelist: if add_err and re.search(ignore, match): add_err = False if add_err: self._cm.log(f"BadNews: {match}") self.incr("BadNews") errcount += 1 else: break else: print("Big problems") if not should_continue(self._cm.env): self._cm.log("Shutting down.") self.summarize() self.teardown() raise ValueError("Looks like we hit a BadNews jackpot!") if self._bad_news: self._bad_news.end() return failed class AllOnce(Scenario): """Every Test Once.""" - def _run_loop(self, iterations): + def run(self, iterations): testcount = 1 for test in self.tests: self.run_test(test, testcount) testcount += 1 class RandomTests(Scenario): """Random Test Execution.""" - def _run_loop(self, iterations): + def run(self, iterations): testcount = 1 while testcount <= iterations: test = self._cm.env.random_gen.choice(self.tests) self.run_test(test, testcount) testcount += 1 class Sequence(Scenario): """Named Tests in Sequence.""" - def _run_loop(self, iterations): + def run(self, iterations): testcount = 1 while testcount <= iterations: for test in self.tests: self.run_test(test, testcount) testcount += 1 class Boot(Scenario): """Start the Cluster.""" - def _run_loop(self, iterations): + def run(self, iterations): return class BootCluster(ScenarioComponent): """ Start the cluster manager on all nodes. 
Wait for each to come up before starting in order to account for the possibility that a given node might have been rebooted or crashed beforehand. """ def is_applicable(self): """Return whether this scenario is applicable.""" return True def setup(self): """Set up the component, returning True on success.""" self._cm.prepare() # Clear out the cobwebs ;-) self._cm.stopall(verbose=True, force=True) # Now start the Cluster Manager on all the nodes. self._cm.log("Starting Cluster Manager on all nodes.") return self._cm.startall(verbose=True, quick=True) def teardown(self): """Tear down the component.""" self._cm.log("Stopping Cluster Manager on all nodes") self._cm.stopall(verbose=True, force=False) class LeaveBooted(BootCluster): """Leave all nodes up when the scenario is complete.""" def teardown(self): """Tear down the component.""" self._cm.log("Leaving Cluster running on all nodes") diff --git a/python/pacemaker/_cts/tests/cibsecret.py b/python/pacemaker/_cts/tests/cibsecret.py index 679f8b0dfb..20bc5564f9 100644 --- a/python/pacemaker/_cts/tests/cibsecret.py +++ b/python/pacemaker/_cts/tests/cibsecret.py @@ -1,231 +1,231 @@ """Test managing secrets with cibsecret.""" __all__ = ["CibsecretTest"] __copyright__ = "Copyright 2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object # pylint doesn't understand that self._rsh is callable. 
# pylint: disable=not-callable # This comes from include/config.h as private API, assuming pacemaker is built # with cibsecrets support. I don't want to expose this value publically, at # least not until we default to including cibsecrets, so it's just set here # for now. SECRETS_DIR = "/var/lib/pacemaker/lrm/secrets" class CibsecretTest(CTSTest): """Test managing secrets with cibsecret.""" def __init__(self, cm): """ Create a new CibsecretTest instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "Cibsecret" self._secret = "passwd" self._secret_val = "SecreT_PASS" self._rid = "secretDummy" self._startall = SimulStartLite(cm) def _insert_dummy(self, node): """Create a dummy resource on the given node.""" pats = [ - f"{node}.*" + (self.templates["Pat:RscOpOK"] % ("start", self._rid)) + f"{node}.*" + (self._cm.templates["Pat:RscOpOK"] % ("start", self._rid)) ] watch = self.create_watch(pats, 60) watch.set_watch() self._cm.add_dummy_rsc(node, self._rid) with Timer(self._logger, self.name, "addDummy"): watch.look_for_all() if watch.unmatched: self.debug("Failed to find patterns when adding dummy resource") return repr(watch.unmatched) return "" def _check_cib_value(self, node, expected): """Check that the secret has the expected value.""" (rc, lines) = self._rsh(node, f"crm_resource -r {self._rid} -g {self._secret}", verbose=1) s = " ".join(lines).strip() if rc != 0 or s != expected: return self.failure(f"Secret set to '{s}' in CIB, not '{expected}'") # This is self.success, except without incrementing the success counter return True def _test_check(self, node): """Test the 'cibsecret check' subcommand.""" (rc, _) = self._rsh(node, f"cibsecret check {self._rid} {self._secret}", verbose=1) if rc != 0: return self.failure("Failed to check secret") # This is self.success, except without incrementing the success counter return True def _test_delete(self, node): """Test the 'cibsecret delete' subcommand.""" (rc, _) = self._rsh(node, 
f"cibsecret delete {self._rid} {self._secret}", verbose=1) if rc != 0: return self.failure("Failed to delete secret") # This is self.success, except without incrementing the success counter return True def _test_get(self, node, expected): """Test the 'cibsecret get' subcommand.""" (rc, lines) = self._rsh(node, f"cibsecret get {self._rid} {self._secret}", verbose=1) s = " ".join(lines).strip() if rc != 0 or s != expected: return self.failure(f"Secret set to '{s}' in local file, not '{expected}'") # This is self.success, except without incrementing the success counter return True def _test_set(self, node): """Test the 'cibsecret set' subcommand.""" (rc, _) = self._rsh(node, f"cibsecret set {self._rid} {self._secret} {self._secret_val}", verbose=1) if rc != 0: return self.failure("Failed to set secret") # This is self.success, except without incrementing the success counter return True def _test_stash(self, node): """Test the 'cibsecret stash' subcommand.""" (rc, _) = self._rsh(node, f"cibsecret stash {self._rid} {self._secret}", verbose=1) if rc != 0: return self.failure(f"Failed to stash secret {self._secret}") # This is self.success, except without incrementing the success counter return True def _test_sync(self, node): """Test the 'cibsecret sync' subcommand.""" (rc, _) = self._rsh(node, "cibsecret sync", verbose=1) if rc != 0: return self.failure("Failed to sync secrets") # This is self.success, except without incrementing the success counter return True def _test_unstash(self, node): """Test the 'cibsecret unstash' subcommand.""" (rc, _) = self._rsh(node, f"cibsecret unstash {self._rid} {self._secret}", verbose=1) if rc != 0: return self.failure(f"Failed to unstash secret {self._secret}") # This is self.success, except without incrementing the success counter return True def _test_secrets_removed(self): """Verify that the secret and its checksum file has been removed.""" f = f"{SECRETS_DIR}/{self._rid}/{self._secret}" if not self._rsh.exists_on_none(f, 
self._env["nodes"]): return self.failure(f"{f} not deleted from all hosts") f = f"{SECRETS_DIR}/{self._rid}/{self._secret}.sign" if not self._rsh.exists_on_none(f, self._env["nodes"]): return self.failure(f"{f} not deleted from all hosts") return True # @TODO: Two improvements that could be made to this test: # # (1) Add a test for the 'cibsecret sync' command. This requires modifying # the test so it brings down one node before creating secrets, then # bringing the node back up, running 'cibsecret sync', and verifying the # secrets are copied over. All of this is possible with ctslab, it's # just kind of a lot of code. # # (2) Add some tests for failure cases like trying to stash a value that's # already secret, etc. def __call__(self, node): """Perform this test.""" self.incr("calls") ret = self._startall(None) if not ret: return self.failure("Start all nodes failed") ret = self._insert_dummy(node) if ret != "": return self.failure(ret) # Test setting a new secret, verifying its value in both the CIB and # the local store on each node. if not self._test_set(node): return False if not self._check_cib_value(node, "lrm://"): return False for n in self._env["nodes"]: if not self._test_get(n, self._secret_val): return False # Test checking the secret on each node. for n in self._env["nodes"]: if not self._test_check(n): return False # Test moving the secret into the CIB, but now we can only verify that # its value in the CIB is correct since it's no longer a secret. We # can also verify that it's been removed from the local store everywhere. if not self._test_unstash(node): return False if not self._check_cib_value(node, self._secret_val): return False self._test_secrets_removed() # Test moving the secret back out of the CIB, again verifying its # value in both places. 
if not self._test_stash(node): return False if not self._check_cib_value(node, "lrm://"): return False for n in self._env["nodes"]: if not self._test_get(n, self._secret_val): return False # Delete the secret if not self._test_delete(node): return False self._test_secrets_removed() return self.success() @property def errors_to_ignore(self): return [r"Reloading .* \(agent\)"] diff --git a/python/pacemaker/_cts/tests/componentfail.py b/python/pacemaker/_cts/tests/componentfail.py index e21d399feb..2edb4aee83 100644 --- a/python/pacemaker/_cts/tests/componentfail.py +++ b/python/pacemaker/_cts/tests/componentfail.py @@ -1,166 +1,161 @@ """Kill a pacemaker daemon and test how the cluster recovers.""" __all__ = ["ComponentFail"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object # @TODO Separate this into a separate test for each component, so the patterns # can be made specific to each component, investigating failures is a little # easier, and specific testing can be done for each component (for example, # set attributes before and after killing pacemaker-attrd and check values). class ComponentFail(CTSTest): """Kill a random pacemaker daemon and wait for the cluster to recover.""" def __init__(self, cm): """ Create a new ComponentFail instance. 
Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.is_unsafe = True self.name = "ComponentFail" self._complist = cm.components self._okerrpatterns = [] self._patterns = [] self._startall = SimulStartLite(cm) def __call__(self, node): """Perform this test.""" self.incr("calls") self._patterns = [] self._okerrpatterns = [] # start all nodes ret = self._startall(None) if not ret: return self.failure("Setup failed") - if not self._cm.cluster_stable(self._env["StableTime"]): + if not self._cm.cluster_stable(self._env["stable_time"]): return self.failure("Setup failed - unstable") - node_is_dc = self._cm.is_node_dc(node, None) - # select a component to kill chosen = self._env.random_gen.choice(self._complist) - while chosen.dc_only and not node_is_dc: - chosen = self._env.random_gen.choice(self._complist) + node_is_dc = self._cm.is_node_dc(node, None) self.debug(f"...component {chosen.name} (dc={node_is_dc})") self.incr(chosen.name) if chosen.name != "corosync": self._patterns.extend([ - self.templates["Pat:ChildKilled"] % (node, chosen.name), - self.templates["Pat:ChildRespawn"] % (node, chosen.name), + self._cm.templates["Pat:ChildKilled"] % (node, chosen.name), + self._cm.templates["Pat:ChildRespawn"] % (node, chosen.name), ]) self._patterns.extend(chosen.pats) - if node_is_dc: - self._patterns.extend(chosen.dc_pats) # @TODO this should be a flag in the Component if chosen.name in ["corosync", "pacemaker-based", "pacemaker-fenced"]: # Ignore actions for fence devices if fencer will respawn # (their registration will be lost, and probes will fail) self._okerrpatterns = [ - self.templates["Pat:Fencing_active"], + self._cm.templates["Pat:Fencing_active"], ] (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): r = AuditResource(self._cm, line) if r.rclass == "stonith": self._okerrpatterns.extend([ - self.templates["Pat:Fencing_recover"] % r.id, - self.templates["Pat:Fencing_probe"] % 
r.id, + self._cm.templates["Pat:Fencing_recover"] % r.id, + self._cm.templates["Pat:Fencing_probe"] % r.id, ]) # supply a copy so self.patterns doesn't end up empty tmp_pats = self._patterns.copy() self._patterns.extend(chosen.badnews_ignore) # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status stonith_pats = [ - self.templates["Pat:Fencing_ok"] % node + self._cm.templates["Pat:Fencing_ok"] % node ] stonith = self.create_watch(stonith_pats, 0) stonith.set_watch() # set the watch for stable watch = self.create_watch( - tmp_pats, self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) + tmp_pats, self._env["dead_time"] + self._env["stable_time"] + self._env["start_time"]) watch.set_watch() # kill the component - chosen.kill(node) + chosen.signal("KILL", node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() self.debug("Waiting for any fenced node to come back up") self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) self.debug("Waiting for the cluster to re-stabilize with all nodes") - self._cm.cluster_stable(self._env["StartTime"]) + self._cm.cluster_stable(self._env["start_time"]) self.debug(f"Checking if {node} was shot") shot = stonith.look(60) if shot: self.debug(f"Found: {shot!r}") - self._okerrpatterns.append(self.templates["Pat:Fencing_start"] % node) + self._okerrpatterns.append(self._cm.templates["Pat:Fencing_start"] % node) if not self._env["at-boot"]: self._cm.expected_status[node] = "down" # If fencing occurred, chances are many (if not all) the expected logs # will not be sent - or will be lost when the node reboots return self.success() # check for logs indicating a graceful recovery matched = watch.look_for_all(allow_multiple_matches=True) if watch.unmatched: self._logger.log(f"Patterns not found: {watch.unmatched!r}") self.debug("Waiting for the cluster to re-stabilize with all nodes") - is_stable = self._cm.cluster_stable(self._env["StartTime"]) + is_stable = 
self._cm.cluster_stable(self._env["start_time"]) if not matched: return self.failure(f"Didn't find all expected {chosen.name} patterns") if not is_stable: return self.failure(f"Cluster did not become stable after killing {chosen.name}") return self.success() @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" # Note that okerrpatterns refers to the last time we ran this test # The good news is that this works fine for us... self._okerrpatterns.extend(self._patterns) return self._okerrpatterns diff --git a/python/pacemaker/_cts/tests/ctstest.py b/python/pacemaker/_cts/tests/ctstest.py index 16d0f23e7b..025c3a7582 100644 --- a/python/pacemaker/_cts/tests/ctstest.py +++ b/python/pacemaker/_cts/tests/ctstest.py @@ -1,242 +1,232 @@ """Base classes for CTS tests.""" __all__ = ["CTSTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re from pacemaker._cts.environment import EnvFactory from pacemaker._cts.logging import LogFactory -from pacemaker._cts.patterns import PatternSelector from pacemaker._cts.remote import RemoteFactory from pacemaker._cts.timer import Timer from pacemaker._cts.watcher import LogWatcher # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. class CTSTest: """ The base class for all cluster tests. This implements a basic set of properties and behaviors like setup, tear down, time keeping, and statistics tracking. It is up to specific tests to implement their own specialized behavior on top of this class. """ def __init__(self, cm): """ Create a new CTSTest instance. 
Arguments: cm -- A ClusterManager instance """ # pylint: disable=invalid-name self.audits = [] self.name = None - self.templates = PatternSelector(cm["Name"]) self.stats = { "auditfail": 0, "calls": 0, "failure": 0, "skipped": 0, "success": 0 } self._cm = cm self._env = EnvFactory().getInstance() self._rsh = RemoteFactory().getInstance() self._logger = LogFactory() self._timers = {} self.benchmark = True # which tests to benchmark self.failed = False - self.is_experimental = False - self.is_loop = False self.is_unsafe = False self.passed = True def log(self, args): """Log a message.""" self._logger.log(args) def debug(self, args): """Log a debug message.""" self._logger.debug(args) def get_timer(self, key="test"): """Get the start time of the given timer.""" try: return self._timers[key].start_time except KeyError: return 0 def set_timer(self, key="test"): """Set the start time of the given timer to now, and return that time.""" if key not in self._timers: self._timers[key] = Timer(self._logger, self.name, key) self._timers[key].start() return self._timers[key].start_time def log_timer(self, key="test"): """Log the elapsed time of the given timer.""" if key not in self._timers: return elapsed = self._timers[key].elapsed self.debug(f"{self.name}:{key} runtime: {elapsed:.2f}") del self._timers[key] def incr(self, name): """Increment the given stats key.""" if name not in self.stats: self.stats[name] = 0 self.stats[name] += 1 # Reset the test passed boolean if name == "calls": self.passed = True def failure(self, reason="none"): """Increment the failure count, with an optional failure reason.""" self.passed = False self.incr("failure") self._logger.log(f"{f'Test {self.name}':<35} FAILED: {reason}") return False def success(self): """Increment the success count.""" self.incr("success") return True def skipped(self): """Increment the skipped count.""" self.incr("skipped") return True def __call__(self, node): """Perform this test.""" raise NotImplementedError def 
audit(self): """Perform all the relevant audits (see ClusterAudit), returning whether or not they all passed.""" passed = True for audit in self.audits: if not audit(): self._logger.log(f"Internal {self.name} Audit {audit.name} FAILED.") self.incr("auditfail") passed = False return passed def setup(self, node): """Set up this test.""" # node is used in subclasses # pylint: disable=unused-argument return self.success() def teardown(self, node): """Tear down this test.""" # node is used in subclasses # pylint: disable=unused-argument return self.success() def create_watch(self, patterns, timeout, name=None): """ Create a new LogWatcher object. This object can be used to search log files for matching patterns during this test's run. Arguments: patterns -- A list of regular expressions to match against the log timeout -- Default number of seconds to watch a log file at a time; this can be overridden by the timeout= parameter to self.look on an as-needed basis name -- A unique name to use when logging about this watch """ if not name: name = self.name return LogWatcher(self._env["LogFileName"], patterns, self._env["nodes"], self._env["log_kind"], name, timeout) def local_badnews(self, prefix, watch, local_ignore=None): """ Search through log files for messages. Arguments: prefix -- The string to look for at the beginning of lines, or "LocalBadNews:" if None. watch -- The LogWatcher object to use for searching. local_ignore -- A list of regexes that, if found in a line, will cause that line to be ignored. Return the number of matches found. 
""" errcount = 0 if not prefix: prefix = "LocalBadNews:" ignorelist = [" CTS: ", prefix] if local_ignore: ignorelist += local_ignore while errcount < 100: match = watch.look(0) if match: add_err = True for ignore in ignorelist: if add_err and re.search(ignore, match): add_err = False if add_err: self._logger.log(f"{prefix} {match}") errcount += 1 else: break else: self._logger.log("Too many errors!") watch.end() return errcount def is_applicable(self): """ Return True if this test is applicable in the current test configuration. This method must be implemented by all subclasses. """ - if self.is_loop and not self._env["loop-tests"]: - return False - if self.is_unsafe and not self._env["unsafe-tests"]: return False - if self.is_experimental and not self._env["experimental-tests"]: - return False - if self._env["benchmark"] and not self.benchmark: return False return True @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [] diff --git a/python/pacemaker/_cts/tests/fliptest.py b/python/pacemaker/_cts/tests/fliptest.py index 2a87265d7a..4579fea55a 100644 --- a/python/pacemaker/_cts/tests/fliptest.py +++ b/python/pacemaker/_cts/tests/fliptest.py @@ -1,59 +1,59 @@ """Stop running nodes, and start stopped nodes.""" __all__ = ["FlipTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import time from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.tests.stoptest import StopTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._env is subscriptable. 
# pylint: disable=unsubscriptable-object class FlipTest(CTSTest): """Stop running nodes and start stopped nodes.""" def __init__(self, cm): """ Create a new FlipTest instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "Flip" self._start = StartTest(cm) self._stop = StopTest(cm) def __call__(self, node): """Perform this test.""" self.incr("calls") if self._cm.expected_status[node] == "up": self.incr("stopped") ret = self._stop(node) kind = "up->down" # Give the cluster time to recognize it's gone... - time.sleep(self._env["StableTime"]) + time.sleep(self._env["stable_time"]) elif self._cm.expected_status[node] == "down": self.incr("started") ret = self._start(node) kind = "down->up" else: return self.skipped() self.incr(kind) if ret: return self.success() return self.failure(f"{kind} failure") diff --git a/python/pacemaker/_cts/tests/maintenancemode.py b/python/pacemaker/_cts/tests/maintenancemode.py index 5026a6cf0e..4846cf7539 100644 --- a/python/pacemaker/_cts/tests/maintenancemode.py +++ b/python/pacemaker/_cts/tests/maintenancemode.py @@ -1,228 +1,228 @@ """Toggle nodes in and out of maintenance mode.""" __all__ = ["MaintenanceMode"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. 
# pylint: disable=not-callable class MaintenanceMode(CTSTest): """Toggle nodes in and ount of maintenance mode.""" def __init__(self, cm): """ Create a new MaintenanceMode instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.benchmark = True self.name = "MaintenanceMode" self._action = "asyncmon" self._rid = "maintenanceDummy" self._start = StartTest(cm) self._startall = SimulStartLite(cm) def _toggle_maintenance_mode(self, node, enabled): """Toggle maintenance mode on the given node.""" pats = [ - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:DC_IDLE"] ] if enabled: action = "On" else: action = "Off" # fail the resource right after turning Maintenance mode on # verify it is not recovered until maintenance mode is turned off if enabled: - pats.append(self.templates["Pat:RscOpFail"] % (self._action, self._rid)) + pats.append(self._cm.templates["Pat:RscOpFail"] % (self._action, self._rid)) else: pats.extend([ - self.templates["Pat:RscOpOK"] % ("stop", self._rid), - self.templates["Pat:RscOpOK"] % ("start", self._rid) + self._cm.templates["Pat:RscOpOK"] % ("stop", self._rid), + self._cm.templates["Pat:RscOpOK"] % ("start", self._rid) ]) watch = self.create_watch(pats, 60) watch.set_watch() self.debug(f"Turning maintenance mode {action}") - self._rsh(node, self.templates[f"MaintenanceMode{action}"]) + self._rsh(node, self._cm.templates[f"MaintenanceMode{action}"]) if enabled: self._rsh(node, f"crm_resource -V -F -r {self._rid} -H {node} &>/dev/null") with Timer(self._logger, self.name, f"recover{action}"): watch.look_for_all() if watch.unmatched: self.debug(f"Failed to find patterns when turning maintenance mode {action}") return repr(watch.unmatched) return "" def _insert_maintenance_dummy(self, node): """Create a dummy resource on the given node.""" pats = [ - f"{node}.*" + (self.templates["Pat:RscOpOK"] % ("start", self._rid)) + f"{node}.*" + (self._cm.templates["Pat:RscOpOK"] % ("start", self._rid)) ] watch = 
self.create_watch(pats, 60) watch.set_watch() self._cm.add_dummy_rsc(node, self._rid) with Timer(self._logger, self.name, "addDummy"): watch.look_for_all() if watch.unmatched: self.debug("Failed to find patterns when adding maintenance dummy resource") return repr(watch.unmatched) return "" def _remove_maintenance_dummy(self, node): """Remove the previously created dummy resource on the given node.""" pats = [ - self.templates["Pat:RscOpOK"] % ("stop", self._rid) + self._cm.templates["Pat:RscOpOK"] % ("stop", self._rid) ] watch = self.create_watch(pats, 60) watch.set_watch() self._cm.remove_dummy_rsc(node, self._rid) with Timer(self._logger, self.name, "removeDummy"): watch.look_for_all() if watch.unmatched: self.debug("Failed to find patterns when removing maintenance dummy resource") return repr(watch.unmatched) return "" def _managed_rscs(self, node): """Return a list of all resources managed by the cluster.""" rscs = [] (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self._cm, line) if tmp.managed: rscs.append(tmp.id) return rscs def _verify_resources(self, node, rscs, managed): """Verify that all resources are managed or unmanaged as expected.""" managed_rscs = rscs managed_str = "managed" if not managed: managed_str = "unmanaged" (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self._cm, line) if managed and not tmp.managed: continue if not managed and tmp.managed: continue if managed_rscs.count(tmp.id): managed_rscs.remove(tmp.id) if not managed_rscs: self.debug(f"Found all {managed_str} resources on {node}") return True self._logger.log(f"Could not find all {managed_str} resources on {node}. 
{managed_rscs}") return False def __call__(self, node): """Perform this test.""" self.incr("calls") verify_managed = False verify_unmanaged = False fail_pat = "" if not self._startall(None): return self.failure("Setup failed") # get a list of all the managed resources. We use this list # after enabling maintenance mode to verify all managed resources # become un-managed. After maintenance mode is turned off, we use # this list to verify all the resources become managed again. managed_rscs = self._managed_rscs(node) if not managed_rscs: self._logger.log(f"No managed resources on {node}") return self.skipped() # insert a fake resource we can fail during maintenance mode # so we can verify recovery does not take place until after maintenance # mode is disabled. fail_pat += self._insert_maintenance_dummy(node) # toggle maintenance mode ON, then fail dummy resource. fail_pat += self._toggle_maintenance_mode(node, True) # verify all the resources are now unmanaged if self._verify_resources(node, managed_rscs, False): verify_unmanaged = True # Toggle maintenance mode OFF, verify dummy is recovered. fail_pat += self._toggle_maintenance_mode(node, False) # verify all the resources are now managed again if self._verify_resources(node, managed_rscs, True): verify_managed = True # Remove our maintenance dummy resource. 
fail_pat += self._remove_maintenance_dummy(node) self._cm.cluster_stable() if fail_pat != "": return self.failure(f"Unmatched patterns: {fail_pat}") if not verify_unmanaged: return self.failure("Failed to verify resources became unmanaged during maintenance mode") if not verify_managed: return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode") return self.success() @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [ f"Updating failcount for {self._rid}", fr"schedulerd.*: Recover\s+{self._rid}\s+\(.*\)", r"Unknown operation: fail", - self.templates["Pat:RscOpOK"] % (self._action, self._rid), + self._cm.templates["Pat:RscOpOK"] % (self._action, self._rid), f"(ERROR|error).*: Action {self._rid}_{self._action}_0 .* initiated outside of a transition", ] diff --git a/python/pacemaker/_cts/tests/nearquorumpointtest.py b/python/pacemaker/_cts/tests/nearquorumpointtest.py index 955926a028..f2ede2a5db 100644 --- a/python/pacemaker/_cts/tests/nearquorumpointtest.py +++ b/python/pacemaker/_cts/tests/nearquorumpointtest.py @@ -1,121 +1,121 @@ """Randomly start and stop nodes to bring the cluster close to the quorum point.""" __all__ = ["NearQuorumPointTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. 
# pylint: disable=unsubscriptable-object class NearQuorumPointTest(CTSTest): """Randomly start and stop nodes to bring the cluster close to the quorum point.""" def __init__(self, cm): """ Create a new NearQuorumPointTest instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "NearQuorumPoint" def __call__(self, dummy): """Perform this test.""" self.incr("calls") startset = [] stopset = [] stonith = self._cm.prepare_fencing_watcher() # decide what to do with each node for node in self._env["nodes"]: action = self._env.random_gen.choice(["start", "stop"]) if action == "start": startset.append(node) elif action == "stop": stopset.append(node) self.debug(f"start nodes:{startset!r}") self.debug(f"stop nodes:{stopset!r}") # add search patterns watchpats = [] for node in stopset: if self._cm.expected_status[node] == "up": - watchpats.append(self.templates["Pat:We_stopped"] % node) + watchpats.append(self._cm.templates["Pat:We_stopped"] % node) for node in startset: if self._cm.expected_status[node] == "down": - watchpats.append(self.templates["Pat:Local_started"] % node) + watchpats.append(self._cm.templates["Pat:Local_started"] % node) else: for stopping in stopset: if self._cm.expected_status[stopping] == "up": - watchpats.append(self.templates["Pat:They_stopped"] % (node, stopping)) + watchpats.append(self._cm.templates["Pat:They_stopped"] % (node, stopping)) if not watchpats: return self.skipped() if startset: - watchpats.append(self.templates["Pat:DC_IDLE"]) + watchpats.append(self._cm.templates["Pat:DC_IDLE"]) - watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) + watch = self.create_watch(watchpats, self._env["dead_time"] + 10) watch.set_watch() # begin actions for node in stopset: if self._cm.expected_status[node] == "up": self._cm.stop_cm_async(node) for node in startset: if self._cm.expected_status[node] == "down": self._cm.start_cm_async(node) # get the result if watch.look_for_all(): 
self._cm.cluster_stable() self._cm.fencing_cleanup("NearQuorumPoint", stonith) return self.success() self._logger.log(f"Warn: Patterns not found: {watch.unmatched!r}") # get the "bad" nodes upnodes = [] for node in stopset: if self._cm.stat_cm(node): upnodes.append(node) downnodes = [] for node in startset: if not self._cm.stat_cm(node): downnodes.append(node) self._cm.fencing_cleanup("NearQuorumPoint", stonith) if not upnodes and not downnodes: self._cm.cluster_stable() # Make sure they're completely down with no residule for node in stopset: - self._rsh(node, self.templates["StopCmd"]) + self._rsh(node, self._cm.templates["StopCmd"]) return self.success() if upnodes: self._logger.log(f"Warn: Unstoppable nodes: {upnodes!r}") if downnodes: self._logger.log(f"Warn: Unstartable nodes: {downnodes!r}") return self.failure() diff --git a/python/pacemaker/_cts/tests/partialstart.py b/python/pacemaker/_cts/tests/partialstart.py index 9455a7b486..8ef649ab02 100644 --- a/python/pacemaker/_cts/tests/partialstart.py +++ b/python/pacemaker/_cts/tests/partialstart.py @@ -1,72 +1,72 @@ """Start a node and then tell it to stop before it is fully running.""" __all__ = ["PartialStart"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.simulstoplite import SimulStopLite from pacemaker._cts.tests.stoptest import StopTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._env is subscriptable. 
# pylint: disable=unsubscriptable-object class PartialStart(CTSTest): """Interrupt a node before it's finished starting up.""" def __init__(self, cm): """ Create a new PartialStart instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "PartialStart" self._startall = SimulStartLite(cm) self._stop = StopTest(cm) self._stopall = SimulStopLite(cm) def __call__(self, node): """Perform this test.""" self.incr("calls") ret = self._stopall(None) if not ret: return self.failure("Setup failed") watchpats = [ "pacemaker-controld.*Connecting to .* cluster layer" ] - watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) + watch = self.create_watch(watchpats, self._env["dead_time"] + 10) watch.set_watch() self._cm.start_cm_async(node) ret = watch.look_for_all() if not ret: self._logger.log(f"Patterns not found: {watch.unmatched!r}") return self.failure(f"Setup of {node} failed") ret = self._stop(node) if not ret: return self.failure(f"{node} did not stop in time") return self.success() @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" # We might do some fencing in the 2-node case if we make it up far enough return [ r"Executing reboot fencing operation", r"Requesting fencing \([^)]+\) targeting node " ] diff --git a/python/pacemaker/_cts/tests/reattach.py b/python/pacemaker/_cts/tests/reattach.py index 6d445e9818..cc84107ed3 100644 --- a/python/pacemaker/_cts/tests/reattach.py +++ b/python/pacemaker/_cts/tests/reattach.py @@ -1,207 +1,207 @@ """Restart the cluster and verify resources remain running.""" __all__ = ["Reattach"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import re import time from pacemaker.exitstatus import ExitStatus from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests.ctstest import CTSTest from 
pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.simulstoplite import SimulStopLite from pacemaker._cts.tests.starttest import StartTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable class Reattach(CTSTest): """Restart the cluster and verify that resources remain running throughout.""" def __init__(self, cm): """ Create a new Reattach instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "Reattach" self._startall = SimulStartLite(cm) self._stopall = SimulStopLite(cm) def _is_managed(self, node): """Return whether resources are managed by the cluster.""" (_, is_managed) = self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", verbose=1) is_managed = is_managed[0].strip() return is_managed == "true" def _set_unmanaged(self, node): """Disable resource management.""" self.debug("Disable resource management") self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false") def _set_managed(self, node): """Enable resource management.""" self.debug("Re-enable resource management") self._rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D") def _disable_incompatible_rscs(self, node): """ Disable resources that are incompatible with this test. Starts and stops of stonith-class resources are implemented internally by Pacemaker, which means that they must stop when Pacemaker is stopped, even if unmanaged. Disable them before running the Reattach test so they don't affect resource placement. Set target-role to "Stopped" for any of these resources in the CIB. 
""" self.debug("Disable incompatible resources") xml = """' ' --scope rsc_defaults""" return self._rsh(node, self._cm.templates['CibAddXml'] % xml) def _enable_incompatible_rscs(self, node): """Re-enable resources that were incompatible with this test.""" self.debug("Re-enable incompatible resources") xml = """""" return self._rsh(node, f"""cibadmin --delete --xml-text '{xml}'""") def _reprobe(self, node): """ Reprobe all resources. The placement of some resources (such as promotable-1 in the lab-generated CIB) is affected by constraints using node-attribute-based rules. An earlier test may have erased the relevant node attribute, so do a reprobe, which should add the attribute back. """ return self._rsh(node, """crm_resource --refresh""") def setup(self, node): """Set up this test.""" if not self._startall(None): return self.failure("Startall failed") (rc, _) = self._disable_incompatible_rscs(node) if rc != ExitStatus.OK: return self.failure("Couldn't modify CIB to stop incompatible resources") (rc, _) = self._reprobe(node) if rc != ExitStatus.OK: return self.failure("Couldn't reprobe resources") if not self._cm.cluster_stable(double_check=True): return self.failure("Cluster did not stabilize after setup") return self.success() def teardown(self, node): """Tear down this test.""" # Make sure 'node' is up start = StartTest(self._cm) start(node) if not self._is_managed(node): self._set_managed(node) (rc, _) = self._enable_incompatible_rscs(node) if rc != ExitStatus.OK: return self.failure("Couldn't modify CIB to re-enable incompatible resources") if not self._cm.cluster_stable(): return self.failure("Cluster did not stabilize after teardown") if not self._is_managed(node): return self.failure("Could not re-enable resource management") return self.success() def __call__(self, node): """Perform this test.""" self.incr("calls") # Conveniently, the scheduler will display this message when disabling # management, even if fencing is not enabled, so we can rely on it. 
managed = self.create_watch(["No fencing will be done"], 60) managed.set_watch() self._set_unmanaged(node) if not managed.look_for_all(): self._logger.log(f"Patterns not found: {managed.unmatched!r}") return self.failure("Resource management not disabled") pats = [ - self.templates["Pat:RscOpOK"] % ("start", ".*"), - self.templates["Pat:RscOpOK"] % ("stop", ".*"), - self.templates["Pat:RscOpOK"] % ("promote", ".*"), - self.templates["Pat:RscOpOK"] % ("demote", ".*"), - self.templates["Pat:RscOpOK"] % ("migrate", ".*") + self._cm.templates["Pat:RscOpOK"] % ("start", ".*"), + self._cm.templates["Pat:RscOpOK"] % ("stop", ".*"), + self._cm.templates["Pat:RscOpOK"] % ("promote", ".*"), + self._cm.templates["Pat:RscOpOK"] % ("demote", ".*"), + self._cm.templates["Pat:RscOpOK"] % ("migrate", ".*") ] watch = self.create_watch(pats, 60, "ShutdownActivity") watch.set_watch() self.debug("Shutting down the cluster") ret = self._stopall(None) if not ret: self._set_managed(node) return self.failure("Couldn't shut down the cluster") self.debug("Bringing the cluster back up") ret = self._startall(None) time.sleep(5) # allow ping to update the CIB if not ret: self._set_managed(node) return self.failure("Couldn't restart the cluster") if self.local_badnews("ResourceActivity:", watch): self._set_managed(node) return self.failure("Resources stopped or started during cluster restart") watch = self.create_watch(pats, 60, "StartupActivity") watch.set_watch() # Re-enable resource management (and verify it happened). 
self._set_managed(node) self._cm.cluster_stable() if not self._is_managed(node): return self.failure("Could not re-enable resource management") # Ignore actions for STONITH resources ignore = [] (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if re.search("^Resource", line): r = AuditResource(self._cm, line) if r.rclass == "stonith": self.debug(f"Ignoring start actions for {r.id}") - ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id)) + ignore.append(self._cm.templates["Pat:RscOpOK"] % ("start", r.id)) if self.local_badnews("ResourceActivity:", watch, ignore): return self.failure("Resources stopped or started after resource management was re-enabled") return ret @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [ r"resource( was|s were) active at shutdown" ] diff --git a/python/pacemaker/_cts/tests/remotedriver.py b/python/pacemaker/_cts/tests/remotedriver.py index a0d916d7b4..5cb2335f05 100644 --- a/python/pacemaker/_cts/tests/remotedriver.py +++ b/python/pacemaker/_cts/tests/remotedriver.py @@ -1,542 +1,543 @@ """Base classes for CTS tests.""" __all__ = ["RemoteDriver"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import os import time import subprocess import tempfile +from pacemaker._cts.CTS import Process from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.tests.stoptest import StopTest from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. 
# pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable class RemoteDriver(CTSTest): """ A specialized base class for cluster tests that run on Pacemaker Remote nodes. This builds on top of CTSTest to provide methods for starting and stopping services and resources, and managing remote nodes. This is still just an abstract class -- specific tests need to implement their own specialized behavior. """ def __init__(self, cm): """ Create a new RemoteDriver instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "RemoteDriver" self._corosync_enabled = False self._pacemaker_enabled = False self._remote_node = None self._remote_rsc = "remote-rsc" self._start = StartTest(cm) self._startall = SimulStartLite(cm) self._stop = StopTest(cm) self.reset() def reset(self): """Reset the state of this test back to what it was before the test was run.""" self.failed = False self.fail_string = "" self._pcmk_started = False self._remote_node_added = False self._remote_rsc_added = False self._remote_use_reconnect_interval = self._env.random_gen.choice([True, False]) def fail(self, msg): """Mark test as failed.""" self.failed = True # Always log the failure. self._logger.log(msg) # Use first failure as test status, as it's likely to be most useful. if not self.fail_string: self.fail_string = msg def _get_other_node(self, node): """ Get the first cluster node out of the environment that is not the given node. Typically, this is used to find some node that will still be active that we can run cluster commands on. """ for othernode in self._env["nodes"]: if othernode == node: # we don't want to try and use the cib that we just shutdown. # find a cluster node that is not our soon to be remote-node. continue return othernode def _del_rsc(self, node, rsc): """ Delete the given named resource from the cluster. The given `node` is the cluster node on which we should *not* run the delete command. 
""" othernode = self._get_other_node(node) (rc, _) = self._rsh(othernode, f"crm_resource -D -r {rsc} -t primitive") if rc != 0: self.fail(f"Removal of resource '{rsc}' failed") def _add_rsc(self, node, rsc_xml): """ Add a resource given in XML format to the cluster. The given `node` is the cluster node on which we should *not* run the add command. """ othernode = self._get_other_node(node) (rc, _) = self._rsh(othernode, f"cibadmin -C -o resources -X '{rsc_xml}'") if rc != 0: self.fail("resource creation failed") def _add_primitive_rsc(self, node): """ Add a primitive heartbeat resource for the remote node to the cluster. The given `node` is the cluster node on which we should *not* run the add command. """ rsc_xml = f""" """ self._add_rsc(node, rsc_xml) if not self.failed: self._remote_rsc_added = True def _add_connection_rsc(self, node): """ Add a primitive connection resource for the remote node to the cluster. The given `node` is the cluster node on which we should *not* run the add command. 
""" rsc_xml = f""" """ if self._remote_use_reconnect_interval: # Set reconnect interval on resource rsc_xml += f""" """ rsc_xml += f""" """ self._add_rsc(node, rsc_xml) if not self.failed: self._remote_node_added = True def _disable_services(self, node): """Disable the corosync and pacemaker services on the given node.""" self._corosync_enabled = self._env.service_is_enabled(node, "corosync") if self._corosync_enabled: self._env.disable_service(node, "corosync") self._pacemaker_enabled = self._env.service_is_enabled(node, "pacemaker") if self._pacemaker_enabled: self._env.disable_service(node, "pacemaker") def _enable_services(self, node): """Enable the corosync and pacemaker services on the given node.""" if self._corosync_enabled: self._env.enable_service(node, "corosync") if self._pacemaker_enabled: self._env.enable_service(node, "pacemaker") def _stop_pcmk_remote(self, node): """Stop the Pacemaker Remote service on the given node.""" for _ in range(10): (rc, _) = self._rsh(node, "service pacemaker_remote stop") if rc != 0: time.sleep(6) else: break def _start_pcmk_remote(self, node): """Start the Pacemaker Remote service on the given node.""" for _ in range(10): (rc, _) = self._rsh(node, "service pacemaker_remote start") if rc != 0: time.sleep(6) else: self._pcmk_started = True break def _freeze_pcmk_remote(self, node): """Simulate a Pacemaker Remote daemon failure.""" - self._rsh(node, "killall -STOP pacemaker-remoted") + Process(self._cm, "pacemaker-remoted").signal("STOP", node) def _resume_pcmk_remote(self, node): """Simulate the Pacemaker Remote daemon recovering.""" - self._rsh(node, "killall -CONT pacemaker-remoted") + Process(self._cm, "pacemaker-remoted").signal("CONT", node) def _start_metal(self, node): """ Set up a Pacemaker Remote configuration. Remove any existing connection resources or nodes. Start the pacemaker_remote service. Create a connection resource. """ # Cluster nodes are reused as remote nodes in remote tests. 
If cluster # services were enabled at boot, in case the remote node got fenced, the # cluster node would join instead of the expected remote one. Meanwhile # pacemaker_remote would not be able to start. Depending on the chances, # the situations might not be able to be orchestrated gracefully any more. # # Temporarily disable any enabled cluster serivces. self._disable_services(node) # make sure the resource doesn't already exist for some reason self._rsh(node, f"crm_resource -D -r {self._remote_rsc} -t primitive") self._rsh(node, f"crm_resource -D -r {self._remote_node} -t primitive") if not self._stop(node): self.fail(f"Failed to shutdown cluster node {node}") return self._start_pcmk_remote(node) if not self._pcmk_started: self.fail(f"Failed to start pacemaker_remote on node {node}") return # Convert node to baremetal now that it has shutdown the cluster stack pats = [] watch = self.create_watch(pats, 120) watch.set_watch() pats.extend([ - self.templates["Pat:RscOpOK"] % ("start", self._remote_node), - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:RscOpOK"] % ("start", self._remote_node), + self._cm.templates["Pat:DC_IDLE"] ]) self._add_connection_rsc(node) with Timer(self._logger, self.name, "remoteMetalInit"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") def migrate_connection(self, node): """Move the remote connection resource to any other available node.""" if self.failed: return pats = [ - self.templates["Pat:RscOpOK"] % ("migrate_to", self._remote_node), - self.templates["Pat:RscOpOK"] % ("migrate_from", self._remote_node), - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:RscOpOK"] % ("migrate_to", self._remote_node), + self._cm.templates["Pat:RscOpOK"] % ("migrate_from", self._remote_node), + self._cm.templates["Pat:DC_IDLE"] ] watch = self.create_watch(pats, 120) watch.set_watch() (rc, _) = self._rsh(node, f"crm_resource -M -r {self._remote_node}", verbose=1) if rc != 0: self.fail("failed 
to move remote node connection resource") return with Timer(self._logger, self.name, "remoteMetalMigrate"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") def fail_rsc(self, node): """ Cause the dummy resource running on a Pacemaker Remote node to fail. Verify that the failure is logged correctly. """ if self.failed: return watchpats = [ - self.templates["Pat:RscRemoteOpOK"] % ("stop", self._remote_rsc, self._remote_node), - self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:RscRemoteOpOK"] % ("stop", self._remote_rsc, self._remote_node), + self._cm.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), + self._cm.templates["Pat:DC_IDLE"] ] watch = self.create_watch(watchpats, 120) watch.set_watch() self.debug("causing dummy rsc to fail.") self._rsh(node, "rm -f /var/run/resource-agents/Dummy*") with Timer(self._logger, self.name, "remoteRscFail"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns during rsc fail: {watch.unmatched}") def fail_connection(self, node): """ Cause the remote connection resource to fail. Verify that the node is fenced and the connection resource is restarted on another node. """ if self.failed: return watchpats = [ - self.templates["Pat:Fencing_ok"] % self._remote_node, - self.templates["Pat:NodeFenced"] % self._remote_node + self._cm.templates["Pat:Fencing_ok"] % self._remote_node, + self._cm.templates["Pat:NodeFenced"] % self._remote_node ] watch = self.create_watch(watchpats, 120) watch.set_watch() # freeze the pcmk remote daemon. 
this will result in fencing self.debug("Force stopped active remote node") self._freeze_pcmk_remote(node) self.debug("Waiting for remote node to be fenced.") with Timer(self._logger, self.name, "remoteMetalFence"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") return self.debug("Waiting for the remote node to come back up") self._cm.ns.wait_for_node(node, 120) pats = [] watch = self.create_watch(pats, 240) watch.set_watch() - pats.append(self.templates["Pat:RscOpOK"] % ("start", self._remote_node)) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("start", self._remote_node)) if self._remote_rsc_added: - pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node)) + pats.append(self._cm.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node)) # start the remote node again watch it integrate back into cluster. self._start_pcmk_remote(node) if not self._pcmk_started: self.fail(f"Failed to start pacemaker_remote on node {node}") return self.debug("Waiting for remote node to rejoin cluster after being fenced.") with Timer(self._logger, self.name, "remoteMetalRestart"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") def _add_dummy_rsc(self, node): """Add a dummy resource that runs on the Pacemaker Remote node.""" if self.failed: return # verify we can put a resource on the remote node pats = [] watch = self.create_watch(pats, 120) watch.set_watch() pats.extend([ - self.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:RscRemoteOpOK"] % ("start", self._remote_rsc, self._remote_node), + self._cm.templates["Pat:DC_IDLE"] ]) # Add a resource that must live on remote-node self._add_primitive_rsc(node) # force that rsc to prefer the remote node. 
(rc, _) = self._cm.rsh(node, f"crm_resource -M -r {self._remote_rsc} -N {self._remote_node} -f", verbose=1) if rc != 0: self.fail("Failed to place remote resource on remote node.") return with Timer(self._logger, self.name, "remoteMetalRsc"): watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") def test_attributes(self, node): """Verify that attributes can be set on the Pacemaker Remote node.""" if self.failed: return # This verifies permanent attributes can be set on a remote-node. It also # verifies the remote-node can edit its own cib node section remotely. (rc, line) = self._cm.rsh(node, f"crm_attribute -l forever -n testattr -v testval -N {self._remote_node}", verbose=1) if rc != 0: self.fail(f"Failed to set remote-node attribute. rc:{rc} output:{line}") return (rc, _) = self._cm.rsh(node, f"crm_attribute -l forever -n testattr -q -N {self._remote_node}", verbose=1) if rc != 0: self.fail("Failed to get remote-node attribute") return (rc, _) = self._cm.rsh(node, f"crm_attribute -l forever -n testattr -D -N {self._remote_node}", verbose=1) if rc != 0: self.fail("Failed to delete remote-node attribute") def cleanup_metal(self, node): """ Clean up the Pacemaker Remote node configuration previously created by _setup_metal. Stop and remove dummy resources and connection resources. Stop the pacemaker_remote service. Remove the remote node itself. 
""" self._enable_services(node) if not self._pcmk_started: return pats = [] watch = self.create_watch(pats, 120) watch.set_watch() if self._remote_rsc_added: - pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._remote_rsc)) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("stop", self._remote_rsc)) if self._remote_node_added: - pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._remote_node)) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("stop", self._remote_node)) with Timer(self._logger, self.name, "remoteMetalCleanup"): self._resume_pcmk_remote(node) if self._remote_rsc_added: # Remove dummy resource added for remote node tests self.debug("Cleaning up dummy rsc put on remote node") self._rsh(self._get_other_node(node), f"crm_resource -U -r {self._remote_rsc}") self._del_rsc(node, self._remote_rsc) if self._remote_node_added: # Remove remote node's connection resource self.debug("Cleaning up remote node connection resource") self._rsh(self._get_other_node(node), f"crm_resource -U -r {self._remote_node}") self._del_rsc(node, self._remote_node) watch.look_for_all() if watch.unmatched: self.fail(f"Unmatched patterns: {watch.unmatched}") self._stop_pcmk_remote(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self._remote_node_added: # Remove remote node itself self.debug("Cleaning up node entry for remote node") self._rsh(self._get_other_node(node), f"crm_node --force --remove {self._remote_node}") def _setup_env(self, node): """ Set up the environment to allow Pacemaker Remote to function. This involves generating a key and copying it to all nodes in the cluster. """ self._remote_node = f"remote-{node}" # we are assuming if all nodes have a key, that it is # the right key... If any node doesn't have a remote # key, we regenerate it everywhere. 
if self._rsh.exists_on_all("/etc/pacemaker/authkey", self._env["nodes"]): return # create key locally (handle, keyfile) = tempfile.mkstemp(".cts") os.close(handle) subprocess.check_call(["dd", "if=/dev/urandom", f"of={keyfile}", "bs=4096", "count=1"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # sync key throughout the cluster for n in self._env["nodes"]: self._rsh(n, "mkdir -p --mode=0750 /etc/pacemaker") self._rsh.copy(keyfile, f"root@{n}:/etc/pacemaker/authkey") self._rsh(n, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey") self._rsh(n, "chmod 0640 /etc/pacemaker/authkey") os.unlink(keyfile) def is_applicable(self): """Return True if this test is applicable in the current test configuration.""" if not CTSTest.is_applicable(self): return False for node in self._env["nodes"]: (rc, _) = self._rsh(node, "which pacemaker-remoted >/dev/null 2>&1") if rc != 0: return False return True def start_new_test(self, node): """Prepare a remote test for running by setting up its environment and resources.""" self.incr("calls") self.reset() ret = self._startall(None) if not ret: return self.failure("setup failed: could not start all nodes") self._setup_env(node) self._start_metal(node) self._add_dummy_rsc(node) return True def __call__(self, node): """Perform this test.""" raise NotImplementedError @property def errors_to_ignore(self): """Return list of errors which should be ignored.""" return [ r"""is running on remote.*which isn't allowed""", r"""Connection terminated""", r"""Could not send remote""" ] diff --git a/python/pacemaker/_cts/tests/remotestonithd.py b/python/pacemaker/_cts/tests/remotestonithd.py index 624b802f46..058949cafa 100644 --- a/python/pacemaker/_cts/tests/remotestonithd.py +++ b/python/pacemaker/_cts/tests/remotestonithd.py @@ -1,53 +1,55 @@ """Fail the connection resource and fence the remote node.""" __all__ = ["RemoteStonithd"] -__copyright__ = "Copyright 2000-2024 the Pacemaker project contributors" +__copyright__ = "Copyright 
2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.remotedriver import RemoteDriver class RemoteStonithd(RemoteDriver): """Fail the connection resource and fence the remote node.""" def __init__(self, cm): """ Create a new RemoteStonithd instance. Arguments: cm -- A ClusterManager instance """ RemoteDriver.__init__(self, cm) self.name = "RemoteStonithd" def __call__(self, node): """Perform this test.""" if not self.start_new_test(node): return self.failure(self.fail_string) self.fail_connection(node) self.cleanup_metal(node) self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() if self.failed: return self.failure(self.fail_string) return self.success() def is_applicable(self): """Return True if this test is applicable in the current test configuration.""" - return self._env["DoFencing"] and RemoteDriver.is_applicable(self) + # pylint doesn't understand that self._env is subscriptable. 
+ # pylint: disable=unsubscriptable-object + return self._env["fencing_enabled"] and RemoteDriver.is_applicable(self) @property def errors_to_ignore(self): """Return list of errors which should be ignored.""" return [ r"Lost connection to Pacemaker Remote node", r"Software caused connection abort", r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor", r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*", r"schedulerd.*:\s+Recover\s+remote-.*\s+\(.*\)", r"error: Result of monitor operation for .* on remote-.*: Internal communication failure" ] + super().errors_to_ignore diff --git a/python/pacemaker/_cts/tests/resourcerecover.py b/python/pacemaker/_cts/tests/resourcerecover.py index 2d25900b8f..a14bab5628 100644 --- a/python/pacemaker/_cts/tests/resourcerecover.py +++ b/python/pacemaker/_cts/tests/resourcerecover.py @@ -1,171 +1,171 @@ """Fail a random resource and verify its fail count increases.""" __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.audits import AuditResource from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.starttest import StartTest from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable class ResourceRecover(CTSTest): """Fail a random resource.""" def __init__(self, cm): """ Create a new ResourceRecover instance. 
Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.benchmark = True self.name = "ResourceRecover" self._action = "asyncmon" self._interval = 0 self._rid = None self._rid_alt = None self._start = StartTest(cm) self._startall = SimulStartLite(cm) def __call__(self, node): """Perform this test.""" self.incr("calls") if not self._startall(None): return self.failure("Setup failed") # List all resources active on the node (skip test if none) resourcelist = self._cm.active_resources(node) if not resourcelist: self._logger.log(f"No active resources on {node}") return self.skipped() # Choose one resource at random rsc = self._choose_resource(node, resourcelist) if rsc is None: return self.failure(f"Could not get details of resource '{self._rid}'") if rsc.id == rsc.clone_id: self.debug(f"Failing {rsc.id}") else: self.debug(f"Failing {rsc.id} (also known as {rsc.clone_id})") # Log patterns to watch for (failure, plus restart if managed) pats = [ - self.templates["Pat:CloneOpFail"] % (self._action, rsc.id, rsc.clone_id) + self._cm.templates["Pat:CloneOpFail"] % (self._action, rsc.id, rsc.clone_id) ] if rsc.managed: - pats.append(self.templates["Pat:RscOpOK"] % ("stop", self._rid)) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("stop", self._rid)) if rsc.unique: - pats.append(self.templates["Pat:RscOpOK"] % ("start", self._rid)) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("start", self._rid)) else: # Anonymous clones may get restarted with a different clone number - pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*")) + pats.append(self._cm.templates["Pat:RscOpOK"] % ("start", ".*")) # Fail resource. (Ideally, we'd fail it twice, to ensure the fail count # is incrementing properly, but it might restart on a different node. # We'd have to temporarily ban it from all other nodes and ensure the # migration-threshold hasn't been reached.) 
if self._fail_resource(rsc, node, pats) is None: # self.failure() already called return None return self.success() def _choose_resource(self, node, resourcelist): """Choose a random resource to target.""" self._rid = self._env.random_gen.choice(resourcelist) self._rid_alt = self._rid (_, lines) = self._rsh(node, "crm_resource -c", verbose=1) for line in lines: if line.startswith("Resource: "): rsc = AuditResource(self._cm, line) if rsc.id == self._rid: # Handle anonymous clones that get renamed self._rid = rsc.clone_id return rsc return None def _get_failcount(self, node): """Check the fail count of targeted resource on given node.""" cmd = "crm_failcount --quiet --query --resource %s --operation %s --interval %d --node %s" (rc, lines) = self._rsh(node, cmd % (self._rid, self._action, self._interval, node), verbose=1) if rc != 0 or len(lines) != 1: lines = [line.strip() for line in lines] s = " // ".join(lines) self._logger.log(f"crm_failcount on {node} failed ({rc}): {s}") return -1 try: failcount = int(lines[0]) except (IndexError, ValueError): s = " ".join(lines) self._logger.log(f"crm_failcount output on {node} unparseable: {s}") return -1 return failcount def _fail_resource(self, rsc, node, pats): """Fail the targeted resource, and verify as expected.""" orig_failcount = self._get_failcount(node) watch = self.create_watch(pats, 60) watch.set_watch() self._rsh(node, f"crm_resource -V -F -r {self._rid} -H {node} &>/dev/null") with Timer(self._logger, self.name, "recover"): watch.look_for_all() self._cm.cluster_stable() recovered = self._cm.resource_location(self._rid) if watch.unmatched: return self.failure(f"Patterns not found: {watch.unmatched!r}") if rsc.unique and len(recovered) > 1: return self.failure(f"{self._rid} is now active on more than one node: {recovered!r}") if recovered: self.debug(f"{self._rid} is running on: {recovered!r}") elif rsc.managed: return self.failure(f"{self._rid} was not recovered and is inactive") new_failcount = 
self._get_failcount(node) if new_failcount != orig_failcount + 1: return self.failure(f"{self._rid} fail count is {new_failcount} not {orig_failcount + 1}") # Anything but None is success return 0 @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [ f"Updating failcount for {self._rid}", fr"schedulerd.*: Recover\s+({self._rid}|{self._rid_alt})\s+\(.*\)", r"Unknown operation: fail", - self.templates["Pat:RscOpOK"] % (self._action, self._rid), + self._cm.templates["Pat:RscOpOK"] % (self._action, self._rid), f"(ERROR|error).*: Action {self._rid}_{self._action}_{self._interval} .* initiated outside of a transition", ] diff --git a/python/pacemaker/_cts/tests/simulstartlite.py b/python/pacemaker/_cts/tests/simulstartlite.py index a327e39d1b..fb9bdef26e 100644 --- a/python/pacemaker/_cts/tests/simulstartlite.py +++ b/python/pacemaker/_cts/tests/simulstartlite.py @@ -1,128 +1,128 @@ """Simultaneously start stopped nodes.""" __all__ = ["SimulStartLite"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object class SimulStartLite(CTSTest): """ A pseudo-test that sets up conditions before running some other test. This class starts any stopped nodes more or less simultaneously. Other test classes should not use this one as a superclass. """ def __init__(self, cm): """ Create a new SimulStartLite instance. 
Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "SimulStartLite" def __call__(self, dummy): """Return whether starting all stopped nodes more or less simultaneously succeeds.""" self.incr("calls") self.debug(f"Setup: {self.name}") # We ignore the "node" parameter... node_list = [] for node in self._env["nodes"]: if self._cm.expected_status[node] == "down": self.incr("WasStopped") node_list.append(node) self.set_timer() while len(node_list) > 0: # Repeat until all nodes come up - uppat = self.templates["Pat:NonDC_started"] + uppat = self._cm.templates["Pat:NonDC_started"] if self._cm.upcount() == 0: - uppat = self.templates["Pat:Local_started"] + uppat = self._cm.templates["Pat:Local_started"] watchpats = [ - self.templates["Pat:DC_IDLE"] + self._cm.templates["Pat:DC_IDLE"] ] for node in node_list: watchpats.extend([uppat % node, - self.templates["Pat:InfraUp"] % node, - self.templates["Pat:PacemakerUp"] % node]) + self._cm.templates["Pat:InfraUp"] % node, + self._cm.templates["Pat:PacemakerUp"] % node]) # Start all the nodes - at about the same time... 
- watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) + watch = self.create_watch(watchpats, self._env["dead_time"] + 10) watch.set_watch() stonith = self._cm.prepare_fencing_watcher() for node in node_list: self._cm.start_cm_async(node) watch.look_for_all() node_list = self._cm.fencing_cleanup(self.name, stonith) if node_list is None: return self.failure("Cluster did not stabilize") # Remove node_list messages from watch.unmatched for node in node_list: self._logger.debug(f"Dealing with stonith operations for {node_list}") if watch.unmatched: try: watch.unmatched.remove(uppat % node) except ValueError: self.debug(f"Already matched: {uppat % node}") try: - watch.unmatched.remove(self.templates["Pat:InfraUp"] % node) + watch.unmatched.remove(self._cm.templates["Pat:InfraUp"] % node) except ValueError: - self.debug(f"Already matched: {self.templates['Pat:InfraUp'] % node}") + self.debug(f"Already matched: {self._cm.templates['Pat:InfraUp'] % node}") try: - watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node) + watch.unmatched.remove(self._cm.templates["Pat:PacemakerUp"] % node) except ValueError: - self.debug(f"Already matched: {self.templates['Pat:PacemakerUp'] % node}") + self.debug(f"Already matched: {self._cm.templates['Pat:PacemakerUp'] % node}") if watch.unmatched: for regex in watch.unmatched: self._logger.log(f"Warn: Startup pattern not found: {regex}") if not self._cm.cluster_stable(): return self.failure("Cluster did not stabilize") did_fail = False unstable = [] for node in self._env["nodes"]: if not self._cm.stat_cm(node): did_fail = True unstable.append(node) if did_fail: return self.failure(f"Unstarted nodes exist: {unstable}") unstable = [] for node in self._env["nodes"]: if not self._cm.node_stable(node): did_fail = True unstable.append(node) if did_fail: return self.failure(f"Unstable cluster nodes exist: {unstable}") return self.success() def is_applicable(self): """Return True if this test is applicable in the current test 
configuration.""" return False diff --git a/python/pacemaker/_cts/tests/simulstoplite.py b/python/pacemaker/_cts/tests/simulstoplite.py index 1bb8ddc5a0..232fc58203 100644 --- a/python/pacemaker/_cts/tests/simulstoplite.py +++ b/python/pacemaker/_cts/tests/simulstoplite.py @@ -1,86 +1,86 @@ """Simultaneously stop running nodes.""" __all__ = ["SimulStopLite"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object class SimulStopLite(CTSTest): """ A pseudo-test that sets up conditions before running some other test. This class stops any running nodes more or less simultaneously. It can be used both to set up a test or to clean up a test. Other test classes should not use this one as a superclass. """ def __init__(self, cm): """ Create a new SimulStopLite instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "SimulStopLite" def __call__(self, dummy): """Return whether stopping all running nodes more or less simultaneously succeeds.""" self.incr("calls") self.debug(f"Setup: {self.name}") # We ignore the "node" parameter... 
watchpats = [] for node in self._env["nodes"]: if self._cm.expected_status[node] == "up": self.incr("WasStarted") - watchpats.append(self.templates["Pat:We_stopped"] % node) + watchpats.append(self._cm.templates["Pat:We_stopped"] % node) if len(watchpats) == 0: return self.success() # Stop all the nodes - at about the same time... - watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) + watch = self.create_watch(watchpats, self._env["dead_time"] + 10) watch.set_watch() self.set_timer() for node in self._env["nodes"]: if self._cm.expected_status[node] == "up": self._cm.stop_cm_async(node) if watch.look_for_all(): # Make sure they're completely down with no residule for node in self._env["nodes"]: - self._rsh(node, self.templates["StopCmd"]) + self._rsh(node, self._cm.templates["StopCmd"]) return self.success() did_fail = False up_nodes = [] for node in self._env["nodes"]: if self._cm.stat_cm(node): did_fail = True up_nodes.append(node) if did_fail: return self.failure(f"Active nodes exist: {up_nodes}") self._logger.log(f"Warn: All nodes stopped but CTS didn't detect: {watch.unmatched}") return self.failure(f"Missing log message: {watch.unmatched}") def is_applicable(self): """Return True if this test is applicable in the current test configuration.""" return False diff --git a/python/pacemaker/_cts/tests/splitbraintest.py b/python/pacemaker/_cts/tests/splitbraintest.py index e020f4263e..102ac156b3 100644 --- a/python/pacemaker/_cts/tests/splitbraintest.py +++ b/python/pacemaker/_cts/tests/splitbraintest.py @@ -1,211 +1,211 @@ """Create a split brain cluster and verify a resource is multiply managed.""" __all__ = ["SplitBrainTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import time from pacemaker._cts.input import should_continue from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import 
SimulStartLite from pacemaker._cts.tests.starttest import StartTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object class SplitBrainTest(CTSTest): """ Create a split brain cluster. This test verifies that one node in each partition takes over the resource, resulting in two nodes running the same resource. """ def __init__(self, cm): """ Create a new SplitBrainTest instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) - self.is_experimental = True + self.is_unsafe = True self.name = "SplitBrain" self._start = StartTest(cm) self._startall = SimulStartLite(cm) def _isolate_partition(self, partition): """Create a new partition containing the given nodes.""" other_nodes = self._env["nodes"].copy() for node in partition: try: other_nodes.remove(node) except ValueError: self._logger.log(f"Node {node} not in {self._env['nodes']!r} from {partition!r}") if not other_nodes: return self.debug(f"Creating partition: {partition!r}") self.debug(f"Everyone else: {other_nodes!r}") for node in partition: if not self._cm.isolate_node(node, other_nodes): self._logger.log(f"Could not isolate {node}") return def _heal_partition(self, partition): """Move the given nodes out of their own partition back into the cluster.""" other_nodes = self._env["nodes"].copy() for node in partition: try: other_nodes.remove(node) except ValueError: self._logger.log(f"Node {node} not in {self._env['nodes']!r}") if len(other_nodes) == 0: return self.debug(f"Healing partition: {partition!r}") self.debug(f"Everyone else: {other_nodes!r}") for node in partition: 
self._cm.unisolate_node(node, other_nodes) def __call__(self, node): """Perform this test.""" self.incr("calls") self.passed = True partitions = {} if not self._startall(None): return self.failure("Setup failed") while True: # Retry until we get multiple partitions partitions = {} p_max = len(self._env["nodes"]) for n in self._env["nodes"]: p = self._env.random_gen.randint(1, p_max) if p not in partitions: partitions[p] = [] partitions[p].append(n) p_max = len(partitions) if p_max > 1: break # else, try again self.debug(f"Created {p_max} partitions") for (key, val) in partitions.items(): self.debug(f"Partition[{key}]:\t{val!r}") # Disabling STONITH to reduce test complexity for now self._rsh(node, "crm_attribute -V -n stonith-enabled -v false") for val in partitions.values(): self._isolate_partition(val) count = 30 while count > 0: if len(self._cm.find_partitions()) != p_max: time.sleep(10) else: break else: self.failure("Expected partitions were not created") # Target number of partitions formed - wait for stability if not self._cm.cluster_stable(): self.failure("Partitioned cluster not stable") # Now audit the cluster state self._cm.partitions_expected = p_max if not self.audit(): self.failure("Audits failed") self._cm.partitions_expected = 1 # And heal them again for val in partitions.values(): self._heal_partition(val) # Wait for a single partition to form count = 30 while count > 0: if len(self._cm.find_partitions()) != 1: time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not reform") # Wait for it to have the right number of members count = 30 while count > 0: members = [] partitions = self._cm.find_partitions() if partitions: members = partitions[0].split() if len(members) != len(self._env["nodes"]): time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not completely reform") # Wait up to 20 minutes - the delay is more preferable than # trying to continue with in a messed up state if not self._cm.cluster_stable(1200): 
self.failure("Reformed cluster not stable") if not should_continue(self._env): raise ValueError("Reformed cluster not stable") # Turn fencing back on - if self._env["DoFencing"]: + if self._env["fencing_enabled"]: self._rsh(node, "crm_attribute -V -D -n stonith-enabled") self._cm.cluster_stable() if self.passed: return self.success() return self.failure("See previous errors") @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [ r"Another DC detected:", r"(ERROR|error).*: .*Application of an update diff failed", r"pacemaker-controld.*:.*not in our membership list", r"CRIT:.*node.*returning after partition" ] def is_applicable(self): """Return True if this test is applicable in the current test configuration.""" if not CTSTest.is_applicable(self): return False return len(self._env["nodes"]) > 2 diff --git a/python/pacemaker/_cts/tests/standbytest.py b/python/pacemaker/_cts/tests/standbytest.py index 7992dd0bc4..e70e5febcc 100644 --- a/python/pacemaker/_cts/tests/standbytest.py +++ b/python/pacemaker/_cts/tests/standbytest.py @@ -1,106 +1,106 @@ """Put a node into standby mode and check that resources migrate.""" __all__ = ["StandbyTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.tests.starttest import StartTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._env is subscriptable. 
# pylint: disable=unsubscriptable-object class StandbyTest(CTSTest): """Put a node into standby and check that resources migrate away from it.""" def __init__(self, cm): """ Create a new StandbyTest instance. Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.benchmark = True self.name = "Standby" self._start = StartTest(cm) self._startall = SimulStartLite(cm) # make sure the node is active # set the node to standby mode # check resources, none resource should be running on the node # set the node to active mode # check resources, resources should have been migrated back (SHOULD THEY?) def __call__(self, node): """Perform this test.""" self.incr("calls") ret = self._startall(None) if not ret: return self.failure("Start all nodes failed") self.debug(f"Make sure node {node} is active") if self._cm.in_standby_mode(node): if not self._cm.set_standby_mode(node, False): return self.failure(f"can't set node {node} to active mode") self._cm.cluster_stable() if self._cm.in_standby_mode(node): return self.failure(f"standby status of {node} is [on] but we expect [off]") watchpats = [ r"State transition .* -> S_POLICY_ENGINE", ] - watch = self.create_watch(watchpats, self._env["DeadTime"] + 10) + watch = self.create_watch(watchpats, self._env["dead_time"] + 10) watch.set_watch() self.debug(f"Setting node {node} to standby mode") if not self._cm.set_standby_mode(node, True): return self.failure(f"can't set node {node} to standby mode") self.set_timer("on") ret = watch.look_for_all() if not ret: self._logger.log(f"Patterns not found: {watch.unmatched!r}") self._cm.set_standby_mode(node, False) return self.failure(f"cluster didn't react to standby change on {node}") self._cm.cluster_stable() if not self._cm.in_standby_mode(node): return self.failure(f"standby status of {node} is [off] but we expect [on]") self.log_timer("on") self.debug("Checking resources") rscs_on_node = self._cm.active_resources(node) if rscs_on_node: rc = self.failure(f"{node} set 
to standby, {rscs_on_node!r} is still running on it") self.debug(f"Setting node {node} to active mode") self._cm.set_standby_mode(node, False) return rc self.debug(f"Setting node {node} to active mode") if not self._cm.set_standby_mode(node, False): return self.failure(f"can't set node {node} to active mode") self.set_timer("off") self._cm.cluster_stable() if self._cm.in_standby_mode(node): return self.failure(f"standby status of {node} is [on] but we expect [off]") self.log_timer("off") return self.success() diff --git a/python/pacemaker/_cts/tests/stonithdtest.py b/python/pacemaker/_cts/tests/stonithdtest.py index c2e59f80bc..facc0133d5 100644 --- a/python/pacemaker/_cts/tests/stonithdtest.py +++ b/python/pacemaker/_cts/tests/stonithdtest.py @@ -1,141 +1,134 @@ """Fence a running node and wait for it to restart.""" __all__ = ["StonithdTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker.exitstatus import ExitStatus from pacemaker._cts.tests.ctstest import CTSTest from pacemaker._cts.tests.simulstartlite import SimulStartLite from pacemaker._cts.timer import Timer # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object class StonithdTest(CTSTest): """Fence a running node and wait for it to restart.""" def __init__(self, cm): """ Create a new StonithdTest instance. 
Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.benchmark = True self.name = "Stonithd" self._startall = SimulStartLite(cm) def __call__(self, node): """Perform this test.""" self.incr("calls") if len(self._env["nodes"]) < 2: return self.skipped() ret = self._startall(None) if not ret: return self.failure("Setup failed") watchpats = [ - self.templates["Pat:Fencing_ok"] % node, - self.templates["Pat:NodeFenced"] % node, + self._cm.templates["Pat:Fencing_ok"] % node, + self._cm.templates["Pat:NodeFenced"] % node, ] if not self._env["at-boot"]: self.debug(f"Expecting {node} to stay down") self._cm.expected_status[node] = "down" else: self.debug(f"Expecting {node} to come up again {self._env['at-boot']}") watchpats.extend([ f"{node}.* S_STARTING -> S_PENDING", f"{node}.* S_PENDING -> S_NOT_DC", ]) - watch = self.create_watch(watchpats, 30 + self._env["DeadTime"] + self._env["StableTime"] + self._env["StartTime"]) + watch = self.create_watch(watchpats, + 30 + self._env["dead_time"] + self._env["stable_time"] + self._env["start_time"]) watch.set_watch() origin = self._env.random_gen.choice(self._env["nodes"]) (rc, _) = self._rsh(origin, f"stonith_admin --reboot {node} -VVVVVV") if rc == ExitStatus.TIMEOUT: # Look for the patterns, usually this means the required # device was running on the node to be fenced - or that # the required devices were in the process of being loaded # and/or moved # # Effectively the node committed suicide so there will be # no confirmation, but pacemaker should be watching and # fence the node again self._logger.log(f"Fencing command on {origin} to fence {node} timed out") elif origin != node and rc != 0: self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() self.debug("Waiting for fenced node to come back up") self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) self._logger.log(f"Fencing command on {origin} failed to fence {node} (rc={rc})") elif origin == node and rc != 255: # 255 == 
broken pipe, ie. the node was fenced as expected self._logger.log(f"Locally originated fencing returned {rc}") with Timer(self._logger, self.name, "fence"): matched = watch.look_for_all() self.set_timer("reform") if watch.unmatched: self._logger.log(f"Patterns not found: {watch.unmatched!r}") self.debug("Waiting for the cluster to recover") self._cm.cluster_stable() self.debug("Waiting for fenced node to come back up") self._cm.ns.wait_for_all_nodes(self._env["nodes"], 600) self.debug("Waiting for the cluster to re-stabilize with all nodes") - is_stable = self._cm.cluster_stable(self._env["StartTime"]) + is_stable = self._cm.cluster_stable(self._env["start_time"]) if not matched: return self.failure("Didn't find all expected patterns") if not is_stable: return self.failure("Cluster did not become stable") self.log_timer("reform") return self.success() @property def errors_to_ignore(self): """Return a list of errors which should be ignored.""" return [ - self.templates["Pat:Fencing_start"] % ".*", - self.templates["Pat:Fencing_ok"] % ".*", - self.templates["Pat:Fencing_active"], + self._cm.templates["Pat:Fencing_start"] % ".*", + self._cm.templates["Pat:Fencing_ok"] % ".*", + self._cm.templates["Pat:Fencing_active"], r"error.*: Operation 'reboot' targeting .* by .* for stonith_admin.*: Timer expired" ] def is_applicable(self): """Return True if this test is applicable in the current test configuration.""" - if not CTSTest.is_applicable(self): - return False - - # pylint gets confused because of EnvFactory here. 
- # pylint: disable=unsupported-membership-test - if "DoFencing" in self._env: - return self._env["DoFencing"] - - return True + return self._env["fencing_enabled"] and CTSTest.is_applicable(self) diff --git a/python/pacemaker/_cts/tests/stoptest.py b/python/pacemaker/_cts/tests/stoptest.py index c4d9b559a2..0588f4897b 100644 --- a/python/pacemaker/_cts/tests/stoptest.py +++ b/python/pacemaker/_cts/tests/stoptest.py @@ -1,97 +1,97 @@ """Stop the cluster manager on a given node.""" __all__ = ["StopTest"] __copyright__ = "Copyright 2000-2025 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" from pacemaker._cts.tests.ctstest import CTSTest # Disable various pylint warnings that occur in so many places throughout this # file it's easiest to just take care of them globally. This does introduce the # possibility that we'll miss some other cause of the same warning, but we'll # just have to be careful. # pylint doesn't understand that self._rsh is callable. # pylint: disable=not-callable # pylint doesn't understand that self._env is subscriptable. # pylint: disable=unsubscriptable-object class StopTest(CTSTest): """ A pseudo-test that sets up conditions before running some other test. This class stops the cluster manager on a given node. Other test classes should not use this one as a superclass. """ def __init__(self, cm): """ Create a new StopTest instance. 
Arguments: cm -- A ClusterManager instance """ CTSTest.__init__(self, cm) self.name = "Stop" def __call__(self, node): """Stop the given node, returning whether this succeeded or not.""" self.incr("calls") if self._cm.expected_status[node] != "up": return self.skipped() # Technically we should always be able to notice ourselves stopping patterns = [ - self.templates["Pat:We_stopped"] % node, + self._cm.templates["Pat:We_stopped"] % node, ] # Any active node needs to notice this one left # (note that this won't work if we have multiple partitions) for other in self._env["nodes"]: if self._cm.expected_status[other] == "up" and other != node: - patterns.append(self.templates["Pat:They_stopped"] % (other, node)) + patterns.append(self._cm.templates["Pat:They_stopped"] % (other, node)) - watch = self.create_watch(patterns, self._env["DeadTime"]) + watch = self.create_watch(patterns, self._env["dead_time"]) watch.set_watch() if node == self._cm.our_node: self.incr("us") else: if self._cm.upcount() <= 1: self.incr("all") else: self.incr("them") self._cm.stop_cm(node) watch.look_for_all() failreason = None unmatched_str = "||" if watch.unmatched: (_, output) = self._rsh(node, "/bin/ps axf", verbose=1) for line in output: self.debug(line) (_, output) = self._rsh(node, "/usr/sbin/dlm_tool dump 2>/dev/null", verbose=1) for line in output: self.debug(line) for regex in watch.unmatched: self._logger.log(f"ERROR: Shutdown pattern not found: {regex}") unmatched_str += f"{regex}||" failreason = "Missing shutdown pattern" - self._cm.cluster_stable(self._env["DeadTime"]) + self._cm.cluster_stable(self._env["dead_time"]) if not watch.unmatched or self._cm.upcount() == 0: return self.success() if len(watch.unmatched) >= self._cm.upcount(): return self.failure(f"no match against ({unmatched_str})") if failreason is None: return self.success() return self.failure(failreason) diff --git a/tools/Makefile.am b/tools/Makefile.am index 5fb1f8c75e..067b856836 100644 --- a/tools/Makefile.am 
+++ b/tools/Makefile.am @@ -1,160 +1,158 @@ # # Copyright 2004-2025 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # include $(top_srcdir)/mk/common.mk include $(top_srcdir)/mk/man.mk if BUILD_SYSTEMD systemdsystemunit_DATA = crm_mon.service endif noinst_HEADERS = crm_mon.h \ crm_resource.h pcmkdir = $(datadir)/$(PACKAGE) pcmk_DATA = report.common \ report.collector sbin_SCRIPTS = crm_report \ crm_standby \ crm_master \ crm_failcount -noinst_SCRIPTS = cluster-clean \ - cluster-helper \ - pcmk_simtimes +noinst_SCRIPTS = pcmk_simtimes EXTRA_DIST = $(wildcard *.inc) \ fix-manpages sbin_PROGRAMS = attrd_updater \ cibadmin \ crmadmin \ crm_simulate \ crm_attribute \ crm_diff \ crm_error \ crm_mon \ crm_node \ crm_resource \ crm_rule \ crm_shadow \ crm_verify \ crm_ticket \ iso8601 \ stonith_admin if BUILD_CIBSECRETS sbin_PROGRAMS += cibsecret endif ## SOURCES # A few tools are just thin wrappers around crm_attribute. # This makes their help get updated when crm_attribute changes # (see mk/common.mk). 
MAN8DEPS = crm_attribute crmadmin_SOURCES = crmadmin.c crmadmin_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la crmadmin_LDADD += $(top_builddir)/lib/pengine/libpe_status.la crmadmin_LDADD += $(top_builddir)/lib/cib/libcib.la crmadmin_LDADD += $(top_builddir)/lib/common/libcrmcommon.la crm_error_SOURCES = crm_error.c crm_error_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la crm_error_LDADD += $(top_builddir)/lib/common/libcrmcommon.la cibadmin_SOURCES = cibadmin.c cibadmin_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la cibadmin_LDADD += $(top_builddir)/lib/cib/libcib.la cibadmin_LDADD += $(top_builddir)/lib/common/libcrmcommon.la if BUILD_CIBSECRETS cibsecret_SOURCES = cibsecret.c cibsecret_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la cibsecret_LDADD += $(top_builddir)/lib/cib/libcib.la cibsecret_LDADD += $(top_builddir)/lib/common/libcrmcommon.la endif crm_shadow_SOURCES = crm_shadow.c crm_shadow_LDADD = $(top_builddir)/lib/cib/libcib.la crm_shadow_LDADD += $(top_builddir)/lib/common/libcrmcommon.la crm_node_SOURCES = crm_node.c crm_node_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la crm_node_LDADD += $(top_builddir)/lib/cib/libcib.la crm_node_LDADD += $(top_builddir)/lib/common/libcrmcommon.la crm_simulate_SOURCES = crm_simulate.c crm_simulate_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la crm_simulate_LDADD += $(top_builddir)/lib/pengine/libpe_status.la crm_simulate_LDADD += $(top_builddir)/lib/cib/libcib.la crm_simulate_LDADD += $(top_builddir)/lib/common/libcrmcommon.la crm_diff_SOURCES = crm_diff.c crm_diff_LDADD = $(top_builddir)/lib/common/libcrmcommon.la crm_mon_SOURCES = crm_mon.c crm_mon_curses.c crm_mon_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la crm_mon_LDADD += $(top_builddir)/lib/pengine/libpe_status.la crm_mon_LDADD += $(top_builddir)/lib/fencing/libstonithd.la crm_mon_LDADD += $(top_builddir)/lib/cib/libcib.la crm_mon_LDADD += $(top_builddir)/lib/common/libcrmcommon.la crm_mon_LDADD += 
$(CURSES_LIBS) crm_verify_SOURCES = crm_verify.c crm_verify_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la crm_verify_LDADD += $(top_builddir)/lib/pengine/libpe_status.la crm_verify_LDADD += $(top_builddir)/lib/cib/libcib.la crm_verify_LDADD += $(top_builddir)/lib/common/libcrmcommon.la crm_attribute_SOURCES = crm_attribute.c crm_attribute_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la crm_attribute_LDADD += $(top_builddir)/lib/cib/libcib.la crm_attribute_LDADD += $(top_builddir)/lib/common/libcrmcommon.la crm_resource_SOURCES = crm_resource.c \ crm_resource_ban.c \ crm_resource_print.c \ crm_resource_runtime.c crm_resource_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la crm_resource_LDADD += $(top_builddir)/lib/pengine/libpe_status.la crm_resource_LDADD += $(top_builddir)/lib/cib/libcib.la crm_resource_LDADD += $(top_builddir)/lib/lrmd/liblrmd.la crm_resource_LDADD += $(top_builddir)/lib/fencing/libstonithd.la crm_resource_LDADD += $(top_builddir)/lib/services/libcrmservice.la crm_resource_LDADD += $(top_builddir)/lib/common/libcrmcommon.la crm_rule_SOURCES = crm_rule.c crm_rule_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la crm_rule_LDADD += $(top_builddir)/lib/pengine/libpe_status.la crm_rule_LDADD += $(top_builddir)/lib/cib/libcib.la crm_rule_LDADD += $(top_builddir)/lib/common/libcrmcommon.la iso8601_SOURCES = iso8601.c iso8601_LDADD = $(top_builddir)/lib/common/libcrmcommon.la attrd_updater_SOURCES = attrd_updater.c attrd_updater_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la attrd_updater_LDADD += $(top_builddir)/lib/common/libcrmcommon.la crm_ticket_SOURCES = crm_ticket.c crm_ticket_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la crm_ticket_LDADD += $(top_builddir)/lib/pengine/libpe_status.la crm_ticket_LDADD += $(top_builddir)/lib/cib/libcib.la crm_ticket_LDADD += $(top_builddir)/lib/common/libcrmcommon.la stonith_admin_SOURCES = stonith_admin.c stonith_admin_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la 
stonith_admin_LDADD += $(top_builddir)/lib/pengine/libpe_status.la stonith_admin_LDADD += $(top_builddir)/lib/cib/libcib.la stonith_admin_LDADD += $(top_builddir)/lib/fencing/libstonithd.la stonith_admin_LDADD += $(top_builddir)/lib/common/libcrmcommon.la CLEANFILES = $(man8_MANS) diff --git a/tools/cluster-clean.in b/tools/cluster-clean.in deleted file mode 100755 index 91a629479a..0000000000 --- a/tools/cluster-clean.in +++ /dev/null @@ -1,101 +0,0 @@ -#!@BASH_PATH@ -# -# Copyright 2011-2023 the Pacemaker project contributors -# -# The version control history for this file may have further details. -# -# This source code is licensed under the GNU General Public License version 2 -# or later (GPLv2+) WITHOUT ANY WARRANTY. -# - -hosts= -group= -kill=0 -while true; do - case "$1" in - -x) set -x; shift;; - -w) for h in $2; do - hosts="$hosts -w $h"; - done - shift; shift;; - -g) group=$2; shift; shift;; - --kill) kill=1; shift;; - --kill-only) kill=2; shift;; - "") break;; - *) echo "unknown option: $1"; exit 1;; - esac -done - -if [ x"$group" = x -a x"$hosts" = x ]; then - group=$CTS_GROUP -fi - -if [ x"$hosts" != x ]; then - echo `date` ": Cleaning up hosts:" - target=$hosts -elif [ x"$group" != x ]; then - echo `date` ": Cleaning up group: $group" - target="-g $group" -else - echo "You didn't specify any nodes to clean up" - exit 1 -fi -cluster-helper --list bullet $target - -if [ $kill != 0 ]; then - echo "Cleaning processes" - - # Bah. 
Force systemd to actually look at the process and realize it's dead - cluster-helper $target -- "service corosync stop" &> /dev/null & - cluster-helper $target -- "service pacemaker stop" &> /dev/null & - - cluster-helper $target -- "killall -q -9 corosync pacemakerd pacemaker-attrd pacemaker-based pacemaker-controld pacemaker-execd pacemaker-fenced pacemaker-remoted pacemaker-schedulerd dlm_controld gfs_controld" &> /dev/null - cluster-helper $target -- 'kill -9 `pidof valgrind`' &> /dev/null - - if [ $kill == 2 ]; then - exit 0 - fi -fi - -#logrotate -f $cluster_rotate -echo "Cleaning files" - -log_files="" -log_files="$log_files 'messages*'" -log_files="$log_files 'localmessages*'" -log_files="$log_files 'cluster*.log'" -log_files="$log_files 'corosync.log*'" -log_files="$log_files 'pacemaker.log*'" -log_files="$log_files '*.journal'" -log_files="$log_files '*.journal~'" -log_files="$log_files 'secure-*'" - -state_files="" -state_files="$state_files 'cib.xml*'" -state_files="$state_files 'valgrind-*'" -state_files="$state_files 'cib-*'" -state_files="$state_files 'core.*'" -state_files="$state_files 'cts.*'" -state_files="$state_files 'pe*.bz2'" -state_files="$state_files 'fdata-*'" - -for f in $log_files; do - cluster-helper $target -- "find /var/log -name '$f' -exec rm -f \{\} \;" -done - -for f in $state_files; do - cluster-helper $target -- "find /var/lib -name '$f' -exec rm -f \{\} \;" -done - -cluster-helper $target -- "find /dev/shm -name 'qb-*' -exec rm -f \{\} \;" -cluster-helper $target -- "find @CRM_BLACKBOX_DIR@ -name '*-*' -exec rm -f \{\} \;" -cluster-helper $target -- "find /tmp -name '*.valgrind' -exec rm -f \{\} \;" - -cluster-helper $target -- 'service rsyslog restart' > /dev/null 2>&1 -cluster-helper $target -- 'systemctl restart systemd-journald.socket' > /dev/null 2>&1 -cluster-helper $target -- logger -i -p daemon.info __clean_logs__ - -#touch $cluster_log -echo `date` ": Clean complete" - -# vim: set filetype=sh: diff --git 
a/tools/cluster-helper.in b/tools/cluster-helper.in deleted file mode 100755 index 8cee1b4cb1..0000000000 --- a/tools/cluster-helper.in +++ /dev/null @@ -1,203 +0,0 @@ -#!@BASH_PATH@ -# -# Copyright 2011-2023 the Pacemaker project contributors -# -# The version control history for this file may have further details. -# -# This source code is licensed under the GNU General Public License version 2 -# or later (GPLv2+) WITHOUT ANY WARRANTY. -# - -hosts= -group=$cluster_name -user=root -pdsh=`which pdsh 2>/dev/null` -ssh=`which qarsh 2>/dev/null` -scp=`which qacp 2>/dev/null` -command=list -format=oneline -replace="{}" - -if [ x$ssh = "x" ]; then - ssh=ssh - scp=scp -fi - -function helptext() { - echo "cluster-helper - A tool for running commands on multiple hosts" - echo "" - echo "Attempt to use pdsh, qarsh, or ssh (in that order) to execute commands" - echo "on multiple hosts" - echo "" - echo "DSH groups can be configured and specified with -g instead of listing" - echo "the individual hosts every time" - echo "" - echo "Usage: cluster-helper [options] [command]" - echo "" - echo "Options:" - echo "--ssh Force the use of ssh instead of qarsh even if it available" - echo "-g, --group Specify the group to operate on/with" - echo "-w, --host Specify a host to operate on/with. 
May be specified multiple times" - echo "-f, --format Specifiy the output format When listing hosts or group contents" - echo " Allowed values: [oneline], long, short, pdsh, bullet" - echo "" - echo "" - echo "Commands:" - echo "--list format List the contents of a group in the specified format" - echo "--add name Add supplied (-w) hosts to the named group" - echo "--create name Create the named group with the supplied (-w) hosts" - echo "--run, -- Treat all subsequent arguments as a command to perform on" - echo " the specified command on the hosts or group" - echo "--xargs Run the supplied command having replaced any occurrences" - echo " of {} with the node name" - echo "" - echo "--copy file(s) host:file Pass subsequent arguments to scp or qacp" - echo " Any occurrences of {} are replaced with the node name" - echo "--key Install an ssh key" - echo "" - exit $1 -} - -while true ; do - case "$1" in - --help|-h|-\?) helptext 0;; - -x) set -x; shift;; - --ssh) ssh="ssh"; scp="scp"; pdsh=""; shift;; - -g|--group) group="$2"; shift; shift;; - -w|--host) for h in $2; do - hosts="$hosts $h"; - done - shift; shift;; - -f|--format) format=$2; shift; shift;; - -I) replace=$2; shift; shift;; - --list|list) format=$2; command=list; shift; shift;; - --add|add) command=group-add; shift;; - --create|create) group="$2"; command=group-create; shift; shift;; - --run|run) command=run; shift;; - --copy|copy) command=copy; shift; break ;; - --key|key) command=key; shift; break ;; - --xargs) command=xargs; shift; break ;; - --) command=run; shift; break ;; - "") break;; - *) helptext 1;; - esac -done - -if [ x"$group" = x -a x"$hosts" = x ]; then - group=$CTS_GROUP -fi - -function expand() { - fmt=$1 - if [ x$group != x -a -f ~/.dsh/group/$group ]; then - hosts=`cat ~/.dsh/group/$group` - elif [ x$group != x ]; then - echo "Unknown group: $group" >&2 - exit 1 - fi - - if [ "x$hosts" != x -a $fmt = oneline ]; then - echo $hosts - - elif [ "x$hosts" != x -a $fmt = short ]; then - ( 
for h in $hosts; do - echo $h | sed 's:\..*::' - done ) | tr '\n' ' ' - echo "" - - elif [ "x$hosts" != x -a $fmt = pdsh ]; then - ( for h in $hosts; do - echo "-w $h" - done ) | tr '\n' ' ' - echo "" - - elif [ "x$hosts" != x -a $fmt = long ]; then - for h in $hosts; do - echo $h - done - - elif [ "x$hosts" != x -a $fmt = bullet ]; then - for h in $hosts; do - echo " * $h" - done - - elif [ "x$hosts" != x ]; then - echo "Unknown format: $fmt" >&2 - fi -} - -if [ $command = list ]; then - expand $format - -elif [ $command = key ]; then - hosts=`expand oneline` - for h in $hosts; do - ssh-copy-id root@$h - done - -elif [ $command = group-create ]; then - - f=`mktemp` - mkdir -p ~/.dsh/group - - if [ -f ~/.dsh/group/$group ]; then - echo "Overwriting existing group $group" - fi - - for h in $hosts; do - echo $h >> $f - done - - echo "Creating group $group in ~/.dsh/group" - sort -u $f > ~/.dsh/group/$group - rm -f $f - -elif [ $command = group-add ]; then - if [ x$group = x ]; then - echo "Please specify a group to append to" - exit 1 - fi - - f=`mktemp` - mkdir -p ~/.dsh/group - - if [ -f ~/.dsh/group/$group ]; then - cat ~/.dsh/group/$group > $f - fi - - for h in $hosts; do - echo $h >> $f - done - - echo "Appending hosts to group $group in ~/.dsh/group" - sort -u $f > ~/.dsh/group/$group - rm -f $f - -elif [ $command = run ]; then - if [ x$pdsh != x ]; then - hosts=`expand pdsh` - $pdsh -l $user $hosts -- $* - - else - hosts=`expand oneline` - for n in $hosts; do - $ssh -l $user $n -- $* < /dev/null - done - if [ x"$hosts" = x ]; then - echo "No hosts specified" - fi - fi -elif [ $command = copy ]; then - hosts=`expand oneline` - for n in $hosts; do - $scp `echo $* | sed 's@'$replace'@'$n'@'` - done - -elif [ $command = xargs ]; then - hosts=`expand oneline` - for n in $hosts; do - eval `echo $* | sed 's@'$replace'@'$n'@'` - done -fi - -# vim: set filetype=sh: diff --git a/tools/crm_report.in b/tools/crm_report.in index 1df25c044c..842d6ae927 100644 --- 
a/tools/crm_report.in +++ b/tools/crm_report.in @@ -1,481 +1,406 @@ #!/bin/sh # -# Copyright 2010-2019 the Pacemaker project contributors +# Copyright 2010-2025 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # TEMP=`@GETOPT_PATH@ \ - -o hv?xl:f:t:n:T:L:p:c:dSCu:D:MVse: \ - --long help,corosync,cts:,cts-log:,dest:,node:,nodes:,from:,to:,sos-mode,logfile:,as-directory,single-node,cluster:,user:,max-depth:,version,features,rsh: \ + -o hv?xl:f:t:n:L:p:c:dSCu:D:MVse: \ + --long help,corosync,dest:,node:,nodes:,from:,to:,sos-mode,logfile:,as-directory,single-node,cluster:,user:,max-depth:,version,features,rsh: \ -n 'crm_report' -- "$@"` # The quotes around $TEMP are essential eval set -- "$TEMP" progname=$(basename "$0") rsh="ssh -T" -tests="" nodes="" compress=1 cluster="any" ssh_user="root" search_logs=1 sos_mode=0 report_data=`dirname $0` maxdepth=5 extra_logs="" sanitize_patterns="passw.*" log_patterns="CRIT: ERROR:" usage() { cat< "$l_base/$HALOG_F" fi for node in $nodes; do cat <$l_base/.env LABEL="$label" REPORT_HOME="$r_base" REPORT_MASTER="$host" REPORT_TARGET="$node" LOG_START=$start LOG_END=$end REMOVE=1 SANITIZE="$sanitize_patterns" CLUSTER=$cluster LOG_PATTERNS="$log_patterns" EXTRA_LOGS="$extra_logs" SEARCH_LOGS=$search_logs SOS_MODE=$sos_mode verbose=$verbose maxdepth=$maxdepth EOF if [ $host = $node ]; then cat <>$l_base/.env REPORT_HOME="$l_base" EOF cat $l_base/.env $report_data/report.common $report_data/report.collector > $l_base/collector bash $l_base/collector else cat $l_base/.env $report_data/report.common $report_data/report.collector \ | $rsh -l $ssh_user $node -- "mkdir -p $r_base; cat > $r_base/collector; bash $r_base/collector" | (cd $l_base && tar mxf -) fi done analyze $l_base > $l_base/$ANALYSIS_F if [ -f $l_base/$HALOG_F ]; then node_events $l_base/$HALOG_F > 
$l_base/$EVENTS_F fi for node in $nodes; do cat $l_base/$node/$ANALYSIS_F >> $l_base/$ANALYSIS_F if [ -s $l_base/$node/$EVENTS_F ]; then cat $l_base/$node/$EVENTS_F >> $l_base/$EVENTS_F elif [ -s $l_base/$HALOG_F ]; then awk "\$4==\"$nodes\"" $l_base/$EVENTS_F >> $l_base/$n/$EVENTS_F fi done log " " if [ $compress = 1 ]; then fname=`shrink $l_base` rm -rf $l_base log "Collected results are available in $fname" log " " log "Please create a bug entry at" log " @BUG_URL@" log "Include a description of your problem and attach this tarball" log " " log "Thank you for taking time to create this report." else log "Collected results are available in $l_base" fi log " " } # # check if files have same content in the cluster # cibdiff() { d1=$(dirname $1) d2=$(dirname $2) if [ -f "$d1/RUNNING" ] && [ ! -f "$d2/RUNNING" ]; then DIFF_OK=0 elif [ -f "$d1/STOPPED" ] && [ ! -f "$d2/STOPPED" ]; then DIFF_OK=0 else DIFF_OK=1 fi if [ $DIFF_OK -eq 1 ]; then if which crm_diff > /dev/null 2>&1; then crm_diff -c -n $1 -o $2 else info "crm_diff(8) not found, cannot diff CIBs" fi else echo "can't compare cibs from running and stopped systems" fi } diffcheck() { [ -f "$1" ] || { echo "$1 does not exist" return 1 } [ -f "$2" ] || { echo "$2 does not exist" return 1 } case $(basename "$1") in $CIB_F) cibdiff $1 $2 ;; *) diff -u $1 $2 ;; esac } # # remove duplicates if files are same, make links instead # consolidate() { for n in $nodes; do if [ -f $1/$2 ]; then rm $1/$n/$2 else mv $1/$n/$2 $1 fi ln -s ../$2 $1/$n done } analyze_one() { rc=0 node0="" for n in $nodes; do if [ "$node0" ]; then diffcheck $1/$node0/$2 $1/$n/$2 rc=$(($rc+$?)) else node0=$n fi done return $rc } analyze() { flist="$MEMBERSHIP_F $CIB_F $CRM_MON_F $SYSINFO_F" for f in $flist; do printf "Diff $f... 
" ls $1/*/$f >/dev/null 2>&1 || { echo "no $1/*/$f :/" continue } if analyze_one $1 $f; then echo "OK" [ "$f" != $CIB_F ] && consolidate $1 $f else echo "" fi done } -do_cts() { - test_sets=`echo $tests | tr ',' ' '` - for test_set in $test_sets; do - - start_time=0 - start_test=`echo $test_set | tr '-' ' ' | awk '{print $1}'` - - end_time=0 - end_test=`echo $test_set | tr '-' ' ' | awk '{print $2}'` - - if [ x$end_test = x ]; then - msg="Extracting test $start_test" - label="CTS-$start_test-`date +"%b-%d-%Y"`" - end_test=`expr $start_test + 1` - else - msg="Extracting tests $start_test to $end_test" - label="CTS-$start_test-$end_test-`date +"%b-%d-%Y"`" - end_test=`expr $end_test + 1` - fi - - if [ $start_test = 0 ]; then - start_pat="BEGINNING [0-9].* TESTS" - else - start_pat="Running test.*\[ *$start_test\]" - fi - - if [ x$ctslog = x ]; then - ctslog=`findmsg 1 "$start_pat"` - - if [ x$ctslog = x ]; then - fatal "No CTS control file detected" - else - log "Using CTS control file: $ctslog" - fi - fi - - line=`grep -n "$start_pat" $ctslog | tail -1 | sed 's/:.*//'` - if [ ! -z "$line" ]; then - start_time=`linetime $ctslog $line` - fi - - line=`grep -n "Running test.*\[ *$end_test\]" $ctslog | tail -1 | sed 's/:.*//'` - if [ ! 
-z "$line" ]; then - end_time=`linetime $ctslog $line` - fi - - if [ -z "$nodes" ]; then - nodes=`grep CTS: $ctslog | grep -v debug: | grep " \* " | sed s:.*\\\*::g | sort -u | tr '\\n' ' '` - log "Calculated node list: $nodes" - fi - - if [ $end_time -lt $start_time ]; then - debug "Test didn't complete, grabbing everything up to now" - end_time=`date +%s` - fi - - if [ $start_time != 0 ];then - log "$msg (`time2str $start_time` to `time2str $end_time`)" - collect_data $label $start_time $end_time $ctslog - else - fatal "$msg failed: not found" - fi - done -} - node_names_from_xml() { awk ' /uname/ { for( i=1; i<=NF; i++ ) if( $i~/^uname=/ ) { sub("uname=.","",$i); sub("\".*","",$i); print $i; next; } } ' | tr '\n' ' ' } getnodes() { cluster="$1" # 1. Live (cluster nodes or Pacemaker Remote nodes) # TODO: This will not detect Pacemaker Remote nodes unless they # have ever had a permanent node attribute set, because it only # searches the nodes section. It should also search the config # for resources that create Pacemaker Remote nodes. cib_nodes=$(cibadmin -Q -o nodes 2>/dev/null) if [ $? -eq 0 ]; then debug "Querying CIB for nodes" echo "$cib_nodes" | node_names_from_xml return fi # 2. Saved if [ -f "@CRM_CONFIG_DIR@/cib.xml" ]; then debug "Querying on-disk CIB for nodes" grep "node " "@CRM_CONFIG_DIR@/cib.xml" | node_names_from_xml return fi # 3. logs # TODO: Look for something like crm_update_peer } if [ $compress -eq 1 ]; then require_tar fi -if [ "x$tests" != "x" ]; then - do_cts - -elif [ "x$start_time" != "x" ]; then - masterlog="" - - if [ -z "$sanitize_patterns" ]; then - log "WARNING: The tarball produced by this program may contain" - log " sensitive information such as passwords." - log "" - log "We will attempt to remove such information if you use the" - log "-p option. For example: -p \"pass.*\" -p \"user.*\"" - log "" - log "However, doing this may reduce the ability for the recipients" - log "to diagnose issues and generally provide assistance." 
- log "" - log "IT IS YOUR RESPONSIBILITY TO PROTECT SENSITIVE DATA FROM EXPOSURE" - log "" - fi +if [ "x$start_time" = "x" ]; then + fatal "Not sure what to do, no time range to extract" +fi - # If user didn't specify a cluster stack, make a best guess if possible. - if [ -z "$cluster" ] || [ "$cluster" = "any" ]; then - cluster=$(get_cluster_type) - fi +masterlog="" + +if [ -z "$sanitize_patterns" ]; then + log "WARNING: The tarball produced by this program may contain" + log " sensitive information such as passwords." + log "" + log "We will attempt to remove such information if you use the" + log "-p option. For example: -p \"pass.*\" -p \"user.*\"" + log "" + log "However, doing this may reduce the ability for the recipients" + log "to diagnose issues and generally provide assistance." + log "" + log "IT IS YOUR RESPONSIBILITY TO PROTECT SENSITIVE DATA FROM EXPOSURE" + log "" +fi - # If user didn't specify node(s), make a best guess if possible. - if [ -z "$nodes" ]; then - nodes=`getnodes $cluster` - if [ -n "$nodes" ]; then - log "Calculated node list: $nodes" - else - fatal "Cannot determine nodes; specify --nodes or --single-node" - fi - fi +# If user didn't specify a cluster stack, make a best guess if possible. +if [ -z "$cluster" ] || [ "$cluster" = "any" ]; then + cluster=$(get_cluster_type) +fi - if - echo $nodes | grep -qs $host - then - debug "We are a cluster node" +# If user didn't specify node(s), make a best guess if possible. 
+if [ -z "$nodes" ]; then + nodes=`getnodes $cluster` + if [ -n "$nodes" ]; then + log "Calculated node list: $nodes" else - debug "We are a log master" - masterlog=`findmsg 1 "pacemaker-controld\\|CTS"` + fatal "Cannot determine nodes; specify --nodes or --single-node" fi +fi - - if [ -z $end_time ]; then - end_time=`perl -e 'print time()'` - fi - label="pcmk-`date +"%a-%d-%b-%Y"`" - log "Collecting data from $nodes (`time2str $start_time` to `time2str $end_time`)" - collect_data $label $start_time $end_time $masterlog +if echo $nodes | grep -qs $host; then + debug "We are a cluster node" else - fatal "Not sure what to do, no tests or time ranges to extract" + debug "We are a log master" + masterlog=`findmsg 1 "pacemaker-controld\\|CTS"` +fi + +if [ -z $end_time ]; then + end_time=`perl -e 'print time()'` fi +label="pcmk-`date +"%a-%d-%b-%Y"`" +log "Collecting data from $nodes (`time2str $start_time` to `time2str $end_time`)" +collect_data $label $start_time $end_time $masterlog + # vim: set filetype=sh: