diff --git a/.gitignore b/.gitignore
index 8451a8f258..aa883022e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,207 +1,207 @@
# Common
\#*
.\#*
GPATH
GRTAGS
GTAGS
TAGS
Makefile
Makefile.in
.deps
.libs
*.pc
*.pyc
*.bz2
*.tar.gz
*.rpm
*.la
*.lo
*.o
*~
*.gcda
*.gcno
# Autobuild
aclocal.m4
autoconf
autoheader
autom4te.cache/
automake
build.counter
compile
config.guess
config.log
config.status
config.sub
configure
depcomp
install-sh
include/stamp-*
libtool
libtool.m4
ltdl.m4
libltdl
ltmain.sh
missing
py-compile
/m4/argz.m4
/m4/ltargz.m4
/m4/ltoptions.m4
/m4/ltsugar.m4
/m4/ltversion.m4
/m4/lt~obsolete.m4
test-driver
ylwrap
# Configure targets
Doxyfile
/cts/CTS.py
/cts/CTSlab.py
/cts/CTSvars.py
/cts/LSBDummy
/cts/OCFIPraTest.py
/cts/benchmark/clubench
/cts/cluster_test
/cts/cts
/cts/cts-cli
/cts/cts-coverage
/cts/cts-exec
/cts/cts-pengine
/cts/cts-regression
/cts/cts-stonithd
/cts/fence_dummy
/cts/lxc_autogen.sh
/cts/pacemaker-cts-dummyd
/cts/pacemaker-cts-dummyd@.service
/daemons/execd/pacemaker_remote
/daemons/execd/pacemaker_remote.service
/daemons/pacemakerd/pacemaker
/daemons/pacemakerd/pacemaker.combined.upstart
/daemons/pacemakerd/pacemaker.service
/daemons/pacemakerd/pacemaker.upstart
extra/logrotate/pacemaker
/fencing/fence_legacy
include/config.h
include/config.h.in
include/crm_config.h
publican.cfg
/tools/cibsecret
/tools/crm_error
/tools/crm_failcount
/tools/crm_master
/tools/crm_mon.service
/tools/crm_mon.upstart
/tools/crm_report
/tools/crm_standby
/tools/report.collector
/tools/report.common
# Build targets
*.7
*.7.xml
*.7.html
*.8
*.8.xml
*.8.html
doc/*/en-US/images/*.png
doc/*/tmp/**
doc/*/publish
cib/cib
cib/cibmon
cib/cibpipe
crmd/atest
crmd/crmd
/daemons/attrd/pacemaker-attrd
/daemons/execd/cts-exec-helper
/daemons/execd/pacemaker-execd
-/daemons/execd/pacemaker_remoted
+/daemons/execd/pacemaker-remoted
/daemons/pacemakerd/pacemakerd
doc/api/*
doc/Clusters_from_Scratch.txt
doc/Pacemaker_Explained.txt
doc/acls.html
doc/crm_fencing.html
doc/publican-catalog*
fencing/stonith-test
fencing/stonith_admin
fencing/stonithd
fencing/stonithd.xml
pengine/pengine
pengine/pengine.xml
pengine/ptest
scratch
tools/attrd_updater
tools/cibadmin
tools/crm_attribute
tools/crm_diff
tools/crm_mon
tools/crm_node
tools/crm_resource
tools/crm_shadow
tools/crm_simulate
tools/crm_verify
tools/crmadmin
tools/iso8601
tools/crm_ticket
tools/report.collector.1
xml/crm.dtd
xml/pacemaker*.rng
xml/versions.rng
doc/shared/en-US/*.xml
doc/Clusters_from_Scratch.build
doc/Clusters_from_Scratch/en-US/Ap-*.xml
doc/Clusters_from_Scratch/en-US/Ch-*.xml
doc/Pacemaker_Administration.build
doc/Pacemaker_Administration/en-US/Ch-*.xml
doc/Pacemaker_Development.build
doc/Pacemaker_Development/en-US/Ch-*.xml
doc/Pacemaker_Explained.build
doc/Pacemaker_Explained/en-US/Ch-*.xml
doc/Pacemaker_Explained/en-US/Ap-*.xml
doc/Pacemaker_Remote.build
doc/Pacemaker_Remote/en-US/Ch-*.xml
lib/gnu/libgnu.a
lib/gnu/stdalign.h
*.coverity
# Test detritus
/cts/.regression.failed.diff
/cts/pengine/*.ref
/cts/pengine/*.up
/cts/pengine/*.up.err
/cts/pengine/bug-rh-1097457.log
/cts/pengine/bug-rh-1097457.trs
/cts/pengine/shadow.*
/cts/test-suite.log
/xml/test-2/*.up
/xml/test-2/*.up.err
# Formerly built files (helps when jumping back and forth in checkout)
/attrd
/coverage.sh
/cts/HBDummy
/fencing/regression.py
/lrmd
/mcp
/pengine/.regression.failed.diff
/pengine/regression.core.sh
/pengine/test10/shadow.*
#Other
mock
HTML
pacemaker*.spec
coverity-*
compat_reports
.ABI-build
abi_dumps
logs
*.patch
*.diff
*.sed
*.orig
*.rej
*.swp
diff --git a/configure.ac b/configure.ac
index b8ee1129e8..9725dc4f59 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,1818 +1,1820 @@
dnl
dnl autoconf for Pacemaker
dnl
dnl Copyright 2009-2018 Andrew Beekhof <andrew@beekhof.net>
dnl
dnl This source code is licensed under the GNU General Public License version 2
dnl or later (GPLv2+) WITHOUT ANY WARRANTY.
dnl ===============================================
dnl Bootstrap
dnl ===============================================
AC_PREREQ(2.64)
AC_CONFIG_MACRO_DIR([m4])
AC_DEFUN([AC_DATAROOTDIR_CHECKED])
dnl Suggested structure:
dnl information on the package
dnl checks for programs
dnl checks for libraries
dnl checks for header files
dnl checks for types
dnl checks for structures
dnl checks for compiler characteristics
dnl checks for library functions
dnl checks for system services
m4_include([version.m4])
AC_INIT([pacemaker], VERSION_NUMBER, [users@clusterlabs.org], [pacemaker],
PCMK_URL)
PCMK_FEATURES=""
AC_CONFIG_AUX_DIR(.)
AC_CANONICAL_HOST
dnl Where #defines go (e.g. `AC_CHECK_HEADERS' below)
dnl
dnl Internal header: include/config.h
dnl - Contains ALL defines
dnl - include/config.h.in is generated automatically by autoheader
dnl - NOT to be included in any header files except crm_internal.h
dnl (which is also not to be included in any other header files)
dnl
dnl External header: include/crm_config.h
dnl - Contains a subset of defines checked here
dnl - Manually edit include/crm_config.h.in to have configure include
dnl new defines
dnl - Should not include HAVE_* defines
dnl - Safe to include anywhere
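dnl
dnl Illustrative sketch of the include discipline described above
dnl (hypothetical source files, not part of the tree):
dnl
dnl   /* internal daemon code: crm_internal.h is the only header that may
dnl    * pull in config.h */
dnl   #include <crm_internal.h>
dnl
dnl   /* external consumers stick to the public header */
dnl   #include <crm_config.h>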
AM_CONFIG_HEADER(include/config.h include/crm_config.h)
AC_ARG_WITH(version,
[ --with-version=version Override package version (if you are a packager needing to pretend) ],
[ PACKAGE_VERSION="$withval" ])
AC_ARG_WITH(pkg-name,
[ --with-pkg-name=name Override package name (if you are a packager needing to pretend) ],
[ PACKAGE_NAME="$withval" ])
dnl 1.11: minimum automake version required
dnl foreign: don't require GNU-standard top-level files
dnl silent-rules: allow "--enable-silent-rules" (no-op in 1.13+)
AM_INIT_AUTOMAKE([1.11 foreign silent-rules])
dnl Example 2.4. Silent Custom Rule to Generate a File
dnl %-bar.pc: %.pc
dnl $(AM_V_GEN)$(LN_S) $(notdir $^) $@
AC_DEFINE_UNQUOTED(PACEMAKER_VERSION, "$PACKAGE_VERSION",
[Current pacemaker version])
dnl Versioned attributes implementation is not yet production-ready
AC_DEFINE_UNQUOTED(ENABLE_VERSIONED_ATTRS, 0, [Enable versioned attributes])
PACKAGE_SERIES=`echo $PACKAGE_VERSION | awk -F. '{ print $1"."$2 }'`
AC_SUBST(PACKAGE_SERIES)
AC_SUBST(PACKAGE_VERSION)
CC_IN_CONFIGURE=yes
export CC_IN_CONFIGURE
LDD=ldd
dnl ========================================================================
dnl Compiler characteristics
dnl ========================================================================
AC_PROG_CC dnl Can force other with environment variable "CC".
AM_PROG_CC_C_O
AC_PROG_CC_STDC
gl_EARLY
gl_INIT
LT_INIT([dlopen])
LTDL_INIT([convenience])
AC_PROG_YACC
AM_PROG_LEX
AC_C_STRINGIZE
AC_TYPE_SIZE_T
AC_CHECK_SIZEOF(char)
AC_CHECK_SIZEOF(short)
AC_CHECK_SIZEOF(int)
AC_CHECK_SIZEOF(long)
AC_CHECK_SIZEOF(long long)
AC_STRUCT_TIMEZONE
dnl ===============================================
dnl Helpers
dnl ===============================================
cc_supports_flag() {
local CFLAGS="-Werror $@"
AC_MSG_CHECKING(whether $CC supports "$@")
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ ]], [[ ]])],
[RC=0; AC_MSG_RESULT(yes)],
[RC=1; AC_MSG_RESULT(no)])
return $RC
}
# Some tests need to use their own CFLAGS
cc_temp_flags() {
ac_save_CFLAGS="$CFLAGS"
CFLAGS="$*"
}
cc_restore_flags() {
CFLAGS=$ac_save_CFLAGS
}
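dnl An illustrative use of these helpers, mirroring how they are applied
dnl further below (-Wextra is just an example flag):
dnl
dnl   if cc_supports_flag -Wextra; then
dnl       CC_EXTRAS="$CC_EXTRAS -Wextra"
dnl   fi
dnl   cc_temp_flags "-Wall $WERROR"    # strict flags for one check only
dnl   dnl ... some AC_COMPILE_IFELSE test here ...
dnl   cc_restore_flags                 # restore the user's CFLAGS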
dnl ===============================================
dnl Configure Options
dnl ===============================================
dnl Some systems, like Solaris, require a custom package name
AC_ARG_WITH(pkgname,
[ --with-pkgname=name name for pkg (typically for Solaris) ],
[ PKGNAME="$withval" ],
[ PKGNAME="LXHAhb" ],
)
AC_SUBST(PKGNAME)
AC_ARG_ENABLE([ansi],
[ --enable-ansi Force GCC to compile to ANSI standard for older compilers. @<:@no@:>@])
AC_ARG_ENABLE([fatal-warnings],
[ --enable-fatal-warnings Enable pedantic and fatal warnings for gcc @<:@yes@:>@])
AC_ARG_ENABLE([quiet],
[ --enable-quiet Suppress make output unless there is an error @<:@no@:>@])
AC_ARG_ENABLE([no-stack],
[ --enable-no-stack Build only the Policy Engine and its requirements @<:@no@:>@])
AC_ARG_ENABLE([upstart],
[ --enable-upstart Enable support for managing resources via Upstart @<:@try@:>@ ],
[],
[enable_upstart=try],
)
AC_ARG_ENABLE([systemd],
[ --enable-systemd Enable support for managing resources via systemd @<:@try@:>@],
[],
[enable_systemd=try],
)
AC_ARG_ENABLE(hardening,
[ --enable-hardening Harden the resulting executables/libraries @<:@try@:>@],
[ HARDENING="${enableval}" ],
[ HARDENING=try ],
)
AC_ARG_WITH(corosync,
[ --with-corosync Support the Corosync messaging and membership layer ],
[ SUPPORT_CS=$withval ],
[ SUPPORT_CS=try ],
)
AC_ARG_WITH(nagios,
[ --with-nagios Support nagios remote monitoring ],
[ SUPPORT_NAGIOS=$withval ],
[ SUPPORT_NAGIOS=try ],
)
AC_ARG_WITH(nagios-plugin-dir,
[ --with-nagios-plugin-dir=DIR Directory for nagios plugins @<:@LIBEXECDIR/nagios/plugins@:>@],
[ NAGIOS_PLUGIN_DIR="$withval" ]
)
AC_ARG_WITH(nagios-metadata-dir,
[ --with-nagios-metadata-dir=DIR Directory for nagios plugins metadata @<:@DATADIR/nagios/plugins-metadata@:>@],
[ NAGIOS_METADATA_DIR="$withval" ]
)
AC_ARG_WITH(acl,
[ --with-acl Support CIB ACL ],
[ SUPPORT_ACL=$withval ],
[ SUPPORT_ACL=yes ],
)
AC_ARG_WITH(cibsecrets,
[ --with-cibsecrets Support separate file for CIB secrets ],
[ SUPPORT_CIBSECRETS=$withval ],
[ SUPPORT_CIBSECRETS=no ],
)
AC_ARG_WITH(gnutls-priorities,
[ --with-gnutls-priorities GnuTLS cipher priorities @<:@NORMAL@:>@ ],
[ PCMK_GNUTLS_PRIORITIES="$withval" ],
[ PCMK_GNUTLS_PRIORITIES="NORMAL" ],
)
INITDIR=""
AC_ARG_WITH(initdir,
[ --with-initdir=DIR Directory for init (rc) scripts],
[ INITDIR="$withval" ])
SUPPORT_PROFILING=0
AC_ARG_WITH(profiling,
[ --with-profiling Disable optimizations for effective profiling ],
[ SUPPORT_PROFILING=$withval ])
AC_ARG_WITH(coverage,
[ --with-coverage Disable optimizations for effective coverage testing ],
[ SUPPORT_COVERAGE=$withval ])
PUBLICAN_BRAND="common"
AC_ARG_WITH(brand,
[ --with-brand=brand Brand to use for generated documentation (set empty for no docs) @<:@common@:>@],
[ test x"$withval" = x"no" || PUBLICAN_BRAND="$withval" ])
AC_SUBST(PUBLICAN_BRAND)
CONFIGDIR=""
AC_ARG_WITH(configdir,
[ --with-configdir=DIR Directory for Pacemaker configuration file @<:@SYSCONFDIR/sysconfig@:>@],
[ CONFIGDIR="$withval" ]
)
CRM_LOG_DIR=""
AC_ARG_WITH(logdir,
[ --with-logdir=DIR Directory for Pacemaker log file @<:@LOCALSTATEDIR/log/pacemaker@:>@ ],
[ CRM_LOG_DIR="$withval" ]
)
CRM_BUNDLE_DIR=""
AC_ARG_WITH(bundledir,
[ --with-bundledir=DIR Directory for Pacemaker bundle logs @<:@LOCALSTATEDIR/log/pacemaker/bundles@:>@ ],
[ CRM_BUNDLE_DIR="$withval" ]
)
dnl ===============================================
dnl General Processing
dnl ===============================================
+AC_PROG_LN_S
+
if cc_supports_flag -Werror; then
WERROR="-Werror"
else
WERROR=""
fi
# Normalize enable_fatal_warnings (defaulting to yes, when compiler supports it)
if test "x${enable_fatal_warnings}" != "xno" ; then
if test "$GCC" = "yes" && test "x${WERROR}" != "x" ; then
enable_fatal_warnings=yes
else
AC_MSG_NOTICE(Compiler does not support fatal warnings)
enable_fatal_warnings=no
fi
fi
INIT_EXT=""
echo Our Host OS: $host_os/$host
AC_MSG_NOTICE(Sanitizing prefix: ${prefix})
case $prefix in
NONE)
prefix=/usr
dnl Fix default variables - "prefix" variable if not specified
if test "$localstatedir" = "\${prefix}/var"; then
localstatedir="/var"
fi
if test "$sysconfdir" = "\${prefix}/etc"; then
sysconfdir="/etc"
fi
;;
esac
AC_MSG_NOTICE(Sanitizing exec_prefix: ${exec_prefix})
case $exec_prefix in
prefix|NONE)
exec_prefix=$prefix
;;
esac
AC_MSG_NOTICE(Sanitizing INITDIR: ${INITDIR})
case $INITDIR in
prefix) INITDIR=$prefix;;
"")
AC_MSG_CHECKING(which init (rc) directory to use)
for initdir in /etc/init.d /etc/rc.d/init.d /sbin/init.d \
/usr/local/etc/rc.d /etc/rc.d
do
if
test -d $initdir
then
INITDIR=$initdir
break
fi
done
AC_MSG_RESULT($INITDIR)
;;
esac
AC_SUBST(INITDIR)
AC_MSG_NOTICE(Sanitizing libdir: ${libdir})
case $libdir in
prefix|NONE)
AC_MSG_CHECKING(which lib directory to use)
for aDir in lib64 lib
do
trydir="${exec_prefix}/${aDir}"
if
test -d ${trydir}
then
libdir=${trydir}
break
fi
done
AC_MSG_RESULT($libdir);
;;
esac
dnl Expand autoconf variables so that we don't end up with '${prefix}'
dnl in #defines and python scripts
dnl NOTE: Autoconf deliberately leaves them unexpanded to allow
dnl make exec_prefix=/foo install
dnl No longer being able to do this seems like no great loss to me...
eval prefix="`eval echo ${prefix}`"
eval exec_prefix="`eval echo ${exec_prefix}`"
eval bindir="`eval echo ${bindir}`"
eval sbindir="`eval echo ${sbindir}`"
eval libexecdir="`eval echo ${libexecdir}`"
eval datadir="`eval echo ${datadir}`"
eval sysconfdir="`eval echo ${sysconfdir}`"
eval sharedstatedir="`eval echo ${sharedstatedir}`"
eval localstatedir="`eval echo ${localstatedir}`"
eval libdir="`eval echo ${libdir}`"
eval includedir="`eval echo ${includedir}`"
eval oldincludedir="`eval echo ${oldincludedir}`"
eval infodir="`eval echo ${infodir}`"
eval mandir="`eval echo ${mandir}`"
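dnl As a worked example of the expansion above: with prefix=/usr, the
dnl default libexecdir of '${exec_prefix}/libexec' collapses to the
dnl literal '/usr/libexec', which is what ends up in #defines and
dnl generated python scripts.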
dnl Home-grown variables
eval INITDIR="${INITDIR}"
eval docdir="`eval echo ${docdir}`"
if test x"${docdir}" = x""; then
docdir=${datadir}/doc/${PACKAGE}-${VERSION}
fi
AC_SUBST(docdir)
if test x"${CONFIGDIR}" = x""; then
CONFIGDIR="${sysconfdir}/sysconfig"
fi
AC_SUBST(CONFIGDIR)
if test x"${CRM_LOG_DIR}" = x""; then
CRM_LOG_DIR="${localstatedir}/log/pacemaker"
fi
AC_DEFINE_UNQUOTED(CRM_LOG_DIR,"$CRM_LOG_DIR", Location for Pacemaker log file)
AC_SUBST(CRM_LOG_DIR)
if test x"${CRM_BUNDLE_DIR}" = x""; then
CRM_BUNDLE_DIR="${localstatedir}/log/pacemaker/bundles"
fi
AC_DEFINE_UNQUOTED(CRM_BUNDLE_DIR,"$CRM_BUNDLE_DIR", Location for Pacemaker bundle logs)
AC_SUBST(CRM_BUNDLE_DIR)
AC_DEFINE_UNQUOTED([PCMK_GNUTLS_PRIORITIES], ["$PCMK_GNUTLS_PRIORITIES"],
[GnuTLS cipher priorities])
for j in prefix exec_prefix bindir sbindir libexecdir datadir sysconfdir \
sharedstatedir localstatedir libdir includedir oldincludedir infodir \
mandir INITDIR docdir CONFIGDIR
do
dirname=`eval echo '${'${j}'}'`
if
test ! -d "$dirname"
then
AC_MSG_WARN([$j directory ($dirname) does not exist!])
fi
done
dnl This OS-based decision-making is poor autotools practice;
dnl feature-based mechanisms are strongly preferred.
dnl
dnl So keep this section to a bare minimum; regard as a "necessary evil".
case "$host_os" in
*bsd*)
AC_DEFINE_UNQUOTED(ON_BSD, 1, Compiling for BSD platform)
LIBS="-L/usr/local/lib"
CPPFLAGS="$CPPFLAGS -I/usr/local/include"
INIT_EXT=".sh"
;;
*solaris*)
AC_DEFINE_UNQUOTED(ON_SOLARIS, 1, Compiling for Solaris platform)
;;
*linux*)
AC_DEFINE_UNQUOTED(ON_LINUX, 1, Compiling for Linux platform)
;;
darwin*)
AC_DEFINE_UNQUOTED(ON_DARWIN, 1, Compiling for Darwin platform)
LIBS="$LIBS -L${prefix}/lib"
CFLAGS="$CFLAGS -I${prefix}/include"
;;
esac
AC_SUBST(INIT_EXT)
AC_MSG_NOTICE(Host CPU: $host_cpu)
case "$host_cpu" in
ppc64|powerpc64)
case $CFLAGS in
*powerpc64*)
;;
*)
if test "$GCC" = yes; then
CFLAGS="$CFLAGS -m64"
fi
;;
esac
;;
esac
AC_MSG_CHECKING(which format is needed to print uint64_t)
cc_temp_flags "-Wall $WERROR"
AC_COMPILE_IFELSE(
[AC_LANG_PROGRAM(
[
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
],
[
int max = 512;
uint64_t bignum = 42;
char *buffer = malloc(max);
const char *random = "random";
snprintf(buffer, max-1, "<quorum id=%lu quorate=%s/>", bignum, random);
fprintf(stderr, "Result: %s\n", buffer);
]
)],
[U64T="%lu"],
[U64T="%llu"]
)
cc_restore_flags
AC_MSG_RESULT($U64T)
AC_DEFINE_UNQUOTED(U64T, "$U64T", Correct printf format for logging uint64_t)
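dnl U64T expands to a complete conversion specification, so C code composes
dnl it via string pasting, e.g. (illustrative):
dnl
dnl   uint64_t seq = 42;
dnl   printf("sequence=" U64T "\n", seq);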
dnl ===============================================
dnl Program Paths
dnl ===============================================
PATH="$PATH:/sbin:/usr/sbin:/usr/local/sbin:/usr/local/bin"
export PATH
dnl Replacing AC_PROG_LIBTOOL with AC_CHECK_PROG because LIBTOOL
dnl was NOT being expanded all the time thus causing things to fail.
AC_CHECK_PROGS(LIBTOOL, glibtool libtool libtool15 libtool13)
dnl Pacemaker's executable python scripts will invoke the python specified by
dnl configure's PYTHON variable. If not specified, AM_PATH_PYTHON will check a
dnl built-in list with (unversioned) "python" having precedence. To configure
dnl Pacemaker to use a specific python interpreter version, define PYTHON
dnl when calling configure, for example: ./configure PYTHON=/usr/bin/python3.6
dnl PYTHON must be a full path
case "x$PYTHON" in
/*)
;;
*)
AC_PATH_PROG([PYTHON], [$PYTHON])
;;
esac
case "x$PYTHON" in
x*python3*)
dnl When used with Python 3, Pacemaker requires a minimum of 3.2
AM_PATH_PYTHON([3.2])
;;
*)
dnl Otherwise, Pacemaker requires a minimum of 2.7
AM_PATH_PYTHON([2.7])
;;
esac
AC_CHECK_PROGS(MAKE, gmake make)
AC_PATH_PROGS(HTML2TXT, lynx w3m)
AC_PATH_PROGS(HELP2MAN, help2man)
AC_PATH_PROGS(POD2MAN, pod2man, pod2man)
AC_PATH_PROGS(ASCIIDOC, asciidoc)
AC_PATH_PROGS(PUBLICAN, publican)
AC_PATH_PROGS(INKSCAPE, inkscape)
AC_PATH_PROGS(XSLTPROC, xsltproc)
AC_PATH_PROGS(XMLCATALOG, xmlcatalog)
AC_PATH_PROGS(FOP, fop)
AC_PATH_PROGS(SSH, ssh, /usr/bin/ssh)
AC_PATH_PROGS(SCP, scp, /usr/bin/scp)
AC_PATH_PROGS(TAR, tar)
AC_PATH_PROGS(MD5, md5)
dnl BASH is already an environment variable, so use something else
AC_PATH_PROG([BASH_PATH], [bash])
AC_PATH_PROGS(TEST, test)
PKG_PROG_PKG_CONFIG
AC_PATH_PROGS(VALGRIND_BIN, valgrind, /usr/bin/valgrind)
AC_DEFINE_UNQUOTED(VALGRIND_BIN, "$VALGRIND_BIN", Valgrind command)
if test x"${LIBTOOL}" = x""; then
AC_MSG_ERROR(You need (g)libtool installed in order to build ${PACKAGE})
fi
if test x"${MAKE}" = x""; then
AC_MSG_ERROR(You need (g)make installed in order to build ${PACKAGE})
fi
dnl Bash is needed for building man pages and running regression tests
if test x"${BASH_PATH}" = x""; then
AC_MSG_ERROR(bash must be installed in order to build ${PACKAGE})
fi
AM_CONDITIONAL(BUILD_HELP, test x"${HELP2MAN}" != x"")
if test x"${HELP2MAN}" != x""; then
PCMK_FEATURES="$PCMK_FEATURES generated-manpages"
fi
MANPAGE_XSLT=""
if test x"${XSLTPROC}" != x""; then
AC_MSG_CHECKING(docbook to manpage transform)
# first try to figure out correct template using xmlcatalog query,
# resort to extensive (semi-deterministic) file search if that fails
DOCBOOK_XSL_URI='http://docbook.sourceforge.net/release/xsl/current'
DOCBOOK_XSL_PATH='manpages/docbook.xsl'
MANPAGE_XSLT=$(${XMLCATALOG} "" ${DOCBOOK_XSL_URI}/${DOCBOOK_XSL_PATH} \
| sed -n 's|^file://||p;q')
if test x"${MANPAGE_XSLT}" = x""; then
DIRS=$(find "${datadir}" -name $(basename $(dirname ${DOCBOOK_XSL_PATH})) \
-type d | LC_ALL=C sort)
XSLT=$(basename ${DOCBOOK_XSL_PATH})
for d in ${DIRS}; do
if test -f "${d}/${XSLT}"; then
MANPAGE_XSLT="${d}/${XSLT}"
break
fi
done
fi
fi
AC_MSG_RESULT($MANPAGE_XSLT)
AC_SUBST(MANPAGE_XSLT)
AM_CONDITIONAL(BUILD_XML_HELP, test x"${MANPAGE_XSLT}" != x"")
if test x"${MANPAGE_XSLT}" != x""; then
PCMK_FEATURES="$PCMK_FEATURES agent-manpages"
fi
AM_CONDITIONAL(BUILD_ASCIIDOC, test x"${ASCIIDOC}" != x"")
if test x"${ASCIIDOC}" != x""; then
PCMK_FEATURES="$PCMK_FEATURES ascii-docs"
fi
publican_intree_brand=no
if test x"${PUBLICAN_BRAND}" != x"" \
&& test x"${PUBLICAN}" != x"" \
&& test x"${INKSCAPE}" != x""; then
dnl special handling for clusterlabs brand (possibly in-tree version used)
test "${PUBLICAN_BRAND}" != "clusterlabs" \
|| test -d /usr/share/publican/Common_Content/clusterlabs
if test $? -ne 0; then
dnl Distinguish publican versions by their error output:
dnl "Unknown option: brand_dir" vs. "Option brand_dir requires an argument"
if ${PUBLICAN} build --brand_dir 2>&1 | grep -Eq 'brand_dir$'; then
AC_MSG_WARN([Cannot use in-tree clusterlabs brand, resorting to common])
PUBLICAN_BRAND=common
else
publican_intree_brand=yes
fi
fi
AC_MSG_NOTICE([Enabling Publican-generated documentation using ${PUBLICAN_BRAND} brand])
PCMK_FEATURES="$PCMK_FEATURES publican-docs"
fi
AM_CONDITIONAL([BUILD_DOCBOOK],
[test x"${PUBLICAN_BRAND}" != x"" \
&& test x"${PUBLICAN}" != x"" \
&& test x"${INKSCAPE}" != x""])
AM_CONDITIONAL([PUBLICAN_INTREE_BRAND],
[test x"${publican_intree_brand}" = x"yes"])
dnl Pacemaker's shell scripts (and thus man page builders) rely on GNU getopt
AC_MSG_CHECKING([for GNU-compatible getopt])
IFS_orig=$IFS
IFS=:
for PATH_DIR in $PATH; do
IFS=$IFS_orig
GETOPT_PATH="${PATH_DIR}/getopt"
if test -f "$GETOPT_PATH" && test -x "$GETOPT_PATH" ; then
$GETOPT_PATH -T >/dev/null 2>/dev/null
if test $? -eq 4; then
break
fi
fi
GETOPT_PATH=""
done
IFS=$IFS_orig
if test -n "$GETOPT_PATH"; then
AC_MSG_RESULT([$GETOPT_PATH])
else
AC_MSG_RESULT([no])
AC_MSG_ERROR(Pacemaker build requires a GNU-compatible getopt)
fi
AC_SUBST([GETOPT_PATH])
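dnl The loop above keys on a GNU getopt convention: `getopt -T` exits with
dnl status 4 on GNU-compatible implementations, e.g. (illustrative):
dnl
dnl   $ getopt -T >/dev/null 2>&1; echo $?
dnl   4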
dnl ========================================================================
dnl checks for library functions to replace them
dnl
dnl NoSuchFunctionName:
dnl is a dummy function which no system supplies. It is here to make
dnl the system compile semi-correctly on OpenBSD which doesn't know
dnl how to create an empty archive
dnl
dnl scandir: Only on BSD.
dnl System-V systems may have it, but hidden and/or deprecated.
dnl A replacement function is supplied for it.
dnl
dnl setenv: is some bsdish function that should also be avoided (use
dnl putenv instead)
dnl On the other hand, putenv doesn't provide the right API for the
dnl code and has memory leaks designed in (sigh...). Fortunately,
dnl a replacement function is supplied for it.
dnl
dnl strerror: returns a string that corresponds to an errno.
dnl A replacement function is supplied for it.
dnl
dnl strnlen: is a gnu function similar to strlen, but safer.
dnl We wrote a tolerably fast replacement function for it.
dnl
dnl strndup: is a gnu function similar to strdup, but safer.
dnl We wrote a tolerably fast replacement function for it.
AC_REPLACE_FUNCS(alphasort NoSuchFunctionName scandir setenv strerror strchrnul unsetenv strnlen strndup)
dnl ===============================================
dnl Libraries
dnl ===============================================
AC_CHECK_LIB(socket, socket) dnl -lsocket
AC_CHECK_LIB(c, dlopen) dnl if dlopen is in libc...
AC_CHECK_LIB(dl, dlopen) dnl -ldl (for Linux)
AC_CHECK_LIB(rt, sched_getscheduler) dnl -lrt (for Tru64)
AC_CHECK_LIB(gnugetopt, getopt_long) dnl -lgnugetopt ( if available )
AC_CHECK_LIB(pam, pam_start) dnl -lpam (if available)
AC_CHECK_FUNCS([sched_setscheduler])
AC_CHECK_LIB(uuid, uuid_parse) dnl load the library if necessary
AC_CHECK_FUNCS(uuid_unparse) dnl OSX ships uuid_* as standard functions
AC_CHECK_HEADERS(uuid/uuid.h)
if test "x$ac_cv_func_uuid_unparse" != xyes; then
AC_MSG_ERROR(You do not have the libuuid development package installed)
fi
if test x"${PKG_CONFIG}" = x""; then
AC_MSG_ERROR(You need pkgconfig installed in order to build ${PACKAGE})
fi
if
$PKG_CONFIG --exists glib-2.0
then
GLIBCONFIG="$PKG_CONFIG glib-2.0"
else
set -x
echo PKG_CONFIG_PATH=$PKG_CONFIG_PATH
$PKG_CONFIG --exists glib-2.0; echo $?
$PKG_CONFIG --cflags glib-2.0; echo $?
$PKG_CONFIG glib-2.0; echo $?
set +x
AC_MSG_ERROR(You need glib2-devel installed in order to build ${PACKAGE})
fi
AC_MSG_RESULT(using $GLIBCONFIG)
#
# Where is dlopen?
#
if test "$ac_cv_lib_c_dlopen" = yes; then
LIBADD_DL=""
elif test "$ac_cv_lib_dl_dlopen" = yes; then
LIBADD_DL=-ldl
else
LIBADD_DL=${lt_cv_dlopen_libs}
fi
if test "X$GLIBCONFIG" != X; then
AC_MSG_CHECKING(for special glib includes: )
GLIBHEAD=`$GLIBCONFIG --cflags`
AC_MSG_RESULT($GLIBHEAD)
CPPFLAGS="$CPPFLAGS $GLIBHEAD"
AC_MSG_CHECKING(for glib library flags)
GLIBLIB=`$GLIBCONFIG --libs`
AC_MSG_RESULT($GLIBLIB)
LIBS="$LIBS $GLIBLIB"
fi
dnl FreeBSD needs -lcompat for ftime() used by lrmd.c
AC_CHECK_LIB([compat], [ftime], [COMPAT_LIBS='-lcompat'])
AC_SUBST(COMPAT_LIBS)
dnl ========================================================================
dnl Headers
dnl ========================================================================
dnl Some distributions insert #warnings into deprecated headers such as
dnl timeb.h. If we will enable fatal warnings for the build, then enable
dnl them for the header checks as well, otherwise the build could fail
dnl even though the header check succeeds. (We should probably be doing
dnl this in more places.)
if test "x${enable_fatal_warnings}" = xyes ; then
cc_temp_flags "$CFLAGS $WERROR"
fi
AC_CHECK_HEADERS(arpa/inet.h)
AC_CHECK_HEADERS(ctype.h)
AC_CHECK_HEADERS(dirent.h)
AC_CHECK_HEADERS(errno.h)
AC_CHECK_HEADERS(getopt.h)
AC_CHECK_HEADERS(glib.h)
AC_CHECK_HEADERS(grp.h)
AC_CHECK_HEADERS(limits.h)
AC_CHECK_HEADERS(linux/swab.h)
AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(netdb.h)
AC_CHECK_HEADERS(netinet/in.h)
AC_CHECK_HEADERS(netinet/ip.h)
AC_CHECK_HEADERS(pwd.h)
AC_CHECK_HEADERS(sgtty.h)
AC_CHECK_HEADERS(signal.h)
AC_CHECK_HEADERS(stdarg.h)
AC_CHECK_HEADERS(stddef.h)
AC_CHECK_HEADERS(stdio.h)
AC_CHECK_HEADERS(stdlib.h)
AC_CHECK_HEADERS(string.h)
AC_CHECK_HEADERS(strings.h)
AC_CHECK_HEADERS(sys/dir.h)
AC_CHECK_HEADERS(sys/ioctl.h)
AC_CHECK_HEADERS(sys/param.h)
AC_CHECK_HEADERS(sys/reboot.h)
AC_CHECK_HEADERS(sys/resource.h)
AC_CHECK_HEADERS(sys/socket.h)
AC_CHECK_HEADERS(sys/signalfd.h)
AC_CHECK_HEADERS(sys/sockio.h)
AC_CHECK_HEADERS(sys/stat.h)
AC_CHECK_HEADERS(sys/time.h)
AC_CHECK_HEADERS(sys/timeb.h)
AC_CHECK_HEADERS(sys/types.h)
AC_CHECK_HEADERS(sys/utsname.h)
AC_CHECK_HEADERS(sys/wait.h)
AC_CHECK_HEADERS(time.h)
AC_CHECK_HEADERS(unistd.h)
if test "x${enable_fatal_warnings}" = xyes ; then
cc_restore_flags
fi
dnl These headers need prerequisites before the tests will pass
dnl AC_CHECK_HEADERS(net/if.h)
PKG_CHECK_MODULES(LIBXML2, [libxml-2.0],
[CPPFLAGS="${CPPFLAGS} ${LIBXML2_CFLAGS}"
LIBS="${LIBS} ${LIBXML2_LIBS}"])
AC_CHECK_HEADERS(libxml/xpath.h)
if test "$ac_cv_header_libxml_xpath_h" != "yes"; then
AC_MSG_ERROR(libxml development headers not found)
fi
AC_CHECK_LIB(xslt, xsltApplyStylesheet, [],
AC_MSG_ERROR(Unsupported libxslt library version))
AC_CHECK_HEADERS(libxslt/xslt.h)
if test "$ac_cv_header_libxslt_xslt_h" != "yes"; then
AC_MSG_ERROR(libxslt development headers not found)
fi
AC_CACHE_CHECK(whether __progname and __progname_full are available,
pf_cv_var_progname,
AC_TRY_LINK([extern char *__progname, *__progname_full;],
[__progname = "foo"; __progname_full = "foo bar";],
pf_cv_var_progname="yes", pf_cv_var_progname="no"))
if test "$pf_cv_var_progname" = "yes"; then
AC_DEFINE(HAVE___PROGNAME,1,[ ])
fi
dnl ========================================================================
dnl Structures
dnl ========================================================================
AC_CHECK_MEMBERS([struct tm.tm_gmtoff],,,[[#include <time.h>]])
AC_CHECK_MEMBERS([lrm_op_t.rsc_deleted],,,[[#include <lrm/lrm_api.h>]])
AC_CHECK_MEMBER([struct dirent.d_type],
AC_DEFINE(HAVE_STRUCT_DIRENT_D_TYPE,1,[Define this if struct dirent has d_type]),,
[#include <dirent.h>])
dnl ========================================================================
dnl Functions
dnl ========================================================================
AC_CHECK_FUNCS(getopt, AC_DEFINE(HAVE_DECL_GETOPT, 1, [Have getopt function]))
AC_CHECK_FUNCS(nanosleep, AC_DEFINE(HAVE_DECL_NANOSLEEP, 1, [Have nanosleep function]))
dnl ========================================================================
dnl bzip2
dnl ========================================================================
AC_CHECK_HEADERS(bzlib.h)
AC_CHECK_LIB(bz2, BZ2_bzBuffToBuffCompress)
if test x$ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress != xyes ; then
AC_MSG_ERROR(BZ2 libraries not found)
fi
if test x$ac_cv_header_bzlib_h != xyes; then
AC_MSG_ERROR(BZ2 Development headers not found)
fi
dnl ========================================================================
dnl sighandler_t is missing from Illumos, Solaris11 systems
dnl ========================================================================
AC_MSG_CHECKING([for sighandler_t])
AC_TRY_COMPILE([#include <signal.h>],[sighandler_t *f;],
has_sighandler_t=yes,has_sighandler_t=no)
AC_MSG_RESULT($has_sighandler_t)
if test "$has_sighandler_t" = "yes" ; then
AC_DEFINE( HAVE_SIGHANDLER_T, 1, [Define if sighandler_t available] )
fi
dnl ========================================================================
dnl ncurses
dnl ========================================================================
dnl
dnl A few OSes (e.g. Linux) deliver a default "ncurses" alongside "curses".
dnl Many non-Linux deliver "curses"; sites may add "ncurses".
dnl
dnl However, the source-code recommendation for both is to #include "curses.h"
dnl (i.e. "ncurses" still wants the include to be simple, no-'n', "curses.h").
dnl
dnl ncurses takes precedence.
dnl
AC_CHECK_HEADERS(curses.h)
AC_CHECK_HEADERS(curses/curses.h)
AC_CHECK_HEADERS(ncurses.h)
AC_CHECK_HEADERS(ncurses/ncurses.h)
dnl Although n-library is preferred, only look for it if the n-header was found.
CURSESLIBS=''
if test "$ac_cv_header_ncurses_h" = "yes"; then
AC_CHECK_LIB(ncurses, printw,
[AC_DEFINE(HAVE_LIBNCURSES,1, have ncurses library)])
CURSESLIBS=`$PKG_CONFIG --libs ncurses` || CURSESLIBS='-lncurses'
fi
if test "$ac_cv_header_ncurses_ncurses_h" = "yes"; then
AC_CHECK_LIB(ncurses, printw,
[AC_DEFINE(HAVE_LIBNCURSES,1, have ncurses library)])
CURSESLIBS=`$PKG_CONFIG --libs ncurses` || CURSESLIBS='-lncurses'
fi
dnl Only look for non-n-library if there was no n-library.
if test X"$CURSESLIBS" = X"" -a "$ac_cv_header_curses_h" = "yes"; then
AC_CHECK_LIB(curses, printw,
[CURSESLIBS='-lcurses'; AC_DEFINE(HAVE_LIBCURSES,1, have curses library)])
fi
dnl Only look for non-n-library if there was no n-library.
if test X"$CURSESLIBS" = X"" -a "$ac_cv_header_curses_curses_h" = "yes"; then
AC_CHECK_LIB(curses, printw,
[CURSESLIBS='-lcurses'; AC_DEFINE(HAVE_LIBCURSES,1, have curses library)])
fi
if test "x$CURSESLIBS" != "x"; then
PCMK_FEATURES="$PCMK_FEATURES ncurses"
fi
dnl Check for printw() prototype compatibility
if test X"$CURSESLIBS" != X"" && cc_supports_flag -Wcast-qual; then
ac_save_LIBS=$LIBS
LIBS="$CURSESLIBS"
cc_temp_flags "-Wcast-qual $WERROR"
# avoid broken test because of hardened build environment in Fedora 23+
# - https://fedoraproject.org/wiki/Changes/Harden_All_Packages
# - https://bugzilla.redhat.com/1297985
if cc_supports_flag -fPIC; then
CFLAGS="$CFLAGS -fPIC"
fi
AC_MSG_CHECKING(whether printw() requires argument of "const char *")
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([
#if defined(HAVE_NCURSES_H)
# include <ncurses.h>
#elif defined(HAVE_NCURSES_NCURSES_H)
# include <ncurses/ncurses.h>
#elif defined(HAVE_CURSES_H)
# include <curses.h>
#endif
],
[printw((const char *)"Test");]
)],
[ac_cv_compatible_printw=yes],
[ac_cv_compatible_printw=no]
)
LIBS=$ac_save_LIBS
cc_restore_flags
AC_MSG_RESULT([$ac_cv_compatible_printw])
if test "$ac_cv_compatible_printw" = no; then
AC_MSG_WARN([The printw() function of your ncurses or curses library is too old; use of the library will be disabled. If you want to use this library anyway, please update to a newer version (ncurses 5.4 or later is recommended). You can get the library from http://www.gnu.org/software/ncurses/.])
AC_MSG_NOTICE([Disabling curses])
AC_DEFINE(HAVE_INCOMPATIBLE_PRINTW, 1, [Do we have incompatible printw() in curses library?])
fi
fi
AC_SUBST(CURSESLIBS)
dnl ========================================================================
dnl Profiling and GProf
dnl ========================================================================
AC_MSG_NOTICE(Old CFLAGS: $CFLAGS)
case $SUPPORT_COVERAGE in
1|yes|true)
SUPPORT_PROFILING=1
PCMK_FEATURES="$PCMK_FEATURES coverage"
CFLAGS="$CFLAGS -fprofile-arcs -ftest-coverage"
dnl During linking, make sure to specify -lgcov or -coverage
;;
esac
case $SUPPORT_PROFILING in
1|yes|true)
SUPPORT_PROFILING=1
dnl Disable various compiler optimizations
CFLAGS="$CFLAGS -fno-omit-frame-pointer -fno-inline -fno-builtin "
dnl CFLAGS="$CFLAGS -fno-inline-functions -fno-default-inline -fno-inline-functions-called-once -fno-optimize-sibling-calls"
dnl Turn off optimization so tools can get accurate line numbers
CFLAGS=`echo $CFLAGS | sed -e 's/-O.\ //g' -e 's/-Wp,-D_FORTIFY_SOURCE=.\ //g' -e 's/-D_FORTIFY_SOURCE=.\ //g'`
CFLAGS="$CFLAGS -O0 -g3 -gdwarf-2"
dnl Update features
PCMK_FEATURES="$PCMK_FEATURES profile"
;;
*)
SUPPORT_PROFILING=0
;;
esac
AC_MSG_NOTICE(New CFLAGS: $CFLAGS)
AC_DEFINE_UNQUOTED(SUPPORT_PROFILING, $SUPPORT_PROFILING, Support for profiling)
dnl ========================================================================
dnl Cluster infrastructure - LibQB
dnl ========================================================================
if test x${enable_no_stack} = xyes; then
SUPPORT_CS=no
fi
PKG_CHECK_MODULES(libqb, libqb >= 0.13)
CPPFLAGS="$libqb_CFLAGS $CPPFLAGS"
LIBS="$libqb_LIBS $LIBS"
dnl libqb 0.14.0+ (2012-06)
AC_CHECK_LIB(qb, qb_ipcs_connection_auth_set)
PCMK_FEATURES="$PCMK_FEATURES libqb-logging libqb-ipc"
dnl libqb 0.17.0+ (2014-02)
AC_CHECK_FUNCS(qb_ipcs_connection_get_buffer_size,
AC_DEFINE(HAVE_IPCS_GET_BUFFER_SIZE, 1,
[Have qb_ipcc_get_buffer_size function]))
dnl Support Linux-HA fence agents if available
if test "$cross_compiling" != "yes"; then
CPPFLAGS="$CPPFLAGS -I${prefix}/include/heartbeat"
fi
AC_CHECK_HEADERS(stonith/stonith.h)
if test "$ac_cv_header_stonith_stonith_h" = "yes"; then
dnl On Debian, AC_CHECK_LIB fails if a library has any unresolved symbols
dnl So check for all the dependencies (so they're added to LIBS) before checking for -lplumb
AC_CHECK_LIB(pils, PILLoadPlugin)
AC_CHECK_LIB(plumb, G_main_add_IPC_Channel)
PCMK_FEATURES="$PCMK_FEATURES lha-fencing"
fi
dnl ===============================================
dnl Variables needed for substitution
dnl ===============================================
CRM_SCHEMA_DIRECTORY="${datadir}/pacemaker"
AC_DEFINE_UNQUOTED(CRM_SCHEMA_DIRECTORY,"$CRM_SCHEMA_DIRECTORY", Location for the Pacemaker Relax-NG Schema)
AC_SUBST(CRM_SCHEMA_DIRECTORY)
CRM_CORE_DIR="${localstatedir}/lib/pacemaker/cores"
AC_DEFINE_UNQUOTED(CRM_CORE_DIR,"$CRM_CORE_DIR", Location to store core files produced by Pacemaker daemons)
AC_SUBST(CRM_CORE_DIR)
CRM_DAEMON_USER="hacluster"
AC_DEFINE_UNQUOTED(CRM_DAEMON_USER,"$CRM_DAEMON_USER", User to run Pacemaker daemons as)
AC_SUBST(CRM_DAEMON_USER)
CRM_DAEMON_GROUP="haclient"
AC_DEFINE_UNQUOTED(CRM_DAEMON_GROUP,"$CRM_DAEMON_GROUP", Group to run Pacemaker daemons as)
AC_SUBST(CRM_DAEMON_GROUP)
CRM_STATE_DIR=${localstatedir}/run/crm
AC_DEFINE_UNQUOTED(CRM_STATE_DIR,"$CRM_STATE_DIR", Where to keep state files and sockets)
AC_SUBST(CRM_STATE_DIR)
CRM_PACEMAKER_DIR=${localstatedir}/lib/pacemaker
AC_DEFINE_UNQUOTED(CRM_PACEMAKER_DIR,"$CRM_PACEMAKER_DIR", Location to store data produced by Pacemaker daemons)
AC_SUBST(CRM_PACEMAKER_DIR)
CRM_BLACKBOX_DIR=${localstatedir}/lib/pacemaker/blackbox
AC_DEFINE_UNQUOTED(CRM_BLACKBOX_DIR,"$CRM_BLACKBOX_DIR", Where to keep blackbox dumps)
AC_SUBST(CRM_BLACKBOX_DIR)
PE_STATE_DIR="${localstatedir}/lib/pacemaker/pengine"
AC_DEFINE_UNQUOTED(PE_STATE_DIR,"$PE_STATE_DIR", Where to keep PEngine outputs)
AC_SUBST(PE_STATE_DIR)
CRM_CONFIG_DIR="${localstatedir}/lib/pacemaker/cib"
AC_DEFINE_UNQUOTED(CRM_CONFIG_DIR,"$CRM_CONFIG_DIR", Where to keep configuration files)
AC_SUBST(CRM_CONFIG_DIR)
CRM_CONFIG_CTS="${localstatedir}/lib/pacemaker/cts"
AC_DEFINE_UNQUOTED(CRM_CONFIG_CTS,"$CRM_CONFIG_CTS", Where to keep cts stateful data)
AC_SUBST(CRM_CONFIG_CTS)
CRM_DAEMON_DIR="${libexecdir}/pacemaker"
AC_DEFINE_UNQUOTED(CRM_DAEMON_DIR,"$CRM_DAEMON_DIR", Location for Pacemaker daemons)
AC_SUBST(CRM_DAEMON_DIR)
HA_STATE_DIR="${localstatedir}/run"
AC_DEFINE_UNQUOTED(HA_STATE_DIR,"$HA_STATE_DIR", Where sbd keeps its PID file)
AC_SUBST(HA_STATE_DIR)
CRM_RSCTMP_DIR="${localstatedir}/run/resource-agents"
AC_DEFINE_UNQUOTED(CRM_RSCTMP_DIR,"$CRM_RSCTMP_DIR", Where resource agents should keep state files)
AC_SUBST(CRM_RSCTMP_DIR)
PACEMAKER_CONFIG_DIR="${sysconfdir}/pacemaker"
AC_DEFINE_UNQUOTED(PACEMAKER_CONFIG_DIR,"$PACEMAKER_CONFIG_DIR", Where to keep configuration files like authkey)
AC_SUBST(PACEMAKER_CONFIG_DIR)
OCF_ROOT_DIR="/usr/lib/ocf"
if test "X$OCF_ROOT_DIR" = X; then
AC_MSG_ERROR(Could not locate OCF directory)
fi
AC_SUBST(OCF_ROOT_DIR)
OCF_RA_DIR="$OCF_ROOT_DIR/resource.d"
AC_DEFINE_UNQUOTED(OCF_RA_DIR,"$OCF_RA_DIR", Location for OCF RAs)
AC_SUBST(OCF_RA_DIR)
RH_STONITH_DIR="$sbindir"
AC_DEFINE_UNQUOTED(RH_STONITH_DIR,"$RH_STONITH_DIR", Location for Red Hat Stonith agents)
AC_DEFINE_UNQUOTED(SBIN_DIR,"$sbindir", Location for system binaries)
RH_STONITH_PREFIX="fence_"
AC_DEFINE_UNQUOTED(RH_STONITH_PREFIX,"$RH_STONITH_PREFIX", Prefix for Red Hat Stonith agents)
AC_PATH_PROGS(GIT, git false)
AC_MSG_CHECKING(build version)
BUILD_VERSION=$Format:%h$
if test $BUILD_VERSION != ":%h$"; then
AC_MSG_RESULT(archive hash: $BUILD_VERSION)
elif test -x $GIT -a -d .git; then
BUILD_VERSION=`$GIT log --pretty="format:%h" -n 1`
AC_MSG_RESULT(git hash: $BUILD_VERSION)
else
# The current directory name makes a reasonable default
# Most generated archives will include the hash or tag
BASE=`basename $PWD`
BUILD_VERSION=`echo $BASE | sed s:.*[[Pp]]acemaker-::`
AC_MSG_RESULT(directory based hash: $BUILD_VERSION)
fi
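dnl The "$Format:%h$" token above is filled in by `git archive` when the
dnl export-subst attribute applies to this file -- an assumed setup along
dnl the lines of:
dnl
dnl   # .gitattributes
dnl   configure.ac export-subst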
AC_DEFINE_UNQUOTED(BUILD_VERSION, "$BUILD_VERSION", Build version)
AC_SUBST(BUILD_VERSION)
HAVE_dbus=1
HAVE_upstart=0
HAVE_systemd=0
PKG_CHECK_MODULES(DBUS, dbus-1, ,HAVE_dbus=0)
AC_DEFINE_UNQUOTED(SUPPORT_DBUS, $HAVE_dbus, Support dbus)
AM_CONDITIONAL(BUILD_DBUS, test $HAVE_dbus = 1)
if test $HAVE_dbus = 1; then
CFLAGS="$CFLAGS `$PKG_CONFIG --cflags dbus-1`"
fi
DBUS_LIBS="$CFLAGS `$PKG_CONFIG --libs dbus-1`"
AC_SUBST(DBUS_LIBS)
AC_CHECK_TYPES([DBusBasicValue],,,[[#include <dbus/dbus.h>]])
if test "x${enable_systemd}" != xno; then
if test $HAVE_dbus = 0; then
if test "x${enable_systemd}" = xyes; then
AC_MSG_FAILURE([cannot enable systemd without DBus])
else
enable_systemd=no
fi
fi
if test "x${enable_systemd}" = xtry; then
AC_MSG_CHECKING([for systemd version query result via dbus-send])
ret=$({ dbus-send --system --print-reply \
--dest=org.freedesktop.systemd1 \
/org/freedesktop/systemd1 \
org.freedesktop.DBus.Properties.Get \
string:org.freedesktop.systemd1.Manager \
string:Version 2>/dev/null \
|| echo "this borked"; } | tail -n1)
# sanitize output a bit (interested just in value, not type),
# ret is intentionally unquoted so as to normalize whitespace
ret=$(echo ${ret} | cut -d' ' -f2-)
AC_MSG_RESULT([${ret}])
if test "x${ret}" != xborked \
|| systemctl --version 2>/dev/null | grep -q systemd; then
enable_systemd=yes
else
enable_systemd=no
fi
fi
fi
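dnl For reference, a successful dbus-send query above typically ends in a
dnl reply line shaped like (version string illustrative):
dnl
dnl   variant       string "219"
dnl
dnl which the whitespace-normalizing echo/cut pair reduces to: string "219"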
AC_MSG_CHECKING([whether to enable support for managing resources via systemd])
AC_MSG_RESULT([${enable_systemd}])
if test "x${enable_systemd}" = xyes; then
HAVE_systemd=1
PCMK_FEATURES="$PCMK_FEATURES systemd"
AC_MSG_CHECKING([for systemd path for system unit files])
systemdunitdir="${systemdunitdir-}"
PKG_CHECK_VAR([systemdunitdir], [systemd],
[systemdsystemunitdir], [], [systemdunitdir=no])
AC_MSG_RESULT([${systemdunitdir}])
if test "x${systemdunitdir}" = xno; then
AC_MSG_FAILURE([cannot enable systemd when systemdunitdir unresolved])
fi
fi
AC_SUBST(systemdunitdir)
AC_DEFINE_UNQUOTED(SUPPORT_SYSTEMD, $HAVE_systemd, Support systemd based system services)
AM_CONDITIONAL(BUILD_SYSTEMD, test $HAVE_systemd = 1)
AC_SUBST(SUPPORT_SYSTEMD)
if test "x${enable_upstart}" != xno; then
if test $HAVE_dbus = 0; then
if test "x${enable_upstart}" = xyes; then
AC_MSG_FAILURE([cannot enable Upstart without DBus])
else
enable_upstart=no
fi
fi
if test "x${enable_upstart}" = xtry; then
AC_MSG_CHECKING([for Upstart version query result via dbus-send])
ret=$({ dbus-send --system --print-reply --dest=com.ubuntu.Upstart \
/com/ubuntu/Upstart org.freedesktop.DBus.Properties.Get \
string:com.ubuntu.Upstart0_6 string:version 2>/dev/null \
|| echo "this borked"; } | tail -n1)
# sanitize output a bit (interested just in value, not type),
# ret is intentionally unquoted so as to normalize whitespace
ret=$(echo ${ret} | cut -d' ' -f2-)
AC_MSG_RESULT([${ret}])
if test "x${ret}" != xborked \
|| initctl --version 2>/dev/null | grep -q upstart; then
enable_upstart=yes
else
enable_upstart=no
fi
fi
fi
AC_MSG_CHECKING([whether to enable support for managing resources via Upstart])
AC_MSG_RESULT([${enable_upstart}])
if test "x${enable_upstart}" = xyes; then
HAVE_upstart=1
PCMK_FEATURES="$PCMK_FEATURES upstart"
fi
AC_DEFINE_UNQUOTED(SUPPORT_UPSTART, $HAVE_upstart, Support upstart based system services)
AM_CONDITIONAL(BUILD_UPSTART, test $HAVE_upstart = 1)
AC_SUBST(SUPPORT_UPSTART)
case $SUPPORT_NAGIOS in
1|yes|true|try)
SUPPORT_NAGIOS=1
;;
*)
SUPPORT_NAGIOS=0
;;
esac
if test $SUPPORT_NAGIOS = 1; then
PCMK_FEATURES="$PCMK_FEATURES nagios"
fi
AC_DEFINE_UNQUOTED(SUPPORT_NAGIOS, $SUPPORT_NAGIOS, Support nagios plugins)
AM_CONDITIONAL(BUILD_NAGIOS, test $SUPPORT_NAGIOS = 1)
if test x"$NAGIOS_PLUGIN_DIR" = x""; then
NAGIOS_PLUGIN_DIR="${libexecdir}/nagios/plugins"
fi
AC_DEFINE_UNQUOTED(NAGIOS_PLUGIN_DIR, "$NAGIOS_PLUGIN_DIR", Directory for nagios plugins)
AC_SUBST(NAGIOS_PLUGIN_DIR)
if test x"$NAGIOS_METADATA_DIR" = x""; then
NAGIOS_METADATA_DIR="${datadir}/nagios/plugins-metadata"
fi
AC_DEFINE_UNQUOTED(NAGIOS_METADATA_DIR, "$NAGIOS_METADATA_DIR", Directory for nagios plugins metadata)
AC_SUBST(NAGIOS_METADATA_DIR)
STACKS=""
CLUSTERLIBS=""
dnl ========================================================================
dnl Cluster stack - Corosync
dnl ========================================================================
dnl Normalize the values
case $SUPPORT_CS in
1|yes|true)
SUPPORT_CS=yes
missingisfatal=1
;;
try)
missingisfatal=0
;;
*)
SUPPORT_CS=no
;;
esac
AC_MSG_CHECKING(for native corosync)
COROSYNC_LIBS=""
if test $SUPPORT_CS = no; then
AC_MSG_RESULT(no (disabled))
SUPPORT_CS=0
else
AC_MSG_RESULT($SUPPORT_CS)
SUPPORT_CS=1
PKG_CHECK_MODULES(cpg, libcpg) dnl Fatal
PKG_CHECK_MODULES(cfg, libcfg) dnl Fatal
PKG_CHECK_MODULES(cmap, libcmap) dnl Fatal
PKG_CHECK_MODULES(quorum, libquorum) dnl Fatal
PKG_CHECK_MODULES(libcorosync_common, libcorosync_common) dnl Fatal
CFLAGS="$CFLAGS $libqb_FLAGS $cpg_FLAGS $cfg_FLAGS $cmap_CFLAGS $quorum_CFLAGS $libcorosync_common_CFLAGS"
COROSYNC_LIBS="$COROSYNC_LIBS $libqb_LIBS $cpg_LIBS $cfg_LIBS $cmap_LIBS $quorum_LIBS $libcorosync_common_LIBS"
CLUSTERLIBS="$CLUSTERLIBS $COROSYNC_LIBS"
STACKS="$STACKS corosync-native"
fi
AC_DEFINE_UNQUOTED(SUPPORT_COROSYNC, $SUPPORT_CS, Support the Corosync messaging and membership layer)
AM_CONDITIONAL(BUILD_CS_SUPPORT, test $SUPPORT_CS = 1)
AC_SUBST(SUPPORT_COROSYNC)
dnl
dnl Cluster stack - Sanity
dnl
if test x${enable_no_stack} = xyes; then
AC_MSG_NOTICE(No cluster stack supported. Just building the Policy Engine)
PCMK_FEATURES="$PCMK_FEATURES no-cluster-stack"
else
AC_MSG_CHECKING(for supported stacks)
if test x"$STACKS" = x; then
AC_MSG_FAILURE(You must support at least one cluster stack)
fi
AC_MSG_RESULT($STACKS)
PCMK_FEATURES="$PCMK_FEATURES $STACKS"
fi
PCMK_FEATURES="$PCMK_FEATURES atomic-attrd"
AC_SUBST(CLUSTERLIBS)
dnl ========================================================================
dnl ACL
dnl ========================================================================
case $SUPPORT_ACL in
1|yes|true)
missingisfatal=1
;;
try)
missingisfatal=0
;;
*)
SUPPORT_ACL=no
;;
esac
AC_MSG_CHECKING(for acl support)
if test $SUPPORT_ACL = no; then
AC_MSG_RESULT(no (disabled))
SUPPORT_ACL=0
else
AC_MSG_RESULT($SUPPORT_ACL)
SUPPORT_ACL=1
AC_CHECK_LIB(qb, qb_ipcs_connection_auth_set)
if test $ac_cv_lib_qb_qb_ipcs_connection_auth_set != yes; then
SUPPORT_ACL=0
fi
if test $SUPPORT_ACL = 0; then
if test $missingisfatal = 0; then
AC_MSG_WARN(Unable to support ACL. You need to use libqb > 0.13.0)
else
AC_MSG_FAILURE(Unable to support ACL. You need to use libqb > 0.13.0)
fi
fi
fi
if test $SUPPORT_ACL = 1; then
PCMK_FEATURES="$PCMK_FEATURES acls"
fi
AM_CONDITIONAL(ENABLE_ACL, test "$SUPPORT_ACL" = "1")
AC_DEFINE_UNQUOTED(ENABLE_ACL, $SUPPORT_ACL, Build in support for CIB ACL)
dnl ========================================================================
dnl CIB secrets
dnl ========================================================================
case $SUPPORT_CIBSECRETS in
1|yes|true|try)
SUPPORT_CIBSECRETS=1
;;
*)
SUPPORT_CIBSECRETS=0
;;
esac
AC_DEFINE_UNQUOTED(SUPPORT_CIBSECRETS, $SUPPORT_CIBSECRETS, Support CIB secrets)
AM_CONDITIONAL(BUILD_CIBSECRETS, test $SUPPORT_CIBSECRETS = 1)
if test $SUPPORT_CIBSECRETS = 1; then
PCMK_FEATURES="$PCMK_FEATURES cibsecrets"
LRM_CIBSECRETS_DIR="${localstatedir}/lib/pacemaker/lrm/secrets"
AC_DEFINE_UNQUOTED(LRM_CIBSECRETS_DIR,"$LRM_CIBSECRETS_DIR", Location for CIB secrets)
AC_SUBST(LRM_CIBSECRETS_DIR)
fi
dnl ========================================================================
dnl GnuTLS
dnl ========================================================================
dnl gnutls_priority_set_direct available since 2.1.7 (released 2007-11-29)
AC_CHECK_LIB(gnutls, gnutls_priority_set_direct)
if test "$ac_cv_lib_gnutls_gnutls_priority_set_direct" != ""; then
AC_CHECK_HEADERS(gnutls/gnutls.h)
fi
dnl ========================================================================
dnl PAM
dnl ========================================================================
AC_CHECK_HEADERS(security/pam_appl.h pam/pam_appl.h)
dnl ========================================================================
dnl System Health
dnl ========================================================================
dnl Check if servicelog development package is installed
SERVICELOG=servicelog-1
SERVICELOG_EXISTS="no"
AC_MSG_CHECKING(for $SERVICELOG packages)
if
$PKG_CONFIG --exists $SERVICELOG
then
PKG_CHECK_MODULES([SERVICELOG], [servicelog-1])
SERVICELOG_EXISTS="yes"
fi
AC_MSG_RESULT($SERVICELOG_EXISTS)
AM_CONDITIONAL(BUILD_SERVICELOG, test "$SERVICELOG_EXISTS" = "yes")
dnl Check if OpenIPMI packages and servicelog are installed
OPENIPMI="OpenIPMI OpenIPMIposix"
OPENIPMI_SERVICELOG_EXISTS="no"
AC_MSG_CHECKING(for $SERVICELOG $OPENIPMI packages)
if
$PKG_CONFIG --exists $OPENIPMI $SERVICELOG
then
PKG_CHECK_MODULES([OPENIPMI_SERVICELOG],[OpenIPMI OpenIPMIposix])
OPENIPMI_SERVICELOG_EXISTS="yes"
fi
AC_MSG_RESULT($OPENIPMI_SERVICELOG_EXISTS)
AM_CONDITIONAL(BUILD_OPENIPMI_SERVICELOG, test "$OPENIPMI_SERVICELOG_EXISTS" = "yes")
dnl ========================================================================
dnl Compiler flags
dnl ========================================================================
dnl Make sure that CFLAGS is not exported. If the user did
dnl not have CFLAGS in their environment then this should have
dnl no effect. However if CFLAGS was exported from the user's
dnl environment, then the new CFLAGS will also be exported
dnl to sub processes.
if export | fgrep " CFLAGS=" > /dev/null; then
SAVED_CFLAGS="$CFLAGS"
unset CFLAGS
CFLAGS="$SAVED_CFLAGS"
unset SAVED_CFLAGS
fi
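dnl The unset/reassign dance works because unset drops the export
dnl attribute along with the value, and a fresh assignment produces a
dnl plain (non-exported) shell variable. Minimal sketch:
dnl
dnl   export CFLAGS="-O2"
dnl   SAVED="$CFLAGS"; unset CFLAGS; CFLAGS="$SAVED"
dnl   sh -c 'echo "${CFLAGS-unset}"'    # prints "unset"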
AC_ARG_VAR([CFLAGS_HARDENED_LIB], [extra C compiler flags for hardened libraries])
AC_ARG_VAR([LDFLAGS_HARDENED_LIB], [extra linker flags for hardened libraries])
AC_ARG_VAR([CFLAGS_HARDENED_EXE], [extra C compiler flags for hardened executables])
AC_ARG_VAR([LDFLAGS_HARDENED_EXE], [extra linker flags for hardened executables])
CC_EXTRAS=""
if test "$GCC" != yes; then
CFLAGS="$CFLAGS -g"
else
CFLAGS="$CFLAGS -ggdb"
dnl When we don't have diagnostic push / pull, we can't explicitly disable
dnl checking for nonliteral formats in the places where they occur on
dnl purpose, so we disable nonliteral format checking globally, as we are
dnl aborting on warnings.
dnl What makes this really ugly is that nonliteral format checking is
dnl available as a separate switch in very modern gcc, but for older gcc
dnl it is part of -Wformat=2.
dnl So: if we have push/pull, we can enable -Wformat=2 -Wformat-nonliteral;
dnl if we don't have push/pull but do have -Wformat-nonliteral, we can
dnl enable -Wformat=2; otherwise, neither.
gcc_diagnostic_push_pull=no
cc_temp_flags "$CFLAGS $WERROR"
AC_MSG_CHECKING([for gcc diagnostic push / pull])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#pragma GCC diagnostic push
#pragma GCC diagnostic pop
]])],
[
AC_MSG_RESULT([yes])
gcc_diagnostic_push_pull=yes
], AC_MSG_RESULT([no]))
cc_restore_flags
if cc_supports_flag "-Wformat-nonliteral"; then
gcc_format_nonliteral=yes
else
gcc_format_nonliteral=no
fi
# We had to eliminate -Wnested-externs because of libtool changes
# Make sure to order options so that the former stand for prerequisites
# of the latter (e.g., -Wformat-nonliteral requires -Wformat).
EXTRA_FLAGS="-fgnu89-inline
-Wall
-Waggregate-return
-Wbad-function-cast
-Wcast-align
-Wdeclaration-after-statement
-Wendif-labels
-Wfloat-equal
-Wformat-security
-Wmissing-prototypes
-Wmissing-declarations
-Wnested-externs
-Wno-long-long
-Wno-strict-aliasing
-Wpointer-arith
-Wstrict-prototypes
-Wwrite-strings
-Wunused-but-set-variable
-Wunsigned-char"
if test "x$gcc_diagnostic_push_pull" = "xyes"; then
AC_DEFINE([GCC_FORMAT_NONLITERAL_CHECKING_ENABLED], [],
[gcc can complain about nonliterals in format])
EXTRA_FLAGS="$EXTRA_FLAGS
-Wformat=2
-Wformat-nonliteral"
else
if test "x$gcc_format_nonliteral" = "xyes"; then
EXTRA_FLAGS="$EXTRA_FLAGS -Wformat=2"
fi
fi
# Additional warnings it might be nice to enable one day
# -Wshadow
# -Wunreachable-code
for j in $EXTRA_FLAGS
do
if
cc_supports_flag $CC_EXTRAS $j
then
CC_EXTRAS="$CC_EXTRAS $j"
fi
done
if test "x${enable_ansi}" = xyes && cc_supports_flag -std=iso9899:199409 ; then
AC_MSG_NOTICE(Enabling ANSI Compatibility)
CC_EXTRAS="$CC_EXTRAS -ansi -D_GNU_SOURCE -DANSI_ONLY"
fi
AC_MSG_NOTICE(Activated additional gcc flags: ${CC_EXTRAS})
fi
dnl
dnl Hardening flags
dnl
dnl The prime control of whether to apply (targeted) hardening build flags and
dnl which ones is --{enable,disable}-hardening option passed to ./configure:
dnl
dnl --enable-hardening=try (default):
dnl depending on whether any of CFLAGS_HARDENED_EXE, LDFLAGS_HARDENED_EXE,
dnl CFLAGS_HARDENED_LIB or LDFLAGS_HARDENED_LIB environment variables
dnl (see below) is set and non-null, all these custom flags (even if not
dnl set) are used as are, otherwise the best effort is made to offer
dnl reasonably strong hardening in several categories (RELRO, PIE,
dnl "bind now", stack protector) according to what the selected toolchain
dnl can offer
dnl
dnl --enable-hardening:
dnl same effect as --enable-hardening=try when the environment variables
dnl in question are suppressed
dnl
dnl --disable-hardening:
dnl do not apply any targeted hardening measures at all
dnl
dnl The user-injected environment variables that regulate the hardening in
dnl default case are as follows:
dnl
dnl * CFLAGS_HARDENED_EXE, LDFLAGS_HARDENED_EXE
dnl compiler and linker flags (respectively) for daemon programs
dnl (pacemaker-attrd, pacemaker-execd, cib, crmd, stonithd, pacemakerd,
-dnl pacemaker_remoted, pengine)
+dnl pacemaker-remoted, pengine)
dnl
dnl * CFLAGS_HARDENED_LIB, LDFLAGS_HARDENED_LIB
dnl compiler and linker flags (respectively) for libraries linked
dnl with the daemon programs
dnl
dnl Note that these are deliberately targeted variables (addressing
dnl particular targets all over the scattered Makefiles) and have no effect
dnl outside their intended scope (e.g., on CLI utilities). For a global
dnl reach, use CFLAGS, LDFLAGS, etc. as usual.
dnl
dnl For guidance on the suitable flags consult, for instance:
dnl https://fedoraproject.org/wiki/Changes/Harden_All_Packages#Detailed_Harden_Flags_Description
dnl https://owasp.org/index.php/C-Based_Toolchain_Hardening#GCC.2FBinutils
dnl
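dnl For instance, to bypass the autodetection below with custom flags
dnl (the values here are illustrative, not recommendations):
dnl
dnl   ./configure \
dnl       CFLAGS_HARDENED_EXE="-fPIE"  LDFLAGS_HARDENED_EXE="-pie" \
dnl       CFLAGS_HARDENED_LIB=""       LDFLAGS_HARDENED_LIB="-Wl,-z,relro"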
if test "x${HARDENING}" != "xtry"; then
unset CFLAGS_HARDENED_EXE
unset CFLAGS_HARDENED_LIB
unset LDFLAGS_HARDENED_EXE
unset LDFLAGS_HARDENED_LIB
fi
if test "x${HARDENING}" = "xno"; then
AC_MSG_NOTICE([Hardening: explicitly disabled])
elif test "x${HARDENING}" = "xyes" \
|| test "$(env | grep -Ec '^(C|LD)FLAGS_HARDENED_(EXE|LIB)=.')" = 0; then
dnl We'll figure out on our own...
CFLAGS_HARDENED_EXE=
CFLAGS_HARDENED_LIB=
LDFLAGS_HARDENED_EXE=
LDFLAGS_HARDENED_LIB=
relro=0
pie=0
bindnow=0
# daemons incl. libs: partial RELRO
flag="-Wl,-z,relro"
CC_CHECK_LDFLAGS(["${flag}"],
[LDFLAGS_HARDENED_EXE="${LDFLAGS_HARDENED_EXE} ${flag}";
LDFLAGS_HARDENED_LIB="${LDFLAGS_HARDENED_LIB} ${flag}";
relro=1])
# daemons: PIE for both CFLAGS and LDFLAGS
if cc_supports_flag -fPIE; then
flag="-pie"
CC_CHECK_LDFLAGS(["${flag}"],
[CFLAGS_HARDENED_EXE="${CFLAGS_HARDENED_EXE} -fPIE";
LDFLAGS_HARDENED_EXE="${LDFLAGS_HARDENED_EXE} ${flag}";
pie=1])
fi
# daemons incl. libs: full RELRO if sensible + as-needed linking
# so as to possibly mitigate startup performance
# hit caused by excessive linking with unneeded
# libraries
if test "${relro}" = 1 && test "${pie}" = 1; then
flag="-Wl,-z,now"
CC_CHECK_LDFLAGS(["${flag}"],
[LDFLAGS_HARDENED_EXE="${LDFLAGS_HARDENED_EXE} ${flag}";
LDFLAGS_HARDENED_LIB="${LDFLAGS_HARDENED_LIB} ${flag}";
bindnow=1])
fi
if test "${bindnow}" = 1; then
flag="-Wl,--as-needed"
CC_CHECK_LDFLAGS(["${flag}"],
[LDFLAGS_HARDENED_EXE="${LDFLAGS_HARDENED_EXE} ${flag}";
LDFLAGS_HARDENED_LIB="${LDFLAGS_HARDENED_LIB} ${flag}"])
fi
# universal: prefer strong > all > default stack protector if possible
flag=
if cc_supports_flag -fstack-protector-strong; then
flag="-fstack-protector-strong"
elif cc_supports_flag -fstack-protector-all; then
flag="-fstack-protector-all"
elif cc_supports_flag -fstack-protector; then
flag="-fstack-protector"
fi
if test -n "${flag}"; then
CC_EXTRAS="${CC_EXTRAS} ${flag}"
stackprot=1
fi
if test "${relro}" = 1 \
|| test "${pie}" = 1 \
|| test "${stackprot}" = 1; then
AC_MSG_NOTICE([Hardening: relro=${relro} pie=${pie} bindnow=${bindnow} stackprot=${flag}])
else
AC_MSG_WARN([Hardening: no suitable features in the toolchain detected])
fi
else
AC_MSG_NOTICE([Hardening: using custom flags])
fi
CFLAGS="$CFLAGS $CC_EXTRAS"
NON_FATAL_CFLAGS="$CFLAGS"
AC_SUBST(NON_FATAL_CFLAGS)
dnl
dnl We reset CFLAGS to include our warnings *after* all function
dnl checking goes on, so that our warning flags don't keep the
dnl AC_*FUNCS() calls above from working. In particular, -Werror will
dnl *always* cause us troubles if we set it before here.
dnl
dnl
if test "x${enable_fatal_warnings}" = xyes ; then
AC_MSG_NOTICE(Enabling Fatal Warnings)
CFLAGS="$CFLAGS $WERROR"
fi
AC_SUBST(CFLAGS)
dnl This is useful for use in Makefiles that need to remove one specific flag
CFLAGS_COPY="$CFLAGS"
AC_SUBST(CFLAGS_COPY)
AC_SUBST(LIBADD_DL) dnl extra flags for dynamic linking libraries
AC_SUBST(LIBADD_INTL) dnl extra flags for GNU gettext stuff...
AC_SUBST(LOCALE)
dnl Options for cleaning up the compiler output
QUIET_LIBTOOL_OPTS=""
QUIET_MAKE_OPTS=""
if test "x${enable_quiet}" = "xyes"; then
QUIET_LIBTOOL_OPTS="--quiet"
QUIET_MAKE_OPTS="--quiet"
fi
AC_MSG_RESULT(Suppress make details: ${enable_quiet})
dnl Put the above variables to use
LIBTOOL="${LIBTOOL} --tag=CC \$(QUIET_LIBTOOL_OPTS)"
MAKE="${MAKE} \$(QUIET_MAKE_OPTS)"
AC_SUBST(CC)
AC_SUBST(MAKE)
AC_SUBST(LIBTOOL)
AC_SUBST(QUIET_MAKE_OPTS)
AC_SUBST(QUIET_LIBTOOL_OPTS)
AC_DEFINE_UNQUOTED(CRM_FEATURES, "$PCMK_FEATURES", Set of enabled features)
AC_SUBST(PCMK_FEATURES)
dnl Files we output that need to be executable
AC_CONFIG_FILES([cts/CTSlab.py], [chmod +x cts/CTSlab.py])
AC_CONFIG_FILES([cts/LSBDummy], [chmod +x cts/LSBDummy])
AC_CONFIG_FILES([cts/OCFIPraTest.py], [chmod +x cts/OCFIPraTest.py])
AC_CONFIG_FILES([cts/cluster_test], [chmod +x cts/cluster_test])
AC_CONFIG_FILES([cts/cts], [chmod +x cts/cts])
AC_CONFIG_FILES([cts/cts-cli], [chmod +x cts/cts-cli])
AC_CONFIG_FILES([cts/cts-coverage], [chmod +x cts/cts-coverage])
AC_CONFIG_FILES([cts/cts-exec], [chmod +x cts/cts-exec])
AC_CONFIG_FILES([cts/cts-pengine], [chmod +x cts/cts-pengine])
AC_CONFIG_FILES([cts/cts-regression], [chmod +x cts/cts-regression])
AC_CONFIG_FILES([cts/cts-stonithd], [chmod +x cts/cts-stonithd])
AC_CONFIG_FILES([cts/lxc_autogen.sh], [chmod +x cts/lxc_autogen.sh])
AC_CONFIG_FILES([cts/benchmark/clubench], [chmod +x cts/benchmark/clubench])
AC_CONFIG_FILES([cts/fence_dummy], [chmod +x cts/fence_dummy])
AC_CONFIG_FILES([cts/pacemaker-cts-dummyd], [chmod +x cts/pacemaker-cts-dummyd])
AC_CONFIG_FILES([fencing/fence_legacy], [chmod +x fencing/fence_legacy])
AC_CONFIG_FILES([tools/crm_failcount], [chmod +x tools/crm_failcount])
AC_CONFIG_FILES([tools/crm_master], [chmod +x tools/crm_master])
AC_CONFIG_FILES([tools/crm_report], [chmod +x tools/crm_report])
AC_CONFIG_FILES([tools/crm_standby], [chmod +x tools/crm_standby])
AC_CONFIG_FILES([tools/cibsecret], [chmod +x tools/cibsecret])
dnl Other files we output
AC_CONFIG_FILES(Makefile \
Doxyfile \
cts/Makefile \
cts/CTS.py \
cts/CTSvars.py \
cts/benchmark/Makefile \
cts/pacemaker-cts-dummyd@.service \
cib/Makefile \
daemons/Makefile \
daemons/attrd/Makefile \
daemons/execd/Makefile \
daemons/execd/pacemaker_remote \
daemons/execd/pacemaker_remote.service \
daemons/pacemakerd/Makefile \
daemons/pacemakerd/pacemaker \
daemons/pacemakerd/pacemaker.service \
daemons/pacemakerd/pacemaker.upstart \
daemons/pacemakerd/pacemaker.combined.upstart \
crmd/Makefile \
pengine/Makefile \
doc/Makefile \
doc/Clusters_from_Scratch/publican.cfg \
doc/Pacemaker_Administration/publican.cfg \
doc/Pacemaker_Development/publican.cfg \
doc/Pacemaker_Explained/publican.cfg \
doc/Pacemaker_Remote/publican.cfg \
fencing/Makefile \
include/Makefile \
include/crm/Makefile \
include/crm/cib/Makefile \
include/crm/common/Makefile \
include/crm/cluster/Makefile \
include/crm/fencing/Makefile \
include/crm/pengine/Makefile \
replace/Makefile \
lib/Makefile \
lib/pacemaker.pc \
lib/pacemaker-cib.pc \
lib/pacemaker-lrmd.pc \
lib/pacemaker-service.pc \
lib/pacemaker-pengine.pc \
lib/pacemaker-fencing.pc \
lib/pacemaker-cluster.pc \
lib/common/Makefile \
lib/cluster/Makefile \
lib/cib/Makefile \
lib/pengine/Makefile \
lib/transition/Makefile \
lib/fencing/Makefile \
lib/lrmd/Makefile \
lib/services/Makefile \
extra/Makefile \
extra/alerts/Makefile \
extra/resources/Makefile \
extra/logrotate/Makefile \
extra/logrotate/pacemaker \
tools/Makefile \
tools/report.collector \
tools/report.common \
tools/crm_mon.service \
tools/crm_mon.upstart \
xml/Makefile \
lib/gnu/Makefile \
)
dnl Now process the entire list of files added by previous
dnl calls to AC_CONFIG_FILES()
AC_OUTPUT()
dnl *****************
dnl Configure summary
dnl *****************
AC_MSG_RESULT([])
AC_MSG_RESULT([$PACKAGE configuration:])
AC_MSG_RESULT([ Version = ${VERSION} (Build: $BUILD_VERSION)])
AC_MSG_RESULT([ Features =${PCMK_FEATURES}])
AC_MSG_RESULT([])
AC_MSG_RESULT([ Prefix = ${prefix}])
AC_MSG_RESULT([ Executables = ${sbindir}])
AC_MSG_RESULT([ Man pages = ${mandir}])
AC_MSG_RESULT([ Libraries = ${libdir}])
AC_MSG_RESULT([ Header files = ${includedir}])
AC_MSG_RESULT([ Arch-independent files = ${datadir}])
AC_MSG_RESULT([ State information = ${localstatedir}])
AC_MSG_RESULT([ System configuration = ${sysconfdir}])
AC_MSG_RESULT([])
AC_MSG_RESULT([ HA group name = ${CRM_DAEMON_GROUP}])
AC_MSG_RESULT([ HA user name = ${CRM_DAEMON_USER}])
AC_MSG_RESULT([])
AC_MSG_RESULT([ CFLAGS = ${CFLAGS}])
AC_MSG_RESULT([ CFLAGS_HARDENED_EXE = ${CFLAGS_HARDENED_EXE}])
AC_MSG_RESULT([ CFLAGS_HARDENED_LIB = ${CFLAGS_HARDENED_LIB}])
AC_MSG_RESULT([ LDFLAGS_HARDENED_EXE = ${LDFLAGS_HARDENED_EXE}])
AC_MSG_RESULT([ LDFLAGS_HARDENED_LIB = ${LDFLAGS_HARDENED_LIB}])
AC_MSG_RESULT([ Libraries = ${LIBS}])
AC_MSG_RESULT([ Stack Libraries = ${CLUSTERLIBS}])
diff --git a/cts/CTStests.py b/cts/CTStests.py
index aed0856eb9..ffac2b0f09 100644
--- a/cts/CTStests.py
+++ b/cts/CTStests.py
@@ -1,3117 +1,3117 @@
""" Test-specific classes for Pacemaker's Cluster Test Suite (CTS)
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
__copyright__ = """Copyright 2000, 2001 Alan Robertson <alanr@unix.sh>
Add ResourceRecover testcase Zhao Kai <zhaokai@cn.ibm.com>
"""
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
#
# SPECIAL NOTE:
#
# Tests may NOT implement any cluster-manager-specific code in them.
# EXTEND the ClusterManager object to provide the base capabilities
# the test needs if you need to do something that the current CM classes
# do not. Otherwise you screw up the whole point of the object structure
# in CTS.
#
# Thank you.
#
import os
import re
import time
import subprocess
import tempfile
from stat import *
from cts import CTS
from cts.CTSaudits import *
from cts.CTSvars import *
from cts.patterns import PatternSelector
from cts.logging import LogFactory
from cts.remote import RemoteFactory, input_wrapper
from cts.watcher import LogWatcher
from cts.environment import EnvFactory
AllTestClasses = [ ]
class CTSTest(object):
'''
A Cluster test.
We implement the basic set of properties and behaviors for a generic
cluster test.
Cluster tests track their own statistics.
We keep each of the kinds of counts we track as separate {name,value}
pairs.
'''
def __init__(self, cm):
#self.name="the unnamed test"
self.Stats = {"calls":0
, "success":0
, "failure":0
, "skipped":0
, "auditfail":0}
# if not issubclass(cm.__class__, ClusterManager):
# raise ValueError("Must be a ClusterManager object")
self.CM = cm
self.Env = EnvFactory().getInstance()
self.rsh = RemoteFactory().getInstance()
self.logger = LogFactory()
self.templates = PatternSelector(cm["Name"])
self.Audits = []
self.timeout = 120
self.passed = 1
self.is_loop = 0
self.is_unsafe = 0
self.is_docker_unsafe = 0
self.is_experimental = 0
self.is_container = 0
self.is_valgrind = 0
self.benchmark = 0 # which tests to benchmark
self.timer = {} # timers
def log(self, args):
self.logger.log(args)
def debug(self, args):
self.logger.debug(args)
def has_key(self, key):
return key in self.Stats
def __setitem__(self, key, value):
self.Stats[key] = value
def __getitem__(self, key):
if str(key) == "0":
raise ValueError("Bad call to 'foo in X', should reference 'foo in X.Stats' instead")
if key in self.Stats:
return self.Stats[key]
return None
def log_mark(self, msg):
self.debug("MARK: test %s %s %d" % (self.name,msg,time.time()))
return
def get_timer(self,key = "test"):
try: return self.timer[key]
except KeyError: return 0
def set_timer(self,key = "test"):
self.timer[key] = time.time()
return self.timer[key]
def log_timer(self,key = "test"):
elapsed = 0
if key in self.timer:
elapsed = time.time() - self.timer[key]
s = self.name if key == "test" else "%s:%s" % (self.name, key)
self.debug("%s runtime: %.2f" % (s, elapsed))
del self.timer[key]
return elapsed
def incr(self, name):
'''Increment (or initialize) the value associated with the given name'''
if not name in self.Stats:
self.Stats[name] = 0
self.Stats[name] = self.Stats[name]+1
# Reset the test passed boolean
if name == "calls":
self.passed = 1
def failure(self, reason="none"):
'''Increment the failure count'''
self.passed = 0
self.incr("failure")
self.logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason)
return None
def success(self):
'''Increment the success count'''
self.incr("success")
return 1
def skipped(self):
'''Increment the skipped count'''
self.incr("skipped")
return 1
def __call__(self, node):
'''Perform the given test'''
raise ValueError("Abstract Class member (__call__)")
self.incr("calls")
return self.failure()
def audit(self):
passed = 1
if len(self.Audits) > 0:
for audit in self.Audits:
if not audit():
self.logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name()))
self.incr("auditfail")
passed = 0
return passed
def setup(self, node):
'''Setup the given test'''
return self.success()
def teardown(self, node):
'''Tear down the given test'''
return self.success()
def create_watch(self, patterns, timeout, name=None):
if not name:
name = self.name
return LogWatcher(self.Env["LogFileName"], patterns, name, timeout, kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
def local_badnews(self, prefix, watch, local_ignore=[]):
errcount = 0
if not prefix:
prefix = "LocalBadNews:"
ignorelist = []
ignorelist.append(" CTS: ")
ignorelist.append(prefix)
ignorelist.extend(local_ignore)
while errcount < 100:
match = watch.look(0)
if match:
add_err = 1
for ignore in ignorelist:
if add_err == 1 and re.search(ignore, match):
add_err = 0
if add_err == 1:
self.logger.log(prefix + " " + match)
errcount = errcount + 1
else:
break
else:
self.logger.log("Too many errors!")
watch.end()
return errcount
def is_applicable(self):
return self.is_applicable_common()
def is_applicable_common(self):
'''Return TRUE if we are applicable in the current test configuration'''
#raise ValueError("Abstract Class member (is_applicable)")
if self.is_loop and not self.Env["loop-tests"]:
return 0
elif self.is_unsafe and not self.Env["unsafe-tests"]:
return 0
elif self.is_valgrind and not self.Env["valgrind-tests"]:
return 0
elif self.is_experimental and not self.Env["experimental-tests"]:
return 0
elif self.is_docker_unsafe and self.Env["docker"]:
return 0
elif self.is_container and not self.Env["container-tests"]:
return 0
elif self.Env["benchmark"] and self.benchmark == 0:
return 0
return 1
def find_ocfs2_resources(self, node):
self.r_o2cb = None
self.r_ocfs2 = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rtype == "o2cb" and r.parent != "NA":
self.debug("Found o2cb: %s" % self.r_o2cb)
self.r_o2cb = r.parent
if re.search("^Constraint", line):
c = AuditConstraint(self.CM, line)
if c.type == "rsc_colocation" and c.target == self.r_o2cb:
self.r_ocfs2.append(c.rsc)
self.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2))
return len(self.r_ocfs2)
def canrunnow(self, node):
'''Return TRUE if we can meaningfully run right now'''
return 1
def errorstoignore(self):
'''Return list of errors which are 'normal' and should be ignored'''
return []
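# A minimal sketch (illustrative only, never registered) of how a subclass
# is expected to use the bookkeeping above: bump "calls" first, wrap the
# timed section in set_timer()/log_timer(), and always return through
# success()/failure() so the Stats counters stay consistent. "NoopTest" is
# a hypothetical name, not part of the CTS API.
class NoopTest(CTSTest):
    '''Example test that does nothing, successfully'''
    def __init__(self, cm):
        CTSTest.__init__(self, cm)
        self.name = "Noop"
    def __call__(self, node):
        self.incr("calls")      # also resets self.passed for this run
        self.set_timer()        # start the default "test" timer
        self.log_timer()        # log elapsed time and drop the timer
        return self.success()   # increments the "success" counter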
class StopTest(CTSTest):
'''Stop (deactivate) the cluster manager on a node'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Stop"
def __call__(self, node):
'''Perform the 'stop' test. '''
self.incr("calls")
if self.CM.ShouldBeStatus[node] != "up":
return self.skipped()
patterns = []
# Technically we should always be able to notice ourselves stopping
patterns.append(self.templates["Pat:We_stopped"] % node)
# Any active node needs to notice this one left
# (note that this won't work if we have multiple partitions)
for other in self.Env["nodes"]:
if self.CM.ShouldBeStatus[other] == "up" and other != node:
patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node)))
#self.debug("Checking %s will notice %s left"%(other, node))
watch = self.create_watch(patterns, self.Env["DeadTime"])
watch.setwatch()
if node == self.CM.OurNode:
self.incr("us")
else:
if self.CM.upcount() <= 1:
self.incr("all")
else:
self.incr("them")
self.CM.StopaCM(node)
watch_result = watch.lookforall()
failreason = None
UnmatchedList = "||"
if watch.unmatched:
(rc, output) = self.rsh(node, "/bin/ps axf", None)
for line in output:
self.debug(line)
(rc, output) = self.rsh(node, "/usr/sbin/dlm_tool dump", None)
for line in output:
self.debug(line)
for regex in watch.unmatched:
self.logger.log ("ERROR: Shutdown pattern not found: %s" % (regex))
UnmatchedList += regex + "||"
failreason = "Missing shutdown pattern"
self.CM.cluster_stable(self.Env["DeadTime"])
if not watch.unmatched or self.CM.upcount() == 0:
return self.success()
if len(watch.unmatched) >= self.CM.upcount():
return self.failure("no match against (%s)" % UnmatchedList)
if failreason == None:
return self.success()
else:
return self.failure(failreason)
#
# We don't register StopTest because it's better when called by
# another test...
#
class StartTest(CTSTest):
'''Start (activate) the cluster manager on a node'''
def __init__(self, cm, debug=None):
CTSTest.__init__(self,cm)
self.name = "start"
self.debug = debug
def __call__(self, node):
'''Perform the 'start' test. '''
self.incr("calls")
if self.CM.upcount() == 0:
self.incr("us")
else:
self.incr("them")
if self.CM.ShouldBeStatus[node] != "down":
return self.skipped()
elif self.CM.StartaCM(node):
return self.success()
else:
return self.failure("Startup %s on node %s failed"
% (self.Env["Name"], node))
#
# We don't register StartTest because it's better when called by
# another test...
#
class FlipTest(CTSTest):
'''If it's running, stop it. If it's stopped, start it.
Overthrow the status quo...
'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Flip"
self.start = StartTest(cm)
self.stop = StopTest(cm)
def __call__(self, node):
'''Perform the 'Flip' test. '''
self.incr("calls")
if self.CM.ShouldBeStatus[node] == "up":
self.incr("stopped")
ret = self.stop(node)
type = "up->down"
# Give the cluster time to recognize it's gone...
time.sleep(self.Env["StableTime"])
elif self.CM.ShouldBeStatus[node] == "down":
self.incr("started")
ret = self.start(node)
type = "down->up"
else:
return self.skipped()
self.incr(type)
if ret:
return self.success()
else:
return self.failure("%s failure" % type)
# Register FlipTest as a good test to run
AllTestClasses.append(FlipTest)
class RestartTest(CTSTest):
'''Stop and restart a node'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Restart"
self.start = StartTest(cm)
self.stop = StopTest(cm)
self.benchmark = 1
def __call__(self, node):
'''Perform the 'restart' test. '''
self.incr("calls")
self.incr("node:" + node)
ret1 = 1
if self.CM.StataCM(node):
self.incr("WasStopped")
if not self.start(node):
return self.failure("start (setup) failure: "+node)
self.set_timer()
if not self.stop(node):
return self.failure("stop failure: "+node)
if not self.start(node):
return self.failure("start failure: "+node)
return self.success()
# Register RestartTest as a good test to run
AllTestClasses.append(RestartTest)
class StonithdTest(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Stonithd"
self.startall = SimulStartLite(cm)
self.benchmark = 1
def __call__(self, node):
self.incr("calls")
if len(self.Env["nodes"]) < 2:
return self.skipped()
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
is_dc = self.CM.is_node_dc(node)
watchpats = []
watchpats.append(self.templates["Pat:FenceOpOK"] % node)
watchpats.append(self.templates["Pat:NodeFenced"] % node)
if self.Env["at-boot"] == 0:
self.debug("Expecting %s to stay down" % node)
self.CM.ShouldBeStatus[node] = "down"
else:
self.debug("Expecting %s to come up again %d" % (node, self.Env["at-boot"]))
watchpats.append("%s.* S_STARTING -> S_PENDING" % node)
watchpats.append("%s.* S_PENDING -> S_NOT_DC" % node)
watch = self.create_watch(watchpats, 30 + self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
watch.setwatch()
origin = self.Env.RandomGen.choice(self.Env["nodes"])
rc = self.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node)
if rc == 194:
# 194 - 256 = -62 = Timer expired
#
# Look for the patterns, usually this means the required
# device was running on the node to be fenced - or that
# the required devices were in the process of being loaded
# and/or moved
#
# Effectively the node committed suicide so there will be
# no confirmation, but pacemaker should be watching and
# fence the node again
self.logger.log("Fencing command on %s to fence %s timed out" % (origin, node))
elif origin != node and rc != 0:
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting STONITHd node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc))
elif origin == node and rc != 255:
# 255 == broken pipe, i.e. the node was fenced as expected
self.logger.log("Locally originated fencing returned %d" % rc)
self.set_timer("fence")
matched = watch.lookforall()
self.log_timer("fence")
self.set_timer("reform")
if watch.unmatched:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting STONITHd node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.debug("Waiting for the cluster to re-stabilize with all nodes")
is_stable = self.CM.cluster_stable(self.Env["StartTime"])
if not matched:
return self.failure("Didn't find all expected patterns")
elif not is_stable:
return self.failure("Cluster did not become stable")
self.log_timer("reform")
return self.success()
def errorstoignore(self):
return [
self.templates["Pat:Fencing_start"] % ".*",
self.templates["Pat:Fencing_ok"] % ".*",
r"error.*: Resource .*stonith::.* is active on 2 nodes attempting recovery",
r"error.*: Operation reboot of .*by .* for stonith_admin.*: Timer expired",
]
def is_applicable(self):
if not self.is_applicable_common():
return 0
if "DoFencing" in list(self.Env.keys()):
return self.Env["DoFencing"]
return 1
AllTestClasses.append(StonithdTest)
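# A note on the rc == 194 check above: a process exit status is an unsigned
# byte, so a tool that exits with the negative errno -62 (ETIME, "Timer
# expired" on Linux) is seen by the caller as 256 - 62 = 194. A minimal
# sketch of the conversion (hypothetical helper, not CTS API):
def signed_exit_status(rc):
    '''Map an unsigned 8-bit exit status back to its signed value.'''
    return rc - 256 if rc > 127 else rc

# signed_exit_status(194) == -62, matching the "194 - 256 = -62" comment
# in StonithdTest.__call__ above.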
class StartOnebyOne(CTSTest):
'''Start all the nodes ~ one by one'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "StartOnebyOne"
self.stopall = SimulStopLite(cm)
self.start = StartTest(cm)
self.ns = CTS.NodeStatus(cm.Env)
def __call__(self, dummy):
'''Perform the 'StartOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Test setup failed")
failed = []
self.set_timer()
for node in self.Env["nodes"]:
if not self.start(node):
failed.append(node)
if len(failed) > 0:
return self.failure("Some node failed to start: " + repr(failed))
return self.success()
# Register StartOnebyOne as a good test to run
AllTestClasses.append(StartOnebyOne)
class SimulStart(CTSTest):
'''Start all the nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStart"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
def __call__(self, dummy):
'''Perform the 'SimulStart' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Setup failed")
if not self.startall(None):
return self.failure("Startall failed")
return self.success()
# Register SimulStart as a good test to run
AllTestClasses.append(SimulStart)
class SimulStop(CTSTest):
'''Stop all the nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStop"
self.startall = SimulStartLite(cm)
self.stopall = SimulStopLite(cm)
def __call__(self, dummy):
'''Perform the 'SimulStop' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
if not self.stopall(None):
return self.failure("Stopall failed")
return self.success()
# Register SimulStop as a good test to run
AllTestClasses.append(SimulStop)
class StopOnebyOne(CTSTest):
'''Stop all the nodes in order'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "StopOnebyOne"
self.startall = SimulStartLite(cm)
self.stop = StopTest(cm)
def __call__(self, dummy):
'''Perform the 'StopOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
failed = []
self.set_timer()
for node in self.Env["nodes"]:
if not self.stop(node):
failed.append(node)
if len(failed) > 0:
return self.failure("Some node failed to stop: " + repr(failed))
return self.success()
# Register StopOnebyOne as a good test to run
AllTestClasses.append(StopOnebyOne)
class RestartOnebyOne(CTSTest):
'''Restart all the nodes in order'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RestartOnebyOne"
self.startall = SimulStartLite(cm)
def __call__(self, dummy):
'''Perform the 'RestartOnebyOne' test. '''
self.incr("calls")
# We ignore the "node" parameter...
# Start up all the nodes...
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
did_fail = []
self.set_timer()
self.restart = RestartTest(self.CM)
for node in self.Env["nodes"]:
if not self.restart(node):
did_fail.append(node)
if did_fail:
return self.failure("Could not restart %d nodes: %s"
% (len(did_fail), repr(did_fail)))
return self.success()
# Register RestartOnebyOne as a good test to run
AllTestClasses.append(RestartOnebyOne)
class PartialStart(CTSTest):
'''Start a node - but tell it to stop before it finishes starting up'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "PartialStart"
self.startall = SimulStartLite(cm)
self.stopall = SimulStopLite(cm)
self.stop = StopTest(cm)
#self.is_unsafe = 1
def __call__(self, node):
'''Perform the 'PartialStart' test. '''
self.incr("calls")
ret = self.stopall(None)
if not ret:
return self.failure("Setup failed")
# FIXME! This should use the CM class to get the pattern
# then it would be applicable in general
watchpats = []
watchpats.append("crmd.*Connecting to cluster infrastructure")
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.CM.StartaCMnoBlock(node)
ret = watch.lookforall()
if not ret:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
return self.failure("Setup of %s failed" % node)
ret = self.stop(node)
if not ret:
return self.failure("%s did not stop in time" % node)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# We might do some fencing in the 2-node case if we make it up far enough
return [
r"Executing reboot fencing operation",
r"Requesting fencing \([^)]+\) of node ",
]
# Register PartialStart as a good test to run
AllTestClasses.append(PartialStart)
class StandbyTest(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Standby"
self.benchmark = 1
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
# make sure the node is active
# set the node to standby mode
# check resources; no resources should be running on the node
# set the node to active mode
# check resources; resources should have been migrated back (SHOULD THEY?)
def __call__(self, node):
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Start all nodes failed")
self.debug("Make sure node %s is active" % node)
if self.CM.StandbyStatus(node) != "off":
if not self.CM.SetStandbyMode(node, "off"):
return self.failure("can't set node %s to active mode" % node)
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "off":
return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
self.debug("Getting resources running on node %s" % node)
rsc_on_node = self.CM.active_resources(node)
watchpats = []
watchpats.append(r"State transition .* -> S_POLICY_ENGINE")
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.debug("Setting node %s to standby mode" % node)
if not self.CM.SetStandbyMode(node, "on"):
return self.failure("can't set node %s to standby mode" % node)
self.set_timer("on")
ret = watch.lookforall()
if not ret:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.CM.SetStandbyMode(node, "off")
return self.failure("cluster didn't react to standby change on %s" % node)
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "on":
return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status))
self.log_timer("on")
self.debug("Checking resources")
bad_run = self.CM.active_resources(node)
if len(bad_run) > 0:
rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run)))
self.debug("Setting node %s to active mode" % node)
self.CM.SetStandbyMode(node, "off")
return rc
self.debug("Setting node %s to active mode" % node)
if not self.CM.SetStandbyMode(node, "off"):
return self.failure("can't set node %s to active mode" % node)
self.set_timer("off")
self.CM.cluster_stable()
status = self.CM.StandbyStatus(node)
if status != "off":
return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
self.log_timer("off")
return self.success()
AllTestClasses.append(StandbyTest)
class ValgrindTest(CTSTest):
'''Check for memory leaks'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Valgrind"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
self.is_valgrind = 1
self.is_loop = 1
def setup(self, node):
self.incr("calls")
ret = self.stopall(None)
if not ret:
return self.failure("Stop all nodes failed")
# Enable valgrind
self.logger.logPat = "/tmp/%s-*.valgrind" % self.name
self.Env["valgrind-prefix"] = self.name
self.rsh(node, "rm -f %s" % self.logger.logPat, None)
ret = self.startall(None)
if not ret:
return self.failure("Start all nodes failed")
return self.success()
def teardown(self, node):
# Disable valgrind
self.Env["valgrind-prefix"] = None
# Return all nodes to normal
ret = self.stopall(None)
if not ret:
return self.failure("Stop all nodes failed")
return self.success()
def find_leaks(self):
# Check for leaks
leaked = []
self.stop = StopTest(self.CM)
for node in self.Env["nodes"]:
rc = self.stop(node)
if not rc:
self.failure("Couldn't shut down %s" % node)
rc = self.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e (ERROR|error).*SUMMARY:.*[1-9].*errors %s" % self.logger.logPat, 0)
if rc != 1:
leaked.append(node)
self.failure("Valgrind errors detected on %s" % node)
(rc, output) = self.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logger.logPat, None)
for line in output:
self.logger.log(line)
(rc, output) = self.rsh(node, "cat %s" % self.logger.logPat, None)
for line in output:
self.debug(line)
self.rsh(node, "rm -f %s" % self.logger.logPat, None)
return leaked
def __call__(self, node):
leaked = self.find_leaks()
if len(leaked) > 0:
return self.failure("Nodes %s leaked" % repr(leaked))
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"cib.*: \*\*\*\*\*\*\*\*\*\*\*\*\*",
r"cib.*: .* avoid confusing Valgrind",
r"HA_VALGRIND_ENABLED",
]
class StandbyLoopTest(ValgrindTest):
'''Check for memory leaks by putting a node in and out of standby for an hour'''
def __init__(self, cm):
ValgrindTest.__init__(self,cm)
self.name = "StandbyLoop"
def __call__(self, node):
lpc = 0
delay = 2
failed = 0
done = time.time() + self.Env["loop-minutes"] * 60
while time.time() <= done and not failed:
lpc = lpc + 1
time.sleep(delay)
if not self.CM.SetStandbyMode(node, "on"):
self.failure("can't set node %s to standby mode" % node)
failed = lpc
time.sleep(delay)
if not self.CM.SetStandbyMode(node, "off"):
self.failure("can't set node %s to active mode" % node)
failed = lpc
leaked = self.find_leaks()
if failed:
return self.failure("Iteration %d failed" % failed)
elif len(leaked) > 0:
return self.failure("Nodes %s leaked" % repr(leaked))
return self.success()
AllTestClasses.append(StandbyLoopTest)
class BandwidthTest(CTSTest):
# Tests should not be cluster-manager-specific
# If you need to find out cluster manager configuration to do this, then
# it should be added to the generic cluster manager API.
'''Test the bandwidth which the cluster uses'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "Bandwidth"
self.start = StartTest(cm)
self.__setitem__("min",0)
self.__setitem__("max",0)
self.__setitem__("totalbandwidth",0)
(handle, self.tempfile) = tempfile.mkstemp(".cts")
os.close(handle)
self.startall = SimulStartLite(cm)
def __call__(self, node):
'''Perform the Bandwidth test'''
self.incr("calls")
if self.CM.upcount() < 1:
return self.skipped()
Path = self.CM.InternalCommConfig()
if "ip" not in Path["mediatype"]:
return self.skipped()
port = Path["port"][0]
port = int(port)
ret = self.startall(None)
if not ret:
return self.failure("Test setup failed")
time.sleep(5) # We get extra messages right after startup.
fstmpfile = "/var/run/band_estimate"
dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \
% (port, fstmpfile)
rc = self.rsh(node, dumpcmd)
if rc == 0:
farfile = "root@%s:%s" % (node, fstmpfile)
self.rsh.cp(farfile, self.tempfile)
Bandwidth = self.countbandwidth(self.tempfile)
if not Bandwidth:
self.logger.log("Could not compute bandwidth.")
return self.success()
intband = int(Bandwidth + 0.5)
self.logger.log("...bandwidth: %d bits/sec" % intband)
self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth
if self.Stats["min"] == 0:
self.Stats["min"] = Bandwidth
if Bandwidth > self.Stats["max"]:
self.Stats["max"] = Bandwidth
if Bandwidth < self.Stats["min"]:
self.Stats["min"] = Bandwidth
self.rsh(node, "rm -f %s" % fstmpfile)
os.unlink(self.tempfile)
return self.success()
else:
return self.failure("no response from tcpdump command [%d]!" % rc)
def countbandwidth(self, file):
fp = open(file, "r")
fp.seek(0)
count = 0
sum = 0
while 1:
line = fp.readline()
if not line:
return None
if re.search("udp",line) or re.search("UDP,", line):
count = count + 1
linesplit = line.split(" ")
for j in range(len(linesplit)-1):
if linesplit[j] == "udp": break
if linesplit[j] == "length:": break
try:
sum = sum + int(linesplit[j+1])
except ValueError:
self.logger.log("Invalid tcpdump line: %s" % line)
return None
T1 = linesplit[0]
timesplit = T1.split(":")
time2split = timesplit[2].split(".")
time1 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001
break
while count < 100:
line = fp.readline()
if not line:
return None
if re.search("udp",line) or re.search("UDP,", line):
count = count+1
linessplit = line.split(" ")
for j in range(len(linessplit)-1):
if linessplit[j] == "udp": break
if linesplit[j] == "length:": break
try:
sum = int(linessplit[j+1]) + sum
except ValueError:
self.logger.log("Invalid tcpdump line: %s" % line)
return None
T2 = linessplit[0]
timesplit = T2.split(":")
time2split = timesplit[2].split(".")
time2 = (int(timesplit[0])*60+int(timesplit[1]))*60+int(time2split[0])+int(time2split[1])*0.000001
elapsed = time2-time1
if elapsed <= 0:
return 0
return int((sum*8)/elapsed)
def is_applicable(self):
'''BandwidthTest never applicable'''
return 0
AllTestClasses.append(BandwidthTest)
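# countbandwidth() above boils down to: add up the "length:" payload bytes
# of up to ~100 UDP packets from the tcpdump capture, then divide 8 * bytes
# by the time between the first and last packet timestamps. A minimal
# standalone sketch of that final calculation (hypothetical helper; the
# sample numbers are invented for illustration):
def estimate_bits_per_second(payload_bytes, t_first, t_last):
    '''Average bits/sec over [t_first, t_last]; 0 if the span is empty.'''
    elapsed = t_last - t_first
    if elapsed <= 0:
        return 0
    return int((payload_bytes * 8) / elapsed)

# estimate_bits_per_second(51000, 10.0, 12.5) == 163200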
###################################################################
class MaintenanceMode(CTSTest):
###################################################################
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "MaintenanceMode"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.max = 30
#self.is_unsafe = 1
self.benchmark = 1
self.action = "asyncmon"
self.interval = 0
self.rid = "maintenanceDummy"
def toggleMaintenanceMode(self, node, action):
pats = []
pats.append(self.templates["Pat:DC_IDLE"])
# fail the resource right after turning Maintenance mode on
# verify it is not recovered until maintenance mode is turned off
if action == "On":
pats.append(r"pengine.*:\s+warning:.*Processing failed op %s for %s on" % (self.action, self.rid))
else:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.debug("Turning maintenance mode %s" % action)
self.rsh(node, self.templates["MaintenanceMode%s" % (action)])
if (action == "On"):
self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
self.set_timer("recover%s" % (action))
watch.lookforall()
self.log_timer("recover%s" % (action))
if watch.unmatched:
self.debug("Failed to find patterns when turning maintenance mode %s" % action)
return repr(watch.unmatched)
return ""
def insertMaintenanceDummy(self, node):
pats = []
pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % ("start", self.rid)))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.CM.AddDummyRsc(node, self.rid)
self.set_timer("addDummy")
watch.lookforall()
self.log_timer("addDummy")
if watch.unmatched:
self.debug("Failed to find patterns when adding maintenance dummy resource")
return repr(watch.unmatched)
return ""
def removeMaintenanceDummy(self, node):
pats = []
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.CM.RemoveDummyRsc(node, self.rid)
self.set_timer("removeDummy")
watch.lookforall()
self.log_timer("removeDummy")
if watch.unmatched:
self.debug("Failed to find patterns when removing maintenance dummy resource")
return repr(watch.unmatched)
return ""
def managedRscList(self, node):
rscList = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if tmp.managed():
rscList.append(tmp.id)
return rscList
def verifyResources(self, node, rscList, managed):
managedList = list(rscList)
managed_str = "managed"
if not managed:
managed_str = "unmanaged"
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if managed and not tmp.managed():
continue
elif not managed and tmp.managed():
continue
elif managedList.count(tmp.id):
managedList.remove(tmp.id)
if len(managedList) == 0:
self.debug("Found all %s resources on %s" % (managed_str, node))
return True
self.logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList))
return False
def __call__(self, node):
'''Perform the 'MaintenanceMode' test. '''
self.incr("calls")
verify_managed = False
verify_unmanaged = False
failPat = ""
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
# get a list of all the managed resources. We use this list
# after enabling maintenance mode to verify all managed resources
# become un-managed. After maintenance mode is turned off, we use
# this list to verify all the resources become managed again.
managedResources = self.managedRscList(node)
if len(managedResources) == 0:
self.logger.log("No managed resources on %s" % node)
return self.skipped()
# insert a fake resource we can fail during maintenance mode
# so we can verify recovery does not take place until after maintenance
# mode is disabled.
failPat = failPat + self.insertMaintenanceDummy(node)
# toggle maintenance mode ON, then fail dummy resource.
failPat = failPat + self.toggleMaintenanceMode(node, "On")
# verify all the resources are now unmanaged
if self.verifyResources(node, managedResources, False):
verify_unmanaged = True
# Toggle maintenance mode OFF, verify dummy is recovered.
failPat = failPat + self.toggleMaintenanceMode(node, "Off")
# verify all the resources are now managed again
if self.verifyResources(node, managedResources, True):
verify_managed = True
# Remove our maintenance dummy resource.
failPat = failPat + self.removeMaintenanceDummy(node)
self.CM.cluster_stable()
if failPat != "":
return self.failure("Unmatched patterns: %s" % (failPat))
elif verify_unmanaged is False:
return self.failure("Failed to verify resources became unmanaged during maintenance mode")
elif verify_managed is False:
return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode")
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for %s" % self.rid,
r"pengine.*: Recover %s\s*\(.*\)" % self.rid,
r"Unknown operation: fail",
r"(ERROR|error): sending stonithRA op to stonithd failed.",
self.templates["Pat:RscOpOK"] % (self.action, self.rid),
r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
]
AllTestClasses.append(MaintenanceMode)
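# The templates["MaintenanceModeOn"/"MaintenanceModeOff"] commands used by
# toggleMaintenanceMode() come from the pattern selector; with Pacemaker
# they would typically amount to toggling the maintenance-mode cluster
# property, e.g. (a sketch, assuming crm_attribute is available on the
# target node):
#
#   crm_attribute --name maintenance-mode --update true   # enter maintenance
#   crm_attribute --name maintenance-mode --delete        # leave maintenance
#
# While the property is set, the cluster leaves resources unmanaged, which
# is what the toggleMaintenanceMode()/verifyResources() checks above verify.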
class ResourceRecover(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "ResourceRecover"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.max = 30
self.rid = None
self.rid_alt = None
#self.is_unsafe = 1
self.benchmark = 1
# these are the values used for the new LRM API call
self.action = "asyncmon"
self.interval = 0
def __call__(self, node):
'''Perform the 'ResourceRecover' test. '''
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
resourcelist = self.CM.active_resources(node)
# if the resource list is empty, skip the test
if len(resourcelist) == 0:
self.logger.log("No active resources on %s" % node)
return self.skipped()
self.rid = self.Env.RandomGen.choice(resourcelist)
self.rid_alt = self.rid
rsc = None
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
tmp = AuditResource(self.CM, line)
if tmp.id == self.rid:
rsc = tmp
# Handle anonymous clones that get renamed
self.rid = rsc.clone_id
break
if not rsc:
return self.failure("Could not find %s in the resource list" % self.rid)
self.debug("Shooting %s aka. %s" % (rsc.clone_id, rsc.id))
pats = []
pats.append(r"pengine.*:\s+warning:.*Processing failed op %s for (%s|%s) on" % (self.action,
rsc.id, rsc.clone_id))
if rsc.managed():
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.rid))
if rsc.unique():
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.rid))
else:
# Anonymous clones may get restarted with a different clone number
pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
watch = self.create_watch(pats, 60)
watch.setwatch()
self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
self.set_timer("recover")
watch.lookforall()
self.log_timer("recover")
self.CM.cluster_stable()
recovered = self.CM.ResourceLocation(self.rid)
if watch.unmatched:
return self.failure("Patterns not found: %s" % repr(watch.unmatched))
elif rsc.unique() and len(recovered) > 1:
return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered)))
elif len(recovered) > 0:
self.debug("%s is running on: %s" % (self.rid, repr(recovered)))
elif rsc.managed():
return self.failure("%s was not recovered and is inactive" % self.rid)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for %s" % self.rid,
r"pengine.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt),
r"Unknown operation: fail",
r"(ERROR|error): sending stonithRA op to stonithd failed.",
self.templates["Pat:RscOpOK"] % (self.action, self.rid),
r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
]
AllTestClasses.append(ResourceRecover)
class ComponentFail(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "ComponentFail"
# TODO make this work correctly in docker.
self.is_docker_unsafe = 1
self.startall = SimulStartLite(cm)
self.complist = cm.Components()
self.patterns = []
self.okerrpatterns = []
self.is_unsafe = 1
def __call__(self, node):
'''Perform the 'ComponentFail' test. '''
self.incr("calls")
self.patterns = []
self.okerrpatterns = []
# start all nodes
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
if not self.CM.cluster_stable(self.Env["StableTime"]):
return self.failure("Setup failed - unstable")
node_is_dc = self.CM.is_node_dc(node, None)
# select a component to kill
chosen = self.Env.RandomGen.choice(self.complist)
while chosen.dc_only == 1 and node_is_dc == 0:
chosen = self.Env.RandomGen.choice(self.complist)
self.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot))
self.incr(chosen.name)
if chosen.name != "corosync":
self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name))
self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name))
self.patterns.extend(chosen.pats)
if node_is_dc:
self.patterns.extend(chosen.dc_pats)
if chosen.name == "stonith":
# Ignore actions for STONITH resources
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rclass == "stonith":
self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id)
# supply a copy so self.patterns doesn't end up empty
tmpPats = []
tmpPats.extend(self.patterns)
self.patterns.extend(chosen.badnews_ignore)
# Look for STONITH ops; depending on Env["at-boot"] we might need to change the node's status
stonithPats = []
stonithPats.append(self.templates["Pat:Fencing_ok"] % node)
stonith = self.create_watch(stonithPats, 0)
stonith.setwatch()
# set the watch for stable
watch = self.create_watch(
tmpPats, self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
watch.setwatch()
# kill the component
chosen.kill(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
self.debug("Waiting for any STONITHd node to come back up")
self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
self.debug("Waiting for the cluster to re-stabilize with all nodes")
self.CM.cluster_stable(self.Env["StartTime"])
self.debug("Checking if %s was shot" % node)
shot = stonith.look(60)
if shot:
self.debug("Found: " + repr(shot))
self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node)
if self.Env["at-boot"] == 0:
self.CM.ShouldBeStatus[node] = "down"
# If fencing occurred, chances are many (if not all) of the expected logs
# will not be sent - or will be lost when the node reboots
return self.success()
# check for logs indicating a graceful recovery
matched = watch.lookforall(allow_multiple_matches=1)
if watch.unmatched:
self.logger.log("Patterns not found: " + repr(watch.unmatched))
self.debug("Waiting for the cluster to re-stabilize with all nodes")
is_stable = self.CM.cluster_stable(self.Env["StartTime"])
if not matched:
return self.failure("Didn't find all expected %s patterns" % chosen.name)
elif not is_stable:
return self.failure("Cluster did not become stable after killing %s" % chosen.name)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# Note that okerrpatterns refers to the last time we ran this test
# The good news is that this works fine for us...
self.okerrpatterns.extend(self.patterns)
return self.okerrpatterns
AllTestClasses.append(ComponentFail)
class SplitBrainTest(CTSTest):
'''Test split-brain: when the path between the two nodes breaks,
check whether both nodes take over the resource'''
def __init__(self,cm):
CTSTest.__init__(self,cm)
self.name = "SplitBrain"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.is_experimental = 1
def isolate_partition(self, partition):
other_nodes = []
other_nodes.extend(self.Env["nodes"])
for node in partition:
try:
other_nodes.remove(node)
except ValueError:
self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]) + " from " +repr(partition))
if len(other_nodes) == 0:
return 1
self.debug("Creating partition: " + repr(partition))
self.debug("Everyone else: " + repr(other_nodes))
for node in partition:
if not self.CM.isolate_node(node, other_nodes):
self.logger.log("Could not isolate %s" % node)
return 0
return 1
def heal_partition(self, partition):
other_nodes = []
other_nodes.extend(self.Env["nodes"])
for node in partition:
try:
other_nodes.remove(node)
except ValueError:
self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]))
if len(other_nodes) == 0:
return 1
self.debug("Healing partition: " + repr(partition))
self.debug("Everyone else: " + repr(other_nodes))
for node in partition:
self.CM.unisolate_node(node, other_nodes)
def __call__(self, node):
'''Perform split-brain test'''
self.incr("calls")
self.passed = 1
partitions = {}
ret = self.startall(None)
if not ret:
return self.failure("Setup failed")
while 1:
# Retry until we get multiple partitions
partitions = {}
p_max = len(self.Env["nodes"])
for node in self.Env["nodes"]:
p = self.Env.RandomGen.randint(1, p_max)
if not p in partitions:
partitions[p] = []
partitions[p].append(node)
p_max = len(list(partitions.keys()))
if p_max > 1:
break
# else, try again
self.debug("Created %d partitions" % p_max)
for key in list(partitions.keys()):
self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key]))
# Disabling STONITH to reduce test complexity for now
self.rsh(node, "crm_attribute -V -n stonith-enabled -v false")
for key in list(partitions.keys()):
self.isolate_partition(partitions[key])
count = 30
while count > 0:
if len(self.CM.find_partitions()) != p_max:
time.sleep(10)
count -= 1
else:
break
else:
self.failure("Expected partitions were not created")
# Target number of partitions formed - wait for stability
if not self.CM.cluster_stable():
self.failure("Partitioned cluster not stable")
# Now audit the cluster state
self.CM.partitions_expected = p_max
if not self.audit():
self.failure("Audits failed")
self.CM.partitions_expected = 1
# And heal them again
for key in list(partitions.keys()):
self.heal_partition(partitions[key])
# Wait for a single partition to form
count = 30
while count > 0:
if len(self.CM.find_partitions()) != 1:
time.sleep(10)
count -= 1
else:
break
else:
self.failure("Cluster did not reform")
# Wait for it to have the right number of members
count = 30
while count > 0:
members = []
partitions = self.CM.find_partitions()
if len(partitions) > 0:
members = partitions[0].split()
if len(members) != len(self.Env["nodes"]):
time.sleep(10)
count -= 1
else:
break
else:
self.failure("Cluster did not completely reform")
# Wait up to 20 minutes - the delay is preferable to trying to
# continue in a messed-up state
if not self.CM.cluster_stable(1200):
self.failure("Reformed cluster not stable")
if self.Env["continue"] == 1:
answer = "Y"
else:
try:
answer = input_wrapper('Continue? [nY]')
except EOFError as e:
answer = "n"
if answer and answer == "n":
raise ValueError("Reformed cluster not stable")
# Turn fencing back on
if self.Env["DoFencing"]:
self.rsh(node, "crm_attribute -V -D -n stonith-enabled")
self.CM.cluster_stable()
if self.passed:
return self.success()
return self.failure("See previous errors")
def errorstoignore(self):
'''Return list of errors which are 'normal' and should be ignored'''
return [
r"Another DC detected:",
r"(ERROR|error).*: .*Application of an update diff failed",
r"crmd.*:.*not in our membership list",
r"CRIT:.*node.*returning after partition",
]
def is_applicable(self):
if not self.is_applicable_common():
return 0
return len(self.Env["nodes"]) > 2
AllTestClasses.append(SplitBrainTest)
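# The partition setup in SplitBrainTest.__call__ assigns each node a random
# partition number and retries until at least two distinct partitions come
# out of the draw. A simplified standalone sketch of that step (hypothetical
# helper; rng is any random.Random, e.g. self.Env.RandomGen):
def random_partitions(nodes, rng):
    '''Split nodes into two or more random groups, redrawing as needed.'''
    while True:
        partitions = {}
        for node in nodes:
            partitions.setdefault(rng.randint(1, len(nodes)), []).append(node)
        if len(partitions) > 1:
            return partitions

# e.g. random_partitions(["node1", "node2", "node3"], rng) might return
# {1: ["node1", "node3"], 3: ["node2"]}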
class Reattach(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "Reattach"
self.startall = SimulStartLite(cm)
self.restart1 = RestartTest(cm)
self.stopall = SimulStopLite(cm)
self.is_unsafe = 0 # Handled by canrunnow()
def _is_managed(self, node):
is_managed = self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -q -G -d true", 1)
is_managed = is_managed[:-1] # Strip off the newline
return is_managed == "true"
def _set_unmanaged(self, node):
self.debug("Disable resource management")
self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -v false")
def _set_managed(self, node):
self.debug("Re-enable resource management")
self.rsh(node, "crm_attribute -t rsc_defaults -n is-managed -D")
def setup(self, node):
attempt = 0
if not self.startall(None):
return None
# Make sure we are really _really_ stable and that all
# resources, including those that depend on transient node
# attributes, are started
while not self.CM.cluster_stable(double_check=True):
if attempt < 5:
attempt += 1
self.debug("Not stable yet, re-testing")
else:
self.logger.log("Cluster is not stable")
return None
return 1
def teardown(self, node):
# Make sure 'node' is up
start = StartTest(self.CM)
start(node)
if not self._is_managed(node):
self.logger.log("Attempting to re-enable resource management on %s" % node)
self._set_managed(node)
self.CM.cluster_stable()
if not self._is_managed(node):
self.logger.log("Could not re-enable resource management")
return 0
return 1
def canrunnow(self, node):
'''Return TRUE if we can meaningfully run right now'''
if self.find_ocfs2_resources(node):
self.logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present")
return 0
return 1
def __call__(self, node):
self.incr("calls")
pats = []
# Conveniently, pengine will display this message when disabling management,
# even if fencing is not enabled, so we can rely on it.
managed = self.create_watch(["Delaying fencing operations"], 60)
managed.setwatch()
self._set_unmanaged(node)
if not managed.lookforall():
self.logger.log("Patterns not found: " + repr(managed.unmatched))
return self.failure("Resource management not disabled")
pats = []
pats.append(self.templates["Pat:RscOpOK"] % ("start", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("stop", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("promote", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("demote", ".*"))
pats.append(self.templates["Pat:RscOpOK"] % ("migrate", ".*"))
watch = self.create_watch(pats, 60, "ShutdownActivity")
watch.setwatch()
self.debug("Shutting down the cluster")
ret = self.stopall(None)
if not ret:
self._set_managed(node)
return self.failure("Couldn't shut down the cluster")
self.debug("Bringing the cluster back up")
ret = self.startall(None)
time.sleep(5) # allow ping to update the CIB
if not ret:
self._set_managed(node)
return self.failure("Couldn't restart the cluster")
if self.local_badnews("ResourceActivity:", watch):
self._set_managed(node)
return self.failure("Resources stopped or started during cluster restart")
watch = self.create_watch(pats, 60, "StartupActivity")
watch.setwatch()
# Re-enable resource management (and verify it happened).
self._set_managed(node)
self.CM.cluster_stable()
if not self._is_managed(node):
return self.failure("Could not re-enable resource management")
# Ignore actions for STONITH resources
ignore = []
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rclass == "stonith":
self.debug("Ignoring start actions for %s" % r.id)
ignore.append(self.templates["Pat:RscOpOK"] % ("start", r.id))
if self.local_badnews("ResourceActivity:", watch, ignore):
return self.failure("Resources stopped or started after resource management was re-enabled")
return ret
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"resource( was|s were) active at shutdown",
]
def is_applicable(self):
return 1
AllTestClasses.append(Reattach)
class SpecialTest1(CTSTest):
'''Set up a custom test to cause quorum failure issues for Andrew'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SpecialTest1"
self.startall = SimulStartLite(cm)
self.restart1 = RestartTest(cm)
self.stopall = SimulStopLite(cm)
def __call__(self, node):
'''Perform the 'SpecialTest1' test for Andrew. '''
self.incr("calls")
# Shut down all the nodes...
ret = self.stopall(None)
if not ret:
return self.failure("Could not stop all nodes")
# Test config recovery when the other nodes come up
self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*")
# Start the selected node
ret = self.restart1(node)
if not ret:
return self.failure("Could not start "+node)
# Start all remaining nodes
ret = self.startall(None)
if not ret:
return self.failure("Could not start the remaining nodes")
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
# Errors that occur as a result of the CIB being wiped
return [
r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed",
r"error.*: Resource start-up disabled since no STONITH resources have been defined",
r"error.*: Either configure some or disable STONITH with the stonith-enabled option",
r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity",
]
AllTestClasses.append(SpecialTest1)
class HAETest(CTSTest):
'''Base class for HA Extension (HAE) tests involving DLM, O2CB, and OCFS2 resources'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "HAETest"
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
self.is_loop = 1
def setup(self, node):
# Start all remaining nodes
ret = self.startall(None)
if not ret:
return self.failure("Couldn't start all nodes")
return self.success()
def teardown(self, node):
# Stop everything
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
return self.success()
def wait_on_state(self, node, resource, expected_clones, attempts=240):
while attempts > 0:
active = 0
(rc, lines) = self.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None)
# Hack until crm_resource does the right thing
if rc == 0 and lines:
active = len(lines)
if len(lines) == expected_clones:
return 1
elif rc == 1:
self.debug("Resource %s is still inactive" % resource)
elif rc == 234:
self.logger.log("Unknown resource %s" % resource)
return 0
elif rc == 246:
self.logger.log("Cluster is inactive")
return 0
elif rc != 0:
self.logger.log("Call to crm_resource failed, rc=%d" % rc)
return 0
else:
self.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones))
attempts -= 1
time.sleep(1)
return 0
def find_dlm(self, node):
self.r_dlm = None
(rc, lines) = self.rsh(node, "crm_resource -c", None)
for line in lines:
if re.search("^Resource", line):
r = AuditResource(self.CM, line)
if r.rtype == "controld" and r.parent != "NA":
self.debug("Found dlm: %s" % self.r_dlm)
self.r_dlm = r.parent
return 1
return 0
def find_hae_resources(self, node):
self.r_dlm = None
self.r_o2cb = None
self.r_ocfs2 = []
if self.find_dlm(node):
self.find_ocfs2_resources(node)
def is_applicable(self):
if not self.is_applicable_common():
return 0
if self.Env["Schema"] == "hae":
return 1
return None
class HAERoleTest(HAETest):
def __init__(self, cm):
'''Lars' mount/unmount test for the HA extension. '''
HAETest.__init__(self,cm)
self.name = "HAERoleTest"
def change_state(self, node, resource, target):
rc = self.rsh(node, "crm_resource -V -r %s -p target-role -v %s --meta" % (resource, target))
return rc
def __call__(self, node):
self.incr("calls")
lpc = 0
failed = 0
delay = 2
done = time.time() + self.Env["loop-minutes"]*60
self.find_hae_resources(node)
clone_max = len(self.Env["nodes"])
while time.time() <= done and not failed:
lpc = lpc + 1
self.change_state(node, self.r_dlm, "Stopped")
if not self.wait_on_state(node, self.r_dlm, 0):
self.failure("%s did not go down correctly" % self.r_dlm)
failed = lpc
self.change_state(node, self.r_dlm, "Started")
if not self.wait_on_state(node, self.r_dlm, clone_max):
self.failure("%s did not come up correctly" % self.r_dlm)
failed = lpc
if not self.wait_on_state(node, self.r_o2cb, clone_max):
self.failure("%s did not come up correctly" % self.r_o2cb)
failed = lpc
for fs in self.r_ocfs2:
if not self.wait_on_state(node, fs, clone_max):
self.failure("%s did not come up correctly" % fs)
failed = lpc
if failed:
return self.failure("iteration %d failed" % failed)
return self.success()
AllTestClasses.append(HAERoleTest)
class HAEStandbyTest(HAETest):
'''Toggle a node in and out of standby while HAE resources are running'''
def __init__(self, cm):
HAETest.__init__(self,cm)
self.name = "HAEStandbyTest"
def change_state(self, node, resource, target):
rc = self.rsh(node, "crm_standby -V -l reboot -v %s" % (target))
return rc
def __call__(self, node):
self.incr("calls")
lpc = 0
failed = 0
done = time.time() + self.Env["loop-minutes"]*60
self.find_hae_resources(node)
clone_max = len(self.Env["nodes"])
while time.time() <= done and not failed:
lpc = lpc + 1
self.change_state(node, self.r_dlm, "true")
if not self.wait_on_state(node, self.r_dlm, clone_max-1):
self.failure("%s did not go down correctly" % self.r_dlm)
failed = lpc
self.change_state(node, self.r_dlm, "false")
if not self.wait_on_state(node, self.r_dlm, clone_max):
self.failure("%s did not come up correctly" % self.r_dlm)
failed = lpc
if not self.wait_on_state(node, self.r_o2cb, clone_max):
self.failure("%s did not come up correctly" % self.r_o2cb)
failed = lpc
for fs in self.r_ocfs2:
if not self.wait_on_state(node, fs, clone_max):
self.failure("%s did not come up correctly" % fs)
failed = lpc
if failed:
return self.failure("iteration %d failed" % failed)
return self.success()
AllTestClasses.append(HAEStandbyTest)
class NearQuorumPointTest(CTSTest):
'''
This test brings larger clusters near the quorum point (50%).
In addition, it will test doing starts and stops at the same time.
Here is how I think it should work:
- loop over the nodes and decide randomly which will be up and which
will be down. Use a 50% probability for each of up/down.
- figure out what to do to get into that state from the current state
- in parallel, bring up those going up and bring those going down.
'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "NearQuorumPoint"
def __call__(self, dummy):
'''Perform the 'NearQuorumPoint' test. '''
self.incr("calls")
startset = []
stopset = []
stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint")
#decide what to do with each node
for node in self.Env["nodes"]:
action = self.Env.RandomGen.choice(["start","stop"])
#action = self.Env.RandomGen.choice(["start","stop","no change"])
if action == "start" :
startset.append(node)
elif action == "stop" :
stopset.append(node)
self.debug("start nodes:" + repr(startset))
self.debug("stop nodes:" + repr(stopset))
#add search patterns
watchpats = [ ]
for node in stopset:
if self.CM.ShouldBeStatus[node] == "up":
watchpats.append(self.templates["Pat:We_stopped"] % node)
for node in startset:
if self.CM.ShouldBeStatus[node] == "down":
#watchpats.append(self.templates["Pat:NonDC_started"] % node)
watchpats.append(self.templates["Pat:Local_started"] % node)
else:
for stopping in stopset:
if self.CM.ShouldBeStatus[stopping] == "up":
watchpats.append(self.templates["Pat:They_stopped"] % (node, self.CM.key_for_node(stopping)))
if len(watchpats) == 0:
return self.skipped()
if len(startset) != 0:
watchpats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
#begin actions
for node in stopset:
if self.CM.ShouldBeStatus[node] == "up":
self.CM.StopaCMnoBlock(node)
for node in startset:
if self.CM.ShouldBeStatus[node] == "down":
self.CM.StartaCMnoBlock(node)
#get the result
if watch.lookforall():
self.CM.cluster_stable()
self.CM.fencing_cleanup("NearQuorumPoint", stonith)
return self.success()
self.logger.log("Warn: Patterns not found: " + repr(watch.unmatched))
#get the "bad" nodes
upnodes = []
for node in stopset:
if self.CM.StataCM(node) == 1:
upnodes.append(node)
downnodes = []
for node in startset:
if self.CM.StataCM(node) == 0:
downnodes.append(node)
self.CM.fencing_cleanup("NearQuorumPoint", stonith)
if upnodes == [] and downnodes == []:
self.CM.cluster_stable()
# Make sure they're completely down with no residue
for node in stopset:
self.rsh(node, self.templates["StopCmd"])
return self.success()
if len(upnodes) > 0:
self.logger.log("Warn: Unstoppable nodes: " + repr(upnodes))
if len(downnodes) > 0:
self.logger.log("Warn: Unstartable nodes: " + repr(downnodes))
return self.failure()
def is_applicable(self):
return 1
AllTestClasses.append(NearQuorumPointTest)
class RollingUpgradeTest(CTSTest):
'''Perform a rolling upgrade of the cluster'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RollingUpgrade"
self.start = StartTest(cm)
self.stop = StopTest(cm)
self.stopall = SimulStopLite(cm)
self.startall = SimulStartLite(cm)
def setup(self, node):
# Start all remaining nodes
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
for node in self.Env["nodes"]:
if not self.downgrade(node, None):
return self.failure("Couldn't downgrade %s" % node)
ret = self.startall(None)
if not ret:
return self.failure("Couldn't start all nodes")
return self.success()
def teardown(self, node):
# Stop everything
ret = self.stopall(None)
if not ret:
return self.failure("Couldn't stop all nodes")
for node in self.Env["nodes"]:
if not self.upgrade(node, None):
return self.failure("Couldn't upgrade %s" % node)
return self.success()
def install(self, node, version, start=1, flags="--force"):
target_dir = "/tmp/rpm-%s" % version
src_dir = "%s/%s" % (self.Env["rpm-dir"], version)
self.logger.log("Installing %s on %s with %s" % (version, node, flags))
if not self.stop(node):
return self.failure("stop failure: "+node)
rc = self.rsh(node, "mkdir -p %s" % target_dir)
rc = self.rsh(node, "rm -f %s/*.rpm" % target_dir)
(rc, lines) = self.rsh(node, "ls -1 %s/*.rpm" % src_dir, None)
for line in lines:
line = line[:-1]
rc = self.rsh.cp("%s" % (line), "%s:%s/" % (node, target_dir))
rc = self.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir))
if start and not self.start(node):
return self.failure("start failure: "+node)
return self.success()
def upgrade(self, node, start=1):
return self.install(node, self.Env["current-version"], start)
def downgrade(self, node, start=1):
return self.install(node, self.Env["previous-version"], start, "--force --nodeps")
def __call__(self, node):
'''Perform the 'Rolling Upgrade' test. '''
self.incr("calls")
for node in self.Env["nodes"]:
if not self.upgrade(node):
return self.failure("Couldn't upgrade %s" % node)
self.CM.cluster_stable()
return self.success()
def is_applicable(self):
if not self.is_applicable_common():
return None
if not "rpm-dir" in list(self.Env.keys()):
return None
if not "current-version" in list(self.Env.keys()):
return None
if not "previous-version" in list(self.Env.keys()):
return None
return 1
# Register RollingUpgradeTest as a good test to run
AllTestClasses.append(RollingUpgradeTest)
class BSC_AddResource(CTSTest):
'''Add a resource to the cluster'''
def __init__(self, cm):
CTSTest.__init__(self, cm)
self.name = "AddResource"
self.resource_offset = 0
self.cib_cmd = """cibadmin -C -o %s -X '%s' """
def __call__(self, node):
self.incr("calls")
self.resource_offset = self.resource_offset + 1
r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset)
start_pat = "crmd.*%s_start_0.*confirmed.*ok"
patterns = []
patterns.append(start_pat % r_id)
watch = self.create_watch(patterns, self.Env["DeadTime"])
watch.setwatch()
ip = self.NextIP()
if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip):
return self.failure("Make resource %s failed" % r_id)
failed = 0
watch_result = watch.lookforall()
if watch.unmatched:
for regex in watch.unmatched:
self.logger.log ("Warn: Pattern not found: %s" % (regex))
failed = 1
if failed:
return self.failure("Resource pattern(s) not found")
if not self.CM.cluster_stable(self.Env["DeadTime"]):
return self.failure("Unstable cluster")
return self.success()
def NextIP(self):
# Increment the last field of the base IP and remember the result,
# handling both IPv4 (dot-separated) and IPv6 (colon-separated) bases.
ip = self.Env["IPBase"]
if ":" in ip:
(prefix, sep, suffix) = ip.rpartition(":")
suffix = format(int(suffix, 16) + 1, "x")
else:
(prefix, sep, suffix) = ip.rpartition(".")
suffix = str(int(suffix) + 1)
ip = prefix + sep + suffix
self.Env["IPBase"] = ip
return ip.strip()
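# NextIP example (assumed base values): an IPBase of "192.168.9.100" yields
# "192.168.9.101" on the next call; a hypothetical IPv6 base ending in ":a"
# would yield ":b", since the last field is incremented as hex.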
def make_ip_resource(self, node, id, rclass, type, ip):
self.logger.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node))
rsc_xml="""
<primitive id="%s" class="%s" type="%s" provider="heartbeat">
<instance_attributes id="%s"><attributes>
<nvpair id="%s" name="ip" value="%s"/>
</attributes></instance_attributes>
</primitive>""" % (id, rclass, type, id, id, ip)
node_constraint = """
<rsc_location id="run_%s" rsc="%s">
<rule id="pref_run_%s" score="100">
<expression id="%s_loc_expr" attribute="#uname" operation="eq" value="%s"/>
</rule>
</rsc_location>""" % (id, id, id, id, node)
rc = 0
(rc, lines) = self.rsh(node, self.cib_cmd % ("constraints", node_constraint), None)
if rc != 0:
self.logger.log("Constraint creation failed: %d" % rc)
return None
(rc, lines) = self.rsh(node, self.cib_cmd % ("resources", rsc_xml), None)
if rc != 0:
self.logger.log("Resource creation failed: %d" % rc)
return None
return 1
def is_applicable(self):
if self.Env["DoBSC"]:
return 1
return None
AllTestClasses.append(BSC_AddResource)
class SimulStopLite(CTSTest):
'''Stop any active nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStopLite"
def __call__(self, dummy):
'''Perform the 'SimulStopLite' setup work. '''
self.incr("calls")
self.debug("Setup: " + self.name)
# We ignore the "node" parameter...
watchpats = [ ]
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "up":
self.incr("WasStarted")
watchpats.append(self.templates["Pat:We_stopped"] % node)
if len(watchpats) == 0:
return self.success()
# Stop all the nodes - at about the same time...
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
self.set_timer()
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "up":
self.CM.StopaCMnoBlock(node)
if watch.lookforall():
# Make sure they're completely down with no residue
for node in self.Env["nodes"]:
self.rsh(node, self.templates["StopCmd"])
return self.success()
did_fail = 0
up_nodes = []
for node in self.Env["nodes"]:
if self.CM.StataCM(node) == 1:
did_fail = 1
up_nodes.append(node)
if did_fail:
return self.failure("Active nodes exist: " + repr(up_nodes))
self.logger.log("Warn: All nodes stopped but CTS didnt detect: "
+ repr(watch.unmatched))
return self.failure("Missing log message: "+repr(watch.unmatched))
def is_applicable(self):
'''SimulStopLite is a setup test and never applicable'''
return 0
class SimulStartLite(CTSTest):
'''Start any stopped nodes ~ simultaneously'''
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "SimulStartLite"
def __call__(self, dummy):
'''Perform the 'SimulStartLite' setup work. '''
self.incr("calls")
self.debug("Setup: " + self.name)
# We ignore the "node" parameter...
node_list = []
for node in self.Env["nodes"]:
if self.CM.ShouldBeStatus[node] == "down":
self.incr("WasStopped")
node_list.append(node)
self.set_timer()
while len(node_list) > 0:
# Repeat until all nodes come up
watchpats = [ ]
uppat = self.templates["Pat:NonDC_started"]
if self.CM.upcount() == 0:
uppat = self.templates["Pat:Local_started"]
watchpats.append(self.templates["Pat:DC_IDLE"])
for node in node_list:
watchpats.append(uppat % node)
watchpats.append(self.templates["Pat:InfraUp"] % node)
watchpats.append(self.templates["Pat:PacemakerUp"] % node)
# Start all the nodes - at about the same time...
watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
watch.setwatch()
stonith = self.CM.prepare_fencing_watcher(self.name)
for node in node_list:
self.CM.StartaCMnoBlock(node)
watch.lookforall()
node_list = self.CM.fencing_cleanup(self.name, stonith)
if node_list is None:
return self.failure("Cluster did not stabilize")
# Remove node_list messages from watch.unmatched
for node in node_list:
self.logger.debug("Dealing with stonith operations for %s" % repr(node_list))
if watch.unmatched:
try:
watch.unmatched.remove(uppat % node)
except ValueError:
self.debug("Already matched: %s" % (uppat % node))
try:
watch.unmatched.remove(self.templates["Pat:InfraUp"] % node)
except ValueError:
self.debug("Already matched: %s" % (self.templates["Pat:InfraUp"] % node))
try:
watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node)
except ValueError:
self.debug("Already matched: %s" % (self.templates["Pat:PacemakerUp"] % node))
if watch.unmatched:
for regex in watch.unmatched:
self.logger.log ("Warn: Startup pattern not found: %s" %(regex))
if not self.CM.cluster_stable():
return self.failure("Cluster did not stabilize")
did_fail = 0
unstable = []
for node in self.Env["nodes"]:
if self.CM.StataCM(node) == 0:
did_fail = 1
unstable.append(node)
if did_fail:
return self.failure("Unstarted nodes exist: " + repr(unstable))
unstable = []
for node in self.Env["nodes"]:
if not self.CM.node_stable(node):
did_fail = 1
unstable.append(node)
if did_fail:
return self.failure("Unstable cluster nodes exist: " + repr(unstable))
return self.success()
def is_applicable(self):
'''SimulStartLite is a setup test and never applicable'''
return 0
def TestList(cm, audits):
result = []
for testclass in AllTestClasses:
bound_test = testclass(cm)
if bound_test.is_applicable():
bound_test.Audits = audits
result.append(bound_test)
return result
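# Illustrative use of TestList (hypothetical variable names): the CTS lab
# builds its schedule from something like
#   tests = TestList(cm, audits)
# and then repeatedly picks a random entry from that list to run.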
class RemoteLXC(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = "RemoteLXC"
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.num_containers = 2
self.is_container = 1
self.is_docker_unsafe = 1
self.failed = 0
self.fail_string = ""
def start_lxc_simple(self, node):
# restore any artifacts left lying around by a previous test.
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
# generate the containers, put them in the config, add some resources to them
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc1"))
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc2"))
pats.append(self.templates["Pat:RscOpOK"] % ("start", "lxc-ms"))
pats.append(self.templates["Pat:RscOpOK"] % ("promote", "lxc-ms"))
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers)
self.set_timer("remoteSimpleInit")
watch.lookforall()
self.log_timer("remoteSimpleInit")
if watch.unmatched:
self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
self.failed = 1
def cleanup_lxc_simple(self, node):
pats = [ ]
# if the test failed, attempt to clean up the cib and libvirt environment
# as best as possible
if self.failed == 1:
# restore libvirt and cib
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
return
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container1"))
pats.append(self.templates["Pat:RscOpOK"] % ("stop", "container2"))
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null")
self.set_timer("remoteSimpleCleanup")
watch.lookforall()
self.log_timer("remoteSimpleCleanup")
if watch.unmatched:
self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
self.failed = 1
# cleanup libvirt
self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -s -R &>/dev/null")
def __call__(self, node):
'''Perform the 'RemoteLXC' test. '''
self.incr("calls")
ret = self.startall(None)
if not ret:
return self.failure("Setup failed, start all nodes failed.")
rc = self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null")
if rc == 1:
self.log("Environment test for lxc support failed.")
return self.skipped()
self.start_lxc_simple(node)
self.cleanup_lxc_simple(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed == 1:
return self.failure(self.fail_string)
return self.success()
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"Updating failcount for ping",
r"pengine.*: Recover (ping|lxc-ms|container)\s*\(.*\)",
# The orphaned lxc-ms resource causes an expected transition error
# that is a result of the pengine not having knowledge that the
# ms resource used to be a clone. As a result it looks like that
# resource is running in multiple locations when it shouldn't... But in
# this instance we know why this error is occurring and that it is expected.
r"Calculated [Tt]ransition .*pe-error",
r"Resource lxc-ms .* is active on 2 nodes attempting recovery",
r"Unknown operation: fail",
r"(ERROR|error): sending stonithRA op to stonithd failed.",
r"VirtualDomain.*ERROR: Unable to determine emulator",
]
AllTestClasses.append(RemoteLXC)
class RemoteDriver(CTSTest):
def __init__(self, cm):
CTSTest.__init__(self,cm)
self.name = self.__class__.__name__
self.is_docker_unsafe = 1
self.start = StartTest(cm)
self.startall = SimulStartLite(cm)
self.stop = StopTest(cm)
self.remote_rsc = "remote-rsc"
self.cib_cmd = """cibadmin -C -o %s -X '%s' """
self.reset()
def reset(self):
self.pcmk_started = 0
self.failed = False
self.fail_string = ""
self.remote_node_added = 0
self.remote_rsc_added = 0
self.remote_use_reconnect_interval = self.Env.RandomGen.choice([True,False])
def fail(self, msg):
""" Mark test as failed. """
self.failed = True
# Always log the failure.
self.logger.log(msg)
# Use first failure as test status, as it's likely to be most useful.
if not self.fail_string:
self.fail_string = msg
def get_othernode(self, node):
for othernode in self.Env["nodes"]:
if othernode == node:
# we don't want to try to use the CIB on the node we just shut down;
# find a cluster node that is not our soon-to-be remote node.
continue
else:
return othernode
def del_rsc(self, node, rsc):
othernode = self.get_othernode(node)
rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc))
if rc != 0:
self.fail("Removal of resource '%s' failed" % rsc)
def add_rsc(self, node, rsc_xml):
othernode = self.get_othernode(node)
rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml))
if rc != 0:
self.fail("resource creation failed")
def add_primitive_rsc(self, node):
rsc_xml = """
<primitive class="ocf" id="%s" provider="heartbeat" type="Dummy">
<operations>
<op id="remote-rsc-monitor-interval-10s" interval="10s" name="monitor"/>
</operations>
<meta_attributes id="remote-meta_attributes"/>
</primitive>""" % (self.remote_rsc)
self.add_rsc(node, rsc_xml)
if not self.failed:
self.remote_rsc_added = 1
def add_connection_rsc(self, node):
if self.remote_use_reconnect_interval:
# use reconnect interval and make sure to set cluster-recheck-interval as well.
rsc_xml = """
<primitive class="ocf" id="%s" provider="pacemaker" type="remote">
<instance_attributes id="remote-instance_attributes"/>
<instance_attributes id="remote-instance_attributes">
<nvpair id="remote-instance_attributes-server" name="server" value="%s"/>
<nvpair id="remote-instance_attributes-reconnect_interval" name="reconnect_interval" value="60s"/>
</instance_attributes>
<operations>
<op id="remote-monitor-interval-60s" interval="60s" name="monitor"/>
<op id="remote-name-start-interval-0-timeout-120" interval="0" name="start" timeout="60"/>
</operations>
</primitive>""" % (self.remote_node, node)
self.rsh(self.get_othernode(node), self.templates["SetCheckInterval"] % ("45s"))
else:
# not using reconnect interval
rsc_xml = """
<primitive class="ocf" id="%s" provider="pacemaker" type="remote">
<instance_attributes id="remote-instance_attributes"/>
<instance_attributes id="remote-instance_attributes">
<nvpair id="remote-instance_attributes-server" name="server" value="%s"/>
</instance_attributes>
<operations>
<op id="remote-monitor-interval-60s" interval="60s" name="monitor"/>
<op id="remote-name-start-interval-0-timeout-120" interval="0" name="start" timeout="120"/>
</operations>
</primitive>""" % (self.remote_node, node)
self.add_rsc(node, rsc_xml)
if not self.failed:
self.remote_node_added = 1
def stop_pcmk_remote(self, node):
# disable pcmk remote
for i in range(10):
rc = self.rsh(node, "service pacemaker_remote stop")
if rc != 0:
time.sleep(6)
else:
break
def start_pcmk_remote(self, node):
for i in range(10):
rc = self.rsh(node, "service pacemaker_remote start")
if rc != 0:
time.sleep(6)
else:
self.pcmk_started = 1
break
def kill_pcmk_remote(self, node):
""" Simulate a Pacemaker Remote daemon failure. """
# We kill the process to prevent a graceful stop,
# then stop it to prevent the OS from restarting it.
- self.rsh(node, "killall -9 pacemaker_remoted")
+ self.rsh(node, "killall -9 pacemaker-remoted")
self.stop_pcmk_remote(node)
def start_metal(self, node):
pcmk_started = 0
# make sure the resource doesn't already exist for some reason
self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_rsc))
self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_node))
if not self.stop(node):
self.fail("Failed to shutdown cluster node %s" % node)
return
self.start_pcmk_remote(node)
if self.pcmk_started == 0:
self.fail("Failed to start pacemaker_remote on node %s" % node)
return
# Convert node to baremetal now that it has shutdown the cluster stack
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
self.add_connection_rsc(node)
self.set_timer("remoteMetalInit")
watch.lookforall()
self.log_timer("remoteMetalInit")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
def migrate_connection(self, node):
if self.failed:
return
pats = [ ]
pats.append(self.templates["Pat:RscOpOK"] % ("migrate_to", self.remote_node))
pats.append(self.templates["Pat:RscOpOK"] % ("migrate_from", self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(pats, 120)
watch.setwatch()
(rc, lines) = self.rsh(node, "crm_resource -M -r %s" % (self.remote_node), None)
if rc != 0:
self.fail("failed to move remote node connection resource")
return
self.set_timer("remoteMetalMigrate")
watch.lookforall()
self.log_timer("remoteMetalMigrate")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
def fail_rsc(self, node):
if self.failed:
return
watchpats = [ ]
watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("stop", self.remote_rsc, self.remote_node))
watchpats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
watchpats.append(self.templates["Pat:DC_IDLE"])
watch = self.create_watch(watchpats, 120)
watch.setwatch()
self.debug("causing dummy rsc to fail.")
rc = self.rsh(node, "rm -f /var/run/resource-agents/Dummy*")
self.set_timer("remoteRscFail")
watch.lookforall()
self.log_timer("remoteRscFail")
if watch.unmatched:
self.fail("Unmatched patterns during rsc fail: %s" % watch.unmatched)
def fail_connection(self, node):
if self.failed:
return
watchpats = [ ]
watchpats.append(self.templates["Pat:FenceOpOK"] % self.remote_node)
watchpats.append(self.templates["Pat:NodeFenced"] % self.remote_node)
watch = self.create_watch(watchpats, 120)
watch.setwatch()
# force-stop the pacemaker-remoted daemon; this will result in fencing
self.debug("Force-stopping the active remote node")
self.kill_pcmk_remote(node)
self.debug("Waiting for remote node to be fenced.")
self.set_timer("remoteMetalFence")
watch.lookforall()
self.log_timer("remoteMetalFence")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
self.debug("Waiting for the remote node to come back up")
self.CM.ns.WaitForNodeToComeUp(node, 120)
pats = [ ]
watch = self.create_watch(pats, 240)
watch.setwatch()
pats.append(self.templates["Pat:RscOpOK"] % ("start", self.remote_node))
if self.remote_rsc_added == 1:
pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
# start the remote node again and watch it integrate back into the cluster.
self.start_pcmk_remote(node)
if self.pcmk_started == 0:
self.fail("Failed to start pacemaker_remote on node %s" % node)
return
self.debug("Waiting for remote node to rejoin cluster after being fenced.")
self.set_timer("remoteMetalRestart")
watch.lookforall()
self.log_timer("remoteMetalRestart")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
return
def add_dummy_rsc(self, node):
if self.failed:
return
# verify we can put a resource on the remote node
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
pats.append(self.templates["Pat:RscRemoteOpOK"] % ("start", self.remote_rsc, self.remote_node))
pats.append(self.templates["Pat:DC_IDLE"])
# Add a resource that must live on remote-node
self.add_primitive_rsc(node)
# force that rsc to prefer the remote node.
(rc, line) = self.CM.rsh(node, "crm_resource -M -r %s -N %s -f" % (self.remote_rsc, self.remote_node), None)
if rc != 0:
self.fail("Failed to place remote resource on remote node.")
return
self.set_timer("remoteMetalRsc")
watch.lookforall()
self.log_timer("remoteMetalRsc")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
def test_attributes(self, node):
if self.failed:
return
# This verifies permanent attributes can be set on a remote-node. It also
# verifies the remote-node can edit its own cib node section remotely.
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to set remote-node attribute. rc:%s output:%s" % (rc, line))
return
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -q -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to get remote-node attribute")
return
(rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % (self.remote_node), None)
if rc != 0:
self.fail("Failed to delete remote-node attribute")
return
def cleanup_metal(self, node):
if self.pcmk_started == 0:
return
pats = [ ]
watch = self.create_watch(pats, 120)
watch.setwatch()
if self.remote_rsc_added == 1:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_rsc))
if self.remote_node_added == 1:
pats.append(self.templates["Pat:RscOpOK"] % ("stop", self.remote_node))
self.set_timer("remoteMetalCleanup")
if self.remote_use_reconnect_interval:
self.debug("Cleaning up re-check interval")
self.rsh(self.get_othernode(node), self.templates["ClearCheckInterval"])
if self.remote_rsc_added == 1:
# Remove dummy resource added for remote node tests
self.debug("Cleaning up dummy rsc put on remote node")
self.rsh(node, "crm_resource -U -r %s" % self.remote_rsc)
self.del_rsc(node, self.remote_rsc)
if self.remote_node_added == 1:
# Remove remote node's connection resource
self.debug("Cleaning up remote node connection resource")
self.rsh(node, "crm_resource -U -r %s" % (self.remote_node))
self.del_rsc(node, self.remote_node)
watch.lookforall()
self.log_timer("remoteMetalCleanup")
if watch.unmatched:
self.fail("Unmatched patterns: %s" % watch.unmatched)
self.stop_pcmk_remote(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.remote_node_added == 1:
# Remove remote node itself
self.debug("Cleaning up node entry for remote node")
self.rsh(self.get_othernode(node), "crm_node --force --remove %s" % self.remote_node)
def setup_env(self, node):
self.remote_node = "remote-%s" % (node)
# We assume that if all nodes have a key, it is the right key.
# If any node doesn't have a remote key, we regenerate it everywhere.
if self.rsh.exists_on_all("/etc/pacemaker/authkey", self.Env["nodes"]):
return
# create key locally
(handle, keyfile) = tempfile.mkstemp(".cts")
os.close(handle)
devnull = open(os.devnull, 'wb')
subprocess.check_call(["dd", "if=/dev/urandom", "of=%s" % keyfile, "bs=4096", "count=1"],
stdout=devnull, stderr=devnull)
devnull.close()
# sync key throughout the cluster
for node in self.Env["nodes"]:
self.rsh(node, "mkdir -p --mode=0750 /etc/pacemaker")
self.rsh.cp(keyfile, "root@%s:/etc/pacemaker/authkey" % node)
self.rsh(node, "chgrp haclient /etc/pacemaker /etc/pacemaker/authkey")
self.rsh(node, "chmod 0640 /etc/pacemaker/authkey")
os.unlink(keyfile)
def is_applicable(self):
if not self.is_applicable_common():
return False
for node in self.Env["nodes"]:
- rc = self.rsh(node, "type pacemaker_remoted >/dev/null 2>&1")
+ rc = self.rsh(node, "which pacemaker-remoted >/dev/null 2>&1")
if rc != 0:
return False
return True
def start_new_test(self, node):
self.incr("calls")
self.reset()
ret = self.startall(None)
if not ret:
return self.failure("setup failed: could not start all nodes")
self.setup_env(node)
self.start_metal(node)
self.add_dummy_rsc(node)
return True
def __call__(self, node):
return self.failure("This base class is not meant to be called directly.")
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [ """is running on remote.*which isn't allowed""",
"""Connection terminated""",
"""Failed to send remote""",
]
# RemoteDriver is just a base class for other tests, so it is not added to AllTestClasses
class RemoteBasic(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteBasic' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.test_attributes(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
AllTestClasses.append(RemoteBasic)
class RemoteStonithd(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteStonithd' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.fail_connection(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
def is_applicable(self):
if not RemoteDriver.is_applicable(self):
return False
if "DoFencing" in list(self.Env.keys()):
return self.Env["DoFencing"]
return True
def errorstoignore(self):
ignore_pats = [
r"Lost connection to Pacemaker Remote node",
r"Software caused connection abort",
r"crmd.*:\s+error.*: Operation remote-.*_monitor",
r"crmd.*:\s+error.*: Result of monitor operation for remote-.*",
r"pengine.*:\s+Recover remote-.*\s*\(.*\)",
r"Calculated [Tt]ransition .*pe-error",
r"error.*: Resource .*ocf::.* is active on 2 nodes attempting recovery",
]
ignore_pats.extend(RemoteDriver.errorstoignore(self))
return ignore_pats
AllTestClasses.append(RemoteStonithd)
class RemoteMigrate(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteMigrate' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
self.migrate_connection(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
AllTestClasses.append(RemoteMigrate)
class RemoteRscFailure(RemoteDriver):
def __call__(self, node):
'''Perform the 'RemoteRscFailure' test. '''
if not self.start_new_test(node):
return self.failure(self.fail_string)
# This is an important step. We are migrating the connection
# before failing the resource. This verifies that the migration
# has properly maintained control over the remote-node.
self.migrate_connection(node)
self.fail_rsc(node)
self.cleanup_metal(node)
self.debug("Waiting for the cluster to recover")
self.CM.cluster_stable()
if self.failed:
return self.failure(self.fail_string)
return self.success()
def errorstoignore(self):
ignore_pats = [
r"pengine.*: Recover remote-rsc\s*\(.*\)",
r"Dummy.*: No process state file found",
]
ignore_pats.extend(RemoteDriver.errorstoignore(self))
return ignore_pats
AllTestClasses.append(RemoteRscFailure)
# vim:ts=4:sw=4:et:
diff --git a/cts/README.md b/cts/README.md
index 552eaf760c..2932106039 100644
--- a/cts/README.md
+++ b/cts/README.md
@@ -1,284 +1,284 @@
# Pacemaker Cluster Test Suite (CTS)
## Purpose
Pacemaker's CTS is primarily for developers and packagers of the Pacemaker
source code, but it can be useful for users who wish to see how their cluster
will react to various situations.
CTS consists of two main parts: a set of regression tests for verifying the
functionality of particular Pacemaker components, and a cluster exerciser for
intensively testing the behavior of an entire working cluster.
The primary regression test front end is cts-regression in this directory. Run
it with the --help option to see its usage. The regression tests can be run on
any single cluster node. The cluster should be stopped on that node when
running the tests.
The rest of this document focuses on the cluster exerciser. The cluster
exerciser runs a randomized series of predefined tests on the cluster. CTS can
be run against a pre-existing cluster configuration or overwrite the existing
configuration with a test configuration.
## Requirements
* Three or more machines (one test exerciser and two or more test cluster
machines).
* The test cluster machines should be on the same subnet and have journalling
filesystems (ext3, ext4, xfs, etc.) for all of their filesystems other than
/boot. You also need a number of free IP addresses on that subnet if you
intend to test mutual IP address takeover.
* The test exerciser machine doesn't need to be on the same subnet as the test
cluster machines. Minimal demands are made on the exerciser machine - it
just has to stay up during the tests.
* It helps a lot in tracking problems if all machines' clocks are closely
synchronized. NTP does this automatically, but you can do it by hand if you
want.
* The exerciser needs to be able to ssh over to the cluster nodes as root
without a password challenge. Configure ssh accordingly (see the Mini-HOWTO
at the end of this document for more details).
* The exerciser needs to be able to resolve the machine names of the
test cluster - either by DNS or by /etc/hosts.
* CTS is not guaranteed to run on all platforms that pacemaker itself does.
It calls commands such as service that may not be provided by all OSes.
## Preparation
Install Pacemaker (including CTS) on all machines. These scripts are
coordinated with particular versions of Pacemaker, so you need the same version
of CTS as the rest of Pacemaker, and you need the same version of
pacemaker and CTS on both the test exerciser and the test cluster machines.
You can install CTS from source, although many distributions provide
packages that include it (e.g. pacemaker-cts or pacemaker-dev).
Typically, packages will install CTS as /usr/share/pacemaker/tests/cts.
Configure cluster communications (Corosync) on the
cluster machines and verify everything works.
NOTE: Do not run the cluster on the test exerciser machine.
NOTE: Wherever machine names are mentioned in these configuration files,
they must match the machines' `uname -n` name. This may or may not match
the machines' FQDN (fully qualified domain name) - it depends on how
you (and your OS) have named the machines.
## Run CTS
Assuming you have done all this, run CTSlab.py:
python ./CTSlab.py [options] number-of-tests-to-run
You must specify which nodes are part of the cluster with --nodes, e.g.:
--node "pcmk-1 pcmk-2 pcmk-3"
Most people will want to save the output with --outputfile, e.g.:
--outputfile ~/cts.log
Unless you want to test your pre-existing cluster configuration, you also want:
--clobber-cib
--populate-resources
--test-ip-base $IP # e.g. --test-ip-base 192.168.9.100
and configure some sort of fencing:
--stonith $TYPE # e.g. "--stonith xvm" to use fence_xvm or "--stonith ssh" to use external/ssh
A complete command line might look like:
python ./CTSlab.py --nodes "pcmk-1 pcmk-2 pcmk-3" --outputfile ~/cts.log \
--clobber-cib --populate-resources --test-ip-base 192.168.9.100 \
--stonith xvm 50
For more options, use the --help option.
NOTE: A perhaps more convenient way to build a command line like the above
is to use the cluster_test script which, at least in the source repository,
sits in the same directory as this file.
To extract the result of a particular test, run:
crm_report -T $test
## Optional/advanced testing
### Memory testing
Pacemaker and CTS have various options for testing memory management. On the
cluster nodes, pacemaker components will use various environment variables to
control these options. How these variables are set varies by OS, but usually
they are set in the /etc/sysconfig/pacemaker or /etc/default/pacemaker file.
Valgrind is a program for detecting memory management problems (such as
use-after-free errors). If you have valgrind installed, you can enable it by
setting the following environment variables on all cluster nodes:
PCMK_valgrind_enabled=pacemaker-attrd,pacemaker-execd,cib,crmd,pengine,stonith-ng
VALGRIND_OPTS="--leak-check=full --trace-children=no --num-callers=25
--log-file=/var/lib/pacemaker/valgrind-%p
--suppressions=/usr/share/pacemaker/tests/valgrind-pcmk.suppressions
--gen-suppressions=all"
and running CTS with these options:
--valgrind-tests --valgrind-procs="pacemaker-attrd pacemaker-execd cib crmd pengine stonith-ng"
These options should only be set while specifically testing memory management,
because they may slow down the cluster significantly, and they will disable
writes to the CIB. If desired, you can enable valgrind on a subset of pacemaker
components rather than all of them as listed above.
Valgrind will put a text file for each process in the location specified by
valgrind's --log-file option. For explanations of the messages valgrind
generates, see http://valgrind.org/docs/manual/mc-manual.html
Separately, if you are using the GNU C library, the G_SLICE, MALLOC_PERTURB_,
and MALLOC_CHECK_ environment variables can be set to affect the library's
memory management functions.
When using valgrind, G_SLICE should be set to "always-malloc", which helps
valgrind track memory by always using the malloc() and free() routines
directly. When not using valgrind, G_SLICE can be left unset, or set to
"debug-blocks", which enables the C library to catch many memory errors
but may impact performance.
If the MALLOC_PERTURB_ environment variable is set to an 8-bit integer, the C
library will initialize all newly allocated bytes of memory to the integer
value, and will set all newly freed bytes of memory to the bitwise inverse of
the integer value. This helps catch uses of uninitialized or freed memory
blocks that might otherwise go unnoticed. Example:
MALLOC_PERTURB_=221
If the MALLOC_CHECK_ environment variable is set, the C library will check for
certain heap corruption errors. The most useful value in testing is 3, which
will cause the library to print a message to stderr and abort execution.
Example:
MALLOC_CHECK_=3
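As a sketch (the exact file and path vary by OS, as noted above), the C
library variables might be combined in /etc/sysconfig/pacemaker like this:
G_SLICE=always-malloc
MALLOC_PERTURB_=221
MALLOC_CHECK_=3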
Valgrind should be enabled for either all nodes or none, but the C library
variables may be set differently on different nodes.
### Remote node testing
-If the pacemaker_remoted daemon is installed on all cluster nodes, CTS will
+If the pacemaker-remoted daemon is installed on all cluster nodes, CTS will
enable remote node tests.
The remote node tests choose a random node, stop the cluster on it, start
-pacemaker_remote on it, and add an ocf:pacemaker:remote resource to turn it
+pacemaker-remoted on it, and add an ocf:pacemaker:remote resource to turn it
into a remote node. When the test is done, CTS will turn the node back into
a cluster node.
To avoid conflicts, CTS will rename the node, prefixing the original node name
with "remote-". For example, "pcmk-1" will become "remote-pcmk-1".
The name change may require special stonith configuration, if the fence agent
expects the node name to be the same as its hostname. A common approach is to
specify the "remote-" names in pcmk_host_list. If you use pcmk_host_list=all,
CTS will expand that to all cluster nodes and their "remote-" names.
You may additionally need a pcmk_host_map argument to map the "remote-" names
to the hostnames. Example:
--stonith xvm --stonith-args \
pcmk_arg_map=domain:uname,pcmk_host_list=all,pcmk_host_map=remote-pcmk-1:pcmk-1;remote-pcmk-2:pcmk-2
### Remote node testing with valgrind
When running the remote node tests, the pacemaker components on the cluster
nodes can be run under valgrind as described in the "Memory testing" section.
-However, pacemaker_remote cannot be run under valgrind that way, because it is
+However, pacemaker-remoted cannot be run under valgrind that way, because it is
started by the OS's regular boot system and not by pacemaker.
Details vary by system, but the goal is to set the VALGRIND_OPTS environment
-variable and then start pacemaker_remoted by prefixing it with the path to
+variable and then start pacemaker-remoted by prefixing it with the path to
valgrind.
-The init script and systemd service file provided with pacemaker_remote will
+The init script and systemd service file provided with pacemaker-remoted will
load the pacemaker environment variables from the same location used by other
pacemaker components, so VALGRIND_OPTS will be set correctly if using one of
those.
For an OS using systemd, you can override the ExecStart parameter to run
valgrind. For example:
mkdir /etc/systemd/system/pacemaker_remote.service.d
cat >/etc/systemd/system/pacemaker_remote.service.d/valgrind.conf <<EOF
[Service]
ExecStart=
- ExecStart=/usr/bin/valgrind /usr/sbin/pacemaker_remoted
+ ExecStart=/usr/bin/valgrind /usr/sbin/pacemaker-remoted
EOF
### Container testing
If the --container-tests option is given to CTS, it will enable
testing of LXC resources (currently only the RemoteLXC test,
which starts a remote node using an LXC container).
The container tests have additional package dependencies (see the toplevel
README). Also, SELinux must be enabled (in either permissive or enforcing mode),
libvirtd must be enabled and running, and root must be able to ssh without a
password between all cluster nodes (not just from the test machine). Before
running the tests, you can verify your environment with:
/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v
LXC tests will create two containers with hardcoded parameters: a NAT'ed bridge
named virbr0 using the IP network 192.168.123.0/24 will be created on the
cluster node hosting the containers; the host will be assigned
52:54:00:A8:12:35 as the MAC address and 192.168.123.1 as the IP address.
Each container will be assigned a random MAC address starting with 52:54:,
the IP address 192.168.123.11 or 192.168.123.12, the hostname lxc1 or lxc2
(which will be added to the host's /etc/hosts file), and 196MB RAM.
The test will revert all of the configuration when it is done.
## Mini-HOWTO: Allow passwordless remote SSH connections
The CTS scripts run "ssh -l root" so you don't have to do any of your testing
logged in as root on the test machine. Here is how to allow such connections
without requiring a password to be entered each time:
* On your test exerciser, create an SSH key if you do not already have one.
Most commonly, SSH keys will be in your ~/.ssh directory, with the
private key file not having an extension, and the public key file
named the same with the extension ".pub" (for example, ~/.ssh/id_rsa.pub).
If you don't already have a key, you can create one with:
ssh-keygen -t rsa
* From your test exerciser, authorize your SSH public key for root on all test
machines (both the exerciser and the cluster test machines):
ssh-copy-id -i ~/.ssh/id_rsa.pub root@$MACHINE
You will probably have to provide your password, and possibly say
"yes" to some questions about accepting the identity of the test machines.
The above assumes you have an RSA SSH key in the specified location;
if you have some other type of key (DSA, ECDSA, etc.), use its file name
in the -i option above.
* To test, try this command from the exerciser machine for each
of your cluster machines, and for the exerciser machine itself.
ssh -l root $MACHINE
If this works without prompting for a password, you're in business.
If not, look at the documentation for your version of ssh.
diff --git a/cts/cts-exec.in b/cts/cts-exec.in
index ac9690e6d3..11c074146f 100644
--- a/cts/cts-exec.in
+++ b/cts/cts-exec.in
@@ -1,1278 +1,1278 @@
#!@PYTHON@
""" Regression tests for Pacemaker's pacemaker-execd
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
__copyright__ = "Copyright 2012-2018 Andrew Beekhof <andrew@beekhof.net>"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import io
import os
import stat
import sys
import subprocess
import shlex
import time
# Where to find test binaries
# Prefer the source tree if available
BUILD_DIR = "@abs_top_builddir@"
TEST_DIR = sys.path[0]
SBIN_DIR = "@sbindir@"
# File permissions for executable scripts we create
EXECMODE = stat.S_IRUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH
# These values must be kept in sync with include/crm/crm.h
class CrmExit:
OK = 0
ERROR = 1
INVALID_PARAM = 2
UNIMPLEMENT_FEATURE = 3
INSUFFICIENT_PRIV = 4
NOT_INSTALLED = 5
NOT_CONFIGURED = 6
NOT_RUNNING = 7
USAGE = 64
DATAERR = 65
NOINPUT = 66
NOUSER = 67
NOHOST = 68
UNAVAILABLE = 69
SOFTWARE = 70
OSERR = 71
OSFILE = 72
CANTCREAT = 73
IOERR = 74
TEMPFAIL = 75
PROTOCOL = 76
NOPERM = 77
CONFIG = 78
FATAL = 100
PANIC = 101
DISCONNECT = 102
SOLO = 103
DIGEST = 104
NOSUCH = 105
QUORUM = 106
UNSAFE = 107
EXISTS = 108
MULTIPLE = 109
OLD = 110
TIMEOUT = 124
MAX = 255
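# Illustrative use of CrmExit (hypothetical check): a test helper might exit
# with a symbolic code, e.g.
#   sys.exit(CrmExit.NOT_INSTALLED)
# when a required daemon binary cannot be found.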
def update_path():
""" Set the PATH environment variable appropriately for the tests """
new_path = os.environ['PATH']
if os.path.exists("%s/cts-exec.in" % TEST_DIR):
print("Running tests from the source tree: %s (%s)" % (BUILD_DIR, TEST_DIR))
- # For pacemaker-execd, cts-exec-helper, and pacemaker_remoted
+ # For pacemaker-execd, cts-exec-helper, and pacemaker-remoted
new_path = "%s/daemons/execd:%s" % (BUILD_DIR, new_path)
new_path = "%s/tools:%s" % (BUILD_DIR, new_path) # For crm_resource
new_path = "%s/fencing:%s" % (BUILD_DIR, new_path) # For stonithd
else:
print("Running tests from the install tree: @CRM_DAEMON_DIR@ (not %s)" % TEST_DIR)
- # For stonithd, pacemaker-execd, cts-exec-helper, and pacemaker_remoted
+ # For stonithd, pacemaker-execd, cts-exec-helper, and pacemaker-remoted
new_path = "@CRM_DAEMON_DIR@:%s" % (new_path)
print('Using PATH="{}"'.format(new_path))
os.environ['PATH'] = new_path
def pipe_output(pipes, stdout=True, stderr=False):
""" Wrapper to get text output from pipes regardless of Python version """
output = ""
pipe_outputs = pipes.communicate()
if sys.version_info < (3,):
if stdout:
output = output + pipe_outputs[0]
if stderr:
output = output + pipe_outputs[1]
else:
if stdout:
output = output + pipe_outputs[0].decode(sys.stdout.encoding)
if stderr:
output = output + pipe_outputs[1].decode(sys.stderr.encoding)
return output
def output_from_command(command):
""" Run a command, and return its standard output. """
test = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
test.wait()
return pipe_output(test).split("\n")
def write_file(filename, contents, executable=False):
""" Create a file. """
f = io.open(filename, "w+")
f.write(contents)
f.close()
if executable:
os.chmod(filename, EXECMODE)
class TestError(Exception):
""" Base class for exceptions in this module """
pass
class ExitCodeError(TestError):
""" Exception raised when command exit status is unexpected """
def __init__(self, exit_code):
self.exit_code = exit_code
def __str__(self):
return repr(self.exit_code)
class OutputNotFoundError(TestError):
""" Exception raised when command output does not contain wanted string """
def __init__(self, output):
self.output = output
def __str__(self):
return repr(self.output)
class OutputFoundError(TestError):
""" Exception raised when command output contains unwanted string """
def __init__(self, output):
self.output = output
def __str__(self):
return repr(self.output)
class Test(object):
""" Executor for a single pacemaker-execd regression test """
def __init__(self, name, description, verbose=0, tls=0):
self.name = name
self.description = description
self.cmds = []
if tls:
- self.daemon_location = "pacemaker_remoted"
+ self.daemon_location = "pacemaker-remoted"
else:
self.daemon_location = "pacemaker-execd"
self.test_tool_location = "cts-exec-helper"
self.verbose = verbose
self.tls = tls
self.result_txt = ""
self.cmd_tool_output = ""
self.result_exitcode = CrmExit.OK
self.execd_process = None
self.stonith_process = None
self.executed = 0
def __new_cmd(self, cmd, args, exitcode, stdout_match="", no_wait=0, stdout_negative_match="", kill=None):
""" Add a command to be executed as part of this test """
if self.verbose and cmd == self.test_tool_location:
args = args + " -V "
if (cmd == self.test_tool_location) and self.tls:
args = args + " -S "
self.cmds.append(
{
"cmd" : cmd,
"kill" : kill,
"args" : args,
"expected_exitcode" : exitcode,
"stdout_match" : stdout_match,
"stdout_negative_match" : stdout_negative_match,
"no_wait" : no_wait,
"cmd_output" : "",
}
)
def start_environment(self):
""" Prepare the host for running a test """
### make sure we are in full control here ###
- cmd = shlex.split("killall -q -9 stonithd lt-stonithd pacemaker-execd lt-pacemaker-execd cts-exec-helper lt-cts-exec-helper pacemaker_remoted")
+ cmd = shlex.split("killall -q -9 stonithd lt-stonithd pacemaker-execd lt-pacemaker-execd cts-exec-helper lt-cts-exec-helper pacemaker-remoted")
test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
test.wait()
additional_args = ""
if self.tls == 0:
self.stonith_process = subprocess.Popen(shlex.split("stonithd -s"))
if self.verbose:
additional_args = additional_args + " -V"
self.execd_process = subprocess.Popen(shlex.split("%s %s -l /tmp/pacemaker-execd-regression.log"
% (self.daemon_location, additional_args)))
time.sleep(1)
def clean_environment(self):
""" Clean up the host after running a test """
if self.execd_process:
self.execd_process.terminate()
self.execd_process.wait()
if self.verbose:
print("Daemon output")
logfile = io.open('/tmp/pacemaker-execd-regression.log', 'rt', errors='replace')
for line in logfile:
print(line.strip().encode('utf-8', 'replace'))
os.remove('/tmp/pacemaker-execd-regression.log')
if self.stonith_process:
self.stonith_process.terminate()
self.stonith_process.wait()
self.execd_process = None
self.stonith_process = None
def add_sys_cmd(self, cmd, args):
""" Add a simple command to be executed as part of this test """
self.__new_cmd(cmd, args, CrmExit.OK, "")
def add_cmd_check_stdout(self, args, match, no_match=""):
""" Add a command with expected output to be executed as part of this test """
self.__new_cmd(self.test_tool_location, args, CrmExit.OK, match, 0, no_match)
def add_cmd(self, args):
""" Add a cts-exec-helper command to be executed as part of this test """
self.__new_cmd(self.test_tool_location, args, CrmExit.OK, "")
def add_cmd_and_kill(self, kill_proc, args):
""" Add a cts-exec-helper command and system command to be executed as part of this test """
self.__new_cmd(self.test_tool_location, args, CrmExit.OK, "", kill=kill_proc)
def add_expected_fail_cmd(self, args, exitcode=CrmExit.ERROR):
""" Add a cts-exec-helper command to be executed as part of this test and expected to fail """
self.__new_cmd(self.test_tool_location, args, exitcode, "")
def get_exitcode(self):
""" Return the exit status of the last test execution """
return self.result_exitcode
def print_result(self, filler):
""" Print the result of the last test execution """
print("%s%s" % (filler, self.result_txt))
def run_cmd(self, args):
""" Execute a command as part of this test """
cmd = shlex.split(args['args'])
cmd.insert(0, args['cmd'])
if self.verbose:
print("\n\nRunning: "+" ".join(cmd))
test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
if args['kill']:
if self.verbose:
print("Also running: "+args['kill'])
### Typically, the kill argument is used to detect some sort of
### failure. Without yielding for a few seconds here, the process
### launched earlier that is listening for the failure may not have
### time to connect to pacemaker-execd.
time.sleep(2)
subprocess.Popen(shlex.split(args['kill']))
if args['no_wait'] == 0:
test.wait()
else:
return CrmExit.OK
output = pipe_output(test)
args['cmd_output'] = output
if test.returncode != args['expected_exitcode']:
raise ExitCodeError(test.returncode)
if args['stdout_match'] != "" and output.count(args['stdout_match']) == 0:
raise OutputNotFoundError(output)
if args['stdout_negative_match'] != "" and output.count(args['stdout_negative_match']) != 0:
raise OutputFoundError(output)
def set_error(self, step, cmd):
""" Record failure of this test """
msg = "FAILURE - '%s' failed at step %d. Command: %s %s"
self.result_txt = msg % (self.name, step, cmd['cmd'], cmd['args'])
self.result_exitcode = CrmExit.ERROR
def run(self):
""" Execute this test. """
res = 0
i = 1
if self.tls and self.name.count("stonith") != 0:
self.result_txt = "SKIPPED - '%s' - disabled when testing pacemaker_remote" % (self.name)
print(self.result_txt)
return res
self.start_environment()
if self.verbose:
print("\n--- START TEST - %s" % self.name)
self.result_txt = "SUCCESS - '%s'" % (self.name)
self.result_exitcode = CrmExit.OK
for cmd in self.cmds:
try:
self.run_cmd(cmd)
except ExitCodeError as e:
print(cmd['cmd_output'])
print("Step %d FAILED - command returned %s, expected %d" % (i, e, cmd['expected_exitcode']))
self.set_error(i, cmd)
break
except OutputNotFoundError as e:
print("Step %d FAILED - '%s' was not found in command output: %s" % (i, cmd['stdout_match'], e))
self.set_error(i, cmd)
break
except OutputFoundError as e:
print("Step %d FAILED - '%s' was found in command output: %s" % (i, cmd['stdout_negative_match'], e))
self.set_error(i, cmd)
break
if self.verbose:
print(cmd['cmd_output'].strip())
print("Step %d SUCCESS" % (i))
i = i + 1
self.clean_environment()
print(self.result_txt)
if self.verbose:
print("--- END TEST - %s\n" % self.name)
self.executed = 1
return res
class Tests(object):
""" Collection of all pacemaker-execd regression tests """
def __init__(self, verbose=0, tls=0):
self.tests = []
self.verbose = verbose
self.tls = tls
self.rsc_classes = output_from_command("crm_resource --list-standards")
self.rsc_classes = self.rsc_classes[:-1] # Strip trailing empty line
self.need_authkey = 0
self.action_timeout = " -t 9000 "
if self.tls:
self.rsc_classes.remove("stonith")
if "systemd" in self.rsc_classes:
try:
# This code doesn't need this import, but pacemaker-cts-dummyd
# does, so ensure the dependency is available rather than cause
# all systemd tests to fail.
import systemd.daemon
except ImportError:
print("Fatal error: python systemd bindings not found. Is package installed?",
file=sys.stderr)
sys.exit(CrmExit.ERROR)
print("Testing resource classes", repr(self.rsc_classes))
self.common_cmds = {
"ocf_reg_line" : "-c register_rsc -r ocf_test_rsc "+self.action_timeout+" -C ocf -P pacemaker -T Dummy",
"ocf_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:ocf_test_rsc action:none rc:ok op_status:complete\"",
"ocf_unreg_line" : "-c unregister_rsc -r \"ocf_test_rsc\" "+self.action_timeout,
"ocf_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:ocf_test_rsc action:none rc:ok op_status:complete\"",
"ocf_start_line" : "-c exec -r \"ocf_test_rsc\" -a \"start\" "+self.action_timeout,
"ocf_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:start rc:ok op_status:complete\" ",
"ocf_stop_line" : "-c exec -r \"ocf_test_rsc\" -a \"stop\" "+self.action_timeout,
"ocf_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:stop rc:ok op_status:complete\" ",
"ocf_monitor_line" : '-c exec -r ocf_test_rsc -a monitor -i 2s ' + self.action_timeout,
"ocf_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout,
"ocf_cancel_line" : '-c cancel -r ocf_test_rsc -a monitor -i 2s -t 6000 ',
"ocf_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:ocf_test_rsc action:monitor rc:ok op_status:Cancelled\" ",
"systemd_reg_line" : "-c register_rsc -r systemd_test_rsc " +
self.action_timeout +
" -C systemd -T pacemaker-cts-dummyd@3",
"systemd_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:systemd_test_rsc action:none rc:ok op_status:complete\"",
"systemd_unreg_line" : "-c unregister_rsc -r \"systemd_test_rsc\" "+self.action_timeout,
"systemd_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:systemd_test_rsc action:none rc:ok op_status:complete\"",
"systemd_start_line" : "-c exec -r \"systemd_test_rsc\" -a \"start\" "+self.action_timeout,
"systemd_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:start rc:ok op_status:complete\" ",
"systemd_stop_line" : "-c exec -r \"systemd_test_rsc\" -a \"stop\" "+self.action_timeout,
"systemd_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:stop rc:ok op_status:complete\" ",
"systemd_monitor_line" : '-c exec -r systemd_test_rsc -a monitor -i 2s ' + self.action_timeout,
# not sure why this one takes so much longer
"systemd_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:monitor rc:ok op_status:complete\" -t 12000 ",
"systemd_cancel_line" : '-c cancel -r systemd_test_rsc -a monitor -i 2s -t 6000 ',
"systemd_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:systemd_test_rsc action:monitor rc:ok op_status:Cancelled\" ",
"upstart_reg_line" : "-c register_rsc -r upstart_test_rsc "+self.action_timeout+" -C upstart -T pacemaker-cts-dummyd",
"upstart_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:upstart_test_rsc action:none rc:ok op_status:complete\"",
"upstart_unreg_line" : "-c unregister_rsc -r \"upstart_test_rsc\" "+self.action_timeout,
"upstart_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:upstart_test_rsc action:none rc:ok op_status:complete\"",
"upstart_start_line" : "-c exec -r \"upstart_test_rsc\" -a \"start\" "+self.action_timeout,
"upstart_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:start rc:ok op_status:complete\" ",
"upstart_stop_line" : "-c exec -r \"upstart_test_rsc\" -a \"stop\" "+self.action_timeout,
"upstart_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:stop rc:ok op_status:complete\" ",
"upstart_monitor_line" : '-c exec -r upstart_test_rsc -a monitor -i 2s ' + self.action_timeout,
"upstart_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout,
"upstart_cancel_line" : '-c cancel -r upstart_test_rsc -a monitor -i 2s -t 6000 ',
"upstart_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:upstart_test_rsc action:monitor rc:ok op_status:Cancelled\" ",
"service_reg_line" : "-c register_rsc -r service_test_rsc "+self.action_timeout+" -C service -T LSBDummy",
"service_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:service_test_rsc action:none rc:ok op_status:complete\"",
"service_unreg_line" : "-c unregister_rsc -r \"service_test_rsc\" "+self.action_timeout,
"service_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:service_test_rsc action:none rc:ok op_status:complete\"",
"service_start_line" : "-c exec -r \"service_test_rsc\" -a \"start\" "+self.action_timeout,
"service_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:start rc:ok op_status:complete\" ",
"service_stop_line" : "-c exec -r \"service_test_rsc\" -a \"stop\" "+self.action_timeout,
"service_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:stop rc:ok op_status:complete\" ",
"service_monitor_line" : '-c exec -r service_test_rsc -a monitor -i 2s ' + self.action_timeout,
"service_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout,
"service_cancel_line" : '-c cancel -r service_test_rsc -a monitor -i 2s -t 6000 ',
"service_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:service_test_rsc action:monitor rc:ok op_status:Cancelled\" ",
"lsb_reg_line" : "-c register_rsc -r lsb_test_rsc "+self.action_timeout+" -C lsb -T LSBDummy",
"lsb_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:lsb_test_rsc action:none rc:ok op_status:complete\" ",
"lsb_unreg_line" : "-c unregister_rsc -r \"lsb_test_rsc\" "+self.action_timeout,
"lsb_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:lsb_test_rsc action:none rc:ok op_status:complete\"",
"lsb_start_line" : "-c exec -r \"lsb_test_rsc\" -a \"start\" "+self.action_timeout,
"lsb_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:start rc:ok op_status:complete\" ",
"lsb_stop_line" : "-c exec -r \"lsb_test_rsc\" -a \"stop\" "+self.action_timeout,
"lsb_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:stop rc:ok op_status:complete\" ",
"lsb_monitor_line" : '-c exec -r lsb_test_rsc -a status -i 2s ' + self.action_timeout,
"lsb_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:status rc:ok op_status:complete\" "+self.action_timeout,
"lsb_cancel_line" : '-c cancel -r lsb_test_rsc -a status -i 2s -t 6000 ',
"lsb_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:lsb_test_rsc action:status rc:ok op_status:Cancelled\" ",
"stonith_reg_line" : "-c register_rsc -r stonith_test_rsc "+self.action_timeout+" -C stonith -P pacemaker -T fence_dummy_monitor",
"stonith_reg_event" : "-l \"NEW_EVENT event_type:register rsc_id:stonith_test_rsc action:none rc:ok op_status:complete\" ",
"stonith_unreg_line" : "-c unregister_rsc -r \"stonith_test_rsc\" "+self.action_timeout,
"stonith_unreg_event" : "-l \"NEW_EVENT event_type:unregister rsc_id:stonith_test_rsc action:none rc:ok op_status:complete\"",
"stonith_start_line" : "-c exec -r \"stonith_test_rsc\" -a \"start\" -t 8000 ",
"stonith_start_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:start rc:ok op_status:complete\" ",
"stonith_stop_line" : "-c exec -r \"stonith_test_rsc\" -a \"stop\" "+self.action_timeout,
"stonith_stop_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:stop rc:ok op_status:complete\" ",
"stonith_monitor_line" : '-c exec -r stonith_test_rsc -a monitor -i 2s ' + self.action_timeout,
"stonith_monitor_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout,
"stonith_cancel_line" : '-c cancel -r stonith_test_rsc -a monitor -i 2s -t 6000 ',
"stonith_cancel_event" : "-l \"NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:Cancelled\" ",
}
def new_test(self, name, description):
""" Create a named test """
test = Test(name, description, self.verbose, self.tls)
self.tests.append(test)
return test
def setup_test_environment(self):
""" Prepare the host before executing any tests """
os.system("service pacemaker_remote stop")
self.cleanup_test_environment()
if self.tls and not os.path.isfile("/etc/pacemaker/authkey"):
self.need_authkey = 1
os.system("mkdir -p /etc/pacemaker")
os.system("dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1")
dummy_upstart_job = ("""
description "Dummy service for regression tests"
exec dd if=/dev/random of=/dev/null
""")
dummy_fence_sleep_agent = ("""#!@PYTHON@
import sys
import time
def main():
for line in sys.stdin:
if line.count("monitor") > 0:
time.sleep(30000)
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()
""")
dummy_fence_agent = ("""#!/bin/sh
while read line; do
case ${line} in
*monitor*) exit 0;;
*metadata*)
echo '<resource-agent name="fence_dummy_monitor" shortdesc="Dummy Fence agent for testing">'
echo ' <longdesc>dummy description.</longdesc>'
echo ' <vendor-url>http://www.example.com</vendor-url>'
echo ' <parameters>'
echo ' <parameter name="action" unique="0" required="1">'
echo ' <getopt mixed="-o, --action=[action]"/>'
echo ' <content type="string" default="reboot"/>'
echo ' <shortdesc lang="en">Fencing Action</shortdesc>'
echo ' </parameter>'
echo ' <parameter name="port" unique="0" required="0">'
echo ' <getopt mixed="-n, --plug=[id]"/>'
echo ' <content type="string"/>'
echo ' <shortdesc lang="en">Physical plug number or name of virtual machine</shortdesc>'
echo ' </parameter>'
echo ' </parameters>'
echo ' <actions>'
echo ' <action name="on"/>'
echo ' <action name="off"/>'
echo ' <action name="monitor"/>'
echo ' <action name="metadata"/>'
echo ' </actions>'
echo '</resource-agent>'
exit 0;;
esac
exit 1
done
""")
if os.path.isdir("/etc/init"):
write_file("/etc/init/pacemaker-cts-dummyd.conf", dummy_upstart_job);
write_file(SBIN_DIR + "/fence_dummy_sleep", dummy_fence_sleep_agent, executable=True);
write_file(SBIN_DIR + "/fence_dummy_monitor", dummy_fence_agent, executable=True);
if os.path.exists("%s/cts/LSBDummy" % BUILD_DIR):
os.system("cp %s/cts/LSBDummy /etc/init.d/LSBDummy" % BUILD_DIR)
if not os.path.exists("@OCF_RA_DIR@/pacemaker"):
os.system("mkdir -p @OCF_RA_DIR@/pacemaker/")
# Install helper OCF agents
for agent in ["Dummy", "Stateful", "ping"]:
os.system("cp %s/extra/resources/%s @OCF_RA_DIR@/pacemaker/%s" % (BUILD_DIR, agent, agent))
os.system("chmod a+x @OCF_RA_DIR@/pacemaker/%s" % (agent))
else:
# Assume it's installed
os.system("cp @datadir@/@PACKAGE@/tests/cts/LSBDummy /etc/init.d/LSBDummy")
os.system("chmod a+x /etc/init.d/LSBDummy")
os.system("mkdir -p @CRM_CORE_DIR@/root")
if os.path.exists("/bin/systemctl"):
os.system("systemctl daemon-reload")
def cleanup_test_environment(self):
""" Clean up the host after executing desired tests """
if self.need_authkey:
os.system("rm -f /etc/pacemaker/authkey")
os.system("rm -f /etc/init.d/LSBDummy")
os.system("rm -f " + SBIN_DIR + "/fence_dummy_monitor")
os.system("rm -f " + SBIN_DIR + "/fence_dummy_sleep")
if os.path.exists("/bin/systemctl"):
os.system("systemctl daemon-reload")
def build_generic_tests(self):
""" Register tests that apply to all resource classes """
common_cmds = self.common_cmds
### register/unregister tests ###
for rsc in self.rsc_classes:
test = self.new_test("generic_registration_%s" % (rsc),
"Simple resource registration test for %s standard" % (rsc))
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
### start/stop tests ###
for rsc in self.rsc_classes:
test = self.new_test("generic_start_stop_%s" % (rsc), "Simple start and stop test for %s standard" % (rsc))
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)])
test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)])
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
### monitor cancel test ###
for rsc in self.rsc_classes:
test = self.new_test("generic_monitor_cancel_%s" % (rsc),
"Simple monitor cancel test for %s standard" % (rsc))
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
test.add_cmd(common_cmds["%s_cancel_line" % (rsc)] + " " + common_cmds["%s_cancel_event" % (rsc)])
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)])
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
### monitor duplicate test ###
for rsc in self.rsc_classes:
test = self.new_test("generic_monitor_duplicate_%s" % (rsc),
"Test creation and canceling of duplicate monitors for %s standard" % (rsc))
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
# Add the duplicate monitors
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
# verify we still get update events
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
# Cancel the monitor; if the duplicate merged with the original, we should no longer see monitor updates
test.add_cmd(common_cmds["%s_cancel_line" % (rsc)] + " " + common_cmds["%s_cancel_event" % (rsc)])
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)])
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
### stop implies cancel test ###
for rsc in self.rsc_classes:
test = self.new_test("generic_stop_implies_cancel_%s" % (rsc),
"Verify stopping a resource implies cancel of recurring ops for %s standard" % (rsc))
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)])
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
### If this fails, that means the monitor may not be getting rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)])
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
### If this happens the monitor did not actually cancel correctly. ###
test.add_expected_fail_cmd(common_cmds["%s_monitor_event" % (rsc)], CrmExit.TIMEOUT)
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
def build_multi_rsc_tests(self):
""" Register complex tests that involve managing multiple resouces of different types """
common_cmds = self.common_cmds
# Do not use service and systemd at the same time; they refer to the same underlying resource.
### register start monitor stop unregister resources of each type at the same time. ###
test = self.new_test("multi_rsc_start_stop_all",
"Start, monitor, and stop resources of multiple types and classes")
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_reg_line" % (rsc)] + " " + common_cmds["%s_reg_event" % (rsc)])
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_start_line" % (rsc)] + " " + common_cmds["%s_start_event" % (rsc)])
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_monitor_line" % (rsc)] + " " + common_cmds["%s_monitor_event" % (rsc)])
for rsc in self.rsc_classes:
### If this fails, that means the monitor is not being rescheduled ####
test.add_cmd(common_cmds["%s_monitor_event" % (rsc)])
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_cancel_line" % (rsc)] + " " + common_cmds["%s_cancel_event" % (rsc)])
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_stop_line" % (rsc)] + " " + common_cmds["%s_stop_event" % (rsc)])
for rsc in self.rsc_classes:
test.add_cmd(common_cmds["%s_unreg_line" % (rsc)] + " " + common_cmds["%s_unreg_event" % (rsc)])
def build_negative_tests(self):
""" Register tests related to how pacemaker-execd handles failures """
### ocf start timeout test ###
test = self.new_test("ocf_start_timeout", "Force start timeout to occur, verify start failure.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "
+ self.action_timeout +
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
# -t must be less than self.action_timeout
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" -k \"op_sleep\" -v \"5\" -t 1000 -w")
test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:unknown error op_status:Timed Out" '
+ self.action_timeout)
test.add_cmd("-c exec -r test_rsc -a stop " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ")
test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### stonith start timeout test ###
test = self.new_test("stonith_start_timeout", "Force start timeout to occur, verify start failure.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"stonith\" -P \"pacemaker\" -T \"fence_dummy_sleep\" "
+ self.action_timeout +
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" -t 1000 -w") # -t must be less than self.action_timeout
test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:unknown error op_status:Timed Out" '
+ self.action_timeout)
test.add_cmd("-c exec -r test_rsc -a stop " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ")
test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### stonith component fail ###
common_cmds = self.common_cmds
test = self.new_test("stonith_component_fail", "Kill stonith component after pacemaker-execd connects")
test.add_cmd(common_cmds["stonith_reg_line"] + " " + common_cmds["stonith_reg_event"])
test.add_cmd(common_cmds["stonith_start_line"] + " " + common_cmds["stonith_start_event"])
test.add_cmd('-c exec -r stonith_test_rsc -a monitor -i 600s '
'-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:ok op_status:complete" '
+ self.action_timeout)
test.add_cmd_and_kill("killall -9 -q stonithd lt-stonithd",
'-l "NEW_EVENT event_type:exec_complete rsc_id:stonith_test_rsc action:monitor rc:unknown error op_status:error" -t 15000')
test.add_cmd(common_cmds["stonith_unreg_line"] + " " + common_cmds["stonith_unreg_event"])
### monitor fail for ocf resources ###
test = self.new_test("monitor_fail_ocf", "Force ocf monitor to fail, verify failure is reported.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "
+ self.action_timeout +
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r test_rsc -a monitor -i 1s '
+ self.action_timeout +
'-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"')
test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"'
+ self.action_timeout)
test.add_cmd('-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"'
+ self.action_timeout)
test.add_cmd_and_kill("rm -f @localstatedir@/run/Dummy-test_rsc.state",
'-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete" -t 6000')
test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s -t 6000 '
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "
+ self.action_timeout, CrmExit.TIMEOUT)
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "
+ self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "
+ self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### verify notify changes only for monitor operation. ###
test = self.new_test("monitor_changes_only", "Verify when flag is set, only monitor changes are notified.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+" -o "
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r test_rsc -a monitor -i 1s '
+ self.action_timeout +
' -o -l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete" ')
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd_and_kill("rm -f @localstatedir@/run/Dummy-test_rsc.state", "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" -t 6000")
test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s -t 6000 '
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd('-c unregister_rsc -r "test_rsc" ' + self.action_timeout +
'-l "NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete"')
### monitor fail for systemd resource ###
if "systemd" in self.rsc_classes:
test = self.new_test("monitor_fail_systemd", "Force systemd monitor to fail, verify failure is reported..")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C systemd -T pacemaker-cts-dummyd@3 " +
self.action_timeout +
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r test_rsc -a monitor -i 1s '
+ self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ")
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd_and_kill("killall -9 -q pacemaker-cts-dummyd",
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" -t 8000")
test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s -t 6000 '
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### monitor fail for upstart resource ###
if "upstart" in self.rsc_classes:
test = self.new_test("monitor_fail_upstart", "Force upstart monitor to fail, verify failure is reported..")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C upstart -T pacemaker-cts-dummyd "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r test_rsc -a monitor -i 1s ' + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ")
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd_and_kill("killall -9 -q dd", "-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" -t 8000")
test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s -t 6000 '
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Cancel non-existent operation on a resource ###
test = self.new_test("cancel_non_existent_op", "Attempt to cancel the wrong monitor operation, verify expected failure")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r test_rsc -a monitor -i 1s '
+ self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ")
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout)
test.add_expected_fail_cmd('-c cancel -r test_rsc -a monitor -i 2s -t 6000 ' ### interval is wrong, should fail
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_expected_fail_cmd('-c cancel -r test_rsc -a stop -i 1s -t 6000 ' ### action name is wrong, should fail
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:not running op_status:Cancelled\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" " + self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Attempt to invoke non-existent rsc id ###
test = self.new_test("invoke_non_existent_rsc", "Attempt to perform operations on a non-existent rsc id.")
test.add_expected_fail_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:unknown error op_status:complete\" ")
test.add_expected_fail_cmd("-c exec -r test_rsc -a stop "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ")
test.add_expected_fail_cmd('-c exec -r test_rsc -a monitor -i 6s '
+ self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" ")
test.add_expected_fail_cmd("-c cancel -r test_rsc -a start "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Cancelled\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Register and start a resource that doesn't exist, systemd ###
if "systemd" in self.rsc_classes:
test = self.new_test("start_uninstalled_systemd", "Register uninstalled systemd agent, try to start, verify expected failure")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C systemd -T this_is_fake1234 "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
if "upstart" in self.rsc_classes:
test = self.new_test("start_uninstalled_upstart", "Register uninstalled upstart agent, try to start, verify expected failure")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C upstart -T this_is_fake1234 "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Register and start a resource that doesn't exist, ocf ###
test = self.new_test("start_uninstalled_ocf", "Register uninstalled ocf agent, try to start, verify expected failure.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C ocf -P pacemaker -T this_is_fake1234 "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Register ocf with non-existent provider ###
test = self.new_test("start_ocf_bad_provider", "Register ocf agent with a non-existent provider, verify expected failure.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C ocf -P pancakes -T Dummy "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:not installed op_status:Not installed\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Register ocf with empty provider field ###
test = self.new_test("start_ocf_no_provider", "Register ocf agent with a no provider, verify expected failure.")
test.add_expected_fail_cmd("-c register_rsc -r \"test_rsc\" -C ocf -T Dummy "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_expected_fail_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Error\" ")
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
def build_stress_tests(self):
""" Register stress tests """
timeout = "-t 20000"
iterations = 25
test = self.new_test("ocf_stress", "Verify OCF agent handling works under load")
for i in range(iterations):
test.add_cmd("-c register_rsc -r rsc_%s %s -C ocf -P heartbeat -T Dummy -l \"NEW_EVENT event_type:register rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd("-c exec -r rsc_%s -a start %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:start rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd('-c exec -r rsc_%s -a monitor %s -i 1s '
'-l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:monitor rc:ok op_status:complete"' % (i, timeout, i))
for i in range(iterations):
test.add_cmd("-c exec -r rsc_%s -a stop %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:stop rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd("-c unregister_rsc -r rsc_%s %s -l \"NEW_EVENT event_type:unregister rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i))
if "systemd" in self.rsc_classes:
test = self.new_test("systemd_stress", "Verify systemd dbus connection works under load")
for i in range(iterations):
test.add_cmd("-c register_rsc -r rsc_%s %s -C systemd -T pacemaker-cts-dummyd@3 -l \"NEW_EVENT event_type:register rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd("-c exec -r rsc_%s -a start %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:start rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd('-c exec -r rsc_%s -a monitor %s -i 1s '
'-l "NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:monitor rc:ok op_status:complete"' % (i, timeout, i))
for i in range(iterations):
test.add_cmd("-c exec -r rsc_%s -a stop %s -l \"NEW_EVENT event_type:exec_complete rsc_id:rsc_%s action:stop rc:ok op_status:complete\"" % (i, timeout, i))
test.add_cmd("-c unregister_rsc -r rsc_%s %s -l \"NEW_EVENT event_type:unregister rsc_id:rsc_%s action:none rc:ok op_status:complete\"" % (i, timeout, i))
iterations = 9
timeout = "-t 30000"
### Verify recurring op in-flight collision is handled properly in series
test = self.new_test("rsc_inflight_collision", "Verify recurring ops do not collide with other operations for the same rsc.")
test.add_cmd("-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy "
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-c exec -r test_rsc -a start %s -k op_sleep -v 1 -l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\"" % (timeout))
for i in range(iterations):
test.add_cmd('-c exec -r test_rsc -a monitor %s -i 100%dms '
'-k op_sleep -v 2 '
'-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"' % (timeout, i))
test.add_cmd("-c exec -r test_rsc -a stop %s -l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\"" % (timeout))
test.add_cmd("-c unregister_rsc -r test_rsc %s -l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\"" % (timeout))
def build_custom_tests(self):
""" Register tests that target specific cases """
### verify resource temporary folder is created and used by OCF agents. ###
test = self.new_test("rsc_tmp_dir", "Verify creation and use of rsc temporary state directory")
test.add_sys_cmd("ls", "-al @CRM_RSCTMP_DIR@")
test.add_cmd("-c register_rsc -r test_rsc -P heartbeat -C ocf -T Dummy "
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-c exec -r test_rsc -a start -t 4000")
test.add_sys_cmd("ls", "-al @CRM_RSCTMP_DIR@")
test.add_sys_cmd("ls", "@CRM_RSCTMP_DIR@/Dummy-test_rsc.state")
test.add_cmd("-c exec -r test_rsc -a stop -t 4000")
test.add_cmd("-c unregister_rsc -r test_rsc "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### start delay then stop test ###
test = self.new_test("start_delay", "Verify start delay works as expected.")
test.add_cmd("-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy "
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-c exec -r test_rsc -s 6000 -a start -w -t 6000")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" -t 2000", CrmExit.TIMEOUT)
test.add_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" -t 6000")
test.add_cmd("-c exec -r test_rsc -a stop " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:stop rc:ok op_status:complete\" ")
test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### start delay, but cancel before it gets a chance to start. ###
test = self.new_test("start_delay_cancel", "Using start_delay, start a rsc, but cancel the start op before execution.")
test.add_cmd("-c register_rsc -r test_rsc -P pacemaker -C ocf -T Dummy "
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" "+self.action_timeout)
test.add_cmd("-c exec -r test_rsc -s 5000 -a start -w -t 4000")
test.add_cmd("-c cancel -r test_rsc -a start " + self.action_timeout +
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:Cancelled\" ")
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" -t 5000", CrmExit.TIMEOUT)
test.add_cmd("-c unregister_rsc -r test_rsc " + self.action_timeout +
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### Register a bunch of resources, verify we can get info on them ###
test = self.new_test("verify_get_rsc_info", "Register multiple resources, verify retrieval of rsc info.")
if "systemd" in self.rsc_classes:
test.add_cmd("-c register_rsc -r rsc1 -C systemd -T pacemaker-cts-dummyd@3 "+self.action_timeout)
test.add_cmd("-c get_rsc_info -r rsc1 ")
test.add_cmd("-c unregister_rsc -r rsc1 "+self.action_timeout)
test.add_expected_fail_cmd("-c get_rsc_info -r rsc1 ")
if "upstart" in self.rsc_classes:
test.add_cmd("-c register_rsc -r rsc1 -C upstart -T pacemaker-cts-dummyd "+self.action_timeout)
test.add_cmd("-c get_rsc_info -r rsc1 ")
test.add_cmd("-c unregister_rsc -r rsc1 "+self.action_timeout)
test.add_expected_fail_cmd("-c get_rsc_info -r rsc1 ")
test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker "+self.action_timeout)
test.add_cmd("-c get_rsc_info -r rsc2 ")
test.add_cmd("-c unregister_rsc -r rsc2 "+self.action_timeout)
test.add_expected_fail_cmd("-c get_rsc_info -r rsc2 ")
### Register duplicate, verify only one entry exists and can still be removed.
test = self.new_test("duplicate_registration", "Register resource multiple times, verify only one entry exists and can be removed.")
test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker "+self.action_timeout)
test.add_cmd_check_stdout("-c get_rsc_info -r rsc2 ", "id:rsc2 class:ocf provider:pacemaker type:Dummy")
test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Dummy -P pacemaker "+self.action_timeout)
test.add_cmd_check_stdout("-c get_rsc_info -r rsc2 ", "id:rsc2 class:ocf provider:pacemaker type:Dummy")
test.add_cmd("-c register_rsc -r rsc2 -C ocf -T Stateful -P pacemaker "+self.action_timeout)
test.add_cmd_check_stdout("-c get_rsc_info -r rsc2 ", "id:rsc2 class:ocf provider:pacemaker type:Stateful")
test.add_cmd("-c unregister_rsc -r rsc2 "+self.action_timeout)
test.add_expected_fail_cmd("-c get_rsc_info -r rsc2 ")
### verify the option to send notifications only to the original client. ###
test = self.new_test("notify_orig_client_only", "Verify option to only send notifications to the client originating the action.")
test.add_cmd("-c register_rsc -r \"test_rsc\" -C \"ocf\" -P \"pacemaker\" -T \"Dummy\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:register rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
test.add_cmd("-c exec -r \"test_rsc\" -a \"start\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:start rc:ok op_status:complete\" ")
test.add_cmd('-c exec -r \"test_rsc\" -a \"monitor\" -i 1s '
+ self.action_timeout + ' -n '
'-l "NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete"')
# this will fail because the monitor notifications should only go to the original caller, which no longer exists.
test.add_expected_fail_cmd("-l \"NEW_EVENT event_type:exec_complete rsc_id:test_rsc action:monitor rc:ok op_status:complete\" "+self.action_timeout, CrmExit.TIMEOUT)
test.add_cmd('-c cancel -r test_rsc -a monitor -i 1s -t 6000 ')
test.add_cmd("-c unregister_rsc -r \"test_rsc\" "+self.action_timeout+
"-l \"NEW_EVENT event_type:unregister rsc_id:test_rsc action:none rc:ok op_status:complete\" ")
### get metadata ###
test = self.new_test("get_ocf_metadata", "Retrieve metadata for a resource")
test.add_cmd_check_stdout("-c metadata -C \"ocf\" -P \"pacemaker\" -T \"Dummy\"",
"resource-agent name=\"Dummy\"")
test.add_cmd("-c metadata -C \"ocf\" -P \"pacemaker\" -T \"Stateful\"")
test.add_expected_fail_cmd("-c metadata -P \"pacemaker\" -T \"Stateful\"")
test.add_expected_fail_cmd("-c metadata -C \"ocf\" -P \"pacemaker\" -T \"fake_agent\"")
### get metadata ###
test = self.new_test("get_lsb_metadata", "Retrieve metadata for a resource")
test.add_cmd_check_stdout("-c metadata -C \"lsb\" -T \"LSBDummy\"",
"resource-agent name='LSBDummy'")
### get stonith metadata ###
test = self.new_test("get_stonith_metadata", "Retrieve stonith metadata for a resource")
test.add_cmd_check_stdout("-c metadata -C \"stonith\" -P \"pacemaker\" -T \"fence_dummy_monitor\"",
"resource-agent name=\"fence_dummy_monitor\"")
### get metadata ###
if "systemd" in self.rsc_classes:
test = self.new_test("get_systemd_metadata", "Retrieve metadata for a resource")
test.add_cmd_check_stdout("-c metadata -C \"systemd\" -T \"pacemaker-cts-dummyd@\"",
"resource-agent name=\"pacemaker-cts-dummyd@\"")
### get metadata ###
if "upstart" in self.rsc_classes:
test = self.new_test("get_upstart_metadata", "Retrieve metadata for a resource")
test.add_cmd_check_stdout("-c metadata -C \"upstart\" -T \"pacemaker-cts-dummyd\"",
"resource-agent name=\"pacemaker-cts-dummyd\"")
### get ocf providers ###
test = self.new_test("list_ocf_providers",
"Retrieve list of available resource providers, verifies pacemaker is a provider.")
test.add_cmd_check_stdout("-c list_ocf_providers ", "pacemaker")
test.add_cmd_check_stdout("-c list_ocf_providers -T ping", "pacemaker")
### Verify agents only exist in their lists ###
test = self.new_test("verify_agent_lists", "Verify the agent lists contain the right data.")
test.add_cmd_check_stdout("-c list_agents ", "Stateful") ### ocf ###
test.add_cmd_check_stdout("-c list_agents -C ocf", "Stateful")
test.add_cmd_check_stdout("-c list_agents -C lsb", "", "Stateful") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C service", "", "Stateful") ### should not exist
test.add_cmd_check_stdout("-c list_agents ", "LSBDummy") ### init.d ###
test.add_cmd_check_stdout("-c list_agents -C lsb", "LSBDummy")
test.add_cmd_check_stdout("-c list_agents -C service", "LSBDummy")
test.add_cmd_check_stdout("-c list_agents -C ocf", "", "pacemaker-cts-dummyd@") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C ocf", "", "pacemaker-cts-dummyd@") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C lsb", "", "fence_dummy_monitor") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C service", "", "fence_dummy_monitor") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C ocf", "", "fence_dummy_monitor") ### should not exist
if "systemd" in self.rsc_classes:
test.add_cmd_check_stdout("-c list_agents ", "pacemaker-cts-dummyd@") ### systemd ###
test.add_cmd_check_stdout("-c list_agents -C service", "LSBDummy")
test.add_cmd_check_stdout("-c list_agents -C systemd", "", "Stateful") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C systemd", "pacemaker-cts-dummyd@")
test.add_cmd_check_stdout("-c list_agents -C systemd", "", "fence_dummy_monitor") ### should not exist
if "upstart" in self.rsc_classes:
test.add_cmd_check_stdout("-c list_agents ", "pacemaker-cts-dummyd") ### upstart ###
test.add_cmd_check_stdout("-c list_agents -C service", "LSBDummy")
test.add_cmd_check_stdout("-c list_agents -C upstart", "", "Stateful") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C upstart", "pacemaker-cts-dummyd")
test.add_cmd_check_stdout("-c list_agents -C upstart", "", "fence_dummy_monitor") ### should not exist
if "stonith" in self.rsc_classes:
test.add_cmd_check_stdout("-c list_agents -C stonith", "fence_dummy_monitor") ### stonith ###
test.add_cmd_check_stdout("-c list_agents -C stonith", "", "pacemaker-cts-dummyd@") ### should not exist
test.add_cmd_check_stdout("-c list_agents -C stonith", "", "Stateful") ### should not exist
test.add_cmd_check_stdout("-c list_agents ", "fence_dummy_monitor")
def print_list(self):
""" List all registered tests """
print("\n==== %d TESTS FOUND ====" % (len(self.tests)))
print("%35s - %s" % ("TEST NAME", "TEST DESCRIPTION"))
print("%35s - %s" % ("--------------------", "--------------------"))
for test in self.tests:
print("%35s - %s" % (test.name, test.description))
print("==== END OF LIST ====\n")
def run_single(self, name):
""" Run a single named test """
for test in self.tests:
if test.name == name:
test.run()
break
def run_tests_matching(self, pattern):
""" Run all tests whose name matches a pattern """
for test in self.tests:
if test.name.count(pattern) != 0:
test.run()
def run_tests(self):
""" Run all tests """
for test in self.tests:
test.run()
def exit(self):
""" Exit (with error status code if any test failed) """
for test in self.tests:
if test.executed == 0:
continue
if test.get_exitcode() != CrmExit.OK:
sys.exit(CrmExit.ERROR)
sys.exit(CrmExit.OK)
def print_results(self):
""" Print summary of results of executed tests """
failures = 0
success = 0
print("\n\n======= FINAL RESULTS ==========")
print("\n--- FAILURE RESULTS:")
for test in self.tests:
if test.executed == 0:
continue
if test.get_exitcode() != CrmExit.OK:
failures = failures + 1
test.print_result(" ")
else:
success = success + 1
if failures == 0:
print(" None")
print("\n--- TOTALS\n Pass:%d\n Fail:%d\n" % (success, failures))
class TestOptions(object):
""" Option handler """
def __init__(self):
self.options = {}
self.options['list-tests'] = 0
self.options['run-all'] = 1
self.options['run-only'] = ""
self.options['run-only-pattern'] = ""
self.options['verbose'] = 0
self.options['invalid-arg'] = ""
self.options['show-usage'] = 0
self.options['pacemaker-remote'] = 0
def build_options(self, argv):
""" Set options based on command-line arguments """
args = argv[1:]
skip = 0
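# "skip" marks that the next argument was consumed as an option value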
for i in range(0, len(args)):
if skip:
skip = 0
continue
elif args[i] == "-h" or args[i] == "--help":
self.options['show-usage'] = 1
elif args[i] == "-l" or args[i] == "--list-tests":
self.options['list-tests'] = 1
elif args[i] == "-V" or args[i] == "--verbose":
self.options['verbose'] = 1
elif args[i] == "-R" or args[i] == "--pacemaker-remote":
self.options['pacemaker-remote'] = 1
elif args[i] == "-r" or args[i] == "--run-only":
self.options['run-only'] = args[i+1]
skip = 1
elif args[i] == "-p" or args[i] == "--run-only-pattern":
self.options['run-only-pattern'] = args[i+1]
skip = 1
def show_usage(self):
""" Show command usage """
print("usage: " + sys.argv[0] + " [options]")
print("If no options are provided, all tests will run")
print("Options:")
print("\t [--help | -h] Show usage")
print("\t [--list-tests | -l] Print out all registered tests.")
print("\t [--run-only | -r 'testname'] Run a specific test")
print("\t [--verbose | -V] Verbose output")
- print("\t [--pacemaker-remote | -R Test pacemaker_remoted binary instead of pacemaker-execd.")
+ print("\t [--pacemaker-remote | -R Test pacemaker-remoted binary instead of pacemaker-execd")
print("\t [--run-only-pattern | -p 'string'] Run only tests containing the string value")
print("\n\tExample: Run only the test 'start_stop'")
print("\t\t " + sys.argv[0] + " --run-only start_stop")
print("\n\tExample: Run only the tests with the string 'systemd' present in them")
print("\t\t " + sys.argv[0] + " --run-only-pattern systemd")
def main(argv):
""" Run pacemaker-execd regression tests as specified by arguments """
update_path()
opts = TestOptions()
opts.build_options(argv)
tests = Tests(opts.options['verbose'], opts.options['pacemaker-remote'])
tests.build_generic_tests()
tests.build_multi_rsc_tests()
tests.build_negative_tests()
tests.build_custom_tests()
tests.build_stress_tests()
tests.setup_test_environment()
print("Starting ...")
if opts.options['list-tests']:
tests.print_list()
elif opts.options['show-usage']:
opts.show_usage()
elif opts.options['run-only-pattern'] != "":
tests.run_tests_matching(opts.options['run-only-pattern'])
tests.print_results()
elif opts.options['run-only'] != "":
tests.run_single(opts.options['run-only'])
tests.print_results()
else:
tests.run_tests()
tests.print_results()
tests.cleanup_test_environment()
tests.exit()
if __name__ == "__main__":
main(sys.argv)
diff --git a/cts/cts-regression.in b/cts/cts-regression.in
index 386867db3e..6dd8f46a10 100755
--- a/cts/cts-regression.in
+++ b/cts/cts-regression.in
@@ -1,216 +1,216 @@
#!@BASH_PATH@
#
# cts-regression
#
# Convenience wrapper for running any of the Pacemaker regression tests
#
# Copyright 2012-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
USAGE_TEXT="Usage: cts-regression [<options>] [<test> ...]
Options:
--help Display this text, then exit
-V, --verbose Increase test verbosity
-v, --valgrind Run test commands under valgrind
Tests (default tests are 'pengine cli exec'):
pengine Policy engine
cli Command-line tools
exec Local resource agent executor
- pacemaker_remote Local resource manager in remote mode
+ pacemaker_remote Resource agent executor in remote mode
fencing Fencing daemon
all Synonym for 'pengine cli exec fencing'"
# If readlink supports -e (i.e. GNU), use it
readlink -e / >/dev/null 2>/dev/null
if [ $? -eq 0 ]; then
test_home="$(dirname $(readlink -e $0))"
else
test_home="$(dirname $0)"
fi
valgrind=""
verbose=""
tests=""
# These constants must track crm_exit_t values
CRM_EX_OK=0
CRM_EX_ERROR=1
CRM_EX_NOT_INSTALLED=5
CRM_EX_USAGE=64
function info() {
printf "$*\n"
}
function error() {
printf " * ERROR: $*\n"
}
function run_as_root() {
CMD="$1"
shift
ARGS="$@"
# Test might not be executable if run from source directory
chmod a+x $CMD
CMD="$CMD $ARGS $verbose"
if [ $EUID -eq 0 ]; then
$CMD
elif [ -z $TRAVIS ]; then
# sudo doesn't work in buildbot, su doesn't work in travis
echo "Enter the root password..."
su root -c "$CMD"
else
echo "Enter the root password if prompted..."
sudo -- $CMD
fi
}
add_test() {
local TEST="$1"
case "$TEST" in
pengine|exec|pacemaker_remote|fencing|cli)
if [[ ! $tests =~ $TEST ]]; then
tests="$tests $TEST"
fi
;;
*)
error "unknown test: $TEST"
echo
echo "$USAGE_TEXT"
exit $CRM_EX_USAGE
;;
esac
}
run_test() {
local t="$1"
info "Executing the $t regression tests"
info "============================================================"
case $t in
pengine)
if [ -x $test_home/cts-pengine ]; then
$test_home/cts-pengine $verbose $valgrind
rc=$?
else
error "pengine regression test not found"
rc=$CRM_EX_NOT_INSTALLED
fi
;;
exec)
if [ -x $test_home/cts-exec ]; then
run_as_root $test_home/cts-exec
rc=$?
else
error "executor regression test not found"
rc=$CRM_EX_NOT_INSTALLED
fi
;;
pacemaker_remote)
if [ -x $test_home/cts-exec ]; then
run_as_root $test_home/cts-exec -R
rc=$?
else
error "pacemaker_remote regression test not found"
rc=$CRM_EX_NOT_INSTALLED
fi
;;
fencing)
if [ -x $test_home/cts-stonithd ]; then
run_as_root $test_home/cts-stonithd
rc=$?
else
error "fencing regression test not found"
rc=$CRM_EX_NOT_INSTALLED
fi
;;
cli)
if [ -x $test_home/cts-cli ]; then
$test_home/cts-cli $verbose $valgrind
rc=$?
else
error "cli regression test not found"
rc=$CRM_EX_NOT_INSTALLED
fi
;;
esac
info "============================================================"
info ""
info ""
return $rc
}
run_tests() {
TESTS="$@"
local TEST
local TEST_RC
local FAILED
FAILED=""
for TEST in $TESTS; do
run_test $TEST
TEST_RC=$?
if [ $TEST_RC -ne 0 ]; then
info "$TEST regression tests failed ($TEST_RC)"
FAILED="$FAILED $TEST"
fi
done
if [ -n "$FAILED" ]; then
error "failed regression tests: $FAILED"
return $CRM_EX_ERROR
fi
return $CRM_EX_OK
}
while [ $# -gt 0 ] ; do
case "$1" in
--help)
echo "$USAGE_TEXT"
exit $CRM_EX_OK
;;
-V|--verbose)
verbose="-V"
shift
;;
-v|--valgrind)
valgrind="-v"
shift
;;
pengine|exec|pacemaker_remote|fencing|cli)
add_test $1
shift
;;
all)
add_test pengine
add_test cli
add_test exec
add_test fencing
shift
;;
*)
error "unknown option: $1"
echo
echo "$USAGE_TEXT"
exit $CRM_EX_USAGE
;;
esac
done
if [ -z "$tests" ]; then
add_test pengine
add_test cli
add_test exec
fi
run_tests $tests
diff --git a/cts/lxc_autogen.sh.in b/cts/lxc_autogen.sh.in
index ab828311df..1479296af8 100644
--- a/cts/lxc_autogen.sh.in
+++ b/cts/lxc_autogen.sh.in
@@ -1,526 +1,532 @@
#!@BASH_PATH@
+#
+# Copyright 2013-2018 David Vossel <davidvossel@gmail.com>
+#
+# This source code is licensed under the GNU General Public License version 2
+# or later (GPLv2+) WITHOUT ANY WARRANTY.
+#
containers="2"
download=0
share_configs=0
# Use a network different from the default libvirt one, in case this is run nested in a KVM instance
addr="192.168.123.1"
restore=0
restore_pcmk=0
restore_all=0
generate=0
key_gen=0
cib=0
anywhere=0
add_master=0
verify=0
working_dir="@CRM_CONFIG_CTS@/lxc"
run_dirs="/run /var/run /usr/var/run"
SSH_CMD_OPTS="
-o StrictHostKeyChecking=no
-o ConnectTimeout=30
-o BatchMode=yes
-l root
-T
"
# must be on one line because it is used inside quotes
SSH_RSYNC_OPTS="-o UserKnownHostsFile=/dev/null -o BatchMode=yes -o StrictHostKeyChecking=no"
function helptext() {
echo "lxc_autogen.sh - A tool for generating libvirt lxc containers for testing purposes."
echo ""
echo "Usage: lxc-autogen [options]"
echo ""
echo "Options:"
echo "-g, --generate Generate libvirt lxc environment in the directory this script is run from."
echo "-k, --key-gen Generate pacemaker remote key only."
echo "-r, --restore-libvirt Restore the default network, and libvirt config to before this script ran."
echo "-p, --restore-cib Remove cib entries this script generated."
echo "-R, --restore-all Restore both libvirt and cib plus clean working directory. This will leave libvirt xml files though so rsc can be stopped properly."
echo ""
echo "-A, --allow-anywhere Allow the containers to live anywhere in the cluster"
echo "-a, --add-cib Add remote-node entries for each lxc instance into the cib"
echo "-m, --add-master Add master resource shared between remote-nodes"
echo "-d, --download-agent Download and install the latest VirtualDomain agent."
echo "-s, --share-configs Synchronize on all known cluster nodes"
echo "-c, --containers Specify the number of containers to generate, defaults to $containers. Used with -g"
echo "-n, --network What network to override default libvirt network to. Example: -n 192.168.123.1. Used with -g"
echo "-v, --verify Verify environment is capable of running lxc"
echo ""
exit $1
}
while true ; do
case "$1" in
--help|-h|-\?) helptext 0;;
-c|--containers) containers="$2"; shift; shift;;
-d|--download-agent) download=1; shift;;
-s|--share-configs) share_configs=1; shift;;
-n|--network) addr="$2"; shift; shift;;
-r|--restore-libvirt) restore=1; shift;;
-p|--restore-cib) restore_pcmk=1; shift;;
-R|--restore-all)
restore_all=1
restore=1
restore_pcmk=1
shift;;
-g|--generate) generate=1; key_gen=1; shift;;
-k|--key-gen) key_gen=1; shift;;
-a|--add-cib) cib=1; shift;;
-A|--allow-anywhere) anywhere=1; shift;;
-m|--add-master) add_master=1; shift;;
-v|--verify) verify=1; shift;;
"") break;;
*) helptext 1;;
esac
done
if [ $verify -eq 1 ]; then
# verify virsh tool is available and that
# we can connect to lxc driver.
virsh -c lxc:/// list --all > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "Could not connect 'virsh -c lxc:///' check that libvirt lxc driver is installed"
# yum install -y libvirt-daemon-driver-lxc libvirt-daemon-lxc libvirt-login-shell
exit 1
fi
cat /etc/selinux/config | grep -e "SELINUX.*=.*permissive" -e "SELINUX.*=.*enforcing" > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "/etc/selinux/config must have SELINUX set to permissive or enforcing mode."
exit 1
fi
ps x > /tmp/lxc-autogen-libvirt-test.txt
grep "libvirtd" /tmp/lxc-autogen-libvirt-test.txt
if [ $? -ne 0 ]; then
rm -f /tmp/lxc-autogen-libvirt-test.txt
echo "libvirtd isn't up."
exit 1
fi
rm -f /tmp/lxc-autogen-libvirt-test.txt
which rsync > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "rsync is required"
fi
- which pacemaker_remoted > /dev/null 2>&1
+ which pacemaker-remoted > /dev/null 2>&1
if [ $? -ne 0 ]; then
- echo "pacemaker_remoted is required"
+ echo "pacemaker-remoted is required"
fi
fi
# Strip the last octet off addr
addr=$(echo $addr | awk -F. '{print $1"."$2"."$3}')
this_node()
{
crm_node -n
}
other_nodes()
{
crm_node -l | awk "\$2 != \"$(this_node)\" {print \$2}"
}
make_directory()
{
# argument must be full path
DIR="$1"
mkdir -p "$DIR"
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
ssh $SSH_CMD_OPTS $node mkdir -p "$DIR"
done
fi
}
sync_file()
{
TARGET="$1"
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
rsync -ave "ssh $SSH_RSYNC_OPTS" "$TARGET" "${node}:${TARGET}"
done
fi
}
download_agent()
{
wget https://raw.github.com/ClusterLabs/resource-agents/master/heartbeat/VirtualDomain
chmod 755 VirtualDomain
mv -f VirtualDomain /usr/lib/ocf/resource.d/heartbeat/VirtualDomain
sync_file /usr/lib/ocf/resource.d/heartbeat/VirtualDomain
}
set_network()
{
rm -f cur_network.xml
cat << END >> cur_network.xml
<network>
<name>default</name>
<uuid>41ebdb84-7134-1111-a136-91f0f1119225</uuid>
<forward mode='nat'/>
<bridge name='virbr0' stp='on' delay='0' />
<mac address='52:54:00:A8:12:35'/>
<ip address='$addr.1' netmask='255.255.255.0'>
<dhcp>
<range start='$addr.2' end='$addr.254' />
</dhcp>
</ip>
</network>
END
sync_file ${working_dir}/cur_network.xml
}
distribute_configs()
{
for node in $(other_nodes); do
rsync -ave "ssh $SSH_RSYNC_OPTS" ${working_dir}/lxc*.xml ${node}:${working_dir}
rsync -ave "ssh $SSH_RSYNC_OPTS" ${working_dir}/lxc*-filesystem ${node}:${working_dir}
done
}
start_network()
{
NODE="$1"
ssh $SSH_CMD_OPTS $NODE <<-EOF
cd $working_dir
virsh net-info default >/dev/null 2>&1
if [ \$? -eq 0 ]; then
if [ ! -f restore_default.xml ]; then
virsh net-dumpxml default > restore_default.xml
fi
virsh net-destroy default
virsh net-undefine default
fi
virsh net-define cur_network.xml
virsh net-start default
virsh net-autostart default
EOF
}
start_network_all()
{
start_network $(this_node)
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
start_network $node
done
fi
}
add_hosts_entry()
{
IP="$1"
HNAME="$2"
echo $IP $HNAME >>/etc/hosts
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
ssh $SSH_CMD_OPTS $node "echo $IP $HNAME >>/etc/hosts"
done
fi
}
generate_key()
{
if [ ! -e /etc/pacemaker/authkey ]; then
make_directory /etc/pacemaker
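# 4KB of random data; cluster nodes and containers must share this key for Pacemaker Remote connections to authenticate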
dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1
sync_file /etc/pacemaker/authkey
fi
}
generate()
{
set_network
# Generate libvirt domains in xml
for (( c=1; c <= $containers; c++ ))
do
# Clean any previous definition
rm -rf lxc$c.xml lxc$c-filesystem
# Create a basic filesystem with run directories
for dir in $run_dirs; do
mkdir -p lxc$c-filesystem/$dir
done
# Create libvirt definition
suffix=$((10 + $c))
prefix=$(echo $addr | awk -F. '{print $1"."$2}')
subnet=$(echo $addr | awk -F. '{print $3}')
while [ $suffix -gt 255 ]; do
subnet=$(($subnet + 1))
suffix=$(($suffix - 255))
done
cip=$prefix.$subnet.$suffix
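# Example: with addr=192.168.123 and c=1, prefix=192.168, subnet=123, suffix=11, so cip=192.168.123.11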
cat << END >> lxc$c.xml
<domain type='lxc'>
<name>lxc$c</name>
<memory unit='KiB'>200704</memory>
<os>
<type>exe</type>
<init>$working_dir/lxc$c-filesystem/launch-helper</init>
</os>
<devices>
<console type='pty'/>
<filesystem type='ram'>
<source usage='150528'/>
<target dir='/dev/shm'/>
</filesystem>
END
for dir in $run_dirs; do
cat << END >> lxc$c.xml
<filesystem type='mount'>
<source dir='$working_dir/lxc$c-filesystem${dir}'/>
<target dir='$dir'/>
</filesystem>
END
done
cat << END >> lxc$c.xml
<interface type='network'>
<mac address='52:54:$(($RANDOM % 9))$(($RANDOM % 9)):$(($RANDOM % 9))$(($RANDOM % 9)):$(($RANDOM % 9))$(($RANDOM % 9)):$(($RANDOM % 9))$(($RANDOM % 9))'/>
<source network='default'/>
</interface>
</devices>
</domain>
END
# Create CIB definition
rm -f container$c.cib
cat << END >> container$c.cib
<primitive class="ocf" id="container$c" provider="heartbeat" type="VirtualDomain">
<instance_attributes id="container$c-instance_attributes">
<nvpair id="container$c-instance_attributes-force_stop" name="force_stop" value="true"/>
<nvpair id="container$c-instance_attributes-hypervisor" name="hypervisor" value="lxc:///"/>
<nvpair id="container$c-instance_attributes-config" name="config" value="$working_dir/lxc$c.xml"/>
</instance_attributes>
<utilization id="container$c-utilization">
<nvpair id="container$c-utilization-cpu" name="cpu" value="1"/>
<nvpair id="container$c-utilization-hv_memory" name="hv_memory" value="100"/>
</utilization>
<meta_attributes id="container$c-meta_attributes">
<nvpair id="container$c-meta_attributes-remote-node" name="remote-node" value="lxc$c"/>
</meta_attributes>
</primitive>
END
# Create container init
rm -f lxc$c-filesystem/launch-helper
cat << END >> lxc$c-filesystem/launch-helper
#!@BASH_PATH@
ip -f inet addr add $cip/24 dev eth0
ip link set eth0 up
ip route add default via $addr.1
hostname lxc$c
df > $working_dir/lxc$c-filesystem/disk_usage.txt
export PCMK_debugfile=@CRM_LOG_DIR@/pacemaker_remote_lxc$c.log
-/usr/sbin/pacemaker_remoted
+/usr/sbin/pacemaker-remoted
END
chmod 711 lxc$c-filesystem/launch-helper
add_hosts_entry $cip lxc$c
done
# Create CIB fragment for a master-slave resource
rm -f lxc-ms.cib
cat << END >> lxc-ms.cib
<master id="lxc-ms-master">
<primitive class="ocf" id="lxc-ms" provider="pacemaker" type="Stateful">
<instance_attributes id="lxc-ms-instance_attributes"/>
<operations>
<op id="lxc-ms-monitor-interval-10s" interval="10s" name="monitor"/>
</operations>
</primitive>
<meta_attributes id="lxc-ms-meta_attributes">
<nvpair id="lxc-ms-meta_attributes-master-max" name="master-max" value="1"/>
<nvpair id="lxc-ms-meta_attributes-clone-max" name="clone-max" value="$containers"/>
</meta_attributes>
</master>
END
}
apply_cib_master()
{
cibadmin -Q > cur.cib
export CIB_file=cur.cib
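# While CIB_file is set, cibadmin operates on this local file copy instead of the
# live cluster; the modified copy is pushed back with --replace below.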
cibadmin -o resources -Mc -x lxc-ms.cib
for tmp in $(ls lxc*.xml | sed -e 's/\.xml//g'); do
echo "<rsc_location id=\"lxc-ms-location-${tmp}\" node=\"${tmp}\" rsc=\"lxc-ms-master\" score=\"INFINITY\"/>" > tmp_constraint
cibadmin -o constraints -Mc -x tmp_constraint
done
# Make sure the version changes even if the content doesn't
cibadmin -B
unset CIB_file
cibadmin --replace -o configuration --xml-file cur.cib
rm -f cur.cib
}
apply_cib_entries()
{
cibadmin -Q > cur.cib
export CIB_file=cur.cib
for tmp in $(ls container*.cib); do
cibadmin -o resources -Mc -x $tmp
remote_node=$(grep remote-node ${tmp} | sed -n -e 's/^.*value=\"\(.*\)\".*/\1/p')
if [ $anywhere -eq 0 ]; then
tmp=$(echo $tmp | sed -e 's/\.cib//g')
crm_resource -M -r $tmp -H $(this_node)
fi
echo "<rsc_location id=\"lxc-ping-location-${remote_node}\" node=\"${remote_node}\" rsc=\"Connectivity\" score=\"-INFINITY\"/>" > tmp_constraint
# It's fine if applying this constraint fails; it only exists to help CTS
# when the connectivity resources are in use, since those resources fail on remote nodes.
cibadmin -o constraints -Mc -x tmp_constraint > /dev/null 2>&1
for rsc in $(crm_resource -l | grep rsc_ ); do
echo "<rsc_location id=\"lxc-${rsc}-location-${remote_node}\" node=\"${remote_node}\" rsc=\"${rsc}\" score=\"-INFINITY\"/>" > tmp_constraint
cibadmin -o constraints -Mc -x tmp_constraint > /dev/null 2>&1
done
rm -f tmp_constraint
done
# Make sure the version changes even if the content doesn't
cibadmin -B
unset CIB_file
cibadmin --replace -o configuration --xml-file cur.cib
rm -f cur.cib
}
restore_cib()
{
cibadmin -Q > cur.cib
export CIB_file=cur.cib
for tmp in $(ls lxc*.xml | sed -e 's/\.xml//g'); do
echo "<rsc_location id=\"lxc-ms-location-${tmp}\" node=\"${tmp}\" rsc=\"lxc-ms-master\" score=\"INFINITY\"/>" > tmp_constraint
cibadmin -o constraints -D -x tmp_constraint
echo "<rsc_location id=\"lxc-ping-location-${tmp}\" node=\"${tmp}\" rsc=\"Connectivity\" score=\"-INFINITY\"/>" > tmp_constraint
cibadmin -o constraints -D -x tmp_constraint
for rsc in $(crm_resource -l | grep rsc_ ); do
echo "<rsc_location id=\"lxc-${rsc}-location-${tmp}\" node=\"${tmp}\" rsc=\"${rsc}\" score=\"-INFINITY\"/>" > tmp_constraint
cibadmin -o constraints -D -x tmp_constraint
done
rm -f tmp_constraint
done
cibadmin -o resources -D -x lxc-ms.cib
for tmp in $(ls container*.cib); do
tmp=$(echo $tmp | sed -e 's/\.cib//g')
crm_resource -U -r $tmp -H $(this_node)
crm_resource -D -r $tmp -t primitive
done
# Make sure the version changes even if the content doesn't
cibadmin -B
unset CIB_file
cibadmin --replace -o configuration --xml-file cur.cib
rm -f cur.cib
# Allow the cluster to stabilize before continuing
crm_resource --wait
# Purge nodes from caches and CIB status section
for tmp in $(ls lxc*.xml | sed -e 's/\.xml//g'); do
crm_node --force --remove $tmp
done
}
restore_network()
{
NODE="$1"
ssh $SSH_CMD_OPTS $NODE <<-EOF
cd $working_dir
for tmp in \$(ls lxc*.xml | sed -e 's/\.xml//g'); do
virsh -c lxc:/// destroy \$tmp >/dev/null 2>&1
virsh -c lxc:/// undefine \$tmp >/dev/null 2>&1
sed -i.bak "/...\....\....\..* \${tmp}/d" /etc/hosts
done
virsh net-destroy default >/dev/null 2>&1
virsh net-undefine default >/dev/null 2>&1
if [ -f restore_default.xml ]; then
virsh net-define restore_default.xml
virsh net-start default
rm restore_default.xml
fi
EOF
echo "Containers destroyed and default network restored on $NODE"
}
restore_libvirt()
{
restore_network $(this_node)
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
restore_network $node
done
fi
}
restore_files()
{
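# Remove generated files from the working directory, keeping the lxc*.xml domain definitions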
ls | grep -v "lxc.\.xml" | xargs rm -rf
if [ $share_configs -eq 1 ]; then
for node in $(other_nodes); do
ssh $SSH_CMD_OPTS $node rm -rf \
$working_dir/lxc*-filesystem \
$working_dir/cur_network.xml
done
fi
}
make_directory $working_dir
cd $working_dir
# Generate files as requested
if [ $download -eq 1 ]; then
download_agent
fi
if [ $key_gen -eq 1 ]; then
generate_key
fi
if [ $generate -eq 1 ]; then
generate
fi
if [ $share_configs -eq 1 ]; then
distribute_configs
fi
if [ $generate -eq 1 ]; then
start_network_all
fi
# Update cluster as requested
if [ $cib -eq 1 ]; then
apply_cib_entries
fi
if [ $add_master -eq 1 ]; then
apply_cib_master
fi
# Restore original state as requested
if [ $restore_pcmk -eq 1 ]; then
restore_cib
fi
if [ $restore -eq 1 ]; then
restore_libvirt
fi
if [ $restore_all -eq 1 ]; then
restore_files
fi
diff --git a/cts/patterns.py b/cts/patterns.py
index f20f8d98e6..d2fadf382b 100644
--- a/cts/patterns.py
+++ b/cts/patterns.py
@@ -1,402 +1,402 @@
""" Pattern-holding classes for Pacemaker's Cluster Test Suite (CTS)
"""
# Pacemaker targets compatibility with Python 2.7 and 3.2+
from __future__ import print_function, unicode_literals, absolute_import, division
__copyright__ = "Copyright 2008-2018 Andrew Beekhof <andrew@beekhof.net>"
__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY"
import sys, os
from cts.CTSvars import *
patternvariants = {}
class BasePatterns(object):
def __init__(self, name):
self.name = name
patternvariants[name] = self
self.ignore = [
"avoid confusing Valgrind",
]
self.BadNews = []
self.components = {}
self.commands = {
"StatusCmd" : "crmadmin -t 60000 -S %s 2>/dev/null",
"CibQuery" : "cibadmin -Ql",
"CibAddXml" : "cibadmin --modify -c --xml-text %s",
"CibDelXpath" : "cibadmin --delete --xpath %s",
# 300,000 == 5 minutes
"RscRunning" : CTSvars.CRM_DAEMON_DIR + "/cts-exec-helper -R -r %s",
"CIBfile" : "%s:"+CTSvars.CRM_CONFIG_DIR+"/cib.xml",
"TmpDir" : "/tmp",
"BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1",
"FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1",
# tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit
# tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated
# tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1
# tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null
"ReduceCommCmd" : "",
"RestoreCommCmd" : "tc qdisc del dev lo root",
"SetCheckInterval" : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-recheck-interval-setting\" name=\"cluster-recheck-interval\" value=\"%s\"/></cluster_property_set>'",
"ClearCheckInterval" : "cibadmin --delete --xpath \"//nvpair[@name='cluster-recheck-interval']\"",
"MaintenanceModeOn" : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-maintenance-mode-setting\" name=\"maintenance-mode\" value=\"true\"/></cluster_property_set>'",
"MaintenanceModeOff" : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"",
"StandbyCmd" : "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null",
"StandbyQueryCmd" : "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null",
}
self.search = {
"Pat:DC_IDLE" : "crmd.*State transition.*-> S_IDLE",
# This won't work if we have multiple partitions
"Pat:Local_started" : "%s\W.*The local CRM is operational",
"Pat:NonDC_started" : r"%s\W.*State transition.*-> S_NOT_DC",
"Pat:DC_started" : r"%s\W.*State transition.*-> S_IDLE",
"Pat:We_stopped" : "%s\W.*OVERRIDE THIS PATTERN",
"Pat:They_stopped" : "%s\W.*LOST:.* %s ",
"Pat:They_dead" : "node %s.*: is dead",
"Pat:TransitionComplete" : "Transition status: Complete: complete",
"Pat:Fencing_start" : "(Initiating remote operation|Requesting peer fencing ).* (for|of) %s",
"Pat:Fencing_ok" : r"stonith.*:\s*Operation .* of %s by .* for .*@.*: OK",
"Pat:Fencing_recover" : r"pengine.*: Recover %s",
"Pat:RscOpOK" : r"crmd.*:\s+Result of %s operation for %s.*: (0 \()?ok",
"Pat:RscRemoteOpOK" : r"crmd.*:\s+Result of %s operation for %s on %s: (0 \()?ok",
"Pat:NodeFenced" : r"crmd.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK",
"Pat:FenceOpOK" : "Operation .* for host '%s' with device .* returned: 0",
}
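# The %s placeholders in the patterns above are filled in by CTS at runtime
# (typically with node or resource names) before matching against cluster logs.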
def get_component(self, key):
if key in self.components:
return self.components[key]
print("Unknown component '%s' for %s" % (key, self.name))
return []
def get_patterns(self, key):
if key == "BadNews":
return self.BadNews
elif key == "BadNewsIgnore":
return self.ignore
elif key == "Commands":
return self.commands
elif key == "Search":
return self.search
elif key == "Components":
return self.components
def __getitem__(self, key):
if key == "Name":
return self.name
elif key in self.commands:
return self.commands[key]
elif key in self.search:
return self.search[key]
else:
print("Unknown template '%s' for %s" % (key, self.name))
return None
class crm_corosync(BasePatterns):
'''
Patterns for Corosync version 2 cluster manager class
'''
def __init__(self, name):
BasePatterns.__init__(self, name)
self.commands.update({
"StartCmd" : "service corosync start && service pacemaker start",
- "StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker_remoted ] || service pacemaker_remote stop; service corosync stop",
+ "StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker-remoted ] || service pacemaker_remote stop; service corosync stop",
"EpochCmd" : "crm_node -e",
"QuorumCmd" : "crm_node -q",
"PartitionCmd" : "crm_node -p",
})
self.search.update({
# Close enough ... "Corosync Cluster Engine exiting normally" isn't
# printed reliably.
"Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines",
"Pat:They_stopped" : "%s\W.*crmd.*Node %s(\[|\s).*state is now lost",
"Pat:They_dead" : "crmd.*Node %s(\[|\s).*state is now lost",
"Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(",
"Pat:ChildKilled" : r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated with signal 9",
"Pat:ChildRespawn" : "%s\W.*pacemakerd.*Respawning failed child process: %s",
"Pat:InfraUp" : "%s\W.*corosync.*Initializing transport",
"Pat:PacemakerUp" : "%s\W.*pacemakerd.*Starting Pacemaker",
})
self.ignore = self.ignore + [
r"crm_mon:",
r"crmadmin:",
r"update_trace_data",
r"async_notify:.*strange, client not found",
r"Parse error: Ignoring unknown option .*nodename",
r"error.*: Operation 'reboot' .* with device 'FencingFail' returned:",
r"getinfo response error: 1$",
"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE",
r"sbd.* pcmk:\s*error:.*Connection to cib_ro failed",
r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* closed .I/O condition=17",
]
self.BadNews = [
r"error:",
r"crit:",
r"ERROR:",
r"CRIT:",
r"Shutting down...NOW",
r"Timer I_TERMINATE just popped",
r"input=I_ERROR",
r"input=I_FAIL",
r"input=I_INTEGRATED cause=C_TIMER_POPPED",
r"input=I_FINALIZED cause=C_TIMER_POPPED",
r"input=I_ERROR",
r"(pacemakerd|pacemaker-execd|crmd):.*, exiting",
r"pengine.*Attempting recovery of resource",
r"is taking more than 2x its timeout",
r"Confirm not received from",
r"Welcome reply not received from",
r"Attempting to schedule .* after a stop",
r"Resource .* was active at shutdown",
r"duplicate entries for call_id",
r"Search terminated:",
r":global_timer_callback",
r"Faking parameter digest creation",
r"Parameters to .* action changed:",
r"Parameters to .* changed",
r"\[[0-9]+\] terminated with signal [0-9]+ \(",
r"pengine:.*Recover .*\(.* -\> .*\)",
r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting",
r"Peer is not part of our cluster",
r"We appear to be in an election loop",
r"Unknown node -> we will not deliver message",
r"(Blackbox dump requested|Problem detected)",
r"pacemakerd.*Could not connect to Cluster Configuration Database API",
r"Receiving messages from a node we think is dead",
r"share the same cluster nodeid",
r"share the same name",
#r"crm_ipc_send:.*Request .* failed",
#r"crm_ipc_send:.*Sending to .* is disabled until pending reply is received",
# Not inherently bad, but worth tracking
#r"No need to invoke the TE",
#r"ping.*: DEBUG: Updated connected = 0",
#r"Digest mis-match:",
r"crmd:.*Transition failed: terminated",
r"Local CIB .* differs from .*:",
r"warn.*:\s*Continuing but .* will NOT be used",
r"warn.*:\s*Cluster configuration file .* is corrupt",
#r"Executing .* fencing operation",
r"Election storm",
r"stalled the FSA with pending inputs",
]
self.components["common-ignore"] = [
"Pending action:",
"error: crm_log_message_adv:",
r"resource( was|s were) active at shutdown",
"pending LRM operations at shutdown",
"Lost connection to the CIB service",
"Connection to the CIB terminated...",
"Sending message to CIB service FAILED",
"apply_xml_diff:.*Diff application failed!",
r"crmd.*:\s*Action A_RECOVER .* not supported",
"unconfirmed_actions:.*Waiting on .* unconfirmed actions",
"cib_native_msgready:.*Message pending on command channel",
r"crmd.*:\s*Performing A_EXIT_1 - forcefully exiting the CRMd",
"verify_stopped:.*Resource .* was active at shutdown. You may ignore this error if it is unmanaged.",
"error: attrd_connection_destroy:.*Lost connection to attrd",
r".*:\s*Executing .* fencing operation \(.*\) on ",
r".*:\s*Requesting fencing \([^)]+\) of node ",
r"(Blackbox dump requested|Problem detected)",
# "error: native_create_actions: Resource .*stonith::.* is active on 2 nodes attempting recovery",
# "error: process_pe_message: Transition .* ERRORs found during PE processing",
]
self.components["corosync-ignore"] = [
r"error:.*Connection to the CPG API failed: Library error",
r"\[[0-9]+\] exited with status [0-9]+ \(",
r"cib.*error:.*Corosync connection lost",
r"stonith-ng.*error:.*Corosync connection terminated",
r"pacemaker-execd.*error:.*Connection to stonith-ng.* (failed|closed)",
r"crmd.*State transition .* S_RECOVERY",
r"crmd.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state",
r"crmd.*error:.*Could not recover from internal error",
r"error:.*Connection to cib_(shm|rw).* (failed|closed)",
r"error:.*STONITH connection failed",
r"error: Connection to stonith-ng.* (failed|closed)",
r"crit: Fencing daemon connection failed",
]
self.components["corosync"] = [
r"pacemakerd.*error:.*Connection destroyed",
r"attrd.*:\s*(crit|error):.*Lost connection to (Corosync|CIB) service",
r"stonith.*:\s*(Corosync connection terminated|Shutting down)",
r"cib.*:\s*Corosync connection lost!\s+Exiting.",
r"crmd.*:\s*(connection terminated|Disconnected from Corosync)",
r"pengine.*Scheduling Node .* for STONITH",
r"crmd.*:\s*Peer .* was terminated \(.*\) by .* for .*:\s*OK",
]
self.components["cib-ignore"] = [
"pacemaker-execd.*Connection to stonith-ng failed",
"pacemaker-execd.*Connection to stonith-ng.* closed",
"pacemaker-execd.*LRMD lost STONITH connection",
"pacemaker-execd.*STONITH connection failed, finalizing .* pending operations",
]
self.components["cib"] = [
"State transition .* S_RECOVERY",
r"Respawning failed child process: (pacemaker-attrd|crmd)",
"Connection to cib_.* failed",
"Connection to cib_.* closed",
r"crmd.*:.*Connection to the CIB terminated...",
r"attrd.*:.*(Lost connection to CIB service|Connection to the CIB terminated)",
r"crmd\[[0-9]+\] exited with status 1 \(",
r"attrd\[[0-9]+\] exited with status 102 \(",
r"crmd.*: Input I_TERMINATE .*from do_recover",
"crmd.*I_ERROR.*crmd_cib_connection_destroy",
"crmd.*Could not recover from internal error",
]
self.components["pacemaker-execd"] = [
r"crmd.*Connection to (pacemaker-execd|lrmd|executor) (failed|closed)",
r"crmd.*I_ERROR.*lrm_connection_destroy",
r"crmd.*State transition .* S_RECOVERY",
r"crmd.*: Input I_TERMINATE .*from do_recover",
r"crmd.*Could not recover from internal error",
r"pacemakerd.*pacemaker-execd.* terminated with signal 9",
r"pacemakerd.*crmd\[[0-9]+\] exited with status 1",
r"pacemakerd.*Respawning failed child process: pacemaker-execd",
r"pacemakerd.*Respawning failed child process: crmd",
]
self.components["pacemaker-execd-ignore"] = []
self.components["crmd"] = [
# "WARN: determine_online_status: Node .* is unclean",
# "Scheduling Node .* for STONITH",
# "Executing .* fencing operation",
# Only if the node wasn't the DC: "State transition S_IDLE",
"State transition .* -> S_IDLE",
]
self.components["crmd-ignore"] = []
self.components["pacemaker-attrd"] = []
self.components["pacemaker-attrd-ignore"] = []
self.components["pengine"] = [
"State transition .* S_RECOVERY",
r"Respawning failed child process: crmd",
r"crmd\[[0-9]+\] exited with status 1 \(",
"Connection to pengine failed",
"Connection to pengine.* closed",
"Connection to the Policy Engine failed",
"crmd.*I_ERROR.*save_cib_contents",
r"crmd.*: Input I_TERMINATE .*from do_recover",
"crmd.*Could not recover from internal error",
]
self.components["pengine-ignore"] = []
self.components["stonith"] = [
"Connection to stonith-ng failed",
"LRMD lost STONITH connection",
"Connection to stonith-ng.* closed",
"Fencing daemon connection failed",
r"crmd.*:\s*warn.*:\s*Callback already present",
]
self.components["stonith-ignore"] = [
r"pengine.*: Recover Fencing",
r"Updating failcount for Fencing",
r"error:.*Connection to stonith-ng failed",
r"error:.*Connection to stonith-ng.*closed \(I/O condition=17\)",
r"crit:.*Fencing daemon connection failed",
r"error:.*Sign-in failed: triggered a retry",
"STONITH connection failed, finalizing .* pending operations.",
r"crmd.*:\s+Result of .* operation for Fencing.*Error",
]
self.components["stonith-ignore"].extend(self.components["common-ignore"])
class crm_corosync_docker(crm_corosync):
'''
Patterns for the Corosync version 2 cluster manager class, when run in Docker containers
'''
def __init__(self, name):
crm_corosync.__init__(self, name)
self.commands.update({
"StartCmd" : "pcmk_start",
"StopCmd" : "pcmk_stop",
})
class PatternSelector(object):
def __init__(self, name=None):
self.name = name
self.base = BasePatterns("crm-base")
if not name:
crm_corosync("crm-corosync")
elif name == "crm-corosync":
crm_corosync(name)
elif name == "crm-corosync-docker":
crm_corosync_docker(name)
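# Instantiating a pattern class registers it in the module-level patternvariants
# dict (see BasePatterns.__init__), which is how get_variant() finds it by name.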
def get_variant(self, variant):
if variant in patternvariants:
return patternvariants[variant]
print("defaulting to crm-base for %s" % variant)
return self.base
def get_patterns(self, variant, kind):
return self.get_variant(variant).get_patterns(kind)
def get_template(self, variant, key):
v = self.get_variant(variant)
return v[key]
def get_component(self, variant, kind):
return self.get_variant(variant).get_component(kind)
def __getitem__(self, key):
return self.get_template(self.name, key)
# python cts/patterns.py -k crm-corosync -t StartCmd
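# (prints the StartCmd template for the crm-corosync variant, i.e. the command
# CTS uses to start the cluster stack on a node)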
if __name__ == '__main__':
pdir = os.path.dirname(sys.path[0])
sys.path.insert(0, pdir) # So that things work from the source directory
kind = None
template = None
skipthis = None
args = sys.argv[1:]
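# Minimal hand-rolled option parsing; skipthis marks that the next argv entry
# was already consumed as an option value.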
for i in range(0, len(args)):
if skipthis:
skipthis = None
continue
elif args[i] == "-k" or args[i] == "--kind":
skipthis = 1
kind = args[i+1]
elif args[i] == "-t" or args[i] == "--template":
skipthis = 1
template = args[i+1]
else:
print("Illegal argument " + args[i])
print(PatternSelector(kind)[template])
diff --git a/cts/pengine/bundle-replicas-change.exp b/cts/pengine/bundle-replicas-change.exp
index 138dfcd1fa..ce77af7048 100644
--- a/cts/pengine/bundle-replicas-change.exp
+++ b/cts/pengine/bundle-replicas-change.exp
@@ -1,586 +1,586 @@
<transition_graph cluster-delay="60s" stonith-timeout="60s" failed-stop-offset="INFINITY" failed-start-offset="INFINITY" transition_id="0">
<synapse id="0">
<action_set>
<rsc_op id="37" operation="monitor" operation_key="httpd:0_monitor_10000" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test">
<primitive id="httpd:0" class="ocf" provider="heartbeat" type="apache"/>
<attributes CRM_meta_clone="0" CRM_meta_clone_max="3" CRM_meta_clone_node_max="3" CRM_meta_globally_unique="true" CRM_meta_interval="10000" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="httpd-bundle-0" CRM_meta_on_node_uuid="httpd-bundle-0" CRM_meta_physical_host="rh74-test" CRM_meta_timeout="40000" pcmk_external_ip="192.168.20.188"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="19" operation="start" operation_key="httpd-bundle-0_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="36" operation="start" operation_key="httpd:0_start_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test"/>
</trigger>
</inputs>
</synapse>
<synapse id="1">
<action_set>
<rsc_op id="36" operation="start" operation_key="httpd:0_start_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test">
<primitive id="httpd:0" class="ocf" provider="heartbeat" type="apache"/>
<attributes CRM_meta_clone="0" CRM_meta_clone_max="3" CRM_meta_clone_node_max="3" CRM_meta_globally_unique="true" CRM_meta_name="start" CRM_meta_notify="false" CRM_meta_on_node="httpd-bundle-0" CRM_meta_on_node_uuid="httpd-bundle-0" CRM_meta_physical_host="rh74-test" CRM_meta_timeout="200000" pcmk_external_ip="192.168.20.188"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="6" operation="start" operation_key="httpd-bundle-docker-0_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="19" operation="start" operation_key="httpd-bundle-0_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<pseudo_event id="42" operation="start" operation_key="httpd-bundle-clone_start_0"/>
</trigger>
</inputs>
</synapse>
<synapse id="2">
<action_set>
<rsc_op id="8" operation="monitor" operation_key="httpd:0_monitor_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test">
<primitive id="httpd:0" class="ocf" provider="heartbeat" type="apache"/>
<attributes CRM_meta_clone="0" CRM_meta_clone_max="3" CRM_meta_clone_node_max="3" CRM_meta_globally_unique="true" CRM_meta_notify="false" CRM_meta_on_node="httpd-bundle-0" CRM_meta_on_node_uuid="httpd-bundle-0" CRM_meta_op_target_rc="7" CRM_meta_physical_host="rh74-test" CRM_meta_timeout="40000" pcmk_external_ip="192.168.20.188"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="19" operation="start" operation_key="httpd-bundle-0_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="3">
<action_set>
<rsc_op id="39" operation="monitor" operation_key="httpd:1_monitor_10000" on_node="httpd-bundle-1" on_node_uuid="httpd-bundle-1" router_node="rh74-test">
<primitive id="httpd:1" class="ocf" provider="heartbeat" type="apache"/>
<attributes CRM_meta_clone="1" CRM_meta_clone_max="3" CRM_meta_clone_node_max="3" CRM_meta_globally_unique="true" CRM_meta_interval="10000" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="httpd-bundle-1" CRM_meta_on_node_uuid="httpd-bundle-1" CRM_meta_physical_host="rh74-test" CRM_meta_timeout="40000" pcmk_external_ip="192.168.20.189"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="24" operation="start" operation_key="httpd-bundle-1_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="38" operation="start" operation_key="httpd:1_start_0" on_node="httpd-bundle-1" on_node_uuid="httpd-bundle-1" router_node="rh74-test"/>
</trigger>
</inputs>
</synapse>
<synapse id="4">
<action_set>
<rsc_op id="38" operation="start" operation_key="httpd:1_start_0" on_node="httpd-bundle-1" on_node_uuid="httpd-bundle-1" router_node="rh74-test">
<primitive id="httpd:1" class="ocf" provider="heartbeat" type="apache"/>
<attributes CRM_meta_clone="1" CRM_meta_clone_max="3" CRM_meta_clone_node_max="3" CRM_meta_globally_unique="true" CRM_meta_name="start" CRM_meta_notify="false" CRM_meta_on_node="httpd-bundle-1" CRM_meta_on_node_uuid="httpd-bundle-1" CRM_meta_physical_host="rh74-test" CRM_meta_timeout="200000" pcmk_external_ip="192.168.20.189"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="22" operation="start" operation_key="httpd-bundle-docker-1_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="24" operation="start" operation_key="httpd-bundle-1_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="36" operation="start" operation_key="httpd:0_start_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test"/>
</trigger>
<trigger>
<pseudo_event id="42" operation="start" operation_key="httpd-bundle-clone_start_0"/>
</trigger>
</inputs>
</synapse>
<synapse id="5">
<action_set>
<rsc_op id="41" operation="monitor" operation_key="httpd:2_monitor_10000" on_node="httpd-bundle-2" on_node_uuid="httpd-bundle-2" router_node="rh74-test">
<primitive id="httpd:2" class="ocf" provider="heartbeat" type="apache"/>
<attributes CRM_meta_clone="2" CRM_meta_clone_max="3" CRM_meta_clone_node_max="3" CRM_meta_globally_unique="true" CRM_meta_interval="10000" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="httpd-bundle-2" CRM_meta_on_node_uuid="httpd-bundle-2" CRM_meta_physical_host="rh74-test" CRM_meta_timeout="40000" pcmk_external_ip="192.168.20.190"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="30" operation="start" operation_key="httpd-bundle-2_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="40" operation="start" operation_key="httpd:2_start_0" on_node="httpd-bundle-2" on_node_uuid="httpd-bundle-2" router_node="rh74-test"/>
</trigger>
</inputs>
</synapse>
<synapse id="6">
<action_set>
<rsc_op id="40" operation="start" operation_key="httpd:2_start_0" on_node="httpd-bundle-2" on_node_uuid="httpd-bundle-2" router_node="rh74-test">
<primitive id="httpd:2" class="ocf" provider="heartbeat" type="apache"/>
<attributes CRM_meta_clone="2" CRM_meta_clone_max="3" CRM_meta_clone_node_max="3" CRM_meta_globally_unique="true" CRM_meta_name="start" CRM_meta_notify="false" CRM_meta_on_node="httpd-bundle-2" CRM_meta_on_node_uuid="httpd-bundle-2" CRM_meta_physical_host="rh74-test" CRM_meta_timeout="200000" pcmk_external_ip="192.168.20.190"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="28" operation="start" operation_key="httpd-bundle-docker-2_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="30" operation="start" operation_key="httpd-bundle-2_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="38" operation="start" operation_key="httpd:1_start_0" on_node="httpd-bundle-1" on_node_uuid="httpd-bundle-1" router_node="rh74-test"/>
</trigger>
<trigger>
<pseudo_event id="42" operation="start" operation_key="httpd-bundle-clone_start_0"/>
</trigger>
</inputs>
</synapse>
<synapse id="7" priority="1000000">
<action_set>
<pseudo_event id="43" operation="running" operation_key="httpd-bundle-clone_running_0">
<attributes CRM_meta_clone_max="3" CRM_meta_clone_node_max="3" CRM_meta_globally_unique="true" CRM_meta_notify="false" CRM_meta_timeout="20000" />
</pseudo_event>
</action_set>
<inputs>
<trigger>
<rsc_op id="36" operation="start" operation_key="httpd:0_start_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test"/>
</trigger>
<trigger>
<rsc_op id="38" operation="start" operation_key="httpd:1_start_0" on_node="httpd-bundle-1" on_node_uuid="httpd-bundle-1" router_node="rh74-test"/>
</trigger>
<trigger>
<rsc_op id="40" operation="start" operation_key="httpd:2_start_0" on_node="httpd-bundle-2" on_node_uuid="httpd-bundle-2" router_node="rh74-test"/>
</trigger>
<trigger>
<pseudo_event id="42" operation="start" operation_key="httpd-bundle-clone_start_0"/>
</trigger>
</inputs>
</synapse>
<synapse id="8">
<action_set>
<pseudo_event id="42" operation="start" operation_key="httpd-bundle-clone_start_0">
<attributes CRM_meta_clone_max="3" CRM_meta_clone_node_max="3" CRM_meta_globally_unique="true" CRM_meta_notify="false" CRM_meta_timeout="20000" />
</pseudo_event>
</action_set>
<inputs>
<trigger>
<rsc_op id="8" operation="monitor" operation_key="httpd:0_monitor_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test"/>
</trigger>
<trigger>
<rsc_op id="10" operation="monitor" operation_key="httpd-bundle-docker-1_monitor_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="13" operation="monitor" operation_key="httpd-bundle-docker-2_monitor_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<pseudo_event id="32" operation="start" operation_key="httpd-bundle_start_0"/>
</trigger>
</inputs>
</synapse>
<synapse id="9">
<action_set>
<rsc_op id="17" operation="stop" operation_key="httpd-bundle-docker-0_stop_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-docker-0" class="ocf" provider="heartbeat" type="docker"/>
- <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0" reuse="false" run_cmd="/usr/sbin/pacemaker_remoted" run_opts=" --restart=no -h httpd-bundle-0 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-0:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-0:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-0:/var/log -p 192.168.20.188:80:80 -p 192.168.20.188:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
+ <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0" reuse="false" run_cmd="/usr/sbin/pacemaker-remoted" run_opts=" --restart=no -h httpd-bundle-0 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-0:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-0:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-0:/var/log -p 192.168.20.188:80:80 -p 192.168.20.188:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="18" operation="stop" operation_key="httpd-bundle-0_stop_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<pseudo_event id="34" operation="stop" operation_key="httpd-bundle_stop_0"/>
</trigger>
</inputs>
</synapse>
<synapse id="10">
<action_set>
<rsc_op id="6" operation="start" operation_key="httpd-bundle-docker-0_start_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-docker-0" class="ocf" provider="heartbeat" type="docker"/>
- <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0" reuse="false" run_cmd="/usr/sbin/pacemaker_remoted" run_opts=" --restart=no -h httpd-bundle-0 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-0:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-0:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-0:/var/log -p 192.168.20.188:80:80 -p 192.168.20.188:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
+ <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0" reuse="false" run_cmd="/usr/sbin/pacemaker-remoted" run_opts=" --restart=no -h httpd-bundle-0 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-0:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-0:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-0:/var/log -p 192.168.20.188:80:80 -p 192.168.20.188:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="17" operation="stop" operation_key="httpd-bundle-docker-0_stop_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<pseudo_event id="32" operation="start" operation_key="httpd-bundle_start_0"/>
</trigger>
</inputs>
</synapse>
<synapse id="11">
<action_set>
<rsc_op id="2" operation="monitor" operation_key="httpd-bundle-docker-0_monitor_60000" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-docker-0" class="ocf" provider="heartbeat" type="docker"/>
- <attributes CRM_meta_interval="60000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0" reuse="false" run_cmd="/usr/sbin/pacemaker_remoted" run_opts=" --restart=no -h httpd-bundle-0 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-0:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-0:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-0:/var/log -p 192.168.20.188:80:80 -p 192.168.20.188:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
+ <attributes CRM_meta_interval="60000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0,/var/log/pacemaker/bundles/httpd-bundle-0" reuse="false" run_cmd="/usr/sbin/pacemaker-remoted" run_opts=" --restart=no -h httpd-bundle-0 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-0:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-0:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-0:/var/log -p 192.168.20.188:80:80 -p 192.168.20.188:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="6" operation="start" operation_key="httpd-bundle-docker-0_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="12">
<action_set>
<rsc_op id="19" operation="start" operation_key="httpd-bundle-0_start_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-0" class="ocf" provider="pacemaker" type="remote"/>
<attributes CRM_meta_container="httpd-bundle-docker-0" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" addr="192.168.20.188" port="3121"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="6" operation="start" operation_key="httpd-bundle-docker-0_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="18" operation="stop" operation_key="httpd-bundle-0_stop_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="13">
<action_set>
<rsc_op id="18" operation="stop" operation_key="httpd-bundle-0_stop_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-0" class="ocf" provider="pacemaker" type="remote"/>
<attributes CRM_meta_container="httpd-bundle-docker-0" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" addr="192.168.20.188" port="3121"/>
<downed>
<node id="httpd-bundle-0"/>
</downed>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="46" operation="stop" operation_key="httpd_stop_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test"/>
</trigger>
</inputs>
</synapse>
<synapse id="14">
<action_set>
<rsc_op id="3" operation="monitor" operation_key="httpd-bundle-0_monitor_30000" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-0" class="ocf" provider="pacemaker" type="remote"/>
<attributes CRM_meta_container="httpd-bundle-docker-0" CRM_meta_interval="30000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="30000" addr="192.168.20.188" port="3121"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="19" operation="start" operation_key="httpd-bundle-0_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="15">
<action_set>
<rsc_op id="21" operation="monitor" operation_key="httpd-bundle-ip-192.168.20.189_monitor_60000" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-ip-192.168.20.189" class="ocf" provider="heartbeat" type="IPaddr2"/>
<attributes CRM_meta_interval="60000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" cidr_netmask="24" ip="192.168.20.189" nic="ens192"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="20" operation="start" operation_key="httpd-bundle-ip-192.168.20.189_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="16">
<action_set>
<rsc_op id="20" operation="start" operation_key="httpd-bundle-ip-192.168.20.189_start_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-ip-192.168.20.189" class="ocf" provider="heartbeat" type="IPaddr2"/>
<attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" cidr_netmask="24" ip="192.168.20.189" nic="ens192"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="9" operation="monitor" operation_key="httpd-bundle-ip-192.168.20.189_monitor_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="17">
<action_set>
<rsc_op id="9" operation="monitor" operation_key="httpd-bundle-ip-192.168.20.189_monitor_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-ip-192.168.20.189" class="ocf" provider="heartbeat" type="IPaddr2"/>
<attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_op_target_rc="7" CRM_meta_timeout="20000" cidr_netmask="24" ip="192.168.20.189" nic="ens192"/>
</rsc_op>
</action_set>
<inputs/>
</synapse>
<synapse id="18">
<action_set>
<rsc_op id="23" operation="monitor" operation_key="httpd-bundle-docker-1_monitor_60000" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-docker-1" class="ocf" provider="heartbeat" type="docker"/>
- <attributes CRM_meta_interval="60000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1" reuse="false" run_cmd="/usr/sbin/pacemaker_remoted" run_opts=" --restart=no -h httpd-bundle-1 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-1:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-1:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-1:/var/log -p 192.168.20.189:80:80 -p 192.168.20.189:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
+ <attributes CRM_meta_interval="60000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1" reuse="false" run_cmd="/usr/sbin/pacemaker-remoted" run_opts=" --restart=no -h httpd-bundle-1 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-1:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-1:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-1:/var/log -p 192.168.20.189:80:80 -p 192.168.20.189:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="22" operation="start" operation_key="httpd-bundle-docker-1_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="19">
<action_set>
<rsc_op id="22" operation="start" operation_key="httpd-bundle-docker-1_start_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-docker-1" class="ocf" provider="heartbeat" type="docker"/>
- <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1" reuse="false" run_cmd="/usr/sbin/pacemaker_remoted" run_opts=" --restart=no -h httpd-bundle-1 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-1:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-1:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-1:/var/log -p 192.168.20.189:80:80 -p 192.168.20.189:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
+ <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1" reuse="false" run_cmd="/usr/sbin/pacemaker-remoted" run_opts=" --restart=no -h httpd-bundle-1 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-1:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-1:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-1:/var/log -p 192.168.20.189:80:80 -p 192.168.20.189:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="10" operation="monitor" operation_key="httpd-bundle-docker-1_monitor_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="20" operation="start" operation_key="httpd-bundle-ip-192.168.20.189_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<pseudo_event id="32" operation="start" operation_key="httpd-bundle_start_0"/>
</trigger>
</inputs>
</synapse>
<synapse id="20">
<action_set>
<rsc_op id="10" operation="monitor" operation_key="httpd-bundle-docker-1_monitor_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-docker-1" class="ocf" provider="heartbeat" type="docker"/>
- <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_op_target_rc="7" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1" reuse="false" run_cmd="/usr/sbin/pacemaker_remoted" run_opts=" --restart=no -h httpd-bundle-1 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-1:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-1:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-1:/var/log -p 192.168.20.189:80:80 -p 192.168.20.189:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
+ <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_op_target_rc="7" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1,/var/log/pacemaker/bundles/httpd-bundle-1" reuse="false" run_cmd="/usr/sbin/pacemaker-remoted" run_opts=" --restart=no -h httpd-bundle-1 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-1:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-1:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-1:/var/log -p 192.168.20.189:80:80 -p 192.168.20.189:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
</rsc_op>
</action_set>
<inputs/>
</synapse>
<synapse id="21">
<action_set>
<rsc_op id="25" operation="monitor" operation_key="httpd-bundle-1_monitor_30000" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-1" class="ocf" provider="pacemaker" type="remote"/>
<attributes CRM_meta_container="httpd-bundle-docker-1" CRM_meta_interval="30000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="30000" addr="192.168.20.189" port="3121"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="24" operation="start" operation_key="httpd-bundle-1_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="22">
<action_set>
<rsc_op id="24" operation="start" operation_key="httpd-bundle-1_start_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-1" class="ocf" provider="pacemaker" type="remote"/>
<attributes CRM_meta_container="httpd-bundle-docker-1" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" addr="192.168.20.189" port="3121"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="11" operation="monitor" operation_key="httpd-bundle-1_monitor_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="22" operation="start" operation_key="httpd-bundle-docker-1_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="23">
<action_set>
<rsc_op id="11" operation="monitor" operation_key="httpd-bundle-1_monitor_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-1" class="ocf" provider="pacemaker" type="remote"/>
<attributes CRM_meta_container="httpd-bundle-docker-1" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_op_target_rc="7" CRM_meta_timeout="30000" addr="192.168.20.189" port="3121"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="22" operation="start" operation_key="httpd-bundle-docker-1_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="24">
<action_set>
<rsc_op id="27" operation="monitor" operation_key="httpd-bundle-ip-192.168.20.190_monitor_60000" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-ip-192.168.20.190" class="ocf" provider="heartbeat" type="IPaddr2"/>
<attributes CRM_meta_interval="60000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" cidr_netmask="24" ip="192.168.20.190" nic="ens192"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="26" operation="start" operation_key="httpd-bundle-ip-192.168.20.190_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="25">
<action_set>
<rsc_op id="26" operation="start" operation_key="httpd-bundle-ip-192.168.20.190_start_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-ip-192.168.20.190" class="ocf" provider="heartbeat" type="IPaddr2"/>
<attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" cidr_netmask="24" ip="192.168.20.190" nic="ens192"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="12" operation="monitor" operation_key="httpd-bundle-ip-192.168.20.190_monitor_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="26">
<action_set>
<rsc_op id="12" operation="monitor" operation_key="httpd-bundle-ip-192.168.20.190_monitor_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-ip-192.168.20.190" class="ocf" provider="heartbeat" type="IPaddr2"/>
<attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_op_target_rc="7" CRM_meta_timeout="20000" cidr_netmask="24" ip="192.168.20.190" nic="ens192"/>
</rsc_op>
</action_set>
<inputs/>
</synapse>
<synapse id="27">
<action_set>
<rsc_op id="29" operation="monitor" operation_key="httpd-bundle-docker-2_monitor_60000" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-docker-2" class="ocf" provider="heartbeat" type="docker"/>
- <attributes CRM_meta_interval="60000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2" reuse="false" run_cmd="/usr/sbin/pacemaker_remoted" run_opts=" --restart=no -h httpd-bundle-2 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-2:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-2:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-2:/var/log -p 192.168.20.190:80:80 -p 192.168.20.190:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
+ <attributes CRM_meta_interval="60000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2" reuse="false" run_cmd="/usr/sbin/pacemaker-remoted" run_opts=" --restart=no -h httpd-bundle-2 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-2:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-2:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-2:/var/log -p 192.168.20.190:80:80 -p 192.168.20.190:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="28" operation="start" operation_key="httpd-bundle-docker-2_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="28">
<action_set>
<rsc_op id="28" operation="start" operation_key="httpd-bundle-docker-2_start_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-docker-2" class="ocf" provider="heartbeat" type="docker"/>
- <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2" reuse="false" run_cmd="/usr/sbin/pacemaker_remoted" run_opts=" --restart=no -h httpd-bundle-2 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-2:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-2:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-2:/var/log -p 192.168.20.190:80:80 -p 192.168.20.190:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
+ <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2" reuse="false" run_cmd="/usr/sbin/pacemaker-remoted" run_opts=" --restart=no -h httpd-bundle-2 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-2:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-2:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-2:/var/log -p 192.168.20.190:80:80 -p 192.168.20.190:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="13" operation="monitor" operation_key="httpd-bundle-docker-2_monitor_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="26" operation="start" operation_key="httpd-bundle-ip-192.168.20.190_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<pseudo_event id="32" operation="start" operation_key="httpd-bundle_start_0"/>
</trigger>
</inputs>
</synapse>
<synapse id="29">
<action_set>
<rsc_op id="13" operation="monitor" operation_key="httpd-bundle-docker-2_monitor_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-docker-2" class="ocf" provider="heartbeat" type="docker"/>
- <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_op_target_rc="7" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2" reuse="false" run_cmd="/usr/sbin/pacemaker_remoted" run_opts=" --restart=no -h httpd-bundle-2 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-2:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-2:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-2:/var/log -p 192.168.20.190:80:80 -p 192.168.20.190:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
+ <attributes CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_op_target_rc="7" CRM_meta_timeout="20000" allow_pull="true" force_kill="false" image="pcmktest:http" monitor_cmd="/bin/true" mount_points="/var/local/containers/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2,/var/log/pacemaker/bundles/httpd-bundle-2" reuse="false" run_cmd="/usr/sbin/pacemaker-remoted" run_opts=" --restart=no -h httpd-bundle-2 -e PCMK_stderr=1 -e PCMK_remote_port=3121 -v /var/local/containers/httpd-bundle-2:/var/www/html:rw -v /var/log/pacemaker/bundles/httpd-bundle-2:/etc/httpd/logs:rw -v /etc/pacemaker/authkey:/etc/pacemaker/authkey -v /var/log/pacemaker/bundles/httpd-bundle-2:/var/log -p 192.168.20.190:80:80 -p 192.168.20.190:3121:3121 --log-driver=journald --add-host=httpd-bundle-0:192.168.20.188 --add-host=httpd-bundle-1:192.168.20.189 --add-host=httpd-bundle-2:192.168.20.190"/>
</rsc_op>
</action_set>
<inputs/>
</synapse>
<synapse id="30">
<action_set>
<rsc_op id="31" operation="monitor" operation_key="httpd-bundle-2_monitor_30000" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-2" class="ocf" provider="pacemaker" type="remote"/>
<attributes CRM_meta_container="httpd-bundle-docker-2" CRM_meta_interval="30000" CRM_meta_name="monitor" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="30000" addr="192.168.20.190" port="3121"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="30" operation="start" operation_key="httpd-bundle-2_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="31">
<action_set>
<rsc_op id="30" operation="start" operation_key="httpd-bundle-2_start_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-2" class="ocf" provider="pacemaker" type="remote"/>
<attributes CRM_meta_container="httpd-bundle-docker-2" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_timeout="20000" addr="192.168.20.190" port="3121"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="14" operation="monitor" operation_key="httpd-bundle-2_monitor_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="28" operation="start" operation_key="httpd-bundle-docker-2_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="32">
<action_set>
<rsc_op id="14" operation="monitor" operation_key="httpd-bundle-2_monitor_0" on_node="rh74-test" on_node_uuid="3232287163">
<primitive id="httpd-bundle-2" class="ocf" provider="pacemaker" type="remote"/>
<attributes CRM_meta_container="httpd-bundle-docker-2" CRM_meta_on_node="rh74-test" CRM_meta_on_node_uuid="3232287163" CRM_meta_op_target_rc="7" CRM_meta_timeout="30000" addr="192.168.20.190" port="3121"/>
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="28" operation="start" operation_key="httpd-bundle-docker-2_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="33">
<action_set>
<rsc_op id="46" operation="stop" operation_key="httpd_stop_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test">
<primitive id="httpd" class="ocf" provider="heartbeat" type="apache"/>
<attributes CRM_meta_on_node="httpd-bundle-0" CRM_meta_on_node_uuid="httpd-bundle-0" CRM_meta_physical_host="rh74-test" CRM_meta_timeout="20000" />
</rsc_op>
</action_set>
<inputs/>
</synapse>
<synapse id="34">
<action_set>
<rsc_op id="7" operation="delete" operation_key="httpd_delete_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test">
<primitive id="httpd" class="ocf" provider="heartbeat" type="apache"/>
<attributes CRM_meta_on_node="httpd-bundle-0" CRM_meta_on_node_uuid="httpd-bundle-0" CRM_meta_timeout="20000" />
</rsc_op>
</action_set>
<inputs>
<trigger>
<rsc_op id="19" operation="start" operation_key="httpd-bundle-0_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="46" operation="stop" operation_key="httpd_stop_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test"/>
</trigger>
</inputs>
</synapse>
<synapse id="35" priority="1000000">
<action_set>
<pseudo_event id="35" operation="stopped" operation_key="httpd-bundle_stopped_0">
<attributes CRM_meta_timeout="20000" />
</pseudo_event>
</action_set>
<inputs>
<trigger>
<rsc_op id="17" operation="stop" operation_key="httpd-bundle-docker-0_stop_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
</inputs>
</synapse>
<synapse id="36">
<action_set>
<pseudo_event id="34" operation="stop" operation_key="httpd-bundle_stop_0">
<attributes CRM_meta_timeout="20000" />
</pseudo_event>
</action_set>
<inputs/>
</synapse>
<synapse id="37" priority="1000000">
<action_set>
<pseudo_event id="33" operation="running" operation_key="httpd-bundle_running_0">
<attributes CRM_meta_timeout="20000" />
</pseudo_event>
</action_set>
<inputs>
<trigger>
<rsc_op id="6" operation="start" operation_key="httpd-bundle-docker-0_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="22" operation="start" operation_key="httpd-bundle-docker-1_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="28" operation="start" operation_key="httpd-bundle-docker-2_start_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<pseudo_event id="43" operation="running" operation_key="httpd-bundle-clone_running_0"/>
</trigger>
</inputs>
</synapse>
<synapse id="38">
<action_set>
<pseudo_event id="32" operation="start" operation_key="httpd-bundle_start_0">
<attributes CRM_meta_timeout="20000" />
</pseudo_event>
</action_set>
<inputs/>
</synapse>
<synapse id="39">
<action_set>
<pseudo_event id="5" operation="all_stopped" operation_key="all_stopped">
<attributes />
</pseudo_event>
</action_set>
<inputs>
<trigger>
<rsc_op id="17" operation="stop" operation_key="httpd-bundle-docker-0_stop_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="18" operation="stop" operation_key="httpd-bundle-0_stop_0" on_node="rh74-test" on_node_uuid="3232287163"/>
</trigger>
<trigger>
<rsc_op id="46" operation="stop" operation_key="httpd_stop_0" on_node="httpd-bundle-0" on_node_uuid="httpd-bundle-0" router_node="rh74-test"/>
</trigger>
</inputs>
</synapse>
</transition_graph>
diff --git a/daemons/execd/Makefile.am b/daemons/execd/Makefile.am
index e5d3854d86..39820b61c5 100644
--- a/daemons/execd/Makefile.am
+++ b/daemons/execd/Makefile.am
@@ -1,50 +1,50 @@
#
# Copyright 2012-2018 David Vossel <davidvossel@gmail.com>
#
# This source code is licensed under the GNU Lesser General Public License
# version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
#
include $(top_srcdir)/Makefile.common
halibdir = $(CRM_DAEMON_DIR)
halib_PROGRAMS = pacemaker-execd cts-exec-helper
initdir = $(INITDIR)
init_SCRIPTS = pacemaker_remote
-sbin_PROGRAMS = pacemaker_remoted
+sbin_PROGRAMS = pacemaker-remoted
if BUILD_SYSTEMD
systemdunit_DATA = pacemaker_remote.service
endif
pacemaker_execd_CFLAGS = $(CFLAGS_HARDENED_EXE)
pacemaker_execd_LDFLAGS = $(LDFLAGS_HARDENED_EXE)
pacemaker_execd_LDADD = $(top_builddir)/lib/common/libcrmcommon.la \
$(top_builddir)/lib/services/libcrmservice.la \
$(top_builddir)/lib/fencing/libstonithd.la ${COMPAT_LIBS}
pacemaker_execd_SOURCES = pacemaker-execd.c execd_commands.c \
execd_alerts.c
pacemaker_remoted_CPPFLAGS = -DSUPPORT_REMOTE $(AM_CPPFLAGS)
pacemaker_remoted_CFLAGS = $(CFLAGS_HARDENED_EXE)
pacemaker_remoted_LDFLAGS = $(LDFLAGS_HARDENED_EXE)
pacemaker_remoted_LDADD = $(pacemaker_execd_LDADD) \
$(top_builddir)/lib/lrmd/liblrmd.la
pacemaker_remoted_SOURCES = $(pacemaker_execd_SOURCES) \
remoted_tls.c remoted_proxy.c
cts_exec_helper_LDADD = $(top_builddir)/lib/common/libcrmcommon.la \
$(top_builddir)/lib/lrmd/liblrmd.la \
$(top_builddir)/lib/cib/libcib.la \
$(top_builddir)/lib/services/libcrmservice.la \
$(top_builddir)/lib/pengine/libpe_status.la \
$(top_builddir)/pengine/libpengine.la
cts_exec_helper_SOURCES = cts-exec-helper.c
noinst_HEADERS = pacemaker-execd.h
CLEANFILES = $(man8_MANS)
diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c
index 57927bef07..4905fb19af 100644
--- a/daemons/execd/pacemaker-execd.c
+++ b/daemons/execd/pacemaker-execd.c
@@ -1,615 +1,615 @@
/*
* Copyright 2012-2018 David Vossel <davidvossel@gmail.com>
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <glib.h>
#include <unistd.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/services.h>
#include <crm/common/mainloop.h>
#include <crm/common/ipc.h>
#include <crm/common/ipcs.h>
#include "pacemaker-execd.h"
#if defined(HAVE_GNUTLS_GNUTLS_H) && defined(SUPPORT_REMOTE)
# define ENABLE_PCMK_REMOTE
#endif
GMainLoop *mainloop = NULL;
static qb_ipcs_service_t *ipcs = NULL;
stonith_t *stonith_api = NULL;
int lrmd_call_id = 0;
#ifdef ENABLE_PCMK_REMOTE
/* whether shutdown request has been sent */
static volatile sig_atomic_t shutting_down = FALSE;
/* timer for waiting for acknowledgment of shutdown request */
static volatile guint shutdown_ack_timer = 0;
static gboolean lrmd_exit(gpointer data);
#endif
static void
stonith_connection_destroy_cb(stonith_t * st, stonith_event_t * e)
{
stonith_api->state = stonith_disconnected;
crm_err("STONITH connection lost");
stonith_connection_failed();
}
stonith_t *
get_stonith_connection(void)
{
if (stonith_api && stonith_api->state == stonith_disconnected) {
stonith_api_delete(stonith_api);
stonith_api = NULL;
}
if (!stonith_api) {
int rc = 0;
int tries = 10;
stonith_api = stonith_api_new();
do {
rc = stonith_api->cmds->connect(stonith_api, "pacemaker-execd", NULL);
if (rc == pcmk_ok) {
stonith_api->cmds->register_notification(stonith_api,
T_STONITH_NOTIFY_DISCONNECT,
stonith_connection_destroy_cb);
break;
}
sleep(1);
tries--;
} while (tries);
if (rc) {
crm_err("Unable to connect to stonith daemon to execute command. error: %s",
pcmk_strerror(rc));
stonith_api_delete(stonith_api);
stonith_api = NULL;
}
}
return stonith_api;
}
static int32_t
lrmd_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid)
{
crm_trace("Connection %p", c);
if (crm_client_new(c, uid, gid) == NULL) {
return -EIO;
}
return 0;
}
static void
lrmd_ipc_created(qb_ipcs_connection_t * c)
{
crm_client_t *new_client = crm_client_get(c);
crm_trace("Connection %p", c);
CRM_ASSERT(new_client != NULL);
/* Now that the connection is officially established, alert
* the other clients that a new connection exists. */
notify_of_new_client(new_client);
}
static int32_t
lrmd_ipc_dispatch(qb_ipcs_connection_t * c, void *data, size_t size)
{
uint32_t id = 0;
uint32_t flags = 0;
crm_client_t *client = crm_client_get(c);
xmlNode *request = crm_ipcs_recv(client, data, size, &id, &flags);
CRM_CHECK(client != NULL, crm_err("Invalid client");
return FALSE);
CRM_CHECK(client->id != NULL, crm_err("Invalid client: %p", client);
return FALSE);
CRM_CHECK(flags & crm_ipc_client_response, crm_err("Invalid client request: %p", client);
return FALSE);
if (!request) {
return 0;
}
if (!client->name) {
const char *value = crm_element_value(request, F_LRMD_CLIENTNAME);
if (value == NULL) {
client->name = crm_itoa(crm_ipcs_client_pid(c));
} else {
client->name = strdup(value);
}
}
lrmd_call_id++;
if (lrmd_call_id < 1) {
lrmd_call_id = 1;
}
crm_xml_add(request, F_LRMD_CLIENTID, client->id);
crm_xml_add(request, F_LRMD_CLIENTNAME, client->name);
crm_xml_add_int(request, F_LRMD_CALLID, lrmd_call_id);
process_lrmd_message(client, id, request);
free_xml(request);
return 0;
}
/*!
* \internal
* \brief Free a client connection, and exit if appropriate
*
* \param[in] client Client connection to free
*/
void
lrmd_client_destroy(crm_client_t *client)
{
crm_client_destroy(client);
#ifdef ENABLE_PCMK_REMOTE
/* If we were waiting to shut down, we can now safely do so
* if there are no more proxied IPC providers
*/
if (shutting_down && (ipc_proxy_get_provider() == NULL)) {
lrmd_exit(NULL);
}
#endif
}
static int32_t
lrmd_ipc_closed(qb_ipcs_connection_t * c)
{
crm_client_t *client = crm_client_get(c);
if (client == NULL) {
return 0;
}
crm_trace("Connection %p", c);
client_disconnect_cleanup(client->id);
#ifdef ENABLE_PCMK_REMOTE
ipc_proxy_remove_provider(client);
#endif
lrmd_client_destroy(client);
return 0;
}
static void
lrmd_ipc_destroy(qb_ipcs_connection_t * c)
{
lrmd_ipc_closed(c);
crm_trace("Connection %p", c);
}
static struct qb_ipcs_service_handlers lrmd_ipc_callbacks = {
.connection_accept = lrmd_ipc_accept,
.connection_created = lrmd_ipc_created,
.msg_process = lrmd_ipc_dispatch,
.connection_closed = lrmd_ipc_closed,
.connection_destroyed = lrmd_ipc_destroy
};
int
lrmd_server_send_reply(crm_client_t * client, uint32_t id, xmlNode * reply)
{
crm_trace("Sending reply (%d) to client (%s)", id, client->id);
switch (client->kind) {
case CRM_CLIENT_IPC:
return crm_ipcs_send(client, id, reply, FALSE);
#ifdef ENABLE_PCMK_REMOTE
case CRM_CLIENT_TLS:
return lrmd_tls_send_msg(client->remote, reply, id, "reply");
#endif
default:
crm_err("Could not send reply: unknown client type %d",
client->kind);
}
return -ENOTCONN;
}
int
lrmd_server_send_notify(crm_client_t * client, xmlNode * msg)
{
crm_trace("Sending notification to client (%s)", client->id);
switch (client->kind) {
case CRM_CLIENT_IPC:
if (client->ipcs == NULL) {
crm_trace("Could not notify local client: disconnected");
return -ENOTCONN;
}
return crm_ipcs_send(client, 0, msg, crm_ipc_server_event);
#ifdef ENABLE_PCMK_REMOTE
case CRM_CLIENT_TLS:
if (client->remote == NULL) {
crm_trace("Could not notify remote client: disconnected");
return -ENOTCONN;
}
return lrmd_tls_send_msg(client->remote, msg, 0, "notify");
#endif
default:
crm_err("Could not notify client: unknown type %d", client->kind);
}
return -ENOTCONN;
}
/*!
* \internal
* \brief Clean up and exit immediately
*
* \param[in] data Ignored
*
* \return Doesn't return
* \note This can be used as a timer callback.
*/
static gboolean
lrmd_exit(gpointer data)
{
crm_info("Terminating with %d clients",
crm_hash_table_size(client_connections));
if (stonith_api) {
stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT);
stonith_api->cmds->disconnect(stonith_api);
stonith_api_delete(stonith_api);
}
if (ipcs) {
mainloop_del_ipc_server(ipcs);
}
#ifdef ENABLE_PCMK_REMOTE
lrmd_tls_server_destroy();
ipc_proxy_cleanup();
#endif
crm_client_cleanup();
g_hash_table_destroy(rsc_list);
if (mainloop) {
lrmd_drain_alerts(g_main_loop_get_context(mainloop));
}
crm_exit(CRM_EX_OK);
return FALSE;
}
/*!
* \internal
* \brief Request cluster shutdown if appropriate, otherwise exit immediately
*
* \param[in] nsig Signal that caused invocation (ignored)
*/
static void
lrmd_shutdown(int nsig)
{
#ifdef ENABLE_PCMK_REMOTE
crm_client_t *ipc_proxy = ipc_proxy_get_provider();
/* If there are active proxied IPC providers, then we may be running
* resources, so notify the cluster that we wish to shut down.
*/
if (ipc_proxy) {
if (shutting_down) {
crm_notice("Waiting for cluster to stop resources before exiting");
return;
}
crm_info("Sending shutdown request to cluster");
if (ipc_proxy_shutdown_req(ipc_proxy) < 0) {
crm_crit("Shutdown request failed, exiting immediately");
} else {
/* We requested a shutdown. Now, we need to wait for an
* acknowledgement from the proxy host (which ensures the proxy host
* supports shutdown requests), then wait for all proxy hosts to
* disconnect (which ensures that all resources have been stopped).
*/
shutting_down = TRUE;
/* Stop accepting new proxy connections */
lrmd_tls_server_destroy();
/* Older crmd versions will never acknowledge our request, so set a
* fairly short timeout to exit quickly in that case. If we get the
* ack, we'll defuse this timer.
*/
shutdown_ack_timer = g_timeout_add_seconds(20, lrmd_exit, NULL);
/* Currently, we let the OS kill us if the clients don't disconnect
* in a reasonable time. We could instead set a long timer here
* (shorter than what the OS is likely to use) and exit immediately
* if it pops.
*/
return;
}
}
#endif
lrmd_exit(NULL);
}
/*!
* \internal
* \brief Defuse short exit timer if shutting down
*/
void handle_shutdown_ack()
{
#ifdef ENABLE_PCMK_REMOTE
if (shutting_down) {
crm_info("Received shutdown ack");
if (shutdown_ack_timer > 0) {
g_source_remove(shutdown_ack_timer);
shutdown_ack_timer = 0;
}
return;
}
#endif
crm_debug("Ignoring unexpected shutdown ack");
}
/*!
* \internal
* \brief Make short exit timer fire immediately
*/
void handle_shutdown_nack()
{
#ifdef ENABLE_PCMK_REMOTE
if (shutting_down) {
crm_info("Received shutdown nack");
if (shutdown_ack_timer > 0) {
g_source_remove(shutdown_ack_timer);
shutdown_ack_timer = g_timeout_add(0, lrmd_exit, NULL);
}
return;
}
#endif
crm_debug("Ignoring unexpected shutdown nack");
}
static pid_t main_pid = 0;
static void
sigdone(void)
{
exit(CRM_EX_OK);
}
static void
sigreap(void)
{
pid_t pid = 0;
int status;
do {
/*
* Opinions seem to differ as to what to put here:
* -1, any child process
* 0, any child process whose process group ID is equal to that of the calling process
*/
pid = waitpid(-1, &status, WNOHANG);
if(pid == main_pid) {
/* Exit when pacemaker-remoted exits, and use the same return code */
if (WIFEXITED(status)) {
exit(WEXITSTATUS(status));
}
exit(CRM_EX_ERROR);
}
} while (pid > 0);
}
static struct {
int sig;
void (*handler)(void);
} sigmap[] = {
{ SIGCHLD, sigreap },
{ SIGINT, sigdone },
};
static void spawn_pidone(int argc, char **argv, char **envp)
{
sigset_t set;
if (getpid() != 1) {
return;
}
sigfillset(&set);
sigprocmask(SIG_BLOCK, &set, 0);
main_pid = fork();
switch (main_pid) {
case 0:
sigprocmask(SIG_UNBLOCK, &set, NULL);
setsid();
setpgid(0, 0);
- /* Child remains as pacemaker_remoted */
+ /* Child remains as pacemaker-remoted */
return;
case -1:
perror("fork");
}
/* Parent becomes the reaper of zombie processes */
/* Safe to initialize logging now if needed */
#ifdef HAVE___PROGNAME
/* Differentiate ourselves in the 'ps' output */
{
char *p;
int i, maxlen;
char *LastArgv = NULL;
const char *name = "pcmk-init";
for(i = 0; i < argc; i++) {
if(!i || (LastArgv + 1 == argv[i]))
LastArgv = argv[i] + strlen(argv[i]);
}
for(i = 0; envp[i] != NULL; i++) {
if((LastArgv + 1) == envp[i]) {
LastArgv = envp[i] + strlen(envp[i]);
}
}
maxlen = (LastArgv - argv[0]) - 2;
i = strlen(name);
/* We can overwrite individual argv[] arguments */
snprintf(argv[0], maxlen, "%s", name);
/* Now zero out everything else */
p = &argv[0][i];
while(p < LastArgv)
*p++ = '\0';
argv[1] = NULL;
}
#endif /* HAVE___PROGNAME */
while (1) {
int sig;
size_t i;
sigwait(&set, &sig);
for (i = 0; i < DIMOF(sigmap); i++) {
if (sigmap[i].sig == sig) {
sigmap[i].handler();
break;
}
}
}
}
/* *INDENT-OFF* */
static struct crm_option long_options[] = {
/* Top-level Options */
{"help", 0, 0, '?', "\tThis text"},
{"version", 0, 0, '$', "\tVersion information" },
{"verbose", 0, 0, 'V', "\tIncrease debug output"},
{"logfile", 1, 0, 'l', "\tSend logs to the additional named logfile"},
#ifdef ENABLE_PCMK_REMOTE
{"port", 1, 0, 'p', "\tPort to listen on"},
#endif
{0, 0, 0, 0}
};
/* *INDENT-ON* */
int
main(int argc, char **argv, char **envp)
{
int flag = 0;
int index = 0;
int bump_log_num = 0;
const char *option = NULL;
/* If necessary, create PID1 now before any FDs are opened */
spawn_pidone(argc, argv, envp);
#ifndef ENABLE_PCMK_REMOTE
crm_log_preinit("pacemaker-execd", argc, argv);
crm_set_options(NULL, "[options]", long_options,
- "Daemon for controlling services confirming to different standards");
+ "Resource agent executor daemon for cluster nodes");
#else
- crm_log_preinit("pacemaker_remoted", argc, argv);
+ crm_log_preinit("pacemaker-remoted", argc, argv);
crm_set_options(NULL, "[options]", long_options,
- "Pacemaker Remote daemon for extending pacemaker functionality to remote nodes.");
+ "Resource agent executor daemon for Pacemaker Remote nodes");
#endif
while (1) {
flag = crm_get_option(argc, argv, &index);
if (flag == -1) {
break;
}
switch (flag) {
case 'l':
crm_add_logfile(optarg);
break;
case 'p':
setenv("PCMK_remote_port", optarg, 1);
break;
case 'V':
bump_log_num++;
break;
case '?':
case '$':
crm_help(flag, CRM_EX_OK);
break;
default:
crm_help('?', CRM_EX_USAGE);
break;
}
}
crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE);
while (bump_log_num > 0) {
crm_bump_log_level(argc, argv);
bump_log_num--;
}
option = daemon_option("logfacility");
if(option && safe_str_neq(option, "none")) {
setenv("HA_LOGFACILITY", option, 1); /* Used by the ocf_log/ha_log OCF macro */
}
option = daemon_option("logfile");
if(option && safe_str_neq(option, "none")) {
setenv("HA_LOGFILE", option, 1); /* Used by the ocf_log/ha_log OCF macro */
if (daemon_option_enabled(crm_system_name, "debug")) {
setenv("HA_DEBUGLOG", option, 1); /* Used by the ocf_log/ha_debug OCF macro */
}
}
/* The presence of this variable allegedly controls whether child
* processes like httpd will try to use systemd's sd_notify
* API
*/
unsetenv("NOTIFY_SOCKET");
/* Used by RAs - Leave owned by root */
crm_build_path(CRM_RSCTMP_DIR, 0755);
rsc_list = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, free_rsc);
ipcs = mainloop_add_ipc_server(CRM_SYSTEM_LRMD, QB_IPC_SHM, &lrmd_ipc_callbacks);
if (ipcs == NULL) {
crm_err("Failed to create IPC server: shutting down and inhibiting respawn");
crm_exit(CRM_EX_FATAL);
}
#ifdef ENABLE_PCMK_REMOTE
if (lrmd_init_remote_tls_server() < 0) {
crm_err("Failed to create TLS listener: shutting down and staying down");
crm_exit(CRM_EX_FATAL);
}
ipc_proxy_init();
#endif
mainloop_add_signal(SIGTERM, lrmd_shutdown);
mainloop = g_main_loop_new(NULL, FALSE);
crm_info("Starting");
g_main_loop_run(mainloop);
/* should never get here */
lrmd_exit(NULL);
return CRM_EX_OK;
}
diff --git a/daemons/execd/pacemaker_remote.in b/daemons/execd/pacemaker_remote.in
index e07e1b9671..2096c5f39c 100644
--- a/daemons/execd/pacemaker_remote.in
+++ b/daemons/execd/pacemaker_remote.in
@@ -1,176 +1,176 @@
#!@BASH_PATH@
# Authors:
# Andrew Beekhof <abeekhof@redhat.com>
#
# License: Revised BSD
# chkconfig: - 99 01
# description: Pacemaker Cluster Manager
-# processname: pacemaker_remoted
+# processname: pacemaker-remoted
#
### BEGIN INIT INFO
# Provides: pacemaker_remote
# Required-Start: $network $remote_fs
# Should-Start: $syslog
# Required-Stop: $network $remote_fs
# Default-Start:
# Default-Stop:
-# Short-Description: Starts and stops the Pacemaker remote agent for non-cluster nodes
-# Description: Starts and stops the Pacemaker remote agent for non-cluster nodes
+# Short-Description: Manage the executor for Pacemaker Remote nodes
+# Description: Manage the executor for Pacemaker Remote nodes
### END INIT INFO
-desc="Pacemaker Remote Agent"
-prog="pacemaker_remoted"
+desc="Pacemaker Remote Executor"
+prog="pacemaker-remoted"
# set secure PATH
PATH="/sbin:/bin:/usr/sbin:/usr/bin:@sbindir@"
checkrc() {
if [ $? = 0 ]; then
success
else
failure
fi
}
success()
{
echo -ne "[ OK ]\r"
}
failure()
{
echo -ne "[FAILED]\r"
}
status()
{
pid=$(pidof $1 2>/dev/null)
local rtrn=$?
if [ $rtrn -ne 0 ]; then
echo "$1 is stopped"
if [ -f "@localstatedir@/run/$prog.pid" ]; then
rtrn=1
else
rtrn=3
fi
else
echo "$1 (pid $pid) is running..."
fi
return $rtrn
}
if [ -d @CONFIGDIR@ ]; then
[ -f @INITDIR@/functions ] && . @INITDIR@/functions
set -a
[ -f @CONFIGDIR@/pacemaker ] && . @CONFIGDIR@/pacemaker
[ -f @CONFIGDIR@/sbd ] && . @CONFIGDIR@/sbd
set +a
fi
LOCK_DIR="."
if [ -d "@localstatedir@/lock/subsys" ]; then
LOCK_DIR="@localstatedir@/lock/subsys"
elif [ -d "@localstatedir@/lock" ]; then
LOCK_DIR="@localstatedir@/lock"
fi
[ -z "$LOCK_FILE" ] && LOCK_FILE="$LOCK_DIR/pacemaker_remote"
# Check if there is a valid watchdog-device configured in sbd config
if [ x != "x$SBD_WATCHDOG_DEV" -a "/dev/null" != "$SBD_WATCHDOG_DEV" -a -c "$SBD_WATCHDOG_DEV" ]; then
# enhance for unavailable chkconfig - don't touch sbd for now
if chkconfig --list sbd_remote_helper 2>/dev/null | grep -q ":on"; then
SBD_SERVICE=sbd_remote_helper
fi
fi
start()
{
echo -n "Starting $desc: "
# Most recent distributions use tmpfs for @localstatedir@/run
# to avoid having to clean it up on every boot.
# They also assume that init scripts will create
# required subdirectories for proper operation.
mkdir -p "@localstatedir@/run"
if status $prog > /dev/null 2>&1; then
success
else
$prog > /dev/null 2>&1 &
# Time to connect to corosync and fail
sleep 5
if status $prog > /dev/null 2>&1; then
touch "$LOCK_FILE"
pidof $prog > "@localstatedir@/run/$prog.pid"
success
else
failure
rtrn=1
fi
fi
echo
[ "x$SBD_SERVICE" = "x" ] || service $SBD_SERVICE start
}
stop()
{
if status $prog > /dev/null 2>&1; then
echo -n "Signaling $desc to terminate: "
kill -TERM $(pidof $prog) > /dev/null 2>&1
success
echo
echo -n "Waiting for $desc to unload:"
while status $prog > /dev/null 2>&1; do
sleep 1
echo -n "."
done
else
echo -n "$desc is already stopped"
fi
rm -f "$LOCK_FILE"
rm -f "@localstatedir@/run/$prog.pid"
success
echo
[ "x$SBD_SERVICE" = "x" ] || service $SBD_SERVICE stop
}
rtrn=0
case "$1" in
start)
start
;;
restart|reload|force-reload)
stop
start
;;
condrestart|try-restart)
if status $prog > /dev/null 2>&1; then
stop
start
rtrn=$?
fi
;;
status)
status $prog
rtrn=$?
;;
stop)
stop
rtrn=$?
;;
*)
echo "usage: $0 {start|stop|restart|reload|force-reload|condrestart|try-restart|status}"
rtrn=2
;;
esac
exit $rtrn
diff --git a/daemons/execd/pacemaker_remote.service.in b/daemons/execd/pacemaker_remote.service.in
index 8269d796e8..961122b064 100644
--- a/daemons/execd/pacemaker_remote.service.in
+++ b/daemons/execd/pacemaker_remote.service.in
@@ -1,44 +1,44 @@
[Unit]
-Description=Pacemaker Remote Service
-Documentation=man:pacemaker_remoted http://clusterlabs.org/doc/en-US/Pacemaker/1.1-pcs/html/Pacemaker_Remote/index.html
+Description=Pacemaker Remote executor daemon
+Documentation=man:pacemaker-remoted http://clusterlabs.org/doc/en-US/Pacemaker/1.1-pcs/html/Pacemaker_Remote/index.html
# See main pacemaker unit file for descriptions of why these are needed
After=network.target
After=time-sync.target
After=dbus.service
Wants=dbus.service
After=resource-agents-deps.target
Wants=resource-agents-deps.target
After=syslog.service
After=rsyslog.service
[Install]
WantedBy=multi-user.target
[Service]
Type=simple
KillMode=process
NotifyAccess=none
EnvironmentFile=-@CONFIGDIR@/pacemaker
EnvironmentFile=-@CONFIGDIR@/sbd
-ExecStart=@sbindir@/pacemaker_remoted
+ExecStart=@sbindir@/pacemaker-remoted
# Systemd v227 and above can limit the number of processes spawned by a
# service. That is a bad idea for an HA cluster resource manager, so disable it
# by default. The administrator can create a local override if they really want
# a limit. If your systemd version does not support TasksMax, and you want to
# get rid of the resulting log warnings, comment out this option.
TasksMax=infinity
# Pacemaker Remote can exit only after all managed services have shut down;
# an HA database could conceivably take even longer than this
TimeoutStopSec=30min
TimeoutStartSec=30s
# Restart options include: no, on-success, on-failure, on-abort or always
Restart=on-failure
# crm_perror() writes directly to stderr, so ignore it here
# to avoid double-logging with the wrong format
StandardError=null
diff --git a/daemons/execd/remoted_tls.c b/daemons/execd/remoted_tls.c
index afd0c91ecf..9a14b072d0 100644
--- a/daemons/execd/remoted_tls.c
+++ b/daemons/execd/remoted_tls.c
@@ -1,384 +1,385 @@
/*
* Copyright 2012-2018 David Vossel <davidvossel@gmail.com>
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <glib.h>
#include <unistd.h>
#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/common/mainloop.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <arpa/inet.h>
#include "pacemaker-execd.h"
#ifdef HAVE_GNUTLS_GNUTLS_H
# define LRMD_REMOTE_AUTH_TIMEOUT 10000
gnutls_psk_server_credentials_t psk_cred_s;
gnutls_dh_params_t dh_params;
static int ssock = -1;
extern int lrmd_call_id;
static void
debug_log(int level, const char *str)
{
fputs(str, stderr);
}
static int
lrmd_remote_client_msg(gpointer data)
{
int id = 0;
int rc = 0;
int disconnected = 0;
xmlNode *request = NULL;
crm_client_t *client = data;
if (client->remote->tls_handshake_complete == FALSE) {
int rc = 0;
/* Multiple calls to handshake will be required; this callback
* will be invoked again once the client sends more handshake data. */
do {
rc = gnutls_handshake(*client->remote->tls_session);
if (rc < 0 && rc != GNUTLS_E_AGAIN) {
crm_err("TLS handshake with Pacemaker Remote failed");
return -1;
}
} while (rc == GNUTLS_E_INTERRUPTED);
if (rc == 0) {
crm_debug("TLS handshake with Pacemaker Remote completed");
client->remote->tls_handshake_complete = TRUE;
if (client->remote->auth_timeout) {
g_source_remove(client->remote->auth_timeout);
}
client->remote->auth_timeout = 0;
/* Alert other clients of the new connection */
notify_of_new_client(client);
}
return 0;
}
rc = crm_remote_ready(client->remote, 0);
if (rc == 0) {
/* no msg to read */
return 0;
} else if (rc < 0) {
crm_info("Client disconnected while polling it");
return -1;
}
crm_remote_recv(client->remote, -1, &disconnected);
request = crm_remote_parse_buffer(client->remote);
while (request) {
crm_element_value_int(request, F_LRMD_REMOTE_MSG_ID, &id);
crm_trace("processing request from remote client with remote msg id %d", id);
if (!client->name) {
const char *value = crm_element_value(request, F_LRMD_CLIENTNAME);
if (value) {
client->name = strdup(value);
}
}
lrmd_call_id++;
if (lrmd_call_id < 1) {
lrmd_call_id = 1;
}
crm_xml_add(request, F_LRMD_CLIENTID, client->id);
crm_xml_add(request, F_LRMD_CLIENTNAME, client->name);
crm_xml_add_int(request, F_LRMD_CALLID, lrmd_call_id);
process_lrmd_message(client, id, request);
free_xml(request);
/* process all the messages in the current buffer */
request = crm_remote_parse_buffer(client->remote);
}
if (disconnected) {
crm_info("Client disconnected while reading from it");
return -1;
}
return 0;
}
static void
lrmd_remote_client_destroy(gpointer user_data)
{
crm_client_t *client = user_data;
if (client == NULL) {
return;
}
crm_notice("Cleaning up after remote client %s disconnected "
CRM_XS " id=%s",
(client->name? client->name : ""), client->id);
ipc_proxy_remove_provider(client);
/* if this is the last remote connection, stop recurring
* operations */
if (crm_hash_table_size(client_connections) == 1) {
client_disconnect_cleanup(NULL);
}
if (client->remote->tls_session) {
void *sock_ptr;
int csock;
sock_ptr = gnutls_transport_get_ptr(*client->remote->tls_session);
csock = GPOINTER_TO_INT(sock_ptr);
gnutls_bye(*client->remote->tls_session, GNUTLS_SHUT_RDWR);
gnutls_deinit(*client->remote->tls_session);
gnutls_free(client->remote->tls_session);
close(csock);
}
lrmd_client_destroy(client);
return;
}
static gboolean
lrmd_auth_timeout_cb(gpointer data)
{
crm_client_t *client = data;
client->remote->auth_timeout = 0;
if (client->remote->tls_handshake_complete == TRUE) {
return FALSE;
}
mainloop_del_fd(client->remote->source);
client->remote->source = NULL;
crm_err("Remote client authentication timed out");
return FALSE;
}
static int
lrmd_remote_listen(gpointer data)
{
int csock = 0;
gnutls_session_t *session = NULL;
crm_client_t *new_client = NULL;
static struct mainloop_fd_callbacks lrmd_remote_fd_cb = {
.dispatch = lrmd_remote_client_msg,
.destroy = lrmd_remote_client_destroy,
};
csock = crm_remote_accept(ssock);
if (csock < 0) {
return TRUE;
}
session = create_psk_tls_session(csock, GNUTLS_SERVER, psk_cred_s);
if (session == NULL) {
crm_err("TLS session creation failed");
close(csock);
return TRUE;
}
new_client = crm_client_alloc(NULL);
new_client->remote = calloc(1, sizeof(crm_remote_t));
new_client->kind = CRM_CLIENT_TLS;
new_client->remote->tls_session = session;
new_client->remote->auth_timeout =
g_timeout_add(LRMD_REMOTE_AUTH_TIMEOUT, lrmd_auth_timeout_cb, new_client);
crm_notice("Client connection to Pacemaker Remote established "
CRM_XS " %p id: %s", new_client, new_client->id);
new_client->remote->source =
- mainloop_add_fd("lrmd-remote-client", G_PRIORITY_DEFAULT, csock, new_client,
- &lrmd_remote_fd_cb);
+ mainloop_add_fd("pacemaker-remote-client", G_PRIORITY_DEFAULT, csock,
+ new_client, &lrmd_remote_fd_cb);
return TRUE;
}
static void
lrmd_remote_connection_destroy(gpointer user_data)
{
crm_notice("Remote tls server disconnected");
return;
}
static int
lrmd_tls_server_key_cb(gnutls_session_t session, const char *username, gnutls_datum_t * key)
{
return lrmd_tls_set_key(key);
}
static int
bind_and_listen(struct addrinfo *addr)
{
int optval;
int fd;
int rc;
char buffer[INET6_ADDRSTRLEN] = { 0, };
crm_sockaddr2str(addr->ai_addr, buffer);
crm_trace("Attempting to bind on address %s", buffer);
fd = socket(addr->ai_family, addr->ai_socktype, addr->ai_protocol);
if (fd < 0) {
return -1;
}
/* reuse address */
optval = 1;
rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval));
if (rc < 0) {
crm_perror(LOG_INFO, "Couldn't allow the reuse of local addresses by our remote listener, bind address %s", buffer);
close(fd);
return -1;
}
if (addr->ai_family == AF_INET6) {
optval = 0;
rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &optval, sizeof(optval));
if (rc < 0) {
crm_perror(LOG_INFO, "Couldn't disable IPV6 only on address %s", buffer);
close(fd);
return -1;
}
}
if (bind(fd, addr->ai_addr, addr->ai_addrlen) != 0) {
close(fd);
return -1;
}
if (listen(fd, 10) == -1) {
crm_err("Can not start listen on address %s", buffer);
close(fd);
return -1;
}
crm_notice("Listening on address %s", buffer);
return fd;
}
int
lrmd_init_remote_tls_server()
{
int rc;
int filter;
int port = crm_default_remote_port();
struct addrinfo hints, *res = NULL, *iter;
char port_str[6]; // at most "65535"
gnutls_datum_t psk_key = { NULL, 0 };
static struct mainloop_fd_callbacks remote_listen_fd_callbacks = {
.dispatch = lrmd_remote_listen,
.destroy = lrmd_remote_connection_destroy,
};
crm_notice("Starting TLS listener on port %d", port);
crm_gnutls_global_init();
gnutls_global_set_log_function(debug_log);
gnutls_dh_params_init(&dh_params);
gnutls_dh_params_generate2(dh_params, 1024);
gnutls_psk_allocate_server_credentials(&psk_cred_s);
gnutls_psk_set_server_credentials_function(psk_cred_s, lrmd_tls_server_key_cb);
gnutls_psk_set_server_dh_params(psk_cred_s, dh_params);
/* The key callback won't get called until the first client connection
* attempt. Do it once here, so we can warn the user at start-up if we can't
* read the key. We don't error out, though, because it's fine if the key is
* going to be added later.
*/
rc = lrmd_tls_set_key(&psk_key);
if (rc != 0) {
crm_warn("A cluster connection will not be possible until the key is available");
}
memset(&hints, 0, sizeof(struct addrinfo));
/* Bind to the wildcard address (INADDR_ANY or IN6ADDR_ANY_INIT).
* @TODO allow user to specify a specific address
*/
hints.ai_flags = AI_PASSIVE;
hints.ai_family = AF_UNSPEC; /* Return IPv6 or IPv4 */
hints.ai_socktype = SOCK_STREAM;
hints.ai_protocol = IPPROTO_TCP;
snprintf(port_str, sizeof(port_str), "%d", port);
rc = getaddrinfo(NULL, port_str, &hints, &res);
if (rc) {
crm_err("Unable to get IP address info for local node: %s",
gai_strerror(rc));
return -1;
}
iter = res;
filter = AF_INET6;
/* Try IPv6 addresses first, then IPv4 */
while (iter) {
if (iter->ai_family == filter) {
ssock = bind_and_listen(iter);
}
if (ssock != -1) {
break;
}
iter = iter->ai_next;
if (iter == NULL && filter == AF_INET6) {
iter = res;
filter = AF_INET;
}
}
if (ssock < 0) {
crm_err("unable to bind to address");
goto init_remote_cleanup;
}
- mainloop_add_fd("lrmd-remote", G_PRIORITY_DEFAULT, ssock, NULL, &remote_listen_fd_callbacks);
+ mainloop_add_fd("pacemaker-remote-server", G_PRIORITY_DEFAULT, ssock, NULL,
+ &remote_listen_fd_callbacks);
rc = ssock;
init_remote_cleanup:
if (rc < 0) {
close(ssock);
ssock = 0;
}
freeaddrinfo(res);
return rc;
}
void
lrmd_tls_server_destroy(void)
{
if (psk_cred_s) {
gnutls_psk_free_server_credentials(psk_cred_s);
psk_cred_s = 0;
}
if (ssock > 0) {
close(ssock);
ssock = 0;
}
}
#endif
diff --git a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
index 9fda599e54..096e9c10e3 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
@@ -1,1446 +1,1450 @@
= Advanced Resource Types =
[[group-resources]]
== Groups - A Syntactic Shortcut ==
indexterm:[Group Resources]
indexterm:[Resource,Groups]
One of the most common elements of a cluster is a set of resources
that need to be located together, start sequentially, and stop in the
reverse order. To simplify this configuration, we support the concept
of groups.
.A group of two primitive resources
======
[source,XML]
-------
<group id="shortcut">
<primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
<instance_attributes id="params-public-ip">
<nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
</instance_attributes>
</primitive>
<primitive id="Email" class="lsb" type="exim"/>
</group>
-------
======
Although the example above contains only two resources, there is no
limit to the number of resources a group can contain. The example is
also sufficient to explain the fundamental properties of a group:
* Resources are started in the order they appear in (+Public-IP+
first, then +Email+)
* Resources are stopped in the reverse order to which they appear
(+Email+ first, then +Public-IP+)
If a resource in the group can't run anywhere, then nothing after it
is allowed to run, either:
* If +Public-IP+ can't run anywhere, neither can +Email+;
* but if +Email+ can't run anywhere, this does not affect +Public-IP+
in any way
The group above is logically equivalent to writing:
.How the cluster sees a group resource
======
[source,XML]
-------
<configuration>
<resources>
<primitive id="Public-IP" class="ocf" type="IPaddr" provider="heartbeat">
<instance_attributes id="params-public-ip">
<nvpair id="public-ip-addr" name="ip" value="192.0.2.2"/>
</instance_attributes>
</primitive>
<primitive id="Email" class="lsb" type="exim"/>
</resources>
<constraints>
<rsc_colocation id="xxx" rsc="Email" with-rsc="Public-IP" score="INFINITY"/>
<rsc_order id="yyy" first="Public-IP" then="Email"/>
</constraints>
</configuration>
-------
======
Obviously, as the group grows bigger, the reduced configuration effort
can become significant.
Another (typical) example of a group is a DRBD volume, the filesystem
mount, an IP address, and an application that uses them.
=== Group Properties ===
.Properties of a Group Resource
[width="95%",cols="3m,5<",options="header",align="center"]
|=========================================================
|Field
|Description
|id
|A unique name for the group
indexterm:[id,Group Resource Property]
indexterm:[Resource,Group Property,id]
|=========================================================
=== Group Options ===
Groups inherit the +priority+, +target-role+, and +is-managed+ properties
from primitive resources. See <<s-resource-options>> for information about
those properties.
=== Group Instance Attributes ===
Groups have no instance attributes. However, any that are set for the group
object will be inherited by the group's children.
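As a minimal illustrative sketch (the ids, agent, and attribute name here are
hypothetical), both members below would receive +an_attribute+ through the
usual +OCF_RESKEY_an_attribute+ environment variable:
.A group instance attribute inherited by its members
======
[source,XML]
-------
<group id="inherit-example">
  <instance_attributes id="inherit-example-attrs">
    <nvpair id="inherit-example-attr" name="an_attribute" value="a_value"/>
  </instance_attributes>
  <primitive id="Member-A" class="ocf" provider="heartbeat" type="Dummy"/>
  <primitive id="Member-B" class="ocf" provider="heartbeat" type="Dummy"/>
</group>
-------
======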
=== Group Contents ===
Groups may only contain a collection of cluster resources (see
<<primitive-resource>>). To refer to a child of a group resource, just use
the child's +id+ instead of the group's.
=== Group Constraints ===
Although it is possible to reference a group's children in
constraints, it is usually preferable to reference the group itself.
.Some constraints involving groups
======
[source,XML]
-------
<constraints>
<rsc_location id="group-prefers-node1" rsc="shortcut" node="node1" score="500"/>
<rsc_colocation id="webserver-with-group" rsc="Webserver" with-rsc="shortcut"/>
<rsc_order id="start-group-then-webserver" first="Webserver" then="shortcut"/>
</constraints>
-------
======
=== Group Stickiness ===
indexterm:[resource-stickiness,Groups]
Stickiness, the measure of how much a resource wants to stay where it
is, is additive in groups. Every active resource of the group will
contribute its stickiness value to the group's total. So if the
default +resource-stickiness+ is 100, and a group has seven members,
five of which are active, then the group as a whole will prefer its
current location with a score of 500.
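For instance, +resource-stickiness+ may be set as a meta-attribute on the
group itself, in which case each member inherits it (a minimal sketch; ids
are hypothetical):
.Setting stickiness on a group
======
[source,XML]
-------
<group id="sticky-group">
  <meta_attributes id="sticky-group-meta">
    <nvpair id="sticky-group-stickiness" name="resource-stickiness" value="100"/>
  </meta_attributes>
  <primitive id="Member-A" class="ocf" provider="heartbeat" type="Dummy"/>
  <primitive id="Member-B" class="ocf" provider="heartbeat" type="Dummy"/>
</group>
-------
======
With both members active, this group would prefer its current location with a
total score of 200.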
[[s-resource-clone]]
== Clones - Resources That Can Have Multiple Active Instances ==
indexterm:[Clone Resources]
indexterm:[Resource,Clones]
'Clone' resources are resources that can have more than one copy active at the
same time. This allows you, for example, to run a copy of a daemon on every
node. You can clone any primitive or group resource.
footnote:[
Of course, the service must support running multiple instances.
]
=== Anonymous versus Unique Clones ===
A clone resource is configured to be either 'anonymous' or 'globally unique'.
Anonymous clones are the simplest. These behave completely identically
everywhere they are running. Because of this, there can be only one instance of
an anonymous clone active per node.
The instances of globally unique clones are distinct entities. All instances
are launched identically, but one instance of the clone is not identical to any
other instance, whether running on the same node or a different node. As an
example, a cloned IP address can use special kernel functionality such that
each instance handles a subset of requests for the same IP address.
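A minimal sketch of a globally unique clone (ids and the address are
hypothetical; it assumes the +IPaddr2+ agent's ability to share one IP
address across instances via its +clusterip_hash+ parameter):
.A globally unique clone of an IP address
======
[source,XML]
-------
<clone id="ClusterIP-clone">
  <meta_attributes id="ClusterIP-clone-meta">
    <nvpair id="ClusterIP-gu" name="globally-unique" value="true"/>
    <nvpair id="ClusterIP-cm" name="clone-max" value="2"/>
    <nvpair id="ClusterIP-cnm" name="clone-node-max" value="2"/>
  </meta_attributes>
  <primitive id="ClusterIP" class="ocf" provider="heartbeat" type="IPaddr2">
    <instance_attributes id="ClusterIP-attrs">
      <nvpair id="ClusterIP-ip" name="ip" value="192.0.2.10"/>
      <nvpair id="ClusterIP-hash" name="clusterip_hash" value="sourceip"/>
    </instance_attributes>
  </primitive>
</clone>
-------
======
The clone meta-attributes used here are described in the clone options table
below.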
[[s-resource-promotable]]
=== Promotable clones ===
indexterm:[Promotable Clone Resources]
indexterm:[Resource,Promotable]
If a clone is 'promotable', its instances can perform a special role that
Pacemaker will manage via the +promote+ and +demote+ actions of the resource
agent.
Services that support such a special role have various terms for the special
role and the default role: primary and secondary, master and replica,
controller and worker, etc. Pacemaker uses the terms 'master' and 'slave',
footnote:[
These are historical terms that will eventually be replaced, but the extensive
use of them and the need for backward compatibility make it a long process.
You may see examples using a +master+ tag instead of a +clone+ tag with the
+promotable+ meta-attribute set to +true+; the +master+ tag is supported, but
deprecated, and will be removed in a future version. You may also see such
services referred to as 'multi-state' or 'stateful'; these mean the same thing
as 'promotable'.
]
but is agnostic to what the service calls them or what they do.
All that Pacemaker cares about is that an instance comes up in the default role
when started, and the resource agent supports the +promote+ and +demote+ actions
to manage entering and exiting the special role.
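A minimal sketch of a promotable clone, using the +promotable+ meta-attribute
described in the clone options table below (ids and the DRBD resource name
are hypothetical):
.A promotable clone of a DRBD device
======
[source,XML]
-------
<clone id="database-clone">
  <meta_attributes id="database-clone-meta">
    <nvpair id="database-promotable" name="promotable" value="true"/>
    <nvpair id="database-promoted-max" name="promoted-max" value="1"/>
  </meta_attributes>
  <primitive id="database" class="ocf" provider="linbit" type="drbd">
    <instance_attributes id="database-attrs">
      <nvpair id="database-res" name="drbd_resource" value="r0"/>
    </instance_attributes>
  </primitive>
</clone>
-------
======
Pacemaker starts every instance in the default role, then selects which
instances to promote via the agent's +promote+ action.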
=== Clone Properties ===
.Properties of a Clone Resource
[width="95%",cols="3m,5<",options="header",align="center"]
|=========================================================
|Field
|Description
|id
|A unique name for the clone
indexterm:[id,Clone Property]
indexterm:[Clone,Property,id]
|=========================================================
=== Clone Options ===
<<s-resource-options,Options>> inherited from primitive resources:
+priority, target-role, is-managed+
.Clone-specific configuration options
[width="95%",cols="1m,1,3<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|globally-unique
|false
|If +true+, each clone instance performs a distinct function
indexterm:[globally-unique,Clone Option]
indexterm:[Clone,Option,globally-unique]
|clone-max
|number of nodes in cluster
|The maximum number of clone instances that can be started across the entire
cluster
indexterm:[clone-max,Clone Option]
indexterm:[Clone,Option,clone-max]
|clone-node-max
|1
|If +globally-unique+ is +true+, the maximum number of clone instances that can
be started on a single node
indexterm:[clone-node-max,Clone Option]
indexterm:[Clone,Option,clone-node-max]
|clone-min
|0
|Require at least this number of clone instances to be runnable before allowing
resources depending on the clone to be runnable. A value of 0 means require
all clone instances to be runnable.
indexterm:[clone-min,Clone Option]
indexterm:[Clone,Option,clone-min]
|notify
|false
|Call the resource agent's +notify+ action for all active instances, before and
after starting or stopping any clone instance. The resource agent must support
this action. Allowed values: +false+, +true+
indexterm:[notify,Clone Option]
indexterm:[Clone,Option,notify]
|ordered
|false
|If +true+, clone instances must be started sequentially instead of in parallel.
Allowed values: +false+, +true+
indexterm:[ordered,Clone Option]
indexterm:[Clone,Option,ordered]
|interleave
|false
|When this clone is ordered relative to another clone, if this option is
+false+ (the default), the ordering is relative to 'all' instances of the
other clone, whereas if this option is +true+, the ordering is relative only
to instances on the same node.
Allowed values: +false+, +true+
indexterm:[interleave,Clone Option]
indexterm:[Clone,Option,interleave]
|promotable
|false
|If +true+, clone instances can perform a special role that Pacemaker will
manage via the resource agent's +promote+ and +demote+ actions. The resource
agent must support these actions.
Allowed values: +false+, +true+
indexterm:[promotable,Clone Option]
indexterm:[Clone,Option,promotable]
|promoted-max
|1
|If +promotable+ is +true+, the number of instances that can be promoted at one
time across the entire cluster
indexterm:[promoted-max,Clone Option]
indexterm:[Clone,Option,promoted-max]
|promoted-node-max
|1
|If +promotable+ is +true+ and +globally-unique+ is +false+, the number of
clone instances that can be promoted at one time on a single node
indexterm:[promoted-node-max,Clone Option]
indexterm:[Clone,Option,promoted-node-max]
|=========================================================
For backward compatibility, +master-max+ and +master-node-max+ are accepted as
aliases for +promoted-max+ and +promoted-node-max+, but are deprecated since
2.0.0, and support for them will be removed in a future version.
=== Clone Contents ===
Clones must contain exactly one primitive or group resource.
.A clone that runs a web server on all nodes
====
[source,XML]
----
<clone id="apache-clone">
<primitive id="apache" class="lsb" type="apache">
<operations>
<op id="apache-monitor" name="monitor" interval="30"/>
</operations>
</primitive>
</clone>
----
====
[WARNING]
You should never reference the name of a clone's child (the primitive or group
resource being cloned). If you think you need to do this, you probably need to
re-evaluate your design.
=== Clone Instance Attributes ===
Clones have no instance attributes; however, any that are set here will be
inherited by the clone's child.
=== Clone Constraints ===
In most cases, a clone will have a single instance on each active cluster
node. If this is not the case, you can indicate which nodes the
cluster should preferentially assign copies to with resource location
constraints. These constraints are written no differently from those
for primitive resources except that the clone's +id+ is used.
.Some constraints involving clones
======
[source,XML]
-------
<constraints>
<rsc_location id="clone-prefers-node1" rsc="apache-clone" node="node1" score="500"/>
<rsc_colocation id="stats-with-clone" rsc="apache-stats" with="apache-clone"/>
<rsc_order id="start-clone-then-stats" first="apache-clone" then="apache-stats"/>
</constraints>
-------
======
Ordering constraints behave slightly differently for clones. In the
example above, +apache-stats+ will wait until all copies of +apache-clone+
that need to be started have done so before being started itself.
Only if _no_ copies can be started will +apache-stats+ be prevented
from being active. Additionally, the clone will wait for
+apache-stats+ to be stopped before stopping itself.
Colocation of a primitive or group resource with a clone means that
the resource can run on any node with an active instance of the clone.
The cluster will choose an instance based on where the clone is running and
the resource's own location preferences.
Colocation between clones is also possible. If one clone +A+ is colocated
with another clone +B+, the set of allowed locations for +A+ is limited to
nodes on which +B+ is (or will be) active. Placement is then performed
normally.
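A minimal sketch of colocating one clone with another (ids are hypothetical):
.Colocating one clone with another
======
[source,XML]
-------
<constraints>
  <rsc_colocation id="A-with-B" rsc="A-clone" with-rsc="B-clone" score="INFINITY"/>
</constraints>
-------
======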
==== Promotable Clone Constraints ====
For promotable clone resources, the +first-action+ and/or +then-action+ fields
for ordering constraints may be set to +promote+ or +demote+ to constrain the
master role, and colocation constraints may contain +rsc-role+ and/or
+with-rsc-role+ fields.
.Additional colocation constraint options for promotable clone resources
[width="95%",cols="1m,1,3<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|rsc-role
|Started
|An additional attribute of colocation constraints that specifies the
role that +rsc+ must be in. Allowed values: +Started+, +Master+,
+Slave+.
indexterm:[rsc-role,Ordering Constraints]
indexterm:[Constraints,Ordering,rsc-role]
|with-rsc-role
|Started
|An additional attribute of colocation constraints that specifies the
role that +with-rsc+ must be in. Allowed values: +Started+,
+Master+, +Slave+.
indexterm:[with-rsc-role,Ordering Constraints]
indexterm:[Constraints,Ordering,with-rsc-role]
|=========================================================
.Constraints involving promotable clone resources
======
[source,XML]
-------
<constraints>
<rsc_location id="db-prefers-node1" rsc="database" node="node1" score="500"/>
<rsc_colocation id="backup-with-db-slave" rsc="backup"
with-rsc="database" with-rsc-role="Slave"/>
<rsc_colocation id="myapp-with-db-master" rsc="myApp"
with-rsc="database" with-rsc-role="Master"/>
<rsc_order id="start-db-before-backup" first="database" then="backup"/>
<rsc_order id="promote-db-then-app" first="database" first-action="promote"
then="myApp" then-action="start"/>
</constraints>
-------
======
In the example above, +myApp+ will wait until one of the database
copies has been started and promoted to master before being started
itself on the same node. Only if no copies can be promoted will +myApp+ be
prevented from being active. Additionally, the cluster will wait for
+myApp+ to be stopped before demoting the database.
Colocation of a primitive or group resource with a promotable clone
resource means that it can run on any node with an active instance of
the promotable clone resource that has the specified role (+master+ or
+slave+). In the example above, the cluster will choose a location based on
where database is running as a +master+, and if there are multiple
+master+ instances it will also factor in +myApp+'s own location
preferences when deciding which location to choose.
Colocation with regular clones and other promotable clone resources is also
possible. In such cases, the set of allowed locations for the +rsc+
clone is (after role filtering) limited to nodes on which the
+with-rsc+ promotable clone resource is (or will be) in the specified role.
Placement is then performed as normal.
==== Using Promotable Clone Resources in Colocation Sets ====
.Additional colocation set options relevant to promotable clone resources
[width="95%",cols="1m,1,6<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|role
|Started
|The role that 'all members' of the set must be in. Allowed values: +Started+, +Master+,
+Slave+.
indexterm:[role,Ordering Constraints]
indexterm:[Constraints,Ordering,role]
|=========================================================
In the following example, +B+'s master must be located on the same node as +A+'s master.
Additionally, resources +C+ and +D+ must be located on the same node as +A+'s
and +B+'s masters.
.Colocate C and D with A's and B's master instances
======
[source,XML]
-------
<constraints>
<rsc_colocation id="coloc-1" score="INFINITY" >
<resource_set id="colocated-set-example-1" sequential="true" role="Master">
<resource_ref id="A"/>
<resource_ref id="B"/>
</resource_set>
<resource_set id="colocated-set-example-2" sequential="true">
<resource_ref id="C"/>
<resource_ref id="D"/>
</resource_set>
</rsc_colocation>
</constraints>
-------
======
==== Using Promotable Clone Resources in Ordered Sets ====
.Additional ordered set options relevant to promotable clone resources
[width="95%",cols="1m,1,3<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|action
|value of +first-action+
|An additional attribute of ordering constraint sets that specifies the
action that applies to 'all members' of the set. Allowed
values: +start+, +stop+, +promote+, +demote+.
indexterm:[action,Ordering Constraints]
indexterm:[Constraints,Ordering,action]
|=========================================================
.Start C and D after first promoting A and B
======
[source,XML]
-------
<constraints>
<rsc_order id="order-1" score="INFINITY" >
<resource_set id="ordered-set-1" sequential="true" action="promote">
<resource_ref id="A"/>
<resource_ref id="B"/>
</resource_set>
<resource_set id="ordered-set-2" sequential="true" action="start">
<resource_ref id="C"/>
<resource_ref id="D"/>
</resource_set>
</rsc_order>
</constraints>
-------
======
In the above example, +B+ cannot be promoted to a master role until +A+ has
been promoted. Additionally, resources +C+ and +D+ must wait until +A+ and +B+
have been promoted before they can start.
[[s-clone-stickiness]]
=== Clone Stickiness ===
indexterm:[resource-stickiness,Clones]
To achieve a stable allocation pattern, clones are slightly sticky by
default. If no value for +resource-stickiness+ is provided, the clone
will use a value of 1. Being a small value, it causes minimal
disturbance to the score calculations of other resources but is enough
to prevent Pacemaker from needlessly moving copies around the cluster.
[NOTE]
====
For globally unique clones, this may result in multiple instances of the
clone staying on a single node, even after another eligible node becomes
active (for example, after being put into standby mode then made active again).
If you do not want this behavior, specify a +resource-stickiness+ of 0
for the clone temporarily and let the cluster adjust, then set it back
to 1 if you want the default behavior to apply again.
====
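For example, a minimal sketch of temporarily setting +resource-stickiness+ to 0
on a hypothetical clone (all ids here are illustrative):
[source,XML]
-------
<clone id="apache-clone">
  <meta_attributes id="apache-clone-meta">
    <nvpair id="apache-clone-stickiness"
            name="resource-stickiness" value="0"/>
  </meta_attributes>
  <primitive id="apache" class="ocf" provider="heartbeat" type="apache"/>
</clone>
-------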
=== Clone Resource Agent Requirements ===
Any resource can be used as an anonymous clone, as it requires no
additional support from the resource agent. Whether it makes sense to
do so depends on your resource and its resource agent.
==== Resource Agent Requirements for Globally Unique Clones ====
Globally unique clones require additional support in the resource agent. In
particular, it must only respond with +$\{OCF_SUCCESS}+ if the node has that
exact instance active. All other probes for instances of the clone should
result in +$\{OCF_NOT_RUNNING}+ (or one of the other OCF error codes if
the instance has failed).
Individual instances of a clone are identified by appending a colon and a
numerical offset, e.g. +apache:2+.
Resource agents can find out how many copies there are by examining
the +OCF_RESKEY_CRM_meta_clone_max+ environment variable and which
instance it is by examining +OCF_RESKEY_CRM_meta_clone+.
The resource agent must not make any assumptions (based on
+OCF_RESKEY_CRM_meta_clone+) about which numerical instances are active. In
particular, the list of active copies will not always be an unbroken
sequence, nor always start at 0.
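As a hedged sketch, a clone-aware agent might consult these variables as
follows (the surrounding agent is hypothetical):
[source,Bash]
-------
# Which instance is this, out of how many copies?
me="${OCF_RESKEY_CRM_meta_clone}"
total="${OCF_RESKEY_CRM_meta_clone_max}"
echo "probing instance ${me} of ${total}"
# Remember: do not infer from ${me} which other instances are active;
# the active set may be sparse and need not start at 0.
-------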
==== Resource Agent Requirements for Promotable Clones ====
Promotable clone resources require two extra actions, +demote+ and +promote+,
which are responsible for changing the state of the resource. Like +start+ and
+stop+, they should return +$\{OCF_SUCCESS}+ if they completed successfully or
a relevant error code if they did not.
The states can mean whatever you wish, but when the resource is
started, it must come up in the mode called +slave+. From there the
cluster will decide which instances to promote to +master+.
In addition to the clone requirements for monitor actions, agents must
also _accurately_ report which state they are in. The cluster relies
on the agent to report its status (including role) accurately and does
not indicate to the agent what role it currently believes it to be in.
.Role implications of OCF return codes
[width="95%",cols="1,1<",options="header",align="center"]
|=========================================================
|Monitor Return Code
|Description
|OCF_NOT_RUNNING
|Stopped
indexterm:[Return Code,OCF_NOT_RUNNING]
|OCF_SUCCESS
|Running (Slave)
indexterm:[Return Code,OCF_SUCCESS]
|OCF_RUNNING_MASTER
|Running (Master)
indexterm:[Return Code,OCF_RUNNING_MASTER]
|OCF_FAILED_MASTER
|Failed (Master)
indexterm:[Return Code,OCF_FAILED_MASTER]
|Other
|Failed (Slave)
|=========================================================
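A minimal monitor sketch for a hypothetical promotable agent follows;
+myapp_is_running+ and +myapp_role+ stand in for agent-specific checks, and the
+OCF_*+ return-code variables are assumed to be defined (for example, by
sourcing the OCF shell functions):
[source,Bash]
-------
myapp_monitor() {
    # Not running at all on this node
    myapp_is_running || return $OCF_NOT_RUNNING

    # Report the role we are actually in; the cluster relies on this
    if [ "$(myapp_role)" = "master" ]; then
        return $OCF_RUNNING_MASTER
    fi
    return $OCF_SUCCESS   # running in the slave role
}
-------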
==== Clone Notifications ====
If the clone has the +notify+ meta-attribute set to +true+, and the resource
agent supports the +notify+ action, Pacemaker will call the action when
appropriate, passing a number of extra variables which, when combined with
additional context, can be used to calculate the current state of the cluster
and what is about to happen to it.
.Environment variables supplied with Clone notify actions
[width="95%",cols="5,3<",options="header",align="center"]
|=========================================================
|Variable
|Description
|OCF_RESKEY_CRM_meta_notify_type
|Allowed values: +pre+, +post+
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,type]
indexterm:[type,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_operation
|Allowed values: +start+, +stop+
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,operation]
indexterm:[operation,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_start_resource
|Resources to be started
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,start_resource]
indexterm:[start_resource,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_stop_resource
|Resources to be stopped
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,stop_resource]
indexterm:[stop_resource,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_active_resource
|Resources that are running
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,active_resource]
indexterm:[active_resource,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_inactive_resource
|Resources that are not running
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,inactive_resource]
indexterm:[inactive_resource,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_start_uname
|Nodes on which resources will be started
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,start_uname]
indexterm:[start_uname,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_stop_uname
|Nodes on which resources will be stopped
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,stop_uname]
indexterm:[stop_uname,Notification Environment Variable]
|OCF_RESKEY_CRM_meta_notify_active_uname
|Nodes on which resources are running
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,active_uname]
indexterm:[active_uname,Notification Environment Variable]
|=========================================================
The variables come in pairs, such as
+OCF_RESKEY_CRM_meta_notify_start_resource+ and
+OCF_RESKEY_CRM_meta_notify_start_uname+, and should be treated as
arrays of whitespace-separated elements.
+OCF_RESKEY_CRM_meta_notify_inactive_resource+ is an exception, as the
matching +uname+ variable does not exist; inactive resources
are not running on any node.
Thus, to indicate that +clone:0+ will be started on +sles-1+,
+clone:2+ will be started on +sles-3+, and +clone:3+ will be started
on +sles-2+, the cluster would set:
.Notification variables
======
[source,Bash]
-------
OCF_RESKEY_CRM_meta_notify_start_resource="clone:0 clone:2 clone:3"
OCF_RESKEY_CRM_meta_notify_start_uname="sles-1 sles-3 sles-2"
-------
======
==== Interpretation of Notification Variables ====
.Pre-notification (stop):
* Active resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+
* Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
.Post-notification (stop) / Pre-notification (start):
* Active resources
** +$OCF_RESKEY_CRM_meta_notify_active_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Inactive resources
** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
.Post-notification (start):
* Active resources:
** +$OCF_RESKEY_CRM_meta_notify_active_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Inactive resources:
** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
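For example, a hedged Bash sketch of the set arithmetic above, listing which
instances remain active after a stop:
[source,Bash]
-------
# Succeeds if the first argument occurs among the remaining arguments
in_list() {
    local needle="$1" i
    shift
    for i in "$@"; do
        [ "$i" = "$needle" ] && return 0
    done
    return 1
}

# Active after stop = active_resource minus stop_resource
for rsc in $OCF_RESKEY_CRM_meta_notify_active_resource; do
    in_list "$rsc" $OCF_RESKEY_CRM_meta_notify_stop_resource \
        || echo "$rsc remains active"
done
-------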
==== Extra Notifications for Promotable Clones ====
.Extra environment variables supplied for promotable clones
[width="95%",cols="5,3<",options="header",align="center"]
|=========================================================
|Variable
|Description
|_OCF_RESKEY_CRM_meta_notify_master_resource_
|Resources that are running in +Master+ mode
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,master_resource]
indexterm:[master_resource,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_slave_resource_
|Resources that are running in +Slave+ mode
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,slave_resource]
indexterm:[slave_resource,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_promote_resource_
|Resources to be promoted
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,promote_resource]
indexterm:[promote_resource,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_demote_resource_
|Resources to be demoted
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,demote_resource]
indexterm:[demote_resource,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_promote_uname_
|Nodes on which resources will be promoted
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,promote_uname]
indexterm:[promote_uname,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_demote_uname_
|Nodes on which resources will be demoted
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,demote_uname]
indexterm:[demote_uname,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_master_uname_
|Nodes on which resources are running in +Master+ mode
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,master_uname]
indexterm:[master_uname,Notification Environment Variable]
|_OCF_RESKEY_CRM_meta_notify_slave_uname_
|Nodes on which resources are running in +Slave+ mode
indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,slave_uname]
indexterm:[slave_uname,Notification Environment Variable]
|=========================================================
==== Interpretation of Promotable Notification Variables ====
.Pre-notification (demote):
* +Active+ resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+
* +Master+ resources: +$OCF_RESKEY_CRM_meta_notify_master_resource+
* +Slave+ resources: +$OCF_RESKEY_CRM_meta_notify_slave_resource+
* Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
.Post-notification (demote) / Pre-notification (stop):
* +Active+ resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+
* +Master+ resources:
** +$OCF_RESKEY_CRM_meta_notify_master_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* +Slave+ resources: +$OCF_RESKEY_CRM_meta_notify_slave_resource+
* Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
.Post-notification (stop) / Pre-notification (start):
* +Active+ resources:
** +$OCF_RESKEY_CRM_meta_notify_active_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* +Master+ resources:
** +$OCF_RESKEY_CRM_meta_notify_master_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* +Slave+ resources:
** +$OCF_RESKEY_CRM_meta_notify_slave_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Inactive resources:
** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
.Post-notification (start) / Pre-notification (promote):
* +Active+ resources:
** +$OCF_RESKEY_CRM_meta_notify_active_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* +Master+ resources:
** +$OCF_RESKEY_CRM_meta_notify_master_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* +Slave+ resources:
** +$OCF_RESKEY_CRM_meta_notify_slave_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Inactive resources:
** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
.Post-notification (promote):
* +Active+ resources:
** +$OCF_RESKEY_CRM_meta_notify_active_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* +Master+ resources:
** +$OCF_RESKEY_CRM_meta_notify_master_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* +Slave+ resources:
** +$OCF_RESKEY_CRM_meta_notify_slave_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Inactive resources:
** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+
** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+
** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
* Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+
* Resources that were promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+
* Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+
* Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+
=== Monitoring Promotable Clone Resources ===
The usual monitor actions are insufficient to monitor a promotable clone
resource, because Pacemaker needs to verify not only that the resource is
active, but also that its actual role matches its intended one.
Define two monitoring actions: the usual one will cover the slave role,
and an additional one with +role="Master"+ will cover the master role.
.Monitoring both states of a promotable clone resource
======
[source,XML]
-------
<clone id="myMasterRsc">
<meta_attributes id="myMasterRsc-meta">
<nvpair name="promotable" value="true"/>
</meta_attributes>
<primitive id="myRsc" class="ocf" type="myApp" provider="myCorp">
<operations>
<op id="public-ip-slave-check" name="monitor" interval="60"/>
<op id="public-ip-master-check" name="monitor" interval="61" role="Master"/>
</operations>
</primitive>
</clone>
-------
======
[IMPORTANT]
===========
It is crucial that _every_ monitor operation has a different interval!
Pacemaker currently differentiates between operations
only by resource and interval; so if (for example) a promotable clone resource
had the same monitor interval for both roles, Pacemaker would ignore the
role when checking the status -- which would cause unexpected return
codes, and therefore unnecessary complications.
===========
[[s-promotion-scores]]
=== Determining Which Instance is Promoted ===
Pacemaker can choose a promotable clone instance to be promoted in one of two
ways:
* Promotion scores: These are node attributes set via the `crm_master` utility,
which generally would be called by the resource agent's start action if it
supports promotable clones. This tool automatically detects both the resource
and host, and should be used to set a preference for being promoted. Based on
this, +promoted-max+, and +promoted-node-max+, the instance(s) with the
highest preference will be promoted.
* Constraints: Location constraints can indicate which nodes are most preferred
as masters.
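As a hedged sketch of the first approach, an agent's start action might record
a promotion preference like this (the helper name and the score of 10 are
illustrative):
[source,Bash]
-------
myapp_start() {
    myapp_daemon_start || return $OCF_ERR_GENERIC
    # Prefer promoting this instance on this node; the preference is
    # cleared at reboot because of the lifetime setting
    crm_master -l reboot -v 10
    return $OCF_SUCCESS
}
-------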
.Explicitly preferring node1 to be promoted to master
======
[source,XML]
-------
<rsc_location id="master-location" rsc="myMasterRsc">
<rule id="master-rule" score="100" role="Master">
<expression id="master-exp" attribute="#uname" operation="eq" value="node1"/>
</rule>
</rsc_location>
-------
======
[[s-resource-bundle]]
== Bundles - Isolated Environments ==
indexterm:[bundle]
indexterm:[Resource,bundle]
indexterm:[Docker,bundle]
indexterm:[rkt,bundle]
Pacemaker supports a special syntax for launching a
https://en.wikipedia.org/wiki/Operating-system-level_virtualization[container]
with any infrastructure it requires: the 'bundle'.
Pacemaker bundles support https://www.docker.com/[Docker] and
https://coreos.com/rkt/[rkt] container technologies.
footnote:[Docker is a trademark of Docker, Inc. No endorsement by or
association with Docker, Inc. is implied.]
.A bundle for a containerized web server
====
[source,XML]
----
<bundle id="httpd-bundle">
<docker image="pcmk:http" replicas="3"/>
<network ip-range-start="192.168.122.131"
host-netmask="24"
host-interface="eth0">
<port-mapping id="httpd-port" port="80"/>
</network>
<storage>
<storage-mapping id="httpd-syslog"
source-dir="/dev/log"
target-dir="/dev/log"
options="rw"/>
<storage-mapping id="httpd-root"
source-dir="/srv/html"
target-dir="/var/www/html"
options="rw"/>
<storage-mapping id="httpd-logs"
source-dir-root="/var/log/pacemaker/bundles"
target-dir="/etc/httpd/logs"
options="rw"/>
</storage>
<primitive class="ocf" id="httpd" provider="heartbeat" type="apache"/>
</bundle>
----
====
=== Bundle Properties ===
.Properties of a Bundle
[width="95%",cols="3m,5<",options="header",align="center"]
|=========================================================
|Field
|Description
|id
|A unique name for the bundle (required)
indexterm:[id,bundle]
indexterm:[bundle,Property,id]
|description
|Arbitrary text (not used by Pacemaker)
indexterm:[description,bundle]
indexterm:[bundle,Property,description]
|=========================================================
A bundle must contain exactly one +<docker>+ or +<rkt>+ element.
=== Docker Properties ===
Before configuring a Docker bundle in Pacemaker, the user must install Docker
and supply a fully configured Docker image on every node allowed to run the
bundle.
Pacemaker will create an implicit +ocf:heartbeat:docker+ resource to manage
a bundle's Docker container. The user must ensure that the resource agent is
installed on every node allowed to run the bundle.
.Properties of a Bundle's Docker Element
[width="95%",cols="3m,4,5<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|image
|
|Docker image tag (required)
indexterm:[image,Docker]
indexterm:[Docker,Property,image]
|replicas
|Value of +promoted-max+ if that is positive, else 1
|A positive integer specifying the number of container instances to launch
indexterm:[replicas,Docker]
indexterm:[Docker,Property,replicas]
|replicas-per-host
|1
|A positive integer specifying the number of container instances allowed to run
on a single node
indexterm:[replicas-per-host,Docker]
indexterm:[Docker,Property,replicas-per-host]
|promoted-max
|0
|A non-negative integer that, if positive, indicates that the containerized
service should be treated as a promotable service, with this many replicas
allowed to run the service in the master role
indexterm:[promoted-max,Docker]
indexterm:[Docker,Property,promoted-max]
|network
|
|If specified, this will be passed to +docker run+ as the
https://docs.docker.com/engine/reference/run/#network-settings[network setting]
for the Docker container.
indexterm:[network,Docker]
indexterm:[Docker,Property,network]
|run-command
-|`/usr/sbin/pacemaker_remoted` if bundle contains a +primitive+, otherwise none
+|`/usr/sbin/pacemaker-remoted` if bundle contains a +primitive+, otherwise none
|This command will be run inside the container when launching it ("PID 1"). If
- the bundle contains a +primitive+, this command 'must' start pacemaker_remoted
- (but could, for example, be a script that does other stuff, too).
+ the bundle contains a +primitive+, this command 'must' start pacemaker-remoted
+ (but could, for example, be a script that does other stuff, too). If the
+ container image has a pre-2.0.0 version of Pacemaker, set this to
+ +/usr/sbin/pacemaker_remoted+ (note the underscore instead of the dash).
indexterm:[run-command,Docker]
indexterm:[Docker,Property,run-command]
|options
|
|Extra command-line options to pass to `docker run`
indexterm:[options,Docker]
indexterm:[Docker,Property,options]
|=========================================================
For backward compatibility, +masters+ is accepted as an alias for
+promoted-max+, but is deprecated since 2.0.0, and support for it will be
removed in a future version.
=== rkt Properties ===
Before configuring a rkt bundle in Pacemaker, the user must install rkt
and supply a fully configured container image on every node allowed to run the
bundle.
Pacemaker will create an implicit +ocf:heartbeat:rkt+ resource to manage
a bundle's rkt container. The user must ensure that the resource agent is
installed on every node allowed to run the bundle.
.Properties of a Bundle's rkt Element
[width="95%",cols="3m,4,5<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|image
|
|Container image tag (required)
indexterm:[image,rkt]
indexterm:[rkt,Property,image]
|replicas
|Value of +promoted-max+ if that is positive, else 1
|A positive integer specifying the number of container instances to launch
indexterm:[replicas,rkt]
indexterm:[rkt,Property,replicas]
|replicas-per-host
|1
|A positive integer specifying the number of container instances allowed to run
on a single node
indexterm:[replicas-per-host,rkt]
indexterm:[rkt,Property,replicas-per-host]
|promoted-max
|0
|A non-negative integer that, if positive, indicates that the containerized
service should be treated as a promotable service, with this many replicas
allowed to run the service in the master role
indexterm:[promoted-max,rkt]
indexterm:[rkt,Property,promoted-max]
|network
|
|If specified, this will be passed to +rkt run+ as the
network setting for the rkt container.
indexterm:[network,rkt]
indexterm:[rkt,Property,network]
|run-command
-|`/usr/sbin/pacemaker_remoted` if bundle contains a +primitive+, otherwise none
+|`/usr/sbin/pacemaker-remoted` if bundle contains a +primitive+, otherwise none
|This command will be run inside the container when launching it ("PID 1"). If
- the bundle contains a +primitive+, this command 'must' start pacemaker_remoted
- (but could, for example, be a script that does other stuff, too).
+ the bundle contains a +primitive+, this command 'must' start pacemaker-remoted
+ (but could, for example, be a script that does other stuff, too). If the
+ container image has a pre-2.0.0 version of Pacemaker, set this to
+ +/usr/sbin/pacemaker_remoted+ (note the underscore instead of the dash).
indexterm:[run-command,rkt]
indexterm:[rkt,Property,run-command]
|options
|
|Extra command-line options to pass to `rkt run`
indexterm:[options,rkt]
indexterm:[rkt,Property,options]
|=========================================================
For backward compatibility, +masters+ is accepted as an alias for
+promoted-max+, but is deprecated since 2.0.0, and support for it will be
removed in a future version.
=== Bundle Network Properties ===
A bundle may optionally contain one +<network>+ element.
indexterm:[bundle,network]
.Properties of a Bundle's Network Element
[width="95%",cols="2m,1,4<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|add-host
|TRUE
|If TRUE, and +ip-range-start+ is used, Pacemaker will automatically ensure
that +/etc/hosts+ inside the containers has entries for each
<<s-resource-bundle-note-replica-names,replica name>> and its assigned IP.
indexterm:[add-host,network]
indexterm:[network,Property,add-host]
|ip-range-start
|
|If specified, Pacemaker will create an implicit +ocf:heartbeat:IPaddr2+
resource for each container instance, starting with this IP address,
using up to +replicas+ sequential addresses. These addresses can be used
from the host's network to reach the service inside the container, though
the address is not visible within the container itself. Only IPv4 addresses are
currently supported.
indexterm:[ip-range-start,network]
indexterm:[network,Property,ip-range-start]
|host-netmask
|32
|If +ip-range-start+ is specified, the IP addresses are created with this
CIDR netmask (as a number of bits).
indexterm:[host-netmask,network]
indexterm:[network,Property,host-netmask]
|host-interface
|
|If +ip-range-start+ is specified, the IP addresses are created on this
host interface (by default, it will be determined from the IP address).
indexterm:[host-interface,network]
indexterm:[network,Property,host-interface]
|control-port
|3121
|If the bundle contains a +primitive+, the cluster will use this integer TCP
port for communication with Pacemaker Remote inside the container. Changing
this is useful when the container is unable to listen on the default port,
for example, when the container uses the host's network rather than
+ip-range-start+ (in which case +replicas-per-host+ must be 1), or when the
bundle may run on a Pacemaker Remote node that is already listening on the
default port. Any +PCMK_remote_port+ environment variable set on the host or in
the container is ignored for bundle connections.
indexterm:[control-port,network]
indexterm:[network,Property,control-port]
|=========================================================
[[s-resource-bundle-note-replica-names]]
[NOTE]
====
Replicas are named by the bundle id plus a dash and an integer counter starting
with zero. For example, if a bundle named +httpd-bundle+ has +replicas=2+, its
containers will be named +httpd-bundle-0+ and +httpd-bundle-1+.
====
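For example, a hedged sketch of a bundle that shares the host's network and
therefore sets a non-default +control-port+ (the image name and port number are
illustrative):
[source,XML]
-------
<bundle id="httpd-hostnet-bundle">
  <docker image="pcmk:http" replicas="1" options="--net=host"/>
  <network control-port="3122"/>
  <primitive class="ocf" id="httpd-hostnet" provider="heartbeat" type="apache"/>
</bundle>
-------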
Additionally, a +<network>+ element may optionally contain one or more
+<port-mapping>+ elements.
indexterm:[bundle,network,port-mapping]
.Properties of a Bundle's Port-Mapping Element
[width="95%",cols="2m,1,4<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|id
|
|A unique name for the port mapping (required)
indexterm:[id,port-mapping]
indexterm:[port-mapping,Property,id]
|port
|
|If this is specified, connections to this TCP port number on the host network
(on the container's assigned IP address, if +ip-range-start+ is specified)
will be forwarded to the container network. Exactly one of +port+ or +range+
must be specified in a +port-mapping+.
indexterm:[port,port-mapping]
indexterm:[port-mapping,Property,port]
|internal-port
|value of +port+
|If this and +port+ are both specified, connections to +port+ on the host's network
will be forwarded to this port on the container network.
indexterm:[internal-port,port-mapping]
indexterm:[port-mapping,Property,internal-port]
|range
|
|If this is specified, connections to these TCP port numbers (expressed as
'first_port'-'last_port') on the host network (on the container's assigned IP
address, if +ip-range-start+ is specified) will be forwarded to the same ports
in the container network. Exactly one of +port+ or +range+ must be specified
in a +port-mapping+.
indexterm:[range,port-mapping]
indexterm:[port-mapping,Property,range]
|=========================================================
[NOTE]
====
If the bundle contains a +primitive+, Pacemaker will automatically map the
+control-port+, so it is not necessary to specify that port in a
+port-mapping+.
====
=== Bundle Storage Properties ===
A bundle may optionally contain one +<storage>+ element. A +<storage>+ element
has no properties of its own, but may contain one or more +<storage-mapping>+
elements.
indexterm:[bundle,storage,storage-mapping]
.Properties of a Bundle's Storage-Mapping Element
[width="95%",cols="2m,1,4<",options="header",align="center"]
|=========================================================
|Field
|Default
|Description
|id
|
|A unique name for the storage mapping (required)
indexterm:[id,storage-mapping]
indexterm:[storage-mapping,Property,id]
|source-dir
|
|The absolute path on the host's filesystem that will be mapped into the
container. Exactly one of +source-dir+ and +source-dir-root+ must be specified
in a +storage-mapping+.
indexterm:[source-dir,storage-mapping]
indexterm:[storage-mapping,Property,source-dir]
|source-dir-root
|
|The start of a path on the host's filesystem that will be mapped into the
container, using a different subdirectory on the host for each container
instance. The subdirectory will be named the same as the
<<s-resource-bundle-note-replica-names,replica name>>.
Exactly one of +source-dir+ and +source-dir-root+ must be specified in a
+storage-mapping+.
indexterm:[source-dir-root,storage-mapping]
indexterm:[storage-mapping,Property,source-dir-root]
|target-dir
|
|The path name within the container where the host storage will be mapped
(required)
indexterm:[target-dir,storage-mapping]
indexterm:[storage-mapping,Property,target-dir]
|options
|
|File system mount options to use when mapping the storage
indexterm:[options,storage-mapping]
indexterm:[storage-mapping,Property,options]
|=========================================================
[NOTE]
====
Pacemaker does not define the behavior if the source directory does not already
exist on the host. However, it is expected that the container technology and/or
its resource agent will create the source directory in that case.
====
[NOTE]
====
If the bundle contains a +primitive+,
Pacemaker will automatically map the equivalent of
+source-dir=/etc/pacemaker/authkey target-dir=/etc/pacemaker/authkey+
and +source-dir-root=/var/log/pacemaker/bundles target-dir=/var/log+ into the
container, so it is not necessary to specify those paths in a
+storage-mapping+.
====
[IMPORTANT]
====
The +PCMK_authkey_location+ environment variable must not be set to anything
other than the default of `/etc/pacemaker/authkey` on any node in the cluster.
====
=== Bundle Primitive ===
A bundle may optionally contain one +<primitive>+ resource
(see <<s-resource-primitive>>). The primitive may have operations,
instance attributes and meta-attributes defined, as usual.
If a bundle contains a primitive resource, the container image must include
the Pacemaker Remote daemon, and at least one of +ip-range-start+ or
+control-port+ must be configured in the bundle. Pacemaker will create an
implicit +ocf:pacemaker:remote+ resource for the connection, launch
Pacemaker Remote within the container, and monitor and manage the primitive
resource via Pacemaker Remote.
If the bundle has more than one container instance (replica), the primitive
resource will function as an implicit clone (see <<s-resource-clone>>) --
a promotable clone if the bundle has +promoted-max+ greater than zero
(see <<s-resource-promotable>>).
[IMPORTANT]
====
Containers in bundles with a +primitive+ must have an accessible networking
environment, so that Pacemaker on the cluster nodes can contact
Pacemaker Remote inside the container. For example, the Docker option
`--net=none` should not be used with a +primitive+. The default (using a
distinct network space inside the container) works in combination with
+ip-range-start+. If the Docker option `--net=host` is used (making the
container share the host's network space), a unique +control-port+ should be
specified for each bundle. Any firewall must allow access to the
+control-port+.
====
[[s-bundle-attributes]]
=== Bundle Node Attributes ===
If the bundle has a +primitive+, the primitive's resource agent may want to set
node attributes such as <<s-promotion-scores,promotion scores>>. However, with
containers, it is not apparent which node should get the attribute.
If the container uses shared storage that is the same no matter which node the
container is hosted on, then it is appropriate to use the promotion score on the
bundle node itself.
On the other hand, if the container uses storage exported from the underlying host,
then it may be more appropriate to use the promotion score on the underlying host.
Since this depends on the particular situation, the
+container-attribute-target+ resource meta-attribute allows the user to specify
which approach to use. If it is set to +host+, then user-defined node attributes
will be checked on the underlying host. If it is anything else, the local node
(in this case the bundle node) is used as usual.
This only applies to user-defined attributes; the cluster will always check the
local node for cluster-defined attributes such as +#uname+.
If +container-attribute-target+ is +host+, the cluster will pass additional
environment variables to the primitive's resource agent that allow it to set
node attributes appropriately: +container_attribute_target+ (identical to the
meta-attribute value) and +physical_host+ (the name of the underlying host).
[NOTE]
====
It is up to the resource agent to check for the additional variables and use
them when setting node attributes.
====
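A minimal sketch of the meta-attribute as it might appear inside a bundle
definition (the ids here are illustrative):
[source,XML]
-------
<meta_attributes id="httpd-bundle-meta">
  <nvpair id="httpd-bundle-attr-target"
          name="container-attribute-target" value="host"/>
</meta_attributes>
-------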
=== Bundle Meta-Attributes ===
Any meta-attribute set on a bundle will be inherited by the bundle's
primitive and any resources implicitly created by Pacemaker for the bundle.
This includes options such as +priority+, +target-role+, and +is-managed+. See
<<s-resource-options>> for more information.
=== Limitations of Bundles ===
Restarting Pacemaker while a bundle is unmanaged or the cluster is in
maintenance mode may cause the bundle to fail.
Bundles may not be cloned or included in groups. This includes the bundle's
primitive and any resources implicitly created by Pacemaker for the bundle.
Bundles do not have instance attributes, utilization attributes, or operations,
though a bundle's primitive may have them.
A bundle with a primitive can run on a Pacemaker Remote node only if the bundle
uses a distinct +control-port+.
diff --git a/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt
index 4324824709..302b653768 100644
--- a/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt
+++ b/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt
@@ -1,305 +1,305 @@
= Remote Node Walk-through =
*What this tutorial is:* An in-depth walk-through of how to get Pacemaker to
integrate a remote node into the cluster as a node capable of running cluster
resources.
*What this tutorial is not:* A realistic deployment scenario. The steps shown
here are meant to get users familiar with the concept of remote nodes as
quickly as possible.
This tutorial requires three machines: two to act as cluster nodes, and
a third to act as the remote node.
== Configure Remote Node ==
=== Configure Firewall on Remote Node ===
Allow cluster-related services through the local firewall:
----
# firewall-cmd --permanent --add-service=high-availability
success
# firewall-cmd --reload
success
----
[NOTE]
======
If you are using iptables directly, or some other firewall solution besides
firewalld, simply open the following ports, which can be used by various
clustering components: TCP ports 2224, 3121, and 21064, and UDP port 5405.
If you run into any problems during testing, you might want to disable
the firewall and SELinux entirely until you have everything working.
This may create significant security issues and should not be performed on
machines that will be exposed to the outside world, but may be appropriate
during development and testing on a protected host.
To disable security measures:
----
# setenforce 0
# sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config
# systemctl mask firewalld.service
# systemctl stop firewalld.service
# iptables --flush
----
======
=== Configure pacemaker_remote on Remote Node ===
Install the pacemaker_remote daemon on the remote node.
----
# yum install -y pacemaker-remote resource-agents pcs
----
Create a location for the shared authentication key:
----
# mkdir -p --mode=0750 /etc/pacemaker
# chgrp haclient /etc/pacemaker
----
All nodes (both cluster nodes and remote nodes) must have the same
authentication key installed for the communication to work correctly.
If you already have a key on an existing node, copy it to the new
remote node. Otherwise, create a new key, for example:
----
# dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1
----
Now start and enable the pacemaker_remote daemon on the remote node.
----
# systemctl enable pacemaker_remote.service
# systemctl start pacemaker_remote.service
----
Verify the start is successful.
----
# systemctl status pacemaker_remote
pacemaker_remote.service - Pacemaker Remote Service
Loaded: loaded (/usr/lib/systemd/system/pacemaker_remote.service; enabled)
Active: active (running) since Fri 2018-01-12 15:21:20 CDT; 20s ago
Main PID: 21273 (pacemaker_remot)
CGroup: /system.slice/pacemaker_remote.service
- └─21273 /usr/sbin/pacemaker_remoted
+ └─21273 /usr/sbin/pacemaker-remoted
Jan 12 15:21:20 remote1 systemd[1]: Starting Pacemaker Remote Service...
Jan 12 15:21:20 remote1 systemd[1]: Started Pacemaker Remote Service.
-Jan 12 15:21:20 remote1 pacemaker_remoted[21273]: notice: crm_add_logfile: Additional logging available in /var/log/pacemaker.log
-Jan 12 15:21:20 remote1 pacemaker_remoted[21273]: notice: lrmd_init_remote_tls_server: Starting a tls listener on port 3121.
-Jan 12 15:21:20 remote1 pacemaker_remoted[21273]: notice: bind_and_listen: Listening on address ::
+Jan 12 15:21:20 remote1 pacemaker-remoted[21273]: notice: crm_add_logfile: Additional logging available in /var/log/pacemaker.log
+Jan 12 15:21:20 remote1 pacemaker-remoted[21273]: notice: lrmd_init_remote_tls_server: Starting a tls listener on port 3121.
+Jan 12 15:21:20 remote1 pacemaker-remoted[21273]: notice: bind_and_listen: Listening on address ::
----
== Verify Connection to Remote Node ==
Before moving forward, it's worth verifying that the cluster nodes
can contact the remote node on port 3121. Here's a trick you can use.
Connect using ssh from each of the cluster nodes. The connection will get
destroyed, but how it is destroyed tells you whether it worked or not.
First, add the remote node's hostname (we're using *remote1* in this tutorial)
to the cluster nodes' +/etc/hosts+ files if you haven't already. This
is required unless you have DNS set up in a way where remote1's address can be
discovered.
Execute the following on each cluster node, replacing the IP address with the
actual IP address of the remote node.
----
# cat << END >> /etc/hosts
192.168.122.10 remote1
END
----
If running the ssh command on one of the cluster nodes results in this
output before disconnecting, the connection works:
----
# ssh -p 3121 remote1
ssh_exchange_identification: read: Connection reset by peer
----
If you see one of these, the connection is not working:
----
# ssh -p 3121 remote1
ssh: connect to host remote1 port 3121: No route to host
----
----
# ssh -p 3121 remote1
ssh: connect to host remote1 port 3121: Connection refused
----
Once you can successfully connect to the remote node from both
cluster nodes, move on to setting up Pacemaker on the cluster nodes.
== Configure Cluster Nodes ==
=== Configure Firewall on Cluster Nodes ===
On each cluster node, allow cluster-related services through the local
firewall, following the same procedure as in <<_configure_firewall_on_remote_node>>.
=== Install Pacemaker on Cluster Nodes ===
On the two cluster nodes, install the following packages.
----
# yum install -y pacemaker corosync pcs resource-agents
----
=== Copy Authentication Key to Cluster Nodes ===
Create a location for the shared authentication key,
and copy it from any existing node:
----
# mkdir -p --mode=0750 /etc/pacemaker
# chgrp haclient /etc/pacemaker
# scp remote1:/etc/pacemaker/authkey /etc/pacemaker/authkey
----
=== Configure Corosync on Cluster Nodes ===
Corosync handles Pacemaker's cluster membership and messaging. The corosync
config file is located at +/etc/corosync/corosync.conf+. That config file must be
initialized with information about the two cluster nodes before Pacemaker can
start.
To initialize the corosync config file, execute the following `pcs` command on
both nodes, replacing the placeholders in angle brackets with your nodes'
addresses or hostnames:
----
# pcs cluster setup --force --local --name mycluster <node1 ip or hostname> <node2 ip or hostname>
----
=== Start Pacemaker on Cluster Nodes ===
Start the cluster stack on both cluster nodes using the following command.
----
# pcs cluster start
----
Verify corosync membership
....
# pcs status corosync
Membership information
----------------------
Nodeid Votes Name
1 1 node1 (local)
....
Verify Pacemaker status. At first, the `pcs status` output will look
like this.
----
# pcs status
Cluster name: mycluster
Stack: corosync
Current DC: NONE
Last updated: Fri Jan 12 16:14:05 2018
Last change: Fri Jan 12 14:02:14 2018
1 node configured
0 resources configured
----
After about a minute, you should see your two cluster nodes come online.
----
# pcs status
Cluster name: mycluster
Stack: corosync
Current DC: node1 (version 1.1.16-12.el7_4.5-94ff4df) - partition with quorum
Last updated: Fri Jan 12 16:16:32 2018
Last change: Fri Jan 12 14:02:14 2018
2 nodes configured
0 resources configured
Online: [ node1 node2 ]
----
For the sake of this tutorial, we are going to disable stonith to avoid having to cover fencing device configuration.
----
# pcs property set stonith-enabled=false
----
== Integrate Remote Node into Cluster ==
Integrating a remote node into the cluster is achieved through the
creation of a remote node connection resource. The remote node connection
resource both establishes the connection to the remote node and defines that
the remote node exists. Note that this resource is actually internal to
Pacemaker's crmd component. A metadata file for this resource can be found at
+/usr/lib/ocf/resource.d/pacemaker/remote+, describing what options
are available, but there is no actual *ocf:pacemaker:remote* resource agent
script that performs any work.
Define the remote node connection resource to our remote node,
*remote1*, using the following command on any cluster node.
----
# pcs resource create remote1 ocf:pacemaker:remote
----
That's it. After a moment you should see the remote node come online.
----
Cluster name: mycluster
Stack: corosync
Current DC: node1 (version 1.1.16-12.el7_4.5-94ff4df) - partition with quorum
Last updated: Fri Jan 12 17:13:09 2018
Last change: Fri Jan 12 17:02:02 2018
3 nodes configured
1 resources configured
Online: [ node1 node2 ]
RemoteOnline: [ remote1 ]
Full list of resources:
remote1 (ocf::pacemaker:remote): Started node1
Daemon Status:
corosync: active/disabled
pacemaker: active/disabled
pcsd: active/enabled
----
== Starting Resources on Remote Node ==
Once the remote node is integrated into the cluster, starting resources on a
remote node is exactly the same as on cluster nodes. Refer to the
http://clusterlabs.org/doc/['Clusters from Scratch'] document for examples of
resource creation.
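For a quick hypothetical test, you could create a Dummy resource and constrain
it to prefer the remote node:
----
# pcs resource create FAKE1 ocf:pacemaker:Dummy
# pcs constraint location FAKE1 prefers remote1
----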
[WARNING]
=========
Never involve a remote node connection resource in a resource group,
colocation constraint, or order constraint.
=========
== Fencing Remote Nodes ==
Remote nodes are fenced the same way as cluster nodes. No special
considerations are required. Configure fencing resources for use with
remote nodes the same as you would with cluster nodes.
Note, however, that remote nodes can never 'initiate' a fencing action. Only
cluster nodes are capable of actually executing a fencing operation against
another node.
== Accessing Cluster Tools from a Remote Node ==
Besides allowing the cluster to manage resources on a remote node,
pacemaker_remote has one other trick. The pacemaker_remote daemon allows
nearly all the Pacemaker tools (`crm_resource`, `crm_mon`, `crm_attribute`,
`crm_master`, etc.) to work on remote nodes natively.
Try it: Run `crm_mon` on the remote node after Pacemaker has
integrated it into the cluster. These tools just work. This means resource
agents for promotable resources (which need access to tools like
`crm_master`) work seamlessly on the remote nodes.
Higher-level command shells such as `pcs` may have partial support
on remote nodes, but it is recommended to run them from a cluster node.
diff --git a/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt
index ef510795a8..251f590d09 100644
--- a/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt
+++ b/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt
@@ -1,578 +1,578 @@
= Guest Node Walk-through =
*What this tutorial is:* An in-depth walk-through of how to get Pacemaker to
manage a KVM guest instance and integrate that guest into the cluster as a
guest node.
*What this tutorial is not:* A realistic deployment scenario. The steps shown
here are meant to get users familiar with the concept of guest nodes as quickly
as possible.
== Configure the Physical Host ==
[NOTE]
======
For this example, we will use a single physical host named *example-host*.
A production cluster would likely have multiple physical hosts, in which case
you would run the commands here on each one, unless noted otherwise.
======
=== Configure Firewall on Host ===
On the physical host, allow cluster-related services through the local firewall:
----
# firewall-cmd --permanent --add-service=high-availability
success
# firewall-cmd --reload
success
----
[NOTE]
======
If you are using iptables directly, or some other firewall solution besides
firewalld, simply open the following ports, which can be used by various
clustering components: TCP ports 2224, 3121, and 21064, and UDP port 5405.
If you run into any problems during testing, you might want to disable
the firewall and SELinux entirely until you have everything working.
This may create significant security issues and should not be performed on
machines that will be exposed to the outside world, but may be appropriate
during development and testing on a protected host.
To disable security measures:
----
[root@pcmk-1 ~]# setenforce 0
[root@pcmk-1 ~]# sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config
[root@pcmk-1 ~]# systemctl mask firewalld.service
[root@pcmk-1 ~]# systemctl stop firewalld.service
[root@pcmk-1 ~]# iptables --flush
----
======
=== Install Cluster Software ===
----
# yum install -y pacemaker corosync pcs resource-agents
----
=== Configure Corosync ===
Corosync handles Pacemaker's cluster membership and messaging. The corosync
config file is located at +/etc/corosync/corosync.conf+. That config file must
be initialized with information about the cluster nodes before Pacemaker can
start.
To initialize the corosync config file, execute the following `pcs` command,
replacing the cluster name and hostname as desired:
----
# pcs cluster setup --force --local --name mycluster example-host
----
[NOTE]
======
If you have multiple physical hosts, you would execute the setup command on
only one host, but list all of them at the end of the command.
======
=== Configure Pacemaker for Remote Node Communication ===
Create a place to hold an authentication key for use with pacemaker_remote:
----
# mkdir -p --mode=0750 /etc/pacemaker
# chgrp haclient /etc/pacemaker
----
Generate a key:
----
# dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1
----
[NOTE]
======
If you have multiple physical hosts, you would generate the key on only one
host, and copy it to the same location on all hosts.
======
=== Verify Cluster Software ===
Start the cluster
----
# pcs cluster start
----
Verify corosync membership
....
# pcs status corosync
Membership information
----------------------
Nodeid Votes Name
1 1 example-host (local)
....
Verify Pacemaker status. At first, the output will look like this:
----
# pcs status
Cluster name: mycluster
WARNING: no stonith devices and stonith-enabled is not false
Stack: corosync
Current DC: NONE
Last updated: Fri Jan 12 15:18:32 2018
Last change: Fri Jan 12 12:42:21 2018 by root via cibadmin on example-host
1 node configured
0 resources configured
Node example-host: UNCLEAN (offline)
No active resources
Daemon Status:
corosync: active/disabled
pacemaker: active/disabled
pcsd: active/enabled
----
After a short amount of time, you should see your host as a single node in the
cluster:
----
# pcs status
Cluster name: mycluster
WARNING: no stonith devices and stonith-enabled is not false
Stack: corosync
Current DC: example-host (version 1.1.16-12.el7_4.5-94ff4df) - partition WITHOUT quorum
Last updated: Fri Jan 12 15:20:05 2018
Last change: Fri Jan 12 12:42:21 2018 by root via cibadmin on example-host
1 node configured
0 resources configured
Online: [ example-host ]
No active resources
Daemon Status:
corosync: active/disabled
pacemaker: active/disabled
pcsd: active/enabled
----
=== Disable STONITH and Quorum ===
Now, enable the cluster to work without quorum or stonith. This is required
for the sake of getting this tutorial to work with a single cluster node.
----
# pcs property set stonith-enabled=false
# pcs property set no-quorum-policy=ignore
----
[WARNING]
=========
The use of `stonith-enabled=false` is completely inappropriate for a production
cluster. It tells the cluster to simply pretend that failed nodes are safely
powered off. Some vendors will refuse to support clusters that have STONITH
disabled. We disable STONITH here only to focus the discussion on
pacemaker_remote, and to be able to use a single physical host in the example.
=========
Now, the status output should look similar to this:
----
# pcs status
Cluster name: mycluster
Stack: corosync
Current DC: example-host (version 1.1.16-12.el7_4.5-94ff4df) - partition with quorum
Last updated: Fri Jan 12 15:22:49 2018
Last change: Fri Jan 12 15:22:46 2018 by root via cibadmin on example-host
1 node configured
0 resources configured
Online: [ example-host ]
No active resources
Daemon Status:
corosync: active/disabled
pacemaker: active/disabled
pcsd: active/enabled
----
Go ahead and stop the cluster for now after verifying everything is in order.
----
# pcs cluster stop --force
----
=== Install Virtualization Software ===
----
# yum install -y kvm libvirt qemu-system qemu-kvm bridge-utils virt-manager
# systemctl enable libvirtd.service
----
Reboot the host.
[NOTE]
======
While KVM is used in this example, any virtualization platform with a Pacemaker
resource agent can be used to create a guest node. The resource agent needs
only to support usual commands (start, stop, etc.); Pacemaker implements the
*remote-node* meta-attribute, independent of the agent.
======
== Configure the KVM guest ==
=== Create Guest ===
We will not outline here the installation steps required to create a KVM
guest. There are plenty of tutorials available elsewhere that do that.
Just be sure to configure the guest with a hostname and a static IP address
(as an example here, we will use guest1 and 192.168.122.10).
=== Configure Firewall on Guest ===
On each guest, allow cluster-related services through the local firewall,
following the same procedure as in <<_configure_firewall_on_host>>.
=== Verify Connectivity ===
At this point, you should be able to ping and ssh into guests from hosts, and
vice versa.
=== Configure pacemaker_remote ===
Install pacemaker_remote, and enable it to run at start-up. Here, we also
install the pacemaker package; it is not required, but it contains the Dummy
resource agent that we will use later for testing.
----
# yum install -y pacemaker pacemaker-remote resource-agents
# systemctl enable pacemaker_remote.service
----
Copy the authentication key from a host:
----
# mkdir -p --mode=0750 /etc/pacemaker
# chgrp haclient /etc/pacemaker
# scp root@example-host:/etc/pacemaker/authkey /etc/pacemaker
----
Start pacemaker_remote, and verify the start was successful:
----
# systemctl start pacemaker_remote
# systemctl status pacemaker_remote
pacemaker_remote.service - Pacemaker Remote Service
Loaded: loaded (/usr/lib/systemd/system/pacemaker_remote.service; enabled)
Active: active (running) since Thu 2013-03-14 18:24:04 EDT; 2min 8s ago
Main PID: 1233 (pacemaker_remot)
CGroup: name=systemd:/system/pacemaker_remote.service
- └─1233 /usr/sbin/pacemaker_remoted
+ └─1233 /usr/sbin/pacemaker-remoted
Mar 14 18:24:04 guest1 systemd[1]: Starting Pacemaker Remote Service...
Mar 14 18:24:04 guest1 systemd[1]: Started Pacemaker Remote Service.
- Mar 14 18:24:04 guest1 pacemaker_remoted[1233]: notice: lrmd_init_remote_tls_server: Starting a tls listener on port 3121.
+ Mar 14 18:24:04 guest1 pacemaker-remoted[1233]: notice: lrmd_init_remote_tls_server: Starting a tls listener on port 3121.
----
=== Verify Host Connection to Guest ===
Before moving forward, it's worth verifying that the host can contact the guest
on port 3121. Here's a trick you can use. Connect using ssh from the host. The
connection will get destroyed, but how it is destroyed tells you whether it
worked or not.
First, add guest1 to the host machine's +/etc/hosts+ file if you haven't
already. This is required unless you have DNS set up in a way that guest1's
address can be discovered.
----
# cat << END >> /etc/hosts
192.168.122.10 guest1
END
----
If running the ssh command on the host results in this
output before disconnecting, the connection works:
----
# ssh -p 3121 guest1
ssh_exchange_identification: read: Connection reset by peer
----
If you see one of these, the connection is not working:
----
# ssh -p 3121 guest1
ssh: connect to host guest1 port 3121: No route to host
----
----
# ssh -p 3121 guest1
ssh: connect to host guest1 port 3121: Connection refused
----
Once you can successfully connect to the guest from the host, shut down the guest. Pacemaker will be managing the virtual machine from this point forward.
== Integrate Guest into Cluster ==
Now for the fun part: integrating the virtual machine you've just created into the cluster. It is incredibly simple.
=== Start the Cluster ===
On the host, start pacemaker.
----
# pcs cluster start
----
Wait for the host to become the DC. The output of `pcs status` should look
as it did in <<_disable_stonith_and_quorum>>.
=== Integrate as Guest Node ===
If you didn't already do this when verifying the host-to-guest connection,
add the KVM guest's IP address to the host's +/etc/hosts+ file so we
can connect by hostname. For this example:
----
# cat << END >> /etc/hosts
192.168.122.10 guest1
END
----
We will use the *VirtualDomain* resource agent for the management of the
virtual machine. This agent requires the virtual machine's XML config to be
dumped to a file on disk. To do this, pick out the name of the virtual machine
you just created from the output of this list.
....
# virsh list --all
Id Name State
----------------------------------------------------
- guest1 shut off
....
In my case, I named it guest1. Dump the XML to a file somewhere on the host using the following command.
----
# virsh dumpxml guest1 > /etc/pacemaker/guest1.xml
----
Now just register the resource with Pacemaker and you're set!
----
# pcs resource create vm-guest1 VirtualDomain hypervisor="qemu:///system" \
config="/etc/pacemaker/guest1.xml" meta remote-node=guest1
----
[NOTE]
======
This example puts the guest XML under /etc/pacemaker because the
permissions and SELinux labeling should not need any changes.
If you run into trouble with this or any step, try disabling SELinux
with `setenforce 0`. If it works after that, see SELinux documentation
for how to troubleshoot, if you wish to reenable SELinux.
======
[NOTE]
======
Pacemaker will automatically monitor pacemaker_remote connections for failure,
so it is not necessary to create a recurring monitor on the VirtualDomain
resource.
======
Once the *vm-guest1* resource is started you will see *guest1* appear in the
`pcs status` output as a node. The final `pcs status` output should look
something like this.
----
# pcs status
Cluster name: mycluster
Stack: corosync
Current DC: example-host (version 1.1.16-12.el7_4.5-94ff4df) - partition with quorum
Last updated: Fri Jan 12 18:00:45 2018
Last change: Fri Jan 12 17:53:44 2018 by root via crm_resource on example-host
2 nodes configured
2 resources configured
Online: [ example-host ]
GuestOnline: [ guest1@example-host ]
Full list of resources:
vm-guest1 (ocf::heartbeat:VirtualDomain): Started example-host
Daemon Status:
corosync: active/disabled
pacemaker: active/disabled
pcsd: active/enabled
----
=== Starting Resources on KVM Guest ===
The commands below demonstrate how resources can be executed on both the
guest node and the cluster node.
Create a few Dummy resources. Dummy resources are real resource agents used just for testing purposes: they actually execute on the host they are assigned to, just as an Apache server or database would, except their execution simply means a file was created. When the resource is stopped, the file it created is removed.
----
# pcs resource create FAKE1 ocf:pacemaker:Dummy
# pcs resource create FAKE2 ocf:pacemaker:Dummy
# pcs resource create FAKE3 ocf:pacemaker:Dummy
# pcs resource create FAKE4 ocf:pacemaker:Dummy
# pcs resource create FAKE5 ocf:pacemaker:Dummy
----
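Equivalently, a plain shell loop does the same thing (nothing cluster-specific
here):
----
# for i in 1 2 3 4 5; do pcs resource create FAKE$i ocf:pacemaker:Dummy; done
----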
Now check your `pcs status` output. In the resource section, you should see
something like the following, where some of the resources started on the
cluster node, and some started on the guest node.
----
Full list of resources:
vm-guest1 (ocf::heartbeat:VirtualDomain): Started example-host
FAKE1 (ocf::pacemaker:Dummy): Started guest1
FAKE2 (ocf::pacemaker:Dummy): Started guest1
FAKE3 (ocf::pacemaker:Dummy): Started example-host
FAKE4 (ocf::pacemaker:Dummy): Started guest1
FAKE5 (ocf::pacemaker:Dummy): Started example-host
----
The guest node, *guest1*, reacts just like any other node in the cluster. For
example, pick out a resource that is running on your cluster node. For my
purposes, I am picking FAKE3 from the output above. We can force FAKE3 to run
on *guest1* in the exact same way we would any other node.
----
# pcs constraint location FAKE3 prefers guest1
----
Now, looking at the bottom of the `pcs status` output, you'll see FAKE3 is on
*guest1*.
----
Full list of resources:
vm-guest1 (ocf::heartbeat:VirtualDomain): Started example-host
FAKE1 (ocf::pacemaker:Dummy): Started guest1
FAKE2 (ocf::pacemaker:Dummy): Started guest1
FAKE3 (ocf::pacemaker:Dummy): Started guest1
FAKE4 (ocf::pacemaker:Dummy): Started example-host
FAKE5 (ocf::pacemaker:Dummy): Started example-host
----
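When you're done experimenting, the test constraint can be removed again. The
constraint id below assumes pcs's default naming scheme; run
`pcs constraint --full` to see the actual id if it differs:
----
# pcs constraint --full
# pcs constraint remove location-FAKE3-guest1-INFINITY
----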
=== Testing Recovery and Fencing ===
Pacemaker's policy engine is smart enough to know that fencing a guest node
associated with a virtual machine means shutting off or rebooting the virtual
machine. No special configuration is necessary to make this happen. If you
are interested in testing this functionality out, try stopping the guest's
pacemaker_remote daemon. This is the equivalent of abruptly terminating a
cluster node's corosync membership without properly shutting it down.
Connect to the guest with ssh and run this command.
----
-# kill -9 `pidof pacemaker_remoted`
+# kill -9 $(pidof pacemaker-remoted)
----
Within a few seconds, your `pcs status` output will show a monitor failure,
and the *guest1* node will not be shown while it is being recovered.
----
# pcs status
Cluster name: mycluster
Stack: corosync
Current DC: example-host (version 1.1.16-12.el7_4.5-94ff4df) - partition with quorum
Last updated: Fri Jan 12 18:08:35 2018
Last change: Fri Jan 12 18:07:00 2018 by root via cibadmin on example-host
2 nodes configured
7 resources configured
Online: [ example-host ]
Full list of resources:
vm-guest1 (ocf::heartbeat:VirtualDomain): Started example-host
FAKE1 (ocf::pacemaker:Dummy): Stopped
FAKE2 (ocf::pacemaker:Dummy): Stopped
FAKE3 (ocf::pacemaker:Dummy): Stopped
FAKE4 (ocf::pacemaker:Dummy): Started example-host
FAKE5 (ocf::pacemaker:Dummy): Started example-host
Failed Actions:
* guest1_monitor_30000 on example-host 'unknown error' (1): call=8, status=Error, exitreason='none',
last-rc-change='Fri Jan 12 18:08:29 2018', queued=0ms, exec=0ms
Daemon Status:
corosync: active/disabled
pacemaker: active/disabled
pcsd: active/enabled
----
[NOTE]
======
A guest node involves two resources: the one you explicitly configured, which
creates the guest, and an implicit resource that Pacemaker creates for the
pacemaker_remote connection, named after the value of the *remote-node*
attribute of the explicit resource.
When we killed pacemaker_remote, it was the implicit resource that failed, which is why
the failed action starts with *guest1* and not *vm-guest1*.
======
Once recovery of the guest is complete, you'll see it automatically get
re-integrated into the cluster. The final `pcs status` output should look
something like this.
----
Cluster name: mycluster
Stack: corosync
Current DC: example-host (version 1.1.16-12.el7_4.5-94ff4df) - partition with quorum
Last updated: Fri Jan 12 18:18:30 2018
Last change: Fri Jan 12 18:07:00 2018 by root via cibadmin on example-host
2 nodes configured
7 resources configured
Online: [ example-host ]
GuestOnline: [ guest1@example-host ]
Full list of resources:
vm-guest1 (ocf::heartbeat:VirtualDomain): Started example-host
FAKE1 (ocf::pacemaker:Dummy): Started guest1
FAKE2 (ocf::pacemaker:Dummy): Started guest1
FAKE3 (ocf::pacemaker:Dummy): Started guest1
FAKE4 (ocf::pacemaker:Dummy): Started example-host
FAKE5 (ocf::pacemaker:Dummy): Started example-host
Failed Actions:
* guest1_monitor_30000 on example-host 'unknown error' (1): call=8, status=Error, exitreason='none',
last-rc-change='Fri Jan 12 18:08:29 2018', queued=0ms, exec=0ms
Daemon Status:
corosync: active/disabled
pacemaker: active/disabled
pcsd: active/enabled
----
Normally, once you've investigated and addressed a failed action, you can clear the
failure. However, Pacemaker does not yet support cleanup for the implicitly
created connection resource while the explicit resource is active. If you want
to clear the failed action from the status output, stop the guest resource before
clearing it. For example:
----
# pcs resource disable vm-guest1 --wait
# pcs resource cleanup guest1
# pcs resource enable vm-guest1
----
=== Accessing Cluster Tools from Guest Node ===
Besides allowing the cluster to manage resources on a guest node,
pacemaker_remote has one other trick: the pacemaker_remote daemon allows
nearly all the Pacemaker command-line tools (`crm_resource`, `crm_mon`,
`crm_attribute`, `crm_master`, etc.) to work on guest nodes natively.
Try it: run `crm_mon` on the guest after Pacemaker has integrated the guest
node into the cluster. These tools just work, so resource agents that need
access to tools like `crm_master` (for example, agents for promotable
resources) work seamlessly on the guest nodes.
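For example, a one-shot status check run directly on *guest1* (the output
should mirror what `pcs status` showed on the host):
----
# crm_mon -1
----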
Higher-level command shells such as `pcs` may have partial support
on guest nodes, but it is recommended to run them from a cluster node.
diff --git a/extra/ansible/docker/roles/docker-host/files/pcmk_remote_start b/extra/ansible/docker/roles/docker-host/files/pcmk_remote_start
index 1bf032003a..0adfb23729 100644
--- a/extra/ansible/docker/roles/docker-host/files/pcmk_remote_start
+++ b/extra/ansible/docker/roles/docker-host/files/pcmk_remote_start
@@ -1,18 +1,18 @@
#!/bin/bash
/usr/sbin/ip_start
-pid=$(pidof pacemaker_remoted)
+pid=$(pidof pacemaker-remoted)
if [ "$?" -ne 0 ]; then
mkdir -p /var/run
export PCMK_debugfile=$pcmklogs
- (pacemaker_remoted &) & > /dev/null 2>&1
+ (pacemaker-remoted &) & > /dev/null 2>&1
sleep 5
- pid=$(pidof pacemaker_remoted)
+ pid=$(pidof pacemaker-remoted)
if [ "$?" -ne 0 ]; then
echo "startup of pacemaker failed"
exit 1
fi
- echo "$pid" > /var/run/pacemaker_remoted.pid
+ echo "$pid" > /var/run/pacemaker-remoted.pid
fi
exit 0
diff --git a/extra/ansible/docker/roles/docker-host/files/pcmk_remote_stop b/extra/ansible/docker/roles/docker-host/files/pcmk_remote_stop
index 32728f02f8..a2586e6e9d 100644
--- a/extra/ansible/docker/roles/docker-host/files/pcmk_remote_stop
+++ b/extra/ansible/docker/roles/docker-host/files/pcmk_remote_stop
@@ -1,37 +1,37 @@
#!/bin/bash
status()
{
pid=$(pidof $1 2>/dev/null)
rtrn=$?
if [ $rtrn -ne 0 ]; then
echo "$1 is stopped"
else
echo "$1 (pid $pid) is running..."
fi
return $rtrn
}
stop()
{
desc="Pacemaker Remote"
prog=$1
shutdown_prog=$prog
if status $shutdown_prog > /dev/null 2>&1; then
kill -TERM $(pidof $prog) > /dev/null 2>&1
while status $prog > /dev/null 2>&1; do
sleep 1
echo -n "."
done
else
echo -n "$desc is already stopped"
fi
rm -f /var/lock/subsystem/pacemaker
rm -f /var/run/${prog}.pid
killall -q -9 pacemakerd pacemaker-attrd pacemaker-execd crmd \
- stonithd cib pacemaker_remoted
+ stonithd cib pacemaker-remoted
}
-stop "pacemaker_remoted"
+stop "pacemaker-remoted"
exit 0
diff --git a/extra/ansible/docker/roles/docker-host/files/pcmk_stop b/extra/ansible/docker/roles/docker-host/files/pcmk_stop
index 2849665e30..ac35675ad2 100644
--- a/extra/ansible/docker/roles/docker-host/files/pcmk_stop
+++ b/extra/ansible/docker/roles/docker-host/files/pcmk_stop
@@ -1,46 +1,46 @@
#!/bin/bash
status()
{
pid=$(pidof $1 2>/dev/null)
rtrn=$?
if [ $rtrn -ne 0 ]; then
echo "$1 is stopped"
else
echo "$1 (pid $pid) is running..."
fi
return $rtrn
}
stop()
{
desc="Pacemaker Cluster Manager"
prog=$1
shutdown_prog=$prog
if ! status $prog > /dev/null 2>&1; then
shutdown_prog="crmd"
fi
cname=$(crm_node --name)
crm_attribute -N $cname -n standby -v true -l reboot
if status $shutdown_prog > /dev/null 2>&1; then
kill -TERM $(pidof $prog) > /dev/null 2>&1
while status $prog > /dev/null 2>&1; do
sleep 1
echo -n "."
done
else
echo -n "$desc is already stopped"
fi
rm -f /var/lock/subsystem/pacemaker
rm -f /var/run/${prog}.pid
killall -q -9 pacemakerd pacemaker-attrd pacemaker-execd crmd \
- stonithd cib pacemaker_remoted
+ stonithd cib pacemaker-remoted
}
stop "pacemakerd"
/usr/share/corosync/corosync stop > /dev/null 2>&1
killall -q -9 'corosync'
exit 0
diff --git a/extra/cluster-clean b/extra/cluster-clean
index 67f3624e70..228881b528 100755
--- a/extra/cluster-clean
+++ b/extra/cluster-clean
@@ -1,97 +1,97 @@
#!/bin/bash
#
# Copyright 2011-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
hosts=
group=
kill=0
while true; do
case "$1" in
-x) set -x; shift;;
-w) for h in $2; do
hosts="$hosts -w $h";
done
shift; shift;;
-g) group=$2; shift; shift;;
--kill) kill=1; shift;;
--kill-only) kill=2; shift;;
"") break;;
*) echo "unknown option: $1"; exit 1;;
esac
done
if [ x"$group" = x -a x"$hosts" = x ]; then
group=$CTS_GROUP
fi
if [ x"$hosts" != x ]; then
echo `date` ": Cleaning up hosts:"
target=$hosts
elif [ x"$group" != x ]; then
echo `date` ": Cleaning up group: $group"
target="-g $group"
else
echo "You didn't specify any nodes to clean up"
exit 1
fi
cluster-helper --list bullet $target
if [ $kill != 0 ]; then
echo "Cleaning processes"
# Bah. Force systemd to actually look at the process and realize it's dead
cluster-helper $target -- "service corosync stop" &> /dev/null &
cluster-helper $target -- "service pacemaker stop" &> /dev/null &
- cluster-helper $target -- "killall -q -9 corosync pacemakerd pacemaker-attrd pacemaker-execd pacemaker_remoted stonithd crmd pengine cib dlm_controld gfs_controld" &> /dev/null
+ cluster-helper $target -- "killall -q -9 corosync pacemakerd pacemaker-attrd pacemaker-execd pacemaker-remoted stonithd crmd pengine cib dlm_controld gfs_controld" &> /dev/null
cluster-helper $target -- 'kill -9 `pidof valgrind`' &> /dev/null
if [ $kill == 2 ]; then
exit 0
fi
fi
#logrotate -f $cluster_rotate
echo "Cleaning files"
log_files=""
log_files="$log_files 'messages*'"
log_files="$log_files 'localmessages*'"
log_files="$log_files 'cluster*.log'"
log_files="$log_files 'corosync.log*'"
log_files="$log_files 'pacemaker.log*'"
log_files="$log_files '*.journal'"
log_files="$log_files '*.journal~'"
log_files="$log_files 'secure-*'"
state_files=""
state_files="$state_files 'cib.xml*'"
state_files="$state_files 'valgrind-*'"
state_files="$state_files 'cib-*'"
state_files="$state_files 'core.*'"
state_files="$state_files 'cts.*'"
state_files="$state_files 'pe*.bz2'"
state_files="$state_files 'fdata-*'"
for f in $log_files; do
cluster-helper $target -- "find /var/log -name '$f' -exec rm -f \{\} \;"
done
for f in $state_files; do
cluster-helper $target -- "find /var/lib -name '$f' -exec rm -f \{\} \;"
done
cluster-helper $target -- "find /dev/shm -name 'qb-*' -exec rm -f \{\} \;"
cluster-helper $target -- "find /var/lib/pacemaker/blackbox -name '*-*' -exec rm -f \{\} \;"
cluster-helper $target -- "find /tmp -name '*.valgrind' -exec rm -f \{\} \;"
cluster-helper $target -- 'service rsyslog restart' > /dev/null 2>&1
cluster-helper $target -- 'systemctl restart systemd-journald.socket' > /dev/null 2>&1
cluster-helper $target -- logger -i -p daemon.info __clean_logs__
#touch $cluster_log
echo `date` ": Clean complete"
diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index ec858f1fab..ea5bc0551b 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -1,1973 +1,1974 @@
/*
* Copyright 2012-2018 David Vossel <davidvossel@gmail.com>
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <glib.h>
#include <dirent.h>
#include <crm/crm.h>
#include <crm/lrmd.h>
#include <crm/services.h>
#include <crm/common/mainloop.h>
#include <crm/common/ipcs.h>
#include <crm/msg_xml.h>
#include <crm/stonith-ng.h>
#ifdef HAVE_GNUTLS_GNUTLS_H
# undef KEYFILE
# include <gnutls/gnutls.h>
#endif
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <arpa/inet.h>
#include <netdb.h>
#define MAX_TLS_RECV_WAIT 10000
CRM_TRACE_INIT_DATA(lrmd);
static int lrmd_api_disconnect(lrmd_t * lrmd);
static int lrmd_api_is_connected(lrmd_t * lrmd);
/* IPC proxy functions */
int lrmd_internal_proxy_send(lrmd_t * lrmd, xmlNode *msg);
static void lrmd_internal_proxy_dispatch(lrmd_t *lrmd, xmlNode *msg);
void lrmd_internal_set_proxy_callback(lrmd_t * lrmd, void *userdata, void (*callback)(lrmd_t *lrmd, void *userdata, xmlNode *msg));
#ifdef HAVE_GNUTLS_GNUTLS_H
# define LRMD_CLIENT_HANDSHAKE_TIMEOUT 5000 /* 5 seconds */
gnutls_psk_client_credentials_t psk_cred_s;
int lrmd_tls_set_key(gnutls_datum_t * key);
static void lrmd_tls_disconnect(lrmd_t * lrmd);
static int global_remote_msg_id = 0;
int lrmd_tls_send_msg(crm_remote_t * session, xmlNode * msg, uint32_t id, const char *msg_type);
static void lrmd_tls_connection_destroy(gpointer userdata);
#endif
typedef struct lrmd_private_s {
enum client_type type;
char *token;
mainloop_io_t *source;
/* IPC parameters */
crm_ipc_t *ipc;
crm_remote_t *remote;
/* Extra TLS parameters */
char *remote_nodename;
#ifdef HAVE_GNUTLS_GNUTLS_H
char *server;
int port;
gnutls_psk_client_credentials_t psk_cred_c;
/* while the async connection is occurring, this is the id
* of the connection timeout timer. */
int async_timer;
int sock;
/* since tls requires a round trip across the network for a
* request/reply, there are times where we just want to be able
* to send a request from the client and not wait around (or even care
* about) what the reply is. */
int expected_late_replies;
GList *pending_notify;
crm_trigger_t *process_notify;
#endif
lrmd_event_callback callback;
/* Internal IPC proxy msg passing for remote guests */
void (*proxy_callback)(lrmd_t *lrmd, void *userdata, xmlNode *msg);
void *proxy_callback_userdata;
char *peer_version;
} lrmd_private_t;
static lrmd_list_t *
lrmd_list_add(lrmd_list_t * head, const char *value)
{
lrmd_list_t *p, *end;
p = calloc(1, sizeof(lrmd_list_t));
p->val = strdup(value);
end = head;
while (end && end->next) {
end = end->next;
}
if (end) {
end->next = p;
} else {
head = p;
}
return head;
}
void
lrmd_list_freeall(lrmd_list_t * head)
{
lrmd_list_t *p;
while (head) {
char *val = (char *)head->val;
p = head->next;
free(val);
free(head);
head = p;
}
}
lrmd_key_value_t *
lrmd_key_value_add(lrmd_key_value_t * head, const char *key, const char *value)
{
lrmd_key_value_t *p, *end;
p = calloc(1, sizeof(lrmd_key_value_t));
p->key = strdup(key);
p->value = strdup(value);
end = head;
while (end && end->next) {
end = end->next;
}
if (end) {
end->next = p;
} else {
head = p;
}
return head;
}
void
lrmd_key_value_freeall(lrmd_key_value_t * head)
{
lrmd_key_value_t *p;
while (head) {
p = head->next;
free(head->key);
free(head->value);
free(head);
head = p;
}
}
lrmd_event_data_t *
lrmd_copy_event(lrmd_event_data_t * event)
{
lrmd_event_data_t *copy = NULL;
copy = calloc(1, sizeof(lrmd_event_data_t));
/* This will get all the int values.
* we just have to be careful not to leave any
* dangling pointers to strings. */
memcpy(copy, event, sizeof(lrmd_event_data_t));
copy->rsc_id = event->rsc_id ? strdup(event->rsc_id) : NULL;
copy->op_type = event->op_type ? strdup(event->op_type) : NULL;
copy->user_data = event->user_data ? strdup(event->user_data) : NULL;
copy->output = event->output ? strdup(event->output) : NULL;
copy->exit_reason = event->exit_reason ? strdup(event->exit_reason) : NULL;
copy->remote_nodename = event->remote_nodename ? strdup(event->remote_nodename) : NULL;
copy->params = crm_str_table_dup(event->params);
return copy;
}
void
lrmd_free_event(lrmd_event_data_t * event)
{
if (!event) {
return;
}
/* free gives me grief if i try to cast */
free((char *)event->rsc_id);
free((char *)event->op_type);
free((char *)event->user_data);
free((char *)event->output);
free((char *)event->exit_reason);
free((char *)event->remote_nodename);
if (event->params) {
g_hash_table_destroy(event->params);
}
free(event);
}
static int
lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg)
{
const char *type;
const char *proxy_session = crm_element_value(msg, F_LRMD_IPC_SESSION);
lrmd_private_t *native = lrmd->lrmd_private;
lrmd_event_data_t event = { 0, };
if (proxy_session != NULL) {
/* this is proxy business */
lrmd_internal_proxy_dispatch(lrmd, msg);
return 1;
} else if (!native->callback) {
/* no callback set */
crm_trace("notify event received but client has not set callback");
return 1;
}
event.remote_nodename = native->remote_nodename;
type = crm_element_value(msg, F_LRMD_OPERATION);
crm_element_value_int(msg, F_LRMD_CALLID, &event.call_id);
event.rsc_id = crm_element_value(msg, F_LRMD_RSC_ID);
if (crm_str_eq(type, LRMD_OP_RSC_REG, TRUE)) {
event.type = lrmd_event_register;
} else if (crm_str_eq(type, LRMD_OP_RSC_UNREG, TRUE)) {
event.type = lrmd_event_unregister;
} else if (crm_str_eq(type, LRMD_OP_RSC_EXEC, TRUE)) {
crm_element_value_int(msg, F_LRMD_TIMEOUT, &event.timeout);
crm_element_value_ms(msg, F_LRMD_RSC_INTERVAL, &event.interval_ms);
crm_element_value_int(msg, F_LRMD_RSC_START_DELAY, &event.start_delay);
crm_element_value_int(msg, F_LRMD_EXEC_RC, (int *)&event.rc);
crm_element_value_int(msg, F_LRMD_OP_STATUS, &event.op_status);
crm_element_value_int(msg, F_LRMD_RSC_DELETED, &event.rsc_deleted);
crm_element_value_int(msg, F_LRMD_RSC_RUN_TIME, (int *)&event.t_run);
crm_element_value_int(msg, F_LRMD_RSC_RCCHANGE_TIME, (int *)&event.t_rcchange);
crm_element_value_int(msg, F_LRMD_RSC_EXEC_TIME, (int *)&event.exec_time);
crm_element_value_int(msg, F_LRMD_RSC_QUEUE_TIME, (int *)&event.queue_time);
event.op_type = crm_element_value(msg, F_LRMD_RSC_ACTION);
event.user_data = crm_element_value(msg, F_LRMD_RSC_USERDATA_STR);
event.output = crm_element_value(msg, F_LRMD_RSC_OUTPUT);
event.exit_reason = crm_element_value(msg, F_LRMD_RSC_EXIT_REASON);
event.type = lrmd_event_exec_complete;
event.params = xml2list(msg);
} else if (crm_str_eq(type, LRMD_OP_NEW_CLIENT, TRUE)) {
event.type = lrmd_event_new_client;
} else if (crm_str_eq(type, LRMD_OP_POKE, TRUE)) {
event.type = lrmd_event_poke;
} else {
return 1;
}
crm_trace("op %s notify event received", type);
native->callback(&event);
if (event.params) {
g_hash_table_destroy(event.params);
}
return 1;
}
static int
lrmd_ipc_dispatch(const char *buffer, ssize_t length, gpointer userdata)
{
lrmd_t *lrmd = userdata;
lrmd_private_t *native = lrmd->lrmd_private;
xmlNode *msg;
int rc;
if (!native->callback) {
/* no callback set */
return 1;
}
msg = string2xml(buffer);
rc = lrmd_dispatch_internal(lrmd, msg);
free_xml(msg);
return rc;
}
#ifdef HAVE_GNUTLS_GNUTLS_H
static void
lrmd_free_xml(gpointer userdata)
{
free_xml((xmlNode *) userdata);
}
static int
lrmd_tls_connected(lrmd_t * lrmd)
{
lrmd_private_t *native = lrmd->lrmd_private;
if (native->remote->tls_session) {
return TRUE;
}
return FALSE;
}
static int
lrmd_tls_dispatch(gpointer userdata)
{
lrmd_t *lrmd = userdata;
lrmd_private_t *native = lrmd->lrmd_private;
xmlNode *xml = NULL;
int rc = 0;
int disconnected = 0;
if (lrmd_tls_connected(lrmd) == FALSE) {
crm_trace("TLS dispatch triggered after disconnect");
return 0;
}
crm_trace("TLS dispatch triggered");
/* First check if there are any pending notifies to process that came
* while we were waiting for replies earlier. */
if (native->pending_notify) {
GList *iter = NULL;
crm_trace("Processing pending notifies");
for (iter = native->pending_notify; iter; iter = iter->next) {
lrmd_dispatch_internal(lrmd, iter->data);
}
g_list_free_full(native->pending_notify, lrmd_free_xml);
native->pending_notify = NULL;
}
/* Next read the current buffer and see if there are any messages to handle. */
rc = crm_remote_ready(native->remote, 0);
if (rc == 0) {
/* nothing to read, see if any full messages are already in buffer. */
xml = crm_remote_parse_buffer(native->remote);
} else if (rc < 0) {
disconnected = 1;
} else {
crm_remote_recv(native->remote, -1, &disconnected);
xml = crm_remote_parse_buffer(native->remote);
}
while (xml) {
const char *msg_type = crm_element_value(xml, F_LRMD_REMOTE_MSG_TYPE);
if (safe_str_eq(msg_type, "notify")) {
lrmd_dispatch_internal(lrmd, xml);
} else if (safe_str_eq(msg_type, "reply")) {
if (native->expected_late_replies > 0) {
native->expected_late_replies--;
} else {
int reply_id = 0;
crm_element_value_int(xml, F_LRMD_CALLID, &reply_id);
/* if this happens, we want to know about it */
crm_err("Got outdated Pacemaker Remote reply %d", reply_id);
}
}
free_xml(xml);
xml = crm_remote_parse_buffer(native->remote);
}
if (disconnected) {
crm_info("Lost %s executor connection while reading data",
(native->remote_nodename? native->remote_nodename : "local"));
lrmd_tls_disconnect(lrmd);
return 0;
}
return 1;
}
#endif
/* Not used with mainloop */
int
lrmd_poll(lrmd_t * lrmd, int timeout)
{
lrmd_private_t *native = lrmd->lrmd_private;
switch (native->type) {
case CRM_CLIENT_IPC:
return crm_ipc_ready(native->ipc);
#ifdef HAVE_GNUTLS_GNUTLS_H
case CRM_CLIENT_TLS:
if (native->pending_notify) {
return 1;
}
return crm_remote_ready(native->remote, 0);
#endif
default:
crm_err("Unsupported connection type: %d", native->type);
}
return 0;
}
/* Not used with mainloop */
bool
lrmd_dispatch(lrmd_t * lrmd)
{
lrmd_private_t *private = NULL;
CRM_ASSERT(lrmd != NULL);
private = lrmd->lrmd_private;
switch (private->type) {
case CRM_CLIENT_IPC:
while (crm_ipc_ready(private->ipc)) {
if (crm_ipc_read(private->ipc) > 0) {
const char *msg = crm_ipc_buffer(private->ipc);
lrmd_ipc_dispatch(msg, strlen(msg), lrmd);
}
}
break;
#ifdef HAVE_GNUTLS_GNUTLS_H
case CRM_CLIENT_TLS:
lrmd_tls_dispatch(lrmd);
break;
#endif
default:
crm_err("Unsupported connection type: %d", private->type);
}
if (lrmd_api_is_connected(lrmd) == FALSE) {
crm_err("Connection closed");
return FALSE;
}
return TRUE;
}
static xmlNode *
lrmd_create_op(const char *token, const char *op, xmlNode *data, int timeout,
enum lrmd_call_options options)
{
xmlNode *op_msg = create_xml_node(NULL, "lrmd_command");
CRM_CHECK(op_msg != NULL, return NULL);
CRM_CHECK(token != NULL, return NULL);
crm_xml_add(op_msg, F_XML_TAGNAME, "lrmd_command");
crm_xml_add(op_msg, F_TYPE, T_LRMD);
crm_xml_add(op_msg, F_LRMD_CALLBACK_TOKEN, token);
crm_xml_add(op_msg, F_LRMD_OPERATION, op);
crm_xml_add_int(op_msg, F_LRMD_TIMEOUT, timeout);
crm_xml_add_int(op_msg, F_LRMD_CALLOPTS, options);
if (data != NULL) {
add_message_xml(op_msg, F_LRMD_CALLDATA, data);
}
crm_trace("Created executor %s command with call options %.8lx (%d)",
op, (long)options, options);
return op_msg;
}
static void
lrmd_ipc_connection_destroy(gpointer userdata)
{
lrmd_t *lrmd = userdata;
lrmd_private_t *native = lrmd->lrmd_private;
crm_info("IPC connection destroyed");
/* Prevent these from being cleaned up in lrmd_api_disconnect() */
native->ipc = NULL;
native->source = NULL;
if (native->callback) {
lrmd_event_data_t event = { 0, };
event.type = lrmd_event_disconnect;
event.remote_nodename = native->remote_nodename;
native->callback(&event);
}
}
#ifdef HAVE_GNUTLS_GNUTLS_H
static void
lrmd_tls_connection_destroy(gpointer userdata)
{
lrmd_t *lrmd = userdata;
lrmd_private_t *native = lrmd->lrmd_private;
crm_info("TLS connection destroyed");
if (native->remote->tls_session) {
gnutls_bye(*native->remote->tls_session, GNUTLS_SHUT_RDWR);
gnutls_deinit(*native->remote->tls_session);
gnutls_free(native->remote->tls_session);
}
if (native->psk_cred_c) {
gnutls_psk_free_client_credentials(native->psk_cred_c);
}
if (native->sock) {
close(native->sock);
}
if (native->process_notify) {
mainloop_destroy_trigger(native->process_notify);
native->process_notify = NULL;
}
if (native->pending_notify) {
g_list_free_full(native->pending_notify, lrmd_free_xml);
native->pending_notify = NULL;
}
free(native->remote->buffer);
native->remote->buffer = NULL;
native->source = 0;
native->sock = 0;
native->psk_cred_c = NULL;
native->remote->tls_session = NULL;
native->sock = 0;
if (native->callback) {
lrmd_event_data_t event = { 0, };
event.remote_nodename = native->remote_nodename;
event.type = lrmd_event_disconnect;
native->callback(&event);
}
return;
}
int
lrmd_tls_send_msg(crm_remote_t * session, xmlNode * msg, uint32_t id, const char *msg_type)
{
crm_xml_add_int(msg, F_LRMD_REMOTE_MSG_ID, id);
crm_xml_add(msg, F_LRMD_REMOTE_MSG_TYPE, msg_type);
return crm_remote_send(session, msg);
}
static xmlNode *
lrmd_tls_recv_reply(lrmd_t * lrmd, int total_timeout, int expected_reply_id, int *disconnected)
{
lrmd_private_t *native = lrmd->lrmd_private;
xmlNode *xml = NULL;
time_t start = time(NULL);
const char *msg_type = NULL;
int reply_id = 0;
int remaining_timeout = 0;
/* A timeout of 0 here makes no sense. We have to wait a period of time
* for the response to come back. If -1 or 0, default to 10 seconds. */
if (total_timeout <= 0 || total_timeout > MAX_TLS_RECV_WAIT) {
total_timeout = MAX_TLS_RECV_WAIT;
}
while (!xml) {
xml = crm_remote_parse_buffer(native->remote);
if (!xml) {
/* read some more off the tls buffer if we still have time left. */
if (remaining_timeout) {
remaining_timeout = total_timeout - ((time(NULL) - start) * 1000);
} else {
remaining_timeout = total_timeout;
}
if (remaining_timeout <= 0) {
crm_err("Never received the expected reply during the timeout period, disconnecting.");
*disconnected = TRUE;
return NULL;
}
crm_remote_recv(native->remote, remaining_timeout, disconnected);
xml = crm_remote_parse_buffer(native->remote);
if (!xml) {
crm_err("Unable to receive expected reply, disconnecting.");
*disconnected = TRUE;
return NULL;
} else if (*disconnected) {
return NULL;
}
}
CRM_ASSERT(xml != NULL);
crm_element_value_int(xml, F_LRMD_REMOTE_MSG_ID, &reply_id);
msg_type = crm_element_value(xml, F_LRMD_REMOTE_MSG_TYPE);
if (!msg_type) {
crm_err("Empty msg type received while waiting for reply");
free_xml(xml);
xml = NULL;
} else if (safe_str_eq(msg_type, "notify")) {
/* got a notify while waiting for reply, trigger the notify to be processed later */
crm_info("queueing notify");
native->pending_notify = g_list_append(native->pending_notify, xml);
if (native->process_notify) {
crm_info("notify trigger set.");
mainloop_set_trigger(native->process_notify);
}
xml = NULL;
} else if (safe_str_neq(msg_type, "reply")) {
/* msg isn't a reply, make some noise */
crm_err("Expected a reply, got %s", msg_type);
free_xml(xml);
xml = NULL;
} else if (reply_id != expected_reply_id) {
if (native->expected_late_replies > 0) {
native->expected_late_replies--;
} else {
crm_err("Got outdated reply, expected id %d got id %d", expected_reply_id, reply_id);
}
free_xml(xml);
xml = NULL;
}
}
if (native->remote->buffer && native->process_notify) {
mainloop_set_trigger(native->process_notify);
}
return xml;
}
static int
lrmd_tls_send(lrmd_t * lrmd, xmlNode * msg)
{
int rc = 0;
lrmd_private_t *native = lrmd->lrmd_private;
global_remote_msg_id++;
if (global_remote_msg_id <= 0) {
global_remote_msg_id = 1;
}
rc = lrmd_tls_send_msg(native->remote, msg, global_remote_msg_id, "request");
if (rc <= 0) {
crm_err("Disconnecting because TLS message could not be sent to Pacemaker Remote");
lrmd_tls_disconnect(lrmd);
return -ENOTCONN;
}
return pcmk_ok;
}
static int
lrmd_tls_send_recv(lrmd_t * lrmd, xmlNode * msg, int timeout, xmlNode ** reply)
{
int rc = 0;
int disconnected = 0;
xmlNode *xml = NULL;
if (lrmd_tls_connected(lrmd) == FALSE) {
return -1;
}
rc = lrmd_tls_send(lrmd, msg);
if (rc < 0) {
return rc;
}
xml = lrmd_tls_recv_reply(lrmd, timeout, global_remote_msg_id, &disconnected);
if (disconnected) {
crm_err("Pacemaker Remote disconnected while waiting for reply to request id %d",
global_remote_msg_id);
lrmd_tls_disconnect(lrmd);
rc = -ENOTCONN;
} else if (!xml) {
crm_err("Did not receive reply from Pacemaker Remote for request id %d (timeout %dms)",
global_remote_msg_id, timeout);
rc = -ECOMM;
}
if (reply) {
*reply = xml;
} else {
free_xml(xml);
}
return rc;
}
#endif
static int
lrmd_send_xml(lrmd_t * lrmd, xmlNode * msg, int timeout, xmlNode ** reply)
{
int rc = -1;
lrmd_private_t *native = lrmd->lrmd_private;
switch (native->type) {
case CRM_CLIENT_IPC:
rc = crm_ipc_send(native->ipc, msg, crm_ipc_client_response, timeout, reply);
break;
#ifdef HAVE_GNUTLS_GNUTLS_H
case CRM_CLIENT_TLS:
rc = lrmd_tls_send_recv(lrmd, msg, timeout, reply);
break;
#endif
default:
crm_err("Unsupported connection type: %d", native->type);
}
return rc;
}
static int
lrmd_send_xml_no_reply(lrmd_t * lrmd, xmlNode * msg)
{
int rc = -1;
lrmd_private_t *native = lrmd->lrmd_private;
switch (native->type) {
case CRM_CLIENT_IPC:
rc = crm_ipc_send(native->ipc, msg, crm_ipc_flags_none, 0, NULL);
break;
#ifdef HAVE_GNUTLS_GNUTLS_H
case CRM_CLIENT_TLS:
rc = lrmd_tls_send(lrmd, msg);
if (rc == pcmk_ok) {
/* we don't want to wait around for the reply, but
* since the request/reply protocol needs to behave the same
* as libqb, a reply will eventually come later anyway. */
native->expected_late_replies++;
}
break;
#endif
default:
crm_err("Unsupported connection type: %d", native->type);
}
return rc;
}
static int
lrmd_api_is_connected(lrmd_t * lrmd)
{
lrmd_private_t *native = lrmd->lrmd_private;
switch (native->type) {
case CRM_CLIENT_IPC:
return crm_ipc_connected(native->ipc);
break;
#ifdef HAVE_GNUTLS_GNUTLS_H
case CRM_CLIENT_TLS:
return lrmd_tls_connected(lrmd);
break;
#endif
default:
crm_err("Unsupported connection type: %d", native->type);
}
return 0;
}
/*!
* \internal
* \brief Send a prepared API command to the executor
*
* \param[in] lrmd Existing connection to the executor
* \param[in] op Name of API command to send
* \param[in] data Command data XML to add to the sent command
* \param[out] output_data If expecting a reply, it will be stored here
* \param[in] timeout Timeout in milliseconds (if 0, defaults to 1000);
* will be added to the command XML
* \param[in] call_options Call options to pass to server when sending
* \param[in] expect_reply If TRUE, wait for a reply from the server;
* must be TRUE for IPC (as opposed to TLS) clients
*
* \return pcmk_ok on success, -errno on error
*/
static int
lrmd_send_command(lrmd_t *lrmd, const char *op, xmlNode *data,
xmlNode **output_data, int timeout,
enum lrmd_call_options options, gboolean expect_reply)
{
int rc = pcmk_ok;
lrmd_private_t *native = lrmd->lrmd_private;
xmlNode *op_msg = NULL;
xmlNode *op_reply = NULL;
if (!lrmd_api_is_connected(lrmd)) {
return -ENOTCONN;
}
if (op == NULL) {
crm_err("No operation specified");
return -EINVAL;
}
CRM_CHECK(native->token != NULL,;
);
crm_trace("Sending %s op to executor", op);
op_msg = lrmd_create_op(native->token, op, data, timeout, options);
if (op_msg == NULL) {
return -EINVAL;
}
if (expect_reply) {
rc = lrmd_send_xml(lrmd, op_msg, timeout, &op_reply);
} else {
rc = lrmd_send_xml_no_reply(lrmd, op_msg);
goto done;
}
if (rc < 0) {
crm_perror(LOG_ERR, "Couldn't perform %s operation (timeout=%d): %d", op, timeout, rc);
rc = -ECOMM;
goto done;
} else if(op_reply == NULL) {
rc = -ENOMSG;
goto done;
}
rc = pcmk_ok;
crm_trace("%s op reply received", op);
if (crm_element_value_int(op_reply, F_LRMD_RC, &rc) != 0) {
rc = -ENOMSG;
goto done;
}
crm_log_xml_trace(op_reply, "Reply");
if (output_data) {
*output_data = op_reply;
op_reply = NULL; /* Prevent subsequent free */
}
done:
if (lrmd_api_is_connected(lrmd) == FALSE) {
crm_err("Executor disconnected");
}
free_xml(op_msg);
free_xml(op_reply);
return rc;
}
static int
lrmd_api_poke_connection(lrmd_t * lrmd)
{
int rc;
lrmd_private_t *native = lrmd->lrmd_private;
xmlNode *data = create_xml_node(NULL, F_LRMD_RSC);
crm_xml_add(data, F_LRMD_ORIGIN, __FUNCTION__);
rc = lrmd_send_command(lrmd, LRMD_OP_POKE, data, NULL, 0, 0, native->type == CRM_CLIENT_IPC ? TRUE : FALSE);
free_xml(data);
return rc < 0 ? rc : pcmk_ok;
}
int
remote_proxy_check(lrmd_t * lrmd, GHashTable *hash)
{
int rc;
const char *value;
lrmd_private_t *native = lrmd->lrmd_private;
xmlNode *data = create_xml_node(NULL, F_LRMD_OPERATION);
crm_xml_add(data, F_LRMD_ORIGIN, __FUNCTION__);
value = g_hash_table_lookup(hash, "stonith-watchdog-timeout");
crm_xml_add(data, F_LRMD_WATCHDOG, value);
rc = lrmd_send_command(lrmd, LRMD_OP_CHECK, data, NULL, 0, 0, native->type == CRM_CLIENT_IPC ? TRUE : FALSE);
free_xml(data);
return rc < 0 ? rc : pcmk_ok;
}
static int
lrmd_handshake(lrmd_t * lrmd, const char *name)
{
int rc = pcmk_ok;
lrmd_private_t *native = lrmd->lrmd_private;
xmlNode *reply = NULL;
xmlNode *hello = create_xml_node(NULL, "lrmd_command");
crm_xml_add(hello, F_TYPE, T_LRMD);
crm_xml_add(hello, F_LRMD_OPERATION, CRM_OP_REGISTER);
crm_xml_add(hello, F_LRMD_CLIENTNAME, name);
crm_xml_add(hello, F_LRMD_PROTOCOL_VERSION, LRMD_PROTOCOL_VERSION);
/* advertise that we are a proxy provider */
if (native->proxy_callback) {
crm_xml_add(hello, F_LRMD_IS_IPC_PROVIDER, "true");
}
rc = lrmd_send_xml(lrmd, hello, -1, &reply);
if (rc < 0) {
crm_perror(LOG_DEBUG, "Couldn't complete registration with the executor API: %d", rc);
rc = -ECOMM;
} else if (reply == NULL) {
crm_err("Did not receive registration reply");
rc = -EPROTO;
} else {
const char *version = crm_element_value(reply, F_LRMD_PROTOCOL_VERSION);
const char *msg_type = crm_element_value(reply, F_LRMD_OPERATION);
const char *tmp_ticket = crm_element_value(reply, F_LRMD_CLIENTID);
crm_element_value_int(reply, F_LRMD_RC, &rc);
if (rc == -EPROTO) {
crm_err("Executor protocol version mismatch between client (%s) and server (%s)",
LRMD_PROTOCOL_VERSION, version);
crm_log_xml_err(reply, "Protocol Error");
} else if (safe_str_neq(msg_type, CRM_OP_REGISTER)) {
crm_err("Invalid registration message: %s", msg_type);
crm_log_xml_err(reply, "Bad reply");
rc = -EPROTO;
} else if (tmp_ticket == NULL) {
crm_err("No registration token provided");
crm_log_xml_err(reply, "Bad reply");
rc = -EPROTO;
} else {
crm_trace("Obtained registration token: %s", tmp_ticket);
native->token = strdup(tmp_ticket);
native->peer_version = strdup(version?version:"1.0"); /* Included since 1.1 */
rc = pcmk_ok;
}
}
free_xml(reply);
free_xml(hello);
if (rc != pcmk_ok) {
lrmd_api_disconnect(lrmd);
}
return rc;
}
static int
lrmd_ipc_connect(lrmd_t * lrmd, int *fd)
{
int rc = pcmk_ok;
lrmd_private_t *native = lrmd->lrmd_private;
static struct ipc_client_callbacks lrmd_callbacks = {
.dispatch = lrmd_ipc_dispatch,
.destroy = lrmd_ipc_connection_destroy
};
crm_info("Connecting to executor");
if (fd) {
/* No mainloop */
native->ipc = crm_ipc_new(CRM_SYSTEM_LRMD, 0);
if (native->ipc && crm_ipc_connect(native->ipc)) {
*fd = crm_ipc_get_fd(native->ipc);
} else if (native->ipc) {
crm_perror(LOG_ERR, "Connection to executor failed");
rc = -ENOTCONN;
}
} else {
native->source = mainloop_add_ipc_client(CRM_SYSTEM_LRMD, G_PRIORITY_HIGH, 0, lrmd, &lrmd_callbacks);
native->ipc = mainloop_get_ipc_client(native->source);
}
if (native->ipc == NULL) {
crm_debug("Could not connect to the executor API");
rc = -ENOTCONN;
}
return rc;
}
#ifdef HAVE_GNUTLS_GNUTLS_H
static int
set_key(gnutls_datum_t * key, const char *location)
{
FILE *stream;
int read_len = 256;
int cur_len = 0;
int buf_len = read_len;
static char *key_cache = NULL;
static size_t key_cache_len = 0;
static time_t key_cache_updated;
if (location == NULL) {
return -1;
}
if (key_cache) {
time_t now = time(NULL);
if ((now - key_cache_updated) < 60) {
key->data = gnutls_malloc(key_cache_len + 1);
key->size = key_cache_len;
memcpy(key->data, key_cache, key_cache_len);
crm_debug("Using cached executor key");
return 0;
} else {
key_cache_len = 0;
key_cache_updated = 0;
free(key_cache);
key_cache = NULL;
crm_debug("Clearing executor key cache");
}
}
stream = fopen(location, "r");
if (!stream) {
return -1;
}
key->data = gnutls_malloc(read_len);
while (!feof(stream)) {
int next;
if (cur_len == buf_len) {
buf_len = cur_len + read_len;
key->data = gnutls_realloc(key->data, buf_len);
}
next = fgetc(stream);
if (next == EOF && feof(stream)) {
break;
}
key->data[cur_len] = next;
cur_len++;
}
fclose(stream);
key->size = cur_len;
if (!cur_len) {
gnutls_free(key->data);
key->data = 0;
return -1;
}
if (!key_cache) {
key_cache = calloc(1, key->size + 1);
memcpy(key_cache, key->data, key->size);
key_cache_len = key->size;
key_cache_updated = time(NULL);
}
return 0;
}
int
lrmd_tls_set_key(gnutls_datum_t * key)
{
const char *specific_location = getenv("PCMK_authkey_location");
if (set_key(key, specific_location) == 0) {
crm_debug("Using custom authkey location %s", specific_location);
return pcmk_ok;
} else if (specific_location) {
crm_err("No valid Pacemaker Remote key found at %s, trying default location", specific_location);
}
if ((set_key(key, DEFAULT_REMOTE_KEY_LOCATION) != 0)
&& (set_key(key, ALT_REMOTE_KEY_LOCATION) != 0)) {
crm_err("No valid Pacemaker Remote key found at %s", DEFAULT_REMOTE_KEY_LOCATION);
return -ENOKEY;
}
return pcmk_ok;
}
static void
lrmd_gnutls_global_init(void)
{
static int gnutls_init = 0;
if (!gnutls_init) {
crm_gnutls_global_init();
}
gnutls_init = 1;
}
#endif
static void
report_async_connection_result(lrmd_t * lrmd, int rc)
{
lrmd_private_t *native = lrmd->lrmd_private;
if (native->callback) {
lrmd_event_data_t event = { 0, };
event.type = lrmd_event_connect;
event.remote_nodename = native->remote_nodename;
event.connection_rc = rc;
native->callback(&event);
}
}
#ifdef HAVE_GNUTLS_GNUTLS_H
static void
lrmd_tcp_connect_cb(void *userdata, int sock)
{
lrmd_t *lrmd = userdata;
lrmd_private_t *native = lrmd->lrmd_private;
char *name;
static struct mainloop_fd_callbacks lrmd_tls_callbacks = {
.dispatch = lrmd_tls_dispatch,
.destroy = lrmd_tls_connection_destroy,
};
int rc = sock;
gnutls_datum_t psk_key = { NULL, 0 };
native->async_timer = 0;
if (rc < 0) {
lrmd_tls_connection_destroy(lrmd);
crm_info("Could not connect to Pacemaker Remote at %s:%d",
native->server, native->port);
report_async_connection_result(lrmd, rc);
return;
}
/* The TCP connection was successful, so establish the TLS connection.
* @TODO make this async to avoid blocking code in client
*/
native->sock = sock;
rc = lrmd_tls_set_key(&psk_key);
if (rc != 0) {
crm_warn("Could not set key for Pacemaker Remote at %s:%d " CRM_XS " rc=%d",
native->server, native->port, rc);
lrmd_tls_connection_destroy(lrmd);
report_async_connection_result(lrmd, rc);
return;
}
gnutls_psk_allocate_client_credentials(&native->psk_cred_c);
gnutls_psk_set_client_credentials(native->psk_cred_c, DEFAULT_REMOTE_USERNAME, &psk_key, GNUTLS_PSK_KEY_RAW);
gnutls_free(psk_key.data);
native->remote->tls_session = create_psk_tls_session(sock, GNUTLS_CLIENT, native->psk_cred_c);
if (crm_initiate_client_tls_handshake(native->remote, LRMD_CLIENT_HANDSHAKE_TIMEOUT) != 0) {
crm_warn("Disconnecting after TLS handshake with Pacemaker Remote server %s:%d failed",
native->server, native->port);
gnutls_deinit(*native->remote->tls_session);
gnutls_free(native->remote->tls_session);
native->remote->tls_session = NULL;
lrmd_tls_connection_destroy(lrmd);
report_async_connection_result(lrmd, -EKEYREJECTED);
return;
}
crm_info("TLS connection to Pacemaker Remote server %s:%d succeeded",
native->server, native->port);
- name = crm_strdup_printf("remote-lrmd-%s:%d", native->server, native->port);
+ name = crm_strdup_printf("pacemaker-remote-%s:%d",
+ native->server, native->port);
native->process_notify = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_tls_dispatch, lrmd);
native->source =
mainloop_add_fd(name, G_PRIORITY_HIGH, native->sock, lrmd, &lrmd_tls_callbacks);
rc = lrmd_handshake(lrmd, name);
free(name);
report_async_connection_result(lrmd, rc);
return;
}
static int
lrmd_tls_connect_async(lrmd_t * lrmd, int timeout /*ms */ )
{
int sock = 0;
int timer_id = 0;
lrmd_private_t *native = lrmd->lrmd_private;
lrmd_gnutls_global_init();
sock = crm_remote_tcp_connect_async(native->server, native->port, timeout,
&timer_id, lrmd, lrmd_tcp_connect_cb);
if (sock < 0) {
return sock;
}
native->sock = sock;
native->async_timer = timer_id;
return pcmk_ok;
}
static int
lrmd_tls_connect(lrmd_t * lrmd, int *fd)
{
static struct mainloop_fd_callbacks lrmd_tls_callbacks = {
.dispatch = lrmd_tls_dispatch,
.destroy = lrmd_tls_connection_destroy,
};
int rc;
lrmd_private_t *native = lrmd->lrmd_private;
int sock;
gnutls_datum_t psk_key = { NULL, 0 };
lrmd_gnutls_global_init();
sock = crm_remote_tcp_connect(native->server, native->port);
if (sock < 0) {
crm_warn("Could not establish Pacemaker Remote connection to %s", native->server);
lrmd_tls_connection_destroy(lrmd);
return -ENOTCONN;
}
native->sock = sock;
rc = lrmd_tls_set_key(&psk_key);
if (rc < 0) {
lrmd_tls_connection_destroy(lrmd);
return rc;
}
gnutls_psk_allocate_client_credentials(&native->psk_cred_c);
gnutls_psk_set_client_credentials(native->psk_cred_c, DEFAULT_REMOTE_USERNAME, &psk_key, GNUTLS_PSK_KEY_RAW);
gnutls_free(psk_key.data);
native->remote->tls_session = create_psk_tls_session(sock, GNUTLS_CLIENT, native->psk_cred_c);
if (crm_initiate_client_tls_handshake(native->remote, LRMD_CLIENT_HANDSHAKE_TIMEOUT) != 0) {
crm_err("Session creation for %s:%d failed", native->server, native->port);
gnutls_deinit(*native->remote->tls_session);
gnutls_free(native->remote->tls_session);
native->remote->tls_session = NULL;
lrmd_tls_connection_destroy(lrmd);
return -EKEYREJECTED;
}
crm_info("Client TLS connection established with Pacemaker Remote server %s:%d", native->server,
native->port);
if (fd) {
*fd = sock;
} else {
- char *name = crm_strdup_printf("remote-lrmd-%s:%d",
+ char *name = crm_strdup_printf("pacemaker-remote-%s:%d",
native->server, native->port);
native->process_notify = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_tls_dispatch, lrmd);
native->source =
mainloop_add_fd(name, G_PRIORITY_HIGH, native->sock, lrmd, &lrmd_tls_callbacks);
free(name);
}
return pcmk_ok;
}
#endif
static int
lrmd_api_connect(lrmd_t * lrmd, const char *name, int *fd)
{
int rc = -ENOTCONN;
lrmd_private_t *native = lrmd->lrmd_private;
switch (native->type) {
case CRM_CLIENT_IPC:
rc = lrmd_ipc_connect(lrmd, fd);
break;
#ifdef HAVE_GNUTLS_GNUTLS_H
case CRM_CLIENT_TLS:
rc = lrmd_tls_connect(lrmd, fd);
break;
#endif
default:
crm_err("Unsupported connection type: %d", native->type);
}
if (rc == pcmk_ok) {
rc = lrmd_handshake(lrmd, name);
}
return rc;
}
static int
lrmd_api_connect_async(lrmd_t * lrmd, const char *name, int timeout)
{
int rc = 0;
lrmd_private_t *native = lrmd->lrmd_private;
CRM_CHECK(native && native->callback, return -1);
switch (native->type) {
case CRM_CLIENT_IPC:
/* fake async connection with ipc. it should be fast
* enough that we gain very little from async */
rc = lrmd_api_connect(lrmd, name, NULL);
if (!rc) {
report_async_connection_result(lrmd, rc);
}
break;
#ifdef HAVE_GNUTLS_GNUTLS_H
case CRM_CLIENT_TLS:
rc = lrmd_tls_connect_async(lrmd, timeout);
if (rc) {
/* connection failed, report rc now */
report_async_connection_result(lrmd, rc);
}
break;
#endif
default:
crm_err("Unsupported connection type: %d", native->type);
}
return rc;
}
static void
lrmd_ipc_disconnect(lrmd_t * lrmd)
{
lrmd_private_t *native = lrmd->lrmd_private;
if (native->source != NULL) {
/* Attached to mainloop */
mainloop_del_ipc_client(native->source);
native->source = NULL;
native->ipc = NULL;
} else if (native->ipc) {
/* Not attached to mainloop */
crm_ipc_t *ipc = native->ipc;
native->ipc = NULL;
crm_ipc_close(ipc);
crm_ipc_destroy(ipc);
}
}
#ifdef HAVE_GNUTLS_GNUTLS_H
static void
lrmd_tls_disconnect(lrmd_t * lrmd)
{
lrmd_private_t *native = lrmd->lrmd_private;
if (native->remote->tls_session) {
gnutls_bye(*native->remote->tls_session, GNUTLS_SHUT_RDWR);
gnutls_deinit(*native->remote->tls_session);
gnutls_free(native->remote->tls_session);
native->remote->tls_session = 0;
}
if (native->async_timer) {
g_source_remove(native->async_timer);
native->async_timer = 0;
}
if (native->source != NULL) {
/* Attached to mainloop */
mainloop_del_ipc_client(native->source);
native->source = NULL;
} else if (native->sock) {
close(native->sock);
native->sock = 0;
}
if (native->pending_notify) {
g_list_free_full(native->pending_notify, lrmd_free_xml);
native->pending_notify = NULL;
}
}
#endif
static int
lrmd_api_disconnect(lrmd_t * lrmd)
{
lrmd_private_t *native = lrmd->lrmd_private;
crm_info("Disconnecting %s %s executor connection",
crm_client_type_text(native->type),
(native->remote_nodename? native->remote_nodename : "local"));
switch (native->type) {
case CRM_CLIENT_IPC:
lrmd_ipc_disconnect(lrmd);
break;
#ifdef HAVE_GNUTLS_GNUTLS_H
case CRM_CLIENT_TLS:
lrmd_tls_disconnect(lrmd);
break;
#endif
default:
crm_err("Unsupported connection type: %d", native->type);
}
free(native->token);
native->token = NULL;
free(native->peer_version);
native->peer_version = NULL;
return 0;
}
static int
lrmd_api_register_rsc(lrmd_t * lrmd,
const char *rsc_id,
const char *class,
const char *provider, const char *type, enum lrmd_call_options options)
{
int rc = pcmk_ok;
xmlNode *data = NULL;
if (!class || !type || !rsc_id) {
return -EINVAL;
}
if (crm_provider_required(class) && !provider) {
return -EINVAL;
}
data = create_xml_node(NULL, F_LRMD_RSC);
crm_xml_add(data, F_LRMD_ORIGIN, __FUNCTION__);
crm_xml_add(data, F_LRMD_RSC_ID, rsc_id);
crm_xml_add(data, F_LRMD_CLASS, class);
crm_xml_add(data, F_LRMD_PROVIDER, provider);
crm_xml_add(data, F_LRMD_TYPE, type);
rc = lrmd_send_command(lrmd, LRMD_OP_RSC_REG, data, NULL, 0, options, TRUE);
free_xml(data);
return rc;
}
static int
lrmd_api_unregister_rsc(lrmd_t * lrmd, const char *rsc_id, enum lrmd_call_options options)
{
int rc = pcmk_ok;
xmlNode *data = create_xml_node(NULL, F_LRMD_RSC);
crm_xml_add(data, F_LRMD_ORIGIN, __FUNCTION__);
crm_xml_add(data, F_LRMD_RSC_ID, rsc_id);
rc = lrmd_send_command(lrmd, LRMD_OP_RSC_UNREG, data, NULL, 0, options, TRUE);
free_xml(data);
return rc;
}
lrmd_rsc_info_t *
lrmd_new_rsc_info(const char *rsc_id, const char *standard,
const char *provider, const char *type)
{
lrmd_rsc_info_t *rsc_info = calloc(1, sizeof(lrmd_rsc_info_t));
CRM_ASSERT(rsc_info);
if (rsc_id) {
rsc_info->id = strdup(rsc_id);
CRM_ASSERT(rsc_info->id);
}
if (standard) {
rsc_info->standard = strdup(standard);
CRM_ASSERT(rsc_info->standard);
}
if (provider) {
rsc_info->provider = strdup(provider);
CRM_ASSERT(rsc_info->provider);
}
if (type) {
rsc_info->type = strdup(type);
CRM_ASSERT(rsc_info->type);
}
return rsc_info;
}
lrmd_rsc_info_t *
lrmd_copy_rsc_info(lrmd_rsc_info_t * rsc_info)
{
return lrmd_new_rsc_info(rsc_info->id, rsc_info->standard,
rsc_info->provider, rsc_info->type);
}
void
lrmd_free_rsc_info(lrmd_rsc_info_t * rsc_info)
{
if (!rsc_info) {
return;
}
free(rsc_info->id);
free(rsc_info->type);
free(rsc_info->standard);
free(rsc_info->provider);
free(rsc_info);
}
static lrmd_rsc_info_t *
lrmd_api_get_rsc_info(lrmd_t * lrmd, const char *rsc_id, enum lrmd_call_options options)
{
lrmd_rsc_info_t *rsc_info = NULL;
xmlNode *data = create_xml_node(NULL, F_LRMD_RSC);
xmlNode *output = NULL;
const char *class = NULL;
const char *provider = NULL;
const char *type = NULL;
crm_xml_add(data, F_LRMD_ORIGIN, __FUNCTION__);
crm_xml_add(data, F_LRMD_RSC_ID, rsc_id);
lrmd_send_command(lrmd, LRMD_OP_RSC_INFO, data, &output, 0, options, TRUE);
free_xml(data);
if (!output) {
return NULL;
}
class = crm_element_value(output, F_LRMD_CLASS);
provider = crm_element_value(output, F_LRMD_PROVIDER);
type = crm_element_value(output, F_LRMD_TYPE);
if (!class || !type) {
free_xml(output);
return NULL;
} else if (crm_provider_required(class) && !provider) {
free_xml(output);
return NULL;
}
rsc_info = lrmd_new_rsc_info(rsc_id, class, provider, type);
free_xml(output);
return rsc_info;
}
void
lrmd_free_op_info(lrmd_op_info_t *op_info)
{
if (op_info) {
free(op_info->rsc_id);
free(op_info->action);
free(op_info->interval_ms_s);
free(op_info->timeout_ms_s);
free(op_info);
}
}
static int
lrmd_api_get_recurring_ops(lrmd_t *lrmd, const char *rsc_id, int timeout_ms,
enum lrmd_call_options options, GList **output)
{
xmlNode *data = NULL;
xmlNode *output_xml = NULL;
int rc = pcmk_ok;
if (output == NULL) {
return -EINVAL;
}
*output = NULL;
// Send request
if (rsc_id) {
data = create_xml_node(NULL, F_LRMD_RSC);
crm_xml_add(data, F_LRMD_ORIGIN, __FUNCTION__);
crm_xml_add(data, F_LRMD_RSC_ID, rsc_id);
}
rc = lrmd_send_command(lrmd, LRMD_OP_GET_RECURRING, data, &output_xml,
timeout_ms, options, TRUE);
if (data) {
free_xml(data);
}
// Process reply
if ((rc != pcmk_ok) || (output_xml == NULL)) {
return rc;
}
for (xmlNode *rsc_xml = first_named_child(output_xml, F_LRMD_RSC);
rsc_xml != NULL; rsc_xml = crm_next_same_xml(rsc_xml)) {
const char *rsc_id = crm_element_value(rsc_xml, F_LRMD_RSC_ID);
if (rsc_id == NULL) {
crm_err("Could not parse recurring operation information from executor");
continue;
}
for (xmlNode *op_xml = first_named_child(rsc_xml, T_LRMD_RSC_OP);
op_xml != NULL; op_xml = crm_next_same_xml(op_xml)) {
lrmd_op_info_t *op_info = calloc(1, sizeof(lrmd_op_info_t));
CRM_CHECK(op_info != NULL, break);
op_info->rsc_id = strdup(rsc_id);
op_info->action = crm_element_value_copy(op_xml, F_LRMD_RSC_ACTION);
op_info->interval_ms_s = crm_element_value_copy(op_xml,
F_LRMD_RSC_INTERVAL);
op_info->timeout_ms_s = crm_element_value_copy(op_xml,
F_LRMD_TIMEOUT);
*output = g_list_prepend(*output, op_info);
}
}
free_xml(output_xml);
return rc;
}
static void
lrmd_api_set_callback(lrmd_t * lrmd, lrmd_event_callback callback)
{
lrmd_private_t *native = lrmd->lrmd_private;
native->callback = callback;
}
void
lrmd_internal_set_proxy_callback(lrmd_t * lrmd, void *userdata, void (*callback)(lrmd_t *lrmd, void *userdata, xmlNode *msg))
{
lrmd_private_t *native = lrmd->lrmd_private;
native->proxy_callback = callback;
native->proxy_callback_userdata = userdata;
}
void
lrmd_internal_proxy_dispatch(lrmd_t *lrmd, xmlNode *msg)
{
lrmd_private_t *native = lrmd->lrmd_private;
if (native->proxy_callback) {
crm_log_xml_trace(msg, "PROXY_INBOUND");
native->proxy_callback(lrmd, native->proxy_callback_userdata, msg);
}
}
int
lrmd_internal_proxy_send(lrmd_t * lrmd, xmlNode *msg)
{
if (lrmd == NULL) {
return -ENOTCONN;
}
crm_xml_add(msg, F_LRMD_OPERATION, CRM_OP_IPC_FWD);
crm_log_xml_trace(msg, "PROXY_OUTBOUND");
return lrmd_send_xml_no_reply(lrmd, msg);
}
static int
stonith_get_metadata(const char *provider, const char *type, char **output)
{
int rc = pcmk_ok;
stonith_t *stonith_api = stonith_api_new();
if(stonith_api) {
stonith_api->cmds->metadata(stonith_api, st_opt_sync_call, type, provider, output, 0);
stonith_api->cmds->free(stonith_api);
}
if (*output == NULL) {
rc = -EIO;
}
return rc;
}
static int
lrmd_api_get_metadata(lrmd_t * lrmd,
const char *class,
const char *provider,
const char *type, char **output, enum lrmd_call_options options)
{
svc_action_t *action;
if (!class || !type) {
return -EINVAL;
}
if (safe_str_eq(class, PCMK_RESOURCE_CLASS_STONITH)) {
return stonith_get_metadata(provider, type, output);
}
action = resources_action_create(type, class, provider, type,
"meta-data", 0,
CRMD_METADATA_CALL_TIMEOUT, NULL, 0);
if (action == NULL) {
crm_err("Unable to retrieve meta-data for %s:%s:%s", class, provider, type);
services_action_free(action);
return -EINVAL;
}
if (!(services_action_sync(action))) {
crm_err("Failed to retrieve meta-data for %s:%s:%s", class, provider, type);
services_action_free(action);
return -EIO;
}
if (!action->stdout_data) {
crm_err("Failed to receive meta-data for %s:%s:%s", class, provider, type);
services_action_free(action);
return -EIO;
}
*output = strdup(action->stdout_data);
services_action_free(action);
return pcmk_ok;
}
static int
lrmd_api_exec(lrmd_t *lrmd, const char *rsc_id, const char *action,
const char *userdata, guint interval_ms,
int timeout, /* ms */
int start_delay, /* ms */
enum lrmd_call_options options, lrmd_key_value_t * params)
{
int rc = pcmk_ok;
xmlNode *data = create_xml_node(NULL, F_LRMD_RSC);
xmlNode *args = create_xml_node(data, XML_TAG_ATTRS);
lrmd_key_value_t *tmp = NULL;
crm_xml_add(data, F_LRMD_ORIGIN, __FUNCTION__);
crm_xml_add(data, F_LRMD_RSC_ID, rsc_id);
crm_xml_add(data, F_LRMD_RSC_ACTION, action);
crm_xml_add(data, F_LRMD_RSC_USERDATA_STR, userdata);
crm_xml_add_ms(data, F_LRMD_RSC_INTERVAL, interval_ms);
crm_xml_add_int(data, F_LRMD_TIMEOUT, timeout);
crm_xml_add_int(data, F_LRMD_RSC_START_DELAY, start_delay);
for (tmp = params; tmp; tmp = tmp->next) {
hash2smartfield((gpointer) tmp->key, (gpointer) tmp->value, args);
}
rc = lrmd_send_command(lrmd, LRMD_OP_RSC_EXEC, data, NULL, timeout, options, TRUE);
free_xml(data);
lrmd_key_value_freeall(params);
return rc;
}
/* timeout is in ms */
static int
lrmd_api_exec_alert(lrmd_t *lrmd, const char *alert_id, const char *alert_path,
int timeout, lrmd_key_value_t *params)
{
int rc = pcmk_ok;
xmlNode *data = create_xml_node(NULL, F_LRMD_ALERT);
xmlNode *args = create_xml_node(data, XML_TAG_ATTRS);
lrmd_key_value_t *tmp = NULL;
crm_xml_add(data, F_LRMD_ORIGIN, __FUNCTION__);
crm_xml_add(data, F_LRMD_ALERT_ID, alert_id);
crm_xml_add(data, F_LRMD_ALERT_PATH, alert_path);
crm_xml_add_int(data, F_LRMD_TIMEOUT, timeout);
for (tmp = params; tmp; tmp = tmp->next) {
hash2smartfield((gpointer) tmp->key, (gpointer) tmp->value, args);
}
rc = lrmd_send_command(lrmd, LRMD_OP_ALERT_EXEC, data, NULL, timeout,
lrmd_opt_notify_orig_only, TRUE);
free_xml(data);
lrmd_key_value_freeall(params);
return rc;
}
static int
lrmd_api_cancel(lrmd_t *lrmd, const char *rsc_id, const char *action,
guint interval_ms)
{
int rc = pcmk_ok;
xmlNode *data = create_xml_node(NULL, F_LRMD_RSC);
crm_xml_add(data, F_LRMD_ORIGIN, __FUNCTION__);
crm_xml_add(data, F_LRMD_RSC_ACTION, action);
crm_xml_add(data, F_LRMD_RSC_ID, rsc_id);
crm_xml_add_ms(data, F_LRMD_RSC_INTERVAL, interval_ms);
rc = lrmd_send_command(lrmd, LRMD_OP_RSC_CANCEL, data, NULL, 0, 0, TRUE);
free_xml(data);
return rc;
}
static int
list_stonith_agents(lrmd_list_t ** resources)
{
int rc = 0;
stonith_t *stonith_api = stonith_api_new();
stonith_key_value_t *stonith_resources = NULL;
stonith_key_value_t *dIter = NULL;
if(stonith_api) {
stonith_api->cmds->list_agents(stonith_api, st_opt_sync_call, NULL, &stonith_resources, 0);
stonith_api->cmds->free(stonith_api);
}
for (dIter = stonith_resources; dIter; dIter = dIter->next) {
rc++;
if (resources) {
*resources = lrmd_list_add(*resources, dIter->value);
}
}
stonith_key_value_freeall(stonith_resources, 1, 0);
return rc;
}
static int
lrmd_api_list_agents(lrmd_t * lrmd, lrmd_list_t ** resources, const char *class,
const char *provider)
{
int rc = 0;
if (safe_str_eq(class, PCMK_RESOURCE_CLASS_STONITH)) {
rc += list_stonith_agents(resources);
} else {
GListPtr gIter = NULL;
GList *agents = resources_list_agents(class, provider);
for (gIter = agents; gIter != NULL; gIter = gIter->next) {
*resources = lrmd_list_add(*resources, (const char *)gIter->data);
rc++;
}
g_list_free_full(agents, free);
if (!class) {
rc += list_stonith_agents(resources);
}
}
if (rc == 0) {
crm_notice("No agents found for class %s", class);
rc = -EPROTONOSUPPORT;
}
return rc;
}
static int
does_provider_have_agent(const char *agent, const char *provider, const char *class)
{
int found = 0;
GList *agents = NULL;
GListPtr gIter2 = NULL;
agents = resources_list_agents(class, provider);
for (gIter2 = agents; gIter2 != NULL; gIter2 = gIter2->next) {
if (safe_str_eq(agent, gIter2->data)) {
found = 1;
}
}
g_list_free_full(agents, free);
return found;
}
static int
lrmd_api_list_ocf_providers(lrmd_t * lrmd, const char *agent, lrmd_list_t ** providers)
{
int rc = pcmk_ok;
char *provider = NULL;
GList *ocf_providers = NULL;
GListPtr gIter = NULL;
ocf_providers = resources_list_providers(PCMK_RESOURCE_CLASS_OCF);
for (gIter = ocf_providers; gIter != NULL; gIter = gIter->next) {
provider = gIter->data;
if (!agent || does_provider_have_agent(agent, provider,
PCMK_RESOURCE_CLASS_OCF)) {
*providers = lrmd_list_add(*providers, (const char *)gIter->data);
rc++;
}
}
g_list_free_full(ocf_providers, free);
return rc;
}
static int
lrmd_api_list_standards(lrmd_t * lrmd, lrmd_list_t ** supported)
{
int rc = 0;
GList *standards = NULL;
GListPtr gIter = NULL;
standards = resources_list_standards();
for (gIter = standards; gIter != NULL; gIter = gIter->next) {
*supported = lrmd_list_add(*supported, (const char *)gIter->data);
rc++;
}
if (list_stonith_agents(NULL) > 0) {
*supported = lrmd_list_add(*supported, PCMK_RESOURCE_CLASS_STONITH);
rc++;
}
g_list_free_full(standards, free);
return rc;
}
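/*!
 * \brief Create a new connection object for the local executor
 *
 * The returned object uses IPC as its transport, and must be freed with
 * lrmd_api_delete() when no longer needed.
 *
 * \return Newly allocated executor API connection object
 */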
lrmd_t *
lrmd_api_new(void)
{
lrmd_t *new_lrmd = NULL;
lrmd_private_t *pvt = NULL;
new_lrmd = calloc(1, sizeof(lrmd_t));
pvt = calloc(1, sizeof(lrmd_private_t));
pvt->remote = calloc(1, sizeof(crm_remote_t));
new_lrmd->cmds = calloc(1, sizeof(lrmd_api_operations_t));
pvt->type = CRM_CLIENT_IPC;
new_lrmd->lrmd_private = pvt;
new_lrmd->cmds->connect = lrmd_api_connect;
new_lrmd->cmds->connect_async = lrmd_api_connect_async;
new_lrmd->cmds->is_connected = lrmd_api_is_connected;
new_lrmd->cmds->poke_connection = lrmd_api_poke_connection;
new_lrmd->cmds->disconnect = lrmd_api_disconnect;
new_lrmd->cmds->register_rsc = lrmd_api_register_rsc;
new_lrmd->cmds->unregister_rsc = lrmd_api_unregister_rsc;
new_lrmd->cmds->get_rsc_info = lrmd_api_get_rsc_info;
new_lrmd->cmds->get_recurring_ops = lrmd_api_get_recurring_ops;
new_lrmd->cmds->set_callback = lrmd_api_set_callback;
new_lrmd->cmds->get_metadata = lrmd_api_get_metadata;
new_lrmd->cmds->exec = lrmd_api_exec;
new_lrmd->cmds->cancel = lrmd_api_cancel;
new_lrmd->cmds->list_agents = lrmd_api_list_agents;
new_lrmd->cmds->list_ocf_providers = lrmd_api_list_ocf_providers;
new_lrmd->cmds->list_standards = lrmd_api_list_standards;
new_lrmd->cmds->exec_alert = lrmd_api_exec_alert;
return new_lrmd;
}
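/*!
 * \brief Create a new TLS connection object for a remote executor
 *
 * At least one of \p nodename and \p server must be non-NULL; whichever is
 * NULL defaults to the other. A \p port of 0 selects the default remote port.
 *
 * \return Newly allocated connection object, or NULL on error (including
 *         when the build does not support GnuTLS)
 */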
lrmd_t *
lrmd_remote_api_new(const char *nodename, const char *server, int port)
{
#ifdef HAVE_GNUTLS_GNUTLS_H
lrmd_t *new_lrmd = lrmd_api_new();
lrmd_private_t *native = new_lrmd->lrmd_private;
if (!nodename && !server) {
lrmd_api_delete(new_lrmd);
return NULL;
}
native->type = CRM_CLIENT_TLS;
native->remote_nodename = nodename ? strdup(nodename) : strdup(server);
native->server = server ? strdup(server) : strdup(nodename);
native->port = port;
if (native->port == 0) {
native->port = crm_default_remote_port();
}
return new_lrmd;
#else
crm_err("Cannot communicate with Pacemaker Remote because GnuTLS is not enabled for this build");
return NULL;
#endif
}
void
lrmd_api_delete(lrmd_t * lrmd)
{
if (!lrmd) {
return;
}
lrmd->cmds->disconnect(lrmd); /* no-op if already disconnected */
free(lrmd->cmds);
if (lrmd->lrmd_private) {
lrmd_private_t *native = lrmd->lrmd_private;
#ifdef HAVE_GNUTLS_GNUTLS_H
free(native->server);
#endif
free(native->remote_nodename);
free(native->remote);
free(native->token);
free(native->peer_version);
}
free(lrmd->lrmd_private);
free(lrmd);
}
diff --git a/lib/lrmd/proxy_common.c b/lib/lrmd/proxy_common.c
index 3335148b88..7323f0d1cc 100644
--- a/lib/lrmd/proxy_common.c
+++ b/lib/lrmd/proxy_common.c
@@ -1,320 +1,308 @@
/*
- * Copyright (c) 2015 David Vossel <davidvossel@gmail.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ * Copyright 2015-2018 David Vossel <davidvossel@gmail.com>
*
+ * This source code is licensed under the GNU Lesser General Public License
+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <glib.h>
#include <unistd.h>
#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/services.h>
#include <crm/common/mainloop.h>
#include <crm/pengine/status.h>
#include <crm/cib.h>
#include <crm/lrmd.h>
int lrmd_internal_proxy_send(lrmd_t * lrmd, xmlNode *msg);
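// Proxy connections, keyed by session ID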
GHashTable *proxy_table = NULL;
static void
remote_proxy_notify_destroy(lrmd_t *lrmd, const char *session_id)
{
    /* Notify the remote node that an IPC connection has been destroyed */
xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_DESTROY);
crm_xml_add(msg, F_LRMD_IPC_SESSION, session_id);
lrmd_internal_proxy_send(lrmd, msg);
free_xml(msg);
}
/*!
* \brief Send an acknowledgment of a remote proxy shutdown request.
*
* \param[in] lrmd Connection to proxy
*/
void
remote_proxy_ack_shutdown(lrmd_t *lrmd)
{
xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_SHUTDOWN_ACK);
lrmd_internal_proxy_send(lrmd, msg);
free_xml(msg);
}
/*!
 * \brief Reply to a remote proxy shutdown request that we will not shut down
*
* \param[in] lrmd Connection to proxy
*/
void
remote_proxy_nack_shutdown(lrmd_t *lrmd)
{
xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY);
crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_SHUTDOWN_NACK);
lrmd_internal_proxy_send(lrmd, msg);
free_xml(msg);
}
void
remote_proxy_relay_event(remote_proxy_t *proxy, xmlNode *msg)
{
    /* Send an event message to the remote node */
xmlNode *event = create_xml_node(NULL, T_LRMD_IPC_PROXY);
crm_xml_add(event, F_LRMD_IPC_OP, LRMD_IPC_OP_EVENT);
crm_xml_add(event, F_LRMD_IPC_SESSION, proxy->session_id);
add_message_xml(event, F_LRMD_IPC_MSG, msg);
crm_log_xml_explicit(event, "EventForProxy");
lrmd_internal_proxy_send(proxy->lrm, event);
free_xml(event);
}
void
remote_proxy_relay_response(remote_proxy_t *proxy, xmlNode *msg, int msg_id)
{
    /* Send a response message to the remote node */
xmlNode *response = create_xml_node(NULL, T_LRMD_IPC_PROXY);
crm_xml_add(response, F_LRMD_IPC_OP, LRMD_IPC_OP_RESPONSE);
crm_xml_add(response, F_LRMD_IPC_SESSION, proxy->session_id);
crm_xml_add_int(response, F_LRMD_IPC_MSG_ID, msg_id);
add_message_xml(response, F_LRMD_IPC_MSG, msg);
lrmd_internal_proxy_send(proxy->lrm, response);
free_xml(response);
}
static void
remote_proxy_end_session(remote_proxy_t *proxy)
{
if (proxy == NULL) {
return;
}
crm_trace("ending session ID %s", proxy->session_id);
if (proxy->source) {
mainloop_del_ipc_client(proxy->source);
}
}
void
remote_proxy_free(gpointer data)
{
remote_proxy_t *proxy = data;
crm_trace("freed proxy session ID %s", proxy->session_id);
free(proxy->node_name);
free(proxy->session_id);
free(proxy);
}
int
remote_proxy_dispatch(const char *buffer, ssize_t length, gpointer userdata)
{
- /* Async responses from cib and friends back to clients via pacemaker_remoted */
+ // Async responses from cib and friends to clients via pacemaker-remoted
xmlNode *xml = NULL;
uint32_t flags = 0;
remote_proxy_t *proxy = userdata;
xml = string2xml(buffer);
if (xml == NULL) {
crm_warn("Received a NULL msg from IPC service.");
return 1;
}
flags = crm_ipc_buffer_flags(proxy->ipc);
if (flags & crm_ipc_proxied_relay_response) {
crm_trace("Passing response back to %.8s on %s: %.200s - request id: %d", proxy->session_id, proxy->node_name, buffer, proxy->last_request_id);
remote_proxy_relay_response(proxy, xml, proxy->last_request_id);
proxy->last_request_id = 0;
} else {
crm_trace("Passing event back to %.8s on %s: %.200s", proxy->session_id, proxy->node_name, buffer);
remote_proxy_relay_event(proxy, xml);
}
free_xml(xml);
return 1;
}
void
remote_proxy_disconnected(gpointer userdata)
{
remote_proxy_t *proxy = userdata;
crm_trace("destroying %p", proxy);
proxy->source = NULL;
proxy->ipc = NULL;
if(proxy->lrm) {
remote_proxy_notify_destroy(proxy->lrm, proxy->session_id);
proxy->lrm = NULL;
}
g_hash_table_remove(proxy_table, proxy->session_id);
}
remote_proxy_t *
remote_proxy_new(lrmd_t *lrmd, struct ipc_client_callbacks *proxy_callbacks,
const char *node_name, const char *session_id, const char *channel)
{
remote_proxy_t *proxy = NULL;
if(channel == NULL) {
crm_err("No channel specified to proxy");
remote_proxy_notify_destroy(lrmd, session_id);
return NULL;
}
proxy = calloc(1, sizeof(remote_proxy_t));
proxy->node_name = strdup(node_name);
proxy->session_id = strdup(session_id);
proxy->lrm = lrmd;
if (safe_str_eq(crm_system_name, CRM_SYSTEM_CRMD)
&& safe_str_eq(channel, CRM_SYSTEM_CRMD)) {
/* The crmd doesn't need to connect to itself */
proxy->is_local = TRUE;
} else {
proxy->source = mainloop_add_ipc_client(channel, G_PRIORITY_LOW, 0, proxy, proxy_callbacks);
proxy->ipc = mainloop_get_ipc_client(proxy->source);
if (proxy->source == NULL) {
remote_proxy_free(proxy);
remote_proxy_notify_destroy(lrmd, session_id);
return NULL;
}
}
crm_trace("new remote proxy client established to %s on %s, session id %s",
channel, node_name, session_id);
g_hash_table_insert(proxy_table, proxy->session_id, proxy);
return proxy;
}
void
remote_proxy_cb(lrmd_t *lrmd, const char *node_name, xmlNode *msg)
{
const char *op = crm_element_value(msg, F_LRMD_IPC_OP);
const char *session = crm_element_value(msg, F_LRMD_IPC_SESSION);
remote_proxy_t *proxy = g_hash_table_lookup(proxy_table, session);
int msg_id = 0;
    /* Sessions are raw IPC connections; all we do is proxy
     * requests and responses exactly as they are given to us
     * at the IPC level. */
CRM_CHECK(op != NULL, return);
CRM_CHECK(session != NULL, return);
crm_element_value_int(msg, F_LRMD_IPC_MSG_ID, &msg_id);
    /* This is a message from a remote IPC client going to the real IPC server */
if (safe_str_eq(op, LRMD_IPC_OP_DESTROY)) {
remote_proxy_end_session(proxy);
} else if (safe_str_eq(op, LRMD_IPC_OP_REQUEST)) {
int flags = 0;
xmlNode *request = get_message_xml(msg, F_LRMD_IPC_MSG);
const char *name = crm_element_value(msg, F_LRMD_IPC_CLIENT);
CRM_CHECK(request != NULL, return);
if (proxy == NULL) {
/* proxy connection no longer exists */
remote_proxy_notify_destroy(lrmd, session);
return;
}
/* crmd requests MUST be handled by the crmd, not us */
CRM_CHECK(proxy->is_local == FALSE,
remote_proxy_end_session(proxy); return);
if (crm_ipc_connected(proxy->ipc) == FALSE) {
remote_proxy_end_session(proxy);
return;
}
proxy->last_request_id = 0;
crm_element_value_int(msg, F_LRMD_IPC_MSG_FLAGS, &flags);
crm_xml_add(request, XML_ACL_TAG_ROLE, "pacemaker-remote");
#if ENABLE_ACL
CRM_ASSERT(node_name);
crm_acl_get_set_user(request, F_LRMD_IPC_USER, node_name);
#endif
if(is_set(flags, crm_ipc_proxied)) {
const char *type = crm_element_value(request, F_TYPE);
int rc = 0;
if (safe_str_eq(type, T_ATTRD)
&& crm_element_value(request, F_ATTRD_HOST) == NULL) {
crm_xml_add(request, F_ATTRD_HOST, proxy->node_name);
}
rc = crm_ipc_send(proxy->ipc, request, flags, 5000, NULL);
if(rc < 0) {
xmlNode *op_reply = create_xml_node(NULL, "nack");
crm_err("Could not relay %s request %d from %s to %s for %s: %s (%d)",
op, msg_id, proxy->node_name, crm_ipc_name(proxy->ipc), name, pcmk_strerror(rc), rc);
                /* Send a nack so the caller doesn't block */
crm_xml_add(op_reply, "function", __FUNCTION__);
crm_xml_add_int(op_reply, "line", __LINE__);
crm_xml_add_int(op_reply, "rc", rc);
remote_proxy_relay_response(proxy, op_reply, msg_id);
free_xml(op_reply);
} else {
crm_trace("Relayed %s request %d from %s to %s for %s",
op, msg_id, proxy->node_name, crm_ipc_name(proxy->ipc), name);
proxy->last_request_id = msg_id;
}
} else {
int rc = pcmk_ok;
xmlNode *op_reply = NULL;
- /* For backwards compatibility with pacemaker_remoted <= 1.1.10 */
+ // @COMPAT pacemaker_remoted <= 1.1.10
crm_trace("Relaying %s request %d from %s to %s for %s",
op, msg_id, proxy->node_name, crm_ipc_name(proxy->ipc), name);
rc = crm_ipc_send(proxy->ipc, request, flags, 10000, &op_reply);
if(rc < 0) {
crm_err("Could not relay %s request %d from %s to %s for %s: %s (%d)",
op, msg_id, proxy->node_name, crm_ipc_name(proxy->ipc), name, pcmk_strerror(rc), rc);
} else {
crm_trace("Relayed %s request %d from %s to %s for %s",
op, msg_id, proxy->node_name, crm_ipc_name(proxy->ipc), name);
}
if(op_reply) {
remote_proxy_relay_response(proxy, op_reply, msg_id);
free_xml(op_reply);
}
}
} else {
crm_err("Unknown proxy operation: %s", op);
}
}
diff --git a/lib/pengine/container.c b/lib/pengine/container.c
index 1d8f303b08..5eab244f9d 100644
--- a/lib/pengine/container.c
+++ b/lib/pengine/container.c
@@ -1,1463 +1,1464 @@
/*
* Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <ctype.h>
#include <crm/pengine/rules.h>
#include <crm/pengine/status.h>
#include <crm/pengine/internal.h>
#include <unpack.h>
#include <crm/msg_xml.h>
#define VARIANT_CONTAINER 1
#include "./variant.h"
void tuple_free(container_grouping_t *tuple);
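/*!
 * \internal
 * \brief Get the next available address in an IPv4 range
 *
 * Increments the final octet of \p last_ip, rolling over past .254 into
 * .1 of the next third octet.
 *
 * \param[in] last_ip Last address that was assigned
 *
 * \return Newly allocated string with the next address, or NULL on parse
 *         failure or when the range is exhausted
 */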
static char *
next_ip(const char *last_ip)
{
unsigned int oct1 = 0;
unsigned int oct2 = 0;
unsigned int oct3 = 0;
unsigned int oct4 = 0;
int rc = sscanf(last_ip, "%u.%u.%u.%u", &oct1, &oct2, &oct3, &oct4);
if (rc != 4) {
        /* @TODO check for IPv6 */
return NULL;
} else if (oct3 > 253) {
return NULL;
} else if (oct4 > 253) {
++oct3;
oct4 = 1;
} else {
++oct4;
}
return crm_strdup_printf("%u.%u.%u.%u", oct1, oct2, oct3, oct4);
}
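/*!
 * \internal
 * \brief Assign the next IP in the bundle's range to a replica
 *
 * Also appends the container engine's host-mapping option for that address
 * to \p buffer ("--add-host" for Docker unless disabled, "--hosts-entry"
 * for rkt).
 *
 * \return Number of characters appended to \p buffer
 */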
static int
allocate_ip(container_variant_data_t *data, container_grouping_t *tuple, char *buffer, int max)
{
if(data->ip_range_start == NULL) {
return 0;
} else if(data->ip_last) {
tuple->ipaddr = next_ip(data->ip_last);
} else {
tuple->ipaddr = strdup(data->ip_range_start);
}
data->ip_last = tuple->ipaddr;
#if 0
return snprintf(buffer, max, " --add-host=%s-%d:%s --link %s-docker-%d:%s-link-%d",
data->prefix, tuple->offset, tuple->ipaddr,
data->prefix, tuple->offset, data->prefix, tuple->offset);
#else
if (data->type == PE_CONTAINER_TYPE_DOCKER) {
if (data->add_host == FALSE) {
return 0;
}
return snprintf(buffer, max, " --add-host=%s-%d:%s",
data->prefix, tuple->offset, tuple->ipaddr);
} else if (data->type == PE_CONTAINER_TYPE_RKT) {
return snprintf(buffer, max, " --hosts-entry=%s=%s-%d",
tuple->ipaddr, data->prefix, tuple->offset);
} else {
return 0;
}
#endif
}
static xmlNode *
create_resource(const char *name, const char *provider, const char *kind)
{
xmlNode *rsc = create_xml_node(NULL, XML_CIB_TAG_RESOURCE);
crm_xml_add(rsc, XML_ATTR_ID, name);
crm_xml_add(rsc, XML_AGENT_ATTR_CLASS, PCMK_RESOURCE_CLASS_OCF);
crm_xml_add(rsc, XML_AGENT_ATTR_PROVIDER, provider);
crm_xml_add(rsc, XML_ATTR_TYPE, kind);
return rsc;
}
/*!
* \internal
 * \brief Check whether the cluster can manage a resource inside a container
*
* \param[in] data Container variant data
*
* \return TRUE if networking configuration is acceptable, FALSE otherwise
*
* \note The resource is manageable if an IP range or control port has been
* specified. If a control port is used without an IP range, replicas per
* host must be 1.
*/
static bool
valid_network(container_variant_data_t *data)
{
if(data->ip_range_start) {
return TRUE;
}
if(data->control_port) {
if(data->replicas_per_host > 1) {
pe_err("Specifying the 'control-port' for %s requires 'replicas-per-host=1'", data->prefix);
data->replicas_per_host = 1;
/* @TODO to be sure: clear_bit(rsc->flags, pe_rsc_unique); */
}
return TRUE;
}
return FALSE;
}
static bool
create_ip_resource(
resource_t *parent, container_variant_data_t *data, container_grouping_t *tuple,
pe_working_set_t * data_set)
{
if(data->ip_range_start) {
char *id = NULL;
xmlNode *xml_ip = NULL;
xmlNode *xml_obj = NULL;
id = crm_strdup_printf("%s-ip-%s", data->prefix, tuple->ipaddr);
crm_xml_sanitize_id(id);
xml_ip = create_resource(id, "heartbeat", "IPaddr2");
free(id);
xml_obj = create_xml_node(xml_ip, XML_TAG_ATTR_SETS);
crm_xml_set_id(xml_obj, "%s-attributes-%d", data->prefix, tuple->offset);
crm_create_nvpair_xml(xml_obj, NULL, "ip", tuple->ipaddr);
if(data->host_network) {
crm_create_nvpair_xml(xml_obj, NULL, "nic", data->host_network);
}
if(data->host_netmask) {
crm_create_nvpair_xml(xml_obj, NULL,
"cidr_netmask", data->host_netmask);
} else {
crm_create_nvpair_xml(xml_obj, NULL, "cidr_netmask", "32");
}
xml_obj = create_xml_node(xml_ip, "operations");
crm_create_op_xml(xml_obj, ID(xml_ip), "monitor", "60s", NULL);
// TODO: Other ops? Timeouts and intervals from underlying resource?
crm_log_xml_trace(xml_ip, "Container-ip");
if (common_unpack(xml_ip, &tuple->ip, parent, data_set) == false) {
return FALSE;
}
parent->children = g_list_append(parent->children, tuple->ip);
}
return TRUE;
}
static bool
create_docker_resource(
resource_t *parent, container_variant_data_t *data, container_grouping_t *tuple,
pe_working_set_t * data_set)
{
int offset = 0, max = 4096;
char *buffer = calloc(1, max+1);
int doffset = 0, dmax = 1024;
char *dbuffer = calloc(1, dmax+1);
char *id = NULL;
xmlNode *xml_docker = NULL;
xmlNode *xml_obj = NULL;
id = crm_strdup_printf("%s-docker-%d", data->prefix, tuple->offset);
crm_xml_sanitize_id(id);
xml_docker = create_resource(id, "heartbeat", "docker");
free(id);
xml_obj = create_xml_node(xml_docker, XML_TAG_ATTR_SETS);
crm_xml_set_id(xml_obj, "%s-attributes-%d", data->prefix, tuple->offset);
crm_create_nvpair_xml(xml_obj, NULL, "image", data->image);
crm_create_nvpair_xml(xml_obj, NULL, "allow_pull", XML_BOOLEAN_TRUE);
crm_create_nvpair_xml(xml_obj, NULL, "force_kill", XML_BOOLEAN_FALSE);
crm_create_nvpair_xml(xml_obj, NULL, "reuse", XML_BOOLEAN_FALSE);
offset += snprintf(buffer+offset, max-offset, " --restart=no");
/* Set a container hostname only if we have an IP to map it to.
* The user can set -h or --uts=host themselves if they want a nicer
     * name for logs, but this keeps applications happy that need their
* hostname to match the IP they bind to.
*/
if (data->ip_range_start != NULL) {
offset += snprintf(buffer+offset, max-offset, " -h %s-%d",
data->prefix, tuple->offset);
}
offset += snprintf(buffer+offset, max-offset, " -e PCMK_stderr=1");
if(data->docker_network) {
// offset += snprintf(buffer+offset, max-offset, " --link-local-ip=%s", tuple->ipaddr);
offset += snprintf(buffer+offset, max-offset, " --net=%s", data->docker_network);
}
if(data->control_port) {
offset += snprintf(buffer+offset, max-offset, " -e PCMK_remote_port=%s", data->control_port);
} else {
offset += snprintf(buffer+offset, max-offset, " -e PCMK_remote_port=%d", DEFAULT_REMOTE_PORT);
}
for(GListPtr pIter = data->mounts; pIter != NULL; pIter = pIter->next) {
container_mount_t *mount = pIter->data;
if(mount->flags) {
char *source = crm_strdup_printf(
"%s/%s-%d", mount->source, data->prefix, tuple->offset);
if(doffset > 0) {
doffset += snprintf(dbuffer+doffset, dmax-doffset, ",");
}
doffset += snprintf(dbuffer+doffset, dmax-doffset, "%s", source);
offset += snprintf(buffer+offset, max-offset, " -v %s:%s", source, mount->target);
free(source);
} else {
offset += snprintf(buffer+offset, max-offset, " -v %s:%s", mount->source, mount->target);
}
if(mount->options) {
offset += snprintf(buffer+offset, max-offset, ":%s", mount->options);
}
}
for(GListPtr pIter = data->ports; pIter != NULL; pIter = pIter->next) {
container_port_t *port = pIter->data;
if(tuple->ipaddr) {
offset += snprintf(buffer+offset, max-offset, " -p %s:%s:%s",
tuple->ipaddr, port->source, port->target);
} else if(safe_str_neq(data->docker_network, "host")) {
// No need to do port mapping if net=host
offset += snprintf(buffer+offset, max-offset, " -p %s:%s", port->source, port->target);
}
}
if(data->docker_run_options) {
offset += snprintf(buffer+offset, max-offset, " %s", data->docker_run_options);
}
if(data->docker_host_options) {
offset += snprintf(buffer+offset, max-offset, " %s", data->docker_host_options);
}
crm_create_nvpair_xml(xml_obj, NULL, "run_opts", buffer);
free(buffer);
crm_create_nvpair_xml(xml_obj, NULL, "mount_points", dbuffer);
free(dbuffer);
if(tuple->child) {
if(data->docker_run_command) {
crm_create_nvpair_xml(xml_obj, NULL,
"run_cmd", data->docker_run_command);
} else {
crm_create_nvpair_xml(xml_obj, NULL,
- "run_cmd", SBIN_DIR "/pacemaker_remoted");
+ "run_cmd", SBIN_DIR "/pacemaker-remoted");
}
/* TODO: Allow users to specify their own?
*
         * We just want to know if the container is alive; we'll
         * monitor the child independently.
*/
crm_create_nvpair_xml(xml_obj, NULL, "monitor_cmd", "/bin/true");
/* } else if(child && data->untrusted) {
* Support this use-case?
*
* The ability to have resources started/stopped by us, but
* unable to set attributes, etc.
*
     * Arguably it would be better to control this API access with ACLs,
     * as for "normal" remote nodes
*
* crm_create_nvpair_xml(xml_obj, NULL,
* "run_cmd",
* "/usr/libexec/pacemaker/pacemaker-execd");
* crm_create_nvpair_xml(xml_obj, NULL, "monitor_cmd",
* "/usr/libexec/pacemaker/lrmd_internal_ctl -c poke");
*/
} else {
if(data->docker_run_command) {
crm_create_nvpair_xml(xml_obj, NULL,
"run_cmd", data->docker_run_command);
}
/* TODO: Allow users to specify their own?
*
* We don't know what's in the container, so we just want
* to know if it is alive
*/
crm_create_nvpair_xml(xml_obj, NULL, "monitor_cmd", "/bin/true");
}
xml_obj = create_xml_node(xml_docker, "operations");
crm_create_op_xml(xml_obj, ID(xml_docker), "monitor", "60s", NULL);
// TODO: Other ops? Timeouts and intervals from underlying resource?
crm_log_xml_trace(xml_docker, "Container-docker");
if (common_unpack(xml_docker, &tuple->docker, parent, data_set) == FALSE) {
return FALSE;
}
parent->children = g_list_append(parent->children, tuple->docker);
return TRUE;
}
static bool
create_rkt_resource(
resource_t *parent, container_variant_data_t *data, container_grouping_t *tuple,
pe_working_set_t * data_set)
{
int offset = 0, max = 4096;
char *buffer = calloc(1, max+1);
int doffset = 0, dmax = 1024;
char *dbuffer = calloc(1, dmax+1);
char *id = NULL;
xmlNode *xml_docker = NULL;
xmlNode *xml_obj = NULL;
int volid = 0;
id = crm_strdup_printf("%s-rkt-%d", data->prefix, tuple->offset);
crm_xml_sanitize_id(id);
xml_docker = create_resource(id, "heartbeat", "rkt");
free(id);
xml_obj = create_xml_node(xml_docker, XML_TAG_ATTR_SETS);
crm_xml_set_id(xml_obj, "%s-attributes-%d", data->prefix, tuple->offset);
crm_create_nvpair_xml(xml_obj, NULL, "image", data->image);
crm_create_nvpair_xml(xml_obj, NULL, "allow_pull", "true");
crm_create_nvpair_xml(xml_obj, NULL, "force_kill", "false");
crm_create_nvpair_xml(xml_obj, NULL, "reuse", "false");
/* Set a container hostname only if we have an IP to map it to.
* The user can set -h or --uts=host themselves if they want a nicer
     * name for logs, but this keeps applications happy that need their
* hostname to match the IP they bind to.
*/
if (data->ip_range_start != NULL) {
offset += snprintf(buffer+offset, max-offset, " --hostname=%s-%d",
data->prefix, tuple->offset);
}
offset += snprintf(buffer+offset, max-offset, " --environment=PCMK_stderr=1");
if(data->docker_network) {
// offset += snprintf(buffer+offset, max-offset, " --link-local-ip=%s", tuple->ipaddr);
offset += snprintf(buffer+offset, max-offset, " --net=%s", data->docker_network);
}
if(data->control_port) {
offset += snprintf(buffer+offset, max-offset, " --environment=PCMK_remote_port=%s", data->control_port);
} else {
offset += snprintf(buffer+offset, max-offset, " --environment=PCMK_remote_port=%d", DEFAULT_REMOTE_PORT);
}
for(GListPtr pIter = data->mounts; pIter != NULL; pIter = pIter->next) {
container_mount_t *mount = pIter->data;
if(mount->flags) {
char *source = crm_strdup_printf(
"%s/%s-%d", mount->source, data->prefix, tuple->offset);
if(doffset > 0) {
doffset += snprintf(dbuffer+doffset, dmax-doffset, ",");
}
doffset += snprintf(dbuffer+doffset, dmax-doffset, "%s", source);
offset += snprintf(buffer+offset, max-offset, " --volume vol%d,kind=host,source=%s", volid, source);
if(mount->options) {
offset += snprintf(buffer+offset, max-offset, ",%s", mount->options);
}
offset += snprintf(buffer+offset, max-offset, " --mount volume=vol%d,target=%s", volid, mount->target);
free(source);
} else {
offset += snprintf(buffer+offset, max-offset, " --volume vol%d,kind=host,source=%s", volid, mount->source);
if(mount->options) {
offset += snprintf(buffer+offset, max-offset, ",%s", mount->options);
}
offset += snprintf(buffer+offset, max-offset, " --mount volume=vol%d,target=%s", volid, mount->target);
}
volid++;
}
for(GListPtr pIter = data->ports; pIter != NULL; pIter = pIter->next) {
container_port_t *port = pIter->data;
if(tuple->ipaddr) {
offset += snprintf(buffer+offset, max-offset, " --port=%s:%s:%s",
port->target, tuple->ipaddr, port->source);
} else {
offset += snprintf(buffer+offset, max-offset, " --port=%s:%s", port->target, port->source);
}
}
if(data->docker_run_options) {
offset += snprintf(buffer+offset, max-offset, " %s", data->docker_run_options);
}
if(data->docker_host_options) {
offset += snprintf(buffer+offset, max-offset, " %s", data->docker_host_options);
}
crm_create_nvpair_xml(xml_obj, NULL, "run_opts", buffer);
free(buffer);
crm_create_nvpair_xml(xml_obj, NULL, "mount_points", dbuffer);
free(dbuffer);
if(tuple->child) {
if(data->docker_run_command) {
crm_create_nvpair_xml(xml_obj, NULL, "run_cmd", data->docker_run_command);
} else {
- crm_create_nvpair_xml(xml_obj, NULL, "run_cmd", SBIN_DIR"/pacemaker_remoted");
+ crm_create_nvpair_xml(xml_obj, NULL, "run_cmd",
+ SBIN_DIR "/pacemaker-remoted");
}
/* TODO: Allow users to specify their own?
*
         * We just want to know if the container is alive; we'll
         * monitor the child independently.
*/
crm_create_nvpair_xml(xml_obj, NULL, "monitor_cmd", "/bin/true");
/* } else if(child && data->untrusted) {
* Support this use-case?
*
* The ability to have resources started/stopped by us, but
* unable to set attributes, etc.
*
     * Arguably it would be better to control this API access with ACLs,
     * as for "normal" remote nodes
*
* crm_create_nvpair_xml(xml_obj, NULL,
* "run_cmd",
* "/usr/libexec/pacemaker/pacemaker-execd");
* crm_create_nvpair_xml(xml_obj, NULL, "monitor_cmd",
* "/usr/libexec/pacemaker/lrmd_internal_ctl -c poke");
*/
} else {
if(data->docker_run_command) {
crm_create_nvpair_xml(xml_obj, NULL, "run_cmd",
data->docker_run_command);
}
/* TODO: Allow users to specify their own?
*
* We don't know what's in the container, so we just want
* to know if it is alive
*/
crm_create_nvpair_xml(xml_obj, NULL, "monitor_cmd", "/bin/true");
}
xml_obj = create_xml_node(xml_docker, "operations");
crm_create_op_xml(xml_obj, ID(xml_docker), "monitor", "60s", NULL);
// TODO: Other ops? Timeouts and intervals from underlying resource?
crm_log_xml_trace(xml_docker, "Container-rkt");
if (common_unpack(xml_docker, &tuple->docker, parent, data_set) == FALSE) {
return FALSE;
}
parent->children = g_list_append(parent->children, tuple->docker);
return TRUE;
}
/*!
* \brief Ban a node from a resource's (and its children's) allowed nodes list
*
* \param[in,out] rsc Resource to modify
* \param[in] uname Name of node to ban
*/
static void
disallow_node(resource_t *rsc, const char *uname)
{
gpointer match = g_hash_table_lookup(rsc->allowed_nodes, uname);
if (match) {
((pe_node_t *) match)->weight = -INFINITY;
((pe_node_t *) match)->rsc_discover_mode = pe_discover_never;
}
if (rsc->children) {
GListPtr child;
for (child = rsc->children; child != NULL; child = child->next) {
disallow_node((resource_t *) (child->data), uname);
}
}
}
static bool
create_remote_resource(
resource_t *parent, container_variant_data_t *data, container_grouping_t *tuple,
pe_working_set_t * data_set)
{
if (tuple->child && valid_network(data)) {
GHashTableIter gIter;
GListPtr rsc_iter = NULL;
node_t *node = NULL;
xmlNode *xml_remote = NULL;
char *id = crm_strdup_printf("%s-%d", data->prefix, tuple->offset);
char *port_s = NULL;
const char *uname = NULL;
const char *connect_name = NULL;
if (remote_id_conflict(id, data_set)) {
free(id);
// The biggest hammer we have
id = crm_strdup_printf("pcmk-internal-%s-remote-%d", tuple->child->id, tuple->offset);
CRM_ASSERT(remote_id_conflict(id, data_set) == FALSE);
}
        /* REMOTE_CONTAINER_HACK: "#uname" is a magic string we use as the
         * server name when the connection does not have its own IP, in order
         * to support nested remotes (i.e. a bundle running on a remote node).
*/
connect_name = (tuple->ipaddr? tuple->ipaddr : "#uname");
if (data->control_port == NULL) {
port_s = crm_itoa(DEFAULT_REMOTE_PORT);
}
/* This sets tuple->docker as tuple->remote's container, which is
* similar to what happens with guest nodes. This is how the PE knows
* that the bundle node is fenced by recovering docker, and that
* remote should be ordered relative to docker.
*/
xml_remote = pe_create_remote_xml(NULL, id, tuple->docker->id,
NULL, NULL, NULL,
connect_name, (data->control_port?
data->control_port : port_s));
free(port_s);
/* Abandon our created ID, and pull the copy from the XML, because we
* need something that will get freed during data set cleanup to use as
* the node ID and uname.
*/
free(id);
id = NULL;
uname = ID(xml_remote);
/* Ensure a node has been created for the guest (it may have already
* been, if it has a permanent node attribute), and ensure its weight is
* -INFINITY so no other resources can run on it.
*/
node = pe_find_node(data_set->nodes, uname);
if (node == NULL) {
node = pe_create_node(uname, uname, "remote", "-INFINITY",
data_set);
} else {
node->weight = -INFINITY;
}
node->rsc_discover_mode = pe_discover_never;
/* unpack_remote_nodes() ensures that each remote node and guest node
* has a pe_node_t entry. Ideally, it would do the same for bundle nodes.
* Unfortunately, a bundle has to be mostly unpacked before it's obvious
* what nodes will be needed, so we do it just above.
*
* Worse, that means that the node may have been utilized while
* unpacking other resources, without our weight correction. The most
* likely place for this to happen is when common_unpack() calls
* resource_location() to set a default score in symmetric clusters.
* This adds a node *copy* to each resource's allowed nodes, and these
* copies will have the wrong weight.
*
* As a hacky workaround, fix those copies here.
*
* @TODO Possible alternative: ensure bundles are unpacked before other
* resources, so the weight is correct before any copies are made.
*/
for (rsc_iter = data_set->resources; rsc_iter; rsc_iter = rsc_iter->next) {
disallow_node((resource_t *) (rsc_iter->data), uname);
}
tuple->node = node_copy(node);
tuple->node->weight = 500;
tuple->node->rsc_discover_mode = pe_discover_exclusive;
/* Ensure the node shows up as allowed and with the correct discovery set */
g_hash_table_destroy(tuple->child->allowed_nodes);
tuple->child->allowed_nodes = g_hash_table_new_full(crm_str_hash,
g_str_equal, NULL,
free);
g_hash_table_insert(tuple->child->allowed_nodes, (gpointer) tuple->node->details->id, node_copy(tuple->node));
{
node_t *copy = node_copy(tuple->node);
copy->weight = -INFINITY;
g_hash_table_insert(tuple->child->parent->allowed_nodes, (gpointer) tuple->node->details->id, copy);
}
crm_log_xml_trace(xml_remote, "Container-remote");
if (common_unpack(xml_remote, &tuple->remote, parent, data_set) == FALSE) {
return FALSE;
}
g_hash_table_iter_init(&gIter, tuple->remote->allowed_nodes);
while (g_hash_table_iter_next(&gIter, NULL, (void **)&node)) {
if(is_remote_node(node)) {
                /* Remote resources can run only on 'normal' cluster nodes */
node->weight = -INFINITY;
}
}
tuple->node->details->remote_rsc = tuple->remote;
tuple->remote->container = tuple->docker; // Ensures is_container_remote_node() functions correctly immediately
/* A bundle's #kind is closer to "container" (guest node) than the
* "remote" set by pe_create_node().
*/
g_hash_table_insert(tuple->node->details->attrs,
strdup(CRM_ATTR_KIND), strdup("container"));
/* One effect of this is that setup_container() will add
* tuple->remote to tuple->docker's fillers, which will make
* rsc_contains_remote_node() true for tuple->docker.
*
* tuple->child does NOT get added to tuple->docker's fillers.
* The only noticeable effect if it did would be for its fail count to
* be taken into account when checking tuple->docker's migration
* threshold.
*/
parent->children = g_list_append(parent->children, tuple->remote);
}
return TRUE;
}
static bool
create_container(
resource_t *parent, container_variant_data_t *data, container_grouping_t *tuple,
pe_working_set_t * data_set)
{
if (data->type == PE_CONTAINER_TYPE_DOCKER &&
create_docker_resource(parent, data, tuple, data_set) == FALSE) {
return FALSE;
}
if (data->type == PE_CONTAINER_TYPE_RKT &&
create_rkt_resource(parent, data, tuple, data_set) == FALSE) {
return FALSE;
}
if(create_ip_resource(parent, data, tuple, data_set) == FALSE) {
return FALSE;
}
if(create_remote_resource(parent, data, tuple, data_set) == FALSE) {
return FALSE;
}
if(tuple->child && tuple->ipaddr) {
add_hash_param(tuple->child->meta, "external-ip", tuple->ipaddr);
}
if(tuple->remote) {
/*
* Allow the remote connection resource to be allocated to a
* different node than the one on which the docker container
* is active.
*
- * Makes it possible to have remote nodes, running docker
- * containers with pacemaker_remoted inside in order to start
+ * This makes it possible to have Pacemaker Remote nodes running
+ * containers with pacemaker-remoted inside in order to start
* services inside those containers.
*/
set_bit(tuple->remote->flags, pe_rsc_allow_remote_remotes);
}
return TRUE;
}
static void
mount_add(container_variant_data_t *container_data, const char *source,
const char *target, const char *options, int flags)
{
container_mount_t *mount = calloc(1, sizeof(container_mount_t));
mount->source = strdup(source);
mount->target = strdup(target);
if (options) {
mount->options = strdup(options);
}
mount->flags = flags;
container_data->mounts = g_list_append(container_data->mounts, mount);
}
static void mount_free(container_mount_t *mount)
{
free(mount->source);
free(mount->target);
free(mount->options);
free(mount);
}
static void port_free(container_port_t *port)
{
free(port->source);
free(port->target);
free(port);
}
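/*!
 * \internal
 * \brief Find the bundle replica that a remote connection resource is part of
 *
 * \param[in] remote Remote connection resource to search for
 *
 * \return Replica tuple whose remote matches \p remote, or NULL if none
 */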
static container_grouping_t *
tuple_for_remote(resource_t *remote)
{
resource_t *top = remote;
container_variant_data_t *container_data = NULL;
if (top == NULL) {
return NULL;
}
while (top->parent != NULL) {
top = top->parent;
}
get_container_variant_data(container_data, top);
for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) {
container_grouping_t *tuple = (container_grouping_t *)gIter->data;
if(tuple->remote == remote) {
return tuple;
}
}
CRM_LOG_ASSERT(FALSE);
return NULL;
}
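/*!
 * \internal
 * \brief Check whether a resource is a remote connection needing address fixup
 *
 * \param[in] rsc Resource to check
 *
 * \return TRUE if \p rsc is an ocf:pacemaker:remote resource whose "addr"
 *         parameter is the "#uname" placeholder, FALSE otherwise
 */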
bool
container_fix_remote_addr(resource_t *rsc)
{
const char *name;
const char *value;
const char *attr_list[] = {
XML_ATTR_TYPE,
XML_AGENT_ATTR_CLASS,
XML_AGENT_ATTR_PROVIDER
};
const char *value_list[] = {
"remote",
PCMK_RESOURCE_CLASS_OCF,
"pacemaker"
};
if(rsc == NULL) {
return FALSE;
}
name = "addr";
value = g_hash_table_lookup(rsc->parameters, name);
if (safe_str_eq(value, "#uname") == FALSE) {
return FALSE;
}
for (int lpc = 0; lpc < DIMOF(attr_list); lpc++) {
name = attr_list[lpc];
value = crm_element_value(rsc->xml, attr_list[lpc]);
if (safe_str_eq(value, value_list[lpc]) == FALSE) {
return FALSE;
}
}
return TRUE;
}
const char *
container_fix_remote_addr_in(resource_t *rsc, xmlNode *xml, const char *field)
{
// REMOTE_CONTAINER_HACK: Allow remote nodes that start containers with pacemaker remote inside
pe_node_t *node = NULL;
container_grouping_t *tuple = NULL;
if(container_fix_remote_addr(rsc) == FALSE) {
return NULL;
}
tuple = tuple_for_remote(rsc);
if(tuple == NULL) {
return NULL;
}
node = tuple->docker->allocated_to;
if(node == NULL && tuple->docker->running_on) {
/* If it won't be running anywhere after the
* transition, go with where it's running now.
*/
node = tuple->docker->running_on->data;
}
if(node == NULL) {
crm_trace("Cannot fix address for %s", tuple->remote->id);
return NULL;
}
crm_trace("Fixing addr for %s on %s", rsc->id, node->details->uname);
if(xml != NULL && field != NULL) {
crm_xml_add(xml, field, node->details->uname);
}
return node->details->uname;
}
gboolean
container_unpack(resource_t * rsc, pe_working_set_t * data_set)
{
const char *value = NULL;
xmlNode *xml_obj = NULL;
xmlNode *xml_resource = NULL;
container_variant_data_t *container_data = NULL;
CRM_ASSERT(rsc != NULL);
pe_rsc_trace(rsc, "Processing resource %s...", rsc->id);
container_data = calloc(1, sizeof(container_variant_data_t));
rsc->variant_opaque = container_data;
container_data->prefix = strdup(rsc->id);
xml_obj = first_named_child(rsc->xml, "docker");
if (xml_obj != NULL) {
container_data->type = PE_CONTAINER_TYPE_DOCKER;
} else {
xml_obj = first_named_child(rsc->xml, "rkt");
if (xml_obj != NULL) {
container_data->type = PE_CONTAINER_TYPE_RKT;
} else {
return FALSE;
}
}
value = crm_element_value(xml_obj, XML_RSC_ATTR_PROMOTED_MAX);
if (value == NULL) {
// @COMPAT deprecated since 2.0.0
value = crm_element_value(xml_obj, "masters");
}
container_data->promoted_max = crm_parse_int(value, "0");
if (container_data->promoted_max < 0) {
pe_err("%s for %s must be nonnegative integer, using 0",
XML_RSC_ATTR_PROMOTED_MAX, rsc->id);
container_data->promoted_max = 0;
}
value = crm_element_value(xml_obj, "replicas");
if ((value == NULL) && container_data->promoted_max) {
container_data->replicas = container_data->promoted_max;
} else {
container_data->replicas = crm_parse_int(value, "1");
}
if (container_data->replicas < 1) {
pe_err("'replicas' for %s must be positive integer, using 1", rsc->id);
container_data->replicas = 1;
}
/*
* Communication between containers on the same host via the
* floating IPs only works if docker is started with:
* --userland-proxy=false --ip-masq=false
*/
value = crm_element_value(xml_obj, "replicas-per-host");
container_data->replicas_per_host = crm_parse_int(value, "1");
if (container_data->replicas_per_host < 1) {
pe_err("'replicas-per-host' for %s must be positive integer, using 1",
rsc->id);
container_data->replicas_per_host = 1;
}
if (container_data->replicas_per_host == 1) {
clear_bit(rsc->flags, pe_rsc_unique);
}
container_data->docker_run_command = crm_element_value_copy(xml_obj, "run-command");
container_data->docker_run_options = crm_element_value_copy(xml_obj, "options");
container_data->image = crm_element_value_copy(xml_obj, "image");
container_data->docker_network = crm_element_value_copy(xml_obj, "network");
xml_obj = first_named_child(rsc->xml, "network");
if(xml_obj) {
container_data->ip_range_start = crm_element_value_copy(xml_obj, "ip-range-start");
container_data->host_netmask = crm_element_value_copy(xml_obj, "host-netmask");
container_data->host_network = crm_element_value_copy(xml_obj, "host-interface");
container_data->control_port = crm_element_value_copy(xml_obj, "control-port");
value = crm_element_value(xml_obj, "add-host");
if (check_boolean(value) == FALSE) {
container_data->add_host = TRUE;
} else {
crm_str_to_boolean(value, &container_data->add_host);
}
for (xmlNode *xml_child = __xml_first_child_element(xml_obj); xml_child != NULL;
xml_child = __xml_next_element(xml_child)) {
container_port_t *port = calloc(1, sizeof(container_port_t));
port->source = crm_element_value_copy(xml_child, "port");
if(port->source == NULL) {
port->source = crm_element_value_copy(xml_child, "range");
} else {
port->target = crm_element_value_copy(xml_child, "internal-port");
}
if(port->source != NULL && strlen(port->source) > 0) {
if(port->target == NULL) {
port->target = strdup(port->source);
}
container_data->ports = g_list_append(container_data->ports, port);
} else {
pe_err("Invalid port directive %s", ID(xml_child));
port_free(port);
}
}
}
xml_obj = first_named_child(rsc->xml, "storage");
for (xmlNode *xml_child = __xml_first_child_element(xml_obj); xml_child != NULL;
xml_child = __xml_next_element(xml_child)) {
const char *source = crm_element_value(xml_child, "source-dir");
const char *target = crm_element_value(xml_child, "target-dir");
const char *options = crm_element_value(xml_child, "options");
int flags = 0;
if (source == NULL) {
source = crm_element_value(xml_child, "source-dir-root");
flags = 1;
}
if (source && target) {
mount_add(container_data, source, target, options, flags);
} else {
pe_err("Invalid mount directive %s", ID(xml_child));
}
}
xml_obj = first_named_child(rsc->xml, "primitive");
if (xml_obj && valid_network(container_data)) {
char *value = NULL;
xmlNode *xml_set = NULL;
xml_resource = create_xml_node(NULL, XML_CIB_TAG_INCARNATION);
/* @COMPAT We no longer use the <master> tag, but we need to keep it as
* part of the resource name, so that bundles don't restart in a rolling
* upgrade. (It also avoids needing to change regression tests.)
*/
crm_xml_set_id(xml_resource, "%s-%s", container_data->prefix,
(container_data->promoted_max? "master"
: (const char *)xml_resource->name));
xml_set = create_xml_node(xml_resource, XML_TAG_META_SETS);
crm_xml_set_id(xml_set, "%s-%s-meta", container_data->prefix, xml_resource->name);
crm_create_nvpair_xml(xml_set, NULL,
XML_RSC_ATTR_ORDERED, XML_BOOLEAN_TRUE);
value = crm_itoa(container_data->replicas);
crm_create_nvpair_xml(xml_set, NULL,
XML_RSC_ATTR_INCARNATION_MAX, value);
free(value);
value = crm_itoa(container_data->replicas_per_host);
crm_create_nvpair_xml(xml_set, NULL,
XML_RSC_ATTR_INCARNATION_NODEMAX, value);
free(value);
crm_create_nvpair_xml(xml_set, NULL, XML_RSC_ATTR_UNIQUE,
(container_data->replicas_per_host > 1)?
XML_BOOLEAN_TRUE : XML_BOOLEAN_FALSE);
if (container_data->promoted_max) {
crm_create_nvpair_xml(xml_set, NULL,
XML_RSC_ATTR_PROMOTABLE, XML_BOOLEAN_TRUE);
value = crm_itoa(container_data->promoted_max);
crm_create_nvpair_xml(xml_set, NULL,
XML_RSC_ATTR_PROMOTED_MAX, value);
free(value);
}
//crm_xml_add(xml_obj, XML_ATTR_ID, container_data->prefix);
add_node_copy(xml_resource, xml_obj);
} else if(xml_obj) {
pe_err("Cannot control %s inside %s without either ip-range-start or control-port",
rsc->id, ID(xml_obj));
return FALSE;
}
if(xml_resource) {
int lpc = 0;
GListPtr childIter = NULL;
resource_t *new_rsc = NULL;
container_port_t *port = NULL;
int offset = 0, max = 1024;
char *buffer = NULL;
if (common_unpack(xml_resource, &new_rsc, rsc, data_set) == FALSE) {
pe_err("Failed unpacking resource %s", ID(rsc->xml));
if (new_rsc != NULL && new_rsc->fns != NULL) {
new_rsc->fns->free(new_rsc);
}
return FALSE;
}
container_data->child = new_rsc;
/* Currently, we always map the default authentication key location
* into the same location inside the container.
*
* Ideally, we would respect the host's PCMK_authkey_location, but:
* - it may be different on different nodes;
* - the actual connection will do extra checking to make sure the key
     *   file exists and is readable, which we can't do here on the DC
* - tools such as crm_resource and crm_simulate may not have the same
* environment variables as the cluster, causing operation digests to
* differ
*
* Always using the default location inside the container is fine,
* because we control the pacemaker_remote environment, and it avoids
* having to pass another environment variable to the container.
*
* @TODO A better solution may be to have only pacemaker_remote use the
* environment variable, and have the cluster nodes use a new
* cluster option for key location. This would introduce the limitation
* of the location being the same on all cluster nodes, but that's
* reasonable.
*/
mount_add(container_data, DEFAULT_REMOTE_KEY_LOCATION,
DEFAULT_REMOTE_KEY_LOCATION, NULL, 0);
mount_add(container_data, CRM_BUNDLE_DIR, "/var/log", NULL, 1);
port = calloc(1, sizeof(container_port_t));
if(container_data->control_port) {
port->source = strdup(container_data->control_port);
} else {
/* If we wanted to respect PCMK_remote_port, we could use
* crm_default_remote_port() here and elsewhere in this file instead
* of DEFAULT_REMOTE_PORT.
*
* However, it gains nothing, since we control both the container
* environment and the connection resource parameters, and the user
* can use a different port if desired by setting control-port.
*/
port->source = crm_itoa(DEFAULT_REMOTE_PORT);
}
port->target = strdup(port->source);
container_data->ports = g_list_append(container_data->ports, port);
buffer = calloc(1, max+1);
for(childIter = container_data->child->children; childIter != NULL; childIter = childIter->next) {
container_grouping_t *tuple = calloc(1, sizeof(container_grouping_t));
tuple->child = childIter->data;
tuple->child->exclusive_discover = TRUE;
tuple->offset = lpc++;
// Ensure the child's notify gets set based on the underlying primitive's value
if(is_set(tuple->child->flags, pe_rsc_notify)) {
set_bit(container_data->child->flags, pe_rsc_notify);
}
offset += allocate_ip(container_data, tuple, buffer+offset, max-offset);
container_data->tuples = g_list_append(container_data->tuples, tuple);
container_data->attribute_target = g_hash_table_lookup(tuple->child->meta, XML_RSC_ATTR_TARGET);
}
container_data->docker_host_options = buffer;
if(container_data->attribute_target) {
g_hash_table_replace(rsc->meta, strdup(XML_RSC_ATTR_TARGET), strdup(container_data->attribute_target));
g_hash_table_replace(container_data->child->meta, strdup(XML_RSC_ATTR_TARGET), strdup(container_data->attribute_target));
}
} else {
// Just a naked container, no pacemaker-remote
int offset = 0, max = 1024;
char *buffer = calloc(1, max+1);
for(int lpc = 0; lpc < container_data->replicas; lpc++) {
container_grouping_t *tuple = calloc(1, sizeof(container_grouping_t));
tuple->offset = lpc;
offset += allocate_ip(container_data, tuple, buffer+offset, max-offset);
container_data->tuples = g_list_append(container_data->tuples, tuple);
}
container_data->docker_host_options = buffer;
}
for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) {
container_grouping_t *tuple = (container_grouping_t *)gIter->data;
if (create_container(rsc, container_data, tuple, data_set) == FALSE) {
pe_err("Failed unpacking resource %s", rsc->id);
rsc->fns->free(rsc);
return FALSE;
}
}
if(container_data->child) {
rsc->children = g_list_append(rsc->children, container_data->child);
}
return TRUE;
}
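/*!
 * \internal
 * \brief Check whether a single replica member is active
 *
 * \param[in] rsc Resource to check (may be NULL)
 * \param[in] all Whether all instances must be active
 *
 * \return TRUE or FALSE if the answer can be determined from \p rsc alone,
 *         or -1 if the caller must keep checking other members
 */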
static int
tuple_rsc_active(resource_t *rsc, gboolean all)
{
if (rsc) {
gboolean child_active = rsc->fns->active(rsc, all);
if (child_active && !all) {
return TRUE;
} else if (!child_active && all) {
return FALSE;
}
}
return -1;
}
gboolean
container_active(resource_t * rsc, gboolean all)
{
container_variant_data_t *container_data = NULL;
GListPtr iter = NULL;
get_container_variant_data(container_data, rsc);
for (iter = container_data->tuples; iter != NULL; iter = iter->next) {
container_grouping_t *tuple = (container_grouping_t *)(iter->data);
int rsc_active;
rsc_active = tuple_rsc_active(tuple->ip, all);
if (rsc_active >= 0) {
return (gboolean) rsc_active;
}
rsc_active = tuple_rsc_active(tuple->child, all);
if (rsc_active >= 0) {
return (gboolean) rsc_active;
}
rsc_active = tuple_rsc_active(tuple->docker, all);
if (rsc_active >= 0) {
return (gboolean) rsc_active;
}
rsc_active = tuple_rsc_active(tuple->remote, all);
if (rsc_active >= 0) {
return (gboolean) rsc_active;
}
}
/* If "all" is TRUE, we've already checked that no resources were inactive,
* so return TRUE; if "all" is FALSE, we didn't find any active resources,
* so return FALSE.
*/
return all;
}
/*!
* \internal
* \brief Find the container child corresponding to a given node
*
* \param[in] bundle Top-level bundle resource
* \param[in] node Node to search for
*
* \return Container child if found, NULL otherwise
*/
resource_t *
find_container_child(const resource_t *bundle, const node_t *node)
{
container_variant_data_t *container_data = NULL;
CRM_ASSERT(bundle && node);
get_container_variant_data(container_data, bundle);
for (GListPtr gIter = container_data->tuples; gIter != NULL;
gIter = gIter->next) {
container_grouping_t *tuple = (container_grouping_t *)gIter->data;
CRM_ASSERT(tuple && tuple->node);
if (tuple->node->details == node->details) {
return tuple->child;
}
}
return NULL;
}
static void
print_rsc_in_list(resource_t *rsc, const char *pre_text, long options,
void *print_data)
{
if (rsc != NULL) {
if (options & pe_print_html) {
status_print("<li>");
}
rsc->fns->print(rsc, pre_text, options, print_data);
if (options & pe_print_html) {
status_print("</li>\n");
}
}
}
static const char*
container_type_as_string(enum container_type t)
{
if (t == PE_CONTAINER_TYPE_DOCKER) {
return PE_CONTAINER_TYPE_DOCKER_S;
} else if (t == PE_CONTAINER_TYPE_RKT) {
return PE_CONTAINER_TYPE_RKT_S;
} else {
return PE_CONTAINER_TYPE_UNKNOWN_S;
}
}
static void
container_print_xml(resource_t * rsc, const char *pre_text, long options, void *print_data)
{
container_variant_data_t *container_data = NULL;
char *child_text = NULL;
CRM_CHECK(rsc != NULL, return);
if (pre_text == NULL) {
pre_text = "";
}
child_text = crm_concat(pre_text, " ", ' ');
get_container_variant_data(container_data, rsc);
status_print("%s<bundle ", pre_text);
status_print("id=\"%s\" ", rsc->id);
// Always lowercase the container technology type for use as XML value
status_print("type=\"");
for (const char *c = container_type_as_string(container_data->type);
*c; ++c) {
status_print("%c", tolower(*c));
}
status_print("\" ");
status_print("image=\"%s\" ", container_data->image);
status_print("unique=\"%s\" ", is_set(rsc->flags, pe_rsc_unique)? "true" : "false");
status_print("managed=\"%s\" ", is_set(rsc->flags, pe_rsc_managed) ? "true" : "false");
status_print("failed=\"%s\" ", is_set(rsc->flags, pe_rsc_failed) ? "true" : "false");
status_print(">\n");
for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) {
container_grouping_t *tuple = (container_grouping_t *)gIter->data;
CRM_ASSERT(tuple);
status_print("%s <replica id=\"%d\">\n", pre_text, tuple->offset);
print_rsc_in_list(tuple->ip, child_text, options, print_data);
print_rsc_in_list(tuple->child, child_text, options, print_data);
print_rsc_in_list(tuple->docker, child_text, options, print_data);
print_rsc_in_list(tuple->remote, child_text, options, print_data);
status_print("%s </replica>\n", pre_text);
}
status_print("%s</bundle>\n", pre_text);
free(child_text);
}
static void
tuple_print(container_grouping_t * tuple, const char *pre_text, long options, void *print_data)
{
node_t *node = NULL;
resource_t *rsc = tuple->child;
int offset = 0;
char buffer[LINE_MAX];
if(rsc == NULL) {
rsc = tuple->docker;
}
if(tuple->remote) {
offset += snprintf(buffer + offset, LINE_MAX - offset, "%s", rsc_printable_id(tuple->remote));
} else {
offset += snprintf(buffer + offset, LINE_MAX - offset, "%s", rsc_printable_id(tuple->docker));
}
if(tuple->ipaddr) {
offset += snprintf(buffer + offset, LINE_MAX - offset, " (%s)", tuple->ipaddr);
}
if (tuple->docker->running_on) {
node = tuple->docker->running_on->data;
}
common_print(rsc, pre_text, buffer, node, options, print_data);
}
void
container_print(resource_t * rsc, const char *pre_text, long options, void *print_data)
{
container_variant_data_t *container_data = NULL;
char *child_text = NULL;
CRM_CHECK(rsc != NULL, return);
if (options & pe_print_xml) {
container_print_xml(rsc, pre_text, options, print_data);
return;
}
get_container_variant_data(container_data, rsc);
if (pre_text == NULL) {
pre_text = " ";
}
status_print("%s%s container%s: %s [%s]%s%s\n",
pre_text, container_type_as_string(container_data->type),
container_data->replicas>1?" set":"", rsc->id, container_data->image,
is_set(rsc->flags, pe_rsc_unique) ? " (unique)" : "",
is_set(rsc->flags, pe_rsc_managed) ? "" : " (unmanaged)");
if (options & pe_print_html) {
status_print("<br />\n<ul>\n");
}
for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) {
container_grouping_t *tuple = (container_grouping_t *)gIter->data;
CRM_ASSERT(tuple);
if (options & pe_print_html) {
status_print("<li>");
}
if (is_set(options, pe_print_implicit)) {
child_text = crm_strdup_printf(" %s", pre_text);
if(g_list_length(container_data->tuples) > 1) {
status_print(" %sReplica[%d]\n", pre_text, tuple->offset);
}
if (options & pe_print_html) {
status_print("<br />\n<ul>\n");
}
print_rsc_in_list(tuple->ip, child_text, options, print_data);
print_rsc_in_list(tuple->docker, child_text, options, print_data);
print_rsc_in_list(tuple->remote, child_text, options, print_data);
print_rsc_in_list(tuple->child, child_text, options, print_data);
if (options & pe_print_html) {
status_print("</ul>\n");
}
} else {
child_text = crm_strdup_printf("%s ", pre_text);
tuple_print(tuple, child_text, options, print_data);
}
free(child_text);
if (options & pe_print_html) {
status_print("</li>\n");
}
}
if (options & pe_print_html) {
status_print("</ul>\n");
}
}
void
tuple_free(container_grouping_t *tuple)
{
if(tuple == NULL) {
return;
}
if(tuple->node) {
free(tuple->node);
tuple->node = NULL;
}
if(tuple->ip) {
free_xml(tuple->ip->xml);
tuple->ip->xml = NULL;
tuple->ip->fns->free(tuple->ip);
tuple->ip = NULL;
}
if(tuple->docker) {
free_xml(tuple->docker->xml);
tuple->docker->xml = NULL;
tuple->docker->fns->free(tuple->docker);
tuple->docker = NULL;
}
if(tuple->remote) {
free_xml(tuple->remote->xml);
tuple->remote->xml = NULL;
tuple->remote->fns->free(tuple->remote);
tuple->remote = NULL;
}
free(tuple->ipaddr);
free(tuple);
}
void
container_free(resource_t * rsc)
{
container_variant_data_t *container_data = NULL;
CRM_CHECK(rsc != NULL, return);
get_container_variant_data(container_data, rsc);
pe_rsc_trace(rsc, "Freeing %s", rsc->id);
free(container_data->prefix);
free(container_data->image);
free(container_data->control_port);
free(container_data->host_network);
free(container_data->host_netmask);
free(container_data->ip_range_start);
free(container_data->docker_network);
free(container_data->docker_run_options);
free(container_data->docker_run_command);
free(container_data->docker_host_options);
g_list_free_full(container_data->tuples, (GDestroyNotify)tuple_free);
g_list_free_full(container_data->mounts, (GDestroyNotify)mount_free);
g_list_free_full(container_data->ports, (GDestroyNotify)port_free);
g_list_free(rsc->children);
if(container_data->child) {
free_xml(container_data->child->xml);
container_data->child->xml = NULL;
container_data->child->fns->free(container_data->child);
}
common_free(rsc);
}
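/*!
 * \internal
 * \brief Get the role of a bundle
 *
 * Bundles do not have a role of their own, so this always reports
 * RSC_ROLE_UNKNOWN.
 */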
enum rsc_role_e
container_resource_state(const resource_t * rsc, gboolean current)
{
enum rsc_role_e container_role = RSC_ROLE_UNKNOWN;
return container_role;
}
/*!
* \brief Get the number of configured replicas in a bundle
*
* \param[in] rsc Bundle resource
*
* \return Number of configured replicas, or 0 on error
*/
int
pe_bundle_replicas(const resource_t *rsc)
{
if ((rsc == NULL) || (rsc->variant != pe_container)) {
return 0;
} else {
container_variant_data_t *container_data = NULL;
get_container_variant_data(container_data, rsc);
return container_data->replicas;
}
}
diff --git a/pacemaker.spec.in b/pacemaker.spec.in
index f09173177f..778628063c 100644
--- a/pacemaker.spec.in
+++ b/pacemaker.spec.in
@@ -1,771 +1,771 @@
# Globals and defines to control package behavior (configure these as desired)
## User and group to use for nonprivileged services
%global uname hacluster
%global gname haclient
## Where to install Pacemaker documentation
%global pcmk_docdir %{_docdir}/%{name}
## GitHub entity that distributes source (for ease of using a fork)
%global github_owner ClusterLabs
## Upstream pacemaker version, and its package version (specversion
## can be incremented to build packages reliably considered "newer"
## than previously built packages with the same pcmkversion)
%global pcmkversion 2.0.0
%global specversion 1
## Upstream commit (or git tag, such as "Pacemaker-" plus the
## {pcmkversion} macro for an official release) to use for this package
%global commit HEAD
## Since git v2.11, the extent of abbreviation is autoscaled by default
## (it used to be a constant 7), so we need to convey it for non-tags, too.
%global commit_abbrev 7
## Python major version to use (2, 3, or 0 for auto-detect)
%global python_major 0
# Define globals for convenient use later
## Workaround to use parentheses in other globals
%global lparen (
%global rparen )
## Short version of git commit
%define shortcommit %(c=%{commit}; case ${c} in
Pacemaker-*%{rparen} echo ${c:10};;
*%{rparen} echo ${c:0:%{commit_abbrev}};; esac)
## Whether this is a tagged release
%define tag_release %([ %{commit} != Pacemaker-%{shortcommit} ]; echo $?)
## Whether this is a release candidate (in case of a tagged release)
%define pre_release %([ "%{tag_release}" -eq 0 ] || {
case "%{shortcommit}" in *-rc[[:digit:]]*%{rparen} false;;
esac; }; echo $?)
## Heuristic used to infer bleeding-edge deployments that are
## less likely to have working versions of the documentation tools
%define bleeding %(test ! -e /etc/yum.repos.d/fedora-rawhide.repo; echo $?)
## Whether this platform defaults to using systemd as an init system
## (needs to be evaluated prior to BuildRequires being enumerated and
## installed as it's intended to conditionally select some of these, and
## for that there are only few indicators with varying reliability:
## - presence of systemd-defined macros (when building in a full-fledged
## environment, which is not the case with ordinary mock-based builds)
## - systemd-aware rpm as manifested with the presence of particular
## macro (rpm itself will trivially always be present when building)
## - existence of /usr/lib/os-release file, which is something heavily
## propagated by systemd project
## - when not good enough, there's always a possibility to check
## particular distro-specific macros (incl. version comparison)
%define systemd_native (%{?_unitdir:1}%{!?_unitdir:0}%{nil \
} || %{?__transaction_systemd_inhibit:1}%{!?__transaction_systemd_inhibit:0}%{nil \
} || %(test -f /usr/lib/os-release; test $? -ne 0; echo $?))
%if 0%{?fedora} > 20 || 0%{?rhel} > 7
%global gnutls_priorities @SYSTEM
%endif
# Python-related definitions
## Use Python 3 on certain platforms if major version not specified
%if %{?python_major} == 0
%if 0%{?fedora} > 26 || 0%{?rhel} > 7
%global python_major 3
%endif
%endif
## Turn off auto-compilation of Python files outside site-packages directory,
## so that the -libs-devel package is multilib-compliant (no *.py[co] files)
%global __os_install_post %(echo '%{__os_install_post}' | {
sed -e 's!/usr/lib[^[:space:]]*/brp-python-bytecompile[[:space:]].*$!!g'; })
## Values that differ by Python major version
%if 0%{?python_major} > 2
%global python_path /usr/bin/python%{?python3_pkgversion}%{!?python3_pkgversion:3}
%global python_pkg python3
%global python_min 3.2
%define py_site %{?python3_sitelib}%{!?python3_sitelib:%(
python3 -c 'from distutils.sysconfig import get_python_lib as gpl; print(gpl(1))' 2>/dev/null)}
%else
%if 0%{?python_major} > 1
%global python_path /usr/bin/python%{?python2_pkgversion}%{!?python2_pkgversion:2}
%global python_pkg python2
%global python_min 2.7
%define py_site %{?python2_sitelib}%{!?python2_sitelib:%(
python2 -c 'from distutils.sysconfig import get_python_lib as gpl; print(gpl(1))' 2>/dev/null)}
%else
%global python_min 2.7
%global python_pkg python
%define py_site %{?python_sitelib}%{!?python_sitelib:%(
python -c 'from distutils.sysconfig import get_python_lib as gpl; print(gpl(1))' 2>/dev/null)}
%endif
%endif
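## Illustrative result (assuming a distro that defines python3_sitelib): with
## python_major resolved to 3, py_site ends up as a path such as
## /usr/lib/python3.6/site-packages.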
# Definitions for backward compatibility with older RPM versions
## Ensure the license macro behaves consistently (older RPM will otherwise
## overwrite it once it encounters "License:"). Courtesy Jason Tibbitts:
## https://pkgs.fedoraproject.org/cgit/rpms/epel-rpm-macros.git/tree/macros.zzz-epel?h=el6&id=e1adcb77
%if !%{defined _licensedir}
%define description %{lua:
rpm.define("license %doc")
print("%description")
}
%endif
# Define conditionals so that "rpmbuild --with <feature>" and
# "rpmbuild --without <feature>" can enable and disable specific features
## Add option to enable support for stonith/external fencing agents
%bcond_with stonithd
## Add option to create binaries suitable for use with profiling tools
%bcond_with profiling
## Add option to create binaries with coverage analysis
%bcond_with coverage
## Add option to skip generating documentation
## (the build tools aren't available everywhere)
%bcond_without doc
## Add option to prefix package version with "0."
## (so later "official" packages will be considered updates)
%bcond_with pre_release
## Add option to ship Upstart job files
%bcond_with upstart_job
## Add option to turn off hardening of libraries and daemon executables
%bcond_without hardening
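## Example invocations (illustrative):
## rpmbuild --with profiling -ba pacemaker.spec # profiler-friendly binaries
## rpmbuild --without doc -ba pacemaker.spec # skip building documentation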
# Keep sane profiling data if requested
%if %{with profiling}
## Disable -debuginfo package and stripping binaries/libraries
%define debug_package %{nil}
%endif
# Define the release version
# (for tagged releases, ignore the externally enforced pre-release flag;
# only -rc tags, captured by the second condition, imply a pre-release)
%if (!%{tag_release} && %{with pre_release}) || 0%{pre_release}
%if 0%{pre_release}
%define pcmk_release 0.%{specversion}.%(s=%{shortcommit}; echo ${s: -3})
%else
%define pcmk_release 0.%{specversion}.%{shortcommit}.git
%endif
%else
%if 0%{tag_release}
%define pcmk_release %{specversion}
%else
%define pcmk_release %{specversion}.%{shortcommit}.git
%endif
%endif
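## Illustrative outcomes of the logic above: the tag Pacemaker-2.0.0 yields
## Release "1"; an untagged commit abbreviated e91769e yields "1.e91769e.git";
## a candidate tag such as Pacemaker-2.0.0-rc1 yields the pre-release form
## "0.1.rc1" (plus the usual dist suffix in all cases).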
Name: pacemaker
Summary: Scalable High-Availability cluster resource manager
Version: %{pcmkversion}
Release: %{pcmk_release}%{?dist}
%if %{defined _unitdir}
License: GPLv2+ and LGPLv2+
%else
# initscript is Revised BSD
License: GPLv2+ and LGPLv2+ and BSD
%endif
Url: http://www.clusterlabs.org
Group: System Environment/Daemons
# Hint: use "spectool -s 0 pacemaker.spec" (rpmdevtools) to check the final URL:
# https://github.com/ClusterLabs/pacemaker/archive/e91769e5a39f5cb2f7b097d3c612368f0530535e/pacemaker-e91769e.tar.gz
Source0: https://github.com/%{github_owner}/%{name}/archive/%{commit}/%{name}-%{shortcommit}.tar.gz
Requires: resource-agents
Requires: %{name}-libs = %{version}-%{release}
Requires: %{name}-cluster-libs = %{version}-%{release}
Requires: %{name}-cli = %{version}-%{release}
%{?systemd_requires}
# Pacemaker requires a minimum Python functionality
Requires: %{python_pkg} >= %{python_min}
BuildRequires: %{python_pkg}-devel >= %{python_min}
# Pacemaker requires a minimum libqb functionality
Requires: libqb >= 0.13.0
BuildRequires: libqb-devel >= 0.13.0
# Basics required for the build (even if usually satisfied through other BRs)
BuildRequires: coreutils findutils grep sed
# Required for core functionality
BuildRequires: automake autoconf gcc libtool pkgconfig libtool-ltdl-devel
BuildRequires: pkgconfig(glib-2.0) >= 2.16
BuildRequires: libxml2-devel libxslt-devel libuuid-devel
BuildRequires: bzip2-devel
# Enables optional functionality
BuildRequires: ncurses-devel docbook-style-xsl
BuildRequires: help2man gnutls-devel pam-devel pkgconfig(dbus-1)
%if %{systemd_native}
BuildRequires: pkgconfig(systemd)
%endif
Requires: corosync >= 2.0.0
BuildRequires: corosynclib-devel >= 2.0.0
%if %{with stonithd}
BuildRequires: cluster-glue-libs-devel
%endif
## (note: this cannot skip the BuildRequires when building through non-customized mock)
%if !%{bleeding}
%if %{with doc}
BuildRequires: inkscape asciidoc publican
%endif
%endif
%description
Pacemaker is an advanced, scalable High-Availability cluster resource
manager.
It supports more than 16 node clusters with significant capabilities
for managing resources and dependencies.
It will run scripts at initialization, when machines go up or down,
when related resources fail and can be configured to periodically check
resource health.
Available rpmbuild rebuild options:
--with(out) : coverage doc stonithd hardening pre_release profiling
upstart_job
%package cli
License: GPLv2+ and LGPLv2+
Summary: Command line tools for controlling Pacemaker clusters
Group: System Environment/Daemons
Requires: %{name}-libs = %{version}-%{release}
Requires: perl-TimeDate
%description cli
Pacemaker is an advanced, scalable High-Availability cluster resource
manager.
The %{name}-cli package contains command line tools that can be used
to query and control the cluster from machines that may, or may not,
be part of the cluster.
%package libs
License: GPLv2+ and LGPLv2+
Summary: Core Pacemaker libraries
Group: System Environment/Daemons
Requires(pre): shadow-utils
%description libs
Pacemaker is an advanced, scalable High-Availability cluster resource
manager.
The %{name}-libs package contains shared libraries needed for cluster
nodes and those just running the CLI tools.
%package cluster-libs
License: GPLv2+ and LGPLv2+
Summary: Cluster Libraries used by Pacemaker
Group: System Environment/Daemons
Requires: %{name}-libs = %{version}-%{release}
%description cluster-libs
Pacemaker is an advanced, scalable High-Availability cluster resource
manager.
The %{name}-cluster-libs package contains cluster-aware shared
libraries needed for nodes that will form part of the cluster.
%package remote
%if %{defined _unitdir}
License: GPLv2+ and LGPLv2+
%else
# initscript is Revised BSD
License: GPLv2+ and LGPLv2+ and BSD
%endif
Summary: Pacemaker remote daemon for non-cluster nodes
Group: System Environment/Daemons
Requires: %{name}-libs = %{version}-%{release}
Requires: %{name}-cli = %{version}-%{release}
Requires: resource-agents
# -remote can be fully independent of systemd
%{?systemd_ordering}%{!?systemd_ordering:%{?systemd_requires}}
%description remote
Pacemaker is an advanced, scalable High-Availability cluster resource
manager.
The %{name}-remote package contains the Pacemaker Remote daemon
which is capable of extending pacemaker functionality to remote
nodes not running the full corosync/cluster stack.
%package libs-devel
License: GPLv2+ and LGPLv2+
Summary: Pacemaker development package
Group: Development/Libraries
Requires: %{name}-cts = %{version}-%{release}
Requires: %{name}-libs%{?_isa} = %{version}-%{release}
Requires: %{name}-cluster-libs%{?_isa} = %{version}-%{release}
Requires: libuuid-devel%{?_isa} libtool-ltdl-devel%{?_isa}
Requires: libxml2-devel%{?_isa} libxslt-devel%{?_isa}
Requires: bzip2-devel%{?_isa} glib2-devel%{?_isa}
Requires: libqb-devel%{?_isa}
Requires: corosynclib-devel%{?_isa} >= 2.0.0
%description libs-devel
Pacemaker is an advanced, scalable High-Availability cluster resource
manager.
The %{name}-libs-devel package contains headers and shared libraries
for developing tools for Pacemaker.
%package cts
License: GPLv2+ and LGPLv2+
Summary: Test framework for cluster-related technologies like Pacemaker
Group: System Environment/Daemons
Requires: %{python_pkg} >= %{python_min}
Requires: %{name}-libs = %{version}-%{release}
BuildArch: noarch
# systemd python bindings are separate package in some distros
%if %{defined systemd_requires}
%if 0%{?fedora} > 22 || 0%{?rhel} > 7
Requires: %{python_pkg}-systemd
%else
%if 0%{?fedora} > 20 || 0%{?rhel} > 6
Requires: systemd-python
%endif
%endif
%endif
%description cts
Test framework for cluster-related technologies like Pacemaker
%package doc
License: CC-BY-SA-4.0
Summary: Documentation for Pacemaker
Group: Documentation
BuildArch: noarch
%description doc
Documentation for Pacemaker.
Pacemaker is an advanced, scalable High-Availability cluster resource
manager.
%prep
%setup -q -n %{name}-%{commit}
%build
# Early versions of autotools (e.g. RHEL <= 5) do not support --docdir
export docdir=%{pcmk_docdir}
export systemdunitdir=%{?_unitdir}%{!?_unitdir:no}
%if %{with hardening}
# prefer distro-provided hardening flags in case they are defined
# through _hardening_{c,ld}flags macros, configure script will
# use its own defaults otherwise; if such hardenings are completely
# undesired, rpmbuild using "--without hardening"
# (or "--define '_without_hardening 1'")
export CFLAGS_HARDENED_EXE="%{?_hardening_cflags}"
export CFLAGS_HARDENED_LIB="%{?_hardening_cflags}"
export LDFLAGS_HARDENED_EXE="%{?_hardening_ldflags}"
export LDFLAGS_HARDENED_LIB="%{?_hardening_ldflags}"
%endif
./autogen.sh
%{configure} \
%{?with_profiling: --with-profiling} \
%{?with_coverage: --with-coverage} \
%{!?with_doc: --with-brand=} \
%{!?with_hardening: --disable-hardening} \
%{?python_path: PYTHON=%{python_path}} \
%{?gnutls_priorities: --with-gnutls-priorities="%{gnutls_priorities}"} \
--with-initdir=%{_initrddir} \
--localstatedir=%{_var} \
--with-version=%{version}-%{release}
%if 0%{?suse_version} >= 1200
# Fedora handles rpath removal automagically
sed -i 's|^hardcode_libdir_flag_spec=.*|hardcode_libdir_flag_spec=""|g' libtool
sed -i 's|^runpath_var=LD_RUN_PATH|runpath_var=DIE_RPATH_DIE|g' libtool
%endif
make %{_smp_mflags} V=1 all
%check
{ cts/cts-pengine --run one-or-more-unrunnable-instances \
&& cts/cts-cli \
&& touch .CHECKED
} 2>&1 | sed 's/[fF]ail/faiil/g' # prevent false positives in rpmlint
[ -f .CHECKED ] && rm -f -- .CHECKED
exit $? # TODO remove when rpm<4.14 compatibility irrelevant
%install
make DESTDIR=%{buildroot} docdir=%{pcmk_docdir} V=1 install
mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig
install -m 644 daemons/pacemakerd/pacemaker.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/pacemaker
install -m 644 tools/crm_mon.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/crm_mon
%if %{with upstart_job}
mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/init
install -m 644 daemons/pacemakerd/pacemaker.upstart ${RPM_BUILD_ROOT}%{_sysconfdir}/init/pacemaker.conf
install -m 644 daemons/pacemakerd/pacemaker.combined.upstart ${RPM_BUILD_ROOT}%{_sysconfdir}/init/pacemaker.combined.conf
install -m 644 tools/crm_mon.upstart ${RPM_BUILD_ROOT}%{_sysconfdir}/init/crm_mon.conf
%endif
%if %{defined _unitdir}
mkdir -p ${RPM_BUILD_ROOT}%{_localstatedir}/lib/rpm-state/%{name}
%endif
# These are not actually scripts
find %{buildroot} -name '*.xml' -type f -print0 | xargs -0 chmod a-x
# Don't package static libs
find %{buildroot} -name '*.a' -type f -print0 | xargs -0 rm -f
find %{buildroot} -name '*.la' -type f -print0 | xargs -0 rm -f
# Don't ship init scripts for systemd based platforms
%if %{defined _unitdir}
rm -f %{buildroot}/%{_initrddir}/pacemaker
rm -f %{buildroot}/%{_initrddir}/pacemaker_remote
%endif
%if %{with coverage}
GCOV_BASE=%{buildroot}/%{_var}/lib/pacemaker/gcov
mkdir -p $GCOV_BASE
find . -name '*.gcno' -type f | while read F ; do
D=`dirname $F`
mkdir -p ${GCOV_BASE}/$D
cp $F ${GCOV_BASE}/$D
done
%endif
%post
%if %{defined _unitdir}
%systemd_post pacemaker.service
%else
/sbin/chkconfig --add pacemaker || :
%endif
%preun
%if %{defined _unitdir}
%systemd_preun pacemaker.service
%else
/sbin/service pacemaker stop >/dev/null 2>&1 || :
if [ $1 -eq 0 ]; then
# Package removal, not upgrade
/sbin/chkconfig --del pacemaker || :
fi
%endif
%postun
%if %{defined _unitdir}
%systemd_postun_with_restart pacemaker.service
%endif
%pre remote
%if %{defined _unitdir}
# Stop the service before anything is touched, and remember to restart
# it as one of the last actions (compared to using systemd_postun_with_restart,
# this avoids suicide when sbd is in use)
systemctl --quiet is-active pacemaker_remote
if [ $? -eq 0 ] ; then
mkdir -p %{_localstatedir}/lib/rpm-state/%{name}
touch %{_localstatedir}/lib/rpm-state/%{name}/restart_pacemaker_remote
systemctl stop pacemaker_remote >/dev/null 2>&1
else
rm -f %{_localstatedir}/lib/rpm-state/%{name}/restart_pacemaker_remote
fi
%endif
%post remote
%if %{defined _unitdir}
%systemd_post pacemaker_remote.service
%else
/sbin/chkconfig --add pacemaker_remote || :
%endif
%preun remote
%if %{defined _unitdir}
%systemd_preun pacemaker_remote.service
%else
/sbin/service pacemaker_remote stop >/dev/null 2>&1 || :
if [ $1 -eq 0 ]; then
# Package removal, not upgrade
/sbin/chkconfig --del pacemaker_remote || :
fi
%endif
%postun remote
%if %{defined _unitdir}
# This next line is a no-op, because we stopped the service earlier, but
# we leave it here because it allows us to revert to the standard behavior
# in the future if desired
%systemd_postun_with_restart pacemaker_remote.service
# Explicitly take care of removing the flag-file(s) upon final removal
if [ $1 -eq 0 ] ; then
rm -f %{_localstatedir}/lib/rpm-state/%{name}/restart_pacemaker_remote
fi
%endif
%posttrans remote
%if %{defined _unitdir}
if [ -e %{_localstatedir}/lib/rpm-state/%{name}/restart_pacemaker_remote ] ; then
systemctl start pacemaker_remote >/dev/null 2>&1
rm -f %{_localstatedir}/lib/rpm-state/%{name}/restart_pacemaker_remote
fi
%endif
%post cli
%if %{defined _unitdir}
%systemd_post crm_mon.service
%endif
%preun cli
%if %{defined _unitdir}
%systemd_preun crm_mon.service
%endif
%postun cli
%if %{defined _unitdir}
%systemd_postun_with_restart crm_mon.service
%endif
%post cts
%if %{defined _unitdir}
%systemd_post pacemaker-cts-dummyd@.service
%endif
%preun cts
%if %{defined _unitdir}
%systemd_preun pacemaker-cts-dummyd@.service
%endif
%postun cts
%if %{defined _unitdir}
%systemd_postun_with_restart pacemaker-cts-dummyd@.service
%endif
%pre libs
getent group %{gname} >/dev/null || groupadd -r %{gname} -g 189
getent passwd %{uname} >/dev/null || useradd -r -g %{gname} -u 189 -s /sbin/nologin -c "cluster user" %{uname}
exit 0
%if %{defined ldconfig_scriptlets}
%ldconfig_scriptlets libs
%ldconfig_scriptlets cluster-libs
%else
%post libs -p /sbin/ldconfig
%postun libs -p /sbin/ldconfig
%post cluster-libs -p /sbin/ldconfig
%postun cluster-libs -p /sbin/ldconfig
%endif
%files
###########################################################
%config(noreplace) %{_sysconfdir}/sysconfig/pacemaker
%{_sbindir}/pacemakerd
%if %{defined _unitdir}
%{_unitdir}/pacemaker.service
%else
%{_initrddir}/pacemaker
%endif
%exclude %{_libexecdir}/pacemaker/cts-exec-helper
-%exclude %{_sbindir}/pacemaker_remoted
+%exclude %{_sbindir}/pacemaker-remoted
%{_libexecdir}/pacemaker/*
%{_sbindir}/crm_attribute
%{_sbindir}/crm_master
%{_sbindir}/crm_node
%{_sbindir}/fence_legacy
%{_sbindir}/stonith_admin
%doc %{_mandir}/man7/crmd.*
%doc %{_mandir}/man7/pengine.*
%doc %{_mandir}/man7/stonithd.*
%doc %{_mandir}/man7/ocf_pacemaker_controld.*
%doc %{_mandir}/man7/ocf_pacemaker_o2cb.*
%doc %{_mandir}/man7/ocf_pacemaker_remote.*
%doc %{_mandir}/man8/crm_attribute.*
%doc %{_mandir}/man8/crm_node.*
%doc %{_mandir}/man8/crm_master.*
%doc %{_mandir}/man8/fence_legacy.*
%doc %{_mandir}/man8/pacemakerd.*
%doc %{_mandir}/man8/stonith_admin.*
%doc %{_datadir}/pacemaker/alerts
%license licenses/GPLv2
%doc COPYING
%doc ChangeLog
%dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/cib
%dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/pengine
/usr/lib/ocf/resource.d/pacemaker/controld
/usr/lib/ocf/resource.d/pacemaker/o2cb
/usr/lib/ocf/resource.d/pacemaker/remote
%if %{with upstart_job}
%config(noreplace) %{_sysconfdir}/init/pacemaker.conf
%config(noreplace) %{_sysconfdir}/init/pacemaker.combined.conf
%endif
%files cli
%dir %attr (750, root, %{gname}) %{_sysconfdir}/pacemaker
%config(noreplace) %{_sysconfdir}/logrotate.d/pacemaker
%config(noreplace) %{_sysconfdir}/sysconfig/crm_mon
%if %{defined _unitdir}
%{_unitdir}/crm_mon.service
%endif
%if %{with upstart_job}
%config(noreplace) %{_sysconfdir}/init/crm_mon.conf
%endif
%{_sbindir}/attrd_updater
%{_sbindir}/cibadmin
%{_sbindir}/crm_diff
%{_sbindir}/crm_error
%{_sbindir}/crm_failcount
%{_sbindir}/crm_mon
%{_sbindir}/crm_resource
%{_sbindir}/crm_standby
%{_sbindir}/crm_verify
%{_sbindir}/crmadmin
%{_sbindir}/iso8601
%{_sbindir}/crm_shadow
%{_sbindir}/crm_simulate
%{_sbindir}/crm_report
%{_sbindir}/crm_ticket
%exclude %{_datadir}/pacemaker/alerts
%exclude %{_datadir}/pacemaker/tests
%{_datadir}/pacemaker
%{_datadir}/snmp/mibs/PCMK-MIB.txt
%{_libexecdir}/pacemaker/cts-exec-helper
%exclude /usr/lib/ocf/resource.d/pacemaker/controld
%exclude /usr/lib/ocf/resource.d/pacemaker/o2cb
%exclude /usr/lib/ocf/resource.d/pacemaker/remote
%dir /usr/lib/ocf
%dir /usr/lib/ocf/resource.d
/usr/lib/ocf/resource.d/pacemaker
%doc %{_mandir}/man7/*
%exclude %{_mandir}/man7/crmd.*
%exclude %{_mandir}/man7/pengine.*
%exclude %{_mandir}/man7/stonithd.*
%exclude %{_mandir}/man7/ocf_pacemaker_controld.*
%exclude %{_mandir}/man7/ocf_pacemaker_o2cb.*
%exclude %{_mandir}/man7/ocf_pacemaker_remote.*
%doc %{_mandir}/man8/*
%exclude %{_mandir}/man8/crm_attribute.*
%exclude %{_mandir}/man8/crm_node.*
%exclude %{_mandir}/man8/crm_master.*
%exclude %{_mandir}/man8/fence_legacy.*
%exclude %{_mandir}/man8/pacemakerd.*
-%exclude %{_mandir}/man8/pacemaker_remoted.*
+%exclude %{_mandir}/man8/pacemaker-remoted.*
%exclude %{_mandir}/man8/stonith_admin.*
%license licenses/GPLv2
%doc COPYING
%doc ChangeLog
%dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker
%dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/blackbox
%dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/cores
%dir %attr (770, %{uname}, %{gname}) %{_var}/log/pacemaker
%dir %attr (770, %{uname}, %{gname}) %{_var}/log/pacemaker/bundles
%files libs
%{_libdir}/libcib.so.*
%{_libdir}/liblrmd.so.*
%{_libdir}/libcrmservice.so.*
%{_libdir}/libcrmcommon.so.*
%{_libdir}/libpe_status.so.*
%{_libdir}/libpe_rules.so.*
%{_libdir}/libpengine.so.*
%{_libdir}/libstonithd.so.*
%{_libdir}/libtransitioner.so.*
%license licenses/LGPLv2.1
%doc COPYING
%doc ChangeLog
%files cluster-libs
%{_libdir}/libcrmcluster.so.*
%license licenses/LGPLv2.1
%doc COPYING
%doc ChangeLog
%files remote
%config(noreplace) %{_sysconfdir}/sysconfig/pacemaker
%if %{defined _unitdir}
# state directory is shared between the subpackages
# let rpm take care of removing it once it isn't
# referenced anymore and empty
%ghost %dir %{_localstatedir}/lib/rpm-state/%{name}
%{_unitdir}/pacemaker_remote.service
%else
%{_initrddir}/pacemaker_remote
%endif
-%{_sbindir}/pacemaker_remoted
-%{_mandir}/man8/pacemaker_remoted.*
+%{_sbindir}/pacemaker-remoted
+%{_mandir}/man8/pacemaker-remoted.*
%license licenses/GPLv2
%doc COPYING
%doc ChangeLog
%files doc
%doc %{pcmk_docdir}
%license licenses/CC-BY-SA-4.0
%files cts
%{py_site}/cts
%{_datadir}/pacemaker/tests
%if %{defined _unitdir}
%{_unitdir}/pacemaker-cts-dummyd@.service
%endif
%license licenses/GPLv2
%doc COPYING
%doc ChangeLog
%files libs-devel
%{_includedir}/pacemaker
%{_libdir}/*.so
%if %{with coverage}
%{_var}/lib/pacemaker/gcov
%endif
%{_libdir}/pkgconfig/*.pc
%license licenses/LGPLv2.1
%doc COPYING
%doc ChangeLog
%changelog
diff --git a/tools/report.collector.in b/tools/report.collector.in
index f565009fd3..db4cbf483f 100644
--- a/tools/report.collector.in
+++ b/tools/report.collector.in
@@ -1,818 +1,823 @@
#
# Originally based on hb_report
# Copyright 2007 Dejan Muhamedagic <dmuhamedagic@suse.de>
#
# Later changes copyright 2010-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
if
echo $REPORT_HOME | grep -qs '^/'
then
debug "Using full path to working directory: $REPORT_HOME"
else
REPORT_HOME="$HOME/$REPORT_HOME"
debug "Canonicalizing working directory path: $REPORT_HOME"
fi
detect_host
#
# find files newer than a and older than b
#
isnumber() {
echo "$*" | grep -qs '^[0-9][0-9]*$'
}
touchfile() {
t=`mktemp` &&
perl -e "\$file=\"$t\"; \$tm=$1;" -e 'utime $tm, $tm, $file;' &&
echo $t
}
find_files_clean() {
[ -z "$from_stamp" ] || rm -f "$from_stamp"
[ -z "$to_stamp" ] || rm -f "$to_stamp"
from_stamp=""
to_stamp=""
}
find_files() {
dirs=
from_time=$2
to_time=$3
for d in $1; do
if [ -d $d ]; then
dirs="$dirs $d"
fi
done
if [ x"$dirs" = x ]; then
return
fi
isnumber "$from_time" && [ "$from_time" -gt 0 ] || {
warning "sorry, can't find files in [ $1 ] based on time if you don't supply time"
return
}
trap find_files_clean 0
if ! from_stamp=`touchfile $from_time`; then
warning "sorry, can't create temporary file for find_files"
return
fi
findexp="-newer $from_stamp"
if isnumber "$to_time" && [ "$to_time" -gt 0 ]; then
if ! to_stamp=`touchfile $to_time`; then
warning "sorry, can't create temporary file for find_files"
find_files_clean
return
fi
findexp="$findexp ! -newer $to_stamp"
fi
find $dirs -type f $findexp
find_files_clean
trap "" 0
}
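# Illustrative use of the helpers above (not part of the normal report flow):
# list files under /var/log/pacemaker changed within the last hour, assuming
# a date(1) that supports the %s format:
# now=`date +%s`
# find_files "/var/log/pacemaker" $((now - 3600)) "$now"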
#
# check permissions of files/dirs
#
pl_checkperms() {
perl -e '
# check permissions and ownership
# uid and gid are numeric
# everything must match exactly
# no error checking! (file should exist, etc)
($filename, $perms, $in_uid, $in_gid) = @ARGV;
($mode,$uid,$gid) = (stat($filename))[2,4,5];
$p=sprintf("%04o", $mode & 07777);
$p ne $perms and exit(1);
$uid ne $in_uid and exit(1);
$gid ne $in_gid and exit(1);
' $*
}
num_id() {
getent $1 $2 | awk -F: '{print $3}'
}
chk_id() {
[ "$2" ] && return 0
echo "$1: id not found"
return 1
}
check_perms() {
while read type f p uid gid; do
if [ ! -e "$f" ]; then
echo "$f doesn't exist"
continue
elif [ ! -$type "$f" ]; then
echo "$f has wrong type"
continue
fi
n_uid=`num_id passwd $uid`
chk_id "$uid" "$n_uid" || continue
n_gid=`num_id group $gid`
chk_id "$gid" "$n_gid" || continue
pl_checkperms $f $p $n_uid $n_gid || {
echo "wrong permissions or ownership for $f:"
ls -ld $f
}
done
}
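# Illustrative: check_perms reads "type path perms uid gid" records on stdin,
# the same format essential_files emits further below, e.g.:
# echo "d /var/lib/pacemaker/pengine 0750 hacluster haclient" | check_perms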
#
# coredumps
#
findbinary() {
random_binary=`which cat 2>/dev/null` # suppose we are lucky
binary=`gdb $random_binary $1 < /dev/null 2>/dev/null |
grep 'Core was generated' | awk '{print $5}' |
sed "s/^.//;s/[.':]*$//"`
if [ x = x"$binary" ]; then
debug "Could not detect the program name for core $1 from the gdb output; will try with file(1)"
binary=$(file $1 | awk '/from/{
for( i=1; i<=NF; i++ )
if( $i == "from" ) {
print $(i+1)
break
}
}')
binary=`echo $binary | tr -d "'"`
binary=$(echo $binary | tr -d '`')
if [ "$binary" ]; then
binary=`which $binary 2>/dev/null`
fi
fi
if [ x = x"$binary" ]; then
warning "Could not find the program path for core $1"
return
fi
fullpath=`which $binary 2>/dev/null`
if [ x = x"$fullpath" ]; then
if [ -x $CRM_DAEMON_DIR/$binary ]; then
echo $CRM_DAEMON_DIR/$binary
debug "Found the program at $CRM_DAEMON_DIR/$binary for core $1"
else
warning "Could not find the program path for core $1"
fi
else
echo $fullpath
debug "Found the program at $fullpath for core $1"
fi
}
getbt() {
which gdb > /dev/null 2>&1 || {
warning "Please install gdb to get backtraces"
return
}
for corefile; do
absbinpath=`findbinary $corefile`
[ x = x"$absbinpath" ] && continue
echo "====================== start backtrace ======================"
ls -l $corefile
# Summary first...
gdb -batch -n -quiet -ex ${BT_OPTS:-"thread apply all bt"} -ex quit \
$absbinpath $corefile 2>/dev/null
echo "====================== start detail ======================"
# Now the unreadable details...
gdb -batch -n -quiet -ex ${BT_OPTS:-"thread apply all bt full"} -ex quit \
$absbinpath $corefile 2>/dev/null
echo "======================= end backtrace ======================="
done
}
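# Illustrative: BT_OPTS overrides the gdb batch command used above, e.g.
# BT_OPTS="bt" getbt /var/lib/pacemaker/cores/core.1234
# (the core file path here is hypothetical)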
dump_status_and_config() {
crm_mon -1 2>&1 | grep -v '^Last upd' > $target/$CRM_MON_F
cibadmin -Ql 2>/dev/null > $target/${CIB_F}.live
}
getconfig() {
cluster=$1; shift;
target=$1; shift;
for cf in $*; do
if [ -e "$cf" ]; then
cp -a "$cf" $target/
fi
done
if is_running crmd; then
dump_status_and_config
case $cluster in
corosync) crm_node -p --corosync > $target/$MEMBERSHIP_F 2>&1;;
*) crm_node -p > $target/$MEMBERSHIP_F 2>&1;;
esac
echo "$host" > $target/RUNNING
+ elif is_running pacemaker-remoted; then
+ dump_status_and_config
+ echo "$host" > $target/RUNNING
+
+ # Pre-2.0.0 daemon name in case we're collecting on a mixed-version cluster
elif is_running pacemaker_remoted; then
dump_status_and_config
echo "$host" > $target/RUNNING
else
echo "$host" > $target/STOPPED
fi
}
get_readable_cib() {
target="$1"; shift;
if [ -f "$target/$CIB_F" ]; then
crm_verify -V -x "$target/$CIB_F" >"$target/$CRM_VERIFY_F" 2>&1
if which crm >/dev/null 2>&1 ; then
CIB_file="$target/$CIB_F" crm configure show >"$target/$CIB_TXT_F" 2>&1
elif which pcs >/dev/null 2>&1 ; then
pcs config -f "$target/$CIB_F" >"$target/$CIB_TXT_F" 2>&1
fi
fi
}
#
# remove values of sensitive attributes
#
# this is not proper xml parsing, but it will work under the
# circumstances
sanitize_xml_attrs() {
sed $(
for patt in $SANITIZE; do
echo "-e /name=\"$patt\"/s/value=\"[^\"]*\"/value=\"****\"/"
done
)
}
sanitize_hacf() {
awk '
$1=="stonith_host"{ for( i=5; i<=NF; i++ ) $i="****"; }
{print}
'
}
sanitize_one_clean() {
[ -z "$tmp" ] || rm -f "$tmp"
tmp=""
[ -z "$ref" ] || rm -f "$ref"
ref=""
}
sanitize() {
file=$1
compress=""
if [ -z "$SANITIZE" ]; then
return
fi
echo $file | grep -qs 'gz$' && compress=gzip
echo $file | grep -qs 'bz2$' && compress=bzip2
if [ "$compress" ]; then
decompress="$compress -dc"
else
compress=cat
decompress=cat
fi
trap sanitize_one_clean 0
tmp=`mktemp`
ref=`mktemp`
if [ -z "$tmp" -o -z "$ref" ]; then
sanitize_one_clean
fatal "cannot create temporary files"
fi
touch -r $file $ref # save the mtime
if [ "`basename $file`" = ha.cf ]; then
sanitize_hacf
else
$decompress | sanitize_xml_attrs | $compress
fi < $file > $tmp
mv $tmp $file
# note: cleaning $tmp up is still needed even after it's renamed
# because its temp directory is still there.
touch -r $ref $file
sanitize_one_clean
trap "" 0
}
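# Illustrative: with SANITIZE set to a pattern such as "passw.*" (assumed
# default), sanitize_xml_attrs rewrites
# <nvpair name="password" value="secret"/>
# to value="****" while leaving the rest of the XML untouched.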
#
# get some system info
#
distro() {
if
which lsb_release >/dev/null 2>&1
then
lsb_release -d | sed -e 's/^Description:\s*//'
debug "Using lsb_release for distribution info"
return
fi
relf=`ls /etc/debian_version 2>/dev/null` ||
relf=`ls /etc/slackware-version 2>/dev/null` ||
relf=`ls -d /etc/*-release 2>/dev/null` && {
for f in $relf; do
test -f $f && {
echo "`ls $f` `cat $f`"
debug "Found `echo $relf | tr '\n' ' '` distribution release file(s)"
return
}
done
}
warning "No lsb_release, no /etc/*-release, no /etc/debian_version: no distro information"
}
pkg_ver() {
if which dpkg >/dev/null 2>&1 ; then
pkg_mgr="deb"
elif which rpm >/dev/null 2>&1 ; then
pkg_mgr="rpm"
elif which pkg_info >/dev/null 2>&1 ; then
pkg_mgr="pkg_info"
elif which pkginfo >/dev/null 2>&1 ; then
pkg_mgr="pkginfo"
else
warning "Unknown package manager"
return
fi
debug "The package manager is: $pkg_mgr"
echo "The package manager is: $pkg_mgr"
echo "Installed packages:"
case $pkg_mgr in
deb)
dpkg-query -f '${Package} ${Version} ${Architecture}\n' -W | sort
echo
for pkg in $*; do
if dpkg-query -W $pkg 2>/dev/null ; then
debug "Verifying installation of: $pkg"
echo "Verifying installation of: $pkg"
debsums -s $pkg 2>/dev/null
fi
done
;;
rpm)
rpm -qa --qf '%{name} %{version}-%{release} - %{distribution} %{arch}\n' | sort
echo
for pkg in $*; do
if rpm -q $pkg >/dev/null 2>&1 ; then
debug "Verifying installation of: $pkg"
echo "Verifying installation of: $pkg"
rpm --verify $pkg 2>&1
fi
done
;;
pkg_info)
pkg_info
;;
pkginfo)
pkginfo | awk '{print $3}' # format?
;;
esac
}
getbacktraces() {
debug "Looking for backtraces: $*"
flist=$(
for f in `find_files "$CRM_CORE_DIRS" $1 $2`; do
bf=`basename $f`
test `expr match $bf core` -gt 0 &&
echo $f
done)
if [ "$flist" ]; then
for core in $flist; do
log "Found core file: `ls -al $core`"
done
# Make a copy of them in case we need more data later
# Luckily they compress well
mkdir cores >/dev/null 2>&1
cp -a $flist cores/
shrink cores
rm -rf cores
# Now get as much as we can from them automagically
for f in $flist; do
getbt $f
done
fi
}
getpeinputs() {
if [ -n "$PE_STATE_DIR" ]; then
flist=$(
find_files "$PE_STATE_DIR" "$1" "$2" | sed "s,`dirname $PE_STATE_DIR`/,,g"
)
if [ "$flist" ]; then
(cd $(dirname "$PE_STATE_DIR") && tar cf - $flist) | (cd "$3" && tar xf -)
debug "found `echo $flist | wc -w` pengine input files in $PE_STATE_DIR"
fi
fi
}
getblackboxes() {
flist=$(
find_files $BLACKBOX_DIR $1 $2
)
for bb in $flist; do
bb_short=`basename $bb`
qb-blackbox $bb > $3/${bb_short}.blackbox 2>&1
info "Extracting contents of blackbox: $bb_short"
done
}
#
# some basic system info and stats
#
sys_info() {
cluster=$1; shift
echo "Platform: `uname`"
echo "Kernel release: `uname -r`"
echo "Architecture: `uname -m`"
if [ `uname` = Linux ]; then
echo "Distribution: `distro`"
fi
echo
cibadmin --version 2>&1 | head -1
cibadmin -! 2>&1
case $cluster in
corosync)
/usr/sbin/corosync -v 2>&1 | head -1
;;
esac
# Cluster glue version hash (if available)
stonith -V 2>/dev/null
# Resource agents version hash
echo "resource-agents: `grep 'Build version:' /usr/lib/ocf/resource.d/heartbeat/.ocf-shellfuncs`"
echo
pkg_ver $*
}
sys_stats() {
set -x
uname -n
uptime
ps axf
ps auxw
top -b -n 1
ifconfig -a
ip addr list
netstat -i
arp -an
test -d /proc && {
cat /proc/cpuinfo
}
lsscsi
lspci
mount
df
set +x
}
dlm_dump() {
if which dlm_tool >/dev/null 2>&1 ; then
if is_running dlm_controld; then
echo "--- Lockspace overview:"
dlm_tool ls -n
echo "---Lockspace history:"
dlm_tool dump
echo "---Lockspace status:"
dlm_tool status
dlm_tool status -v
echo "---Lockspace config:"
dlm_tool dump_config
dlm_tool log_plock
dlm_tool ls | grep name |
while read X N ; do
echo "--- Lockspace $N:"
dlm_tool lockdump "$N"
dlm_tool lockdebug -svw "$N"
done
fi
fi
}
drbd_info() {
test -f /proc/drbd && {
echo "--- /proc/drbd:"
cat /proc/drbd 2>&1
echo
}
if which drbd-overview >/dev/null 2>&1; then
echo "--- drbd-overview:"
drbd-overview 2>&1
echo
fi
if which drbdsetup >/dev/null 2>&1; then
echo "--- drbdsetup status:"
drbdsetup status --verbose --statistics 2>&1
echo
echo "--- drbdsetup events2:"
drbdsetup events2 --timestamps --statistics --now 2>&1
echo
fi
if which drbdadm >/dev/null 2>&1; then
echo "--- drbdadm show-gi:"
for res in $(drbdsetup status | grep -e ^\\S | awk '{ print $1 }'); do
echo "$res:"
drbdadm show-gi $res 2>&1
echo
done
fi
}
iscfvarset() {
test "`getcfvar $1 $2`"
}
iscfvartrue() {
getcfvar $1 $2 $3 | egrep -qsi "^(true|y|yes|on|1)"
}
get_logfiles() {
cf_type=$1
cf_file="$2"
facility_var="logfacility"
case $cf_type in
corosync)
if [ -f "$cf_file" ]; then
debug "Reading $cf_type log settings from $cf_file"
if iscfvartrue $cf_type to_syslog "$cf_file"; then
facility_var=syslog_facility
fi
if iscfvartrue $cf_type to_logfile "$cf_file"; then
logfile=$(getcfvar $cf_type logfile "$cf_file")
fi
fi
;;
esac
if [ -z "$logfile" ]; then
# @TODO Use PCMK_logfile if set
logfile="@CRM_LOG_DIR@/pacemaker.log"
debug "Log settings not found for cluster type $cf_type, assuming $logfile"
fi
if [ -f "$logfile" ]; then
echo $logfile
fi
if [ "x$facility" = x ]; then
facility=`getcfvar $cf_type $facility_var $cf_file`
[ "" = "$facility" ] && facility="daemon"
fi
# Always include system logs (if we can find them)
msg="Mark:pcmk:`perl -e 'print time()'`"
logger -p $facility.info $msg >/dev/null 2>&1
sleep 2 # Give syslog time to catch up in case it's busy
findmsg 1 "$msg"
# Look for detail logs:
# - initial pacemakerd logs and tracing might go to a different file
pattern="Starting Pacemaker"
# - make sure we get something from the Policy Engine
pattern="$pattern\\|Calculated Transition"
# - cib and pacemaker-execd updates
# (helpful on non-DC nodes and when cluster has been up for a long time)
pattern="$pattern\\|cib_perform_op\\|process_lrm_event"
# - pacemaker_remote might use a different file
- pattern="$pattern\\|pacemaker_remoted:"
+ pattern="$pattern\\|pacemaker[-_]remoted:"
findmsg 3 "$pattern"
}
essential_files() {
cat<<EOF
d $PE_STATE_DIR 0750 hacluster haclient
d $CRM_CONFIG_DIR 0750 hacluster haclient
d $CRM_STATE_DIR 0750 hacluster haclient
EOF
}
# Trim leading and ending whitespace (using only POSIX expressions)
trim() {
TRIM_S="$1"
TRIM_S="${TRIM_S#"${TRIM_S%%[![:space:]]*}"}"
TRIM_S="${TRIM_S%"${TRIM_S##*[![:space:]]}"}"
echo -n "$TRIM_S"
}
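# e.g. trim " two words " prints "two words" (inner whitespace is preserved)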
collect_logs() {
CL_START="$1"
shift
CL_END="$1"
shift
CL_LOGFILES="$@"
which journalctl > /dev/null 2>&1
if [ $? -eq 0 ]; then
cl_have_journald=1
else
cl_have_journald=0
fi
cl_lognames="$CL_LOGFILES"
if [ $cl_have_journald -eq 1 ]; then
cl_lognames="$cl_lognames journalctl"
fi
cl_lognames=$(trim "$cl_lognames")
if [ -z "$cl_lognames" ]; then
return
fi
# YYYY-MM-DD HH:MM:SS
cl_start_ymd=$(date -d @${CL_START} +"%F %T")
cl_end_ymd=$(date -d @${CL_END} +"%F %T")
debug "Gathering logs from $cl_start_ymd to $cl_end_ymd:"
debug " $cl_lognames"
# Remove our temporary file if we get interrupted here
trap '[ -z "$cl_pattfile" ] || rm -f "$cl_pattfile"' 0
# Create a temporary file with patterns to grep for
cl_pattfile=$(mktemp) || fatal "cannot create temporary files"
for cl_pattern in $LOG_PATTERNS; do
echo "$cl_pattern"
done > $cl_pattfile
echo "Log pattern matches from $REPORT_TARGET:" > $ANALYSIS_F
if [ -n "$CL_LOGFILES" ]; then
for cl_logfile in $CL_LOGFILES; do
cl_extract="$(basename $cl_logfile).extract.txt"
if [ ! -f "$cl_logfile" ]; then
# Not a file
continue
elif [ -f "$cl_extract" ]; then
# We already have it
continue
fi
dumplogset "$cl_logfile" $LOG_START $LOG_END > "$cl_extract"
sanitize "$cl_extract"
grep -f "$cl_pattfile" "$cl_extract" >> $ANALYSIS_F
done
fi
# Collect systemd logs if present
if [ $cl_have_journald -eq 1 ]; then
journalctl --since "$cl_start_ymd" --until "$cl_end_ymd" > journal.log
grep -f "$cl_pattfile" journal.log >> $ANALYSIS_F
fi
rm -f $cl_pattfile
trap "" 0
}
debug "Initializing $REPORT_TARGET subdir"
if [ "$REPORT_MASTER" != "$REPORT_TARGET" ]; then
if [ -e $REPORT_HOME/$REPORT_TARGET ]; then
warning "Directory $REPORT_HOME/$REPORT_TARGET already exists, using /tmp/$$/$REPORT_TARGET instead"
REPORT_HOME=/tmp/$$
fi
fi
mkdir -p $REPORT_HOME/$REPORT_TARGET
cd $REPORT_HOME/$REPORT_TARGET
case $CLUSTER in
any) cluster=`get_cluster_type`;;
*) cluster=$CLUSTER;;
esac
cluster_cf=`find_cluster_cf $cluster`
# If cluster stack is still "any", this might be a Pacemaker Remote node,
# so don't complain in that case.
if [ -z "$cluster_cf" ] && [ $cluster != "any" ]; then
warning "Could not determine the location of your cluster configuration"
fi
if [ "$SEARCH_LOGS" = "1" ]; then
logfiles=$(get_logfiles "$cluster" "$cluster_cf" | sort -u)
fi
logfiles="$(trim "$logfiles $EXTRA_LOGS")"
if [ -z "$logfiles" ]; then
which journalctl > /dev/null 2>&1
if [ $? -eq 0 ]; then
info "Systemd journal will be only log collected"
else
info "No logs will be collected"
fi
info "No log files found or specified with --logfile /some/path"
fi
debug "Config: $cluster ($cluster_cf) $logfiles"
sys_info $cluster $PACKAGES > $SYSINFO_F
essential_files $cluster | check_perms > $PERMISSIONS_F 2>&1
getconfig $cluster "$REPORT_HOME/$REPORT_TARGET" "$cluster_cf" "$CRM_CONFIG_DIR/$CIB_F" "/etc/drbd.conf" "/etc/drbd.d" "/etc/booth"
getpeinputs $LOG_START $LOG_END $REPORT_HOME/$REPORT_TARGET
getbacktraces $LOG_START $LOG_END > $REPORT_HOME/$REPORT_TARGET/$BT_F
getblackboxes $LOG_START $LOG_END $REPORT_HOME/$REPORT_TARGET
case $cluster in
corosync)
if is_running corosync; then
corosync-blackbox >corosync-blackbox-live.txt 2>&1
# corosync-fplay > corosync-blackbox.txt
tool=`pickfirst corosync-objctl corosync-cmapctl`
case $tool in
*objctl) $tool -a > corosync.dump 2>/dev/null;;
*cmapctl) $tool > corosync.dump 2>/dev/null;;
esac
corosync-quorumtool -s -i > corosync.quorum 2>&1
fi
;;
esac
dc=`crm_mon -1 2>/dev/null | awk '/Current DC/ {print $3}'`
if [ "$REPORT_TARGET" = "$dc" ]; then
echo "$REPORT_TARGET" > DC
fi
dlm_dump > $DLM_DUMP_F 2>&1
sys_stats > $SYSSTATS_F 2>&1
drbd_info > $DRBD_INFO_F 2>&1
debug "Sanitizing files: $SANITIZE"
#
# replace sensitive info with '****'
#
cf=""
if [ ! -z "$cluster_cf" ]; then
cf=`basename $cluster_cf`
fi
for f in "$cf" "$CIB_F" "$CIB_F.live" pengine/*; do
if [ -f "$f" ]; then
sanitize "$f"
fi
done
# For convenience, generate human-readable version of CIB and any XML errors
# in it (AFTER sanitizing, so we don't need to sanitize this output)
get_readable_cib "$REPORT_HOME/$REPORT_TARGET"
collect_logs "$LOG_START" "$LOG_END" $logfiles
# Purge files containing no information
for f in `ls -1`; do
if [ -d "$f" ]; then
continue
elif [ ! -s "$f" ]; then
case $f in
*core*) log "Detected empty core file: $f";;
*) debug "Removing empty file: `ls -al $f`"
rm -f $f
;;
esac
fi
done
# Parse for events
for l in $logfiles; do
b="$(basename $l).extract.txt"
node_events "$b" > $EVENTS_F
# Link the first logfile to a standard name if it doesn't yet exist
if [ -e "$b" -a ! -e "$HALOG_F" ]; then
ln -s "$b" "$HALOG_F"
fi
done
if [ -e $REPORT_HOME/.env ]; then
debug "Localhost: $REPORT_MASTER $REPORT_TARGET"
elif [ "$REPORT_MASTER" != "$REPORT_TARGET" ]; then
debug "Streaming report back to $REPORT_MASTER"
(cd $REPORT_HOME && tar cf - $REPORT_TARGET)
if [ "$REMOVE" = "1" ]; then
cd
rm -rf $REPORT_HOME
fi
fi
# vim: set expandtab tabstop=8 softtabstop=4 shiftwidth=4 textwidth=80:
diff --git a/tools/report.common.in b/tools/report.common.in
index 3a051e6f73..c4c7289467 100644
--- a/tools/report.common.in
+++ b/tools/report.common.in
@@ -1,851 +1,858 @@
#
# Originally based on hb_report
# Copyright 2007 Dejan Muhamedagic <dmuhamedagic@suse.de>
#
# Later changes copyright 2010-2018 Andrew Beekhof <andrew@beekhof.net>
#
# This source code is licensed under the GNU General Public License version 2
# or later (GPLv2+) WITHOUT ANY WARRANTY.
#
host=`uname -n`
shorthost=`echo $host | sed s:\\\\..*::`
if [ -z "$verbose" ]; then
verbose=0
fi
# Target Files
EVENTS_F=events.txt
ANALYSIS_F=analysis.txt
HALOG_F=cluster-log.txt
BT_F=backtraces.txt
SYSINFO_F=sysinfo.txt
SYSSTATS_F=sysstats.txt
DLM_DUMP_F=dlm_dump.txt
CRM_MON_F=crm_mon.txt
MEMBERSHIP_F=members.txt
CRM_VERIFY_F=crm_verify.txt
PERMISSIONS_F=permissions.txt
CIB_F=cib.xml
CIB_TXT_F=cib.txt
DRBD_INFO_F=drbd_info.txt
EVENT_PATTERNS="
state do_state_transition
membership pcmk_peer_update.*(lost|memb):
quorum crmd.*crm_update_quorum
pause Process.pause.detected
resources (lrmd|pacemaker-execd).*rsc:(start|stop)
stonith te_fence_node|stonith-ng.*log_oper.*report|stonithd.*(requests|(Succeeded|Failed).to.STONITH|result=)
start_stop shutdown.decision|Corosync.Cluster.Engine|corosync.*Initializing.transport|Executive.Service.RELEASE|crm_shutdown:.Requesting.shutdown|pcmk_shutdown:.Shutdown.complete
"
# superset of all packages of interest on all distros
# (the package manager will be used to validate the installation
# of any of these packages that are installed)
PACKAGES="pacemaker pacemaker-libs pacemaker-cluster-libs libpacemaker3
pacemaker-remote pacemaker-pygui pacemaker-pymgmt pymgmt-client
corosync corosynclib libcorosync4
resource-agents cluster-glue-libs cluster-glue libglue2 ldirectord
ocfs2-tools ocfs2-tools-o2cb ocfs2console
ocfs2-kmp-default ocfs2-kmp-pae ocfs2-kmp-xen ocfs2-kmp-debug ocfs2-kmp-trace
drbd drbd-kmp-xen drbd-kmp-pae drbd-kmp-default drbd-kmp-debug drbd-kmp-trace
drbd-pacemaker drbd-utils drbd-bash-completion drbd-xen
lvm2 lvm2-clvm cmirrord
libdlm libdlm2 libdlm3
hawk ruby lighttpd
kernel-default kernel-pae kernel-xen
glibc
"
# Potential locations of system log files
SYSLOGS="
/var/log/*
/var/logs/*
/var/syslog/*
/var/adm/*
/var/log/ha/*
/var/log/cluster/*
"
-# Whether pacemaker_remoted was found (0 = yes, 1 = no, -1 = haven't looked yet)
+# Whether pacemaker-remoted was found (0 = yes, 1 = no, -1 = haven't looked yet)
REMOTED_STATUS=-1
#
# keep the user posted
#
record() {
if [ x != x"$REPORT_HOME" -a -d "${REPORT_HOME}/$shorthost" ]; then
rec="${REPORT_HOME}/$shorthost/report.out"
elif [ x != x"${l_base}" -a -d "${l_base}" ]; then
rec="${l_base}/report.summary"
else
rec="/dev/null"
fi
printf "%-10s $*\n" "$shorthost:" 2>&1 >> "${rec}"
}
log() {
printf "%-10s $*\n" "$shorthost:" 1>&2
record "$*"
}
debug() {
if [ $verbose -gt 0 ]; then
log "Debug: $*"
else
record "Debug: $*"
fi
}
info() {
log "$*"
}
warning() {
log "WARN: $*"
}
fatal() {
log "ERROR: $*"
exit 1
}
is_running() {
ps -ef | egrep -qs $(echo "$1" | sed -e 's/^\(.\)/[\1]/')
}
has_remoted() {
if [ $REMOTED_STATUS -eq -1 ]; then
REMOTED_STATUS=1
- if which pacemaker_remoted >/dev/null 2>&1; then
+ if which pacemaker-remoted >/dev/null 2>&1; then
+ REMOTED_STATUS=0
+ # Check for pre-2.0.0 daemon name in case we have a mixed-version cluster
+ elif which pacemaker_remoted >/dev/null 2>&1; then
+ REMOTED_STATUS=0
+ elif [ -x "@sbindir@/pacemaker-remoted" ]; then
REMOTED_STATUS=0
elif [ -x "@sbindir@/pacemaker_remoted" ]; then
REMOTED_STATUS=0
else
# @TODO: the binary might be elsewhere,
# but a global search is too expensive
for d in /{usr,opt}/{local/,}{s,}bin; do
- if [ -x "${d}/pacemaker_remoted" ]; then
+ if [ -x "${d}/pacemaker-remoted" ]; then
+ REMOTED_STATUS=0
+ elif [ -x "${d}/pacemaker_remoted" ]; then
REMOTED_STATUS=0
fi
done
fi
fi
return $REMOTED_STATUS
}
# found_dir <description> <dirname>
found_dir() {
echo "$2"
info "Pacemaker $1 found in: $2"
}
detect_daemon_dir() {
info "Searching for where Pacemaker daemons live... this may take a while"
for d in \
{/usr,/usr/local,/opt/local,@exec_prefix@}/{libexec,lib64,lib}/pacemaker
do
# pacemaker and pacemaker-cts packages can install to daemon directory,
# so check for a file from each
if [ -e $d/pengine ] || [ -e $d/cts-exec-helper ]; then
found_dir "daemons" "$d"
return
fi
done
# Pacemaker Remote nodes don't need to install daemons
if has_remoted; then
info "Pacemaker daemons not found (this appears to be a Pacemaker Remote node)"
return
fi
for f in $(find / -maxdepth $maxdepth -type f -name pengine -o -name cts-exec-helper); do
d=$(dirname "$f")
found_dir "daemons" "$d"
return
done
fatal "Pacemaker daemons not found (nonstandard installation?)"
}
detect_cib_dir() {
if [ "-f ${local_state_dir}/lib/pacemaker/cib/cib.xml" ]; then
found_dir "config files" "$d"
return
fi
# Pacemaker Remote nodes don't need a CIB
if has_remoted; then
info "Pacemaker config not found (this appears to be a Pacemaker Remote node)"
return
fi
info "Searching for where Pacemaker keeps config information... this may take a while"
# TODO: What about false positives where someone copied the CIB?
for f in $(find / -maxdepth $maxdepth -type f -name cib.xml); do
d=$(dirname $f)
found_dir "config files" "$d"
return
done
warning "Pacemaker config not found (nonstandard installation?)"
}
detect_state_dir() {
if [ -n "$CRM_CONFIG_DIR" ]; then
# Assume new layout
# $local_state_dir/lib/pacemaker/(cib,pengine,blackbox,cores)
dirname "$CRM_CONFIG_DIR"
# Pacemaker Remote nodes might not have a CRM_CONFIG_DIR
elif [ -d "$local_state_dir/lib/pacemaker" ]; then
echo $local_state_dir/lib/pacemaker
fi
}
detect_pe_dir() {
config_root="$1"
d="$config_root/pengine"
if [ -d "$d" ]; then
found_dir "policy engine inputs" "$d"
return
fi
if has_remoted; then
info "Pacemaker policy engine inputs not found (this appears to be a Pacemaker Remote node)"
return
fi
info "Searching for where Pacemaker keeps Policy Engine inputs... this may take a while"
for d in $(find / -maxdepth $maxdepth -type d -name pengine); do
found_dir "policy engine inputs" "$d"
return
done
fatal "Pacemaker policy engine inputs not found (nonstandard installation?)"
}
detect_host() {
local_state_dir=@localstatedir@
if [ -d $local_state_dir/run ]; then
CRM_STATE_DIR=$local_state_dir/run/crm
else
info "Searching for where Pacemaker keeps runtime data... this may take a while"
for d in `find / -maxdepth $maxdepth -type d -name run`; do
local_state_dir=`dirname $d`
CRM_STATE_DIR=$d/crm
break
done
info "Found: $CRM_STATE_DIR"
fi
debug "Machine runtime directory: $local_state_dir"
debug "Pacemaker runtime data located in: $CRM_STATE_DIR"
CRM_DAEMON_DIR=$(detect_daemon_dir)
CRM_CONFIG_DIR=$(detect_cib_dir)
config_root=$(detect_state_dir)
# Older versions had none
BLACKBOX_DIR=$config_root/blackbox
debug "Pacemaker blackboxes (if any) located in: $BLACKBOX_DIR"
PE_STATE_DIR=$(detect_pe_dir "$config_root")
CRM_CORE_DIRS=""
for d in $config_root/cores $local_state_dir/lib/corosync; do
if [ -d $d ]; then
CRM_CORE_DIRS="$CRM_CORE_DIRS $d"
fi
done
debug "Core files located under: $CRM_CORE_DIRS"
}
time2str() {
perl -e "use POSIX; print strftime('%x %X',localtime($1));"
}
get_time() {
perl -e "\$time=\"$*\";" -e '
$unix_tm = 0;
eval "use Date::Parse";
if (index($time, ":") < 0) {
} elsif (!$@) {
$unix_tm = str2time($time);
} else {
eval "use Date::Manip";
if (!$@) {
$unix_tm = UnixDate(ParseDateString($time), "%s");
}
}
if ($unix_tm != "") {
print int($unix_tm);
} else {
print "";
}
'
}
get_time_() {
warning "Unknown time format used by: $*"
}
get_time_syslog() {
awk '{print $1,$2,$3}'
}
get_time_legacy() {
awk '{print $2}' | sed 's/_/ /'
}
get_time_iso8601() {
awk '{print $1}'
}
get_time_format_for_string() {
l="$*"
t=$(get_time `echo $l | get_time_syslog`)
if [ "x$t" != x ]; then
echo syslog
return
fi
t=$(get_time `echo $l | get_time_iso8601`)
if [ "x$t" != x ]; then
echo iso8601
return
fi
t=$(get_time `echo $l | get_time_legacy`)
if [ "x$t" != x ]; then
echo legacy
return
fi
}
get_time_format() {
t=0 l="" func=""
trycnt=10
while [ $trycnt -gt 0 ] && read l; do
func=$(get_time_format_for_string $l)
if [ "x$func" != x ]; then
break
fi
trycnt=$(($trycnt-1))
done
#debug "Logfile uses the $func time format"
echo $func
}
get_first_time() {
l=""
format=$1
while read l; do
t=$(echo $l | get_time_$format)
ts=$(get_time $t)
if [ "x$ts" != x ]; then
echo "$ts"
return
fi
done
}
get_last_time() {
l=""
best=`date +%s` # Now
format=$1
while read l; do
t=$(echo $l | get_time_$format)
ts=$(get_time $t)
if [ "x$ts" != x ]; then
best=$ts
fi
done
echo $best
}
linetime() {
l=`tail -n +$2 $1 | grep -a ":[0-5][0-9]:" | head -n 1`
format=`get_time_format_for_string $l`
t=`echo $l | get_time_$format`
get_time "$t"
}
#
# findmsg <max> <pattern>
#
# Print the names of up to <max> system logs that contain <pattern>,
# ordered by most recently modified.
#
findmsg() {
max=$1
pattern="$2"
found=0
# List all potential system logs ordered by most recently modified.
candidates=$(ls -1td $SYSLOGS 2>/dev/null)
if [ -z "$candidates" ]; then
debug "No system logs found to search for pattern \'$pattern\'"
return
fi
# Portable way to handle files with spaces in their names.
SAVE_IFS=$IFS
IFS="
"
# Check each log file for matches.
logfiles=""
for f in $candidates; do
local cat=""
# We only care about readable files with something in them.
if [ ! -f "$f" ] || [ ! -r "$f" ] || [ ! -s "$f" ] ; then
continue
fi
cat=$(find_decompressor "$f")
# We want to avoid grepping through potentially huge binary logs such
# as lastlog. However, control characters sometimes find their way into
# text logs, so we use a heuristic of more than 256 nonprintable
# characters in the file's first kilobyte.
if [ $($cat "$f" 2>/dev/null | head -c 1024 | tr -d '[:print:][:space:]' | wc -c) -gt 256 ]
then
continue
fi
# Our patterns are ASCII, so we can use LC_ALL="C" to speed up grep
$cat "$f" 2>/dev/null | LC_ALL="C" grep -q -e "$pattern"
if [ $? -eq 0 ]; then
# Add this file to the list of hits
# (using newline as separator to handle spaces in names).
if [ -z "$logfiles" ]; then
logfiles="$f"
else
logfiles="$logfiles
$f"
fi
# If we have enough hits, print them and return.
found=$(($found+1))
if [ $found -ge $max ]; then
debug "Pattern \'$pattern\' found in: [ $logfiles ]"
IFS=$SAVE_IFS
echo "$logfiles"
return
fi
fi
done 2>/dev/null
IFS=$SAVE_IFS
debug "Pattern \'$pattern\' not found in any system logs"
}
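# Illustrative: get_logfiles (in report.collector) writes a syslog marker and
# then calls "findmsg 1" with that marker string to locate the one system log
# that received it.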
node_events() {
if [ -e $1 ]; then
Epatt=`echo "$EVENT_PATTERNS" |
while read title p; do [ -n "$p" ] && echo -n "|$p"; done |
sed 's/.//'
`
grep -E "$Epatt" $1
fi
}
pickfirst() {
for x; do
which $x >/dev/null 2>&1 && {
echo $x
return 0
}
done
return 1
}
shrink() {
olddir=$PWD
dir=`dirname $1`
base=`basename $1`
target=$1.tar
tar_options="cf"
variant=`pickfirst bzip2 gzip xz false`
case $variant in
bz*)
tar_options="jcf"
target="$target.bz2"
;;
gz*)
tar_options="zcf"
target="$target.gz"
;;
xz*)
tar_options="Jcf"
target="$target.xz"
;;
*)
warning "Could not find a compression program, the resulting tarball may be huge"
;;
esac
if [ -e $target ]; then
fatal "Destination $target already exists, specify an alternate name with --dest"
fi
cd $dir >/dev/null 2>&1
tar $tar_options $target $base >/dev/null 2>&1
cd $olddir >/dev/null 2>&1
echo $target
}
findln_by_time() {
local logf=$1
local tm=$2
local first=1
# Some logs can be massive (over 1,500,000,000 lines have been seen in the wild)
# Even just 'wc -l' on these files can take 10+ minutes
local fileSize=`ls -lh "$logf" | awk '{ print $5 }' | grep -ie G`
if [ x"$fileSize" != x ]; then
warning "$logf is ${fileSize} in size and could take many hours to process. Skipping."
return
fi
local last=`wc -l < $logf`
while [ $first -le $last ]; do
mid=$((($last+$first)/2))
trycnt=10
while [ $trycnt -gt 0 ]; do
tmid=`linetime $logf $mid`
[ "$tmid" ] && break
warning "cannot extract time: $logf:$mid; will try the next one"
trycnt=$(($trycnt-1))
# shift the whole first-last segment
first=$(($first-1))
last=$(($last-1))
mid=$((($last+$first)/2))
done
if [ -z "$tmid" ]; then
warning "giving up on log..."
return
fi
if [ $tmid -gt $tm ]; then
last=$(($mid-1))
elif [ $tmid -lt $tm ]; then
first=$(($mid+1))
else
break
fi
done
echo $mid
}
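# Illustrative: the loop above is a binary search over timestamps, so even a
# 10-million-line log needs only about 24 linetime() probes rather than a
# full scan.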
dumplog() {
local logf=$1
local from_line=$2
local to_line=$3
[ "$from_line" ] ||
return
tail -n +$from_line $logf |
if [ "$to_line" ]; then
head -$(($to_line-$from_line+1))
else
cat
fi
}
#
# find log/set of logs which are interesting for us
#
#
# find log slices
#
find_decompressor() {
case $1 in
*bz2) echo "bzip2 -dc" ;;
*gz) echo "gzip -dc" ;;
*xz) echo "xz -dc" ;;
*) echo "cat" ;;
esac
}
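# e.g. find_decompressor /var/log/pacemaker.log.1.bz2 prints "bzip2 -dc";
# unrecognized suffixes fall through to plain "cat".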
#
# check if the log contains a piece of our segment
#
is_our_log() {
local logf=$1
local from_time=$2
local to_time=$3
local cat=`find_decompressor $logf`
local format=`$cat $logf | get_time_format`
local first_time=`$cat $logf | head -10 | get_first_time $format`
local last_time=`$cat $logf | tail -10 | get_last_time $format`
if [ x = "x$first_time" -o x = "x$last_time" ]; then
warning "Skipping bad logfile '$1': Could not determine log dates"
return 0 # skip (empty log?)
fi
if [ $from_time -gt $last_time ]; then
# we shouldn't get here anyway if the logs are in order
return 2 # we're past good logs; exit
fi
if [ $from_time -ge $first_time ]; then
return 3 # this is the last good log
fi
# have to go further back
if [ x = "x$to_time" -o $to_time -ge $first_time ]; then
return 1 # include this log
else
return 0 # don't include this log
fi
}
#
# go through archived logs (timewise backwards) and see if there
# are lines belonging to us
# (we rely on untouched log files, i.e. that modify time
# hasn't been changed)
#
arch_logs() {
local logf=$1
local from_time=$2
local to_time=$3
# look for files such as: ha-log-20090308 or
# ha-log-20090308.gz (.bz2) or ha-log.0, etc
ls -t $logf $logf*[0-9z] 2>/dev/null |
while read next_log; do
is_our_log $next_log $from_time $to_time
case $? in
0) ;; # noop, continue
1) echo $next_log # include log and continue
debug "Found log $next_log"
;;
2) break;; # don't go through older logs!
3) echo $next_log # include log and continue
debug "Found log $next_log"
break
;; # don't go through older logs!
esac
done
}
#
# print part of the log
#
drop_tmp_file() {
[ -z "$tmp" ] || rm -f "$tmp"
}
print_logseg() {
local logf=$1
local from_time=$2
local to_time=$3
# uncompress to a temp file (if necessary)
local cat=`find_decompressor $logf`
if [ "$cat" != "cat" ]; then
tmp=`mktemp`
$cat $logf > $tmp
trap drop_tmp_file 0
sourcef=$tmp
else
sourcef=$logf
tmp=""
fi
if [ "$from_time" = 0 ]; then
FROM_LINE=1
else
FROM_LINE=`findln_by_time $sourcef $from_time`
fi
if [ -z "$FROM_LINE" ]; then
warning "couldn't find line for time $from_time; corrupt log file?"
return
fi
TO_LINE=""
if [ "$to_time" != 0 ]; then
TO_LINE=`findln_by_time $sourcef $to_time`
if [ -z "$TO_LINE" ]; then
warning "couldn't find line for time $to_time; corrupt log file?"
return
fi
if [ $FROM_LINE -lt $TO_LINE ]; then
dumplog $sourcef $FROM_LINE $TO_LINE
log "Including segment [$FROM_LINE-$TO_LINE] from $logf"
else
debug "Empty segment [$FROM_LINE-$TO_LINE] from $logf"
fi
else
dumplog $sourcef $FROM_LINE $TO_LINE
log "Including all logs after line $FROM_LINE from $logf"
fi
drop_tmp_file
trap "" 0
}
#
# find log/set of logs which are interesting for us
#
dumplogset() {
local logf=$1
local from_time=$2
local to_time=$3
local logf_set=`arch_logs $logf $from_time $to_time`
if [ x = "x$logf_set" ]; then
return
fi
local num_logs=`echo "$logf_set" | wc -l`
local oldest=`echo $logf_set | awk '{print $NF}'`
local newest=`echo $logf_set | awk '{print $1}'`
local mid_logfiles=`echo $logf_set | awk '{for(i=NF-1; i>1; i--) print $i}'`
# the first logfile: from $from_time to $to_time (or end)
# logfiles in the middle: all
# the last logfile: from beginning to $to_time (or end)
case $num_logs in
1) print_logseg $newest $from_time $to_time;;
*)
print_logseg $oldest $from_time 0
for f in $mid_logfiles; do
`find_decompressor $f` $f
debug "including complete $f logfile"
done
print_logseg $newest 0 $to_time
;;
esac
}
# cut out a stanza
getstanza() {
awk -v name="$1" '
!in_stanza && NF==2 && /^[a-z][a-z]*[[:space:]]*{/ { # stanza start
if ($1 == name)
in_stanza = 1
}
in_stanza { print }
in_stanza && NF==1 && $1 == "}" { exit }
'
}
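# Illustrative: "getstanza logging < /etc/corosync/corosync.conf" prints the
# logging { ... } block, which getcfvar below then searches for a single
# "name: value" setting.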
# supply stanza in $1 and variable name in $2
# (stanza is optional)
getcfvar() {
cf_type=$1; shift;
cf_var=$1; shift;
cf_file=$*
[ -f "$cf_file" ] || return
case $cf_type in
corosync)
sed 's/#.*//' < $cf_file |
if [ $# -eq 2 ]; then
getstanza "$cf_var"
shift 1
else
cat
fi |
awk -v varname="$cf_var" '
NF==2 && match($1,varname":$")==1 { print $2; exit; }
'
;;
esac
}
pickfirst() {
for x; do
which $x >/dev/null 2>&1 && {
echo $x
return 0
}
done
return 1
}
#
# figure out the cluster type, depending on the process list
# and existence of configuration files
#
get_cluster_type() {
if is_running corosync; then
tool=`pickfirst corosync-objctl corosync-cmapctl`
case $tool in
*objctl) quorum=`$tool -a | grep quorum.provider | sed 's/.*=\s*//'`;;
*cmapctl) quorum=`$tool | grep quorum.provider | sed 's/.*=\s*//'`;;
esac
stack="corosync"
# Now we're guessing...
# TODO: Technically these could be anywhere :-/
elif [ -f /etc/corosync/corosync.conf ]; then
stack="corosync"
else
# We still don't know. This might be a Pacemaker Remote node,
# or the configuration might be in a nonstandard location.
stack="any"
fi
debug "Detected the '$stack' cluster stack"
echo $stack
}
find_cluster_cf() {
case $1 in
corosync)
best_size=0
best_file=""
# TODO: Technically these could be anywhere :-/
for cf in /etc/corosync/corosync.conf; do
if [ -f $cf ]; then
size=`wc -l $cf | awk '{print $1}'`
if [ $size -gt $best_size ]; then
best_size=$size
best_file=$cf
fi
fi
done
if [ -z "$best_file" ]; then
debug "Looking for corosync configuration file. This may take a while..."
for f in `find / -maxdepth $maxdepth -type f -name corosync.conf`; do
best_file=$f
break
done
fi
debug "Located corosync config file: $best_file"
echo "$best_file"
;;
any)
# Cluster type is undetermined. Don't complain, because this
# might be a Pacemaker Remote node.
;;
*)
warning "Unknown cluster type: $1"
;;
esac
}
#
# check for the major prereq for a) parameter parsing and b)
# parsing logs
#
t=`get_time "12:00"`
if [ "$t" = "" ]; then
fatal "please install the perl Date::Parse module (perl-DateTime-Format-DateParse on Fedora/Red Hat)"
fi
# vim: set expandtab tabstop=8 softtabstop=4 shiftwidth=4 textwidth=80:
