diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f3fb327e9c..f9faf4759c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,274 +1,275 @@ -# Copyright 2018-2019 the Pacemaker project contributors +# Copyright 2018-2020 the Pacemaker project contributors # # The version control history for this file may have further details. # -# SPDX-License-Identifier: FSFAP +# This source code is licensed under the GNU General Public License version 2 +# or later (GPLv2+) WITHOUT ANY WARRANTY. # Conventions: # 0/ subject to change: compatibility with GitLab (CE) as old as 10.8 # 1/ heavy reuse (compatible map merges for now, see fedora-rawhide example) # 2/ consolidated prefixes for custom variable names: GLOBAL_* and LOCAL_* variables: GIT_DEPTH: "16" # nice-to-have automatic "bisect" for even more convenience stages: - buildenv - build - test # currently, 1:1 with the eponymous stage .buildenv: &buildenv-anchor stage: buildenv # currently, subset of all jobs pertaining "build" stage .compilation: &compilation-anchor variables: GLOBAL_COMPILATION: "1" # bogus content so that this anchor is not empty # only since GitLab 11.4 # changes: # - "configure.ac" # - "**/*.[ch]" # - "**/*.xml" # - "**/*.xsl" # - "**/Makefile.am" # - ".gitlab-ci.yml" # # Fedora Rawhide # # In case of failure imposed with branching/changed GPG keys, downgrade, e.g.: # sed -i.orig 's|fedora-rawhide|fedora|g;s|rawhide|fedora|g;s|:rawhide|:29|' \ # .gitlab-ci.yml # and run it through your side branch (omit such changes for upstream request). # .fedora-rawhide: &fedora-rawhide-anchor tags: - docker image: registry.fedoraproject.org/fedora:rawhide # down? mere fedora:rawhide .fedora-rawhide-variables-rpm: &fedora-rawhide-variables-rpm-anchor # below are captured major sets of packages that need to be installed on top # of what the base image contains; some packages named individually later; # transitive dependencies preferably kept at minimum, i.e., w/o proliferation GLOBAL_RPM_BR_BASIC: autoconf automake pkgconf-pkg-config libtool libtool-ltdl-devel make # with these two, first word denotes also the derived value of CC variable GLOBAL_RPM_BR_GCC: gcc GLOBAL_RPM_BR_CLANG: clang # compiler-rt # for when with coverage GLOBAL_RPM_BR_LIBS_PLAIN: libuuid-devel libxslt-devel bzip2-devel # note a special trick using \v escaped character, since it's not in IFS # (word separators) in Bash, while fulfilling the requirements in libdnf # (cf. https://github.com/rpm-software-management/libdnf/pull/476) GLOBAL_RPM_BR_LIBS_PKGCONFIG: "pkgconfig(glib-2.0)\v>=2.16.0 pkgconfig(libxml-2.0) pkgconfig(libqb)\v>=0.13 pkgconfig(libcpg)" GLOBAL_RPM_REQ_LIBS: bzip2-libs glib2 # not dragged transitively .fedora-rawhide-gcc: &fedora-rawhide-gcc-anchor <<: *fedora-rawhide-anchor #extends: .fedora-rawhide variables: <<: *fedora-rawhide-variables-rpm-anchor LOCAL_RPM_COMPILER: $GLOBAL_RPM_BR_GCC LOCAL_WITH_COVERAGE: --with-coverage .fedora-rawhide-clang: &fedora-rawhide-clang-anchor <<: *fedora-rawhide-anchor #extends: .fedora-rawhide variables: <<: *fedora-rawhide-variables-rpm-anchor LOCAL_RPM_COMPILER: $GLOBAL_RPM_BR_CLANG LOCAL_WITH_COVERAGE: "" # override from enabling in GCC case for now # buildenv buildenv:fedora-rawhide:info: # to gather some info about the image itself/container settings <<: *buildenv-anchor #extends: .buildenv <<: *fedora-rawhide-anchor #extends: .fedora-rawhide variables: GIT_STRATEGY: "none" # invariant stage, not project dependence at all script: - cat /proc/meminfo | grep Free - df -H - lsblk - nproc - ulimit -a - uname -r - dnf --version buildenv:fedora-rawhide:rpms: # primarily to download RPMs, but alas, "download" plugin not baked into image <<: *buildenv-anchor #extends: .buildenv <<: *fedora-rawhide-anchor #extends: .fedora-rawhide variables: <<: *fedora-rawhide-variables-rpm-anchor GIT_STRATEGY: "none" # invariant stage, not project dependence at all cache: key: "fedora-rawhide-dnf" paths: - ".cache/fedora-rawhide/dnf/rawhide-*" - "xml/assets/*.rng" script: - mkdir -p .cache/fedora-rawhide/dnf - pushd .cache/fedora-rawhide - mv dnf/rawhide-* /var/cache/dnf 2>/dev/null || true - dnf-custom() { timeout 192 dnf -y --noplugins --nodocs --setopt=tsflags=test --setopt=install_weak_deps=0 --setopt=keepcache=1 --setopt=autocheck_running_kernel=0 --disablerepo='*' --enablerepo=rawhide "$@"; }; dnf-custom install $GLOBAL_RPM_BR_BASIC $GLOBAL_RPM_BR_GCC $GLOBAL_RPM_BR_CLANG $GLOBAL_RPM_BR_LIBS_PLAIN $GLOBAL_RPM_BR_LIBS_PKGCONFIG $GLOBAL_RPM_REQ_LIBS findutils libxml libxslt psmisc which gcovr - mv /var/cache/dnf/rawhide-* dnf - popd # we don't have any other warm-up-cache job around, so piggyback it here - test -s xml/assets/relaxng.rng 2>/dev/null || curl -SsLo xml/assets/relaxng.rng 'https://raw.githubusercontent.com/relaxng/relaxng.org/master/relaxng.rng' - test -s xml/assets/xslt.rng 2>/dev/null || curl -SsLo xml/assets/xslt.rng 'https://raw.githubusercontent.com/relaxng/jing-trang/master/eg/xslt.rng' # build maint:fedora-rawhide: # to run some compilation-less checks <<: *fedora-rawhide-anchor #extends: .fedora-rawhide stage: build cache: key: "fedora-rawhide-dnf" policy: pull paths: - ".cache/fedora-rawhide/dnf/rawhide-*" - "xml/assets/*.rng" before_script: - mkdir -p .cache/fedora-rawhide/dnf - pushd .cache/fedora-rawhide - mv dnf/rawhide-* /var/cache/dnf 2>/dev/null || true - dnf-custom() { timeout 96 dnf -y --cacheonly --noplugins --nodocs --setopt=install_weak_deps=0 --setopt=autocheck_running_kernel=0 --disablerepo='*' --enablerepo=rawhide "$@"; }; dnf-custom install findutils make libxml libxslt - popd - ( cd xml; { cat Makefile.am; printf 'hack_rng\x3a %s' '${RNG_generated}'; } | make -f- top_srcdir=$(pwd)/.. top_builddir=$(pwd)/.. hack_rng ) script: - echo 'looking for presence of control characters...'; { git ls-files | xargs grep -Ensv "^([^[:cntrl:]]*|$(printf '\t'))*$"||:; } 2>/dev/null | { ! grep -Ev '^Binary file' && echo 'ALL OK'; }; - cd xml; ./regression.sh && ./regression.sh -B && ./regression.sh -S && { schemas=; for schema in *.rng; do case ${schema} in *cibtr*);; *)schemas="${schemas} ${schema}";; esac; done; xmllint --noout --relaxng assets/relaxng.rng ${schemas}; } build:fedora-rawhide:gcc: &build-fedora-rawhide-gcc-anchor # to build using GCC <<: *compilation-anchor #extends: .compilation <<: *fedora-rawhide-gcc-anchor #extends: .fedora-rawhide-gcc stage: build cache: key: "fedora-rawhide-dnf" policy: pull paths: - ".cache/fedora-rawhide/dnf/rawhide-*" artifacts: name: "fedora-rawhide-gcc-$CI_COMMIT_REF_SLUG" untracked: true expire_in: "10h" before_script: - mkdir -p .cache/fedora-rawhide/dnf - pushd .cache/fedora-rawhide - mv dnf/rawhide-* /var/cache/dnf 2>/dev/null || true - dnf-custom() { timeout 96 dnf -y --cacheonly --noplugins --nodocs --setopt=install_weak_deps=0 --setopt=autocheck_running_kernel=0 --disablerepo='*' --enablerepo=rawhide "$@"; }; dnf-custom install $GLOBAL_RPM_BR_BASIC $LOCAL_RPM_COMPILER $GLOBAL_RPM_BR_LIBS_PLAIN $GLOBAL_RPM_BR_LIBS_PKGCONFIG - popd script: - ./autogen.sh - CC=$(echo $LOCAL_RPM_COMPILER | cut -d ' ' -f1) ./configure --enable-silent-rules $LOCAL_WITH_COVERAGE || { cat config.log; false; } - timeout 480 make -j$(nproc) # -j$(($(nproc) + 1)) all - find \( -name '*.l[ao]' -o -name 'config.*' -o -name configure -o -name Makefile.in -o -name Makefile \) -delete # *.o need to remain for test build:fedora-rawhide:clang: # to build using Clang <<: *build-fedora-rawhide-gcc-anchor #extends build:fedora-rawhide:gcc <<: *fedora-rawhide-clang-anchor #extends: .fedora-rawhide-clang # test test:fedora-rawhide:gcc: &test-fedora-rawhide-gcc-anchor # to test the result of GCC build + measure coverage for that <<: *compilation-anchor #extends: .compilation <<: *fedora-rawhide-gcc-anchor #extends: .fedora-rawhide-gcc stage: test cache: key: "fedora-rawhide-dnf" policy: pull paths: - ".cache/fedora-rawhide/dnf/rawhide-*" dependencies: - build:fedora-rawhide:gcc before_script: - mkdir -p .cache/fedora-rawhide/dnf - pushd .cache/fedora-rawhide - mv dnf/rawhide-* /var/cache/dnf 2>/dev/null || true - dnf-custom() { timeout 96 dnf -y --cacheonly --noplugins --nodocs --setopt=install_weak_deps=0 --setopt=autocheck_running_kernel=0 --disablerepo='*' --enablerepo=rawhide "$@"; }; dnf-custom install $LOCAL_RPM_COMPILER $GLOBAL_RPM_BR_LIBS_PLAIN $GLOBAL_RPM_BR_LIBS_PKGCONFIG psmisc which $(test -z "$LOCAL_WITH_COVERAGE" || echo gcovr) findutils - popd script: - timeout 600 ./cts/cts-regression -V cli scheduler after_script: - gcovr -r . - find -name 'core*' || true coverage: '/^TOTAL.*\s+(\d+\%)$/' test:fedora-rawhide:clang: # to test the result of Clang build (+ possibly measure coverage for that) <<: *test-fedora-rawhide-gcc-anchor #extends: test:fedora-rawhide:gcc <<: *fedora-rawhide-clang-anchor #extends: .fedora-rawhide-clang dependencies: - build:fedora-rawhide:clang diff --git a/cts/cluster_test.in b/cts/cluster_test.in index 38f941d3ec..1741b472c0 100755 --- a/cts/cluster_test.in +++ b/cts/cluster_test.in @@ -1,175 +1,175 @@ #!@BASH_PATH@ # # Copyright 2008-2020 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # if [ -e ~/.cts ]; then . ~/.cts fi anyAsked=0 [ $# -lt 1 ] || CTS_numtests=$1 die() { echo "$@"; exit 1; } if [ -z "$CTS_asked_once" ]; then anyAsked=1 echo "This script should only be executed on the test exerciser." echo "The test exerciser will remotely execute the actions required by the" echo "tests and should not be part of the cluster itself." read -p "Is this host intended to be the test exerciser? (yN) " doUnderstand [ "$doUnderstand" = "y" ] \ || die "This script must be executed on the test exerciser" fi if [ -z "$CTS_node_list" ]; then anyAsked=1 read -p "Please list your cluster nodes (eg. node1 node2 node3): " CTS_node_list else echo "Beginning test of cluster: $CTS_node_list" fi if [ -z "$CTS_stack" ]; then anyAsked=1 read -p "Which cluster stack are you using? ([corosync]): " CTS_stack [ -n "$CTS_stack" ] || CTS_stack=corosync else echo "Using the $CTS_stack cluster stack" fi [ "${CTS_node_list}" = "${CTS_node_list/$HOSTNAME/}" ] \ || die "This script must be executed on the test exerciser, and the test exerciser cannot be part of the cluster" -printf "+ Bootstraping ssh... " +printf "+ Bootstrapping ssh... " if [ -z "$SSH_AUTH_SOCK" ]; then printf "\n + Initializing SSH " eval "$(ssh-agent)" echo " + Adding identities..." ssh-add rc=$? if [ $rc -ne 0 ]; then echo " -- No identities added" printf "\nThe ability to open key-based 'ssh' connections (as the user 'root') is required to use CTS.\n" read -p " - Do you want this program to help you create one? (yN) " auto_fix if [ "$auto_fix" = "y" ]; then ssh-keygen -t dsa ssh-add else die "Please run 'ssh-keygen -t dsa' to create a new key" fi fi else echo "OK" fi test_ok=1 printf "+ Testing ssh configuration... " for n in $CTS_node_list; do ssh -l root -o PasswordAuthentication=no -o ConnectTimeout=5 "$n" /bin/true rc=$? if [ $rc -ne 0 ]; then echo " - connection to $n failed" test_ok=0 fi done if [ $test_ok -eq 0 ]; then printf "\nThe ability to open key-based 'ssh' connections (as the user 'root') is required to use CTS.\n" read -p " - Do you want this program to help you with such a setup? (yN) " auto_fix if [ "$auto_fix" = "y" ]; then # XXX are we picking the most suitable identity? privKey=$(ssh-add -L | head -n1 | cut -d" " -f3) sshCopyIdOpts="-o User=root" [ -z "$privKey" ] || sshCopyIdOpts+=" -i \"${privKey}.pub\"" for n in $CTS_node_list; do eval "ssh-copy-id $sshCopyIdOpts \"${n}\"" \ || die "Attempt to 'ssh-copy-id $sshCopyIdOpts \"$n\"' failed" done else die "Please install one of your SSH public keys to root's account on all cluster nodes" fi fi echo "OK" if [ -z "$CTS_logfile" ]; then anyAsked=1 read -p " + Where does/should syslog store logs from remote hosts? (/var/log/messages) " CTS_logfile [ -n "$CTS_logfile" ] || CTS_logfile=/var/log/messages fi [ -e "$CTS_logfile" ] || die "$CTS_logfile doesn't exist" if [ -z "$CTS_logfacility" ]; then anyAsked=1 read -p " + Which log facility does the cluster use? (daemon) " CTS_logfacility [ -n "$CTS_logfacility" ] || CTS_logfacility=daemon fi if [ -z "$CTS_boot" ]; then read -p "+ Is the cluster software started automatically when a node boots? [yN] " CTS_boot if [ -z "$CTS_boot" ]; then CTS_boot=0 else case $CTS_boot in 1|y|Y) CTS_boot=1;; *) CTS_boot=0;; esac fi fi if [ -z "$CTS_numtests" ]; then read -p "+ How many test iterations should be performed? (500) " CTS_numtests [ -n "$CTS_numtests" ] || CTS_numtests=500 fi if [ -z "$CTS_asked_once" ]; then anyAsked=1 read -p "+ What type of STONITH agent do you use? (none) " CTS_stonith [ -z "$CTS_stonith" ] \ || read -p "+ List any STONITH agent parameters (eq. device_host=switch.power.com): " CTS_stonith_args [ -n "$CTS_adv" ] \ || read -p "+ (Advanced) Any extra CTS parameters? (none) " CTS_adv fi [ $anyAsked -eq 0 ] \ || read -p "+ Save values to ~/.cts for next time? (yN) " doSave if [ "$doSave" = "y" ]; then cat > ~/.cts <<-EOF # CTS Test data CTS_stack="$CTS_stack" CTS_node_list="$CTS_node_list" CTS_logfile="$CTS_logfile" CTS_logport="$CTS_logport" CTS_logfacility="$CTS_logfacility" CTS_asked_once=1 CTS_adv="$CTS_adv" CTS_stonith="$CTS_stonith" CTS_stonith_args="$CTS_stonith_args" CTS_boot="$CTS_boot" EOF fi cts_extra="" if [ -n "$CTS_stonith" ]; then cts_extra="$cts_extra --stonith-type $CTS_stonith" [ -z "$CTS_stonith_args" ] \ || cts_extra="$cts_extra --stonith-params \"$CTS_stonith_args\"" else cts_extra="$cts_extra --stonith 0" echo " - Testing a cluster without STONITH is like a blunt pencil... pointless" fi printf "\nAll set to go for %d iterations!\n" "$CTS_numtests" [ $anyAsked -ne 0 ] \ || echo "+ To use a different configuration, remove ~/.cts and re-run cts (or edit it manually)." echo Now paste the following command into this shell: echo "@PYTHON@ `dirname "$0"`/CTSlab.py -L \"$CTS_logfile\" --syslog-facility \"$CTS_logfacility\" --no-unsafe-tests --stack \"$CTS_stack\" $CTS_adv --at-boot \"$CTS_boot\" $cts_extra \"$CTS_numtests\" --nodes \"$CTS_node_list\"" diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c index 932925b97c..96ad5a33ec 100644 --- a/daemons/fenced/pacemaker-fenced.c +++ b/daemons/fenced/pacemaker-fenced.c @@ -1,1547 +1,1549 @@ /* * Copyright 2009-2020 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include // PRIu32, PRIx32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include char *stonith_our_uname = NULL; char *stonith_our_uuid = NULL; long stonith_watchdog_timeout_ms = 0; static GMainLoop *mainloop = NULL; gboolean stand_alone = FALSE; static gboolean no_cib_connect = FALSE; static gboolean stonith_shutdown_flag = FALSE; static qb_ipcs_service_t *ipcs = NULL; static xmlNode *local_cib = NULL; static pe_working_set_t *fenced_data_set = NULL; static cib_t *cib_api = NULL; static void stonith_shutdown(int nsig); static void stonith_cleanup(void); static int32_t st_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) { if (stonith_shutdown_flag) { crm_info("Ignoring new client [%d] during shutdown", pcmk__client_pid(c)); return -EPERM; } if (pcmk__new_client(c, uid, gid) == NULL) { return -EIO; } return 0; } /* Exit code means? */ static int32_t st_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; int call_options = 0; xmlNode *request = NULL; pcmk__client_t *c = pcmk__find_client(qbc); const char *op = NULL; if (c == NULL) { crm_info("Invalid client: %p", qbc); return 0; } request = pcmk__client_data2xml(c, data, &id, &flags); if (request == NULL) { pcmk__ipc_send_ack(c, id, flags, "nack", CRM_EX_PROTOCOL); return 0; } op = crm_element_value(request, F_CRM_TASK); if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { crm_xml_add(request, F_TYPE, T_STONITH_NG); crm_xml_add(request, F_STONITH_OPERATION, op); crm_xml_add(request, F_STONITH_CLIENTID, c->id); crm_xml_add(request, F_STONITH_CLIENTNAME, pcmk__client_name(c)); crm_xml_add(request, F_STONITH_CLIENTNODE, stonith_our_uname); send_cluster_message(NULL, crm_msg_stonith_ng, request, FALSE); free_xml(request); return 0; } if (c->name == NULL) { const char *value = crm_element_value(request, F_STONITH_CLIENTNAME); if (value == NULL) { value = "unknown"; } c->name = crm_strdup_printf("%s.%u", value, c->pid); } crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); crm_trace("Flags 0x%08" PRIx32 "/0x%08x for command %" PRIu32 " from client %s", flags, call_options, id, pcmk__client_name(c)); if (pcmk_is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(flags & crm_ipc_client_response); CRM_LOG_ASSERT(c->request_id == 0); /* This means the client has two synchronous events in-flight */ c->request_id = id; /* Reply only to the last one */ } crm_xml_add(request, F_STONITH_CLIENTID, c->id); crm_xml_add(request, F_STONITH_CLIENTNAME, pcmk__client_name(c)); crm_xml_add(request, F_STONITH_CLIENTNODE, stonith_our_uname); stonith_command(c, id, flags, request, NULL); free_xml(request); return 0; } /* Error code means? */ static int32_t st_ipc_closed(qb_ipcs_connection_t * c) { pcmk__client_t *client = pcmk__find_client(c); if (client == NULL) { return 0; } crm_trace("Connection %p closed", c); pcmk__free_client(client); /* 0 means: yes, go ahead and destroy the connection */ return 0; } static void st_ipc_destroy(qb_ipcs_connection_t * c) { crm_trace("Connection %p destroyed", c); st_ipc_closed(c); } static void stonith_peer_callback(xmlNode * msg, void *private_data) { const char *remote_peer = crm_element_value(msg, F_ORIG); const char *op = crm_element_value(msg, F_STONITH_OPERATION); if (pcmk__str_eq(op, "poke", pcmk__str_none)) { return; } crm_log_xml_trace(msg, "Peer[inbound]"); stonith_command(NULL, 0, 0, msg, remote_peer); } #if SUPPORT_COROSYNC static void stonith_peer_ais_callback(cpg_handle_t handle, const struct cpg_name *groupName, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { uint32_t kind = 0; xmlNode *xml = NULL; const char *from = NULL; char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from); if(data == NULL) { return; } if (kind == crm_class_cluster) { xml = string2xml(data); if (xml == NULL) { crm_err("Invalid XML: '%.120s'", data); free(data); return; } crm_xml_add(xml, F_ORIG, from); /* crm_xml_add_int(xml, F_SEQ, wrapper->id); */ stonith_peer_callback(xml, NULL); } free_xml(xml); free(data); return; } static void stonith_peer_cs_destroy(gpointer user_data) { crm_crit("Lost connection to cluster layer, shutting down"); stonith_shutdown(0); } #endif void do_local_reply(xmlNode * notify_src, const char *client_id, gboolean sync_reply, gboolean from_peer) { /* send callback to originating child */ pcmk__client_t *client_obj = NULL; int local_rc = pcmk_rc_ok; crm_trace("Sending response"); client_obj = pcmk__find_client_by_id(client_id); crm_trace("Sending callback to request originator"); if (client_obj == NULL) { local_rc = EPROTO; crm_trace("No client to sent the response to. F_STONITH_CLIENTID not set."); } else { int rid = 0; if (sync_reply) { CRM_LOG_ASSERT(client_obj->request_id); rid = client_obj->request_id; client_obj->request_id = 0; crm_trace("Sending response %d to client %s%s", rid, pcmk__client_name(client_obj), (from_peer? " (originator of delegated request)" : "")); } else { crm_trace("Sending an event to client %s%s", pcmk__client_name(client_obj), (from_peer? " (originator of delegated request)" : "")); } local_rc = pcmk__ipc_send_xml(client_obj, rid, notify_src, (sync_reply? crm_ipc_flags_none : crm_ipc_server_event)); } if ((local_rc != pcmk_rc_ok) && (client_obj != NULL)) { crm_warn("%s reply to client %s failed: %s", (sync_reply? "Synchronous" : "Asynchronous"), pcmk__client_name(client_obj), pcmk_rc_str(local_rc)); } } uint64_t get_stonith_flag(const char *name) { if (pcmk__str_eq(name, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { return st_callback_notify_fence; } else if (pcmk__str_eq(name, STONITH_OP_DEVICE_ADD, pcmk__str_casei)) { return st_callback_device_add; } else if (pcmk__str_eq(name, STONITH_OP_DEVICE_DEL, pcmk__str_casei)) { return st_callback_device_del; } else if (pcmk__str_eq(name, T_STONITH_NOTIFY_HISTORY, pcmk__str_casei)) { return st_callback_notify_history; } else if (pcmk__str_eq(name, T_STONITH_NOTIFY_HISTORY_SYNCED, pcmk__str_casei)) { return st_callback_notify_history_synced; } return st_callback_unknown; } static void stonith_notify_client(gpointer key, gpointer value, gpointer user_data) { xmlNode *update_msg = user_data; pcmk__client_t *client = value; const char *type = NULL; CRM_CHECK(client != NULL, return); CRM_CHECK(update_msg != NULL, return); type = crm_element_value(update_msg, F_SUBTYPE); CRM_CHECK(type != NULL, crm_log_xml_err(update_msg, "notify"); return); if (client->ipcs == NULL) { crm_trace("Skipping client with NULL channel"); return; } if (pcmk_is_set(client->flags, get_stonith_flag(type))) { int rc = pcmk__ipc_send_xml(client, 0, update_msg, crm_ipc_server_event|crm_ipc_server_error); if (rc != pcmk_rc_ok) { crm_warn("%s notification of client %s failed: %s " CRM_XS " id=%.8s rc=%d", type, pcmk__client_name(client), pcmk_rc_str(rc), client->id, rc); } else { crm_trace("Sent %s notification to client %s", type, pcmk__client_name(client)); } } } void do_stonith_async_timeout_update(const char *client_id, const char *call_id, int timeout) { pcmk__client_t *client = NULL; xmlNode *notify_data = NULL; if (!timeout || !call_id || !client_id) { return; } client = pcmk__find_client_by_id(client_id); if (!client) { return; } notify_data = create_xml_node(NULL, T_STONITH_TIMEOUT_VALUE); crm_xml_add(notify_data, F_TYPE, T_STONITH_TIMEOUT_VALUE); crm_xml_add(notify_data, F_STONITH_CALLID, call_id); crm_xml_add_int(notify_data, F_STONITH_TIMEOUT, timeout); crm_trace("timeout update is %d for client %s and call id %s", timeout, client_id, call_id); if (client) { pcmk__ipc_send_xml(client, 0, notify_data, crm_ipc_server_event); } free_xml(notify_data); } void do_stonith_notify(int options, const char *type, int result, xmlNode * data) { /* TODO: Standardize the contents of data */ xmlNode *update_msg = create_xml_node(NULL, "notify"); CRM_CHECK(type != NULL,;); crm_xml_add(update_msg, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(update_msg, F_SUBTYPE, type); crm_xml_add(update_msg, F_STONITH_OPERATION, type); crm_xml_add_int(update_msg, F_STONITH_RC, result); if (data != NULL) { add_message_xml(update_msg, F_STONITH_CALLDATA, data); } crm_trace("Notifying clients"); pcmk__foreach_ipc_client(stonith_notify_client, update_msg); free_xml(update_msg); crm_trace("Notify complete"); } static void do_stonith_notify_config(int options, const char *op, int rc, const char *desc, int active) { xmlNode *notify_data = create_xml_node(NULL, op); CRM_CHECK(notify_data != NULL, return); crm_xml_add(notify_data, F_STONITH_DEVICE, desc); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); do_stonith_notify(options, op, rc, notify_data); free_xml(notify_data); } void do_stonith_notify_device(int options, const char *op, int rc, const char *desc) { do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(device_list)); } void do_stonith_notify_level(int options, const char *op, int rc, const char *desc) { do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(topology)); } static void topology_remove_helper(const char *node, int level) { int rc; char *desc = NULL; xmlNode *data = create_xml_node(NULL, XML_TAG_FENCING_LEVEL); crm_xml_add(data, F_STONITH_ORIGIN, __func__); crm_xml_add_int(data, XML_ATTR_STONITH_INDEX, level); crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); rc = stonith_level_remove(data, &desc); do_stonith_notify_level(0, STONITH_OP_LEVEL_DEL, rc, desc); free_xml(data); free(desc); } static void remove_cib_device(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { const char *rsc_id = NULL; const char *standard = NULL; xmlNode *match = getXpathResult(xpathObj, lpc); CRM_LOG_ASSERT(match != NULL); if(match != NULL) { standard = crm_element_value(match, XML_AGENT_ATTR_CLASS); } if (!pcmk__str_eq(standard, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { continue; } rsc_id = crm_element_value(match, XML_ATTR_ID); stonith_device_remove(rsc_id, TRUE); } } static void handle_topology_change(xmlNode *match, bool remove) { int rc; char *desc = NULL; CRM_CHECK(match != NULL, return); crm_trace("Updating %s", ID(match)); if(remove) { int index = 0; char *key = stonith_level_key(match, -1); crm_element_value_int(match, XML_ATTR_STONITH_INDEX, &index); topology_remove_helper(key, index); free(key); } rc = stonith_level_register(match, &desc); do_stonith_notify_level(0, STONITH_OP_LEVEL_ADD, rc, desc); free(desc); } static void remove_fencing_topology(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); CRM_LOG_ASSERT(match != NULL); if (match && crm_element_value(match, XML_DIFF_MARKER)) { /* Deletion */ int index = 0; char *target = stonith_level_key(match, -1); crm_element_value_int(match, XML_ATTR_STONITH_INDEX, &index); if (target == NULL) { crm_err("Invalid fencing target in element %s", ID(match)); } else if (index <= 0) { crm_err("Invalid level for %s in element %s", target, ID(match)); } else { topology_remove_helper(target, index); } /* } else { Deal with modifications during the 'addition' stage */ } } } static void register_fencing_topology(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); handle_topology_change(match, TRUE); } } /* Fencing */ static void fencing_topology_init(void) { xmlXPathObjectPtr xpathObj = NULL; const char *xpath = "//" XML_TAG_FENCING_LEVEL; crm_trace("Full topology refresh"); free_topology_list(); init_topology_list(); /* Grab everything */ xpathObj = xpath_search(local_cib, xpath); register_fencing_topology(xpathObj); freeXpathObject(xpathObj); } #define rsc_name(x) x->clone_name?x->clone_name:x->id /*! * \internal * \brief Check whether our uname is in a resource's allowed node list * * \param[in] rsc Resource to check * * \return Pointer to node object if found, NULL otherwise */ static pe_node_t * our_node_allowed_for(pe_resource_t *rsc) { GHashTableIter iter; pe_node_t *node = NULL; if (rsc && stonith_our_uname) { g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { if (node && strcmp(node->details->uname, stonith_our_uname) == 0) { break; } node = NULL; } } return node; } /*! * \internal * \brief If a resource or any of its children are STONITH devices, update their * definitions given a cluster working set. * * \param[in] rsc Resource to check * \param[in] data_set Cluster working set with device information */ static void cib_device_update(pe_resource_t *rsc, pe_working_set_t *data_set) { pe_node_t *node = NULL; const char *value = NULL; const char *rclass = NULL; pe_node_t *parent = NULL; gboolean remove = TRUE; /* If this is a complex resource, check children rather than this resource itself. * TODO: Mark each installed device and remove if untouched when this process finishes. */ if(rsc->children) { GListPtr gIter = NULL; for (gIter = rsc->children; gIter != NULL; gIter = gIter->next) { cib_device_update(gIter->data, data_set); if(pe_rsc_is_clone(rsc)) { crm_trace("Only processing one copy of the clone %s", rsc->id); break; } } return; } /* We only care about STONITH resources. */ rclass = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); if (!pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { return; } /* If this STONITH resource is disabled, just remove it. */ if (pe__resource_is_disabled(rsc)) { crm_info("Device %s has been disabled", rsc->id); goto update_done; } /* Check whether our node is allowed for this resource (and its parent if in a group) */ node = our_node_allowed_for(rsc); if (rsc->parent && (rsc->parent->variant == pe_group)) { parent = our_node_allowed_for(rsc->parent); } if(node == NULL) { /* Our node is disallowed, so remove the device */ GHashTableIter iter; crm_info("Device %s has been disabled on %s: unknown", rsc->id, stonith_our_uname); g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { crm_trace("Available: %s = %d", node->details->uname, node->weight); } goto update_done; } else if(node->weight < 0 || (parent && parent->weight < 0)) { /* Our node (or its group) is disallowed by score, so remove the device */ char *score = score2char((node->weight < 0) ? node->weight : parent->weight); crm_info("Device %s has been disabled on %s: score=%s", rsc->id, stonith_our_uname, score); free(score); goto update_done; } else { /* Our node is allowed, so update the device information */ int rc; xmlNode *data; GHashTableIter gIter; stonith_key_value_t *params = NULL; const char *name = NULL; const char *agent = crm_element_value(rsc->xml, XML_EXPR_ATTR_TYPE); const char *rsc_provides = NULL; crm_debug("Device %s is allowed on %s: score=%d", rsc->id, stonith_our_uname, node->weight); get_rsc_attributes(rsc->parameters, rsc, node, data_set); get_meta_attributes(rsc->meta, rsc, node, data_set); rsc_provides = g_hash_table_lookup(rsc->meta, PCMK_STONITH_PROVIDES); g_hash_table_iter_init(&gIter, rsc->parameters); while (g_hash_table_iter_next(&gIter, (gpointer *) & name, (gpointer *) & value)) { if (!name || !value) { continue; } params = stonith_key_value_add(params, name, value); crm_trace(" %s=%s", name, value); } remove = FALSE; data = create_device_registration_xml(rsc_name(rsc), st_namespace_any, agent, params, rsc_provides); stonith_key_value_freeall(params, 1, 1); rc = stonith_device_register(data, NULL, TRUE); CRM_ASSERT(rc == pcmk_ok); free_xml(data); } update_done: if(remove && g_hash_table_lookup(device_list, rsc_name(rsc))) { stonith_device_remove(rsc_name(rsc), TRUE); } } /*! * \internal * \brief Update all STONITH device definitions based on current CIB */ static void cib_devices_update(void) { GListPtr gIter = NULL; crm_info("Updating devices to version %s.%s.%s", crm_element_value(local_cib, XML_ATTR_GENERATION_ADMIN), crm_element_value(local_cib, XML_ATTR_GENERATION), crm_element_value(local_cib, XML_ATTR_NUMUPDATES)); CRM_ASSERT(fenced_data_set != NULL); fenced_data_set->input = local_cib; fenced_data_set->now = crm_time_new(NULL); fenced_data_set->localhost = stonith_our_uname; pe__set_working_set_flags(fenced_data_set, pe_flag_quick_location); cluster_status(fenced_data_set); pcmk__schedule_actions(fenced_data_set, NULL, NULL); for (gIter = fenced_data_set->resources; gIter != NULL; gIter = gIter->next) { cib_device_update(gIter->data, fenced_data_set); } fenced_data_set->input = NULL; // Wasn't a copy, so don't let API free it pe_reset_working_set(fenced_data_set); } static void update_cib_stonith_devices_v2(const char *event, xmlNode * msg) { xmlNode *change = NULL; char *reason = NULL; bool needs_update = FALSE; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); for (change = pcmk__xml_first_child(patchset); change != NULL; change = pcmk__xml_next(change)) { const char *op = crm_element_value(change, XML_DIFF_OP); const char *xpath = crm_element_value(change, XML_DIFF_PATH); const char *shortpath = NULL; if ((op == NULL) || (strcmp(op, "move") == 0) || strstr(xpath, "/"XML_CIB_TAG_STATUS)) { continue; } else if (pcmk__str_eq(op, "delete", pcmk__str_casei) && strstr(xpath, "/"XML_CIB_TAG_RESOURCE)) { const char *rsc_id = NULL; char *search = NULL; char *mutable = NULL; if (strstr(xpath, XML_TAG_ATTR_SETS) || strstr(xpath, XML_TAG_META_SETS)) { needs_update = TRUE; reason = strdup("(meta) attribute deleted from resource"); break; } mutable = strdup(xpath); rsc_id = strstr(mutable, "primitive[@id=\'"); if (rsc_id != NULL) { rsc_id += strlen("primitive[@id=\'"); search = strchr(rsc_id, '\''); } if (search != NULL) { *search = 0; stonith_device_remove(rsc_id, TRUE); } else { crm_warn("Ignoring malformed CIB update (resource deletion)"); } free(mutable); } else if (strstr(xpath, "/"XML_CIB_TAG_RESOURCES) || strstr(xpath, "/"XML_CIB_TAG_CONSTRAINTS) || strstr(xpath, "/"XML_CIB_TAG_RSCCONFIG)) { shortpath = strrchr(xpath, '/'); CRM_ASSERT(shortpath); reason = crm_strdup_printf("%s %s", op, shortpath+1); needs_update = TRUE; break; } } if(needs_update) { crm_info("Updating device list from CIB: %s", reason); cib_devices_update(); } else { crm_trace("No updates for device list found in CIB"); } free(reason); } static void update_cib_stonith_devices_v1(const char *event, xmlNode * msg) { const char *reason = "none"; gboolean needs_update = FALSE; xmlXPathObjectPtr xpath_obj = NULL; /* process new constraints */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_CONS_TAG_RSC_LOCATION); if (numXpathResults(xpath_obj) > 0) { int max = numXpathResults(xpath_obj), lpc = 0; /* Safest and simplest to always recompute */ needs_update = TRUE; reason = "new location constraint"; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpath_obj, lpc); crm_log_xml_trace(match, "new constraint"); } } freeXpathObject(xpath_obj); /* process deletions */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_CIB_TAG_RESOURCE); if (numXpathResults(xpath_obj) > 0) { remove_cib_device(xpath_obj); } freeXpathObject(xpath_obj); /* process additions */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_RESOURCE); if (numXpathResults(xpath_obj) > 0) { int max = numXpathResults(xpath_obj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { const char *rsc_id = NULL; const char *standard = NULL; xmlNode *match = getXpathResult(xpath_obj, lpc); rsc_id = crm_element_value(match, XML_ATTR_ID); standard = crm_element_value(match, XML_AGENT_ATTR_CLASS); if (!pcmk__str_eq(standard, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { continue; } crm_trace("Fencing resource %s was added or modified", rsc_id); reason = "new resource"; needs_update = TRUE; } } freeXpathObject(xpath_obj); if(needs_update) { crm_info("Updating device list from CIB: %s", reason); cib_devices_update(); } } static void update_cib_stonith_devices(const char *event, xmlNode * msg) { int format = 1; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); CRM_ASSERT(patchset); crm_element_value_int(patchset, "format", &format); switch(format) { case 1: update_cib_stonith_devices_v1(event, msg); break; case 2: update_cib_stonith_devices_v2(event, msg); break; default: crm_warn("Unknown patch format: %d", format); } } /* Needs to hold node name + attribute name + attribute value + 75 */ #define XPATH_MAX 512 /*! * \internal * \brief Check whether a node has a specific attribute name/value * * \param[in] node Name of node to check * \param[in] name Name of an attribute to look for * \param[in] value The value the named attribute needs to be set to in order to be considered a match * * \return TRUE if the locally cached CIB has the specified node attribute */ gboolean node_has_attr(const char *node, const char *name, const char *value) { char xpath[XPATH_MAX]; xmlNode *match; int n; CRM_CHECK(local_cib != NULL, return FALSE); /* Search for the node's attributes in the CIB. While the schema allows * multiple sets of instance attributes, and allows instance attributes to * use id-ref to reference values elsewhere, that is intended for resources, * so we ignore that here. */ n = snprintf(xpath, XPATH_MAX, "//" XML_CIB_TAG_NODES "/" XML_CIB_TAG_NODE "[@uname='%s']/" XML_TAG_ATTR_SETS "/" XML_CIB_TAG_NVPAIR "[@name='%s' and @value='%s']", node, name, value); match = get_xpath_object(xpath, local_cib, LOG_NEVER); CRM_CHECK(n < XPATH_MAX, return FALSE); return (match != NULL); } static void update_fencing_topology(const char *event, xmlNode * msg) { int format = 1; const char *xpath; xmlXPathObjectPtr xpathObj = NULL; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); CRM_ASSERT(patchset); crm_element_value_int(patchset, "format", &format); if(format == 1) { /* Process deletions (only) */ xpath = "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_TAG_FENCING_LEVEL; xpathObj = xpath_search(msg, xpath); remove_fencing_topology(xpathObj); freeXpathObject(xpathObj); /* Process additions and changes */ xpath = "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_TAG_FENCING_LEVEL; xpathObj = xpath_search(msg, xpath); register_fencing_topology(xpathObj); freeXpathObject(xpathObj); } else if(format == 2) { xmlNode *change = NULL; int add[] = { 0, 0, 0 }; int del[] = { 0, 0, 0 }; xml_patch_versions(patchset, add, del); for (change = pcmk__xml_first_child(patchset); change != NULL; change = pcmk__xml_next(change)) { const char *op = crm_element_value(change, XML_DIFF_OP); const char *xpath = crm_element_value(change, XML_DIFF_PATH); if(op == NULL) { continue; } else if(strstr(xpath, "/" XML_TAG_FENCING_LEVEL) != NULL) { /* Change to a specific entry */ crm_trace("Handling %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); if(strcmp(op, "move") == 0) { continue; } else if(strcmp(op, "create") == 0) { handle_topology_change(change->children, FALSE); } else if(strcmp(op, "modify") == 0) { xmlNode *match = first_named_child(change, XML_DIFF_RESULT); if(match) { handle_topology_change(match->children, TRUE); } } else if(strcmp(op, "delete") == 0) { /* Nuclear option, all we have is the path and an id... not enough to remove a specific entry */ crm_info("Re-initializing fencing topology after %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); fencing_topology_init(); return; } } else if (strstr(xpath, "/" XML_TAG_FENCING_TOPOLOGY) != NULL) { /* Change to the topology in general */ crm_info("Re-initializing fencing topology after top-level %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); fencing_topology_init(); return; } else if (strstr(xpath, "/" XML_CIB_TAG_CONFIGURATION)) { /* Changes to the whole config section, possibly including the topology as a whild */ if(first_named_child(change, XML_TAG_FENCING_TOPOLOGY) == NULL) { crm_trace("Nothing for us in %s operation %d.%d.%d for %s.", op, add[0], add[1], add[2], xpath); } else if(strcmp(op, "delete") == 0 || strcmp(op, "create") == 0) { crm_info("Re-initializing fencing topology after top-level %s operation %d.%d.%d for %s.", op, add[0], add[1], add[2], xpath); fencing_topology_init(); return; } } else { crm_trace("Nothing for us in %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); } } } else { crm_warn("Unknown patch format: %d", format); } } static bool have_cib_devices = FALSE; static void update_cib_cache_cb(const char *event, xmlNode * msg) { int rc = pcmk_ok; xmlNode *stonith_enabled_xml = NULL; xmlNode *stonith_watchdog_xml = NULL; const char *stonith_enabled_s = NULL; static gboolean stonith_enabled_saved = TRUE; if(!have_cib_devices) { crm_trace("Skipping updates until we get a full dump"); return; } else if(msg == NULL) { crm_trace("Missing %s update", event); return; } /* Maintain a local copy of the CIB so that we have full access * to device definitions, location constraints, and node attributes */ if (local_cib != NULL) { int rc = pcmk_ok; xmlNode *patchset = NULL; crm_element_value_int(msg, F_CIB_RC, &rc); if (rc != pcmk_ok) { return; } patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); xml_log_patchset(LOG_TRACE, "Config update", patchset); rc = xml_apply_patchset(local_cib, patchset, TRUE); switch (rc) { case pcmk_ok: case -pcmk_err_old_data: break; case -pcmk_err_diff_resync: case -pcmk_err_diff_failed: crm_notice("[%s] Patch aborted: %s (%d)", event, pcmk_strerror(rc), rc); free_xml(local_cib); local_cib = NULL; break; default: crm_warn("[%s] ABORTED: %s (%d)", event, pcmk_strerror(rc), rc); free_xml(local_cib); local_cib = NULL; } } if (local_cib == NULL) { crm_trace("Re-requesting full CIB"); rc = cib_api->cmds->query(cib_api, NULL, &local_cib, cib_scope_local | cib_sync_call); if(rc != pcmk_ok) { crm_err("Couldn't retrieve the CIB: %s (%d)", pcmk_strerror(rc), rc); return; } CRM_ASSERT(local_cib != NULL); stonith_enabled_saved = FALSE; /* Trigger a full refresh below */ } crm_peer_caches_refresh(local_cib); stonith_enabled_xml = get_xpath_object("//nvpair[@name='stonith-enabled']", local_cib, LOG_NEVER); if (stonith_enabled_xml) { stonith_enabled_s = crm_element_value(stonith_enabled_xml, XML_NVPAIR_ATTR_VALUE); } if (stonith_enabled_s == NULL || crm_is_true(stonith_enabled_s)) { long timeout_ms = 0; const char *value = NULL; stonith_watchdog_xml = get_xpath_object("//nvpair[@name='stonith-watchdog-timeout']", local_cib, LOG_NEVER); if (stonith_watchdog_xml) { value = crm_element_value(stonith_watchdog_xml, XML_NVPAIR_ATTR_VALUE); } if(value) { timeout_ms = crm_get_msec(value); } if (timeout_ms < 0) { timeout_ms = pcmk__auto_watchdog_timeout(); } if(timeout_ms != stonith_watchdog_timeout_ms) { crm_notice("New watchdog timeout %lds (was %lds)", timeout_ms/1000, stonith_watchdog_timeout_ms/1000); stonith_watchdog_timeout_ms = timeout_ms; } } else { stonith_watchdog_timeout_ms = 0; } if (stonith_enabled_s && crm_is_true(stonith_enabled_s) == FALSE) { crm_trace("Ignoring CIB updates while fencing is disabled"); stonith_enabled_saved = FALSE; return; } else if (stonith_enabled_saved == FALSE) { crm_info("Updating fencing device and topology lists " "now that fencing is enabled"); stonith_enabled_saved = TRUE; fencing_topology_init(); cib_devices_update(); } else { update_fencing_topology(event, msg); update_cib_stonith_devices(event, msg); } } static void init_cib_cache_cb(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { crm_info("Updating device list from CIB"); have_cib_devices = TRUE; local_cib = copy_xml(output); crm_peer_caches_refresh(local_cib); fencing_topology_init(); cib_devices_update(); } static void stonith_shutdown(int nsig) { crm_info("Terminating with %d clients", pcmk__ipc_client_count()); stonith_shutdown_flag = TRUE; if (mainloop != NULL && g_main_loop_is_running(mainloop)) { g_main_loop_quit(mainloop); } else { stonith_cleanup(); crm_exit(CRM_EX_OK); } } static void cib_connection_destroy(gpointer user_data) { if (stonith_shutdown_flag) { crm_info("Connection to the CIB manager closed"); return; } else { crm_crit("Lost connection to the CIB manager, shutting down"); } if (cib_api) { cib_api->cmds->signoff(cib_api); } stonith_shutdown(0); } static void stonith_cleanup(void) { if (cib_api) { cib_api->cmds->del_notify_callback(cib_api, T_CIB_DIFF_NOTIFY, update_cib_cache_cb); cib_api->cmds->signoff(cib_api); } if (ipcs) { qb_ipcs_destroy(ipcs); } crm_peer_destroy(); pcmk__client_cleanup(); free_stonith_remote_op_list(); free_topology_list(); free_device_list(); free_metadata_cache(); free(stonith_our_uname); stonith_our_uname = NULL; free_xml(local_cib); local_cib = NULL; } static pcmk__cli_option_t long_options[] = { // long option, argument type, storage, short option, description, flags { "stand-alone", no_argument, 0, 's', NULL, pcmk__option_default }, { "stand-alone-w-cpg", no_argument, 0, 'c', NULL, pcmk__option_default }, { "logfile", required_argument, 0, 'l', NULL, pcmk__option_default }, { "verbose", no_argument, 0, 'V', NULL, pcmk__option_default }, { "version", no_argument, 0, '$', NULL, pcmk__option_default }, { "help", no_argument, 0, '?', NULL, pcmk__option_default }, { 0, 0, 0, 0 } }; static void setup_cib(void) { int rc, retries = 0; cib_api = cib_new(); if (cib_api == NULL) { crm_err("No connection to the CIB manager"); return; } do { sleep(retries); rc = cib_api->cmds->signon(cib_api, CRM_SYSTEM_STONITHD, cib_command); } while (rc == -ENOTCONN && ++retries < 5); if (rc != pcmk_ok) { crm_err("Could not connect to the CIB manager: %s (%d)", pcmk_strerror(rc), rc); } else if (pcmk_ok != cib_api->cmds->add_notify_callback(cib_api, T_CIB_DIFF_NOTIFY, update_cib_cache_cb)) { crm_err("Could not set CIB notification callback"); } else { rc = cib_api->cmds->query(cib_api, NULL, NULL, cib_scope_local); cib_api->cmds->register_callback(cib_api, rc, 120, FALSE, NULL, "init_cib_cache_cb", init_cib_cache_cb); cib_api->cmds->set_connection_dnotify(cib_api, cib_connection_destroy); crm_info("Watching for fencing topology changes"); } } struct qb_ipcs_service_handlers ipc_callbacks = { .connection_accept = st_ipc_accept, .connection_created = NULL, .msg_process = st_ipc_dispatch, .connection_closed = st_ipc_closed, .connection_destroyed = st_ipc_destroy }; /*! * \internal * \brief Callback for peer status changes * * \param[in] type What changed * \param[in] node What peer had the change * \param[in] data Previous value of what changed */ static void st_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { if ((type != crm_status_processes) && !pcmk_is_set(node->flags, crm_remote_node)) { /* * This is a hack until we can send to a nodeid and/or we fix node name lookups * These messages are ignored in stonith_peer_callback() */ xmlNode *query = create_xml_node(NULL, "stonith_command"); crm_xml_add(query, F_XML_TAGNAME, "stonith_command"); crm_xml_add(query, F_TYPE, T_STONITH_NG); crm_xml_add(query, F_STONITH_OPERATION, "poke"); crm_debug("Broadcasting our uname because of node %u", node->id); send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE); free_xml(query); } } int main(int argc, char **argv) { int flag; int lpc = 0; int argerr = 0; int option_index = 0; crm_cluster_t cluster; const char *actions[] = { "reboot", "off", "on", "list", "monitor", "status" }; crm_ipc_t *old_instance = NULL; crm_log_preinit(NULL, argc, argv); pcmk__set_cli_options(NULL, "[options]", long_options, "daemon for executing fencing devices in a " "Pacemaker cluster"); while (1) { flag = pcmk__next_cli_option(argc, argv, &option_index, NULL); if (flag == -1) { break; } switch (flag) { case 'V': crm_bump_log_level(argc, argv); break; case 'l': crm_add_logfile(optarg); break; case 's': stand_alone = TRUE; break; case 'c': stand_alone = FALSE; no_cib_connect = TRUE; break; case '$': case '?': pcmk__cli_help(flag, CRM_EX_OK); break; default: ++argerr; break; } } if (argc - optind == 1 && pcmk__str_eq("metadata", argv[optind], pcmk__str_casei)) { printf("\n"); printf("\n"); printf(" 1.0\n"); printf(" Instance attributes available for all \"stonith\"-class resources" " and used by Pacemaker's fence daemon, formerly known as stonithd\n"); printf(" Instance attributes available for all \"stonith\"-class resources\n"); printf(" \n"); #if 0 // priority is not implemented yet printf(" \n"); printf(" Devices that are not in a topology " "are tried in order of highest to lowest integer priority\n"); printf(" \n"); printf(" \n"); #endif printf(" \n", PCMK_STONITH_HOST_ARGUMENT); printf (" Advanced use only: An alternate parameter to supply instead of 'port'\n"); printf (" Some devices do not support the standard 'port' parameter or may provide additional ones.\n" "Use this to specify an alternate, device-specific, parameter that should indicate the machine to be fenced.\n" "A value of 'none' can be used to tell the cluster not to supply any additional parameters.\n" " \n"); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_HOST_MAP); printf (" A mapping of host names to ports numbers for devices that do not support host names.\n"); printf (" Eg. node1:1;node2:2,3 would tell the cluster to use port 1 for node1 and ports 2 and 3 for node2\n"); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_HOST_LIST); printf(" A list of machines controlled by " "this device (Optional unless %s=static-list).\n", PCMK_STONITH_HOST_CHECK); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_HOST_CHECK); printf (" How to determine which machines are controlled by the device.\n"); printf(" Allowed values: dynamic-list " "(query the device via the 'list' command), static-list " "(check the " PCMK_STONITH_HOST_LIST " attribute), status " "(query the device via the 'status' command), none (assume " "every device can fence every machine)\n"); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_DELAY_MAX); - printf(" Enable a random delay for " - "fencing actions and specify the maximum of random " - "delay.\n"); + printf(" Enable a delay of no more than the " + "time specified before executing fencing actions. Pacemaker " + "derives the overall delay by taking the value of " + PCMK_STONITH_DELAY_BASE " and adding a random delay value such " + "that the sum is kept below this maximum.\n"); printf(" This prevents double fencing when " "using slow devices such as sbd.\nUse this to enable a random " "delay for fencing actions.\nThe overall delay is derived from " "this random delay value adding a static delay so that the sum " "is kept below the maximum delay.\n"); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_DELAY_BASE); printf(" Enable a base delay for " "fencing actions and specify base delay value.\n"); printf(" This prevents double fencing when " "different delays are configured on the nodes.\nUse this to " "enable a static delay for fencing actions.\nThe overall delay " "is derived from a random delay value adding this static delay " "so that the sum is kept below the maximum delay.\n"); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_ACTION_LIMIT); printf (" The maximum number of actions can be performed in parallel on this device\n"); printf (" Cluster property concurrent-fencing=true needs to be configured first.\n" "Then use this to specify the maximum number of actions can be performed in parallel on this device. -1 is unlimited.\n"); printf(" \n"); printf(" \n"); for (lpc = 0; lpc < DIMOF(actions); lpc++) { printf(" \n", actions[lpc]); printf (" Advanced use only: An alternate command to run instead of '%s'\n", actions[lpc]); printf (" Some devices do not support the standard commands or may provide additional ones.\n" "Use this to specify an alternate, device-specific, command that implements the '%s' action.\n", actions[lpc]); printf(" \n", actions[lpc]); printf(" \n"); printf(" \n", actions[lpc]); printf (" Advanced use only: Specify an alternate timeout to use for %s actions instead of stonith-timeout\n", actions[lpc]); printf (" Some devices need much more/less time to complete than normal.\n" "Use this to specify an alternate, device-specific, timeout for '%s' actions.\n", actions[lpc]); printf(" \n"); printf(" \n"); printf(" \n", actions[lpc]); printf (" Advanced use only: The maximum number of times to retry the '%s' command within the timeout period\n", actions[lpc]); printf(" Some devices do not support multiple connections." " Operations may 'fail' if the device is busy with another task so Pacemaker will automatically retry the operation, if there is time remaining." " Use this option to alter the number of times Pacemaker retries '%s' actions before giving up." "\n", actions[lpc]); printf(" \n"); printf(" \n"); } printf(" \n"); printf("\n"); return CRM_EX_OK; } if (optind != argc) { ++argerr; } if (argerr) { pcmk__cli_help('?', CRM_EX_USAGE); } crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); crm_notice("Starting Pacemaker fencer"); old_instance = crm_ipc_new("stonith-ng", 0); if (crm_ipc_connect(old_instance)) { /* IPC end-point already up */ crm_ipc_close(old_instance); crm_ipc_destroy(old_instance); crm_err("pacemaker-fenced is already active, aborting startup"); crm_exit(CRM_EX_OK); } else { /* not up or not authentic, we'll proceed either way */ crm_ipc_destroy(old_instance); old_instance = NULL; } mainloop_add_signal(SIGTERM, stonith_shutdown); crm_peer_init(); fenced_data_set = pe_new_working_set(); CRM_ASSERT(fenced_data_set != NULL); pe__set_working_set_flags(fenced_data_set, pe_flag_no_counts|pe_flag_no_compat); if (stand_alone == FALSE) { if (is_corosync_cluster()) { #if SUPPORT_COROSYNC cluster.destroy = stonith_peer_cs_destroy; cluster.cpg.cpg_deliver_fn = stonith_peer_ais_callback; cluster.cpg.cpg_confchg_fn = pcmk_cpg_membership; #endif } crm_set_status_callback(&st_peer_update_callback); if (crm_cluster_connect(&cluster) == FALSE) { crm_crit("Cannot sign in to the cluster... terminating"); crm_exit(CRM_EX_FATAL); } stonith_our_uname = cluster.uname; stonith_our_uuid = cluster.uuid; if (no_cib_connect == FALSE) { setup_cib(); } } else { stonith_our_uname = strdup("localhost"); } init_device_list(); init_topology_list(); if(stonith_watchdog_timeout_ms > 0) { int rc; xmlNode *xml; stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, PCMK_STONITH_HOST_LIST, stonith_our_uname); xml = create_device_registration_xml("watchdog", st_namespace_internal, STONITH_WATCHDOG_AGENT, params, NULL); stonith_key_value_freeall(params, 1, 1); rc = stonith_device_register(xml, NULL, FALSE); free_xml(xml); if (rc != pcmk_ok) { crm_crit("Cannot register watchdog pseudo fence agent"); crm_exit(CRM_EX_FATAL); } } pcmk__serve_fenced_ipc(&ipcs, &ipc_callbacks); /* Create the mainloop and run it... */ mainloop = g_main_loop_new(NULL, FALSE); crm_notice("Pacemaker fencer successfully started and accepting connections"); g_main_loop_run(mainloop); stonith_cleanup(); pe_free_working_set(fenced_data_set); crm_exit(CRM_EX_OK); } diff --git a/devel/Makefile.am b/devel/Makefile.am index 29232923cd..edb113cd7a 100644 --- a/devel/Makefile.am +++ b/devel/Makefile.am @@ -1,67 +1,67 @@ # # Copyright 2020 the Pacemaker project contributors # # The version control history for this file may have further details. # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # # Coccinelle is a tool that takes special patch-like files (called semantic patches) and # applies them throughout a source tree. This is useful when refactoring, changing APIs, # catching dangerous or incorrect code, and other similar tasks. It's not especially # easy to write a semantic patch but most users should only be concerned about running # the target and inspecting the results. # # Documentation (including examples, which are the most useful): # https://coccinelle.gitlabpages.inria.fr/website/docs/ # # Run the "make cocci" target to just output what would be done, or "make cocci-inplace" # to apply the changes to the source tree. # # COCCI_FILES may be set on the command line, if you want to test just a single file # while it's under development. Otherwise, it is a list of all the files that are ready # to be run. # # ref-passed-variables-inited.cocci seems to be returning some false positives around # GHashTableIters, so it is disabled for the moment. COCCI_FILES ?= coccinelle/string-any-of.cocci \ coccinelle/string-empty.cocci \ coccinelle/string-null-matches.cocci \ coccinelle/use-func.cocci -EXTRA_SCRIPTS = coccinelle/test/testrunner.sh -EXTRA_DIST = $(EXTRA_SCRIPTS) $(COCCI_FILES) \ +dist_noinst_SCRIPTS = coccinelle/test/testrunner.sh +EXTRA_DIST = README gdbhelpers $(COCCI_FILES) \ coccinelle/ref-passed-variables-inited.cocci \ coccinelle/string-replacements.cocci \ coccinelle/test/ref-passed-variables-inited.input.c \ coccinelle/test/ref-passed-variables-inited.output # Any file in this list is allowed to use any of the pcmk__ internal functions. # Coccinelle can use any transformation that depends on "internal" to rewrite # code to use the internal functions. MAY_USE_INTERNAL_FILES = $(shell find .. -path "../lib/*.c" -o -path "../tools/*.c" -o -path "../daemons/*.c" -o -path '../include/pcmki/*h' -o -name '*internal.h') # And then any file in this list is public API, which may not use internal # functions. Thus, only those transformations that do not depend on "internal" # may be applied. OTHER_FILES = $(shell find ../include -name '*h' -a \! -name '*internal.h' -a \! -path '../include/pcmki/*') cocci: -for cf in $(COCCI_FILES); do \ for f in $(MAY_USE_INTERNAL_FILES); do \ spatch $(_SPATCH_FLAGS) -D internal --very-quiet --local-includes --preprocess --sp-file $$cf $$f; \ done ; \ for f in $(OTHER_FILES); do \ spatch $(_SPATCH_FLAGS) --very-quiet --local-includes --preprocess --sp-file $$cf $$f; \ done ; \ done cocci-inplace: $(MAKE) $(AM_MAKEFLAGS) _SPATCH_FLAGS=--in-place cocci cocci-test: for f in coccinelle/test/*.c; do \ coccinelle/test/testrunner.sh $$f; \ done diff --git a/devel/README b/devel/README new file mode 100644 index 0000000000..526df19253 --- /dev/null +++ b/devel/README @@ -0,0 +1,2 @@ +This directory contains helpers for the project developers. +Everyone else can safely ignore it. diff --git a/devel/gdbhelpers b/devel/gdbhelpers new file mode 100644 index 0000000000..319a5f2544 --- /dev/null +++ b/devel/gdbhelpers @@ -0,0 +1,109 @@ +# Copyright 2018-2020 the Pacemaker project contributors +# +# The version control history for this file may have further details. +# +# This source code is licensed under the GNU General Public License version 2 +# or later (GPLv2+) WITHOUT ANY WARRANTY. + +# notes: +# "print"/"set" as "compile code" requires re-including headers/multi-line + +define pcmk + # shelling avoids necessity to have an inferior around + shell printf '%s %s\n\n' 'Available commands' \ + '(all require an inferior, just break on main or so):' + shell printf '%s\t%s\n' pcmk-follow-daemon \ + 'Convenient pacemakerd to particular daemon descent' + shell printf '%s\t%s\n' pcmk-xmlnode2file \ + 'Convenient XML node to (specified/temporary) file dump' + shell printf '%s\t%s\n' pcmk-terminate \ + 'Convenient way to shut pacemaker down as a whole' +end +document pcmk +Show possible pcmk namespace rooted user-defined convenience commands. +end + + +# XXX one would expect that order of pcmk_children naturally matches +# the actual sequential ordering, but that's governed with start_seq +define pcmk-follow-daemon + dont-repeat + set $d = $arg0 + eval "set $d_alt = \"pacemaker-%s\"", $d + set $cont = 1 + break start_child + cont + while ($cont) + if (!(int)strcmp(child->name, $d) || !(int)strcmp(child->name, $d_alt)) + set $cont = 0 + set follow-fork-mode child + break getrlimit + continue + set follow-fork-mode parent # restore + else + if (child->start_seq == sizeof(pcmk_children)/sizeof(*pcmk_children) - 1) + set $cont = 0 + printf "no such daemon: %s (%s)\n", $d, $d_alt + set follow-fork-mode parent # restore + else + continue + end + end + end +end +document pcmk-follow-daemon +Convenient way to follow into particular daemon when starting debugging +from the master control process of pacemaker (pacemakerd). +For "pacemaker-" prefixed daemons, this very prefix is not needed. + +Synopsis: pcmk-follow-daemon CHILD-DAEMON +Examples: pcmk-follow-daemon "controld" +end + + +define pcmk-xmlnode2file + dont-repeat + set $n = $arg0 + if ($argc > 1) + set $fn = $arg1 + else + set $fn = tmpnam((void *) 0) + printf "refer to this file: %s\n", $fn + end + set $f = (FILE *) fopen($fn, "w") + eval "print (xmlElemDump((FILE *) %p, ((xmlNodePtr) %p)->doc, (xmlNodePtr) %p), \"%s\")", \ + $f, $n, $n, "dump done" + eval "print (fprintf((FILE *) %p, \"\\n\"), fclose((FILE *) %p), \"%s\")", \ + $f, $f, "dump finished" +end +document pcmk-xmlnode2file +Convenient way to dump an XML element to specified file, or a temporary file +when it's not. + +Synopsis: pcmk-xmlnode2file XMLNODEPTR-EXPR [FILE-NAME] +Examples: pcmk-xmlnode2file node "debug-dump.xml" + pcmk-xmlnode2file elem +end + + +define pcmk-terminate + dont-repeat + eval "set $msg = \"\"", \ + "t='crmd' subt='request' crm_task='quit' crm_sys_to='pacemakerd'" + set $con = crm_ipc_new("pacemakerd", 0) + eval "set $con_ok = crm_ipc_connect((crm_ipc_t *) %p)", $con + if ($con_ok) + eval "print (crm_ipc_send((crm_ipc_t *) %p, (xmlReadMemory(\"%s\", %u, %s))->children, %s), \"%s\")", \ + $con, $msg, sizeof($msg), "\"gdb.xml\", (void *) 0, 0", "0, 0, (void *) 0", "terminating..." + else + print "cannot terminate, not running/listening?" + end + eval "print (crm_ipc_close((crm_ipc_t *) %p), \"connection closed\")", $con + eval "print (crm_ipc_destroy((crm_ipc_t *) %p), \"connection destroyed\")", $con +end +document pcmk-terminate +Convenient way to shut pacemaker down as a whole, no matter from +which process, parent or child, and safely(!). + +Synopsis: pcmk-terminate +end diff --git a/doc/sphinx/Pacemaker_Development/hacking.rst b/doc/sphinx/Pacemaker_Development/hacking.rst index 5324fb407c..4a0e0870fd 100644 --- a/doc/sphinx/Pacemaker_Development/hacking.rst +++ b/doc/sphinx/Pacemaker_Development/hacking.rst @@ -1,69 +1,69 @@ Advanced Hacking on the Project ------------------------------- Foreword ######## This chapter aims to be a gentle introduction (or perhaps, rather a summarization of advanced techniques we developed for backreferences) to how deal with the Pacemaker internals effectively. For instance, how to: * debug with an ease * verify various interesting interaction-based properties or simply put, all that is in the interest of the core contributors on the project to know, master, and (preferably) also evolve -- way beyond what is in the presumed repertoire of a generic contributor role, which is detailed in other chapters of this guide. Therefore, if you think you will not benefit from any such details in the scope of this chapter, feel free to skip it. Debugging ######### In the GNU userland tradition, preferred way of debugging is based on ``gdb`` (directly or via specific frontends atop) that is widely available on platforms (semi-)supported with Pacemaker itself. To make some advanced debugging easier, we maintain a script defining some -useful helpers in ``extra/gdb/gdbhelpers`` file, which you can make available +useful helpers in ``devel/gdbhelpers`` file, which you can make available in the debugging session easily when invoking it as ``gdb -x ...``. From within the debugger, you can then invoke the new ``pcmk`` command that will guide you regarding other helper functions available, so we won't replicate that here. Working with mocked daemons ########################### Since the Pacemaker run-time consists of multiple co-operating daemons as detailed elsewhere, tracking down the interaction details amongst them can be rather cumbersome. Since rebuilding existing daemons in a more modular way as opposed to clusters of mutually dependent functions, we elected to grow separate bare-bones counterparts built evolutionary as skeletons just to get the basic (long-term stabilized) communication with typical daemon clients going, and to add new modules in their outer circles (plus minimalistic hook support at those cores) on a demand-driven basis. The code for these is located at ``maint/mocked``; for instance, ``based-notifyfenced.c`` module of ``based.c`` skeleton mocking ``pacemaker-based`` daemon was exactly to fulfill investigation helper role (the case at hand was also an impulse to kick off this very sort of maintenance support material, to begin with). Non-trivial knowledge of Pacemaker internals and other skills are needed to use such devised helpers, but given the other way around, some sorts of investigation may be even heftier, it may be the least effort choice. And when that's the case, advanced contributors are expected to contribute their own extensions they used to validate the reproducibility/actual correctness of the fix along the actual code modifications. This way, the rest of the development teams is not required to deal with elaborate preconditions, be at guess, or even forced to use a blind faith regarding the causes, consequences and validity regarding the raised issues/fixes, for the greater benefit of all. diff --git a/doc/sphinx/Pacemaker_Explained/fencing.rst b/doc/sphinx/Pacemaker_Explained/fencing.rst index df928b5dbc..9ed12b39a4 100644 --- a/doc/sphinx/Pacemaker_Explained/fencing.rst +++ b/doc/sphinx/Pacemaker_Explained/fencing.rst @@ -1,1170 +1,1170 @@ .. index:: single: fencing single: STONITH .. _fencing: Fencing ------- What Is Fencing? ################ *Fencing* is the ability to make a node unable to run resources, even when that node is unresponsive to cluster commands. Fencing is also known as *STONITH*, an acronym for "Shoot The Other Node In The Head", since the most common fencing method is cutting power to the node. Another method is "fabric fencing", cutting the node's access to some capability required to run resources (such as network access or a shared disk). .. index:: single: fencing; why necessary Why Is Fencing Necessary? ######################### Fencing protects your data from being corrupted by malfunctioning nodes or unintentional concurrent access to shared resources. Fencing protects against the "split brain" failure scenario, where cluster nodes have lost the ability to reliably communicate with each other but are still able to run resources. If the cluster just assumed that uncommunicative nodes were down, then multiple instances of a resource could be started on different nodes. The effect of split brain depends on the resource type. For example, an IP address brought up on two hosts on a network will cause packets to randomly be sent to one or the other host, rendering the IP useless. For a database or clustered file system, the effect could be much more severe, causing data corruption or divergence. Fencing is also used when a resource cannot otherwise be stopped. If a resource fails to stop on a node, it cannot be started on a different node without risking the same type of conflict as split-brain. Fencing the original node ensures the resource can be safely started elsewhere. Users may also configure the ``on-fail`` property of :ref:`operation` or the ``loss-policy`` property of :ref:`ticket constraints ` to ``fence``, in which case the cluster will fence the resource's node if the operation fails or the ticket is lost. .. index:: single: fencing; device Fence Devices ############# A *fence device* or *fencing device* is a special type of resource that provides the means to fence a node. Examples of fencing devices include intelligent power switches and IPMI devices that accept SNMP commands to cut power to a node, and iSCSI controllers that allow SCSI reservations to be used to cut a node's access to a shared disk. Since fencing devices will be used to recover from loss of networking connectivity to other nodes, it is essential that they do not rely on the same network as the cluster itself, otherwise that network becomes a single point of failure. Since loss of a node due to power outage is indistinguishable from loss of network connectivity to that node, it is also essential that at least one fence device for a node does not share power with that node. For example, an on-board IPMI controller that shares power with its host should not be used as the sole fencing device for that host. Since fencing is used to isolate malfunctioning nodes, no fence device should rely on its target functioning properly. This includes, for example, devices that ssh into a node and issue a shutdown command (such devices might be suitable for testing, but never for production). .. index:: single: fencing; agent Fence Agents ############ A *fence agent* or *fencing agent* is a ``stonith``-class resource agent. The fence agent standard provides commands (such as ``off`` and ``reboot``) that the cluster can use to fence nodes. As with other resource agent classes, this allows a layer of abstraction so that Pacemaker doesn't need any knowledge about specific fencing technologies -- that knowledge is isolated in the agent. When a Fence Device Can Be Used ############################### Fencing devices do not actually "run" like most services. Typically, they just provide an interface for sending commands to an external device. Additionally, fencing may be initiated by Pacemaker, by other cluster-aware software such as DRBD or DLM, or manually by an administrator, at any point in the cluster life cycle, including before any resources have been started. To accommodate this, Pacemaker does not require the fence device resource to be "started" in order to be used. Whether a fence device is started or not determines whether a node runs any recurring monitor for the device, and gives the node a slight preference for being chosen to execute fencing using that device. By default, any node can execute any fencing device. If a fence device is disabled by setting its ``target-role`` to ``Stopped``, then no node can use that device. If a location constraint with a negative score prevents a specific node from "running" a fence device, then that node will never be chosen to execute fencing using the device. A node may fence itself, but the cluster will choose that only if no other nodes can do the fencing. A common configuration scenario is to have one fence device per target node. In such a case, users often configure anti-location constraints so that the target node does not monitor its own device. Limitations of Fencing Resources ################################ Fencing resources have certain limitations that other resource classes don't: * They may have only one set of meta-attributes and one set of instance attributes. * If :ref:`rules` are used to determine fencing resource options, these might be evaluated only when first read, meaning that later changes to the rules will have no effect. Therefore, it is better to avoid confusion and not use rules at all with fencing resources. These limitations could be revisited if there is sufficient user demand. .. index:: single: fencing; special instance attributes .. _fencing-attributes: Special Options for Fencing Resources ##################################### The table below lists special instance attributes that may be set for any fencing resource (*not* meta-attributes, even though they are interpreted by Pacemaker rather than the fence agent). These are also listed in the man page for ``pacemaker-fenced``. .. Not_Yet_Implemented: +----------------------+---------+--------------------+----------------------------------------+ | priority | integer | 0 | .. index:: | | | | | single: priority | | | | | | | | | | The priority of the fence device. | | | | | Devices are tried in order of highest | | | | | priority to lowest. | +----------------------+---------+--------------------+----------------------------------------+ .. table:: **Additional Properties of Fencing Resources** +----------------------+---------+--------------------+----------------------------------------+ | Field | Type | Default | Description | +======================+=========+====================+========================================+ | stonith-timeout | time | | .. index:: | | | | | single: stonith-timeout | | | | | | | | | | Older versions used this to override | | | | | the default period to wait for a fence | | | | | action (reboot, on, or off) to | | | | | complete for this device. It has been | | | | | replaced by the | | | | | ``pcmk_reboot_timeout`` and | | | | | ``pcmk_off_timeout`` properties. | +----------------------+---------+--------------------+----------------------------------------+ | provides | string | | .. index:: | | | | | single: provides | | | | | | | | | | Any special capability provided by the | | | | | fence device. Currently, only one such | | | | | capability is meaningful: | | | | | :ref:`unfencing `. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_host_map | string | | .. index:: | | | | | single: pcmk_host_map | | | | | | | | | | A mapping of host names to ports | | | | | numbers for devices that do not | | | | | support host names. | | | | | | | | | | Example: ``node1:1;node2:2,3`` tells | | | | | the cluster to use port 1 for | | | | | ``node1`` and ports 2 and 3 for | | | | | ``node2``. If ``pcmk_host_check`` is | | | | | explicitly set to ``static-list``, | | | | | either this or ``pcmk_host_list`` must | | | | | be set. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_host_list | string | | .. index:: | | | | | single: pcmk_host_list | | | | | | | | | | A list of machines controlled by this | | | | | device. If ``pcmk_host_check`` is | | | | | explicitly set to ``static-list``, | | | | | either this or ``pcmk_host_map`` must | | | | | be set. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_host_check | string | The default is | .. index:: | | | | ``static-list`` if | single: pcmk_host_check | | | | either | | | | | ``pcmk_host_list`` | How to determine which machines are | | | | or | controlled by the device. Allowed | | | | ``pcmk_host_map`` | values: | | | | is configured. If | | | | | neither of those | * ``dynamic-list:`` query the device | | | | are configured, | via the agent's ``list`` action | | | | the default is | * ``static-list:`` check the | | | | ``dynamic-list`` | ``pcmk_host_list`` or | | | | if the fence | ``pcmk_host_map`` attribute | | | | device supports | * ``status:`` query the device via the | | | | the list action, | "status" command | | | | or ``status`` if | * ``none:`` assume the device can | | | | the fence device | fence any node | | | | supports the | | | | | status action but | | | | | not the list | | | | | action. If none of | | | | | those conditions | | | | | apply, the default | | | | | is ``none``. | | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_delay_max | time | 0s | .. index:: | | | | | single: pcmk_delay_max | | | | | | - | | | | Enable a random delay of up to the | + | | | | Enable a delay of no more than the | | | | | time specified before executing | - | | | | fencing actions. This is sometimes | + | | | | fencing actions. Pacemaker derives the | + | | | | overall delay by taking the value of | + | | | | pcmk_delay_base and adding a random | + | | | | delay value such that the sum is kept | + | | | | below this maximum. This is sometimes | | | | | used in two-node clusters to ensure | | | | | that the nodes don't fence each other | - | | | | at the same time. The overall delay | - | | | | introduced by pacemaker is derived | - | | | | from this random delay value adding a | - | | | | static delay so that the sum is kept | - | | | | below the maximum delay. | + | | | | at the same time. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_delay_base | time | 0s | .. index:: | | | | | single: pcmk_delay_base | | | | | | | | | | Enable a static delay before executing | | | | | fencing actions. This can be used, for | | | | | example, in two-node clusters to | | | | | ensure that the nodes don't fence each | | | | | other, by having separate fencing | | | | | resources with different values. The | | | | | node that is fenced with the shorter | | | | | delay will lose a fencing race. The | | | | | overall delay introduced by pacemaker | | | | | is derived from this value plus a | | | | | random delay such that the sum is kept | | | | | below the maximum delay. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_action_limit | integer | 1 | .. index:: | | | | | single: pcmk_action_limit | | | | | | | | | | The maximum number of actions that can | | | | | be performed in parallel on this | | | | | device, if the cluster option | | | | | ``concurrent-fencing`` is ``true``. A | | | | | value of -1 means unlimited. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_host_argument | string | ``port`` otherwise | .. index:: | | | | ``plug`` if | single: pcmk_host_argument | | | | supported | | | | | according to the | *Advanced use only.* Which parameter | | | | metadata of the | should be supplied to the fence agent | | | | fence agent | to identify the node to be fenced. | | | | | Some devices support neither the | | | | | standard ``plug`` nor the deprecated | | | | | ``port`` parameter, or may provide | | | | | additional ones. Use this to specify | | | | | an alternate, device-specific | | | | | parameter. A value of ``none`` tells | | | | | the cluster not to supply any | | | | | additional parameters. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_reboot_action | string | reboot | .. index:: | | | | | single: pcmk_reboot_action | | | | | | | | | | *Advanced use only.* The command to | | | | | send to the resource agent in order to | | | | | reboot a node. Some devices do not | | | | | support the standard commands or may | | | | | provide additional ones. Use this to | | | | | specify an alternate, device-specific | | | | | command. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_reboot_timeout | time | 60s | .. index:: | | | | | single: pcmk_reboot_timeout | | | | | | | | | | *Advanced use only.* Specify an | | | | | alternate timeout to use for | | | | | ``reboot`` actions instead of the | | | | | value of ``stonith-timeout``. Some | | | | | devices need much more or less time to | | | | | complete than normal. Use this to | | | | | specify an alternate, device-specific | | | | | timeout. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_reboot_retries | integer | 2 | .. index:: | | | | | single: pcmk_reboot_retries | | | | | | | | | | *Advanced use only.* The maximum | | | | | number of times to retry the | | | | | ``reboot`` command within the timeout | | | | | period. Some devices do not support | | | | | multiple connections, and operations | | | | | may fail if the device is busy with | | | | | another task, so Pacemaker will | | | | | automatically retry the operation, if | | | | | there is time remaining. Use this | | | | | option to alter the number of times | | | | | Pacemaker retries before giving up. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_off_action | string | off | .. index:: | | | | | single: pcmk_off_action | | | | | | | | | | *Advanced use only.* The command to | | | | | send to the resource agent in order to | | | | | shut down a node. Some devices do not | | | | | support the standard commands or may | | | | | provide additional ones. Use this to | | | | | specify an alternate, device-specific | | | | | command. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_off_timeout | time | 60s | .. index:: | | | | | single: pcmk_off_timeout | | | | | | | | | | *Advanced use only.* Specify an | | | | | alternate timeout to use for | | | | | ``off`` actions instead of the | | | | | value of ``stonith-timeout``. Some | | | | | devices need much more or less time to | | | | | complete than normal. Use this to | | | | | specify an alternate, device-specific | | | | | timeout. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_off_retries | integer | 2 | .. index:: | | | | | single: pcmk_off_retries | | | | | | | | | | *Advanced use only.* The maximum | | | | | number of times to retry the | | | | | ``off`` command within the timeout | | | | | period. Some devices do not support | | | | | multiple connections, and operations | | | | | may fail if the device is busy with | | | | | another task, so Pacemaker will | | | | | automatically retry the operation, if | | | | | there is time remaining. Use this | | | | | option to alter the number of times | | | | | Pacemaker retries before giving up. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_list_action | string | list | .. index:: | | | | | single: pcmk_list_action | | | | | | | | | | *Advanced use only.* The command to | | | | | send to the resource agent in order to | | | | | list nodes. Some devices do not | | | | | support the standard commands or may | | | | | provide additional ones. Use this to | | | | | specify an alternate, device-specific | | | | | command. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_list_timeout | time | 60s | .. index:: | | | | | single: pcmk_list_timeout | | | | | | | | | | *Advanced use only.* Specify an | | | | | alternate timeout to use for | | | | | ``list`` actions instead of the | | | | | value of ``stonith-timeout``. Some | | | | | devices need much more or less time to | | | | | complete than normal. Use this to | | | | | specify an alternate, device-specific | | | | | timeout. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_list_retries | integer | 2 | .. index:: | | | | | single: pcmk_list_retries | | | | | | | | | | *Advanced use only.* The maximum | | | | | number of times to retry the | | | | | ``list`` command within the timeout | | | | | period. Some devices do not support | | | | | multiple connections, and operations | | | | | may fail if the device is busy with | | | | | another task, so Pacemaker will | | | | | automatically retry the operation, if | | | | | there is time remaining. Use this | | | | | option to alter the number of times | | | | | Pacemaker retries before giving up. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_monitor_action | string | monitor | .. index:: | | | | | single: pcmk_monitor_action | | | | | | | | | | *Advanced use only.* The command to | | | | | send to the resource agent in order to | | | | | report extended status. Some devices do| | | | | not support the standard commands or | | | | | may provide additional ones. Use this | | | | | to specify an alternate, | | | | | device-specific command. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_monitor_timeout | time | 60s | .. index:: | | | | | single: pcmk_monitor_timeout | | | | | | | | | | *Advanced use only.* Specify an | | | | | alternate timeout to use for | | | | | ``monitor`` actions instead of the | | | | | value of ``stonith-timeout``. Some | | | | | devices need much more or less time to | | | | | complete than normal. Use this to | | | | | specify an alternate, device-specific | | | | | timeout. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_monitor_retries | integer | 2 | .. index:: | | | | | single: pcmk_monitor_retries | | | | | | | | | | *Advanced use only.* The maximum | | | | | number of times to retry the | | | | | ``monitor`` command within the timeout | | | | | period. Some devices do not support | | | | | multiple connections, and operations | | | | | may fail if the device is busy with | | | | | another task, so Pacemaker will | | | | | automatically retry the operation, if | | | | | there is time remaining. Use this | | | | | option to alter the number of times | | | | | Pacemaker retries before giving up. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_status_action | string | status | .. index:: | | | | | single: pcmk_status_action | | | | | | | | | | *Advanced use only.* The command to | | | | | send to the resource agent in order to | | | | | report status. Some devices do | | | | | not support the standard commands or | | | | | may provide additional ones. Use this | | | | | to specify an alternate, | | | | | device-specific command. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_status_timeout | time | 60s | .. index:: | | | | | single: pcmk_status_timeout | | | | | | | | | | *Advanced use only.* Specify an | | | | | alternate timeout to use for | | | | | ``status`` actions instead of the | | | | | value of ``stonith-timeout``. Some | | | | | devices need much more or less time to | | | | | complete than normal. Use this to | | | | | specify an alternate, device-specific | | | | | timeout. | +----------------------+---------+--------------------+----------------------------------------+ | pcmk_status_retries | integer | 2 | .. index:: | | | | | single: pcmk_status_retries | | | | | | | | | | *Advanced use only.* The maximum | | | | | number of times to retry the | | | | | ``status`` command within the timeout | | | | | period. Some devices do not support | | | | | multiple connections, and operations | | | | | may fail if the device is busy with | | | | | another task, so Pacemaker will | | | | | automatically retry the operation, if | | | | | there is time remaining. Use this | | | | | option to alter the number of times | | | | | Pacemaker retries before giving up. | +----------------------+---------+--------------------+----------------------------------------+ .. index:: single: unfencing single: fencing; unfencing .. _unfencing: Unfencing ######### With fabric fencing (such as cutting network or shared disk access rather than power), it is expected that the cluster will fence the node, and then a system administrator must manually investigate what went wrong, correct any issues found, then reboot (or restart the cluster services on) the node. Once the node reboots and rejoins the cluster, some fabric fencing devices require an explicit command to restore the node's access. This capability is called *unfencing* and is typically implemented as the fence agent's ``on`` command. If any cluster resource has ``requires`` set to ``unfencing``, then that resource will not be probed or started on a node until that node has been unfenced. Fence Devices Dependent on Other Resources ########################################## In some cases, a fence device may require some other cluster resource (such as an IP address) to be active in order to function properly. This is obviously undesirable in general: fencing may be required when the depended-on resource is not active, or fencing may be required because the node running the depended-on resource is no longer responding. However, this may be acceptable under certain conditions: * The dependent fence device should not be able to target any node that is allowed to run the depended-on resource. * The depended-on resource should not be disabled during production operation. * The ``concurrent-fencing`` cluster property should be set to ``true``. Otherwise, if both the node running the depended-on resource and some node targeted by the dependent fence device need to be fenced, the fencing of the node running the depended-on resource might be ordered first, making the second fencing impossible and blocking further recovery. With concurrent fencing, the dependent fence device might fail at first due to the depended-on resource being unavailable, but it will be retried and eventually succeed once the resource is brought back up. Even under those conditions, there is one unlikely problem scenario. The DC always schedules fencing of itself after any other fencing needed, to avoid unnecessary repeated DC elections. If the dependent fence device targets the DC, and both the DC and a different node running the depended-on resource need to be fenced, the DC fencing will always fail and block further recovery. Note, however, that losing a DC node entirely causes some other node to become DC and schedule the fencing, so this is only a risk when a stop or other operation with ``on-fail`` set to ``fencing`` fails on the DC. .. index:: single: fencing; configuration Configuring Fencing ################### Higher-level tools can provide simpler interfaces to this process, but using Pacemaker command-line tools, this is how you could configure a fence device. #. Find the correct driver: .. code-block:: none # stonith_admin --list-installed .. note:: You may have to install packages to make fence agents available on your host. Searching your available packages for ``fence-`` is usually helpful. Ensure the packages providing the fence agents you require are installed on every cluster node. #. Find the required parameters associated with the device (replacing ``$AGENT_NAME`` with the name obtained from the previous step): .. code-block:: none # stonith_admin --metadata --agent $AGENT_NAME #. Create a file called ``stonith.xml`` containing a primitive resource with a class of ``stonith``, a type equal to the agent name obtained earlier, and a parameter for each of the values returned in the previous step. #. If the device does not know how to fence nodes based on their uname, you may also need to set the special ``pcmk_host_map`` parameter. See :ref:`fencing-attributes` for details. #. If the device does not support the ``list`` command, you may also need to set the special ``pcmk_host_list`` and/or ``pcmk_host_check`` parameters. See :ref:`fencing-attributes` for details. #. If the device does not expect the victim to be specified with the ``port`` parameter, you may also need to set the special ``pcmk_host_argument`` parameter. See :ref:`fencing-attributes` for details. #. Upload it into the CIB using cibadmin: .. code-block:: none # cibadmin --create --scope resources --xml-file stonith.xml #. Set ``stonith-enabled`` to true: .. code-block:: none # crm_attribute --type crm_config --name stonith-enabled --update true #. Once the stonith resource is running, you can test it by executing the following, replacing ``$NODE_NAME`` with the name of the node to fence (although you might want to stop the cluster on that machine first): .. code-block:: none # stonith_admin --reboot $NODE_NAME Example Fencing Configuration _____________________________ For this example, we assume we have a cluster node, ``pcmk-1``, whose IPMI controller is reachable at the IP address 192.0.2.1. The IPMI controller uses the username ``testuser`` and the password ``abc123``. #. Looking at what's installed, we may see a variety of available agents: .. code-block:: none # stonith_admin --list-installed .. code-block:: none (... some output omitted ...) fence_idrac fence_ilo3 fence_ilo4 fence_ilo5 fence_imm fence_ipmilan (... some output omitted ...) Perhaps after some reading some man pages and doing some Internet searches, we might decide ``fence_ipmilan`` is our best choice. #. Next, we would check what parameters ``fence_ipmilan`` provides: .. code-block:: none # stonith_admin --metadata -a fence_ipmilan .. code-block:: xml fence_ipmilan is an I/O Fencing agentwhich can be used with machines controlled by IPMI.This agent calls support software ipmitool (http://ipmitool.sf.net/). WARNING! This fence agent might report success before the node is powered off. You should use -m/method onoff if your fence device works correctly with that option. Fencing action IPMI Lan Auth type. Ciphersuite to use (same as ipmitool -C parameter) Hexadecimal-encoded Kg key for IPMIv2 authentication IP address or hostname of fencing device IP address or hostname of fencing device TCP/UDP port to use for connection with device Use Lanplus to improve security of connection Login name Method to fence Login password or passphrase Script to run to retrieve password Login password or passphrase Script to run to retrieve password IP address or hostname of fencing device (together with --port-as-ip) IP address or hostname of fencing device (together with --port-as-ip) Privilege level on IPMI device Bridge IPMI requests to the remote target address Login name Disable logging to stderr. Does not affect --verbose or --debug-file or logging to syslog. Verbose mode Write debug information to given file Write debug information to given file Display version information and exit Display help and exit Wait X seconds before fencing is started Path to ipmitool binary Wait X seconds for cmd prompt after login Make "port/plug" to be an alias to IP address Test X seconds for status change after ON/OFF Wait X seconds after issuing ON/OFF Wait X seconds for cmd prompt after issuing command Count of attempts to retry power on Use sudo (without password) when calling 3rd party software Use sudo (without password) when calling 3rd party software Path to sudo binary Once we've decided what parameter values we think we need, it is a good idea to run the fence agent's status action manually, to verify that our values work correctly: .. code-block:: none # fence_ipmilan --lanplus -a 192.0.2.1 -l testuser -p abc123 -o status Chassis Power is on #. Based on that, we might create a fencing resource configuration like this in ``stonith.xml`` (or any file name, just use the same name with ``cibadmin`` later): .. code-block:: xml .. note:: Even though the man page shows that the ``action`` parameter is supported, we do not provide that in the resource configuration. Pacemaker will supply an appropriate action whenever the fence device must be used. #. In this case, we don't need to configure ``pcmk_host_map`` because ``fence_ipmilan`` ignores the target node name and instead uses its ``ip`` parameter to know how to contact the IPMI controller. #. We do need to let Pacemaker know which cluster node can be fenced by this device, since ``fence_ipmilan`` doesn't support the ``list`` action. Add a line like this to the agent's instance attributes: .. code-block:: xml #. We don't need to configure ``pcmk_host_argument`` since ``ip`` is all the fence agent needs (it ignores the target name). #. Make the configuration active: .. code-block:: none # cibadmin --create --scope resources --xml-file stonith.xml #. Set ``stonith-enabled`` to true (this only has to be done once): .. code-block:: none # crm_attribute --type crm_config --name stonith-enabled --update true #. Since our cluster is still in testing, we can reboot ``pcmk-1`` without bothering anyone, so we'll test our fencing configuration by running this from one of the other cluster nodes: .. code-block:: none # stonith_admin --reboot pcmk-1 Then we will verify that the node did, in fact, reboot. We can repeat that process to create a separate fencing resource for each node. With some other fence device types, a single fencing resource is able to be used for all nodes. In fact, we could do that with ``fence_ipmilan``, using the ``port-as-ip`` parameter along with ``pcmk_host_map``. Either approach is fine. .. index:: single: fencing; topology single: fencing-topology single: fencing-level Fencing Topologies ################## Pacemaker supports fencing nodes with multiple devices through a feature called *fencing topologies*. Fencing topologies may be used to provide alternative devices in case one fails, or to require multiple devices to all be executed successfully in order to consider the node successfully fenced, or even a combination of the two. Create the individual devices as you normally would, then define one or more ``fencing-level`` entries in the ``fencing-topology`` section of the configuration. * Each fencing level is attempted in order of ascending ``index``. Allowed values are 1 through 9. * If a device fails, processing terminates for the current level. No further devices in that level are exercised, and the next level is attempted instead. * If the operation succeeds for all the listed devices in a level, the level is deemed to have passed. * The operation is finished when a level has passed (success), or all levels have been attempted (failed). * If the operation failed, the next step is determined by the scheduler and/or the controller. Some possible uses of topologies include: * Try on-board IPMI, then an intelligent power switch if that fails * Try fabric fencing of both disk and network, then fall back to power fencing if either fails * Wait up to a certain time for a kernel dump to complete, then cut power to the node .. table:: **Attributes of a fencing-level Element** +------------------+-----------------------------------------------------------------------------------------+ | Attribute | Description | +==================+=========================================================================================+ | id | .. index:: | | | pair: fencing-level; id | | | | | | A unique name for this element (required) | +------------------+-----------------------------------------------------------------------------------------+ | target | .. index:: | | | pair: fencing-level; target | | | | | | The name of a single node to which this level applies | +------------------+-----------------------------------------------------------------------------------------+ | target-pattern | .. index:: | | | pair: fencing-level; target-pattern | | | | | | An extended regular expression (as defined in `POSIX | | | `_) | | | matching the names of nodes to which this level applies | +------------------+-----------------------------------------------------------------------------------------+ | target-attribute | .. index:: | | | pair: fencing-level; target-attribute | | | | | | The name of a node attribute that is set (to ``target-value``) for nodes to which this | | | level applies | +------------------+-----------------------------------------------------------------------------------------+ | target-value | .. index:: | | | pair: fencing-level; target-value | | | | | | The node attribute value (of ``target-attribute``) that is set for nodes to which this | | | level applies | +------------------+-----------------------------------------------------------------------------------------+ | index | .. index:: | | | pair: fencing-level; index | | | | | | The order in which to attempt the levels. Levels are attempted in ascending order | | | *until one succeeds*. Valid values are 1 through 9. | +------------------+-----------------------------------------------------------------------------------------+ | devices | .. index:: | | | pair: fencing-level; devices | | | | | | A comma-separated list of devices that must all be tried for this level | +------------------+-----------------------------------------------------------------------------------------+ .. note:: **Fencing topology with different devices for different nodes** .. code-block:: xml ... ... Example Dual-Layer, Dual-Device Fencing Topologies __________________________________________________ The following example illustrates an advanced use of ``fencing-topology`` in a cluster with the following properties: * 2 nodes (prod-mysql1 and prod-mysql2) * the nodes have IPMI controllers reachable at 192.0.2.1 and 192.0.2.2 * the nodes each have two independent Power Supply Units (PSUs) connected to two independent Power Distribution Units (PDUs) reachable at 198.51.100.1 (port 10 and port 11) and 203.0.113.1 (port 10 and port 11) * fencing via the IPMI controller uses the ``fence_ipmilan`` agent (1 fence device per controller, with each device targeting a separate node) * fencing via the PDUs uses the ``fence_apc_snmp`` agent (1 fence device per PDU, with both devices targeting both nodes) * a random delay is used to lessen the chance of a "death match" * fencing topology is set to try IPMI fencing first then dual PDU fencing if that fails In a node failure scenario, Pacemaker will first select ``fence_ipmilan`` to try to kill the faulty node. Using the fencing topology, if that method fails, it will then move on to selecting ``fence_apc_snmp`` twice (once for the first PDU, then again for the second PDU). The fence action is considered successful only if both PDUs report the required status. If any of them fails, fencing loops back to the first fencing method, ``fence_ipmilan``, and so on, until the node is fenced or the fencing action is cancelled. .. note:: **First fencing method: single IPMI device per target** Each cluster node has it own dedicated IPMI controller that can be contacted for fencing using the following primitives: .. code-block:: xml .. note:: **Second fencing method: dual PDU devices** Each cluster node also has 2 distinct power supplies controlled by 2 distinct PDUs: * Node 1: PDU 1 port 10 and PDU 2 port 10 * Node 2: PDU 1 port 11 and PDU 2 port 11 The matching fencing agents are configured as follows: .. code-block:: xml .. note:: **Fencing topology** Now that all the fencing resources are defined, it's time to create the right topology. We want to first fence using IPMI and if that does not work, fence both PDUs to effectively and surely kill the node. .. code-block:: xml In ``fencing-topology``, the lowest ``index`` value for a target determines its first fencing method. Remapping Reboots ################# When the cluster needs to reboot a node, whether because ``stonith-action`` is ``reboot`` or because a reboot was requested externally (such as by ``stonith_admin --reboot``), it will remap that to other commands in two cases: * If the chosen fencing device does not support the ``reboot`` command, the cluster will ask it to perform ``off`` instead. * If a fencing topology level with multiple devices must be executed, the cluster will ask all the devices to perform ``off``, then ask the devices to perform ``on``. To understand the second case, consider the example of a node with redundant power supplies connected to intelligent power switches. Rebooting one switch and then the other would have no effect on the node. Turning both switches off, and then on, actually reboots the node. In such a case, the fencing operation will be treated as successful as long as the ``off`` commands succeed, because then it is safe for the cluster to recover any resources that were on the node. Timeouts and errors in the ``on`` phase will be logged but ignored. When a reboot operation is remapped, any action-specific timeout for the remapped action will be used (for example, ``pcmk_off_timeout`` will be used when executing the ``off`` command, not ``pcmk_reboot_timeout``). diff --git a/extra/gdb/gdbhelpers b/extra/gdb/gdbhelpers deleted file mode 100644 index e013e5d153..0000000000 --- a/extra/gdb/gdbhelpers +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2018 the Pacemaker project contributors -# -# The version control history for this file may have further details. -# Author: Jan Pokorny -# Part of pacemaker project -# SPDX-License-Identifier: GPL-2.0-or-later - -define pcmk - # shelling avoids necessity to have an inferior around - shell printf '%s %s\n' 'Available commands' \ - '(all require an inferior, just break on main or so):\n' - shell printf '%s\t%s\n' pcmk-follow-daemon \ - 'Convenient pacemakerd to particular daemon descent' -end -document pcmk -Show possible pcmk namespace rooted user-defined convenience commands. -end - -# XXX one would expect that order of pcmk_children naturally matches -# the actual sequential ordering, but that's governed with start_seq -define pcmk-follow-daemon - dont-repeat - set $d = $arg0 - eval "set $d_alt = \"pacemaker-%s\"", $d - set $cont = 1 - break start_child - cont - while ($cont) - if (!strcmp(child->name, $d) || !strcmp(child->name, $d_alt)) - set $cont = 0 - set follow-fork-mode child - break getrlimit - continue - set follow-fork-mode parent # restore - else - if (child->start_seq == sizeof(pcmk_children)/sizeof(*pcmk_children) - 1) - set $cont = 0 - printf "no such daemon: %s (%s)\n", $d, $d_alt - set follow-fork-mode parent # restore - else - continue - end - end - end -end -document pcmk-follow-daemon -Convenient way to follow into particular daemon when starting debugging -from the master control process of pacemaker (pacemakerd). -For "pacemaker-" prefixed daemons, this very prefix is not needed. - -Example: pcmk-follow-daemon "controld" -end diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c index f59f2d606d..1029d46bc7 100644 --- a/lib/common/ipc_client.c +++ b/lib/common/ipc_client.c @@ -1,1462 +1,1462 @@ /* * Copyright 2004-2020 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #include #if defined(US_AUTH_PEERCRED_UCRED) || defined(US_AUTH_PEERCRED_SOCKPEERCRED) # ifdef US_AUTH_PEERCRED_UCRED # ifndef _GNU_SOURCE # define _GNU_SOURCE # endif # endif # include #elif defined(US_AUTH_GETPEERUCRED) # include #endif #include #include #include #include #include /* indirectly: pcmk_err_generic */ #include #include #include #include "crmcommon_private.h" /*! * \brief Create a new object for using Pacemaker daemon IPC * * \param[out] api Where to store new IPC object * \param[in] server Which Pacemaker daemon the object is for * * \return Standard Pacemaker result code * * \note The caller is responsible for freeing *api using pcmk_free_ipc_api(). * \note This is intended to supersede crm_ipc_new() but currently only * supports the controller & pacemakerd IPC API. */ int pcmk_new_ipc_api(pcmk_ipc_api_t **api, enum pcmk_ipc_server server) { if (api == NULL) { return EINVAL; } *api = calloc(1, sizeof(pcmk_ipc_api_t)); if (*api == NULL) { return errno; } (*api)->server = server; if (pcmk_ipc_name(*api, false) == NULL) { pcmk_free_ipc_api(*api); *api = NULL; return EOPNOTSUPP; } (*api)->ipc_size_max = 0; // Set server methods and max_size (if not default) switch (server) { case pcmk_ipc_attrd: break; case pcmk_ipc_based: (*api)->ipc_size_max = 512 * 1024; // 512KB break; case pcmk_ipc_controld: (*api)->cmds = pcmk__controld_api_methods(); break; case pcmk_ipc_execd: break; case pcmk_ipc_fenced: break; case pcmk_ipc_pacemakerd: (*api)->cmds = pcmk__pacemakerd_api_methods(); break; case pcmk_ipc_schedulerd: // @TODO max_size could vary by client, maybe take as argument? (*api)->ipc_size_max = 5 * 1024 * 1024; // 5MB break; } if ((*api)->cmds == NULL) { pcmk_free_ipc_api(*api); *api = NULL; return ENOMEM; } (*api)->ipc = crm_ipc_new(pcmk_ipc_name(*api, false), (*api)->ipc_size_max); if ((*api)->ipc == NULL) { pcmk_free_ipc_api(*api); *api = NULL; return ENOMEM; } // If daemon API has its own data to track, allocate it if ((*api)->cmds->new_data != NULL) { if ((*api)->cmds->new_data(*api) != pcmk_rc_ok) { pcmk_free_ipc_api(*api); *api = NULL; return ENOMEM; } } crm_trace("Created %s API IPC object", pcmk_ipc_name(*api, true)); return pcmk_rc_ok; } static void free_daemon_specific_data(pcmk_ipc_api_t *api) { if ((api != NULL) && (api->cmds != NULL)) { if ((api->cmds->free_data != NULL) && (api->api_data != NULL)) { api->cmds->free_data(api->api_data); api->api_data = NULL; } free(api->cmds); api->cmds = NULL; } } /*! * \internal * \brief Call an IPC API event callback, if one is registed * * \param[in] api IPC API connection * \param[in] event_type The type of event that occurred * \param[in] status Event status * \param[in] event_data Event-specific data */ void pcmk__call_ipc_callback(pcmk_ipc_api_t *api, enum pcmk_ipc_event event_type, crm_exit_t status, void *event_data) { if ((api != NULL) && (api->cb != NULL)) { api->cb(api, event_type, status, event_data, api->user_data); } } /*! * \internal * \brief Clean up after an IPC disconnect * * \param[in] user_data IPC API connection that disconnected * * \note This function can be used as a main loop IPC destroy callback. */ static void ipc_post_disconnect(gpointer user_data) { pcmk_ipc_api_t *api = user_data; crm_info("Disconnected from %s IPC API", pcmk_ipc_name(api, true)); // Perform any daemon-specific handling needed if ((api->cmds != NULL) && (api->cmds->post_disconnect != NULL)) { api->cmds->post_disconnect(api); } // Call client's registered event callback pcmk__call_ipc_callback(api, pcmk_ipc_event_disconnect, CRM_EX_DISCONNECT, NULL); /* If this is being called from a running main loop, mainloop_gio_destroy() * will free ipc and mainloop_io immediately after calling this function. * If this is called from a stopped main loop, these will leak, so the best * practice is to close the connection before stopping the main loop. */ api->ipc = NULL; api->mainloop_io = NULL; if (api->free_on_disconnect) { /* pcmk_free_ipc_api() has already been called, but did not free api * or api->cmds because this function needed them. Do that now. */ free_daemon_specific_data(api); crm_trace("Freeing IPC API object after disconnect"); free(api); } } /*! * \brief Free the contents of an IPC API object * * \param[in] api IPC API object to free */ void pcmk_free_ipc_api(pcmk_ipc_api_t *api) { bool free_on_disconnect = false; if (api == NULL) { return; } crm_debug("Releasing %s IPC API", pcmk_ipc_name(api, true)); if (api->ipc != NULL) { if (api->mainloop_io != NULL) { /* We need to keep the api pointer itself around, because it is the * user data for the IPC client destroy callback. That will be * triggered by the pcmk_disconnect_ipc() call below, but it might * happen later in the main loop (if still running). * * This flag tells the destroy callback to free the object. It can't * do that unconditionally, because the application might call this * function after a disconnect that happened by other means. */ free_on_disconnect = api->free_on_disconnect = true; } pcmk_disconnect_ipc(api); // Frees api if free_on_disconnect is true } if (!free_on_disconnect) { free_daemon_specific_data(api); crm_trace("Freeing IPC API object"); free(api); } } /*! * \brief Get the IPC name used with an IPC API connection * * \param[in] api IPC API connection * \param[in] for_log If true, return human-friendly name instead of IPC name * * \return IPC API's human-friendly or connection name, or if none is available, * "Pacemaker" if for_log is true and NULL if for_log is false */ const char * pcmk_ipc_name(pcmk_ipc_api_t *api, bool for_log) { if (api == NULL) { return for_log? "Pacemaker" : NULL; } switch (api->server) { case pcmk_ipc_attrd: return for_log? "attribute manager" : NULL /* T_ATTRD */; case pcmk_ipc_based: return for_log? "CIB manager" : NULL /* PCMK__SERVER_BASED_RW */; case pcmk_ipc_controld: return for_log? "controller" : CRM_SYSTEM_CRMD; case pcmk_ipc_execd: return for_log? "executor" : NULL /* CRM_SYSTEM_LRMD */; case pcmk_ipc_fenced: return for_log? "fencer" : NULL /* "stonith-ng" */; case pcmk_ipc_pacemakerd: return for_log? "launcher" : CRM_SYSTEM_MCP; case pcmk_ipc_schedulerd: return for_log? "scheduler" : NULL /* CRM_SYSTEM_PENGINE */; default: return for_log? "Pacemaker" : NULL; } } /*! * \brief Check whether an IPC API connection is active * * \param[in] api IPC API connection * * \return true if IPC is connected, false otherwise */ bool pcmk_ipc_is_connected(pcmk_ipc_api_t *api) { return (api != NULL) && crm_ipc_connected(api->ipc); } /*! * \internal * \brief Call the daemon-specific API's dispatch function * * Perform daemon-specific handling of IPC reply dispatch. It is the daemon * method's responsibility to call the client's registered event callback, as * well as allocate and free any event data. * * \param[in] api IPC API connection */ static void call_api_dispatch(pcmk_ipc_api_t *api, xmlNode *message) { crm_log_xml_trace(message, "ipc-received"); if ((api->cmds != NULL) && (api->cmds->dispatch != NULL)) { api->cmds->dispatch(api, message); } } /*! * \internal * \brief Dispatch data read from IPC source * * \param[in] buffer Data read from IPC * \param[in] length Number of bytes of data in buffer (ignored) * \param[in] user_data IPC object * * \return Always 0 (meaning connection is still required) * * \note This function can be used as a main loop IPC dispatch callback. */ static int dispatch_ipc_data(const char *buffer, ssize_t length, gpointer user_data) { pcmk_ipc_api_t *api = user_data; xmlNode *msg; CRM_CHECK(api != NULL, return 0); if (buffer == NULL) { crm_warn("Empty message received from %s IPC", pcmk_ipc_name(api, true)); return 0; } msg = string2xml(buffer); if (msg == NULL) { crm_warn("Malformed message received from %s IPC", pcmk_ipc_name(api, true)); return 0; } call_api_dispatch(api, msg); free_xml(msg); return 0; } /*! * \brief Check whether an IPC connection has data available (without main loop) * * \param[in] api IPC API connection * \param[in] timeout_ms If less than 0, poll indefinitely; if 0, poll once * and return immediately; otherwise, poll for up to * this many milliseconds * * \return Standard Pacemaker return code * * \note Callers of pcmk_connect_ipc() using pcmk_ipc_dispatch_poll should call * this function to check whether IPC data is available. Return values of * interest include pcmk_rc_ok meaning data is available, and EAGAIN * meaning no data is available; all other values indicate errors. * \todo This does not allow the caller to poll multiple file descriptors at * once. If there is demand for that, we could add a wrapper for * crm_ipc_get_fd(api->ipc), so the caller can call poll() themselves. */ int pcmk_poll_ipc(pcmk_ipc_api_t *api, int timeout_ms) { int rc; struct pollfd pollfd = { 0, }; if ((api == NULL) || (api->dispatch_type != pcmk_ipc_dispatch_poll)) { return EINVAL; } pollfd.fd = crm_ipc_get_fd(api->ipc); pollfd.events = POLLIN; rc = poll(&pollfd, 1, timeout_ms); if (rc < 0) { return errno; } else if (rc == 0) { return EAGAIN; } return pcmk_rc_ok; } /*! * \brief Dispatch available messages on an IPC connection (without main loop) * * \param[in] api IPC API connection * * \return Standard Pacemaker return code * * \note Callers of pcmk_connect_ipc() using pcmk_ipc_dispatch_poll should call * this function when IPC data is available. */ void pcmk_dispatch_ipc(pcmk_ipc_api_t *api) { if (api == NULL) { return; } while (crm_ipc_ready(api->ipc) > 0) { if (crm_ipc_read(api->ipc) > 0) { dispatch_ipc_data(crm_ipc_buffer(api->ipc), 0, api); } } } // \return Standard Pacemaker return code static int connect_with_main_loop(pcmk_ipc_api_t *api) { int rc; struct ipc_client_callbacks callbacks = { .dispatch = dispatch_ipc_data, .destroy = ipc_post_disconnect, }; rc = pcmk__add_mainloop_ipc(api->ipc, G_PRIORITY_DEFAULT, api, &callbacks, &(api->mainloop_io)); if (rc != pcmk_rc_ok) { return rc; } crm_debug("Connected to %s IPC (attached to main loop)", pcmk_ipc_name(api, true)); /* After this point, api->mainloop_io owns api->ipc, so api->ipc * should not be explicitly freed. */ return pcmk_rc_ok; } // \return Standard Pacemaker return code static int connect_without_main_loop(pcmk_ipc_api_t *api) { int rc; if (!crm_ipc_connect(api->ipc)) { rc = errno; crm_ipc_close(api->ipc); return rc; } crm_debug("Connected to %s IPC (without main loop)", pcmk_ipc_name(api, true)); return pcmk_rc_ok; } /*! * \brief Connect to a Pacemaker daemon via IPC * * \param[in] api IPC API instance * \param[out] dispatch_type How IPC replies should be dispatched * * \return Standard Pacemaker return code */ int pcmk_connect_ipc(pcmk_ipc_api_t *api, enum pcmk_ipc_dispatch dispatch_type) { int rc = pcmk_rc_ok; if (api == NULL) { crm_err("Cannot connect to uninitialized API object"); return EINVAL; } if (api->ipc == NULL) { api->ipc = crm_ipc_new(pcmk_ipc_name(api, false), api->ipc_size_max); if (api->ipc == NULL) { crm_err("Failed to re-create IPC API"); return ENOMEM; } } if (crm_ipc_connected(api->ipc)) { crm_trace("Already connected to %s IPC API", pcmk_ipc_name(api, true)); return pcmk_rc_ok; } api->dispatch_type = dispatch_type; switch (dispatch_type) { case pcmk_ipc_dispatch_main: rc = connect_with_main_loop(api); break; case pcmk_ipc_dispatch_sync: case pcmk_ipc_dispatch_poll: rc = connect_without_main_loop(api); break; } if (rc != pcmk_rc_ok) { return rc; } if ((api->cmds != NULL) && (api->cmds->post_connect != NULL)) { rc = api->cmds->post_connect(api); if (rc != pcmk_rc_ok) { crm_ipc_close(api->ipc); } } return rc; } /*! * \brief Disconnect an IPC API instance * * \param[in] api IPC API connection * * \return Standard Pacemaker return code * * \note If the connection is attached to a main loop, this function should be * called before quitting the main loop, to ensure that all memory is * freed. */ void pcmk_disconnect_ipc(pcmk_ipc_api_t *api) { if ((api == NULL) || (api->ipc == NULL)) { return; } switch (api->dispatch_type) { case pcmk_ipc_dispatch_main: { mainloop_io_t *mainloop_io = api->mainloop_io; // Make sure no code with access to api can use these again api->mainloop_io = NULL; api->ipc = NULL; mainloop_del_ipc_client(mainloop_io); // After this point api might have already been freed } break; case pcmk_ipc_dispatch_poll: case pcmk_ipc_dispatch_sync: { crm_ipc_t *ipc = api->ipc; // Make sure no code with access to api can use ipc again api->ipc = NULL; // This should always be the case already, but to be safe api->free_on_disconnect = false; crm_ipc_destroy(ipc); ipc_post_disconnect(api); } break; } } /*! * \brief Register a callback for IPC API events * * \param[in] api IPC API connection * \param[in] callback Callback to register * \param[in] userdata Caller data to pass to callback * * \note This function may be called multiple times to update the callback * and/or user data. The caller remains responsible for freeing * userdata in any case (after the IPC is disconnected, if the * user data is still registered with the IPC). */ void pcmk_register_ipc_callback(pcmk_ipc_api_t *api, pcmk_ipc_callback_t cb, void *user_data) { if (api == NULL) { return; } api->cb = cb; api->user_data = user_data; } /*! * \internal * \brief Send an XML request across an IPC API connection * * \param[in] api IPC API connection * \param[in] request XML request to send * * \return Standard Pacemaker return code * * \note Daemon-specific IPC API functions should call this function to send * requests, because it handles different dispatch types appropriately. */ int pcmk__send_ipc_request(pcmk_ipc_api_t *api, xmlNode *request) { int rc; xmlNode *reply = NULL; enum crm_ipc_flags flags = crm_ipc_flags_none; if ((api == NULL) || (api->ipc == NULL) || (request == NULL)) { return EINVAL; } crm_log_xml_trace(request, "ipc-sent"); // Synchronous dispatch requires waiting for a reply if ((api->dispatch_type == pcmk_ipc_dispatch_sync) && (api->cmds != NULL) && (api->cmds->reply_expected != NULL) && (api->cmds->reply_expected(api, request))) { flags = crm_ipc_client_response; } // The 0 here means a default timeout of 5 seconds rc = crm_ipc_send(api->ipc, request, flags, 0, &reply); if (rc < 0) { return pcmk_legacy2rc(rc); } else if (rc == 0) { return ENODATA; } // With synchronous dispatch, we dispatch any reply now if (reply != NULL) { call_api_dispatch(api, reply); free_xml(reply); } return pcmk_rc_ok; } /*! * \internal * \brief Create the XML for an IPC request to purge a node from the peer cache * * \param[in] api IPC API connection * \param[in] node_name If not NULL, name of node to purge * \param[in] nodeid If not 0, node ID of node to purge * * \return Newly allocated IPC request XML * * \note The controller, fencer, and pacemakerd use the same request syntax, but * the attribute manager uses a different one. The CIB manager doesn't * have any syntax for it. The executor and scheduler don't connect to the * cluster layer and thus don't have or need any syntax for it. * * \todo Modify the attribute manager to accept the common syntax (as well * as its current one, for compatibility with older clients). Modify * the CIB manager to accept and honor the common syntax. Modify the * executor and scheduler to accept the syntax (immediately returning * success), just for consistency. Modify this function to use the * common syntax with all daemons if their version supports it. */ static xmlNode * create_purge_node_request(pcmk_ipc_api_t *api, const char *node_name, uint32_t nodeid) { xmlNode *request = NULL; const char *client = crm_system_name? crm_system_name : "client"; switch (api->server) { case pcmk_ipc_attrd: request = create_xml_node(NULL, __func__); crm_xml_add(request, F_TYPE, T_ATTRD); crm_xml_add(request, F_ORIG, crm_system_name); crm_xml_add(request, PCMK__XA_TASK, PCMK__ATTRD_CMD_PEER_REMOVE); crm_xml_add(request, PCMK__XA_ATTR_NODE_NAME, node_name); if (nodeid > 0) { crm_xml_add_int(request, PCMK__XA_ATTR_NODE_ID, (int) nodeid); } break; case pcmk_ipc_controld: case pcmk_ipc_fenced: case pcmk_ipc_pacemakerd: request = create_request(CRM_OP_RM_NODE_CACHE, NULL, NULL, pcmk_ipc_name(api, false), client, NULL); if (nodeid > 0) { crm_xml_set_id(request, "%lu", (unsigned long) nodeid); } crm_xml_add(request, XML_ATTR_UNAME, node_name); break; case pcmk_ipc_based: case pcmk_ipc_execd: case pcmk_ipc_schedulerd: break; } return request; } /*! * \brief Ask a Pacemaker daemon to purge a node from its peer cache * * \param[in] api IPC API connection * \param[in] node_name If not NULL, name of node to purge * \param[in] nodeid If not 0, node ID of node to purge * * \return Standard Pacemaker return code * * \note At least one of node_name or nodeid must be specified. */ int pcmk_ipc_purge_node(pcmk_ipc_api_t *api, const char *node_name, uint32_t nodeid) { int rc = 0; xmlNode *request = NULL; if (api == NULL) { return EINVAL; } if ((node_name == NULL) && (nodeid == 0)) { return EINVAL; } request = create_purge_node_request(api, node_name, nodeid); if (request == NULL) { return EOPNOTSUPP; } rc = pcmk__send_ipc_request(api, request); free_xml(request); crm_debug("%s peer cache purge of node %s[%lu]: rc=%d", pcmk_ipc_name(api, true), node_name, (unsigned long) nodeid, rc); return rc; } /* * Generic IPC API (to eventually be deprecated as public API and made internal) */ struct crm_ipc_s { struct pollfd pfd; unsigned int max_buf_size; // maximum bytes we can send or receive over IPC unsigned int buf_size; // size of allocated buffer int msg_size; int need_reply; char *buffer; char *name; qb_ipcc_connection_t *ipc; }; /*! * \brief Create a new (legacy) object for using Pacemaker daemon IPC * * \param[in] name IPC system name to connect to * \param[in] max_size Use a maximum IPC buffer size of at least this size * * \return Newly allocated IPC object on success, NULL otherwise * * \note The caller is responsible for freeing the result using * crm_ipc_destroy(). * \note This should be considered deprecated for use with daemons supported by * pcmk_new_ipc_api(). */ crm_ipc_t * crm_ipc_new(const char *name, size_t max_size) { crm_ipc_t *client = NULL; client = calloc(1, sizeof(crm_ipc_t)); if (client == NULL) { crm_err("Could not create IPC connection: %s", strerror(errno)); return NULL; } client->name = strdup(name); if (client->name == NULL) { crm_err("Could not create IPC connection: %s", strerror(errno)); free(client); return NULL; } client->buf_size = pcmk__ipc_buffer_size(max_size); client->buffer = malloc(client->buf_size); if (client->buffer == NULL) { crm_err("Could not create IPC connection: %s", strerror(errno)); free(client->name); free(client); return NULL; } /* Clients initiating connection pick the max buf size */ client->max_buf_size = client->buf_size; client->pfd.fd = -1; client->pfd.events = POLLIN; client->pfd.revents = 0; return client; } /*! * \brief Establish an IPC connection to a Pacemaker component * * \param[in] client Connection instance obtained from crm_ipc_new() * * \return TRUE on success, FALSE otherwise (in which case errno will be set; * specifically, in case of discovering the remote side is not * authentic, its value is set to ECONNABORTED). */ bool crm_ipc_connect(crm_ipc_t * client) { uid_t cl_uid = 0; gid_t cl_gid = 0; pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0; int rv; client->need_reply = FALSE; client->ipc = qb_ipcc_connect(client->name, client->buf_size); if (client->ipc == NULL) { crm_debug("Could not establish %s connection: %s (%d)", client->name, pcmk_strerror(errno), errno); return FALSE; } client->pfd.fd = crm_ipc_get_fd(client); if (client->pfd.fd < 0) { rv = errno; /* message already omitted */ crm_ipc_close(client); errno = rv; return FALSE; } rv = pcmk_daemon_user(&cl_uid, &cl_gid); if (rv < 0) { /* message already omitted */ crm_ipc_close(client); errno = -rv; return FALSE; } if ((rv = pcmk__crm_ipc_is_authentic_process(client->ipc, client->pfd.fd, cl_uid, cl_gid, &found_pid, &found_uid, &found_gid)) == pcmk_rc_ipc_unauthorized) { crm_err("Daemon (IPC %s) is not authentic:" " process %lld (uid: %lld, gid: %lld)", client->name, (long long) PCMK__SPECIAL_PID_AS_0(found_pid), (long long) found_uid, (long long) found_gid); crm_ipc_close(client); errno = ECONNABORTED; return FALSE; } else if (rv != pcmk_rc_ok) { crm_perror(LOG_ERR, "Could not verify authenticity of daemon (IPC %s)", client->name); crm_ipc_close(client); if (rv > 0) { errno = rv; } else { - rv = ENOTCONN; + errno = ENOTCONN; } return FALSE; } qb_ipcc_context_set(client->ipc, client); #ifdef HAVE_IPCS_GET_BUFFER_SIZE client->max_buf_size = qb_ipcc_get_buffer_size(client->ipc); if (client->max_buf_size > client->buf_size) { free(client->buffer); client->buffer = calloc(1, client->max_buf_size); client->buf_size = client->max_buf_size; } #endif return TRUE; } void crm_ipc_close(crm_ipc_t * client) { if (client) { if (client->ipc) { qb_ipcc_connection_t *ipc = client->ipc; client->ipc = NULL; qb_ipcc_disconnect(ipc); } } } void crm_ipc_destroy(crm_ipc_t * client) { if (client) { if (client->ipc && qb_ipcc_is_connected(client->ipc)) { crm_notice("Destroying an active IPC connection to %s", client->name); /* The next line is basically unsafe * * If this connection was attached to mainloop and mainloop is active, * the 'disconnected' callback will end up back here and we'll end * up free'ing the memory twice - something that can still happen * even without this if we destroy a connection and it closes before * we call exit */ /* crm_ipc_close(client); */ } crm_trace("Destroying IPC connection to %s: %p", client->name, client); free(client->buffer); free(client->name); free(client); } } int crm_ipc_get_fd(crm_ipc_t * client) { int fd = 0; if (client && client->ipc && (qb_ipcc_fd_get(client->ipc, &fd) == 0)) { return fd; } errno = EINVAL; crm_perror(LOG_ERR, "Could not obtain file IPC descriptor for %s", (client? client->name : "unspecified client")); return -errno; } bool crm_ipc_connected(crm_ipc_t * client) { bool rc = FALSE; if (client == NULL) { crm_trace("No client"); return FALSE; } else if (client->ipc == NULL) { crm_trace("No connection"); return FALSE; } else if (client->pfd.fd < 0) { crm_trace("Bad descriptor"); return FALSE; } rc = qb_ipcc_is_connected(client->ipc); if (rc == FALSE) { client->pfd.fd = -EINVAL; } return rc; } /*! * \brief Check whether an IPC connection is ready to be read * * \param[in] client Connection to check * * \return Positive value if ready to be read, 0 if not ready, -errno on error */ int crm_ipc_ready(crm_ipc_t *client) { int rc; CRM_ASSERT(client != NULL); if (crm_ipc_connected(client) == FALSE) { return -ENOTCONN; } client->pfd.revents = 0; rc = poll(&(client->pfd), 1, 0); return (rc < 0)? -errno : rc; } // \return Standard Pacemaker return code static int crm_ipc_decompress(crm_ipc_t * client) { pcmk__ipc_header_t *header = (pcmk__ipc_header_t *)(void*)client->buffer; if (header->size_compressed) { int rc = 0; unsigned int size_u = 1 + header->size_uncompressed; /* never let buf size fall below our max size required for ipc reads. */ unsigned int new_buf_size = QB_MAX((sizeof(pcmk__ipc_header_t) + size_u), client->max_buf_size); char *uncompressed = calloc(1, new_buf_size); crm_trace("Decompressing message data %u bytes into %u bytes", header->size_compressed, size_u); rc = BZ2_bzBuffToBuffDecompress(uncompressed + sizeof(pcmk__ipc_header_t), &size_u, client->buffer + sizeof(pcmk__ipc_header_t), header->size_compressed, 1, 0); if (rc != BZ_OK) { crm_err("Decompression failed: %s " CRM_XS " bzerror=%d", bz2_strerror(rc), rc); free(uncompressed); return EILSEQ; } /* * This assert no longer holds true. For an identical msg, some clients may * require compression, and others may not. If that same msg (event) is sent * to multiple clients, it could result in some clients receiving a compressed * msg even though compression was not explicitly required for them. * * CRM_ASSERT((header->size_uncompressed + sizeof(pcmk__ipc_header_t)) >= ipc_buffer_max); */ CRM_ASSERT(size_u == header->size_uncompressed); memcpy(uncompressed, client->buffer, sizeof(pcmk__ipc_header_t)); /* Preserve the header */ header = (pcmk__ipc_header_t *)(void*)uncompressed; free(client->buffer); client->buf_size = new_buf_size; client->buffer = uncompressed; } CRM_ASSERT(client->buffer[sizeof(pcmk__ipc_header_t) + header->size_uncompressed - 1] == 0); return pcmk_rc_ok; } long crm_ipc_read(crm_ipc_t * client) { pcmk__ipc_header_t *header = NULL; CRM_ASSERT(client != NULL); CRM_ASSERT(client->ipc != NULL); CRM_ASSERT(client->buffer != NULL); client->buffer[0] = 0; client->msg_size = qb_ipcc_event_recv(client->ipc, client->buffer, client->buf_size, 0); if (client->msg_size >= 0) { int rc = crm_ipc_decompress(client); if (rc != pcmk_rc_ok) { return pcmk_rc2legacy(rc); } header = (pcmk__ipc_header_t *)(void*)client->buffer; if (!pcmk__valid_ipc_header(header)) { return -EBADMSG; } crm_trace("Received %s event %d, size=%u, rc=%d, text: %.100s", client->name, header->qb.id, header->qb.size, client->msg_size, client->buffer + sizeof(pcmk__ipc_header_t)); } else { crm_trace("No message from %s received: %s", client->name, pcmk_strerror(client->msg_size)); } if (crm_ipc_connected(client) == FALSE || client->msg_size == -ENOTCONN) { crm_err("Connection to %s failed", client->name); } if (header) { /* Data excluding the header */ return header->size_uncompressed; } return -ENOMSG; } const char * crm_ipc_buffer(crm_ipc_t * client) { CRM_ASSERT(client != NULL); return client->buffer + sizeof(pcmk__ipc_header_t); } uint32_t crm_ipc_buffer_flags(crm_ipc_t * client) { pcmk__ipc_header_t *header = NULL; CRM_ASSERT(client != NULL); if (client->buffer == NULL) { return 0; } header = (pcmk__ipc_header_t *)(void*)client->buffer; return header->flags; } const char * crm_ipc_name(crm_ipc_t * client) { CRM_ASSERT(client != NULL); return client->name; } // \return Standard Pacemaker return code static int internal_ipc_get_reply(crm_ipc_t *client, int request_id, int ms_timeout, ssize_t *bytes) { time_t timeout = time(NULL) + 1 + (ms_timeout / 1000); int rc = pcmk_rc_ok; /* get the reply */ crm_trace("client %s waiting on reply to msg id %d", client->name, request_id); do { *bytes = qb_ipcc_recv(client->ipc, client->buffer, client->buf_size, 1000); if (*bytes > 0) { pcmk__ipc_header_t *hdr = NULL; rc = crm_ipc_decompress(client); if (rc != pcmk_rc_ok) { return rc; } hdr = (pcmk__ipc_header_t *)(void*)client->buffer; if (hdr->qb.id == request_id) { /* Got it */ break; } else if (hdr->qb.id < request_id) { xmlNode *bad = string2xml(crm_ipc_buffer(client)); crm_err("Discarding old reply %d (need %d)", hdr->qb.id, request_id); crm_log_xml_notice(bad, "OldIpcReply"); } else { xmlNode *bad = string2xml(crm_ipc_buffer(client)); crm_err("Discarding newer reply %d (need %d)", hdr->qb.id, request_id); crm_log_xml_notice(bad, "ImpossibleReply"); CRM_ASSERT(hdr->qb.id <= request_id); } } else if (crm_ipc_connected(client) == FALSE) { crm_err("Server disconnected client %s while waiting for msg id %d", client->name, request_id); break; } } while (time(NULL) < timeout); if (*bytes < 0) { rc = (int) -*bytes; // System errno } return rc; } /*! * \brief Send an IPC XML message * * \param[in] client Connection to IPC server * \param[in] message XML message to send * \param[in] flags Bitmask of crm_ipc_flags * \param[in] ms_timeout Give up if not sent within this much time * (5 seconds if 0, or no timeout if negative) * \param[out] reply Reply from server (or NULL if none) * * \return Negative errno on error, otherwise size of reply received in bytes * if reply was needed, otherwise number of bytes sent */ int crm_ipc_send(crm_ipc_t * client, xmlNode * message, enum crm_ipc_flags flags, int32_t ms_timeout, xmlNode ** reply) { int rc = 0; ssize_t qb_rc = 0; ssize_t bytes = 0; struct iovec *iov; static uint32_t id = 0; static int factor = 8; pcmk__ipc_header_t *header; if (client == NULL) { crm_notice("Can't send IPC request without connection (bug?): %.100s", message); return -ENOTCONN; } else if (crm_ipc_connected(client) == FALSE) { /* Don't even bother */ crm_notice("Can't send IPC request to %s: Connection closed", client->name); return -ENOTCONN; } if (ms_timeout == 0) { ms_timeout = 5000; } if (client->need_reply) { qb_rc = qb_ipcc_recv(client->ipc, client->buffer, client->buf_size, ms_timeout); if (qb_rc < 0) { crm_warn("Sending IPC to %s disabled until pending reply received", client->name); return -EALREADY; } else { crm_notice("Sending IPC to %s re-enabled after pending reply received", client->name); client->need_reply = FALSE; } } id++; CRM_LOG_ASSERT(id != 0); /* Crude wrap-around detection */ rc = pcmk__ipc_prepare_iov(id, message, client->max_buf_size, &iov, &bytes); if (rc != pcmk_rc_ok) { crm_warn("Couldn't prepare IPC request to %s: %s " CRM_XS " rc=%d", client->name, pcmk_rc_str(rc), rc); return pcmk_rc2legacy(rc); } header = iov[0].iov_base; pcmk__set_ipc_flags(header->flags, client->name, flags); if (pcmk_is_set(flags, crm_ipc_proxied)) { /* Don't look for a synchronous response */ pcmk__clear_ipc_flags(flags, "client", crm_ipc_client_response); } if(header->size_compressed) { if(factor < 10 && (client->max_buf_size / 10) < (bytes / factor)) { crm_notice("Compressed message exceeds %d0%% of configured IPC " "limit (%u bytes); consider setting PCMK_ipc_buffer to " "%u or higher", factor, client->max_buf_size, 2 * client->max_buf_size); factor++; } } crm_trace("Sending %s IPC request %d of %u bytes using %dms timeout", client->name, header->qb.id, header->qb.size, ms_timeout); if ((ms_timeout > 0) || !pcmk_is_set(flags, crm_ipc_client_response)) { time_t timeout = time(NULL) + 1 + (ms_timeout / 1000); do { /* @TODO Is this check really needed? Won't qb_ipcc_sendv() return * an error if it's not connected? */ if (!crm_ipc_connected(client)) { goto send_cleanup; } qb_rc = qb_ipcc_sendv(client->ipc, iov, 2); } while ((qb_rc == -EAGAIN) && (time(NULL) < timeout)); rc = (int) qb_rc; // Negative of system errno, or bytes sent if (qb_rc <= 0) { goto send_cleanup; } else if (!pcmk_is_set(flags, crm_ipc_client_response)) { crm_trace("Not waiting for reply to %s IPC request %d", client->name, header->qb.id); goto send_cleanup; } rc = internal_ipc_get_reply(client, header->qb.id, ms_timeout, &bytes); if (rc != pcmk_rc_ok) { /* We didn't get the reply in time, so disable future sends for now. * The only alternative would be to close the connection since we * don't know how to detect and discard out-of-sequence replies. * * @TODO Implement out-of-sequence detection */ client->need_reply = TRUE; } rc = (int) bytes; // Negative system errno, or size of reply received } else { // No timeout, and client response needed do { qb_rc = qb_ipcc_sendv_recv(client->ipc, iov, 2, client->buffer, client->buf_size, -1); } while ((qb_rc == -EAGAIN) && crm_ipc_connected(client)); rc = (int) qb_rc; // Negative system errno, or size of reply received } if (rc > 0) { pcmk__ipc_header_t *hdr = (pcmk__ipc_header_t *)(void*)client->buffer; crm_trace("Received %d-byte reply %d to %s IPC %d: %.100s", rc, hdr->qb.id, client->name, header->qb.id, crm_ipc_buffer(client)); if (reply) { *reply = string2xml(crm_ipc_buffer(client)); } } else { crm_trace("No reply to %s IPC %d: rc=%d", client->name, header->qb.id, rc); } send_cleanup: if (crm_ipc_connected(client) == FALSE) { crm_notice("Couldn't send %s IPC request %d: Connection closed " CRM_XS " rc=%d", client->name, header->qb.id, rc); } else if (rc == -ETIMEDOUT) { crm_warn("%s IPC request %d failed: %s after %dms " CRM_XS " rc=%d", client->name, header->qb.id, pcmk_strerror(rc), ms_timeout, rc); crm_write_blackbox(0, NULL); } else if (rc <= 0) { crm_warn("%s IPC request %d failed: %s " CRM_XS " rc=%d", client->name, header->qb.id, ((rc == 0)? "No bytes sent" : pcmk_strerror(rc)), rc); } pcmk_free_ipc_event(iov); return rc; } int pcmk__crm_ipc_is_authentic_process(qb_ipcc_connection_t *qb_ipc, int sock, uid_t refuid, gid_t refgid, pid_t *gotpid, uid_t *gotuid, gid_t *gotgid) { int ret = 0; pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0; #if defined(US_AUTH_PEERCRED_UCRED) struct ucred ucred; socklen_t ucred_len = sizeof(ucred); #endif #ifdef HAVE_QB_IPCC_AUTH_GET if (qb_ipc && !qb_ipcc_auth_get(qb_ipc, &found_pid, &found_uid, &found_gid)) { goto do_checks; } #endif #if defined(US_AUTH_PEERCRED_UCRED) if (!getsockopt(sock, SOL_SOCKET, SO_PEERCRED, &ucred, &ucred_len) && ucred_len == sizeof(ucred)) { found_pid = ucred.pid; found_uid = ucred.uid; found_gid = ucred.gid; #elif defined(US_AUTH_PEERCRED_SOCKPEERCRED) struct sockpeercred sockpeercred; socklen_t sockpeercred_len = sizeof(sockpeercred); if (!getsockopt(sock, SOL_SOCKET, SO_PEERCRED, &sockpeercred, &sockpeercred_len) && sockpeercred_len == sizeof(sockpeercred_len)) { found_pid = sockpeercred.pid; found_uid = sockpeercred.uid; found_gid = sockpeercred.gid; #elif defined(US_AUTH_GETPEEREID) if (!getpeereid(sock, &found_uid, &found_gid)) { found_pid = PCMK__SPECIAL_PID; /* cannot obtain PID (FreeBSD) */ #elif defined(US_AUTH_GETPEERUCRED) ucred_t *ucred; if (!getpeerucred(sock, &ucred)) { errno = 0; found_pid = ucred_getpid(ucred); found_uid = ucred_geteuid(ucred); found_gid = ucred_getegid(ucred); ret = -errno; ucred_free(ucred); if (ret) { return (ret < 0) ? ret : -pcmk_err_generic; } #else # error "No way to authenticate a Unix socket peer" errno = 0; if (0) { #endif #ifdef HAVE_QB_IPCC_AUTH_GET do_checks: #endif if (gotpid != NULL) { *gotpid = found_pid; } if (gotuid != NULL) { *gotuid = found_uid; } if (gotgid != NULL) { *gotgid = found_gid; } if (found_uid == 0 || found_uid == refuid || found_gid == refgid) { ret = 0; } else { ret = pcmk_rc_ipc_unauthorized; } } else { ret = (errno > 0) ? errno : pcmk_rc_error; } return ret; } int crm_ipc_is_authentic_process(int sock, uid_t refuid, gid_t refgid, pid_t *gotpid, uid_t *gotuid, gid_t *gotgid) { int ret = pcmk__crm_ipc_is_authentic_process(NULL, sock, refuid, refgid, gotpid, gotuid, gotgid); /* The old function had some very odd return codes*/ if (ret == 0) { return 1; } else if (ret == pcmk_rc_ipc_unauthorized) { return 0; } else { return pcmk_rc2legacy(ret); } } int pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid, gid_t refgid, pid_t *gotpid) { static char last_asked_name[PATH_MAX / 2] = ""; /* log spam prevention */ int fd; int rc = pcmk_rc_ipc_unresponsive; int auth_rc = 0; int32_t qb_rc; pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0; qb_ipcc_connection_t *c; c = qb_ipcc_connect(name, 0); if (c == NULL) { crm_info("Could not connect to %s IPC: %s", name, strerror(errno)); rc = pcmk_rc_ipc_unresponsive; goto bail; } qb_rc = qb_ipcc_fd_get(c, &fd); if (qb_rc != 0) { rc = (int) -qb_rc; // System errno crm_err("Could not get fd from %s IPC: %s " CRM_XS " rc=%d", name, pcmk_rc_str(rc), rc); goto bail; } auth_rc = pcmk__crm_ipc_is_authentic_process(c, fd, refuid, refgid, &found_pid, &found_uid, &found_gid); if (auth_rc == pcmk_rc_ipc_unauthorized) { crm_err("Daemon (IPC %s) effectively blocked with unauthorized" " process %lld (uid: %lld, gid: %lld)", name, (long long) PCMK__SPECIAL_PID_AS_0(found_pid), (long long) found_uid, (long long) found_gid); rc = pcmk_rc_ipc_unauthorized; goto bail; } if (auth_rc != pcmk_rc_ok) { rc = auth_rc; crm_err("Could not get peer credentials from %s IPC: %s " CRM_XS " rc=%d", name, pcmk_rc_str(rc), rc); goto bail; } if (gotpid != NULL) { *gotpid = found_pid; } rc = pcmk_rc_ok; if ((found_uid != refuid || found_gid != refgid) && strncmp(last_asked_name, name, sizeof(last_asked_name))) { if ((found_uid == 0) && (refuid != 0)) { crm_warn("Daemon (IPC %s) runs as root, whereas the expected" " credentials are %lld:%lld, hazard of violating" " the least privilege principle", name, (long long) refuid, (long long) refgid); } else { crm_notice("Daemon (IPC %s) runs as %lld:%lld, whereas the" " expected credentials are %lld:%lld, which may" " mean a different set of privileges than expected", name, (long long) found_uid, (long long) found_gid, (long long) refuid, (long long) refgid); } memccpy(last_asked_name, name, '\0', sizeof(last_asked_name)); } bail: if (c != NULL) { qb_ipcc_disconnect(c); } return rc; }