diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am index bc8935782..0d34c7c65 100644 --- a/doc/man/Makefile.am +++ b/doc/man/Makefile.am @@ -1,276 +1,277 @@ # # doc: Linux-HA resource agents # # Copyright (C) 2009 Florian Haas # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = $(doc_DATA) $(REFENTRY_STYLESHEET) \ mkappendix.sh ralist.sh CLEANFILES = $(man_MANS) $(xmlfiles) metadata-*.xml STYLESHEET_PREFIX ?= http://docbook.sourceforge.net/release/xsl/current MANPAGES_STYLESHEET ?= $(STYLESHEET_PREFIX)/manpages/docbook.xsl HTML_STYLESHEET ?= $(STYLESHEET_PREFIX)/xhtml/docbook.xsl FO_STYLESHEET ?= $(STYLESHEET_PREFIX)/fo/docbook.xsl REFENTRY_STYLESHEET ?= ra2refentry.xsl XSLTPROC_OPTIONS ?= --xinclude XSLTPROC_MANPAGES_OPTIONS ?= $(XSLTPROC_OPTIONS) XSLTPROC_HTML_OPTIONS ?= $(XSLTPROC_OPTIONS) XSLTPROC_FO_OPTIONS ?= $(XSLTPROC_OPTIONS) radir = $(abs_top_builddir)/heartbeat # required for out-of-tree build symlinkstargets = \ ocf-distro ocf.py ocf-rarun ocf-returncodes \ findif.sh apache-conf.sh aws.sh http-mon.sh mysql-common.sh \ nfsserver-redhat.sh openstack-common.sh ora-common.sh preptree: for i in $(symlinkstargets); do \ if [ ! -f $(radir)/$$i ]; then \ rm -rf $(radir)/$$i; \ ln -sf $(abs_top_srcdir)/heartbeat/$$i $(radir)/$$i; \ fi; \ done $(radir)/%: $(abs_top_srcdir)/heartbeat/% if [ ! -f $@ ]; then \ ln -sf $< $@; \ fi # OCF_ROOT=. is necessary due to a sanity check in ocf-shellfuncs # (which tests whether $OCF_ROOT points to a directory metadata-%.xml: $(radir)/% preptree OCF_ROOT=. OCF_FUNCTIONS_DIR=$(radir) $< meta-data > $@ metadata-IPv6addr.xml: $(radir)/IPv6addr OCF_ROOT=. OCF_FUNCTIONS_DIR=$(radir) $< meta-data > $@ clean-local: find $(radir) -type l -exec rm -rf {} \; # Please note: we can't name the man pages # ocf:heartbeat:. Believe me, I've tried. It looks like it # works, but then it doesn't. While make can deal correctly with # colons in target names (when properly escaped), it royally messes up # when it is deals with _dependencies_ that contain colons. See Bug # 12126 on savannah.gnu.org. But, maybe it gets fixed soon, it was # first reported in 1995 and added to Savannah in in 2005... 
if BUILD_DOC man_MANS = ocf_heartbeat_AoEtarget.7 \ ocf_heartbeat_AudibleAlarm.7 \ ocf_heartbeat_ClusterMon.7 \ ocf_heartbeat_CTDB.7 \ ocf_heartbeat_Delay.7 \ ocf_heartbeat_Dummy.7 \ ocf_heartbeat_EvmsSCC.7 \ ocf_heartbeat_Evmsd.7 \ ocf_heartbeat_Filesystem.7 \ ocf_heartbeat_ICP.7 \ ocf_heartbeat_IPaddr.7 \ ocf_heartbeat_IPaddr2.7 \ ocf_heartbeat_IPsrcaddr.7 \ ocf_heartbeat_LVM.7 \ ocf_heartbeat_LVM-activate.7 \ ocf_heartbeat_LinuxSCSI.7 \ ocf_heartbeat_MailTo.7 \ ocf_heartbeat_ManageRAID.7 \ ocf_heartbeat_ManageVE.7 \ ocf_heartbeat_NodeUtilization.7 \ ocf_heartbeat_Pure-FTPd.7 \ ocf_heartbeat_Raid1.7 \ ocf_heartbeat_Route.7 \ ocf_heartbeat_SAPDatabase.7 \ ocf_heartbeat_SAPInstance.7 \ ocf_heartbeat_SendArp.7 \ ocf_heartbeat_ServeRAID.7 \ ocf_heartbeat_SphinxSearchDaemon.7 \ ocf_heartbeat_Squid.7 \ ocf_heartbeat_Stateful.7 \ ocf_heartbeat_SysInfo.7 \ ocf_heartbeat_VIPArip.7 \ ocf_heartbeat_VirtualDomain.7 \ ocf_heartbeat_WAS.7 \ ocf_heartbeat_WAS6.7 \ ocf_heartbeat_WinPopup.7 \ ocf_heartbeat_Xen.7 \ ocf_heartbeat_Xinetd.7 \ ocf_heartbeat_ZFS.7 \ ocf_heartbeat_aliyun-vpc-move-ip.7 \ ocf_heartbeat_anything.7 \ ocf_heartbeat_apache.7 \ ocf_heartbeat_asterisk.7 \ ocf_heartbeat_aws-vpc-move-ip.7 \ ocf_heartbeat_aws-vpc-route53.7 \ ocf_heartbeat_awseip.7 \ ocf_heartbeat_awsvip.7 \ ocf_heartbeat_azure-lb.7 \ ocf_heartbeat_clvm.7 \ ocf_heartbeat_conntrackd.7 \ ocf_heartbeat_corosync-qnetd.7 \ ocf_heartbeat_crypt.7 \ ocf_heartbeat_db2.7 \ ocf_heartbeat_dhcpd.7 \ ocf_heartbeat_docker.7 \ ocf_heartbeat_docker-compose.7 \ ocf_heartbeat_dovecot.7 \ ocf_heartbeat_dnsupdate.7 \ ocf_heartbeat_dummypy.7 \ ocf_heartbeat_eDir88.7 \ ocf_heartbeat_ethmonitor.7 \ ocf_heartbeat_exportfs.7 \ ocf_heartbeat_fio.7 \ ocf_heartbeat_galera.7 \ ocf_heartbeat_garbd.7 \ ocf_heartbeat_gcp-ilb.7 \ ocf_heartbeat_gcp-vpc-move-ip.7 \ ocf_heartbeat_iSCSILogicalUnit.7 \ ocf_heartbeat_iSCSITarget.7 \ ocf_heartbeat_iface-bridge.7 \ ocf_heartbeat_iface-macvlan.7 \ ocf_heartbeat_iface-vlan.7 \ ocf_heartbeat_ipsec.7 \ ocf_heartbeat_ids.7 \ ocf_heartbeat_iscsi.7 \ ocf_heartbeat_jboss.7 \ ocf_heartbeat_jira.7 \ ocf_heartbeat_kamailio.7 \ ocf_heartbeat_lvmlockd.7 \ ocf_heartbeat_lxc.7 \ ocf_heartbeat_lxd-info.7 \ ocf_heartbeat_machine-info.7 \ ocf_heartbeat_mariadb.7 \ ocf_heartbeat_mdraid.7 \ ocf_heartbeat_minio.7 \ ocf_heartbeat_mpathpersist.7 \ ocf_heartbeat_mysql.7 \ ocf_heartbeat_mysql-proxy.7 \ ocf_heartbeat_nagios.7 \ ocf_heartbeat_named.7 \ ocf_heartbeat_nfsnotify.7 \ ocf_heartbeat_nfsserver.7 \ ocf_heartbeat_nginx.7 \ ocf_heartbeat_nvmet-subsystem.7 \ ocf_heartbeat_nvmet-namespace.7 \ ocf_heartbeat_nvmet-port.7 \ ocf_heartbeat_openstack-info.7 \ ocf_heartbeat_ocivip.7 \ ocf_heartbeat_openstack-cinder-volume.7 \ ocf_heartbeat_openstack-floating-ip.7 \ ocf_heartbeat_openstack-virtual-ip.7 \ ocf_heartbeat_oraasm.7 \ ocf_heartbeat_oracle.7 \ ocf_heartbeat_oralsnr.7 \ ocf_heartbeat_osceip.7 \ ocf_heartbeat_ovsmonitor.7 \ ocf_heartbeat_pgagent.7 \ ocf_heartbeat_pgsql.7 \ ocf_heartbeat_pingd.7 \ ocf_heartbeat_podman.7 \ + ocf_heartbeat_podman-etcd.7 \ ocf_heartbeat_portblock.7 \ ocf_heartbeat_postfix.7 \ ocf_heartbeat_pound.7 \ ocf_heartbeat_proftpd.7 \ ocf_heartbeat_rabbitmq-cluster.7 \ ocf_heartbeat_rabbitmq-server-ha.7 \ ocf_heartbeat_redis.7 \ ocf_heartbeat_rkt.7 \ ocf_heartbeat_rsyncd.7 \ ocf_heartbeat_rsyslog.7 \ ocf_heartbeat_scsi2reservation.7 \ ocf_heartbeat_sfex.7 \ ocf_heartbeat_slapd.7 \ ocf_heartbeat_smb-share.7 \ ocf_heartbeat_sybaseASE.7 \ ocf_heartbeat_sg_persist.7 \ ocf_heartbeat_storage-mon.7 \ 
ocf_heartbeat_symlink.7 \ ocf_heartbeat_syslog-ng.7 \ ocf_heartbeat_tomcat.7 \ ocf_heartbeat_varnish.7 \ ocf_heartbeat_vdo-vol.7 \ ocf_heartbeat_vmware.7 \ ocf_heartbeat_vsftpd.7 \ ocf_heartbeat_zabbixserver.7 if USE_IPV6ADDR_AGENT man_MANS += ocf_heartbeat_IPv6addr.7 endif if BUILD_AZURE_EVENTS man_MANS += ocf_heartbeat_azure-events.7 endif if BUILD_AZURE_EVENTS_AZ man_MANS += ocf_heartbeat_azure-events-az.7 endif if BUILD_GCP_PD_MOVE man_MANS += ocf_heartbeat_gcp-pd-move.7 endif if BUILD_GCP_VPC_MOVE_ROUTE man_MANS += ocf_heartbeat_gcp-vpc-move-route.7 endif if BUILD_GCP_VPC_MOVE_VIP man_MANS += ocf_heartbeat_gcp-vpc-move-vip.7 endif if BUILD_POWERVS_SUBNET man_MANS += ocf_heartbeat_powervs-subnet.7 endif if BUILD_IBM_CLOUD_VPC_MOVE_ROUTE man_MANS += ocf_heartbeat_ibm-cloud-vpc-cr-vip.7 endif if BUILD_IBM_CLOUD_VPC_MOVE_FIP man_MANS += ocf_heartbeat_ibm-cloud-vpc-move-fip.7 endif xmlfiles = $(man_MANS:.7=.xml) %.1 %.5 %.7 %.8: %.xml $(XSLTPROC) \ $(XSLTPROC_MANPAGES_OPTIONS) \ $(MANPAGES_STYLESHEET) $< ocf_heartbeat_%.xml: metadata-%.xml $(srcdir)/$(REFENTRY_STYLESHEET) $(XSLTPROC) --novalid \ --stringparam package $(PACKAGE_NAME) \ --stringparam version $(VERSION) \ --output $@ \ $(srcdir)/$(REFENTRY_STYLESHEET) $< ocf_resource_agents.xml: $(xmlfiles) mkappendix.sh ./mkappendix.sh $(xmlfiles) > $@ %.html: %.xml $(XSLTPROC) \ $(XSLTPROC_HTML_OPTIONS) \ --output $@ \ $(HTML_STYLESHEET) $< xml: ocf_resource_agents.xml endif diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am index 5c41e0038..839505af9 100644 --- a/heartbeat/Makefile.am +++ b/heartbeat/Makefile.am @@ -1,263 +1,264 @@ # Makefile.am for OCF RAs # # Author: Sun Jing Dong # Copyright (C) 2004 IBM # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
# MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = $(ocf_SCRIPTS) $(ocfcommon_DATA) \ $(common_DATA) $(hb_DATA) $(dtd_DATA) \ README README.galera AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/linux-ha halibdir = $(libexecdir)/heartbeat ocfdir = $(OCF_RA_DIR_PREFIX)/heartbeat dtddir = $(datadir)/$(PACKAGE_NAME) dtd_DATA = ra-api-1.dtd metadata.rng ocf_PROGRAMS = if USE_IPV6ADDR_AGENT ocf_PROGRAMS += IPv6addr endif halib_PROGRAMS = if IPV6ADDR_COMPATIBLE halib_PROGRAMS += send_ua endif IPv6addr_SOURCES = IPv6addr.c IPv6addr_utils.c IPv6addr_LDADD = -lplumb $(LIBNETLIBS) send_ua_SOURCES = send_ua.c IPv6addr_utils.c send_ua_LDADD = $(LIBNETLIBS) ocf_SCRIPTS = AoEtarget \ AudibleAlarm \ ClusterMon \ CTDB \ Delay \ Dummy \ EvmsSCC \ Evmsd \ Filesystem \ ICP \ IPaddr \ IPaddr2 \ IPsrcaddr \ LVM \ LinuxSCSI \ lvmlockd \ LVM-activate \ MailTo \ ManageRAID \ ManageVE \ NodeUtilization \ Pure-FTPd \ Raid1 \ Route \ SAPDatabase \ SAPInstance \ SendArp \ ServeRAID \ SphinxSearchDaemon \ Squid \ Stateful \ SysInfo \ VIPArip \ VirtualDomain \ WAS \ WAS6 \ WinPopup \ Xen \ Xinetd \ ZFS \ aliyun-vpc-move-ip \ anything \ apache \ asterisk \ aws-vpc-move-ip \ aws-vpc-route53 \ awseip \ awsvip \ azure-lb \ clvm \ conntrackd \ corosync-qnetd \ crypt \ db2 \ dhcpd \ dnsupdate \ dummypy \ docker \ docker-compose \ dovecot \ eDir88 \ ethmonitor \ exportfs \ fio \ galera \ garbd \ gcp-ilb \ gcp-vpc-move-ip \ iSCSILogicalUnit \ iSCSITarget \ ids \ iface-bridge \ iface-macvlan \ iface-vlan \ ipsec \ iscsi \ jboss \ jira \ kamailio \ lxc \ lxd-info \ machine-info \ mariadb \ mdraid \ minio \ mysql \ mysql-proxy \ nagios \ named \ nfsnotify \ nfsserver \ nginx \ nvmet-subsystem \ nvmet-namespace \ nvmet-port \ ocivip \ openstack-cinder-volume \ openstack-floating-ip \ openstack-info \ openstack-virtual-ip \ oraasm \ oracle \ oralsnr \ osceip \ ovsmonitor \ pgagent \ pgsql \ pingd \ podman \ + podman-etcd \ portblock \ postfix \ pound \ proftpd \ rabbitmq-cluster \ rabbitmq-server-ha \ redis \ rkt \ rsyncd \ rsyslog \ scsi2reservation \ sfex \ sg_persist \ mpathpersist \ slapd \ smb-share \ storage-mon \ sybaseASE \ symlink \ syslog-ng \ tomcat \ varnish \ vdo-vol \ vmware \ vsftpd \ zabbixserver if BUILD_AZURE_EVENTS ocf_SCRIPTS += azure-events endif if BUILD_AZURE_EVENTS_AZ ocf_SCRIPTS += azure-events-az endif if BUILD_GCP_PD_MOVE ocf_SCRIPTS += gcp-pd-move endif if BUILD_GCP_VPC_MOVE_ROUTE ocf_SCRIPTS += gcp-vpc-move-route endif if BUILD_GCP_VPC_MOVE_VIP ocf_SCRIPTS += gcp-vpc-move-vip endif if BUILD_POWERVS_SUBNET ocf_SCRIPTS += powervs-subnet endif if BUILD_IBM_CLOUD_VPC_MOVE_ROUTE ocf_SCRIPTS += ibm-cloud-vpc-cr-vip endif if BUILD_IBM_CLOUD_VPC_MOVE_FIP ocf_SCRIPTS += ibm-cloud-vpc-move-fip endif ocfcommondir = $(OCF_LIB_DIR_PREFIX)/heartbeat ocfcommon_DATA = ocf-shellfuncs \ ocf-binaries \ ocf-directories \ ocf-returncodes \ ocf-rarun \ ocf-distro \ apache-conf.sh \ aws.sh \ http-mon.sh \ sapdb-nosha.sh \ sapdb.sh \ lvm-clvm.sh \ lvm-plain.sh \ lvm-tag.sh \ openstack-common.sh \ ora-common.sh \ mysql-common.sh \ nfsserver-redhat.sh \ findif.sh \ ocf.py # Legacy locations hbdir = $(sysconfdir)/ha.d hb_DATA = shellfuncs check: $(ocf_SCRIPTS:=.check) %.check: % OCF_ROOT=$(abs_srcdir) OCF_FUNCTIONS_DIR=$(abs_srcdir) ./$< meta-data | xmllint --path $(abs_srcdir) --noout --relaxng $(abs_srcdir)/metadata.rng - do_spellcheck = printf '[%s]\n' "$(agent)"; \ OCF_ROOT=$(abs_srcdir) OCF_FUNCTIONS_DIR=$(abs_srcdir) \ ./$(agent) meta-data 2>/dev/null \ | xsltproc $(top_srcdir)/make/extract_text.xsl - \ | aspell pipe 
list -d en_US --ignore-case \ --home-dir=$(top_srcdir)/make -p spellcheck-ignore \ | sed -n 's|^&\([^:]*\):.*|\1|p'; spellcheck: @$(foreach agent,$(ocf_SCRIPTS), $(do_spellcheck)) clean-local: rm -rf __pycache__ *.pyc diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd new file mode 100755 index 000000000..514dd2e5b --- /dev/null +++ b/heartbeat/podman-etcd @@ -0,0 +1,1597 @@ +#!/bin/sh +# +# The podman etcd HA resource agent creates and launches a etcd podman +# container based off a supplied podman image. Containers managed by +# this agent are both created and removed upon the agent's start and +# stop actions. +# +# Based on the podman resource agent. +# +# Copyright (c) 2014 David Vossel +# Michele Baldessari +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults +OCF_RESKEY_image_default="default" +OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml" +OCF_RESKEY_name_default="etcd" +OCF_RESKEY_nic_default="br-ex" +OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json" +OCF_RESKEY_allow_pull_default="1" +OCF_RESKEY_reuse_default="0" + +: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}} +: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}} +: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}} +: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}} +: ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}} +: ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}} +: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}} + +####################################################################### + +meta_data() +{ + cat < + + +1.0 + + +The podman-etcd HA resource agent creates and launches a etcd podman +container based off a supplied podman image. Containers managed by +this agent are both created and removed upon the agent's start and +stop actions. + +Podman etcd container resource agent. + + + + +The Pod manifest with the configuration for Etcd. + +Etcd pod manifest + + + + + +The podman image to base this container off of. + +podman image + + + + + +The name to give the created container. By default this will +be that resource's instance name. + +podman container name + + + + + +A mapping of node names to IPs. 
+ +This takes the form of: +n1:ip1;n2:ip2 + +where the etcd container on n1 would have IP ip1 + +Container node name to IP mapping + + + + + +Network interface to lookup interface for host. + +Network interface + + + + + +Path of the authentication file. + +The file is created by podman login. + +Path of the authentication file + + + + + +Allow the image to be pulled from the configured podman registry when +the image does not exist locally. NOTE, this can drastically increase +the time required to start the container if the image repository is +pulled over the network. + +Allow pulling non-local images + + + + + +Add options to be appended to the 'podman run' command which is used +when creating the container during the start action. This option allows +users to do things such as setting a custom entry point and injecting +environment variables into the newly created container. Note the '-d' +option is supplied regardless of this value to force containers to run +in the background. + +NOTE: Do not explicitly specify the --name argument in the run_opts. This +agent will set --name using either the resource's instance or the name +provided in the 'name' argument of this agent. + + +run options + + + + + +Specify a command to launch within the container once +it has initialized. + +run command + + + + + +Options to be added to the 'run_cmd'. + +run command options + + + + + +A comma separated list of directories that the container is expecting to use. +The agent will ensure they exist by running 'mkdir -p' + +Required mount points + + + + + +Specify the full path of a command to launch within the container to check +the health of the container. This command must return 0 to indicate that +the container is healthy. A non-zero return code will indicate that the +container has failed and should be recovered. + +Note: Using this method for monitoring processes inside a container +is not recommended, as containerd tries to track processes running +inside the container and does not deal well with many short-lived +processes being spawned. Ensure that your container monitors its +own processes and terminates on fatal error rather than invoking +a command from the outside. + +monitor command + + + + + +Kill a container immediately rather than waiting for it to gracefully +shutdown + +force kill + + + + + +Allow the container to be reused once it is stopped. By default, +containers get removed once they are stopped. Enable this option +to have the particular one persist when this happens. + +reuse container + + + + + +Use transient drop-in files to add extra dependencies to the systemd +scopes associated to the container. During reboot, this prevents systemd +to stop the container before pacemaker. + +drop-in dependency + + + + + + + + + + + + + + +END +} + +####################################################################### +REQUIRE_IMAGE_PULL=0 + +podman_usage() +{ + cat <&1) + rc=$? + # 125: no container with name or ID ${CONTAINER} found + # 126: container state improper (not running) + # 127: any other error + # 255: podman 2+: container not running + case "$rc" in + 125|126|255) + rc=$OCF_NOT_RUNNING + ;; + 0) + ocf_log debug "monitor cmd passed: exit code = $rc" + ;; + *) + ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out" + rc=$OCF_ERR_GENERIC + ;; + esac + + return $rc +} + +container_exists() +{ + local rc + local out + + out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1) + rc=$? 
+ # 125: no container with name or ID ${CONTAINER} found + if [ $rc -ne 125 ]; then + return 0 + fi + return 1 +} + +remove_container() +{ + local rc + local execids + + if ocf_is_true "$OCF_RESKEY_reuse"; then + # never remove the container if we have reuse enabled. + return 0 + fi + + if ! container_exists; then + # don't attempt to remove a container that doesn't exist + return 0 + fi + ocf_log notice "Cleaning up inactive container, ${CONTAINER}." + ocf_run podman rm -v "$CONTAINER" + rc=$? + if [ $rc -ne 0 ]; then + if [ $rc -eq 2 ]; then + if podman inspect --format '{{.State.Status}}' "$CONTAINER" | grep -wq "stopping"; then + ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it." + ocf_run podman rm -f "$CONTAINER" + rc=$? + fi + fi + # due to a podman bug (rhbz#1841485), sometimes a stopped + # container can still be associated with Exec sessions, in + # which case the "podman rm" has to be forced + execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}') + if [ "$execids" -ne "0" ]; then + ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it." + ocf_run podman rm -f "$CONTAINER" + rc=$? + fi + fi + return $rc +} + +attribute_node_ip() +{ + local action="$1" + local attribute="node_ip" + local value + + if ! value=$(ip -brief addr show "$OCF_RESKEY_nic" | awk '{gsub("/.*", "", $3); print $3}'); then + rc=$? + ocf_log err "could not get node ip, error code: $rc" + return "$rc" + fi + + case "$action" in + get) + echo "$value" + ;; + update) + if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then + rc="$?" + ocf_log err "could not set $attribute to $value, error code: $rc" + return "$rc" + fi + ;; + clear) + crm_attribute --name "$attribute" --delete + ;; + *) + ocf_log err "unsupported $action for $attribute" + return $OCF_ERR_GENERIC + ;; + esac +} + +attribute_node_ip_peer() { + local peer_name + peer_name=$(get_peer_node_name) + crm_attribute --query --name "node_ip" --node "$peer_name" | awk -F"value=" '{print $2}' +} + +get_env_from_manifest() { + local env_var_name="$1" + local env_var_value + + # The agent waits for the manifest to exist before starting, so the + # file should exist already, but this check is included for robustness. + if [ ! -f "$OCF_RESKEY_pod_manifest" ]; then + ocf_log err "external etcd pod manifest ($OCF_RESKEY_pod_manifest) not found" + exit "$OCF_ERR_INSTALLED" + fi + + if ! env_var_value=$(jq -r ".spec.containers[].env[] | select( .name == \"$env_var_name\" ).value" "$OCF_RESKEY_pod_manifest"); then + rc=$? 
+ ocf_log err "could not find environment variable $env_var_name in etcd pod manifest, error code: $rc" + exit "$OCF_ERR_INSTALLED" + fi + + ocf_log debug "ETCD pod environment variable $env_var_name: $env_var_value" + + echo "$env_var_value" +} + +prepare_env() { + local name ip standalone_node + + NODEIP="$(attribute_node_ip get)" + + if is_force_new_cluster; then + ALL_ETCD_ENDPOINTS="https://$NODEIP:2379" + ETCD_INITIAL_CLUSTER_STATE="new" + ETCD_INITIAL_CLUSTER="$NODENAME=https://$NODEIP:2380" + else + ETCD_INITIAL_CLUSTER_STATE="existing" + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do + name=$(echo "$node" | awk -F":" '{print $1}') + ip=$(echo "$node" | awk -F":" '{print $2}') + if [ -z "$name" ] || [ -z "$ip" ]; then + ocf_exit_reason "name or ip missing for 1 or more nodes" + exit $OCF_ERR_CONFIGURED + fi + + [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379" + [ -z "$ETCD_INITIAL_CLUSTER" ] && ETCD_INITIAL_CLUSTER="$name=https://$ip:2380" || ETCD_INITIAL_CLUSTER="$ETCD_INITIAL_CLUSTER,$name=https://$ip:2380" + done + fi + + ETCDCTL_API=$(get_env_from_manifest "ETCDCTL_API") + ETCD_CIPHER_SUITES=$(get_env_from_manifest "ETCD_CIPHER_SUITES") + ETCD_DATA_DIR=$(get_env_from_manifest "ETCD_DATA_DIR") + ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT") + ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF") + ETCD_EXPERIMENTAL_MAX_LEARNERS=$(get_env_from_manifest "ETCD_EXPERIMENTAL_MAX_LEARNERS") + ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") + ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") + ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL") + ETCD_QUOTA_BACKEND_BYTES=$(get_env_from_manifest "ETCD_QUOTA_BACKEND_BYTES") + ETCD_SOCKET_REUSE_ADDRESS=$(get_env_from_manifest "ETCD_SOCKET_REUSE_ADDRESS") + + SERVER_CACERT=$(get_env_from_manifest "ETCDCTL_CACERT") + ETCD_PEER_CERT=$(get_env_from_manifest "ETCDCTL_CERT") + ETCD_PEER_KEY=$(get_env_from_manifest "ETCDCTL_KEY") + + if is_learner; then + LISTEN_CLIENT_URLS="$NODEIP" + LISTEN_PEER_URLS="$NODEIP" + LISTEN_METRICS_URLS="$NODEIP" + else + LISTEN_CLIENT_URLS="0.0.0.0" + LISTEN_PEER_URLS="0.0.0.0" + LISTEN_METRICS_URLS="0.0.0.0" + fi +} + +archive_data_folder() +{ + # TODO: use etcd snapshots + local dest_dir_name + local data_dir="/var/lib/etcd/member" + + dest_dir_name="members-snapshot-$(date +%Y%M%d%H%M%S)" + if [ ! -d $data_dir ]; then + ocf_log info "no data dir to backup" + return $OCF_SUCCESS + fi + ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name" + mv "$data_dir" "$HA_RSCTMP/$dest_dir_name" + sync +} + +etcd_pod_container_exists() { + local count_matches + # Check whether the etcd pod exists on the same node (header line included) + count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l) + if [ "$count_matches" -eq 1 ]; then + # etcd pod found + return 0 + fi + # etcd pod not found + return 1 +} + +attribute_node_cluster_id() +{ + local action="$1" + local value + if ! value=$(jq -r ".clusterId" /var/lib/etcd/revision.json); then + rc=$? + ocf_log err "could not get cluster_id, error code: $rc" + return "$rc" + fi + + case "$action" in + get) + echo "$value" + ;; + update) + if ! 
crm_attribute --type nodes --node "$NODENAME" --name "cluster_id" --update "$value"; then + rc=$? + ocf_log err "could not update cluster_id, error code: $rc" + return "$rc" + fi + ;; + *) + ocf_log err "unsupported $action for attribute_node_cluster_id" + return $OCF_ERR_GENERIC + ;; + esac +} + +attribute_node_cluster_id_peer() +{ + local nodename + + nodename=$(get_peer_node_name) + crm_attribute --query --type nodes --node "$nodename" --name "cluster_id" | awk -F"value=" '{print $2}' +} + +attribute_node_revision() +{ + local action="$1" + local value + local attribute="revision" + + if ! value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json); then + rc=$? + ocf_log err "could not get $attribute, error code: $rc" + return "$rc" + fi + + case "$action" in + get) + echo "$value" + ;; + update) + if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then + rc=$? + ocf_log err "could not update etcd $attribute, error code: $rc" + return "$rc" + fi + ;; + *) + ocf_log err "unsupported $action for attribute_node_revision" + return "$OCF_ERR_GENERIC" + ;; + esac +} + +attribute_node_revision_peer() +{ + local nodename + nodename=$(get_peer_node_name) + crm_attribute --query --type nodes --node "$nodename" --name "revision" | awk -F"value=" '{print $2}' +} + +attribute_node_member_id() +{ + local action="$1" + local attribute="member_id" + + if ! container_exists; then + # we need a running container to execute etcdctl. + return 0 + fi + + case "$action" in + get) + # When we need this value at agent startup we don't have an etcd + # container running, so we always get this value from CIB + crm_attribute --query --type nodes --node "$NODENAME" --name "$attribute" | awk -F"value=" '{print $2}' + ;; + update) + local member_list_json + member_list_json=$(get_member_list_json) + ocf_log info "member list: $member_list_json" + if [ -z "$member_list_json" ] ; then + ocf_log err "could not get $attribute: could not get member list JSON" + return "$OCF_ERR_GENERIC" + fi + + local value + if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then + rc=$? + ocf_log err "could not get $attribute from member list JSON, error code: $rc" + return "$rc" + fi + + # JSON member_id is decimal, while etcdctl command needs the hex version + value=$(printf "%x" "$value") + if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then + rc=$? + ocf_log err "could not update etcd $attribute, error code: $rc" + return "$rc" + fi + ;; + clear) + crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --delete + ;; + *) + ocf_log err "unsupported $action for attribute_node_member_id" + return "$OCF_ERR_GENERIC" + ;; + esac +} + +add_member_as_learner() +{ + local rc + local member_name=$1 + local member_ip=$2 + + ocf_log info "add $member_name ($member_ip) to the member list as learner" + out=$(podman exec "${CONTAINER}" etcdctl --endpoints="https://$(attribute_node_ip get):2379" member add "$member_name" --peer-urls="https://$member_ip:2380" --learner) + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not add $member_name as learner, error code: $rc" + return $rc + fi + ocf_log info "$out" + + attribute_learner_node update "$member_name" + return $? +} + +set_force_new_cluster() +{ + local rc + crm_attribute --lifetime reboot --node "$NODENAME" --name "force_new_cluster" --update "$NODENAME" + rc=$?
+ if [ $rc -ne 0 ]; then + ocf_log err "could not set force_new_cluster attribute to $NODENAME" + fi + return $rc +} + +get_force_new_cluster() +{ + crm_attribute --lifetime reboot --query --name "force_new_cluster" | awk -F"value=" '{print $2}' +} + +clear_force_new_cluster() +{ + local force_new_cluster_node + + force_new_cluster_node=$(get_force_new_cluster) + if [ -z "$force_new_cluster_node" ]; then + ocf_log info "$NODENAME: force_new_cluster attribute not set" + return $OCF_SUCCESS + fi + + # only the holder of "force_new_cluster" attribute can delete it + if [ "$NODENAME" = "$force_new_cluster_node" ]; then + crm_attribute --lifetime reboot --name "force_new_cluster" --delete + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not clear force_new_cluster attribute, error code: $rc" + else + ocf_log info "$NODENAME: force_new_cluster attribute cleared" + fi + return $rc + else + ocf_log info "$NODENAME does not hold force_new_cluster ($force_new_cluster_node has it)" + return $OCF_SUCCESS + fi +} + +is_force_new_cluster() +{ + # Return 0 if 'force_new_cluster' is set and the value matches the current node name, 1 otherwise. + local value + + value=$(get_force_new_cluster) + if [ -z "$value" ]; then + ocf_log debug "force_new_cluster attribute is not set" + return 1 + fi + + if [ "$value" = "$NODENAME" ]; then + ocf_log debug "$NODENAME has force_new_cluster set" + return 0 + fi + + ocf_log info "force_new_cluster attribute set on peer node $value" + return 1 +} + +is_standalone() +{ + local standalone_node + + standalone_node=$(get_standalone_node) + if [ -z "$standalone_node" ]; then + ocf_log debug "no node running standalone" + return 1 + fi + + if [ "$NODENAME" = "$standalone_node" ]; then + ocf_log debug "$NODENAME is set as standalone" + return 0 + fi + ocf_log debug "$NODENAME is set as learner" + return 1 + +} + +set_standalone_node() +{ + local rc + + ocf_log info "add $NODENAME as standalone" + crm_attribute --name "standalone_node" --update "$NODENAME" + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not set standalone_node attribute to $NODENAME" + fi + return $rc +} + +get_standalone_node() +{ + crm_attribute --query --name "standalone_node" | awk -F"value=" '{print $2}' +} + +clear_standalone_node() +{ + crm_attribute --name "standalone_node" --delete +} + +clear_standalone_and_learner_if_not_learners() +{ + local rc + local member_list_json="$1" + + number_of_members=$(printf "%s" "$member_list_json" | jq -r ".members[].ID" | wc -l) + if [ "$number_of_members" -ne 2 ]; then + ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_members members, need 2" + return $OCF_SUCCESS + fi + + id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID") + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not get isLearner field from member list, error code: $rc" + return $rc + fi + + if [ -z "$id" ]; then + clear_standalone_node + rc=$? + if [ $rc -ne 0 ]; then + ocf_og error "could not clear standalone_node attribute, error code: $rc" + return $rc + fi + fi + if [ -z "$id" ]; then + attribute_learner_node clear + rc=$? + if [ $rc -ne 0 ]; then + ocf_og error "could not clear learner_node attribute, error code: $rc" + return $rc + fi + fi + + return $rc +} + +attribute_learner_node() +{ + local action="$1" + local value="$2" + local attribute="learner_node" + + case "$action" in + get) + crm_attribute --query --name "$attribute" | awk -F"value=" '{print $2}' + ;; + update) + if ! 
crm_attribute --name "$attribute" --update "$value"; then + rc="$?" + ocf_log err "could not set $attribute to $value, error code: $rc" + return "$rc" + fi + ;; + clear) + crm_attribute --name "$attribute" --delete + ;; + *) + ocf_log err "unsupported $action for $attribute" + return $OCF_ERR_GENERIC + ;; + esac +} + +is_learner() +{ + if [ "$NODENAME" = "$(attribute_learner_node get)" ]; then + return 0 + fi + return 1 +} + +get_peer_node_name() { + crm_node -l | awk '{print $2}' | grep -v "$NODENAME" +} + +get_all_etcd_endpoints() { + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do + name=$(echo "$node" | awk -F":" '{print $1}') + ip=$(echo "$node" | awk -F":" '{print $2}') + if [ -z "$name" ] || [ -z "$ip" ]; then + ocf_exit_reason "name or ip missing for 1 or more nodes" + exit $OCF_ERR_CONFIGURED + fi + + [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379" + done + echo "$ALL_ETCD_ENDPOINTS" +} + +get_endpoint_status_json() +{ + # Get the status of all endpoints + local all_etcd_endpoints + + all_etcd_endpoints=$(get_all_etcd_endpoints) + podman exec "${CONTAINER}" etcdctl endpoint status --endpoints="$all_etcd_endpoints" -w json +} + +get_member_list_json() { + # Get the list of members visible to the current node + local this_node_endpoint + + this_node_endpoint="https://$(attribute_node_ip get):2379" + podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json +} + +check_peers() +{ + # Check peers endpoint status and locally accessible member list + local member_list_json + + if ! container_exists; then + # we need a running container to execute etcdctl. + return $OCF_SUCCESS + fi + + member_list_json=$(get_member_list_json) + rc=$? + ocf_log debug "member list: $member_list_json" + if [ $rc -ne 0 ]; then + ocf_log info "podman failed to get member list, error code: $rc" + + endpoint_status_json=$(get_endpoint_status_json) + ocf_log info "endpoint status: $endpoint_status_json" + + count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) + if [ "$count_endpoints" -eq 1 ]; then + ocf_log info "one endpoint only: checking status errors" + endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors") + if echo "$endpoint_status_errors" | grep -q "no leader"; then + set_force_new_cluster + set_standalone_node + ocf_exit_reason "$NODENAME must force a new cluster" + return $OCF_ERR_GENERIC + fi + if [ "$endpoint_status_errors" != "null" ]; then + ocf_log err "unmanaged endpoint status error: $endpoint_status_errors" + fi + fi + + return $OCF_SUCCESS + fi + + # Example of .members[] instance fields in member list json format: + # NOTE that "name" is present in voting members only, while "isLearner" in learner members only + # and the value is always true (not a string) in that case. + # { + # "ID": , + # "name": "", + # "peerURLs": [ + # "https://:2380" + # ], + # "clientURLs": [ + # "https://:2379" + # ] + # } + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do + name=$(echo "$node" | awk -F":" '{print $1}') + # do not check itself + if [ "$name" = "$NODENAME" ]; then + continue + fi + + # Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name. 
+ ip=$(echo "$node" | awk -F":" '{print $2}') + id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID") + if [ -z "$id" ]; then + ocf_log info "$name is not in the members list" + add_member_as_learner "$name" "$ip" + set_standalone_node + else + ocf_log debug "$name is in the members list by IP: $ip" + clear_standalone_and_learner_if_not_learners "$member_list_json" + fi + done + return $OCF_SUCCESS +} + +podman_simple_status() +{ + local rc + + # simple status is implemented via podman exec + # everything besides success is considered "not running" + monitor_cmd_exec + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + rc=$OCF_NOT_RUNNING; + fi + return $rc +} + +podman_monitor() +{ + # We rely on running podman exec to monitor the container + # state because that command seems to be less prone to + # performance issue under IO load. + # + # For probes to work, we expect cmd_exec to be able to report + # when a container is not running. Here, we're not interested + # in distinguishing whether it's stopped or non existing + # (there's function container_exists for that) + monitor_cmd_exec + rc=$? + if [ $rc -ne 0 ]; then + return $rc + fi + + if is_learner; then + ocf_log info "$NODENAME is learner. Cannot get member id" + return "$OCF_SUCCESS" + fi + # Failing to cache data and check member list should not cause the + # monitor operation to fail. + # TODO: move this inside check_peers where we already query member list json + attribute_node_member_id update + if ! check_peers; then + return $OCF_ERR_GENERIC + fi + + # node revision comes from the disk, so if it is not available is a fatal failure + attribute_node_revision update + return $? +} + +podman_create_mounts() { + oldIFS="$IFS" + IFS="," + for directory in $OCF_RESKEY_mount_points; do + mkdir -p "$directory" + done + IFS="$oldIFS" +} + +podman_container_id() +{ + # Retrieve the container ID by doing a "podman ps" rather than + # a "podman inspect", because the latter has performance issues + # under IO load. + # We could have run "podman start $CONTAINER" to get the ID back + # but if the container is stopped, the command will return a + # name instead of a container ID. This would break us. + podman ps --no-trunc --format '{{.ID}} {{.Names}}' | grep -F -w -m1 "$CONTAINER" | cut -d' ' -f1 +} + + +create_transient_drop_in_dependency() +{ + local cid=$1 + local rc=$OCF_SUCCESS + + if [ -z "$cid" ]; then + ocf_exit_reason "Container ID not found for \"$CONTAINER\". Not creating drop-in dependency" + return $OCF_ERR_GENERIC + fi + + ocf_log info "Creating drop-in dependency for \"$CONTAINER\" ($cid)" + for scope in "libpod-$cid.scope.d" "libpod-conmon-$cid.scope.d"; do + if [ $rc -eq $OCF_SUCCESS ] && [ ! -d /run/systemd/transient/"$scope" ]; then + mkdir -p /run/systemd/transient/"$scope" && \ + printf "[Unit]\nBefore=pacemaker.service" > /run/systemd/transient/"$scope"/dep.conf && \ + chmod ago+r /run/systemd/transient/"$scope" /run/systemd/transient/"$scope"/dep.conf + rc=$? + fi + done + + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_log err "Could not create drop-in dependency for \"$CONTAINER\" ($cid)" + else + systemctl daemon-reload + rc=$? 
+ if [ $rc -ne $OCF_SUCCESS ]; then + ocf_log err "Could not refresh service definition after creating drop-in for \"$CONTAINER\"" + fi + fi + + return $rc +} + + +run_new_container() +{ + local opts=$1 + local image=$2 + local cmd=$3 + local rc + + ocf_log info "running container $CONTAINER for the first time" + out=$(podman run $opts $image $cmd 2>&1) + rc=$? + + if [ -n "$out" ]; then + out="$(echo "$out" | tr -s ' \t\r\n' ' ')" + if [ $rc -eq 0 ]; then + ocf_log info "$out" + else + ocf_log err "$out" + fi + fi + + if [ $rc -eq 125 ]; then + # If an internal podman error occurred, it might be because + # the internal storage layer still references an old container + # with the same name, even though podman itself thinks there + # is no such container. If so, purge the storage layer to try + # to clean the corruption and try again. + if echo "$out" | grep -q "unknown.*flag"; then + ocf_exit_reason "$out" + return $rc + fi + + ocf_log warn "Internal podman error while creating new container $CONTAINER. Retrying." + ocf_run podman rm --storage "$CONTAINER" + ocf_run podman run $opts $image $cmd + rc=$? + elif [ $rc -eq 127 ]; then + # rhbz#1972209: podman 3.0.x seems to be hit by a race + # where the cgroup is not yet set up properly when the OCI + # runtime configures the container. If that happens, recreate + # the container as long as we get the same error code or + # until start timeout preempts us. + while [ $rc -eq 127 ] && (echo "$out" | grep -q "cgroup.*scope not found") ; do + ocf_log warn "Internal podman error while assigning cgroup. Retrying." + # Arbitrary sleep to prevent consuming all CPU while looping + sleep 1 + podman rm -f "$CONTAINER" + out=$(podman run $opts $image $cmd 2>&1) + rc=$? + done + # Log the created container ID if it succeeded + if [ $rc -eq 0 ]; then + ocf_log info "$out" + fi + fi + + return $rc +} + +compare_revision() +{ + # Compare local revision (from disk) against peer revision (from CIB). + # returns "older", "equal" or "newer" + local revision + local peer_node_name + local peer_revision + + revision=$(attribute_node_revision get) + peer_revision=$(attribute_node_revision_peer) + + if [ "$revision" = "" ] || [ "$revision" = "null" ] || [ "$peer_revision" = "" ] || [ "$peer_revision" = "null" ]; then + ocf_log err "could not compare revisions: $NODENAME local revision: $revision, peer revision: $peer_revision" + return "$OCF_ERR_GENERIC" + fi + + if [ "$revision" -gt "$peer_revision" ]; then + ocf_log info "$NODENAME revision: $revision is newer than peer revision: $peer_revision" + echo "newer" + elif [ "$revision" -eq "$peer_revision" ]; then + ocf_log info "$NODENAME revision: $revision is equal to peer revision: $peer_revision" + echo "equal" + else + ocf_log info "$NODENAME revision: $revision is older than peer revision: $peer_revision" + echo "older" + fi + return "$OCF_SUCCESS" +} + +ensure_pod_manifest_exists() +{ + local wait_timeout_sec=$((10 * 60)) + local poll_interval_sec=5 + local poll_retries=$((wait_timeout_sec/poll_interval_sec)) + + for try in $(seq "$poll_retries"); do + if [ -f "$OCF_RESKEY_pod_manifest" ]; then + ocf_log info "pod manifest ($OCF_RESKEY_pod_manifest) found" + break + fi + ocf_log debug "pod manifest ($OCF_RESKEY_pod_manifest) does not exist yet: retry in $poll_interval_sec seconds." + sleep "$poll_interval_sec" + done + + if [ ! -f "$OCF_RESKEY_pod_manifest" ]; then + ocf_log err "pod manifest ($OCF_RESKEY_pod_manifest) still missing after $wait_timeout_sec seconds." 
+ return "$OCF_ERR_CONFIGURED" + fi + + return "$OCF_SUCCESS" +} + +podman_start() +{ + local cid + local rc + local etcd_pod_wait_timeout_sec=$((10 * 60)) + local etcd_pod_poll_interval_sec=10 + local etcd_pod_poll_retries=$((etcd_pod_wait_timeout_sec/etcd_pod_poll_interval_sec)) + local pod_was_running=false + + ocf_log notice "podman-etcd start" + attribute_node_ip update + attribute_node_cluster_id update + attribute_node_revision update + + # ensure the etcd pod is not running before starting the container + ocf_log info "ensure etcd pod is not running (retries: $etcd_pod_poll_retries, interval: $etcd_pod_poll_interval_sec)" + for try in $(seq $etcd_pod_poll_retries); do + if ! etcd_pod_container_exists; then + break + fi + ocf_log info "etcd pod running: retry in $etcd_pod_poll_interval_sec seconds." + pod_was_running=true + sleep $etcd_pod_poll_interval_sec + done + if etcd_pod_container_exists; then + ocf_exit_reason "etcd pod is still running after $etcd_pod_wait_timeout_sec seconds." + return $OCF_ERR_GENERIC + fi + + if ! ensure_pod_manifest_exists; then + ocf_exit_reason "could not find etcd pod manifest ($OCF_RESKEY_pod_manifest)" + return "$OCF_ERR_GENERIC" + fi + + # force-new-cluster property is a runtime-scoped flag that instructs the agent to force a new cluster-of-1. + # Since this attribute is configured with a reboot-lifetime, it is automatically cleared when the machine reboots. + # If the agent detects during its start that this property is set, it indicates that the flag was explicitly set + # during the current node boot session, implying a deliberate request to recover the cluster. + if ocf_is_true "$pod_was_running"; then + ocf_log info "static pod was running: start normally" + else + if is_force_new_cluster; then + ocf_log notice "$NODENAME marked to force-new-cluster" + else + # When the local agent starts, we can infer the cluster state by counting + # how many agents are starting or already active: + # - 1 active agent: it's the peer (we are just starting) + # - 0 active agents, 1 starting: we are starting; the peer is not starting + # - 0 active agents, 2 starting: both agents are starting simultaneously + local active_resources_count + active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w) + case "$active_resources_count" in + 1) + if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then + ocf_log info "peer active but in learner mode: start normally" + else + ocf_log info "peer is active standalone: joining as learner" + JOIN_AS_LEARNER=true + fi + ;; + 0) + # we need to compare the revisions in any of the following branches + # so call the function only once here + if ! revision_compare_result=$(compare_revision); then + ocf_log err "could not compare revisions, error code: $?" + return "$OCF_ERR_GENERIC" + fi + + # count how many agents are starting now + local start_resources_count + start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w) + + case "$start_resources_count" in + 1) + ocf_log debug "peer not starting: ensure we can start a new cluster" + if [ "$revision_compare_result" != "older" ]; then + # If our revision is the same as or newer than the peer's last saved + # revision, and the peer agent isn't currently starting, we can + # restore e-quorum by forcing a new cluster. 
+ set_force_new_cluster + else + ocf_log err "local revision is older and peer is not starting: cannot start" + ocf_exit_reason "local revision is older and peer is not starting: cannot start" + return "$OCF_ERR_GENERIC" + fi + ;; + 2) + ocf_log info "peer starting" + if [ "$revision_compare_result" = "newer" ]; then + set_force_new_cluster + elif [ "$revision_compare_result" = "older" ]; then + ocf_log info "$NODENAME shall join as learner" + JOIN_AS_LEARNER=true + else + if [ "$(attribute_node_cluster_id get)" = "$(attribute_node_cluster_id_peer)" ]; then + ocf_log info "same cluster_id and revision: start normal" + else + ocf_exit_reason "same revision but different cluster id" + return "$OCF_ERR_GENERIC" + fi + fi + ;; + *) + ocf_log err "Unexpected start resource count: $start_resources_count" + podman_notify + return "$OCF_ERR_GENERIC" + ;; + esac + ;; + *) + ocf_log err "Unexpected active resource count: $active_resources_count" + podman_notify + return "$OCF_ERR_GENERIC" + ;; + esac + fi + fi + + podman_create_mounts + local run_opts="-d --name=${CONTAINER}" + # check to see if the container has already started + podman_simple_status + if [ $? -eq $OCF_SUCCESS ]; then + return "$OCF_SUCCESS" + fi + + if ocf_is_true "$JOIN_AS_LEARNER"; then + local wait_timeout_sec=$((10*60)) + local poll_interval_sec=5 + local retries=$(( wait_timeout_sec / poll_interval_sec )) + + ocf_log info "ensure the leader node added $NODENAME as learner member before continuing (timeout: $wait_timeout_sec seconds)" + for try in $(seq $retries); do + learner_node=$(attribute_learner_node get) + if [ "$NODENAME" != "$learner_node" ]; then + ocf_log info "$learner_node is not in the member list yet. Retry in $poll_interval_sec seconds." + sleep $poll_interval_sec + continue + fi + ocf_log info "learner node $learner_node in the member list" + break + done + if [ "$NODENAME" != "$(attribute_learner_node get)" ]; then + ocf_log err "wait for $NODENAME to be in the member list timed out" + return "$OCF_ERR_GENERIC" + fi + + archive_data_folder + fi + + prepare_env + + # add etcd-specific opts + run_opts="$run_opts \ + --network=host \ + -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \ + -v /var/lib/etcd:/var/lib/etcd \ + --env ALL_ETCD_ENDPOINTS=$ALL_ETCD_ENDPOINTS \ + --env ETCD_CIPHER_SUITES=$ETCD_CIPHER_SUITES \ + --env ETCD_DATA_DIR=$ETCD_DATA_DIR \ + --env ETCD_ELECTION_TIMEOUT=$ETCD_ELECTION_TIMEOUT \ + --env ETCD_ENABLE_PPROF=$ETCD_ENABLE_PPROF \ + --env ETCD_EXPERIMENTAL_MAX_LEARNERS=$ETCD_EXPERIMENTAL_MAX_LEARNERS \ + --env ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION \ + --env ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL \ + --env ETCD_HEARTBEAT_INTERVAL=$ETCD_HEARTBEAT_INTERVAL \ + --env ETCD_INITIAL_CLUSTER=$ETCD_INITIAL_CLUSTER \ + --env ETCD_INITIAL_CLUSTER_STATE=$ETCD_INITIAL_CLUSTER_STATE \ + --env ETCD_NAME=$NODENAME \ + --env ETCD_QUOTA_BACKEND_BYTES=$ETCD_QUOTA_BACKEND_BYTES \ + --env ETCD_SOCKET_REUSE_ADDRESS=$ETCD_SOCKET_REUSE_ADDRESS \ + --env ETCDCTL_API=$ETCDCTL_API \ + --env ETCDCTL_CACERT=$SERVER_CACERT \ + --env ETCDCTL_CERT=$ETCD_PEER_CERT \ + --env ETCDCTL_KEY=$ETCD_PEER_KEY \ + --authfile=$OCF_RESKEY_authfile \ + --security-opt label=disable" + if [ -n "$OCF_RESKEY_run_opts" ]; then + run_opts="$run_opts $OCF_RESKEY_run_opts" + fi + + OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --logger=zap \ + --log-level=info \ + --experimental-initial-corrupt-check=true \ + 
--snapshot-count=10000 \ + --initial-advertise-peer-urls=https://${NODEIP}:2380 \ + --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \ + --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \ + --trusted-ca-file=$SERVER_CACERT \ + --client-cert-auth=true \ + --peer-cert-file=$ETCD_PEER_CERT \ + --peer-key-file=$ETCD_PEER_KEY \ + --peer-trusted-ca-file=$SERVER_CACERT \ + --peer-client-cert-auth=true \ + --advertise-client-urls=https://${NODEIP}:2379 \ + --listen-client-urls=https://${LISTEN_CLIENT_URLS}:2379,unixs://${NODEIP}:0 \ + --listen-peer-urls=https://${LISTEN_PEER_URLS}:2380 \ + --metrics=extensive \ + --listen-metrics-urls=https://${LISTEN_METRICS_URLS}:9978" + if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then + OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts" + fi + + if is_force_new_cluster; then + OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --force-new-cluster" + fi + + if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then + # no container image provided via input parameters. Read it from the pod manifest. + OCF_RESKEY_image=$(jq -r '.spec.containers[] | select( .name=="etcd").image' "$OCF_RESKEY_pod_manifest") + ocf_log info "using container image ($OCF_RESKEY_image) from Pod manifest ($OCF_RESKEY_pod_manifest)" + else + # use the container image provided as input parameter + ocf_log info "using container image ($OCF_RESKEY_image) via input parameters" + fi + + if [ $REQUIRE_IMAGE_PULL -eq 1 ]; then + ocf_log notice "Beginning pull of image, ${OCF_RESKEY_image}" + if ! podman pull --authfile="$OCF_RESKEY_authfile" "${OCF_RESKEY_image}"; then + ocf_exit_reason "failed to pull image ${OCF_RESKEY_image}" + return $OCF_ERR_GENERIC + fi + else + ocf_log notice "Pull image not required, ${OCF_RESKEY_image}" + fi + + if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then + ocf_log info "starting existing container $CONTAINER." + ocf_run podman start "$CONTAINER" + else + # make sure any previous container matching our container name is cleaned up first. + # we already know at this point it wouldn't be running + remove_container + run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd" + if [ $? -eq 125 ]; then + return $OCF_ERR_GENERIC + fi + fi + rc=$? + + # if the container was stopped or didn't exist before, systemd + # removed the libpod* scopes. So always try to recreate the drop-ins + if [ $rc -eq 0 ] && ocf_is_true "$OCF_RESKEY_drop_in_dependency"; then + cid=$(podman_container_id) + create_transient_drop_in_dependency "$cid" + rc=$? + fi + + if [ $rc -ne 0 ]; then + ocf_exit_reason "podman failed to launch container (error code: $rc)" + return $OCF_ERR_GENERIC + fi + + # wait for monitor to pass before declaring that the container is started + while true; do + podman_simple_status + if [ $? -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Newly created podman container exited after start" + ocf_run podman logs --tail 20 "${CONTAINER}" + return $OCF_ERR_GENERIC + fi + + monitor_cmd_exec + if [ $? 
-eq $OCF_SUCCESS ]; then + ocf_log notice "Container $CONTAINER started successfully" + if is_force_new_cluster; then + clear_force_new_cluster + + local peer_node_name + local peer_node_ip + peer_node_name="$(get_peer_node_name)" + peer_node_ip="$(attribute_node_ip_peer)" + if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then + add_member_as_learner "$peer_node_name" "$peer_node_ip" + else + ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})" + fi + fi + return $OCF_SUCCESS + fi + + ocf_exit_reason "waiting on monitor_cmd to pass after start" + sleep 1 + done +} + +podman_stop() +{ + local timeout=60 + local rc + podman_simple_status + if [ $? -eq $OCF_NOT_RUNNING ]; then + remove_container + ocf_log info "could not leave members list: etcd container not running" + return $OCF_SUCCESS + fi + + attribute_node_revision update + attribute_node_cluster_id update + + if ! member_id=$(attribute_node_member_id get); then + ocf_log err "error leaving members list: could not get member-id" + else + # TODO: is it worth/possible to check the current status instead than relying on cached attributes? + if is_standalone; then + ocf_log info "last member. Not leaving the member list" + else + ocf_log info "leaving members list as member with ID $member_id" + endpoint="https://$(attribute_node_ip get):2379" + if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then + rc=$? + ocf_log err "error leaving members list, error code: $rc" + fi + fi + fi + attribute_node_member_id clear + + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + timeout=$(((OCF_RESKEY_CRM_meta_timeout/1000) -10 )) + if [ $timeout -lt 10 ]; then + timeout=10 + fi + fi + + if ocf_is_true "$OCF_RESKEY_force_kill"; then + ocf_run podman kill "$CONTAINER" + rc=$? + else + ocf_log debug "waiting $timeout second[s] before killing container" + ocf_run podman stop -t="$timeout" "$CONTAINER" + rc=$? + # on stop, systemd will automatically delete any transient + # drop-in conf that has been created earlier + fi + + if [ $rc -ne 0 ]; then + # If the stop failed, it could be because the controlling conmon + # process died unexpectedly. If so, a generic error code is returned + # but the associated container exit code is -1. If that's the case, + # assume there's no failure and continue with the rm as usual. + if [ $rc -eq 125 ] && \ + podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' "$CONTAINER" | grep -Eq '^(exited|stopped):-1$'; then + ocf_log err "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway." + else + ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." + return $OCF_ERR_GENERIC + fi + fi + + if ! remove_container; then + ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +image_exists() +{ + if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then + # the actual container image was not defined yet. Nor by + # the user via OCF_RESKEY, nor by reading the Pod manifest + return 0 + fi + if podman image exists "${OCF_RESKEY_image}"; then + # image found + return 0 + fi + + if ocf_is_true "$OCF_RESKEY_allow_pull"; then + REQUIRE_IMAGE_PULL=1 + ocf_log notice "Image (${OCF_RESKEY_image}) does not exist locally but will be pulled during start" + return 0 + fi + # image not found. 
+ return 1 +} + +podman_validate() +{ + check_binary curl + check_binary crictl + check_binary oc + check_binary podman + check_binary jq + + if [ -z "$OCF_RESKEY_node_ip_map" ]; then + ocf_exit_reason "'node_ip_map' option is required" + exit $OCF_ERR_CONFIGURED + fi + + if [ -z "$OCF_RESKEY_pod_manifest" ]; then + ocf_exit_reason "'pod_manifest' option is required" + exit $OCF_ERR_CONFIGURED + fi + + if [ -z "$OCF_RESKEY_image" ]; then + ocf_exit_reason "'image' option is required" + exit $OCF_ERR_CONFIGURED + fi + + if ! image_exists; then + ocf_exit_reason "base image, ${OCF_RESKEY_image}, could not be found." + exit $OCF_ERR_CONFIGURED + fi + + return $OCF_SUCCESS +} + +podman_notify() +{ + ocf_log info "notify: type=${OCF_RESKEY_CRM_meta_notify_type}, operation=${OCF_RESKEY_CRM_meta_notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }" +} + +# TODO : +# When a user starts plural clones in a node in globally-unique, a user cannot appoint plural name parameters. +# When a user appoints reuse, the resource agent cannot connect plural clones with a container. + +if ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then + if [ -n "$OCF_RESKEY_name" ]; then + if [ -n "$OCF_RESKEY_CRM_meta_clone_node_max" ] && [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] + then + ocf_exit_reason "Cannot make plural clones from the same name parameter." + exit $OCF_ERR_CONFIGURED + fi + if [ -n "$OCF_RESKEY_CRM_meta_master_node_max" ] && [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] + then + ocf_exit_reason "Cannot make plural master from the same name parameter." + exit $OCF_ERR_CONFIGURED + fi + fi + : ${OCF_RESKEY_name=$(echo ${OCF_RESOURCE_INSTANCE} | tr ':' '-')} +else + : ${OCF_RESKEY_name=${OCF_RESOURCE_INSTANCE}} +fi + +CONTAINER=$OCF_RESKEY_name + +# Note: we currently monitor podman containers by with the "podman exec" +# command, so make sure that invocation is always valid by enforcing the +# exec command to be non-empty +: ${OCF_RESKEY_monitor_cmd:=/bin/true} + +# When OCF_RESKEY_drop_in_dependency is not populated, we +# look at another file-based way of enabling the option. +# Otherwise, consider it disabled. +if [ -z "$OCF_RESKEY_drop_in_dependency" ]; then + if [ -f "/etc/sysconfig/podman_drop_in" ] || \ + [ -f "/etc/default/podman_drop_in" ]; then + OCF_RESKEY_drop_in_dependency=yes + fi +fi + + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS;; +usage|help) podman_usage + exit $OCF_SUCCESS + ;; +esac + +NODENAME=$(ocf_local_nodename) +JOIN_AS_LEARNER=false + +case $__OCF_ACTION in +start) + podman_validate || exit $? + podman_start;; +stop) podman_stop;; +monitor) podman_monitor;; +notify) podman_notify;; +validate-all) podman_validate;; +*) podman_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc
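
For reviewers, a minimal configuration sketch follows; the resource id, node names, addresses and operation timeouts are illustrative assumptions, and exact pcs syntax differs between pcs versions. The agent is designed to run as a two-node clone with notifications enabled, since podman_start reads the OCF_RESKEY_CRM_meta_notify_* variables to choose between a normal start, forcing a new cluster-of-1, and joining as a learner. node_ip_map uses the documented "name1:ip1;name2:ip2" form, and the container image is taken from the pod manifest when the image parameter is left at its default.

# Hypothetical example (resource id, node names, IPs and timeouts are placeholders):
pcs resource create etcd ocf:heartbeat:podman-etcd \
    node_ip_map="master-0:192.0.2.10;master-1:192.0.2.11" nic=br-ex \
    op start timeout=300s op stop timeout=120s op monitor interval=30s timeout=60s \
    clone clone-max=2 notify=true

# The force-new-cluster marker is a reboot-scoped node attribute; it can be
# inspected with the same command the agent itself uses:
crm_attribute --lifetime reboot --query --name force_new_cluster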