diff --git a/heartbeat/Raid1 b/heartbeat/Raid1 index 4460ca1a3..7cf658b59 100755 --- a/heartbeat/Raid1 +++ b/heartbeat/Raid1 @@ -1,556 +1,556 @@ #!/bin/sh # # # License: GNU General Public License (GPL) # Support: linux-ha@lists.linux-ha.org # # Raid1 # Description: Manages a Linux software RAID device on a shared storage medium. # Original Author: Eric Z. Ayers (eric.ayers@compgen.com) # Original Release: 25 Oct 2000 # RAID patches: http://people.redhat.com/mingo/raid-patches/ # Word to the Wise: http://lwn.net/2000/0810/a/raid-faq.php3 # Sympathetic Ear: mailto:linux-raid@vger.kernel.org # # usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data} # # # EXAMPLE config file /etc/raidtab.md0 # This file must exist on both machines! # # raiddev /dev/md0 # raid-level 1 # nr-raid-disks 2 # chunk-size 64k # persistent-superblock 1 # #nr-spare-disks 0 # device /dev/sda1 # raid-disk 0 # device /dev/sdb1 # raid-disk 1 # # EXAMPLE config file /etc/mdadm.conf (for more info: man mdadm.conf) # # DEVICE /dev/sdb1 /dev/sdc1 # ARRAY /dev/md0 UUID=4a865b55:ba27ef8d:29cd5701:6fb42799 ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### usage() { cat <<-EOT usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data} EOT } meta_data() { cat < 1.0 This resource agent manages Linux software RAID (MD) devices on a shared storage medium. It uses mdadm(8) to start, stop, and monitor the MD devices. Raidtools are supported, but deprecated. See https://raid.wiki.kernel.org/index.php/Linux_Raid for more information. Manages Linux software RAID (MD) devices on shared storage The RAID configuration file, e.g. /etc/mdadm.conf. RAID config file One or more block devices to use, space separated. Alternatively, set to "auto" to manage all devices specified in raidconf. block device The value for the homehost directive; this is an mdadm feature to protect RAIDs against being activated by accident. It is recommended to create RAIDs managed by the cluster with "homehost" set to a special value, so they are not accidentally auto-assembled by nodes not supposed to own them. Homehost for mdadm If processes or kernel threads are using the array, it cannot be stopped. We will try to stop processes, first by sending TERM and then, if that doesn't help in $PROC_CLEANUP_TIME seconds, using KILL. The lsof(8) program is required to get the list of array users. Of course, the kernel threads cannot be stopped this way. If the processes are critical for data integrity, then set this parameter to false. Note that in that case the stop operation will fail and the node will be fenced. force stop processes using the array Wait until udevd creates a device in the start operation. On a normally loaded host this should happen quickly, but you may be unlucky. If you are not using udev set this to "no". udev Activating the same md RAID array on multiple nodes at the same time will result in data corruption and thus is forbidden by default. A safe example could be an array that is only named identically across all nodes, but is in fact distinct. Only set this to "true" if you know what you are doing! force ability to run as a clone END } udev_settle() { if ocf_is_true $WAIT_FOR_UDEV; then udevadm settle $* fi } list_conf_arrays() { test -f $RAIDCONF || { - ocf_log err "$RAIDCONF gone missing!" + ocf_exit_reason "$RAIDCONF gone missing!"
exit $OCF_ERR_GENERIC } grep ^ARRAY $RAIDCONF | awk '{print $2}' } forall() { local func=$1 local checkall=$2 local mddev rc=0 for mddev in $RAIDDEVS; do $func $mddev rc=$(($rc | $?)) [ "$checkall" = all ] && continue [ $rc -ne 0 ] && return $rc done return $rc } are_arrays_stopped() { local rc mddev for mddev in $RAIDDEVS; do raid1_monitor_one $mddev rc=$? [ $rc -ne $OCF_NOT_RUNNING ] && break done test $rc -eq $OCF_NOT_RUNNING } md_assemble() { local mddev=$1 $MDADM --assemble $mddev --config=$RAIDCONF $MDADM_HOMEHOST udev_settle --exit-if-exists=$mddev } # # START: Start up the RAID device # raid1_start() { local rc raid1_monitor rc=$? if [ $rc -eq $OCF_SUCCESS ]; then # md already online, nothing to do. return $OCF_SUCCESS fi if [ $rc -ne $OCF_NOT_RUNNING ]; then # If the array is in a broken state, this agent doesn't # know how to repair that. - ocf_log err "$RAIDDEVS in a broken state; cannot start (rc=$rc)" + ocf_exit_reason "$RAIDDEVS in a broken state; cannot start (rc=$rc)" return $OCF_ERR_GENERIC fi if [ $HAVE_RAIDTOOLS = "true" ]; then # Run raidstart to start up the RAID array $RAIDSTART --configfile $RAIDCONF $MDDEV else forall md_assemble all fi raid1_monitor if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS else - ocf_log err "Couldn't start RAID for $RAIDDEVS" + ocf_exit_reason "Couldn't start RAID for $RAIDDEVS" return $OCF_ERR_GENERIC fi } # # STOP: stop the RAID device # mark_readonly() { local mddev=$1 local rc ocf_log info "Attempting to mark array $mddev readonly" $MDADM --readonly $mddev --config=$RAIDCONF rc=$? if [ $rc -ne 0 ]; then - ocf_log err "Failed to set $mddev readonly (rc=$rc)" + ocf_exit_reason "Failed to set $mddev readonly (rc=$rc)" fi return $rc } mknod_raid1_stop() { # first create a block device file, then try to stop the # array local rc n tmp_block_file n=`echo $1 | sed 's/[^0-9]*//'` if ! ocf_is_decimal "$n"; then ocf_log warn "could not get the minor device number from $1" return 1 fi tmp_block_file="$HA_RSCTMP/${OCF_RESOURCE_INSTANCE}-`basename $1`" rm -f $tmp_block_file ocf_log info "block device file $1 missing, creating one in order to stop the array" mknod $tmp_block_file b 9 $n $MDADM --stop $tmp_block_file --config=$RAIDCONF --wait-clean -W rc=$? rm -f $tmp_block_file return $rc } raid1_stop_one() { ocf_log info "Stopping array $1" if [ -b "$1" ]; then $MDADM --stop $1 --config=$RAIDCONF --wait-clean -W && return else # newer mdadm releases can stop arrays when given the # basename; try that first $MDADM --stop `basename $1` --config=$RAIDCONF --wait-clean -W && return # otherwise create a block device file mknod_raid1_stop $1 fi } get_users_pids() { local mddev=$1 local outp l ocf_log debug "running lsof to list $mddev users..." outp=`lsof $mddev | tail -n +2` echo "$outp" | awk '{print $2}' | sort -u echo "$outp" | while read l; do ocf_log warn "$l" done } stop_raid_users() { local pids pids=`forall get_users_pids all | sort -u` if [ -z "$pids" ]; then ocf_log warn "lsof reported no users holding arrays" return 2 else ocf_stop_processes TERM $PROC_CLEANUP_TIME $pids fi } stop_arrays() { if [ $HAVE_RAIDTOOLS = "true" ]; then $RAIDSTOP --configfile $RAIDCONF $MDDEV else forall raid1_stop_one all fi } showusers() { local disk for disk; do if have_binary lsof; then ocf_log info "running lsof to list $disk users..." 
ocf_run -warn lsof $disk fi if [ -d /sys/block/$disk/holders ]; then ocf_log info "ls -l /sys/block/$disk/holders" ocf_run -warn ls -l /sys/block/$disk/holders fi done } raid1_stop() { local rc # See if the MD device is already cleanly stopped: if are_arrays_stopped; then return $OCF_SUCCESS fi # Turn off raid if ! stop_arrays; then if ocf_is_true $FORCESTOP; then if have_binary lsof; then stop_raid_users case $? in 2) false;; *) stop_arrays;; esac else ocf_log warn "install lsof(8) to list users holding the disk" false fi else false fi fi rc=$? if [ $rc -ne 0 ]; then ocf_log warn "Couldn't stop RAID for $RAIDDEVS (rc=$rc)" showusers $RAIDDEVS if [ $HAVE_RAIDTOOLS != "true" ]; then forall mark_readonly all fi return $OCF_ERR_GENERIC fi if are_arrays_stopped; then return $OCF_SUCCESS fi - ocf_log err "RAID $RAIDDEVS still active after stop command!" + ocf_exit_reason "RAID $RAIDDEVS still active after stop command!" return $OCF_ERR_GENERIC } # # monitor: a less noisy status # raid1_monitor_one() { local mddev=$1 local md=`echo $mddev | sed 's,/dev/,,'` local rc local TRY_READD=0 local pbsize # check if the md device exists first # but not if we are in the stop operation # device existence is important only for the running arrays if [ "$__OCF_ACTION" != "stop" -a ! -b $mddev ]; then ocf_log info "$mddev is not a block device" return $OCF_NOT_RUNNING fi if ! grep -e "^$md[ \t:]" /proc/mdstat >/dev/null ; then ocf_log info "$md not found in /proc/mdstat" return $OCF_NOT_RUNNING fi if [ $HAVE_RAIDTOOLS != "true" ]; then $MDADM --detail --test $mddev >/dev/null 2>&1 ; rc=$? case $rc in 0) ;; 1) ocf_log warn "$mddev has at least one failed device." TRY_READD=1 ;; - 2) ocf_log err "$mddev has failed." + 2) ocf_exit_reason "$mddev has failed." return $OCF_ERR_GENERIC ;; - 4) ocf_log err "mdadm failed on $mddev." + 4) ocf_exit_reason "mdadm failed on $mddev." return $OCF_ERR_GENERIC ;; - *) ocf_log err "mdadm returned an unknown result ($rc)." + *) ocf_exit_reason "mdadm returned an unknown result ($rc)." return $OCF_ERR_GENERIC ;; esac fi if [ "$__OCF_ACTION" = "monitor" -a "$OCF_RESKEY_CRM_meta_interval" != 0 \ -a $TRY_READD -eq 1 -a $OCF_CHECK_LEVEL -gt 0 ]; then ocf_log info "Attempting recovery sequence to re-add devices on $mddev:" $MDADM $mddev --fail detached $MDADM $mddev --remove failed $MDADM $mddev --re-add missing # TODO: At this stage, there's nothing to actually do # here. Either this worked or it did not. fi pbsize=`(blockdev --getpbsz $mddev || stat -c "%o" $mddev) 2>/dev/null` if [ -z "$pbsize" ]; then ocf_log warn "neither blockdev nor stat could get the block size (will use 4k)" pbsize=4096 # try with 4k fi if ! dd if=$mddev count=1 bs=$pbsize of=/dev/null \ iflag=direct >/dev/null 2>&1 ; then - ocf_log err "$mddev: I/O error on read" + ocf_exit_reason "$mddev: I/O error on read" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } raid1_monitor() { forall raid1_monitor_one } # # STATUS: is the raid device online or offline? # raid1_status() { # See if the MD device is online local rc raid1_monitor rc=$?
if [ $rc -ne $OCF_SUCCESS ]; then echo "stopped" else echo "running" fi return $rc } raid1_validate_all() { return $OCF_SUCCESS } PROC_CLEANUP_TIME=3 if ( [ $# -ne 1 ] ) then usage exit $OCF_ERR_ARGS fi case "$1" in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; *) ;; esac RAIDCONF="$OCF_RESKEY_raidconf" MDDEV="$OCF_RESKEY_raiddev" FORCESTOP="${OCF_RESKEY_force_stop:-1}" WAIT_FOR_UDEV="${OCF_RESKEY_udev:-1}" if [ -z "$RAIDCONF" ] ; then - ocf_log err "Please set OCF_RESKEY_raidconf!" + ocf_exit_reason "Please set OCF_RESKEY_raidconf!" exit $OCF_ERR_CONFIGURED fi if [ ! -r "$RAIDCONF" ] ; then - ocf_log err "Configuration file [$RAIDCONF] does not exist, or can not be opend!" + ocf_exit_reason "Configuration file [$RAIDCONF] does not exist, or cannot be opened!" exit $OCF_ERR_INSTALLED fi if [ -z "$MDDEV" ] ; then - ocf_log err "Please set OCF_RESKEY_raiddev to the Raid device you want to control!" + ocf_exit_reason "Please set OCF_RESKEY_raiddev to the RAID device you want to control!" exit $OCF_ERR_CONFIGURED fi if ocf_is_clone && ! ocf_is_true "$OCF_RESKEY_force_clones"; then - ocf_log err "md RAID arrays are NOT safe to run as a clone!" + ocf_exit_reason "md RAID arrays are NOT safe to run as a clone!" ocf_log err "Please read the comment on the force_clones parameter." exit $OCF_ERR_CONFIGURED fi if ocf_is_true $WAIT_FOR_UDEV && ! have_binary udevadm; then if [ "$__OCF_ACTION" = "start" ]; then ocf_log warn "either install udevadm or set udev to false" ocf_log info "setting udev to false!" fi WAIT_FOR_UDEV=0 fi if ! ocf_is_true $WAIT_FOR_UDEV; then export MDADM_NO_UDEV=1 fi if ocf_is_true $FORCESTOP && ! have_binary lsof; then ocf_log warn "Please install lsof(8), we may need it when stopping the RAID device! Now continuing anyway ..." fi HAVE_RAIDTOOLS=false if have_binary $MDADM >/dev/null 2>&1 ; then if [ -n "$OCF_RESKEY_homehost" ]; then MDADM_HOMEHOST="--homehost=${OCF_RESKEY_homehost}" else MDADM_HOMEHOST="" fi else check_binary $RAIDSTART HAVE_RAIDTOOLS=true fi if [ $HAVE_RAIDTOOLS = true ]; then if [ "$MDDEV" = "auto" ]; then - ocf_log err "autoconf supported only with mdadm!" + ocf_exit_reason "autoconf supported only with mdadm!" exit $OCF_ERR_INSTALLED elif [ `echo $MDDEV|wc -w` -gt 1 ]; then - ocf_log err "multiple devices supported only with mdadm!" + ocf_exit_reason "multiple devices supported only with mdadm!" exit $OCF_ERR_INSTALLED fi fi if [ "$MDDEV" = "auto" ]; then RAIDDEVS=`list_conf_arrays` else RAIDDEVS="$MDDEV" fi # At this stage, # [ $HAVE_RAIDTOOLS = false ] <=> we have $MDADM, # otherwise we have raidtools (raidstart and raidstop) # Look for how we are called case "$1" in start) raid1_start ;; stop) raid1_stop ;; status) raid1_status ;; monitor) raid1_monitor ;; validate-all) raid1_validate_all ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/iscsi b/heartbeat/iscsi index 36ed90879..ef0236e47 100755 --- a/heartbeat/iscsi +++ b/heartbeat/iscsi @@ -1,514 +1,514 @@ #!/bin/sh # # iSCSI OCF resource agent # Description: manage iSCSI disks (add/remove) using open-iscsi # # Copyright Dejan Muhamedagic # (C) 2007 Novell Inc. All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation.
# # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # See usage() and meta_data() below for more details... # # OCF instance parameters: # OCF_RESKEY_portal: the iSCSI portal address or host name (required) # OCF_RESKEY_target: the iSCSI target (required) # OCF_RESKEY_iscsiadm: iscsiadm program path (optional) # OCF_RESKEY_discovery_type: discovery type (optional; default: sendtargets) # OCF_RESKEY_try_recovery: wait for iSCSI recovery in monitor (optional; default: false) # # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_udev_default="yes" OCF_RESKEY_iscsiadm_default="iscsiadm" OCF_RESKEY_discovery_type_default="sendtargets" OCF_RESKEY_try_recovery_default="false" : ${OCF_RESKEY_udev=${OCF_RESKEY_udev_default}} : ${OCF_RESKEY_iscsiadm=${OCF_RESKEY_iscsiadm_default}} : ${OCF_RESKEY_discovery_type=${OCF_RESKEY_discovery_type_default}} usage() { methods=`iscsi_methods` methods=`echo $methods | tr ' ' '|'` cat < 1.0 OCF Resource Agent for iSCSI. Add (start) or remove (stop) iSCSI targets. Manages a local iSCSI initiator and its connections to iSCSI targets The iSCSI portal address in the form: {ip_address|hostname}[":"port] Portal address The iSCSI target IQN. Target IQN Target discovery type. Check the open-iscsi documentation for supported discovery types. Target discovery type open-iscsi administration utility binary. iscsiadm binary If the next resource depends on the udev creating a device then we wait until it is finished. On a normally loaded host this should be done quickly, but you may be unlucky. If you are not using udev set this to "no", otherwise we will spin in a loop until a timeout occurs. udev If the iSCSI session exists but is currently inactive/broken, which is most probably due to network problems, the iSCSI layer will try to recover. If this parameter is set to true, we'll wait for the recovery to succeed. In that case the monitor operation can only time out so you should set the monitor op timeout attribute appropriately. On error wait for iSCSI recovery in monitor EOF } iscsi_methods() { cat <= "2.0-872" changed discovery semantics # see http://www.mail-archive.com/open-iscsi@googlegroups.com/msg04883.html # there's a new discoverydb command which should be used instead discovery open_iscsi_discovery() { local output local discovery_variant="discovery" local options="" local cmd local version=`$iscsiadm --version | awk '{print $3}'` ocf_version_cmp "$version" "2.0-871" if [ $? -eq 2 ]; then # newer than 2.0-871? discovery_variant="discoverydb" [ "$discovery_type" = "sendtargets" ] && options="-D" fi cmd="$iscsiadm -m $discovery_variant -p $OCF_RESKEY_portal -t $discovery_type $options" output=`$cmd` if [ $? 
-ne 0 -o x = "x$output" ]; then [ x != "x$output" ] && { - ocf_log err "$cmd FAILED" + ocf_exit_reason "$cmd FAILED" echo "$output" } return 3 fi PORTAL=`echo "$output" | awk -v target="$OCF_RESKEY_target" ' $NF==target{ if( NF==3 ) portal=$2; # sles compat mode else portal=$1; sub(",.*","",portal); print portal; }'` case `echo "$PORTAL" | wc -w` in 0) #target not found echo "$output" - ocf_log err "target $OCF_RESKEY_target not found at portal $OCF_RESKEY_portal" + ocf_exit_reason "target $OCF_RESKEY_target not found at portal $OCF_RESKEY_portal" return 1 ;; 1) #we're ok return 0 ;; *) # handle multihome hosts reporting multiple portals for p in $PORTAL; do if [ "$OCF_RESKEY_portal" = "$p" ]; then PORTAL="$OCF_RESKEY_portal" return 0 fi done echo "$output" - ocf_log err "sorry, can't handle multihomed hosts unless you specify the portal exactly" + ocf_exit_reason "sorry, can't handle multihomed hosts unless you specify the portal exactly" return 2 ;; esac } open_iscsi_add() { $iscsiadm -m node -p $1 -T $2 -l } open_iscsi_get_session_id() { local target="$1" $iscsiadm -m session 2>/dev/null | grep -E "$target($|[[:space:]])" | awk '{print $2}' | tr -d '[]' } open_iscsi_remove() { local target="$1" local session_id session_id=`open_iscsi_get_session_id "$target"` if [ "$session_id" ]; then $iscsiadm -m session -r $session_id -u else - ocf_log err "cannot find session id for target $target" + ocf_exit_reason "cannot find session id for target $target" return 1 fi } # open_iscsi_monitor return codes: # 0: target running (logged in) # 1: target not running and target record exists # 2: iscsiadm -m session error (unexpected) # 3: target record does not exist (discovery necessary) # open_iscsi_monitor() { local target="$1" local session_id conn_state outp local prev_state local recov recov=${2:-$OCF_RESKEY_try_recovery} session_id=`open_iscsi_get_session_id "$target"` prev_state="" if [ -z "$session_id" ]; then if $iscsiadm -m node -p $OCF_RESKEY_portal -T $target >/dev/null 2>&1; then return 1 # record found else return 3 fi fi while :; do outp=`$iscsiadm -m session -r $session_id -P 1` || return 2 conn_state=`echo "$outp" | sed -n '/Connection State/s/.*: //p'` # some drivers don't return connection state, in that case # we'll assume that we're still connected case "$conn_state" in "LOGGED IN") [ -n "$msg_logged" ] && ocf_log info "connection state $conn_state. Session restored." return 0;; "Unknown"|"") # this is also probably OK [ -n "$msg_logged" ] && ocf_log info "connection state $conn_state. Session restored." return 0;; *) # failed if [ "$__OCF_ACTION" != stop ] && ! ocf_is_probe && ocf_is_true $recov; then if [ "$conn_state" != "$prev_state" ]; then ocf_log warning "connection state $conn_state, waiting for recovery..." prev_state="$conn_state" fi sleep 1 else - ocf_log err "iscsiadm output: $outp" + ocf_exit_reason "iscsiadm output: $outp" return 2 fi ;; esac done } disk_discovery() { discovery_type=${OCF_RESKEY_discovery_type} $discovery # discover and setup the real portal string (address) case $? in 0) ;; 1|2) exit $OCF_ERR_GENERIC ;; 3) if ! is_iscsid_running; then [ $setup_rc -eq 1 ] && ocf_log warning "iscsid.startup probably not correctly set in /etc/iscsi/iscsid.conf" exit $OCF_ERR_INSTALLED fi exit $OCF_ERR_GENERIC ;; esac } # # NB: this is udev specific! 
# wait_for_udev() { dev=/dev/disk/by-path/ip-$PORTAL-iscsi-$OCF_RESKEY_target while :; do ls $dev* >/dev/null 2>&1 && break ocf_log warning "waiting for udev to create $dev" sleep 1 done } iscsi_monitor() { $disk_status $OCF_RESKEY_target $* case $? in 0) return $OCF_SUCCESS;; 1|3) return $OCF_NOT_RUNNING;; 2) return $OCF_ERR_GENERIC;; esac } iscsi_start() { local rc $disk_status $OCF_RESKEY_target rc=$? if [ $rc -eq 3 ]; then disk_discovery $disk_status $OCF_RESKEY_target rc=$? fi case $rc in 0) ocf_log info "iscsi $PORTAL $OCF_RESKEY_target already running" return $OCF_SUCCESS ;; 1) $add_disk $PORTAL $OCF_RESKEY_target || return $OCF_ERR_GENERIC case "$OCF_RESKEY_udev" in [Yy]es) wait_for_udev || return $OCF_ERR_GENERIC ;; *) ;; esac ;; *) # the session exists, but it's broken ocf_log warning "iscsi $PORTAL $OCF_RESKEY_target in failed state" ;; esac iscsi_monitor 1 # enforce wait if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS else return $OCF_ERR_GENERIC fi } iscsi_stop() { iscsi_monitor if [ $? -ne $OCF_NOT_RUNNING ] ; then $remove_disk $OCF_RESKEY_target || return $OCF_ERR_GENERIC iscsi_monitor if [ $? -ne $OCF_NOT_RUNNING ] ; then return $OCF_ERR_GENERIC else return $OCF_SUCCESS fi else ocf_log info "iscsi $OCF_RESKEY_target already stopped" return $OCF_SUCCESS fi } # # 'main' starts here... # if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi # These operations don't require OCF instance parameters to be set case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage) usage exit $OCF_SUCCESS;; methods) iscsi_methods exit $OCF_SUCCESS;; esac if [ x = "x$OCF_RESKEY_target" ]; then - ocf_log err "target parameter not set" + ocf_exit_reason "target parameter not set" exit $OCF_ERR_CONFIGURED fi if [ x = "x$OCF_RESKEY_portal" ]; then - ocf_log err "portal parameter not set" + ocf_exit_reason "portal parameter not set" exit $OCF_ERR_CONFIGURED fi case `uname` in Linux) setup=open_iscsi_setup ;; *) ocf_log info "platform `uname` may not be supported" setup=open_iscsi_setup ;; esac PORTAL="$OCF_RESKEY_portal" # updated by discovery LSB_STATUS_STOPPED=3 $setup setup_rc=$? if [ $setup_rc -gt 1 ]; then - ocf_log info "iscsi initiator utilities not installed or not setup" + ocf_exit_reason "iscsi initiator utilities not installed or not setup" case "$1" in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; *) exit $OCF_ERR_INSTALLED;; esac fi if [ `id -u` != 0 ]; then - ocf_log err "$0 must be run as root" + ocf_exit_reason "$0 must be run as root" exit $OCF_ERR_PERM fi # which method was invoked? case "$1" in start) iscsi_start ;; stop) iscsi_stop ;; status) iscsi_monitor rc=$? case $rc in $OCF_SUCCESS) echo iscsi target $OCF_RESKEY_target running ;; $OCF_NOT_RUNNING) echo iscsi target $OCF_RESKEY_target stopped ;; *) echo iscsi target $OCF_RESKEY_target failed ;; esac exit $rc ;; monitor) iscsi_monitor ;; validate-all) # everything already validated # just exit successfully here. 
exit $OCF_SUCCESS;; *) iscsi_methods exit $OCF_ERR_UNIMPLEMENTED;; esac # # vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/ora-common.sh b/heartbeat/ora-common.sh index 1580babe1..728caafec 100644 --- a/heartbeat/ora-common.sh +++ b/heartbeat/ora-common.sh @@ -1,88 +1,88 @@ # ora-common.sh # # Description: Common code for oracle and oralsnr resource agents # # # Author: Dejan Muhamedagic # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # Copyright: (C) 2012 Dejan Muhamedagic, SUSE/Attachmate # # Gather up information about our oracle instance rmtmpfiles() { rm -f $TMPFILES } ora_common_getconfig() { ORACLE_SID=$1 # optional, defaults to whatever is in oratab ORACLE_HOME=$2 # optional, defaults to the owner of ORACLE_HOME ORACLE_OWNER=$3 # optional, defaults to $ORACLE_HOME/network/admin # (only the oralsnr may provide and use this one) TNS_ADMIN=$4 # get ORACLE_HOME from /etc/oratab if not set [ x = "x$ORACLE_HOME" ] && ORACLE_HOME=`awk -F: "/^$ORACLE_SID:/"'{print $2}' /etc/oratab` # is there a better way to find out ORACLE_OWNER? [ x = "x$ORACLE_OWNER" ] && ORACLE_OWNER=`ls -ld $ORACLE_HOME/. 2>/dev/null | awk 'NR==1{print $3}'` # There are use-cases where users want to be able to set a custom TNS_ADMIN path. # When TNS_ADMIN is not provided, use the default path. [ x = "x$TNS_ADMIN" ] && TNS_ADMIN=$ORACLE_HOME/network/admin LD_LIBRARY_PATH=$ORACLE_HOME/lib LIBPATH=$ORACLE_HOME/lib PATH=$ORACLE_HOME/bin:$ORACLE_HOME/dbs:$PATH export ORACLE_SID ORACLE_HOME ORACLE_OWNER TNS_ADMIN export LD_LIBRARY_PATH LIBPATH ORA_ENVF=`mktemp` dumporaenv > $ORA_ENVF chmod 644 $ORA_ENVF TMPFILES="$ORA_ENVF" trap "rmtmpfiles" EXIT } ora_common_validate_all() { # Let's make sure a few important things are set... if [ x = "x$ORACLE_HOME" ]; then ocf_log info "ORACLE_HOME not set" return $OCF_ERR_INSTALLED fi if [ x = "x$ORACLE_OWNER" ]; then ocf_log info "ORACLE_OWNER not set" return $OCF_ERR_INSTALLED fi US=`id -u -n` if [ $US != root -a $US != $ORACLE_OWNER ] then - ocf_log err "$0 must be run as root or $ORACLE_OWNER" + ocf_exit_reason "$0 must be run as root or $ORACLE_OWNER" return $OCF_ERR_PERM fi return 0 } dumporaenv() { cat< 1.0 Resource script for oracle. Manages an Oracle Database instance as an HA resource. Manages an Oracle Database instance The Oracle SID (aka ORACLE_SID). sid The Oracle home directory (aka ORACLE_HOME). If not specified, then the SID along with its home should be listed in /etc/oratab. home The Oracle owner (aka ORACLE_OWNER). If not specified, then it is set to the owner of file \$ORACLE_HOME/dbs/*\${ORACLE_SID}.ora. If this does not work for you, just set it explicitly. user Monitoring user name. Every connection as sysdba is logged in an audit log. This can result in a large number of new files created. A new user is created (if it doesn't exist) in the start action and subsequently used in monitor. It should have very limited rights. Make sure that the password for this user does not expire. monuser Password for the monitoring user. Make sure that the password for this user does not expire. monpassword Profile used by the monitoring user. If the profile does not exist, it will be created with a non-expiring password. monprofile Sometimes IPC objects (shared memory segments and semaphores) belonging to an Oracle instance might be left behind which prevents the instance from starting.
It is not easy to figure out which shared segments belong to which instance, in particular when multiple instances are running as the same user. What we use here is the "oradebug" feature and its "ipc" trace utility. It is not optimal to parse the debugging information, but I am not aware of any other way to find out about the IPC information. In case the format or wording of the trace report changes, parsing might fail. There are some precautions, however, to prevent stepping on other people's toes. There is also a dumpinstipc option which will make us print the IPC objects which belong to the instance. Use it to see if we parse the trace file correctly. Three settings are possible: - none: don't mess with IPC and hope for the best (beware: you'll probably be out of luck, sooner or later) - instance: try to figure out the IPC stuff which belongs to the instance and remove only those (default; should be safe) - orauser: remove all IPC belonging to the user which runs the instance (don't use this if you run more than one instance as the same user or if other apps running as this user use IPC) The default setting "instance" should be safe to use, but in that case we cannot guarantee that the instance will start. In case IPC objects were already left around, because, for instance, someone mercilessly killed Oracle processes, there is no way any more to find out which IPC objects should be removed. In that case, human intervention is necessary, and probably _all_ instances running as the same user will have to be stopped. The third setting, "orauser", guarantees IPC objects removal, but it does that based only on IPC objects ownership, so you should use that only if every instance runs as a separate user. Please report any problems. Suggestions/fixes welcome. ipcrm Whether to clear Oracle's online backup mode on start. clear_backupmode How to stop Oracle is a matter of taste it seems. The default method ("checkpoint/abort") is: alter system checkpoint; shutdown abort; This should be the fastest safe way to bring the instance down. If you find "shutdown abort" distasteful, set this attribute to "immediate", in which case we will shutdown immediate; If you still think that there's an even better way to shut down an Oracle instance, we are willing to listen. shutdown_method END } # # methods: What methods/operations do we support? # oracle_methods() { cat <<-! start stop status monitor dumpinstipc showdbstat cleanup validate-all methods meta-data usage ! } # # Run commands as the Oracle owner... # execsql() { if [ "$US" = "$ORACLE_OWNER" ]; then sqlplus -S /nolog else su - $ORACLE_OWNER -s /bin/sh -c ". $ORA_ENVF; sqlplus -S /nolog" fi } # # Run commands in the oracle admin sqlplus...
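# A usage sketch (hypothetical SQL; not part of the agent): statements are
# fed on stdin and run via execsql above as $ORACLE_OWNER, e.g.
#   echo "connect / as sysdba
#   select status from v\$instance;" | execsql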
# common_sql_opts() { cat</dev/null; then return 0 fi output=`dbasql mk_mon_profile show_mon_profile` if echo "$output" | grep -iw "^$MONPROFILE" >/dev/null; then return 0 else - ocf_log err "could not create $MONPROFILE oracle profile" + ocf_exit_reason "could not create $MONPROFILE oracle profile" ocf_log err "sqlplus output: $output" return 1 fi } check_mon_user() { local output local output2 output=`dbasql show_mon_user` if echo "$output" | grep -iw "^$MONUSR" >/dev/null; then if echo "$output" | grep -w "EXPIRED" >/dev/null; then dbasql reset_mon_user_password fi output=`dbasql show_mon_user_profile` if echo "$output" | grep -iw "^$MONPROFILE" >/dev/null; then return 0 else output=`dbasql set_mon_user_profile` output2=`dbasql show_mon_user_profile` if echo "$output2" | grep -iw "^$MONPROFILE" >/dev/null; then return 0 fi - ocf_log err "could not set profile for $MONUSR oracle user" + ocf_exit_reason "could not set profile for $MONUSR oracle user" ocf_log err "sqlplus output: $output( $output2 )" return 1 fi fi output=`dbasql mk_mon_user show_mon_user` if echo "$output" | grep -iw "^$MONUSR" >/dev/null; then return 0 else - ocf_log err "could not create $MONUSR oracle user" + ocf_exit_reason "could not create $MONUSR oracle user" ocf_log err "sqlplus output: $output" return 1 fi } # # print the output of dbstat (for debugging) # showdbstat() { echo "Full output:" dbstat | execsql echo "Stripped output:" echo "<`dbasql dbstat`>" } # # IPC stuff: not overly complex, but quite involved :-/ # # Part 1: Oracle other_trace_junk() { echo $1 | sed 's/trc$/trm/' } dumpinstipc() { local output tracef output=`dbasql getipc` # filename in the 2nd line tracef=`echo "$output" | awk 'NR==2' | grep '^/.*trc$'` if [ "$tracef" ]; then echo $tracef else ocf_log warn "'dbasql getipc' failed: $output" return 1 fi } parseipc() { local inf=$1 if [ ! -f "$1" ]; then ocf_log warn "$1: no such ipc trace file" return 1 fi awk ' $3 == "Shmid" {n=1;next} n { if( $3~/^[0-9]+$/ ) print $3; n=0 } ' $inf | sort -u | sed 's/^/m:/' awk ' /Semaphore List/ {insems=1;next} insems { for( i=1; i<=NF; i++ ) if( $i~/^[0-9]+$/ ) print $i; } /system semaphore information/ {exit} ' $inf | sort -u | sed 's/^/s:/' TMPFILES="$TMPFILES $inf `other_trace_junk $inf`" } # Part 2: OS (ipcs,ipcrm) filteroraipc() { # this portable? grep -w $ORACLE_OWNER | awk '{print $2}' } ipcdesc() { local what=$1 case $what in m) echo "shared memory segment";; s) echo "semaphore";; q) echo "message queue";; esac } rmipc() { local what=$1 id=$2 ipcs -$what | filteroraipc | grep -iw $id >/dev/null 2>&1 || return ocf_log info "Removing `ipcdesc $what` $id." ipcrm -$what $id } ipcrm_orauser() { local what id for what in m s q; do for id in `ipcs -$what | filteroraipc`; do rmipc $what $id done done } ipcrm_instance() { local ipcobj for ipcobj; do rmipc `echo $ipcobj | sed 's/:/ /'` done } # # oracle_status: is the Oracle instance running? # # quick check to see if the instance is up is_proc_running() { ps -ef | grep -wiqs "[^ ]*[_]pmon_${ORACLE_SID}" } # instance in OPEN state? instance_live() { local status=`monsql_one dbstat` [ "$status" = OPEN ] && return 0 status=`dbasql_one dbstat` if [ "$status" = OPEN ]; then return 0 else ocf_log info "$ORACLE_SID instance state is not OPEN (dbstat output: $status)" return 1 fi } ora_cleanup() { #rm -fr /tmp/.oracle #??? 
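# remove the instance's lk<SID> lock file(s) left behind in $ORACLE_HOME/dbs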
rm -f `ls $ORACLE_HOME/dbs/lk* | grep -i "$ORACLE_SID\$"` #return case $IPCRM in none) ;; instance) ipcrm_instance $* ;; orauser) ipcrm_orauser $* ;; esac } oracle_getconfig() { ora_common_getconfig "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" clear_backupmode=${OCF_RESKEY_clear_backupmode:-"false"} shutdown_method=${OCF_RESKEY_shutdown_method:-"checkpoint/abort"} IPCRM=${OCF_RESKEY_ipcrm:-"instance"} } # # oracle_start: Start the Oracle instance # # NOTE: We handle instance in the MOUNTED and STARTED states # efficiently # We *do not* handle instance in the restricted or read-only # mode, i.e. it appears as running, but its availability is # "not for general use" # oracle_start() { local status output if is_proc_running; then status="`dbasql_one dbstat`" case "$status" in "OPEN") : nothing to be done, we can leave right now ocf_log info "Oracle instance $ORACLE_SID already running" return $OCF_SUCCESS ;; "STARTED") output=`dbasql dbmount` ;; "MOUNTED") : we proceed if mounted ;; *) # status unknown output=`dbasql dbstop dbstart_mount` ;; esac else output="`dbasql dbstart_mount`" # try to cleanup in case of # ORA-01081: cannot start already-running ORACLE - shut it down first if echo "$output" | grep ORA-01081 >/dev/null 2>&1; then ocf_log info "ORA-01081 error found, trying to cleanup oracle (dbstart_mount output: $output)" ora_cleanup output=`dbasql dbstart_mount` fi fi # oracle instance should be mounted. status="`dbasql_one dbstat`" case "$status" in "MOUNTED") ;; *) : error!! - ocf_log err "oracle $ORACLE_SID can not be mounted (status: $status)" + ocf_exit_reason "oracle $ORACLE_SID cannot be mounted (status: $status)" return $OCF_ERR_GENERIC ;; esac # If the instance is in online backup mode, clear that mode. # Afterwards, the DB is opened. if is_clear_backupmode_set && is_instance_in_backup_mode; then clear_backup_mode fi output=`dbasql dbopen` # check/create the monitor profile if ! check_mon_profile; then return $OCF_ERR_GENERIC fi # check/create the monitor user if ! check_mon_user; then return $OCF_ERR_GENERIC fi if ! is_proc_running; then - ocf_log err "oracle process not running: $output" + ocf_exit_reason "oracle process not running: $output" return $OCF_ERR_GENERIC elif ! instance_live; then - ocf_log err "oracle instance $ORACLE_SID not started: $output" + ocf_exit_reason "oracle instance $ORACLE_SID not started: $output" return $OCF_ERR_GENERIC else : cool, we are up and running ocf_log info "Oracle instance $ORACLE_SID started: $output" return $OCF_SUCCESS fi } # # oracle_stop: Stop the Oracle instance # oracle_stop() { local status output ipc="" if is_proc_running; then [ "$IPCRM" = "instance" ] && ipc=$(parseipc `dumpinstipc`) output=`dbasql dbstop` else ocf_log info "Oracle instance $ORACLE_SID already stopped" return $OCF_SUCCESS fi ocf_stop_processes TERM $PROCS_CLEANUP_TIME `proc_pids` # kill the procs if they hung if is_proc_running; then - ocf_log err "Oracle instance $ORACLE_SID not stopped: $output" + ocf_exit_reason "Oracle instance $ORACLE_SID not stopped: $output" return $OCF_ERR_GENERIC else ocf_log info "Oracle instance $ORACLE_SID stopped: $output" sleep 1 # give them a chance to clean up ocf_log info "Cleaning up for $ORACLE_SID" ora_cleanup "$ipc" return $OCF_SUCCESS fi } # # oracle_monitor: Can the Oracle instance do anything useful? # oracle_monitor() { if ! is_proc_running; then ocf_log info "oracle process not running" return $OCF_NOT_RUNNING fi if !
instance_live; then - ocf_log err "oracle instance $ORACLE_SID is down" + ocf_exit_reason "oracle instance $ORACLE_SID is down" return $OCF_ERR_GENERIC fi #ocf_log info "Oracle instance $ORACLE_SID is alive" return $OCF_SUCCESS } # other supported actions oracle_status() { if is_proc_running then echo Oracle instance $ORACLE_SID is running exit $OCF_SUCCESS else echo Oracle instance $ORACLE_SID is stopped exit $OCF_NOT_RUNNING fi } oracle_dumpinstipc() { is_proc_running && parseipc `dumpinstipc` } oracle_showdbstat() { showdbstat } oracle_cleanup() { if [ "$IPCRM" = "instance" ]; then ora_cleanup $(parseipc `dumpinstipc`) else ora_cleanup fi } oracle_validate_all() { case "${shutdown_method}" in "immediate") ;; "checkpoint/abort") ;; - *) ocf_log err "unsupported shutdown_method, please read meta-data" + *) ocf_exit_reason "unsupported shutdown_method, please read meta-data" return $OCF_ERR_CONFIGURED ;; esac case "${IPCRM}" in "none"|"instance"|"orauser") ;; - *) ocf_log err "unsupported ipcrm setting, please read meta-data" + *) ocf_exit_reason "unsupported ipcrm setting, please read meta-data" return $OCF_ERR_CONFIGURED ;; esac ora_common_validate_all } # used in ora-common.sh show_procs() { ps -e -o pid,args | grep -i "[o]ra[a-zA-Z0-9_]*$ORACLE_SID$" } proc_pids() { show_procs | awk '{print $1}'; } PROCS_CLEANUP_TIME="30" MONUSR=${OCF_RESKEY_monuser:-$OCF_RESKEY_monuser_default} MONPWD=${OCF_RESKEY_monpassword:-$OCF_RESKEY_monpassword_default} MONPROFILE=${OCF_RESKEY_monprofile:-$OCF_RESKEY_monprofile_default} MONUSR=$(echo $MONUSR | awk '{print toupper($0)}') MONPROFILE=$(echo $MONPROFILE | awk '{print toupper($0)}') OCF_REQUIRED_PARAMS="sid" OCF_REQUIRED_BINARIES="sqlplus" ocf_rarun $* # # vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/oralsnr b/heartbeat/oralsnr index a91eeab8f..c47f12117 100755 --- a/heartbeat/oralsnr +++ b/heartbeat/oralsnr @@ -1,281 +1,281 @@ #!/bin/sh # # # oralsnr # # Description: Manages an Oracle Listener as a High-Availability # resource # # # Author: Dejan Muhamedagic # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # Copyright: (C) 2006 International Business Machines, Inc. # # This code was inspired by the DB2 resource script # written by Alan Robertson # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 oralsnr::sid::home::user::listener # # See oralsnr_usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_sid (mandatory; for the monitor op) # OCF_RESKEY_home (optional; else read it from /etc/oratab) # OCF_RESKEY_user (optional; user to run the listener) # OCF_RESKEY_listener (optional; defaults to LISTENER) # # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs . ${OCF_FUNCTIONS_DIR}/ora-common.sh ####################################################################### SH=/bin/sh oralsnr_usage() { methods=`oralsnr_methods` methods=`echo $methods | tr ' ' '|'` cat <<-! usage: $0 ($methods) $0 manages an Oracle Listener as an HA resource. The 'start' operation starts the listener. The 'stop' operation stops the listener. The 'status' operation reports whether the listener is running The 'monitor' operation reports whether the listener seems to be working The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports ! } oralsnr_meta_data() { cat < 1.0 Resource script for Oracle Listener.
It manages an Oracle Listener instance as an HA resource. Manages an Oracle TNS listener The Oracle SID (aka ORACLE_SID). Necessary for the monitor op, i.e. to do tnsping SID. sid The Oracle home directory (aka ORACLE_HOME). If not specified, then the SID should be listed in /etc/oratab. home Run the listener as this user. user Listener instance to be started (as defined in listener.ora). Defaults to LISTENER. listener Full path to the directory that contains the Oracle listener tnsnames.ora configuration file. The shell variable TNS_ADMIN is set to the value provided. Full path to the directory containing tnsnames.ora END } # # methods: What methods/operations do we support? # oralsnr_methods() { cat <<-! start stop status monitor validate-all methods meta-data usage ! } # # Run commands as the Oracle owner... # runasdba() { if [ "$US" = "$ORACLE_OWNER" ]; then $SH else ( echo ". $ORA_ENVF" cat ) | su -s $SH - $ORACLE_OWNER fi } # # oralsnr_start: Start the Oracle listener instance # oralsnr_start() { if is_proc_running && test_tnsping; then : nothing to be done, we can leave right now ocf_log info "Listener $listener already running" return $OCF_SUCCESS fi output=`echo lsnrctl start $listener | runasdba` if test_tnsping; then : cool, we are up and running ocf_log info "Listener $listener running: $output" return $OCF_SUCCESS else - ocf_log err "Listener $listener appears to have started, but is not running properly: $output" + ocf_exit_reason "Listener $listener appears to have started, but is not running properly: $output" ocf_log err "Probable Oracle configuration error" return $OCF_ERR_GENERIC fi } # # oralsnr_stop: Stop the Oracle listener # oralsnr_stop() { if is_proc_running; then output=`echo lsnrctl stop $listener | runasdba` else ocf_log info "Listener $listener already stopped" return $OCF_SUCCESS fi ocf_stop_processes TERM $PROCS_CLEANUP_TIME `proc_pids` # kill the procs if they hung if is_proc_running; then - ocf_log err "Listener $listener not stopped: $output" + ocf_exit_reason "Listener $listener not stopped: $output" return $OCF_ERR_GENERIC else ocf_log info "Listener $listener stopped: $output" return $OCF_SUCCESS fi } # # is_proc_running: is the listener running? # is_proc_running() { show_procs | grep "." > /dev/null } # the following two should be run only if the process is running test_listener() { local output output=`lsnrctl status $listener` if echo "$output" | tail -1 | grep -qs 'completed successfully' then return $OCF_SUCCESS else - ocf_log err "$listener status failed: $output" + ocf_exit_reason "$listener status failed: $output" return $OCF_ERR_GENERIC fi } # and does it work? test_tnsping() { local output output=`tnsping $ORACLE_SID` if echo "$output" | tail -1 | grep -qs '^OK'; then return $OCF_SUCCESS else - ocf_log err "tnsping $ORACLE_SID failed: $output" + ocf_exit_reason "tnsping $ORACLE_SID failed: $output" return $OCF_ERR_GENERIC fi } # # oralsnr_monitor: Can we connect to the listener?
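# The check is two-step: test_listener above asks lsnrctl for the listener
# status, and test_tnsping verifies that $ORACLE_SID resolves and answers.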
# oralsnr_monitor() { if is_proc_running; then test_listener && test_tnsping else return $OCF_NOT_RUNNING fi } oralsnr_status() { if is_proc_running then echo Listener $listener is running exit $OCF_SUCCESS else echo Listener $listener is stopped exit $OCF_NOT_RUNNING fi } oralsnr_getconfig() { ora_common_getconfig "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" "$OCF_RESKEY_tns_admin" listener=${OCF_RESKEY_listener:-"LISTENER"} } oralsnr_validate_all() { ora_common_validate_all } # used in ora-common.sh show_procs() { ps -e -o pid,user,args | grep '[t]nslsnr' | grep -i -w "$listener" | grep -w "$ORACLE_OWNER" } proc_pids() { show_procs | awk '{print $1}'; } PROCS_CLEANUP_TIME="10" OCF_REQUIRED_PARAMS="sid" OCF_REQUIRED_BINARIES="lsnrctl tnsping" ocf_rarun $* # # vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/pgsql b/heartbeat/pgsql index bdeed8df5..e28af702a 100755 --- a/heartbeat/pgsql +++ b/heartbeat/pgsql @@ -1,2072 +1,2072 @@ #!/bin/sh # # Description: Manages a PostgreSQL Server as an OCF High-Availability # resource # # Authors: Serge Dubrouski (sergeyfd@gmail.com) -- original RA # Florian Haas (florian@linbit.com) -- makeover # Takatoshi MATSUO (matsuo.tak@gmail.com) -- support replication # David Corlette (dcorlette@netiq.com) -- add support for non-standard library locations and non-standard port # # Copyright: 2006-2012 Serge Dubrouski # and other Linux-HA contributors # License: GNU General Public License (GPL) # ############################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Use runuser if available for SELinux. if [ -x /sbin/runuser ]; then SU=runuser else SU=su fi # # Get PostgreSQL Configuration parameter # get_pgsql_param() { local param_name param_name=$1 perl_code="if (/^\s*$param_name[\s=]+\s*(.*)$/) { \$dir=\$1; \$dir =~ s/\s*\#.*//; \$dir =~ s/^'(\S*)'/\$1/; print \$dir;}" perl -ne "$perl_code" < $OCF_RESKEY_config } # Defaults OCF_RESKEY_pgctl_default=/usr/bin/pg_ctl OCF_RESKEY_psql_default=/usr/bin/psql OCF_RESKEY_pgdata_default=/var/lib/pgsql/data OCF_RESKEY_pgdba_default=postgres OCF_RESKEY_pghost_default="" OCF_RESKEY_pgport_default=5432 OCF_RESKEY_pglibs_default=/usr/lib OCF_RESKEY_start_opt_default="" OCF_RESKEY_ctl_opt_default="" OCF_RESKEY_pgdb_default=template1 OCF_RESKEY_logfile_default=/dev/null OCF_RESKEY_stop_escalate_default=30 OCF_RESKEY_monitor_user_default="" OCF_RESKEY_monitor_password_default="" OCF_RESKEY_monitor_sql_default="select now();" OCF_RESKEY_check_wal_receiver_default="false" # Defaults for replication OCF_RESKEY_rep_mode_default=none OCF_RESKEY_node_list_default="" OCF_RESKEY_restore_command_default="" OCF_RESKEY_archive_cleanup_command_default="" OCF_RESKEY_recovery_end_command_default="" OCF_RESKEY_master_ip_default="" OCF_RESKEY_repuser_default="postgres" OCF_RESKEY_primary_conninfo_opt_default="" OCF_RESKEY_restart_on_promote_default="false" OCF_RESKEY_tmpdir_default="/var/lib/pgsql/tmp" OCF_RESKEY_xlog_check_count_default="3" OCF_RESKEY_crm_attr_timeout_default="5" OCF_RESKEY_stop_escalate_in_slave_default=30 OCF_RESKEY_replication_slot_name_default="" : ${OCF_RESKEY_pgctl=${OCF_RESKEY_pgctl_default}} : ${OCF_RESKEY_psql=${OCF_RESKEY_psql_default}} : ${OCF_RESKEY_pgdata=${OCF_RESKEY_pgdata_default}} : ${OCF_RESKEY_pgdba=${OCF_RESKEY_pgdba_default}} : ${OCF_RESKEY_pghost=${OCF_RESKEY_pghost_default}} : ${OCF_RESKEY_pgport=${OCF_RESKEY_pgport_default}} : 
${OCF_RESKEY_pglibs=${OCF_RESKEY_pglibs_default}} : ${OCF_RESKEY_config=${OCF_RESKEY_pgdata}/postgresql.conf} : ${OCF_RESKEY_start_opt=${OCF_RESKEY_start_opt_default}} : ${OCF_RESKEY_ctl_opt=${OCF_RESKEY_ctl_opt_default}} : ${OCF_RESKEY_pgdb=${OCF_RESKEY_pgdb_default}} : ${OCF_RESKEY_logfile=${OCF_RESKEY_logfile_default}} : ${OCF_RESKEY_stop_escalate=${OCF_RESKEY_stop_escalate_default}} : ${OCF_RESKEY_monitor_user=${OCF_RESKEY_monitor_user_default}} : ${OCF_RESKEY_monitor_password=${OCF_RESKEY_monitor_password_default}} : ${OCF_RESKEY_monitor_sql=${OCF_RESKEY_monitor_sql_default}} : ${OCF_RESKEY_check_wal_receiver=${OCF_RESKEY_check_wal_receiver_default}} # for replication : ${OCF_RESKEY_rep_mode=${OCF_RESKEY_rep_mode_default}} : ${OCF_RESKEY_node_list=${OCF_RESKEY_node_list_default}} : ${OCF_RESKEY_restore_command=${OCF_RESKEY_restore_command_default}} : ${OCF_RESKEY_archive_cleanup_command=${OCF_RESKEY_archive_cleanup_command_default}} : ${OCF_RESKEY_recovery_end_command=${OCF_RESKEY_recovery_end_command_default}} : ${OCF_RESKEY_master_ip=${OCF_RESKEY_master_ip_default}} : ${OCF_RESKEY_repuser=${OCF_RESKEY_repuser_default}} : ${OCF_RESKEY_primary_conninfo_opt=${OCF_RESKEY_primary_conninfo_opt_default}} : ${OCF_RESKEY_restart_on_promote=${OCF_RESKEY_restart_on_promote_default}} : ${OCF_RESKEY_tmpdir=${OCF_RESKEY_tmpdir_default}} : ${OCF_RESKEY_xlog_check_count=${OCF_RESKEY_xlog_check_count_default}} : ${OCF_RESKEY_crm_attr_timeout=${OCF_RESKEY_crm_attr_timeout_default}} : ${OCF_RESKEY_stop_escalate_in_slave=${OCF_RESKEY_stop_escalate_in_slave_default}} : ${OCF_RESKEY_replication_slot_name=${OCF_RESKEY_replication_slot_name_default}} usage() { cat < 1.0 Resource script for PostgreSQL. It manages a PostgreSQL instance as an HA resource. Manages a PostgreSQL database instance Path to pg_ctl command. pgctl Start options (-o start_opt in pg_ctl). "-i -p 5432" for example. start_opt Additional pg_ctl options (-w, -W etc..). ctl_opt Path to psql command. psql Path to PostgreSQL data directory. pgdata User that owns PostgreSQL. pgdba Hostname/IP address where PostgreSQL is listening pghost Port where PostgreSQL is listening pgport Custom location of the Postgres libraries. If not set, the standard location will be used. pglibs PostgreSQL user that pgsql RA will use for monitor operations. If it's not set, the pgdba user will be used. monitor_user Password for monitor user. monitor_password SQL script that will be used for monitor operations. monitor_sql Path to the PostgreSQL configuration file for the instance. Configuration file Database that will be used for monitoring. pgdb Path to PostgreSQL server log output file. logfile Unix socket directory for PostgreSQL. If you use PostgreSQL 9.3 or higher and define unix_socket_directories in the postgresql.conf, then you must set socketdir to determine which directory is used for the psql command. socketdir Number of shutdown retries (using -m fast) before resorting to -m immediate stop escalation Replication mode may be set to "async" or "sync" or "slave". They require PostgreSQL 9.1 or later. Once set, "async" and "sync" require node_list, master_ip, and restore_command parameters, as well as configuring PostgreSQL for replication (in postgresql.conf and pg_hba.conf). "slave" means that the RA only creates recovery.conf before starting, to connect to the primary which is running somewhere. It doesn't need a master/slave setting. It requires the master_ip and restore_command parameters. rep_mode All node names. Please separate each node name with a space.
This is required for replication. node list restore_command for recovery.conf. This is required for replication. restore_command archive_cleanup_command for recovery.conf. This is used for replication and is optional. archive_cleanup_command recovery_end_command for recovery.conf. This is used for replication and is optional. recovery_end_command Master's floating IP address which the hot standby connects to. This parameter is used for "primary_conninfo" in recovery.conf. This is required for replication. master ip User used to connect to the master server. This parameter is used for "primary_conninfo" in recovery.conf. This is required for replication. repuser primary_conninfo options of recovery.conf except host, port, user and application_name. This is optional for replication. primary_conninfo_opt If this is true, RA deletes recovery.conf and restarts PostgreSQL on promote to keep Timeline ID. It probably makes fail-over slower. It's recommended to set on-fail of the promote op to fence. This is optional for replication. restart_on_promote Set this option when using replication slots. When the master node has 1 slave node, one replication slot would be created with the name "replication_slot_name". When the master node has 2 or more slave nodes, the replication slots would be created for each node, with the node name appended as a suffix. For example, if replication_slot_name is "sample" and 2 slaves named "node_a" and "node_b" connect to their slots, the slot names are "sample_node_a" and "sample_node_b". The pgsql RA doesn't monitor or delete the replication slot. When the slave node has been disconnected by a failure or the like, execute one of the following manually. Otherwise it may eventually fill the disk, because the master node will continue to accumulate unsent WAL. 1. recover and reconnect the slave node to the master node as soon as possible. 2. delete the slot on the master node with the following psql command. $ select pg_drop_replication_slot('replication_slot_name'); replication_slot_name Path to temporary directory. This is optional for replication. tmpdir Number of checks of xlog on monitor before promote. This is optional for replication. xlog check count The timeout of crm_attribute forever update command. Default value is 5 seconds. This is optional for replication. The timeout of crm_attribute forever update command. Number of shutdown retries (using -m fast) before resorting to -m immediate in slave state. This is optional for replication. stop escalation_in_slave If this is true, the RA checks the wal receiver process on monitor and notifies its status using the "(resource name)-receiver-status" attribute. It's useful for checking whether PostgreSQL (hot standby) connects to the primary. The attribute shows status as "normal" or "normal (master)" or "ERROR". Note that if you configure PostgreSQL as a master/slave resource, the wal receiver is not running on the master and the attribute consistently shows "normal (master)", because that is the normal status. check_wal_receiver EOF } # # Run the given command in the Resource owner environment... # runasowner() { local quietrun="" local loglevel="-err" local var for var in 1 2 do case "$1" in "-q") quietrun="-q" shift 1;; "warn"|"err") loglevel="-$1" shift 1;; *) ;; esac done ocf_run $quietrun $loglevel $SU $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; $*" } # # Shell escape # escape_string() { echo "$*" | sed -e "s/'/'\\\\''/g" } # # methods: What methods/operations do we support? # pgsql_methods() { cat </dev/null 2>&1" return $?
fi # No PID file false } pgsql_wal_receiver_status() { local PID local receiver_parent_pids local pgsql_real_monitor_status=$1 PID=`head -n 1 $PIDFILE` receiver_parent_pids=`ps -ef | tr -s " " | grep "[w]al receiver process" | cut -d " " -f 3` if echo "$receiver_parent_pids" | grep -q -w "$PID" ; then attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "normal" -q return 0 fi if [ $pgsql_real_monitor_status -eq "$OCF_RUNNING_MASTER" ]; then attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "normal (master)" -q return 0 fi attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "ERROR" -q ocf_log warn "wal receiver process is not running" return 1 } # # pgsql_real_monitor # pgsql_real_monitor() { local loglevel local rc local output # Set the log level of the error message loglevel=${1:-err} if ! pgsql_status then ocf_log info "PostgreSQL is down" return $OCF_NOT_RUNNING fi if is_replication; then # Check replication state output=`exec_sql "${CHECK_MS_SQL}"` rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc $loglevel "Can't get PostgreSQL recovery status." return $OCF_ERR_GENERIC fi case "$output" in f) ocf_log debug "PostgreSQL is running as a primary." if [ "$OCF_RESKEY_monitor_sql" = "$OCF_RESKEY_monitor_sql_default" ]; then return $OCF_RUNNING_MASTER fi ;; t) ocf_log debug "PostgreSQL is running as a hot standby." return $OCF_SUCCESS;; *) ocf_exit_reason "$CHECK_MS_SQL output is $output" return $OCF_ERR_GENERIC;; esac fi OCF_RESKEY_monitor_sql=`escape_string "$OCF_RESKEY_monitor_sql"` runasowner -q $loglevel "$OCF_RESKEY_psql $psql_options \ -c '$OCF_RESKEY_monitor_sql'" rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc $loglevel "PostgreSQL $OCF_RESKEY_pgdb isn't running." return $OCF_ERR_GENERIC fi if is_replication; then return $OCF_RUNNING_MASTER fi return $OCF_SUCCESS } pgsql_replication_monitor() { local rc rc=$1 if [ $rc -ne $OCF_SUCCESS -a $rc -ne "$OCF_RUNNING_MASTER" ]; then return $rc fi # If I am Master if [ $rc -eq $OCF_RUNNING_MASTER ]; then change_data_status "$NODENAME" "LATEST" change_pgsql_status "$NODENAME" "PRI" control_slave_status || return $OCF_ERR_GENERIC if [ "$RE_CONTROL_SLAVE" = "true" ]; then sleep 2 ocf_log info "re-controlling slave status." RE_CONTROL_SLAVE="none" control_slave_status || return $OCF_ERR_GENERIC fi return $rc fi # I can't get the master node name from $OCF_RESKEY_CRM_meta_notify_master_uname on monitor, # so I will get the master node name using crm_mon -n print_crm_mon | tr -d "\t" | tr -d " " | grep -q "^${RESOURCE_NAME}[(:].*[):].*Master" if [ $? -ne 0 ] ; then # If I am a slave and no master exists ocf_log info "Master does not exist." change_pgsql_status "$NODENAME" "HS:alone" have_master_right if [ $? -eq 0 ]; then rm -f ${XLOG_NOTE_FILE}.* fi else output=`exec_with_retry 0 $CRM_ATTR_FOREVER -N "$NODENAME" \ -n "$PGSQL_DATA_STATUS_ATTR" -G -q` if [ "$output" = "DISCONNECT" ]; then change_pgsql_status "$NODENAME" "HS:alone" fi fi return $rc } #pgsql_monitor: pgsql_real_monitor() wrapper for replication pgsql_monitor() { local rc pgsql_real_monitor rc=$? if ocf_is_true ${OCF_RESKEY_check_wal_receiver}; then pgsql_wal_receiver_status $rc fi if ! is_replication; then return $rc else pgsql_replication_monitor $rc return $? fi } # pgsql_post_demote pgsql_post_demote() { DEMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname | sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` ocf_log debug "post-demote called. Demote uname is $DEMOTE_NODE" if [ "$DEMOTE_NODE" != "$NODENAME" ]; then if !
echo $OCF_RESKEY_CRM_meta_notify_master_uname | tr '[A-Z]' '[a-z]' | grep $NODENAME; then show_master_baseline change_pgsql_status "$NODENAME" "HS:alone" fi fi return $OCF_SUCCESS } pgsql_pre_promote() { local master_baseline local my_master_baseline local cmp_location local number_of_nodes # If my data is newer than new master's one, I fail my resource. PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` number_of_nodes=`echo $NODE_LIST | wc -w` if [ $number_of_nodes -ge 3 -a \ "$OCF_RESKEY_rep_mode" = "sync" -a \ "$PROMOTE_NODE" != "$NODENAME" ]; then master_baseline=`$CRM_ATTR_REBOOT -N "$PROMOTE_NODE" -n \ "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` if [ $? -eq 0 ]; then my_master_baseline=`$CRM_ATTR_REBOOT -N "$NODENAME" -n \ "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` # get older location cmp_location=`printf "$master_baseline\n$my_master_baseline\n" |\ sort | head -1` if [ "$cmp_location" != "$my_master_baseline" ]; then ocf_exit_reason "My data is newer than new master's one. New master's location : $master_baseline" exec_with_retry 0 $CRM_FAILCOUNT -r $OCF_RESOURCE_INSTANCE -U $NODENAME -v INFINITY return $OCF_ERR_GENERIC fi fi fi return $OCF_SUCCESS } pgsql_notify() { local type="${OCF_RESKEY_CRM_meta_notify_type}" local op="${OCF_RESKEY_CRM_meta_notify_operation}" local rc if ! is_replication; then return $OCF_SUCCESS fi ocf_log debug "notify: ${type} for ${op}" case $type in pre) case $op in promote) pgsql_pre_promote return $? ;; esac ;; post) case $op in promote) delete_xlog_location PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` if [ "$PROMOTE_NODE" != "$NODENAME" ]; then delete_master_baseline fi return $OCF_SUCCESS ;; demote) pgsql_post_demote return $? ;; start|stop) MASTER_NODE=`echo $OCF_RESKEY_CRM_meta_notify_master_uname | \ sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` if [ "$NODENAME" = "$MASTER_NODE" ]; then control_slave_status fi return $OCF_SUCCESS ;; esac ;; esac return $OCF_SUCCESS } control_slave_status() { local rc local data_status local target local all_data_status local tmp_data_status local number_of_nodes all_data_status=`exec_sql "${CHECK_REPLICATION_STATE_SQL}"` rc=$? if [ $rc -eq 0 ]; then if [ -n "$all_data_status" ]; then all_data_status=`echo $all_data_status | sed "s/\n/ /g"` fi else report_psql_error $rc err "Can't get PostgreSQL replication status." return 1 fi number_of_nodes=`echo $NODE_LIST | wc -w` for target in $NODE_LIST; do if [ "$target" = "$NODENAME" ]; then continue fi data_status="DISCONNECT" if [ -n "$all_data_status" ]; then for tmp_data_status in $all_data_status; do if ! echo $tmp_data_status | grep -q "^${target}|"; then continue fi data_status=`echo $tmp_data_status | cut -d "|" -f 2,3` ocf_log debug "node_name and data_status is $tmp_data_status" break done fi case "$data_status" in "STREAMING|SYNC") change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_PROMOTE" change_pgsql_status "$target" "HS:sync" ;; "STREAMING|ASYNC") change_data_status "$target" "$data_status" if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then change_master_score "$target" "$CAN_NOT_PROMOTE" set_sync_mode "$target" else if [ $number_of_nodes -le 2 ]; then change_master_score "$target" "$CAN_PROMOTE" else # I can't determine which slave's data is newest in async mode. 
control_slave_status() {
    local rc
    local data_status
    local target
    local all_data_status
    local tmp_data_status
    local number_of_nodes

    all_data_status=`exec_sql "${CHECK_REPLICATION_STATE_SQL}"`
    rc=$?
    if [ $rc -eq 0 ]; then
        if [ -n "$all_data_status" ]; then
            all_data_status=`echo $all_data_status | sed "s/\n/ /g"`
        fi
    else
        report_psql_error $rc err "Can't get PostgreSQL replication status."
        return 1
    fi

    number_of_nodes=`echo $NODE_LIST | wc -w`
    for target in $NODE_LIST; do
        if [ "$target" = "$NODENAME" ]; then
            continue
        fi

        data_status="DISCONNECT"
        if [ -n "$all_data_status" ]; then
            for tmp_data_status in $all_data_status; do
                if ! echo $tmp_data_status | grep -q "^${target}|"; then
                    continue
                fi
                data_status=`echo $tmp_data_status | cut -d "|" -f 2,3`
                ocf_log debug "node_name and data_status is $tmp_data_status"
                break
            done
        fi

        case "$data_status" in
            "STREAMING|SYNC")
                change_data_status "$target" "$data_status"
                change_master_score "$target" "$CAN_PROMOTE"
                change_pgsql_status "$target" "HS:sync"
                ;;
            "STREAMING|ASYNC")
                change_data_status "$target" "$data_status"
                if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then
                    change_master_score "$target" "$CAN_NOT_PROMOTE"
                    set_sync_mode "$target"
                else
                    if [ $number_of_nodes -le 2 ]; then
                        change_master_score "$target" "$CAN_PROMOTE"
                    else
                        # I can't determine which slave's data is newest in async mode.
                        change_master_score "$target" "$CAN_NOT_PROMOTE"
                    fi
                fi
                change_pgsql_status "$target" "HS:async"
                ;;
            "STREAMING|POTENTIAL")
                change_data_status "$target" "$data_status"
                change_master_score "$target" "$CAN_NOT_PROMOTE"
                change_pgsql_status "$target" "HS:potential"
                ;;
            "DISCONNECT")
                change_data_status "$target" "$data_status"
                change_master_score "$target" "$CAN_NOT_PROMOTE"
                if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then
                    set_async_mode "$target"
                fi
                ;;
            *)
                change_data_status "$target" "$data_status"
                change_master_score "$target" "$CAN_NOT_PROMOTE"
                if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then
                    set_async_mode "$target"
                fi
                change_pgsql_status "$target" "HS:connected"
                ;;
        esac
    done
    return 0
}

have_master_right() {
    local old
    local new
    local output
    local data_status
    local node
    local mylocation
    local count
    local newestXlog
    local oldfile
    local newfile

    ocf_log debug "Checking if I have a master right."

    data_status=`$CRM_ATTR_FOREVER -N "$NODENAME" -n \
                 "$PGSQL_DATA_STATUS_ATTR" -G -q 2>/dev/null`
    if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then
        if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \
             "$data_status" != "LATEST" ]; then
            ocf_log warn "My data is out-of-date. status=$data_status"
            return 1
        fi
    else
        if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \
             "$data_status" != "STREAMING|ASYNC" -a \
             "$data_status" != "LATEST" ]; then
            ocf_log warn "My data is out-of-date. status=$data_status"
            return 1
        fi
    fi
    ocf_log info "My data status=$data_status."

    show_xlog_location
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Failed to show my xlog location."
        exit $OCF_ERR_GENERIC
    fi

    old=0
    for count in `seq $OCF_RESKEY_xlog_check_count`; do
        if [ -f ${XLOG_NOTE_FILE}.$count ]; then
            old=$count
            continue
        fi
        break
    done
    new=`expr $old + 1`

    # get xlog locations of all nodes
    for node in ${NODE_LIST}; do
        output=`$CRM_ATTR_REBOOT -N "$node" -n \
                "$PGSQL_XLOG_LOC_NAME" -G -q 2>/dev/null`
        if [ $? -ne 0 ]; then
            ocf_log warn "Can't get $node xlog location."
            continue
        else
            ocf_log info "$node xlog location : $output"
            echo "$node $output" >> ${XLOG_NOTE_FILE}.${new}
            if [ "$node" = "$NODENAME" ]; then
                mylocation=$output
            fi
        fi
    done

    oldfile=`cat ${XLOG_NOTE_FILE}.${old} 2>/dev/null`
    newfile=`cat ${XLOG_NOTE_FILE}.${new} 2>/dev/null`
    if [ "$oldfile" != "$newfile" ]; then
        # reset counter
        rm -f ${XLOG_NOTE_FILE}.*
        printf "$newfile\n" > ${XLOG_NOTE_FILE}.0
        return 1
    fi

    if [ "$new" -ge "$OCF_RESKEY_xlog_check_count" ]; then
        newestXlog=`printf "$newfile\n" | sort -t " " -k 2,3 -r | \
                    head -1 | cut -d " " -f 2`
        if [ "$newestXlog" = "$mylocation" ]; then
            ocf_log info "I have a master right."
            exec_with_retry 5 $CRM_MASTER -v $PROMOTE_ME
            return 0
        fi
        change_data_status "$NODENAME" "DISCONNECT"
        ocf_log info "I don't have correct master data."
        # reset counter
        rm -f ${XLOG_NOTE_FILE}.*
        printf "$newfile\n" > ${XLOG_NOTE_FILE}.0
    fi

    return 1
}

is_replication() {
    if [ "$OCF_RESKEY_rep_mode" != "none" -a "$OCF_RESKEY_rep_mode" != "slave" ]; then
        return 0
    fi
    return 1
}

use_replication_slot() {
    if [ -n "$OCF_RESKEY_replication_slot_name" ]; then
        return 0
    fi
    return 1
}
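# Example of the naming scheme implemented below, under hypothetical settings:
# with node_list="node1 node2 node3", NODENAME=node1 and
# replication_slot_name="repl_slot", create_replication_slot_name echoes
# "repl_slot_node2 repl_slot_node3"; with a two-node list it echoes just
# "repl_slot".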
create_replication_slot_name() {
    local number_of_nodes=0
    local target
    local replication_slot_name
    local replication_slot_name_list_tmp
    local replication_slot_name_list

    if [ -n "$NODE_LIST" ]; then
        number_of_nodes=`echo $NODE_LIST | wc -w`
    fi

    # If the number of nodes is 2 or less, the Master has at most one Slave.
    # The Master node then needs a single slot for the Slave, named
    # "$OCF_RESKEY_replication_slot_name".
    if [ $number_of_nodes -le 2 ]; then
        replication_slot_name_list="$OCF_RESKEY_replication_slot_name"

    # If the number of nodes is 3 or more, the Master has several Slave nodes.
    # The Master node then needs as many slots as there are Slaves, and each
    # Slave connects to its dedicated slot on the Master.
    # To ensure that the slot names are unique, the suffix "_$target" is
    # appended to $OCF_RESKEY_replication_slot_name.
    else
        for target in $NODE_LIST
        do
            if [ "$target" != "$NODENAME" ]; then
                replication_slot_name="$OCF_RESKEY_replication_slot_name"_"$target"
                replication_slot_name_list_tmp="$replication_slot_name_list"
                replication_slot_name_list="$replication_slot_name_list_tmp $replication_slot_name"
            fi
        done
    fi

    echo $replication_slot_name_list
}

create_replication_slot() {
    local replication_slot_name
    local replication_slot_name_list
    local output
    local rc
    local CREATE_REPLICATION_SLOT_sql
    local DELETE_REPLICATION_SLOT_sql

    replication_slot_name_list=`create_replication_slot_name`
    ocf_log debug "replication slot names are $replication_slot_name_list."

    for replication_slot_name in $replication_slot_name_list
    do
        # If a slot with the same name already exists, re-initialize
        # (delete and re-create) it.
        if [ `check_replication_slot $replication_slot_name` = "1" ]; then
            DELETE_REPLICATION_SLOT_sql="SELECT pg_drop_replication_slot('$replication_slot_name');"
            output=`exec_sql "$DELETE_REPLICATION_SLOT_sql"`
            rc=$?

            if [ $rc -eq 0 ]; then
                ocf_log info "PostgreSQL deleted the replication slot ($replication_slot_name)."
            else
                ocf_exit_reason "$output"
                return $OCF_ERR_GENERIC
            fi
        fi

        CREATE_REPLICATION_SLOT_sql="SELECT pg_create_physical_replication_slot('$replication_slot_name');"
        output=`exec_sql "$CREATE_REPLICATION_SLOT_sql"`
        rc=$?

        if [ $rc -eq 0 ]; then
            ocf_log info "PostgreSQL created the replication slot ($replication_slot_name)."
        else
            ocf_exit_reason "$output"
            return $OCF_ERR_GENERIC
        fi
    done

    return 0
}

# Check whether the given replication slot already exists.
check_replication_slot() {
    local replication_slot_name=$1
    local output
    local CHECK_REPLICATION_SLOT_sql="SELECT count(*) FROM pg_replication_slots WHERE slot_name = '$replication_slot_name'"

    output=`exec_sql "$CHECK_REPLICATION_SLOT_sql"`
    echo "$output"
}

get_my_location() {
    local rc
    local output
    local replay_loc
    local receive_loc
    local output1
    local output2
    local log1
    local log2
    local newer_location

    output=`exec_sql "$CHECK_XLOG_LOC_SQL"`
    rc=$?
    if [ $rc -ne 0 ]; then
        report_psql_error $rc err "Can't get my xlog location."
        return 1
    fi

    replay_loc=`echo $output | cut -d "|" -f 1`
    receive_loc=`echo $output | cut -d "|" -f 2`

    output1=`echo "$replay_loc" | cut -d "/" -f 1`
    output2=`echo "$replay_loc" | cut -d "/" -f 2`
    log1=`printf "%08s\n" $output1 | sed "s/ /0/g"`
    log2=`printf "%08s\n" $output2 | sed "s/ /0/g"`
    replay_loc="${log1}${log2}"

    output1=`echo "$receive_loc" | cut -d "/" -f 1`
    output2=`echo "$receive_loc" | cut -d "/" -f 2`
    log1=`printf "%08s\n" $output1 | sed "s/ /0/g"`
    log2=`printf "%08s\n" $output2 | sed "s/ /0/g"`
    receive_loc="${log1}${log2}"

    newer_location=`printf "$replay_loc\n$receive_loc" | sort -r | head -1`
    echo "$newer_location"
    return 0
}
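# Worked example for the zero-padding above (an assumed typical value, not
# output captured from a real cluster): an xlog location such as "2/A0252E8"
# splits on "/" into "2" and "A0252E8"; each part is padded to 8 characters
# ("00000002" and "0A0252E8"), and the concatenation "000000020A0252E8"
# compares correctly as a fixed-width string, so the lexically greater of the
# replay/receive locations is the newer one.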
show_xlog_location() {
    local location

    location=`get_my_location` || return 1
    exec_with_retry 0 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -v "$location"
}

delete_xlog_location() {
    exec_with_retry 5 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -D
}

show_master_baseline() {
    local rc
    local location

    location=`get_my_location`
    ocf_log info "My master baseline : $location."
    exec_with_retry 0 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -v "$location"
}

delete_master_baseline() {
    exec_with_retry 5 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -D
}

set_async_mode_all() {
    [ "$OCF_RESKEY_rep_mode" = "sync" ] || return 0
    ocf_log info "Set all nodes into async mode."
    runasowner -q err "echo \"synchronous_standby_names = ''\" > \"$REP_MODE_CONF\""
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Can't set all nodes into async mode."
        return 1
    fi
    return 0
}

set_async_mode() {
    cat $REP_MODE_CONF | grep -q -e "[,' ]$1[,' ]"
    if [ $? -eq 0 ]; then
        ocf_log info "Setup $1 into async mode."
        runasowner -q err "echo \"synchronous_standby_names = ''\" > \"$REP_MODE_CONF\""
    else
        ocf_log debug "$1 is already in async mode."
        return 0
    fi
    exec_with_retry 0 reload_conf
}

set_sync_mode() {
    local sync_node_in_conf

    sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2`
    if [ -n "$sync_node_in_conf" ]; then
        ocf_log debug "$sync_node_in_conf is already in sync mode."
    else
        ocf_log info "Setup $1 into sync mode."
        runasowner -q err "echo \"synchronous_standby_names = '$1'\" > \"$REP_MODE_CONF\""
        [ "$RE_CONTROL_SLAVE" = "false" ] && RE_CONTROL_SLAVE="true"
        exec_with_retry 0 reload_conf
    fi
}

reload_conf() {
    # Invoke pg_ctl
    runasowner "$OCF_RESKEY_pgctl -D $OCF_RESKEY_pgdata reload"
    if [ $? -eq 0 ]; then
        ocf_log info "Reload configuration file."
    else
        ocf_exit_reason "Can't reload configuration file."
        return 1
    fi
    return 0
}

user_recovery_conf() {
    local number_of_nodes

    # Emit archive_cleanup_command and recovery_end_command only when they
    # are defined by the user.
    if [ -n "$OCF_RESKEY_archive_cleanup_command" ]; then
        echo "archive_cleanup_command = '${OCF_RESKEY_archive_cleanup_command}'"
    fi
    if [ -n "$OCF_RESKEY_recovery_end_command" ]; then
        echo "recovery_end_command = '${OCF_RESKEY_recovery_end_command}'"
    fi

    if use_replication_slot; then
        number_of_nodes=`echo $NODE_LIST | wc -w`
        if [ $number_of_nodes -le 2 ]; then
            echo "primary_slot_name = '${OCF_RESKEY_replication_slot_name}'"
        else
            echo "primary_slot_name = '${OCF_RESKEY_replication_slot_name}_$NODENAME'"
        fi
    fi
}
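# Sketch of the fragment user_recovery_conf emits for a hypothetical 3-node
# cluster with replication_slot_name="repl_slot" on node "node2", assuming the
# user supplied an archive_cleanup_command (the value below is only an example
# of a user-provided setting):
#
#   archive_cleanup_command = 'pg_archivecleanup /var/lib/pgsql/archive %r'
#   primary_slot_name = 'repl_slot_node2'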
make_recovery_conf() {
    runasowner "touch $RECOVERY_CONF"
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Can't create recovery.conf."
        return 1
    fi

    cat > $RECOVERY_CONF <<END
standby_mode = 'on'
primary_conninfo = 'host=${OCF_RESKEY_master_ip} port=${OCF_RESKEY_pgport} user=${OCF_RESKEY_repuser} application_name=${NODENAME}'
restore_command = '${OCF_RESKEY_restore_command}'
END

    user_recovery_conf >> $RECOVERY_CONF
    ocf_log debug "Created recovery.conf. host=${OCF_RESKEY_master_ip}, user=${OCF_RESKEY_repuser}"
    return 0
}

# change pgsql-status.
# arg1: node, arg2: value
change_pgsql_status() {
    local output

    if ! is_node_online $1; then
        return 0
    fi

    output=`$CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -G -q 2>/dev/null`
    if [ "$output" != "$2" ]; then
        # If a slave's disk is broken, the RA cannot read the PID file and
        # misjudges PostgreSQL as down even though it is running. The Master
        # would then overwrite pgsql-status because replication is still
        # connected, so prohibit that here.
        if [ "$output" = "STOP" -o "$output" = "UNKNOWN" ]; then
            if [ "$1" != "$NODENAME" ]; then
                ocf_log warn "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2 by $NODENAME is prohibited."
                return 0
            fi
        fi
        ocf_log info "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2."
        exec_with_retry 0 $CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -v "$2"
    fi
    return 0
}

# change pgsql-data-status.
# arg1: node, arg2: value
change_data_status() {
    local output

    if ! node_exist $1; then
        return 0
    fi

    while :
    do
        output=`$CRM_ATTR_FOREVER -N "$1" -n "$PGSQL_DATA_STATUS_ATTR" -G -q 2>/dev/null`
        if [ "$output" != "$2" ]; then
            ocf_log info "Changing $PGSQL_DATA_STATUS_ATTR on $1 : $output->$2."
            exec_with_retry 0 exec_with_timeout 0 "$CRM_ATTR_FOREVER" -N $1 -n $PGSQL_DATA_STATUS_ATTR -v "$2"
        else
            break
        fi
    done
    return 0
}

# set master-score
# arg1: node, arg2: score, arg3: resource
set_master_score() {
    local current_score

    current_score=`$CRM_ATTR_REBOOT -N "$1" -n "master-$3" -G -q 2>/dev/null`
    if [ -n "$current_score" -a "$current_score" != "$2" ]; then
        ocf_log info "Changing $3 master score on $1 : $current_score->$2."
        exec_with_retry 0 $CRM_ATTR_REBOOT -N "$1" -n "master-$3" -v "$2"
    fi
    return 0
}

# change master-score
# arg1: node, arg2: score
change_master_score() {
    local instance

    if ! is_node_online $1; then
        return 0
    fi

    if echo $OCF_RESOURCE_INSTANCE | grep -q ":"; then
        # If Pacemaker version is 1.0.x
        instance=0
        while :
        do
            if [ "$instance" -ge "$OCF_RESKEY_CRM_meta_clone_max" ]; then
                break
            fi
            if [ "${RESOURCE_NAME}:${instance}" = "$OCF_RESOURCE_INSTANCE" ]; then
                instance=`expr $instance + 1`
                continue
            fi
            set_master_score $1 $2 "${RESOURCE_NAME}:${instance}" || return 1
            instance=`expr $instance + 1`
        done
    else
        # If globally-unique=false and Pacemaker version is 1.1.8 or higher,
        # the Master/Slave resource has no instance number.
        set_master_score $1 $2 ${RESOURCE_NAME} || return 1
    fi
    return 0
}

report_psql_error() {
    local rc
    local loglevel
    local message

    rc=$1
    loglevel=${2:-err}
    message="$3"

    ocf_log $loglevel "$message rc=$rc"
    if [ $rc -eq 1 ]; then
        ocf_exit_reason "Fatal error (out of memory, file not found, etc.) occurred while executing the psql command."
    elif [ $rc -eq 2 ]; then
        ocf_log $loglevel "Connection error (connection to the server went bad and the session was not interactive) occurred while executing the psql command."
    elif [ $rc -eq 3 ]; then
        ocf_exit_reason "Script error (the variable ON_ERROR_STOP was set) occurred while executing the psql command."
    fi
}

#
# timeout management function
# arg1   : timeout >= 0 (if arg1 is 0, OCF_RESKEY_crm_attr_timeout is used.)
# arg2   : command
# arg3.. : command's args
#
exec_with_timeout() {
    local func_pid
    local count=$OCF_RESKEY_crm_attr_timeout
    local rc

    if [ "$1" -ne 0 ]; then
        count=$1
    fi
    shift

    $* &
    func_pid=$!
    sleep .1

    while kill -s 0 $func_pid >/dev/null 2>&1; do
        sleep 1
        count=`expr $count - 1`
        if [ $count -le 0 ]; then
            ocf_exit_reason "\"$*\" (pid=$func_pid) timed out."
            kill -s 9 $func_pid >/dev/null 2>&1
            return 1
        fi
        ocf_log info "Waiting($count). \"$*\" (pid=$func_pid)."
    done
    wait $func_pid
}

# Retry a command while it returns non-zero.
# arg1       : count >= 0 (if arg1 is 0, it retries for up to one day)
# arg2..argN : command and args
exec_with_retry() {
    local count="86400"
    local output
    local rc

    if [ "$1" -ne 0 ]; then
        count=$1
    fi
    shift

    while [ $count -gt 0 ]; do
        output=`$*`
        rc=$?
        if [ $rc -ne 0 ]; then
            ocf_log warn "Retrying(remain $count). \"$*\" failed. rc=$rc. stdout=\"$output\"."
            count=`expr $count - 1`
            sleep 1
        else
            printf "${output}"
            return 0
        fi
    done

    ocf_exit_reason "giving up executing \"$*\""
    return $rc
}
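# Usage sketch for the two helpers above (illustrative, mirroring calls made
# elsewhere in this file): retry an attribute update five times, or guard a
# potentially hanging crm_attribute call with the configured timeout.
#
#   exec_with_retry 5 $CRM_MASTER -v $PROMOTE_ME
#   exec_with_retry 0 exec_with_timeout 0 "$CRM_ATTR_FOREVER" -N $NODENAME \
#       -n "$PGSQL_DATA_STATUS_ATTR" -v "LATEST"
#
# A count of 0 means "retry for up to a day"; on success the command's stdout
# is passed through so callers can capture it.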
is_node_online() {
    print_crm_mon | tr '[A-Z]' '[a-z]' | grep -e "^node $1 " -e "^node $1:" | grep -q -v "offline"
}

node_exist() {
    print_crm_mon | tr '[A-Z]' '[a-z]' | grep -q "^node $1"
}

check_binary2() {
    if ! have_binary "$1"; then
        ocf_exit_reason "Setup problem: couldn't find command: $1"
        return 1
    fi
    return 0
}

check_config() {
    local rc=0

    if [ ! -f "$1" ]; then
        if ocf_is_probe; then
            ocf_log info "Configuration file $1 not readable during probe."
            rc=1
        else
            ocf_exit_reason "Configuration file $1 doesn't exist"
            rc=2
        fi
    fi

    return $rc
}
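# The node probes is_node_online/node_exist above assume "crm_mon -n1" output
# of roughly this form (an assumption about the Pacemaker versions this agent
# targets; the exact banner varies):
#
#   Node node1: online
#   Node node2: OFFLINE
#
# After lowercasing, is_node_online matches the node's line and rejects it if
# it contains "offline"; node_exist only checks that the line is present.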
# Validate most critical parameters
pgsql_validate_all() {
    local version
    local check_config_rc
    local rep_mode_string
    local socket_directories

    version=`cat $OCF_RESKEY_pgdata/PG_VERSION`

    if ! check_binary2 "$OCF_RESKEY_pgctl" || ! check_binary2 "$OCF_RESKEY_psql"; then
        return $OCF_ERR_INSTALLED
    fi

    check_config "$OCF_RESKEY_config"
    check_config_rc=$?
    [ $check_config_rc -eq 2 ] && return $OCF_ERR_INSTALLED
    if [ $check_config_rc -eq 0 ]; then
        ocf_version_cmp "$version" "9.3"
        if [ $? -eq 0 ]; then
            : ${OCF_RESKEY_socketdir=`get_pgsql_param unix_socket_directory`}
        else
            # unix_socket_directories is used by PostgreSQL 9.3 or higher.
            socket_directories=`get_pgsql_param unix_socket_directories`
            if [ -n "$socket_directories" ]; then
                # unix_socket_directories may list multiple socket directories,
                # and the pgsql RA cannot know which directory is used for the
                # psql command. Therefore, the user must set
                # OCF_RESKEY_socketdir explicitly.
                if [ -z "$OCF_RESKEY_socketdir" ]; then
-                    ocf_log err "In PostgreSQL 9.3 or higher, socketdir can't be empty if you define unix_socket_directories in the postgresql.conf."
+                    ocf_exit_reason "In PostgreSQL 9.3 or higher, socketdir can't be empty if you define unix_socket_directories in the postgresql.conf."
                    return $OCF_ERR_CONFIGURED
                fi
            fi
        fi
    fi

    getent passwd $OCF_RESKEY_pgdba >/dev/null 2>&1
    if [ ! $? -eq 0 ]; then
        ocf_exit_reason "User $OCF_RESKEY_pgdba doesn't exist";
        return $OCF_ERR_INSTALLED;
    fi

    if ocf_is_probe; then
        ocf_log info "Don't check $OCF_RESKEY_pgdata during probe"
    else
        if ! runasowner "test -w $OCF_RESKEY_pgdata"; then
            ocf_exit_reason "Directory $OCF_RESKEY_pgdata is not writable by $OCF_RESKEY_pgdba"
            return $OCF_ERR_PERM;
        fi
    fi

    if [ -n "$OCF_RESKEY_monitor_user" -a ! -n "$OCF_RESKEY_monitor_password" ]
    then
        ocf_exit_reason "monitor password can't be empty"
        return $OCF_ERR_CONFIGURED
    fi

    if [ ! -n "$OCF_RESKEY_monitor_user" -a -n "$OCF_RESKEY_monitor_password" ]
    then
        ocf_exit_reason "monitor_user has to be set if monitor_password is set"
        return $OCF_ERR_CONFIGURED
    fi

    if is_replication || [ "$OCF_RESKEY_rep_mode" = "slave" ]; then
        if [ `printf "$version\n9.1" | sort -n | head -1` != "9.1" ]; then
            ocf_exit_reason "Replication mode needs PostgreSQL 9.1 or higher."
            return $OCF_ERR_INSTALLED
        fi
        if [ ! -n "$OCF_RESKEY_master_ip" ]; then
            ocf_exit_reason "master_ip can't be empty."
            return $OCF_ERR_CONFIGURED
        fi
    fi

    if is_replication; then
        if ! ocf_is_ms; then
            ocf_exit_reason "Replication(rep_mode=async or sync) requires Master/Slave configuration."
            return $OCF_ERR_CONFIGURED
        fi
        if [ ! "$OCF_RESKEY_rep_mode" = "sync" -a ! "$OCF_RESKEY_rep_mode" = "async" ]; then
            ocf_exit_reason "Invalid rep_mode : $OCF_RESKEY_rep_mode"
            return $OCF_ERR_CONFIGURED
        fi
        if [ ! -n "$NODE_LIST" ]; then
            ocf_exit_reason "node_list can't be empty."
            return $OCF_ERR_CONFIGURED
        fi
        if [ $check_config_rc -eq 0 ]; then
            rep_mode_string="include '$REP_MODE_CONF' # added by pgsql RA"
            if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then
                if ! grep -q "$rep_mode_string" $OCF_RESKEY_config; then
                    ocf_log info "adding include directive into $OCF_RESKEY_config"
                    echo "$rep_mode_string" >> $OCF_RESKEY_config
                fi
            else
                if grep -q "$rep_mode_string" $OCF_RESKEY_config; then
                    ocf_log info "deleting include directive from $OCF_RESKEY_config"
                    rep_mode_string=`echo $rep_mode_string | sed -e 's|/|\\\\/|g'`
                    sed -i "/$rep_mode_string/d" $OCF_RESKEY_config
                fi
            fi
        fi

        if ! mkdir -p $OCF_RESKEY_tmpdir || ! chown $OCF_RESKEY_pgdba $OCF_RESKEY_tmpdir || ! chmod 700 $OCF_RESKEY_tmpdir; then
            ocf_exit_reason "Can't create directory $OCF_RESKEY_tmpdir or it is not readable by $OCF_RESKEY_pgdba"
            return $OCF_ERR_PERM
        fi
    fi

    if [ "$OCF_RESKEY_rep_mode" = "slave" ]; then
        if ocf_is_ms; then
            ocf_exit_reason "Replication(rep_mode=slave) does not support Master/Slave configuration."
            return $OCF_ERR_CONFIGURED
        fi
    fi

    if use_replication_slot; then
        ocf_version_cmp "$version" "9.4"
        if [ $? -eq 0 -o $? -eq 3 ]; then
            ocf_exit_reason "Replication slot needs PostgreSQL 9.4 or higher."
            return $OCF_ERR_CONFIGURED
        fi
    fi

    return $OCF_SUCCESS
}

#
# Check if we need to create a log file
#
check_log_file() {
    if [ ! -f "$1" ]
    then
        touch $1 > /dev/null 2>&1
        chown $OCF_RESKEY_pgdba:`getent passwd $OCF_RESKEY_pgdba | cut -d ":" -f 4` $1
    fi

    # Check if $OCF_RESKEY_pgdba can write to the log file
    if ! runasowner "test -w $1"
    then
        return 1
    fi

    return 0
}

#
# Check socket directory
#
check_socket_dir() {
    if [ ! -d "$OCF_RESKEY_socketdir" ]; then
        if ! mkdir "$OCF_RESKEY_socketdir"; then
            ocf_exit_reason "Can't create directory $OCF_RESKEY_socketdir"
            exit $OCF_ERR_PERM
        fi

        if ! chown $OCF_RESKEY_pgdba:`getent passwd \
             $OCF_RESKEY_pgdba | cut -d ":" -f 4` "$OCF_RESKEY_socketdir"
        then
            ocf_exit_reason "Can't change ownership for $OCF_RESKEY_socketdir"
            exit $OCF_ERR_PERM
        fi

        if ! chmod 2775 "$OCF_RESKEY_socketdir"; then
            ocf_exit_reason "Can't change permissions for $OCF_RESKEY_socketdir"
            exit $OCF_ERR_PERM
        fi
    else
        if ! runasowner "touch $OCF_RESKEY_socketdir/test.$$"; then
            ocf_exit_reason "$OCF_RESKEY_pgdba can't create files in $OCF_RESKEY_socketdir"
            exit $OCF_ERR_PERM
        fi
        rm $OCF_RESKEY_socketdir/test.$$
    fi
}

print_crm_mon() {
    if [ -z "$CRM_MON_OUTPUT" ]; then
        CRM_MON_OUTPUT=`exec_with_retry 0 crm_mon -n1`
    fi
    printf "${CRM_MON_OUTPUT}\n"
}
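# Minimal manual smoke test (hypothetical paths; in normal operation the
# cluster manager sets the OCF_* environment and invokes the agent):
#
#   OCF_ROOT=/usr/lib/ocf OCF_RESOURCE_INSTANCE=pgsql:0 \
#   OCF_RESKEY_pgdata=/var/lib/pgsql/data ./pgsql monitor; echo $?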
[ "$1" = "validate-all" ] && exit $rc if [ $rc -ne 0 ] then case "$1" in stop) if is_replication; then change_pgsql_status "$NODENAME" "UNKNOWN" fi exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $OCF_NOT_RUNNING;; *) exit $rc;; esac fi US=`id -u -n` if [ $US != root -a $US != $OCF_RESKEY_pgdba ] then ocf_exit_reason "$0 must be run as root or $OCF_RESKEY_pgdba" exit $OCF_ERR_GENERIC fi # make psql command options if [ -n "$OCF_RESKEY_monitor_user" ]; then PGUSER=$OCF_RESKEY_monitor_user; export PGUSER PGPASSWORD=$OCF_RESKEY_monitor_password; export PGPASSWORD psql_options="-p $OCF_RESKEY_pgport $OCF_RESKEY_pgdb" else psql_options="-p $OCF_RESKEY_pgport -U $OCF_RESKEY_pgdba $OCF_RESKEY_pgdb" fi if [ -n "$OCF_RESKEY_pghost" ]; then psql_options="$psql_options -h $OCF_RESKEY_pghost" else if [ -n "$OCF_RESKEY_socketdir" ]; then psql_options="$psql_options -h $OCF_RESKEY_socketdir" fi fi if [ -n "$OCF_RESKEY_pgport" ]; then export PGPORT=$OCF_RESKEY_pgport fi if [ -n "$OCF_RESKEY_pglibs" ]; then if [ -n "$LD_LIBRARY_PATH" ]; then export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$OCF_RESKEY_pglibs else export LD_LIBRARY_PATH=$OCF_RESKEY_pglibs fi fi # What kind of method was invoked? case "$1" in status) if pgsql_status then ocf_log info "PostgreSQL is up" exit $OCF_SUCCESS else ocf_log info "PostgreSQL is down" exit $OCF_NOT_RUNNING fi;; monitor) pgsql_monitor exit $?;; start) pgsql_start exit $?;; promote) pgsql_promote exit $?;; demote) pgsql_demote exit $?;; notify) pgsql_notify exit $?;; stop) pgsql_stop exit $?;; *) exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/pingd b/heartbeat/pingd index 34ed704c9..205bda66c 100755 --- a/heartbeat/pingd +++ b/heartbeat/pingd @@ -1,279 +1,279 @@ #!/bin/sh # # # pingd OCF Resource Agent # Records (in the CIB) the current number of ping nodes a # cluster node can connect to. # # Copyright (c) 2006 Andrew Beekhof # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### meta_data() { cat < 1.0 Deprecation warning: This agent is deprecated and may be removed from a future release. See the ocf:pacemaker:pingd resource agent for a supported alternative. -- This is a pingd Resource Agent. It records (in the CIB) the current number of ping nodes a node can connect to. 
diff --git a/heartbeat/pingd b/heartbeat/pingd
index 34ed704c9..205bda66c 100755
--- a/heartbeat/pingd
+++ b/heartbeat/pingd
@@ -1,279 +1,279 @@
#!/bin/sh
#
#
#	pingd OCF Resource Agent
#	Records (in the CIB) the current number of ping nodes a
#	cluster node can connect to.
#
# Copyright (c) 2006 Andrew Beekhof
#                    All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

#######################################################################

meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="pingd">
<version>1.0</version>

<longdesc lang="en">
Deprecation warning: This agent is deprecated and may be removed from
a future release. See the ocf:pacemaker:pingd resource agent for a
supported alternative. --
This is a pingd Resource Agent.
It records (in the CIB) the current number of ping nodes a node can connect to.
</longdesc>
<shortdesc lang="en">Monitors connectivity to specific hosts or IP addresses ("ping nodes") (deprecated)</shortdesc>

<parameters>

<parameter name="pidfile" unique="0">
<longdesc lang="en">PID file</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="$HA_RSCTMP/pingd-${OCF_RESOURCE_INSTANCE}" />
</parameter>

<parameter name="user" unique="0">
<longdesc lang="en">
The user we want to run pingd as
</longdesc>
<shortdesc lang="en">The user we want to run pingd as</shortdesc>
<content type="string" />
</parameter>

<parameter name="dampen" unique="0">
<longdesc lang="en">
The time to wait (dampening) for further changes to occur
</longdesc>
<shortdesc lang="en">Dampening interval</shortdesc>
<content type="string" default="1s" />
</parameter>

<parameter name="set" unique="0">
<longdesc lang="en">
The name of the instance_attributes set to place the value in. Rarely needs to be specified.
</longdesc>
<shortdesc lang="en">Set name</shortdesc>
<content type="string" />
</parameter>

<parameter name="name" unique="0">
<longdesc lang="en">
The name of the attributes to set. This is the name to be used in the constraints.
</longdesc>
<shortdesc lang="en">Attribute name</shortdesc>
<content type="string" default="pingd" />
</parameter>

<parameter name="section" unique="0">
<longdesc lang="en">
The section to place the value in. Rarely needs to be specified.
</longdesc>
<shortdesc lang="en">Section name</shortdesc>
<content type="string" />
</parameter>

<parameter name="multiplier" unique="0">
<longdesc lang="en">
The number by which to multiply the number of connected ping nodes
</longdesc>
<shortdesc lang="en">Value multiplier</shortdesc>
<content type="string" />
</parameter>

<parameter name="host_list" unique="0">
<longdesc lang="en">
The list of ping nodes to count. Defaults to all configured ping nodes. Rarely needs to be specified.
</longdesc>
<shortdesc lang="en">Host list</shortdesc>
<content type="string" />
</parameter>

<parameter name="ignore_deprecation">
<longdesc lang="en">
If set to true, suppresses the deprecation warning for this agent.
</longdesc>
<shortdesc lang="en">Suppress deprecation warning</shortdesc>
<content type="boolean" default="false" />
</parameter>

</parameters>

<actions>
<action name="start"   timeout="90s" />
<action name="stop"    timeout="100s" />
<action name="monitor" depth="0" timeout="20s" interval="10s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="30s" />
</actions>
</resource-agent>
END
}

#######################################################################

pingd_usage() {
	cat <<END
usage: $0 {start|stop|monitor|meta-data|validate-all}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

pingd_validate() {
    if [ -n "$OCF_RESKEY_user" ]; then
	getent passwd $OCF_RESKEY_user >/dev/null
	if [ $? -eq 0 ]; then
	    : Yes, user exists. We can further check his permission on crm_mon if necessary
	else
-	    ocf_log err "The user $OCF_RESKEY_user does not exist!"
+	    ocf_exit_reason "The user $OCF_RESKEY_user does not exist!"
	    exit $OCF_ERR_ARGS
	fi
    fi

    # Pidfile better be an absolute path
    case $OCF_RESKEY_pidfile in
	/*) ;;
	*) ocf_log warn "You should have pidfile($OCF_RESKEY_pidfile) of absolute path!" ;;
    esac

    # Check the update interval
    if ocf_is_decimal "$OCF_RESKEY_update" && [ $OCF_RESKEY_update -gt 0 ]; then
	:
    else
-	ocf_log err "Invalid update interval $OCF_RESKEY_update. It should be positive integer!"
+	ocf_exit_reason "Invalid update interval $OCF_RESKEY_update. It should be a positive integer!"
	exit $OCF_ERR_ARGS
    fi

    echo "Validate OK"
    return $OCF_SUCCESS
}

if [ $# -ne 1 ]; then
    pingd_usage
    exit $OCF_ERR_ARGS
fi

: ${OCF_RESKEY_pidfile:="$HA_RSCTMP/pingd-${OCF_RESOURCE_INSTANCE}"}
: ${OCF_RESKEY_name:="pingd"}
: ${OCF_RESKEY_dampen:="1s"}

if [ "$__OCF_ACTION" = "meta-data" ]; then
    meta_data
    exit $OCF_SUCCESS
fi

# Be obnoxious, log deprecation warning on every invocation (unless
# suppressed by resource configuration).
ocf_deprecated

case $__OCF_ACTION in
    start)        pingd_start
                  ;;
    stop)         pingd_stop
                  ;;
    monitor)      pingd_monitor
                  ;;
    validate-all) pingd_validate
                  ;;
    usage|help)   pingd_usage
                  exit $OCF_SUCCESS
                  ;;
    *)            pingd_usage
                  exit $OCF_ERR_UNIMPLEMENTED
                  ;;
esac

exit $?