diff --git a/heartbeat/CTDB b/heartbeat/CTDB index 6cfff63db..63f636dc6 100755 --- a/heartbeat/CTDB +++ b/heartbeat/CTDB @@ -1,762 +1,770 @@ #!/bin/sh # # OCF Resource Agent for managing CTDB # # Copyright (c) 2009-2010 Novell Inc., Tim Serong # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # # OVERVIEW # # When run by itself, CTDB can handle IP failover and includes scripts # to manage various services (Samba, Winbind, HTTP, etc.). When run as # a resource in a Pacemaker cluster, this additional functionality # should not be used; instead one should define separate resources for # CTDB, Samba, Winbind, IP addresses, etc. # # As of 2010-11-17, there is no separate OCF Samba or Winbind RA, so # it is still possible to configure CTDB so that it manages these # resources itself. In future, once Samba and Winbind RAs are # available, this ability will be deprecated and ultimately removed. # # This RA intentionally provides no ability to configure CTDB such that # it manages IP failover, HTTP, NFS, etc. # # # TODO: # - ctdb_stop doesn't really support multiple independent CTDB instances, # unless they're running from distinct ctdbd binaries (it uses pkill # $OCF_RESKEY_ctdbd_binary if "ctdb stop" doesn't work, which it might # not under heavy load - this will kill all ctdbd instances on the # system). OTOH, running multiple CTDB instances per node is, well, # AFAIK, completely crazy. Can't run more than one in a vanilla CTDB # cluster, with the CTDB init script. So it might be nice to address # this for complete semantic correctness of the RA, but shouldn't # actually cause any trouble in real life. # - As much as possible, get rid of auto config generation # - Especially smb.conf # - Verify timeouts are sane # - Monitor differentiate between error and not running? # - Do we need to verify globally unique setting? # - Should set CTDB_NODES to ${HA_RSCTMP}/ctdb (generated based on # current nodes) # - Look at enabling set_ctdb_variables() if necessary. # - Probably possible for sysconfig file to not be restored if # CTDB dies unexpectedly. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### # Default parameter values: # Some distro's ctdb package stores the persistent db in /var/lib/ctdb, # others store in /var/ctdb. This attempts to detect the correct default # directory. var_prefix="/var/lib/ctdb" if [ ! 
-d "$var_prefix" ] && [ -d "/var/ctdb" ]; then var_prefix="/var/ctdb" fi run_prefix="/run" if [ ! -d "$var_prefix" ] && [ -d "/var/run" ]; then var_prefix="/var/run" fi : ${OCF_RESKEY_ctdb_manages_samba:=no} : ${OCF_RESKEY_ctdb_manages_winbind:=no} : ${OCF_RESKEY_ctdb_service_smb:=""} : ${OCF_RESKEY_ctdb_service_nmb:=""} : ${OCF_RESKEY_ctdb_service_winbind:=""} : ${OCF_RESKEY_ctdb_samba_skip_share_check:=yes} : ${OCF_RESKEY_ctdb_monitor_free_memory:=100} : ${OCF_RESKEY_ctdb_start_as_disabled:=no} : ${OCF_RESKEY_ctdb_config_dir:=/etc/ctdb} : ${OCF_RESKEY_ctdb_binary:=/usr/bin/ctdb} : ${OCF_RESKEY_ctdbd_binary:=/usr/sbin/ctdbd} : ${OCF_RESKEY_ctdb_dbdir:=${var_prefix}} : ${OCF_RESKEY_ctdb_logfile:=/var/log/ctdb/log.ctdb} : ${OCF_RESKEY_ctdb_rundir:=${run_prefix}/ctdb} : ${OCF_RESKEY_ctdb_socket:=${OCF_RESKEY_ctdb_rundir}/ctdbd.socket} : ${OCF_RESKEY_ctdb_debuglevel:=2} : ${OCF_RESKEY_smb_conf:=/etc/samba/smb.conf} : ${OCF_RESKEY_smb_passdb_backend:=tdbsam} : ${OCF_RESKEY_smb_idmap_backend:=tdb2} ####################################################################### meta_data() { cat < 1.0 This resource agent manages CTDB, allowing one to use Clustered Samba in a Linux-HA/Pacemaker cluster. You need a shared filesystem (e.g. OCFS2 or GFS2) on which the CTDB lock will be stored. Create /etc/ctdb/nodes containing a list of private IP addresses of each node in the cluster, then configure this RA as a clone. This agent expects the samba and windbind resources to be managed outside of CTDB's control as a separate set of resources controlled by the cluster manager. The optional support for enabling CTDB management of these daemons will be depreciated. For more information see http://linux-ha.org/wiki/CTDB_(resource_agent) CTDB Resource Agent The location of a shared lock file, common across all nodes. This must be on shared storage, e.g.: /shared-fs/samba/ctdb.lock CTDB shared lock file Should CTDB manage starting/stopping the Samba service for you? This will be deprecated in future, in favor of configuring a separate Samba resource. Should CTDB manage Samba? Should CTDB manage starting/stopping the Winbind service for you? This will be deprecated in future, in favor of configuring a separate Winbind resource. Should CTDB manage Winbind? Name of smb init script. Only necessary if CTDB is managing Samba directly. Will usually be auto-detected. Name of smb init script Name of nmb init script. Only necessary if CTDB is managing Samba directly. Will usually be auto-detected. Name of nmb init script Name of winbind init script. Only necessary if CTDB is managing Winbind directly. Will usually be auto-detected. Name of winbind init script If there are very many shares it may not be feasible to check that all of them are available during each monitoring interval. In that case this check can be disabled. Skip share check during monitor? If the amount of free memory drops below this value the node will become unhealthy and ctdb and all managed services will be shutdown. Once this occurs, the administrator needs to find the reason for the OOM situation, rectify it and restart ctdb with "service ctdb start". Minimum amount of free memory (MB) When set to yes, the CTDB node will start in DISABLED mode and not host any public ip addresses. Start CTDB disabled? The directory containing various CTDB configuration files. The "nodes" and "notify.sh" scripts are expected to be in this directory, as is the "events.d" subdirectory. CTDB config file directory Full path to the CTDB binary. 
CTDB binary path Full path to the CTDB cluster daemon binary. CTDB Daemon binary path Full path to the domain socket that ctdbd will create, used for local clients to attach and communicate with the ctdb daemon. CTDB socket location The directory to put the local CTDB database files in. Persistent database files will be put in ctdb_dbdir/persistent. CTDB database directory Full path to log file. To log to syslog instead, use the value "syslog". CTDB log file location Full path to ctdb runtime directory, used for storage of socket lock state. CTDB runtime directory location What debug level to run at (0-10). Higher means more verbose. CTDB debug level Path to default samba config file. Only necessary if CTDB is managing Samba. Path to smb.conf The directory for smbd to use for storing such files as smbpasswd and secrets.tdb. Old versions of CTBD (prior to 1.0.50) required this to be on shared storage. This parameter should not be set for current versions of CTDB, and only remains in the RA for backwards compatibility. Samba private dir (deprecated) Which backend to use for storing user and possibly group information. Only necessary if CTDB is managing Samba. Samba passdb backend Which backend to use for SID/uid/gid mapping. Only necessary if CTDB is managing Samba. Samba idmap backend Which fileid:algorithm to use with vfs_fileid. The correct value depends on which clustered filesystem is in use, e.g.: for OCFS2, this should be set to "fsid". Only necessary if CTDB is managing Samba. Samba VFS fileid algorithm END } ####################################################################### # Figure out path to /etc/sysconfig/ctdb (same logic as # loadconfig() from /etc/ctdb/functions if [ -f /etc/sysconfig/ctdb ]; then CTDB_SYSCONFIG=/etc/sysconfig/ctdb elif [ -f /etc/default/ctdb ]; then CTDB_SYSCONFIG=/etc/default/ctdb -elif [ -f $OCF_RESKEY_ctdb_config_dir/ctdb ]; then +elif [ -f "$OCF_RESKEY_ctdb_config_dir/ctdb" ]; then CTDB_SYSCONFIG=$OCF_RESKEY_ctdb_config_dir/ctdb fi # Backup paths CTDB_SYSCONFIG_BACKUP=${CTDB_SYSCONFIG}.ctdb-ra-orig invoke_ctdb() { # CTDB's defaults are: - local timeout=3 - local timelimit=120 + local timeout + local timelimit + timeout=3 + timelimit=120 # ...but we override with the timeout for the current op: if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then timeout=$((OCF_RESKEY_CRM_meta_timeout/1000)) timelimit=$((OCF_RESKEY_CRM_meta_timeout/1000)) fi - $OCF_RESKEY_ctdb_binary --socket=$OCF_RESKEY_ctdb_socket \ + $OCF_RESKEY_ctdb_binary --socket="$OCF_RESKEY_ctdb_socket" \ -t $timeout -T $timelimit \ "$@" } # Enable any event scripts that are explicitly required. # Any others will ultimately be invoked or not based on how they ship # with CTDB, but will generally have no effect, beacuase the relevant # CTDB_MANAGES_* options won't be set in /etc/sysconfig/ctdb. 
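# An illustrative aside on invoke_ctdb() above: Pacemaker supplies the
# operation timeout in OCF_RESKEY_CRM_meta_timeout as milliseconds, while
# "ctdb -t/-T" expect seconds, hence the division by 1000. A minimal
# standalone sketch (hypothetical variable names, not used by the agent):
example_meta_timeout_ms=90000                         # e.g. a 90s op timeout
example_timeout_s=$((example_meta_timeout_ms / 1000)) # 90000 ms -> 90 s
echo "ctdb would be called with -t $example_timeout_s -T $example_timeout_s"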
enable_event_scripts() { - local event_dir=$OCF_RESKEY_ctdb_config_dir/events.d + local event_dir + event_dir=$OCF_RESKEY_ctdb_config_dir/events.d if [ -f "${OCF_RESKEY_ctdb_config_dir}/public_addresses" ]; then - chmod u+x $event_dir/10.interface + chmod u+x "$event_dir/10.interface" else - chmod a-x $event_dir/10.interface + chmod a-x "$event_dir/10.interface" fi if [ -f "${OCF_RESKEY_ctdb_config_dir}/static-routes" ]; then - chmod u+x $event_dir/11.routing + chmod u+x "$event_dir/11.routing" else - chmod a-x $event_dir/11.routing + chmod a-x "$event_dir/11.routing" fi if ocf_is_true "$OCF_RESKEY_ctdb_manages_samba" || \ ocf_is_true "$OCF_RESKEY_ctdb_manages_winbind"; then - chmod u+x $event_dir/50.samba + chmod u+x "$event_dir/50.samba" else - chmod a-x $event_dir/50.samba + chmod a-x "$event_dir/50.samba" fi } # This function has no effect (currently no way to set CTDB_SET_*) # but remains here in case we need it in future. set_ctdb_variables() { rv=$OCF_SUCCESS set | grep ^CTDB_SET_ | cut -d_ -f3- | while read v; do - varname=`echo $v | cut -d= -f1` - value=`echo $v | cut -d= -f2` - invoke_ctdb setvar $varname $value || rv=$OCF_ERR_GENERIC + varname=$(echo "$v" | cut -d= -f1) + value=$(echo "$v" | cut -d= -f2) + invoke_ctdb setvar "$varname" "$value" || rv=$OCF_ERR_GENERIC done || rv=$OCF_ERR_GENERIC return $rv } # Add necessary settings to /etc/samba/smb.conf. In a perfect world, # we'd be able to generate a new, temporary, smb.conf file somewhere, # something like: # include = /etc/samba/smb.conf # [global] # clustering = yes # # ...etc... # Unfortunately, we can't do this, because there's no way to tell the # smb init script where the temporary config is, so we just edit # the default config file. init_smb_conf() { # Don't screw around with the config if CTDB isn't managing Samba! 
ocf_is_true "$OCF_RESKEY_ctdb_manages_samba" || return 0 # replace these things in smb.conf - local repl='# CTDB-RA:|passdb backend|clustering|idmap backend|private dir|ctdbd socket' + local repl + repl='# CTDB-RA:|passdb backend|clustering|idmap backend|private dir|ctdbd socket' local private_dir [ -n "$OCF_RESKEY_smb_private_dir" ] && private_dir="\tprivate dir = $OCF_RESKEY_smb_private_dir\n" local vfs_fileid - local do_vfs=0 + local do_vfs + do_vfs=0 if [ -n "$OCF_RESKEY_smb_fileid_algorithm" ]; then repl="${repl}|fileid:algorithm|fileid:mapping" vfs_fileid="\tfileid:algorithm = $OCF_RESKEY_smb_fileid_algorithm\n" if sed -n '/^[[:space:]]*\[global\]/,/^[[:space:]]*\[/p' $OCF_RESKEY_smb_conf | \ grep -Eq '^[[:space:]]*vfs objects'; then # vfs objects already specified, will append fileid to existing line do_vfs=1 else vfs_fileid="$vfs_fileid\tvfs objects = fileid\n" fi fi # Preserve permissions of smb.conf - cp -a $OCF_RESKEY_smb_conf $OCF_RESKEY_smb_conf.$$ + cp -a "$OCF_RESKEY_smb_conf" "$OCF_RESKEY_smb_conf.$$" awk ' /^[[:space:]]*\[/ { global = 0 } /^[[:space:]]*\[global\]/ { global = 1 } { if(global) { if ('$do_vfs' && $0 ~ /^[[:space:]]vfs objects/ && $0 !~ /fileid/) { print $0" fileid" } else if ($0 !~ /^[[:space:]]*('"$repl"')/) { print } } else { print } - }' $OCF_RESKEY_smb_conf | sed "/^[[:space:]]*\[global\]/ a\\ + }' "$OCF_RESKEY_smb_conf" | sed "/^[[:space:]]*\[global\]/ a\\ \t# CTDB-RA: Begin auto-generated section (do not change below)\n\ \tpassdb backend = $OCF_RESKEY_smb_passdb_backend\n\ \tclustering = yes\n\ \tidmap backend = $OCF_RESKEY_smb_idmap_backend\n\ \tctdbd socket = $OCF_RESKEY_ctdb_socket\n$private_dir$vfs_fileid\ -\t# CTDB-RA: End auto-generated section (do not change above)" > $OCF_RESKEY_smb_conf.$$ - dd conv=notrunc,fsync of=$OCF_RESKEY_smb_conf.$$ if=/dev/null >/dev/null 2>&1 - mv $OCF_RESKEY_smb_conf.$$ $OCF_RESKEY_smb_conf +\t# CTDB-RA: End auto-generated section (do not change above)" > "$OCF_RESKEY_smb_conf.$$" + dd conv=notrunc,fsync of="$OCF_RESKEY_smb_conf.$$" if=/dev/null >/dev/null 2>&1 + mv "$OCF_RESKEY_smb_conf.$$" "$OCF_RESKEY_smb_conf" } # Get rid of that section we added cleanup_smb_conf() { ocf_is_true "$OCF_RESKEY_ctdb_manages_samba" || return 0 # preserve permissions of smb.conf - cp -a $OCF_RESKEY_smb_conf $OCF_RESKEY_smb_conf.$$ - sed '/# CTDB-RA: Begin/,/# CTDB-RA: End/d' $OCF_RESKEY_smb_conf > $OCF_RESKEY_smb_conf.$$ - mv $OCF_RESKEY_smb_conf.$$ $OCF_RESKEY_smb_conf + cp -a "$OCF_RESKEY_smb_conf" "$OCF_RESKEY_smb_conf.$$" + sed '/# CTDB-RA: Begin/,/# CTDB-RA: End/d' "$OCF_RESKEY_smb_conf" > "$OCF_RESKEY_smb_conf.$$" + mv "$OCF_RESKEY_smb_conf.$$" "$OCF_RESKEY_smb_conf" } append_ctdb_sysconfig() { - [ -n "$2" ] && echo "$1=$2" >> $CTDB_SYSCONFIG + [ -n "$2" ] && echo "$1=$2" >> "$CTDB_SYSCONFIG" } # Generate a new, minimal CTDB config file that's just enough # to get CTDB running as configured by the RA parameters. generate_ctdb_sysconfig() { # Backup existing sysconfig if we're not already using an auto-generated one grep -qa '# CTDB-RA: Auto-generated' $CTDB_SYSCONFIG || cp -p $CTDB_SYSCONFIG $CTDB_SYSCONFIG_BACKUP if [ $? -ne 0 ]; then ocf_log warn "Unable to backup $CTDB_SYSCONFIG to $CTDB_SYSCONFIG_BACKUP" fi ocf_log info "Generating new $CTDB_SYSCONFIG" # Note to maintainers and other random hackers: # Parameters may need to be set here, for CTDB event # scripts to pick up, or may need to be passed to ctdbd # when starting, or both. Be careful. The CTDB source # tree and manpages are your friends. 
As a concrete # example, setting CTDB_START_AS_DISABLED here is # completely useless, as this is actually a command line # argument for ctdbd; it's not used anywhere else. cat >$CTDB_SYSCONFIG </dev/null - for pdbase in $(ls $persistent_db_dir/*.tdb.[0-9] 2>/dev/null$) ; do - /usr/bin/tdbdump $pdbase >/dev/null 2>/dev/null || { + for pdbase in $persistent_db_dir/*.tdb.[0-9]; do + /usr/bin/tdbdump "$pdbase" >/dev/null 2>/dev/null || { ocf_exit_reason "Persistent database $pdbase is corrupted! CTDB will not start." return $OCF_ERR_GENERIC } done # Add necessary configuration to smb.conf init_smb_conf if [ $? -ne 0 ]; then ocf_exit_reason "Failed to update $OCF_RESKEY_smb_conf." return $OCF_ERR_GENERIC fi # Generate new CTDB sysconfig generate_ctdb_sysconfig enable_event_scripts # Use logfile by default, or syslog if asked for - local log_option="--logfile=$OCF_RESKEY_ctdb_logfile" + local log_option + log_option="--logfile=$OCF_RESKEY_ctdb_logfile" if [ "$OCF_RESKEY_ctdb_logfile" = "syslog" ]; then log_option="--syslog" elif [ ! -d "$(dirname $OCF_RESKEY_ctdb_logfile)" ]; then # ensure the logfile's directory exists, otherwise ctdb will fail to start mkdir -p $(dirname $OCF_RESKEY_ctdb_logfile) fi # ensure ctdb's rundir exists, otherwise it will fail to start mkdir -p $OCF_RESKEY_ctdb_rundir 2>/dev/null # public addresses file (should not be present, but need to set for correctness if it is) - local pub_addr_option="" + local pub_addr_option + pub_addr_option="" [ -f "${OCF_RESKEY_ctdb_config_dir}/public_addresses" ] && \ pub_addr_option="--public-addresses=${OCF_RESKEY_ctdb_config_dir}/public_addresses" # start as disabled - local start_as_disabled="--start-as-disabled" + local start_as_disabled + start_as_disabled="--start-as-disabled" ocf_is_true "$OCF_RESKEY_ctdb_start_as_disabled" || start_as_disabled="" # Start her up - $OCF_RESKEY_ctdbd_binary \ - --reclock=$OCF_RESKEY_ctdb_recovery_lock \ - --nlist=$OCF_RESKEY_ctdb_config_dir/nodes \ - --socket=$OCF_RESKEY_ctdb_socket \ - --dbdir=$OCF_RESKEY_ctdb_dbdir \ - --dbdir-persistent=$OCF_RESKEY_ctdb_dbdir/persistent \ - --event-script-dir=$OCF_RESKEY_ctdb_config_dir/events.d \ - --notification-script=$OCF_RESKEY_ctdb_config_dir/notify.sh \ + "$OCF_RESKEY_ctdbd_binary" \ + --reclock="$OCF_RESKEY_ctdb_recovery_lock" \ + --nlist="$OCF_RESKEY_ctdb_config_dir/nodes" \ + --socket="$OCF_RESKEY_ctdb_socket" \ + --dbdir="$OCF_RESKEY_ctdb_dbdir" \ + --dbdir-persistent="$OCF_RESKEY_ctdb_dbdir/persistent" \ + --event-script-dir="$OCF_RESKEY_ctdb_config_dir/events.d" \ + --notification-script="$OCF_RESKEY_ctdb_config_dir/notify.sh" \ --transport=tcp \ $start_as_disabled $log_option $pub_addr_option \ - -d $OCF_RESKEY_ctdb_debuglevel + -d "$OCF_RESKEY_ctdb_debuglevel" if [ $? -ne 0 ]; then # cleanup smb.conf cleanup_smb_conf ocf_exit_reason "Failed to execute $OCF_RESKEY_ctdbd_binary." return $OCF_ERR_GENERIC else # Wait a bit for CTDB to stabilize # (until start times out if necessary) while true; do # Initial sleep is intentional (ctdb init script # has sleep after ctdbd start, but before invoking # ctdb to talk to it) sleep 1 status=$(invoke_ctdb status 2>/dev/null) if [ $? -ne 0 ]; then # CTDB will be running, kill it before returning ctdb_stop ocf_exit_reason "Can't invoke $OCF_RESKEY_ctdb_binary --socket=$OCF_RESKEY_ctdb_socket status" return $OCF_ERR_GENERIC fi - if ! echo $status | grep -qs 'UNHEALTHY (THIS'; then + if ! echo "$status" | grep -qs 'UNHEALTHY (THIS'; then # Status does not say this node is unhealthy, # so we're good to go. 
Do a bit of final # setup and (hopefully) return success. set_ctdb_variables return $? fi done fi # ctdbd will (or can) actually still be running at this point, so kill it ctdb_stop ocf_exit_reason "Timeout waiting for CTDB to stabilize" return $OCF_ERR_GENERIC } ctdb_stop() { # Do nothing if already stopped - pkill -0 -f $OCF_RESKEY_ctdbd_binary || return $OCF_SUCCESS + pkill -0 -f "$OCF_RESKEY_ctdbd_binary" || return $OCF_SUCCESS # Tell it to die nicely invoke_ctdb shutdown >/dev/null 2>&1 rv=$? # No more Mr. Nice Guy count=0 - while pkill -0 -f $OCF_RESKEY_ctdbd_binary ; do + while pkill -0 -f "$OCF_RESKEY_ctdbd_binary" ; do sleep 1 - count=$(($count + 1)) + count=$((count + 1)) [ $count -gt 10 ] && { ocf_log info "killing ctdbd " - pkill -9 -f $OCF_RESKEY_ctdbd_binary - pkill -9 -f ${OCF_RESKEY_ctdb_config_dir}/events.d/ + pkill -9 -f "$OCF_RESKEY_ctdbd_binary" + pkill -9 -f "${OCF_RESKEY_ctdb_config_dir}/events.d/" } done # Cleanup smb.conf cleanup_smb_conf # It was a clean shutdown, return success [ $rv -eq $OCF_SUCCESS ] && return $OCF_SUCCESS # Unclean shutdown, return success if there's no ctdbds left (we # killed them forcibly, but at least they're good and dead). - pkill -0 -f $OCF_RESKEY_ctdbd_binary || return $OCF_SUCCESS + pkill -0 -f "$OCF_RESKEY_ctdbd_binary" || return $OCF_SUCCESS # Problem: ctdb shutdown didn't work and neither did some vigorous # kill -9ing. Only thing to do is report failure. return $OCF_ERR_GENERIC } ctdb_monitor() { local status # "ctdb status" exits non-zero if CTDB isn't running. # It can also exit non-zero if there's a timeout (ctdbd blocked, # stalled, massive load, or otherwise wedged). If it's actually # not running, STDERR will say "Errno:Connection refused(111)", # whereas if it's wedged, it'll say various other unpleasant things. status=$(invoke_ctdb status 2>&1) if [ $? -ne 0 ]; then - if echo $status | grep -qs 'Connection refused'; then + if echo "$status" | grep -qs 'Connection refused'; then return $OCF_NOT_RUNNING - elif echo $status | grep -qs 'No such file or directory'; then + elif echo "$status" | grep -qs 'No such file or directory'; then return $OCF_NOT_RUNNING else ocf_exit_reason "CTDB status call failed: $status" return $OCF_ERR_GENERIC fi fi - if echo $status | grep -Eqs '(OK|DISABLED) \(THIS'; then + if echo "$status" | grep -Eqs '(OK|DISABLED) \(THIS'; then return $OCF_SUCCESS fi ocf_exit_reason "CTDB status is bad: $status" return $OCF_ERR_GENERIC } ctdb_validate() { # Required binaries (full path to tdbdump is intentional, as that's # what's used in ctdb_start, which was lifted from the init script) for binary in pkill /usr/bin/tdbdump; do check_binary $binary done if [ -z "$CTDB_SYSCONFIG" ]; then ocf_exit_reason "Can't find CTDB config file (expecting /etc/sysconfig/ctdb, /etc/default/ctdb or similar)" return $OCF_ERR_INSTALLED fi if ocf_is_true "$OCF_RESKEY_ctdb_manages_samba" && [ ! -f "$OCF_RESKEY_smb_conf" ]; then ocf_exit_reason "Samba config file '$OCF_RESKEY_smb_conf' does not exist." return $OCF_ERR_INSTALLED fi if [ -f "${OCF_RESKEY_ctdb_config_dir}/public_addresses" ]; then ocf_log warn "CTDB file '${OCF_RESKEY_ctdb_config_dir}/public_addresses' exists - CTDB will try to manage IP failover!" fi if [ ! -f "$OCF_RESKEY_ctdb_config_dir/nodes" ]; then ocf_exit_reason "$OCF_RESKEY_ctdb_config_dir/nodes does not exist." return $OCF_ERR_ARGS fi if [ -z "$OCF_RESKEY_ctdb_recovery_lock" ]; then ocf_exit_reason "ctdb_recovery_lock not specified." 
return $OCF_ERR_CONFIGURED fi lock_dir=$(dirname "$OCF_RESKEY_ctdb_recovery_lock") touch "$lock_dir/$$" 2>/dev/null if [ $? != 0 ]; then ocf_exit_reason "Directory for lock file '$OCF_RESKEY_ctdb_recovery_lock' does not exist, or is not writable." return $OCF_ERR_ARGS fi rm "$lock_dir/$$" return $OCF_SUCCESS } case $__OCF_ACTION in meta-data) meta_data exit $OCF_SUCCESS ;; start) ctdb_start;; stop) ctdb_stop;; monitor) ctdb_monitor;; validate-all) ctdb_validate;; usage|help) ctdb_usage exit $OCF_SUCCESS ;; *) ctdb_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" exit $rc diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem index 96673f90d..9baf14073 100755 --- a/heartbeat/Filesystem +++ b/heartbeat/Filesystem @@ -1,883 +1,883 @@ #!/bin/sh # # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # # Filesystem # Description: Manages a Filesystem on a shared storage medium. # Original Author: Eric Z. Ayers (eric.ayers@compgen.com) # Original Release: 25 Oct 2000 # # usage: ./Filesystem {start|stop|status|monitor|validate-all|meta-data} # # OCF parameters are as below: # OCF_RESKEY_device # OCF_RESKEY_directory # OCF_RESKEY_fstype # OCF_RESKEY_options # OCF_RESKEY_statusfile_prefix # OCF_RESKEY_run_fsck # OCF_RESKEY_fast_stop # OCF_RESKEY_force_clones # #OCF_RESKEY_device : name of block device for the filesystem. e.g. /dev/sda1, /dev/md0 # Or a -U or -L option for mount, or an NFS mount specification #OCF_RESKEY_directory : the mount point for the filesystem #OCF_RESKEY_fstype : optional name of the filesystem type. e.g. ext2 #OCF_RESKEY_options : options to be given to the mount command via -o #OCF_RESKEY_statusfile_prefix : the prefix used for a status file for monitoring #OCF_RESKEY_run_fsck : fsck execution mode: auto(default)/force/no #OCF_RESKEY_fast_stop : fast stop: yes(default)/no #OCF_RESKEY_force_clones : allow running the resource as clone. e.g. local xfs mounts # for each brick in a glusterfs setup # # # This assumes you want to manage a filesystem on a shared (SCSI) bus, # on a replicated device (such as DRBD), or a network filesystem (such # as NFS or Samba). # # Do not put this filesystem in /etc/fstab. This script manages all of # that for you. # # NOTE: If 2 or more nodes mount the same file system read-write, and # that file system is not designed for that specific purpose # (such as GFS or OCFS2), and is not a network file system like # NFS or Samba, then the filesystem is going to become # corrupted. # # As a result, you should use this together with the stonith # option and redundant, independent communications paths. # # If you don't do this, don't blame us when you scramble your # disk. ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults DFLT_STATUSDIR=".Filesystem_status/" # Variables used by multiple methods HOSTOS=`uname` # The status file is going to an extra directory, by default # prefix=${OCF_RESKEY_statusfile_prefix} : ${prefix:=$DFLT_STATUSDIR} suffix="${OCF_RESOURCE_INSTANCE}" [ "$OCF_RESKEY_CRM_meta_clone" ] && suffix="${suffix}_$OCF_RESKEY_CRM_meta_clone" suffix="${suffix}_`uname -n`" STATUSFILE=${OCF_RESKEY_directory}/$prefix$suffix ####################################################################### usage() { cat <<-EOT usage: $0 {start|stop|status|monitor|validate-all|meta-data} EOT } meta_data() { cat < 1.1 Resource script for Filesystem. It manages a Filesystem on a shared storage medium. The standard monitor operation of depth 0 (also known as probe) checks if the filesystem is mounted. If you want deeper tests, set OCF_CHECK_LEVEL to one of the following values: 10: read first 16 blocks of the device (raw read) This doesn't exercise the filesystem at all, but the device on which the filesystem lives. This is noop for non-block devices such as NFS, SMBFS, or bind mounts. 20: test if a status file can be written and read The status file must be writable by root. This is not always the case with an NFS mount, as NFS exports usually have the "root_squash" option set. In such a setup, you must either use read-only monitoring (depth=10), export with "no_root_squash" on your NFS server, or grant world write permissions on the directory where the status file is to be placed. Manages filesystem mounts The name of block device for the filesystem, or -U, -L options for mount, or NFS mount specification. block device The mount point for the filesystem. mount point The type of filesystem to be mounted. filesystem type Any extra options to be given as -o options to mount. For bind mounts, add "bind" here and set fstype to "none". We will do the right thing for options such as "bind,ro". options The prefix to be used for a status file for resource monitoring with depth 20. If you don't specify this parameter, all status files will be created in a separate directory. status file prefix Specify how to decide whether to run fsck or not. "auto" : decide to run fsck depending on the fstype(default) "force" : always run fsck regardless of the fstype "no" : do not run fsck ever. run_fsck Normally, we expect no users of the filesystem and the stop operation to finish quickly. If you cannot control the filesystem users easily and want to prevent the stop action from failing, then set this parameter to "no" and add an appropriate timeout for the stop operation. fast stop The use of a clone setup for local filesystems is forbidden by default. For special setups like glusterfs, cloning a mount of a local device with a filesystem like ext4 or xfs independently on several nodes is a valid use case. Only set this to "true" if you know what you are doing! allow running as a clone, regardless of filesystem type This option allows specifying how to handle processes that are currently accessing the mount directory. "true" : Default value, kill processes accessing mount point "safe" : Kill processes accessing mount point using methods that avoid functions that could potentially block during process detection "false" : Do not kill any processes. The 'safe' option uses shell logic to walk the /procs/ directory for pids using the mount point while the default option uses the fuser cli tool. 
fuser is known to perform operations that can potentially block if unresponsive nfs mounts are in use on the system. Kill processes before unmount END } # # Make sure the kernel does the right thing with the FS buffers # This function should be called after unmounting and before mounting # It may not be necessary in 2.4 and later kernels, but it shouldn't hurt # anything either... # # It's really a bug that you have to do this at all... # flushbufs() { if have_binary $BLOCKDEV ; then if [ "$blockdevice" = "yes" ] ; then $BLOCKDEV --flushbufs $1 return $? fi fi return 0 } # Take advantage of /etc/mtab if present, use portable mount command # otherwise. Normalize format to "dev mountpoint fstype". is_bind_mount() { echo "$options" | grep -w bind >/dev/null 2>&1 } list_mounts() { local inpf="" if [ -e "/proc/mounts" ] && ! is_bind_mount; then inpf=/proc/mounts elif [ -f "/etc/mtab" -a -r "/etc/mtab" ]; then inpf=/etc/mtab fi if [ "$inpf" ]; then cut -d' ' -f1,2,3 < $inpf else $MOUNT | cut -d' ' -f1,3,5 fi } determine_blockdevice() { if [ $blockdevice = "yes" ]; then return fi # Get the current real device name, if possible. # (specified devname could be -L or -U...) case "$FSTYPE" in nfs4|nfs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|none) : ;; *) DEVICE=`list_mounts | grep " $MOUNTPOINT " | cut -d' ' -f1` if [ -b "$DEVICE" ]; then blockdevice=yes fi ;; esac } # Lists all filesystems potentially mounted under a given path, # excluding the path itself. list_submounts() { list_mounts | grep " $1/" | cut -d' ' -f2 | sort -r } # kernels < 2.6.26 can't handle bind remounts bind_kernel_check() { echo "$options" | grep -w ro >/dev/null 2>&1 || return uname -r | awk -F. ' $1==2 && $2==6 { sub("[^0-9].*","",$3); if ($3<26) exit(1); }' [ $? -ne 0 ] && ocf_log warn "kernel `uname -r` cannot handle read only bind mounts" } bind_mount() { if is_bind_mount && [ "$options" != "-o bind" ] then bind_kernel_check bind_opts=`echo $options | sed 's/bind/remount/'` $MOUNT $bind_opts $MOUNTPOINT else true # make sure to return OK fi } is_option() { echo $OCF_RESKEY_options | grep -w "$1" >/dev/null 2>&1 } is_fsck_needed() { case $OCF_RESKEY_run_fsck in force) true;; no) false;; ""|auto) case $FSTYPE in ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs) false;; *) true;; esac;; *) ocf_log warn "Invalid parameter value for fsck: '$OCF_RESKEY_run_fsck'; setting to 'auto'" OCF_RESKEY_run_fsck="auto" is_fsck_needed;; esac } fstype_supported() { local support="$FSTYPE" local rc if [ "X${HOSTOS}" != "XOpenBSD" ];then # skip checking /proc/filesystems for obsd return $OCF_SUCCESS fi if [ -z "$FSTYPE" -o "$FSTYPE" = none ]; then : No FSTYPE specified, rely on the system has the right file-system support already return $OCF_SUCCESS fi # support fuse-filesystems (e.g. GlusterFS) case $FSTYPE in fuse.*|glusterfs|rozofs) support="fuse";; esac grep -w "$support"'$' /proc/filesystems >/dev/null if [ $? -eq 0 ]; then # found the fs type return $OCF_SUCCESS fi # if here, we should attempt to load the module and then # check the if the filesystem support exists again. $MODPROBE $support >/dev/null if [ $? -ne 0 ]; then ocf_exit_reason "Couldn't find filesystem $FSTYPE in /proc/filesystems and failed to load kernel module" return $OCF_ERR_INSTALLED fi # It is possible for the module to load and not be complete initialized # before we check /proc/filesystems again. Give this a few trys before # giving up entirely. 
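# Before the retry loop below, a compact standalone sketch of what
# fstype_supported() does overall (hypothetical filesystem name; the real
# code checks $FSTYPE, maps fuse-based filesystems such as glusterfs to
# "fuse", and actually runs $MODPROBE rather than just reporting it):
example_fs=ext4
if grep -qw "$example_fs" /proc/filesystems; then
    echo "$example_fs is already supported by the running kernel"
else
    echo "would run: modprobe $example_fs, then re-check /proc/filesystems"
fi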
for try in $(seq 5); do grep -w "$support"'$' /proc/filesystems >/dev/null if [ $? -eq 0 ] ; then # yes. found the filesystem after doing the modprobe return $OCF_SUCCESS fi ocf_log debug "Unable to find support for $FSTYPE in /proc/filesystems after modprobe, trying again" sleep 1 done ocf_exit_reason "Couldn't find filesystem $FSTYPE in /proc/filesystems" return $OCF_ERR_INSTALLED } # # START: Start up the filesystem # Filesystem_start() { # See if the device is already mounted. if Filesystem_status >/dev/null 2>&1 ; then ocf_log info "Filesystem $MOUNTPOINT is already mounted." return $OCF_SUCCESS fi fstype_supported || exit $OCF_ERR_INSTALLED # Check the filesystem & auto repair. # NOTE: Some filesystem types don't need this step... Please modify # accordingly if [ $blockdevice = "yes" ]; then if [ "$DEVICE" != "/dev/null" -a ! -b "$DEVICE" ] ; then ocf_exit_reason "Couldn't find device [$DEVICE]. Expected /dev/??? to exist" exit $OCF_ERR_INSTALLED fi if is_fsck_needed; then ocf_log info "Starting filesystem check on $DEVICE" if [ -z "$FSTYPE" ]; then $FSCK -p $DEVICE else $FSCK -t $FSTYPE -p $DEVICE fi # NOTE: if any errors at all are detected, it returns non-zero # if the error is >= 4 then there is a big problem if [ $? -ge 4 ]; then ocf_exit_reason "Couldn't successfully fsck filesystem for $DEVICE" return $OCF_ERR_GENERIC fi fi fi [ -d "$MOUNTPOINT" ] || ocf_run mkdir -p $MOUNTPOINT if [ ! -d "$MOUNTPOINT" ] ; then ocf_exit_reason "Couldn't find directory [$MOUNTPOINT] to use as a mount point" exit $OCF_ERR_INSTALLED fi flushbufs $DEVICE # Mount the filesystem. case "$FSTYPE" in none) $MOUNT $options $DEVICE $MOUNTPOINT && bind_mount ;; "") $MOUNT $options $DEVICE $MOUNTPOINT ;; *) $MOUNT -t $FSTYPE $options $DEVICE $MOUNTPOINT ;; esac if [ $? -ne 0 ]; then ocf_exit_reason "Couldn't mount device [$DEVICE] as $MOUNTPOINT" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # end of Filesystem_start get_pids() { local dir=$1 local procs local mmap_procs if ocf_is_true "$FORCE_UNMOUNT"; then if [ "X${HOSTOS}" = "XOpenBSD" ];then fstat | grep $dir | awk '{print $3}' else $FUSER -m $dir 2>/dev/null fi elif [ "$FORCE_UNMOUNT" = "safe" ]; then procs=$(find /proc/[0-9]*/ -type l -lname "${dir}/*" -or -lname "${dir}" 2>/dev/null | awk -F/ '{print $3}') mmap_procs=$(grep " ${dir}" /proc/[0-9]*/maps | awk -F/ '{print $3}') printf "${procs}\n${mmap_procs}" | sort | uniq fi } signal_processes() { local dir=$1 local sig=$2 local pids pid # fuser returns a non-zero return code if none of the # specified files is accessed or in case of a fatal # error. pids=$(get_pids "$dir") if [ -z "$pids" ]; then ocf_log info "No processes on $dir were signalled. force_unmount is set to '$FORCE_UNMOUNT'" return fi for pid in $pids; do ocf_log info "sending signal $sig to: `ps -f $pid | tail -1`" kill -s $sig $pid done } try_umount() { local SUB=$1 $UMOUNT $umount_force $SUB list_mounts | grep -q " $SUB " >/dev/null 2>&1 || { ocf_log info "unmounted $SUB successfully" return $OCF_SUCCESS } return $OCF_ERR_GENERIC } fs_stop() { local SUB=$1 timeout=$2 sig cnt for sig in TERM KILL; do cnt=$((timeout/2)) # try half time with TERM while [ $cnt -gt 0 ]; do try_umount $SUB && return $OCF_SUCCESS ocf_exit_reason "Couldn't unmount $SUB; trying cleanup with $sig" signal_processes $SUB $sig cnt=$((cnt-1)) sleep 1 done done return $OCF_ERR_GENERIC } # # STOP: Unmount the filesystem # Filesystem_stop() { # See if the device is currently mounted Filesystem_status >/dev/null 2>&1 if [ $? 
-eq $OCF_NOT_RUNNING ]; then # Already unmounted, wonderful. rc=$OCF_SUCCESS else # Wipe the status file, but continue with a warning if # removal fails -- the file system might be read only if [ $OCF_CHECK_LEVEL -eq 20 ]; then rm -f ${STATUSFILE} if [ $? -ne 0 ]; then ocf_log warn "Failed to remove status file ${STATUSFILE}." fi fi # Determine the real blockdevice this is mounted on (if # possible) prior to unmounting. determine_blockdevice # For networked filesystems, there's merit in trying -f: case "$FSTYPE" in nfs4|nfs|cifs|smbfs) umount_force="-f" ;; esac # Umount all sub-filesystems mounted under $MOUNTPOINT/ too. local timeout for SUB in `list_submounts $MOUNTPOINT` $MOUNTPOINT; do ocf_log info "Trying to unmount $SUB" if ocf_is_true "$FAST_STOP"; then timeout=6 else timeout=${OCF_RESKEY_CRM_meta_timeout:="20000"} timeout=$((timeout/1000)) fi fs_stop $SUB $timeout rc=$? if [ $rc -ne $OCF_SUCCESS ]; then ocf_exit_reason "Couldn't unmount $SUB, giving up!" fi done fi flushbufs $DEVICE return $rc } # end of Filesystem_stop # # STATUS: is the filesystem mounted or not? # Filesystem_status() { if list_mounts | grep -q " $MOUNTPOINT " >/dev/null 2>&1; then rc=$OCF_SUCCESS msg="$MOUNTPOINT is mounted (running)" else rc=$OCF_NOT_RUNNING msg="$MOUNTPOINT is unmounted (stopped)" fi # Special case "monitor" to check whether the UUID cached and # on-disk still match? case "$OP" in status) ocf_log info "$msg";; esac return $rc } # end of Filesystem_status # Note: the read/write tests below will stall in case the # underlying block device (or in the case of a NAS mount, the # NAS server) has gone away. In that case, if I/O does not # return to normal in time, the operation hits its timeout # and it is up to the CRM to initiate appropriate recovery # actions (such as fencing the node). # # MONITOR 10: read the device # Filesystem_monitor_10() { if [ "$blockdevice" = "no" ] ; then ocf_log warn "$DEVICE is not a block device, monitor 10 is noop" return $OCF_SUCCESS fi dd_opts="iflag=direct bs=4k count=1" err_output=`dd if=$DEVICE $dd_opts 2>&1 >/dev/null` if [ $? -ne 0 ]; then ocf_exit_reason "Failed to read device $DEVICE" ocf_log err "dd said: $err_output" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # # MONITOR 20: write and read a status file # Filesystem_monitor_20() { if [ "$blockdevice" = "no" ] ; then # O_DIRECT not supported on cifs/smbfs dd_opts="oflag=sync bs=4k conv=fsync,sync" else # Writing to the device in O_DIRECT mode is imperative # to bypass caches. dd_opts="oflag=direct,sync bs=4k conv=fsync,sync" fi status_dir=`dirname $STATUSFILE` [ -d "$status_dir" ] || mkdir -p "$status_dir" err_output=`echo "${OCF_RESOURCE_INSTANCE}" | dd of=${STATUSFILE} $dd_opts 2>&1` if [ $? -ne 0 ]; then ocf_exit_reason "Failed to write status file ${STATUSFILE}" ocf_log err "dd said: $err_output" return $OCF_ERR_GENERIC fi test -f ${STATUSFILE} if [ $? -ne 0 ]; then ocf_exit_reason "Cannot stat the status file ${STATUSFILE}" return $OCF_ERR_GENERIC fi cat ${STATUSFILE} > /dev/null if [ $? -ne 0 ]; then ocf_exit_reason "Cannot read the status file ${STATUSFILE}" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } Filesystem_monitor() { Filesystem_status rc=$? 
if [ $rc -ne $OCF_SUCCESS ]; then return $rc fi if [ $rc -eq $OCF_SUCCESS -a $OCF_CHECK_LEVEL -gt 0 ]; then case "$OCF_CHECK_LEVEL" in 10) Filesystem_monitor_10; rc=$?;; 20) Filesystem_monitor_20; rc=$?;; *) ocf_exit_reason "unsupported monitor level $OCF_CHECK_LEVEL" rc=$OCF_ERR_CONFIGURED ;; esac fi return $rc } # end of Filesystem_monitor # # VALIDATE_ALL: Are the instance parameters valid? # FIXME!! The only part that's useful is the return code. # This code always returns $OCF_SUCCESS (!) # Filesystem_validate_all() { - if [ -n $MOUNTPOINT -a ! -d $MOUNTPOINT ]; then + if [ -n "$MOUNTPOINT" ] && [ ! -d "$MOUNTPOINT" ]; then ocf_log warn "Mountpoint $MOUNTPOINT does not exist" fi # Check if the $FSTYPE is workable # NOTE: Without inserting the $FSTYPE module, this step may be imprecise # TODO: This is Linux specific crap. if [ ! -z "$FSTYPE" -a "$FSTYPE" != none ]; then cut -f2 /proc/filesystems |grep -q ^$FSTYPE$ if [ $? -ne 0 ]; then modpath=/lib/modules/`uname -r` moddep=$modpath/modules.dep # Do we have $FSTYPE in modules.dep? cut -d' ' -f1 $moddep |grep -q "^$modpath.*$FSTYPE\.k\?o:$" if [ $? -ne 0 ]; then ocf_log info "It seems we do not have $FSTYPE support" fi fi fi # If we are supposed to do monitoring with status files, then # we need a utility to write in O_DIRECT mode. if [ $OCF_CHECK_LEVEL -gt 0 ]; then check_binary dd # Note: really old coreutils version do not support # the "oflag" option for dd. We don't check for that # here. In case dd does not support oflag, monitor is # bound to fail, with dd spewing an error message to # the logs. On such systems, we must do without status # file monitoring. fi #TODO: How to check the $options ? return $OCF_SUCCESS } # # set the blockdevice variable to "no" or "yes" # set_blockdevice_var() { blockdevice=no # these are definitely not block devices case $FSTYPE in nfs4|nfs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs) return;; esac if `is_option "loop"`; then return fi case $DEVICE in -*) # Oh... An option to mount instead... Typically -U or -L ;; /dev/null) # Special case for BSC blockdevice=yes ;; *) if [ ! -b "$DEVICE" -a ! -d "$DEVICE" -a "X$OP" != Xstart ] ; then ocf_log warn "Couldn't find device [$DEVICE]. Expected /dev/??? to exist" fi if [ ! -d "$DEVICE" ]; then blockdevice=yes fi ;; esac } # Check the arguments passed to this script if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi # Check the OCF_RESKEY_ environment variables... FORCE_UNMOUNT="yes" if [ -n "${OCF_RESKEY_force_unmount}" ]; then FORCE_UNMOUNT=$OCF_RESKEY_force_unmount fi DEVICE=$OCF_RESKEY_device FSTYPE=$OCF_RESKEY_fstype if [ ! -z "$OCF_RESKEY_options" ]; then options="-o $OCF_RESKEY_options" fi FAST_STOP=${OCF_RESKEY_fast_stop:="yes"} OP=$1 # These operations do not require instance parameters case $OP in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; esac if [ x = x"$DEVICE" ]; then ocf_exit_reason "Please set OCF_RESKEY_device to the device to be managed" exit $OCF_ERR_CONFIGURED fi set_blockdevice_var # Normalize instance parameters: # It is possible that OCF_RESKEY_directory has one or even multiple trailing "/". # But the output of `mount` and /proc/mounts do not. 
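# For example, the normalization applied just below behaves like this
# (hypothetical paths, shown as a standalone sketch):
example_dir="/srv/export///"
echo "$example_dir" | sed 's/\/*$//'    # prints "/srv/export"
echo "/" | sed 's/\/*$//'               # prints an empty string, which is why
                                        # the code falls back to "/" afterwards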
if [ -z "$OCF_RESKEY_directory" ]; then if [ X$OP = "Xstart" -o $blockdevice = "no" ]; then ocf_exit_reason "Please specify the directory" exit $OCF_ERR_CONFIGURED fi else MOUNTPOINT=$(echo $OCF_RESKEY_directory | sed 's/\/*$//') : ${MOUNTPOINT:=/} # At this stage, $MOUNTPOINT does not contain trailing "/" unless it is "/" # TODO: / mounted via Filesystem sounds dangerous. On stop, we'll # kill the whole system. Is that a good idea? fi # Check to make sure the utilites are found if [ "X${HOSTOS}" != "XOpenBSD" ];then check_binary $MODPROBE check_binary $FUSER fi check_binary $FSCK check_binary $MOUNT check_binary $UMOUNT if [ "$OP" != "monitor" ]; then ocf_log info "Running $OP for $DEVICE on $MOUNTPOINT" fi case $OP in status) Filesystem_status exit $? ;; monitor) Filesystem_monitor exit $? ;; validate-all) Filesystem_validate_all exit $? ;; stop) Filesystem_stop exit $? ;; esac CLUSTERSAFE=0 is_option "ro" && CLUSTERSAFE=2 case $FSTYPE in nfs4|nfs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs) CLUSTERSAFE=1 # this is kind of safe too ;; # add here CLUSTERSAFE=0 for all filesystems which are not # cluster aware and which, even if when mounted read-only, # could still modify parts of it such as journal/metadata ext4|ext4dev|ext3|reiserfs|reiser4|xfs|jfs) if ocf_is_true "$OCF_RESKEY_force_clones"; then CLUSTERSAFE=2 else CLUSTERSAFE=0 # these are not allowed fi ;; esac if ocf_is_clone; then case $CLUSTERSAFE in 0) ocf_exit_reason "DANGER! $FSTYPE on $DEVICE is NOT cluster-aware!" ocf_log err "DO NOT RUN IT AS A CLONE!" ocf_log err "Politely refusing to proceed to avoid data corruption." exit $OCF_ERR_CONFIGURED ;; 2) ocf_log warn "$FSTYPE on $DEVICE is NOT cluster-aware!" if ocf_is_true "$OCF_RESKEY_force_clones"; then ocf_log warn "But we'll let it run because we trust _YOU_ verified it's safe to do so." else ocf_log warn "But we'll let it run because it is mounted read-only." ocf_log warn "Please make sure that it's meta data is read-only too!" fi ;; esac fi case $OP in start) Filesystem_start ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/MailTo b/heartbeat/MailTo index 3936c39de..b2aa89b0f 100755 --- a/heartbeat/MailTo +++ b/heartbeat/MailTo @@ -1,191 +1,191 @@ #!/bin/sh # # Resource script for MailTo # # Author: Alan Robertson # # Description: sends email to a sysadmin whenever a takeover occurs. # # Note: This command requires an argument, unlike normal init scripts. # # This can be given in the haresources file as: # # You can also give a mail subject line or even multiple addresses # MailTo::alanr@unix.sh::BigImportantWebServer # MailTo::alanr@unix.sh,spoppi@gmx.de::BigImportantWebServer # # This will then be put into the message subject and body. # # OCF parameters are as below: # OCF_RESKEY_email # OCF_RESKEY_subject # # License: GNU General Public License (GPL) # # Copyright: (C) 2005 International Business Machines ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### ARGS="$0 $*" us=`uname -n` usage() { echo "Usage: $0 {start|stop|status|monitor|meta-data|validate-all}" } meta_data() { cat < 1.0 This is a resource agent for MailTo. It sends email to a sysadmin whenever a takeover occurs. Notifies recipients by email in the event of resource takeover The email address of sysadmin. Email address The subject of the email. 
Subject END } MailProgram() { $MAILCMD -s "$1" "$email" < 1.00.2 Manages starting, stopping and monitoring of RAID devices which are preconfigured in /etc/conf.d/HB-ManageRAID. Manages RAID devices Name (case sensitive) of RAID to manage. (preconfigured in /etc/conf.d/HB-ManageRAID) RAID name END } # # start_raid() # start_raid() { declare -i retcode status_raid retcode=$? if [[ $retcode == $OCF_SUCCESS ]]; then return $OCF_SUCCESS elif [[ $retcode != $OCF_NOT_RUNNING ]]; then return $retcode fi - for ldev in ${RAID_LOCALDISKS[@]}; do + for ldev in "${RAID_LOCALDISKS[@]}"; do if [[ ! -b $ldev ]]; then ocf_log err "$ldev is not a (local) block device." return $OCF_ERR_ARGS fi done - $MDADM -A $RAID_DEVPATH -a yes -u ${!RAID_UUID} ${RAID_LOCALDISKS[@]} &> /dev/null + $MDADM -A $RAID_DEVPATH -a yes -u ${!RAID_UUID} "${RAID_LOCALDISKS[@]}" &> /dev/null if [[ $? != 0 ]]; then - ocf_log err "starting ${!RAID_DEV} with ${RAID_LOCALDISKS[@]} failed." + ocf_log err "starting ${!RAID_DEV} with ${RAID_LOCALDISKS[*]} failed." return $OCF_ERR_GENERIC fi $MOUNT -o ${!RAID_MOUNTOPTIONS} $RAID_DEVPATH ${!RAID_MOUNTPOINT} &> /dev/null if [[ $? != 0 ]]; then $MDADM -S $RAID_DEVPATH &> /dev/null if [[ $? != 0 ]]; then ocf_log err "mounting ${!RAID_DEV} to ${!RAID_MOUNTPOINT} failed as well as stopping the RAID itself." else ocf_log err "mounting ${!RAID_DEV} to ${!RAID_MOUNTPOINT} failed. RAID stopped again." fi return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # # stop_raid() # stop_raid() { status_raid if [[ $? == $OCF_NOT_RUNNING ]]; then return $OCF_SUCCESS fi $UMOUNT ${!RAID_MOUNTPOINT} &> /dev/null if [[ $? != 0 ]]; then ocf_log err "unmounting ${!RAID_MOUNTPOINT} failed. not stopping ${!RAID_DEV}!" return $OCF_ERR_GENERIC fi $MDADM -S $RAID_DEVPATH &> /dev/null if [[ $? != 0 ]]; then ocf_log err "stopping RAID ${!RAID_DEV} failed." return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # # status_raid() # status_raid() { declare -i retcode_raidcheck declare -i retcode_uuidcheck $CAT $RAID_MDSTAT | $GREP -e "${!RAID_DEV}[\ ]*:[\ ]*active" &> /dev/null - if [[ $? != 0 ]]; then + if [ $? -ne 0 ]; then return $OCF_NOT_RUNNING fi - if [[ ! -e $RAID_DEVPATH ]]; then + if [ ! -e $RAID_DEVPATH ]; then return $OCF_ERR_GENERIC fi $MDADM --detail -t $RAID_DEVPATH &> /dev/null retcode_raidcheck=$? $MDADM --detail -t $RAID_DEVPATH | $GREP -qEe "^[\ ]*UUID[\ ]*:[\ ]*${!RAID_UUID}" &> /dev/null retcode_uuidcheck=$? - if [[ $retcode_raidcheck > 3 ]]; then + if [ $retcode_raidcheck -gt 3 ]; then ocf_log err "mdadm returned error code $retcode_raidcheck while checking ${!RAID_DEV}." return $OCF_ERR_GENERIC - elif [[ $retcode_raidcheck == 3 ]]; then + elif [ $retcode_raidcheck -eq 3 ]; then ocf_log err "${!RAID_DEV} has failed." return $OCF_ERR_GENERIC - elif [[ $retcode_raidcheck < 3 && $retcode_uuidcheck != 0 ]]; then + elif [ $retcode_raidcheck -lt 3 ] && [ $retcode_uuidcheck != 0 ]; then ocf_log err "active RAID ${!RAID_DEV} and configured UUID (!$RAID_UUID) do not match." return $OCF_ERR_GENERIC fi $MOUNT | $GREP -e "$RAID_DEVPATH on ${!RAID_MOUNTPOINT}" &> /dev/null if [[ $? != 0 ]]; then ocf_log err "${!RAID_DEV} seems to be no longer mounted at ${!RAID_MOUNTPOINT}" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # # validate_all_raid() # validate_all_raid() { # # since all parameters are checked every time ManageRAID is # invoked, there not much more to check... # # status_raid should cover the rest. # declare -i retcode status_ve retcode=$? 
if [[ $retcode != $OCF_SUCCESS && $retcode != $OCF_NOT_RUNNING ]]; then return $retcode fi return $OCF_SUCCESS } if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi case "$1" in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; *) ;; esac ## required configuration # [ -f /etc/conf.d/HB-ManageRAID ] || { ocf_log err "/etc/conf.d/HB-ManageRAID missing" exit $OCF_ERR_INSTALLED } . /etc/conf.d/HB-ManageRAID # ## # # check relevant environment variables for sanity and security # declare -i retcode_test declare -i retcode_grep $TEST -z "$OCF_RESKEY_raidname" retcode_test=$? echo "$OCF_RESKEY_raidname" | $GREP -qEe "^[[:alnum:]\_]+$" retcode_grep=$? if [[ $retcode_test != 1 || $retcode_grep != 0 ]]; then ocf_log err "OCF_RESKEY_raidname not set or invalid." exit $OCF_ERR_ARGS fi RAID_UUID=${OCF_RESKEY_raidname}_UUID echo ${!RAID_UUID} | $GREP -qEe "^[[:alnum:]]{8}:[[:alnum:]]{8}:[[:alnum:]]{8}:[[:alnum:]]{8}$" if [[ $? != 0 ]]; then ocf_log err "${OCF_RESKEY_raidname}_UUID is invalid." exit $OCF_ERR_ARGS fi RAID_DEV=${OCF_RESKEY_raidname}_DEV echo ${!RAID_DEV} | $GREP -qEe "^md[0-9]+$" if [[ $? != 0 ]]; then ocf_log err "${OCF_RESKEY_raidname}_DEV is invalid." exit $OCF_ERR_ARGS fi RAID_DEVPATH=/dev/${!RAID_DEV/md/md\/} RAID_MOUNTPOINT=${OCF_RESKEY_raidname}_MOUNTPOINT echo ${!RAID_MOUNTPOINT} | $GREP -qEe "^[[:alnum:]\/\_\"\ ]+$" if [[ $? != 0 ]]; then ocf_log err "${OCF_RESKEY_raidname}_MOUNTPOINT is invalid." exit $OCF_ERR_ARGS fi RAID_MOUNTOPTIONS=${OCF_RESKEY_raidname}_MOUNTOPTIONS echo ${!RAID_MOUNTOPTIONS} | $GREP -qEe "^[[:alpha:]\,]+$" if [[ $? != 0 ]]; then ocf_log err "${OCF_RESKEY_raidname}_MOUNTOPTIONS is invalid." exit $OCF_ERR_ARGS fi RAID_LOCALDISKS=${OCF_RESKEY_raidname}_LOCALDISKS[@] RAID_LOCALDISKS=( "${!RAID_LOCALDISKS}" ) -if [[ ${#RAID_LOCALDISKS[@]} < 1 ]]; then +if [ ${#RAID_LOCALDISKS[@]} -lt 1 ]; then ocf_log err "you have to specify at least one local disk." exit $OCF_ERR_ARGS fi # # check that all relevant utilities are available # check_binary $MDADM check_binary $MOUNT check_binary $UMOUNT check_binary $GREP check_binary $CAT check_binary $TEST check_binary echo # # check that all relevant devices are available # check_file $RAID_MDSTAT # # finally... let's see what we are ordered to do :-) # case "$1" in start) start_raid ;; stop) stop_raid ;; status|monitor) status_raid ;; validate-all) validate_all_raid ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/ServeRAID b/heartbeat/ServeRAID index 4ab70fb94..a66084da0 100755 --- a/heartbeat/ServeRAID +++ b/heartbeat/ServeRAID @@ -1,419 +1,419 @@ #!/bin/sh # # # ServeRAID # # Description: Enables/Disables shared ServeRAID merge groups # # Author: Alan Robertson, Renzo Alejandro Granados # # Support: linux-ha@lists.linux-ha.org # # License: GNU General Public License (GPL) # # Copyright: (C) 2002-2005 International Business Machines # (C) 2002 Renzo Alejandro Granados # # usage: ./ServeRAID (start|stop|status|monitor|validate-all|meta-data) # # OCF parameters are as below: # OCF_RESKEY_serveraid # (Adapter number of the ServeRAID adapter) # OCF_RESKEY_mergegroup # (MergeGroup # of the logical drive under consideration) # # The ServeRAID clustering model is a bit odd, and its terminology needs # a little explanation # # Logical Volume - a particular SCSI id {target id and LUN} on # a particular controller. # # Merge Group - when active on one side or the other of the ServeRAID # configuration it corresponds with a logical drive. 
# Merge group numbers are permanently assigned to a particular # chunk of storage. Shared merge groups are in the # range of 1 to 8, and are largely arbitrary. # Unshared merge groups start at 200. # We can only deal with shared merge groups. When a merge # group is activated on one of the controllers, it becomes # a logical volume on that system. NOTE: The order in # which the Merge Groups are activated determines which # SCSI Ids they become. This makes for extra headaches # for this script to deal with. It also means that if # you have more than one shared ServeRAID merge group on # a particular controller, that the SCSI IDs will not # be constant. This requires mounting by uuid or label. # # One of the ServerRAID controllers has to be configured with # SCSI initiator ID 6, and the other with SCSI id 7. # # At this time, the ServeRAID clustering solution only works with # RAID 1 setups. It does NOT support RAID 5. This is a firmware # bug in the ServeRAID where it doesn't fail over correctly # if the RAID5 array is in a critical state... # # Note that this script requires ServeRAID software version 6.10 or # later. This software is now available from IBM. # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 ServeRAID::1::1 # # Older ServeRAID utility returns 1 when it succeeds (weird) # BUT - the newly released version is more normal... ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### srsuccess=0 SCSI="scsi " usage() { - cat <<-! + cat <<-EOF usage: $0 (start|stop|status|monitor|validate-all|meta-data) You have to set the following environment virables before running $0 : OCF_RESKEY_serveraid (Adapter number of the ServeRAID adapter) OCF_RESKEY_mergegroup (MergeGroup # of the logical drive under consideration) ServeRAID adapters are numbered starting from 1. The shared merge group number is a number between 1 and 8 inclusive. It indicates to the controller which logical disk to fail over. node1 10.0.0.170 ServeRAID::1::1 PREREQUISITES: You must configure your ServeRAID adapters for clustering for this to work. To do this, you must use the bootable "ServeRAID Support CD" and right click your controller and pick "configure for clustering". The Linux version of the ServeRAID manager does not have the "configure for clustering" option. You will need at least version 6.10 (~July 2003 release) of the ipssend command for this script to work. - ! + EOF } meta_data() { cat < 1.0 Resource script for ServeRAID. It enables/disables shared ServeRAID merge groups. Enables and disables shared ServeRAID merge groups The adapter number of the ServeRAID adapter. serveraid The logical drive under consideration. mergegroup END } ServeRAID_methods() { cat <<-! start stop status validate-all methods usage meta-data ! } ServeRAIDSCSI="/proc/scsi/ips" IPS=ipssend proc_scsi=/proc/scsi/scsi parseinst() { sr_adapter=error sr_mergegroup=error hostid=error sr_logicaldrivenumber=error if [ $# -ne 2 ] then ocf_log err "Invalid ServeRAID instance: $*" exit $OCF_ERR_ARGS fi PerlScript='next unless /^Host/; $_ .= <>.<>; print "$1 " if /SERVERAID/ and /Proces/ and /scsi(\d+)/' # Get the list of host ids of the ServeRAID host adapters hostlist=`$PERL -ne "${PerlScript}" <$proc_scsi` # Figure the host id of the desired ServeRAID adapter hostid=`echo $hostlist | cut -d' ' -f$1` if [ ! 
-f "$ServeRAIDSCSI/$hostid" ] then ocf_log err "No such ServeRAID adapter: $1" exit $OCF_ERR_ARGS fi case $2 in [1-8]);; *) ocf_log err "Invalid Shared Merge Group Number: $2" exit $OCF_ERR_ARGS;; esac sr_adapter=$1 sr_mergegroup=$2 CheckRaidLevel return $? } SRLogicalDriveConfig() { $IPS getconfig $sr_adapter ld } MergeGroupToSCSI_ID() { PerlScript="while (<>) { /logical drive number *([0-9]+)/i && (\$ld=\$1); /part of merge group *: *$sr_mergegroup *\$/i && print \$ld - 1, \"\n\"; }" ID=`SRLogicalDriveConfig | $PERL -e "$PerlScript"` case $ID in [0-9]*) echo "$ID"; return 0;; *) return 1;; esac } MergeGroupRaidLevel() { PerlScript="while (<>) { /RAID level *: *([0-9]+[A-Za-z]*)/i && (\$ld=\$1); /part of merge group *: *$sr_mergegroup *\$/i && print \$ld, \"\n\"; }" Level=`SRLogicalDriveConfig | $PERL -e "$PerlScript"` case $Level in ?*) echo "$Level"; return 0;; *) return 1;; esac } CheckRaidLevel() { RAIDlevel=`MergeGroupRaidLevel` case $RAIDlevel in *5*) ocf_log err "ServeRAID device $sr_adapter $sr_mergegroup is RAID level $RAIDlevel" ocf_log err "This level of ServeRAID RAID is not supported for failover by the firmware." exit $OCF_ERR_GENERIC;; esac return $OCF_SUCCESS } ReleaseSCSI() { targetid=`MergeGroupToSCSI_ID` echo "${SCSI}remove-single-device $hostid 0 $targetid 0" > $proc_scsi } AddSCSI() { targetid=`MergeGroupToSCSI_ID` echo "${SCSI}add-single-device $hostid 0 $targetid 0" > $proc_scsi } # # start: Enable the given ServeRAID device # ServeRAID_start() { if ServeRAID_status $serveraid $mergegroup then ocf_log debug "ServeRAID merge group $serveraid $mergegroup is running." return $OCF_SUCCESS else if # # Normally we do a MERGE PARTNER, but if we still own the drive for # some reason, then we'll need to do a MERGE OWN instead... # out=`$IPS MERGE $sr_adapter $sr_mergegroup PARTNER 2>&1` if [ $? -eq $srsuccess ] then ocf_log info "$out" else ocf_run $IPS MERGE $sr_adapter $sr_mergegroup OWN fi then : OK All is well! targetid=`MergeGroupToSCSI_ID` sr_logicaldrivenumber=`expr $targetid + 1` #run $IPS SYNCH $sr_adapter $sr_logicaldrivenumber & # This version of the SYNCH command requires the 6.10 or later # ServeRAID support CD. # To avoid issues when called by lrmd, redirect stdout->stderr. # Use () to create a subshell to make the redirection be synchronized. ( ocf_run $IPS SYNCH $sr_adapter $sr_mergegroup & ) >&2 AddSCSI else return $OCF_ERR_GENERIC fi fi if ServeRAID_status "$@" then return $OCF_SUCCESS else ocf_log err "ServeRAID device $1 not active!" exit $OCF_ERR_GENERIC fi } # # stop: Disable the given ServeRAID device # ServeRAID_stop() { parseinst "$@" ReleaseSCSI if ocf_run $IPS UNMERGE $sr_adapter $sr_mergegroup then : UNMERGE $sr_adapter $sr_mergegroup worked fi if ServeRAID_status "$@" then - ocf_log err "ServeRAID device $@ is still active!" + ocf_log err "ServeRAID device $* is still active!" return $OCF_ERR_GENERIC else return $OCF_SUCCESS fi } # # status: is the given device now available? # ServeRAID_status() { parseinst "$@" # # The output we're looking for # Part of merge group : 2 # SRLogicalDriveConfig \ | grep -i "part of merge group[ ]*: *$sr_mergegroup *\$" >/dev/null } # # validate_all: are the OCF instance parameters valid? # ServeRAID_validate_all() { check_binary $PERL # parseinst() will do all the work... parseinst "$@" return $? } if ( [ $# -ne 1 ] ) then usage exit $OCF_ERR_ARGS fi # These operations don't require OCF instance parameters to be set case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; # # methods: What methods do we support? 
# methods) ServeRAID_methods exit $?;; usage) usage exit $OCF_SUCCESS;; *) ;; esac if ( [ -z "$OCF_RESKEY_serveraid" ] || [ -z "$OCF_RESKEY_mergegroup" ] ) then ocf_log err "You have to set the OCF_RESKEY_serveraid and OCF_RESKEY_mergegroup\n enviroment virables before running $0 !" # usage exit $OCF_ERR_GENERIC fi : Right Number of arguments.. serveraid=$OCF_RESKEY_serveraid mergegroup=$OCF_RESKEY_mergegroup # Look for the start, stop, status, or methods calls... case "$1" in stop) ServeRAID_stop $serveraid $mergegroup exit $?;; start) ServeRAID_start $serveraid $mergegroup exit $?;; status|monitor) if ServeRAID_status $serveraid $mergegroup then ocf_log debug "ServeRAID merge group $serveraid $mergegroup is running." exit $OCF_SUCCESS else ocf_log debug "ServeRAID merge group $serveraid $mergegroup is stopped." exit $OCF_NOT_RUNNING fi exit $?;; validate-all) ServeRAID_validate_all $serveraid $mergegroup exit $?;; *) usage exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/SysInfo b/heartbeat/SysInfo index a7f0d6f73..d33c868aa 100755 --- a/heartbeat/SysInfo +++ b/heartbeat/SysInfo @@ -1,365 +1,364 @@ #!/bin/bash # # # SysInfo OCF Resource Agent # It records (in the CIB) various attributes of a node # # Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Bree # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### meta_data() { cat < 1.0 This is a SysInfo Resource Agent. 
It records (in the CIB) various attributes of a node Sample Linux output: arch: i686 os: Linux-2.4.26-gentoo-r14 free_swap: 1999 cpu_info: Intel(R) Celeron(R) CPU 2.40GHz cpu_speed: 4771.02 cpu_cores: 1 cpu_load: 0.00 ram_total: 513 ram_free: 117 root_free: 2.4 Sample Darwin output: arch: i386 os: Darwin-8.6.2 cpu_info: Intel Core Duo cpu_speed: 2.16 cpu_cores: 2 cpu_load: 0.18 ram_total: 2016 ram_free: 787 root_free: 13 Units: free_swap: Mb ram_*: Mb root_free: Gb cpu_speed (Linux): bogomips cpu_speed (Darwin): Ghz Records various node attributes in the CIB PID file PID file Interval to allow values to stabilize Dampening Delay END } ####################################################################### UpdateStat() { name=$1; shift value="$*" echo -e "$name:\t$value" ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -S status -n $name -v "$value" } SysInfoStats() { UpdateStat arch "`uname -m`" UpdateStat os "`uname -s`-`uname -r`" case `uname -s` in "Darwin") mem=`top -l 1 | grep Mem: | awk '{print $10}'` mem_used=`top -l 1 | grep Mem: | awk '{print $8}'` mem=`SysInfo_mem_units $mem` mem_used=`SysInfo_mem_units $mem_used` mem_total=`expr $mem_used + $mem` cpu_type=`system_profiler SPHardwareDataType | grep "CPU Type:"` cpu_type=${cpu_type/*: /} cpu_speed=`system_profiler SPHardwareDataType | grep "CPU Speed:" | awk '{print $3}'` cpu_cores=`system_profiler SPHardwareDataType | grep "Number Of"` cpu_cores=${cpu_cores/*: /} ;; "Linux") if [ -f /proc/cpuinfo ]; then cpu_type=`grep "model name" /proc/cpuinfo | head -n 1` cpu_type=${cpu_type/*: /} cpu_speed=`grep "bogomips" /proc/cpuinfo | head -n 1` cpu_speed=${cpu_speed/*: /} cpu_cores=`grep "^processor" /proc/cpuinfo | wc -l` fi if [ -f /proc/meminfo ]; then # meminfo results are in kB mem=`grep "SwapFree" /proc/meminfo | awk '{print $2"k"}'` if [ ! -z $mem ]; then UpdateStat free_swap `SysInfo_mem_units $mem` fi mem=`grep "Inactive" /proc/meminfo | awk '{print $2"k"}'` mem_total=`grep "MemTotal" /proc/meminfo | awk '{print $2"k"}'` else mem=`top -n 1 | grep Mem: | awk '{print $7}'` fi ;; *) esac if [ x != x"$cpu_type" ]; then UpdateStat cpu_info "$cpu_type" fi if [ x != x"$cpu_speed" ]; then UpdateStat cpu_speed "$cpu_speed" fi if [ x != x"$cpu_cores" ]; then UpdateStat cpu_cores "$cpu_cores" fi loads=`uptime` load15=`echo ${loads} | awk '{print $10}'` UpdateStat cpu_load $load15 if [ ! -z "$mem" ]; then # Massage the memory values UpdateStat ram_total `SysInfo_mem_units $mem_total` UpdateStat ram_free `SysInfo_mem_units $mem` fi # Portability notes: # o df: -h flag not available on Solaris 8. (OK on 9, 10, ...) #FIXME# # o tail: explicit "-n" not available in Solaris; instead simplify # 'tail -n ' to the equivalent 'tail -'. 
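# The next pipeline takes the last line of `df -h /` and picks field 4,
# which on a typical Linux df is the "Avail" column (a value such as 2.4G
# or 117M; the exact output format here is illustrative, not guaranteed).
# SysInfo_hdd_units below strips the unit suffix and normalizes the value
# to whole gigabytes before it is recorded as the root_free attribute.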
disk=`df -h / | tail -1 | awk '{print $4}'` if [ x != x"$disk" ]; then UpdateStat root_free `SysInfo_hdd_units $disk` fi } SysInfo_mem_units() { mem=$1 if [ -z $1 ]; then return fi memlen=`expr ${#mem} - 1` memlen_alt=`expr ${#mem} - 2` if [ ${mem:$memlen:1} = "G" ]; then mem="${mem:0:$memlen}" if [ $mem != ${mem/./} ]; then mem_before=${mem/.*/} mem_after=${mem/*./} mem=$[mem_before*1024] if [ ${#mem_after} = 0 ]; then : elif [ ${#mem_after} = 1 ]; then mem=$[mem+100*$mem_after] elif [ ${#mem_after} = 2 ]; then mem=$[mem+10*$mem_after] elif [ ${#mem_after} = 3 ]; then mem=$[mem+$mem_after] else mem_after=${mem_after:0:3} mem=$[mem+$mem_after] fi fi elif [ ${mem:$memlen:1} = "M" ]; then mem=${mem/.*/} mem="${mem:0:$memlen}" elif [ ${mem:$memlen:1} = "k" ]; then mem="${mem:0:$memlen}" mem=${mem/.*/} mem=`expr $mem / 1024` elif [ ${mem:$memlen_alt:2} = "kB" ]; then mem="${mem:0:$memlen_alt}" mem=${mem/.*/} mem=`expr $mem / 1024` elif [ ${mem:$memlen_alt:2} = "Mb" ]; then mem="${mem:0:$memlen_alt}" mem=${mem/.*/} elif [ ${mem:$memlen_alt:2} = "MB" ]; then mem="${mem:0:$memlen_alt}" mem=${mem/.*/} fi # Round to the next multiple of 50 memlen=`expr ${#mem} - 2` mem_round="${mem:$memlen:2}" if [ x$mem_round = x ]; then : elif [ $mem_round = "00" ]; then : else mem_round=`echo $mem_round | sed 's/^0//'` if [ $mem_round -lt "50" ]; then mem=$[mem+50] mem=$[mem-$mem_round] else mem=$[mem+100] mem=$[mem-$mem_round] fi fi echo $mem } SysInfo_hdd_units() { disk=$1 disklen=`expr ${#disk} - 1` disklen_alt=`expr ${#disk} - 2` if [ ${disk:$disklen:1} = "G" ]; then disk="${disk:0:$disklen}" elif [ ${disk:$disklen:1} = "M" ]; then disk="${disk:0:$disklen}" disk=${disk/.*/} disk=`expr $disk / 1024` elif [ ${disk:$disklen:1} = "k" ]; then disk="${disk:0:$disklen}" disk=${disk/.*/} disk=`expr $disk / 1048576` elif [ ${disk:$disklen_alt:2} = "kB" ]; then disk="${disk:0:$disklen_alt}" disk=${disk/.*/} disk=`expr $disk / 1048576` elif [ ${disk:$disklen_alt:2} = "Mb" ]; then disk="${disk:0:$disklen_alt}" disk=${disk/.*/} disk=`expr $disk / 1024` elif [ ${disk:$disklen_alt:2} = "MB" ]; then disk="${disk:0:$disklen_alt}" disk=${disk/.*/} disk=`expr $disk / 1024` fi echo $disk } SysInfo_usage() { cat < $OCF_RESKEY_pidfile SysInfoStats exit $OCF_SUCCESS } SysInfo_stop() { rm $OCF_RESKEY_pidfile exit $OCF_SUCCESS } SysInfo_monitor() { if [ -f $OCF_RESKEY_pidfile ]; then clone=`cat $OCF_RESKEY_pidfile` fi if [ x$clone = x ]; then rm $OCF_RESKEY_pidfile exit $OCF_NOT_RUNNING elif [ $clone = $OCF_RESKEY_clone ]; then SysInfoStats exit $OCF_SUCCESS - elif [ x$OCF_RESKEY_CRM_meta_globally_unique = xtrue - -o x$OCF_RESKEY_CRM_meta_globally_unique = xTrue - -o x$OCF_RESKEY_CRM_meta_globally_unique = xyes - -o x$OCF_RESKEY_CRM_meta_globally_unique = xYes - ]; then + elif [ x$OCF_RESKEY_CRM_meta_globally_unique = xtrue ] || + [ x$OCF_RESKEY_CRM_meta_globally_unique = xTrue ] || + [ x$OCF_RESKEY_CRM_meta_globally_unique = xyes ] || + [ x$OCF_RESKEY_CRM_meta_globally_unique = xYes ]; then SysInfoStats exit $OCF_SUCCESS fi exit $OCF_NOT_RUNNING } SysInfo_validate() { return $OCF_SUCCESS } if [ $# -ne 1 ]; then SysInfo_usage exit $OCF_ERR_ARGS fi : ${OCF_RESKEY_pidfile:="$HA_RSCTMP/SysInfo-${OCF_RESOURCE_INSTANCE}"} : ${OCF_RESKEY_clone:="0"} if [ x != x${OCF_RESKEY_delay} ]; then OCF_RESKEY_delay="-d ${OCF_RESKEY_delay}" fi case $__OCF_ACTION in meta-data) meta_data exit $OCF_SUCCESS ;; start) SysInfo_start ;; stop) SysInfo_stop ;; monitor) SysInfo_monitor ;; validate-all) SysInfo_validate ;; usage|help) SysInfo_usage exit 
$OCF_SUCCESS ;; *) SysInfo_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/apache-conf.sh b/heartbeat/apache-conf.sh index a3c893018..d94dbd3e8 100644 --- a/heartbeat/apache-conf.sh +++ b/heartbeat/apache-conf.sh @@ -1,196 +1,196 @@ # # Common apache code # (sourced by apache) # # Author: Alan Robertson # Sun Jiang Dong # # Support: linux-ha@lists.linux-ha.org # # License: GNU General Public License (GPL) # # Copyright: (C) 2002-2005 International Business Machines # source_envfiles() { for f; do [ -f "$f" -a -r "$f" ] && . "$f" done } apachecat() { awk ' function procline() { split($0,a); if( a[1]~/^[Ii]nclude$/ ) { includedir=a[2]; gsub("\"","",includedir); procinclude(includedir); } else { if( a[1]=="ServerRoot" ) { rootdir=a[2]; gsub("\"","",rootdir); } print; } } function printfile(infile, a) { while( (getline 0 ) { procline(); } close(infile); } function allfiles(dir, cmd,f) { cmd="find -L "dir" -type f"; while( ( cmd | getline f ) > 0 ) { printfile(f); } close(cmd); } function listfiles(pattern, cmd,f) { cmd="ls "pattern" 2>/dev/null"; while( ( cmd | getline f ) > 0 ) { printfile(f); } close(cmd); } function procinclude(spec) { if( rootdir!="" && spec!~/^\// ) { spec=rootdir"/"spec; } if( isdir(spec) ) { allfiles(spec); # read all files in a directory (and subdirs) } else { listfiles(spec); # there could be jokers } } function isdir(s) { return !system("test -d \""s"\""); } { procline(); } ' $1 | sed 's/#.*//;s/[[:blank:]]*$//;s/^[[:blank:]]*//' | grep -v '^$' } # # set parameters (as shell vars) from our apache config file # get_apache_params() { configfile=$1 shift 1 - vars=`echo $@ | sed 's/ /,/g'` + vars=$(echo "$@" | sed 's/ /,/g') eval ` apachecat $configfile | awk -v vars="$vars" ' BEGIN{ split(vars,v,","); for( i in v ) vl[i]=tolower(v[i]); } { for( i in v ) if( tolower($1)==vl[i] ) { print v[i]"="$2 delete vl[i] break } } '` } # # Return the location(s) that are handled by the given handler # FindLocationForHandler() { PerlScript='while (<>) { /"]+)/i && ($loc=$1); '"/SetHandler +$2"'/i && print "$loc\n"; }' apachecat $1 | perl -e "$PerlScript" } # # Check if the port is valid # CheckPort() { ocf_is_decimal "$1" && [ $1 -gt 0 ] } buildlocalurl() { [ "x$Listen" != "x" ] && echo "http://${Listen}" || echo "${LOCALHOST}:${PORT}" } # the test url may need a local prefix (as specified in the # apache Listen directive) fixtesturl() { echo $test_url | grep -qs "^http" && return test_url="`buildlocalurl`$test_url" } # # Get all the parameters we need from the Apache config file # GetParams() { ConfigFile=$1 if [ ! -f $ConfigFile ]; then return $OCF_ERR_INSTALLED fi get_apache_params $ConfigFile ServerRoot PidFile Port Listen case $PidFile in /*) ;; [[:alnum:]]*) PidFile=$ServerRoot/$PidFile;; *) # If the PidFile is not set in the config, set # a default location. PidFile=$HA_VARRUNDIR/${httpd_basename}.pid # Force the daemon to use this location by using # the -c option, which adds the PidFile directive # as if it was in the configuration file to begin with. PIDFILE_DIRECTIVE="true" ;; esac for p in "$PORT" "$Port" 80; do if CheckPort "$p"; then PORT="$p" break fi done echo $Listen | grep ':' >/dev/null || # Listen could be just port spec Listen="localhost:$Listen" # # It's difficult to figure out whether the server supports # the status operation. 
# (we start our server with -DSTATUS - just in case :-)) # # Typically (but not necessarily) the status URL is /server-status # # For us to think status will work, we have to have the following things: # # - The server-status handler has to be mapped to some URL somewhere # # We assume that: # # - the "main" web server at $PORT will also support it if we can find it # somewhere in the file # - it will be supported at the same URL as the one we find in the file # # If this doesn't work for you, then set the statusurl attribute. # if [ "X$STATUSURL" = "X" ] then StatusURL=`FindLocationForHandler $1 server-status | tail -1` STATUSURL="`buildlocalurl`$StatusURL" fi if ! test "$PidFile"; then return $OCF_ERR_INSTALLED else return $OCF_SUCCESS fi } diff --git a/heartbeat/conntrackd b/heartbeat/conntrackd index e81cda3e7..c09c3ad5b 100755 --- a/heartbeat/conntrackd +++ b/heartbeat/conntrackd @@ -1,335 +1,335 @@ #!/bin/bash # # # An OCF RA for conntrackd # http://conntrack-tools.netfilter.org/ # # Copyright (c) 2011 Dominik Klein # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### OCF_RESKEY_binary_default=conntrackd OCF_RESKEY_config_default=/etc/conntrackd/conntrackd.conf # For users of versions prior to 1.2: # Map renamed parameter "conntrackd" to "binary" if in use : ${OCF_RESKEY_binary=${OCF_RESKEY_conntrackd-${OCF_RESKEY_binary_default}}} : ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} meta_data() { cat < 1.2 Master/Slave OCF Resource Agent for conntrackd This resource agent manages conntrackd Name of the conntrackd executable. If conntrackd is installed and available in the default PATH, it is sufficient to configure the name of the binary For example "my-conntrackd-binary-version-0.9.14" If conntrackd is installed somewhere else, you may also give a full path For example "/packages/conntrackd-0.9.14/sbin/conntrackd" Name of the conntrackd executable Full path to the conntrackd.conf file. For example "/packages/conntrackd-0.9.14/etc/conntrackd/conntrackd.conf" Path to conntrackd.conf END } -meta_expect() +meta_expect_eq() { - local what=$1 whatvar=OCF_RESKEY_CRM_meta_${1//-/_} op=$2 expect=$3 - local val=${!whatvar} - if [[ -n $val ]]; then - # [, not [[, or it won't work ;) - [ $val $op $expect ] && return - fi - ocf_exit_reason "meta parameter misconfigured, expected $what $op $expect, but found ${val:-unset}." 
- exit $OCF_ERR_CONFIGURED + local what=$1 whatvar=OCF_RESKEY_CRM_meta_${1//-/_} expect=$2 + local val=${!whatvar} + if [[ -n $val ]]; then + # [, not [[, or it won't work ;) + [ $val = $expect ] && return + fi + ocf_exit_reason "meta parameter misconfigured, expected $what = $expect, but found ${val:-unset}." + exit $OCF_ERR_CONFIGURED } conntrackd_is_master() { - # You can't query conntrackd whether it is master or slave. It can be both at the same time. + # You can't query conntrackd whether it is master or slave. It can be both at the same time. # This RA creates a statefile during promote and enforces master-max=1 and clone-node-max=1 ha_pseudo_resource $statefile monitor } conntrackd_set_master_score() { ${HA_SBIN_DIR}/crm_master -Q -l reboot -v $1 } conntrackd_monitor() { rc=$OCF_NOT_RUNNING # It does not write a PID file, so check the socket exists after # extracting its path from the configuration file local conntrack_socket=$(awk '/^[ \t]*UNIX[ \t]*{/,/^[ \t]*}/ { if ($1 == "Path") { print $2 } }' $OCF_RESKEY_config) [ -S "$conntrack_socket" ] && rc=$OCF_SUCCESS if [ "$rc" -eq "$OCF_SUCCESS" ]; then - # conntrackd is running + # conntrackd is running # now see if it accepts queries if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -s > /dev/null 2>&1; then rc=$OCF_ERR_GENERIC ocf_exit_reason "conntrackd is running but not responding to queries" fi if conntrackd_is_master; then rc=$OCF_RUNNING_MASTER # Restore master setting on probes - if [ $OCF_RESKEY_CRM_meta_interval -eq 0 ]; then + if [ $OCF_RESKEY_CRM_meta_interval -eq 0 ]; then conntrackd_set_master_score $master_score fi else # Restore master setting on probes - if [ $OCF_RESKEY_CRM_meta_interval -eq 0 ]; then + if [ $OCF_RESKEY_CRM_meta_interval -eq 0 ]; then conntrackd_set_master_score $slave_score fi fi fi return $rc } conntrackd_start() { - rc=$OCF_ERR_GENERIC + rc=$OCF_ERR_GENERIC - # Keep trying to start the resource; - # wait for the CRM to time us out if this fails + # Keep trying to start the resource; + # wait for the CRM to time us out if this fails while :; do conntrackd_monitor status=$? - case "$status" in + case "$status" in $OCF_SUCCESS) conntrackd_set_master_score $slave_score # -n = request resync from the others if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -n; then ocf_exit_reason "$OCF_RESKEY_binary -C $OCF_RESKEY_config -n failed during start." rc=$OCF_ERR_GENERIC else rc=$OCF_SUCCESS fi break ;; $OCF_NOT_RUNNING) ocf_log info "Starting conntrackd" $OCF_RESKEY_binary -C $OCF_RESKEY_config -d ;; $OCF_RUNNING_MASTER) ocf_log warn "conntrackd already in master mode, demoting." ha_pseudo_resource $statefile stop ;; $OCF_ERR_GENERIC) ocf_exit_reason "conntrackd start failed" rc=$OCF_ERR_GENERIC break ;; esac done return $rc } conntrackd_stop() { - rc=$OCF_ERR_GENERIC - - # Keep trying to bring down the resource; - # wait for the CRM to time us out if this fails - while :; do - conntrackd_monitor - status=$?
+ case "$status" in + $OCF_SUCCESS|$OCF_ERR_GENERIC) ocf_log info "Stopping conntrackd" - $OCF_RESKEY_binary -C $OCF_RESKEY_config -k - ;; - $OCF_NOT_RUNNING) - rc=$OCF_SUCCESS - break - ;; - $OCF_RUNNING_MASTER) - ocf_log warn "conntrackd still master" + $OCF_RESKEY_binary -C $OCF_RESKEY_config -k + ;; + $OCF_NOT_RUNNING) + rc=$OCF_SUCCESS + break + ;; + $OCF_RUNNING_MASTER) + ocf_log warn "conntrackd still master" ;; - esac - done - return $rc + esac + done + return $rc } conntrackd_validate_all() { check_binary "$OCF_RESKEY_binary" if ! [ -e "$OCF_RESKEY_config" ]; then ocf_exit_reason "Config FILE $OCF_RESKEY_config does not exist" return $OCF_ERR_INSTALLED fi - meta_expect master-node-max = 1 - meta_expect master-max = 1 - meta_expect clone-node-max = 1 - + meta_expect_eq master-node-max 1 + meta_expect_eq master-max 1 + meta_expect_eq clone-node-max 1 + return $OCF_SUCCESS } conntrackd_promote() { rc=$OCF_SUCCESS if ! conntrackd_is_master; then # -c = Commit the external cache to the kernel # -f = Flush internal and external cache # -R = resync with the kernel table # -B = send a bulk update on the line for parm in c f R B; do if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm; then ocf_exit_reason "$OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm failed during promote." rc=$OCF_ERR_GENERIC break fi done ha_pseudo_resource $statefile start conntrackd_set_master_score $master_score fi return $rc } conntrackd_demote() { rc=$OCF_SUCCESS if conntrackd_is_master; then # -t = shorten kernel timers to remove zombies # -n = request a resync from the others for parm in t n; do if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm; then - ocf_exit_reason "$OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm failed during demote." - rc=$OCF_ERR_GENERIC - break - fi - done + ocf_exit_reason "$OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm failed during demote." + rc=$OCF_ERR_GENERIC + break + fi + done ha_pseudo_resource $statefile stop conntrackd_set_master_score $slave_score fi return $rc } conntrackd_notify() { hostname=$(hostname) # OCF_RESKEY_CRM_meta_notify_master_uname is a whitespace separated list of master hostnames for master in $OCF_RESKEY_CRM_meta_notify_master_uname; do # if we are the master and an instance was just started on another node: # send a bulk update to allow failback if [ "$hostname" = "$master" -a "$OCF_RESKEY_CRM_meta_notify_type" = "post" -a "$OCF_RESKEY_CRM_meta_notify_operation" = "start" -a "$OCF_RESKEY_CRM_meta_notify_start_uname" != "$hostname" ]; then ocf_log info "Sending bulk update in post start to peers to allow failback" $OCF_RESKEY_binary -C $OCF_RESKEY_config -B fi done for tobepromoted in $OCF_RESKEY_CRM_meta_notify_promote_uname; do # if there is a promote action to be executed on another node: # send a bulk update to allow failback if [ "$hostname" != "$tobepromoted" -a "$OCF_RESKEY_CRM_meta_notify_type" = "pre" -a "$OCF_RESKEY_CRM_meta_notify_operation" = "promote" ]; then ocf_log info "Sending bulk update in pre promote to peers to allow failback" $OCF_RESKEY_binary -C $OCF_RESKEY_config -B fi done } conntrackd_usage() { - cat < # # This agent incoporates code of a previous release created by # Alan Robertson and the community. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. 
# # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### db2_usage() { echo "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data" } db2_meta_data() { cat < 1.0 Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles in master/slave configuration. Multiple partitions are supported. Standard mode: An instance including all or selected databases is made highly available. Configure each partition as a separate primitive resource. HADR mode: A single database in HADR configuration is made highly available by automating takeover operations. Configure a master / slave resource with notifications enabled and an additional monitoring operation with role "Master". In case of HADR be very deliberate in specifying intervals/timeouts. The detection of a failure including promote must complete within HADR_PEER_WINDOW. In addition to honoring requirements for crash recovery etc. for your specific database use the following relations as guidance: "monitor interval" < HADR_PEER_WINDOW - (appr 30 sec) "promote timeout" < HADR_PEER_WINDOW + (appr 20 sec) For further information and examples consult http://www.linux-ha.org/wiki/db2_(resource_agent) Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles as master/slave configuration. Multiple partitions are supported. The instance of the database(s). instance List of databases to be managed, e.g "db1 db2". Defaults to all databases in the instance. Specify one db for HADR mode. List of databases to be managed DEPRECATED: The admin user of the instance. DEPRECATED: admin The number of the partion (DBPARTITIONNUM) to be managed. database partition number (DBPARTITIONNUM) END } # # validate # .. and set global variables # # exit on error # db2_validate() { local db2home db2sql db2instance # db2 uses korn shell check_binary "ksh" # check required instance vars if [ -z "$OCF_RESKEY_instance" ] then ocf_log err "DB2 required parameter instance is not set!" return $OCF_ERR_CONFIGURED fi instance=$OCF_RESKEY_instance if [ -n "$OCF_RESKEY_admin" ] then ocf_log warn "DB2 deprecated parameter admin is set, using $OCF_RESKEY_admin as instance." instance=$OCF_RESKEY_admin fi db2node=${OCF_RESKEY_dbpartitionnum:-0} db2home=$(sh -c "echo ~$instance") db2sql=$db2home/sqllib db2profile=$db2sql/db2profile db2bin=$db2sql/bin STATE_FILE=${HA_RSCTMP}/db2-${OCF_RESOURCE_INSTANCE}.state # Let's make sure a few important things are there... if ! 
[ -d "$db2sql" -a -d "$db2bin" -a -f "$db2profile" -a \ -x "$db2profile" -a -x "$db2bin/db2" ] then ocf_is_probe && exit $OCF_NOT_RUNNING ocf_log err "DB2 required directories and/or files not found" exit $OCF_ERR_INSTALLED fi db2instance=$(runasdb2 'echo $DB2INSTANCE') if [ "$db2instance" != "$instance" ] then ocf_is_probe && exit $OCF_NOT_RUNNING ocf_log err "DB2 parameter instance \"$instance\" != DB2INSTANCE \"$db2instance\"" exit $OCF_ERR_CONFIGURED fi # enough checking for stop to succeed [ $__OCF_ACTION = stop ] && return $OCF_SUCCESS dblist=$OCF_RESKEY_dblist if [ -n "$dblist" ] then # support , as separator as well dblist=$(echo "$dblist" | sed -e 's/[,]/ /g') else if ! dblist=$(db2_dblist) then ocf_log err "DB2 $instance($db2node): cannot retrieve db directory" exit $OCF_ERR_INSTALLED fi fi # check requirements for the HADR case if ocf_is_ms then set -- $dblist if [ $# != 1 ] then ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have exactly one name in dblist" exit $OCF_ERR_CONFIGURED fi if [ $db2node != 0 ] then ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have dbpartitionnum=0" exit $OCF_ERR_CONFIGURED fi fi return $OCF_SUCCESS } master_score() { if ! have_binary "crm_master"; then return fi crm_master $* } # # Run the given command as db2 instance user # runasdb2() { su $instance -c ". $db2profile; $*" } # # Run a command as the DB2 admin, and log the output # logasdb2() { local output rc output=$(runasdb2 $*) rc=$? if [ $rc -eq 0 ] then ocf_log info "$output" else ocf_log err "$output" fi return $rc } # # maintain the fal (first active log) attribute # db2_fal_attrib DB {set val|get} # db2_fal_attrib() { local db=$1 local attr val rc id node member me attr=db2hadr_${instance}_${db}_fal case "$2" in set) me=$(uname -n) # loop over all member nodes and set attribute crm_node -l | while read id node member do [ "$member" = member -a "$node" != "$me" ] || continue crm_attribute -t nodes -l reboot --node=$node -n $attr -v "$3" rc=$? ocf_log info "DB2 instance $instance($db2node/$db: setting attrib for FAL to $FIRST_ACTIVE_LOG @ $node" [ $rc != 0 ] && break done ;; get) crm_attribute -t nodes -l reboot -n $attr -G -Q 2>&1 rc=$? if [ $rc != 0 ] then ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve attribute $attr, are you sure notifications are enabled ?" fi ;; *) exit $OCF_ERR_CONFIGURED esac return $rc } # # unfortunately a first connect after a crash may need several minutes # for some internal cleanup stuff in DB2. # We run a connect in background so other connects (i.e. monitoring!) may proceed. # db2_run_connect() { local db=$1 logasdb2 "db2 connect to $db; db2 terminate" } # # get some data from the database config # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW # db2_get_cfg() { local db=$1 local output hadr_vars output=$(runasdb2 db2 get db cfg for $db) [ $? 
!= 0 ] && return $OCF_ERR_GENERIC hadr_vars=$(echo "$output" | awk '/HADR database role/ {printf "HADR_ROLE='%s'; ", $NF;} /HADR_TIMEOUT/ {printf "HADR_TIMEOUT='%s'; ", $NF;} /First active log file/ {printf "FIRST_ACTIVE_LOG='%s'\n", $NF;} /HADR_PEER_WINDOW/ {printf "HADR_PEER_WINDOW='%s'\n", $NF;}') # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW eval $hadr_vars # HADR_PEER_WINDOW comes with V9 and is checked later if [ -z "$HADR_ROLE" -o -z "$HADR_TIMEOUT" ] then ocf_log error "DB2 cfg values invalid for $instance($db2node)/$db: $hadr_vars" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # # return the list of databases in the instance # db2_dblist() { local output output=$(runasdb2 db2 list database directory) || return $OCF_ERR_GENERIC echo "$output" | grep -i 'Database name.*=' | sed 's%.*= *%%' } # # Delayed check of the compatibility of DB2 instance and pacemaker # config. # Logically this belongs to validate but certain parameters can only # be retrieved once the instance is started. # db2_check_config_compatibility() { local db=$1 local is_ms ocf_is_ms is_ms=$? case "$HADR_ROLE/$is_ms" in STANDARD/0) ocf_log err "DB2 database $instance/$db is not in a HADR configuration but I am a M/S resource" exit $OCF_ERR_INSTALLED ;; STANDARD/1) # OK ;; */0) if [ -z "$HADR_PEER_WINDOW" ] then ocf_log err "DB2 database $instance: release to old, need HADR_PEER_WINDOW (>=V9)" exit $OCF_ERR_INSTALLED fi ;; */1) ocf_log err "DB2 database $instance/$db is in a HADR configuration but I must be a M/S resource" esac } # # Start instance and DB. # Standard mode is through "db2 activate" in order to start in previous # mode (Standy/Primary). # If the database is a primary AND we can determine that the running master # has a higher "first active log" we conclude that we come up after a crash # an the previous Standby is now Primary. # The db is then started as Standby. # # Other cases: danger of split brain, log error and do nothing. # db2_start() { local output start_cmd db local start_opts="dbpartitionnum $db2node" # If we detect that db partitions are not in use, and no # partition is explicitly specified, activate without # partition information. This allows db2 instances without # partition support to be managed. if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -a "$db2sql/db2nodes.cfg" ]; then start_opts="" fi if output=$(runasdb2 db2start $start_opts) then ocf_log info "DB2 instance $instance($db2node) started: $output" else case $output in *SQL1026N*) ocf_log info "DB2 instance $instance($db2node) already running: $output" ;; *) ocf_log err "$output" return $OCF_ERR_GENERIC esac fi if ! db2_instance_status then ocf_log err "DB2 instance $instance($db2node) is not active!" return $OCF_ERR_GENERIC fi [ $db2node = 0 ] || return $OCF_SUCCESS # activate DB only on node 0 for db in $dblist do # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG db2_get_cfg $db || return $? # Better late than never: can only check this when the instance is already up db2_check_config_compatibility $db start_cmd="db2 activate db $db" if [ $HADR_ROLE = PRIMARY ] then local master_fal # communicate our FAL to other nodes the might start concurrently db2_fal_attrib $db set $FIRST_ACTIVE_LOG + # ignore false positive: + # error: Can't use > in [ ]. Escape it or use [[..]]. 
[SC2073] + # see https://github.com/koalaman/shellcheck/issues/691 + # shellcheck disable=SC2073 if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' $FIRST_ACTIVE_LOG ] then ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary" start_cmd="db2 start hadr on db $db as standby" HADR_ROLE=STANDBY fi fi if output=$(runasdb2 $start_cmd) then ocf_log info "DB2 database $instance($db2node)/$db started/activated" [ $HADR_ROLE != STANDBY ] && db2_run_connect $db & else case $output in SQL1490W*|SQL1494W*|SQL1497W*|SQL1777N*) ocf_log info "DB2 database $instance($db2node)/$db already activated: $output" ;; SQL1768N*"Reason code = \"7\""*) ocf_log err "DB2 database $instance($db2node)/$db is a Primary and the Standby is down" ocf_log err "Possible split brain ! Manual intervention required." ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\"" ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\"" # might be the Standby is not yet there # might be a timing problem because "First active log" is delayed # on the next start attempt we might succeed when FAL was advanced # might be manual intervention is required # ... so let pacemaker give it another try and we will succeed then return $OCF_ERR_GENERIC ;; *) ocf_log err "DB2 database $instance($db2node)/$db didn't start: $output" return $OCF_ERR_GENERIC esac fi done # come here with success # Even if we are a db2 Primary pacemaker requires start to end up in slave mode echo SLAVE > $STATE_FILE return $OCF_SUCCESS } # # helper function to be spawned # so we can detect a hang of the db2stop command # db2_stop_bg() { local rc output local stop_opts="dbpartitionnum $db2node" rc=$OCF_SUCCESS if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -a "$db2sql/db2nodes.cfg" ]; then stop_opts="" fi if output=$(runasdb2 db2stop force $stop_opts) then ocf_log info "DB2 instance $instance($db2node) stopped: $output" else case $output in *SQL1032N*) #SQL1032N No start database manager command was issued ocf_log info "$output" ;; *) ocf_log err "DB2 instance $instance($db2node) stop failed: $output" rc=$OCF_ERR_GENERIC esac fi return $rc } # # Stop the given db2 database instance # db2_stop() { local stop_timeout grace_timeout stop_bg_pid i must_kill # remove master score master_score -D -l reboot # be very early here in order to avoid stale data rm -f $STATE_FILE db2_instance_status if [ $? -eq $OCF_NOT_RUNNING ]; then ocf_log info "DB2 instance $instance already stopped" return $OCF_SUCCESS fi stop_timeout=${OCF_RESKEY_CRM_meta_timeout:-20000} # grace_time is 4/5 (unit is ms) grace_timeout=$((stop_timeout/1250)) # start db2stop in background as this may hang db2_stop_bg & stop_bg_pid=$! # wait for grace_timeout i=0 while [ $i -lt $grace_timeout ] do kill -0 $stop_bg_pid 2>/dev/null || break; sleep 1 i=$((i+1)) done # collect exit status but don't hang if kill -0 $stop_bg_pid 2>/dev/null then stoprc=1 kill -9 $stop_bg_pid 2>/dev/null else wait $stop_bg_pid stoprc=$? fi must_kill=0 if [ $stoprc -ne 0 ] then ocf_log warn "DB2 instance $instance($db2node): db2stop failed, using db2nkill" must_kill=1 elif ! 
db2_instance_dead then ocf_log warn "DB2 instance $instance($db2node): db2stop indicated success but there are still processes, using db2nkill" must_kill=1 fi if [ $must_kill -eq 1 ] then # db2nkill kills *all* partitions on the node if [ -x $db2bin/db2nkill ] then logasdb2 $db2bin/db2nkill $db2node elif [ -x $db2bin/db2_kill ] then logasdb2 $db2bin/db2_kill fi # loop forever (or lrmd kills us due to timeout) until the # instance is dead while ! db2_instance_dead do ocf_log info "DB2 instance $instance($db2node): waiting for processes to exit" sleep 1 done ocf_log info "DB2 instance $instance($db2node) is now dead" fi return $OCF_SUCCESS } # # check whether enough processes for a healthy instance are up # db2_instance_status() { local pscount pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l) if [ $pscount -ge 4 ]; then return $OCF_SUCCESS; elif [ $pscount -ge 1 ]; then return $OCF_ERR_GENERIC fi return $OCF_NOT_RUNNING } # # is the given db2 instance dead? # db2_instance_dead() { local pscount pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l) test $pscount -eq 0 } # # return the status of the db as "Role/Status" # e.g. Primary/Peer, Standby/RemoteCatchupPending # # If not in HADR configuration return "Standard/Standalone" # db2_hadr_status() { local db=$1 local output output=$(runasdb2 db2pd -hadr -db $db) if [ $? != 0 ] then echo "Down/Off" return 1 fi echo "$output" | awk '/^HADR is not active/ {print "Standard/Standalone"; exit; } /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }' } # # Monitor the db # And as side effect set crm_master / FAL attribute # db2_monitor() { local CMD output hadr db local rc db2_instance_status rc=$? if [ $rc -ne $OCF_SUCCESS ]; then # instance is dead, remove master score master_score -D -l reboot exit $rc fi [ $db2node = 0 ] || return 0 # monitoring only for partition 0 for db in $dblist do hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr" # set master preference accordingly case "$hadr" in Primary/*|Standard/*) # perform a basic health check CMD="if db2 connect to $db; then db2 select \* from sysibm.sysversions ; rc=\$?; db2 terminate; else rc=\$?; fi; exit \$rc" if !
output=$(runasdb2 $CMD) then case "$output" in SQL1776N*) # can't connect/select on standby, may be spurious during takeover ;; *) ocf_log err "DB2 database $instance($db2node)/$db is not working" ocf_log err "DB2 message: $output" # dead primary, remove master score master_score -D -l reboot return $OCF_ERR_GENERIC esac fi ocf_log debug "DB2 database $instance($db2node)/$db appears to be working" ocf_is_ms && master_score -v 10000 -l reboot ;; Standby/*Peer) master_score -v 8000 -l reboot ;; Standby/*) ocf_log warn "DB2 database $instance($db2node)/$db in status $hadr can never be promoted" master_score -D -l reboot ;; *) return $OCF_ERR_GENERIC esac done # everything OK, return if running as slave grep MASTER $STATE_FILE >/dev/null 2>&1 || return $OCF_SUCCESS return $OCF_RUNNING_MASTER } # # Promote db to Primary # db2_promote() { # validate ensured that dblist contains only one entry local db=$dblist local i hadr output force # we run this twice as after a crash of the other node # within HADR_TIMEOUT the status may be still reported as Peer # although a connection no longer exists for i in 1 2 do hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be promoted" case "$hadr" in Standard/Standalone) # this case only to keep ocf-tester happy return $OCF_SUCCESS ;; Primary/Peer) # nothing to do, only update pacemaker's view echo MASTER > $STATE_FILE return $OCF_SUCCESS ;; Standby/Peer) # must take over ;; Standby/DisconnectedPeer) # must take over forced force="by force peer window only" ;; *) return $OCF_ERR_GENERIC esac if output=$(runasdb2 db2 takeover hadr on db $db $force) then # update pacemaker's view echo MASTER > $STATE_FILE # turn the log so we rapidly get a new FAL logasdb2 "db2 archive log for db $db" return $OCF_SUCCESS fi case "$output" in SQL1770N*"Reason code = \"7\""*) # expected, HADR_TIMEOUT is now expired # go for the second try continue ;; *) ocf_log err "DB2 database $instance($db2node)/$db promote failed: $output" return $OCF_ERR_GENERIC esac done return $OCF_ERR_GENERIC } # # Demote db to standby # db2_demote() { # validate ensured that dblist contains only one entry local db=$dblist local hadr # housekeeping, set pacemaker's view to slave echo SLAVE > $STATE_FILE hadr=$(db2_hadr_status $dblist) || return $OCF_ERR_GENERIC ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be demoted" db2_monitor return $? } # # handle pre start notification # We record our first active log on the other nodes. # If two primaries come up after a crash they can safely determine who is # the outdated one. # db2_notify() { local node # only interested in pre-start [ $OCF_RESKEY_CRM_meta_notify_type = pre \ -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCCESS # gets FIRST_ACTIVE_LOG db2_get_cfg $dblist || return $? db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC exit $OCF_SUCCESS } ######## # Main # ######## case "$__OCF_ACTION" in meta-data) db2_meta_data exit $OCF_SUCCESS ;; usage) db2_usage exit $OCF_SUCCESS ;; start) db2_validate db2_start || exit $? db2_monitor exit $? ;; stop) db2_validate db2_stop exit $? ;; promote) db2_validate db2_promote exit $? ;; demote) db2_validate db2_demote exit $? ;; notify) db2_validate db2_notify exit $? ;; monitor) db2_validate db2_monitor exit $? ;; validate-all) db2_validate exit $?
;; *) db2_usage exit $OCF_ERR_UNIMPLEMENTED esac diff --git a/heartbeat/eDir88 b/heartbeat/eDir88 index 31b1a06e6..b4c7952ff 100755 --- a/heartbeat/eDir88 +++ b/heartbeat/eDir88 @@ -1,460 +1,460 @@ #!/bin/bash # # eDirectory Resource Agent (RA) for Heartbeat. # This script is only compatible with eDirectory 8.8 and later # # Copyright (c) 2007 Novell Inc, Yan Fitterer # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # # OCF parameters: # OCF_RESKEY_eDir_config_file - full filename to instance configuration file # OCF_RESKEY_eDir_monitor_ldap - Should we monitor LDAP (0/1 - 1 is true) # OCF_RESKEY_eDir_monitor_idm - Should we monitor IDM (0/1 - 1 is true) # OCF_RESKEY_eDir_jvm_initial_heap - Value of the DHOST_INITIAL_HEAP java env var # OCF_RESKEY_eDir_jvm_max_heap - Value of the DHOST_MAX_HEAP java env var # OCF_RESKEY_eDir_jvm_options - Value of the DHOST_OPTIONS java env var ############################################################################### ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs test -f /opt/novell/eDirectory/bin/ndspath && . /opt/novell/eDirectory/bin/ndspath 2>/dev/null >/dev/null ####################################################################### usage() { ME=$(basename "$0") cat <<-EOFA usage: $ME start|stop|status|monitor|validate-all $ME manages an eDirectory instance as an HA resource. The 'start' operation starts the instance. The 'stop' operation stops the instance. The 'status' operation reports if the instance is running. The 'monitor' operation reports if the instance is running, and runs additional checks. The 'validate-all' operation checks the validity of the arguments (environment variables). EOFA } eDir_meta_data() { cat <<-EOFB 1.0 Resource script for managing an eDirectory instance. Manages a single instance of eDirectory as an HA resource. The "multiple instances" feature or eDirectory has been added in version 8.8. This script will not work for any version of eDirectory prior to 8.8. This RA can be used to load multiple eDirectory instances on the same host. It is very strongly recommended to put eDir configuration files (as per the eDir_config_file parameter) on local storage on each node. This is necessary for this RA to be able to handle situations where the shared storage has become unavailable. If the eDir configuration file is not available, this RA will fail, and heartbeat will be unable to manage the resource. 
Side effects include STONITH actions, unmanageable resources, etc... Setting a high action timeout value is _very_ _strongly_ recommended. eDir with IDM can take in excess of 10 minutes to start. If heartbeat times out before eDir has had a chance to start properly, mayhem _WILL ENSUE_. The LDAP module seems to be one of the very last to start. So this script will take even longer to start on installations with IDM and LDAP if the monitoring of IDM and/or LDAP is enabled, as the start command will wait for IDM and LDAP to be available. Manages a Novell eDirectory directory server Path to configuration file for eDirectory instance. eDir config file Should we monitor if LDAP is running for the eDirectory instance? eDir monitor ldap Should we monitor if IDM is running for the eDirectory instance? eDir monitor IDM Value for the DHOST_INITIAL_HEAP java environment variable. If unset, java defaults will be used. DHOST_INITIAL_HEAP value Value for the DHOST_MAX_HEAP java environment variable. If unset, java defaults will be used. DHOST_MAX_HEAP value Value for the DHOST_OPTIONS java environment variable. If unset, original values will be used. DHOST_OPTIONS value EOFB return $OCF_SUCCESS } # # eDir_start: Start eDirectory instance # eDir_start() { if eDir_status ; then ocf_log info "eDirectory is already running ($NDSCONF)." return $OCF_SUCCESS fi # Start eDirectory instance if [ -n "$OCF_RESKEY_eDir_jvm_initial_heap" ]; then DHOST_JVM_INITIAL_HEAP=$OCF_RESKEY_eDir_jvm_initial_heap export DHOST_JVM_INITIAL_HEAP fi if [ -n "$OCF_RESKEY_eDir_jvm_max_heap" ]; then DHOST_JVM_MAX_HEAP=$OCF_RESKEY_eDir_jvm_max_heap export DHOST_JVM_MAX_HEAP fi if [ -n "$OCF_RESKEY_eDir_jvm_options" ]; then DHOST_JVM_OPTIONS=$OCF_RESKEY_eDir_jvm_options export DHOST_JVM_OPTIONS fi $NDSMANAGE start --config-file "$NDSCONF" > /dev/null 2>&1 if [ $? -eq 0 ]; then ocf_log info "eDir start command sent for $NDSCONF." else echo "ERROR: Can't start eDirectory for $NDSCONF." return $OCF_ERR_GENERIC fi CNT=0 while ! eDir_monitor ; do # Apparently, LDAP will only start after all other services # Startup time can be in excess of 10 minutes. # Leave a very long heartbeat timeout on the start action # We're relying on heartbeat to bail us out... let CNT=$CNT+1 ocf_log info "eDirectory start waiting for ${CNT}th retry for $NDSCONF." sleep 10 done ocf_log info "eDirectory start verified for $NDSCONF." return $OCF_SUCCESS } # # eDir_stop: Stop eDirectory instance # This action is written in such a way that even when run # on a node were things are broken (no binaries, no config # etc...) it will try to stop any running ndsd processes # and report success if none are running. # eDir_stop() { if ! eDir_status ; then return $OCF_SUCCESS fi $NDSMANAGE stop --config-file "$NDSCONF" >/dev/null 2>&1 if eDir_status ; then # eDir failed to stop. ocf_log err "eDirectory instance failed to stop for $NDSCONF" return $OCF_ERR_GENERIC else ocf_log info "eDirectory stop verified for $NDSCONF." return $OCF_SUCCESS fi } # # eDir_status: is eDirectory instance up ? # eDir_status() { if [ ! -r "$NDSCONF" ] ; then ocf_log err "Config file missing ($NDSCONF)." exit $OCF_ERR_GENERIC fi # Find how many ndsd processes have open listening sockets # with the IP of this eDir instance IFACE=$(grep -i "n4u.server.interfaces" $NDSCONF | cut -f2 -d= | tr '@' ':') if [ -z "$IFACE" ] ; then ocf_log err "Cannot retrieve interfaces from $NDSCONF. eDirectory may not be correctly configured." 
exit $OCF_ERR_GENERIC fi # In case of multiple IP's split into an array # and check all of them IFS=', ' read -a IFACE2 <<< "$IFACE" ocf_log debug "Found ${#IFACE2[@]} interfaces from $NDSCONF." counter=${#IFACE2[@]} - for IFACE in ${IFACE2[@]} + for IFACE in "${IFACE2[@]}" do ocf_log debug "Checking ndsd instance for $IFACE" NDSD_SOCKS=$(netstat -ntlp | grep -ce "$IFACE.*ndsd") if [ "$NDSD_SOCKS" -eq 1 ] ; then let counter=counter-1 ocf_log debug "Found ndsd instance for $IFACE" elif [ "$NDSD_SOCKS" -gt 1 ] ; then ocf_log err "More than 1 ndsd listening socket matched. Likely misconfiguration of eDirectory." exit $OCF_ERR_GENERIC fi done if [ $counter -eq 0 ] ; then # Correct ndsd instance is definitely running ocf_log debug "All ndsd instances found." return 0; elif [ $counter -lt ${#IFACE2[@]} ]; then ocf_log err "Only some ndsd listening sockets matched, something is very wrong." exit $OCF_ERR_GENERIC fi # No listening socket. Make sure we don't have the process running... PIDDIR=$(grep -i "n4u.server.vardir" "$NDSCONF" | cut -f2 -d=) if [ -z "$PIDDIR" ] ; then ocf_log err "Cannot get vardir from nds config ($NDSCONF). Probable eDir configuration error." exit $OCF_ERR_GENERIC fi NDSD_PID=$(cat $PIDDIR/ndsd.pid 2>/dev/null) if [ -z "$NDSD_PID" ] ; then # PID file unavailable or empty. # This will happen if the PIDDIR is not available # on this node at this time. return 1 fi RC=$(ps -p "$NDSD_PID" | grep -c ndsd) if [ "$RC" -gt 0 ] ; then # process found but no listening socket. ndsd likely not operational ocf_log err "ndsd process found, but no listening socket. Something's gone wrong ($NDSCONF)" exit $OCF_ERR_GENERIC fi ocf_log debug "ndsd instance is not running, but no other error detected." return 1 } # # eDir_monitor: Do more in-depth checks to ensure that eDirectory is fully functional # LDAP and IDM checks are only done if reqested. # # eDir_monitor() { if ! eDir_status ; then ocf_log info "eDirectory instance is down ($NDSCONF)" return $OCF_NOT_RUNNING fi # We know the right ndsd is running locally, check health $NDSSTAT --config-file "$NDSCONF" >/dev/null 2>&1 if [ $? -ne 0 ] ; then return 1 fi # Monitor IDM first, as it will start before LDAP if [ $MONITOR_IDM -eq 1 ]; then RET=$($NDSTRACE --config-file "$NDSCONF" -c modules | egrep -i '^vrdim.*Running' | awk '{print $1}') if [ "$RET" != "vrdim" ]; then ocf_log err "eDirectory IDM engine isn't running ($NDSCONF)." return $OCF_ERR_GENERIC fi fi if [ $MONITOR_LDAP -eq 1 ] ; then $NDSNLDAP -c --config-file "$NDSCONF" >/dev/null 2>&1 if [ $? -ne 0 ]; then ocf_log err "eDirectory LDAP server isn't running ($NDSCONF)." return $OCF_ERR_GENERIC fi fi ocf_log debug "eDirectory monitor success ($NDSCONF)" return $OCF_SUCCESS } # # eDir_validate: Validate environment # eDir_validate() { declare rc=$OCF_SUCCESS # Script must be run as root if ! ocf_is_root ; then ocf_log err "$0 must be run as root" rc=$OCF_ERR_GENERIC fi # ndsmanage must be available and runnable check_binary $NDSMANAGE # ndsstat must be available and runnable check_binary $NDSSTAT # Config file must be readable if [ ! 
-r "$NDSCONF" ] ; then ocf_log err "eDirectory configuration file [$NDSCONF] is not readable" rc=$OCF_ERR_ARGS fi # monitor_ldap must be unambiguously resolvable to a truth value MONITOR_LDAP=$(echo "$MONITOR_LDAP" | tr [A-Z] [a-z]) case "$MONITOR_LDAP" in yes|true|1) MONITOR_LDAP=1;; no|false|0) MONITOR_LDAP=0;; *) ocf_log err "Configuration parameter eDir_monitor_ldap has invalid value [$MONITOR_LDAP]" rc=$OCF_ERR_ARGS;; esac # monitor_idm must be unambiguously resolvable to a truth value MONITOR_IDM=$(echo "$MONITOR_IDM" | tr [A-Z] [a-z]) case "$MONITOR_IDM" in yes|true|1) MONITOR_IDM=1;; no|false|0) MONITOR_IDM=0;; *) ocf_log err "Configuration parameter eDir_monitor_idm has invalid value [$MONITOR_IDM]" rc=$OCF_ERR_ARGS;; esac # eDir_jvm_initial_heap must be blank or numeric if [ -n "$OCF_RESKEY_eDir_jvm_initial_heap" ] ; then if ! ocf_is_decimal "$OCF_RESKEY_eDir_jvm_initial_heap" ; then ocf_log err "Configuration parameter eDir_jvm_initial_heap has invalid" \ "value [$OCF_RESKEY_eDir_jvm_initial_heap]" rc=$OCF_ERR_ARGS fi fi # eDir_jvm_max_heap must be blank or numeric if [ -n "$OCF_RESKEY_eDir_jvm_max_heap" ] ; then if ! ocf_is_decimal "$OCF_RESKEY_eDir_jvm_max_heap" ; then ocf_log err "Configuration parameter eDir_jvm_max_heap has invalid" \ "value [$OCF_RESKEY_eDir_jvm_max_heap]" rc=$OCF_ERR_ARGS fi fi if [ $rc -ne $OCF_SUCCESS ] ; then ocf_log err "Invalid environment" fi return $rc } # # Start of main logic # -ocf_log debug "$0 started with arguments \"$@\"" +ocf_log debug "$0 started with arguments \"$*\"" NDSBASE=/opt/novell/eDirectory NDSNLDAP=$NDSBASE/sbin/nldap NDSMANAGE=$NDSBASE/bin/ndsmanage NDSSTAT=$NDSBASE/bin/ndsstat NDSTRACE=$NDSBASE/bin/ndstrace NDSCONF=${OCF_RESKEY_eDir_config_file:-/etc/opt/novell/eDirectory/conf/nds.conf} MONITOR_LDAP=${OCF_RESKEY_eDir_monitor_ldap:-0} MONITOR_IDM=${OCF_RESKEY_eDir_monitor_idm:-0} # What kind of method was invoked? case "$1" in validate-all) eDir_validate; exit $?;; meta-data) eDir_meta_data; exit $OCF_SUCCESS;; status) if eDir_status ; then ocf_log info "eDirectory instance is up ($NDSCONF)" exit $OCF_SUCCESS else ocf_log info "eDirectory instance is down ($NDSCONF)" exit $OCF_NOT_RUNNING fi;; start) : skip;; stop) : skip;; monitor) : skip;; usage) usage; exit $OCF_SUCCESS;; *) ocf_log err "Invalid argument [$1]" usage; exit $OCF_ERR_ARGS;; esac # From now on we must have a valid environment to continue. # stop goes in the list above as it should ideally be able to # clean up after a start that failed due to bad args eDir_validate RC=$? if [ $RC -ne $OCF_SUCCESS ]; then exit $RC fi case "$1" in start) eDir_start;; stop) eDir_stop;; monitor) eDir_monitor;; esac exit $? diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh index ecd7843b8..958354b48 100644 --- a/heartbeat/findif.sh +++ b/heartbeat/findif.sh @@ -1,258 +1,258 @@ #!/bin/sh ipcheck_ipv4() { local r1_to_255="([1-9][0-9]?|1[0-9][0-9]|2[0-4][0-9]|25[0-5])" local r0_to_255="([0-9][0-9]?|1[0-9][0-9]|2[0-4][0-9]|25[0-5])" local r_ipv4="^$r1_to_255\.$r0_to_255\.$r0_to_255\.$r0_to_255$" echo "$1" | grep -q -Ee "$r_ipv4" } ipcheck_ipv6() { ! echo "$1" | grep -qs "[^0-9:a-fA-F]" } ifcheck() { local ifname="$1" - $IP2UTIL link show dev $ifname 2>&1 >/dev/null + $IP2UTIL link show dev $ifname >/dev/null 2>&1 } prefixcheck() { local prefix=$1 local prefix_length=${#prefix} local prefix_check=$2 if [ $prefix_length -gt 3 -o $prefix_length -eq 0 ] ; then return 1 fi echo "$prefix" | grep -qs "[^0-9]" if [ $? 
= 0 ] ; then return 1 fi if [ $prefix -lt 1 -o $prefix -gt $prefix_check ] ; then return 1 fi return 0 } getnetworkinfo() { local line netinfo ip -o -f inet route list match $OCF_RESKEY_ip table local scope host | (while read line; do netinfo=`echo $line | awk '{print $2}'` case $netinfo in */*) set -- $line break ;; esac done echo $line) } # previous versions of the IPaddr2 resource agent used to accept a netmask # in dotted quad notation (and convert to cidr notation implicitly; possibly # with warnings nobody ever noticed) # We can do so here as well. maybe_convert_dotted_quad_to_cidr() { # does this even look like a dotted quad notation? case $netmask in # invalid if it contains other than digits and dots # invalid if it contains adjacent dots, # or starts or ends with a dot # or more than three dots # or more than three digits in a row *[!0-9.]* | *..* | *.*.*.*.* | .* | *. | *[0-9][0-9][0-9][0-9]* ) return ;; # do we have three dots? # component range check on <= 255 is done below *.*.*.*) : ;; *) return ;; esac local IFS=. set -- $netmask [ $# = 4 ] || return; local b m=0 mask; for b ; do [ $b -le 255 ] || return; m=$(( (m << 8) + b )); done; case $m in # for i in `seq 32 -1 0`; do printf "%10u) netmask=$i ;;\n" $(( ((1 << i)-1) << (32 - i) )); done 4294967295) mask=32 ;; 4294967294) mask=31 ;; 4294967292) mask=30 ;; 4294967288) mask=29 ;; 4294967280) mask=28 ;; 4294967264) mask=27 ;; 4294967232) mask=26 ;; 4294967168) mask=25 ;; 4294967040) mask=24 ;; 4294966784) mask=23 ;; 4294966272) mask=22 ;; 4294965248) mask=21 ;; 4294963200) mask=20 ;; 4294959104) mask=19 ;; 4294950912) mask=18 ;; 4294934528) mask=17 ;; 4294901760) mask=16 ;; 4294836224) mask=15 ;; 4294705152) mask=14 ;; 4294443008) mask=13 ;; 4293918720) mask=12 ;; 4292870144) mask=11 ;; 4290772992) mask=10 ;; 4286578688) mask=9 ;; 4278190080) mask=8 ;; 4261412864) mask=7 ;; 4227858432) mask=6 ;; 4160749568) mask=5 ;; 4026531840) mask=4 ;; 3758096384) mask=3 ;; 3221225472) mask=2 ;; 2147483648) mask=1 ;; 0) mask=0 ;; *) ocf_log err "Bogus netmask: $netmask" ; return ;; esac ocf_log warn "Please convert dotted quad netmask $netmask to CIDR notation $mask!" netmask=$mask } findif_check_params() { local family="$1" local match="$OCF_RESKEY_ip" local nic="$OCF_RESKEY_nic" # netmask NOT local, see maybe_convert_dotted_quad_to_cidr netmask="$OCF_RESKEY_cidr_netmask" local brdcast="$OCF_RESKEY_broadcast" local errmsg maybe_convert_dotted_quad_to_cidr # Do a sanity check only on start and validate-all # to avoid returning OCF_ERR_CONFIGURED from the monitor operation. case $__OCF_ACTION in start|validate-all) true;; *) return $OCF_SUCCESS;; esac if [ -n "$nic" ] ; then errmsg=`ifcheck $nic` if [ $? -ne 0 ] ; then ocf_log err "Invalid interface name [$nic]: $errmsg" return $OCF_ERR_CONFIGURED fi fi if [ "$family" = "inet6" ] ; then ipcheck_ipv6 $match if [ $? = 1 ] ; then ocf_log err "IP address [$match] not valid." return $OCF_ERR_CONFIGURED fi if [ -z "$nic" ] ; then echo $match | grep -qis '^fe80::' if [ $? = 0 ] ; then ocf_log err "'nic' parameter is mandatory for a link local address [$match]." return $OCF_ERR_CONFIGURED fi fi if [ -n "$netmask" ] ; then prefixcheck $netmask 128 if [ $? = 1 ] ; then ocf_log err "Invalid netmask specification [$netmask]." return $OCF_ERR_CONFIGURED fi fi else # family = inet ipcheck_ipv4 $match if [ $? = 1 ] ; then ocf_log err "IP address [$match] not valid." return $OCF_ERR_CONFIGURED fi if [ -n "$netmask" ] ; then prefixcheck $netmask 32 if [ $? 
= 1 ] ; then ocf_log err "Invalid netmask specification [$netmask]." return $OCF_ERR_CONFIGURED fi fi if [ -n "$brdcast" ] ; then ipcheck_ipv4 $brdcast if [ $? = 1 ] ; then if [ "$brdcast" != "+" -a "$brdcast" != "-" ]; then ocf_log err "Invalid broadcast address [$brdcast]." return $OCF_ERR_CONFIGURED fi fi fi fi return $OCF_SUCCESS } findif() { local match="$OCF_RESKEY_ip" local family local scope local nic="$OCF_RESKEY_nic" local netmask="$OCF_RESKEY_cidr_netmask" local brdcast="$OCF_RESKEY_broadcast" echo $match | grep -qs ":" if [ $? = 0 ] ; then family="inet6" else family="inet" scope="scope link" fi findif_check_params $family || return $? if [ -n "$netmask" ] ; then match=$match/$netmask fi if [ -n "$nic" ] ; then # NIC supports more than two. set -- $(ip -o -f $family route list match $match $scope | grep "dev $nic " | awk 'BEGIN{best=0} { mask=$1; sub(".*/", "", mask); if( int(mask)>=best ) { best=int(mask); best_ln=$0; } } END{print best_ln}') else set -- $(ip -o -f $family route list match $match $scope | awk 'BEGIN{best=0} { mask=$1; sub(".*/", "", mask); if( int(mask)>=best ) { best=int(mask); best_ln=$0; } } END{print best_ln}') fi if [ $# = 0 ] ; then case $OCF_RESKEY_ip in 127.*) set -- `getnetworkinfo` shift;; esac fi if [ -z "$nic" -o -z "$netmask" ] ; then if [ $# = 0 ] ; then ocf_log err "Unable to find nic or netmask." return $OCF_ERR_GENERIC fi case $1 in */*) : OK ;; *) ocf_log err "Unable to find cidr_netmask." return $OCF_ERR_GENERIC ;; esac fi [ -z "$nic" ] && nic=$3 [ -z "$netmask" ] && netmask=${1#*/} if [ $family = "inet" ] ; then if [ -z "$brdcast" ] ; then if [ -n "$7" ] ; then set -- `ip -o -f $family addr show | grep $7` [ "$5" = brd ] && brdcast=$6 fi fi else if [ -z "$OCF_RESKEY_nic" -a "$netmask" != "${1#*/}" ] ; then ocf_log err "Unable to find nic, or netmask mismatch." return $OCF_ERR_GENERIC fi fi echo "$nic netmask $netmask broadcast $brdcast" return $OCF_SUCCESS } diff --git a/heartbeat/nginx b/heartbeat/nginx index a7fb5d034..532eb81b2 100755 --- a/heartbeat/nginx +++ b/heartbeat/nginx @@ -1,947 +1,947 @@ #!/bin/sh # # High-Availability nginx OCF resource agent # # nginx # # Description: starts/stops nginx servers. # # Author: Alan Robertson # Dejan Muhamedagic # This code is based significantly on the apache resource agent # # Support: linux-ha@lists.linux-ha.org # # License: GNU General Public License (GPL) # # Copyright: (C) 2002-2010 International Business Machines # # # Our parsing of the nginx config files is very rudimentary. # It'll work with lots of different configurations - but not every # possible configuration. # # Patches are being accepted ;-) # # OCF parameters: # OCF_RESKEY_configfile # OCF_RESKEY_nginx # OCF_RESKEY_port # OCF_RESKEY_options # OCF_RESKEY_status10regex # OCF_RESKEY_status10url # OCF_RESKEY_client # OCF_RESKEY_test20url # OCF_RESKEY_test20regex # OCF_RESKEY_test20conffile # OCF_RESKEY_test20name # OCF_RESKEY_external_monitor30_cmd # # # TO DO: # More extensive tests of extended monitor actions # Look at the --with-http_stub_status_module for validating # the configuration? (or is that automatically done?) # Checking could certainly result in better error # messages. # Allow for the fact that the config file and so on might all be # on shared disks - this affects the validate-all option. : ${OCF_FUNCTIONS_DIR=$OCF_ROOT/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs HA_VARRUNDIR=${HA_VARRUN} ####################################################################### # # Configuration options - usually you don't need to change these # ####################################################################### # NGINXDLIST="/usr/sbin/nginx /usr/local/sbin/nginx" # default options for http clients # NB: We _always_ test a local resource, so it should be # safe to connect from the local interface. WGETOPTS="-O- -q -L --no-proxy --bind-address=127.0.0.1" CURLOPTS="-o - -Ss -L --interface lo" LOCALHOST="http://localhost" NGINXDOPTS="" # # # End of Configuration options ####################################################################### CMD=`basename $0` # The config-file-pathname is the pathname to the configuration # file for this web server. Various appropriate defaults are # assumed if no config file is specified. usage() { - cat <<-! + cat <<-EOF usage: $0 action action: start start nginx stop stop nginx reload reload the nginx configuration status return the status of web server, running or stopped monitor return TRUE if the web server appears to be working. For this to be supported you must configure mod_status and give it a server-status URL - or configure what URL you wish to be monitored. You have to have installed either curl or wget for this to work. meta-data show meta data message validate-all validate the instance parameters - ! + EOF exit $1 } # # run the http client # curl_func() { cl_opts="$CURLOPTS $test_httpclient_opts" if [ x != "x$test_user" ] then echo "-u $test_user:$test_password" | curl -K - $cl_opts "$1" else curl $cl_opts "$1" fi } wget_func() { auth="" cl_opts="$WGETOPTS $test_httpclient_opts" [ x != "x$test_user" ] && auth="--http-user=$test_user --http-passwd=$test_password" wget $auth $cl_opts "$1" } # # rely on whatever the user provided userdefined() { $test_httpclient $test_httpclient_opts "$1" } # # find a good http client # findhttpclient() { # prefer curl if present... if [ "x$CLIENT" != x ] then echo "$CLIENT" elif which curl >/dev/null 2>&1 then echo "curl" elif which wget >/dev/null 2>&1 then echo "wget" else return 1 fi } gethttpclient() { [ -z "$test_httpclient" ] && test_httpclient=$ourhttpclient case "$test_httpclient" in curl|wget) echo ${test_httpclient}_func;; #these are supported *) echo userdefined;; esac } # test configuration good? 
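# --- Illustrative aside (not part of the patch): the usage() here-document
# terminator above was changed from "!" to "EOF". Both are legal, but a
# descriptive delimiter is far easier to spot when scanning the script and
# matches the convention used elsewhere in these agents. Minimal form of the
# pattern (the agents use <<-EOF so body and terminator may be tab-indented):
print_usage() {
    cat <<EOF
usage: $0 {start|stop|status|monitor|meta-data|validate-all}
EOF
}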
is_testconf_sane() { if [ "x$test_regex" = x -o "x$test_url" = x ] then ocf_log err "test regular expression or test url empty" return 1 fi if [ "x$test_user$test_password" != x -a \( "x$test_user" = x -o "x$test_password" = x \) ] then ocf_log err "bad user authentication for extended test" return 1 fi return 0 } # # read the test definition from the config # readtestconf() { test_name="$1" # we look for this one or the first one if empty lcnt=0 readdef="" test_url="" test_regex="" test_user="" test_password="" test_httpclient="" test_httpclient_opts="" while read key value do lcnt=$((lcnt+1)) if [ "$readdef" ] then case "$key" in "url") test_url="$value" ;; "user") test_user="$value" ;; "password") test_password="$value" ;; "client") test_httpclient="$value" ;; "client_opts") test_httpclient_opts="$value" ;; "match") test_regex="$value" ;; "end") break ;; "#"*|"") ;; *) ocf_log err "$lcnt: $key: unknown keyword"; return 1 ;; esac else [ "$key" = "test" ] && [ -z "$test_name" -o "$test_name" = "$value" ] && readdef=1 fi done } nginxcat() { awk ' function procline() { split($0,a); if( a[1]~/^[Ii]nclude$/ ) { procinclude(a[2]); } else { if( a[1]=="root" ) { rootdir=a[2]; gsub("\"","",rootdir); } print; } } function printfile(infile, a) { while( (getline 0 ) { procline(); } close(infile); } function allfiles(dir, cmd,f) { cmd="find -L "dir" -type f"; while( ( cmd | getline f ) > 0 ) { printfile(f); } close(cmd); } function listfiles(pattern, cmd,f) { cmd="ls "pattern" 2>/dev/null"; while( ( cmd | getline f ) > 0 ) { printfile(f); } close(cmd); } function procinclude(spec) { if( rootdir!="" && spec!~/^\// ) { spec=rootdir"/"spec; } if( isdir(spec) ) { allfiles(spec); # read all files in a directory (and subdirs) } else { listfiles(spec); # there could be jokers } } function isdir(s) { return !system("test -d \""s"\""); } { procline(); } ' $1 | sed 's/#.*//;s/[[:blank:]]*$//;s/^[[:blank:]]*//' | grep -v '^$' } # # set parameters (as shell vars) from our nginx config file # get_nginx_params() { configfile=$1 shift 1 - vars=`echo $@ | sed 's/ /,/g'` + vars=`echo "$@" | sed 's/ /,/g'` eval ` nginxcat $configfile | awk -v vars="$vars" ' BEGIN{ split(vars,v,","); for( i in v ) vl[i]=tolower(v[i]); } { for( i in v ) if( tolower($1)==vl[i] ) { print v[i]"="$2 delete vl[i] break } } '` } # # Return the location(s) that are handled by the given handler # FindLocationForHandler() { PerlScript='while (<>) { /^\s*location\s+([^ \s{]+)\s*{/i && ($loc=$1); /^\s*stub_status\s+on\s*;$2/i && print "$loc\n"; }' nginxcat $1 | perl -e "$PerlScript" } # # Check if the port is valid # CheckPort() { lclport="$1" case "$lclport" in *:[0-9]*) lclport=`echo "$lclport" | sed 's%^[^:][^:]*:%%'` esac ocf_is_decimal "$lclport" && [ $lclport -gt 0 -a $lclport -lt 65537 ] } buildlocalurl() { [ "x$listen" != "x" ] && echo "http://${listen}" || echo "${LOCALHOST}:${PORT}" } # # Get all the parameters we need from the Nginx config file # GetParams() { ConfigFile=$1 DEFAULT_PID=`echo "$NGINX_CONFIGURATION" | sed -e 's%.*--pid-path=%%' -e 's% *--.*%%'` if [ ! -f $ConfigFile ] then return 1 fi get_nginx_params $ConfigFile root pid listen PidFile="$pid" case $PidFile in "") PidFile=$DEFAULT_PID ;; *) ;; esac for p in "$PORT" "$listen" 80 do if CheckPort "$p" then PORT="$p" break fi done echo $listen | grep ':' >/dev/null || # Listen could be just port spec listen="localhost:$listen" # # It's difficult to figure out whether the server supports # the status operation. 
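# --- Illustrative aside (not part of the patch): what the added quotes in
# get_nginx_params() guard against. Unquoted, $@ is field-split and
# glob-expanded a second time before echo sees it; quoted, every argument
# reaches echo exactly as passed and is joined with single spaces, which is
# all the sed 's/ /,/g' needs. join_csv is a made-up name for the demo.
join_csv() { echo "$@" | sed 's/ /,/g'; }
join_csv root pid listen        # -> root,pid,listen
set -- 'a  b' '*'
echo $@  | sed 's/ /,/g'        # -> a,b plus every filename in $PWD
echo "$@" | sed 's/ /,/g'       # -> a,,b,*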
# (we start our server with -DSTATUS - just in case :-)) # # Typically (but not necessarily) the status URL is /nginx_status # # For us to think status will work, we have to have the following things: # # - The server-status handler has to be mapped to some URL somewhere # # We assume that: # # - the "main" web server at $PORT will also support it if we can find it # somewhere in the file # - it will be supported at the same URL as the one we find in the file # # If this doesn't work for you, then set the status10url attribute. # if [ "X$STATUSURL" = "X" ] then StatusURL=`FindLocationForHandler $1 nginx_status | tail -1` STATUSURL="`buildlocalurl`$StatusURL" fi test ! -z "$PidFile" } # # return TRUE if a process with given PID is running # ProcessRunning() { NginxPID=$1 # Use /proc if it looks like it's here... if [ -d /proc -a -d /proc/1 ] then [ -d /proc/$NginxPID ] else # This assumes we're running as root... kill -0 "$NginxPID" >/dev/null 2>&1 fi } silent_status() { if [ -f $PidFile -a -s $PidFile ] && ocf_is_decimal "`cat $PidFile`" then ProcessRunning `cat $PidFile` else : No pid file false fi } start_nginx() { if silent_status then ocf_log info "$CMD already running (pid $NginxPID)" return $OCF_SUCCESS fi if ocf_run $NGINXD $OPTIONS -t -c $CONFIGFILE then : Configuration file $CONFIGFILE looks OK else return $OCF_ERR_CONFIGURED fi NGINX_VERSION=`$NGINXD -v 2>&1` ocf_log info "Starting $NGINXD - $NGINX_VERSION" ocf_log info "$NGINXD build configuration: $NGINX_CONFIGURATION" if ocf_run $NGINXD $NGINXDOPTS $OPTIONS -c $CONFIGFILE then : $NGINXD started without errors! else return $OCF_ERR_GENERIC fi tries=0 # This looks like a potential infinite loop - but it's not in practice # The LRM will time us out and kill us if nginx never starts working. while monitor_nginx ec=$? if [ $ec -eq $OCF_NOT_RUNNING ] then tries=`expr $tries + 1` ocf_log info "Waiting for $NGINXD $OPTIONS -c $CONFIGFILE to come up (try $tries)" true else false fi do sleep 1 done return $ec } stop_nginx() { if silent_status then if kill $NginxPID then tries=0 while ProcessRunning $NginxPID && [ $tries -lt 10 ] do sleep 1 kill $NginxPID >/dev/null ocf_log info "Killing nginx PID $NginxPID" tries=`expr $tries + 1` done else ocf_log warn "Killing nginx PID $NginxPID FAILED." fi if ProcessRunning $NginxPID then ocf_log info "$CMD still running ($NginxPID)." false else ocf_log info "$CMD stopped." fi else ocf_log info "$CMD is not running." fi # # I'm not convinced this is a wonderful idea (AlanR) # for sig in SIGTERM SIGHUP SIGKILL do if pgrep -f "$NGINXD.*$CONFIGFILE" >/dev/null then pkill -$sig -f $NGINXD.*$CONFIGFILE >/dev/null ocf_log info "nginxd children were signalled ($sig)" sleep 1 else break fi done } reload_nginx() { if silent_status then if kill -1 $NginxPID then : $NGINX reload signal to $NginxPID succeeded return $OCF_SUCCESS fi return $OCF_ERR_GENERIC fi start_nginx } status_nginx() { silent_status rc=$? if [ $rc -eq 0 ] then ocf_log info "$CMD is running (pid $NginxPID)." return $OCF_SUCCESS else ocf_log info "$CMD is stopped." return $OCF_NOT_RUNNING fi } fixtesturl() { echo $test_url | grep -qs "^http" && return test_url="`buildlocalurl`$test_url" } monitor_nginx_external() { if [ -z "$EXTMONITOR" ] then ocf_log err "$External level 30 Monitor Command not configured." return $OCF_ERR_CONFIGURED fi extbase=`echo $EXTMONITOR | sed 's% .*%%'` if case "$extbase" in /*) test -f "$extbase" -a -x "$extbase";; *) which "$extbase" >/dev/null 2>&1 esac then : OK - $extbase seems to be there... 
else ocf_log err "$External monitor command [$extbase] is not installed." return $OCF_ERR_CONFIGURED fi if $extbase then : OK - $extbase succeeded else ocf_log err "$extbase reported failure [rc=$?]" return $OCF_NOT_RUNNING fi return $OCF_SUCCESS } monitor_nginx_extended() { if [ -f "$TESTCONFFILE" -a -r "$TESTCONFFILE" ] then readtestconf < $TESTCONFFILE else test_url="$TESTURL" test_regex="$TESTREGEX20" fi whattorun=`gethttpclient` fixtesturl is_testconf_sane || return $OCF_ERR_CONFIGURED $whattorun "$test_url" | grep -Ei "$test_regex" > /dev/null } monitor_nginx_basic() { if [ -z "$STATUSURL" ] then ocf_log err "status10url parameter empty" return $OCF_ERR_CONFIGURED elif [ -z "$ourhttpclient" ] then ocf_log err "could not find a http client; make sure that either wget or curl is available" return $OCF_ERR_CONFIGURED fi ${ourhttpclient}_func "$STATUSURL" | grep -Ei "$TESTREGEX" > /dev/null } monitor_nginx() { silent_status if [ $? -ne 0 ] then ocf_log info "$CMD not running" return $OCF_NOT_RUNNING fi if [ -z "$OCF_CHECK_LEVEL" ] || [ "$OCF_CHECK_LEVEL" -lt 10 ] then return 0 fi ourhttpclient=`findhttpclient` # we'll need one if [ "$OCF_CHECK_LEVEL" -lt 20 ] then monitor_nginx_basic elif [ "$OCF_CHECK_LEVEL" -lt 30 ] then monitor_nginx_extended else monitor_nginx_external fi } metadata_nginx(){ cat < 1.0 This is the resource agent for the Nginx web/proxy server. This resource agent does not monitor POP or IMAP servers, as we don't know how to determine meaningful status for them. The start operation ends with a loop in which monitor is repeatedly called to make sure that the server started and that it is operational. Hence, if the monitor operation does not succeed within the start operation timeout, the nginx resource will end with an error status. The default monitor operation will verify that nginx is running. The level 10 monitor operation by default will try and fetch the /nginx_status page - which is commented out in sample nginx configurations. Make sure that the /nginx_status page works and that the access is restricted to localhost (address 127.0.0.1) plus whatever places _outside the cluster_ you want to monitor the server from. See the status10url and status10regex attributes for more details. The level 20 monitor operation will perform a more complex set of tests from a configuration file. The level 30 monitor operation will run an external command to perform an arbitrary monitoring operation. Manages an Nginx web/proxy server instance The full pathname of the Nginx configuration file. This file is parsed to provide defaults for various other resource agent parameters. configuration file path The full pathname of the httpd binary (optional). httpd binary path A port number that we can probe for status information using the statusurl. This will default to the port number found in the configuration file, or 80, if none can be found in the configuration file. httpd port The URL to monitor (the nginx server status page by default) when given a level 10 monitor operation. If left unspecified, it will be inferred from the nginx configuration file, or defaulted to /nginx_status. If you set this, make sure that it succeeds *only* from the localhost (127.0.0.1) and no other cluster nodes. Otherwise, the cluster software may complain about it being active on multiple nodes. url name Regular expression to match in the output of status10url. Case insensitive. monitor regular expression Client to use to query to Nginx for level 10 and level 20 tests. 
If not specified, the RA will try to find one on the system. Currently, wget and curl are supported, with curl being preferred. For example, you can set this paramter to "wget" if you prefer that to curl. http client URL to test. If it does not start with "http", then it's considered to be relative to the document root address. Level 20 monitor url Regular expression to match in the output of test20url. Case insensitive. Level 20 monitor regular expression A file which contains a more complex test configuration. Could be useful if you have to check more than one web application or in case sensitive info should be passed as arguments (passwords). Furthermore, using a config file is the only way to specify certain parameters. Please see README.webapps for examples and file description. Level 20 test configuration file Name of the test within the test configuration file. Level 20 test name Command string to run which implements level 30 monitoring. Level 30 test string Extra options to apply when starting nginx. nginx start options END exit $OCF_SUCCESS } validate_all_nginx() { if CheckPort $PORT # We are sure to succeed here, since we forced $PORT to be valid in GetParams() then : OK else ocf_log err "Port number $PORT is invalid!" exit $OCF_ERR_ARGS fi if [ -z $STATUSURL ] then : OK to be empty else case $STATUSURL in http://*/*) ;; *) ocf_log err "Invalid STATUSURL $STATUSURL" exit $OCF_ERR_ARGS ;; esac fi if [ ! -x $NGINXD ] then ocf_log err "NGINXD $NGINXD not found or is not an executable!" exit $OCF_ERR_ARGS fi if [ ! -f $CONFIGFILE ] then # We are sure to succeed here, since we have parsed $CONFIGFILE before getting here ocf_log err "Configuration file $CONFIGFILE not found!" exit $OCF_ERR_CONFIGURED fi if ocf_run $NGINXD $OPTIONS -t -c $CONFIGFILE then : Cool $NGINXD likes $CONFIGFILE else ocf_log err "$NGINXD $OPTIONS -t -c $CONFIGFILE reported a configuration error." return $OCF_ERR_CONFIGURED fi return $OCF_SUCCESS } if [ $# -eq 1 ] then COMMAND=$1 NGINXD="$OCF_RESKEY_httpd" PORT="$OCF_RESKEY_port" STATUSURL="$OCF_RESKEY_status10url" CONFIGFILE="$OCF_RESKEY_configfile" OPTIONS="$OCF_RESKEY_options" CLIENT=${OCF_RESKEY_client} TESTREGEX=${OCF_RESKEY_status10regex:-'Reading: [0-9]+ Writing: [0-9]+ Waiting: [0-9]+'} TESTURL="$OCF_RESKEY_test20url" TESTREGEX20=${OCF_RESKEY_test20regex} TESTCONFFILE="$OCF_RESKEY_test20conffile" TESTNAME="$OCF_RESKEY_test20name" EXTMONITOR="$OCF_RESKEY_external_monitor30_cmd" else usage $OCF_ERR_ARGS fi LSB_STATUS_STOPPED=3 if [ "X$NGINXD" = X -o ! -f "$NGINXD" -o ! -x "$NGINXD" ] then NGINXD= for h in $NGINXDLIST do if [ -f "$h" -a -x "$h" ] then NGINXD="$h" break fi done # It is possible that we still do not have a valid httpd at this stage if [ -z "$NGINXD" ] then case $COMMAND in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; meta-data) metadata_nginx;; esac ocf_log err "nginx binary not found! Please verify you've installed it" exit $OCF_ERR_INSTALLED fi # Let the user know that the $NGINXD used is the one (s)he specified via $OCF_RESKEY_httpd if [ ! -z "$OCF_RESKEY_httpd" ] then ocf_log info "Using $NGINXD as nginx" fi fi httpd_basename=`basename $NGINXD` case $httpd_basename in *-*) httpd_basename=`echo "$httpd_basename" | sed -e 's%\-.*%%'`;; esac NGINX_CONFIGURATION=`$NGINXD -V 2>&1 |grep 'configure arguments:'` DEFAULT_CONFIG=`echo "$NGINX_CONFIGURATION" | sed -e 's%.*--conf-path=%%' -e 's% *--.*%%'` case "$CONFIGFILE" in "") CONFIGFILE=$DEFAULT_CONFIG;; *) ;; esac if [ ! 
-f "$CONFIGFILE" ] then case $COMMAND in stop) ocf_log warn "$CONFIGFILE not found - nginx considered stopped" exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; esac fi if [ "X$COMMAND" = Xmeta-data ] || GetParams $CONFIGFILE then : OK else ocf_log err "Cannot parse config file [$CONFIGFILE]" exit $OCF_ERR_CONFIGURED fi case $COMMAND in start) start_nginx;; stop) stop_nginx;; reload) reload_nginx;; status) status_nginx;; monitor) monitor_nginx;; meta-data) metadata_nginx;; validate-all) validate_all_nginx;; *) usage $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/postfix b/heartbeat/postfix index 72fc3710d..703aa3246 100755 --- a/heartbeat/postfix +++ b/heartbeat/postfix @@ -1,415 +1,415 @@ #!/bin/sh # # Resource script for Postfix # # Description: Manages Postfix as an OCF resource in # an high-availability setup. # # Author: Raoul Bhatia : Original Author # License: GNU General Public License (GPL) # Note: If you want to run multiple Postfix instances, please see # http://amd.co.at/adminwiki/Postfix#Adding_a_Second_Postfix_Instance_on_one_Server # http://www.postfix.org/postconf.5.html # # # usage: $0 {start|stop|reload|monitor|validate-all|meta-data} # # The "start" arg starts a Postfix instance # # The "stop" arg stops it. # # OCF parameters: # OCF_RESKEY_binary # OCF_RESKEY_config_dir # OCF_RESKEY_parameters # ########################################################################## # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs : ${OCF_RESKEY_binary="/usr/sbin/postfix"} : ${OCF_RESKEY_config_dir=""} : ${OCF_RESKEY_parameters=""} USAGE="Usage: $0 {start|stop|reload|monitor|validate-all|meta-data}"; ########################################################################## usage() { echo $USAGE >&2 } meta_data() { cat < 0.1 This script manages Postfix as an OCF resource in a high-availability setup. Manages a highly available Postfix mail server instance Full path to the Postfix binary. For example, "/usr/sbin/postfix". Full path to Postfix binary Full path to a Postfix configuration directory. For example, "/etc/postfix". Full path to configuration directory The Postfix daemon may be called with additional parameters. Specify any of them here. END } postfix_running() { local loglevel loglevel=${1:-err} # run Postfix status if available if ocf_is_true $status_support; then $binary $OPTION_CONFIG_DIR status 2>&1 ret=$? if [ $ret -ne 0 ]; then ocf_log $loglevel "Postfix status: " $ret fi return $ret fi # manually check Postfix's pid PIDFILE=${queue_dir}/pid/master.pid if [ -f $PIDFILE ]; then PID=`head -n 1 $PIDFILE` kill -s 0 $PID >/dev/null 2>&1 && [ `ps -p $PID | grep master | wc -l` -eq 1 ] return $? fi # Postfix is not running false } postfix_start() { # if Postfix is running return success if postfix_running info; then ocf_log info "Postfix already running." return $OCF_SUCCESS fi # start Postfix $binary $OPTIONS start >/dev/null 2>&1 ret=$? if [ $ret -ne 0 ]; then ocf_exit_reason "Postfix returned error: " $ret return $OCF_ERR_GENERIC fi # grant some time for startup/forking the sub processes # and loop initial monitoring until success or timeout while true; do sleep 1 # break if postfix is up and running; log failure otherwise postfix_running info && break ocf_log info "Postfix failed initial monitor action: " $ret done ocf_log info "Postfix started." return $OCF_SUCCESS } postfix_stop() { # if Postfix is not running return success if ! 
postfix_running info; then ocf_log info "Postfix already stopped." return $OCF_SUCCESS fi # stop Postfix $binary $OPTIONS stop >/dev/null 2>&1 ret=$? if [ $ret -ne 0 ]; then ocf_exit_reason "Postfix returned an error while stopping: " $ret return $OCF_ERR_GENERIC fi # grant some time for shutdown and recheck 5 times for i in 1 2 3 4 5; do if postfix_running info; then sleep 1 else break fi done # escalate to abort if we did not stop by now # @TODO shall we loop here too? if postfix_running info; then ocf_exit_reason "Postfix failed to stop. Escalating to 'abort'." $binary $OPTIONS abort >/dev/null 2>&1; ret=$? sleep 5 # postfix abort did not succeed if postfix_running; then ocf_exit_reason "Postfix failed to abort." return $OCF_ERR_GENERIC fi fi ocf_log info "Postfix stopped." return $OCF_SUCCESS } postfix_reload() { if postfix_running; then ocf_log info "Reloading Postfix." $binary $OPTIONS reload fi } postfix_monitor() { local status_loglevel="err" # Set loglevel to info during probe if ocf_is_probe; then status_loglevel="info" fi if postfix_running $status_loglevel; then return $OCF_SUCCESS fi return $OCF_NOT_RUNNING } postfix_validate_all() { # check that the Postfix binaries exist and can be executed check_binary "$binary" check_binary "postconf" # if true, run in-depth directory checks dir_check=true # check config_dir and alternate_config_directories parameter if [ "x$config_dir" != "x" ]; then if [ ! -d "$config_dir" ]; then if ocf_is_probe; then ocf_log info "Postfix configuration directory '$config_dir' not readable during probe." # skip in-depth directory checks if config file isn't readable during probe dir_check=false else ocf_exit_reason "Postfix configuration directory '$config_dir' does not exist or is not readable." return $OCF_ERR_INSTALLED fi fi alternate_config_directories=`postconf -h alternate_config_directories 2>/dev/null | grep "$config_dir/\?"` if [ "x$alternate_config_directories" = "x" ]; then ocf_exit_reason "Postfix main configuration must contain correct 'alternate_config_directories' parameter." return $OCF_ERR_INSTALLED fi fi # check spool/queue and data directories (if applicable) # this is required because "postfix check" does not catch all errors if ocf_is_true $dir_check; then if [ ! -d "$queue_dir" ]; then if ocf_is_probe; then ocf_log info "Postfix queue directory '$queue_dir' not readable during probe." else ocf_exit_reason "Postfix queue directory '$queue_dir' does not exist or is not readable." return $OCF_ERR_INSTALLED fi fi if ocf_is_true $status_support; then data_dir=`postconf $OPTION_CONFIG_DIR -h data_directory 2>/dev/null` data_dir_count=`echo "$data_dir" | tr ',' ' ' | wc -w` if [ $data_dir_count -gt 1 ]; then ocf_exit_reason "Postfix data directory '$orig_data_dir' cannot be set to multiple directories." return $OCF_ERR_INSTALLED fi if [ ! -d "$data_dir" ]; then if ocf_is_probe; then ocf_log info "Postfix data directory '$data_dir' not readable during probe." else ocf_exit_reason "Postfix data directory '$data_dir' does not exist or is not readable." return $OCF_ERR_INSTALLED fi fi fi # check directory permissions if ocf_is_true $status_support; then user=`postconf $OPTION_CONFIG_DIR -h mail_owner 2>/dev/null` - for dir in "$data_dir"; do + for dir in $data_dir; do if ! su -s /bin/sh - $user -c "test -w $dir"; then if ocf_is_probe; then ocf_log info "Directory '$dir' is not writable by user '$user' during probe." else ocf_exit_reason "Directory '$dir' is not writable by user '$user'." 
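# --- Illustrative aside (not part of the patch): the loop above intentionally
# leaves $data_dir unquoted. Quoted, "$data_dir" is always one word and the
# loop body runs once over the whole string; unquoted, the shell word-splits
# it so each space-separated directory is checked on its own. (A value with
# multiple directories is rejected just above, so in practice this iterates
# over a single path.) Example values below are made up:
data_dir="/var/lib/postfix /srv/postfix-data"
for dir in "$data_dir"; do echo "quoted:   [$dir]"; done
for dir in $data_dir;   do echo "unquoted: [$dir]"; done
# quoted:   [/var/lib/postfix /srv/postfix-data]
# unquoted: [/var/lib/postfix]
# unquoted: [/srv/postfix-data]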
return $OCF_ERR_PERM; fi fi done fi fi # run Postfix internal check, if not probing if ! ocf_is_probe; then $binary $OPTIONS check >/dev/null 2>&1 ret=$? if [ $ret -ne 0 ]; then ocf_exit_reason "Postfix 'check' failed: " $ret return $OCF_ERR_GENERIC fi fi return $OCF_SUCCESS } # # Main # if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi binary=$OCF_RESKEY_binary config_dir=$OCF_RESKEY_config_dir parameters=$OCF_RESKEY_parameters # handle parameters case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage|help) usage exit $OCF_SUCCESS ;; esac # build Postfix options string *outside* to access from each method OPTIONS='' OPTION_CONFIG_DIR='' # check for Postfix's postconf binary check_binary "postconf" # check if the Postfix config_dir exist if [ "x$config_dir" != "x" ]; then # remove all trailing slashes to ease "postconf alternate_config_directories" match config_dir=`echo $config_dir | sed 's/\/*$//'` # reset config_dir if it equals Postfix's default config_directory postconf -h config_directory 2>/dev/null | grep -q "^$config_dir/\?$" if [ $? -eq 0 ]; then config_dir="" fi # set OPTIONS if config_dir is still set # save OPTION_CONFIG_DIR seperatly if [ "x$config_dir" != "x" ]; then OPTION_CONFIG_DIR="-c $config_dir" OPTIONS=$OPTION_CONFIG_DIR fi fi # add all additional parameters to options string if [ "x$parameters" != "x" ]; then OPTIONS="$OPTIONS $parameters" fi # important directories, used in different methods queue_dir=`postconf $OPTION_CONFIG_DIR -h queue_directory 2>/dev/null` # check Postfix version and status support status_support=false postfix_version=`postconf -h mail_version 2>/dev/null` ocf_version_cmp "$postfix_version" "2.5.0" ret=$? # we need Postfix 2.5.0 or greater for status/data_directory support if [ $ret -eq 1 -o $ret -eq 2 ]; then status_support=true fi postfix_validate_all ret=$? LSB_STATUS_STOPPED=3 if [ $ret -ne $OCF_SUCCESS ]; then case $1 in stop) exit $OCF_SUCCESS ;; *) exit $ret;; esac fi case $1 in monitor) postfix_monitor exit $? ;; start) postfix_start exit $? ;; stop) postfix_stop exit $? ;; reload) postfix_reload exit $? ;; validate-all) exit $OCF_SUCCESS ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster index 966dd64d1..0751e246a 100755 --- a/heartbeat/rabbitmq-cluster +++ b/heartbeat/rabbitmq-cluster @@ -1,465 +1,465 @@ #!/bin/sh # # Copyright (c) 2014 David Vossel # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
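# --- Illustrative aside (not part of the patch): how the status_support
# toggle in the Postfix agent above is derived. ocf_version_cmp comes from
# ocf-shellfuncs; the agent relies on return codes 1 and 2 meaning "equal to"
# and "newer than" the reference version, i.e. Postfix >= 2.5.0, the point
# from which "postfix status" and data_directory are usable.
postfix_version=$(postconf -h mail_version 2>/dev/null)
status_support=false
ocf_version_cmp "$postfix_version" "2.5.0"
case $? in
    1|2) status_support=true ;;    # equal or newer
esac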
# ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### RMQ_SERVER=/usr/sbin/rabbitmq-server RMQ_CTL=/usr/sbin/rabbitmqctl RMQ_DATA_DIR="/var/lib/rabbitmq/mnesia" RMQ_PID_DIR="/var/run/rabbitmq" RMQ_PID_FILE="/var/run/rabbitmq/rmq.pid" RMQ_LOG_DIR="/var/log/rabbitmq" NODENAME=$(ocf_local_nodename) # this attr represents the current active local rmq node name. # when rmq stops or the node is fenced, this attr disappears RMQ_CRM_ATTR_COOKIE="rmq-node-attr-${OCF_RESOURCE_INSTANCE}" # this attr represents the last known active local rmq node name # when rmp stops or the node is fenced, the attr stays forever so # we can continue to map an offline pcmk node to it's rmq node name # equivalent. RMQ_CRM_ATTR_COOKIE_LAST_KNOWN="rmq-node-attr-last-known-${OCF_RESOURCE_INSTANCE}" meta_data() { cat < 1.0 Starts cloned rabbitmq cluster instance rabbitmq clustered Policy string to pass to 'rabbitmqctl set_policy' right after bootstrapping the first rabbitmq instance. rabbitmqctl set_policy args END } ####################################################################### rmq_usage() { cat < /dev/null 2>&1 } rmq_local_node() { local node_name=$(rabbitmqctl status 2>&1 | sed -n -e "s/^.*[S|s]tatus of node \(.*\)\s.*$/\1/p" | tr -d "'") if [ -z "$node_name" ]; then node_name=$(cat /etc/rabbitmq/rabbitmq-env.conf 2>/dev/null | grep "\s*RABBITMQ_NODENAME=" | awk -F= '{print $2}') fi echo "$node_name" } rmq_join_list() { cibadmin -Q --xpath "//node_state[@crmd='online']//nvpair[@name='$RMQ_CRM_ATTR_COOKIE']" | grep "$RMQ_CRM_ATTR_COOKIE" | sed -n -e "s/^.*value=.\(.*\)\".*$/\1/p" } rmq_write_nodename() { local node_name=$(rmq_local_node) if [ -z "$node_name" ]; then ocf_log err "Failed to determine rabbitmq node name, exiting" exit $OCF_ERR_GENERIC fi # store the pcmknode to rmq node mapping as a transient attribute. This allows # us to retrieve the join list with a simple xpath. ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "$RMQ_CRM_ATTR_COOKIE" -v "$node_name" # the pcmknode to rmq node mapping as a permanent attribute as well. this lets # us continue to map offline nodes to their equivalent rmq node name ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l forever --name "$RMQ_CRM_ATTR_COOKIE_LAST_KNOWN" -v "$node_name" } rmq_delete_nodename() { # remove node-name ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "$RMQ_CRM_ATTR_COOKIE" -D } prepare_dir () { if [ ! -d ${1} ] ; then mkdir -p ${1} chown -R rabbitmq:rabbitmq ${1} chmod 755 ${1} fi } remove_pid () { rm -f ${RMQ_PID_FILE} > /dev/null 2>&1 } rmq_monitor() { local rc $RMQ_CTL cluster_status > /dev/null 2>&1 rc=$? case "$rc" in 0) ocf_log debug "RabbitMQ server is running normally" rmq_write_nodename return $OCF_SUCCESS ;; 2|68|69|70|75|78) ocf_log info "RabbitMQ server is not running" rmq_delete_nodename return $OCF_NOT_RUNNING ;; *) ocf_log err "Unexpected return code from '$RMQ_CTL cluster_status' exit code: $rc" rmq_delete_nodename return $OCF_ERR_GENERIC ;; esac } rmq_init_and_wait() { local rc prepare_dir $RMQ_PID_DIR prepare_dir $RMQ_LOG_DIR remove_pid # the server startup script uses this environment variable export RABBITMQ_PID_FILE="$RMQ_PID_FILE" setsid sh -c "$RMQ_SERVER > ${RMQ_LOG_DIR}/startup_log 2> ${RMQ_LOG_DIR}/startup_err" & ocf_log info "Waiting for server to start" $RMQ_CTL wait $RMQ_PID_FILE rc=$? 
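# --- Illustrative aside (not part of the patch): the two crm_attribute calls
# in rmq_write_nodename() differ only in lifetime. "-l reboot" is transient:
# it disappears when this rmq instance stops or the node is fenced, which is
# exactly what rmq_join_list() uses to see the *currently* active peers.
# "-l forever" persists, so an offline node can still be mapped back to its
# rabbitmq node name when it has to be forgotten from the cluster.
# Attribute names and values below are examples only:
crm_attribute -N "$(ocf_local_nodename)" -l reboot  --name rmq-node-attr-demo -v rabbit@node1
crm_attribute -N "$(ocf_local_nodename)" -l forever --name rmq-node-attr-last-known-demo -v rabbit@node1
# later, even while that node is down:
crm_attribute -N node1 -l forever --query --name rmq-node-attr-last-known-demo -q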
if [ $rc -ne $OCF_SUCCESS ]; then remove_pid ocf_log info "rabbitmq-server start failed: $rc" return $OCF_ERR_GENERIC fi rmq_monitor return $? } rmq_set_policy() { - $RMQ_CTL set_policy $@ > /dev/null 2>&1 + $RMQ_CTL set_policy "$@" > /dev/null 2>&1 } rmq_start_first() { local rc ocf_log info "Bootstrapping rabbitmq cluster" rmq_wipe_data rmq_init_and_wait rc=$? if [ $rc -eq 0 ]; then rc=$OCF_SUCCESS ocf_log info "cluster bootstrapped" if [ -n "$OCF_RESKEY_set_policy" ]; then # do not quote set_policy, we are passing in arguments rmq_set_policy $OCF_RESKEY_set_policy > /dev/null 2>&1 if [ $? -ne 0 ]; then ocf_log err "Failed to set policy: $OCF_RESKEY_set_policy" rc=$OCF_ERR_GENERIC else ocf_log info "Policy set: $OCF_RESKEY_set_policy" fi fi else ocf_log info "failed to bootstrap cluster. Check SELINUX policy" rc=$OCF_ERR_GENERIC fi return $rc } rmq_is_clustered() { $RMQ_CTL eval 'rabbit_mnesia:is_clustered().' | grep -q true } rmq_join_existing() { local join_list="$1" local rc=$OCF_ERR_GENERIC ocf_log info "Joining existing cluster with [ $(echo $join_list | tr '\n' ' ') ] nodes." rmq_init_and_wait if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi if rmq_is_clustered; then ocf_log info "Successfully re-joined existing rabbitmq cluster automatically" return $OCF_SUCCESS fi # unconditionally join the cluster $RMQ_CTL stop_app > /dev/null 2>&1 for node in $(echo "$join_list"); do ocf_log info "Attempting to join cluster with target node $node" $RMQ_CTL join_cluster $node if [ $? -eq 0 ]; then ocf_log info "Joined cluster by connecting to node $node, starting app" $RMQ_CTL start_app rc=$? if [ $rc -ne 0 ]; then ocf_log err "'$RMQ_CTL start_app' failed" fi break; fi done if [ "$rc" -ne 0 ]; then ocf_log info "Join process incomplete, shutting down." return $OCF_ERR_GENERIC fi ocf_log info "Successfully joined existing rabbitmq cluster" return $OCF_SUCCESS } rmq_forget_cluster_node_remotely() { local running_cluster_nodes="$1" local node_to_forget="$2" ocf_log info "Forgetting $node_to_forget via nodes [ $(echo $running_cluster_nodes | tr '\n' ' ') ]." for running_cluster_node in $running_cluster_nodes; do rabbitmqctl -n $running_cluster_node forget_cluster_node $node_to_forget if [ $? = 0 ]; then ocf_log info "Succeeded forgetting $node_to_forget via $running_cluster_node." return else ocf_log err "Failed to forget node $node_to_forget via $running_cluster_node." fi done } rmq_notify() { node_list="${OCF_RESKEY_CRM_meta_notify_stop_uname}" mode="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" # When notifications are on, this agent is going to "forget" nodes once they # leave the cluster. This is thought to resolve some issues where rabbitmq # blocks trying to sync with an offline node after a fencing action occurs. if ! [ "${mode}" = "post-stop" ]; then return $OCF_SUCCESS fi rmq_monitor if [ $? -ne $OCF_SUCCESS ]; then # only run forget when we are for sure active return $OCF_SUCCESS fi # forget each stopped rmq instance in the provided pcmk node in the list. for node in $(echo "$node_list"); do local rmq_node="$(${HA_SBIN_DIR}/crm_attribute -N $node -l forever --query --name $RMQ_CRM_ATTR_COOKIE_LAST_KNOWN -q)" if [ -z "$rmq_node" ]; then ocf_log warn "Unable to map pcmk node $node to a known rmq node." continue fi ocf_log notice "Forgetting stopped node $rmq_node" $RMQ_CTL forget_cluster_node $rmq_node if [ $? -ne 0 ]; then ocf_log warn "Unable to forget offline node $rmq_node." 
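# --- Illustrative aside (not part of the patch): why set_policy is quoted
# differently at its two call sites above. The caller expands
# $OCF_RESKEY_set_policy unquoted on purpose so that one resource parameter
# becomes several rabbitmqctl arguments; the wrapper then forwards those
# already-split words with "$@" so they are neither re-joined nor re-split
# on the way to rabbitmqctl. The policy string below is a made-up example:
rmq_set_policy_demo() { echo rabbitmqctl set_policy "$@"; }
OCF_RESKEY_set_policy='ha-all "" {"ha-mode":"all"}'
rmq_set_policy_demo $OCF_RESKEY_set_policy
# -> rabbitmqctl set_policy ha-all "" {"ha-mode":"all"}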
fi done return $OCF_SUCCESS } rmq_start() { local join_list="" local rc rmq_monitor if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS fi join_list=$(rmq_join_list) # No join list means no active instances are up. This instance # is the first, so it needs to bootstrap the rest if [ -z "$join_list" ]; then rmq_start_first rc=$? return $rc fi # first try to join without wiping mnesia data rmq_join_existing "$join_list" if [ $? -ne 0 ]; then ocf_log info "node failed to join, wiping data directory and trying again" local local_rmq_node="$(${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l forever --query --name $RMQ_CRM_ATTR_COOKIE_LAST_KNOWN -q)" # if the graceful join fails, use the hammer and reset all the data. rmq_stop rmq_wipe_data rmq_forget_cluster_node_remotely "$join_list" "$local_rmq_node" rmq_join_existing "$join_list" rc=$? # Restore users (if any) BaseDataDir=`dirname $RMQ_DATA_DIR` if [ -f $BaseDataDir/users.erl ] ; then rabbitmqctl eval " {ok, [Users]} = file:consult(\"$BaseDataDir/users.erl\"), lists:foreach(fun(X) -> mnesia:dirty_write(rabbit_user, X) end, Users). " rm -f $BaseDataDir/users.erl fi if [ $rc -ne 0 ]; then ocf_log info "node failed to join even after reseting local data. Check SELINUX policy" return $OCF_ERR_GENERIC fi fi return $OCF_SUCCESS } rmq_stop() { # Backup users BaseDataDir=`dirname $RMQ_DATA_DIR` rabbitmqctl eval " Users = mnesia:dirty_select(rabbit_user, [{ {internal_user, '\\\$1', '_', '_'}, [{'/=', '\\\$1', <<\"guest\">>}], ['\\\$_'] } ]), file:write_file(\"$BaseDataDir/users.erl\", io_lib:fwrite(\"~p.~n\", [Users])). " rmq_monitor if [ $? -eq $OCF_NOT_RUNNING ]; then return $OCF_SUCCESS fi $RMQ_CTL stop rc=$? if [ $rc -ne 0 ]; then ocf_log err "rabbitmq-server stop command failed: $RMQ_CTL stop, $rc" return $rc fi #TODO add kill logic stop_wait=1 while [ $stop_wait = 1 ]; do rmq_monitor rc=$? if [ "$rc" -eq $OCF_NOT_RUNNING ]; then stop_wait=0 break elif [ "$rc" -ne $OCF_SUCCESS ]; then ocf_log info "rabbitmq-server stop failed: $rc" exit $OCF_ERR_GENERIC fi sleep 1 done remove_pid return $OCF_SUCCESS } rmq_validate() { check_binary $RMQ_SERVER check_binary $RMQ_CTL # This resource only makes sense as a clone right now. at some point # we may want to verify the following. #TODO verify cloned #TODO verify ordered=true # Given that this resource does the cluster join explicitly, # having a cluster_nodes list in the static config file will # likely conflict with this agent. #TODO verify no cluster list in rabbitmq conf #cat /etc/rabbitmq/rabbitmq.config | grep "cluster_nodes" return $OCF_SUCCESS } case $__OCF_ACTION in meta-data) meta_data exit $OCF_SUCCESS ;; start) rmq_start;; stop) rmq_stop;; monitor) rmq_monitor;; validate-all) rmq_validate;; notify) rmq_notify;; usage|help) rmq_usage exit $OCF_SUCCESS ;; *) rmq_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" exit $rc diff --git a/heartbeat/redis b/heartbeat/redis index 1fe8906a3..1ea002534 100755 --- a/heartbeat/redis +++ b/heartbeat/redis @@ -1,571 +1,571 @@ #!/bin/bash . 
${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs : ${OCF_RESKEY_bin:=/usr/bin/redis-server} : ${OCF_RESKEY_client_bin:=/usr/bin/redis-cli} : ${OCF_RESKEY_user:=redis} : ${OCF_RESKEY_rundir:=/var/run/redis} : ${OCF_RESKEY_pidfile_name:=redis-server.pid} : ${OCF_RESKEY_socket_name:=redis.sock} : ${OCF_RESKEY_port:=6379} if [ -z "$OCF_RESKEY_config" ]; then if [ -f "/etc/redis.conf" ]; then OCF_RESKEY_config="/etc/redis.conf" else OCF_RESKEY_config="/etc/redis/redis.conf" fi fi CHECK_SLAVE_STATE=0 REDIS_CHECK_DUMP="/usr/bin/redis-check-dump" REDIS_SERVER="$OCF_RESKEY_bin" REDIS_CLIENT="$OCF_RESKEY_client_bin" REDIS_CONFIG="$OCF_RESKEY_config" REDIS_USER="$OCF_RESKEY_user" REDIS_RUNDIR="$OCF_RESKEY_rundir" REDIS_PIDFILE="$OCF_RESKEY_rundir/$OCF_RESKEY_pidfile_name" REDIS_SOCKET="$OCF_RESKEY_rundir/$OCF_RESKEY_socket_name" REDIS_REPLICATION_PORT="$OCF_RESKEY_port" if ! [ -f $REDIS_CHECK_DUMP ]; then REDIS_CHECK_DUMP="$(which redis-check-dump 2>/dev/null)" fi if [ -f "$REDIS_CONFIG" ]; then REDIS_DUMP_DIR="$(cat $REDIS_CONFIG | grep "^\s*dir\s" | awk '{ print $2 }' 2>/dev/null)" REDIS_DUMP_FILE="$(cat $REDIS_CONFIG | grep "^\s*dbfilename\s" | awk '{ print $2 }' 2>/dev/null)" fi : ${REDIS_DUMP_DIR:=/var/lib/redis/} : ${REDIS_DUMP_FILE:=dump.rdb} function meta_data() { cat < 1.0 Resource agent script for redis server. This resource fully supports master/slave replication. The master preference of a node is determined by the 'slave_priority' parameter of the redis config. When taking the resource from 'unmanaged' to 'managed', the currently active master will be given a priority of 1000 (plus 1 for each active connection). The default 'slave_priority' is 100, so the master will stay master. For a slave to become master after converting the resource to managed, set a slave_priority greater than 1000. Redis server Path to \`redis-server\` Path to \`redis-server\` Path to \`redis-cli\` Path to \`redis-cli\` Path to 'redis.conf' Path to 'redis.conf' User to run redis as Redis user Directory to store socket and pid file in Redis var/run dir The filename to use for the pidfile. Will be created in the rundir. Should only be a basename, not a full path. Redis pidfile name The filename to use for the socket. Will be crated in the rundir. Should only be a basename, not a full path. Redis socket name Port for replication client to connect to on remote server Replication port During redis cluster bootstrap, wait for the last known master to be promoted before allowing any other instances in the cluster to be promoted. This lessens the risk of data loss when persistent data is in use. Wait for last known master EOI } INSTANCE_ATTR_NAME=`echo ${OCF_RESOURCE_INSTANCE}| awk -F : '{print $1}'` CRM_ATTR_REPL_INFO="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_INFO -s redis_replication" MASTER_HOST="" MASTER_ACTIVE_CACHED="" MASTER_ACTIVE="" master_is_active() { if [ -z "$MASTER_ACTIVE_CACHED" ]; then # determine if a master instance is already up and is healthy crm_mon --as-xml | grep "resource.*id=\"${OCF_RESOURCE_INSTANCE}\".*role=\"Master\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" > /dev/null 2>&1 MASTER_ACTIVE=$? 
MASTER_ACTIVE_CACHED="true" fi return $MASTER_ACTIVE } function set_master() { MASTER_HOST="$1" ${CRM_ATTR_REPL_INFO} -v "$1" -q } function last_known_master() { if [ -z "$MASTER_HOST" ]; then MASTER_HOST="$(${CRM_ATTR_REPL_INFO} --query -q 2>/dev/null)" fi echo "$MASTER_HOST" } function crm_master_reboot() { "${HA_SBIN_DIR}/crm_master" -l reboot "$@" } function calculate_score() { perf_score="$1" connected_clients="$2" if ocf_is_true "$OCF_RESKEY_wait_last_known_master"; then # only set perferred score by slave_priority if # we are not waiting for the last known master. Otherwise # we want the agent to have complete control over the scoring. perf_score="" connected_clients="0" fi if [[ -z "$perf_score" ]]; then if [[ "$(last_known_master)" == "$NODENAME" ]]; then perf_score=1000 else perf_score=1 fi fi perf_score=$(( perf_score + connected_clients )) echo "$perf_score" } function set_score() { local score="$1" if ocf_is_true "$OCF_RESKEY_wait_last_known_master" && ! master_is_active; then local last_master="$(last_known_master)" if [ -n "$last_master" ] && [[ "$last_master" != "$NODENAME" ]]; then ocf_log info "Postponing setting master score for ${NODENAME} until last known master instance [${last_master}] is promoted" return fi fi ocf_log debug "monitor: Setting master score to '$score'" crm_master_reboot -v "$score" } function redis_client() { - ocf_log debug "redis_client: '$REDIS_CLIENT' -s '$REDIS_SOCKET' $@" + ocf_log debug "redis_client: '$REDIS_CLIENT' -s '$REDIS_SOCKET' $*" if [ -n "$clientpasswd" ]; then "$REDIS_CLIENT" -s "$REDIS_SOCKET" -a "$clientpasswd" "$@" | sed 's/\r//' else "$REDIS_CLIENT" -s "$REDIS_SOCKET" "$@" | sed 's/\r//' fi } function simple_status() { local pid if ! [ -f "$REDIS_PIDFILE" ]; then return $OCF_NOT_RUNNING fi pid="$(<"$REDIS_PIDFILE")" pidof "$REDIS_SERVER" | grep -q "\<$pid\>" || return $OCF_NOT_RUNNING ocf_log debug "monitor: redis-server running under pid $pid" return $OCF_SUCCESS } function monitor() { local res simple_status res=$? if (( res != OCF_SUCCESS )); then return $res fi typeset -A info while read line; do [[ "$line" == "#"* ]] && continue [[ "$line" != *":"* ]] && continue IFS=':' read -r key value <<< "$line" info[$key]="$value" done < <(redis_client info) if [[ -z "${info[role]}" ]]; then ocf_log err "monitor: Could not get role from \`$REDIS_CLIENT -s $REDIS_SOCKET info\`" return $OCF_ERR_GENERIC fi if ocf_is_ms; then # Here we see if a score has already been set. # If score isn't set we the redis setting 'slave_priority'. # If that isn't set, we default to 1000 for a master, and 1 for slave. # We then add 1 for each connected client score="$(crm_master_reboot --get-value --quiet 2>/dev/null)" if [[ -z "$score" ]]; then score=$(calculate_score "${info[slave_priority]}" "${info[connected_clients]}") set_score "$score" fi if [[ "${info[role]}" == "master" ]]; then if ocf_is_probe; then set_master "$NODENAME" fi return $OCF_RUNNING_MASTER fi if [ "$CHECK_SLAVE_STATE" -eq 1 ]; then if [[ "${info[master_link_status]}" != "up" ]]; then ocf_log info "monitor: Slave mode link has not yet been established (link=${info[master_link_status]})" return $OCF_ERR_GENERIC fi if [[ "${info[master_host]}" != "$(last_known_master)" ]]; then ocf_log err "monitor: Slave mode current master does not match running master. current=${info[master_host]}, running=$(last_known_master)" return $OCF_ERR_GENERIC fi fi fi return $OCF_SUCCESS } function check_dump_file() { if ! 
have_binary "$REDIS_CHECK_DUMP"; then return 0 fi $REDIS_CHECK_DUMP ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE} 2>&1 } function start() { monitor status=$? if (( status == OCF_SUCCESS )) || (( status == OCF_RUNNING_MASTER )); then ocf_log info "start: redis is already running" return $OCF_SUCCESS fi [[ ! -d "$REDIS_RUNDIR" ]] && mkdir -p "$REDIS_RUNDIR" chown -R "$REDIS_USER" "$REDIS_RUNDIR" if have_binary "restorecon"; then restorecon -Rv "$REDIS_RUNDIR" fi # check for 0 byte database dump file. This is an unrecoverable start # condition that we can avoid by deleting the 0 byte database file. if [ -f "${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE}" ]; then local size="$(stat --format "%s" ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE})" if [ "$?" -eq "0" ] && [ "$size" -eq "0" ]; then ocf_log notice "Detected 0 byte ${REDIS_DUMP_FILE}, deleting zero length file to avoid start failure." rm -f ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE} fi fi ocf_log info "start: $REDIS_SERVER --daemonize yes --unixsocket '$REDIS_SOCKET' --pidfile '$REDIS_PIDFILE'" output="$(su "$REDIS_USER" -s /bin/sh -c "cd '$REDIS_RUNDIR'; exec '$REDIS_SERVER' '$REDIS_CONFIG' --daemonize yes --unixsocket '$REDIS_SOCKET' --pidfile '$REDIS_PIDFILE'" 2>&1)" while true; do # wait for redis to start typeset -A info while read line; do [[ "$line" == "#"* ]] && continue [[ "$line" != *":"* ]] && continue IFS=':' read -r key value <<< "$line" info[$key]="$value" done < <(redis_client info) if (( info[loading] == 0 )); then break elif (( info[loading] == 1 )); then sleep "${info[loading_eta_seconds]}" elif pidof "$REDIS_SERVER" >/dev/null; then # unknown error, but the process still exists. # This check is mainly because redis daemonizes before it starts listening, causing `redis-cli` to fail # See https://github.com/antirez/redis/issues/2368 # It's possible that the `pidof` will pick up a different redis, but in that case, the start operation will just time out sleep 1 else check_output="$(check_dump_file)" ocf_log err "start: Unknown error waiting for redis to start. redis-check-dump output=${check_output//$'\n'/; }" return $OCF_ERR_GENERIC fi done ocf_is_ms && demote # pacemaker expects resources to start in slave mode monitor status=$? if (( status == OCF_SUCCESS )) || (( status == OCF_RUNNING_MASTER )); then return $OCF_SUCCESS fi check_output="$(check_dump_file)" ocf_log err "start: Unknown error starting redis. redis-server output=${output//$'\n'/; } redis-check-dump output=${check_output//$'\n'/; }" return $status } function stop() { monitor status=$? if (( status == OCF_NOT_RUNNING )); then ocf_log info "stop: redis is already stopped" crm_master_reboot -D return $OCF_SUCCESS fi pid="$(<"$REDIS_PIDFILE")" kill -TERM "$pid" while true; do simple_status status=$? if (( status == OCF_NOT_RUNNING )); then crm_master_reboot -D return $OCF_SUCCESS fi sleep 1 done } function promote() { monitor status=$? if (( status == OCF_RUNNING_MASTER )); then ocf_log info "promote: Already running as master" set_master "$NODENAME" return $OCF_SUCCESS elif (( status != OCF_SUCCESS )); then ocf_log err "promote: Node is not running as a slave" return $OCF_ERR_GENERIC fi redis_client slaveof no one monitor status=$? if (( status == OCF_RUNNING_MASTER )); then set_master "$NODENAME" return $OCF_SUCCESS fi ocf_log err "promote: Unknown error while promoting to master (status=$status)" return $OCF_ERR_GENERIC } function demote() { local master_host local master_port CHECK_SLAVE_STATE=1 monitor status=$? 
if (( status == OCF_SUCCESS )); then ocf_log info "demote: Already running as slave" return $OCF_SUCCESS elif (( status == OCF_NOT_RUNNING )); then ocf_log err "demote: Failed to demote, redis not running." return $OCF_NOT_RUNNING fi master_host="$(last_known_master)" master_port="${REDIS_REPLICATION_PORT}" # The elected master has to remain a slave during startup. # During this period a placeholder master host is assigned. if [ -z "$master_host" ] || [[ "$master_host" == "$NODENAME" ]]; then CHECK_SLAVE_STATE=0 master_host="no-such-master" elif ! master_is_active; then # no master has been promoted yet. we'll be notified when the # master starts. CHECK_SLAVE_STATE=0 master_host="no-such-master" fi ocf_log info "demote: Setting master to '$master_host'" redis_client slaveof "$master_host" "$master_port" # Wait forever for the slave to connect to the master and finish the # sync. Timeout is controlled by Pacemaker "op start timeout=XX". # # hint: redis master_link_status will only come "up" when # the SYNC with the master has completed. # This can take an arbitraty time (data) and should # only be parametrized by the start operation timeout # by the administrator, not by this resource agent code while true; do # Wait infinite if replication is syncing # Then start/demote operation timeout determines timeout monitor status=$? if (( status == OCF_SUCCESS )); then return $OCF_SUCCESS fi sleep 1 done ocf_log err "demote: Unexpected error setting slave mode (status=$status)" return $OCF_ERR_GENERIC } function notify() { mode="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" case "$mode" in post-demote|post-promote) # change the master monitor status=$? if (( status == OCF_SUCCESS )); then # were a slave # calling demote updates the slave's connection # to the newly appointed Master instance. demote fi ;; esac return $OCF_SUCCESS } function validate() { if [[ -x "$REDIS_SERVER" ]]; then ocf_log err "validate: $REDIS_SERVER does not exist or is not executable" return $OCF_ERR_INSTALLED fi if [[ -x "$REDIS_CLIENT" ]]; then ocf_log err "validate: $REDIS_CLIENT does not exist or is not executable" return $OCF_ERR_INSTALLED fi if [[ -f "$REDIS_CONFIG" ]]; then ocf_log err "validate: $REDIS_CONFIG does not exist" return $OCF_ERR_CONFIGURED fi if ! getent passwd "$REDIS_USER" &>/dev/null; then ocf_log err "validate: $REDIS_USER is not a valid user" return $OCF_ERR_CONFIGURED fi } NODENAME=$(ocf_local_nodename) if [ -f "$REDIS_CONFIG" ]; then clientpasswd="$(cat $REDIS_CONFIG | sed -n -e 's/^\s*requirepass\s*\(.*\)\s*$/\1/p' | tail -n 1)" fi ocf_log debug "action=${1:-$__OCF_ACTION} notify_type=${OCF_RESKEY_CRM_meta_notify_type} notify_operation=${OCF_RESKEY_CRM_meta_notify_operation} master_host=${OCF_RESKEY_CRM_meta_notify_master_uname} slave_host=${OCF_RESKEY_CRM_meta_notify_slave_uname} promote_host=${OCF_RESKEY_CRM_meta_notify_promote_uname} demote_host=${OCF_RESKEY_CRM_meta_notify_demote_uname}; params: bin=${OCF_RESKEY_bin} client_bin=${OCF_RESKEY_client_bin} config=${OCF_RESKEY_config} user=${OCF_RESKEY_user} rundir=${OCF_RESKEY_rundir} port=${OCF_RESKEY_port}" case "${1:-$__OCF_ACTION}" in status|monitor) monitor ;; start) start ;; stop) stop ;; restart) stop && start ;; promote) promote ;; demote) demote ;; notify) notify ;; meta-data) meta_data ;; validate-all) validate ;; *) echo "Usage: $0 {monitor|start|stop|restart|promote|demote|notify|validate-all|meta-data}" exit $OCF_ERR_UNIMPLEMENTED ;; esac status=$? 
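# --- Illustrative aside (not part of the patch): as written above, the three
# file checks in validate() fire when the test *succeeds* - e.g.
# [[ -x "$REDIS_SERVER" ]] is true precisely when the binary is executable,
# yet that branch logs "does not exist or is not executable". A sketch of the
# presumably intended logic, with the conditions negated (validate_sketch is
# a made-up name; the agent's real entry point remains validate):
validate_sketch() {
    if [[ ! -x "$REDIS_SERVER" ]]; then
        ocf_log err "validate: $REDIS_SERVER does not exist or is not executable"
        return $OCF_ERR_INSTALLED
    fi
    if [[ ! -x "$REDIS_CLIENT" ]]; then
        ocf_log err "validate: $REDIS_CLIENT does not exist or is not executable"
        return $OCF_ERR_INSTALLED
    fi
    if [[ ! -f "$REDIS_CONFIG" ]]; then
        ocf_log err "validate: $REDIS_CONFIG does not exist"
        return $OCF_ERR_CONFIGURED
    fi
    if ! getent passwd "$REDIS_USER" &>/dev/null; then
        ocf_log err "validate: $REDIS_USER is not a valid user"
        return $OCF_ERR_CONFIGURED
    fi
    return $OCF_SUCCESS
}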
ocf_log debug "exit_status=$status" exit $status diff --git a/heartbeat/tomcat b/heartbeat/tomcat index 0c6325b31..cbcd12a00 100755 --- a/heartbeat/tomcat +++ b/heartbeat/tomcat @@ -1,775 +1,759 @@ #!/bin/sh # # Description: Manages a Tomcat Server as an OCF High-Availability # resource under Heartbeat/LinuxHA control # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. # # Copyright (c) 2007 NIPPON TELEGRAPH AND TELEPHONE CORPORATION # ####################################################################### # OCF parameters: # OCF_RESKEY_tomcat_name - The name of the resource. Default is tomcat # OCF_RESKEY_script_log - A destination of the log of this script. Default /var/log/OCF_RESKEY_tomcat_name.log # OCF_RESKEY_tomcat_stop_timeout - Time-out at the time of the stop. Default is 5. DEPRECATED # OCF_RESKEY_tomcat_suspend_trialcount - The re-try number of times awaiting a stop. Default is 10. DEPRECATED # OCF_RESKEY_tomcat_user - A user name to start a resource. # OCF_RESKEY_statusurl - URL for state confirmation. Default is http://127.0.0.1:8080 # OCF_RESKEY_max_stop_time - The max time it should take for proper shutdown. Restrictions, only Tomcat6. # OCF_RESKEY_java_home - Home directory of Java. Default is none # OCF_RESKEY_java_opts - Options to pass to Java JVM for start and stop. Default is none # OCF_RESKEY_catalina_home - Home directory of Tomcat. Default is none # OCF_RESKEY_catalina_base - Base directory of Tomcat. Default is OCF_RESKEY_catalina_home # OCF_RESKEY_catalina_out - Log file name of Tomcat. Default is OCF_RESKEY_catalina_base/logs/catalina.out # OCF_RESKEY_catalina_pid - A PID file name of Tomcat. Default is OCF_RESKEY_catalina_base/logs/catalina.pid # OCF_RESKEY_tomcat_start_opts - Start options of Tomcat. Default is none. # OCF_RESKEY_catalina_opts - CATALINA_OPTS environment variable. Default is none. # OCF_RESKEY_catalina_tmpdir - CATALINA_TMPDIR environment variable. Default is none. # OCF_RESKEY_catalina_rotate_log - Control catalina.out logrotation flag. Default is NO. # OCF_RESKEY_catalina_rotatetime - catalina.out logrotation time span(seconds). Default is 86400. # OCF_RESKEY_java_endorsed_dirs - JAVA_ENDORSED_DIRS environment variable. Default is none. # OCF_RESKEY_logging_config - LOGGING_CONFIG environment variable. Default is none. # OCF_RESKEY_logging_manager - LOGGING_MANAGER environment variable. Default is none. ############################################################################### : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Use runuser if available for SELinux. if [ -x /sbin/runuser ]; then SU=runuser else SU=su fi -############################################################################ -# Usage usage() { - cat <<-! 
+ cat <<-EOF usage: $0 action action: start start Tomcat stop stop Tomcat status return the status of Tomcat, up or down monitor return TRUE if Tomcat appears to be working. You have to have installed $WGETNAME for this to work. meta-data show meta data message validate-all validate the instance parameters -! +EOF } -############################################################################ -# Check tomcat service availability isrunning_tomcat() { $WGET --tries=20 -O /dev/null $RESOURCE_STATUSURL >/dev/null 2>&1 } -############################################################################ -# isalive_tomcat() { if ocf_is_true $SYSTEMD; then systemctl is-active tomcat@${TOMCAT_NAME} > /dev/null 2>&1 return $? fi # As the server stops, the PID file disappears. To avoid race conditions, # we will have remembered the PID of a running instance on script entry. local pid=$rememberedPID # If there is a PID file, attempt to use that if [ -f $CATALINA_PID ]; then local tmp ocf_log debug "Reading pid from $CATALINA_PID" tmp=`head -n 1 $CATALINA_PID` if [ $? -eq 0 ]; then pid=$tmp fi fi if [ -n "$pid" ] && [ "$pid" -gt 0 ]; then # Retry message for restraint ocf_log debug "Sending noop signal to $pid" kill -s 0 $pid >/dev/null 2>&1 return $? fi # No PID file false } -############################################################################ # Check rotatelogs process and restart if it is stopped monitor_rotatelogs() { pgrep -f "$ROTATELOGS.*$CATALINA_BASE/logs/catalina_%F.log" > /dev/null 2>&1 if [ $? -ne 0 ]; then ocf_log warn "A rotatelogs command for $CATALINA_BASE/logs/catalina_%F.log is not running. Restarting it." start_rotatelogs if [ $? -eq 0 ]; then ocf_log info "Restart rotatelogs process succeeded." else ocf_log warn "Restart rotatelogs process failed." fi fi } -############################################################################ -# Check tomcat process and service availability monitor_tomcat() { isalive_tomcat || return $OCF_NOT_RUNNING isrunning_tomcat || return $OCF_ERR_GENERIC if ocf_is_true ${CATALINA_ROTATE_LOG}; then # Monitor rotatelogs process and restart it if it is stopped. # And never consider rotatelogs process failure to be a monitor failure # as long as Tomcat process works fine. monitor_rotatelogs fi return $OCF_SUCCESS } -############################################################################ -# Startup rotatelogs process start_rotatelogs() { # -s is required because tomcat5.5's login shell is /bin/false $SU - -s /bin/sh $RESOURCE_TOMCAT_USER \ -c "$ROTATELOGS -l \"$CATALINA_BASE/logs/catalina_%F.log\" $CATALINA_ROTATETIME" \ < "$CATALINA_OUT" > /dev/null 2>&1 & } -############################################################################ # Execute catalina.out log rotation rotate_catalina_out() { # Check catalina_%F.log is writable or not. CURRENT_ROTATELOG_SUFFIX=`date +"%F"` $SU - -s /bin/sh $RESOURCE_TOMCAT_USER \ -c "touch \"$CATALINA_BASE/logs/catalina_$CURRENT_ROTATELOG_SUFFIX.log\"" > /dev/null 2>&1 if [ $? -ne 0 ]; then ocf_exit_reason "$CATALINA_BASE/logs/catalina_$CURRENT_ROTATELOG_SUFFIX.log is not writable." return $OCF_ERR_GENERIC fi # Clean up and set permissions on required files rm -rf "$CATALINA_BASE"/temp/* if [ -p "$CATALINA_OUT" ]; then rm -f "$CATALINA_OUT" elif [ -e "$CATALINA_OUT" ]; then DATE=`date +"%F-%H%M%S"` ocf_log warn "$CATALINA_OUT already exists. 
It is saved as $CATALINA_OUT-$DATE" mv "$CATALINA_OUT" "$CATALINA_OUT-$DATE" fi mkfifo -m700 "$CATALINA_OUT" chown --dereference "$RESOURCE_TOMCAT_USER" "$CATALINA_OUT" || true start_rotatelogs } -############################################################################ -# Create systemd configuration create_systemd_config() { cat<<-EOF > /etc/sysconfig/tomcat@${TOMCAT_NAME} JAVA_HOME=${JAVA_HOME} JAVA_OPTS="${JAVA_OPTS}" CATALINA_HOME=${CATALINA_HOME} CATALINA_BASE=${CATALINA_BASE} CATALINA_OUT=${CATALINA_OUT} CATALINA_OPTS="${CATALINA_OPTS}" CATALINA_TMPDIR="${CATALINA_TMPDIR}" JAVA_ENDORSED_DIRS="${JAVA_ENDORSED_DIRS}" LOGGING_CONFIG="${LOGGING_CONFIG}" LOGGING_MANAGER="${LOGGING_MANAGER}" TOMCAT_CFG=${TOMCAT_CFG} EOF } -############################################################################ -# Tomcat Command +# shellcheck disable=SC2068 tomcatCommand() { if ocf_is_true $SYSTEMD; then systemctl $@ tomcat@${TOMCAT_NAME} else cat<<-END_TOMCAT_COMMAND export JAVA_HOME=${JAVA_HOME} export JAVA_OPTS="${JAVA_OPTS}" export CATALINA_HOME=${CATALINA_HOME} export CATALINA_BASE=${CATALINA_BASE} export CATALINA_OUT=${CATALINA_OUT} export CATALINA_PID=${CATALINA_PID} export CATALINA_OPTS="${CATALINA_OPTS}" export CATALINA_TMPDIR="${CATALINA_TMPDIR}" export JAVA_ENDORSED_DIRS="${JAVA_ENDORSED_DIRS}" export LOGGING_CONFIG="${LOGGING_CONFIG}" export LOGGING_MANAGER="${LOGGING_MANAGER}" export TOMCAT_CFG=${TOMCAT_CFG} $TOMCAT_START_SCRIPT $@ END_TOMCAT_COMMAND fi } + +# shellcheck disable=SC2068 attemptTomcatCommand() { if [ -n "$REDIRECT_DEFAULT_CONFIG" ]; then - export TOMCAT_CFG=$(mktemp ${HA_RSCTMP}/tomcat-tmp-XXXXX.cfg) + TOMCAT_CFG=$(mktemp "${HA_RSCTMP}/tomcat-tmp-XXXXX.cfg") + export TOMCAT_CFG fi if ocf_is_true $SYSTEMD; then tomcatCommand $@ elif [ "$RESOURCE_TOMCAT_USER" = root ]; then "$TOMCAT_START_SCRIPT" $@ >> "$TOMCAT_CONSOLE" 2>&1 else tomcatCommand $@ | $SU - -s /bin/sh "$RESOURCE_TOMCAT_USER" >> "$TOMCAT_CONSOLE" 2>&1 fi if [ -n "$REDIRECT_DEFAULT_CONFIG" ]; then rm -f "$TOMCAT_CFG" fi } -############################################################################ -# Start Tomcat start_tomcat() { if ocf_is_true $SYSTEMD; then create_systemd_config fi - cd "$CATALINA_HOME/bin" + cd "$CATALINA_HOME/bin" || return $OCF_ERR_GENERIC validate_all_tomcat || exit $? monitor_tomcat if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS fi # Remove $CATALINA_PID if it exists rm -f $CATALINA_PID #ocf_log debug "catalina.out rotation FLG = ${CATALINA_ROTATE_LOG}" if ocf_is_true ${CATALINA_ROTATE_LOG}; then rotate_catalina_out if [ $? -eq 0 ]; then ocf_log debug "Rotate catalina.out succeeded." else ocf_exit_reason "Rotate catalina.out failed. Avoid starting tomcat without catalina.out rotation." return $OCF_ERR_GENERIC fi fi - + echo "`date "+%Y/%m/%d %T"`: start ===========================" >> "$TOMCAT_CONSOLE" ocf_log debug "CATALINA_OPTS value = ${CATALINA_OPTS}" attemptTomcatCommand start ${TOMCAT_START_OPTS} & while true; do monitor_tomcat if [ $? 
-eq $OCF_SUCCESS ]; then break fi ocf_log debug "start_tomcat[$TOMCAT_NAME]: retry monitor_tomcat" sleep 3 done return $OCF_SUCCESS } -############################################################################ -# Stop Tomcat stop_tomcat() { local stop_time local RA_TIMEOUT=20 local TOMCAT_STOP_OPTS="" - if [ -n $OCF_RESKEY_CRM_meta_timeout ]; then + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then RA_TIMEOUT=$((OCF_RESKEY_CRM_meta_timeout/1000)) fi STOP_TIMEOUT=$((RA_TIMEOUT-5)) if [ -n "$MAX_STOP_TIME" ]; then if [ $MAX_STOP_TIME -gt $RA_TIMEOUT ]; then ocf_log warn "max_stop_timeout must be shorter than the timeout of stop operation." fi if [ $MAX_STOP_TIME -eq 0 ]; then STOP_TIMEOUT=$RA_TIMEOUT else STOP_TIMEOUT=$MAX_STOP_TIME fi fi cd "$CATALINA_HOME/bin" memorize_pid # This lets monitoring continue to work reliably echo "`date "+%Y/%m/%d %T"`: stop ###########################" >> "$TOMCAT_CONSOLE" if [ "$TOMCAT_START_SCRIPT" = "$CATALINA_HOME/bin/catalina.sh" ]; then TOMCAT_STOP_OPTS="$STOP_TIMEOUT --force" fi stop_time=$(date +%s) attemptTomcatCommand stop $TOMCAT_STOP_OPTS lapse_sec=0 while isalive_tomcat; do sleep 1 lapse_sec=`expr $(date +%s) - $stop_time` if [ $lapse_sec -ge $STOP_TIMEOUT ]; then ocf_log debug "stop_tomcat[$TOMCAT_NAME]: stop failed, killing with SIGKILL ($lapse_sec)" kill -s KILL $rememberedPID > /dev/null 2>&1 fi done if ocf_is_true ${CATALINA_ROTATE_LOG}; then rm -f "$CATALINA_PID" "${CATALINA_OUT}" else rm -f "$CATALINA_PID" fi return $OCF_SUCCESS } metadata_tomcat() { cat < 1.0 Resource script for Tomcat. It manages a Tomcat instance as a cluster resource. Manages a Tomcat servlet environment instance The name of the resource, added as a Java parameter in JAVA_OPTS: -Dname=<tomcat_name> to Tomcat process on start. Used to ensure process is still running and must be unique. The name of the resource Log file, used during start and stop operations. Log file Time-out for stop operation. DEPRECATED Time-out for the stop operation. DEPRECATED Maximum number of times to retry stop operation before suspending and killing Tomcat. DEPRECATED. Does not retry. Max retry count for stop operation. DEPRECATED The user who starts Tomcat. The user who starts Tomcat URL for state confirmation. URL for state confirmation Number of seconds to wait during a stop before drastic measures (force kill) are used on the tomcat process. This number MUST be less than your cluster stop timeout for the resource. The default value is five seconds before the timeout value of stop operation. When it is over this value, it stops a process in kill commands. This parameter is only effective on Tomcat 6 or later. The max time it should take for proper shutdown. Home directory of Java. Home directory of Java Java JVM options used on start and stop. Java options parsed to JVM, used on start and stop. Home directory of Tomcat. Home directory of Tomcat Instance directory of Tomcat Instance directory of Tomcat, defaults to catalina_home Log file name of Tomcat Log file name of Tomcat, defaults to catalina_base/logs/catalina.out A PID file name for Tomcat. A PID file name for Tomcat Force use of systemd when available. Force use of systemd when available Absolute path to the custom tomcat start script to use. Tomcat start script location Tomcat start options. Tomcat start options Catalina options, for the start operation only. Catalina options Temporary directory of Tomcat Temporary directory of Tomcat, defaults to none Rotate catalina.out flag. 
Rotate catalina.out flag catalina.out rotation interval (seconds). catalina.out rotation interval (seconds) Java_endorsed_dirs of tomcat Java_endorsed_dirs of Tomcat, defaults to none Logging_config of tomcat Logging_config of Tomcat, defaults to none Logging_manager of tomcat Logging_manager of Tomcat, defaults to none. END return $OCF_SUCCESS } validate_all_tomcat() { local port local rc=$OCF_SUCCESS ocf_log info "validate_all_tomcat[$TOMCAT_NAME]" check_binary $WGET if ! ocf_is_true $OCF_RESKEY_force_systemd && [ -z "${TOMCAT_START_SCRIPT}" ]; then ocf_exit_reason "No default tomcat start script detected. Please specify start script location using the 'tomcat_start_script' option" rc=$OCF_ERR_CONFIGURED fi if [ -n "$MAX_STOP_TIME" ] && [ "$MAX_STOP_TIME" -lt 0 ]; then ocf_exit_reason "max_stop_time must be set to a value greater than 0." rc=$OCF_ERR_CONFIGURED fi if echo "$RESOURCE_STATUSURL" | grep -q ":[0-9][0-9]*" ; then port=${RESOURCE_STATUSURL##*:} port=${port%%/*} ocf_log debug "Tomcat port is $port" ocf_log debug "grep port=\"$port\" $CATALINA_BASE/conf/server.xml" grep "port=\"$port\"" $CATALINA_BASE/conf/server.xml > /dev/null 2>&1 if [ $? -ne 0 ]; then ocf_exit_reason "Your configured status URL specifies a port ($port), but the server does not have a connector listening to that port in $CATALINA_BASE/conf/server.xml" rc=$OCF_ERR_INSTALLED fi fi if ocf_is_true ${CATALINA_ROTATE_LOG}; then if [ ! -x "$ROTATELOGS" ]; then ocf_exit_reason "rotatelogs command does not exist." rc=$OCF_ERR_INSTALLED fi fi return $rc } # As we stop tomcat, it removes it's own pid file...we still want to know what it was memorize_pid() { if [ -f $CATALINA_PID ]; then rememberedPID=$(cat $CATALINA_PID) fi } # ### tomcat RA environment variables # COMMAND=$1 TOMCAT_NAME="${OCF_RESKEY_tomcat_name-tomcat}" TOMCAT_CONSOLE="${OCF_RESKEY_script_log-/var/log/$TOMCAT_NAME.log}" RESOURCE_TOMCAT_USER="${OCF_RESKEY_tomcat_user-root}" RESOURCE_STATUSURL="${OCF_RESKEY_statusurl-http://127.0.0.1:8080}" OCF_RESKEY_force_systemd_default=0 JAVA_HOME="${OCF_RESKEY_java_home}" JAVA_OPTS="${OCF_RESKEY_java_opts}" CATALINA_HOME="${OCF_RESKEY_catalina_home}" CATALINA_BASE="${OCF_RESKEY_catalina_base-${OCF_RESKEY_catalina_home}}" CATALINA_OUT="${OCF_RESKEY_catalina_out-$CATALINA_BASE/logs/catalina.out}" CATALINA_PID=$OCF_RESKEY_catalina_pid if [ -z "$CATALINA_PID" ]; then mkdir -p "${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/" if [ "${RESOURCE_TOMCAT_USER}" != "root" ]; then chown ${RESOURCE_TOMCAT_USER} "${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/" fi CATALINA_PID="${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/catalina.pid" fi MAX_STOP_TIME="${OCF_RESKEY_max_stop_time}" : ${OCF_RESKEY_force_systemd=${OCF_RESKEY_force_systemd_default}} TOMCAT_START_OPTS="${OCF_RESKEY_tomcat_start_opts}" TOMCAT_START_SCRIPT="${OCF_RESKEY_tomcat_start_script}" CATALINA_OPTS="-Dname=$TOMCAT_NAME ${OCF_RESKEY_catalina_opts}" CATALINA_TMPDIR="${OCF_RESKEY_catalina_tmpdir}" CATALINA_ROTATE_LOG="${OCF_RESKEY_catalina_rotate_log-NO}" CATALINA_ROTATETIME="${OCF_RESKEY_catalina_rotatetime-86400}" JAVA_ENDORSED_DIRS="${OCF_RESKEY_java_endorsed_dirs}" LOGGING_CONFIG="${OCF_RESKEY_logging_config}" LOGGING_MANAGER="${OCF_RESKEY_logging_manager}" if [ -z "${TOMCAT_START_SCRIPT}" ]; then if ocf_is_true $OCF_RESKEY_force_systemd && \ ps -p 1 | grep -q systemd; then SYSTEMD=1 elif [ -e "$CATALINA_HOME/bin/catalina.sh" ]; then TOMCAT_START_SCRIPT="$CATALINA_HOME/bin/catalina.sh" elif [ -e "/usr/sbin/tomcat" ]; then REDIRECT_DEFAULT_CONFIG=1 
TOMCAT_START_SCRIPT="/usr/sbin/tomcat" elif [ -e "/usr/sbin/tomcat6" ]; then REDIRECT_DEFAULT_CONFIG=1 TOMCAT_START_SCRIPT="/usr/sbin/tomcat6" fi fi LSB_STATUS_STOPPED=3 if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi case "$COMMAND" in meta-data) metadata_tomcat; exit $OCF_SUCCESS;; help|usage) usage; exit $OCF_SUCCESS;; esac if [ ! -d "$JAVA_HOME" -o ! -d "$CATALINA_HOME" -o ! -d "$CATALINA_BASE" ]; then case $COMMAND in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; esac ocf_exit_reason "JAVA_HOME or CATALINA_HOME or CATALINA_BASE does not exist." exit $OCF_ERR_INSTALLED fi export JAVA_HOME JAVA_OPTS CATALINA_HOME CATALINA_BASE CATALINA_OUT CATALINA_PID CATALINA_OPTS CATALINA_TMPDIR JAVA_ENDORSED_DIRS LOGGING_CONFIG LOGGING_MANAGER JAVA=${JAVA_HOME}/bin/java if [ ! -x "$JAVA" ]; then case $COMMAND in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; esac ocf_exit_reason "java command does not exist." exit $OCF_ERR_INSTALLED fi ROTATELOGS="" if ocf_is_true ${CATALINA_ROTATE_LOG}; then # Look for rotatelogs/rotatelogs2 if [ -x /usr/sbin/rotatelogs ]; then ROTATELOGS=/usr/sbin/rotatelogs elif [ -x /usr/sbin/rotatelogs2 ]; then ROTATELOGS=/usr/sbin/rotatelogs2 fi fi # # ------------------ # the main script # ------------------ # case "$COMMAND" in start) ocf_log debug "[$TOMCAT_NAME] Enter tomcat start" start_tomcat func_status=$? ocf_log debug "[$TOMCAT_NAME] Leave tomcat start $func_status" exit $func_status ;; stop) ocf_log debug "[$TOMCAT_NAME] Enter tomcat stop" stop_tomcat func_status=$? ocf_log debug "[$TOMCAT_NAME] Leave tomcat stop $func_status" exit $func_status ;; status) if monitor_tomcat; then echo tomcat instance $TOMCAT_NAME is running exit $OCF_SUCCESS else echo tomcat instance $TOMCAT_NAME is stopped exit $OCF_NOT_RUNNING fi exit $? ;; monitor) #ocf_log debug "[$TOMCAT_NAME] Enter tomcat monitor" monitor_tomcat func_status=$? #ocf_log debug "[$TOMCAT_NAME] Leave tomcat monitor $func_status" exit $func_status ;; meta-data) metadata_tomcat exit $? ;; validate-all) validate_all_tomcat exit $? ;; usage|help) usage exit $OCF_SUCCESS ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac