Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/.gitignore b/.gitignore
index bbff032c3..3a9be36e5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,109 +1,110 @@
*.swp
Makefile.in
aclocal.m4
autoconf
autoheader
autom4te.cache
automake
autoscan.log
compile
configure
configure.scan
config.guess
config.log
config.sub
config.status
Makefile
depcomp
install-sh
libtoolize
ltmain.sh
libtool
make/stamp-h1
m4
make/clusterautoconfig.h*
missing
*.pc
.deps
.libs
*.o
*.la
*.lo
*.loT
rgmanager/src/resources/fs.sh
rgmanager/src/resources/oracledb.sh
rgmanager/src/resources/utils/config-utils.sh
resource-agents-*
.version
# generated by ./autogen.sh && ./configure
doc/man/*.7
doc/man/*.xml
heartbeat/ocf-binaries
heartbeat/ocf-directories
heartbeat/ocf-shellfuncs
heartbeat/send_ua
heartbeat/shellfuncs
+heartbeat/*.pyc
include/agent_config.h
include/config.h
include/config.h.in
include/stamp-h1
include/stamp-h2
ldirectord/ldirectord
ldirectord/ldirectord.8
ldirectord/OCF/ldirectord
ldirectord/init.d/ldirectord
ldirectord/init.d/ldirectord.debian
ldirectord/init.d/ldirectord.debian.default
ldirectord/systemd/ldirectord.service
tools/findif
tools/ocf-tester
tools/send_arp
tools/tickle_tcp
tools/ocft/README
tools/ocft/README.zh_CN
tools/ocft/caselib
tools/ocft/ocft
*.cache
*.upgrade.xml
py-compile
ylwrap
# BEAM Entries
*.beam
parser-messages
MISC_ERRORS
cscope.files
cscope.out
patches
updates
logs
# OS and Editor Artifacts
.DS_Store
.bomb
*.rej
*.bz2
*.gz
*.xz
*.sed
*.diff
*.patch
*.gres
*~
# Misc
HTML
TAGS
GPATH
GRTAGS
GSYMS
GTAGS
.gres.*
*.orig
.gdb_history
*~
\#*
.changes
pacemaker.tar.gz
diff --git a/heartbeat/SAPInstance b/heartbeat/SAPInstance
index eb058cccf..ca320de1f 100755
--- a/heartbeat/SAPInstance
+++ b/heartbeat/SAPInstance
@@ -1,980 +1,980 @@
#!/bin/sh
#
# SAPInstance
#
# Description: Manages a single SAP Instance as a High-Availability
# resource. One SAP Instance is defined by one
# SAP Instance-Profile. start/stop handles all services
# of the START-Profile, status and monitor care only
# about essential services.
#
# Author: Alexander Krauth, June 2006
# Support: linux@sap.com
# License: GNU General Public License (GPL)
# Copyright: (c) 2006-2008 Alexander Krauth
#
# An example usage:
# See usage() function below for more details...
#
# OCF instance parameters:
# OCF_RESKEY_InstanceName
# OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default)
# OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default)
# OCF_RESKEY_START_PROFILE (optional, well known directories will be searched by default)
# OCF_RESKEY_START_WAITTIME (optional, to solve timing problems during J2EE-Addin start)
# OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery using cleanipc, default is false)
# OCF_RESKEY_MONITOR_SERVICES (optional, default is to monitor critical services only)
# OCF_RESKEY_SHUTDOWN_METHOD (optional, defaults to NORMAL, KILL: terminate the SAP instance with OS commands - faster, at your own risk)
# OCF_RESKEY_ERS_InstanceName (optional, InstanceName of the ERS instance in a Master/Slave configuration)
# OCF_RESKEY_ERS_START_PROFILE (optional, START_PROFILE of the ERS instance in a Master/Slave configuration)
# OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started)
# OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started)
# OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped)
# OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped)
# OCF_RESKEY_IS_ERS (needed for ENQ/REPL NW 740)
#
# TODO: - Option to shutdown sapstartsrv for non-active instances -> that means: do probes only with OS tools (sapinstance_status)
# - Option for better standalone enqueue server monitoring, using ensmon (test enque-deque)
# - Option for cleanup abandoned enqueue replication tables
#
#######################################################################
# Initialization:
# Locate and source the OCF shell function library, which provides
# ocf_log, ocf_is_probe, ocf_is_true, ocf_run, have_binary and the
# OCF_* return-code constants used throughout this agent.
# OCF_FUNCTIONS_DIR may be pre-set by the caller; otherwise the
# standard heartbeat location under OCF_ROOT is used.
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
#######################################################################
# Shell used when commands must run on behalf of the SAP admin user.
SH=/bin/sh
#
# sapinstance_usage : Print a usage summary listing every supported action.
#
sapinstance_usage() {
# Join the newline/space separated method list into "m1|m2|..." for display.
methods=`sapinstance_methods`
methods=`echo $methods | tr ' ' '|'`
cat <<-EOF
usage: $0 ($methods)
$0 manages a SAP Instance as an HA resource.
The 'start' operation starts the instance or the ERS instance in a Master/Slave configuration
The 'stop' operation stops the instance
The 'status' operation reports whether the instance is running
The 'monitor' operation reports whether the instance seems to be working
The 'promote' operation starts the primary instance in a Master/Slave configuration
The 'demote' operation stops the primary instance and starts the ERS instance
The 'reload' operation allows changed parameters (non-unique only) without restarting the service
The 'notify' operation always returns SUCCESS
The 'validate-all' operation reports whether the parameters are valid
The 'methods' operation reports on the methods $0 supports
EOF
}
sapinstance_meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="SAPInstance">
<version>2.14</version>
<longdesc lang="en">
Usually a SAP system consists of one database and at least one or more SAP instances (sometimes called application servers). One SAP Instance is defined by having exactly one instance profile. The instance profiles can usually be found in the directory /sapmnt/SID/profile. Each instance must be configured as it's own resource in the cluster configuration.
The resource agent supports the following SAP versions:
- SAP WebAS ABAP Release 6.20 - 7.40
- SAP WebAS Java Release 6.40 - 7.40
- SAP WebAS ABAP + Java Add-In Release 6.20 - 7.40 (Java is not monitored by the cluster in that case)
When using a SAP Kernel 6.40 please check and implement the actions from the section "Manual postprocessing" from SAP note 995116 (http://sdn.sap.com).
Other versions may also work with this agent, but have not been verified.
All operations of the SAPInstance resource agent are done by using the startup framework called SAP Management Console or sapstartsrv that was introduced with SAP kernel release 6.40. Find more information about the SAP Management Console in SAP note 1014480. Using this framework defines a clear interface for the Heartbeat cluster, how it sees the SAP system. The options for monitoring the SAP system are also much better than other methods like just watching the ps command for running processes or doing some pings to the application. sapstartsrv uses SOAP messages to request the status of running SAP processes. Therefore it can actually ask a process itself what it's status is, independent from other problems that might exist at the same time.
sapstartsrv knows 4 status colours:
- GREEN = everything is fine
- YELLOW = something is wrong, but the service is still working
- RED = the service does not work
- GRAY = the service has not been started
The SAPInstance resource agent will interpret GREEN and YELLOW as OK. That means that minor problems will not be reported to the Heartbeat cluster. This prevents the cluster from doing an unwanted failover.
The statuses RED and GRAY are reported as NOT_RUNNING to the cluster. Depending on the status the cluster expects from the resource, it will do a restart, failover or just nothing.
</longdesc>
<shortdesc lang="en">Manages a SAP instance as an HA resource.</shortdesc>
<parameters>
<parameter name="InstanceName" unique="1" required="1">
<longdesc lang="en">The full qualified SAP instance name. e.g. P01_DVEBMGS00_sapp01ci. Usually this is the name of the SAP instance profile.</longdesc>
<shortdesc lang="en">Instance name: SID_INSTANCE_VIR-HOSTNAME</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="DIR_EXECUTABLE" unique="0" required="0">
<longdesc lang="en">The full qualified path where to find sapstartsrv and sapcontrol. Specify this parameter, if you have changed the SAP kernel directory location after the default SAP installation.</longdesc>
<shortdesc lang="en">Path of sapstartsrv and sapcontrol</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="DIR_PROFILE" unique="0" required="0">
<longdesc lang="en">The full qualified path where to find the SAP START profile. Specify this parameter, if you have changed the SAP profile directory location after the default SAP installation.</longdesc>
<shortdesc lang="en">Path of start profile</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="START_PROFILE" unique="1" required="0">
<longdesc lang="en">The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than.</longdesc>
<shortdesc lang="en">Start profile name</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="START_WAITTIME" unique="0" required="0">
<longdesc lang="en">After that time in seconds a monitor operation is executed by the resource agent. Does the monitor return SUCCESS, the start ishandled as SUCCESS. This is useful to resolve timing problems with e.g. the J2EE-Addin instance.Usually the resource agent waits until all services are started and the SAP Management Console reports a GREEN status. A double stack installation (ABAP + Java AddIn) consists of an ABAP dispatcher and a JAVA instance. Normally the start of the JAVA instance takes much longer than the start of the ABAP instance. For a JAVA Instance you may need to configure a much higher timeout for the start operation of the resource in Heartbeat. The disadvantage here is, that the discovery of a failed start by the cluster takes longer. Somebody might say: For me it is important, that the ABAP instance is up and running. A failure of the JAVA instance shall not cause a failover of the SAP instance.
Actually the SAP MC reports a YELLOW status, if the JAVA instance of a double stack system fails. From the resource agent point of view YELLOW means:everything is OK. Setting START_WAITTIME to a lower value determines the resource agent to check the status of the instance during a start operation after that time. As it would wait normally for a GREEN status, now it reports SUCCESS to the cluster in case of a YELLOW status already after the specified time.
That is only useful for double stack systems.
</longdesc>
<shortdesc lang="en">Check the successful start after that time (do not wait for J2EE-Addin)</shortdesc>
<content type="string" default="3600" />
</parameter>
<parameter name="AUTOMATIC_RECOVER" unique="0" required="0">
<longdesc lang="en">The SAPInstance resource agent tries to recover a failed start attempt automatically one time. This is done by killing running instance processes, removing the kill.sap file and executing cleanipc. Sometimes a crashed SAP instance leaves some processes and/or shared memory segments behind. Setting this option to true will try to remove those leftovers during a start operation. That is to reduce manual work for the administrator.</longdesc>
<shortdesc lang="en">Enable or disable automatic startup recovery</shortdesc>
<content type="boolean" default="false"/>
</parameter>
<parameter name="MONITOR_SERVICES" unique="0" required="0">
<longdesc lang="en">Within a SAP instance there can be several services. Usually you will find the defined services in the START profile of the related instance (Attention: with SAP Release 7.10 the START profile content was moved to the instance profile). Not all of those services are worth to monitor by the cluster. For example you properly do not like to failover your SAP instance, if the central syslog collector daemon fails.
Those services are monitored within the SAPInstance resource agent:
- disp+work
- msg_server
- enserver (ENSA1)
- enq_server (ENSA2)
- enrepserver (ENSA1)
- enq_replicator (ENSA2)
- jcontrol
- jstart
Some other services could be monitored as well. They have to be
given with the parameter MONITOR_SERVICES, e.g.:
- sapwebdisp
- TREXDaemon.x
That names match the strings used in the output of the command 'sapcontrol -nr [Instance-Nr] -function GetProcessList'.
The default should fit most cases where you want to manage a SAP Instance from the cluster. You may change this with this parameter, if you like to monitor more/less or other services that sapstartsrv supports.
You may specify multiple services separated by a | (pipe) sign in this parameter: disp+work|msg_server|enserver
</longdesc>
<shortdesc lang="en">Services to monitor</shortdesc>
<content type="string" default="disp+work|msg_server|enserver|enrepserver|jcontrol|jstart|enq_server|enq_replicator"/>
</parameter>
<parameter name="SHUTDOWN_METHOD" unique="0" required="0">
<longdesc lang="en">Usual a SAP Instance is stopped by the command 'sapcontrol -nr InstanceNr -function Stop'. SHUTDOWN_METHOD=KILL means to kill the SAP Instance using OS commands. SAP processes of the instance are terminated with 'kill -9', shared memory is deleted with 'cleanipc' and the 'kill.sap' file will be deleted. That method is much faster than the gracefull stop, but the instance does not have the chance to say goodbye to other SAPinstances in the same system. USE AT YOUR OWN RISK !!</longdesc>
<shortdesc lang="en">Shutdown graceful or kill a SAP instance by terminating the processes. (normal|KILL)</shortdesc>
<content type="string" default="normal"/>
</parameter>
<parameter name="ERS_InstanceName" unique="1" required="0">
<longdesc lang="en">Only used in a Master/Slave resource configuration:
The full qualified SAP enqueue replication instance name. e.g. P01_ERS02_sapp01ers. Usually this is the name of the SAP instance profile.
The enqueue replication instance must be installed, before you want to configure a master-slave cluster recource.
The master-slave configuration in the cluster must use this properties:
clone_max = 2
clone_node_max = 1
master_node_max = 1
master_max = 1
</longdesc>
<shortdesc lang="en">Enqueue replication instance name: SID_INSTANCE_VIR-HOSTNAME</shortdesc>
<content type="string" default=""/>
</parameter>
<parameter name="ERS_START_PROFILE" unique="1" required="0">
<longdesc lang="en">Only used in a Master/Slave resource configuration:
The parameter ERS_InstanceName must also be set in this configuration.
The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than.
</longdesc>
<shortdesc lang="en">Enqueue replication start profile name</shortdesc>
<content type="string" default=""/>
</parameter>
<parameter name="PRE_START_USEREXIT" unique="0" required="0">
<longdesc lang="en">The full qualified path where to find a script or program which should be executed before this resource gets started.</longdesc>
<shortdesc lang="en">Path to a pre-start script</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="POST_START_USEREXIT" unique="0" required="0">
<longdesc lang="en">The full qualified path where to find a script or program which should be executed after this resource got started.</longdesc>
<shortdesc lang="en">Path to a post-start script</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="PRE_STOP_USEREXIT" unique="0" required="0">
<longdesc lang="en">The full qualified path where to find a script or program which should be executed before this resource gets stopped.</longdesc>
<shortdesc lang="en">Path to a pre-start script</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="POST_STOP_USEREXIT" unique="0" required="0">
<longdesc lang="en">The full qualified path where to find a script or program which should be executed after this resource got stopped.</longdesc>
<shortdesc lang="en">Path to a post-start script</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="IS_ERS" unique="0" required="0">
<longdesc lang="en">Only used for ASCS/ERS SAP Netweaver installations without implementing a master/slave resource to
allow the ASCS to 'find' the ERS running on another cluster node after a resource failure. This parameter should be set
to true 'only' for the ERS instance for implementations following the SAP NetWeaver 7.40 HA certification (NW-HA-CLU-740). This includes also
systems for NetWeaver less than 7.40, if you like to impelemnt the NW-HA-CLU-740 scenario.
</longdesc>
<shortdesc lang="en">Mark SAPInstance as ERS instance</shortdesc>
<content type="boolean" default="false" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="180s" />
<action name="stop" timeout="240s" />
<action name="status" timeout="60s" />
<action name="monitor" depth="0" timeout="60s" interval="120s" />
<action name="monitor" depth="0" timeout="60s" interval="121s" role="Slave" />
<action name="monitor" depth="0" timeout="60s" interval="119s" role="Master" />
<action name="promote" timeout="320s" />
<action name="demote" timeout="320s" />
-<action name="reload" timeout="320" />
+<action name="reload" timeout="320s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
<action name="methods" timeout="5s" />
</actions>
</resource-agent>
END
}
#
# sapinstance_methods : List every action this agent implements, one per
#                       line. sapinstance_usage joins these with '|' for
#                       its usage string.
#
sapinstance_methods() {
  local action
  for action in start stop status monitor promote demote reload notify \
                validate-all methods meta-data usage
  do
    printf '%s\n' "$action"
  done
}
#
# is_clone : find out if we are configured to run in a Master/Slave configuration
#
# NOTE(review): the return convention is the inverse of the usual shell
# "true": returns 1 when a valid Master/Slave clone configuration is
# present and 0 when running as a plain primitive. Exits the agent on a
# misconfigured clone.
#
is_clone() {
# clone_max > 0 means the CRM runs us as a clone / master-slave resource.
if [ -n "$OCF_RESKEY_CRM_meta_clone_max" ] \
&& [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ]
then
# Exactly one master and one slave, one copy per node, is required.
if [ "$OCF_RESKEY_CRM_meta_clone_max" -ne 2 ] || \
[ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] || \
[ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] || \
[ "$OCF_RESKEY_CRM_meta_master_max" -ne 1 ]
then
ocf_log err "Clone options misconfigured. (expect: clone_max=2,clone_node_max=1,master_node_max=1,master_max=1)"
exit $OCF_ERR_CONFIGURED
fi
if [ -z "$OCF_RESKEY_ERS_InstanceName" ]
then
ocf_log err "In a Master/Slave configuration the ERS_InstanceName parameter is mandatory."
exit $OCF_ERR_ARGS
fi
else
return 0
fi
return 1
}
#
# abnormal_end : essential things are missing, but in the nature of a SAP
#                installation - which can be very different from customer to
#                customer - we cannot handle this always as an error.
#                This would be the case, if the software is installed on
#                shared disks and not visible to all cluster nodes at all times.
#
# $1 - human readable error message; this function never returns (it exits).
#
abnormal_end() {
  local err_msg="$1"

  # During a probe a missing installation simply means "not running here",
  # so report the plain OS-level status instead of an error.
  ocf_is_probe && {
    sapinstance_status
    exit $?
  }

  # Quote the message: the original passed it unquoted, which collapsed
  # whitespace and risked glob expansion of characters like '*'.
  ocf_log err "$err_msg"

  # A stop must not fail just because the software is unreachable: clean up
  # whatever is left and report success so the cluster can continue.
  if [ "$ACTION" = "stop" ]
  then
    cleanup_instance
    exit $OCF_SUCCESS
  fi

  exit $OCF_ERR_CONFIGURED
}
#
# sapinstance_init : Define global variables with default values, if optional parameters are not set
#
# Parses "$1" (SID_INSTANCE_VIRHOST, e.g. P01_DVEBMGS00_sapp01ci) into the
# globals SID, InstanceName, InstanceNr and SAPVIRHOST, then derives
# DIR_EXECUTABLE/SAPSTARTSRV/SAPCONTROL, DIR_PROFILE, SAPSTARTPROFILE,
# sidadm, is_ers and the monitoring defaults. Aborts via abnormal_end when
# the SAP executables cannot be located.
#
sapinstance_init() {
local myInstanceName="$1"
SID=`echo "$myInstanceName" | cut -d_ -f1`
InstanceName=`echo "$myInstanceName" | cut -d_ -f2`
# Instance number = the trailing two digits of the instance name.
InstanceNr=`echo "$InstanceName" | sed 's/.*\([0-9][0-9]\)$/\1/'`
SAPVIRHOST=`echo "$myInstanceName" | cut -d_ -f3`
# optional OCF parameters, we try to guess which directories are correct
if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ]
then
# Prefer the instance-local exe directory, fall back to the SYS-wide one.
if have_binary /usr/sap/$SID/$InstanceName/exe/sapstartsrv && have_binary /usr/sap/$SID/$InstanceName/exe/sapcontrol
then
DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe"
SAPSTARTSRV="/usr/sap/$SID/$InstanceName/exe/sapstartsrv"
SAPCONTROL="/usr/sap/$SID/$InstanceName/exe/sapcontrol"
elif have_binary /usr/sap/$SID/SYS/exe/run/sapstartsrv && have_binary /usr/sap/$SID/SYS/exe/run/sapcontrol
then
DIR_EXECUTABLE="/usr/sap/$SID/SYS/exe/run"
SAPSTARTSRV="/usr/sap/$SID/SYS/exe/run/sapstartsrv"
SAPCONTROL="/usr/sap/$SID/SYS/exe/run/sapcontrol"
fi
else
if have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" && have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol"
then
DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE"
SAPSTARTSRV="$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv"
SAPCONTROL="$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol"
fi
fi
# OS user of the SAP administrator, e.g. p01adm for SID P01.
sidadm="`echo $SID | tr '[:upper:]' '[:lower:]'`adm"
[ -z "$DIR_EXECUTABLE" ] && abnormal_end "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!"
if [ -z "$OCF_RESKEY_DIR_PROFILE" ]
then
DIR_PROFILE="/usr/sap/$SID/SYS/profile"
else
DIR_PROFILE="$OCF_RESKEY_DIR_PROFILE"
fi
# When called with the ERS instance name, pick the ERS start profile.
if [ "$myInstanceName" != "$OCF_RESKEY_InstanceName" ]
then
currentSTART_PROFILE=$OCF_RESKEY_ERS_START_PROFILE
else
currentSTART_PROFILE=$OCF_RESKEY_START_PROFILE
fi
if [ -z "$OCF_RESKEY_IS_ERS" ]; then
is_ers="no"
else
is_ers="$OCF_RESKEY_IS_ERS"
fi
if [ -z "$currentSTART_PROFILE" ]
then
# SAP >= 7.10 has no separate START profile anymore; fall back to the
# instance profile when no START profile is readable but the instance
# profile is.
if [ ! -r "$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}" -a -r "$DIR_PROFILE/${SID}_${InstanceName}_${SAPVIRHOST}" ]; then
SAPSTARTPROFILE="$DIR_PROFILE/${SID}_${InstanceName}_${SAPVIRHOST}"
else
SAPSTARTPROFILE="$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}"
fi
else
SAPSTARTPROFILE="$currentSTART_PROFILE"
fi
if [ -z "$OCF_RESKEY_START_WAITTIME" ]
then
export OCF_RESKEY_START_WAITTIME=3600
fi
if [ -z "$OCF_RESKEY_MONITOR_SERVICES" ]
then
export OCF_RESKEY_MONITOR_SERVICES="disp+work|msg_server|enserver|enrepserver|jcontrol|jstart|enq_server|enq_replicator"
fi
# as root user we need the library path to the SAP kernel to be able to call sapcontrol
if [ `echo $LD_LIBRARY_PATH | grep -c "^$DIR_EXECUTABLE\>"` -eq 0 ]; then
LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH
export LD_LIBRARY_PATH
fi
return $OCF_SUCCESS
}
#
# check_sapstartsrv : Before using sapcontrol we make sure that the sapstartsrv is running for the correct instance.
#                     We cannot use sapinit and the /usr/sap/sapservices file in case of an enquerep instance,
#                     because then we have two instances with the same instance number.
#
# Returns OCF_SUCCESS when a responsive sapstartsrv for this instance is
# available (restarting it if necessary), OCF_ERR_GENERIC when a restart
# failed, or OCF_NOT_RUNNING for a failed restart during a probe.
#
check_sapstartsrv() {
  local restart=0
  local runninginst=""
  local chkrc=$OCF_SUCCESS
  local output=""

  # sapstartsrv listens on the Unix domain socket /tmp/.sapstream5<NR>13;
  # a missing socket means the daemon is not running at all.
  if [ ! -S /tmp/.sapstream5${InstanceNr}13 ]; then
    ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName (no UDS), it will be started now"
    restart=1
  else
    output=`$SAPCONTROL -nr $InstanceNr -function ParameterValue INSTANCE_NAME -format script`
    if [ $? -eq 0 ]
    then
      runninginst=`echo "$output" | grep '^0 : ' | cut -d' ' -f3`
      if [ "$runninginst" != "$InstanceName" ]
      then
        # Another instance with the same number owns the socket (enqueue
        # replication case) - that daemon has to go.
        ocf_log warn "sapstartsrv is running for instance $runninginst, that service will be killed"
        restart=1
      else
        output=`$SAPCONTROL -nr $InstanceNr -function AccessCheck Start`
        if [ $? -ne 0 ]; then
          ocf_log warn "FAILED : sapcontrol -nr $InstanceNr -function AccessCheck Start (`ls -ld1 /tmp/.sapstream5${InstanceNr}13`)"
          ocf_log warn "sapstartsrv will be restarted to try to solve this situation, otherwise please check sapstartsrv setup (SAP Note 927637)"
          restart=1
        fi
      fi
    else
      ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName, it will be started now"
      restart=1
    fi
  fi

  if [ -z "$runninginst" ]; then runninginst=$InstanceName; fi

  if [ $restart -eq 1 ]
  then
    if [ -d /usr/sap/$SID/SYS/profile/ ]
    then
      DIR_PROFILE="/usr/sap/$SID/SYS/profile"
    else
      abnormal_end "Expected /usr/sap/$SID/SYS/profile/ to be a directory, please set DIR_PROFILE parameter!"
    fi

    [ ! -r $SAPSTARTPROFILE ] && abnormal_end "Expected $SAPSTARTPROFILE to be the instance START profile, please set START_PROFILE parameter!"

    pkill -9 -f "sapstartsrv.*$runninginst"

    # removing the unix domain socket files as they might have wrong permissions
    # or ownership - they will be recreated by sapstartsrv during next start
    rm -f /tmp/.sapstream5${InstanceNr}13
    rm -f /tmp/.sapstream5${InstanceNr}14

    $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm

    # now make sure the daemon has been started and is able to respond
    local srvrc=1
    while [ $srvrc -eq 1 ] && [ `pgrep -f "sapstartsrv.*$runninginst" | wc -l` -gt 0 ]
    do
      sleep 1
      $SAPCONTROL -nr $InstanceNr -function GetProcessList > /dev/null 2>&1
      srvrc=$?
    done

    if [ $srvrc -ne 1 ]
    then
      ocf_log info "sapstartsrv for instance $SID-$InstanceName was restarted !"
      chkrc=$OCF_SUCCESS
    else
      # Fix: "error" is not a valid ocf_log severity - use "err" so the
      # message is actually logged at error level.
      ocf_log err "sapstartsrv for instance $SID-$InstanceName could not be started!"
      chkrc=$OCF_ERR_GENERIC
      ocf_is_probe && chkrc=$OCF_NOT_RUNNING
    fi
  fi

  return $chkrc
}
#
# sapuserexit : Hook point for customer-provided scripts around start/stop.
#               Many SAP customers need some additional processes/tools to
#               run their SAP systems; this lets them plug a script into the
#               agent without modifying it. The hook always reports success,
#               so a broken user exit can never fail the cluster operation.
#
# $1 - name of the hook (for logging), $2 - path of the script to run.
#
sapuserexit() {
  local exit_name="$1"
  local exit_script="$2"

  # Nothing configured for this hook -> nothing to do.
  [ -z "$exit_script" ] && return 0

  if ! have_binary "$exit_script"
  then
    ocf_log warn "Attribute ${exit_name} is set to ${exit_script}, but this file is not executable"
    return 0
  fi

  ocf_log info "Calling userexit ${exit_name} with customer script file ${exit_script}"
  "$exit_script" >/dev/null 2>&1
  ocf_log info "Exiting userexit ${exit_name} with customer script file ${exit_script}, returncode: $?"
  return 0
}
#
# cleanup_instance : remove resources (processes and shared memory) from a crashed instance)
#
# Hard cleanup used by AUTOMATIC_RECOVER and SHUTDOWN_METHOD=KILL.
# Best effort: always returns 0.
#
cleanup_instance() {
# Kill every process of this instance owned by the SAP admin user.
pkill -9 -f -U $sidadm $InstanceName
ocf_log info "Terminated instance using 'pkill -9 -f -U $sidadm $InstanceName'"
# it is necessary to call cleanipc as user sidadm if the system has 'vmcj/enable = ON' set - otherwise SHM-segments in /dev/shm/SAP_ES2* cannot be removed
su - $sidadm -c "cleanipc $InstanceNr remove"
ocf_log info "Tried to remove shared memory resources using 'cleanipc $InstanceNr remove' as user $sidadm"
# Remove the state files sapstart leaves behind so the next start is clean.
ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/kill.sap
ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/shutdown.sap
ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgcpid
ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgspid
return 0
}
#
# sapinstance_start : Start the SAP instance
#
# Runs the PRE_START user exit, ensures sapstartsrv is up, requests a Start
# via sapcontrol and waits up to START_WAITTIME for the instance to come up.
# Makes one automatic recovery attempt (cleanup_instance) when
# AUTOMATIC_RECOVER is true. Returns OCF_SUCCESS, or OCF_NOT_RUNNING /
# OCF_ERR_GENERIC on failure.
#
sapinstance_start() {
sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT"
local rc=$OCF_NOT_RUNNING
local output=""
local loopcount=0
# Outer loop: at most two start attempts (second one after auto-recovery).
while [ $loopcount -lt 2 ]
do
loopcount=$(($loopcount + 1))
check_sapstartsrv
rc=$?
if [ $rc -eq $OCF_SUCCESS ]; then
output=`$SAPCONTROL -nr $InstanceNr -function Start`
rc=$?
ocf_log info "Starting SAP Instance $SID-$InstanceName: $output"
fi
if [ $rc -ne 0 ]
then
ocf_log err "SAP Instance $SID-$InstanceName start failed."
return $OCF_ERR_GENERIC
fi
# Inner loop: poll WaitforStarted. startrc 0 = started, 1 = keep
# waiting, -1 = give up this attempt (possibly retry after recovery).
local startrc=1
while [ $startrc -gt 0 ]
do
local waittime_start=`date +%s`
output=`$SAPCONTROL -nr $InstanceNr -function WaitforStarted $OCF_RESKEY_START_WAITTIME 10`
startrc=$?
local waittime_stop=`date +%s`
if [ $startrc -ne 0 ]
then
if [ $(($waittime_stop - $waittime_start)) -ge $OCF_RESKEY_START_WAITTIME ]
then
# Full START_WAITTIME elapsed: accept a YELLOW instance whose
# monitor succeeds as started (double-stack J2EE-Addin case).
sapinstance_monitor NOLOG
if [ $? -eq $OCF_SUCCESS ]
then
output="START_WAITTIME ($OCF_RESKEY_START_WAITTIME) has elapsed, but instance monitor returned SUCCESS. Instance considered running."
startrc=0; loopcount=2
fi
else
# Premature failure: clean up and retry once if configured,
# otherwise give up.
if [ $loopcount -eq 1 ] && ocf_is_true $OCF_RESKEY_AUTOMATIC_RECOVER
then
ocf_log warn "SAP Instance $SID-$InstanceName start failed: $output"
ocf_log warn "Try to recover $SID-$InstanceName"
cleanup_instance
else
loopcount=2
fi
startrc=-1
fi
else
loopcount=2
fi
done
done
if [ $startrc -eq 0 ]
then
ocf_log info "SAP Instance $SID-$InstanceName started: $output"
rc=$OCF_SUCCESS
sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT"
# Advertise a running ERS via a node attribute (NW 7.40 ENSA scenario).
if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 1 -l reboot; fi
else
ocf_log err "SAP Instance $SID-$InstanceName start failed: $output"
rc=$OCF_NOT_RUNNING
if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 0 -l reboot; fi
fi
return $rc
}
#
# sapinstance_recover : Try startup of a failed instance by first wiping
#                       leftover processes / IPC resources, then retrying
#                       the start. The function's status is the status of
#                       that start attempt.
#
sapinstance_recover() {
  cleanup_instance
  sapinstance_start
}
#
# sapinstance_stop: Stop the SAP instance
#
# Honours SHUTDOWN_METHOD=KILL (hard cleanup instead of a graceful stop);
# otherwise asks sapcontrol to Stop and waits up to one hour for completion.
# Runs the PRE/POST_STOP user exits around the stop.
#
sapinstance_stop() {
local output=""
local rc
sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT"
if [ "$OCF_RESKEY_SHUTDOWN_METHOD" = "KILL" ]
then
ocf_log info "Stopping SAP Instance $SID-$InstanceName with shutdown method KILL!"
cleanup_instance
return $OCF_SUCCESS
fi
check_sapstartsrv
rc=$?
if [ $rc -eq $OCF_SUCCESS ]; then
output=`$SAPCONTROL -nr $InstanceNr -function Stop`
rc=$?
ocf_log info "Stopping SAP Instance $SID-$InstanceName: $output"
fi
if [ $rc -eq 0 ]
then
# Poll once per second, up to 3600s, until the instance is fully down.
output=`$SAPCONTROL -nr $InstanceNr -function WaitforStopped 3600 1`
if [ $? -eq 0 ]
then
ocf_log info "SAP Instance $SID-$InstanceName stopped: $output"
rc=$OCF_SUCCESS
else
ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output"
rc=$OCF_ERR_GENERIC
fi
else
ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output"
rc=$OCF_ERR_GENERIC
fi
sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT"
# The ERS is no longer running on this node - clear the node attribute.
if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 0 -l reboot; fi
return $rc
}
#
# sapinstance_monitor: Can the given SAP instance do anything useful?
#
# Asks sapcontrol for the process list and checks the dispstatus colour of
# every service matching MONITOR_SERVICES (GREEN/YELLOW = OK, RED/GRAY =
# not running). Pass NOLOG as $1 to suppress error logging.
#
sapinstance_monitor() {
local MONLOG=$1
local rc
check_sapstartsrv
rc=$?
if [ $rc -eq $OCF_SUCCESS ]
then
local count=0
local SERVNO
local output
output=`$SAPCONTROL -nr $InstanceNr -function GetProcessList -format script`
# we have to parse the output, because the returncode doesn't tell anything about the instance status
for SERVNO in `echo "$output" | grep '^[0-9] ' | cut -d' ' -f1 | sort -u`
do
local COLOR=`echo "$output" | grep "^$SERVNO dispstatus: " | cut -d' ' -f3`
local SERVICE=`echo "$output" | grep "^$SERVNO name: " | cut -d' ' -f3`
local STATE=0
local SEARCH
case $COLOR in
GREEN|YELLOW) STATE=$OCF_SUCCESS;;
*) STATE=$OCF_NOT_RUNNING;;
esac
# Escape '+' and '.' so service names are matched literally by egrep.
SEARCH=`echo "$OCF_RESKEY_MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'`
if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ]
then
if [ $STATE -eq $OCF_NOT_RUNNING ]
then
[ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !"
rc=$STATE
fi
# Remember that at least one monitored service was seen.
count=1
fi
done
# No monitored service found at all: NOT_RUNNING during a probe,
# otherwise a configuration/monitoring error.
if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ]
then
if ocf_is_probe
then
rc=$OCF_NOT_RUNNING
else
[ "$MONLOG" != "NOLOG" ] && ocf_log err "The SAP instance does not run any services which this RA could monitor!"
rc=$OCF_ERR_GENERIC
fi
fi
fi
return $rc
}
#
# sapinstance_status: Lightweight check of SAP instance only with OS tools
#
# Returns OCF_SUCCESS when at least one PID recorded in kill.sap still
# exists as a process of the SAP admin user, OCF_NOT_RUNNING otherwise.
#
sapinstance_status() {
local pid
local pids
# kill.sap is written by sapstart and records the instance PIDs; without
# it the instance cannot be running (from the OS point of view).
[ ! -f "/usr/sap/$SID/$InstanceName/work/kill.sap" ] && return $OCF_NOT_RUNNING
pids=`grep '^kill -[0-9]' /usr/sap/$SID/$InstanceName/work/kill.sap | awk '{print $3}'`
for pid in $pids
do
[ `pgrep -f -U $sidadm $InstanceName | grep -c $pid` -gt 0 ] && return $OCF_SUCCESS
done
return $OCF_NOT_RUNNING
}
#
# sapinstance_validate: Check the semantics of the input parameters
#
# Validates the pieces sapinstance_init parsed out of InstanceName.
# Returns OCF_SUCCESS or OCF_ERR_ARGS.
#
sapinstance_validate() {
local rc=$OCF_SUCCESS
# SID: exactly three characters, uppercase letter first.
if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ]
then
ocf_log err "Parsing instance profile name: '$SID' is not a valid system ID!"
rc=$OCF_ERR_ARGS
fi
# Instance name: starts with an uppercase letter, ends in two digits.
if [ `echo "$InstanceName" | grep -c '^[A-Z].*[0-9][0-9]$'` -ne 1 ]
then
ocf_log err "Parsing instance profile name: '$InstanceName' is not a valid instance name!"
rc=$OCF_ERR_ARGS
fi
# Instance number: exactly two digits.
if [ `echo "$InstanceNr" | grep -c '^[0-9][0-9]$'` -ne 1 ]
then
ocf_log err "Parsing instance profile name: '$InstanceNr' is not a valid instance number!"
rc=$OCF_ERR_ARGS
fi
# Virtual hostname: letter first, then letters/digits/underscore/hyphen.
if [ `echo "$SAPVIRHOST" | grep -c '^[A-Za-z][A-Za-z0-9_-]*$'` -ne 1 ]
then
ocf_log err "Parsing instance profile name: '$SAPVIRHOST' is not a valid hostname!"
rc=$OCF_ERR_ARGS
fi
return $rc
}
#
# sapinstance_start_clone
#
# Slave-side start in a Master/Slave setup: start the ERS instance and
# announce a moderate master preference (50) for this node.
#
sapinstance_start_clone() {
sapinstance_init $OCF_RESKEY_ERS_InstanceName
${HA_SBIN_DIR}/crm_master -v 50 -l reboot
sapinstance_start
return $?
}
#
# sapinstance_stop_clone
#
# Slave-side stop in a Master/Slave setup: drop this node's master
# preference to 0, then stop the ERS instance.
#
sapinstance_stop_clone() {
sapinstance_init $OCF_RESKEY_ERS_InstanceName
${HA_SBIN_DIR}/crm_master -v 0 -l reboot
sapinstance_stop
return $?
}
#
# sapinstance_monitor_clone
#
# Master/Slave monitor: first check the primary (SCS) instance, then the
# ERS instance, adjusting this node's master preference accordingly.
#
sapinstance_monitor_clone() {
# first check with the status function (OS tools) if there could be something like a SAP instance running
# as we do not know here, if we are in master or slave state we do not want to start our monitoring
# agents (sapstartsrv) on the wrong host
local rc
sapinstance_init $OCF_RESKEY_InstanceName
if sapinstance_status; then
if sapinstance_monitor; then
${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot
return $OCF_RUNNING_MASTER
fi
# by nature of the SAP enqueue server we have to make sure
# that we do a failover to the slave (enqueue replication server)
# in case the enqueue process has failed. We signal this to the
# cluster by setting our master preference to a lower value than the slave.
${HA_SBIN_DIR}/crm_master -v 10 -l reboot
return $OCF_FAILED_MASTER
fi
# No primary instance here - check whether the ERS instance runs instead.
sapinstance_init $OCF_RESKEY_ERS_InstanceName
sapinstance_status && sapinstance_monitor
rc=$?
if [ $rc -eq $OCF_SUCCESS ]; then
${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot
fi
return $rc
}
#
# sapinstance_promote_clone: In a Master/Slave configuration get Master by starting the SCS instance and stopping the ERS instance
# The order is important here to behave correct from the application levels view
#
sapinstance_promote_clone() {
    local rc

    # Becoming master means running the central services (SCS) instance.
    sapinstance_init $OCF_RESKEY_InstanceName
    ocf_log info "Promoting $SID-$InstanceName to running Master."

    # Start the SCS instance first; only on success stop the ERS
    # instance afterwards -- this ordering matters to the application.
    if sapinstance_start; then
        sapinstance_init $OCF_RESKEY_ERS_InstanceName
        sapinstance_stop
        rc=$?
    else
        rc=$?
    fi

    return $rc
}
#
# sapinstance_demote_clone: In a Master/Slave configuration get Slave by stopping the SCS instance and starting the ERS instance
#
sapinstance_demote_clone() {
    local rc

    # Becoming slave means stopping the SCS instance, then starting the
    # ERS instance -- the application requires exactly this order.
    sapinstance_init $OCF_RESKEY_InstanceName
    ocf_log info "Demoting $SID-$InstanceName to a slave."

    if sapinstance_stop; then
        sapinstance_init $OCF_RESKEY_ERS_InstanceName
        sapinstance_start
        rc=$?
    else
        rc=$?
    fi

    return $rc
}
#
# sapinstance_notify: Handle master scoring - to make sure a slave gets the next master
#
#
# sapinstance_notify: Handle master scoring - to make sure a slave gets the next master
#
sapinstance_notify() {
    local n_type="$OCF_RESKEY_CRM_meta_notify_type"
    local n_op="$OCF_RESKEY_CRM_meta_notify_operation"

    if [ "${n_type}_${n_op}" = "post_promote" ]; then
        # After promotion of one master in the cluster, we make sure that all clones reset their master
        # value back to 100. This is because a failed monitor on a master might have degree one clone
        # instance to score 10.
        ${HA_SBIN_DIR}/crm_master -v 100 -l reboot
    elif [ "${n_type}_${n_op}" = "pre_demote" ]; then
        # if we are a slave and a demote event is announced, make sure we are highest on the list to become master
        # that is, when a slave resource was started after the promote event of an already running master (e.g. node of slave was down)
        # We also have to make sure to overrule the globally set resource_stickiness or any fail-count factors => INFINITY
        local n_uname="$OCF_RESKEY_CRM_meta_notify_demote_uname"

        # Bug fix: both operands must be quoted. With an unset/empty
        # demote uname the unquoted test expanded to "[ != node ]",
        # which is a syntax error instead of a comparison.
        if [ "${n_uname}" != "${NODENAME}" ]; then
            ${HA_SBIN_DIR}/crm_master -v INFINITY -l reboot
        fi
    fi
}
#
# 'main' starts here...
#
## GLOBALS
# Instance identity and derived paths; these are filled in by
# sapinstance_init from OCF_RESKEY_InstanceName (or the ERS variant).
SID=""
sidadm=""
InstanceName=""
InstanceNr=""
SAPVIRHOST=""
DIR_EXECUTABLE=""
SAPSTARTSRV=""
SAPCONTROL=""
DIR_PROFILE=""
SAPSTARTPROFILE=""
CLONE=0
NODENAME=$(ocf_local_nodename)

# Exactly one action argument is required.
if
  ( [ $# -ne 1 ] )
then
  sapinstance_usage
  exit $OCF_ERR_ARGS
fi

ACTION=$1

# "status" is treated as a synonym for "monitor".
if [ "$ACTION" = "status" ]; then
  ACTION=monitor
fi

# These operations don't require OCF instance parameters to be set
case "$ACTION" in
  usage|methods) sapinstance_$ACTION
                 exit $OCF_SUCCESS;;
  meta-data)     sapinstance_meta_data
                 exit $OCF_SUCCESS;;
  notify)        sapinstance_notify
                 exit $OCF_SUCCESS;;
  *);;
esac

# Everything below manipulates the SAP instance and needs root.
if ! ocf_is_root
then
  ocf_log err "$0 must be run as root"
  exit $OCF_ERR_PERM
fi

# parameter check
if [ -z "$OCF_RESKEY_InstanceName" ]
then
  ocf_log err "Please set OCF_RESKEY_InstanceName to the name to the SAP instance profile!"
  exit $OCF_ERR_ARGS
fi

# In a clone (master/slave) configuration the *_clone action variants
# are dispatched and initialization happens per action; otherwise
# initialize once here and reject promote/demote.
is_clone; CLONE=$?

if [ ${CLONE} -eq 1 ]
then
  CLACT=_clone
else
  if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ]
  then
    ocf_log err "$ACTION called in a non master/slave environment"
    exit $OCF_ERR_ARGS
  fi
  sapinstance_init $OCF_RESKEY_InstanceName
fi

# What kind of method was invoked?
case "$ACTION" in
  start|stop|monitor|promote|demote) sapinstance_$ACTION$CLACT
                                     exit $?;;
  validate-all) sapinstance_validate
                exit $?;;
  reload )
        ocf_log info "reloading SAPInstance parameters"
        exit $OCF_SUCCESS;;
  *)    sapinstance_methods
        exit $OCF_ERR_UNIMPLEMENTED;;
esac
diff --git a/heartbeat/aliyun-vpc-move-ip b/heartbeat/aliyun-vpc-move-ip
index c004d26fc..3091a6d96 100755
--- a/heartbeat/aliyun-vpc-move-ip
+++ b/heartbeat/aliyun-vpc-move-ip
@@ -1,298 +1,298 @@
#!/bin/sh
#
# OCF resource agent to move an IP address within a VPC in the Aliyun
# Based on code of Markus Guertler (GitHub AWS-VPC-move-IP)
# Based on code of Adam Gandelman (GitHub ec2-resource-agents/elasticip)
#
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
#######################################################################
# aliyuncli doesnt work without HOME parameter
export HOME="/root"
USAGE="usage: $0 {start|stop|status|meta-data}";
###############################################################################
###############################################################################
#
# Functions
#
###############################################################################
ip_get_and_configure() {
    ocf_log debug "function: ip_get_and_configure"

    # Look up which instance currently holds the route for the address.
    # Bug fix: the original executed "$cmd" here without setting it in
    # this function, re-running whatever command line the caller last
    # stored in the global (e.g. the ping command from ecs_ip_monitor).
    cmd="aliyuncli vpc DescribeRouteTables --RouteTableId $OCF_RESKEY_routing_table --output text"
    ocf_log debug "executing command: $cmd"
    ROUTE_TO_INSTANCE="$($cmd | grep $OCF_RESKEY_address | awk '{ print $3 }')"

    if [ "$ECS_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; then
        if [ -n "$ROUTE_TO_INSTANCE" ]; then
            # Another instance owns the route entry: remove it first.
            ip_drop
        fi

        # Create the route entry pointing at this instance. The API can
        # transiently fail right after the delete above, so retry until
        # it succeeds (the cluster's operation timeout bounds this).
        cmd="aliyuncli vpc CreateRouteEntry --RouteTableId $OCF_RESKEY_routing_table --DestinationCidrBlock ${OCF_RESKEY_address}/32 --NextHopId $ECS_INSTANCE_ID --NextHopType Instance --output text"
        ocf_log debug "executing command: $cmd"
        $cmd
        rc=$?
        while [ $rc -ne 0 ]; do
            sleep 1
            ocf_log debug "executing command: $cmd"
            $cmd
            rc=$?
        done
        wait_for_started
    fi

    # Reconfigure the local ip address
    ip addr add "${OCF_RESKEY_address}/32" dev $OCF_RESKEY_interface
    rc=$?
    if [ $rc -ne 0 ]; then
        ocf_log err "command failed, rc: $rc"
        return $OCF_ERR_GENERIC
    fi

    ocf_log debug "IP added"
    return $OCF_SUCCESS
}
ip_drop() {
    ocf_log debug "function: ip_drop"

    # Remove the address from the local interface. rc 2 ("address not
    # present") is fine -- the goal is simply that it is gone.
    cmd="ip addr delete ${OCF_RESKEY_address}/32 dev $OCF_RESKEY_interface"
    ocf_log debug "executing command: $cmd"
    $cmd
    rc=$?
    if [ $rc -ne 0 ] && [ $rc -ne 2 ]; then
        ocf_log err "command failed, rc $rc"
        return $OCF_ERR_GENERIC
    fi

    # Remove the VPC route entry for the address.
    cmd="aliyuncli vpc DeleteRouteEntry --RouteTableId $OCF_RESKEY_routing_table --DestinationCidrBlock ${OCF_RESKEY_address}/32 --NextHopId $ROUTE_TO_INSTANCE --output text"
    ocf_log debug "executing command: $cmd"
    $cmd
    rc=$?
    # Bug fix: the original tested $? directly and then logged a stale
    # $rc (still holding the "ip addr delete" status) in the message.
    if [ $rc -ne 0 ]; then
        ocf_log err "command failed, rc: $rc"
        return $OCF_ERR_GENERIC
    fi

    wait_for_deleted

    ocf_log debug "IP dropped"
    return $OCF_SUCCESS
}
wait_for_started() {
    # Poll the routing table until the route for the address points at
    # this instance. NOTE(review): loops forever if that never happens;
    # presumably the cluster's operation timeout is relied upon to
    # abort -- confirm.
    cmd="aliyuncli vpc DescribeRouteTables --RouteTableId $OCF_RESKEY_routing_table --output text"
    ocf_log debug "executing command: $cmd"
    ROUTE_TO_INSTANCE="$($cmd | grep $OCF_RESKEY_address | awk '{ print $3 }')"

    while [ "$ECS_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; do
        sleep 3
        cmd="aliyuncli vpc DescribeRouteTables --RouteTableId $OCF_RESKEY_routing_table --output text"
        ocf_log debug "executing command: $cmd"
        ROUTE_TO_INSTANCE="$($cmd | grep $OCF_RESKEY_address | awk '{ print $3 }')"
    done
}
wait_for_deleted() {
    # Poll the routing table until the route entry for the address is
    # gone. Bug fix: the original executed "$cmd" for the first poll
    # without setting it, re-running the caller's stale command line --
    # in practice the DeleteRouteEntry call from ip_drop, issuing the
    # delete a second time instead of querying the table.
    cmd="aliyuncli vpc DescribeRouteTables --RouteTableId $OCF_RESKEY_routing_table --output text"
    ocf_log debug "executing command: $cmd"
    ROUTE_TO_INSTANCE="$($cmd | grep $OCF_RESKEY_address | awk '{ print $3 }')"

    while [ ! -z "$ROUTE_TO_INSTANCE" ]; do
        sleep 1
        ocf_log debug "executing command: $cmd"
        ROUTE_TO_INSTANCE="$($cmd | grep $OCF_RESKEY_address | awk '{ print $3 }')"
    done
}
ecs_ip_metadata() {
    # Print the OCF resource-agent metadata XML on stdout.
    # Bug fix: the <actions> section contained leftover unified-diff
    # "+"/"-" marker lines from an applied patch, which made the emitted
    # XML invalid; resolved to the post-patch form ("s"-suffixed times).
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="aliyun-vpc-move-ip">
<version>2.0</version>
<longdesc lang="en">
Resource Agent to move IP addresses within a VPC of the Aliyun Webservices ECS
by changing an entry in an specific routing table
</longdesc>
<shortdesc lang="en">Move IP within a VPC of the Aliyun ECS</shortdesc>
<parameters>
<parameter name="address" required="1">
<longdesc lang="en">
VPC private IP address
</longdesc>
<shortdesc lang="en">vpc ip</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="routing_table" required="1">
<longdesc lang="en">
Name of the routing table, where the route for the IP address should be changed, i.e. vtb-...
</longdesc>
<shortdesc lang="en">routing table name</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="interface" required="1">
<longdesc lang="en">
Name of the network interface, i.e. eth0
</longdesc>
<shortdesc lang="en">network interface name</shortdesc>
<content type="string" default="eth0" />
</parameter>
<parameter name="profile" required="0">
<longdesc lang="en">
Valid Aliyun CLI profile name (see 'aliyuncli configure').
See https://www.alibabacloud.com/help/doc-detail/43039.htm?spm=a2c63.p38356.b99.16.38a914abRZtOU3 for more information about aliyuncli.
</longdesc>
<shortdesc lang="en">profile name</shortdesc>
<content type="string" default="default" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="180s" />
<action name="stop" timeout="180s" />
<action name="monitor" depth="0" timeout="30s" interval="30s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}
ecs_ip_validate() {
    ocf_log debug "function: validate"

    # Bug fix: the error messages below interpolated wrong-cased
    # variables ($OCF_RESKEY_ADDRESS etc.) that are never set, so they
    # always expanded to nothing; name the missing parameter literally.

    # IP address
    if [ -z "$OCF_RESKEY_address" ]; then
        ocf_log err "IP address parameter not set: OCF_RESKEY_address!"
        exit $OCF_ERR_CONFIGURED
    fi

    # Network Interface
    if [ -z "$OCF_RESKEY_interface" ]; then
        ocf_log err "Network interface parameter not set: OCF_RESKEY_interface!"
        exit $OCF_ERR_CONFIGURED
    fi

    # Routing Table
    if [ -z "$OCF_RESKEY_routing_table" ]; then
        ocf_log err "Routing table parameter not set: OCF_RESKEY_routing_table!"
        exit $OCF_ERR_CONFIGURED
    fi

    # The instance ID comes from the ECS metadata service; without it
    # we cannot manipulate route entries.
    if [ -z "${ECS_INSTANCE_ID}" ]; then
        ocf_exit_reason "Instance ID not found. Is this a ECS instance?"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}
ecs_ip_start() {
    local rc

    ocf_log info "ECS: Moving IP address $OCF_RESKEY_address to this host by adjusting routing table $OCF_RESKEY_routing_table"

    # Nothing to do when the address is already routed here and up.
    if ecs_ip_monitor; then
        ocf_log info "ECS: $OCF_RESKEY_address already started"
        return $OCF_SUCCESS
    fi

    # Point the VPC route at this instance and plumb the address locally.
    ocf_log info "ECS: Adjusting routing table and locally configuring IP address"
    ip_get_and_configure
    rc=$?
    if [ $rc -ne 0 ]; then
        ocf_log err "Received $rc from 'aliyun cli'"
        return $OCF_ERR_GENERIC
    fi

    # Verify the result before reporting success.
    ecs_ip_monitor
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ]; then
        ocf_log err "IP address couldn't be configured on this host (IP: $OCF_RESKEY_address, Interface: $OCF_RESKEY_interface)"
        return $rc
    fi

    return $OCF_SUCCESS
}
ecs_ip_stop() {
    ocf_log info "ECS: Bringing down IP address $OCF_RESKEY_address"

    # Already down? Then there is nothing to do.
    ecs_ip_monitor
    if [ $? = $OCF_NOT_RUNNING ]; then
        ocf_log info "ECS: Address $OCF_RESKEY_address already down"
        return $OCF_SUCCESS
    fi

    # Remove the local address and the VPC route entry.
    if ! ip_drop; then
        ocf_log err "ECS: Couldn't drop IP address $OCF_RESKEY_address on interface $OCF_RESKEY_interface."
        return $OCF_ERR_GENERIC
    fi

    # Confirm the address is really gone.
    ecs_ip_monitor
    if [ $? = $OCF_NOT_RUNNING ]; then
        ocf_log info "ECS: Successfully brought down $OCF_RESKEY_address"
        return $OCF_SUCCESS
    fi

    ocf_log err "ECS: Couldn't bring down IP address $OCF_RESKEY_address on interface $OCF_RESKEY_interface."
    return $OCF_ERR_GENERIC
}
ecs_ip_monitor() {
    ocf_log debug "function: ecsip_monitor: check routing table"

    # First check: the VPC route entry for the address must point at
    # this instance (column 3 of the matching DescribeRouteTables line).
    cmd="aliyuncli vpc DescribeRouteTables --RouteTableId $OCF_RESKEY_routing_table --output text"
    ocf_log debug "executing command: $cmd"

    ROUTE_TO_INSTANCE="$($cmd |grep $OCF_RESKEY_address | awk '{ print $3 }')"

    if [ "$ECS_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; then
        ocf_log debug "not routed to this instance ($ECS_INSTANCE_ID) but to instance $ROUTE_TO_INSTANCE"
        return $OCF_NOT_RUNNING
    fi

    # Second check: the address must answer a local ping, i.e. it is
    # actually configured on this host.
    cmd="ping -W 1 -c 1 $OCF_RESKEY_address"
    ocf_log debug "executing command: $cmd"
    $cmd > /dev/null
    if [ $? -ne 0 ]; then
        ocf_log debug "IP $OCF_RESKEY_address not locally reachable via ping on this system"
        return $OCF_NOT_RUNNING
    fi

    ocf_log debug "routed in VPC and locally reachable"
    return $OCF_SUCCESS
}
###############################################################################
#
# MAIN
#
###############################################################################
# Actions that do not need the instance ID.
# NOTE(review): validate-all does not exit here and later falls into the
# catch-all branch below (OCF_ERR_UNIMPLEMENTED) -- confirm whether that
# is intended.
case $__OCF_ACTION in
  meta-data) ecs_ip_metadata
             exit $OCF_SUCCESS;;
  validate-all) ecs_ip_validate;;
esac

# This instance's ID, from the ECS metadata service (link-local address).
ECS_INSTANCE_ID="$(curl -s http://100.100.100.200/latest/meta-data/instance-id)"

# Dispatch the requested action; the script's exit status is the status
# of the handler that ran.
case $__OCF_ACTION in
  start)
    ecs_ip_validate
    ecs_ip_start;;
  stop)
    ecs_ip_stop;;
  monitor)
    ecs_ip_monitor;;
  *) exit $OCF_ERR_UNIMPLEMENTED;;
esac
diff --git a/heartbeat/gcp-vpc-move-vip.in b/heartbeat/gcp-vpc-move-vip.in
index ba61193b6..31d84643a 100755
--- a/heartbeat/gcp-vpc-move-vip.in
+++ b/heartbeat/gcp-vpc-move-vip.in
@@ -1,338 +1,338 @@
#!@PYTHON@ -tt
# ---------------------------------------------------------------------
# Copyright 2016 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ---------------------------------------------------------------------
# Description: Google Cloud Platform - Floating IP Address (Alias)
# ---------------------------------------------------------------------
import json
import logging
import os
import sys
import time
OCF_FUNCTIONS_DIR="%s/lib/heartbeat" % os.environ.get("OCF_ROOT")
sys.path.append(OCF_FUNCTIONS_DIR)
from ocf import *
try:
import googleapiclient.discovery
except ImportError:
pass
if sys.version_info >= (3, 0):
# Python 3 imports.
import urllib.parse as urlparse
import urllib.request as urlrequest
else:
# Python 2 imports.
import urllib as urlparse
import urllib2 as urlrequest
CONN = None
THIS_VM = None
ALIAS = None
METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/'
METADATA_HEADERS = {'Metadata-Flavor': 'Google'}
METADATA = \
'''<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="gcp-vpc-move-vip">
<version>1.0</version>
<longdesc lang="en">Floating IP Address on Google Cloud Platform - Using Alias IP address functionality to attach a secondary IP address to a running instance</longdesc>
<shortdesc lang="en">Floating IP Address on Google Cloud Platform</shortdesc>
<parameters>
<parameter name="alias_ip" unique="1" required="1">
<longdesc lang="en">IP Address to be added including CIDR. E.g 192.168.0.1/32</longdesc>
<shortdesc lang="en">IP Address to be added including CIDR. E.g 192.168.0.1/32</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="alias_range_name" unique="1" required="0">
<longdesc lang="en">Subnet name for the Alias IP</longdesc>
<shortdesc lang="en">Subnet name for the Alias IP</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="hostlist" unique="1" required="0">
<longdesc lang="en">List of hosts in the cluster</longdesc>
<shortdesc lang="en">Host list</shortdesc>
<content type="string" default="" />
</parameter>
<parameter name="stackdriver_logging" unique="0" required="0">
<longdesc lang="en">If enabled (set to true), IP failover logs will be posted to stackdriver logging</longdesc>
<shortdesc lang="en">Stackdriver-logging support</shortdesc>
<content type="boolean" default="" />
</parameter>
</parameters>
<actions>
- <action name="start" timeout="300" />
- <action name="stop" timeout="15" />
- <action name="monitor" timeout="15" interval="60" depth="0" />
- <action name="meta-data" timeout="15" />
- <action name="validate-all" timeout="15" />
+ <action name="start" timeout="300s" />
+ <action name="stop" timeout="15s" />
+ <action name="monitor" timeout="15s" interval="60s" depth="0" />
+ <action name="meta-data" timeout="15s" />
+ <action name="validate-all" timeout="15s" />
</actions>
</resource-agent>'''
def get_metadata(metadata_key, params=None, timeout=None):
  """Performs a GET request with the metadata headers.

  Args:
    metadata_key: string, the metadata to perform a GET request on.
    params: dictionary, the query parameters in the GET request.
    timeout: int, timeout in seconds for metadata requests.

  Returns:
    HTTP response from the GET request.

  Raises:
    urlerror.HTTPError: raises when the GET request fails.
  """
  timeout = timeout or 60
  metadata_url = os.path.join(METADATA_SERVER, metadata_key)
  params = urlparse.urlencode(params or {})
  url = '%s?%s' % (metadata_url, params)
  request = urlrequest.Request(url, headers=METADATA_HEADERS)
  # Bypass any configured proxies: the metadata server is only reachable
  # directly from the VM.
  request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({}))
  # Socket timeout is set slightly above the requested timeout.
  return request_opener.open(request, timeout=timeout * 1.1).read()
def get_instance(project, zone, instance):
  # Fetch the full instance resource through the Compute API connection
  # held in the global CONN (built in validate()).
  request = CONN.instances().get(
      project=project, zone=zone, instance=instance)
  return request.execute()
def get_network_ifaces(project, zone, instance):
  # The instance's networkInterfaces list; element 0 is nic0.
  return get_instance(project, zone, instance)['networkInterfaces']
def wait_for_operation(project, zone, operation):
  # Block until the given zonal operation reaches DONE, polling once a
  # second; raise if the finished operation reports an error.
  while True:
    result = CONN.zoneOperations().get(
        project=project,
        zone=zone,
        operation=operation['name']).execute()

    if result['status'] == 'DONE':
      if 'error' in result:
        raise Exception(result['error'])
      return
    time.sleep(1)
def set_alias(project, zone, instance, alias, alias_range_name=None):
  # Replace the alias IP ranges on the instance's nic0 with exactly
  # `alias` (or clear them when `alias` is empty). The interface's
  # current fingerprint must be sent along to guard against concurrent
  # modification.
  fingerprint = get_network_ifaces(project, zone, instance)[0]['fingerprint']
  body = {
      'aliasIpRanges': [],
      'fingerprint': fingerprint
  }
  if alias:
    obj = {'ipCidrRange': alias}
    if alias_range_name:
      obj['subnetworkRangeName'] = alias_range_name
    body['aliasIpRanges'].append(obj)

  request = CONN.instances().updateNetworkInterface(
      instance=instance, networkInterface='nic0', project=project, zone=zone,
      body=body)
  operation = request.execute()
  # updateNetworkInterface is asynchronous; wait until it completes.
  wait_for_operation(project, zone, operation)
def get_alias(project, zone, instance):
  """Return the instance's first alias IP CIDR, or '' if none is set."""
  iface = get_network_ifaces(project, zone, instance)
  try:
    return iface[0]['aliasIpRanges'][0]['ipCidrRange']
  except (KeyError, IndexError):
    # 'aliasIpRanges' may be absent (KeyError) or an empty list
    # (IndexError); both mean "no alias configured". The original only
    # caught KeyError, so an empty list crashed the agent -- the sibling
    # get_localhost_alias() already catches both.
    return ''
def get_localhost_alias():
  # Ask the local metadata server for this VM's first alias IP range;
  # return '' when no alias is configured.
  net_iface = get_metadata('instance/network-interfaces', {'recursive': True})
  net_iface = json.loads(net_iface.decode('utf-8'))
  try:
    return net_iface[0]['ipAliases'][0]
  except (KeyError, IndexError):
    return ''
def get_zone(project, instance):
  """Find the zone of a named instance by scanning the project's
  aggregated (all-zones) instance list; raises if not found."""
  fl = 'name="%s"' % instance
  request = CONN.instances().aggregatedList(project=project, filter=fl)
  while request is not None:
    response = request.execute()
    zones = response.get('items', {})
    for zone in zones.values():
      for inst in zone.get('instances', []):
        if inst['name'] == instance:
          # The zone field is a URL; keep only its last path component.
          return inst['zone'].split("/")[-1]
    request = CONN.instances().aggregatedList_next(
        previous_request=request, previous_response=response)
  raise Exception("Unable to find instance %s" % (instance))
def get_instances_list(project, exclude):
  """Return the names of all instances in the project except `exclude`,
  walking every page of the aggregated (all-zones) listing."""
  hostlist = []
  request = CONN.instances().aggregatedList(project=project)
  while request is not None:
    response = request.execute()
    zones = response.get('items', {})
    for zone in zones.values():
      for inst in zone.get('instances', []):
        if inst['name'] != exclude:
          hostlist.append(inst['name'])
    request = CONN.instances().aggregatedList_next(
        previous_request=request, previous_response=response)
  return hostlist
def gcp_alias_start(alias):
  """Attach the alias IP `alias` to this VM, removing it first from
  whichever cluster host currently holds it. Exits with an OCF status
  code instead of returning on the already-attached and failure paths."""
  my_alias = get_localhost_alias()
  my_zone = get_metadata('instance/zone').split('/')[-1]
  project = get_metadata('project/project-id')

  # If I already have the IP, exit. If it has an alias IP that isn't the VIP,
  # then remove it
  if my_alias == alias:
    logger.info(
        '%s already has %s attached. No action required' % (THIS_VM, alias))
    sys.exit(OCF_SUCCESS)
  elif my_alias:
    logger.info('Removing %s from %s' % (my_alias, THIS_VM))
    set_alias(project, my_zone, THIS_VM, '')

  # Loops through all hosts & remove the alias IP from the host that has it
  hostlist = os.environ.get('OCF_RESKEY_hostlist', '')
  if hostlist:
    hostlist = hostlist.replace(THIS_VM, '').split()
  else:
    hostlist = get_instances_list(project, THIS_VM)

  for host in hostlist:
    host_zone = get_zone(project, host)
    host_alias = get_alias(project, host_zone, host)
    if alias == host_alias:
      logger.info(
          '%s is attached to %s - Removing all alias IP addresses from %s' %
          (alias, host, host))
      set_alias(project, host_zone, host, '')
      break

  # add alias IP to localhost
  set_alias(
      project, my_zone, THIS_VM, alias,
      os.environ.get('OCF_RESKEY_alias_range_name'))

  # Check the IP has been added
  my_alias = get_localhost_alias()
  if alias == my_alias:
    logger.info('Finished adding %s to %s' % (alias, THIS_VM))
  elif my_alias:
    logger.error(
        'Failed to add IP. %s has an IP attached but it isn\'t %s' %
        (THIS_VM, alias))
    sys.exit(OCF_ERR_GENERIC)
  else:
    logger.error('Failed to add IP address %s to %s' % (alias, THIS_VM))
    sys.exit(OCF_ERR_GENERIC)
def gcp_alias_stop(alias):
  """Release the alias IP from this VM if it currently holds it;
  otherwise do nothing."""
  my_alias = get_localhost_alias()
  my_zone = get_metadata('instance/zone').split('/')[-1]
  project = get_metadata('project/project-id')

  if my_alias == alias:
    logger.info('Removing %s from %s' % (my_alias, THIS_VM))
    set_alias(project, my_zone, THIS_VM, '')
def gcp_alias_status(alias):
  """Monitor action: exit OCF_NOT_RUNNING unless this VM holds `alias`
  (returning normally signals success to the caller)."""
  my_alias = get_localhost_alias()
  if alias == my_alias:
    logger.info('%s has the correct IP address attached' % THIS_VM)
  else:
    sys.exit(OCF_NOT_RUNNING)
def validate():
  """Populate the CONN, THIS_VM and ALIAS globals, exiting with
  OCF_ERR_CONFIGURED when the Compute API, the metadata server or the
  required alias_ip parameter is unavailable."""
  global ALIAS
  global CONN
  global THIS_VM

  # Populate global vars
  try:
    CONN = googleapiclient.discovery.build('compute', 'v1')
  except Exception as e:
    logger.error('Couldn\'t connect with google api: ' + str(e))
    sys.exit(OCF_ERR_CONFIGURED)

  try:
    THIS_VM = get_metadata('instance/name')
  except Exception as e:
    logger.error('Couldn\'t get instance name, is this running inside GCE?: ' + str(e))
    sys.exit(OCF_ERR_CONFIGURED)

  ALIAS = os.environ.get('OCF_RESKEY_alias_ip')
  if not ALIAS:
    logger.error('Missing alias_ip parameter')
    sys.exit(OCF_ERR_CONFIGURED)
def configure_logs():
  """Attach a Stackdriver (Cloud Logging) handler when the
  OCF_RESKEY_stackdriver_logging parameter is set to a truthy value;
  best-effort -- falls back with an error log if the client library is
  missing."""
  # Prepare logging
  global logger
  logging.getLogger('googleapiclient').setLevel(logging.WARN)
  logging_env = os.environ.get('OCF_RESKEY_stackdriver_logging')
  if logging_env:
    logging_env = logging_env.lower()
    if any(x in logging_env for x in ['yes', 'true', 'enabled']):
      try:
        import google.cloud.logging.handlers
        client = google.cloud.logging.Client()
        handler = google.cloud.logging.handlers.CloudLoggingHandler(
            client, name=THIS_VM)
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('gcp:alias "%(message)s"')
        handler.setFormatter(formatter)
        # NOTE(review): `log`, `logger` and OCF_RESOURCE_INSTANCE are
        # presumably provided by `from ocf import *` -- confirm.
        log.addHandler(handler)
        logger = logging.LoggerAdapter(log, {'OCF_RESOURCE_INSTANCE': OCF_RESOURCE_INSTANCE})
      except ImportError:
        logger.error('Couldn\'t import google.cloud.logging, '
                     'disabling Stackdriver-logging support')
def main():
  """Dispatch on the OCF action name given as argv[1].

  NOTE(review): assumes an action argument is present (a bare
  invocation raises IndexError), and matches by substring
  ('meta-data' in sys.argv[1]) -- confirm both are intended.
  """
  if 'meta-data' in sys.argv[1]:
    print(METADATA)
    return

  validate()
  if 'validate-all' in sys.argv[1]:
    return

  configure_logs()
  if 'start' in sys.argv[1]:
    gcp_alias_start(ALIAS)
  elif 'stop' in sys.argv[1]:
    gcp_alias_stop(ALIAS)
  elif 'status' in sys.argv[1] or 'monitor' in sys.argv[1]:
    gcp_alias_status(ALIAS)
  else:
    logger.error('no such function %s' % str(sys.argv[1]))
if __name__ == "__main__":
main()
diff --git a/heartbeat/mariadb.in b/heartbeat/mariadb.in
index 860fea7fd..c1969d70e 100644
--- a/heartbeat/mariadb.in
+++ b/heartbeat/mariadb.in
@@ -1,1058 +1,1058 @@
#!@BASH_SHELL@
#
#
# MariaDB
#
# Description: Manages a MariaDB Master/Slave database as Linux-HA resource
#
# Authors: Alan Robertson: DB2 Script
# Jakub Janczak: rewrite as MySQL
# Andrew Beekhof: cleanup and import
# Sebastian Reitenbach: add OpenBSD defaults, more cleanup
# Narayan Newton: add Gentoo/Debian defaults
# Marian Marinov, Florian Haas: add replication capability
# Yves Trudeau, Baron Schwartz: add VIP support and improve replication
# Nils Carlson: add GTID support and semi-sync support
#
# Support: users@clusterlabs.org
# License: GNU General Public License (GPL)
#
# (c) 2002-2005 International Business Machines, Inc.
# 2005-2010 Linux-HA contributors
#
# See usage() function below for more details...
#
# OCF instance parameters:
# OCF_RESKEY_binary
# OCF_RESKEY_client_binary
# OCF_RESKEY_config
# OCF_RESKEY_datadir
# OCF_RESKEY_user
# OCF_RESKEY_group
# OCF_RESKEY_node_list
# OCF_RESKEY_test_table
# OCF_RESKEY_test_user
# OCF_RESKEY_test_passwd
# OCF_RESKEY_enable_creation
# OCF_RESKEY_additional_parameters
# OCF_RESKEY_log
# OCF_RESKEY_pid
# OCF_RESKEY_socket
# OCF_RESKEY_replication_user
# OCF_RESKEY_replication_passwd
# OCF_RESKEY_replication_port
#######################################################################
# Initialization:
OCF_RESKEY_node_list_default=""
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
. ${OCF_FUNCTIONS_DIR}/mysql-common.sh
#######################################################################
usage() {
  # Print a short usage/help text for the supported actions on stdout.
  cat <<UEND
usage: $0 (start|stop|validate-all|meta-data|monitor|promote|demote|notify)
$0 manages a MariaDB Database as an HA resource.
The 'start' operation starts the database.
The 'stop' operation stops the database.
The 'status' operation reports whether the database is running
The 'monitor' operation reports whether the database seems to be working
The 'promote' operation makes this mysql server run as master
The 'demote' operation makes this mysql server run as slave
The 'validate-all' operation reports whether the parameters are valid
UEND
}
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="mariadb">
<version>1.0</version>
<longdesc lang="en">
Resource script for MariaDB.
Manages a complete master/slave replication setup with GTID, for simpler
uses look at the mysql resource agent which supports older replication
forms which mysql and mariadb have in common.
The resource must be setup to use notifications. Set 'notify=true' in the metadata
attributes when defining a MariaDB master/slave instance.
The default behavior is to use uname -n values in the change master to command.
Other IPs can be specified manually by adding a node attribute
\${INSTANCE_ATTR_NAME}_mysql_master_IP giving the IP to use for replication.
For example, if the mariadb primitive you are using is p_mariadb, the
attribute to set will be p_mariadb_mysql_master_IP.
</longdesc>
<shortdesc lang="en">Manages a MariaDB master/slave instance</shortdesc>
<parameters>
<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the MariaDB server binary
</longdesc>
<shortdesc lang="en">MariaDB server binary</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>
<parameter name="client_binary" unique="0" required="0">
<longdesc lang="en">
Location of the MariaDB client binary
</longdesc>
<shortdesc lang="en">MariaDB client binary</shortdesc>
<content type="string" default="${OCF_RESKEY_client_binary_default}" />
</parameter>
<parameter name="config" unique="0" required="0">
<longdesc lang="en">
Configuration file
</longdesc>
<shortdesc lang="en">MariaDB config</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>
<parameter name="datadir" unique="0" required="0">
<longdesc lang="en">
Directory containing databases
</longdesc>
<shortdesc lang="en">MariaDB datadir</shortdesc>
<content type="string" default="${OCF_RESKEY_datadir_default}" />
</parameter>
<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running MariaDB daemon
</longdesc>
<shortdesc lang="en">MariaDB user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>
<parameter name="group" unique="0" required="0">
<longdesc lang="en">
Group running MariaDB daemon (for logfile and directory permissions)
</longdesc>
<shortdesc lang="en">MariaDB group</shortdesc>
<content type="string" default="${OCF_RESKEY_group_default}"/>
</parameter>
<parameter name="log" unique="0" required="0">
<longdesc lang="en">
The logfile to be used for mysqld.
</longdesc>
<shortdesc lang="en">MariaDB log file</shortdesc>
<content type="string" default="${OCF_RESKEY_log_default}"/>
</parameter>
<parameter name="node_list" unique="0" required="1">
<longdesc lang="en">
All node names of nodes that will execute mariadb.
Please separate each node name with a space.
This is required for the master selection to function.
</longdesc>
<shortdesc lang="en">node list</shortdesc>
<content type="string" default="${OCF_RESKEY_node_list_default}" />
</parameter>
<parameter name="pid" unique="0" required="0">
<longdesc lang="en">
The pidfile to be used for mysqld.
</longdesc>
<shortdesc lang="en">MariaDB pid file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}"/>
</parameter>
<parameter name="socket" unique="0" required="0">
<longdesc lang="en">
The socket to be used for mysqld.
</longdesc>
<shortdesc lang="en">MariaDB socket</shortdesc>
<content type="string" default="${OCF_RESKEY_socket_default}"/>
</parameter>
<parameter name="test_table" unique="0" required="0">
<longdesc lang="en">
Table to be tested in monitor statement (in database.table notation)
</longdesc>
<shortdesc lang="en">MariaDB test table</shortdesc>
<content type="string" default="${OCF_RESKEY_test_table_default}" />
</parameter>
<parameter name="test_user" unique="0" required="0">
<longdesc lang="en">
MariaDB test user, must have select privilege on test_table
</longdesc>
<shortdesc lang="en">MariaDB test user</shortdesc>
<content type="string" default="${OCF_RESKEY_test_user_default}" />
</parameter>
<parameter name="test_passwd" unique="0" required="0">
<longdesc lang="en">
MariaDB test user password
</longdesc>
<shortdesc lang="en">MariaDB test user password</shortdesc>
<content type="string" default="${OCF_RESKEY_test_passwd_default}" />
</parameter>
<parameter name="enable_creation" unique="0" required="0">
<longdesc lang="en">
If the MariaDB database does not exist, it will be created
</longdesc>
<shortdesc lang="en">Create the database if it does not exist</shortdesc>
<content type="boolean" default="${OCF_RESKEY_enable_creation_default}"/>
</parameter>
<parameter name="additional_parameters" unique="0" required="0">
<longdesc lang="en">
Additional parameters which are passed to the mysqld on startup.
(e.g. --skip-external-locking or --skip-grant-tables)
</longdesc>
<shortdesc lang="en">Additional parameters to pass to mysqld</shortdesc>
<content type="string" default="${OCF_RESKEY_additional_parameters_default}"/>
</parameter>
<parameter name="replication_user" unique="0" required="0">
<longdesc lang="en">
MariaDB replication user. This user is used for starting and stopping
MariaDB replication, for setting and resetting the master host, and for
setting and unsetting read-only mode. Because of that, this user must
have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, PROCESS and RELOAD
privileges on all nodes within the cluster. Mandatory if you define a
master-slave resource.
</longdesc>
<shortdesc lang="en">MariaDB replication user</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_user_default}" />
</parameter>
<parameter name="replication_passwd" unique="0" required="0">
<longdesc lang="en">
MariaDB replication password. Used for replication client and slave.
Mandatory if you define a master-slave resource.
</longdesc>
<shortdesc lang="en">MariaDB replication user password</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_passwd_default}" />
</parameter>
<parameter name="replication_port" unique="0" required="0">
<longdesc lang="en">
The port on which the Master MariaDB instance is listening.
</longdesc>
<shortdesc lang="en">MariaDB replication port</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_port_default}" />
</parameter>
</parameters>
<actions>
-<action name="start" timeout="120" />
-<action name="stop" timeout="120" />
-<action name="status" timeout="60" />
-<action name="monitor" depth="0" timeout="30" interval="20" />
-<action name="monitor" role="Master" depth="0" timeout="30" interval="10" />
-<action name="monitor" role="Slave" depth="0" timeout="30" interval="30" />
-<action name="promote" timeout="120" />
-<action name="demote" timeout="120" />
-<action name="notify" timeout="90" />
-<action name="validate-all" timeout="5" />
-<action name="meta-data" timeout="5" />
+<action name="start" timeout="120s" />
+<action name="stop" timeout="120s" />
+<action name="status" timeout="60s" />
+<action name="monitor" depth="0" timeout="30s" interval="20s" />
+<action name="monitor" role="Master" depth="0" timeout="30s" interval="10s" />
+<action name="monitor" role="Slave" depth="0" timeout="30s" interval="30s" />
+<action name="promote" timeout="120s" />
+<action name="demote" timeout="120s" />
+<action name="notify" timeout="90s" />
+<action name="validate-all" timeout="5s" />
+<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}
# Convenience functions
greater_than_equal_long()
{
# there are values we need to compare in this script
# that are too large for shell -gt to process
local true=$(echo "$1 > $2" | bc)
if [ "$true" -eq "1" ]; then
return 0
else
return 1
fi
}
# Compare two MariaDB GTIDs of the form <domain>-<server_id>-<sequence>
# by their third (sequence) field using the arbitrary-precision
# comparison helper. Returns 0 when $1's sequence is strictly greater.
greater_than_gtid()
{
    local seq1 seq2
    seq1=$(echo $1 | cut -d - -f 3)
    seq2=$(echo $2 | cut -d - -f 3)
    greater_than_equal_long $seq1 $seq2
}
set_gtid() {
# Publish this node's current GTID (gtid_current_pos) as a transient
# node attribute named <resource>-gtid via attrd_updater, so that peer
# nodes can compare replication positions during master election.
# Returns $OCF_ERR_GENERIC when nothing GTID-like can be read.
local gtid=$($MYSQL $MYSQL_OPTIONS_REPL \
-s -N -e "show global variables like 'gtid_current_pos'" | cut -f 2)
# Ensure that we got something like a valid GTID; MariaDB GTIDs are
# dash-separated (<domain>-<server_id>-<sequence>).
if ! echo $gtid | grep -q '-'; then
ocf_exit_reason "Unable to read GTID from MariaDB"
ocf_log err "Unable to read GTID from MariaDB"
return $OCF_ERR_GENERIC
fi
${HA_SBIN_DIR}/attrd_updater -p -n ${OCF_RESOURCE_INSTANCE}-gtid -U $gtid
}
read_gtid() {
# Fetch the GTID previously published for node $1 in attrd and print it
# to stdout; prints an empty string when the attribute cannot be read.
local node=$1
local query_result
local name
local host
local value
# This produces output of the form 'name="var-name" host="node2" value="val"'.
# This should be set at this point, because we have stored our own GTID previously.
if ! query_result=$(${HA_SBIN_DIR}/attrd_updater -p -N $node -n ${OCF_RESOURCE_INSTANCE}-gtid -Q); then
ocf_exit_reason "Unable to read GTID from attrd"
ocf_log err "Unable to read GTID from attrd"
echo ""
return
fi
# Evaluate the query result to place the variables in the local scope.
# (attrd_updater output is trusted; eval fills name/host/value above.)
eval ${query_result}
echo ${value}
}
# Delete the per-node <resource>-gtid attributes from attrd for every
# node named in OCF_RESKEY_node_list (done once a master exists and the
# stored replication positions are no longer needed).
clear_all_gtid() {
    for node in $OCF_RESKEY_node_list
    do
        ${HA_SBIN_DIR}/attrd_updater -n ${OCF_RESOURCE_INSTANCE}-gtid -N $node -D
    done
}
set_waiting_for_first_master() {
# Raise the attrd flag indicating that no master has ever been promoted
# yet, which makes master election wait for every clone instance.
${HA_SBIN_DIR}/attrd_updater -p -n ${OCF_RESOURCE_INSTANCE}-waiting-for-first-master -U true
}
waiting_for_first_master() {
# Test the waiting-for-first-master attrd flag.
# Returns 0 when the flag exists and is "true", 1 otherwise.
local query_result
local name
local host
local value
if ! query_result=$(${HA_SBIN_DIR}/attrd_updater -p -n ${OCF_RESOURCE_INSTANCE}-waiting-for-first-master -Q); then
ocf_exit_reason "Unable to read waiting-for-first-master from attrd"
ocf_log err "Unable to read waiting-for-first-master from attrd"
return 1
fi
# Evaluate the query result to place the variables in the local scope.
# (Output form: name="..." host="..." value="..."; eval fills the locals.)
eval ${query_result}
if [ "$value" = "true" ]; then
return 0
else
return 1
fi
}
# Remove the waiting-for-first-master flag from attrd (done from the
# pre-promote notification, once the first master is being promoted).
clear_waiting_for_first_master() {
    # Use the full ${HA_SBIN_DIR} path for consistency with every other
    # attrd_updater invocation in this agent.
    ${HA_SBIN_DIR}/attrd_updater -n ${OCF_RESOURCE_INSTANCE}-waiting-for-first-master -D
}
# Check whether any node in OCF_RESKEY_node_list currently carries a
# master preference. The preference is unset on reboot, so its absence
# on every node means a new master must be elected.
# Returns 0 as soon as one node with a preference is found, 1 otherwise.
have_master_with_priority() {
    for node in $OCF_RESKEY_node_list; do
        $CRM_MASTER -G -N $node >/dev/null 2>&1
        rc=$?
        [ $rc -eq 0 ] && return 0
    done
    return 1
}
attempt_to_set_master() {
# Master election: publish our own GTID, read every node's GTID from
# attrd and, once enough nodes have reported, give the node with the
# highest GTID a master preference of 100. All clones must report on
# the very first start; n-1 suffice afterwards (see below).
ocf_log info "Attempting to set master"
local expected_node_count
if waiting_for_first_master; then
# Wait for all nodes to come online
expected_node_count=$OCF_RESKEY_CRM_meta_clone_max
else
# We accept one node being down. This is not arbitrary,
# synchronous replication requires acknowledgement from
# at least one host, which means only two nodes must have
# the latest GTID. So a set of n - 1 ensures that we do
# not lose any writes.
expected_node_count=$(($OCF_RESKEY_CRM_meta_clone_max-1))
fi
# Set the gtid for this node, making it available to other nodes
set_gtid
local node_count=0
local highest_gtid=0
local master_candidate=""
for node in $OCF_RESKEY_node_list; do
local node_gtid=$(read_gtid $node)
if [ -z "$node_gtid" ]; then
continue
fi
# Got a valid gtid, increment node count
node_count=$(($node_count+1))
# Check if this is a good master candidate
if greater_than_gtid $node_gtid $highest_gtid; then
master_candidate=$node
highest_gtid=$node_gtid
fi
done
# If we managed to query a sufficient number of nodes
# then set a master
if [ $node_count -ge $expected_node_count ]; then
ocf_log info "Promoting $master_candidate to master, highest gtid $highest_gtid, queried $node_count nodes."
$CRM_MASTER -v 100 -N $master_candidate
else
ocf_log info "Not enough nodes ($node_count) contributed to select a master, need $expected_node_count nodes."
fi
}
# Sets or unsets read-only mode. Accepts one boolean as its optional
# argument; when invoked without an argument it defaults to enabling
# read-only mode, as the original contract documents. Should only be
# used in master/slave setups.
# Returns ocf_run's status: 0 if the operation succeeds, non-zero if
# it fails.
set_read_only() {
    local ro_val
    # "${1:-on}" makes the documented no-argument default ("enable
    # read-only") actually hold; an empty/missing $1 previously fell
    # through to "off".
    if ocf_is_true "${1:-on}"; then
        ro_val="on"
    else
        ro_val="off"
    fi
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "SET GLOBAL read_only=${ro_val}"
}
# Report whether the server is currently in read-only mode.
# Returns 0 when the global read_only variable is ON, 1 otherwise.
get_read_only() {
    local ro
    ro=$($MYSQL $MYSQL_OPTIONS_REPL \
        -e "SHOW VARIABLES" | grep -w read_only | awk '{print $2}')
    [ "$ro" = "ON" ]
}
# Determine whether the machine is currently running as a MariaDB
# slave, as determined per SHOW SLAVE STATUS. Returns 0 when the
# instance is in read-only mode and its slave SQL thread is running,
# 1 otherwise (including when SHOW SLAVE STATUS is empty).
is_slave() {
    local rc
    # Check whether this machine should be slave: a writable instance
    # cannot be one.
    if ! get_read_only; then
        return 1
    fi
    if get_slave_info; then
        # show slave status is not empty.
        # Is the slave sql thread running, then we are a slave!
        # Use '=' instead of '==': this agent runs under /bin/sh, where
        # '==' inside [ ] is a non-portable bashism.
        if [ "$slave_sql" = 'Yes' ]; then
            return 0
        else
            return 1
        fi
    else
        # "SHOW SLAVE STATUS" returns an empty set if instance is not a
        # replication slave
        return 1
    fi
}
# Extracts field $1 (e.g. Master_Host) from a saved "SHOW SLAVE STATUS\G"
# result stored in file $2, printing the value to stdout.
parse_slave_info() {
    # Quote the file name so paths containing whitespace do not split.
    sed -ne "s/^.* $1: \(.*\)$/\1/p" < "$2"
}
get_slave_info() {
# Populate the global replication-status variables from
# "SHOW SLAVE STATUS\G": master_host, master_user, master_port,
# master_using_gtid, master_log_file, slave_sql, slave_io, last_errno,
# last_error, secs_behind, last_io_errno, last_io_error.
# Returns $OCF_SUCCESS when slave status is available (or was already
# cached from an earlier call), $OCF_ERR_GENERIC when the instance is
# not a replication slave.
if [ "$master_log_file" -a "$master_host" ]; then
# variables are already defined, get_slave_info has been run before
return $OCF_SUCCESS
else
local tmpfile=$(mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX)
$MYSQL $MYSQL_OPTIONS_REPL \
-e 'SHOW SLAVE STATUS\G' > $tmpfile
if [ -s $tmpfile ]; then
master_host=$(parse_slave_info Master_Host $tmpfile)
master_user=$(parse_slave_info Master_User $tmpfile)
master_port=$(parse_slave_info Master_Port $tmpfile)
master_using_gtid=$(parse_slave_info Using_Gtid $tmpfile)
master_log_file=$(parse_slave_info Master_Log_File $tmpfile)
slave_sql=$(parse_slave_info Slave_SQL_Running $tmpfile)
slave_io=$(parse_slave_info Slave_IO_Running $tmpfile)
last_errno=$(parse_slave_info Last_Errno $tmpfile)
last_error=$(parse_slave_info Last_Error $tmpfile)
secs_behind=$(parse_slave_info Seconds_Behind_Master $tmpfile)
last_io_errno=$(parse_slave_info Last_IO_Errno $tmpfile)
last_io_error=$(parse_slave_info Last_IO_Error $tmpfile)
ocf_log debug "MariaDB instance running as a replication slave"
rm "$tmpfile"
else
# Instance produced an empty "SHOW SLAVE STATUS" output --
# instance is not a slave
rm "$tmpfile"
return $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
fi
}
check_slave() {
# Validate the health of a running replication slave using the status
# populated by get_slave_info. Note this function *exits* the agent on
# every failure path rather than returning:
#   - replication/IO error (other than 2003, connect failure) ->
#     exit $OCF_ERR_GENERIC (triggers recovery)
#   - stopped SQL thread or wrong master -> drop the reader VIP and
#     exit $OCF_SUCCESS to avoid a pointless MariaDB restart.
local rc new_master
get_slave_info
rc=$?
if [ $rc -eq 0 ]; then
# Check normal errors
if [ $last_errno -ne 0 ]; then
ocf_exit_reason "MariaDB slave replication has failed ($last_errno): $last_error"
exit $OCF_ERR_GENERIC
fi
# Check IO Errors, ignore 2003 which indicates a connection failure to the master
if [ $last_io_errno -ne 0 ] && [ $last_io_errno -ne 2003 ]; then
ocf_exit_reason "MariaDB slave io has failed ($last_io_errno): $last_io_error"
exit $OCF_ERR_GENERIC
fi
if [ $last_io_errno -eq 2003 ]; then
ocf_log warn "MariaDB master not reachable from slave"
fi
if [ "$slave_io" != 'Yes' ]; then
# Not necessarily a bad thing. The master may have
# temporarily shut down, and the slave may just be
# reconnecting. A warning can't hurt, though.
ocf_log warn "MariaDB Slave IO threads currently not running."
# Sanity check, are we at least on the right master
new_master=$($CRM_ATTR_REPL_INFO --query -q)
if [ "$master_host" != "$new_master" ]; then
# Not pointing to the right master, not good, removing the VIPs
set_reader_attr 0
exit $OCF_SUCCESS
fi
fi
if [ "$slave_sql" != 'Yes' ]; then
# We don't have a replication SQL thread running. Not a
# good thing. Try to recover by restarting the SQL thread
# and remove reader vip. Prevent MariaDB restart.
ocf_exit_reason "MariaDB Slave SQL threads currently not running."
# Remove reader vip
set_reader_attr 0
# try to restart slave
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
-e "START SLAVE"
# Return success to prevent a restart
exit $OCF_SUCCESS
fi
ocf_log debug "MariaDB instance running as a replication slave"
else
# Instance produced an empty "SHOW SLAVE STATUS" output --
# instance is not a slave
# TODO: Needs to handle when get_slave_info will return too many connections error
ocf_exit_reason "check_slave invoked on an instance that is not a replication slave."
exit $OCF_ERR_GENERIC
fi
}
set_master() {
local new_master=$($CRM_ATTR_REPL_INFO --query -q)
# Informs the MariaDB server of the master to replicate from. The
# master's address is read from the cluster-level replication-info
# attribute above (it is NOT passed as an argument, contrary to what
# an earlier comment claimed). The master must either be unchanged
# from the last master the slave replicated from, or freshly
# reset with RESET MASTER.
ocf_log info "Changing MariaDB configuration to replicate from $new_master."
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
-e "CHANGE MASTER TO MASTER_HOST='$new_master', \
MASTER_PORT=$OCF_RESKEY_replication_port, \
MASTER_USER='$OCF_RESKEY_replication_user', \
MASTER_PASSWORD='$OCF_RESKEY_replication_passwd', \
MASTER_USE_GTID=current_pos";
}
# Instructs the MariaDB server to stop replicating from a master host.
#
# If we're currently not configured to be replicating from any host,
# then there's nothing to do; we log a warning as no-one but the CRM
# should be touching the MariaDB master/slave configuration.
# Waits for the relay log to be fully applied before stopping the
# remaining slave threads; exits $OCF_ERR_GENERIC on any failure.
unset_master(){
    if ! is_slave; then
        ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave"
        return $OCF_SUCCESS
    fi
    # Stop the slave I/O thread and wait for relay log
    # processing to complete
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "STOP SLAVE IO_THREAD"
    if [ $? -gt 0 ]; then
        ocf_exit_reason "Error stopping slave IO thread"
        exit $OCF_ERR_GENERIC
    fi
    local tmpfile=$(mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX)
    while true; do
        $MYSQL $MYSQL_OPTIONS_REPL \
            -e 'SHOW PROCESSLIST\G' > "$tmpfile"
        # The SQL thread reports this once the relay log is applied.
        if grep -i 'Has read all relay log' "$tmpfile" >/dev/null; then
            ocf_log info "MariaDB slave has finished processing relay log"
            break
        fi
        # No "system user" thread means the slave threads are gone
        # entirely; there is nothing left to wait for.
        if ! grep -q 'system user' "$tmpfile"; then
            ocf_log info "Slave not running - not waiting to finish"
            break
        fi
        ocf_log info "Waiting for MariaDB slave to finish processing relay log"
        sleep 1
    done
    rm -f "$tmpfile"
    # Now, stop all slave activity and unset the master host
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "STOP SLAVE"
    if [ $? -gt 0 ]; then
        ocf_exit_reason "Error stopping remaining slave threads"
        exit $OCF_ERR_GENERIC
    fi
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "RESET SLAVE /*!50516 ALL */;"
    if [ $? -gt 0 ]; then
        ocf_exit_reason "Failed to reset slave"
        exit $OCF_ERR_GENERIC
    fi
}
# Start replication as slave
start_slave() {
# Resumes both slave threads; propagates ocf_run's status so callers
# can detect failure.
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
-e "START SLAVE"
}
# Set the node attribute controlling the readers VIP to $1 (0 or 1).
# The CIB is only touched when the value actually changes.
set_reader_attr() {
    local current
    current=$(get_reader_attr)
    if [ "$current" -ne "$1" ]; then
        $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $1
    fi
}
# Query the node attribute controlling the readers VIP; prints its
# value on success, or -1 when the attribute cannot be read.
get_reader_attr() {
    local attr_value
    if attr_value=$($CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q); then
        echo $attr_value
    else
        echo -1
    fi
}
# Determines what IP address is attached to the current host. The output of the
# crm_attribute command looks like this:
# scope=nodes name=IP value=10.2.2.161
# If the ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP node attribute is not defined, fallback is to uname -n
# The ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP is the IP address that will be used for the
# change master to command.
get_local_ip() {
    # Resolve the address peers should use in CHANGE MASTER TO: prefer
    # the ${INSTANCE_ATTR_NAME}_mysql_master_IP node attribute, falling
    # back to the node name (uname -n) when it is not defined.
    local addr
    if addr=$($CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G 2>/dev/null); then
        echo $addr
    else
        uname -n
    fi
}
#######################################################################
# Functions invoked by resource manager actions
# Monitor action: verify mysqld is running, check slave health when the
# instance is a slave, optionally read from the configured test table,
# trigger master election when needed, and report master vs. slave
# state ($OCF_RUNNING_MASTER / $OCF_SUCCESS) to the cluster.
mysql_monitor() {
    local rc
    local status_loglevel="err"
    # Set loglevel to info during probe
    if ocf_is_probe; then
        status_loglevel="info"
    fi
    mysql_common_status $status_loglevel
    rc=$?
    # If status returned an error, return that immediately
    if [ $rc -ne $OCF_SUCCESS ]; then
        return $rc
    fi
    # Check if this instance is configured as a slave, and if so
    # check slave status
    if is_slave; then
        if ! check_slave; then
            return $OCF_ERR_GENERIC
        fi
    fi
    if [ -n "$OCF_RESKEY_test_table" ]; then
        # Check for test table
        ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
            -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table"
        rc=$?
        if [ $rc -ne 0 ]; then
            # Report the configured table; the previous message used the
            # undefined variable $test_table and printed nothing useful.
            ocf_exit_reason "Failed to select from $OCF_RESKEY_test_table";
            return $OCF_ERR_GENERIC;
        fi
    fi
    # Check if we are in read-only mode and there is no master
    # with priority then we attempt to select a master
    if get_read_only && ! have_master_with_priority; then
        attempt_to_set_master
    fi
    if ! get_read_only; then
        ocf_log debug "MariaDB monitor succeeded (master)";
        return $OCF_RUNNING_MASTER
    else
        ocf_log debug "MariaDB monitor succeeded";
        return $OCF_SUCCESS
    fi
}
mysql_start() {
# Start action for this stateful (master/slave) resource: start mysqld
# read-only with --skip-slave-start, enable semi-sync replication and
# durability settings, then either point replication at an existing
# master or flag that the cluster is waiting for its first master.
# Ends with an initial monitor call to verify the result.
local rc
if ! ocf_is_ms; then
ocf_exit_reason "Resource is not configured as master/slave"
return $OCF_ERR_GENERIC
fi
# Initialize the ReaderVIP attribute, monitor will enable it
set_reader_attr 0
mysql_common_status info
if [ $? = $OCF_SUCCESS ]; then
ocf_log info "MariaDB already running"
return $OCF_SUCCESS
fi
mysql_common_prepare_dirs
mysql_common_start --skip-slave-start --log-slave-updates
rc=$?
if [ $rc != $OCF_SUCCESS ]; then
return $rc
fi
# Enable semi-sync
ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
-e "SET GLOBAL rpl_semi_sync_slave_enabled='ON', \
rpl_semi_sync_master_enabled='ON', \
rpl_semi_sync_master_wait_no_slave='OFF', \
rpl_semi_sync_master_wait_point='AFTER_SYNC', \
gtid_strict_mode='ON', \
sync_binlog=1, \
sync_master_info=1, \
innodb_flush_log_at_trx_commit=1;"
rc=$?
if [ $rc -ne 0 ]; then
ocf_exit_reason "Failed to enable semi-sync and set variables";
return $OCF_ERR_GENERIC;
fi
# We're configured as a stateful resource. We must start as
# slave by default. At this point we don't know if the CRM has
# already promoted a master. So, we simply start in read only
# mode and make sure our old score is invalidated.
set_read_only on
$CRM_MASTER -D
# Now, let's see whether there is a master. We might be a new
# node that is just joining the cluster, and the CRM may have
# promoted a master before.
new_master_host=$(echo $OCF_RESKEY_CRM_meta_notify_master_uname|tr -d " ")
if [ "$new_master_host" -a "$new_master_host" != ${NODENAME} ]; then
set_master
start_slave
if [ $? -ne 0 ]; then
ocf_exit_reason "Failed to start slave"
return $OCF_ERR_GENERIC
fi
else
ocf_log info "No MariaDB master present - clearing replication state, setting gtid in attrd, waiting for first master"
unset_master
set_waiting_for_first_master
fi
# Initial monitor action
# NOTE(review): OCF_CHECK_LEVEL is set here but mysql_monitor keys the
# test-table read off OCF_RESKEY_test_table alone - confirm intent.
if [ -n "$OCF_RESKEY_test_table" -a -n "$OCF_RESKEY_test_user" -a -n "$OCF_RESKEY_test_passwd" ]; then
OCF_CHECK_LEVEL=10
fi
mysql_monitor
rc=$?
if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then
ocf_exit_reason "Failed initial monitor action"
return $rc
fi
ocf_log info "MariaDB started"
return $OCF_SUCCESS
}
mysql_stop() {
# Stop action: drop this node's master preference and reader-VIP
# eligibility, then shut mysqld down via the common helper (whose
# status is returned).
# clear preference for becoming master
$CRM_MASTER -D
# Remove VIP capability
set_reader_attr 0
mysql_common_stop
}
# Promote action: stop slave operation, make the server writable,
# enable the semi-sync wait-for-slave behaviour, publish this node's
# address as the replication master, and allow it to serve readers.
mysql_promote() {
    local master_info
    # Plain negation; the former "( ! cmd )" spawned a pointless subshell.
    if ! mysql_common_status err; then
        return $OCF_NOT_RUNNING
    fi
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "STOP SLAVE"
    set_read_only off || return $OCF_ERR_GENERIC
    # Force the master to wait for timeout period on slave disconnect
    ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
        -e "SET GLOBAL rpl_semi_sync_master_wait_no_slave='ON';"
    # Set Master Info in CIB, cluster level attribute
    master_info="$(get_local_ip)"
    ${CRM_ATTR_REPL_INFO} -v "$master_info"
    # A master can accept reads
    set_reader_attr 1
    # Clear the gtids in attrd now that there is a master
    clear_all_gtid
    return $OCF_SUCCESS
}
mysql_demote() {
# Demote action: verify the server is running, restore the default
# semi-sync no-wait behaviour and drop this node's master preference so
# the cluster manager can elect a new master.
if ! mysql_common_status err; then
return $OCF_NOT_RUNNING
fi
# Return to default no wait setting.
ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
-e "SET GLOBAL rpl_semi_sync_master_wait_no_slave='OFF';"
# Return master preference to default, so the cluster manager gets
# a chance to select a new master
$CRM_MASTER -D
}
# Notify action: react to clone notifications.
#   pre-promote:  clear the waiting-for-first-master flag.
#   post-promote: re-point every slave's replication at the new master.
#   pre-demote:   make the demoted node read-only and kill user threads.
#   post-demote:  reset replication state on the former master's peers.
mysql_notify() {
    local type_op
    type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}"
    ocf_log debug "Received $type_op notification."
    case "$type_op" in
    'pre-promote')
        # A master is now being promoted, remove the waiting-for-first-master flag
        clear_waiting_for_first_master
        ;;
    'post-promote')
        # The master has completed its promotion. Now is a good
        # time to check whether our replication slave is working
        # correctly.
        new_master_host=$(echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " ")
        if [ "$new_master_host" = "${NODENAME}" ]; then
            ocf_log info "This will be the new master, ignoring post-promote notification."
        else
            ocf_log info "Resetting replication, uname of master: $new_master_host"
            unset_master
            if [ $? -ne 0 ]; then
                return $OCF_ERR_GENERIC
            fi
            set_master
            if [ $? -ne 0 ]; then
                return $OCF_ERR_GENERIC
            fi
            start_slave
            if [ $? -ne 0 ]; then
                ocf_exit_reason "Failed to start slave"
                return $OCF_ERR_GENERIC
            fi
        fi
        return $OCF_SUCCESS
        ;;
    'pre-demote')
        demote_host=$(echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " ")
        if [ "$demote_host" = "${NODENAME}" ]; then
            ocf_log info "pre-demote notification for $demote_host"
            set_read_only on
            if [ $? -ne 0 ]; then
                ocf_exit_reason "Failed to set read-only";
                return $OCF_ERR_GENERIC;
            fi
            # Must kill all existing user threads because they are still Read/write
            # in order for the slaves to complete the read of binlogs
            local tmpfile=$(mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX)
            $MYSQL $MYSQL_OPTIONS_REPL -e "SHOW PROCESSLIST" > "$tmpfile"
            for thread in $(awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}' "$tmpfile")
            do
                ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
                    -e "KILL ${thread}"
            done
            rm -f "$tmpfile"
        else
            # Corrected message: this is the pre-demote branch (the old
            # text said "post-demote" and misspelled "except").
            ocf_log info "Ignoring pre-demote notification except for my own demotion."
        fi
        return $OCF_SUCCESS
        ;;
    'post-demote')
        demote_host=$(echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " ")
        if [ "$demote_host" = "${NODENAME}" ]; then
            ocf_log info "Ignoring post-demote notification for my own demotion."
            return $OCF_SUCCESS
        fi
        ocf_log info "post-demote notification for $demote_host."
        # The former master has just been gracefully demoted.
        unset_master
        ;;
    *)
        return $OCF_SUCCESS
        ;;
    esac
}
#######################################################################
##########################################################################
# If DEBUG_LOG is set, make this resource agent easy to debug: set up the
# debug log and direct all output to it. Otherwise, redirect to /dev/null.
# The log directory must be a directory owned by root, with permissions 0700,
# and the log must be writable and not a symlink.
##########################################################################
DEBUG_LOG="/tmp/mysql.ocf.ra.debug/log"
# Tracing is opt-in: it only activates when the log file already exists,
# is writable and is not a symlink.
if [ "${DEBUG_LOG}" -a -w "${DEBUG_LOG}" -a ! -L "${DEBUG_LOG}" ]; then
DEBUG_LOG_DIR="${DEBUG_LOG%/*}"
if [ -d "${DEBUG_LOG_DIR}" ]; then
# fd 9 carries the debug stream; stderr is pointed at it so the
# 'set -x' trace ends up in the log too.
exec 9>>"$DEBUG_LOG"
exec 2>&9
date >&9
echo "$*" >&9
env | grep OCF_ | sort >&9
set -x
else
exec 9>/dev/null
fi
fi
# Actions that need no validated environment are dispatched first.
case "$1" in
meta-data) meta_data
exit $OCF_SUCCESS;;
usage|help) usage
exit $OCF_SUCCESS;;
esac
mysql_common_validate
rc=$?
LSB_STATUS_STOPPED=3
# When validation fails: stop is a silent no-op, monitor/status report
# "not running" unless a live pid contradicts the broken environment.
if [ $rc -ne 0 ]; then
case "$1" in
stop) ;;
monitor)
mysql_common_status "info"
if [ $? -eq $OCF_SUCCESS ]; then
# if validation fails and pid is active, always treat this as an error
ocf_exit_reason "environment validation failed, active pid is in unknown state."
exit $OCF_ERR_GENERIC
fi
# validation failed and pid is not active, it's safe to say this instance is inactive.
exit $OCF_NOT_RUNNING;;
status) exit $LSB_STATUS_STOPPED;;
*) exit $rc;;
esac
fi
# What kind of method was invoked?
case "$1" in
start) mysql_start;;
stop) mysql_stop;;
status) mysql_common_status err;;
monitor) mysql_monitor;;
promote) mysql_promote;;
demote) mysql_demote;;
notify) mysql_notify;;
validate-all) exit $OCF_SUCCESS;;
*) usage
exit $OCF_ERR_UNIMPLEMENTED;;
esac
# vi:sw=4:ts=4:et:
diff --git a/heartbeat/sybaseASE.in b/heartbeat/sybaseASE.in
index a4a0b7a0c..b4809ea23 100755
--- a/heartbeat/sybaseASE.in
+++ b/heartbeat/sybaseASE.in
@@ -1,890 +1,890 @@
#!@BASH_SHELL@
#
# Sybase Availability Agent for Red Hat Cluster v15.0.2
# Copyright (C) - 2007
# Sybase, Inc. All rights reserved.
#
# Sybase Availability Agent for Red Hat Cluster v15.0.2 is licensed
# under the GNU General Public License Version 2.
#
# Author(s):
# Jian-ping Hui <jphui@sybase.com>
#
# Description: Service script for starting/stopping/monitoring \
# Sybase Adaptive Server on: \
# Red Hat Enterprise Linux 7 ES \
# Red Hat Enterprise Linux 7 AS
#
# NOTES:
#
# (1) Before running this script, we assume that user has installed
# Sybase ASE 15.0.2 or higher version on the machine. Please
# customize your configuration in /etc/cluster/cluster.conf according
# to your actual environment. We assume the following files exist before
# you start the service:
# /$sybase_home/SYBASE.sh
# /$sybase_home/$sybase_ase/install/RUN_$server_name
#
# (2) You can customize the interval value in the meta-data section if needed:
-# <action name="start" timeout="300" />
-# <action name="stop" timeout="300" />
+# <action name="start" timeout="300s" />
+# <action name="stop" timeout="300s" />
#
# <!-- Checks to see if it''s mounted in the right place -->
-# <action name="status" interval="30" timeout="100" />
-# <action name="monitor" interval="30" timeout="100" />
+# <action name="status" interval="30s" timeout="100s" />
+# <action name="monitor" interval="30s" timeout="100s" />
#
# <!--Checks to see if we can read from the mountpoint -->
-# <action name="status" depth="10" timeout="100" interval="120" />
-# <action name="monitor" depth="10" timeout="100" interval="120" />
+# <action name="status" depth="10" timeout="100s" interval="120s" />
+# <action name="monitor" depth="10" timeout="100s" interval="120s" />
#
-# <action name="meta-data" timeout="5" />
-# <action name="validate-all" timeout="5" />
+# <action name="meta-data" timeout="5s" />
+# <action name="validate-all" timeout="5s" />
# The timeout value is not supported by Redhat in RHCS5.0.
#
#######################################################################
# Initialization:
if [ -f /etc/init.d/functions ]; then
. /etc/init.d/functions
fi
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
#######################################################################
# Default timeouts when we aren't using the rgmanager wrapper
if ! ocf_is_true "$OCF_RESKEY_is_rgmanager_wrapper"; then
if [ -z "$OCF_RESKEY_CRM_meta_timeout" ]; then
case $1 in
start|stop) OCF_RESKEY_CRM_meta_timeout=300000 ;;
*) OCF_RESKEY_CRM_meta_timeout=100000 ;;
esac
fi
# Convert the CRM timeout (milliseconds) to seconds minus a 5s margin.
default_timeout=$(((${OCF_RESKEY_CRM_meta_timeout}/1000) - 5))
default_force_stop_timeout=$(((${OCF_RESKEY_CRM_meta_timeout}/1000) - 5))
: ${OCF_RESKEY_shutdown_timeout=${default_force_stop_timeout}}
: ${OCF_RESKEY_deep_probe_timeout=${default_timeout}}
: ${OCF_RESKEY_start_timeout=${default_timeout}}
fi
# Parameter defaults; "detect" values are resolved from the on-disk
# installation below.
sybase_user_default="sybase"
sybase_home_default="detect"
ase_default="detect"
ocs_default="detect"
: ${OCF_RESKEY_sybase_user=${sybase_user_default}}
: ${OCF_RESKEY_sybase_ase=${ase_default}}
: ${OCF_RESKEY_sybase_ocs=${ocs_default}}
: ${OCF_RESKEY_sybase_home=${sybase_home_default}}
# Auto-detection is skipped for meta-data, which must work without a
# Sybase installation present.
if [ "$__OCF_ACTION" != "meta-data" ]; then
if [ "$OCF_RESKEY_sybase_home" = "detect" ]; then
if [ -d "/opt/sap" ]; then
OCF_RESKEY_sybase_home="/opt/sap"
elif [ -d "/opt/sybase" ]; then
OCF_RESKEY_sybase_home="/opt/sybase"
else
ocf_log err "sybaseASE: Unable to detect 'sybase_home'."
exit $OCF_ERR_ARGS
fi
fi
sybase_env="$OCF_RESKEY_sybase_home/SYBASE.env"
if [ "$OCF_RESKEY_sybase_ase" = "detect" ]; then
if [ -f "$sybase_env" ]; then
OCF_RESKEY_sybase_ase=$(grep "SYBASE_ASE" "$sybase_env" | cut -d= -f2)
else
ocf_log err "sybaseASE: Unable to detect 'sybase_ase'."
exit $OCF_ERR_ARGS
fi
fi
if [ "$OCF_RESKEY_sybase_ocs" = "detect" ]; then
if [ -f "$sybase_env" ]; then
OCF_RESKEY_sybase_ocs=$(grep "SYBASE_OCS" "$sybase_env" | cut -d= -f2)
else
ocf_log err "sybaseASE: Unable to detect 'sybase_ocs'."
exit $OCF_ERR_ARGS
fi
fi
fi
interfaces_file_default="${OCF_RESKEY_sybase_home}/interfaces"
: ${OCF_RESKEY_interfaces_file=${interfaces_file_default}}
# NOTE(review): rationale for disabling LD_POINTER_GUARD is not
# recorded here - confirm it is still required before removing.
export LD_POINTER_GUARD=0
#######################################################################################
# Declare some variables we will use in the script. #
#######################################################################################
declare login_string=""
declare RUNSERVER_SCRIPT=$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase/install/RUN_$OCF_RESKEY_server_name
declare CONSOLE_LOG=$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase/install/$OCF_RESKEY_server_name.log
##################################################################################################
# This function will be called by Pacemaker to get the meta data of resource agent "sybaseASE". #
##################################################################################################
meta_data()
{
cat <<EOT
<?xml version="1.0" ?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="sybaseASE" >
<version>1.0</version>
<longdesc lang="en">
Sybase ASE Failover Instance
</longdesc>
<shortdesc lang="en">
Sybase ASE Failover Instance
</shortdesc>
<parameters>
<parameter name="sybase_home">
<longdesc lang="en">
The home directory of sybase products
</longdesc>
<shortdesc lang="en">
SYBASE home directory
</shortdesc>
<content type="string" default="${sybase_home_default}"/>
</parameter>
<parameter name="sybase_ase">
<longdesc lang="en">
The directory name under sybase_home where ASE products are installed
</longdesc>
<shortdesc lang="en">
SYBASE_ASE directory name
</shortdesc>
<content type="string" default="$ase_default" />
</parameter>
<parameter name="sybase_ocs">
<longdesc lang="en">
The directory name under sybase_home where OCS products are installed, i.e. ASE-15_0
</longdesc>
<shortdesc lang="en">
SYBASE_OCS directory name
</shortdesc>
<content type="string" default="${ocs_default}" />
</parameter>
<parameter name="server_name" unique="1" required="1">
<longdesc lang="en">
The ASE server name which is configured for the HA service
</longdesc>
<shortdesc lang="en">
ASE server name
</shortdesc>
<content type="string" />
</parameter>
<parameter name="interfaces_file">
<longdesc lang="en">
The full path of interfaces file which is used to start/access the ASE server
</longdesc>
<shortdesc lang="en">
Interfaces file
</shortdesc>
<content type="string" default="$interfaces_file_default"/>
</parameter>
<parameter name="sybase_user">
<longdesc lang="en">
The user who can run ASE server
</longdesc>
<shortdesc lang="en">
Sybase user
</shortdesc>
<content type="string" default="$sybase_user_default" />
</parameter>
<parameter name="db_user" required="1">
<longdesc lang="en">
The database user required to login to isql.
</longdesc>
<shortdesc lang="en">
Sybase user
</shortdesc>
<content type="string"/>
</parameter>
<parameter name="db_passwd">
<longdesc lang="en">
The database user's password required to login to isql.
</longdesc>
<shortdesc lang="en">
Sybase user
</shortdesc>
<content type="string"/>
</parameter>
</parameters>
<actions>
- <action name="start" timeout="300" />
- <action name="stop" timeout="300" />
+ <action name="start" timeout="300s" />
+ <action name="stop" timeout="300s" />
<!-- Checks to see if it''s mounted in the right place -->
- <action name="status" interval="30" timeout="100" />
- <action name="monitor" interval="30" timeout="100" />
+ <action name="status" interval="30s" timeout="100s" />
+ <action name="monitor" interval="30s" timeout="100s" />
<!--Checks to see if we can read from the mountpoint -->
- <action name="status" depth="10" timeout="100" interval="120" />
- <action name="monitor" depth="10" timeout="100" interval="120" />
+ <action name="status" depth="10" timeout="100s" interval="120s" />
+ <action name="monitor" depth="10" timeout="100s" interval="120s" />
- <action name="meta-data" timeout="5" />
- <action name="validate-all" timeout="5" />
+ <action name="meta-data" timeout="5s" />
+ <action name="validate-all" timeout="5s" />
</actions>
</resource-agent>
EOT
}
# Print the OS pid of ASE engine 0, scraped from the console log
# (process-mode kernels log "... engine 0 ... os pid <N> ... online").
ase_engine0_process()
{
    # Quote the log path: sybase_home may contain whitespace.
    sed -n -e '/engine 0/s/^.*os pid \([0-9]*\).*online$/\1/p' "$CONSOLE_LOG"
}
# Print the LWP id of the thread running engine 0 (threaded-mode
# kernels log "Thread ... LWP <N> ... online as engine 0").
ase_engine0_thread()
{
    # Quote the log path: sybase_home may contain whitespace.
    sed -n -e 's/.*Thread.*LWP \([0-9]*\).*online as engine 0.*/\1/p' "$CONSOLE_LOG"
}
# Print the single server process id that threaded-mode kernels log
# ("Adaptive Server is running as process id <N>").
ase_engine_threadpool_pid()
{
    # Quote the log path: sybase_home may contain whitespace.
    sed -n -e 's/.*Adaptive Server is running as process id \([0-9]*\).*/\1/p' "$CONSOLE_LOG"
}
# Print the pids of all ASE engines on one line. In process mode each
# engine logs its own "os pid ... online" line; when none are found the
# engines run in a threadpool and the single server pid is used instead.
ase_all_pids()
{
    local PIDS=$(sed -n -e '/engine /s/^.*os pid \([0-9]*\).*online$/\1/p' "$CONSOLE_LOG")
    if [ -z "$PIDS" ]; then
        #engines are running in a threadpool
        PIDS=$(ase_engine_threadpool_pid)
    fi
    # Intentionally unquoted: flattens newline-separated pids to one line.
    echo $PIDS
}
##################################################################################################
# Function Name: verify_all #
# Parameter: None #
# Return value: #
# 0 SUCCESS #
# OCF_ERR_ARGS Parameters are invalid #
# Description: Do some validation on the user-configurable stuff at the beginning of the script. #
##################################################################################################
# Validate every user-configurable parameter before any action runs.
# Reads: OCF_RESKEY_sybase_home/_ase/_ocs/_sybase_user/_server_name/
#        _interfaces_file/_db_user/_shutdown_timeout/_start_timeout/
#        _deep_probe_timeout, RUNSERVER_SCRIPT.
# Returns: $OCF_SUCCESS when everything checks out, $OCF_ERR_ARGS on the
#          first failed check (an error is logged for each failure).
verify_all()
{
	ocf_log debug "sybaseASE: Start 'verify_all'"

	# ksh is needed by the su invocations in start/stop/probe.
	check_binary "ksh"

	# Check if the parameter 'sybase_home' is set.
	if [[ -z "$OCF_RESKEY_sybase_home" ]]
	then
		ocf_log err "sybaseASE: The parameter 'sybase_home' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'sybase_home' is a valid path.
	if [[ ! -d "$OCF_RESKEY_sybase_home" ]]
	then
		ocf_log err "sybaseASE: The sybase_home '$OCF_RESKEY_sybase_home' doesn't exist."
		return $OCF_ERR_ARGS
	fi

	# Check if the script file SYBASE.sh exists
	if [[ ! -f "$OCF_RESKEY_sybase_home/SYBASE.sh" ]]
	then
		ocf_log err "sybaseASE: The file $OCF_RESKEY_sybase_home/SYBASE.sh is required to run this script. Failed to run the script."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'sybase_ase' is set.
	if [[ -z "$OCF_RESKEY_sybase_ase" ]]
	then
		ocf_log err "sybaseASE: The parameter 'sybase_ase' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the directory $OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase exists.
	if [[ ! -d "$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase" ]]
	then
		ocf_log err "sybaseASE: The directory '$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase' doesn't exist."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'sybase_ocs' is set.
	if [[ -z "$OCF_RESKEY_sybase_ocs" ]]
	then
		ocf_log err "sybaseASE: The parameter 'sybase_ocs' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the directory $OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ocs exists.
	if [[ ! -d "$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ocs" ]]
	then
		ocf_log err "sybaseASE: The directory '$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ocs' doesn't exist."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'server_name' is set.
	if [[ -z "$OCF_RESKEY_server_name" ]]
	then
		ocf_log err "sybaseASE: The parameter 'server_name' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the Run_server file exists.
	if [[ ! -f "$RUNSERVER_SCRIPT" ]]
	then
		ocf_log err "sybaseASE: The file $RUNSERVER_SCRIPT doesn't exist. The sybase directory may be incorrect."
		return $OCF_ERR_ARGS
	fi

	# Check if the user 'sybase_user' exists. Discard id's stdout: the
	# original leaked the uid into the agent's output.
	if ! id -u "$OCF_RESKEY_sybase_user" > /dev/null 2>&1
	then
		ocf_log err "sybaseASE: The user '$OCF_RESKEY_sybase_user' doesn't exist in the system."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'interfaces_file' is set
	if [[ -z "$OCF_RESKEY_interfaces_file" ]]
	then
		ocf_log err "sybaseASE: The parameter 'interfaces_file' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the file 'interfaces_file' exists
	if [[ ! -f "$OCF_RESKEY_interfaces_file" ]]
	then
		ocf_log err "sybaseASE: The interfaces file '$OCF_RESKEY_interfaces_file' doesn't exist."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'db_user' is set
	if [[ -z "$OCF_RESKEY_db_user" ]]
	then
		ocf_log err "sybaseASE: The parameter 'db_user' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'shutdown_timeout' is a valid value
	if [[ $OCF_RESKEY_shutdown_timeout -eq 0 ]]
	then
		ocf_log err "sybaseASE: The parameter 'shutdown_timeout' is not set. Its value cannot be zero."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'start_timeout' is a valid value
	if [[ $OCF_RESKEY_start_timeout -eq 0 ]]
	then
		ocf_log err "sybaseASE: The parameter 'start_timeout' is not set. Its value cannot be zero."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'deep_probe_timeout' is a valid value
	if [[ $OCF_RESKEY_deep_probe_timeout -eq 0 ]]
	then
		ocf_log err "sybaseASE: The parameter 'deep_probe_timeout' is not set. Its value cannot be zero."
		return $OCF_ERR_ARGS
	fi

	ocf_log debug "sybaseASE: End 'verify_all' successfully."
	return $OCF_SUCCESS
}
# Build the isql login arguments ("-U<user> -P<password>") from the
# db_user/db_passwd resource parameters into the global $login_string.
# Always returns 0. (The original also assigned two dead variables,
# "tmpstring" and the misspelled "login_sting"; both removed.)
set_login_string()
{
	login_string="-U$OCF_RESKEY_db_user -P$OCF_RESKEY_db_passwd"
	return 0
}
##############################################################################################
# Function name: ase_start #
# Parameter: None #
# Return value: #
# 0 SUCCESS #
# 1 FAIL #
# Description: This function is used to start the ASE server in primary or secondary server. #
##############################################################################################
# Start the ASE dataserver as $OCF_RESKEY_sybase_user and wait (up to
# start_timeout seconds) for recovery to complete.
# Returns $OCF_SUCCESS when the server is (already) up, otherwise
# $OCF_ERR_GENERIC.
ase_start()
{
	ocf_log debug "sybaseASE: Start 'ase_start'"

	# Check if the server is running. If yes, return SUCCESS directly. Otherwise, continue the start work.
	# (Probe-before-start keeps the "start" action idempotent.)
	ase_is_running
	if [[ $? = 0 ]]
	then
		# The server is running.
		ocf_log info "sybaseASE: Server is running. Start is success."
		return $OCF_SUCCESS
	fi

	# The server is not running. We need to start it.
	# If the log file existed, delete it.
	# A fresh console log matters: the recovery scan below and the
	# pid-parsing helpers all read it, so stale content would mislead them.
	if [[ -f $CONSOLE_LOG ]]
	then
		rm -f $CONSOLE_LOG
	fi

	ocf_log debug "sybaseASE: Starting '$OCF_RESKEY_server_name'..."

	# Run runserver script to start the server. Since this script will be run by root and ASE server
	# needs to be run by another user, we need to change the user to sybase_user first. Then, run
	# the script to start the server. The heredoc below is executed by
	# ksh under su; the trailing '&' backgrounds the dataserver.
	su $OCF_RESKEY_sybase_user -c ksh << EOF
# set required SYBASE environment by running SYBASE.sh.
. $OCF_RESKEY_sybase_home/SYBASE.sh
# Run the RUNSERVER_SCRIPT to start the server.
. $RUNSERVER_SCRIPT > $CONSOLE_LOG 2>&1 &
EOF

	# Monitor every 1 seconds if the server has
	# recovered, until RECOVERY_TIMEOUT.
	t=0
	while [[ $t -le $OCF_RESKEY_start_timeout ]]
	do
		# "Recovery complete." in the console log signals that the
		# dataserver finished boot/crash recovery.
		grep -s "Recovery complete." $CONSOLE_LOG > /dev/null 2>&1
		if [[ $? != 0 ]]
		then
			# The server has not completed the recovery. We need to continue to monitor the recovery
			# process.
			t=`expr $t + 1`
		else
			# The server has completed the recovery.
			ocf_log info "sybaseASE: ASE server '$OCF_RESKEY_server_name' started successfully."
			break
		fi
		sleep 1
	done

	# If $t is larger than start_timeout, it means the ASE server cannot start in given time. Otherwise, it
	# means the ASE server has started successfully.
	if [[ $t -gt $OCF_RESKEY_start_timeout ]]
	then
		# The server cannot start in specified time. We think the start is failed.
		ocf_log err "sybaseASE: Failed to start ASE server '$OCF_RESKEY_server_name'. Please check the server error log $CONSOLE_LOG for possible problems."
		return $OCF_ERR_GENERIC
	fi

	# Double-check with the process/thread probe: the log text alone is
	# not proof that engine 0 is actually alive.
	ase_is_running
	if [ $? -ne 0 ]; then
		ocf_log err "sybaseASE: ase_start could not detect database initialized properly."
		return $OCF_ERR_GENERIC
	fi

	ocf_log debug "sybaseASE: End 'ase_start' successfully."
	return $OCF_SUCCESS
}
#############################################################################################
# Function name: ase_stop #
# Parameter: None #
# Return value: #
# 0 SUCCESS #
# 1 FAIL #
# Description: This function is used to stop the ASE server in primary or secondary server. #
#############################################################################################
# Stop the ASE dataserver: ask it to shut down via isql, arm a watchdog
# (kill_ase) that hard-kills it after shutdown_timeout, then wait until
# every engine process has left the process table before returning.
# Returns $OCF_SUCCESS when stopped, $OCF_ERR_GENERIC when no pids could
# be parsed from the console log.
ase_stop()
{
	ocf_log debug "sybaseASE: Start 'ase_stop'"

	# Stopping an already-stopped server is a success for the cluster.
	ase_is_running
	if [[ $? != 0 ]]
	then
		# The ASE server is not running. We need not to shutdown it.
		ocf_log info "sybaseASE: The dataserver $OCF_RESKEY_server_name is not running."
		return $OCF_SUCCESS
	fi

	set_login_string

	# Just in case things are hung, start a process that will wait for the
	# timeout period, then kill any remaining processes. We'll need to
	# monitor this process (set -m), so we can terminate it later if it is
	# not needed.
	set -m
	kill_ase $OCF_RESKEY_shutdown_timeout &
	KILL_PID=$!	# If successful, we will also terminate watchdog process

	# Run "shutdown with nowait" from isql command line to shutdown the server
	su $OCF_RESKEY_sybase_user -c ksh << EOF
# set required SYBASE environment by running SYBASE.sh.
. $OCF_RESKEY_sybase_home/SYBASE.sh
# Run "shutdown with nowait" to shutdown the server immediately.
(echo "use master" ; echo go ; echo "shutdown with nowait"; echo go) | \
\$SYBASE/\$SYBASE_OCS/bin/isql $login_string -S$OCF_RESKEY_server_name -I$OCF_RESKEY_interfaces_file &
EOF

	sleep 5

	# Poll the console log for the shutdown marker, for at most
	# shutdown_timeout seconds.
	t=0
	while [[ $t -lt $OCF_RESKEY_shutdown_timeout ]]
	do
		# Search "ueshutdown: exiting" in the server log. If found, it means the server has been shut down.
		# Otherwise, we need to wait.
		tail $CONSOLE_LOG | grep "ueshutdown: exiting" > /dev/null 2>&1
		if [[ $? != 0 ]]
		then
			# The shutdown is still in processing. Wait...
			sleep 2
			# Real arithmetic: the original `expr $t+2` (no spaces)
			# performed no addition at all.
			t=$((t + 2))
		else
			# The shutdown is success.
			ocf_log info "sybaseASE: ASE server '$OCF_RESKEY_server_name' shutdown with isql successfully."
			break
		fi
	done

	# Timed out: the background kill_ase watchdog will take the OS
	# processes down directly.
	if [[ $t -ge $OCF_RESKEY_shutdown_timeout ]]
	then
		ocf_log err "sybaseASE: Shutdown of '$OCF_RESKEY_server_name' from isql failed. Server is either down or unreachable."
	fi

	# Here, the ASE server has been shut down by isql command or killed by background process. We need to do
	# further check to make sure all processes have gone away before saying shutdown is complete. This stops the
	# other node from starting up the package before it has been stopped and the file system has been unmounted.
	# Get all process ids from the log file. The pids must land in
	# separate array elements; the original `declare -a X=$(...)` put
	# them all into one element, so the bookkeeping below never worked.
	declare -a ENGINE_ALL=($(ase_all_pids))
	typeset -i num_procs=${#ENGINE_ALL[@]}

	# We cannot find any process id from log file. It may be because the log file is corrupted or be deleted.
	# In this case, we determine the shutdown is failed.
	if [[ ${#ENGINE_ALL[@]} -lt 1 ]]
	then
		ocf_log err "sybaseASE: Unable to find the process id from $CONSOLE_LOG."
		ocf_log err "sybaseASE: Stop ASE server failed."
		return $OCF_ERR_GENERIC
	fi

	# Monitor the system processes to make sure all ASE related processes have gone away.
	while true
	do
		# To every engine process, search it in system processes list. If it is not in the
		# list, it means this process has gone away. Otherwise, we need to wait for it is
		# killed by background process.
		for i in "${ENGINE_ALL[@]}"
		do
			# -w matches the whole pid (so 123 no longer matches 1234);
			# -q keeps the pid list out of the agent's output.
			ps -fu "$OCF_RESKEY_sybase_user" | awk '{print $2}' | grep -qw "$i"
			if [[ $? != 0 ]]
			then
				ocf_log debug "sybaseASE: $i process has stopped."
				# Remove the pid from the bookkeeping array.
				c=0
				while (( c < $num_procs ))
				do
					if [[ ${ENGINE_ALL[$c]} = $i ]]
					then
						unset 'ENGINE_ALL[$c]'
						c=$num_procs
					fi
					(( c = c + 1 ))
				done
			fi
		done

		# To here, all processes should have gone away.
		if [[ ${#ENGINE_ALL[@]} -lt 1 ]]
		then
			#
			# Looks like shutdown was successful, so kill the
			# script to kill any hung processes, which we started earlier.
			# Check to see if the script is still running. If jobs
			# returns that the script is done, then we don't need to kill
			# it.
			#
			job=$(jobs | grep -v Done)
			if [[ ${job} != "" ]]
			then
				ocf_log debug "sybaseASE: Killing the kill_ase script."
				kill -15 $KILL_PID > /dev/null 2>&1
			fi
			break
		fi
		sleep 5
	done

	ocf_log debug "sybaseASE: End 'ase_stop'."
	return $OCF_SUCCESS
}
####################################################################################
# Function name: ase_is_running #
# Parameter: None #
# Return value: #
# 0 ASE server is running #
# 1 ASE server is not running or there are errors #
# Description: This function is used to check if the ASE server is still running . #
####################################################################################
# Decide whether the ASE dataserver is alive by parsing the console log
# and probing the process table.
# Returns $OCF_SUCCESS (engine 0 alive), $OCF_NOT_RUNNING (no log or
# engine 0 gone), or $OCF_ERR_GENERIC (log exists but no pid/thread
# could be parsed from it).
ase_is_running()
{
	local pid
	local thread

	# Without a console log there is nothing to inspect: the server was
	# never started (or its log was cleaned up), so report not running.
	if [[ ! -f $CONSOLE_LOG ]]
	then
		ocf_log debug "could not find console log $CONSOLE_LOG"
		return $OCF_NOT_RUNNING
	fi

	# Process-mode ASE: engine 0 runs as its own OS process.
	pid=$(ase_engine0_process)
	if [ -n "$pid" ]; then
		if kill -s 0 $pid > /dev/null 2>&1; then
			ocf_log debug "Found engine 0 pid $pid to be running"
			return $OCF_SUCCESS
		fi
		# Engine 0's pid is stale.
		return $OCF_NOT_RUNNING
	fi

	# Threaded-mode ASE: engine 0 is an LWP inside the threadpool process.
	pid=$(ase_engine_threadpool_pid)
	thread=$(ase_engine0_thread)
	if [ -n "$pid" ] && [ -n "$thread" ]; then
		if ps -AL | grep -q "${pid}[[:space:]]*${thread} "; then
			ocf_log debug "Found engine 0 thread $thread in pid $pid to be running"
			return $OCF_SUCCESS
		fi
		# The engine 0 thread is gone.
		return $OCF_NOT_RUNNING
	fi

	# Log present but no pid/thread markers found in it.
	return $OCF_ERR_GENERIC
}
####################################################################################
# Function name: kill_ase #
# Parameter: #
# DELAY The seconds to wait before killing the ASE processes. 0 means #
# kill the ASE processes immediately. #
# Return value: None #
# 1 ASE server is not running or there are errors #
# Description: This function is used to check if the ASE server is still running . #
####################################################################################
# Watchdog: wait $1 seconds (0 = immediately), then SIGKILL every ASE
# engine process found in the console log.
# Returns $OCF_SUCCESS after attempting the kills, $OCF_ERR_GENERIC when
# no pids could be parsed from the console log.
kill_ase()
{
	ocf_log debug "sybaseASE: Start 'kill_ase'."

	DELAY=$1

	# Wait for sometime before sending a kill signal.
	t=0
	while [[ $t -lt $DELAY ]]
	do
		sleep 1
		# Real arithmetic: the original `expr $t+1` (no spaces)
		# performed no addition at all.
		t=$((t + 1))
	done

	# Get the process ids from the log file. They must be split into
	# separate array elements; the original `declare -a X=$(...)` put
	# them all into element 0.
	declare -a ENGINE_ALL=($(ase_all_pids))

	# If there is no process id found in the log file, we need not to continue.
	if [[ ${#ENGINE_ALL[@]} -lt 1 ]]
	then
		ocf_log err "sybaseASE: Unable to find the process id from $CONSOLE_LOG."
		return $OCF_ERR_GENERIC
	fi

	# Kill the dataserver process(es).
	for pid in "${ENGINE_ALL[@]}"
	do
		kill -9 "$pid" > /dev/null 2>&1
		if [[ $? != 0 ]]
		then
			ocf_log info "sybaseASE: kill_ase function did NOT find process $pid running."
		else
			# kill -9 delivers SIGKILL (the original message claimed SIGTERM).
			ocf_log info "sybaseASE: kill_ase function did find process $pid running. Sent SIGKILL."
		fi
	done

	ocf_log debug "sybaseASE: End 'kill_ase'."
	return $OCF_SUCCESS
}
#####################################################################################
# Function name: ase_status #
# Parameter: #
# 0 Level 0 probe. In this level, we just check if engine 0 is alive #
# 10 Level 10 probe. In this level, we need to probe if the ASE server #
# still has response. #
# Return value: #
# 0 The server is still alive #
# 1 The server is down #
# Description: This function is used to check if the ASE server is still running. #
#####################################################################################
# Probe the ASE server. $1 is the check depth: 0 = shallow (process
# check only), >0 = also run a deep probe (trivial query via isql).
# Returns $OCF_SUCCESS when alive, otherwise the failure code from
# ase_is_running, or $OCF_ERR_GENERIC when the deep probe fails.
ase_status()
{
	local rc

	ocf_log debug "sybaseASE: Start 'ase_status'."

	# Shallow check first: is engine 0 alive at the OS level?
	ase_is_running
	rc=$?
	if [ $rc -ne 0 ]; then
		# ASE is down; report it so the cluster can fail over.
		ocf_log err "sybaseASE: ASE server is down."
		return $rc
	fi

	# The process is alive. A positive depth means the caller wants
	# proof the server still answers queries.
	if [[ $1 -gt 0 ]]
	then
		ocf_log debug "sybaseASE: Need to run deep probe."
		if ! deep_probe
		then
			# No response from the server: treat it as down.
			ocf_log err "sybaseASE: Deep probe found the ASE server is down."
			return $OCF_ERR_GENERIC
		fi
	fi

	ocf_log debug "sybaseASE: End 'ase_status'."
	return $OCF_SUCCESS
}
####################################################################################
# Function name: deep_probe #
# Parameter: None #
# Return value: #
# 0 ASE server is alive #
# 1 ASE server is down #
# Description: This function is used to run deep probe to make sure the ASE server #
# still has response. #
####################################################################################
# Deep probe: run "select 1" through isql as $OCF_RESKEY_sybase_user and
# wait up to deep_probe_timeout seconds for the expected output.
# Returns 0 when the server responded, 1 when it did not (the hung isql
# is then killed).
deep_probe()
{
	declare -i rv

	ocf_log debug "sybaseASE: Start 'deep_probe'."

	# Declare two temporary files which will be used in this probe:
	# tmpfile1 captures the isql output, tmpfile2 records the isql pid.
	# NOTE(review): mktemp creates the files but they are deleted just
	# below, and the now-predictable names are reused by the su/ksh
	# child — presumably so the sybase user can create them; confirm
	# this is intentional (it is mildly racy).
	tmpfile1="$(mktemp /tmp/sybaseASE.1.XXXXXX)"
	tmpfile2="$(mktemp /tmp/sybaseASE.2.XXXXXX)"

	# Build $login_string from db_user/db_passwd.
	set_login_string

	rm -f $tmpfile1
	rm -f $tmpfile2

	# The login file is correct. We have gotten the login account and password from it.
	# Run isql command in background.
	su $OCF_RESKEY_sybase_user -c ksh << EOF
# set required SYBASE environment by running SYBASE.sh.
. $OCF_RESKEY_sybase_home/SYBASE.sh
# Run a very simple SQL statement to make sure the server is still ok. The output will be put to
# tmpfile1.
(echo "select 1"; echo "go") |
\$SYBASE/\$SYBASE_OCS/bin/isql $login_string -S$OCF_RESKEY_server_name -I$OCF_RESKEY_interfaces_file -t $OCF_RESKEY_deep_probe_timeout -e -o$tmpfile1 &
# Record the isql command process id to temporary file. If the isql is hung, we need this process id
# to kill the hung process.
echo \$! > $tmpfile2
EOF

	declare -i t=0

	# Monitor the output file tmpfile1.
	while [[ $t -lt $OCF_RESKEY_deep_probe_timeout ]]
	do
		# If the SQL statement is executed successfully, we will get the following output:
		# 1> select 1
		#
		# -----------
		# 1
		#
		# (1 row affected)
		# So, we determine if the execution is success by searching the keyword "(1 row affected)".
		grep "(1 row affected)" $tmpfile1
		if [[ $? = 0 ]]
		then
			ocf_log debug "sybaseASE: Deep probe sucess."
			break
		else
			sleep 1
			# NOTE(review): `expr $t+1` (no spaces) performs no
			# arithmetic — it echoes "0+1+1...". The -lt test above only
			# terminates because [[ ]] evaluates that string
			# arithmetically; confirm before relying on it elsewhere.
			t=`expr $t+1`
		fi
	done

	# If $t is larger than deep_probe_timeout, it means the isql command line cannot finish in given time.
	# This means the deep probe failed. We need to kill the isql process manually.
	if [[ $t -ge $OCF_RESKEY_deep_probe_timeout ]]
	then
		ocf_log err "sybaseASE: Deep probe fail. The dataserver has no response."

		# Read the process id of isql process from tmpfile2
		pid=`cat $tmpfile2 | awk '{print $1}'`

		rm -f $tmpfile1
		rm -f $tmpfile2

		# Kill the isql process directly.
		kill -9 $pid
		return 1
	fi

	rm -f $tmpfile1
	rm -f $tmpfile2

	ocf_log debug "sybaseASE: End 'deep_probe'."
	return 0
}
#############################
# Do some real work here... #
#############################
# Dispatch the OCF action requested by the cluster resource manager.
# Every action that touches the server validates the configuration first
# and exits with the action's OCF return code.
case $__OCF_ACTION in
	start)
		verify_all || exit $OCF_ERR_GENERIC
		ase_start
		exit $?
		;;
	stop)
		verify_all || exit $OCF_ERR_GENERIC
		ase_stop
		exit $?
		;;
	status | monitor)
		# OCF_CHECK_LEVEL selects shallow (0) or deep (>0) probing.
		verify_all || exit $OCF_ERR_GENERIC
		ase_status $OCF_CHECK_LEVEL
		exit $?
		;;
	meta-data)
		# meta-data must succeed without any configuration present.
		meta_data
		exit $OCF_SUCCESS
		;;
	validate-all)
		verify_all
		exit $?
		;;
	*)
		echo "Usage: $SCRIPT {start|stop|monitor|status|validate-all|meta-data}"
		exit $OCF_ERR_UNIMPLEMENTED
		;;
esac
exit 0

File Metadata

Mime Type
text/x-diff
Expires
Thu, Oct 16, 12:00 AM (2 h, 57 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2530610
Default Alt Text
(126 KB)

Event Timeline