diff --git a/.gitignore b/.gitignore index bbff032c3..3a9be36e5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,109 +1,110 @@ *.swp Makefile.in aclocal.m4 autoconf autoheader autom4te.cache automake autoscan.log compile configure configure.scan config.guess config.log config.sub config.status Makefile depcomp install-sh libtoolize ltmain.sh libtool make/stamp-h1 m4 make/clusterautoconfig.h* missing *.pc .deps .libs *.o *.la *.lo *.loT rgmanager/src/resources/fs.sh rgmanager/src/resources/oracledb.sh rgmanager/src/resources/utils/config-utils.sh resource-agents-* .version # generated by ./autogen.sh && ./configure doc/man/*.7 doc/man/*.xml heartbeat/ocf-binaries heartbeat/ocf-directories heartbeat/ocf-shellfuncs heartbeat/send_ua heartbeat/shellfuncs +heartbeat/*.pyc include/agent_config.h include/config.h include/config.h.in include/stamp-h1 include/stamp-h2 ldirectord/ldirectord ldirectord/ldirectord.8 ldirectord/OCF/ldirectord ldirectord/init.d/ldirectord ldirectord/init.d/ldirectord.debian ldirectord/init.d/ldirectord.debian.default ldirectord/systemd/ldirectord.service tools/findif tools/ocf-tester tools/send_arp tools/tickle_tcp tools/ocft/README tools/ocft/README.zh_CN tools/ocft/caselib tools/ocft/ocft *.cache *.upgrade.xml py-compile ylwrap # BEAM Entries *.beam parser-messages MISC_ERRORS cscope.files cscope.out patches updates logs # OS and Editor Artifacts .DS_Store .bomb *.rej *.bz2 *.gz *.xz *.sed *.diff *.patch *.gres *~ # Misc HTML TAGS GPATH GRTAGS GSYMS GTAGS .gres.* *.orig .gdb_history *~ \#* .changes pacemaker.tar.gz diff --git a/heartbeat/SAPInstance b/heartbeat/SAPInstance index eb058cccf..ca320de1f 100755 --- a/heartbeat/SAPInstance +++ b/heartbeat/SAPInstance @@ -1,980 +1,980 @@ #!/bin/sh # # SAPInstance # # Description: Manages a single SAP Instance as a High-Availability # resource. One SAP Instance is defined by one # SAP Instance-Profile. start/stop handles all services # of the START-Profile, status and monitor care only # about essential services. # # Author: Alexander Krauth, June 2006 # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2006-2008 Alexander Krauth # # An example usage: # See usage() function below for more details... 
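#
#      For a quick manual test the agent can be called directly with the action as
#      its only argument and the parameters passed as OCF_RESKEY_* environment
#      variables. This is an illustration only - the instance name and the OCF root
#      path /usr/lib/ocf are assumptions, adjust them to your installation:
#
#        OCF_ROOT=/usr/lib/ocf \
#        OCF_RESKEY_InstanceName=P01_DVEBMGS00_sapp01ci \
#        /usr/lib/ocf/resource.d/heartbeat/SAPInstance monitor
#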
# # OCF instance parameters: # OCF_RESKEY_InstanceName # OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) # OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_WAITTIME (optional, to solve timing problems during J2EE-Addin start) # OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery using cleanipc, default is false) # OCF_RESKEY_MONITOR_SERVICES (optional, default is to monitor critical services only) # OCF_RESKEY_SHUTDOWN_METHOD (optional, defaults to NORMAL, KILL: terminate the SAP instance with OS commands - faster, at your own risk) # OCF_RESKEY_ERS_InstanceName (optional, InstanceName of the ERS instance in a Master/Slave configuration) # OCF_RESKEY_ERS_START_PROFILE (optional, START_PROFILE of the ERS instance in a Master/Slave configuration) # OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started) # OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started) # OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped) # OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped) # OCF_RESKEY_IS_ERS (needed for ENQ/REPL NW 740) # # TODO: - Option to shutdown sapstartsrv for non-active instances -> that means: do probes only with OS tools (sapinstance_status) # - Option for better standalone enqueue server monitoring, using ensmon (test enque-deque) # - Option for cleanup abandoned enqueue replication tables # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### SH=/bin/sh sapinstance_usage() { methods=`sapinstance_methods` methods=`echo $methods | tr ' ' '|'` cat <<-EOF usage: $0 ($methods) $0 manages a SAP Instance as an HA resource. The 'start' operation starts the instance or the ERS instance in a Master/Slave configuration The 'stop' operation stops the instance The 'status' operation reports whether the instance is running The 'monitor' operation reports whether the instance seems to be working The 'promote' operation starts the primary instance in a Master/Slave configuration The 'demote' operation stops the primary instance and starts the ERS instance The 'reload' operation allows changed parameters (non-unique only) without restarting the service The 'notify' operation always returns SUCCESS The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports EOF } sapinstance_meta_data() { cat < 2.14 Usually a SAP system consists of one database and at least one or more SAP instances (sometimes called application servers). One SAP Instance is defined by having exactly one instance profile. The instance profiles can usually be found in the directory /sapmnt/SID/profile. Each instance must be configured as it's own resource in the cluster configuration. 
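As an illustration only (the resource ID and the operation timings below are placeholders, not recommendations), such a per-instance resource could be defined with the crm shell roughly as follows:

  primitive rsc_SAP_P01_DVEBMGS00 ocf:heartbeat:SAPInstance \
    params InstanceName="P01_DVEBMGS00_sapp01ci" \
    op monitor interval="120" timeout="60"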
The resource agent supports the following SAP versions: - SAP WebAS ABAP Release 6.20 - 7.40 - SAP WebAS Java Release 6.40 - 7.40 - SAP WebAS ABAP + Java Add-In Release 6.20 - 7.40 (Java is not monitored by the cluster in that case) When using a SAP Kernel 6.40 please check and implement the actions from the section "Manual postprocessing" from SAP note 995116 (http://sdn.sap.com). Other versions may also work with this agent, but have not been verified. All operations of the SAPInstance resource agent are done by using the startup framework called SAP Management Console or sapstartsrv that was introduced with SAP kernel release 6.40. Find more information about the SAP Management Console in SAP note 1014480. Using this framework defines a clear interface for the Heartbeat cluster, how it sees the SAP system. The options for monitoring the SAP system are also much better than other methods like just watching the ps command for running processes or doing some pings to the application. sapstartsrv uses SOAP messages to request the status of running SAP processes. Therefore it can actually ask a process itself what it's status is, independent from other problems that might exist at the same time. sapstartsrv knows 4 status colours: - GREEN = everything is fine - YELLOW = something is wrong, but the service is still working - RED = the service does not work - GRAY = the service has not been started The SAPInstance resource agent will interpret GREEN and YELLOW as OK. That means that minor problems will not be reported to the Heartbeat cluster. This prevents the cluster from doing an unwanted failover. The statuses RED and GRAY are reported as NOT_RUNNING to the cluster. Depending on the status the cluster expects from the resource, it will do a restart, failover or just nothing. Manages a SAP instance as an HA resource. The full qualified SAP instance name. e.g. P01_DVEBMGS00_sapp01ci. Usually this is the name of the SAP instance profile. Instance name: SID_INSTANCE_VIR-HOSTNAME The full qualified path where to find sapstartsrv and sapcontrol. Specify this parameter, if you have changed the SAP kernel directory location after the default SAP installation. Path of sapstartsrv and sapcontrol The full qualified path where to find the SAP START profile. Specify this parameter, if you have changed the SAP profile directory location after the default SAP installation. Path of start profile The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than. Start profile name After that time in seconds a monitor operation is executed by the resource agent. Does the monitor return SUCCESS, the start ishandled as SUCCESS. This is useful to resolve timing problems with e.g. the J2EE-Addin instance.Usually the resource agent waits until all services are started and the SAP Management Console reports a GREEN status. A double stack installation (ABAP + Java AddIn) consists of an ABAP dispatcher and a JAVA instance. Normally the start of the JAVA instance takes much longer than the start of the ABAP instance. For a JAVA Instance you may need to configure a much higher timeout for the start operation of the resource in Heartbeat. The disadvantage here is, that the discovery of a failed start by the cluster takes longer. Somebody might say: For me it is important, that the ABAP instance is up and running. 
A failure of the JAVA instance shall not cause a failover of the SAP instance. Actually the SAP MC reports a YELLOW status, if the JAVA instance of a double stack system fails. From the resource agent point of view YELLOW means:everything is OK. Setting START_WAITTIME to a lower value determines the resource agent to check the status of the instance during a start operation after that time. As it would wait normally for a GREEN status, now it reports SUCCESS to the cluster in case of a YELLOW status already after the specified time. That is only useful for double stack systems. Check the successful start after that time (do not wait for J2EE-Addin) The SAPInstance resource agent tries to recover a failed start attempt automatically one time. This is done by killing running instance processes, removing the kill.sap file and executing cleanipc. Sometimes a crashed SAP instance leaves some processes and/or shared memory segments behind. Setting this option to true will try to remove those leftovers during a start operation. That is to reduce manual work for the administrator. Enable or disable automatic startup recovery Within a SAP instance there can be several services. Usually you will find the defined services in the START profile of the related instance (Attention: with SAP Release 7.10 the START profile content was moved to the instance profile). Not all of those services are worth to monitor by the cluster. For example you properly do not like to failover your SAP instance, if the central syslog collector daemon fails. Those services are monitored within the SAPInstance resource agent: - disp+work - msg_server - enserver (ENSA1) - enq_server (ENSA2) - enrepserver (ENSA1) - enq_replicator (ENSA2) - jcontrol - jstart Some other services could be monitored as well. They have to be given with the parameter MONITOR_SERVICES, e.g.: - sapwebdisp - TREXDaemon.x That names match the strings used in the output of the command 'sapcontrol -nr [Instance-Nr] -function GetProcessList'. The default should fit most cases where you want to manage a SAP Instance from the cluster. You may change this with this parameter, if you like to monitor more/less or other services that sapstartsrv supports. You may specify multiple services separated by a | (pipe) sign in this parameter: disp+work|msg_server|enserver Services to monitor Usual a SAP Instance is stopped by the command 'sapcontrol -nr InstanceNr -function Stop'. SHUTDOWN_METHOD=KILL means to kill the SAP Instance using OS commands. SAP processes of the instance are terminated with 'kill -9', shared memory is deleted with 'cleanipc' and the 'kill.sap' file will be deleted. That method is much faster than the gracefull stop, but the instance does not have the chance to say goodbye to other SAPinstances in the same system. USE AT YOUR OWN RISK !! Shutdown graceful or kill a SAP instance by terminating the processes. (normal|KILL) Only used in a Master/Slave resource configuration: The full qualified SAP enqueue replication instance name. e.g. P01_ERS02_sapp01ers. Usually this is the name of the SAP instance profile. The enqueue replication instance must be installed, before you want to configure a master-slave cluster recource. The master-slave configuration in the cluster must use this properties: clone_max = 2 clone_node_max = 1 master_node_max = 1 master_max = 1 Enqueue replication instance name: SID_INSTANCE_VIR-HOSTNAME Only used in a Master/Slave resource configuration: The parameter ERS_InstanceName must also be set in this configuration. 
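A sketch of such a Master/Slave definition in crm shell syntax (the resource IDs and the ASCS instance name are placeholders assumed for illustration; the ERS instance name follows the example above, and notify is enabled so the agent's notify action can adjust master scores):

  primitive rsc_SAP_ASCS ocf:heartbeat:SAPInstance \
    params InstanceName="P01_ASCS01_sapp01ascs" ERS_InstanceName="P01_ERS02_sapp01ers"
  ms ms_SAP_ASCS rsc_SAP_ASCS \
    meta clone_max="2" clone_node_max="1" master_node_max="1" master_max="1" notify="true"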
The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than. Enqueue replication start profile name The full qualified path where to find a script or program which should be executed before this resource gets started. Path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got started. Path to a post-start script The full qualified path where to find a script or program which should be executed before this resource gets stopped. Path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got stopped. Path to a post-start script Only used for ASCS/ERS SAP Netweaver installations without implementing a master/slave resource to allow the ASCS to 'find' the ERS running on another cluster node after a resource failure. This parameter should be set to true 'only' for the ERS instance for implementations following the SAP NetWeaver 7.40 HA certification (NW-HA-CLU-740). This includes also systems for NetWeaver less than 7.40, if you like to impelemnt the NW-HA-CLU-740 scenario. Mark SAPInstance as ERS instance - + END } # # methods: What methods/operations do we support? # sapinstance_methods() { cat <<-EOF start stop status monitor promote demote reload notify validate-all methods meta-data usage EOF } # # is_clone : find out if we are configured to run in a Master/Slave configuration # is_clone() { if [ -n "$OCF_RESKEY_CRM_meta_clone_max" ] \ && [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ] then if [ "$OCF_RESKEY_CRM_meta_clone_max" -ne 2 ] || \ [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] || \ [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] || \ [ "$OCF_RESKEY_CRM_meta_master_max" -ne 1 ] then ocf_log err "Clone options misconfigured. (expect: clone_max=2,clone_node_max=1,master_node_max=1,master_max=1)" exit $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_ERS_InstanceName" ] then ocf_log err "In a Master/Slave configuration the ERS_InstanceName parameter is mandatory." exit $OCF_ERR_ARGS fi else return 0 fi return 1 } # # abnormal_end : essential things are missing, but in the natur of a SAP installation - which can be very different # from customer to customer - we cannot handle this always as an error # This would be the case, if the software is installed on shared disks and not visible # to all cluster nodes at all times. # abnormal_end() { local err_msg=$1 ocf_is_probe && { sapinstance_status exit $? 
} ocf_log err $err_msg if [ "$ACTION" = "stop" ] then cleanup_instance exit $OCF_SUCCESS fi exit $OCF_ERR_CONFIGURED } # # sapinstance_init : Define global variables with default values, if optional parameters are not set # # sapinstance_init() { local myInstanceName="$1" SID=`echo "$myInstanceName" | cut -d_ -f1` InstanceName=`echo "$myInstanceName" | cut -d_ -f2` InstanceNr=`echo "$InstanceName" | sed 's/.*\([0-9][0-9]\)$/\1/'` SAPVIRHOST=`echo "$myInstanceName" | cut -d_ -f3` # optional OCF parameters, we try to guess which directories are correct if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] then if have_binary /usr/sap/$SID/$InstanceName/exe/sapstartsrv && have_binary /usr/sap/$SID/$InstanceName/exe/sapcontrol then DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" SAPSTARTSRV="/usr/sap/$SID/$InstanceName/exe/sapstartsrv" SAPCONTROL="/usr/sap/$SID/$InstanceName/exe/sapcontrol" elif have_binary /usr/sap/$SID/SYS/exe/run/sapstartsrv && have_binary /usr/sap/$SID/SYS/exe/run/sapcontrol then DIR_EXECUTABLE="/usr/sap/$SID/SYS/exe/run" SAPSTARTSRV="/usr/sap/$SID/SYS/exe/run/sapstartsrv" SAPCONTROL="/usr/sap/$SID/SYS/exe/run/sapcontrol" fi else if have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" && have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" then DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE" SAPSTARTSRV="$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" SAPCONTROL="$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" fi fi sidadm="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" [ -z "$DIR_EXECUTABLE" ] && abnormal_end "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!" if [ -z "$OCF_RESKEY_DIR_PROFILE" ] then DIR_PROFILE="/usr/sap/$SID/SYS/profile" else DIR_PROFILE="$OCF_RESKEY_DIR_PROFILE" fi if [ "$myInstanceName" != "$OCF_RESKEY_InstanceName" ] then currentSTART_PROFILE=$OCF_RESKEY_ERS_START_PROFILE else currentSTART_PROFILE=$OCF_RESKEY_START_PROFILE fi if [ -z "$OCF_RESKEY_IS_ERS" ]; then is_ers="no" else is_ers="$OCF_RESKEY_IS_ERS" fi if [ -z "$currentSTART_PROFILE" ] then if [ ! -r "$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}" -a -r "$DIR_PROFILE/${SID}_${InstanceName}_${SAPVIRHOST}" ]; then SAPSTARTPROFILE="$DIR_PROFILE/${SID}_${InstanceName}_${SAPVIRHOST}" else SAPSTARTPROFILE="$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}" fi else SAPSTARTPROFILE="$currentSTART_PROFILE" fi if [ -z "$OCF_RESKEY_START_WAITTIME" ] then export OCF_RESKEY_START_WAITTIME=3600 fi if [ -z "$OCF_RESKEY_MONITOR_SERVICES" ] then export OCF_RESKEY_MONITOR_SERVICES="disp+work|msg_server|enserver|enrepserver|jcontrol|jstart|enq_server|enq_replicator" fi # as root user we need the library path to the SAP kernel to be able to call sapcontrol if [ `echo $LD_LIBRARY_PATH | grep -c "^$DIR_EXECUTABLE\>"` -eq 0 ]; then LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH export LD_LIBRARY_PATH fi return $OCF_SUCCESS } # # check_sapstartsrv : Before using sapcontrol we make sure that the sapstartsrv is running for the correct instance. # We cannot use sapinit and the /usr/sap/sapservices file in case of an enquerep instance, # because then we have two instances with the same instance number. # check_sapstartsrv() { local restart=0 local runninginst="" local chkrc=$OCF_SUCCESS local output="" if [ ! -S /tmp/.sapstream5${InstanceNr}13 ]; then ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName (no UDS), it will be started now" restart=1 else output=`$SAPCONTROL -nr $InstanceNr -function ParameterValue INSTANCE_NAME -format script` if [ $? 
-eq 0 ] then runninginst=`echo "$output" | grep '^0 : ' | cut -d' ' -f3` if [ "$runninginst" != "$InstanceName" ] then ocf_log warn "sapstartsrv is running for instance $runninginst, that service will be killed" restart=1 else output=`$SAPCONTROL -nr $InstanceNr -function AccessCheck Start` if [ $? -ne 0 ]; then ocf_log warn "FAILED : sapcontrol -nr $InstanceNr -function AccessCheck Start (`ls -ld1 /tmp/.sapstream5${InstanceNr}13`)" ocf_log warn "sapstartsrv will be restarted to try to solve this situation, otherwise please check sapstsartsrv setup (SAP Note 927637)" restart=1 fi fi else ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName, it will be started now" restart=1 fi fi if [ -z "$runninginst" ]; then runninginst=$InstanceName; fi if [ $restart -eq 1 ] then if [ -d /usr/sap/$SID/SYS/profile/ ] then DIR_PROFILE="/usr/sap/$SID/SYS/profile" else abnormal_end "Expected /usr/sap/$SID/SYS/profile/ to be a directory, please set DIR_PROFILE parameter!" fi [ ! -r $SAPSTARTPROFILE ] && abnormal_end "Expected $SAPSTARTPROFILE to be the instance START profile, please set START_PROFILE parameter!" pkill -9 -f "sapstartsrv.*$runninginst" # removing the unix domain socket files as they might have wrong permissions # or ownership - they will be recreated by sapstartsrv during next start rm -f /tmp/.sapstream5${InstanceNr}13 rm -f /tmp/.sapstream5${InstanceNr}14 $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm # now make sure the daemon has been started and is able to respond local srvrc=1 while [ $srvrc -eq 1 -a `pgrep -f "sapstartsrv.*$runninginst" | wc -l` -gt 0 ] do sleep 1 $SAPCONTROL -nr $InstanceNr -function GetProcessList > /dev/null 2>&1 srvrc=$? done if [ $srvrc -ne 1 ] then ocf_log info "sapstartsrv for instance $SID-$InstanceName was restarted !" chkrc=$OCF_SUCCESS else ocf_log error "sapstartsrv for instance $SID-$InstanceName could not be started!" chkrc=$OCF_ERR_GENERIC ocf_is_probe && chkrc=$OCF_NOT_RUNNING fi fi return $chkrc } # # sapuserexit : Many SAP customers need some additional processes/tools to run their SAP systems. # This specialties do not allow a totally generic SAP cluster resource agent. # Someone should write a resource agent for each additional process you need, if it # is required to monitor that process within the cluster manager. To enable # you to extent this resource agent without developing a new one, this user exit # was introduced. # sapuserexit() { local NAME="$1" local VALUE="$2" if [ -n "$VALUE" ] then if have_binary "$VALUE" then ocf_log info "Calling userexit ${NAME} with customer script file ${VALUE}" "$VALUE" >/dev/null 2>&1 ocf_log info "Exiting userexit ${NAME} with customer script file ${VALUE}, returncode: $?" 
else ocf_log warn "Attribute ${NAME} is set to ${VALUE}, but this file is not executable" fi fi return 0 } # # cleanup_instance : remove resources (processes and shared memory) from a crashed instance) # cleanup_instance() { pkill -9 -f -U $sidadm $InstanceName ocf_log info "Terminated instance using 'pkill -9 -f -U $sidadm $InstanceName'" # it is necessary to call cleanipc as user sidadm if the system has 'vmcj/enable = ON' set - otherwise SHM-segments in /dev/shm/SAP_ES2* cannot be removed su - $sidadm -c "cleanipc $InstanceNr remove" ocf_log info "Tried to remove shared memory resources using 'cleanipc $InstanceNr remove' as user $sidadm" ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/kill.sap ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/shutdown.sap ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgcpid ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgspid return 0 } # # sapinstance_start : Start the SAP instance # sapinstance_start() { sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" local rc=$OCF_NOT_RUNNING local output="" local loopcount=0 while [ $loopcount -lt 2 ] do loopcount=$(($loopcount + 1)) check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ]; then output=`$SAPCONTROL -nr $InstanceNr -function Start` rc=$? ocf_log info "Starting SAP Instance $SID-$InstanceName: $output" fi if [ $rc -ne 0 ] then ocf_log err "SAP Instance $SID-$InstanceName start failed." return $OCF_ERR_GENERIC fi local startrc=1 while [ $startrc -gt 0 ] do local waittime_start=`date +%s` output=`$SAPCONTROL -nr $InstanceNr -function WaitforStarted $OCF_RESKEY_START_WAITTIME 10` startrc=$? local waittime_stop=`date +%s` if [ $startrc -ne 0 ] then if [ $(($waittime_stop - $waittime_start)) -ge $OCF_RESKEY_START_WAITTIME ] then sapinstance_monitor NOLOG if [ $? -eq $OCF_SUCCESS ] then output="START_WAITTIME ($OCF_RESKEY_START_WAITTIME) has elapsed, but instance monitor returned SUCCESS. Instance considered running." startrc=0; loopcount=2 fi else if [ $loopcount -eq 1 ] && ocf_is_true $OCF_RESKEY_AUTOMATIC_RECOVER then ocf_log warn "SAP Instance $SID-$InstanceName start failed: $output" ocf_log warn "Try to recover $SID-$InstanceName" cleanup_instance else loopcount=2 fi startrc=-1 fi else loopcount=2 fi done done if [ $startrc -eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName started: $output" rc=$OCF_SUCCESS sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 1 -l reboot; fi else ocf_log err "SAP Instance $SID-$InstanceName start failed: $output" rc=$OCF_NOT_RUNNING if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 0 -l reboot; fi fi return $rc } # # sapinstance_recover: Try startup of failed instance by cleaning up resources # sapinstance_recover() { cleanup_instance sapinstance_start return $? } # # sapinstance_stop: Stop the SAP instance # sapinstance_stop() { local output="" local rc sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" if [ "$OCF_RESKEY_SHUTDOWN_METHOD" = "KILL" ] then ocf_log info "Stopping SAP Instance $SID-$InstanceName with shutdown method KILL!" cleanup_instance return $OCF_SUCCESS fi check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ]; then output=`$SAPCONTROL -nr $InstanceNr -function Stop` rc=$? ocf_log info "Stopping SAP Instance $SID-$InstanceName: $output" fi if [ $rc -eq 0 ] then output=`$SAPCONTROL -nr $InstanceNr -function WaitforStopped 3600 1` if [ $? 
-eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName stopped: $output" rc=$OCF_SUCCESS else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 0 -l reboot; fi return $rc } # # sapinstance_monitor: Can the given SAP instance do anything useful? # sapinstance_monitor() { local MONLOG=$1 local rc check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ] then local count=0 local SERVNO local output output=`$SAPCONTROL -nr $InstanceNr -function GetProcessList -format script` # we have to parse the output, because the returncode doesn't tell anything about the instance status for SERVNO in `echo "$output" | grep '^[0-9] ' | cut -d' ' -f1 | sort -u` do local COLOR=`echo "$output" | grep "^$SERVNO dispstatus: " | cut -d' ' -f3` local SERVICE=`echo "$output" | grep "^$SERVNO name: " | cut -d' ' -f3` local STATE=0 local SEARCH case $COLOR in GREEN|YELLOW) STATE=$OCF_SUCCESS;; *) STATE=$OCF_NOT_RUNNING;; esac SEARCH=`echo "$OCF_RESKEY_MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'` if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ] then if [ $STATE -eq $OCF_NOT_RUNNING ] then [ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !" rc=$STATE fi count=1 fi done if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] then if ocf_is_probe then rc=$OCF_NOT_RUNNING else [ "$MONLOG" != "NOLOG" ] && ocf_log err "The SAP instance does not run any services which this RA could monitor!" rc=$OCF_ERR_GENERIC fi fi fi return $rc } # # sapinstance_status: Lightweight check of SAP instance only with OS tools # sapinstance_status() { local pid local pids [ ! -f "/usr/sap/$SID/$InstanceName/work/kill.sap" ] && return $OCF_NOT_RUNNING pids=`grep '^kill -[0-9]' /usr/sap/$SID/$InstanceName/work/kill.sap | awk '{print $3}'` for pid in $pids do [ `pgrep -f -U $sidadm $InstanceName | grep -c $pid` -gt 0 ] && return $OCF_SUCCESS done return $OCF_NOT_RUNNING } # # sapinstance_validate: Check the semantics of the input parameters # sapinstance_validate() { local rc=$OCF_SUCCESS if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SID' is not a valid system ID!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceName" | grep -c '^[A-Z].*[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceName' is not a valid instance name!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceNr" | grep -c '^[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceNr' is not a valid instance number!" rc=$OCF_ERR_ARGS fi if [ `echo "$SAPVIRHOST" | grep -c '^[A-Za-z][A-Za-z0-9_-]*$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SAPVIRHOST' is not a valid hostname!" rc=$OCF_ERR_ARGS fi return $rc } # # sapinstance_start_clone # sapinstance_start_clone() { sapinstance_init $OCF_RESKEY_ERS_InstanceName ${HA_SBIN_DIR}/crm_master -v 50 -l reboot sapinstance_start return $? } # # sapinstance_stop_clone # sapinstance_stop_clone() { sapinstance_init $OCF_RESKEY_ERS_InstanceName ${HA_SBIN_DIR}/crm_master -v 0 -l reboot sapinstance_stop return $? 
} # # sapinstance_monitor_clone # sapinstance_monitor_clone() { # first check with the status function (OS tools) if there could be something like a SAP instance running # as we do not know here, if we are in master or slave state we do not want to start our monitoring # agents (sapstartsrv) on the wrong host local rc sapinstance_init $OCF_RESKEY_InstanceName if sapinstance_status; then if sapinstance_monitor; then ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot return $OCF_RUNNING_MASTER fi # by nature of the SAP enqueue server we have to make sure # that we do a failover to the slave (enqueue replication server) # in case the enqueue process has failed. We signal this to the # cluster by setting our master preference to a lower value than the slave. ${HA_SBIN_DIR}/crm_master -v 10 -l reboot return $OCF_FAILED_MASTER fi sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_status && sapinstance_monitor rc=$? if [ $rc -eq $OCF_SUCCESS ]; then ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot fi return $rc } # # sapinstance_promote_clone: In a Master/Slave configuration get Master by starting the SCS instance and stopping the ERS instance # The order is important here to behave correct from the application levels view # sapinstance_promote_clone() { local rc sapinstance_init $OCF_RESKEY_InstanceName ocf_log info "Promoting $SID-$InstanceName to running Master." sapinstance_start rc=$? if [ $rc -eq $OCF_SUCCESS ]; then sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_stop rc=$? fi return $rc } # # sapinstance_demote_clone: In a Master/Slave configuration get Slave by stopping the SCS instance and starting the ERS instance # sapinstance_demote_clone() { local rc sapinstance_init $OCF_RESKEY_InstanceName ocf_log info "Demoting $SID-$InstanceName to a slave." sapinstance_stop rc=$? if [ $rc -eq $OCF_SUCCESS ]; then sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_start rc=$? fi return $rc } # # sapinstance_notify: Handle master scoring - to make sure a slave gets the next master # sapinstance_notify() { local n_type="$OCF_RESKEY_CRM_meta_notify_type" local n_op="$OCF_RESKEY_CRM_meta_notify_operation" if [ "${n_type}_${n_op}" = "post_promote" ]; then # After promotion of one master in the cluster, we make sure that all clones reset their master # value back to 100. This is because a failed monitor on a master might have degree one clone # instance to score 10. ${HA_SBIN_DIR}/crm_master -v 100 -l reboot elif [ "${n_type}_${n_op}" = "pre_demote" ]; then # if we are a slave and a demote event is announced, make sure we are highest on the list to become master # that is, when a slave resource was started after the promote event of an already running master (e.g. node of slave was down) # We also have to make sure to overrule the globally set resource_stickiness or any fail-count factors => INFINITY local n_uname="$OCF_RESKEY_CRM_meta_notify_demote_uname" if [ ${n_uname} != ${NODENAME} ]; then ${HA_SBIN_DIR}/crm_master -v INFINITY -l reboot fi fi } # # 'main' starts here... 
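# For reference, the master-score values the clone functions above hand to
# crm_master (all with a reboot lifetime); Pacemaker promotes the node holding
# the highest score:
#   -v 100       instance monitored healthy (also reset for all clones after a promote)
#   -v 50        freshly started ERS clone, candidate for the next promotion
#   -v 10        master whose monitor failed, prefer failover to the slave
#   -v INFINITY  slave notified of a pending demote elsewhere, must win the promotion
#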
# ## GLOBALS SID="" sidadm="" InstanceName="" InstanceNr="" SAPVIRHOST="" DIR_EXECUTABLE="" SAPSTARTSRV="" SAPCONTROL="" DIR_PROFILE="" SAPSTARTPROFILE="" CLONE=0 NODENAME=$(ocf_local_nodename) if ( [ $# -ne 1 ] ) then sapinstance_usage exit $OCF_ERR_ARGS fi ACTION=$1 if [ "$ACTION" = "status" ]; then ACTION=monitor fi # These operations don't require OCF instance parameters to be set case "$ACTION" in usage|methods) sapinstance_$ACTION exit $OCF_SUCCESS;; meta-data) sapinstance_meta_data exit $OCF_SUCCESS;; notify) sapinstance_notify exit $OCF_SUCCESS;; *);; esac if ! ocf_is_root then ocf_log err "$0 must be run as root" exit $OCF_ERR_PERM fi # parameter check if [ -z "$OCF_RESKEY_InstanceName" ] then ocf_log err "Please set OCF_RESKEY_InstanceName to the name to the SAP instance profile!" exit $OCF_ERR_ARGS fi is_clone; CLONE=$? if [ ${CLONE} -eq 1 ] then CLACT=_clone else if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ] then ocf_log err "$ACTION called in a non master/slave environment" exit $OCF_ERR_ARGS fi sapinstance_init $OCF_RESKEY_InstanceName fi # What kind of method was invoked? case "$ACTION" in start|stop|monitor|promote|demote) sapinstance_$ACTION$CLACT exit $?;; validate-all) sapinstance_validate exit $?;; reload ) ocf_log info "reloading SAPInstance parameters" exit $OCF_SUCCESS;; *) sapinstance_methods exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/aliyun-vpc-move-ip b/heartbeat/aliyun-vpc-move-ip index e27952adb..ed446c9c1 100755 --- a/heartbeat/aliyun-vpc-move-ip +++ b/heartbeat/aliyun-vpc-move-ip @@ -1,296 +1,296 @@ #!/bin/sh # # OCF resource agent to move an IP address within a VPC in the Aliyun # Based on code of Markus Guertler (GitHub AWS-VPC-move-IP) # Based on code of Adam Gandelman (GitHub ec2-resource-agents/elasticip) # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### # aliyuncli doesnt work without HOME parameter export HOME="/root" USAGE="usage: $0 {start|stop|status|meta-data}"; ############################################################################### ############################################################################### # # Functions # ############################################################################### ip_get_and_configure() { ocf_log debug "function: ip_get_and_configure" ROUTE_TO_INSTANCE="$($cmd |grep $OCF_RESKEY_address | awk '{ print $3 }')" if [ "$ECS_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; then if [ -n "$ROUTE_TO_INSTANCE" ]; then ip_drop fi cmd="aliyuncli vpc CreateRouteEntry --RouteTableId $OCF_RESKEY_routing_table --DestinationCidrBlock ${OCF_RESKEY_address}/32 --NextHopId $ECS_INSTANCE_ID --NextHopType Instance --output text" ocf_log debug "executing command: $cmd" $cmd rc=$? while [ $rc -ne 0 ]; do sleep 1 cmd="aliyuncli vpc CreateRouteEntry --RouteTableId $OCF_RESKEY_routing_table --DestinationCidrBlock ${OCF_RESKEY_address}/32 --NextHopId $ECS_INSTANCE_ID --NextHopType Instance --output text" ocf_log debug "executing command: $cmd" $cmd rc=$? done wait_for_started fi # Reconfigure the local ip address ip addr add "${OCF_RESKEY_address}/32" dev $OCF_RESKEY_interface rc=$? 
if [ $rc -ne 0 ]; then ocf_log err "command failed, rc: $rc" return $OCF_ERR_GENERIC fi ocf_log debug "IP added" return $OCF_SUCCESS } ip_drop() { ocf_log debug "function: ip_drop" cmd="ip addr delete ${OCF_RESKEY_address}/32 dev $OCF_RESKEY_interface" ocf_log debug "executing command: $cmd" $cmd rc=$? if [ $rc -ne 0 ] && [ $rc -ne 2 ]; then ocf_log err "command failed, rc $rc" return $OCF_ERR_GENERIC fi cmd="aliyuncli vpc DeleteRouteEntry --RouteTableId $OCF_RESKEY_routing_table --DestinationCidrBlock ${OCF_RESKEY_address}/32 --NextHopId $ROUTE_TO_INSTANCE --output text" ocf_log debug "executing command: $cmd" $cmd if [ $? -ne 0 ]; then ocf_log err "command failed, rc: $rc" return $OCF_ERR_GENERIC fi wait_for_deleted ocf_log debug "IP dropped" return $OCF_SUCCESS } wait_for_started() { cmd="aliyuncli vpc DescribeRouteTables --RouteTableId $OCF_RESKEY_routing_table --output text" ocf_log debug "executing command: $cmd" ROUTE_TO_INSTANCE="$($cmd | grep $OCF_RESKEY_address | awk '{ print $3 }')" while [ "$ECS_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; do sleep 3 cmd="aliyuncli vpc DescribeRouteTables --RouteTableId $OCF_RESKEY_routing_table --output text" ocf_log debug "executing command: $cmd" ROUTE_TO_INSTANCE="$($cmd | grep $OCF_RESKEY_address | awk '{ print $3 }')" done } wait_for_deleted() { ROUTE_TO_INSTANCE="$($cmd |grep $OCF_RESKEY_address | awk '{ print $3 }')" while [ ! -z "$ROUTE_TO_INSTANCE" ]; do sleep 1 cmd="aliyuncli vpc DescribeRouteTables --RouteTableId $OCF_RESKEY_routing_table --output text" ocf_log debug "executing command: $cmd" ROUTE_TO_INSTANCE="$($cmd |grep $OCF_RESKEY_address | awk '{ print $3 }')" done } ecs_ip_metadata() { cat < 2.0 Resource Agent to move IP addresses within a VPC of the Aliyun Webservices ECS by changing an entry in an specific routing table Move IP within a APC of the Aliyun ECS VPC private IP address vpc ip Name of the routing table, where the route for the IP address should be changed, i.e. rtb-... routing table name Name of the network interfacen, i.e. eth0 network interface name Valid Aliyun CLI profile name profile name - - - - - + + + + + END } ecs_ip_validate() { ocf_log debug "function: validate" # IP address if [ -z "$OCF_RESKEY_address" ]; then ocf_log err "IP address parameter not set $OCF_RESKEY_ADDRESS!" exit $OCF_ERR_CONFIGURED fi # Network Interface if [ -z "$OCF_RESKEY_interface" ]; then ocf_log err "Network interface parameter not set $OCF_RESKEY_INTERFACE!" exit $OCF_ERR_CONFIGURED fi # Routing Table if [ -z "$OCF_RESKEY_routing_table" ]; then ocf_log err "Routing table parameter not set $OCF_RESKEY_ROUTING_TABLE!" exit $OCF_ERR_CONFIGURED fi if [ -z "${ECS_INSTANCE_ID}" ]; then ocf_exit_reason "Instance ID not found. Is this a ECS instance?" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } ecs_ip_start() { ocf_log info "ECS: Moving IP address $OCF_RESKEY_address to this host by adjusting routing table $OCF_RESKEY_routing_table" ecs_ip_monitor if [ $? = $OCF_SUCCESS ]; then ocf_log info "ECS: $OCF_RESKEY_address already started" return $OCF_SUCCESS fi ocf_log info "ECS: Adjusting routing table and locally configuring IP address" ip_get_and_configure rc=$? if [ $rc -ne 0 ]; then ocf_log err "Received $rc from 'aliyun cli'" return $OCF_ERR_GENERIC fi ecs_ip_monitor rc=$? 
if [ $rc -ne $OCF_SUCCESS ]; then ocf_log err "IP address couldn't be configured on this host (IP: $OCF_RESKEY_address, Interface: $OCF_RESKEY_interface)" return $rc fi return $OCF_SUCCESS } ecs_ip_stop() { ocf_log info "ECS: Bringing down IP address $OCF_RESKEY_address" ecs_ip_monitor if [ $? = $OCF_NOT_RUNNING ]; then ocf_log info "ECS: Address $OCF_RESKEY_address already down" return $OCF_SUCCESS fi ip_drop if [ $? -ne $OCF_SUCCESS ]; then ocf_log err "ECS: Couldn't drop IP address $OCF_RESKEY_address on interface $OCF_RESKEY_interface." return $OCF_ERR_GENERIC fi ecs_ip_monitor if [ $? = $OCF_NOT_RUNNING ]; then ocf_log info "ECS: Successfully brought down $OCF_RESKEY_address" return $OCF_SUCCESS fi ocf_log err "ECS: Couldn't bring down IP address $OCF_RESKEY_address on interface $OCF_RESKEY_interface." return $OCF_ERR_GENERIC } ecs_ip_monitor() { ocf_log debug "function: ecsip_monitor: check routing table" cmd="aliyuncli vpc DescribeRouteTables --RouteTableId $OCF_RESKEY_routing_table --output text" ocf_log debug "executing command: $cmd" ROUTE_TO_INSTANCE="$($cmd |grep $OCF_RESKEY_address | awk '{ print $3 }')" if [ "$ECS_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; then ocf_log debug "not routed to this instance ($ECS_INSTANCE_ID) but to instance $ROUTE_TO_INSTANCE" return $OCF_NOT_RUNNING fi cmd="ping -W 1 -c 1 $OCF_RESKEY_address" ocf_log debug "executing command: $cmd" $cmd > /dev/null if [ $? -ne 0 ]; then ocf_log debug "IP $OCF_RESKEY_address not locally reachable via ping on this system" return $OCF_NOT_RUNNING fi ocf_log debug "routed in VPC and locally reachable" return $OCF_SUCCESS } ############################################################################### # # MAIN # ############################################################################### case $__OCF_ACTION in meta-data) ecs_ip_metadata exit $OCF_SUCCESS;; validate-all) ecs_ip_validate;; esac ECS_INSTANCE_ID="$(curl -s http://100.100.100.200/latest/meta-data/instance-id)" case $__OCF_ACTION in start) ecs_ip_validate ecs_ip_start;; stop) ecs_ip_stop;; monitor) ecs_ip_monitor;; *) exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/gcp-vpc-move-vip.in b/heartbeat/gcp-vpc-move-vip.in index ba61193b6..31d84643a 100755 --- a/heartbeat/gcp-vpc-move-vip.in +++ b/heartbeat/gcp-vpc-move-vip.in @@ -1,338 +1,338 @@ #!@PYTHON@ -tt # --------------------------------------------------------------------- # Copyright 2016 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # --------------------------------------------------------------------- # Description: Google Cloud Platform - Floating IP Address (Alias) # --------------------------------------------------------------------- import json import logging import os import sys import time OCF_FUNCTIONS_DIR="%s/lib/heartbeat" % os.environ.get("OCF_ROOT") sys.path.append(OCF_FUNCTIONS_DIR) from ocf import * try: import googleapiclient.discovery except ImportError: pass if sys.version_info >= (3, 0): # Python 3 imports. 
import urllib.parse as urlparse import urllib.request as urlrequest else: # Python 2 imports. import urllib as urlparse import urllib2 as urlrequest CONN = None THIS_VM = None ALIAS = None METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/' METADATA_HEADERS = {'Metadata-Flavor': 'Google'} METADATA = \ ''' 1.0 Floating IP Address on Google Cloud Platform - Using Alias IP address functionality to attach a secondary IP address to a running instance Floating IP Address on Google Cloud Platform IP Address to be added including CIDR. E.g 192.168.0.1/32 IP Address to be added including CIDR. E.g 192.168.0.1/32 Subnet name for the Alias IP Subnet name for the Alias IP List of hosts in the cluster Host list If enabled (set to true), IP failover logs will be posted to stackdriver logging Stackdriver-logging support - - - - - + + + + + ''' def get_metadata(metadata_key, params=None, timeout=None): """Performs a GET request with the metadata headers. Args: metadata_key: string, the metadata to perform a GET request on. params: dictionary, the query parameters in the GET request. timeout: int, timeout in seconds for metadata requests. Returns: HTTP response from the GET request. Raises: urlerror.HTTPError: raises when the GET request fails. """ timeout = timeout or 60 metadata_url = os.path.join(METADATA_SERVER, metadata_key) params = urlparse.urlencode(params or {}) url = '%s?%s' % (metadata_url, params) request = urlrequest.Request(url, headers=METADATA_HEADERS) request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) return request_opener.open(request, timeout=timeout * 1.1).read() def get_instance(project, zone, instance): request = CONN.instances().get( project=project, zone=zone, instance=instance) return request.execute() def get_network_ifaces(project, zone, instance): return get_instance(project, zone, instance)['networkInterfaces'] def wait_for_operation(project, zone, operation): while True: result = CONN.zoneOperations().get( project=project, zone=zone, operation=operation['name']).execute() if result['status'] == 'DONE': if 'error' in result: raise Exception(result['error']) return time.sleep(1) def set_alias(project, zone, instance, alias, alias_range_name=None): fingerprint = get_network_ifaces(project, zone, instance)[0]['fingerprint'] body = { 'aliasIpRanges': [], 'fingerprint': fingerprint } if alias: obj = {'ipCidrRange': alias} if alias_range_name: obj['subnetworkRangeName'] = alias_range_name body['aliasIpRanges'].append(obj) request = CONN.instances().updateNetworkInterface( instance=instance, networkInterface='nic0', project=project, zone=zone, body=body) operation = request.execute() wait_for_operation(project, zone, operation) def get_alias(project, zone, instance): iface = get_network_ifaces(project, zone, instance) try: return iface[0]['aliasIpRanges'][0]['ipCidrRange'] except KeyError: return '' def get_localhost_alias(): net_iface = get_metadata('instance/network-interfaces', {'recursive': True}) net_iface = json.loads(net_iface.decode('utf-8')) try: return net_iface[0]['ipAliases'][0] except (KeyError, IndexError): return '' def get_zone(project, instance): fl = 'name="%s"' % instance request = CONN.instances().aggregatedList(project=project, filter=fl) while request is not None: response = request.execute() zones = response.get('items', {}) for zone in zones.values(): for inst in zone.get('instances', []): if inst['name'] == instance: return inst['zone'].split("/")[-1] request = CONN.instances().aggregatedList_next( 
previous_request=request, previous_response=response) raise Exception("Unable to find instance %s" % (instance)) def get_instances_list(project, exclude): hostlist = [] request = CONN.instances().aggregatedList(project=project) while request is not None: response = request.execute() zones = response.get('items', {}) for zone in zones.values(): for inst in zone.get('instances', []): if inst['name'] != exclude: hostlist.append(inst['name']) request = CONN.instances().aggregatedList_next( previous_request=request, previous_response=response) return hostlist def gcp_alias_start(alias): my_alias = get_localhost_alias() my_zone = get_metadata('instance/zone').split('/')[-1] project = get_metadata('project/project-id') # If I already have the IP, exit. If it has an alias IP that isn't the VIP, # then remove it if my_alias == alias: logger.info( '%s already has %s attached. No action required' % (THIS_VM, alias)) sys.exit(OCF_SUCCESS) elif my_alias: logger.info('Removing %s from %s' % (my_alias, THIS_VM)) set_alias(project, my_zone, THIS_VM, '') # Loops through all hosts & remove the alias IP from the host that has it hostlist = os.environ.get('OCF_RESKEY_hostlist', '') if hostlist: hostlist = hostlist.replace(THIS_VM, '').split() else: hostlist = get_instances_list(project, THIS_VM) for host in hostlist: host_zone = get_zone(project, host) host_alias = get_alias(project, host_zone, host) if alias == host_alias: logger.info( '%s is attached to %s - Removing all alias IP addresses from %s' % (alias, host, host)) set_alias(project, host_zone, host, '') break # add alias IP to localhost set_alias( project, my_zone, THIS_VM, alias, os.environ.get('OCF_RESKEY_alias_range_name')) # Check the IP has been added my_alias = get_localhost_alias() if alias == my_alias: logger.info('Finished adding %s to %s' % (alias, THIS_VM)) elif my_alias: logger.error( 'Failed to add IP. 
%s has an IP attached but it isn\'t %s' % (THIS_VM, alias)) sys.exit(OCF_ERR_GENERIC) else: logger.error('Failed to add IP address %s to %s' % (alias, THIS_VM)) sys.exit(OCF_ERR_GENERIC) def gcp_alias_stop(alias): my_alias = get_localhost_alias() my_zone = get_metadata('instance/zone').split('/')[-1] project = get_metadata('project/project-id') if my_alias == alias: logger.info('Removing %s from %s' % (my_alias, THIS_VM)) set_alias(project, my_zone, THIS_VM, '') def gcp_alias_status(alias): my_alias = get_localhost_alias() if alias == my_alias: logger.info('%s has the correct IP address attached' % THIS_VM) else: sys.exit(OCF_NOT_RUNNING) def validate(): global ALIAS global CONN global THIS_VM # Populate global vars try: CONN = googleapiclient.discovery.build('compute', 'v1') except Exception as e: logger.error('Couldn\'t connect with google api: ' + str(e)) sys.exit(OCF_ERR_CONFIGURED) try: THIS_VM = get_metadata('instance/name') except Exception as e: logger.error('Couldn\'t get instance name, is this running inside GCE?: ' + str(e)) sys.exit(OCF_ERR_CONFIGURED) ALIAS = os.environ.get('OCF_RESKEY_alias_ip') if not ALIAS: logger.error('Missing alias_ip parameter') sys.exit(OCF_ERR_CONFIGURED) def configure_logs(): # Prepare logging global logger logging.getLogger('googleapiclient').setLevel(logging.WARN) logging_env = os.environ.get('OCF_RESKEY_stackdriver_logging') if logging_env: logging_env = logging_env.lower() if any(x in logging_env for x in ['yes', 'true', 'enabled']): try: import google.cloud.logging.handlers client = google.cloud.logging.Client() handler = google.cloud.logging.handlers.CloudLoggingHandler( client, name=THIS_VM) handler.setLevel(logging.INFO) formatter = logging.Formatter('gcp:alias "%(message)s"') handler.setFormatter(formatter) log.addHandler(handler) logger = logging.LoggerAdapter(log, {'OCF_RESOURCE_INSTANCE': OCF_RESOURCE_INSTANCE}) except ImportError: logger.error('Couldn\'t import google.cloud.logging, ' 'disabling Stackdriver-logging support') def main(): if 'meta-data' in sys.argv[1]: print(METADATA) return validate() if 'validate-all' in sys.argv[1]: return configure_logs() if 'start' in sys.argv[1]: gcp_alias_start(ALIAS) elif 'stop' in sys.argv[1]: gcp_alias_stop(ALIAS) elif 'status' in sys.argv[1] or 'monitor' in sys.argv[1]: gcp_alias_status(ALIAS) else: logger.error('no such function %s' % str(sys.argv[1])) if __name__ == "__main__": main() diff --git a/heartbeat/mariadb.in b/heartbeat/mariadb.in index 860fea7fd..c1969d70e 100644 --- a/heartbeat/mariadb.in +++ b/heartbeat/mariadb.in @@ -1,1058 +1,1058 @@ #!@BASH_SHELL@ # # # MariaDB # # Description: Manages a MariaDB Master/Slave database as Linux-HA resource # # Authors: Alan Robertson: DB2 Script # Jakub Janczak: rewrite as MySQL # Andrew Beekhof: cleanup and import # Sebastian Reitenbach: add OpenBSD defaults, more cleanup # Narayan Newton: add Gentoo/Debian defaults # Marian Marinov, Florian Haas: add replication capability # Yves Trudeau, Baron Schwartz: add VIP support and improve replication # Nils Carlson: add GTID support and semi-sync support # # Support: users@clusterlabs.org # License: GNU General Public License (GPL) # # (c) 2002-2005 International Business Machines, Inc. # 2005-2010 Linux-HA contributors # # See usage() function below for more details... 
# # OCF instance parameters: # OCF_RESKEY_binary # OCF_RESKEY_client_binary # OCF_RESKEY_config # OCF_RESKEY_datadir # OCF_RESKEY_user # OCF_RESKEY_group # OCF_RESKEY_node_list # OCF_RESKEY_test_table # OCF_RESKEY_test_user # OCF_RESKEY_test_passwd # OCF_RESKEY_enable_creation # OCF_RESKEY_additional_parameters # OCF_RESKEY_log # OCF_RESKEY_pid # OCF_RESKEY_socket # OCF_RESKEY_replication_user # OCF_RESKEY_replication_passwd # OCF_RESKEY_replication_port ####################################################################### # Initialization: OCF_RESKEY_node_list_default="" : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs . ${OCF_FUNCTIONS_DIR}/mysql-common.sh ####################################################################### usage() { cat < 1.0 Resource script for MariaDB. Manages a complete master/slave replication setup with GTID, for simpler uses look at the mysql resource agent which supports older replication forms which mysql and mariadb have in common. The resource must be setup to use notifications. Set 'notify=true' in the metadata attributes when defining a MariaDB master/slave instance. The default behavior is to use uname -n values in the change master to command. Other IPs can be specified manually by adding a node attribute \${INSTANCE_ATTR_NAME}_mysql_master_IP giving the IP to use for replication. For example, if the mariadb primitive you are using is p_mariadb, the attribute to set will be p_mariadb_mysql_master_IP. Manages a MariaDB master/slave instance Location of the MariaDB server binary MariaDB server binary Location of the MariaDB client binary MariaDB client binary Configuration file MariaDB config Directory containing databases MariaDB datadir User running MariaDB daemon MariaDB user Group running MariaDB daemon (for logfile and directory permissions) MariaDB group The logfile to be used for mysqld. MariaDB log file All node names of nodes that will execute mariadb. Please separate each node name with a space. This is required for the master selection to function. node list The pidfile to be used for mysqld. MariaDB pid file The socket to be used for mysqld. MariaDB socket Table to be tested in monitor statement (in database.table notation) MariaDB test table MariaDB test user, must have select privilege on test_table MariaDB test user MariaDB test user password MariaDB test user password If the MariaDB database does not exist, it will be created Create the database if it does not exist Additional parameters which are passed to the mysqld on startup. (e.g. --skip-external-locking or --skip-grant-tables) Additional parameters to pass to mysqld MariaDB replication user. This user is used for starting and stopping MariaDB replication, for setting and resetting the master host, and for setting and unsetting read-only mode. Because of that, this user must have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, PROCESS and RELOAD privileges on all nodes within the cluster. Mandatory if you define a master-slave resource. MariaDB replication user MariaDB replication password. Used for replication client and slave. Mandatory if you define a master-slave resource. MariaDB replication user password The port on which the Master MariaDB instance is listening. 
MariaDB replication port - - - - - - - - - - - + + + + + + + + + + + END } # Convenience functions greater_than_equal_long() { # there are values we need to compare in this script # that are too large for shell -gt to process local true=$(echo "$1 > $2" | bc) if [ "$true" -eq "1" ]; then return 0 else return 1 fi } greater_than_gtid() { local gtid1_transaction_id=$(echo $1 | cut -d - -f 3) local gtid2_transaction_id=$(echo $2 | cut -d - -f 3) greater_than_equal_long $gtid1_transaction_id $gtid2_transaction_id return $? } set_gtid() { # Sets the GTID in CIB using attrd_updater for this node. local gtid=$($MYSQL $MYSQL_OPTIONS_REPL \ -s -N -e "show global variables like 'gtid_current_pos'" | cut -f 2) # Ensure that we got somethine like a valid GTID if ! echo $gtid | grep -q '-'; then ocf_exit_reason "Unable to read GTID from MariaDB" ocf_log err "Unable to read GTID from MariaDB" return $OCF_ERR_GENERIC fi ${HA_SBIN_DIR}/attrd_updater -p -n ${OCF_RESOURCE_INSTANCE}-gtid -U $gtid } read_gtid() { local node=$1 local query_result local name local host local value # This produces output of the form 'name="var-name" host="node2" value="val"'. # This should be set at this point, because we have store our own GTID previously. if ! query_result=$(${HA_SBIN_DIR}/attrd_updater -p -N $node -n ${OCF_RESOURCE_INSTANCE}-gtid -Q); then ocf_exit_reason "Unable to read GTID from attrd" ocf_log err "Unable to read GTID from attrd" echo "" return fi # Evaluate the query result to place the variables in the local scope. eval ${query_result} echo ${value} } clear_all_gtid() { for node in $OCF_RESKEY_node_list; do ${HA_SBIN_DIR}/attrd_updater -n ${OCF_RESOURCE_INSTANCE}-gtid -N $node -D done } set_waiting_for_first_master() { ${HA_SBIN_DIR}/attrd_updater -p -n ${OCF_RESOURCE_INSTANCE}-waiting-for-first-master -U true } waiting_for_first_master() { local query_result local name local host local value if ! query_result=$(${HA_SBIN_DIR}/attrd_updater -p -n ${OCF_RESOURCE_INSTANCE}-waiting-for-first-master -Q); then ocf_exit_reason "Unable to read waiting-for-first-master from attrd" ocf_log err "Unable to read waiting-for-first-master from attrd" return 1 fi # Evaluate the query result to place the variables in the local scope. eval ${query_result} if [ "$value" = "true" ]; then return 0 else return 1 fi } clear_waiting_for_first_master() { attrd_updater -n ${OCF_RESOURCE_INSTANCE}-waiting-for-first-master -D } have_master_with_priority() { # Go through each node and validate that at least one has # a set priority. Because we unset the priority on reboot # a lack of priority indicates that we need to select a # new master. for node in $OCF_RESKEY_node_list; do $CRM_MASTER -G -N $node >/dev/null 2>&1 rc=$? if [ $rc -eq 0 ]; then return 0 fi done return 1 } attempt_to_set_master() { ocf_log info "Attempting to set master" local expected_node_count if waiting_for_first_master; then # Wait for all nodes to come online expected_node_count=$OCF_RESKEY_CRM_meta_clone_max else # We accept one node being down. This is not arbitrary, # synchronous replication requires acknowledgement from # at least one host, which means only two nodes must have # the latest GTID. So a set of n - 1 ensures that we do # not lose any writes. 
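		# Example (assuming clone_max=3): when the cluster is no longer waiting for
		# its first master, expected_node_count becomes 3 - 1 = 2, so a new master
		# can be selected as soon as two nodes have published a GTID, even while
		# one node is down.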
expected_node_count=$(($OCF_RESKEY_CRM_meta_clone_max-1)) fi # Set the gtid for this node, making it available to other nodes set_gtid local node_count=0 local highest_gtid=0 local master_candidate="" for node in $OCF_RESKEY_node_list; do local node_gtid=$(read_gtid $node) if [ -z "$node_gtid" ]; then continue fi # Got a valid gtid, increment node count node_count=$(($node_count+1)) # Check if this is a good master candidate if greater_than_gtid $node_gtid $highest_gtid; then master_candidate=$node highest_gtid=$node_gtid fi done # If we managed to query a sufficient number of nodes # then set a master if [ $node_count -ge $expected_node_count ]; then ocf_log info "Promoting $master_candidate to master, highest gtid $highest_gtid, queried $node_count nodes." $CRM_MASTER -v 100 -N $master_candidate else ocf_log info "Not enough nodes ($node_count) contributed to select a master, need $expected_node_count nodes." fi } set_read_only() { # Sets or unsets read-only mode. Accepts one boolean as its # optional argument. If invoked without any arguments, defaults to # enabling read only mode. Should only be set in master/slave # setups. # Returns $OCF_SUCCESS if the operation succeeds, or # $OCF_ERR_GENERIC if it fails. local ro_val if ocf_is_true $1; then ro_val="on" else ro_val="off" fi ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "SET GLOBAL read_only=${ro_val}" } get_read_only() { # Check if read-only is set local read_only_state read_only_state=$($MYSQL $MYSQL_OPTIONS_REPL \ -e "SHOW VARIABLES" | grep -w read_only | awk '{print $2}') if [ "$read_only_state" = "ON" ]; then return 0 else return 1 fi } is_slave() { # Determine whether the machine is currently running as a MariaDB # slave, as determined per SHOW SLAVE STATUS. Returns 1 if SHOW # SLAVE STATUS creates an empty result set, 0 otherwise. local rc # Check whether this machine should be slave if ! get_read_only; then return 1 fi if get_slave_info; then # show slave status is not empty # Is the slave sql thread running, then we are a slave! 
if [ "$slave_sql" == 'Yes' ]; then return 0 else return 1 fi else # "SHOW SLAVE STATUS" returns an empty set if instance is not a # replication slave return 1 fi } parse_slave_info() { # Extracts field $1 from result of "SHOW SLAVE STATUS\G" from file $2 sed -ne "s/^.* $1: \(.*\)$/\1/p" < $2 } get_slave_info() { if [ "$master_log_file" -a "$master_host" ]; then # variables are already defined, get_slave_info has been run before return $OCF_SUCCESS else local tmpfile=$(mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX) $MYSQL $MYSQL_OPTIONS_REPL \ -e 'SHOW SLAVE STATUS\G' > $tmpfile if [ -s $tmpfile ]; then master_host=$(parse_slave_info Master_Host $tmpfile) master_user=$(parse_slave_info Master_User $tmpfile) master_port=$(parse_slave_info Master_Port $tmpfile) master_using_gtid=$(parse_slave_info Using_Gtid $tmpfile) master_log_file=$(parse_slave_info Master_Log_File $tmpfile) slave_sql=$(parse_slave_info Slave_SQL_Running $tmpfile) slave_io=$(parse_slave_info Slave_IO_Running $tmpfile) last_errno=$(parse_slave_info Last_Errno $tmpfile) last_error=$(parse_slave_info Last_Error $tmpfile) secs_behind=$(parse_slave_info Seconds_Behind_Master $tmpfile) last_io_errno=$(parse_slave_info Last_IO_Errno $tmpfile) last_io_error=$(parse_slave_info Last_IO_Error $tmpfile) ocf_log debug "MariaDB instance running as a replication slave" rm "$tmpfile" else # Instance produced an empty "SHOW SLAVE STATUS" output -- # instance is not a slave rm "$tmpfile" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS fi } check_slave() { # Checks slave status local rc new_master get_slave_info rc=$? if [ $rc -eq 0 ]; then # Check normal errors if [ $last_errno -ne 0 ]; then ocf_exit_reason "MariaDB slave replication has failed ($last_errno): $last_error" exit $OCF_ERR_GENERIC fi # Check IO Errors, ignore 2003 which indicates a connection failure to the master if [ $last_io_errno -ne 0 ] && [ $last_io_errno -ne 2003 ]; then ocf_exit_reason "MariaDB slave io has failed ($last_io_errno): $last_io_error" exit $OCF_ERR_GENERIC fi if [ $last_io_errno -eq 2003 ]; then ocf_log warn "MariaDB master not reachable from slave" fi if [ "$slave_io" != 'Yes' ]; then # Not necessarily a bad thing. The master may have # temporarily shut down, and the slave may just be # reconnecting. A warning can't hurt, though. ocf_log warn "MariaDB Slave IO threads currently not running." # Sanity check, are we at least on the right master new_master=$($CRM_ATTR_REPL_INFO --query -q) if [ "$master_host" != "$new_master" ]; then # Not pointing to the right master, not good, removing the VIPs set_reader_attr 0 exit $OCF_SUCCESS fi fi if [ "$slave_sql" != 'Yes' ]; then # We don't have a replication SQL thread running. Not a # good thing. Try to recoved by restarting the SQL thread # and remove reader vip. Prevent MariaDB restart. ocf_exit_reason "MariaDB Slave SQL threads currently not running." # Remove reader vip set_reader_attr 0 # try to restart slave ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "START SLAVE" # Return success to prevent a restart exit $OCF_SUCCESS fi ocf_log debug "MariaDB instance running as a replication slave" else # Instance produced an empty "SHOW SLAVE STATUS" output -- # instance is not a slave # TODO: Needs to handle when get_slave_info will return too many connections error ocf_exit_reason "check_slave invoked on an instance that is not a replication slave." exit $OCF_ERR_GENERIC fi } set_master() { local new_master=$($CRM_ATTR_REPL_INFO --query -q) # Informs the MariaDB server of the master to replicate # from. 
Accepts one mandatory argument which must contain the host # name of the new master host. The master must either be unchanged # from the laste master the slave replicated from, or freshly # reset with RESET MASTER. ocf_log info "Changing MariaDB configuration to replicate from $new_master." ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "CHANGE MASTER TO MASTER_HOST='$new_master', \ MASTER_PORT=$OCF_RESKEY_replication_port, \ MASTER_USER='$OCF_RESKEY_replication_user', \ MASTER_PASSWORD='$OCF_RESKEY_replication_passwd', \ MASTER_USE_GTID=current_pos"; } unset_master(){ # Instructs the MariaDB server to stop replicating from a master # host. # If we're currently not configured to be replicating from any # host, then there's nothing to do. But we do log a warning as # no-one but the CRM should be touching the MariaDB master/slave # configuration. if ! is_slave; then ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave" return $OCF_SUCCESS fi # Stop the slave I/O thread and wait for relay log # processing to complete ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE IO_THREAD" if [ $? -gt 0 ]; then ocf_exit_reason "Error stopping slave IO thread" exit $OCF_ERR_GENERIC fi local tmpfile=$(mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX) while true; do $MYSQL $MYSQL_OPTIONS_REPL \ -e 'SHOW PROCESSLIST\G' > $tmpfile if grep -i 'Has read all relay log' $tmpfile >/dev/null; then ocf_log info "MariaDB slave has finished processing relay log" break fi if ! grep -q 'system user' $tmpfile; then ocf_log info "Slave not runnig - not waiting to finish" break fi ocf_log info "Waiting for MariaDB slave to finish processing relay log" sleep 1 done rm -f $tmpfile # Now, stop all slave activity and unset the master host ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE" if [ $? -gt 0 ]; then ocf_exit_reason "Error stopping rest slave threads" exit $OCF_ERR_GENERIC fi ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "RESET SLAVE /*!50516 ALL */;" if [ $? -gt 0 ]; then ocf_exit_reason "Failed to reset slave" exit $OCF_ERR_GENERIC fi } # Start replication as slave start_slave() { ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "START SLAVE" } # Set the attribute controlling the readers VIP set_reader_attr() { local curr_attr_value curr_attr_value=$(get_reader_attr) if [ "$curr_attr_value" -ne "$1" ]; then $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $1 fi } # get the attribute controlling the readers VIP get_reader_attr() { local attr_value local rc attr_value=$($CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q) rc=$? if [ "$rc" -eq "0" ]; then echo $attr_value else echo -1 fi } # Determines what IP address is attached to the current host. The output of the # crm_attribute command looks like this: # scope=nodes name=IP value=10.2.2.161 # If the ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP node attribute is not defined, fallback is to uname -n # The ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP is the IP address that will be used for the # change master to command. get_local_ip() { local IP IP=$($CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G 2>/dev/null) if [ ! $? -eq 0 ]; then uname -n else echo $IP fi } ####################################################################### # Functions invoked by resource manager actions mysql_monitor() { local rc local status_loglevel="err" # Set loglevel to info during probe if ocf_is_probe; then status_loglevel="info" fi mysql_common_status $status_loglevel rc=$? 
# If status returned an error, return that immediately if [ $rc -ne $OCF_SUCCESS ]; then return $rc fi # Check if this instance is configured as a slave, and if so # check slave status if is_slave; then if ! check_slave; then return $OCF_ERR_GENERIC fi fi if [ -n "$OCF_RESKEY_test_table" ]; then # Check for test table ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \ -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table" rc=$? if [ $rc -ne 0 ]; then ocf_exit_reason "Failed to select from $test_table"; return $OCF_ERR_GENERIC; fi fi # Check if we are in read-only mode and there is no master # with priority then we attempt to select a master if get_read_only && ! have_master_with_priority; then attempt_to_set_master fi if ! get_read_only; then ocf_log debug "MariaDB monitor succeeded (master)"; return $OCF_RUNNING_MASTER else ocf_log debug "MariaDB monitor succeeded"; return $OCF_SUCCESS fi } mysql_start() { local rc if ! ocf_is_ms; then ocf_exit_reason "Resource is not configured as master/slave" return $OCF_ERR_GENERIC fi # Initialize the ReaderVIP attribute, monitor will enable it set_reader_attr 0 mysql_common_status info if [ $? = $OCF_SUCCESS ]; then ocf_log info "MariaDB already running" return $OCF_SUCCESS fi mysql_common_prepare_dirs mysql_common_start --skip-slave-start --log-slave-updates rc=$? if [ $rc != $OCF_SUCCESS ]; then return $rc fi # Enable semi-sync ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \ -e "SET GLOBAL rpl_semi_sync_slave_enabled='ON', \ rpl_semi_sync_master_enabled='ON', \ rpl_semi_sync_master_wait_no_slave='OFF', \ rpl_semi_sync_master_wait_point='AFTER_SYNC', \ gtid_strict_mode='ON', \ sync_binlog=1, \ sync_master_info=1, \ innodb_flush_log_at_trx_commit=1;" rc=$? if [ $rc -ne 0 ]; then ocf_exit_reason "Failed to enable semi-sync and set variables"; return $OCF_ERR_GENERIC; fi # We're configured as a stateful resource. We must start as # slave by default. At this point we don't know if the CRM has # already promoted a master. So, we simply start in read only # mode and make sure our old score is invalidated. set_read_only on $CRM_MASTER -D # Now, let's see whether there is a master. We might be a new # node that is just joining the cluster, and the CRM may have # promoted a master before. new_master_host=$(echo $OCF_RESKEY_CRM_meta_notify_master_uname|tr -d " ") if [ "$new_master_host" -a "$new_master_host" != ${NODENAME} ]; then set_master start_slave if [ $? -ne 0 ]; then ocf_exit_reason "Failed to start slave" return $OCF_ERR_GENERIC fi else ocf_log info "No MariaDB master present - clearing replication state, setting gtid in attrd, waiting for first master" unset_master set_waiting_for_first_master fi # Initial monitor action if [ -n "$OCF_RESKEY_test_table" -a -n "$OCF_RESKEY_test_user" -a -n "$OCF_RESKEY_test_passwd" ]; then OCF_CHECK_LEVEL=10 fi mysql_monitor rc=$? if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then ocf_exit_reason "Failed initial monitor action" return $rc fi ocf_log info "MariaDB started" return $OCF_SUCCESS } mysql_stop() { # clear preference for becoming master $CRM_MASTER -D # Remove VIP capability set_reader_attr 0 mysql_common_stop } mysql_promote() { local master_info if ( ! 
mysql_common_status err ); then return $OCF_NOT_RUNNING fi ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE" set_read_only off || return $OCF_ERR_GENERIC # Force the master to wait for timeout period on slave disconnect ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \ -e "SET GLOBAL rpl_semi_sync_master_wait_no_slave='ON';" # Set Master Info in CIB, cluster level attribute master_info="$(get_local_ip)" ${CRM_ATTR_REPL_INFO} -v "$master_info" # A master can accept reads set_reader_attr 1 # Clear the gtids in attrd now that there is a master clear_all_gtid return $OCF_SUCCESS } mysql_demote() { if ! mysql_common_status err; then return $OCF_NOT_RUNNING fi # Return to default no wait setting. ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \ -e "SET GLOBAL rpl_semi_sync_master_wait_no_slave='OFF';" # Return master preference to default, so the cluster manager gets # a chance to select a new master $CRM_MASTER -D } mysql_notify() { local type_op type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" ocf_log debug "Received $type_op notification." case "$type_op" in 'pre-promote') # A master is now being promoted, remove the waiting-for-first-master flag clear_waiting_for_first_master ;; 'post-promote') # The master has completed its promotion. Now is a good # time to check whether our replication slave is working # correctly. new_master_host=$(echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " ") if [ "$new_master_host" = ${NODENAME} ]; then ocf_log info "This will be the new master, ignoring post-promote notification." else ocf_log info "Resetting replication, uname of master: $new_master_host" unset_master if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi set_master if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi start_slave if [ $? -ne 0 ]; then ocf_exit_reason "Failed to start slave" return $OCF_ERR_GENERIC fi fi return $OCF_SUCCESS ;; 'pre-demote') demote_host=$(echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " ") if [ $demote_host = ${NODENAME} ]; then ocf_log info "pre-demote notification for $demote_host" set_read_only on if [ $? -ne 0 ]; then ocf_exit_reason "Failed to set read-only"; return $OCF_ERR_GENERIC; fi # Must kill all existing user threads because they are still Read/write # in order for the slaves to complete the read of binlogs local tmpfile=$(mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX) $MYSQL $MYSQL_OPTIONS_REPL -e "SHOW PROCESSLIST" > $tmpfile for thread in $(awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}' $tmpfile) do ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "KILL ${thread}" done rm -f $tmpfile else ocf_log info "Ignoring post-demote notification execpt for my own demotion." fi return $OCF_SUCCESS ;; 'post-demote') demote_host=$(echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " ") if [ $demote_host = ${NODENAME} ]; then ocf_log info "Ignoring post-demote notification for my own demotion." return $OCF_SUCCESS fi ocf_log info "post-demote notification for $demote_host." # The former master has just been gracefully demoted. unset_master ;; *) return $OCF_SUCCESS ;; esac } ####################################################################### ########################################################################## # If DEBUG_LOG is set, make this resource agent easy to debug: set up the # debug log and direct all output to it. Otherwise, redirect to /dev/null. 
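# For example (illustrative), the checks below are satisfied by:
#   mkdir -p -m 0700 /tmp/mysql.ocf.ra.debug && touch /tmp/mysql.ocf.ra.debug/log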
# The log directory must be a directory owned by root, with permissions 0700, # and the log must be writable and not a symlink. ########################################################################## DEBUG_LOG="/tmp/mysql.ocf.ra.debug/log" if [ "${DEBUG_LOG}" -a -w "${DEBUG_LOG}" -a ! -L "${DEBUG_LOG}" ]; then DEBUG_LOG_DIR="${DEBUG_LOG%/*}" if [ -d "${DEBUG_LOG_DIR}" ]; then exec 9>>"$DEBUG_LOG" exec 2>&9 date >&9 echo "$*" >&9 env | grep OCF_ | sort >&9 set -x else exec 9>/dev/null fi fi case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage|help) usage exit $OCF_SUCCESS;; esac mysql_common_validate rc=$? LSB_STATUS_STOPPED=3 if [ $rc -ne 0 ]; then case "$1" in stop) ;; monitor) mysql_common_status "info" if [ $? -eq $OCF_SUCCESS ]; then # if validatation fails and pid is active, always treat this as an error ocf_exit_reason "environment validation failed, active pid is in unknown state." exit $OCF_ERR_GENERIC fi # validation failed and pid is not active, it's safe to say this instance is inactive. exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; *) exit $rc;; esac fi # What kind of method was invoked? case "$1" in start) mysql_start;; stop) mysql_stop;; status) mysql_common_status err;; monitor) mysql_monitor;; promote) mysql_promote;; demote) mysql_demote;; notify) mysql_notify;; validate-all) exit $OCF_SUCCESS;; *) usage exit $OCF_ERR_UNIMPLEMENTED;; esac # vi:sw=4:ts=4:et: diff --git a/heartbeat/sybaseASE.in b/heartbeat/sybaseASE.in index a4a0b7a0c..b4809ea23 100755 --- a/heartbeat/sybaseASE.in +++ b/heartbeat/sybaseASE.in @@ -1,890 +1,890 @@ #!@BASH_SHELL@ # # Sybase Availability Agent for Red Hat Cluster v15.0.2 # Copyright (C) - 2007 # Sybase, Inc. All rights reserved. # # Sybase Availability Agent for Red Hat Cluster v15.0.2 is licensed # under the GNU General Public License Version 2. # # Author(s): # Jian-ping Hui # # Description: Service script for starting/stopping/monitoring \ # Sybase Adaptive Server on: \ # Red Hat Enterprise Linux 7 ES \ # Red Hat Enterprise Linux 7 AS # # NOTES: # # (1) Before running this script, we assume that user has installed # Sybase ASE 15.0.2 or higher version on the machine. Please # customize your configuration in /etc/cluster/cluster.conf according # to your actual environment. We assume the following files exist before # you start the service: # /$sybase_home/SYBASE.sh # /$sybase_home/$sybase_ase/install/RUN_$server_name # # (2) You can customize the interval value in the meta-data section if needed: -# -# +# +# # # -# -# +# +# # # -# -# +# +# # -# -# +# +# # The timeout value is not supported by Redhat in RHCS5.0. # ####################################################################### # Initialization: if [ -f /etc/init.d/functions ]; then . /etc/init.d/functions fi : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### # Default timeouts when we aren't using the rgmanager wrapper if ! 
ocf_is_true "$OCF_RESKEY_is_rgmanager_wrapper"; then if [ -z "$OCF_RESKEY_CRM_meta_timeout" ]; then case $1 in start|stop) OCF_RESKEY_CRM_meta_timeout=300000 ;; *) OCF_RESKEY_CRM_meta_timeout=100000 ;; esac fi default_timeout=$(((${OCF_RESKEY_CRM_meta_timeout}/1000) - 5)) default_force_stop_timeout=$(((${OCF_RESKEY_CRM_meta_timeout}/1000) - 5)) : ${OCF_RESKEY_shutdown_timeout=${default_force_stop_timeout}} : ${OCF_RESKEY_deep_probe_timeout=${default_timeout}} : ${OCF_RESKEY_start_timeout=${default_timeout}} fi sybase_user_default="sybase" sybase_home_default="detect" ase_default="detect" ocs_default="detect" : ${OCF_RESKEY_sybase_user=${sybase_user_default}} : ${OCF_RESKEY_sybase_ase=${ase_default}} : ${OCF_RESKEY_sybase_ocs=${ocs_default}} : ${OCF_RESKEY_sybase_home=${sybase_home_default}} if [ "$__OCF_ACTION" != "meta-data" ]; then if [ "$OCF_RESKEY_sybase_home" = "detect" ]; then if [ -d "/opt/sap" ]; then OCF_RESKEY_sybase_home="/opt/sap" elif [ -d "/opt/sybase" ]; then OCF_RESKEY_sybase_home="/opt/sybase" else ocf_log err "sybaseASE: Unable to detect 'sybase_home'." exit $OCF_ERR_ARGS fi fi sybase_env="$OCF_RESKEY_sybase_home/SYBASE.env" if [ "$OCF_RESKEY_sybase_ase" = "detect" ]; then if [ -f "$sybase_env" ]; then OCF_RESKEY_sybase_ase=$(grep "SYBASE_ASE" "$sybase_env" | cut -d= -f2) else ocf_log err "sybaseASE: Unable to detect 'sybase_ase'." exit $OCF_ERR_ARGS fi fi if [ "$OCF_RESKEY_sybase_ocs" = "detect" ]; then if [ -f "$sybase_env" ]; then OCF_RESKEY_sybase_ocs=$(grep "SYBASE_OCS" "$sybase_env" | cut -d= -f2) else ocf_log err "sybaseASE: Unable to detect 'sybase_ocs'." exit $OCF_ERR_ARGS fi fi fi interfaces_file_default="${OCF_RESKEY_sybase_home}/interfaces" : ${OCF_RESKEY_interfaces_file=${interfaces_file_default}} export LD_POINTER_GUARD=0 ####################################################################################### # Declare some variables we will use in the script. # ####################################################################################### declare login_string="" declare RUNSERVER_SCRIPT=$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase/install/RUN_$OCF_RESKEY_server_name declare CONSOLE_LOG=$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase/install/$OCF_RESKEY_server_name.log ################################################################################################## # This function will be called by Pacemaker to get the meta data of resource agent "sybaseASE". # ################################################################################################## meta_data() { cat < 1.0 Sybase ASE Failover Instance Sybase ASE Failover Instance The home directory of sybase products SYBASE home directory The directory name under sybase_home where ASE products are installed SYBASE_ASE directory name The directory name under sybase_home where OCS products are installed, i.e. ASE-15_0 SYBASE_OCS directory name The ASE server name which is configured for the HA service ASE server name The full path of interfaces file which is used to start/access the ASE server Interfaces file The user who can run ASE server Sybase user The database user required to login to isql. Sybase user The database user's password required to login to isql. 
Sybase user - - + + - - + + - - + + - - + + EOT } ase_engine0_process() { sed -n -e '/engine 0/s/^.*os pid \([0-9]*\).*online$/\1/p' $CONSOLE_LOG } ase_engine0_thread() { sed -n -e 's/.*Thread.*LWP \([0-9]*\).*online as engine 0.*/\1/p' $CONSOLE_LOG } ase_engine_threadpool_pid() { sed -n -e 's/.*Adaptive Server is running as process id \([0-9]*\).*/\1/p' $CONSOLE_LOG } ase_all_pids() { local PIDS=$(sed -n -e '/engine /s/^.*os pid \([0-9]*\).*online$/\1/p' $CONSOLE_LOG) if [ -z "$PIDS" ]; then #engines are running in a threadpool PIDS=$(ase_engine_threadpool_pid) fi echo $PIDS } ################################################################################################## # Function Name: verify_all # # Parameter: None # # Return value: # # 0 SUCCESS # # OCF_ERR_ARGS Parameters are invalid # # Description: Do some validation on the user-configurable stuff at the beginning of the script. # ################################################################################################## verify_all() { ocf_log debug "sybaseASE: Start 'verify_all'" check_binary "ksh" # Check if the parameter 'sybase_home' is set. if [[ -z "$OCF_RESKEY_sybase_home" ]] then ocf_log err "sybaseASE: The parameter 'sybase_home' is not set." return $OCF_ERR_ARGS fi # Check if the parameter 'sybase_home' is a valid path. if [[ ! -d $OCF_RESKEY_sybase_home ]] then ocf_log err "sybaseASE: The sybase_home '$OCF_RESKEY_sybase_home' doesn't exist." return $OCF_ERR_ARGS fi # Check if the script file SYBASE.sh exists if [[ ! -f $OCF_RESKEY_sybase_home/SYBASE.sh ]] then ocf_log err "sybaseASE: The file $OCF_RESKEY_sybase_home/SYBASE.sh is required to run this script. Failed to run the script." return $OCF_ERR_ARGS fi # Check if the parameter 'sybase_ase' is set. if [[ -z "$OCF_RESKEY_sybase_ase" ]] then ocf_log err "sybaseASE: The parameter 'sybase_ase' is not set." return $OCF_ERR_ARGS fi # Check if the directory /$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase exists. if [[ ! -d $OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase ]] then ocf_log err "sybaseASE: The directory '$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase' doesn't exist." return $OCF_ERR_ARGS fi # Check if the parameter 'sybase_ocs' is set. if [[ -z "$OCF_RESKEY_sybase_ocs" ]] then ocf_log err "sybaseASE: The parameter 'sybase_ocs' is not set." return $OCF_ERR_ARGS fi # Check if the directory /$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ocs exists. if [[ ! -d $OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ocs ]] then ocf_log err "sybaseASE: The directory '$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ocs' doesn't exist." return $OCF_ERR_ARGS fi # Check if the parameter 'server_name' is set. if [[ -z "$OCF_RESKEY_server_name" ]] then ocf_log err "sybaseASE: The parameter 'server_name' is not set." return $OCF_ERR_ARGS fi # Check if the Run_server file exists. if [[ ! -f $RUNSERVER_SCRIPT ]] then ocf_log err "sybaseASE: The file $RUNSERVER_SCRIPT doesn't exist. The sybase directory may be incorrect." return $OCF_ERR_ARGS fi # Check if the user 'sybase_user' exist id -u $OCF_RESKEY_sybase_user if [[ $? != 0 ]] then ocf_log err "sybaseASE: The user '$OCF_RESKEY_sybase_user' doesn't exist in the system." return $OCF_ERR_ARGS fi # Check if the parameter 'interfaces_file' is set if [[ -z "$OCF_RESKEY_interfaces_file" ]] then ocf_log err "sybaseASE: The parameter 'interfaces_file' is not set." return $OCF_ERR_ARGS fi # Check if the file 'interfaces_file' exists if [[ ! 
-f $OCF_RESKEY_interfaces_file ]] then ocf_log err "sybaseASE: The interfaces file '$OCF_RESKEY_interfaces_file' doesn't exist." return $OCF_ERR_ARGS fi # Check if the parameter 'db_user' is set if [[ -z "$OCF_RESKEY_db_user" ]] then ocf_log err "sybaseASE: The parameter 'db_user' is not set." return $OCF_ERR_ARGS fi # Check if the parameter 'shutdown_timeout' is a valid value if [[ $OCF_RESKEY_shutdown_timeout -eq 0 ]] then ocf_log err "sybaseASE: The parameter 'shutdown_timeout' is not set. Its value cannot be zero." return $OCF_ERR_ARGS fi # Check if the parameter 'start_timeout' is a valid value if [[ $OCF_RESKEY_start_timeout -eq 0 ]] then ocf_log err "sybaseASE: The parameter 'start_timeout' is not set. Its value cannot be zero." return $OCF_ERR_ARGS fi # Check if the parameter 'deep_probe_timeout' is a valid value if [[ $OCF_RESKEY_deep_probe_timeout -eq 0 ]] then ocf_log err "sybaseASE: The parameter 'deep_probe_timeout' is not set. Its value cannot be zero." return $OCF_ERR_ARGS fi ocf_log debug "sybaseASE: End 'verify_all' successfully." return $OCF_SUCCESS } set_login_string() { tmpstring="" login_sting="" login_string="-U$OCF_RESKEY_db_user -P$OCF_RESKEY_db_passwd" return 0 } ############################################################################################## # Function name: ase_start # # Parameter: None # # Return value: # # 0 SUCCESS # # 1 FAIL # # Description: This function is used to start the ASE server in primary or secondary server. # ############################################################################################## ase_start() { ocf_log debug "sybaseASE: Start 'ase_start'" # Check if the server is running. If yes, return SUCCESS directly. Otherwise, continue the start work. ase_is_running if [[ $? = 0 ]] then # The server is running. ocf_log info "sybaseASE: Server is running. Start is success." return $OCF_SUCCESS fi # The server is not running. We need to start it. # If the log file existed, delete it. if [[ -f $CONSOLE_LOG ]] then rm -f $CONSOLE_LOG fi ocf_log debug "sybaseASE: Starting '$OCF_RESKEY_server_name'..." # Run runserver script to start the server. Since this script will be run by root and ASE server # needs to be run by another user, we need to change the user to sybase_user first. Then, run # the script to start the server. su $OCF_RESKEY_sybase_user -c ksh << EOF # set required SYBASE environment by running SYBASE.sh. . $OCF_RESKEY_sybase_home/SYBASE.sh # Run the RUNSERVER_SCRIPT to start the server. . $RUNSERVER_SCRIPT > $CONSOLE_LOG 2>&1 & EOF # Monitor every 1 seconds if the server has # recovered, until RECOVERY_TIMEOUT. t=0 while [[ $t -le $OCF_RESKEY_start_timeout ]] do grep -s "Recovery complete." $CONSOLE_LOG > /dev/null 2>&1 if [[ $? != 0 ]] then # The server has not completed the recovery. We need to continue to monitor the recovery # process. t=`expr $t + 1` else # The server has completed the recovery. ocf_log info "sybaseASE: ASE server '$OCF_RESKEY_server_name' started successfully." break fi sleep 1 done # If $t is larger than start_timeout, it means the ASE server cannot start in given time. Otherwise, it # means the ASE server has started successfully. if [[ $t -gt $OCF_RESKEY_start_timeout ]] then # The server cannot start in specified time. We think the start is failed. ocf_log err "sybaseASE: Failed to start ASE server '$OCF_RESKEY_server_name'. Please check the server error log $CONSOLE_LOG for possible problems." return $OCF_ERR_GENERIC fi ase_is_running if [ $? 
-ne 0 ]; then ocf_log err "sybaseASE: ase_start could not detect database initialized properly." return $OCF_ERR_GENERIC fi ocf_log debug "sybaseASE: End 'ase_start' successfully." return $OCF_SUCCESS } ############################################################################################# # Function name: ase_stop # # Parameter: None # # Return value: # # 0 SUCCESS # # 1 FAIL # # Description: This function is used to stop the ASE server in primary or secondary server. # ############################################################################################# ase_stop() { ocf_log debug "sybaseASE: Start 'ase_stop'" # Check if the ASE server is still running. ase_is_running if [[ $? != 0 ]] then # The ASE server is not running. We need not to shutdown it. ocf_log info "sybaseASE: The dataserver $OCF_RESKEY_server_name is not running." return $OCF_SUCCESS fi set_login_string # Just in case things are hung, start a process that will wait for the # timeout period, then kill any remaining porcesses. We'll need to # monitor this process (set -m), so we can terminate it later if it is # not needed. set -m kill_ase $OCF_RESKEY_shutdown_timeout & KILL_PID=$! # If successful, we will also terminate watchdog process # Run "shutdown with nowait" from isql command line to shutdown the server su $OCF_RESKEY_sybase_user -c ksh << EOF # set required SYBASE environment by running SYBASE.sh. . $OCF_RESKEY_sybase_home/SYBASE.sh # Run "shutdown with nowait" to shutdown the server immediately. (echo "use master" ; echo go ; echo "shutdown with nowait"; echo go) | \ \$SYBASE/\$SYBASE_OCS/bin/isql $login_string -S$OCF_RESKEY_server_name -I$OCF_RESKEY_interfaces_file & EOF sleep 5 # Check if the server has been shut down successfully t=0 while [[ $t -lt $OCF_RESKEY_shutdown_timeout ]] do # Search "ueshutdown: exiting" in the server log. If found, it means the server has been shut down. # Otherwise, we need to wait. tail $CONSOLE_LOG | grep "ueshutdown: exiting" > /dev/null 2>&1 if [[ $? != 0 ]] then # The shutdown is still in processing. Wait... sleep 2 t=`expr $t+2` else # The shutdown is success. ocf_log info "sybaseASE: ASE server '$OCF_RESKEY_server_name' shutdown with isql successfully." break fi done # If $t is larger than shutdown_timeout, it means the ASE server cannot be shut down in given time. We need # to wait for the background kill process to kill the OS processes directly. if [[ $t -ge $OCF_RESKEY_shutdown_timeout ]] then ocf_log err "sybaseASE: Shutdown of '$OCF_RESKEY_server_name' from isql failed. Server is either down or unreachable." fi # Here, the ASE server has been shut down by isql command or killed by background process. We need to do # further check to make sure all processes have gone away before saying shutdown is complete. This stops the # other node from starting up the package before it has been stopped and the file system has been unmounted. # Get all processes ids from log file declare -a ENGINE_ALL=$(ase_all_pids) typeset -i num_procs=${#ENGINE_ALL[@]} # We cannot find any process id from log file. It may be because the log file is corrupted or be deleted. # In this case, we determine the shutdown is failed. if [[ ${#ENGINE_ALL[@]} -lt 1 ]] then ocf_log err "sybaseASE: Unable to find the process id from $CONSOLE_LOG." ocf_log err "sybaseASE: Stop ASE server failed." return $OCF_ERR_GENERIC fi # Monitor the system processes to make sure all ASE related processes have gone away. while true do # To every engine process, search it in system processes list. 
If it is not in the # list, it means this process has gone away. Otherwise, we need to wait for it is # killed by background process. for i in "${ENGINE_ALL[@]}" do ps -fu $OCF_RESKEY_sybase_user | awk '{print $2}' | grep $i | grep -v grep if [[ $? != 0 ]] then ocf_log debug "sybaseASE: $i process has stopped." c=0 while (( c < $num_procs )) do if [[ ${ENGINE_ALL[$c]} = $i ]] then unset ENGINE_ALL[$c] c=$num_procs fi (( c = c + 1 )) done fi done # To here, all processes should have gone away. if [[ ${#ENGINE_ALL[@]} -lt 1 ]] then # # Looks like shutdown was successful, so kill the # script to kill any hung processes, which we started earlier. # Check to see if the script is still running. If jobs # returns that the script is done, then we don't need to kill # it. # job=$(jobs | grep -v Done) if [[ ${job} != "" ]] then ocf_log debug "sybaseASE: Killing the kill_ase script." kill -15 $KILL_PID > /dev/null 2>&1 fi break fi sleep 5 done ocf_log debug "sybaseASE: End 'ase_stop'." return $OCF_SUCCESS } #################################################################################### # Function name: ase_is_running # # Parameter: None # # Return value: # # 0 ASE server is running # # 1 ASE server is not running or there are errors # # Description: This function is used to check if the ASE server is still running . # #################################################################################### ase_is_running() { local PID local THREAD # If the error log doesn't exist, we can say there is no ASE is running. if [[ ! -f $CONSOLE_LOG ]] then ocf_log debug "could not find console log $CONSOLE_LOG" return $OCF_NOT_RUNNING fi # The error log file exists. Check if the engine 0 is alive. PID=$(ase_engine0_process) if [ -n "$PID" ]; then kill -s 0 $PID > /dev/null 2>&1 if [ $? -eq 0 ]; then # The engine 0 is running. ocf_log debug "Found engine 0 pid $PID to be running" return $OCF_SUCCESS fi # The engine 0 is not running. return $OCF_NOT_RUNNING fi PID=$(ase_engine_threadpool_pid) THREAD=$(ase_engine0_thread) if [ -n "$PID" ] && [ -n "$THREAD" ]; then ps -AL | grep -q "${PID}[[:space:]]*${THREAD} " if [ $? -eq 0 ]; then # engine 0 thread is running ocf_log debug "Found engine 0 thread $THREAD in pid $PID to be running" return $OCF_SUCCESS fi # The engine 0 is not running. return $OCF_NOT_RUNNING fi return $OCF_ERR_GENERIC } #################################################################################### # Function name: kill_ase # # Parameter: # # DELAY The seconds to wait before killing the ASE processes. 0 means # # kill the ASE processes immediately. # # Return value: None # # 1 ASE server is not running or there are errors # # Description: This function is used to check if the ASE server is still running . # #################################################################################### kill_ase() { ocf_log debug "sybaseASE: Start 'kill_ase'." DELAY=$1 # Wait for sometime before sending a kill signal. t=0 while [[ $t -lt $DELAY ]] do sleep 1 t=`expr $t+1` done # Get the process ids from log file declare -a ENGINE_ALL=$(ase_all_pids) # If there is no process id found in the log file, we need not to continue. if [[ ${#ENGINE_ALL[@]} -lt 1 ]] then ocf_log err "sybaseASE: Unable to find the process id from $CONSOLE_LOG." return $OCF_ERR_GENERIC fi # Kill the datasever process(es) for pid in "${ENGINE_ALL[@]}" do kill -9 $pid > /dev/null 2>&1 if [[ $? != 0 ]] then ocf_log info "sybaseASE: kill_ase function did NOT find process $pid running." 
else ocf_log info "sybaseASE: kill_ase function did find process $pid running. Sent SIGTERM." fi done ocf_log debug "sybaseASE: End 'kill_ase'." return $OCF_SUCCESS } ##################################################################################### # Function name: ase_status # # Parameter: # # 0 Level 0 probe. In this level, we just check if engine 0 is alive # # 10 Level 10 probe. In this level, we need to probe if the ASE server # # still has response. # # Return value: # # 0 The server is still alive # # 1 The server is down # # Description: This function is used to check if the ASE server is still running. # ##################################################################################### ase_status() { local rc ocf_log debug "sybaseASE: Start 'ase_status'." # Step 1: Check if the engine 0 is alive ase_is_running rc=$? if [ $rc -ne 0 ]; then # ASE is down. Return fail to Pacemaker to trigger the failover process. ocf_log err "sybaseASE: ASE server is down." return $rc fi # ASE process is still alive. # Step2: If this is level 10 probe, We need to check if the ASE server still has response. if [[ $1 -gt 0 ]] then ocf_log debug "sybaseASE: Need to run deep probe." # Run deep probe deep_probe if [[ $? = 1 ]] then # Deep probe failed. This means the server has been down. ocf_log err "sybaseASE: Deep probe found the ASE server is down." return $OCF_ERR_GENERIC fi fi ocf_log debug "sybaseASE: End 'ase_status'." return $OCF_SUCCESS } #################################################################################### # Function name: deep_probe # # Parameter: None # # Return value: # # 0 ASE server is alive # # 1 ASE server is down # # Description: This function is used to run deep probe to make sure the ASE server # # still has response. # #################################################################################### deep_probe() { declare -i rv ocf_log debug "sybaseASE: Start 'deep_probe'." # Declare two temporary files which will be used in this probe. tmpfile1="$(mktemp /tmp/sybaseASE.1.XXXXXX)" tmpfile2="$(mktemp /tmp/sybaseASE.2.XXXXXX)" set_login_string rm -f $tmpfile1 rm -f $tmpfile2 # The login file is correct. We have gotten the login account and password from it. # Run isql command in background. su $OCF_RESKEY_sybase_user -c ksh << EOF # set required SYBASE environment by running SYBASE.sh. . $OCF_RESKEY_sybase_home/SYBASE.sh # Run a very simple SQL statement to make sure the server is still ok. The output will be put to # tmpfile1. (echo "select 1"; echo "go") | \$SYBASE/\$SYBASE_OCS/bin/isql $login_string -S$OCF_RESKEY_server_name -I$OCF_RESKEY_interfaces_file -t $OCF_RESKEY_deep_probe_timeout -e -o$tmpfile1 & # Record the isql command process id to temporary file. If the isql is hung, we need this process id # to kill the hung process. echo \$! > $tmpfile2 EOF declare -i t=0 # Monitor the output file tmpfile1. while [[ $t -lt $OCF_RESKEY_deep_probe_timeout ]] do # If the SQL statement is executed successfully, we will get the following output: # 1> select 1 # # ----------- # 1 # # (1 row affected) # So, we determine if the execution is success by searching the keyword "(1 row affected)". grep "(1 row affected)" $tmpfile1 if [[ $? = 0 ]] then ocf_log debug "sybaseASE: Deep probe sucess." break else sleep 1 t=`expr $t+1` fi done # If $t is larger than deep_probe_timeout, it means the isql command line cannot finish in given time. # This means the deep probe failed. We need to kill the isql process manually. 
if [[ $t -ge $OCF_RESKEY_deep_probe_timeout ]]
then
	ocf_log err "sybaseASE: Deep probe failed. The dataserver is not responding."

	# Read the process id of the isql process from tmpfile2
	pid=`cat $tmpfile2 | awk '{print $1}'`

	rm -f $tmpfile1
	rm -f $tmpfile2

	# Kill the isql process directly.
	kill -9 $pid
	return 1
fi

rm -f $tmpfile1
rm -f $tmpfile2

ocf_log debug "sybaseASE: End 'deep_probe'."

return 0
}

#############################
# Do some real work here... #
#############################
case $__OCF_ACTION in
	start)
		verify_all || exit $OCF_ERR_GENERIC
		ase_start
		exit $?
		;;
	stop)
		verify_all || exit $OCF_ERR_GENERIC
		ase_stop
		exit $?
		;;
	status | monitor)
		verify_all || exit $OCF_ERR_GENERIC
		ase_status $OCF_CHECK_LEVEL
		exit $?
		;;
	meta-data)
		meta_data
		exit $OCF_SUCCESS
		;;
	validate-all)
		verify_all
		exit $?
		;;
	*)
		echo "Usage: $0 {start|stop|monitor|status|validate-all|meta-data}"
		exit $OCF_ERR_UNIMPLEMENTED
		;;
esac
exit 0
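As a rough sketch of how this agent can be exercised by hand (the way ocf-tester or the cluster manager would drive it), assuming a typical /usr/lib/ocf installation and purely hypothetical paths, server name and credentials:

# All values below are placeholders; only sybase_home, server_name,
# db_user and db_passwd are set here, the remaining parameters fall
# back to the defaults/auto-detection implemented above.
export OCF_ROOT=/usr/lib/ocf
export OCF_RESKEY_sybase_home=/opt/sap
export OCF_RESKEY_server_name=ASE1
export OCF_RESKEY_db_user=sa
export OCF_RESKEY_db_passwd=secret

agent=$OCF_ROOT/resource.d/heartbeat/sybaseASE

$agent validate-all && $agent start
OCF_CHECK_LEVEL=10 $agent monitor   # a level above 0 triggers the isql deep probe
$agent stop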