diff --git a/heartbeat/SAPInstance b/heartbeat/SAPInstance index 510de7b63..da394f5a1 100755 --- a/heartbeat/SAPInstance +++ b/heartbeat/SAPInstance @@ -1,941 +1,942 @@ #!/bin/sh # # SAPInstance # # Description: Manages a single SAP Instance as a High-Availability # resource. One SAP Instance is defined by one # SAP Instance-Profile. start/stop handels all services # of the START-Profile, status and monitor care only # about essential services. # # Author: Alexander Krauth, June 2006 # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2006-2008 Alexander Krauth # # An example usage: # See usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_InstanceName # OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) # OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_WAITTIME (optional, to solve timing problems during J2EE-Addin start) # OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery using cleanipc, default is false) # OCF_RESKEY_MONITOR_SERVICES (optional, default is to monitor critical services only) # OCF_RESKEY_SHUTDOWN_METHOD (optional, defaults to NORMAL, KILL: terminate the SAP instance with OS commands - faster, at your own risk) # OCF_RESKEY_ERS_InstanceName (optional, InstanceName of the ERS instance in a Master/Slave configuration) # OCF_RESKEY_ERS_START_PROFILE (optional, START_PROFILE of the ERS instance in a Master/Slave configuration) # OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started) # OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started) # OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped) # OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped) # # TODO: - Option to shutdown sapstartsrv for non-active instances -> that means: do probes only with OS tools (sapinstance_status) # - Option for better standalone enqueue server monitoring, using ensmon (test enque-deque) # - Option for cleanup abandoned enqueue replication tables # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### SH=/bin/sh sapinstance_usage() { methods=`sapinstance_methods` methods=`echo $methods | tr ' ' '|'` cat <<-! usage: $0 ($methods) $0 manages a SAP Instance as an HA resource. The 'start' operation starts the instance or the ERS instance in a Master/Slave configuration The 'stop' operation stops the instance The 'status' operation reports whether the instance is running The 'monitor' operation reports whether the instance seems to be working The 'promote' operation starts the primary instance in a Master/Slave configuration The 'demote' operation stops the primary instance and starts the ERS instance The 'notify' operation always returns SUCCESS The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports ! } sapinstance_meta_data() { cat < 2.14 Manages a SAP instance as an HA resource. Usually a SAP system consists of one database and at least one or more SAP instances (sometimes called application servers). One SAP Instance is defined by having exactly one instance profile. The instance profiles can usually be found in the directory /sapmnt/SID/profile. Each instance must be configured as it's own resource in the cluster configuration. The resource agent supports the following SAP versions: - SAP WebAS ABAP Release 6.20 - 7.30 - SAP WebAS Java Release 6.40 - 7.30 - SAP WebAS ABAP + Java Add-In Release 6.20 - 7.30 (Java is not monitored by the cluster in that case) When using a SAP Kernel 6.40 please check and implement the actions from the section "Manual postprocessing" from SAP note 995116 (http://sdn.sap.com). All operations of the SAPInstance resource agent are done by using the startup framework called SAP Management Console or sapstartsrv that was introduced with SAP kernel release 6.40. Find more information about the SAP Management Console in SAP note 1014480. Using this framework defines a clear interface for the Heartbeat cluster, how it sees the SAP system. The options for monitoring the SAP system are also much better than other methods like just watching the ps command for running processes or doing some pings to the application. sapstartsrv uses SOAP messages to request the status of running SAP processes. Therefore it can actually ask a process itself what it's status is, independent from other problems that might exist at the same time. sapstartsrv knows 4 status colours: - GREEN = everything is fine - YELLOW = something is wrong, but the service is still working - RED = the service does not work - GRAY = the service has not been started The SAPInstance resource agent will interpret GREEN and YELLOW as OK. That means that minor problems will not be reported to the Heartbeat cluster. This prevents the cluster from doing an unwanted failover. The statuses RED and GRAY are reported as NOT_RUNNING to the cluster. Depending on the status the cluster expects from the resource, it will do a restart, failover or just nothing. The full qualified SAP instance name. e.g. P01_DVEBMGS00_sapp01ci. Usually this is the name of the SAP instance profile. Instance name: SID_INSTANCE_VIR-HOSTNAME The full qualified path where to find sapstartsrv and sapcontrol. Specify this parameter, if you have changed the SAP kernel directory location after the default SAP installation. Path of sapstartsrv and sapcontrol The full qualified path where to find the SAP START profile. Specify this parameter, if you have changed the SAP profile directory location after the default SAP installation. Path of start profile The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than. Start profile name After that time in seconds a monitor operation is executed by the resource agent. Does the monitor return SUCCESS, the start ishandled as SUCCESS. This is useful to resolve timing problems with e.g. the J2EE-Addin instance.Usually the resource agent waits until all services are started and the SAP Management Console reports a GREEN status. A double stack installation (ABAP + Java AddIn) consists of an ABAP dispatcher and aJAVA instance. Normally the start of the JAVA instance takes much longer than the start of the ABAP instance. For a JAVA Instance you may need to configure a much higher timeout for the start operation of the resource in Heartbeat. The disadvantage here is, that the discovery of a failed start by the cluster takes longer. Somebody might say: For me it is important, that the ABAP instance is up and running. A failure of the JAVA instance shall not cause a failover of the SAP instance. Actually the SAP MC reports a YELLOW status, if the JAVA instance of a double stack system fails. From the resource agent point of view YELLOW means:everything is OK. Setting START_WAITTIME to a lower value determines the resource agent to check the status of the instance during a start operation after that time. As it would wait normally for a GREEN status, now it reports SUCCESS to the cluster in case of a YELLOW status already after the specified time. That is only useful for double stack systems. Check the successful start after that time (do not wait for J2EE-Addin) The SAPInstance resource agent tries to recover a failed start attempt automaticaly one time. This is done by killing runing instance processes, removing the kill.sap file and executing cleanipc. Sometimes a crashed SAP instance leaves some processes and/or shared memory segments behind. Setting this option to true will try to remove those leftovers during a start operation. That is to reduce manual work for the administrator. Enable or disable automatic startup recovery Within a SAP instance there can be several services. Usually you will find the defined services in the START profile of the related instance (Attention: with SAP Release 7.10 the START profile content was moved to the instance profile). Not all of those services are worth to monitor by the cluster. For example you properly do not like to failover your SAP instance, if the central syslog collector daemon fails. Those services are monitored within the SAPInstance resource agent: - disp+work - msg_server - enserver - enrepserver - jcontrol - jstart That names match the strings used in the output of the command 'sapcontrol -nr [Instance-Nr] -function GetProcessList'. The default should fit most cases where you want to manage a SAP Instance from the cluster. You may change this with this parameter, if you like to monitor more/less or other services that sapstartsrv supports. You may specify multiple services seperated by a | (pipe) sign in this parameter: disp+work|msg_server|enserver Services to monitor Usual a SAP Instance is stopped by the command 'sapcontrol -nr InstanceNr -function Stop'. SHUTDOWN_METHOD=KILL means to kill the SAP Instance using OS commands. SAP processes of the instance are terminated with 'kill -9', shared memory is deleted with 'cleanipc' and the 'kill.sap' file will be deleted. That method is much faster than the gracefull stop, but the instance does not have the chance to say goodbye to other SAPinstances in the same system. USE AT YOUR OWN RISK !! Shutdown graceful or kill a SAP instance by terminating the processes. (normal|KILL) Only used in a Master/Slave resource configuration: The full qualified SAP enqueue replication instance name. e.g. P01_ERS02_sapp01ers. Usually this is the name of the SAP instance profile. The enqueue replication instance must be installed, before you want to configure a master-slave cluster recource. The master-slave configuration in the cluster must use this properties: clone_max = 2 clone_node_max = 1 master_node_max = 1 master_max = 1 Enqueue replication instance name: SID_INSTANCE_VIR-HOSTNAME Only used in a Master/Slave resource configuration: The parameter ERS_InstanceName must also be set in this configuration. The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than. Enqueue replication start profile name The full qualified path where to find a script or program which should be executed before this resource gets started. Path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got started. Path to a post-start script The full qualified path where to find a script or program which should be executed before this resource gets stopped. Path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got stopped. Path to a post-start script END } # # methods: What methods/operations do we support? # sapinstance_methods() { cat <<-! start stop status monitor promote demote notify validate-all methods meta-data usage ! } # # is_clone : find out if we are configured to run in a Master/Slave configuration # is_clone() { if [ -n "$OCF_RESKEY_CRM_meta_clone_max" ] \ && [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ] then if [ "$OCF_RESKEY_CRM_meta_clone_max" -ne 2 ] || \ [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] || \ [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] || \ [ "$OCF_RESKEY_CRM_meta_master_max" -ne 1 ] then ocf_log err "Clone options misconfigured. (expect: clone_max=2,clone_node_max=1,master_node_max=1,master_max=1)" exit $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_ERS_InstanceName" ] then ocf_log err "In a Master/Slave configuration the ERS_InstanceName parameter is mandatory." exit $OCF_ERR_ARGS fi else return 0 fi return 1 } # # abnormal_end : essential things are missing, but in the natur of a SAP installation - which can be very different # from customer to customer - we cannot handle this always as an error # This would be the case, if the software is installed on shared disks and not visible # to all cluster nodes at all times. # abnormal_end() { local err_msg=$1 ocf_is_probe && { sapinstance_status exit $? } if [ "$ACTION" = "stop" ] then cleanup_instance exit $OCF_SUCCESS fi ocf_log err $err_msg exit $OCF_ERR_CONFIGURED } # # sapinstance_init : Define global variables with default values, if optional parameters are not set # # sapinstance_init() { local myInstanceName="$1" SID=`echo "$myInstanceName" | cut -d_ -f1` InstanceName=`echo "$myInstanceName" | cut -d_ -f2` InstanceNr=`echo "$InstanceName" | sed 's/.*\([0-9][0-9]\)$/\1/'` SAPVIRHOST=`echo "$myInstanceName" | cut -d_ -f3` # optional OCF parameters, we try to guess which directories are correct if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] then if have_binary /usr/sap/$SID/$InstanceName/exe/sapstartsrv && have_binary /usr/sap/$SID/$InstanceName/exe/sapcontrol then DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" SAPSTARTSRV="/usr/sap/$SID/$InstanceName/exe/sapstartsrv" SAPCONTROL="/usr/sap/$SID/$InstanceName/exe/sapcontrol" elif have_binary /usr/sap/$SID/SYS/exe/run/sapstartsrv && have_binary /usr/sap/$SID/SYS/exe/run/sapcontrol then DIR_EXECUTABLE="/usr/sap/$SID/SYS/exe/run" SAPSTARTSRV="/usr/sap/$SID/SYS/exe/run/sapstartsrv" SAPCONTROL="/usr/sap/$SID/SYS/exe/run/sapcontrol" fi else if have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" && have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" then DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE" SAPSTARTSRV="$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" SAPCONTROL="$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" fi fi sidadm="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" [ -z "$DIR_EXECUTABLE" ] && abnormal_end "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!" if [ -z "$OCF_RESKEY_DIR_PROFILE" ] then DIR_PROFILE="/usr/sap/$SID/SYS/profile" else DIR_PROFILE="$OCF_RESKEY_DIR_PROFILE" fi if [ "$myInstanceName" != "$OCF_RESKEY_InstanceName" ] then currentSTART_PROFILE=$OCF_RESKEY_ERS_START_PROFILE else currentSTART_PROFILE=$OCF_RESKEY_START_PROFILE fi if [ -z "$currentSTART_PROFILE" ] then SAPSTARTPROFILE="$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}" else SAPSTARTPROFILE="$currentSTART_PROFILE" fi if [ -z "$OCF_RESKEY_START_WAITTIME" ] then export OCF_RESKEY_START_WAITTIME=3600 fi if [ -z "$OCF_RESKEY_MONITOR_SERVICES" ] then export OCF_RESKEY_MONITOR_SERVICES="disp+work|msg_server|enserver|enrepserver|jcontrol|jstart" fi # as root user we need the library path to the SAP kernel to be able to call sapcontrol if [ `echo $LD_LIBRARY_PATH | grep -c "^$DIR_EXECUTABLE\>"` -eq 0 ]; then LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH export LD_LIBRARY_PATH fi return $OCF_SUCCESS } # # check_sapstartsrv : Before using sapcontrol we make sure that the sapstartsrv is running for the correct instance. # We cannot use sapinit and the /usr/sap/sapservices file in case of an enquerep instance, # because then we have two instances with the same instance number. # check_sapstartsrv() { local restart=0 local runninginst="" local chkrc=$OCF_SUCCESS local output="" if [ ! -S /tmp/.sapstream5${InstanceNr}13 ]; then ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName (no UDS), it will be started now" restart=1 else output=`$SAPCONTROL -nr $InstanceNr -function ParameterValue INSTANCE_NAME -format script` if [ $? -eq 0 ] then runninginst=`echo "$output" | grep '^0 : ' | cut -d' ' -f3` if [ "$runninginst" != "$InstanceName" ] then ocf_log warn "sapstartsrv is running for instance $runninginst, that service will be killed" restart=1 else output=`$SAPCONTROL -nr $InstanceNr -function AccessCheck Start` if [ $? -ne 0 ]; then ocf_log warn "FAILED : sapcontrol -nr $InstanceNr -function AccessCheck Start (`ls -ld1 /tmp/.sapstream5${InstanceNr}13`)" ocf_log warn "sapstartsrv will be restarted to try to solve this situation, otherwise please check sapstsartsrv setup (SAP Note 927637)" restart=1 fi fi else ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName, it will be started now" restart=1 fi fi if [ -z "$runninginst" ]; then runninginst=$InstanceName; fi if [ $restart -eq 1 ] then if [ -d /usr/sap/$SID/SYS/profile/ ] then DIR_PROFILE="/usr/sap/$SID/SYS/profile" else abnormal_end "Expected /usr/sap/$SID/SYS/profile/ to be a directory, please set DIR_PROFILE parameter!" fi [ ! -r $SAPSTARTPROFILE ] && abnormal_end "Expected $SAPSTARTPROFILE to be the instance START profile, please set START_PROFILE parameter!" pkill -9 -f "sapstartsrv.*$runninginst" # removing the unix domain socket files as they might have wrong permissions # or ownership - they will be recreated by sapstartsrv during next start rm -f /tmp/.sapstream5${InstanceNr}13 rm -f /tmp/.sapstream5${InstanceNr}14 $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm # now make sure the daemon has been started and is able to respond local srvrc=1 while [ $srvrc -eq 1 -a `pgrep -f "sapstartsrv.*$runninginst" | wc -l` -gt 0 ] do sleep 1 $SAPCONTROL -nr $InstanceNr -function GetProcessList > /dev/null 2>&1 srvrc=$? done if [ $srvrc -ne 1 ] then ocf_log info "sapstartsrv for instance $SID-$InstanceName was restarted !" chkrc=$OCF_SUCCESS else ocf_log error "sapstartsrv for instance $SID-$InstanceName could not be started!" chkrc=$OCF_ERR_GENERIC ocf_is_probe && chkrc=$OCF_NOT_RUNNING fi fi return $chkrc } # # sapuserexit : Many SAP customers need some additional processes/tools to run their SAP systems. # This specialties do not allow a totally generic SAP cluster resource agent. # Someone should write a resource agent for each additional process you need, if it # is required to monitor that process within the cluster manager. To enable # you to extent this resource agent without developing a new one, this user exit # was introduced. # sapuserexit() { local NAME="$1" local VALUE="$2" if [ -n "$VALUE" ] then if have_binary "$VALUE" then ocf_log info "Calling userexit ${NAME} with customer script file ${VALUE}" "$VALUE" >/dev/null 2>&1 ocf_log info "Exiting userexit ${NAME} with customer script file ${VALUE}, returncode: $?" else ocf_log warn "Attribute ${NAME} is set to ${VALUE}, but this file is not executable" fi fi return 0 } # # cleanup_instance : remove resources (processes and shared memory) from a crashed instance) # cleanup_instance() { pkill -9 -f -U $sidadm $InstanceName ocf_log info "Terminated instance using 'pkill -9 -f -U $sidadm $InstanceName'" # it is necessary to call cleanipc as user sidadm if the system has 'vmcj/enable = ON' set - otherwise SHM-segments in /dev/shm/SAP_ES2* cannot beremoved su - $sidadm -c "cleanipc $InstanceNr remove" ocf_log info "Tried to remove shared memory resources using 'cleanipc $InstanceNr remove' as user $sidadm" ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/kill.sap ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/shutdown.sap ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgcpid ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgspid return 0 } # # sapinstance_start : Start the SAP instance # sapinstance_start() { sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" local rc=$OCF_NOT_RUNNING local output="" local loopcount=0 while [ $loopcount -lt 2 ] do loopcount=$(($loopcount + 1)) check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ]; then output=`$SAPCONTROL -nr $InstanceNr -function Start` rc=$? ocf_log info "Starting SAP Instance $SID-$InstanceName: $output" fi if [ $rc -ne 0 ] then ocf_log err "SAP Instance $SID-$InstanceName start failed." return $OCF_ERR_GENERIC fi local startrc=1 while [ $startrc -gt 0 ] do local waittime_start=`date +%s` output=`$SAPCONTROL -nr $InstanceNr -function WaitforStarted $OCF_RESKEY_START_WAITTIME 10` startrc=$? local waittime_stop=`date +%s` if [ $startrc -ne 0 ] then if [ $(($waittime_stop - $waittime_start)) -ge $OCF_RESKEY_START_WAITTIME ] then sapinstance_monitor NOLOG if [ $? -eq $OCF_SUCCESS ] then output="START_WAITTIME ($OCF_RESKEY_START_WAITTIME) has elapsed, but instance monitor returned SUCCESS. Instance considered running." startrc=0; loopcount=2 fi else if [ $loopcount -eq 1 ] && ocf_is_true $OCF_RESKEY_AUTOMATIC_RECOVER then ocf_log warn "SAP Instance $SID-$InstanceName start failed: $output" ocf_log warn "Try to recover $SID-$InstanceName" cleanup_instance else loopcount=2 fi startrc=-1 fi else loopcount=2 fi done done if [ $startrc -eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName started: $output" rc=$OCF_SUCCESS sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" else ocf_log err "SAP Instance $SID-$InstanceName start failed: $output" rc=$OCF_NOT_RUNNING fi return $rc } # # sapinstance_recover: Try startup of failed instance by cleaning up resources # sapinstance_recover() { cleanup_instance sapinstance_start return $? } # # sapinstance_stop: Stop the SAP instance # sapinstance_stop() { local output="" local rc sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" if [ "$OCF_RESKEY_SHUTDOWN_METHOD" = "KILL" ] then ocf_log info "Stopping SAP Instance $SID-$InstanceName with shutdown method KILL!" cleanup_instance return $OCF_SUCCESS fi check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ]; then output=`$SAPCONTROL -nr $InstanceNr -function Stop` rc=$? ocf_log info "Stopping SAP Instance $SID-$InstanceName: $output" fi if [ $rc -eq 0 ] then output=`$SAPCONTROL -nr $InstanceNr -function WaitforStopped 3600 1` if [ $? -eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName stopped: $output" rc=$OCF_SUCCESS else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" return $rc } # # sapinstance_monitor: Can the given SAP instance do anything useful? # sapinstance_monitor() { local MONLOG=$1 local rc check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ] then local count=0 local SERVNO local output output=`$SAPCONTROL -nr $InstanceNr -function GetProcessList -format script` # we have to parse the output, because the returncode doesn't tell anything about the instance status for SERVNO in `echo "$output" | grep '^[0-9] ' | cut -d' ' -f1 | sort -u` do local COLOR=`echo "$output" | grep "^$SERVNO dispstatus: " | cut -d' ' -f3` local SERVICE=`echo "$output" | grep "^$SERVNO name: " | cut -d' ' -f3` local STATE=0 local SEARCH case $COLOR in GREEN|YELLOW) STATE=$OCF_SUCCESS;; *) STATE=$OCF_NOT_RUNNING;; esac SEARCH=`echo "$OCF_RESKEY_MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'` if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ] then if [ $STATE -eq $OCF_NOT_RUNNING ] then [ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !" rc=$STATE fi count=1 fi done if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] then if ocf_is_probe then rc=$OCF_NOT_RUNNING else [ "$MONLOG" != "NOLOG" ] && ocf_log err "The SAP instance does not run any services which this RA could monitor!" rc=$OCF_ERR_GENERIC fi fi fi return $rc } # # sapinstance_status: Lightweight check of SAP instance only with OS tools # sapinstance_status() { local pid local pids [ ! -f "/usr/sap/$SID/$InstanceName/work/kill.sap" ] && return $OCF_NOT_RUNNING pids=`grep '^kill -[0-9]' /usr/sap/$SID/$InstanceName/work/kill.sap | awk '{print $3}'` for pid in $pids do [ `pgrep -f -U $sidadm $InstanceName | grep -c $pid` -gt 0 ] && return $OCF_SUCCESS done return $OCF_NOT_RUNNING } # # sapinstance_validate: Check the symantic of the input parameters # sapinstance_validate() { local rc=$OCF_SUCCESS if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SID' is not a valid system ID!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceName" | grep -c '^[A-Z].*[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceName' is not a valid instance name!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceNr" | grep -c '^[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceNr' is not a valid instance number!" rc=$OCF_ERR_ARGS fi if [ `echo "$SAPVIRHOST" | grep -c '^[A-Za-z][A-Za-z0-9_-]*$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SAPVIRHOST' is not a valid hostname!" rc=$OCF_ERR_ARGS fi return $rc } # # sapinstance_start_clone # sapinstance_start_clone() { sapinstance_init $OCF_RESKEY_ERS_InstanceName ${HA_SBIN_DIR}/crm_master -v 50 -l reboot sapinstance_start return $? } # # sapinstance_stop_clone # sapinstance_stop_clone() { sapinstance_init $OCF_RESKEY_ERS_InstanceName ${HA_SBIN_DIR}/crm_master -v 0 -l reboot sapinstance_stop return $? } # # sapinstance_monitor_clone # sapinstance_monitor_clone() { # first check with the status function (OS tools) if there could be something like a SAP instance running # as we do not know here, if we are in master or slave state we do not want to start our monitoring # agents (sapstartsrv) on the wrong host local rc sapinstance_init $OCF_RESKEY_InstanceName if sapinstance_status; then if sapinstance_monitor; then ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot return $OCF_RUNNING_MASTER fi # by nature of the SAP enqueue server we have to make sure # that we do a failover to the slave (enqueue replication server) # in case the enqueue process has failed. We signal this to the # cluster by setting our master preference to a lower value than the slave. ${HA_SBIN_DIR}/crm_master -v 10 -l reboot return $OCF_FAILED_MASTER fi sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_status && sapinstance_monitor rc=$? if [ $rc -eq $OCF_SUCCESS ]; then ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot fi return $rc } # # sapinstance_promote_clone: In a Master/Slave configuration get Master by starting the SCS instance and stopping the ERS instance # The order is important here to behave correct from the application levels view # sapinstance_promote_clone() { local rc sapinstance_init $OCF_RESKEY_InstanceName ocf_log info "Promoting $SID-$InstanceName to running Master." sapinstance_start rc=$? if [ $rc -eq $OCF_SUCCESS ]; then sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_stop rc=$? fi return $rc } # # sapinstance_demote_clone: In a Master/Slave configuration get Slave by stopping the SCS instance and starting the ERS instance # sapinstance_demote_clone() { local rc sapinstance_init $OCF_RESKEY_InstanceName ocf_log info "Demoting $SID-$InstanceName to a slave." sapinstance_stop rc=$? if [ $rc -eq $OCF_SUCCESS ]; then sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_start rc=$? fi return $rc } # # sapinstance_notify: Handle master scoring - to make sure a slave gets the next master # sapinstance_notify() { local n_type="$OCF_RESKEY_CRM_meta_notify_type" local n_op="$OCF_RESKEY_CRM_meta_notify_operation" if [ "${n_type}_${n_op}" = "post_promote" ]; then # After promotion of one master in the cluster, we make sure that all clones reset their master # value back to 100. This is because a failed monitor on a master might have degree one clone # instance to score 10. ${HA_SBIN_DIR}/crm_master -v 100 -l reboot elif [ "${n_type}_${n_op}" = "pre_demote" ]; then # if we are a slave and a demote event is anounced, make sure we have the highes wish to became master # that is, when a slave resource was startet after the promote event of a already running master (e.g. node of slave was down) # We also have to make sure to overrule the globaly set resource_stickiness or any fail-count factors => INFINITY local n_uname="$OCF_RESKEY_CRM_meta_notify_demote_uname" - if [ ${n_uname} != ${HOSTNAME} ]; then + if [ ${n_uname} != ${NODENAME} ]; then ${HA_SBIN_DIR}/crm_master -v INFINITY -l reboot fi fi } # # 'main' starts here... # ## GLOBALS SID="" sidadm="" InstanceName="" InstanceNr="" SAPVIRHOST="" DIR_EXECUTABLE="" SAPSTARTSRV="" SAPCONTROL="" DIR_PROFILE="" SAPSTARTPROFILE="" CLONE=0 +NODENAME=$(ocf_local_nodename) if ( [ $# -ne 1 ] ) then sapinstance_usage exit $OCF_ERR_ARGS fi ACTION=$1 if [ "$ACTION" = "status" ]; then ACTION=monitor fi # These operations don't require OCF instance parameters to be set case "$ACTION" in usage|methods) sapinstance_$ACTION exit $OCF_SUCCESS;; meta-data) sapinstance_meta_data exit $OCF_SUCCESS;; notify) sapinstance_notify exit $OCF_SUCCESS;; *);; esac if ! ocf_is_root then ocf_log err "$0 must be run as root" exit $OCF_ERR_PERM fi # parameter check if [ -z "$OCF_RESKEY_InstanceName" ] then ocf_log err "Please set OCF_RESKEY_InstanceName to the name to the SAP instance profile!" exit $OCF_ERR_ARGS fi is_clone; CLONE=$? if [ ${CLONE} -eq 1 ] then CLACT=_clone else if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ] then ocf_log err "$ACTION called in a non master/slave environment" exit $OCF_ERR_ARGS fi sapinstance_init $OCF_RESKEY_InstanceName fi # What kind of method was invoked? case "$ACTION" in start|stop|monitor|promote|demote) sapinstance_$ACTION$CLACT exit $?;; validate-all) sapinstance_validate exit $?;; *) sapinstance_methods exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/mysql b/heartbeat/mysql index e569de228..5e332a15e 100755 --- a/heartbeat/mysql +++ b/heartbeat/mysql @@ -1,1274 +1,1274 @@ #!/bin/sh # # # MySQL # # Description: Manages a MySQL database as Linux-HA resource # # Authors: Alan Robertson: DB2 Script # Jakub Janczak: rewrite as MySQL # Andrew Beekhof: cleanup and import # Sebastian Reitenbach: add OpenBSD defaults, more cleanup # Narayan Newton: add Gentoo/Debian defaults # Marian Marinov, Florian Haas: add replication capability # Yves Trudeau, Baron Schwartz: add VIP support and improve replication # # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # # (c) 2002-2005 International Business Machines, Inc. # 2005-2010 Linux-HA contributors # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 mysql # # See usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_binary # OCF_RESKEY_client_binary # OCF_RESKEY_config # OCF_RESKEY_datadir # OCF_RESKEY_user # OCF_RESKEY_group # OCF_RESKEY_test_table # OCF_RESKEY_test_user # OCF_RESKEY_test_passwd # OCF_RESKEY_enable_creation # OCF_RESKEY_additional_parameters # OCF_RESKEY_log # OCF_RESKEY_pid # OCF_RESKEY_socket # OCF_RESKEY_replication_user # OCF_RESKEY_replication_passwd # OCF_RESKEY_replication_port # OCF_RESKEY_max_slave_lag # OCF_RESKEY_evict_outdated_slaves # OCF_RESKEY_reader_attribute ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### # Attempt to detect a default binary OCF_RESKEY_binary_default=$(which mysqld_safe 2> /dev/null) if [ "$OCF_RESKEY_binary_default" = "" ]; then OCF_RESKEY_binary_default=$(which safe_mysqld 2> /dev/null) fi # Fill in some defaults if no values are specified HOSTOS=`uname` if [ "X${HOSTOS}" = "XOpenBSD" ];then if [ "$OCF_RESKEY_binary_default" = "" ]; then OCF_RESKEY_binary_default="/usr/local/bin/mysqld_safe" fi OCF_RESKEY_config_default="/etc/my.cnf" OCF_RESKEY_datadir_default="/var/mysql" OCF_RESKEY_user_default="_mysql" OCF_RESKEY_group_default="_mysql" OCF_RESKEY_log_default="/var/log/mysqld.log" OCF_RESKEY_pid_default="/var/mysql/mysqld.pid" OCF_RESKEY_socket_default="/var/run/mysql/mysql.sock" else if [ "$OCF_RESKEY_binary_default" = "" ]; then OCF_RESKEY_binary_default="/usr/bin/safe_mysqld" fi OCF_RESKEY_config_default="/etc/my.cnf" OCF_RESKEY_datadir_default="/var/lib/mysql" OCF_RESKEY_user_default="mysql" OCF_RESKEY_group_default="mysql" OCF_RESKEY_log_default="/var/log/mysqld.log" OCF_RESKEY_pid_default="/var/run/mysql/mysqld.pid" OCF_RESKEY_socket_default="/var/lib/mysql/mysql.sock" fi OCF_RESKEY_client_binary_default="mysql" OCF_RESKEY_test_user_default="root" OCF_RESKEY_test_table_default="mysql.user" OCF_RESKEY_test_passwd_default="" OCF_RESKEY_enable_creation_default=0 OCF_RESKEY_additional_parameters_default="" OCF_RESKEY_replication_port_default="3306" OCF_RESKEY_max_slave_lag_default="3600" OCF_RESKEY_evict_outdated_slaves_default="false" OCF_RESKEY_reader_attribute_default="readable" : ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} MYSQL_BINDIR=`dirname ${OCF_RESKEY_binary}` : ${OCF_RESKEY_client_binary=${OCF_RESKEY_client_binary_default}} : ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} : ${OCF_RESKEY_datadir=${OCF_RESKEY_datadir_default}} : ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} : ${OCF_RESKEY_group=${OCF_RESKEY_group_default}} : ${OCF_RESKEY_log=${OCF_RESKEY_log_default}} : ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} : ${OCF_RESKEY_socket=${OCF_RESKEY_socket_default}} : ${OCF_RESKEY_test_user=${OCF_RESKEY_test_user_default}} : ${OCF_RESKEY_test_table=${OCF_RESKEY_test_table_default}} : ${OCF_RESKEY_test_passwd=${OCF_RESKEY_test_passwd_default}} : ${OCF_RESKEY_enable_creation=${OCF_RESKEY_enable_creation_default}} : ${OCF_RESKEY_additional_parameters=${OCF_RESKEY_additional_parameters_default}} : ${OCF_RESKEY_replication_user=${OCF_RESKEY_replication_user_default}} : ${OCF_RESKEY_replication_passwd=${OCF_RESKEY_replication_passwd_default}} : ${OCF_RESKEY_replication_port=${OCF_RESKEY_replication_port_default}} : ${OCF_RESKEY_max_slave_lag=${OCF_RESKEY_max_slave_lag_default}} : ${OCF_RESKEY_evict_outdated_slaves=${OCF_RESKEY_evict_outdated_slaves_default}} : ${OCF_RESKEY_reader_attribute=${OCF_RESKEY_reader_attribute_default}} ####################################################################### # Convenience variables MYSQL=$OCF_RESKEY_client_binary MYSQL_OPTIONS_LOCAL="-S $OCF_RESKEY_socket --connect_timeout=10" MYSQL_OPTIONS_REPL="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_replication_user --password=$OCF_RESKEY_replication_passwd" MYSQL_OPTIONS_TEST="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_test_user --password=$OCF_RESKEY_test_passwd" MYSQL_TOO_MANY_CONN_ERR=1040 CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot " -HOSTNAME=`uname -n` -CRM_ATTR="${HA_SBIN_DIR}/crm_attribute -N $HOSTNAME " +NODENAME=$(ocf_local_nodename) +CRM_ATTR="${HA_SBIN_DIR}/crm_attribute -N $NODENAME " INSTANCE_ATTR_NAME=`echo ${OCF_RESOURCE_INSTANCE}| awk -F : '{print $1}'` CRM_ATTR_REPL_INFO="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_INFO -s mysql_replication" ####################################################################### usage() { cat < 1.0 Resource script for MySQL. May manage a standalone MySQL database, a clone set with externally managed replication, or a complete master/slave replication setup. While managing replication, the default behavior is to use uname -n values in the change master to command. Other IPs can be specified manually by adding a node attribute \${INSTANCE_ATTR_NAME}_mysql_master_IP giving the IP to use for replication. For example, if the mysql primitive you are using is p_mysql, the attribute to set will be p_mysql_mysql_master_IP. Manages a MySQL database instance Location of the MySQL server binary MySQL server binary Location of the MySQL client binary MySQL client binary Configuration file MySQL config Directory containing databases MySQL datadir User running MySQL daemon MySQL user Group running MySQL daemon (for logfile and directory permissions) MySQL group The logfile to be used for mysqld. MySQL log file The pidfile to be used for mysqld. MySQL pid file The socket to be used for mysqld. MySQL socket Table to be tested in monitor statement (in database.table notation) MySQL test table MySQL test user, must have select privilege on test_table MySQL test user MySQL test user password MySQL test user password If the MySQL database does not exist, it will be created Create the database if it does not exist Additional parameters which are passed to the mysqld on startup. (e.g. --skip-external-locking or --skip-grant-tables) Additional parameters to pass to mysqld MySQL replication user. This user is used for starting and stopping MySQL replication, for setting and resetting the master host, and for setting and unsetting read-only mode. Because of that, this user must have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, and PROCESS privileges on all nodes within the cluster. Mandatory if you define a master-slave resource. MySQL replication user MySQL replication password. Used for replication client and slave. Mandatory if you define a master-slave resource. MySQL replication user password The port on which the Master MySQL instance is listening. MySQL replication port The maximum number of seconds a replication slave is allowed to lag behind its master. Do not set this to zero. What the cluster manager does in case a slave exceeds this maximum lag is determined by the evict_outdated_slaves parameter. Maximum time (seconds) a MySQL slave is allowed to lag behind a master If set to true, any slave which is more than max_slave_lag seconds behind the master has its MySQL instance shut down. If this parameter is set to false in a primitive or clone resource, it is simply ignored. If set to false in a master/slave resource, then exceeding the maximum slave lag will merely push down the master preference so the lagging slave is never promoted to the new master. Determines whether to shut down badly lagging slaves An attribute that the RA can manage to specify whether a node can be read from. This node attribute will be 1 if it's fine to read from the node, and 0 otherwise (for example, when a slave has lagged too far behind the master). A typical example for the use of this attribute would be to tie a set of IP addresses to MySQL slaves that can be read from. This parameter is only meaningful in master/slave set configurations. Sets the node attribute that determines whether a node is usable for clients to read from. END } # Convenience functions set_read_only() { # Sets or unsets read-only mode. Accepts one boolean as its # optional argument. If invoked without any arguments, defaults to # enabling read only mode. Should only be set in master/slave # setups. # Returns $OCF_SUCCESS if the operation succeeds, or # $OCF_ERR_GENERIC if it fails. local ro_val if ocf_is_true $1; then ro_val="on" else ro_val="off" fi ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "SET GLOBAL read_only=${ro_val}" } get_read_only() { # Check if read-only is set local read_only_state read_only_state=`$MYSQL $MYSQL_OPTIONS_REPL \ -e "SHOW VARIABLES" | grep read_only | awk '{print $2}'` if [ "$read_only_state" = "ON" ]; then return 0 else return 1 fi } is_slave() { # Determine whether the machine is currently running as a MySQL # slave, as determined per SHOW SLAVE STATUS. Returns 1 if SHOW # SLAVE STATUS creates an empty result set, 0 otherwise. local rc local tmpfile # Check whether this machine should be slave if ! ocf_is_ms || ! get_read_only; then return 1 fi get_slave_info rc=$? if [ $rc -eq 0 ]; then # show slave status is not empty # Is there a master_log_file defined? (master_log_file is deleted # by reset slave if [ "$master_log_file" ]; then return 0 else return 1 fi else # "SHOW SLAVE STATUS" returns an empty set if instance is not a # replication slave return 1 fi } parse_slave_info() { # Extracts field $1 from result of "SHOW SLAVE STATUS\G" from file $2 sed -ne "s/^.* $1: \(.*\)$/\1/p" < $2 } get_slave_info() { # Warning: this sets $tmpfile and LEAVE this file! You must delete it after use! local mysql_options if [ "$master_log_file" -a "$master_host" ]; then # variables are already defined, get_slave_info has been run before return $OCF_SUCCESS else tmpfile=`mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX` $MYSQL $MYSQL_OPTIONS_REPL \ -e 'SHOW SLAVE STATUS\G' > $tmpfile if [ -s $tmpfile ]; then master_host=`parse_slave_info Master_Host $tmpfile` master_user=`parse_slave_info Master_User $tmpfile` master_port=`parse_slave_info Master_Port $tmpfile` master_log_file=`parse_slave_info Master_Log_File $tmpfile` master_log_pos=`parse_slave_info Read_Master_Log_Pos $tmpfile` slave_sql=`parse_slave_info Slave_SQL_Running $tmpfile` slave_io=`parse_slave_info Slave_IO_Running $tmpfile` last_errno=`parse_slave_info Last_Errno $tmpfile` secs_behind=`parse_slave_info Seconds_Behind_Master $tmpfile` ocf_log debug "MySQL instance running as a replication slave" else # Instance produced an empty "SHOW SLAVE STATUS" output -- # instance is not a slave ocf_log err "check_slave invoked on an instance that is not a replication slave." return $OCF_ERR_GENERIC fi return $OCF_SUCCESS fi } check_slave() { # Checks slave status local rc new_master get_slave_info rc=$? if [ $rc -eq 0 ]; then # Did we receive an error other than max_connections? if [ $last_errno -ne 0 -a $last_errno -ne "$MYSQL_TOO_MANY_CONN_ERR" ]; then # Whoa. Replication ran into an error. This slave has # diverged from its master. Make sure this resource # doesn't restart in place. ocf_log err "MySQL instance configured for replication, but replication has failed." ocf_log err "See $tmpfile for details" # Just pull the reader VIP away, killing MySQL here would be pretty evil # on a loaded server set_reader_attr 0 exit $OCF_SUCCESS fi # If we got max_connections, let's remove the vip if [ $last_errno -eq "$MYSQL_TOO_MANY_CONN_ERR" ]; then set_reader_attr 0 exit $OCF_SUCCESS fi if [ "$slave_io" != 'Yes' ]; then # Not necessarily a bad thing. The master may have # temporarily shut down, and the slave may just be # reconnecting. A warning can't hurt, though. ocf_log warn "MySQL Slave IO threads currently not running." # Sanity check, are we at least on the right master new_master=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f1` if [ "$master_host" != "$new_master" ]; then # Not pointing to the right master, not good, removing the VIPs set_reader_attr 0 exit $OCF_SUCCESS fi fi if [ "$slave_sql" != 'Yes' ]; then # We don't have a replication SQL thread running. Not a # good thing. Try to recoved by restarting the SQL thread # and remove reader vip. Prevent MySQL restart. ocf_log err "MySQL Slave SQL threads currently not running." ocf_log err "See $tmpfile for details" # Remove reader vip set_reader_attr 0 # try to restart slave ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "START SLAVE" # Return success to prevent a restart exit $OCF_SUCCESS fi if ocf_is_true $OCF_RESKEY_evict_outdated_slaves; then # We're supposed to bail out if we lag too far # behind. Let's check our lag. if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then ocf_log err "MySQL Slave is $secs_behind seconds behind master (allowed maximum: $OCF_RESKEY_max_slave_lag)." ocf_log err "See $tmpfile for details" # Remove reader vip set_reader_attr 0 exit $OCF_ERR_INSTALLED fi elif ocf_is_ms; then # Even if we're not set to evict lagging slaves, we can # still use the seconds behind master value to set our # master preference. local master_pref master_pref=$((${OCF_RESKEY_max_slave_lag}-${secs_behind})) if [ $master_pref -lt 0 ]; then # Sanitize a below-zero preference to just zero master_pref=0 fi $CRM_MASTER -v $master_pref fi # is the slave ok to have a VIP on it if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then set_reader_attr 0 else set_reader_attr 1 fi ocf_log debug "MySQL instance running as a replication slave" rm -f $tmpfile else # Instance produced an empty "SHOW SLAVE STATUS" output -- # instance is not a slave # TODO: Needs to handle when get_slave_info will return too many connections error rm -f $tmpfile ocf_log err "check_slave invoked on an instance that is not a replication slave." exit $OCF_ERR_GENERIC fi } set_master() { local new_master master_log_file master_log_pos local master_params new_master=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f1` # Keep replication position get_slave_info if [ "$master_log_file" -a "$new_master" = "$master_host" ]; then # master_params=", MASTER_LOG_FILE='$master_log_file', \ # MASTER_LOG_POS=$master_log_pos" ocf_log info "Kept master pos for $master_host : $master_log_file:$master_log_pos" rm -f $tmpfile return else master_log_file=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f2` master_log_pos=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f3` if [ -n "$master_log_file" -a -n "$master_log_pos" ]; then master_params=", MASTER_LOG_FILE='$master_log_file', \ MASTER_LOG_POS=$master_log_pos" ocf_log info "Restored master pos for $new_master : $master_log_file:$master_log_pos" fi fi # Informs the MySQL server of the master to replicate # from. Accepts one mandatory argument which must contain the host # name of the new master host. The master must either be unchanged # from the laste master the slave replicated from, or freshly # reset with RESET MASTER. ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "CHANGE MASTER TO MASTER_HOST='$new_master', \ MASTER_USER='$OCF_RESKEY_replication_user', \ MASTER_PASSWORD='$OCF_RESKEY_replication_passwd' $master_params" rm -f $tmpfile } unset_master(){ # Instructs the MySQL server to stop replicating from a master # host. # If we're currently not configured to be replicating from any # host, then there's nothing to do. But we do log a warning as # no-one but the CRM should be touching the MySQL master/slave # configuration. if ! is_slave; then ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave" return $OCF_SUCCESS fi local tmpfile tmpfile=`mktemp ${HA_RSCTMP}/unset_master.${OCF_RESOURCE_INSTANCE}.XXXXXX` # At this point, the master is read only so there should not be much binlogs to transfer # Let's wait for the last bits while true; do $MYSQL $MYSQL_OPTIONS_REPL \ -e 'SHOW PROCESSLIST\G' > $tmpfile if grep -i 'Waiting for master to send event' $tmpfile >/dev/null; then ocf_log info "MySQL slave has finished reading master binary log" break fi if grep -i 'Reconnecting after a failed master event read' $tmpfile >/dev/null; then ocf_log info "Master is down, no more binary logs to come" break fi if grep -i 'Connecting to master' $tmpfile >/dev/null; then ocf_log info "Master is down, no more binary logs to come" break fi if ! grep 'system user' $tmpfile >/dev/null; then ocf_log info "Slave is not running - not waiting to finish" break fi sleep 1 done # Now, stop the slave I/O thread and wait for relay log # processing to complete ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE IO_THREAD" if [ $? -gt 0 ]; then ocf_log err "Error stopping slave IO thread" exit $OCF_ERR_GENERIC fi while true; do $MYSQL $MYSQL_OPTIONS_REPL \ -e 'SHOW PROCESSLIST\G' > $tmpfile if grep -i 'Has read all relay log' $tmpfile >/dev/null; then ocf_log info "MySQL slave has finished processing relay log" break fi if ! grep -q 'system user' $tmpfile; then ocf_log info "Slave not runnig - not waiting to finish" break fi ocf_log info "Waiting for MySQL slave to finish processing relay log" sleep 1 done rm -f $tmpfile # Now, stop all slave activity and unset the master host ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE" if [ $? -gt 0 ]; then ocf_log err "Error stopping rest slave threads" exit $OCF_ERR_GENERIC fi ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "RESET SLAVE /*!50516 ALL */;" if [ $? -gt 0 ]; then ocf_log err "Failed to reset slave" exit $OCF_ERR_GENERIC fi } # Start replication as slave start_slave() { ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "START SLAVE" } # Set the attribute controlling the readers VIP set_reader_attr() { local curr_attr_value curr_attr_value=$(get_reader_attr) if [ "$curr_attr_value" -ne "$1" ]; then $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $1 fi } # get the attribute controlling the readers VIP get_reader_attr() { local attr_value local rc attr_value=`$CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q` rc=$? if [ "$rc" -eq "0" ]; then echo $attr_value else echo -1 fi } # Stores data for MASTER STATUS from MySQL update_data_master_status() { master_status_file="${HA_RSCTMP}/master_status.${OCF_RESOURCE_INSTANCE}" $MYSQL $MYSQL_OPTIONS_REPL -e "SHOW MASTER STATUS\G" > $master_status_file } # Returns the specified value from the stored copy of SHOW MASTER STATUS. # should be call after update_data_master_status for tmpfile # Arguments: # $1 The value to get. get_master_status() { awk -v var="$1" '$1 == var ":" {print substr($0, index($0, ":") + 2)}' "$master_status_file" } # Determines what IP address is attached to the current host. The output of the # crm_attribute command looks like this: # scope=nodes name=IP value=10.2.2.161 # If the ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP node attribute is not defined, fallback is to uname -n # The ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP is the IP address that will be used for the # change master to command. get_local_ip() { local IP IP=`$CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G` if [ ! $? -eq 0 ]; then uname -n else echo $IP fi } ####################################################################### # Functions invoked by resource manager actions mysql_validate() { check_binary $OCF_RESKEY_binary check_binary $OCF_RESKEY_client_binary if [ ! -f $OCF_RESKEY_config ]; then ocf_log err "Config $OCF_RESKEY_config doesn't exist"; return $OCF_ERR_INSTALLED; fi if [ ! -d $OCF_RESKEY_datadir ]; then ocf_log err "Datadir $OCF_RESKEY_datadir doesn't exist"; return $OCF_ERR_INSTALLED; fi getent passwd $OCF_RESKEY_user >/dev/null 2>&1 if [ ! $? -eq 0 ]; then ocf_log err "User $OCF_RESKEY_user doesn't exit"; return $OCF_ERR_INSTALLED; fi getent group $OCF_RESKEY_group >/dev/null 2>&1 if [ ! $? -eq 0 ]; then ocf_log err "Group $OCF_RESKEY_group doesn't exist"; return $OCF_ERR_INSTALLED; fi true } mysql_status() { if [ ! -e $OCF_RESKEY_pid ]; then ocf_log $1 "MySQL is not running" return $OCF_NOT_RUNNING; fi pid=`cat $OCF_RESKEY_pid`; if [ -d /proc -a -d /proc/1 ]; then [ "u$pid" != "u" -a -d /proc/$pid ] else kill -s 0 $pid >/dev/null 2>&1 fi if [ $? -eq 0 ]; then return $OCF_SUCCESS; else ocf_log $1 "MySQL not running: removing old PID file" rm -f $OCF_RESKEY_pid return $OCF_NOT_RUNNING; fi } mysql_monitor() { local rc local status_loglevel="err" # Set loglevel to info during probe if ocf_is_probe; then status_loglevel="info" fi mysql_status $status_loglevel rc=$? # TODO: check max connections error # If status returned an error, return that immediately if [ $rc -ne $OCF_SUCCESS ]; then return $rc fi if [ $OCF_CHECK_LEVEL -gt 0 -a -n "$OCF_RESKEY_test_table" ]; then # Check if this instance is configured as a slave, and if so # check slave status if is_slave; then check_slave fi # Check for test table ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \ -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table" rc=$? if [ $rc -ne 0 ]; then ocf_log err "Failed to select from $test_table"; return $OCF_ERR_GENERIC; fi fi if ocf_is_ms && ! get_read_only; then ocf_log debug "MySQL monitor succeeded (master)"; return $OCF_RUNNING_MASTER else ocf_log debug "MySQL monitor succeeded"; return $OCF_SUCCESS fi } mysql_start() { local rc pid if ocf_is_ms; then # Initialize the ReaderVIP attribute, monitor will enable it set_reader_attr 0 fi mysql_status info if [ $? = $OCF_SUCCESS ]; then ocf_log info "MySQL already running" return $OCF_SUCCESS fi touch $OCF_RESKEY_log chown $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_log chmod 0640 $OCF_RESKEY_log [ -x /sbin/restorecon ] && /sbin/restorecon $OCF_RESKEY_log if ocf_is_true "$OCF_RESKEY_enable_creation" && [ ! -d $OCF_RESKEY_datadir/mysql ] ; then ocf_log info "Initializing MySQL database: " $MYSQL_BINDIR/mysql_install_db --datadir=$OCF_RESKEY_datadir rc=$? if [ $rc -ne 0 ] ; then ocf_log err "Initialization failed: $rc"; exit $OCF_ERR_GENERIC fi chown -R $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_datadir fi pid_dir=`dirname $OCF_RESKEY_pid` if [ ! -d $pid_dir ] ; then ocf_log info "Creating PID dir: $pid_dir" mkdir -p $pid_dir chown $OCF_RESKEY_user:$OCF_RESKEY_group $pid_dir fi socket_dir=`dirname $OCF_RESKEY_socket` if [ ! -d $socket_dir ] ; then ocf_log info "Creating socket dir: $socket_dir" mkdir -p $socket_dir chown $OCF_RESKEY_user:$OCF_RESKEY_group $socket_dir fi # Regardless of whether we just created the directory or it # already existed, check whether it is writable by the configured # user for dir in $pid_dir $socket_dir; do if ! su -s /bin/sh - $OCF_RESKEY_user -c "test -w $dir"; then ocf_log err "Directory $dir is not writable by $OCF_RESKEY_user" exit $OCF_ERR_PERM; fi done # Uncomment to perform permission clensing # - not convinced this should be enabled by default # #chmod 0755 $OCF_RESKEY_datadir #chown -R $OCF_RESKEY_user $OCF_RESKEY_datadir #chgrp -R $OCF_RESKEY_group $OCF_RESKEY_datadir mysql_extra_params= if ocf_is_ms; then mysql_extra_params="--skip-slave-start" fi ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \ --pid-file=$OCF_RESKEY_pid \ --socket=$OCF_RESKEY_socket \ --datadir=$OCF_RESKEY_datadir \ --log-error=$OCF_RESKEY_log \ --user=$OCF_RESKEY_user $OCF_RESKEY_additional_parameters \ $mysql_extra_params >/dev/null 2>&1 & pid=$! # Spin waiting for the server to come up. # Let the CRM/LRM time us out if required. start_wait=1 while [ $start_wait = 1 ]; do if ! ps $pid > /dev/null 2>&1; then wait $pid ocf_log err "MySQL server failed to start (rc=$?), please check your installation" return $OCF_ERR_GENERIC fi mysql_status info rc=$? if [ $rc = $OCF_SUCCESS ]; then start_wait=0 elif [ $rc != $OCF_NOT_RUNNING ]; then ocf_log info "MySQL start failed: $rc" return $rc fi sleep 2 done if ocf_is_ms; then # We're configured as a stateful resource. We must start as # slave by default. At this point we don't know if the CRM has # already promoted a master. So, we simply start in read only # mode. set_read_only on # Now, let's see whether there is a master. We might be a new # node that is just joining the cluster, and the CRM may have # promoted a master before. master_host=`echo $OCF_RESKEY_CRM_meta_notify_master_uname|tr -d " "` - if [ "$master_host" -a "$master_host" != ${HOSTNAME} ]; then + if [ "$master_host" -a "$master_host" != ${NODENAME} ]; then ocf_log info "Changing MySQL configuration to replicate from $master_host." set_master start_slave if [ $? -ne 0 ]; then ocf_log err "Failed to start slave" return $OCF_ERR_GENERIC fi else ocf_log info "No MySQL master present - clearing replication state" unset_master fi # We also need to set a master preference, otherwise Pacemaker # won't ever promote us in the absence of any explicit # preference set by the administrator. We choose a low # greater-than-zero preference. $CRM_MASTER -v 1 fi # Initial monitor action if [ -n "$OCF_RESKEY_test_table" -a -n "$OCF_RESKEY_test_user" -a -n "$OCF_RESKEY_test_passwd" ]; then OCF_CHECK_LEVEL=10 fi mysql_monitor rc=$? if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then ocf_log err "Failed initial monitor action" return $rc fi ocf_log info "MySQL started" return $OCF_SUCCESS } mysql_stop() { if ocf_is_ms; then # clear preference for becoming master $CRM_MASTER -D # Remove VIP capability set_reader_attr 0 fi if [ ! -f $OCF_RESKEY_pid ]; then ocf_log info "MySQL is not running" return $OCF_SUCCESS fi pid=`cat $OCF_RESKEY_pid 2> /dev/null ` /bin/kill $pid > /dev/null rc=$? if [ $rc != 0 ]; then ocf_log err "MySQL couldn't be stopped" return $OCF_ERR_GENERIC fi # stop waiting shutdown_timeout=15 if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5)) fi count=0 while [ $count -lt $shutdown_timeout ] do mysql_status info rc=$? if [ $rc = $OCF_NOT_RUNNING ]; then break fi count=`expr $count + 1` sleep 1 ocf_log debug "MySQL still hasn't stopped yet. Waiting..." done mysql_status info if [ $? != $OCF_NOT_RUNNING ]; then ocf_log info "MySQL failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..." /bin/kill -KILL $pid > /dev/null fi ocf_log info "MySQL stopped"; rm -f /var/lock/subsys/mysqld rm -f $OCF_RESKEY_socket return $OCF_SUCCESS } mysql_promote() { local master_info if ( ! mysql_status err ); then return $OCF_NOT_RUNNING fi ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE" # Set Master Info in CIB, cluster level attribute update_data_master_status master_info="$(get_local_ip)|$(get_master_status File)|$(get_master_status Position)" ${CRM_ATTR_REPL_INFO} -v "$master_info" rm -f $tmpfile set_read_only off || return $OCF_ERR_GENERIC # Existing master gets a higher-than-default master preference, so # the cluster manager does not shuffle the master role around # unnecessarily $CRM_MASTER -v $((${OCF_RESKEY_max_slave_lag}+1)) # A master can accept reads set_reader_attr 1 return $OCF_SUCCESS } mysql_demote() { if ! mysql_status err; then return $OCF_NOT_RUNNING fi # Return master preference to default, so the cluster manager gets # a chance to select a new master $CRM_MASTER -v 1 } mysql_notify() { # If not configured as a Stateful resource, we make no sense of # notifications. if ! ocf_is_ms; then ocf_log info "This agent makes no use of notifications unless running in master/slave mode." return $OCF_SUCCESS fi local type_op type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" ocf_log debug "Received $type_op notification." case "$type_op" in 'pre-promote') # Nothing to do now here, new replication info not yet published ;; 'post-promote') # The master has completed its promotion. Now is a good # time to check whether our replication slave is working # correctly. master_host=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " "` - if [ "$master_host" = ${HOSTNAME} ]; then + if [ "$master_host" = ${NODENAME} ]; then ocf_log info "This will be the new master, ignoring post-promote notification." else ocf_log info "Resetting replication" unset_master if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi ocf_log info "Changing MySQL configuration to replicate from $master_host" set_master if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi start_slave if [ $? -ne 0 ]; then ocf_log err "Failed to start slave" return $OCF_ERR_GENERIC fi fi return $OCF_SUCCESS ;; 'pre-demote') demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "` - if [ $demote_host = ${HOSTNAME} ]; then + if [ $demote_host = ${NODENAME} ]; then ocf_log info "post-demote notification for $demote_host" set_read_only on if [ $? -ne 0 ]; then ocf_log err "Failed to set read-only"; return $OCF_ERR_GENERIC; fi # Must kill all existing user threads because they are still Read/write # in order for the slaves to complete the read of binlogs local tmpfile tmpfile=`mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX` $MYSQL $MYSQL_OPTIONS_REPL \ -e "SHOW PROCESSLIST" > $tmpfile for thread in `awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}' $tmpfile` do ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "KILL ${thread}" done else ocf_log info "Ignoring post-demote notification execpt for my own demotion." fi return $OCF_SUCCESS ;; 'post-demote') demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "` - if [ $demote_host = ${HOSTNAME} ]; then + if [ $demote_host = ${NODENAME} ]; then ocf_log info "Ignoring post-demote notification for my own demotion." return $OCF_SUCCESS fi ocf_log info "post-demote notification for $demote_host." # The former master has just been gracefully demoted. unset_master ;; *) return $OCF_SUCCESS ;; esac } ####################################################################### ########################################################################## # If DEBUG_LOG is set, make this resource agent easy to debug: set up the # debug log and direct all output to it. Otherwise, redirect to /dev/null. # The log directory must be a directory owned by root, with permissions 0700, # and the log must be writable and not a symlink. ########################################################################## DEBUG_LOG="/tmp/mysql.ocf.ra.debug/log" if [ "${DEBUG_LOG}" -a -w "${DEBUG_LOG}" -a ! -L "${DEBUG_LOG}" ]; then DEBUG_LOG_DIR="${DEBUG_LOG%/*}" if [ -d "${DEBUG_LOG_DIR}" ]; then exec 9>>"$DEBUG_LOG" exec 2>&9 date >&9 echo "$*" >&9 env | grep OCF_ | sort >&9 set -x else exec 9>/dev/null fi fi case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage|help) usage exit $OCF_SUCCESS;; esac mysql_validate rc=$? LSB_STATUS_STOPPED=3 if [ $rc -ne 0 ]; then case "$1" in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; *) exit $rc;; esac fi # What kind of method was invoked? case "$1" in start) mysql_start;; stop) mysql_stop;; status) mysql_status err;; monitor) mysql_monitor;; promote) mysql_promote;; demote) mysql_demote;; notify) mysql_notify;; validate-all) exit $OCF_SUCCESS;; *) usage exit $OCF_ERR_UNIMPLEMENTED;; esac # vi:sw=4:ts=4:et: diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in index 35b0a5a87..1406c80f1 100644 --- a/heartbeat/ocf-shellfuncs.in +++ b/heartbeat/ocf-shellfuncs.in @@ -1,761 +1,780 @@ # # # Common helper functions for the OCF Resource Agents supplied by # heartbeat. # # Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Brée # All Rights Reserved. # # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Build version: $Format:%H$ # TODO: Some of this should probably split out into a generic OCF # library for shell scripts, but for the time being, we'll just use it # ourselves... # # TODO wish-list: # - Generic function for evaluating version numbers # - Generic function(s) to extract stuff from our own meta-data # - Logging function which automatically adds resource identifier etc # prefixes # TODO: Move more common functionality for OCF RAs here. # # This was common throughout all legacy Heartbeat agents unset LC_ALL; export LC_ALL unset LANGUAGE; export LANGUAGE __SCRIPT_NAME=`basename $0` if [ -z "$OCF_ROOT" ]; then : ${OCF_ROOT=@OCF_ROOT_DIR@} fi if [ "$OCF_FUNCTIONS_DIR" = ${OCF_ROOT}/resource.d/heartbeat ]; then # old unset OCF_FUNCTIONS_DIR fi : ${OCF_FUNCTIONS_DIR:=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-binaries . ${OCF_FUNCTIONS_DIR}/ocf-returncodes . ${OCF_FUNCTIONS_DIR}/ocf-directories . ${OCF_FUNCTIONS_DIR}/ocf-rarun # Define OCF_RESKEY_CRM_meta_interval in case it isn't already set, # to make sure that ocf_is_probe() always works : ${OCF_RESKEY_CRM_meta_interval=0} ocf_is_root() { if [ X`id -u` = X0 ]; then true else false fi } ocf_maybe_random() { local rnd="$RANDOM" # Something sane-ish in case a shell doesn't support $RANDOM [ -n "$rnd" ] || rnd=$$ echo $rnd } # Portability comments: # o The following rely on Bourne "sh" pattern-matching, which is usually # that for filename generation (note: not regexp). # o The "*) true ;;" clause is probably unnecessary, but is included # here for completeness. # o The negation in the pattern uses "!". This seems to be common # across many OSes (whereas the alternative "^" fails on some). # o If an OS is encountered where this negation fails, then a possible # alternative would be to replace the function contents by (e.g.): # [ -z "`echo $1 | tr -d '[0-9]'`" ] # ocf_is_decimal() { case "$1" in ""|*[!0-9]*) # empty, or at least one non-decimal false ;; *) true ;; esac } ocf_is_true() { case "$1" in yes|true|1|YES|TRUE|ja|on|ON) true ;; *) false ;; esac } ocf_is_hex() { case "$1" in ""|*[!0-9a-fA-F]*) # empty, or at least one non-hex false ;; *) true ;; esac } ocf_is_octal() { case "$1" in ""|*[!0-7]*) # empty, or at least one non-octal false ;; *) true ;; esac } __ocf_set_defaults() { __OCF_ACTION="$1" # Return to sanity for the agents... unset LANG LC_ALL=C export LC_ALL # TODO: Review whether we really should source this. Or rewrite # to match some emerging helper function syntax...? This imports # things which no OCF RA should be using... # Strip the OCF_RESKEY_ prefix from this particular parameter if [ -z "$OCF_RESKEY_OCF_CHECK_LEVEL" ]; then : ${OCF_CHECK_LEVEL:=0} else : ${OCF_CHECK_LEVEL:=$OCF_RESKEY_OCF_CHECK_LEVEL} fi if [ ! -d "$OCF_ROOT" ]; then ha_log "ERROR: OCF_ROOT points to non-directory $OCF_ROOT." exit $OCF_ERR_GENERIC fi if [ -z "$OCF_RESOURCE_TYPE" ]; then : ${OCF_RESOURCE_TYPE:=$__SCRIPT_NAME} fi if [ -z "$OCF_RA_VERSION_MAJOR" ]; then : We are being invoked as an init script. : Fill in some things with reasonable values. : ${OCF_RESOURCE_INSTANCE:="default"} return 0 fi if [ "x$__OCF_ACTION" = "xmeta-data" ]; then OCF_RESOURCE_INSTANCE="undef" fi if [ -z "$OCF_RESOURCE_INSTANCE" ]; then ha_log "ERROR: Need to tell us our resource instance name." exit $OCF_ERR_ARGS fi } hadate() { date "+${HA_DATEFMT}" } set_logtag() { if [ -z "$HA_LOGTAG" ]; then if [ -n "$OCF_RESOURCE_INSTANCE" ]; then HA_LOGTAG="$__SCRIPT_NAME($OCF_RESOURCE_INSTANCE)[$$]" else HA_LOGTAG="$__SCRIPT_NAME[$$]" fi fi } ha_log() { local loglevel [ none = "$HA_LOGFACILITY" ] && HA_LOGFACILITY="" # if we're connected to a tty, then output to stderr if tty >/dev/null; then if [ "x$HA_debug" = "x0" -a "x$loglevel" = xdebug ] ; then return 0 fi if [ "$HA_LOGTAG" ]; then echo "$HA_LOGTAG: $*" else echo "$*" fi >&2 return 0 fi set_logtag if [ "x${HA_LOGD}" = "xyes" ] ; then ha_logger -t "${HA_LOGTAG}" "$@" if [ "$?" -eq "0" ] ; then return 0 fi fi if [ -n "$HA_LOGFACILITY" ] then : logging through syslog # loglevel is unknown, use 'notice' for now loglevel=notice case "${*}" in *ERROR*) loglevel=err;; *WARN*) loglevel=warning;; *INFO*|info) loglevel=info;; esac logger -t "$HA_LOGTAG" -p ${HA_LOGFACILITY}.${loglevel} "${*}" fi if [ -n "$HA_LOGFILE" ] then : appending to $HA_LOGFILE echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_LOGFILE fi if [ -z "$HA_LOGFACILITY" -a -z "$HA_LOGFILE" ] then : appending to stderr echo `hadate`"${*}" >&2 fi if [ -n "$HA_DEBUGLOG" ] then : appending to $HA_DEBUGLOG if [ "$HA_LOGFILE"x != "$HA_DEBUGLOG"x ]; then echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_DEBUGLOG fi fi } ha_debug() { if [ "x${HA_debug}" = "x0" ] ; then return 0 fi if tty >/dev/null; then if [ "$HA_LOGTAG" ]; then echo "$HA_LOGTAG: $*" else echo "$*" fi >&2 return 0 fi set_logtag if [ "x${HA_LOGD}" = "xyes" ] ; then ha_logger -t "${HA_LOGTAG}" -D "ha-debug" "$@" if [ "$?" -eq "0" ] ; then return 0 fi fi [ none = "$HA_LOGFACILITY" ] && HA_LOGFACILITY="" if [ -n "$HA_LOGFACILITY" ] then : logging through syslog logger -t "$HA_LOGTAG" -p "${HA_LOGFACILITY}.debug" "${*}" fi if [ -n "$HA_DEBUGLOG" ] then : appending to $HA_DEBUGLOG echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_DEBUGLOG fi if [ -z "$HA_LOGFACILITY" -a -z "$HA_DEBUGLOG" ] then : appending to stderr echo "$HA_LOGTAG: `hadate`${*}: ${HA_LOGFACILITY}" >&2 fi } ha_parameter() { local VALUE VALUE=`sed -e 's%[ ][ ]*% %' -e 's%^ %%' -e 's%#.*%%' $HA_CF | grep -i "^$1 " | sed 's%[^ ]* %%'` if [ "X$VALUE" = X ] then case $1 in keepalive) VALUE=2;; deadtime) ka=`ha_parameter keepalive` VALUE=`expr $ka '*' 2 '+' 1`;; esac fi echo $VALUE } ocf_log() { # TODO: Revisit and implement internally. if [ $# -lt 2 ] then ocf_log err "Not enough arguments [$#] to ocf_log." fi __OCF_PRIO="$1" shift __OCF_MSG="$*" case "${__OCF_PRIO}" in crit) __OCF_PRIO="CRIT";; err) __OCF_PRIO="ERROR";; warn) __OCF_PRIO="WARNING";; info) __OCF_PRIO="INFO";; debug)__OCF_PRIO="DEBUG";; *) __OCF_PRIO=`echo ${__OCF_PRIO}| tr '[a-z]' '[A-Z]'`;; esac if [ "${__OCF_PRIO}" = "DEBUG" ]; then ha_debug "${__OCF_PRIO}: $__OCF_MSG" else ha_log "${__OCF_PRIO}: $__OCF_MSG" fi } # # ocf_deprecated: Log a deprecation warning # Usage: ocf_deprecated [param-name] # Arguments: param-name optional, name of a boolean resource # parameter that can be used to suppress # the warning (default # "ignore_deprecation") ocf_deprecated() { local param param=${1:-ignore_deprecation} # don't use ${!param} here, it's a bashism if ! ocf_is_true $(eval echo \$OCF_RESKEY_$param); then ocf_log warn "This resource agent is deprecated" \ "and may be removed in a future release." \ "See the man page for details." \ "To suppress this warning, set the \"${param}\"" \ "resource parameter to true." fi } # # Ocf_run: Run a script, and log its output. # Usage: ocf_run [-q] [-info|-warn|-err] # -q: don't log the output of the command if it succeeds # -info|-warn|-err: log the output of the command at given # severity if it fails (defaults to err) # ocf_run() { local rc local output local verbose=1 local loglevel=err local var for var in 1 2 do case "$1" in "-q") verbose="" shift 1;; "-info"|"-warn"|"-err") loglevel=`echo $1 | sed -e s/-//g` shift 1;; *) ;; esac done output=`"$@" 2>&1` rc=$? output=`echo $output` if [ $rc -eq 0 ]; then if [ "$verbose" -a ! -z "$output" ]; then ocf_log info "$output" fi return $OCF_SUCCESS else if [ ! -z "$output" ]; then ocf_log $loglevel "$output" else ocf_log $loglevel "command failed: $*" fi return $rc fi } ocf_pidfile_status() { local pid pidfile=$1 if [ ! -e $pidfile ]; then # Not exists return 2 fi pid=`cat $pidfile` kill -0 $pid 2>&1 > /dev/null if [ $? = 0 ]; then return 0 fi # Stale return 1 } ocf_take_lock() { local lockfile=$1 local rnd=$(ocf_maybe_random) sleep 0.$rnd while ocf_pidfile_status $lockfile do ocf_log info "Sleeping until $lockfile is released..." sleep 0.$rnd done echo $$ > $lockfile } ocf_release_lock_on_exit() { local lockfile=$1 trap "rm -f $lockfile" EXIT } # returns true if the CRM is currently running a probe. A probe is # defined as a monitor operation with a monitoring interval of zero. ocf_is_probe() { [ "$__OCF_ACTION" = "monitor" -a "$OCF_RESKEY_CRM_meta_interval" = 0 ] } # returns true if the resource is configured as a clone. This is # defined as a resource where the clone-max meta attribute is present, # and set to greater than zero. ocf_is_clone() { [ ! -z "${OCF_RESKEY_CRM_meta_clone_max}" ] && [ "${OCF_RESKEY_CRM_meta_clone_max}" -gt 0 ] } # returns true if the resource is configured as a multistate # (master/slave) resource. This is defined as a resource where the # master-max meta attribute is present, and set to greater than zero. ocf_is_ms() { [ ! -z "${OCF_RESKEY_CRM_meta_master_max}" ] && [ "${OCF_RESKEY_CRM_meta_master_max}" -gt 0 ] } # version check functions # allow . and - to delimit version numbers # max version number is 999 # letters and such are effectively ignored # ocf_is_ver() { echo $1 | grep '^[0-9][0-9.-]*[0-9]$' >/dev/null 2>&1 } ocf_ver2num() { echo $1 | awk -F'[.-]' ' {for(i=1; i<=NF; i++) s=s*1000+$i; print s} ' } ocf_ver_level(){ echo $1 | awk -F'[.-]' '{print NF}' } ocf_ver_complete_level(){ local ver="$1" local level="$2" local i=0 while [ $i -lt $level ]; do ver=${ver}.0 i=`expr $i + 1` done echo $ver } # usage: ocf_version_cmp VER1 VER2 # version strings can contain digits, dots, and dashes # must start and end with a digit # returns: # 0: VER1 smaller (older) than VER2 # 1: versions equal # 2: VER1 greater (newer) than VER2 # 3: bad format ocf_version_cmp() { ocf_is_ver "$1" || return 3 ocf_is_ver "$2" || return 3 local v1=$1 local v2=$2 local v1_level=`ocf_ver_level $v1` local v2_level=`ocf_ver_level $v2` local level_diff if [ $v1_level -lt $v2_level ]; then level_diff=`expr $v2_level - $v1_level` v1=`ocf_ver_complete_level $v1 $level_diff` elif [ $v1_level -gt $v2_level ]; then level_diff=`expr $v1_level - $v2_level` v2=`ocf_ver_complete_level $v2 $level_diff` fi v1=`ocf_ver2num $v1` v2=`ocf_ver2num $v2` if [ $v1 -eq $v2 ]; then return 1 elif [ $v1 -lt $v2 ]; then return 0 else return 2 # -1 would look funny in shell ;-) fi } +ocf_local_nodename() { + # use crm_node -n for pacemaker > 1.1.8 + which pacemakerd > /dev/null 2>&1 + if [ $? -eq 0 ]; then + local version=$(pacemakerd -$ | grep "Pacemaker .*" | awk '{ print $2 }') + ocf_version_cmp "$version" "1.1.8" + if [ $? -eq 2 ]; then + which crm_node > /dev/null 2>&1 + if [ $? -eq 0 ]; then + crm_node -n + return + fi + fi + fi + + # otherwise use uname -n + uname -n +} + # usage: dirname DIR dirname() { local a local b [ $# = 1 ] || return 1 a="$1" while [ 1 ]; do b="${a%/}" [ "$a" = "$b" ] && break a="$b" done b=${a%/*} [ -z "$b" -o "$a" = "$b" ] && b="." echo "$b" return 0 } # # pseudo_resource status tracking function... # # This allows pseudo resources to give correct status information. As we add # resource monitoring, and better resource tracking in general, this will # become essential. # # These scripts work because ${HA_RSCTMP} is cleaned out every time # heartbeat is started. # # We create "resource-string" tracking files under ${HA_RSCTMP} in a # very simple way: # # Existence of "${HA_RSCTMP}/resource-string" means that we consider # the resource named by "resource-string" to be running. # # Note that "resource-string" needs to be unique. Using the resource type # plus the resource instance arguments to make up the resource string # is probably sufficient... # # usage: ha_pseudo_resource resource-string op [tracking_file] # where op is {start|stop|monitor|status|restart|reload|print} # print is a special op which just prints the tracking file location # user can override our choice of the tracking file location by # specifying it as the third arg # Note that all operations are silent... # ha_pseudo_resource() { local ha_resource_tracking_file="${3:-${HA_RSCTMP}/$1}" case $2 in start|restart|reload) touch "$ha_resource_tracking_file";; stop) rm -f "$ha_resource_tracking_file";; status|monitor) if [ -f "$ha_resource_tracking_file" ] then return 0 else case $2 in status) return 3;; *) return 7;; esac fi;; print) echo "$ha_resource_tracking_file";; *) return 3;; esac } # usage: rmtempdir TMPDIR rmtempdir() { [ $# = 1 ] || return 1 if [ -e "$1" ]; then rmdir "$1" || return 1 fi return 0 } # usage: maketempfile [-d] maketempfile() { if [ $# = 1 -a "$1" = "-d" ]; then mktemp -d return -0 elif [ $# != 0 ]; then return 1 fi mktemp return 0 } # usage: rmtempfile TMPFILE rmtempfile () { [ $# = 1 ] || return 1 if [ -e "$1" ]; then rm "$1" || return 1 fi return 0 } # echo the first lower supported check level # pass set of levels supported by the agent # (in increasing order, 0 is optional) ocf_check_level() { local lvl prev lvl=0 prev=0 if ocf_is_decimal "$OCF_CHECK_LEVEL"; then # the level list should be very short for lvl; do if [ "$lvl" -eq "$OCF_CHECK_LEVEL" ]; then break elif [ "$lvl" -gt "$OCF_CHECK_LEVEL" ]; then lvl=$prev # the previous one break fi prev=$lvl done fi echo $lvl } # usage: ocf_stop_processes SIGNALS WAIT_TIME PIDS # # we send signals (use quotes for more than one!) in the order # given; if one or more processes are still running we try KILL; # the wait_time is the _total_ time we'll spend in this function # this time may be slightly exceeded if the processes won't leave # # returns: # 0: all processes left # 1: some processes still running # # example: # # ocf_stop_processes TERM 5 $pids # ocf_stop_processes() { local signals="$1" local wait_time="$(($2/`echo $signals|wc -w`))" shift 2 local pids="$*" local sig i test -z "$pids" && return 0 for sig in $signals KILL; do kill -s $sig $pids 2>/dev/null # try to leave early, and yet leave processes time to exit sleep 0.2 for i in `seq $wait_time`; do kill -s 0 $pids 2>/dev/null || return 0 sleep 1 done done return 1 } # # RA tracing may be turned on by setting OCF_TRACE_RA # the trace output will be saved to OCF_TRACE_FILE, if set, or # by default to # $HA_VARLIB/trace_ra//.. # e.g. $HA_VARLIB/trace_ra/oracle/db.start.2012-11-27.08:37:08 # # OCF_TRACE_FILE: # - FD (small integer [3-9]) in that case it is up to the callers # to capture output; the FD _must_ be open for writing # - absolute path # # NB: FD 9 may be used for tracing with bash >= v4 in case # OCF_TRACE_FILE is set to a path. # ocf_is_bash4() { echo "$SHELL" | grep bash > /dev/null && [ ${BASH_VERSINFO[0]} = "4" ] } ocf_trace_redirect_to_file() { local dest=$1 if ocf_is_bash4; then exec 9>$dest BASH_XTRACEFD=9 else exec 2>$dest fi } ocf_trace_redirect_to_fd() { local fd=$1 if ocf_is_bash4; then BASH_XTRACEFD=$fd else exec 2>&$fd fi } __ocf_test_trc_dest() { local dest=$1 if ! touch $dest; then ocf_log warn "$dest not writable, trace not going to happen" __OCF_TRC_DEST="" __OCF_TRC_MANAGE="" return 1 fi return 0 } ocf_default_trace_dest() { tty >/dev/null && return if [ -n "$OCF_RESOURCE_TYPE" -a \ -n "$OCF_RESOURCE_INSTANCE" -a -n "$__OCF_ACTION" ]; then local ts=`date +%F.%T` __OCF_TRC_DEST=$HA_VARLIB/trace_ra/${OCF_RESOURCE_TYPE}/${OCF_RESOURCE_INSTANCE}.${__OCF_ACTION}.$ts __OCF_TRC_MANAGE="1" fi } ocf_start_trace() { export __OCF_TRC_DEST="" __OCF_TRC_MANAGE="" case "$OCF_TRACE_FILE" in [3-9]) ocf_trace_redirect_to_fd "$OCF_TRACE_FILE" ;; /*/*) __OCF_TRC_DEST=$OCF_TRACE_FILE ;; "") ocf_default_trace_dest ;; *) ocf_log warn "OCF_TRACE_FILE must be set to either FD (open for writing) or absolute file path" ocf_default_trace_dest ;; esac if [ "$__OCF_TRC_DEST" ]; then mkdir -p `dirname $__OCF_TRC_DEST` __ocf_test_trc_dest $__OCF_TRC_DEST || return ocf_trace_redirect_to_file "$__OCF_TRC_DEST" fi PS4='+ `date +"%T"`: ${FUNCNAME[0]:+${FUNCNAME[0]}:}${LINENO}: ' set -x } ocf_stop_trace() { set +x } __ocf_set_defaults "$@" : ${OCF_TRACE_RA:=$OCF_RESKEY_trace_ra} ocf_is_true "$OCF_TRACE_RA" && ocf_start_trace diff --git a/heartbeat/pgsql b/heartbeat/pgsql index fe1361653..a462627eb 100755 --- a/heartbeat/pgsql +++ b/heartbeat/pgsql @@ -1,1872 +1,1872 @@ #!/bin/sh # # Description: Manages a PostgreSQL Server as an OCF High-Availability # resource # # Authors: Serge Dubrouski (sergeyfd@gmail.com) -- original RA # Florian Haas (florian@linbit.com) -- makeover # Takatoshi MATSUO (matsuo.tak@gmail.com) -- support replication # David Corlette (dcorlette@netiq.com) -- add support for non-standard library locations and non-standard port # # Copyright: 2006-2012 Serge Dubrouski # and other Linux-HA contributors # License: GNU General Public License (GPL) # ############################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # # Get PostgreSQL Configuration parameter # get_pgsql_param() { local param_name param_name=$1 perl_code="if (/^\s*$param_name[\s=]+\s*(.*)$/) { \$dir=\$1; \$dir =~ s/\s*\#.*//; \$dir =~ s/^'(\S*)'/\$1/; print \$dir;}" perl -ne "$perl_code" < $OCF_RESKEY_config } # Defaults OCF_RESKEY_pgctl_default=/usr/bin/pg_ctl OCF_RESKEY_psql_default=/usr/bin/psql OCF_RESKEY_pgdata_default=/var/lib/pgsql/data OCF_RESKEY_pgdba_default=postgres OCF_RESKEY_pghost_default="" OCF_RESKEY_pgport_default=5432 OCF_RESKEY_pglibs_default=/usr/lib OCF_RESKEY_start_opt_default="" OCF_RESKEY_pgdb_default=template1 OCF_RESKEY_logfile_default=/dev/null OCF_RESKEY_stop_escalate_default=30 OCF_RESKEY_monitor_user_default="" OCF_RESKEY_monitor_password_default="" OCF_RESKEY_monitor_sql_default="select now();" OCF_RESKEY_check_wal_receiver_default="false" # Defaults for replication OCF_RESKEY_rep_mode_default=none OCF_RESKEY_node_list_default="" OCF_RESKEY_restore_command_default="" OCF_RESKEY_archive_cleanup_command_default="" OCF_RESKEY_recovery_end_command_default="" OCF_RESKEY_master_ip_default="" OCF_RESKEY_repuser_default="postgres" OCF_RESKEY_primary_conninfo_opt_default="" OCF_RESKEY_restart_on_promote_default="false" OCF_RESKEY_tmpdir_default="/var/lib/pgsql/tmp" OCF_RESKEY_xlog_check_count_default="3" OCF_RESKEY_crm_attr_timeout_default="5" OCF_RESKEY_stop_escalate_in_slave_default=30 : ${OCF_RESKEY_pgctl=${OCF_RESKEY_pgctl_default}} : ${OCF_RESKEY_psql=${OCF_RESKEY_psql_default}} : ${OCF_RESKEY_pgdata=${OCF_RESKEY_pgdata_default}} : ${OCF_RESKEY_pgdba=${OCF_RESKEY_pgdba_default}} : ${OCF_RESKEY_pghost=${OCF_RESKEY_pghost_default}} : ${OCF_RESKEY_pgport=${OCF_RESKEY_pgport_default}} : ${OCF_RESKEY_pglibs=${OCF_RESKEY_pglibs_default}} : ${OCF_RESKEY_config=${OCF_RESKEY_pgdata}/postgresql.conf} : ${OCF_RESKEY_start_opt=${OCF_RESKEY_start_opt_default}} : ${OCF_RESKEY_pgdb=${OCF_RESKEY_pgdb_default}} : ${OCF_RESKEY_logfile=${OCF_RESKEY_logfile_default}} : ${OCF_RESKEY_stop_escalate=${OCF_RESKEY_stop_escalate_default}} : ${OCF_RESKEY_monitor_user=${OCF_RESKEY_monitor_user_default}} : ${OCF_RESKEY_monitor_password=${OCF_RESKEY_monitor_password_default}} : ${OCF_RESKEY_monitor_sql=${OCF_RESKEY_monitor_sql_default}} : ${OCF_RESKEY_check_wal_receiver=${OCF_RESKEY_check_wal_receiver_default}} # for replication : ${OCF_RESKEY_rep_mode=${OCF_RESKEY_rep_mode_default}} : ${OCF_RESKEY_node_list=${OCF_RESKEY_node_list_default}} : ${OCF_RESKEY_restore_command=${OCF_RESKEY_restore_command_default}} : ${OCF_RESKEY_archive_cleanup_command=${OCF_RESKEY_archive_cleanup_command_default}} : ${OCF_RESKEY_recovery_end_command=${OCF_RESKEY_recovery_end_command_default}} : ${OCF_RESKEY_master_ip=${OCF_RESKEY_master_ip_default}} : ${OCF_RESKEY_repuser=${OCF_RESKEY_repuser_default}} : ${OCF_RESKEY_primary_conninfo_opt=${OCF_RESKEY_primary_conninfo_opt_default}} : ${OCF_RESKEY_restart_on_promote=${OCF_RESKEY_restart_on_promote_default}} : ${OCF_RESKEY_tmpdir=${OCF_RESKEY_tmpdir_default}} : ${OCF_RESKEY_xlog_check_count=${OCF_RESKEY_xlog_check_count_default}} : ${OCF_RESKEY_crm_attr_timeout=${OCF_RESKEY_crm_attr_timeout_default}} : ${OCF_RESKEY_stop_escalate_in_slave=${OCF_RESKEY_stop_escalate_in_slave_default}} usage() { cat < 1.0 Resource script for PostgreSQL. It manages a PostgreSQL as an HA resource. Manages a PostgreSQL database instance Path to pg_ctl command. pgctl Start options (-o start_opt in pg_ctl). "-i -p 5432" for example. start_opt Additional pg_ctl options (-w, -W etc..). ctl_opt Path to psql command. psql Path to PostgreSQL data directory. pgdata User that owns PostgreSQL. pgdba Hostname/IP address where PostgreSQL is listening pghost Port where PostgreSQL is listening pgport Custom location of the Postgres libraries. If not set, the standard location will be used. pglibs PostgreSQL user that pgsql RA will user for monitor operations. If it's not set pgdba user will be used. monitor_user Password for monitor user. monitor_password SQL script that will be used for monitor operations. monitor_sql Path to the PostgreSQL configuration file for the instance. Configuration file Database that will be used for monitoring. pgdb Path to PostgreSQL server log output file. logfile Unix socket directory for PostgreSQL socketdir Number of shutdown retries (using -m fast) before resorting to -m immediate stop escalation Replication mode may be set to "async" or "sync" or "slave". They require PostgreSQL 9.1 or later. Once set, "async" and "sync" require node_list, master_ip, and restore_command parameters,as well as configuring PostgreSQL for replication (in postgresql.conf and pg_hba.conf). "slave" means that RA only makes recovery.conf before starting to connect to primary which is running somewhere. It dosen't need master/slave setting. It requires master_ip restore_command parameters. rep_mode All node names. Please separate each node name with a space. This is required for replication. node list restore_command for recovery.conf. This is required for replication. restore_command archive_cleanup_command for recovery.conf. This is used for replication and is optional. archive_cleanup_command recovery_end_command for recovery.conf. This is used for replication and is optional. recovery_end_command Master's floating IP address to be connected from hot standby. This parameter is used for "primary_conninfo" in recovery.conf. This is required for replication. master ip User used to connect to the master server. This parameter is used for "primary_conninfo" in recovery.conf. This is required for replication. repuser primary_conninfo options of recovery.conf except host, port, user and application_name. This is optional for replication. primary_conninfo_opt If this is true, RA deletes recovery.conf and restarts PostgreSQL on promote to keep Timeline ID. It probably makes fail-over slower. It's recommended to set on-fail of promote up as fence. This is optional for replication. restart_on_promote Path to temporary directory. This is optional for replication. tmpdir Number of checks of xlog on monitor before promote. This is optional for replication. xlog check count The timeout of crm_attribute forever update command. Default value is 5 seconds. This is optional for replication. The timeout of crm_attribute forever update command. Number of shutdown retries (using -m fast) before resorting to -m immediate in slave state. This is optional for replication. stop escalation_in_slave If this is true, RA checks wal_receiver process on monitor and notifies its status using "(resource name)-receiver-status" attribute. It's useful for checking whether PostgreSQL (hot standby) connects to primary. The attribute shows status as "normal" or "ERROR". check_wal_receiver EOF } # # Run the given command in the Resource owner environment... # runasowner() { local quietrun="" local loglevel="-err" local var for var in 1 2 do case "$1" in "-q") quietrun="-q" shift 1;; "warn"|"err") loglevel="-$1" shift 1;; *) ;; esac done ocf_run $quietrun $loglevel su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; $*" } # # Shell escape # escape_string() { echo "$*" | sed -e "s/'/'\\\\''/g" } # # methods: What methods/operations do we support? # pgsql_methods() { cat </dev/null 2>&1" return $? fi # No PID file false } pgsql_wal_receiver_status() { local PID local receiver_parent_pids PID=`head -n 1 $PIDFILE` receiver_parent_pids=`ps -ef | tr -s " " | grep "[w]al receiver process" | cut -d " " -f 3` if echo "$receiver_parent_pids" | grep -q -w "$PID" ; then attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "normal" -q return 0 fi attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "ERROR" -q ocf_log warn "wal receiver process is not running" return 1 } # # pgsql_real_monitor # pgsql_real_monitor() { local loglevel local rc local output # Set the log level of the error message loglevel=${1:-err} if ! pgsql_status then ocf_log info "PostgreSQL is down" return $OCF_NOT_RUNNING fi if ocf_is_true ${OCF_RESKEY_check_wal_receiver}; then pgsql_wal_receiver_status fi if is_replication; then #Check replication state output=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ -Atc \"${CHECK_MS_SQL}\""` rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc $loglevel "Can't get PostgreSQL recovery status." return $OCF_ERR_GENERIC fi case "$output" in f) ocf_log debug "PostgreSQL is running as a primary." if [ "$OCF_RESKEY_monitor_sql" = "$OCF_RESKEY_monitor_sql_default" ]; then return $OCF_RUNNING_MASTER fi ;; t) ocf_log debug "PostgreSQL is running as a hot standby." return $OCF_SUCCESS;; *) ocf_log err "$CHECK_MS_SQL output is $output" return $OCF_ERR_GENERIC;; esac fi OCF_RESKEY_monitor_sql=`escape_string "$OCF_RESKEY_monitor_sql"` runasowner -q $loglevel "$OCF_RESKEY_psql $psql_options \ -c '$OCF_RESKEY_monitor_sql'" rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc $loglevel "PostgreSQL $OCF_RESKEY_pgdb isn't running." return $OCF_ERR_GENERIC fi if is_replication; then return $OCF_RUNNING_MASTER fi return $OCF_SUCCESS } pgsql_replication_monitor() { local rc rc=$1 if [ $rc -ne $OCF_SUCCESS -a $rc -ne "$OCF_RUNNING_MASTER" ]; then return $rc fi # If I am Master if [ $rc -eq $OCF_RUNNING_MASTER ]; then change_data_status "$NODENAME" "LATEST" change_pgsql_status "$NODENAME" "PRI" control_slave_status || return $OCF_ERR_GENERIC if [ "$RE_CONTROL_SLAVE" = "true" ]; then sleep 2 ocf_log info "re-controlling slave status." RE_CONTROL_SLAVE="none" control_slave_status || return $OCF_ERR_GENERIC fi return $rc fi # I can't get master node name from $OCF_RESKEY_CRM_meta_notify_master_uname on monitor, # so I will get master node name using crm_mon -n print_crm_mon | tr -d "\t" | tr -d " " | grep -q "^${RESOURCE_NAME}[(:].*[):]Master" if [ $? -ne 0 ] ; then # If I am Slave and Master is not exist ocf_log info "Master does not exist." change_pgsql_status "$NODENAME" "HS:alone" have_master_right if [ $? -eq 0 ]; then rm -f ${XLOG_NOTE_FILE}.* fi else output=`exec_with_retry 0 $CRM_ATTR_FOREVER -N "$NODENAME" \ -n "$PGSQL_DATA_STATUS_ATTR" -G -q` if [ "$output" = "DISCONNECT" ]; then change_pgsql_status "$NODENAME" "HS:alone" fi fi return $rc } #pgsql_monitor: pgsql_real_monitor() wrapper for replication pgsql_monitor() { local rc pgsql_real_monitor rc=$? if ! is_replication; then return $rc else pgsql_replication_monitor $rc return $? fi } # pgsql_post_demote pgsql_post_demote() { DEMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname | sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` ocf_log debug "post-demote called. Demote uname is $DEMOTE_NODE" if [ "$DEMOTE_NODE" != "$NODENAME" ]; then if ! echo $OCF_RESKEY_CRM_meta_notify_master_uname | tr '[A-Z]' '[a-z]' | grep $NODENAME; then show_master_baseline change_pgsql_status "$NODENAME" "HS:alone" fi fi return $OCF_SUCCESS } pgsql_pre_promote() { local master_baseline local my_master_baseline local cmp_location local number_of_nodes # If my data is newer than new master's one, I fail my resource. PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` number_of_nodes=`echo $NODE_LIST | wc -w` if [ $number_of_nodes -ge 3 -a \ "$OCF_RESKEY_rep_mode" = "sync" -a \ "$PROMOTE_NODE" != "$NODENAME" ]; then master_baseline=`$CRM_ATTR_REBOOT -N "$PROMOTE_NODE" -n \ "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` if [ $? -eq 0 ]; then my_master_baseline=`$CRM_ATTR_REBOOT -N "$NODENAME" -n \ "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` # get older location cmp_location=`printf "$master_baseline\n$my_master_baseline\n" |\ sort | head -1` if [ "$cmp_location" != "$my_master_baseline" ]; then ocf_log err "My data is newer than new master's one. New master's location : $master_baseline" exec_with_retry 0 $CRM_FAILCOUNT -r $OCF_RESOURCE_INSTANCE -U $NODENAME -v INFINITY return $OCF_ERR_GENERIC fi fi fi return $OCF_SUCCESS } pgsql_notify() { local type="${OCF_RESKEY_CRM_meta_notify_type}" local op="${OCF_RESKEY_CRM_meta_notify_operation}" local rc if ! is_replication; then return $OCF_SUCCESS fi ocf_log debug "notify: ${type} for ${op}" case $type in pre) case $op in promote) pgsql_pre_promote return $? ;; esac ;; post) case $op in promote) delete_xlog_location PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` if [ "$PROMOTE_NODE" != "$NODENAME" ]; then delete_master_baseline fi return $OCF_SUCCESS ;; demote) pgsql_post_demote return $? ;; start|stop) MASTER_NODE=`echo $OCF_RESKEY_CRM_meta_notify_master_uname | \ sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` if [ "$NODENAME" = "$MASTER_NODE" ]; then control_slave_status fi return $OCF_SUCCESS ;; esac ;; esac return $OCF_SUCCESS } control_slave_status() { local rc local data_status local target local all_data_status local tmp_data_status local number_of_nodes all_data_status=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ -Atc \"${CHECK_REPLICATION_STATE_SQL}\""` rc=$? if [ $rc -eq 0 ]; then if [ -n "$all_data_status" ]; then all_data_status=`echo $all_data_status | sed "s/\n/ /g"` fi else report_psql_error $rc err "Can't get PostgreSQL replication status." return 1 fi number_of_nodes=`echo $NODE_LIST | wc -w` for target in $NODE_LIST; do if [ "$target" = "$NODENAME" ]; then continue fi data_status="DISCONNECT" if [ -n "$all_data_status" ]; then for tmp_data_status in $all_data_status; do if ! echo $tmp_data_status | grep -q "^${target}|"; then continue fi data_status=`echo $tmp_data_status | cut -d "|" -f 2,3` ocf_log debug "node_name and data_status is $tmp_data_status" break done fi case "$data_status" in "STREAMING|SYNC") change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_PROMOTE" change_pgsql_status "$target" "HS:sync" ;; "STREAMING|ASYNC") change_data_status "$target" "$data_status" if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then change_master_score "$target" "$CAN_NOT_PROMOTE" set_sync_mode "$target" else if [ $number_of_nodes -le 2 ]; then change_master_score "$target" "$CAN_PROMOTE" else # I can't determine which slave's data is newest in async mode. change_master_score "$target" "$CAN_NOT_PROMOTE" fi fi change_pgsql_status "$target" "HS:async" ;; "STREAMING|POTENTIAL") change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_NOT_PROMOTE" change_pgsql_status "$target" "HS:potential" ;; "DISCONNECT") change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_NOT_PROMOTE" if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then set_async_mode "$target" fi ;; *) change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_NOT_PROMOTE" if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then set_async_mode "$target" fi change_pgsql_status "$target" "HS:connected" ;; esac done return 0 } have_master_right() { local old local new local output local data_status local node local mylocation local count local newestXlog local oldfile local newfile ocf_log debug "Checking if I have a master right." data_status=`$CRM_ATTR_FOREVER -N "$NODENAME" -n \ "$PGSQL_DATA_STATUS_ATTR" -G -q 2>/dev/null` if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \ "$data_status" != "LATEST" ]; then ocf_log warn "My data is out-of-date. status=$data_status" return 1 fi else if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \ "$data_status" != "STREAMING|ASYNC" -a \ "$data_status" != "LATEST" ]; then ocf_log warn "My data is out-of-date. status=$data_status" return 1 fi fi ocf_log info "My data status=$data_status." show_xlog_location if [ $? -ne 0 ]; then ocf_log err "Failed to show my xlog location." exit $OCF_ERR_GENERIC fi old=0 for count in `seq $OCF_RESKEY_xlog_check_count`; do if [ -f ${XLOG_NOTE_FILE}.$count ]; then old=$count continue fi break done new=`expr $old + 1` # get xlog locations of all nodes for node in ${NODE_LIST}; do output=`$CRM_ATTR_REBOOT -N "$node" -n \ "$PGSQL_XLOG_LOC_NAME" -G -q 2>/dev/null` if [ $? -ne 0 ]; then ocf_log warn "Can't get $node xlog location." continue else ocf_log info "$node xlog location : $output" echo "$node $output" >> ${XLOG_NOTE_FILE}.${new} if [ "$node" = "$NODENAME" ]; then mylocation=$output fi fi done oldfile=`cat ${XLOG_NOTE_FILE}.${old} 2>/dev/null` newfile=`cat ${XLOG_NOTE_FILE}.${new} 2>/dev/null` if [ "$oldfile" != "$newfile" ]; then # reset counter rm -f ${XLOG_NOTE_FILE}.* printf "$newfile\n" > ${XLOG_NOTE_FILE}.0 return 1 fi if [ "$new" -ge "$OCF_RESKEY_xlog_check_count" ]; then newestXlog=`printf "$newfile\n" | sort -t " " -k 2,3 -r | \ head -1 | cut -d " " -f 2` if [ "$newestXlog" = "$mylocation" ]; then ocf_log info "I have a master right." exec_with_retry 5 $CRM_MASTER -v $PROMOTE_ME return 0 fi change_data_status "$NODENAME" "DISCONNECT" ocf_log info "I don't have correct master data." # reset counter rm -f ${XLOG_NOTE_FILE}.* printf "$newfile\n" > ${XLOG_NOTE_FILE}.0 fi return 1 } is_replication() { if [ "$OCF_RESKEY_rep_mode" != "none" -a "$OCF_RESKEY_rep_mode" != "slave" ]; then return 0 fi return 1 } get_my_location() { local rc local output local replay_loc local receive_loc local output1 local output2 local log1 local log2 local newer_location output=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ -Atc \"${CHECK_XLOG_LOC_SQL}\""` rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc err "Can't get my xlog location." return 1 fi replay_loc=`echo $output | cut -d "|" -f 1` receive_loc=`echo $output | cut -d "|" -f 2` output1=`echo "$replay_loc" | cut -d "/" -f 1` output2=`echo "$replay_loc" | cut -d "/" -f 2` log1=`printf "%08s\n" $output1 | sed "s/ /0/g"` log2=`printf "%08s\n" $output2 | sed "s/ /0/g"` replay_loc="${log1}${log2}" output1=`echo "$receive_loc" | cut -d "/" -f 1` output2=`echo "$receive_loc" | cut -d "/" -f 2` log1=`printf "%08s\n" $output1 | sed "s/ /0/g"` log2=`printf "%08s\n" $output2 | sed "s/ /0/g"` receive_loc="${log1}${log2}" newer_location=`printf "$replay_loc\n$receive_loc" | sort -r | head -1` echo "$newer_location" return 0 } show_xlog_location() { local location location=`get_my_location` || return 1 exec_with_retry 0 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -v "$location" } delete_xlog_location() { exec_with_retry 5 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -D } show_master_baseline() { local rc local location location=`get_my_location` ocf_log info "My master baseline : $location." exec_with_retry 0 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -v "$location" } delete_master_baseline() { exec_with_retry 5 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -D } set_async_mode_all() { [ "$OCF_RESKEY_rep_mode" = "sync" ] || return 0 ocf_log info "Set all nodes into async mode." runasowner -q err "echo \"synchronous_standby_names = ''\" > \"$REP_MODE_CONF\"" if [ $? -ne 0 ]; then ocf_log err "Can't set all nodes into async mode." return 1 fi return 0 } set_async_mode() { cat $REP_MODE_CONF | grep -q -e "[,' ]$1[,' ]" if [ $? -eq 0 ]; then ocf_log info "Setup $1 into async mode." runasowner -q err "echo \"synchronous_standby_names = ''\" > \"$REP_MODE_CONF\"" else ocf_log debug "$1 is already in async mode." return 0 fi exec_with_retry 0 reload_conf } set_sync_mode() { local sync_node_in_conf sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2` if [ -n "$sync_node_in_conf" ]; then ocf_log debug "$sync_node_in_conf is already sync mode." else ocf_log info "Setup $1 into sync mode." runasowner -q err "echo \"synchronous_standby_names = '$1'\" > \"$REP_MODE_CONF\"" [ "$RE_CONTROL_SLAVE" = "false" ] && RE_CONTROL_SLAVE="true" exec_with_retry 0 reload_conf fi } reload_conf() { # Invoke pg_ctl runasowner "$OCF_RESKEY_pgctl -D $OCF_RESKEY_pgdata reload" if [ $? -eq 0 ]; then ocf_log info "Reload configuration file." else ocf_log err "Can't reload configuration file." return 1 fi return 0 } user_recovery_conf() { # put archive_cleanup_command and recovery_end_command only when defined by user if [ -n "$OCF_RESKEY_archive_cleanup_command" ]; then echo "archive_cleanup_command = '${OCF_RESKEY_archive_cleanup_command}'" fi if [ -n "$OCF_RESKEY_recovery_end_command" ]; then echo "recovery_end_command = '${OCF_RESKEY_recovery_end_command}'" fi } make_recovery_conf() { runasowner "touch $RECOVERY_CONF" if [ $? -ne 0 ]; then ocf_log err "Can't create recovery.conf." return 1 fi cat > $RECOVERY_CONF <> $RECOVERY_CONF ocf_log debug "Created recovery.conf. host=${OCF_RESKEY_master_ip}, user=${OCF_RESKEY_repuser}" return 0 } # change pgsql-status. # arg1:node, arg2: value change_pgsql_status() { local output if ! is_node_online $1; then return 0 fi output=`$CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -G -q 2>/dev/null` if [ "$output" != "$2" ]; then # If slave's disk is broken, RA cannot read PID file # and misjudges the PostgreSQL as down while it is running. # It causes overwriting of pgsql-status by Master because replication is still connected. if [ "$output" = "STOP" -o "$output" = "UNKNOWN" ]; then if [ "$1" != "$NODENAME" ]; then ocf_log warn "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2 by $NODENAME is prohibited." return 0 fi fi ocf_log info "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2." exec_with_retry 0 $CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -v "$2" fi return 0 } # change pgsql-data-status. # arg1:node, arg2: value change_data_status() { local output if ! node_exist $1; then return 0 fi while : do output=`$CRM_ATTR_FOREVER -N "$1" -n "$PGSQL_DATA_STATUS_ATTR" -G -q 2>/dev/null` if [ "$output" != "$2" ]; then ocf_log info "Changing $PGSQL_DATA_STATUS_ATTR on $1 : $output->$2." exec_with_retry 0 exec_with_timeout 0 "$CRM_ATTR_FOREVER" -N $1 -n $PGSQL_DATA_STATUS_ATTR -v "$2" else break fi done return 0 } # set master-score # arg1:node, arg2: score, arg3: resoure set_master_score() { local current_score current_score=`$CRM_ATTR_REBOOT -N "$1" -n "master-$3" -G -q 2>/dev/null` if [ -n "$current_score" -a "$current_score" != "$2" ]; then ocf_log info "Changing $3 master score on $1 : $current_score->$2." exec_with_retry 0 $CRM_ATTR_REBOOT -N "$target" -n "master-$3" -v "$2" fi return 0 } # change master-score # arg1:node, arg2: score change_master_score() { local instance if ! is_node_online $1; then return 0 fi if echo $OCF_RESOURCE_INSTANCE | grep -q ":"; then # If Pacemaker version is 1.0.x instance=0 while : do if [ "$instance" -ge "$OCF_RESKEY_CRM_meta_clone_max" ]; then break fi if [ "${RESOURCE_NAME}:${instance}" = "$OCF_RESOURCE_INSTANCE" ]; then instance=`expr $instance + 1` continue fi set_master_score $1 $2 "${RESOURCE_NAME}:${instance}" || return 1 instance=`expr $instance + 1` done else # If globally-unique=false and Pacemaker version is 1.1.8 or higher # Master/Slave resource has no instance number set_master_score $1 $2 ${RESOURCE_NAME} || return 1 fi return 0 } report_psql_error() { local rc local loglevel local message rc=$1 loglevel=${2:-err} message="$3" ocf_log $loglevel "$message rc=$rc" if [ $rc -eq 1 ]; then ocf_log err "Fatal error (out of memory, file not found, etc.) occurred while executing the psql command." elif [ $rc -eq 2 ]; then ocf_log $loglevel "Connection error (connection to the server went bad and the session was not interactive) occurred while executing the psql command." elif [ $rc -eq 3 ]; then ocf_log err "Script error (the variable ON_ERROR_STOP was set) occurred while executing the psql command." fi } # # timeout management function # arg1 timeout >= 0 (if arg1 is 0, OCF_RESKEY_crm_attr_timeout is used.) # arg2 : command # arg3 : command's args exec_with_timeout() { local func_pid local count=$OCF_RESKEY_crm_attr_timeout local rc if [ "$1" -ne 0 ]; then count=$1 fi shift $* & func_pid=$! sleep .1 while kill -s 0 $func_pid >/dev/null 2>&1; do sleep 1 count=`expr $count - 1` if [ $count -le 0 ]; then ocf_log err "\"$*\" (pid=$func_pid) timed out." kill -s 9 $func_pid >/dev/null 2>&1 return 1 fi ocf_log info "Waiting($count). \"$*\" (pid=$func_pid)." done wait $func_pid } # retry command when command doesn't return 0 # arg1 : count >= 0 (if arg1 is 0, it retries command in infinitum(1day)) # arg2..argN : command and args exec_with_retry() { local count="86400" local output local rc if [ "$1" -ne 0 ]; then count=$1 fi shift while [ $count -gt 0 ]; do output=`$*` rc=$? if [ $rc -ne 0 ]; then ocf_log warn "Retrying(remain $count). \"$*\" failed. rc=$rc. stdout=\"$output\"." count=`expr $count - 1` sleep 1 else printf "${output}" return 0 fi done ocf_log err "giving up executing \"$*\"" return $rc } is_node_online() { print_crm_mon | tr '[A-Z]' '[a-z]' | grep -e "^node $1 " -e "^node $1:" | grep -q -v "offline" } node_exist() { print_crm_mon | tr '[A-Z]' '[a-z]' | grep -q "^node $1" } check_binary2() { if ! have_binary "$1"; then ocf_log err "Setup problem: couldn't find command: $1" return 1 fi return 0 } check_config() { local rc=0 if [ ! -f "$1" ]; then if ocf_is_probe; then ocf_log info "Configuration file is $1 not readable during probe." rc=1 else ocf_log err "Configuration file $1 doesn't exist" rc=2 fi fi return $rc } # Validate most critical parameters pgsql_validate_all() { local version local check_config_rc local rep_mode_string if ! check_binary2 "$OCF_RESKEY_pgctl" || ! check_binary2 "$OCF_RESKEY_psql"; then return $OCF_ERR_INSTALLED fi check_config "$OCF_RESKEY_config" check_config_rc=$? [ $check_config_rc -eq 2 ] && return $OCF_ERR_INSTALLED [ $check_config_rc -eq 0 ] && : ${OCF_RESKEY_socketdir=`get_pgsql_param unix_socket_directory`} getent passwd $OCF_RESKEY_pgdba >/dev/null 2>&1 if [ ! $? -eq 0 ]; then ocf_log err "User $OCF_RESKEY_pgdba doesn't exist"; return $OCF_ERR_INSTALLED; fi if ocf_is_probe; then ocf_log info "Don't check $OCF_RESKEY_pgdata during probe" else if ! runasowner "test -w $OCF_RESKEY_pgdata"; then ocf_log err "Directory $OCF_RESKEY_pgdata is not writable by $OCF_RESKEY_pgdba" return $OCF_ERR_PERM; fi fi if [ -n "$OCF_RESKEY_monitor_user" -a ! -n "$OCF_RESKEY_monitor_password" ] then ocf_log err "monitor password can't be empty" return $OCF_ERR_CONFIGURED fi if [ ! -n "$OCF_RESKEY_monitor_user" -a -n "$OCF_RESKEY_monitor_password" ] then ocf_log err "monitor_user has to be set if monitor_password is set" return $OCF_ERR_CONFIGURED fi if is_replication || [ "$OCF_RESKEY_rep_mode" = "slave" ]; then version=`cat $OCF_RESKEY_pgdata/PG_VERSION` if [ `printf "$version\n9.1" | sort -n | head -1` != "9.1" ]; then ocf_log err "Replication mode needs PostgreSQL 9.1 or higher." return $OCF_ERR_INSTALLED fi if [ ! -n "$OCF_RESKEY_master_ip" ]; then ocf_log err "master_ip can't be empty." return $OCF_ERR_CONFIGURED fi fi if is_replication; then if ! ocf_is_ms; then ocf_log err "Replication(rep_mode=async or sync) requires Master/Slave configuration." return $OCF_ERR_CONFIGURED fi if [ ! "$OCF_RESKEY_rep_mode" = "sync" -a ! "$OCF_RESKEY_rep_mode" = "async" ]; then ocf_log err "Invalid rep_mode : $OCF_RESKEY_rep_mode" return $OCF_ERR_CONFIGURED fi if [ ! -n "$NODE_LIST" ]; then ocf_log err "node_list can't be empty." return $OCF_ERR_CONFIGURED fi if [ $check_config_rc -eq 0 ]; then rep_mode_string="include '$REP_MODE_CONF' # added by pgsql RA" if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then if ! grep -q "$rep_mode_string" $OCF_RESKEY_config; then ocf_log info "adding include directive into $OCF_RESKEY_config" echo "$rep_mode_string" >> $OCF_RESKEY_config fi else if grep -q "$rep_mode_string" $OCF_RESKEY_config; then ocf_log info "deleting include directive from $OCF_RESKEY_config" sed -i "/${rep_mode_string//\//\\/}/d" $OCF_RESKEY_config fi fi fi if ! mkdir -p $OCF_RESKEY_tmpdir || ! chown $OCF_RESKEY_pgdba $OCF_RESKEY_tmpdir || ! chmod 700 $OCF_RESKEY_tmpdir; then ocf_log err "Can't create directory $OCF_RESKEY_tmpdir or it is not readable by $OCF_RESKEY_pgdba" return $OCF_ERR_PERM fi fi if [ "$OCF_RESKEY_rep_mode" = "slave" ]; then if ocf_is_ms; then ocf_log err "Replication(rep_mode=slave) does not support Master/Slave configuration." return $OCF_ERR_CONFIGURED fi fi return $OCF_SUCCESS } # # Check if we need to create a log file # check_log_file() { if [ ! -f "$1" ] then touch $1 > /dev/null 2>&1 chown $OCF_RESKEY_pgdba:`getent passwd $OCF_RESKEY_pgdba | cut -d ":" -f 4` $1 fi #Check if $OCF_RESKEY_pgdba can write to the log file if ! runasowner "test -w $1" then return 1 fi return 0 } # # Check socket directory # check_socket_dir() { if [ ! -d "$OCF_RESKEY_socketdir" ]; then if ! mkdir "$OCF_RESKEY_socketdir"; then ocf_log err "Can't create directory $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi if ! chown $OCF_RESKEY_pgdba:`getent passwd \ $OCF_RESKEY_pgdba | cut -d ":" -f 4` "$OCF_RESKEY_socketdir" then ocf_log err "Can't change ownership for $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi if ! chmod 2775 "$OCF_RESKEY_socketdir"; then ocf_log err "Can't change permissions for $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi else if ! runasowner "touch $OCF_RESKEY_socketdir/test.$$"; then ocf_log err "$OCF_RESKEY_pgdba can't create files in $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi rm $OCF_RESKEY_socketdir/test.$$ fi } print_crm_mon() { if [ -z "$CRM_MON_OUTPUT" ]; then CRM_MON_OUTPUT=`exec_with_retry 0 crm_mon -n1` fi printf "${CRM_MON_OUTPUT}\n" } # # 'main' starts here... # if [ $# -ne 1 ] then usage exit $OCF_ERR_GENERIC fi PIDFILE=${OCF_RESKEY_pgdata}/postmaster.pid BACKUPLABEL=${OCF_RESKEY_pgdata}/backup_label RESOURCE_NAME=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1` PGSQL_WAL_RECEIVER_STATUS_ATTR="${RESOURCE_NAME}-receiver-status" RECOVERY_CONF=${OCF_RESKEY_pgdata}/recovery.conf -NODENAME=`uname -n | tr '[A-Z]' '[a-z]'` +NODENAME=$(ocf_local_nodename | tr '[A-Z]' '[a-z]') if is_replication; then REP_MODE_CONF=${OCF_RESKEY_tmpdir}/rep_mode.conf PGSQL_LOCK=${OCF_RESKEY_tmpdir}/PGSQL.lock XLOG_NOTE_FILE=${OCF_RESKEY_tmpdir}/xlog_note CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot" CRM_ATTR_REBOOT="${HA_SBIN_DIR}/crm_attribute -l reboot" CRM_ATTR_FOREVER="${HA_SBIN_DIR}/crm_attribute -l forever" CRM_FAILCOUNT="${HA_SBIN_DIR}/crm_failcount" CAN_NOT_PROMOTE="-INFINITY" CAN_PROMOTE="100" PROMOTE_ME="1000" CHECK_MS_SQL="select pg_is_in_recovery()" CHECK_XLOG_LOC_SQL="select pg_last_xlog_replay_location(),pg_last_xlog_receive_location()" CHECK_REPLICATION_STATE_SQL="select application_name,upper(state),upper(sync_state) from pg_stat_replication" PGSQL_STATUS_ATTR="${RESOURCE_NAME}-status" PGSQL_DATA_STATUS_ATTR="${RESOURCE_NAME}-data-status" PGSQL_XLOG_LOC_NAME="${RESOURCE_NAME}-xlog-loc" PGSQL_MASTER_BASELINE="${RESOURCE_NAME}-master-baseline" NODE_LIST=`echo $OCF_RESKEY_node_list | tr '[A-Z]' '[a-z]'` RE_CONTROL_SLAVE="false" fi case "$1" in methods) pgsql_methods exit $?;; meta-data) meta_data exit $OCF_SUCCESS;; esac pgsql_validate_all rc=$? [ "$1" = "validate-all" ] && exit $rc if [ $rc -ne 0 ] then case "$1" in stop) if is_replication; then change_pgsql_status "$NODENAME" "UNKNOWN" fi exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $OCF_NOT_RUNNING;; *) exit $rc;; esac fi US=`id -u -n` if [ $US != root -a $US != $OCF_RESKEY_pgdba ] then ocf_log err "$0 must be run as root or $OCF_RESKEY_pgdba" exit $OCF_ERR_GENERIC fi # make psql command options if [ -n "$OCF_RESKEY_monitor_user" ]; then PGUSER=$OCF_RESKEY_monitor_user; export PGUSER PGPASSWORD=$OCF_RESKEY_monitor_password; export PGPASSWORD psql_options="-p $OCF_RESKEY_pgport $OCF_RESKEY_pgdb" else psql_options="-p $OCF_RESKEY_pgport -U $OCF_RESKEY_pgdba $OCF_RESKEY_pgdb" fi if [ -n "$OCF_RESKEY_pghost" ]; then psql_options="$psql_options -h $OCF_RESKEY_pghost" else if [ -n "$OCF_RESKEY_socketdir" ]; then psql_options="$psql_options -h $OCF_RESKEY_socketdir" fi fi if [ -n "$OCF_RESKEY_pgport" ]; then export PGPORT=$OCF_RESKEY_pgport fi if [ -n "$OCF_RESKEY_pglibs" ]; then if [ -n "$LD_LIBRARY_PATH" ]; then export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$OCF_RESKEY_pglibs else export LD_LIBRARY_PATH=$OCF_RESKEY_pglibs fi fi # What kind of method was invoked? case "$1" in status) if pgsql_status then ocf_log info "PostgreSQL is up" exit $OCF_SUCCESS else ocf_log info "PostgreSQL is down" exit $OCF_NOT_RUNNING fi;; monitor) pgsql_monitor exit $?;; start) pgsql_start exit $?;; promote) pgsql_promote exit $?;; demote) pgsql_demote exit $?;; notify) pgsql_notify exit $?;; stop) pgsql_stop exit $?;; *) exit $OCF_ERR_UNIMPLEMENTED;; esac