diff --git a/heartbeat/SAPInstance b/heartbeat/SAPInstance index 016f59aff..e3fe788ae 100755 --- a/heartbeat/SAPInstance +++ b/heartbeat/SAPInstance @@ -1,1076 +1,1076 @@ #!/bin/sh # # SAPInstance # # Description: Manages a single SAP Instance as a High-Availability # resource. One SAP Instance is defined by one # SAP Instance-Profile. start/stop handles all services # of the START-Profile, status and monitor care only # about essential services. # # Author: Alexander Krauth, June 2006 # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2006-2008 Alexander Krauth # # An example usage: # See usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_InstanceName # OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) # OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_WAITTIME (optional, to solve timing problems during J2EE-Addin start) # OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery using cleanipc, default is false) # OCF_RESKEY_MONITOR_SERVICES (optional, default is to monitor critical services only) # OCF_RESKEY_SHUTDOWN_METHOD (optional, defaults to NORMAL, KILL: terminate the SAP instance with OS commands - faster, at your own risk) -# OCF_RESKEY_ERS_InstanceName (optional, InstanceName of the ERS instance in a Master/Slave configuration) -# OCF_RESKEY_ERS_START_PROFILE (optional, START_PROFILE of the ERS instance in a Master/Slave configuration) +# OCF_RESKEY_ERS_InstanceName (optional, InstanceName of the ERS instance in a Promotable configuration) +# OCF_RESKEY_ERS_START_PROFILE (optional, START_PROFILE of the ERS instance in a Promotable configuration) # OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started) # OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started) # OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped) # OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped) # OCF_RESKEY_IS_ERS (needed for ENQ/REPL NW 740) # OCF_RESKEY_MINIMAL_PROBE (optional but needed for simple mount structure architecture) # # TODO: - Option to shutdown sapstartsrv for non-active instances -> that means: do probes only with OS tools (sapinstance_status) # - Option for better standalone enqueue server monitoring, using ensmon (test enque-deque) # - Option for cleanup of abandoned enqueue replication tables # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} .
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Parameter defaults OCF_RESKEY_InstanceName_default="" OCF_RESKEY_DIR_EXECUTABLE_default="" OCF_RESKEY_DIR_PROFILE_default="" OCF_RESKEY_START_PROFILE_default="" OCF_RESKEY_START_WAITTIME_default="3600" OCF_RESKEY_AUTOMATIC_RECOVER_default="false" OCF_RESKEY_MONITOR_SERVICES_default="disp+work|msg_server|enserver|enrepserver|jcontrol|jstart|enq_server|enq_replicator" OCF_RESKEY_SHUTDOWN_METHOD_default="normal" OCF_RESKEY_ERS_InstanceName_default="" OCF_RESKEY_ERS_START_PROFILE_default="" OCF_RESKEY_PRE_START_USEREXIT_default="" OCF_RESKEY_POST_START_USEREXIT_default="" OCF_RESKEY_PRE_STOP_USEREXIT_default="" OCF_RESKEY_POST_STOP_USEREXIT_default="" OCF_RESKEY_IS_ERS_default="false" OCF_RESKEY_MINIMAL_PROBE_default="false" : ${OCF_RESKEY_InstanceName=${OCF_RESKEY_InstanceName_default}} : ${OCF_RESKEY_DIR_EXECUTABLE=${OCF_RESKEY_DIR_EXECUTABLE_default}} : ${OCF_RESKEY_DIR_PROFILE=${OCF_RESKEY_DIR_PROFILE_default}} : ${OCF_RESKEY_START_PROFILE=${OCF_RESKEY_START_PROFILE_default}} : ${OCF_RESKEY_START_WAITTIME=${OCF_RESKEY_START_WAITTIME_default}} : ${OCF_RESKEY_AUTOMATIC_RECOVER=${OCF_RESKEY_AUTOMATIC_RECOVER_default}} : ${OCF_RESKEY_MONITOR_SERVICES=${OCF_RESKEY_MONITOR_SERVICES_default}} : ${OCF_RESKEY_SHUTDOWN_METHOD=${OCF_RESKEY_SHUTDOWN_METHOD_default}} : ${OCF_RESKEY_ERS_InstanceName=${OCF_RESKEY_ERS_InstanceName_default}} : ${OCF_RESKEY_ERS_START_PROFILE=${OCF_RESKEY_ERS_START_PROFILE_default}} : ${OCF_RESKEY_PRE_START_USEREXIT=${OCF_RESKEY_PRE_START_USEREXIT_default}} : ${OCF_RESKEY_POST_START_USEREXIT=${OCF_RESKEY_POST_START_USEREXIT_default}} : ${OCF_RESKEY_PRE_STOP_USEREXIT=${OCF_RESKEY_PRE_STOP_USEREXIT_default}} : ${OCF_RESKEY_POST_STOP_USEREXIT=${OCF_RESKEY_POST_STOP_USEREXIT_default}} : ${OCF_RESKEY_IS_ERS=${OCF_RESKEY_IS_ERS_default}} : ${OCF_RESKEY_MINIMAL_PROBE=${OCF_RESKEY_MINIMAL_PROBE_default}} ####################################################################### SH=/bin/sh sapinstance_usage() { methods=`sapinstance_methods` methods=`echo $methods | tr ' ' '|'` cat <<-EOF usage: $0 ($methods) $0 manages a SAP Instance as an HA resource. - The 'start' operation starts the instance or the ERS instance in a Master/Slave configuration + The 'start' operation starts the instance or the ERS instance in a Promotable configuration The 'stop' operation stops the instance The 'status' operation reports whether the instance is running The 'monitor' operation reports whether the instance seems to be working - The 'promote' operation starts the primary instance in a Master/Slave configuration + The 'promote' operation starts the primary instance in a Promotable configuration The 'demote' operation stops the primary instance and starts the ERS instance The 'reload' operation allows changed parameters (non-unique only) without restarting the service The 'notify' operation always returns SUCCESS The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports EOF } sapinstance_meta_data() { cat < 1.0 Usually a SAP system consists of one database and one or more SAP instances (sometimes called application servers). One SAP Instance is defined by having exactly one instance profile. The instance profiles can usually be found in the directory /sapmnt/SID/profile. Each instance must be configured as its own resource in the cluster configuration.
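As an illustration only (the resource name, SID P01, instance DVEBMGS00 and virtual hostname sapp01ci below are hypothetical values, not taken from this agent), a single instance might be configured with the crm shell roughly like this:

  crm configure primitive rsc_SAPInstance_P01_DVEBMGS00 ocf:heartbeat:SAPInstance \
    params InstanceName="P01_DVEBMGS00_sapp01ci" \
    op monitor interval="120s" timeout="60s"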
The resource agent supports the following SAP versions: - SAP WebAS ABAP Release 6.20 - 7.40 - SAP WebAS Java Release 6.40 - 7.40 - SAP WebAS ABAP + Java Add-In Release 6.20 - 7.40 (Java is not monitored by the cluster in that case) When using a SAP Kernel 6.40 please check and implement the actions from the section "Manual postprocessing" from SAP note 995116 (http://sdn.sap.com). Other versions may also work with this agent, but have not been verified. All operations of the SAPInstance resource agent are done by using the startup framework called SAP Management Console or sapstartsrv that was introduced with SAP kernel release 6.40. Find more information about the SAP Management Console in SAP note 1014480. Using this framework defines a clear interface for how the Heartbeat cluster sees the SAP system. The options for monitoring the SAP system are also much better than other methods like just watching the ps command for running processes or doing some pings to the application. sapstartsrv uses SOAP messages to request the status of running SAP processes. Therefore it can actually ask a process itself what its status is, independent of other problems that might exist at the same time. sapstartsrv knows 4 status colours: - GREEN = everything is fine - YELLOW = something is wrong, but the service is still working - RED = the service does not work - GRAY = the service has not been started The SAPInstance resource agent will interpret GREEN and YELLOW as OK. That means that minor problems will not be reported to the Heartbeat cluster. This prevents the cluster from doing an unwanted failover. The statuses RED and GRAY are reported as NOT_RUNNING to the cluster. Depending on the status the cluster expects from the resource, it will do a restart, failover or just nothing. Manages a SAP instance as an HA resource. The fully qualified SAP instance name. e.g. P01_DVEBMGS00_sapp01ci. Usually this is the name of the SAP instance profile. Instance name: SID_INSTANCE_VIR-HOSTNAME The fully qualified path where to find sapstartsrv and sapcontrol. Specify this parameter, if you have changed the SAP kernel directory location after the default SAP installation. Path of sapstartsrv and sapcontrol The fully qualified path where to find the SAP START profile. Specify this parameter, if you have changed the SAP profile directory location after the default SAP installation. Path of start profile The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile instead. Start profile name After this time in seconds, a monitor operation is executed by the resource agent. If the monitor returns SUCCESS, the start is handled as SUCCESS. This is useful to resolve timing problems with e.g. the J2EE-Addin instance. Usually the resource agent waits until all services are started and the SAP Management Console reports a GREEN status. A double stack installation (ABAP + Java AddIn) consists of an ABAP dispatcher and a JAVA instance. Normally the start of the JAVA instance takes much longer than the start of the ABAP instance. For a JAVA Instance you may need to configure a much higher timeout for the start operation of the resource in Heartbeat. The disadvantage here is that the discovery of a failed start by the cluster takes longer. Somebody might say: For me it is important that the ABAP instance is up and running.
A failure of the JAVA instance shall not cause a failover of the SAP instance. Actually the SAP MC reports a YELLOW status, if the JAVA instance of a double stack system fails. From the resource agent point of view YELLOW means: everything is OK. Setting START_WAITTIME to a lower value causes the resource agent to check the status of the instance during a start operation after that time. While it would normally wait for a GREEN status, it now reports SUCCESS to the cluster already after the specified time in case of a YELLOW status. That is only useful for double stack systems. Check the successful start after that time (do not wait for J2EE-Addin) The SAPInstance resource agent tries to recover a failed start attempt automatically one time. This is done by killing running instance processes, removing the kill.sap file and executing cleanipc. Sometimes a crashed SAP instance leaves some processes and/or shared memory segments behind. Setting this option to true will try to remove those leftovers during a start operation. That is to reduce manual work for the administrator. Enable or disable automatic startup recovery Within a SAP instance there can be several services. Usually you will find the defined services in the START profile of the related instance (Attention: with SAP Release 7.10 the START profile content was moved to the instance profile). Not all of those services are worth monitoring by the cluster. For example, you probably do not want to fail over your SAP instance if the central syslog collector daemon fails. Those services are monitored within the SAPInstance resource agent: - disp+work - msg_server - enserver (ENSA1) - enq_server (ENSA2) - enrepserver (ENSA1) - enq_replicator (ENSA2) - jcontrol - jstart Some other services could be monitored as well. They have to be given with the parameter MONITOR_SERVICES, e.g.: - sapwebdisp - TREXDaemon.x These names match the strings used in the output of the command 'sapcontrol -nr [Instance-Nr] -function GetProcessList'. The default should fit most cases where you want to manage a SAP Instance from the cluster. You may change this with this parameter, if you want to monitor more/fewer or other services that sapstartsrv supports. You may specify multiple services separated by a | (pipe) sign in this parameter: disp+work|msg_server|enserver Services to monitor Usually a SAP Instance is stopped by the command 'sapcontrol -nr InstanceNr -function Stop'. SHUTDOWN_METHOD=KILL means to kill the SAP Instance using OS commands. SAP processes of the instance are terminated with 'kill -9', shared memory is deleted with 'cleanipc' and the 'kill.sap' file will be deleted. That method is much faster than the graceful stop, but the instance does not have the chance to say goodbye to other SAP instances in the same system. USE AT YOUR OWN RISK!! Shut down gracefully or kill a SAP instance by terminating the processes. (normal|KILL) - Only used in a Master/Slave resource configuration: + Only used in a Promotable resource configuration: The fully qualified SAP enqueue replication instance name. e.g. P01_ERS02_sapp01ers. Usually this is the name of the SAP instance profile. -The enqueue replication instance must be installed, before you want to configure a master-slave cluster resource. +The enqueue replication instance must be installed, before you want to configure a promotable cluster resource.
-The master-slave configuration in the cluster must use this properties: +The promotable configuration in the cluster must use these properties: clone_max = 2 clone_node_max = 1 master_node_max = 1 master_max = 1 Enqueue replication instance name: SID_INSTANCE_VIR-HOSTNAME - Only used in a Master/Slave resource configuration: + Only used in a Promotable resource configuration: The parameter ERS_InstanceName must also be set in this configuration. The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile instead. Enqueue replication start profile name The fully qualified path where to find a script or program which should be executed before this resource gets started. Path to a pre-start script The fully qualified path where to find a script or program which should be executed after this resource got started. Path to a post-start script The fully qualified path where to find a script or program which should be executed before this resource gets stopped. Path to a pre-stop script The fully qualified path where to find a script or program which should be executed after this resource got stopped. Path to a post-stop script - Only used for ASCS/ERS SAP Netweaver installations without implementing a master/slave resource to + Only used for ASCS/ERS SAP Netweaver installations without implementing a promotable resource to allow the ASCS to 'find' the ERS running on another cluster node after a resource failure. This parameter should be set to true 'only' for the ERS instance for implementations following the SAP NetWeaver 7.40 HA certification (NW-HA-CLU-740). This also includes systems with NetWeaver releases below 7.40, if you want to implement the NW-HA-CLU-740 scenario. Mark SAPInstance as ERS instance Setting MINIMAL_PROBE=true forces the resource agent to do only a minimal check during a probe. This is needed for special file system setups. MINIMAL_PROBE=true is only supported if requested either by your vendor's support or if described in an architecture document from your HA vendor. Switch probe action from full to minimal check - - + + END } # # methods: What methods/operations do we support? # sapinstance_methods() { cat <<-EOF start stop status monitor promote demote reload notify validate-all methods meta-data usage EOF } # # is_clone : find out if we are configured to run in a Master/Slave configuration # is_clone() { if [ -n "$OCF_RESKEY_CRM_meta_clone_max" ] \ && [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ] then if [ "$OCF_RESKEY_CRM_meta_clone_max" -ne 2 ] || \ [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] || \ [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] || \ [ "$OCF_RESKEY_CRM_meta_master_max" -ne 1 ] then ocf_log err "Clone options misconfigured. (expect: clone_max=2,clone_node_max=1,master_node_max=1,master_max=1)" exit $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_ERS_InstanceName" ] then ocf_log err "In a Master/Slave configuration the ERS_InstanceName parameter is mandatory." exit $OCF_ERR_ARGS fi else return 0 fi return 1 } # # abnormal_end : essential things are missing, but given the nature of a SAP installation - which can be very different # from customer to customer - we cannot always handle this as an error # This would be the case, if the software is installed on shared disks and not visible # to all cluster nodes at all times.
# abnormal_end() { local err_msg=$1 ocf_is_probe && { sapinstance_status exit $? } ocf_log err $err_msg if [ "$ACTION" = "stop" ] then cleanup_instance exit $OCF_SUCCESS fi exit $OCF_ERR_CONFIGURED } # # sapinstance_init : Define global variables with default values, if optional parameters are not set # # sapinstance_init() { local myInstanceName="$1" SID=`echo "$myInstanceName" | cut -d_ -f1` InstanceName=`echo "$myInstanceName" | cut -d_ -f2` InstanceNr=`echo "$InstanceName" | sed 's/.*\([0-9][0-9]\)$/\1/'` SAPVIRHOST=`echo "$myInstanceName" | cut -d_ -f3` # make sure that we don't carry over variable content from a previous run of sapinstance_init DIR_EXECUTABLE="" SYSTEMCTL="systemctl" # optional OCF parameters, we try to guess which directories are correct if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] then if have_binary /usr/sap/$SID/$InstanceName/exe/sapstartsrv && have_binary /usr/sap/$SID/$InstanceName/exe/sapcontrol then DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" SAPSTARTSRV="/usr/sap/$SID/$InstanceName/exe/sapstartsrv" SAPCONTROL="/usr/sap/$SID/$InstanceName/exe/sapcontrol" elif have_binary /usr/sap/$SID/SYS/exe/run/sapstartsrv && have_binary /usr/sap/$SID/SYS/exe/run/sapcontrol then DIR_EXECUTABLE="/usr/sap/$SID/SYS/exe/run" SAPSTARTSRV="/usr/sap/$SID/SYS/exe/run/sapstartsrv" SAPCONTROL="/usr/sap/$SID/SYS/exe/run/sapcontrol" fi else if have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" && have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" then DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE" SAPSTARTSRV="$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" SAPCONTROL="$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" fi fi sidadm="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" [ -z "$DIR_EXECUTABLE" ] && abnormal_end "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!" if [ -z "$OCF_RESKEY_DIR_PROFILE" ] then DIR_PROFILE="/usr/sap/$SID/SYS/profile" else DIR_PROFILE="$OCF_RESKEY_DIR_PROFILE" fi if [ "$myInstanceName" != "$OCF_RESKEY_InstanceName" ] then currentSTART_PROFILE=$OCF_RESKEY_ERS_START_PROFILE else currentSTART_PROFILE=$OCF_RESKEY_START_PROFILE fi if [ -z "$OCF_RESKEY_IS_ERS" ]; then is_ers="no" else is_ers="$OCF_RESKEY_IS_ERS" fi if [ -z "$currentSTART_PROFILE" ] then if [ !
-r "$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}" -a -r "$DIR_PROFILE/${SID}_${InstanceName}_${SAPVIRHOST}" ]; then SAPSTARTPROFILE="$DIR_PROFILE/${SID}_${InstanceName}_${SAPVIRHOST}" else SAPSTARTPROFILE="$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}" fi else SAPSTARTPROFILE="$currentSTART_PROFILE" fi if [ -z "$OCF_RESKEY_START_WAITTIME" ] then export OCF_RESKEY_START_WAITTIME="${OCF_RESKEY_START_WAITTIME_default}" fi if [ -z "$OCF_RESKEY_MONITOR_SERVICES" ] then export OCF_RESKEY_MONITOR_SERVICES="${OCF_RESKEY_MONITOR_SERVICES_default}" fi # as root user we need the library path to the SAP kernel to be able to call sapcontrol if [ `echo $LD_LIBRARY_PATH | grep -c "^$DIR_EXECUTABLE\>"` -eq 0 ]; then LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH export LD_LIBRARY_PATH fi return $OCF_SUCCESS } # # check_systemd_integration : Check, if SAP instance is controlled by systemd unit file SAP_.service # rc == 0 : sap instance is controlled by the unit file (file at least exists) # rc == 1 : sap instance is NOT controlled by the unit file (file does not exist) # check_systemd_integration() { local systemd_unit_name="SAP${SID}_${InstanceNr}" local rc=1 if which "$SYSTEMCTL" 1>/dev/null 2>/dev/null; then if $SYSTEMCTL list-unit-files | \ awk '$1 == service { found=1 } END { if (! found) {exit 1}}' service="${systemd_unit_name}.service"; then rc=0 else rc=1 fi fi return "$rc" } # # check_sapstartsrv : Before using sapcontrol we make sure that the sapstartsrv is running for the correct instance. # We cannot use sapinit and the /usr/sap/sapservices file in case of an enquerep instance, # because then we have two instances with the same instance number. # check_sapstartsrv() { local restart=0 local runninginst="" local chkrc=$OCF_SUCCESS local output="" # check for sapstartsrv/systemd integration if check_systemd_integration; then # do it the systemd way local systemd_unit_name="SAP${SID}_${InstanceNr}" if $SYSTEMCTL status "$systemd_unit_name" 1>/dev/null 2>/dev/null; then ocf_log info "systemd service $systemd_unit_name is active" else ocf_log warn "systemd service $systemd_unit_name is not active, it will be started using systemd" $SYSTEMCTL start "$systemd_unit_name" 1>/dev/null 2>/dev/null # use start, because restart does also stop sap instance fi return 0 else # otherwise continue with old code... if [ ! -S /tmp/.sapstream5${InstanceNr}13 ]; then ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName (no UDS), it will be started now" restart=1 else output=`$SAPCONTROL -nr $InstanceNr -function ParameterValue INSTANCE_NAME -format script` if [ $? -eq 0 ] then runninginst=`echo "$output" | grep '^0 : ' | cut -d' ' -f3` if [ "$runninginst" != "$InstanceName" ] then ocf_log warn "sapstartsrv is running for instance $runninginst, that service will be killed" restart=1 else output=`$SAPCONTROL -nr $InstanceNr -function AccessCheck Start` if [ $? 
-ne 0 ]; then ocf_log warn "FAILED : sapcontrol -nr $InstanceNr -function AccessCheck Start (`ls -ld1 /tmp/.sapstream5${InstanceNr}13`)" ocf_log warn "sapstartsrv will be restarted to try to solve this situation, otherwise please check sapstartsrv setup (SAP Note 927637)" restart=1 fi fi else ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName, it will be started now" restart=1 fi fi if [ -z "$runninginst" ]; then runninginst=$InstanceName; fi if [ $restart -eq 1 ] then if [ -d /usr/sap/$SID/SYS/profile/ ] then DIR_PROFILE="/usr/sap/$SID/SYS/profile" else abnormal_end "Expected /usr/sap/$SID/SYS/profile/ to be a directory, please set DIR_PROFILE parameter!" fi [ ! -r $SAPSTARTPROFILE ] && abnormal_end "Expected $SAPSTARTPROFILE to be the instance START profile, please set START_PROFILE parameter!" pkill -9 -f "sapstartsrv.*$runninginst" # removing the unix domain socket files as they might have wrong permissions # or ownership - they will be recreated by sapstartsrv during next start rm -f /tmp/.sapstream5${InstanceNr}13 rm -f /tmp/.sapstream5${InstanceNr}14 $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm # now make sure the daemon has been started and is able to respond local srvrc=1 while [ $srvrc -eq 1 -a `pgrep -f "sapstartsrv.*$runninginst" | wc -l` -gt 0 ] do sleep 1 $SAPCONTROL -nr $InstanceNr -function GetProcessList > /dev/null 2>&1 srvrc=$? done if [ $srvrc -ne 1 ] then ocf_log info "sapstartsrv for instance $SID-$InstanceName was restarted!" chkrc=$OCF_SUCCESS else ocf_log error "sapstartsrv for instance $SID-$InstanceName could not be started!" chkrc=$OCF_ERR_GENERIC ocf_is_probe && chkrc=$OCF_NOT_RUNNING fi fi return $chkrc fi } # # sapuserexit : Many SAP customers need some additional processes/tools to run their SAP systems. # These specialties do not allow a totally generic SAP cluster resource agent. # Someone should write a resource agent for each additional process you need, if it # is required to monitor that process within the cluster manager. To enable # you to extend this resource agent without developing a new one, this user exit # was introduced. # sapuserexit() { local NAME="$1" local VALUE="$2" if [ -n "$VALUE" ] then if have_binary "$VALUE" then ocf_log info "Calling userexit ${NAME} with customer script file ${VALUE}" "$VALUE" >/dev/null 2>&1 ocf_log info "Exiting userexit ${NAME} with customer script file ${VALUE}, returncode: $?"
else ocf_log warn "Attribute ${NAME} is set to ${VALUE}, but this file is not executable" fi fi return 0 } # # cleanup_instance : remove resources (processes and shared memory) from a crashed instance) # cleanup_instance() { pkill -9 -f -U $sidadm $InstanceName ocf_log info "Terminated instance using 'pkill -9 -f -U $sidadm $InstanceName'" # it is necessary to call cleanipc as user sidadm if the system has 'vmcj/enable = ON' set - otherwise SHM-segments in /dev/shm/SAP_ES2* cannot be removed su - $sidadm -c "cleanipc $InstanceNr remove" ocf_log info "Tried to remove shared memory resources using 'cleanipc $InstanceNr remove' as user $sidadm" ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/kill.sap ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/shutdown.sap ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgcpid ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgspid return 0 } # # sapinstance_start : Start the SAP instance # sapinstance_start() { sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" local rc=$OCF_NOT_RUNNING local output="" local loopcount=0 while [ $loopcount -lt 2 ] do loopcount=$(($loopcount + 1)) check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ]; then output=`$SAPCONTROL -nr $InstanceNr -function Start` rc=$? ocf_log info "Starting SAP Instance $SID-$InstanceName: $output" fi if [ $rc -ne 0 ] then ocf_log err "SAP Instance $SID-$InstanceName start failed." return $OCF_ERR_GENERIC fi local startrc=1 while [ $startrc -gt 0 ] do local waittime_start=`date +%s` output=`$SAPCONTROL -nr $InstanceNr -function WaitforStarted $OCF_RESKEY_START_WAITTIME 10` startrc=$? local waittime_stop=`date +%s` if [ $startrc -ne 0 ] then if [ $(($waittime_stop - $waittime_start)) -ge $OCF_RESKEY_START_WAITTIME ] then sapinstance_monitor NOLOG if [ $? -eq $OCF_SUCCESS ] then output="START_WAITTIME ($OCF_RESKEY_START_WAITTIME) has elapsed, but instance monitor returned SUCCESS. Instance considered running." startrc=0; loopcount=2 fi else if [ $loopcount -eq 1 ] && ocf_is_true $OCF_RESKEY_AUTOMATIC_RECOVER then ocf_log warn "SAP Instance $SID-$InstanceName start failed: $output" ocf_log warn "Try to recover $SID-$InstanceName" cleanup_instance else loopcount=2 fi startrc=-1 fi else loopcount=2 fi done done if [ $startrc -eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName started: $output" rc=$OCF_SUCCESS sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 1 -l reboot; fi else ocf_log err "SAP Instance $SID-$InstanceName start failed: $output" rc=$OCF_NOT_RUNNING if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 0 -l reboot; fi fi return $rc } # # sapinstance_recover: Try startup of failed instance by cleaning up resources # sapinstance_recover() { cleanup_instance sapinstance_start return $? } # # sapinstance_stop: Stop the SAP instance # sapinstance_stop() { local output="" local rc sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" if [ "$OCF_RESKEY_SHUTDOWN_METHOD" = "KILL" ] then ocf_log info "Stopping SAP Instance $SID-$InstanceName with shutdown method KILL!" cleanup_instance return $OCF_SUCCESS fi check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ]; then output=`$SAPCONTROL -nr $InstanceNr -function Stop` rc=$? ocf_log info "Stopping SAP Instance $SID-$InstanceName: $output" fi if [ $rc -eq 0 ] then output=`$SAPCONTROL -nr $InstanceNr -function WaitforStopped 3600 1` if [ $? 
-eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName stopped: $output" rc=$OCF_SUCCESS else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 0 -l reboot; fi return $rc } # # sapinstance_monitor: Can the given SAP instance do anything useful? # sapinstance_monitor() { local MONLOG=$1 local rc if ocf_is_probe && ocf_is_true "$OCF_RESKEY_MINIMAL_PROBE"; then # code for minimal probe: # grep for sapstartsrv and maybe also for sapstart # TODO: Do we need to improve this minimal test? if pgrep -f -l "sapstartsrv .*pf=.*${SID}_${InstanceName}_${SAPVIRHOST}"; then rc="$OCF_SUCCESS" elif pgrep -f -l "sapstart .*pf=.*${SID}_${InstanceName}_${SAPVIRHOST}"; then rc="$OCF_SUCCESS" else rc="$OCF_NOT_RUNNING" fi else # standard probe and monitoring code check_sapstartsrv rc=$? fi if [ $rc -eq $OCF_SUCCESS ] then local count=0 local SERVNO local output output=`$SAPCONTROL -nr $InstanceNr -function GetProcessList -format script` # we have to parse the output, because the returncode doesn't tell anything about the instance status for SERVNO in `echo "$output" | grep '^[0-9] ' | cut -d' ' -f1 | sort -u` do local COLOR=`echo "$output" | grep "^$SERVNO dispstatus: " | cut -d' ' -f3` local SERVICE=`echo "$output" | grep "^$SERVNO name: " | cut -d' ' -f3` local STATE=0 local SEARCH case $COLOR in GREEN|YELLOW) STATE=$OCF_SUCCESS;; *) STATE=$OCF_NOT_RUNNING;; esac SEARCH=`echo "$OCF_RESKEY_MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'` if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ] then if [ $STATE -eq $OCF_NOT_RUNNING ] then [ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !" rc=$STATE fi count=1 fi done if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] then if ocf_is_probe then rc=$OCF_NOT_RUNNING else [ "$MONLOG" != "NOLOG" ] && ocf_log err "The SAP instance does not run any services which this RA could monitor!" rc=$OCF_ERR_GENERIC fi fi fi return $rc } # # sapinstance_status: Lightweight check of SAP instance only with OS tools # sapinstance_status() { local pid local pids [ ! -f "/usr/sap/$SID/$InstanceName/work/kill.sap" ] && return $OCF_NOT_RUNNING pids=`grep '^kill -[0-9]' /usr/sap/$SID/$InstanceName/work/kill.sap | awk '{print $3}'` for pid in $pids do [ `pgrep -f -U $sidadm $InstanceName | grep -c $pid` -gt 0 ] && return $OCF_SUCCESS done return $OCF_NOT_RUNNING } # # sapinstance_validate: Check the semantics of the input parameters # sapinstance_validate() { local rc=$OCF_SUCCESS if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SID' is not a valid system ID!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceName" | grep -c '^[A-Z].*[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceName' is not a valid instance name!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceNr" | grep -c '^[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceNr' is not a valid instance number!" rc=$OCF_ERR_ARGS fi if [ `echo "$SAPVIRHOST" | grep -c '^[A-Za-z][A-Za-z0-9_-]*$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SAPVIRHOST' is not a valid hostname!" 
rc=$OCF_ERR_ARGS fi return $rc } # # sapinstance_start_clone # sapinstance_start_clone() { sapinstance_init $OCF_RESKEY_ERS_InstanceName ${HA_SBIN_DIR}/crm_master -v 50 -l reboot sapinstance_start return $? } # # sapinstance_stop_clone # sapinstance_stop_clone() { sapinstance_init $OCF_RESKEY_ERS_InstanceName ${HA_SBIN_DIR}/crm_master -v 0 -l reboot sapinstance_stop return $? } # # sapinstance_monitor_clone # sapinstance_monitor_clone() { # first check with the status function (OS tools) if there could be something like a SAP instance running # as we do not know here if we are in master or slave state, we do not want to start our monitoring # agents (sapstartsrv) on the wrong host local rc sapinstance_init $OCF_RESKEY_InstanceName if sapinstance_status; then if sapinstance_monitor; then ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot return $OCF_RUNNING_MASTER fi # by nature of the SAP enqueue server we have to make sure # that we do a failover to the slave (enqueue replication server) # in case the enqueue process has failed. We signal this to the # cluster by setting our master preference to a lower value than the slave. ${HA_SBIN_DIR}/crm_master -v 10 -l reboot return $OCF_FAILED_MASTER fi sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_status && sapinstance_monitor rc=$? if [ $rc -eq $OCF_SUCCESS ]; then ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot fi return $rc } # # sapinstance_promote_clone: In a Master/Slave configuration get Master by starting the SCS instance and stopping the ERS instance # The order is important here to behave correctly from the application level's view # sapinstance_promote_clone() { local rc sapinstance_init $OCF_RESKEY_InstanceName ocf_log info "Promoting $SID-$InstanceName to running Master." sapinstance_start rc=$? if [ $rc -eq $OCF_SUCCESS ]; then sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_stop rc=$? fi return $rc } # # sapinstance_demote_clone: In a Master/Slave configuration get Slave by stopping the SCS instance and starting the ERS instance # sapinstance_demote_clone() { local rc sapinstance_init $OCF_RESKEY_InstanceName ocf_log info "Demoting $SID-$InstanceName to a slave." sapinstance_stop rc=$? if [ $rc -eq $OCF_SUCCESS ]; then sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_start rc=$? fi return $rc } # # sapinstance_notify: Handle master scoring - to make sure a slave gets the next master # sapinstance_notify() { local n_type="$OCF_RESKEY_CRM_meta_notify_type" local n_op="$OCF_RESKEY_CRM_meta_notify_operation" if [ "${n_type}_${n_op}" = "post_promote" ]; then # After promotion of one master in the cluster, we make sure that all clones reset their master # value back to 100. This is because a failed monitor on a master might have degraded one clone # instance to a score of 10. ${HA_SBIN_DIR}/crm_master -v 100 -l reboot elif [ "${n_type}_${n_op}" = "pre_demote" ]; then # if we are a slave and a demote event is announced, make sure we are highest on the list to become master # that is, when a slave resource was started after the promote event of an already running master (e.g. node of slave was down) # We also have to make sure to overrule the globally set resource_stickiness or any fail-count factors => INFINITY local n_uname="$OCF_RESKEY_CRM_meta_notify_demote_uname" if [ ${n_uname} != ${NODENAME} ]; then ${HA_SBIN_DIR}/crm_master -v INFINITY -l reboot fi fi } # # 'main' starts here...
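# As a sketch only (resource names, SID and instance numbers below are hypothetical,
# not taken from this agent), the promotable pair handled by the *_clone functions
# above might be configured with the crm shell along these lines:
#
#   crm configure primitive rsc_SAPInstance_P01_ASCS00 ocf:heartbeat:SAPInstance \
#     params InstanceName="P01_ASCS00_sapp01as" ERS_InstanceName="P01_ERS10_sapp01ers" \
#     op monitor interval="120s" timeout="60s" \
#     op monitor interval="121s" timeout="60s" role="Master"
#   crm configure ms msl_SAPInstance_P01 rsc_SAPInstance_P01_ASCS00 \
#     meta clone_max="2" clone_node_max="1" master_node_max="1" master_max="1" notify="true"
#
# is_clone() enforces exactly these clone meta attribute values.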
# ## GLOBALS SID="" sidadm="" InstanceName="" InstanceNr="" SAPVIRHOST="" DIR_EXECUTABLE="" SAPSTARTSRV="" SAPCONTROL="" DIR_PROFILE="" SAPSTARTPROFILE="" CLONE=0 NODENAME=$(ocf_local_nodename) if ( [ $# -ne 1 ] ) then sapinstance_usage exit $OCF_ERR_ARGS fi ACTION=$1 if [ "$ACTION" = "status" ]; then ACTION=monitor fi # These operations don't require OCF instance parameters to be set case "$ACTION" in usage|methods) sapinstance_$ACTION exit $OCF_SUCCESS;; meta-data) sapinstance_meta_data exit $OCF_SUCCESS;; notify) sapinstance_notify exit $OCF_SUCCESS;; *);; esac if ! ocf_is_root then ocf_log err "$0 must be run as root" exit $OCF_ERR_PERM fi # parameter check if [ -z "$OCF_RESKEY_InstanceName" ] then ocf_log err "Please set OCF_RESKEY_InstanceName to the name to the SAP instance profile!" exit $OCF_ERR_ARGS fi is_clone; CLONE=$? if [ ${CLONE} -eq 1 ] then CLACT=_clone else if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ] then ocf_log err "$ACTION called in a non master/slave environment" exit $OCF_ERR_ARGS fi sapinstance_init $OCF_RESKEY_InstanceName fi # What kind of method was invoked? case "$ACTION" in start|stop|monitor|promote|demote) sapinstance_$ACTION$CLACT exit $?;; validate-all) sapinstance_validate exit $?;; reload ) ocf_log info "reloading SAPInstance parameters" exit $OCF_SUCCESS;; *) sapinstance_methods exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/conntrackd.in b/heartbeat/conntrackd.in index f115250d6..1c2ee955b 100644 --- a/heartbeat/conntrackd.in +++ b/heartbeat/conntrackd.in @@ -1,335 +1,335 @@ #!@BASH_SHELL@ # # # An OCF RA for conntrackd # http://conntrack-tools.netfilter.org/ # # Copyright (c) 2011 Dominik Klein # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### OCF_RESKEY_binary_default=conntrackd OCF_RESKEY_config_default=/etc/conntrackd/conntrackd.conf # For users of versions prior to 1.2: # Map renamed parameter "conntrackd" to "binary" if in use : ${OCF_RESKEY_binary=${OCF_RESKEY_conntrackd-${OCF_RESKEY_binary_default}}} : ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} meta_data() { cat < 1.0 -Master/Slave OCF Resource Agent for conntrackd +Promotable OCF Resource Agent for conntrackd This resource agent manages conntrackd Name of the conntrackd executable. 
If conntrackd is installed and available in the default PATH, it is sufficient to configure the name of the binary. For example "my-conntrackd-binary-version-0.9.14" If conntrackd is installed somewhere else, you may also give a full path. For example "/packages/conntrackd-0.9.14/sbin/conntrackd" Name of the conntrackd executable Full path to the conntrackd.conf file. For example "/packages/conntrackd-0.9.14/etc/conntrackd/conntrackd.conf" Path to conntrackd.conf - - + + END } meta_expect_eq() { local what=$1 whatvar=OCF_RESKEY_CRM_meta_${1//-/_} expect=$2 local val=${!whatvar} if [[ -n $val ]]; then # [, not [[, or it won't work ;) [ $val = $expect ] && return fi ocf_exit_reason "meta parameter misconfigured, expected $what = $expect, but found ${val:-unset}." exit $OCF_ERR_CONFIGURED } conntrackd_is_master() { # You can't query conntrackd whether it is master or slave. It can be both at the same time. # This RA creates a statefile during promote and enforces master-max=1 and clone-node-max=1 ha_pseudo_resource $statefile monitor } conntrackd_set_master_score() { ${HA_SBIN_DIR}/crm_master -Q -l reboot -v $1 } conntrackd_monitor() { rc=$OCF_NOT_RUNNING # It does not write a PID file, so check the socket exists after # extracting its path from the configuration file local conntrack_socket=$(awk '/^[ \t]*UNIX[ \t]*{/,/^[ \t]*}/ { if ($1 == "Path") { print $2 } }' $OCF_RESKEY_config) [ -S "$conntrack_socket" ] && rc=$OCF_SUCCESS if [ "$rc" -eq "$OCF_SUCCESS" ]; then # conntrackd is running # now see if it accepts queries if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -s > /dev/null 2>&1; then rc=$OCF_ERR_GENERIC ocf_exit_reason "conntrackd is running but not responding to queries" fi if conntrackd_is_master; then rc=$OCF_RUNNING_MASTER # Restore master setting on probes if [ $OCF_RESKEY_CRM_meta_interval -eq 0 ]; then conntrackd_set_master_score $master_score fi else # Restore master setting on probes if [ $OCF_RESKEY_CRM_meta_interval -eq 0 ]; then conntrackd_set_master_score $slave_score fi fi fi return $rc } conntrackd_start() { rc=$OCF_ERR_GENERIC # Keep trying to start the resource; # wait for the CRM to time us out if this fails while :; do conntrackd_monitor status=$? case "$status" in $OCF_SUCCESS) conntrackd_set_master_score $slave_score # -n = request resync from the others if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -n; then ocf_exit_reason "$OCF_RESKEY_binary -C $OCF_RESKEY_config -n failed during start." rc=$OCF_ERR_GENERIC else rc=$OCF_SUCCESS fi break ;; $OCF_NOT_RUNNING) ocf_log info "Starting conntrackd" $OCF_RESKEY_binary -C $OCF_RESKEY_config -d ;; $OCF_RUNNING_MASTER) ocf_log warn "conntrackd already in master mode, demoting." ha_pseudo_resource $statefile stop ;; $OCF_ERR_GENERIC) ocf_exit_reason "conntrackd start failed" rc=$OCF_ERR_GENERIC break ;; esac done return $rc } conntrackd_stop() { rc=$OCF_ERR_GENERIC # Keep trying to bring down the resource; # wait for the CRM to time us out if this fails while :; do conntrackd_monitor status=$? case "$status" in $OCF_SUCCESS|$OCF_ERR_GENERIC) ocf_log info "Stopping conntrackd" $OCF_RESKEY_binary -C $OCF_RESKEY_config -k ;; $OCF_NOT_RUNNING) rc=$OCF_SUCCESS break ;; $OCF_RUNNING_MASTER) ocf_log warn "conntrackd still master" ;; esac done return $rc } conntrackd_validate_all() { check_binary "$OCF_RESKEY_binary" if !
[ -e "$OCF_RESKEY_config" ]; then ocf_exit_reason "Config FILE $OCF_RESKEY_config does not exist" return $OCF_ERR_INSTALLED fi meta_expect_eq master-node-max 1 meta_expect_eq master-max 1 meta_expect_eq clone-node-max 1 return $OCF_SUCCESS } conntrackd_promote() { rc=$OCF_SUCCESS if ! conntrackd_is_master; then # -c = Commit the external cache to the kernel # -f = Flush internal and external cache # -R = resync with the kernel table # -B = send a bulk update on the line for parm in c f R B; do if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm; then ocf_exit_reason "$OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm failed during promote." rc=$OCF_ERR_GENERIC break fi done ha_pseudo_resource $statefile start conntrackd_set_master_score $master_score fi return $rc } conntrackd_demote() { rc=$OCF_SUCCESS if conntrackd_is_master; then # -t = shorten kernel timers to remove zombies # -n = request a resync from the others for parm in t n; do if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm; then ocf_exit_reason "$OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm failed during demote." rc=$OCF_ERR_GENERIC break fi done ha_pseudo_resource $statefile stop conntrackd_set_master_score $slave_score fi return $rc } conntrackd_notify() { hostname=$(hostname) # OCF_RESKEY_CRM_meta_notify_master_uname is a whitespace separated list of master hostnames for master in $OCF_RESKEY_CRM_meta_notify_master_uname; do # if we are the master and an instance was just started on another node: # send a bulk update to allow failback if [ "$hostname" = "$master" -a "$OCF_RESKEY_CRM_meta_notify_type" = "post" -a "$OCF_RESKEY_CRM_meta_notify_operation" = "start" -a "$OCF_RESKEY_CRM_meta_notify_start_uname" != "$hostname" ]; then ocf_log info "Sending bulk update in post start to peers to allow failback" $OCF_RESKEY_binary -C $OCF_RESKEY_config -B fi done for tobepromoted in $OCF_RESKEY_CRM_meta_notify_promote_uname; do # if there is a promote action to be executed on another node: # send a bulk update to allow failback if [ "$hostname" != "$tobepromoted" -a "$OCF_RESKEY_CRM_meta_notify_type" = "pre" -a "$OCF_RESKEY_CRM_meta_notify_operation" = "promote" ]; then ocf_log info "Sending bulk update in pre promote to peers to allow failback" $OCF_RESKEY_binary -C $OCF_RESKEY_config -B fi done } conntrackd_usage() { cat < # # This agent incoporates code of a previous release created by # Alan Robertson and the community. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
# ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Parameter defaults OCF_RESKEY_instance_default="" OCF_RESKEY_admin_default="" OCF_RESKEY_dbpartitionnum_default="0" : ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}} : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}} : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}} ####################################################################### db2_usage() { echo "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data" } db2_meta_data() { cat < 1.0 -Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles in master/slave configuration. Multiple partitions are supported. +Resource Agent that manages IBM DB2 LUW databases in Standard role as a primitive or in HADR roles in a promotable configuration. Multiple partitions are supported. Standard mode: An instance including all or selected databases is made highly available. Configure each partition as a separate primitive resource. HADR mode: A single database in HADR configuration is made highly available by automating takeover operations. -Configure a master / slave resource with notifications enabled and an -additional monitoring operation with role "Master". +Configure a promotable resource with notifications enabled and an +additional monitoring operation with role "Promoted". In case of HADR be very deliberate in specifying intervals/timeouts. The detection of a failure including promote must complete within HADR_PEER_WINDOW. In addition to honoring requirements for crash recovery etc. for your specific database use the following relations as guidance: "monitor interval" < HADR_PEER_WINDOW - (appr 30 sec) "promote timeout" < HADR_PEER_WINDOW + (appr 20 sec) For further information and examples consult http://www.linux-ha.org/wiki/db2_(resource_agent) -Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles as master/slave configuration. Multiple partitions are supported. +Resource Agent that manages IBM DB2 LUW databases in Standard role as a primitive or in HADR roles in a promotable configuration. Multiple partitions are supported. The instance of the database(s). instance List of databases to be managed, e.g. "db1 db2". Defaults to all databases in the instance. Specify one db for HADR mode. List of databases to be managed DEPRECATED: The admin user of the instance. DEPRECATED: admin The number of the partition (DBPARTITIONNUM) to be managed. database partition number (DBPARTITIONNUM) - + END } # # validate # .. and set global variables # # exit on error # db2_validate() { local db2home db2sql db2instance # db2 uses the Korn shell check_binary "ksh" # check required instance vars if [ -z "$OCF_RESKEY_instance" ] then ocf_log err "DB2 required parameter instance is not set!" return $OCF_ERR_CONFIGURED fi instance=$OCF_RESKEY_instance if [ -n "$OCF_RESKEY_admin" ] then ocf_log warn "DB2 deprecated parameter admin is set, using $OCF_RESKEY_admin as instance." instance=$OCF_RESKEY_admin fi db2node=${OCF_RESKEY_dbpartitionnum:-0} db2home=$(sh -c "echo ~$instance") db2sql=$db2home/sqllib db2profile=$db2sql/db2profile db2bin=$db2sql/bin STATE_FILE=${HA_RSCTMP}/db2-${OCF_RESOURCE_INSTANCE}.state # Let's make sure a few important things are there... if !
[ -d "$db2sql" -a -d "$db2bin" -a -f "$db2profile" -a \ -x "$db2profile" -a -x "$db2bin/db2" ] then ocf_is_probe && exit $OCF_NOT_RUNNING ocf_log err "DB2 required directories and/or files not found" exit $OCF_ERR_INSTALLED fi db2instance=$(runasdb2 'echo $DB2INSTANCE') if [ "$db2instance" != "$instance" ] then ocf_is_probe && exit $OCF_NOT_RUNNING ocf_log err "DB2 parameter instance \"$instance\" != DB2INSTANCE \"$db2instance\"" exit $OCF_ERR_CONFIGURED fi # enough checking for stop to succeed [ $__OCF_ACTION = stop ] && return $OCF_SUCCESS dblist=$OCF_RESKEY_dblist if [ -n "$dblist" ] then # support , as separator as well dblist=$(echo "$dblist" | sed -e 's/[,]/ /g') else if ! dblist=$(db2_dblist) then ocf_log err "DB2 $instance($db2node): cannot retrieve db directory" exit $OCF_ERR_INSTALLED fi fi # check requirements for the HADR case if ocf_is_ms then set -- $dblist if [ $# != 1 ] then ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have exactly one name in dblist" exit $OCF_ERR_CONFIGURED fi if [ $db2node != 0 ] then ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have dbpartitionnum=0" exit $OCF_ERR_CONFIGURED fi fi return $OCF_SUCCESS } master_score() { if ! have_binary "crm_master"; then return fi crm_master $* } # # Run the given command as db2 instance user # runasdb2() { su $instance -c ". $db2profile; $*" } # # Run a command as the DB2 admin, and log the output # logasdb2() { local output rc output=$(runasdb2 $*) rc=$? if [ $rc -eq 0 ] then ocf_log info "$output" else ocf_log err "$output" fi return $rc } # # maintain the fal (first active log) attribute # db2_fal_attrib DB {set val|get} # db2_fal_attrib() { local db=$1 local attr val rc id node member me attr=db2hadr_${instance}_${db}_fal case "$2" in set) me=$(ocf_local_nodename) # loop over all member nodes and set attribute crm_node -l | while read id node member do [ "$member" = member -a "$node" != "$me" ] || continue crm_attribute -l forever --node=$node -n $attr -v "$3" rc=$? ocf_log info "DB2 instance $instance($db2node/$db: setting attrib for FAL to $FIRST_ACTIVE_LOG @ $node" [ $rc != 0 ] && break done ;; get) crm_attribute -l forever -n $attr -G --quiet 2>&1 rc=$? if ! ocf_is_true "$OCF_RESKEY_CRM_meta_notify" && [ $rc != 0 ] then ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve attribute $attr, are you sure notifications are enabled ?" fi ;; *) exit $OCF_ERR_CONFIGURED esac return $rc } # # unfortunately a first connect after a crash may need several minutes # for some internal cleanup stuff in DB2. # We run a connect in background so other connects (i.e. monitoring!) may proceed. # db2_run_connect() { local db=$1 logasdb2 "db2 connect to $db; db2 terminate" } # # get some data from the database config # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW # db2_get_cfg() { local db=$1 local output hadr_vars output=$(runasdb2 db2 get db cfg for $db) [ $? 
!= 0 ] && return $OCF_ERR_GENERIC hadr_vars=$(echo "$output" | awk '/HADR database role/ {printf "HADR_ROLE='%s'; ", $NF;} /HADR_TIMEOUT/ {printf "HADR_TIMEOUT='%s'; ", $NF;} /First active log file/ {printf "FIRST_ACTIVE_LOG='%s'\n", $NF;} /HADR_PEER_WINDOW/ {printf "HADR_PEER_WINDOW='%s'\n", $NF;}') # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW eval $hadr_vars # HADR_PEER_WINDOW comes with V9 and is checked later if [ -z "$HADR_ROLE" -o -z "$HADR_TIMEOUT" ] then ocf_log error "DB2 cfg values invalid for $instance($db2node)/$db: $hadr_vars" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # # return the list of databases in the instance # db2_dblist() { local output output=$(runasdb2 db2 list database directory) || return $OCF_ERR_GENERIC echo "$output" | grep -i 'Database name.*=' | sed 's%.*= *%%' } # # Delayed check of the compatibility of DB2 instance and pacemaker # config. # Logically this belongs to validate but certain parameters can only # be retrieved once the instance is started. # db2_check_config_compatibility() { local db=$1 local is_ms ocf_is_ms is_ms=$? case "$HADR_ROLE/$is_ms" in STANDARD/0) ocf_log err "DB2 database $instance/$db is not in a HADR configuration but I am a M/S resource" exit $OCF_ERR_INSTALLED ;; STANDARD/1) # OK ;; */0) if [ -z "$HADR_PEER_WINDOW" ] then ocf_log err "DB2 database $instance: release too old, need HADR_PEER_WINDOW (>=V9)" exit $OCF_ERR_INSTALLED fi ;; */1) ocf_log err "DB2 database $instance/$db is in a HADR configuration but I must be a M/S resource" esac } # # Start instance and DB. # Standard mode is through "db2 activate" in order to start in previous # mode (Standby/Primary). # If the database is a primary AND we can determine that the running master # has a higher "first active log" we conclude that we come up after a crash # and the previous Standby is now Primary. # The db is then started as Standby. # # Other cases: danger of split brain, log error and do nothing. # db2_start() { local output start_cmd db local start_opts="dbpartitionnum $db2node" # If we detect that db partitions are not in use, and no # partition is explicitly specified, activate without # partition information. This allows db2 instances without # partition support to be managed. if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then start_opts="" fi if output=$(runasdb2 db2start $start_opts) then ocf_log info "DB2 instance $instance($db2node) started: $output" else case $output in *SQL1026N*) ocf_log info "DB2 instance $instance($db2node) already running: $output" ;; *) ocf_log err "$output" return $OCF_ERR_GENERIC esac fi if ! db2_instance_status then ocf_log err "DB2 instance $instance($db2node) is not active!" return $OCF_ERR_GENERIC fi [ $db2node = 0 ] || return $OCF_SUCCESS # activate DB only on node 0 for db in $dblist do # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG db2_get_cfg $db || return $? # Better late than never: can only check this when the instance is already up db2_check_config_compatibility $db start_cmd="db2 activate db $db" if [ $HADR_ROLE = PRIMARY ] then local master_fal # communicate our FAL to other nodes that might start concurrently db2_fal_attrib $db set $FIRST_ACTIVE_LOG # ignore false positive: # error: Can't use > in [ ]. Escape it or use [[..]].
[SC2073] # see https://github.com/koalaman/shellcheck/issues/691 # shellcheck disable=SC2073 if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' $FIRST_ACTIVE_LOG ] then ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary" start_cmd="db2 start hadr on db $db as standby" HADR_ROLE=STANDBY fi fi if output=$(runasdb2 $start_cmd) then ocf_log info "DB2 database $instance($db2node)/$db started/activated" [ $HADR_ROLE != STANDBY ] && db2_run_connect $db & else case $output in SQL1490W*|SQL1494W*|SQL1497W*|SQL1777N*) ocf_log info "DB2 database $instance($db2node)/$db already activated: $output" ;; SQL1768N*"Reason code = \"7\""*) ocf_log err "DB2 database $instance($db2node)/$db is a Primary and the Standby is down" ocf_log err "Possible split brain! Manual intervention required." ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\"" ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\"" # might be the Standby is not yet there # might be a timing problem because "First active log" is delayed # on the next start attempt we might succeed when FAL was advanced # might be manual intervention is required # ... so let pacemaker give it another try and we will succeed then return $OCF_ERR_GENERIC ;; *) ocf_log err "DB2 database $instance($db2node)/$db didn't start: $output" return $OCF_ERR_GENERIC esac fi done # come here with success # Even if we are a db2 Primary, pacemaker requires start to end up in slave mode echo SLAVE > $STATE_FILE return $OCF_SUCCESS } # # helper function to be spawned # so we can detect a hang of the db2stop command # db2_stop_bg() { local rc output local stop_opts="dbpartitionnum $db2node" rc=$OCF_SUCCESS if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then stop_opts="" fi if output=$(runasdb2 db2stop force $stop_opts) then ocf_log info "DB2 instance $instance($db2node) stopped: $output" else case $output in *SQL1032N*) #SQL1032N No start database manager command was issued ocf_log info "$output" ;; *) ocf_log err "DB2 instance $instance($db2node) stop failed: $output" rc=$OCF_ERR_GENERIC esac fi return $rc } # # Stop the given db2 database instance # db2_stop() { local stop_timeout grace_timeout stop_bg_pid i must_kill # remove master score master_score -D -l reboot # be very early here in order to avoid stale data rm -f $STATE_FILE db2_instance_status if [ $? -eq $OCF_NOT_RUNNING ]; then ocf_log info "DB2 instance $instance already stopped" return $OCF_SUCCESS fi stop_timeout=${OCF_RESKEY_CRM_meta_timeout:-20000} # grace_timeout is 4/5 of stop_timeout, converted from ms to seconds (hence /1250) grace_timeout=$((stop_timeout/1250)) # start db2stop in background as this may hang db2_stop_bg & stop_bg_pid=$! # wait for grace_timeout i=0 while [ $i -lt $grace_timeout ] do kill -0 $stop_bg_pid 2>/dev/null || break; sleep 1 i=$((i+1)) done # collect exit status but don't hang if kill -0 $stop_bg_pid 2>/dev/null then stoprc=1 kill -9 $stop_bg_pid 2>/dev/null else wait $stop_bg_pid stoprc=$? fi must_kill=0 if [ $stoprc -ne 0 ] then ocf_log warn "DB2 instance $instance($db2node): db2stop failed, using db2nkill" must_kill=1 elif !
db2_instance_dead then ocf_log warn "DB2 instance $instance($db2node): db2stop indicated success but there are still processes, using db2nkill" must_kill=1 fi if [ $must_kill -eq 1 ] then # db2nkill kills *all* partitions on the node if [ -x $db2bin/db2nkill ] then logasdb2 $db2bin/db2nkill $db2node elif [ -x $db2bin/db2_kill ] then logasdb2 $db2bin/db2_kill fi # loop forever (or lrmd kills us due to timeout) until the # instance is dead while ! db2_instance_dead do ocf_log info "DB2 instance $instance($db2node): waiting for processes to exit" sleep 1 done ocf_log info "DB2 instance $instance($db2node) is now dead" fi return $OCF_SUCCESS } # # check whether 'enough' processes for a healthy instance are up # db2_instance_status() { local pscount pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l) if [ $pscount -ge 4 ]; then return $OCF_SUCCESS; elif [ $pscount -ge 1 ]; then return $OCF_ERR_GENERIC fi return $OCF_NOT_RUNNING } # # is the given db2 instance dead? # db2_instance_dead() { local pscount pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l) test $pscount -eq 0 } # # return the status of the db as "Role/Status" # e.g. Primary/Peer, Standby/RemoteCatchupPending # # If not in HADR configuration return "Standard/Standalone" # db2_hadr_status() { local db=$1 local output output=$(runasdb2 db2pd -hadr -db $db) if [ $? != 0 ] then echo "Down/Off" return 1 fi echo "$output" | awk '/^\s+HADR_(ROLE|STATE) =/ {printf $3"/"} /^\s+HADR_CONNECT_STATUS =/ {print $3; exit; } /^HADR is not active/ {print "Standard/Standalone"; exit; } /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }' } # # Monitor the db # and as a side effect set the crm_master / FAL attributes # db2_monitor() { local CMD output hadr db local rc db2_instance_status rc=$? if [ $rc -ne $OCF_SUCCESS ]; then # instance is dead, remove master score master_score -D -l reboot exit $rc fi [ $db2node = 0 ] || return 0 # monitoring only for partition 0 for db in $dblist do hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr" # set master preference accordingly case "$hadr" in PRIMARY/*|Primary/*|Standard/*) # perform a basic health check CMD="if db2 connect to $db; then db2 select \* from sysibm.sysversions ; rc=\$?; db2 terminate; else rc=\$?; fi; exit \$rc"
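# The command assembled above is a minimal end-to-end health check run as the instance owner: connect to the database, issue a trivial catalog query, terminate the connection, and propagate the first non-zero return code.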
if ! output=$(runasdb2 $CMD) then case "$output" in SQL1776N*) # can't connect/select on standby, may be spurious during takeover ;; *) ocf_log err "DB2 database $instance($db2node)/$db is not working" ocf_log err "DB2 message: $output" # dead primary, remove master score master_score -D -l reboot return $OCF_ERR_GENERIC esac fi ocf_log debug "DB2 database $instance($db2node)/$db appears to be working" ocf_is_ms && master_score -v 10000 -l reboot ;; STANDBY/*PEER/*|Standby/*Peer) master_score -v 8000 -l reboot ;; STANDBY/*|Standby/*) ocf_log warn "DB2 database $instance($db2node)/$db in status $hadr can never be promoted" master_score -D -l reboot ;; *) return $OCF_ERR_GENERIC esac done # everything OK: report success if running as slave, OCF_RUNNING_MASTER otherwise grep MASTER $STATE_FILE >/dev/null 2>&1 || return $OCF_SUCCESS return $OCF_RUNNING_MASTER } # # Promote db to Primary # db2_promote() { # validate ensured that dblist contains only one entry local db=$dblist local i hadr output force # we run this twice as after a crash of the other node # within HADR_TIMEOUT the status may still be reported as Peer # although a connection no longer exists for i in 1 2 do hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be promoted" case "$hadr" in Standard/Standalone) # this case only to keep ocf-tester happy return $OCF_SUCCESS ;; PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer) # nothing to do, only update pacemaker's view echo MASTER > $STATE_FILE return $OCF_SUCCESS ;; STANDBY/PEER/CONNECTED|Standby/Peer) # must take over ;; STANDBY/*PEER/DISCONNECTED|Standby/DisconnectedPeer) # must take over by force peer window only force="by force peer window only" ;; # must take over by force STANDBY/REMOTE_CATCHUP_PENDING/DISCONNECTED) force="by force" ;; *) return $OCF_ERR_GENERIC esac if output=$(runasdb2 db2 takeover hadr on db $db $force) then # update pacemaker's view echo MASTER > $STATE_FILE # force a log rotation so we rapidly get a new FAL logasdb2 "db2 archive log for db $db" return $OCF_SUCCESS fi case "$output" in SQL1770N*"Reason code = \"7\""*) # expected, HADR_TIMEOUT is now expired # go for the second try continue ;; *) ocf_log err "DB2 database $instance($db2node)/$db promote failed: $output" return $OCF_ERR_GENERIC esac done return $OCF_ERR_GENERIC } # # Demote db to standby # db2_demote() { # validate ensured that dblist contains only one entry local db=$dblist local hadr # housekeeping: set pacemaker's view to slave echo SLAVE > $STATE_FILE hadr=$(db2_hadr_status $dblist) || return $OCF_ERR_GENERIC ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be demoted" db2_monitor return $? } # # handle pre-start notification # We record our first active log on the other nodes. # If two primaries come up after a crash they can safely determine who is # the outdated one. # db2_notify() { local node # only interested in pre-start [ $OCF_RESKEY_CRM_meta_notify_type = pre \ -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCCESS # gets FIRST_ACTIVE_LOG db2_get_cfg $dblist || return $? db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC exit $OCF_SUCCESS } ######## # Main # ######## case "$__OCF_ACTION" in meta-data) db2_meta_data exit $OCF_SUCCESS ;; usage) db2_usage exit $OCF_SUCCESS ;; start) db2_validate db2_start || exit $? db2_monitor exit $? ;; stop) db2_validate db2_stop exit $? ;; promote) db2_validate db2_promote exit $?
;; demote) db2_validate db2_demote exit $? ;; notify) db2_validate db2_notify exit $? ;; monitor) db2_validate db2_monitor exit $? ;; validate-all) db2_validate exit $? ;; *) db2_usage exit $OCF_ERR_UNIMPLEMENTED esac diff --git a/heartbeat/dnsupdate.in b/heartbeat/dnsupdate.in index 35b7c99bb..b54822cd8 100755 --- a/heartbeat/dnsupdate.in +++ b/heartbeat/dnsupdate.in @@ -1,381 +1,381 @@ #!@BASH_SHELL@ # # # Support: users@clusterlabs.org # License: GNU General Public License v2 # # Copyright (c) 2014 SUSE Linux Products GmbH, Lars Marowsky-Brée # All Rights Reserved. # ####################################################################### : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Parameter defaults OCF_RESKEY_hostname_default="" OCF_RESKEY_type_default="A" OCF_RESKEY_ip_default="" OCF_RESKEY_cname_default="" OCF_RESKEY_ttl_default="300" OCF_RESKEY_keyfile_default="" OCF_RESKEY_server_default="" OCF_RESKEY_serverport_default="53" OCF_RESKEY_nsupdate_opts_default="" OCF_RESKEY_unregister_on_stop_default="false" : ${OCF_RESKEY_hostname=${OCF_RESKEY_hostname_default}} : ${OCF_RESKEY_cname=${OCF_RESKEY_cname_default}} : ${OCF_RESKEY_type=${OCF_RESKEY_type_default}} : ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}} : ${OCF_RESKEY_ttl=${OCF_RESKEY_ttl_default}} : ${OCF_RESKEY_keyfile=${OCF_RESKEY_keyfile_default}} : ${OCF_RESKEY_server=${OCF_RESKEY_server_default}} : ${OCF_RESKEY_serverport=${OCF_RESKEY_serverport_default}} : ${OCF_RESKEY_nsupdate_opts=${OCF_RESKEY_nsupdate_opts_default}} : ${OCF_RESKEY_unregister_on_stop=${OCF_RESKEY_unregister_on_stop_default}} ####################################################################### # TODO: # - Should multiple A records be supported? usage() { cat <<-! usage: $0 {start|stop|status|monitor|meta-data|validate-all} ! } meta_data() { cat < 1.0 This resource agent manages IP take-over via dynamic DNS updates. IP take-over via dynamic DNS update Either the hostname whose IP address will need to be updated (in case of type=A) or the alias whose hostname will need to be updated (in case of type=CNAME). Hostname to update The type of DNS record that needs to be updated (A or CNAME). Type of DNS record IP address to set. IP address to set The CNAME whose target hostname will need to be updated. CNAME to update Time to live, in seconds, for the DNS record. This affects how soon DNS updates propagate. It should be a reasonable compromise between update speed and DNS server load. If using booth, the ticket timeout is a good start. TTL for the DNS record The file containing the shared secret needed to update the DNS record. Please see the nsupdate man page for the exact syntax. nsupdate key file Which DNS server to send these updates to. When no -server is provided, this defaults to the master server +server is provided, this defaults to the primary server for the correct zone. DNS server to contact Port number on the DNS server. Note: due to a limitation in the nsupdate command, this option will only take effect if you also specify the DNS server! Port number on the DNS server Additional options to be passed to nsupdate. Additional nsupdate options Whether or not to actively remove records on stop. This is not needed for normal operation, since the site taking over the IP address will delete all previous records. Remove A record on stop END } dnsupdate_status() { case $type in A) # The resource is considered active if the current IP # address is returned as the only response.
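# A minimal sketch of that check (name and address below are hypothetical examples): for type=A it boils down to [ "$(dig +short www.example.com. A)" = "192.0.2.1" ], i.e. the name must resolve to exactly the configured IP and nothing else.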
local record=$(dig ${dig_opts} ${hostname}. A +short 2>/dev/null) if [ "$record" = "$ip" ]; then return $OCF_SUCCESS fi return $OCF_NOT_RUNNING ;; CNAME) local record=$(dig ${dig_opts} ${cname}. CNAME +short 2>/dev/null) if [ "$record" = "${hostname}." ]; then return $OCF_SUCCESS fi return $OCF_NOT_RUNNING ;; esac } dnsupdate_monitor() { if ocf_is_probe ; then # return $OCF_NOT_RUNNING fi dnsupdate_status } dnsupdate_start() { case $type in A) if dnsupdate_status ; then ocf_log info "$hostname already resolves to $ip" return $OCF_SUCCESS fi ocf_log info "Updating DNS records for $hostname" ( if [ -n "$dns_server" ]; then echo "server ${dns_server} ${dns_serverport}" fi echo "update delete $hostname A" echo "update add $hostname ${OCF_RESKEY_ttl} A $ip" echo "send" ) | nsupdate ${nsupdate_opts} ;; CNAME) if dnsupdate_status ; then ocf_log info "$cname already is an alias to $hostname" return $OCF_SUCCESS fi ocf_log info "Updating DNS records for $cname" ( if [ -n "$dns_server" ]; then echo "server ${dns_server} ${dns_serverport}" fi echo "update delete $cname CNAME" echo "update add $cname ${OCF_RESKEY_ttl} CNAME $hostname" echo "send" ) | nsupdate ${nsupdate_opts} ;; esac dnsupdate_monitor return $? } dnsupdate_stop() { case $type in A) if ocf_is_true "${OCF_RESKEY_unregister_on_stop}" && dnsupdate_status ; then ocf_log info "Unregistering $hostname with $ip from DNS server" ( if [ -n "$dns_server" ]; then echo "server ${dns_server} ${dns_serverport}" fi echo "update delete $hostname A $ip" echo "send" ) | nsupdate ${nsupdate_opts} dnsupdate_monitor if [ $? -ne $OCF_NOT_RUNNING ]; then ocf_log warn "Unregistering failed!" fi fi return $OCF_SUCCESS ;; CNAME) if ocf_is_true "${OCF_RESKEY_unregister_on_stop}" && dnsupdate_status ; then ocf_log info "Unregistering $cname with $hostname from DNS server" ( if [ -n "$dns_server" ]; then echo "server ${dns_server} ${dns_serverport}" fi echo "update delete $cname CNAME" echo "send" ) | nsupdate ${nsupdate_opts} dnsupdate_monitor if [ $? -ne $OCF_NOT_RUNNING ]; then ocf_log warn "Unregistering failed!" fi fi return $OCF_SUCCESS ;; esac } dnsupdate_validate() { hostname=${OCF_RESKEY_hostname} ip=${OCF_RESKEY_ip} #added support for CNAME type=${OCF_RESKEY_type} cname=${OCF_RESKEY_cname} # dig_opts="" dns_server=${OCF_RESKEY_server} : ${OCF_RESKEY_serverport:="53"} dns_serverport=${OCF_RESKEY_serverport} : ${OCF_RESKEY_ttl:="300"} nsupdate_opts=${OCF_RESKEY_nsupdate_opts} if [ -z "$nsupdate_opts" -a -n "$OCF_RESKEY_opts" ]; then nsupdate_opts=${OCF_RESKEY_opts} ocf_log warn "opts was never an advertised parameter, please use nsupdate_opts" fi if [ -z "$hostname" ]; then ocf_log err "No hostname specified." exit $OCF_ERR_CONFIGURED fi if [ -z "$ip" ] && [ "$type" = "A" ]; then ocf_log err "No IP specified." exit $OCF_ERR_CONFIGURED fi #added support for CNAME if [ -z "$type" ]; then ocf_log err "No TYPE specified." exit $OCF_ERR_CONFIGURED fi # if ! ocf_is_decimal $OCF_RESKEY_ttl ; then ocf_log err "ttl $OCF_RESKEY_ttl is not valid" exit $OCF_ERR_CONFIGURED fi if ! ocf_is_decimal $dns_serverport ; then ocf_log err "serverport $dns_serverport is not valid" exit $OCF_ERR_CONFIGURED fi dig_opts+=" -p ${dns_serverport}" if [ -n "$dns_server" ]; then dig_opts+=" @${dns_server}" fi if [ -n "$OCF_RESKEY_keyfile" ]; then if [ ! 
-f ${OCF_RESKEY_keyfile} ]; then ocf_log err "keyfile $OCF_RESKEY_keyfile does not exist" exit $OCF_ERR_CONFIGURED fi nsupdate_opts+=" -k $OCF_RESKEY_keyfile" fi } if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; esac check_binary dig check_binary nsupdate dnsupdate_validate case $1 in start) dnsupdate_start ;; stop) dnsupdate_stop ;; monitor) dnsupdate_monitor ;; status) dnsupdate_status ;; validate-all) # We've already run this exit $OCF_SUCCESS ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/galera.in b/heartbeat/galera.in index c363eb254..546b1a853 100755 --- a/heartbeat/galera.in +++ b/heartbeat/galera.in @@ -1,1094 +1,1094 @@ #!@BASH_SHELL@ # # Copyright (c) 2014 David Vossel # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ## # README. # -# This agent only supports being configured as a multistate Master +# This agent only supports being configured as a promotable multistate # resource. # -# Slave vs Master role: +# Unpromoted vs Promoted role: # -# During the 'Slave' role, galera instances are in read-only mode and +# During the 'Unpromoted' role, galera instances are in read-only mode and # will not attempt to connect to the cluster. This role exists only as # a means to determine which galera instance is the most up-to-date. The # most up-to-date node will be used to bootstrap a galera cluster that # has no current members. # -# The galera instances will only begin to be promoted to the Master role +# The galera instances will only begin to be promoted to the Promoted role # once all the nodes in the 'wsrep_cluster_address' connection address # have entered read-only mode. At that point the node containing the -# database that is most current will be promoted to Master. Once the first -# Master instance bootstraps the galera cluster, the other nodes will be -# promoted to Master as well. +# database that is most current will be promoted. Once the first +# promoted instance bootstraps the galera cluster, the other nodes will be +# promoted as well. # # Example: Create a galera cluster using nodes rhel7-node1 rhel7-node2 rhel7-node3 # # pcs resource create db galera enable_creation=true \ -# wsrep_cluster_address="gcomm://rhel7-auto1,rhel7-auto2,rhel7-auto3" meta master-max=3 --master +# wsrep_cluster_address="gcomm://rhel7-auto1,rhel7-auto2,rhel7-auto3" meta promoted-max=3 --promoted # # By setting the 'enable_creation' option, the database will be automatically -# generated at startup.
The meta attribute 'master-max=3' means that all 3 +# generated at startup. The meta attribute 'promoted-max=3' means that all 3 # nodes listed in the wsrep_cluster_address list will be allowed to connect # to the galera cluster and perform replication. # # NOTE: If you have more nodes in the pacemaker cluster than you wish # to have in the galera cluster, make sure to use location constraints to prevent # pacemaker from attempting to place a galera instance on a node that is # not in the 'wsrep_cluster_address' list. # ## ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs if [ "$__OCF_ACTION" != "meta-data" ]; then . ${OCF_FUNCTIONS_DIR}/mysql-common.sh NODENAME=$(ocf_attribute_target) fi # It is common for galera deployments to store the check user # that can be used to query status # in this file if [ -f "/etc/sysconfig/clustercheck" ]; then . /etc/sysconfig/clustercheck elif [ -f "/etc/default/clustercheck" ]; then . /etc/default/clustercheck fi # Parameter defaults OCF_RESKEY_wsrep_cluster_address_default="" OCF_RESKEY_cluster_host_map_default="" OCF_RESKEY_check_user_default="" OCF_RESKEY_check_passwd_default="" OCF_RESKEY_two_node_mode_default="false" : ${OCF_RESKEY_wsrep_cluster_address=${OCF_RESKEY_wsrep_cluster_address_default}} : ${OCF_RESKEY_cluster_host_map=${OCF_RESKEY_cluster_host_map_default}} : ${OCF_RESKEY_check_user=${OCF_RESKEY_check_user_default}} : ${OCF_RESKEY_check_passwd=${OCF_RESKEY_check_passwd_default}} : ${OCF_RESKEY_two_node_mode=${OCF_RESKEY_two_node_mode_default}} ####################################################################### # Defaults: OCF_RESKEY_check_passwd_use_empty_default=0 : ${OCF_RESKEY_check_passwd_use_empty=${OCF_RESKEY_check_passwd_use_empty_default}} ####################################################################### usage() { cat < 1.0 Resource script for managing a galera database. Manages a galera instance Location of the MySQL server binary MySQL server binary Location of the MySQL client binary MySQL client binary Configuration file MySQL config Directory containing databases MySQL datadir User running MySQL daemon MySQL user Group running MySQL daemon (for logfile and directory permissions) MySQL group The logfile to be used for mysqld. MySQL log file The pidfile to be used for mysqld. MySQL pid file The socket to be used for mysqld. MySQL socket If the MySQL database does not exist, it will be created Create the database if it does not exist Additional parameters which are passed to the mysqld on startup. (e.g. --skip-external-locking or --skip-grant-tables) Additional parameters to pass to mysqld The galera cluster address. This takes the form of: gcomm://node,node,node Only nodes present in this node list will be allowed to start a galera instance. The galera node names listed in this address are expected to match valid pacemaker node names. If the two names need to differ, you must provide a mapping in option cluster_host_map. Galera cluster address A mapping of pacemaker node names to galera node names. To be used when pacemaker and galera names need to differ (e.g. when galera names map to IPs from a specific network interface) This takes the form of: pcmk1:node.1.galera;pcmk2:node.2.galera;pcmk3:node.3.galera where the galera resource started on node pcmk1 would be named node.1.galera in the wsrep_cluster_address Pacemaker to Galera name mapping Cluster check user.
MySQL test user Cluster check user password. Empty passwords are ignored unless the parameter "check_passwd_use_empty" is set to 1. check password Use an empty "check_passwd" password. If this parameter is set to 1, "check_passwd" will be ignored and an empty password is used when calling the "mysql" client binary. check password use empty If running in a 2-node pacemaker cluster, rely on pacemaker quorum to allow automatic recovery even when the other node is unreachable. Use it with caution! (and fencing) Special recovery when running on a 2-node cluster - - + + END } get_option_variable() { local key=$1 $MYSQL $MYSQL_OPTIONS_CHECK -e "SHOW VARIABLES like '$key';" | tail -1 } get_status_variable() { local key=$1 $MYSQL $MYSQL_OPTIONS_CHECK -e "show status like '$key';" | tail -1 } set_bootstrap_node() { local node=$(ocf_attribute_target $1) ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" -v "true" } clear_bootstrap_node() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" -D } is_bootstrap() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" --quiet 2>/dev/null } set_no_grastate() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" -v "true" } clear_no_grastate() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" -D } is_no_grastate() { local node=$(ocf_attribute_target $1) ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" --quiet 2>/dev/null } clear_last_commit() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -D } set_last_commit() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -v $1 } get_last_commit() { local node=$(ocf_attribute_target $1) if [ -z "$node" ]; then ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" --quiet 2>/dev/null else ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" --quiet 2>/dev/null fi } clear_safe_to_bootstrap() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -D } set_safe_to_bootstrap() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -v $1 } get_safe_to_bootstrap() { local node=$(ocf_attribute_target $1) if [ -z "$node" ]; then ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" --quiet 2>/dev/null else ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" --quiet 2>/dev/null fi } wait_for_sync() { local state=$(get_status_variable "wsrep_local_state") ocf_log info "Waiting for database to sync with the cluster. " while [ "$state" != "4" ]; do sleep 1 state=$(get_status_variable "wsrep_local_state") done ocf_log info "Database synced." } is_primary() { cluster_status=$(get_status_variable "wsrep_cluster_status") if [ "$cluster_status" = "Primary" ]; then return 0 fi if [ -z "$cluster_status" ]; then ocf_exit_reason "Unable to retrieve wsrep_cluster_status, verify check_user '$OCF_RESKEY_check_user' has permissions to view status" else ocf_log info "Galera instance wsrep_cluster_status=${cluster_status}" fi return 1 } is_readonly() { local res=$(get_option_variable "read_only") if ! 
ocf_is_true "$res"; then return 1 fi cluster_status=$(get_status_variable "wsrep_cluster_status") if ! [ "$cluster_status" = "Disconnected" ]; then return 1 fi return 0 } is_two_node_mode_active() { # crm_node or corosync-quorumtool cannot access various corosync # flags when running inside a bundle, so only count the cluster # members ocf_is_true "$OCF_RESKEY_two_node_mode" && crm_mon_no_validation -1X | xmllint --xpath "count(//nodes/node[@type='member'])" - | grep -q -w 2 } is_last_node_in_quorate_partition() { # when a network split occurs in a 2-node cluster, pacemaker # fences the other node and try to retain quorum. So until # the fencing is resolved (and the status of the peer node # is clean), we shouldn't consider ourself quorate. local partition_members=$(${HA_SBIN_DIR}/crm_node -p | wc -w) local quorate=$(${HA_SBIN_DIR}/crm_node -q) local clean_members=$(crm_mon_no_validation -1X | xmllint --xpath 'count(//nodes/node[@type="member" and @unclean="false"])' -) [ "$partition_members" = 1 ] && [ "$quorate" = 1 ] && [ "$clean_members" = 2 ] } master_exists() { if [ "$__OCF_ACTION" = "demote" ]; then # We don't want to detect master instances during demote. # 1. we could be detecting ourselves as being master, which is no longer the case. # 2. we could be detecting other master instances that are in the process of shutting down. # by not detecting other master instances in "demote" we are deferring this check # to the next recurring monitor operation which will be much more accurate return 1 fi # determine if a master instance is already up and is healthy ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.1.0" res=$? if [ -z "$OCF_RESKEY_crm_feature_set" ] || [ $res -eq 2 ]; then XMLOPT="--output-as=xml" ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.2.0" if [ $? -eq 1 ]; then crm_mon_no_validation -1 $XMLOPT >/dev/null 2>&1 if [ $? -ne 0 ]; then XMLOPT="--as-xml" fi fi else XMLOPT="--as-xml" fi crm_mon_no_validation -1 $XMLOPT | grep -q -i -E "resource.*id=\"${INSTANCE_ATTR_NAME}\".*role=\"(Promoted|Master)\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" return $? } clear_master_score() { local node=$(ocf_attribute_target $1) if [ -z "$node" ]; then $CRM_MASTER -D else $CRM_MASTER -D -N $node fi } set_master_score() { local node=$(ocf_attribute_target $1) if [ -z "$node" ]; then $CRM_MASTER -v 100 else $CRM_MASTER -N $node -v 100 fi } promote_everyone() { for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do local pcmk_node=$(galera_to_pcmk_name $node) if [ -z "$pcmk_node" ]; then ocf_log err "Could not determine pacemaker node from galera name <${node}>." 
return else node=$pcmk_node fi set_master_score $node done } greater_than_equal_long() { # there are values we need to compare in this script # that are too large for shell -gt to process echo | awk -v n1="$1" -v n2="$2" '{if (n1>=n2) printf ("true"); else printf ("false");}' | grep -q "true" } galera_to_pcmk_name() { local galera=$1 if [ -z "$OCF_RESKEY_cluster_host_map" ]; then echo $galera else echo "$OCF_RESKEY_cluster_host_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' '$2=="'"$galera"'" {print $1;exit}' fi } pcmk_to_galera_name() { local pcmk=$1 if [ -z "$OCF_RESKEY_cluster_host_map" ]; then echo $pcmk else echo "$OCF_RESKEY_cluster_host_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' '$1=="'"$pcmk"'" {print $2;exit}' fi } detect_first_master() { local best_commit=0 local last_commit=0 local missing_nodes=0 local nodes="" local nodes_recovered="" local all_nodes local best_node_gcomm local best_node local safe_to_bootstrap all_nodes=$(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' ') best_node_gcomm=$(echo "$all_nodes" | sed 's/^.* \(.*\)$/\1/') best_node=$(galera_to_pcmk_name $best_node_gcomm) if [ -z "$best_node" ]; then ocf_log err "Could not determine initial best node from galera name <${best_node_gcomm}>." return fi # avoid selecting a recovered node as bootstrap if possible for node in $all_nodes; do local pcmk_node=$(galera_to_pcmk_name $node) if [ -z "$pcmk_node" ]; then ocf_log err "Could not determine pacemaker node from galera name <${node}>." return else node=$pcmk_node fi if is_no_grastate $node; then nodes_recovered="$nodes_recovered $node" else nodes="$nodes $node" fi done for node in $nodes_recovered $nodes; do # On clean shutdown, galera sets the last stopped node as 'safe to bootstrap', # so use this hint when we can safe_to_bootstrap=$(get_safe_to_bootstrap $node) # Special case for 2-node clusters: during a network split, rely on # pacemaker's quorum to check whether we can restart galera if [ "$safe_to_bootstrap" != "1" ] && [ "$node" = "$NODENAME" ] && is_two_node_mode_active; then is_last_node_in_quorate_partition if [ $? -eq 0 ]; then ocf_log warn "Survived a split in a 2-node cluster, considering ourselves safe to bootstrap" safe_to_bootstrap=1 fi fi if [ "$safe_to_bootstrap" = "1" ]; then # Galera marked the node as safe to bootstrap during shutdown. Let's just # pick it as our bootstrap node. ocf_log info "Node <${node}> is marked as safe to bootstrap." best_node=$node # We don't need to wait for the other nodes to report state in this case missing_nodes=0 break fi last_commit=$(get_last_commit $node) if [ -z "$last_commit" ]; then ocf_log info "Waiting on node <${node}> to report database status before Master instances can start." missing_nodes=1 continue fi # this means -1, or that no commit has occurred yet. if [ "$last_commit" = "18446744073709551615" ]; then last_commit="0" fi greater_than_equal_long "$last_commit" "$best_commit"
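# ($? = 0 below means last_commit >= best_commit; the awk-based helper is used because seqnos are 64-bit values, e.g. 18446744073709551615, which overflow the shell's built-in integer comparison)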
if [ $? -eq 0 ]; then best_node=$(ocf_attribute_target $node) best_commit=$last_commit fi done if [ $missing_nodes -eq 1 ]; then return fi ocf_log info "Promoting $best_node to be our bootstrap node" set_bootstrap_node $best_node set_master_score $best_node } detect_safe_to_bootstrap() { local safe_to_bootstrap="" local uuid="" local seqno="" if [ -f ${OCF_RESKEY_datadir}/grastate.dat ]; then ocf_log info "attempting to read safe_to_bootstrap flag from ${OCF_RESKEY_datadir}/grastate.dat" safe_to_bootstrap=$(sed -n 's/^safe_to_bootstrap:\s*\(.*\)$/\1/p' < ${OCF_RESKEY_datadir}/grastate.dat) uuid=$(sed -n 's/^uuid:\s*\(.*\)$/\1/p' < ${OCF_RESKEY_datadir}/grastate.dat) seqno=$(sed -n 's/^seqno:\s*\(.*\)$/\1/p' < ${OCF_RESKEY_datadir}/grastate.dat) fi if [ -z "$uuid" ] || \ [ "$uuid" = "00000000-0000-0000-0000-000000000000" ]; then clear_safe_to_bootstrap return fi if [ "$safe_to_bootstrap" = "1" ]; then if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then clear_safe_to_bootstrap return fi fi if [ "$safe_to_bootstrap" = "1" ] || [ "$safe_to_bootstrap" = "0" ]; then set_safe_to_bootstrap $safe_to_bootstrap else clear_safe_to_bootstrap fi } detect_last_commit() { local last_commit local recover_args="--defaults-file=$OCF_RESKEY_config \ --pid-file=$OCF_RESKEY_pid \ --socket=$OCF_RESKEY_socket \ --datadir=$OCF_RESKEY_datadir" local recovery_file_regex='s/.*WSREP\:.*position\s*recovery.*--log_error='\''\([^'\'']*\)'\''.*/\1/p' local recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p' # codership/galera#354 # Some ungraceful shutdowns can leave an empty gvwstate.dat on # disk. This will prevent galera from joining the cluster if it is # configured to attempt PC recovery. Removing that file makes the # node fall back to the normal, unoptimized joining process. if [ -f ${OCF_RESKEY_datadir}/gvwstate.dat ] && \ [ ! -s ${OCF_RESKEY_datadir}/gvwstate.dat ]; then ocf_log warn "empty ${OCF_RESKEY_datadir}/gvwstate.dat detected, removing it to prevent PC recovery failure at next restart" rm -f ${OCF_RESKEY_datadir}/gvwstate.dat fi ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat" last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')" if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then local tmp=$(mktemp) chown $OCF_RESKEY_user:$OCF_RESKEY_group $tmp # if we pass here because grastate.dat doesn't exist, # try not to bootstrap from this node if possible if [ ! -f ${OCF_RESKEY_datadir}/grastate.dat ]; then set_no_grastate fi ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" $SU - $OCF_RESKEY_user -s /bin/sh -c \ "${OCF_RESKEY_binary} $recover_args --wsrep-recover --log-error=$tmp 2>/dev/null" last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)" if [ -z "$last_commit" ]; then # Galera uses InnoDB's 2pc transactions internally. If # the server was stopped in the middle of a replication, the # recovery may find a "prepared" XA transaction in the # redo log, and mysql won't recover automatically local recovery_file="$(cat $tmp | sed -n $recovery_file_regex)" if [ -e $recovery_file ]; then cat $recovery_file | grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' 2>/dev/null if [ $? -eq 0 ]; then # we can only rollback the transaction, but that's OK # since the DB will get resynchronized anyway ocf_log warn "local node <${NODENAME}> was not shut down properly. Rollback stuck transaction with --tc-heuristic-recover"
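# (--tc-heuristic-recover=rollback only discards the in-doubt transaction; the node is forced to resynchronize from the cluster afterwards, so no committed data should be lost)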
$SU - $OCF_RESKEY_user -s /bin/sh -c \ "${OCF_RESKEY_binary} $recover_args --wsrep-recover \ --tc-heuristic-recover=rollback --log-error=$tmp 2>/dev/null" last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)" if [ ! -z "$last_commit" ]; then ocf_log warn "State recovered. force SST at next restart for full resynchronization" rm -f ${OCF_RESKEY_datadir}/grastate.dat # try not to bootstrap from this node if possible set_no_grastate fi fi fi fi rm -f $tmp fi if [ ! -z "$last_commit" ]; then ocf_log info "Last commit version found: $last_commit" set_last_commit $last_commit return $OCF_SUCCESS else ocf_exit_reason "Unable to detect last known write sequence number" clear_last_commit return $OCF_ERR_GENERIC fi } # For galera, promote is really start galera_promote() { local rc local extra_opts local bootstrap local safe_to_bootstrap master_exists if [ $? -eq 0 ]; then # join without bootstrapping extra_opts="--wsrep-cluster-address=${OCF_RESKEY_wsrep_cluster_address}" else bootstrap=$(is_bootstrap) if ocf_is_true $bootstrap; then # The best node for bootstrapping wasn't cleanly shut down. Allow # bootstrapping anyway if [ "$(get_safe_to_bootstrap)" = "0" ]; then sed -i -e 's/^\(safe_to_bootstrap:\) 0/\1 1/' ${OCF_RESKEY_datadir}/grastate.dat ocf_log info "safe_to_bootstrap in ${OCF_RESKEY_datadir}/grastate.dat set to 1 on node ${NODENAME}" fi ocf_log info "Node <${NODENAME}> is bootstrapping the cluster" extra_opts="--wsrep-cluster-address=gcomm://" else # We are being promoted without having the bootstrap # attribute in the CIB, which means we are supposed to # join a cluster; however if we end up here, there is no # Master remaining right now, which means there is no # cluster to join anymore. So force a demotion, and # let the RA decide later which node should be the next # bootstrap node. ocf_log warn "There is no running cluster to join, demoting ourselves" clear_master_score return $OCF_SUCCESS fi fi galera_monitor if [ $? -eq $OCF_RUNNING_MASTER ]; then if ocf_is_true $bootstrap; then promote_everyone clear_bootstrap_node ocf_log info "bootstrap node already up, promoting the rest of the galera instances." fi clear_safe_to_bootstrap clear_last_commit return $OCF_SUCCESS fi # last commit/safe_to_bootstrap flag are no longer relevant once promoted clear_last_commit clear_safe_to_bootstrap mysql_common_prepare_dirs mysql_common_start "$extra_opts" rc=$? if [ $rc != $OCF_SUCCESS ]; then return $rc fi galera_monitor rc=$? if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then ocf_exit_reason "Failed initial monitor action" return $rc fi is_readonly if [ $? -eq 0 ]; then ocf_exit_reason "Failure. Master instance started in read-only mode, check configuration." return $OCF_ERR_GENERIC fi is_primary if [ $? -ne 0 ]; then ocf_exit_reason "Failure. Master instance started, but is not in Primary mode." return $OCF_ERR_GENERIC fi if ocf_is_true $bootstrap; then promote_everyone clear_bootstrap_node # clear attribute no-grastate. If the last shutdown was # not clean, we cannot be extra-cautious by requesting an SST # since this is the bootstrap node clear_no_grastate ocf_log info "Bootstrap complete, promoting the rest of the galera instances." else # if this is not the bootstrap node, make sure this instance # syncs with the rest of the cluster before promotion returns.
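# (wait_for_sync polls the wsrep_local_state status variable until it reaches 4, the value Galera reports for a node in the "Synced" state)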
wait_for_sync # sync is done, clear info about last startup clear_no_grastate fi ocf_log info "Galera started" return $OCF_SUCCESS } galera_demote() { mysql_common_stop rc=$? if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then ocf_exit_reason "Failed to stop Master galera instance during demotion" return $rc fi # if this node was previously a bootstrap node, that is no longer the case. clear_bootstrap_node clear_last_commit clear_no_grastate clear_safe_to_bootstrap # Clear master score here rather than letting pacemaker do so once # demote finishes. This way a promote cannot take place right # after this demote even if pacemaker is requested to do so. It # will first have to run a start/monitor op, to reprobe the state # of the other galera nodes and act accordingly. clear_master_score # record last commit for next promotion detect_safe_to_bootstrap detect_last_commit rc=$? return $rc } galera_start() { local rc local galera_node galera_node=$(pcmk_to_galera_name $NODENAME) if [ -z "$galera_node" ]; then ocf_exit_reason "Could not determine galera name from pacemaker node <${NODENAME}>." return $OCF_ERR_CONFIGURED fi echo $OCF_RESKEY_wsrep_cluster_address | grep -q -F $galera_node if [ $? -ne 0 ]; then ocf_exit_reason "local node <${NODENAME}> (galera node <${galera_node}>) must be a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}> to start this galera instance" return $OCF_ERR_CONFIGURED fi galera_monitor if [ $? -eq $OCF_RUNNING_MASTER ]; then ocf_exit_reason "master galera instance started outside of the cluster's control" return $OCF_ERR_GENERIC fi mysql_common_prepare_dirs detect_safe_to_bootstrap detect_last_commit rc=$? if [ $rc -ne $OCF_SUCCESS ]; then return $rc fi master_exists if [ $? -eq 0 ]; then ocf_log info "Master instances are already up, setting master score so this instance will join galera cluster." set_master_score $NODENAME else clear_master_score detect_first_master fi return $OCF_SUCCESS } galera_monitor() { local rc local galera_node local status_loglevel="err" # Set loglevel to info during probe if ocf_is_probe; then status_loglevel="info" fi mysql_common_status $status_loglevel rc=$? if [ $rc -eq $OCF_NOT_RUNNING ]; then last_commit=$(get_last_commit $node) if [ -n "$last_commit" ]; then # if last commit is set, this instance is considered started in slave mode rc=$OCF_SUCCESS master_exists if [ $? -ne 0 ]; then detect_first_master else # a master instance exists and is healthy, promote this # local read-only instance # so it can join the master galera cluster. set_master_score fi fi return $rc elif [ $rc -ne $OCF_SUCCESS ]; then return $rc fi # if we make it here, mysql is running. Check cluster status now. galera_node=$(pcmk_to_galera_name $NODENAME) if [ -z "$galera_node" ]; then ocf_exit_reason "Could not determine galera name from pacemaker node <${NODENAME}>." return $OCF_ERR_CONFIGURED fi echo $OCF_RESKEY_wsrep_cluster_address | grep -q -F $galera_node if [ $? -ne 0 ]; then ocf_exit_reason "local node <${NODENAME}> (galera node <${galera_node}>) is started, but is not a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>" return $OCF_ERR_GENERIC fi is_primary if [ $? -eq 0 ]; then if ocf_is_probe; then # restore master score during probe # if we detect this is a master instance set_master_score fi rc=$OCF_RUNNING_MASTER else ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." rc=$OCF_ERR_GENERIC fi
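# (galera_monitor thus returns OCF_RUNNING_MASTER for a healthy Primary, OCF_SUCCESS for a running read-only instance or a stopped one whose last-committed attribute is known, and an error otherwise)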
return $rc } galera_stop() { local rc # make sure the process is stopped mysql_common_stop rc=$? clear_safe_to_bootstrap clear_last_commit clear_master_score clear_bootstrap_node clear_no_grastate return $rc } galera_validate() { if ! ocf_is_ms; then ocf_exit_reason "Galera must be configured as a multistate Master/Slave resource." return $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_wsrep_cluster_address" ]; then ocf_exit_reason "Galera must be configured with a wsrep_cluster_address value." return $OCF_ERR_CONFIGURED fi mysql_common_validate } case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage|help) usage exit $OCF_SUCCESS;; esac galera_validate rc=$? LSB_STATUS_STOPPED=3 if [ $rc -ne 0 ]; then case "$1" in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; *) exit $rc;; esac fi if [ -z "${OCF_RESKEY_check_passwd}" ]; then # This value is automatically sourced from /etc/sysconfig/clustercheck if available OCF_RESKEY_check_passwd=${MYSQL_PASSWORD} fi if [ -z "${OCF_RESKEY_check_user}" ]; then # This value is automatically sourced from /etc/sysconfig/clustercheck if available OCF_RESKEY_check_user=${MYSQL_USERNAME} fi : ${OCF_RESKEY_check_user="root"} MYSQL_OPTIONS_CHECK="-nNE --user=${OCF_RESKEY_check_user}" if ocf_is_true "${OCF_RESKEY_check_passwd_use_empty}"; then MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK --password=" elif [ -n "${OCF_RESKEY_check_passwd}" ]; then MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK --password=${OCF_RESKEY_check_passwd}" fi # This value is automatically sourced from /etc/sysconfig/clustercheck if available if [ -n "${MYSQL_HOST}" ]; then MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK -h ${MYSQL_HOST}" fi # This value is automatically sourced from /etc/sysconfig/clustercheck if available if [ -n "${MYSQL_PORT}" ]; then MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK -P ${MYSQL_PORT}" fi # What kind of method was invoked? case "$1" in start) galera_start;; stop) galera_stop;; status) mysql_common_status err;; monitor) galera_monitor;; promote) galera_promote;; demote) galera_demote;; validate-all) exit $OCF_SUCCESS;; *) usage exit $OCF_ERR_UNIMPLEMENTED;; esac # vi:sw=4:ts=4:et: diff --git a/heartbeat/iface-bridge b/heartbeat/iface-bridge index 75d5371dd..a4e50adb9 100755 --- a/heartbeat/iface-bridge +++ b/heartbeat/iface-bridge @@ -1,843 +1,843 @@ #!/bin/sh # # OCF Resource Agent compliant iface-bridge script. # # Implements network Bridge interface management # # Copyright (C) 2013 Red Hat, Inc. All rights reserved. # Author: Fabio M. Di Nitto # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever.
# # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # # TODO: # * Eventually improve bridge_check to verify all runtime # parameters. Is it really necessary? # * consider adding support for advanced multicast timers tuning # sethashel set hash elasticity default 4 # sethashmax set hash max default 512 # setmclmc set multicast last member count default 2, ? # setmcsqc set multicast startup query count default 2, ? # setmclmi