diff --git a/heartbeat/db2 b/heartbeat/db2 index 83020fc70..82f2f82c3 100755 --- a/heartbeat/db2 +++ b/heartbeat/db2 @@ -1,1084 +1,1084 @@ #!/bin/sh # # db2 # # Resource agent that manages a DB2 LUW database in Standard role # or HADR configuration in promotable configuration. # Multi partition is supported as well. # # Copyright (c) 2011 Holger Teutsch # # This agent incoporates code of a previous release created by # Alan Robertson and the community. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Use runuser if available for SELinux. 
if [ -x "/sbin/runuser" ]; then SU="runuser" else SU="su" fi # Parameter defaults OCF_RESKEY_instance_default="" OCF_RESKEY_skip_basic_sql_health_check_default="false" OCF_RESKEY_monitor_retries_default="1" -OCF_RESKEY_monitor_sleep_default="1" +OCF_RESKEY_monitor_retries_sleep_default="1" OCF_RESKEY_monitor_retry_all_errors_default="false" OCF_RESKEY_admin_default="" OCF_RESKEY_dbpartitionnum_default="0" : ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}} : ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}} : ${OCF_RESKEY_monitor_retries=${OCF_RESKEY_monitor_retries_default}} -: ${OCF_RESKEY_monitor_sleep=${OCF_RESKEY_monitor_sleep_default}} +: ${OCF_RESKEY_monitor_retries_sleep=${OCF_RESKEY_monitor_retries_sleep_default}} : ${OCF_RESKEY_monitor_retry_all_errors=${OCF_RESKEY_monitor_retry_all_errors_default}} : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}} : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}} POSIX_UNICODE_LOCALE="C.UTF-8" ####################################################################### db2_usage() { echo "db2 start|stop|monitor|promote|demote|validate-all|meta-data" } db2_meta_data() { cat < 1.0 Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles in promotable configuration. Multiple partitions are supported. Standard mode: An instance including all or selected databases is made highly available. Configure each partition as a separate primitive resource. HADR mode: A single database in HADR configuration is made highly available by automating takeover operations. Configure a promotable resource with notifications enabled and an additional monitoring operation with role "Promoted". In case of HADR be very deliberate in specifying intervals/timeouts. The detection of a failure including promote must complete within HADR_PEER_WINDOW. In addition to honoring requirements for crash recovery etc. 
for your specific database use the following relations as guidance: "monitor interval" < HADR_PEER_WINDOW - (appr 30 sec) "promote timeout" < HADR_PEER_WINDOW + (appr 20 sec) For further information and examples consult http://www.linux-ha.org/wiki/db2_(resource_agent) Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles as promotable configuration. Multiple partitions are supported. The instance of the database(s). instance List of databases to be managed, e.g "db1 db2". Defaults to all databases in the instance. Specify one db for HADR mode. List of databases to be managed Skip basic health check SQL query. Only set to "true" when the "monitor_retries" and "monitor_retry_all_errors" parameters arent enough to avoid issues under high load. Skip basic health check SQL query Monitor retries before failing. Monitor retries Monitor sleep between tries. Monitor sleep - + Set to true to retry monitor-action for all errors instead of the default "db2pd" race conditions. Retry monitor for all errors DEPRECATED: The admin user of the instance. DEPRECATED: admin The number of the partition (DBPARTITIONNUM) to be managed. database partition number (DBPARTITIONNUM) END } # # validate # .. and set global variables # # exit on error # db2_validate() { local db2home db2sql db2instance # db2 uses korn shell check_binary "ksh" # check required instance vars if [ -z "$OCF_RESKEY_instance" ] then ocf_log err "DB2 required parameter instance is not set!" return $OCF_ERR_CONFIGURED fi instance=$OCF_RESKEY_instance if [ -n "$OCF_RESKEY_admin" ] then ocf_log warn "DB2 deprecated parameter admin is set, using $OCF_RESKEY_admin as instance." instance=$OCF_RESKEY_admin fi db2node=${OCF_RESKEY_dbpartitionnum:-0} db2home=$(sh -c "echo ~$instance") db2sql=$db2home/sqllib db2profile=$db2sql/db2profile db2bin=$db2sql/bin STATE_FILE=${HA_RSCTMP}/db2-${OCF_RESOURCE_INSTANCE}.state # Let's make sure a few important things are there... if ! 
[ -d "$db2sql" -a -d "$db2bin" -a -f "$db2profile" -a \ -x "$db2profile" -a -x "$db2bin/db2" ] then ocf_is_probe && exit $OCF_NOT_RUNNING ocf_log err "DB2 required directories and/or files not found" exit $OCF_ERR_INSTALLED fi db2instance=$(runasdb2 'echo $DB2INSTANCE') if [ "$db2instance" != "$instance" ] then ocf_is_probe && exit $OCF_NOT_RUNNING ocf_log err "DB2 parameter instance \"$instance\" != DB2INSTANCE \"$db2instance\"" exit $OCF_ERR_CONFIGURED fi # enough checking for stop to succeed [ $__OCF_ACTION = stop ] && return $OCF_SUCCESS dblist=$OCF_RESKEY_dblist if [ -n "$dblist" ] then # support , as separator as well dblist=$(echo "$dblist" | sed -e 's/[,]/ /g') else if ! dblist=$(db2_dblist) then ocf_log err "DB2 $instance($db2node): cannot retrieve db directory" exit $OCF_ERR_INSTALLED fi fi # check requirements for the HADR case if ocf_is_ms then set -- $dblist if [ $# != 1 ] then ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have exactly one name in dblist" exit $OCF_ERR_CONFIGURED fi if [ $db2node != 0 ] then ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have dbpartitionnum=0" exit $OCF_ERR_CONFIGURED fi fi return $OCF_SUCCESS } master_score() { if ! have_binary "crm_master"; then return fi crm_master $* } # # Run the given command as db2 instance user # runasdb2() { $SU $instance -c ". $db2profile; $*" } # # Run the given command as db2 instance user using $SU # We run this function as opposed to runasdb2 whenever we have to issue commands # that leave processes running on the system, such as db2start # We do not want these processes to hog the resources as they were run with elevated privileges # runasdb2_session() { # Override db2profile with unicode locale is required to maintain compatibility with unicode CODEPAGE $SU "$instance" -c "ksh -c '. 
$db2profile; export LC_ALL="$POSIX_UNICODE_LOCALE"; export LANG="$POSIX_UNICODE_LOCALE"; $*'" } # # Run a command as the DB2 admin, and log the output # logasdb2() { local output rc output=$(runasdb2 $*) rc=$? if [ $rc -eq 0 ] then ocf_log info "$output" else ocf_log err "$output" fi return $rc } # # unfortunately a first connect after a crash may need several minutes # for some internal cleanup stuff in DB2. # We run a connect in background so other connects (i.e. monitoring!) may proceed. # db2_run_connect() { local db=$1 logasdb2 "db2 connect to $db; db2 terminate" } # # get some data from the database config # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW # db2_get_cfg() { local db=$1 local output hadr_vars output=$(runasdb2 db2 get db cfg for $db) [ $? != 0 ] && return $OCF_ERR_GENERIC hadr_vars=$(echo "$output" | awk '/HADR database role/ {printf "HADR_ROLE='%s'; ", $NF;} /HADR_TIMEOUT/ {printf "HADR_TIMEOUT='%s'; ", $NF;} /First active log file/ {printf "FIRST_ACTIVE_LOG='%s'\n", $NF;} /HADR_PEER_WINDOW/ {printf "HADR_PEER_WINDOW='%s'\n", $NF;}') # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW HADR_ROLE=$(echo "$output" | awk '/HADR database role/ {print $NF;}') HADR_TIMEOUT=$(echo "$output" | awk '/HADR_TIMEOUT/ {print $NF;}') FIRST_ACTIVE_LOG=$(echo "$output" | awk '/First active log file/ {print $NF;}') HADR_PEER_WINDOW=$(echo "$output" | awk '/HADR_PEER_WINDOW/ {print $NF;}') # HADR_PEER_WINDOW comes with V9 and is checked later if [ -z "$HADR_ROLE" -o -z "$HADR_TIMEOUT" ] then ocf_log error "DB2 cfg values invalid for $instance($db2node)/$db: $hadr_vars" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # # return the list of databases in the instance # db2_dblist() { local output output=$(runasdb2 db2 list database directory) || return $OCF_ERR_GENERIC echo "$output" | grep -i 'Database name.*=' | sed 's%.*= *%%' } # # Delayed check of the compatibility of DB2 instance and pacemaker # config. 
# Logically this belongs to validate but certain parameters can only
# be retrieved once the instance is started.
#
# Parameters:
#   1 - database name (only used in log messages)
#
# Reads the globals HADR_ROLE and HADR_PEER_WINDOW (set by db2_get_cfg)
# and compares them with the result of ocf_is_ms.
# Exits with OCF_ERR_INSTALLED when a non-HADR database is configured as
# a M/S resource, or when a HADR database reports no HADR_PEER_WINDOW.
# A HADR database that is not configured as M/S resource is only logged.
#
db2_check_config_compatibility() {
    local db=$1
    local is_ms

    # 0 = running as promotable (M/S) resource, non-zero otherwise
    ocf_is_ms
    is_ms=$?

    case "$HADR_ROLE/$is_ms" in
        STANDARD/0)
            ocf_log err "DB2 database $instance/$db is not in a HADR configuration but I am a M/S resource"
            exit $OCF_ERR_INSTALLED
            ;;

        STANDARD/1)
            # OK
            ;;

        */0)
            if [ -z "$HADR_PEER_WINDOW" ]
            then
                ocf_log err "DB2 database $instance: release to old, need HADR_PEER_WINDOW (>=V9)"
                exit $OCF_ERR_INSTALLED
            fi
            ;;

        */1)
            # logged only, no exit
            ocf_log err "DB2 database $instance/$db is in a HADR configuration but I must be a M/S resource"
            ;;
    esac
}

#
# Start HADR as standby.
#
# Parameters
# 1 - Database name
# 2 - Calling function (used in the log message only)
# 3 - Line number in the calling function (used in the log message only)
#
# Return codes:
# 0 - Start as standby successful (or HADR was already started, SQL1777N)
# 1 - Start as standby failed
#
# The reintegration attribute for the database is reset to "0" on the
# local node in both cases.
#
reintegrateAsStandby() {
    # Keep the working variables local: the caller uses variables with the
    # same names (db, output, rc) and must not have them overwritten.
    local db=$1
    local reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
    local output rc

    ocf_log info "$__OCF_ACTION: $LINENO: reintegrateAsStandby called by $2 at $3. Attempting to reintegrate $db as standby."

    if output=$(runasdb2_session "db2 start hadr on db $db as standby"); then
        rc=0
        ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated"
    else
        case $output in
            SQL1777N*)
                # SQL1777N: HADR is already started in given state.
                ocf_log info "$__OCF_ACTION: $LINENO: $output"
                rc=0
                ;;

            *)
                rc=1
                ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc"
                ;;
        esac
    fi

    # clear the reintegration flag regardless of the outcome
    crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever

    return $rc
}

#
# Start instance and DB.
# Standard mode is through "db2 activate" in order to start in previous
# mode (Standby/Primary).
# If the database is a primary AND we can determine that the running master
# has a higher "first active log" we conclude that we come up after a crash
# and the previous Standby is now Primary.
# The db is then started as Standby.
#
# Other cases: danger of split brain, log error and do nothing.
#
# Returns OCF_SUCCESS when the instance (and, on partition 0, every
# database in $dblist) is up, OCF_ERR_GENERIC otherwise.
#
db2_start() {
    local output start_cmd db
    local start_opts="dbpartitionnum $db2node"
    local rc cib_value reint_attr
    local standby_reintegration=0

    # If we detect that db partitions are not in use, and no
    # partition is explicitly specified, activate without
    # partition information. This allows db2 instances without
    # partition support to be managed.
    if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then
        start_opts=""
    fi

    if output=$(runasdb2 db2start $start_opts)
    then
        ocf_log info "DB2 instance $instance($db2node) started: $output"
    else
        case $output in
            *SQL1026N*)
                ocf_log info "DB2 instance $instance($db2node) already running: $output"
                ;;

            *)
                ocf_log err "$output"
                return $OCF_ERR_GENERIC
        esac
    fi

    if ! db2_instance_status
    then
        ocf_log err "DB2 instance $instance($db2node) is not active!"
        return $OCF_ERR_GENERIC
    fi

    # activate DB only on node 0
    [ $db2node = 0 ] || return $OCF_SUCCESS

    for db in $dblist
    do
        reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"

        # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG
        db2_get_cfg $db || return $?

        # Better late than never: can only check this when the instance is already up
        db2_check_config_compatibility $db

        start_cmd="db2 activate db $db"

        if [ $HADR_ROLE = PRIMARY ]
        then
            cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
            ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value'"
            if [ "$cib_value" = "1" ]; then
                ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary"
                start_cmd="db2 start hadr on db $db as standby"
                HADR_ROLE=STANDBY
                standby_reintegration=1
            fi
        fi

        if output=$(runasdb2 $start_cmd)
        then
            ocf_log info "DB2 database $instance($db2node)/$db started/activated"
            # the first connect may take long, so it runs in the background
            [ $HADR_ROLE != STANDBY ] && db2_run_connect $db &
        else
            case $output in
                SQL1490W* | SQL1494W* | SQL1497W* | SQL1777N*)
                    # SQL1490W Activate database is successful, however, the database has already been activated on one or more nodes.
                    # SQL1494W Activate database is successful, however, there is already a connection to the database.
                    # SQL1497W Activate/Deactivate database was successful, however, an error occurred on some nodes.
                    # SQL1777N HADR is already started.
                    ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is already activated: $output"
                    ;;

                SQL1768N*"Reason code = \"7\""*)
                    rc="$OCF_ERR_GENERIC"
                    ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is a Primary and the Standby is down"
                    ocf_log err "Possible split brain! Manual intervention required."
                    ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\""
                    ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\". db2_start() exit with rc=$rc."

                    # let pacemaker give it another try and we will succeed then
                    return "$rc"
                    ;;

                SQL1776N*"Reason code = \"6\""*)
                    # SQL1776N The command cannot be issued on an HADR database.
                    # Reason code 6:
                    # This database is an old primary database. It cannot be started
                    # because the standby has become the new primary through forced
                    # takeover.
                    rc="$OCF_ERR_GENERIC"
                    ocf_log err "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db didn't start: $output, return with rc=$rc"
                    ocf_log err "$__OCF_ACTION: $LINENO: This database is an old primary database. Trying start again as standby"

                    start_cmd="db2 start hadr on db $db as standby"
                    if output=$(runasdb2_session "$start_cmd"); then
                        rc="$OCF_SUCCESS"
                        ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated"
                    else
                        case $output in
                            SQL1777N*)
                                # SQL1777N: HADR is already started.
                                ocf_log info "$__OCF_ACTION: $LINENO: $output"
                                rc="$OCF_SUCCESS"
                                ;;

                            *)
                                rc="$OCF_ERR_GENERIC"
                                ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc"
                                ;;
                        esac
                    fi
                    return "$rc"
                    ;;

                *)
                    rc="$OCF_ERR_GENERIC"
                    ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database didn't start: $output, db2_start() exit with rc=$rc."
                    return "$rc"
                    ;;
            esac
        fi
    done

    # come here with success
    # Even if we are a db2 Primary pacemaker requires start to end up in slave mode
    echo SLAVE > $STATE_FILE

    # Unset primary failover attribute as host was successfully reintegrated as standby
    if [ "$standby_reintegration" = "1" ]; then
        for db in $dblist; do
            reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
            crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever
        done
    fi

    return $OCF_SUCCESS
}

#
# helper function to be spawned
# so we can detect a hang of the db2stop command
#
# Returns OCF_SUCCESS when db2stop succeeded or the instance was not
# started (SQL1032N), OCF_ERR_GENERIC otherwise.
#
db2_stop_bg() {
    local rc output
    local stop_opts="dbpartitionnum $db2node"

    rc=$OCF_SUCCESS

    # same rule as in db2_start: no partition option for instances
    # without partition support
    if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then
        stop_opts=""
    fi

    if output=$(runasdb2 db2stop force $stop_opts)
    then
        ocf_log info "DB2 instance $instance($db2node) stopped: $output"
    else
        case $output in
            *SQL1032N*)
                #SQL1032N No start database manager command was issued
                ocf_log info "$output"
                ;;

            *)
                ocf_log err "DB2 instance $instance($db2node) stop failed: $output"
                rc=$OCF_ERR_GENERIC
        esac
    fi

    return $rc
}

#
# Stop the given db2 database instance
#
# db2stop is run in the background for at most 4/5 of the operation
# timeout. If it fails, hangs or leaves processes behind, the instance
# is killed and we wait until no process is left.
# Always returns OCF_SUCCESS (or never returns, see the final loop).
#
db2_stop() {
    local stop_timeout grace_timeout stop_bg_pid i must_kill stoprc

    # remove master score
    master_score -D -l reboot

    # be very early here in order to avoid stale data
    rm -f $STATE_FILE

    db2_instance_status
    if [ $? -eq $OCF_NOT_RUNNING ]; then
        ocf_log info "DB2 instance $instance already stopped"
        return $OCF_SUCCESS
    fi

    stop_timeout=${OCF_RESKEY_CRM_meta_timeout:-20000}

    # grace_time is 4/5 (unit is ms)
    grace_timeout=$((stop_timeout/1250))

    # start db2stop in background as this may hang
    db2_stop_bg &
    stop_bg_pid=$!

    # wait for grace_timeout
    i=0
    while [ $i -lt $grace_timeout ]
    do
        kill -0 $stop_bg_pid 2>/dev/null || break;
        sleep 1
        i=$((i+1))
    done

    # collect exit status but don't hang
    if kill -0 $stop_bg_pid 2>/dev/null
    then
        # still running after the grace period: treat as failure
        stoprc=1
        kill -9 $stop_bg_pid 2>/dev/null
    else
        wait $stop_bg_pid
        stoprc=$?
    fi

    must_kill=0

    if [ $stoprc -ne 0 ]
    then
        ocf_log warn "DB2 instance $instance($db2node): db2stop failed, using db2nkill"
        must_kill=1
    elif ! db2_instance_dead
    then
        ocf_log warn "DB2 instance $instance($db2node): db2stop indicated success but there a still processes, using db2nkill"
        must_kill=1
    fi

    if [ $must_kill -eq 1 ]
    then
        # db2nkill kills *all* partitions on the node
        if [ -x $db2bin/db2nkill ]
        then
            logasdb2 $db2bin/db2nkill $db2node
        elif [ -x $db2bin/db2_kill ]
        then
            logasdb2 $db2bin/db2_kill
        fi

        # loop forever (or lrmd kills us due to timeout) until the
        # instance is dead
        while ! db2_instance_dead
        do
            ocf_log info "DB2 instance $instance($db2node): waiting for processes to exit"
            sleep 1
        done

        ocf_log info "DB2 instance $instance($db2node) is now dead"
    fi

    return $OCF_SUCCESS
}

#
# check whether `enough´ processes for a healthy instance are up
#
# Counts the db2* processes listed by db2nps for this partition:
#   >= 4  OCF_SUCCESS
#   1..3  OCF_ERR_GENERIC
#   0     OCF_NOT_RUNNING
#
db2_instance_status() {
    local pscount

    pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l)
    if [ $pscount -ge 4 ]; then
        return $OCF_SUCCESS;
    elif [ $pscount -ge 1 ]; then
        return $OCF_ERR_GENERIC
    fi
    return $OCF_NOT_RUNNING
}

#
# is the given db2 instance dead?
#
# Succeeds only when db2nps lists no db2* process for this partition.
#
db2_instance_dead() {
    local pscount

    pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l)
    test $pscount -eq 0
}

#
# return the status of the db as "Role/Status"
# e.g. Primary/Peer, Standby/RemoteCatchupPending
#
# If not in HADR configuration return "Standard/Standalone"
#
# Parameters:
#   1 - database name
#
# Return codes:
#   0   - status printed on stdout
#   1   - db2pd failed, "Down/Off" printed on stdout
#   255 - the db2pd output contains one of the messages treated as
#         transient (see the last three awk rules); db2_monitor_retry
#         retries on this code
#
db2_hadr_status() {
    local db=$1
    local output status db2pd_rc parse_rc

    output=$(runasdb2 db2pd -hadr -db $db)
    # Save the status right away: the ocf_log call below overwrites $?.
    db2pd_rc=$?
    ocf_log debug "db2_hadr_status: $output"

    status=$(echo "$output" |
        awk '/^\s+HADR_(ROLE|STATE) =/ {printf "%s/", $3}
             /^\s+HADR_CONNECT_STATUS =/ {print $3; exit; }
             /^HADR is not active/ {print "Standard/Standalone"; exit; }
             /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }
             /^Option -hadr requires -db or -alldbs option and active database./ { exit 255 }
             /^Another possibility of this failure is the Virtual Address Space Randomization is currently enabled on this system./ { exit 255 }
             /^Changing data structure forced command termination./ { exit 255 }')
    parse_rc=$?

    # The transient conditions are recognised from the output first, so
    # they are reported as 255 whatever exit status db2pd returned.
    if [ $parse_rc -ne 255 ] && [ $db2pd_rc -ne 0 ]
    then
        echo "Down/Off"
        return 1
    fi

    printf '%s\n' "$status"
    return $parse_rc
}

#
# Run db2_monitor up to monitor_retries + 1 times.
#
# A retry happens when db2_monitor returns 255, or on any failure when
# monitor_retry_all_errors is true. OCF_SUCCESS, OCF_RUNNING_MASTER and
# OCF_NOT_RUNNING end the loop immediately. A final 255 is mapped to
# OCF_ERR_GENERIC and the master score is removed on any failure.
#
db2_monitor_retry() {
    local tries=$(($OCF_RESKEY_monitor_retries + 1))
    local try rc

    for try in $(seq $tries); do
        ocf_log debug "monitor try $try of $tries"
        db2_monitor
        rc=$?
        [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ] && [ $rc -ne $OCF_NOT_RUNNING ] && ocf_log warn "Monitor failed with rc $rc."
        if [ $rc -eq $OCF_SUCCESS ] || [ $rc -eq $OCF_RUNNING_MASTER ] || [ $rc -eq $OCF_NOT_RUNNING ] || { [ $rc -ne 255 ] && ! ocf_is_true "$OCF_RESKEY_monitor_retry_all_errors" ;} ;then
            break
        fi
        [ $try -lt $tries ] && sleep $OCF_RESKEY_monitor_retries_sleep
    done

    [ $rc -eq 255 ] && rc=$OCF_ERR_GENERIC

    if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ]; then
        # instance is dead remove master score
        master_score -D -l reboot
    fi

    return $rc
}

#
# Monitor the db
# And as side effect set crm_master
#
# Returns OCF_SUCCESS or OCF_RUNNING_MASTER (according to $STATE_FILE)
# when everything is fine, the result of db2_instance_status when the
# instance is not healthy, 255 for a transient db2pd condition during a
# monitor action, OCF_ERR_GENERIC / OCF_NOT_RUNNING otherwise.
#
db2_monitor() {
    local CMD output hadr db
    local rc
    local cib_value reint_attr

    db2_instance_status
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ]; then
        return $rc
    fi

    # monitoring only for partition 0
    [ $db2node = 0 ] || return 0

    for db in $dblist
    do
        reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"

        #Check for the reintegration file, then set the flag if it exists and delete the file
        if [ -e "/tmp/$reint_attr" ] && [ -n "$remote_host" ]; then
            #The file exist, try to set the reintegration attribute
            crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
            cib_value=$(crm_attribute -n "$reint_attr" -N "$remote_host" -G | awk -v FS=' value=' '{print $2}')
            if [ "$cib_value" = "1" ]; then
                ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value', reintegration flag file will now be deleted."
                rm -f "/tmp/$reint_attr"
            else
                ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The reintegration flag file exists, but its attribute failed to set."
            fi
        fi

        hadr=$(db2_hadr_status $db)
        rc=$?
        ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr"
        if [ "$rc" -eq 255 ]; then
            # 255 is only passed on for the monitor action, where
            # db2_monitor_retry handles it
            if [ "$__OCF_ACTION" = "monitor" ]; then
                return $rc
            else
                return $OCF_ERR_GENERIC
            fi
        elif [ "$rc" -ne 0 ]; then
            return $OCF_ERR_GENERIC
        fi

        # set master preference accordingly
        case "$hadr" in
            PRIMARY/*|Primary/*|Standard/*)
                if ! ocf_is_true "$OCF_RESKEY_skip_basic_sql_health_check"; then
                    # perform a basic health check
                    CMD="if db2 connect to $db; then db2 select \* from sysibm.sysversions ; rc=\$?; db2 terminate; else rc=\$?; fi; exit \$rc"

                    if ! output=$(runasdb2 $CMD)
                    then
                        case "$output" in
                            SQL1776N*)
                                # can't connect/select on standby, may be spurious during takeover
                                ;;

                            *)
                                ocf_log err "DB2 database $instance($db2node)/$db is not working"
                                ocf_log err "DB2 message: $output"

                                # dead primary, remove master score
                                master_score -D -l reboot
                                return $OCF_ERR_GENERIC
                        esac
                    fi
                fi

                ocf_log debug "DB2 database $instance($db2node)/$db appears to be working"
                ocf_is_ms && master_score -v 10000 -l reboot
                ;;

            STANDBY/*PEER/*|Standby/*Peer)
                # If db is in standby peer, then it has already reintegrated.
                # If the reintegrate flag is still set, remove it
                cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
                if [ "$cib_value" = "1" ]; then
                    ocf_log info "$__OCF_ACTION: $LINENO: Reintegrate flag detected for $db, but it has already reintegrated as standby. Removing reintegration flag."
                    crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever
                fi
                master_score -v 8000 -l reboot
                ;;

            STANDBY/*|Standby/*)
                ocf_log warn "DB2 database $instance($db2node)/$db in status $hadr can never be promoted"
                master_score -D -l reboot
                ;;

            Down/Off)
                # If db is a deactivated primary and it has a reintegration flag, then reintegrate as standby.
                cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
                if [ "$cib_value" = "1" ]; then
                    output=$(runasdb2 "db2 get db cfg for $db" | grep 'HADR database role' | awk '{print $5}')
                    if [ "PRIMARY" = "$output" ]; then
                        ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Database is deactivated with Primary role and the reintegration flag is set. Role: $output, Reintegration flag: $reint_attr = $cib_value"

                        # Reintegrate as the standby database.
                        if reintegrateAsStandby "$db" 'db2_monitor' $LINENO; then
                            ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration succeeded."

                            # Setting slave state here will cause rc to be OCF_SUCCESS below.
                            ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Echoing SLAVE into $STATE_FILE"
                            echo SLAVE >"$STATE_FILE"

                            # Update master score to reflect standby state.
                            master_score -v 8000 -l reboot
                        else
                            ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration failed."
                            return "$OCF_ERR_GENERIC"
                        fi
                    fi
                else
                    rc="$OCF_NOT_RUNNING"
                    ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database has HADR status $hadr."
                    ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: db2_monitor() exit with rc=$rc."
                    return "$rc"
                fi
                ;;

            *)
                return $OCF_ERR_GENERIC
        esac
    done

    # everything OK, return if running as slave
    grep MASTER $STATE_FILE >/dev/null 2>&1 || return $OCF_SUCCESS

    return $OCF_RUNNING_MASTER
}

#
# Promote db to Primary
#
# Returns OCF_SUCCESS when the database is (or has become) primary and
# MASTER was written to $STATE_FILE, OCF_ERR_GENERIC otherwise.
#
db2_promote() {
    # validate ensured that dblist contains only one entry
    local db=$dblist
    local i hadr output force

    # we run this twice as after a crash of the other node
    # within HADR_TIMEOUT the status may be still reported as Peer
    # although a connection no longer exists

    for i in 1 2
    do
        # start every attempt without a force option so that a value
        # chosen for the first attempt cannot leak into the second
        force=""

        hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC
        ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be promoted"

        case "$hadr" in
            Standard/Standalone)
                # this case only to keep ocf-tester happy
                return $OCF_SUCCESS
                ;;

            PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer)
                # nothing to do, only update pacemaker's view
                echo MASTER > $STATE_FILE
                return $OCF_SUCCESS
                ;;

            STANDBY/PEER/CONNECTED|Standby/Peer)
                # must take over
                ;;

            STANDBY/*PEER/DISCONNECTED|Standby/DisconnectedPeer)
                # must take over by force peer window only
                force="by force peer window only"
                ;;

            STANDBY/REMOTE_CATCHUP_PENDING/DISCONNECTED)
                # must take over by force
                force="by force"
                ;;

            *)
                return $OCF_ERR_GENERIC
        esac

        if output=$(runasdb2 db2 takeover hadr on db $db $force)
        then
            # update pacemaker's view
            echo MASTER > $STATE_FILE

            return $OCF_SUCCESS
        fi

        case "$output" in
            SQL1770N*"Reason code = \"7\""*)
                # expected, HADR_TIMEOUT is now expired
                # go for the second try
                continue
                ;;

            *)
                ocf_log err "DB2 database $instance($db2node)/$db promote failed: $output"
                return $OCF_ERR_GENERIC
        esac
    done

    return $OCF_ERR_GENERIC
}

#
# Demote db to standby
#
# Only pacemaker's view is changed (SLAVE is written to $STATE_FILE);
# the result of a subsequent db2_monitor run is returned.
#
db2_demote() {
    # validate ensured that dblist contains only one entry
    local db=$dblist
    local hadr

    # house keeping, set pacemaker's view to slave
    echo SLAVE > $STATE_FILE

    hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC
    ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be demoted"

    db2_monitor
    return $?
}

########
# Main #
########
# meta-data and usage must work without any validation
case "$__OCF_ACTION" in
    meta-data)
        db2_meta_data
        exit $OCF_SUCCESS
        ;;

    usage)
        db2_usage
        exit $OCF_SUCCESS
        ;;
esac

# node and instance names used to build the reintegration attribute names
local_host=$(ocf_local_nodename)
inst1=$(echo "$OCF_RESKEY_instance" | cut -d"," -f1)
inst2=$(echo "$OCF_RESKEY_instance" | cut -d"," -f2)

# the remote host is the first node in the sorted node list that is not
# the local one (only the first two entries are looked at)
host1=$(crm_node -l | sort | awk '{print $2;}' | sed -n 1p)
if [ "$host1" = "$local_host" ]; then
    remote_host=$(crm_node -l | sort | awk '{print $2;}' | sed -n 2p)
else
    remote_host="$host1"
fi

db2_validate; validate_rc=$?

case "$__OCF_ACTION" in
    start)
        db2_start || exit $?
        db2_monitor
        ;;

    stop)
        db2_stop
        ;;

    promote)
        db2_promote
        ;;

    demote)
        db2_demote
        ;;

    notify)
        ocf_log debug "notify-action has been DEPRECATED, and should be removed"
        ;;

    monitor)
        db2_monitor_retry
        ;;

    validate-all)
        exit $validate_rc
        ;;

    *)
        db2_usage
        exit $OCF_ERR_UNIMPLEMENTED
esac

exit $?