diff --git a/heartbeat/pgsql b/heartbeat/pgsql index 8b6ea10be..d16c96bb4 100755 --- a/heartbeat/pgsql +++ b/heartbeat/pgsql @@ -1,688 +1,1767 @@ #!/bin/sh # # Description: Manages a PostgreSQL Server as an OCF High-Availability # resource # # Authors: Serge Dubrouski (sergeyfd@gmail.com) -- original RA # Florian Haas (florian@linbit.com) -- makeover +# Takatoshi MATSUO (matsuo.tak@gmail.com) -- support replication # -# Copyright: 2006-2010 Serge Dubrouski +# Copyright: 2006-2012 Serge Dubrouski # and other Linux-HA contributors # License: GNU General Public License (GPL) # ############################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # # Get PostgreSQL Configuration parameter # get_pgsql_param() { local config local param_name param_name=$1 #Check that config file exists if [ -n "$OCF_RESKEY_config" ]; then config=$OCF_RESKEY_config else config=$OCF_RESKEY_pgdata/postgresql.conf fi check_config "$config" [ $? -eq 0 ] || return perl_code="if (/^\s*$param_name[\s=]+\s*(.*)$/) { \$dir=\$1; \$dir =~ s/\s*\#.*//; \$dir =~ s/^'(\S*)'/\$1/; print \$dir;}" perl -ne "$perl_code" < $config } # Defaults OCF_RESKEY_pgctl_default=/usr/bin/pg_ctl OCF_RESKEY_psql_default=/usr/bin/psql OCF_RESKEY_pgdata_default=/var/lib/pgsql/data OCF_RESKEY_pgdba_default=postgres OCF_RESKEY_pghost_default="" OCF_RESKEY_pgport_default=5432 OCF_RESKEY_config_default="" OCF_RESKEY_start_opt_default="" OCF_RESKEY_pgdb_default=template1 OCF_RESKEY_logfile_default=/dev/null OCF_RESKEY_stop_escalate_default=30 OCF_RESKEY_monitor_user_default="" OCF_RESKEY_monitor_password_default="" OCF_RESKEY_monitor_sql_default="select now();" +# Defaults for replication +OCF_RESKEY_rep_mode_default=none +OCF_RESKEY_node_list_default="" +OCF_RESKEY_restore_command_default="" +OCF_RESKEY_master_ip_default="" +OCF_RESKEY_repuser_default="postgres" +OCF_RESKEY_primary_conninfo_opt_default="" +OCF_RESKEY_tmpdir_default="/var/lib/pgsql/tmp" +OCF_RESKEY_xlog_check_count_default="3" +OCF_RESKEY_crm_attr_timeout_default="5" +OCF_RESKEY_stop_escalate_in_slave_default=30 : ${OCF_RESKEY_pgctl=${OCF_RESKEY_pgctl_default}} : ${OCF_RESKEY_psql=${OCF_RESKEY_psql_default}} : ${OCF_RESKEY_pgdata=${OCF_RESKEY_pgdata_default}} : ${OCF_RESKEY_pgdba=${OCF_RESKEY_pgdba_default}} : ${OCF_RESKEY_pghost=${OCF_RESKEY_pghost_default}} : ${OCF_RESKEY_pgport=${OCF_RESKEY_pgport_default}} : ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} : ${OCF_RESKEY_start_opt=${OCF_RESKEY_start_opt_default}} : ${OCF_RESKEY_pgdb=${OCF_RESKEY_pgdb_default}} : ${OCF_RESKEY_logfile=${OCF_RESKEY_logfile_default}} : ${OCF_RESKEY_stop_escalate=${OCF_RESKEY_stop_escalate_default}} : ${OCF_RESKEY_monitor_user=${OCF_RESKEY_monitor_user_default}} : ${OCF_RESKEY_monitor_password=${OCF_RESKEY_monitor_password_default}} : ${OCF_RESKEY_monitor_sql=${OCF_RESKEY_monitor_sql_default}} +# for replication +: ${OCF_RESKEY_rep_mode=${OCF_RESKEY_rep_mode_default}} +: ${OCF_RESKEY_node_list=${OCF_RESKEY_node_list_default}} +: ${OCF_RESKEY_restore_command=${OCF_RESKEY_restore_command_default}} +: ${OCF_RESKEY_master_ip=${OCF_RESKEY_master_ip_default}} +: ${OCF_RESKEY_repuser=${OCF_RESKEY_repuser_default}} +: ${OCF_RESKEY_primary_conninfo_opt=${OCF_RESKEY_primary_conninfo_opt_default}} +: ${OCF_RESKEY_tmpdir=${OCF_RESKEY_tmpdir_default}} +: ${OCF_RESKEY_xlog_check_count=${OCF_RESKEY_xlog_check_count_default}} +: ${OCF_RESKEY_crm_attr_timeout=${OCF_RESKEY_crm_attr_timeout_default}} +: ${OCF_RESKEY_stop_escalate_in_slave=${OCF_RESKEY_stop_escalate_in_slave_default}} usage() { cat < 1.0 Resource script for PostgreSQL. It manages a PostgreSQL as an HA resource. Manages a PostgreSQL database instance Path to pg_ctl command. pgctl Start options (-o start_opt in pg_ctl). "-i -p 5432" for example. start_opt Additional pg_ctl options (-w, -W etc..). ctl_opt Path to psql command. psql Path to PostgreSQL data directory. pgdata User that owns PostgreSQL. pgdba Hostname/IP address where PostgreSQL is listening pghost Port where PostgreSQL is listening pgport PostgreSQL user that pgsql RA will user for monitor operations. If it's not set pgdba user will be used. monitor_user Password for monitor user. monitor_password SQL script that will be used for monitor operations. monitor_sql - Path to the PostgreSQL configuration file for the instance Configuration file Database that will be used for monitoring. pgdb Path to PostgreSQL server log output file. logfile Unix socket directory for PostgeSQL socketdir Number of shutdown retries (using -m fast) before resorting to -m immediate stop escalation + + + +Replication mode(none(default)/async/sync). +"async" and "sync" require PostgreSQL 9.1 or later. +If you use async or sync, it requires node_list, master_ip, restore_command +parameters, and needs setting postgresql.conf, pg_hba.conf up for +replication. +Please delete "include /../../rep_mode.conf" line in postgresql.conf +when you switch from sync to async. + +rep_mode + + + + + +All node names. Please separate each node name with a space. +This is required for replication. + +node list + + + + + +restore_command for recovery.conf. +This is required for replication. + +restore_command + + + + + +Master's floating IP address to be connected from hot standby. +This parameter is used for "primary_conninfo" in recovery.conf. +This is required for replication. + +master ip + + + + + +User used to connect to the master server. +This parameter is used for "primary_conninfo" in recovery.conf. +This is required for replication. + +repuser + + + + + +primary_conninfo options of recovery.conf except host, port, user and application_name. +This is optional for replication. + +primary_conninfo_opt + + + + + +Path to temporary directory. +This is optional for replication. + +tmpdir + + + + + +Number of checking xlog on monitor before promote. +This is optional for replication. + +xlog check count + + + + + +The timeout of crm_attribute forever update command. +Default value is 5 seconds. +This is optional for replication. + +The timeout of crm_attribute forever update command. + + + + + +Number of shutdown retries (using -m fast) before resorting to -m immediate +in Slave state. +This is optional for replication. + +stop escalation_in_slave + + + + + + EOF } # -# Run the given command in the Resource owner environment... +# Run the given command in the Resource owner environment... # runasowner() { local quietrun="" local loglevel="-err" local var for var in 1 2 do case "$1" in "-q") quietrun="-q" shift 1;; "warn"|"err") loglevel="-$1" shift 1;; *) ;; esac done ocf_run $quietrun $loglevel su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; $*" } # # Shell escape # escape_string() { echo "$*" | sed -e "s/'/'\\\\''/g" } # # methods: What methods/operations do we support? # pgsql_methods() { - cat </dev/null 2>&1" return $? fi # No PID file false } # -# pgsql_monitor +# pgsql_real_monitor # -pgsql_monitor() { +pgsql_real_monitor() { local loglevel - local psql_options local rc + local output # Set the log level of the error message loglevel=${1:-err} if ! pgsql_status then - ocf_log info "PostgreSQL is down" - return $OCF_NOT_RUNNING + ocf_log info "PostgreSQL is down" + return $OCF_NOT_RUNNING + fi + + if is_replication; then + #Check replication state + output=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ + $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ + -Atc \"${CHECK_MS_SQL}\""` + rc=$? + if [ $rc -ne 0 ]; then + report_psql_error $rc $loglevel + return $OCF_ERR_GENERIC + fi + + case "$output" in + f) ocf_log debug "PostgreSQL is running as a primary." + if [ "$OCF_RESKEY_monitor_sql" = "$OCF_RESKEY_monitor_sql_default" ]; then + return $OCF_RUNNING_MASTER + fi + ;; + + t) ocf_log debug "PostgreSQL is running as a hot standby." + return $OCF_SUCCESS;; + + *) ocf_log err "$CHECK_MS_SQL output is $output" + return $OCF_ERR_GENERIC;; + esac + fi + + OCF_RESKEY_monitor_sql=`escape_string "$OCF_RESKEY_monitor_sql"` + runasowner -q $loglevel "$OCF_RESKEY_psql $psql_options \ + -c '$OCF_RESKEY_monitor_sql'" + rc=$? + if [ $rc -ne 0 ]; then + report_psql_error $rc $loglevel + return $OCF_ERR_GENERIC + fi + + if is_replication; then + return $OCF_RUNNING_MASTER fi + return $OCF_SUCCESS +} - if [ -n "$OCF_RESKEY_monitor_user" ]; then - PGUSER=$OCF_RESKEY_monitor_user; export PGUSER - PGPASSWORD=$OCF_RESKEY_monitor_password; export PGPASSWORD - psql_options="-p $OCF_RESKEY_pgport $OCF_RESKEY_pgdb" +pgsql_replication_monitor() { + local rc + local rsc + local instance + local my_status + local data_status + local is_master="" + + rc=$1 + if [ $rc -ne $OCF_SUCCESS -a $rc -ne "$OCF_RUNNING_MASTER" ]; then + return $rc + fi + # If I am Master + if [ $rc -eq $OCF_RUNNING_MASTER ]; then + change_data_status "$NODENAME" "LATEST" + change_pgsql_status "$NODENAME" "PRI" + control_slave_status || return $OCF_ERR_GENERIC + return $rc + fi + + # I can't get master node name from $OCF_RESKEY_CRM_meta_notify_master_uname on monitor, + # so I will get master node name using crm_mon -n + if output=`crm_mon -n1 | grep " Master"`; then + rsc=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1` + instance=0 + while : + do + if [ "$instance" -ge "$OCF_RESKEY_CRM_meta_clone_max" ]; then + break + fi + if echo "$output" | grep "${rsc}:${instance}"; then + is_master="yes" + break + fi + instance=`expr $instance + 1` + done + fi + + if [ ! -n "$is_master" ]; then + # If I am Slave and Master is not exist + ocf_log info "Master does not exist." + change_pgsql_status "$NODENAME" "HS:alone" + have_master_right + if [ $? -eq 0 ]; then + rm -f ${XLOG_NOTE_FILE}.* + fi else - psql_options="-p $OCF_RESKEY_pgport -U $OCF_RESKEY_pgdba $OCF_RESKEY_pgdb" + output=`$CRM_ATTR_FOREVER -N "$NODENAME" \ + -n "$PGSQL_DATA_STATUS_ATTR" -G -q` + if [ "$output" = "DISCONNECT" ]; then + change_pgsql_status "$NODENAME" "HS:alone" + fi fi + return $rc +} - if [ -n "$OCF_RESKEY_pghost" ]; then - psql_options="$psql_options -h $OCF_RESKEY_pghost" +#pgsql_monitor: pgsql_real_monitor() wrapper for replication +pgsql_monitor() { + local rc + + pgsql_real_monitor + rc=$? + if ! is_replication; then + return $rc + else + pgsql_replication_monitor $rc + return $? + fi +} + +# pgsql_post_demote +pgsql_post_demote() { + DEMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname | sed "s/ /\n/g" | head -1` + ocf_log debug "post-demote called. Demote uname is $DEMOTE_NODE" + if [ "$DEMOTE_NODE" != "$NODENAME" ]; then + if ! echo $OCF_RESKEY_CRM_meta_notify_master_uname | grep $NODENAME; then + show_master_baseline + change_pgsql_status "$NODENAME" "HS:alone" + fi + fi + return $OCF_SUCCESS +} + +pgsql_pre_promote() { + local master_baseline + local my_master_baseline + local cmp_location + local number_of_nodes + + # If my data is newer than new master's one, I fail my resource. + PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ + sed "s/ /\n/g" | head -1` + number_of_nodes=`echo $OCF_RESKEY_node_list | \ + sed -e "s/ */ /g" | sed -e "s/^ \| $//g" | \ + sed -e "s/ /\n/g" | wc -l` + if [ $number_of_nodes -ge 3 -a \ + "$OCF_RESKEY_rep_mode" = "sync" -a \ + "$PROMOTE_NODE" != "$NODENAME" ]; then + master_baseline=`$CRM_ATTR_REBOOT -N "$PROMOTE_NODE" -n \ + "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` + if [ $? -eq 0 ]; then + my_master_baseline=`$CRM_ATTR_REBOOT -N "$NODENAME" -n \ + "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` + # get older location + cmp_location=`printf "$master_baseline\n$my_master_baseline\n" |\ + sort | head -1` + if [ "$cmp_location" != "$my_master_baseline" ]; then + ocf_log err "My data is newer than new master's one. New master's location : $master_baseline" + $CRM_FAILCOUNT -r $OCF_RESOURCE_INSTANCE -U $NODENAME -v INFINITY + return $OCF_ERR_GENERIC + fi + fi + fi + return $OCF_SUCCESS +} + +pgsql_notify() { + local type="${OCF_RESKEY_CRM_meta_notify_type}" + local op="${OCF_RESKEY_CRM_meta_notify_operation}" + local rc + + if ! is_replication; then + return $OCF_SUCCESS + fi + + ocf_log debug "notify: ${type} for ${op}" + case $type in + pre) + case $op in + promote) + pgsql_pre_promote + return $? + ;; + esac + ;; + post) + case $op in + promote) + delete_xlog_location + PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ + sed "s/ /\n/g" | head -1` + if [ "$PROMOTE_NODE" != "$NODENAME" ]; then + delete_master_baseline + fi + return $OCF_SUCCESS + ;; + demote) + pgsql_post_demote + return $? + ;; + start|stop) + if [ "$NODENAME " = "$OCF_RESKEY_CRM_meta_notify_master_uname" ]; then + control_slave_status + fi + return $OCF_SUCCESS + ;; + esac + ;; + esac + return $OCF_SUCCESS +} + +control_slave_status() { + local rc + local data_status + local target + local all_data_status + local tmp_data_status + local node_name + local number_of_nodes + + all_data_status=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ + $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ + -Atc \"${CHECK_REPLICATION_STATE_SQL}\""` + rc=$? + if [ $rc -eq 0 ]; then + if [ -n "$all_data_status" ]; then + all_data_status=`echo $all_data_status | sed "s/\n/ /g"` + fi else - if [ -n "$OCF_RESKEY_socketdir" ]; then - psql_options="$psql_options -h $OCF_RESKEY_socketdir" + report_psql_error $rc warn + return 1 + fi + + number_of_nodes=`echo $OCF_RESKEY_node_list | \ + sed -e "s/ */ /g" | sed -e "s/^ \| $//g" | \ + sed -e "s/ /\n/g" | wc -l` + for target in $OCF_RESKEY_node_list; do + if [ "$target" = "$NODENAME" ]; then + continue + fi + + data_status="DISCONNECT" + if [ -n "$all_data_status" ]; then + for tmp_data_status in $all_data_status; do + node_name=`echo $tmp_data_status | cut -d "|" -f 1` + state=`echo $tmp_data_status | cut -d "|" -f 2` + sync_state=`echo $tmp_data_status | cut -d "|" -f 3` + ocf_log debug "node=$node_name, state=$state, sync_state=$sync_state" + if [ "$node_name" = "$target" ];then + data_status="$state|$sync_state" + break + fi + done + fi + + case "$data_status" in + "STREAMING|SYNC") + change_data_status "$target" "$data_status" + change_master_score "$target" "$CAN_PROMOTE" + change_pgsql_status "$target" "HS:sync" + ;; + "STREAMING|ASYNC") + change_data_status "$target" "$data_status" + if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then + change_master_score "$target" "$CAN_NOT_PROMOTE" + if ! is_sync_mode "$target"; then + set_sync_mode "$target" + fi + else + if [ $number_of_nodes -le 2 ]; then + change_master_score "$target" "$CAN_PROMOTE" + else + # I can't determine which slave's data is newest in async mode. + change_master_score "$target" "$CAN_NOT_PROMOTE" + fi + fi + change_pgsql_status "$target" "HS:async" + ;; + "STREAMING|POTENTIAL") + change_data_status "$target" "$data_status" + change_master_score "$target" "$CAN_NOT_PROMOTE" + change_pgsql_status "$target" "HS:potential" + ;; + "DISCONNECT") + change_data_status "$target" "$data_status" + change_master_score "$target" "$CAN_NOT_PROMOTE" + if [ "$OCF_RESKEY_rep_mode" = "sync" ] && \ + is_sync_mode "$target"; then + set_async_mode "$target" + fi + ;; + *) + change_data_status "$target" "$data_status" + change_master_score "$target" "$CAN_NOT_PROMOTE" + if [ "$OCF_RESKEY_rep_mode" = "sync" ] && \ + is_sync_mode "$target"; then + set_async_mode "$target" + fi + change_pgsql_status "$target" "HS:connected" + ;; + esac + done + return 0 +} + +have_master_right() { + local old + local new + local output + local data_status + local node + local mylocation + local count + local newestXlog + local oldfile + local newfile + + ocf_log debug "Checking if I have a master right." + + data_status=`$CRM_ATTR_FOREVER -N "$NODENAME" -n \ + "$PGSQL_DATA_STATUS_ATTR" -G -q` + if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then + if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \ + "$data_status" != "LATEST" ]; then + ocf_log warn "My data is out-of-date. status=$data_status" + return 1 + fi + else + if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \ + "$data_status" != "STREAMING|ASYNC" -a \ + "$data_status" != "LATEST" ]; then + ocf_log warn "My data is out-of-date. status=$data_status" + return 1 + fi + fi + ocf_log info "My data status=$data_status." + + show_xlog_location + if [ $? -ne 0 ]; then + ocf_log err "Failed to show my xlog location." + exit $OCF_ERR_GENERIC + fi + + old=0 + for count in `seq $OCF_RESKEY_xlog_check_count`; do + if [ -f ${XLOG_NOTE_FILE}.$count ]; then + old=$count + continue fi + break + done + new=`expr $old + 1` + + # get xlog locations of all nodes + for node in ${OCF_RESKEY_node_list}; do + output=`$CRM_ATTR_REBOOT -N "$node" -n \ + "$PGSQL_XLOG_LOC_NAME" -G -q 2>/dev/null` + if [ $? -ne 0 ]; then + ocf_log warn "Can't get $node xlog location." + continue + else + ocf_log info "$node xlog location : $output" + echo "$node $output" >> ${XLOG_NOTE_FILE}.${new} + if [ "$node" = "$NODENAME" ]; then + mylocation=$output + fi + fi + done + + oldfile=`cat ${XLOG_NOTE_FILE}.${old} 2>/dev/null` + newfile=`cat ${XLOG_NOTE_FILE}.${new} 2>/dev/null` + if [ "$oldfile" != "$newfile" ]; then + # reset counter + rm -f ${XLOG_NOTE_FILE}.* + printf "$newfile\n" > ${XLOG_NOTE_FILE}.0 + return 1 fi - OCF_RESKEY_monitor_sql=`escape_string "$OCF_RESKEY_monitor_sql"` - runasowner -q $loglevel "$OCF_RESKEY_psql $psql_options -c '$OCF_RESKEY_monitor_sql'" + if [ "$new" -ge "$OCF_RESKEY_xlog_check_count" ]; then + newestXlog=`printf "$newfile\n" | sort -t " " -k 2,3 -r | \ + head -1 | cut -d " " -f 2` + if [ "$newestXlog" = "$mylocation" ]; then + ocf_log info "I have a master right." + $CRM_MASTER -v $PROMOTE_ME + return 0 + fi + change_data_status "$NODENAME" "DISCONNECT" + ocf_log info "I don't have correct master data." + # reset counter + rm -f ${XLOG_NOTE_FILE}.* + printf "$newfile\n" > ${XLOG_NOTE_FILE}.0 + fi + return 1 +} + +is_replication() { + if [ "$OCF_RESKEY_rep_mode" != "none" ]; then + return 0 + fi + return 1 +} + +get_my_location() { + local rc + local output + local replay_loc + local receive_loc + local output1 + local output2 + local log1 + local log2 + local newer_location + + output=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ + $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ + -Atc \"${CHECK_XLOG_LOC_SQL}\""` rc=$? - if [ $rc -ne 0 ]; then - ocf_log $loglevel "PostgreSQL $OCF_RESKEY_pgdb isn't running" - if [ $rc -eq 1 ]; then - ocf_log err "Fatal error (out of memory, file not found, etc.) occurred while executing the psql command." - elif [ $rc -eq 2 ]; then - ocf_log $loglevel "Connection error (connection to the server went bad and the session was not interactive) occurred while executing the psql command." - elif [ $rc -eq 3 ]; then - ocf_log err "Script error (the variable ON_ERROR_STOP was set) occurred while executing the psql command." + if [ $rc -ne 0 ]; then + report_psql_error $rc warn + ocf_log err "Can't get my xlog location." + return 1 + fi + replay_loc=`echo $output | cut -d "|" -f 1` + receive_loc=`echo $output | cut -d "|" -f 2` + + output1=`echo "$replay_loc" | cut -d "/" -f 1` + output2=`echo "$replay_loc" | cut -d "/" -f 2` + log1=`printf "%08s\n" $output1 | sed "s/ /0/g"` + log2=`printf "%08s\n" $output2 | sed "s/ /0/g"` + replay_loc="${log1}${log2}" + + output1=`echo "$receive_loc" | cut -d "/" -f 1` + output2=`echo "$receive_loc" | cut -d "/" -f 2` + log1=`printf "%08s\n" $output1 | sed "s/ /0/g"` + log2=`printf "%08s\n" $output2 | sed "s/ /0/g"` + receive_loc="${log1}${log2}" + + newer_location=`printf "$replay_loc\n$receive_loc" | sort -r | head -1` + echo "$newer_location" + return 0 +} + +show_xlog_location() { + local location + + location=`get_my_location` || return 1 + $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -v "$location" + return $? +} + +delete_xlog_location() { + $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -D + return $? +} + +show_master_baseline() { + local rc + local location + + runasowner -q err "$OCF_RESKEY_psql $psql_options \ + -U $OCF_RESKEY_pgdba -c 'CHECKPOINT'" + rc=$? + if [ $rc -ne 0 ]; then + report_psql_error $rc warn + fi + location=`get_my_location` + ocf_log info "My master baseline : $location." + $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -v "$location" + return $? +} + +delete_master_baseline() { + $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -D + return $? +} + +set_async_mode_all() { + [ "$OCF_RESKEY_rep_mode" = "sync" ] || return 0 + ocf_log info "Set all nodes into async mode." + runasowner -q err "echo "" > \"$REP_MODE_CONF\"" + if [ $? -ne 0 ]; then + ocf_log err "Can't set all nodes into async mode." + return 1 + fi + return 0 +} + +set_async_mode() { + local sync_node_in_conf + + sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2` + if [ -n "$sync_node_in_conf" ]; then + if ! echo $sync_node_in_conf | grep "$1"; then + ocf_log info "$1 is already in async mode." + return 0 + else + ocf_log info "Setup $1 into async mode." + sync_node_in_conf=`echo $sync_node_in_conf | sed "s/$1//g" |\ + sed "s/^,//g" | sed "s/,,/,/g" | sed "s/,$//g"` + if [ -n $sync_node_in_conf ]; then + echo "synchronous_standby_names = '$sync_node_in_conf'" > "$REP_MODE_CONF" + else + echo "" > "$REP_MODE_CONF" + fi fi - return $OCF_ERR_GENERIC + else + ocf_log info "$1 is already in async mode." + return 0 fi - return $OCF_SUCCESS + ocf_log info "All synced nodes : \"$sync_node_in_conf\"" + reload_conf + return $? +} + +set_sync_mode() { + local sync_node_in_conf + + sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2` + if [ -n "$sync_node_in_conf" ]; then + if echo "$sync_node_in_conf" | grep "$1"; then + ocf_log info "$1 is already in sync mode." + return 0 + else + ocf_log info "Setup $1 into sync mode." + echo "synchronous_standby_names = '$sync_node_in_conf,$1'" > "$REP_MODE_CONF" + fi + else + ocf_log info "Setup $1 into sync mode." + echo "synchronous_standby_names = '$1'" > "$REP_MODE_CONF" + fi + + sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2` + ocf_log info "All synced nodes : \"$sync_node_in_conf\"" + reload_conf + return $? +} + +is_sync_mode() { + local target + + sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2 | sed "s/,/ /g"` + if [ -n "$sync_node_in_conf" ]; then + for target in $sync_node_in_conf; do + if [ "$target" = "$1" ];then + return 0 + fi + done + fi + return 1 +} + +reload_conf() { + # Invoke pg_ctl + runasowner "$OCF_RESKEY_pgctl -D $OCF_RESKEY_pgdata reload" + if [ $? -eq 0 ]; then + ocf_log info "Reload configuration file." + else + ocf_log err "Can't reload configuration file." + return 1 + fi + + return 0 +} + +make_recovery_conf() { + runasowner "touch $RECOVERY_CONF" + if [ $? -ne 0 ]; then + ocf_log err "Can't create recovery.conf." + return 1 + fi + +cat > $RECOVERY_CONF </dev/null` + if [ "$output" != "$2" ]; then + # If slave's disk is broken, RA cannot read PID file + # and misjudges the PostgreSQL as down while it is running. + # It causes overwriting of pgsql-status by Master because replication is still connected. + if [ "$output" = "STOP" -o "$output" = "UNKNOWN" ]; then + if [ "$1" != "$NODENAME" ]; then + ocf_log warn "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2 by $NODENAME is prohibited." + return 0 + fi + fi + ocf_log info "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2." + $CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -v "$2" + if [ $? -ne 0 ]; then + ocf_log err "Can't change $PGSQL_STATUS_ATTR." + return 1 + fi + fi + return 0 +} + +# change pgsql-data-status. +# arg1:node, arg2: value +change_data_status() { + local output + + if ! node_exist $1; then + return 0 + fi + + while : + do + output=`$CRM_ATTR_FOREVER -N "$1" -n "$PGSQL_DATA_STATUS_ATTR" -G -q 2>/dev/null` + if [ "$output" != "$2" ]; then + ocf_log info "Changing $PGSQL_DATA_STATUS_ATTR on $1 : $output->$2." + exec_func_with_timeout "$CRM_ATTR_FOREVER" "-N $1 -n \ + $PGSQL_DATA_STATUS_ATTR -v \"$2\"" \ + $OCF_RESKEY_crm_attr_timeout + if [ $? -ne 0 ]; then + ocf_log err "Can't change $PGSQL_DATA_STATUS_ATTR." + return 1 + fi + else + break + fi + done + return 0 +} + +# change master-score +# arg1:node, arg2: score +change_master_score() { + local rsc + local instance + local current_score + + if ! is_node_online $1; then + return 0 + fi + + rsc=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1` + instance=0 + while : + do + if [ "$instance" -ge "$OCF_RESKEY_CRM_meta_clone_max" ]; then + break + fi + if [ "${rsc}:${instance}" = "$OCF_RESOURCE_INSTANCE" ]; then + instance=`expr $instance + 1` + continue + fi + + current_score=`$CRM_ATTR_REBOOT -N "$1" -n "master-${rsc}:${instance}" -G -q 2>/dev/null` + if [ -n "$current_score" -a "$current_score" != "$2" ]; then + ocf_log info "Changing ${rsc}:${instance} master score on $1 : $current_score->$2." + $CRM_ATTR_REBOOT -N "$target" -n "master-${rsc}:${instance}" -v "$2" + if [ $? -ne 0 ]; then + ocf_log err "Can't change master score." + return 1 + fi + fi + instance=`expr $instance + 1` + done + return 0 +} + +report_psql_error() +{ + local rc + local loglevel + + rc=$1 + loglevel=${2:-err} + + ocf_log $loglevel "PostgreSQL $OCF_RESKEY_pgdb isn't running" + if [ $rc -eq 1 ]; then + ocf_log err "Fatal error (out of memory, file not found, etc.) occurred while executing the psql command." + elif [ $rc -eq 2 ]; then + ocf_log $loglevel "Connection error (connection to the server went bad and the session was not interactive) occurred while executing the psql command." + elif [ $rc -eq 3 ]; then + ocf_log err "Script error (the variable ON_ERROR_STOP was set) occurred while executing the psql command." + fi +} + +# +# timeout management function +# arg1 : command +# arg2 : command's args +# arg3 : timeout(s) +# +exec_func_with_timeout() { + local func_pid + local count + local rc + + $1 `eval echo $2` & + func_pid=$! + count=0 + while kill -s 0 $func_pid >/dev/null 2>&1; do + sleep 1 + count=`expr $count + 1` + if [ $count -ge $3 ]; then + ocf_log debug "Execute $1 time out." + kill -s 9 $func_pid >/dev/null 2>&1 + return 0 + fi + done + wait $func_pid + rc=$? + return $rc +} + +is_node_online() { + crm_mon -1 -n | grep -e "^Node $1 " -e "^Node $1:" | grep -q -v "OFFLINE" + return $? +} + +node_exist() { + crm_mon -1 -n | grep -q "^Node $1" + return $? } check_binary2() { if ! have_binary "$1"; then ocf_log err "Setup problem: couldn't find command: $1" return 1 fi return 0 } check_config() { local rc=0 if [ ! -f "$1" ]; then if ocf_is_probe; then - ocf_log info "Configuration file $1 not readable during probe." + ocf_log info "Configuration file is $1 not readable during probe." rc=1 else ocf_log err "Configuration file $1 doesn't exist" rc=2 fi fi return $rc } # Validate most critical parameters pgsql_validate_all() { + local version + if ! check_binary2 "$OCF_RESKEY_pgctl" || ! check_binary2 "$OCF_RESKEY_psql"; then return $OCF_ERR_INSTALLED fi if [ -n "$OCF_RESKEY_config" -a ! -f "$OCF_RESKEY_config" ]; then check_config "$OCF_RESKEY_config" [ $? -eq 2 ] && return $OCF_ERR_INSTALLED fi getent passwd $OCF_RESKEY_pgdba >/dev/null 2>&1 if [ ! $? -eq 0 ]; then ocf_log err "User $OCF_RESKEY_pgdba doesn't exist"; return $OCF_ERR_INSTALLED; fi if ocf_is_probe; then ocf_log info "Don't check $OCF_RESKEY_pgdata during probe" else if ! runasowner "test -w $OCF_RESKEY_pgdata"; then ocf_log err "Directory $OCF_RESKEY_pgdata is not writable by $OCF_RESKEY_pgdba" return $OCF_ERR_PERM; fi fi if [ -n "$OCF_RESKEY_monitor_user" -a ! -n "$OCF_RESKEY_monitor_password" ] then ocf_log err "monitor password can't be empty" return $OCF_ERR_CONFIGURED fi if [ ! -n "$OCF_RESKEY_monitor_user" -a -n "$OCF_RESKEY_monitor_password" ] then ocf_log err "monitor_user has to be set if monitor_password is set" return $OCF_ERR_CONFIGURED fi + if is_replication; then + version=`cat $OCF_RESKEY_pgdata/PG_VERSION` + if [ `printf "$version\n9.1" | sort -n | head -1` != "9.1" ]; then + ocf_log err "Replication mode needs PostgreSQL 9.1 or higher." + return $OCF_ERR_INSTALLED + fi + if ! ocf_is_ms; then + ocf_log err "Replication requires Master/Slave configuration." + return $OCF_ERR_CONFIGURED + fi + if [ ! "$OCF_RESKEY_rep_mode" = "sync" -a ! "$OCF_RESKEY_rep_mode" = "async" ]; then + ocf_log err "Invalid rep_mode : $OCF_RESKEY_rep_mode" + return $OCF_ERR_CONFIGURED + fi + if [ ! -n "$OCF_RESKEY_master_ip" ]; then + ocf_log err "master_ip can't be empty." + return $OCF_ERR_CONFIGURED + fi + if [ ! -n "$OCF_RESKEY_node_list" ]; then + ocf_log err "node_list can't be empty." + return $OCF_ERR_CONFIGURED + fi + if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then + if ! grep -q "include '$REP_MODE_CONF' # added by pgsql RA" $OCF_RESKEY_pgdata/postgresql.conf; then + echo "include '$REP_MODE_CONF' # added by pgsql RA" >> $OCF_RESKEY_pgdata/postgresql.conf + fi + fi + if ! mkdir -p $OCF_RESKEY_tmpdir || ! chown $OCF_RESKEY_pgdba $OCF_RESKEY_tmpdir || ! chmod 700 $OCF_RESKEY_tmpdir; then + ocf_log err "Can't create directory $OCF_RESKEY_tmpdir or it is not readable by $OCF_RESKEY_pgdba" + return $OCF_ERR_PERM + fi + fi + return $OCF_SUCCESS } # # Check if we need to create a log file # check_log_file() { if [ ! -f "$1" ] then touch $1 > /dev/null 2>&1 chown $OCF_RESKEY_pgdba:`getent passwd $OCF_RESKEY_pgdba | cut -d ":" -f 4` $1 fi #Check if $OCF_RESKEY_pgdba can write to the log file if ! runasowner "test -w $1" then return 1 fi return 0 } # # Check socket directory # check_socket_dir() { if [ ! -d "$OCF_RESKEY_socketdir" ]; then if ! mkdir "$OCF_RESKEY_socketdir"; then - ocf_log err "Cannot create directory $OCF_RESKEY_socketdir" + ocf_log err "Can't create directory $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi if ! chown $OCF_RESKEY_pgdba:`getent passwd \ $OCF_RESKEY_pgdba | cut -d ":" -f 4` "$OCF_RESKEY_socketdir" then - ocf_log err "Cannot change ownership for $OCF_RESKEY_socketdir" + ocf_log err "Can't change ownership for $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi if ! chmod 2775 "$OCF_RESKEY_socketdir"; then - ocf_log err "Cannot change permissions for $OCF_RESKEY_socketdir" + ocf_log err "Can't change permissions for $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi else if ! runasowner "touch $OCF_RESKEY_socketdir/test.$$"; then - ocf_log err "$OCF_RESKEY_pgdba cannot create files in $OCF_RESKEY_socketdir" + ocf_log err "$OCF_RESKEY_pgdba can't create files in $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi rm $OCF_RESKEY_socketdir/test.$$ fi } # # 'main' starts here... # if [ $# -ne 1 ] then usage exit $OCF_ERR_GENERIC fi PIDFILE=${OCF_RESKEY_pgdata}/postmaster.pid BACKUPLABEL=${OCF_RESKEY_pgdata}/backup_label +if is_replication; then + RECOVERY_CONF=${OCF_RESKEY_pgdata}/recovery.conf + REP_MODE_CONF=${OCF_RESKEY_tmpdir}/rep_mode.conf + PGSQL_LOCK=${OCF_RESKEY_tmpdir}/PGSQL.lock + XLOG_NOTE_FILE=${OCF_RESKEY_tmpdir}/xlog_note + + CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot" + CRM_ATTR_REBOOT="${HA_SBIN_DIR}/crm_attribute -l reboot" + CRM_ATTR_FOREVER="${HA_SBIN_DIR}/crm_attribute -l forever" + CRM_FAILCOUNT="${HA_SBIN_DIR}/crm_failcount" + + CAN_NOT_PROMOTE="-INFINITY" + CAN_PROMOTE="100" + PROMOTE_ME="1000" + + CHECK_MS_SQL="select pg_is_in_recovery()" + CHECK_XLOG_LOC_SQL="select pg_last_xlog_replay_location(),pg_last_xlog_receive_location()" + CHECK_REPLICATION_STATE_SQL="select application_name,upper(state),upper(sync_state) from pg_stat_replication" + + PGSQL_STATUS_ATTR="pgsql-status" + PGSQL_DATA_STATUS_ATTR="pgsql-data-status" + PGSQL_XLOG_LOC_NAME="pgsql-xlog-loc" + PGSQL_MASTER_BASELINE="pgsql-master-baseline" + + NODENAME=`uname -n` + OPERATION=$1 +fi + case "$1" in methods) pgsql_methods exit $?;; meta-data) meta_data exit $OCF_SUCCESS;; esac # $OCF_RESKEY_pgdata has to be initialized at this momemnt : ${OCF_RESKEY_socketdir=`get_pgsql_param unix_socket_directory`} pgsql_validate_all rc=$? [ "$1" = "validate-all" ] && exit $rc if [ $rc -ne 0 ] then case "$1" in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $OCF_NOT_RUNNING;; *) exit $rc;; esac fi US=`id -u -n` if [ $US != root -a $US != $OCF_RESKEY_pgdba ] then ocf_log err "$0 must be run as root or $OCF_RESKEY_pgdba" exit $OCF_ERR_GENERIC fi +# make psql command options +if [ -n "$OCF_RESKEY_monitor_user" ]; then + PGUSER=$OCF_RESKEY_monitor_user; export PGUSER + PGPASSWORD=$OCF_RESKEY_monitor_password; export PGPASSWORD + psql_options="-p $OCF_RESKEY_pgport $OCF_RESKEY_pgdb" +else + psql_options="-p $OCF_RESKEY_pgport -U $OCF_RESKEY_pgdba $OCF_RESKEY_pgdb" +fi + +if [ -n "$OCF_RESKEY_pghost" ]; then + psql_options="$psql_options -h $OCF_RESKEY_pghost" +else + if [ -n "$OCF_RESKEY_socketdir" ]; then + psql_options="$psql_options -h $OCF_RESKEY_socketdir" + fi +fi + # What kind of method was invoked? case "$1" in status) if pgsql_status then ocf_log info "PostgreSQL is up" exit $OCF_SUCCESS else ocf_log info "PostgreSQL is down" exit $OCF_NOT_RUNNING fi;; monitor) pgsql_monitor exit $?;; start) pgsql_start exit $?;; + promote) pgsql_promote + exit $?;; + + demote) pgsql_demote + exit $?;; + + notify) pgsql_notify + exit $?;; + stop) pgsql_stop exit $?;; *) exit $OCF_ERR_UNIMPLEMENTED;; esac