Page MenuHomeClusterLabs Projects

No OneTemporary

diff --git a/heartbeat/galera.in b/heartbeat/galera.in
index b518595cb..b29d68bf7 100755
--- a/heartbeat/galera.in
+++ b/heartbeat/galera.in
@@ -1,1106 +1,1105 @@
#!@BASH_SHELL@
#
# Copyright (c) 2014 David Vossel <davidvossel@gmail.com>
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
##
# README.
#
# This agent only supports being configured as a promotable (multistate)
# resource.
#
# Unpromoted vs Promoted role:
#
# During the 'Unpromoted' role, galera instances are in read-only mode and
# will not attempt to connect to the cluster. This role exists only as
# a means to determine which galera instance is the most up-to-date. The
# most up-to-date node will be used to bootstrap a galera cluster that
# has no current members.
#
# The galera instances will only begin to be promoted to the Promoted role
# once all the nodes in the 'wsrep_cluster_address' connection address
# have entered read-only mode. At that point the node containing the
# database that is most current will be promoted to Promoted. Once the first
# Promoted instance bootstraps the galera cluster, the other nodes will be
# promoted to Promoted as well.
#
# Example: Create a galera cluster using nodes rhel7-auto1 rhel7-auto2 rhel7-auto3
#
# pcs resource create db galera enable_creation=true \
# wsrep_cluster_address="gcomm://rhel7-auto1,rhel7-auto2,rhel7-auto3" meta promoted-max=3 --promoted
#
# By setting the 'enable_creation' option, the database will be automatically
# generated at startup. The meta attribute 'promoted-max=3' means that all 3
# nodes listed in the wsrep_cluster_address list will be allowed to connect
# to the galera cluster and perform replication.
#
# NOTE: If you have more nodes in the pacemaker cluster than you wish
# to have in the galera cluster, make sure to use location constraints to prevent
# pacemaker from attempting to place a galera instance on a node that is
# not in the 'wsrep_cluster_address' list.
#
##
#######################################################################
# Initialization:
# Locate and load the OCF shell function library. OCF_FUNCTIONS_DIR is only
# given a value here when it is not already set.
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
# For every action except meta-data, also load the shared mysql helpers and
# record the node name returned by ocf_attribute_target in NODENAME.
if [ "$__OCF_ACTION" != "meta-data" ]; then
. ${OCF_FUNCTIONS_DIR}/mysql-common.sh
NODENAME=$(ocf_attribute_target)
fi
# Some galera installations keep the credentials of the check user
# (the account used to query status) in a clustercheck file. Source the
# first one that exists so its variables become visible to this script.
if [ -f "/etc/sysconfig/clustercheck" ]; then
. /etc/sysconfig/clustercheck
elif [ -f "/etc/default/clustercheck" ]; then
. /etc/default/clustercheck
fi
# Parameter defaults
#
# Each agent parameter gets a *_default value, and the parameter itself is
# assigned that default only when the caller left it unset.
OCF_RESKEY_wsrep_cluster_address_default=""
: "${OCF_RESKEY_wsrep_cluster_address=${OCF_RESKEY_wsrep_cluster_address_default}}"

OCF_RESKEY_cluster_host_map_default=""
: "${OCF_RESKEY_cluster_host_map=${OCF_RESKEY_cluster_host_map_default}}"

OCF_RESKEY_check_user_default=""
: "${OCF_RESKEY_check_user=${OCF_RESKEY_check_user_default}}"

OCF_RESKEY_check_passwd_default=""
: "${OCF_RESKEY_check_passwd=${OCF_RESKEY_check_passwd_default}}"

OCF_RESKEY_two_node_mode_default="false"
: "${OCF_RESKEY_two_node_mode=${OCF_RESKEY_two_node_mode_default}}"

#######################################################################
# Defaults:
OCF_RESKEY_check_passwd_use_empty_default=0
: "${OCF_RESKEY_check_passwd_use_empty=${OCF_RESKEY_check_passwd_use_empty_default}}"
#######################################################################
# Print a short description of the agent and the actions it understands
# to stdout.
usage() {
    printf '%s\n' \
        "usage: $0 (start|stop|validate-all|meta-data|monitor|promote|demote)" \
        "$0 manages a galera Database as an HA resource." \
        "The 'start' operation starts the database." \
        "The 'stop' operation stops the database." \
        "The 'status' operation reports whether the database is running" \
        "The 'monitor' operation reports whether the database seems to be working" \
        "The 'promote' operation makes this mysql server run as promoted" \
        "The 'demote' operation makes this mysql server run as unpromoted" \
        "The 'validate-all' operation reports whether the parameters are valid"
}
# Print the OCF resource agent metadata (XML) to stdout.
#
# The parameter defaults are interpolated from the OCF_RESKEY_*_default
# variables, so this must be called after those have been defined.
# The heredoc body is emitted verbatim and therefore must stay at column 0.
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="galera" version="1.0">
<version>1.0</version>
<longdesc lang="en">
Resource script for managing galera database.
</longdesc>
<shortdesc lang="en">Manages a galera instance</shortdesc>
<parameters>
<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the MySQL server binary
</longdesc>
<shortdesc lang="en">MySQL server binary</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>
<parameter name="client_binary" unique="0" required="0">
<longdesc lang="en">
Location of the MySQL client binary
</longdesc>
<shortdesc lang="en">MySQL client binary</shortdesc>
<content type="string" default="${OCF_RESKEY_client_binary_default}" />
</parameter>
<parameter name="config" unique="0" required="0">
<longdesc lang="en">
Configuration file
</longdesc>
<shortdesc lang="en">MySQL config</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>
<parameter name="datadir" unique="0" required="0">
<longdesc lang="en">
Directory containing databases
</longdesc>
<shortdesc lang="en">MySQL datadir</shortdesc>
<content type="string" default="${OCF_RESKEY_datadir_default}" />
</parameter>
<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running MySQL daemon
</longdesc>
<shortdesc lang="en">MySQL user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>
<parameter name="group" unique="0" required="0">
<longdesc lang="en">
Group running MySQL daemon (for logfile and directory permissions)
</longdesc>
<shortdesc lang="en">MySQL group</shortdesc>
<content type="string" default="${OCF_RESKEY_group_default}"/>
</parameter>
<parameter name="log" unique="0" required="0">
<longdesc lang="en">
The logfile to be used for mysqld.
</longdesc>
<shortdesc lang="en">MySQL log file</shortdesc>
<content type="string" default="${OCF_RESKEY_log_default}"/>
</parameter>
<parameter name="pid" unique="0" required="0">
<longdesc lang="en">
The pidfile to be used for mysqld.
</longdesc>
<shortdesc lang="en">MySQL pid file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}"/>
</parameter>
<parameter name="socket" unique="0" required="0">
<longdesc lang="en">
The socket to be used for mysqld.
</longdesc>
<shortdesc lang="en">MySQL socket</shortdesc>
<content type="string" default="${OCF_RESKEY_socket_default}"/>
</parameter>
<parameter name="enable_creation" unique="0" required="0">
<longdesc lang="en">
If the MySQL database does not exist, it will be created
</longdesc>
<shortdesc lang="en">Create the database if it does not exist</shortdesc>
<content type="boolean" default="${OCF_RESKEY_enable_creation_default}"/>
</parameter>
<parameter name="additional_parameters" unique="0" required="0">
<longdesc lang="en">
Additional parameters which are passed to the mysqld on startup.
(e.g. --skip-external-locking or --skip-grant-tables)
</longdesc>
<shortdesc lang="en">Additional parameters to pass to mysqld</shortdesc>
<content type="string" default="${OCF_RESKEY_additional_parameters_default}"/>
</parameter>
<parameter name="wsrep_cluster_address" unique="0" required="1">
<longdesc lang="en">
The galera cluster address. This takes the form of:
gcomm://node,node,node
Only nodes present in this node list will be allowed to start a galera instance.
The galera node names listed in this address are expected to match valid
pacemaker node names. If both names need to differ, you must provide a
mapping in option cluster_host_map.
</longdesc>
<shortdesc lang="en">Galera cluster address</shortdesc>
<content type="string" default="${OCF_RESKEY_wsrep_cluster_address_default}"/>
</parameter>
<parameter name="cluster_host_map" unique="0" required="0">
<longdesc lang="en">
A mapping of pacemaker node names to galera node names.
To be used when both pacemaker and galera names need to differ,
(e.g. when galera names map to IP from a specific network interface)
This takes the form of:
pcmk1:node.1.galera;pcmk2:node.2.galera;pcmk3:node.3.galera
where the galera resource started on node pcmk1 would be named
node.1.galera in the wsrep_cluster_address
</longdesc>
<shortdesc lang="en">Pacemaker to Galera name mapping</shortdesc>
<content type="string" default="${OCF_RESKEY_cluster_host_map_default}"/>
</parameter>
<parameter name="check_user" unique="0" required="0">
<longdesc lang="en">
Cluster check user.
</longdesc>
<shortdesc lang="en">MySQL test user</shortdesc>
<content type="string" default="${OCF_RESKEY_check_user_default}" />
</parameter>
<parameter name="check_passwd" unique="0" required="0">
<longdesc lang="en">
Cluster check user password. Empty passwords are ignored unless
the parameter "check_passwd_use_empty" is set to 1.
</longdesc>
<shortdesc lang="en">check password</shortdesc>
<content type="string" default="${OCF_RESKEY_check_passwd_default}" />
</parameter>
<parameter name="check_passwd_use_empty" unique="0" required="0">
<longdesc lang="en">
Use an empty "check_passwd" password. If this parameter is set to 1,
"check_passwd" will be ignored and an empty password is used
when calling the "mysql" client binary.
</longdesc>
<shortdesc lang="en">check password use empty</shortdesc>
<content type="boolean" default="${OCF_RESKEY_check_passwd_use_empty_default}"/>
</parameter>
<parameter name="two_node_mode" unique="0" required="0">
<longdesc lang="en">
If running in a 2-node pacemaker cluster, rely on pacemaker quorum
to allow automatic recovery even when the other node is unreachable.
Use it with caution! (and fencing)
</longdesc>
<shortdesc lang="en">Special recovery when running on a 2-node cluster</shortdesc>
<content type="boolean" default="${OCF_RESKEY_two_node_mode_default}"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="120s" />
<action name="stop" timeout="120s" />
<action name="status" timeout="60s" />
<action name="monitor" depth="0" timeout="30s" interval="20s" />
<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" />
<action name="promote" timeout="300s" />
<action name="demote" timeout="120s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}
# Print the last line returned by "SHOW VARIABLES like '<name>'".
#   $1 - name (or LIKE pattern) of the server variable
get_option_variable()
{
    local var_name=$1
    local query="SHOW VARIABLES like '$var_name';"

    $MYSQL $MYSQL_OPTIONS_CHECK -e "$query" | tail -n 1
}
# Print the last line returned by "show status like '<name>'".
#   $1 - name (or LIKE pattern) of the status variable
get_status_variable()
{
    local status_name=$1
    local query="show status like '$status_name';"

    $MYSQL $MYSQL_OPTIONS_CHECK -e "$query" | tail -n 1
}
# Flag a node as the bootstrap node by setting the reboot-lifetime
# "<instance>-bootstrap" attribute to "true".
#   $1 - node name, resolved through ocf_attribute_target
set_bootstrap_node()
{
    local target=$(ocf_attribute_target $1)
    local attr="${INSTANCE_ATTR_NAME}-bootstrap"

    ${HA_SBIN_DIR}/crm_attribute -N $target -l reboot \
        --name "$attr" -v "true"
}
# Delete the "<instance>-bootstrap" attribute of the local node.
clear_bootstrap_node()
{
    local attr="${INSTANCE_ATTR_NAME}-bootstrap"

    ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot \
        --name "$attr" -D
}
# Print the value of the local node's "<instance>-bootstrap" attribute.
# Errors from crm_attribute are discarded.
is_bootstrap()
{
    local attr="${INSTANCE_ATTR_NAME}-bootstrap"

    ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot \
        --name "$attr" --quiet 2>/dev/null
}
# Set the local node's "<instance>-no-grastate" attribute to "true".
set_no_grastate()
{
    local attr="${INSTANCE_ATTR_NAME}-no-grastate"

    ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot \
        --name "$attr" -v "true"
}
# Delete the local node's "<instance>-no-grastate" attribute.
clear_no_grastate()
{
    local attr="${INSTANCE_ATTR_NAME}-no-grastate"

    ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot \
        --name "$attr" -D
}
# Query the "<instance>-no-grastate" attribute of a node; the exit status
# is the one returned by crm_attribute, whose errors are discarded.
#   $1 - node name, resolved through ocf_attribute_target
is_no_grastate()
{
    local target=$(ocf_attribute_target $1)
    local attr="${INSTANCE_ATTR_NAME}-no-grastate"

    ${HA_SBIN_DIR}/crm_attribute -N $target -l reboot \
        --name "$attr" --quiet 2>/dev/null
}
# Delete the local node's "<instance>-last-committed" attribute.
clear_last_commit()
{
    local attr="${INSTANCE_ATTR_NAME}-last-committed"

    ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot \
        --name "$attr" -D
}
# Store a value in the local node's "<instance>-last-committed" attribute.
#   $1 - value to record
set_last_commit()
{
    local attr="${INSTANCE_ATTR_NAME}-last-committed"

    ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot \
        --name "$attr" -v $1
}
# Print the "<instance>-last-committed" attribute of a node.
#   $1 - optional node name, resolved through ocf_attribute_target; when the
#        resolved name is empty the local node (NODENAME) is queried
get_last_commit()
{
    local target=$(ocf_attribute_target $1)
    local attr="${INSTANCE_ATTR_NAME}-last-committed"

    if [ -z "$target" ]; then
        target=$NODENAME
    fi
    ${HA_SBIN_DIR}/crm_attribute -N $target -l reboot \
        --name "$attr" --quiet 2>/dev/null
}
# Delete the local node's "<instance>-safe-to-bootstrap" attribute.
clear_safe_to_bootstrap()
{
    local attr="${INSTANCE_ATTR_NAME}-safe-to-bootstrap"

    ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot \
        --name "$attr" -D
}
# Store a value in the local node's "<instance>-safe-to-bootstrap" attribute.
#   $1 - value to record
set_safe_to_bootstrap()
{
    local attr="${INSTANCE_ATTR_NAME}-safe-to-bootstrap"

    ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot \
        --name "$attr" -v $1
}
# Print the "<instance>-safe-to-bootstrap" attribute of a node.
#   $1 - optional node name, resolved through ocf_attribute_target; when the
#        resolved name is empty the local node (NODENAME) is queried
get_safe_to_bootstrap()
{
    local target=$(ocf_attribute_target $1)
    local attr="${INSTANCE_ATTR_NAME}-safe-to-bootstrap"

    if [ -z "$target" ]; then
        target=$NODENAME
    fi
    ${HA_SBIN_DIR}/crm_attribute -N $target -l reboot \
        --name "$attr" --quiet 2>/dev/null
}
# Block until the wsrep_local_state status variable reads "4", polling
# once per second. There is no timeout inside this function.
wait_for_sync()
{
    local wsrep_state

    wsrep_state=$(get_status_variable "wsrep_local_state")
    ocf_log info "Waiting for database to sync with the cluster. "
    until [ "$wsrep_state" = "4" ]; do
        sleep 1
        wsrep_state=$(get_status_variable "wsrep_local_state")
    done
    ocf_log info "Database synced."
}
# Succeed when wsrep_cluster_status reports "Primary".
# Otherwise return 1 after either raising an exit reason (status could not
# be read at all) or logging the status that was found.
# Note: the status is left in the global variable cluster_status.
is_primary()
{
    cluster_status=$(get_status_variable "wsrep_cluster_status")

    case "$cluster_status" in
        Primary)
            return 0
            ;;
        "")
            ocf_exit_reason "Unable to retrieve wsrep_cluster_status, verify check_user '$OCF_RESKEY_check_user' has permissions to view status"
            ;;
        *)
            ocf_log info "Galera instance wsrep_cluster_status=${cluster_status}"
            ;;
    esac
    return 1
}
# Succeed only when the server variable read_only is true AND
# wsrep_cluster_status is "Disconnected"; return 1 otherwise.
# Note: the cluster status is left in the global variable cluster_status.
is_readonly()
{
    local read_only_flag=$(get_option_variable "read_only")

    ocf_is_true "$read_only_flag" || return 1

    cluster_status=$(get_status_variable "wsrep_cluster_status")
    [ "$cluster_status" = "Disconnected" ]
}
# Succeed when two_node_mode is enabled and the cluster status XML lists
# exactly two nodes of type "member".
is_two_node_mode_active()
{
    # crm_node or corosync-quorumtool cannot access various corosync
    # flags when running inside a bundle, so only count the cluster
    # members
    ocf_is_true "$OCF_RESKEY_two_node_mode" || return

    crm_mon_no_validation -1X \
        | xmllint --xpath "count(//nodes/node[@type='member'])" - \
        | grep -q -w 2
}
# Succeed when this node is alone in its partition, that partition is
# quorate, and the cluster status lists exactly two clean member nodes.
is_last_node_in_quorate_partition()
{
    # when a network split occurs in a 2-node cluster, pacemaker
    # fences the other node and try to retain quorum. So until
    # the fencing is resolved (and the status of the peer node
    # is clean), we shouldn't consider ourself quorate.
    local members_in_partition=$(${HA_SBIN_DIR}/crm_node -p | wc -w)
    local has_quorum=$(${HA_SBIN_DIR}/crm_node -q)
    local clean_count=$(crm_mon_no_validation -1X | xmllint --xpath 'count(//nodes/node[@type="member" and @unclean="false"])' -)

    if [ "$members_in_partition" != 1 ]; then
        return 1
    fi
    if [ "$has_quorum" != 1 ]; then
        return 1
    fi
    [ "$clean_count" = 2 ]
}
# Succeed when the cluster status shows an active, non-orphaned, non-failed
# instance of this resource in the Promoted (or Master) role.
# Always returns 1 during the demote action.
# Note: res and XMLOPT are left behind as global variables.
master_exists()
{
    # We don't want to detect master instances during demote.
    # 1. we could be detecting ourselves as being master, which is no longer the case.
    # 2. we could be detecting other master instances that are in the process of shutting down.
    # by not detecting other master instances in "demote" we are deferring this check
    # to the next recurring monitor operation which will be much more accurate
    [ "$__OCF_ACTION" != "demote" ] || return 1

    # determine if a master instance is already up and is healthy
    ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.1.0"
    res=$?

    # Legacy flag unless the feature set is unknown or newer than 3.1.0.
    XMLOPT="--as-xml"
    if [ -z "$OCF_RESKEY_crm_feature_set" ] || [ $res -eq 2 ]; then
        XMLOPT="--output-as=xml"
        ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.2.0"
        # At exactly 3.2.0, probe the new flag and fall back if it fails.
        if [ $? -eq 1 ] && ! crm_mon_no_validation -1 $XMLOPT >/dev/null 2>&1; then
            XMLOPT="--as-xml"
        fi
    fi

    crm_mon_no_validation -1 $XMLOPT \
        | grep -q -i -E "resource.*id=\"${INSTANCE_ATTR_NAME}\".*role=\"(Promoted|Master)\".*active=\"true\".*orphaned=\"false\".*failed=\"false\""
}
# Remove the promotion score.
#   $1 - optional node name, resolved through ocf_attribute_target; when the
#        resolved name is empty no -N option is passed
clear_master_score()
{
    local target=$(ocf_attribute_target $1)

    # ${target:+...} adds "-N <node>" only when a node was resolved
    ocf_promotion_score -D ${target:+-N $target}
}
# Set the promotion score to 100.
#   $1 - optional node name, resolved through ocf_attribute_target; when the
#        resolved name is empty no -N option is passed
set_master_score()
{
    local target=$(ocf_attribute_target $1)

    # ${target:+...} adds "-N <node>" only when a node was resolved
    ocf_promotion_score ${target:+-N $target} -v 100
}
# Give a promotion score to every node listed in wsrep_cluster_address.
# Stops at the first galera name that cannot be mapped to a pacemaker node.
# Note: the loop variable "node" is a global, as in the rest of this file.
promote_everyone()
{
    local cluster_nodes

    cluster_nodes=$(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' ')
    for node in $cluster_nodes; do
        local mapped=$(galera_to_pcmk_name $node)
        if [ -z "$mapped" ]; then
            ocf_log err "Could not determine pacemaker node from galera name <${node}>."
            return
        fi
        node=$mapped
        set_master_score $node
    done
}
# Succeed when $1 >= $2.
#   $1, $2 - the values to compare
greater_than_equal_long()
{
    # there are values we need to compare in this script
    # that are too large for shell -gt to process
    awk -v n1="$1" -v n2="$2" \
        'BEGIN { print ((n1 >= n2) ? "true" : "false") }' \
        | grep -q "true"
}
# Translate a galera node name into the matching pacemaker node name.
#   $1 - galera node name
# Without a cluster_host_map the name is printed unchanged. With one, the
# pacemaker name of the first "pcmk:galera" entry whose galera part equals
# $1 is printed (nothing is printed when no entry matches).
galera_to_pcmk_name()
{
    local galera_name=$1

    if [ -n "$OCF_RESKEY_cluster_host_map" ]; then
        echo "$OCF_RESKEY_cluster_host_map" \
            | tr ';' '\n' \
            | tr -d ' ' \
            | sed 's/:/ /' \
            | awk -F' ' '$2=="'"$galera_name"'" {print $1;exit}'
        return
    fi
    echo $galera_name
}
# Translate a pacemaker node name into the matching galera node name.
#   $1 - pacemaker node name
# Without a cluster_host_map the name is printed unchanged. With one, the
# galera name of the first "pcmk:galera" entry whose pacemaker part equals
# $1 is printed (nothing is printed when no entry matches).
pcmk_to_galera_name()
{
    local pcmk_name=$1

    if [ -n "$OCF_RESKEY_cluster_host_map" ]; then
        echo "$OCF_RESKEY_cluster_host_map" \
            | tr ';' '\n' \
            | tr -d ' ' \
            | sed 's/:/ /' \
            | awk -F' ' '$1=="'"$pcmk_name"'" {print $2;exit}'
        return
    fi
    echo $pcmk_name
}
# Choose the node that should bootstrap the galera cluster and give it the
# bootstrap attribute and a promotion score.
#
# A node flagged safe-to-bootstrap wins immediately. Otherwise the node with
# the highest reported last-committed value is selected, and nothing is done
# until every node has reported one. Nodes without a grastate are examined
# first, so that on equal values a later (non-recovered) node is preferred.
# Note: the loop variable "node" is a global, as in the rest of this file.
detect_first_master()
{
    local best_commit=0
    local last_commit=0
    local missing_nodes=0
    local nodes=""
    local nodes_recovered=""
    local all_nodes
    local best_node_gcomm
    local best_node
    local safe_to_bootstrap

    all_nodes=$(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' ')

    # Start from the last node of the address list as the default choice.
    best_node_gcomm=$(echo "$all_nodes" | sed 's/^.* \(.*\)$/\1/')
    best_node=$(galera_to_pcmk_name $best_node_gcomm)
    if [ -z "$best_node" ]; then
        ocf_log err "Could not determine initial best node from galera name <${best_node_gcomm}>."
        return
    fi

    # avoid selecting a recovered node as bootstrap if possible
    for node in $all_nodes; do
        local mapped=$(galera_to_pcmk_name $node)
        if [ -z "$mapped" ]; then
            ocf_log err "Could not determine pacemaker node from galera name <${node}>."
            return
        fi
        node=$mapped

        if is_no_grastate $node; then
            nodes_recovered="$nodes_recovered $node"
        else
            nodes="$nodes $node"
        fi
    done

    for node in $nodes_recovered $nodes; do
        # On clean shutdown, galera sets the last stopped node as 'safe to bootstrap',
        # so use this hint when we can
        safe_to_bootstrap=$(get_safe_to_bootstrap $node)

        # Special case for 2-node clusters: during a network split, rely on
        # pacemaker's quorum to check whether we can restart galera
        if [ "$safe_to_bootstrap" != "1" ] && [ "$node" = "$NODENAME" ] && is_two_node_mode_active; then
            if is_last_node_in_quorate_partition; then
                ocf_log warn "Survived a split in a 2-node cluster, considering ourselves safe to bootstrap"
                safe_to_bootstrap=1
            fi
        fi

        if [ "$safe_to_bootstrap" = "1" ]; then
            # Galera marked the node as safe to boostrap during shutdown. Let's just
            # pick it as our bootstrap node.
            ocf_log info "Node <${node}> is marked as safe to bootstrap."
            best_node=$node

            # We don't need to wait for the other nodes to report state in this case
            missing_nodes=0
            break
        fi

        last_commit=$(get_last_commit $node)
        if [ -z "$last_commit" ]; then
            ocf_log info "Waiting on node <${node}> to report database status before Master instances can start."
            missing_nodes=1
            continue
        fi

        # this means -1, or that no commit has occured yet.
        if [ "$last_commit" = "18446744073709551615" ]; then
            last_commit="0"
        fi

        if greater_than_equal_long "$last_commit" "$best_commit"; then
            best_node=$(ocf_attribute_target $node)
            best_commit=$last_commit
        fi
    done

    if [ $missing_nodes -eq 1 ]; then
        return
    fi

    ocf_log info "Promoting $best_node to be our bootstrap node"
    set_bootstrap_node $best_node
    set_master_score $best_node
}
# Mirror the safe_to_bootstrap flag found in <datadir>/grastate.dat into the
# node's safe-to-bootstrap attribute.
#
# The attribute is cleared when the file is missing, when its uuid is empty
# or all zeroes, when the flag is 1 but no valid seqno was recorded, or when
# the flag is neither 0 nor 1. Otherwise the attribute is set to the flag.
detect_safe_to_bootstrap()
{
    local safe_to_bootstrap=""
    local uuid=""
    local seqno=""
    # Quoted everywhere below so a datadir containing whitespace still works.
    local grastate="${OCF_RESKEY_datadir}/grastate.dat"

    if [ -f "$grastate" ]; then
        ocf_log info "attempting to read safe_to_bootstrap flag from ${grastate}"
        safe_to_bootstrap=$(sed -n 's/^safe_to_bootstrap:\s*\(.*\)$/\1/p' < "$grastate")
        uuid=$(sed -n 's/^uuid:\s*\(.*\)$/\1/p' < "$grastate")
        seqno=$(sed -n 's/^seqno:\s*\(.*\)$/\1/p' < "$grastate")
    fi

    # No usable cluster state recorded: nothing can be said about bootstrap.
    if [ -z "$uuid" ] || \
       [ "$uuid" = "00000000-0000-0000-0000-000000000000" ]; then
        clear_safe_to_bootstrap
        return
    fi

    # A "safe" flag without a valid sequence number is not trusted.
    if [ "$safe_to_bootstrap" = "1" ]; then
        if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then
            clear_safe_to_bootstrap
            return
        fi
    fi

    if [ "$safe_to_bootstrap" = "1" ] || [ "$safe_to_bootstrap" = "0" ]; then
        set_safe_to_bootstrap "$safe_to_bootstrap"
    else
        clear_safe_to_bootstrap
    fi
}
# Determine the last committed sequence number of the local database and
# publish it through set_last_commit.
#
# The seqno is read from <datadir>/grastate.dat first. When it is missing
# or -1, the server binary is run with --wsrep-recover and the recovered
# position is parsed from its error log; if that fails because of prepared
# transactions, a second recovery run rolls them back.
#
# Returns: OCF_SUCCESS when a value was found and recorded,
#          OCF_ERR_GENERIC otherwise (the attribute is cleared).
detect_last_commit()
{
    local last_commit=""
    local grastate="${OCF_RESKEY_datadir}/grastate.dat"
    local gvwstate="${OCF_RESKEY_datadir}/gvwstate.dat"
    local recover_args="--defaults-file=$OCF_RESKEY_config \
                        --pid-file=$OCF_RESKEY_pid \
                        --socket=$OCF_RESKEY_socket \
                        --datadir=$OCF_RESKEY_datadir"
    local recovery_file_regex='s/.*WSREP\:.*position\s*recovery.*--log_error='\''\([^'\'']*\)'\''.*/\1/p'
    local recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p'

    # codership/galera#354
    # Some ungraceful shutdowns can leave an empty gvwstate.dat on
    # disk. This will prevent galera to join the cluster if it is
    # configured to attempt PC recovery. Removing that file makes the
    # node fall back to the normal, unoptimized joining process.
    if [ -f "$gvwstate" ] && [ ! -s "$gvwstate" ]; then
        ocf_log warn "empty ${gvwstate} detected, removing it to prevent PC recovery failure at next restart"
        rm -f "$gvwstate"
    fi

    ocf_log info "attempting to detect last commit version by reading ${grastate}"
    if [ -f "$grastate" ]; then
        last_commit="$(sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p' < "$grastate")"
    fi

    if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then
        local tmp
        tmp=$(mktemp)
        chown "$OCF_RESKEY_user:$OCF_RESKEY_group" "$tmp"

        # if we pass here because grastate.dat doesn't exist,
        # try not to bootstrap from this node if possible
        if [ ! -f "$grastate" ]; then
            set_no_grastate
        fi

        ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'"

        $SU - $OCF_RESKEY_user -s /bin/sh -c \
            "${OCF_RESKEY_binary} $recover_args --wsrep-recover --log-error=$tmp 2>/dev/null"

        # The sed scripts are quoted so their '*' and '[...]' are never
        # subject to pathname expansion.
        last_commit="$(sed -n "$recovered_position_regex" < "$tmp" | tail -1)"
        if [ -z "$last_commit" ]; then
            # Galera uses InnoDB's 2pc transactions internally. If
            # server was stopped in the middle of a replication, the
            # recovery may find a "prepared" XA transaction in the
            # redo log, and mysql won't recover automatically

            local recovery_file="$(sed -n "$recovery_file_regex" < "$tmp")"
            # An empty name must not pass the existence test: an unquoted
            # '[ -e ]' is true and the following read would block on stdin.
            if [ -n "$recovery_file" ] && [ -e "$recovery_file" ]; then
                if grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' "$recovery_file" 2>/dev/null; then
                    # we can only rollback the transaction, but that's OK
                    # since the DB will get resynchronized anyway
                    ocf_log warn "local node <${NODENAME}> was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover"
                    $SU - $OCF_RESKEY_user -s /bin/sh -c \
                        "${OCF_RESKEY_binary} $recover_args --wsrep-recover \
                         --tc-heuristic-recover=rollback --log-error=$tmp 2>/dev/null"

                    last_commit="$(sed -n "$recovered_position_regex" < "$tmp" | tail -1)"
                    if [ -n "$last_commit" ]; then
                        ocf_log warn "State recovered. force SST at next restart for full resynchronization"
                        rm -f "$grastate"
                        # try not to bootstrap from this node if possible
                        set_no_grastate
                    fi
                fi
            fi
        fi
        rm -f "$tmp"
    fi

    if [ -n "$last_commit" ]; then
        ocf_log info "Last commit version found: $last_commit"
        set_last_commit "$last_commit"
        return $OCF_SUCCESS
    else
        ocf_exit_reason "Unable to detect last known write sequence number"
        clear_last_commit
        return $OCF_ERR_GENERIC
    fi
}
# For galera, promote is really start
#
# Start the local server either as the bootstrap node of a new cluster or
# as a joiner of an existing one.
#
# Returns: OCF_SUCCESS once the server is up (or already running as
#          promoted, or when there is no cluster to join and the promotion
#          score was cleared instead); otherwise the failing status of
#          mysql_common_start / galera_monitor, or OCF_ERR_GENERIC when the
#          server came up read-only.
galera_promote()
{
    local rc
    local extra_opts
    local bootstrap

    if master_exists; then
        # join without bootstrapping
        extra_opts="--wsrep-cluster-address=${OCF_RESKEY_wsrep_cluster_address}"
    else
        bootstrap=$(is_bootstrap)

        if ocf_is_true "$bootstrap"; then
            # The best node for bootstrapping wasn't cleanly shutdown. Allow
            # bootstrapping anyways
            if [ "$(get_safe_to_bootstrap)" = "0" ]; then
                # "-i -e", not "-ie": the latter means "in place, with
                # backup suffix 'e'" and leaves a stray grastate.date behind.
                sed -i -e 's/^\(safe_to_bootstrap:\) 0/\1 1/' "${OCF_RESKEY_datadir}/grastate.dat"
                ocf_log info "safe_to_bootstrap in ${OCF_RESKEY_datadir}/grastate.dat set to 1 on node ${NODENAME}"
            fi
            ocf_log info "Node <${NODENAME}> is bootstrapping the cluster"
            extra_opts="--wsrep-cluster-address=gcomm://"
        else
            # We are being promoted without having the bootstrap
            # attribute in the CIB, which means we are supposed to
            # join a cluster; however if we end up here, there is no
            # Master remaining right now, which means there is no
            # cluster to join anymore. So force a demotion, and
            # let the RA decide later which node should be the next
            # bootstrap node.
            ocf_log warn "There is no running cluster to join, demoting ourself"
            clear_master_score
            return $OCF_SUCCESS
        fi
    fi

    # Already running as promoted: only the bookkeeping is left to do.
    galera_monitor
    if [ $? -eq $OCF_RUNNING_MASTER ]; then
        if ocf_is_true "$bootstrap"; then
            promote_everyone
            clear_bootstrap_node
            ocf_log info "boostrap node already up, promoting the rest of the galera instances."
        fi
        clear_safe_to_bootstrap
        clear_last_commit
        return $OCF_SUCCESS
    fi

    # last commit/safe_to_bootstrap flag are no longer relevant once promoted
    clear_last_commit
    clear_safe_to_bootstrap

    mysql_common_prepare_dirs
    mysql_common_start "$extra_opts"
    rc=$?
    if [ $rc != $OCF_SUCCESS ]; then
        return $rc
    fi

    # At this point, the mysql pidfile is created on disk and the
    # mysql server is reachable via its UNIX socket. If we are a
    # joiner, SST transfers (rsync) have finished, but an IST may
    # still be requested or ongoing
    galera_monitor
    rc=$?
    if [ $rc != $OCF_SUCCESS ] && [ $rc != $OCF_RUNNING_MASTER ]; then
        ocf_exit_reason "Failed initial monitor action"
        return $rc
    fi

    if is_readonly; then
        ocf_exit_reason "Failure. Master instance started in read-only mode, check configuration."
        return $OCF_ERR_GENERIC
    fi

    if ocf_is_true "$bootstrap"; then
        promote_everyone
        clear_bootstrap_node
        # clear attribute no-grastate. if last shutdown was
        # not clean, we cannot be extra-cautious by requesting a SST
        # since this is the bootstrap node
        clear_no_grastate
        ocf_log info "Bootstrap complete, promoting the rest of the galera instances."
    else
        # if this is not the bootstrap node, make sure this instance
        # syncs with the rest of the cluster before promotion returns.
        wait_for_sync
        # sync is done, clear info about last startup
        clear_no_grastate
    fi

    ocf_log info "Galera started"
    return $OCF_SUCCESS
}
# Demote the local instance: stop the server, drop the per-node cluster
# attributes and record the state needed for the next promotion.
#
# Returns: the failing status of mysql_common_stop when the server could
#          not be stopped, otherwise the status of detect_last_commit.
galera_demote()
{
    local rc

    mysql_common_stop
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then
        ocf_exit_reason "Failed to stop Master galera instance during demotion to Unpromoted"
        return $rc
    fi

    # if this node was previously a bootstrap node, that is no longer the case.
    clear_bootstrap_node
    clear_last_commit
    clear_no_grastate
    clear_safe_to_bootstrap

    # Clear master score here rather than letting pacemaker do so once
    # demote finishes. This way a promote cannot take place right
    # after this demote even if pacemaker is requested to do so. It
    # will first have to run a start/monitor op, to reprobe the state
    # of the other galera nodes and act accordingly.
    clear_master_score

    # record last commit for next promotion
    detect_safe_to_bootstrap
    detect_last_commit
    rc=$?
    return $rc
}
# Start action: bring the resource up in the unpromoted role.
#
# The server itself is not launched here. The node's last commit is
# recorded and then either a promotion score is set so this node joins an
# existing cluster, or the bootstrap election is run.
#
# Returns: OCF_SUCCESS, OCF_ERR_CONFIGURED when the local node is not part
#          of wsrep_cluster_address (or cannot be mapped to a galera name),
#          OCF_ERR_GENERIC when a promoted instance is already running, or
#          the failing status of detect_last_commit.
galera_start()
{
    local rc
    local galera_node

    galera_node=$(pcmk_to_galera_name $NODENAME)
    if [ -z "$galera_node" ]; then
        ocf_exit_reason "Could not determine galera name from pacemaker node <${NODENAME}>."
        return $OCF_ERR_CONFIGURED
    fi

    if ! echo $OCF_RESKEY_wsrep_cluster_address | grep -q -F $galera_node; then
        ocf_exit_reason "local node <${NODENAME}> (galera node <${galera_node}>) must be a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}> to start this galera instance"
        return $OCF_ERR_CONFIGURED
    fi

    galera_monitor
    if [ $? -eq $OCF_RUNNING_MASTER ]; then
        ocf_exit_reason "master galera instance started outside of the cluster's control"
        return $OCF_ERR_GENERIC
    fi

    mysql_common_prepare_dirs

    detect_safe_to_bootstrap
    detect_last_commit
    rc=$?
    [ $rc -eq $OCF_SUCCESS ] || return $rc

    if master_exists; then
        ocf_log info "Master instances are already up, setting master score so this instance will join galera cluster."
        set_master_score $NODENAME
    else
        clear_master_score
        detect_first_master
    fi

    return $OCF_SUCCESS
}
# Monitor action.
#
# Returns:
#   - when the server is not running: OCF_SUCCESS if a last-committed value
#     is recorded for this node (the instance counts as started in the
#     unpromoted role), otherwise OCF_NOT_RUNNING;
#   - any other non-success status of mysql_common_status, unchanged;
#   - when the server is running: OCF_RUNNING_MASTER if it is part of a
#     Primary component, OCF_ERR_GENERIC if it is not (except for a joiner
#     in the middle of a promote, where the status of mysql_common_status is
#     kept), OCF_ERR_CONFIGURED / OCF_ERR_GENERIC when the local node cannot
#     be mapped to, or is not part of, wsrep_cluster_address.
galera_monitor()
{
    local rc
    local galera_node
    local last_commit
    local status_loglevel="err"

    # Set loglevel to info during probe
    if ocf_is_probe; then
        status_loglevel="info"
    fi

    mysql_common_status $status_loglevel
    rc=$?

    if [ $rc -eq $OCF_NOT_RUNNING ]; then
        # Always query the local node. No node argument is passed: other
        # functions leave a global "node" loop variable behind, which must
        # not influence which node is looked up here.
        last_commit=$(get_last_commit)
        if [ -n "$last_commit" ]; then
            # if last commit is set, this instance is considered started in slave mode
            rc=$OCF_SUCCESS
            if master_exists; then
                # a master instance exists and is healthy, promote this
                # local read only instance
                # so it can join the master galera cluster.
                set_master_score
            else
                detect_first_master
            fi
        fi
        return $rc
    elif [ $rc -ne $OCF_SUCCESS ]; then
        return $rc
    fi

    # if we make it here, mysql is running. Check cluster status now.
    galera_node=$(pcmk_to_galera_name $NODENAME)
    if [ -z "$galera_node" ]; then
        ocf_exit_reason "Could not determine galera name from pacemaker node <${NODENAME}>."
        return $OCF_ERR_CONFIGURED
    fi

    if ! echo "$OCF_RESKEY_wsrep_cluster_address" | grep -q -F -- "$galera_node"; then
        ocf_exit_reason "local node <${NODENAME}> (galera node <${galera_node}>) is started, but is not a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>"
        return $OCF_ERR_GENERIC
    fi

    if is_primary; then
        if ocf_is_probe; then
            # restore master score during probe
            # if we detect this is a master instance
            set_master_score
        fi
        rc=$OCF_RUNNING_MASTER
    else
        # It seems that with recent galera (26.4+), a joiner that is
        # connected to a Primary component and is preparing its IST
        # request might still temporarily report its state as
        # Non-Primary. Do not fail in this case as the promote
        # operation will loop until the IST finishes or the promote
        # times out.
        if [ "$__OCF_ACTION" = "promote" ] && ! ocf_is_true "$(is_bootstrap)"; then
            ocf_log info "local node <${NODENAME}> is receiving a State Transfer."
        else
            ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state."
            rc=$OCF_ERR_GENERIC
        fi
    fi

    return $rc
}
# Stop action: stop the server and remove every per-node attribute this
# agent maintains.
#
# Returns: the exit status of mysql_common_stop.
galera_stop()
{
    local rc

    # make sure the process is stopped
    mysql_common_stop
    # Capture the stop status itself; this function takes no arguments,
    # so "$1" would be empty and the real result would be lost.
    rc=$?

    clear_safe_to_bootstrap
    clear_last_commit
    clear_master_score
    clear_bootstrap_node
    clear_no_grastate

    return $rc
}
# Validate the resource configuration.
#
# At check level 10 the resource must be configured as a multistate
# resource. A wsrep_cluster_address is always required. The remaining
# checks are delegated to mysql_common_validate.
#
# Returns: OCF_ERR_CONFIGURED on a configuration error, otherwise the
#          status of mysql_common_validate.
galera_validate()
{
    # Default to 0 so an unset or empty OCF_CHECK_LEVEL does not make the
    # numeric test print "integer expression expected".
    if [ "${OCF_CHECK_LEVEL:-0}" -eq 10 ]; then
        if ! ocf_is_ms; then
            ocf_exit_reason "Galera must be configured as a multistate Master/Slave resource."
            return $OCF_ERR_CONFIGURED
        fi
    fi

    if [ -z "$OCF_RESKEY_wsrep_cluster_address" ]; then
        ocf_exit_reason "Galera must be configured with a wsrep_cluster_address value."
        return $OCF_ERR_CONFIGURED
    fi

    mysql_common_validate
}
# Actions that are answered before any validation takes place.
case "$1" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage|help)
        usage
        exit $OCF_SUCCESS
        ;;
esac

# The start action is validated at check level 10.
if [ "$__OCF_ACTION" = "start" ]; then
    OCF_CHECK_LEVEL=10
fi

galera_validate
rc=$?
LSB_STATUS_STOPPED=3

# With an invalid configuration, stop/monitor/status report the resource
# as stopped; every other action fails with the validation status.
if [ $rc -ne 0 ]; then
    case "$1" in
        stop)
            exit $OCF_SUCCESS
            ;;
        monitor)
            exit $OCF_NOT_RUNNING
            ;;
        status)
            exit $LSB_STATUS_STOPPED
            ;;
        *)
            exit $rc
            ;;
    esac
fi
# Build MYSQL_OPTIONS_CHECK, the client options used for status queries.
#
# Credentials come from the check_user / check_passwd parameters, falling
# back to MYSQL_USERNAME / MYSQL_PASSWORD (sourced from the clustercheck
# file when available) and finally to the user "root".
if [ -z "${OCF_RESKEY_check_passwd}" ]; then
    # This value is automatically sourced from /etc/sysconfig/clustercheck if available
    OCF_RESKEY_check_passwd=${MYSQL_PASSWORD}
fi
if [ -z "${OCF_RESKEY_check_user}" ]; then
    # This value is automatically sourced from /etc/sysconfig/clustercheck if available
    OCF_RESKEY_check_user=${MYSQL_USERNAME}
fi
# ":=" rather than "=": by now the variable is always set, possibly to an
# empty string, and "=" would never apply the default in that case.
: "${OCF_RESKEY_check_user:=root}"

MYSQL_OPTIONS_CHECK="-nNE --user=${OCF_RESKEY_check_user}"

if ocf_is_true "${OCF_RESKEY_check_passwd_use_empty}"; then
    MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK --password="
elif [ -n "${OCF_RESKEY_check_passwd}" ]; then
    MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK --password=${OCF_RESKEY_check_passwd}"
fi

# This value is automatically sourced from /etc/sysconfig/clustercheck if available
if [ -n "${MYSQL_HOST}" ]; then
    MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK -h ${MYSQL_HOST}"
fi

# This value is automatically sourced from /etc/sysconfig/clustercheck if available
if [ -n "${MYSQL_PORT}" ]; then
    MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK -P ${MYSQL_PORT}"
fi
# What kind of method was invoked?
# Hand over to the handler of the requested action; the script's exit
# status is that of the handler unless an arm exits explicitly.
case "$1" in
    start)
        galera_start
        ;;
    stop)
        galera_stop
        ;;
    status)
        mysql_common_status err
        ;;
    monitor)
        galera_monitor
        ;;
    promote)
        galera_promote
        ;;
    demote)
        galera_demote
        ;;
    validate-all)
        exit $OCF_SUCCESS
        ;;
    *)
        usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
# vi:sw=4:ts=4:et:
diff --git a/heartbeat/mariadb.in b/heartbeat/mariadb.in
index e0f1f3c9f..1dca98ba6 100644
--- a/heartbeat/mariadb.in
+++ b/heartbeat/mariadb.in
@@ -1,1040 +1,1039 @@
#!@BASH_SHELL@
#
#
# MariaDB
#
# Description: Manages a MariaDB Promotable database as Linux-HA resource
#
# Authors: Alan Robertson: DB2 Script
# Jakub Janczak: rewrite as MySQL
# Andrew Beekhof: cleanup and import
# Sebastian Reitenbach: add OpenBSD defaults, more cleanup
# Narayan Newton: add Gentoo/Debian defaults
# Marian Marinov, Florian Haas: add replication capability
# Yves Trudeau, Baron Schwartz: add VIP support and improve replication
# Nils Carlson: add GTID support and semi-sync support
#
# Support: users@clusterlabs.org
# License: GNU General Public License (GPL)
#
# (c) 2002-2005 International Business Machines, Inc.
# 2005-2010 Linux-HA contributors
#
# See usage() function below for more details...
#
# OCF instance parameters:
# OCF_RESKEY_binary
# OCF_RESKEY_client_binary
# OCF_RESKEY_config
# OCF_RESKEY_datadir
# OCF_RESKEY_user
# OCF_RESKEY_group
# OCF_RESKEY_node_list
# OCF_RESKEY_test_table
# OCF_RESKEY_test_user
# OCF_RESKEY_test_passwd
# OCF_RESKEY_enable_creation
# OCF_RESKEY_additional_parameters
# OCF_RESKEY_log
# OCF_RESKEY_pid
# OCF_RESKEY_socket
# OCF_RESKEY_replication_user
# OCF_RESKEY_replication_passwd
# OCF_RESKEY_replication_port
#######################################################################
# Initialization:
OCF_RESKEY_node_list_default=""
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
. ${OCF_FUNCTIONS_DIR}/mysql-common.sh
#######################################################################
# Print a short usage summary for this agent to stdout.
# The text is an unquoted here-document, so $0 expands to the name the
# script was invoked with.
usage() {
cat <<UEND
usage: $0 (start|stop|validate-all|meta-data|monitor|promote|demote|notify)
$0 manages a MariaDB Database as an HA resource.
The 'start' operation starts the database.
The 'stop' operation stops the database.
The 'status' operation reports whether the database is running
The 'monitor' operation reports whether the database seems to be working
The 'promote' operation makes this mysql server run as promoted
The 'demote' operation makes this mysql server run as unpromoted
The 'validate-all' operation reports whether the parameters are valid
UEND
}
# Print the agent's OCF metadata (XML) to stdout.
# The here-document is unquoted: every ${OCF_RESKEY_*_default} reference is
# expanded when the function runs, while \${INSTANCE_ATTR_NAME} is escaped
# and therefore printed literally.
# A stray diff-marker line ("-<action ... role="Unpromoted" ...>") used to sit
# inside the <actions> list and put a literal "-" into the XML; it is removed.
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="mariadb" version="1.0">
<version>1.0</version>
<longdesc lang="en">
Resource script for MariaDB.
Manages a complete promotable replication setup with GTID, for simpler
uses look at the mysql resource agent which supports older replication
forms which mysql and mariadb have in common.
The resource must be setup to use notifications. Set 'notify=true' in the metadata
attributes when defining a MariaDB promotable instance.
The default behavior is to use uname -n values in the change promoted to command.
Other IPs can be specified manually by adding a node attribute
\${INSTANCE_ATTR_NAME}_mysql_master_IP giving the IP to use for replication.
For example, if the mariadb primitive you are using is p_mariadb, the
attribute to set will be p_mariadb_mysql_master_IP.
</longdesc>
<shortdesc lang="en">Manages a MariaDB promotable instance</shortdesc>
<parameters>
<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the MariaDB server binary
</longdesc>
<shortdesc lang="en">MariaDB server binary</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>
<parameter name="client_binary" unique="0" required="0">
<longdesc lang="en">
Location of the MariaDB client binary
</longdesc>
<shortdesc lang="en">MariaDB client binary</shortdesc>
<content type="string" default="${OCF_RESKEY_client_binary_default}" />
</parameter>
<parameter name="config" unique="0" required="0">
<longdesc lang="en">
Configuration file
</longdesc>
<shortdesc lang="en">MariaDB config</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>
<parameter name="datadir" unique="0" required="0">
<longdesc lang="en">
Directory containing databases
</longdesc>
<shortdesc lang="en">MariaDB datadir</shortdesc>
<content type="string" default="${OCF_RESKEY_datadir_default}" />
</parameter>
<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running MariaDB daemon
</longdesc>
<shortdesc lang="en">MariaDB user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>
<parameter name="group" unique="0" required="0">
<longdesc lang="en">
Group running MariaDB daemon (for logfile and directory permissions)
</longdesc>
<shortdesc lang="en">MariaDB group</shortdesc>
<content type="string" default="${OCF_RESKEY_group_default}"/>
</parameter>
<parameter name="log" unique="0" required="0">
<longdesc lang="en">
The logfile to be used for mysqld.
</longdesc>
<shortdesc lang="en">MariaDB log file</shortdesc>
<content type="string" default="${OCF_RESKEY_log_default}"/>
</parameter>
<parameter name="node_list" unique="0" required="1">
<longdesc lang="en">
All node names of nodes that will execute mariadb.
Please separate each node name with a space.
This is required for the promoted selection to function.
</longdesc>
<shortdesc lang="en">node list</shortdesc>
<content type="string" default="${OCF_RESKEY_node_list_default}" />
</parameter>
<parameter name="pid" unique="0" required="0">
<longdesc lang="en">
The pidfile to be used for mysqld.
</longdesc>
<shortdesc lang="en">MariaDB pid file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}"/>
</parameter>
<parameter name="socket" unique="0" required="0">
<longdesc lang="en">
The socket to be used for mysqld.
</longdesc>
<shortdesc lang="en">MariaDB socket</shortdesc>
<content type="string" default="${OCF_RESKEY_socket_default}"/>
</parameter>
<parameter name="test_table" unique="0" required="0">
<longdesc lang="en">
Table to be tested in monitor statement (in database.table notation)
</longdesc>
<shortdesc lang="en">MariaDB test table</shortdesc>
<content type="string" default="${OCF_RESKEY_test_table_default}" />
</parameter>
<parameter name="test_user" unique="0" required="0">
<longdesc lang="en">
MariaDB test user, must have select privilege on test_table
</longdesc>
<shortdesc lang="en">MariaDB test user</shortdesc>
<content type="string" default="${OCF_RESKEY_test_user_default}" />
</parameter>
<parameter name="test_passwd" unique="0" required="0">
<longdesc lang="en">
MariaDB test user password
</longdesc>
<shortdesc lang="en">MariaDB test user password</shortdesc>
<content type="string" default="${OCF_RESKEY_test_passwd_default}" />
</parameter>
<parameter name="enable_creation" unique="0" required="0">
<longdesc lang="en">
If the MariaDB database does not exist, it will be created
</longdesc>
<shortdesc lang="en">Create the database if it does not exist</shortdesc>
<content type="boolean" default="${OCF_RESKEY_enable_creation_default}"/>
</parameter>
<parameter name="additional_parameters" unique="0" required="0">
<longdesc lang="en">
Additional parameters which are passed to the mysqld on startup.
(e.g. --skip-external-locking or --skip-grant-tables)
</longdesc>
<shortdesc lang="en">Additional parameters to pass to mysqld</shortdesc>
<content type="string" default="${OCF_RESKEY_additional_parameters_default}"/>
</parameter>
<parameter name="replication_user" unique="0" required="0">
<longdesc lang="en">
MariaDB replication user. This user is used for starting and stopping
MariaDB replication, for setting and resetting the promoted host, and for
setting and unsetting read-only mode. Because of that, this user must
have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, PROCESS and RELOAD
privileges on all nodes within the cluster. Mandatory if you define a
promotable resource.
</longdesc>
<shortdesc lang="en">MariaDB replication user</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_user_default}" />
</parameter>
<parameter name="replication_passwd" unique="0" required="0">
<longdesc lang="en">
MariaDB replication password. Used for replication client and unpromoted.
Mandatory if you define a promotable resource.
</longdesc>
<shortdesc lang="en">MariaDB replication user password</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_passwd_default}" />
</parameter>
<parameter name="replication_port" unique="0" required="0">
<longdesc lang="en">
The port on which the Promoted MariaDB instance is listening.
</longdesc>
<shortdesc lang="en">MariaDB replication port</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_port_default}" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="120s" />
<action name="stop" timeout="120s" />
<action name="status" timeout="60s" />
<action name="monitor" depth="0" timeout="30s" interval="20s" />
<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" />
<action name="promote" timeout="120s" />
<action name="demote" timeout="120s" />
<action name="notify" timeout="90s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}
# Convenience functions
greater_than_equal_long()
{
    # Compare two numbers that may be too large for the shell's -gt by
    # delegating the comparison to bc.
    # Returns 0 when $1 is strictly greater than $2, 1 otherwise (including
    # when bc produces no usable answer).
    # NOTE(review): despite the name, the expression sent to bc is ">" and
    # not ">="; equal values therefore yield 1.
    local verdict
    verdict=$(echo "$1 > $2" | bc)
    [ "$verdict" -eq "1" ] && return 0
    return 1
}
greater_than_gtid()
{
    # Compare two GTIDs by their third dash-separated field.
    # A value without any dash is passed through unchanged by cut, so a plain
    # number such as "0" is compared as-is.
    # Returns whatever greater_than_equal_long returns for the two fields.
    local seq_a seq_b
    seq_a=$(echo $1 | cut -d - -f 3)
    seq_b=$(echo $2 | cut -d - -f 3)
    greater_than_equal_long $seq_a $seq_b
}
set_gtid() {
    # Read gtid_current_pos from the local server and store it for this node
    # as the attribute "<resource instance>-gtid" via attrd_updater.
    # Returns $OCF_ERR_GENERIC when no GTID could be read, otherwise the
    # status of attrd_updater.
    local gtid
    gtid=$($MYSQL $MYSQL_OPTIONS_REPL \
        -s -N -e "show global variables like 'gtid_current_pos'" | cut -f 2)

    # Ensure that we got something like a valid GTID: it must contain at
    # least one dash. A shell pattern is used instead of grep so that the
    # pattern "-" can never be mistaken for an option or for stdin.
    case "$gtid" in
        *-*)
            ;;
        *)
            ocf_exit_reason "Unable to read GTID from MariaDB"
            ocf_log err "Unable to read GTID from MariaDB"
            return $OCF_ERR_GENERIC
            ;;
    esac

    "${HA_SBIN_DIR}/attrd_updater" -p -n "${OCF_RESOURCE_INSTANCE}-gtid" -U "$gtid"
}
read_gtid() {
    # Print the GTID stored for node $1 in the attribute
    # "<resource instance>-gtid", or an empty line when attrd_updater fails.
    #
    # Arguments: $1 - name of the node to query
    # Outputs:   the attribute value on stdout
    local node=$1
    local query_result
    local name
    local host
    local value

    # This produces output of the form 'name="var-name" host="node2" value="val"'.
    # This should be set at this point, because we have stored our own GTID previously.
    if ! query_result=$("${HA_SBIN_DIR}/attrd_updater" -p -N "$node" -n "${OCF_RESOURCE_INSTANCE}-gtid" -Q); then
        ocf_exit_reason "Unable to read GTID from attrd"
        ocf_log err "Unable to read GTID from attrd"
        echo ""
        return
    fi

    # Evaluate the query result to place name/host/value in the local scope.
    # NOTE(review): this evals the output of an external command; it is only
    # safe as long as attrd_updater emits plain name="..." assignments.
    # Quoting keeps the output from being glob-expanded before evaluation.
    eval "${query_result}"
    echo "${value}"
}
clear_all_gtid() {
    # Delete the "<resource instance>-gtid" attribute for every node listed in
    # OCF_RESKEY_node_list.
    # The loop variable is local so that a caller's own $node is not clobbered.
    local node

    # OCF_RESKEY_node_list is a space-separated list; splitting is intended.
    for node in $OCF_RESKEY_node_list; do
        "${HA_SBIN_DIR}/attrd_updater" -n "${OCF_RESOURCE_INSTANCE}-gtid" -N "$node" -D
    done
}
set_waiting_for_first_master() {
    # Set the attribute "<resource instance>-waiting-for-first-master" to
    # "true" through attrd_updater (-p -U).
    local flag_name=${OCF_RESOURCE_INSTANCE}-waiting-for-first-master
    ${HA_SBIN_DIR}/attrd_updater -p -n $flag_name -U true
}
waiting_for_first_master() {
    # Report whether the attribute
    # "<resource instance>-waiting-for-first-master" currently reads "true".
    # Returns 0 when it does, 1 when it does not or when the query fails.
    local query_result
    local name
    local host
    local value

    query_result=$(${HA_SBIN_DIR}/attrd_updater -p -n ${OCF_RESOURCE_INSTANCE}-waiting-for-first-master -Q)
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Unable to read waiting-for-first-master from attrd"
        ocf_log err "Unable to read waiting-for-first-master from attrd"
        return 1
    fi

    # The query output is a list of name="..." assignments; evaluating it
    # fills the local name/host/value variables declared above.
    # NOTE(review): this evals output of an external command.
    eval ${query_result}

    [ "$value" = "true" ]
}
clear_waiting_for_first_master() {
    # Delete the attribute "<resource instance>-waiting-for-first-master".
    # attrd_updater is called through ${HA_SBIN_DIR}, like every other
    # attrd_updater call in this agent, instead of relying on PATH.
    "${HA_SBIN_DIR}/attrd_updater" -n "${OCF_RESOURCE_INSTANCE}-waiting-for-first-master" -D
}
have_master_with_priority() {
    # Go through each node and validate that at least one has
    # a set priority. Because we unset the priority on reboot
    # a lack of priority indicates that we need to select a
    # new master.
    #
    # Returns 0 as soon as ocf_promotion_score -G succeeds for a node of
    # OCF_RESKEY_node_list, 1 when it succeeds for none.
    # The loop variable is local; the previous version also assigned a global
    # "rc", which overwrote the caller's rc through dynamic scoping.
    local node

    for node in $OCF_RESKEY_node_list; do
        if ocf_promotion_score -G -N "$node" >/dev/null 2>&1; then
            return 0
        fi
    done
    return 1
}
attempt_to_set_master() {
    # Publish this node's GTID, collect the GTIDs published by the nodes in
    # OCF_RESKEY_node_list and, when enough nodes answered, give the node
    # with the highest GTID a promotion score of 100.
    local expected_node_count
    local node
    local node_gtid
    local node_count=0
    local highest_gtid=0
    local master_candidate=""

    ocf_log info "Attempting to set master"

    if waiting_for_first_master; then
        # Wait for all nodes to come online
        expected_node_count=$OCF_RESKEY_CRM_meta_clone_max
    else
        # We accept one node being down. This is not arbitrary,
        # synchronous replication requires acknowledgement from
        # at least one host, which means only two nodes must have
        # the latest GTID. So a set of n - 1 ensures that we do
        # not lose any writes.
        expected_node_count=$(($OCF_RESKEY_CRM_meta_clone_max-1))
    fi

    # Set the gtid for this node, making it available to other nodes
    set_gtid

    for node in $OCF_RESKEY_node_list; do
        node_gtid=$(read_gtid "$node")
        if [ -z "$node_gtid" ]; then
            continue
        fi

        # Got a valid gtid, increment node count
        node_count=$((node_count+1))

        # Check if this is a good master candidate
        if greater_than_gtid "$node_gtid" "$highest_gtid"; then
            master_candidate=$node
            highest_gtid=$node_gtid
        fi
    done

    # If we managed to query a sufficient number of nodes
    # then set a master
    if [ "$node_count" -ge "$expected_node_count" ]; then
        # The count check alone can pass without any candidate (for example
        # when the required count is zero); never set a score on an empty
        # node name.
        if [ -z "$master_candidate" ]; then
            ocf_log warn "No node reported a GTID usable for master selection, not setting a master."
            return 0
        fi
        ocf_log info "Promoting $master_candidate to master, highest gtid $highest_gtid, queried $node_count nodes."
        ocf_promotion_score -v 100 -N "$master_candidate"
    else
        ocf_log info "Not enough nodes ($node_count) contributed to select a master, need $expected_node_count nodes."
    fi
}
set_read_only() {
    # Sets or unsets read-only mode. Accepts one boolean as its
    # optional argument. If invoked without any arguments, defaults to
    # enabling read only mode. Should only be set in master/slave
    # setups.
    # Returns the status of the ocf_run call that issues the SET GLOBAL
    # statement.
    local ro_val

    # ${1-on} supplies the documented default only when no argument was
    # passed at all; previously a missing argument evaluated as false and
    # switched read-only mode OFF, contradicting the contract above.
    if ocf_is_true "${1-on}"; then
        ro_val="on"
    else
        ro_val="off"
    fi

    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "SET GLOBAL read_only=${ro_val}"
}
get_read_only() {
    # Ask the server for its variables and look at the row whose name is the
    # whole word "read_only".
    # Returns 0 when that row's value is ON, 1 otherwise.
    local ro_state
    ro_state=$($MYSQL $MYSQL_OPTIONS_REPL \
        -e "SHOW VARIABLES" | grep -w read_only | awk '{print $2}')

    [ "$ro_state" = "ON" ]
}
is_slave() {
    # Determine whether this instance is currently acting as a MariaDB
    # replication slave.
    # Returns 0 only when all of the following hold:
    #   - the server is in read-only mode (get_read_only),
    #   - get_slave_info succeeded, i.e. SHOW SLAVE STATUS was not empty,
    #   - the slave SQL thread is reported as running ($slave_sql is "Yes").
    # Returns 1 in every other case.

    # An instance that is not read-only is not treated as a slave.
    if ! get_read_only; then
        return 1
    fi

    # "SHOW SLAVE STATUS" returns an empty set if the instance is not a
    # replication slave; get_slave_info reports that as a failure.
    if ! get_slave_info; then
        return 1
    fi

    # Is the slave SQL thread running? Then we are a slave.
    [ "$slave_sql" = 'Yes' ]
}
parse_slave_info() {
    # Extracts field $1 from the result of "SHOW SLAVE STATUS\G" stored in
    # file $2 and prints its value.
    # The field name must be preceded by a space in the file, which is what
    # keeps e.g. "Last_Error" from matching inside "Last_IO_Error".
    # The file name is quoted so that paths containing whitespace work.
    sed -ne "s/^.* $1: \(.*\)$/\1/p" < "$2"
}
get_slave_info() {
    # Run "SHOW SLAVE STATUS\G" once and publish selected fields as global
    # variables (master_host, master_user, master_port, master_using_gtid,
    # master_log_file, slave_sql, slave_io, last_errno, last_error,
    # secs_behind, last_io_errno, last_io_error).
    #
    # Returns $OCF_SUCCESS when the fields are available (either freshly read
    # or already set by an earlier call), $OCF_ERR_GENERIC when the statement
    # produced no output or the temporary file could not be created.
    local tmpfile

    if [ -n "$master_log_file" ] && [ -n "$master_host" ]; then
        # variables are already defined, get_slave_info has been run before
        return $OCF_SUCCESS
    fi

    # Without this check a failed mktemp left $tmpfile empty, the redirect
    # below failed, and the function still reported success with empty fields.
    if ! tmpfile=$(mktemp "${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX"); then
        ocf_log err "Unable to create temporary file for slave status"
        return $OCF_ERR_GENERIC
    fi

    $MYSQL $MYSQL_OPTIONS_REPL \
        -e 'SHOW SLAVE STATUS\G' > "$tmpfile"

    if [ ! -s "$tmpfile" ]; then
        # Instance produced an empty "SHOW SLAVE STATUS" output --
        # instance is not a slave
        rm -f "$tmpfile"
        return $OCF_ERR_GENERIC
    fi

    master_host=$(parse_slave_info Master_Host "$tmpfile")
    master_user=$(parse_slave_info Master_User "$tmpfile")
    master_port=$(parse_slave_info Master_Port "$tmpfile")
    master_using_gtid=$(parse_slave_info Using_Gtid "$tmpfile")
    master_log_file=$(parse_slave_info Master_Log_File "$tmpfile")
    slave_sql=$(parse_slave_info Slave_SQL_Running "$tmpfile")
    slave_io=$(parse_slave_info Slave_IO_Running "$tmpfile")
    last_errno=$(parse_slave_info Last_Errno "$tmpfile")
    last_error=$(parse_slave_info Last_Error "$tmpfile")
    secs_behind=$(parse_slave_info Seconds_Behind_Master "$tmpfile")
    last_io_errno=$(parse_slave_info Last_IO_Errno "$tmpfile")
    last_io_error=$(parse_slave_info Last_IO_Error "$tmpfile")
    ocf_log debug "MariaDB instance running as a replication slave"

    rm -f "$tmpfile"
    return $OCF_SUCCESS
}
check_slave() {
# Checks slave status using the fields collected by get_slave_info.
#
# Note that most outcomes terminate the whole agent with "exit" rather than
# returning to the caller:
#   - exit $OCF_ERR_GENERIC when a replication or IO error is reported, or
#     when get_slave_info fails (instance is not a slave);
#   - exit $OCF_SUCCESS, after setting the reader attribute to 0, when the IO
#     thread is down while pointing at the wrong master, or when the SQL
#     thread is down (a START SLAVE is attempted first).
# Only the healthy path returns to the caller, with the status of the final
# ocf_log call.
local rc new_master
get_slave_info
rc=$?
if [ $rc -eq 0 ]; then
# Check normal errors
if [ $last_errno -ne 0 ]; then
ocf_exit_reason "MariaDB slave replication has failed ($last_errno): $last_error"
exit $OCF_ERR_GENERIC
fi
# Check IO Errors, ignore 2003 which indicates a connection failure to the master
if [ $last_io_errno -ne 0 ] && [ $last_io_errno -ne 2003 ]; then
ocf_exit_reason "MariaDB slave io has failed ($last_io_errno): $last_io_error"
exit $OCF_ERR_GENERIC
fi
# Error 2003 is only logged as a warning; it is not treated as a failure.
if [ $last_io_errno -eq 2003 ]; then
ocf_log warn "MariaDB master not reachable from slave"
fi
if [ "$slave_io" != 'Yes' ]; then
# Not necessarily a bad thing. The master may have
# temporarily shut down, and the slave may just be
# reconnecting. A warning can't hurt, though.
ocf_log warn "MariaDB Slave IO threads currently not running."
# Sanity check, are we at least on the right master
new_master=$($CRM_ATTR_REPL_INFO --query -q)
if [ "$master_host" != "$new_master" ]; then
# Not pointing to the right master, not good, removing the VIPs
set_reader_attr 0
exit $OCF_SUCCESS
fi
fi
if [ "$slave_sql" != 'Yes' ]; then
# We don't have a replication SQL thread running. Not a
# good thing. Try to recover by restarting the SQL thread
# and remove reader vip. Prevent MariaDB restart.
ocf_exit_reason "MariaDB Slave SQL threads currently not running."
# Remove reader vip
set_reader_attr 0
# try to restart slave
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
-e "START SLAVE"
# Return success to prevent a restart
exit $OCF_SUCCESS
fi
ocf_log debug "MariaDB instance running as a replication slave"
else
# Instance produced an empty "SHOW SLAVE STATUS" output --
# instance is not a slave
# TODO: Needs to handle when get_slave_info will return too many connections error
ocf_exit_reason "check_slave invoked on an instance that is not a replication slave."
exit $OCF_ERR_GENERIC
fi
}
set_master() {
    # Point the local MariaDB server at the master it should replicate from.
    # The master host is not passed in: it is read with
    # "$CRM_ATTR_REPL_INFO --query -q". Port, user and password come from the
    # replication_* resource parameters, and GTID mode "current_pos" is used.
    # The master must either be unchanged from the last master the slave
    # replicated from, or freshly reset with RESET MASTER.
    # Returns the status of the ocf_run call issuing CHANGE MASTER TO.
    local new_master=$($CRM_ATTR_REPL_INFO --query -q)
    local change_sql

    ocf_log info "Changing MariaDB configuration to replicate from $new_master."

    # Assemble the statement piece by piece; the result is a single line.
    change_sql="CHANGE MASTER TO MASTER_HOST='$new_master', "
    change_sql="${change_sql}MASTER_PORT=$OCF_RESKEY_replication_port, "
    change_sql="${change_sql}MASTER_USER='$OCF_RESKEY_replication_user', "
    change_sql="${change_sql}MASTER_PASSWORD='$OCF_RESKEY_replication_passwd', "
    change_sql="${change_sql}MASTER_USE_GTID=current_pos"

    ocf_run $MYSQL $MYSQL_OPTIONS_REPL -e "$change_sql"
}
unset_master(){
    # Instructs the MariaDB server to stop replicating from a master
    # host.
    #
    # Returns $OCF_SUCCESS without doing anything when the instance is not a
    # slave. Any failure while stopping or resetting replication terminates
    # the agent with "exit $OCF_ERR_GENERIC".
    local tmpfile

    # If we're currently not configured to be replicating from any
    # host, then there's nothing to do. But we do log a warning as
    # no-one but the CRM should be touching the MariaDB master/slave
    # configuration.
    if ! is_slave; then
        ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave"
        return $OCF_SUCCESS
    fi

    # Stop the slave I/O thread and wait for relay log
    # processing to complete
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "STOP SLAVE IO_THREAD"
    if [ $? -gt 0 ]; then
        ocf_exit_reason "Error stopping slave IO thread"
        exit $OCF_ERR_GENERIC
    fi

    # With an empty file name the grep calls below would read from stdin,
    # so a failed mktemp must stop us here.
    if ! tmpfile=$(mktemp "${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX"); then
        ocf_exit_reason "Unable to create temporary file for process list"
        exit $OCF_ERR_GENERIC
    fi

    while true; do
        $MYSQL $MYSQL_OPTIONS_REPL \
            -e 'SHOW PROCESSLIST\G' > "$tmpfile"
        if grep -i 'Has read all relay log' "$tmpfile" >/dev/null; then
            ocf_log info "MariaDB slave has finished processing relay log"
            break
        fi
        # No "system user" thread in the process list: stop waiting.
        if ! grep -q 'system user' "$tmpfile"; then
            ocf_log info "Slave not runnig - not waiting to finish"
            break
        fi
        ocf_log info "Waiting for MariaDB slave to finish processing relay log"
        sleep 1
    done
    rm -f "$tmpfile"

    # Now, stop all slave activity and unset the master host
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "STOP SLAVE"
    if [ $? -gt 0 ]; then
        ocf_exit_reason "Error stopping rest slave threads"
        exit $OCF_ERR_GENERIC
    fi

    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "RESET SLAVE /*!50516 ALL */;"
    if [ $? -gt 0 ]; then
        ocf_exit_reason "Failed to reset slave"
        exit $OCF_ERR_GENERIC
    fi
}
# Start replication on this instance by issuing START SLAVE through ocf_run.
# Returns the status of that ocf_run call.
start_slave() {
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL -e "START SLAVE"
}
# Set the attribute controlling the readers VIP to $1.
# The attribute is only written when its current value, as reported by
# get_reader_attr, differs numerically from the requested one.
set_reader_attr() {
    local present_value
    present_value=$(get_reader_attr)

    if [ "$present_value" -ne "$1" ]; then
        $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $1
    fi
}
# Print the current value of the attribute controlling the readers VIP.
# Prints -1 when the attribute query fails.
get_reader_attr() {
    local attr_value

    if attr_value=$($CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q); then
        echo $attr_value
    else
        echo -1
    fi
}
# Print the address other nodes should use to replicate from this host.
# The node attribute ${INSTANCE_ATTR_NAME}_mysql_master_IP is queried with
# "$CRM_ATTR -l forever ... -q -G"; when that query succeeds its output is
# printed. If the query fails (for instance because the attribute is not
# defined), the fallback is the output of uname -n.
# The printed value is what gets used for the change master to command.
get_local_ip() {
    local IP

    if IP=$($CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G 2>/dev/null); then
        echo $IP
    else
        uname -n
    fi
}
#######################################################################
# Functions invoked by resource manager actions
mysql_monitor() {
    # Monitor action.
    # Returns:
    #   - the status of mysql_common_status when that is not $OCF_SUCCESS,
    #   - $OCF_ERR_GENERIC when the slave check or the test-table query fails,
    #   - $OCF_RUNNING_MASTER when the server is not read-only,
    #   - $OCF_SUCCESS otherwise.
    local rc
    local status_loglevel="err"

    # Set loglevel to info during probe
    if ocf_is_probe; then
        status_loglevel="info"
    fi

    mysql_common_status $status_loglevel
    rc=$?

    # If status returned an error, return that immediately
    if [ $rc -ne $OCF_SUCCESS ]; then
        return $rc
    fi

    # Check if this instance is configured as a slave, and if so
    # check slave status
    if is_slave; then
        if ! check_slave; then
            return $OCF_ERR_GENERIC
        fi
    fi

    if [ -n "$OCF_RESKEY_test_table" ]; then
        # Check for test table
        ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
            -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table"
        rc=$?

        if [ $rc -ne 0 ]; then
            # Report the table that was actually queried; the message used to
            # reference an undefined $test_table and so printed no name.
            ocf_exit_reason "Failed to select from $OCF_RESKEY_test_table";
            return $OCF_ERR_GENERIC;
        fi
    fi

    # Check if we are in read-only mode and there is no master
    # with priority then we attempt to select a master
    if get_read_only && ! have_master_with_priority; then
        attempt_to_set_master
    fi

    if ! get_read_only; then
        ocf_log debug "MariaDB monitor succeeded (master)";
        return $OCF_RUNNING_MASTER
    else
        ocf_log debug "MariaDB monitor succeeded";
        return $OCF_SUCCESS
    fi
}
mysql_start() {
# Start action.
# Sequence: refuse to run unless the resource is master/slave, reset the
# reader attribute, start mysqld (unless it is already running), apply the
# semi-sync and durability settings, switch to read-only, drop any old
# promotion score, then either attach to an already promoted master or clear
# the replication state and raise the waiting-for-first-master flag, and
# finally run one monitor pass.
# Returns $OCF_SUCCESS on success, otherwise the failing step's error code.
local rc
if ! ocf_is_ms; then
ocf_exit_reason "Resource is not configured as master/slave"
return $OCF_ERR_GENERIC
fi
# Initialize the ReaderVIP attribute, monitor will enable it
set_reader_attr 0
mysql_common_status info
if [ $? = $OCF_SUCCESS ]; then
ocf_log info "MariaDB already running"
return $OCF_SUCCESS
fi
mysql_common_prepare_dirs
# Replication is not started automatically; it is configured further down.
mysql_common_start --skip-slave-start --log-slave-updates
rc=$?
if [ $rc != $OCF_SUCCESS ]; then
return $rc
fi
# Enable semi-sync
ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
-e "SET GLOBAL rpl_semi_sync_slave_enabled='ON', \
rpl_semi_sync_master_enabled='ON', \
rpl_semi_sync_master_wait_no_slave='OFF', \
rpl_semi_sync_master_wait_point='AFTER_SYNC', \
gtid_strict_mode='ON', \
sync_binlog=1, \
sync_master_info=1, \
innodb_flush_log_at_trx_commit=1;"
rc=$?
if [ $rc -ne 0 ]; then
ocf_exit_reason "Failed to enable semi-sync and set variables";
return $OCF_ERR_GENERIC;
fi
# We're configured as a stateful resource. We must start as
# slave by default. At this point we don't know if the CRM has
# already promoted a master. So, we simply start in read only
# mode and make sure our old score is invalidated.
set_read_only on
ocf_promotion_score -D
# Now, let's see whether there is a master. We might be a new
# node that is just joining the cluster, and the CRM may have
# promoted a master before.
new_master_host=$(echo $OCF_RESKEY_CRM_meta_notify_master_uname|tr -d " ")
if [ "$new_master_host" -a "$new_master_host" != ${NODENAME} ]; then
# NOTE(review): the status of set_master is not checked here; only a
# failing start_slave aborts the start.
set_master
start_slave
if [ $? -ne 0 ]; then
ocf_exit_reason "Failed to start slave"
return $OCF_ERR_GENERIC
fi
else
ocf_log info "No MariaDB master present - clearing replication state, setting gtid in attrd, waiting for first master"
unset_master
set_waiting_for_first_master
fi
# Initial monitor action
if [ -n "$OCF_RESKEY_test_table" -a -n "$OCF_RESKEY_test_user" -a -n "$OCF_RESKEY_test_passwd" ]; then
OCF_CHECK_LEVEL=10
fi
mysql_monitor
rc=$?
# Both "running" and "running as master" count as a successful start.
if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then
ocf_exit_reason "Failed initial monitor action"
return $rc
fi
ocf_log info "MariaDB started"
return $OCF_SUCCESS
}
mysql_stop() {
    # Stop action: withdraw this node from promotion and from the readers
    # VIP, then stop the server.
    # Returns the status of mysql_common_stop.

    # Drop the promotion score so this node is no longer preferred as master.
    ocf_promotion_score -D

    # Take the reader attribute down to 0 (removes VIP capability).
    set_reader_attr 0

    mysql_common_stop
}
mysql_promote() {
    # Promote action: turn this instance into the writable master.
    # Returns $OCF_NOT_RUNNING when the server is not running,
    # $OCF_ERR_GENERIC when read-only mode cannot be switched off,
    # $OCF_SUCCESS otherwise.
    local master_info

    # The status check runs in a subshell, as it always has.
    ( mysql_common_status err ) || return $OCF_NOT_RUNNING

    ocf_run $MYSQL $MYSQL_OPTIONS_REPL -e "STOP SLAVE"

    if ! set_read_only off; then
        return $OCF_ERR_GENERIC
    fi

    # Force the master to wait for timeout period on slave disconnect
    ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
        -e "SET GLOBAL rpl_semi_sync_master_wait_no_slave='ON';"

    # Set Master Info in CIB, cluster level attribute
    master_info="$(get_local_ip)"
    ${CRM_ATTR_REPL_INFO} -v "$master_info"

    # A master can accept reads
    set_reader_attr 1

    # Clear the gtids in attrd now that there is a master
    clear_all_gtid

    return $OCF_SUCCESS
}
mysql_demote() {
    # Demote action.
    # Returns $OCF_NOT_RUNNING when the server is not running, otherwise the
    # status of the final ocf_promotion_score call.
    mysql_common_status err || return $OCF_NOT_RUNNING

    # Return to default no wait setting.
    ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
        -e "SET GLOBAL rpl_semi_sync_master_wait_no_slave='OFF';"

    # Return master preference to default, so the cluster manager gets
    # a chance to select a new master
    ocf_promotion_score -D
}
mysql_notify() {
    # Notify action. Dispatches on "<notify type>-<notify operation>" built
    # from OCF_RESKEY_CRM_meta_notify_type and
    # OCF_RESKEY_CRM_meta_notify_operation.
    #
    # Returns:
    #   pre-promote   status of clear_waiting_for_first_master
    #   post-promote  $OCF_SUCCESS, or $OCF_ERR_GENERIC when re-pointing
    #                 replication to the new master fails
    #   pre-demote    $OCF_SUCCESS, or $OCF_ERR_GENERIC when read-only mode
    #                 cannot be enabled
    #   post-demote   $OCF_SUCCESS for our own demotion, otherwise the status
    #                 of unset_master
    #   anything else $OCF_SUCCESS
    local type_op
    local thread

    type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}"

    ocf_log debug "Received $type_op notification."

    case "$type_op" in
        'pre-promote')
            # A master is now being promoted, remove the waiting-for-first-master flag
            clear_waiting_for_first_master
            ;;
        'post-promote')
            # The master has completed its promotion. Now is a good
            # time to check whether our replication slave is working
            # correctly.
            new_master_host=$(echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " ")
            if [ "$new_master_host" = "${NODENAME}" ]; then
                ocf_log info "This will be the new master, ignoring post-promote notification."
            else
                ocf_log info "Resetting replication, uname of master: $new_master_host"
                unset_master
                if [ $? -ne 0 ]; then
                    return $OCF_ERR_GENERIC
                fi

                set_master
                if [ $? -ne 0 ]; then
                    return $OCF_ERR_GENERIC
                fi

                start_slave
                if [ $? -ne 0 ]; then
                    ocf_exit_reason "Failed to start slave"
                    return $OCF_ERR_GENERIC
                fi
            fi
            return $OCF_SUCCESS
            ;;
        'pre-demote')
            demote_host=$(echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " ")
            # Quoted on both sides: an empty demote_host used to make the
            # test itself fail with a syntax error.
            if [ "$demote_host" = "${NODENAME}" ]; then
                ocf_log info "pre-demote notification for $demote_host"
                set_read_only on
                if [ $? -ne 0 ]; then
                    ocf_exit_reason "Failed to set read-only";
                    return $OCF_ERR_GENERIC;
                fi

                # Must kill all existing user threads because they are still Read/write
                # in order for the slaves to complete the read of binlogs.
                # The process list is piped straight into awk, so no temporary
                # file (and no mktemp failure mode) is involved.
                for thread in $($MYSQL $MYSQL_OPTIONS_REPL -e "SHOW PROCESSLIST" \
                    | awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}')
                do
                    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
                        -e "KILL ${thread}"
                done
            else
                ocf_log info "Ignoring pre-demote notification except for my own demotion."
            fi
            return $OCF_SUCCESS
            ;;
        'post-demote')
            demote_host=$(echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " ")
            if [ "$demote_host" = "${NODENAME}" ]; then
                ocf_log info "Ignoring post-demote notification for my own demotion."
                return $OCF_SUCCESS
            fi
            ocf_log info "post-demote notification for $demote_host."
            # The former master has just been gracefully demoted.
            unset_master
            ;;
        *)
            return $OCF_SUCCESS
            ;;
    esac
}
mysql_validate() {
    # validate-all handler: make sure the bc binary is available, since the
    # GTID comparison helpers in this agent pipe their expressions to bc.
    check_binary bc
}
#######################################################################
# Informational actions are answered first, before any validation runs.
case "$1" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage|help)
        usage
        exit $OCF_SUCCESS
        ;;
esac

mysql_common_validate
rc=$?
LSB_STATUS_STOPPED=3

if [ $rc -ne 0 ]; then
    case "$1" in
        stop)
            # Nothing to do here: a stop is still dispatched below even
            # though validation failed.
            ;;
        monitor)
            mysql_common_status "info"
            if [ $? -eq $OCF_SUCCESS ]; then
                # if validation fails and pid is active, always treat this as an error
                ocf_exit_reason "environment validation failed, active pid is in unknown state."
                exit $OCF_ERR_GENERIC
            fi
            # validation failed and pid is not active, it's safe to say this instance is inactive.
            exit $OCF_NOT_RUNNING
            ;;
        status)
            exit $LSB_STATUS_STOPPED
            ;;
        *)
            exit $rc
            ;;
    esac
fi

# Dispatch the requested action to its handler. For the handler-backed
# actions the handler's status is the status of this case statement.
case "$1" in
    start)
        mysql_start
        ;;
    stop)
        mysql_stop
        ;;
    status)
        mysql_common_status err
        ;;
    monitor)
        mysql_monitor
        ;;
    promote)
        mysql_promote
        ;;
    demote)
        mysql_demote
        ;;
    notify)
        mysql_notify
        ;;
    validate-all)
        mysql_validate
        ;;
    *)
        usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
# vi:sw=4:ts=4:et:
diff --git a/heartbeat/mysql b/heartbeat/mysql
index 1df2fc0f2..6b00889ff 100755
--- a/heartbeat/mysql
+++ b/heartbeat/mysql
@@ -1,1074 +1,1073 @@
#!/bin/sh
#
#
# MySQL
#
# Description: Manages a MySQL database as Linux-HA resource
#
# Authors: Alan Robertson: DB2 Script
# Jakub Janczak: rewrite as MySQL
# Andrew Beekhof: cleanup and import
# Sebastian Reitenbach: add OpenBSD defaults, more cleanup
# Narayan Newton: add Gentoo/Debian defaults
# Marian Marinov, Florian Haas: add replication capability
# Yves Trudeau, Baron Schwartz: add VIP support and improve replication
#
# Support: users@clusterlabs.org
# License: GNU General Public License (GPL)
#
# (c) 2002-2005 International Business Machines, Inc.
# 2005-2010 Linux-HA contributors
#
# An example usage in /etc/ha.d/haresources:
# node1 10.0.0.170 mysql
#
# See usage() function below for more details...
#
# OCF instance parameters:
# OCF_RESKEY_binary
# OCF_RESKEY_client_binary
# OCF_RESKEY_config
# OCF_RESKEY_datadir
# OCF_RESKEY_user
# OCF_RESKEY_group
# OCF_RESKEY_test_table
# OCF_RESKEY_test_user
# OCF_RESKEY_test_passwd
# OCF_RESKEY_enable_creation
# OCF_RESKEY_additional_parameters
# OCF_RESKEY_log
# OCF_RESKEY_pid
# OCF_RESKEY_socket
# OCF_RESKEY_replication_user
# OCF_RESKEY_replication_passwd
# OCF_RESKEY_replication_port
# OCF_RESKEY_max_slave_lag
# OCF_RESKEY_evict_outdated_slaves
# OCF_RESKEY_reader_attribute
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
. ${OCF_FUNCTIONS_DIR}/mysql-common.sh
#######################################################################
# Print the command synopsis followed by a one-line description of every
# operation to stdout. The synopsis lists 'status' as well, since the
# action dispatcher accepts it and the text below describes it.
usage() {
    cat <<UEND
usage: $0 (start|stop|status|validate-all|meta-data|monitor|promote|demote|notify)
$0 manages a MySQL Database as an HA resource.
The 'start' operation starts the database.
The 'stop' operation stops the database.
The 'status' operation reports whether the database is running
The 'monitor' operation reports whether the database seems to be working
The 'promote' operation makes this mysql server run as master
The 'demote' operation makes this mysql server run as slave
The 'validate-all' operation reports whether the parameters are valid
UEND
}
# Emit the OCF resource-agent metadata (XML) for this agent on stdout.
# The here-document is unquoted on purpose: every ${OCF_RESKEY_*_default}
# reference is expanded to the default in effect when the function runs,
# while \${INSTANCE_ATTR_NAME} is escaped so it appears literally in the
# description text.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="mysql" version="1.0">
<version>1.0</version>
<longdesc lang="en">
Resource script for MySQL.
May manage a standalone MySQL database, a clone set with externally
managed replication, or a complete master/slave replication setup.
Note, when master/slave replication is in use, the resource must
be setup to use notifications. Set 'notify=true' in the metadata
attributes when defining a MySQL master/slave instance.
While managing replication, the default behavior is to use uname -n
values in the change master to command. Other IPs can be specified
manually by adding a node attribute \${INSTANCE_ATTR_NAME}_mysql_master_IP
giving the IP to use for replication. For example, if the mysql primitive
you are using is p_mysql, the attribute to set will be
p_mysql_mysql_master_IP.
</longdesc>
<shortdesc lang="en">Manages a MySQL database instance</shortdesc>
<parameters>
<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the MySQL server binary
</longdesc>
<shortdesc lang="en">MySQL server binary</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>
<parameter name="client_binary" unique="0" required="0">
<longdesc lang="en">
Location of the MySQL client binary
</longdesc>
<shortdesc lang="en">MySQL client binary</shortdesc>
<content type="string" default="${OCF_RESKEY_client_binary_default}" />
</parameter>
<parameter name="config" unique="0" required="0">
<longdesc lang="en">
Configuration file
</longdesc>
<shortdesc lang="en">MySQL config</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>
<parameter name="datadir" unique="0" required="0">
<longdesc lang="en">
Directory containing databases
</longdesc>
<shortdesc lang="en">MySQL datadir</shortdesc>
<content type="string" default="${OCF_RESKEY_datadir_default}" />
</parameter>
<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running MySQL daemon
</longdesc>
<shortdesc lang="en">MySQL user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>
<parameter name="group" unique="0" required="0">
<longdesc lang="en">
Group running MySQL daemon (for logfile and directory permissions)
</longdesc>
<shortdesc lang="en">MySQL group</shortdesc>
<content type="string" default="${OCF_RESKEY_group_default}"/>
</parameter>
<parameter name="log" unique="0" required="0">
<longdesc lang="en">
The logfile to be used for mysqld.
</longdesc>
<shortdesc lang="en">MySQL log file</shortdesc>
<content type="string" default="${OCF_RESKEY_log_default}"/>
</parameter>
<parameter name="pid" unique="0" required="0">
<longdesc lang="en">
The pidfile to be used for mysqld.
</longdesc>
<shortdesc lang="en">MySQL pid file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}"/>
</parameter>
<parameter name="socket" unique="0" required="0">
<longdesc lang="en">
The socket to be used for mysqld.
</longdesc>
<shortdesc lang="en">MySQL socket</shortdesc>
<content type="string" default="${OCF_RESKEY_socket_default}"/>
</parameter>
<parameter name="test_table" unique="0" required="0">
<longdesc lang="en">
Table to be tested in monitor statement (in database.table notation)
</longdesc>
<shortdesc lang="en">MySQL test table</shortdesc>
<content type="string" default="${OCF_RESKEY_test_table_default}" />
</parameter>
<parameter name="test_user" unique="0" required="0">
<longdesc lang="en">
MySQL test user, must have select privilege on test_table
</longdesc>
<shortdesc lang="en">MySQL test user</shortdesc>
<content type="string" default="${OCF_RESKEY_test_user_default}" />
</parameter>
<parameter name="test_passwd" unique="0" required="0">
<longdesc lang="en">
MySQL test user password
</longdesc>
<shortdesc lang="en">MySQL test user password</shortdesc>
<content type="string" default="${OCF_RESKEY_test_passwd_default}" />
</parameter>
<parameter name="enable_creation" unique="0" required="0">
<longdesc lang="en">
If the MySQL database does not exist, it will be created
</longdesc>
<shortdesc lang="en">Create the database if it does not exist</shortdesc>
<content type="boolean" default="${OCF_RESKEY_enable_creation_default}"/>
</parameter>
<parameter name="additional_parameters" unique="0" required="0">
<longdesc lang="en">
Additional parameters which are passed to the mysqld on startup.
(e.g. --skip-external-locking or --skip-grant-tables)
</longdesc>
<shortdesc lang="en">Additional parameters to pass to mysqld</shortdesc>
<content type="string" default="${OCF_RESKEY_additional_parameters_default}"/>
</parameter>
<parameter name="replication_user" unique="0" required="0">
<longdesc lang="en">
MySQL replication user. This user is used for starting and stopping
MySQL replication, for setting and resetting the master host, and for
setting and unsetting read-only mode. Because of that, this user must
have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, PROCESS and RELOAD
privileges on all nodes within the cluster. Mandatory if you define a
master-slave resource.
</longdesc>
<shortdesc lang="en">MySQL replication user</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_user_default}" />
</parameter>
<parameter name="replication_passwd" unique="0" required="0">
<longdesc lang="en">
MySQL replication password. Used for replication client and slave.
Mandatory if you define a master-slave resource.
</longdesc>
<shortdesc lang="en">MySQL replication user password</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_passwd_default}" />
</parameter>
<parameter name="replication_port" unique="0" required="0">
<longdesc lang="en">
The port on which the Master MySQL instance is listening.
</longdesc>
<shortdesc lang="en">MySQL replication port</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_port_default}" />
</parameter>
<parameter name="replication_require_ssl" unique="0" required="0">
<longdesc lang="en">
Enables SSL connection to local MySQL service for replication user.
i.e. if REQUIRE SSL for replication user in MySQL set, this should be set to "true".
</longdesc>
<shortdesc lang="en">MySQL replication require ssl</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_require_ssl_default}" />
</parameter>
<parameter name="replication_master_ssl_ca" unique="0" required="0">
<longdesc lang="en">
The SSL CA certificate to be used for replication over SSL.
</longdesc>
<shortdesc lang="en">MySQL replication SSL CA certificate</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_master_ssl_ca_default}" />
</parameter>
<parameter name="replication_master_ssl_cert" unique="0" required="0">
<longdesc lang="en">
The SSL certificate to be used for replication over SSL.
</longdesc>
<shortdesc lang="en">MySQL replication SSL certificate</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_master_ssl_cert_default}" />
</parameter>
<parameter name="replication_master_ssl_key" unique="0" required="0">
<longdesc lang="en">
The SSL certificate key to be used for replication over SSL.
</longdesc>
<shortdesc lang="en">MySQL replication SSL certificate key</shortdesc>
<content type="string" default="${OCF_RESKEY_replication_master_ssl_key_default}" />
</parameter>
<parameter name="max_slave_lag" unique="0" required="0">
<longdesc lang="en">
The maximum number of seconds a replication slave is allowed to lag
behind its master. Do not set this to zero. What the cluster manager
does in case a slave exceeds this maximum lag is determined by the
evict_outdated_slaves parameter.
</longdesc>
<shortdesc lang="en">Maximum time (seconds) a MySQL slave is allowed
to lag behind a master</shortdesc>
<content type="integer" default="${OCF_RESKEY_max_slave_lag_default}"/>
</parameter>
<parameter name="evict_outdated_slaves" unique="0" required="0">
<longdesc lang="en">
If set to true, any slave which is more than max_slave_lag seconds
behind the master has its MySQL instance shut down. If this parameter
is set to false in a primitive or clone resource, it is simply
ignored. If set to false in a master/slave resource, then exceeding
the maximum slave lag will merely push down the master preference so
the lagging slave is never promoted to the new master.
</longdesc>
<shortdesc lang="en">Determines whether to shut down badly lagging
slaves</shortdesc>
<content type="boolean" default="${OCF_RESKEY_evict_outdated_slaves_default}" />
</parameter>
<parameter name="reader_attribute" unique="1" required="0">
<longdesc lang="en">
An attribute that the RA can manage to specify whether a node
can be read from. This node attribute will be 1 if it's fine to
read from the node, and 0 otherwise (for example, when a slave
has lagged too far behind the master).
A typical example for the use of this attribute would be to tie
a set of IP addresses to MySQL slaves that can be read from.
This parameter is only meaningful in master/slave set configurations.
</longdesc>
<shortdesc lang="en">Sets the node attribute that determines
whether a node is usable for clients to read from.</shortdesc>
<content type="string" default="${OCF_RESKEY_reader_attribute_default}" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="120s" />
<action name="stop" timeout="120s" />
<action name="status" timeout="60s" />
<action name="monitor" depth="0" timeout="30s" interval="20s" />
<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" />
<action name="promote" timeout="120s" />
<action name="demote" timeout="120s" />
<action name="notify" timeout="90s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}
# Convenience functions
# Switch the server's global read_only flag. Intended for master/slave
# setups only.
#   $1 - boolean-like value judged by ocf_is_true: when ocf_is_true accepts
#        it, read-only mode is switched on; in every other case it is
#        switched off.
# The statement is issued through ocf_run with the replication
# credentials, and the function's status is whatever ocf_run returns.
set_read_only() {
    local mode="off"

    if ocf_is_true $1; then
        mode="on"
    fi

    ocf_run $MYSQL $MYSQL_OPTIONS_REPL -e "SET GLOBAL read_only=${mode}"
}
# Ask the server for its read_only variable.
# Returns 0 when the second column of the SHOW VARIABLES row is "ON",
# 1 in every other case (including an empty answer).
get_read_only() {
    local ro_state

    ro_state=$($MYSQL $MYSQL_OPTIONS_REPL --skip-column-names \
        -e "SHOW VARIABLES LIKE 'read_only'" | awk '{print $2}')

    [ "$ro_state" = "ON" ] && return 0
    return 1
}
# Decide whether this instance currently acts as a replication slave.
# Returns 0 only when all of the following hold, 1 otherwise:
#   - ocf_is_ms and get_read_only both succeed,
#   - get_slave_info succeeds (SHOW SLAVE STATUS was not empty),
#   - master_log_file is non-empty (RESET SLAVE clears it).
# The scratch file left behind by get_slave_info is removed here; tmpfile
# is declared local so that get_slave_info's assignment lands in this scope.
is_slave() {
    local rc
    local tmpfile

    # Only a read-only member of a master/slave set can be a slave.
    ocf_is_ms || return 1
    get_read_only || return 1

    get_slave_info
    rc=$?
    rm -f $tmpfile

    # Empty SHOW SLAVE STATUS: not a replication slave.
    [ $rc -eq 0 ] || return 1
    # Status present but no master log file recorded: not a slave either.
    [ "$master_log_file" ] || return 1
    return 0
}
# Print the value of one field from a saved "SHOW SLAVE STATUS\G" dump.
#   $1 - field name; it must be preceded by a space and followed by ": "
#        on its line.
#   $2 - path of the file holding the dump.
# Lines that do not carry the field produce no output.
parse_slave_info() {
    # The path is quoted so that a directory containing whitespace does
    # not turn the redirection into an "ambiguous redirect" error.
    sed -ne "s/^.* $1: \(.*\)$/\1/p" < "$2"
}
# Populate the global replication variables from "SHOW SLAVE STATUS\G":
# master_host, master_user, master_port, master_log_file, master_log_pos,
# slave_sql, slave_io, last_errno and secs_behind.
#
# Warning: this sets $tmpfile and LEAVES the file in place (also when the
# status output is empty). The caller must delete it after use.
#
# If master_log_file and master_host are both already non-empty the
# function assumes it ran before and returns immediately without creating
# a new file or touching $tmpfile.
#
# Returns $OCF_SUCCESS when the variables are available, $OCF_ERR_GENERIC
# when the server produced an empty result.
get_slave_info() {
    if [ -n "$master_log_file" ] && [ -n "$master_host" ]; then
        # variables are already defined, get_slave_info has been run before
        return $OCF_SUCCESS
    fi

    tmpfile=$(mktemp "${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX")
    $MYSQL $MYSQL_OPTIONS_REPL \
        -e 'SHOW SLAVE STATUS\G' > "$tmpfile"

    if [ ! -s "$tmpfile" ]; then
        # Instance produced an empty "SHOW SLAVE STATUS" output --
        # instance is not a slave
        ocf_exit_reason "check_slave invoked on an instance that is not a replication slave."
        return $OCF_ERR_GENERIC
    fi

    master_host=$(parse_slave_info Master_Host "$tmpfile")
    master_user=$(parse_slave_info Master_User "$tmpfile")
    master_port=$(parse_slave_info Master_Port "$tmpfile")
    master_log_file=$(parse_slave_info Master_Log_File "$tmpfile")
    master_log_pos=$(parse_slave_info Read_Master_Log_Pos "$tmpfile")
    slave_sql=$(parse_slave_info Slave_SQL_Running "$tmpfile")
    slave_io=$(parse_slave_info Slave_IO_Running "$tmpfile")
    last_errno=$(parse_slave_info Last_Errno "$tmpfile")
    secs_behind=$(parse_slave_info Seconds_Behind_Master "$tmpfile")
    ocf_log debug "MySQL instance running as a replication slave"
    return $OCF_SUCCESS
}
# Inspect the replication state gathered by get_slave_info and maintain
# the reader attribute accordingly.
#
# Note that most failure paths call "exit" rather than "return", so the
# whole agent process ends here:
#   - replication error or too-many-connections error: reader attribute
#     set to 0, exit $OCF_SUCCESS
#   - IO thread stopped while pointing at a master other than the one
#     published via $CRM_ATTR_REPL_INFO: reader attribute 0, exit $OCF_SUCCESS
#   - SQL thread stopped: reader attribute 0, START SLAVE is attempted,
#     exit $OCF_SUCCESS
#   - evict_outdated_slaves is true and the lag is NULL or above
#     max_slave_lag: reader attribute 0, exit $OCF_ERR_INSTALLED
#   - get_slave_info failed: exit $OCF_ERR_GENERIC
# On the healthy path the reader attribute is set to 1 (or 0 when lagging)
# and the scratch file is removed.
#
# NOTE(review): get_slave_info skips creating the scratch file when the
# slave variables are already populated, so $tmpfile in the "See ... for
# details" messages may be empty in that case.
check_slave() {
# Checks slave status
local rc new_master
get_slave_info
rc=$?
if [ $rc -eq 0 ]; then
# Did we receive an error other than max_connections?
if [ $last_errno -ne 0 -a $last_errno -ne "$MYSQL_TOO_MANY_CONN_ERR" ]; then
# Whoa. Replication ran into an error. This slave has
# diverged from its master. Make sure this resource
# doesn't restart in place.
ocf_exit_reason "MySQL instance configured for replication, but replication has failed."
ocf_log err "See $tmpfile for details"
# Just pull the reader VIP away, killing MySQL here would be pretty evil
# on a loaded server
set_reader_attr 0
exit $OCF_SUCCESS
fi
# If we got max_connections, let's remove the vip
if [ $last_errno -eq "$MYSQL_TOO_MANY_CONN_ERR" ]; then
set_reader_attr 0
exit $OCF_SUCCESS
fi
if [ "$slave_io" != 'Yes' ]; then
# Not necessarily a bad thing. The master may have
# temporarily shut down, and the slave may just be
# reconnecting. A warning can't hurt, though.
ocf_log warn "MySQL Slave IO threads currently not running."
# Sanity check, are we at least on the right master
new_master=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f1`
if [ "$master_host" != "$new_master" ]; then
# Not pointing to the right master, not good, removing the VIPs
set_reader_attr 0
exit $OCF_SUCCESS
fi
fi
if [ "$slave_sql" != 'Yes' ]; then
# We don't have a replication SQL thread running. Not a
# good thing. Try to recover by restarting the SQL thread
# and remove reader vip. Prevent MySQL restart.
ocf_exit_reason "MySQL Slave SQL threads currently not running."
ocf_log err "See $tmpfile for details"
# Remove reader vip
set_reader_attr 0
# try to restart slave
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
-e "START SLAVE"
# Return success to prevent a restart
exit $OCF_SUCCESS
fi
if ocf_is_true $OCF_RESKEY_evict_outdated_slaves; then
# We're supposed to bail out if we lag too far
# behind. Let's check our lag.
if [ "$secs_behind" = "NULL" ] || [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then
ocf_exit_reason "MySQL Slave is $secs_behind seconds behind master (allowed maximum: $OCF_RESKEY_max_slave_lag)."
ocf_log err "See $tmpfile for details"
# Remove reader vip
set_reader_attr 0
exit $OCF_ERR_INSTALLED
fi
fi
# is the slave ok to have a VIP on it
# (same lag test as above; reached when eviction is disabled or the lag
# is within bounds)
if [ "$secs_behind" = "NULL" ] || [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then
set_reader_attr 0
else
set_reader_attr 1
fi
ocf_log debug "MySQL instance running as a replication slave"
rm -f $tmpfile
else
# Instance produced an empty "SHOW SLAVE STATUS" output --
# instance is not a slave
# TODO: Needs to handle when get_slave_info will return too many connections error
rm -f $tmpfile
ocf_exit_reason "check_slave invoked on an instance that is not a replication slave."
exit $OCF_ERR_GENERIC
fi
}
# Point this server at the master published in the cluster attribute
# behind $CRM_ATTR_REPL_INFO (format: host|log file|log position).
#
# When the server already replicates from that host and has a master log
# file recorded, nothing is changed and the current position is kept.
# Otherwise a CHANGE MASTER TO statement is issued; the published log
# file/position are added when both are present, and the MASTER_SSL_*
# options are added when the CA, certificate and key parameters are all
# set. The scratch file left by get_slave_info is removed on both paths.
set_master() {
    local new_master master_log_file master_log_pos
    local master_params master_ssl_params
    local change_stmt

    new_master=$($CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f1)

    # Keep replication position
    get_slave_info
    if [ "$master_log_file" -a "$new_master" = "$master_host" ]; then
        ocf_log info "Kept master pos for $master_host : $master_log_file:$master_log_pos"
        rm -f $tmpfile
        return
    fi

    # Different (or no) master so far: take the position from the cluster.
    master_log_file=$($CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f2)
    master_log_pos=$($CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f3)
    if [ -n "$master_log_file" -a -n "$master_log_pos" ]; then
        master_params=", MASTER_LOG_FILE='$master_log_file', MASTER_LOG_POS=$master_log_pos"
        ocf_log info "Restored master pos for $new_master : $master_log_file:$master_log_pos"
    fi

    # Replicate over SSL only when CA, certificate and key are all given.
    if [ -n "$OCF_RESKEY_replication_master_ssl_ca" ] && [ -n "$OCF_RESKEY_replication_master_ssl_cert" ] && [ -n "$OCF_RESKEY_replication_master_ssl_key" ]; then
        master_ssl_params=", MASTER_SSL=1, "
        master_ssl_params="${master_ssl_params}MASTER_SSL_CA='$OCF_RESKEY_replication_master_ssl_ca', "
        master_ssl_params="${master_ssl_params}MASTER_SSL_CERT='$OCF_RESKEY_replication_master_ssl_cert', "
        master_ssl_params="${master_ssl_params}MASTER_SSL_KEY='$OCF_RESKEY_replication_master_ssl_key'"
    fi

    # Inform the server of the master to replicate from. The master must
    # either be unchanged from the last one replicated from, or freshly
    # reset with RESET MASTER.
    change_stmt="CHANGE MASTER TO MASTER_HOST='$new_master', "
    change_stmt="${change_stmt}MASTER_PORT=$OCF_RESKEY_replication_port, "
    change_stmt="${change_stmt}MASTER_USER='$OCF_RESKEY_replication_user', "
    change_stmt="${change_stmt}MASTER_PASSWORD='$OCF_RESKEY_replication_passwd' $master_params $master_ssl_params"
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL -e "$change_stmt"
    rm -f $tmpfile
}
# Instruct the MySQL server to stop replicating from its master.
#
# If the instance is not a slave (per is_slave) there is nothing to do;
# a warning is logged, as no-one but the CRM should be touching the MySQL
# master/slave configuration, and $OCF_SUCCESS is returned.
#
# Otherwise the function
#   1. polls SHOW PROCESSLIST once per second until the IO thread has
#      caught up, the master is unreachable, or no slave thread is running,
#   2. stops the IO thread and polls again until the relay log has been
#      applied (or no slave thread is left),
#   3. issues STOP SLAVE and RESET SLAVE.
# A failing STOP/RESET statement terminates the agent with
# $OCF_ERR_GENERIC. The polling loops have no timeout of their own.
# The scratch file is removed on every path that leaves the function after
# it was created, including the early exit after a failed STOP SLAVE
# IO_THREAD.
unset_master(){
    if ! is_slave; then
        ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave"
        return $OCF_SUCCESS
    fi

    local tmpfile
    tmpfile=$(mktemp "${HA_RSCTMP}/unset_master.${OCF_RESOURCE_INSTANCE}.XXXXXX")

    # At this point, the master is read only so there should not be much binlogs to transfer
    # Let's wait for the last bits
    while true; do
        $MYSQL $MYSQL_OPTIONS_REPL \
            -e 'SHOW PROCESSLIST\G' > "$tmpfile"
        if grep -i 'Waiting for master to send event' "$tmpfile" >/dev/null; then
            ocf_log info "MySQL slave has finished reading master binary log"
            break
        fi
        if grep -i 'Reconnecting after a failed master event read' "$tmpfile" >/dev/null; then
            ocf_log info "Master is down, no more binary logs to come"
            break
        fi
        if grep -i 'Connecting to master' "$tmpfile" >/dev/null; then
            ocf_log info "Master is down, no more binary logs to come"
            break
        fi
        if ! grep 'system user' "$tmpfile" >/dev/null; then
            ocf_log info "Slave is not running - not waiting to finish"
            break
        fi
        sleep 1
    done

    # Now, stop the slave I/O thread and wait for relay log
    # processing to complete
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "STOP SLAVE IO_THREAD"
    if [ $? -gt 0 ]; then
        # Do not leave the scratch file behind when bailing out.
        rm -f "$tmpfile"
        ocf_exit_reason "Error stopping slave IO thread"
        exit $OCF_ERR_GENERIC
    fi

    while true; do
        $MYSQL $MYSQL_OPTIONS_REPL \
            -e 'SHOW PROCESSLIST\G' > "$tmpfile"
        if grep -i 'Has read all relay log' "$tmpfile" >/dev/null; then
            ocf_log info "MySQL slave has finished processing relay log"
            break
        fi
        if ! grep -q 'system user' "$tmpfile"; then
            ocf_log info "Slave not runnig - not waiting to finish"
            break
        fi
        ocf_log info "Waiting for MySQL slave to finish processing relay log"
        sleep 1
    done
    rm -f "$tmpfile"

    # Now, stop all slave activity and unset the master host
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "STOP SLAVE"
    if [ $? -gt 0 ]; then
        ocf_exit_reason "Error stopping rest slave threads"
        exit $OCF_ERR_GENERIC
    fi

    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "RESET SLAVE /*!50516 ALL */;"
    if [ $? -gt 0 ]; then
        ocf_exit_reason "Failed to reset slave"
        exit $OCF_ERR_GENERIC
    fi
}
# Resume replication on this server by issuing START SLAVE through
# ocf_run with the replication credentials; the status of ocf_run is the
# status of the function.
start_slave() {
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL -e "START SLAVE"
}
# Set the node attribute controlling the readers VIP.
#   $1 - new value for the attribute named by OCF_RESKEY_reader_attribute.
# The attribute is only written when the value reported by
# get_reader_attr differs from $1. The comparison is a string comparison
# so that an empty or non-numeric current value counts as "different" and
# triggers the update, instead of making the test itself fail and the
# update being skipped silently.
# Returns the status of the $CRM_ATTR call when one is made, 0 otherwise.
set_reader_attr() {
    local curr_attr_value

    curr_attr_value=$(get_reader_attr)
    if [ "$curr_attr_value" != "$1" ]; then
        $CRM_ATTR -l reboot --name "${OCF_RESKEY_reader_attribute}" -v "$1"
    fi
}
# Print the current value of the node attribute controlling the readers
# VIP (the attribute named by OCF_RESKEY_reader_attribute, queried with
# $CRM_ATTR). When the query fails, -1 is printed instead.
get_reader_attr() {
    local value

    if value=$($CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q); then
        echo $value
    else
        echo -1
    fi
}
# Capture the output of "SHOW MASTER STATUS\G" in a per-resource file
# below $HA_RSCTMP. The file's path is stored in the global
# master_status_file, where get_master_status reads it.
update_data_master_status() {
    master_status_file=${HA_RSCTMP}/master_status.${OCF_RESOURCE_INSTANCE}
    $MYSQL $MYSQL_OPTIONS_REPL \
        -e "SHOW MASTER STATUS\G" \
        > $master_status_file
}
# Print one value from the stored copy of SHOW MASTER STATUS.
# Must be called after update_data_master_status, which creates the file
# named by $master_status_file.
# Arguments:
#   $1  Name of the field to look up (e.g. File, Position).
# Output:
#   Everything after "<field>: " on the matching line; nothing when the
#   field is absent.
get_master_status() {
    awk -v field="$1" '
        $1 == field ":" {
            print substr($0, index($0, ":") + 2)
        }
    ' "$master_status_file"
}
# Print the address other nodes should use in their CHANGE MASTER TO
# command when replicating from this host.
# The value comes from the permanent node attribute
# ${INSTANCE_ATTR_NAME}_mysql_master_IP, queried with $CRM_ATTR. When that
# query fails, the fallback is the output of uname -n.
get_local_ip() {
    local addr

    if addr=$($CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G); then
        echo $addr
    else
        uname -n
    fi
}
#######################################################################
# Functions invoked by resource manager actions
# Monitor action.
#
# Steps:
#   1. Run mysql_common_status (log level "info" during a probe, "err"
#      otherwise). A non-success status is returned unchanged; in a
#      master/slave setup the promotion score is deleted first.
#   2. At check level 10 (forced for master/slave setups): require
#      test_table, run check_slave when the instance is a slave, and run
#      a SELECT COUNT(*) against the test table.
#   3. Report the role and refresh the promotion score.
#
# Returns:
#   $OCF_RUNNING_MASTER  master/slave setup and read_only is off
#   $OCF_SUCCESS         running (slave or standalone)
#   $OCF_ERR_CONFIGURED  check level 10 without test_table
#   $OCF_ERR_GENERIC     test table query failed
#   otherwise the status of mysql_common_status
# Note: check_slave may terminate the process by itself.
mysql_monitor() {
    local rc
    local status_loglevel="err"

    # Set loglevel to info during probe
    if ocf_is_probe; then
        status_loglevel="info"
    fi

    if ocf_is_ms; then
        OCF_CHECK_LEVEL=10
    fi

    mysql_common_status $status_loglevel
    rc=$?

    # TODO: check max connections error

    # If status returned an error, return that immediately
    if [ $rc -ne $OCF_SUCCESS ]; then
        if ocf_is_ms ; then
            # This is a master slave setup but monitored host returned some errors.
            # Immediately remove it from the pool of possible masters by erasing its master-mysql key
            # When new mysql master election is started and node got no or negative master-mysql attribute the following is logged
            # nodename.com pengine: debug: master_color: mysql:0 master score: -1
            # If there are NO nodes with positive value election of mysql master will fail with
            # nodename.com pengine: info: master_color: ms_mysql: Promoted 0 instances of a possible 1 to master
            ocf_promotion_score -D
        fi
        return $rc
    fi

    # An unset check level is treated as 0 (no extended checks).
    if [ "${OCF_CHECK_LEVEL:-0}" -eq 10 ]; then
        if [ -z "$OCF_RESKEY_test_table" ]; then
            ocf_exit_reason "test_table not set"
            return $OCF_ERR_CONFIGURED
        fi

        # Check if this instance is configured as a slave, and if so
        # check slave status
        if is_slave; then
            check_slave
        fi

        # Check for test table
        ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
            -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table"
        rc=$?
        if [ $rc -ne 0 ]; then
            # We are master/slave and test failed. Delete master score for this node as it is considered unhealthy because of this particular failed check.
            ocf_is_ms && ocf_promotion_score -D
            # Name the table that was actually queried.
            ocf_exit_reason "Failed to select from $OCF_RESKEY_test_table"
            return $OCF_ERR_GENERIC
        fi
    fi

    if ocf_is_ms && ! get_read_only; then
        ocf_log debug "MySQL monitor succeeded (master)"
        # Always set master score for the master
        ocf_promotion_score -v $((${OCF_RESKEY_max_slave_lag}+1))
        return $OCF_RUNNING_MASTER
    else
        ocf_log debug "MySQL monitor succeeded"
        ocf_is_ms && ocf_promotion_score -v 1
        return $OCF_SUCCESS
    fi
}
# Start action.
#
# Sequence (order matters):
#   1. In a master/slave setup, reset the reader attribute to 0.
#   2. Return $OCF_SUCCESS right away when mysql_common_status reports the
#      server as running.
#   3. Prepare directories and start the server through
#      mysql_common_start, adding --skip-slave-start for master/slave
#      setups; a failed start returns that status.
#   4. In a master/slave setup: switch to read-only, then either start
#      replicating from the master named in the notify environment or
#      clear any replication state, and set a promotion score of 1.
#   5. Run an initial mysql_monitor (at check level 10 when test table,
#      user and password are all configured).
# Returns $OCF_SUCCESS, $OCF_ERR_GENERIC when START SLAVE fails, or the
# failing status of mysql_common_start / mysql_monitor.
# Note: mysql_extra_params and master_host are assigned without "local"
# and therefore remain visible after the function returns.
mysql_start() {
local rc
if ocf_is_ms; then
# Initialize the ReaderVIP attribute, monitor will enable it
set_reader_attr 0
fi
mysql_common_status info
if [ $? = $OCF_SUCCESS ]; then
ocf_log info "MySQL already running"
return $OCF_SUCCESS
fi
mysql_common_prepare_dirs
# Uncomment to perform permission cleansing
# - not convinced this should be enabled by default
#
#chmod 0755 $OCF_RESKEY_datadir
#chown -R $OCF_RESKEY_user $OCF_RESKEY_datadir
#chgrp -R $OCF_RESKEY_group $OCF_RESKEY_datadir
mysql_extra_params=
if ocf_is_ms; then
mysql_extra_params="--skip-slave-start"
fi
mysql_common_start $mysql_extra_params
rc=$?
if [ $rc != $OCF_SUCCESS ]; then
return $rc
fi
if ocf_is_ms; then
# We're configured as a stateful resource. We must start as
# slave by default. At this point we don't know if the CRM has
# already promoted a master. So, we simply start in read only
# mode.
set_read_only on
# Now, let's see whether there is a master. We might be a new
# node that is just joining the cluster, and the CRM may have
# promoted a master before.
master_host=`echo $OCF_RESKEY_CRM_meta_notify_master_uname|tr -d " "`
if [ "$master_host" -a "$master_host" != ${NODENAME} ]; then
ocf_log info "Changing MySQL configuration to replicate from $master_host."
set_master
start_slave
# Only the status of start_slave is checked here; the status of
# set_master is not inspected.
if [ $? -ne 0 ]; then
ocf_exit_reason "Failed to start slave"
return $OCF_ERR_GENERIC
fi
else
ocf_log info "No MySQL master present - clearing replication state"
unset_master
fi
# We also need to set a master preference, otherwise Pacemaker
# won't ever promote us in the absence of any explicit
# preference set by the administrator. We choose a low
# greater-than-zero preference.
ocf_promotion_score -v 1
fi
# Initial monitor action
if [ -n "$OCF_RESKEY_test_table" -a -n "$OCF_RESKEY_test_user" -a -n "$OCF_RESKEY_test_passwd" ]; then
OCF_CHECK_LEVEL=10
fi
mysql_monitor
rc=$?
# Both "running" and "running as master" count as a successful start.
if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then
ocf_exit_reason "Failed initial monitor action"
return $rc
fi
ocf_log info "MySQL started"
return $OCF_SUCCESS
}
# Stop action. In a master/slave setup the node first drops its promotion
# score and its reader (VIP) capability; the server itself is then stopped
# by mysql_common_stop, whose status is returned.
mysql_stop() {
    ocf_is_ms && {
        # clear preference for becoming master
        ocf_promotion_score -D
        # Remove VIP capability
        set_reader_attr 0
    }

    mysql_common_stop
}
# Promote action: turn this instance into the replication master.
#
# Steps:
#   1. Return $OCF_NOT_RUNNING unless mysql_common_status succeeds.
#   2. STOP SLAVE.
#   3. Publish "<local ip>|<binlog file>|<binlog position>" through
#      $CRM_ATTR_REPL_INFO so that other nodes can issue CHANGE MASTER TO.
#   4. Switch read-only mode off ($OCF_ERR_GENERIC on failure).
#   5. Raise the promotion score and allow reads from this node.
# The snapshot file written by update_data_master_status is removed once
# its values have been published.
mysql_promote() {
    local master_info

    if ( ! mysql_common_status err ); then
        return $OCF_NOT_RUNNING
    fi
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "STOP SLAVE"

    # Set Master Info in CIB, cluster level attribute
    update_data_master_status
    master_info="$(get_local_ip)|$(get_master_status File)|$(get_master_status Position)"
    ${CRM_ATTR_REPL_INFO} -v "$master_info"
    # Clean up the SHOW MASTER STATUS snapshot; it is the only scratch
    # file created on this path.
    if [ -n "$master_status_file" ]; then
        rm -f "$master_status_file"
    fi

    set_read_only off || return $OCF_ERR_GENERIC

    # Existing master gets a higher-than-default master preference, so
    # the cluster manager does not shuffle the master role around
    # unnecessarily
    ocf_promotion_score -v $((${OCF_RESKEY_max_slave_lag}+1))

    # A master can accept reads
    set_reader_attr 1
    return $OCF_SUCCESS
}
# Demote action. Returns $OCF_NOT_RUNNING when mysql_common_status fails;
# otherwise the promotion score is put back to 1 so that the cluster
# manager gets a chance to select a new master, and the status of that
# call is returned.
mysql_demote() {
    mysql_common_status err || return $OCF_NOT_RUNNING

    ocf_promotion_score -v 1
}
# Notify action. Only meaningful in master/slave mode; otherwise the
# notification is logged and ignored.
#
# Handled notifications (type-operation):
#   pre-promote   nothing to do, new replication info not yet published
#   post-promote  on every node but the promoted one: reset replication,
#                 point at the new master and start the slave
#   pre-demote    on the node being demoted: switch to read-only and kill
#                 all client threads so slaves can finish reading binlogs
#   post-demote   on every node but the demoted one: unset the master
#   anything else ignored
# Returns $OCF_SUCCESS, $OCF_ERR_GENERIC on a failed step, or (post-demote)
# the status of unset_master.
# master_host and demote_host are deliberately left global, as in the rest
# of the agent.
mysql_notify() {
    # If not configured as a Stateful resource, we make no sense of
    # notifications.
    if ! ocf_is_ms; then
        ocf_log info "This agent makes no use of notifications unless running in master/slave mode."
        return $OCF_SUCCESS
    fi

    local type_op
    local thread
    type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}"

    ocf_log debug "Received $type_op notification."

    case "$type_op" in
    'pre-promote')
        # Nothing to do now here, new replication info not yet published
        ;;
    'post-promote')
        # The master has completed its promotion. Now is a good
        # time to check whether our replication slave is working
        # correctly.
        master_host=$(echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " ")
        if [ "$master_host" = "${NODENAME}" ]; then
            ocf_log info "This will be the new master, ignoring post-promote notification."
        else
            ocf_log info "Resetting replication"
            unset_master
            if [ $? -ne 0 ]; then
                return $OCF_ERR_GENERIC
            fi

            ocf_log info "Changing MySQL configuration to replicate from $master_host"
            set_master
            if [ $? -ne 0 ]; then
                return $OCF_ERR_GENERIC
            fi

            start_slave
            if [ $? -ne 0 ]; then
                ocf_exit_reason "Failed to start slave"
                return $OCF_ERR_GENERIC
            fi
        fi
        return $OCF_SUCCESS
        ;;
    'pre-demote')
        demote_host=$(echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " ")
        # Quoted so that an empty host list is a plain mismatch rather
        # than a test syntax error.
        if [ "$demote_host" = "${NODENAME}" ]; then
            ocf_log info "pre-demote notification for $demote_host"
            set_read_only on
            if [ $? -ne 0 ]; then
                ocf_exit_reason "Failed to set read-only"
                return $OCF_ERR_GENERIC
            fi

            # Must kill all existing user threads because they are still Read/write
            # in order for the slaves to complete the read of binlogs
            local tmpfile
            tmpfile=$(mktemp "${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX")
            $MYSQL $MYSQL_OPTIONS_REPL \
                -e "SHOW PROCESSLIST" > "$tmpfile"
            # Replication, scheduler and our own threads are spared; only
            # rows starting with a numeric thread id are considered.
            for thread in $(awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}' "$tmpfile")
            do
                ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
                    -e "KILL ${thread}"
            done
            # The thread list is no longer needed.
            rm -f "$tmpfile"
        else
            ocf_log info "Ignoring pre-demote notification except for my own demotion."
        fi
        return $OCF_SUCCESS
        ;;
    'post-demote')
        demote_host=$(echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " ")
        if [ "$demote_host" = "${NODENAME}" ]; then
            ocf_log info "Ignoring post-demote notification for my own demotion."
            return $OCF_SUCCESS
        fi
        ocf_log info "post-demote notification for $demote_host."
        # The former master has just been gracefully demoted.
        unset_master
        ;;
    *)
        return $OCF_SUCCESS
        ;;
    esac
}
#######################################################################
# Entry point, stage 1: purely informational actions are answered before
# any environment validation is attempted.
case "$1" in
meta-data) meta_data
exit $OCF_SUCCESS;;
usage|help) usage
exit $OCF_SUCCESS;;
esac
# Stage 2: validate the environment once and remember the result in rc.
mysql_common_validate
rc=$?
# Exit status used for the "status" action when validation has failed.
LSB_STATUS_STOPPED=3
# Validation failed: "stop" carries on to the dispatch below, "monitor"
# and "status" report a stopped/failed instance, and every other action
# exits with the validation status itself.
if [ $rc -ne 0 ]; then
case "$1" in
stop) ;;
monitor)
mysql_common_status "info"
if [ $? -eq $OCF_SUCCESS ]; then
# If validation fails while the pid is active, always treat this as an error.
ocf_exit_reason "environment validation failed, active pid is in unknown state."
exit $OCF_ERR_GENERIC
fi
# Validation failed and the pid is not active: it is safe to say this instance is inactive.
exit $OCF_NOT_RUNNING;;
status) exit $LSB_STATUS_STOPPED;;
*) exit $rc;;
esac
fi
# Stage 3: dispatch the requested action to its handler. validate-all
# has nothing further to check at this point and exits successfully;
# unknown actions print the usage text and exit with
# OCF_ERR_UNIMPLEMENTED.
case "$1" in
start) mysql_start;;
stop) mysql_stop;;
status) mysql_common_status err;;
monitor) mysql_monitor;;
promote) mysql_promote;;
demote) mysql_demote;;
notify) mysql_notify;;
validate-all) exit $OCF_SUCCESS;;
*) usage
exit $OCF_ERR_UNIMPLEMENTED;;
esac
# vi:sw=4:ts=4:et:
diff --git a/heartbeat/redis.in b/heartbeat/redis.in
index 6429477e1..1e541f13d 100755
--- a/heartbeat/redis.in
+++ b/heartbeat/redis.in
@@ -1,783 +1,782 @@
#!@BASH_SHELL@
#
# Resource agent script for redis server.
#
# Copyright (c) 2013 Patrick Hemmer <patrick.hemmer@gmail.com>
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
#######################################################################
# Initialization: load the shared OCF shell helper library.
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults
OCF_RESKEY_bin_default="/usr/bin/redis-server"
OCF_RESKEY_client_bin_default="/usr/bin/redis-cli"
# Use /etc/redis.conf when that file exists, /etc/redis/redis.conf otherwise.
OCF_RESKEY_config_default="/etc/redis/redis.conf"
[ -f "/etc/redis.conf" ] && OCF_RESKEY_config_default="/etc/redis.conf"
OCF_RESKEY_user_default="redis"
OCF_RESKEY_rundir_default="/var/run/redis"
OCF_RESKEY_pidfile_name_default="redis-server.pid"
OCF_RESKEY_socket_name_default="redis.sock"
OCF_RESKEY_port_default="6379"
OCF_RESKEY_tunnel_host_default="127.0.0.1"
OCF_RESKEY_tunnel_port_map_default=""
OCF_RESKEY_wait_last_known_master_default="false"

# Give every parameter that is still unset its default. A parameter that is
# set, even to the empty string, is left untouched.
for _param in bin client_bin config user rundir pidfile_name socket_name \
    port tunnel_host tunnel_port_map wait_last_known_master
do
    _key="OCF_RESKEY_${_param}"
    _fallback="${_key}_default"
    [ -n "${!_key+set}" ] || printf -v "$_key" '%s' "${!_fallback}"
done
unset _param _key _fallback
# redis_monitor only verifies the replication link when this is 1;
# redis_demote switches it on.
CHECK_SLAVE_STATE=0
REDIS_CHECK_DUMP="/usr/bin/redis-check-dump"
REDIS_SERVER="$OCF_RESKEY_bin"
REDIS_CLIENT="$OCF_RESKEY_client_bin"
REDIS_CONFIG="$OCF_RESKEY_config"
REDIS_USER="$OCF_RESKEY_user"
REDIS_RUNDIR="$OCF_RESKEY_rundir"
REDIS_PIDFILE="$OCF_RESKEY_rundir/$OCF_RESKEY_pidfile_name"
REDIS_SOCKET="$OCF_RESKEY_rundir/$OCF_RESKEY_socket_name"
REDIS_REPLICATION_PORT="$OCF_RESKEY_port"

# Print the value (second field) of the LAST line in config file $2 whose
# first field is the directive $1. Prints nothing when there is no such line.
# Taking the last occurrence matches how the requirepass directive is read
# elsewhere in this agent, and keeps the result a single line even when the
# directive is repeated in the file.
redis_conf_last_value() {
    awk -v directive="$1" '
        NF >= 2 && $1 == directive { found = $2 }
        END { if (found != "") print found }
    ' "$2" 2>/dev/null
}

# Locate a dump checking tool: the fixed path first, then redis-check-dump
# from PATH, then redis-check-rdb from PATH.
if ! [ -f "$REDIS_CHECK_DUMP" ]; then
    REDIS_CHECK_DUMP="$(which redis-check-dump 2>/dev/null)"
fi
if [ -z "$REDIS_CHECK_DUMP" ]; then
    REDIS_CHECK_DUMP="$(which redis-check-rdb 2>/dev/null)"
fi

# Pick up the dump location from the redis configuration when it is readable.
if [ -r "$REDIS_CONFIG" ]; then
    REDIS_DUMP_DIR="$(redis_conf_last_value dir "$REDIS_CONFIG")"
    REDIS_DUMP_FILE="$(redis_conf_last_value dbfilename "$REDIS_CONFIG")"
fi
: ${REDIS_DUMP_DIR:=/var/lib/redis/}
: ${REDIS_DUMP_FILE:=dump.rdb}
# redis_meta_data: print the agent's OCF meta-data XML on stdout.
# The heredoc delimiter is unquoted, so the ${OCF_RESKEY_*_default}
# references are expanded when the text is emitted; the backticks around
# program names are backslash-escaped so they stay literal.
# NOTE(review): the <action name="monitor" role="Unpromoted" ...> line in the
# heredoc starts with "-", which looks like leftover diff markup -- confirm
# whether that line should be kept (without the "-") or dropped.
redis_meta_data() {
cat <<EOI
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="redis" version="1.0">
<version>1.0</version>
<longdesc lang="en">
Resource agent script for redis server.
This resource fully supports master/slave replication. The master preference of a node is determined by the 'slave_priority' parameter of the redis config.
When taking the resource from 'unmanaged' to 'managed', the currently active master will be given a priority of 1000 (plus 1 for each active connection). The default 'slave_priority' is 100, so the master will stay master. For a slave to become master after converting the resource to managed, set a slave_priority greater than 1000.
</longdesc>
<shortdesc lang="en">Redis server</shortdesc>
<parameters>
<parameter name="bin" unique="0" required="0">
<longdesc lang="en">
Path to \`redis-server\`
</longdesc>
<shortdesc lang="en">Path to \`redis-server\`</shortdesc>
<content type="string" default="${OCF_RESKEY_bin_default}" />
</parameter>
<parameter name="client_bin" unique="0" required="0">
<longdesc lang="en">
Path to \`redis-cli\`
</longdesc>
<shortdesc lang="en">Path to \`redis-cli\`</shortdesc>
<content type="string" default="${OCF_RESKEY_client_bin_default}" />
</parameter>
<parameter name="config" unique="1" required="0">
<longdesc lang="en">
Path to 'redis.conf'
</longdesc>
<shortdesc lang="en">Path to 'redis.conf'</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>
<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User to run redis as
</longdesc>
<shortdesc lang="en">Redis user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>
<parameter name="rundir" unique="1" required="0">
<longdesc lang="en">
Directory to store socket and pid file in
</longdesc>
<shortdesc lang="en">Redis var/run dir</shortdesc>
<content type="string" default="${OCF_RESKEY_rundir_default}"/>
</parameter>
<parameter name="pidfile_name" unique="0" required="0">
<longdesc lang="en">
The filename to use for the pidfile. Will be created in the rundir.
Should only be a basename, not a full path.
</longdesc>
<shortdesc lang="en">Redis pidfile name</shortdesc>
<content type="string" default="${OCF_RESKEY_pidfile_name_default}"/>
</parameter>
<parameter name="socket_name" unique="0" required="0">
<longdesc lang="en">
The filename to use for the socket. Will be crated in the rundir.
Should only be a basename, not a full path.
</longdesc>
<shortdesc lang="en">Redis socket name</shortdesc>
<content type="string" default="${OCF_RESKEY_socket_name_default}"/>
</parameter>
<parameter name="port" unique="0" required="0">
<longdesc lang="en">
Port for replication client to connect to on remote server
</longdesc>
<shortdesc lang="en">Replication port</shortdesc>
<content type="string" default="${OCF_RESKEY_port_default}"/>
</parameter>
<parameter name="tunnel_host" unique="0" required="0">
<longdesc lang="en">
When replication traffic is tunnelled, this is the host to target
to forward outgoing traffic to the redis master. The resource
agent configures the redis slave to target the master via
tunnel_host:tunnel_port.
Note that in order to enable replication traffic tunneling,
parameter {tunnel_port_map} must be populated.
</longdesc>
<shortdesc lang="en">Tunnel host for replication traffic</shortdesc>
<content type="string" default="${OCF_RESKEY_tunnel_host_default}"/>
</parameter>
<parameter name="tunnel_port_map" unique="0" required="0">
<longdesc lang="en">
A mapping of pacemaker node names to redis port number.
To be used when redis servers need to tunnel replication traffic.
On every node where the redis resource is running, the redis server
listens to a different port. Each redis server can access its peers
for replication traffic via a tunnel accessible at {tunnel_host}:port.
The mapping the form of:
pcmk1-name:port-for-redis1;pcmk2-name:port-for-redis2;pcmk3-name:port-for-redis3
where the redis resource started on node pcmk1-name would listen on
port port-for-redis1
</longdesc>
<shortdesc lang="en">Mapping of Redis server name to redis port</shortdesc>
<content type="string" default="${OCF_RESKEY_tunnel_port_map_default}"/>
</parameter>
<parameter name="wait_last_known_master" unique="0" required="0">
<longdesc lang="en">
During redis cluster bootstrap, wait for the last known master to be
promoted before allowing any other instances in the cluster to be
promoted. This lessens the risk of data loss when persistent data
is in use.
</longdesc>
<shortdesc lang="en">Wait for last known master</shortdesc>
<content type="boolean" default="${OCF_RESKEY_wait_last_known_master_default}"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="120s" />
<action name="stop" timeout="120s" />
<action name="status" timeout="60s" />
<action name="monitor" depth="0" timeout="60s" interval="45s" />
<action name="monitor" role="Promoted" depth="0" timeout="60s" interval="20s" />
-<action name="monitor" role="Unpromoted" depth="0" timeout="60s" interval="60s" />
<action name="promote" timeout="120s" />
<action name="demote" timeout="120s" />
<action name="notify" timeout="90s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
EOI
}
# Attribute names are built from the part of the resource instance name that
# precedes the first ':'.
INSTANCE_ATTR_NAME="${OCF_RESOURCE_INSTANCE%%:*}"

# Command prefix used to read and write the cluster-wide "last known master"
# attribute. It is expanded unquoted by its users so that it splits into words.
CRM_ATTR_REPL_INFO="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_INFO -s redis_replication"

# Caches filled in lazily by last_known_master and master_is_active.
MASTER_HOST=
MASTER_ACTIVE_CACHED=
MASTER_ACTIVE=

# redis-cli capability flags, set by redis_cli_features.
CLI_HAVE_AUTH_WARNING=0
CLI_HAVE_ARG_NO_AUTH_WARNING=0
CLI_HAVE_ENV_AUTH=0
# redis_cli_features: record which password related features the installed
# redis-cli offers, based on the version it reports.
#   4.0.10+  prints a warning on stderr when a password is passed with -a
#   5.0.0+   accepts --no-auth-warning to silence that warning
#   5.0.3+   reads the password from the REDISCLI_AUTH environment variable
# Globals written: CLI_VER, res, and the CLI_HAVE_* flags (only ever raised
# to 1 here, never reset).
redis_cli_features()
{
    local probe

    CLI_VER=$("$REDIS_CLIENT" -v | awk '{print $NF}')

    # Each entry is "<minimum version> <flag to raise>".
    for probe in \
        "5.0.3 CLI_HAVE_ENV_AUTH" \
        "5.0.0 CLI_HAVE_ARG_NO_AUTH_WARNING" \
        "4.0.10 CLI_HAVE_AUTH_WARNING"
    do
        ocf_version_cmp $CLI_VER "${probe% *}"
        res=$?
        if (( res >= 1 )); then
            printf -v "${probe#* }" '%s' 1
        fi
    done
}
# master_is_active: succeed (0) when the cluster status lists this resource
# with a Promoted/Master role that is active, not orphaned and not failed.
# The answer is computed once and then served from MASTER_ACTIVE.
# Globals written: res, XMLOPT, MASTER_ACTIVE, MASTER_ACTIVE_CACHED.
master_is_active()
{
    # Already determined earlier: reuse the cached result.
    if [ -n "$MASTER_ACTIVE_CACHED" ]; then
        return $MASTER_ACTIVE
    fi

    # Choose the XML output option for crm_mon from the CRM feature set.
    ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.1.0"
    res=$?
    XMLOPT="--as-xml"
    if [ -z "$OCF_RESKEY_crm_feature_set" ] || [ $res -eq 2 ]; then
        XMLOPT="--output-as=xml"
        # For a feature set of exactly 3.2.0, probe whether the option is
        # accepted and fall back to the other spelling when it is not.
        ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.2.0"
        if [ $? -eq 1 ] && ! crm_mon_no_validation -1 $XMLOPT >/dev/null 2>&1; then
            XMLOPT="--as-xml"
        fi
    fi

    # determine if a master instance is already up and is healthy
    crm_mon_no_validation -1 $XMLOPT | grep -q -i -E "resource.*id=\"${OCF_RESOURCE_INSTANCE}\".* role=\"(Promoted|Master)\".* active=\"true\".* orphaned=\"false\".* failed=\"false\""
    MASTER_ACTIVE=$?
    MASTER_ACTIVE_CACHED="true"

    return $MASTER_ACTIVE
}
# set_master <node>: remember <node> in MASTER_HOST and store it through the
# CRM_ATTR_REPL_INFO command. Returns the status of that command.
set_master()
{
    local new_master="$1"

    MASTER_HOST="$new_master"
    # Expanded unquoted on purpose: the variable holds a command plus options.
    ${CRM_ATTR_REPL_INFO} -v "$new_master" -q
}
# last_known_master: print the node name held in MASTER_HOST, querying it
# through the CRM_ATTR_REPL_INFO command first when MASTER_HOST is empty.
last_known_master()
{
    [ -n "$MASTER_HOST" ] || MASTER_HOST="$(${CRM_ATTR_REPL_INFO} --query -q 2>/dev/null)"
    echo "$MASTER_HOST"
}
# crm_master_reboot <args...>: run crm_master with reboot lifetime for the
# node reported by ocf_attribute_target, passing <args...> through unchanged.
crm_master_reboot() {
    local target

    target=$(ocf_attribute_target)
    "${HA_SBIN_DIR}/crm_master" -N "$target" -l reboot "$@"
}
# calculate_score <preferred_score> <connected_clients>
# Print the promotion score for this node on stdout.
#   $1  preferred base score (may be empty)
#   $2  number of connected clients, added on top of the base score
# When wait_last_known_master is enabled both arguments are ignored so that
# the agent alone decides the score. With no base score, the last known
# master gets 1000 and every other node gets 1.
# The working variables are local so that calling this function outside a
# command substitution does not overwrite same-named globals.
calculate_score()
{
    local perf_score="$1"
    local connected_clients="$2"

    if ocf_is_true "$OCF_RESKEY_wait_last_known_master"; then
        # only set the preferred score by slave_priority if
        # we are not waiting for the last known master. Otherwise
        # we want the agent to have complete control over the scoring.
        perf_score=""
        connected_clients="0"
    fi

    if [[ -z "$perf_score" ]]; then
        if [[ "$(last_known_master)" == "$NODENAME" ]]; then
            perf_score=1000
        else
            perf_score=1
        fi
    fi

    perf_score=$(( perf_score + connected_clients ))
    echo "$perf_score"
}
# set_score <score>: publish <score> as this node's promotion score.
# When wait_last_known_master is enabled and no master is active yet, the
# update is skipped for every node other than the last known master.
set_score()
{
    local new_score="$1"
    local previous_master

    if ocf_is_true "$OCF_RESKEY_wait_last_known_master" && ! master_is_active; then
        previous_master="$(last_known_master)"
        if [[ -n "$previous_master" && "$previous_master" != "$NODENAME" ]]; then
            ocf_log info "Postponing setting master score for ${NODENAME} until last known master instance [${previous_master}] is promoted"
            return
        fi
    fi

    ocf_log debug "monitor: Setting master score to '$new_score'"
    crm_master_reboot -v "$new_score"
}
# redis_client <args...>: run redis-cli against the local unix socket with
# <args...> and strip a carriage return from each output line.
# When clientpasswd is set, the password is supplied by the best method the
# installed redis-cli supports (see the CLI_HAVE_* flags).
redis_client() {
    ocf_log debug "redis_client: '$REDIS_CLIENT' -s '$REDIS_SOCKET' $*"

    # No password configured: plain invocation.
    if [ -z "$clientpasswd" ]; then
        "$REDIS_CLIENT" -s "$REDIS_SOCKET" "$@" | sed 's/\r//'
        return
    fi

    # Consider redis-cli features to choose optimal password passing method and warning filtering workaround
    if (( CLI_HAVE_ENV_AUTH == 1 )); then
        # password travels in the environment
        REDISCLI_AUTH=$clientpasswd "$REDIS_CLIENT" -s "$REDIS_SOCKET" "$@" | sed 's/\r//'
    elif (( CLI_HAVE_ARG_NO_AUTH_WARNING == 1 )); then
        # password on the command line, warning silenced by the client
        "$REDIS_CLIENT" -s "$REDIS_SOCKET" --no-auth-warning -a "$clientpasswd" "$@" | sed 's/\r//'
    elif (( CLI_HAVE_AUTH_WARNING == 1 )); then
        # password on the command line; swap stdout/stderr through fd 3 so
        # that only the password warning is filtered out of stderr
        ("$REDIS_CLIENT" -s "$REDIS_SOCKET" -a "$clientpasswd" "$@" 2>&1 >&3 3>&- | grep -v "Using a password" >&2 3>&-) 3>&1 | sed 's/\r//'
    else
        "$REDIS_CLIENT" -s "$REDIS_SOCKET" -a "$clientpasswd" "$@" | sed 's/\r//'
    fi
}
# simple_status: process level check only.
# Returns OCF_SUCCESS when the pid stored in the pidfile is among the pids
# that pidof reports for the redis-server binary name, OCF_NOT_RUNNING when
# the pidfile is missing or the pid is not listed.
simple_status() {
    local recorded_pid

    [ -f "$REDIS_PIDFILE" ] || return $OCF_NOT_RUNNING

    recorded_pid="$(<"$REDIS_PIDFILE")"
    if ! pidof $(basename "$REDIS_SERVER") | grep -q "\<$recorded_pid\>"; then
        return $OCF_NOT_RUNNING
    fi

    ocf_log debug "monitor: redis-server running under pid $recorded_pid"
    return $OCF_SUCCESS
}
# redis_monitor: report the state of the local redis instance.
# Returns:
#   the simple_status result  when the process check does not succeed
#   OCF_ERR_GENERIC           when INFO yields no role, or (only while
#                             CHECK_SLAVE_STATE is 1) the replication link is
#                             not up or points at an unexpected master
#   OCF_RUNNING_MASTER        when promotable and the role is "master"
#   OCF_SUCCESS               otherwise
# Side effects: may set the promotion score, and on a probe of a master
# records this node as the master.
redis_monitor() {
    local res
    local master_name
    local last_known_master_port

    simple_status
    res=$?
    (( res == OCF_SUCCESS )) || return $res

    # Collect the "key:value" lines of INFO; section headers and other lines
    # without a colon are skipped.
    typeset -A info
    while read line; do
        case "$line" in
            "#"*) continue ;;
            *":"*) ;;
            *) continue ;;
        esac
        IFS=':' read -r key value <<< "$line"
        info[$key]="$value"
    done < <(redis_client info)

    if [[ -z "${info[role]}" ]]; then
        ocf_log err "monitor: Could not get role from \`$REDIS_CLIENT -s $REDIS_SOCKET info\`"
        return $OCF_ERR_GENERIC
    fi

    # Everything below only applies to a promotable resource.
    ocf_is_ms || return $OCF_SUCCESS

    # Here we see if a score has already been set.
    # If score isn't set we use the redis setting 'slave_priority'.
    # If that isn't set, we default to 1000 for a master, and 1 for slave.
    # We then add 1 for each connected client
    score="$(crm_master_reboot -G --quiet 2>/dev/null)"
    if [[ -z "$score" ]]; then
        score=$(calculate_score "${info[slave_priority]}" "${info[connected_clients]}")
        set_score "$score"
    fi

    if [[ "${info[role]}" == "master" ]]; then
        if ocf_is_probe; then
            set_master "$NODENAME"
        fi
        return $OCF_RUNNING_MASTER
    fi

    # The replication link is only verified on request.
    [ "$CHECK_SLAVE_STATE" -eq 1 ] || return $OCF_SUCCESS

    if [[ "${info[master_link_status]}" != "up" ]]; then
        ocf_log info "monitor: Slave mode link has not yet been established (link=${info[master_link_status]})"
        return $OCF_ERR_GENERIC
    fi

    # Replicating straight from the last known master: all good.
    [[ "${info[master_host]}" != "$(last_known_master)" ]] || return $OCF_SUCCESS

    if [ -z "${OCF_RESKEY_tunnel_port_map}" ]; then
        ocf_log err "monitor: Slave mode current master does not match running master. current=${info[master_host]}, running=$(last_known_master)"
        return $OCF_ERR_GENERIC
    fi

    # Tunnelled replication: the link must go through the tunnel host and
    # the port mapped to the last known master.
    master_name=$(port_to_redis_node ${info[master_port]})
    last_known_master_port=$(redis_node_to_port $(last_known_master))
    if [[ "${info[master_host]}" != "${OCF_RESKEY_tunnel_host}" ]] ||
       [[ "${info[master_port]}" != "${last_known_master_port}" ]]; then
        ocf_log err "monitor: Slave mode current tunnelled connection to redis server does not match running master. tunnelled='${info[master_host]}:${info[master_port]} (${master_name})', running='$(last_known_master)'"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}
# redis_node_to_port <node>: print the port that tunnel_port_map assigns to
# <node> (entries look like "name:port", separated by ';'). Prints nothing
# when the node is not in the map.
# The node name is handed to awk as a variable instead of being pasted into
# the program text, so a name containing quotes or awk syntax can neither
# break the lookup nor be executed as code. Concatenating "" forces a string
# comparison, which is what comparing against a string literal did before.
redis_node_to_port()
{
    local node=$1
    echo "$OCF_RESKEY_tunnel_port_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' |
        awk -F' ' -v node="$node" '($1 "") == (node "") {print $2; exit}'
}
# port_to_redis_node <port>: print the node name that tunnel_port_map assigns
# to <port> (entries look like "name:port", separated by ';'). Prints nothing
# when no entry uses that port.
# The port is handed to awk as a variable instead of being pasted into the
# program text, so unexpected characters can neither break the lookup nor be
# executed as code. Concatenating "" forces a string comparison, which is
# what comparing against a string literal did before (e.g. "06002" does not
# match "6002").
port_to_redis_node()
{
    local port=$1
    echo "$OCF_RESKEY_tunnel_port_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' |
        awk -F' ' -v port="$port" '($2 "") == (port "") {print $1; exit}'
}
# get_tunnel_port_from_master <node>: print the value of the permanent
# "<instance>-tunnel-port" node attribute of <node>, as reported by
# crm_attribute. Errors from crm_attribute are discarded.
get_tunnel_port_from_master()
{
    local attr=${INSTANCE_ATTR_NAME}-tunnel-port

    crm_attribute --node "$1" -l forever --name $attr --query -q 2>/dev/null
}
# get_master_from_tunnel_port <node>: print the value of the permanent
# "<instance>-tunnel-port" node attribute of <node>, as reported by
# crm_attribute. Errors from crm_attribute are discarded.
# NOTE(review): this performs exactly the same query as
# get_tunnel_port_from_master, which does not match what the name suggests
# (a port to master lookup) -- confirm the intended behaviour.
get_master_from_tunnel_port()
{
    local attr=${INSTANCE_ATTR_NAME}-tunnel-port

    crm_attribute --node "$1" -l forever --name $attr --query -q 2>/dev/null
}
# check_dump_file: run the dump checking tool on the configured dump file and
# print its combined stdout/stderr. Returns 0 without output when no checking
# tool is available, otherwise the tool's exit status.
# The tool and the dump path are quoted so that a directory or file name
# containing whitespace is passed as a single argument.
check_dump_file()
{
    if ! have_binary "$REDIS_CHECK_DUMP"; then
        return 0
    fi
    "$REDIS_CHECK_DUMP" "${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE}" 2>&1
}
# redis_start: start the redis server and wait until it is usable.
# Returns OCF_SUCCESS when redis is (already) running, OCF_ERR_GENERIC when
# the server neither answers INFO nor exists as a process, otherwise the
# status of the final redis_monitor call.
#
# Steps: prepare the run directory, remove a zero byte dump file, launch the
# daemon as REDIS_USER, wait for INFO to report "loading:0", wait for the
# pidfile, demote (promotable resources start as replicas) and verify.
redis_start() {
    local size
    local line key value

    redis_monitor
    status=$?
    if (( status == OCF_SUCCESS )) || (( status == OCF_RUNNING_MASTER )); then
        ocf_log info "start: redis is already running"
        return $OCF_SUCCESS
    fi

    [[ ! -d "$REDIS_RUNDIR" ]] && mkdir -p "$REDIS_RUNDIR"
    chown -R "$REDIS_USER" "$REDIS_RUNDIR"
    if have_binary "restorecon"; then
        restorecon -Rv "$REDIS_RUNDIR"
    fi

    # check for 0 byte database dump file. This is an unrecoverable start
    # condition that we can avoid by deleting the 0 byte database file.
    if [ -f "${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE}" ]; then
        size="$(stat --format "%s" "${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE}")"
        if [ "$?" -eq "0" ] && [ "$size" -eq "0" ]; then
            ocf_log notice "Detected 0 byte ${REDIS_DUMP_FILE}, deleting zero length file to avoid start failure."
            rm -f "${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE}"
        fi
    fi

    ocf_log info "start: $REDIS_SERVER --daemonize yes --unixsocket '$REDIS_SOCKET' --pidfile '$REDIS_PIDFILE'"
    output="$(su "$REDIS_USER" -s /bin/sh -c "cd '$REDIS_RUNDIR'; exec '$REDIS_SERVER' '$REDIS_CONFIG' --daemonize yes --unixsocket '$REDIS_SOCKET' --pidfile '$REDIS_PIDFILE'" 2>&1)"

    typeset -A info
    while true; do
        # wait for redis to start
        # Start every poll from an empty table so that values from an
        # earlier answer cannot be mistaken for the current state.
        info=()
        while read -r line; do
            [[ "$line" == "#"* ]] && continue
            [[ "$line" != *":"* ]] && continue
            IFS=':' read -r key value <<< "$line"
            info[$key]="$value"
        done < <(redis_client info)

        # Compare as strings: in an arithmetic test a missing "loading" key
        # evaluates to 0 and would look like "finished loading" even when
        # redis did not answer at all.
        if [[ "${info[loading]}" == "0" ]]; then
            break
        elif [[ "${info[loading]}" == "1" ]]; then
            sleep "${info[loading_eta_seconds]}"
        elif pidof $(basename "$REDIS_SERVER") >/dev/null; then
            # unknown error, but the process still exists.
            # This check is mainly because redis daemonizes before it starts listening, causing `redis-cli` to fail
            # See https://github.com/antirez/redis/issues/2368
            # It's possible that the `pidof` will pick up a different redis, but in that case, the start operation will just time out
            sleep 1
        else
            check_output="$(check_dump_file)"
            ocf_log err "start: Unknown error waiting for redis to start. redis-check-dump output=${check_output//$'\n'/; }"
            return $OCF_ERR_GENERIC
        fi
    done

    while ! [ -s "$REDIS_PIDFILE" ]; do
        ocf_log debug "start: Waiting for pid file '$REDIS_PIDFILE' to appear"
        sleep 1
    done

    ocf_is_ms && redis_demote # pacemaker expects resources to start in slave mode

    redis_monitor
    status=$?
    if (( status == OCF_SUCCESS )) || (( status == OCF_RUNNING_MASTER )); then
        return $OCF_SUCCESS
    fi

    check_output="$(check_dump_file)"
    ocf_log err "start: Unknown error starting redis. redis-server output=${output//$'\n'/; } redis-check-dump output=${check_output//$'\n'/; }"
    return $status
}
# redis_stop: stop the local redis instance.
# When redis_monitor reports anything other than "not running", SIGTERM is
# sent to the pid from the pidfile and the process is polled once per second
# until simple_status reports it gone (there is no internal time limit).
# In every case the promotion score is deleted and OCF_SUCCESS is returned.
redis_stop() {
    redis_monitor
    status=$?

    if (( status != OCF_NOT_RUNNING )); then
        pid="$(<"$REDIS_PIDFILE")"
        kill -TERM "$pid"
        until simple_status; status=$?; (( status == OCF_NOT_RUNNING )); do
            sleep 1
        done
    else
        ocf_log info "stop: redis is already stopped"
    fi

    crm_master_reboot -D
    return $OCF_SUCCESS
}
# redis_promote: make the local redis instance the master.
# Returns OCF_SUCCESS when the instance is (or has become) master, in which
# case this node is also recorded as the master; OCF_ERR_GENERIC when the
# instance is not running as a replica or does not report the master role
# after "slaveof no one".
redis_promote() {
    redis_monitor
    status=$?

    if (( status == OCF_RUNNING_MASTER )); then
        ocf_log info "promote: Already running as master"
    else
        if (( status != OCF_SUCCESS )); then
            ocf_log err "promote: Node is not running as a slave"
            return $OCF_ERR_GENERIC
        fi

        redis_client slaveof no one

        redis_monitor
        status=$?
        if (( status != OCF_RUNNING_MASTER )); then
            ocf_log err "promote: Unknown error while promoting to master (status=$status)"
            return $OCF_ERR_GENERIC
        fi
    fi

    set_master "$NODENAME"
    return $OCF_SUCCESS
}
# redis_demote: make the local redis instance a replica of the last known
# master (or of the placeholder host "no-such-master" while no master is
# available) and wait until redis_monitor reports success.
# Returns OCF_SUCCESS once the instance runs as a replica, OCF_NOT_RUNNING
# when redis is not running. The wait loop has no time limit of its own; the
# operation timeout configured in the cluster bounds it.
# Globals written: CHECK_SLAVE_STATE, status.
redis_demote() {
    local master_host
    local master_port
    local tunnel_port
    local version
    local client_kill

    # client kill is only supported in Redis 2.8.12 or greater
    version=$(redis_client -v | awk '{print $NF}')
    ocf_version_cmp "$version" "2.8.11"
    client_kill=$?

    CHECK_SLAVE_STATE=1
    redis_monitor
    status=$?
    if (( status == OCF_SUCCESS )); then
        ocf_log info "demote: Already running as slave"
        return $OCF_SUCCESS
    elif (( status == OCF_NOT_RUNNING )); then
        ocf_log err "demote: Failed to demote, redis not running."
        return $OCF_NOT_RUNNING
    fi

    master_host="$(last_known_master)"
    master_port="${REDIS_REPLICATION_PORT}"

    # The elected master has to remain a slave during startup.
    # During this period a placeholder master host is assigned.
    if [ -z "$master_host" ] || [[ "$master_host" == "$NODENAME" ]]; then
        CHECK_SLAVE_STATE=0
        master_host="no-such-master"
    elif ! master_is_active; then
        # no master has been promoted yet. we'll be notified when the
        # master starts.
        CHECK_SLAVE_STATE=0
        master_host="no-such-master"
    fi

    if [ -n "${OCF_RESKEY_tunnel_port_map}" ]; then
        # master_host can be the special marker "no-such-master"
        # while a master is being selected. In this case, no
        # tunnel port is returned, but this is not fatal.
        tunnel_port=$(redis_node_to_port "$master_host")
        if [ -n "$tunnel_port" ]; then
            ocf_log info "demote: Setting master to '$master_host' via local tunnel '${OCF_RESKEY_tunnel_host}' on port '$tunnel_port'"
            master_host="${OCF_RESKEY_tunnel_host}"
            master_port="$tunnel_port"
        fi
    else
        ocf_log info "demote: Setting master to '$master_host'"
    fi

    redis_client slaveof "$master_host" "$master_port"

    # Wait forever for the slave to connect to the master and finish the
    # sync. Timeout is controlled by Pacemaker "op start timeout=XX".
    #
    # hint: redis master_link_status will only come "up" when
    #       the SYNC with the master has completed.
    #       This can take an arbitrary time (data) and should
    #       only be parametrized by the start operation timeout
    #       by the administrator, not by this resource agent code
    #
    # The loop is left only through the return below, so no statement may
    # follow it.
    while true; do
        # Wait infinite if replication is syncing
        # Then start/demote operation timeout determines timeout
        if [ "$client_kill" -eq 2 ]; then
            redis_client CLIENT PAUSE 2000
        fi
        redis_monitor
        status=$?
        if (( status == OCF_SUCCESS )); then
            if [ "$client_kill" -eq 2 ]; then
                redis_client CLIENT KILL type normal
            fi
            return $OCF_SUCCESS
        fi
        sleep 1
    done
}
# redis_notify: handle cluster notifications.
# After a demote or promote has happened somewhere, an instance that
# currently monitors as a healthy replica re-runs redis_demote so that it
# follows the newly appointed master. Always returns OCF_SUCCESS.
redis_notify() {
    mode="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}"

    if [[ "$mode" == "post-demote" || "$mode" == "post-promote" ]]; then
        # change the master
        redis_monitor
        status=$?
        if (( status == OCF_SUCCESS )); then
            # we're a slave: calling demote updates the slave's connection
            # to the newly appointed Master instance.
            redis_demote
        fi
    fi

    return $OCF_SUCCESS
}
# redis_validate: check that the agent can work on this node.
# Returns OCF_ERR_INSTALLED when the server or client binary is missing or
# not executable, OCF_ERR_CONFIGURED when the config file does not exist or
# the configured user is unknown to getent, and 0 when everything is fine.
redis_validate() {
    local bin

    # Both binaries must exist and be executable; the server is checked first.
    for bin in "$REDIS_SERVER" "$REDIS_CLIENT"; do
        if [[ ! -x "$bin" ]]; then
            ocf_log err "validate: $bin does not exist or is not executable"
            return $OCF_ERR_INSTALLED
        fi
    done

    [[ -f "$REDIS_CONFIG" ]] || {
        ocf_log err "validate: $REDIS_CONFIG does not exist"
        return $OCF_ERR_CONFIGURED
    }

    getent passwd "$REDIS_USER" >/dev/null 2>&1 || {
        ocf_log err "validate: $REDIS_USER is not a valid user"
        return $OCF_ERR_CONFIGURED
    }
}
# Main: resolve the node name, read the client password, then dispatch the
# requested action and exit with its status.

# The node name is not needed (and not looked up) for meta-data.
if [ "$__OCF_ACTION" != "meta-data" ]; then
    NODENAME=$(ocf_attribute_target)
fi

# Take the password from the last requirepass directive of the config file.
# The redirection target is quoted: unquoted, a config path containing
# whitespace makes the redirection fail ("ambiguous redirect") and the
# password would silently stay empty.
if [ -r "$REDIS_CONFIG" ]; then
    clientpasswd="$(sed -n -e 's/^\s*requirepass\s*\(.*\)\s*$/\1/p' < "$REDIS_CONFIG" | tail -n 1)"
fi

# Only a start validates the environment up front.
if [ "$__OCF_ACTION" = "start" ]; then
    redis_validate || exit $?
fi

redis_cli_features

ocf_log debug "action=${1:-$__OCF_ACTION} notify_type=${OCF_RESKEY_CRM_meta_notify_type} notify_operation=${OCF_RESKEY_CRM_meta_notify_operation} master_host=${OCF_RESKEY_CRM_meta_notify_master_uname} slave_host=${OCF_RESKEY_CRM_meta_notify_slave_uname} promote_host=${OCF_RESKEY_CRM_meta_notify_promote_uname} demote_host=${OCF_RESKEY_CRM_meta_notify_demote_uname}; params: bin=${OCF_RESKEY_bin} client_bin=${OCF_RESKEY_client_bin} config=${OCF_RESKEY_config} user=${OCF_RESKEY_user} rundir=${OCF_RESKEY_rundir} port=${OCF_RESKEY_port}"

# The first command line argument wins over __OCF_ACTION.
case "${1:-$__OCF_ACTION}" in
    status|monitor)
        redis_monitor
        ;;
    start)
        redis_start
        ;;
    stop)
        redis_stop
        ;;
    restart)
        redis_stop && redis_start
        ;;
    promote)
        redis_promote
        ;;
    demote)
        redis_demote
        ;;
    notify)
        redis_notify
        ;;
    meta-data)
        redis_meta_data
        ;;
    validate-all)
        redis_validate
        ;;
    *)
        echo "Usage: $0 {monitor|start|stop|restart|promote|demote|notify|validate-all|meta-data}"
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
status=$?
ocf_log debug "exit_status=$status"
exit $status

File Metadata

Mime Type
text/x-diff
Expires
Thu, Feb 27, 1:14 AM (17 h, 11 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1465983
Default Alt Text
(133 KB)

Event Timeline