diff --git a/heartbeat/rabbitmq-server-ha.ocf b/heartbeat/rabbitmq-server-ha.ocf new file mode 100755 index 000000000..2425e0fed --- /dev/null +++ b/heartbeat/rabbitmq-server-ha.ocf @@ -0,0 +1,2437 @@ +#!/bin/sh +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# See usage() function below for more details ... +# +# Note that the script uses an external file to setup RabbitMQ policies +# so make sure to create it from an example shipped with the package. +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +# Fill in some defaults if no values are specified + +PATH=/sbin:/usr/sbin:/bin:/usr/bin + +OCF_RESKEY_binary_default="/usr/sbin/rabbitmq-server" +OCF_RESKEY_ctl_default="/usr/sbin/rabbitmqctl" +OCF_RESKEY_debug_default=false +OCF_RESKEY_username_default="rabbitmq" +OCF_RESKEY_groupname_default="rabbitmq" +OCF_RESKEY_admin_user_default="guest" +OCF_RESKEY_admin_password_default="guest" +OCF_RESKEY_definitions_dump_file_default="/etc/rabbitmq/definitions" +OCF_RESKEY_pid_file_default="/var/run/rabbitmq/pid" +OCF_RESKEY_log_dir_default="/var/log/rabbitmq" +OCF_RESKEY_mnesia_base_default="/var/lib/rabbitmq/mnesia" +OCF_RESKEY_mnesia_schema_base_default="/var/lib/rabbitmq" +OCF_RESKEY_host_ip_default="127.0.0.1" +OCF_RESKEY_node_port_default=5672 +OCF_RESKEY_default_vhost_default="/" +OCF_RESKEY_erlang_cookie_default=false +OCF_RESKEY_erlang_cookie_file_default="/var/lib/rabbitmq/.erlang.cookie" +OCF_RESKEY_use_fqdn_default=false +OCF_RESKEY_fqdn_prefix_default="" +OCF_RESKEY_max_rabbitmqctl_timeouts_default=3 +OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy" +OCF_RESKEY_rmq_feature_health_check_default=true +OCF_RESKEY_rmq_feature_local_list_queues_default=true +OCF_RESKEY_limit_nofile_default=65535 +OCF_RESKEY_avoid_using_iptables_default=false +OCF_RESKEY_allowed_cluster_nodes_default="" + +: ${HA_LOGTAG="lrmd"} +: ${HA_LOGFACILITY="daemon"} +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_ctl=${OCF_RESKEY_ctl_default}} +: ${OCF_RESKEY_debug=${OCF_RESKEY_debug_default}} +: ${OCF_RESKEY_username=${OCF_RESKEY_username_default}} +: ${OCF_RESKEY_groupname=${OCF_RESKEY_groupname_default}} +: ${OCF_RESKEY_admin_user=${OCF_RESKEY_admin_user_default}} +: ${OCF_RESKEY_admin_password=${OCF_RESKEY_admin_password_default}} +: ${OCF_RESKEY_definitions_dump_file=${OCF_RESKEY_definitions_dump_file_default}} +: 
${OCF_RESKEY_log_dir=${OCF_RESKEY_log_dir_default}} +: ${OCF_RESKEY_mnesia_base=${OCF_RESKEY_mnesia_base_default}} +: ${OCF_RESKEY_mnesia_schema_base=${OCF_RESKEY_mnesia_schema_base_default}} +: ${OCF_RESKEY_pid_file=${OCF_RESKEY_pid_file_default}} +: ${OCF_RESKEY_node_port=${OCF_RESKEY_node_port_default}} +: ${OCF_RESKEY_default_vhost=${OCF_RESKEY_default_vhost_default}} +: ${OCF_RESKEY_erlang_cookie=${OCF_RESKEY_erlang_cookie_default}} +: ${OCF_RESKEY_erlang_cookie_file=${OCF_RESKEY_erlang_cookie_file_default}} +: ${OCF_RESKEY_use_fqdn=${OCF_RESKEY_use_fqdn_default}} +: ${OCF_RESKEY_fqdn_prefix=${OCF_RESKEY_fqdn_prefix_default}} +: ${OCF_RESKEY_max_rabbitmqctl_timeouts=${OCF_RESKEY_max_rabbitmqctl_timeouts_default}} +: ${OCF_RESKEY_policy_file=${OCF_RESKEY_policy_file_default}} +: ${OCF_RESKEY_rmq_feature_health_check=${OCF_RESKEY_rmq_feature_health_check_default}} +: ${OCF_RESKEY_rmq_feature_local_list_queues=${OCF_RESKEY_rmq_feature_local_list_queues_default}} +: ${OCF_RESKEY_limit_nofile=${OCF_RESKEY_limit_nofile_default}} +: ${OCF_RESKEY_avoid_using_iptables=${OCF_RESKEY_avoid_using_iptables_default}} +: ${OCF_RESKEY_allowed_cluster_nodes=${OCF_RESKEY_allowed_cluster_nodes_default}} + +####################################################################### + +OCF_RESKEY_CRM_meta_timeout_default=30000 +: ${OCF_RESKEY_CRM_meta_timeout=${OCF_RESKEY_CRM_meta_timeout_default}} +OCF_RESKEY_start_time_default=$((OCF_RESKEY_CRM_meta_timeout / 6000 + 2)) +: ${OCF_RESKEY_start_time=${OCF_RESKEY_start_time_default}} +OCF_RESKEY_stop_time_default=${OCF_RESKEY_start_time_default} +: ${OCF_RESKEY_stop_time=${OCF_RESKEY_start_time_default}} +OCF_RESKEY_command_timeout_default="" +: ${OCF_RESKEY_command_timeout=${OCF_RESKEY_command_timeout_default}} +TIMEOUT_ARG=$((OCF_RESKEY_CRM_meta_timeout / 6000 + 30)) +COMMAND_TIMEOUT="/usr/bin/timeout ${OCF_RESKEY_command_timeout} ${TIMEOUT_ARG}" +RESOURCE_NAME=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1` + 
+####################################################################### + +usage() { + cat < + + +1.0 + + +Resource agent for RabbitMQ promotes a Master, then cluster nodes can join it + +Resource agent for RabbitMQ HA cluster + + + + +RabbitMQ binary + +RabbitMQ binary + + + + + +rabbitctl binary + +rabbitctl binary binary + + + + + +RabbitMQ PID file + +RabbitMQ PID file + + + + + +RabbitMQ log directory + +RabbitMQ log directory + + + + + +RabbitMQ user name + +RabbitMQ user name + + + + + +RabbitMQ group name + +RabbitMQ group name + + + + + +RabbitMQ default admin user for API + +RabbitMQ admin user + + + + + +RabbitMQ default admin user password for API + +RabbitMQ admin password + + + + + +RabbitMQ default definitions dump file + +RabbitMQ definitions dump file + + + + + +Timeout command arguments for issued commands termination (value is auto evaluated) + +Arguments for timeout wrapping command + + + + + +Timeout for start rabbitmq server + +Timeout for start rabbitmq server + + + + + +Timeout for stopping rabbitmq server + +Timeout for stopping rabbitmq server + + + + + +The debug flag for agent (${OCF_RESKEY_binary}) instance. +In the /tmp/ directory will be created rmq-* files for log +some operations and ENV values inside OCF-script. + +AMQP server (${OCF_RESKEY_binary}) debug flag + + + + + +Base directory for storing Mnesia files + +Base directory for storing Mnesia files + + + + + +Parent directory for Mnesia schema directory + +Parent directory for Mnesia schema directory + + + + + +${OCF_RESKEY_binary} should listen on this IP address + +${OCF_RESKEY_binary} should listen on this IP address + + + + + +${OCF_RESKEY_binary} should listen on this port + +${OCF_RESKEY_binary} should listen on this port + + + + + +Default virtual host used for monitoring if a node is fully synchronized with +the rest of the cluster. 
In normal operation, the resource agent will wait for +queues from this virtual host on this node to be synchronized elsewhere before +stopping RabbitMQ. This also means queues in other virtual hosts may not be +fully synchronized on stop operations. + +Default virtual host used for waiting for synchronization + + + + + +Erlang cookie for clustering. If specified, will be updated at the mnesia reset + +Erlang cookie + + + + + +Erlang cookie file path where the cookie will be put, if requested + +Erlang cookie file + + + + + +Either to use FQDN or a shortname for the rabbitmq node + +Use FQDN + + + + + +Optional FQDN prefix for RabbitMQ nodes in cluster. +FQDN prefix can be specified to host multiple RabbitMQ instances on a node or +in case of RabbitMQ running in dedicated network/interface. + +FQDN prefix + + + + + +If during monitor call rabbitmqctl times out, the timeout is ignored +unless it is Nth timeout in a row. Here N is the value of the current parameter. +If too many timeouts happen in a raw, the monitor call will return with error. + +Fail only if that many rabbitmqctl timeouts in a row occurred + + + + + +A path to the shell script to setup RabbitMQ policies + +A policy file path + + + + + +Since rabbit 3.6.4 list_queues/list_channels-based monitoring should +be replaced with "node_health_check" command, as it creates no network +load at all. + +Use node_health_check for monitoring + + + + + +For rabbit version that implements --local flag for list_queues, this +can greatly reduce network overhead in cases when node is +stopped/demoted. + +Use --local option for list_queues + + + + + +Soft and hard limit for NOFILE + +NOFILE limit + + + + + +When set to true the iptables calls to block client access become +noops. This is useful when we run inside containers. 
+ +Disable iptables use entirely + + + + + +When set to anything other than the empty string it must container the list of +cluster node names, separated by spaces, where the rabbitmq resource is allowed to run. +Tis is needed when rabbitmq is running on a subset of nodes part of a larger +cluster. The default ("") is to assume that all nodes part of the cluster will +run the rabbitmq resource. + +List of cluster nodes where rabbitmq is allowed to run + + + +$EXTENDED_OCF_PARAMS + + + + + + + + + + + + + + + + +END +} + + +MIN_MASTER_SCORE=100 +BEST_MASTER_SCORE=1000 + + +####################################################################### +# Functions invoked by resource manager actions + +#TODO(bogdando) move proc_kill, proc_stop to shared OCF functions +# to be shipped with HA cluster packages +########################################################### +# Attempts to kill a process with retries and checks procfs +# to make sure the process is stopped. +# +# Globals: +# LL +# Arguments: +# $1 - pid of the process to try and kill +# $2 - service name used for logging and match-based kill, if the pid is "none" +# $3 - signal to use, defaults to SIGTERM +# $4 - number of retries, defaults to 5 +# $5 - time to sleep between retries, defaults to 2 +# Returns: +# 0 - if successful +# 1 - if process is still running according to procfs +# 2 - if invalid parameters passed in +########################################################### +proc_kill() +{ + local pid="${1}" + local service_name="${2}" + local signal="${3:-SIGTERM}" + local count="${4:-5}" + local process_sleep="${5:-2}" + local LH="${LL} proc_kill():" + local pgrp="$(ps -o pgid= ${pid} 2>/dev/null | tr -d '[[:space:]]')" + + if [ "${pid}" ] && [ "${pgrp}" = "1" ] ; then + ocf_log err "${LH} shall not kill by the bad pid 1 (init)!" 
+ return 2 + fi + + if [ "${pid}" = "none" ]; then + local matched + matched="$(pgrep -fla ${service_name})" + if [ -z "${matched}" ] ; then + ocf_log info "${LH} cannot find any processes matching the ${service_name}, considering target process to be already dead" + return 0 + fi + ocf_log debug "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}" + while [ $count -gt 0 ]; do + if [ -z "${matched}" ]; then + break + else + matched="$(pgrep -fla ${service_name})" + ocf_log debug "${LH} Stopping ${service_name} with ${signal}..." + ocf_run pkill -f -"${signal}" "${service_name}" + fi + sleep $process_sleep + count=$(( count-1 )) + done + pgrep -f "${service_name}" > /dev/null + if [ $? -ne 0 ] ; then + ocf_log debug "${LH} Stopped ${service_name} with ${signal}" + return 0 + else + ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}" + return 1 + fi + else + # pid is not none + while [ $count -gt 0 ]; do + if [ ! -d "/proc/${pid}" ]; then + break + else + ocf_log debug "${LH} Stopping ${service_name} with ${signal}..." + ocf_run pkill -"${signal}" -g "${pgrp}" + fi + sleep $process_sleep + count=$(( count-1 )) + done + + # Check if the process ended after the last sleep + if [ ! -d "/proc/${pid}" ] ; then + ocf_log debug "${LH} Stopped ${service_name} with ${signal}" + return 0 + fi + + ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}" + return 1 + fi +} + +########################################################### +# Attempts to kill a process with the given pid or pid file +# using proc_kill and will retry with sigkill if sigterm is +# unsuccessful. 
#
# Globals:
#   OCF_ERR_GENERIC
#   OCF_SUCCESS
#   LL
# Arguments:
#   $1 - pidfile or pid or 'none', if stopping by the name matching
#   $2 - service name used for logging or for the failback stopping method
#   $3 - stop process timeout (in sec), used to determine how many times we try
#        SIGTERM and an upper limit on how long this function should try and
#        stop the process. Defaults to 15.
# Returns:
#   OCF_SUCCESS - if successful
#   OCF_ERR_GENERIC - if process is still running according to procfs
###########################################################
proc_stop()
{
    local pid_param="${1}"
    local service_name="${2}"
    local timeout="${3:-15}"
    local LH="${LL} proc_stop():"
    local i
    local pid
    local pidfile
    local stop_count

    # Classify $1 with a shell pattern instead of spawning (obsolescent) egrep:
    # the literal 'none', a bare decimal number, an existing file, or junk.
    case "${pid_param}" in
        none)
            pid="none"
            ;;
        ''|*[!0-9]*)
            if [ -e "${pid_param}" ]; then
                # A pidfile may hold several pids: one per word, de-duplicated
                pidfile="${pid_param}"
                pid=$(cat "${pidfile}" 2>/dev/null | tr -s " " "\n" | sort -u)
                if [ -z "${pid}" ] ; then
                    ocf_log warn "${LH} unable to get PID from ${pidfile}, try match by ${service_name}"
                    pid="none"
                fi
            else
                ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}"
                pid="none"
            fi
            ;;
        *)
            pid="${pid_param}"
            ;;
    esac

    # number of times to try a SIGTERM is (timeout - 5 seconds) / 2 seconds,
    # but make sure we stop at least once
    stop_count=$(( (timeout - 5) / 2 ))
    if [ "${stop_count}" -le 0 ]; then
        stop_count=1
    fi

    # ${pid} is deliberately unquoted: one iteration per pid (or for 'none')
    for i in ${pid} ; do
        ocf_log info "${LH} Stopping ${service_name} by PID ${i}"
        if ! proc_kill "${i}" "${service_name}" SIGTERM "${stop_count}" ; then
            # SIGTERM failed, send a single SIGKILL
            if ! proc_kill "${i}" "${service_name}" SIGKILL 1 2 ; then
                ocf_log err "${LH} ERROR: could not stop ${service_name}"
                return "${OCF_ERR_GENERIC}"
            fi
        fi
    done

    # Remove the pid file here which will remove empty pid files as well
    if [ -n "${pidfile}" ]; then
        rm -f "${pidfile}"
    fi

    ocf_log info "${LH} Stopped ${service_name}"
    return "${OCF_SUCCESS}"
}

# Invokes the given command as a rabbitmq user and wrapped in the
# timeout command.
#
# Arguments:
#   -t <duration> - optional; replaces the duration of the default
#                   COMMAND_TIMEOUT wrapper for this single call
#   $1            - command line to run through `sh -c`; defaults to 'status'
# Returns:
#   the exit status reported by `su` for the wrapped command
su_rabbit_cmd() {
    local timeout
    if [ "$1" = "-t" ]; then
        timeout="/usr/bin/timeout ${OCF_RESKEY_command_timeout} $2"
        shift 2
    else
        timeout=$COMMAND_TIMEOUT
    fi
    local cmd="${1:-status}"
    local LH="${LL} su_rabbit_cmd():"
    local rc=1
    local user=$OCF_RESKEY_username
    local mail=/var/spool/mail/rabbitmq
    local pwd=/var/lib/rabbitmq
    local home=/var/lib/rabbitmq

    ocf_log debug "${LH} invoking a command: ${cmd}"
    # The environment assignments and the timeout wrapper are part of the
    # string handed to the user's shell; ${cmd} may carry its own redirections.
    su "${user}" -s /bin/sh -c "USER=${user} MAIL=${mail} PWD=${pwd} HOME=${home} LOGNAME=${user} \
      ${timeout} ${cmd}"
    rc=$?
    ocf_log info "${LH} the invoked command exited ${rc}: ${cmd}"
    return $rc
}

# Prints the current time as seconds since the epoch (UTC).
now() {
    date -u +%s
}

# Raises this shell's NOFILE limit to OCF_RESKEY_limit_nofile when that value
# is set and larger than the limit seen by the rabbitmq user. Does nothing
# when the probed limit is not a plain number (empty, 'unlimited'), which
# previously produced a '[' error before being skipped anyway.
set_limits() {
    local current_limit
    current_limit=$(su "${OCF_RESKEY_username}" -s /bin/sh -c "ulimit -n")

    [ -n "${OCF_RESKEY_limit_nofile}" ] || return 0
    case "${current_limit}" in
        ''|*[!0-9]*) return 0 ;;
    esac

    if [ "${OCF_RESKEY_limit_nofile}" -gt "${current_limit}" ] ; then
        ulimit -n "${OCF_RESKEY_limit_nofile}"
    fi
}

# Sets the reboot-lifetime master score of this node through crm_master.
#
# Arguments:
#   $1 - score to set; defaults to 0 when empty or missing
# Returns:
#   OCF_SUCCESS, or OCF_ERR_GENERIC when the crm_master call fails
master_score() {
    local LH="${LL} master_score():"
    local score="${1:-0}"

    ocf_log info "${LH} Updating master score attribute with ${score}"
    ocf_run crm_master -N $THIS_PCMK_NODE -l reboot -v $score || return $OCF_ERR_GENERIC
    return $OCF_SUCCESS
}

# Return either FQDN or shortname, depends on the OCF_RESKEY_use_fqdn.
get_hostname() {
    # Prints the short host name when OCF_RESKEY_use_fqdn is the literal
    # 'false', otherwise whatever `hostname -f` (plain `hostname` on SunOS)
    # reports.
    local os
    local name
    os=$(uname -s)

    if [ "${OCF_RESKEY_use_fqdn}" = 'false' ] ; then
        if [ "${os}" = "SunOS" ]; then
            # no `hostname -s` here: cut everything from the first dot by hand
            name=$(hostname | sed 's@\..*@@')
        else
            name=$(hostname -s)
        fi
    else
        if [ "${os}" = "SunOS" ]; then
            name=$(hostname)
        else
            name=$(hostname -f)
        fi
    fi
    echo "${name}"
}

# Prepend OCF_RESKEY_fqdn_prefix to the given host name and print the result.
# When OCF_RESKEY_use_fqdn is 'false' the prefixed name is additionally cut
# at its first dot, i.e. reduced to a shortname.
process_fqdn() {
    local name="${OCF_RESKEY_fqdn_prefix}$1"

    if [ "${OCF_RESKEY_use_fqdn}" = 'false' ] ; then
        echo "${name%%.*}"
    else
        echo "${name}"
    fi
}

# Return OCF_SUCCESS, if current host is in the list of given hosts.
# Otherwise, return 10
#
# Arguments:
#   $1 - whitespace separated list of host names
my_host() {
    local hostlist="$1"
    local hostname
    local host
    local hn
    local rc=10
    local LH="${LL} my_host():"

    hostname=$(process_fqdn $(get_hostname))
    ocf_log debug "${LH} hostlist is: $hostlist"
    # both sides go through process_fqdn so that they are compared in the
    # same (short or full, prefixed) form
    for host in $hostlist ; do
        hn=$(process_fqdn "${host}")
        ocf_log debug "${LH} comparing '$hostname' with '$hn'"
        if [ "${hostname}" = "${hn}" ] ; then
            rc=$OCF_SUCCESS
            break
        fi
    done

    return $rc
}

# Print the value of a reboot-lifetime node attribute, or 0 when the
# attribute cannot be queried, is unset or is reported as "(null)".
#
# Arguments:
#   $1 - node name
#   $2 - attribute name
get_integer_node_attr() {
    local value

    # the value is taken from the third field of the query output, after '='
    value=$(crm_attribute -N "$1" -l reboot --name "$2" --query 2>/dev/null | awk '{ split($3, vals, "="); if (vals[2] != "(null)") print vals[2] }')
    if [ -z "$value" ] ; then
        value=0
    fi
    echo $value
}

# Print the 'rabbit-start-time' attribute of node $1 (0 when unset).
get_node_start_time() {
    get_integer_node_attr "$1" 'rabbit-start-time'
}

# Print the master score attribute of node $1 for this resource (0 when unset).
get_node_master_score() {
    get_integer_node_attr "$1" "master-${RESOURCE_NAME}"
}

# Return either rabbit node name as FQDN or shortname, depends on the OCF_RESKEY_use_fqdn.
rabbit_node_name() {
    # Prints "rabbit@<host>", where <host> is $1 passed through
    # ocf_attribute_target and then process_fqdn. $1 is left unquoted on
    # purpose: an empty argument means "no argument" for ocf_attribute_target.
    echo "rabbit@$(process_fqdn $(ocf_attribute_target $1))"
}

# Prepare the environment shared by the actions of this agent:
#  - exports RABBITMQ_NODENAME, RABBITMQ_PID_FILE, RABBITMQ_SERVER_START_ARGS
#    and, for a non-default port, RABBITMQ_NODE_PORT;
#  - sets the globals MNESIA_FILES, RMQ_START_TIME, MASTER_FLAG_FILE,
#    THIS_PCMK_NODE, TOTALVMEM and the log prefix LL;
#  - creates the pidfile directory when missing and chowns the pidfile,
#    mnesia and log directories when the configured user cannot write them;
#  - finally refreshes the erlang cookie via update_cookie.
rmq_setup_env() {
    local H
    local dir
    local files
    local node_name
    local PID_DIR

    H="$(get_hostname)"
    # resolved once; every assignment is quoted because some /bin/sh
    # implementations word-split unquoted `export VAR=$(...)`
    node_name="$(rabbit_node_name $H)"

    export RABBITMQ_NODENAME="${node_name}"
    if [ "$OCF_RESKEY_node_port" != "$OCF_RESKEY_node_port_default" ]; then
        export RABBITMQ_NODE_PORT="$OCF_RESKEY_node_port"
    fi
    export RABBITMQ_PID_FILE="$OCF_RESKEY_pid_file"
    MNESIA_FILES="${OCF_RESKEY_mnesia_base}/${node_name}"
    export RABBITMQ_SERVER_START_ARGS="${RABBITMQ_SERVER_START_ARGS} -mnesia dir \"${MNESIA_FILES}\" -sname ${node_name}"
    RMQ_START_TIME="${MNESIA_FILES}/ocf_server_start_time.txt"
    MASTER_FLAG_FILE="${MNESIA_FILES}/ocf_master_for_${OCF_RESOURCE_INSTANCE}"
    THIS_PCMK_NODE=$(ocf_attribute_target)
    TOTALVMEM=`free -mt | awk '/Total:/ {print $2}'`

    # check and make PID file dir
    PID_DIR=$(dirname "${OCF_RESKEY_pid_file}")
    if [ ! -d "${PID_DIR}" ] ; then
        mkdir -p "${PID_DIR}"
        chown -R "${OCF_RESKEY_username}:${OCF_RESKEY_groupname}" "${PID_DIR}"
        chmod 755 "${PID_DIR}"
    fi

    # Regardless of whether we just created the directory or it
    # already existed, check whether it is writable by the configured
    # user
    for dir in "${PID_DIR}" "${OCF_RESKEY_mnesia_base}" "${OCF_RESKEY_log_dir}"; do
        [ -e "${dir}" ] || continue
        # any entry reported by find is one the user cannot write
        files=$(su -s /bin/sh - "${OCF_RESKEY_username}" -c "find ${dir} ! -writable")
        if [ "${files}" ]; then
            ocf_log warn "Directory ${dir} is not writable by ${OCF_RESKEY_username}, chowning."
            chown -R "${OCF_RESKEY_username}:${OCF_RESKEY_groupname}" "${dir}"
        fi
    done

    export LL="${OCF_RESOURCE_INSTANCE}[$$]:"
    update_cookie
}

# Return a RabbitMQ node to its virgin state.
# For reset and force_reset to succeed the RabbitMQ application must have been stopped.
# If the app cannot be stopped, beam will be killed and mnesia files will be removed.
reset_mnesia() {
    # Tries `rabbitmqctl reset`, then `force_reset`; when neither is possible
    # (no beam process, app cannot be stopped, both resets fail) the beam
    # process is killed and the mnesia directories are removed instead.
    local LH="${LL} reset_mnesia():"
    local make_amnesia=false

    # check status of a beam process
    if get_status ; then
        # beam is running
        # check status of rabbit app and stop it, if it is running
        if get_status rabbit ; then
            # rabbit app is running, have to stop it
            ocf_log info "${LH} Stopping RMQ-app prior to reset the mnesia."
            if ! stop_rmq_server_app ; then
                ocf_log warn "${LH} RMQ-app can't be stopped."
                make_amnesia=true
            fi
        fi

        if ! $make_amnesia ; then
            # rabbit app is not running, reset mnesia
            ocf_log info "${LH} Execute reset with timeout: ${TIMEOUT_ARG}"
            if ! su_rabbit_cmd "${OCF_RESKEY_ctl} reset" ; then
                ocf_log info "${LH} Execute force_reset with timeout: ${TIMEOUT_ARG}"
                if ! su_rabbit_cmd "${OCF_RESKEY_ctl} force_reset" ; then
                    ocf_log warn "${LH} Mnesia couldn't cleaned, even by force-reset command."
                    make_amnesia=true
                fi
            fi
        fi
    else
        # there is no beam running
        make_amnesia=true
        ocf_log warn "${LH} There is no Beam process running."
    fi

    # remove mnesia files, if required
    if $make_amnesia ; then
        kill_rmq_and_remove_pid
        ocf_run rm -rf "${MNESIA_FILES}"
        mnesia_schema_location="${OCF_RESKEY_mnesia_schema_base}/Mnesia.$(rabbit_node_name $(get_hostname))"
        ocf_run rm -rf "$mnesia_schema_location"
        ocf_log warn "${LH} Mnesia files appear corrupted and have been removed from ${MNESIA_FILES} and $mnesia_schema_location"
    fi
    # always return OCF SUCCESS
    return $OCF_SUCCESS
}


# Succeeds when an iptables rule carrying the 'temporary RMQ block' comment
# is currently listed.
_rmq_block_rule_present()
{
    iptables -nvL --wait | grep -q 'temporary RMQ block'
}

# Reject client connections to the RabbitMQ port with a temporary iptables
# rule.
#
# Returns:
#   OCF_SUCCESS     - the rule is in place (or iptables use is disabled)
#   OCF_ERR_GENERIC - the rule is still missing after 5 insert attempts
block_client_access()
{
    # When OCF_RESKEY_avoid_using_iptables is true iptables calls are noops
    if [ "${OCF_RESKEY_avoid_using_iptables}" = 'true' ] ; then
        return $OCF_SUCCESS
    fi
    # do not add temporary RMQ blocking rule, if it is already exist
    # otherwise, try to add a blocking rule with max of 5 retries
    local tries=5
    while [ "${tries}" -gt 0 ]; do
        if _rmq_block_rule_present ; then
            return $OCF_SUCCESS
        fi
        tries=$((tries-1))
        iptables --wait -I INPUT -p tcp -m tcp --dport ${OCF_RESKEY_node_port} -m state --state NEW,RELATED,ESTABLISHED \
        -m comment --comment 'temporary RMQ block' -j REJECT --reject-with tcp-reset
        sleep 1
    done
    # The last attempt may well have succeeded: decide by the presence of the
    # rule, not by the exhausted retry counter.
    if _rmq_block_rule_present ; then
        return $OCF_SUCCESS
    fi
    return $OCF_ERR_GENERIC
}

# Remove every temporary blocking rule added by block_client_access.
#
# Arguments:
#   $1 - log prefix of the caller; 'none' is logged when it is missing
unblock_client_access()
{
    local lhtext="${1:-none}"
    local i

    # When OCF_RESKEY_avoid_using_iptables is true iptables calls are noops
    if [ "${OCF_RESKEY_avoid_using_iptables}" = 'true' ] ; then
        return
    fi
    # remove all temporary RMQ blocking rules, if there are more than one exist
    # (one delete call per listed rule; the rule is matched by its spec)
    for i in $(iptables -nvL --wait --line-numbers | awk '/temporary RMQ block/ {print $1}'); do
        iptables --wait -D INPUT -p tcp -m tcp --dport ${OCF_RESKEY_node_port} -m state --state NEW,RELATED,ESTABLISHED \
        -m comment --comment 'temporary RMQ block' -j REJECT --reject-with tcp-reset
    done
    ocf_log info "${lhtext} unblocked access to RMQ port"
}

# Print the space separated list of rabbit nodes known to mnesia.
#
# Arguments:
#   $1 - 'nodes' for all db nodes, 'running' for the running db nodes
# Returns:
#   OCF_SUCCESS, or OCF_ERR_GENERIC (printing an empty line) when the
#   rabbitmqctl eval call fails
get_nodes__base(){
    local infotype=''
    local c_status

    case "$1" in
        nodes)   infotype='db_nodes' ;;
        running) infotype='running_db_nodes' ;;
    esac
    c_status=`${OCF_RESKEY_ctl} eval "mnesia:system_info(${infotype})." 2>/dev/null`
    if [ $? -ne 0 ] ; then
        echo ''
        return $OCF_ERR_GENERIC
    fi
    # translate line like '{running_nodes,['rabbit@node-1','rabbit@node-2','rabbit@node-3']},' to node_list
    echo $(echo "${c_status}" | awk -F, '{ for (i=1;i<=NF;i++) { if ($i ~ /@/) { gsub(/[\[\]}{]/,"",$i); print $i; } }}' | tr -d  "\'")
    return $OCF_SUCCESS
}

# Print all rabbit nodes known to mnesia; the status of the query is
# propagated to the caller.
get_nodes() {
    local nodes
    local rc

    nodes=$(get_nodes__base nodes)
    rc=$?
    echo $nodes
    return $rc
}

# Print the running rabbit nodes known to mnesia; the status of the query is
# propagated to the caller.
get_running_nodes() {
    local nodes
    local rc

    nodes=$(get_nodes__base running)
    rc=$?
    echo $nodes
    return $rc
}

# Get alive cluster nodes in visible partition, but the specified one
#
# Arguments:
#   $1 - optional node name to leave out; it is compared as a whole word, so
#        excluding 'node-1' no longer mangles 'node-10'
get_alive_pacemaker_nodes_but()
{
    local excluded="$1"
    local member
    local i

    # NOTE(review): tmp_pcmk_node_list and pcmk_node_list were never declared
    # local; they are kept global in case code outside this view reads them.
    tmp_pcmk_node_list=''
    for member in $(crm_node -l -p | sed -e '/(null)/d'); do
        if [ -n "${excluded}" ] && [ "${member}" = "${excluded}" ]; then
            continue
        fi
        tmp_pcmk_node_list="${tmp_pcmk_node_list} ${member}"
    done
    # If OCF_RESKEY_allowed_cluster_nodes is set then we only want the intersection
    # of the cluster node output and the allowed_cluster_nodes list
    if [ -z "${OCF_RESKEY_allowed_cluster_nodes}" ]; then
        pcmk_node_list=$tmp_pcmk_node_list
    else
        # a name present in both lists shows up twice: keep the duplicates
        pcmk_node_list=`for i in $tmp_pcmk_node_list ${OCF_RESKEY_allowed_cluster_nodes}; do echo $i; done | sort | uniq -d`
    fi
    echo $pcmk_node_list
}

# Get current master.
If a parameter is provided, +# do not check node with that name +get_master_name_but() +{ + local node + for node in $(get_alive_pacemaker_nodes_but "$@") + do + ocf_log info "${LH} looking if $node is master" + + if is_master $node; then + ocf_log info "${LH} master is $node" + echo $node + break + fi + done +} + +# Evals some erlang code on current node +erl_eval() { + local fmt="${1:?}" + shift + + $COMMAND_TIMEOUT ${OCF_RESKEY_ctl} eval "$(printf "$fmt" "$@")" 2>/dev/null +} + +# Returns 0 if we are clustered with provideded node +is_clustered_with() +{ + local LH="${LH}: is_clustered_with: " + local node_name + local rc + node_name=$(rabbit_node_name $1) + + local seen_as_running + seen_as_running=$(erl_eval "lists:member('%s', rabbit_mnesia:cluster_nodes(running))." "$node_name") + rc=$? + if [ "$rc" -ne 0 ]; then + ocf_log err "${LH} Failed to check whether '$node_name' is considered running by us" + # We had a transient local error; that doesn't mean the remote node is + # not part of the cluster, so ignore this + elif [ "$seen_as_running" != true ]; then + ocf_log info "${LH} Node $node_name is not running, considering it not clustered with us" + return 1 + fi + + local seen_as_partitioned + seen_as_partitioned=$(erl_eval "lists:member('%s', rabbit_node_monitor:partitions())." "$node_name") + rc=$? + if [ "$rc" -ne 0 ]; then + ocf_log err "${LH} Failed to check whether '$node_name' is partitioned with us" + # We had a transient local error; that doesn't mean the remote node is + # partitioned with us, so ignore this + elif [ "$seen_as_partitioned" != false ]; then + ocf_log info "${LH} Node $node_name is partitioned from us" + return 1 + fi + + return $? 
+} + + +check_need_join_to() { + local join_to + local node + local running_nodes + local rc=$OCF_ERR_GENERIC + + rc=0 + join_to=$(rabbit_node_name $1) + running_nodes=$(get_running_nodes) + for node in $running_nodes ; do + if [ "${join_to}" = "${node}" ] ; then + rc=1 + break + fi + done + + return $rc +} + +# Update erlang cookie, if it has been specified +update_cookie() { + local cookie_file_content + if [ "${OCF_RESKEY_erlang_cookie}" != 'false' ] ; then + if [ -f "${OCF_RESKEY_erlang_cookie_file}" ]; then + # First line of cookie file without newline + cookie_file_content=$(head -n1 "${OCF_RESKEY_erlang_cookie_file}" | perl -pe chomp) + fi + # As there is a brief period of time when the file is empty + # (shell redirection has already opened and truncated file, + # and echo hasn't finished its job), we are doing this write + # only when cookie has changed. + if [ "${OCF_RESKEY_erlang_cookie}" != "${cookie_file_content}" ]; then + echo "${OCF_RESKEY_erlang_cookie}" > "${OCF_RESKEY_erlang_cookie_file}" + fi + # And this are idempotent operations, so we don't have to + # check any preconditions for running them. + chown ${OCF_RESKEY_username}:${OCF_RESKEY_groupname} "${OCF_RESKEY_erlang_cookie_file}" + chmod 600 "${OCF_RESKEY_erlang_cookie_file}" + fi + return $OCF_SUCCESS +} + +# Stop rmq beam process by pid and by rabbit node name match. Returns SUCCESS/ERROR +kill_rmq_and_remove_pid() { + local LH="${LL} kill_rmq_and_remove_pid():" + # Stop the rabbitmq-server by its pidfile, use the name matching as a fallback, + # and ignore the exit code + proc_stop "${OCF_RESKEY_pid_file}" "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}" + # Ensure the beam.smp stopped by the rabbit node name matching as well + proc_stop none "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}" + if [ $? 
-eq 0 ] ; then + return $OCF_SUCCESS + else + return $OCF_ERR_GENERIC + fi +} + +trim_var(){ + local string="$*" + echo ${string%% } +} + +action_validate() { + # todo(sv): validate some incoming parameters + OCF_RESKEY_CRM_meta_notify_post=$(trim_var $OCF_RESKEY_CRM_meta_notify_post) + OCF_RESKEY_CRM_meta_notify_pre=$(trim_var $OCF_RESKEY_CRM_meta_notify_pre) + OCF_RESKEY_CRM_meta_notify_start=$(trim_var $OCF_RESKEY_CRM_meta_notify_start) + OCF_RESKEY_CRM_meta_notify_stop=$(trim_var $OCF_RESKEY_CRM_meta_notify_stop) + OCF_RESKEY_CRM_meta_notify_start_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_start_resource) + OCF_RESKEY_CRM_meta_notify_stop_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_stop_resource) + OCF_RESKEY_CRM_meta_notify_active_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_active_resource) + OCF_RESKEY_CRM_meta_notify_inactive_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_inactive_resource) + OCF_RESKEY_CRM_meta_notify_start_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_start_uname) + OCF_RESKEY_CRM_meta_notify_stop_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_stop_uname) + OCF_RESKEY_CRM_meta_notify_active_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_active_uname) + OCF_RESKEY_CRM_meta_notify_master_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_master_resource) + OCF_RESKEY_CRM_meta_notify_master_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_master_uname) + OCF_RESKEY_CRM_meta_notify_demote_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_demote_resource) + OCF_RESKEY_CRM_meta_notify_demote_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_demote_uname) + OCF_RESKEY_CRM_meta_notify_slave_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_slave_resource) + OCF_RESKEY_CRM_meta_notify_slave_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_slave_uname) + OCF_RESKEY_CRM_meta_notify_promote_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_promote_resource) + OCF_RESKEY_CRM_meta_notify_promote_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_promote_uname) + return 
$OCF_SUCCESS +} + +update_rabbit_start_time_if_rc() { + local nowtime + local rc=$1 + if [ $rc -eq 0 ]; then + nowtime="$(now)" + ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with ${nowtime}" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}" + fi +} + +join_to_cluster() { + local node="$1" + local rmq_node + local rc=$OCF_ERR_GENERIC + local LH="${LL} join_to_cluster():" + + ocf_log info "${LH} start." + + rmq_node=$(rabbit_node_name $node) + ocf_log info "${LH} Joining to cluster by node '${rmq_node}'." + get_status rabbit + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log info "${LH} rabbitmq app will be stopped." + stop_rmq_server_app + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} Can't stop rabbitmq app by stop_app command. Stopping." + action_stop + return $OCF_ERR_GENERIC + fi + fi + ocf_log info "${LH} Execute join_cluster with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} join_cluster $rmq_node" + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} Can't join to cluster by node '${rmq_node}'. Stopping." + action_stop + return $OCF_ERR_GENERIC + fi + sleep 2 + try_to_start_rmq_app + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} Can't start RMQ app after join to cluster. Stopping." + action_stop + return $OCF_ERR_GENERIC + else + update_rabbit_start_time_if_rc 0 + ocf_log info "${LH} Joined to cluster succesfully." + fi + + ocf_log info "${LH} end." + return $rc +} + +unjoin_nodes_from_cluster() { + # node names of the nodes where the pcs resource is being stopped + local nodelist="$1" + local hostname + local nodename + local rc=$OCF_ERR_GENERIC + local rnode + # nodes in rabbit cluster db + local nodes_in_cluster + local LH="${LL} unjoin_nodes_from_cluster():" + + nodes_in_cluster=$(get_nodes) + rc=$? 
+ if [ $rc -ne 0 ] ; then + # no nodes in node list, nothing to do + return $OCF_SUCCESS + fi + + # unjoin all cluster nodes which are being stopped (i.e. recieved post-stop notify), except *this* node + # before to unjoin the nodes, make sure they were disconnected from *this* node + for hostname in $nodelist ; do + nodename=$(rabbit_node_name $hostname) + if [ "${nodename}" = "${RABBITMQ_NODENAME}" ] ; then + continue + fi + for rnode in $nodes_in_cluster ; do + if [ "${nodename}" = "${rnode}" ] ; then + # disconnect node being unjoined from this node + ocf_run ${OCF_RESKEY_ctl} eval "disconnect_node(list_to_atom(\"${nodename}\"))." 2>&1 + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log info "${LH} node '${nodename}' disconnected succesfully." + else + ocf_log info "${LH} disconnecting node '${nodename}' failed." + fi + + # unjoin node + # when the rabbit node went down, its status + # remains 'running' for a while, so few retries are required + local tries=0 + until [ $tries -eq 5 ]; do + tries=$((tries+1)) + if is_clustered_with $nodename; then + ocf_log info "${LH} the ${nodename} is alive and cannot be kicked from the cluster yet" + else + break + fi + sleep 10 + done + ocf_log info "${LH} Execute forget_cluster_node with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} forget_cluster_node ${nodename}" + rc=$? + if [ $rc -eq 0 ] ; then + ocf_log info "${LH} node '${nodename}' unjoined succesfully." + else + ocf_log warn "${LH} unjoining node '${nodename}' failed." + fi + fi + done + done + return $OCF_SUCCESS +} + +# Stop RMQ beam server process. Returns SUCCESS/ERROR +stop_server_process() { + local pid + local rc=$OCF_ERR_GENERIC + local LH="${LL} stop_server_process():" + + pid=$(cat ${OCF_RESKEY_pid_file}) + rc=$? + if [ $rc -ne 0 ] ; then + # Try to stop without known PID + ocf_log err "${LH} RMQ-server process PIDFILE was not found!" + su_rabbit_cmd "${OCF_RESKEY_ctl} stop >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" + if [ $? 
-eq 0 ] ; then + ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found." + ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam" + sleep "${OCF_RESKEY_stop_time}" + else + kill_rmq_and_remove_pid + fi + elif [ "${pid}" ] ; then + # Try to stop gracefully by known PID + ocf_log info "${LH} Execute stop with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} stop ${OCF_RESKEY_pid_file} >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" + [ $? -eq 0 ] && ocf_log info "${LH} RMQ-server process (PID=${pid}) stopped succesfully." + fi + + # Ensure there is no beam process and pidfile left + pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null + rc=$? + if [ -f ${OCF_RESKEY_pid_file} ] || [ $rc -eq 0 ] ; then + ocf_log warn "${LH} The pidfile or beam's still exist, forcing the RMQ-server cleanup" + kill_rmq_and_remove_pid + return $? + else + return $OCF_SUCCESS + fi +} + +# Stop RMQ-app. Return OCF_SUCCESS, if the app was stopped, +# otherwise return OCF_ERR_GENERIC +stop_rmq_server_app() { + local rc=$OCF_ERR_GENERIC + + # if the beam process isn't running, then rabbit app is stopped as well + get_status + rc=$? + if [ $rc -ne 0 ] ; then + return $OCF_SUCCESS + fi + + # stop the app + ocf_log info "${LH} Execute stop_app with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} stop_app >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} RMQ-server app cannot be stopped." + return $OCF_ERR_GENERIC + fi + + get_status rabbit + rc=$? + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log info "${LH} RMQ-server app stopped succesfully." + rc=$OCF_SUCCESS + else + ocf_log err "${LH} RMQ-server app cannot be stopped." 
+ rc=$OCF_ERR_GENERIC + fi + + return $rc +} + +start_beam_process() { + local command + local rc=$OCF_ERR_GENERIC + local ts_end + local pf_end + local pid + local LH="${LL} start_beam_process():" + + # remove old PID-file if it exists + if [ -f "${OCF_RESKEY_pid_file}" ] ; then + ocf_log warn "${LH} found old PID-file '${OCF_RESKEY_pid_file}'." + pid=$(cat ${OCF_RESKEY_pid_file}) + if [ "${pid}" ] && [ -d "/proc/${pid}" ] ; then + ocf_run cat /proc/${pid}/cmdline | grep -c 'bin/beam' > /dev/null 2>&1 + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log warn "${LH} found beam process with PID=${pid}, killing...'." + ocf_run kill -TERM $pid + else + ocf_log err "${LH} found unknown process with PID=${pid} from '${OCF_RESKEY_pid_file}'." + return $OCF_ERR_GENERIC + fi + fi + ocf_run rm -f $OCF_RESKEY_pid_file + fi + + [ -f /etc/default/rabbitmq-server ] && . /etc/default/rabbitmq-server + + # RabbitMQ requires high soft and hard limits for NOFILE + set_limits + + # run beam process + command="${OCF_RESKEY_binary} >> \"${OCF_RESKEY_log_dir}/startup_log\" 2>/dev/null" + RABBITMQ_NODE_ONLY=1 su rabbitmq -s /bin/sh -c "${command}"& + ts_end=$(( $(now) + ${OCF_RESKEY_start_time} )) + sleep 3 # give it some time, before attempting to start_app + # PID-file is now created later, if the application started successfully + # So assume beam.smp is started, and defer errors handling for start_app + return $OCF_SUCCESS +} + +check_plugins() { + # Check if it's safe to load plugins and if we need to do so. Logic is: + # if (EnabledPlugins > 0) and (ActivePlugins == 0) ; then it's safe to load + # If we have at least one active plugin, then it's not safe to re-load them + # because plugins:setup() would remove existing dependency plugins in plugins_expand_dir. 
+ ${OCF_RESKEY_ctl} eval '{ok, EnabledFile} = application:get_env(rabbit, enabled_plugins_file), EnabledPlugins = rabbit_plugins:read_enabled(EnabledFile), ActivePlugins = rabbit_plugins:active(), if length(EnabledPlugins)>0 -> if length(ActivePlugins)==0 -> erlang:error("need_to_load_plugins"); true -> false end; true -> false end.' + return $? +} + +load_plugins() { + check_plugins + local rc=$? + if [ $rc -eq 0 ] ; then + return 0 + else + ${OCF_RESKEY_ctl} eval 'ToBeLoaded = rabbit_plugins:setup(), ok = app_utils:load_applications(ToBeLoaded), StartupApps = app_utils:app_dependency_order(ToBeLoaded,false), app_utils:start_applications(StartupApps).' + return $? + fi +} + +list_active_plugins() { + local list + list=`${OCF_RESKEY_ctl} eval 'rabbit_plugins:active().' 2>/dev/null` + echo "${list}" +} + +try_to_start_rmq_app() { + local startup_log="${1:-${OCF_RESKEY_log_dir}/startup_log}" + local rc=$OCF_ERR_GENERIC + local LH="${LL} try_to_start_rmq_app():" + + get_status + rc=$? + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log info "${LH} RMQ-runtime (beam) not started, starting..." + start_beam_process + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_log err "${LH} Failed to start beam - returning from the function" + return $OCF_ERR_GENERIC + fi + fi + + + if [ -z "${startup_log}" ] ; then + startup_log="${OCF_RESKEY_log_dir}/startup_log" + fi + + ocf_log info "${LH} begin." + ocf_log info "${LH} Execute start_app with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} start_app >>${startup_log} 2>&1" + rc=$? + if [ $rc -eq 0 ] ; then + ocf_log info "${LH} start_app was successful." + ocf_log info "${LH} waiting for start to finish with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} wait ${OCF_RESKEY_pid_file}" + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} RMQ-server app failed to wait for start." + return $OCF_ERR_GENERIC + fi + rc=$OCF_SUCCESS + # Loading enabled modules + ocf_log info "${LH} start plugins." 
+ load_plugins + local mrc=$? + if [ $mrc -eq 0 ] ; then + local mlist + mlist=`list_active_plugins` + ocf_log info "${LH} Starting plugins: ${mlist}" + else + ocf_log info "${LH} Starting plugins: failed." + fi + else + ocf_log info "${LH} start_app failed." + rc=$OCF_ERR_GENERIC + fi + return $rc +} + +start_rmq_server_app() { + local rc=$OCF_ERR_GENERIC + local startup_log="${OCF_RESKEY_log_dir}/startup_log" + local startup_output + local LH="${LL} start_rmq_server_app():" + local a + + #We are performing initial start check. + #We are not ready to provide service. + #Clients should not have access. + + + ocf_log info "${LH} begin." + # Safe-unblock the rules, if there are any + unblock_client_access "${LH}" + # Apply the blocking rule + block_client_access + rc=$? + if [ $rc -eq $OCF_SUCCESS ]; then + ocf_log info "${LH} blocked access to RMQ port" + else + ocf_log err "${LH} cannot block access to RMQ port!" + return $OCF_ERR_GENERIC + fi + get_status + rc=$? + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log info "${LH} RMQ-runtime (beam) not started, starting..." + start_beam_process + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + unblock_client_access "${LH}" + return $OCF_ERR_GENERIC + fi + fi + + ocf_log info "${LH} RMQ-server app not started, starting..." + try_to_start_rmq_app "$startup_log" + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + # rabbitmq-server started successfuly as master of cluster + master_score $MIN_MASTER_SCORE + stop_rmq_server_app + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} RMQ-server app can't be stopped. Beam will be killed." + kill_rmq_and_remove_pid + unblock_client_access "${LH}" + return $OCF_ERR_GENERIC + fi + else + # error at start RMQ-server + ocf_log warn "${LH} RMQ-server app can't start without Mnesia cleaning." + for a in $(seq 1 10) ; do + rc=$OCF_ERR_GENERIC + reset_mnesia || break + try_to_start_rmq_app "$startup_log" + rc=$? + if [ $rc -eq $OCF_SUCCESS ]; then + stop_rmq_server_app + rc=$? 
+ if [ $rc -eq $OCF_SUCCESS ]; then + ocf_log info "${LH} RMQ-server app Mnesia cleaned successfully." + rc=$OCF_SUCCESS + master_score $MIN_MASTER_SCORE + break + else + ocf_log err "${LH} RMQ-server app can't be stopped during Mnesia cleaning. Beam will be killed." + kill_rmq_and_remove_pid + unblock_client_access "${LH}" + return $OCF_ERR_GENERIC + fi + fi + done + fi + if [ $rc -eq $OCF_ERR_GENERIC ] ; then + ocf_log err "${LH} RMQ-server can't be started while many tries. Beam will be killed." + kill_rmq_and_remove_pid + fi + ocf_log info "${LH} end." + unblock_client_access "${LH}" + return $rc +} + +# check status of rabbit beam process or a rabbit app, if rabbit arg specified +# by default, test if the kernel app is running, otherwise consider it is "not running" +get_status() { + local what="${1:-kernel}" + local rc=$OCF_NOT_RUNNING + local LH="${LL} get_status():" + local body + local beam_running + + body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 ) + rc=$? + + pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null + beam_running=$? + # report not running only if the which_applications() reported an error AND the beam is not running + if [ $rc -ne 0 ] && [ $beam_running -ne 0 ] ; then + ocf_log info "${LH} failed with code ${rc}. Command output: ${body}" + return $OCF_NOT_RUNNING + # return a generic error, if there were errors and beam is found running + elif [ $rc -ne 0 ] ; then + ocf_log info "${LH} found the beam process running but failed with code ${rc}. 
Command output: ${body}" + return $OCF_ERR_GENERIC + fi + + # try to parse the which_applications() output only if it exited w/o errors + if [ "${what}" ] && [ $rc -eq 0 ] ; then + rc=$OCF_NOT_RUNNING + echo "$body" | grep "\{${what}," > /dev/null 2>&1 && rc=$OCF_SUCCESS + + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log info "${LH} app ${what} was not found in command output: ${body}" + fi + fi + + [ $rc -ne $OCF_SUCCESS ] && rc=$OCF_NOT_RUNNING + return $rc +} + +action_status() { + local rc=$OCF_ERR_GENERIC + + get_status + rc=$? + return $rc +} + +# return 0, if given node has a master attribute in CIB, +# otherwise, return 1 +is_master() { + local result + result=`crm_attribute -N "${1}" -l reboot --name 'rabbit-master' --query 2>/dev/null |\ + awk '{print $3}' | awk -F "=" '{print $2}' | sed -e '/(null)/d'` + if [ "${result}" != 'true' ] ; then + return 1 + fi + return 0 +} + +# Verify if su_rabbit_cmd exited by timeout by checking its return code. +# If it did not, return 0. If it did AND it is +# $OCF_RESKEY_max_rabbitmqctl_timeouts'th timeout in a row, +# return 2 to signal get_monitor that it should +# exit with error. Otherwise return 1 to signal that there was a timeout, +# but it should be ignored. Timeouts for different operations are tracked +# separately. The second argument is used to distingush them. +check_timeouts() { + local op_rc=$1 + local timeouts_attr_name=$2 + local op_name=$3 + + # 75 is EX_TEMPFAIL from sysexits, and is used by rabbitmqctl to signal about + # timeout. + if [ $op_rc -ne 124 ] && [ $op_rc -ne 137 ] && [ $op_rc -ne 75 ]; then + ocf_update_private_attr $timeouts_attr_name 0 + return 0 + fi + + local count + count=$(ocf_get_private_attr $timeouts_attr_name 0) + + count=$((count+1)) + # There is a slight chance that this piece of code will be executed twice simultaneously. + # As a result, $timeouts_attr_name's value will be one less than it should be. But we don't need + # precise calculation here. 
+ ocf_update_private_attr $timeouts_attr_name $count + + if [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then + ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now." + return 1 + else + ocf_log err "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row and is not responding. The resource is failed." + return 2 + fi +} + +wait_sync() { + local wait_time=$1 + local queues + local opt_arg="" + + if [ "$OCF_RESKEY_rmq_feature_local_list_queues" = "true" ]; then + opt_arg="--local" + fi + + queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} -p ${OCF_RESKEY_default_vhost} list_queues $opt_arg name state" + + su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \ + do sleep 2; done\"" + + return $? +} + +get_monitor() { + local rc=$OCF_ERR_GENERIC + local LH="${LL} get_monitor():" + local status_master=1 + local rabbit_running + local name + local node + local node_start_time + local nowtime + local partitions_report + local node_partitions + + ocf_log info "${LH} CHECK LEVEL IS: ${OCF_CHECK_LEVEL}" + get_status + rc=$? + if [ $rc -eq $OCF_NOT_RUNNING ] ; then + ocf_log info "${LH} get_status() returns ${rc}." + ocf_log info "${LH} ensuring this slave does not get promoted." + master_score 0 + return $OCF_NOT_RUNNING + elif [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log info "${LH} get_status() returns ${rc}." + ocf_log info "${LH} also checking if we are master." + get_status rabbit + rabbit_running=$? + is_master $THIS_PCMK_NODE + status_master=$? 
+ ocf_log info "${LH} master attribute is ${status_master}" + if [ $status_master -eq 0 ] && [ $rabbit_running -eq $OCF_SUCCESS ] + then + ocf_log info "${LH} We are the running master" + rc=$OCF_RUNNING_MASTER + elif [ $status_master -eq 0 ] && [ $rabbit_running -ne $OCF_SUCCESS ] ; then + ocf_log err "${LH} We are the master and RMQ-runtime (beam) is not running. this is a failure" + exit $OCF_FAILED_MASTER + fi + fi + get_status rabbit + rabbit_running=$? + ocf_log info "${LH} checking if rabbit app is running" + + if [ $rc -eq $OCF_RUNNING_MASTER ]; then + if [ $rabbit_running -eq $OCF_SUCCESS ]; then + ocf_log info "${LH} rabbit app is running and is master of cluster" + else + ocf_log err "${LH} we are the master and rabbit app is not running. This is a failure" + exit $OCF_FAILED_MASTER + fi + else + start_time=$((180 + $(ocf_get_private_attr 'rabbit-start-phase-1-time' 0))) + restart_order_time=$((60 + $(ocf_get_private_attr 'rabbit-ordered-to-restart' 0))) + nowtime=$(now) + + # If we started more than 3 minutes ago, and + # we got order to restart less than 1 minute ago + if [ $nowtime -lt $restart_order_time ]; then + if [ $nowtime -gt $start_time ]; then + ocf_log err "${LH} failing because we have received an order to restart from the master" + stop_server_process + rc=$OCF_ERR_GENERIC + else + ocf_log warn "${LH} received an order to restart from the master, ignoring it because we have just started" + fi + fi + fi + + if [ $rc -eq $OCF_ERR_GENERIC ]; then + ocf_log err "${LH} get_status() returns generic error ${rc}" + ocf_log info "${LH} ensuring this slave does not get promoted." 
+ master_score 0 + return $OCF_ERR_GENERIC + fi + + # Recounting our master score + ocf_log info "${LH} preparing to update master score for node" + local our_start_time + local new_score + local node_start_time + local node_score + + our_start_time=$(get_node_start_time $THIS_PCMK_NODE) + + if [ $our_start_time -eq 0 ]; then + new_score=$MIN_MASTER_SCORE + else + new_score=$BEST_MASTER_SCORE + for node in $(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE) + do + node_start_time=$(get_node_start_time $node) + node_score=$(get_node_master_score $node) + + ocf_log info "${LH} comparing us (start time: $our_start_time, score: $new_score) with $node (start time: $node_start_time, score: $node_score)" + if [ $node_start_time -ne 0 ] && [ $node_score -ne 0 ] && [ $node_start_time -lt $our_start_time ]; then + new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score )) + elif [ $node_start_time -ne 0 ] && [ $node_score -ne 0 ] && [ $node_start_time -eq $our_start_time ]; then + # Do not get promoted if the other node is already master and we have the same start time + if is_master $node; then + new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score )) + fi + fi + done + fi + + if [ "$new_score" -ne "$(get_node_master_score $THIS_PCMK_NODE)" ]; then + master_score $new_score + fi + ocf_log info "${LH} our start time is $our_start_time and score is $new_score" + + # Skip all other checks if rabbit app is not running + if [ $rabbit_running -ne $OCF_SUCCESS ]; then + ocf_log info "${LH} RabbitMQ is not running, get_monitor function ready to return ${rc}" + return $rc + fi + + # rc can be SUCCESS or RUNNING_MASTER, don't touch it unless there + # is some error uncovered by node_health_check + if ! node_health_check; then + rc=$OCF_ERR_GENERIC + fi + + if [ $rc -eq $OCF_RUNNING_MASTER ] ; then + # If we are the master and healthy, perform various + # connectivity checks for other nodes in the cluster. 
+ # Order a member to restart if something fishy happens with it. + # All cross-node checks MUST happen only here. + + partitions_report="$(partitions_report)" + + for node in $(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE); do + # Restart node if we don't consider ourselves clustered with it + if ! is_clustered_with $node; then + ocf_log warn "${LH} node $node is not connected with us" + order_node_restart "$node" + continue + fi + + # Restart node if it has any unresolved partitions + node_partitions=$(grep_partitions_report $node "$partitions_report") + if [ ! -z "$node_partitions" ]; then + ocf_log warn "${LH} Node $node thinks that it is partitoned with $node_partitions" + order_node_restart "$node" + continue + fi + done + fi + + ocf_log info "${LH} get_monitor function ready to return ${rc}" + return $rc +} + +order_node_restart() { + local node=${1:?} + ocf_log warn "${LH} Ordering node '$node' to restart" + ocf_update_private_attr 'rabbit-ordered-to-restart' "$(now)" "$node" +} + +# Checks whether node is mentioned somewhere in report returned by +# partitions_report() +grep_partitions_report() { + local node="${1:?}" + local report="${2:?}" + local rabbit_node + rabbit_node=$(rabbit_node_name "$node") + echo "$report" | grep "PARTITIONED $rabbit_node:" | sed -e 's/^[^:]\+: //' +} + +# Report partitions (if any) from viewpoint of every running node in cluster. +# It is parseable/grepable version of `rabbitmqctl cluster_status`. +# +# If node sees partition, report will contain the line like: +# PARTITIONED node-name: list-of-nodes, which-node-name-considers, itself-partitioned-with +partitions_report() { + $COMMAND_TIMEOUT xargs -0 ${OCF_RESKEY_ctl} eval < ok; + ({Node, Partitions}) -> + PartitionsStr = string:join([atom_to_list(Part) || Part <- Partitions], + ", "), + io:format("PARTITIONED ~s: ~s~n", + [Node, PartitionsStr]) + end, Replies), + +ok. +EOF +} + +# Check if the rabbitmqctl control plane is alive. 
+node_health_check() { + local rc + if [ "$OCF_RESKEY_rmq_feature_health_check" = true ]; then + node_health_check_local + rc=$? + else + node_health_check_legacy + rc=$? + fi + return $rc +} + +node_health_check_local() { + local LH="${LH} node_health_check_local():" + local rc + local rc_timeouts + + # Give node_health_check some time to handle timeout by itself. + # By using internal rabbitmqctl timeouts, we allow it to print + # more useful diagnostics + local timeout=$((TIMEOUT_ARG - 2)) + su_rabbit_cmd "${OCF_RESKEY_ctl} node_health_check -t $timeout" + rc=$? + + check_timeouts $rc "rabbit_node_health_check_timeouts" "node_health_check" + rc_timeouts=$? + + if [ "$rc_timeouts" -eq 2 ]; then + master_score 0 + ocf_log info "${LH} node_health_check timed out, retry limit reached" + return $OCF_ERR_GENERIC + elif [ "$rc_timeouts" -eq 1 ]; then + ocf_log info "${LH} node_health_check timed out, going to retry" + return $OCF_SUCCESS + fi + + if [ "$rc" -ne 0 ]; then + ocf_log err "${LH} rabbitmqctl node_health_check exited with errors." + return $OCF_ERR_GENERIC + else + return $OCF_SUCCESS + fi +} + +node_health_check_legacy() { + local rc_alive + local timeout_alive + su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels > /dev/null 2>&1" + rc_alive=$? + { [ $rc_alive -eq 137 ] || [ $rc_alive -eq 124 ] ; } && ocf_log err "${LH} 'rabbitmqctl list_channels' timed out, per-node explanation: $(enhanced_list_channels)" + check_timeouts $rc_alive "rabbit_list_channels_timeouts" "list_channels" + timeout_alive=$? + + if [ $timeout_alive -eq 2 ]; then + master_score 0 + return $OCF_ERR_GENERIC + elif [ $timeout_alive -eq 0 ]; then + if [ $rc_alive -ne 0 ]; then + ocf_log err "${LH} rabbitmqctl list_channels exited with errors." + rc=$OCF_ERR_GENERIC + fi + fi + + # Check for memory alarms for this Master or Slave node. 
+ # If alert found, reset the alarm + # and restart the resource as it likely means a dead end situation + # when rabbitmq cluster is running with blocked publishing due + # to high memory watermark exceeded. + local alarms + local rc_alarms + local timeout_alarms + alarms=`su_rabbit_cmd "${OCF_RESKEY_ctl} -q eval 'rabbit_alarm:get_alarms().'" 2>/dev/null` + rc_alarms=$? + check_timeouts $rc_alarms "rabbit_get_alarms_timeouts" "get_alarms" + timeout_alarms=$? + + if [ $timeout_alarms -eq 2 ]; then + master_score 0 + return $OCF_ERR_GENERIC + + elif [ $timeout_alarms -eq 0 ]; then + if [ $rc_alarms -ne 0 ]; then + ocf_log err "${LH} rabbitmqctl get_alarms exited with errors." + rc=$OCF_ERR_GENERIC + + elif [ -n "${alarms}" ]; then + for node in ${alarms}; do + name=`echo ${node} | perl -n -e "m/memory,'(?\S+)+'/ && print \"$+{n}\n\""` + if [ "${name}" = "${RABBITMQ_NODENAME}" ] ; then + ocf_log err "${LH} Found raised memory alarm. Erasing the alarm and restarting." + su_rabbit_cmd "${OCF_RESKEY_ctl} set_vm_memory_high_watermark 10 > /dev/null 2>&1" + rc=$OCF_ERR_GENERIC + break + fi + done + fi + fi + + if ! is_cluster_status_ok ; then + rc=$OCF_ERR_GENERIC + fi + + # Check if the list of all queues is available, + # Also report some queues stats and total virtual memory. + local queues + local rc_queues + local timeout_queues + queues=`su_rabbit_cmd "${OCF_RESKEY_ctl} -q -p ${OCF_RESKEY_default_vhost} list_queues memory messages consumer_utilisation"` + rc_queues=$? + check_timeouts $rc_queues "rabbit_list_queues_timeouts" "list_queues" + timeout_queues=$? + + if [ $timeout_queues -eq 2 ]; then + master_score 0 + return $OCF_ERR_GENERIC + + elif [ $timeout_queues -eq 0 ]; then + if [ $rc_queues -ne 0 ]; then + ocf_log err "${LH} rabbitmqctl list_queues exited with errors." 
+ rc=$OCF_ERR_GENERIC + + elif [ -n "${queues}" ]; then + local q_c + q_c=`printf %b "${queues}\n" | wc -l` + local mem + mem=`printf %b "${queues}\n" | awk -v sum=0 '{sum+=$1} END {print (sum/1048576)}'` + local mes + mes=`printf %b "${queues}\n" | awk -v sum=0 '{sum+=$2} END {print sum}'` + local c_u + c_u=`printf %b "${queues}\n" | awk -v sum=0 -v cnt=${q_c} '{sum+=$3} END {print (sum+1)/(cnt+1)}'` + local status + status=`echo $(su_rabbit_cmd "${OCF_RESKEY_ctl} -q status")` + ocf_log info "${LH} RabbitMQ is running ${q_c} queues consuming ${mem}m of ${TOTALVMEM}m total, with ${mes} queued messages, average consumer utilization ${c_u}" + ocf_log info "${LH} RabbitMQ status: ${status}" + fi + fi + + return $rc +} + +ocf_get_private_attr() { + local attr_name="${1:?}" + local attr_default_value="${2:?}" + local nodename="${3:-$THIS_PCMK_NODE}" + local count + count=$(attrd_updater -p --name "$attr_name" --node "$nodename" --query) + if [ $? -ne 0 ]; then + echo $attr_default_value + else + echo "$count" | awk -vdef_val="$attr_default_value" '{ gsub(/"/, "", $3); split($3, vals, "="); if (vals[2] != "") print vals[2]; else print def_val }' + fi +} + +ocf_update_private_attr() { + local attr_name="${1:?}" + local attr_value="${2:?}" + local nodename="${3:-$THIS_PCMK_NODE}" + ocf_run attrd_updater -p --name "$attr_name" --node "$nodename" --update "$attr_value" +} + +rabbitmqctl_with_timeout_check() { + local command="${1:?}" + local timeout_attr_name="${2:?}" + + su_rabbit_cmd "${OCF_RESKEY_ctl} $command" + local rc=$? + + check_timeouts $rc $timeout_attr_name "$command" + local has_timed_out=$? 
+ + case "$has_timed_out" in + 0) + return $rc;; + 1) + return 0;; + 2) + return 1;; + esac +} + +is_cluster_status_ok() { + local LH="${LH}: is_cluster_status_ok:" + rabbitmqctl_with_timeout_check cluster_status rabbit_cluster_status_timeouts > /dev/null 2>&1 +} + +action_monitor() { + local rc=$OCF_ERR_GENERIC + local LH="${LL} monitor:" + ocf_log debug "${LH} action start." + if [ "${OCF_RESKEY_debug}" = 'true' ] ; then + d=`date '+%Y%m%d %H:%M:%S'` + echo $d >> /tmp/rmq-monitor.log + env >> /tmp/rmq-monitor.log + echo "$d [monitor] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log + fi + get_monitor + rc=$? + ocf_log debug "${LH} role: ${OCF_RESKEY_CRM_meta_role}" + ocf_log debug "${LH} result: $rc" + ocf_log debug "${LH} action end." + return $rc +} + + +action_start() { + local rc=$OCF_ERR_GENERIC + local LH="${LL} start:" + local nowtime + + if [ "${OCF_RESKEY_debug}" = 'true' ] ; then + d=`date '+%Y%m%d %H:%M:%S'` + echo $d >> /tmp/rmq-start.log + env >> /tmp/rmq-start.log + echo "$d [start] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log + fi + + ocf_log info "${LH} action begin." + + get_status + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log warn "${LH} RMQ-runtime (beam) already started." 
+ return $OCF_SUCCESS + fi + + local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts rabbit_node_health_check_timeouts" + local attr_name_to_reset + for attr_name_to_reset in $attrs_to_zero; do + ocf_update_private_attr $attr_name_to_reset 0 + done + + nowtime=$(now) + ocf_log info "${LH} Setting phase 1 one start time to $nowtime" + ocf_update_private_attr 'rabbit-start-phase-1-time' "$nowtime" + ocf_log info "${LH} Deleting start time attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete + ocf_log info "${LH} Deleting master attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete + + ocf_log info "${LH} RMQ going to start." + start_rmq_server_app + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log info "${LH} RMQ prepared for start succesfully." + fi + + ocf_log info "${LH} action end." + return $rc +} + + +action_stop() { + local rc=$OCF_ERR_GENERIC + local LH="${LL} stop:" + + if [ "${OCF_RESKEY_debug}" = 'true' ] ; then + d=$(date '+%Y%m%d %H:%M:%S') + echo $d >> /tmp/rmq-stop.log + env >> /tmp/rmq-stop.log + echo "$d [stop] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log + fi + + ocf_log info "${LH} action begin." + + ocf_log info "${LH} Deleting master attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete + master_score 0 + ocf_log info "${LH} Deleting start time attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete + + # Wait for synced state first + ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync" + wait_sync $((OCF_RESKEY_stop_time/2)) + + ocf_log info "${LH} RMQ-runtime (beam) going to down." 
+ stop_server_process + + if [ $? -ne $OCF_SUCCESS ] ; then + ocf_log err "RMQ-runtime (beam) couldn't be stopped and will likely became unmanaged. Take care of it manually!" + ocf_log info "${LH} action end." + exit $OCF_ERR_GENERIC + fi + + ocf_log info "${LH} RMQ-runtime (beam) not running." + ocf_log info "${LH} action end." + return $OCF_SUCCESS +} + +####################################################################### +# Enhanced list_channels: +# - nodes are processed in parallel +# - report contains information about which nodes timed out +# +# 'list_channels' is used as a healh-check for current node, but it +# actually checks overall health of all node in cluster. And there were +# some bugs where only one (non-local) channel became stuck, but OCF +# script was wrongfully killing local node. +# +# Hopefully all such bugs are fixed, but if not - it will allow to +# detect such conditions. +# +# Somewhat strange implementation is due to the following reasons: +# - ability to support older versions of RabbitMQ which have reached +# end-of-life with single version of the script +# - zero dependencies - for older versions this functionality could be +# implemented as a plugin, but it'll require this plugin installation +enhanced_list_channels() { + # One second less than timeout of su_rabbit_cmd + local timeout=$((${TIMEOUT_ARG:-5} - 1)) + + su_rabbit_cmd "xargs -0 ${OCF_RESKEY_ctl} eval" < + {Mega, Secs, Micro} = os:timestamp(), + Mili = Micro div 1000, + Mili + 1000 * (Secs + 1000000 * Mega) + end, + +%% We shouldn't continue execution past this time +ShouldEndAt = Now() + SecondsToCompletion * 1000, + +%% How many milliseconds we still have +Timeout = fun() -> + case ShouldEndAt - Now() of + Past when Past =< 0 -> + 0; + Timeout -> + Timeout + end + end, + +%% Lambda combinator - for defining anonymous recursive functions +Y = fun(F) -> + (fun (X) -> F(fun(Y) -> (X(X))(Y) end) end)( + fun (X) -> F(fun(Y) -> (X(X))(Y) end) end) + end, + +Parent = self(), 

ListChannels = Y(fun(Rec) ->
    fun (({Node, [], OkChannelsCount})) ->
            Parent ! {Node, ok, OkChannelsCount};
        ({Node, [Chan|Rest], OkChannelsCount}) ->
            case catch rpc:call(Node, rabbit_channel, info, [Chan], Timeout()) of
                Infos when is_list(Infos) ->
                    Rec({Node, Rest, OkChannelsCount + 1});
                {badrpc, {'EXIT', {noproc, _}}} ->
                    %% Channel became dead before we could request it's status, don't care
                    Rec({Node, Rest, OkChannelsCount});
                Err ->
                    Parent ! {Node, Err, OkChannelsCount}
            end
    end
end),

SingleNodeListing = fun(Node) ->
    case catch rpc:call(Node, pg_local, get_members, [rabbit_channels], Timeout()) of
        LocalChannels when is_list(LocalChannels) ->
            ListChannels({Node, LocalChannels, 0});
        Err ->
            Parent ! {Node, Err, 0}
    end
end,

AllNodes = rabbit_mnesia:cluster_nodes(running),
[ spawn(fun() -> SingleNodeListing(Node) end) || Node <- AllNodes ],

WaitForNodes = Y(fun(Rec) ->
    fun ({[], Acc}) ->
            Acc;
        ({RemainingNodes, Acc}) ->
            receive
                {Node, _Status, _ChannelCount} = Smth ->
                    RemainingNodes1 = lists:delete(Node, RemainingNodes),
                    Rec({RemainingNodes1, [Smth|Acc]})
            after Timeout() + 100 ->
                Acc
            end
    end
end),

Result = WaitForNodes({AllNodes, []}),

ExpandedResult = [ case lists:keysearch(Node, 1, Result) of
                       {value, NodeResult} ->
                           NodeResult;
                       false ->
                           {Node, no_data_collected, 0}
                   end || Node <- AllNodes ],

ExpandedResult.
EOF
}

#######################################################################
# Join this node to the cluster through the node given as $1.
#
# Arguments:
#   $1 - the node to join through
# Returns:
#   the status reported by my_host for $1, when no join is attempted,
#     i.e. when my_host succeeds for the target (we would be joining to
#     ourselves) or when the target is empty;
#   $OCF_SUCCESS, when join_to_cluster succeeded;
#   $OCF_ERR_GENERIC, when join_to_cluster failed (the mnesia is reset
#     in that case).
jjj_join () {
    local join_to="$1"
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} jjj_join:"

    # Deliberately left unquoted, exactly as in the original call.
    my_host ${join_to}
    rc=$?
    ocf_log debug "${LH} node='${join_to}' rc='${rc}'"

    # Check whether we are joining to ourselves
    # or master host is not given
    if [ $rc -ne 0 ] && [ "${join_to}" ] ; then
        ocf_log info "${LH} Joining to cluster by node '${join_to}'"
        join_to_cluster "${join_to}"
        rc=$?
        if [ $rc -ne $OCF_SUCCESS ] ; then
            ocf_log err "${LH} Failed to join the cluster. The mnesia will be reset."
            reset_mnesia
            rc=$OCF_ERR_GENERIC
        fi
    fi
    return $rc
}

# Handle the 'notify' action.
#
# Only 'post' notifications are processed; anything else returns
# $OCF_SUCCESS right away. The operation is taken from
# OCF_RESKEY_CRM_meta_notify_operation:
#   promote - join the promoted node, or start the app when we are already
#             clustered with it;
#   start   - on the node being started: join the master (or start the app
#             when already clustered) and import the definitions dump,
#             if that file exists and is not empty;
#   stop    - on the other nodes: wait for sync and unjoin the stopped
#             nodes; on the node being stopped: reset the master score.
# Returns:
#   $OCF_SUCCESS or $OCF_ERR_GENERIC (see the branches below).
action_notify() {
    local rc_join=$OCF_SUCCESS
    local rc=$OCF_ERR_GENERIC
    local rc2=$OCF_ERR_GENERIC
    local LH="${LL} notify:"
    # Timestamp for the debug dump; local, so it does not leak to the caller.
    local d

    if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
        d=`date '+%Y%m%d %H:%M:%S'`
        echo "$d" >> /tmp/rmq-notify.log
        env >> /tmp/rmq-notify.log
        echo "$d [notify] ${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation} promote='${OCF_RESKEY_CRM_meta_notify_promote_uname}' demote='${OCF_RESKEY_CRM_meta_notify_demote_uname}' master='${OCF_RESKEY_CRM_meta_notify_master_uname}' slave='${OCF_RESKEY_CRM_meta_notify_slave_uname}' start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    if [ "${OCF_RESKEY_CRM_meta_notify_type}" = 'post' ] ; then
        # POST- anything notify section
        case "$OCF_RESKEY_CRM_meta_notify_operation" in
            promote)
                ocf_log info "${LH} post-promote begin."

                rc=$OCF_SUCCESS

                # Do nothing, if the list of nodes being promoted reported empty.
                # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
                if [ -z "${OCF_RESKEY_CRM_meta_notify_promote_uname}" ] ; then
                    ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do."

                elif my_host "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
                    ocf_log info "${LH} ignoring post-promote of self"

                elif is_clustered_with "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
                    if get_status rabbit; then
                        ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do."
                    else
                        ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. We only need to start the app."

                        # NOTE(review): rc2 is only handed to
                        # update_rabbit_start_time_if_rc; it does not change
                        # the status returned by this branch - confirm that
                        # this is intended.
                        try_to_start_rmq_app
                        rc2=$?
                        update_rabbit_start_time_if_rc $rc2
                    fi

                else
                    # Note, this should fail when the mnesia is inconsistent.
                    # For example, when the "old" master processing the promotion of the new one.
                    # Later this ex-master node will rejoin the cluster at post-start.
                    jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}"
                    rc=$?
                    if [ $rc -eq $OCF_ERR_GENERIC ] ; then
                        ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted."
                    fi
                fi

                ocf_log info "${LH} post-promote end."
                return $rc
                ;;
            start)
                ocf_log info "${LH} post-start begin."
                # Do nothing, if the list of nodes being started or running reported empty
                # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
                if [ -z "${OCF_RESKEY_CRM_meta_notify_start_uname}" ] && [ -z "${OCF_RESKEY_CRM_meta_notify_active_uname}" ] ; then
                    ocf_log warn "${LH} I'm a last man standing and I must survive!"
                    ocf_log info "${LH} post-start end."
                    return $OCF_SUCCESS
                fi
                # check did this event from this host
                my_host "${OCF_RESKEY_CRM_meta_notify_start_uname}"
                rc=$?
                # Do nothing, if there is no master reported
                # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
                if [ -z "${OCF_RESKEY_CRM_meta_notify_master_uname}" ] ; then
                    ocf_log warn "${LH} there are no nodes to join to reported on post-start. Nothing to do."
                    ocf_log info "${LH} post-start end."
                    return $OCF_SUCCESS
                fi
                if [ $rc -eq $OCF_SUCCESS ] ; then
                    # Now we need to:
                    # a. join to the cluster if we are not joined yet
                    # b. start the RabbitMQ application, which is always
                    #    stopped after start action finishes
                    # The argument is deliberately left unquoted, exactly as
                    # in the original call.
                    check_need_join_to ${OCF_RESKEY_CRM_meta_notify_master_uname}
                    rc_join=$?
                    if [ $rc_join -eq $OCF_SUCCESS ]; then
                        ocf_log warn "${LH} Going to join node ${OCF_RESKEY_CRM_meta_notify_master_uname}"
                        jjj_join "${OCF_RESKEY_CRM_meta_notify_master_uname}"
                        rc2=$?
                    else
                        ocf_log warn "${LH} We are already clustered with node ${OCF_RESKEY_CRM_meta_notify_master_uname}"

                        try_to_start_rmq_app
                        rc2=$?
                        update_rabbit_start_time_if_rc $rc2
                    fi
                    if [ -s "${OCF_RESKEY_definitions_dump_file}" ] ; then
                        ocf_log info "File ${OCF_RESKEY_definitions_dump_file} exists"
                        # Credentials, URL and file name are quoted, so that
                        # whitespace or glob characters in them cannot split
                        # or expand the curl arguments.
                        ocf_run curl --silent --show-error --request POST \
                            --user "${OCF_RESKEY_admin_user}:${OCF_RESKEY_admin_password}" \
                            "${OCF_RESKEY_host_ip}:15672/api/definitions" \
                            --header "Content-Type:application/json" \
                            --data "@${OCF_RESKEY_definitions_dump_file}"
                        # The import status is only logged; it does not
                        # affect the status returned by post-start.
                        rc=$?
                        if [ $rc -eq $OCF_SUCCESS ] ; then
                            ocf_log info "RMQ definitions have imported succesfully."
                        else
                            ocf_log err "RMQ definitions have not imported."
                        fi
                    fi
                    if [ $rc2 -eq $OCF_ERR_GENERIC ] ; then
                        ocf_log warn "${LH} Failed to join the cluster on post-start. The resource will be restarted."
                        ocf_log info "${LH} post-start end."
                        return $OCF_ERR_GENERIC
                    fi
                fi
                ocf_log info "${LH} post-start end."
                ;;
            stop)
                # if rabbitmq-server stops on any another node, we should remove it from cluster (as ordinary operation)
                ocf_log info "${LH} post-stop begin."
                # Report not running, if there are no nodes being stopped reported
                if [ -z "${OCF_RESKEY_CRM_meta_notify_stop_uname}" ] ; then
                    ocf_log warn "${LH} there are no nodes being stopped reported on post-stop. The resource will be restarted."
                    ocf_log info "${LH} post-stop end."
                    return $OCF_ERR_GENERIC
                fi
                my_host "${OCF_RESKEY_CRM_meta_notify_stop_uname}"
                rc=$?
                if [ $rc -ne $OCF_SUCCESS ] ; then
                    # Wait for synced state first
                    ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
                    wait_sync $((OCF_RESKEY_stop_time/2))
                    # On other nodes processing the post-stop, make sure the stopped node will be forgotten
                    unjoin_nodes_from_cluster "${OCF_RESKEY_CRM_meta_notify_stop_uname}"
                else
                    # On the nodes being stopped, reset the master score
                    ocf_log info "${LH} resetting the master score."
                    master_score 0
                fi
                # always returns OCF_SUCCESS
                ocf_log info "${LH} post-stop end."
                ;;
            *)  ;;
        esac
    fi

    return $OCF_SUCCESS
}


# Handle the 'promote' action.
#
# Dispatches on the status reported by get_monitor:
#   $OCF_SUCCESS        - set the 'rabbit-master' node attribute and, if
#                         'get_status rabbit' fails, start the server and the
#                         app, source the policy file (if it exists) and
#                         re-check that get_monitor reports a running master;
#   $OCF_RUNNING_MASTER - nothing to do;
#   $OCF_FAILED_MASTER  - exit with $OCF_FAILED_MASTER;
#   $OCF_NOT_RUNNING    - return $OCF_NOT_RUNNING;
#   anything else       - exit with that status.
# Returns:
#   $OCF_SUCCESS, $OCF_NOT_RUNNING or $OCF_ERR_GENERIC; on a failed master the
#   script exits with $OCF_FAILED_MASTER instead of returning.
action_promote() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} promote:"
    # Timestamp for the debug dump; local, so it does not leak to the caller.
    local d

    if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
        d=$(date '+%Y%m%d %H:%M:%S')
        echo "$d" >> /tmp/rmq-promote.log
        env >> /tmp/rmq-promote.log
        echo "$d [promote] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    ocf_log info "${LH} action begin."

    get_monitor
    rc=$?
    ocf_log info "${LH} get_monitor returns ${rc}"
    case "$rc" in
        "$OCF_SUCCESS")
            # Running as slave. Normal, expected behavior.
            ocf_log info "${LH} Resource is currently running as Slave"
            # rabbitmqctl start_app if need
            get_status rabbit
            rc=$?
            ocf_log info "${LH} Updating cluster master attribute"
            ocf_run crm_attribute -N "$THIS_PCMK_NODE" -l reboot --name 'rabbit-master' --update 'true'
            if [ $rc -ne $OCF_SUCCESS ] ; then
                ocf_log info "${LH} RMQ app is not started. Starting..."
                start_rmq_server_app
                rc=$?
                if [ $rc -eq 0 ] ; then
                    try_to_start_rmq_app
                    rc=$?
                    if [ $rc -ne 0 ] ; then
                        ocf_log err "${LH} Can't start RMQ app. Master resource is failed."
                        ocf_log info "${LH} action end."
                        exit $OCF_FAILED_MASTER
                    fi

                    [ -f "${OCF_RESKEY_policy_file}" ] && . "${OCF_RESKEY_policy_file}"

                    update_rabbit_start_time_if_rc $rc

                    ocf_log info "${LH} Checking master status"
                    get_monitor
                    rc=$?
                    ocf_log info "${LH} Master status is $rc"
                    # Return codes are integers, so compare them numerically.
                    if [ $rc -eq $OCF_RUNNING_MASTER ] ; then
                        rc=$OCF_SUCCESS
                    else
                        ocf_log err "${LH} Master resource is failed."
                        ocf_log info "${LH} action end."
                        exit $OCF_FAILED_MASTER
                    fi
                else
                    ocf_log err "${LH} Can't start RMQ-runtime."
                    rc=$OCF_ERR_GENERIC
                fi
            fi
            return $rc
            ;;
        "$OCF_RUNNING_MASTER")
            # Already a master. Unexpected, but not a problem.
            ocf_log warn "${LH} Resource is already running as Master"
            rc=$OCF_SUCCESS
            ;;

        "$OCF_FAILED_MASTER")
            # Master failed.
            ocf_log err "${LH} Master resource is failed and not running"
            ocf_log info "${LH} action end."
            exit $OCF_FAILED_MASTER
            ;;

        "$OCF_NOT_RUNNING")
            # Currently not running.
            ocf_log err "${LH} Resource is currently not running"
            rc=$OCF_NOT_RUNNING
            ;;
        *)
            # Failed resource. Let the cluster manager recover.
            ocf_log err "${LH} Unexpected error, cannot promote"
            ocf_log info "${LH} action end."
            exit $rc
            ;;
    esac

    # transform slave RMQ-server to master

    ocf_log info "${LH} action end."
    return $rc
}


# Handle the 'demote' action: delete the 'rabbit-master' attribute of this
# node. The status of crm_attribute is not checked.
# Returns:
#   $OCF_SUCCESS, always.
action_demote() {
    local LH="${LL} demote:"
    ocf_log info "${LH} action begin."
    ocf_run crm_attribute -N "$THIS_PCMK_NODE" -l reboot --name 'rabbit-master' --delete
    ocf_log info "${LH} action end."
    return $OCF_SUCCESS
}
#######################################################################

rmq_setup_env

case "$1" in
  meta-data)    meta_data
                exit $OCF_SUCCESS;;
  usage|help)   usage
                exit $OCF_SUCCESS;;
esac

# Anything except meta-data and help must pass validation
action_validate || exit $?

# What kind of method was invoked?
case "$1" in
  start)        action_start;;
  stop)         action_stop;;
  status)       action_status;;
  monitor)      action_monitor;;
  validate)     action_validate;;
  promote)      action_promote;;
  demote)       action_demote;;
  notify)       action_notify;;
  validate-all) action_validate;;
  # An unknown action must be reported as unimplemented instead of
  # finishing with the status of usage.
  *)            usage
                exit $OCF_ERR_UNIMPLEMENTED;;
esac
###