diff --git a/heartbeat/ovsmonitor b/heartbeat/ovsmonitor new file mode 100755 index 000000000..000854bcd --- /dev/null +++ b/heartbeat/ovsmonitor @@ -0,0 +1,450 @@ +#!/bin/sh +# +# OCF Resource Agent compliant script. +# Monitor the vitality of a local OpenVSwitch bond. +# +# Based on the work by Alexander Krauth. +# +# Transferred from ethmonitor into ovsmonitor by Mathieu Grzybek. +# +# Copyright (c) 2017 Robert Euhus, Alexander Krauth, Lars Marowsky-Bré +# Mathieu Grzybek +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# OCF parameters are as below +# +# OCF_RESKEY_bond +# OCF_RESKEY_bridge +# OCF_RESKEY_multiplier +# OCF_RESKEY_name +# OCF_RESKEY_repeat_count +# OCF_RESKEY_repeat_interval +# OCF_RESKEY_pktcnt_timeout +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
# NOTE(review): the line above is the operand of the "." (source) command that
# ends the preceding physical line of this copy; the two belong together.

#######################################################################

#
# Print the agent description and the per-parameter help text, then exit
# with $OCF_SUCCESS.
#
# NOTE(review): in the copy under review the here-document operator is
# truncated to "cat <" and the help text carries no markup at all (the empty
# lines below are where markup-only lines used to be). The operator is
# restored as <<END to match the END terminator that is still present. The
# missing markup is NOT reconstructed here; restore it from the original
# patch before relying on this output as meta-data.
#
# "$id" and the trailing backslash of the crmsh example are escaped so the
# here-document prints them literally instead of expanding them.
#
meta_data() {
  cat <<END


0.1


Monitor the vitality of a local ovs bond.

You may set up this RA as a clone resource to monitor the network bonds on different nodes, with the same bond name.
This is not related to the IP address or the network on which a bond is configured.
You may use this RA to move resources away from a node, which has a faulty bond or prevent moving resources to such a node.
This gives you independend control of the resources, without involving cluster intercommunication. But it requires your nodes to have more than one network bond.

The resource configuration requires a monitor operation, because the monitor does the main part of the work.
In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value.
The name of the attribute value is configured in the 'name' option of this RA.

Example constraint configuration using crmsh
location loc_connected_node my_resource_grp \\
        rule \$id="rule_loc_connected_node" -INF: ovsmonitor-bond-public eq 0

Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where the bond-public bond is available.
pcs constraint location my_resource rule score=-INFINITY ovsmonitor-bond-public ne 1

The ovsmonitor works in 3 different modes to test the bond vitality.
1. call ovs-appctl to see if at least one of the bonding's link status is up (if link is down -> error)
2. call ovs-ofctl and watch the RX counter (if packages come around in a certain time -> success)
3. return error

Monitors ovs bonding bonds




The name of the network bond which should be monitored (e.g. bond-public).

Bond bond name





The name of the ovs bridge that contains the bond.

ovs bridge





The name of the CIB attribute to set. This is the name to be used in the constraints. Defaults to "ovsmonitor-'bond_name'".

Attribute name





Multiplier for the value of the CIB attriobute specified in parameter name.

Multiplier for result variable





Specify how often the bond will be monitored, before the status is set to failed. You need to set the timeout of the monitoring operation to at least repeat_count * repeat_interval

Monitor repeat count





Specify how long to wait in seconds between the repeat_counts.

Monitor repeat interval in seconds





Timeout for the RX packet counter. Stop listening for packet counter changes after the given number of seconds.

packet counter timeout





Only report success based on link status. Do not perform RX counter related connectivity tests.

link status check only













END

  exit $OCF_SUCCESS
}

#
# Return true, if the bond exists
#
# "Exists" means that `ovs-appctl bond/show <bond>` succeeds; its output is
# discarded.
#
is_bond() {
  ovs-appctl bond/show "$OCF_RESKEY_bond" >/dev/null 2>&1
}

#
# Return true, if the bridge exists
#
# Uses `ovs-vsctl br-exists`, which matches the bridge name exactly. The
# previous text search through `ovs-vsctl show` also accepted any bridge whose
# name merely contained the configured value.
#
is_bridge() {
  ovs-vsctl br-exists "$OCF_RESKEY_bridge" >/dev/null 2>&1
}


#
# Validate the OCF_RESKEY_* parameters and derive the globals used by the
# other actions.
#
# Globals written:
#   BOND, BRIDGE                 copies of the bond / bridge parameters
#   ATTRNAME                     OCF_RESKEY_name, default "ovsmonitor-$BOND"
#   REP_COUNT                    OCF_RESKEY_repeat_count, default 5, must be >= 1
#   REP_INTERVAL_S               OCF_RESKEY_repeat_interval, default 10
#   OCF_RESKEY_multiplier        defaulted to 1
#   OCF_RESKEY_pktcnt_timeout    defaulted to 5
#
# Exits with $OCF_ERR_CONFIGURED (after ocf_exit_reason) when a mandatory
# parameter is empty, the bond or bridge does not exist, or a numeric
# parameter is not a decimal number. Returns $OCF_SUCCESS otherwise.
#
if_init() {
  if [ -z "$OCF_RESKEY_bond" ]; then
    ocf_exit_reason "Bond name (the bond parameter) is mandatory"
    exit $OCF_ERR_CONFIGURED
  fi

  if [ -z "$OCF_RESKEY_bridge" ]; then
    ocf_exit_reason "Bridge name (the bridge parameter) is mandatory"
    exit $OCF_ERR_CONFIGURED
  fi

  BOND="$OCF_RESKEY_bond"
  BRIDGE="$OCF_RESKEY_bridge"

  # Same order as before: the bond is checked first, then the bridge.
  if ! is_bond; then
    ocf_exit_reason "Bond $OCF_RESKEY_bond does not exist"
    exit $OCF_ERR_CONFIGURED
  fi
  if ! is_bridge; then
    # Report the bridge under its own name (the bond name was printed here).
    ocf_exit_reason "Bridge $OCF_RESKEY_bridge does not exist"
    exit $OCF_ERR_CONFIGURED
  fi

  : "${OCF_RESKEY_multiplier:=1}"
  if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then
    ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]"
    exit $OCF_ERR_CONFIGURED
  fi

  ATTRNAME=${OCF_RESKEY_name:-"ovsmonitor-$BOND"}

  REP_COUNT=${OCF_RESKEY_repeat_count:-5}
  # Two separate tests joined by ||. The former "-o [ ... ]" was handed to
  # ocf_is_decimal as extra arguments, so the lower bound was never checked.
  # The numeric comparison only runs once the value is known to be decimal.
  if ! ocf_is_decimal "$REP_COUNT" || [ "$REP_COUNT" -lt 1 ]; then
    ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]"
    exit $OCF_ERR_CONFIGURED
  fi

  REP_INTERVAL_S=${OCF_RESKEY_repeat_interval:-10}
  if ! ocf_is_decimal "$REP_INTERVAL_S"; then
    ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]"
    exit $OCF_ERR_CONFIGURED
  fi

  : "${OCF_RESKEY_pktcnt_timeout:=5}"
  if ! ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then
    ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]"
    exit $OCF_ERR_CONFIGURED
  fi

  return $OCF_SUCCESS
}

# get the link status on $BOND
#
# Prints the number of "slave" lines in the `ovs-appctl bond/show` output
# whose text after the first colon contains "enabled" (0 when there is none).
# NOTE(review): this depends on the exact wording of the bond/show output;
# confirm it against the Open vSwitch versions that must be supported.
get_link_status () {
  # legacy ethmonitor implementation, kept for reference:
  #$IP2UTIL -o link show up dev "$BOND" | grep -v 'NO-CARRIER' | grep -c "$BOND"
  ovs-appctl bond/show "$BOND" | awk -F: '/^slave/ {print $2}' | grep -c enabled
}

# returns the number of received rx packets on $BOND
#
# For every "slave" line of `ovs-appctl bond/show`, the second word (colon
# removed) is passed as port to `ovs-ofctl dump-ports $BRIDGE <port>`. The
# number following "pkts=" on each output line containing "rx" is added up and
# the total is printed.
get_rx_packets () {
  ocf_log debug "bond $BOND - bridge $BRIDGE"
  # legacy ethmonitor implementation, kept for reference:
  #$IP2UTIL -o -s link show dev "$BOND" \
  #	| sed 's/.* RX: [^0-9]*[0-9]* *\([0-9]*\) .*/\1/'
  local ovs_port

  # Word splitting of the command substitution is intended: one port per word.
  for ovs_port in $(ovs-appctl bond/show "$BOND" | awk '/^slave/ {gsub(":","");print $2}'); do
    ovs-ofctl dump-ports "$BRIDGE" "$ovs_port"
  done \
    | awk -F, 'BEGIN{total=0} /rx/ {gsub(".*pkts=","");total=total+int($1)} END{print total}'
}

# watch for packet counter changes for max.
OCF_RESKEY_pktcnt_timeout seconds +# returns immedeately with return code 0 if any packets were received +# otherwise 1 is returned +watch_pkt_counter () { + local RX_PACKETS_NEW + local RX_PACKETS_OLD + RX_PACKETS_OLD="`get_rx_packets`" + for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do + sleep 0.1 + RX_PACKETS_NEW="`get_rx_packets`" + ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW" + if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then + ocf_log debug "we received some packets." + return 0 + fi + done + return 1 +} + +# +# Check the bond depending on the level given as parameter: $OCF_RESKEY_check_level +# +# 10: watch for packet counter changes +# +# +# 30: watch for packet counter changes in promiscios mode +# +# If unsuccessfull in levels 18 and above, +# the tests for higher check levels are run. +# +if_check () { + # always check link status first + link_status="`get_link_status`" + ocf_log debug "link_status: $link_status (up > 0, down = 0)" + + if [ $link_status -eq 0 ]; then + ocf_log notice "link_status: DOWN" + return $OCF_NOT_RUNNING + fi + + # if using link_status_only, skip RX count related test + if ocf_is_true "$OCF_RESKEY_link_status_only"; then + return $OCF_SUCCESS + fi + + # watch for packet counter changes + ocf_log debug "watch for packet counter changes" + watch_pkt_counter + if [ $? -eq 0 ]; then + return $OCF_SUCCESS + else + ocf_log debug "No packets received during packet watch timeout" + fi + + # watch for packet counter changes in promiscios mode +# ocf_log debug "watch for packet counter changes in promiscios mode" + # be sure switch off promiscios mode in any case + # TODO: check first, wether promisc is already on and leave it untouched. 
+# trap "$IP2UTIL link set dev $BOND promisc off; exit" INT TERM EXIT +# $IP2UTIL link set dev $BOND promisc on +# watch_pkt_counter && return $OCF_SUCCESS +# $IP2UTIL link set dev $BOND promisc off +# trap - INT TERM EXIT + + # looks like it's not working (for whatever reason) + return $OCF_NOT_RUNNING +} + +####################################################################### + +if_usage() { + cat < /dev/null` + sleep $sleep_time 2> /dev/null + runs=$(($runs + 1)) + fi + + if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then + ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error" + fi + done + + ocf_log debug "Monitoring return code: $mon_rc" + if [ $mon_rc -eq $OCF_SUCCESS ]; then + set_cib_value 1 + attr_rc=$? + else + ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed." + set_cib_value 0 + attr_rc=$? + fi + + ## The resource should not fail, if the bond is down. It should fail, if the update of the CIB variable has errors. + ## To react on the bond failure you must use constraints based on the CIB variable value, not on the resource itself. + exit $attr_rc +} + +if_stop() +{ + attrd_updater -D -n $ATTRNAME + ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop +} + +if_start() +{ + local rc + ha_pseudo_resource $OCF_RESOURCE_INSTANCE start + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Failure to create ovsmonitor state file" + return $rc + fi + + # perform the first monitor during the start operation + if_monitor + return $? +} + + +if_validate() { + check_binary ovs-vsctl + check_binary ovs-appctl + check_binary ovs-ofctl + if_init +} + +case $__OCF_ACTION in +meta-data) meta_data + ;; +usage|help) if_usage + exit $OCF_SUCCESS + ;; +esac + +if_validate + +case $__OCF_ACTION in +start) if_start + exit $? + ;; +stop) if_stop + exit $? + ;; +monitor|status) if_monitor + exit $? + ;; +validate-all) exit $? + ;; +*) if_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac