diff --git a/heartbeat/ovsmonitor b/heartbeat/ovsmonitor index 120977462..8a80a8c92 100755 --- a/heartbeat/ovsmonitor +++ b/heartbeat/ovsmonitor @@ -1,450 +1,468 @@ #!/bin/sh # # OCF Resource Agent compliant script. # Monitor the vitality of a local OpenVSwitch bond. # # Based on the work by Alexander Krauth. # # Transfered from ethmonitor into ovsmonitor by Mathieu Grzybek. # # Copyright (c) 2017 Robert Euhus, Alexander Krauth, Lars Marowsky-Bré # Mathieu Grzybek # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # OCF parameters are as below # # OCF_RESKEY_bond # OCF_RESKEY_bridge # OCF_RESKEY_multiplicator # OCF_RESKEY_name # OCF_RESKEY_repeat_count # OCF_RESKEY_repeat_interval # OCF_RESKEY_pktcnt_timeout # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +# Parameter defaults + +OCF_RESKEY_bond_default="" +OCF_RESKEY_bridge_default="" +OCF_RESKEY_name_default="" +OCF_RESKEY_multiplier_default="1" +OCF_RESKEY_repeat_count_default="5" +OCF_RESKEY_repeat_interval_default="10" +OCF_RESKEY_pktcnt_timeout_default="5" +OCF_RESKEY_link_status_only_default="false" + +: ${OCF_RESKEY_bond=${OCF_RESKEY_bond_default}} +: ${OCF_RESKEY_bridge=${OCF_RESKEY_bridge_default}} +: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}} +: ${OCF_RESKEY_multiplier=${OCF_RESKEY_multiplier_default}} +: ${OCF_RESKEY_repeat_count=${OCF_RESKEY_repeat_count_default}} +: ${OCF_RESKEY_repeat_interval=${OCF_RESKEY_repeat_interval_default}} +: ${OCF_RESKEY_pktcnt_timeout=${OCF_RESKEY_pktcnt_timeout_default}} +: ${OCF_RESKEY_link_status_only=${OCF_RESKEY_link_status_only_default}} + ####################################################################### meta_data() { cat < 0.1 Monitor the vitality of a local ovs bond. You may set up this RA as a clone resource to monitor the network bonds on different nodes, with the same bond name. This is not related to the IP address or the network on which a bond is configured. You may use this RA to move resources away from a node, which has a faulty bond or prevent moving resources to such a node. This gives you independent control of the resources, without involving cluster intercommunication. But it requires your nodes to have more than one network bond. The resource configuration requires a monitor operation, because the monitor does the main part of the work. In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value. The name of the attribute value is configured in the 'name' option of this RA. Example constraint configuration using crmsh location loc_connected_node my_resource_grp \ rule $id="rule_loc_connected_node" -INF: ovsmonitor-bond-public eq 0 Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where eth0 ethernet device is available. pcs constraint location my_resource rule score=-INFINITY ovsmonitor-bond-public ne 1 The ethmonitor works in 3 different modes to test the bond vitality. 1. call ovs-appctl to see if at least one of the bonding's link status is up (if link is down -> error) 2. call ovs-ofctl and watch the RX counter (if packages come around in a certain time -> success) 3. return error Monitors ovs bonding bonds The name of the network bond which should be monitored (e.g. bond-public). Bond bond name - + The name of the ovs bridge that contains the bridge. ovs bridge - + -The name of the CIB attribute to set. This is the name to be used in the constraints. Defaults to "ethmonitor-'bond_name'". +The name of the CIB attribute to set. This is the name to be used in the constraints. Defaults to "ovsmonitor-'bond_name'". Attribute name - + Multiplier for the value of the CIB attriobute specified in parameter name. Multiplier for result variable - + Specify how often the bond will be monitored, before the status is set to failed. You need to set the timeout of the monitoring operation to at least repeat_count * repeat_interval Monitor repeat count - + Specify how long to wait in seconds between the repeat_counts. Monitor repeat interval in seconds - + Timeout for the RX packet counter. Stop listening for packet counter changes after the given number of seconds. packet counter timeout - + Only report success based on link status. Do not perform RX counter related connectivity tests. link status check only - + END exit $OCF_SUCCESS } # # Return true, if the bond exists # is_bond() { # # List bonds but exclude FreeS/WAN ipsecN virtual bonds # ovs-appctl bond/show $OCF_RESKEY_bond 1>/dev/null 2>&1 } # # Return true, if the bridge exists # is_bridge() { # # List bonds but exclude FreeS/WAN ipsecN virtual bonds # #ovs-appctl bond/show $OCF_RESKEY_bond 1>/dev/null 2>&1 ovs-vsctl show|grep Bridge|grep -q $OCF_RESKEY_bridge } if_init() { local rc if [ X"$OCF_RESKEY_bond" = "X" ]; then ocf_exit_reason "Bond name (the bond parameter) is mandatory" exit $OCF_ERR_CONFIGURED fi if [ X"$OCF_RESKEY_bridge" = "X" ]; then ocf_exit_reason "Bridge name (the bridge parameter) is mandatory" exit $OCF_ERR_CONFIGURED fi BOND="$OCF_RESKEY_bond" BRIDGE="$OCF_RESKEY_bridge" if is_bond then if ! is_bridge then ocf_exit_reason "Bridge $OCF_RESKEY_bond does not exist" exit $OCF_ERR_CONFIGURED; fi else ocf_exit_reason "Bond $OCF_RESKEY_bond does not exist" exit $OCF_ERR_CONFIGURED; fi - : ${OCF_RESKEY_multiplier:="1"} if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]" exit $OCF_ERR_CONFIGURED fi ATTRNAME=${OCF_RESKEY_name:-"ovsmonitor-$BOND"} - REP_COUNT=${OCF_RESKEY_repeat_count:-5} + REP_COUNT=${OCF_RESKEY_repeat_count} if ! ocf_is_decimal "$REP_COUNT" -o [ $REP_COUNT -lt 1 ]; then ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]" exit $OCF_ERR_CONFIGURED fi - REP_INTERVAL_S=${OCF_RESKEY_repeat_interval:-10} + REP_INTERVAL_S=${OCF_RESKEY_repeat_interval} if ! ocf_is_decimal "$REP_INTERVAL_S"; then ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]" exit $OCF_ERR_CONFIGURED fi - : ${OCF_RESKEY_pktcnt_timeout:="5"} if ! ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]" exit $OCF_ERR_CONFIGURED fi return $OCF_SUCCESS } # get the link status on $BOND # asks ip about running (up) bonds, returns the number of matching bond names that are up get_link_status () { #$IP2UTIL -o link show up dev "$BOND" | grep -v 'NO-CARRIER' | grep -c "$BOND" ovs-appctl bond/show "$BOND"|awk -F: '/^slave/ {print $2}'|grep -c enabled } # returns the number of received rx packets on $BOND get_rx_packets () { ocf_log debug "bond $BOND - bridge $BRIDGE" #$IP2UTIL -o -s link show dev "$BOND" \ # | sed 's/.* RX: [^0-9]*[0-9]* *\([0-9]*\) .*/\1/' local ovs_port for ovs_port in $(ovs-appctl bond/show $BOND|awk '/^slave/ {gsub(":","");print $2}') ; do ovs-ofctl dump-ports $BRIDGE $ovs_port done \ | awk -F, 'BEGIN{total=0} /rx/ {gsub(".*pkts=","");total=total+int($1)} END{print total}' } # watch for packet counter changes for max. OCF_RESKEY_pktcnt_timeout seconds # returns immedeately with return code 0 if any packets were received # otherwise 1 is returned watch_pkt_counter () { local RX_PACKETS_NEW local RX_PACKETS_OLD RX_PACKETS_OLD="`get_rx_packets`" for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do sleep 0.1 RX_PACKETS_NEW="`get_rx_packets`" ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW" if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then ocf_log debug "we received some packets." return 0 fi done return 1 } # # Check the bond depending on the level given as parameter: $OCF_RESKEY_check_level # # 10: watch for packet counter changes # # # 30: watch for packet counter changes in promiscios mode # # If unsuccessfull in levels 18 and above, # the tests for higher check levels are run. # if_check () { # always check link status first link_status="`get_link_status`" ocf_log debug "link_status: $link_status (up > 0, down = 0)" if [ $link_status -eq 0 ]; then ocf_log notice "link_status: DOWN" return $OCF_NOT_RUNNING fi # if using link_status_only, skip RX count related test if ocf_is_true "$OCF_RESKEY_link_status_only"; then return $OCF_SUCCESS fi # watch for packet counter changes ocf_log debug "watch for packet counter changes" watch_pkt_counter if [ $? -eq 0 ]; then return $OCF_SUCCESS else ocf_log debug "No packets received during packet watch timeout" fi # watch for packet counter changes in promiscios mode # ocf_log debug "watch for packet counter changes in promiscios mode" # be sure switch off promiscios mode in any case # TODO: check first, wether promisc is already on and leave it untouched. # trap "$IP2UTIL link set dev $BOND promisc off; exit" INT TERM EXIT # $IP2UTIL link set dev $BOND promisc on # watch_pkt_counter && return $OCF_SUCCESS # $IP2UTIL link set dev $BOND promisc off # trap - INT TERM EXIT # looks like it's not working (for whatever reason) return $OCF_NOT_RUNNING } ####################################################################### if_usage() { cat < /dev/null` sleep $sleep_time 2> /dev/null runs=$(($runs + 1)) fi if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error" fi done ocf_log debug "Monitoring return code: $mon_rc" if [ $mon_rc -eq $OCF_SUCCESS ]; then set_cib_value 1 attr_rc=$? else ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed." set_cib_value 0 attr_rc=$? fi ## The resource should not fail, if the bond is down. It should fail, if the update of the CIB variable has errors. ## To react on the bond failure you must use constraints based on the CIB variable value, not on the resource itself. exit $attr_rc } if_stop() { attrd_updater -D -n $ATTRNAME ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop } if_start() { local rc ha_pseudo_resource $OCF_RESOURCE_INSTANCE start rc=$? if [ $rc -ne $OCF_SUCCESS ]; then ocf_exit_reason "Failure to create ovsmonitor state file" return $rc fi # perform the first monitor during the start operation if_monitor return $? } if_validate() { check_binary ovs-vsctl check_binary ovs-appctl check_binary ovs-ofctl if_init } case $__OCF_ACTION in meta-data) meta_data ;; usage|help) if_usage exit $OCF_SUCCESS ;; esac if_validate case $__OCF_ACTION in start) if_start exit $? ;; stop) if_stop exit $? ;; monitor|status) if_monitor exit $? ;; validate-all) exit $? ;; *) if_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac