From feffc766c48a1010c1bf4f8b1db74795d06dbd50 Mon Sep 17 00:00:00 2001 From: David Vossel Date: Mon, 25 Aug 2014 14:57:09 -0500 Subject: [PATCH 2/4] ethmonitor updates --- heartbeat/ethmonitor | 290 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 187 insertions(+), 103 deletions(-) diff --git a/heartbeat/ethmonitor b/heartbeat/ethmonitor index b85d7fc..a447391 100755 --- a/heartbeat/ethmonitor +++ b/heartbeat/ethmonitor @@ -1,14 +1,14 @@ #!/bin/sh # -# OCF Resource Agent compliant script. -# Monitor the vitality of a local network interface. +# OCF Resource Agent compliant script. +# Monitor the vitality of a local network interface. # # Based on the work by Robert Euhus and Lars Marowsky-Brée. # # Transfered from Ipaddr2 into ethmonitor by Alexander Krauth # # Copyright (c) 2011 Robert Euhus, Alexander Krauth, Lars Marowsky-Brée -# All Rights Reserved. +# All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as @@ -29,12 +29,12 @@ # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # -# OCF parameters are as below +# OCF parameters are as below # # OCF_RESKEY_interface # OCF_RESKEY_multiplicator # OCF_RESKEY_name -# OCF_RESKEY_repeat_count +# OCF_RESKEY_repeat_count # OCF_RESKEY_repeat_interval # OCF_RESKEY_pktcnt_timeout # OCF_RESKEY_arping_count @@ -70,10 +70,13 @@ The resource configuration requires a monitor operation, because the monitor doe In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value. The name of the attribute value is configured in the 'name' option of this RA. -Example constraint configuration: +Example constraint configuration using crmsh location loc_connected_node my_resource_grp \ rule $id="rule_loc_connected_node" -INF: ethmonitor eq 0 +Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where eth0 ethernet device is available. +pcs constraint location my_resource rule score=-INFINITY ethmonitor-eth0 ne 1 + The ethmonitor works in 3 different modes to test the interface vitality. 1. call ip to see if the link status is up (if link is down -> error) 2. call ip and watch the RX counter (if packages come around in a certain time -> success) @@ -157,14 +160,30 @@ Maximum number of IPs from ARP cache list to check for ARP REQUEST (arping) answ + + +For interfaces that are infiniband devices. + +infiniband device + + + + + +For infiniband devices, this is the port to monitor. + +infiniband port + + + - - - - - - + + + + + + END @@ -173,7 +192,7 @@ END } # -# Return true, if the interface exists +# Return true, if the interface exists # is_interface() { # @@ -181,14 +200,25 @@ is_interface() { # local iface=`$IP2UTIL -o -f inet addr show | grep " $1 " \ | cut -d ' ' -f2 | sort -u | grep -v '^ipsec[0-9][0-9]*$'` - [ "$iface" != "" ] + [ "$iface" != "" ] +} + +infiniband_status() +{ + local device="$OCF_RESKEY_infiniband_device" + + if [ -n "$OCF_RESKEY_infiniband_port" ]; then + device="${OCF_RESKEY_infiniband_device}:${OCF_RESKEY_infiniband_port}" + fi + + ibstatus ${device} | grep -q ACTIVE } if_init() { local rc if [ X"$OCF_RESKEY_interface" = "X" ]; then - ocf_log err "Interface name (the interface parameter) is mandatory" + ocf_exit_reason "Interface name (the interface parameter) is mandatory" exit $OCF_ERR_CONFIGURED fi @@ -196,60 +226,67 @@ if_init() { if is_interface $NIC then - case "$NIC" in - *:*) ocf_log err "Do not specify a virtual interface : $OCF_RESKEY_interface" - exit $OCF_ERR_CONFIGURED;; - *) ;; - esac + case "$NIC" in + *:*) ocf_exit_reason "Do not specify a virtual interface : $OCF_RESKEY_interface" + exit $OCF_ERR_CONFIGURED;; + *) ;; + esac else - case $__OCF_ACTION in - validate-all) ocf_log err "Interface $NIC does not exist" - exit $OCF_ERR_CONFIGURED;; - *) ocf_log warn "Interface $NIC does not exist" - ## It might be a bond interface which is temporarily not available, therefore we want to continue here - ;; - esac + case $__OCF_ACTION in + validate-all) + ocf_exit_reason "Interface $NIC does not exist" + exit $OCF_ERR_CONFIGURED;; + *) + ## It might be a bond interface which is temporarily not available, therefore we want to continue here + ocf_log warn "Interface $NIC does not exist" + ;; + esac fi : ${OCF_RESKEY_multiplier:="1"} if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then - ocf_log err "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]" + ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]" exit $OCF_ERR_CONFIGURED fi ATTRNAME=${OCF_RESKEY_name:-"ethmonitor-$NIC"} - REP_COUNT=${OCF_RESKEY_repeat_count:-5} + REP_COUNT=${OCF_RESKEY_repeat_count:-5} if ! ocf_is_decimal "$REP_COUNT" -o [ $REP_COUNT -lt 1 ]; then - ocf_log err "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]" + ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]" exit $OCF_ERR_CONFIGURED - fi + fi REP_INTERVAL_S=${OCF_RESKEY_repeat_interval:-10} if ! ocf_is_decimal "$REP_INTERVAL_S"; then - ocf_log err "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]" + ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]" exit $OCF_ERR_CONFIGURED fi : ${OCF_RESKEY_pktcnt_timeout:="5"} if ! ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then - ocf_log err "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]" + ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]" exit $OCF_ERR_CONFIGURED fi : ${OCF_RESKEY_arping_count:="1"} if ! ocf_is_decimal "$OCF_RESKEY_arping_count"; then - ocf_log err "Invalid OCF_RESKEY_arping_count [$OCF_RESKEY_arping_count]" + ocf_exit_reason "Invalid OCF_RESKEY_arping_count [$OCF_RESKEY_arping_count]" exit $OCF_ERR_CONFIGURED fi : ${OCF_RESKEY_arping_timeout:="1"} if ! ocf_is_decimal "$OCF_RESKEY_arping_timeout"; then - ocf_log err "Invalid OCF_RESKEY_arping_timeout [$OCF_RESKEY_arping_count]" + ocf_exit_reason "Invalid OCF_RESKEY_arping_timeout [$OCF_RESKEY_arping_count]" exit $OCF_ERR_CONFIGURED fi : ${OCF_RESKEY_arping_cache_entries:="5"} if ! ocf_is_decimal "$OCF_RESKEY_arping_cache_entries"; then - ocf_log err "Invalid OCF_RESKEY_arping_cache_entries [$OCF_RESKEY_arping_cache_entries]" + ocf_exit_reason "Invalid OCF_RESKEY_arping_cache_entries [$OCF_RESKEY_arping_cache_entries]" exit $OCF_ERR_CONFIGURED fi - return $OCF_SUCCESS + + if [ -n "$OCF_RESKEY_infiniband_device" ]; then + #ibstatus is required if an infiniband_device is provided + check_binary ibstatus + fi + return $OCF_SUCCESS } # get the link status on $NIC @@ -277,7 +314,7 @@ watch_pkt_counter () { for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do sleep 0.1 RX_PACKETS_NEW="`get_rx_packets`" - ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW" + ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW" if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then ocf_log debug "we received some packets." return 0 @@ -308,7 +345,7 @@ do_arping () { } # -# Check the interface depending on the level given as parameter: $OCF_RESKEY_check_level +# Check the interface depending on the level given as parameter: $OCF_RESKEY_check_level # # 09: check for nonempty ARP cache # 10: watch for packet counter changes @@ -322,21 +359,47 @@ do_arping () { # the tests for higher check levels are run. # if_check () { + local arp_list # always check link status first link_status="`get_link_status`" ocf_log debug "link_status: $link_status (1=up, 0=down)" - [ $link_status -eq 0 ] && return $OCF_NOT_RUNNING + + if [ $link_status -eq 0 ]; then + ocf_log notice "link_status: DOWN" + return $OCF_NOT_RUNNING + fi + + # if this is an infiniband device, try ibstatus script + if [ -n "$OCF_RESKEY_infiniband_device" ]; then + if infiniband_status; then + return $OCF_SUCCESS + fi + ocf_log info "Infiniband device $OCF_RESKEY_infiniband_device is not available, check ibstatus for more information" + return $OCF_NOT_RUNNING + fi # watch for packet counter changes - ocf_log debug "watch for packet counter changes" - watch_pkt_counter && return $OCF_SUCCESS + ocf_log debug "watch for packet counter changes" + watch_pkt_counter + if [ $? -eq 0 ]; then + return $OCF_SUCCESS + else + ocf_log debug "No packets received during packet watch timeout" + fi # check arping ARP cache entries - ocf_log debug "check arping ARP cache entries" - for ip in `get_arp_list`; do + ocf_log debug "check arping ARP cache entries" + arp_list=`get_arp_list` + for ip in `echo $arp_list`; do do_arping $ip && return $OCF_SUCCESS done + # if we get here, the ethernet device is considered not running. + # provide some logging information + if [ -z "$arp_list" ]; then + ocf_log info "No ARP cache entries found to arping" + fi + # watch for packet counter changes in promiscios mode # ocf_log debug "watch for packet counter changes in promiscios mode" # be sure switch off promiscios mode in any case @@ -362,67 +425,89 @@ END } set_cib_value() { - local score=`expr $1 \* $OCF_RESKEY_multiplier` - attrd_updater -n $ATTRNAME -v $score -q - local rc=$? - case $rc in - 0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;; - *) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";; - esac - return $rc + local score=`expr $1 \* $OCF_RESKEY_multiplier` + attrd_updater -n $ATTRNAME -v $score -q + local rc=$? + case $rc in + 0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;; + *) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";; + esac + return $rc } if_monitor() { - ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor - local pseudo_status=$? - if [ $pseudo_status -ne $OCF_SUCCESS ]; then - exit $pseudo_status - fi - - local mon_rc=$OCF_NOT_RUNNING - local attr_rc=$OCF_NOT_RUNNING - local runs=0 - local start_time - local end_time - local sleep_time - while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ] - do - start_time=`date +%s%N` - if_check - mon_rc=$? - REP_COUNT=$(( $REP_COUNT - 1 )) - if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then - ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left." - end_time=`date +%s%N` - sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null` - sleep $sleep_time 2> /dev/null - runs=$(($runs + 1)) - fi - - if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then - ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error" - fi - done - - ocf_log debug "Monitoring return code: $mon_rc" - if [ $mon_rc -eq $OCF_SUCCESS ]; then - set_cib_value 1 - attr_rc=$? - else - ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed." - set_cib_value 0 - attr_rc=$? - fi - - ## The resource should not fail, if the interface is down. It should fail, if the update of the CIB variable has errors. - ## To react on the interface failure you must use constraints based on the CIB variable value, not on the resource itself. - exit $attr_rc + ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor + local pseudo_status=$? + if [ $pseudo_status -ne $OCF_SUCCESS ]; then + exit $pseudo_status + fi + + local mon_rc=$OCF_NOT_RUNNING + local attr_rc=$OCF_NOT_RUNNING + local runs=0 + local start_time + local end_time + local sleep_time + while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ] + do + start_time=`date +%s%N` + if_check + mon_rc=$? + REP_COUNT=$(( $REP_COUNT - 1 )) + if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then + ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left." + end_time=`date +%s%N` + sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null` + sleep $sleep_time 2> /dev/null + runs=$(($runs + 1)) + fi + + if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then + ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error" + fi + done + + ocf_log debug "Monitoring return code: $mon_rc" + if [ $mon_rc -eq $OCF_SUCCESS ]; then + set_cib_value 1 + attr_rc=$? + else + ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed." + set_cib_value 0 + attr_rc=$? + fi + + ## The resource should not fail, if the interface is down. It should fail, if the update of the CIB variable has errors. + ## To react on the interface failure you must use constraints based on the CIB variable value, not on the resource itself. + exit $attr_rc +} + +if_stop() +{ + attrd_updater -D -n $ATTRNAME + ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop } +if_start() +{ + local rc + ha_pseudo_resource $OCF_RESOURCE_INSTANCE start + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Failure to create ethmonitor state file" + return $rc + fi + + # perform the first monitor during the start operation + if_monitor + return $? +} + + if_validate() { - check_binary $IP2UTIL - check_binary arping - if_init + check_binary $IP2UTIL + check_binary arping + if_init } case $__OCF_ACTION in @@ -436,18 +521,17 @@ esac if_validate case $__OCF_ACTION in -start) ha_pseudo_resource $OCF_RESOURCE_INSTANCE start +start) if_start exit $? ;; -stop) attrd_updater -D -n $ATTRNAME - ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop +stop) if_stop exit $? ;; monitor|status) if_monitor exit $? ;; validate-all) exit $? - ;; + ;; *) if_usage exit $OCF_ERR_UNIMPLEMENTED ;; -- 1.8.4.2