You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
466 lines
15 KiB
466 lines
15 KiB
From feffc766c48a1010c1bf4f8b1db74795d06dbd50 Mon Sep 17 00:00:00 2001 |
|
From: David Vossel <dvossel@redhat.com> |
|
Date: Mon, 25 Aug 2014 14:57:09 -0500 |
|
Subject: [PATCH 2/4] ethmonitor updates |
|
|
|
--- |
|
heartbeat/ethmonitor | 290 +++++++++++++++++++++++++++++++++------------------ |
|
1 file changed, 187 insertions(+), 103 deletions(-) |
|
|
|
diff --git a/heartbeat/ethmonitor b/heartbeat/ethmonitor |
|
index b85d7fc..a447391 100755 |
|
--- a/heartbeat/ethmonitor |
|
+++ b/heartbeat/ethmonitor |
|
@@ -1,14 +1,14 @@ |
|
#!/bin/sh |
|
# |
|
-# OCF Resource Agent compliant script. |
|
-# Monitor the vitality of a local network interface. |
|
+# OCF Resource Agent compliant script. |
|
+# Monitor the vitality of a local network interface. |
|
# |
|
# Based on the work by Robert Euhus and Lars Marowsky-Brée. |
|
# |
|
# Transfered from Ipaddr2 into ethmonitor by Alexander Krauth |
|
# |
|
# Copyright (c) 2011 Robert Euhus, Alexander Krauth, Lars Marowsky-Brée |
|
-# All Rights Reserved. |
|
+# All Rights Reserved. |
|
# |
|
# This program is free software; you can redistribute it and/or modify |
|
# it under the terms of version 2 of the GNU General Public License as |
|
@@ -29,12 +29,12 @@ |
|
# along with this program; if not, write the Free Software Foundation, |
|
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. |
|
# |
|
-# OCF parameters are as below |
|
+# OCF parameters are as below |
|
# |
|
# OCF_RESKEY_interface |
|
# OCF_RESKEY_multiplicator |
|
# OCF_RESKEY_name |
|
-# OCF_RESKEY_repeat_count |
|
+# OCF_RESKEY_repeat_count |
|
# OCF_RESKEY_repeat_interval |
|
# OCF_RESKEY_pktcnt_timeout |
|
# OCF_RESKEY_arping_count |
|
@@ -70,10 +70,13 @@ The resource configuration requires a monitor operation, because the monitor doe |
|
In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value. |
|
The name of the attribute value is configured in the 'name' option of this RA. |
|
|
|
-Example constraint configuration: |
|
+Example constraint configuration using crmsh: |
|
location loc_connected_node my_resource_grp \ |
|
rule $id="rule_loc_connected_node" -INF: ethmonitor eq 0 |
|
|
|
+Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where the eth0 ethernet device is available. |
|
+pcs constraint location my_resource rule score=-INFINITY ethmonitor-eth0 ne 1 |
|
+ |
|
The ethmonitor works in 3 different modes to test the interface vitality. |
|
1. call ip to see if the link status is up (if link is down -> error) |
|
2. call ip and watch the RX counter (if packages come around in a certain time -> success) |
|
@@ -157,14 +160,30 @@ Maximum number of IPs from ARP cache list to check for ARP REQUEST (arping) answ |
|
<content type="integer" default="5"/> |
|
</parameter> |
|
|
|
+<parameter name="infiniband_device"> |
|
+<longdesc lang="en"> |
|
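+Name of the infiniband device behind the monitored interface. When set, the interface is considered available only if its link is up and ibstatus reports the device as ACTIVE; the packet counter and arping checks are skipped. |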
|
+</longdesc> |
|
+<shortdesc lang="en">infiniband device</shortdesc> |
|
+<content type="string" /> |
|
+</parameter> |
|
+ |
|
+<parameter name="infiniband_port"> |
|
+<longdesc lang="en"> |
|
+For infiniband devices, this is the port to monitor. If unset, ibstatus is queried for the whole device given in infiniband_device. |
|
+</longdesc> |
|
+<shortdesc lang="en">infiniband port</shortdesc> |
|
+<content type="integer" /> |
|
+</parameter> |
|
+ |
|
</parameters> |
|
<actions> |
|
-<action name="start" timeout="20s" /> |
|
-<action name="stop" timeout="20s" /> |
|
-<action name="status" depth="0" timeout="20s" interval="10s" /> |
|
-<action name="monitor" depth="0" timeout="20s" interval="10s" /> |
|
-<action name="meta-data" timeout="5s" /> |
|
-<action name="validate-all" timeout="20s" /> |
|
+<action name="start" timeout="60s" /> |
|
+<action name="stop" timeout="20s" /> |
|
+<action name="status" depth="0" timeout="60s" interval="10s" /> |
|
+<action name="monitor" depth="0" timeout="60s" interval="10s" /> |
|
+<action name="meta-data" timeout="5s" /> |
|
+<action name="validate-all" timeout="20s" /> |
|
</actions> |
|
</resource-agent> |
|
END |
|
@@ -173,7 +192,7 @@ END |
|
} |
|
|
|
# |
|
-# Return true, if the interface exists |
|
+# Return true, if the interface exists |
|
# |
|
is_interface() { |
|
# |
|
@@ -181,14 +200,25 @@ is_interface() { |
|
# |
|
local iface=`$IP2UTIL -o -f inet addr show | grep " $1 " \ |
|
| cut -d ' ' -f2 | sort -u | grep -v '^ipsec[0-9][0-9]*$'` |
|
- [ "$iface" != "" ] |
|
+ [ "$iface" != "" ] |
|
+} |
|
+ |
|
+infiniband_status() |
|
+{ |
|
+ local device="$OCF_RESKEY_infiniband_device" |
|
+ |
|
+ if [ -n "$OCF_RESKEY_infiniband_port" ]; then |
|
+ device="${OCF_RESKEY_infiniband_device}:${OCF_RESKEY_infiniband_port}" |
|
+ fi |
|
+ |
|
+ ibstatus ${device} | grep -q ACTIVE |
|
} |
|
|
|
if_init() { |
|
local rc |
|
|
|
if [ X"$OCF_RESKEY_interface" = "X" ]; then |
|
- ocf_log err "Interface name (the interface parameter) is mandatory" |
|
+ ocf_exit_reason "Interface name (the interface parameter) is mandatory" |
|
exit $OCF_ERR_CONFIGURED |
|
fi |
|
|
|
@@ -196,60 +226,67 @@ if_init() { |
|
|
|
if is_interface $NIC |
|
then |
|
- case "$NIC" in |
|
- *:*) ocf_log err "Do not specify a virtual interface : $OCF_RESKEY_interface" |
|
- exit $OCF_ERR_CONFIGURED;; |
|
- *) ;; |
|
- esac |
|
+ case "$NIC" in |
|
+ *:*) ocf_exit_reason "Do not specify a virtual interface : $OCF_RESKEY_interface" |
|
+ exit $OCF_ERR_CONFIGURED;; |
|
+ *) ;; |
|
+ esac |
|
else |
|
- case $__OCF_ACTION in |
|
- validate-all) ocf_log err "Interface $NIC does not exist" |
|
- exit $OCF_ERR_CONFIGURED;; |
|
- *) ocf_log warn "Interface $NIC does not exist" |
|
- ## It might be a bond interface which is temporarily not available, therefore we want to continue here |
|
- ;; |
|
- esac |
|
+ case $__OCF_ACTION in |
|
+ validate-all) |
|
+ ocf_exit_reason "Interface $NIC does not exist" |
|
+ exit $OCF_ERR_CONFIGURED;; |
|
+ *) |
|
+ ## It might be a bond interface which is temporarily not available, therefore we want to continue here |
|
+ ocf_log warn "Interface $NIC does not exist" |
|
+ ;; |
|
+ esac |
|
fi |
|
|
|
: ${OCF_RESKEY_multiplier:="1"} |
|
if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then |
|
- ocf_log err "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]" |
|
+ ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]" |
|
exit $OCF_ERR_CONFIGURED |
|
fi |
|
|
|
ATTRNAME=${OCF_RESKEY_name:-"ethmonitor-$NIC"} |
|
|
|
- REP_COUNT=${OCF_RESKEY_repeat_count:-5} |
|
+ REP_COUNT=${OCF_RESKEY_repeat_count:-5} |
|
if ! ocf_is_decimal "$REP_COUNT" -o [ $REP_COUNT -lt 1 ]; then |
|
- ocf_log err "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]" |
|
+ ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]" |
|
exit $OCF_ERR_CONFIGURED |
|
- fi |
|
+ fi |
|
REP_INTERVAL_S=${OCF_RESKEY_repeat_interval:-10} |
|
if ! ocf_is_decimal "$REP_INTERVAL_S"; then |
|
- ocf_log err "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]" |
|
+ ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]" |
|
exit $OCF_ERR_CONFIGURED |
|
fi |
|
: ${OCF_RESKEY_pktcnt_timeout:="5"} |
|
if ! ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then |
|
- ocf_log err "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]" |
|
+ ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]" |
|
exit $OCF_ERR_CONFIGURED |
|
fi |
|
: ${OCF_RESKEY_arping_count:="1"} |
|
if ! ocf_is_decimal "$OCF_RESKEY_arping_count"; then |
|
- ocf_log err "Invalid OCF_RESKEY_arping_count [$OCF_RESKEY_arping_count]" |
|
+ ocf_exit_reason "Invalid OCF_RESKEY_arping_count [$OCF_RESKEY_arping_count]" |
|
exit $OCF_ERR_CONFIGURED |
|
fi |
|
: ${OCF_RESKEY_arping_timeout:="1"} |
|
if ! ocf_is_decimal "$OCF_RESKEY_arping_timeout"; then |
|
- ocf_log err "Invalid OCF_RESKEY_arping_timeout [$OCF_RESKEY_arping_count]" |
|
+ ocf_exit_reason "Invalid OCF_RESKEY_arping_timeout [$OCF_RESKEY_arping_timeout]" |
|
exit $OCF_ERR_CONFIGURED |
|
fi |
|
: ${OCF_RESKEY_arping_cache_entries:="5"} |
|
if ! ocf_is_decimal "$OCF_RESKEY_arping_cache_entries"; then |
|
- ocf_log err "Invalid OCF_RESKEY_arping_cache_entries [$OCF_RESKEY_arping_cache_entries]" |
|
+ ocf_exit_reason "Invalid OCF_RESKEY_arping_cache_entries [$OCF_RESKEY_arping_cache_entries]" |
|
exit $OCF_ERR_CONFIGURED |
|
fi |
|
- return $OCF_SUCCESS |
|
+ |
|
+ if [ -n "$OCF_RESKEY_infiniband_device" ]; then |
|
+ #ibstatus is required if an infiniband_device is provided |
|
+ check_binary ibstatus |
|
+ fi |
|
+ return $OCF_SUCCESS |
|
} |
|
|
|
# get the link status on $NIC |
|
@@ -277,7 +314,7 @@ watch_pkt_counter () { |
|
for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do |
|
sleep 0.1 |
|
RX_PACKETS_NEW="`get_rx_packets`" |
|
- ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW" |
|
+ ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW" |
|
if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then |
|
ocf_log debug "we received some packets." |
|
return 0 |
|
@@ -308,7 +345,7 @@ do_arping () { |
|
} |
|
|
|
# |
|
-# Check the interface depending on the level given as parameter: $OCF_RESKEY_check_level |
|
+# Check the interface depending on the level given as parameter: $OCF_RESKEY_check_level |
|
# |
|
# 09: check for nonempty ARP cache |
|
# 10: watch for packet counter changes |
|
@@ -322,21 +359,47 @@ do_arping () { |
|
# the tests for higher check levels are run. |
|
# |
|
if_check () { |
|
+ local arp_list |
|
# always check link status first |
|
link_status="`get_link_status`" |
|
ocf_log debug "link_status: $link_status (1=up, 0=down)" |
|
- [ $link_status -eq 0 ] && return $OCF_NOT_RUNNING |
|
+ |
|
+ if [ $link_status -eq 0 ]; then |
|
+ ocf_log notice "link_status: DOWN" |
|
+ return $OCF_NOT_RUNNING |
|
+ fi |
|
+ |
|
+ # if this is an infiniband device, try ibstatus script |
|
+ if [ -n "$OCF_RESKEY_infiniband_device" ]; then |
|
+ if infiniband_status; then |
|
+ return $OCF_SUCCESS |
|
+ fi |
|
+ ocf_log info "Infiniband device $OCF_RESKEY_infiniband_device is not available, check ibstatus for more information" |
|
+ return $OCF_NOT_RUNNING |
|
+ fi |
|
|
|
# watch for packet counter changes |
|
- ocf_log debug "watch for packet counter changes" |
|
- watch_pkt_counter && return $OCF_SUCCESS |
|
+ ocf_log debug "watch for packet counter changes" |
|
+ watch_pkt_counter |
|
+ if [ $? -eq 0 ]; then |
|
+ return $OCF_SUCCESS |
|
+ else |
|
+ ocf_log debug "No packets received during packet watch timeout" |
|
+ fi |
|
|
|
# check arping ARP cache entries |
|
- ocf_log debug "check arping ARP cache entries" |
|
- for ip in `get_arp_list`; do |
|
+ ocf_log debug "check arping ARP cache entries" |
|
+ arp_list=`get_arp_list` |
|
+ for ip in `echo $arp_list`; do |
|
do_arping $ip && return $OCF_SUCCESS |
|
done |
|
|
|
+ # if we get here, the ethernet device is considered not running. |
|
+ # provide some logging information |
|
+ if [ -z "$arp_list" ]; then |
|
+ ocf_log info "No ARP cache entries found to arping" |
|
+ fi |
|
+ |
|
# watch for packet counter changes in promiscios mode |
|
# ocf_log debug "watch for packet counter changes in promiscios mode" |
|
# be sure switch off promiscios mode in any case |
|
@@ -362,67 +425,89 @@ END |
|
} |
|
|
|
set_cib_value() { |
|
- local score=`expr $1 \* $OCF_RESKEY_multiplier` |
|
- attrd_updater -n $ATTRNAME -v $score -q |
|
- local rc=$? |
|
- case $rc in |
|
- 0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;; |
|
- *) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";; |
|
- esac |
|
- return $rc |
|
+ local score=`expr $1 \* $OCF_RESKEY_multiplier` |
|
+ attrd_updater -n $ATTRNAME -v $score -q |
|
+ local rc=$? |
|
+ case $rc in |
|
+ 0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;; |
|
+ *) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";; |
|
+ esac |
|
+ return $rc |
|
} |
|
|
|
if_monitor() { |
|
- ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor |
|
- local pseudo_status=$? |
|
- if [ $pseudo_status -ne $OCF_SUCCESS ]; then |
|
- exit $pseudo_status |
|
- fi |
|
- |
|
- local mon_rc=$OCF_NOT_RUNNING |
|
- local attr_rc=$OCF_NOT_RUNNING |
|
- local runs=0 |
|
- local start_time |
|
- local end_time |
|
- local sleep_time |
|
- while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ] |
|
- do |
|
- start_time=`date +%s%N` |
|
- if_check |
|
- mon_rc=$? |
|
- REP_COUNT=$(( $REP_COUNT - 1 )) |
|
- if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then |
|
- ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left." |
|
- end_time=`date +%s%N` |
|
- sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null` |
|
- sleep $sleep_time 2> /dev/null |
|
- runs=$(($runs + 1)) |
|
- fi |
|
- |
|
- if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then |
|
- ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error" |
|
- fi |
|
- done |
|
- |
|
- ocf_log debug "Monitoring return code: $mon_rc" |
|
- if [ $mon_rc -eq $OCF_SUCCESS ]; then |
|
- set_cib_value 1 |
|
- attr_rc=$? |
|
- else |
|
- ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed." |
|
- set_cib_value 0 |
|
- attr_rc=$? |
|
- fi |
|
- |
|
- ## The resource should not fail, if the interface is down. It should fail, if the update of the CIB variable has errors. |
|
- ## To react on the interface failure you must use constraints based on the CIB variable value, not on the resource itself. |
|
- exit $attr_rc |
|
+ ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor |
|
+ local pseudo_status=$? |
|
+ if [ $pseudo_status -ne $OCF_SUCCESS ]; then |
|
+ exit $pseudo_status |
|
+ fi |
|
+ |
|
+ local mon_rc=$OCF_NOT_RUNNING |
|
+ local attr_rc=$OCF_NOT_RUNNING |
|
+ local runs=0 |
|
+ local start_time |
|
+ local end_time |
|
+ local sleep_time |
|
+ while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ] |
|
+ do |
|
+ start_time=`date +%s%N` |
|
+ if_check |
|
+ mon_rc=$? |
|
+ REP_COUNT=$(( $REP_COUNT - 1 )) |
|
+ if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then |
|
+ ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left." |
|
+ end_time=`date +%s%N` |
|
+ sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null` |
|
+ sleep $sleep_time 2> /dev/null |
|
+ runs=$(($runs + 1)) |
|
+ fi |
|
+ |
|
+ if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then |
|
+ ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error" |
|
+ fi |
|
+ done |
|
+ |
|
+ ocf_log debug "Monitoring return code: $mon_rc" |
|
+ if [ $mon_rc -eq $OCF_SUCCESS ]; then |
|
+ set_cib_value 1 |
|
+ attr_rc=$? |
|
+ else |
|
+ ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed." |
|
+ set_cib_value 0 |
|
+ attr_rc=$? |
|
+ fi |
|
+ |
|
+ ## The resource should not fail, if the interface is down. It should fail, if the update of the CIB variable has errors. |
|
+ ## To react on the interface failure you must use constraints based on the CIB variable value, not on the resource itself. |
|
+ exit $attr_rc |
|
+} |
|
+ |
|
+if_stop() |
|
+{ |
|
+ attrd_updater -D -n $ATTRNAME |
|
+ ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop |
|
} |
|
|
|
+if_start() |
|
+{ |
|
+ local rc |
|
+ ha_pseudo_resource $OCF_RESOURCE_INSTANCE start |
|
+ rc=$? |
|
+ if [ $rc -ne $OCF_SUCCESS ]; then |
|
+ ocf_exit_reason "Failure to create ethmonitor state file" |
|
+ return $rc |
|
+ fi |
|
+ |
|
+ # perform the first monitor during the start operation |
|
+ if_monitor |
|
+ return $? |
|
+} |
|
+ |
|
+ |
|
if_validate() { |
|
- check_binary $IP2UTIL |
|
- check_binary arping |
|
- if_init |
|
+ check_binary $IP2UTIL |
|
+ check_binary arping |
|
+ if_init |
|
} |
|
|
|
case $__OCF_ACTION in |
|
@@ -436,18 +521,17 @@ esac |
|
if_validate |
|
|
|
case $__OCF_ACTION in |
|
-start) ha_pseudo_resource $OCF_RESOURCE_INSTANCE start |
|
+start) if_start |
|
exit $? |
|
;; |
|
-stop) attrd_updater -D -n $ATTRNAME |
|
- ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop |
|
+stop) if_stop |
|
exit $? |
|
;; |
|
monitor|status) if_monitor |
|
exit $? |
|
;; |
|
validate-all) exit $? |
|
- ;; |
|
+ ;; |
|
*) if_usage |
|
exit $OCF_ERR_UNIMPLEMENTED |
|
;; |
|
-- |
|
1.8.4.2 |
|
|
|
|