diff -uNr a/heartbeat/pgsql b/heartbeat/pgsql
--- a/heartbeat/pgsql	2017-03-09 11:50:06.365145803 +0100
+++ b/heartbeat/pgsql	2017-03-09 12:19:41.566177608 +0100
@@ -966,8 +966,13 @@
             cmp_location=`printf "$master_baseline\n$my_master_baseline\n" |\
                           sort | head -1`
             if [ "$cmp_location" != "$my_master_baseline" ]; then
+                # We used to set the failcount to INF for the resource here in
+                # order to move the master to the other node. However, setting
+                # the failcount should be done only by the CRM and so this use
+                # got deprecated in pacemaker version 1.1.17. Now we do the
+                # "ban resource from the node".
                 ocf_exit_reason "My data is newer than new master's one. New master's location : $master_baseline"
-                $CRM_FAILCOUNT -r $OCF_RESOURCE_INSTANCE -U $NODENAME -v INFINITY
+                exec_with_retry 0 $CRM_RESOURCE -B -r "$OCF_RESOURCE_INSTANCE" -N "$NODENAME" -Q
                 return $OCF_ERR_GENERIC
             fi
         fi
@@ -1526,6 +1531,43 @@
     wait $func_pid
 }
 
+# Run a command, retrying once per second until it exits with status 0.
+# arg1       : maximum number of attempts; 0 means "retry for about one
+#              day" (86400 attempts, one second apart)
+# arg2..argN : command and its arguments, each passed as a separate word
+# stdout     : output of the successful run (trailing newlines stripped)
+# return     : 0 on success, else the exit status of the last attempt
+exec_with_retry() {
+    local count="86400"
+    local output
+    # Generic error if no attempt is ever made (negative arg1).
+    local rc=1
+
+    if [ "$1" -ne 0 ]; then
+        count=$1
+    fi
+    shift
+
+    while [ "$count" -gt 0 ]; do
+        # "$@" keeps each argument as a single word.
+        output=$("$@")
+        rc=$?
+        if [ $rc -ne 0 ]; then
+            ocf_log warn "Retrying(remain $count). \"$*\" failed. rc=$rc. stdout=\"$output\"."
+            count=$((count - 1))
+            sleep 1
+        else
+            # Print the output as data; using it as the printf format
+            # would mangle any '%' or '\' it contains.
+            printf '%s' "$output"
+            return 0
+        fi
+    done
+
+    ocf_exit_reason "giving up executing \"$*\""
+    return $rc
+}
+
 is_node_online() {
     crm_mon -1 -n | tr '[A-Z]' '[a-z]' | grep -e "^node $1 " -e "^node $1:" | grep -q -v "offline"
 }
@@ -1734,7 +1776,7 @@
     CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot"
     CRM_ATTR_REBOOT="${HA_SBIN_DIR}/crm_attribute -l reboot"
     CRM_ATTR_FOREVER="${HA_SBIN_DIR}/crm_attribute -l forever"
-    CRM_FAILCOUNT="${HA_SBIN_DIR}/crm_failcount"
+    CRM_RESOURCE="${HA_SBIN_DIR}/crm_resource"
 
     CAN_NOT_PROMOTE="-INFINITY"
     CAN_PROMOTE="100"