From 4d98bbcdadda60166faf7ccc512b9095b439e2bd Mon Sep 17 00:00:00 2001
From: Damien Ciabrini
Date: Tue, 2 Feb 2016 16:29:10 +0100
Subject: [PATCH] galera: prevent recovered nodes from bootstrapping cluster
 when possible

---
Review notes (this area is ignored when the patch is applied):
 - Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch; verify the context whitespace against heartbeat/galera
   before applying (or apply with whitespace-insensitive matching).
 - The added lines differ from the original commit (quoting, a stdout
   redirect, extra comments), so the diffstat and the post-image hash on the
   "index" line below still describe the original commit.

 heartbeat/README.galera | 19 ++++++++++++++++++-
 heartbeat/galera        | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/heartbeat/galera b/heartbeat/galera
index ca94c21..84c92fd 100755
--- a/heartbeat/galera
+++ b/heartbeat/galera
@@ -276,6 +276,29 @@ is_bootstrap()
 
 }
 
+# Flag this node as "heuristic recovered" by setting the reboot-lifetime
+# node attribute ${INSTANCE_ATTR_NAME}-heuristic-recovered to "true".
+set_heuristic_recovered()
+{
+    ${HA_SBIN_DIR}/crm_attribute -N "$NODENAME" -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -v "true"
+}
+
+# Delete the heuristic-recovered attribute of this node.
+clear_heuristic_recovered()
+{
+    ${HA_SBIN_DIR}/crm_attribute -N "$NODENAME" -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -D
+}
+
+# Query the heuristic-recovered attribute of a node.
+# $1: name of the node to query
+# Returns the exit status of crm_attribute. Its stdout is passed through
+# to the caller; its stderr is discarded.
+is_heuristic_recovered()
+{
+    local node=$1
+    ${HA_SBIN_DIR}/crm_attribute -N "$node" -l reboot --name "${INSTANCE_ATTR_NAME}-heuristic-recovered" -Q 2>/dev/null
+}
+
 clear_last_commit()
 {
     ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -D
@@ -398,8 +421,20 @@ detect_first_master()
     local best_node="$NODENAME"
     local last_commit=0
     local missing_nodes=0
+    local nodes=""
+    local nodes_recovered=""
 
+    # avoid selecting a recovered node as bootstrap if possible
     for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do
+        # only the exit status is needed; keep the attribute value off stdout
+        if is_heuristic_recovered "$node" >/dev/null; then
+            nodes_recovered="$nodes_recovered $node"
+        else
+            nodes="$nodes $node"
+        fi
+    done
+
+    for node in $nodes_recovered $nodes; do
         last_commit=$(get_last_commit $node)
 
         if [ -z "$last_commit" ]; then
@@ -466,6 +501,12 @@ detect_last_commit()
                                  --tc-heuristic-recover=rollback > $tmp 2>/dev/null
 
             last_commit="$(cat $tmp | sed -n $recovered_position_regex)"
+            if [ -n "$last_commit" ]; then
+                ocf_log warn "State recovered. force SST at next restart for full resynchronization"
+                rm -f "${OCF_RESKEY_datadir}/grastate.dat"
+                # try not to use this node if bootstrap is needed
+                set_heuristic_recovered
+            fi
         fi
     fi
     rm -f $tmp $tmperr
@@ -549,11 +590,17 @@ galera_promote()
     if ocf_is_true $bootstrap; then
         promote_everyone
         clear_bootstrap_node
+        # clear attribute heuristic-recovered. if last shutdown was
+        # not clean, we cannot be extra-cautious by requesting a SST
+        # since this is the bootstrap node
+        clear_heuristic_recovered
         ocf_log info "Bootstrap complete, promoting the rest of the galera instances."
     else
         # if this is not the bootstrap node, make sure this instance
         # syncs with the rest of the cluster before promotion returns.
         wait_for_sync
+        # sync is done, clear info about last recovery
+        clear_heuristic_recovered
     fi
 
     ocf_log info "Galera started"