diff --git a/heartbeat/galera b/heartbeat/galera index 994aad0..d74a70d 100755 --- a/heartbeat/galera +++ b/heartbeat/galera @@ -342,6 +342,14 @@ is_readonly() master_exists() { + if [ "$__OCF_ACTION" = "demote" ]; then + # We don't want to detect master instances during demote. + # 1. we could be detecting ourselves as being master, which is no longer the case. + # 2. we could be detecting other master instances that are in the process of shutting down. + # by not detecting other master instances in "demote" we are deferring this check + # to the next recurring monitor operation which will be much more accurate + return 1 + fi # determine if a master instance is already up and is healthy crm_mon --as-xml | grep "resource.*id=\"${OCF_RESOURCE_INSTANCE}\".*role=\"Master\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" > /dev/null 2>&1 return $? @@ -441,20 +449,24 @@ galera_promote() extra_opts="--wsrep-cluster-address=gcomm://" else ocf_exit_reason "Failure, Attempted to promote Master instance of $OCF_RESOURCE_INSTANCE before bootstrap node has been detected." + clear_last_commit return $OCF_ERR_GENERIC fi - fi - # make sure the read only instance is stopped - mysql_common_stop - rc=$? - if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then - ocf_exit_reason "Failed to stop read-only galera instance during promotion to Master" - return $rc + galera_monitor + if [ $? -eq $OCF_RUNNING_MASTER ]; then + if ocf_is_true $bootstrap; then + promote_everyone + clear_bootstrap_node + ocf_log info "boostrap node already up, promoting the rest of the galera instances." + fi + clear_last_commit + return $OCF_SUCCESS fi - sleep 4 + # last commit is no longer relevant once promoted + clear_last_commit mysql_common_prepare_dirs mysql_common_start "$extra_opts" @@ -492,9 +504,6 @@ galera_promote() wait_for_sync fi - # last commit is no longer relevant once promoted - clear_last_commit - ocf_log info "Galera started" return $OCF_SUCCESS } @@ -510,14 +519,14 @@ galera_demote() # if this node was previously a bootstrap node, that is no longer the case. clear_bootstrap_node + clear_last_commit - # start again in slave mode so the new last commit is recorded + # record last commit by "starting" galera. start is just detection of the last sequence number galera_start } galera_start() { - local extra_opts='--read-only=true' local last_commit echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME @@ -526,22 +535,39 @@ galera_start() return $OCF_ERR_CONFIGURED fi - mysql_common_prepare_dirs - mysql_common_start "$extra_opts" - - is_readonly - if [ $? -ne 0 ]; then - ocf_exit_reason "Slave instance did not start correctly in read-only mode, Make sure local galera.cnf does not have wsrep_cluster_address set." + galera_monitor + if [ $? -eq $OCF_RUNNING_MASTER ]; then + ocf_exit_reason "master galera instance started outside of the cluster's control" return $OCF_ERR_GENERIC fi - ocf_log info "attempting to detect last commit version" - while [ -z "$last_commit" ]; do - last_commit=$(get_status_variable "wsrep_last_committed") - if [ -z "$last_commit" ]; then - sleep 1 + mysql_common_prepare_dirs + + ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat" + last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')" + if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then + ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" + local tmp=$(mktemp) + ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \ + --pid-file=$OCF_RESKEY_pid \ + --socket=$OCF_RESKEY_socket \ + --datadir=$OCF_RESKEY_datadir \ + --user=$OCF_RESKEY_user \ + --wsrep-recover > $tmp 2>&1 + + last_commit="$(cat $tmp | sed -n 's/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p')" + rm -f $tmp + + if [ "$last_commit" = "-1" ]; then + last_commit="0" fi - done + fi + + if [ -z "$last_commit" ]; then + ocf_exit_reason "Unable to detect last known write sequence number" + clear_last_commit + return $OCF_ERR_GENERIC + fi ocf_log info "Last commit version found: $last_commit" set_last_commit $last_commit @@ -567,28 +593,40 @@ galera_monitor() if ocf_is_probe; then status_loglevel="info" fi - + mysql_common_status $status_loglevel rc=$? - # If status returned an error, return that immediately - if [ $rc -ne $OCF_SUCCESS ]; then + if [ $rc -eq $OCF_NOT_RUNNING ]; then + last_commit=$(get_last_commit $node) + if [ -n "$last_commit" ]; then + # if last commit is set, this instance is considered started in slave mode + rc=$OCF_SUCCESS + master_exists + if [ $? -ne 0 ]; then + detect_first_master + else + # a master instance exists and is healthy, promote this + # local read only instance + # so it can join the master galera cluster. + set_master_score + fi + fi + return $rc + elif [ $rc -ne $OCF_SUCCESS ]; then return $rc fi + # if we make it here, mysql is running. Check cluster status now. + echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME if [ $? -ne 0 ]; then ocf_exit_reason "local node <${NODENAME}> is started, but is not a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>" return $OCF_ERR_GENERIC fi - is_readonly - if [ $? -ne 0 ]; then - is_primary - if [ $? -ne 0 ]; then - ocf_exit_reason "local node <${NODENAME}> is neither in primary mode nor in read_only mode. Unknown state." - return $OCF_ERR_GENERIC - fi + is_primary + if [ $? -eq 0 ]; then if ocf_is_probe; then # restore master score during probe @@ -596,18 +634,10 @@ galera_monitor() set_master_score fi rc=$OCF_RUNNING_MASTER - else - master_exists - if [ $? -ne 0 ]; then - detect_first_master - else - # a master instance exists and is healthy, promote this - # local read only instance - # so it can join the master galera cluster. - set_master_score - fi + else + ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." + rc=$OCF_ERR_GENERIC fi - # TODO look at what is done in the wait script return $rc }