diff -uNr a/heartbeat/SAPHana b/heartbeat/SAPHana
--- a/heartbeat/SAPHana 2016-04-26 12:01:55.620889964 +0200
+++ b/heartbeat/SAPHana 2016-04-26 12:03:17.240897137 +0200
@@ -2,9 +2,9 @@
 #
 # SAPHana
 #
-# Description: Manages two single SAP HANA Instance in System Replication
+# Description: Manages two single SAP HANA Instance in System Replication
 # Planned: do also manage scale-up scenarios
-# currently the SAPHana is dependent of the analysis of
+# currently the SAPHana is dependent of the analysis of
 # SAPHanaTopology
 # For supported scenarios please read the README file provided
 # in the same software package (rpm)
@@ -16,16 +16,17 @@
 # Support: linux@sap.com
 # License: GNU General Public License (GPL)
 # Copyright: (c) 2013,2014 SUSE Linux Products GmbH
+# Copyright: (c) 2015 SUSE Linux GmbH
 #
-# An example usage:
+# An example usage:
 # See usage() function below for more details...
 #
 # OCF instance parameters:
-# OCF_RESKEY_SID
-# OCF_RESKEY_InstanceNumber
-# OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default)
-# OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default)
-# OCF_RESKEY_INSTANCE_PROFILE (optional, well known directories will be searched by default)
+# OCF_RESKEY_SID
+# OCF_RESKEY_InstanceNumber
+# OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default)
+# OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default)
+# OCF_RESKEY_INSTANCE_PROFILE (optional, well known directories will be searched by default)
 # OCF_RESKEY_PREFER_SITE_TAKEOVER (optional, default is no)
 # OCF_RESKEY_DUPLICATE_PRIMARY_TIMEOUT (optional, time difference needed between two last-primary-tiemstampe (lpt))
 # OCF_RESKEY_SAPHanaFilter (optional, should only be set if been told by support or for debugging purposes)
@@ -71,7 +72,7 @@
     info )
         case "$shf" in
             all) skip=0
-                 ;;
+                ;;
             none )
                 skip=1
                 ;;
@@ -80,13 +81,13 @@
                 mtype=${mtype#fh}
                 echo "$shf"| grep -iq ${mtype}; search=$?
                 if [ $search -eq 0 ]; then
-                     skip=0
+                    skip=0
                 else
                     skip=1
                 fi
                 ;;
        esac
-        ;;
+       ;;
    esac
    if [ $skip -eq 0 ]; then
        ocf_log "$level" "$message"
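
# [Editor's note] The two hunks above only re-indent the message filter in
# super_ocf_log(); the logic is unchanged. A minimal standalone sketch of that
# matching idea, with hypothetical sample values (not part of the patch):
#
#   shf="ra-act-flow"                 # filter string, e.g. SAPHanaFilter
#   message="FLOW saphana_start()"    # message type is the first word
#   mtype="${message%% *}"            # -> "FLOW"
#   mtype="${mtype#fh}"               # strip an optional "fh" prefix
#   if echo "$shf" | grep -iq "$mtype"; then
#       echo "message passes the filter and would be logged"   # skip=0
#   fi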
@@ -103,8 +104,8 @@
     local rc=0
     methods=$(saphana_methods)
     methods=$(echo $methods | tr ' ' '|')
-    cat <<-!
-    usage: $0 ($methods)
+    cat <<-EOF
+        usage: $0 ($methods)

    $0 manages a SAP HANA Instance as an HA resource.

@@ -118,8 +119,17 @@
        The 'validate-all' operation reports whether the parameters are valid
        The 'methods' operation reports on the methods $0 supports

-    !
-    return $rc
+EOF
+    return $rc
+}
+
+function backup_global_and_nameserver() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    cp /hana/shared/LNX/global/hdb/custom/config/global.ini /hana/shared/LNX/global/hdb/custom/config/global.ini.$(date +"%s")
+    cp /hana/shared/LNX/global/hdb/custom/config/nameserver.ini /hana/shared/LNX/global/hdb/custom/config/nameserver.ini.$(date +"%s")
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
 }

 #
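
# [Editor's note] backup_global_and_nameserver() above snapshots the two ini
# files with an epoch-seconds suffix, but hardcodes the SID "LNX" in both
# paths. A parameterized variant would presumably derive the path from $SID;
# a hedged sketch (directory layout assumed, not taken from the patch):
#
#   cfg="/hana/shared/${SID}/global/hdb/custom/config"
#   cp "$cfg/global.ini"     "$cfg/global.ini.$(date +%s)"
#   cp "$cfg/nameserver.ini" "$cfg/nameserver.ini.$(date +%s)"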
@@ -130,11 +140,12 @@
 function saphana_meta_data() {
     super_ocf_log info "FLOW $FUNCNAME ($*)"
     local rc=0
-    cat <<END
+#
+    cat <<END
 <?xml version="1.0"?>
 <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
 <resource-agent name="SAPHana">
-<version>0.149.7</version>
+<version>0.151.1</version>

 <shortdesc lang="en">Manages two SAP HANA instances in system replication (SR).</shortdesc>
 <longdesc lang="en">
@@ -157,7 +168,7 @@
 2. landscapeHostConfiguration
    The interface is used to monitor a HANA system. The python script is named landscapeHostConfiguration.py.
    landscapeHostConfiguration.py has some detailed output about HANA system status
-   and node roles. For our monitor the overall status is relevant. This overall
+   and node roles. For our monitor the overall status is relevant. This overall
    status is reported by the returncode of the script:
    0: Internal Fatal, 1: ERROR, 2: WARNING, 3: INFO, 4: OK
    The SAPHana resource agent will interpret returncodes 0 as FATAL, 1 as not-running or ERROR and and returncodes 2+3+4 as RUNNING.
@@ -168,14 +179,14 @@
    system replication takeover (sr_takeover) or to register a former primary to a newer one (sr_register).

 4. hdbsql / systemReplicationStatus
-   Interface is SQL query into HANA (system replication table). The hdbsql query will be replaced by a python script
+   Interface is SQL query into HANA (system replication table). The hdbsql query will be replaced by a python script
    "systemReplicationStatus.py" in SAP HANA SPS8 or 9.
    As long as we need to use hdbsql you need to setup secure store users for linux user root to be able to
    access the SAP HANA database. You need to configure a secure store user key "SAPHANA${SID}SR" which can connect the SAP
-   HANA database:
+   HANA database:

 5. saphostctrl
-   The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the
+   The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the
    SAP HANA instance. This is the hostname used during the HANA installation.

 </longdesc>
@@ -207,7 +218,7 @@
 </parameter>
 <parameter name="DUPLICATE_PRIMARY_TIMEOUT" unique="0" required="0">
    <shortdesc lang="en">Time difference needed between to primary time stamps, if a dual-primary situation occurs</shortdesc>
-    <longdesc lang="en">Time difference needed between to primary time stamps,
+    <longdesc lang="en">Time difference needed between to primary time stamps,
    if a dual-primary situation occurs. If the time difference is
    less than the time gap, then the cluster hold one or both instances in a "WAITING" status. This is to give an admin
    a chance to react on a failover. A failed former primary will be registered after the time difference is passed. After
@@ -231,12 +242,8 @@
    <content type="string" default="" />
 </parameter>
 <parameter name="SAPHanaFilter" unique="0" required="0">
-    <shortdesc lang="en">Define SAPHana resource agent messages to be printed</shortdesc>
-    <longdesc lang="en">Define SAPHana resource agent messages to be printed.
-    This parameter should only be set if requested by support. The default is sufficient for normal operation.
-    Values: ra-act-lpa-dec-flow
-    You could specify any combination of the above values like "ra-act-flow"
-    </longdesc>
+    <shortdesc lang="en">OUTDATED PARAMETER</shortdesc>
+    <longdesc lang="en">OUTDATED PARAMETER</longdesc>
    <content type="string" default="" />
 </parameter>
 </parameters>
@@ -271,7 +278,7 @@
    for m in start stop status monitor promote demote notify validate-all methods meta-data usage; do
        echo "$m"
    done
-    return $rc
+    return $rc
 }

 #
@@ -298,7 +305,7 @@
    local remoteNode=""
    local rc=1
    for cl in ${otherNodes[@]}; do
-        vHost=$(get_hana_attribute $cl ${ATTR_NAME_HANA_VHOST[@]})
+        vHost=$(get_hana_attribute $cl ${ATTR_NAME_HANA_VHOST[@]} "$cl")
        if [ "$vHost" = "$remoteHost" ]; then # we found the correct node
            remoteNode=$cl
            rc=0
@@ -347,9 +354,10 @@
 }

 #
-# function: get_hana_attribute
+# function: get_hana_attribute
 # params:   NODE ATTR [STORE]
 # globals:  -
+# output:   attribute value
 #
 function get_hana_attribute()
 {
@@ -358,14 +366,20 @@
    local attr_node=$1
    local attr_name=$2
    local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter
-    local attr_default=${5:-}
+    local attr_default=${4:-}
+    local dstr
    local attr_val=""
-    attr_val=$(crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default"); rc=$?
-    if [ $debug_attributes -eq 1 ]; then
-        dstr=$(date)
-        echo "$dstr: SAPHana: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q --> $attr_val" >> /var/log/fhATTRIBUTE
-    fi
-    echo "$attr_val"
+    dstr=$(date)
+    case "$attr_store" in
+        reboot | forever )
+            echo "$dstr: SAPHana: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q" >> /var/log/fhATTRIBUTE
+            crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$?
+            ;;
+        props )
+            echo "$dstr: SAPHana: crm_attribute -G -n \"$attr_name\" -t crm_config -q" >> /var/log/fhATTRIBUTE
+            crm_attribute -G -n "$attr_name" -t crm_config -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$?
+            ;;
+    esac
    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
    return $rc
 }
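
# [Editor's note] Callers pass the ATTR_NAME_* arrays defined elsewhere in
# this patch, which hold (attribute-name store [default]); word-splitting
# ${ARR[@]} fills the ATTR/STORE/DEFAULT positions of get_hana_attribute().
# Illustration using definitions that appear later in this diff:
#
#   ATTR_NAME_HANA_VHOST=("hana_${sid}_vhost" "forever")
#   vHost=$(get_hana_attribute "$NODENAME" ${ATTR_NAME_HANA_VHOST[@]} "$NODENAME")
#   # expands to: get_hana_attribute <node> hana_<sid>_vhost forever <node>
#   # i.e. the node name doubles as the default when the attribute is unset.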
@@ -388,11 +402,17 @@
    attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store $attr_default); get_rc=$?
    if [ "$attr_old" != "$attr_value" ]; then
        super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc "
-        crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store; rc=$?
-        if [ $debug_attributes -eq 1 ]; then
-            dstr=$(date)
-            echo "$dstr: SAPHana: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE
-        fi
+        dstr=$(date)
+        case "$attr_store" in
+            reboot | forever )
+                echo "$dstr: SAPHana: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE
+                crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store 2>>/var/log/fhATTRIBUTE; rc=$?
+                ;;
+            props )
+                echo "$dstr: SAPHana: crm_attribute -v $attr_value -n \"$attr_name\" -t crm_config -s SAPHanaSR" >> /var/log/fhATTRIBUTE
+                crm_attribute -v $attr_value -n "$attr_name" -t crm_config -s SAPHanaSR 2>>/var/log/fhATTRIBUTE; rc=$?
+                ;;
+        esac
    else
        super_ocf_log debug "DBG: LET attribute $attr_name for node ${attr_node} still be ${attr_value}"
        rc=0
@@ -408,7 +428,8 @@
 #
 function assert() {
    super_ocf_log info "FLOW $FUNCNAME ($*)"
-    local err_msg=$1 local default_rc=$OCF_NOT_RUNNING
+    local err_msg=$1
+    local default_rc=$OCF_NOT_RUNNING
    # DONE: Check, if we need to destinguish between probe and others
    if ocf_is_probe; then
        default_exit=$OCF_NOT_RUNNING
@@ -435,7 +456,7 @@
    local score=0
    if [ -n "$1" ]; then
        score=$1
-    fi
+    fi
    # DONE: PRIO2: Only adjust master if value is really different (try to check that)
    oldscore=$(${HA_SBIN_DIR}/crm_master -G -q -l reboot)
    if [ "$oldscore" != "$score" ]; then
@@ -452,7 +473,7 @@
 #
 # function: scoring_crm_master - score instance due to role ans sync match (table SCORING_TABLE_PREFERRED_SITE_TAKEOVER)
 # params:   NODE_ROLES NODE_SYNC_STATUS
-# globals:  SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@],
+# globals:  SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@],
 #
 scoring_crm_master()
 {
@@ -467,7 +488,7 @@
            if grep "$rolePatt" <<< "$roles"; then
                if grep "$syncPatt" <<< "$sync"; then
                    skip=1
-                    myScore=$score
+                    myScore=$score
                fi
            fi
        fi
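
# [Editor's note] scoring_crm_master() above applies the first row of
# SCORING_TABLE_PREFERRED_SITE_TAKEOVER whose role and sync patterns both
# match. A condensed sketch of that lookup; the table rows here are
# illustrative only, not the shipped table:
#
#   SCORING_TABLE=("[234]:P:* SOK 150" "[015]:N:* SFAIL -1")
#   for row in "${SCORING_TABLE[@]}"; do
#       read rolePatt syncPatt score <<< "$row"
#       if grep -q "$rolePatt" <<< "$roles" && grep -q "$syncPatt" <<< "$sync"; then
#           myScore=$score
#           break
#       fi
#   done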
@@ -496,7 +517,7 @@
 # function: saphana_init - initialize variables for the resource agent
 # params:   InstanceName
 # globals:  OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), SAPVIRHOST(w), PreferSiteTakeover(w),
-# globals:  sr_name(w), remoteHost(w), otherNodes(w)
+# globals:  sr_name(w), remoteHost(w), otherNodes(w), rem_SR_name(w)
 # globals:  ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_CLONE_STATE(w)
 # globals:  DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w)
 # globals:  LPA_DIRECTORY(w), SIDInstanceName(w), remoteNode(w), hdbSrQueryTimeout(w)
@@ -506,6 +527,8 @@
    super_ocf_log info "FLOW $FUNCNAME ($*)"
    local rc=$OCF_SUCCESS
    local vName
+    local clN
+    # local site
    # two parameter models (for transition only)
    # OLD: InstanceName
    # NEW: SID InstanceNumber
@@ -528,11 +551,10 @@
       #
       # if saphostctrl does not know the answer, try to fallback to attribute provided by SAPHanaTopology
       #
-       vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]});
+       vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]} "$NODENAME");
    fi
    SAPVIRHOST=${vName}
    PreferSiteTakeover="$OCF_RESKEY_PREFER_SITE_TAKEOVER"
-    SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}"
    AUTOMATED_REGISTER="${OCF_RESKEY_AUTOMATED_REGISTER:-false}"
    LPA_DIRECTORY=/var/lib/SAPHanaRA
    LPA_ATTR=("lpa_${sid}_lpt" "forever")
@@ -591,6 +613,8 @@
        *openais* ) otherNodes=($(crm_node -l | awk '$3 == "member" { if ($2 != me) { print $2 }}' me=${NODENAME}));;
        *cman*    ) otherNodes=($(crm_node -l | awk '{for (i=1; i<=NF; i++) { if ($i != me) { print $i }}}' me=${NODENAME}));;
    esac
+    #
+    #

    remoteHost=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_REMOTEHOST[@]});
    if [ -z "$remoteHost" ]; then
@@ -611,9 +635,13 @@
    # ATTR_NAME_HANA_SITE
    sr_name=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SITE[@]});
    sr_mode=$(get_hana_attribute "${NODENAME}" ${ATTR_NAME_HANA_SRMODE[@]})
+
    if [ -z "$sr_mode" ]; then
        sr_mode="sync"
    fi
+    if [ -n "$remoteNode" ]; then
+        rem_SR_name=$(get_hana_attribute ${remoteNode} ${ATTR_NAME_HANA_SITE[@]});
+    fi
    super_ocf_log debug "DBG: sr_name=$sr_name, remoteHost=$remoteHost, remoteNode=$remoteNode, sr_mode=$sr_mode"
    # optional OCF parameters, we try to guess which directories are correct
    if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ]
@@ -706,7 +734,7 @@
    then
        runninginst=$(echo "$output" | grep '^0 : ' | cut -d' ' -f3)
        if [ "$runninginst" != "$InstanceName" ]
-        then
+        then
            super_ocf_log warn "ACT: sapstartsrv is running for instance $runninginst, that service will be killed"
            restart=1
        else
@@ -784,38 +812,113 @@
    node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null )
    node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}')
    super_ocf_log debug "DBG: check_for_primary: node_status=$node_status"
+    # TODO: PRIO2: Maybe we need to use a fallback interface when hdbnsitil does not answer properly -> lookup in config files?
+    # This might also solve some problems when we could not figure-out the ilocal or remote site name
    for i in 1 2 3 4 5 6 7 8 9; do
        case "$node_status" in
-           primary )
-                  super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_PRIMARY"
-                  return $HANA_STATE_PRIMARY;;
+            primary )
+                super_ocf_log info "FLOW: $FUNCNAME rc=HANA_STATE_PRIMARY"
+                return $HANA_STATE_PRIMARY;;
            syncmem | sync | async )
-                  super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_SECONDARY"
-                  return $HANA_STATE_SECONDARY;;
-           none ) # have seen that mode on second side BEFEORE we registered it as replica
-                  super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_STANDALONE"
-                  return $HANA_STATE_STANDALONE;;
+                super_ocf_log info "FLOW: $FUNCNAME rc=HANA_STATE_SECONDARY"
+                return $HANA_STATE_SECONDARY;;
+            none ) # have seen that mode on second side BEFEORE we registered it as replica
+                super_ocf_log info "FLOW: $FUNCNAME rc=HANA_STATE_STANDALONE"
+                return $HANA_STATE_STANDALONE;;
            * )
-                  super_ocf_log err "ACT: check_for_primary:  we didn't expect node_status to be: <$node_status>"
-                  dump=$( echo $node_status | hexdump -C );
-                  super_ocf_log err "ACT: check_for_primary:  we didn't expect node_status to be: DUMP <$dump>"
-                  node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null )
-                  node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}')
-                  super_ocf_log debug "DEC: check_for_primary: loop=$i: node_status=$node_status"
-                  # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes
+                super_ocf_log err "ACT: check_for_primary:  we didn't expect node_status to be: <$node_status>"
+                dump=$( echo $node_status | hexdump -C );
+                super_ocf_log err "ACT: check_for_primary:  we didn't expect node_status to be: DUMP <$dump>"
+                node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null )
+                node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}')
+                super_ocf_log debug "DEC: check_for_primary: loop=$i: node_status=$node_status"
+                # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes
        esac;
    done
    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
    return $rc
 }

+# function: analyze_hana_sync_statusSRS
+# params:   -
+# globals:  DIR_EXECUTABLE(r), FULL_SR_STATUS(w), remoteNode
+#
+# systemReplicationStatus.py return-codes:
+#    NoHSR        = 10
+#    Error        = 11
+#    Unkown       = 12
+#    Initializing = 13
+#    Syncing      = 14
+#    Active       = 15
+function analyze_hana_sync_statusSRS()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=-1 srRc=0 all_nodes_other_side="" n="" siteParam=""
+    if [ -n "$rem_SR_name" ]; then
+        siteParam="--site=$rem_SR_name"
+    fi
+    FULL_SR_STATUS=$(su - $sidadm -c "python $DIR_EXECUTABLE/python_support/systemReplicationStatus.py $siteParam" 2>/dev/null); srRc=$?
+    super_ocf_log info "DEC $FUNCNAME systemReplicationStatus.py (to site '$rem_SR_name')-> $srRc"
+    super_ocf_log info "FLOW $FUNCNAME systemReplicationStatus.py (to site '$rem_SR_name')-> $srRc"
+    #
+    # TODO: PRIO2: Here we might also need to filter additional sites (if multi tier should be supported)
+    #              And is the check for return code capable for chains?
+    #
+    if [ $srRc -eq 15 ]; then
+        # Fix for a HANA BUG, where a non-working SR resulted in RC 15:
+        if grep -q "ACTIVE" <<< "$FULL_SR_STATUS"; then
+            super_ocf_log info "FLOW $FUNCNAME SOK"
+            set_hana_attribute "$remoteNode" "SOK" ${ATTR_NAME_HANA_SYNC_STATUS[@]}
+            super_ocf_log info "ACT site=$sr_name, seting SOK for secondary (1)"
+            lpa_set_lpt 30 "$remoteNode"
+            rc=0;
+        else
+            # ok we should be careful and set secondary to SFAIL
+            super_ocf_log info "FLOW $FUNCNAME SFAIL"
+            set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]}
+            super_ocf_log info "ACT site=$sr_name, seting SFAIL for secondary (6) - srRc=$srRc lss=$lss No ACTIVES found in cmd output"
+            # TODO: PRIO1 - P004: need to check LSS again to avoid dying primary to block (SFAIL) secondary
+            lpa_set_lpt 10 "$remoteNode"
+        fi
+    elif [ $srRc -le 11 ]; then # 11 and 10
+        # if systemReplicationStatus is ERROR and landscapeHostConfiguration is down than do NOT set SFAIL
+        get_hana_landscape_status; lss=$?
+        if [ $lss -lt 2 ]; then
+            # keep everithing like it was
+            rc=2
+        else
+            # ok we should be careful and set secondary to SFAIL
+            super_ocf_log info "FLOW $FUNCNAME SFAIL"
+            set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]}
+            super_ocf_log info "ACT site=$sr_name, seting SFAIL for secondary (5) - srRc=$srRc lss=$lss"
+            # TODO: PRIO1 - P004: need to check LSS again to avoid dying primary to block (SFAIL) secondary
+            lpa_set_lpt 10 "$remoteNode"
+            rc=1
+        fi
+    else
+        super_ocf_log info "FLOW $FUNCNAME SFAIL"
+        set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]}
+        super_ocf_log info "ACT site=$sr_name, seting SFAIL for secondary (2) - srRc=$srRc"
+        # TODO: PRIO1 - P004: need to check LSS again to avoid dying primary to block (SFAIL) secondary
+        lpa_set_lpt 10 "$remoteNode"
+        rc=1;
+    fi
+    super_ocf_log info "FLOW $FUNCNAME PRIM+LPA"
+    super_ocf_log info "DBG PRIM"
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
 #
-# function: analyze_hana_sync_status - query and check hana system replication status
+####
+#### OLD HDBSQL STUFF FOR SPS6,7,8 AND SCALE-UP ONLY
+####
+# function: analyze_hana_sync_statusSQL - query and check hana system replication status
 # params:   -
 # globals:  DIR_EXECUTABLE(r), remoteHost(r)
 # get the HANA sync status
-#
-function analyze_hana_sync_status()
+#
+function analyze_hana_sync_statusSQL()
 {
    super_ocf_log info "FLOW $FUNCNAME ($*)"
    local -a clusterNodes=()
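
# [Editor's note] Condensing the decision tree of analyze_hana_sync_statusSRS()
# above (no new behavior; variables as in the function): the python return
# code plus the landscape status drive the sync-state attribute roughly like
#
#   case "$srRc" in
#       15 )      # Active - but a HANA bug can report 15 for a dead SR,
#                 # so the output must still contain an ACTIVE line
#                 grep -q ACTIVE <<< "$FULL_SR_STATUS" && sync=SOK || sync=SFAIL ;;
#       10 | 11 ) # NoHSR/Error - trust it only if the landscape is up
#                 [ "$lss" -lt 2 ] && sync=unchanged || sync=SFAIL ;;
#       * )       # 12/13/14: Unknown/Initializing/Syncing
#                 sync=SFAIL ;;
#   esac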
@@ -863,35 +966,9 @@
            # TODO PRIO1: REMOVE remoteNode dependency - set SFAIL
            set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]}
        fi
-        # first get a list of all secondary hosts, than a list of all secondary hosts, if the is ANY failure at this site
-        # TODO: PRIO9: for first we assume there is only ONE secondary site (like ROT)
-        # TODO: PRIO3: should we loop over all cluster nodes fetching their roles-attribute? To minimize sql-queries?
-        #
-        all_secondary_hosts=$(timeout $hdbSrQueryTimeout hdbsql -a -x -U $secUser $query_secondaries ); sqlrc=$?
-        all_secondary_hosts=$(echo $all_secondary_hosts | dequote);
-        if [ "$sqlrc" -eq 0 ]; then
-            all_broken_secondary_hosts=$(timeout $hdbSrQueryTimeout hdbsql -a -x -U $secUser $query_failed_secondaries); sqlrc=$?
-            all_broken_secondary_hosts=$(echo $all_broken_secondary_hosts | dequote);
-            if [ "$sqlrc" -eq 0 ]; then
-                if [ -n "$all_broken_secondary_hosts" ]; then
-                    #
-                    # we have a broken secondary site - set all hosts to "SFAIL"
-                    #
-                    # Note: since HANA hostname can be different from nodename we need to check all vhost attributes
-                    for n in $all_broken_secondary_hosts; do
-                        for cl in ${otherNodes[@]}; do
-                            vHost=$(get_hana_attribute $cl ${ATTR_NAME_HANA_VHOST[@]})
-                            if [ "$vHost" = "$n" ]; then # we found the correct node
-                                set_hana_attribute $cl "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]}
-                            fi
-                        done
-                    done
-                fi
-            fi
-        fi
    else
        case "$sqlrc" in
-            19 )
+            19 )
                # return codes 19: license error -> set SFAIL!
                # DONE: PRIO1: We should NOT set SFAIL, if HDB is exactly broken now
                # When HDB breaks during monitor this could prevent a prositive remote failover
@@ -901,7 +978,7 @@
                done
                ;;
        esac
-    fi
+    fi
    return $rc
 }

@@ -932,10 +1009,18 @@
    local remoteInstance="";
    remoteInstance=$InstanceNr
    if ocf_is_true ${AUTOMATED_REGISTER}; then
+       #
+       #
+       #
+       #
+       #
       super_ocf_log info "ACT: REGISTER: hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name"
+       #
+       #
       su - $sidadm -c "hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name"; rc=$?
+       # backup_global_and_nameserver
    else
-       super_ocf_log info "ACT: IGNORE REGISTER because AUTOMATED_REGISTER is set to FALSE"
+       super_ocf_log info "ACT: SAPHANA DROP REGISTER because AUTOMATED_REGISTER is set to FALSE"
       rc=1
    fi
    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
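
# [Editor's note] The registration above is a single hdbnsutil call run as the
# <sid>adm user. Spelled out with hypothetical example values (host, instance
# number, mode and site name are placeholders, not from the patch):
#
#   su - lnxadm -c "hdbnsutil -sr_register --remoteHost=hana01 \
#       --remoteInstance=10 --mode=sync --name=SiteB"
#   # on success the local instance is configured as secondary of hana01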
@@ -945,7 +1030,7 @@
 #
 # function: saphana_status - pure status check
 # params:   -
-# globals:  SIDInstanceName, OCF_*,
+# globals:  SIDInstanceName, OCF_*,
 function saphana_status() {
    local binDeam="hdb.sap${SIDInstanceName}" rc=0
    binDeam=${binDeam:0:15} # Process name is limited to the first 15 characters
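
# [Editor's note] The ${binDeam:0:15} truncation above exists because Linux
# limits a process name (comm) to 15 characters. Worked example with a
# hypothetical SID/instance:
#
#   SIDInstanceName="LNX_HDB00"
#   binDeam="hdb.sap${SIDInstanceName}"   # hdb.sapLNX_HDB00 (16 chars)
#   binDeam=${binDeam:0:15}               # hdb.sapLNX_HDB0 - what ps/pgrep see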
@@ -956,13 +1041,13 @@
 #
 # function: saphana_start - start a hana instance
 # params:   -
-# globals:  OCF_*, SAPCONTROL, InstanceNr, SID, InstanceName,
+# globals:  OCF_*, SAPCONTROL, InstanceNr, SID, InstanceName,
 #
 function saphana_start() {
    super_ocf_log info "FLOW $FUNCNAME ($*)"
    local rc=$OCF_NOT_RUNNING
    local output=""
-    local loopcount=0
+    local loopcount=0
    check_sapstartsrv
    rc=$?
    #
@@ -1000,11 +1085,11 @@
 # saphana_stop: Stop the SAP instance
 #
 function saphana_stop() {
-    super_ocf_log info "FLOW $FUNCNAME ($*)"
-    local output=""
-    local rc=0
-    check_sapstartsrv; rc=$?
-    if [ $rc -eq $OCF_SUCCESS ]; then
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local output=""
+    local rc=0
+    check_sapstartsrv; rc=$?
+    if [ $rc -eq $OCF_SUCCESS ]; then
        output=$($SAPCONTROL -nr $InstanceNr -function Stop)
        rc=$?
        super_ocf_log info "ACT: Stopping SAP Instance $SID-$InstanceName: $output"
@@ -1032,7 +1117,7 @@
 # function: saphana_validate - validation of (some) variables/parameters
 # params:   -
 # globals:  OCF_*(r), SID(r), InstanceName(r), InstanceNr(r), SAPVIRHOST(r)
-# saphana_validate: Check the symantic of the input parameters
+# saphana_validate: Check the symantic of the input parameters
 #
 function saphana_validate() {
    super_ocf_log info "FLOW $FUNCNAME ($*)"
@@ -1060,12 +1145,12 @@
 #
 # function: saphana_start_primary - handle startup of PRIMARY in M/S
 # params:
-# globals:  OCF_*(r), NODENAME, ATTR_NAME_*, HANA_STATE_*,
+# globals:  OCF_*(r), NODENAME, ATTR_NAME_*, HANA_STATE_*,
 #
 function saphana_start_primary()
 {
    super_ocf_log info "FLOW $FUNCNAME ($*)"
-    local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING
+    local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING
    local lss sqlrc;
    local rc=0
    local lpa_dec=4
@@ -1074,7 +1159,7 @@
    # we will be a master (PRIMARY) so checking, if the is an OTHER master
    #
    super_ocf_log debug "DBG: saphana_primary - check_for_primary reports HANA_STATE_PRIMARY"
-    #
+    #
    lpa_init_lpt $HANA_STATE_PRIMARY
    lpa_check_lpt_status; lpa_dec=$?
    get_hana_landscape_status; lss=$?
@@ -1139,7 +1224,7 @@
                1 ) # landcape says we are down, lets start and adjust scores and return code
                    super_ocf_log info "LPA: landcape: DOWN, LPA: start ==> start instance"
                    saphana_start
-                    rc=$?
+                    rc=$?
                    LPTloc=$(date '+%s')
                    lpa_set_lpt $LPTloc
                    ;;
@@ -1152,7 +1237,7 @@
                    # DONE: PRIO3: check if this reaction is correct - tell cluster about failed start
                    super_ocf_log info "LPA: landcape: UP, LPA: register ==> take down"
                    set_crm_master -inf
-                    rc=$OCF_NOT_RUNNING
+                    rc=$OCF_NOT_RUNNING
                    ;;
                1 ) # lets try to register
                    # DONE: PRIO2: Like Action in start_secondary
@@ -1160,7 +1245,7 @@
                    super_ocf_log info "DEC: AN OTHER HANA IS AVAILABLE ==> LETS REGISTER"
                    set_crm_master 0
                    if wait_for_primary_master 1; then
-                        register_hana_secondary
+                        register_hana_secondary
                        check_for_primary; primary_status=$?
                        if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then
                            super_ocf_log info "ACT: Register successful"
@@ -1169,11 +1254,11 @@
                            set_crm_master 0
                            saphana_start_secondary
                            rc=$?
-                            lpa_set_lpt 30
+                            lpa_set_lpt 10
                        else
                            super_ocf_log err "ACT: Register failed"
                            rc=$OCF_NOT_RUNNING
-                        fi
+                        fi
                    else
                        # lets check next monitor, if we can register
                        rc=$OCF_SUCCESS
@@ -1185,6 +1270,9 @@
            case "$lss" in
                2 | 3 | 4 ) # as we ARE up we just keep it up
 # TODO: PRIO3: I now change from "just keep it up to take that down"
+# TODO: PRIO1 differ lpt_advice!!
+# 2 => DOWN
+# 3 => KEEP
 # TODO: PRIO3: OCF_SUCCESS, OCF_NOT_RUNNING or OCF_ERR_xxxx ?
                    set_crm_master -9000
                    #scoring_crm_master "$my_role" "$my_sync"
@@ -1193,7 +1281,7 @@
                1 ) # we are down, so we should wait --> followup in next monitor
                    super_ocf_log info "LPA: landcape: DOWN, LPA: wait ==> keep waiting"
                    # TODO: PRIO3: Check, if WAITING is correct here
-                    set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]}
+                    set_hana_attribute ${NODENAME} "WAITING4LPA" ${ATTR_NAME_HANA_CLONE_STATE[@]}
                    set_crm_master -9000
                    rc=$OCF_SUCCESS
                    ;;
@@ -1202,7 +1290,7 @@
        fail ) # process a lpa FAIL
            super_ocf_log info "LPA: LPA reports FAIL"
            set_crm_master -inf
-            rc=$OCF_NOT_RUNNING
+            rc=$OCF_NOT_RUNNING
            ;;
    esac
    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
@@ -1278,12 +1366,12 @@
 #
 # function: saphana_start_secondary - handle startup of PRIMARY in M/S
 # params:
-# globals:  OCF_*(r), NODENAME, ATTR_NAME_*,
+# globals:  OCF_*(r), NODENAME, ATTR_NAME_*,
 #
 function saphana_start_secondary()
 {
    super_ocf_log info "FLOW $FUNCNAME ($*)"
-    local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING
+    local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING
    local sqlrc;
    set_crm_master 0
    #
@@ -1291,9 +1379,9 @@
    #
    lpa_push_lpt 10
    lpa_set_lpt 10
-    #
+    #
    ####### LPA - end
-    #
+    #
    #
    # we would be slave (secondary)
    # we first need to check, if there are Master Nodes, because the Scecondary only starts
@@ -1311,16 +1399,16 @@
            # It seams the stating secondary could not start because of stopping primary
            #  so this is a WAITING situation
            super_ocf_log info "ACT: PRIMARY seams to be down now ==> WAITING"
-            set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]}
+            set_hana_attribute ${NODENAME} "WAITING4PRIM" ${ATTR_NAME_HANA_CLONE_STATE[@]}
            set_crm_master -INFINITY
            rc=$OCF_SUCCESS
        fi
    else
-        lpa_set_lpt 30
+        lpa_set_lpt 10
    fi
    else
        super_ocf_log info "ACT: wait_for_primary_master ==> WAITING"
-        set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]}
+        set_hana_attribute ${NODENAME} "WAITING4PRIM" ${ATTR_NAME_HANA_CLONE_STATE[@]}
        set_crm_master -INFINITY
        rc=$OCF_SUCCESS
    fi
@@ -1329,11 +1417,71 @@
 }

 #
+# function: saphana_check_local_instance
+# params:
+# output:
+# rc:       rc=0 (UP) rc=1 (DOWN)
+# globals:
+#
+function saphana_check_local_instance()
+{
+    local rc=1
+    local count=0
+    local SERVNO
+    local output
+    local MONITOR_SERVICES="hdbnameserver|hdbdaemon" # TODO: PRIO1: exact list of Services
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    check_sapstartsrv
+    rc=$?
+    if [ $rc -eq $OCF_SUCCESS ]
+    then
+        output=$($SAPCONTROL -nr $InstanceNr -function GetProcessList -format script)
+        # we have to parse the output, because the returncode doesn't tell anything about the instance status
+        for SERVNO in `echo "$output" | grep '^[0-9] ' | cut -d' ' -f1 | sort -u`
+        do
+            local COLOR=`echo "$output" | grep "^$SERVNO dispstatus: " | cut -d' ' -f3`
+            local SERVICE=`echo "$output" | grep "^$SERVNO name: " | cut -d' ' -f3`
+            local STATE=0
+            local SEARCH
+
+            case $COLOR in
+                GREEN|YELLOW)       STATE=$OCF_SUCCESS;;
+                *)                  STATE=$OCF_NOT_RUNNING;;
+            esac
+
+            SEARCH=`echo "$MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'`
+            if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ]
+            then
+                if [ $STATE -eq $OCF_NOT_RUNNING ]
+                then
+                    [ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !"
+                    rc=$STATE
+                fi
+                count=1
+            fi
+        done
+
+        if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ]
+        then
+            if ocf_is_probe
+            then
+                rc=1
+            else
+                [ "$MONLOG" != "NOLOG" ] && ocf_log err "The SAP instance does not run any services which this RA could monitor!"
+                rc=1
+            fi
+        fi
+    fi
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
 # function: lpa_get_lpt - get lpt from cluster
 # params:   NODE
 # output:   LPT
 # rc:       rc=0: OK, rc=1: InternalERROR, rc=2: ERROR
-# globals:  LPA_ATTR_*,
+# globals:  LPA_ATTR_*,
 #
 function lpa_get_lpt() {
    super_ocf_log info "FLOW $FUNCNAME ($*)"
@@ -1348,7 +1496,7 @@
        rc=2
    fi
    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
-    return $rc
+    return $rc
 }

 #
@@ -1372,7 +1520,7 @@
        rc=0
    fi
    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
-    return $rc
+    return $rc
 }

 #
@@ -1398,7 +1546,7 @@
        rc=2
    fi
    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
-    return $rc
+    return $rc
 }

 #
@@ -1422,15 +1570,15 @@
        rc=2
    else
        rc=0
-    fi
+    fi
    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
-    return $rc
+    return $rc
 }

 #
 # function: lpa_init_lpt - initialize local lpt, if needed
 # params:   HANA_STATE
-# globals:  HANA_STATE_*(r), LPA_DIRECTORY(r), sid(r), NODENAME(r),
+# globals:  HANA_STATE_*(r), LPA_DIRECTORY(r), sid(r), NODENAME(r),
 # lpa_init_lpt
 #
 # Returncodes:
@@ -1439,7 +1587,7 @@
 # Initializing (if NO local LPT-file):
 #    SECONDARY sets to 0
 #    PRIMARY   sets to 1
-#
+#
 function lpa_init_lpt() {
    super_ocf_log info "FLOW $FUNCNAME ($*)"
    local rc=1
@@ -1458,11 +1606,11 @@
        LPTloc=10
        lpa_push_lpt "10"; rc=$?
    else
-        rc=2
+        rc=2
    fi
    lpa_set_lpt $LPTloc
    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
-    return $rc
+    return $rc
 }

 #
@@ -1472,6 +1620,10 @@
 # lpa_check_lpt_status
 #
 # Returncodes:
+#    0: start
+#    1: register than start
+#    2: wait4gab
+#    3: wait4other
 #
 # Initializing (if NO local LPT-file):
 #    SECONDARY sets to 10
@@ -1480,20 +1632,20 @@
 #    LPRlocal OR LPTremore ARE real lpt (>1000)
 #        THEN:
 #            Bigger LPR wins, if delta-gab is OK
-#            LPTlocal >> LPTremore ===> rc=0 (start)
+#            LPTlocal >> LPTremore ===> rc=0 (start)
 #            LPTRemote >> LPTlocal ===> rc=1 (register)
-#            Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait)
+#            Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait4gab)
 #    LPRlocal AND LPTremore ARE NOT real lpt (<=1000)
 #        THEN:
 #            Bigger LPT wins
-#            LPTlocal > LPTremore ===> rc=0 (start)
+#            LPTlocal > LPTremore ===> rc=0 (start)
 #            LPTRemote > LPTlocal ===> rc=1 (register)
-#            Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait)
+#            Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait4gab)
 #    LPTRemote is not initialized or node not kown in cluster (crm_mon -l) (0)
 #        TODO: PRIO1: Need to introduce a return-code 3 for remote sides lpa not ready
 #        THEN:
 #            WAIT ==> like STALEMATE-HANDLING ===> rc=2 (wait)
-#
+#
 function lpa_check_lpt_status() {
    super_ocf_log info "FLOW $FUNCNAME ($*)"
    local rc=0
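
# [Editor's note] Condensing the comment block above: lpa_check_lpt_status()
# compares the local and remote last-primary timestamps; only real timestamps
# (>1000) must additionally differ by DUPLICATE_PRIMARY_TIMEOUT. A sketch of
# the decision (variable names follow the comments, details simplified):
#
#   if [ "$LPTloc" -gt 1000 -o "$LPTrem" -gt 1000 ]; then
#       delta=$DUPLICATE_PRIMARY_TIMEOUT    # real timestamps need a gap
#   else
#       delta=0                             # markers 10/20/30 compare directly
#   fi
#   if   [ $(( LPTloc - LPTrem )) -gt $delta ]; then rc=0    # start
#   elif [ $(( LPTrem - LPTloc )) -gt $delta ]; then rc=1    # register
#   else rc=2; fi                                            # wait4gab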
@@ -1501,6 +1653,8 @@
    local LPTrem=-1
    local LPTMark=1000
    local delta=0
+    local remSn_name=""
+    local remHost=""
    #
    # First GET LPT from ATTR-FILE-DEFAULT
    #
@@ -1550,7 +1704,20 @@
        fi
    fi
    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
-    return $rc
+    return $rc
+}
+
+# function: is_the_master_nameserver
+# params:   -
+# rc:       0: yes, local node is THE master nameserver
+#           1: else
+# globals:
+function is_the_master_nameserver()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
 }

 #
@@ -1574,11 +1741,12 @@
        check_for_primary; primary_status=$?
        if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then
            saphana_start_primary; rc=$?
-        else
+        else
+            lpa_set_lpt 10
            saphana_start_secondary; rc=$?
-            lpa_set_lpt 30
-        fi
+        fi
    fi
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
    return $rc
 }

@@ -1596,7 +1764,7 @@
    check_for_primary; primary_status=$?
    if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then
        lpa_set_lpt 10
-    fi
+    fi
    saphana_stop; rc=$?
    return $rc
 }
@@ -1637,7 +1805,7 @@
        DEMOTED )
            promoted=0;
            ;;
-        WAITING )
+        WAITING* )
            # DONE: lpa_check_lpt_status to come out of here :)
            # DONE: PRIO2: CHECK IF THE FIX FOR COMING OUT OF WAITING IS CORRECT
            get_hana_landscape_status; lss=$?
@@ -1648,7 +1816,8 @@
                lpa_set_lpt $LPTloc
            fi
            lpa_check_lpt_status; lparc=$?
-            if [ $lparc -ne 2 ]; then
+            # TODO: PRIO1: Need to differ lpa_check_lpt_status return codes
+            if [ $lparc -lt 2 ]; then
                # lpa - no need to wait any longer - lets try a new start
                saphana_start_clone
                rc=$?
@@ -1663,7 +1832,7 @@
                    super_ocf_log info "LPA: Dual primary detected and AUTOMATED_REGISTER='false' ==> WAITING"
                fi
                return $OCF_SUCCESS
-            fi
+            fi
            promoted=0;
            ;;
        UNDEFINED )
@@ -1682,13 +1851,13 @@
    get_hana_landscape_status; lss=$?
    super_ocf_log debug "DBG: saphana_monitor_clone: get_hana_landscape_status=$lss"
    case "$lss" in
-        0 ) # FATAL or ERROR
+        0 ) # FATAL or ERROR
            rc=$OCF_ERR_GENERIC
            ;;
-        1 ) # DOWN or ERROR
+        1 ) # DOWN or ERROR
            # DONE: PRIO2: Maybe we need to differ between 0 and 1. While 0 is a fatal sap error, 1 is down/error
            if ocf_is_probe; then
-                #
+                #
                # leave master score untouched, only set return code
                #
                rc=$OCF_NOT_RUNNING
@@ -1699,7 +1868,7 @@
                    # For Migration it would be good to decrease master score
                    # For Reload locally we should NOT adjust the master score
                    # ===> Should we rely on the migration threshold?
-                    # set_crm_master
+                    # set_crm_master
                    if ocf_is_true "${PreferSiteTakeover}" ; then
                        #
                        # DONE: PRIO1: first check, if remote site is already (and still) in sync
@@ -1708,7 +1877,7 @@
                        # TODO PRIO1: REMOVE remoteNode dependency - get_sync_status
                        remoteSync=$(get_hana_attribute $remoteNode ${ATTR_NAME_HANA_SYNC_STATUS[@]})
                        case "$remoteSync" in
-                            SOK )
+                            SOK | PRIM )
                                super_ocf_log info "DEC: PreferSiteTakeover selected so decrease promotion score here (and reset lpa)"
                                set_crm_master 5
                                if check_for_primary_master; then
@@ -1718,11 +1887,11 @@
                            SFAIL )
                                super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync (SFAIL) ==> local restart preferred"
                                ;;
-                            * )
+                            * )
                                super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync ($remoteSync) ==> local restart preferred"
                                ;;
-                        esac
-                    else
+                        esac
+                    else
                        # TODO: PRIO5: SCALE-OUT ONLY? Implement for local restart
                        # It maybe that for the local restart we only need to decrease the secondaries promotion score
                        #super_ocf_log info "DEC: PreferSiteTakeover selected so decrease promotion score here"
@@ -1765,8 +1934,12 @@
        case "$my_role" in
            [12]:P:*:master:* ) # primary is down or may not anser hdbsql query so drop analyze_hana_sync_status
                ;;
-            [34]:P:*:master:* ) # primary is up and should now be able to anser hdbsql query
-                analyze_hana_sync_status
+            [34]:P:*:*:* ) # primary is up and should now be able to anser hdbsql query
+                if [ -f $DIR_EXECUTABLE/python_support/systemReplicationStatus.py ]; then
+                    analyze_hana_sync_statusSRS
+                else
+                    analyze_hana_sync_statusSQL
+                fi
                ;;
        esac
        rem_role=$(get_hana_attribute ${remoteNode} ${ATTR_NAME_HANA_ROLES[@]})
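
# [Editor's note] The monitor hunk above feature-detects the newer python
# interface and only falls back to the hdbsql path on older SPS levels, which
# avoids the secure-store user key setup described in the meta-data. The
# pattern, isolated:
#
#   if [ -f "$DIR_EXECUTABLE/python_support/systemReplicationStatus.py" ]; then
#       analyze_hana_sync_statusSRS    # SPS8+: python-based SR status
#   else
#       analyze_hana_sync_statusSQL    # older SPS: hdbsql query
#   fi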
@@ -1776,9 +1949,9 @@ |
|
[234]:P:* ) # dual primary, but other instance marked as PROMOTED by the cluster |
|
lpa_check_lpt_status; again_lpa_rc=$? |
|
if [ $again_lpa_rc -eq 2 ]; then |
|
- super_ocf_log info "DEC: Dual primary detected, other instance is PROMOTED and lpa stalemate ==> local restart" |
|
- lpa_set_lpt 10 |
|
- lpa_push_lpt 10 |
|
+ super_ocf_log info "DEC: Dual primary detected, other instance is PROMOTED and lpa stalemate ==> local restart" |
|
+ lpa_set_lpt 10 |
|
+ lpa_push_lpt 10 |
|
rc=$OCF_NOT_RUNNING |
|
fi |
|
;; |
|
@@ -1812,13 +1985,13 @@ |
|
function saphana_monitor_secondary() |
|
{ |
|
super_ocf_log info "FLOW $FUNCNAME ($*)" |
|
- local rc=$OCF_ERR_GENERIC |
|
- local promoted=0 |
|
+ local rc=$OCF_ERR_GENERIC |
|
+ local promoted=0 |
|
local init_attribute=0 |
|
local lss |
|
# |
|
# OK, we are running as HANA SECONDARY |
|
- # |
|
+ # |
|
if ! lpa_get_lpt ${NODENAME}; then |
|
lpa_set_lpt 10 |
|
lpa_push_lpt 10 |
|
@@ -1863,7 +2036,7 @@ |
|
super_ocf_log debug "DBG: saphana_monitor_clone: HANA_STATE_SECONDARY" |
|
# |
|
# old method was: saphana_monitor - new method is get_hana_landscape_status |
|
- get_hana_landscape_status; lss=$? |
|
+ get_hana_landscape_status; lss=$? |
|
super_ocf_log debug "DBG: saphana_monitor_clone: get_hana_landscape_status=$lss" |
|
case "$lss" in |
|
0 ) # FATAL |
|
@@ -1919,11 +2092,11 @@ |
|
# a) returning 7 here and force cluster a restart of the slave |
|
# b) starting the instance here inside the monitor -> may result in longer runtime, timeouts |
|
# |
|
- # first check with the status function (OS tools) if there could be something like a SAP instance running |
|
- # as we do not know here, if we are in master or slave state we do not want to start our monitoring |
|
- # agents (sapstartsrv) on the wrong host |
|
- local rc=$OCF_ERR_GENERIC |
|
- local promoted=0 |
|
+ # first check with the status function (OS tools) if there could be something like a SAP instance running |
|
+ # as we do not know here, if we are in master or slave state we do not want to start our monitoring |
|
+ # agents (sapstartsrv) on the wrong host |
|
+ local rc=$OCF_ERR_GENERIC |
|
+ local promoted=0 |
|
local init_attribute=0 |
|
local lpaRc=0 |
|
local mRc=0 |
|
@@ -1973,7 +2146,7 @@ |
|
# function: saphana_promote_clone - promote a hana clone |
|
# params: - |
|
# globals: OCF_*(r), NODENAME(r), HANA_STATE_*, SID(r), InstanceName(r), |
|
-# saphana_promote_clone: |
|
+# saphana_promote_clone: |
|
# In a Master/Slave configuration get Master being the primary OR by running hana takeover |
|
# |
|
function saphana_promote_clone() { |
|
@@ -2017,7 +2190,7 @@ |
|
rc=$OCF_SUCCESS; |
|
else |
|
rc=$OCF_FAILED_MASTER |
|
- fi |
|
+ fi |
|
;; |
|
* ) |
|
super_ocf_log err "ACT: HANA SYNC STATUS IS NOT 'SOK' SO THIS HANA SITE COULD NOT BE PROMOTED" |
|
@@ -2039,10 +2212,10 @@ |
|
# |
|
# function: saphana_demote_clone - demote a hana clone instance |
|
# params: - |
|
-# globals: OCF_*(r), NODENAME(r), |
|
+# globals: OCF_*(r), NODENAME(r), |
|
# saphana_demote_clone |
|
-# the HANA System Replication (SR) runs in a Master/Slave |
|
-# While we could not change a HANA instance to be really demoted, we only mark the status for |
|
+# the HANA System Replication (SR) runs in a Master/Slave |
|
+# While we could not change a HANA instance to be really demoted, we only mark the status for |
|
# correct monitor return codes |
|
# |
|
function saphana_demote_clone() { |
|
@@ -2056,9 +2229,9 @@ |
|
} |
|
|
|
# |
|
-# function: main - main function to operate |
|
+# function: main - main function to operate |
|
# params: ACTION |
|
-# globals: OCF_*(r), SID(w), sidadm(w), InstanceName(w), SAPVIRHOST(w), DIR_EXECUTABLE(w), |
|
+# globals: OCF_*(r), SID(w), sidadm(w), InstanceName(w), SAPVIRHOST(w), DIR_EXECUTABLE(w), |
|
# globals: SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), ACTION(w), CLACT(w), ra_rc(rw), $0(r), %ENV(r) |
|
# |
|
|
|
@@ -2073,7 +2246,7 @@ |
|
SAPCONTROL="" |
|
DIR_PROFILE="" |
|
SAPSTARTPROFILE="" |
|
-SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" |
|
+SAPHanaFilter="ra-act-dec-lpa" |
|
|
|
NODENAME=$(crm_node -n) |
|
|
|
@@ -2100,7 +2273,7 @@ |
|
exit $OCF_SUCCESS;; |
|
*);; |
|
esac |
|
-saphana_init |
|
+saphana_init |
|
|
|
if ! ocf_is_root |
|
then |
|
@@ -2141,7 +2314,7 @@ |
|
saphana_$ACTION$CLACT |
|
ra_rc=$? |
|
;; |
|
- validate-all) |
|
+ validate-all) |
|
saphana_validate |
|
ra_rc=$? |
|
;; |
|
@@ -2149,12 +2322,13 @@ |
|
lpa_check_lpt_status |
|
ra_rc=$? |
|
;; |
|
- *) # seams to be a unknown request |
|
- saphana_methods |
|
+ *) # seams to be a unknown request |
|
+ saphana_methods |
|
ra_rc=$OCF_ERR_UNIMPLEMENTED |
|
;; |
|
esac |
|
timeE=$(date '+%s') |
|
(( timeR = timeE - timeB )) |
|
+#super_ocf_log info "RA ==== SAPHanaFilter=$SAPHanaFilter" |
|
super_ocf_log info "RA ==== end action $ACTION$CLACT with rc=${ra_rc} ($THE_VERSION) (${timeR}s)====" |
|
exit ${ra_rc} |
|
diff -uNr a/heartbeat/SAPHanaTopology b/heartbeat/SAPHanaTopology |
|
--- a/heartbeat/SAPHanaTopology 2016-04-26 12:01:55.620889964 +0200 |
|
+++ b/heartbeat/SAPHanaTopology 2016-04-26 12:03:18.033887556 +0200 |
|
@@ -16,7 +16,7 @@ |
|
# Copyright: (c) 2014 SUSE Linux Products GmbH |
|
# (c) 2015 SUSE Linux GmbH |
|
# |
|
-# An example usage: |
|
+# An example usage: |
|
# See usage() function below for more details... |
|
# |
|
# OCF instance parameters: |
|
@@ -41,7 +41,6 @@ |
|
HANA_STATE_DEFECT=3 |
|
|
|
debug_attributes=0 |
|
- |
|
SH=/bin/sh |
|
|
|
# |
|
@@ -57,7 +56,7 @@ |
|
local shf="${SAPHanaFilter:-all}" |
|
#ocf_log "info" "super_ocf_log: f:$shf l:$level m:$message" |
|
# message levels: (dbg)|info|warn|err|error |
|
- # |
|
+ # |
|
# message types: (ACT|RA|FLOW|DBG|LPA|DEC |
|
case "$level" in |
|
dbg | debug | warn | err | error ) skip=0 |
|
@@ -65,7 +64,7 @@ |
|
info ) |
|
case "$shf" in |
|
all) skip=0 |
|
- ;; |
|
+ ;; |
|
none ) |
|
skip=1 |
|
;; |
|
@@ -74,13 +73,13 @@ |
|
mtype=${mtype#fh} |
|
echo "$shf"| grep -iq ${mtype}; search=$? |
|
if [ $search -eq 0 ]; then |
|
- skip=0 |
|
+ skip=0 |
|
else |
|
skip=1 |
|
fi |
|
;; |
|
esac |
|
- ;; |
|
+ ;; |
|
esac |
|
if [ $skip -eq 0 ]; then |
|
ocf_log "$level" "$message" |
|
@@ -126,15 +125,15 @@ |
|
<?xml version="1.0"?> |
|
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> |
|
<resource-agent name="SAPHanaTopology"> |
|
- <version>0.149.6</version> |
|
+ <version>0.151.1</version> |
|
<shortdesc lang="en">Analyzes SAP HANA System Replication Topology.</shortdesc> |
|
<longdesc lang="en">This RA analyzes the SAP HANA topology and "sends" all findings via the node status attributes to |
|
all nodes in the cluster. These attributes are taken by the SAPHana RA to control the SAP Hana Databases. |
|
In addition it starts and monitors the local saphostagent. |
|
|
|
-1. Interface to monitor a HANA system: landscapeHostConfiguration.py |
|
+1. Interface to monitor a HANA system: landscapeHostConfiguration.py |
|
landscapeHostConfiguration.py has some detailed output about HANA system status |
|
-and node roles. For our monitor the overall status is relevant. This overall |
|
+and node roles. For our monitor the overall status is relevant. This overall |
|
status is reported by the returncode of the script: |
|
0: Internal Fatal |
|
1: ERROR |
|
@@ -150,7 +149,7 @@ |
|
system replication takeover (sr_takeover) or to register a former primary to a newer one (sr_register). |
|
|
|
3. saphostctrl |
|
- The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the |
|
+ The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the |
|
SAP HANA instance. This is the hostname used during the HANA installation. |
|
</longdesc> |
|
<parameters> |
|
@@ -172,13 +171,8 @@ |
|
<content type="string" default="" /> |
|
</parameter> |
|
<parameter name="SAPHanaFilter" unique="0" required="0"> |
|
- <shortdesc lang="en">Define type of SAPHanaTopology RA messages to be printed</shortdesc> |
|
- <longdesc lang="en">Define type of SAPHanaTopology RA messages to be printed. |
|
-Define SAPHana resource agent messages to be printed. |
|
- This parameter should only be set if requested by support. The default is sufficient for normal operation. |
|
- Values: ra-act-lpa-dec-flow |
|
- You could specify any combination of the above values like "ra-act-flow" |
|
- </longdesc> |
|
+ <shortdesc lang="en">OUTDATED</shortdesc> |
|
+ <longdesc lang="en">OUTDATED</longdesc> |
|
<content type="string" default="" /> |
|
</parameter> |
|
</parameters> |
|
@@ -197,7 +191,7 @@ |
|
} |
|
|
|
# |
|
-# function: get_hana_attribute |
|
+# function: get_hana_attribute |
|
# params: NODE ATTR [STORE] |
|
# globals: - |
|
# |
|
@@ -208,16 +202,19 @@ |
|
local attr_node=$1 |
|
local attr_name=$2 |
|
local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter |
|
- local attr_val="" |
|
- attr_val=$(crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q); rc=$? |
|
- if [ $debug_attributes -eq 1 ]; then |
|
- dstr=$(date) |
|
- echo "$dstr: SAPHanaTopology: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q --> $attr_val" >> /var/log/fhATTRIBUTE |
|
- fi |
|
- echo "$attr_val" |
|
- if [ $rc -ne 0 ]; then |
|
- super_ocf_log debug "DBG: ATTRIBUTE-FAILURE: crm_attribute -N $attr_node -G -n "$attr_name" -l $attr_store -q" |
|
- fi |
|
+ local attr_default=${4:-} |
|
+ local dstr |
|
+ dstr=$(date) |
|
+ case "$attr_store" in |
|
+ reboot | forever ) |
|
+ echo "$dstr: SAPHanaTopology: crm_attribute -N ${attr_node} -G -n \"$attr_name\" -l $attr_store -q" >> /var/log/fhATTRIBUTE |
|
+ crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$? |
|
+ ;; |
|
+ props ) |
|
+ echo "$dstr: SAPHanaTopology: crm_attribute -G -n \"$attr_name\" -t crm_config -q" >> /var/log/fhATTRIBUTE |
|
+ crm_attribute -G -n "$attr_name" -t crm_config -q -d "$attr_default" 2>>/var/log/fhATTRIBUTE; rc=$? |
|
+ ;; |
|
+ esac |
|
super_ocf_log info "FLOW $FUNCNAME rc=$rc" |
|
return $rc |
|
} |
|
@@ -234,19 +231,24 @@ |
|
local attr_value=$2 |
|
local attr_name=$3 |
|
local attr_store=${4:-reboot} # DONE: PRIO5 get this (optional) from parameter |
|
+ local attr_default=${5:-} |
|
local rc=1 |
|
- local attr_old |
|
- attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store); get_rc=$? |
|
+ local attr_old="" |
|
+ local dstr |
|
+ dstr=$(date) |
|
+ attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store $attr_default); get_rc=$? |
|
if [ "$attr_old" != "$attr_value" ]; then |
|
super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc " |
|
- if [ $debug_attributes -eq 1 ]; then |
|
- dstr=$(date) |
|
- echo "$dstr: SAPHanaTopology: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE |
|
- fi |
|
- crm_attribute -N $attr_node -v "$attr_value" -n "$attr_name" -l $attr_store; rc=$? |
|
- if [ $rc -ne 0 ]; then |
|
- super_ocf_log debug "DBG: ATTRIBUTE-FAILURE: crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store" |
|
- fi |
|
+ case "$attr_store" in |
|
+ reboot | forever ) |
|
+ echo "$dstr: SAPHanaTopology: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store" >> /var/log/fhATTRIBUTE |
|
+ crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store 2>>/var/log/fhATTRIBUTE; rc=$? |
|
+ ;; |
|
+ props ) |
|
+ echo "$dstr: SAPHanaTopology: crm_attribute -v $attr_value -n \"$attr_name\" -t crm_config -s SAPHanaSR" >> /var/log/fhATTRIBUTE |
|
+ crm_attribute -v $attr_value -n "$attr_name" -t crm_config -s SAPHanaSR 2>>/var/log/fhATTRIBUTE; rc=$? |
|
+ ;; |
|
+ esac |
|
else |
|
super_ocf_log debug "DBG: LET attribute $attr_name for node ${attr_node} still be ${attr_value}" |
|
rc=0 |
|
@@ -299,7 +301,7 @@ |
|
# |
|
# yes it is a clone config - check, if its configured well |
|
# |
|
- if [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] ; then |
|
+ if [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] ; then |
|
super_ocf_log err "ACT: Clone options misconfigured. (expect: clone_node_max=1)" |
|
exit $OCF_ERR_CONFIGURED |
|
fi |
|
@@ -314,8 +316,8 @@ |
|
# |
|
# function: sht_init - initialize variables for the resource agent |
|
# params: - |
|
-# globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), |
|
-# globals: meta_notify_master_uname(w), HANA_SR_TOLOPOGY(w), sr_name(w), remoteHost(w) |
|
+# globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), |
|
+# globals: meta_notify_master_uname(w), HANA_SR_TOLOPOGY(w), sr_name(w), remoteHost(w) |
|
# globals: ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_PRIMARY_AT(w), ATTR_NAME_HANA_CLONE_STATE(w) |
|
# globals: DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w), nodelist(w) |
|
# sht_init : Define global variables with default values, if optional parameters are not set |
|
@@ -327,6 +329,8 @@ |
|
local myInstanceName="" |
|
local rc=$OCF_SUCCESS |
|
local hdbANSWER="" |
|
+ local siteID |
|
+ local siteNAME |
|
HOSTEXECNAME=saphostexec |
|
USRSAP=/usr/sap |
|
SAPSERVICE_PATH=${USRSAP}/sapservices |
|
@@ -340,10 +344,9 @@ |
|
super_ocf_log debug "DBG2: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" |
|
sid=$(echo "$SID" | tr [:upper:] [:lower:]) |
|
sidadm="${sid}adm" |
|
- SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" |
|
ocf_env=$(env | grep 'OCF_RESKEY_CRM') |
|
super_ocf_log debug "DBG3: OCF: $ocf_env" |
|
- ATTR_NAME_HANA_SYNC_STATUS=("hana_${sid}_sync_state" "reboot") # SOK, SFAIL, UNKNOWN? |
|
+ ATTR_NAME_HANA_SYNC_STATUS=("hana_${sid}_sync_state" "reboot") # SOK, SFAIL, UNKNOWN? |
|
ATTR_NAME_HANA_PRIMARY_AT=("hana_${sid}_primary_at" "reboot") # Not really used |
|
ATTR_NAME_HANA_CLONE_STATE=("hana_${sid}_clone_state" "reboot") # UKNOWN?, DEMOTED, PROMOTED |
|
ATTR_NAME_HANA_REMOTEHOST=("hana_${sid}_remoteHost" "forever") |
|
@@ -352,8 +355,14 @@ |
|
ATTR_NAME_HANA_SRMODE=("hana_${sid}_srmode" "forever") |
|
ATTR_NAME_HANA_VHOST=("hana_${sid}_vhost" "forever") |
|
ATTR_NAME_HANA_STATUS=("hana_${sid}_status" "reboot") |
|
- |
|
+ # |
|
+ # new "central" attributes |
|
+ # |
|
+ ATTR_NAME_HANA_FILTER=("hana_${sid}_glob_filter" "props" "ra-act-dec-lpa") |
|
# optional OCF parameters, we try to guess which directories are correct |
|
+ |
|
+ SAPHanaFilter=$(get_hana_attribute "X" ${ATTR_NAME_HANA_FILTER[@]}) |
|
+ |
|
if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] |
|
then |
|
DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" |
|
@@ -387,19 +396,32 @@ |
|
# we need: mode=primary|sync|syncmem|...; site name=<site>; mapping/<me>=<site>/<node> (multiple lines) |
|
case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in |
|
*corosync* ) nodelist=$(crm_node -l | awk '{ print $2 }');; |
|
- *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; |
|
- *cman* ) nodelist=$(crm_node -l);; |
|
+ *openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; |
|
+ *cman* ) nodelist=$(crm_node -l);; |
|
esac |
|
#### SAP-CALL |
|
- hdbANSWER=$(su - ${sidadm} -c "hdbnsutil -sr_state --sapcontrol=1" 2>/dev/null) |
|
- super_ocf_log debug "DBG2: hdbANSWER=\$\(su - ${sidadm} -c \"hdbnsutil -sr_state --sapcontrol=1\"\)" |
|
- site=$(echo "$hdbANSWER" | awk -F= '/site name/ {print $2}') |
|
+ # hdbnsutil was a bit unstable in some tests so we recall the tool, if it fails to report the srmode |
|
+ for i in 1 2 3 4 5 6 7 8 9; do |
|
+ hdbANSWER=$(su - ${sidadm} -c "hdbnsutil -sr_state --sapcontrol=1" 2>/dev/null) |
|
+ super_ocf_log debug "DBG2: hdbANSWER=\$\(su - ${sidadm} -c \"hdbnsutil -sr_state --sapcontrol=1\"\)" |
|
+ srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}') |
|
+ case "$srmode" in |
|
+ primary | syncmem | sync | async | none ) |
|
+ # we can leave the loop as we already got a result |
|
+ break |
|
+ ;; |
|
+ * ) |
|
+ # lets pause a bit to give hdbnsutil a chance to answer next time |
|
+ sleep 2 |
|
+ ;; |
|
+ esac |
|
+ done |
|
+ # TODO PRIO3: Implement a file lookup, if we did not get a result |
|
+ siteID=$(echo "$hdbANSWER" | awk -F= '/site id/ {print $2}') |
|
+ siteNAME=$(echo "$hdbANSWER" | awk -F= '/site name/ {print $2}') |
|
+ site=$siteNAME |
|
srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}') |
|
- if [ $debug_attributes -eq 1 ]; then |
|
- dstr=$(date) |
|
- echo "$dstr: SAPHanaTopology: srmode=$srmode" >> /var/log/fhATTRIBUTE |
|
- fi |
|
- MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 == "mapping" && $3 != site { print $4 }' site=$site) |
|
+ MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 ~ "mapping" && $3 !~ site { print $4 }' site=$site) |
|
super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING" |
|
# |
|
# filter all non-cluster mappings |
|
@@ -413,12 +435,12 @@ |
|
echo $hanaVHost; |
|
fi; |
|
done; |
|
- done ) |
|
+ done ) |
|
super_ocf_log info "DEC: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" |
|
super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost" |
|
super_ocf_log info "FLOW $FUNCNAME rc=$OCF_SUCCESS" |
|
return $OCF_SUCCESS |
|
-} |
|
+} |
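
With -F[=/] every mapping line of the sapcontrol-style output splits into four fields: "mapping", the local host, the site, and the virtual host, so the condition $3 !~ site keeps only entries pointing away from the local site. A sketch against a fabricated answer (host and site names here are made up for illustration):

# Fabricated hdbnsutil answer, for illustration only:
hdbANSWER='site name=WDF1
mode=sync
mapping/hanaVHost01=WDF1/hanaVHost01
mapping/hanaVHost02=ROT1/hanaVHost02'
site=$(echo "$hdbANSWER" | awk -F= '/site name/ {print $2}')
# keep only mappings that do NOT belong to the local site:
MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 ~ "mapping" && $3 !~ site { print $4 }' site=$site)
echo "remote candidates: $MAPPING"   # -> hanaVHost02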
|
|
|
# |
|
# function: check_for_primary - check if local SAP HANA is configured as primary |
|
@@ -428,32 +450,30 @@ |
|
function check_for_primary() { |
|
super_ocf_log info "FLOW $FUNCNAME ($*)" |
|
local rc=0 |
|
- # DONE: Change stderr location!! |
|
- #sidadm=lnxadm |
|
- #node_status=$(check_for_primary_single) |
|
- node_status=$srmode |
|
- super_ocf_log debug "DBG2: check_for_primary: node_status=$node_status" |
|
- super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" |
|
- for i in 1 2 3 4 5 6 7 8 9; do |
|
- case "$node_status" in |
|
- primary ) |
|
+ node_status=$srmode |
|
+ super_ocf_log debug "DBG2: check_for_primary: node_status=$node_status" |
|
+ super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" |
|
+ for i in 1 2 3 4 5 6 7 8 9; do |
|
+ case "$node_status" in |
|
+ primary ) |
|
super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_PRIMARY" |
|
return $HANA_STATE_PRIMARY;; |
|
syncmem | sync | async ) |
|
super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_SECONDARY" |
|
return $HANA_STATE_SECONDARY;; |
|
- none ) # have seen that mode on second side BEFEORE we registered it as replica |
|
+ none ) # have seen that mode on the secondary side BEFORE we registered it as a replica
|
super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_STANDALONE" |
|
return $HANA_STATE_STANDALONE;; |
|
* ) |
|
- super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" |
|
- dump=$( echo $node_status | hexdump -C ); |
|
- super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" |
|
- #### SAP-CALL |
|
- node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) |
|
- node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') |
|
- super_ocf_log info "DEC: check_for_primary: loop=$i: node_status=$node_status" |
|
- # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes |
|
+ # TODO: PRIO1: Should we set SFAIL? |
|
+ # TODO: PRIO2: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes |
|
+ dump=$( echo $node_status | hexdump -C ); |
|
+ super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP: <$dump>" |
|
+ #### SAP-CALL |
|
+ node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) |
|
+ node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') |
|
+ super_ocf_log info "DEC: check_for_primary: loop=$i: node_status=$node_status" |
|
+ # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes |
|
esac; |
|
done |
|
super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_DEFECT" |
|
@@ -464,7 +484,7 @@ |
|
# |
|
# function: start_saphostagent |
|
# params: - |
|
-# globals: |
|
+# globals: HOSTEXEC_PATH(r), HOSTEXEC_PROFILE_PATH(r) |
|
# |
|
function start_saphostagent() |
|
{ |
|
@@ -478,7 +498,7 @@ |
|
# |
|
# function: stop_saphostagent |
|
# params: - |
|
-# globals: |
|
+# globals: HOSTEXEC_PATH(r) |
|
# |
|
function stop_saphostagent() |
|
{ |
|
@@ -496,6 +516,8 @@ |
|
function check_saphostagent() |
|
{ |
|
local rc=1 |
|
+ # TODO: PRIO3: should the path be removed, i.e. "saphostexec" instead of "/usr/sap/hostctrl/exe/saphostexec",
|
+ # or should we use ${HOSTEXEC_PATH} instead? |
|
pgrep -f /usr/sap/hostctrl/exe/saphostexec; rc=$? |
|
return $rc |
|
} |
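
check_saphostagent relies on pgrep -f, which matches against the full command line, so the hard-coded absolute path only matches a saphostexec started from the standard install location (that is what the TODO above is about). Standalone form:

# pgrep -f matches the full command line; rc 0 means a process was found.
if pgrep -f /usr/sap/hostctrl/exe/saphostexec >/dev/null; then
    echo "saphostagent is running"
else
    echo "saphostagent is NOT running"
fi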
|
@@ -509,15 +531,16 @@ |
|
# sht_start : Start the SAP HANA instance |
|
# |
|
function sht_start() { |
|
- |
|
super_ocf_log info "FLOW $FUNCNAME ($*)" |
|
|
|
local rc=$OCF_NOT_RUNNING |
|
local output="" |
|
- local loopcount=0 |
|
+ local loopcount=0 |
|
|
|
- mkdir -p /var/lib/SAPHana |
|
- touch /var/lib/SAPHana/SAPTopologyON |
|
+ # TODO: PRIO3: move the string "$HA_RSCTMP/SAPHana/SAPTopologyON" to a variable |
|
+ # TODO: PRIO3: move the file to the cluster's tmp directory?
|
+ mkdir -p $HA_RSCTMP/SAPHana |
|
+ touch $HA_RSCTMP/SAPHana/SAPTopologyON |
|
if ! check_saphostagent; then |
|
start_saphostagent |
|
fi |
|
@@ -532,16 +555,16 @@ |
|
# function: sht_stop - stop a hana instance |
|
# params: - |
|
# globals: OCF_*(r), SAPCONTROL(r), SID(r), InstanceName(r) |
|
-# sht_stop: Stop the SAP instance |
|
+# sht_stop: Stop the SAP HANA Topology Resource |
|
# |
|
function sht_stop() { |
|
super_ocf_log info "FLOW $FUNCNAME ($*)" |
|
local output="" |
|
local rc=0 |
|
|
|
- rm /var/lib/SAPHana/SAPTopologyON |
|
+ rm $HA_RSCTMP/SAPHana/SAPTopologyON |
|
rc=$OCF_SUCCESS |
|
- |
|
+ |
|
super_ocf_log info "FLOW $FUNCNAME rc=$rc" |
|
return $rc |
|
} |
|
@@ -557,13 +580,13 @@ |
|
super_ocf_log info "FLOW $FUNCNAME ($*)" |
|
local rc=0 |
|
|
|
- if [ -f /var/lib/SAPHana/SAPTopologyON ]; then |
|
+ if [ -f $HA_RSCTMP/SAPHana/SAPTopologyON ]; then |
|
rc=$OCF_SUCCESS |
|
else |
|
rc=$OCF_NOT_RUNNING |
|
fi |
|
|
|
- super_ocf_log info "FLOW $FUNCNAME rc=$rc" |
|
+ super_ocf_log info "FLOW $FUNCNAME rc=$rc" |
|
return $rc |
|
} |
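
start, stop and monitor cooperate through nothing more than a marker file; moving it under $HA_RSCTMP (typically below /run) means a rebooted node automatically reports "not running" instead of leaving a stale state file behind, as /var/lib/SAPHana did. The whole lifecycle as a sketch:

# Marker-file lifecycle sketch; HA_RSCTMP comes from the OCF shell functions,
# /run/resource-agents is only a typical default assumed here.
STATE_FILE="${HA_RSCTMP:-/run/resource-agents}/SAPHana/SAPTopologyON"

marker_start()   { mkdir -p "$(dirname "$STATE_FILE")" && touch "$STATE_FILE"; }
marker_stop()    { rm -f "$STATE_FILE"; }
marker_monitor() { [ -f "$STATE_FILE" ]; }   # rc 0 = OCF_SUCCESS, else map to OCF_NOT_RUNNING (7)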
|
|
|
@@ -575,37 +598,37 @@ |
|
# sht_status: Lightweight check of SAP instance only with OS tools |
|
# |
|
function sht_status() { |
|
- super_ocf_log info "FLOW $FUNCNAME ($*)" |
|
- local rc=0 |
|
+ super_ocf_log info "FLOW $FUNCNAME ($*)" |
|
+ local rc=0 |
|
|
|
- sht_monitor; rc=$? |
|
- return $rc |
|
+ sht_monitor; rc=$? |
|
+ return $rc |
|
} |
|
|
|
|
|
# |
|
# function: sht_validate - validation of (some) variables/parameters |
|
# params: - |
|
-# globals: OCF_*(r), SID(r), InstanceName(r), InstanceNr(r), |
|
-# sht_validate: Check the symantic of the input parameters |
|
+# globals: OCF_*(r), SID(r), InstanceName(r), InstanceNr(r), |
|
+# sht_validate: Check the semantics of the input parameters
|
# |
|
function sht_validate() { |
|
- super_ocf_log info "FLOW $FUNCNAME ($*)" |
|
- local rc=$OCF_SUCCESS |
|
- if [ $(echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$') -ne 1 ] |
|
- then |
|
- super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!" |
|
- rc=$OCF_ERR_ARGS |
|
- fi |
|
+ super_ocf_log info "FLOW $FUNCNAME ($*)" |
|
+ local rc=$OCF_SUCCESS |
|
+ if [ $(echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$') -ne 1 ] |
|
+ then |
|
+ super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!" |
|
+ rc=$OCF_ERR_ARGS |
|
+ fi |
|
|
|
- if [ $(echo "$InstanceNr" | grep -c '^[0-9][0-9]$') -ne 1 ] |
|
- then |
|
- super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!" |
|
- rc=$OCF_ERR_ARGS |
|
- fi |
|
+ if [ $(echo "$InstanceNr" | grep -c '^[0-9][0-9]$') -ne 1 ] |
|
+ then |
|
+ super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!" |
|
+ rc=$OCF_ERR_ARGS |
|
+ fi |
|
|
|
- super_ocf_log info "FLOW $FUNCNAME rc=$rc" |
|
- return $rc |
|
+ super_ocf_log info "FLOW $FUNCNAME rc=$rc" |
|
+ return $rc |
|
} |
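
Both checks use grep -c, which prints the number of matching input lines, so a valid value yields exactly 1: a SID is one uppercase letter followed by two uppercase alphanumerics, an instance number exactly two digits. As standalone predicates:

# Validation sketch: grep -c counts matching lines, valid input counts as 1.
is_valid_sid() { [ "$(echo "$1" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$')" -eq 1 ]; }
is_valid_inr() { [ "$(echo "$1" | grep -c '^[0-9][0-9]$')" -eq 1 ]; }

is_valid_sid "LNX" && echo "SID LNX ok"
is_valid_sid "1nx" || echo "SID 1nx rejected"
is_valid_inr "10"  && echo "instance number 10 ok"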
|
|
|
# |
|
@@ -661,15 +684,15 @@ |
|
|
|
if ocf_is_probe; then |
|
super_ocf_log debug "DBG2: PROBE ONLY" |
|
+ sht_monitor; rc=$? |
|
else |
|
super_ocf_log debug "DBG2: REGULAR MONITOR" |
|
if ! check_saphostagent; then |
|
start_saphostagent |
|
fi |
|
- fi |
|
# |
|
# First check, if we are PRIMARY or SECONDARY |
|
- # |
|
+ # |
|
super_ocf_log debug "DBG2: HANA SID $SID" |
|
super_ocf_log debug "DBG2: HANA InstanceName $InstanceName" |
|
super_ocf_log debug "DBG2: HANA InstanceNr $InstanceNr" |
|
@@ -721,8 +744,8 @@ |
|
set_hana_attribute ${NODENAME} "$site" ${ATTR_NAME_HANA_SITE[@]} |
|
fi |
|
case "$hanaPrim" in |
|
- P ) ;; |
|
- S ) # only secondary may propargate its sync status |
|
+ P ) ;; |
|
+ S ) # only the secondary may propagate its sync status
|
case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in |
|
*corosync* ) nodelist=$(crm_node -l | awk '{ print $2 }');; |
|
*openais* ) nodelist=$(crm_node -l | awk '/member/ {print $2}');; |
|
@@ -732,8 +755,10 @@ |
|
for n in ${nodelist}; do |
|
set_hana_attribute ${n} "$srmode" ${ATTR_NAME_HANA_SRMODE[@]} |
|
done |
|
- ;; |
|
+ ;; |
|
esac |
|
+ # |
|
+ fi # end ocf_is_NOT_probe |
|
super_ocf_log info "FLOW $FUNCNAME rc=$rc" |
|
return $rc |
|
} |
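
Only a secondary publishes its srmode, and it writes the value to the attribute set of every cluster member so both sides can read it from their local node attributes. The propagation pattern in isolation (crm_attribute stands in for the agent's set_hana_attribute wrapper; attribute name and lifetime mirror ATTR_NAME_HANA_SRMODE):

# Sketch: publish one value as a node attribute on every cluster member.
publish_srmode() {
    local value=$1 n
    for n in ${nodelist}; do
        crm_attribute -N "$n" -n "hana_${sid}_srmode" -v "$value" -l forever
    done
}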
|
@@ -752,7 +777,7 @@ |
|
} |
|
|
|
# |
|
-# function: main - main function to operate |
|
+# function: main - main function to operate |
|
# params: ACTION |
|
# globals: OCF_*(r), SID(w), sidadm(w), InstanceName(w), DIR_EXECUTABLE(w), ACTION(w), CLACT(w), ra_rc(rw), $0(r), %ENV(r) |
|
# |
|
@@ -763,7 +788,7 @@ |
|
InstanceName="" |
|
InstanceNr="" |
|
DIR_EXECUTABLE="" |
|
-SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" |
|
+SAPHanaFilter="ra-act-dec-lpa" |
|
NODENAME=$(crm_node -n) |
|
|
|
if [ $# -ne 1 ] |
|
@@ -785,11 +810,11 @@ |
|
exit $OCF_SUCCESS;; |
|
notify) sht_notify |
|
exit $OCF_SUCCESS;; |
|
- admin-setup) admin-setup |
|
- exit $OCF_SUCCESS;; |
|
+ admin-setup) admin-setup |
|
+ exit $OCF_SUCCESS;; |
|
*);; |
|
esac |
|
-sht_init |
|
+sht_init |
|
|
|
if ! ocf_is_root |
|
then |
|
@@ -810,7 +835,6 @@ |
|
exit $OCF_ERR_ARGS |
|
fi |
|
|
|
- |
|
if is_clone |
|
then |
|
CLACT=_clone |
|
@@ -830,12 +854,12 @@ |
|
sht_$ACTION$CLACT |
|
ra_rc=$? |
|
;; |
|
- validate-all) |
|
+ validate-all) |
|
sht_validate |
|
ra_rc=$? |
|
;; |
|
- *) # seams to be a unknown request |
|
- sht_methods |
|
+ *) # seems to be an unknown request
|
+ sht_methods |
|
ra_rc=$OCF_ERR_UNIMPLEMENTED |
|
;; |
|
esac
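
The dispatcher composes the handler name at runtime: "sht_" plus the action plus, for clones, the "_clone" suffix, so a monitor call on a clone resource ends up in sht_monitor_clone. A toy version of that dispatch:

# Dynamic dispatch sketch: handler name = "sht_" + action + optional suffix.
ACTION=monitor
CLACT=_clone                     # set only when running as a clone
sht_monitor_clone() { echo "clone-aware monitor would run here"; return 0; }

sht_$ACTION$CLACT                # expands to: sht_monitor_clone
ra_rc=$?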
|
|
|