From ef36b33da922b2b8501e80ca840bfb7accc65ff0 Mon Sep 17 00:00:00 2001
From: David Vossel
Date: Thu, 26 Feb 2015 14:21:20 -0600
Subject: [PATCH] bz1168251-SAPHana-agents

---
 doc/man/Makefile.am             |    2 +
 heartbeat/Makefile.am           |    2 +
 heartbeat/SAPHana               | 2106 +++++++++++++++++++++++++++++++++++++++
 heartbeat/SAPHanaTopology       |  813 +++++++++++++++
 tools/Makefile.am               |    2 +-
 tools/show_SAPHanaSR_attributes |  133 +++
 6 files changed, 3057 insertions(+), 1 deletion(-)
 create mode 100755 heartbeat/SAPHana
 create mode 100755 heartbeat/SAPHanaTopology
 create mode 100755 tools/show_SAPHanaSR_attributes

diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
index 5a1ad4d..31fc1f5 100644
--- a/doc/man/Makefile.am
+++ b/doc/man/Makefile.am
@@ -78,6 +78,8 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \
                           ocf_heartbeat_Route.7 \
                           ocf_heartbeat_SAPDatabase.7 \
                           ocf_heartbeat_SAPInstance.7 \
+                          ocf_heartbeat_SAPHana.7 \
+                          ocf_heartbeat_SAPHanaTopology.7 \
                           ocf_heartbeat_SendArp.7 \
                           ocf_heartbeat_ServeRAID.7 \
                           ocf_heartbeat_SphinxSearchDaemon.7 \
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
index f08dad4..dd5b0a9 100644
--- a/heartbeat/Makefile.am
+++ b/heartbeat/Makefile.am
@@ -105,6 +105,8 @@ ocf_SCRIPTS = ClusterMon \
                           rsyslog \
                           SAPDatabase \
                           SAPInstance \
+                          SAPHana \
+                          SAPHanaTopology \
                           SendArp \
                           ServeRAID \
                           slapd \
diff --git a/heartbeat/SAPHana b/heartbeat/SAPHana
new file mode 100755
index 0000000..f4db17a
--- /dev/null
+++ b/heartbeat/SAPHana
@@ -0,0 +1,2106 @@
+#!/bin/bash
+#
+# SAPHana
+#
+# Description: Manages two single SAP HANA instances in system replication
+#              Planned: also manage scale-out scenarios
+#              Currently SAPHana depends on the analysis done by
+#              SAPHanaTopology
+#              For supported scenarios please read the README file provided
+#              in the same software package (rpm)
+#
+##############################################################################
+#
+# SAPHana
+# Author:       Fabian Herschel, November 2013
+# Support:      linux@sap.com
+# License:      GNU General Public License (GPL)
+# Copyright:    (c) 2013,2014 SUSE Linux Products GmbH
+#
+# An example usage:
+#      See usage() function below for more details...
+#
+# OCF instance parameters:
+#   OCF_RESKEY_SID
+#   OCF_RESKEY_InstanceNumber
+#   OCF_RESKEY_DIR_EXECUTABLE   (optional, well-known directories will be searched by default)
+#   OCF_RESKEY_DIR_PROFILE      (optional, well-known directories will be searched by default)
+#   OCF_RESKEY_INSTANCE_PROFILE (optional, well-known directories will be searched by default)
+#   OCF_RESKEY_PREFER_SITE_TAKEOVER (optional, default is no)
+#   OCF_RESKEY_DUPLICATE_PRIMARY_TIMEOUT (optional, time difference needed between two last-primary timestamps (lpt))
+#   OCF_RESKEY_SAPHanaFilter    (optional, should only be set if told to by support or for debugging purposes)
+#
+#
+#######################################################################
+#
+# Initialization:
+timeB=$(date '+%s')
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+#
+#######################################################################
+#
+
+HANA_STATE_PRIMARY=0
+HANA_STATE_SECONDARY=1
+HANA_STATE_STANDALONE=2
+HANA_STATE_DEFECT=3
+
+SH=/bin/sh
+
+#
+# function: super_ocf_log - wrapper around ocf_log that filters the usual logging through SAPHanaFilter
+# params:   LOG_MESSAGE
+# globals:  SAPHanaFilter
+function super_ocf_log() {
+    local level="$1"
+    local message="$2"
+    local skip=1
+    local mtype=""
+    local search=0
+    local shf="${SAPHanaFilter:-all}"
+    # message levels: (dbg)|info|warn|err|error
+    # message types:  (ACT|RA|FLOW|DBG|LPA|DEC|DBG2...
+    case "$level" in
+        debug | dbg | warn | err | error ) skip=0
+            ;;
+        info )
+            case "$shf" in
+                all ) skip=0
+                    ;;
+                none )
+                    skip=1
+                    ;;
+                * ) mtype=${message%% *}
+                    mtype=${mtype%:}
+                    mtype=${mtype#fh}
+                    echo "$shf" | grep -iq ${mtype}; search=$?
+                    if [ $search -eq 0 ]; then
+                        skip=0
+                    else
+                        skip=1
+                    fi
+                    ;;
+            esac
+            ;;
+    esac
+    if [ $skip -eq 0 ]; then
+        ocf_log "$level" "$message"
+    fi
+}
+
+#
+# function: saphana_usage - short usage info
+# params:   -
+# globals:  $0(r)
+#
+function saphana_usage() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    methods=$(saphana_methods)
+    methods=$(echo $methods | tr ' ' '|')
+    cat <<-!
+	usage: $0 ($methods)
+
+	$0 manages a SAP HANA instance as an HA resource.
+
+	The 'start'        operation starts the HANA instance or brings the "clone instance" to a WAITING status
+	The 'stop'         operation stops the HANA instance
+	The 'status'       operation reports whether the HANA instance is running
+	The 'monitor'      operation reports whether the HANA instance seems to be working; in master/slave mode it also checks the system replication status
+	The 'promote'      operation either runs a takeover for a secondary or does nothing for a primary
+	The 'demote'       operation does nearly nothing and just marks the instance as demoted
+	The 'notify'       operation always returns SUCCESS
+	The 'validate-all' operation reports whether the parameters are valid
+	The 'methods'      operation reports on the methods $0 supports
+
+	!
+    return $rc
+}
+
+#
+# function: saphana_meta_data - print resource agent meta-data for cluster
+# params:   -
+# globals:  -
+#
+function saphana_meta_data() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="SAPHana">
+<version>0.149.4</version>
+
+<shortdesc lang="en">Manages two SAP HANA instances in system replication (SR).</shortdesc>
+<longdesc lang="en">
+The SAPHanaSR resource agent manages two SAP HANA instances (databases) which are configured
+in system replication. This first version is limited to the scale-up scenario. Scale-out is
+not supported in this version.
+
+Managing the two SAP HANA instances means that the resource agent controls the start/stop of the
+instances. In addition the resource agent is able to monitor the SAP HANA databases to check their
+availability on landscape host configuration level. For this monitoring the resource agent relies on interfaces
+provided by SAP. A third task of the resource agent is to also check the synchronisation status
+of the two SAP HANA databases. If the synchronisation is not "SOK", then the cluster avoids a
+failover to the secondary side if the primary fails. This is to improve data consistency.
+
+The resource agent uses the following five interfaces provided by SAP:
+
+1. sapcontrol/sapstartsrv
+   The interface sapcontrol/sapstartsrv is used to start/stop a HANA database instance/system.
+
+2. landscapeHostConfiguration
+   The interface is used to monitor a HANA system. The python script is named landscapeHostConfiguration.py.
+   landscapeHostConfiguration.py has some detailed output about HANA system status
+   and node roles. For our monitor the overall status is relevant. This overall
+   status is reported by the return code of the script:
+   0: Internal Fatal, 1: ERROR, 2: WARNING, 3: INFO, 4: OK
+   The SAPHana resource agent will interpret return code 0 as FATAL, 1 as not-running or ERROR, and return codes 2+3+4 as RUNNING.
+
+3. hdbnsutil
+   The interface hdbnsutil is used to check the "topology" of the system replication as well as the current configuration
+   (primary/secondary) of a SAP HANA database instance. A second task of the interface is the possibility to run a
+   system replication takeover (sr_takeover) or to register a former primary to a newer one (sr_register).
+
+4. hdbsql / systemReplicationStatus
+   The interface is an SQL query into HANA (system replication table). The hdbsql query will be replaced by a python script
+   "systemReplicationStatus.py" in SAP HANA SPS8 or 9.
+   As long as we need to use hdbsql, you need to set up secure store users for the linux user root to be able to
+   access the SAP HANA database. You need to configure a secure store user key "SAPHANA${SID}SR" which can connect to the SAP
+   HANA database.
+
+5. saphostctrl
+   The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the
+   SAP HANA instance. This is the hostname used during the HANA installation.
+</longdesc>
+<parameters>
+ <parameter name="SID" unique="0" required="1">
+  <longdesc lang="en">SAP System Identifier (SID) like "SLE" or "HAE"</longdesc>
+  <shortdesc lang="en">SAP System Identifier (SID)</shortdesc>
+  <content type="string" default="" />
+ </parameter>
+ <parameter name="InstanceNumber" unique="0" required="1">
+  <longdesc lang="en">SAP instance number like "00" or "07"</longdesc>
+  <shortdesc lang="en">SAP instance number</shortdesc>
+  <content type="string" default="" />
+ </parameter>
+ <parameter name="PREFER_SITE_TAKEOVER" unique="0" required="0">
+  <longdesc lang="en">Should the cluster/RA prefer to switch over to the slave instance instead of restarting the master locally? Default="yes"
+   no: Do prefer restart locally
+   yes: Do prefer takeover to remote site
+  </longdesc>
+  <shortdesc lang="en">Local or site recovery preferred?</shortdesc>
+  <content type="boolean" default="" />
+ </parameter>
+ <parameter name="AUTOMATED_REGISTER" unique="0" required="0">
+  <longdesc lang="en">The parameter AUTOMATED_REGISTER defines whether a former primary instance should
+   be registered automatically by the resource agent during cluster/resource start, if the DUPLICATE_PRIMARY_TIMEOUT has expired... TBD
+  </longdesc>
+  <shortdesc lang="en">Define, if a former primary should automatically be registered.</shortdesc>
+  <content type="boolean" default="false" />
+ </parameter>
+ <parameter name="DUPLICATE_PRIMARY_TIMEOUT" unique="0" required="0">
+  <longdesc lang="en">Time difference needed between two primary time stamps,
+   if a dual-primary situation occurs. If the time difference is
+   less than the time gap, then the cluster holds one or both instances in a "WAITING" status. This is to give an admin
+   a chance to react to a failover. A failed former primary will be registered after the time difference has passed. After
+   this registration to the new primary, all data will be overwritten by the system replication.
+  </longdesc>
+  <shortdesc lang="en">Time difference needed between two primary time stamps, if a dual-primary situation occurs</shortdesc>
+  <content type="string" default="7200" />
+ </parameter>
+ <parameter name="DIR_EXECUTABLE" unique="0" required="0">
+  <longdesc lang="en">The fully qualified path where to find sapstartsrv and sapcontrol. Specify this parameter, if you have changed the SAP kernel directory location after the default SAP installation.</longdesc>
+  <shortdesc lang="en">Path of sapstartsrv and sapcontrol</shortdesc>
+  <content type="string" default="" />
+ </parameter>
+ <parameter name="DIR_PROFILE" unique="0" required="0">
+  <longdesc lang="en">The fully qualified path where to find the SAP START profile. Specify this parameter, if you have changed the SAP profile directory location after the default SAP installation.</longdesc>
+  <shortdesc lang="en">Path of start profile</shortdesc>
+  <content type="string" default="" />
+ </parameter>
+ <parameter name="INSTANCE_PROFILE" unique="0" required="0">
+  <longdesc lang="en">The name of the SAP HANA instance profile. Specify this parameter, if you have changed the name of the SAP HANA instance profile after the default SAP installation. Normally you do not need to set this parameter.</longdesc>
+  <shortdesc lang="en">HANA instance profile name</shortdesc>
+  <content type="string" default="" />
+ </parameter>
+ <parameter name="SAPHanaFilter" unique="0" required="0">
+  <longdesc lang="en">Define SAPHana resource agent messages to be printed.
+   This parameter should only be set if requested by SUSE support. The default is sufficient for normal operation.
+  </longdesc>
+  <shortdesc lang="en">Define SAPHana resource agent messages to be printed</shortdesc>
+  <content type="string" default="" />
+ </parameter>
+</parameters>
+<actions>
+ <action name="start" />
+ <action name="stop" />
+ <action name="promote" />
+ <action name="demote" />
+ <action name="status" />
+ <action name="monitor" role="Master" />
+ <action name="monitor" role="Slave" />
+ <action name="notify" />
+ <action name="validate-all" />
+ <action name="meta-data" />
+ <action name="methods" />
+ <action name="usage" />
+</actions>
+</resource-agent>
+END
+return $rc
+}
+
+#
+# function: saphana_methods - report supported cluster methods
+# params:   -
+# globals:  -
+# methods: What methods/operations do we support?
+#
+function saphana_methods() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0 m
+    for m in start stop status monitor promote demote notify validate-all methods meta-data usage; do
+        echo "$m"
+    done
+    return $rc
+}
+
+#
+# function: dequote - filter: remove quotes (") from stdin
+# params:   -
+# globals:  -
+function dequote()
+{
+    local rc=0; tr -d '"'; return $rc
+}
+
+#
+# function: remoteHost2remoteNode - convert a SAP remoteHost to the cluster node name
+# params:   remoteHost
+# globals:  ATTR_NAME_HANA_VHOST[*]
+#
+function remoteHost2remoteNode()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local -a clusterNodes=()
+    local cl=""
+    local vHost=""
+    local remoteHost="$1"
+    local remoteNode=""
+    local rc=1
+    for cl in ${otherNodes[@]}; do
+        vHost=$(get_hana_attribute $cl ${ATTR_NAME_HANA_VHOST[@]})
+        if [ "$vHost" = "$remoteHost" ]; then # we found the correct node
+            remoteNode=$cl
+            rc=0
+        fi
+    done
+    if [ -n "$remoteNode" ]; then
+        echo "$remoteNode"
+    fi
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: is_clone - report, if resource is configured as a clone (also master/slave)
+# params:   -
+# globals:  OCF_*(r)
+# descript: is_clone : find out if we are configured to run in a Master/Slave configuration
+# rc: 0: it is a clone, 1: it is not a clone
+#
+# DONE: PRIO2: For the first shipment (scale-out) we need to limit the clones to 2
+#
+function is_clone() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    #
+    # is this a clone config?
+    #
+    if [ -n "$OCF_RESKEY_CRM_meta_clone_max" ] \
+       && [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ]; then
+        #
+        # yes it is a clone config - check, if it is configured well
+        #
+        if [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] || \
+           [ "$OCF_RESKEY_CRM_meta_clone_max" -ne 2 ] || \
+           [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] || \
+           [ "$OCF_RESKEY_CRM_meta_master_max" -ne 1 ]; then
+            super_ocf_log err "ACT: Clone options misconfigured. (expect: clone_max=2,clone_node_max=1,master_node_max=1,master_max=1)"
+            exit $OCF_ERR_CONFIGURED
+        fi
+        rc=0;
+    else
+        rc=1;
+    fi
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: get_hana_attribute
+# params:   NODE ATTR [STORE]
+# globals:  -
+#
+function get_hana_attribute()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    local attr_node=$1
+    local attr_name=$2
+    local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter
+    crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q; rc=$?
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: set_hana_attribute - set the multi-state status of a node
+# params:   NODE VALUE ATTR [STORE]
+# globals:  -
+#
+function set_hana_attribute()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local attr_node=$1
+    local attr_value=$2
+    local attr_name=$3
+    local attr_store=${4:-reboot} # DONE: PRIO5 get this (optional) from parameter
+    local rc=1
+    local attr_old=""
+    attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store); get_rc=$?
+    if [ "$attr_old" != "$attr_value" ]; then
+        super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc "
+        crm_attribute -N $attr_node -v $attr_value -n "$attr_name" -l $attr_store; rc=$?
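+        # Illustrative sketch (comment only, not executed): the crm_attribute
+        # calls these helpers wrap, shown for a sample SID "ha1" and node "node1":
+        #   crm_attribute -N node1 -G -n hana_ha1_sync_state -l reboot -q    # query -> e.g. "SOK"
+        #   crm_attribute -N node1 -v SFAIL -n hana_ha1_sync_state -l reboot # set transient value
+        # "-l reboot" keeps the attribute only until the node restarts,
+        # "-l forever" persists it across reboots.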
+    else
+        super_ocf_log debug "DBG: LET attribute $attr_name for node ${attr_node} still be ${attr_value}"
+        rc=0
+    fi
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: assert - quickly go out of here with minimal error/return code handling and log
+# params:   MESSAGE
+# globals:  OCF_*(r)
+#
+function assert() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local err_msg=$1
+    local default_rc=$OCF_NOT_RUNNING
+    # DONE: Check, if we need to distinguish between probe and others
+    if ocf_is_probe; then
+        default_exit=$OCF_NOT_RUNNING
+    else
+        default_exit=$OCF_ERR_CONFIGURED
+    fi
+    if [ "$ACTION" = "stop" ]; then
+        cleanup_instance
+        exit $OCF_SUCCESS
+    fi
+    super_ocf_log err "ACT: $err_msg"
+    exit $default_exit
+}
+
+#
+# function: set_crm_master - set the crm master score of the local node
+# params:   SCORE
+# globals:  HA_SBIN_DIR(r), OCF_RESOURCE_INSTANCE(r)
+#
+function set_crm_master()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    local score=0
+    if [ -n "$1" ]; then
+        score=$1
+    fi
+    # DONE: PRIO2: Only adjust master if value is really different (try to check that)
+    oldscore=$(${HA_SBIN_DIR}/crm_master -G -q -l reboot)
+    if [ "$oldscore" != "$score" ]; then
+        super_ocf_log debug "DBG: SET crm master: $score (old: $oldscore)"
+        ${HA_SBIN_DIR}/crm_master -v $score -l reboot; rc=$?
+    else
+        super_ocf_log debug "DBG: LET crm master: $score"
+        rc=0
+    fi
+    #logger -t fhLOG "crm_master with: $OCF_RESOURCE_INSTANCE -v $score -l reboot"
+    return $rc
+}
+
+#
+# function: scoring_crm_master - score instance due to role and sync match (table SCORING_TABLE_PREFERRED_SITE_TAKEOVER)
+# params:   NODE_ROLES NODE_SYNC_STATUS
+# globals:  SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@],
+#
+scoring_crm_master()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local roles="$1"
+    local sync="$2"
+    local skip=0
+    local myScore=-1
+    for scan in "${SCORING_TABLE_PREFERRED_SITE_TAKEOVER[@]}"; do
+        if [ $skip -eq 0 ]; then
+            read rolePatt syncPatt score <<< $scan
+            if grep -q "$rolePatt" <<< "$roles"; then
+                if grep -q "$syncPatt" <<< "$sync"; then
+                    skip=1
+                    myScore=$score
+                fi
+            fi
+        fi
+    done
+    super_ocf_log debug "DBG: scoring_crm_master adjust score $myScore"
+    set_crm_master $myScore
+}
+
+#
+# function: get_crm_master - get the crm master score of the local node
+# params:   -
+# globals:  HA_SBIN_DIR(r)
+#
+function get_crm_master()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    ${HA_SBIN_DIR}/crm_master -G -q -l reboot; rc=$?
+ return $rc +} + +# +# function: saphana_init - initialize variables for the resource agent +# params: InstanceName +# globals: OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w), SAPVIRHOST(w), PreferSiteTakeover(w), +# globals: sr_name(w), remoteHost(w), otherNodes(w) +# globals: ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_CLONE_STATE(w) +# globals: DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w) +# globals: LPA_DIRECTORY(w), SIDInstanceName(w), remoteNode(w) +# saphana_init : Define global variables with default values, if optional parameters are not set +# +function saphana_init() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=$OCF_SUCCESS + local vName + # two parameter models (for transition only) + # OLD: InstanceName + # NEW: SID InstanceNumber + SID=$OCF_RESKEY_SID + InstanceNr=$OCF_RESKEY_InstanceNumber + SIDInstanceName="${SID}_HDB${InstanceNr}" + InstanceName="HDB${InstanceNr}" + super_ocf_log debug "DBG: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)" + sid=$(echo "$SID" | tr [:upper:] [:lower:]) + sidadm="${sid}adm" + # DONE: PRIO4: SAPVIRHOST might be different to NODENAME + # DONE: PRIO1: ASK: Is the output format of ListInstances fix? Could we take that as an API? Answer: Yes + # try to catch: Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691 + # We rely on the following format: SID is word#4, NR is work#6, vHost is word#8 + vName=$(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances \ + | awk '$4 == SID && $6=NR { print $8 }' SID=$SID NR=$InstanceNr) + if [ -z "$vName" ]; then + # + # if saphostctrl does not know the answer, try to fallback to attribute provided by SAPHanaTopology + # + vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]}); + fi + SAPVIRHOST=${vName} + PreferSiteTakeover="$OCF_RESKEY_PREFER_SITE_TAKEOVER" + SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}" + AUTOMATED_REGISTER="${OCF_RESKEY_AUTOMATED_REGISTER:-false}" + LPA_DIRECTORY=/var/lib/SAPHanaRA + LPA_ATTR=("lpa_${sid}_lpt" "forever") + super_ocf_log debug "DBG: SID=$SID, sid=$sid, SIDInstanceName=$SIDInstanceName, InstanceName=$InstanceName, InstanceNr=$InstanceNr, SAPVIRHOST=$SAPVIRHOST" + ocf_env=$(env | grep 'OCF_RESKEY_CRM') + super_ocf_log debug "DBG: OCF: $ocf_env" + # + ATTR_NAME_HANA_SYNC_STATUS=("hana_${sid}_sync_state" "reboot") # SOK, SFAIL, UNKNOWN? + ATTR_NAME_HANA_PRIMARY_AT=("hana_${sid}_primary_at" "reboot") # Not used so far + ATTR_NAME_HANA_CLONE_STATE=("hana_${sid}_clone_state" "reboot") # UKNOWN?, DEMOTED, PROMOTED + ATTR_NAME_HANA_REMOTEHOST=("hana_${sid}_remoteHost" "forever") + ATTR_NAME_HANA_SITE=("hana_${sid}_site" "forever") + ATTR_NAME_HANA_ROLES=("hana_${sid}_roles" "reboot") + ATTR_NAME_HANA_SRMODE=("hana_${sid}_srmode" "forever") + ATTR_NAME_HANA_VHOST=("hana_${sid}_vhost" "forever") + ATTR_NAME_HANA_STATUS=("hana_${sid}_status" "reboot") + # + # TODO: PRIO4: Table for non-preferred-site-takeover + # + SCORING_TABLE_PREFERRED_SITE_TAKEOVER=( + "[234]*:P:[^:]*:master .* 150" + "[015-9]*:P:[^:]*:master .* 90" + "[0-9]*:P:[^:]*:slave .* 60" + "[0-9]*:P:[^:]*:\? .* 0" + "[0-9]*:P:[^:]*:- .* 0" + "[234]*:S:[^:]*:master SOK 100" + "[015-9]*:S:[^:]*:master SOK 80" + "[0-9]*:S:[^:]*:master SFAIL -INFINITY" + "[0-9]*:S:[^:]*:slave SOK 10" + "[0-9]*:S:[^:]*:slave SFAIL -INFINITY" + "[0-9]*:S:[^:]*:\? 
.* 0" + "[0-9]*:S:[^:]*:- .* 0" + ".* .* -1" + ) + SCORING_TABLE_PREFERRED_LOCAL_RESTART=( + "[0-9]*:P:[^:]*:master .* 150" + "[0-9]*:P:[^:]*:slave .* 140" + "[0-9]*:P:[^:]*:\? .* 0" + "[0-9]*:P:[^:]*:- .* 0" + "[0-9]*:S:[^:]*:master SOK 100" + "[0-9]*:S:[^:]*:master SFAIL -INFINITY" + "[0-9]*:S:[^:]*:slave SOK 10" + "[0-9]*:S:[^:]*:slave SFAIL -INFINITY" + "[0-9]*:S:[^:]*:\? .* 0" + "[0-9]*:S:[^:]*:- .* 0" + ".* .* -1" + ) + # + DUPLICATE_PRIMARY_TIMEOUT="${OCF_RESKEY_DUPLICATE_PRIMARY_TIMEOUT:-7200}" + super_ocf_log debug "DBG: DUPLICATE_PRIMARY_TIMEOUT=$DUPLICATE_PRIMARY_TIMEOUT" + # + # Determine list of other cluster nodes and store in otherNodes variable + otherNodes=() + case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in + *corosync* ) otherNodes=($(crm_node -l | awk '{ if ($2 != me) { print $2 }}' me=${NODENAME}));; + *openais* ) otherNodes=($(crm_node -l | awk '$3 == "member" { if ($2 != me) { print $2 }}' me=${NODENAME}));; + *cman* ) otherNodes=($(crm_node -l | awk '{for (i=1; i<=NF; i++) { if ($i != me) { print $i }}}' me=${NODENAME}));; + esac + + remoteHost=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_REMOTEHOST[@]}); + if [ -z "$remoteHost" ]; then + if [ ${#otherNodes[@]} -eq 1 ]; then # we are a 2 node cluster, lets assume the other is the remote-host + remoteHost=${otherNodes[0]} + remoteNode=$remoteHost + super_ocf_log debug "DBG: auto-guess remoteHost=$remoteHost" + else + super_ocf_log debug "DBG: Could not auto-guess remoteHost out of list (${otherNodes[@]})" + fi + else + # + # search cluster node which vhost is equal remoteHost + # + remoteNode=$(remoteHost2remoteNode $remoteHost) + # TODO: PRIO5: catch rc!=0 + fi + # ATTR_NAME_HANA_SITE + sr_name=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SITE[@]}); + sr_mode=$(get_hana_attribute "${NODENAME}" ${ATTR_NAME_HANA_SRMODE[@]}) + if [ -z "$sr_mode" ]; then + sr_mode="sync" + fi + super_ocf_log debug "DBG: sr_name=$sr_name, remoteHost=$remoteHost, remoteNode=$remoteNode, sr_mode=$sr_mode" + # optional OCF parameters, we try to guess which directories are correct + if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] + then + if have_binary /usr/sap/$SID/$InstanceName/exe/sapstartsrv && have_binary /usr/sap/$SID/$InstanceName/exe/sapcontrol + then + DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" + fi + else + if have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" && have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" + then + DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE" + fi + fi + SAPSTARTSRV="$DIR_EXECUTABLE/sapstartsrv" + SAPCONTROL="$DIR_EXECUTABLE/sapcontrol" + + [ -z "$DIR_EXECUTABLE" ] && assert "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!" 
+ DIR_PROFILE="${OCF_RESKEY_DIR_PROFILE:-/usr/sap/$SID/SYS/profile}" + # check, if the following fall-back is ok, or if there could be multiple profiles matching this pattern + if [ -n "${SAPVIRHOST}" ]; then + SAPSTARTPROFILE="$DIR_PROFILE/${OCF_RESKEY_INSTANCE_PROFILE:-${SID}_${InstanceName}_${SAPVIRHOST}}" + else + # check, if the following fall-back is ok, or if there could be multiple profiles matching this pattern + # also take profile versions into account - they might break this fall-back + # TODO: PRIO4: Check, if it makes sense to implement an additional last fall-back: get the SAPSTARTPROFILE from /usr/sap/sapservices + # + SAPSTARTPROFILE="$(ls -1 $DIR_PROFILE/${OCF_RESKEY_INSTANCE_PROFILE:-${SID}_${InstanceName}_*})" + fi + # as root user we need the library path to the SAP kernel to be able to call sapcontrol + # check, if we already added DIR_EXECUTABLE at the beginning of LD_LIBRARY_PATH + if [ "${LD_LIBRARY_PATH%%*:}" != "$DIR_EXECUTABLE" ] + then + LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH + export LD_LIBRARY_PATH + fi + PATH=${PATH}:${DIR_EXECUTABLE}; export PATH + super_ocf_log info "FLOW $FUNCNAME rc=$OCF_SUCCESS" + ############################# + # TODO: PRIO9: To be able to call landscapeHostConfig.py without su (so as root) + # TODO: PRIO9: Research for environment script .htacces or something like that + #export SAPSYSTEMNAME=ZLF + #export DIR_INSTANCE=/usr/sap/ZLF/HDB02 + #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$DIR_INSTANCE/exe:$DIR_INSTANCE/exe/Python/lib + #export PYTHONPATH=$DIR_INSTANCE/$HOST:$DIR_INSTANCE/exe/python_support:$DIR_INSTANCE/exe + #export PYTHONHOME=$DIR_INSTANCE/exe/Python + #export SAP_RETRIEVAL_PATH=$DIR_INSTANCE/$HOST + #export DIR_EXECUTABLE=$DIR_INSTANCE/exe + ############################# + return $OCF_SUCCESS +} + +# function: check_secstore_users +# params: USER +# globals: DIR_EXECUTABLE(r) +# +# TODO: PRIO5: Might be dropped, if we get a script for fetching the sync status +function check_secstore_users() +{ + super_ocf_log info "FLOW $FUNCNAME ($*)" + local user="" + local rc=1 + while [ $# -gt 0 ]; do + user="$1" + $DIR_EXECUTABLE/hdbuserstore list | grep -q "KEY $user" && echo "$user" && rc=0 && break + shift + done + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: check_sapstartsrv - check for sapstartsrv - optional start +# params: - +# globals: DIR_PROFILE(w), SAPSTARTPROFILE(r), SAPCONTROL(r), SID(r), InstanceName(r), InstanceNr(r), OCF_*(r) +# check_sapstartsrv : Before using sapcontrol we make sure that the sapstartsrv is running. +# +function check_sapstartsrv() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local restart=0 + local runninginst="" + local rc=$OCF_SUCCESS + local output="" + if [ ! -S /tmp/.sapstream5${InstanceNr}13 ]; then + super_ocf_log warn "ACT: sapstartsrv is not running for instance $SID-$InstanceName (no UDS), it will be started now" + restart=1 + else + output=$($SAPCONTROL -nr $InstanceNr -function ParameterValue INSTANCE_NAME -format script) + if [ $? -eq 0 ] + then + runninginst=$(echo "$output" | grep '^0 : ' | cut -d' ' -f3) + if [ "$runninginst" != "$InstanceName" ] + then + super_ocf_log warn "ACT: sapstartsrv is running for instance $runninginst, that service will be killed" + restart=1 + else + output=$($SAPCONTROL -nr $InstanceNr -function AccessCheck Start) + if [ $? 
-ne 0 ]; then + super_ocf_log warn "ACT: FAILED - sapcontrol -nr $InstanceNr -function AccessCheck Start ($(ls -ld1 /tmp/.sapstream5${InstanceNr}13))" + super_ocf_log warn "ACT: sapstartsrv will be restarted to try to solve this situation, otherwise please check sapstsartsrv setup (SAP Note 927637)" + restart=1 + fi + fi + else + super_ocf_log warn "ACT: sapstartsrv is not running for instance $SID-$InstanceName, it will be started now" + restart=1 + fi + fi + if [ -z "$runninginst" ]; then runninginst=$InstanceName; fi + if [ $restart -eq 1 ] + then + if [ -d /usr/sap/$SID/SYS/profile/ ] + then + DIR_PROFILE="/usr/sap/$SID/SYS/profile" + else + assert "Expected /usr/sap/$SID/SYS/profile/ to be a directory, please set DIR_PROFILE parameter!" + fi + [ ! -r $SAPSTARTPROFILE ] && assert "Expected $SAPSTARTPROFILE to be the instance START profile, please set INSTANCE_PROFILE parameter!" + pkill -9 -f "sapstartsrv.*$runninginst" + # removing the unix domain socket files as they might have wrong permissions + # or ownership - they will be recreated by sapstartsrv during next start + rm -f /tmp/.sapstream5${InstanceNr}13 + rm -f /tmp/.sapstream5${InstanceNr}14 + $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm + # now make sure the daemon has been started and is able to respond + local srvrc=1 + while [ $srvrc -eq 1 -a $(pgrep -f "sapstartsrv.*$runninginst" | wc -l) -gt 0 ] + do + sleep 1 + $SAPCONTROL -nr $InstanceNr -function GetProcessList > /dev/null 2>&1 + srvrc=$? + done + if [ $srvrc -ne 1 ] + then + super_ocf_log info "ACT: sapstartsrv for instance $SID-$InstanceName was restarted!" + rc=$OCF_SUCCESS + else + super_ocf_log error "ACT: sapstartsrv for instance $SID-$InstanceName could not be started!" + rc=$OCF_ERR_GENERIC + ocf_is_probe && rc=$OCF_NOT_RUNNING + fi + fi + return $rc +} + +# +# function: cleanup_instance - remove resources from a crashed instance +# params: - +# globals: - +# +function cleanup_instance() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=0 + # TODO: PRIO5: Check, if we need HANA cleanup procedure (processes, ipc obj, pid files); Currently not needed + super_ocf_log debug "DBG: cleanup_instance currently not implemented" + rc=0 + super_ocf_log info "FLOW $FUNCNAME rc=$rc" +} + +# +# function: check_for_primary - check if local SAP HANA is configured as primary +# params: - +# globals: HANA_STATE_PRIMARY(r), HANA_STATE_SECONDARY(r), HANA_STATE_DEFECT(r) +# +function check_for_primary() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=$HANA_STATE_DEFECT + node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) + node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') + super_ocf_log debug "DBG: check_for_primary: node_status=$node_status" + for i in 1 2 3 4 5 6 7 8 9; do + case "$node_status" in + primary ) + super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_PRIMARY" + return $HANA_STATE_PRIMARY;; + syncmem | sync | async ) + super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_SECONDARY" + return $HANA_STATE_SECONDARY;; + none ) # have seen that mode on second side BEFEORE we registered it as replica + super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_STANDALONE" + return $HANA_STATE_STANDALONE;; + * ) + super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>" + dump=$( echo $node_status | hexdump -C ); + super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>" + node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null ) + 
node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}') + super_ocf_log debug "DEC: check_for_primary: loop=$i: node_status=$node_status" + # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes + esac; + done + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: analyze_hana_sync_status - query and check hana system replication status +# params: - +# globals: DIR_EXECUTABLE(r), remoteHost(r) +# get the HANA sync status +# +function analyze_hana_sync_status() +{ + super_ocf_log info "FLOW $FUNCNAME ($*)" + local -a clusterNodes=() + local cl="" + local vHost="" + local n="" + local hana_sync_status="" what_does_the_chamelion_say="" + local secUser="SLEHALOC" + local chkusr; + local rc=0 + local sqlrc=0 +# local query_state='select distinct REPLICATION_STATUS from SYS.M_SERVICE_REPLICATION' +# select distinct REPLICATION_STATUS from SYS.M_SERVICE_REPLICATION where SITE_NAME='"SITE1"'" + local query_state="select distinct REPLICATION_STATUS from SYS.M_SERVICE_REPLICATION where SITE_NAME='"${sr_name}"'" + local query_secondaries='select distinct SECONDARY_HOST from SYS.M_SERVICE_REPLICATION' + local query_failed_secondaries="select distinct SECONDARY_HOST from SYS.M_SERVICE_REPLICATION where SECONDARY_SITE_NAME = (select distinct SECONDARY_SITE_NAME from SYS.M_SERVICE_REPLICATION WHERE REPLICATION_STATUS != 'ACTIVE')" + local all_cluster_hosts all_secondary_hosts all_broken_secondaries +# +##################################################################################################### +# +# select distinct SITE_NAME, HOST, REPLICATION_STATUS, SECONDARY_SITE_NAME, SECONDARY_HOST from SYS.M_SERVICE_REPLICATION +# +# ===> "Walldorf", "sap-app-8" "ACTIVE", "Rot", "sap-app-5" +# "Rot", "sap-app-5", "ACTIVE", "oslo", "sap-app-7" +# +##################################################################################################### +# + secUser=$(check_secstore_users SAPHANA${SID}SR SLEHALOC RHELHALOC) ; chkusr=$? + if [ $chkusr -ne 0 ]; then + super_ocf_log err "ACT: Secure store users are missing (see best practice manual how to setup the users)" + rc=$OCF_ERR_CONFIGURED + fi + hana_sync_status=$(timeout 60 $DIR_EXECUTABLE/hdbsql -a -x -U $secUser $query_state); sqlrc=$? + hana_sync_status=$(echo $hana_sync_status | dequote) + super_ocf_log debug "DBG: hdbsql rc=$sqlrc hana_sync_status=\"$hana_sync_status\"" + if [ "$sqlrc" -eq 0 -a "$hana_sync_status" != "" ]; then + # + # UNKNOWN, ACTIVE, ERROR, INITIALIZING + # + if [ "${hana_sync_status}" == "ACTIVE" ]; then + # TODO PRIO1: REMOVE remoteNode dependency - set SOK + set_hana_attribute "$remoteNode" "SOK" ${ATTR_NAME_HANA_SYNC_STATUS[@]} + else + super_ocf_log warn "ACT: HANA SYNC STATUS is: ${hana_sync_status}" + # TODO PRIO1: REMOVE remoteNode dependency - set SFAIL + set_hana_attribute "$remoteNode" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} + fi + # first get a list of all secondary hosts, than a list of all secondary hosts, if the is ANY failure at this site + # TODO: PRIO9: for first we assume there is only ONE secondary site (like ROT) + # TODO: PRIO3: should we loop over all cluster nodes fetching their roles-attribute? To minimize sql-queries? + # + all_secondary_hosts=$(timeout 60 hdbsql -a -x -U $secUser $query_secondaries ); sqlrc=$? + all_secondary_hosts=$(echo $all_secondary_hosts | dequote); + if [ "$sqlrc" -eq 0 ]; then + all_broken_secondary_hosts=$(timeout 60 hdbsql -a -x -U $secUser $query_failed_secondaries); sqlrc=$? 
+ all_broken_secondary_hosts=$(echo $all_broken_secondary_hosts | dequote); + if [ "$sqlrc" -eq 0 ]; then + if [ -n "$all_broken_secondary_hosts" ]; then + # + # we have a broken secondary site - set all hosts to "SFAIL" + # + # Note: since HANA hostname can be different from nodename we need to check all vhost attributes + for n in $all_broken_secondary_hosts; do + for cl in ${otherNodes[@]}; do + vHost=$(get_hana_attribute $cl ${ATTR_NAME_HANA_VHOST[@]}) + if [ "$vHost" = "$n" ]; then # we found the correct node + set_hana_attribute $cl "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} + fi + done + done + fi + fi + fi + else + # return codes 19: license error -> set SFAIL! + case "$sqlrc" in + 19 ) + # DONE: PRIO1: We should NOT set SFAIL, if HDB is exactly broken now + # When HDB breaks during monitor this could prevent a prositive remote failover + super_ocf_log warn "ACT: Was not able to fetch HANA SYNC STATUS - set sync status to SFAIL for ALL OTHER cluster hosts" + for n in $otherNodes; do + set_hana_attribute "$n" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} + done + ;; + esac + fi + return $rc +} + +# +# function: get_hana_landscape_status - figure out hana ladscape status +# params: - +# globals: sidadm(r), DIR_EXECUTABLE(r) +# +function get_hana_landscape_status() +{ + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=0 + # + su - $sidadm -c "python $DIR_EXECUTABLE/python_support/landscapeHostConfiguration.py" 1>/dev/null 2>/dev/null; rc=$? + return $rc; +} + +# +# function: register_hana_secondary - register local hana as secondary to the other site +# params: - +# globals: sidadm(r), remoteHost(r), InstanceNr(r), sr_mode(r), sr_name(r) +# register_hana_secondary +# +function register_hana_secondary() +{ + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=2; + local remoteInstance=""; + remoteInstance=$InstanceNr + if ocf_is_true ${AUTOMATED_REGISTER}; then + super_ocf_log info "ACT: REGISTER: hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name" + su - $sidadm -c "hdbnsutil -sr_register --remoteHost=$remoteHost --remoteInstance=$remoteInstance --mode=$sr_mode --name=$sr_name"; rc=$? + else + super_ocf_log info "ACT: IGNORE REGISTER because AUTOMATED_REGISTER is set to FALSE" + rc=1 + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc; +} + +# +# function: saphana_status - pure status check +# params: - +# globals: SIDInstanceName, OCF_*, +function saphana_status() { + local binDeam="hdb.sap${SIDInstanceName}" rc=0 + binDeam=${binDeam:0:15} # Process name is limited to the first 15 characters + if pgrep $binDeam 1>/dev/null; then rc=$OCF_SUCCESS; else rc=$OCF_NOT_RUNNING; fi + return $rc +} + +# +# function: saphana_start - start a hana instance +# params: - +# globals: OCF_*, SAPCONTROL, InstanceNr, SID, InstanceName, +# +function saphana_start() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=$OCF_NOT_RUNNING + local output="" + local loopcount=0 + check_sapstartsrv + rc=$? + # + # TODO: ASK: PRIO5: For SCALE-OUT - do we need to use an other call like StartSystem? Or better to use the HDB command? + # + if [ $rc -eq $OCF_SUCCESS ]; then + output=$($SAPCONTROL -nr $InstanceNr -function Start) + rc=$? + super_ocf_log info "ACT: Starting SAPHANA Instance $SID-$InstanceName: $output" + fi + if [ $rc -eq 0 ] + then + # TODO: PRIO9: something more dynamic than 3600 seconds in WaitforStarted + output=$($SAPCONTROL -nr $InstanceNr -function WaitforStarted 3600 1) + if [ $? 
-eq 0 ] + then + super_ocf_log info "ACT: SAPHANA Instance $SID-$InstanceName started: $output" + rc=$OCF_SUCCESS + else + super_ocf_log err "ACT: SAPHANA Instance $SID-$InstanceName start failed: $output" + rc=$OCF_ERR_GENERIC + fi + else + super_ocf_log err "ACT: SAPHANA Instance $SID-$InstanceName start failed: $output" + rc=$OCF_ERR_GENERIC + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: saphana_stop - stop a hana instance +# params: - +# globals: OCF_*(r), SAPCONTROL(r), SID(r), InstanceName(r) +# saphana_stop: Stop the SAP instance +# +function saphana_stop() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local output="" + local rc=0 + check_sapstartsrv; rc=$? + if [ $rc -eq $OCF_SUCCESS ]; then + output=$($SAPCONTROL -nr $InstanceNr -function Stop) + rc=$? + super_ocf_log info "ACT: Stopping SAP Instance $SID-$InstanceName: $output" + fi + if [ $rc -eq 0 ] + then + output=$($SAPCONTROL -nr $InstanceNr -function WaitforStopped 3600 1) + if [ $? -eq 0 ] + then + super_ocf_log info "ACT: SAP Instance $SID-$InstanceName stopped: $output" + rc=$OCF_SUCCESS + else + super_ocf_log err "ACT: SAP Instance $SID-$InstanceName stop failed: $output" + rc=$OCF_ERR_GENERIC + fi + else + super_ocf_log err "ACT: SAP Instance $SID-$InstanceName stop failed: $output" + rc=$OCF_ERR_GENERIC + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: saphana_validate - validation of (some) variables/parameters +# params: - +# globals: OCF_*(r), SID(r), InstanceName(r), InstanceNr(r), SAPVIRHOST(r) +# saphana_validate: Check the symantic of the input parameters +# +function saphana_validate() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=$OCF_SUCCESS + # + # SID is Alpha-AlphaNumeric-Alphanumeric? + # + if [ $(echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$') -ne 1 ] + then + super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!" + rc=$OCF_ERR_ARGS + fi + # + # InstanceNr is a two-Digit? + # + if [ $(echo "$InstanceNr" | grep -c '^[0-9][0-9]$') -ne 1 ] + then + super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!" + rc=$OCF_ERR_ARGS + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: saphana_start_primary - handle startup of PRIMARY in M/S +# params: +# globals: OCF_*(r), NODENAME, ATTR_NAME_*, HANA_STATE_*, +# +function saphana_start_primary() +{ + super_ocf_log info "FLOW $FUNCNAME ($*)" + local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING + local lss sqlrc; + local rc=0 + local lpa_dec=4 + local lpa_advice="" + # + # we will be a master (PRIMARY) so checking, if the is an OTHER master + # + super_ocf_log debug "DBG: saphana_primary - check_for_primary reports HANA_STATE_PRIMARY" + # + lpa_init_lpt $HANA_STATE_PRIMARY + lpa_check_lpt_status; lpa_dec=$? + get_hana_landscape_status; lss=$? + my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]}) + my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) + case "$lpa_dec" in + 0 ) # LPA says start-up + lpa_advice="start" + ;; + 1) # LPA says register! + lpa_advice="register" + ;; + 2) # LPA says wait for second LPT + lpa_advice="wait" + ;; + 3 | 4 ) # LPA says something is completely wrong - FAIL resource + lpa_advice="fail" + ;; + * ) # LPA failed with an unkonown status - FAIL resource + lpa_advice="fail" + ;; + esac + + # DONE: PRIO2: Do we need to differ 0 and 1 here? 
While 0 is a fatal SAP error, 1 for down/error + if [ $lss -eq 0 ]; then + super_ocf_log err "ACT: get_hana_landscape_status reports FATAL" + # DONE: PRIO1: what to do for lss=0? + # TODO: PRIO3: Check, if OCF_ERR_GENERIC is best reaction + lpa_advice="skip" + rc=$OCF_ERR_GENERIC + fi + case "$lpa_advice" in + start ) # process a normal START + case "$lss" in + 2 | 3 | 4 ) # as landcape says we are up - just set the scores and return code + super_ocf_log info "LPA: landcape: UP, LPA: start ==> keep running" + LPTloc=$(date '+%s') + lpa_set_lpt $LPTloc + rc=$OCF_SUCCSESS + ;; + 1 ) # landcape says we are down, lets start and adjust scores and return code + super_ocf_log info "LPA: landcape: DOWN, LPA: start ==> start instance" + saphana_start + rc=$? + LPTloc=$(date '+%s') + lpa_set_lpt $LPTloc + ;; + esac + scoring_crm_master "$my_role" "$my_sync" + ;; + register ) # process a REGISTER + case "$lss" in + 2 | 3 | 4 ) # upps we are up - but shoudn't? - we could not register with started HDB + # DONE: PRIO3: check if this reaction is correct - tell cluster about failed start + super_ocf_log info "LPA: landcape: UP, LPA: register ==> take down" + set_crm_master -inf + rc=$OCF_NOT_RUNNING + ;; + 1 ) # lets try to register + # DONE: PRIO2: Like Action in start_secondary + super_ocf_log info "LPA: landcape: DOWN, LPA: register ==> try to register" + super_ocf_log info "DEC: AN OTHER HANA IS AVAILABLE ==> LETS REGISTER" + set_crm_master 0 + if wait_for_primary_master 1; then + register_hana_secondary + check_for_primary; primary_status=$? + if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then + super_ocf_log info "ACT: Register successful" + lpa_push_lpt 10 + lpa_set_lpt 10 + set_crm_master 0 + saphana_start_secondary + rc=$? + lpa_set_lpt 30 + else + super_ocf_log err "ACT: Register failed" + rc=$OCF_NOT_RUNNING + fi + else + # lets check next monitor, if we can register + rc=$OCF_SUCCESS + fi + ;; + esac + ;; + wait ) # process a WAIT + case "$lss" in + 2 | 3 | 4 ) # as we ARE up we just keep it up + # TODO: PRIO3: I now change from "just keep it up to take that down" + # TODO: PRIO3: OCF_SUCCSESS, OCF_NOT_RUNNING or OCF_ERR_xxxx ? + set_crm_master -9000 + #scoring_crm_master "$my_role" "$my_sync" + rc=$OCF_ERR_GENERIC + ;; + 1 ) # we are down, so we should wait --> followup in next monitor + super_ocf_log info "LPA: landcape: DOWN, LPA: wait ==> keep waiting" + # TODO: PRIO3: Check, if WAITING is correct here + set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -9000 + rc=$OCF_SUCCSESS + ;; + esac + ;; + fail ) # process a lpa FAIL + super_ocf_log info "LPA: LPA reports FAIL" + set_crm_master -inf + rc=$OCF_NOT_RUNNING + ;; + esac + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# check_for_primary_master +# params: - +# globals: ATTR_NAME_HANA_ROLES[@], NODENAME +# +check_for_primary_master() +{ + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=1 + local ch ch_role + # + # get actual list of cluster members + # + if [ -n "$otherNodes" ]; then + for ch in ${otherNodes[@]}; do + if [ $rc -eq 1 ]; then + ch_role=$(get_hana_attribute ${ch} ${ATTR_NAME_HANA_ROLES[@]}) +# TODO: PRIO3: check if [0-9], [234] or [34] is correct +# TODO: PRIO4: Do we need different checks like "any-primary-master" or "running-primary-master" ? 
+# grep '[0-9]*:P:[^:]*:master:' <<< $ch_role && rc=0 +# grep '[34]:P:[^:]*:master:' <<< $ch_role && rc=0 +# Match "Running+Available Primary" Master -> Match field 1: 3/4, 2: P, 4: master + awk -F: 'BEGIN { rc=1 } + $1 ~ "[34]" && $2 ="P" && $4="master" { rc=0 } + END { exit rc }' <<< $ch_role ; rc=$? + fi + done + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# wait_for_primary_master: wait some time till a running primary master is shown in attributes +# params: optional: loop count - currently time in 10s waiting loop +# globals: - +# +wait_for_primary_master() +{ + local wait=1 + local rc=1 + local loops=${1:-0} + local count=0 + super_ocf_log info "FLOW $FUNCNAME ($*)" + # + # hana_ndb_roles=primary:master1:master:worker:master + # + while [ "$wait" -eq 1 ]; do + if check_for_primary_master; then + wait=0 + rc=0 + else + if [ $loops -gt 0 ]; then + (( count++ )) + if [ $count -gt $loops ]; then + wait=0 + rc=1 + fi + fi + sleep 10 + fi + done + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: saphana_start_secondary - handle startup of PRIMARY in M/S +# params: +# globals: OCF_*(r), NODENAME, ATTR_NAME_*, +# +function saphana_start_secondary() +{ + super_ocf_log info "FLOW $FUNCNAME ($*)" + local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING + local sqlrc; + set_crm_master 0 + # + ####### LPA - begin + # + lpa_push_lpt 10 + lpa_set_lpt 10 + # + ####### LPA - end + # + # + # we would be slave (secondary) + # we first need to check, if there are Master Nodes, because the Scecondary only starts + # successfuly, if the Primary is available. Thatfore we mark the Secondary as "WAITING" + # DONE: PRIO3: wait_for_primary_master 10 is just a test value: 10 loops x10 seconds than go to WAITING + # DONE: PRIO3: rename 'wait_for_primary_master' to match better the use case ("wait_some_time") + # + super_ocf_log debug "DBG: wait for promoted side" + # TODO: PRIO3: Check if setting SFAIL during secondary start is ok + set_hana_attribute "${NODENAME}" "SFAIL" ${ATTR_NAME_HANA_SYNC_STATUS[@]} + if wait_for_primary_master 10; then + saphana_start; rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + if ! 
wait_for_primary_master 1; then + # It seams the stating secondary could not start because of stopping primary + # so this is a WAITING situation + super_ocf_log info "ACT: PRIMARY seams to be down now ==> WAITING" + set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -INFINITY + rc=$OCF_SUCCSESS + fi + else + lpa_set_lpt 30 + fi + else + super_ocf_log info "ACT: wait_for_primary_master ==> WAITING" + set_hana_attribute ${NODENAME} "WAITING" ${ATTR_NAME_HANA_CLONE_STATE[@]} + set_crm_master -INFINITY + rc=$OCF_SUCCSESS + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: lpa_get_lpt - get lpt from cluster +# params: NODE +# output: LPT +# rc: rc=0: OK, rc=1: InternalERROR, rc=2: ERROR +# globals: LPA_ATTR_*, +# +function lpa_get_lpt() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=1 + local node=$1 + local lpt="" + lpt=$(get_hana_attribute ${node} ${LPA_ATTR[@]}) + if [ -n "$lpt" ]; then + rc=0 + echo $lpt + else + rc=2 + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: lpa_set_lpt - set lpt in cluster +# params: LPT [node] +# globals: LPA_ATTR(r), NODENAME(r), +# rc: rc=0: OK, rc=1: InternalERROR, rc=2: ERROR +# +function lpa_set_lpt() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=1 + local crm_rc=1 + local lpt=$1 + local clpt=-1 + local node=${2:-${NODENAME}} + set_hana_attribute ${node} "$lpt" ${LPA_ATTR[@]}; crm_rc=$? + clpt=$(lpa_get_lpt $NODENAME) + if [ "$lpt" != "$clpt" ]; then + rc=2 + else + rc=0 + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: lpa_pull_lpt - fetch lpt from file +# params: - +# globals: LPA_DIRECTORY(r), sid, NODENAME +# output: LPT +# rc: rc=0: OK, rc=1: InternalERROR, rc=2: ERROR +# +function lpa_pull_lpt() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=1 + local lpt="" + local readrest=0 + local lpa_file=$LPA_DIRECTORY/lpa_${sid}_${NODENAME} + if [ -f $lpa_file ]; then + read lpt readrest <<<$(cat $lpa_file) # exactly load first word from file to lpt + fi + if [ -n "$lpt" ]; then + rc=0 + echo $lpt + else + rc=2 + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: lpa_push_lpt - put lpt to file +# params: LPT +# globals: LPA_DIRECTORY(r), sid, NODENAME +# output: -- +# rc: rc=0: OK, rc=1: InternalERROR, rc=2: ERROR +# +function lpa_push_lpt() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local lpt=$1 + local clpt=-1 + local rc=1 + local lpa_file=$LPA_DIRECTORY/lpa_${sid}_${NODENAME} + # + mkdir -p $LPA_DIRECTORY + echo "$lpt" > $lpa_file + clpt=$(lpa_pull_lpt); lpt_rc=$? + if [ "$clpt" != "$lpt" -o "$lpt_rc" -ne 0 ]; then + rc=2 + else + rc=0 + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: lpa_init_lpt - initialize local lpt, if needed +# params: HANA_STATE +# globals: HANA_STATE_*(r), LPA_DIRECTORY(r), sid(r), NODENAME(r), +# lpa_init_lpt +# +# Returncodes: +# rc=0: OK, rc=1 InternalERROR, rc=2: ERROR +# +# Initializing (if NO local LPT-file): +# SECONDARY sets to 0 +# PRIMARY sets to 1 +# +function lpa_init_lpt() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=1 + local LPTloc=-1 + local LPTrem=-1 + local hana_state=$1 + local lpa_file=$LPA_DIRECTORY/lpa_${sid}_${NODENAME} + mkdir -p $LPA_DIRECTORY + LPTloc=$(lpa_get_lpt ${NODENAME}) || LPTloc=$(lpa_pull_lpt) || \ + if [ "$hana_state" -eq "$HANA_STATE_PRIMARY" ]; then # Initialize for Primary + # init primary + LPTloc=20 + lpa_push_lpt "20"; rc=$? 
+ elif [ "$hana_state" -eq "$HANA_STATE_SECONDARY" ]; then # Initialize for Secondary + # init secondary + LPTloc=10 + lpa_push_lpt "10"; rc=$? + else + rc=2 + fi + lpa_set_lpt $LPTloc + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: lpa_check_lpt_status - start a hana clone instance +# params: - +# globals: DUPLICATE_PRIMARY_TIMEOUT, NODENAME, remoteNode +# lpa_check_lpt_status +# +# Returncodes: +# +# Initializing (if NO local LPT-file): +# SECONDARY sets to 10 +# PRIMARY sets to 20 +# +# LPRlocal OR LPTremore ARE real lpt (>1000) +# THEN: +# Bigger LPR wins, if delta-gab is OK +# LPTlocal >> LPTremore ===> rc=0 (start) +# LPTRemote >> LPTlocal ===> rc=1 (register) +# Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait) +# LPRlocal AND LPTremore ARE NOT real lpt (<=1000) +# THEN: +# Bigger LPT wins +# LPTlocal > LPTremore ===> rc=0 (start) +# LPTRemote > LPTlocal ===> rc=1 (register) +# Stalemate in all other cases ==> STALEMATE-HANDLING ===> rc=2 (wait) +# LPTRemote is not initialized (0) +# THEN: +# WAIT ==> like STALEMATE-HANDLING ===> rc=2 (wait) +# +function lpa_check_lpt_status() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=0 + local LPTloc=-1 + local LPTrem=-1 + local LPTMark=1000 + local delta=0 + # + # First GET LPT from ATTR-FILE-DEFAULT + # + LPTloc=$(lpa_get_lpt $NODENAME); lparc=$? # ATTR + if [ "$lparc" -ne 0 ]; then + # as a fallback try to fetch the value from external status file + LPTloc=$(lpa_pull_lpt); # FILE + lparc=$? + if [ -z "$LPTloc" -o "$LPTloc" -eq -1 -o "$lparc" -ne 0 ]; then + # last option - try to initialize as PRIMARY + lpa_push_lpt 20 + lpa_set_lpt 20 + LPTloc=20 # DEFAULT + fi + fi + # TODO PRIO1: REMOVE remoteNode dependency - lpa_get_lpt + LPTrem=$(lpa_get_lpt $remoteNode); lparc=$? 
+ if [ $lparc -ne 0 ]; then + # LPT of the other node could not be evaluated - LPA says WAIT + super_ocf_log debug "DBG: LPA: LPTloc=$LPTloc, LPTrem undefined ==> WAIT" + rc=2 + else + super_ocf_log debug "DBG: LPA: LPTloc ($LPTloc) LPTrem ($LPTrem) delta ($delta)" + if [ $LPTloc -lt $LPTMark -a $LPTrem -lt $LPTMark ]; then + delta=0 # both lpts are not a real timestamp so just take the greater one + else + delta=$DUPLICATE_PRIMARY_TIMEOUT # at least one of the lpts is a real timestamp so include delta-gap + fi + if (( delta < LPTloc - LPTrem )); then + # We are the winner - LPA says STARTUP + super_ocf_log debug "DBG: LPA: LPTloc wins $LPTloc > $LPTrem + $delta ==> START" + rc=0 + elif (( delta < LPTrem - LPTloc )); then + if ocf_is_true "$AUTOMATED_REGISTER" ; then + # The other one has won - LPA says REGISTER + super_ocf_log debug "DBG: LPA: LPTrem wins $LPTrem > $LPTloc + $delta ==> REGISTER" + rc=1 + else + super_ocf_log debug "DBG: LPA: LPTrem wins $LPTrem > $LPTloc + $delta BUT AUTOMATED_REGISTER='false' ==> WAIT" + rc=2 + fi + + else + super_ocf_log debug "DBG: LPA: Difference between LPTloc and LPTrem is less than delta ($delta) ==> WAIT" + # TODO: PRIO3: ADD STALEMATE-HANDLING HERE; currently admin should set one of the lpa to 20 + rc=2 + fi + fi + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc +} + +# +# function: saphana_start_clone - start a hana clone instance +# params: - +# globals: OCF_*, ATTR_NAME_*, HANA_STATE_*, NODENAME +# saphana_start_clone +# +function saphana_start_clone() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local primary_status sync_attr score_master rc=$OCF_NOT_RUNNING + local sqlrc; + local chkusr; + # TODO: PRIO4: remove check_secstore_users later + secUser=$(check_secstore_users SAPHANA${SID}SR SLEHALOC RHELHALOC) ; chkusr=$? + if [ $chkusr -ne 0 ]; then + super_ocf_log err "ACT: Secure store users are missing (see best practice manual how to setup the users)" + rc=$OCF_ERR_CONFIGURED + else + set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} + check_for_primary; primary_status=$? + if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then + saphana_start_primary; rc=$? + else + saphana_start_secondary; rc=$? + lpa_set_lpt 30 + fi + fi + return $rc +} + +# +# function: saphana_stop_clone - stop a hana clone instance +# params: - +# globals: NODENAME(r), HANA_STATE_*(r) +# saphana_stop_clone +# +function saphana_stop_clone() { + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=0 + local primary_status="x" + set_hana_attribute ${NODENAME} "UNDEFINED" ${ATTR_NAME_HANA_CLONE_STATE[@]} + check_for_primary; primary_status=$? + if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then + lpa_set_lpt 10 + fi + saphana_stop; rc=$? 
+ return $rc +} + +# +# function: saphana_monitor_primary - monitor a hana clone instance +# params: - +# globals: HANA_STATE_*(r), remoteHost, NODENAME, ATTR_NAME_*, OCF_*, PreferSiteTakeover +# +function saphana_monitor_primary() +{ + super_ocf_log info "FLOW $FUNCNAME ($*)" + local rc=$OCF_ERR_GENERIC + local promoted=0 + local init_attribute=0 + local LPTloc=-1 + local lparc=4 + local lss + local remoreSync="" + local my_role="" + # + # OK, we are running/are configured as HANA PRIMARY + # + super_ocf_log debug "DBG: saphana_monitor_clone: HANA_STATE_PRIMARY" + # + ##### CHECK, IF WE ARE DEMOTED (CLUSTER NODE ATTRIBUTE) + # + promote_attr=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_CLONE_STATE[@]}) + super_ocf_log debug "DBG: saphana_monitor_clone: $ATTR_NAME_HANA_CLONE_STATE=$promote_attr" + if [ -z "$promote_attr" ]; then + init_attribute=1 + promoted=0; + else + case "$promote_attr" in + PROMOTED ) + promoted=1; + ;; + DEMOTED ) + promoted=0; + ;; + WAITING ) + # DONE: lpa_check_lpt_status to come out of here :) + # DONE: PRIO2: CHECK IF THE FIX FOR COMING OUT OF WAITING IS CORRECT + get_hana_landscape_status; lss=$? + if [ $lss -ge 2 ]; then + # seems admin already decided that for us? -> we are running - set DEMOTED + promoted=0; + LPTloc=$(date '+%s') + lpa_set_lpt $LPTloc + fi + lpa_check_lpt_status; lparc=$? + if [ $lparc -ne 2 ]; then + # lpa - no need to wait any longer - lets try a new start + saphana_start_clone + rc=$? + super_ocf_log info "FLOW $FUNCNAME rc=$rc" + return $rc + else + lpa_init_lpt $HANA_STATE_PRIMARY + # still waiting for second site to report lpa-lpt + if ocf_is_true "$AUTOMATED_REGISTER" ; then + super_ocf_log info "LPA: Still waiting for remote site to report LPA status" + else + super_ocf_log info "LPA: Dual primary detected and AUTOMATED_REGISTER='false' ==> WAITING" + fi + + return $OCF_SUCCESS + fi + promoted=0; + ;; + UNDEFINED ) + if ocf_is_probe; then + promoted=0; + else + set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} + promoted=0; + fi + ;; + * ) + promoted=0; + ;; + esac + fi + get_hana_landscape_status; lss=$? + super_ocf_log debug "DBG: saphana_monitor_clone: get_hana_landscape_status=$lss" + case "$lss" in + 0 ) # FATAL or ERROR + rc=$OCF_ERR_GENERIC + ;; + 1 ) # DOWN or ERROR + # DONE: PRIO2: Maybe we need to differ between 0 and 1. While 0 is a fatal sap error, 1 is down/error + if ocf_is_probe; then + # + # leave master score untouched, only set return code + # + rc=$OCF_NOT_RUNNING + else + if [ "$promoted" -eq 1 ]; then + # INSTANCE IS FAILED PRIMARY IN PROMOTED STATE + # DONE: PRIO2: Adjust with set_crm_master? + # For Migration it would be good to decrease master score + # For Reload locally we should NOT adjust the master score + # ===> Should we rely on the migration threshold? 
+ # set_crm_master + if ocf_is_true "${PreferSiteTakeover}" ; then + # + # DONE: PRIO1: first check, if remote site is already (and still) in sync + # TODO: PRIO4: Decide if penality (-9000) or weak (5) is better here to cover situations where other clone is gone + # + # TODO PRIO1: REMOVE remoteNode dependency - get_sync_status + remoteSync=$(get_hana_attribute $remoteNode ${ATTR_NAME_HANA_SYNC_STATUS[@]}) + case "$remoteSync" in + SOK ) + super_ocf_log info "DEC: PreferSiteTakeover selected so decrease promotion score here (and reset lpa)" + set_crm_master 5 + if check_for_primary_master; then + lpa_set_lpt 20 + fi + ;; + SFAIL ) + super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync (SFAIL) ==> local restart preferred" + ;; + * ) + super_ocf_log info "DEC: PreferSiteTakeover selected BUT remoteHost is not in sync ($remoteSync) ==> local restart preferred" + ;; + esac + else + # TODO: PRIO5: SCALE-OUT ONLY? Implement for local restart + # It maybe that for the local restart we only need to decrease the secondaries promotion score + #super_ocf_log info "DEC: PreferSiteTakeover selected so decrease promotion score here" + my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]}) + my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) + scoring_crm_master "$my_role" "$my_sync" + rc=$OCF_FAILED_MASTER + fi + rc=$OCF_FAILED_MASTER + else + # INSTANCE IS FAILED PRIMARY IN DEMOTED STATE + # TODO: PRIO3: Adjust with set_crm_master? + # Current decission: Do NOT adjust master score now as other + # steps should already have done that + # + rc=$OCF_NOT_RUNNING + fi + fi + ;; + 2 | 3 | 4 ) # WARN, INFO or OK + if ocf_is_probe; then + rc=$OCF_SUCCESS + else + LPTloc=$(date '+%s') + lpa_set_lpt $LPTloc + lpa_push_lpt $LPTloc + if [ "$promoted" -eq 1 ]; then + set_hana_attribute "$NODENAME" "PRIM" ${ATTR_NAME_HANA_SYNC_STATUS[@]} + rc=$OCF_RUNNING_MASTER + else + if [ "$init_attribute" -eq 1 ]; then + set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]} + rc=$OCF_RUNNING_MASTER + else + rc=$OCF_SUCCESS + fi + fi + my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]}) + my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]}) + case "$my_role" in + [12]:P:*:master:* ) # primary is down or may not anser hdbsql query so drop analyze_hana_sync_status + ;; + [34]:P:*:master:* ) # primary is up and should now be able to anser hdbsql query + analyze_hana_sync_status + ;; + esac + rem_role=$(get_hana_attribute ${remoteHost} ${ATTR_NAME_HANA_ROLES[@]}) + rem_clone_status=$(get_hana_attribute ${remoteHost} ${ATTR_NAME_HANA_CLONE_STATE[@]}) + if [ "$promote_attr" = "DEMOTED" -a "$rem_clone_status" = "PROMOTED" ]; then + case "$rem_role" in + [234]:P:* ) # dual primary, but other instance marked as PROMOTED by the cluster + lpa_check_lpt_status; again_lpa_rc=$? 
+                            if [ $again_lpa_rc -eq 2 ]; then
+                                super_ocf_log info "DEC: Dual primary detected, other instance is PROMOTED and lpa stalemate ==> local restart"
+                                lpa_set_lpt 10
+                                lpa_push_lpt 10
+                                rc=$OCF_NOT_RUNNING
+                            fi
+                            ;;
+                    esac
+                fi
+                scoring_crm_master "$my_role" "$my_sync"
+            fi
+            ;;
+        * ) # UNDEFINED STATUS
+            if ocf_is_probe; then
+                rc=$OCF_NOT_RUNNING
+            else
+                if [ "$promoted" -eq 1 ]; then
+                    rc=$OCF_FAILED_MASTER
+                else
+                    rc=$OCF_NOT_RUNNING
+                fi
+            fi
+            ;;
+    esac
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: saphana_monitor_secondary - monitor the secondary instance of a HANA clone
+# params:   -
+# globals:  OCF_*, ATTR_NAME_*, NODENAME
+# saphana_monitor_secondary
+#
+function saphana_monitor_secondary()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=$OCF_ERR_GENERIC
+    local promoted=0
+    local init_attribute=0
+    local lss
+    #
+    # OK, we are running as HANA SECONDARY
+    #
+    if ! lpa_get_lpt ${NODENAME}; then
+        lpa_set_lpt 10
+        lpa_push_lpt 10
+    fi
+    promote_attr=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_CLONE_STATE[@]})
+    super_ocf_log debug "DBG: saphana_monitor_clone: $ATTR_NAME_HANA_CLONE_STATE=$promote_attr"
+    if [ -z "$promote_attr" ]; then
+        init_attribute=1
+        # DONE: PRIO3: do we also need to initialize the DEMOTED attribute value?
+        set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]}
+        promoted=0;
+    else
+        case "$promote_attr" in
+            PROMOTED ) # However - PROMOTED should never happen for a SECONDARY
+                promoted=1;
+                ;;
+            DEMOTED ) # This is the status we expect
+                promoted=0;
+                ;;
+            WAITING* ) # We are WAITING for the PRIMARY, so do not test the HANA engine now but check for a new start
+                if check_for_primary_master; then
+                    super_ocf_log info "ACT: SECONDARY still in status WAITING - Primary now available - try a new start"
+                    saphana_start_clone
+                    rc=$?
+                else
+                    super_ocf_log info "ACT: saphana_monitor_clone: SECONDARY still in status WAITING - Primary is still missing"
+                    return $OCF_SUCCESS
+                fi
+                promoted=0;
+                ;;
+            UNDEFINED | * )
+                if ocf_is_probe; then
+                    promoted=0;
+                else
+                    set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]}
+                    promoted=0;
+                fi
+                ;;
+        esac
+    fi
+    #
+    super_ocf_log debug "DBG: saphana_monitor_clone: HANA_STATE_SECONDARY"
+    #
+    # the old method was saphana_monitor - the new method is get_hana_landscape_status
+    get_hana_landscape_status; lss=$?
+    super_ocf_log debug "DBG: saphana_monitor_clone: get_hana_landscape_status=$lss"
+    case "$lss" in
+        0 ) # FATAL
+            # DONE: PRIO1: Maybe we need to differentiate between 0 and 1. While 0 is a fatal sap error, 1 is down/error
+            # TODO: PRIO3: is OCF_ERR_GENERIC the best option?
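+            # Editorial reminder (illustrative; the codes are documented in the
+            # SAPHanaTopology meta-data below): landscapeHostConfiguration.py
+            # returns 0=fatal, 1=error, 2=warning, 3=info, 4=ok, e.g. for a
+            # hypothetical sidadm 'ha1adm':
+            #   su - ha1adm -c "python exe/python_support/landscapeHostConfiguration.py"; echo lss=$?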
+            lpa_set_lpt 10
+            rc=$OCF_ERR_GENERIC
+            ;;
+        1 ) # ERROR
+            lpa_set_lpt 10
+            rc=$OCF_NOT_RUNNING
+            ;;
+        2 | 3 | 4 ) # WARN, INFO or OK
+            rc=$OCF_SUCCESS
+            lpa_set_lpt 30
+            sync_attr=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]})
+            super_ocf_log debug "DBG: sync_attr=$sync_attr"
+            case "$sync_attr" in
+                "SOK" ) # This is a possible node to promote, when the primary is missing
+                    super_ocf_log info "DEC: secondary with sync status SOK ==> possible takeover node"
+                    my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]})
+                    my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]})
+                    scoring_crm_master "$my_role" "$my_sync"
+                    ;;
+                "SFAIL" ) # This is currently NOT a possible node to promote
+                    super_ocf_log info "DEC: secondary with sync status SFAIL ==> EXCLUDE as possible takeover node"
+                    set_crm_master -INFINITY
+                    ;;
+                * ) # Unknown sync status
+                    super_ocf_log info "DEC: secondary with sync status UNKNOWN/UNDEFINED ==> EXCLUDE as possible takeover node"
+                    set_crm_master -INFINITY
+                    ;;
+            esac
+            ;;
+        * ) # UNDEFINED STATUS
+            rc=$OCF_NOT_RUNNING
+            ;;
+    esac
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: saphana_monitor_clone - monitor a hana clone instance
+# params:   -
+# globals:  OCF_*, ATTR_NAME_*, HOSTNAME, HANA_STATE_*
+# saphana_monitor_clone
+#
+function saphana_monitor_clone() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    #
+    # TODO: PRIO3: For a secondary which is missing its primary (and is therefore in status WAITING), what is better:
+    #       a) returning 7 here and forcing the cluster to restart the slave
+    #       b) starting the instance here inside the monitor -> may result in longer runtimes and timeouts
+    #
+    # First check with the status function (OS tools) if there could be something like a SAP instance running,
+    # as we do not know here whether we are in master or slave state; we do not want to start our monitoring
+    # agents (sapstartsrv) on the wrong host
+    local rc=$OCF_ERR_GENERIC
+    local promoted=0
+    local init_attribute=0
+
+    my_role=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_ROLES[@]})
+    my_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]})
+    lpa_check_lpt_status # TODO: PRIO3: remove that line later - it is only here to call lpa_check_lpt_status more often for checking
+
+    if ocf_is_probe; then
+        super_ocf_log debug "DBG: PROBE ONLY"
+    else
+        super_ocf_log debug "DBG: REGULAR MONITOR"
+    fi
+    #
+    # First check, if we are PRIMARY or SECONDARY
+    #
+    check_for_primary; primary_status=$?
+    if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then
+        saphana_monitor_primary; rc=$?
+    else
+        if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then
+            saphana_monitor_secondary; rc=$?
+        else
+            #
+            # OK, we are neither HANA PRIMARY nor HANA SECONDARY
+            #
+            super_ocf_log warn "ACT: saphana_monitor_clone: HANA_STATE_DEFECT"
+            # TODO: PRIO2: Or only set_crm_master -INFINITY ?
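+            # Editorial note (illustrative): a master score of -INFINITY would
+            # only ban promotion on this node, while returning an error lets the
+            # cluster also try to recover the resource. The current score could
+            # be inspected with (resource name is an assumption for the example):
+            #   crm_attribute -N <node> -G -n master-rsc_SAPHana_HA1_HDB10 -l reboot -q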
+            rc=$OCF_ERR_GENERIC
+        fi
+    fi
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: saphana_promote_clone - promote a hana clone
+# params:   -
+# globals:  OCF_*(r), NODENAME(r), HANA_STATE_*, SID(r), InstanceName(r),
+# saphana_promote_clone:
+#    In a Master/Slave configuration, become master either by already being the primary OR by running a hana takeover
+#
+function saphana_promote_clone() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=$OCF_ERR_GENERIC;
+    local hana_sync;
+    local primary_status;
+    #
+    # first check, if we WILL be PRIMARY (checking HANA status)
+    #
+    set_hana_attribute ${NODENAME} "PROMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]}
+    check_for_primary; primary_status=$?
+    #
+    if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then
+        #
+        # as we are already planned to be PRIMARY we only mark the node as PROMOTED
+        #
+        super_ocf_log info "ACT: Promoted $SID-$InstanceName as master (no hdbnsutil action needed)."
+        rc=$OCF_SUCCESS;
+    else
+        if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then
+            #
+            # we are SECONDARY/SLAVE and need to take over ...
+            # promote on the replica side...
+            #
+            hana_sync=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_SYNC_STATUS[@]})
+            case "$hana_sync" in
+                SOK )
+                    super_ocf_log info "ACT: !!!!!!! Promote REPLICA $SID-$InstanceName to be primary. !!!!!!"
+                    LPTloc=$(date '+%s')
+                    # lpa_set_lpt 20 $remoteNode
+                    lpa_set_lpt $LPTloc
+                    lpa_push_lpt $LPTloc
+                    su - $sidadm -c "hdbnsutil -sr_takeover"
+                    #
+                    # now check again, if we are primary NOW
+                    #
+                    # TODO: PRIO3: check, if we need to distinguish between HANA_STATE_PRIMARY, HANA_STATE_SECONDARY, HANA_STATE_DEFECT
+                    #
+                    if check_for_primary; then
+                        rc=$OCF_SUCCESS;
+                    else
+                        rc=$OCF_FAILED_MASTER
+                    fi
+                    ;;
+                * )
+                    super_ocf_log err "ACT: HANA SYNC STATUS IS NOT 'SOK' SO THIS HANA SITE COULD NOT BE PROMOTED"
+                    rc=$OCF_ERR_GENERIC
+                    ;;
+            esac
+        else
+            #
+            # neither MASTER nor SLAVE - this clone instance seems to be broken!!
+            #
+            rc=$OCF_ERR_GENERIC
+        fi
+    fi
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: saphana_demote_clone - demote a hana clone instance
+# params:   -
+# globals:  OCF_*(r), NODENAME(r),
+# saphana_demote_clone
+#    the HANA System Replication (SR) runs as a Master/Slave resource.
+#    Since we cannot really demote a HANA instance, we only mark the status for
+#    correct monitor return codes
+#
+function saphana_demote_clone() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=$OCF_ERR_GENERIC;
+    set_hana_attribute ${NODENAME} "DEMOTED" ${ATTR_NAME_HANA_CLONE_STATE[@]}
+    rc=$OCF_SUCCESS;
+    super_ocf_log info "ACT: Demoted $SID-$InstanceName."
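+    # Editorial note (illustrative): the demote is visible only as a CIB node
+    # attribute; for a hypothetical SID 'HA1' on node 'node1' it could be
+    # inspected with:
+    #   crm_attribute -N node1 -G -n hana_ha1_clone_state -l reboot -q
+    # which should now print DEMOTED.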
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: main - main function to operate
+# params:   ACTION
+# globals:  OCF_*(r), SID(w), sidadm(w), InstanceName(w), SAPVIRHOST(w), DIR_EXECUTABLE(w),
+# globals:  SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), ACTION(w), CLACT(w), ra_rc(rw), $0(r), %ENV(r)
+#
+
+## GLOBALS
+SID=""
+sidadm=""
+InstanceName=""
+InstanceNr=""
+SAPVIRHOST=""
+DIR_EXECUTABLE=""
+SAPSTARTSRV=""
+SAPCONTROL=""
+DIR_PROFILE=""
+SAPSTARTPROFILE=""
+SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}"
+
+NODENAME=$(crm_node -n)
+
+
+if [ $# -ne 1 ]
+then
+    saphana_usage
+    exit $OCF_ERR_ARGS
+fi
+
+ACTION=$1
+if [ "$ACTION" = "status" ]; then
+    ACTION=monitor
+fi
+
+# These operations don't require OCF parameters to be set
+# TODO: PRIO5: check whether notify really needs no OCF parameters
+case "$ACTION" in
+    usage|methods) saphana_$ACTION
+                   exit $OCF_SUCCESS;;
+    meta-data)     saphana_meta_data
+                   exit $OCF_SUCCESS;;
+    notify)        #saphana_notify
+                   exit $OCF_SUCCESS;;
+    *);;
+esac
+saphana_init
+
+if ! ocf_is_root
+then
+    super_ocf_log err "ACT: $0 must be run as root"
+    exit $OCF_ERR_PERM
+fi
+
+# parameter check
+if [ -z "$OCF_RESKEY_SID" ]
+then
+    super_ocf_log err "ACT: Please set parameter SID!"
+    exit $OCF_ERR_ARGS
+fi
+
+if [ -z "$OCF_RESKEY_InstanceNumber" ]
+then
+    super_ocf_log err "ACT: Please set parameter InstanceNumber!"
+    exit $OCF_ERR_ARGS
+fi
+
+if is_clone
+then
+    CLACT=_clone
+else
+    if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ]
+    then
+        super_ocf_log err "ACT: $ACTION called in a non-master/slave environment"
+        exit $OCF_ERR_ARGS
+    fi
+fi
+
+# What kind of method was invoked?
+THE_VERSION=$(saphana_meta_data | grep '
+
+
+0.149.3
+Analyzes SAP HANA System Replication Topology.
+This RA analyzes the SAP HANA topology and "sends" all findings via the node status attributes to
+all nodes in the cluster. These attributes are used by the SAPHana RA to control the SAP HANA databases.
+In addition it starts and monitors the local saphostagent.
+
+1. Interface to monitor a HANA system: landscapeHostConfiguration.py
+landscapeHostConfiguration.py has some detailed output about HANA system status
+and node roles. For our monitor the overall status is relevant. This overall
+status is reported by the returncode of the script:
+0: Internal Fatal
+1: ERROR
+2: WARNING
+3: INFO (maybe a switch of the resource is running)
+4: OK
+The SAPHanaTopology resource agent will interpret returncode 1 as NOT-RUNNING (or failure) and returncodes 2, 3 and 4 as RUNNING.
+SAPHanaTopology scans the output table of landscapeHostConfiguration.py to identify the roles of the cluster node. Role here means the configured and current role of the nameserver as well as of the indexserver.
+
+2. Interface is hdbnsutil
+   The interface hdbnsutil is used to check the "topology" of the system replication as well as the current configuration
+   (primary/secondary) of a SAP HANA database instance. A second task of the interface is the possibility to run a
+   system replication takeover (sr_takeover) or to register a former primary to a new one (sr_register).
+
+3. saphostctrl
+   The interface saphostctrl uses the function ListInstances to figure out the virtual host name of the
+   SAP HANA instance. This is the hostname used during the HANA installation.
+
+
+
+The SAP System Identifier (SID)
+The SAP System Identifier (SID)
+
+
+
+The SAP Instance Number
+The SAP Instance Number
+
+
+
+Path to the SAP Hana Instance executable directory.
+If not set, the RA tries /usr/sap/\$SID/\$InstanceName/exe,
+where InstanceName is the concatenation of "HDB" and \$InstanceNumber for SAP HANA databases.
+
+Path to the SAP Hana Instance executable directory.
+
+
+
+Define the type of SAPHanaTopology RA messages to be printed
+Define the type of SAPHanaTopology RA messages to be printed.
+This parameter should only be set if requested by SUSE support. The default is sufficient for normal operation.
+Values: ra-act-lpa-dec-flow
+You could specify any combination of the above values like "ra-act-flow"
+
+
+
+
+
+
+
+
+
+
+
+
+
+END
+return $rc
+}
+
+#
+# function: get_hana_attribute
+# params:   NODE ATTR [STORE]
+# globals:  -
+#
+function get_hana_attribute()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    local attr_node=$1
+    local attr_name=$2
+    local attr_store=${3:-reboot} # DONE: PRIO5 get this (optional) from parameter
+    crm_attribute -N ${attr_node} -G -n "$attr_name" -l $attr_store -q; rc=$?
+    if [ $rc -ne 0 ]; then
+        super_ocf_log debug "DBG: ATTRIBUTE-FAILURE: crm_attribute -N $attr_node -G -n \"$attr_name\" -l $attr_store -q"
+    fi
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: set_hana_attribute - set the multi-state status of a node
+# params:   NODE VALUE ATTR [STORE]
+# globals:  -
+#
+function set_hana_attribute()
+{
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local attr_node=$1
+    local attr_value=$2
+    local attr_name=$3
+    local attr_store=${4:-reboot} # DONE: PRIO5 get this (optional) from parameter
+    local rc=1
+    local attr_old
+    attr_old=$(get_hana_attribute $attr_node $attr_name $attr_store); get_rc=$?
+    if [ "$attr_old" != "$attr_value" ]; then
+        super_ocf_log debug "DBG: SET attribute $attr_name for node ${attr_node} to ${attr_value} former ($attr_old) get_rc=$get_rc "
+        crm_attribute -N $attr_node -v "$attr_value" -n "$attr_name" -l $attr_store; rc=$?
+        if [ $rc -ne 0 ]; then
+            super_ocf_log debug "DBG: ATTRIBUTE-FAILURE: crm_attribute -N $attr_node -v $attr_value -n \"$attr_name\" -l $attr_store"
+        fi
+    else
+        super_ocf_log debug "DBG: LET attribute $attr_name for node ${attr_node} still be ${attr_value}"
+        rc=0
+    fi
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: sht_methods - report supported cluster methods
+# params:   -
+# globals:  -
+# methods: What methods/operations do we support?
+#
+function sht_methods() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    cat <<-!
+start
+stop
+status
+monitor
+notify
+validate-all
+methods
+meta-data
+usage
+admin-setup
+!
+    return $rc
+}
+
+#
+# function: is_clone - report whether the resource is configured as a clone (also master/slave)
+# params:   -
+# globals:  OCF_*(r)
+# descript: is_clone : find out if we are configured to run in a Master/Slave configuration
+# rc: 0: it is a clone
+#     1: it is not a clone
+# Special EXIT of RA, if the clone is misconfigured
+#
+function is_clone() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    #
+    # is it a clone config?
+    #
+    if [ -n "$OCF_RESKEY_CRM_meta_clone_max" ] \
+       && [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ]; then
+        #
+        # yes it is a clone config - check if it is configured well
+        #
+        if [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] ; then
+            super_ocf_log err "ACT: Clone options misconfigured. (expect: clone_node_max=1)"
+            exit $OCF_ERR_CONFIGURED
+        fi
+        rc=0;
+    else
+        rc=1;
+    fi
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: sht_init - initialize variables for the resource agent
+# params:   -
+# globals:  OCF_*(r), SID(w), sid(rw), sidadm(w), InstanceName(w), InstanceNr(w),
+# globals:  meta_notify_master_uname(w), HANA_SR_TOPOLOGY(w), sr_name(w), remoteHost(w)
+# globals:  ATTR_NAME_HANA_SYNC_STATUS(w), ATTR_NAME_HANA_PRIMARY_AT(w), ATTR_NAME_HANA_CLONE_STATE(w)
+# globals:  DIR_EXECUTABLE(w), SAPSTARTSRV(w), SAPCONTROL(w), DIR_PROFILE(w), SAPSTARTPROFILE(w), LD_LIBRARY_PATH(w), PATH(w), nodelist(w)
+# sht_init : Define global variables with default values, if optional parameters are not set
+#
+#
+
+function sht_init() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local myInstanceName=""
+    local rc=$OCF_SUCCESS
+    local hdbANSWER=""
+    HOSTEXECNAME=saphostexec
+    USRSAP=/usr/sap
+    SAPSERVICE_PATH=${USRSAP}/sapservices
+    SAPHOSTCTRL_PATH=${USRSAP}/hostctrl/exe
+    HOSTEXEC_PATH=${SAPHOSTCTRL_PATH}/${HOSTEXECNAME}
+    HOSTEXEC_PROFILE_PATH=${SAPHOSTCTRL_PATH}/host_profile
+    SID=$OCF_RESKEY_SID
+    InstanceNr=$OCF_RESKEY_InstanceNumber
+    myInstanceName="${SID}_HDB${InstanceNr}"
+    InstanceName="HDB${InstanceNr}"
+    super_ocf_log debug "DBG2: Used new method to get SID ($SID) and InstanceNr ($InstanceNr)"
+    sid=$(echo "$SID" | tr '[:upper:]' '[:lower:]')
+    sidadm="${sid}adm"
+    SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}"
+    ocf_env=$(env | grep 'OCF_RESKEY_CRM')
+    super_ocf_log debug "DBG3: OCF: $ocf_env"
+    ATTR_NAME_HANA_SYNC_STATUS=("hana_${sid}_sync_state" "reboot")   # SOK, SFAIL, UNKNOWN?
+    ATTR_NAME_HANA_PRIMARY_AT=("hana_${sid}_primary_at" "reboot")    # Not really used
+    ATTR_NAME_HANA_CLONE_STATE=("hana_${sid}_clone_state" "reboot")  # UNKNOWN?, DEMOTED, PROMOTED
+    ATTR_NAME_HANA_REMOTEHOST=("hana_${sid}_remoteHost" "forever")
+    ATTR_NAME_HANA_SITE=("hana_${sid}_site" "forever")
+    ATTR_NAME_HANA_ROLES=("hana_${sid}_roles" "reboot")
+    ATTR_NAME_HANA_SRMODE=("hana_${sid}_srmode" "forever")
+    ATTR_NAME_HANA_VHOST=("hana_${sid}_vhost" "forever")
+    ATTR_NAME_HANA_STATUS=("hana_${sid}_status" "reboot")
+
+    # optional OCF parameters, we try to guess which directories are correct
+    if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ]
+    then
+        DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe"
+    else
+        DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE"
+    fi
+
+    if [ -z "$DIR_EXECUTABLE" ]; then
+        super_ocf_log err "DEC: Cannot determine DIR_EXECUTABLE. Please set this parameter. -> OCF_ERR_CONFIGURED"
+        rc=$OCF_ERR_CONFIGURED
+    fi
+
+    if [ -z "$OCF_RESKEY_DIR_PROFILE" ]
+    then
+        DIR_PROFILE="/usr/sap/$SID/SYS/profile"
+    else
+        DIR_PROFILE="$OCF_RESKEY_DIR_PROFILE"
+    fi
+
+    # as root user we need the library path to the SAP kernel to be able to call sapcontrol
+    # check, if we already added DIR_EXECUTABLE at the beginning of LD_LIBRARY_PATH
+    if [ "${LD_LIBRARY_PATH%%:*}" != "$DIR_EXECUTABLE" ]
+    then
+        LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH
+        export LD_LIBRARY_PATH
+    fi
+
+    PATH=${PATH}:${DIR_EXECUTABLE}
+    #
+    # figure out all needed values from the system replication status with ONE call
+    # we need: mode=primary|sync|syncmem|...; site name=<site>; mapping/<vhost>=<site>/<host> (multiple lines)
+    case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in
+        *corosync* ) nodelist=$(crm_node -l | awk '{ print $2 }');;
+        *openais* )  nodelist=$(crm_node -l | awk '/member/ {print $2}');;
+        *cman* )     nodelist=$(crm_node -l);;
+    esac
+    hdbANSWER=$(su - ${sidadm} -c "hdbnsutil -sr_state --sapcontrol=1" 2>/dev/null)
+    super_ocf_log debug "DBG2: hdbANSWER=\$\(su - ${sidadm} -c \"hdbnsutil -sr_state --sapcontrol=1\"\)"
+    site=$(echo "$hdbANSWER" | awk -F= '/site name/ {print $2}')
+    srmode=$(echo "$hdbANSWER" | awk -F= '/mode/ {print $2}')
+    MAPPING=$(echo "$hdbANSWER" | awk -F[=/] '$1 ~ "mapping" && $3 !~ site { print $4 }' site=$site)
+    super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING"
+    #
+    # filter out all non-cluster mappings
+    #
+    hanaRemoteHost=$(for n1 in $nodelist; do for n2 in $MAPPING; do if [ "$n1" == "$n2" ]; then echo $n1; fi; done; done )
+    super_ocf_log info "DEC: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost"
+    super_ocf_log debug "DBG: site=$site, mode=$srmode, MAPPING=$MAPPING, hanaRemoteHost=$hanaRemoteHost"
+    super_ocf_log info "FLOW $FUNCNAME rc=$OCF_SUCCESS"
+    return $OCF_SUCCESS
+}
+
+#
+# function: check_for_primary - check if the local SAP HANA is configured as primary
+# params:   -
+# globals:  HANA_STATE_PRIMARY(r), HANA_STATE_SECONDARY(r), HANA_STATE_DEFECT(r), HANA_STATE_STANDALONE(r)
+#
+function check_for_primary() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    # DONE: Change stderr location!!
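+    # Editorial note (illustrative): "hdbnsutil -sr_state --sapcontrol=1", as
+    # parsed in sht_init above, emits key=value lines; the values below are
+    # examples only:
+    #   mode=primary
+    #   site name=SITE_A
+    # $srmode used here was taken from exactly such a "mode=" line.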
+    #sidadm=lnxadm
+    #node_status=$(check_for_primary_single)
+    node_status=$srmode
+    super_ocf_log debug "DBG2: check_for_primary: node_status=$node_status"
+    super_ocf_log debug "DBG: check_for_primary: node_status=$node_status"
+    for i in 1 2 3 4 5 6 7 8 9; do
+        case "$node_status" in
+            primary )
+                super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_PRIMARY"
+                return $HANA_STATE_PRIMARY;;
+            syncmem | sync | async )
+                super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_SECONDARY"
+                return $HANA_STATE_SECONDARY;;
+            none ) # we have seen that mode on the second side BEFORE we registered it as replica
+                super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_STANDALONE"
+                return $HANA_STATE_STANDALONE;;
+            * )
+                super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: <$node_status>"
+                dump=$( echo $node_status | hexdump -C );
+                super_ocf_log err "ACT: check_for_primary: we didn't expect node_status to be: DUMP <$dump>"
+                node_full_status=$(su - ${sidadm} -c "hdbnsutil -sr_state" 2>/dev/null )
+                node_status=$(echo "$node_full_status" | awk '$1=="mode:" {print $2}')
+                super_ocf_log info "DEC: check_for_primary: loop=$i: node_status=$node_status"
+                # TODO: PRIO1: Maybe we need to keep the old value for P/S/N, if hdbnsutil just crashes
+        esac;
+    done
+    super_ocf_log info "FLOW $FUNCNAME rc=HANA_STATE_DEFECT"
+    return $HANA_STATE_DEFECT
+}
+
+
+#
+# function: start_saphostagent
+# params:   -
+# globals:
+#
+function start_saphostagent()
+{
+    if [ -x "${HOSTEXEC_PATH}" ]; then
+        ${HOSTEXEC_PATH} pf=${HOSTEXEC_PROFILE_PATH}
+    fi
+    return 0
+}
+
+#
+# function: stop_saphostagent
+# params:   -
+# globals:
+#
+function stop_saphostagent()
+{
+    if [ -x "${HOSTEXEC_PATH}" ]; then
+        ${HOSTEXEC_PATH} -stop
+    fi
+}
+
+#
+# function: check_saphostagent
+# params:   -
+# globals:
+#
+function check_saphostagent()
+{
+    local rc=1
+    pgrep -f /usr/sap/hostctrl/exe/saphostexec; rc=$?
+    return $rc
+}
+
+#
+#############################################################################
+#
+# function: sht_start - start a hana instance
+# params:   -
+# globals:  OCF_*
+# sht_start : Start the topology agent (creates the marker file and makes sure saphostagent is running)
+#
+function sht_start() {
+
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+
+    local rc=$OCF_NOT_RUNNING
+    local output=""
+    local loopcount=0
+
+    mkdir -p /var/lib/SAPHana
+    touch /var/lib/SAPHana/SAPTopologyON
+    if ! check_saphostagent; then
+        start_saphostagent
+    fi
+
+    rc=$OCF_SUCCESS
+
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: sht_stop - stop a hana instance
+# params:   -
+# globals:  OCF_*(r), SAPCONTROL(r), SID(r), InstanceName(r)
+# sht_stop: Stop the topology agent (removes the marker file only)
+#
+function sht_stop() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local output=""
+    local rc=0
+
+    rm -f /var/lib/SAPHana/SAPTopologyON
+    rc=$OCF_SUCCESS
+
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+
+#
+# function: sht_monitor - monitor a hana topology instance
+# params:   --
+# globals:  OCF_*(r), SAPCONTROL(r), InstanceNr(r)
+# sht_monitor: Can the given SAP instance do anything useful?
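+# Editorial note (illustrative): the monitor below only tests for the marker
+# file created by sht_start, i.e. roughly equivalent to:
+#   test -f /var/lib/SAPHana/SAPTopologyON
+# so "running" here means "the topology agent was started", not "HANA is up".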
+#
+function sht_monitor() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+
+    if [ -f /var/lib/SAPHana/SAPTopologyON ]; then
+        rc=$OCF_SUCCESS
+    else
+        rc=$OCF_NOT_RUNNING
+    fi
+
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+
+#
+# function: sht_status - get the status of a hana instance (OS tools only)
+# params:   -
+# globals:  SID(r), InstanceName(r), OCF_*(r), sidadm(r)
+# sht_status: Lightweight check of the SAP instance with OS tools only
+#
+function sht_status() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+
+    sht_monitor; rc=$?
+    return $rc
+}
+
+
+#
+# function: sht_validate - validation of (some) variables/parameters
+# params:   -
+# globals:  OCF_*(r), SID(r), InstanceName(r), InstanceNr(r),
+# sht_validate: Check the semantics of the input parameters
+#
+function sht_validate() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=$OCF_SUCCESS
+    if [ $(echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$') -ne 1 ]
+    then
+        super_ocf_log err "ACT: Parsing instance profile name: '$SID' is not a valid SID!"
+        rc=$OCF_ERR_ARGS
+    fi
+
+    if [ $(echo "$InstanceNr" | grep -c '^[0-9][0-9]$') -ne 1 ]
+    then
+        super_ocf_log err "ACT: Parsing instance profile name: '$InstanceNr' is not a valid instance number!"
+        rc=$OCF_ERR_ARGS
+    fi
+
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: sht_start_clone - start a hana clone instance
+# params:   -
+# globals:  OCF_*(r),
+# sht_start_clone
+#
+function sht_start_clone() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=$OCF_NOT_RUNNING
+    sht_start; rc=$?
+    return $rc
+}
+
+#
+# function: sht_stop_clone - stop a hana clone instance
+# params:   -
+# globals:  NODENAME(r), HANA_STATE_*, ATTR_NAME_*
+# sht_stop_clone
+#
+function sht_stop_clone() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    check_for_primary; primary_status=$?
+    if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then
+        hanaPrim="P"
+    elif [ $primary_status -eq $HANA_STATE_SECONDARY ]; then
+        hanaPrim="S"
+    elif [ $primary_status -eq $HANA_STATE_STANDALONE ]; then
+        hanaPrim="N"
+    else
+        hanaPrim="-"
+    fi
+    set_hana_attribute "${NODENAME}" "1:$hanaPrim:-:-:-:-" ${ATTR_NAME_HANA_ROLES[@]}
+    sht_stop; rc=$?
+    return $rc
+}
+
+#
+# function: sht_monitor_clone - monitor a hana clone instance
+# params:   -
+# globals:  OCF_*, SID, InstanceNr, InstanceName, MAPPING(r)
+# sht_monitor_clone
+#
+function sht_monitor_clone() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    #
+    local rc=$OCF_ERR_GENERIC
+    local promoted=0
+    local init_attribute=0
+
+
+    if ocf_is_probe; then
+        super_ocf_log debug "DBG2: PROBE ONLY"
+    else
+        super_ocf_log debug "DBG2: REGULAR MONITOR"
+        if ! check_saphostagent; then
+            start_saphostagent
+        fi
+    fi
+    #
+    # First check, if we are PRIMARY or SECONDARY
+    #
+    super_ocf_log debug "DBG2: HANA SID $SID"
+    super_ocf_log debug "DBG2: HANA InstanceName $InstanceName"
+    super_ocf_log debug "DBG2: HANA InstanceNr $InstanceNr"
+    check_for_primary; primary_status=$?
+    if [ $primary_status -eq $HANA_STATE_PRIMARY ]; then
+        hanaPrim="P"
+        super_ocf_log debug "DBG2: HANA IS PRIMARY"
+        sht_monitor; rc=$?
+    else
+        if [ $primary_status -eq $HANA_STATE_SECONDARY ]; then
+            hanaPrim="S"
+            super_ocf_log debug "DBG2: HANA IS SECONDARY"
+            sht_monitor; rc=$?
+        elif [ $primary_status -eq $HANA_STATE_STANDALONE ]; then
+            hanaPrim="N"
+            super_ocf_log debug "DBG2: HANA IS STANDALONE"
+            sht_monitor; rc=$?
+        else
+            hanaPrim="-"
+            super_ocf_log warn "ACT: sht_monitor_clone: HANA_STATE_DEFECT"
+            rc=$OCF_ERR_CONFIGURED
+        fi
+    fi
+    # DONE: PRIO1: ASK: Is the output format of ListInstances fixed? Could we take that as an API?
+    # try to catch:  Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691
+    # We rely on the following format: SID is word#4, NR is word#6, vHost is word#8
+    vName=$(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances \
+        | awk '$4 == SID && $6 == NR { print $8 }' SID=$SID NR=$InstanceNr 2>/dev/null )
+    super_ocf_log debug "DBG: ListInstances: $(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances)"
+    if [ -n "$vName" ]; then
+        set_hana_attribute ${NODENAME} "$vName" ${ATTR_NAME_HANA_VHOST[@]}
+    else
+        vName=$(get_hana_attribute ${NODENAME} ${ATTR_NAME_HANA_VHOST[@]})
+    fi
+    #site=$(get_site_name)
+    hanaANSWER=$(su - $sidadm -c "python exe/python_support/landscapeHostConfiguration.py" 2>/dev/null); hanalrc="$?"
+    hanarole=$(echo "$hanaANSWER" | tr -d ' ' | awk -F'|' '$2 == host { printf "%s:%s:%s:%s\n",$10,$11,$12,$13 } ' host=${vName})
+    #if [ -z "$MAPPING" ]; then
+    #    super_ocf_log info "ACT: Did not find remote Host at this moment"
+    #fi
+    # FH TODO PRIO1: TRY TO GET RID OF "ATTR_NAME_HANA_REMOTEHOST"
+    if [ -n "$hanaRemoteHost" ]; then
+        set_hana_attribute ${NODENAME} "$hanaRemoteHost" ${ATTR_NAME_HANA_REMOTEHOST[@]}
+    fi
+    set_hana_attribute ${NODENAME} "$hanalrc:$hanaPrim:$hanarole" ${ATTR_NAME_HANA_ROLES[@]}
+    set_hana_attribute ${NODENAME} "$site" ${ATTR_NAME_HANA_SITE[@]}
+    set_hana_attribute ${NODENAME} "$vName" ${ATTR_NAME_HANA_VHOST[@]}
+    case "$hanaPrim" in
+        P ) ;;
+        S ) # only the secondary may propagate its sync status
+            case $(crm_attribute --type crm_config --name cluster-infrastructure -q) in
+                *corosync* ) nodelist=$(crm_node -l | awk '{ print $2 }');;
+                *openais* )  nodelist=$(crm_node -l | awk '/member/ {print $2}');;
+                *cman* )     nodelist=$(crm_node -l);;
+            esac
+
+            for n in ${nodelist}; do
+                set_hana_attribute ${n} "$srmode" ${ATTR_NAME_HANA_SRMODE[@]}
+            done
+            ;;
+    esac
+    #ATTR_NAME_HANA_STATUS  # TODO: PRIO5: For SCALE-OUT: Fill that attribute later
+    super_ocf_log info "FLOW $FUNCNAME rc=$rc"
+    return $rc
+}
+
+#
+# function: sht_notify - notify action
+# params:   -
+# globals:  OCF_*(r), ACTION(r), CLACT(r), NODENAME(r)
+# sht_notify: Handle master scoring - to make sure a slave gets the next master
+#
+function sht_notify() {
+    super_ocf_log info "FLOW $FUNCNAME ($*)"
+    local rc=0
+    super_ocf_log info "RA ==== end action $ACTION$CLACT (${n_type}/${n_op})===="
+    return $rc
+}
+
+#
+# function: main - main function to operate
+# params:   ACTION
+# globals:  OCF_*(r), SID(w), sidadm(w), InstanceName(w), DIR_EXECUTABLE(w), ACTION(w), CLACT(w), ra_rc(rw), $0(r), %ENV(r)
+#
+
+## GLOBALS
+SID=""
+sidadm=""
+InstanceName=""
+InstanceNr=""
+DIR_EXECUTABLE=""
+SAPHanaFilter="${OCF_RESKEY_SAPHanaFilter:-ra-act-dec-lpa}"
+NODENAME=$(crm_node -n)
+
+if [ $# -ne 1 ]
+then
+    sht_usage
+    exit $OCF_ERR_ARGS
+fi
+
+ACTION=$1
+if [ "$ACTION" = "status" ]; then
+    ACTION=monitor
+fi
+
+# These operations don't require OCF parameters to be set
+case "$ACTION" in
+    usage|methods) sht_$ACTION
+                   exit $OCF_SUCCESS;;
+    meta-data)     sht_meta_data
+                   exit $OCF_SUCCESS;;
+    notify)        sht_notify
+                   exit $OCF_SUCCESS;;
+    admin-setup)   admin-setup
+                   exit $OCF_SUCCESS;;
+    *);;
+esac
+sht_init
+
+if ! ocf_is_root
+then
+    super_ocf_log err "ACT: $0 must be run as root"
+    exit $OCF_ERR_PERM
+fi
+
+# parameter check
+if [ -z "$OCF_RESKEY_SID" ]
+then
+    super_ocf_log err "ACT: Please set parameter SID!"
+    exit $OCF_ERR_ARGS
+fi
+
+if [ -z "$OCF_RESKEY_InstanceNumber" ]
+then
+    super_ocf_log err "ACT: Please set parameter InstanceNumber!"
+    exit $OCF_ERR_ARGS
+fi
+
+
+if is_clone
+then
+    CLACT=_clone
+else
+    if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ]
+    then
+        super_ocf_log err "ACT: $ACTION called in a non-clone environment"
+        exit $OCF_ERR_ARGS
+    fi
+fi
+
+THE_VERSION=$(sht_meta_data | grep ' $b ? $a : $b;
+}
+
+sub print_attr_host()
+{
+    my ($HKey, $AKey);
+    printf "%-22s", "Attribute \\ Host";
+    foreach $HKey (sort keys %Host) {
+        printf "%-16s ", $HKey;
+    }
+    printf "\n";
+
+    printf "%s\n", "-" x 120 ;
+
+    foreach $AKey (sort keys %Name) {
+        printf "%-22s", $AKey;
+        foreach $HKey (sort keys %Host) {
+            printf "%-16.16s ", $Host{$HKey} -> {$AKey};
+        }
+
+        printf "\n";
+    }
+    return 0;
+}
+
+sub print_host_attr()
+{
+    my ($AKey, $HKey, $len, $line_len, $hclen);
+    $hclen=$Name{_hosts}->{_length};
+    $line_len=$hclen+1;
+    printf "%-$hclen.${hclen}s ", "$table_title";
+    foreach $AKey (sort keys %Name) {
+        if ($AKey ne "_hosts") {
+            $len = $Name{$AKey}->{_length};
+            $line_len=$line_len+$len+1;
+            printf "%-$len.${len}s ", $Name{$AKey}->{_title};
+        }
+    }
+    printf "\n";
+    printf "%s\n", "-" x $line_len ;
+    foreach $HKey (sort keys %Host) {
+        printf "%-$hclen.${hclen}s ", $HKey;
+        foreach $AKey (sort keys %Name) {
+            if ($AKey ne "_hosts") {
+                $len = $Name{$AKey}->{_length};
+                printf "%-$len.${len}s ", $Host{$HKey} -> {$AKey};
+            }
+        }
+        printf "\n";
+    }
+    return 0;
+}
+
+open ListInstances, "/usr/sap/hostctrl/exe/saphostctrl -function ListInstances|";
+while (<ListInstances>) {
+    # try to catch:  Inst Info : LNX - 42 - lv9041 - 740, patch 36, changelist 1444691
+    chomp;
+    if ( $_ =~ /:\s+([A-Z][A-Z0-9][A-Z0-9])\s+-/ ) {
+        $sid=lc("$1");
+    }
+}
+close ListInstances;
+
+
+open CIB, "cibadmin -Ql |";
+while (<CIB>) {
+    chomp;
+    my ($host, $name, $value);
+    my $found=0;
+    if ( $_ =~ /nvpair.*name="(\w+_${sid}_\w+)"/ ) {
+        $name=$1;
+        # find the attribute in the forever and reboot stores :)
+        if ( $_ =~ /id="(status|nodes)-([a-zA-Z0-9\_\-]+)-/ ) {
+            $host=$2;
+        }
+        if ( $_ =~ /value="([^"]+)"/ ) {
+            $value=$1;
+            $found=1;
+        }
+    }
+    if ( $found == 1 ) {
+        #
+        # handle the host's name and the table title
+        #
+        $Host{$host}->{$name}=${value};
+        if ( defined ($Name{_hosts}->{_length})) {
+            $Name{_hosts}->{_length} = max($Name{_hosts}->{_length}, length($host ));
+        } else {
+            $Name{_hosts}->{_length} = length($host );
+        }
+        $Name{_hosts}->{_length} = max($Name{_hosts}->{_length}, length( $table_title));
+        #
+        # now handle the attribute's name and value
+        #
+        $Name{$name}->{$host}=${value};
+        if ( defined ($Name{$name}->{_length})) {
+            $Name{$name}->{_length} = max($Name{$name}->{_length}, length($value ));
+        } else {
+            $Name{$name}->{_length} = length($value );
+        }
+        if ( $name =~ /hana_${sid}_(.*)/ ) {
+            $Name{$name}->{_title} = $1;
+        } else {
+            $Name{$name}->{_title} = $name;
+        }
+        $Name{$name}->{_length} = max($Name{$name}->{_length}, length( $Name{$name}->{_title}));
+        # printf "%-8s %-20s %-30s\n", $1, $2, $3;
+    }
+}
+close CIB;
+
+#print_attr_host;
+print_host_attr;
--
1.8.4.2