From a06ce7c166f4a7801b1fb7d50c77dead8a0c7a1d Mon Sep 17 00:00:00 2001
From: David Vossel
Date: Wed, 21 Jan 2015 18:00:18 -0500
Subject: [PATCH] High: introducing rabbitmq clustering agent

---
Review notes (ignored by git-am):
 - The reviewed copy of this patch had its line breaks collapsed and all
   text between '<' and '>' removed.  Line structure, both here-documents
   (meta-data XML, usage text), the rmq_wipe_data definition and the
   Makefile.am context whitespace are reconstructions and must be
   confirmed against the upstream tree before applying.
 - The author e-mail address was lost and has not been re-invented; the
   From: header needs it before git-am will accept this mail.
 - The blob id on the "index" line is the original one and no longer
   matches the file content below.

 doc/man/Makefile.am        |   1 +
 heartbeat/Makefile.am      |   1 +
 heartbeat/rabbitmq-cluster | 411 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 413 insertions(+)
 create mode 100755 heartbeat/rabbitmq-cluster

diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
index eafb2d1..62e619a 100644
--- a/doc/man/Makefile.am
+++ b/doc/man/Makefile.am
@@ -127,6 +127,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \
                           ocf_heartbeat_postfix.7 \
                           ocf_heartbeat_pound.7 \
                           ocf_heartbeat_proftpd.7 \
+                          ocf_heartbeat_rabbitmq-cluster.7 \
                           ocf_heartbeat_rsyncd.7 \
                           ocf_heartbeat_rsyslog.7 \
                           ocf_heartbeat_scsi2reservation.7 \
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
index 330b7f7..66dcff2 100644
--- a/heartbeat/Makefile.am
+++ b/heartbeat/Makefile.am
@@ -106,6 +106,7 @@ ocf_SCRIPTS = ClusterMon \
 			pgsql			\
 			proftpd			\
 			Pure-FTPd		\
+			rabbitmq-cluster	\
 			Raid1			\
 			Route			\
 			rsyncd			\
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
new file mode 100755
index 0000000..b9dcfc3
--- /dev/null
+++ b/heartbeat/rabbitmq-cluster
@@ -0,0 +1,411 @@
+#!/bin/sh
+#
+# Copyright (c) 2014 David Vossel
+#                    All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.  Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+#######################################################################
+
+RMQ_SERVER=/usr/sbin/rabbitmq-server
+RMQ_CTL=/usr/sbin/rabbitmqctl
+RMQ_DATA_DIR="/var/lib/rabbitmq/mnesia"
+RMQ_PID_DIR="/var/run/rabbitmq"
+RMQ_PID_FILE="/var/run/rabbitmq/rmq.pid"
+RMQ_LOG_DIR="/var/log/rabbitmq"
+NODENAME=$(ocf_local_nodename)
+
+# Per-resource transient node attribute that stores the rabbitmq node
+# name of every pacemaker node on which this agent saw rabbitmq running.
+RMQ_CRM_ATTR_COOKIE="rmq-node-attr-${OCF_RESOURCE_INSTANCE}"
+
+# Print the OCF resource agent meta-data XML on stdout.
+# NOTE(review): the XML tags were lost from the copy of this patch that
+# was reviewed; only the text content survived.  The markup, attribute
+# values and action timeouts below are a reconstruction - confirm them
+# against the upstream agent before merging.
+meta_data() {
+	cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="rabbitmq-cluster">
+<version>1.0</version>
+
+<longdesc lang="en">
+Starts cloned rabbitmq cluster instance
+</longdesc>
+<shortdesc lang="en">rabbitmq clustered</shortdesc>
+
+<parameters>
+<parameter name="set_policy">
+<longdesc lang="en">
+Policy string to pass to 'rabbitmqctl set_policy' right after bootstrapping the first rabbitmq instance.
+</longdesc>
+<shortdesc lang="en">rabbitmqctl set_policy args</shortdesc>
+<content type="string" default="" />
+</parameter>
+</parameters>
+
+<actions>
+<action name="start"        timeout="100" />
+<action name="stop"         timeout="90" />
+<action name="monitor"      timeout="40" interval="10" depth="0" />
+<action name="meta-data"    timeout="10" />
+<action name="validate-all" timeout="20" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+
+# Print a short usage message on stdout.
+# NOTE(review): the usage text was lost together with the here-document;
+# the action list below mirrors the dispatch table at the end of the file.
+rmq_usage() {
+	cat <<END
+usage: $0 {start|stop|monitor|validate-all|meta-data}
+
+Expects to have a fully populated OCF RA-compliant environment set.
+END
+}
+
+# Remove the local mnesia database so the node starts with no cluster state.
+# NOTE(review): body reconstructed from the surviving redirection and
+# from the otherwise unused RMQ_DATA_DIR - confirm.
+rmq_wipe_data()
+{
+	rm -rf "$RMQ_DATA_DIR" > /dev/null 2>&1
+}
+
+# Print the rabbitmq node name of this host on stdout (empty if unknown).
+# The name is parsed from 'rabbitmqctl status'; when that yields nothing
+# the RABBITMQ_NODENAME setting in rabbitmq-env.conf is used instead.
+rmq_local_node()
+{
+	local node_name=$($RMQ_CTL status 2>&1 | sed -n -e "s/^.*[S|s]tatus of node \(.*\)\s.*$/\1/p" | tr -d "'")
+
+	if [ -z "$node_name" ]; then
+		# anchored so that commented-out settings are not picked up
+		node_name=$(grep "^\s*RABBITMQ_NODENAME=" /etc/rabbitmq/rabbitmq-env.conf 2>/dev/null | awk -F= '{print $2}')
+	fi
+
+	echo "$node_name"
+}
+
+# Print the rabbitmq node names recorded in the CIB for this resource,
+# one per line; prints nothing when no attribute is present.
+rmq_join_list()
+{
+	cibadmin -Q 2>/dev/null | grep "$RMQ_CRM_ATTR_COOKIE" | sed -n -e "s/^.*value=.\(.*\)\".*$/\1/p"
+}
+
+# Record this host's rabbitmq node name as a transient node attribute.
+# Exits the agent with OCF_ERR_GENERIC when the name cannot be determined.
+rmq_write_nodename()
+{
+	local node_name=$(rmq_local_node)
+
+	if [ -z "$node_name" ]; then
+		ocf_log err "Failed to determine rabbitmq node name, exiting"
+		exit $OCF_ERR_GENERIC
+	fi
+
+	# store the pcmknode to rmq node mapping as an attribute
+	${HA_SBIN_DIR}/crm_attribute -N "$NODENAME" -l reboot --name "$RMQ_CRM_ATTR_COOKIE" -v "$node_name"
+}
+
+# Delete this host's node-name attribute again.
+rmq_delete_nodename()
+{
+	# remove node-name
+	${HA_SBIN_DIR}/crm_attribute -N "$NODENAME" -l reboot --name "$RMQ_CRM_ATTR_COOKIE" -D
+}
+
+# Create directory $1, owned by rabbitmq and mode 755, unless it exists.
+prepare_dir () {
+	if [ ! -d "${1}" ] ; then
+		mkdir -p "${1}"
+		chown -R rabbitmq:rabbitmq "${1}"
+		chmod 755 "${1}"
+	fi
+}
+
+# Remove the pid file, ignoring errors.
+remove_pid () {
+	rm -f "${RMQ_PID_FILE}" > /dev/null 2>&1
+}
+
+# Map the exit code of 'rabbitmqctl cluster_status' to an OCF return code
+# and keep the node-name attribute in sync with the observed state.
+rmq_monitor() {
+	local rc
+
+	$RMQ_CTL cluster_status > /dev/null 2>&1
+	rc=$?
+
+	case "$rc" in
+	0)
+		ocf_log debug "RabbitMQ server is running normally"
+		rmq_write_nodename
+
+		return $OCF_SUCCESS
+	;;
+	2)
+		ocf_log info "RabbitMQ server is not running"
+		rmq_delete_nodename
+		return $OCF_NOT_RUNNING
+	;;
+	*)
+		ocf_log err "Unexpected return code from '$RMQ_CTL cluster_status' exit code: $rc"
+		rmq_delete_nodename
+		return $OCF_ERR_GENERIC
+	;;
+	esac
+}
+
+# Launch rabbitmq-server in the background and block until
+# 'rabbitmqctl wait' returns; on success returns the monitor result.
+rmq_init_and_wait()
+{
+	local rc
+
+	prepare_dir $RMQ_PID_DIR
+	prepare_dir $RMQ_LOG_DIR
+	remove_pid
+
+	# the server startup script uses this environment variable
+	export RABBITMQ_PID_FILE="$RMQ_PID_FILE"
+
+	setsid sh -c "$RMQ_SERVER > ${RMQ_LOG_DIR}/startup_log 2> ${RMQ_LOG_DIR}/startup_err" &
+
+	ocf_log info "Waiting for server to start"
+	$RMQ_CTL wait $RMQ_PID_FILE
+	rc=$?
+	if [ $rc -ne $OCF_SUCCESS ]; then
+		remove_pid
+		ocf_log err "rabbitmq-server start failed: $rc"
+		return $OCF_ERR_GENERIC
+	fi
+
+	rmq_monitor
+	return $?
+}
+
+# Run 'rabbitmqctl set_policy' with the given arguments, discarding output.
+rmq_set_policy()
+{
+	$RMQ_CTL set_policy "$@" > /dev/null 2>&1
+}
+
+# Bootstrap a new cluster: wipe local data, start the server and apply
+# the optional set_policy parameter.
+rmq_start_first()
+{
+	local rc
+
+	ocf_log info "Bootstrapping rabbitmq cluster"
+	rmq_wipe_data
+	rmq_init_and_wait
+	rc=$?
+
+	if [ $rc -eq 0 ]; then
+		rc=$OCF_SUCCESS
+		ocf_log info "cluster bootstrapped"
+
+		if [ -n "$OCF_RESKEY_set_policy" ]; then
+			# do not quote set_policy, we are passing in arguments
+			rmq_set_policy $OCF_RESKEY_set_policy > /dev/null 2>&1
+			if [ $? -ne 0 ]; then
+				ocf_log err "Failed to set policy: $OCF_RESKEY_set_policy"
+				rc=$OCF_ERR_GENERIC
+			else
+				ocf_log info "Policy set: $OCF_RESKEY_set_policy"
+			fi
+		fi
+
+	else
+		ocf_log err "failed to bootstrap cluster. Check SELINUX policy"
+		rc=$OCF_ERR_GENERIC
+	fi
+
+	return $rc
+}
+
+# Start the local server and join it to the first node of $1
+# (whitespace separated rabbitmq node names) that accepts the join.
+rmq_join_existing()
+{
+	local join_list="$1"
+	local rc=$OCF_ERR_GENERIC
+	local node
+
+	ocf_log info "Joining existing cluster with [ $(echo $join_list | tr '\n' ' ') ] nodes."
+	rmq_init_and_wait
+	if [ $? -ne 0 ]; then
+		return $OCF_ERR_GENERIC
+	fi
+
+	# unconditionally join the cluster
+	$RMQ_CTL stop_app > /dev/null 2>&1
+	for node in $join_list; do
+		ocf_log info "Attempting to join cluster with target node $node"
+		$RMQ_CTL join_cluster $node
+		if [ $? -eq 0 ]; then
+			ocf_log info "Joined cluster by connecting to node $node, starting app"
+			$RMQ_CTL start_app
+			rc=$?
+			if [ $rc -ne 0 ]; then
+				ocf_log err "'$RMQ_CTL start_app' failed"
+			fi
+			break
+		fi
+	done
+
+	if [ "$rc" -ne 0 ]; then
+		ocf_log info "Join process incomplete, shutting down."
+		return $OCF_ERR_GENERIC
+	fi
+
+	ocf_log info "Successfully joined existing rabbitmq cluster"
+	return $OCF_SUCCESS
+}
+
+# OCF start action.
+rmq_start() {
+	local join_list=""
+	local rc
+
+	rmq_monitor
+	if [ $? -eq $OCF_SUCCESS ]; then
+		return $OCF_SUCCESS
+	fi
+
+	join_list=$(rmq_join_list)
+
+	# No join list means no active instances are up. This instance
+	# is the first, so it needs to bootstrap the rest
+	if [ -z "$join_list" ]; then
+		rmq_start_first
+		rc=$?
+		return $rc
+	fi
+
+	# first try to join without wiping mnesia data
+	rmq_join_existing "$join_list"
+	if [ $? -ne 0 ]; then
+		ocf_log info "node failed to join, wiping data directory and trying again"
+		# if the graceful join fails, use the hammer and reset all the data.
+		rmq_stop
+		if [ $? -ne $OCF_SUCCESS ]; then
+			# never wipe the data directory underneath a running server
+			ocf_log err "unable to stop rabbitmq-server, not wiping data directory"
+			return $OCF_ERR_GENERIC
+		fi
+		rmq_wipe_data
+		rmq_join_existing "$join_list"
+		if [ $? -ne 0 ]; then
+			ocf_log err "node failed to join even after reseting local data. Check SELINUX policy"
+			return $OCF_ERR_GENERIC
+		fi
+	fi
+
+	return $OCF_SUCCESS
+}
+
+# OCF stop action: ask the server to stop and poll until monitor reports
+# it is gone.
+rmq_stop() {
+	local rc
+
+	rmq_monitor
+	if [ $? -eq $OCF_NOT_RUNNING ]; then
+		return $OCF_SUCCESS
+	fi
+
+	$RMQ_CTL stop
+	rc=$?
+
+	if [ $rc -ne 0 ]; then
+		ocf_log err "rabbitmq-server stop command failed: $RMQ_CTL stop, $rc"
+		return $rc
+	fi
+
+	#TODO add kill logic
+	while true; do
+		rmq_monitor
+		rc=$?
+		if [ "$rc" -eq $OCF_NOT_RUNNING ]; then
+			break
+		elif [ "$rc" -ne $OCF_SUCCESS ]; then
+			ocf_log err "rabbitmq-server stop failed: $rc"
+			return $OCF_ERR_GENERIC
+		fi
+		sleep 1
+	done
+
+	remove_pid
+	return $OCF_SUCCESS
+}
+
+# OCF validate-all action: only checks that the binaries exist.
+rmq_validate() {
+	check_binary $RMQ_SERVER
+	check_binary $RMQ_CTL
+
+	# This resource only makes sense as a clone right now. at some point
+	# we may want to verify the following.
+	#TODO verify cloned
+	#TODO verify ordered=true
+
+	# Given that this resource does the cluster join explicitly,
+	# having a cluster_nodes list in the static config file will
+	# likely conflict with this agent.
+	#TODO verify no cluster list in rabbitmq conf
+	#cat /etc/rabbitmq/rabbitmq.config | grep "cluster_nodes"
+
+	return $OCF_SUCCESS
+}
+
+case $__OCF_ACTION in
+meta-data)	meta_data
+		exit $OCF_SUCCESS
+		;;
+start)		rmq_start;;
+stop)		rmq_stop;;
+monitor)	rmq_monitor;;
+validate-all)	rmq_validate;;
+usage|help)	rmq_usage
+		exit $OCF_SUCCESS
+		;;
+*)		rmq_usage
+		exit $OCF_ERR_UNIMPLEMENTED
+		;;
+esac
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc
-- 
1.8.4.2