Improve rabbitmq_join_cluster on failure.

Add a '--clean-on-error' option to rabbitmq_join_cluster and a
'--force-reset' option to rabbitmq_reset_node. This allows the DB
to be cleared down forcefully when a cluster join fails, as the
local DB may be in an inconsistent state.

In conjunction with the change in 51-rabbitmq (where the cluster
formation is orchestrated), this allows a node that has failed to
join a cluster to retry more safely.

Finally, after we join the cluster, use `update_cluster_nodes` to
ensure the local node is fully synced with the cluster.

Change-Id: Ie53dbab52f0ead23f204dec418056f47480bb20a
This commit is contained in:
Nicholas Randon
2014-11-24 14:58:24 +00:00
parent 9892ef30de
commit 7b7eb69a70
3 changed files with 39 additions and 11 deletions

View File

@@ -8,6 +8,10 @@ function show_options () {
echo "Usage: ${SCRIPT_NAME} [options]"
echo "Tells the local RabbitMQ to join the remote node's cluster."
echo "Options:"
echo " --clean-on-error - Attempts to clear down the local"
echo " Mnesia database on failure to join"
echo " a cluster, cleaning all persistent"
echo " messages."
echo " --remote-node <NODENAME> - The Rabbit remote node name to use."
echo " Defaults to rabbit."
echo " --remote-host <HOSTNAME> - The Rabbit remote host name to use."
@@ -16,10 +20,11 @@ function show_options () {
# RabbitMQ database is tied to the system hostname.
LOCAL_HOST="$(hostname)"
CLEAN_ON_ERROR="0"
REMOTE_NODE="rabbit"
REMOTE_HOST=""
TEMP=$(getopt -o h -l help,remote-node:,remote-host: -n "${SCRIPT_NAME}" -- "${@}")
TEMP=$(getopt -o h -l help,remote-node:,remote-host:,clean-on-error -n "${SCRIPT_NAME}" -- "${@}")
[ ${?} -ne 0 ] && { echo "Terminating..." >&2; exit 1; };
# Note the quotes around "$TEMP": they are essential!
@@ -27,6 +32,7 @@ eval set -- "${TEMP}"
while true ; do
case "${1}" in
--clean-on-error) CLEAN_ON_ERROR="1"; shift ;;
--remote-node) REMOTE_NODE="${2}"; shift 2 ;;
--remote-host) REMOTE_HOST="${2}"; shift 2 ;;
-h | --help) show_options 0 ;;
@@ -40,8 +46,12 @@ function join_cluster_with() {
local remote_node="${1}"
local remote_host="${2}"
rabbitmqctl stop_app
# We could already be in a cluster with this node so do not fail.
rabbitmqctl join_cluster "${remote_node}@${remote_host}" 2>/dev/null || true
# If we are in the cluster already or have just joined we may need to
# update our status to become running. In all error cases we return true
# so we can test the node cluster status later.
{ rabbitmqctl join_cluster "${remote_node}@${remote_host}" &&
rabbitmqctl update_cluster_nodes "${remote_node}@${remote_host}"; } ||
true
rabbitmqctl start_app
}
@@ -51,10 +61,21 @@ function join_cluster_with() {
if ! rabbitmq_is_in_cluster --check-host "${REMOTE_HOST}"; then
{ join_cluster_with "${REMOTE_NODE}" "${REMOTE_HOST}" &&
rabbitmq_is_in_cluster --check-host "${REMOTE_HOST}"; } ||
RET_VAL=${?}
RET_VAL=${?}
if [ ${RET_VAL:-0} -ne 0 ]; then
echo "Failed to join host [${LOCAL_HOST}] with [${REMOTE_NODE}@${REMOTE_HOST}]..." >&2
if [ ${CLEAN_ON_ERROR} -eq 1 ]; then
# Try to leave the cluster gracefully and unregister with
# REMOTE_HOST. This reset will try to inform the peers that we are
# leaving. Even if this succeeds, we might be holding messages
# from a corrupt remote node.
rabbitmq_reset_node || true
# Now that we've at least tried to sync our messages out to the cluster,
# simply wipe the DB. --force-reset does not communicate with the
# peers about the node exiting the cluster.
rabbitmq_reset_node --force-reset || true
fi
exit ${RET_VAL}
fi
fi

View File

@@ -5,15 +5,20 @@ set -o pipefail
SCRIPT_NAME="$(basename $0)"
function show_options () {
echo "Usage: ${SCRIPT_NAME}"
echo "Usage: ${SCRIPT_NAME} [options]"
echo "Let the local RabbitMQ node gracefully exit any cluster"
echo "and clear down the local mnesia database. At the end of"
echo "the process an attempt is made to restart the RabbitMQ"
echo "Erlang pocesses even if a failure was encounted."
echo "Options:"
echo " --force-reset - Forcefully return the node to"
echo " its virgin state."
exit ${1}
}
TEMP=$(getopt -o h -l help -n "${SCRIPT_NAME}" -- "${@}")
RESET_OPTION="reset"
TEMP=$(getopt -o h -l help,force-reset -n "${SCRIPT_NAME}" -- "${@}")
[ $? -ne 0 ] && { echo "Terminating..." >&2; exit 1; };
# Note the quotes around "$TEMP": they are essential!
@@ -21,7 +26,8 @@ eval set -- "${TEMP}"
while true ; do
case "${1}" in
-h | --help) show_options 0;;
--force-reset) RESET_OPTION="force_reset"; shift ;;
-h | --help) show_options 0 ;;
--) shift ; break ;;
*) echo "Error: unsupported option ${1}." >&2 ; exit 1 ;;
esac
@@ -29,12 +35,13 @@ done
function reset_node() {
local reset_option="${1}"
rabbitmqctl stop_app
# "reset" syncs all data into the cluster, then removes this node, cleaning
# local mnesia; "force_reset" wipes the local DB without telling the peers.
rabbitmqctl reset
rabbitmqctl "${reset_option}"
}
if ! reset_node; then
if ! reset_node "${RESET_OPTION}"; then
RET_VAL=${?}
echo "Failed: Node has failed to correctly exit cluster" >&2
rabbitmqctl start_app ||

View File

@@ -109,7 +109,7 @@ if [ "${LOCAL_RABBIT_HOST}" == "${BOOTSTRAP_NODE}" ]; then
# Try to join with each node in turn.
COUNT=$(( (${COUNT:-0} + 1) % ${TOTAL_NODES} ))
if [ ${COUNT} -ne ${NODE_INDEX} ]; then
rabbitmq_join_cluster --remote-host "${NODES[${COUNT}]}" || true
rabbitmq_join_cluster --remote-host "${NODES[${COUNT}]}" --clean-on-error || true
fi
done
@@ -127,7 +127,7 @@ else
echo "Waiting for bootstrap node to initialise the cluster..."
sleep 10
done
rabbitmq_join_cluster --remote-host "${BOOTSTRAP_NODE}"
rabbitmq_join_cluster --remote-host "${BOOTSTRAP_NODE}" --clean-on-error
fi
# Make sure that all queues (except those with auto-generated names) are