Improve rabbitmq_join_cluster on failure.
Add a '--clean-on-error' option to rabbitmq_join_cluster and a '--force-reset' option to rabbitmq_reset_node. This allows the local DB to be forcefully cleared down when a cluster join fails, as the local DB may be left in an inconsistent state. In conjunction with the change in 51-rabbitmq (where the cluster formation is orchestrated), this allows a node that has failed to join a cluster to retry more safely. Finally, after we join the cluster, use `update_cluster_nodes` to ensure the local node is fully synced with the cluster. Change-Id: Ie53dbab52f0ead23f204dec418056f47480bb20a
This commit is contained in:
@@ -8,6 +8,10 @@ function show_options () {
|
||||
echo "Usage: ${SCRIPT_NAME} [options]"
|
||||
echo "Tells the local RabbitMQ to join the remote node's cluster."
|
||||
echo "Options:"
|
||||
echo " --clean-on-error - Attempts to clear down the local"
|
||||
echo " Mnesia database on failure to join"
|
||||
echo " a cluster, cleaning all persistent"
|
||||
echo " messages."
|
||||
echo " --remote-node <NODENAME> - The Rabbit remote node name to use."
|
||||
echo " Defaults to rabbit."
|
||||
echo " --remote-host <HOSTNAME> - The Rabbit remote host name to use."
|
||||
@@ -16,10 +20,11 @@ function show_options () {
|
||||
|
||||
# RabbitMQ database is tied to the system hostname.
|
||||
LOCAL_HOST="$(hostname)"
|
||||
CLEAN_ON_ERROR="0"
|
||||
REMOTE_NODE="rabbit"
|
||||
REMOTE_HOST=""
|
||||
|
||||
TEMP=$(getopt -o h -l help,remote-node:,remote-host: -n "${SCRIPT_NAME}" -- "${@}")
|
||||
TEMP=$(getopt -o h -l help,remote-node:,remote-host:,clean-on-error -n "${SCRIPT_NAME}" -- "${@}")
|
||||
[ ${?} -ne 0 ] && { echo "Terminating..." >&2; exit 1; };
|
||||
|
||||
# Note the quotes around "$TEMP": they are essential!
|
||||
@@ -27,6 +32,7 @@ eval set -- "${TEMP}"
|
||||
|
||||
while true ; do
|
||||
case "${1}" in
|
||||
--clean-on-error) CLEAN_ON_ERROR="1"; shift ;;
|
||||
--remote-node) REMOTE_NODE="${2}"; shift 2 ;;
|
||||
--remote-host) REMOTE_HOST="${2}"; shift 2 ;;
|
||||
-h | --help) show_options 0 ;;
|
||||
@@ -40,8 +46,12 @@ function join_cluster_with() {
|
||||
local remote_node="${1}"
|
||||
local remote_host="${2}"
|
||||
rabbitmqctl stop_app
|
||||
# We could already be in a cluster with this node so do not fail.
|
||||
rabbitmqctl join_cluster "${remote_node}@${remote_host}" 2>/dev/null || true
|
||||
# If we are in the cluster already or have just joined we may need to
|
||||
# update our status to become running. In all error cases we return true
|
||||
# so we can test the node cluster status later.
|
||||
{ rabbitmqctl join_cluster "${remote_node}@${remote_host}" &&
|
||||
rabbitmqctl update_cluster_nodes "${remote_node}@${remote_host}"; } ||
|
||||
true
|
||||
rabbitmqctl start_app
|
||||
}
|
||||
|
||||
@@ -51,10 +61,21 @@ function join_cluster_with() {
|
||||
if ! rabbitmq_is_in_cluster --check-host "${REMOTE_HOST}"; then
|
||||
{ join_cluster_with "${REMOTE_NODE}" "${REMOTE_HOST}" &&
|
||||
rabbitmq_is_in_cluster --check-host "${REMOTE_HOST}"; } ||
|
||||
RET_VAL=${?}
|
||||
RET_VAL=${?}
|
||||
|
||||
if [ ${RET_VAL:-0} -ne 0 ]; then
|
||||
echo "Failed to join host [${LOCAL_HOST}] with [${REMOTE_NODE}@${REMOTE_HOST}]..." >&2
|
||||
if [ ${CLEAN_ON_ERROR} -eq 1 ]; then
|
||||
# Try to leave the cluster gracefully and unregister with
|
||||
# REMOTE_HOST. This reset will try to inform the peers that we are
|
||||
# leaving. Even if this succeeds, we might be holding messages
|
||||
# from a corrupt remote node.
|
||||
rabbitmq_reset_node || true
|
||||
# Now we've at least tried to sync our messages out to the cluster,
|
||||
# simply wipe the DB. --force-reset does not communicate with the
|
||||
# peers about the node exiting the cluster.
|
||||
rabbitmq_reset_node --force-reset || true
|
||||
fi
|
||||
exit ${RET_VAL}
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -5,15 +5,20 @@ set -o pipefail
|
||||
SCRIPT_NAME="$(basename $0)"
|
||||
|
||||
function show_options () {
|
||||
echo "Usage: ${SCRIPT_NAME}"
|
||||
echo "Usage: ${SCRIPT_NAME} [options]"
|
||||
echo "Let the local RabbitMQ node gracefully exit any cluster"
|
||||
echo "and clear down the local mnesia database. At the end of"
|
||||
echo "the process an attempt is made to restart the RabbitMQ"
|
||||
echo "Erlang pocesses even if a failure was encounted."
|
||||
echo "Options:"
|
||||
echo " --force-reset - Forcefully return the node to"
|
||||
echo " its virgin state."
|
||||
exit ${1}
|
||||
}
|
||||
|
||||
TEMP=$(getopt -o h -l help -n "${SCRIPT_NAME}" -- "${@}")
|
||||
RESET_OPTION="reset"
|
||||
|
||||
TEMP=$(getopt -o h -l help,force-reset -n "${SCRIPT_NAME}" -- "${@}")
|
||||
[ $? -ne 0 ] && { echo "Terminating..." >&2; exit 1; };
|
||||
|
||||
# Note the quotes around "$TEMP": they are essential!
|
||||
@@ -21,7 +26,8 @@ eval set -- "${TEMP}"
|
||||
|
||||
while true ; do
|
||||
case "${1}" in
|
||||
-h | --help) show_options 0;;
|
||||
--force-reset) RESET_OPTION="force_reset"; shift ;;
|
||||
-h | --help) show_options 0 ;;
|
||||
--) shift ; break ;;
|
||||
*) echo "Error: unsupported option ${1}." >&2 ; exit 1 ;;
|
||||
esac
|
||||
@@ -29,12 +35,13 @@ done
|
||||
|
||||
|
||||
function reset_node() {
|
||||
local reset_option="${1}"
|
||||
rabbitmqctl stop_app
|
||||
# This syncs all data into the cluster, then removes this node, cleaning local mnesia.
|
||||
rabbitmqctl reset
|
||||
rabbitmqctl "${reset_option}"
|
||||
}
|
||||
|
||||
if ! reset_node; then
|
||||
if ! reset_node "${RESET_OPTION}"; then
|
||||
RET_VAL=${?}
|
||||
echo "Failed: Node has failed to correctly exit cluster" >&2
|
||||
rabbitmqctl start_app ||
|
||||
|
||||
@@ -109,7 +109,7 @@ if [ "${LOCAL_RABBIT_HOST}" == "${BOOTSTRAP_NODE}" ]; then
|
||||
# Try to join with each node in turn.
|
||||
COUNT=$(( (${COUNT:-0} + 1) % ${TOTAL_NODES} ))
|
||||
if [ ${COUNT} -ne ${NODE_INDEX} ]; then
|
||||
rabbitmq_join_cluster --remote-host "${NODES[${COUNT}]}" || true
|
||||
rabbitmq_join_cluster --remote-host "${NODES[${COUNT}]}" --clean-on-error || true
|
||||
fi
|
||||
done
|
||||
|
||||
@@ -127,7 +127,7 @@ else
|
||||
echo "Waiting for bootstrap node to initialise the cluster..."
|
||||
sleep 10
|
||||
done
|
||||
rabbitmq_join_cluster --remote-host "${BOOTSTRAP_NODE}"
|
||||
rabbitmq_join_cluster --remote-host "${BOOTSTRAP_NODE}" --clean-on-error
|
||||
fi
|
||||
|
||||
# Make sure that all queues (except those with auto-generated names) are
|
||||
|
||||
Reference in New Issue
Block a user