Improve rabbitmq_join_cluster on failure.

Add a '--clean-on-error' option to rabbitmq_join_cluster and a '--force-reset' option to rabbitmq_reset_node. Together these allow the local Mnesia DB to be forcefully cleared down when a cluster join fails, as the local DB may be left in an inconsistent state. In conjunction with the change in 51-rabbitmq (where the cluster formation is orchestrated), this allows a node that has failed to join a cluster to retry more safely. Finally, after we join the cluster, use `update_cluster_nodes` to ensure the local node is fully synced with the cluster. Change-Id: Ie53dbab52f0ead23f204dec418056f47480bb20a
2014-11-24 14:58:24 +00:00
parent 9892ef30de
commit 7b7eb69a70
3 changed files with 39 additions and 11 deletions
--- a/elements/rabbitmq-server/bin/rabbitmq_join_cluster
+++ b/elements/rabbitmq-server/bin/rabbitmq_join_cluster
@@ -8,6 +8,10 @@ function show_options () {
    echo "Usage: ${SCRIPT_NAME} [options]"
    echo "Tells the local RabbitMQ to join the remote node's cluster."
    echo "Options:"
+    echo "    --clean-on-error         - Attempts to clear down the local"
+    echo "                               Mnesia database on failure to join"
+    echo "                               a cluster, cleaning all persistent"
+    echo "                               messages."
    echo "    --remote-node <NODENAME> - The Rabbit remote node name to use."
    echo "                               Defaults to rabbit."
    echo "    --remote-host <HOSTNAME> - The Rabbit remote host name to use."
@@ -16,10 +20,11 @@ function show_options () {

 # RabbitMQ database is tied to the system hostname.
 LOCAL_HOST="$(hostname)"
+CLEAN_ON_ERROR="0"
 REMOTE_NODE="rabbit"
 REMOTE_HOST=""

-TEMP=$(getopt -o h -l help,remote-node:,remote-host: -n "${SCRIPT_NAME}" -- "${@}")
+TEMP=$(getopt -o h -l help,remote-node:,remote-host:,clean-on-error -n "${SCRIPT_NAME}" -- "${@}")
 [ ${?} -ne 0 ] && { echo "Terminating..." >&2; exit 1; };

 # Note the quotes around "$TEMP": they are essential!
@@ -27,6 +32,7 @@ eval set -- "${TEMP}"

 while true ; do
    case "${1}" in
+        --clean-on-error) CLEAN_ON_ERROR="1"; shift ;;
        --remote-node) REMOTE_NODE="${2}"; shift 2 ;;
        --remote-host) REMOTE_HOST="${2}"; shift 2 ;;
        -h | --help) show_options 0 ;;
@@ -40,8 +46,12 @@ function join_cluster_with() {
    local remote_node="${1}"
    local remote_host="${2}"
    rabbitmqctl stop_app
-    # We could already be in a cluster with this node so do not fail.
-    rabbitmqctl join_cluster "${remote_node}@${remote_host}" 2>/dev/null || true
+    # If we are in the cluster already or have just joined we may need to
+    # update our status to become running. In all error cases we return true
+    # so we can test the node cluster status later.
+    { rabbitmqctl join_cluster "${remote_node}@${remote_host}" &&
+        rabbitmqctl update_cluster_nodes "${remote_node}@${remote_host}"; } ||
+        true
    rabbitmqctl start_app
 }

@@ -51,10 +61,21 @@ function join_cluster_with() {
 if ! rabbitmq_is_in_cluster --check-host "${REMOTE_HOST}"; then
    { join_cluster_with "${REMOTE_NODE}" "${REMOTE_HOST}" &&
        rabbitmq_is_in_cluster --check-host "${REMOTE_HOST}"; } ||
-            RET_VAL=${?}
+        RET_VAL=${?}

    if [ ${RET_VAL:-0} -ne 0 ]; then
        echo "Failed to join host [${LOCAL_HOST}] with [${REMOTE_NODE}@${REMOTE_HOST}]..." >&2
+        if [ ${CLEAN_ON_ERROR} -eq 1 ]; then
+            # Try to leave the cluster gracefully and unregister with
+            # REMOTE_HOST. This reset will try to inform the peers that we are
+            # leaving. Even if this succeeds, we might be holding messages
+            # from a corrupt remote node.
+            rabbitmq_reset_node || true
+            # Now we've at least tried to sync our messages out to the cluster,
+            # simply wipe the DB. --force-reset does not communicate with the
+            # peers about the node exiting cluster.
+            rabbitmq_reset_node --force-reset || true
+        fi
        exit ${RET_VAL}
    fi
 fi
--- a/elements/rabbitmq-server/bin/rabbitmq_reset_node
+++ b/elements/rabbitmq-server/bin/rabbitmq_reset_node
@@ -5,15 +5,20 @@ set -o pipefail
 SCRIPT_NAME="$(basename $0)"

 function show_options () {
-    echo "Usage: ${SCRIPT_NAME}"
+    echo "Usage: ${SCRIPT_NAME} [options]"
    echo "Let the local RabbitMQ node gracefully exit any cluster"
    echo "and clear down the local mnesia database. At the end of"
    echo "the process an attempt is made to restart the RabbitMQ"
    echo "Erlang pocesses even if a failure was encounted."
+    echo "Options:"
+    echo "    --force-reset         - Forcefully return the node to"
+    echo "                            its virgin state."
    exit ${1}
 }

-TEMP=$(getopt -o h -l help -n "${SCRIPT_NAME}" -- "${@}")
+RESET_OPTION="reset"
+
+TEMP=$(getopt -o h -l help,force-reset -n "${SCRIPT_NAME}" -- "${@}")
 [ $? -ne 0 ] && { echo "Terminating..." >&2; exit 1; };

 # Note the quotes around "$TEMP": they are essential!
@@ -21,7 +26,8 @@ eval set -- "${TEMP}"

 while true ; do
    case "${1}" in
-        -h | --help) show_options 0;;
+        --force-reset) RESET_OPTION="force_reset"; shift ;;
+        -h | --help) show_options 0 ;;
        --) shift ; break ;;
        *) echo "Error: unsupported option ${1}." >&2 ; exit 1 ;;
    esac
@@ -29,12 +35,13 @@ done


 function reset_node() {
+    local reset_option="${1}"
    rabbitmqctl stop_app
    # This syncs all data into the cluster, then removes this node, cleaning local mnesia.
-    rabbitmqctl reset
+    rabbitmqctl "${reset_option}"
 }

-if ! reset_node; then
+if ! reset_node "${RESET_OPTION}"; then
   RET_VAL=${?}
   echo "Failed: Node has failed to correctly exit cluster" >&2
   rabbitmqctl start_app ||
--- a/elements/rabbitmq-server/os-refresh-config/post-configure.d/51-rabbitmq
+++ b/elements/rabbitmq-server/os-refresh-config/post-configure.d/51-rabbitmq
@@ -109,7 +109,7 @@ if [ "${LOCAL_RABBIT_HOST}" == "${BOOTSTRAP_NODE}" ]; then
        # Try to join with each node in turn.
        COUNT=$(( (${COUNT:-0} + 1)  % ${TOTAL_NODES} ))
        if [ ${COUNT} -ne ${NODE_INDEX} ]; then
-            rabbitmq_join_cluster --remote-host "${NODES[${COUNT}]}" || true
+            rabbitmq_join_cluster --remote-host "${NODES[${COUNT}]}" --clean-on-error || true
        fi
    done

@@ -127,7 +127,7 @@ else
        echo "Waiting for bootstrap node to initialise the cluster..."
        sleep 10
    done
-    rabbitmq_join_cluster --remote-host "${BOOTSTRAP_NODE}"
+    rabbitmq_join_cluster --remote-host "${BOOTSTRAP_NODE}" --clean-on-error
 fi

 # Make sure that all queues (except those with auto-generated names) are