Rabbit: Eradicate potential crashes in wait job while upgrading cluster

When upgrading or reconfiguring a RabbitMQ cluster, it's possible that the nodes will not return the cluster status for some time. This patch set allows the wait job to cope with this much more gracefully, rather than simply crashing a few times before proceeding.

Change-Id: Ibf525df9e3a9362282f70e5dbb136430734181fd
Signed-off-by: Pete Birley <pete@port.direct>
2019-07-18 14:15:45 -05:00 · 2019-07-18 14:15:45 -05:00 · af270934d4
commit af270934d4
parent 2c8b18aeb8
1 changed files with 4 additions and 0 deletions
--- a/rabbitmq/templates/bin/_rabbitmq-wait-for-cluster.sh.tpl
+++ b/rabbitmq/templates/bin/_rabbitmq-wait-for-cluster.sh.tpl
@@ -59,6 +59,10 @@ function sorted_node_list () {
 if test "$(active_rabbit_nodes)" -gt "$RABBIT_REPLICA_COUNT"; then
    echo "There are more nodes registed in the cluster than desired, pruning the cluster"
    PRIMARY_NODE="$(sorted_node_list | awk '{ print $1; exit }')"
+    until rabbitmqctl -l -n "${PRIMARY_NODE}" cluster_status >/dev/null 2>&1 ; do
+      echo "Waiting for primary node to return cluster status"
+      sleep 10
+    done
    echo "Current cluster:"
    rabbitmqctl -l -n "${PRIMARY_NODE}" cluster_status
    NODES_TO_REMOVE="$(sorted_node_list | awk "{print substr(\$0, index(\$0,\$$((RABBIT_REPLICA_COUNT+1))))}")"