Elasticsearch: Update Rolling Restart Procedure

This change implements the recommended rolling restart procedure[0] for elasticsearch-data pods.

[0] https://www.elastic.co/guide/en/elasticsearch/reference/7.x/restart-cluster.html#restart-cluster-rolling

Change-Id: I935b3681999e9bda616898f2b5e01f582ee54ed9
2020-06-04 03:48:46 -05:00 · 2020-06-04 03:48:46 -05:00 · 309278389e
commit 309278389e
parent b62a46336c
1 changed files with 46 additions and 23 deletions
--- a/elasticsearch/templates/bin/_elasticsearch.sh.tpl
+++ b/elasticsearch/templates/bin/_elasticsearch.sh.tpl
@ -34,19 +34,29 @@ function stop () {
  kill -TERM 1
 }
 # Block until this node is listed by the cluster's _cat/nodes endpoint.
 #
 # Polls "${ELASTICSEARCH_ENDPOINT}/_cat/nodes" every 5 seconds and returns as
 # soon as a response line contains ${NODE_NAME} as a whole word.
 # Globals:   ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD,
 #            ELASTICSEARCH_ENDPOINT, NODE_NAME (all read)
 # Returns:   0 once the node is listed; 1 if NODE_NAME is empty or unset.
 function wait_to_join() {
   local joined=""

   # Without a name there is nothing to look for; fail loudly instead of
   # polling forever.
   if [ -z "${NODE_NAME:-}" ]; then
     echo "NODE_NAME is not set; cannot wait for the node to join" >&2
     return 1
   fi

   while true; do
     # -K- makes curl read the --user option from stdin (the here-string).
     # grep -F matches the node name literally (no regex metacharacters) and
     # -w keeps e.g. "data-1" from matching "data-10". "|| true" turns a
     # no-match into an empty result so we simply poll again.
     joined=$(curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" "${ELASTICSEARCH_ENDPOINT}/_cat/nodes" | grep -wF -- "${NODE_NAME}" || true)
     if [ -n "${joined}" ]; then
       return 0
     fi
     sleep 5
   done
 }
 # Post-start step for a data node. This block is shown as a diff: lines
 # prefixed with "-" are the removed implementation, lines prefixed with "+"
 # are the added one.
 #
 # New behavior: if the marker file /data/restarting exists, remove it, call
 # wait_to_join until this node is back in the cluster, then PUT a persistent
 # cluster setting that resets cluster.routing.allocation.enable to null (the
 # log message describes this as re-enabling replica shard allocation).
 # In every case it finishes by logging that the node is ready to be used.
 # Globals: ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD,
 #          ELASTICSEARCH_ENDPOINT, NODE_NAME (all read)
 function allocate_data_node () {
-  CLUSTER_SETTINGS=$(curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" \
+  if [ -f /data/restarting ]; then
-    "${ELASTICSEARCH_ENDPOINT}/_cluster/settings")
+    rm /data/restarting
-  if echo "${CLUSTER_SETTINGS}" | grep -E "${NODE_NAME}"; then
+    echo "Node ${NODE_NAME} has restarted. Waiting to rejoin the cluster."
-    echo "Activate node ${NODE_NAME}"
+    wait_to_join
-    curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" -XPUT -H 'Content-Type: application/json' \
+
    echo "Re-enabling Replica Shard Allocation"
    curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" -XPUT -H 'Content-Type: application/json' \
     "${ELASTICSEARCH_ENDPOINT}/_cluster/settings" -d "{
-      \"transient\" :{
+      \"persistent\": {
-          \"cluster.routing.allocation.exclude._name\" : null
+        \"cluster.routing.allocation.enable\": null
      }
    }"
  fi
  echo "Node ${NODE_NAME} is ready to be used"
 }
 function start_master_node () {
@ -76,24 +86,37 @@ function start_data_node () {
  allocate_data_node &
  /usr/local/bin/docker-entrypoint.sh elasticsearch &
  # Prepare this data node for shutdown using the rolling-restart procedure.
  # (Diff view: "-" lines are the removed implementation, "+" lines are added.)
  #
  # Steps performed by the new implementation:
  #   1. PUT a persistent cluster setting restricting
  #      cluster.routing.allocation.enable to "primaries".
  #   2. Read the cluster version and POST a flush: _flush/synced for versions
  #      below 7.6, plain _flush otherwise.
  #   3. Create the marker file /data/restarting, which allocate_data_node
  #      checks for on the next start.
  #   4. Send SIGTERM to PID 1.
  # Globals: ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD,
  #          ELASTICSEARCH_ENDPOINT, NODE_NAME (all read)
  function drain_data_node () {
-    echo "Prepare to migrate data off node ${NODE_NAME}"
+
-    echo "Move all data from node ${NODE_NAME}"
+    # Implement the Rolling Restart Protocol Described Here:
-    curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" -XPUT -H 'Content-Type: application/json' \
+    # https://www.elastic.co/guide/en/elasticsearch/reference/7.x/restart-cluster.html#restart-cluster-rolling
    echo "Disabling Replica Shard Allocation"
    curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" -XPUT -H 'Content-Type: application/json' \
     "${ELASTICSEARCH_ENDPOINT}/_cluster/settings" -d "{
-      \"transient\" :{
+      \"persistent\": {
-          \"cluster.routing.allocation.exclude._name\" : \"${NODE_NAME}\"
+        \"cluster.routing.allocation.enable\": \"primaries\"
      }
    }"
-    echo ""
+
-    while true ; do
+    # If version < 7.6 use _flush/synced; otherwise use _flush
-      echo -e "Wait for node ${NODE_NAME} to become empty"
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-synced-flush-api.html#indices-synced-flush-api
-      SHARDS_ALLOCATION=$(curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" \
+
-        -XGET "${ELASTICSEARCH_ENDPOINT}/_cat/shards")
+    version=$(curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" "${ELASTICSEARCH_ENDPOINT}/" | jq -r .version.number)
-      if ! echo "${SHARDS_ALLOCATION}" | grep -E "${NODE_NAME}"; then
+
-        break
    # Compare major.minor numerically. A substring match on "7.1" would miss
    # 6.x and 7.0/7.2-7.5 and would wrongly match 7.10+ (or "6.7.1"). A version
    # that cannot be parsed (e.g. "null" after a failed request) falls through
    # to the plain _flush.
+    if [[ $version =~ ^([0-9]+)\.([0-9]+) ]] && (( BASH_REMATCH[1] < 7 || (BASH_REMATCH[1] == 7 && BASH_REMATCH[2] < 6) )); then
      action="_flush/synced"
    else
      action="_flush"
    fi
-      sleep 5
+
-    done
+    curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" -XPOST "${ELASTICSEARCH_ENDPOINT}/$action"
    # TODO: Check the response of synced flush operations to make sure there are no failures.
    # Synced flush operations that fail due to pending indexing operations are listed in the response body,
    # although the request itself still returns a 200 OK status. If there are failures, reissue the request.
    # (The only side effect of not doing so is slower start up times. See flush documentation linked above)
    touch /data/restarting
    echo "Node ${NODE_NAME} is ready to shutdown"
    kill -TERM 1
  }