stx-monitor elasticsearch readiness probe enhancements

Modify the readiness probe in the elasticsearch chart so that two sets of cluster health parameters can be configured. One set is used when no connection to the master is possible or there are fewer than 2 elasticsearch data nodes in the cluster. The other is used when a connection to the master is possible and 2 or more elasticsearch data nodes are present in the cluster. This is necessary because in various cluster recovery scenarios, and on AIO-SX, only local node health should be checked, whereas when modifying elasticsearch data nodes we need to ensure that no recovery is in progress before moving on to update the next data node.

Change-Id: I8125c3c5b87e081a00907c519e3d513c74031c70
Closes-Bug: 1869001
Signed-off-by: Kevin Smith <kevin.smith@windriver.com>
2020-03-25 09:27:15 -04:00 · 2020-03-25 09:27:15 -04:00 · bdfd8a2abd
parent 687fb8584f
commit bdfd8a2abd
3 changed files with 94 additions and 1 deletions
--- a/monitor-helm-elastic/centos/monitor-helm-elastic.spec
+++ b/monitor-helm-elastic/centos/monitor-helm-elastic.spec
@@ -23,6 +23,7 @@ Patch02: 0002-Add-compatibility-for-k8s-1.16.patch
 Patch03: 0003-use-oss-image.patch
 Patch04: 0004-Update-to-Elastic-7.4.0-Release.patch
 Patch05: 0005-set-initial-masters-to-master-0.patch
+Patch06: 0006-readiness-probe-enhancements.patch

 BuildRequires: helm

@@ -36,6 +37,7 @@ Monitor Helm elasticsearch charts
 %patch03 -p1
 %patch04 -p1
 %patch05 -p1
+%patch06 -p1

 %build
 # initialize helm and build the toolkit
--- a/monitor-helm-elastic/files/0006-readiness-probe-enhancements.patch
+++ b/monitor-helm-elastic/files/0006-readiness-probe-enhancements.patch
@@ -0,0 +1,91 @@
+From 36ea0e2a2fd6cf6ac8cb19411c14c5ef4d0618f9 Mon Sep 17 00:00:00 2001
+From: Kevin Smith <kevin.smith@windriver.com>
+Date: Mon, 23 Mar 2020 10:43:07 -0400
+Subject: [PATCH 1/1] readiness probe enhancements
+
+---
+ elasticsearch/templates/statefulset.yaml | 46 +++++++++++++++++++++++++++-----
+ elasticsearch/values.yaml                |  2 ++
+ 2 files changed, 41 insertions(+), 7 deletions(-)
+
+diff --git a/elasticsearch/templates/statefulset.yaml b/elasticsearch/templates/statefulset.yaml
+index e17d39e..483e1f4 100644
+--- a/elasticsearch/templates/statefulset.yaml
+++ b/elasticsearch/templates/statefulset.yaml
+@@ -194,7 +194,7 @@ spec:
+                 # If the node is starting up wait for the cluster to be ready (request params: '{{ .Values.clusterHealthCheckParams }}' )
+                 # Once it has started only check that the node itself is responding
+                 START_FILE=/tmp/.es_start_file
+-
+               
+                 http () {
+                     local path="${1}"
+                     if [ -n "${ELASTIC_USERNAME}" ] && [ -n "${ELASTIC_PASSWORD}" ]; then
+@@ -209,13 +209,45 @@ spec:
+                     echo 'Elasticsearch is already running, lets check the node is healthy'
+                     http "/"
+                 else
+-                    echo 'Waiting for elasticsearch cluster to become cluster to be ready (request params: "{{ .Values.clusterHealthCheckParams }}" )'
+-                    if http "/_cluster/health?{{ .Values.clusterHealthCheckParams }}" ; then
+-                        touch ${START_FILE}
+-                        exit 0
+                    DATA_NODE=$(printenv node.data)
+                    if [[ "$DATA_NODE" == true ]]; then
+                        # This is a data node, check for health depending on whether we can
+                        # reach the master node and how many data nodes there are.
+                        DATA_NODE_COUNT=$(http "/_cat/nodes?master_timeout=1s" | grep -c data)
+                        echo "data node count = $DATA_NODE_COUNT"
+                        if [[ $DATA_NODE_COUNT -gt 1 ]]; then
+                            # We connected to master and there is more than one data node.
+                            echo 'Waiting for elasticsearch cluster to become ready (request params: "{{ .Values.clusterHealthCheckParams }}" )'
+                            if http "/_cluster/health?{{ .Values.clusterHealthCheckParams }}" ; then
+                                touch ${START_FILE}
+                                exit 0
+                            else
+                                echo 'Cluster is not yet ready (request params: "{{ .Values.clusterHealthCheckParams }}" )'
+                                exit 1
+                            fi
+                        else
+                            # Cannot connect to the master or we are the only data node
+                            # found. Could be DOR, AIO-SX, other host is locked and we
+                            # experienced a pod restart or other similar scenario.
+                            echo "Cannot connect to master or less than 2 data nodes"
+                            echo 'Waiting for elasticsearch cluster to become ready (request params: "{{ .Values.clusterHealthCheckParamsBasic }}" )'
+                            if http "/_cluster/health?{{ .Values.clusterHealthCheckParamsBasic }}" ; then
+                                touch ${START_FILE}
+                                exit 0
+                            else
+                                echo 'Cluster is not yet ready (request params: "{{ .Values.clusterHealthCheckParamsBasic }}" )'
+                                exit 1
+                            fi
+                        fi
+                     else
+-                        echo 'Cluster is not yet ready (request params: "{{ .Values.clusterHealthCheckParams }}" )'
+-                        exit 1
+                        echo 'Waiting for elasticsearch cluster to become ready (request params: "{{ .Values.clusterHealthCheckParams }}" )'
+                        if http "/_cluster/health?{{ .Values.clusterHealthCheckParams }}" ; then
+                            touch ${START_FILE}
+                            exit 0
+                        else
+                            echo 'Cluster is not yet ready (request params: "{{ .Values.clusterHealthCheckParams }}" )'
+                            exit 1
+                        fi
+                     fi
+                 fi
+         ports:
+diff --git a/elasticsearch/values.yaml b/elasticsearch/values.yaml
+index 0d983eb..ebbae6c 100755
+--- a/elasticsearch/values.yaml
+++ b/elasticsearch/values.yaml
+@@ -197,6 +197,8 @@ readinessProbe:
+ 
+ # https://www.elastic.co/guide/en/elasticsearch/reference/current/cluster-health.html#request-params wait_for_status
+ clusterHealthCheckParams: "wait_for_status=green&timeout=1s"
+# Used for readiness probe when on a data node and only a basic health check is needed.
+clusterHealthCheckParamsBasic: "local=true"
+ 
+ ## Use an alternate scheduler.
+ ## ref: https://kubernetes.io/docs/tasks/administer-cluster/configure-multiple-schedulers/
+-- 
+1.8.3.1
+
--- a/stx-monitor-helm/stx-monitor-helm/manifests/monitor_manifest.yaml
+++ b/stx-monitor-helm/stx-monitor-helm/manifests/monitor_manifest.yaml
@@ -106,7 +106,7 @@ data:
    esMajorVersion: 7
    masterService: 'mon-elasticsearch-data-headless, mon-elasticsearch-master'
    podManagementPolicy: OrderedReady
-    clusterHealthCheckParams: 'local=true'
+    clusterHealthCheckParams: 'wait_for_no_relocating_shards&wait_for_no_initializing_shards&timeout=1s'
    maxUnavailable: 1
    extraEnvs:
      - name: DATA_PRESTOP_SLEEP