From 57b1f3905ba249259d2fe3b5b6fc12620920e12f Mon Sep 17 00:00:00 2001 From: Steven Fitzpatrick Date: Fri, 10 Jul 2020 14:31:22 -0500 Subject: [PATCH] Elasticsearch - Cluster Wait Function Improvements This change modifies the cluster wait function to check the cluster health status explicitly. Once a status of at least "yellow" has been reached, the Elasticsearch cluster should be able to facilitate the API calls required by the other jobs of this chart. Change-Id: I2660422a8e8122186d648042f5422ca9a82d23c7 --- .../templates/bin/_es-cluster-wait.sh.tpl | 98 ++----------------- .../job-register-snapshot-repository.yaml | 2 - elasticsearch/values.yaml | 5 +- 3 files changed, 10 insertions(+), 95 deletions(-) diff --git a/elasticsearch/templates/bin/_es-cluster-wait.sh.tpl b/elasticsearch/templates/bin/_es-cluster-wait.sh.tpl index da4f6e16a..d4ae9ac11 100644 --- a/elasticsearch/templates/bin/_es-cluster-wait.sh.tpl +++ b/elasticsearch/templates/bin/_es-cluster-wait.sh.tpl @@ -13,96 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/}} -function check_master_nodes() { - numMasterNodes=0 - expectedMasterNodes={{ .Values.pod.replicas.master | int64 }} - while [ "$numMasterNodes" -ne "$expectedMasterNodes" ] - do - currentMasterNodes=$(curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" \ - "${ELASTICSEARCH_HOST}/_cat/nodes?format=json&pretty" | jq -r '.[] | select(.name|test("elasticsearch-master.")) | .name') - numMasterNodes=$(echo $currentMasterNodes | wc -w) - if [ "$numMasterNodes" -ne "$expectedMasterNodes" ] - then - if [ "$numMasterNodes" -eq 0 ] - then - echo "No Elasticsearch master nodes accounted for: 0/${expectedMasterNodes}" - else - echo "Not all Elasticsearch master nodes accounted for and ready: (${numMasterNodes} / ${expectedMasterNodes})" - echo "$currentMasterNodes" - fi - echo "Sleeping for 10 seconds before next check" - echo "" - sleep 10 - fi - done - echo "All Elasticsearch master nodes accounted for and ready: (${numMasterNodes} / ${expectedMasterNodes})" - echo "$currentMasterNodes" - echo "" -} - -function check_data_nodes() { - numDataNodes=0 - expectedDataNodes={{ .Values.pod.replicas.data | int64 }} - while [ "$numDataNodes" -ne "$expectedDataNodes" ] - do - currentDataNodes=$(curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" \ - "${ELASTICSEARCH_HOST}/_cat/nodes?format=json&pretty" | jq -r '.[] | select(.name|test("elasticsearch-data.")) | .name') - numDataNodes=$(echo $currentDataNodes | wc -w) - if [ "$numDataNodes" -ne "$expectedDataNodes" ] - then - if [ "$numDataNodes" -eq 0 ] - then - echo "No Elasticsearch data nodes accounted for: 0/${expectedDataNodes}" - else - echo "Not all Elasticsearch data nodes accounted for and ready: (${numDataNodes} / ${expectedDataNodes})" - echo "$currentDataNodes" - fi - echo "Sleeping for 10 seconds before next check" - echo "" - sleep 10 - fi - done - echo "All Elasticsearch data nodes accounted for and ready: (${numDataNodes} / ${expectedDataNodes})" - echo "$currentDataNodes" 
- echo "" -} - -function check_client_nodes() { - numClientNodes=0 - expectedClientNodes={{ .Values.pod.replicas.client | int64 }} - while [ "$numClientNodes" -ne "$expectedClientNodes" ] - do - currentClientNodes=$(curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" \ - "${ELASTICSEARCH_HOST}/_cat/nodes?format=json&pretty" | jq -r '.[] | select(.name|test("elasticsearch-client.")) | .name') - numClientNodes=$(echo $currentClientNodes | wc -w) - if [ "$numClientNodes" -ne "$expectedClientNodes" ] - then - if [ "$numClientNodes" -eq 0 ] - then - echo "No Elasticsearch client nodes accounted for: 0/${expectedClientNodes}" - else - echo "Not all Elasticsearch client nodes accounted for and ready: (${numClientNodes} / ${expectedClientNodes})" - echo "$currentClientNodes" - fi - echo "Sleeping for 10 seconds before next check" - echo "" - sleep 10 - fi - done - echo "All Elasticsearch client nodes accounted for and ready: (${numClientNodes} / ${expectedClientNodes})" - echo "$currentClientNodes" - echo "" -} - function check_cluster_health() { - clusterHealth=$(curl -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" \ - "${ELASTICSEARCH_HOST}/_cat/health?format=json&pretty") - echo "Elasticsearch cluster health is:" - echo "$clusterHealth" + STATUS=$(curl -s -K- <<< "--user ${ELASTICSEARCH_USERNAME}:${ELASTICSEARCH_PASSWORD}" \ + "${ELASTICSEARCH_HOST}/_cat/health?format=json&pretty" | jq -r '.[].status') + echo "Status: $STATUS" } -sleep 10 -check_data_nodes -check_client_nodes -check_master_nodes check_cluster_health +while [[ $STATUS != "yellow" && $STATUS != "green" ]]; do # an empty or unknown status (failed curl/jq) must keep waiting, just like "red" + echo "Waiting for cluster to become ready." + sleep 30 + check_cluster_health +done +echo "Cluster is ready." 
diff --git a/elasticsearch/templates/job-register-snapshot-repository.yaml b/elasticsearch/templates/job-register-snapshot-repository.yaml index 18a9a303f..e2c24ed0a 100644 --- a/elasticsearch/templates/job-register-snapshot-repository.yaml +++ b/elasticsearch/templates/job-register-snapshot-repository.yaml @@ -28,7 +28,6 @@ metadata: annotations: {{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }} spec: - backoffLimit: {{ .Values.jobs.snapshot_repository.backoffLimit }} template: metadata: labels: @@ -38,7 +37,6 @@ spec: spec: {{ dict "envAll" $envAll "application" "snapshot_repository" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }} serviceAccountName: {{ $serviceAccountName }} - activeDeadlineSeconds: {{ .Values.jobs.snapshot_repository.activeDeadlineSeconds }} restartPolicy: OnFailure nodeSelector: {{ .Values.labels.job.node_selector_key }}: {{ .Values.labels.job.node_selector_value | quote }} diff --git a/elasticsearch/values.yaml b/elasticsearch/values.yaml index 2f4206c18..00684345e 100644 --- a/elasticsearch/values.yaml +++ b/elasticsearch/values.yaml @@ -420,10 +420,7 @@ jobs: failed: 1 es_cluster_wait: backoffLimit: 6 - activeDeadlineSeconds: 600 - snapshot_repository: - backoffLimit: 6 - activeDeadlineSeconds: 600 + activeDeadlineSeconds: 1200 verify_repositories: cron: "*/30 * * * *" history: