From 970c23acf433377eb6ff389679736a93f98aff2e Mon Sep 17 00:00:00 2001
From: "Parsons, Cliff (cp769u)"
Date: Tue, 12 Jan 2021 15:43:19 +0000
Subject: [PATCH] Improvements for ceph-client helm tests

This commit introduces the following helm test improvements for the
ceph-client chart:

1) Reworks the pg_validation function so that it allows some time for
   peering PGs to finish peering, but fails if any other critical
   errors are seen. The actual PG validation was split out into a
   function called check_pgs(), and the pg_validation function manages
   the looping aspects.
2) The check_cluster_status function now calls pg_validation if the
   cluster status is not OK. This is very similar to what was happening
   before, except now the logic is not repeated.

Change-Id: I65906380817441bd2ff9ff9cfbf9586b6fdd2ba7
---
 ceph-client/Chart.yaml                       |  2 +-
 ceph-client/templates/bin/_helm-tests.sh.tpl | 99 +++++++++++++++-----
 2 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/ceph-client/Chart.yaml b/ceph-client/Chart.yaml
index 8bb63485b..e9941c152 100644
--- a/ceph-client/Chart.yaml
+++ b/ceph-client/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph Client
 name: ceph-client
-version: 0.1.3
+version: 0.1.4
 home: https://github.com/ceph/ceph-client
 ...
diff --git a/ceph-client/templates/bin/_helm-tests.sh.tpl b/ceph-client/templates/bin/_helm-tests.sh.tpl
index 0f749f1c0..a12c9b71d 100755
--- a/ceph-client/templates/bin/_helm-tests.sh.tpl
+++ b/ceph-client/templates/bin/_helm-tests.sh.tpl
@@ -25,25 +25,7 @@ function check_cluster_status() {
     echo "Ceph status is HEALTH_OK"
   else
     echo "Ceph cluster status is not HEALTH_OK, checking PG states"
-    retries=0
-    # If all PGs are active, pass
-    # This grep is just as robust as jq and is Ceph-version agnostic unlike jq
-    while [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v "active") ]] && [[ retries -lt 60 ]]; do
-      # If all inactive PGs are peering, wait for peering to complete
-      # Run 'ceph pg ls' again before failing in case PG states have changed
-      if [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v -e "active" -e "peering") ]]; then
-        # If inactive PGs aren't peering, fail
-        echo "Failure, found inactive PGs that aren't peering"
-        exit 1
-      fi
-      sleep 3
-      ((retries=retries+1))
-    done
-    # If peering PGs haven't gone active after retries have expired, fail
-    if [[ retries -ge 60 ]]; then
-      echo "PGs appear to be stuck peering"
-      exit 1
-    fi
+    pg_validation
   fi
 }

@@ -264,12 +246,81 @@ function pool_failuredomain_validation() {
   done
 }

-function pg_validation() {
-  ceph pg ls
-  inactive_pgs=(`ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v "active" | grep -B1 '"state":' | awk -F "\"" '/pgid/{print $4}'`)
-  if [ ${#inactive_pgs[*]} -gt 0 ];then
-    echo "There are few incomplete pgs in the cluster"
+function check_pgs() {
+  pgs_transitioning=false
+
+  ceph --cluster ${CLUSTER} pg dump_stuck -f json-pretty > ${stuck_pgs_file}
+
+  # Check if there are any stuck PGs, which could indicate a serious problem
+  # if it does not resolve itself soon.
+  stuck_pgs=(`cat ${stuck_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
+  if [[ ${#stuck_pgs[*]} -gt 0 ]]; then
+    # We have at least one stuck pg
+    echo "Some PGs are stuck: "
+    echo ${stuck_pgs[*]}
+    # Not a critical error - yet
+    pgs_transitioning=true
+  else
+    ceph --cluster ${CLUSTER} pg ls -f json-pretty | grep '"pgid":\|"state":' | grep -v "active" | grep -B1 '"state":' > ${inactive_pgs_file} || true
+
+    # If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
+    inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
+    echo "There is at least one inactive pg in the cluster: "
     echo ${inactive_pgs[*]}
+
+    echo "Very likely the cluster is rebalancing or recovering some PGs. Checking..."
+
+    down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
+    if [[ ${#down_pgs[*]} -gt 0 ]]; then
+      # Some PGs could be down. This is a really bad situation and the test must fail.
+      echo "Some PGs are down: "
+      echo ${down_pgs[*]}
+      echo "This is a critical error, exiting."
+      exit 1
+    fi
+
+    non_peer_recover_pgs=(`cat ${inactive_pgs_file} | grep '"state":' | grep -v -E 'peer|recover' || true`)
+    if [[ ${#non_peer_recover_pgs[*]} -gt 0 ]]; then
+      # Some PGs could be inactive and not peering. Better to fail.
+      echo "We are unsure what's happening: we don't have down/stuck PGs,"
+      echo "but we have some inactive pgs that are not peering/recovering: "
+      pg_list=(`sed -n '/recover\|peer/{s/.*//;x;d;};x;p;${x;p;}' ${inactive_pgs_file} | sed '/^$/d' | awk -F "\"" '/pgid/{print $4}'`)
+      echo ${pg_list[*]}
+      # Critical error. Fail/exit the script
+      exit 1
+    fi
+
+    peer_recover_pgs=(`cat ${inactive_pgs_file} | grep -B1 -E 'peer|recover' | awk -F "\"" '/pgid/{print $4}'`)
+    if [[ ${#peer_recover_pgs[*]} -gt 0 ]]; then
+      # Some PGs are not in an active state but peering and/or the cluster is recovering
+      echo "Some PGs are peering and/or cluster is recovering: "
+      echo ${peer_recover_pgs[*]}
+      echo "This is normal, but we will wait a while to verify the PGs are not stuck in peering."
+      # not critical, just wait
+      pgs_transitioning=true
+    fi
+  fi
+}
+
+function pg_validation() {
+  retries=0
+  time_between_retries=3
+  max_retries=60
+  pgs_transitioning=false
+  stuck_pgs_file=$(mktemp -p /tmp)
+  inactive_pgs_file=$(mktemp -p /tmp)
+
+  # Check this over a period of retries. Fail/stop if any critical errors are found.
+  while check_pgs && [[ "${pgs_transitioning}" == "true" ]] && [[ retries -lt ${max_retries} ]]; do
+    echo "Sleep for a bit waiting on the pg(s) to become active/unstuck..."
+    sleep ${time_between_retries}
+    ((retries=retries+1))
+  done
+
+  # If peering PGs haven't gone active after retries have expired, fail
+  if [[ retries -ge ${max_retries} ]]; then
+    ((timeout_sec=${time_between_retries}*${max_retries}))
+    echo "Some PGs have not become active or have been stuck after ${timeout_sec} seconds. Exiting..."
     exit 1
   fi
 }
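
Reviewer note (not part of the patch): below is a minimal standalone sketch of the
polling pattern that pg_validation now uses (run a check, then wait and retry while
PGs are still transitioning, failing once the retry budget is spent). The CLUSTER
default, the simplified "any non-active state" check, and the retry values are
illustrative assumptions, not taken from the chart itself.

#!/bin/bash
# Illustrative sketch only: poll a simplified PG check until the cluster
# settles or the retry budget is exhausted. All values here are assumptions.
CLUSTER=${CLUSTER:-ceph}
max_retries=60
time_between_retries=3

check_pgs_sketch() {
  pgs_transitioning=false
  # Treat any non-active PG state as "still transitioning" (simplified check).
  if ceph --cluster "${CLUSTER}" pg ls -f json-pretty | grep '"state":' | grep -qv "active"; then
    pgs_transitioning=true
  fi
}

retries=0
while check_pgs_sketch && [[ "${pgs_transitioning}" == "true" ]] && [[ ${retries} -lt ${max_retries} ]]; do
  sleep "${time_between_retries}"
  ((retries=retries+1))
done

if [[ ${retries} -ge ${max_retries} ]]; then
  echo "PGs did not settle within $((max_retries * time_between_retries)) seconds"
  exit 1
fi

In practice the reworked script runs inside the chart's helm test pod; running
"helm test ceph-client" against the deployed release (release name assumed) would
exercise the same code path end to end.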