Merge "[ceph-client] Fix a helm test issue and disable PG autoscaler"

This commit is contained in:
Zuul 2020-07-31 22:09:17 +00:00 committed by Gerrit Code Review
commit f79704a8f0
3 changed files with 49 additions and 4 deletions

View File

@ -24,7 +24,37 @@ function check_cluster_status() {
if [ "x${ceph_health_status}" == "xHEALTH_OK" ]; then
echo "Ceph status is HEALTH_OK"
else
echo "Ceph cluster status is NOT HEALTH_OK."
echo "Ceph cluster status is not HEALTH_OK, checking PG states"
retries=0
# If all PGs are active, pass
# This grep is just as robust as jq and is Ceph-version agnostic unlike jq
while [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v "active") ]] && [[ retries -lt 60 ]]; do
# If all inactive PGs are peering, wait for peering to complete
# Run 'ceph pg ls' again before failing in case PG states have changed
if [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v -e "active" -e "peering") ]]; then
# If inactive PGs aren't peering, fail
echo "Failure, found inactive PGs that aren't peering"
exit 1
fi
sleep 3
((retries=retries+1))
done
# If peering PGs haven't gone active after retries have expired, fail
if [[ retries -ge 60 ]]; then
echo "PGs appear to be stuck peering"
exit 1
fi
fi
}
function check_recovery_flags() {
  # Abort the test (exit 1) when any OSD flag that blocks recovery of
  # degraded PGs (norecover / nobackfill / norebalance) is currently set.
  echo "### Start: Checking for flags that will prevent recovery"
  if [[ -n "$(ceph osd stat | grep "norecover\|nobackfill\|norebalance")" ]]; then
    # Show the offending status before bailing out.
    ceph osd stat
    echo "Flags are set that prevent recovery of degraded PGs"
    exit 1
  fi
}
@ -257,3 +287,4 @@ pool_validation
# Top-level driver sequence: validate pools and failure domains, then verify
# overall cluster health and that no recovery-blocking OSD flags are set.
pool_failuredomain_validation
check_failure_domain_count_per_pool
check_cluster_status
check_recovery_flags

View File

@ -46,6 +46,17 @@ function wait_for_inactive_pgs () {
fi
}
function check_recovery_flags () {
  # Fail the job if any OSD flag that prevents recovery of degraded PGs
  # (norecover, nobackfill, norebalance) is set on the cluster.
  echo "### Start: Checking for flags that will prevent recovery"
  local blocking_flags
  # '|| true' keeps a no-match grep from tripping an errexit shell.
  blocking_flags="$(ceph osd stat | grep "norecover\|nobackfill\|norebalance" || true)"
  if [[ -n "${blocking_flags}" ]]; then
    ceph osd stat
    echo "Flags are set that prevent recovery of degraded PGs"
    exit 1
  fi
}
function check_osd_count() {
echo "#### Start: Checking OSD count ####"
noup_flag=$(ceph osd stat | awk '/noup/ {print $2}')
@ -119,10 +130,12 @@ function reweight_osds () {
done
}
function enable_autoscaling () {
function enable_or_disable_autoscaling () {
  # Toggle the mgr PG autoscaler according to ENABLE_AUTOSCALER:
  # "true" enables the module and makes autoscaling the pool default;
  # any other value (or unset) disables the module entirely.
  case "${ENABLE_AUTOSCALER}" in
    true)
      ceph mgr module enable pg_autoscaler
      ceph config set global osd_pool_default_pg_autoscale_mode on
      ;;
    *)
      ceph mgr module disable pg_autoscaler
      ;;
  esac
}
@ -232,7 +245,7 @@ reweight_osds
cluster_capacity=0
# Nautilus-and-later `ceph df` prints a TOTAL row; older releases use a
# different layout, hence the two parsing branches below. Capacity is
# normalized to bytes via numfmt --from=iec.
if [[ -z "$(ceph osd versions | grep ceph\ version | grep -v nautilus)" ]]; then
cluster_capacity=$(ceph --cluster "${CLUSTER}" df | grep "TOTAL" | awk '{print $2 substr($3, 1, 1)}' | numfmt --from=iec)
# NOTE(review): the next two lines are the removed/added sides of the same
# diff line — enable_autoscaling was renamed to enable_or_disable_autoscaling.
enable_autoscaling
enable_or_disable_autoscaling
else
cluster_capacity=$(ceph --cluster "${CLUSTER}" df | head -n3 | tail -n1 | awk '{print $1 substr($2, 1, 1)}' | numfmt --from=iec)
fi
@ -253,3 +266,4 @@ ceph --cluster "${CLUSTER}" osd crush tunables {{ .Values.conf.pool.crush.tunabl
{{- end }}
# Final steps: wait for all PGs to settle, then ensure no OSD flags are set
# that would block recovery of degraded PGs.
wait_for_inactive_pgs
check_recovery_flags

View File

@ -254,7 +254,7 @@ conf:
features:
mds: true
mgr: true
# NOTE(review): the two pg_autoscaler lines below are the removed/added sides
# of the same diff line — this change flips the default from true to false.
pg_autoscaler: true
pg_autoscaler: false
cluster_flags:
# List of flags to set or unset separated by spaces
set: ""