Merge "Make ceph-client helm test more PG specific"

2021-04-19 19:19:23 +00:00 · 2021-04-19 19:19:23 +00:00 · e3142120cb
commit e3142120cb
parent 38e6023351 7bb5ff5502
3 changed files with 71 additions and 6 deletions
--- a/ceph-client/Chart.yaml
+++ b/ceph-client/Chart.yaml
@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph Client
 name: ceph-client
-version: 0.1.14
+version: 0.1.15
 home: https://github.com/ceph/ceph-client
 ...
--- a/ceph-client/templates/bin/_helm-tests.sh.tpl
+++ b/ceph-client/templates/bin/_helm-tests.sh.tpl
@ -246,6 +246,62 @@ function pool_failuredomain_validation() {
  done
 }
 # Evaluate the transient-PG tracking file for the current check cycle.
 #
 # Each line of the tracking file has the form "<pg> <timestamp>", with the
 # word "current" appended to the lines that were tagged during this cycle.
 #
 # Globals:
 #   transient_pgs_file  (read)  path of the tracking file; rewritten in place
 #   pg_inactive_timeout (read)  maximum tolerated age, in seconds
 # Arguments:
 #   $1 - timestamp (seconds) to compare the recorded PG timestamps against
 # Outputs:
 #   Prints the surviving tracking-file contents and, on failure, the list
 #   of PGs that exceeded the timeout.
 # Exits:
 #   Terminates the script with status 1 if any PG exceeded the timeout.
 function check_transient_pgs_file() {
  local current_time=$1
  local pg pg_ts _rest
  local -a pg_failed_list=()
  # Drop every line NOT tagged "current": those PGs were not reported in
  # this cycle, so they are no longer in transition.
  sed -i '/current/!d' "${transient_pgs_file}"
  # Every remaining line is a PG that is still transient; collect the ones
  # whose recorded timestamp is older than the allowed limit. Field
  # splitting is done by read itself, avoiding per-line subprocesses.
  while IFS=$' \t' read -r pg pg_ts _rest || [[ -n "${pg}" ]]; do
    [[ -n "${pg}" ]] || continue
    if (( current_time - pg_ts > pg_inactive_timeout )); then
      pg_failed_list+=("${pg}")
    fi
  done < "${transient_pgs_file}"
  # Strip the "current" tag so the next cycle starts from untagged lines.
  sed -i 's/ current//g' "${transient_pgs_file}"
  cat "${transient_pgs_file}"
  if (( ${#pg_failed_list[@]} > 0 )); then
    echo "The following PGs have been in a transient state for longer than ${pg_inactive_timeout} seconds:"
    echo "${pg_failed_list[@]}"
    exit 1
  fi
 }
 # Record that a PG was seen in a transient state during this check cycle.
 #
 # If the PG already has a line in the tracking file, that line keeps its
 # original timestamp and gets the word "current" appended. Otherwise a new
 # line "<pg> <timestamp> current" is appended to the file.
 #
 # Globals:
 #   transient_pgs_file (read) path of the tracking file; modified in place
 # Arguments:
 #   $1 - PG id
 #   $2 - timestamp to record if the PG is not yet tracked
 function update_transient_pgs_file() {
  local pg=$1
  local current_ts=$2
  # Match the PG id literally and anchored at line start. A bare "." in the
  # id would otherwise act as a regex wildcard, and an unanchored search
  # would let e.g. "1.1a" match an existing "11.1a" entry, so the PG would
  # neither be tagged nor added.
  local pg_pattern="^${pg//./[.]} "
  if grep -q -- "${pg_pattern}" "${transient_pgs_file}"; then
    # Already tracked: tag the existing line, preserving its timestamp.
    sed -i "/${pg_pattern}/s/\$/ current/" "${transient_pgs_file}"
  else
    echo "${pg} ${current_ts} current" >> "${transient_pgs_file}"
  fi
 }
 # Track the given PGs as transient for this cycle, then evaluate the
 # tracking file for PGs that have stayed transient for too long.
 #
 # Arguments:
 #   $1 - NAME of an array variable holding PG ids (bound via nameref)
 function check_transient_pgs() {
  local -n pg_array=$1
  local idx
  # Take a single timestamp up front so that every PG handled in this pass,
  # and the final evaluation, all use the same notion of "now".
  now=$(date +%s)
  # Walk the array by index and hand each PG to the tracking-file updater.
  for idx in "${!pg_array[@]}"; do
    pg=${pg_array[${idx}]}
    update_transient_pgs_file ${pg} ${now}
  done
  # With all PGs recorded, run the evaluation against the same timestamp.
  check_transient_pgs_file ${now}
 }
 function check_pgs() {
  pgs_transitioning=false
@ -260,6 +316,9 @@ function check_pgs() {
    echo ${stuck_pgs[*]}
    # Not a critical error - yet
    pgs_transitioning=true
    # Check to see if any transitioning PG has been stuck for too long
    check_transient_pgs stuck_pgs
  else
    # Examine the PGs that have non-active states. Consider those PGs that
    # are in a "premerge" state to be similar to active. "premerge" PGs may
@ -268,10 +327,10 @@ function check_pgs() {
    # If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
    inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
-    echo "There is at least one inactive pg in the cluster: "
+    echo "This is the list of inactive pgs in the cluster: "
    echo ${inactive_pgs[*]}
-    echo "Very likely the cluster is rebalancing or recovering some PG's. Checking..."
+    echo "Checking to see if the cluster is rebalancing or recovering some PG's..."
    # Check for PGs that are down. These are critical errors.
    down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
@ -311,6 +370,9 @@ function check_pgs() {
      echo "This is normal but will wait a while to verify the PGs are not stuck in a transient state."
      # not critical, just wait
      pgs_transitioning=true
      # Check to see if any transitioning PG has been stuck for too long
      check_transient_pgs transient_pgs
    fi
  fi
 }
@ -319,9 +381,11 @@ function pg_validation() {
  retries=0
  time_between_retries=3
  max_retries=60
  pg_inactive_timeout=30
  pgs_transitioning=false
  stuck_pgs_file=$(mktemp -p /tmp)
  inactive_pgs_file=$(mktemp -p /tmp)
  transient_pgs_file=$(mktemp -p /tmp)
  # Check this over a period of retries. Fail/stop if any critical errors found.
  while check_pgs && [[ "${pgs_transitioning}" == "true" ]] && [[ retries -lt ${max_retries} ]]; do
@ -330,11 +394,11 @@ function pg_validation() {
    ((retries=retries+1))
  done
-  # If peering PGs haven't gone active after retries have expired, fail
+  # Check if transitioning PGs have gone active after retries have expired
  if [[ retries -ge ${max_retries} ]]; then
    ((timeout_sec=${time_between_retries}*${max_retries}))
-    echo "Some PGs have not become active or have been stuck after ${timeout_sec} seconds. Exiting..."
+    echo "Some PGs have not become active after ${timeout_sec} seconds. Exiting..."
-    exit 1
+    # This is ok, as the autoscaler might still be adjusting the PGs.
  fi
 }
--- a/releasenotes/notes/ceph-client.yaml
+++ b/releasenotes/notes/ceph-client.yaml
@ -15,4 +15,5 @@ ceph-client:
  - 0.1.12 Disable autoscaling before pools are created
  - 0.1.13 Fix ceph-client helm test
  - 0.1.14 Allow Ceph RBD pool job to leave failed pods
  - 0.1.15 Make ceph-client helm test more PG specific
 ...