Make ceph-client helm test more PG specific

This patchset makes the current ceph-client helm test more specific
about checking each of the PGs that are transitioning through inactive
states during the test. If any single PG spends more than 30 seconds in
any of these inactive states (peering, activating, creating, unknown,
etc), then the test will fail.

Also, once the three-minute PG checking period has expired, we will
no longer fail the helm test, as it is very possible that the autoscaler
could still be adjusting the PGs for several minutes after a deployment
is done.

Change-Id: I7f3209b7b3399feb7bec7598e6e88d7680f825c4
This commit is contained in:
Parsons, Cliff (cp769u) 2021-04-16 19:51:10 +00:00
parent 704d808514
commit 7bb5ff5502
3 changed files with 71 additions and 6 deletions

View File

@ -15,6 +15,6 @@ apiVersion: v1
appVersion: v1.0.0
description: OpenStack-Helm Ceph Client
name: ceph-client
version: 0.1.14
version: 0.1.15
home: https://github.com/ceph/ceph-client
...

View File

@ -246,6 +246,62 @@ function pool_failuredomain_validation() {
done
}
# Examine the transient-PGs tracking file and fail the helm test if any
# PG has stayed in a transient (inactive) state longer than the limit.
# Globals:   transient_pgs_file (scratch file, lines "<pgid> <ts> [current]")
#            pg_inactive_timeout (max allowed seconds in a transient state)
# Arguments: $1 - current epoch timestamp
# Outputs:   prints the pruned tracking file to stdout
# Exits:     1 if any PG exceeded pg_inactive_timeout
function check_transient_pgs_file() {
  local current_time=$1
  local pg_failed_list=()
  # Remove the lines NOT having the word "current" as these are the old
  # PGs that are no longer in transition.
  sed -i '/current/!d' "${transient_pgs_file}"
  # For all remaining lines (PGs currently inactive), check for PGs which
  # are older than the limit.
  local lines=()
  mapfile -t lines < "${transient_pgs_file}"
  local pg_data pg pg_ts
  for pg_data in "${lines[@]}"; do
    pg=$(echo "${pg_data}" | awk '{print $1}')
    pg_ts=$(echo "${pg_data}" | awk '{print $2}')
    if [[ $((current_time - pg_ts)) -gt ${pg_inactive_timeout} ]]; then
      pg_failed_list+=("${pg}")
    fi
  done
  # Remove the current designation for all PGs, as we no longer need it
  # for this check; it is re-added on the next sweep for PGs still inactive.
  sed -i 's/ current//g' "${transient_pgs_file}"
  cat "${transient_pgs_file}"
  if [[ ${#pg_failed_list[@]} -gt 0 ]]; then
    echo "The following PGs have been in a transient state for longer than ${pg_inactive_timeout} seconds:"
    echo "${pg_failed_list[*]}"
    exit 1
  fi
}
# Record that PG $1 is currently in a transient state. A PG not yet
# tracked gets a new line "<pgid> <ts> current"; a PG already tracked
# keeps its original timestamp and is only re-marked "current".
# Globals:   transient_pgs_file (scratch tracking file)
# Arguments: $1 - PG id (e.g. "1.2f"), $2 - current epoch timestamp
function update_transient_pgs_file() {
  local pg=$1
  local current_ts=$2
  # Exact match on the first field. The previous unanchored
  # 'grep "${pg} "' let PG "1.1" match the line for PG "11.1", so a
  # genuinely new PG was never added and its timeout never enforced.
  local pg_data
  pg_data=$(awk -v p="${pg}" '$1 == p' "${transient_pgs_file}")
  if [[ -z "${pg_data}" ]]; then
    echo "${pg} ${current_ts} current" >> "${transient_pgs_file}"
  else
    # Escape regex dots so "1.1" cannot match "171" in the sed address,
    # then add the word "current" to the end of this PG's line.
    local pg_re=${pg//./\\.}
    sed -i '/^'"${pg_re}"' /s/$/ current/' "${transient_pgs_file}"
  fi
}
# Sweep the named array of transitioning PGs: stamp each one into the
# tracking file, then fail the test (via check_transient_pgs_file's
# exit 1) if any PG has been transient longer than pg_inactive_timeout.
# Arguments: $1 - NAME of an array variable holding transient PG ids
#            (passed by reference; 'local -n' needs bash 4.3+ namerefs)
function check_transient_pgs() {
  local -n pg_array=$1
  # Use a temporary transient PGs file to track the amount of time PGs
  # are spending in a transitional state.
  # NOTE(review): 'now' and 'pg' are not declared local, so they leak
  # into the global scope — presumably harmless here, but confirm no
  # caller depends on reading them afterwards.
  now=$(date +%s)
  for pg in "${pg_array[@]}"; do
    update_transient_pgs_file ${pg} ${now}
  done
  check_transient_pgs_file ${now}
}
function check_pgs() {
pgs_transitioning=false
@ -260,6 +316,9 @@ function check_pgs() {
echo ${stuck_pgs[*]}
# Not a critical error - yet
pgs_transitioning=true
# Check to see if any transitioning PG has been stuck for too long
check_transient_pgs stuck_pgs
else
# Examine the PGs that have non-active states. Consider those PGs that
# are in a "premerge" state to be similar to active. "premerge" PGs may
@ -268,10 +327,10 @@ function check_pgs() {
# If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
echo "There is at least one inactive pg in the cluster: "
echo "This is the list of inactive pgs in the cluster: "
echo ${inactive_pgs[*]}
echo "Very likely the cluster is rebalancing or recovering some PG's. Checking..."
echo "Checking to see if the cluster is rebalancing or recovering some PG's..."
# Check for PGs that are down. These are critical errors.
down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
@ -311,6 +370,9 @@ function check_pgs() {
echo "This is normal but will wait a while to verify the PGs are not stuck in a transient state."
# not critical, just wait
pgs_transitioning=true
# Check to see if any transitioning PG has been stuck for too long
check_transient_pgs transient_pgs
fi
fi
}
@ -319,9 +381,11 @@ function pg_validation() {
retries=0
time_between_retries=3
max_retries=60
pg_inactive_timeout=30
pgs_transitioning=false
stuck_pgs_file=$(mktemp -p /tmp)
inactive_pgs_file=$(mktemp -p /tmp)
transient_pgs_file=$(mktemp -p /tmp)
# Check this over a period of retries. Fail/stop if any critical errors found.
while check_pgs && [[ "${pgs_transitioning}" == "true" ]] && [[ retries -lt ${max_retries} ]]; do
@ -330,11 +394,11 @@ function pg_validation() {
((retries=retries+1))
done
# If peering PGs haven't gone active after retries have expired, fail
# Check if transitioning PGs have gone active after retries have expired
if [[ retries -ge ${max_retries} ]]; then
((timeout_sec=${time_between_retries}*${max_retries}))
echo "Some PGs have not become active or have been stuck after ${timeout_sec} seconds. Exiting..."
exit 1
echo "Some PGs have not become active after ${timeout_sec} seconds. Exiting..."
# This is ok, as the autoscaler might still be adjusting the PGs.
fi
}

View File

@ -15,4 +15,5 @@ ceph-client:
- 0.1.12 Disable autoscaling before pools are created
- 0.1.13 Fix ceph-client helm test
- 0.1.14 Allow Ceph RBD pool job to leave failed pods
- 0.1.15 Make ceph-client helm test more PG specific
...