Merge "Make ceph-client helm test more PG specific"
This commit is contained in:
commit
e3142120cb
@ -15,6 +15,6 @@ apiVersion: v1
|
|||||||
appVersion: v1.0.0
|
appVersion: v1.0.0
|
||||||
description: OpenStack-Helm Ceph Client
|
description: OpenStack-Helm Ceph Client
|
||||||
name: ceph-client
|
name: ceph-client
|
||||||
version: 0.1.14
|
version: 0.1.15
|
||||||
home: https://github.com/ceph/ceph-client
|
home: https://github.com/ceph/ceph-client
|
||||||
...
|
...
|
||||||
|
@ -246,6 +246,62 @@ function pool_failuredomain_validation() {
|
|||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function check_transient_pgs_file() {
|
||||||
|
current_time=$1
|
||||||
|
pg_failed_list=()
|
||||||
|
|
||||||
|
# Remove the lines NOT having the word "current" as these are the old
|
||||||
|
# PGs that are no longer in transition.
|
||||||
|
sed -i '/current/!d' ${transient_pgs_file}
|
||||||
|
|
||||||
|
# For all remaining lines (PGs currently inactive), check for PGs which
|
||||||
|
# are older than the limit.
|
||||||
|
IFS=$'\n' read -d '' -r -a lines < ${transient_pgs_file} || true
|
||||||
|
for pg_data in "${lines[@]}"; do
|
||||||
|
pg=$(echo ${pg_data} | awk '{print $1}')
|
||||||
|
pg_ts=$(echo ${pg_data} | awk '{print $2}')
|
||||||
|
if [[ $((${current_time} - ${pg_ts})) -gt ${pg_inactive_timeout} ]]; then
|
||||||
|
pg_failed_list+=("${pg}")
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# Remove the current designation for all PGs, as we no longer need it
|
||||||
|
# for this check.
|
||||||
|
sed -i 's/ current//g' ${transient_pgs_file}
|
||||||
|
|
||||||
|
cat ${transient_pgs_file}
|
||||||
|
if [[ ${#pg_failed_list[@]} -gt 0 ]]; then
|
||||||
|
echo "The following PGs have been in a transient state for longer than ${pg_inactive_timeout} seconds:"
|
||||||
|
echo ${pg_failed_list[*]}
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
function update_transient_pgs_file() {
|
||||||
|
pg=$1
|
||||||
|
current_ts=$2
|
||||||
|
|
||||||
|
pg_data=$(grep "${pg} " ${transient_pgs_file} || true)
|
||||||
|
if [[ "${pg_data}" == "" ]]; then
|
||||||
|
echo "${pg} ${current_ts} current" >> ${transient_pgs_file}
|
||||||
|
else
|
||||||
|
# Add the word "current" to the end of the line which has this PG
|
||||||
|
sed -i '/^'"${pg} "'/s/$/ current/' ${transient_pgs_file}
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
function check_transient_pgs() {
|
||||||
|
local -n pg_array=$1
|
||||||
|
|
||||||
|
# Use a temporary transient PGs file to track the amount of time PGs
|
||||||
|
# are spending in a transitional state.
|
||||||
|
now=$(date +%s)
|
||||||
|
for pg in "${pg_array[@]}"; do
|
||||||
|
update_transient_pgs_file ${pg} ${now}
|
||||||
|
done
|
||||||
|
check_transient_pgs_file ${now}
|
||||||
|
}
|
||||||
|
|
||||||
function check_pgs() {
|
function check_pgs() {
|
||||||
pgs_transitioning=false
|
pgs_transitioning=false
|
||||||
|
|
||||||
@ -260,6 +316,9 @@ function check_pgs() {
|
|||||||
echo ${stuck_pgs[*]}
|
echo ${stuck_pgs[*]}
|
||||||
# Not a critical error - yet
|
# Not a critical error - yet
|
||||||
pgs_transitioning=true
|
pgs_transitioning=true
|
||||||
|
|
||||||
|
# Check to see if any transitioning PG has been stuck for too long
|
||||||
|
check_transient_pgs stuck_pgs
|
||||||
else
|
else
|
||||||
# Examine the PGs that have non-active states. Consider those PGs that
|
# Examine the PGs that have non-active states. Consider those PGs that
|
||||||
# are in a "premerge" state to be similar to active. "premerge" PGs may
|
# are in a "premerge" state to be similar to active. "premerge" PGs may
|
||||||
@ -268,10 +327,10 @@ function check_pgs() {
|
|||||||
|
|
||||||
# If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
|
# If the inactive pgs file is non-empty, there are some inactive pgs in the cluster.
|
||||||
inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
|
inactive_pgs=(`cat ${inactive_pgs_file} | awk -F "\"" '/pgid/{print $4}'`)
|
||||||
echo "There is at least one inactive pg in the cluster: "
|
echo "This is the list of inactive pgs in the cluster: "
|
||||||
echo ${inactive_pgs[*]}
|
echo ${inactive_pgs[*]}
|
||||||
|
|
||||||
echo "Very likely the cluster is rebalancing or recovering some PG's. Checking..."
|
echo "Checking to see if the cluster is rebalancing or recovering some PG's..."
|
||||||
|
|
||||||
# Check for PGs that are down. These are critical errors.
|
# Check for PGs that are down. These are critical errors.
|
||||||
down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
|
down_pgs=(`cat ${inactive_pgs_file} | grep -B1 'down' | awk -F "\"" '/pgid/{print $4}'`)
|
||||||
@ -311,6 +370,9 @@ function check_pgs() {
|
|||||||
echo "This is normal but will wait a while to verify the PGs are not stuck in a transient state."
|
echo "This is normal but will wait a while to verify the PGs are not stuck in a transient state."
|
||||||
# not critical, just wait
|
# not critical, just wait
|
||||||
pgs_transitioning=true
|
pgs_transitioning=true
|
||||||
|
|
||||||
|
# Check to see if any transitioning PG has been stuck for too long
|
||||||
|
check_transient_pgs transient_pgs
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
@ -319,9 +381,11 @@ function pg_validation() {
|
|||||||
retries=0
|
retries=0
|
||||||
time_between_retries=3
|
time_between_retries=3
|
||||||
max_retries=60
|
max_retries=60
|
||||||
|
pg_inactive_timeout=30
|
||||||
pgs_transitioning=false
|
pgs_transitioning=false
|
||||||
stuck_pgs_file=$(mktemp -p /tmp)
|
stuck_pgs_file=$(mktemp -p /tmp)
|
||||||
inactive_pgs_file=$(mktemp -p /tmp)
|
inactive_pgs_file=$(mktemp -p /tmp)
|
||||||
|
transient_pgs_file=$(mktemp -p /tmp)
|
||||||
|
|
||||||
# Check this over a period of retries. Fail/stop if any critical errors found.
|
# Check this over a period of retries. Fail/stop if any critical errors found.
|
||||||
while check_pgs && [[ "${pgs_transitioning}" == "true" ]] && [[ retries -lt ${max_retries} ]]; do
|
while check_pgs && [[ "${pgs_transitioning}" == "true" ]] && [[ retries -lt ${max_retries} ]]; do
|
||||||
@ -330,11 +394,11 @@ function pg_validation() {
|
|||||||
((retries=retries+1))
|
((retries=retries+1))
|
||||||
done
|
done
|
||||||
|
|
||||||
# If peering PGs haven't gone active after retries have expired, fail
|
# Check if transitioning PGs have gone active after retries have expired
|
||||||
if [[ retries -ge ${max_retries} ]]; then
|
if [[ retries -ge ${max_retries} ]]; then
|
||||||
((timeout_sec=${time_between_retries}*${max_retries}))
|
((timeout_sec=${time_between_retries}*${max_retries}))
|
||||||
echo "Some PGs have not become active or have been stuck after ${timeout_sec} seconds. Exiting..."
|
echo "Some PGs have not become active after ${timeout_sec} seconds. Exiting..."
|
||||||
exit 1
|
# This is ok, as the autoscaler might still be adjusting the PGs.
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,4 +15,5 @@ ceph-client:
|
|||||||
- 0.1.12 Disable autoscaling before pools are created
|
- 0.1.12 Disable autoscaling before pools are created
|
||||||
- 0.1.13 Fix ceph-client helm test
|
- 0.1.13 Fix ceph-client helm test
|
||||||
- 0.1.14 Allow Ceph RBD pool job to leave failed pods
|
- 0.1.14 Allow Ceph RBD pool job to leave failed pods
|
||||||
|
- 0.1.15 Make ceph-client helm test more PG specific
|
||||||
...
|
...
|
||||||
|
Loading…
Reference in New Issue
Block a user