Merge "[ceph-client] Fix a helm test issue and disable PG autoscaler"

This commit is contained in:
Zuul 2020-07-31 22:09:17 +00:00 committed by Gerrit Code Review
commit f79704a8f0
3 changed files with 49 additions and 4 deletions

View File

@ -24,7 +24,37 @@ function check_cluster_status() {
if [ "x${ceph_health_status}" == "xHEALTH_OK" ]; then
echo "Ceph status is HEALTH_OK"
else
echo "Ceph cluster status is NOT HEALTH_OK."
echo "Ceph cluster status is not HEALTH_OK, checking PG states"
retries=0
# If all PGs are active, pass
# This grep is just as robust as jq and is Ceph-version agnostic unlike jq
while [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v "active") ]] && [[ retries -lt 60 ]]; do
# If all inactive PGs are peering, wait for peering to complete
# Run 'ceph pg ls' again before failing in case PG states have changed
if [[ $(ceph pg ls -f json-pretty | grep '"state":' | grep -v -e "active" -e "peering") ]]; then
# If inactive PGs aren't peering, fail
echo "Failure, found inactive PGs that aren't peering"
exit 1
fi
sleep 3
((retries=retries+1))
done
# If peering PGs haven't gone active after retries have expired, fail
if [[ retries -ge 60 ]]; then
echo "PGs appear to be stuck peering"
exit 1
fi
fi
}
function check_recovery_flags() {
  # Abort the test (exit 1) when any OSD flag that blocks recovery of
  # degraded PGs (norecover / nobackfill / norebalance) is currently set.
  echo "### Start: Checking for flags that will prevent recovery"
  if [[ -n "$(ceph osd stat | grep "norecover\|nobackfill\|norebalance")" ]]; then
    # Show the offending status before bailing out.
    ceph osd stat
    echo "Flags are set that prevent recovery of degraded PGs"
    exit 1
  fi
}
@ -257,3 +287,4 @@ pool_validation
# Top-level driver sequence: validate pools and failure domains, then verify
# overall cluster health and that no recovery-blocking OSD flags are set.
pool_failuredomain_validation
check_failure_domain_count_per_pool
check_cluster_status
check_recovery_flags

View File

@ -46,6 +46,17 @@ function wait_for_inactive_pgs () {
fi
}
function check_recovery_flags () {
  # Fail the job if any OSD flag that prevents recovery of degraded PGs
  # (norecover, nobackfill, norebalance) is set on the cluster.
  echo "### Start: Checking for flags that will prevent recovery"
  local blocking_flags
  # '|| true' keeps a no-match grep from tripping an errexit shell.
  blocking_flags="$(ceph osd stat | grep "norecover\|nobackfill\|norebalance" || true)"
  if [[ -n "${blocking_flags}" ]]; then
    ceph osd stat
    echo "Flags are set that prevent recovery of degraded PGs"
    exit 1
  fi
}
function check_osd_count() {
echo "#### Start: Checking OSD count ####"
noup_flag=$(ceph osd stat | awk '/noup/ {print $2}')
@ -119,10 +130,12 @@ function reweight_osds () {
done
}
function enable_autoscaling () {
function enable_or_disable_autoscaling () {
  # Toggle the mgr PG autoscaler according to ENABLE_AUTOSCALER:
  # "true" enables the module and makes autoscaling the pool default;
  # any other value (or unset) disables the module entirely.
  case "${ENABLE_AUTOSCALER}" in
    true)
      ceph mgr module enable pg_autoscaler
      ceph config set global osd_pool_default_pg_autoscale_mode on
      ;;
    *)
      ceph mgr module disable pg_autoscaler
      ;;
  esac
}
@ -232,7 +245,7 @@ reweight_osds
cluster_capacity=0
# Nautilus-and-later `ceph df` prints a TOTAL row; older releases use a
# different layout, hence the two parsing branches below. Capacity is
# normalized to bytes via numfmt --from=iec.
if [[ -z "$(ceph osd versions | grep ceph\ version | grep -v nautilus)" ]]; then
cluster_capacity=$(ceph --cluster "${CLUSTER}" df | grep "TOTAL" | awk '{print $2 substr($3, 1, 1)}' | numfmt --from=iec)
# NOTE(review): the next two lines are the removed/added sides of the same
# diff line — enable_autoscaling was renamed to enable_or_disable_autoscaling.
enable_autoscaling
enable_or_disable_autoscaling
else
cluster_capacity=$(ceph --cluster "${CLUSTER}" df | head -n3 | tail -n1 | awk '{print $1 substr($2, 1, 1)}' | numfmt --from=iec)
fi
@ -253,3 +266,4 @@ ceph --cluster "${CLUSTER}" osd crush tunables {{ .Values.conf.pool.crush.tunabl
{{- end }}
# Final steps: wait for all PGs to settle, then ensure no OSD flags are set
# that would block recovery of degraded PGs.
wait_for_inactive_pgs
check_recovery_flags

View File

@ -254,7 +254,7 @@ conf:
features:
mds: true
mgr: true
# NOTE(review): the two pg_autoscaler lines below are the removed/added sides
# of the same diff line — this change flips the default from true to false.
pg_autoscaler: true
pg_autoscaler: false
cluster_flags:
# List of flags to set or unset separated by spaces
set: ""