[ceph-osd] Allow for unconditional OSD restart

This change allows OSDs to be restarted unconditionally by the
ceph-osd chart. This can be useful in upgrade scenarios where
ceph-osd pods are unhealthy during the upgrade.

Change-Id: I6de98db2b4eb1d76411e1dbffa65c263de3aecee
Stephen Taylor 2022-04-04 13:35:49 -06:00
parent 50063c809c
commit 76fb2562c6
5 changed files with 39 additions and 25 deletions
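
Since the new behavior is opt-in, the simplest way to exercise it is to set the new value when upgrading the release. A minimal sketch, assuming a release named ceph-osd deployed from a local chart path into a ceph namespace (all three are placeholders; the value path comes from the values.yaml hunk below):

# Enable unconditional OSD restarts for this upgrade run (release name, chart
# path, and namespace are placeholders).
helm upgrade ceph-osd ./ceph-osd --namespace ceph \
  --set conf.storage.unconditional_osd_restart="true"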


@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph OSD
 name: ceph-osd
-version: 0.1.38
+version: 0.1.39
 home: https://github.com/ceph/ceph
 ...


@@ -188,31 +188,37 @@ done
 echo "Latest revision of the helm chart(s) is : $max_release"
 
-if [[ $max_release -gt 1 ]]; then
-  if [[ $require_upgrade -gt 0 ]]; then
-    if [[ "$DISRUPTIVE_OSD_RESTART" == "true" ]]; then
-      echo "restarting all osds simultaneously"
-      kubectl -n $CEPH_NAMESPACE delete pod -l component=osd
-      sleep 60
-      echo "waiting for pgs to become active and for degraded objects to recover"
-      wait_for_pgs
-      wait_for_degraded_objects
-      ceph -s
-    else
-      echo "waiting for inactive pgs and degraded objects before upgrade"
-      wait_for_pgs
-      wait_for_degraded_and_misplaced_objects
-      ceph -s
-      ceph osd "set" noout
-      echo "lets restart the osds rack by rack"
-      restart_by_rack
-      ceph osd "unset" noout
+# If flags are set that will prevent recovery, don't restart OSDs
+ceph -s | grep "noup\|noin\|nobackfill\|norebalance\|norecover" > /dev/null
+if [[ $? -ne 0 ]]; then
+  if [[ "$UNCONDITIONAL_OSD_RESTART" == "true" ]] || [[ $max_release -gt 1 ]]; then
+    if [[ "$UNCONDITIONAL_OSD_RESTART" == "true" ]] || [[ $require_upgrade -gt 0 ]]; then
+      if [[ "$DISRUPTIVE_OSD_RESTART" == "true" ]]; then
+        echo "restarting all osds simultaneously"
+        kubectl -n $CEPH_NAMESPACE delete pod -l component=osd
+        sleep 60
+        echo "waiting for pgs to become active and for degraded objects to recover"
+        wait_for_pgs
+        wait_for_degraded_objects
+        ceph -s
+      else
+        echo "waiting for inactive pgs and degraded objects before upgrade"
+        wait_for_pgs
+        wait_for_degraded_and_misplaced_objects
+        ceph -s
+        ceph osd "set" noout
+        echo "lets restart the osds rack by rack"
+        restart_by_rack
+        ceph osd "unset" noout
+      fi
     fi
-  fi
 
-  #lets check all the ceph-osd daemonsets
-  echo "checking DS"
-  check_ds
+    #lets check all the ceph-osd daemonsets
+    echo "checking DS"
+    check_ds
+  else
+    echo "No revisions found for upgrade"
+  fi
 else
-  echo "No revisions found for upgrade"
+  echo "Skipping OSD restarts because flags are set that would prevent recovery"
 fi
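
The guard added at the top of this block applies to every path, upgrade-driven or unconditional: the job greps ceph -s for the noup, noin, nobackfill, norebalance and norecover flags and skips all restarts if any of them is set, since they would prevent the cluster from recovering afterwards. A standalone sketch of the same check, with an illustrative helper name that is not part of the chart:

# Succeed only when no recovery-blocking flags are set; mirrors the grep used
# by the post-apply script above. osd_restart_allowed is an illustrative name.
osd_restart_allowed() {
  if ceph -s | grep -q "noup\|noin\|nobackfill\|norebalance\|norecover"; then
    echo "Skipping OSD restarts because flags are set that would prevent recovery"
    return 1
  fi
  return 0
}

osd_restart_allowed && echo "safe to restart OSDs"

Note that the non-disruptive path is unchanged apart from the new wrapping conditions: it still sets noout, restarts OSDs rack by rack, and unsets noout, so OSDs are not marked out and rebalanced while their pods are recreated.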


@@ -104,6 +104,8 @@ spec:
              value: {{ .Values.conf.ceph.target.required_percent_of_osds | ceil | quote }}
            - name: DISRUPTIVE_OSD_RESTART
              value: {{ .Values.conf.storage.disruptive_osd_restart | quote }}
+            - name: UNCONDITIONAL_OSD_RESTART
+              value: {{ .Values.conf.storage.unconditional_osd_restart | quote }}
          command:
            - /tmp/post-apply.sh
          volumeMounts:
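
The new value is wired into the post-apply job as an environment variable next to DISRUPTIVE_OSD_RESTART. One way to confirm how it renders is to template the chart locally, sketched here with placeholder release name and chart path:

# Render the chart and inspect the environment passed to the post-apply job.
helm template ceph-osd ./ceph-osd \
  --set conf.storage.unconditional_osd_restart="true" \
  | grep -A1 "name: UNCONDITIONAL_OSD_RESTART"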


@@ -293,6 +293,11 @@ conf:
     # OSD restarts more quickly with disruption.
     disruptive_osd_restart: "false"
+    # The post-apply job will try to determine if OSDs need to be restarted and
+    # only restart them if necessary. Set this value to "true" to restart OSDs
+    # unconditionally.
+    unconditional_osd_restart: "false"
     # NOTE(portdirect): for heterogeneous clusters the overrides section can be used to define
     # OSD pods that will be deployed upon specifc nodes.
     # overrides:
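
The two storage flags compose: unconditional_osd_restart forces a restart even when the job concludes that none is needed, while disruptive_osd_restart decides whether that restart happens all at once or rack by rack under noout. A sketch of an override file enabling both, with illustrative file, release, and namespace names:

# Write an override enabling both flags, then apply it with a normal upgrade.
cat > osd-restart-values.yaml <<EOF
conf:
  storage:
    unconditional_osd_restart: "true"
    disruptive_osd_restart: "true"
EOF
helm upgrade ceph-osd ./ceph-osd --namespace ceph -f osd-restart-values.yaml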


@@ -39,4 +39,5 @@ ceph-osd:
   - 0.1.36 Add OSD device location pre-check
   - 0.1.37 Add a disruptive OSD restart to the post-apply job
   - 0.1.38 Skip pod wait in post-apply job when disruptive
+  - 0.1.39 Allow for unconditional OSD restart
 ...