ceph-init-wrapper: Detect stuck peering OSDs and restart them

OSDs might become stuck peering.
Recover from such state.

Closes-bug: 1851287

Change-Id: I2ef1a0e93d38c3d041ee0c5c1e66a4ac42785a68
Signed-off-by: Dan Voiculeasa <dan.voiculeasa@windriver.com>
This commit is contained in:
Dan Voiculeasa 2019-11-21 15:01:36 +02:00
parent dcacc409f4
commit 11fd5d9cd4

View File

@ -156,8 +156,9 @@ log_and_restart_blocked_osds ()
{
# Log info about the blocked osd daemons and then restart it
local names=$1
local message=$2
for name in $names; do
wlog $name "INFO" "Restarting OSD with blocked operations"
wlog $name "INFO" "$message"
${CEPH_SCRIPT} restart $name
done
}
@ -253,6 +254,7 @@ status ()
erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
stuck_peering_procs=`echo "$result" | sort | uniq | awk ' /stuck peering/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
invalid=0
host=`hostname`
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
@ -267,14 +269,12 @@ status ()
fi
done
log_and_restart_blocked_osds $blocked_ops_procs
log_and_restart_blocked_osds "$blocked_ops_procs"\
"Restarting OSD with blocked operations"
log_and_restart_blocked_osds "$stuck_peering_procs"\
"Restarting OSD stuck peering"
log_and_kill_hung_procs $hung_procs
hung_procs_text=""
for i in $(echo $hung_procs); do
hung_procs_text+="$i(process hung) "
done
rm -f $CEPH_STATUS_FAILURE_TEXT_FILE
if [ $invalid -eq 0 ]; then
text=""