Execute one extra attempt to restore SRIOV device plugin

The k8s-pod-recovery service failed to restore the SRIOV device
plugin, which pods that use SRIOV interfaces depend on to create the
resource; such pods carry the label 'restart-on-reboot=true' so that
they are restarted during boot. The failure was observed during an
upgrade and, although rare, it forced the operator to intervene by
manually restarting the pods afterwards.

This change adds a wait for pod stabilization (a pod is considered
stable once its state transitions stop) and, if the plugin is still
failing, executes up to 2 attempts to restore it. Logs were added to
better record the pod state in case of an error.
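
In outline, the added recovery flow probes readiness once, restarts
the plugin on failure, and retries. A condensed, illustrative sketch
(LOG/ERROR stand in for the script's helpers; selectors mirror the
change below):

    # Illustrative sketch only -- condensed from the change below.
    # HOST is assumed to hold this node's name, as in the script.
    LOG()   { echo "INFO: $*"; }
    ERROR() { echo "ERROR: $*" >&2; }
    SELECTOR="--selector=app=sriovdp --field-selector=spec.nodeName=${HOST}"

    for attempt in 1 2; do
        # --timeout=0s turns 'kubectl wait' into a one-shot readiness probe
        kubectl wait pods -n kube-system ${SELECTOR} --for=condition=Ready --timeout=0s && break
        LOG "attempt=${attempt}: restarting the SRIOV device plugin"
        kubectl delete pods -n kube-system ${SELECTOR} --wait=false
        kubectl wait pods -n kube-system ${SELECTOR} --for=condition=Ready --timeout=360s && break
        ERROR "attempt=${attempt}: SRIOV device plugin still not ready"
    done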

Test Plan:
[PASS]  executed 7 upgrades in an AIO-SX lab

Closes-Bug: 1999074

Signed-off-by: Andre Fernando Zanella Kantek <AndreFernandoZanella.Kantek@windriver.com>
Change-Id: I838c35d3e0a3557c71344945a8e00f22ccb50eb4

@@ -95,11 +95,11 @@ function _wait_for_pod_stabilization {
     while [[ $stability_count -lt $stable_cycles ]] ; do
         pods_in_flux=$(KUBECONFIG=/etc/kubernetes/admin.conf kubectl get pods --no-headers --all-namespaces $extra_args | grep -v -e Running -e Completed | wc -l)
         if [[ $pods_in_flux -ne $last_count ]]; then
-            LOG "Waiting on pod transitions to stabilize... $pods_in_flux pods are not Running/Completed"
+            LOG "Waiting on pod transitions to stabilize... $pods_in_flux pods are not Running/Completed, extra_args:'${extra_args}'"
             last_count=$pods_in_flux
             stability_count=0
         else
-            LOG "Pods transitions are stable... for $((stability_count*time_between_polls)) seconds."
+            LOG "Pods transitions are stable... for $((stability_count*time_between_polls)) seconds, extra_args:'${extra_args}'."
             stability_count=$((stability_count+1))
         fi
         sleep $time_between_polls
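
The loop above is a stability counter: the sampled count of
not-Running/Completed pods must hold unchanged for stable_cycles
consecutive polls before the function returns (stable can still mean
a pod parked in a failed state, which is why the retry below exists).
A minimal self-contained sketch of the pattern, with a stub in place
of the kubectl query:

    # Stability-counter pattern: a sampled value must stay unchanged
    # for N consecutive polls before it is considered stable.
    poll_value() { echo 0; }    # stub standing in for the kubectl pod count

    wait_for_stable() {
        local time_between_polls=$1 stable_cycles=$2
        local last_count=-1 stability_count=0
        while [ "${stability_count}" -lt "${stable_cycles}" ]; do
            current=$(poll_value)
            if [ "${current}" -ne "${last_count}" ]; then
                last_count=${current}       # value moved: reset the counter
                stability_count=0
            else
                stability_count=$((stability_count+1))
            fi
            sleep "${time_between_polls}"
        done
    }

    wait_for_stable 1 3    # stable after 3 consecutive unchanged polls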
@@ -195,29 +195,51 @@ function _node_affinity_pods {
 function _labeled_pods {
     # $1: actions <recover|verify>
+    local SRIOVDP_STATUS="kubectl get pods --all-namespaces --no-headers --selector=app=sriovdp -o wide --field-selector=spec.nodeName=${HOST}"
+    local RESTARTPODS_STATUS="kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true"
     if [ "$1" == 'recover' ]; then
         POLLING_INTERVAL=5
         STABILITY_COUNT=6
         _wait_for_pod_stabilization "--selector=restart-on-reboot=true --field-selector=spec.nodeName=${HOST}" $POLLING_INTERVAL $STABILITY_COUNT
-        PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
+        PODS=$(${RESTARTPODS_STATUS} 2>/dev/null | awk '{print $1"/"$2}')
         # Don't have to restart device-plugin if no labeled pods are present. System may not be configured for SRIOV.
         if [ ! -z "${PODS}" ]; then
             LOG "Waiting for SRIOV device plugin pod to become available"
             _wait_for_pod_stabilization "--selector=app=sriovdp --field-selector=spec.nodeName=${HOST}" $POLLING_INTERVAL $STABILITY_COUNT
+            LOG "action $1: SRIOV device plugin pod status:'$(${SRIOVDP_STATUS})'."
+            # the wait for stabilization might finish with the pod not ready (but stable on a failed state)
+            # execute at least 2 attempts to restore it
+            for attempt in 1 2
+            do
-            # Check if device-plugin is ready, but do not wait
-            kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=0s
+                # Check if device-plugin is ready, but do not wait
+                kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=0s
-            # If device plugin is not ready, restart it and wait
-            if [ "$?" -ne 0 ]; then
-                kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false
-                kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s
+                # If device plugin is not ready, restart it and wait
+                if [ "$?" -ne 0 ]; then
-                    ERROR "SRIOV device plugin timed out on ready wait. Continuing anyway. SRIOV pods may not recover."
+                    kubectl delete pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --wait=false
+                    ret_code=$?
+                    if [ "${ret_code}" -ne 0 ]; then
+                        ERROR "In attempt=${attempt}, SRIOV device plugin failed to delete in ${HOST} with ret_code=${ret_code}, SRIOV device plugin pod status:'$(${SRIOVDP_STATUS})'."
+                    fi
+                    kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=360s
+                    ret_code=$?
+                    if [ "${ret_code}" -ne 0 ]; then
+                        ERROR "In attempt=${attempt}, SRIOV device plugin timed out on ready wait with ret_code=${ret_code}. SRIOV device plugin pod status:'$(${SRIOVDP_STATUS})'."
+                        continue
+                    fi
+                fi
+                break
+            done
+            kubectl wait pods -n kube-system --selector=app=sriovdp --field-selector=spec.nodeName=${HOST} --for=condition=Ready --timeout=0s
+            ret_code=$?
+            if [ "${ret_code}" -ne 0 ]; then
+                ERROR "Continuing anyway with ret_code=${ret_code}. SRIOV pods may not recover. SRIOV device plugin pod status:'$(${SRIOVDP_STATUS})'."
+            fi
         fi
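
Note how the new code captures ret_code=$? immediately after each
kubectl call: any intervening command, even a log line, would
overwrite $?. A small demonstration of the pitfall this avoids:

    false                   # a failing command
    ret_code=$?             # capture immediately: ret_code is 1
    echo "captured ${ret_code}"

    false                   # fails again
    echo "some log line"    # this echo succeeds and resets $? to 0
    echo "clobbered: $?"    # prints 0; the failure status is lost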
@@ -227,7 +249,15 @@ function _labeled_pods {
             kubectl delete pods -n ${pod//// } --wait=false
         done
     elif [ "$1" == 'verify' ]; then
-        PODS=$(kubectl get pods --all-namespaces --no-headers --field-selector=spec.nodeName=${HOST} --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')
+        PODS=$(${RESTARTPODS_STATUS} 2>/dev/null | awk '{print $1"/"$2}')
+        if [ ! -z "${PODS}" ]; then
+            STATUS=$(${SRIOVDP_STATUS} 2>/dev/null | awk '{print $4}')
+            if [[ "${STATUS}" != "Running" ]]; then
+                ERROR "SRIOV device plugin: not recovered: '$(${SRIOVDP_STATUS})'."
+            else
+                LOG "SRIOV device plugin: recovered."
+            fi
+        fi
         for pod in $PODS; do
             LOG "restart-on-reboot labeled pods: Verifying: ${pod//// }"
             STATUS=$(kubectl get pod --no-headers -n ${pod//// } 2>/dev/null | awk '{print $3}')
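
In the verify branch, the awk column index depends on the query: with
--all-namespaces the columns are NAMESPACE NAME READY STATUS ..., so
${SRIOVDP_STATUS} reads STATUS with $4, while the per-namespace query
above reads it with $3. Where column drift is a concern, the phase
can be asked for directly; a sketch, not part of this change:

    # Query the pod phase directly instead of parsing printed columns.
    kubectl get pods -n kube-system --selector=app=sriovdp \
        -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.phase}{"\n"}{end}'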