Enable pod restart based on a label

This commit adds a mechanism to the pod recovery service to restart
pods based on the restart-on-reboot label.

This is a mitigation for an issue seen on an AIO system using SR-IOV
interfaces on an N3000 FPGA device.  Since the kubernetes services
start coming up after the controller manifest has completed, a race
can happen with the configuration of devices and the SR-IOV device
plugin in the worker manifest.  The symptom of this would be the
SR-IOV device in the running pod disappearing as the FPGA device is
reset.

Notes:

- The pod recovery service only runs on controller nodes.
- The race between the kubernetes bring-up and worker configuration
  should be fixed in the future by re-organizing the manifests to have
  either a separate AIO manifest or a separate kubernetes manifest.
  This would require extensive feature work.  In the meantime, this
  mitigation will allow pods which experience this issue to recover.

Change-Id: If84b66b3a632752bd08293105bb780ea8c7cf400
Closes-Bug: #1896631
Signed-off-by: Steven Webster <steven.webster@windriver.com>
This commit is contained in:
Steven Webster 2020-09-22 12:25:32 -04:00
parent 17c1b8894d
commit 7756299303
1 changed files with 29 additions and 0 deletions

View File

@ -145,6 +145,32 @@ function _node_affinity_pods {
}
function _labeled_pods {
    # Restart, or check the restart of, the pods scheduled on ${HOST}
    # that carry the restart-on-reboot=true label.
    #
    # $1: actions <recover|verify>
    #   recover: delete each labeled pod without waiting for the deletion
    #            to complete
    #   verify:  log, per labeled pod, whether kubectl reports it Running
    #
    # Reads HOST and calls LOG / ERROR, all defined outside this function.

    # Keep all working state local so nothing leaks into (or clobbers)
    # variables of the calling script.
    local action=$1
    local pods pod namespace name status

    # Reject an unknown action before spending a query on the API server.
    case "${action}" in
        recover|verify)
            ;;
        *)
            ERROR "Unknown action: $1"
            return
            ;;
    esac

    # One "namespace/name" entry per labeled pod on this host.  kubectl's
    # stderr is discarded, so a failed query simply yields an empty list.
    pods=$(kubectl get pods --all-namespaces --no-headers \
        --field-selector="spec.nodeName=${HOST}" \
        --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')

    # Word-splitting of ${pods} is intentional: the entries are
    # whitespace-separated and awk emits none containing whitespace.
    for pod in ${pods}; do
        # Split "namespace/name" explicitly instead of relying on the
        # shell to word-split an unquoted expansion.
        namespace=${pod%%/*}
        name=${pod#*/}

        if [[ "${action}" == 'recover' ]]; then
            LOG "restart-on-reboot labeled pods: Recovering: ${namespace} ${name}"
            kubectl delete pods -n "${namespace}" "${name}" --wait=false
        else
            LOG "restart-on-reboot labeled pods: Verifying: ${namespace} ${name}"
            # Third column of the single-pod listing is taken as the status.
            status=$(kubectl get pod --no-headers -n "${namespace}" "${name}" 2>/dev/null | awk '{print $3}')
            if [[ "${status}" != "Running" ]]; then
                ERROR "$pod: not recovered: $status"
            else
                LOG "$pod: recovered"
            fi
        fi
    done
}
function _force_reset_pods {
# $1: actions <recover|verify>
@ -188,6 +214,9 @@ function _force_reset_pods {
function _examine_pods {
# $1: actions <recover|verify>
# No need to wait for pod transitions if we know the pod needs to be restarted
_labeled_pods $1
# Wait for pods transitions to stop
_wait_for_pod_stabilization