Enable pod restart based on a label
This commit adds a mechanism to the pod recovery service to restart pods based on the restart-on-reboot label.

This is a mitigation for an issue seen on an AIO system using SR-IOV interfaces on an N3000 FPGA device. Since the kubernetes services start coming up after the controller manifest has completed, a race can happen with the configuration of devices and the SR-IOV device plugin in the worker manifest. The symptom of this would be the SR-IOV device in the running pod disappearing as the FPGA device is reset.

Notes:
- The pod recovery service only runs on controller nodes.
- The raciness between the kubernetes bring-up and worker configuration should be fixed in the future by a re-organization of the manifests to either have a separate AIO or kubernetes manifest. This would require extensive feature work. In the meantime, this mitigation will allow pods which experience this issue to recover.

Change-Id: If84b66b3a632752bd08293105bb780ea8c7cf400
Closes-Bug: #1896631
Signed-off-by: Steven Webster <steven.webster@windriver.com>
This commit is contained in:
parent
17c1b8894d
commit
7756299303
@ -145,6 +145,32 @@ function _node_affinity_pods {
|
||||
|
||||
}
|
||||
|
||||
function _labeled_pods {
    # $1: actions <recover|verify>
    #
    # Act on every pod scheduled on ${HOST} that carries the
    # restart-on-reboot=true label:
    #   recover - delete the pod without waiting for the deletion to finish
    #   verify  - LOG the pod as recovered when its STATUS column reads
    #             "Running", otherwise report it through ERROR
    # Any other action is reported through ERROR and nothing is queried.
    #
    # Reads the HOST global and relies on the LOG / ERROR helpers.

    local action=${1:-}
    local pods pod namespace name status

    # Reject a bad action up front so no cluster query is wasted on it
    if [[ "${action}" != 'recover' && "${action}" != 'verify' ]]; then
        ERROR "Unknown action: ${action}"
        return
    fi

    # Collect the labeled pods on this host as "<namespace>/<name>" entries.
    # kubectl errors are deliberately silenced: an unreachable API simply
    # yields an empty list and the loop below does nothing.
    pods=$(kubectl get pods --all-namespaces --no-headers \
        --field-selector="spec.nodeName=${HOST}" \
        --selector=restart-on-reboot=true 2>/dev/null | awk '{print $1"/"$2}')

    # Word-splitting of ${pods} is intentional: one entry per pod, and
    # neither a namespace nor a pod name can contain whitespace.
    for pod in ${pods}; do
        namespace=${pod%%/*}
        name=${pod#*/}

        if [[ "${action}" == 'recover' ]]; then
            LOG "restart-on-reboot labeled pods: Recovering: ${namespace} ${name}"
            kubectl delete pods -n "${namespace}" "${name}" --wait=false
        else
            LOG "restart-on-reboot labeled pods: Verifying: ${namespace} ${name}"
            # Third column of the header-less listing is the pod STATUS
            status=$(kubectl get pod --no-headers -n "${namespace}" "${name}" 2>/dev/null | awk '{print $3}')
            if [[ "${status}" != "Running" ]]; then
                ERROR "$pod: not recovered: $status"
            else
                LOG "$pod: recovered"
            fi
        fi
    done
}
|
||||
|
||||
function _force_reset_pods {
|
||||
# $1: actions <recover|verify>
|
||||
|
||||
@ -188,6 +214,9 @@ function _force_reset_pods {
|
||||
function _examine_pods {
|
||||
# $1: actions <recover|verify>
|
||||
|
||||
# No need to wait for pod transitions if we know the pod needs to be restarted
|
||||
_labeled_pods $1
|
||||
|
||||
# Wait for pods transitions to stop
|
||||
_wait_for_pod_stabilization
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user