From c2e5db4305bca4f39a3391afd136b46216cb7d3f Mon Sep 17 00:00:00 2001
From: Thiago Brito
Date: Tue, 9 Aug 2022 18:34:43 -0300
Subject: [PATCH] Deleting ic-nginx-ingress-controller at restore

Once k8s comes up after the etcd restore, there is a span of time
(around 20s) that the pod states have not been updated and are reported
as they were at the point in time where the backup was taken. This
returns that the ic-nginx-ingress-ingress-nginx-controller-XXX pod is
"Ready", but it is not... in several instances during my tests, the pod
was restarted 3-10 seconds after the task "Launch Armada with Helm v3"
failed due to not being able to call the webhook.

The proposed solution is to delete the pod preemptively and wait for it
to be recreated and "Ready".

TEST PLAN
PASS restore on virtual AIO-SX (CentOS)

Closes-Bug: #1978899
Signed-off-by: Thiago Brito
Change-Id: I20bec1fbbf809bfcf5d515ef55c6d47ab968dbf3
---
 .../src/playbooks/roles/common/armada-helm/tasks/main.yml | 7 +++++++
 1 file changed, 7 insertions(+)

# Reviewer notes (free-form text in this section is not part of the diff):
# - The added task runs `kubectl delete pod -n kube-system -l <selector>`.
#   <selector> is built from the `.spec.selector` of the service named by
#   nginx_webhook_service.stdout: braces and double quotes are removed
#   with `tr -d`, and every ":" is turned into "=".
# - The task only runs when mode == 'restore' and both armada_check.rc
#   and nginx_webhook_service.rc are 0.
# - NOTE(review): the tr pipeline presumably expects kubectl to print the
#   selector as a compact JSON object ({"k":"v",...}) - confirm for the
#   kubectl version in use.
# - NOTE(review): the leading whitespace of the hunk lines below was
#   reconstructed; verify it against the target main.yml before applying.

diff --git a/playbookconfig/src/playbooks/roles/common/armada-helm/tasks/main.yml b/playbookconfig/src/playbooks/roles/common/armada-helm/tasks/main.yml
index d942c0c7a..aae6ab60f 100644
--- a/playbookconfig/src/playbooks/roles/common/armada-helm/tasks/main.yml
+++ b/playbookconfig/src/playbooks/roles/common/armada-helm/tasks/main.yml
@@ -162,6 +162,13 @@
       register: nginx_webhook_service
       ignore_errors: true
 
+    - name: If on system restore mode, kill ingress validating webhook pod so it can be recreated
+      shell: >-
+        kubectl delete pod -n kube-system
+        -l $(kubectl get service -n kube-system {{ nginx_webhook_service.stdout }}
+        -o jsonpath="{.spec.selector}" | tr -d "{}\"" | tr ":" "=")
+      when: mode == 'restore' and armada_check.rc == 0 and nginx_webhook_service.rc == 0
+
     - name: Check ingress validating webhook service and pod status
       shell: >-
         kubectl wait pod -n kube-system