Improve the CI check for pod liveness to fail on pods that are repeatedly restarting

Change-Id: Ic7d638c090c108efc70a8a9d5f417fbf0ca84795
2024-02-01 11:05:01 +01:00 · 2024-02-01 11:05:01 +01:00 · 49ebb72960
commit 49ebb72960
parent 5464c4c413
3 changed files with 31 additions and 0 deletions
--- a/playbooks/helm/run.yaml
+++ b/playbooks/helm/run.yaml
@ -0,0 +1,15 @@
+# Processes every entry of helm_charts with the helm-template role, then runs
+# the check-pod-restarts role once all charts have been handled.
+- hosts: all
+  tasks:
+    # helm_charts is a mapping: each key is passed on as helm_release_name and
+    # the matching value as helm_chart.
+    - name: Install helm charts
+      loop: "{{ helm_charts | dict2items }}"
+      loop_control:
+        loop_var: zj_item
+      include_role:
+        name: helm-template
+      vars:
+        helm_chart: "{{ zj_item.value }}"
+        helm_release_name: "{{ zj_item.key }}"
+
+    # Runs a single time, after the loop above has finished.
+    - name: Check pod restarts after helm chart installations
+      include_role:
+        name: check-pod-restarts
--- a/roles/check-pod-restarts/tasks/main.yaml
+++ b/roles/check-pod-restarts/tasks/main.yaml
@ -0,0 +1,15 @@
+# Fails the play when any pod returned by `kubectl get pods` has containers
+# whose restart counts add up to the allowed threshold or more.
+#
+# Optional variable:
+#   check_pod_restarts_threshold: total restarts at which a pod is reported
+#   as unstable (defaults to 3).
+- name: Get pods and their restart counts
+  # Prints one "<pod name>:<restart counts>" line per pod. A pod with several
+  # containers yields several space-separated counts on its line.
+  shell: >
+    kubectl get pods -o=jsonpath="{range .items[*]}{.metadata.name}:{.status.containerStatuses[*].restartCount}{'\n'}{end}"
+  register: pods_restart_counts
+  # Read-only query: never report this task as a change.
+  changed_when: false
+
+- name: Check for pods that have restarted more than the allowed threshold
+  set_fact:
+    unstable_pods: "{{ unstable_pods | default([]) + [item.split(':')[0]] }}"
+  loop: "{{ pods_restart_counts.stdout_lines }}"
+  # Sum the per-container counts. Casting a multi-container value such as
+  # "0 5" straight to int yields 0, which would hide the restarts. A pod with
+  # no reported counts sums to 0.
+  when: >-
+    (item.split(':')[1].split() | map('int') | sum)
+    >= (check_pod_restarts_threshold | default(3) | int)
+
+- name: Report if any pod has restarted too many times
+  fail:
+    msg: "There were some unstable pods: {{ unstable_pods }}"
+  when: unstable_pods | default([]) | length > 0
--- a/zuul.d/jobs.yaml
+++ b/zuul.d/jobs.yaml
@ -22,6 +22,7 @@
    roles:
      - zuul: zuul/zuul-jobs
    pre-run: playbooks/helm/pre.yaml
+    run: playbooks/helm/run.yaml
    post-run: playbooks/kubernetes/post.yaml
    vars:
      helm_version: *helm_version