Merge "Remove Kubernetes checks during optimized restore"

This commit is contained in:
Zuul
2023-08-31 02:45:31 +00:00
committed by Gerrit Code Review


@@ -161,106 +161,6 @@
- name: Restore helm service
  import_tasks: restore-helm.yml

- name: Set Kubernetes components list
  set_fact:
    kube_component_list:
      - k8s-app=calico-node
      - k8s-app=kube-proxy
      - app=multus
      - app=sriov-cni
      - component=kube-apiserver
      - component=kube-controller-manager
      - component=kube-scheduler

- name: Update Kubernetes components list
  set_fact:
    # We skip the calico-node pods on AIO-DX and STANDARD setups because the
    # pods running on hosts other than controller-0 are unreachable at this
    # point, and the calico-node pods would keep trying to connect to them
    # and fail indefinitely.
    kube_component_list: >-
      {{ kube_component_list | reject('search', 'calico-node') | list }}
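
# Illustrative note (not from the original playbook): with the values set
# above, the reject filter leaves kube_component_list as
#   [k8s-app=kube-proxy, app=multus, app=sriov-cni, component=kube-apiserver,
#    component=kube-controller-manager, component=kube-scheduler]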

- name: Get coredns deployment desired replicas
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf get deployment
    -n kube-system coredns -o jsonpath={.spec.replicas}
  register: coredns_get_replicas

# We scale these deployments down and back up because, in setups with more
# than 3 nodes, the cluster could be in the PartialDisruption state and the
# pods may not be rescheduled off of a down node. This ensures that the pods
# will be on controller-0 and will become available.
- name: Scale calico-kube-controllers & coredns deployments to 0
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf scale deployment
    -n {{ item.namespace }} {{ item.deployment }} --replicas=0
  with_items:
    - { namespace: kube-system, deployment: calico-kube-controllers }
    - { namespace: kube-system, deployment: coredns }
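
# Illustrative note (not from the original playbook): the first loop
# iteration above expands to
#   kubectl --kubeconfig=/etc/kubernetes/admin.conf scale deployment
#     -n kube-system calico-kube-controllers --replicas=0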

- name: Scale calico-kube-controllers deployment back to 1
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf scale deployment
    -n {{ item.namespace }} {{ item.deployment }} --replicas=1
  with_items:
    - { namespace: kube-system, deployment: calico-kube-controllers }

- name: Scale coredns deployment back to original size
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf scale deployment
    -n kube-system coredns --replicas={{ coredns_get_replicas.stdout }}

- name: Override async parameters
  set_fact:
    async_timeout: 120
    async_retries: 40
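
# Note (added for clarity, not from the original playbook): the two wait
# tasks below run with async and poll: 0, so each kubectl wait is launched
# in the background and Ansible continues immediately. Their results are
# collected later by the async_status loop in "Get wait tasks results",
# which lets all readiness checks run in parallel rather than one at a time.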
- name: Start parallel tasks to wait for Kubernetes component and Networking pods to reach ready state
  # Only check for pods on the current host to avoid waiting for pods on downed nodes.
  # This speeds up "Get wait tasks results" on multi-node systems.
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf wait --namespace=kube-system
    --for=condition=Ready pods --selector {{ item }} --field-selector spec.nodeName=controller-0
    --timeout={{ async_timeout }}s
  async: "{{ async_timeout }}"
  poll: 0
  with_items: "{{ kube_component_list }}"
  register: wait_for_kube_system_pods

- name: Start wait for calico-kube-controllers & coredns deployments to reach Available state
  # Check the deployment status rather than the pod status in case some pods are down on other nodes.
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf wait --namespace={{ item.namespace }}
    --for=condition=Available deployment {{ item.deployment }} --timeout={{ async_timeout }}s
  async: "{{ async_timeout }}"
  poll: 0
  with_items:
    - { namespace: kube-system, deployment: calico-kube-controllers }
    - { namespace: kube-system, deployment: coredns }
  register: wait_for_deployments

- name: Get wait tasks results
  async_status:
    jid: "{{ item.ansible_job_id }}"
  register: wait_job_result
  until: wait_job_result.finished
  # The total retry window should be 2x the async_timeout,
  # i.e. async_retries = async_timeout * 2 / delay
  retries: "{{ async_retries }}"
  delay: "{{ async_timeout * 2 // async_retries }}"
  failed_when: false
  with_items:
    - "{{ wait_for_kube_system_pods.results }}"
    - "{{ wait_for_deployments.results }}"

- name: Fail if any of the Kubernetes component or Networking pods is not ready by this time
  fail:
    msg: "Pod {{ item.item.item }} is still not ready."
  when: item.stdout is not search(" condition met")
  with_items: "{{ wait_job_result.results }}"

# We need to restart the sysinv conductor here because it seems to be caching
# old data; this prevents it from interacting with a fresh kubernetes cluster.
# For example, if the user changes the OAM network and forces the kubernetes
# cluster to be torn down