Merge "Remove Kubernetes checks during optimized restore"
@@ -161,106 +161,6 @@
- name: Restore helm service
  import_tasks: restore-helm.yml

- name: Set Kubernetes components list
  set_fact:
    kube_component_list:
      - k8s-app=calico-node
      - k8s-app=kube-proxy
      - app=multus
      - app=sriov-cni
      - component=kube-apiserver
      - component=kube-controller-manager
      - component=kube-scheduler

- name: Update Kubernetes components list
  set_fact:
    # We skip the calico-node pods on AIO-DX and STANDARD setups because the
    # pods running on hosts other than controller-0 are unreachable at this
    # point, and the calico-node pods would keep trying to connect to them
    # and fail indefinitely.
    kube_component_list: >-
      {{ kube_component_list | reject('search', 'calico-node') | list }}
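# Note: given the list defined above, the reject filter leaves
# kube_component_list as k8s-app=kube-proxy, app=multus, app=sriov-cni,
# component=kube-apiserver, component=kube-controller-manager and
# component=kube-scheduler.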

- name: Get coredns deployment desired replicas
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf get deployment
    -n kube-system coredns -o jsonpath={.spec.replicas}
  register: coredns_get_replicas

# We scale these deployments down and back up because in setups with more
# than 3 nodes, the cluster could be in the PartialDisruption state and
# the pods may not be rescheduled off of a down node. This ensures that
# the pods will be on controller-0 and will become available.
- name: Scale calico-kube-controllers & coredns deployments to 0
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf scale deployment
    -n {{ item.namespace }} {{ item.deployment }} --replicas=0
  with_items:
    - { namespace: kube-system, deployment: calico-kube-controllers }
    - { namespace: kube-system, deployment: coredns }
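# For illustration, rendered with the coredns item the command above is
# equivalent to:
#   kubectl --kubeconfig=/etc/kubernetes/admin.conf scale deployment \
#     -n kube-system coredns --replicas=0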

- name: Scale calico-kube-controllers deployment back to 1
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf scale deployment
    -n {{ item.namespace }} {{ item.deployment }} --replicas=1
  with_items:
    - { namespace: kube-system, deployment: calico-kube-controllers }

- name: Scale coredns deployment back to original size
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf scale deployment
    -n kube-system coredns --replicas={{ coredns_get_replicas.stdout }}

- name: Override async parameters
  set_fact:
    async_timeout: 120
    async_retries: 40

- name: Start parallel tasks to wait for Kubernetes component and Networking pods to reach ready state
  # Only check for pods on the current host to avoid waiting for pods on downed nodes.
  # This speeds up "Get wait tasks results" on multi-node systems.
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf wait --namespace=kube-system
    --for=condition=Ready pods --selector {{ item }} --field-selector spec.nodeName=controller-0
    --timeout={{ async_timeout }}s
  async: "{{ async_timeout }}"
  poll: 0
  with_items: "{{ kube_component_list }}"
  register: wait_for_kube_system_pods

- name: Start wait for calico-kube-controllers & coredns deployments to reach Available state
  # Check the deployment status rather than the pod status in case some pods are down on other nodes.
  command: >-
    kubectl --kubeconfig=/etc/kubernetes/admin.conf wait --namespace={{ item.namespace }}
    --for=condition=Available deployment {{ item.deployment }} --timeout={{ async_timeout }}s
  async: "{{ async_timeout }}"
  poll: 0
  with_items:
    - { namespace: kube-system, deployment: calico-kube-controllers }
    - { namespace: kube-system, deployment: coredns }
  register: wait_for_deployments

- name: Get wait tasks results
  async_status:
    jid: "{{ item.ansible_job_id }}"
  register: wait_job_result
  until: wait_job_result.finished
  # The total retry window should be twice the async_timeout,
  # i.e. async_retries = async_timeout * 2 / delay.
  retries: "{{ async_retries }}"
  delay: "{{ async_timeout * 2 // async_retries }}"
  failed_when: false
  with_items:
    - "{{ wait_for_kube_system_pods.results }}"
    - "{{ wait_for_deployments.results }}"

- name: Fail if any of the Kubernetes component or Networking pods are not ready by this time
  fail:
    msg: "Pod {{ item.item.item }} is still not ready."
  when: item.stdout is not search(" condition met")
  with_items: "{{ wait_job_result.results }}"

# We need to restart the sysinv conductor here because it seems to be caching
# old data, which prevents it from interacting with a fresh Kubernetes cluster.
# For example, if the user changes the OAM network and forces the Kubernetes
# cluster to be torn down