From bcc60136a07482bb8a18a2989bd3afa0a44d49bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Piliszek?= Date: Sat, 7 Aug 2021 14:30:55 +0000 Subject: [PATCH] Add ability to retry image pulling Sometimes, the registries may intermittently fail to deliver the images. This is often seen in the CI, though it also happens with production deployments, even those with internal registries and/or registry mirrors - due to sheer load when trying to pull the images from many hosts. This patch adds two new vars to control retry behaviour. The default has been set to make users happier by default. :-) Change-Id: I81ad7d8642654f8474f11084c6934aab40243d35 (cherry picked from commit cbb567cb868b3681b82d325cfb712ef4601c91a9) --- .../service-images-pull/defaults/main.yml | 7 ++++++ .../roles/service-images-pull/tasks/main.yml | 4 ++++ ansible/roles/swift/defaults/main.yml | 6 +++++ ansible/roles/swift/tasks/pull.yml | 24 +++++++++++++++++++ .../image-pull-retries-75490c3e6e1e4b54.yaml | 9 +++++++ 5 files changed, 50 insertions(+) create mode 100644 ansible/roles/service-images-pull/defaults/main.yml create mode 100644 releasenotes/notes/image-pull-retries-75490c3e6e1e4b54.yaml diff --git a/ansible/roles/service-images-pull/defaults/main.yml b/ansible/roles/service-images-pull/defaults/main.yml new file mode 100644 index 0000000000..57e0e696ed --- /dev/null +++ b/ansible/roles/service-images-pull/defaults/main.yml @@ -0,0 +1,7 @@ +--- +# Kolla image pulling settings: the amount of retries and the delay (in seconds) +# between them. These are useful if your registry is not 100% reliable (usually +# due to load). They modify the Ansible image pulling task params ``retries`` +# and ``delay``, respectively. 
+service_images_pull_retries: 3 +service_images_pull_delay: 5 diff --git a/ansible/roles/service-images-pull/tasks/main.yml b/ansible/roles/service-images-pull/tasks/main.yml index 240ea57cbe..cb526bfb31 100644 --- a/ansible/roles/service-images-pull/tasks/main.yml +++ b/ansible/roles/service-images-pull/tasks/main.yml @@ -7,6 +7,10 @@ action: "pull_image" common_options: "{{ docker_common_options }}" image: "{{ service.image }}" + retries: "{{ service_images_pull_retries }}" + delay: "{{ service_images_pull_delay }}" + register: result + until: result is success with_dict: "{{ lookup('vars', (kolla_role_name | default(project_name)) + '_services') | select_services_enabled_and_mapped_to_host }}" loop_control: label: "{{ item.key }}" diff --git a/ansible/roles/swift/defaults/main.yml b/ansible/roles/swift/defaults/main.yml index 82b70bb576..9e08fab3b8 100644 --- a/ansible/roles/swift/defaults/main.yml +++ b/ansible/roles/swift/defaults/main.yml @@ -93,3 +93,9 @@ swift_ks_users: user: "{{ swift_keystone_user }}" password: "{{ swift_keystone_password }}" role: "admin" + + +# FIXME(yoctozepto): These are copied from service-images-pull role. +# Remove when the Swift role is finally migrated to new style. 
+service_images_pull_retries: 3 +service_images_pull_delay: 5 diff --git a/ansible/roles/swift/tasks/pull.yml b/ansible/roles/swift/tasks/pull.yml index 61946da2f3..622c622764 100644 --- a/ansible/roles/swift/tasks/pull.yml +++ b/ansible/roles/swift/tasks/pull.yml @@ -5,6 +5,10 @@ action: "pull_image" common_options: "{{ docker_common_options }}" image: "{{ swift_rsyncd_image_full }}" + retries: "{{ service_images_pull_retries }}" + delay: "{{ service_images_pull_delay }}" + register: result + until: result is success when: inventory_hostname in groups['swift-account-server'] or inventory_hostname in groups['swift-container-server'] or inventory_hostname in groups['swift-object-server'] @@ -15,6 +19,10 @@ action: "pull_image" common_options: "{{ docker_common_options }}" image: "{{ swift_proxy_server_image_full }}" + retries: "{{ service_images_pull_retries }}" + delay: "{{ service_images_pull_delay }}" + register: result + until: result is success when: inventory_hostname in groups['swift-proxy-server'] - name: Pulling swift-account image @@ -23,6 +31,10 @@ action: "pull_image" common_options: "{{ docker_common_options }}" image: "{{ swift_account_image_full }}" + retries: "{{ service_images_pull_retries }}" + delay: "{{ service_images_pull_delay }}" + register: result + until: result is success when: inventory_hostname in groups['swift-account-server'] - name: Pulling swift-container image @@ -31,6 +43,10 @@ action: "pull_image" common_options: "{{ docker_common_options }}" image: "{{ swift_container_image_full }}" + retries: "{{ service_images_pull_retries }}" + delay: "{{ service_images_pull_delay }}" + register: result + until: result is success when: inventory_hostname in groups['swift-container-server'] - name: Pulling swift-object image @@ -39,6 +55,10 @@ action: "pull_image" common_options: "{{ docker_common_options }}" image: "{{ swift_object_image_full }}" + retries: "{{ service_images_pull_retries }}" + delay: "{{ service_images_pull_delay }}" + 
register: result + until: result is success when: inventory_hostname in groups['swift-object-server'] - name: Pulling swift-object-expirer image @@ -47,4 +67,8 @@ action: "pull_image" common_options: "{{ docker_common_options }}" image: "{{ swift_object_expirer_image_full }}" + retries: "{{ service_images_pull_retries }}" + delay: "{{ service_images_pull_delay }}" + register: result + until: result is success when: inventory_hostname in groups['swift-object-server'] diff --git a/releasenotes/notes/image-pull-retries-75490c3e6e1e4b54.yaml b/releasenotes/notes/image-pull-retries-75490c3e6e1e4b54.yaml new file mode 100644 index 0000000000..ea6039c26a --- /dev/null +++ b/releasenotes/notes/image-pull-retries-75490c3e6e1e4b54.yaml @@ -0,0 +1,9 @@ +--- +features: + - | + Adds two new variables ``service_images_pull_retries`` and + ``service_images_pull_delay`` which control the behaviour of image + pulling tasks. These are useful if your registry is not 100% + reliable (usually due to load). The defaults have been set to + 3 retries and 5 seconds delay to ensure a better default experience + (these are actually Ansible defaults when task retries are enabled).