From cbb567cb868b3681b82d325cfb712ef4601c91a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Piliszek?= <radoslaw.piliszek@gmail.com>
Date: Sat, 7 Aug 2021 14:30:55 +0000
Subject: [PATCH] Add ability to retry image pulling

Sometimes, the registries may intermittently fail to deliver the
images. This is often seen in the CI, though it also happens with
production deployments, even those with internal registries and/or
registry mirrors - due to sheer load when trying to pull the
images from many hosts.

This patch adds two new vars to control retry behaviour.
The defaults have been set to make users happier out of the box. :-)

Change-Id: I81ad7d8642654f8474f11084c6934aab40243d35
---
 .../service-images-pull/defaults/main.yml     |  7 ++++++
 .../roles/service-images-pull/tasks/main.yml  |  4 ++++
 ansible/roles/swift/defaults/main.yml         |  6 +++++
 ansible/roles/swift/tasks/pull.yml            | 24 +++++++++++++++++++
 .../image-pull-retries-75490c3e6e1e4b54.yaml  |  9 +++++++
 5 files changed, 50 insertions(+)
 create mode 100644 ansible/roles/service-images-pull/defaults/main.yml
 create mode 100644 releasenotes/notes/image-pull-retries-75490c3e6e1e4b54.yaml

diff --git a/ansible/roles/service-images-pull/defaults/main.yml b/ansible/roles/service-images-pull/defaults/main.yml
new file mode 100644
index 000000000..57e0e696e
--- /dev/null
+++ b/ansible/roles/service-images-pull/defaults/main.yml
@@ -0,0 +1,7 @@
+---
+# Kolla image pulling settings: the number of retries and the delay (in seconds)
+# between them. These are useful if your registry is not 100% reliable (usually
+# due to load). They modify the Ansible image pulling task params ``retries``
+# and ``delay``, respectively.
+service_images_pull_retries: 3
+service_images_pull_delay: 5
diff --git a/ansible/roles/service-images-pull/tasks/main.yml b/ansible/roles/service-images-pull/tasks/main.yml
index 240ea57cb..cb526bfb3 100644
--- a/ansible/roles/service-images-pull/tasks/main.yml
+++ b/ansible/roles/service-images-pull/tasks/main.yml
@@ -7,6 +7,10 @@
     action: "pull_image"
     common_options: "{{ docker_common_options }}"
     image: "{{ service.image }}"
+  retries: "{{ service_images_pull_retries }}"
+  delay: "{{ service_images_pull_delay }}"
+  register: result
+  until: result is success
   with_dict: "{{ lookup('vars', (kolla_role_name | default(project_name)) + '_services') | select_services_enabled_and_mapped_to_host }}"
   loop_control:
     label: "{{ item.key }}"
diff --git a/ansible/roles/swift/defaults/main.yml b/ansible/roles/swift/defaults/main.yml
index 82b70bb57..9e08fab3b 100644
--- a/ansible/roles/swift/defaults/main.yml
+++ b/ansible/roles/swift/defaults/main.yml
@@ -93,3 +93,9 @@ swift_ks_users:
     user: "{{ swift_keystone_user }}"
     password: "{{ swift_keystone_password }}"
     role: "admin"
+
+
+# FIXME(yoctozepto): These are copied from service-images-pull role.
+# Remove when the Swift role is finally migrated to new style.
+service_images_pull_retries: 3
+service_images_pull_delay: 5
diff --git a/ansible/roles/swift/tasks/pull.yml b/ansible/roles/swift/tasks/pull.yml
index 61946da2f..622c62276 100644
--- a/ansible/roles/swift/tasks/pull.yml
+++ b/ansible/roles/swift/tasks/pull.yml
@@ -5,6 +5,10 @@
     action: "pull_image"
     common_options: "{{ docker_common_options }}"
     image: "{{ swift_rsyncd_image_full }}"
+  retries: "{{ service_images_pull_retries }}"
+  delay: "{{ service_images_pull_delay }}"
+  register: result
+  until: result is success
   when: inventory_hostname in groups['swift-account-server'] or
         inventory_hostname in groups['swift-container-server'] or
         inventory_hostname in groups['swift-object-server']
@@ -15,6 +19,10 @@
     action: "pull_image"
     common_options: "{{ docker_common_options }}"
     image: "{{ swift_proxy_server_image_full }}"
+  retries: "{{ service_images_pull_retries }}"
+  delay: "{{ service_images_pull_delay }}"
+  register: result
+  until: result is success
   when: inventory_hostname in groups['swift-proxy-server']
 
 - name: Pulling swift-account image
@@ -23,6 +31,10 @@
     action: "pull_image"
     common_options: "{{ docker_common_options }}"
     image: "{{ swift_account_image_full }}"
+  retries: "{{ service_images_pull_retries }}"
+  delay: "{{ service_images_pull_delay }}"
+  register: result
+  until: result is success
   when: inventory_hostname in groups['swift-account-server']
 
 - name: Pulling swift-container image
@@ -31,6 +43,10 @@
     action: "pull_image"
     common_options: "{{ docker_common_options }}"
     image: "{{ swift_container_image_full }}"
+  retries: "{{ service_images_pull_retries }}"
+  delay: "{{ service_images_pull_delay }}"
+  register: result
+  until: result is success
   when: inventory_hostname in groups['swift-container-server']
 
 - name: Pulling swift-object image
@@ -39,6 +55,10 @@
     action: "pull_image"
     common_options: "{{ docker_common_options }}"
     image: "{{ swift_object_image_full }}"
+  retries: "{{ service_images_pull_retries }}"
+  delay: "{{ service_images_pull_delay }}"
+  register: result
+  until: result is success
   when: inventory_hostname in groups['swift-object-server']
 
 - name: Pulling swift-object-expirer image
@@ -47,4 +67,8 @@
     action: "pull_image"
     common_options: "{{ docker_common_options }}"
     image: "{{ swift_object_expirer_image_full }}"
+  retries: "{{ service_images_pull_retries }}"
+  delay: "{{ service_images_pull_delay }}"
+  register: result
+  until: result is success
   when: inventory_hostname in groups['swift-object-server']
diff --git a/releasenotes/notes/image-pull-retries-75490c3e6e1e4b54.yaml b/releasenotes/notes/image-pull-retries-75490c3e6e1e4b54.yaml
new file mode 100644
index 000000000..ea6039c26
--- /dev/null
+++ b/releasenotes/notes/image-pull-retries-75490c3e6e1e4b54.yaml
@@ -0,0 +1,9 @@
+---
+features:
+  - |
+    Adds two new variables ``service_images_pull_retries`` and
+    ``service_images_pull_delay`` which control the behaviour of image
+    pulling tasks. These are useful if your registry is not 100%
+    reliable (usually due to load). The defaults have been set to
+    3 retries and 5 seconds delay to ensure a better default experience
+    (these are actually Ansible defaults when task retries are enabled).
-- 
GitLab