From df00ba22e7e9b7bbbd299e669d5219862f57cb08 Mon Sep 17 00:00:00 2001
From: Mark Goddard <mark@stackhpc.com>
Date: Fri, 26 Mar 2021 17:24:05 +0000
Subject: [PATCH] CI: increase Ansible Galaxy retries & add delay

We still see flakiness when downloading content from Ansible Galaxy,
often HTTP 520. This change increases the number of attempts from 3 to
10, and adds a 5-second delay between attempts.

Change-Id: I0c46e5fcc6979027dc6f1bc5cc49e923a205f654
Related: https://github.com/ansible/galaxy/issues/2429
---
 dev/functions                   | 14 ++++++++++----
 tools/ansible-galaxy-retried.sh |  6 ++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/dev/functions b/dev/functions
index 019c54fd..00377528 100644
--- a/dev/functions
+++ b/dev/functions
@@ -225,13 +225,16 @@ function run_kayobe {
 }
 
 function control_host_bootstrap {
+    attempts=10
+    interval=5
     echo "Bootstrapping the Ansible control host"
-    for i in $(seq 1 3); do
+    for i in $(seq 1 $attempts); do
         if run_kayobe control host bootstrap; then
             chb_success=1
             break
         fi
-        echo "Control host bootstrap failed - likely Ansible Galaxy flakiness. Retrying"
+        echo "Control host bootstrap failed - likely Ansible Galaxy flakiness. Sleeping $interval seconds before retrying"
+        sleep $interval
     done
     if [[ -z ${chb_success+x} ]]; then
         die $LINENO "Failed to bootstrap control host"
@@ -241,13 +244,16 @@ function control_host_bootstrap {
 }
 
 function control_host_upgrade {
+    attempts=10
+    interval=5
     echo "Upgrading the Ansible control host"
-    for i in $(seq 1 3); do
+    for i in $(seq 1 $attempts); do
         if run_kayobe control host upgrade; then
             chu_success=1
             break
         fi
-        echo "Control host upgrade failed - likely Ansible Galaxy flakiness. Retrying"
+        echo "Control host upgrade failed - likely Ansible Galaxy flakiness. Sleeping $interval seconds before retrying"
+        sleep $interval
     done
     if [[ -z ${chu_success+x} ]]; then
         die $LINENO "Failed to upgrade control host"
diff --git a/tools/ansible-galaxy-retried.sh b/tools/ansible-galaxy-retried.sh
index 4082f98e..846c132f 100755
--- a/tools/ansible-galaxy-retried.sh
+++ b/tools/ansible-galaxy-retried.sh
@@ -2,13 +2,15 @@
 
 set -e
 
-GALAXY_RETRIES=${GALAXY_RETRIES:-3}
+GALAXY_RETRIES=${GALAXY_RETRIES:-10}
+GALAXY_INTERVAL=${GALAXY_INTERVAL:-5}
 
 for i in $(seq 1 $GALAXY_RETRIES); do
     if ansible-galaxy "${@}"; then
         exit 0
     fi
-    echo "Ansible Galaxy command failed. Retrying"
+    echo "Ansible Galaxy command failed. Sleeping $GALAXY_INTERVAL seconds before retry"
+    sleep $GALAXY_INTERVAL
 done
 
 echo "Failed to execute: ansible-galaxy ${@}"
-- 
GitLab