From 77613963036cc0e9fc98a69f234b1bbb86690076 Mon Sep 17 00:00:00 2001
From: Felix Edel <felix.edel@bmw.de>
Date: Mon, 6 Nov 2023 08:30:54 +0100
Subject: [PATCH] mirror-workspace-git-repos: Retry on failure in git update
 task

We occasionally see this task fail for the first element in the
zuul.projects list with a MODULE FAILURE and a return code of -13
(SIGPIPE) [1]. So far we couldn't identify the root cause, so try to
mitigate this issue by retrying on failure. This solution is similar to
the one used for the "Synchronize repos" task[2].

There is a bug report in Ansible that fits this symptom [5].

Since it's only the first element in the loop that is failing while
subsequent elements are successful, we currently have three assumptions:

  1. As the task before is using a `delegate_to: localhost` [3],
     there might be a problem with Ansible when switching the connection
     from localhost to the remote host (node).
  2. Since the task before is using the same SSH connection [4] that is
     used by Ansible to push the git repository, there might be some
     "leftovers" on the connection that make the next task fail.
  3. There is also a bug report in Ansible [5] which might be causing
     that error.

[1]:
    {
        "ansible_loop_var": "zj_project",
        "changed": false,
        "failed": true,
        "module_stderr": "",
        "module_stdout": "",
        "msg": "MODULE FAILURE\nSee stdout/stderr for the exact error",
        "rc": -13,
        "zj_project": {...}
    }

[2]: https://opendev.org/zuul/zuul-jobs/src/commit/3b3495e2557e55b70d47ab799bf71e58f095a918/roles/mirror-workspace-git-repos/tasks/main.yaml#L32
[3]: https://opendev.org/zuul/zuul-jobs/src/commit/3b3495e2557e55b70d47ab799bf71e58f095a918/roles/mirror-workspace-git-repos/tasks/main.yaml#L25
[4]: https://opendev.org/zuul/zuul-jobs/src/commit/3b3495e2557e55b70d47ab799bf71e58f095a918/roles/mirror-workspace-git-repos/tasks/main.yaml#L16
[5]: https://github.com/ansible/ansible/issues/81777

Change-Id: I0c4cb87bb076b9b40c9c446dbe5db437daff5897
---
 roles/mirror-workspace-git-repos/tasks/main.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/roles/mirror-workspace-git-repos/tasks/main.yaml b/roles/mirror-workspace-git-repos/tasks/main.yaml
index 54398b22c..cc40015d6 100644
--- a/roles/mirror-workspace-git-repos/tasks/main.yaml
+++ b/roles/mirror-workspace-git-repos/tasks/main.yaml
@@ -56,6 +56,14 @@
   with_dict: "{{ zuul.projects }}"
   loop_control:
     loop_var: zj_project
+  # We occasionally see this task fail for the first element in the
+  # zuul.projects list with a MODULE FAILURE and a return code of -13
+  # (SIGPIPE).  This may be caused by
+  # https://github.com/ansible/ansible/issues/81777
+  # Try to mitigate this issue by retrying on failure.
+  register: git_update
+  until: git_update is success
+  retries: 3
   # ANSIBLE0006: Skip linting since it triggers on the "git" command,
   # but we prefer the shell above
   tags: