Add Zuul job variable to indicate a job will retry

This change adds a variable to post and cleanup playbooks so they can
determine whether the job will be retried due to a failure in one of the
earlier playbooks.

This variable is useful for performing certain actions (e.g. interacting
with a remote system) only when the job result is final and no further
attempts will follow.
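
For example, a post playbook could notify an external system only on the
final attempt. The following is an illustrative sketch (the URL and the
reporting task are not part of this change):

- hosts: localhost
  tasks:
    # Hypothetical example: only report the result when Zuul will not
    # retry this build, i.e. when the result is final.
    - name: Report final job result
      uri:
        url: "https://example.com/report"
        method: POST
        body_format: json
        body:
          job: "{{ zuul.job }}"
          success: "{{ zuul_success | bool }}"
      when: not (zuul_will_retry | bool)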

Change-Id: If7f4488d4a59b1544795401bdc243978fea9ca86
Simon Westphahl 2023-07-24 12:35:12 +02:00
parent 19ae1f2f8a
commit 7bba28a32f
6 changed files with 58 additions and 17 deletions

View File

@ -675,6 +675,18 @@ of item.
        - shell: echo example
          when: zuul_success | bool

.. var:: zuul_will_retry

   Post run and cleanup playbook(s) will be passed this variable to indicate
   if the job will be retried. This variable is meant to be used with the
   `bool` filter.

   .. code-block:: yaml

      tasks:
        - shell: echo example
          when: zuul_will_retry | bool

.. var:: nodepool

   Information about each host from Nodepool is supplied in the

View File

@ -0,0 +1,5 @@
---
features:
  - |
    A new Zuul job variable :var:`zuul_will_retry` is available in post and
    cleanup playbooks; it indicates whether the current job will be retried.

View File

@ -0,0 +1,6 @@
- hosts: localhost
  gather_facts: no
  tasks:
    - copy:
        content: "{{ zuul_will_retry }}"
        dest: "{{ zuul._test.test_root }}/builds/{{ zuul.build }}.will-retry.flag"

View File

@ -15,6 +15,7 @@
- job:
    name: base
    parent: null
    post-run: playbooks/post.yaml

- job:
    name: pre-unreachable

View File

@ -7333,6 +7333,17 @@ class TestUnreachable(AnsibleZuulTestCase):
            '.ansible/nodes.unreachable')
        self.assertEqual('fake\n', unreachable_log)

        retried_builds = set()
        for build in self.history:
            will_retry_flag = os.path.join(
                self.jobdir_root, f'{build.uuid}.will-retry.flag')
            self.assertTrue(os.path.exists(will_retry_flag))
            with open(will_retry_flag) as f:
                will_retry = f.readline()
            expect_retry = build.name not in retried_builds
            self.assertEqual(str(expect_retry), will_retry)
            retried_builds.add(build.name)


class TestJobPause(AnsibleZuulTestCase):
    tenant_config_file = 'config/job-pause/main.yaml'

View File

@ -1052,6 +1052,13 @@ class AnsibleJob(object):
        # The zuul.* vars
        self.debug_zuul_vars = {}
        self.waiting_for_semaphores = False

        try:
            max_attempts = self.arguments["zuul"]["max_attempts"]
        except KeyError:
            # TODO (swestphahl):
            # Remove backward compatibility handling
            max_attempts = self.arguments["max_attempts"]
        self.retry_limit = self.arguments["zuul"]["attempts"] >= max_attempts

    def run(self):
        self.running = True
@ -1503,9 +1510,8 @@ class AnsibleJob(object):
        self.executor_server.updateBuildStatus(self.build_request, data)

        result = self.runPlaybooks(args)
        success = result == 'SUCCESS'

-       self.runCleanupPlaybooks(success)
+       self.runCleanupPlaybooks(result)

        # Stop the persistent SSH connections.
        setup_status, setup_code = self.runAnsibleCleanup(
@ -1822,6 +1828,8 @@ class AnsibleJob(object):
        post_timeout = self.job.post_timeout
        post_unreachable = False
        for index, playbook in enumerate(self.jobdir.post_playbooks):
            will_retry = (
                (pre_failed or post_unreachable) and not self.retry_limit)
            # Post timeout operates a little differently to the main job
            # timeout. We give each post playbook the full post timeout to
            # do its job because post is where you'll often record job logs
@ -1829,7 +1837,7 @@ class AnsibleJob(object):
            # the first place.
            post_status, post_code = self.runAnsiblePlaybook(
                playbook, post_timeout, self.ansible_version, success,
-               phase='post', index=index)
+               phase='post', index=index, will_retry=will_retry)
            if post_status == self.RESULT_ABORTED:
                return 'ABORTED'
            if post_status == self.RESULT_UNREACHABLE:
@ -1857,7 +1865,7 @@ class AnsibleJob(object):
        return result

-   def runCleanupPlaybooks(self, success):
+   def runCleanupPlaybooks(self, result):
        if not self.jobdir.cleanup_playbooks:
            return
@ -1871,11 +1879,14 @@ class AnsibleJob(object):
                now=datetime.datetime.now()
            ))

        success = result == 'SUCCESS'
        will_retry = result is None and not self.retry_limit

        self.cleanup_started = True
        for index, playbook in enumerate(self.jobdir.cleanup_playbooks):
            self.runAnsiblePlaybook(
                playbook, CLEANUP_TIMEOUT, self.ansible_version,
-               success=success, phase='cleanup', index=index)
+               success=success, phase='cleanup', index=index,
+               will_retry=will_retry)

    def _logFinalPlaybookError(self):
        # Failures in the final post playbook can include failures
@ -3090,7 +3101,8 @@ class AnsibleJob(object):
                msg=msg))

    def runAnsiblePlaybook(self, playbook, timeout, ansible_version,
-                          success=None, phase=None, index=None):
+                          success=None, phase=None, index=None,
+                          will_retry=None):
        if playbook.trusted or playbook.secrets_content:
            self.writeInventory(playbook, self.frozen_hostvars)
        else:
@ -3107,6 +3119,9 @@ class AnsibleJob(object):
        if success is not None:
            cmd.extend(['-e', 'zuul_success=%s' % str(bool(success))])

        if will_retry is not None:
            cmd.extend(['-e', f'zuul_will_retry={bool(will_retry)}'])

        if phase:
            cmd.extend(['-e', 'zuul_execution_phase=%s' % phase])
@ -4081,23 +4096,14 @@ class ExecutorServer(BaseMergeServer):
        ansible_job.end_time = time.monotonic()
        duration = ansible_job.end_time - ansible_job.time_starting_build
        params = ansible_job.arguments

        # If the result is None, check if the build has reached
        # its max attempts and if so set the result to
        # RETRY_LIMIT. This must be done in order to correctly
        # process the autohold in the next step. Since we only
        # want to hold the node if the build has reached a final
        # result.
-       if result.get("result") is None:
-           attempts = params["zuul"]["attempts"]
-           try:
-               max_attempts = params["zuul"]["max_attempts"]
-           except KeyError:
-               # TODO (swestphahl):
-               # Remove backward compatibility handling
-               max_attempts = params["max_attempts"]
-           if attempts >= max_attempts:
-               result["result"] = "RETRY_LIMIT"
+       if result.get("result") is None and ansible_job.retry_limit:
+           result["result"] = "RETRY_LIMIT"

        # Provide the hold information back to the scheduler via the build
        # result.