Add attempts logic for jobs

Today, if a job is aborted, zuul will relaunch said job until it reports
success or failure.  If the job continues to abort, it will loop forever.
To prevent this, add the ability to limit the number of launch attempts.
By default we'll try to launch an aborted job a total of 3 times before
RETRY_LIMIT is returned as the result.

Change-Id: Ie26fdc29c07430ebfb3df8be8ac1786d63d7e0fe
Signed-off-by: Paul Belanger <pabelanger@redhat.com>
This commit is contained in:
Paul Belanger 2016-11-08 10:56:31 -05:00
parent 38ce39fe58
commit 71d9817406
8 changed files with 88 additions and 0 deletions

View File

@ -803,6 +803,11 @@ each job as it builds a list from the project specification.
Boolean value (``true`` or ``false``) that indicates whether
a job is voting or not. Default: ``true``.
**attempts (optional)**
Number of times zuul will attempt to launch a job. Once this limit is
reached, zuul will report RETRY_LIMIT as the job result.
Defaults to 3.
**tags (optional)**
A list of arbitrary strings which will be associated with the job.
Can be used by the parameter-function to alter behavior based on

View File

@ -540,6 +540,7 @@ class FakeBuild(threading.Thread):
self.wait_condition = threading.Condition()
self.waiting = False
self.aborted = False
self.requeue = False
self.created = time.time()
self.description = ''
self.run_error = False
@ -602,6 +603,8 @@ class FakeBuild(threading.Thread):
result = 'FAILURE'
if self.aborted:
result = 'ABORTED'
if self.requeue:
result = None
if self.run_error:
work_fail = True

View File

@ -0,0 +1,30 @@
pipelines:
- name: check
manager: IndependentPipelineManager
trigger:
gerrit:
- event: patchset-created
success:
gerrit:
verified: 1
failure:
gerrit:
verified: -1
- name: post
manager: IndependentPipelineManager
trigger:
gerrit:
- event: ref-updated
ref: ^(?!refs/).*$
jobs:
- name: project-test1
attempts: 4
projects:
- name: org/project
check:
- project-merge:
- project-test1
- project-test2

View File

@ -4481,3 +4481,36 @@ For CI problems and help debugging, contact ci@example.org"""
self.assertIn(
'- docs-draft-test2 https://server/job/docs-draft-test2/1/',
body[3])
def test_rerun_on_abort(self):
    """Test that a requeued job is relaunched until its attempts limit.

    Uses the layout-abort-attempts.yaml fixture (which presumably sets
    ``attempts: 4`` on project-test1 -- confirm against the fixture).
    Held builds are flagged ``requeue`` before each release, and the
    change must end up reported once with RETRY_LIMIT in its message.
    """
    self.config.set('zuul', 'layout_config',
                    'tests/fixtures/layout-abort-attempts.yaml')
    self.sched.reconfigure(self.config)
    self.worker.hold_jobs_in_build = True

    A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
    self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
    self.waitUntilSettled()

    # Let the merge job finish so the two dependent test jobs start and
    # are held by the worker.
    self.worker.release('.*-merge')
    self.waitUntilSettled()
    self.assertEqual(len(self.builds), 2)

    # First attempt: flag the first held build (presumably project-test1)
    # for requeue, then release every test job.  The pattern is a regex,
    # so it needs '.*' (not a glob-style '*') to match any suffix.
    self.builds[0].requeue = True
    self.worker.release('.*-test.*')
    self.waitUntilSettled()

    # Three further attempts: each time only the relaunched build should
    # be held, and it is requeued again.
    for _ in range(3):
        self.assertEqual(len(self.builds), 1)
        self.builds[0].requeue = True
        self.worker.release('.*-test1')
        self.waitUntilSettled()

    # Stop holding builds and flush anything still outstanding.
    self.worker.hold_jobs_in_build = False
    self.worker.release()
    self.waitUntilSettled()

    self.assertEqual(len(self.history), 6)
    self.assertEqual(self.countJobResults(self.history, 'SUCCESS'), 2)
    self.assertEqual(A.reported, 1)
    self.assertIn('RETRY_LIMIT', A.messages[0])

View File

@ -367,6 +367,12 @@ class Gearman(object):
self.onBuildCompleted(gearman_job, 'NOT_REGISTERED')
return build
# NOTE(pabelanger): Rather than looping forever, check to see if the job
# has exceeded its attempts limit.
if item.current_build_set.getTries(job.name) > job.attempts:
self.onBuildCompleted(gearman_job, 'RETRY_LIMIT')
return build
if pipeline.precedence == zuul.model.PRECEDENCE_NORMAL:
precedence = gear.PRECEDENCE_NORMAL
elif pipeline.precedence == zuul.model.PRECEDENCE_HIGH:

View File

@ -103,6 +103,7 @@ class LayoutSchema(object):
'success-pattern': str,
'hold-following-changes': bool,
'voting': bool,
'attempts': int,
'mutex': str,
'tags': toList(str),
'parameter-function': str,

View File

@ -466,6 +466,8 @@ class Job(object):
self._files = []
self.skip_if_matcher = None
self.swift = {}
# Number of attempts to launch a job before giving up.
self.attempts = 3
def __str__(self):
return self.name
@ -646,6 +648,7 @@ class BuildSet(object):
self.unable_to_merge = False
self.failing_reasons = []
self.merge_state = self.NEW
self.tries = {}
def __repr__(self):
return '<BuildSet item: %s #builds: %s merge state: %s>' % (
@ -671,9 +674,12 @@ class BuildSet(object):
def addBuild(self, build):
    """Register ``build`` under its job name and attach it to this set.

    The first build added for a job starts that job's try counter at 1;
    a counter that already exists is left untouched.
    """
    job_name = build.job.name
    self.builds[job_name] = build
    # setdefault only writes when the job has no counter yet.
    self.tries.setdefault(job_name, 1)
    build.build_set = self
def removeBuild(self, build):
    """Drop ``build`` from this set and count one more try for its job."""
    job_name = build.job.name
    # Bump the counter first, then forget the build itself.
    self.tries[job_name] += 1
    del self.builds[job_name]
def getBuild(self, job_name):
@ -684,6 +690,9 @@ class BuildSet(object):
keys.sort()
return [self.builds.get(x) for x in keys]
def getTries(self, job_name):
    """Return the number of tries recorded for ``job_name``.

    A job with no recorded tries yields 0 instead of None, so the result
    can always be compared numerically (for example against a job's
    attempts limit) without raising TypeError on Python 3.
    """
    return self.tries.get(job_name, 0)
class QueueItem(object):
"""A changish inside of a Pipeline queue"""

View File

@ -529,6 +529,7 @@ class Scheduler(threading.Thread):
m = config_job.get('hold-following-changes', False)
if m:
job.hold_following_changes = True
job.attempts = config_job.get('attempts', 3)
m = config_job.get('voting', None)
if m is not None:
job.voting = m