Merge "Support fail-fast in project pipelines"

This commit is contained in:
Zuul 2019-05-16 21:51:44 +00:00 committed by Gerrit Code Review
commit bb6078da0b
13 changed files with 232 additions and 2 deletions

View File

@ -1419,6 +1419,19 @@ pipeline.
difficult to determine why Zuul did or did not run a certain difficult to determine why Zuul did or did not run a certain
job, the additional information this provides may help. job, the additional information this provides may help.
.. attr:: fail-fast
:default: false
If this is set to `true`, Zuul will report a build failure
immediately and abort all builds that are still running. Only the
failure of a voting job triggers this; a failing non-voting job does
not abort the remaining builds. This can be used to save resources
in resource-constrained environments at the cost of potentially
requiring multiple attempts if more than one problem is present.
Once this is defined, it cannot be overridden afterwards. It can
therefore be forced to a specific value by, for example, defining it
in a config repo.
.. _project-template: .. _project-template:
Project Template Project Template

View File

@ -0,0 +1,5 @@
---
features:
- |
Zuul now supports :attr:`project.<pipeline>.fail-fast` to immediately
report and cancel builds on the first failure in a buildset.

View File

@ -0,0 +1,2 @@
- hosts: all
tasks: []

View File

@ -0,0 +1,63 @@
- pipeline:
name: check
manager: independent
trigger:
gerrit:
- event: patchset-created
success:
gerrit:
Verified: 1
failure:
gerrit:
Verified: -1
- job:
name: base
parent: null
run: playbooks/run.yaml
- job:
name: project-merge
- job:
name: project-test1
- job:
name: project-test2
- job:
name: project-test3
- job:
name: project-test4
- job:
name: project-test5
nodeset:
nodes:
- name: controller
label: label1
- job:
name: project-test6
- project:
name: org/project
check:
fail-fast: true
jobs:
- project-merge
- project-test1:
dependencies: project-merge
- project-test2:
dependencies: project-merge
- project-test3:
dependencies:
- name: project-test2
soft: true
- project-test4:
dependencies: project-test2
- project-test5
- project-test6:
dependencies: project-merge
voting: false

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,5 @@
# This tries to unset fail-fast which should not be possible because it's
# already set to true in common-config.
- project:
check:
fail-fast: false

View File

@ -0,0 +1,8 @@
- tenant:
name: tenant-one
source:
gerrit:
config-projects:
- common-config
untrusted-projects:
- org/project

View File

@ -7163,3 +7163,92 @@ class TestSchedulerBranchMatcher(ZuulTestCase):
"A should report start and success") "A should report start and success")
self.assertIn('gate', A.messages[1], self.assertIn('gate', A.messages[1],
"A should transit gate") "A should transit gate")
class TestSchedulerFailFast(ZuulTestCase):
# Scheduler tests for the per-project-pipeline ``fail-fast`` option.
# NOTE(review): presumably the tenant config below loads the fixture in
# which org/project sets ``fail-fast: true`` in the check pipeline and
# project-test6 is non-voting -- confirm against the fixture files.
tenant_config_file = 'config/fail-fast/main.yaml'
def test_fail_fast(self):
"""
Tests that a pipeline that is flagged with fail-fast
aborts jobs early.
"""
# Hold builds so that they can be released one at a time, and pause the
# fake nodepool.
# NOTE(review): presumably the pause keeps jobs that request nodes
# (project-test5 in the fixture) from starting -- confirm.
self.executor_server.hold_jobs_in_build = True
self.fake_nodepool.pause()
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
# project-test1 is set up to fail for change A.
self.executor_server.failJob('project-test1', A)
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.waitUntilSettled()
# Only project-merge is expected to be running at this point.
self.assertEqual(len(self.builds), 1)
self.assertEqual(self.builds[0].name, 'project-merge')
self.executor_server.release('project-merge')
self.waitUntilSettled()
# Now project-test1, project-test2 and project-test6
# should be running
self.assertEqual(len(self.builds), 3)
# Release project-test1 which will fail
self.executor_server.release('project-test1')
self.waitUntilSettled()
# Unpause nodepool after the failure; the expected history below still
# contains no entries for project-test3, project-test4 or project-test5.
self.fake_nodepool.unpause()
self.waitUntilSettled()
# Now project-test2 and project-test6 must be aborted
self.assertEqual(len(self.builds), 0)
self.assertEqual(A.reported, 1)
self.assertHistory([
dict(name='project-merge', result='SUCCESS', changes='1,1'),
dict(name='project-test1', result='FAILURE', changes='1,1'),
dict(name='project-test2', result='ABORTED', changes='1,1'),
dict(name='project-test6', result='ABORTED', changes='1,1'),
], ordered=False)
def test_fail_fast_nonvoting(self):
"""
Tests that a pipeline that is flagged with fail-fast
doesn't abort jobs due to a non-voting job.
"""
# Hold builds so that they can be released one at a time. Unlike
# test_fail_fast, nodepool is not paused here.
self.executor_server.hold_jobs_in_build = True
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
# project-test6 is set up to fail for change A.
self.executor_server.failJob('project-test6', A)
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.waitUntilSettled()
# Two builds are expected, the first being project-merge.
# NOTE(review): presumably the second is project-test5, which has no
# dependencies in the fixture -- confirm.
self.assertEqual(len(self.builds), 2)
self.assertEqual(self.builds[0].name, 'project-merge')
self.executor_server.release('project-merge')
self.waitUntilSettled()
# Now project-test1, project-test2, project-test5 and project-test6
# should be running
self.assertEqual(len(self.builds), 4)
# Release project-test6 which will fail
self.executor_server.release('project-test6')
self.waitUntilSettled()
# Now project-test1, project-test2 and project-test5 should be running
self.assertEqual(len(self.builds), 3)
# Stop holding builds and release everything so the remaining jobs
# run to completion.
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
self.waitUntilSettled()
self.assertEqual(len(self.builds), 0)
self.assertEqual(A.reported, 1)
# Every job except the failing project-test6 is expected to succeed.
self.assertHistory([
dict(name='project-merge', result='SUCCESS', changes='1,1'),
dict(name='project-test1', result='SUCCESS', changes='1,1'),
dict(name='project-test2', result='SUCCESS', changes='1,1'),
dict(name='project-test3', result='SUCCESS', changes='1,1'),
dict(name='project-test4', result='SUCCESS', changes='1,1'),
dict(name='project-test5', result='SUCCESS', changes='1,1'),
dict(name='project-test6', result='FAILURE', changes='1,1'),
], ordered=False)

View File

@ -926,6 +926,7 @@ class ProjectTemplateParser(object):
pipeline_contents = { pipeline_contents = {
'queue': str, 'queue': str,
'debug': bool, 'debug': bool,
'fail-fast': bool,
'jobs': job_list 'jobs': job_list
} }
@ -955,6 +956,8 @@ class ProjectTemplateParser(object):
project_template.pipelines[pipeline_name] = project_pipeline project_template.pipelines[pipeline_name] = project_pipeline
project_pipeline.queue_name = conf_pipeline.get('queue') project_pipeline.queue_name = conf_pipeline.get('queue')
project_pipeline.debug = conf_pipeline.get('debug') project_pipeline.debug = conf_pipeline.get('debug')
project_pipeline.fail_fast = conf_pipeline.get(
'fail-fast')
self.parseJobList( self.parseJobList(
conf_pipeline.get('jobs', []), conf_pipeline.get('jobs', []),
source_context, start_mark, project_pipeline.job_list) source_context, start_mark, project_pipeline.job_list)
@ -1007,6 +1010,7 @@ class ProjectParser(object):
pipeline_contents = { pipeline_contents = {
'queue': str, 'queue': str,
'debug': bool, 'debug': bool,
'fail-fast': bool,
'jobs': job_list 'jobs': job_list
} }

View File

@ -449,6 +449,13 @@ class ExecutorClient(object):
# track of which results are non-final. # track of which results are non-final.
if build.retry: if build.retry:
result = None result = None
# If the build was canceled, we did actively cancel the job so
# don't overwrite the result and don't retry.
if build.canceled:
result = build.result
build.retry = False
self.sched.onBuildCompleted(build, result, result_data, warnings) self.sched.onBuildCompleted(build, result, result_data, warnings)
# The test suite expects the build to be removed from the # The test suite expects the build to be removed from the
# internal dict after it's added to the report queue. # internal dict after it's added to the report queue.

View File

@ -852,6 +852,13 @@ class PipelineManager(object):
if build: if build:
build_set.removeBuild(build) build_set.removeBuild(build)
def _cancelRunningBuilds(self, build_set):
    """Cancel every job of the build set's item that has no result yet.

    Iterates over all jobs of ``build_set.item`` and asks the scheduler
    to cancel, with ``final=True``, each job that either has no build in
    the build set or whose build has a falsy ``result``.

    :param build_set: the build set whose unfinished jobs are canceled.
    """
    for job in build_set.item.getJobs():
        existing = build_set.getBuild(job.name)
        if existing and existing.result:
            # This job already produced a result; leave it untouched.
            continue
        self.sched.cancelJob(build_set, job, final=True)
def onBuildCompleted(self, build): def onBuildCompleted(self, build):
item = build.build_set.item item = build.build_set.item
@ -870,6 +877,15 @@ class PipelineManager(object):
self._resetDependentBuilds(build.build_set, build) self._resetDependentBuilds(build.build_set, build)
self._resumeBuilds(build.build_set) self._resumeBuilds(build.build_set)
if (item.project_pipeline_config.fail_fast and
build.failed and build.job.voting):
# If fail-fast is set and a voting build has failed,
# cancel all remaining jobs.
self.log.debug("Build %s failed and fail-fast enabled, canceling "
"running builds", build)
self._cancelRunningBuilds(build.build_set)
return True return True
def onFilesChangesCompleted(self, event): def onFilesChangesCompleted(self, event):

View File

@ -1830,6 +1830,12 @@ class Build(object):
return ('<Build %s of %s voting:%s on %s>' % return ('<Build %s of %s voting:%s on %s>' %
(self.uuid, self.job.name, self.job.voting, self.worker)) (self.uuid, self.job.name, self.job.voting, self.worker))
@property
def failed(self):
    """Whether this build has a failing result.

    :returns: ``True`` when ``self.result`` is set and is neither
        ``'SUCCESS'`` nor ``'SKIPPED'``; ``False`` otherwise.
    :rtype: bool
    """
    # A falsy result (e.g. ``None``) is never treated as a failure.
    return bool(self.result) and self.result not in ('SUCCESS', 'SKIPPED')
@property @property
def pipeline(self): def pipeline(self):
return self.build_set.item.pipeline return self.build_set.item.pipeline
@ -2478,7 +2484,7 @@ class QueueItem(object):
build = build_set.getBuild(job.name) build = build_set.getBuild(job.name)
if build and (build.result == 'SUCCESS' or build.paused): if build and (build.result == 'SUCCESS' or build.paused):
successful_job_names.add(job.name) successful_job_names.add(job.name)
elif build and build.result in ('SKIPPED', 'FAILURE'): elif build and build.result in ('SKIPPED', 'FAILURE', 'CANCELED'):
pass pass
else: else:
nodeset = build_set.getJobNodeSet(job.name) nodeset = build_set.getJobNodeSet(job.name)
@ -3151,6 +3157,7 @@ class ProjectPipelineConfig(ConfigObject):
self.queue_name = None self.queue_name = None
self.debug = False self.debug = False
self.debug_messages = [] self.debug_messages = []
self.fail_fast = None
self.variables = {} self.variables = {}
def addDebug(self, msg): def addDebug(self, msg):
@ -3163,6 +3170,8 @@ class ProjectPipelineConfig(ConfigObject):
self.queue_name = other.queue_name self.queue_name = other.queue_name
if other.debug: if other.debug:
self.debug = other.debug self.debug = other.debug
if self.fail_fast is None:
self.fail_fast = other.fail_fast
self.job_list.inheritFrom(other.job_list) self.job_list.inheritFrom(other.job_list)
def updateVariables(self, other): def updateVariables(self, other):

View File

@ -37,6 +37,7 @@ from zuul.lib.config import get_default
from zuul.lib.gear_utils import getGearmanFunctions from zuul.lib.gear_utils import getGearmanFunctions
from zuul.lib.statsd import get_statsd from zuul.lib.statsd import get_statsd
import zuul.lib.queue import zuul.lib.queue
from zuul.model import Build
COMMANDS = ['full-reconfigure', 'stop'] COMMANDS = ['full-reconfigure', 'stop']
@ -1419,7 +1420,7 @@ class Scheduler(threading.Thread):
other_change.refresh_deps = True other_change.refresh_deps = True
change.refresh_deps = True change.refresh_deps = True
def cancelJob(self, buildset, job, build=None): def cancelJob(self, buildset, job, build=None, final=False):
item = buildset.item item = buildset.item
job_name = job.name job_name = job.name
try: try:
@ -1459,6 +1460,13 @@ class Scheduler(threading.Thread):
nodeset = buildset.getJobNodeSet(job_name) nodeset = buildset.getJobNodeSet(job_name)
if nodeset: if nodeset:
self.nodepool.returnNodeSet(nodeset) self.nodepool.returnNodeSet(nodeset)
if final:
# If final is set make sure that the job is not resurrected
# later by re-requesting nodes.
fakebuild = Build(job, None)
fakebuild.result = 'CANCELED'
buildset.addBuild(fakebuild)
finally: finally:
# Release the semaphore in any case # Release the semaphore in any case
tenant = buildset.item.pipeline.tenant tenant = buildset.item.pipeline.tenant