Support fail-fast in project pipelines
In some cases like resource constrained environments it is beneficial to report on changes in a fail fast manner to immediately report if one job fails. This can be useful especially if a project has many expensive long-running jobs. This introduces a fail-fast flag in the project pipeline that let's the project choose the trade off between full information and quick feedback. Change-Id: Ie4a5ac8e025362dbaacd3ae82f2e8369f7447a62
This commit is contained in:
parent
4affbd1152
commit
b0d7c3c69a
@ -1419,6 +1419,19 @@ pipeline.
|
||||
difficult to determine why Zuul did or did not run a certain
|
||||
job, the additional information this provides may help.
|
||||
|
||||
.. attr:: fail-fast
|
||||
:default: false
|
||||
|
||||
If this is set to `true`, Zuul will report a build failure
|
||||
immediately and abort all still running builds. This can be used
|
||||
to save resources in resource constrained environments at the cost
|
||||
of potentially requiring multiple attempts if more than one problem
|
||||
is present.
|
||||
|
||||
Once this is defined it cannot be overridden afterwards. So this
|
||||
can be forced to a specific value by e.g. defining it in a config
|
||||
repo.
|
||||
|
||||
.. _project-template:
|
||||
|
||||
Project Template
|
||||
|
5
releasenotes/notes/fail-fast-57a5694d5ff568cd.yaml
Normal file
5
releasenotes/notes/fail-fast-57a5694d5ff568cd.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Zuul now supports :attr:`project.<pipeline>.fail-fast` to immediately
|
||||
report and cancel builds on the first failure in a buildset.
|
2
tests/fixtures/config/fail-fast/git/common-config/playbooks/run.yaml
vendored
Normal file
2
tests/fixtures/config/fail-fast/git/common-config/playbooks/run.yaml
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
- hosts: all
|
||||
tasks: []
|
63
tests/fixtures/config/fail-fast/git/common-config/zuul.yaml
vendored
Normal file
63
tests/fixtures/config/fail-fast/git/common-config/zuul.yaml
vendored
Normal file
@ -0,0 +1,63 @@
|
||||
- pipeline:
|
||||
name: check
|
||||
manager: independent
|
||||
trigger:
|
||||
gerrit:
|
||||
- event: patchset-created
|
||||
success:
|
||||
gerrit:
|
||||
Verified: 1
|
||||
failure:
|
||||
gerrit:
|
||||
Verified: -1
|
||||
|
||||
- job:
|
||||
name: base
|
||||
parent: null
|
||||
run: playbooks/run.yaml
|
||||
|
||||
- job:
|
||||
name: project-merge
|
||||
|
||||
- job:
|
||||
name: project-test1
|
||||
|
||||
- job:
|
||||
name: project-test2
|
||||
|
||||
- job:
|
||||
name: project-test3
|
||||
|
||||
- job:
|
||||
name: project-test4
|
||||
|
||||
- job:
|
||||
name: project-test5
|
||||
nodeset:
|
||||
nodes:
|
||||
- name: controller
|
||||
label: label1
|
||||
|
||||
- job:
|
||||
name: project-test6
|
||||
|
||||
- project:
|
||||
name: org/project
|
||||
check:
|
||||
fail-fast: true
|
||||
jobs:
|
||||
- project-merge
|
||||
- project-test1:
|
||||
dependencies: project-merge
|
||||
- project-test2:
|
||||
dependencies: project-merge
|
||||
- project-test3:
|
||||
dependencies:
|
||||
- name: project-test2
|
||||
soft: true
|
||||
- project-test4:
|
||||
dependencies: project-test2
|
||||
- project-test5
|
||||
- project-test6:
|
||||
dependencies: project-merge
|
||||
voting: false
|
1
tests/fixtures/config/fail-fast/git/org_project/README
vendored
Normal file
1
tests/fixtures/config/fail-fast/git/org_project/README
vendored
Normal file
@ -0,0 +1 @@
|
||||
test
|
5
tests/fixtures/config/fail-fast/git/org_project/zuul.yaml
vendored
Normal file
5
tests/fixtures/config/fail-fast/git/org_project/zuul.yaml
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
# This tries to unset fail-fast which should not be possible because it's
|
||||
# already set to true in common-config.
|
||||
- project:
|
||||
check:
|
||||
fail-fast: false
|
8
tests/fixtures/config/fail-fast/main.yaml
vendored
Normal file
8
tests/fixtures/config/fail-fast/main.yaml
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
- tenant:
|
||||
name: tenant-one
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- common-config
|
||||
untrusted-projects:
|
||||
- org/project
|
@ -7104,3 +7104,92 @@ class TestSchedulerBranchMatcher(ZuulTestCase):
|
||||
"A should report start and success")
|
||||
self.assertIn('gate', A.messages[1],
|
||||
"A should transit gate")
|
||||
|
||||
|
||||
class TestSchedulerFailFast(ZuulTestCase):
|
||||
tenant_config_file = 'config/fail-fast/main.yaml'
|
||||
|
||||
def test_fail_fast(self):
|
||||
"""
|
||||
Tests that a pipeline that is flagged with fail-fast
|
||||
aborts jobs early.
|
||||
"""
|
||||
self.executor_server.hold_jobs_in_build = True
|
||||
self.fake_nodepool.pause()
|
||||
|
||||
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
|
||||
self.executor_server.failJob('project-test1', A)
|
||||
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.waitUntilSettled()
|
||||
self.assertEqual(len(self.builds), 1)
|
||||
self.assertEqual(self.builds[0].name, 'project-merge')
|
||||
self.executor_server.release('project-merge')
|
||||
self.waitUntilSettled()
|
||||
|
||||
# Now project-test1, project-test2 and project-test6
|
||||
# should be running
|
||||
self.assertEqual(len(self.builds), 3)
|
||||
|
||||
# Release project-test1 which will fail
|
||||
self.executor_server.release('project-test1')
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.fake_nodepool.unpause()
|
||||
self.waitUntilSettled()
|
||||
|
||||
# Now project-test2 must be aborted
|
||||
self.assertEqual(len(self.builds), 0)
|
||||
self.assertEqual(A.reported, 1)
|
||||
self.assertHistory([
|
||||
dict(name='project-merge', result='SUCCESS', changes='1,1'),
|
||||
dict(name='project-test1', result='FAILURE', changes='1,1'),
|
||||
dict(name='project-test2', result='ABORTED', changes='1,1'),
|
||||
dict(name='project-test6', result='ABORTED', changes='1,1'),
|
||||
], ordered=False)
|
||||
|
||||
def test_fail_fast_nonvoting(self):
|
||||
"""
|
||||
Tests that a pipeline that is flagged with fail-fast
|
||||
doesn't abort jobs due to a non-voting job.
|
||||
"""
|
||||
self.executor_server.hold_jobs_in_build = True
|
||||
|
||||
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
|
||||
self.executor_server.failJob('project-test6', A)
|
||||
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.waitUntilSettled()
|
||||
self.assertEqual(len(self.builds), 2)
|
||||
self.assertEqual(self.builds[0].name, 'project-merge')
|
||||
self.executor_server.release('project-merge')
|
||||
self.waitUntilSettled()
|
||||
|
||||
# Now project-test1, project-test2, project-test5 and project-test6
|
||||
# should be running
|
||||
self.assertEqual(len(self.builds), 4)
|
||||
|
||||
# Release project-test6 which will fail
|
||||
self.executor_server.release('project-test6')
|
||||
self.waitUntilSettled()
|
||||
|
||||
# Now project-test1, project-test2 and project-test5 should be running
|
||||
self.assertEqual(len(self.builds), 3)
|
||||
|
||||
self.executor_server.hold_jobs_in_build = False
|
||||
self.executor_server.release()
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.assertEqual(len(self.builds), 0)
|
||||
self.assertEqual(A.reported, 1)
|
||||
self.assertHistory([
|
||||
dict(name='project-merge', result='SUCCESS', changes='1,1'),
|
||||
dict(name='project-test1', result='SUCCESS', changes='1,1'),
|
||||
dict(name='project-test2', result='SUCCESS', changes='1,1'),
|
||||
dict(name='project-test3', result='SUCCESS', changes='1,1'),
|
||||
dict(name='project-test4', result='SUCCESS', changes='1,1'),
|
||||
dict(name='project-test5', result='SUCCESS', changes='1,1'),
|
||||
dict(name='project-test6', result='FAILURE', changes='1,1'),
|
||||
], ordered=False)
|
||||
|
@ -926,6 +926,7 @@ class ProjectTemplateParser(object):
|
||||
pipeline_contents = {
|
||||
'queue': str,
|
||||
'debug': bool,
|
||||
'fail-fast': bool,
|
||||
'jobs': job_list
|
||||
}
|
||||
|
||||
@ -955,6 +956,8 @@ class ProjectTemplateParser(object):
|
||||
project_template.pipelines[pipeline_name] = project_pipeline
|
||||
project_pipeline.queue_name = conf_pipeline.get('queue')
|
||||
project_pipeline.debug = conf_pipeline.get('debug')
|
||||
project_pipeline.fail_fast = conf_pipeline.get(
|
||||
'fail-fast')
|
||||
self.parseJobList(
|
||||
conf_pipeline.get('jobs', []),
|
||||
source_context, start_mark, project_pipeline.job_list)
|
||||
@ -1007,6 +1010,7 @@ class ProjectParser(object):
|
||||
pipeline_contents = {
|
||||
'queue': str,
|
||||
'debug': bool,
|
||||
'fail-fast': bool,
|
||||
'jobs': job_list
|
||||
}
|
||||
|
||||
|
@ -449,6 +449,13 @@ class ExecutorClient(object):
|
||||
# track of which results are non-final.
|
||||
if build.retry:
|
||||
result = None
|
||||
|
||||
# If the build was canceled, we did actively cancel the job so
|
||||
# don't overwrite the result and don't retry.
|
||||
if build.canceled:
|
||||
result = build.result
|
||||
build.retry = False
|
||||
|
||||
self.sched.onBuildCompleted(build, result, result_data, warnings)
|
||||
# The test suite expects the build to be removed from the
|
||||
# internal dict after it's added to the report queue.
|
||||
|
@ -845,6 +845,13 @@ class PipelineManager(object):
|
||||
if build:
|
||||
build_set.removeBuild(build)
|
||||
|
||||
def _cancelRunningBuilds(self, build_set):
|
||||
item = build_set.item
|
||||
for job in item.getJobs():
|
||||
build = build_set.getBuild(job.name)
|
||||
if not build or not build.result:
|
||||
self.sched.cancelJob(build_set, job, final=True)
|
||||
|
||||
def onBuildCompleted(self, build):
|
||||
item = build.build_set.item
|
||||
|
||||
@ -863,6 +870,15 @@ class PipelineManager(object):
|
||||
self._resetDependentBuilds(build.build_set, build)
|
||||
|
||||
self._resumeBuilds(build.build_set)
|
||||
|
||||
if (item.project_pipeline_config.fail_fast and
|
||||
build.failed and build.job.voting):
|
||||
# If fail-fast is set and the build is not successful
|
||||
# cancel all remaining jobs.
|
||||
self.log.debug("Build %s failed and fail-fast enabled, canceling "
|
||||
"running builds", build)
|
||||
self._cancelRunningBuilds(build.build_set)
|
||||
|
||||
return True
|
||||
|
||||
def onFilesChangesCompleted(self, event):
|
||||
|
@ -1818,6 +1818,12 @@ class Build(object):
|
||||
return ('<Build %s of %s voting:%s on %s>' %
|
||||
(self.uuid, self.job.name, self.job.voting, self.worker))
|
||||
|
||||
@property
|
||||
def failed(self):
|
||||
if self.result and self.result not in ['SUCCESS', 'SKIPPED']:
|
||||
return True
|
||||
return False
|
||||
|
||||
@property
|
||||
def pipeline(self):
|
||||
return self.build_set.item.pipeline
|
||||
@ -2466,7 +2472,7 @@ class QueueItem(object):
|
||||
build = build_set.getBuild(job.name)
|
||||
if build and (build.result == 'SUCCESS' or build.paused):
|
||||
successful_job_names.add(job.name)
|
||||
elif build and build.result in ('SKIPPED', 'FAILURE'):
|
||||
elif build and build.result in ('SKIPPED', 'FAILURE', 'CANCELED'):
|
||||
pass
|
||||
else:
|
||||
nodeset = build_set.getJobNodeSet(job.name)
|
||||
@ -3125,6 +3131,7 @@ class ProjectPipelineConfig(ConfigObject):
|
||||
self.queue_name = None
|
||||
self.debug = False
|
||||
self.debug_messages = []
|
||||
self.fail_fast = None
|
||||
self.variables = {}
|
||||
|
||||
def addDebug(self, msg):
|
||||
@ -3137,6 +3144,8 @@ class ProjectPipelineConfig(ConfigObject):
|
||||
self.queue_name = other.queue_name
|
||||
if other.debug:
|
||||
self.debug = other.debug
|
||||
if self.fail_fast is None:
|
||||
self.fail_fast = other.fail_fast
|
||||
self.job_list.inheritFrom(other.job_list)
|
||||
|
||||
def updateVariables(self, other):
|
||||
|
@ -37,6 +37,7 @@ from zuul.lib.config import get_default
|
||||
from zuul.lib.gear_utils import getGearmanFunctions
|
||||
from zuul.lib.statsd import get_statsd
|
||||
import zuul.lib.queue
|
||||
from zuul.model import Build
|
||||
|
||||
COMMANDS = ['full-reconfigure', 'stop']
|
||||
|
||||
@ -1417,7 +1418,7 @@ class Scheduler(threading.Thread):
|
||||
other_change.refresh_deps = True
|
||||
change.refresh_deps = True
|
||||
|
||||
def cancelJob(self, buildset, job, build=None):
|
||||
def cancelJob(self, buildset, job, build=None, final=False):
|
||||
item = buildset.item
|
||||
job_name = job.name
|
||||
try:
|
||||
@ -1457,6 +1458,13 @@ class Scheduler(threading.Thread):
|
||||
nodeset = buildset.getJobNodeSet(job_name)
|
||||
if nodeset:
|
||||
self.nodepool.returnNodeSet(nodeset)
|
||||
|
||||
if final:
|
||||
# If final is set make sure that the job is not resurrected
|
||||
# later by re-requesting nodes.
|
||||
fakebuild = Build(job, None)
|
||||
fakebuild.result = 'CANCELED'
|
||||
buildset.addBuild(fakebuild)
|
||||
finally:
|
||||
# Release the semaphore in any case
|
||||
tenant = buildset.item.pipeline.tenant
|
||||
|
Loading…
x
Reference in New Issue
Block a user