From b0d7c3c69aeff757c4fb4aa8d186a8fcc8b4f88c Mon Sep 17 00:00:00 2001 From: Tobias Henkel Date: Mon, 15 Apr 2019 20:07:55 +0200 Subject: [PATCH] Support fail-fast in project pipelines In some cases like resource constrained environments it is beneficial to report on changes in a fail fast manner to immediately report if one job fails. This can be useful especially if a project has many expensive long-running jobs. This introduces a fail-fast flag in the project pipeline that let's the project choose the trade off between full information and quick feedback. Change-Id: Ie4a5ac8e025362dbaacd3ae82f2e8369f7447a62 --- doc/source/user/config.rst | 13 +++ .../notes/fail-fast-57a5694d5ff568cd.yaml | 5 ++ .../git/common-config/playbooks/run.yaml | 2 + .../fail-fast/git/common-config/zuul.yaml | 63 +++++++++++++ .../config/fail-fast/git/org_project/README | 1 + .../fail-fast/git/org_project/zuul.yaml | 5 ++ tests/fixtures/config/fail-fast/main.yaml | 8 ++ tests/unit/test_scheduler.py | 89 +++++++++++++++++++ zuul/configloader.py | 4 + zuul/executor/client.py | 7 ++ zuul/manager/__init__.py | 16 ++++ zuul/model.py | 11 ++- zuul/scheduler.py | 10 ++- 13 files changed, 232 insertions(+), 2 deletions(-) create mode 100644 releasenotes/notes/fail-fast-57a5694d5ff568cd.yaml create mode 100644 tests/fixtures/config/fail-fast/git/common-config/playbooks/run.yaml create mode 100644 tests/fixtures/config/fail-fast/git/common-config/zuul.yaml create mode 100644 tests/fixtures/config/fail-fast/git/org_project/README create mode 100644 tests/fixtures/config/fail-fast/git/org_project/zuul.yaml create mode 100644 tests/fixtures/config/fail-fast/main.yaml diff --git a/doc/source/user/config.rst b/doc/source/user/config.rst index 042a20b5e7..d64243ae2e 100644 --- a/doc/source/user/config.rst +++ b/doc/source/user/config.rst @@ -1419,6 +1419,19 @@ pipeline. difficult to determine why Zuul did or did not run a certain job, the additional information this provides may help. + .. attr:: fail-fast + :default: false + + If this is set to `true`, Zuul will report a build failure + immediately and abort all still running builds. This can be used + to save resources in resource constrained environments at the cost + of potentially requiring multiple attempts if more than one problem + is present. + + Once this is defined it cannot be overridden afterwards. So this + can be forced to a specific value by e.g. defining it in a config + repo. + .. _project-template: Project Template diff --git a/releasenotes/notes/fail-fast-57a5694d5ff568cd.yaml b/releasenotes/notes/fail-fast-57a5694d5ff568cd.yaml new file mode 100644 index 0000000000..e3b2633915 --- /dev/null +++ b/releasenotes/notes/fail-fast-57a5694d5ff568cd.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + Zuul now supports :attr:`project..fail-fast` to immediately + report and cancel builds on the first failure in a buildset. diff --git a/tests/fixtures/config/fail-fast/git/common-config/playbooks/run.yaml b/tests/fixtures/config/fail-fast/git/common-config/playbooks/run.yaml new file mode 100644 index 0000000000..f679dceaef --- /dev/null +++ b/tests/fixtures/config/fail-fast/git/common-config/playbooks/run.yaml @@ -0,0 +1,2 @@ +- hosts: all + tasks: [] diff --git a/tests/fixtures/config/fail-fast/git/common-config/zuul.yaml b/tests/fixtures/config/fail-fast/git/common-config/zuul.yaml new file mode 100644 index 0000000000..ace1f59d65 --- /dev/null +++ b/tests/fixtures/config/fail-fast/git/common-config/zuul.yaml @@ -0,0 +1,63 @@ +- pipeline: + name: check + manager: independent + trigger: + gerrit: + - event: patchset-created + success: + gerrit: + Verified: 1 + failure: + gerrit: + Verified: -1 + +- job: + name: base + parent: null + run: playbooks/run.yaml + +- job: + name: project-merge + +- job: + name: project-test1 + +- job: + name: project-test2 + +- job: + name: project-test3 + +- job: + name: project-test4 + +- job: + name: project-test5 + nodeset: + nodes: + - name: controller + label: label1 + +- job: + name: project-test6 + +- project: + name: org/project + check: + fail-fast: true + jobs: + - project-merge + - project-test1: + dependencies: project-merge + - project-test2: + dependencies: project-merge + - project-test3: + dependencies: + - name: project-test2 + soft: true + - project-test4: + dependencies: project-test2 + - project-test5 + - project-test6: + dependencies: project-merge + voting: false diff --git a/tests/fixtures/config/fail-fast/git/org_project/README b/tests/fixtures/config/fail-fast/git/org_project/README new file mode 100644 index 0000000000..9daeafb986 --- /dev/null +++ b/tests/fixtures/config/fail-fast/git/org_project/README @@ -0,0 +1 @@ +test diff --git a/tests/fixtures/config/fail-fast/git/org_project/zuul.yaml b/tests/fixtures/config/fail-fast/git/org_project/zuul.yaml new file mode 100644 index 0000000000..9c85a171f1 --- /dev/null +++ b/tests/fixtures/config/fail-fast/git/org_project/zuul.yaml @@ -0,0 +1,5 @@ +# This tries to unset fail-fast which should not be possible because it's +# already set to true in common-config. +- project: + check: + fail-fast: false diff --git a/tests/fixtures/config/fail-fast/main.yaml b/tests/fixtures/config/fail-fast/main.yaml new file mode 100644 index 0000000000..208e274b13 --- /dev/null +++ b/tests/fixtures/config/fail-fast/main.yaml @@ -0,0 +1,8 @@ +- tenant: + name: tenant-one + source: + gerrit: + config-projects: + - common-config + untrusted-projects: + - org/project diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index 49a6959111..b388c6a722 100644 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -7104,3 +7104,92 @@ class TestSchedulerBranchMatcher(ZuulTestCase): "A should report start and success") self.assertIn('gate', A.messages[1], "A should transit gate") + + +class TestSchedulerFailFast(ZuulTestCase): + tenant_config_file = 'config/fail-fast/main.yaml' + + def test_fail_fast(self): + """ + Tests that a pipeline that is flagged with fail-fast + aborts jobs early. + """ + self.executor_server.hold_jobs_in_build = True + self.fake_nodepool.pause() + + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + self.executor_server.failJob('project-test1', A) + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + self.waitUntilSettled() + self.assertEqual(len(self.builds), 1) + self.assertEqual(self.builds[0].name, 'project-merge') + self.executor_server.release('project-merge') + self.waitUntilSettled() + + # Now project-test1, project-test2 and project-test6 + # should be running + self.assertEqual(len(self.builds), 3) + + # Release project-test1 which will fail + self.executor_server.release('project-test1') + self.waitUntilSettled() + + self.fake_nodepool.unpause() + self.waitUntilSettled() + + # Now project-test2 must be aborted + self.assertEqual(len(self.builds), 0) + self.assertEqual(A.reported, 1) + self.assertHistory([ + dict(name='project-merge', result='SUCCESS', changes='1,1'), + dict(name='project-test1', result='FAILURE', changes='1,1'), + dict(name='project-test2', result='ABORTED', changes='1,1'), + dict(name='project-test6', result='ABORTED', changes='1,1'), + ], ordered=False) + + def test_fail_fast_nonvoting(self): + """ + Tests that a pipeline that is flagged with fail-fast + doesn't abort jobs due to a non-voting job. + """ + self.executor_server.hold_jobs_in_build = True + + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + self.executor_server.failJob('project-test6', A) + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + self.waitUntilSettled() + self.assertEqual(len(self.builds), 2) + self.assertEqual(self.builds[0].name, 'project-merge') + self.executor_server.release('project-merge') + self.waitUntilSettled() + + # Now project-test1, project-test2, project-test5 and project-test6 + # should be running + self.assertEqual(len(self.builds), 4) + + # Release project-test6 which will fail + self.executor_server.release('project-test6') + self.waitUntilSettled() + + # Now project-test1, project-test2 and project-test5 should be running + self.assertEqual(len(self.builds), 3) + + self.executor_server.hold_jobs_in_build = False + self.executor_server.release() + self.waitUntilSettled() + + self.assertEqual(len(self.builds), 0) + self.assertEqual(A.reported, 1) + self.assertHistory([ + dict(name='project-merge', result='SUCCESS', changes='1,1'), + dict(name='project-test1', result='SUCCESS', changes='1,1'), + dict(name='project-test2', result='SUCCESS', changes='1,1'), + dict(name='project-test3', result='SUCCESS', changes='1,1'), + dict(name='project-test4', result='SUCCESS', changes='1,1'), + dict(name='project-test5', result='SUCCESS', changes='1,1'), + dict(name='project-test6', result='FAILURE', changes='1,1'), + ], ordered=False) diff --git a/zuul/configloader.py b/zuul/configloader.py index f945a4f2a1..70775a2cce 100644 --- a/zuul/configloader.py +++ b/zuul/configloader.py @@ -926,6 +926,7 @@ class ProjectTemplateParser(object): pipeline_contents = { 'queue': str, 'debug': bool, + 'fail-fast': bool, 'jobs': job_list } @@ -955,6 +956,8 @@ class ProjectTemplateParser(object): project_template.pipelines[pipeline_name] = project_pipeline project_pipeline.queue_name = conf_pipeline.get('queue') project_pipeline.debug = conf_pipeline.get('debug') + project_pipeline.fail_fast = conf_pipeline.get( + 'fail-fast') self.parseJobList( conf_pipeline.get('jobs', []), source_context, start_mark, project_pipeline.job_list) @@ -1007,6 +1010,7 @@ class ProjectParser(object): pipeline_contents = { 'queue': str, 'debug': bool, + 'fail-fast': bool, 'jobs': job_list } diff --git a/zuul/executor/client.py b/zuul/executor/client.py index 33312931a7..5070f2f478 100644 --- a/zuul/executor/client.py +++ b/zuul/executor/client.py @@ -449,6 +449,13 @@ class ExecutorClient(object): # track of which results are non-final. if build.retry: result = None + + # If the build was canceled, we did actively cancel the job so + # don't overwrite the result and don't retry. + if build.canceled: + result = build.result + build.retry = False + self.sched.onBuildCompleted(build, result, result_data, warnings) # The test suite expects the build to be removed from the # internal dict after it's added to the report queue. diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py index c8eb33d965..018c5250f0 100644 --- a/zuul/manager/__init__.py +++ b/zuul/manager/__init__.py @@ -845,6 +845,13 @@ class PipelineManager(object): if build: build_set.removeBuild(build) + def _cancelRunningBuilds(self, build_set): + item = build_set.item + for job in item.getJobs(): + build = build_set.getBuild(job.name) + if not build or not build.result: + self.sched.cancelJob(build_set, job, final=True) + def onBuildCompleted(self, build): item = build.build_set.item @@ -863,6 +870,15 @@ class PipelineManager(object): self._resetDependentBuilds(build.build_set, build) self._resumeBuilds(build.build_set) + + if (item.project_pipeline_config.fail_fast and + build.failed and build.job.voting): + # If fail-fast is set and the build is not successful + # cancel all remaining jobs. + self.log.debug("Build %s failed and fail-fast enabled, canceling " + "running builds", build) + self._cancelRunningBuilds(build.build_set) + return True def onFilesChangesCompleted(self, event): diff --git a/zuul/model.py b/zuul/model.py index c600afe523..c236720a2f 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -1818,6 +1818,12 @@ class Build(object): return ('' % (self.uuid, self.job.name, self.job.voting, self.worker)) + @property + def failed(self): + if self.result and self.result not in ['SUCCESS', 'SKIPPED']: + return True + return False + @property def pipeline(self): return self.build_set.item.pipeline @@ -2466,7 +2472,7 @@ class QueueItem(object): build = build_set.getBuild(job.name) if build and (build.result == 'SUCCESS' or build.paused): successful_job_names.add(job.name) - elif build and build.result in ('SKIPPED', 'FAILURE'): + elif build and build.result in ('SKIPPED', 'FAILURE', 'CANCELED'): pass else: nodeset = build_set.getJobNodeSet(job.name) @@ -3125,6 +3131,7 @@ class ProjectPipelineConfig(ConfigObject): self.queue_name = None self.debug = False self.debug_messages = [] + self.fail_fast = None self.variables = {} def addDebug(self, msg): @@ -3137,6 +3144,8 @@ class ProjectPipelineConfig(ConfigObject): self.queue_name = other.queue_name if other.debug: self.debug = other.debug + if self.fail_fast is None: + self.fail_fast = other.fail_fast self.job_list.inheritFrom(other.job_list) def updateVariables(self, other): diff --git a/zuul/scheduler.py b/zuul/scheduler.py index b2b83a2c37..fd12a049bf 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -37,6 +37,7 @@ from zuul.lib.config import get_default from zuul.lib.gear_utils import getGearmanFunctions from zuul.lib.statsd import get_statsd import zuul.lib.queue +from zuul.model import Build COMMANDS = ['full-reconfigure', 'stop'] @@ -1417,7 +1418,7 @@ class Scheduler(threading.Thread): other_change.refresh_deps = True change.refresh_deps = True - def cancelJob(self, buildset, job, build=None): + def cancelJob(self, buildset, job, build=None, final=False): item = buildset.item job_name = job.name try: @@ -1457,6 +1458,13 @@ class Scheduler(threading.Thread): nodeset = buildset.getJobNodeSet(job_name) if nodeset: self.nodepool.returnNodeSet(nodeset) + + if final: + # If final is set make sure that the job is not resurrected + # later by re-requesting nodes. + fakebuild = Build(job, None) + fakebuild.result = 'CANCELED' + buildset.addBuild(fakebuild) finally: # Release the semaphore in any case tenant = buildset.item.pipeline.tenant