Respect fail-fast setting for node failures

Since an item will be considered failing in case of node failures, we
should also respect the fail-fast flag when a node request fails.

Change-Id: I1a6f247ccd86ef3aa7565d60c7d0c65c547edcaa
This commit is contained in:
Simon Westphahl
2024-09-19 11:45:35 +02:00
parent cddc0e096e
commit 40592bab4e
5 changed files with 92 additions and 5 deletions

View File

@@ -263,7 +263,7 @@ pipeline.
.. attr:: fail-fast
:default: false
If this is set to `true`, Zuul will report a build failure
If this is set to `true`, Zuul will report a build or node failure
immediately and abort all still running builds. This can be used
to save resources in resource constrained environments at the cost
of potentially requiring multiple attempts if more than one problem

View File

@@ -0,0 +1,5 @@
---
features:
- |
The :attr:`project.<pipeline>.fail-fast` attribute now also applies to node failures
of voting jobs.

View File

@@ -65,6 +65,10 @@
- job:
name: project-test6
nodeset:
nodes:
- name: controller
label: label1
- project:
name: org/project

View File

@@ -8970,9 +8970,8 @@ class TestSchedulerFailFast(ZuulTestCase):
self.executor_server.release('project-merge')
self.waitUntilSettled()
# Now project-test1, project-test2 and project-test6
# should be running
self.assertEqual(len(self.builds), 3)
# Now project-test1 and project-test2 should be running
self.assertEqual(len(self.builds), 2)
# Release project-test1 which will fail
self.executor_server.release('project-test1')
@@ -8988,7 +8987,6 @@ class TestSchedulerFailFast(ZuulTestCase):
dict(name='project-merge', result='SUCCESS', changes='1,1'),
dict(name='project-test1', result='FAILURE', changes='1,1'),
dict(name='project-test2', result='ABORTED', changes='1,1'),
dict(name='project-test6', result='ABORTED', changes='1,1'),
], ordered=False)
def test_fail_fast_gate(self):
@@ -9117,6 +9115,79 @@ class TestSchedulerFailFast(ZuulTestCase):
dict(name='project-test6', result='SUCCESS', changes='1,1'),
], ordered=False)
def test_fail_fast_node_failure(self):
    """
    Check that a pipeline flagged with fail-fast aborts the
    remaining builds early once a node request has failed.
    """
    self.executor_server.hold_jobs_in_build = True
    # The fake nodepool is paused before the change is enqueued and
    # only unpaused after a request has been marked to fail.
    self.fake_nodepool.pause()

    change = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
    self.fake_gerrit.addEvent(change.getPatchsetCreatedEvent(1))
    self.waitUntilSettled()
    self.waitUntilSettled()

    # Only project-merge is expected to be running at this point.
    self.assertEqual(len(self.builds), 1)
    self.assertEqual(self.builds[0].name, 'project-merge')

    self.executor_server.release('project-merge')
    self.waitUntilSettled()

    # Mark the first outstanding node request as failed.
    # NOTE(review): the previous comment attributed this request to
    # project-test5; confirm against the test layout which job owns it.
    failing_request = self.fake_nodepool.getNodeRequests()[0]
    self.fake_nodepool.addFailRequest(failing_request)
    self.fake_nodepool.unpause()
    self.waitUntilSettled()

    self.assertEqual(change.reported, 1)

    # (job name, expected result) pairs; every build is for change 1,1.
    expected_results = (
        ('project-merge', 'SUCCESS'),
        ('project-test1', 'ABORTED'),
        ('project-test2', 'ABORTED'),
    )
    expected_history = [
        dict(name=job_name, result=job_result, changes='1,1')
        for job_name, job_result in expected_results
    ]
    self.assertHistory(expected_history, ordered=False)
def test_fail_fast_node_failure_nonvoting(self):
    """
    Check that a pipeline flagged with fail-fast does not abort
    builds when the node failure belongs to a non-voting job.
    """
    self.executor_server.hold_jobs_in_build = True

    change = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
    self.fake_gerrit.addEvent(change.getPatchsetCreatedEvent(1))
    self.waitUntilSettled()

    # Pause the fake nodepool only after the first builds have started;
    # it is unpaused again once a request has been marked to fail.
    self.fake_nodepool.pause()

    self.assertEqual(len(self.builds), 2)
    self.assertEqual(self.builds[0].name, 'project-merge')

    for job_name in ('project-merge', 'project-test5'):
        self.executor_server.release(job_name)
    self.waitUntilSettled()

    # project-test1 and project-test2 should be the running builds now.
    self.assertEqual(len(self.builds), 2)

    # Fail the node request for project-test6.
    failing_request = self.fake_nodepool.getNodeRequests()[0]
    self.fake_nodepool.addFailRequest(failing_request)
    self.fake_nodepool.unpause()
    self.waitUntilSettled()

    # Stop holding builds and release everything that is still held.
    self.executor_server.hold_jobs_in_build = False
    self.executor_server.release()
    self.waitUntilSettled()

    self.assertEqual(len(self.builds), 0)
    self.assertEqual(change.reported, 1)

    # All listed jobs are expected to succeed for change 1,1.
    successful_jobs = (
        'project-merge',
        'project-test1',
        'project-test2',
        'project-test3',
        'project-test4',
        'project-test5',
    )
    expected_history = [
        dict(name=job_name, result='SUCCESS', changes='1,1')
        for job_name in successful_jobs
    ]
    self.assertHistory(expected_history, ordered=False)
class TestPipelineSupersedes(ZuulTestCase):

View File

@@ -2270,6 +2270,13 @@ class PipelineManager(metaclass=ABCMeta):
tenant.semaphore_handler.release(
event_queue, build_set.item, job)
if build_set.fail_fast and job.voting:
# If fail-fast is set and the node(set) request for a voting
# job is not successful, cancel all remaining jobs.
log.debug("Node(set) request %s failed and fail-fast enabled, "
"canceling running builds", request.id)
self._cancelRunningBuilds(build_set)
log.info("Completed node(set) request %s for job %s of item %s "
"with nodes %s",
request, job.name, build_set.item, request.nodes)