Correctly handle skipped jobs on retries

There is a scenario where skipping child jobs combined with retries
can lead to hanging buildsets. The only way to recover them is a
dequeue and re-enqueue wither by close/reopen the change or using the
cli for non-change based pipelines.

1. cache pauses
2. skip-upload skips all child jobs
3. test does a retry which resets upload which must get skipped
   again during the reset process because of skip-upload skipping it.

cache (pauses) -+
                |
                |
                +--> test (retries) -----------+
                                               |
                                               +--> upload (skipped)
                                               |
                +--> prepare-upload (skipped) -+
                |
skip-upload ----+

Currently the job upload in this scenario gets reset to none and
indefinitely waits for being scheduled.

This can be fixed by calling setResult on all finished jobs similar to
what is done during a reenqueue. This will reset the build result to
SKIPPED again if skipping is still the right choice.

Change-Id: Ia51e46f71267535fef792d875b271271305fdad2
This commit is contained in:
Tobias Henkel 2020-07-02 12:54:48 +02:00
parent 4418547143
commit 96f42571ea
No known key found for this signature in database
GPG Key ID: 03750DEC158E5FA2
6 changed files with 103 additions and 0 deletions

View File

@ -3616,6 +3616,10 @@ class ZuulTestCase(BaseTestCase):
git_url_with_auth = False
log_console_port = 19885
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.fake_gerrit = None
def _startMerger(self):
self.merge_server = zuul.merger.server.MergeServer(self.config,
self.connections)

View File

@ -0,0 +1,6 @@
- hosts: localhost
tasks:
- zuul_return:
data:
zuul:
child_jobs: []

View File

@ -0,0 +1,31 @@
- job:
name: skip-upload
run: playbooks/skip-upload.yaml
- job:
name: cache
parent: just-pause
- job:
name: prepare-upload
parent: test
- job:
name: upload
parent: test
- project:
check:
jobs:
- cache
- skip-upload
- test:
dependencies:
- cache
- prepare-upload:
dependencies:
- skip-upload
- upload:
dependencies:
- prepare-upload
- test

View File

@ -8,3 +8,4 @@
- org/project2
- org/project3
- org/project4
- org/project5

View File

@ -5662,6 +5662,61 @@ class TestJobPause(AnsibleZuulTestCase):
self.assertIn('test : SKIPPED', A.messages[0])
def test_job_pause_skipped_child_retry(self):
"""
Tests that a paused job is resumed with skipped jobs and retries.
Tests that this situation won't lead to stuck buildsets.
1. cache pauses
2. skip-upload skips upload
3. test does a retry which resets upload which must get skipped
again during the reset process because of pre-test skipping it.
cache (pauses) -+
|
|
+--> test (retries) -----------+
|
+--> upload (skipped)
|
+--> prepare-upload (skipped) -+
|
skip-upload ----+
"""
self.executor_server.hold_jobs_in_build = True
A = self.fake_gerrit.addFakeChange('org/project5', 'master', 'A')
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
self.executor_server.release('cache')
self.waitUntilSettled()
self.executor_server.release('skip-upload')
self.waitUntilSettled()
# Stop the job worker of test to simulate an executor restart
job_test = self.builds[1]
for job_worker in self.executor_server.job_workers.values():
if job_worker.job.unique == job_test.unique:
job_worker.stop()
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
self.waitUntilSettled()
# All builds must be finished by now
self.assertEqual(len(self.builds), 0, 'All builds must be finished')
# upload must not be run as this should have been skipped
self.assertHistory([
dict(name='skip-upload', result='SUCCESS', changes='1,1'),
dict(name='test', result='ABORTED', changes='1,1'),
dict(name='test', result='SUCCESS', changes='1,1'),
dict(name='cache', result='SUCCESS', changes='1,1'),
])
class TestJobPausePostFail(AnsibleZuulTestCase):
tenant_config_file = 'config/job-pause2/main.yaml'

View File

@ -1003,6 +1003,12 @@ class PipelineManager(metaclass=ABCMeta):
if build:
build_set.removeBuild(build)
# Re-set build results in case we resetted builds that were skipped
# not by this build/
for build in build_set.getBuilds():
if build.result:
build_set.item.setResult(build)
def _cancelRunningBuilds(self, build_set):
item = build_set.item
for job in item.getJobs():