Correctly handle skipped jobs on retries
There is a scenario where skipping child jobs combined with retries can lead to hanging buildsets. The only way to recover them is a dequeue and re-enqueue wither by close/reopen the change or using the cli for non-change based pipelines. 1. cache pauses 2. skip-upload skips all child jobs 3. test does a retry which resets upload which must get skipped again during the reset process because of skip-upload skipping it. cache (pauses) -+ | | +--> test (retries) -----------+ | +--> upload (skipped) | +--> prepare-upload (skipped) -+ | skip-upload ----+ Currently the job upload in this scenario gets reset to none and indefinitely waits for being scheduled. This can be fixed by calling setResult on all finished jobs similar to what is done during a reenqueue. This will reset the build result to SKIPPED again if skipping is still the right choice. Change-Id: Ia51e46f71267535fef792d875b271271305fdad2
This commit is contained in:
parent
4418547143
commit
96f42571ea
|
@ -3616,6 +3616,10 @@ class ZuulTestCase(BaseTestCase):
|
|||
git_url_with_auth = False
|
||||
log_console_port = 19885
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.fake_gerrit = None
|
||||
|
||||
def _startMerger(self):
|
||||
self.merge_server = zuul.merger.server.MergeServer(self.config,
|
||||
self.connections)
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
- hosts: localhost
|
||||
tasks:
|
||||
- zuul_return:
|
||||
data:
|
||||
zuul:
|
||||
child_jobs: []
|
|
@ -0,0 +1,31 @@
|
|||
- job:
|
||||
name: skip-upload
|
||||
run: playbooks/skip-upload.yaml
|
||||
|
||||
- job:
|
||||
name: cache
|
||||
parent: just-pause
|
||||
|
||||
- job:
|
||||
name: prepare-upload
|
||||
parent: test
|
||||
|
||||
- job:
|
||||
name: upload
|
||||
parent: test
|
||||
|
||||
- project:
|
||||
check:
|
||||
jobs:
|
||||
- cache
|
||||
- skip-upload
|
||||
- test:
|
||||
dependencies:
|
||||
- cache
|
||||
- prepare-upload:
|
||||
dependencies:
|
||||
- skip-upload
|
||||
- upload:
|
||||
dependencies:
|
||||
- prepare-upload
|
||||
- test
|
|
@ -8,3 +8,4 @@
|
|||
- org/project2
|
||||
- org/project3
|
||||
- org/project4
|
||||
- org/project5
|
||||
|
|
|
@ -5662,6 +5662,61 @@ class TestJobPause(AnsibleZuulTestCase):
|
|||
|
||||
self.assertIn('test : SKIPPED', A.messages[0])
|
||||
|
||||
def test_job_pause_skipped_child_retry(self):
|
||||
"""
|
||||
Tests that a paused job is resumed with skipped jobs and retries.
|
||||
|
||||
Tests that this situation won't lead to stuck buildsets.
|
||||
1. cache pauses
|
||||
2. skip-upload skips upload
|
||||
3. test does a retry which resets upload which must get skipped
|
||||
again during the reset process because of pre-test skipping it.
|
||||
|
||||
cache (pauses) -+
|
||||
|
|
||||
|
|
||||
+--> test (retries) -----------+
|
||||
|
|
||||
+--> upload (skipped)
|
||||
|
|
||||
+--> prepare-upload (skipped) -+
|
||||
|
|
||||
skip-upload ----+
|
||||
"""
|
||||
self.executor_server.hold_jobs_in_build = True
|
||||
|
||||
A = self.fake_gerrit.addFakeChange('org/project5', 'master', 'A')
|
||||
|
||||
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.executor_server.release('cache')
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.executor_server.release('skip-upload')
|
||||
self.waitUntilSettled()
|
||||
|
||||
# Stop the job worker of test to simulate an executor restart
|
||||
job_test = self.builds[1]
|
||||
for job_worker in self.executor_server.job_workers.values():
|
||||
if job_worker.job.unique == job_test.unique:
|
||||
job_worker.stop()
|
||||
|
||||
self.executor_server.hold_jobs_in_build = False
|
||||
self.executor_server.release()
|
||||
self.waitUntilSettled()
|
||||
|
||||
# All builds must be finished by now
|
||||
self.assertEqual(len(self.builds), 0, 'All builds must be finished')
|
||||
|
||||
# upload must not be run as this should have been skipped
|
||||
self.assertHistory([
|
||||
dict(name='skip-upload', result='SUCCESS', changes='1,1'),
|
||||
dict(name='test', result='ABORTED', changes='1,1'),
|
||||
dict(name='test', result='SUCCESS', changes='1,1'),
|
||||
dict(name='cache', result='SUCCESS', changes='1,1'),
|
||||
])
|
||||
|
||||
|
||||
class TestJobPausePostFail(AnsibleZuulTestCase):
|
||||
tenant_config_file = 'config/job-pause2/main.yaml'
|
||||
|
|
|
@ -1003,6 +1003,12 @@ class PipelineManager(metaclass=ABCMeta):
|
|||
if build:
|
||||
build_set.removeBuild(build)
|
||||
|
||||
# Re-set build results in case we resetted builds that were skipped
|
||||
# not by this build/
|
||||
for build in build_set.getBuilds():
|
||||
if build.result:
|
||||
build_set.item.setResult(build)
|
||||
|
||||
def _cancelRunningBuilds(self, build_set):
|
||||
item = build_set.item
|
||||
for job in item.getJobs():
|
||||
|
|
Loading…
Reference in New Issue