Remove nodesets from builds canceled during reconfiguration
We observed errant behavior in the configuration covered by test_reconfigure_window_shrink in production when a reconfiguration shrunk the active window to less than the current value when jobs had already completed. Correct the underlying issue by removing the nodeset associated with a build from the buildset when the reconfiguration routine cancels it. Then, if we later launch the same job for some reason, we will obtain a new nodeset. If the build is running at the time it's canceled, we will still need the scheduler to return the nodeset to nodepool. Since it currently relies on the value in the buildset to find the nodeset, attach the nodeset to the build directly, so that even if we have removed the nodeset from the buildset, the scheduler will still have a pointer to the nodeset when the build completes. Having said all that, we really don't want to waste resources by shrinking the window on reconfiguration. A future change is likely to correct that and very likely invalidate the test just added. The only other time a build is likely to be canceled during reconfiguration yet used again later is if a job is removed, then added back to a project while changes are in the queue. So that we continue to have a test which covers this case, add a second test based on that scenario. Both of these tests fail without the included fix. Change-Id: If61b34e0f1464cb69d9d0b9053e05f1af996a67b
This commit is contained in:
parent
0015fda37b
commit
8fd207bdf2
|
@ -0,0 +1,41 @@
|
|||
- pipeline:
|
||||
name: gate
|
||||
manager: dependent
|
||||
trigger:
|
||||
gerrit:
|
||||
- event: comment-added
|
||||
approval:
|
||||
- Approved: 1
|
||||
start:
|
||||
gerrit:
|
||||
Verified: 0
|
||||
success:
|
||||
gerrit:
|
||||
Verified: 2
|
||||
submit: true
|
||||
failure:
|
||||
gerrit:
|
||||
Verified: -2
|
||||
|
||||
- job:
|
||||
name: base
|
||||
parent: null
|
||||
nodeset:
|
||||
nodes:
|
||||
- label: ubuntu-xenial
|
||||
name: controller
|
||||
|
||||
- job:
|
||||
name: job1
|
||||
run: playbooks/job1.yaml
|
||||
|
||||
- job:
|
||||
name: job2
|
||||
run: playbooks/job2.yaml
|
||||
|
||||
- project:
|
||||
name: org/project
|
||||
gate:
|
||||
jobs:
|
||||
- job1
|
||||
- job2
|
|
@ -0,0 +1,40 @@
|
|||
- pipeline:
|
||||
name: gate
|
||||
manager: dependent
|
||||
trigger:
|
||||
gerrit:
|
||||
- event: comment-added
|
||||
approval:
|
||||
- Approved: 1
|
||||
start:
|
||||
gerrit:
|
||||
Verified: 0
|
||||
success:
|
||||
gerrit:
|
||||
Verified: 2
|
||||
submit: true
|
||||
failure:
|
||||
gerrit:
|
||||
Verified: -2
|
||||
|
||||
- job:
|
||||
name: base
|
||||
parent: null
|
||||
nodeset:
|
||||
nodes:
|
||||
- label: ubuntu-xenial
|
||||
name: controller
|
||||
|
||||
- job:
|
||||
name: job1
|
||||
run: playbooks/job1.yaml
|
||||
|
||||
- job:
|
||||
name: job2
|
||||
run: playbooks/job2.yaml
|
||||
|
||||
- project:
|
||||
name: org/project
|
||||
gate:
|
||||
jobs:
|
||||
- job1
|
|
@ -0,0 +1,41 @@
|
|||
- pipeline:
|
||||
name: gate
|
||||
manager: dependent
|
||||
trigger:
|
||||
gerrit:
|
||||
- event: comment-added
|
||||
approval:
|
||||
- Approved: 1
|
||||
start:
|
||||
gerrit:
|
||||
Verified: 0
|
||||
success:
|
||||
gerrit:
|
||||
Verified: 2
|
||||
submit: true
|
||||
failure:
|
||||
gerrit:
|
||||
Verified: -2
|
||||
|
||||
- job:
|
||||
name: base
|
||||
parent: null
|
||||
nodeset:
|
||||
nodes:
|
||||
- label: ubuntu-xenial
|
||||
name: controller
|
||||
|
||||
- job:
|
||||
name: job1
|
||||
run: playbooks/job1.yaml
|
||||
|
||||
- job:
|
||||
name: job2
|
||||
run: playbooks/job2.yaml
|
||||
|
||||
- project:
|
||||
name: org/project
|
||||
gate:
|
||||
jobs:
|
||||
- job1
|
||||
- job2
|
|
@ -0,0 +1,47 @@
|
|||
- pipeline:
|
||||
name: gate
|
||||
manager: dependent
|
||||
trigger:
|
||||
gerrit:
|
||||
- event: comment-added
|
||||
approval:
|
||||
- Approved: 1
|
||||
start:
|
||||
gerrit:
|
||||
Verified: 0
|
||||
success:
|
||||
gerrit:
|
||||
Verified: 2
|
||||
submit: true
|
||||
failure:
|
||||
gerrit:
|
||||
Verified: -2
|
||||
window: 1
|
||||
window-floor: 1
|
||||
window-increase-type: linear
|
||||
window-increase-factor: 1
|
||||
window-decrease-type: linear
|
||||
window-decrease-factor: 1
|
||||
|
||||
- job:
|
||||
name: base
|
||||
parent: null
|
||||
nodeset:
|
||||
nodes:
|
||||
- label: ubuntu-xenial
|
||||
name: controller
|
||||
|
||||
- job:
|
||||
name: job1
|
||||
run: playbooks/job1.yaml
|
||||
|
||||
- job:
|
||||
name: job2
|
||||
run: playbooks/job2.yaml
|
||||
|
||||
- project:
|
||||
name: org/project
|
||||
gate:
|
||||
jobs:
|
||||
- job1
|
||||
- job2
|
|
@ -3853,6 +3853,91 @@ class TestScheduler(ZuulTestCase):
|
|||
self.assertEqual(queue.window_floor, 1)
|
||||
self.assertEqual(C.data['status'], 'MERGED')
|
||||
|
||||
@simple_layout('layouts/reconfigure-window.yaml')
|
||||
def test_reconfigure_window_shrink(self):
|
||||
# Test the active window shrinking during reconfiguration
|
||||
self.executor_server.hold_jobs_in_build = True
|
||||
|
||||
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
|
||||
A.addApproval('Code-Review', 2)
|
||||
self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
|
||||
self.waitUntilSettled()
|
||||
B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
|
||||
B.addApproval('Code-Review', 2)
|
||||
self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
|
||||
self.waitUntilSettled()
|
||||
|
||||
tenant = self.sched.abide.tenants.get('tenant-one')
|
||||
queue = tenant.layout.pipelines['gate'].queues[0]
|
||||
self.assertEqual(queue.window, 20)
|
||||
self.assertTrue(len(self.builds), 4)
|
||||
|
||||
self.executor_server.release('job1')
|
||||
self.waitUntilSettled()
|
||||
self.commitConfigUpdate('org/common-config',
|
||||
'layouts/reconfigure-window2.yaml')
|
||||
self.sched.reconfigure(self.config)
|
||||
tenant = self.sched.abide.tenants.get('tenant-one')
|
||||
queue = tenant.layout.pipelines['gate'].queues[0]
|
||||
self.assertEqual(queue.window, 1)
|
||||
# B is now outside the window, but builds haven't been canceled
|
||||
self.assertTrue(len(self.builds), 4)
|
||||
|
||||
self.sched.reconfigure(self.config)
|
||||
tenant = self.sched.abide.tenants.get('tenant-one')
|
||||
queue = tenant.layout.pipelines['gate'].queues[0]
|
||||
self.assertEqual(queue.window, 1)
|
||||
# B's builds have been canceled now
|
||||
self.assertTrue(len(self.builds), 2)
|
||||
|
||||
self.executor_server.hold_jobs_in_build = False
|
||||
self.executor_server.release()
|
||||
# This will run new builds for B
|
||||
self.waitUntilSettled()
|
||||
self.assertHistory([
|
||||
dict(name='job1', result='SUCCESS', changes='1,1'),
|
||||
dict(name='job1', result='SUCCESS', changes='1,1 2,1'),
|
||||
dict(name='job2', result='SUCCESS', changes='1,1'),
|
||||
dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
|
||||
dict(name='job1', result='SUCCESS', changes='1,1 2,1'),
|
||||
dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
|
||||
], ordered=False)
|
||||
|
||||
@simple_layout('layouts/reconfigure-remove-add.yaml')
|
||||
def test_reconfigure_remove_add(self):
|
||||
# Test removing, then adding a job while in queue
|
||||
self.executor_server.hold_jobs_in_build = True
|
||||
|
||||
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
|
||||
A.addApproval('Code-Review', 2)
|
||||
self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
|
||||
self.waitUntilSettled()
|
||||
self.assertTrue(len(self.builds), 2)
|
||||
self.executor_server.release('job2')
|
||||
self.assertTrue(len(self.builds), 1)
|
||||
|
||||
# Remove job2
|
||||
self.commitConfigUpdate('org/common-config',
|
||||
'layouts/reconfigure-remove-add2.yaml')
|
||||
self.sched.reconfigure(self.config)
|
||||
self.assertTrue(len(self.builds), 1)
|
||||
|
||||
# Add job2 back
|
||||
self.commitConfigUpdate('org/common-config',
|
||||
'layouts/reconfigure-remove-add.yaml')
|
||||
self.sched.reconfigure(self.config)
|
||||
self.assertTrue(len(self.builds), 2)
|
||||
|
||||
self.executor_server.hold_jobs_in_build = False
|
||||
self.executor_server.release()
|
||||
# This will run new builds for B
|
||||
self.waitUntilSettled()
|
||||
self.assertHistory([
|
||||
dict(name='job2', result='SUCCESS', changes='1,1'),
|
||||
dict(name='job1', result='SUCCESS', changes='1,1'),
|
||||
dict(name='job2', result='SUCCESS', changes='1,1'),
|
||||
], ordered=False)
|
||||
|
||||
def test_worker_update_metadata(self):
|
||||
"Test if a worker can send back metadata about itself"
|
||||
self.executor_server.hold_jobs_in_build = True
|
||||
|
|
|
@ -137,13 +137,11 @@ class ExecutorClient(object):
|
|||
merger_items=[]):
|
||||
tenant = pipeline.layout.tenant
|
||||
uuid = str(uuid4().hex)
|
||||
nodeset = item.current_build_set.getJobNodeSet(job.name)
|
||||
self.log.info(
|
||||
"Execute job %s (uuid: %s) on nodes %s for change %s "
|
||||
"with dependent changes %s" % (
|
||||
job, uuid,
|
||||
item.current_build_set.getJobNodeSet(job.name),
|
||||
item.change,
|
||||
dependent_changes))
|
||||
job, uuid, nodeset, item.change, dependent_changes))
|
||||
|
||||
# TODOv3(jeblair): This ansible vars data structure will
|
||||
# replace the environment variables below.
|
||||
|
@ -205,7 +203,6 @@ class ExecutorClient(object):
|
|||
params['pre_playbooks'] = [x.toDict() for x in job.pre_run]
|
||||
params['post_playbooks'] = [x.toDict() for x in job.post_run]
|
||||
|
||||
nodeset = item.current_build_set.getJobNodeSet(job.name)
|
||||
nodes = []
|
||||
for node in nodeset.getNodes():
|
||||
n = node.toDict()
|
||||
|
@ -278,6 +275,7 @@ class ExecutorClient(object):
|
|||
|
||||
build = Build(job, uuid)
|
||||
build.parameters = params
|
||||
build.nodeset = nodeset
|
||||
|
||||
if job.name == 'noop':
|
||||
self.sched.onBuildStarted(build)
|
||||
|
|
|
@ -1246,6 +1246,7 @@ class Build(object):
|
|||
self.worker = Worker()
|
||||
self.node_labels = []
|
||||
self.node_name = None
|
||||
self.nodeset = None
|
||||
|
||||
def __repr__(self):
|
||||
return ('<Build %s of %s on %s>' %
|
||||
|
|
|
@ -650,6 +650,15 @@ class Scheduler(threading.Thread):
|
|||
self.log.exception(
|
||||
"Exception while canceling build %s "
|
||||
"for change %s" % (build, build.build_set.item.change))
|
||||
# In the unlikely case that a build is removed and
|
||||
# later added back, make sure we clear out the nodeset
|
||||
# so it gets requested again.
|
||||
try:
|
||||
build.build_set.removeJobNodeSet(build.job.name)
|
||||
except Exception:
|
||||
self.log.exception(
|
||||
"Exception while removing nodeset from build %s "
|
||||
"for change %s" % (build, build.build_set.item.change))
|
||||
finally:
|
||||
tenant.semaphore_handler.release(
|
||||
build.build_set.item, build.job)
|
||||
|
@ -920,7 +929,7 @@ class Scheduler(threading.Thread):
|
|||
# to pass this on to the pipeline manager, make sure we return
|
||||
# the nodes to nodepool.
|
||||
try:
|
||||
nodeset = build.build_set.getJobNodeSet(build.job.name)
|
||||
nodeset = build.nodeset
|
||||
autohold_key = (build.pipeline.layout.tenant.name,
|
||||
build.build_set.item.change.project.canonical_name,
|
||||
build.job.name)
|
||||
|
|
Loading…
Reference in New Issue