Remove nodesets from builds canceled during reconfiguration

We observed errant behavior in production, in the configuration covered
by test_reconfigure_window_shrink, when a reconfiguration shrank the
active window to less than its current value after jobs had already
completed.

Correct the underlying issue by removing the nodeset associated with
a build from the buildset when the reconfiguration routine cancels it.
Then, if we later launch the same job for some reason, we will obtain
a new nodeset.
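
For illustration, the buildset bookkeeping this relies on looks roughly
like the following (a minimal sketch, not the real class; only the
method names match the BuildSet API this change exercises):

    # Minimal sketch: a buildset maps job names to the nodesets
    # allocated for them.
    class BuildSet(object):
        def __init__(self):
            self.nodesets = {}  # job name -> NodeSet

        def getJobNodeSet(self, job_name):
            return self.nodesets.get(job_name)

        def removeJobNodeSet(self, job_name):
            # Once the entry is gone, relaunching the job requests a
            # fresh nodeset from nodepool rather than reusing the one
            # from the canceled build.
            if job_name not in self.nodesets:
                raise Exception("No job nodeset for %s" % (job_name,))
            del self.nodesets[job_name]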

If the build is running at the time it's canceled, we will still need
the scheduler to return the nodeset to nodepool.  Since it currently
relies on the value in the buildset to find the nodeset, attach the
nodeset to the build directly, so that even if we have removed the
nodeset from the buildset, the scheduler will still have a pointer
to the nodeset when the build completes.
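
Roughly, the resulting flow is sketched below (an assumed
simplification, not the exact scheduler code; the authoritative hunks
are in zuul/executor/client.py and zuul/scheduler.py further down):

    # Sketch: the build carries its own nodeset pointer from launch
    # time onward.
    class Build(object):
        def __init__(self, job, uuid):
            self.job = job
            self.uuid = uuid
            self.nodeset = None  # attached when the job is executed

    def execute(build, item, job):
        # Look the nodeset up once and attach it to the build.
        build.nodeset = item.current_build_set.getJobNodeSet(job.name)

    def on_build_completed(sched, build):
        # Previously the scheduler looked the nodeset up via
        # build.build_set.getJobNodeSet(build.job.name), which fails
        # once the buildset entry has been removed.
        sched.nodepool.returnNodeSet(build.nodeset)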

Having said all that, we really don't want to waste resources by
shrinking the window on reconfiguration.  A future change is likely
to correct that and very likely invalidate the test just added.  The
only other time a build is likely to be canceled during reconfiguration
yet used again later is if a job is removed, then added back to a
project while changes are in the queue.  So that we continue to have
a test which covers this case, add a second test based on that scenario.

Both of these tests fail without the included fix.

Change-Id: If61b34e0f1464cb69d9d0b9053e05f1af996a67b
commit 8fd207bdf2 (parent 0015fda37b)
Author: James E. Blair
Date: 2017-11-30 13:44:38 -08:00
8 changed files with 268 additions and 6 deletions

tests/fixtures/layouts/reconfigure-remove-add.yaml (new file)

@@ -0,0 +1,41 @@
- pipeline:
    name: gate
    manager: dependent
    trigger:
      gerrit:
        - event: comment-added
          approval:
            - Approved: 1
    start:
      gerrit:
        Verified: 0
    success:
      gerrit:
        Verified: 2
        submit: true
    failure:
      gerrit:
        Verified: -2

- job:
    name: base
    parent: null
    nodeset:
      nodes:
        - label: ubuntu-xenial
          name: controller

- job:
    name: job1
    run: playbooks/job1.yaml

- job:
    name: job2
    run: playbooks/job2.yaml

- project:
    name: org/project
    gate:
      jobs:
        - job1
        - job2

tests/fixtures/layouts/reconfigure-remove-add2.yaml (new file)

@@ -0,0 +1,40 @@
- pipeline:
    name: gate
    manager: dependent
    trigger:
      gerrit:
        - event: comment-added
          approval:
            - Approved: 1
    start:
      gerrit:
        Verified: 0
    success:
      gerrit:
        Verified: 2
        submit: true
    failure:
      gerrit:
        Verified: -2

- job:
    name: base
    parent: null
    nodeset:
      nodes:
        - label: ubuntu-xenial
          name: controller

- job:
    name: job1
    run: playbooks/job1.yaml

- job:
    name: job2
    run: playbooks/job2.yaml

- project:
    name: org/project
    gate:
      jobs:
        - job1

tests/fixtures/layouts/reconfigure-window.yaml (new file)

@@ -0,0 +1,41 @@
- pipeline:
    name: gate
    manager: dependent
    trigger:
      gerrit:
        - event: comment-added
          approval:
            - Approved: 1
    start:
      gerrit:
        Verified: 0
    success:
      gerrit:
        Verified: 2
        submit: true
    failure:
      gerrit:
        Verified: -2

- job:
    name: base
    parent: null
    nodeset:
      nodes:
        - label: ubuntu-xenial
          name: controller

- job:
    name: job1
    run: playbooks/job1.yaml

- job:
    name: job2
    run: playbooks/job2.yaml

- project:
    name: org/project
    gate:
      jobs:
        - job1
        - job2

tests/fixtures/layouts/reconfigure-window2.yaml (new file)

@@ -0,0 +1,47 @@
- pipeline:
    name: gate
    manager: dependent
    trigger:
      gerrit:
        - event: comment-added
          approval:
            - Approved: 1
    start:
      gerrit:
        Verified: 0
    success:
      gerrit:
        Verified: 2
        submit: true
    failure:
      gerrit:
        Verified: -2
    window: 1
    window-floor: 1
    window-increase-type: linear
    window-increase-factor: 1
    window-decrease-type: linear
    window-decrease-factor: 1

- job:
    name: base
    parent: null
    nodeset:
      nodes:
        - label: ubuntu-xenial
          name: controller

- job:
    name: job1
    run: playbooks/job1.yaml

- job:
    name: job2
    run: playbooks/job2.yaml

- project:
    name: org/project
    gate:
      jobs:
        - job1
        - job2
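
For reference, the linear window options above behave roughly as
follows (a sketch of the assumed semantics, not Zuul's actual
implementation): each merged change grows the window by the increase
factor, and each failed change shrinks it by the decrease factor,
never below the floor.

    # Assumed linear window arithmetic for the options above.
    def increase_window(window, factor=1):
        return window + factor

    def decrease_window(window, floor=1, factor=1):
        return max(floor, window - factor)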

tests/unit/test_scheduler.py

@@ -3853,6 +3853,91 @@ class TestScheduler(ZuulTestCase):
        self.assertEqual(queue.window_floor, 1)
        self.assertEqual(C.data['status'], 'MERGED')

    @simple_layout('layouts/reconfigure-window.yaml')
    def test_reconfigure_window_shrink(self):
        # Test the active window shrinking during reconfiguration
        self.executor_server.hold_jobs_in_build = True

        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        A.addApproval('Code-Review', 2)
        self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
        self.waitUntilSettled()
        B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
        B.addApproval('Code-Review', 2)
        self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
        self.waitUntilSettled()

        tenant = self.sched.abide.tenants.get('tenant-one')
        queue = tenant.layout.pipelines['gate'].queues[0]
        self.assertEqual(queue.window, 20)
        self.assertTrue(len(self.builds), 4)
        self.executor_server.release('job1')
        self.waitUntilSettled()

        self.commitConfigUpdate('org/common-config',
                                'layouts/reconfigure-window2.yaml')
        self.sched.reconfigure(self.config)
        tenant = self.sched.abide.tenants.get('tenant-one')
        queue = tenant.layout.pipelines['gate'].queues[0]
        self.assertEqual(queue.window, 1)
        # B is now outside the window, but builds haven't been canceled
        self.assertTrue(len(self.builds), 4)

        self.sched.reconfigure(self.config)
        tenant = self.sched.abide.tenants.get('tenant-one')
        queue = tenant.layout.pipelines['gate'].queues[0]
        self.assertEqual(queue.window, 1)
        # B's builds have been canceled now
        self.assertTrue(len(self.builds), 2)

        self.executor_server.hold_jobs_in_build = False
        self.executor_server.release()

        # This will run new builds for B
        self.waitUntilSettled()
        self.assertHistory([
            dict(name='job1', result='SUCCESS', changes='1,1'),
            dict(name='job1', result='SUCCESS', changes='1,1 2,1'),
            dict(name='job2', result='SUCCESS', changes='1,1'),
            dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
            dict(name='job1', result='SUCCESS', changes='1,1 2,1'),
            dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
        ], ordered=False)

    @simple_layout('layouts/reconfigure-remove-add.yaml')
    def test_reconfigure_remove_add(self):
        # Test removing, then adding a job while in queue
        self.executor_server.hold_jobs_in_build = True

        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        A.addApproval('Code-Review', 2)
        self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
        self.waitUntilSettled()
        self.assertTrue(len(self.builds), 2)
        self.executor_server.release('job2')
        self.assertTrue(len(self.builds), 1)

        # Remove job2
        self.commitConfigUpdate('org/common-config',
                                'layouts/reconfigure-remove-add2.yaml')
        self.sched.reconfigure(self.config)
        self.assertTrue(len(self.builds), 1)

        # Add job2 back
        self.commitConfigUpdate('org/common-config',
                                'layouts/reconfigure-remove-add.yaml')
        self.sched.reconfigure(self.config)
        self.assertTrue(len(self.builds), 2)

        self.executor_server.hold_jobs_in_build = False
        self.executor_server.release()
        # This will run a new build of job2 for A
        self.waitUntilSettled()
        self.assertHistory([
            dict(name='job2', result='SUCCESS', changes='1,1'),
            dict(name='job1', result='SUCCESS', changes='1,1'),
            dict(name='job2', result='SUCCESS', changes='1,1'),
        ], ordered=False)

    def test_worker_update_metadata(self):
        "Test if a worker can send back metadata about itself"
        self.executor_server.hold_jobs_in_build = True

zuul/executor/client.py

@@ -137,13 +137,11 @@ class ExecutorClient(object):
                 merger_items=[]):
         tenant = pipeline.layout.tenant
         uuid = str(uuid4().hex)
+        nodeset = item.current_build_set.getJobNodeSet(job.name)
         self.log.info(
             "Execute job %s (uuid: %s) on nodes %s for change %s "
             "with dependent changes %s" % (
-                job, uuid,
-                item.current_build_set.getJobNodeSet(job.name),
-                item.change,
-                dependent_changes))
+                job, uuid, nodeset, item.change, dependent_changes))

         # TODOv3(jeblair): This ansible vars data structure will
         # replace the environment variables below.
@@ -205,7 +203,6 @@ class ExecutorClient(object):
         params['pre_playbooks'] = [x.toDict() for x in job.pre_run]
         params['post_playbooks'] = [x.toDict() for x in job.post_run]
-        nodeset = item.current_build_set.getJobNodeSet(job.name)
         nodes = []
         for node in nodeset.getNodes():
             n = node.toDict()
@@ -278,6 +275,7 @@ class ExecutorClient(object):
         build = Build(job, uuid)
         build.parameters = params
+        build.nodeset = nodeset

         if job.name == 'noop':
             self.sched.onBuildStarted(build)

zuul/model.py

@@ -1246,6 +1246,7 @@ class Build(object):
         self.worker = Worker()
         self.node_labels = []
         self.node_name = None
+        self.nodeset = None

     def __repr__(self):
         return ('<Build %s of %s on %s>' %
zuul/scheduler.py

@@ -650,6 +650,15 @@ class Scheduler(threading.Thread):
                 self.log.exception(
                     "Exception while canceling build %s "
                     "for change %s" % (build, build.build_set.item.change))
+            # In the unlikely case that a build is removed and
+            # later added back, make sure we clear out the nodeset
+            # so it gets requested again.
+            try:
+                build.build_set.removeJobNodeSet(build.job.name)
+            except Exception:
+                self.log.exception(
+                    "Exception while removing nodeset from build %s "
+                    "for change %s" % (build, build.build_set.item.change))
             finally:
                 tenant.semaphore_handler.release(
                     build.build_set.item, build.job)
@@ -920,7 +929,7 @@ class Scheduler(threading.Thread):
         # to pass this on to the pipeline manager, make sure we return
         # the nodes to nodepool.
         try:
-            nodeset = build.build_set.getJobNodeSet(build.job.name)
+            nodeset = build.nodeset
             autohold_key = (build.pipeline.layout.tenant.name,
                             build.build_set.item.change.project.canonical_name,
                             build.job.name)