Be more aggressive in canceling node requests

During a reconfiguration, we may cancel builds if they are no longer
necessary, however we do not do the same for node requests.  Currently
we let them run to completion and then return them unused.  It would
be more efficient (and behaviorally consistent) to cancel the request
as soon as we determine that it won't be used.

Also, change some warning log messages to info (as they don't indicate
a situation which may benefit from corrective action).

Change-Id: Ic3ef6b75437bf82bf9c8f426b23ea14d9aaa96b7
This commit is contained in:
James E. Blair 2019-01-03 13:06:39 -08:00
parent 31a7ddc791
commit bb9ad84cb6
2 changed files with 22 additions and 11 deletions

View File

@ -1861,9 +1861,8 @@ class BuildSet(object):
return self.node_requests.get(job_name) return self.node_requests.get(job_name)
def removeJobNodeRequest(self, job_name): def removeJobNodeRequest(self, job_name):
if job_name not in self.node_requests: if job_name in self.node_requests:
raise Exception("No node request for %s" % (job_name)) del self.node_requests[job_name]
del self.node_requests[job_name]
def jobNodeRequestComplete(self, job_name, req, nodeset): def jobNodeRequestComplete(self, job_name, req, nodeset):
if job_name in self.nodesets: if job_name in self.nodesets:

View File

@ -770,6 +770,7 @@ class Scheduler(threading.Thread):
new_pipeline.window_floor) new_pipeline.window_floor)
items_to_remove = [] items_to_remove = []
builds_to_cancel = [] builds_to_cancel = []
requests_to_cancel = []
last_head = None last_head = None
for shared_queue in old_pipeline.queues: for shared_queue in old_pipeline.queues:
# Attempt to keep window sizes from shrinking where possible # Attempt to keep window sizes from shrinking where possible
@ -812,15 +813,25 @@ class Scheduler(threading.Thread):
else: else:
item.removeBuild(build) item.removeBuild(build)
builds_to_cancel.append(build) builds_to_cancel.append(build)
for request_job, request in \
item.current_build_set.node_requests.items():
new_job = item.getJob(request_job)
if not new_job:
requests_to_cancel.append(
(item.current_build_set, request))
else: else:
items_to_remove.append(item) items_to_remove.append(item)
for item in items_to_remove: for item in items_to_remove:
self.log.warning( self.log.info(
"Removing item %s during reconfiguration" % (item,)) "Removing item %s during reconfiguration" % (item,))
for build in item.current_build_set.getBuilds(): for build in item.current_build_set.getBuilds():
builds_to_cancel.append(build) builds_to_cancel.append(build)
for request_job, request in \
item.current_build_set.node_requests.items():
requests_to_cancel.append(
(item.current_build_set, request))
for build in builds_to_cancel: for build in builds_to_cancel:
self.log.warning( self.log.info(
"Canceling build %s during reconfiguration" % (build,)) "Canceling build %s during reconfiguration" % (build,))
try: try:
self.executor.cancel(build) self.executor.cancel(build)
@ -839,6 +850,12 @@ class Scheduler(threading.Thread):
"for change %s" % (build, build.build_set.item.change)) "for change %s" % (build, build.build_set.item.change))
tenant.semaphore_handler.release( tenant.semaphore_handler.release(
build.build_set.item, build.job) build.build_set.item, build.job)
for build_set, request in requests_to_cancel:
self.log.info(
"Canceling node request %s during reconfiguration",
request)
self.nodepool.cancelRequest(request)
build_set.removeJobNodeRequest(request.job.name)
def _reconfigureTenant(self, tenant): def _reconfigureTenant(self, tenant):
# This is called from _doReconfigureEvent while holding the # This is called from _doReconfigureEvent while holding the
@ -1313,12 +1330,7 @@ class Scheduler(threading.Thread):
self.log.warning("Item %s does not contain job %s " self.log.warning("Item %s does not contain job %s "
"for node request %s", "for node request %s",
build_set.item, request.job.name, request) build_set.item, request.job.name, request)
try: build_set.removeJobNodeRequest(request.job.name)
build_set.removeJobNodeRequest(request.job.name)
except Exception:
self.log.exception("Unable to remove obsolete node request "
"%s for %s job %s",
request, build_set.item, request.job.name)
if request.fulfilled: if request.fulfilled:
self.nodepool.returnNodeSet(request.nodeset) self.nodepool.returnNodeSet(request.nodeset)
return return