From bb9ad84cb60c3b6132e4d74f5eb0116637215ef8 Mon Sep 17 00:00:00 2001
From: "James E. Blair"
Date: Thu, 3 Jan 2019 13:06:39 -0800
Subject: [PATCH] Be more aggressive in canceling node requests

During a reconfiguration, we may cancel builds if they are no longer
necessary; however, we do not do the same for node requests.  Currently
we let them run to completion and then return them unused.  It would be
more efficient (and behaviorally consistent) to cancel the request as
soon as we determine that it won't be used.

Also, change some warning log messages to info (as they don't indicate
a situation which may benefit from corrective action).

Change-Id: Ic3ef6b75437bf82bf9c8f426b23ea14d9aaa96b7
---
 zuul/model.py     |  5 ++---
 zuul/scheduler.py | 28 ++++++++++++++++++++--------
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/zuul/model.py b/zuul/model.py
index 09726ac132..9899c92189 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -1861,9 +1861,8 @@ class BuildSet(object):
         return self.node_requests.get(job_name)
 
     def removeJobNodeRequest(self, job_name):
-        if job_name not in self.node_requests:
-            raise Exception("No node request for %s" % (job_name))
-        del self.node_requests[job_name]
+        if job_name in self.node_requests:
+            del self.node_requests[job_name]
 
     def jobNodeRequestComplete(self, job_name, req, nodeset):
         if job_name in self.nodesets:
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 030342925d..0a4bcee7ae 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -770,6 +770,7 @@ class Scheduler(threading.Thread):
                                           new_pipeline.window_floor)
             items_to_remove = []
             builds_to_cancel = []
+            requests_to_cancel = []
             last_head = None
             for shared_queue in old_pipeline.queues:
                 # Attempt to keep window sizes from shrinking where possible
@@ -812,15 +813,25 @@ class Scheduler(threading.Thread):
                             else:
                                 item.removeBuild(build)
                                 builds_to_cancel.append(build)
+                        for request_job, request in \
+                                item.current_build_set.node_requests.items():
+                            new_job = item.getJob(request_job)
+                            if not new_job:
+                                requests_to_cancel.append(
+                                    (item.current_build_set, request))
                     else:
                         items_to_remove.append(item)
             for item in items_to_remove:
-                self.log.warning(
+                self.log.info(
                     "Removing item %s during reconfiguration" % (item,))
                 for build in item.current_build_set.getBuilds():
                     builds_to_cancel.append(build)
+                for request_job, request in \
+                        item.current_build_set.node_requests.items():
+                    requests_to_cancel.append(
+                        (item.current_build_set, request))
             for build in builds_to_cancel:
-                self.log.warning(
+                self.log.info(
                     "Canceling build %s during reconfiguration" % (build,))
                 try:
                     self.executor.cancel(build)
@@ -839,6 +850,12 @@ class Scheduler(threading.Thread):
                         "for change %s" % (build, build.build_set.item.change))
                 tenant.semaphore_handler.release(
                     build.build_set.item, build.job)
+            for build_set, request in requests_to_cancel:
+                self.log.info(
+                    "Canceling node request %s during reconfiguration",
+                    request)
+                self.nodepool.cancelRequest(request)
+                build_set.removeJobNodeRequest(request.job.name)
 
     def _reconfigureTenant(self, tenant):
         # This is called from _doReconfigureEvent while holding the
@@ -1313,12 +1330,7 @@ class Scheduler(threading.Thread):
             self.log.warning("Item %s does not contain job %s "
                              "for node request %s",
                              build_set.item, request.job.name, request)
-            try:
-                build_set.removeJobNodeRequest(request.job.name)
-            except Exception:
-                self.log.exception("Unable to remove obsolete node request "
-                                   "%s for %s job %s",
-                                   request, build_set.item, request.job.name)
+            build_set.removeJobNodeRequest(request.job.name)
             if request.fulfilled:
                 self.nodepool.returnNodeSet(request.nodeset)
             return
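
For reference, below is a minimal, self-contained sketch of the flow this
patch introduces: during reconfiguration, node requests whose jobs no longer
exist in the new layout are collected and canceled rather than being left to
run to completion.  All names here (FakeNodeRequest, FakeBuildSet,
FakeNodepool, cancel_obsolete_requests) are hypothetical stand-ins for
illustration, not Zuul classes; only removeJobNodeRequest and cancelRequest
mirror calls made in the patch.

# Illustrative sketch only -- FakeNodeRequest, FakeBuildSet, FakeNodepool and
# cancel_obsolete_requests are hypothetical stand-ins, not Zuul classes.
import logging
from types import SimpleNamespace

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("reconfig-sketch")


class FakeNodeRequest:
    """Stand-in for a node request: only carries the job it serves."""

    def __init__(self, job_name):
        self.job = SimpleNamespace(name=job_name)

    def __repr__(self):
        return "<NodeRequest for %s>" % self.job.name


class FakeBuildSet:
    """Stand-in for a build set: node requests keyed by job name."""

    def __init__(self, node_requests):
        self.node_requests = dict(node_requests)

    def removeJobNodeRequest(self, job_name):
        # Mirrors the patched BuildSet behavior: quietly ignore a request
        # that has already been removed instead of raising.
        if job_name in self.node_requests:
            del self.node_requests[job_name]


class FakeNodepool:
    """Stand-in for the scheduler's nodepool client."""

    def cancelRequest(self, request):
        log.info("cancelRequest(%s)", request)


def cancel_obsolete_requests(build_set, new_job_names, nodepool):
    """Cancel node requests whose jobs are absent from the new layout."""
    requests_to_cancel = [
        (build_set, request)
        for job_name, request in build_set.node_requests.items()
        if job_name not in new_job_names
    ]
    for bs, request in requests_to_cancel:
        log.info("Canceling node request %s during reconfiguration", request)
        nodepool.cancelRequest(request)
        bs.removeJobNodeRequest(request.job.name)


if __name__ == "__main__":
    build_set = FakeBuildSet({
        "unit-tests": FakeNodeRequest("unit-tests"),
        "removed-job": FakeNodeRequest("removed-job"),
    })
    cancel_obsolete_requests(build_set, {"unit-tests"}, FakeNodepool())
    # Only the request for the job that survived reconfiguration remains.
    assert list(build_set.node_requests) == ["unit-tests"]

As in the patch itself, obsolete requests are first gathered into
requests_to_cancel and canceled in a separate pass, keeping the cancellation
work out of the loop that re-enqueues items.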