From 6ab79e06377d00028bbc02cb2974499512045cd6 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 6 Jan 2017 10:10:17 -0800 Subject: [PATCH] Handle nodepool allocation failure When a request is either fulfilled or failed, pass it through to the scheduler which will accept the request (which means deleting it in the case of a failure) and pass it on to the pipeline manager which will set the result of the requesting job to NODE_FAILURE and cause any sub-jobs to be SKIPPED. Adjust the request algorithm to only request nodes for jobs that are ready to run. The current behavior requests all jobs for a build set asap, but that has two downsides: it may request and return nodes more aggressively than necessary (if you have chosen to create a job tree, you *probably* don't want to tie up nodes until they are actually needed). However, that's a grey area, and we may want to adjust or make that behavior configurable later. More pressing here is that it makes the logic of when to return nodes *very* complicated (since SKIPPED jobs are represented by fake builds, there is no good opportunity to return their nodes). This seems like a good solution for now, and if we want to make the node request behavior more aggressive in the future, we can work out a better model for knowing when to return nodes. Change-Id: Ideab6eb5794a01d5c2b70cb87d02d61bb3d41cce --- tests/base.py | 21 ++++++++++++++------- tests/test_scheduler.py | 21 +++++++++++++++++++++ zuul/manager/__init__.py | 4 ++++ zuul/model.py | 32 ++++++++++++++++++++++++++------ zuul/nodepool.py | 23 ++++++++++++----------- zuul/scheduler.py | 13 +++++-------- 6 files changed, 82 insertions(+), 32 deletions(-) diff --git a/tests/base.py b/tests/base.py index 56c83f21cf..9e3c07bfda 100755 --- a/tests/base.py +++ b/tests/base.py @@ -887,6 +887,7 @@ class FakeNodepool(object): self.thread = threading.Thread(target=self.run) self.thread.daemon = True self.thread.start() + self.fail_requests = set() def stop(self): self._running = False @@ -965,21 +966,27 @@ class FakeNodepool(object): nodeid = path.split("/")[-1] return nodeid + def addFailRequest(self, request): + self.fail_requests.add(request['_oid']) + def fulfillRequest(self, request): - if request['state'] == 'fulfilled': + if request['state'] != 'requested': return request = request.copy() oid = request['_oid'] del request['_oid'] - nodes = [] - for node in request['node_types']: - nodeid = self.makeNode(oid, node) - nodes.append(nodeid) + if oid in self.fail_requests: + request['state'] = 'failed' + else: + request['state'] = 'fulfilled' + nodes = [] + for node in request['node_types']: + nodeid = self.makeNode(oid, node) + nodes.append(nodeid) + request['nodes'] = nodes - request['state'] = 'fulfilled' request['state_time'] = time.time() - request['nodes'] = nodes path = self.REQUEST_ROOT + '/' + oid data = json.dumps(request) self.log.debug("Fulfilling node request: %s %s" % (oid, data)) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 81635e0429..f65dbce748 100755 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -4549,6 +4549,27 @@ For CI problems and help debugging, contact ci@example.org""" self.assertEqual(A.data['status'], 'MERGED') self.assertEqual(A.reported, 2) + def test_nodepool_failure(self): + "Test that jobs are reported after a nodepool failure" + + self.fake_nodepool.paused = True + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + A.addApproval('code-review', 2) + self.fake_gerrit.addEvent(A.addApproval('approved', 1)) + 
self.waitUntilSettled() + + req = self.fake_nodepool.getNodeRequests()[0] + self.fake_nodepool.addFailRequest(req) + + self.fake_nodepool.paused = False + self.waitUntilSettled() + + self.assertEqual(A.data['status'], 'NEW') + self.assertEqual(A.reported, 2) + self.assertIn('project-merge : NODE_FAILURE', A.messages[1]) + self.assertIn('project-test1 : SKIPPED', A.messages[1]) + self.assertIn('project-test2 : SKIPPED', A.messages[1]) + class TestDuplicatePipeline(ZuulTestCase): tenant_config_file = 'config/duplicate-pipeline/main.yaml' diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py index 0d11316497..988a9306df 100644 --- a/zuul/manager/__init__.py +++ b/zuul/manager/__init__.py @@ -648,6 +648,10 @@ class PipelineManager(object): build_set = request.build_set build_set.jobNodeRequestComplete(request.job.name, request, request.nodeset) + if request.failed or not request.fulfilled: + self.log.info("Node request failure for %s" % + (request.job.name,)) + build_set.item.setNodeRequestFailure(request.job) self.log.info("Completed node request %s for job %s of item %s " "with nodes %s" % (request, request.job, build_set.item, diff --git a/zuul/model.py b/zuul/model.py index 529f34604e..ed7af76419 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -473,6 +473,10 @@ class NodeRequest(object): # overwritten). self.failed = False + @property + def fulfilled(self): + return (self._state == STATE_FULFILLED) and not self.failed + @property def state(self): return self._state @@ -989,18 +993,28 @@ class QueueItem(object): return self._findJobsToRun(tree.job_trees, mutex) def _findJobsToRequest(self, job_trees): + build_set = self.current_build_set toreq = [] + if self.item_ahead: + if self.item_ahead.isHoldingFollowingChanges(): + return [] for tree in job_trees: job = tree.job + result = None if job: if not job.changeMatches(self.change): continue - nodeset = self.current_build_set.getJobNodeSet(job.name) - if nodeset is None: - req = self.current_build_set.getJobNodeRequest(job.name) - if req is None: - toreq.append(job) - toreq.extend(self._findJobsToRequest(tree.job_trees)) + build = build_set.getBuild(job.name) + if build: + result = build.result + else: + nodeset = build_set.getJobNodeSet(job.name) + if nodeset is None: + req = build_set.getJobNodeRequest(job.name) + if req is None: + toreq.append(job) + if result == 'SUCCESS' or not job: + toreq.extend(self._findJobsToRequest(tree.job_trees)) return toreq def findJobsToRequest(self): @@ -1022,6 +1036,12 @@ class QueueItem(object): fakebuild.result = 'SKIPPED' self.addBuild(fakebuild) + def setNodeRequestFailure(self, job): + fakebuild = Build(job, None) + self.addBuild(fakebuild) + fakebuild.result = 'NODE_FAILURE' + self.setResult(fakebuild) + def setDequeuedNeedingChange(self): self.dequeued_needing_change = True self._setAllJobsSkipped() diff --git a/zuul/nodepool.py b/zuul/nodepool.py index b7be94f299..8c944cce86 100644 --- a/zuul/nodepool.py +++ b/zuul/nodepool.py @@ -98,8 +98,8 @@ class Nodepool(object): if request.uid not in self.requests: return False - if request.state == model.STATE_FULFILLED: - self.log.info("Node request %s fulfilled" % (request,)) + if request.state in (model.STATE_FULFILLED, model.STATE_FAILED): + self.log.info("Node request %s %s" % (request, request.state)) # Give our results to the scheduler. self.sched.onNodesProvisioned(request) @@ -119,17 +119,18 @@ class Nodepool(object): self.log.info("Accepting node request %s" % (request,)) - # First, try to lock the nodes. 
         locked = False
-        try:
-            self.lockNodeset(request.nodeset)
-            locked = True
-        except Exception:
-            self.log.exception("Error locking nodes:")
-            request.failed = True
+        if request.fulfilled:
+            # If the request succeeded, try to lock the nodes.
+            try:
+                self.lockNodeset(request.nodeset)
+                locked = True
+            except Exception:
+                self.log.exception("Error locking nodes:")
+                request.failed = True
 
-        # Regardless of whether locking succeeded, delete the
-        # request.
+        # Regardless of whether locking (or even the request)
+        # succeeded, delete the request.
         self.log.debug("Deleting node request %s" % (request,))
         try:
             self.sched.zk.deleteNodeRequest(request)
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 5f51cbf0a9..5e49f205e4 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -811,22 +811,19 @@ class Scheduler(threading.Thread):
         request = event.request
         build_set = request.build_set
 
-        try:
-            self.nodepool.acceptNodes(request)
-        except Exception:
-            self.log.exception("Unable to accept nodes from request %s:"
-                               % (request,))
-            return
+        self.nodepool.acceptNodes(request)
 
         if build_set is not build_set.item.current_build_set:
             self.log.warning("Build set %s is not current" % (build_set,))
-            self.nodepool.returnNodeset(request.nodeset)
+            if request.fulfilled:
+                self.nodepool.returnNodeset(request.nodeset)
             return
         pipeline = build_set.item.pipeline
         if not pipeline:
             self.log.warning("Build set %s is not associated with a pipeline"
                              % (build_set,))
-            self.nodepool.returnNodeset(request.nodeset)
+            if request.fulfilled:
+                self.nodepool.returnNodeset(request.nodeset)
             return
         pipeline.manager.onNodesProvisioned(event)
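
A rough, standalone sketch of the request algorithm described in the commit
message may make the intent clearer: nodes are requested only for jobs that
are ready to run, and a failed request becomes a NODE_FAILURE result whose
descendant jobs are SKIPPED.  The names below (JobNode, BuildSet,
find_jobs_to_request) are simplified stand-ins for illustration only; they
are not Zuul's actual classes or APIs, and this snippet is not part of the
patch.

# Minimal sketch of the traversal described in the commit message: nodes
# are requested only for jobs whose parent build has already succeeded, and
# a failed request marks the job NODE_FAILURE and its subtree SKIPPED.
# All names here are illustrative stand-ins, not Zuul's real classes.


class JobNode(object):
    def __init__(self, name, children=None):
        self.name = name
        self.children = children or []


class BuildSet(object):
    """Tracks build results and outstanding node requests per job."""

    def __init__(self):
        self.results = {}        # job name -> 'SUCCESS', 'NODE_FAILURE', ...
        self.node_requests = {}  # job name -> pending request
        self.nodesets = {}       # job name -> fulfilled nodeset


def find_jobs_to_request(build_set, trees):
    """Return jobs that need a node request right now.

    A job needs a request if it has no build result, no fulfilled nodeset
    and no outstanding request.  Children are only considered once their
    parent has a SUCCESS result, mirroring the intent of
    QueueItem._findJobsToRequest in this change.
    """
    toreq = []
    for tree in trees:
        result = build_set.results.get(tree.name)
        if result is None:
            if (tree.name not in build_set.nodesets and
                    tree.name not in build_set.node_requests):
                toreq.append(tree)
        if result == 'SUCCESS':
            toreq.extend(find_jobs_to_request(build_set, tree.children))
    return toreq


def on_request_complete(build_set, job_name, fulfilled, subtree):
    """Record a completed node request; on failure, skip the subtree."""
    build_set.node_requests.pop(job_name, None)
    if fulfilled:
        build_set.nodesets[job_name] = 'nodeset-for-%s' % job_name
    else:
        build_set.results[job_name] = 'NODE_FAILURE'
        for child in _descendants(subtree):
            build_set.results[child.name] = 'SKIPPED'


def _descendants(tree):
    for child in tree.children:
        yield child
        for grandchild in _descendants(child):
            yield grandchild


if __name__ == '__main__':
    merge = JobNode('project-merge', [JobNode('project-test1'),
                                      JobNode('project-test2')])
    bs = BuildSet()

    # Only project-merge is ready, so only it gets a node request.
    assert [j.name for j in find_jobs_to_request(bs, [merge])] == [
        'project-merge']
    bs.node_requests['project-merge'] = object()

    # The request fails: the job is NODE_FAILURE and its children SKIPPED,
    # matching what test_nodepool_failure asserts in the report message.
    on_request_complete(bs, 'project-merge', fulfilled=False, subtree=merge)
    assert bs.results == {'project-merge': 'NODE_FAILURE',
                          'project-test1': 'SKIPPED',
                          'project-test2': 'SKIPPED'}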