Merge "Handle nodepool allocation failure" into feature/zuulv3

This commit is contained in:
Jenkins 2017-01-18 22:14:18 +00:00 committed by Gerrit Code Review
commit fa8e36a8b4
6 changed files with 82 additions and 32 deletions

View File

@ -887,6 +887,7 @@ class FakeNodepool(object):
self.thread = threading.Thread(target=self.run)
self.thread.daemon = True
self.thread.start()
self.fail_requests = set()
def stop(self):
self._running = False
@ -965,21 +966,27 @@ class FakeNodepool(object):
nodeid = path.split("/")[-1]
return nodeid
def addFailRequest(self, request):
self.fail_requests.add(request['_oid'])
def fulfillRequest(self, request):
if request['state'] == 'fulfilled':
if request['state'] != 'requested':
return
request = request.copy()
oid = request['_oid']
del request['_oid']
if oid in self.fail_requests:
request['state'] = 'failed'
else:
request['state'] = 'fulfilled'
nodes = []
for node in request['node_types']:
nodeid = self.makeNode(oid, node)
nodes.append(nodeid)
request['state'] = 'fulfilled'
request['state_time'] = time.time()
request['nodes'] = nodes
request['state_time'] = time.time()
path = self.REQUEST_ROOT + '/' + oid
data = json.dumps(request)
self.log.debug("Fulfilling node request: %s %s" % (oid, data))

View File

@ -4546,6 +4546,27 @@ For CI problems and help debugging, contact ci@example.org"""
self.assertEqual(A.data['status'], 'MERGED')
self.assertEqual(A.reported, 2)
def test_nodepool_failure(self):
"Test that jobs are reported after a nodepool failure"
self.fake_nodepool.paused = True
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
A.addApproval('code-review', 2)
self.fake_gerrit.addEvent(A.addApproval('approved', 1))
self.waitUntilSettled()
req = self.fake_nodepool.getNodeRequests()[0]
self.fake_nodepool.addFailRequest(req)
self.fake_nodepool.paused = False
self.waitUntilSettled()
self.assertEqual(A.data['status'], 'NEW')
self.assertEqual(A.reported, 2)
self.assertIn('project-merge : NODE_FAILURE', A.messages[1])
self.assertIn('project-test1 : SKIPPED', A.messages[1])
self.assertIn('project-test2 : SKIPPED', A.messages[1])
class TestDuplicatePipeline(ZuulTestCase):
tenant_config_file = 'config/duplicate-pipeline/main.yaml'

View File

@ -648,6 +648,10 @@ class PipelineManager(object):
build_set = request.build_set
build_set.jobNodeRequestComplete(request.job.name, request,
request.nodeset)
if request.failed or not request.fulfilled:
self.log.info("Node request failure for %s" %
(request.job.name,))
build_set.item.setNodeRequestFailure(request.job)
self.log.info("Completed node request %s for job %s of item %s "
"with nodes %s" %
(request, request.job, build_set.item,

View File

@ -473,6 +473,10 @@ class NodeRequest(object):
# overwritten).
self.failed = False
@property
def fulfilled(self):
return (self._state == STATE_FULFILLED) and not self.failed
@property
def state(self):
return self._state
@ -989,17 +993,27 @@ class QueueItem(object):
return self._findJobsToRun(tree.job_trees, mutex)
def _findJobsToRequest(self, job_trees):
build_set = self.current_build_set
toreq = []
if self.item_ahead:
if self.item_ahead.isHoldingFollowingChanges():
return []
for tree in job_trees:
job = tree.job
result = None
if job:
if not job.changeMatches(self.change):
continue
nodeset = self.current_build_set.getJobNodeSet(job.name)
build = build_set.getBuild(job.name)
if build:
result = build.result
else:
nodeset = build_set.getJobNodeSet(job.name)
if nodeset is None:
req = self.current_build_set.getJobNodeRequest(job.name)
req = build_set.getJobNodeRequest(job.name)
if req is None:
toreq.append(job)
if result == 'SUCCESS' or not job:
toreq.extend(self._findJobsToRequest(tree.job_trees))
return toreq
@ -1022,6 +1036,12 @@ class QueueItem(object):
fakebuild.result = 'SKIPPED'
self.addBuild(fakebuild)
def setNodeRequestFailure(self, job):
fakebuild = Build(job, None)
self.addBuild(fakebuild)
fakebuild.result = 'NODE_FAILURE'
self.setResult(fakebuild)
def setDequeuedNeedingChange(self):
self.dequeued_needing_change = True
self._setAllJobsSkipped()

View File

@ -98,8 +98,8 @@ class Nodepool(object):
if request.uid not in self.requests:
return False
if request.state == model.STATE_FULFILLED:
self.log.info("Node request %s fulfilled" % (request,))
if request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
self.log.info("Node request %s %s" % (request, request.state))
# Give our results to the scheduler.
self.sched.onNodesProvisioned(request)
@ -119,8 +119,9 @@ class Nodepool(object):
self.log.info("Accepting node request %s" % (request,))
# First, try to lock the nodes.
locked = False
if request.fulfilled:
# If the request succeeded, try to lock the nodes.
try:
self.lockNodeset(request.nodeset)
locked = True
@ -128,8 +129,8 @@ class Nodepool(object):
self.log.exception("Error locking nodes:")
request.failed = True
# Regardless of whether locking succeeded, delete the
# request.
# Regardless of whether locking (or even the request)
# succeeded, delete the request.
self.log.debug("Deleting node request %s" % (request,))
try:
self.sched.zk.deleteNodeRequest(request)

View File

@ -811,21 +811,18 @@ class Scheduler(threading.Thread):
request = event.request
build_set = request.build_set
try:
self.nodepool.acceptNodes(request)
except Exception:
self.log.exception("Unable to accept nodes from request %s:"
% (request,))
return
if build_set is not build_set.item.current_build_set:
self.log.warning("Build set %s is not current" % (build_set,))
if request.fulfilled:
self.nodepool.returnNodeset(request.nodeset)
return
pipeline = build_set.item.pipeline
if not pipeline:
self.log.warning("Build set %s is not associated with a pipeline" %
(build_set,))
if request.fulfilled:
self.nodepool.returnNodeset(request.nodeset)
return
pipeline.manager.onNodesProvisioned(event)