Merge "Handle nodepool allocation failure" into feature/zuulv3
This commit is contained in:
commit
fa8e36a8b4
|
@ -887,6 +887,7 @@ class FakeNodepool(object):
|
|||
self.thread = threading.Thread(target=self.run)
|
||||
self.thread.daemon = True
|
||||
self.thread.start()
|
||||
self.fail_requests = set()
|
||||
|
||||
def stop(self):
|
||||
self._running = False
|
||||
|
@ -965,21 +966,27 @@ class FakeNodepool(object):
|
|||
nodeid = path.split("/")[-1]
|
||||
return nodeid
|
||||
|
||||
def addFailRequest(self, request):
|
||||
self.fail_requests.add(request['_oid'])
|
||||
|
||||
def fulfillRequest(self, request):
|
||||
if request['state'] == 'fulfilled':
|
||||
if request['state'] != 'requested':
|
||||
return
|
||||
request = request.copy()
|
||||
oid = request['_oid']
|
||||
del request['_oid']
|
||||
|
||||
if oid in self.fail_requests:
|
||||
request['state'] = 'failed'
|
||||
else:
|
||||
request['state'] = 'fulfilled'
|
||||
nodes = []
|
||||
for node in request['node_types']:
|
||||
nodeid = self.makeNode(oid, node)
|
||||
nodes.append(nodeid)
|
||||
|
||||
request['state'] = 'fulfilled'
|
||||
request['state_time'] = time.time()
|
||||
request['nodes'] = nodes
|
||||
|
||||
request['state_time'] = time.time()
|
||||
path = self.REQUEST_ROOT + '/' + oid
|
||||
data = json.dumps(request)
|
||||
self.log.debug("Fulfilling node request: %s %s" % (oid, data))
|
||||
|
|
|
@ -4546,6 +4546,27 @@ For CI problems and help debugging, contact ci@example.org"""
|
|||
self.assertEqual(A.data['status'], 'MERGED')
|
||||
self.assertEqual(A.reported, 2)
|
||||
|
||||
def test_nodepool_failure(self):
|
||||
"Test that jobs are reported after a nodepool failure"
|
||||
|
||||
self.fake_nodepool.paused = True
|
||||
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
|
||||
A.addApproval('code-review', 2)
|
||||
self.fake_gerrit.addEvent(A.addApproval('approved', 1))
|
||||
self.waitUntilSettled()
|
||||
|
||||
req = self.fake_nodepool.getNodeRequests()[0]
|
||||
self.fake_nodepool.addFailRequest(req)
|
||||
|
||||
self.fake_nodepool.paused = False
|
||||
self.waitUntilSettled()
|
||||
|
||||
self.assertEqual(A.data['status'], 'NEW')
|
||||
self.assertEqual(A.reported, 2)
|
||||
self.assertIn('project-merge : NODE_FAILURE', A.messages[1])
|
||||
self.assertIn('project-test1 : SKIPPED', A.messages[1])
|
||||
self.assertIn('project-test2 : SKIPPED', A.messages[1])
|
||||
|
||||
|
||||
class TestDuplicatePipeline(ZuulTestCase):
|
||||
tenant_config_file = 'config/duplicate-pipeline/main.yaml'
|
||||
|
|
|
@ -648,6 +648,10 @@ class PipelineManager(object):
|
|||
build_set = request.build_set
|
||||
build_set.jobNodeRequestComplete(request.job.name, request,
|
||||
request.nodeset)
|
||||
if request.failed or not request.fulfilled:
|
||||
self.log.info("Node request failure for %s" %
|
||||
(request.job.name,))
|
||||
build_set.item.setNodeRequestFailure(request.job)
|
||||
self.log.info("Completed node request %s for job %s of item %s "
|
||||
"with nodes %s" %
|
||||
(request, request.job, build_set.item,
|
||||
|
|
|
@ -473,6 +473,10 @@ class NodeRequest(object):
|
|||
# overwritten).
|
||||
self.failed = False
|
||||
|
||||
@property
|
||||
def fulfilled(self):
|
||||
return (self._state == STATE_FULFILLED) and not self.failed
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
return self._state
|
||||
|
@ -989,17 +993,27 @@ class QueueItem(object):
|
|||
return self._findJobsToRun(tree.job_trees, mutex)
|
||||
|
||||
def _findJobsToRequest(self, job_trees):
|
||||
build_set = self.current_build_set
|
||||
toreq = []
|
||||
if self.item_ahead:
|
||||
if self.item_ahead.isHoldingFollowingChanges():
|
||||
return []
|
||||
for tree in job_trees:
|
||||
job = tree.job
|
||||
result = None
|
||||
if job:
|
||||
if not job.changeMatches(self.change):
|
||||
continue
|
||||
nodeset = self.current_build_set.getJobNodeSet(job.name)
|
||||
build = build_set.getBuild(job.name)
|
||||
if build:
|
||||
result = build.result
|
||||
else:
|
||||
nodeset = build_set.getJobNodeSet(job.name)
|
||||
if nodeset is None:
|
||||
req = self.current_build_set.getJobNodeRequest(job.name)
|
||||
req = build_set.getJobNodeRequest(job.name)
|
||||
if req is None:
|
||||
toreq.append(job)
|
||||
if result == 'SUCCESS' or not job:
|
||||
toreq.extend(self._findJobsToRequest(tree.job_trees))
|
||||
return toreq
|
||||
|
||||
|
@ -1022,6 +1036,12 @@ class QueueItem(object):
|
|||
fakebuild.result = 'SKIPPED'
|
||||
self.addBuild(fakebuild)
|
||||
|
||||
def setNodeRequestFailure(self, job):
|
||||
fakebuild = Build(job, None)
|
||||
self.addBuild(fakebuild)
|
||||
fakebuild.result = 'NODE_FAILURE'
|
||||
self.setResult(fakebuild)
|
||||
|
||||
def setDequeuedNeedingChange(self):
|
||||
self.dequeued_needing_change = True
|
||||
self._setAllJobsSkipped()
|
||||
|
|
|
@ -98,8 +98,8 @@ class Nodepool(object):
|
|||
if request.uid not in self.requests:
|
||||
return False
|
||||
|
||||
if request.state == model.STATE_FULFILLED:
|
||||
self.log.info("Node request %s fulfilled" % (request,))
|
||||
if request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
|
||||
self.log.info("Node request %s %s" % (request, request.state))
|
||||
|
||||
# Give our results to the scheduler.
|
||||
self.sched.onNodesProvisioned(request)
|
||||
|
@ -119,8 +119,9 @@ class Nodepool(object):
|
|||
|
||||
self.log.info("Accepting node request %s" % (request,))
|
||||
|
||||
# First, try to lock the nodes.
|
||||
locked = False
|
||||
if request.fulfilled:
|
||||
# If the request succeeded, try to lock the nodes.
|
||||
try:
|
||||
self.lockNodeset(request.nodeset)
|
||||
locked = True
|
||||
|
@ -128,8 +129,8 @@ class Nodepool(object):
|
|||
self.log.exception("Error locking nodes:")
|
||||
request.failed = True
|
||||
|
||||
# Regardless of whether locking succeeded, delete the
|
||||
# request.
|
||||
# Regardless of whether locking (or even the request)
|
||||
# succeeded, delete the request.
|
||||
self.log.debug("Deleting node request %s" % (request,))
|
||||
try:
|
||||
self.sched.zk.deleteNodeRequest(request)
|
||||
|
|
|
@ -811,21 +811,18 @@ class Scheduler(threading.Thread):
|
|||
request = event.request
|
||||
build_set = request.build_set
|
||||
|
||||
try:
|
||||
self.nodepool.acceptNodes(request)
|
||||
except Exception:
|
||||
self.log.exception("Unable to accept nodes from request %s:"
|
||||
% (request,))
|
||||
return
|
||||
|
||||
if build_set is not build_set.item.current_build_set:
|
||||
self.log.warning("Build set %s is not current" % (build_set,))
|
||||
if request.fulfilled:
|
||||
self.nodepool.returnNodeset(request.nodeset)
|
||||
return
|
||||
pipeline = build_set.item.pipeline
|
||||
if not pipeline:
|
||||
self.log.warning("Build set %s is not associated with a pipeline" %
|
||||
(build_set,))
|
||||
if request.fulfilled:
|
||||
self.nodepool.returnNodeset(request.nodeset)
|
||||
return
|
||||
pipeline.manager.onNodesProvisioned(event)
|
||||
|
|
Loading…
Reference in New Issue