Improve safety around canceling node requests

It's possible for a node request to disappear while we are trying
to fulfill it.  That will cause an unhandled exception and the
fake nodepool will stop.

Log all exceptions and continue running.

Also add an exception handler for the specific case encountered.

Also, add a canceled flag to node requests to aid in some race
condiations.  The _updateNodeRequest method can be called
out-of-band by ZK.  It can race with the scheduler calling
cancelRequest, which is significant for deleting the request
from Nodepool.requests as well as handing off a completed request
to the scheduler.

The canceled flag on requests aids in that by allowing us to only
delete from Nodepool.requests within the _updateNodeRequest method.

It also helps us detect when a request was canceled after it was
fulfilled and handed off to the scheduler.  In those cases, the
cancel method will have already deleted the request, but the scheduler
would then procces it and perhaps try to delete it again.  We check
the canceled flag inside of the scheduler main loop.  Since the
scheduler main loop is also the only thread which can call the cancel
method, we can be assured that it won't change while we evaluate it.
If we find that we ended up with a completed request which was canceled,
we just ignore it since we know that the cancel method would have
deleted the request.

Change-Id: I30f854a869d7690ab50340ba4c02067c4ae9fe2b
This commit is contained in:
James E. Blair 2017-05-19 07:28:29 -07:00
parent ff97edb615
commit cbbce0d12c
4 changed files with 22 additions and 4 deletions

View File

@ -1364,7 +1364,10 @@ class FakeNodepool(object):
def run(self):
while self._running:
self._run()
try:
self._run()
except Exception:
self.log.exception("Error in fake nodepool:")
time.sleep(0.1)
def _run(self):
@ -1462,7 +1465,10 @@ class FakeNodepool(object):
path = self.REQUEST_ROOT + '/' + oid
data = json.dumps(request)
self.log.debug("Fulfilling node request: %s %s" % (oid, data))
self.client.set(path, data)
try:
self.client.set(path, data)
except kazoo.exceptions.NoNodeError:
self.log.debug("Node request %s %s disappeared" % (oid, data))
class ChrootedKazooFixture(fixtures.Fixture):

View File

@ -490,9 +490,10 @@ class NodeRequest(object):
self.stat = None
self.uid = uuid4().hex
self.id = None
# Zuul internal failure flag (not stored in ZK so it's not
# Zuul internal flags (not stored in ZK so they are not
# overwritten).
self.failed = False
self.canceled = False
@property
def fulfilled(self):

View File

@ -38,11 +38,11 @@ class Nodepool(object):
def cancelRequest(self, request):
self.log.info("Canceling node request %s" % (request,))
if request.uid in self.requests:
request.canceled = True
try:
self.sched.zk.deleteNodeRequest(request)
except Exception:
self.log.exception("Error deleting node request:")
del self.requests[request.uid]
def useNodeSet(self, nodeset):
self.log.info("Setting nodeset %s in use" % (nodeset,))
@ -98,6 +98,10 @@ class Nodepool(object):
if request.uid not in self.requests:
return False
if request.canceled:
del self.requests[request.uid]
return False
if request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
self.log.info("Node request %s %s" % (request, request.state))
@ -119,6 +123,11 @@ class Nodepool(object):
self.log.info("Accepting node request %s" % (request,))
if request.canceled:
self.log.info("Ignoring canceled node request %s" % (request,))
# The request was already deleted when it was canceled
return
locked = False
if request.fulfilled:
# If the request suceeded, try to lock the nodes.

View File

@ -871,6 +871,8 @@ class Scheduler(threading.Thread):
build_set = request.build_set
self.nodepool.acceptNodes(request)
if request.canceled:
return
if build_set is not build_set.item.current_build_set:
self.log.warning("Build set %s is not current" % (build_set,))