Improve safety around canceling node requests
It's possible for a node request to disappear while we are trying to fulfill it. That will cause an unhandled exception and the fake nodepool will stop. Log all exceptions and continue running. Also add an exception handler for the specific case encountered. Also, add a canceled flag to node requests to aid in some race condiations. The _updateNodeRequest method can be called out-of-band by ZK. It can race with the scheduler calling cancelRequest, which is significant for deleting the request from Nodepool.requests as well as handing off a completed request to the scheduler. The canceled flag on requests aids in that by allowing us to only delete from Nodepool.requests within the _updateNodeRequest method. It also helps us detect when a request was canceled after it was fulfilled and handed off to the scheduler. In those cases, the cancel method will have already deleted the request, but the scheduler would then procces it and perhaps try to delete it again. We check the canceled flag inside of the scheduler main loop. Since the scheduler main loop is also the only thread which can call the cancel method, we can be assured that it won't change while we evaluate it. If we find that we ended up with a completed request which was canceled, we just ignore it since we know that the cancel method would have deleted the request. Change-Id: I30f854a869d7690ab50340ba4c02067c4ae9fe2b
This commit is contained in:
parent
ff97edb615
commit
cbbce0d12c
|
@ -1364,7 +1364,10 @@ class FakeNodepool(object):
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
while self._running:
|
while self._running:
|
||||||
self._run()
|
try:
|
||||||
|
self._run()
|
||||||
|
except Exception:
|
||||||
|
self.log.exception("Error in fake nodepool:")
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
|
|
||||||
def _run(self):
|
def _run(self):
|
||||||
|
@ -1462,7 +1465,10 @@ class FakeNodepool(object):
|
||||||
path = self.REQUEST_ROOT + '/' + oid
|
path = self.REQUEST_ROOT + '/' + oid
|
||||||
data = json.dumps(request)
|
data = json.dumps(request)
|
||||||
self.log.debug("Fulfilling node request: %s %s" % (oid, data))
|
self.log.debug("Fulfilling node request: %s %s" % (oid, data))
|
||||||
self.client.set(path, data)
|
try:
|
||||||
|
self.client.set(path, data)
|
||||||
|
except kazoo.exceptions.NoNodeError:
|
||||||
|
self.log.debug("Node request %s %s disappeared" % (oid, data))
|
||||||
|
|
||||||
|
|
||||||
class ChrootedKazooFixture(fixtures.Fixture):
|
class ChrootedKazooFixture(fixtures.Fixture):
|
||||||
|
|
|
@ -490,9 +490,10 @@ class NodeRequest(object):
|
||||||
self.stat = None
|
self.stat = None
|
||||||
self.uid = uuid4().hex
|
self.uid = uuid4().hex
|
||||||
self.id = None
|
self.id = None
|
||||||
# Zuul internal failure flag (not stored in ZK so it's not
|
# Zuul internal flags (not stored in ZK so they are not
|
||||||
# overwritten).
|
# overwritten).
|
||||||
self.failed = False
|
self.failed = False
|
||||||
|
self.canceled = False
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def fulfilled(self):
|
def fulfilled(self):
|
||||||
|
|
|
@ -38,11 +38,11 @@ class Nodepool(object):
|
||||||
def cancelRequest(self, request):
|
def cancelRequest(self, request):
|
||||||
self.log.info("Canceling node request %s" % (request,))
|
self.log.info("Canceling node request %s" % (request,))
|
||||||
if request.uid in self.requests:
|
if request.uid in self.requests:
|
||||||
|
request.canceled = True
|
||||||
try:
|
try:
|
||||||
self.sched.zk.deleteNodeRequest(request)
|
self.sched.zk.deleteNodeRequest(request)
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.exception("Error deleting node request:")
|
self.log.exception("Error deleting node request:")
|
||||||
del self.requests[request.uid]
|
|
||||||
|
|
||||||
def useNodeSet(self, nodeset):
|
def useNodeSet(self, nodeset):
|
||||||
self.log.info("Setting nodeset %s in use" % (nodeset,))
|
self.log.info("Setting nodeset %s in use" % (nodeset,))
|
||||||
|
@ -98,6 +98,10 @@ class Nodepool(object):
|
||||||
if request.uid not in self.requests:
|
if request.uid not in self.requests:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
if request.canceled:
|
||||||
|
del self.requests[request.uid]
|
||||||
|
return False
|
||||||
|
|
||||||
if request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
|
if request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
|
||||||
self.log.info("Node request %s %s" % (request, request.state))
|
self.log.info("Node request %s %s" % (request, request.state))
|
||||||
|
|
||||||
|
@ -119,6 +123,11 @@ class Nodepool(object):
|
||||||
|
|
||||||
self.log.info("Accepting node request %s" % (request,))
|
self.log.info("Accepting node request %s" % (request,))
|
||||||
|
|
||||||
|
if request.canceled:
|
||||||
|
self.log.info("Ignoring canceled node request %s" % (request,))
|
||||||
|
# The request was already deleted when it was canceled
|
||||||
|
return
|
||||||
|
|
||||||
locked = False
|
locked = False
|
||||||
if request.fulfilled:
|
if request.fulfilled:
|
||||||
# If the request suceeded, try to lock the nodes.
|
# If the request suceeded, try to lock the nodes.
|
||||||
|
|
|
@ -871,6 +871,8 @@ class Scheduler(threading.Thread):
|
||||||
build_set = request.build_set
|
build_set = request.build_set
|
||||||
|
|
||||||
self.nodepool.acceptNodes(request)
|
self.nodepool.acceptNodes(request)
|
||||||
|
if request.canceled:
|
||||||
|
return
|
||||||
|
|
||||||
if build_set is not build_set.item.current_build_set:
|
if build_set is not build_set.item.current_build_set:
|
||||||
self.log.warning("Build set %s is not current" % (build_set,))
|
self.log.warning("Build set %s is not current" % (build_set,))
|
||||||
|
|
Loading…
Reference in New Issue