From cbbce0d12c55ad1e5c7940720b4d3cc4eed59e31 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 19 May 2017 07:28:29 -0700 Subject: [PATCH] Improve safety around canceling node requests It's possible for a node request to disappear while we are trying to fulfill it. That will cause an unhandled exception and the fake nodepool will stop. Log all exceptions and continue running. Also add an exception handler for the specific case encountered. Also, add a canceled flag to node requests to aid in some race condiations. The _updateNodeRequest method can be called out-of-band by ZK. It can race with the scheduler calling cancelRequest, which is significant for deleting the request from Nodepool.requests as well as handing off a completed request to the scheduler. The canceled flag on requests aids in that by allowing us to only delete from Nodepool.requests within the _updateNodeRequest method. It also helps us detect when a request was canceled after it was fulfilled and handed off to the scheduler. In those cases, the cancel method will have already deleted the request, but the scheduler would then procces it and perhaps try to delete it again. We check the canceled flag inside of the scheduler main loop. Since the scheduler main loop is also the only thread which can call the cancel method, we can be assured that it won't change while we evaluate it. If we find that we ended up with a completed request which was canceled, we just ignore it since we know that the cancel method would have deleted the request. Change-Id: I30f854a869d7690ab50340ba4c02067c4ae9fe2b --- tests/base.py | 10 ++++++++-- zuul/model.py | 3 ++- zuul/nodepool.py | 11 ++++++++++- zuul/scheduler.py | 2 ++ 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/tests/base.py b/tests/base.py index 2568ec8c73..045046352b 100755 --- a/tests/base.py +++ b/tests/base.py @@ -1364,7 +1364,10 @@ class FakeNodepool(object): def run(self): while self._running: - self._run() + try: + self._run() + except Exception: + self.log.exception("Error in fake nodepool:") time.sleep(0.1) def _run(self): @@ -1462,7 +1465,10 @@ class FakeNodepool(object): path = self.REQUEST_ROOT + '/' + oid data = json.dumps(request) self.log.debug("Fulfilling node request: %s %s" % (oid, data)) - self.client.set(path, data) + try: + self.client.set(path, data) + except kazoo.exceptions.NoNodeError: + self.log.debug("Node request %s %s disappeared" % (oid, data)) class ChrootedKazooFixture(fixtures.Fixture): diff --git a/zuul/model.py b/zuul/model.py index 8002f160bc..b14ce120ab 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -490,9 +490,10 @@ class NodeRequest(object): self.stat = None self.uid = uuid4().hex self.id = None - # Zuul internal failure flag (not stored in ZK so it's not + # Zuul internal flags (not stored in ZK so they are not # overwritten). self.failed = False + self.canceled = False @property def fulfilled(self): diff --git a/zuul/nodepool.py b/zuul/nodepool.py index e94b9504e4..8f6489c628 100644 --- a/zuul/nodepool.py +++ b/zuul/nodepool.py @@ -38,11 +38,11 @@ class Nodepool(object): def cancelRequest(self, request): self.log.info("Canceling node request %s" % (request,)) if request.uid in self.requests: + request.canceled = True try: self.sched.zk.deleteNodeRequest(request) except Exception: self.log.exception("Error deleting node request:") - del self.requests[request.uid] def useNodeSet(self, nodeset): self.log.info("Setting nodeset %s in use" % (nodeset,)) @@ -98,6 +98,10 @@ class Nodepool(object): if request.uid not in self.requests: return False + if request.canceled: + del self.requests[request.uid] + return False + if request.state in (model.STATE_FULFILLED, model.STATE_FAILED): self.log.info("Node request %s %s" % (request, request.state)) @@ -119,6 +123,11 @@ class Nodepool(object): self.log.info("Accepting node request %s" % (request,)) + if request.canceled: + self.log.info("Ignoring canceled node request %s" % (request,)) + # The request was already deleted when it was canceled + return + locked = False if request.fulfilled: # If the request suceeded, try to lock the nodes. diff --git a/zuul/scheduler.py b/zuul/scheduler.py index 2e9bef22e0..a67973ebe5 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -871,6 +871,8 @@ class Scheduler(threading.Thread): build_set = request.build_set self.nodepool.acceptNodes(request) + if request.canceled: + return if build_set is not build_set.item.current_build_set: self.log.warning("Build set %s is not current" % (build_set,))