Fix test race with node allocation

In order to determine if the system is settled, the test framework
checks whether all node requests have been fulfilled (and, implicitly,
that Zuul has seen and processed that fulfillment).  That sequence
is now:

1. Request state in ZK is set to fulfilled by nodepool
2. Scheduler receives watch that request is fulfilled
3. Internal cache is updated to reflect new state
4. NodesProvisionedEvent is added to ZK result event queue

There is a window between 3 and 4 where, to an external observer, the
system looks the same as it does after 4.  We used to have an internal
list of pending requests in the scheduler which we would not clear until
after 4; however, that list has been removed since it doesn't make sense
with multiple schedulers.

To resolve this, we wrap steps 3 and 4 in a lock so they act as a
critical section, and then in the test framework, we grab that same lock
and check both the internal cache (3) and the result event queue (4) to
determine whether we're between 3 and 4 (not settled) or after 4 (settled).
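
A standalone sketch of that pattern follows (the names Provisioner,
callback_lock, cache, result_events and is_settled are illustrative
only, not Zuul's actual classes or attributes; the real change is in
ZooKeeperNodepool.requestCacheListener and the test framework's
settled check below):

    # Illustrative sketch only -- hypothetical names, not Zuul's real API.
    import threading
    from queue import Queue

    class Provisioner:
        def __init__(self):
            self.callback_lock = threading.Lock()
            self.cache = {}               # request id -> state (step 3)
            self.result_events = Queue()  # result event queue (step 4)

        def on_request_fulfilled(self, request_id):
            # Steps 3 and 4 form one critical section: an observer
            # holding the lock can never see the cache updated while
            # the corresponding event has not yet been enqueued.
            with self.callback_lock:
                self.cache[request_id] = 'fulfilled'
                self.result_events.put(('nodes-provisioned', request_id))

    def is_settled(provisioner, zk_requests):
        # Test-side check: hold the same lock, then require that the
        # cache agrees with ZooKeeper and that no result events are
        # still queued; a stale cache or a queued event means the
        # system has not settled yet.
        with provisioner.callback_lock:
            for request_id, state in zk_requests.items():
                if provisioner.cache.get(request_id) != state:
                    return False
            return provisioner.result_events.empty()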

Change-Id: Ib6d0ad826cc4d3fad9d1b59434971f48bab7d23a
James E. Blair 2021-09-11 11:01:44 -07:00
parent 38776452bb
commit cebebf0f42
3 changed files with 29 additions and 15 deletions


@@ -5250,17 +5250,25 @@ class ZuulTestCase(BaseTestCase):
             return True
         # Check ZK and the scheduler cache and make sure they are
         # in sync.
-        for req in self.fake_nodepool.getNodeRequests():
-            if req['state'] != model.STATE_FULFILLED:
-                return False
-            for app in self.scheds.filter(matcher):
-                nodepool = app.sched.nodepool
-                r2 = nodepool.zk_nodepool.getNodeRequest(req['_oid'],
-                                                         cached=True)
-                if r2 and r2.state != req['state']:
-                    return False
-                if req and not r2:
-                    return False
+        for app in self.scheds.filter(matcher):
+            sched = app.sched
+            nodepool = app.sched.nodepool
+            with nodepool.zk_nodepool._callback_lock:
+                for req in self.fake_nodepool.getNodeRequests():
+                    if req['state'] != model.STATE_FULFILLED:
+                        return False
+                    r2 = nodepool.zk_nodepool.getNodeRequest(req['_oid'],
+                                                             cached=True)
+                    if r2 and r2.state != req['state']:
+                        return False
+                    if req and not r2:
+                        return False
+                    tenant_name = r2.tenant_name
+                    pipeline_name = r2.pipeline_name
+                    if sched.pipeline_result_events[tenant_name][
+                            pipeline_name
+                    ].hasEvents():
+                        return False
         return True

     def __areAllMergeJobsWaiting(self, matcher) -> bool:


@@ -116,9 +116,6 @@ class Nodepool(object):
         log.debug("Node request %s %s", request, request.state)

         if event == NodeRequestEvent.COMPLETED:
-            # This sequence is required for tests -- we can only
-            # remove the request from our internal cache after the
-            # completed event is added to the zk queue.
             try:
                 if self.election_won:
                     if self.election.is_still_valid():


@@ -15,6 +15,7 @@ import logging
 import time
 from enum import Enum
 from typing import Optional, List
+import threading

 from kazoo.exceptions import NoNodeError, LockTimeout
 from kazoo.recipe.cache import TreeCache, TreeEvent
@@ -67,6 +68,7 @@ class ZooKeeperNodepool(ZooKeeperBase):
         self._node_request_cache = {}
         self._node_tree = None
         self._node_cache = {}
+        self._callback_lock = threading.Lock()

         if self.client.connected:
             self._onConnect()
@@ -380,7 +382,14 @@ class ZooKeeperNodepool(ZooKeeperBase):
     def requestCacheListener(self, event):
         try:
-            self._requestCacheListener(event)
+            # This lock is only needed for tests.  It wraps updating
+            # our internal cache and performing the callback which
+            # will generate node provisioned events in a critical
+            # section.  This way the tests know if a request is
+            # outstanding because either our cache is out of date or
+            # there is an event on the queue.
+            with self._callback_lock:
+                self._requestCacheListener(event)
         except Exception:
             self.log.exception(
                 "Exception in request cache update for event: %s",