Fix test race with node allocation

In order to determine if the system is settled, the test framework
checks whether all node requests have been fulfilled (and, implicitly,
that Zuul has seen and processed that fulfillment).  That sequence
is now:

1. Request state in ZK is set to fulfilled by nodepool
2. Scheduler receives watch that request is fulfilled
3. Internal cache is updated to reflect new state
4. NodesProvisionedEvent is added to ZK result event queue

There is a window between 3 and 4 where, to an external observer, the
system looks the same as it does after 4.  We used to have an internal
list of pending requests in the scheduler which we would not clear until
after 4; however, that list has been removed since it doesn't make sense
with multiple schedulers.

To resolve this, we wrap steps 3 and 4 in a lock so they act as a
critical section, and then in the test framework, we grab that same lock
and check both the internal cache (3) and the result event queue (4) to
determine whether we're between 3 and 4 (not settled) or after 4 (settled).
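
A standalone sketch of that pattern follows (the names Provisioner,
callback_lock, cache, result_events and is_settled are illustrative
only, not Zuul's actual classes or attributes; the real change is in
ZooKeeperNodepool.requestCacheListener and the test framework's
settled check below):

    # Illustrative sketch only -- hypothetical names, not Zuul's real API.
    import threading
    from queue import Queue

    class Provisioner:
        def __init__(self):
            self.callback_lock = threading.Lock()
            self.cache = {}               # request id -> state (step 3)
            self.result_events = Queue()  # result event queue (step 4)

        def on_request_fulfilled(self, request_id):
            # Steps 3 and 4 form one critical section: an observer
            # holding the lock can never see the cache updated while
            # the corresponding event has not yet been enqueued.
            with self.callback_lock:
                self.cache[request_id] = 'fulfilled'
                self.result_events.put(('nodes-provisioned', request_id))

    def is_settled(provisioner, zk_requests):
        # Test-side check: hold the same lock, then require that the
        # cache agrees with ZooKeeper and that no result events are
        # still queued; a stale cache or a queued event means the
        # system has not settled yet.
        with provisioner.callback_lock:
            for request_id, state in zk_requests.items():
                if provisioner.cache.get(request_id) != state:
                    return False
            return provisioner.result_events.empty()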

Change-Id: Ib6d0ad826cc4d3fad9d1b59434971f48bab7d23a
James E. Blair 2021-09-11 11:01:44 -07:00
parent 38776452bb
commit cebebf0f42
3 changed files with 29 additions and 15 deletions


@@ -5250,17 +5250,25 @@ class ZuulTestCase(BaseTestCase):
             return True
         # Check ZK and the scheduler cache and make sure they are
         # in sync.
-        for req in self.fake_nodepool.getNodeRequests():
-            if req['state'] != model.STATE_FULFILLED:
-                return False
-            for app in self.scheds.filter(matcher):
-                nodepool = app.sched.nodepool
-                r2 = nodepool.zk_nodepool.getNodeRequest(req['_oid'],
-                                                         cached=True)
-                if r2 and r2.state != req['state']:
-                    return False
-                if req and not r2:
-                    return False
+        for app in self.scheds.filter(matcher):
+            sched = app.sched
+            nodepool = app.sched.nodepool
+            with nodepool.zk_nodepool._callback_lock:
+                for req in self.fake_nodepool.getNodeRequests():
+                    if req['state'] != model.STATE_FULFILLED:
+                        return False
+                    r2 = nodepool.zk_nodepool.getNodeRequest(req['_oid'],
+                                                             cached=True)
+                    if r2 and r2.state != req['state']:
+                        return False
+                    if req and not r2:
+                        return False
+                    tenant_name = r2.tenant_name
+                    pipeline_name = r2.pipeline_name
+                    if sched.pipeline_result_events[tenant_name][
+                            pipeline_name
+                    ].hasEvents():
+                        return False
         return True

     def __areAllMergeJobsWaiting(self, matcher) -> bool:


@@ -116,9 +116,6 @@ class Nodepool(object):
         log.debug("Node request %s %s", request, request.state)

         if event == NodeRequestEvent.COMPLETED:
-            # This sequence is required for tests -- we can only
-            # remove the request from our internal cache after the
-            # completed event is added to the zk queue.
             try:
                 if self.election_won:
                     if self.election.is_still_valid():


@@ -15,6 +15,7 @@ import logging
 import time
 from enum import Enum
 from typing import Optional, List
+import threading

 from kazoo.exceptions import NoNodeError, LockTimeout
 from kazoo.recipe.cache import TreeCache, TreeEvent
@@ -67,6 +68,7 @@ class ZooKeeperNodepool(ZooKeeperBase):
         self._node_request_cache = {}
         self._node_tree = None
         self._node_cache = {}
+        self._callback_lock = threading.Lock()

         if self.client.connected:
             self._onConnect()
@@ -380,7 +382,14 @@ class ZooKeeperNodepool(ZooKeeperBase):
     def requestCacheListener(self, event):
         try:
-            self._requestCacheListener(event)
+            # This lock is only needed for tests.  It wraps updating
+            # our internal cache and performing the callback which
+            # will generate node provisioned events in a critical
+            # section.  This way the tests know if a request is
+            # outstanding because either our cache is out of date or
+            # there is an event on the queue.
+            with self._callback_lock:
+                self._requestCacheListener(event)
         except Exception:
             self.log.exception(
                 "Exception in request cache update for event: %s",