Fix test race with node allocation
In order to determine if the system is settled, the test framework checks whether all node requests have been fulfilled (and, implicitly, that Zuul has seen and processed that fulfillment). That sequence is now: 1. Request state in ZK is set to fulfilled by nodepool 2. Scheduler receives watch that request is fulfilled 3. Internal cache is updated to reflect new state 4. NodesProvisionedEvent is added to ZK result event queue There is a window between 3 and 4 where, to an external observer, the system looks the same as after 4. We used to have an internal list of pending requests in the scheduler which we would not clear until after 4, however that has been removed since it doesn't make sense with multiple schedulers. To resolve this, we wrap steps 3 and 4 in a lock so they act as a critical section, and then in the test framework, we grab that same lock and check both the internal cache (3) and the result event queue(4) to determine if we're between 3 and 4 (not settled) or after 4 (settled). Change-Id: Ib6d0ad826cc4d3fad9d1b59434971f48bab7d23a
This commit is contained in:
parent
38776452bb
commit
cebebf0f42
|
@ -5250,17 +5250,25 @@ class ZuulTestCase(BaseTestCase):
|
|||
return True
|
||||
# Check ZK and the scheduler cache and make sure they are
|
||||
# in sync.
|
||||
for req in self.fake_nodepool.getNodeRequests():
|
||||
if req['state'] != model.STATE_FULFILLED:
|
||||
return False
|
||||
for app in self.scheds.filter(matcher):
|
||||
nodepool = app.sched.nodepool
|
||||
r2 = nodepool.zk_nodepool.getNodeRequest(req['_oid'],
|
||||
cached=True)
|
||||
if r2 and r2.state != req['state']:
|
||||
return False
|
||||
if req and not r2:
|
||||
return False
|
||||
for app in self.scheds.filter(matcher):
|
||||
sched = app.sched
|
||||
nodepool = app.sched.nodepool
|
||||
with nodepool.zk_nodepool._callback_lock:
|
||||
for req in self.fake_nodepool.getNodeRequests():
|
||||
if req['state'] != model.STATE_FULFILLED:
|
||||
return False
|
||||
r2 = nodepool.zk_nodepool.getNodeRequest(req['_oid'],
|
||||
cached=True)
|
||||
if r2 and r2.state != req['state']:
|
||||
return False
|
||||
if req and not r2:
|
||||
return False
|
||||
tenant_name = r2.tenant_name
|
||||
pipeline_name = r2.pipeline_name
|
||||
if sched.pipeline_result_events[tenant_name][
|
||||
pipeline_name
|
||||
].hasEvents():
|
||||
return False
|
||||
return True
|
||||
|
||||
def __areAllMergeJobsWaiting(self, matcher) -> bool:
|
||||
|
|
|
@ -116,9 +116,6 @@ class Nodepool(object):
|
|||
|
||||
log.debug("Node request %s %s", request, request.state)
|
||||
if event == NodeRequestEvent.COMPLETED:
|
||||
# This sequence is required for tests -- we can only
|
||||
# remove the request from our internal cache after the
|
||||
# completed event is added to the zk queue.
|
||||
try:
|
||||
if self.election_won:
|
||||
if self.election.is_still_valid():
|
||||
|
|
|
@ -15,6 +15,7 @@ import logging
|
|||
import time
|
||||
from enum import Enum
|
||||
from typing import Optional, List
|
||||
import threading
|
||||
|
||||
from kazoo.exceptions import NoNodeError, LockTimeout
|
||||
from kazoo.recipe.cache import TreeCache, TreeEvent
|
||||
|
@ -67,6 +68,7 @@ class ZooKeeperNodepool(ZooKeeperBase):
|
|||
self._node_request_cache = {}
|
||||
self._node_tree = None
|
||||
self._node_cache = {}
|
||||
self._callback_lock = threading.Lock()
|
||||
|
||||
if self.client.connected:
|
||||
self._onConnect()
|
||||
|
@ -380,7 +382,14 @@ class ZooKeeperNodepool(ZooKeeperBase):
|
|||
|
||||
def requestCacheListener(self, event):
|
||||
try:
|
||||
self._requestCacheListener(event)
|
||||
# This lock is only needed for tests. It wraps updating
|
||||
# our internal cache and performing the callback which
|
||||
# will generate node provisioned events in a critical
|
||||
# section. This way the tests know if a request is
|
||||
# outstanding because either our cache is out of date or
|
||||
# there is an event on the queue.
|
||||
with self._callback_lock:
|
||||
self._requestCacheListener(event)
|
||||
except Exception:
|
||||
self.log.exception(
|
||||
"Exception in request cache update for event: %s",
|
||||
|
|
Loading…
Reference in New Issue