Don't pause the static provider at quota

The static provider avoids accepting requests for a label when
all the nodes of that label are busy.  In other words, it only
accepts requests when it expects to be able to fulfill them.

However, it also checks whether a node is reachable before fulfilling
a request.  And if the reachability of a node changes before a
request is fulfilled, then Nodepool will pause the handler until
the request is fulfilled.  If the node never comes back online,
no other nodes in the provider will be handled.

In general, we should never pause the static provider since pausing
is designed to allow for the accumulation of sufficient free resources
to fulfill the next request.  Static provider resources are not
fungible, so pausing until label A is available doesn't make label
B any less available.

To correct the problem, we no longer pause the static provider when
there is insufficient quota; instead we simply release the current
node request without declining it, and then allow the provider to
either defer it (if the node continues to be unreachable) or accept
it (if the node later becomes available again).

In other words, the handling of a request for a node with connection
problems will now be the same regardless of whether it is detected
on startup or while running.

Change-Id: I61565e9864efcfc81a205318d30972c25d4ea880
This commit is contained in:
James E. Blair
2022-09-12 10:17:31 -07:00
parent f670c53a56
commit 1724d3a60f
4 changed files with 112 additions and 4 deletions

View File

@@ -160,6 +160,12 @@ class Provider(ProviderNotifications):
The class or instance attribute **name** must be provided as a string.
"""
# This is only intended for the static driver. It indicates
# whether we should pause the provider when we are at quota. This
# should almost always be true.
PAUSE_AT_QUOTA = True
# NOTE(review): pure pass-through constructor; Provider participates in
# cooperative multiple inheritance, so this just forwards to the next
# class in the MRO without touching any state of its own.
def __init__(self, *args, **kw):
    super().__init__(*args, **kw)
@@ -495,12 +501,18 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
self.log.info(
"Not enough quota remaining to satisfy request")
if not self.paused:
self.log.debug(
"Pausing request handling to satisfy request")
self.paused = True
self.zk.deleteOldestUnusedNode(self.provider.name,
self.pool.name)
if self.manager.PAUSE_AT_QUOTA:
if not self.paused:
self.log.debug(
"Pausing request handling to satisfy request")
self.paused = True
else:
# Release the request so that we or another
# provider can try again when the label quota
# is available.
self._declinedHandlerCleanup()
return
if self.paused:

View File

@@ -48,6 +48,11 @@ class StaticNodeProvider(Provider, QuotaSupport):
log = logging.getLogger("nodepool.driver.static."
"StaticNodeProvider")
# Do not pause the provider when we are at quota so that requests
# for different labels do not interfere with each other (resources
# are not fungible in the static provider).
PAUSE_AT_QUOTA = False
def __init__(self, provider, *args):
self.provider = provider
# Lock to avoid data races when registering nodes from

View File

@@ -0,0 +1,36 @@
# Test fixture (static-two-node.yaml): a static provider with two
# single-label nodes, used to verify that one node going offline does
# not block request handling for the other label.
zookeeper-servers:
  - host: {zookeeper_host}
    port: {zookeeper_port}
    chroot: {zookeeper_chroot}

zookeeper-tls:
  ca: {zookeeper_ca}
  cert: {zookeeper_cert}
  key: {zookeeper_key}

labels:
  - name: fake-label
  - name: fake-label2

providers:
  - name: static-provider
    driver: static
    pools:
      - name: main
        node-attributes:
          key1: value1
          key2: value2
        nodes:
          # Host-key checking enabled (default): this node can be taken
          # "offline" in tests by making the nodescan fail.
          - name: fake-host-1
            labels: fake-label
            host-key: ssh-rsa FAKEKEY
            timeout: 13
            connection-port: 22022
            username: zuul
          # Host-key checking disabled: this node never fails the
          # liveness check, so its label stays available.
          - name: fake-host-2
            labels: fake-label2
            host-key: ssh-rsa FAKEKEY
            timeout: 13
            connection-port: 22022
            username: zuul
            host-key-checking: false

View File

@@ -549,6 +549,61 @@ class TestDriverStatic(tests.DBTestCase):
self.assertEqual(len(req.nodes), 1)
self.assertNotEqual(req.nodes[0], nodes[0].id)
def test_liveness_two_node(self):
    '''
    Test that a node going offline doesn't block a static provider

    fake-host-1 (fake-label) performs host-key checks and is taken
    offline by failing the nodescan; fake-host-2 (fake-label2) has
    host-key-checking disabled and so never goes offline.  A request
    for the offline label must not pause the provider: the request
    for the healthy label must still be fulfilled, and the first
    request must complete once the node comes back online.
    '''
    configfile = self.setup_config('static-two-node.yaml')
    pool = self.useNodepool(configfile, watermark_sleep=1)
    pool.start()
    nodes1 = self.waitForNodes('fake-label')
    self.assertEqual(len(nodes1), 1)
    nodes2 = self.waitForNodes('fake-label2')
    self.assertEqual(len(nodes2), 1)

    # We will take this node offline later
    req1 = zk.NodeRequest()
    req1.state = zk.REQUESTED
    req1.node_types.append('fake-label')
    # This node does not perform host checks so never goes offline
    req2 = zk.NodeRequest()
    req2.state = zk.REQUESTED
    req2.node_types.append('fake-label2')

    # Take the first node offline then submit a node request for
    # it (which should be accepted but then released once the
    # driver fails the liveness check).
    with mock.patch("nodepool.nodeutils.nodescan") as nodescan_mock:
        nodescan_mock.side_effect = OSError
        self.zk.storeNodeRequest(req1)
        self.waitForNodeDeletion(nodes1[0])

        # We know the first request has been accepted and
        # processed since the node was deleted.  At this point,
        # submit the second request.
        self.zk.storeNodeRequest(req2)
        self.log.debug("Waiting for request %s", req2.id)
        req2 = self.waitForNodeRequest(req2)
        # The second request is fulfilled; get the status of the
        # first request at the same time.
        req1 = self.zk.getNodeRequest(req1.id)

    # Verify that the first request was not fulfilled at the time
    # that the second was.
    self.assertEqual(req1.state, zk.REQUESTED)
    self.assertEqual(req2.state, zk.FULFILLED)
    self.assertEqual(len(req2.nodes), 1)
    self.assertNotEqual(req2.nodes[0], nodes2[0].id)

    # Now let req1 complete with the node online (the nodescan mock
    # is no longer in effect, so the liveness check succeeds again).
    self.log.debug("Waiting for request %s", req1.id)
    req1 = self.waitForNodeRequest(req1)
    self.assertEqual(req1.state, zk.FULFILLED)
    self.assertEqual(len(req1.nodes), 1)
    self.assertNotEqual(req1.nodes[0], nodes1[0].id)
def test_host_key_checking_toggle(self):
"""Test that host key checking can be disabled"""
configfile = self.setup_config('static-no-check.yaml')