Don't pause the static provider at quota
The static provider avoids accepting requests for a label when all the nodes of that label are busy. In other words, it only accepts requests when it expects to be able to fulfill them. However, it also checks whether a node is reachable before fulfilling a request. And if the reachability of a node changes before a request is fulfilled, then Nodepool will pause the handler until the request is fulfilled. If the node never comes back online, no other nodes in the provider will be handled. In general, we should never pause the static provider since pausing is designed to allow for the accumulation of sufficient free resources to fulfill the next request. Static provider resources are not fungible, so pausing until label A is available doesn't make label B any less available. To correct the problem, we no longer pause the static provider when there is insufficient quota; instead we simply release the current node request without declining it, and then allow the provider to either defer it (if the node continues to be unreachable) or accept it (if the node later becomes available again). In other words, the handling of a request for a node with connection problems will now be the same regardless of whether it is detected on startup or while running. Change-Id: I61565e9864efcfc81a205318d30972c25d4ea880
This commit is contained in:
@@ -160,6 +160,12 @@ class Provider(ProviderNotifications):
|
||||
The class or instance attribute **name** must be provided as a string.
|
||||
|
||||
"""
|
||||
|
||||
# This is only intended for the static driver. It indicates
|
||||
# whether we should pause the provider when we are at quota. This
|
||||
# should almost always be true.
|
||||
PAUSE_AT_QUOTA = True
|
||||
|
||||
def __init__(self, *args, **kw):
|
||||
super().__init__(*args, **kw)
|
||||
|
||||
@@ -495,12 +501,18 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
|
||||
|
||||
self.log.info(
|
||||
"Not enough quota remaining to satisfy request")
|
||||
if not self.paused:
|
||||
self.log.debug(
|
||||
"Pausing request handling to satisfy request")
|
||||
self.paused = True
|
||||
self.zk.deleteOldestUnusedNode(self.provider.name,
|
||||
self.pool.name)
|
||||
if self.manager.PAUSE_AT_QUOTA:
|
||||
if not self.paused:
|
||||
self.log.debug(
|
||||
"Pausing request handling to satisfy request")
|
||||
self.paused = True
|
||||
else:
|
||||
# Release the request so that we or another
|
||||
# provider can try again when the label quota
|
||||
# is available.
|
||||
self._declinedHandlerCleanup()
|
||||
return
|
||||
|
||||
if self.paused:
|
||||
|
||||
@@ -48,6 +48,11 @@ class StaticNodeProvider(Provider, QuotaSupport):
|
||||
log = logging.getLogger("nodepool.driver.static."
|
||||
"StaticNodeProvider")
|
||||
|
||||
# Do not pause the provider when we are at quota so that requests
|
||||
# for different labels do not interfere with each other (resources
|
||||
# are not fungible in the static provider).
|
||||
PAUSE_AT_QUOTA = False
|
||||
|
||||
def __init__(self, provider, *args):
|
||||
self.provider = provider
|
||||
# Lock to avoid data races when registering nodes from
|
||||
|
||||
36
nodepool/tests/fixtures/static-two-node.yaml
vendored
Normal file
36
nodepool/tests/fixtures/static-two-node.yaml
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
# Fixture: a static provider with two single-label nodes. The second node
# disables host-key-checking so it stays reachable while the first node is
# taken "offline" in tests (used to verify that one unreachable label does
# not block requests for the other).
zookeeper-servers:
  - host: {zookeeper_host}
    port: {zookeeper_port}
    chroot: {zookeeper_chroot}

zookeeper-tls:
  ca: {zookeeper_ca}
  cert: {zookeeper_cert}
  key: {zookeeper_key}

labels:
  - name: fake-label
  - name: fake-label2

providers:
  - name: static-provider
    driver: static
    pools:
      - name: main
        node-attributes:
          key1: value1
          key2: value2
        nodes:
          - name: fake-host-1
            labels: fake-label
            host-key: ssh-rsa FAKEKEY
            timeout: 13
            connection-port: 22022
            username: zuul
          - name: fake-host-2
            labels: fake-label2
            host-key: ssh-rsa FAKEKEY
            timeout: 13
            connection-port: 22022
            username: zuul
            # Skip the liveness/host-key scan for this node so it can
            # always be fulfilled, even while fake-host-1 is offline.
            host-key-checking: false
|
||||
@@ -549,6 +549,61 @@ class TestDriverStatic(tests.DBTestCase):
|
||||
self.assertEqual(len(req.nodes), 1)
|
||||
self.assertNotEqual(req.nodes[0], nodes[0].id)
|
||||
|
||||
def test_liveness_two_node(self):
    '''
    Test that a node going offline doesn't block a static provider

    With PAUSE_AT_QUOTA disabled for the static driver, a request for
    an unreachable node (fake-label) should be released rather than
    pausing the handler, so a request for a different, reachable node
    (fake-label2) can still be fulfilled.
    '''
    configfile = self.setup_config('static-two-node.yaml')
    pool = self.useNodepool(configfile, watermark_sleep=1)
    pool.start()
    nodes1 = self.waitForNodes('fake-label')
    self.assertEqual(len(nodes1), 1)
    nodes2 = self.waitForNodes('fake-label2')
    self.assertEqual(len(nodes2), 1)

    # We will take this node offline later
    req1 = zk.NodeRequest()
    req1.state = zk.REQUESTED
    req1.node_types.append('fake-label')

    # This node does not perform host checks so never goes offline
    req2 = zk.NodeRequest()
    req2.state = zk.REQUESTED
    req2.node_types.append('fake-label2')

    # Take the first node offline then submit a node request for
    # it (which should be accepted but then released once the
    # driver fails the liveness check).
    with mock.patch("nodepool.nodeutils.nodescan") as nodescan_mock:
        nodescan_mock.side_effect = OSError
        self.zk.storeNodeRequest(req1)
        self.waitForNodeDeletion(nodes1[0])

        # We know the first request has been accepted and
        # processed since the node was deleted.  At this point,
        # submit the second request.
        self.zk.storeNodeRequest(req2)

        self.log.debug("Waiting for request %s", req2.id)
        req2 = self.waitForNodeRequest(req2)
        # The second request is fulfilled; get the status of the
        # first request at the same time.
        req1 = self.zk.getNodeRequest(req1.id)

        # Verify that the first request was not fulfilled at the time
        # that the second was.
        self.assertEqual(req1.state, zk.REQUESTED)
        self.assertEqual(req2.state, zk.FULFILLED)
        self.assertEqual(len(req2.nodes), 1)
        self.assertNotEqual(req2.nodes[0], nodes2[0].id)

    # Now let req1 complete with the node online.
    self.log.debug("Waiting for request %s", req1.id)
    req1 = self.waitForNodeRequest(req1)
    self.assertEqual(req1.state, zk.FULFILLED)
    self.assertEqual(len(req1.nodes), 1)
    self.assertNotEqual(req1.nodes[0], nodes1[0].id)
|
||||
|
||||
def test_host_key_checking_toggle(self):
|
||||
"""Test that host key checking can be disabled"""
|
||||
configfile = self.setup_config('static-no-check.yaml')
|
||||
|
||||
Reference in New Issue
Block a user