From 1724d3a60f7b9d87f26c57f166367e9912f52f13 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 12 Sep 2022 10:17:31 -0700 Subject: [PATCH] Don't pause the static provider at quota The static provider avoids accepting requests for a label when all the nodes of that label are busy. In other words, it only accepts requests when it expects to be able to fulfill them. However, it also checks whether a node is reachable before fulfilling a request. And if the reachability of a node changes before a request is fulfilled, then Nodepool will pause the handler until the request is fulfilled. If the node never comes back online, no other nodes in the provider will be handled. In general, we should never pause the static provider since pausing is designed to allow for the accumulation of sufficient free resources to fulfill the next request. Static provider resources are not fungible, so pausing until label A is available doesn't make label B any less available. To correct the problem, we no longer pause the static provider when there is insufficient quota; instead we simply release the current node request without declining it, and then allow the provider to either defer it (if the node continues to be unreachable) or accept it (if the node later becomes available again). In other words, the handling of a request for a node with connection problems will now be the same regardless of whether it is detected on startup or while running. 
Change-Id: I61565e9864efcfc81a205318d30972c25d4ea880 --- nodepool/driver/__init__.py | 20 +++++-- nodepool/driver/static/provider.py | 5 ++ nodepool/tests/fixtures/static-two-node.yaml | 36 +++++++++++++ nodepool/tests/unit/test_driver_static.py | 55 ++++++++++++++++++++ 4 files changed, 112 insertions(+), 4 deletions(-) create mode 100644 nodepool/tests/fixtures/static-two-node.yaml diff --git a/nodepool/driver/__init__.py b/nodepool/driver/__init__.py index eee016285..6eedf673b 100644 --- a/nodepool/driver/__init__.py +++ b/nodepool/driver/__init__.py @@ -160,6 +160,12 @@ class Provider(ProviderNotifications): The class or instance attribute **name** must be provided as a string. """ + + # This is only intended for the static driver. It indicates + # whether we should pause the provider when we are at quota. This + # should almost always be true. + PAUSE_AT_QUOTA = True + def __init__(self, *args, **kw): super().__init__(*args, **kw) @@ -495,12 +501,18 @@ class NodeRequestHandler(NodeRequestHandlerNotifications, self.log.info( "Not enough quota remaining to satisfy request") - if not self.paused: - self.log.debug( - "Pausing request handling to satisfy request") - self.paused = True self.zk.deleteOldestUnusedNode(self.provider.name, self.pool.name) + if self.manager.PAUSE_AT_QUOTA: + if not self.paused: + self.log.debug( + "Pausing request handling to satisfy request") + self.paused = True + else: + # Release the request so that we or another + # provider can try again when the label quota + # is available. + self._declinedHandlerCleanup() return if self.paused: diff --git a/nodepool/driver/static/provider.py b/nodepool/driver/static/provider.py index e7348f4a6..c8e10f278 100644 --- a/nodepool/driver/static/provider.py +++ b/nodepool/driver/static/provider.py @@ -48,6 +48,11 @@ class StaticNodeProvider(Provider, QuotaSupport): log = logging.getLogger("nodepool.driver.static." 
"StaticNodeProvider") + # Do not pause the provider when we are at quota so that requests + # for different labels do not interfere with each other (resources + # are not fungible in the static provider). + PAUSE_AT_QUOTA = False + def __init__(self, provider, *args): self.provider = provider # Lock to avoid data races when registering nodes from diff --git a/nodepool/tests/fixtures/static-two-node.yaml b/nodepool/tests/fixtures/static-two-node.yaml new file mode 100644 index 000000000..b551f35e0 --- /dev/null +++ b/nodepool/tests/fixtures/static-two-node.yaml @@ -0,0 +1,36 @@ +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +zookeeper-tls: + ca: {zookeeper_ca} + cert: {zookeeper_cert} + key: {zookeeper_key} + +labels: + - name: fake-label + - name: fake-label2 + +providers: + - name: static-provider + driver: static + pools: + - name: main + node-attributes: + key1: value1 + key2: value2 + nodes: + - name: fake-host-1 + labels: fake-label + host-key: ssh-rsa FAKEKEY + timeout: 13 + connection-port: 22022 + username: zuul + - name: fake-host-2 + labels: fake-label2 + host-key: ssh-rsa FAKEKEY + timeout: 13 + connection-port: 22022 + username: zuul + host-key-checking: false diff --git a/nodepool/tests/unit/test_driver_static.py b/nodepool/tests/unit/test_driver_static.py index 8523c9222..b5de5d44d 100644 --- a/nodepool/tests/unit/test_driver_static.py +++ b/nodepool/tests/unit/test_driver_static.py @@ -549,6 +549,61 @@ class TestDriverStatic(tests.DBTestCase): self.assertEqual(len(req.nodes), 1) self.assertNotEqual(req.nodes[0], nodes[0].id) + def test_liveness_two_node(self): + ''' + Test that a node going offline doesn't block a static provider + ''' + configfile = self.setup_config('static-two-node.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + pool.start() + nodes1 = self.waitForNodes('fake-label') + self.assertEqual(len(nodes1), 1) + nodes2 = self.waitForNodes('fake-label2') + 
self.assertEqual(len(nodes2), 1) + + # We will take this node offline later + req1 = zk.NodeRequest() + req1.state = zk.REQUESTED + req1.node_types.append('fake-label') + + # This node does not perform host checks so never goes offline + req2 = zk.NodeRequest() + req2.state = zk.REQUESTED + req2.node_types.append('fake-label2') + + # Take the first node offline then submit a node request for + # it (which should be accepted but then released once the + # driver fails the liveness check). + with mock.patch("nodepool.nodeutils.nodescan") as nodescan_mock: + nodescan_mock.side_effect = OSError + self.zk.storeNodeRequest(req1) + self.waitForNodeDeletion(nodes1[0]) + + # We know the first request has been accepted and + # processed since the node was deleted. At this point, + # submit the second request. + self.zk.storeNodeRequest(req2) + + self.log.debug("Waiting for request %s", req2.id) + req2 = self.waitForNodeRequest(req2) + # The second request is fulfilled; get the status of the + # first request at the same time. + req1 = self.zk.getNodeRequest(req1.id) + + # Verify that the first request was not fulfilled at the time + # that the second was. + self.assertEqual(req1.state, zk.REQUESTED) + self.assertEqual(req2.state, zk.FULFILLED) + self.assertEqual(len(req2.nodes), 1) + self.assertNotEqual(req2.nodes[0], nodes2[0].id) + + # Now let req1 complete with the node online. + self.log.debug("Waiting for request %s", req1.id) + req1 = self.waitForNodeRequest(req1) + self.assertEqual(req1.state, zk.FULFILLED) + self.assertEqual(len(req1.nodes), 1) + self.assertNotEqual(req1.nodes[0], nodes1[0].id) + def test_host_key_checking_toggle(self): """Test that host key checking can be disabled""" configfile = self.setup_config('static-no-check.yaml')