From 1658aa9851ad67a1f966be0a5e52287d4b2a12d6 Mon Sep 17 00:00:00 2001 From: mbecker Date: Mon, 22 Aug 2022 16:30:36 +0200 Subject: [PATCH] Add hold command to disable nodes This allows nodes to be set in an idle state so that they will not have jobs scheduled while e.g. maintenance tasks are performed. This is probably most useful for static nodes. Change-Id: Iebc6b909f370fca11fab2be0b8805d4daef33afe --- doc/source/operation.rst | 5 + nodepool/cmd/nodepoolcmd.py | 26 ++++- nodepool/tests/__init__.py | 4 +- nodepool/tests/unit/test_commands.py | 156 +++++++++++++++++++++++++++ 4 files changed, 188 insertions(+), 3 deletions(-) diff --git a/doc/source/operation.rst b/doc/source/operation.rst index 2a680e99c..f85e4ec2a 100644 --- a/doc/source/operation.rst +++ b/doc/source/operation.rst @@ -186,6 +186,11 @@ delete .. program-output:: nodepool delete --help :nostderr: +hold +^^^^ +.. program-output:: nodepool hold --help + :nostderr: + The following subcommands deal with ZooKeeper data management: info diff --git a/nodepool/cmd/nodepoolcmd.py b/nodepool/cmd/nodepoolcmd.py index 65e3f4257..0265c2c97 100644 --- a/nodepool/cmd/nodepoolcmd.py +++ b/nodepool/cmd/nodepoolcmd.py @@ -88,6 +88,13 @@ class NodePoolCmd(NodepoolApp): action='store_true', help='delete the node in the foreground') + cmd_hold = subparsers.add_parser( + 'hold', + help='place a node in the HOLD state ' + 'e.g. for running maintenance tasks') + cmd_hold.set_defaults(func=self.hold) + cmd_hold.add_argument('id', help='node id') + cmd_image_delete = subparsers.add_parser( 'image-delete', help='delete an image') @@ -303,6 +310,23 @@ class NodePoolCmd(NodepoolApp): self.list(node_id=node.id) + def _change_node_state(self, new_state): + node = self.zk.getNode(self.args.id) + if not node: + print("Node id %s not found" % self.args.id) + return + + self.zk.lockNode(node, blocking=True, timeout=5) + + node.state = new_state + self.zk.storeNode(node) + self.zk.unlockNode(node) + + self.list(node_id=node.id) + + def hold(self): + self._change_node_state(zk.HOLD) + def dib_image_delete(self): (image, build_num) = self.args.id.rsplit('-', 1) build = self.zk.getBuild(image, build_num) @@ -434,7 +458,7 @@ class NodePoolCmd(NodepoolApp): 'image-status', 'image-list', 'dib-image-delete', 'image-delete', 'alien-image-list', - 'list', 'delete', + 'list', 'delete', 'hold', 'request-list', 'info', 'erase', 'image-pause', 'image-unpause', 'export-image-data', 'import-image-data'): diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 925cb8b20..347d27617 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -618,13 +618,13 @@ class DBTestCase(BaseTestCase): if node.state == state: return node - def waitForNodeRequest(self, req, states=None): + def waitForNodeRequest(self, req, states=None, max_time=ONE_MINUTE): ''' Wait for a node request to transition to a final state. ''' if states is None: states = (zk.FULFILLED, zk.FAILED) - for _ in iterate_timeout(ONE_MINUTE, Exception, + for _ in iterate_timeout(max_time, Exception, "Node request state transition", interval=1): req = self.zk.getNodeRequest(req.id) diff --git a/nodepool/tests/unit/test_commands.py b/nodepool/tests/unit/test_commands.py index a993402c7..c08851206 100644 --- a/nodepool/tests/unit/test_commands.py +++ b/nodepool/tests/unit/test_commands.py @@ -22,6 +22,7 @@ import fixtures import mock import testtools +from nodepool import exceptions as npe from nodepool.cmd import nodepoolcmd from nodepool import tests from nodepool.zk import zookeeper as zk @@ -350,6 +351,161 @@ class TestNodepoolCMD(tests.DBTestCase): # Assert the node is gone self.assert_listed(configfile, ['list'], 0, nodes[0].id, 0) + def test_hold(self): + configfile = self.setup_config('node.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.useBuilder(configfile) + pool.start() + self.waitForImage('fake-provider', 'fake-image') + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) + + # Assert one node exists and it is nodes[0].id in a ready state. + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1) + self.assert_nodes_listed(configfile, 1, zk.READY) + + # Hold node + self.patch_argv('-c', configfile, 'hold', nodes[0].id) + nodepoolcmd.main() + + # Assert the node is on hold + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1) + self.assert_nodes_listed(configfile, 1, zk.HOLD) + + # Re-enable node by deleting + old_node_id = nodes[0].id + self.patch_argv('-c', configfile, 'delete', nodes[0].id) + nodepoolcmd.main() + + # Assert that the node is ready + self.waitForNodeDeletion(nodes[0]) + new_nodes = self.waitForNodes('fake-label') + self.assertEqual(len(new_nodes), 1) + self.assert_listed(configfile, ['list'], 0, new_nodes[0].id, 1) + self.assert_nodes_listed(configfile, 1, zk.READY) + self.assertNotEqual(old_node_id, new_nodes[0].id) + + # Request a node + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.assertEqual(len(req.nodes), 0) + self.zk.storeNodeRequest(req) + + self.log.debug("Waiting for request %s", req.id) + req = self.waitForNodeRequest(req, (zk.FULFILLED,)) + self.assertEqual(len(req.nodes), 1) + + def test_attempt_hold_busy_node(self): + configfile = self.setup_config('node.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.useBuilder(configfile) + pool.start() + self.waitForImage('fake-provider', 'fake-image') + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) + + # Assert one node exists and it is nodes[0].id in a ready state. + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1) + self.assert_nodes_listed(configfile, 1, zk.READY) + + # Request a node + req1 = zk.NodeRequest() + req1.state = zk.REQUESTED + req1.node_types.append('fake-label') + self.zk.storeNodeRequest(req1) + + # Wait for node request + self.log.debug("Waiting for 1st request %s", req1.id) + req1 = self.waitForNodeRequest(req1, (zk.FULFILLED,)) + self.assertEqual(len(req1.nodes), 1) + + # Lock node and set it as in-use + node = self.zk.getNode(req1.nodes[0]) + self.zk.lockNode(node, blocking=False) + node.state = zk.IN_USE + self.zk.storeNode(node) + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1) + self.assert_nodes_listed(configfile, 1, zk.IN_USE) + + # Attempt to hold the node, this should fail + # since another process holds the lock + with testtools.ExpectedException(npe.TimeoutException): + self.patch_argv('-c', configfile, 'hold', nodes[0].id) + nodepoolcmd.main() + + def test_attempt_request_held_static_node(self): + configfile = self.setup_config('static-basic.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + pool.start() + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) + + # Assert one node exists and it is nodes[0].id in a ready state. + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1) + self.assert_nodes_listed(configfile, 1, zk.READY) + + # Hold node + self.patch_argv('-c', configfile, 'hold', nodes[0].id) + nodepoolcmd.main() + + # Assert the node is on HOLD + self.assertEqual(len(nodes), 1) + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1) + self.assert_nodes_listed(configfile, 1, zk.HOLD) + + # Prepare node request + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) + + # Make a node request + # Expect to timeout since the node is not ready + self.log.debug("Waiting for request %s", req.id) + req = self.zk.getNodeRequest(req.id) + + with testtools.ExpectedException(Exception): + req = self.waitForNodeRequest(req, (zk.FULFILLED,), max_time=30) + + self.assertEqual(len(req.nodes), 0) + + def test_attempt_request_held_node(self): + configfile = self.setup_config('node.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.useBuilder(configfile) + pool.start() + self.waitForImage('fake-provider', 'fake-image') + nodes = self.waitForNodes('fake-label') + self.assertEqual(len(nodes), 1) + + # Assert one node exists and it is nodes[0].id in a ready state. + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1) + self.assert_nodes_listed(configfile, 1, zk.READY) + + # Hold node + self.patch_argv('-c', configfile, 'hold', nodes[0].id) + nodepoolcmd.main() + + # Assert the node is on HOLD + self.assertEqual(len(nodes), 1) + self.assert_listed(configfile, ['list'], 0, nodes[0].id, 1) + self.assert_nodes_listed(configfile, 1, zk.HOLD) + + # Prepare node request + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) + + # Make a node request + self.log.debug("Waiting for request %s", req.id) + req = self.waitForNodeRequest(req, (zk.FULFILLED,)) + + # Make sure we did not assign the held node + # but another node as long as the quota is not reached + self.assertNotEqual(nodes[0].id, req.nodes[0]) + def test_image_build(self): configfile = self.setup_config('node.yaml') self.useBuilder(configfile)