From 8eb099283acbd05750c1baa1f8281a3be3ec6b49 Mon Sep 17 00:00:00 2001 From: Tristan Cacqueray Date: Tue, 19 Jun 2018 00:36:10 +0000 Subject: [PATCH] zk: skip node already being deleted in cleanup leaked instance task A cloud may fails to delete a node, and in this case we need to check if we are not already trying to delete the node. Otherwise the launcher keeps on adding artifical node. Change-Id: Ic5dfc75df771a5f312099ee82f82e2561f5f4829 --- nodepool/launcher.py | 12 +++++ .../tests/fixtures/node_delete_error.yaml | 49 +++++++++++++++++++ nodepool/tests/test_launcher.py | 41 ++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 nodepool/tests/fixtures/node_delete_error.yaml diff --git a/nodepool/launcher.py b/nodepool/launcher.py index be7e1d5c4..4b18cb54e 100755 --- a/nodepool/launcher.py +++ b/nodepool/launcher.py @@ -471,6 +471,13 @@ class CleanupWorker(BaseCleanupWorker): ''' zk_conn = self._nodepool.getZK() + deleting_nodes = {} + for node in zk_conn.nodeIterator(): + if node.state == zk.DELETING: + if node.provider not in deleting_nodes: + deleting_nodes[node.provider] = [] + deleting_nodes[node.provider].append(node.external_id) + for provider in self._nodepool.config.providers.values(): manager = self._nodepool.getProviderManager(provider.name) @@ -485,6 +492,11 @@ class CleanupWorker(BaseCleanupWorker): # with a different name, owns this. continue + if (provider.name in deleting_nodes and + server.id in deleting_nodes[provider.name]): + # Already deleting this node + continue + if not zk_conn.getNode(meta['nodepool_node_id']): self.log.warning( "Marking for delete leaked instance %s (%s) in %s " diff --git a/nodepool/tests/fixtures/node_delete_error.yaml b/nodepool/tests/fixtures/node_delete_error.yaml new file mode 100644 index 000000000..1266e65e6 --- /dev/null +++ b/nodepool/tests/fixtures/node_delete_error.yaml @@ -0,0 +1,49 @@ +elements-dir: . +images-dir: '{images_dir}' +build-log-dir: '{build_log_dir}' +build-log-retention: 1 + +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +labels: + - name: fake-label + min-ready: 0 + +providers: + - name: fake-provider + cloud: fake + driver: fake + region-name: fake-region + rate: 0.0001 + diskimages: + - name: fake-image + meta: + key: value + key2: value + pools: + - name: main + max-servers: 96 + availability-zones: + - az1 + networks: + - net-name + labels: + - name: fake-label + diskimage: fake-image + min-ram: 8192 + flavor-name: 'Fake' + +diskimages: + - name: fake-image + elements: + - fedora + - vm + release: 21 + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/test_launcher.py b/nodepool/tests/test_launcher.py index 63aa245c0..564a1e1a4 100644 --- a/nodepool/tests/test_launcher.py +++ b/nodepool/tests/test_launcher.py @@ -677,6 +677,47 @@ class TestLauncher(tests.DBTestCase): self.assertEqual(len(nodes), 1) self.assertEqual(nodes[0].provider, 'fake-provider') + def test_node_delete_error(self): + def error_delete(self, name): + # Set ERROR status instead of deleting the node + self._getClient()._server_list[0].status = 'ERROR' + + self.useFixture(fixtures.MockPatchObject( + fakeprovider.FakeProvider, 'deleteServer', error_delete)) + + configfile = self.setup_config('node_delete_error.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.useBuilder(configfile) + pool.start() + self.waitForImage('fake-provider', 'fake-image') + + # request a node + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('fake-label') + self.zk.storeNodeRequest(req) + self.log.debug("Wait for request") + req = self.waitForNodeRequest(req) + self.assertEqual(req.state, zk.FULFILLED) + + self.assertEqual(len(req.nodes), 1) + + # remove the node from db + self.log.debug("deleting node %s", req.nodes[0]) + node = self.zk.getNode(req.nodes[0]) + self.zk.deleteNode(node) + + # wait the cleanup thread to kick in + time.sleep(5) + zk_nodes = self.zk.getNodes() + self.assertEqual(len(zk_nodes), 1) + node = self.zk.getNode(zk_nodes[0]) + self.assertEqual(node.state, zk.DELETING) + + # remove error nodes + pool.getProviderManager( + 'fake-provider')._getClient()._server_list.clear() + def test_leaked_node(self): """Test that a leaked node is deleted""" configfile = self.setup_config('leaked_node.yaml')