zk: skip node already being deleted in cleanup leaked instance task
A cloud may fail to delete a node, and in this case we need to check whether we are not already trying to delete the node. Otherwise the launcher keeps adding artificial nodes. Change-Id: Ic5dfc75df771a5f312099ee82f82e2561f5f4829
This commit is contained in:
parent
58ccd2c718
commit
8eb099283a
|
@ -471,6 +471,13 @@ class CleanupWorker(BaseCleanupWorker):
|
||||||
'''
|
'''
|
||||||
zk_conn = self._nodepool.getZK()
|
zk_conn = self._nodepool.getZK()
|
||||||
|
|
||||||
|
deleting_nodes = {}
|
||||||
|
for node in zk_conn.nodeIterator():
|
||||||
|
if node.state == zk.DELETING:
|
||||||
|
if node.provider not in deleting_nodes:
|
||||||
|
deleting_nodes[node.provider] = []
|
||||||
|
deleting_nodes[node.provider].append(node.external_id)
|
||||||
|
|
||||||
for provider in self._nodepool.config.providers.values():
|
for provider in self._nodepool.config.providers.values():
|
||||||
manager = self._nodepool.getProviderManager(provider.name)
|
manager = self._nodepool.getProviderManager(provider.name)
|
||||||
|
|
||||||
|
@ -485,6 +492,11 @@ class CleanupWorker(BaseCleanupWorker):
|
||||||
# with a different name, owns this.
|
# with a different name, owns this.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if (provider.name in deleting_nodes and
|
||||||
|
server.id in deleting_nodes[provider.name]):
|
||||||
|
# Already deleting this node
|
||||||
|
continue
|
||||||
|
|
||||||
if not zk_conn.getNode(meta['nodepool_node_id']):
|
if not zk_conn.getNode(meta['nodepool_node_id']):
|
||||||
self.log.warning(
|
self.log.warning(
|
||||||
"Marking for delete leaked instance %s (%s) in %s "
|
"Marking for delete leaked instance %s (%s) in %s "
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
elements-dir: .
|
||||||
|
images-dir: '{images_dir}'
|
||||||
|
build-log-dir: '{build_log_dir}'
|
||||||
|
build-log-retention: 1
|
||||||
|
|
||||||
|
zookeeper-servers:
|
||||||
|
- host: {zookeeper_host}
|
||||||
|
port: {zookeeper_port}
|
||||||
|
chroot: {zookeeper_chroot}
|
||||||
|
|
||||||
|
labels:
|
||||||
|
- name: fake-label
|
||||||
|
min-ready: 0
|
||||||
|
|
||||||
|
providers:
|
||||||
|
- name: fake-provider
|
||||||
|
cloud: fake
|
||||||
|
driver: fake
|
||||||
|
region-name: fake-region
|
||||||
|
rate: 0.0001
|
||||||
|
diskimages:
|
||||||
|
- name: fake-image
|
||||||
|
meta:
|
||||||
|
key: value
|
||||||
|
key2: value
|
||||||
|
pools:
|
||||||
|
- name: main
|
||||||
|
max-servers: 96
|
||||||
|
availability-zones:
|
||||||
|
- az1
|
||||||
|
networks:
|
||||||
|
- net-name
|
||||||
|
labels:
|
||||||
|
- name: fake-label
|
||||||
|
diskimage: fake-image
|
||||||
|
min-ram: 8192
|
||||||
|
flavor-name: 'Fake'
|
||||||
|
|
||||||
|
diskimages:
|
||||||
|
- name: fake-image
|
||||||
|
elements:
|
||||||
|
- fedora
|
||||||
|
- vm
|
||||||
|
release: 21
|
||||||
|
env-vars:
|
||||||
|
TMPDIR: /opt/dib_tmp
|
||||||
|
DIB_IMAGE_CACHE: /opt/dib_cache
|
||||||
|
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
|
||||||
|
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
|
|
@ -677,6 +677,47 @@ class TestLauncher(tests.DBTestCase):
|
||||||
self.assertEqual(len(nodes), 1)
|
self.assertEqual(len(nodes), 1)
|
||||||
self.assertEqual(nodes[0].provider, 'fake-provider')
|
self.assertEqual(nodes[0].provider, 'fake-provider')
|
||||||
|
|
||||||
|
def test_node_delete_error(self):
|
||||||
|
def error_delete(self, name):
|
||||||
|
# Set ERROR status instead of deleting the node
|
||||||
|
self._getClient()._server_list[0].status = 'ERROR'
|
||||||
|
|
||||||
|
self.useFixture(fixtures.MockPatchObject(
|
||||||
|
fakeprovider.FakeProvider, 'deleteServer', error_delete))
|
||||||
|
|
||||||
|
configfile = self.setup_config('node_delete_error.yaml')
|
||||||
|
pool = self.useNodepool(configfile, watermark_sleep=1)
|
||||||
|
self.useBuilder(configfile)
|
||||||
|
pool.start()
|
||||||
|
self.waitForImage('fake-provider', 'fake-image')
|
||||||
|
|
||||||
|
# request a node
|
||||||
|
req = zk.NodeRequest()
|
||||||
|
req.state = zk.REQUESTED
|
||||||
|
req.node_types.append('fake-label')
|
||||||
|
self.zk.storeNodeRequest(req)
|
||||||
|
self.log.debug("Wait for request")
|
||||||
|
req = self.waitForNodeRequest(req)
|
||||||
|
self.assertEqual(req.state, zk.FULFILLED)
|
||||||
|
|
||||||
|
self.assertEqual(len(req.nodes), 1)
|
||||||
|
|
||||||
|
# remove the node from db
|
||||||
|
self.log.debug("deleting node %s", req.nodes[0])
|
||||||
|
node = self.zk.getNode(req.nodes[0])
|
||||||
|
self.zk.deleteNode(node)
|
||||||
|
|
||||||
|
# wait the cleanup thread to kick in
|
||||||
|
time.sleep(5)
|
||||||
|
zk_nodes = self.zk.getNodes()
|
||||||
|
self.assertEqual(len(zk_nodes), 1)
|
||||||
|
node = self.zk.getNode(zk_nodes[0])
|
||||||
|
self.assertEqual(node.state, zk.DELETING)
|
||||||
|
|
||||||
|
# remove error nodes
|
||||||
|
pool.getProviderManager(
|
||||||
|
'fake-provider')._getClient()._server_list.clear()
|
||||||
|
|
||||||
def test_leaked_node(self):
|
def test_leaked_node(self):
|
||||||
"""Test that a leaked node is deleted"""
|
"""Test that a leaked node is deleted"""
|
||||||
configfile = self.setup_config('leaked_node.yaml')
|
configfile = self.setup_config('leaked_node.yaml')
|
||||||
|
|
Loading…
Reference in New Issue