Do not abort node launch if failed node cannot be deleted
There was a possibility for an uncaught exception when launching a node:
if the launch failed and the subsequent node cleanup failed as well,
nodepool would break out of the retry loop prematurely. Also add request
information when logging launch failures to make debugging easier.

Change-Id: I4a08ebe7b0ce763b6ac4bcd314403b869e487ac1
parent 7fb7d69e3b
commit ebdc35fc5e
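Before the diff itself, here is a minimal control-flow sketch of the problem being fixed. It is not nodepool code; the helpers create_server, wait_for_server, and schedule_cleanup are hypothetical stand-ins. The point is that deleting the broken instance inline inside the exception handler lets a delete error escape the retry loop, while merely recording it for later cleanup keeps the loop retrying:

import logging

log = logging.getLogger("launch-retry-sketch")


def launch_with_retries(create_server, wait_for_server, schedule_cleanup,
                        retries=3):
    """Retry a launch up to `retries` times; never let cleanup abort the loop."""
    for attempt in range(1, retries + 1):
        external_id = None
        try:
            external_id = create_server()   # instance now exists in the cloud
            wait_for_server(external_id)    # may raise (timeout, ERROR state)
            return external_id
        except Exception:
            log.exception("Launch attempt %d/%d failed:", attempt, retries)
            if external_id:
                # Old behaviour (simplified): delete the instance right here;
                # if that delete raised, the exception escaped this handler
                # and the remaining attempts never ran.  New behaviour: only
                # record the instance for a separate cleanup pass and retry.
                schedule_cleanup(external_id)
    raise Exception("Launch failed after %d attempts" % retries)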
@@ -300,6 +300,41 @@ class FakeUploadFailCloud(FakeOpenStackCloud):
         return super(FakeUploadFailCloud, self).create_image(**kwargs)
 
 
+class FakeLaunchAndDeleteFailCloud(FakeOpenStackCloud):
+    log = logging.getLogger("nodepool.FakeLaunchAndDeleteFailCloud")
+
+    def __init__(self, times_to_fail=None):
+        super(FakeLaunchAndDeleteFailCloud, self).__init__()
+        self.times_to_fail_delete = times_to_fail
+        self.times_to_fail_launch = times_to_fail
+        self.times_failed_delete = 0
+        self.times_failed_launch = 0
+        self.launch_success = False
+        self.delete_success = False
+
+    def wait_for_server(self, **kwargs):
+        if self.times_to_fail_launch is None:
+            raise Exception("Test fail server launch.")
+        if self.times_failed_launch < self.times_to_fail_launch:
+            self.times_failed_launch += 1
+            raise exceptions.ServerDeleteException("Test fail server launch.")
+        else:
+            self.launch_success = True
+            return super(FakeLaunchAndDeleteFailCloud,
+                         self).wait_for_server(**kwargs)
+
+    def delete_server(self, *args, **kwargs):
+        if self.times_to_fail_delete is None:
+            raise exceptions.ServerDeleteException("Test fail server delete.")
+        if self.times_failed_delete < self.times_to_fail_delete:
+            self.times_failed_delete += 1
+            raise exceptions.ServerDeleteException("Test fail server delete.")
+        else:
+            self.delete_success = True
+            return super(FakeLaunchAndDeleteFailCloud,
+                         self).delete_server(*args, **kwargs)
+
+
 class FakeProvider(OpenStackProvider):
     fake_cloud = FakeOpenStackCloud
 
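The fake above fails exactly times_to_fail launches and deletes before succeeding, and fails every call when times_to_fail is None. As a self-contained illustration of that counter pattern (the class name FailNTimes is hypothetical, not part of nodepool):

class FailNTimes:
    """Fail the first `times_to_fail` calls, then succeed; None means always fail."""

    def __init__(self, times_to_fail=None):
        self.times_to_fail = times_to_fail
        self.times_failed = 0
        self.succeeded = False

    def __call__(self):
        if self.times_to_fail is None:
            # No limit configured: every call fails.
            raise RuntimeError("forced failure (unlimited)")
        if self.times_failed < self.times_to_fail:
            self.times_failed += 1
            raise RuntimeError("forced failure %d" % self.times_failed)
        self.succeeded = True
        return "ok"

With times_to_fail=1, the first call raises and the second returns normally, which is the fail-once-then-succeed sequence the delete-error test further below relies on.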
@@ -232,13 +232,19 @@ class OpenStackNodeLauncher(NodeLauncher):
             except Exception as e:
                 if attempts <= self._retries:
                     self.log.exception(
-                        "Launch attempt %d/%d failed for node %s:",
-                        attempts, self._retries, self.node.id)
+                        "Request %s: Launch attempt %d/%d failed for node %s:",
+                        self.handler.request.id, attempts,
+                        self._retries, self.node.id)
                 # If we created an instance, delete it.
                 if self.node.external_id:
-                    self.handler.manager.cleanupNode(self.node.external_id)
-                    self.handler.manager.waitForNodeCleanup(
-                        self.node.external_id)
+                    deleting_node = zk.Node()
+                    deleting_node.provider = self.node.provider
+                    deleting_node.external_id = self.node.external_id
+                    deleting_node.state = zk.DELETING
+                    self.zk.storeNode(deleting_node)
+                    self.log.info(
+                        "Request %s: Node %s scheduled for cleanup",
+                        self.handler.request.id, deleting_node.external_id)
                     self.node.external_id = None
                     self.node.public_ipv4 = None
                     self.node.public_ipv6 = None
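The new handler code above no longer deletes the broken instance inline; it stores a node record in the DELETING state and leaves the actual delete to the existing cleanup machinery. A rough sketch of that hand-off pattern, using a plain queue in place of ZooKeeper (all names here are illustrative, not nodepool APIs):

import queue

# Stand-in for the ZooKeeper records the launcher marks as DELETING.
cleanup_queue = queue.Queue()


def schedule_cleanup(provider, external_id):
    """Launcher side: record the broken instance and return immediately."""
    cleanup_queue.put({"provider": provider,
                       "external_id": external_id,
                       "state": "deleting"})


def cleanup_pass(delete_server):
    """Cleanup side: try each pending delete once; failures stay queued."""
    pending = []
    while not cleanup_queue.empty():
        pending.append(cleanup_queue.get())
    for record in pending:
        try:
            delete_server(record["external_id"])
        except Exception:
            cleanup_queue.put(record)   # retried on the next cleanup pass

Because the launcher only records the instance, a delete failure now surfaces in the cleanup pass rather than in the launch retry loop, which is why the test below can inject a delete failure and still see the request fulfilled.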
@@ -1497,6 +1497,43 @@ class TestLauncher(tests.DBTestCase):
         while self.zk.countPoolNodes('fake-provider', 'main'):
             time.sleep(0)
 
+    def test_launchNode_delete_error(self):
+        '''
+        Test that the launcher keeps trying to spawn a node in case of a
+        delete error
+        '''
+        fake_client = fakeprovider.FakeLaunchAndDeleteFailCloud(
+            times_to_fail=1)
+
+        def get_fake_client(*args, **kwargs):
+            return fake_client
+
+        self.useFixture(fixtures.MockPatchObject(
+            fakeprovider.FakeProvider, '_getClient',
+            get_fake_client))
+
+        configfile = self.setup_config('node_launch_retry.yaml')
+        self.useBuilder(configfile)
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.cleanup_interval = 60
+        pool.start()
+        self.waitForImage('fake-provider', 'fake-image')
+
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req)
+
+        req = self.waitForNodeRequest(req)
+        self.assertTrue(fake_client.launch_success)
+        self.assertTrue(fake_client.delete_success)
+        self.assertEqual(fake_client.times_to_fail_delete,
+                         fake_client.times_failed_delete)
+        self.assertEqual(fake_client.times_to_fail_launch,
+                         fake_client.times_failed_launch)
+        self.assertEqual(req.state, zk.FULFILLED)
+        self.assertEqual(len(req.nodes), 1)
+
     @mock.patch(
         'nodepool.driver.openstack.handler.OpenStackNodeRequestHandler.poll')
     def test_handler_poll_session_expired(self, mock_poll):