Detach volumes when VM creation fails

If the boot-volume creation fails, the data volume is left in state
"in-use", attached to the server which is now in "error" state.
The user can't detach the volume because of the server's error state.

They can delete the server, but that leaves the volume apparently
attached to a server that no longer exists; this is being fixed
separately: https://review.openstack.org/#/c/340614/

The only way out of this is to ask an administrator to reset the state of
the data volume (this option is not available to regular users by
default policy).

This change fixes the problem in the compute service so that, when
creation fails, the compute manager detaches the created volumes
before putting the VM into the error state. The instance can then be
deleted without having to worry about attached volumes.

Change-Id: I8b1c05317734e14ea73dc868941351bb31210bf0
Closes-bug: #1633249
This commit is contained in:
Ameed Ashour 2018-01-24 09:32:24 -05:00 committed by Matt Riedemann
parent c1442d3c8c
commit 61f6751a18
3 changed files with 77 additions and 21 deletions

View File

@ -1860,7 +1860,7 @@ class ComputeManager(manager.Manager):
instance=instance) instance=instance)
self._cleanup_allocated_networks(context, instance, self._cleanup_allocated_networks(context, instance,
requested_networks) requested_networks)
self._cleanup_volumes(context, instance.uuid, self._cleanup_volumes(context, instance,
block_device_mapping, raise_exc=False) block_device_mapping, raise_exc=False)
compute_utils.add_instance_fault_from_exc(context, compute_utils.add_instance_fault_from_exc(context,
instance, e, sys.exc_info(), instance, e, sys.exc_info(),
@ -1920,7 +1920,7 @@ class ComputeManager(manager.Manager):
LOG.exception(e.format_message(), instance=instance) LOG.exception(e.format_message(), instance=instance)
self._cleanup_allocated_networks(context, instance, self._cleanup_allocated_networks(context, instance,
requested_networks) requested_networks)
self._cleanup_volumes(context, instance.uuid, self._cleanup_volumes(context, instance,
block_device_mapping, raise_exc=False) block_device_mapping, raise_exc=False)
compute_utils.add_instance_fault_from_exc(context, instance, compute_utils.add_instance_fault_from_exc(context, instance,
e, sys.exc_info()) e, sys.exc_info())
@ -1934,7 +1934,7 @@ class ComputeManager(manager.Manager):
instance=instance) instance=instance)
self._cleanup_allocated_networks(context, instance, self._cleanup_allocated_networks(context, instance,
requested_networks) requested_networks)
self._cleanup_volumes(context, instance.uuid, self._cleanup_volumes(context, instance,
block_device_mapping, raise_exc=False) block_device_mapping, raise_exc=False)
compute_utils.add_instance_fault_from_exc(context, instance, compute_utils.add_instance_fault_from_exc(context, instance,
e, sys.exc_info()) e, sys.exc_info())
@ -2435,14 +2435,27 @@ class ComputeManager(manager.Manager):
self.host, action=fields.NotificationAction.SHUTDOWN, self.host, action=fields.NotificationAction.SHUTDOWN,
phase=fields.NotificationPhase.END, bdms=bdms) phase=fields.NotificationPhase.END, bdms=bdms)
def _cleanup_volumes(self, context, instance_uuid, bdms, raise_exc=True): def _cleanup_volumes(self, context, instance, bdms, raise_exc=True,
detach=True):
exc_info = None exc_info = None
for bdm in bdms: for bdm in bdms:
LOG.debug("terminating bdm %s", bdm, if detach and bdm.volume_id:
instance_uuid=instance_uuid) try:
LOG.debug("Detaching volume: %s", bdm.volume_id,
instance_uuid=instance.uuid)
destroy = bdm.delete_on_termination
self._detach_volume(context, bdm, instance,
destroy_bdm=destroy)
except Exception as exc:
exc_info = sys.exc_info()
LOG.warning('Failed to detach volume: %(volume_id)s '
'due to %(exc)s',
{'volume_id': bdm.volume_id, 'exc': exc})
if bdm.volume_id and bdm.delete_on_termination: if bdm.volume_id and bdm.delete_on_termination:
try: try:
LOG.debug("Deleting volume: %s", bdm.volume_id,
instance_uuid=instance.uuid)
self.volume_api.delete(context, bdm.volume_id) self.volume_api.delete(context, bdm.volume_id)
except Exception as exc: except Exception as exc:
exc_info = sys.exc_info() exc_info = sys.exc_info()
@ -2494,8 +2507,14 @@ class ComputeManager(manager.Manager):
# future to set an instance fault the first time # future to set an instance fault the first time
# and to only ignore the failure if the instance # and to only ignore the failure if the instance
# is already in ERROR. # is already in ERROR.
self._cleanup_volumes(context, instance.uuid, bdms,
raise_exc=False) # NOTE(ameeda): The volumes already detached during the above
# _shutdown_instance() call and this is why
# detach is not requested from _cleanup_volumes()
# in this case
self._cleanup_volumes(context, instance, bdms,
raise_exc=False, detach=False)
# if a delete task succeeded, always update vm state and task # if a delete task succeeded, always update vm state and task
# state without expecting task state to be DELETING # state without expecting task state to be DELETING
instance.vm_state = vm_states.DELETED instance.vm_state = vm_states.DELETED
@ -7418,7 +7437,7 @@ class ComputeManager(manager.Manager):
try: try:
self._shutdown_instance(context, instance, bdms, self._shutdown_instance(context, instance, bdms,
notify=False) notify=False)
self._cleanup_volumes(context, instance.uuid, bdms) self._cleanup_volumes(context, instance, bdms)
except Exception as e: except Exception as e:
LOG.warning("Periodic cleanup failed to delete " LOG.warning("Periodic cleanup failed to delete "
"instance: %s", "instance: %s",

View File

@ -6938,7 +6938,7 @@ class ComputeTestCase(BaseTestCase,
mock_shutdown.assert_has_calls([ mock_shutdown.assert_has_calls([
mock.call(ctxt, inst1, bdms, notify=False), mock.call(ctxt, inst1, bdms, notify=False),
mock.call(ctxt, inst2, bdms, notify=False)]) mock.call(ctxt, inst2, bdms, notify=False)])
mock_cleanup.assert_called_once_with(ctxt, inst2['uuid'], bdms) mock_cleanup.assert_called_once_with(ctxt, inst2, bdms)
mock_get_uuid.assert_has_calls([ mock_get_uuid.assert_has_calls([
mock.call(ctxt, inst1.uuid, use_slave=True), mock.call(ctxt, inst1.uuid, use_slave=True),
mock.call(ctxt, inst2.uuid, use_slave=True)]) mock.call(ctxt, inst2.uuid, use_slave=True)])

View File

@ -277,7 +277,10 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self.assertFalse(mock_log.error.called) self.assertFalse(mock_log.error.called)
@mock.patch('nova.compute.utils.notify_about_instance_action') @mock.patch('nova.compute.utils.notify_about_instance_action')
def test_delete_instance_without_info_cache(self, mock_notify): @mock.patch('nova.compute.manager.ComputeManager.'
'_detach_volume')
def test_delete_instance_without_info_cache(self, mock_detach,
mock_notify):
instance = fake_instance.fake_instance_obj( instance = fake_instance.fake_instance_obj(
self.context, self.context,
uuid=uuids.instance, uuid=uuids.instance,
@ -3757,7 +3760,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self.assertRaises(test.TestingException, do_test) self.assertRaises(test.TestingException, do_test)
set_error.assert_called_once_with(self.context, instance) set_error.assert_called_once_with(self.context, instance)
def test_cleanup_volumes(self): @mock.patch('nova.compute.manager.ComputeManager.'
'_detach_volume')
def test_cleanup_volumes(self, mock_detach):
instance = fake_instance.fake_instance_obj(self.context) instance = fake_instance.fake_instance_obj(self.context)
bdm_do_not_delete_dict = fake_block_device.FakeDbBlockDeviceDict( bdm_do_not_delete_dict = fake_block_device.FakeDbBlockDeviceDict(
{'volume_id': 'fake-id1', 'source_type': 'image', {'volume_id': 'fake-id1', 'source_type': 'image',
@ -3770,11 +3775,17 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
with mock.patch.object(self.compute.volume_api, with mock.patch.object(self.compute.volume_api,
'delete') as volume_delete: 'delete') as volume_delete:
self.compute._cleanup_volumes(self.context, instance.uuid, bdms) self.compute._cleanup_volumes(self.context, instance, bdms)
calls = [mock.call(self.context, bdm, instance,
destroy_bdm=bdm.delete_on_termination)
for bdm in bdms]
self.assertEqual(calls, mock_detach.call_args_list)
volume_delete.assert_called_once_with(self.context, volume_delete.assert_called_once_with(self.context,
bdms[1].volume_id) bdms[1].volume_id)
def test_cleanup_volumes_exception_do_not_raise(self): @mock.patch('nova.compute.manager.ComputeManager.'
'_detach_volume')
def test_cleanup_volumes_exception_do_not_raise(self, mock_detach):
instance = fake_instance.fake_instance_obj(self.context) instance = fake_instance.fake_instance_obj(self.context)
bdm_dict1 = fake_block_device.FakeDbBlockDeviceDict( bdm_dict1 = fake_block_device.FakeDbBlockDeviceDict(
{'volume_id': 'fake-id1', 'source_type': 'image', {'volume_id': 'fake-id1', 'source_type': 'image',
@ -3788,12 +3799,17 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
with mock.patch.object(self.compute.volume_api, with mock.patch.object(self.compute.volume_api,
'delete', 'delete',
side_effect=[test.TestingException(), None]) as volume_delete: side_effect=[test.TestingException(), None]) as volume_delete:
self.compute._cleanup_volumes(self.context, instance.uuid, bdms, self.compute._cleanup_volumes(self.context, instance, bdms,
raise_exc=False) raise_exc=False)
calls = [mock.call(self.context, bdm.volume_id) for bdm in bdms] calls = [mock.call(self.context, bdm.volume_id) for bdm in bdms]
self.assertEqual(calls, volume_delete.call_args_list) self.assertEqual(calls, volume_delete.call_args_list)
calls = [mock.call(self.context, bdm, instance,
destroy_bdm=True) for bdm in bdms]
self.assertEqual(calls, mock_detach.call_args_list)
def test_cleanup_volumes_exception_raise(self): @mock.patch('nova.compute.manager.ComputeManager.'
'_detach_volume')
def test_cleanup_volumes_exception_raise(self, mock_detach):
instance = fake_instance.fake_instance_obj(self.context) instance = fake_instance.fake_instance_obj(self.context)
bdm_dict1 = fake_block_device.FakeDbBlockDeviceDict( bdm_dict1 = fake_block_device.FakeDbBlockDeviceDict(
{'volume_id': 'fake-id1', 'source_type': 'image', {'volume_id': 'fake-id1', 'source_type': 'image',
@ -3808,10 +3824,31 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
'delete', 'delete',
side_effect=[test.TestingException(), None]) as volume_delete: side_effect=[test.TestingException(), None]) as volume_delete:
self.assertRaises(test.TestingException, self.assertRaises(test.TestingException,
self.compute._cleanup_volumes, self.context, instance.uuid, self.compute._cleanup_volumes, self.context, instance,
bdms) bdms)
calls = [mock.call(self.context, bdm.volume_id) for bdm in bdms] calls = [mock.call(self.context, bdm.volume_id) for bdm in bdms]
self.assertEqual(calls, volume_delete.call_args_list) self.assertEqual(calls, volume_delete.call_args_list)
calls = [mock.call(self.context, bdm, instance,
destroy_bdm=bdm.delete_on_termination)
for bdm in bdms]
self.assertEqual(calls, mock_detach.call_args_list)
@mock.patch('nova.compute.manager.ComputeManager._detach_volume',
side_effect=exception.CinderConnectionFailed(reason='idk'))
def test_cleanup_volumes_detach_fails_raise_exc(self, mock_detach):
instance = fake_instance.fake_instance_obj(self.context)
bdms = block_device_obj.block_device_make_list(
self.context,
[fake_block_device.FakeDbBlockDeviceDict(
{'volume_id': uuids.volume_id,
'source_type': 'volume',
'destination_type': 'volume',
'delete_on_termination': False})])
self.assertRaises(exception.CinderConnectionFailed,
self.compute._cleanup_volumes, self.context,
instance, bdms)
mock_detach.assert_called_once_with(
self.context, bdms[0], instance, destroy_bdm=False)
def test_stop_instance_task_state_none_power_state_shutdown(self): def test_stop_instance_task_state_none_power_state_shutdown(self):
# Tests that stop_instance doesn't puke when the instance power_state # Tests that stop_instance doesn't puke when the instance power_state
@ -4574,7 +4611,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_clean_net.assert_called_once_with(self.context, self.instance, mock_clean_net.assert_called_once_with(self.context, self.instance,
self.requested_networks) self.requested_networks)
mock_clean_vol.assert_called_once_with(self.context, mock_clean_vol.assert_called_once_with(self.context,
self.instance.uuid, self.block_device_mapping, raise_exc=False) self.instance, self.block_device_mapping, raise_exc=False)
mock_add.assert_called_once_with(self.context, self.instance, mock_add.assert_called_once_with(self.context, self.instance,
mock.ANY, mock.ANY) mock.ANY, mock.ANY)
mock_nil.assert_called_once_with(self.instance) mock_nil.assert_called_once_with(self.instance)
@ -4811,7 +4848,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_clean_net.assert_called_once_with(self.context, self.instance, mock_clean_net.assert_called_once_with(self.context, self.instance,
self.requested_networks) self.requested_networks)
mock_clean_vol.assert_called_once_with(self.context, mock_clean_vol.assert_called_once_with(self.context,
self.instance.uuid, self.block_device_mapping, self.instance, self.block_device_mapping,
raise_exc=False) raise_exc=False)
mock_add.assert_called_once_with(self.context, self.instance, mock_add.assert_called_once_with(self.context, self.instance,
mock.ANY, mock.ANY, fault_message=mock.ANY) mock.ANY, mock.ANY, fault_message=mock.ANY)
@ -4959,7 +4996,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self._assert_build_instance_update(mock_save) self._assert_build_instance_update(mock_save)
if cleanup_volumes: if cleanup_volumes:
mock_clean_vol.assert_called_once_with(self.context, mock_clean_vol.assert_called_once_with(self.context,
self.instance.uuid, self.block_device_mapping, self.instance, self.block_device_mapping,
raise_exc=False) raise_exc=False)
if nil_out_host_and_node: if nil_out_host_and_node:
mock_nil.assert_called_once_with(self.instance) mock_nil.assert_called_once_with(self.instance)