From b3f39244a3eacd6fb141de61850cbd84fecdb544 Mon Sep 17 00:00:00 2001 From: ankitagrawal Date: Wed, 23 Sep 2015 03:58:19 -0700 Subject: [PATCH] Clean up ports and volumes when deleting ERROR instance Usually, when instance.host = None, it means the instance was never scheduled. However, the exception handling routine in compute manager [1] will set instance.host = None and set instance.vm_state = ERROR if the instance fails to build on the compute host. If that happens, we end up with an instance with host = None and vm_state = ERROR which may have ports and volumes still allocated. This adds some logic around deleting the instance when it may have ports or volumes allocated. 1. If the instance is not in ERROR or SHELVED_OFFLOADED state, we expect instance.host to be set to a compute host. So, if we find instance.host = None in states other than ERROR or SHELVED_OFFLOADED, we consider the instance to have failed scheduling and not require ports or volumes to be freed, and we simply destroy the instance database record and return. This is the "delete while booting" scenario. 2. If the instance is in ERROR because of a failed build or is SHELVED_OFFLOADED, we expect instance.host to be None even though there could be ports or volumes allocated. In this case, run the _local_delete routine to clean up ports and volumes and delete the instance database record. 
Co-Authored-By: Ankit Agrawal Co-Authored-By: Samuel Matzek Co-Authored-By: melanie witt Closes-Bug: 1404867 Closes-Bug: 1408527 [1] https://github.com/openstack/nova/blob/55ea961/nova/compute/manager.py#L1927-L1929 Change-Id: I4dc6c8bd3bb6c135f8a698af41f5d0e026c39117 --- nova/compute/api.py | 46 +++-- .../regressions/test_bug_1404867.py | 14 +- .../regressions/test_bug_1670627.py | 1 + .../regressions/test_bug_1689692.py | 1 + nova/tests/unit/compute/test_compute_api.py | 173 +++++++++++++++++- 5 files changed, 208 insertions(+), 27 deletions(-) diff --git a/nova/compute/api.py b/nova/compute/api.py index 494dd51229bc..1db7e0da733c 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -1780,12 +1780,12 @@ class API(base.Base): return cell = None - # If there is an instance.host (or the instance is shelved-offloaded), - # the instance has been scheduled and sent to a cell/compute which - # means it was pulled from the cell db. + # If there is an instance.host (or the instance is shelved-offloaded or + # in error state), the instance has been scheduled and sent to a + # cell/compute which means it was pulled from the cell db. # Normal delete should be attempted. - if not (instance.host or - instance.vm_state == vm_states.SHELVED_OFFLOADED): + may_have_ports_or_volumes = self._may_have_ports_or_volumes(instance) + if not instance.host and not may_have_ports_or_volumes: try: if self._delete_while_booting(context, instance): return @@ -1864,9 +1864,7 @@ class API(base.Base): # which will cause a cast to the child cell. 
                cb(context, instance, bdms)
            return

-        shelved_offloaded = (instance.vm_state
-                             == vm_states.SHELVED_OFFLOADED)
-        if not instance.host and not shelved_offloaded:
+        if not instance.host and not may_have_ports_or_volumes:
             try:
                 with compute_utils.notify_about_instance_delete(
                         self.notifier, context, instance,
@@ -1879,7 +1877,12 @@
                               {'state': instance.vm_state},
                               instance=instance)
                     return
-                except exception.ObjectActionError:
+                except exception.ObjectActionError as ex:
+                    # The instance's host likely changed under us as
+                    # this instance could be building and has since been
+                    # scheduled. Continue with attempts to delete it.
+                    LOG.debug('Refreshing instance because: %s', ex,
+                              instance=instance)
                     instance.refresh()

         if instance.vm_state == vm_states.RESIZED:
@@ -1887,7 +1890,8 @@

         is_local_delete = True
         try:
-            if not shelved_offloaded:
+            # instance.host must be set in order to look up the service.
+            if instance.host is not None:
                 service = objects.Service.get_by_compute_host(
                     context.elevated(), instance.host)
                 is_local_delete = not self.servicegroup_api.service_is_up(
@@ -1904,7 +1908,9 @@

                 cb(context, instance, bdms)
         except exception.ComputeHostNotFound:
-            pass
+            LOG.debug('Compute host %s not found during service up check, '
+                      'going to local delete instance', instance.host,
+                      instance=instance)

         if is_local_delete:
             # If instance is in shelved_offloaded state or compute node
@@ -1931,6 +1937,16 @@
                 # NOTE(comstud): Race condition. Instance already gone.
                 pass

+    def _may_have_ports_or_volumes(self, instance):
+        # NOTE(melwitt): When an instance build fails in the compute manager,
+        # the instance host and node are set to None and the vm_state is set
+        # to ERROR. In this case, the instance with host = None has actually
+        # been scheduled and may have ports and/or volumes allocated on the
+        # compute node.
+ if instance.vm_state in (vm_states.SHELVED_OFFLOADED, vm_states.ERROR): + return True + return False + def _confirm_resize_on_deleting(self, context, instance): # If in the middle of a resize, use confirm_resize to # ensure the original instance is cleaned up too @@ -1986,6 +2002,14 @@ class API(base.Base): 'the instance host %(instance_host)s.', {'connector_host': connector.get('host'), 'instance_host': instance.host}, instance=instance) + if (instance.host is None and + self._may_have_ports_or_volumes(instance)): + LOG.debug('Allowing use of stashed volume connector with ' + 'instance host None because instance with ' + 'vm_state %(vm_state)s has been scheduled in ' + 'the past.', {'vm_state': instance.vm_state}, + instance=instance) + return connector def _local_cleanup_bdm_volumes(self, bdms, instance, context): """The method deletes the bdm records and, if a bdm is a volume, call diff --git a/nova/tests/functional/regressions/test_bug_1404867.py b/nova/tests/functional/regressions/test_bug_1404867.py index 73d3a125692c..5fc6466e0c17 100644 --- a/nova/tests/functional/regressions/test_bug_1404867.py +++ b/nova/tests/functional/regressions/test_bug_1404867.py @@ -80,12 +80,7 @@ class DeleteWithReservedVolumes(integrated_helpers._IntegratedTestBase, # The volume should no longer be reserved as the deletion of the # server should have released all the resources. - # TODO(mnaser): Uncomment this in patch resolving the issue - # self.assertNotIn(volume_id, self.cinder.reserved_volumes) - - # The volume is still reserved at this point, which it should not be. 
- # TODO(mnaser): Remove this in patch resolving the issue - self.assertIn(volume_id, self.cinder.reserved_volumes) + self.assertNotIn(volume_id, self.cinder.reserved_volumes) @mock.patch('nova.objects.service.get_minimum_version_all_cells', return_value=compute_api.CINDER_V3_ATTACH_MIN_COMPUTE_VERSION) @@ -109,9 +104,4 @@ class DeleteWithReservedVolumes(integrated_helpers._IntegratedTestBase, # The volume should no longer have any attachments as instance delete # should have removed them. - # TODO(mnaser): Uncomment this in patch resolving the issue - # self.assertNotIn(volume_id, self.cinder.attachments[server_id]) - - # The volume still has attachments, which it should not have. - # TODO(mnaser): Remove this in patch resolving the issue - self.assertIn(volume_id, self.cinder.attachments[server_id]) + self.assertNotIn(volume_id, self.cinder.attachments[server_id]) diff --git a/nova/tests/functional/regressions/test_bug_1670627.py b/nova/tests/functional/regressions/test_bug_1670627.py index 4c01c01ac324..de970628c74d 100644 --- a/nova/tests/functional/regressions/test_bug_1670627.py +++ b/nova/tests/functional/regressions/test_bug_1670627.py @@ -59,6 +59,7 @@ class TestDeleteFromCell0CheckQuota(test.TestCase): self.start_service('conductor') self.start_service('scheduler') + self.start_service('consoleauth') # We don't actually start a compute service; this way we don't have any # compute hosts to schedule the instance to and will go into error and diff --git a/nova/tests/functional/regressions/test_bug_1689692.py b/nova/tests/functional/regressions/test_bug_1689692.py index 1b6cf48d835e..5c9d137ae2c0 100644 --- a/nova/tests/functional/regressions/test_bug_1689692.py +++ b/nova/tests/functional/regressions/test_bug_1689692.py @@ -53,6 +53,7 @@ class ServerListLimitMarkerCell0Test(test.TestCase, self.start_service('conductor') self.start_service('scheduler') + self.start_service('consoleauth') # We don't start the compute service because we want NoValidHost so # 
all of the instances go into ERROR state and get put into cell0. self.useFixture(cast_as_call.CastAsCall(self)) diff --git a/nova/tests/unit/compute/test_compute_api.py b/nova/tests/unit/compute/test_compute_api.py index cc9b96b10e71..aff32ec674db 100644 --- a/nova/tests/unit/compute/test_compute_api.py +++ b/nova/tests/unit/compute/test_compute_api.py @@ -1115,7 +1115,7 @@ class _ComputeAPIUnitTestMixIn(object): if self.cell_type != 'api': if inst.vm_state == vm_states.RESIZED: self._test_delete_resized_part(inst) - if inst.vm_state != vm_states.SHELVED_OFFLOADED: + if inst.host is not None: self.context.elevated().AndReturn(self.context) objects.Service.get_by_compute_host(self.context, inst.host).AndReturn(objects.Service()) @@ -1123,9 +1123,7 @@ class _ComputeAPIUnitTestMixIn(object): mox.IsA(objects.Service)).AndReturn( inst.host != 'down-host') - if (inst.host == 'down-host' or - inst.vm_state == vm_states.SHELVED_OFFLOADED): - + if inst.host == 'down-host' or inst.host is None: self._test_downed_host_part(inst, updates, delete_time, delete_type) cast = False @@ -1215,6 +1213,76 @@ class _ComputeAPIUnitTestMixIn(object): system_metadata=fake_sys_meta) self._test_delete('force_delete', vm_state=vm_state) + @mock.patch('nova.compute.api.API._delete_while_booting', + return_value=False) + @mock.patch('nova.compute.api.API._lookup_instance') + @mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid') + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.utils.notify_about_instance_usage') + @mock.patch('nova.objects.Service.get_by_compute_host') + @mock.patch('nova.compute.api.API._local_delete') + def test_delete_error_state_with_no_host( + self, mock_local_delete, mock_service_get, _mock_notify, + _mock_save, mock_bdm_get, mock_lookup, _mock_del_booting): + # Instance in error state with no host should be a local delete + # for non API cells + inst = self._create_instance_obj(params=dict(vm_state=vm_states.ERROR, + host=None)) + 
mock_lookup.return_value = None, inst + with mock.patch.object(self.compute_api.compute_rpcapi, + 'terminate_instance') as mock_terminate: + self.compute_api.delete(self.context, inst) + if self.cell_type == 'api': + mock_terminate.assert_called_once_with( + self.context, inst, mock_bdm_get.return_value, + delete_type='delete') + mock_local_delete.assert_not_called() + else: + mock_local_delete.assert_called_once_with( + self.context, inst, mock_bdm_get.return_value, + 'delete', self.compute_api._do_delete) + mock_terminate.assert_not_called() + mock_service_get.assert_not_called() + + @mock.patch('nova.compute.api.API._delete_while_booting', + return_value=False) + @mock.patch('nova.compute.api.API._lookup_instance') + @mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid') + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.utils.notify_about_instance_usage') + @mock.patch('nova.objects.Service.get_by_compute_host') + @mock.patch('nova.context.RequestContext.elevated') + @mock.patch('nova.servicegroup.api.API.service_is_up', return_value=True) + @mock.patch('nova.compute.api.API._record_action_start') + @mock.patch('nova.compute.api.API._local_delete') + def test_delete_error_state_with_host_set( + self, mock_local_delete, _mock_record, mock_service_up, + mock_elevated, mock_service_get, _mock_notify, _mock_save, + mock_bdm_get, mock_lookup, _mock_del_booting): + # Instance in error state with host set should be a non-local delete + # for non API cells if the service is up + inst = self._create_instance_obj(params=dict(vm_state=vm_states.ERROR, + host='fake-host')) + mock_lookup.return_value = inst + with mock.patch.object(self.compute_api.compute_rpcapi, + 'terminate_instance') as mock_terminate: + self.compute_api.delete(self.context, inst) + if self.cell_type == 'api': + mock_terminate.assert_called_once_with( + self.context, inst, mock_bdm_get.return_value, + delete_type='delete') + mock_local_delete.assert_not_called() + 
mock_service_get.assert_not_called() + else: + mock_service_get.assert_called_once_with( + mock_elevated.return_value, 'fake-host') + mock_service_up.assert_called_once_with( + mock_service_get.return_value) + mock_terminate.assert_called_once_with( + self.context, inst, mock_bdm_get.return_value, + delete_type='delete') + mock_local_delete.assert_not_called() + def test_delete_forced_when_task_state_is_not_none(self): for vm_state in self._get_vm_states(): self._test_delete('force_delete', vm_state=vm_state, @@ -1425,6 +1493,52 @@ class _ComputeAPIUnitTestMixIn(object): self.assertIsNone( self.compute_api._get_stashed_volume_connector(bdm, inst)) + @mock.patch.object(objects.BlockDeviceMapping, 'destroy') + def test_local_cleanup_bdm_volumes_stashed_connector_host_none( + self, mock_destroy): + """Tests that we call volume_api.terminate_connection when we found + a stashed connector in the bdm.connection_info dict. + + This tests the case where: + + 1) the instance host is None + 2) the instance vm_state is one where we expect host to be None + + We allow a mismatch of the host in this situation if the instance is + in a state where we expect its host to have been set to None, such + as ERROR or SHELVED_OFFLOADED. 
+ """ + params = dict(host=None, vm_state=vm_states.ERROR) + inst = self._create_instance_obj(params=params) + conn_info = {'connector': {'host': 'orig-host'}} + vol_bdm = objects.BlockDeviceMapping(self.context, id=1, + instance_uuid=inst.uuid, + volume_id=uuids.volume_id, + source_type='volume', + destination_type='volume', + delete_on_termination=True, + connection_info=jsonutils.dumps( + conn_info), + attachment_id=None) + bdms = objects.BlockDeviceMappingList(objects=[vol_bdm]) + + @mock.patch.object(self.compute_api.volume_api, 'terminate_connection') + @mock.patch.object(self.compute_api.volume_api, 'detach') + @mock.patch.object(self.compute_api.volume_api, 'delete') + @mock.patch.object(self.context, 'elevated', return_value=self.context) + def do_test(self, mock_elevated, mock_delete, + mock_detach, mock_terminate): + self.compute_api._local_cleanup_bdm_volumes( + bdms, inst, self.context) + mock_terminate.assert_called_once_with( + self.context, uuids.volume_id, conn_info['connector']) + mock_detach.assert_called_once_with( + self.context, uuids.volume_id, inst.uuid) + mock_delete.assert_called_once_with(self.context, uuids.volume_id) + mock_destroy.assert_called_once_with() + + do_test(self) + def test_local_delete_without_info_cache(self): inst = self._create_instance_obj() @@ -5773,6 +5887,57 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase): fields=['device_id']) self.assertEqual([], instances.objects) + @mock.patch('nova.compute.api.API._delete_while_booting', + return_value=False) + @mock.patch('nova.compute.api.API._lookup_instance') + @mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid') + @mock.patch('nova.context.RequestContext.elevated') + @mock.patch.object(objects.Instance, 'save') + @mock.patch.object(compute_utils, 'notify_about_instance_usage') + @mock.patch.object(objects.BlockDeviceMapping, 'destroy') + @mock.patch.object(objects.Instance, 'destroy') + def 
_test_delete_volume_backed_instance( + self, vm_state, mock_instance_destroy, bdm_destroy, + notify_about_instance_usage, mock_save, mock_elevated, + bdm_get_by_instance_uuid, mock_lookup, _mock_del_booting): + volume_id = uuidutils.generate_uuid() + conn_info = {'connector': {'host': 'orig-host'}} + bdms = [objects.BlockDeviceMapping( + **fake_block_device.FakeDbBlockDeviceDict( + {'id': 42, 'volume_id': volume_id, + 'source_type': 'volume', 'destination_type': 'volume', + 'delete_on_termination': False, + 'connection_info': jsonutils.dumps(conn_info)}))] + + bdm_get_by_instance_uuid.return_value = bdms + mock_elevated.return_value = self.context + + params = {'host': None, 'vm_state': vm_state} + inst = self._create_instance_obj(params=params) + mock_lookup.return_value = None, inst + connector = conn_info['connector'] + + with mock.patch.object(self.compute_api.network_api, + 'deallocate_for_instance') as mock_deallocate, \ + mock.patch.object(self.compute_api.volume_api, + 'terminate_connection') as mock_terminate_conn, \ + mock.patch.object(self.compute_api.volume_api, + 'detach') as mock_detach: + self.compute_api.delete(self.context, inst) + + mock_deallocate.assert_called_once_with(self.context, inst) + mock_detach.assert_called_once_with(self.context, volume_id, + inst.uuid) + mock_terminate_conn.assert_called_once_with(self.context, + volume_id, connector) + bdm_destroy.assert_called_once_with() + + def test_delete_volume_backed_instance_in_error(self): + self._test_delete_volume_backed_instance(vm_states.ERROR) + + def test_delete_volume_backed_instance_in_shelved_offloaded(self): + self._test_delete_volume_backed_instance(vm_states.SHELVED_OFFLOADED) + class Cellsv1DeprecatedTestMixIn(object): @mock.patch.object(objects.BuildRequestList, 'get_by_filters')