From d9c696acf85a00fe9e69faf5559cd0534e2c0f58 Mon Sep 17 00:00:00 2001
From: Dan Smith
Date: Thu, 14 May 2015 15:52:05 -0700
Subject: [PATCH] Make evacuate leave a record for the source compute host to
 process

This makes the evacuate process leave a record in the database about
the evacuation, and from what host it was performed. The compute
manager now uses that breadcrumb trail to decide whether it should
delete locally-running instances, instead of relying on the hostname
mismatch alone.

Since this does a query by migration_type, I added a unit test to the
test_db_api module to prove this actually works.
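For reviewers, the mechanism reduces roughly to the standalone sketch
below. This is illustrative plain Python, not Nova code: the DB list and
the evacuate()/destroy_evacuated_instances() helpers are hypothetical
stand-ins; only the 'accepted'/'completed' statuses, the 'evacuation'
migration_type, and the source_compute match mirror this patch.

    DB = []  # stand-in for the migrations table

    def evacuate(instance_uuid, source_host):
        # API side: leave a tombstone for the source compute to find.
        DB.append({'instance_uuid': instance_uuid,
                   'source_compute': source_host,
                   'status': 'accepted',
                   'migration_type': 'evacuation'})

    def destroy_evacuated_instances(our_host, local_instance_uuids):
        # Source compute side, on restart: act only on instances that
        # have an 'accepted' evacuation record naming us as the source.
        evacuations = dict((m['instance_uuid'], m) for m in DB
                           if m['source_compute'] == our_host and
                           m['status'] == 'accepted' and
                           m['migration_type'] == 'evacuation')
        for uuid in local_instance_uuids:
            if uuid in evacuations:
                # driver.destroy(...) would run here, then the record
                # is marked completed so it is not processed twice.
                evacuations[uuid]['status'] = 'completed'

    evacuate('uuid-1', 'hostA')
    destroy_evacuated_instances('hostA', ['uuid-1', 'uuid-2'])
    assert DB[0]['status'] == 'completed'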
DocImpact: Deprecates the workarounds/destroy_after_evacuate config
option

Related to blueprint robustify-evacuation

Closes-Bug: #1471887
Change-Id: I972c224dfb954a9f06e221c9235ee6e4889c2619
---
 nova/compute/api.py                         | 12 +++
 nova/compute/manager.py                     | 89 +++++++++------------
 nova/tests/unit/compute/test_compute.py     | 33 +++++++-
 nova/tests/unit/compute/test_compute_mgr.py | 60 +++++---------
 nova/tests/unit/db/test_db_api.py           | 15 +++-
 nova/utils.py                               |  4 +-
 6 files changed, 115 insertions(+), 98 deletions(-)

diff --git a/nova/compute/api.py b/nova/compute/api.py
index 87f1d52c3c3b..3951002479f3 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -3272,6 +3272,18 @@ class API(base.Base):
         instance.save(expected_task_state=[None])

         self._record_action_start(context, instance, instance_actions.EVACUATE)

+        # NOTE(danms): Create this as a tombstone for the source compute
+        # to find and cleanup. No need to pass it anywhere else.
+        migration = objects.Migration(context,
+                                      source_compute=instance.host,
+                                      source_node=instance.node,
+                                      instance_uuid=instance.uuid,
+                                      status='accepted',
+                                      migration_type='evacuation')
+        if host:
+            migration.dest_compute = host
+        migration.create()
+
         return self.compute_task_api.rebuild_instance(context,
                        instance=instance,
                        new_pass=admin_password,
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index f0d215d6d759..aecfee9b02d0 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -788,61 +788,44 @@ class ComputeManager(manager.Manager):
         the MIGRATING, RESIZE_MIGRATING, RESIZE_MIGRATED, RESIZE_FINISH
         task state or RESIZED vm state.
         """
-        our_host = self.host
+        filters = {
+            'source_compute': self.host,
+            'status': 'accepted',
+            'migration_type': 'evacuation',
+        }
+        evacuations = objects.MigrationList.get_by_filters(context, filters)
+        if not evacuations:
+            return
+        evacuations = {mig.instance_uuid: mig for mig in evacuations}
+
         filters = {'deleted': False}
         local_instances = self._get_instances_on_driver(context, filters)
-        for instance in local_instances:
-            if instance.host != our_host:
-                if (instance.task_state in [task_states.MIGRATING,
-                                            task_states.RESIZE_MIGRATING,
-                                            task_states.RESIZE_MIGRATED,
-                                            task_states.RESIZE_FINISH]
-                    or instance.vm_state in [vm_states.RESIZED]):
-                    LOG.debug('Will not delete instance as its host ('
-                              '%(instance_host)s) is not equal to our '
-                              'host (%(our_host)s) but its task state is '
-                              '(%(task_state)s) and vm state is '
-                              '(%(vm_state)s)',
-                              {'instance_host': instance.host,
-                               'our_host': our_host,
-                               'task_state': instance.task_state,
-                               'vm_state': instance.vm_state},
-                              instance=instance)
-                    continue
-                if not CONF.workarounds.destroy_after_evacuate:
-                    LOG.warning(_LW('Instance %(uuid)s appears to have been '
-                                    'evacuated from this host to %(host)s. '
-                                    'Not destroying it locally due to '
-                                    'config setting '
-                                    '"workarounds.destroy_after_evacuate". '
-                                    'If this is not correct, enable that '
-                                    'option and restart nova-compute.'),
-                                {'uuid': instance.uuid,
-                                 'host': instance.host})
-                    continue
-                LOG.info(_LI('Deleting instance as its host ('
-                             '%(instance_host)s) is not equal to our '
-                             'host (%(our_host)s).'),
-                         {'instance_host': instance.host,
-                          'our_host': our_host}, instance=instance)
-                try:
-                    network_info = self.network_api.get_instance_nw_info(
-                        context, instance)
-                    bdi = self._get_instance_block_device_info(context,
-                                                               instance)
-                    destroy_disks = not (self._is_instance_storage_shared(
-                                            context, instance))
-                except exception.InstanceNotFound:
-                    network_info = network_model.NetworkInfo()
-                    bdi = {}
-                    LOG.info(_LI('Instance has been marked deleted already, '
-                                 'removing it from the hypervisor.'),
-                             instance=instance)
-                    # always destroy disks if the instance was deleted
-                    destroy_disks = True
-                self.driver.destroy(context, instance,
-                                    network_info,
-                                    bdi, destroy_disks)
+        evacuated = [inst for inst in local_instances
+                     if inst.uuid in evacuations]
+        for instance in evacuated:
+            migration = evacuations[instance.uuid]
+            LOG.info(_LI('Deleting instance as it has been evacuated from '
+                         'this host'), instance=instance)
+            try:
+                network_info = self.network_api.get_instance_nw_info(
+                    context, instance)
+                bdi = self._get_instance_block_device_info(context,
+                                                           instance)
+                destroy_disks = not (self._is_instance_storage_shared(
+                    context, instance))
+            except exception.InstanceNotFound:
+                network_info = network_model.NetworkInfo()
+                bdi = {}
+                LOG.info(_LI('Instance has been marked deleted already, '
+                             'removing it from the hypervisor.'),
+                         instance=instance)
+                # always destroy disks if the instance was deleted
+                destroy_disks = True
+            self.driver.destroy(context, instance,
+                                network_info,
+                                bdi, destroy_disks)
+            migration.status = 'completed'
+            migration.save()

     def _is_instance_storage_shared(self, context, instance, host=None):
         shared_storage = True
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index 4fbe2719e8d3..a894fb1a4eb3 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -6520,7 +6520,10 @@ class ComputeTestCase(BaseTestCase):
         instance = self._create_fake_instance_obj(params)
         self.compute._instance_update(self.context, instance.uuid, vcpus=4)

-    def test_destroy_evacuated_instance_on_shared_storage(self):
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    @mock.patch('nova.objects.Migration.save')
+    def test_destroy_evacuated_instance_on_shared_storage(self, mock_save,
+                                                          mock_get):
         fake_context = context.get_admin_context()

         # instances in central db
@@ -6538,6 +6541,9 @@ class ComputeTestCase(BaseTestCase):
         evacuated_instance = self._create_fake_instance_obj(
             {'host': 'otherhost'})

+        migration = objects.Migration(instance_uuid=evacuated_instance.uuid)
+        mock_get.return_value = [migration]
+
         instances.append(evacuated_instance)

         self.mox.StubOutWithMock(self.compute,
@@ -6565,8 +6571,15 @@ class ComputeTestCase(BaseTestCase):
         self.mox.ReplayAll()

         self.compute._destroy_evacuated_instances(fake_context)
+        mock_get.assert_called_once_with(fake_context,
+                                         {'source_compute': self.compute.host,
+                                          'status': 'accepted',
+                                          'migration_type': 'evacuation'})

-    def test_destroy_evacuated_instance_with_disks(self):
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    @mock.patch('nova.objects.Migration.save')
+    def test_destroy_evacuated_instance_with_disks(self, mock_save,
+                                                   mock_get):
         fake_context = context.get_admin_context()

         # instances in central db
@@ -6584,6 +6597,9 @@ class ComputeTestCase(BaseTestCase):
         evacuated_instance = self._create_fake_instance_obj(
             {'host': 'otherhost'})

+        migration = objects.Migration(instance_uuid=evacuated_instance.uuid)
+        mock_get.return_value = [migration]
+
         instances.append(evacuated_instance)

         self.mox.StubOutWithMock(self.compute,
@@ -6621,7 +6637,10 @@ class ComputeTestCase(BaseTestCase):
         self.mox.ReplayAll()

         self.compute._destroy_evacuated_instances(fake_context)

-    def test_destroy_evacuated_instance_not_implemented(self):
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    @mock.patch('nova.objects.Migration.save')
+    def test_destroy_evacuated_instance_not_implemented(self, mock_save,
+                                                        mock_get):
         fake_context = context.get_admin_context()

         # instances in central db
@@ -6639,6 +6658,9 @@ class ComputeTestCase(BaseTestCase):
         evacuated_instance = self._create_fake_instance_obj(
             {'host': 'otherhost'})

+        migration = objects.Migration(instance_uuid=evacuated_instance.uuid)
+        mock_get.return_value = [migration]
+
         instances.append(evacuated_instance)

         self.mox.StubOutWithMock(self.compute,
@@ -9791,6 +9813,11 @@ class ComputeAPITestCase(BaseTestCase):
         instance.refresh()
         self.assertEqual(instance.task_state, task_states.REBUILDING)
         self.assertEqual(instance.host, 'fake_dest_host')
+        migs = objects.MigrationList.get_by_filters(
+            self.context, {'source_host': 'fake_host'})
+        self.assertEqual(1, len(migs))
+        self.assertEqual('fake_host', migs[0].source_compute)
+        self.assertEqual('accepted', migs[0].status)

     def test_fail_evacuate_from_non_existing_host(self):
         inst = {}
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index f1078f9cd8c0..7bdd4dfebcb0 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -401,11 +401,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self.mox.UnsetStubs()

     @mock.patch('nova.objects.InstanceList')
-    def test_cleanup_host(self, mock_instance_list):
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    def test_cleanup_host(self, mock_miglist_get, mock_instance_list):
         # just testing whether the cleanup_host method
         # when fired will invoke the underlying driver's
         # equivalent method.

+        mock_miglist_get.return_value = []
         mock_instance_list.get_by_host.return_value = []

         with mock.patch.object(self.compute, 'driver') as mock_driver:
@@ -426,12 +428,16 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             self.compute.init_virt_events()
             self.assertFalse(mock_register.called)

-    def test_init_host_with_deleted_migration(self):
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    @mock.patch('nova.objects.Migration.save')
+    def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get):
         our_host = self.compute.host
         not_our_host = 'not-' + our_host

         deleted_instance = fake_instance.fake_instance_obj(
                 self.context, host=not_our_host, uuid='fake-uuid')
+        migration = objects.Migration(instance_uuid=deleted_instance.uuid)
+        mock_mig_get.return_value = [migration]

         self.mox.StubOutWithMock(self.compute.driver, 'init_host')
         self.mox.StubOutWithMock(self.compute.driver, 'destroy')
@@ -2054,13 +2060,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             self._do_test_set_admin_password_driver_error(
                 exc, vm_states.ACTIVE, None, expected_exception)

-    def _test_init_host_with_partial_migration(self, task_state=None,
-                                               vm_state=vm_states.ACTIVE):
+    def test_destroy_evacuated_instances(self):
         our_host = self.compute.host
         instance_1 = objects.Instance(self.context)
         instance_1.uuid = 'foo'
-        instance_1.task_state = task_state
-        instance_1.vm_state = vm_state
+        instance_1.task_state = None
+        instance_1.vm_state = vm_states.ACTIVE
         instance_1.host = 'not-' + our_host
         instance_2 = objects.Instance(self.context)
         instance_2.uuid = 'bar'
@@ -2068,6 +2073,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         instance_2.task_state = None
         instance_2.vm_state = vm_states.ACTIVE
         instance_2.host = 'not-' + our_host

+        # Only instance 2 has a migration record
+        migration = objects.Migration(instance_uuid=instance_2.uuid)
+
         with contextlib.nested(
@@ -2078,45 +2086,19 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             mock.patch.object(self.compute, '_get_instances_on_driver',
                               return_value=[instance_1,
                                             instance_2]),
             mock.patch.object(self.compute.network_api,
                               'get_instance_nw_info',
                               return_value=None),
             mock.patch.object(self.compute,
                               '_get_instance_block_device_info',
                               return_value={}),
             mock.patch.object(self.compute,
                               '_is_instance_storage_shared',
                               return_value=False),
-            mock.patch.object(self.compute.driver, 'destroy')
+            mock.patch.object(self.compute.driver, 'destroy'),
+            mock.patch('nova.objects.MigrationList.get_by_filters'),
+            mock.patch('nova.objects.Migration.save')
         ) as (_get_instances_on_driver, get_instance_nw_info,
               _get_instance_block_device_info,
               _is_instance_storage_shared,
-              destroy):
+              destroy, migration_list, migration_save):
+            migration_list.return_value = [migration]
             self.compute._destroy_evacuated_instances(self.context)
+            # Only instance 2 should be deleted. Instance 1 is still running
+            # here, but no migration from our host exists, so ignore it
             destroy.assert_called_once_with(self.context, instance_2, None,
                                             {}, True)

-    def test_init_host_with_partial_migration_migrating(self):
-        self._test_init_host_with_partial_migration(
-            task_state=task_states.MIGRATING)
-
-    def test_init_host_with_partial_migration_resize_migrating(self):
-        self._test_init_host_with_partial_migration(
-            task_state=task_states.RESIZE_MIGRATING)
-
-    def test_init_host_with_partial_migration_resize_migrated(self):
-        self._test_init_host_with_partial_migration(
-            task_state=task_states.RESIZE_MIGRATED)
-
-    def test_init_host_with_partial_migration_finish_resize(self):
-        self._test_init_host_with_partial_migration(
-            task_state=task_states.RESIZE_FINISH)
-
-    def test_init_host_with_partial_migration_resized(self):
-        self._test_init_host_with_partial_migration(
-            vm_state=vm_states.RESIZED)
-
-    @mock.patch('nova.compute.manager.ComputeManager._get_instances_on_driver')
-    def test_evacuate_disabled(self, mock_giod):
-        self.flags(destroy_after_evacuate=False, group='workarounds')
-        inst = mock.MagicMock()
-        inst.uuid = 'foo'
-        inst.host = self.compute.host + '-alt'
-        mock_giod.return_value = [inst]
-        with mock.patch.object(self.compute.driver, 'destroy') as mock_d:
-            self.compute._destroy_evacuated_instances(mock.MagicMock())
-            self.assertFalse(mock_d.called)
-
     @mock.patch('nova.compute.manager.ComputeManager.'
                 '_destroy_evacuated_instances')
     @mock.patch('nova.compute.manager.LOG')
diff --git a/nova/tests/unit/db/test_db_api.py b/nova/tests/unit/db/test_db_api.py
index 18be49db352a..d30e5c5a4c2f 100644
--- a/nova/tests/unit/db/test_db_api.py
+++ b/nova/tests/unit/db/test_db_api.py
@@ -1248,7 +1248,7 @@ class MigrationTestCase(test.TestCase):

     def _create(self, status='migrating', source_compute='host1',
                 source_node='a', dest_compute='host2', dest_node='b',
-                system_metadata=None):
+                system_metadata=None, migration_type=None):

         values = {'host': source_compute}
         instance = db.instance_create(self.ctxt, values)
@@ -1258,7 +1258,8 @@ class MigrationTestCase(test.TestCase):
         values = {'status': status, 'source_compute': source_compute,
                   'source_node': source_node, 'dest_compute': dest_compute,
-                  'dest_node': dest_node, 'instance_uuid': instance['uuid']}
+                  'dest_node': dest_node, 'instance_uuid': instance['uuid'],
+                  'migration_type': migration_type}
         db.migration_create(self.ctxt, values)

     def _assert_in_progress(self, migrations):
@@ -1312,6 +1313,16 @@ class MigrationTestCase(test.TestCase):
             hosts = [migration['source_compute'], migration['dest_compute']]
             self.assertIn(filters["host"], hosts)

+    def test_get_migrations_by_filters_with_type(self):
+        self._create(status="special", source_compute="host9",
+                     migration_type="evacuation")
+        self._create(status="special", source_compute="host9",
+                     migration_type="live-migration")
+        filters = {"status": "special", "host": "host9",
+                   "migration_type": "evacuation", "hidden": False}
+        migrations = db.migration_get_all_by_filters(self.ctxt, filters)
+        self.assertEqual(1, len(migrations))
+
     def test_get_migrations_by_filters_source_compute(self):
         filters = {'source_compute': 'host2'}
         migrations = db.migration_get_all_by_filters(self.ctxt, filters)
diff --git a/nova/utils.py b/nova/utils.py
index 264b44b2bea6..22ed104704c4 100644
--- a/nova/utils.py
+++ b/nova/utils.py
@@ -110,7 +110,9 @@ workarounds_opts = [
                      'https://bugs.launchpad.net/nova/+bug/1334398'),

     cfg.BoolOpt('destroy_after_evacuate',
                 default=True,
-                help='Whether to destroy instances on startup when we suspect '
+                deprecated_for_removal=True,
+                help='DEPRECATED: Whether to destroy '
+                     'instances on startup when we suspect '
                      'they have previously been evacuated. This can result in '
                      'data loss if undesired. See '
                      'https://launchpad.net/bugs/1419785'),