From 8bba879141d644b248d246cb2ed196068e20785c Mon Sep 17 00:00:00 2001
From: Lee Yarwood
Date: Wed, 18 Apr 2018 14:35:07 +0100
Subject: [PATCH] compute: Ensure pre-migrating instances are destroyed during
 init_host

Previously _destroy_evacuated_instances would not remove instances
associated with evacuation migration records in a pre-migrating state.
This could lead to a race between the original source host and the new
destination if the source came back online early and called
_init_instance while the evacuation was still in progress.

This change now includes pre-migrating migration records when looking
for active evacuations. Additionally, the dict of evacuating instances
is returned to init_host and used to skip running _init_instance
against those instances, ensuring no race occurs between the two
computes.

Closes-bug: #1764883
Change-Id: I379678dfdb2609f12a572d4f99c8e9da4deab803
(cherry picked from commit c4988cdabf311d29cf64af732091068cfabeedaa)
---
 nova/compute/manager.py                     | 27 ++++++++++++-----
 .../regressions/test_bug_1764883.py         | 25 +++++++++-------
 nova/tests/unit/compute/test_compute.py     |  5 +++-
 nova/tests/unit/compute/test_compute_mgr.py | 30 +++++++++++++++++++
 4 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index df4667877f72..a4c375d8a2bd 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -632,10 +632,11 @@ class ComputeManager(manager.Manager):
         While nova-compute was down, the instances running on it could be
         evacuated to another host. This method looks for evacuation migration
         records where this is the source host and which were either started
-        (accepted) or complete (done). From those migration records, local
-        instances reported by the hypervisor are compared to the instances
-        for the migration records and those local guests are destroyed, along
-        with instance allocation records in Placement for this node.
+        (accepted), in-progress (pre-migrating) or migrated (done). From those
+        migration records, local instances reported by the hypervisor are
+        compared to the instances for the migration records and those local
+        guests are destroyed, along with instance allocation records in
+        Placement for this node.
         """
         filters = {
             'source_compute': self.host,
@@ -643,7 +644,13 @@ class ComputeManager(manager.Manager):
             # included in case the source node comes back up while instances
             # are being evacuated to another host. We don't want the same
             # instance being reported from multiple hosts.
-            'status': ['accepted', 'done'],
+            # NOTE(lyarwood): pre-migrating is also included here as the
+            # source compute can come back online shortly after the RT
+            # claims on the destination that in turn moves the migration to
+            # pre-migrating. If the evacuate fails on the destination host,
+            # the user can rebuild the instance (in ERROR state) on the source
+            # host.
+            'status': ['accepted', 'pre-migrating', 'done'],
             'migration_type': 'evacuation',
         }
         with utils.temporary_mutation(context, read_deleted='yes'):
@@ -706,6 +713,7 @@ class ComputeManager(manager.Manager):
             migration.status = 'completed'
             migration.save()
+        return evacuations
 
     def _is_instance_storage_shared(self, context, instance,
                                     host=None):
         shared_storage = True
@@ -1150,9 +1158,14 @@ class ComputeManager(manager.Manager):
 
         try:
             # checking that instance was not already evacuated to other host
-            self._destroy_evacuated_instances(context)
+            evacuated_instances = self._destroy_evacuated_instances(context)
+
+            # Initialise instances on the host that are not evacuating
             for instance in instances:
-                self._init_instance(context, instance)
+                if (not evacuated_instances or
+                        instance.uuid not in evacuated_instances):
+                    self._init_instance(context, instance)
+
         finally:
             if CONF.defer_iptables_apply:
                 self.driver.filter_defer_apply_off()
diff --git a/nova/tests/functional/regressions/test_bug_1764883.py b/nova/tests/functional/regressions/test_bug_1764883.py
index 9e51c3c2941d..8025a50c660c 100644
--- a/nova/tests/functional/regressions/test_bug_1764883.py
+++ b/nova/tests/functional/regressions/test_bug_1764883.py
@@ -25,13 +25,9 @@ class TestEvacuationWithSourceReturningDuringRebuild(
         test.TestCase, integrated_helpers.InstanceHelperMixin):
     """Assert the behaviour of evacuating instances when the src returns early.
 
-    This test asserts that evacuating instances end up in an ERROR state on the
-    source when that host comes back online during an evacuation while the
-    migration record is in a pre-migrating state.
-
-    This currently leads to a race between the source initialising the instance
-    and destination claim during the rebuild, both of which attempt to update
-    the underlying task_state of the instance.
+    This test asserts that evacuating instances end up in an ACTIVE state on
+    the destination even when the source host comes back online during an
+    evacuation while the migration record is in a pre-migrating state.
     """
 
     def setUp(self):
@@ -113,8 +109,15 @@ class TestEvacuationWithSourceReturningDuringRebuild(
         # Start evacuating the instance from the source_host
         self.api.post_server_action(server['id'], {'evacuate': {}})
 
-        # FIXME(lyarwood): Assert that the evacuation fails at present with the
-        # instance remaining on the source in an ERROR state.
-        self._wait_for_state_change(self.api, server, 'ERROR')
+        # Wait for the instance to go into an ACTIVE state
+        self._wait_for_state_change(self.api, server, 'ACTIVE')
         server = self.api.get_server(server['id'])
-        self.assertEqual(self.source_compute, server['OS-EXT-SRV-ATTR:host'])
+        host = server['OS-EXT-SRV-ATTR:host']
+        migrations = self.api.get_migrations()
+
+        # Assert that we have a single `done` migration record after the evac
+        self.assertEqual(1, len(migrations))
+        self.assertEqual('done', migrations[0]['status'])
+
+        # Assert that the instance is now on the dest
+        self.assertNotEqual(self.source_compute, host)
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index eb0ab8cb2e31..99b1d3c4048c 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -7459,7 +7459,10 @@ class ComputeTestCase(BaseTestCase,
 
         mock_get_filter.assert_called_once_with(fake_context,
                                          {'source_compute': self.compute.host,
-                                          'status': ['accepted', 'done'],
+                                          'status': [
+                                              'accepted',
+                                              'pre-migrating',
+                                              'done'],
                                           'migration_type': 'evacuation'})
         mock_get_inst.assert_called_once_with(fake_context)
         mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 4b32b92d9bdd..b9ddabc2c076 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -767,6 +767,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                 mock.ANY, mock.ANY, mock.ANY)
             mock_save.assert_called_once_with()
 
+    @mock.patch.object(context, 'get_admin_context')
+    @mock.patch.object(objects.InstanceList, 'get_by_host')
+    @mock.patch.object(fake_driver.FakeDriver, 'init_host')
+    @mock.patch('nova.compute.manager.ComputeManager._init_instance')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_destroy_evacuated_instances')
+    def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
+            mock_init_instance, mock_init_host, mock_host_get,
+            mock_admin_ctxt):
+        """Assert that init_instance is not called for instances that are
+        evacuating from the host during init_host.
+        """
+        active_instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, uuid=uuids.active_instance)
+        evacuating_instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, uuid=uuids.evac_instance)
+        instance_list = objects.InstanceList(self.context,
+            objects=[active_instance, evacuating_instance])
+
+        mock_host_get.return_value = instance_list
+        mock_admin_ctxt.return_value = self.context
+        mock_destroy_evac.return_value = {
+            uuids.evac_instance: evacuating_instance
+        }
+
+        self.compute.init_host()
+
+        mock_init_instance.assert_called_once_with(
+            self.context, active_instance)
+
     def test_init_instance_with_binding_failed_vif_type(self):
         # this instance will plug a 'binding_failed' vif
         instance = fake_instance.fake_instance_obj(
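
Reviewer note: the snippet below is a minimal, standalone sketch of the start-up
behaviour introduced above, not nova code. The names (Instance,
destroy_evacuated_instances, init_host) and the plain-dict migration input are
invented for illustration; it only shows the pattern of having the evacuation
clean-up step return the evacuated instances keyed by uuid, including records
still in 'pre-migrating', so that the init loop can skip them.

from dataclasses import dataclass


@dataclass
class Instance:
    uuid: str
    host: str


def destroy_evacuated_instances(local_instances, evacuation_migrations):
    """Destroy local guests with an active evacuation record and return them.

    evacuation_migrations maps instance uuid -> migration status. Any record
    in 'accepted', 'pre-migrating' or 'done' means another host owns, or is
    in the middle of claiming, the instance, so the local guest must not be
    restarted here.
    """
    evacuated = {
        inst.uuid: inst for inst in local_instances
        if evacuation_migrations.get(inst.uuid) in (
            'accepted', 'pre-migrating', 'done')
    }
    for inst in evacuated.values():
        print('destroying local guest %s (evacuated elsewhere)' % inst.uuid)
    return evacuated


def init_host(local_instances, evacuation_migrations):
    # Clean up anything that was evacuated while this host was down...
    evacuated = destroy_evacuated_instances(local_instances,
                                            evacuation_migrations)
    # ...then only (re)initialise instances that are not evacuating, which
    # avoids racing the destination host's rebuild/claim.
    for inst in local_instances:
        if inst.uuid not in evacuated:
            print('initialising %s' % inst.uuid)


if __name__ == '__main__':
    instances = [Instance('uuid-active', 'src'), Instance('uuid-evac', 'src')]
    # uuid-evac still has an evacuation migration in 'pre-migrating'
    init_host(instances, {'uuid-evac': 'pre-migrating'})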