compute: Ensure pre-migrating instances are destroyed during init_host

Previously _destroy_evacuated_instances would not remove instances associated with evacuation migration records in a pre-migrating state. This could lead to a race between the original source host and the new destination if the source returned early, calling init_instance during the evacuation process. This change now includes pre-migrating migration records when looking for active evacuations. Additionally the dict of evacuating instances is then returned to init_host and used to skip running init_instance against such instances ensuring no race occurs between the two computes. Closes-bug: #1764883 Change-Id: I379678dfdb2609f12a572d4f99c8e9da4deab803
2018-04-18 14:35:07 +01:00 · 2018-04-18 14:35:07 +01:00 · c4988cdabf
parent b9213398c7
commit c4988cdabf
4 changed files with 68 additions and 19 deletions
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@ -624,10 +624,11 @@ class ComputeManager(manager.Manager):
        While nova-compute was down, the instances running on it could be
        evacuated to another host. This method looks for evacuation migration
        records where this is the source host and which were either started
-        (accepted) or complete (done). From those migration records, local
-        instances reported by the hypervisor are compared to the instances
-        for the migration records and those local guests are destroyed, along
-        with instance allocation records in Placement for this node.
+        (accepted), in-progress (pre-migrating) or migrated (done). From those
+        migration records, local instances reported by the hypervisor are
+        compared to the instances for the migration records and those local
+        guests are destroyed, along with instance allocation records in
+        Placement for this node.
        """
        filters = {
            'source_compute': self.host,
@ -635,7 +636,13 @@ class ComputeManager(manager.Manager):
            # included in case the source node comes back up while instances
            # are being evacuated to another host. We don't want the same
            # instance being reported from multiple hosts.
-            'status': ['accepted', 'done'],
+            # NOTE(lyarwood): pre-migrating is also included here as the
+            # source compute can come back online shortly after the RT
+            # claims on the destination that in-turn moves the migration to
+            # pre-migrating. If the evacuate fails on the destination host,
+            # the user can rebuild the instance (in ERROR state) on the source
+            # host.
+            'status': ['accepted', 'pre-migrating', 'done'],
            'migration_type': 'evacuation',
        }
        with utils.temporary_mutation(context, read_deleted='yes'):
@ -698,6 +705,7 @@ class ComputeManager(manager.Manager):

            migration.status = 'completed'
            migration.save()
+        return evacuations

    def _is_instance_storage_shared(self, context, instance, host=None):
        shared_storage = True
@ -1133,9 +1141,14 @@ class ComputeManager(manager.Manager):

        try:
            # checking that instance was not already evacuated to other host
-            self._destroy_evacuated_instances(context)
+            evacuated_instances = self._destroy_evacuated_instances(context)
+
+            # Initialise instances on the host that are not evacuating
            for instance in instances:
-                self._init_instance(context, instance)
+                if (not evacuated_instances or
+                        instance.uuid not in evacuated_instances):
+                    self._init_instance(context, instance)
+
        finally:
            if CONF.defer_iptables_apply:
                self.driver.filter_defer_apply_off()
--- a/nova/tests/functional/regressions/test_bug_1764883.py
+++ b/nova/tests/functional/regressions/test_bug_1764883.py
@ -25,13 +25,9 @@ class TestEvacuationWithSourceReturningDuringRebuild(
        test.TestCase, integrated_helpers.InstanceHelperMixin):
    """Assert the behaviour of evacuating instances when the src returns early.

-    This test asserts that evacuating instances end up in an ERROR state on the
-    source when that host comes back online during an evacuation while the
-    migration record is in a pre-migrating state.
-
-    This currently leads to a race between the source initialising the instance
-    and destination claim during the rebuild, both of which attempt to update
-    the underlying task_state of the instance.
+    This test asserts that evacuating instances end up in an ACTIVE state on
+    the destination even when the source host comes back online during an
+    evacuation while the migration record is in a pre-migrating state.
    """

    def setUp(self):
@ -113,8 +109,15 @@ class TestEvacuationWithSourceReturningDuringRebuild(
        # Start evacuating the instance from the source_host
        self.api.post_server_action(server['id'], {'evacuate': {}})

-        # FIXME(lyarwood): Assert that the evacuation fails at present with the
-        # instance remaining on the source in an ERROR state.
-        self._wait_for_state_change(self.api, server, 'ERROR')
+        # Wait for the instance to go into an ACTIVE state
+        self._wait_for_state_change(self.api, server, 'ACTIVE')
        server = self.api.get_server(server['id'])
-        self.assertEqual(self.source_compute, server['OS-EXT-SRV-ATTR:host'])
+        host = server['OS-EXT-SRV-ATTR:host']
+        migrations = self.api.get_migrations()
+
+        # Assert that we have a single `done` migration record after the evac
+        self.assertEqual(1, len(migrations))
+        self.assertEqual('done', migrations[0]['status'])
+
+        # Assert that the instance is now on the dest
+        self.assertNotEqual(self.source_compute, host)
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@ -7500,7 +7500,10 @@ class ComputeTestCase(BaseTestCase,

        mock_get_filter.assert_called_once_with(fake_context,
                                         {'source_compute': self.compute.host,
-                                          'status': ['accepted', 'done'],
+                                          'status': [
+                                              'accepted',
+                                              'pre-migrating',
+                                              'done'],
                                          'migration_type': 'evacuation'})
        mock_get_inst.assert_called_once_with(fake_context)
        mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@ -766,6 +766,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                                             mock.ANY, mock.ANY, mock.ANY)
        mock_save.assert_called_once_with()

+    @mock.patch.object(context, 'get_admin_context')
+    @mock.patch.object(objects.InstanceList, 'get_by_host')
+    @mock.patch.object(fake_driver.FakeDriver, 'init_host')
+    @mock.patch('nova.compute.manager.ComputeManager._init_instance')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                  '_destroy_evacuated_instances')
+    def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
+            mock_init_instance, mock_init_host, mock_host_get,
+            mock_admin_ctxt):
+        """Assert that init_instance is not called for instances that are
+           evacuating from the host during init_host.
+        """
+        active_instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, uuid=uuids.active_instance)
+        evacuating_instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, uuid=uuids.evac_instance)
+        instance_list = objects.InstanceList(self.context,
+            objects=[active_instance, evacuating_instance])
+
+        mock_host_get.return_value = instance_list
+        mock_admin_ctxt.return_value = self.context
+        mock_destroy_evac.return_value = {
+            uuids.evac_instance: evacuating_instance
+        }
+
+        self.compute.init_host()
+
+        mock_init_instance.assert_called_once_with(
+            self.context, active_instance)
+
    def test_init_instance_with_binding_failed_vif_type(self):
        # this instance will plug a 'binding_failed' vif
        instance = fake_instance.fake_instance_obj(