Merge "cleanup evacuated instances not on hypervisor" into stable/queens

2019-11-05 13:50:11 +00:00 · 2019-11-05 13:50:11 +00:00 · 2a9ffac1c6
parent e72c6a3a49 52e68f121e
commit 2a9ffac1c6
2 changed files with 121 additions and 13 deletions
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@ -641,6 +641,9 @@ class ComputeManager(manager.Manager):
        compared to the instances for the migration records and those local
        guests are destroyed, along with instance allocation records in
        Placement for this node.
+        Then allocations are removed from Placement for every instance that is
+        evacuated from this host regardless if the instance is reported by the
+        hypervisor or not.
        """
        filters = {
            'source_compute': self.host,
@ -672,18 +675,14 @@ class ComputeManager(manager.Manager):
        # we know we'll use, like info_cache and flavor. We can also replace
        # this with a generic solution: https://review.openstack.org/575190/
        local_instances = self._get_instances_on_driver(read_deleted_context)
-        evacuated = [inst for inst in local_instances
-                     if inst.uuid in evacuations]
+        evacuated_local_instances = {inst.uuid: inst
+                                     for inst in local_instances
+                                     if inst.uuid in evacuations}

-        # NOTE(gibi): We are called from init_host and at this point the
-        # compute_nodes of the resource tracker has not been populated yet so
-        # we cannot rely on the resource tracker here.
-        compute_nodes = {}
-
-        for instance in evacuated:
-            migration = evacuations[instance.uuid]
-            LOG.info('Deleting instance as it has been evacuated from '
-                     'this host', instance=instance)
+        for instance in evacuated_local_instances.values():
+            LOG.info('Destroying instance as it has been evacuated from '
+                     'this host but still exists in the hypervisor',
+                     instance=instance)
            try:
                network_info = self.network_api.get_instance_nw_info(
                    context, instance)
@ -703,7 +702,28 @@ class ComputeManager(manager.Manager):
                                network_info,
                                bdi, destroy_disks)

-            # delete the allocation of the evacuated instance from this host
+        # NOTE(gibi): We are called from init_host and at this point the
+        # compute_nodes of the resource tracker has not been populated yet so
+        # we cannot rely on the resource tracker here.
+        compute_nodes = {}
+
+        for instance_uuid, migration in evacuations.items():
+            try:
+                if instance_uuid in evacuated_local_instances:
+                    # Avoid the db call if we already have the instance loaded
+                    # above
+                    instance = evacuated_local_instances[instance_uuid]
+                else:
+                    instance = objects.Instance.get_by_uuid(
+                        context, instance_uuid)
+            except exception.InstanceNotFound:
+                # The instance already deleted so we expect that every
+                # allocation of that instance has already been cleaned up
+                continue
+
+            LOG.info('Cleaning up allocations of the instance as it has been '
+                     'evacuated from this host',
+                     instance=instance)
            if migration.source_node not in compute_nodes:
                try:
                    cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@ -3745,7 +3745,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
            # both instance_1 and instance_2 is destroyed in the driver
            destroy.assert_has_calls(
                [mock.call(self.context, instance_1, None, {}, True),
-                 mock.call(self.context, instance_2, None, {}, True)])
+                 mock.call(self.context, instance_2, None, {}, True)],
+                any_order=True)

            # but only instance_2 is deallocated as the compute node for
            # instance_1 is already deleted
@ -3755,6 +3756,93 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):

            self.assertEqual(2, get_node.call_count)

+    def test_destroy_evacuated_instances_not_on_hypervisor(self):
+        our_host = self.compute.host
+        flavor = objects.Flavor()
+        instance_1 = objects.Instance(self.context, flavor=flavor)
+        instance_1.uuid = uuids.instance_1
+        instance_1.task_state = None
+        instance_1.vm_state = vm_states.ACTIVE
+        instance_1.host = 'not-' + our_host
+        instance_1.user_id = uuids.user_id
+        instance_1.project_id = uuids.project_id
+        instance_1.deleted = False
+
+        migration_1 = objects.Migration(instance_uuid=instance_1.uuid)
+        migration_1.status = 'done'
+        migration_1.source_node = 'our-node'
+
+        our_node = objects.ComputeNode(
+            host=our_host, uuid=uuids.our_node_uuid)
+
+        with test.nested(
+            mock.patch.object(self.compute, '_get_instances_on_driver',
+                               return_value=[]),
+            mock.patch.object(self.compute.driver, 'destroy'),
+            mock.patch('nova.objects.MigrationList.get_by_filters'),
+            mock.patch('nova.objects.Migration.save'),
+            mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename'),
+            mock.patch('nova.scheduler.utils.resources_from_flavor'),
+            mock.patch.object(self.compute.reportclient,
+                              'remove_provider_from_instance_allocation'),
+            mock.patch('nova.objects.Instance.get_by_uuid')
+        ) as (_get_intances_on_driver, destroy, migration_list, migration_save,
+              get_node, get_resources, remove_allocation,
+              instance_get_by_uuid):
+            migration_list.return_value = [migration_1]
+            instance_get_by_uuid.return_value = instance_1
+            get_node.return_value = our_node
+            get_resources.return_value = mock.sentinel.resources
+
+            self.compute._destroy_evacuated_instances(self.context)
+
+            # nothing to be destroyed as the driver returned no instances on
+            # the hypervisor
+            self.assertFalse(destroy.called)
+
+            # but our only instance still cleaned up in placement
+            remove_allocation.assert_called_once_with(
+                self.context, instance_1.uuid, uuids.our_node_uuid,
+                instance_1.user_id, instance_1.project_id,
+                mock.sentinel.resources)
+            instance_get_by_uuid.assert_called_once_with(
+                self.context, instance_1.uuid)
+            get_node.assert_called_once_with(
+                self.context, our_host, 'our-node')
+
+    def test_destroy_evacuated_instances_not_on_hyp_and_instance_deleted(self):
+        migration_1 = objects.Migration(instance_uuid=uuids.instance_1)
+        migration_1.status = 'done'
+        migration_1.source_node = 'our-node'
+
+        with test.nested(
+            mock.patch.object(self.compute, '_get_instances_on_driver',
+                               return_value=[]),
+            mock.patch.object(self.compute.driver, 'destroy'),
+            mock.patch('nova.objects.MigrationList.get_by_filters'),
+            mock.patch('nova.objects.Migration.save'),
+            mock.patch('nova.scheduler.utils.resources_from_flavor'),
+            mock.patch.object(self.compute.reportclient,
+                              'remove_provider_from_instance_allocation'),
+            mock.patch('nova.objects.Instance.get_by_uuid')
+        ) as (_get_instances_on_driver,
+              destroy, migration_list, migration_save, get_resources,
+              remove_allocation, instance_get_by_uuid):
+            migration_list.return_value = [migration_1]
+            instance_get_by_uuid.side_effect = exception.InstanceNotFound(
+                instance_id=uuids.instance_1)
+
+            self.compute._destroy_evacuated_instances(self.context)
+
+            # nothing to be destroyed as the driver returned no instances on
+            # the hypervisor
+            self.assertFalse(destroy.called)
+
+            instance_get_by_uuid.assert_called_once_with(
+                self.context, uuids.instance_1)
+            # nothing to be cleaned as the instance was deleted already
+            self.assertFalse(remove_allocation.called)
+
    @mock.patch('nova.compute.manager.ComputeManager.'
                '_destroy_evacuated_instances')
    @mock.patch('nova.compute.manager.LOG')