From 52e68f121eda49dbb817404d3ab1468c2059e1a3 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Tue, 17 Oct 2017 15:06:59 +0200
Subject: [PATCH] cleanup evacuated instances not on hypervisor

When the compute host recovers from a failure, the compute manager
destroys instances that were evacuated from the host while it was down.
However, these code paths only consider evacuated instances that are
still reported by the hypervisor. This means that if the compute host
is recovered in a way that the hypervisor loses the definition of the
instances (for example, the compute host was redeployed), then the
allocations of these instances will not be deleted.

This patch makes sure that the instance allocation is cleaned up even
if the driver does not report the instance as existing on the
hypervisor.

Note that functional test coverage will be added on top, as it needs
some refactoring that would make the bugfix non-backportable.

Change-Id: I4bc81b482380c5778781659c4d167a712316dab4
Closes-Bug: #1724172
(cherry picked from commit 9cacaad14e8c18e99e85d9dc04308fee91303f8f)
(cherry picked from commit b8f2cd689f0a747778080ba4b6e148e71eb53085)
(cherry picked from commit 09de94e39bbcfc7f8130638e73a8248e49cb6ab7)
---
 nova/compute/manager.py                     | 44 +++++++---
 nova/tests/unit/compute/test_compute_mgr.py | 90 ++++++++++++++++++++-
 2 files changed, 121 insertions(+), 13 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 0462d12c25d6..bec8165a2f19 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -641,6 +641,9 @@ class ComputeManager(manager.Manager):
         compared to the instances for the migration records and those
         local guests are destroyed, along with instance allocation records
         in Placement for this node.
+        Then allocations are removed from Placement for every instance that
+        is evacuated from this host regardless of whether the instance is
+        reported by the hypervisor or not.
         """
         filters = {
             'source_compute': self.host,
@@ -672,18 +675,14 @@ class ComputeManager(manager.Manager):
         # we know we'll use, like info_cache and flavor. We can also replace
         # this with a generic solution: https://review.openstack.org/575190/
         local_instances = self._get_instances_on_driver(read_deleted_context)
-        evacuated = [inst for inst in local_instances
-                     if inst.uuid in evacuations]
+        evacuated_local_instances = {inst.uuid: inst
+                                     for inst in local_instances
+                                     if inst.uuid in evacuations}
 
-        # NOTE(gibi): We are called from init_host and at this point the
-        # compute_nodes of the resource tracker has not been populated yet so
-        # we cannot rely on the resource tracker here.
-        compute_nodes = {}
-
-        for instance in evacuated:
-            migration = evacuations[instance.uuid]
-            LOG.info('Deleting instance as it has been evacuated from '
-                     'this host', instance=instance)
+        for instance in evacuated_local_instances.values():
+            LOG.info('Destroying instance as it has been evacuated from '
+                     'this host but still exists in the hypervisor',
+                     instance=instance)
             try:
                 network_info = self.network_api.get_instance_nw_info(
                     context, instance)
@@ -703,7 +702,28 @@ class ComputeManager(manager.Manager):
                                 network_info,
                                 bdi, destroy_disks)
 
-            # delete the allocation of the evacuated instance from this host
+        # NOTE(gibi): We are called from init_host and at this point the
+        # compute_nodes of the resource tracker has not been populated yet so
+        # we cannot rely on the resource tracker here.
+        compute_nodes = {}
+
+        for instance_uuid, migration in evacuations.items():
+            try:
+                if instance_uuid in evacuated_local_instances:
+                    # Avoid the db call if we already have the instance loaded
+                    # above
+                    instance = evacuated_local_instances[instance_uuid]
+                else:
+                    instance = objects.Instance.get_by_uuid(
+                        context, instance_uuid)
+            except exception.InstanceNotFound:
+                # The instance is already deleted, so we expect that every
+                # allocation of that instance has already been cleaned up
+                continue
+
+            LOG.info('Cleaning up allocations of the instance as it has been '
+                     'evacuated from this host',
+                     instance=instance)
             if migration.source_node not in compute_nodes:
                 try:
                     cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 932e558994df..83122d3b5044 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -3745,7 +3745,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         # both instance_1 and instance_2 is destroyed in the driver
         destroy.assert_has_calls(
             [mock.call(self.context, instance_1, None, {}, True),
-             mock.call(self.context, instance_2, None, {}, True)])
+             mock.call(self.context, instance_2, None, {}, True)],
+            any_order=True)
 
         # but only instance_2 is deallocated as the compute node for
         # instance_1 is already deleted
@@ -3755,6 +3756,93 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
 
         self.assertEqual(2, get_node.call_count)
 
+    def test_destroy_evacuated_instances_not_on_hypervisor(self):
+        our_host = self.compute.host
+        flavor = objects.Flavor()
+        instance_1 = objects.Instance(self.context, flavor=flavor)
+        instance_1.uuid = uuids.instance_1
+        instance_1.task_state = None
+        instance_1.vm_state = vm_states.ACTIVE
+        instance_1.host = 'not-' + our_host
+        instance_1.user_id = uuids.user_id
+        instance_1.project_id = uuids.project_id
+        instance_1.deleted = False
+
+        migration_1 = objects.Migration(instance_uuid=instance_1.uuid)
+        migration_1.status = 'done'
+        migration_1.source_node = 'our-node'
+
+        our_node = objects.ComputeNode(
+            host=our_host, uuid=uuids.our_node_uuid)
+
+        with test.nested(
+            mock.patch.object(self.compute, '_get_instances_on_driver',
+                              return_value=[]),
+            mock.patch.object(self.compute.driver, 'destroy'),
+            mock.patch('nova.objects.MigrationList.get_by_filters'),
+            mock.patch('nova.objects.Migration.save'),
+            mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename'),
+            mock.patch('nova.scheduler.utils.resources_from_flavor'),
+            mock.patch.object(self.compute.reportclient,
+                              'remove_provider_from_instance_allocation'),
+            mock.patch('nova.objects.Instance.get_by_uuid')
+        ) as (_get_instances_on_driver, destroy, migration_list,
+              migration_save, get_node, get_resources, remove_allocation,
+              instance_get_by_uuid):
+            migration_list.return_value = [migration_1]
+            instance_get_by_uuid.return_value = instance_1
+            get_node.return_value = our_node
+            get_resources.return_value = mock.sentinel.resources
+
+            self.compute._destroy_evacuated_instances(self.context)
+
+            # nothing to be destroyed as the driver returned no instances on
+            # the hypervisor
+            self.assertFalse(destroy.called)
+
+            # but our only instance is still cleaned up in placement
+            remove_allocation.assert_called_once_with(
+                self.context, instance_1.uuid, uuids.our_node_uuid,
+                instance_1.user_id, instance_1.project_id,
+                mock.sentinel.resources)
+            instance_get_by_uuid.assert_called_once_with(
+                self.context, instance_1.uuid)
+            get_node.assert_called_once_with(
+                self.context, our_host, 'our-node')
+
+    def test_destroy_evacuated_instances_not_on_hyp_and_instance_deleted(self):
+        migration_1 = objects.Migration(instance_uuid=uuids.instance_1)
+        migration_1.status = 'done'
+        migration_1.source_node = 'our-node'
+
+        with test.nested(
+            mock.patch.object(self.compute, '_get_instances_on_driver',
+                              return_value=[]),
+            mock.patch.object(self.compute.driver, 'destroy'),
+            mock.patch('nova.objects.MigrationList.get_by_filters'),
+            mock.patch('nova.objects.Migration.save'),
+            mock.patch('nova.scheduler.utils.resources_from_flavor'),
+            mock.patch.object(self.compute.reportclient,
+                              'remove_provider_from_instance_allocation'),
+            mock.patch('nova.objects.Instance.get_by_uuid')
+        ) as (_get_instances_on_driver,
+              destroy, migration_list, migration_save, get_resources,
+              remove_allocation, instance_get_by_uuid):
+            migration_list.return_value = [migration_1]
+            instance_get_by_uuid.side_effect = exception.InstanceNotFound(
+                instance_id=uuids.instance_1)
+
+            self.compute._destroy_evacuated_instances(self.context)
+
+            # nothing to be destroyed as the driver returned no instances on
+            # the hypervisor
+            self.assertFalse(destroy.called)
+
+            instance_get_by_uuid.assert_called_once_with(
+                self.context, uuids.instance_1)
+            # nothing to be cleaned as the instance was deleted already
+            self.assertFalse(remove_allocation.called)
+
     @mock.patch('nova.compute.manager.ComputeManager.'
                 '_destroy_evacuated_instances')
     @mock.patch('nova.compute.manager.LOG')
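For readers following the change outside the diff context, the standalone sketch below mirrors the two-phase flow that the patched _destroy_evacuated_instances now follows: first destroy only the evacuated guests the hypervisor still reports, then drop the Placement allocation for every evacuated instance whether or not the driver reported it, skipping instances whose DB record is already gone. This is an illustrative simplification, not the Nova code: InstanceNotFound, destroy_guest, get_instance and remove_allocation are stand-in names, and migration records are assumed to expose only a source_node attribute.

# Illustrative sketch only -- simplified stand-ins, not the Nova objects/APIs.

class InstanceNotFound(Exception):
    """Stand-in for the lookup error raised when the instance row is gone."""


def cleanup_evacuated(evacuations, local_instances,
                      destroy_guest, get_instance, remove_allocation):
    """Two-phase cleanup of instances evacuated away from this host.

    evacuations: dict mapping instance uuid -> migration record
    local_instances: dict mapping instance uuid -> guest still reported
        by the hypervisor
    """
    # Phase 1: destroy only the evacuated guests the driver still knows about.
    for uuid, guest in local_instances.items():
        if uuid in evacuations:
            destroy_guest(guest)

    # Phase 2: remove allocations for every evacuated instance, even those
    # the hypervisor no longer reports (e.g. after a host redeploy).
    for uuid, migration in evacuations.items():
        try:
            instance = local_instances.get(uuid) or get_instance(uuid)
        except InstanceNotFound:
            # Instance already deleted: its allocations were removed with it.
            continue
        remove_allocation(instance, migration.source_node)

A unit test can drive this sketch with plain dicts and recording callables, much like the mocked tests added to test_compute_mgr.py above exercise the real method.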