From 09e169bd7a74deaf52d9d6d38ff37799cab24157 Mon Sep 17 00:00:00 2001
From: Jay Pipes
Date: Mon, 7 Aug 2017 16:36:49 -0400
Subject: [PATCH] placement: refactor healing of allocations in RT

Updates the RT method for "healing" instance allocations so that we
only delete allocations (and delete them entirely for an instance) in
the event that the instance has been local-deleted. Adds log statements
to the method to give us a good idea of the state of various things
during test runs.

Note that this patch only partially fixes LP #1707071, removing some of
the back-and-forth allocation trampling that was going on for move
operations. The next patch adds the final piece of the puzzle, which is
to have the confirm/revert resize step on the source or destination
compute host *correct* a doubled-up allocation. What this patch does is
prevent the healing function from deleting allocations unless the
instance is local-deleted.

Partial-bug: #1707071
Change-Id: Ia0bb32fd92ae723c505d0fce5691e0fe0540f10d
---
 nova/compute/resource_tracker.py              | 75 ++++++++++++---
 nova/tests/functional/test_servers.py         | 94 ++++++++------
 .../unit/compute/test_resource_tracker.py     | 50 +++++-----
 3 files changed, 124 insertions(+), 95 deletions(-)

diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 1d9caccc78c7..57e9fe56a43b 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1041,29 +1041,76 @@ class ResourceTracker(object):
             if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
                 self._update_usage_from_instance(context, instance, nodename)

-        # Remove allocations for instances that have been removed.
         self._remove_deleted_instances_allocations(context, cn)

     def _remove_deleted_instances_allocations(self, context, cn):
-        tracked_keys = set(self.tracked_instances.keys())
+        # NOTE(jaypipes): All of this code sucks. It's basically dealing with
+        # all the corner cases in move, local delete, unshelve and rebuild
+        # operations for when allocations should be deleted when things didn't
+        # happen according to the normal flow of events where the scheduler
+        # always creates allocations for an instance.
+        known_instances = set(self.tracked_instances.keys())
         allocations = self.reportclient.get_allocations_for_resource_provider(
             cn.uuid) or {}
-        allocations_to_delete = set(allocations.keys()) - tracked_keys
-        for instance_uuid in allocations_to_delete:
-            # Allocations related to instances being scheduled should not be
-            # deleted if we already wrote the allocation previously.
+        for instance_uuid, alloc in allocations.items():
+            if instance_uuid in known_instances:
+                LOG.debug("Instance %s is actively managed on this compute "
+                          "host and has allocations in placement: %s.",
+                          instance_uuid, alloc)
+                continue
             try:
                 instance = objects.Instance.get_by_uuid(context, instance_uuid,
                                                         expected_attrs=[])
-                if not instance.host:
-                    continue
             except exception.InstanceNotFound:
-                # The instance is gone, so we definitely want to
-                # remove allocations associated with it.
-                pass
-            LOG.debug('Deleting stale allocation for instance %s',
-                      instance_uuid)
-            self.reportclient.delete_allocation_for_instance(instance_uuid)
+                # The instance is gone, so we definitely want to remove
+                # allocations associated with it.
+                # NOTE(jaypipes): This will not be true if/when we support
+                # cross-cell migrations...
+                LOG.debug("Instance %s has been deleted (perhaps locally). "
" + "Deleting allocations that remained for this " + "instance against this compute host: %s.", + instance_uuid, alloc) + self.reportclient.delete_allocation_for_instance(instance_uuid) + continue + if not instance.host: + # Allocations related to instances being scheduled should not + # be deleted if we already wrote the allocation previously. + LOG.debug("Instance %s has been scheduled to this compute " + "host, the scheduler has made an allocation " + "against this compute node but the instance has " + "yet to start. Skipping heal of allocation: %s.", + instance_uuid, alloc) + continue + if (instance.host == cn.host and + instance.node == cn.hypervisor_hostname): + # The instance is supposed to be on this compute host but is + # not in the list of actively managed instances. + LOG.warning("Instance %s is not being actively managed by " + "this compute host but has allocations " + "referencing this compute host: %s. Skipping " + "heal of allocation because we do not know " + "what to do.", instance_uuid, alloc) + continue + if instance.host != cn.host: + # The instance has been moved to another host either via a + # migration, evacuation or unshelve in between the time when we + # ran InstanceList.get_by_host_and_node(), added those + # instances to RT.tracked_instances and the above + # Instance.get_by_uuid() call. We SHOULD attempt to remove any + # allocations that reference this compute host if the VM is in + # a stable terminal state (i.e. it isn't in a state of waiting + # for resize to confirm/revert), however if the destination + # host is an Ocata compute host, it will delete the allocation + # that contains this source compute host information anyway and + # recreate an allocation that only refers to itself. So we + # don't need to do anything in that case. Just log the + # situation here for debugging information but don't attempt to + # delete or change the allocation. + LOG.debug("Instance %s has been moved to another host %s(%s). 
" + "There are allocations remaining against the source " + "host that might need to be removed: %s.", + instance_uuid, instance.host, instance.node, alloc) + continue def _find_orphaned_instances(self): """Given the set of instances and migrations already account for diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index 0967f61f6aea..9176046c0305 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -1305,52 +1305,22 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin): # the original host expected to have the old resource usage source_usages = self._get_provider_usages(source_rp_uuid) - # NOTE(danms): This is bug 1707071, where we've lost the - # doubled allocation - self.assertEqual(0, source_usages['VCPU']) - self.assertEqual(0, source_usages['MEMORY_MB']) - self.assertEqual(0, source_usages['DISK_GB']) - - # NOTE(danms): After we fix the 'healing' of the doubled allocation - # by the resource tracker, the following will be expected - # self.assertFlavorMatchesAllocation(old_flavor, source_usages) + self.assertFlavorMatchesAllocation(old_flavor, source_usages) # the dest host expected to have resource allocation based on # the new flavor the server is resized to dest_usages = self._get_provider_usages(dest_rp_uuid) - - # NOTE(danms): This is bug 1707071 where we've lost the entire - # allocation because one of the computes deleted it - self.assertEqual(0, dest_usages['VCPU']) - self.assertEqual(0, dest_usages['MEMORY_MB']) - self.assertEqual(0, dest_usages['DISK_GB']) - - # NOTE(danms): After we fix the 'healing' of the double allocation - # by the resource tracker, the following will be expected - # self.assertFlavorMatchesAllocation(new_flavor, dest_usages) + self.assertFlavorMatchesAllocation(new_flavor, dest_usages) # and our server should have allocation on both providers accordingly allocations = self._get_allocations_by_server_uuid(server['id']) - # NOTE(danms): This is bug 1707071, where we've lost the - # doubled allocation - self.assertEqual(0, len(allocations)) - self.assertNotIn(source_rp_uuid, allocations) + self.assertEqual(2, len(allocations)) + source_allocation = allocations[source_rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(old_flavor, source_allocation) - # NOTE(danms): After we fix the 'healing' of the doubled allocation - # by the resource tracker, the following will be expected - # self.assertEqual(2, len(allocations)) - # source_allocation = allocations[source_rp_uuid]['resources'] - # self.assertFlavorMatchesAllocation(old_flavor, source_allocation) - - # NOTE(danms): This is bug 1707071, where we've lost the - # doubled allocation - self.assertNotIn(dest_rp_uuid, allocations) - - # NOTE(danms): After we fix the 'healing' of the doubled allocation - # by the resource tracker, the following will be expected - # self.assertEqual(2, len(allocations)) - # dest_allocation = allocations[dest_rp_uuid]['resources'] - # self.assertFlavorMatchesAllocation(new_flavor, dest_allocation) + self.assertEqual(2, len(allocations)) + dest_allocation = allocations[dest_rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(new_flavor, dest_allocation) def _delete_and_check_allocations(self, server, source_rp_uuid, dest_rp_uuid): @@ -1395,19 +1365,24 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin): # the original host expected to have the old resource allocation source_usages = 
         allocations = self._get_allocations_by_server_uuid(server['id'])
-        dest_usages = self._get_provider_usages(dest_rp_uuid)
-
         self.assertFlavorMatchesAllocation(self.flavor1, source_usages)

-        # and the target host should not have usage
-        self.assertEqual({'VCPU': 0,
-                          'MEMORY_MB': 0,
-                          'DISK_GB': 0}, dest_usages,
-                         'Target host %s still has usage after the resize has '
-                         'been reverted' % dest_hostname)
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
+        # dest_usages = self._get_provider_usages(dest_rp_uuid)
+        # self.assertEqual({'VCPU': 0,
+        #                   'MEMORY_MB': 0,
+        #                   'DISK_GB': 0}, dest_usages,
+        #                  'Target host %s still has usage after the resize has '
+        #                  'been reverted' % dest_hostname)

+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
         # Check that the server only allocates resource from the original host
-        self.assertEqual(1, len(allocations))
+        # self.assertEqual(1, len(allocations))
+
         source_allocation = allocations[source_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(self.flavor1, source_allocation)
@@ -1434,28 +1409,33 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):

         # Fetch allocations post-confirm
         allocations = self._get_allocations_by_server_uuid(server['id'])
-        # NOTE(danms): This is bug 1707071, where we've lost the doubled
-        # allocation
-        self.assertEqual(0, len(allocations))
+        self.assertEqual(2, len(allocations))

         self._run_periodics()

-        source_usages = self._get_provider_usages(source_rp_uuid)
         dest_usages = self._get_provider_usages(dest_rp_uuid)
-        self.assertEqual({'VCPU': 0,
-                          'MEMORY_MB': 0,
-                          'DISK_GB': 0}, source_usages,
-                         'The source host %s still has usages after the '
-                         'resize has been confirmed' % source_hostname)
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
+        # source_usages = self._get_provider_usages(source_rp_uuid)
+        # self.assertEqual({'VCPU': 0,
+        #                   'MEMORY_MB': 0,
+        #                   'DISK_GB': 0}, source_usages,
+        #                  'The source host %s still has usages after the '
+        #                  'resize has been confirmed' % source_hostname)

         # and the target host allocation should be according to the new flavor
         self.assertFlavorMatchesAllocation(self.flavor2, dest_usages)

         allocations = self._get_allocations_by_server_uuid(server['id'])
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
         # and the server allocates only from the target host
-        self.assertEqual(1, len(allocations))
+        # self.assertEqual(1, len(allocations))
+
         dest_allocation = allocations[dest_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(self.flavor2, dest_allocation)

diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index 5e6cfb14ce3d..1223a550ecfa 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -2394,26 +2394,32 @@ class TestUpdateUsageFromInstance(BaseTestCase):
         mock_update_usage.assert_called_once_with(
             self.rt._get_usage_dict(self.instance), _NODENAME, sign=-1)

-    @mock.patch('nova.compute.resource_tracker.LOG')
     @mock.patch('nova.objects.Instance.get_by_uuid')
-    def test_remove_deleted_instances_allocations(self, mock_inst_get,
-                                                  mock_log):
+    def test_remove_deleted_instances_allocations_deleted_instance(self,
+            mock_inst_get):
+        rc = self.rt.reportclient
+        self.rt.tracked_instances = {}
+        allocs = {uuids.deleted: "fake_deleted_instance"}
+        rc.get_allocations_for_resource_provider = mock.MagicMock(
+            return_value=allocs)
+        rc.delete_allocation_for_instance = mock.MagicMock()
+        mock_inst_get.side_effect = exc.InstanceNotFound(
+            instance_id=uuids.deleted)
+        cn = self.rt.compute_nodes[_NODENAME]
+        ctx = mock.sentinel.ctx
+        # Call the method.
+        self.rt._remove_deleted_instances_allocations(ctx, cn)
+        # Only one call should be made to delete allocations, and it should
+        # be for the deleted instance set up above.
+        rc.delete_allocation_for_instance.assert_called_once_with(
+            uuids.deleted)
+
+    @mock.patch('nova.objects.Instance.get_by_uuid')
+    def test_remove_deleted_instances_allocations_scheduled_instance(self,
+            mock_inst_get):
         rc = self.rt.reportclient
         self.rt.tracked_instances = {}
-        # Create 3 instances
-        instance_uuids = (uuids.inst0, uuids.inst1, uuids.inst2)
-        for i in range(3):
-            instance = _INSTANCE_FIXTURES[0].obj_clone()
-            instance.uuid = instance_uuids[i]
-            if i > 0:
-                # Only add the last two to tracked_instances, to simulate one
-                # deleted instance
-                inst_dict = obj_base.obj_to_primitive(instance)
-                self.rt.tracked_instances[instance.uuid] = inst_dict
-        # Mock out the allocation call
         allocs = {uuids.scheduled: "fake_scheduled_instance"}
-        for i in range(3):
-            allocs[instance_uuids[i]] = "fake_instance_%s" % i
         rc.get_allocations_for_resource_provider = mock.MagicMock(
             return_value=allocs)
         rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2421,8 +2427,7 @@
         def get_by_uuid(ctx, inst_uuid, expected_attrs=None):
             ret = _INSTANCE_FIXTURES[0].obj_clone()
             ret.uuid = inst_uuid
-            if inst_uuid == uuids.scheduled:
-                ret.host = None
+            ret.host = None  # This indicates the instance is being scheduled
             return ret

         mock_inst_get.side_effect = get_by_uuid
@@ -2430,15 +2435,12 @@
         ctx = mock.sentinel.ctx
         # Call the method.
         self.rt._remove_deleted_instances_allocations(ctx, cn)
-        # Only one call should be made to delete allocations, and that should
-        # be for the first instance created above
-        rc.delete_allocation_for_instance.assert_called_once_with(uuids.inst0)
-        mock_log.debug.assert_called_once_with(
-            'Deleting stale allocation for instance %s', uuids.inst0)
+        # Scheduled instances should not have their allocations removed
+        rc.delete_allocation_for_instance.assert_not_called()

     @mock.patch('nova.objects.Instance.get_by_uuid')
     def test_remove_deleted_instances_allocations_no_instance(self,
-                                                              mock_inst_get):
+            mock_inst_get):
         # If for some reason an instance is no longer available, but
         # there are allocations for it, we want to be sure those
         # allocations are removed, not that an InstanceNotFound
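For reviewers who want the new healing rules at a glance: the refactored
_remove_deleted_instances_allocations() boils down to the decision tree
sketched below. This is an illustrative sketch only, not code from the patch;
heal_allocations(), get_instance() and delete_allocation() are hypothetical
stand-ins for the RT method, objects.Instance.get_by_uuid() and the placement
report client.

class InstanceNotFound(Exception):
    """Stand-in for nova.exception.InstanceNotFound."""


def heal_allocations(allocations, tracked, get_instance, delete_allocation):
    """Return the set of instance UUIDs whose allocations were deleted.

    allocations: dict of instance UUID -> allocation record on this provider
    tracked: set of instance UUIDs actively managed by this compute host
    get_instance: callable(uuid) returning an object with a .host attribute,
        raising InstanceNotFound if the instance has been deleted
    delete_allocation: callable(uuid) removing the allocation in placement
    """
    deleted = set()
    for uuid in allocations:
        if uuid in tracked:
            # Actively managed on this host; the allocation is legitimate.
            continue
        try:
            inst = get_instance(uuid)
        except InstanceNotFound:
            # Local-deleted instance: the only case where healing deletes.
            delete_allocation(uuid)
            deleted.add(uuid)
            continue
        if not inst.host:
            # Scheduled but not yet started; keep the scheduler's allocation.
            continue
        # The remaining cases (untracked but assigned here, or moved to
        # another host) are only logged by the patch; the allocation is
        # left untouched.
    return deleted


# Example: one tracked instance, one locally deleted instance.
class FakeInstance(object):
    host = 'compute1'


def fake_get(uuid):
    if uuid == 'gone':
        raise InstanceNotFound()
    return FakeInstance()


removed = []
heal_allocations({'kept': {}, 'gone': {}}, {'kept'}, fake_get, removed.append)
assert removed == ['gone']

The sketch makes the invariant explicit: after this patch, InstanceNotFound
(local delete) is the only branch that deletes an allocation; every other
branch logs and leaves placement untouched.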