placement: refactor healing of allocations in RT

Updates the RT method for "healing" instance allocations so that we
only delete allocations (and delete them entirely for an instance)
when the instance has been local-deleted. Adds log statements to the
method to give us a clear picture of what the state of instances and
allocations is during test runs.

Note that this patch only partially fixes LP #1707071, by removing
some of the back-and-forth allocation trampling that was going on for
move operations. The next patch adds the final piece of the puzzle,
which is to have the confirm/revert resize step on the source or
destination compute host *correct* a doubled-up allocation. What this
patch does is prevent the healing function from deleting allocations
unless the instance has been local-deleted.

Partial-bug: #1707071

Change-Id: Ia0bb32fd92ae723c505d0fce5691e0fe0540f10d
Jay Pipes 2017-08-07 16:36:49 -04:00
parent 2651b27b68
commit 09e169bd7a
3 changed files with 124 additions and 95 deletions
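
For review context, the decision flow of the refactored heal method can be
summarized as follows. This is a minimal standalone sketch mirroring the
resource_tracker.py diff below; FakeInstance, FakeNode and the callables are
invented stand-ins for nova's objects and report client, not nova code:

# Sketch of the branch ordering in the refactored
# _remove_deleted_instances_allocations(); all names here are hypothetical.
from dataclasses import dataclass
from typing import Callable, Optional


@dataclass
class FakeInstance:
    host: Optional[str]
    node: Optional[str]


@dataclass
class FakeNode:
    host: str
    hypervisor_hostname: str


def heal_allocations(allocations: dict, known_instances: set, cn: FakeNode,
                     lookup_instance: Callable, delete_allocation: Callable):
    for instance_uuid, alloc in allocations.items():
        if instance_uuid in known_instances:
            # Actively managed on this host; the allocation is correct.
            continue
        instance = lookup_instance(instance_uuid)  # None ~= InstanceNotFound
        if instance is None:
            # Local-deleted instance: the only case where we now delete
            # the allocation outright.
            delete_allocation(instance_uuid)
            continue
        if not instance.host:
            # Scheduled here but not yet started; keep the allocation.
            continue
        if (instance.host == cn.host and
                instance.node == cn.hypervisor_hostname):
            # Supposedly ours but not tracked; warn and leave it alone.
            continue
        # Otherwise the instance moved (migration/evacuation/unshelve):
        # log only, since an Ocata destination rewrites the allocation.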

nova/compute/resource_tracker.py

@@ -1041,29 +1041,76 @@ class ResourceTracker(object):
             if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
                 self._update_usage_from_instance(context, instance, nodename)
 
         # Remove allocations for instances that have been removed.
         self._remove_deleted_instances_allocations(context, cn)
 
     def _remove_deleted_instances_allocations(self, context, cn):
-        tracked_keys = set(self.tracked_instances.keys())
+        # NOTE(jaypipes): All of this code sucks. It's basically dealing with
+        # all the corner cases in move, local delete, unshelve and rebuild
+        # operations for when allocations should be deleted when things didn't
+        # happen according to the normal flow of events where the scheduler
+        # always creates allocations for an instance
+        known_instances = set(self.tracked_instances.keys())
         allocations = self.reportclient.get_allocations_for_resource_provider(
                 cn.uuid) or {}
-        allocations_to_delete = set(allocations.keys()) - tracked_keys
-        for instance_uuid in allocations_to_delete:
-            # Allocations related to instances being scheduled should not be
-            # deleted if we already wrote the allocation previously.
+        for instance_uuid, alloc in allocations.items():
+            if instance_uuid in known_instances:
+                LOG.debug("Instance %s actively managed on this compute host "
+                          "and has allocations in placement: %s.",
+                          instance_uuid, alloc)
+                continue
             try:
                 instance = objects.Instance.get_by_uuid(context, instance_uuid,
                                                         expected_attrs=[])
-                if not instance.host:
-                    continue
             except exception.InstanceNotFound:
-                # The instance is gone, so we definitely want to
-                # remove allocations associated with it.
-                pass
-            LOG.debug('Deleting stale allocation for instance %s',
-                      instance_uuid)
-            self.reportclient.delete_allocation_for_instance(instance_uuid)
+                # The instance is gone, so we definitely want to remove
+                # allocations associated with it.
+                # NOTE(jaypipes): This will not be true if/when we support
+                # cross-cell migrations...
+                LOG.debug("Instance %s has been deleted (perhaps locally). "
+                          "Deleting allocations that remained for this "
+                          "instance against this compute host: %s.",
+                          instance_uuid, alloc)
+                self.reportclient.delete_allocation_for_instance(instance_uuid)
+                continue
+            if not instance.host:
+                # Allocations related to instances being scheduled should not
+                # be deleted if we already wrote the allocation previously.
+                LOG.debug("Instance %s has been scheduled to this compute "
+                          "host, the scheduler has made an allocation "
+                          "against this compute node but the instance has "
+                          "yet to start. Skipping heal of allocation: %s.",
+                          instance_uuid, alloc)
+                continue
+            if (instance.host == cn.host and
+                    instance.node == cn.hypervisor_hostname):
+                # The instance is supposed to be on this compute host but is
+                # not in the list of actively managed instances.
+                LOG.warning("Instance %s is not being actively managed by "
+                            "this compute host but has allocations "
+                            "referencing this compute host: %s. Skipping "
+                            "heal of allocation because we do not know "
+                            "what to do.", instance_uuid, alloc)
+                continue
+            if instance.host != cn.host:
+                # The instance has been moved to another host either via a
+                # migration, evacuation or unshelve in between the time when we
+                # ran InstanceList.get_by_host_and_node(), added those
+                # instances to RT.tracked_instances and the above
+                # Instance.get_by_uuid() call. We SHOULD attempt to remove any
+                # allocations that reference this compute host if the VM is in
+                # a stable terminal state (i.e. it isn't in a state of waiting
+                # for resize to confirm/revert), however if the destination
+                # host is an Ocata compute host, it will delete the allocation
+                # that contains this source compute host information anyway and
+                # recreate an allocation that only refers to itself. So we
+                # don't need to do anything in that case. Just log the
+                # situation here for debugging information but don't attempt to
+                # delete or change the allocation.
+                LOG.debug("Instance %s has been moved to another host %s(%s). "
+                          "There are allocations remaining against the source "
+                          "host that might need to be removed: %s.",
+                          instance_uuid, instance.host, instance.node, alloc)
+                continue
 
     def _find_orphaned_instances(self):
         """Given the set of instances and migrations already account for

nova/tests/functional/test_servers.py

@@ -1305,52 +1305,22 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         # the original host expected to have the old resource usage
         source_usages = self._get_provider_usages(source_rp_uuid)
-        # NOTE(danms): This is bug 1707071, where we've lost the
-        # doubled allocation
-        self.assertEqual(0, source_usages['VCPU'])
-        self.assertEqual(0, source_usages['MEMORY_MB'])
-        self.assertEqual(0, source_usages['DISK_GB'])
-        # NOTE(danms): After we fix the 'healing' of the doubled allocation
-        # by the resource tracker, the following will be expected
-        # self.assertFlavorMatchesAllocation(old_flavor, source_usages)
+        self.assertFlavorMatchesAllocation(old_flavor, source_usages)
 
         # the dest host expected to have resource allocation based on
         # the new flavor the server is resized to
         dest_usages = self._get_provider_usages(dest_rp_uuid)
-        # NOTE(danms): This is bug 1707071 where we've lost the entire
-        # allocation because one of the computes deleted it
-        self.assertEqual(0, dest_usages['VCPU'])
-        self.assertEqual(0, dest_usages['MEMORY_MB'])
-        self.assertEqual(0, dest_usages['DISK_GB'])
-        # NOTE(danms): After we fix the 'healing' of the double allocation
-        # by the resource tracker, the following will be expected
-        # self.assertFlavorMatchesAllocation(new_flavor, dest_usages)
+        self.assertFlavorMatchesAllocation(new_flavor, dest_usages)
 
         # and our server should have allocation on both providers accordingly
         allocations = self._get_allocations_by_server_uuid(server['id'])
-        # NOTE(danms): This is bug 1707071, where we've lost the
-        # doubled allocation
-        self.assertEqual(0, len(allocations))
-        self.assertNotIn(source_rp_uuid, allocations)
-        # NOTE(danms): After we fix the 'healing' of the doubled allocation
-        # by the resource tracker, the following will be expected
-        # self.assertEqual(2, len(allocations))
-        # source_allocation = allocations[source_rp_uuid]['resources']
-        # self.assertFlavorMatchesAllocation(old_flavor, source_allocation)
-        # NOTE(danms): This is bug 1707071, where we've lost the
-        # doubled allocation
-        self.assertNotIn(dest_rp_uuid, allocations)
-        # NOTE(danms): After we fix the 'healing' of the doubled allocation
-        # by the resource tracker, the following will be expected
-        # self.assertEqual(2, len(allocations))
-        # dest_allocation = allocations[dest_rp_uuid]['resources']
-        # self.assertFlavorMatchesAllocation(new_flavor, dest_allocation)
+        self.assertEqual(2, len(allocations))
+        source_allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(old_flavor, source_allocation)
+        self.assertEqual(2, len(allocations))
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(new_flavor, dest_allocation)
 
     def _delete_and_check_allocations(self, server, source_rp_uuid,
                                       dest_rp_uuid):
@@ -1395,19 +1365,24 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         # the original host expected to have the old resource allocation
         source_usages = self._get_provider_usages(source_rp_uuid)
         allocations = self._get_allocations_by_server_uuid(server['id'])
-        dest_usages = self._get_provider_usages(dest_rp_uuid)
         self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
 
-        # and the target host should not have usage
-        self.assertEqual({'VCPU': 0,
-                          'MEMORY_MB': 0,
-                          'DISK_GB': 0}, dest_usages,
-                         'Target host %s still has usage after the resize has '
-                         'been reverted' % dest_hostname)
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
+        # dest_usages = self._get_provider_usages(dest_rp_uuid)
+        # self.assertEqual({'VCPU': 0,
+        #                   'MEMORY_MB': 0,
+        #                   'DISK_GB': 0}, dest_usages,
+        #                  'Target host %s still has usage after the resize has '
+        #                  'been reverted' % dest_hostname)
 
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
         # Check that the server only allocates resource from the original host
-        self.assertEqual(1, len(allocations))
+        # self.assertEqual(1, len(allocations))
 
         source_allocation = allocations[source_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(self.flavor1, source_allocation)
@@ -1434,28 +1409,33 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         # Fetch allocations post-confirm
         allocations = self._get_allocations_by_server_uuid(server['id'])
-        # NOTE(danms): This is bug 1707071, where we've lost the doubled
-        # allocation
-        self.assertEqual(0, len(allocations))
+        self.assertEqual(2, len(allocations))
 
         self._run_periodics()
 
-        source_usages = self._get_provider_usages(source_rp_uuid)
         dest_usages = self._get_provider_usages(dest_rp_uuid)
-        self.assertEqual({'VCPU': 0,
-                          'MEMORY_MB': 0,
-                          'DISK_GB': 0}, source_usages,
-                         'The source host %s still has usages after the '
-                         'resize has been confirmed' % source_hostname)
+
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
+        # source_usages = self._get_provider_usages(source_rp_uuid)
+        # self.assertEqual({'VCPU': 0,
+        #                   'MEMORY_MB': 0,
+        #                   'DISK_GB': 0}, source_usages,
+        #                  'The source host %s still has usages after the '
+        #                  'resize has been confirmed' % source_hostname)
 
         # and the target host allocation should be according to the new flavor
         self.assertFlavorMatchesAllocation(self.flavor2, dest_usages)
 
         allocations = self._get_allocations_by_server_uuid(server['id'])
 
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
         # and the server allocates only from the target host
-        self.assertEqual(1, len(allocations))
+        # self.assertEqual(1, len(allocations))
 
         dest_allocation = allocations[dest_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(self.flavor2, dest_allocation)
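
The unit tests in the next file isolate individual branches of the heal loop
by stubbing the report client and Instance.get_by_uuid. A condensed,
self-contained sketch of that stubbing pattern; the exception class and UUID
here are stand-ins rather than nova imports:

# Sketch of the stubbing pattern in the unit tests below; InstanceNotFound
# here is a stand-in for nova.exception.InstanceNotFound.
from unittest import mock


class InstanceNotFound(Exception):
    pass


DELETED_UUID = '11111111-2222-3333-4444-555555555555'

rc = mock.Mock()
rc.get_allocations_for_resource_provider.return_value = {
    DELETED_UUID: 'fake_deleted_instance'}

# Raising InstanceNotFound from the instance lookup drives the heal loop
# into its delete branch; returning an object with host=None would drive
# the "scheduled, skip" branch instead.
inst_get = mock.Mock(side_effect=InstanceNotFound(DELETED_UUID))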

nova/tests/unit/compute/test_resource_tracker.py

@@ -2394,26 +2394,32 @@ class TestUpdateUsageFromInstance(BaseTestCase):
         mock_update_usage.assert_called_once_with(
             self.rt._get_usage_dict(self.instance), _NODENAME, sign=-1)
 
-    @mock.patch('nova.compute.resource_tracker.LOG')
     @mock.patch('nova.objects.Instance.get_by_uuid')
-    def test_remove_deleted_instances_allocations(self, mock_inst_get,
-                                                  mock_log):
+    def test_remove_deleted_instances_allocations_deleted_instance(self,
+            mock_inst_get):
         rc = self.rt.reportclient
         self.rt.tracked_instances = {}
-        # Create 3 instances
-        instance_uuids = (uuids.inst0, uuids.inst1, uuids.inst2)
-        for i in range(3):
-            instance = _INSTANCE_FIXTURES[0].obj_clone()
-            instance.uuid = instance_uuids[i]
-            if i > 0:
-                # Only add the last two to tracked_instances, to simulate one
-                # deleted instance
-                inst_dict = obj_base.obj_to_primitive(instance)
-                self.rt.tracked_instances[instance.uuid] = inst_dict
-
-        # Mock out the allocation call
+        allocs = {uuids.deleted: "fake_deleted_instance"}
+        rc.get_allocations_for_resource_provider = mock.MagicMock(
+            return_value=allocs)
+        rc.delete_allocation_for_instance = mock.MagicMock()
+        mock_inst_get.side_effect = exc.InstanceNotFound(
+            instance_id=uuids.deleted)
+        cn = self.rt.compute_nodes[_NODENAME]
+        ctx = mock.sentinel.ctx
+        # Call the method.
+        self.rt._remove_deleted_instances_allocations(ctx, cn)
+        # Only one call should be made to delete allocations, and that should
+        # be for the first instance created above
+        rc.delete_allocation_for_instance.assert_called_once_with(
+            uuids.deleted)
+
+    @mock.patch('nova.objects.Instance.get_by_uuid')
+    def test_remove_deleted_instances_allocations_scheduled_instance(self,
+            mock_inst_get):
+        rc = self.rt.reportclient
+        self.rt.tracked_instances = {}
         allocs = {uuids.scheduled: "fake_scheduled_instance"}
-        for i in range(3):
-            allocs[instance_uuids[i]] = "fake_instance_%s" % i
         rc.get_allocations_for_resource_provider = mock.MagicMock(
             return_value=allocs)
         rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2421,8 +2427,7 @@ class TestUpdateUsageFromInstance(BaseTestCase):
         def get_by_uuid(ctx, inst_uuid, expected_attrs=None):
             ret = _INSTANCE_FIXTURES[0].obj_clone()
             ret.uuid = inst_uuid
-            if inst_uuid == uuids.scheduled:
-                ret.host = None
+            ret.host = None  # This indicates the instance is being scheduled
             return ret
 
         mock_inst_get.side_effect = get_by_uuid
@@ -2430,15 +2435,12 @@ class TestUpdateUsageFromInstance(BaseTestCase):
         ctx = mock.sentinel.ctx
         # Call the method.
         self.rt._remove_deleted_instances_allocations(ctx, cn)
-        # Only one call should be made to delete allocations, and that should
-        # be for the first instance created above
-        rc.delete_allocation_for_instance.assert_called_once_with(uuids.inst0)
-        mock_log.debug.assert_called_once_with(
-            'Deleting stale allocation for instance %s', uuids.inst0)
+        # Scheduled instances should not have their allocations removed
+        rc.delete_allocation_for_instance.assert_not_called()
 
     @mock.patch('nova.objects.Instance.get_by_uuid')
     def test_remove_deleted_instances_allocations_no_instance(self,
-                                                              mock_inst_get):
+            mock_inst_get):
         # If for some reason an instance is no longer available, but
         # there are allocations for it, we want to be sure those
         # allocations are removed, not that an InstanceNotFound