Don't delete allocation if instance being scheduled
There is a potential race when something other than the compute service creates allocations (say, the scheduler): the self-healing process that runs periodically on the compute host would delete those allocations, because it would not find a related instance for that host. The resource tracker now skips allocations whose instance has no host set yet, i.e. is still being scheduled.

This also removes an unnecessary method from the report client code and instead does the loop in the resource tracker code, calling the report client directly to delete allocations.

Co-Authored-By: Ed Leafe <ed@leafe.com>
Blueprint: placement-claims
Change-Id: I80ba844a6e0fcea89f80aa253d57ac73092773ae
parent: d966d33227
commit: 439f67e7bc
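The race described above, sketched in isolation: the scheduler writes an allocation against the chosen compute node before the instance record has its host field set, so the node's periodic self-healing pass does not yet count the instance as tracked and, before this change, would have deleted that allocation. Below is a minimal, hedged illustration of the guard the resource tracker gains in the diff that follows; FakeInstance and allocations_to_delete are stand-ins for this sketch only, not Nova code.

# Minimal, self-contained sketch (illustrative names, not the real Nova objects).
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeInstance:
    uuid: str
    host: Optional[str]  # None while the scheduler is still placing the instance


def allocations_to_delete(allocations, tracked_instances, get_instance):
    """Return allocation consumer uuids that are safe to remove.

    allocations: dict keyed by instance uuid, one entry per allocation
    tracked_instances: instances the resource tracker knows on this node
    get_instance: lookup callable, standing in for Instance.get_by_uuid
    """
    stale = set(allocations) - set(tracked_instances)
    safe = []
    for instance_uuid in stale:
        instance = get_instance(instance_uuid)
        # The guard this commit adds: an instance with no host yet is still
        # being scheduled, so its allocation was written on purpose (e.g. by
        # the scheduler) and must not be removed by the self-healing pass.
        if not instance.host:
            continue
        safe.append(instance_uuid)
    return safe


# One instance tracked on this node, one truly deleted, one mid-scheduling
# (allocation already written by the scheduler, but no host assigned yet).
instances = {
    "tracked": FakeInstance("tracked", host="compute1"),
    "deleted": FakeInstance("deleted", host="compute1"),
    "scheduling": FakeInstance("scheduling", host=None),
}
allocations = {uuid: {"MEMORY_MB": 1024} for uuid in instances}
tracked = {"tracked": instances["tracked"]}

assert allocations_to_delete(allocations, tracked, instances.get) == ["deleted"]

Only the genuinely deleted instance loses its allocation; the one still being scheduled is left alone, which is exactly what the old remove_deleted_instances helper in the report client could not distinguish.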
@@ -992,11 +992,28 @@ class ResourceTracker(object):
             if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
                 self._update_usage_from_instance(context, instance, nodename)
 
-        self.reportclient.remove_deleted_instances(cn,
-                                                   self.tracked_instances.values())
+        # Remove allocations for instances that have been removed.
+        self._remove_deleted_instances_allocations(context, cn)
 
         cn.free_ram_mb = max(0, cn.free_ram_mb)
         cn.free_disk_gb = max(0, cn.free_disk_gb)
 
+    def _remove_deleted_instances_allocations(self, context, cn):
+        tracked_keys = set(self.tracked_instances.keys())
+        allocations = self.reportclient.get_allocations_for_resource_provider(
+                cn.uuid) or {}
+        allocations_to_delete = set(allocations.keys()) - tracked_keys
+        for instance_uuid in allocations_to_delete:
+            # Allocations related to instances being scheduled should not be
+            # deleted if we already wrote the allocation previously.
+            instance = objects.Instance.get_by_uuid(context, instance_uuid,
+                                                    expected_attrs=[])
+            if not instance.host:
+                continue
+            LOG.warning('Deleting stale allocation for instance %s',
+                        instance_uuid)
+            self.reportclient.delete_allocation_for_instance(instance_uuid)
+
     def _find_orphaned_instances(self):
         """Given the set of instances and migrations already account for
         by resource tracker, sanity check the hypervisor to determine
@@ -884,7 +884,7 @@ class SchedulerReportClient(object):
         return r.status_code == 204
 
     @safe_connect
-    def _delete_allocation_for_instance(self, uuid):
+    def delete_allocation_for_instance(self, uuid):
         url = '/allocations/%s' % uuid
         r = self.delete(url)
         if r:
@@ -905,10 +905,10 @@ class SchedulerReportClient(object):
         if sign > 0:
             self._allocate_for_instance(compute_node.uuid, instance)
         else:
-            self._delete_allocation_for_instance(instance.uuid)
+            self.delete_allocation_for_instance(instance.uuid)
 
     @safe_connect
-    def _get_allocations(self, rp_uuid):
+    def get_allocations_for_resource_provider(self, rp_uuid):
         url = '/resource_providers/%s/allocations' % rp_uuid
         resp = self.get(url)
         if not resp:
@@ -916,20 +916,6 @@ class SchedulerReportClient(object):
         else:
             return resp.json()['allocations']
 
-    def remove_deleted_instances(self, compute_node, instance_uuids):
-        allocations = self._get_allocations(compute_node.uuid)
-        if allocations is None:
-            allocations = {}
-
-        instance_dict = {instance['uuid']: instance
-                         for instance in instance_uuids}
-        removed_instances = set(allocations.keys()) - set(instance_dict.keys())
-
-        for uuid in removed_instances:
-            LOG.warning(_LW('Deleting stale allocation for instance %s'),
-                        uuid)
-            self._delete_allocation_for_instance(uuid)
-
     @safe_connect
     def delete_resource_provider(self, context, compute_node, cascade=False):
         """Deletes the ResourceProvider record for the compute_node.
@@ -951,7 +937,7 @@ class SchedulerReportClient(object):
             instances = objects.InstanceList.get_by_host_and_node(context,
                     host, nodename)
            for instance in instances:
-                self._delete_allocation_for_instance(instance.uuid)
+                self.delete_allocation_for_instance(instance.uuid)
         url = "/resource_providers/%s" % rp_uuid
         resp = self.delete(url)
         if resp:
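For orientation, here is a rough usage sketch of the two report client methods made public above, with the client mocked so the snippet is self-contained; the uuids are placeholders and the payload shape simply mirrors the simplified fixtures in the tests further down, not necessarily the full placement API response.

from unittest import mock

# Mocked stand-in for SchedulerReportClient; hypothetical uuids for illustration.
reportclient = mock.Mock()
reportclient.get_allocations_for_resource_provider.return_value = {
    "instance-uuid-1": {"MEMORY_MB": 1024, "VCPU": 2, "DISK_GB": 101},
}

# The resource tracker consumes the result keyed by instance (consumer) uuid
# and deletes stale entries one by one.
allocations = reportclient.get_allocations_for_resource_provider("rp-uuid") or {}
for instance_uuid in allocations:
    reportclient.delete_allocation_for_instance(instance_uuid)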
@@ -2266,6 +2266,44 @@ class TestUpdateUsageFromInstance(BaseTestCase):
         mock_update_usage.assert_called_once_with(
             self.rt._get_usage_dict(self.instance), _NODENAME, sign=-1)
 
+    @mock.patch('nova.objects.Instance.get_by_uuid')
+    def test_remove_deleted_instances_allocations(self, mock_inst_get):
+        rc = self.rt.reportclient
+        self.rt.tracked_instances = {}
+        # Create 3 instances
+        instance_uuids = (uuids.inst0, uuids.inst1, uuids.inst2)
+        for i in range(3):
+            instance = _INSTANCE_FIXTURES[0].obj_clone()
+            instance.uuid = instance_uuids[i]
+            if i > 0:
+                # Only add the last two to tracked_instances, to simulate one
+                # deleted instance
+                inst_dict = obj_base.obj_to_primitive(instance)
+                self.rt.tracked_instances[instance.uuid] = inst_dict
+        # Mock out the allocation call
+        allocs = {uuids.scheduled: "fake_scheduled_instance"}
+        for i in range(3):
+            allocs[instance_uuids[i]] = "fake_instance_%s" % i
+        rc.get_allocations_for_resource_provider = mock.MagicMock(
+            return_value=allocs)
+        rc.delete_allocation_for_instance = mock.MagicMock()
+
+        def get_by_uuid(ctx, inst_uuid, expected_attrs=None):
+            ret = _INSTANCE_FIXTURES[0].obj_clone()
+            ret.uuid = inst_uuid
+            if inst_uuid == uuids.scheduled:
+                ret.host = None
+            return ret
+
+        mock_inst_get.side_effect = get_by_uuid
+        cn = self.rt.compute_nodes[_NODENAME]
+        ctx = mock.sentinel.ctx
+        # Call the method.
+        self.rt._remove_deleted_instances_allocations(ctx, cn)
+        # Only one call should be made to delete allocations, and that should
+        # be for the first instance created above
+        rc.delete_allocation_for_instance.assert_called_once_with(uuids.inst0)
+
 
 class TestInstanceInResizeState(test.NoDBTestCase):
     def test_active_suspending(self):
@@ -1482,41 +1482,6 @@ class TestAllocations(SchedulerReportClientTestCase):
         self.client.update_instance_allocation(cn, inst, -1)
         self.assertTrue(mock_warn.called)
 
-    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
-                'delete')
-    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
-                'get')
-    @mock.patch('nova.scheduler.client.report.'
-                '_instance_to_allocations_dict')
-    def test_remove_deleted_instances(self, mock_a, mock_get,
-                                      mock_delete):
-        cn = objects.ComputeNode(uuid=uuids.cn)
-        inst1 = objects.Instance(uuid=uuids.inst1)
-        inst2 = objects.Instance(uuid=uuids.inst2)
-        fake_allocations = {
-            'MEMORY_MB': 1024,
-            'VCPU': 2,
-            'DISK_GB': 101,
-        }
-        mock_get.return_value.json.return_value = {'allocations': {
-                inst1.uuid: fake_allocations,
-                inst2.uuid: fake_allocations,
-            }
-        }
-
-        # One instance still on the node, dict form as the
-        # RT tracks it
-        inst3 = {'uuid': 'foo'}
-
-        mock_delete.return_value = True
-        self.client.remove_deleted_instances(cn, [inst3])
-        mock_get.assert_called_once_with(
-            '/resource_providers/%s/allocations' % cn.uuid)
-        expected_calls = [
-            mock.call('/allocations/%s' % inst1.uuid),
-            mock.call('/allocations/%s' % inst2.uuid)]
-        mock_delete.assert_has_calls(expected_calls, any_order=True)
-
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'delete')
     @mock.patch('nova.scheduler.client.report.LOG')
@@ -1532,7 +1497,7 @@ class TestAllocations(SchedulerReportClientTestCase):
         # py3 uses __bool__
         mock_response.__bool__.return_value = False
         mock_delete.return_value = mock_response
-        self.client._delete_allocation_for_instance(uuids.rp_uuid)
+        self.client.delete_allocation_for_instance(uuids.rp_uuid)
         # make sure we didn't screw up the logic or the mock
         mock_log.info.assert_not_called()
         # make sure warning wasn't called for the 404
@@ -1541,7 +1506,7 @@ class TestAllocations(SchedulerReportClientTestCase):
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
-                "_delete_allocation_for_instance")
+                "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_by_host_and_node")
     def test_delete_resource_provider_cascade(self, mock_by_host,
                                               mock_del_alloc, mock_delete):
@@ -1563,7 +1528,7 @@ class TestAllocations(SchedulerReportClientTestCase):
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
-                "_delete_allocation_for_instance")
+                "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_by_host_and_node")
     def test_delete_resource_provider_no_cascade(self, mock_by_host,
                                                  mock_del_alloc, mock_delete):