Add force kwarg to delete_allocation_for_instance

This adds a force kwarg to delete_allocation_for_instance. The kwarg
defaults to False, but most callers pass force=True since that was
found to be by far the most common use case while implementing this
patch. In most cases the method is called when we want the allocations
gone regardless, e.g. server delete, failed build, or shelve offload.
The alternative in those cases would be for the caller to trap the
conflict error and retry, but we might as well just force the delete
(it's cleaner).
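
As a minimal illustration (mirroring the call sites changed below;
reportclient, context and instance stand in for the usual objects),
callers that know the allocations must be gone pass force=True, while
paths that already have a natural retry point keep the default:

    # Server delete, failed build, shelve offload: the allocations
    # should be gone regardless, so force the DELETE.
    reportclient.delete_allocation_for_instance(
        context, instance.uuid, force=True)

    # Periodic cleanup in the resource tracker: keep force=False; a
    # consumer generation conflict is simply retried on the next
    # update_available_resource run.
    reportclient.delete_allocation_for_instance(
        context, instance.uuid, force=False)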

When force=True, the method DELETEs the consumer allocations outright
rather than doing a GET and then a PUT with an empty allocations dict
and the consumer generation, which can result in a 409 conflict from
Placement. For example, bug 1836754 shows that in one tempest test that
creates a server and then immediately deletes it, we can hit a very
tight window where the method GETs the allocations and, before it PUTs
the empty allocations to remove them, something changes the consumer,
which results in a conflict and the server delete failing with a 409
error.
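
To make the difference concrete, here is a rough sketch of the two
request patterns at the Placement REST API level (the endpoint URL,
token and microversion handling below are simplified placeholders for
what SchedulerReportClient actually does through keystoneauth):

    import requests

    consumer_uuid = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'  # instance UUID
    url = 'http://placement.example/allocations/%s' % consumer_uuid
    headers = {'X-Auth-Token': '<token>',
               'OpenStack-API-Version': 'placement 1.28'}

    # force=True: a single DELETE; the consumer generation is never
    # read, so a concurrent update cannot cause a 409.
    requests.delete(url, headers=headers)

    # force=False: GET the allocations to learn the consumer generation,
    # then PUT an empty allocations dict back with that generation.
    body = requests.get(url, headers=headers).json()
    resp = requests.put(url, headers=headers, json={
        'allocations': {},
        'consumer_generation': body['consumer_generation'],
        'project_id': body['project_id'],
        'user_id': body['user_id'],
    })
    # If anything touched the consumer between the GET and the PUT,
    # Placement returns 409 and Nova raises AllocationDeleteFailed.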

It's worth noting that delete_allocation_for_instance used to just
DELETE the allocations before Stein [1], when we started taking
consumer generations into account. There was also a related mailing
list thread [2].

[1] I77f34788dd7ab8fdf60d668a4f76452e03cf9888
[2] http://lists.openstack.org/pipermail/openstack-dev/2018-August/133374.html

Change-Id: Ife3c7a5a95c5d707983ab33fd2fbfc1cfb72f676
Closes-Bug: #1836754

Matt Riedemann 2019-10-15 15:49:55 -04:00, committed by Balazs Gibizer
parent 2a78626a85
commit c09d98dadb
12 changed files with 72 additions and 90 deletions

@@ -2452,7 +2452,7 @@ class PlacementCommands(object):
else:
try:
placement.delete_allocation_for_instance(
ctxt, consumer_uuid, consumer_type)
ctxt, consumer_uuid, consumer_type, force=True)
except exception.AllocationDeleteFailed:
return False
return True

@@ -2200,7 +2200,7 @@ class API:
# mapping queued as deleted before _delete() return
try:
self.placementclient.delete_allocation_for_instance(
context, instance_uuid)
context, instance_uuid, force=True)
except exception.AllocationDeleteFailed:
LOG.info("Allocation delete failed during local delete cleanup.",
instance_uuid=instance_uuid)
@@ -2512,7 +2512,7 @@ class API:
# Cleanup allocations in Placement since we can't do it from the
# compute service.
self.placementclient.delete_allocation_for_instance(
context, instance.uuid)
context, instance.uuid, force=True)
cb(context, instance, bdms, local=True)
instance.destroy()

@@ -825,8 +825,13 @@ class ComputeManager(manager.Manager):
def _complete_deletion(self, context, instance):
self._update_resource_tracker(context, instance)
# If we're configured to do deferred deletes, don't force deletion of
# allocations if there's a conflict.
force = False if CONF.reclaim_instance_interval > 0 else True
self.reportclient.delete_allocation_for_instance(context,
instance.uuid)
instance.uuid,
force=force)
self._clean_instance_console_tokens(context, instance)
self._delete_scheduler_instance_info(context, instance.uuid)
@@ -2134,7 +2139,7 @@ class ComputeManager(manager.Manager):
# have already been removed in
# self._do_build_and_run_instance().
self.reportclient.delete_allocation_for_instance(
context, instance.uuid)
context, instance.uuid, force=True)
if result in (build_results.FAILED,
build_results.RESCHEDULED):
@@ -2252,8 +2257,8 @@ class ComputeManager(manager.Manager):
# to unclaim those resources before casting to the conductor, so
# that if there are alternate hosts available for a retry, it can
# claim resources on that new host for the instance.
self.reportclient.delete_allocation_for_instance(context,
instance.uuid)
self.reportclient.delete_allocation_for_instance(
context, instance.uuid, force=True)
self.compute_task_api.build_instances(context, [instance],
image, filter_properties, admin_password,
@@ -4426,7 +4431,8 @@ class ComputeManager(manager.Manager):
# NOTE(danms): We're finishing on the source node, so try
# to delete the allocation based on the migration uuid
self.reportclient.delete_allocation_for_instance(
context, migration.uuid, consumer_type='migration')
context, migration.uuid, consumer_type='migration',
force=False)
except exception.AllocationDeleteFailed:
LOG.error('Deleting allocation in placement for migration '
'%(migration_uuid)s failed. The instance '
@@ -6670,8 +6676,8 @@ class ComputeManager(manager.Manager):
# the instance claim failed with ComputeResourcesUnavailable
# or if we did claim but the spawn failed, because aborting the
# instance claim will not remove the allocations.
self.reportclient.delete_allocation_for_instance(context,
instance.uuid)
self.reportclient.delete_allocation_for_instance(
context, instance.uuid, force=True)
# FIXME: Umm, shouldn't we be rolling back port bindings too?
self._terminate_volume_connections(context, instance, bdms)
# The reverts_task_state decorator on unshelve_instance will

@@ -1574,8 +1574,12 @@ class ResourceTracker(object):
"Deleting allocations that remained for this "
"instance against this compute host: %s.",
instance_uuid, alloc)
# We don't force delete the allocation in this case because if
# there is a conflict we'll retry on the next
# update_available_resource periodic run.
self.reportclient.delete_allocation_for_instance(context,
instance_uuid)
instance_uuid,
force=False)
continue
if not instance.host:
# Allocations related to instances being scheduled should not
@@ -1643,8 +1647,8 @@ class ResourceTracker(object):
def delete_allocation_for_shelve_offloaded_instance(self, context,
instance):
self.reportclient.delete_allocation_for_instance(context,
instance.uuid)
self.reportclient.delete_allocation_for_instance(
context, instance.uuid, force=True)
def _verify_resources(self, resources):
resource_keys = ["vcpus", "memory_mb", "local_gb", "cpu_info",

@@ -1608,7 +1608,7 @@ class ComputeTaskManager:
except exception.InstanceMappingNotFound:
pass
self.report_client.delete_allocation_for_instance(
context, instance.uuid)
context, instance.uuid, force=True)
continue
else:
if host.service_host not in host_az:

@@ -2050,8 +2050,9 @@ class SchedulerReportClient(object):
return r.status_code == 204
@safe_connect
def delete_allocation_for_instance(self, context, uuid,
consumer_type='instance'):
def delete_allocation_for_instance(
self, context, uuid, consumer_type='instance', force=False
):
"""Delete the instance allocation from placement
:param context: The security context
@@ -2059,13 +2060,42 @@
as the consumer UUID towards placement
:param consumer_type: The type of the consumer specified by uuid.
'instance' or 'migration' (Default: instance)
:param force: True if the allocations should be deleted without regard
to consumer generation conflicts, False will attempt to
GET and PUT empty allocations and use the consumer
generation which could result in a conflict and need to
retry the operation.
:return: Returns True if the allocation is successfully deleted by this
call. Returns False if the allocation does not exist.
:raises AllocationDeleteFailed: If the allocation cannot be read from
placement or it is changed by another process while we tried to
delete it.
placement (if force=False), is changed by another process while
we tried to delete it (if force=False), or if some other server
side error occurred (if force=True)
"""
url = '/allocations/%s' % uuid
if force:
# Do not bother with consumer generations, just delete the
# allocations.
r = self.delete(url, global_request_id=context.global_id)
if r:
LOG.info('Deleted allocations for %(consumer_type)s %(uuid)s',
{'consumer_type': consumer_type, 'uuid': uuid})
return True
# Check for 404 since we don't need to log a warning if we
# tried to delete something which doesn't actually exist.
if r.status_code != 404:
LOG.warning(
'Unable to delete allocation for %(consumer_type)s '
'%(uuid)s: (%(code)i %(text)s)',
{'consumer_type': consumer_type,
'uuid': uuid,
'code': r.status_code,
'text': r.text})
raise exception.AllocationDeleteFailed(consumer_uuid=uuid,
error=r.text)
return False
# We read the consumer generation then try to put an empty allocation
# for that consumer. If between the GET and the PUT the consumer
# generation changes then we raise AllocationDeleteFailed.
@@ -2245,7 +2275,8 @@ class SchedulerReportClient(object):
instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
context, host, nodename)
for instance_uuid in instance_uuids:
self.delete_allocation_for_instance(context, instance_uuid)
self.delete_allocation_for_instance(
context, instance_uuid, force=True)
# Ensure to delete resource provider in tree by top-down
# traversable order.
rps_to_refresh = self.get_providers_in_tree(context, rp_uuid)
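
For reference, here is a small hypothetical caller sketch (not part of
this patch; reportclient, context, instance and LOG stand in for the
usual Nova objects) showing how the return value and exception
documented in the new docstring above might be handled:

    from nova import exception

    try:
        deleted = reportclient.delete_allocation_for_instance(
            context, instance.uuid, consumer_type='instance', force=True)
        if not deleted:
            # Placement returned 404: the consumer had no allocations.
            LOG.debug('No allocations to delete for %s.', instance.uuid)
    except exception.AllocationDeleteFailed:
        # With force=True this only happens on a non-404 error from
        # Placement; with force=False it also covers generation conflicts.
        LOG.warning('Deleting allocations for %s failed.', instance.uuid)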

@@ -461,7 +461,8 @@ class SchedulerManager(manager.Manager):
LOG.debug("Cleaning up allocations for %s", instance_uuids)
for uuid in instance_uuids:
self.placement_client.delete_allocation_for_instance(context, uuid)
self.placement_client.delete_allocation_for_instance(
context, uuid, force=True)
def _legacy_find_hosts(
self, context, num_instances, spec_obj, hosts, num_alts,

@@ -245,8 +245,8 @@ class SchedulerReportClientTests(test.TestCase):
self.assertEqual(2, vcpu_data)
# Delete allocations with our instance
self.client.delete_allocation_for_instance(self.context,
self.instance.uuid)
self.client.delete_allocation_for_instance(
self.context, self.instance.uuid, force=True)
# No usage
resp = self.client.get('/resource_providers/%s/usages' %

@@ -5079,68 +5079,6 @@ class ConsumerGenerationConflictTest(
def test_evacuate_fails_allocating_on_dest_host(self):
self._test_evacuate_fails_allocating_on_dest_host(force=False)
def test_server_delete_fails_due_to_conflict(self):
source_hostname = self.compute1.host
server = self._boot_and_check_allocations(self.flavor, source_hostname)
rsp = fake_requests.FakeResponse(
409, jsonutils.dumps({'text': 'consumer generation conflict'}))
with mock.patch('keystoneauth1.adapter.Adapter.put',
autospec=True) as mock_put:
mock_put.return_value = rsp
self.api.delete_server(server['id'])
server = self._wait_for_state_change(server,
'ERROR')
self.assertEqual(1, mock_put.call_count)
# We still have the allocations as deletion failed
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
self.assertFlavorMatchesUsage(source_rp_uuid, self.flavor)
self.assertFlavorMatchesAllocation(self.flavor, server['id'],
source_rp_uuid)
# retry the delete to make sure that allocations are removed this time
self._delete_and_check_allocations(server)
def test_server_local_delete_fails_due_to_conflict(self):
source_hostname = self.compute1.host
server = self._boot_and_check_allocations(self.flavor, source_hostname)
source_compute_id = self.admin_api.get_services(
host=self.compute1.host, binary='nova-compute')[0]['id']
self.compute1.stop()
self.admin_api.put_service(
source_compute_id, {'forced_down': 'true'})
rsp = fake_requests.FakeResponse(
409, jsonutils.dumps({'text': 'consumer generation conflict'}))
with mock.patch('keystoneauth1.adapter.Adapter.put',
autospec=True) as mock_put:
mock_put.return_value = rsp
ex = self.assertRaises(client.OpenStackApiException,
self.api.delete_server, server['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Failed to delete allocations for consumer',
jsonutils.loads(ex.response.content)[
'conflictingRequest']['message'])
self.assertEqual(1, mock_put.call_count)
# We still have the allocations as deletion failed
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
self.assertFlavorMatchesUsage(source_rp_uuid, self.flavor)
self.assertFlavorMatchesAllocation(self.flavor, server['id'],
source_rp_uuid)
# retry the delete to make sure that allocations are removed this time
self._delete_and_check_allocations(server)
class ServerMovingTestsWithNestedComputes(ServerMovingTests):
"""Runs all the server moving tests while the computes have nested trees.

@@ -9065,7 +9065,9 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.instance,
self.migration)
mock_report.delete_allocation_for_instance.assert_called_once_with(
self.context, self.migration.uuid, consumer_type='migration')
self.context, self.migration.uuid, consumer_type='migration',
force=False
)
def test_revert_allocation_allocation_exists(self):
"""New-style migration-based allocation revert."""

@@ -3375,7 +3375,7 @@ class TestUpdateUsageFromInstance(BaseTestCase):
# Only one call should be made to delete allocations, and that should
# be for the first instance created above
rc.delete_allocation_for_instance.assert_called_once_with(
ctx, uuids.deleted)
ctx, uuids.deleted, force=False)
mock_inst_get.assert_called_once_with(
ctx.elevated.return_value,
uuids.deleted,
@@ -3452,7 +3452,7 @@ class TestUpdateUsageFromInstance(BaseTestCase):
# Only one call should be made to delete allocations, and that should
# be for the first instance created above
rc.delete_allocation_for_instance.assert_called_once_with(
ctx, uuids.deleted)
ctx, uuids.deleted, force=False)
@mock.patch('nova.objects.Instance.get_by_uuid')
def test_remove_deleted_instances_allocations_scheduled_instance(self,
@@ -3652,7 +3652,7 @@ class TestUpdateUsageFromInstance(BaseTestCase):
rc = self.rt.reportclient
mock_remove_allocation = rc.delete_allocation_for_instance
mock_remove_allocation.assert_called_once_with(
mock.sentinel.ctx, instance.uuid)
mock.sentinel.ctx, instance.uuid, force=True)
def test_update_usage_from_instances_goes_negative(self):
# NOTE(danms): The resource tracker _should_ report negative resources

@@ -1049,8 +1049,8 @@ class SchedulerManagerTestCase(test.NoDBTestCase):
instance_uuids = [uuids.instance1, uuids.instance2]
self.manager._cleanup_allocations(self.context, instance_uuids)
mock_delete_alloc.assert_has_calls([
mock.call(self.context, uuids.instance1),
mock.call(self.context, uuids.instance2)
mock.call(self.context, uuids.instance1, force=True),
mock.call(self.context, uuids.instance2, force=True),
])
def test_add_retry_host(self):