Add force kwarg to delete_allocation_for_instance

This adds a force kwarg to delete_allocation_for_instance. The kwarg
defaults to False, but most callers pass force=True since that was
found to be by far the most common use case while implementing this
patch. In most cases the method is called when we want the allocations
gone regardless, e.g. server delete, failed build, or shelve offload.
The alternative in those cases would be for the caller to trap the
conflict error and retry, but we might as well just force the delete
(it's cleaner).
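
As a minimal illustration (mirroring the call sites changed below;
reportclient, context and instance stand in for the usual objects),
callers that know the allocations must be gone pass force=True, while
paths that already have a natural retry point keep the default:

    # Server delete, failed build, shelve offload: the allocations
    # should be gone regardless, so force the DELETE.
    reportclient.delete_allocation_for_instance(
        context, instance.uuid, force=True)

    # Periodic cleanup in the resource tracker: keep force=False; a
    # consumer generation conflict is simply retried on the next
    # update_available_resource run.
    reportclient.delete_allocation_for_instance(
        context, instance.uuid, force=False)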

When force=True, the method DELETEs the consumer allocations outright
rather than doing a GET and then a PUT with an empty allocations dict
and the consumer generation, which can result in a 409 conflict from
Placement. For example, bug 1836754 shows that in one tempest test that
creates a server and then immediately deletes it, we can hit a very
tight window where the method GETs the allocations and, before it PUTs
the empty allocations to remove them, something changes the consumer,
which results in a conflict and the server delete failing with a 409
error.
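
To make the difference concrete, here is a rough sketch of the two
request patterns at the Placement REST API level (the endpoint URL,
token and microversion handling below are simplified placeholders for
what SchedulerReportClient actually does through keystoneauth):

    import requests

    consumer_uuid = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'  # instance UUID
    url = 'http://placement.example/allocations/%s' % consumer_uuid
    headers = {'X-Auth-Token': '<token>',
               'OpenStack-API-Version': 'placement 1.28'}

    # force=True: a single DELETE; the consumer generation is never
    # read, so a concurrent update cannot cause a 409.
    requests.delete(url, headers=headers)

    # force=False: GET the allocations to learn the consumer generation,
    # then PUT an empty allocations dict back with that generation.
    body = requests.get(url, headers=headers).json()
    resp = requests.put(url, headers=headers, json={
        'allocations': {},
        'consumer_generation': body['consumer_generation'],
        'project_id': body['project_id'],
        'user_id': body['user_id'],
    })
    # If anything touched the consumer between the GET and the PUT,
    # Placement returns 409 and Nova raises AllocationDeleteFailed.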

It's worth noting that delete_allocation_for_instance used to just
DELETE the allocations before Stein [1], when we started taking
consumer generations into account. There was also a related mailing
list thread [2].

[1] I77f34788dd7ab8fdf60d668a4f76452e03cf9888
[2] http://lists.openstack.org/pipermail/openstack-dev/2018-August/133374.html

Change-Id: Ife3c7a5a95c5d707983ab33fd2fbfc1cfb72f676
Closes-Bug: #1836754

Matt Riedemann 2019-10-15 15:49:55 -04:00, committed by Balazs Gibizer
parent 2a78626a85
commit c09d98dadb
12 changed files with 72 additions and 90 deletions

@@ -2452,7 +2452,7 @@ class PlacementCommands(object):
else:
try:
placement.delete_allocation_for_instance(
ctxt, consumer_uuid, consumer_type)
ctxt, consumer_uuid, consumer_type, force=True)
except exception.AllocationDeleteFailed:
return False
return True

@@ -2200,7 +2200,7 @@ class API:
# mapping queued as deleted before _delete() return
try:
self.placementclient.delete_allocation_for_instance(
context, instance_uuid)
context, instance_uuid, force=True)
except exception.AllocationDeleteFailed:
LOG.info("Allocation delete failed during local delete cleanup.",
instance_uuid=instance_uuid)
@@ -2512,7 +2512,7 @@ class API:
# Cleanup allocations in Placement since we can't do it from the
# compute service.
self.placementclient.delete_allocation_for_instance(
context, instance.uuid)
context, instance.uuid, force=True)
cb(context, instance, bdms, local=True)
instance.destroy()

@@ -825,8 +825,13 @@ class ComputeManager(manager.Manager):
def _complete_deletion(self, context, instance):
self._update_resource_tracker(context, instance)
# If we're configured to do deferred deletes, don't force deletion of
# allocations if there's a conflict.
force = False if CONF.reclaim_instance_interval > 0 else True
self.reportclient.delete_allocation_for_instance(context,
instance.uuid)
instance.uuid,
force=force)
self._clean_instance_console_tokens(context, instance)
self._delete_scheduler_instance_info(context, instance.uuid)
@@ -2134,7 +2139,7 @@ class ComputeManager(manager.Manager):
# have already been removed in
# self._do_build_and_run_instance().
self.reportclient.delete_allocation_for_instance(
context, instance.uuid)
context, instance.uuid, force=True)
if result in (build_results.FAILED,
build_results.RESCHEDULED):
@@ -2252,8 +2257,8 @@ class ComputeManager(manager.Manager):
# to unclaim those resources before casting to the conductor, so
# that if there are alternate hosts available for a retry, it can
# claim resources on that new host for the instance.
self.reportclient.delete_allocation_for_instance(context,
instance.uuid)
self.reportclient.delete_allocation_for_instance(
context, instance.uuid, force=True)
self.compute_task_api.build_instances(context, [instance],
image, filter_properties, admin_password,
@@ -4426,7 +4431,8 @@ class ComputeManager(manager.Manager):
# NOTE(danms): We're finishing on the source node, so try
# to delete the allocation based on the migration uuid
self.reportclient.delete_allocation_for_instance(
context, migration.uuid, consumer_type='migration')
context, migration.uuid, consumer_type='migration',
force=False)
except exception.AllocationDeleteFailed:
LOG.error('Deleting allocation in placement for migration '
'%(migration_uuid)s failed. The instance '
@@ -6670,8 +6676,8 @@ class ComputeManager(manager.Manager):
# the instance claim failed with ComputeResourcesUnavailable
# or if we did claim but the spawn failed, because aborting the
# instance claim will not remove the allocations.
self.reportclient.delete_allocation_for_instance(context,
instance.uuid)
self.reportclient.delete_allocation_for_instance(
context, instance.uuid, force=True)
# FIXME: Umm, shouldn't we be rolling back port bindings too?
self._terminate_volume_connections(context, instance, bdms)
# The reverts_task_state decorator on unshelve_instance will

@@ -1574,8 +1574,12 @@ class ResourceTracker(object):
"Deleting allocations that remained for this "
"instance against this compute host: %s.",
instance_uuid, alloc)
# We don't force delete the allocation in this case because if
# there is a conflict we'll retry on the next
# update_available_resource periodic run.
self.reportclient.delete_allocation_for_instance(context,
instance_uuid)
instance_uuid,
force=False)
continue
if not instance.host:
# Allocations related to instances being scheduled should not
@@ -1643,8 +1647,8 @@ class ResourceTracker(object):
def delete_allocation_for_shelve_offloaded_instance(self, context,
instance):
self.reportclient.delete_allocation_for_instance(context,
instance.uuid)
self.reportclient.delete_allocation_for_instance(
context, instance.uuid, force=True)
def _verify_resources(self, resources):
resource_keys = ["vcpus", "memory_mb", "local_gb", "cpu_info",

@@ -1608,7 +1608,7 @@ class ComputeTaskManager:
except exception.InstanceMappingNotFound:
pass
self.report_client.delete_allocation_for_instance(
context, instance.uuid)
context, instance.uuid, force=True)
continue
else:
if host.service_host not in host_az:

@@ -2050,8 +2050,9 @@ class SchedulerReportClient(object):
return r.status_code == 204
@safe_connect
def delete_allocation_for_instance(self, context, uuid,
consumer_type='instance'):
def delete_allocation_for_instance(
self, context, uuid, consumer_type='instance', force=False
):
"""Delete the instance allocation from placement
:param context: The security context
@@ -2059,13 +2060,42 @@
as the consumer UUID towards placement
:param consumer_type: The type of the consumer specified by uuid.
'instance' or 'migration' (Default: instance)
:param force: True if the allocations should be deleted without regard
to consumer generation conflicts, False will attempt to
GET and PUT empty allocations and use the consumer
generation which could result in a conflict and need to
retry the operation.
:return: Returns True if the allocation is successfully deleted by this
call. Returns False if the allocation does not exist.
:raises AllocationDeleteFailed: If the allocation cannot be read from
placement or it is changed by another process while we tried to
delete it.
placement (if force=False), is changed by another process while
we tried to delete it (if force=False), or if some other server
side error occurred (if force=True)
"""
url = '/allocations/%s' % uuid
if force:
# Do not bother with consumer generations, just delete the
# allocations.
r = self.delete(url, global_request_id=context.global_id)
if r:
LOG.info('Deleted allocations for %(consumer_type)s %(uuid)s',
{'consumer_type': consumer_type, 'uuid': uuid})
return True
# Check for 404 since we don't need to log a warning if we
# tried to delete something which doesn't actually exist.
if r.status_code != 404:
LOG.warning(
'Unable to delete allocation for %(consumer_type)s '
'%(uuid)s: (%(code)i %(text)s)',
{'consumer_type': consumer_type,
'uuid': uuid,
'code': r.status_code,
'text': r.text})
raise exception.AllocationDeleteFailed(consumer_uuid=uuid,
error=r.text)
return False
# We read the consumer generation then try to put an empty allocation
# for that consumer. If between the GET and the PUT the consumer
# generation changes then we raise AllocationDeleteFailed.
@@ -2245,7 +2275,8 @@ class SchedulerReportClient(object):
instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
context, host, nodename)
for instance_uuid in instance_uuids:
self.delete_allocation_for_instance(context, instance_uuid)
self.delete_allocation_for_instance(
context, instance_uuid, force=True)
# Ensure to delete resource provider in tree by top-down
# traversable order.
rps_to_refresh = self.get_providers_in_tree(context, rp_uuid)
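
For reference, here is a small hypothetical caller sketch (not part of
this patch; reportclient, context, instance and LOG stand in for the
usual Nova objects) showing how the return value and exception
documented in the new docstring above might be handled:

    from nova import exception

    try:
        deleted = reportclient.delete_allocation_for_instance(
            context, instance.uuid, consumer_type='instance', force=True)
        if not deleted:
            # Placement returned 404: the consumer had no allocations.
            LOG.debug('No allocations to delete for %s.', instance.uuid)
    except exception.AllocationDeleteFailed:
        # With force=True this only happens on a non-404 error from
        # Placement; with force=False it also covers generation conflicts.
        LOG.warning('Deleting allocations for %s failed.', instance.uuid)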

@@ -461,7 +461,8 @@ class SchedulerManager(manager.Manager):
LOG.debug("Cleaning up allocations for %s", instance_uuids)
for uuid in instance_uuids:
self.placement_client.delete_allocation_for_instance(context, uuid)
self.placement_client.delete_allocation_for_instance(
context, uuid, force=True)
def _legacy_find_hosts(
self, context, num_instances, spec_obj, hosts, num_alts,

@@ -245,8 +245,8 @@ class SchedulerReportClientTests(test.TestCase):
self.assertEqual(2, vcpu_data)
# Delete allocations with our instance
self.client.delete_allocation_for_instance(self.context,
self.instance.uuid)
self.client.delete_allocation_for_instance(
self.context, self.instance.uuid, force=True)
# No usage
resp = self.client.get('/resource_providers/%s/usages' %

@@ -5079,68 +5079,6 @@ class ConsumerGenerationConflictTest(
def test_evacuate_fails_allocating_on_dest_host(self):
self._test_evacuate_fails_allocating_on_dest_host(force=False)
def test_server_delete_fails_due_to_conflict(self):
source_hostname = self.compute1.host
server = self._boot_and_check_allocations(self.flavor, source_hostname)
rsp = fake_requests.FakeResponse(
409, jsonutils.dumps({'text': 'consumer generation conflict'}))
with mock.patch('keystoneauth1.adapter.Adapter.put',
autospec=True) as mock_put:
mock_put.return_value = rsp
self.api.delete_server(server['id'])
server = self._wait_for_state_change(server,
'ERROR')
self.assertEqual(1, mock_put.call_count)
# We still have the allocations as deletion failed
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
self.assertFlavorMatchesUsage(source_rp_uuid, self.flavor)
self.assertFlavorMatchesAllocation(self.flavor, server['id'],
source_rp_uuid)
# retry the delete to make sure that allocations are removed this time
self._delete_and_check_allocations(server)
def test_server_local_delete_fails_due_to_conflict(self):
source_hostname = self.compute1.host
server = self._boot_and_check_allocations(self.flavor, source_hostname)
source_compute_id = self.admin_api.get_services(
host=self.compute1.host, binary='nova-compute')[0]['id']
self.compute1.stop()
self.admin_api.put_service(
source_compute_id, {'forced_down': 'true'})
rsp = fake_requests.FakeResponse(
409, jsonutils.dumps({'text': 'consumer generation conflict'}))
with mock.patch('keystoneauth1.adapter.Adapter.put',
autospec=True) as mock_put:
mock_put.return_value = rsp
ex = self.assertRaises(client.OpenStackApiException,
self.api.delete_server, server['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Failed to delete allocations for consumer',
jsonutils.loads(ex.response.content)[
'conflictingRequest']['message'])
self.assertEqual(1, mock_put.call_count)
# We still have the allocations as deletion failed
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
self.assertFlavorMatchesUsage(source_rp_uuid, self.flavor)
self.assertFlavorMatchesAllocation(self.flavor, server['id'],
source_rp_uuid)
# retry the delete to make sure that allocations are removed this time
self._delete_and_check_allocations(server)
class ServerMovingTestsWithNestedComputes(ServerMovingTests):
"""Runs all the server moving tests while the computes have nested trees.

@@ -9065,7 +9065,9 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.instance,
self.migration)
mock_report.delete_allocation_for_instance.assert_called_once_with(
self.context, self.migration.uuid, consumer_type='migration')
self.context, self.migration.uuid, consumer_type='migration',
force=False
)
def test_revert_allocation_allocation_exists(self):
"""New-style migration-based allocation revert."""

@@ -3375,7 +3375,7 @@ class TestUpdateUsageFromInstance(BaseTestCase):
# Only one call should be made to delete allocations, and that should
# be for the first instance created above
rc.delete_allocation_for_instance.assert_called_once_with(
ctx, uuids.deleted)
ctx, uuids.deleted, force=False)
mock_inst_get.assert_called_once_with(
ctx.elevated.return_value,
uuids.deleted,
@@ -3452,7 +3452,7 @@ class TestUpdateUsageFromInstance(BaseTestCase):
# Only one call should be made to delete allocations, and that should
# be for the first instance created above
rc.delete_allocation_for_instance.assert_called_once_with(
ctx, uuids.deleted)
ctx, uuids.deleted, force=False)
@mock.patch('nova.objects.Instance.get_by_uuid')
def test_remove_deleted_instances_allocations_scheduled_instance(self,
@@ -3652,7 +3652,7 @@ class TestUpdateUsageFromInstance(BaseTestCase):
rc = self.rt.reportclient
mock_remove_allocation = rc.delete_allocation_for_instance
mock_remove_allocation.assert_called_once_with(
mock.sentinel.ctx, instance.uuid)
mock.sentinel.ctx, instance.uuid, force=True)
def test_update_usage_from_instances_goes_negative(self):
# NOTE(danms): The resource tracker _should_ report negative resources

@@ -1049,8 +1049,8 @@ class SchedulerManagerTestCase(test.NoDBTestCase):
instance_uuids = [uuids.instance1, uuids.instance2]
self.manager._cleanup_allocations(self.context, instance_uuids)
mock_delete_alloc.assert_has_calls([
mock.call(self.context, uuids.instance1),
mock.call(self.context, uuids.instance2)
mock.call(self.context, uuids.instance1, force=True),
mock.call(self.context, uuids.instance2, force=True),
])
def test_add_retry_host(self):