Avoid allocation leak when deleting instance stuck in BUILD

During an instance build, the conductor claims resources via the
scheduler and creates the instance DB entry in a cell.
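
For context, the claim taken during scheduling amounts, at the Placement
REST level, to roughly the following (a sketch only, with made-up resource
figures and a hypothetical helper name; nova drives this through the
scheduler report client):

import requests

def claim_for_instance(placement_url, token, instance_uuid, rp_uuid,
                       project_id, user_id):
    # Record the instance (the consumer) as holding the flavor's worth
    # of resources on the selected compute node (the resource provider).
    body = {
        'allocations': {
            rp_uuid: {'resources': {'VCPU': 1,
                                    'MEMORY_MB': 512,
                                    'DISK_GB': 1}},
        },
        'project_id': project_id,
        'user_id': user_id,
    }
    resp = requests.put(
        '%s/allocations/%s' % (placement_url, instance_uuid),
        json=body,
        headers={'X-Auth-Token': token,
                 'OpenStack-API-Version': 'placement 1.12'})
    # Placement answers 204 No Content once the claim is recorded.
    return resp.status_code == 204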

If for any reason the conductor is unable to complete a build after
the instance claim (e.g. AMQP issues, or a conductor restart before the
build completes) and in the meantime the user requests deletion of the
stuck instance in BUILD, the nova API deletes the build_request but
leaves the allocation in place, resulting in a leak.

The change makes the nova API ensure that the allocation is cleaned up
in the case of an ongoing/incomplete build.
Note that because the build did not reach a cell, the compute service is
not able to heal the allocation during its periodic
update_available_resource task.
Furthermore, the change ensures that the instance mapping is also
queued for deletion.
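
At the REST level, the cleanup this relies on boils down to a single
Placement call that drops every allocation held by the instance (the
consumer). Below is a minimal sketch, assuming a pre-obtained keystone
token and placement endpoint (a hypothetical helper, not nova code; nova
goes through the report client wrapper shown in the diff):

import requests

def delete_instance_allocations(placement_url, token, instance_uuid):
    # Drop every allocation held by the instance (the consumer) against
    # all resource providers in one call.
    resp = requests.delete(
        '%s/allocations/%s' % (placement_url, instance_uuid),
        headers={'X-Auth-Token': token})
    # 204 on success; nova's wrapper raises AllocationDeleteFailed on
    # failure, which the new _local_delete_cleanup helper logs and
    # tolerates.
    return resp.status_code == 204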

Change-Id: I4d3193d8401614311010ed0e055fcb3aaeeebaed
Closes-Bug: #1859496
Author: Alexandre Arents
Date: 2020-01-13 15:53:24 +00:00
Commit: f35930eef8 (parent: f4fcc24bd0)
4 changed files with 90 additions and 1 deletion


@@ -2136,6 +2136,23 @@ class API(base.Base):
return True
return False
def _local_delete_cleanup(self, context, instance):
# NOTE(aarents): Ensure the instance allocation is cleared and the
# instance mapping is queued as deleted before _delete() returns.
try:
self.placementclient.delete_allocation_for_instance(
context, instance.uuid)
except exception.AllocationDeleteFailed:
LOG.info("Allocation delete failed during local delete cleanup.",
instance=instance)
try:
self._update_queued_for_deletion(context, instance, True)
except exception.InstanceMappingNotFound:
LOG.info("Instance Mapping does not exist while attempting"
"local delete cleanup.",
instance=instance)
def _attempt_delete_of_buildrequest(self, context, instance):
# If there is a BuildRequest then the instance may not have been
# written to a cell db yet. Delete the BuildRequest here, which
@@ -2171,6 +2188,7 @@ class API(base.Base):
if not instance.host and not may_have_ports_or_volumes:
try:
if self._delete_while_booting(context, instance):
self._local_delete_cleanup(context, instance)
return
# If instance.host was not set it's possible that the Instance
# object here was pulled from a BuildRequest object and is not
@@ -2189,9 +2207,11 @@ class API(base.Base):
except exception.InstanceNotFound:
pass
# The instance was deleted or is already gone.
self._local_delete_cleanup(context, instance)
return
if not instance:
# Instance is already deleted.
self._local_delete_cleanup(context, instance)
return
except exception.ObjectActionError:
# NOTE(melwitt): This means the instance.host changed
@@ -2204,6 +2224,7 @@ class API(base.Base):
cell, instance = self._lookup_instance(context, instance.uuid)
if not instance:
# Instance is already deleted
self._local_delete_cleanup(context, instance)
return
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
@@ -2247,6 +2268,7 @@ class API(base.Base):
'field, its vm_state is %(state)s.',
{'state': instance.vm_state},
instance=instance)
self._local_delete_cleanup(context, instance)
return
except exception.ObjectActionError as ex:
# The instance's host likely changed under us as

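_update_queued_for_deletion, the other helper invoked by
_local_delete_cleanup, already existed before this change and is
therefore not part of the diff. As a rough sketch of what it amounts to,
assuming the standard InstanceMapping object API (simplified, not the
verbatim nova code):

def _update_queued_for_deletion(self, context, instance, qfd):
    # Flag the API-database mapping so listings and the archiver treat
    # the instance as (queued for) deleted even though no cell record
    # exists yet. Raises InstanceMappingNotFound if the mapping is
    # already gone, which _local_delete_cleanup catches above.
    im = objects.InstanceMapping.get_by_instance_uuid(
        context, instance.uuid)
    im.queued_for_delete = qfd
    im.save()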

@@ -534,6 +534,19 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
return self.placement_api.get(
'/allocations/%s' % server_uuid).body['allocations']
def _wait_for_server_allocations(self, consumer_id, max_retries=20):
retry_count = 0
while True:
alloc = self._get_allocations_by_server_uuid(consumer_id)
if alloc:
break
retry_count += 1
if retry_count == max_retries:
self.fail('Wait for server allocations failed, '
'server=%s' % (consumer_id))
time.sleep(0.5)
return alloc
def _get_allocations_by_provider_uuid(self, rp_uuid):
return self.placement_api.get(
'/resource_providers/%s/allocations' % rp_uuid).body['allocations']


@@ -3662,6 +3662,58 @@ class ServerBuildAbortTests(integrated_helpers.ProviderUsageBaseTestCase):
'DISK_GB': 0}, failed_rp_uuid)
class ServerDeleteBuildTests(integrated_helpers.ProviderUsageBaseTestCase):
"""Tests server delete during instance in build and validates that
allocations in Placement are properly cleaned up.
"""
compute_driver = 'fake.SmallFakeDriver'
def setUp(self):
super(ServerDeleteBuildTests, self).setUp()
self.compute1 = self._start_compute(host='host1')
flavors = self.api.get_flavors()
self.flavor1 = flavors[0]
def test_delete_stuck_build_instance_after_claim(self):
"""Test for bug 1859496 where an instance allocation can leaks after
deletion if build process have been interrupted after resource claim
"""
# To reproduce the issue we need to interrupt instance spawn
# when build request has already reached the scheduler service,
# so that instance resource get claims.
# Real case can typically be a conductor restart during
# instance claim.
# To emulate conductor restart we raise an Exception in
# filter_scheduler after instance is claimed and mock
# _bury_in_cell0 in that case conductor thread return.
# Then we delete server after ensuring allocation is made and check
# There is no leak.
# Note that because deletion occurs early, conductor did not populate
# instance DB entries in cells, preventing the compute
# update_available_resource periodic task to heal leaked allocations.
server_req = self._build_server(
'interrupted-server', flavor_id=self.flavor1['id'],
image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
networks='none')
with test.nested(
mock.patch.object(nova.scheduler.filter_scheduler.
FilterScheduler,
'_ensure_sufficient_hosts'),
mock.patch.object(nova.conductor.manager.
ComputeTaskManager,
'_bury_in_cell0')
) as (mock_suff_hosts, mock_bury):
mock_suff_hosts.side_effect = test.TestingException('oops')
server = self.api.post_server({'server': server_req})
self._wait_for_server_allocations(server['id'])
self.api.api_delete('/servers/%s' % server['id'])
allocations = self._get_allocations_by_server_uuid(server['id'])
self.assertEqual({}, allocations)
class ServerBuildAbortTestsWithNestedResourceRequest(ServerBuildAbortTests):
compute_driver = 'fake.FakeBuildAbortDriverWithNestedCustomResources'


@@ -8406,6 +8406,7 @@ class ComputeTestCase(BaseTestCase,
legacy_notify, notify, instance.uuid, self.compute_api.notifier,
self.context)
@mock.patch('nova.compute.api.API._local_delete_cleanup')
@mock.patch('nova.compute.utils.notify_about_instance_action')
@mock.patch('nova.objects.Instance.destroy')
@mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid')
@@ -8413,7 +8414,7 @@ class ComputeTestCase(BaseTestCase,
@mock.patch('nova.objects.BuildRequest.get_by_instance_uuid')
def test_delete_while_booting_instance_not_scheduled_cellv1(
self, br_get_by_instance, legacy_notify, im_get_by_instance,
instance_destroy, notify):
instance_destroy, notify, api_del_cleanup):
instance = self._create_fake_instance_obj()
instance.host = None
@@ -8433,6 +8434,7 @@ class ComputeTestCase(BaseTestCase,
self.context)
instance_destroy.assert_called_once_with()
api_del_cleanup.assert_called_once()
@mock.patch('nova.compute.utils.notify_about_instance_action')
@mock.patch('nova.objects.Instance.destroy')