Avoid allocation leak when deleting instance stuck in BUILD
During an instance build, the conductor claims resources via the scheduler and creates the instance DB entry in a cell. If for any reason the conductor is unable to complete a build after the instance claim (e.g. AMQP issues, or a conductor restart before the build completes), and in the meantime the user requests deletion of the stuck instance in BUILD, the nova API will delete the build_request but leave the allocation in place, resulting in a leak. This change proposes that the nova API ensure allocation cleanup is performed in the case of an ongoing/incomplete build. Note that because the build did not reach a cell, the compute service is not able to heal the allocation during its periodic update_available_resource task. Furthermore, it ensures that the instance mapping is also queued for deletion. Change-Id: I4d3193d8401614311010ed0e055fcb3aaeeebaed Closes-Bug: #1859496
This commit is contained in:
parent
f4fcc24bd0
commit
f35930eef8
|
@ -2136,6 +2136,23 @@ class API(base.Base):
|
|||
return True
|
||||
return False
|
||||
|
||||
def _local_delete_cleanup(self, context, instance):
    """Clean up placement allocations and the instance mapping for an
    instance being deleted locally, i.e. before it ever reached a cell.

    Because the build never reached a cell, no compute host owns the
    instance, so the compute update_available_resource periodic task
    cannot heal a leaked allocation — this cleanup is the only chance.

    Both steps are best-effort: failures are logged and swallowed so
    the surrounding _delete() still completes.

    :param context: nova request context
    :param instance: the Instance object being deleted
    """
    # NOTE(aarents) Ensure instance allocation is cleared and instance
    # mapping queued as deleted before _delete() return
    try:
        self.placementclient.delete_allocation_for_instance(
            context, instance.uuid)
    except exception.AllocationDeleteFailed:
        LOG.info("Allocation delete failed during local delete cleanup.",
                 instance=instance)

    try:
        self._update_queued_for_deletion(context, instance, True)
    except exception.InstanceMappingNotFound:
        # BUG FIX: original adjacent string literals concatenated with no
        # separating space ("attemptinglocal delete cleanup.").
        LOG.info("Instance Mapping does not exist while attempting "
                 "local delete cleanup.",
                 instance=instance)
|
||||
|
||||
def _attempt_delete_of_buildrequest(self, context, instance):
|
||||
# If there is a BuildRequest then the instance may not have been
|
||||
# written to a cell db yet. Delete the BuildRequest here, which
|
||||
|
@ -2171,6 +2188,7 @@ class API(base.Base):
|
|||
if not instance.host and not may_have_ports_or_volumes:
|
||||
try:
|
||||
if self._delete_while_booting(context, instance):
|
||||
self._local_delete_cleanup(context, instance)
|
||||
return
|
||||
# If instance.host was not set it's possible that the Instance
|
||||
# object here was pulled from a BuildRequest object and is not
|
||||
|
@ -2189,9 +2207,11 @@ class API(base.Base):
|
|||
except exception.InstanceNotFound:
|
||||
pass
|
||||
# The instance was deleted or is already gone.
|
||||
self._local_delete_cleanup(context, instance)
|
||||
return
|
||||
if not instance:
|
||||
# Instance is already deleted.
|
||||
self._local_delete_cleanup(context, instance)
|
||||
return
|
||||
except exception.ObjectActionError:
|
||||
# NOTE(melwitt): This means the instance.host changed
|
||||
|
@ -2204,6 +2224,7 @@ class API(base.Base):
|
|||
cell, instance = self._lookup_instance(context, instance.uuid)
|
||||
if not instance:
|
||||
# Instance is already deleted
|
||||
self._local_delete_cleanup(context, instance)
|
||||
return
|
||||
|
||||
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
|
||||
|
@ -2247,6 +2268,7 @@ class API(base.Base):
|
|||
'field, its vm_state is %(state)s.',
|
||||
{'state': instance.vm_state},
|
||||
instance=instance)
|
||||
self._local_delete_cleanup(context, instance)
|
||||
return
|
||||
except exception.ObjectActionError as ex:
|
||||
# The instance's host likely changed under us as
|
||||
|
|
|
@ -534,6 +534,19 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
|
|||
return self.placement_api.get(
|
||||
'/allocations/%s' % server_uuid).body['allocations']
|
||||
|
||||
def _wait_for_server_allocations(self, consumer_id, max_retries=20):
    """Poll placement until allocations exist for the given consumer.

    :param consumer_id: UUID of the consumer (server) to poll for
    :param max_retries: number of unsuccessful polls before failing
        the test
    :returns: the non-empty allocations dict for the consumer
    """
    for attempt in range(1, max_retries + 1):
        allocations = self._get_allocations_by_server_uuid(consumer_id)
        if allocations:
            return allocations
        if attempt == max_retries:
            # Give up: placement never reported an allocation.
            self.fail('Wait for server allocations failed, '
                      'server=%s' % (consumer_id))
        time.sleep(0.5)
|
||||
|
||||
def _get_allocations_by_provider_uuid(self, rp_uuid):
|
||||
return self.placement_api.get(
|
||||
'/resource_providers/%s/allocations' % rp_uuid).body['allocations']
|
||||
|
|
|
@ -3662,6 +3662,58 @@ class ServerBuildAbortTests(integrated_helpers.ProviderUsageBaseTestCase):
|
|||
'DISK_GB': 0}, failed_rp_uuid)
|
||||
|
||||
|
||||
class ServerDeleteBuildTests(integrated_helpers.ProviderUsageBaseTestCase):
    """Tests deleting a server while it is still building, validating
    that its allocations in Placement are properly cleaned up.
    """
    compute_driver = 'fake.SmallFakeDriver'

    def setUp(self):
        super(ServerDeleteBuildTests, self).setUp()
        self.compute1 = self._start_compute(host='host1')
        self.flavor1 = self.api.get_flavors()[0]

    def test_delete_stuck_build_instance_after_claim(self):
        """Regression test for bug 1859496: an instance allocation can
        leak after deletion if the build process was interrupted after
        the resource claim.
        """
        # Reproducing the issue requires interrupting the spawn once the
        # build request has reached the scheduler, i.e. after resources
        # have been claimed. A real-world trigger is typically a
        # conductor restart during the instance claim. We emulate that
        # restart by raising an exception in the filter scheduler after
        # the claim, and by mocking _bury_in_cell0 so the conductor
        # thread simply returns. We then delete the server once the
        # allocation exists and verify nothing leaked.
        # Because the deletion happens early, the conductor never
        # populated instance DB entries in any cell, so the compute
        # update_available_resource periodic task cannot heal a leaked
        # allocation.
        server_req = self._build_server(
            'interrupted-server', flavor_id=self.flavor1['id'],
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            networks='none')

        patch_scheduler = mock.patch.object(
            nova.scheduler.filter_scheduler.FilterScheduler,
            '_ensure_sufficient_hosts')
        patch_bury = mock.patch.object(
            nova.conductor.manager.ComputeTaskManager,
            '_bury_in_cell0')
        with test.nested(patch_scheduler, patch_bury) as (
                mock_suff_hosts, mock_bury):
            mock_suff_hosts.side_effect = test.TestingException('oops')
            server = self.api.post_server({'server': server_req})
            self._wait_for_server_allocations(server['id'])
            self.api.api_delete('/servers/%s' % server['id'])
            allocations = self._get_allocations_by_server_uuid(server['id'])
            self.assertEqual({}, allocations)
|
||||
|
||||
|
||||
# Re-runs every test in ServerBuildAbortTests against a fake driver that
# reports nested custom resources, so allocation cleanup on build abort is
# also exercised with nested resource providers.
class ServerBuildAbortTestsWithNestedResourceRequest(ServerBuildAbortTests):
    compute_driver = 'fake.FakeBuildAbortDriverWithNestedCustomResources'
|
||||
|
||||
|
|
|
@ -8406,6 +8406,7 @@ class ComputeTestCase(BaseTestCase,
|
|||
legacy_notify, notify, instance.uuid, self.compute_api.notifier,
|
||||
self.context)
|
||||
|
||||
@mock.patch('nova.compute.api.API._local_delete_cleanup')
|
||||
@mock.patch('nova.compute.utils.notify_about_instance_action')
|
||||
@mock.patch('nova.objects.Instance.destroy')
|
||||
@mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid')
|
||||
|
@ -8413,7 +8414,7 @@ class ComputeTestCase(BaseTestCase,
|
|||
@mock.patch('nova.objects.BuildRequest.get_by_instance_uuid')
|
||||
def test_delete_while_booting_instance_not_scheduled_cellv1(
|
||||
self, br_get_by_instance, legacy_notify, im_get_by_instance,
|
||||
instance_destroy, notify):
|
||||
instance_destroy, notify, api_del_cleanup):
|
||||
|
||||
instance = self._create_fake_instance_obj()
|
||||
instance.host = None
|
||||
|
@ -8433,6 +8434,7 @@ class ComputeTestCase(BaseTestCase,
|
|||
self.context)
|
||||
|
||||
instance_destroy.assert_called_once_with()
|
||||
api_del_cleanup.assert_called_once()
|
||||
|
||||
@mock.patch('nova.compute.utils.notify_about_instance_action')
|
||||
@mock.patch('nova.objects.Instance.destroy')
|
||||
|
|
Loading…
Reference in New Issue