Remove allocations when a booting instance is rescheduled or aborted
In the Pike release there is no auto-heal in the resource tracker,
so when an instance fails to boot, its allocation records are not
cleaned up.
This patch cleans up the allocation records when the instance is
rescheduled or the build is aborted.
Change-Id: I95d2f4c9392883052188fb7901451530068502db
Closes-Bug: #1712718
(cherry picked from commit c19ebcbd58)
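Background for reviewers: the leak described above shows up as non-zero usage on a compute node's resource provider in Placement. The snippet below is only an illustrative sketch of how to read those usages directly from the Placement API; the endpoint URL, token and provider UUID are placeholders and are not part of this change.

# Illustrative sketch only; not part of this patch.
import requests

PLACEMENT = 'http://placement.example.com'  # placeholder endpoint
TOKEN = 'placeholder-admin-token'           # placeholder auth token


def provider_usages(rp_uuid):
    """Return the usages Placement reports for one resource provider."""
    resp = requests.get(
        '%s/resource_providers/%s/usages' % (PLACEMENT, rp_uuid),
        headers={'X-Auth-Token': TOKEN})
    resp.raise_for_status()
    # e.g. {'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}
    return resp.json()['usages']

# With this fix, a host that only saw failed or rescheduled builds
# should report zero usage for VCPU, MEMORY_MB and DISK_GB.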
@@ -1724,12 +1724,34 @@ class ComputeManager(manager.Manager):
                 try:
                     result = self._do_build_and_run_instance(*args, **kwargs)
                 except Exception:
+                    # NOTE(mriedem): This should really only happen if
+                    # _decode_files in _do_build_and_run_instance fails, and
+                    # that's before a guest is spawned so it's OK to remove
+                    # allocations for the instance for this node from Placement
+                    # below as there is no guest consuming resources anyway.
+                    # The _decode_files case could be handled more specifically
+                    # but that's left for another day.
                     result = build_results.FAILED
                     raise
                 finally:
                     fails = (build_results.FAILED,
                              build_results.RESCHEDULED)
                     if result in fails:
+                        # Remove the allocation records from Placement for
+                        # the instance if the build failed or is being
+                        # rescheduled to another node. The instance.host is
+                        # likely set to None in _do_build_and_run_instance
+                        # which means if the user deletes the instance, it will
+                        # be deleted in the API, not the compute service.
+                        # Setting the instance.host to None in
+                        # _do_build_and_run_instance means that the
+                        # ResourceTracker will no longer consider this instance
+                        # to be claiming resources against it, so we want to
+                        # reflect that same thing in Placement.
+                        rt = self._get_resource_tracker()
+                        rt.reportclient.delete_allocation_for_instance(
+                            instance.uuid)
+
                         self._build_failed()
                     else:
                         self._failed_builds = 0
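The finally-block behaviour added above can also be exercised in isolation with plain mocks. The following is a rough sketch, not a test from this change; handle_build_result is an invented name that simply mirrors the new code path.

# Illustrative sketch only; not part of this patch.
import mock

from nova.compute import build_results


def handle_build_result(manager, instance, result):
    # On FAILED or RESCHEDULED, drop the instance's allocations from
    # Placement and count the failed build; otherwise reset the counter.
    fails = (build_results.FAILED, build_results.RESCHEDULED)
    if result in fails:
        rt = manager._get_resource_tracker()
        rt.reportclient.delete_allocation_for_instance(instance.uuid)
        manager._build_failed()
    else:
        manager._failed_builds = 0


manager = mock.Mock()
instance = mock.Mock(uuid='some-uuid')
handle_build_result(manager, instance, build_results.RESCHEDULED)
reportclient = manager._get_resource_tracker.return_value.reportclient
reportclient.delete_allocation_for_instance.assert_called_once_with('some-uuid')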
@@ -23,6 +23,7 @@ from oslo_serialization import base64
 from oslo_utils import timeutils
 
 from nova.compute import api as compute_api
+from nova.compute import manager as compute_manager
 from nova.compute import rpcapi
 from nova import context
 from nova import exception
@@ -1877,3 +1878,77 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
 
         self._delete_and_check_allocations(
             server, source_rp_uuid, dest_rp_uuid)
+
+    def test_rescheduling_when_booting_instance(self):
+        self.failed_hostname = None
+        old_build_resources = (compute_manager.ComputeManager.
+                               _build_resources)
+
+        def fake_build_resources(sl, *args, **kwargs):
+            # Fail on the first scheduling attempt only.
+            if not self.failed_hostname:
+                self.failed_hostname = sl.host
+                raise Exception()
+
+            return old_build_resources(sl, *args, **kwargs)
+
+        self.stub_out('nova.compute.manager.ComputeManager._build_resources',
+                      fake_build_resources)
+
+        server_req = self._build_minimal_create_server_request(
+            self.api, 'some-server', flavor_id=self.flavor1['id'],
+            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
+            networks=[])
+
+        created_server = self.api.post_server({'server': server_req})
+        server = self._wait_for_state_change(
+            self.admin_api, created_server, 'ACTIVE')
+        dest_hostname = server['OS-EXT-SRV-ATTR:host']
+
+        LOG.info('failed on %s', self.failed_hostname)
+        LOG.info('booting on %s', dest_hostname)
+
+        failed_rp_uuid = self._get_provider_uuid_by_host(self.failed_hostname)
+        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
+
+        failed_usages = self._get_provider_usages(failed_rp_uuid)
+        # Expect no allocation records on the failed host.
+        self.assertFlavorMatchesAllocation(
+            {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
+
+        # Ensure the allocation records exist on the destination host.
+        dest_usages = self._get_provider_usages(dest_rp_uuid)
+        self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
+
+    def test_abort_when_booting_instance(self):
+        self.failed_hostname = None
+        old_build_resources = (compute_manager.ComputeManager.
+                               _build_resources)
+
+        def fake_build_resources(sl, *args, **kwargs):
+            # Abort the build on the first scheduling attempt.
+            if not self.failed_hostname:
+                self.failed_hostname = sl.host
+                raise exception.BuildAbortException(instance_uuid='fake_uuid',
+                                                    reason='just abort')
+
+            return old_build_resources(sl, *args, **kwargs)
+
+        self.stub_out('nova.compute.manager.ComputeManager._build_resources',
+                      fake_build_resources)
+
+        server_req = self._build_minimal_create_server_request(
+            self.api, 'some-server', flavor_id=self.flavor1['id'],
+            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
+            networks=[])
+
+        created_server = self.api.post_server({'server': server_req})
+        self._wait_for_state_change(self.admin_api, created_server, 'ERROR')
+
+        LOG.info('failed on %s', self.failed_hostname)
+
+        failed_rp_uuid = self._get_provider_uuid_by_host(self.failed_hostname)
+        failed_usages = self._get_provider_usages(failed_rp_uuid)
+        # Expect no allocation records on the failed host.
+        self.assertFlavorMatchesAllocation(
+            {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
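For readers unfamiliar with the functional test helpers used above: _get_provider_usages and assertFlavorMatchesAllocation come from the functional test base classes. Roughly, the assertion is assumed to compare flavor values against the standard Placement resource classes, along these lines (a sketch, not the real helper):

# Sketch of the assumed behaviour of assertFlavorMatchesAllocation; the
# real helper lives in nova's functional test base classes.
def assert_flavor_matches_allocation(flavor, usages):
    # Placement usages are keyed by standard resource classes.
    assert usages['VCPU'] == flavor['vcpus']
    assert usages['MEMORY_MB'] == flavor['ram']
    assert usages['DISK_GB'] == flavor['disk']


# Example: the failed host should report no usage at all.
assert_flavor_matches_allocation(
    {'vcpus': 0, 'ram': 0, 'disk': 0},
    {'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0})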
@@ -4834,13 +4834,13 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     @mock.patch.object(manager.ComputeManager,
                        '_nil_out_instance_obj_host_and_node')
     @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
-    @mock.patch.object(manager.ComputeManager, '_get_resource_tracker')
-    def test_reschedule_on_resources_unavailable(self, mock_get_resource,
+    @mock.patch.object(resource_tracker.ResourceTracker, 'instance_claim')
+    def test_reschedule_on_resources_unavailable(self, mock_claim,
             mock_build, mock_nil, mock_save, mock_start,
             mock_finish, mock_notify):
         reason = 'resource unavailable'
         exc = exception.ComputeResourcesUnavailable(reason=reason)
-        mock_get_resource.side_effect = exc
+        mock_claim.side_effect = exc
         self._do_build_instance_update(mock_save, reschedule_update=True)
 
         with mock.patch.object(
@@ -4858,7 +4858,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
 
         self._instance_action_events(mock_start, mock_finish)
         self._assert_build_instance_update(mock_save, reschedule_update=True)
-        mock_get_resource.assert_called_once_with()
+        mock_claim.assert_called_once_with(self.context, self.instance,
+                                           self.node, self.limits)
         mock_notify.assert_has_calls([
             mock.call(self.context, self.instance, 'create.start',
                 extra_usage_info={'image_name': self.image.get('name')}),
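The unit test change swaps which mock raises: the failure is now simulated at instance_claim rather than at _get_resource_tracker, which leaves _get_resource_tracker usable for the allocation cleanup added in the manager. The toy example below (unrelated classes, invented for this sketch) only illustrates the decorator ordering convention the updated test relies on: stacked mock.patch.object decorators inject mocks bottom-up, so the bottom-most patch becomes the first mock argument after self.

# Stand-alone illustration of mock.patch.object decorator ordering;
# Widget and WidgetTest are invented names, not nova code.
import unittest

import mock


class Widget(object):
    def load(self):
        return 'loaded'

    def save(self):
        return 'saved'


class WidgetTest(unittest.TestCase):
    @mock.patch.object(Widget, 'save')
    @mock.patch.object(Widget, 'load')
    def test_order(self, mock_load, mock_save):
        # mock_load maps to the bottom decorator, mock_save to the top one.
        Widget().load()
        Widget().save()
        mock_load.assert_called_once_with()
        mock_save.assert_called_once_with()


if __name__ == '__main__':
    unittest.main()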