Remove allocations when a booting instance is rescheduled or aborted

In the Pike release there is no auto-heal in the resource tracker,
so when an instance fails to boot, its allocation records are not
cleaned up.

This patch cleans up the allocation records when the instance build
is rescheduled or aborted.

Change-Id: I95d2f4c9392883052188fb7901451530068502db
Closes-Bug: #1712718
(cherry picked from commit c19ebcbd58)
Authored by He Jie Xu on 2017-08-24 11:38:31 +08:00; committed by Matt Riedemann
parent 495621bfc5
commit 27fa0f03d2
3 changed files with 102 additions and 4 deletions
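
For reviewers who want to see the leak directly: before this change, an
instance whose build failed or was rescheduled away from a host kept its
consumer allocations in Placement. One way to confirm that by hand is to
query the allocations for the instance UUID. The sketch below is only an
illustration; the endpoint, token and UUID are placeholders, not values
from this patch.

    # Hypothetical manual check for leaked allocations; the endpoint,
    # token and UUID below are placeholders.
    import requests

    PLACEMENT = 'http://placement.example.com/placement'
    TOKEN = 'admin-token'
    INSTANCE_UUID = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'

    resp = requests.get(
        '%s/allocations/%s' % (PLACEMENT, INSTANCE_UUID),
        headers={'X-Auth-Token': TOKEN})
    resp.raise_for_status()

    # Placement keys the allocations by resource provider UUID. After a
    # failed or aborted boot this dict should be empty; before this patch
    # it could still hold VCPU/MEMORY_MB/DISK_GB entries for the failed
    # host.
    allocations = resp.json()['allocations']
    print('leaked' if allocations else 'clean', allocations)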

nova/compute/manager.py

@@ -1724,12 +1724,34 @@ class ComputeManager(manager.Manager):
            try:
                result = self._do_build_and_run_instance(*args, **kwargs)
            except Exception:
                # NOTE(mriedem): This should really only happen if
                # _decode_files in _do_build_and_run_instance fails, and
                # that's before a guest is spawned so it's OK to remove
                # allocations for the instance for this node from Placement
                # below as there is no guest consuming resources anyway.
                # The _decode_files case could be handled more specifically
                # but that's left for another day.
                result = build_results.FAILED
                raise
            finally:
                fails = (build_results.FAILED,
                         build_results.RESCHEDULED)
                if result in fails:
                    # Remove the allocation records from Placement for
                    # the instance if the build failed or is being
                    # rescheduled to another node. The instance.host is
                    # likely set to None in _do_build_and_run_instance
                    # which means if the user deletes the instance, it will
                    # be deleted in the API, not the compute service.
                    # Setting the instance.host to None in
                    # _do_build_and_run_instance means that the
                    # ResourceTracker will no longer consider this instance
                    # to be claiming resources against it, so we want to
                    # reflect that same thing in Placement.
                    rt = self._get_resource_tracker()
                    rt.reportclient.delete_allocation_for_instance(
                        instance.uuid)
                    self._build_failed()
                else:
                    self._failed_builds = 0
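
The reportclient call added in the finally block above is the actual
cleanup. In terms of the Placement HTTP API it boils down to deleting every
allocation held by the instance as a consumer. The sketch below is a rough
equivalent, not the report client implementation: the endpoint and token
are placeholders, and the real SchedulerReportClient handles auth, retries
and logging that are omitted here.

    # Rough, simplified equivalent of
    # rt.reportclient.delete_allocation_for_instance(instance.uuid);
    # not the actual report client code.
    import requests

    PLACEMENT = 'http://placement.example.com/placement'  # placeholder
    TOKEN = 'admin-token'                                  # placeholder

    def delete_allocations_for_consumer(consumer_uuid):
        # DELETE /allocations/{consumer_uuid} drops all allocations the
        # consumer holds, across every resource provider.
        resp = requests.delete(
            '%s/allocations/%s' % (PLACEMENT, consumer_uuid),
            headers={'X-Auth-Token': TOKEN})
        # 204: deleted; 404: nothing was allocated. Both are acceptable
        # outcomes for a cleanup path.
        return resp.status_code in (204, 404)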

nova/tests/functional/test_servers.py

@@ -23,6 +23,7 @@ from oslo_serialization import base64
from oslo_utils import timeutils
from nova.compute import api as compute_api
from nova.compute import manager as compute_manager
from nova.compute import rpcapi
from nova import context
from nova import exception
@@ -1877,3 +1878,77 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
        self._delete_and_check_allocations(
            server, source_rp_uuid, dest_rp_uuid)

    def test_rescheduling_when_booting_instance(self):
        self.failed_hostname = None
        old_build_resources = (compute_manager.ComputeManager.
                               _build_resources)

        def fake_build_resources(sl, *args, **kwargs):
            # Fail the build on the first host the instance is scheduled to.
            if not self.failed_hostname:
                self.failed_hostname = sl.host
                raise Exception()
            return old_build_resources(sl, *args, **kwargs)

        self.stub_out('nova.compute.manager.ComputeManager._build_resources',
                      fake_build_resources)

        server_req = self._build_minimal_create_server_request(
            self.api, 'some-server', flavor_id=self.flavor1['id'],
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            networks=[])

        created_server = self.api.post_server({'server': server_req})
        server = self._wait_for_state_change(
            self.admin_api, created_server, 'ACTIVE')
        dest_hostname = server['OS-EXT-SRV-ATTR:host']

        LOG.info('failed on %s', self.failed_hostname)
        LOG.info('booting on %s', dest_hostname)

        failed_rp_uuid = self._get_provider_uuid_by_host(self.failed_hostname)
        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)

        failed_usages = self._get_provider_usages(failed_rp_uuid)
        # Expect no allocation records on the failed host.
        self.assertFlavorMatchesAllocation(
            {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)

        # Ensure the allocation records exist on the destination host.
        dest_usages = self._get_provider_usages(dest_rp_uuid)
        self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)

    def test_abort_when_booting_instance(self):
        self.failed_hostname = None
        old_build_resources = (compute_manager.ComputeManager.
                               _build_resources)

        def fake_build_resources(sl, *args, **kwargs):
            # Abort the build on the first host the instance is scheduled to.
            if not self.failed_hostname:
                self.failed_hostname = sl.host
                raise exception.BuildAbortException(instance_uuid='fake_uuid',
                                                    reason='just abort')
            return old_build_resources(sl, *args, **kwargs)

        self.stub_out('nova.compute.manager.ComputeManager._build_resources',
                      fake_build_resources)

        server_req = self._build_minimal_create_server_request(
            self.api, 'some-server', flavor_id=self.flavor1['id'],
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            networks=[])

        created_server = self.api.post_server({'server': server_req})
        self._wait_for_state_change(self.admin_api, created_server, 'ERROR')

        LOG.info('failed on %s', self.failed_hostname)

        failed_rp_uuid = self._get_provider_uuid_by_host(self.failed_hostname)
        failed_usages = self._get_provider_usages(failed_rp_uuid)
        # Expect no allocation records on the failed host.
        self.assertFlavorMatchesAllocation(
            {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
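
The assertions above lean on two helpers from the functional test base
class: _get_provider_usages wraps GET /resource_providers/{uuid}/usages,
and assertFlavorMatchesAllocation compares a flavor against that usage
dict. A standalone sketch of the comparison is below; the function name is
made up for illustration, while the usages shape matches what Placement
returns.

    # Illustrative version of the flavor-vs-usage check; not the real
    # helper implementation.
    def flavor_matches_usage(flavor, usages):
        # 'usages' is the dict under the "usages" key of
        # GET /resource_providers/{uuid}/usages, e.g.
        # {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1}
        return (usages.get('VCPU', 0) == flavor['vcpus'] and
                usages.get('MEMORY_MB', 0) == flavor['ram'] and
                usages.get('DISK_GB', 0) == flavor['disk'])

    # For the failed host the tests expect everything to be zero:
    assert flavor_matches_usage({'vcpus': 0, 'ram': 0, 'disk': 0},
                                {'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0})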

nova/tests/unit/compute/test_compute_mgr.py

@@ -4834,13 +4834,13 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
    @mock.patch.object(manager.ComputeManager,
                       '_nil_out_instance_obj_host_and_node')
    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
-   @mock.patch.object(manager.ComputeManager, '_get_resource_tracker')
-   def test_reschedule_on_resources_unavailable(self, mock_get_resource,
+   @mock.patch.object(resource_tracker.ResourceTracker, 'instance_claim')
+   def test_reschedule_on_resources_unavailable(self, mock_claim,
            mock_build, mock_nil, mock_save, mock_start,
            mock_finish, mock_notify):
        reason = 'resource unavailable'
        exc = exception.ComputeResourcesUnavailable(reason=reason)
-       mock_get_resource.side_effect = exc
+       mock_claim.side_effect = exc
        self._do_build_instance_update(mock_save, reschedule_update=True)
        with mock.patch.object(
@@ -4858,7 +4858,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
        self._instance_action_events(mock_start, mock_finish)
        self._assert_build_instance_update(mock_save, reschedule_update=True)
-       mock_get_resource.assert_called_once_with()
+       mock_claim.assert_called_once_with(self.context, self.instance,
+                                          self.node, self.limits)
        mock_notify.assert_has_calls([
            mock.call(self.context, self.instance, 'create.start',
                extra_usage_info={'image_name': self.image.get('name')}),