Handle placement error during re-schedule

If the claim_resources or _fill_provider_mapping call fails due to a
placement error during re-schedule, the re-scheduling stops but the
instance is never put into ERROR state.

This patch adds proper exception handling for these cases.

Change-Id: I78fc2312274471a7bd85a263de12cc5a0b19fd10
Closes-Bug: #1819460
Author: Balazs Gibizer  2019-03-11 15:24:05 +01:00
Parent: fd3b86d1c3
Commit: b096d9303a
3 changed files with 48 additions and 46 deletions
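Before the hunks, here is a minimal, runnable sketch of the pattern this patch introduces. The names below (MiniConductor, PlacementError, the method signatures) are invented for illustration and are not Nova's real classes or APIs: the point is only that an exception raised while re-claiming resources or re-filling the request-group provider mapping during a re-schedule is caught, the instance is put into ERROR state, and its allocated networks are cleaned up, instead of the failure silently aborting the re-schedule.

# Illustrative sketch only; simplified stand-ins, not Nova's real code.

class PlacementError(Exception):
    """Stand-in for any placement failure during a re-schedule claim."""


class MiniConductor(object):
    """Mimics the shape of the fix: wrap the claim in try/except and
    mark the instance as ERROR plus clean up when the claim fails.
    """

    def __init__(self):
        self.vm_state = 'BUILD'
        self.networks_cleaned = False

    def _claim_resources(self, alloc_req):
        # Pretend placement rejects the claim on the alternate host.
        raise PlacementError('claim failed during re-schedule')

    def _fill_provider_mapping(self, request_spec):
        # Would recompute the request group -> resource provider mapping.
        pass

    def _cleanup_when_reschedule_fails(self, exc):
        # Mirrors the new helper: set ERROR state and clean up networks.
        self.vm_state = 'ERROR'
        self.networks_cleaned = True

    def reschedule(self, alloc_req, request_spec):
        try:
            self._claim_resources(alloc_req)
            if request_spec:
                self._fill_provider_mapping(request_spec)
        except Exception as exc:
            self._cleanup_when_reschedule_fails(exc)
            return False
        return True


if __name__ == '__main__':
    conductor = MiniConductor()
    assert not conductor.reschedule(alloc_req={}, request_spec={})
    assert conductor.vm_state == 'ERROR'
    assert conductor.networks_cleaned
    print('re-schedule failure handled: instance marked ERROR')

The actual change applies the same try/except in ComputeTaskManager.build_instances around scheduler_utils.claim_resources and self._fill_provider_mapping, routing the error path through the new _cleanup_when_reschedule_fails helper shown in the first hunk.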

@@ -540,6 +540,23 @@ class ComputeTaskManager(base.Base):
                     bdm.attachment_id = attachment['id']
                     bdm.save()

+    def _cleanup_when_reschedule_fails(
+            self, context, instance, exception, legacy_request_spec,
+            requested_networks):
+        """Set the instance state and clean up.
+
+        It is only used in case build_instance fails while rescheduling the
+        instance
+        """
+
+        updates = {'vm_state': vm_states.ERROR,
+                   'task_state': None}
+        self._set_vm_state_and_notify(
+            context, instance.uuid, 'build_instances', updates, exception,
+            legacy_request_spec)
+        self._cleanup_allocated_networks(
+            context, instance, requested_networks)
+
     # NOTE(danms): This is never cell-targeted because it is only used for
     # cellsv1 (which does not target cells directly) and n-cpu reschedules
     # (which go to the cell conductor and thus are always cell-specific).
@@ -628,11 +645,7 @@ class ComputeTaskManager(base.Base):
             # disabled in those cases.
             num_attempts = filter_properties.get(
                 'retry', {}).get('num_attempts', 1)
-            updates = {'vm_state': vm_states.ERROR, 'task_state': None}
             for instance in instances:
-                self._set_vm_state_and_notify(
-                    context, instance.uuid, 'build_instances', updates,
-                    exc, legacy_request_spec)
                 # If num_attempts > 1, we're in a reschedule and probably
                 # either hit NoValidHost or MaxRetriesExceeded. Either way,
                 # the build request should already be gone and we probably
@@ -645,8 +658,9 @@ class ComputeTaskManager(base.Base):
                     self._destroy_build_request(context, instance)
                 except exception.BuildRequestNotFound:
                     pass
-                self._cleanup_allocated_networks(
-                    context, instance, requested_networks)
+                self._cleanup_when_reschedule_fails(
+                    context, instance, exc, legacy_request_spec,
+                    requested_networks)
             return

         elevated = context.elevated()
@@ -667,17 +681,23 @@ class ComputeTaskManager(base.Base):
             else:
                 alloc_req = None
             if alloc_req:
-                host_available = scheduler_utils.claim_resources(
-                    elevated, self.report_client, spec_obj,
-                    instance.uuid, alloc_req,
-                    host.allocation_request_version)
-                if request_spec:
-                    # NOTE(gibi): redo the request group - resource
-                    # provider mapping as the above claim call moves
-                    # the allocation of the instance to another host
-                    # TODO(gibi): handle if the below call raises
-                    self._fill_provider_mapping(
-                        context, instance.uuid, request_spec, host)
+                try:
+                    host_available = scheduler_utils.claim_resources(
+                        elevated, self.report_client, spec_obj,
+                        instance.uuid, alloc_req,
+                        host.allocation_request_version)
+                    if request_spec:
+                        # NOTE(gibi): redo the request group - resource
+                        # provider mapping as the above claim call
+                        # moves the allocation of the instance to
+                        # another host
+                        self._fill_provider_mapping(
+                            context, instance.uuid, request_spec, host)
+                except Exception as exc:
+                    self._cleanup_when_reschedule_fails(
+                        context, instance, exc, legacy_request_spec,
+                        requested_networks)
+                    return
             else:
                 # Some deployments use different schedulers that do not
                 # use Placement, so they will not have an

@@ -245,8 +245,7 @@ class _IntegratedTestBase(test.TestCase):
 class InstanceHelperMixin(object):

     def _wait_for_server_parameter(self, admin_api, server, expected_params,
-                                   max_retries=10,
-                                   fail_when_run_out_of_retries=True):
+                                   max_retries=10):
         retry_count = 0
         while True:
             server = admin_api.get_server(server['id'])
@@ -255,22 +254,17 @@ class InstanceHelperMixin(object):
                 break
             retry_count += 1
             if retry_count == max_retries:
-                if fail_when_run_out_of_retries:
-                    self.fail('Wait for state change failed, '
-                              'expected_params=%s, server=%s'
-                              % (expected_params, server))
-                else:
-                    break
+                self.fail('Wait for state change failed, '
+                          'expected_params=%s, server=%s'
+                          % (expected_params, server))
             time.sleep(0.5)

         return server

     def _wait_for_state_change(self, admin_api, server, expected_status,
-                               max_retries=10,
-                               fail_when_run_out_of_retries=True):
+                               max_retries=10):
         return self._wait_for_server_parameter(
-            admin_api, server, {'status': expected_status}, max_retries,
-            fail_when_run_out_of_retries=fail_when_run_out_of_retries)
+            admin_api, server, {'status': expected_status}, max_retries)

     def _build_minimal_create_server_request(self, api, name, image_uuid=None,
                                              flavor_id=None, networks=None,

@@ -4058,13 +4058,8 @@ class ServerRescheduleTests(integrated_helpers.ProviderUsageBaseTestCase):
                     consumer_uuid=uuids.inst1, error='testing')]):
             server = self.api.post_server({'server': server_req})

-            # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
-            # instead of going to ERROR state
             server = self._wait_for_state_change(
-                self.admin_api, server, 'ERROR',
-                fail_when_run_out_of_retries=False)
-
-            self.assertEqual('BUILD', server['status'])
+                self.admin_api, server, 'ERROR')

         self._delete_and_check_allocations(server)

@@ -6744,23 +6739,16 @@ class PortResourceRequestReSchedulingTest(
         server = self._create_server(
             flavor=self.flavor,
             networks=[{'port': port['id']}])
-
-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
         server = self._wait_for_state_change(
-            self.admin_api, server, 'ERROR',
-            fail_when_run_out_of_retries=False)
-        self.assertEqual('BUILD', server['status'])
+            self.admin_api, server, 'ERROR')

-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state and no
-        # error is presented to the user
-        # self.assertIn(
-        #     'Failed to get traits for resource provider',
-        #     server['fault']['message'])
-        #
-        # NOTE(gibi): even after delete the allocation of such server is leaked
-        # self._delete_and_check_allocations(server)
-        #
-        # # assert that unbind removes the allocation from the binding
-        # updated_port = self.neutron.show_port(port['id'])['port']
-        # binding_profile = updated_port['binding:profile']
-        # self.assertNotIn('allocation', binding_profile)
+        self.assertIn(
+            'Failed to get traits for resource provider',
+            server['fault']['message'])
+
+        self._delete_and_check_allocations(server)
+
+        # assert that unbind removes the allocation from the binding
+        updated_port = self.neutron.show_port(port['id'])['port']
+        binding_profile = updated_port['binding:profile']
+        self.assertNotIn('allocation', binding_profile)