Handle placement error during re-schedule

If the claim_resources or _fill_provider_mapping call fails due to a
placement error, then re-scheduling stops and the instance is not put
into ERROR state. This patch adds proper exception handling for these
cases.

Change-Id: I78fc2312274471a7bd85a263de12cc5a0b19fd10
Closes-Bug: #1819460
commit b096d9303a
parent fd3b86d1c3
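In short: before this patch, an exception raised by the placement calls during a reschedule propagated out of the conductor, so the cleanup never ran and the server sat in BUILD forever. Below is a minimal, self-contained sketch of the pattern the patch introduces; every name in it (PlacementError, claim_resources, reschedule) is an illustrative stand-in, not Nova's real API.

# Minimal sketch of the error handling this patch adds; all names are
# illustrative stand-ins, not Nova's real API.

class PlacementError(Exception):
    """Stands in for any error returned by the placement service."""


def claim_resources(host):
    # Pretend the placement claim against the alternate host fails.
    raise PlacementError('failed to claim resources on %s' % host)


def reschedule(instance, host):
    try:
        claim_resources(host)
        # ... redo the request group -> resource provider mapping ...
    except Exception as exc:
        # Mirrors _cleanup_when_reschedule_fails: record the fault and
        # flip the instance to ERROR instead of leaving it in BUILD.
        instance['vm_state'] = 'error'
        instance['task_state'] = None
        instance['fault'] = str(exc)
        return False
    return True


inst = {'vm_state': 'building', 'task_state': 'spawning'}
assert not reschedule(inst, 'compute-2')
assert inst['vm_state'] == 'error'
print(inst['fault'])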
@@ -540,6 +540,23 @@ class ComputeTaskManager(base.Base):
                     bdm.attachment_id = attachment['id']
                     bdm.save()
 
+    def _cleanup_when_reschedule_fails(
+            self, context, instance, exception, legacy_request_spec,
+            requested_networks):
+        """Set the instance state and clean up.
+
+        It is only used in case build_instance fails while rescheduling the
+        instance
+        """
+
+        updates = {'vm_state': vm_states.ERROR,
+                   'task_state': None}
+        self._set_vm_state_and_notify(
+            context, instance.uuid, 'build_instances', updates, exception,
+            legacy_request_spec)
+        self._cleanup_allocated_networks(
+            context, instance, requested_networks)
+
     # NOTE(danms): This is never cell-targeted because it is only used for
     # cellsv1 (which does not target cells directly) and n-cpu reschedules
     # (which go to the cell conductor and thus are always cell-specific).
@@ -628,11 +645,7 @@ class ComputeTaskManager(base.Base):
             # disabled in those cases.
             num_attempts = filter_properties.get(
                 'retry', {}).get('num_attempts', 1)
-            updates = {'vm_state': vm_states.ERROR, 'task_state': None}
             for instance in instances:
-                self._set_vm_state_and_notify(
-                    context, instance.uuid, 'build_instances', updates,
-                    exc, legacy_request_spec)
                 # If num_attempts > 1, we're in a reschedule and probably
                 # either hit NoValidHost or MaxRetriesExceeded. Either way,
                 # the build request should already be gone and we probably
@@ -645,8 +658,9 @@ class ComputeTaskManager(base.Base):
                     self._destroy_build_request(context, instance)
                 except exception.BuildRequestNotFound:
                     pass
-            self._cleanup_allocated_networks(
-                context, instance, requested_networks)
+            self._cleanup_when_reschedule_fails(
+                context, instance, exc, legacy_request_spec,
+                requested_networks)
             return
 
         elevated = context.elevated()
@@ -667,17 +681,23 @@ class ComputeTaskManager(base.Base):
                 else:
                     alloc_req = None
                 if alloc_req:
-                    host_available = scheduler_utils.claim_resources(
-                        elevated, self.report_client, spec_obj,
-                        instance.uuid, alloc_req,
-                        host.allocation_request_version)
-                    if request_spec:
-                        # NOTE(gibi): redo the request group - resource
-                        # provider mapping as the above claim call moves
-                        # the allocation of the instance to another host
-                        # TODO(gibi): handle if the below call raises
-                        self._fill_provider_mapping(
-                            context, instance.uuid, request_spec, host)
+                    try:
+                        host_available = scheduler_utils.claim_resources(
+                            elevated, self.report_client, spec_obj,
+                            instance.uuid, alloc_req,
+                            host.allocation_request_version)
+                        if request_spec:
+                            # NOTE(gibi): redo the request group - resource
+                            # provider mapping as the above claim call
+                            # moves the allocation of the instance to
+                            # another host
+                            self._fill_provider_mapping(
+                                context, instance.uuid, request_spec, host)
+                    except Exception as exc:
+                        self._cleanup_when_reschedule_fails(
+                            context, instance, exc, legacy_request_spec,
+                            requested_networks)
+                        return
                 else:
                     # Some deployments use different schedulers that do not
                     # use Placement, so they will not have an
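The NOTE(gibi) comment above captures the ordering constraint: a successful claim_resources call moves the instance's allocation to the alternate host, which invalidates any previously computed request group to resource provider mapping. A hedged sketch of that staleness problem, using simplified dicts rather than Nova's real RequestSpec and allocation structures:

# Hedged sketch of why _fill_provider_mapping must be redone after a claim;
# the data shapes below are simplified stand-ins for Nova's RequestSpec and
# allocation request structures.

def fill_provider_mapping(request_spec, allocation):
    # Map each request group (e.g. a port's bandwidth request) to the
    # resource provider that currently holds its allocation.
    request_spec['provider_mapping'] = {
        group: allocation['providers'][group]
        for group in request_spec['request_groups']
    }


spec = {'request_groups': ['port-bandwidth']}

# Allocation as computed on the originally selected host...
fill_provider_mapping(spec, {'providers': {'port-bandwidth': 'host1-nic-rp'}})
assert spec['provider_mapping'] == {'port-bandwidth': 'host1-nic-rp'}

# ...is stale once claim_resources() moves the allocation to the alternate
# host, so the mapping must be recomputed or the port binding would point
# at the wrong provider.
fill_provider_mapping(spec, {'providers': {'port-bandwidth': 'host2-nic-rp'}})
assert spec['provider_mapping'] == {'port-bandwidth': 'host2-nic-rp'}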
@@ -245,8 +245,7 @@ class _IntegratedTestBase(test.TestCase):
 
 class InstanceHelperMixin(object):
     def _wait_for_server_parameter(self, admin_api, server, expected_params,
-                                   max_retries=10,
-                                   fail_when_run_out_of_retries=True):
+                                   max_retries=10):
         retry_count = 0
         while True:
             server = admin_api.get_server(server['id'])
@@ -255,22 +254,17 @@ class InstanceHelperMixin(object):
                 break
             retry_count += 1
             if retry_count == max_retries:
-                if fail_when_run_out_of_retries:
-                    self.fail('Wait for state change failed, '
-                              'expected_params=%s, server=%s'
-                              % (expected_params, server))
-                else:
-                    break
+                self.fail('Wait for state change failed, '
+                          'expected_params=%s, server=%s'
+                          % (expected_params, server))
             time.sleep(0.5)
 
         return server
 
     def _wait_for_state_change(self, admin_api, server, expected_status,
-                               max_retries=10,
-                               fail_when_run_out_of_retries=True):
+                               max_retries=10):
         return self._wait_for_server_parameter(
-            admin_api, server, {'status': expected_status}, max_retries,
-            fail_when_run_out_of_retries=fail_when_run_out_of_retries)
+            admin_api, server, {'status': expected_status}, max_retries)
 
     def _build_minimal_create_server_request(self, api, name, image_uuid=None,
                                              flavor_id=None, networks=None,
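With the conductor now reliably putting failed reschedules into ERROR, the tests no longer need the fail_when_run_out_of_retries escape hatch, so it is removed and the wait helper always hard-fails when retries run out. A standalone sketch of the polling pattern the helper implements (simplified; get_server stands in for the functional-test API client):

# Standalone sketch of the wait-for-state polling pattern; simplified
# relative to InstanceHelperMixin._wait_for_server_parameter.
import time


def wait_for_server_parameter(get_server, server_id, expected, max_retries=10):
    """Poll until the server matches every expected field, else fail hard."""
    server = None
    for _ in range(max_retries):
        server = get_server(server_id)
        if all(server.get(k) == v for k, v in expected.items()):
            return server
        time.sleep(0.5)
    raise AssertionError(
        'Wait for state change failed, expected_params=%s, server=%s'
        % (expected, server))


# Usage: a fake client whose server reaches ERROR on the third poll.
states = iter(['BUILD', 'BUILD', 'ERROR'])
fake_get = lambda server_id: {'id': server_id, 'status': next(states)}
print(wait_for_server_parameter(fake_get, 'uuid-1', {'status': 'ERROR'}))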
@@ -4058,13 +4058,8 @@ class ServerRescheduleTests(integrated_helpers.ProviderUsageBaseTestCase):
                 consumer_uuid=uuids.inst1, error='testing')]):
 
             server = self.api.post_server({'server': server_req})
-            # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
-            # instead of going to ERROR state
             server = self._wait_for_state_change(
-                self.admin_api, server, 'ERROR',
-                fail_when_run_out_of_retries=False)
-
-            self.assertEqual('BUILD', server['status'])
+                self.admin_api, server, 'ERROR')
 
         self._delete_and_check_allocations(server)
 
@@ -6744,23 +6739,16 @@ class PortResourceRequestReSchedulingTest(
         server = self._create_server(
             flavor=self.flavor,
             networks=[{'port': port['id']}])
-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
         server = self._wait_for_state_change(
-            self.admin_api, server, 'ERROR',
-            fail_when_run_out_of_retries=False)
-
-        self.assertEqual('BUILD', server['status'])
-
-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state and no
-        # error is presented to the user
-        # self.assertIn(
-        #     'Failed to get traits for resource provider',
-        #     server['fault']['message'])
-        #
-        # NOTE(gibi): even after delete the allocation of such server is leaked
-        # self._delete_and_check_allocations(server)
-        #
-        # # assert that unbind removes the allocation from the binding
-        # updated_port = self.neutron.show_port(port['id'])['port']
-        # binding_profile = updated_port['binding:profile']
-        # self.assertNotIn('allocation', binding_profile)
+            self.admin_api, server, 'ERROR')
+
+        self.assertIn(
+            'Failed to get traits for resource provider',
+            server['fault']['message'])
+
+        self._delete_and_check_allocations(server)
+
+        # assert that unbind removes the allocation from the binding
+        updated_port = self.neutron.show_port(port['id'])['port']
+        binding_profile = updated_port['binding:profile']
+        self.assertNotIn('allocation', binding_profile)