Merge "Handle placement error during re-schedule" into stable/stein
This commit is contained in:
commit
99600d5b2a
@ -540,6 +540,23 @@ class ComputeTaskManager(base.Base):
|
||||
bdm.attachment_id = attachment['id']
|
||||
bdm.save()
|
||||
|
||||
def _cleanup_when_reschedule_fails(
|
||||
self, context, instance, exception, legacy_request_spec,
|
||||
requested_networks):
|
||||
"""Set the instance state and clean up.
|
||||
|
||||
It is only used in case build_instance fails while rescheduling the
|
||||
instance
|
||||
"""
|
||||
|
||||
updates = {'vm_state': vm_states.ERROR,
|
||||
'task_state': None}
|
||||
self._set_vm_state_and_notify(
|
||||
context, instance.uuid, 'build_instances', updates, exception,
|
||||
legacy_request_spec)
|
||||
self._cleanup_allocated_networks(
|
||||
context, instance, requested_networks)
|
||||
|
||||
# NOTE(danms): This is never cell-targeted because it is only used for
|
||||
# cellsv1 (which does not target cells directly) and n-cpu reschedules
|
||||
# (which go to the cell conductor and thus are always cell-specific).
|
||||
@ -628,11 +645,7 @@ class ComputeTaskManager(base.Base):
|
||||
# disabled in those cases.
|
||||
num_attempts = filter_properties.get(
|
||||
'retry', {}).get('num_attempts', 1)
|
||||
updates = {'vm_state': vm_states.ERROR, 'task_state': None}
|
||||
for instance in instances:
|
||||
self._set_vm_state_and_notify(
|
||||
context, instance.uuid, 'build_instances', updates,
|
||||
exc, legacy_request_spec)
|
||||
# If num_attempts > 1, we're in a reschedule and probably
|
||||
# either hit NoValidHost or MaxRetriesExceeded. Either way,
|
||||
# the build request should already be gone and we probably
|
||||
@ -645,8 +658,9 @@ class ComputeTaskManager(base.Base):
|
||||
self._destroy_build_request(context, instance)
|
||||
except exception.BuildRequestNotFound:
|
||||
pass
|
||||
self._cleanup_allocated_networks(
|
||||
context, instance, requested_networks)
|
||||
self._cleanup_when_reschedule_fails(
|
||||
context, instance, exc, legacy_request_spec,
|
||||
requested_networks)
|
||||
return
|
||||
|
||||
elevated = context.elevated()
|
||||
@ -667,17 +681,23 @@ class ComputeTaskManager(base.Base):
|
||||
else:
|
||||
alloc_req = None
|
||||
if alloc_req:
|
||||
host_available = scheduler_utils.claim_resources(
|
||||
try:
|
||||
host_available = scheduler_utils.claim_resources(
|
||||
elevated, self.report_client, spec_obj,
|
||||
instance.uuid, alloc_req,
|
||||
host.allocation_request_version)
|
||||
if request_spec:
|
||||
# NOTE(gibi): redo the request group - resource
|
||||
# provider mapping as the above claim call moves
|
||||
# the allocation of the instance to another host
|
||||
# TODO(gibi): handle if the below call raises
|
||||
self._fill_provider_mapping(
|
||||
context, instance.uuid, request_spec, host)
|
||||
if request_spec:
|
||||
# NOTE(gibi): redo the request group - resource
|
||||
# provider mapping as the above claim call
|
||||
# moves the allocation of the instance to
|
||||
# another host
|
||||
self._fill_provider_mapping(
|
||||
context, instance.uuid, request_spec, host)
|
||||
except Exception as exc:
|
||||
self._cleanup_when_reschedule_fails(
|
||||
context, instance, exc, legacy_request_spec,
|
||||
requested_networks)
|
||||
return
|
||||
else:
|
||||
# Some deployments use different schedulers that do not
|
||||
# use Placement, so they will not have an
|
||||
|
@ -245,8 +245,7 @@ class _IntegratedTestBase(test.TestCase):
|
||||
|
||||
class InstanceHelperMixin(object):
|
||||
def _wait_for_server_parameter(self, admin_api, server, expected_params,
|
||||
max_retries=10,
|
||||
fail_when_run_out_of_retries=True):
|
||||
max_retries=10):
|
||||
retry_count = 0
|
||||
while True:
|
||||
server = admin_api.get_server(server['id'])
|
||||
@ -255,22 +254,17 @@ class InstanceHelperMixin(object):
|
||||
break
|
||||
retry_count += 1
|
||||
if retry_count == max_retries:
|
||||
if fail_when_run_out_of_retries:
|
||||
self.fail('Wait for state change failed, '
|
||||
'expected_params=%s, server=%s'
|
||||
% (expected_params, server))
|
||||
else:
|
||||
break
|
||||
time.sleep(0.5)
|
||||
|
||||
return server
|
||||
|
||||
def _wait_for_state_change(self, admin_api, server, expected_status,
|
||||
max_retries=10,
|
||||
fail_when_run_out_of_retries=True):
|
||||
max_retries=10):
|
||||
return self._wait_for_server_parameter(
|
||||
admin_api, server, {'status': expected_status}, max_retries,
|
||||
fail_when_run_out_of_retries=fail_when_run_out_of_retries)
|
||||
admin_api, server, {'status': expected_status}, max_retries)
|
||||
|
||||
def _build_minimal_create_server_request(self, api, name, image_uuid=None,
|
||||
flavor_id=None, networks=None,
|
||||
|
@ -4111,13 +4111,8 @@ class ServerRescheduleTests(integrated_helpers.ProviderUsageBaseTestCase):
|
||||
consumer_uuid=uuids.inst1, error='testing')]):
|
||||
|
||||
server = self.api.post_server({'server': server_req})
|
||||
# NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
|
||||
# instead of going to ERROR state
|
||||
server = self._wait_for_state_change(
|
||||
self.admin_api, server, 'ERROR',
|
||||
fail_when_run_out_of_retries=False)
|
||||
|
||||
self.assertEqual('BUILD', server['status'])
|
||||
self.admin_api, server, 'ERROR')
|
||||
|
||||
self._delete_and_check_allocations(server)
|
||||
|
||||
@ -6946,23 +6941,16 @@ class PortResourceRequestReSchedulingTest(
|
||||
server = self._create_server(
|
||||
flavor=self.flavor,
|
||||
networks=[{'port': port['id']}])
|
||||
# NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
|
||||
server = self._wait_for_state_change(
|
||||
self.admin_api, server, 'ERROR',
|
||||
fail_when_run_out_of_retries=False)
|
||||
self.admin_api, server, 'ERROR')
|
||||
|
||||
self.assertEqual('BUILD', server['status'])
|
||||
self.assertIn(
|
||||
'Failed to get traits for resource provider',
|
||||
server['fault']['message'])
|
||||
|
||||
# NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state and no
|
||||
# error is presented to the user
|
||||
# self.assertIn(
|
||||
# 'Failed to get traits for resource provider',
|
||||
# server['fault']['message'])
|
||||
#
|
||||
# NOTE(gibi): even after delete the allocation of such server is leaked
|
||||
# self._delete_and_check_allocations(server)
|
||||
#
|
||||
# # assert that unbind removes the allocation from the binding
|
||||
# updated_port = self.neutron.show_port(port['id'])['port']
|
||||
# binding_profile = updated_port['binding:profile']
|
||||
# self.assertNotIn('allocation', binding_profile)
|
||||
self._delete_and_check_allocations(server)
|
||||
|
||||
# assert that unbind removes the allocation from the binding
|
||||
updated_port = self.neutron.show_port(port['id'])['port']
|
||||
binding_profile = updated_port['binding:profile']
|
||||
self.assertNotIn('allocation', binding_profile)
|
||||
|
Loading…
Reference in New Issue
Block a user