From 1c774315a5134937909f674b5c5a65f06efb3cb7 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Mon, 11 Mar 2019 15:24:05 +0100
Subject: [PATCH] Handle placement error during re-schedule

If claim_resources or _fill_provider_mapping calls fail due to placement
error then the re-scheduling stops and the instance is not put into ERROR
state. This patch adds proper exception handling to these cases.

Change-Id: I78fc2312274471a7bd85a263de12cc5a0b19fd10
Closes-Bug: #1819460
(cherry picked from commit b096d9303acfea81ca56394cab681d2b2eed2d91)
---
 nova/conductor/manager.py                   | 48 +++++++++++++++------
 nova/tests/functional/integrated_helpers.py | 12 ++----
 nova/tests/functional/test_servers.py       | 34 +++++----------
 3 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index aa05e45191b6..a6fa3ba976bb 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -540,6 +540,23 @@ class ComputeTaskManager(base.Base):
                     bdm.attachment_id = attachment['id']
                     bdm.save()
 
+    def _cleanup_when_reschedule_fails(
+            self, context, instance, exception, legacy_request_spec,
+            requested_networks):
+        """Set the instance state and clean up.
+
+        It is only used in case build_instance fails while rescheduling the
+        instance
+        """
+
+        updates = {'vm_state': vm_states.ERROR,
+                   'task_state': None}
+        self._set_vm_state_and_notify(
+            context, instance.uuid, 'build_instances', updates, exception,
+            legacy_request_spec)
+        self._cleanup_allocated_networks(
+            context, instance, requested_networks)
+
     # NOTE(danms): This is never cell-targeted because it is only used for
     # cellsv1 (which does not target cells directly) and n-cpu reschedules
     # (which go to the cell conductor and thus are always cell-specific).
@@ -628,11 +645,7 @@ class ComputeTaskManager(base.Base):
             # disabled in those cases.
             num_attempts = filter_properties.get(
                 'retry', {}).get('num_attempts', 1)
-            updates = {'vm_state': vm_states.ERROR, 'task_state': None}
             for instance in instances:
-                self._set_vm_state_and_notify(
-                    context, instance.uuid, 'build_instances', updates,
-                    exc, legacy_request_spec)
                 # If num_attempts > 1, we're in a reschedule and probably
                 # either hit NoValidHost or MaxRetriesExceeded. Either way,
                 # the build request should already be gone and we probably
@@ -645,8 +658,9 @@ class ComputeTaskManager(base.Base):
                         self._destroy_build_request(context, instance)
                     except exception.BuildRequestNotFound:
                         pass
-                self._cleanup_allocated_networks(
-                    context, instance, requested_networks)
+                self._cleanup_when_reschedule_fails(
+                    context, instance, exc, legacy_request_spec,
+                    requested_networks)
             return
 
         elevated = context.elevated()
@@ -667,17 +681,23 @@ class ComputeTaskManager(base.Base):
                 else:
                     alloc_req = None
                 if alloc_req:
-                    host_available = scheduler_utils.claim_resources(
+                    try:
+                        host_available = scheduler_utils.claim_resources(
                             elevated, self.report_client, spec_obj,
                             instance.uuid, alloc_req,
                             host.allocation_request_version)
-                    if request_spec:
-                        # NOTE(gibi): redo the request group - resource
-                        # provider mapping as the above claim call moves
-                        # the allocation of the instance to another host
-                        # TODO(gibi): handle if the below call raises
-                        self._fill_provider_mapping(
-                            context, instance.uuid, request_spec, host)
+                        if request_spec:
+                            # NOTE(gibi): redo the request group - resource
+                            # provider mapping as the above claim call
+                            # moves the allocation of the instance to
+                            # another host
+                            self._fill_provider_mapping(
+                                context, instance.uuid, request_spec, host)
+                    except Exception as exc:
+                        self._cleanup_when_reschedule_fails(
+                            context, instance, exc, legacy_request_spec,
+                            requested_networks)
+                        return
                 else:
                     # Some deployments use different schedulers that do not
                     # use Placement, so they will not have an
diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py
index fe6d8fd42598..c15f579802df 100644
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@@ -245,8 +245,7 @@ class _IntegratedTestBase(test.TestCase):
 
 class InstanceHelperMixin(object):
     def _wait_for_server_parameter(self, admin_api, server, expected_params,
-                                   max_retries=10,
-                                   fail_when_run_out_of_retries=True):
+                                   max_retries=10):
         retry_count = 0
         while True:
             server = admin_api.get_server(server['id'])
@@ -255,22 +254,17 @@ class InstanceHelperMixin(object):
                 break
             retry_count += 1
             if retry_count == max_retries:
-                if fail_when_run_out_of_retries:
                     self.fail('Wait for state change failed, '
                               'expected_params=%s, server=%s' % (
                                   expected_params, server))
-                else:
-                    break
             time.sleep(0.5)
 
         return server
 
     def _wait_for_state_change(self, admin_api, server, expected_status,
-                               max_retries=10,
-                               fail_when_run_out_of_retries=True):
+                               max_retries=10):
         return self._wait_for_server_parameter(
-            admin_api, server, {'status': expected_status}, max_retries,
-            fail_when_run_out_of_retries=fail_when_run_out_of_retries)
+            admin_api, server, {'status': expected_status}, max_retries)
 
     def _build_minimal_create_server_request(self, api, name, image_uuid=None,
                                              flavor_id=None, networks=None,
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index a639b4842825..d8b2bb5ec1d9 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -4058,13 +4058,8 @@ class ServerRescheduleTests(integrated_helpers.ProviderUsageBaseTestCase):
                     consumer_uuid=uuids.inst1, error='testing')]):
             server = self.api.post_server({'server': server_req})
 
-            # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
-            # instead of going to ERROR state
             server = self._wait_for_state_change(
-                self.admin_api, server, 'ERROR',
-                fail_when_run_out_of_retries=False)
-
-            self.assertEqual('BUILD', server['status'])
+                self.admin_api, server, 'ERROR')
 
         self._delete_and_check_allocations(server)
 
@@ -6811,23 +6806,16 @@ class PortResourceRequestReSchedulingTest(
         server = self._create_server(
             flavor=self.flavor, networks=[{'port': port['id']}])
 
-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
         server = self._wait_for_state_change(
-            self.admin_api, server, 'ERROR',
-            fail_when_run_out_of_retries=False)
+            self.admin_api, server, 'ERROR')
 
-        self.assertEqual('BUILD', server['status'])
+        self.assertIn(
+            'Failed to get traits for resource provider',
+            server['fault']['message'])
 
-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state and no
-        # error is presented to the user
-        # self.assertIn(
-        #     'Failed to get traits for resource provider',
-        #     server['fault']['message'])
-        #
-        # NOTE(gibi): even after delete the allocation of such server is leaked
-        # self._delete_and_check_allocations(server)
-        #
-        # # assert that unbind removes the allocation from the binding
-        # updated_port = self.neutron.show_port(port['id'])['port']
-        # binding_profile = updated_port['binding:profile']
-        # self.assertNotIn('allocation', binding_profile)
+        self._delete_and_check_allocations(server)
+
+        # assert that unbind removes the allocation from the binding
+        updated_port = self.neutron.show_port(port['id'])['port']
+        binding_profile = updated_port['binding:profile']
+        self.assertNotIn('allocation', binding_profile)
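
The shape of the fix is small: every placement call made for an instance during a
re-schedule is wrapped in a try/except, and on failure the instance is pushed to
ERROR and its already-allocated resources are released instead of the loop dying
silently. The standalone sketch below only illustrates that pattern; the names
claim, set_error_state and cleanup_networks are made up and merely stand in for
scheduler_utils.claim_resources/_fill_provider_mapping, _set_vm_state_and_notify
and _cleanup_allocated_networks. It is not Nova code, just a minimal model of
_cleanup_when_reschedule_fails and the new try/except in build_instances.

class PlacementError(Exception):
    """Stand-in for placement failures such as AllocationUpdateFailed."""


def reschedule(instances, hosts, claim, set_error_state, cleanup_networks):
    """Claim resources for each instance on its alternate host.

    'claim' stands in for scheduler_utils.claim_resources plus
    _fill_provider_mapping; 'set_error_state' and 'cleanup_networks' stand
    in for _set_vm_state_and_notify and _cleanup_allocated_networks.
    """
    for instance, host in zip(instances, hosts):
        try:
            claim(instance, host)
        except Exception as exc:
            # Mirrors _cleanup_when_reschedule_fails: record the fault so
            # the server goes to ERROR instead of being stuck in BUILD,
            # then release what was already allocated.
            set_error_state(instance, exc)
            cleanup_networks(instance)
            return
        # ... continue with the normal build path for this instance ...


if __name__ == '__main__':
    def failing_claim(instance, host):
        raise PlacementError('claim failed for %s on %s' % (instance, host))

    errors = []
    reschedule(
        ['inst-1'], ['host-a'], failing_claim,
        set_error_state=lambda inst, exc: errors.append((inst, exc)),
        cleanup_networks=lambda inst: None)
    assert errors and isinstance(errors[0][1], PlacementError)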