Handle placement error during re-schedule

If the claim_resources or _fill_provider_mapping call fails due to a
placement error, then re-scheduling stops and the instance is not put
into ERROR state. This patch adds proper exception handling for these
cases.

Change-Id: I78fc2312274471a7bd85a263de12cc5a0b19fd10
Closes-Bug: #1819460
commit b096d9303a
parent fd3b86d1c3
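In short: before this patch, an exception raised by the placement calls during a reschedule propagated out of the conductor, so the cleanup never ran and the server sat in BUILD forever. Below is a minimal, self-contained sketch of the pattern the patch introduces; every name in it (PlacementError, claim_resources, reschedule) is an illustrative stand-in, not Nova's real API.

# Minimal sketch of the error handling this patch adds; all names are
# illustrative stand-ins, not Nova's real API.

class PlacementError(Exception):
    """Stands in for any error returned by the placement service."""


def claim_resources(host):
    # Pretend the placement claim against the alternate host fails.
    raise PlacementError('failed to claim resources on %s' % host)


def reschedule(instance, host):
    try:
        claim_resources(host)
        # ... redo the request group -> resource provider mapping ...
    except Exception as exc:
        # Mirrors _cleanup_when_reschedule_fails: record the fault and
        # flip the instance to ERROR instead of leaving it in BUILD.
        instance['vm_state'] = 'error'
        instance['task_state'] = None
        instance['fault'] = str(exc)
        return False
    return True


inst = {'vm_state': 'building', 'task_state': 'spawning'}
assert not reschedule(inst, 'compute-2')
assert inst['vm_state'] == 'error'
print(inst['fault'])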
@@ -540,6 +540,23 @@ class ComputeTaskManager(base.Base):
                     bdm.attachment_id = attachment['id']
                     bdm.save()
 
+    def _cleanup_when_reschedule_fails(
+            self, context, instance, exception, legacy_request_spec,
+            requested_networks):
+        """Set the instance state and clean up.
+
+        It is only used in case build_instance fails while rescheduling the
+        instance
+        """
+
+        updates = {'vm_state': vm_states.ERROR,
+                   'task_state': None}
+        self._set_vm_state_and_notify(
+            context, instance.uuid, 'build_instances', updates, exception,
+            legacy_request_spec)
+        self._cleanup_allocated_networks(
+            context, instance, requested_networks)
+
     # NOTE(danms): This is never cell-targeted because it is only used for
     # cellsv1 (which does not target cells directly) and n-cpu reschedules
     # (which go to the cell conductor and thus are always cell-specific).
@@ -628,11 +645,7 @@ class ComputeTaskManager(base.Base):
             # disabled in those cases.
             num_attempts = filter_properties.get(
                 'retry', {}).get('num_attempts', 1)
-            updates = {'vm_state': vm_states.ERROR, 'task_state': None}
             for instance in instances:
-                self._set_vm_state_and_notify(
-                    context, instance.uuid, 'build_instances', updates,
-                    exc, legacy_request_spec)
                 # If num_attempts > 1, we're in a reschedule and probably
                 # either hit NoValidHost or MaxRetriesExceeded. Either way,
                 # the build request should already be gone and we probably
@@ -645,8 +658,9 @@ class ComputeTaskManager(base.Base):
                     self._destroy_build_request(context, instance)
                 except exception.BuildRequestNotFound:
                     pass
-            self._cleanup_allocated_networks(
-                context, instance, requested_networks)
+            self._cleanup_when_reschedule_fails(
+                context, instance, exc, legacy_request_spec,
+                requested_networks)
             return
 
         elevated = context.elevated()
@@ -667,17 +681,23 @@ class ComputeTaskManager(base.Base):
                 else:
                     alloc_req = None
                 if alloc_req:
-                    host_available = scheduler_utils.claim_resources(
-                        elevated, self.report_client, spec_obj,
-                        instance.uuid, alloc_req,
-                        host.allocation_request_version)
-                    if request_spec:
-                        # NOTE(gibi): redo the request group - resource
-                        # provider mapping as the above claim call moves
-                        # the allocation of the instance to another host
-                        # TODO(gibi): handle if the below call raises
-                        self._fill_provider_mapping(
-                            context, instance.uuid, request_spec, host)
+                    try:
+                        host_available = scheduler_utils.claim_resources(
+                            elevated, self.report_client, spec_obj,
+                            instance.uuid, alloc_req,
+                            host.allocation_request_version)
+                        if request_spec:
+                            # NOTE(gibi): redo the request group - resource
+                            # provider mapping as the above claim call
+                            # moves the allocation of the instance to
+                            # another host
+                            self._fill_provider_mapping(
+                                context, instance.uuid, request_spec, host)
+                    except Exception as exc:
+                        self._cleanup_when_reschedule_fails(
+                            context, instance, exc, legacy_request_spec,
+                            requested_networks)
+                        return
                 else:
                     # Some deployments use different schedulers that do not
                     # use Placement, so they will not have an
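The NOTE(gibi) comment above captures the ordering constraint: a successful claim_resources call moves the instance's allocation to the alternate host, which invalidates any previously computed request group to resource provider mapping. A hedged sketch of that staleness problem, using simplified dicts rather than Nova's real RequestSpec and allocation structures:

# Hedged sketch of why _fill_provider_mapping must be redone after a claim;
# the data shapes below are simplified stand-ins for Nova's RequestSpec and
# allocation request structures.

def fill_provider_mapping(request_spec, allocation):
    # Map each request group (e.g. a port's bandwidth request) to the
    # resource provider that currently holds its allocation.
    request_spec['provider_mapping'] = {
        group: allocation['providers'][group]
        for group in request_spec['request_groups']
    }


spec = {'request_groups': ['port-bandwidth']}

# Allocation as computed on the originally selected host...
fill_provider_mapping(spec, {'providers': {'port-bandwidth': 'host1-nic-rp'}})
assert spec['provider_mapping'] == {'port-bandwidth': 'host1-nic-rp'}

# ...is stale once claim_resources() moves the allocation to the alternate
# host, so the mapping must be recomputed or the port binding would point
# at the wrong provider.
fill_provider_mapping(spec, {'providers': {'port-bandwidth': 'host2-nic-rp'}})
assert spec['provider_mapping'] == {'port-bandwidth': 'host2-nic-rp'}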
@@ -245,8 +245,7 @@ class _IntegratedTestBase(test.TestCase):
 
 class InstanceHelperMixin(object):
     def _wait_for_server_parameter(self, admin_api, server, expected_params,
-                                   max_retries=10,
-                                   fail_when_run_out_of_retries=True):
+                                   max_retries=10):
         retry_count = 0
         while True:
             server = admin_api.get_server(server['id'])
@@ -255,22 +254,17 @@ class InstanceHelperMixin(object):
                 break
             retry_count += 1
             if retry_count == max_retries:
-                if fail_when_run_out_of_retries:
-                    self.fail('Wait for state change failed, '
-                              'expected_params=%s, server=%s'
-                              % (expected_params, server))
-                else:
-                    break
+                self.fail('Wait for state change failed, '
+                          'expected_params=%s, server=%s'
+                          % (expected_params, server))
             time.sleep(0.5)
 
         return server
 
     def _wait_for_state_change(self, admin_api, server, expected_status,
-                               max_retries=10,
-                               fail_when_run_out_of_retries=True):
+                               max_retries=10):
         return self._wait_for_server_parameter(
-            admin_api, server, {'status': expected_status}, max_retries,
-            fail_when_run_out_of_retries=fail_when_run_out_of_retries)
+            admin_api, server, {'status': expected_status}, max_retries)
 
     def _build_minimal_create_server_request(self, api, name, image_uuid=None,
                                              flavor_id=None, networks=None,
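With the conductor now reliably putting failed reschedules into ERROR, the tests no longer need the fail_when_run_out_of_retries escape hatch, so it is removed and the wait helper always hard-fails when retries run out. A standalone sketch of the polling pattern the helper implements (simplified; get_server stands in for the functional-test API client):

# Standalone sketch of the wait-for-state polling pattern; simplified
# relative to InstanceHelperMixin._wait_for_server_parameter.
import time


def wait_for_server_parameter(get_server, server_id, expected, max_retries=10):
    """Poll until the server matches every expected field, else fail hard."""
    server = None
    for _ in range(max_retries):
        server = get_server(server_id)
        if all(server.get(k) == v for k, v in expected.items()):
            return server
        time.sleep(0.5)
    raise AssertionError(
        'Wait for state change failed, expected_params=%s, server=%s'
        % (expected, server))


# Usage: a fake client whose server reaches ERROR on the third poll.
states = iter(['BUILD', 'BUILD', 'ERROR'])
fake_get = lambda server_id: {'id': server_id, 'status': next(states)}
print(wait_for_server_parameter(fake_get, 'uuid-1', {'status': 'ERROR'}))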
@@ -4058,13 +4058,8 @@ class ServerRescheduleTests(integrated_helpers.ProviderUsageBaseTestCase):
                 consumer_uuid=uuids.inst1, error='testing')]):
 
             server = self.api.post_server({'server': server_req})
-            # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
-            # instead of going to ERROR state
             server = self._wait_for_state_change(
-                self.admin_api, server, 'ERROR',
-                fail_when_run_out_of_retries=False)
-
-            self.assertEqual('BUILD', server['status'])
+                self.admin_api, server, 'ERROR')
 
         self._delete_and_check_allocations(server)
 
@@ -6744,23 +6739,16 @@ class PortResourceRequestReSchedulingTest(
         server = self._create_server(
             flavor=self.flavor,
             networks=[{'port': port['id']}])
-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
         server = self._wait_for_state_change(
-            self.admin_api, server, 'ERROR',
-            fail_when_run_out_of_retries=False)
-
-        self.assertEqual('BUILD', server['status'])
-
-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state and no
-        # error is presented to the user
-        # self.assertIn(
-        #     'Failed to get traits for resource provider',
-        #     server['fault']['message'])
-        #
-        # NOTE(gibi): even after delete the allocation of such server is leaked
-        # self._delete_and_check_allocations(server)
-        #
-        # # assert that unbind removes the allocation from the binding
-        # updated_port = self.neutron.show_port(port['id'])['port']
-        # binding_profile = updated_port['binding:profile']
-        # self.assertNotIn('allocation', binding_profile)
+            self.admin_api, server, 'ERROR')
+
+        self.assertIn(
+            'Failed to get traits for resource provider',
+            server['fault']['message'])
+
+        self._delete_and_check_allocations(server)
+
+        # assert that unbind removes the allocation from the binding
+        updated_port = self.neutron.show_port(port['id'])['port']
+        binding_profile = updated_port['binding:profile']
+        self.assertNotIn('allocation', binding_profile)