Handle placement error during re-schedule

If claim_resources or _fill_provider_mapping calls fail due to placement error then the re-scheduling stops and the instance is not put into ERROR state. This patch adds proper exception handling to these cases. Change-Id: I78fc2312274471a7bd85a263de12cc5a0b19fd10 Closes-Bug: #1819460
2019-03-11 15:24:05 +01:00 · 2019-03-11 15:24:05 +01:00 · b096d9303a
parent fd3b86d1c3
commit b096d9303a
3 changed files with 48 additions and 46 deletions
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@ -540,6 +540,23 @@ class ComputeTaskManager(base.Base):
                    bdm.attachment_id = attachment['id']
                    bdm.save()

+    def _cleanup_when_reschedule_fails(
+            self, context, instance, exception, legacy_request_spec,
+            requested_networks):
+        """Set the instance state and clean up.
+
+        It is only used in case build_instance fails while rescheduling the
+        instance
+        """
+
+        updates = {'vm_state': vm_states.ERROR,
+                   'task_state': None}
+        self._set_vm_state_and_notify(
+            context, instance.uuid, 'build_instances', updates, exception,
+            legacy_request_spec)
+        self._cleanup_allocated_networks(
+            context, instance, requested_networks)
+
    # NOTE(danms): This is never cell-targeted because it is only used for
    # cellsv1 (which does not target cells directly) and n-cpu reschedules
    # (which go to the cell conductor and thus are always cell-specific).
@ -628,11 +645,7 @@ class ComputeTaskManager(base.Base):
            # disabled in those cases.
            num_attempts = filter_properties.get(
                'retry', {}).get('num_attempts', 1)
-            updates = {'vm_state': vm_states.ERROR, 'task_state': None}
            for instance in instances:
-                self._set_vm_state_and_notify(
-                    context, instance.uuid, 'build_instances', updates,
-                    exc, legacy_request_spec)
                # If num_attempts > 1, we're in a reschedule and probably
                # either hit NoValidHost or MaxRetriesExceeded. Either way,
                # the build request should already be gone and we probably
@ -645,8 +658,9 @@ class ComputeTaskManager(base.Base):
                        self._destroy_build_request(context, instance)
                    except exception.BuildRequestNotFound:
                        pass
-                self._cleanup_allocated_networks(
-                    context, instance, requested_networks)
+                self._cleanup_when_reschedule_fails(
+                    context, instance, exc, legacy_request_spec,
+                    requested_networks)
            return

        elevated = context.elevated()
@ -667,17 +681,23 @@ class ComputeTaskManager(base.Base):
                    else:
                        alloc_req = None
                    if alloc_req:
-                        host_available = scheduler_utils.claim_resources(
+                        try:
+                            host_available = scheduler_utils.claim_resources(
                                elevated, self.report_client, spec_obj,
                                instance.uuid, alloc_req,
                                host.allocation_request_version)
-                        if request_spec:
-                            # NOTE(gibi): redo the request group - resource
-                            # provider mapping as the above claim call moves
-                            # the allocation of the instance to another host
-                            # TODO(gibi): handle if the below call raises
-                            self._fill_provider_mapping(
-                                context, instance.uuid, request_spec, host)
+                            if request_spec:
+                                # NOTE(gibi): redo the request group - resource
+                                # provider mapping as the above claim call
+                                # moves the allocation of the instance to
+                                # another host
+                                self._fill_provider_mapping(
+                                    context, instance.uuid, request_spec, host)
+                        except Exception as exc:
+                            self._cleanup_when_reschedule_fails(
+                                context, instance, exc, legacy_request_spec,
+                                requested_networks)
+                            return
                    else:
                        # Some deployments use different schedulers that do not
                        # use Placement, so they will not have an
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@ -245,8 +245,7 @@ class _IntegratedTestBase(test.TestCase):

 class InstanceHelperMixin(object):
    def _wait_for_server_parameter(self, admin_api, server, expected_params,
-                                   max_retries=10,
-                                   fail_when_run_out_of_retries=True):
+                                   max_retries=10):
        retry_count = 0
        while True:
            server = admin_api.get_server(server['id'])
@ -255,22 +254,17 @@ class InstanceHelperMixin(object):
                break
            retry_count += 1
            if retry_count == max_retries:
-                if fail_when_run_out_of_retries:
                    self.fail('Wait for state change failed, '
                              'expected_params=%s, server=%s'
                              % (expected_params, server))
-                else:
-                    break
            time.sleep(0.5)

        return server

    def _wait_for_state_change(self, admin_api, server, expected_status,
-                               max_retries=10,
-                               fail_when_run_out_of_retries=True):
+                               max_retries=10):
        return self._wait_for_server_parameter(
-            admin_api, server, {'status': expected_status}, max_retries,
-            fail_when_run_out_of_retries=fail_when_run_out_of_retries)
+            admin_api, server, {'status': expected_status}, max_retries)

    def _build_minimal_create_server_request(self, api, name, image_uuid=None,
                                             flavor_id=None, networks=None,
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@ -4058,13 +4058,8 @@ class ServerRescheduleTests(integrated_helpers.ProviderUsageBaseTestCase):
                                consumer_uuid=uuids.inst1, error='testing')]):

            server = self.api.post_server({'server': server_req})
-            # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
-            # instead of going to ERROR state
            server = self._wait_for_state_change(
-                self.admin_api, server, 'ERROR',
-                fail_when_run_out_of_retries=False)
-
-            self.assertEqual('BUILD', server['status'])
+                self.admin_api, server, 'ERROR')

        self._delete_and_check_allocations(server)

@ -6744,23 +6739,16 @@ class PortResourceRequestReSchedulingTest(
            server = self._create_server(
                flavor=self.flavor,
                networks=[{'port': port['id']}])
-            # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
            server = self._wait_for_state_change(
-                self.admin_api, server, 'ERROR',
-                fail_when_run_out_of_retries=False)
+                self.admin_api, server, 'ERROR')

-            self.assertEqual('BUILD', server['status'])
+        self.assertIn(
+            'Failed to get traits for resource provider',
+            server['fault']['message'])

-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state and no
-        # error is presented to the user
-        # self.assertIn(
-        #     'Failed to get traits for resource provider',
-        #     server['fault']['message'])
-        #
-        # NOTE(gibi): even after delete the allocation of such server is leaked
-        # self._delete_and_check_allocations(server)
-        #
-        # # assert that unbind removes the allocation from the binding
-        # updated_port = self.neutron.show_port(port['id'])['port']
-        # binding_profile = updated_port['binding:profile']
-        # self.assertNotIn('allocation', binding_profile)
+        self._delete_and_check_allocations(server)
+
+        # assert that unbind removes the allocation from the binding
+        updated_port = self.neutron.show_port(port['id'])['port']
+        binding_profile = updated_port['binding:profile']
+        self.assertNotIn('allocation', binding_profile)