From 1c774315a5134937909f674b5c5a65f06efb3cb7 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Mon, 11 Mar 2019 15:24:05 +0100
Subject: [PATCH] Handle placement error during re-schedule

If claim_resources or _fill_provider_mapping calls fail due to placement
error then the re-scheduling stops and the instance is not put into ERROR
state. This patch adds proper exception handling to these cases.

Change-Id: I78fc2312274471a7bd85a263de12cc5a0b19fd10
Closes-Bug: #1819460
(cherry picked from commit b096d9303acfea81ca56394cab681d2b2eed2d91)
---
 nova/conductor/manager.py                   | 48 +++++++++++++++------
 nova/tests/functional/integrated_helpers.py | 12 ++----
 nova/tests/functional/test_servers.py       | 34 +++++----------
 3 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index aa05e45191b6..a6fa3ba976bb 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -540,6 +540,23 @@ class ComputeTaskManager(base.Base):
                     bdm.attachment_id = attachment['id']
                     bdm.save()
 
+    def _cleanup_when_reschedule_fails(
+            self, context, instance, exception, legacy_request_spec,
+            requested_networks):
+        """Set the instance state and clean up.
+
+        It is only used in case build_instance fails while rescheduling the
+        instance
+        """
+
+        updates = {'vm_state': vm_states.ERROR,
+                   'task_state': None}
+        self._set_vm_state_and_notify(
+            context, instance.uuid, 'build_instances', updates, exception,
+            legacy_request_spec)
+        self._cleanup_allocated_networks(
+            context, instance, requested_networks)
+
     # NOTE(danms): This is never cell-targeted because it is only used for
     # cellsv1 (which does not target cells directly) and n-cpu reschedules
     # (which go to the cell conductor and thus are always cell-specific).
@@ -628,11 +645,7 @@ class ComputeTaskManager(base.Base):
             # disabled in those cases.
             num_attempts = filter_properties.get(
                 'retry', {}).get('num_attempts', 1)
-            updates = {'vm_state': vm_states.ERROR, 'task_state': None}
             for instance in instances:
-                self._set_vm_state_and_notify(
-                    context, instance.uuid, 'build_instances', updates,
-                    exc, legacy_request_spec)
                 # If num_attempts > 1, we're in a reschedule and probably
                 # either hit NoValidHost or MaxRetriesExceeded. Either way,
                 # the build request should already be gone and we probably
@@ -645,8 +658,9 @@ class ComputeTaskManager(base.Base):
                         self._destroy_build_request(context, instance)
                     except exception.BuildRequestNotFound:
                         pass
-                self._cleanup_allocated_networks(
-                    context, instance, requested_networks)
+                self._cleanup_when_reschedule_fails(
+                    context, instance, exc, legacy_request_spec,
+                    requested_networks)
             return
 
         elevated = context.elevated()
@@ -667,17 +681,23 @@ class ComputeTaskManager(base.Base):
                 else:
                     alloc_req = None
                 if alloc_req:
-                    host_available = scheduler_utils.claim_resources(
+                    try:
+                        host_available = scheduler_utils.claim_resources(
                             elevated, self.report_client, spec_obj,
                             instance.uuid, alloc_req,
                             host.allocation_request_version)
-                    if request_spec:
-                        # NOTE(gibi): redo the request group - resource
-                        # provider mapping as the above claim call moves
-                        # the allocation of the instance to another host
-                        # TODO(gibi): handle if the below call raises
-                        self._fill_provider_mapping(
-                            context, instance.uuid, request_spec, host)
+                        if request_spec:
+                            # NOTE(gibi): redo the request group - resource
+                            # provider mapping as the above claim call
+                            # moves the allocation of the instance to
+                            # another host
+                            self._fill_provider_mapping(
+                                context, instance.uuid, request_spec, host)
+                    except Exception as exc:
+                        self._cleanup_when_reschedule_fails(
+                            context, instance, exc, legacy_request_spec,
+                            requested_networks)
+                        return
                 else:
                     # Some deployments use different schedulers that do not
                     # use Placement, so they will not have an
diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py
index fe6d8fd42598..c15f579802df 100644
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@@ -245,8 +245,7 @@ class _IntegratedTestBase(test.TestCase):
 
 class InstanceHelperMixin(object):
     def _wait_for_server_parameter(self, admin_api, server, expected_params,
-                                   max_retries=10,
-                                   fail_when_run_out_of_retries=True):
+                                   max_retries=10):
         retry_count = 0
         while True:
             server = admin_api.get_server(server['id'])
@@ -255,22 +254,17 @@ class InstanceHelperMixin(object):
                 break
             retry_count += 1
             if retry_count == max_retries:
-                if fail_when_run_out_of_retries:
                     self.fail('Wait for state change failed, '
                               'expected_params=%s, server=%s' % (
                                   expected_params, server))
-                else:
-                    break
             time.sleep(0.5)
 
         return server
 
     def _wait_for_state_change(self, admin_api, server, expected_status,
-                               max_retries=10,
-                               fail_when_run_out_of_retries=True):
+                               max_retries=10):
         return self._wait_for_server_parameter(
-            admin_api, server, {'status': expected_status}, max_retries,
-            fail_when_run_out_of_retries=fail_when_run_out_of_retries)
+            admin_api, server, {'status': expected_status}, max_retries)
 
     def _build_minimal_create_server_request(self, api, name, image_uuid=None,
                                              flavor_id=None, networks=None,
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index a639b4842825..d8b2bb5ec1d9 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -4058,13 +4058,8 @@ class ServerRescheduleTests(integrated_helpers.ProviderUsageBaseTestCase):
                     consumer_uuid=uuids.inst1, error='testing')]):
             server = self.api.post_server({'server': server_req})
 
-            # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
-            # instead of going to ERROR state
             server = self._wait_for_state_change(
-                self.admin_api, server, 'ERROR',
-                fail_when_run_out_of_retries=False)
-
-            self.assertEqual('BUILD', server['status'])
+                self.admin_api, server, 'ERROR')
 
         self._delete_and_check_allocations(server)
 
@@ -6811,23 +6806,16 @@ class PortResourceRequestReSchedulingTest(
         server = self._create_server(
             flavor=self.flavor, networks=[{'port': port['id']}])
 
-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state
         server = self._wait_for_state_change(
-            self.admin_api, server, 'ERROR',
-            fail_when_run_out_of_retries=False)
+            self.admin_api, server, 'ERROR')
 
-        self.assertEqual('BUILD', server['status'])
+        self.assertIn(
+            'Failed to get traits for resource provider',
+            server['fault']['message'])
 
-        # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state and no
-        # error is presented to the user
-        # self.assertIn(
-        #     'Failed to get traits for resource provider',
-        #     server['fault']['message'])
-        #
-        # NOTE(gibi): even after delete the allocation of such server is leaked
-        # self._delete_and_check_allocations(server)
-        #
-        # # assert that unbind removes the allocation from the binding
-        # updated_port = self.neutron.show_port(port['id'])['port']
-        # binding_profile = updated_port['binding:profile']
-        # self.assertNotIn('allocation', binding_profile)
+        self._delete_and_check_allocations(server)
+
+        # assert that unbind removes the allocation from the binding
+        updated_port = self.neutron.show_port(port['id'])['port']
+        binding_profile = updated_port['binding:profile']
+        self.assertNotIn('allocation', binding_profile)
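
The shape of the fix is small: every placement call made for an instance during a
re-schedule is wrapped in a try/except, and on failure the instance is pushed to
ERROR and its already-allocated resources are released instead of the loop dying
silently. The standalone sketch below only illustrates that pattern; the names
claim, set_error_state and cleanup_networks are made up and merely stand in for
scheduler_utils.claim_resources/_fill_provider_mapping, _set_vm_state_and_notify
and _cleanup_allocated_networks. It is not Nova code, just a minimal model of
_cleanup_when_reschedule_fails and the new try/except in build_instances.

class PlacementError(Exception):
    """Stand-in for placement failures such as AllocationUpdateFailed."""


def reschedule(instances, hosts, claim, set_error_state, cleanup_networks):
    """Claim resources for each instance on its alternate host.

    'claim' stands in for scheduler_utils.claim_resources plus
    _fill_provider_mapping; 'set_error_state' and 'cleanup_networks' stand
    in for _set_vm_state_and_notify and _cleanup_allocated_networks.
    """
    for instance, host in zip(instances, hosts):
        try:
            claim(instance, host)
        except Exception as exc:
            # Mirrors _cleanup_when_reschedule_fails: record the fault so
            # the server goes to ERROR instead of being stuck in BUILD,
            # then release what was already allocated.
            set_error_state(instance, exc)
            cleanup_networks(instance)
            return
        # ... continue with the normal build path for this instance ...


if __name__ == '__main__':
    def failing_claim(instance, host):
        raise PlacementError('claim failed for %s on %s' % (instance, host))

    errors = []
    reschedule(
        ['inst-1'], ['host-a'], failing_claim,
        set_error_state=lambda inst, exc: errors.append((inst, exc)),
        cleanup_networks=lambda inst: None)
    assert errors and isinstance(errors[0][1], PlacementError)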