From 07a1a8ff7dcb00283ba7ebb6f59a70002a4ee4db Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Mon, 11 Mar 2019 14:39:10 +0100
Subject: [PATCH] Reproduce bug #1819460 in functional test

There are two calls during ComputeTaskManager.build_instances, used
during re-schedule, that could potentially raise exceptions, leaving
the instance stuck in BUILD state instead of going to ERROR state.
This patch adds two functional test cases to reproduce the problems.

Conflicts:
	nova/tests/functional/test_servers.py

Change-Id: If80c4e4776b81cc06293989ee41d39b53735352b
Related-Bug: #1819460
(cherry picked from commit b63c42a0d4836fd0364cb306145d3474619f1e19)
---
 nova/tests/functional/integrated_helpers.py | 18 +++--
 nova/tests/functional/test_servers.py       | 77 +++++++++++++++++++++
 2 files changed, 89 insertions(+), 6 deletions(-)

diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py
index a5f1d65e7548..fe6d8fd42598 100644
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@@ -245,7 +245,8 @@ class _IntegratedTestBase(test.TestCase):
 
 class InstanceHelperMixin(object):
     def _wait_for_server_parameter(self, admin_api, server, expected_params,
-                                   max_retries=10):
+                                   max_retries=10,
+                                   fail_when_run_out_of_retries=True):
         retry_count = 0
         while True:
             server = admin_api.get_server(server['id'])
@@ -254,17 +255,22 @@ class InstanceHelperMixin(object):
                 break
             retry_count += 1
             if retry_count == max_retries:
-                self.fail('Wait for state change failed, '
-                          'expected_params=%s, server=%s'
-                          % (expected_params, server))
+                if fail_when_run_out_of_retries:
+                    self.fail('Wait for state change failed, '
+                              'expected_params=%s, server=%s'
+                              % (expected_params, server))
+                else:
+                    break
             time.sleep(0.5)
 
         return server
 
     def _wait_for_state_change(self, admin_api, server, expected_status,
-                               max_retries=10):
+                               max_retries=10,
+                               fail_when_run_out_of_retries=True):
         return self._wait_for_server_parameter(
-            admin_api, server, {'status': expected_status}, max_retries)
+            admin_api, server, {'status': expected_status}, max_retries,
+            fail_when_run_out_of_retries=fail_when_run_out_of_retries)
 
     def _build_minimal_create_server_request(self, api, name, image_uuid=None,
                                              flavor_id=None, networks=None,
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index 6371bf21c306..067a3b86ee30 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -35,11 +35,13 @@ from nova.compute import api as compute_api
 from nova.compute import instance_actions
 from nova.compute import manager as compute_manager
 from nova.compute import rpcapi
+from nova.conductor import manager
 from nova import context
 from nova import exception
 from nova import objects
 from nova.objects import block_device as block_device_obj
 from nova import rc_fields
+from nova.scheduler import utils
 from nova.scheduler import weights
 from nova import test
 from nova.tests import fixtures as nova_fixtures
@@ -4035,6 +4037,37 @@ class ServerRescheduleTests(integrated_helpers.ProviderUsageBaseTestCase):
         # Ensure the allocation records on the destination host.
         self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor1)
 
+    def test_allocation_fails_during_reschedule(self):
+        """Verify that if nova fails to allocate resources during re-schedule
+        then the server is put into ERROR state properly.
+ """ + + server_req = self._build_minimal_create_server_request( + self.api, 'some-server', flavor_id=self.flavor1['id'], + image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', + networks='none') + + orig_claim = utils.claim_resources + # First call is during boot, we want that to succeed normally. Then the + # fake virt driver triggers a re-schedule. During that re-schedule we + # simulate that the placement call fails. + with mock.patch('nova.scheduler.utils.claim_resources', + side_effect=[ + orig_claim, + exception.AllocationUpdateFailed( + consumer_uuid=uuids.inst1, error='testing')]): + + server = self.api.post_server({'server': server_req}) + # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state + # instead of going to ERROR state + server = self._wait_for_state_change( + self.admin_api, server, 'ERROR', + fail_when_run_out_of_retries=False) + + self.assertEqual('BUILD', server['status']) + + self._delete_and_check_allocations(server) + class ServerRescheduleTestsWithNestedResourcesRequest(ServerRescheduleTests): compute_driver = 'fake.FakeRescheduleDriverWithNestedCustomResources' @@ -6754,3 +6787,47 @@ class PortResourceRequestReSchedulingTest( updated_port = self.neutron.show_port(port['id'])['port'] binding_profile = updated_port['binding:profile'] self.assertNotIn('allocation', binding_profile) + + def test_boot_reschedule_fill_provider_mapping_raises(self): + """Verify that if the _fill_provider_mapping raises during re-schedule + then the instance is properly put into ERROR state. + """ + + port = self.neutron.port_with_resource_request + + # First call is during boot, we want that to succeed normally. Then the + # fake virt driver triggers a re-schedule. During that re-schedule the + # fill is called again, and we simulate that call raises. + fill = manager.ComputeTaskManager._fill_provider_mapping + + with mock.patch( + 'nova.conductor.manager.ComputeTaskManager.' + '_fill_provider_mapping', + side_effect=[ + fill, + exception.ConsumerAllocationRetrievalFailed( + consumer_uuid=uuids.inst1, error='testing')], + autospec=True): + server = self._create_server( + flavor=self.flavor, + networks=[{'port': port['id']}]) + # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state + server = self._wait_for_state_change( + self.admin_api, server, 'ERROR', + fail_when_run_out_of_retries=False) + + self.assertEqual('BUILD', server['status']) + + # NOTE(gibi): Due to bug 1819460 the server stuck in BUILD state and no + # error is presented to the user + # self.assertIn( + # 'Failed to retrieve allocations for consumer', + # server['fault']['message']) + # + # NOTE(gibi): even after delete the allocation of such server is leaked + # self._delete_and_check_allocations(server) + # + # # assert that unbind removes the allocation from the binding + # updated_port = self.neutron.show_port(port['id'])['port'] + # binding_profile = updated_port['binding:profile'] + # self.assertNotIn('allocation', binding_profile)