From e4a5516098454775e1c5d5f631308bfa9abf7167 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Fri, 21 Jun 2019 17:13:31 +0200
Subject: [PATCH] Functional reproduce for bug 1833581

Change-Id: Id112098ef7603d0e514120ac9b7ed861dfa32bd3
Related-Bug: #1833581
(cherry picked from commit d2e0bd81df6a732f9c78df29538db89dda37b246)
---
 .../functional/compute/test_init_host.py      | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/nova/tests/functional/compute/test_init_host.py b/nova/tests/functional/compute/test_init_host.py
index 91eb7ff157db..2ce71df8a5d6 100644
--- a/nova/tests/functional/compute/test_init_host.py
+++ b/nova/tests/functional/compute/test_init_host.py
@@ -16,6 +16,7 @@ import time
 from nova import context as nova_context
 from nova import objects
 from nova.tests.functional import integrated_helpers
+from nova.tests.unit import fake_notifier
 from nova.tests.unit.image import fake as fake_image
 
 
@@ -120,3 +121,75 @@ class ComputeManagerInitHostTestCase(
         # the source host but is not tracking allocations against the source
         # host.
         self.assertNotIn(server['id'], source_allocations)
+
+
+class TestComputeRestartInstanceStuckInBuild(
+        integrated_helpers.ProviderUsageBaseTestCase):
+
+    compute_driver = 'fake.SmallFakeDriver'
+
+    def setUp(self):
+        super(TestComputeRestartInstanceStuckInBuild, self).setUp()
+        self.compute1 = self._start_compute(host='host1')
+
+        flavors = self.api.get_flavors()
+        self.flavor1 = flavors[0]
+
+    def test_restart_compute_while_instance_waiting_for_resource_claim(self):
+        """Test for bug 1833581 where an instance is stuck in BUILD
+        state forever because the compute service is restarted before
+        the resource claim finishes.
+        """
+
+        # To reproduce the problem we need to stop / kill the compute
+        # service when an instance build request has already reached the
+        # service but the instance_claim() has not finished yet. One way
+        # this happens in practice is when multiple builds are waiting for
+        # the 'nova-compute-resource' semaphore. So one way to reproduce
+        # this in the test would be to grab that semaphore, boot an
+        # instance, wait for it to reach the compute, then stop the compute.
+        # Unfortunately, when we release the semaphore after the simulated
+        # compute restart, the original instance_claim execution continues
+        # because the stopped compute is not 100% stopped in the functional
+        # test env. Also, we cannot keep the semaphore forever as this named
+        # semaphore is shared between the old and the new compute service.
+        # There is another way to trigger the issue: inject a sleep into
+        # instance_claim() to stop it. This is less realistic but it works
+        # in the test env.
+        server_req = self._build_minimal_create_server_request(
+            self.api, 'interrupted-server', flavor_id=self.flavor1['id'],
+            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
+            networks='none')
+
+        def sleep_forever(*args, **kwargs):
+            time.sleep(1000000)
+
+        with mock.patch('nova.compute.resource_tracker.ResourceTracker.'
+                        'instance_claim') as mock_instance_claim:
+            mock_instance_claim.side_effect = sleep_forever
+
+            server = self.api.post_server({'server': server_req})
+            self._wait_for_state_change(self.admin_api, server, 'BUILD')
+
+            # The instance.create.start notification is the closest thing
+            # to the instance_claim call that we can wait for in the test.
+            fake_notifier.wait_for_versioned_notifications(
+                'instance.create.start')
+            self.restart_compute_service(self.compute1)
+
+        # This is bug 1833581 as the server remains in BUILD state after the
+        # compute restart.
+        self._wait_for_state_change(self.admin_api, server, 'BUILD')
+
+        # Not even the periodic task pushes this server to ERROR because the
+        # server host is still None since the instance_claim didn't set it.
+        self.flags(instance_build_timeout=1)
+        self.compute1.manager._check_instance_build_time(
+            nova_context.get_admin_context())
+        server = self.admin_api.get_server(server['id'])
+        self.assertEqual('BUILD', server['status'])
+        self.assertIsNone(server['OS-EXT-SRV-ATTR:host'])
+
+        # We expect that the instance is pushed to ERROR state during the
+        # compute restart.
+        # self._wait_for_state_change(self.admin_api, server, 'ERROR')