From 64f797a0514b0276540d4f6c28cb290383088e35 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Fri, 15 Nov 2019 16:31:04 +0100
Subject: [PATCH] Fix false ERROR message at compute restart

If an empty compute is restarted, a false ERROR message was printed in
the log because the placement report client did not distinguish between
an error response from placement and an empty allocation dict returned
by placement.

This patch changes get_allocations_for_resource_provider to return None
in case of an error instead of an empty dict. This is in line with
@safe_connect, which would make the call return None as well.
_error_out_instances_whose_build_was_interrupted is also changed to
check for None instead of an empty dict before reporting the ERROR.

The only other caller of get_allocations_for_resource_provider was
already checking for None and converting it to an empty dict, so from
that caller's perspective this is a compatible change on the report
client.

This is a stable-only change, as get_allocations_for_resource_provider
was improved during Stein [1] to raise on placement errors.

[1] I020e7dc47efc79f8907b7bfb753ec779a8da69a1

Change-Id: I6042e493144d4d5a29ec6ab23ffed6b3e7f385fe
Closes-Bug: #1852759
---
 nova/compute/manager.py                     |  2 +-
 nova/scheduler/client/report.py             |  6 +-
 nova/tests/unit/compute/test_compute_mgr.py | 72 +++++++++++++++++++++
 3 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 86523e006935..c44a4490ed39 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1324,7 +1324,7 @@ class ComputeManager(manager.Manager):
 
         f = self.reportclient.get_allocations_for_resource_provider
         allocations = f(context, cn_uuid)
-        if not allocations:
+        if allocations is None:
             LOG.error(
                 "Could not retrieve compute node resource provider %s and "
                 "therefore unable to error out any instances stuck in "
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index 51005164a9c8..23bb6fc89ac0 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -1938,7 +1938,11 @@ class SchedulerReportClient(object):
         url = '/resource_providers/%s/allocations' % rp_uuid
         resp = self.get(url, global_request_id=context.global_id)
         if not resp:
-            return {}
+            # NOTE(gibi): The request failed with an error response. Rather
+            # than return an empty dict, which is possible if there are no
+            # allocations against the given provider, return None to indicate
+            # a failure - like in the @safe_connect decorator.
+            return None
         else:
             return resp.json()['allocations']
 
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 85e89a530038..00c38b82d461 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -976,6 +976,78 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         mock_instance_save.assert_called_once_with()
         self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state)
 
+    @mock.patch.object(manager.LOG, 'error')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(objects.InstanceList, 'get_by_filters')
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build_empty_compute(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations, mock_get_instances, mock_instance_save,
+            mock_log):
+
+        mock_get_nodes.return_value = ['fake-node']
+        mock_get_by_host_and_node.return_value = objects.ComputeNode(
+            host=self.compute.host, uuid=uuids.cn_uuid)
+
+        # no instances on the host so no allocations in placement
+        allocations = {}
+        mock_get_allocations.return_value = allocations
+        mock_get_instances.return_value = objects.InstanceList(
+            self.context, objects=[])
+
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, set())
+
+        mock_get_by_host_and_node.assert_called_once_with(
+            self.context, self.compute.host, 'fake-node')
+        mock_get_allocations.assert_called_once_with(
+            self.context, uuids.cn_uuid)
+
+        mock_get_instances.assert_not_called()
+        mock_instance_save.assert_not_called()
+        mock_log.assert_not_called()
+
+    @mock.patch.object(manager.LOG, 'error')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(objects.InstanceList, 'get_by_filters')
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build_placement_error(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations, mock_get_instances, mock_instance_save,
+            mock_log):
+
+        mock_get_nodes.return_value = ['fake-node']
+        mock_get_by_host_and_node.return_value = objects.ComputeNode(
+            host=self.compute.host, uuid=uuids.cn_uuid)
+
+        # get_allocations_for_resource_provider returns None if placement
+        # returns an error
+        allocations = None
+        mock_get_allocations.return_value = allocations
+        mock_get_instances.return_value = objects.InstanceList(
+            self.context, objects=[])
+
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, set())
+
+        mock_get_by_host_and_node.assert_called_once_with(
+            self.context, self.compute.host, 'fake-node')
+        mock_get_allocations.assert_called_once_with(
+            self.context, uuids.cn_uuid)
+
+        mock_get_instances.assert_not_called()
+        mock_instance_save.assert_not_called()
+        mock_log.assert_called_once_with(
+            'Could not retrieve compute node resource provider %s and '
+            'therefore unable to error out any instances stuck in '
+            'BUILDING state.', uuids.cn_uuid)
+
     @mock.patch.object(manager.LOG, 'warning')
     @mock.patch.object(
         fake_driver.FakeDriver, 'get_available_nodes',
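Below is a short, standalone Python sketch, not part of the patch itself, that illustrates the return-value contract described in the commit message: None signals a placement error, while an empty dict simply means the provider has no allocations. The helper name, the print call and the fake uuid are placeholders invented for the example and do not correspond to real nova code.

    # Illustrative sketch only: mimics the caller-side check that the patch
    # introduces in _error_out_instances_whose_build_was_interrupted.
    import logging

    logging.basicConfig()
    LOG = logging.getLogger(__name__)


    def error_out_interrupted_builds(allocations, cn_uuid):
        """Distinguish a placement failure from an empty compute node."""
        if allocations is None:
            # Only a failed placement call is worth an ERROR; an empty
            # compute node is a normal situation and stays silent.
            LOG.error(
                "Could not retrieve compute node resource provider %s and "
                "therefore unable to error out any instances stuck in "
                "BUILDING state.", cn_uuid)
            return
        # allocations == {} simply means there is nothing to error out.
        for consumer_uuid in allocations:
            print('would check instance %s' % consumer_uuid)


    error_out_interrupted_builds({}, 'fake-cn-uuid')    # empty compute: no ERROR
    error_out_interrupted_builds(None, 'fake-cn-uuid')  # placement error: ERROR logged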