Fix false ERROR message at compute restart

If an empty compute is restarted, a false ERROR message was printed in
the log, as the placement report client does not distinguish between an
error from placement and an empty allocation dict from placement.

This patch changes get_allocations_for_resource_provider to return None
in case of error instead of an empty dict. This is in line with
@safe_connect that would make the call return None as well. The
_error_out_instances_whose_build_was_interrupted method is also changed
to check for None instead of an empty dict before reporting the ERROR.
The only other caller of get_allocations_for_resource_provider was
already checking for None and converting it to an empty dict, so from
that caller's perspective this is a compatible change to the report
client.

This is a stable-only change, as get_allocations_for_resource_provider
was improved during Stein [1] to raise on placement errors.

[1]I020e7dc47efc79f8907b7bfb753ec779a8da69a1

Change-Id: I6042e493144d4d5a29ec6ab23ffed6b3e7f385fe
Closes-Bug: #1852759
(cherry picked from commit 64f797a051)
This commit is contained in:
Balazs Gibizer 2019-11-15 16:31:04 +01:00 committed by Vlad Gusev
parent 4e9f852056
commit 4fcb7816bc
3 changed files with 78 additions and 2 deletions

View File

@ -1287,7 +1287,7 @@ class ComputeManager(manager.Manager):
f = self.reportclient.get_allocations_for_resource_provider f = self.reportclient.get_allocations_for_resource_provider
allocations = f(context, cn_uuid) allocations = f(context, cn_uuid)
if not allocations: if allocations is None:
LOG.error( LOG.error(
"Could not retrieve compute node resource provider %s and " "Could not retrieve compute node resource provider %s and "
"therefore unable to error out any instances stuck in " "therefore unable to error out any instances stuck in "

View File

@ -1780,7 +1780,11 @@ class SchedulerReportClient(object):
url = '/resource_providers/%s/allocations' % rp_uuid url = '/resource_providers/%s/allocations' % rp_uuid
resp = self.get(url, global_request_id=context.global_id) resp = self.get(url, global_request_id=context.global_id)
if not resp: if not resp:
return {} # NOTE(gibi): The request failed with an error response. Rather
# than return an empty dict, which is possible if there are no
# allocations against the given provider, return None to indicate
# a failure - like in the @safe_connect decorator.
return None
else: else:
return resp.json()['allocations'] return resp.json()['allocations']

View File

@ -885,6 +885,78 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
mock_instance_save.assert_called_once_with() mock_instance_save.assert_called_once_with()
self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state) self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state)
@mock.patch.object(manager.LOG, 'error')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(objects.InstanceList, 'get_by_filters')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
            'get_allocations_for_resource_provider')
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
def test_init_host_with_interrupted_instance_build_empty_compute(
        self, mock_get_nodes, mock_get_by_host_and_node,
        mock_get_allocations, mock_get_instances, mock_instance_save,
        mock_log):
    """An empty allocation dict from placement must not be treated as
    a failure.

    Regression test for bug 1852759: restarting a compute that has no
    instances (and therefore no allocations in placement) must not log
    the "Could not retrieve compute node resource provider" ERROR.
    The report client returns {} for the no-allocations case and None
    only on an actual placement error.
    """
    mock_get_nodes.return_value = ['fake-node']
    mock_get_by_host_and_node.return_value = objects.ComputeNode(
        host=self.compute.host, uuid=uuids.cn_uuid)
    # no instances on the host so no allocations in placement
    allocations = {}
    mock_get_allocations.return_value = allocations
    mock_get_instances.return_value = objects.InstanceList(
        self.context, objects=[])

    self.compute._error_out_instances_whose_build_was_interrupted(
        self.context, set())

    mock_get_by_host_and_node.assert_called_once_with(
        self.context, self.compute.host, 'fake-node')
    mock_get_allocations.assert_called_once_with(
        self.context, uuids.cn_uuid)
    # With an empty (but non-None) allocation dict there is nothing to
    # error out, and crucially nothing may be logged at ERROR level.
    mock_get_instances.assert_not_called()
    mock_instance_save.assert_not_called()
    mock_log.assert_not_called()
@mock.patch.object(manager.LOG, 'error')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(objects.InstanceList, 'get_by_filters')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
            'get_allocations_for_resource_provider')
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
def test_init_host_with_interrupted_instance_build_placement_error(
        self, mock_get_nodes, mock_get_by_host_and_node,
        mock_get_allocations, mock_get_instances, mock_instance_save,
        mock_log):
    """A real placement error (None from the report client) must still
    be reported at ERROR level.

    Companion to the empty-compute test above in intent: when
    get_allocations_for_resource_provider returns None (placement
    request failed), _error_out_instances_whose_build_was_interrupted
    cannot determine which builds were interrupted, so it bails out
    after logging the ERROR.
    """
    mock_get_nodes.return_value = ['fake-node']
    mock_get_by_host_and_node.return_value = objects.ComputeNode(
        host=self.compute.host, uuid=uuids.cn_uuid)
    # get_allocations_for_resource_provider returns None if placement
    # returns an error
    allocations = None
    mock_get_allocations.return_value = allocations
    mock_get_instances.return_value = objects.InstanceList(
        self.context, objects=[])

    self.compute._error_out_instances_whose_build_was_interrupted(
        self.context, set())

    mock_get_by_host_and_node.assert_called_once_with(
        self.context, self.compute.host, 'fake-node')
    mock_get_allocations.assert_called_once_with(
        self.context, uuids.cn_uuid)
    # On error nothing is queried or saved; only the ERROR is logged,
    # with lazy %-style args (no f-string), matching manager.py.
    mock_get_instances.assert_not_called()
    mock_instance_save.assert_not_called()
    mock_log.assert_called_once_with(
        'Could not retrieve compute node resource provider %s and '
        'therefore unable to error out any instances stuck in '
        'BUILDING state.', uuids.cn_uuid)
@mock.patch.object(manager.LOG, 'warning') @mock.patch.object(manager.LOG, 'warning')
@mock.patch.object( @mock.patch.object(
fake_driver.FakeDriver, 'get_available_nodes', fake_driver.FakeDriver, 'get_available_nodes',