From 64f797a0514b0276540d4f6c28cb290383088e35 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Fri, 15 Nov 2019 16:31:04 +0100
Subject: [PATCH] Fix false ERROR message at compute restart

If an empty compute is restarted, a false ERROR message was printed in
the log because the placement report client did not distinguish between
an error response from placement and an empty allocation dict returned
by placement.

This patch changes get_allocations_for_resource_provider to return None
in case of an error instead of an empty dict. This is in line with
@safe_connect, which would make the call return None as well.
_error_out_instances_whose_build_was_interrupted is also changed to
check for None instead of an empty dict before reporting the ERROR.

The only other caller of get_allocations_for_resource_provider was
already checking for None and converting it to an empty dict, so from
that caller's perspective this is a compatible change on the report
client.

This is a stable-only change, as get_allocations_for_resource_provider
was improved during Stein [1] to raise on placement errors.

[1] I020e7dc47efc79f8907b7bfb753ec779a8da69a1

Change-Id: I6042e493144d4d5a29ec6ab23ffed6b3e7f385fe
Closes-Bug: #1852759
---
 nova/compute/manager.py                     |  2 +-
 nova/scheduler/client/report.py             |  6 +-
 nova/tests/unit/compute/test_compute_mgr.py | 72 +++++++++++++++++++++
 3 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 86523e006935..c44a4490ed39 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1324,7 +1324,7 @@ class ComputeManager(manager.Manager):
 
         f = self.reportclient.get_allocations_for_resource_provider
         allocations = f(context, cn_uuid)
-        if not allocations:
+        if allocations is None:
             LOG.error(
                 "Could not retrieve compute node resource provider %s and "
                 "therefore unable to error out any instances stuck in "
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index 51005164a9c8..23bb6fc89ac0 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -1938,7 +1938,11 @@ class SchedulerReportClient(object):
         url = '/resource_providers/%s/allocations' % rp_uuid
         resp = self.get(url, global_request_id=context.global_id)
         if not resp:
-            return {}
+            # NOTE(gibi): The request failed with an error response. Rather
+            # than return an empty dict, which is possible if there are no
+            # allocations against the given provider, return None to indicate
+            # a failure - like in the @safe_connect decorator.
+            return None
         else:
             return resp.json()['allocations']
 
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 85e89a530038..00c38b82d461 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -976,6 +976,78 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         mock_instance_save.assert_called_once_with()
         self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state)
 
+    @mock.patch.object(manager.LOG, 'error')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(objects.InstanceList, 'get_by_filters')
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build_empty_compute(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations, mock_get_instances, mock_instance_save,
+            mock_log):
+
+        mock_get_nodes.return_value = ['fake-node']
+        mock_get_by_host_and_node.return_value = objects.ComputeNode(
+            host=self.compute.host, uuid=uuids.cn_uuid)
+
+        # no instances on the host so no allocations in placement
+        allocations = {}
+        mock_get_allocations.return_value = allocations
+        mock_get_instances.return_value = objects.InstanceList(
+            self.context, objects=[])
+
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, set())
+
+        mock_get_by_host_and_node.assert_called_once_with(
+            self.context, self.compute.host, 'fake-node')
+        mock_get_allocations.assert_called_once_with(
+            self.context, uuids.cn_uuid)
+
+        mock_get_instances.assert_not_called()
+        mock_instance_save.assert_not_called()
+        mock_log.assert_not_called()
+
+    @mock.patch.object(manager.LOG, 'error')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(objects.InstanceList, 'get_by_filters')
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build_placement_error(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations, mock_get_instances, mock_instance_save,
+            mock_log):
+
+        mock_get_nodes.return_value = ['fake-node']
+        mock_get_by_host_and_node.return_value = objects.ComputeNode(
+            host=self.compute.host, uuid=uuids.cn_uuid)
+
+        # get_allocations_for_resource_provider returns None if placement
+        # returns an error
+        allocations = None
+        mock_get_allocations.return_value = allocations
+        mock_get_instances.return_value = objects.InstanceList(
+            self.context, objects=[])
+
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, set())
+
+        mock_get_by_host_and_node.assert_called_once_with(
+            self.context, self.compute.host, 'fake-node')
+        mock_get_allocations.assert_called_once_with(
+            self.context, uuids.cn_uuid)
+
+        mock_get_instances.assert_not_called()
+        mock_instance_save.assert_not_called()
+        mock_log.assert_called_once_with(
+            'Could not retrieve compute node resource provider %s and '
+            'therefore unable to error out any instances stuck in '
+            'BUILDING state.', uuids.cn_uuid)
+
     @mock.patch.object(manager.LOG, 'warning')
     @mock.patch.object(
         fake_driver.FakeDriver, 'get_available_nodes',
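Below is a short, standalone Python sketch, not part of the patch itself, that illustrates the return-value contract described in the commit message: None signals a placement error, while an empty dict simply means the provider has no allocations. The helper name, the print call and the fake uuid are placeholders invented for the example and do not correspond to real nova code.

    # Illustrative sketch only: mimics the caller-side check that the patch
    # introduces in _error_out_instances_whose_build_was_interrupted.
    import logging

    logging.basicConfig()
    LOG = logging.getLogger(__name__)


    def error_out_interrupted_builds(allocations, cn_uuid):
        """Distinguish a placement failure from an empty compute node."""
        if allocations is None:
            # Only a failed placement call is worth an ERROR; an empty
            # compute node is a normal situation and stays silent.
            LOG.error(
                "Could not retrieve compute node resource provider %s and "
                "therefore unable to error out any instances stuck in "
                "BUILDING state.", cn_uuid)
            return
        # allocations == {} simply means there is nothing to error out.
        for consumer_uuid in allocations:
            print('would check instance %s' % consumer_uuid)


    error_out_interrupted_builds({}, 'fake-cn-uuid')    # empty compute: no ERROR
    error_out_interrupted_builds(None, 'fake-cn-uuid')  # placement error: ERROR logged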