Fix false ERROR message at compute restart

If an empty compute is restarted, a false ERROR message was printed in
the log, as the placement report client does not distinguish between an
error from placement and an empty allocation dict from placement.

This patch changes get_allocations_for_resource_provider to return None
in case of error instead of an empty dict. This is in line with
@safe_connect that would make the call return None as well. The
_error_out_instances_whose_build_was_interrupted method is also changed
to check for None instead of an empty dict before reporting the ERROR.
The only other caller of get_allocations_for_resource_provider was
already checking for None and converting it to an empty dict, so from
that caller's perspective this is a compatible change to the report
client.

This is a stable-only change, as get_allocations_for_resource_provider
was improved during Stein [1] to raise on placement errors.

[1]I020e7dc47efc79f8907b7bfb753ec779a8da69a1

Change-Id: I6042e493144d4d5a29ec6ab23ffed6b3e7f385fe
Closes-Bug: #1852759
(cherry picked from commit 64f797a051)
This commit is contained in:
Balazs Gibizer 2019-11-15 16:31:04 +01:00 committed by Vlad Gusev
parent 4e9f852056
commit 4fcb7816bc
3 changed files with 78 additions and 2 deletions

View File

@ -1287,7 +1287,7 @@ class ComputeManager(manager.Manager):
f = self.reportclient.get_allocations_for_resource_provider f = self.reportclient.get_allocations_for_resource_provider
allocations = f(context, cn_uuid) allocations = f(context, cn_uuid)
if not allocations: if allocations is None:
LOG.error( LOG.error(
"Could not retrieve compute node resource provider %s and " "Could not retrieve compute node resource provider %s and "
"therefore unable to error out any instances stuck in " "therefore unable to error out any instances stuck in "

View File

@ -1780,7 +1780,11 @@ class SchedulerReportClient(object):
url = '/resource_providers/%s/allocations' % rp_uuid url = '/resource_providers/%s/allocations' % rp_uuid
resp = self.get(url, global_request_id=context.global_id) resp = self.get(url, global_request_id=context.global_id)
if not resp: if not resp:
return {} # NOTE(gibi): The request failed with an error response. Rather
# than return an empty dict, which is possible if there are no
# allocations against the given provider, return None to indicate
# a failure - like in the @safe_connect decorator.
return None
else: else:
return resp.json()['allocations'] return resp.json()['allocations']

View File

@ -885,6 +885,78 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
mock_instance_save.assert_called_once_with() mock_instance_save.assert_called_once_with()
self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state) self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state)
@mock.patch.object(manager.LOG, 'error')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(objects.InstanceList, 'get_by_filters')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
            'get_allocations_for_resource_provider')
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
def test_init_host_with_interrupted_instance_build_empty_compute(
        self, mock_get_nodes, mock_get_by_host_and_node,
        mock_get_allocations, mock_get_instances, mock_instance_save,
        mock_log):
    """An empty allocation dict from placement must not be treated as
    a failure.

    Regression test for bug 1852759: restarting a compute that has no
    instances (and therefore no allocations in placement) must not log
    the "Could not retrieve compute node resource provider" ERROR.
    The report client returns {} for the no-allocations case and None
    only on an actual placement error.
    """
    mock_get_nodes.return_value = ['fake-node']
    mock_get_by_host_and_node.return_value = objects.ComputeNode(
        host=self.compute.host, uuid=uuids.cn_uuid)
    # no instances on the host so no allocations in placement
    allocations = {}
    mock_get_allocations.return_value = allocations
    mock_get_instances.return_value = objects.InstanceList(
        self.context, objects=[])

    self.compute._error_out_instances_whose_build_was_interrupted(
        self.context, set())

    mock_get_by_host_and_node.assert_called_once_with(
        self.context, self.compute.host, 'fake-node')
    mock_get_allocations.assert_called_once_with(
        self.context, uuids.cn_uuid)
    # With an empty (but non-None) allocation dict there is nothing to
    # error out, and crucially nothing may be logged at ERROR level.
    mock_get_instances.assert_not_called()
    mock_instance_save.assert_not_called()
    mock_log.assert_not_called()
@mock.patch.object(manager.LOG, 'error')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(objects.InstanceList, 'get_by_filters')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
            'get_allocations_for_resource_provider')
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
def test_init_host_with_interrupted_instance_build_placement_error(
        self, mock_get_nodes, mock_get_by_host_and_node,
        mock_get_allocations, mock_get_instances, mock_instance_save,
        mock_log):
    """A real placement error (None from the report client) must still
    be reported at ERROR level.

    Companion to the empty-compute test above in intent: when
    get_allocations_for_resource_provider returns None (placement
    request failed), _error_out_instances_whose_build_was_interrupted
    cannot determine which builds were interrupted, so it bails out
    after logging the ERROR.
    """
    mock_get_nodes.return_value = ['fake-node']
    mock_get_by_host_and_node.return_value = objects.ComputeNode(
        host=self.compute.host, uuid=uuids.cn_uuid)
    # get_allocations_for_resource_provider returns None if placement
    # returns an error
    allocations = None
    mock_get_allocations.return_value = allocations
    mock_get_instances.return_value = objects.InstanceList(
        self.context, objects=[])

    self.compute._error_out_instances_whose_build_was_interrupted(
        self.context, set())

    mock_get_by_host_and_node.assert_called_once_with(
        self.context, self.compute.host, 'fake-node')
    mock_get_allocations.assert_called_once_with(
        self.context, uuids.cn_uuid)
    # On error nothing is queried or saved; only the ERROR is logged,
    # with lazy %-style args (no f-string), matching manager.py.
    mock_get_instances.assert_not_called()
    mock_instance_save.assert_not_called()
    mock_log.assert_called_once_with(
        'Could not retrieve compute node resource provider %s and '
        'therefore unable to error out any instances stuck in '
        'BUILDING state.', uuids.cn_uuid)
@mock.patch.object(manager.LOG, 'warning') @mock.patch.object(manager.LOG, 'warning')
@mock.patch.object( @mock.patch.object(
fake_driver.FakeDriver, 'get_available_nodes', fake_driver.FakeDriver, 'get_available_nodes',