From ad4ef5af8bf7dc34caef21b0062ef9cc504bc216 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Wed, 31 Jul 2019 12:41:00 -0400 Subject: [PATCH] rt: soften warning case in _remove_deleted_instances_allocations During an instance_claim during initial server create or unshelve, the instance.host/node values will be set before the resource tracker has the instance in the tracked_instances dict. If _remove_deleted_instances_allocations is called with the instance before it's being tracked, a warning like this is logged: Jul 31 13:12:57.455904 ubuntu-bionic-rax-iad-0009534722 nova-compute[31951]: WARNING nova.compute.resource_tracker [None req-d6f2ae97-d8f7-46f6-8974-b42aeb58302d None None] Instance 227c23cd-aeb2-4b5a-b001-21bd920a5e77 is not being actively managed by this compute host but has allocations referencing this compute host: {u'resources': {u'MEMORY_MB': 64, u'VCPU': 1, u'DISK_GB': 1}}. Skipping heal of allocation because we do not know what to do. This shows up quite frequently in CI runs (see the bug report for a logstash query) which means it should not be a warning. This change checks the instance task_state and if set then we only log a debug message rather than the warning since we can assume we are racing and the task will correct itself upon completion. Change-Id: I6db8bea6761b68c39e6332d4698d1f8312863758 Closes-Bug: #1838541 --- nova/compute/resource_tracker.py | 26 ++++++--- .../unit/compute/test_resource_tracker.py | 53 ++++++++++++++++++- 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index f191b9d7bc0b..57fb2af62c30 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -1387,12 +1387,26 @@ class ResourceTracker(object): if (instance.host == cn.host and instance.node == cn.hypervisor_hostname): # The instance is supposed to be on this compute host but is - # not in the list of actively managed instances. - LOG.warning("Instance %s is not being actively managed by " - "this compute host but has allocations " - "referencing this compute host: %s. Skipping " - "heal of allocation because we do not know " - "what to do.", instance_uuid, alloc) + # not in the list of actively managed instances. This could be + # because we are racing with an instance_claim call during + # initial build or unshelve where the instance host/node is set + # before the instance is added to tracked_instances. If the + # task_state is set, then consider things in motion and log at + # debug level instead of warning. + if instance.task_state: + LOG.debug('Instance with task_state "%s" is not being ' + 'actively managed by this compute host but has ' + 'allocations referencing this compute node ' + '(%s): %s. Skipping heal of allocations during ' + 'the task state transition.', + instance.task_state, cn.uuid, alloc, + instance=instance) + else: + LOG.warning("Instance %s is not being actively managed by " + "this compute host but has allocations " + "referencing this compute host: %s. Skipping " + "heal of allocation because we do not know " + "what to do.", instance_uuid, alloc) continue if instance.host != cn.host: # The instance has been moved to another host either via a diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index fdfcfbd40fca..f89c7f25621c 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -3204,9 +3204,10 @@ class TestUpdateUsageFromInstance(BaseTestCase): # instance and has allocations for it. rc.delete_allocation_for_instance.assert_not_called() + @mock.patch('nova.compute.resource_tracker.LOG.warning') @mock.patch('nova.objects.Instance.get_by_uuid') def test_remove_deleted_instances_allocations_unknown_instance( - self, mock_inst_get): + self, mock_inst_get, mock_log_warning): """Tests the case that an instance is found with allocations for this host/node but is not in the dict of tracked instances. The allocations are not removed for the instance since we don't know @@ -3241,6 +3242,56 @@ class TestUpdateUsageFromInstance(BaseTestCase): # testing the current behavior but in the future when we get smart # and figure things out, this should actually be an error. rc.delete_allocation_for_instance.assert_not_called() + # Assert the expected warning was logged. + mock_log_warning.assert_called_once() + self.assertIn("Instance %s is not being actively managed by " + "this compute host but has allocations " + "referencing this compute host", + mock_log_warning.call_args[0][0]) + + @mock.patch('nova.compute.resource_tracker.LOG.debug') + @mock.patch('nova.objects.Instance.get_by_uuid') + def test_remove_deleted_instances_allocations_state_transition_instance( + self, mock_inst_get, mock_log_debug): + """Tests the case that an instance is found with allocations for + this host/node but is not in the dict of tracked instances but the + instance.task_state is not None so we do not log a warning nor remove + allocations since we want to let the operation play out. + """ + instance = copy.deepcopy(_INSTANCE_FIXTURES[0]) + instance.task_state = task_states.SPAWNING + mock_inst_get.return_value = instance + rc = self.rt.reportclient + # No tracked instances on this node. + # But there is an allocation for an instance on this node. + allocs = report.ProviderAllocInfo( + allocations={ + instance.uuid: { + 'resources': { + 'VCPU': 1, + 'MEMORY_MB': 2048, + 'DISK_GB': 20 + } + } + } + ) + rc.get_allocations_for_resource_provider = mock.MagicMock( + return_value=allocs) + rc.delete_allocation_for_instance = mock.MagicMock() + cn = self.rt.compute_nodes[_NODENAME] + ctx = mock.MagicMock() + # Call the method. + self.rt._remove_deleted_instances_allocations( + ctx, cn, [], {}) + # We don't delete the allocation because the instance is on this host + # but is transitioning task states. + rc.delete_allocation_for_instance.assert_not_called() + # Assert the expected debug message was logged. + mock_log_debug.assert_called_once() + self.assertIn('Instance with task_state "%s" is not being ' + 'actively managed by this compute host but has ' + 'allocations referencing this compute node', + mock_log_debug.call_args[0][0]) def test_remove_deleted_instances_allocations_retrieval_fail(self): """When the report client errs or otherwise retrieves no allocations,