diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 8f2fdd2aa528..15b2735e15ec 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -1195,9 +1195,16 @@ class ResourceTracker(object): return list(traits) - @retrying.retry(stop_max_attempt_number=4, - retry_on_exception=lambda e: isinstance( - e, exception.ResourceProviderUpdateConflict)) + @retrying.retry( + stop_max_attempt_number=4, + retry_on_exception=lambda e: isinstance( + e, + ( + exception.ResourceProviderUpdateConflict, + exception.PlacementReshapeConflict, + ), + ), + ) def _update_to_placement(self, context, compute_node, startup): """Send resource and inventory changes to placement.""" # NOTE(jianghuaw): Some resources(e.g. VGPU) are not saved in the diff --git a/nova/exception.py b/nova/exception.py index b6a900144f52..fe3c12d87072 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -2088,6 +2088,16 @@ class ResourceProviderUpdateConflict(PlacementAPIConflict): "provider %(uuid)s (generation %(generation)d): %(error)s") +class PlacementReshapeConflict(PlacementAPIConflict): + """A 409 caused by generation mismatch from attempting to reshape a + provider tree. + """ + msg_fmt = _( + "A conflict was encountered attempting to reshape a provider tree: " + "$(error)s" + ) + + class InvalidResourceClass(Invalid): msg_fmt = _("Resource class '%(resource_class)s' invalid.") diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py index ff86527cf55c..cb04711bd0b1 100644 --- a/nova/scheduler/client/report.py +++ b/nova/scheduler/client/report.py @@ -1277,6 +1277,11 @@ class SchedulerReportClient(object): resp = self.post('/reshaper', payload, version=RESHAPER_VERSION, global_request_id=context.global_id) if not resp: + if resp.status_code == 409: + err = resp.json()['errors'][0] + if err['code'] == 'placement.concurrent_update': + raise exception.PlacementReshapeConflict(error=resp.text) + raise exception.ReshapeFailed(error=resp.text) return resp @@ -1310,7 +1315,7 @@ class SchedulerReportClient(object): # failure here to be fatal to the caller. try: self._reshape(context, inventories, allocations) - except exception.ReshapeFailed: + except (exception.ReshapeFailed, exception.PlacementReshapeConflict): raise except Exception as e: # Make sure the original stack trace gets logged. @@ -1429,8 +1434,16 @@ class SchedulerReportClient(object): if allocations is not None: # NOTE(efried): We do not catch_all here, because ReshapeFailed # needs to bubble up right away and be handled specially. - self._set_up_and_do_reshape(context, old_tree, new_tree, - allocations) + try: + self._set_up_and_do_reshape( + context, old_tree, new_tree, allocations) + except exception.PlacementReshapeConflict: + # The conflict means we need to invalidate the local caches and + # let the retry mechanism in _update_to_placement to re-drive + # the reshape top of the fresh data + with excutils.save_and_reraise_exception(): + self.clear_provider_cache() + # The reshape updated provider generations, so the ones we have in # the cache are now stale. The inventory update below will short # out, but we would still bounce with a provider generation diff --git a/nova/tests/functional/test_report_client.py b/nova/tests/functional/test_report_client.py index ba4b729b8734..a5da9f87b46c 100644 --- a/nova/tests/functional/test_report_client.py +++ b/nova/tests/functional/test_report_client.py @@ -1363,6 +1363,17 @@ class SchedulerReportClientTests(test.TestCase): resp = self.client._reshape(self.context, inventories, allocs) self.assertEqual(204, resp.status_code) + # Trigger generation conflict + # We can do this is by simply sending back the same reshape as that + # will not work because the previous reshape updated generations + self.assertRaises( + exception.PlacementReshapeConflict, + self.client._reshape, + self.context, + inventories, + allocs, + ) + def test_update_from_provider_tree_reshape(self): """Run update_from_provider_tree with reshaping.""" exp_ptree = self._set_up_provider_tree() @@ -1519,3 +1530,44 @@ class SchedulerReportClientTests(test.TestCase): self.context, self.compute_name) self.assertProviderTree(orig_exp_ptree, ptree) self.assertAllocations(orig_exp_allocs, allocs) + + def test_update_from_provider_tree_reshape_conflict_retry(self): + exp_ptree = self._set_up_provider_tree() + + ptree = self.client.get_provider_tree_and_ensure_root( + self.context, self.compute_uuid) + allocs = self.client.get_allocations_for_provider_tree( + self.context, self.compute_name) + self.assertProviderTree(exp_ptree, ptree) + self.assertAllocations({}, allocs) + + exp_allocs = self._set_up_provider_tree_allocs() + + # we prepare inventory and allocation changes to trigger a reshape + for rp_uuid in ptree.get_provider_uuids(): + # Add a new resource class to the inventories + ptree.update_inventory( + rp_uuid, dict(ptree.data(rp_uuid).inventory, + CUSTOM_FOO={'total': 10})) + exp_ptree[rp_uuid]['inventory']['CUSTOM_FOO'] = {'total': 10} + for c_uuid, alloc in allocs.items(): + for rp_uuid, res in alloc['allocations'].items(): + res['resources']['CUSTOM_FOO'] = 1 + exp_allocs[c_uuid]['allocations'][rp_uuid][ + 'resources']['CUSTOM_FOO'] = 1 + + # As the inventory update happens is the same request as the allocation + # update the allocation update will have a generation conflict. + # So we expect that it is signalled with an exception so that the + # upper layer can re-drive the reshape process with a fresh tree that + # now has the inventories + self.assertRaises( + exception.PlacementReshapeConflict, + self.client.update_from_provider_tree, + self.context, + ptree, + allocations=allocs, + ) + # also we except that the internal caches is cleared so that the + # re-drive will have a chance to load fresh data from placement + self.assertEqual(0, len(self.client._provider_tree.roots)) diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index f64c9719efa6..2e7b04fbb576 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -1710,12 +1710,18 @@ class TestUpdateComputeNode(BaseTestCase): self.assertEqual(exp_inv, ptree.data(new_compute.uuid).inventory) mock_sync_disabled.assert_called_once() + @ddt.data( + exc.ResourceProviderUpdateConflict( + uuid='uuid', generation=42, error='error'), + exc.PlacementReshapeConflict(error='error'), + ) @mock.patch('nova.compute.resource_tracker.ResourceTracker.' '_sync_compute_service_disabled_trait') @mock.patch('nova.compute.resource_tracker.ResourceTracker.' '_resource_change', return_value=False) - def test_update_retry_success(self, mock_resource_change, - mock_sync_disabled): + def test_update_retry_success( + self, exc, mock_resource_change, mock_sync_disabled + ): self._setup_rt() orig_compute = _COMPUTE_NODE_FIXTURES[0].obj_clone() self.rt.compute_nodes[_NODENAME] = orig_compute @@ -1729,9 +1735,7 @@ class TestUpdateComputeNode(BaseTestCase): self.driver_mock.update_provider_tree.side_effect = lambda *a: None ufpt_mock = self.rt.reportclient.update_from_provider_tree - ufpt_mock.side_effect = ( - exc.ResourceProviderUpdateConflict( - uuid='uuid', generation=42, error='error'), None) + ufpt_mock.side_effect = (exc, None) self.rt._update(mock.sentinel.ctx, new_compute)