Retry /reshape at provider generation conflict

During a normal update_available_resources run, if the local provider
tree cache is invalid (e.g. because the scheduler made an allocation
that bumped the generation of the RPs) and the virt driver tries to
update the inventory of an RP based on that stale cache, Placement
will report a conflict. The report client then invalidates the cache
and the retry decorator on ResourceTracker._update_to_placement
re-drives the update on top of the fresh RP data.

However, the same thing can happen during a reshape, but the retry
mechanism is missing from that code path, so a stale cache can cause
reshape failures.

This patch adds specific error handling to the reshape code path to
implement the same retry mechanism that already exists for the
inventory update.
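
The retry is driven by the retrying decorator: the report client
invalidates its provider tree cache before raising the conflict, so the
decorated method runs again against fresh data. A minimal standalone
sketch of that pattern, using simplified stand-in names rather than the
real Nova classes:

    # Minimal sketch of the retry pattern; ProviderConflict, sync() and
    # update_to_placement() are simplified stand-ins, not the real Nova
    # names.
    import retrying


    class ProviderConflict(Exception):
        """Stand-in for exception.ResourceProviderUpdateConflict."""


    @retrying.retry(
        stop_max_attempt_number=4,
        retry_on_exception=lambda e: isinstance(e, ProviderConflict))
    def update_to_placement(report_client):
        # A 409 from Placement is assumed to have already invalidated the
        # client's provider tree cache, so each retried call starts from
        # freshly fetched RP data.
        report_client.sync()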

blueprint: pci-device-tracking-in-placement
Change-Id: Ieb954a04e6aba827611765f7f401124a1fe298f3
Balazs Gibizer 2022-07-28 10:59:48 +02:00
parent bdec962fd6
commit 48229b46b4
5 changed files with 97 additions and 11 deletions

@@ -1195,9 +1195,16 @@ class ResourceTracker(object):
return list(traits)
@retrying.retry(stop_max_attempt_number=4,
retry_on_exception=lambda e: isinstance(
e, exception.ResourceProviderUpdateConflict))
@retrying.retry(
stop_max_attempt_number=4,
retry_on_exception=lambda e: isinstance(
e,
(
exception.ResourceProviderUpdateConflict,
exception.PlacementReshapeConflict,
),
),
)
def _update_to_placement(self, context, compute_node, startup):
"""Send resource and inventory changes to placement."""
# NOTE(jianghuaw): Some resources(e.g. VGPU) are not saved in the

@@ -2077,6 +2077,16 @@ class ResourceProviderUpdateConflict(PlacementAPIConflict):
"provider %(uuid)s (generation %(generation)d): %(error)s")
class PlacementReshapeConflict(PlacementAPIConflict):
"""A 409 caused by generation mismatch from attempting to reshape a
provider tree.
"""
msg_fmt = _(
"A conflict was encountered attempting to reshape a provider tree: "
"$(error)s"
)
class InvalidResourceClass(Invalid):
msg_fmt = _("Resource class '%(resource_class)s' invalid.")

@@ -1277,6 +1277,11 @@ class SchedulerReportClient(object):
resp = self.post('/reshaper', payload, version=RESHAPER_VERSION,
global_request_id=context.global_id)
if not resp:
if resp.status_code == 409:
err = resp.json()['errors'][0]
if err['code'] == 'placement.concurrent_update':
raise exception.PlacementReshapeConflict(error=resp.text)
raise exception.ReshapeFailed(error=resp.text)
return resp
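
For reference, a 409 from the /reshaper endpoint carries a JSON list of
errors; an illustrative body of the kind the new check inspects (only
the error code comes from the patch, the other fields are made up):

    # Illustrative 409 body from the /reshaper endpoint; only
    # errors[0]['code'] is examined by the handling above, the other
    # fields are made-up examples.
    conflict_body = {
        'errors': [{
            'code': 'placement.concurrent_update',
            'title': 'Conflict',
            'detail': 'Resource provider generation does not match.',
        }],
    }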
@@ -1310,7 +1315,7 @@ class SchedulerReportClient(object):
# failure here to be fatal to the caller.
try:
self._reshape(context, inventories, allocations)
except exception.ReshapeFailed:
except (exception.ReshapeFailed, exception.PlacementReshapeConflict):
raise
except Exception as e:
# Make sure the original stack trace gets logged.
@@ -1429,8 +1434,16 @@ class SchedulerReportClient(object):
if allocations is not None:
# NOTE(efried): We do not catch_all here, because ReshapeFailed
# needs to bubble up right away and be handled specially.
self._set_up_and_do_reshape(context, old_tree, new_tree,
allocations)
try:
self._set_up_and_do_reshape(
context, old_tree, new_tree, allocations)
except exception.PlacementReshapeConflict:
# The conflict means we need to invalidate the local caches and
# let the retry mechanism in _update_to_placement re-drive the
# reshape on top of the fresh data
with excutils.save_and_reraise_exception():
self.clear_provider_cache()
# The reshape updated provider generations, so the ones we have in
# the cache are now stale. The inventory update below will short
# out, but we would still bounce with a provider generation
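
clear_provider_cache runs inside save_and_reraise_exception from
oslo.utils, which re-raises the original exception when the block
exits, so the conflict still propagates up to the retry decorator. A
simplified standalone sketch of that pattern (client and do_reshape()
are placeholders, and a generic exception is caught here):

    # Simplified sketch of the oslo.utils pattern used above; client and
    # do_reshape() are placeholders, and a generic exception is caught.
    from oslo_utils import excutils


    def reshape_with_cache_reset(client):
        try:
            client.do_reshape()
        except Exception:
            # Re-raise the original exception after the cache reset so
            # the conflict still reaches the retry decorator upstream.
            with excutils.save_and_reraise_exception():
                client.clear_provider_cache()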

@@ -1363,6 +1363,17 @@ class SchedulerReportClientTests(test.TestCase):
resp = self.client._reshape(self.context, inventories, allocs)
self.assertEqual(204, resp.status_code)
# Trigger generation conflict
# We can do this by simply sending back the same reshape, as that
# will not work because the previous reshape updated the generations
self.assertRaises(
exception.PlacementReshapeConflict,
self.client._reshape,
self.context,
inventories,
allocs,
)
def test_update_from_provider_tree_reshape(self):
"""Run update_from_provider_tree with reshaping."""
exp_ptree = self._set_up_provider_tree()
@@ -1519,3 +1530,44 @@ class SchedulerReportClientTests(test.TestCase):
self.context, self.compute_name)
self.assertProviderTree(orig_exp_ptree, ptree)
self.assertAllocations(orig_exp_allocs, allocs)
def test_update_from_provider_tree_reshape_conflict_retry(self):
exp_ptree = self._set_up_provider_tree()
ptree = self.client.get_provider_tree_and_ensure_root(
self.context, self.compute_uuid)
allocs = self.client.get_allocations_for_provider_tree(
self.context, self.compute_name)
self.assertProviderTree(exp_ptree, ptree)
self.assertAllocations({}, allocs)
exp_allocs = self._set_up_provider_tree_allocs()
# we prepare inventory and allocation changes to trigger a reshape
for rp_uuid in ptree.get_provider_uuids():
# Add a new resource class to the inventories
ptree.update_inventory(
rp_uuid, dict(ptree.data(rp_uuid).inventory,
CUSTOM_FOO={'total': 10}))
exp_ptree[rp_uuid]['inventory']['CUSTOM_FOO'] = {'total': 10}
for c_uuid, alloc in allocs.items():
for rp_uuid, res in alloc['allocations'].items():
res['resources']['CUSTOM_FOO'] = 1
exp_allocs[c_uuid]['allocations'][rp_uuid][
'resources']['CUSTOM_FOO'] = 1
# As the inventory update happens in the same request as the allocation
# update the allocation update will have a generation conflict.
# So we expect that it is signalled with an exception so that the
# upper layer can re-drive the reshape process with a fresh tree that
# now has the inventories
self.assertRaises(
exception.PlacementReshapeConflict,
self.client.update_from_provider_tree,
self.context,
ptree,
allocations=allocs,
)
# We also expect that the internal cache is cleared so that the
# re-drive will have a chance to load fresh data from placement
self.assertEqual(0, len(self.client._provider_tree.roots))

@@ -1710,12 +1710,18 @@ class TestUpdateComputeNode(BaseTestCase):
self.assertEqual(exp_inv, ptree.data(new_compute.uuid).inventory)
mock_sync_disabled.assert_called_once()
@ddt.data(
exc.ResourceProviderUpdateConflict(
uuid='uuid', generation=42, error='error'),
exc.PlacementReshapeConflict(error='error'),
)
@mock.patch('nova.compute.resource_tracker.ResourceTracker.'
'_sync_compute_service_disabled_trait')
@mock.patch('nova.compute.resource_tracker.ResourceTracker.'
'_resource_change', return_value=False)
def test_update_retry_success(self, mock_resource_change,
mock_sync_disabled):
def test_update_retry_success(
self, exc, mock_resource_change, mock_sync_disabled
):
self._setup_rt()
orig_compute = _COMPUTE_NODE_FIXTURES[0].obj_clone()
self.rt.compute_nodes[_NODENAME] = orig_compute
@@ -1729,9 +1735,7 @@ class TestUpdateComputeNode(BaseTestCase):
self.driver_mock.update_provider_tree.side_effect = lambda *a: None
ufpt_mock = self.rt.reportclient.update_from_provider_tree
ufpt_mock.side_effect = (
exc.ResourceProviderUpdateConflict(
uuid='uuid', generation=42, error='error'), None)
ufpt_mock.side_effect = (exc, None)
self.rt._update(mock.sentinel.ctx, new_compute)
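
The parametrization uses the ddt library: each value listed in
@ddt.data is passed to the test method as a positional argument,
provided the test class carries the @ddt.ddt class decorator. A minimal
standalone illustration of that mechanism (a hypothetical test, not
from this change):

    # Minimal illustration of the ddt mechanism; this is a hypothetical
    # test, not part of the change.
    import unittest

    import ddt


    @ddt.ddt
    class TestRetryableErrors(unittest.TestCase):

        @ddt.data(ValueError('a'), KeyError('b'))
        def test_each_value_is_an_exception(self, exc):
            # ddt generates one test per listed value, binding it to 'exc'.
            self.assertIsInstance(exc, Exception)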