Browse Source

[placement] Retry allocation writes server side

This change adds a fast retry loop around
AllocationList._set_allocations if a resource provider generation
conflict happens. It turns out that under high concurrency of allocation
claims being made on the same resource provider conflicts can be quite
common and client side retries are insufficient.

Because both consumer generation and resource provider generation
conflicts raised the same exception, there was no way to distinguish
between the two, so a child of ConcurrentUpdateDetected has been created
as ResourceProviderConcurrentUpdateDetected. In the future this will allow
us to send different error codes to the client as well, but that change
is not done here.

When the conflict is detected, all the resource providers in the
AllocationList are reloaded and the list objects refreshed.

Logging is provided to indicate:

* at debug that a retry is going to happen
* at warning that all the retries failed and the client is going to
  see the conflict

The tests for this are a bit funky: Some mocks are used to cause the
conflicts, then the real actions after a couple of iterations.

This was backported from 72e4c4c8d7 with
conflicts because exceptions, tests and object files were moved to
new locations with Rocky and the AllocationList.create_all method was
renamed to the more accurate replace_all. Prior to Rocky there were
no Consumer objects and fewer helper methods in functional tests,
so the test is adjusted accordingly.

Change-Id: Id614d609fc8f3ed2d2ff29a2b52143f53b3b1b9a
Closes-Bug: #1719933
(cherry picked from commit 72e4c4c8d7)
tags/17.0.6
Chris Dent 1 year ago
parent
commit
66a47b7623
3 changed files with 138 additions and 4 deletions
  1. +5
    -0
      nova/exception.py
  2. +40
    -4
      nova/objects/resource_provider.py
  3. +93
    -0
      nova/tests/functional/db/test_resource_provider.py

+ 5
- 0
nova/exception.py View File

@@ -2122,6 +2122,11 @@ class ConcurrentUpdateDetected(NovaException):
"Please retry your update")


class ResourceProviderConcurrentUpdateDetected(ConcurrentUpdateDetected):
    """Generation conflict specific to a resource provider.

    Subclasses ConcurrentUpdateDetected so existing handlers keep
    working, while letting callers tell provider-generation conflicts
    apart from consumer-generation conflicts (e.g. for server-side
    retries or distinct client error codes).
    """
    msg_fmt = _("Another thread concurrently updated the resource "
                "provider data. Please retry your update")


class ResourceClassNotFound(NotFound):
    # Raised when a lookup by resource class name finds no match.
    msg_fmt = _("No such resource class %(resource_class)s.")


+ 40
- 4
nova/objects/resource_provider.py View File

@@ -279,7 +279,7 @@ def _increment_provider_generation(ctx, rp):

res = ctx.session.execute(upd_stmt)
if res.rowcount != 1:
raise exception.ConcurrentUpdateDetected
raise exception.ResourceProviderConcurrentUpdateDetected()
return new_generation


@@ -1992,6 +1992,10 @@ def _get_allocations_by_consumer_uuid(ctx, consumer_uuid):
@base.NovaObjectRegistry.register_if(False)
class AllocationList(base.ObjectListBase, base.NovaObject):

# The number of times to retry set_allocations if there has
# been a resource provider (not consumer) generation conflict.
RP_CONFLICT_RETRY_COUNT = 10

fields = {
'objects': fields.ListOfObjectsField('Allocation'),
}
@@ -2139,9 +2143,41 @@ class AllocationList(base.ObjectListBase, base.NovaObject):
returned to the caller, nor are their database ids set. If
those ids are required use one of the get_all_by* methods.
"""
# TODO(jaypipes): Retry the allocation writes on
# ConcurrentUpdateDetected
self._set_allocations(self._context, self.objects)
# Retry _set_allocations server side if there is a
# ResourceProviderConcurrentUpdateDetected. We don't care about
# sleeping, we simply want to reset the resource provider objects
# and try again. For sake of simplicity (and because we don't have
# easy access to the information) we reload all the resource
# providers that may be present.
retries = self.RP_CONFLICT_RETRY_COUNT
while retries:
retries -= 1
try:
self._set_allocations(self._context, self.objects)
break
except exception.ResourceProviderConcurrentUpdateDetected:
LOG.debug('Retrying allocations write on resource provider '
'generation conflict')
# We only want to reload each unique resource provider once.
alloc_rp_uuids = set(
alloc.resource_provider.uuid for alloc in self.objects)
seen_rps = {}
for rp_uuid in alloc_rp_uuids:
seen_rps[rp_uuid] = ResourceProvider.get_by_uuid(
self._context, rp_uuid)
for alloc in self.objects:
rp_uuid = alloc.resource_provider.uuid
alloc.resource_provider = seen_rps[rp_uuid]
else:
# We ran out of retries so we need to raise again.
# The log will automatically have request id info associated with
# it that will allow tracing back to specific allocations.
# Attempting to extract specific consumer or resource provider
# information from the allocations is not coherent as this
# could be multiple consumers and providers.
LOG.warning('Exceeded retry limit of %d on allocations write',
self.RP_CONFLICT_RETRY_COUNT)
raise exception.ResourceProviderConcurrentUpdateDetected()

def delete_all(self):
# Allocations can only have a single consumer, so take advantage of

+ 93
- 0
nova/tests/functional/db/test_resource_provider.py View File

@@ -11,6 +11,7 @@
# under the License.


import functools
import mock
import os_traits
from oslo_db import exception as db_exc
@@ -42,6 +43,15 @@ DISK_ALLOCATION = dict(
)


def add_inventory(rp, rc, total, **kwargs):
    """Create, default, and attach an Inventory record to a provider.

    :param rp: the ResourceProvider to receive the inventory
    :param rc: the resource class name for the inventory
    :param total: total amount of the inventory; also used as max_unit
        unless the caller overrides it
    :param kwargs: additional Inventory field values
    :returns: the created Inventory object
    """
    if 'max_unit' not in kwargs:
        kwargs['max_unit'] = total
    inventory = rp_obj.Inventory(
        rp._context, resource_provider=rp, resource_class=rc,
        total=total, **kwargs)
    inventory.obj_set_defaults()
    rp.add_inventory(inventory)
    return inventory


class ResourceProviderBaseCase(test.NoDBTestCase):

USES_DB_SELF = True
@@ -1866,6 +1876,89 @@ class TestAllocationListCreateDelete(ResourceProviderBaseCase):
self.ctx, migration_uuid)
self.assertEqual(0, len(allocations))

@mock.patch('nova.objects.resource_provider.LOG')
def test_set_allocations_retry(self, mock_log):
    """Test server side allocation write retry handling."""

    # Create a single resource provider and give it some inventory.
    rp1 = rp_obj.ResourceProvider(
        self.ctx, name='rp1', uuid=uuidsentinel.rp1)
    rp1.create()
    add_inventory(rp1, fields.ResourceClass.VCPU, 24,
                  allocation_ratio=16.0)
    add_inventory(rp1, fields.ResourceClass.MEMORY_MB, 1024,
                  min_unit=64,
                  max_unit=1024,
                  step_size=64)
    original_generation = rp1.generation
    # Verify the generation is what we expect (we'll be checking again
    # later).
    self.assertEqual(2, original_generation)

    inst_consumer = uuidsentinel.instance

    alloc_list = rp_obj.AllocationList(
        context=self.ctx,
        objects=[
            rp_obj.Allocation(
                context=self.ctx,
                consumer_id=inst_consumer,
                resource_provider=rp1,
                resource_class=fields.ResourceClass.VCPU,
                used=12),
            rp_obj.Allocation(
                context=self.ctx,
                consumer_id=inst_consumer,
                resource_provider=rp1,
                resource_class=fields.ResourceClass.MEMORY_MB,
                used=1024)
        ])

    # Make sure the right exception happens when the retry loop expires.
    with mock.patch.object(rp_obj.AllocationList,
                           'RP_CONFLICT_RETRY_COUNT', 0):
        self.assertRaises(
            exception.ResourceProviderConcurrentUpdateDetected,
            alloc_list.create_all)
        mock_log.warning.assert_called_with(
            'Exceeded retry limit of %d on allocations write', 0)

    # Make sure the right thing happens after a small number of failures.
    # There's a bit of mock magic going on here to ensure that we can
    # both do some side effects on _set_allocations as well as have the
    # real behavior. Two generation conflicts and then a success.
    mock_log.reset_mock()
    with mock.patch.object(rp_obj.AllocationList,
                           'RP_CONFLICT_RETRY_COUNT', 3):
        unmocked_set = functools.partial(
            rp_obj.AllocationList._set_allocations, alloc_list)
        with mock.patch(
                'nova.objects.resource_provider.'
                'AllocationList._set_allocations') as mock_set:
            exceptions = iter([
                exception.ResourceProviderConcurrentUpdateDetected(),
                exception.ResourceProviderConcurrentUpdateDetected(),
            ])

            def side_effect(*args, **kwargs):
                # Raise the canned conflicts first, then fall through
                # to the real implementation once they are exhausted.
                try:
                    raise next(exceptions)
                except StopIteration:
                    return unmocked_set(*args, **kwargs)

            mock_set.side_effect = side_effect
            alloc_list.create_all()
            self.assertEqual(2, mock_log.debug.call_count)
            # NOTE: must be assert_called_with, not called_with; the
            # bare called_with form is plain Mock attribute access and
            # silently asserts nothing.
            mock_log.debug.assert_called_with(
                'Retrying allocations write on resource provider '
                'generation conflict')
            self.assertEqual(3, mock_set.call_count)

    # Confirm we're using a different rp object after the change
    # and that it has a higher generation.
    new_rp = alloc_list[0].resource_provider
    self.assertEqual(original_generation, rp1.generation)
    self.assertEqual(original_generation + 1, new_rp.generation)


class UsageListTestCase(ResourceProviderBaseCase):


Loading…
Cancel
Save