[placement] Retry allocation writes server side

This change adds a fast retry loop around
AllocationList._set_allocations when a resource provider generation
conflict happens. It turns out that under high concurrency of
allocation claims against the same resource provider, conflicts can be
quite common and client-side retries are insufficient.
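
In outline, the server side loop has the following shape (a minimal,
self-contained sketch with placeholder names; the real code is
AllocationList.replace_all in the diff below):

# Minimal sketch of the retry shape this change adds. The names here are
# placeholders, not the placement code.

class ProviderGenerationConflict(Exception):
    """Stand-in for ResourceProviderConcurrentUpdateDetected."""

RP_CONFLICT_RETRY_COUNT = 10

def replace_all(write_allocations, refresh_providers):
    retries = RP_CONFLICT_RETRY_COUNT
    while retries:
        retries -= 1
        try:
            write_allocations()   # may raise on a provider generation bump
            break
        except ProviderGenerationConflict:
            # No sleeping: reload the providers so their generations are
            # current, then immediately try the write again.
            refresh_providers()
    else:
        # The loop never hit break: give up and surface the conflict.
        raise ProviderGenerationConflict()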

Because consumer generation conflicts and resource provider generation
conflicts raised the same exception, there was no way to distinguish
between the two, so a child of ConcurrentUpdateDetected has been
created: ResourceProviderConcurrentUpdateDetected. In the future this
will also allow us to send different error codes to the client, but
that change is not made here.
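
The subclass is what lets a caller treat the two conflicts differently
while existing handlers for ConcurrentUpdateDetected continue to catch
both. A self-contained sketch with stand-in classes (placement's real
exceptions also carry a msg_fmt):

# Stand-in classes showing only the subclass relationship.

class ConcurrentUpdateDetected(Exception):
    """Consumer generation conflict (and, before this change, provider
    generation conflicts as well)."""

class ResourceProviderConcurrentUpdateDetected(ConcurrentUpdateDetected):
    """Specifically a resource provider generation conflict."""

def handle(exc):
    try:
        raise exc
    except ResourceProviderConcurrentUpdateDetected:
        return 'retry the write server side'
    except ConcurrentUpdateDetected:
        return 'report the conflict to the client'

assert handle(
    ResourceProviderConcurrentUpdateDetected()) == 'retry the write server side'
assert handle(
    ConcurrentUpdateDetected()) == 'report the conflict to the client'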

When the conflict is detected, all the resource providers in the
AllocationList are reloaded and the Allocation objects in the list are
refreshed to point at the reloaded providers.

Logging is provided to indicate:

* at debug that a retry is going to happen
* at warning that all the retries failed and the client is going to
  see the conflict

The tests for this are a bit funky: mocks are used to cause the
conflicts for the first couple of iterations, after which the real
_set_allocations runs.
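
Distilled, the mock technique is a side_effect that raises the conflict
a fixed number of times and then delegates to the real implementation.
The standalone sketch below uses local stand-ins; the real test patches
AllocationList._set_allocations and binds the unmocked method with
functools.partial.

# Standalone sketch of the mock technique: fail twice, then behave normally.
from unittest import mock

class Conflict(Exception):
    pass

def real_write():
    return 'written'

failures = iter([Conflict(), Conflict()])

def side_effect(*args, **kwargs):
    try:
        raise next(failures)
    except StopIteration:
        # Conflicts exhausted: fall through to the real behaviour.
        return real_write(*args, **kwargs)

fake_write = mock.Mock(side_effect=side_effect)

# The first two calls raise; the third succeeds via the real function.
for _ in range(2):
    try:
        fake_write()
        raise AssertionError('expected a Conflict')
    except Conflict:
        pass

assert fake_write() == 'written'
assert fake_write.call_count == 3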

Change-Id: Id614d609fc8f3ed2d2ff29a2b52143f53b3b1b9a
Closes-Bug: #1719933
Chris Dent 2018-07-26 12:00:14 +01:00
parent 04b83e7013
commit 72e4c4c8d7
3 changed files with 131 additions and 4 deletions

nova/api/openstack/placement/exception.py

@@ -85,6 +85,11 @@ class ConcurrentUpdateDetected(_BaseException):
                 "Please retry your update")
 
 
+class ResourceProviderConcurrentUpdateDetected(ConcurrentUpdateDetected):
+    msg_fmt = _("Another thread concurrently updated the resource provider "
+                "data. Please retry your update")
+
+
 class InvalidAllocationCapacityExceeded(InvalidInventory):
     msg_fmt = _("Unable to create allocation for '%(resource_class)s' on "
                 "resource provider '%(resource_provider)s'. The requested "

nova/api/openstack/placement/objects/resource_provider.py

@@ -278,7 +278,7 @@ def _increment_provider_generation(ctx, rp):
     res = ctx.session.execute(upd_stmt)
     if res.rowcount != 1:
-        raise exception.ConcurrentUpdateDetected
+        raise exception.ResourceProviderConcurrentUpdateDetected()
     return new_generation
@@ -1924,6 +1924,10 @@ def _create_incomplete_consumer(ctx, consumer_id):
 @base.VersionedObjectRegistry.register_if(False)
 class AllocationList(base.ObjectListBase, base.VersionedObject):
+    # The number of times to retry set_allocations if there has
+    # been a resource provider (not consumer) generation conflict.
+    RP_CONFLICT_RETRY_COUNT = 10
+
     fields = {
         'objects': fields.ListOfObjectsField('Allocation'),
     }
@@ -2106,9 +2110,41 @@ class AllocationList(base.ObjectListBase, base.VersionedObject):
         so, it will end up setting the Allocation.id attribute of each
         Allocation object.
         """
-        # TODO(jaypipes): Retry the allocation writes on
-        # ConcurrentUpdateDetected
-        self._set_allocations(self._context, self.objects)
+        # Retry _set_allocations server side if there is a
+        # ResourceProviderConcurrentUpdateDetected. We don't care about
+        # sleeping, we simply want to reset the resource provider objects
+        # and try again. For sake of simplicity (and because we don't have
+        # easy access to the information) we reload all the resource
+        # providers that may be present.
+        retries = self.RP_CONFLICT_RETRY_COUNT
+        while retries:
+            retries -= 1
+            try:
+                self._set_allocations(self._context, self.objects)
+                break
+            except exception.ResourceProviderConcurrentUpdateDetected:
+                LOG.debug('Retrying allocations write on resource provider '
+                          'generation conflict')
+                # We only want to reload each unique resource provider once.
+                alloc_rp_uuids = set(
+                    alloc.resource_provider.uuid for alloc in self.objects)
+                seen_rps = {}
+                for rp_uuid in alloc_rp_uuids:
+                    seen_rps[rp_uuid] = ResourceProvider.get_by_uuid(
+                        self._context, rp_uuid)
+                for alloc in self.objects:
+                    rp_uuid = alloc.resource_provider.uuid
+                    alloc.resource_provider = seen_rps[rp_uuid]
+        else:
+            # We ran out of retries so we need to raise again.
+            # The log will automatically have request id info associated with
+            # it that will allow tracing back to specific allocations.
+            # Attempting to extract specific consumer or resource provider
+            # information from the allocations is not coherent as this
+            # could be multiple consumers and providers.
+            LOG.warning('Exceeded retry limit of %d on allocations write',
+                        self.RP_CONFLICT_RETRY_COUNT)
+            raise exception.ResourceProviderConcurrentUpdateDetected()
 
     def delete_all(self):
         consumer_uuids = set(alloc.consumer.uuid for alloc in self.objects)

nova/tests/functional/api/openstack/placement/db/test_resource_provider.py

@@ -11,6 +11,7 @@
 #    under the License.
 
+import functools
 import mock
 import os_traits
 from oslo_db import exception as db_exc
@@ -1701,6 +1702,91 @@ class TestAllocationListCreateDelete(tb.PlacementDbBaseTestCase):
             self.ctx, empty_rp)
         self.assertEqual(0, len(allocations))
 
+    @mock.patch('nova.api.openstack.placement.objects.resource_provider.LOG')
+    def test_set_allocations_retry(self, mock_log):
+        """Test server side allocation write retry handling."""
+
+        # Create a single resource provider and give it some inventory.
+        rp1 = self._create_provider('rp1')
+        tb.add_inventory(rp1, fields.ResourceClass.VCPU, 24,
+                         allocation_ratio=16.0)
+        tb.add_inventory(rp1, fields.ResourceClass.MEMORY_MB, 1024,
+                         min_unit=64,
+                         max_unit=1024,
+                         step_size=64)
+        original_generation = rp1.generation
+        # Verify the generation is what we expect (we'll be checking again
+        # later).
+        self.assertEqual(2, original_generation)
+
+        # Create a consumer and have it make an allocation.
+        inst_consumer = consumer_obj.Consumer(
+            self.ctx, uuid=uuidsentinel.instance, user=self.user_obj,
+            project=self.project_obj)
+        inst_consumer.create()
+
+        alloc_list = rp_obj.AllocationList(context=self.ctx,
+            objects=[
+                rp_obj.Allocation(
+                    context=self.ctx,
+                    consumer=inst_consumer,
+                    resource_provider=rp1,
+                    resource_class=fields.ResourceClass.VCPU,
+                    used=12),
+                rp_obj.Allocation(
+                    context=self.ctx,
+                    consumer=inst_consumer,
+                    resource_provider=rp1,
+                    resource_class=fields.ResourceClass.MEMORY_MB,
+                    used=1024)
+            ])
+
+        # Make sure the right exception happens when the retry loop expires.
+        with mock.patch.object(rp_obj.AllocationList,
+                               'RP_CONFLICT_RETRY_COUNT', 0):
+            self.assertRaises(
+                exception.ResourceProviderConcurrentUpdateDetected,
+                alloc_list.replace_all)
+            mock_log.warning.assert_called_with(
+                'Exceeded retry limit of %d on allocations write', 0)
+
+        # Make sure the right thing happens after a small number of failures.
+        # There's a bit of mock magic going on here to ensure that we can
+        # both do some side effects on _set_allocations as well as have the
+        # real behavior. Two generation conflicts and then a success.
+        mock_log.reset_mock()
+        with mock.patch.object(rp_obj.AllocationList,
+                               'RP_CONFLICT_RETRY_COUNT', 3):
+            unmocked_set = functools.partial(
+                rp_obj.AllocationList._set_allocations, alloc_list)
+            with mock.patch(
+                    'nova.api.openstack.placement.objects.resource_provider.'
+                    'AllocationList._set_allocations') as mock_set:
+                exceptions = iter([
+                    exception.ResourceProviderConcurrentUpdateDetected(),
+                    exception.ResourceProviderConcurrentUpdateDetected(),
+                ])
+
+                def side_effect(*args, **kwargs):
+                    try:
+                        raise next(exceptions)
+                    except StopIteration:
+                        return unmocked_set(*args, **kwargs)
+
+                mock_set.side_effect = side_effect
+                alloc_list.replace_all()
+                self.assertEqual(2, mock_log.debug.call_count)
+                mock_log.debug.assert_called_with(
+                    'Retrying allocations write on resource provider '
+                    'generation conflict')
+                self.assertEqual(3, mock_set.call_count)
+
+        # Confirm we're using a different rp object after the change
+        # and that it has a higher generation.
+        new_rp = alloc_list[0].resource_provider
+        self.assertEqual(original_generation, rp1.generation)
+        self.assertEqual(original_generation + 1, new_rp.generation)
+
 
 class UsageListTestCase(tb.PlacementDbBaseTestCase):