Merge "Add allocation_conflict_retry_count conf setting"

Zuul 2019-11-14 21:07:05 +00:00 committed by Gerrit Code Review
commit b87073c991
5 changed files with 61 additions and 32 deletions

View File

@@ -72,6 +72,14 @@ a project or user identifier for the consumer. In cleaning up the data
modeling, we no longer allow missing project and user information. If an older
client makes an allocation, we'll use this in place of the information it
doesn't provide.
"""),
    cfg.IntOpt(
        'allocation_conflict_retry_count',
        default=10,
        help="""
The number of times to retry, server-side, writing allocations when there is
a resource provider generation conflict. Raising this value may be useful
when many concurrent allocations to the same resource provider are expected.
"""),
]
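A deployment that routinely sees heavy concurrent allocation traffic against the same resource provider can raise this limit in placement.conf. A minimal illustration (the value 20 is arbitrary; the shipped default is 10):

[placement]
allocation_conflict_retry_count = 20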

View File

@@ -35,10 +35,6 @@ _USER_TBL = models.User.__table__
LOG = logging.getLogger(__name__)
# The number of times to retry set_allocations if there has
# been a resource provider (not consumer) generation conflict.
RP_CONFLICT_RETRY_COUNT = 10
class Allocation(object):
@@ -499,7 +495,7 @@ def replace_all(context, alloc_list):
# and try again. For sake of simplicity (and because we don't have
# easy access to the information) we reload all the resource
# providers that may be present.
retries = RP_CONFLICT_RETRY_COUNT
retries = context.config.placement.allocation_conflict_retry_count
while retries:
    retries -= 1
    try:
@@ -526,7 +522,7 @@ def replace_all(context, alloc_list):
# information from the allocations is not coherent as this
# could be multiple consumers and providers.
LOG.warning('Exceeded retry limit of %d on allocations write',
            RP_CONFLICT_RETRY_COUNT)
            context.config.placement.allocation_conflict_retry_count)
raise exception.ResourceProviderConcurrentUpdateDetected()
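The surrounding logic is a simple bounded retry: decrement a counter, attempt the write, and on a provider generation conflict try again until the counter is exhausted, at which point the warning above is logged and the conflict exception is raised to the caller. A rough, self-contained sketch of that shape, using generic names rather than the actual placement internals:

import logging

LOG = logging.getLogger(__name__)


class GenerationConflict(Exception):
    """Stand-in for ResourceProviderConcurrentUpdateDetected."""


def write_with_retries(write_once, retry_count):
    # Attempt the write until it succeeds or the retry budget is spent.
    retries = retry_count
    while retries:
        retries -= 1
        try:
            return write_once()
        except GenerationConflict:
            LOG.debug('Retrying allocations write on resource provider '
                      'generation conflict')
    LOG.warning('Exceeded retry limit of %d on allocations write',
                retry_count)
    raise GenerationConflict()

A retry count of 0 means the loop body never runs at all, which is what the first test case below exploits to force the conflict exception immediately.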

View File

@@ -624,40 +624,42 @@ class TestAllocationListCreateDelete(tb.PlacementDbBaseTestCase):
]
# Make sure the right exception happens when the retry loop expires.
with mock.patch.object(alloc_obj, 'RP_CONFLICT_RETRY_COUNT', 0):
    self.assertRaises(
        exception.ResourceProviderConcurrentUpdateDetected,
        alloc_obj.replace_all, self.ctx, alloc_list)
    mock_log.warning.assert_called_with(
        'Exceeded retry limit of %d on allocations write', 0)
self.conf_fixture.config(allocation_conflict_retry_count=0,
                         group='placement')
self.assertRaises(
    exception.ResourceProviderConcurrentUpdateDetected,
    alloc_obj.replace_all, self.ctx, alloc_list)
mock_log.warning.assert_called_with(
    'Exceeded retry limit of %d on allocations write', 0)
# Make sure the right thing happens after a small number of failures.
# There's a bit of mock magic going on here to ensure that we can
# both do some side effects on _set_allocations as well as have the
# real behavior. Two generation conflicts and then a success.
mock_log.reset_mock()
with mock.patch.object(alloc_obj, 'RP_CONFLICT_RETRY_COUNT', 3):
    unmocked_set = alloc_obj._set_allocations
    with mock.patch('placement.objects.allocation.'
                    '_set_allocations') as mock_set:
        exceptions = iter([
            exception.ResourceProviderConcurrentUpdateDetected(),
            exception.ResourceProviderConcurrentUpdateDetected(),
        ])
self.conf_fixture.config(allocation_conflict_retry_count=3,
                         group='placement')
unmocked_set = alloc_obj._set_allocations
with mock.patch('placement.objects.allocation.'
                '_set_allocations') as mock_set:
    exceptions = iter([
        exception.ResourceProviderConcurrentUpdateDetected(),
        exception.ResourceProviderConcurrentUpdateDetected(),
    ])
        def side_effect(*args, **kwargs):
            try:
                raise next(exceptions)
            except StopIteration:
                return unmocked_set(*args, **kwargs)
    def side_effect(*args, **kwargs):
        try:
            raise next(exceptions)
        except StopIteration:
            return unmocked_set(*args, **kwargs)
        mock_set.side_effect = side_effect
        alloc_obj.replace_all(self.ctx, alloc_list)
        self.assertEqual(2, mock_log.debug.call_count)
        mock_log.debug.called_with(
            'Retrying allocations write on resource provider '
            'generation conflict')
        self.assertEqual(3, mock_set.call_count)
    mock_set.side_effect = side_effect
    alloc_obj.replace_all(self.ctx, alloc_list)
    self.assertEqual(2, mock_log.debug.call_count)
    mock_log.debug.called_with(
        'Retrying allocations write on resource provider '
        'generation conflict')
    self.assertEqual(3, mock_set.call_count)
# Confirm we're using a different rp object after the change
# and that it has a higher generation.

View File

@@ -79,6 +79,11 @@ class APIFixture(fixture.GabbiFixture):
self.placement_db_fixture.setUp()
self.context = context.RequestContext()
# Some database interaction methods require access to the oslo config
# via the context. Within the WSGI application this is taken care of,
# but here in the fixtures we use some of those methods to create
# entities.
self.context.config = self.conf_fixture.conf
# Set default policy opts, otherwise the deploy module can raise
# NoSuchOptError.
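With the config attached, code that receives this context can read placement options through the same path the running service uses. Purely as an illustration, assuming the placement options are already registered on the fixture's conf:

# Group first, then option name, as the allocation code does above.
retry_count = self.context.config.placement.allocation_conflict_retry_count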

View File

@@ -0,0 +1,18 @@
---
fixes:
- |
When a single resource provider receives many concurrent allocation writes,
retries may be performed server side when there is a resource provider
generation conflict. When those retries are all consumed, the client
receives an HTTP 409 response and may choose to try the request again.
In an environment where high levels of concurrent allocation writes are
common, such as a busy clustered hypervisor, the default retry count may be
too low. See story 2006467_.

A new configuration setting,
``[placement]/allocation_conflict_retry_count``, has been added to address
this situation. It defines the number of times to retry, server-side,
writing allocations when there is a resource provider generation conflict.

.. _2006467: https://storyboard.openstack.org/#!/story/2006467