Browse Source

[placement] Retry allocation writes server side

This change adds a fast retry loop around
AllocationList._set_allocations if a resource provider generation
conflict happens. It turns out that under high concurrency of allocation
claims being made on the same resource provider conflicts can be quite
common and client side retries are insufficient.

Because both consumer generation and resource provider generation
conflicts raised the same exception, there was no way to distinguish
between the two, so a child of ConcurrentUpdateDetected has been created
as ResourceProviderConcurrentUpdateDetected. In the future this will allow
us to send different error codes to the client as well, but that change
is not done here.

When the conflict is detected, all the resource providers in the
AllocationList are reloaded and the list objects refreshed.

Logging is provided to indicate:

* at debug that a retry is going to happen
* at warning that all the retries failed and the client is going to
  see the conflict

The tests for this are a bit funky: Some mocks are used to cause the
conflicts, then the real actions after a couple of iterations.

This was backported from 72e4c4c8d7 with
conflicts because exceptions, tests and object files were moved to
new locations with Rocky and the AllocationList.create_all method was
renamed to the more accurate replace_all. Prior to Rocky there were
no Consumer objects and fewer helper methods in functional tests,
so the test is adjusted accordingly.

Change-Id: Id614d609fc8f3ed2d2ff29a2b52143f53b3b1b9a
Closes-Bug: #1719933
(cherry picked from commit 72e4c4c8d7)
tags/17.0.6
Chris Dent 1 year ago
parent
commit
66a47b7623
3 changed files with 138 additions and 4 deletions
  1. +5
    -0
      nova/exception.py
  2. +40
    -4
      nova/objects/resource_provider.py
  3. +93
    -0
      nova/tests/functional/db/test_resource_provider.py

+ 5
- 0
nova/exception.py View File

@@ -2122,6 +2122,11 @@ class ConcurrentUpdateDetected(NovaException):
"Please retry your update")


class ResourceProviderConcurrentUpdateDetected(ConcurrentUpdateDetected):
    """Generation conflict specific to a resource provider.

    Subclasses ConcurrentUpdateDetected so existing handlers keep
    working, while letting callers tell provider-generation conflicts
    apart from consumer-generation conflicts (e.g. for server-side
    retries or distinct client error codes).
    """
    msg_fmt = _("Another thread concurrently updated the resource "
                "provider data. Please retry your update")


class ResourceClassNotFound(NotFound):
    # Raised when a lookup by resource class name finds no match.
    msg_fmt = _("No such resource class %(resource_class)s.")


+ 40
- 4
nova/objects/resource_provider.py View File

@@ -279,7 +279,7 @@ def _increment_provider_generation(ctx, rp):

res = ctx.session.execute(upd_stmt)
if res.rowcount != 1:
raise exception.ConcurrentUpdateDetected
raise exception.ResourceProviderConcurrentUpdateDetected()
return new_generation


@@ -1992,6 +1992,10 @@ def _get_allocations_by_consumer_uuid(ctx, consumer_uuid):
@base.NovaObjectRegistry.register_if(False)
class AllocationList(base.ObjectListBase, base.NovaObject):

# The number of times to retry set_allocations if there has
# been a resource provider (not consumer) generation conflict.
RP_CONFLICT_RETRY_COUNT = 10

fields = {
'objects': fields.ListOfObjectsField('Allocation'),
}
@@ -2139,9 +2143,41 @@ class AllocationList(base.ObjectListBase, base.NovaObject):
returned to the caller, nor are their database ids set. If
those ids are required use one of the get_all_by* methods.
"""
# TODO(jaypipes): Retry the allocation writes on
# ConcurrentUpdateDetected
self._set_allocations(self._context, self.objects)
# Retry _set_allocations server side if there is a
# ResourceProviderConcurrentUpdateDetected. We don't care about
# sleeping, we simply want to reset the resource provider objects
# and try again. For sake of simplicity (and because we don't have
# easy access to the information) we reload all the resource
# providers that may be present.
retries = self.RP_CONFLICT_RETRY_COUNT
while retries:
retries -= 1
try:
self._set_allocations(self._context, self.objects)
break
except exception.ResourceProviderConcurrentUpdateDetected:
LOG.debug('Retrying allocations write on resource provider '
'generation conflict')
# We only want to reload each unique resource provider once.
alloc_rp_uuids = set(
alloc.resource_provider.uuid for alloc in self.objects)
seen_rps = {}
for rp_uuid in alloc_rp_uuids:
seen_rps[rp_uuid] = ResourceProvider.get_by_uuid(
self._context, rp_uuid)
for alloc in self.objects:
rp_uuid = alloc.resource_provider.uuid
alloc.resource_provider = seen_rps[rp_uuid]
else:
# We ran out of retries so we need to raise again.
# The log will automatically have request id info associated with
# it that will allow tracing back to specific allocations.
# Attempting to extract specific consumer or resource provider
# information from the allocations is not coherent as this
# could be multiple consumers and providers.
LOG.warning('Exceeded retry limit of %d on allocations write',
self.RP_CONFLICT_RETRY_COUNT)
raise exception.ResourceProviderConcurrentUpdateDetected()

def delete_all(self):
# Allocations can only have a single consumer, so take advantage of

+ 93
- 0
nova/tests/functional/db/test_resource_provider.py View File

@@ -11,6 +11,7 @@
# under the License.


import functools
import mock
import os_traits
from oslo_db import exception as db_exc
@@ -42,6 +43,15 @@ DISK_ALLOCATION = dict(
)


def add_inventory(rp, rc, total, **kwargs):
    """Create, default, and attach an Inventory record to a provider.

    :param rp: the ResourceProvider to receive the inventory
    :param rc: the resource class name for the inventory
    :param total: total amount of the inventory; also used as max_unit
        unless the caller overrides it
    :param kwargs: additional Inventory field values
    :returns: the created Inventory object
    """
    if 'max_unit' not in kwargs:
        kwargs['max_unit'] = total
    inventory = rp_obj.Inventory(
        rp._context, resource_provider=rp, resource_class=rc,
        total=total, **kwargs)
    inventory.obj_set_defaults()
    rp.add_inventory(inventory)
    return inventory


class ResourceProviderBaseCase(test.NoDBTestCase):

USES_DB_SELF = True
@@ -1866,6 +1876,89 @@ class TestAllocationListCreateDelete(ResourceProviderBaseCase):
self.ctx, migration_uuid)
self.assertEqual(0, len(allocations))

@mock.patch('nova.objects.resource_provider.LOG')
def test_set_allocations_retry(self, mock_log):
    """Test server side allocation write retry handling."""

    # Create a single resource provider and give it some inventory.
    rp1 = rp_obj.ResourceProvider(
        self.ctx, name='rp1', uuid=uuidsentinel.rp1)
    rp1.create()
    add_inventory(rp1, fields.ResourceClass.VCPU, 24,
                  allocation_ratio=16.0)
    add_inventory(rp1, fields.ResourceClass.MEMORY_MB, 1024,
                  min_unit=64,
                  max_unit=1024,
                  step_size=64)
    original_generation = rp1.generation
    # Verify the generation is what we expect (we'll be checking again
    # later).
    self.assertEqual(2, original_generation)

    inst_consumer = uuidsentinel.instance

    alloc_list = rp_obj.AllocationList(
        context=self.ctx,
        objects=[
            rp_obj.Allocation(
                context=self.ctx,
                consumer_id=inst_consumer,
                resource_provider=rp1,
                resource_class=fields.ResourceClass.VCPU,
                used=12),
            rp_obj.Allocation(
                context=self.ctx,
                consumer_id=inst_consumer,
                resource_provider=rp1,
                resource_class=fields.ResourceClass.MEMORY_MB,
                used=1024)
        ])

    # Make sure the right exception happens when the retry loop expires.
    with mock.patch.object(rp_obj.AllocationList,
                           'RP_CONFLICT_RETRY_COUNT', 0):
        self.assertRaises(
            exception.ResourceProviderConcurrentUpdateDetected,
            alloc_list.create_all)
        mock_log.warning.assert_called_with(
            'Exceeded retry limit of %d on allocations write', 0)

    # Make sure the right thing happens after a small number of failures.
    # There's a bit of mock magic going on here to ensure that we can
    # both do some side effects on _set_allocations as well as have the
    # real behavior. Two generation conflicts and then a success.
    mock_log.reset_mock()
    with mock.patch.object(rp_obj.AllocationList,
                           'RP_CONFLICT_RETRY_COUNT', 3):
        unmocked_set = functools.partial(
            rp_obj.AllocationList._set_allocations, alloc_list)
        with mock.patch(
                'nova.objects.resource_provider.'
                'AllocationList._set_allocations') as mock_set:
            exceptions = iter([
                exception.ResourceProviderConcurrentUpdateDetected(),
                exception.ResourceProviderConcurrentUpdateDetected(),
            ])

            def side_effect(*args, **kwargs):
                # Raise the canned conflicts first, then fall through
                # to the real implementation once they are exhausted.
                try:
                    raise next(exceptions)
                except StopIteration:
                    return unmocked_set(*args, **kwargs)

            mock_set.side_effect = side_effect
            alloc_list.create_all()
            self.assertEqual(2, mock_log.debug.call_count)
            # NOTE: must be assert_called_with, not called_with; the
            # bare called_with form is plain Mock attribute access and
            # silently asserts nothing.
            mock_log.debug.assert_called_with(
                'Retrying allocations write on resource provider '
                'generation conflict')
            self.assertEqual(3, mock_set.call_count)

    # Confirm we're using a different rp object after the change
    # and that it has a higher generation.
    new_rp = alloc_list[0].resource_provider
    self.assertEqual(original_generation, rp1.generation)
    self.assertEqual(original_generation + 1, new_rp.generation)


class UsageListTestCase(ResourceProviderBaseCase):


Loading…
Cancel
Save