[placement] Retry allocation writes server side

This change adds a fast retry loop around
AllocationList._set_allocations when a resource provider generation
conflict happens. It turns out that under high concurrency of
allocation claims against the same resource provider, conflicts can be
quite common and client-side retries are insufficient.
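
In outline, the server side loop has the following shape (a minimal,
self-contained sketch with placeholder names; the real code is
AllocationList.replace_all in the diff below):

# Minimal sketch of the retry shape this change adds. The names here are
# placeholders, not the placement code.

class ProviderGenerationConflict(Exception):
    """Stand-in for ResourceProviderConcurrentUpdateDetected."""

RP_CONFLICT_RETRY_COUNT = 10

def replace_all(write_allocations, refresh_providers):
    retries = RP_CONFLICT_RETRY_COUNT
    while retries:
        retries -= 1
        try:
            write_allocations()   # may raise on a provider generation bump
            break
        except ProviderGenerationConflict:
            # No sleeping: reload the providers so their generations are
            # current, then immediately try the write again.
            refresh_providers()
    else:
        # The loop never hit break: give up and surface the conflict.
        raise ProviderGenerationConflict()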

Because consumer generation conflicts and resource provider generation
conflicts raised the same exception, there was no way to distinguish
between the two, so a child of ConcurrentUpdateDetected has been
created: ResourceProviderConcurrentUpdateDetected. In the future this
will also allow us to send different error codes to the client, but
that change is not made here.
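
The subclass is what lets a caller treat the two conflicts differently
while existing handlers for ConcurrentUpdateDetected continue to catch
both. A self-contained sketch with stand-in classes (placement's real
exceptions also carry a msg_fmt):

# Stand-in classes showing only the subclass relationship.

class ConcurrentUpdateDetected(Exception):
    """Consumer generation conflict (and, before this change, provider
    generation conflicts as well)."""

class ResourceProviderConcurrentUpdateDetected(ConcurrentUpdateDetected):
    """Specifically a resource provider generation conflict."""

def handle(exc):
    try:
        raise exc
    except ResourceProviderConcurrentUpdateDetected:
        return 'retry the write server side'
    except ConcurrentUpdateDetected:
        return 'report the conflict to the client'

assert handle(
    ResourceProviderConcurrentUpdateDetected()) == 'retry the write server side'
assert handle(
    ConcurrentUpdateDetected()) == 'report the conflict to the client'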

When the conflict is detected, all the resource providers in the
AllocationList are reloaded and the Allocation objects in the list are
refreshed to point at the reloaded providers.

Logging is provided to indicate:

* at debug that a retry is going to happen
* at warning that all the retries failed and the client is going to
  see the conflict

The tests for this are a bit funky: mocks are used to cause the
conflicts for the first couple of iterations, after which the real
_set_allocations runs.
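
Distilled, the mock technique is a side_effect that raises the conflict
a fixed number of times and then delegates to the real implementation.
The standalone sketch below uses local stand-ins; the real test patches
AllocationList._set_allocations and binds the unmocked method with
functools.partial.

# Standalone sketch of the mock technique: fail twice, then behave normally.
from unittest import mock

class Conflict(Exception):
    pass

def real_write():
    return 'written'

failures = iter([Conflict(), Conflict()])

def side_effect(*args, **kwargs):
    try:
        raise next(failures)
    except StopIteration:
        # Conflicts exhausted: fall through to the real behaviour.
        return real_write(*args, **kwargs)

fake_write = mock.Mock(side_effect=side_effect)

# The first two calls raise; the third succeeds via the real function.
for _ in range(2):
    try:
        fake_write()
        raise AssertionError('expected a Conflict')
    except Conflict:
        pass

assert fake_write() == 'written'
assert fake_write.call_count == 3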

Change-Id: Id614d609fc8f3ed2d2ff29a2b52143f53b3b1b9a
Closes-Bug: #1719933
Chris Dent 2018-07-26 12:00:14 +01:00
parent 04b83e7013
commit 72e4c4c8d7
3 changed files with 131 additions and 4 deletions

nova/api/openstack/placement/exception.py

@@ -85,6 +85,11 @@ class ConcurrentUpdateDetected(_BaseException):
                 "Please retry your update")
 
 
+class ResourceProviderConcurrentUpdateDetected(ConcurrentUpdateDetected):
+    msg_fmt = _("Another thread concurrently updated the resource provider "
+                "data. Please retry your update")
+
+
 class InvalidAllocationCapacityExceeded(InvalidInventory):
     msg_fmt = _("Unable to create allocation for '%(resource_class)s' on "
                 "resource provider '%(resource_provider)s'. The requested "

nova/api/openstack/placement/objects/resource_provider.py

@@ -278,7 +278,7 @@ def _increment_provider_generation(ctx, rp):
     res = ctx.session.execute(upd_stmt)
     if res.rowcount != 1:
-        raise exception.ConcurrentUpdateDetected
+        raise exception.ResourceProviderConcurrentUpdateDetected()
     return new_generation
@@ -1924,6 +1924,10 @@ def _create_incomplete_consumer(ctx, consumer_id):
 @base.VersionedObjectRegistry.register_if(False)
 class AllocationList(base.ObjectListBase, base.VersionedObject):
+    # The number of times to retry set_allocations if there has
+    # been a resource provider (not consumer) generation conflict.
+    RP_CONFLICT_RETRY_COUNT = 10
+
     fields = {
         'objects': fields.ListOfObjectsField('Allocation'),
     }
@@ -2106,9 +2110,41 @@ class AllocationList(base.ObjectListBase, base.VersionedObject):
         so, it will end up setting the Allocation.id attribute of each
         Allocation object.
         """
-        # TODO(jaypipes): Retry the allocation writes on
-        # ConcurrentUpdateDetected
-        self._set_allocations(self._context, self.objects)
+        # Retry _set_allocations server side if there is a
+        # ResourceProviderConcurrentUpdateDetected. We don't care about
+        # sleeping, we simply want to reset the resource provider objects
+        # and try again. For sake of simplicity (and because we don't have
+        # easy access to the information) we reload all the resource
+        # providers that may be present.
+        retries = self.RP_CONFLICT_RETRY_COUNT
+        while retries:
+            retries -= 1
+            try:
+                self._set_allocations(self._context, self.objects)
+                break
+            except exception.ResourceProviderConcurrentUpdateDetected:
+                LOG.debug('Retrying allocations write on resource provider '
+                          'generation conflict')
+                # We only want to reload each unique resource provider once.
+                alloc_rp_uuids = set(
+                    alloc.resource_provider.uuid for alloc in self.objects)
+                seen_rps = {}
+                for rp_uuid in alloc_rp_uuids:
+                    seen_rps[rp_uuid] = ResourceProvider.get_by_uuid(
+                        self._context, rp_uuid)
+                for alloc in self.objects:
+                    rp_uuid = alloc.resource_provider.uuid
+                    alloc.resource_provider = seen_rps[rp_uuid]
+        else:
+            # We ran out of retries so we need to raise again.
+            # The log will automatically have request id info associated with
+            # it that will allow tracing back to specific allocations.
+            # Attempting to extract specific consumer or resource provider
+            # information from the allocations is not coherent as this
+            # could be multiple consumers and providers.
+            LOG.warning('Exceeded retry limit of %d on allocations write',
+                        self.RP_CONFLICT_RETRY_COUNT)
+            raise exception.ResourceProviderConcurrentUpdateDetected()
 
     def delete_all(self):
         consumer_uuids = set(alloc.consumer.uuid for alloc in self.objects)

nova/tests/functional/api/openstack/placement/db/test_resource_provider.py

@@ -11,6 +11,7 @@
 #    under the License.
 
+import functools
 import mock
 import os_traits
 from oslo_db import exception as db_exc
@@ -1701,6 +1702,91 @@ class TestAllocationListCreateDelete(tb.PlacementDbBaseTestCase):
             self.ctx, empty_rp)
         self.assertEqual(0, len(allocations))
 
+    @mock.patch('nova.api.openstack.placement.objects.resource_provider.LOG')
+    def test_set_allocations_retry(self, mock_log):
+        """Test server side allocation write retry handling."""
+
+        # Create a single resource provider and give it some inventory.
+        rp1 = self._create_provider('rp1')
+        tb.add_inventory(rp1, fields.ResourceClass.VCPU, 24,
+                         allocation_ratio=16.0)
+        tb.add_inventory(rp1, fields.ResourceClass.MEMORY_MB, 1024,
+                         min_unit=64,
+                         max_unit=1024,
+                         step_size=64)
+        original_generation = rp1.generation
+        # Verify the generation is what we expect (we'll be checking again
+        # later).
+        self.assertEqual(2, original_generation)
+
+        # Create a consumer and have it make an allocation.
+        inst_consumer = consumer_obj.Consumer(
+            self.ctx, uuid=uuidsentinel.instance, user=self.user_obj,
+            project=self.project_obj)
+        inst_consumer.create()
+
+        alloc_list = rp_obj.AllocationList(context=self.ctx,
+            objects=[
+                rp_obj.Allocation(
+                    context=self.ctx,
+                    consumer=inst_consumer,
+                    resource_provider=rp1,
+                    resource_class=fields.ResourceClass.VCPU,
+                    used=12),
+                rp_obj.Allocation(
+                    context=self.ctx,
+                    consumer=inst_consumer,
+                    resource_provider=rp1,
+                    resource_class=fields.ResourceClass.MEMORY_MB,
+                    used=1024)
+            ])
+
+        # Make sure the right exception happens when the retry loop expires.
+        with mock.patch.object(rp_obj.AllocationList,
+                               'RP_CONFLICT_RETRY_COUNT', 0):
+            self.assertRaises(
+                exception.ResourceProviderConcurrentUpdateDetected,
+                alloc_list.replace_all)
+            mock_log.warning.assert_called_with(
+                'Exceeded retry limit of %d on allocations write', 0)
+
+        # Make sure the right thing happens after a small number of failures.
+        # There's a bit of mock magic going on here to ensure that we can
+        # both do some side effects on _set_allocations as well as have the
+        # real behavior. Two generation conflicts and then a success.
+        mock_log.reset_mock()
+        with mock.patch.object(rp_obj.AllocationList,
+                               'RP_CONFLICT_RETRY_COUNT', 3):
+            unmocked_set = functools.partial(
+                rp_obj.AllocationList._set_allocations, alloc_list)
+            with mock.patch(
+                    'nova.api.openstack.placement.objects.resource_provider.'
+                    'AllocationList._set_allocations') as mock_set:
+                exceptions = iter([
+                    exception.ResourceProviderConcurrentUpdateDetected(),
+                    exception.ResourceProviderConcurrentUpdateDetected(),
+                ])
+
+                def side_effect(*args, **kwargs):
+                    try:
+                        raise next(exceptions)
+                    except StopIteration:
+                        return unmocked_set(*args, **kwargs)
+
+                mock_set.side_effect = side_effect
+                alloc_list.replace_all()
+                self.assertEqual(2, mock_log.debug.call_count)
+                mock_log.debug.assert_called_with(
+                    'Retrying allocations write on resource provider '
+                    'generation conflict')
+                self.assertEqual(3, mock_set.call_count)
+
+        # Confirm we're using a different rp object after the change
+        # and that it has a higher generation.
+        new_rp = alloc_list[0].resource_provider
+        self.assertEqual(original_generation, rp1.generation)
+        self.assertEqual(original_generation + 1, new_rp.generation)
+
 
 class UsageListTestCase(tb.PlacementDbBaseTestCase):