Allow retries when resource acquires lock

Previously, if an update or delete on a resource is initiated
concurrently with another action (e.g. a metadata update as the result
of a resource signal) it may end up failing with
exception.UpdateInProgress because its view of the resource's
atomic_key is stale.

Now, we retry, rereading the resource's atomic_key from the db if
needed, up to cfg.CONF.action_retry_limit times.

Change-Id: I4cfa6f691fe916c0d605a712028b88f61ebab4d9
Partial-Bug: #1675286
This commit is contained in:
Crag Wolfe 2017-03-30 10:49:17 -04:00
parent c2995c73df
commit 2ec2d5a973
2 changed files with 23 additions and 6 deletions

View File

@ -15,6 +15,7 @@ import base64
import contextlib
import datetime as dt
import pydoc
import tenacity
import weakref
from oslo_config import cfg
@ -772,14 +773,35 @@ class Resource(status.ResourceStatus):
Expected exceptions are re-raised, with the Resource moved to the
COMPLETE state.
"""
attempts = 1
first_iter = [True] # work around no nonlocal in py27
if self.stack.convergence:
lock_acquire = self.LOCK_ACQUIRE
lock_release = self.LOCK_RELEASE
if action != self.CREATE:
attempts += max(cfg.CONF.client_retry_limit, 0)
else:
lock_acquire = lock_release = self.LOCK_NONE
try:
# retry for convergence DELETE or UPDATE if we get the usual
# lock-acquire exception of exception.UpdateInProgress
@tenacity.retry(
stop=tenacity.stop_after_attempt(attempts),
retry=tenacity.retry_if_exception_type(
exception.UpdateInProgress),
wait=tenacity.wait_random(max=2),
reraise=True)
def set_in_progress():
if not first_iter[0]:
res_obj = resource_objects.Resource.get_obj(
self.context, self.id)
self._atomic_key = res_obj.atomic_key
else:
first_iter[0] = False
self.state_set(action, self.IN_PROGRESS, lock=lock_acquire)
try:
set_in_progress()
yield
except exception.UpdateInProgress as ex:
with excutils.save_and_reraise_exception():

View File

@ -55,8 +55,3 @@ class AodhAlarmTest(scenario_base.ScenarioTestsBase):
# Note: there is little point waiting more than 60s+time to scale up.
self.assertTrue(test.call_until_true(
120, 2, self.check_instance_count, stack_identifier, 2))
# Temporarily avoids a race condition, addressed in the
# next change https://review.openstack.org/#/c/449351/
import time
time.sleep(3)