Allow retries when resource acquires lock

Previously, if an update or delete on a resource was initiated concurrently with another action (e.g. a metadata update resulting from a resource signal), it could fail with exception.UpdateInProgress because its view of the resource's atomic_key was stale. Now we retry, re-reading the resource's atomic_key from the db before each retry, up to cfg.CONF.client_retry_limit times. Change-Id: I4cfa6f691fe916c0d605a712028b88f61ebab4d9 Partial-Bug: #1675286
2017-03-30 10:49:17 -04:00 · 2017-03-30 10:49:17 -04:00 · 2ec2d5a973
commit 2ec2d5a973
parent c2995c73df
2 changed files with 23 additions and 6 deletions
--- a/heat/engine/resource.py
+++ b/heat/engine/resource.py
@ -15,6 +15,7 @@ import base64
 import contextlib
 import datetime as dt
 import pydoc
+import tenacity
 import weakref

 from oslo_config import cfg
@ -772,14 +773,35 @@ class Resource(status.ResourceStatus):
        Expected exceptions are re-raised, with the Resource moved to the
        COMPLETE state.
        """
+        attempts = 1
+        first_iter = [True]  # work around no nonlocal in py27
        if self.stack.convergence:
            lock_acquire = self.LOCK_ACQUIRE
            lock_release = self.LOCK_RELEASE
+            if action != self.CREATE:
+                attempts += max(cfg.CONF.client_retry_limit, 0)
        else:
            lock_acquire = lock_release = self.LOCK_NONE

-        try:
+        # retry for convergence DELETE or UPDATE if we get the usual
+        # lock-acquire exception of exception.UpdateInProgress
+        @tenacity.retry(
+            stop=tenacity.stop_after_attempt(attempts),
+            retry=tenacity.retry_if_exception_type(
+                exception.UpdateInProgress),
+            wait=tenacity.wait_random(max=2),
+            reraise=True)
+        def set_in_progress():
+            if not first_iter[0]:
+                res_obj = resource_objects.Resource.get_obj(
+                    self.context, self.id)
+                self._atomic_key = res_obj.atomic_key
+            else:
+                first_iter[0] = False
            self.state_set(action, self.IN_PROGRESS, lock=lock_acquire)
+
+        try:
+            set_in_progress()
            yield
        except exception.UpdateInProgress as ex:
            with excutils.save_and_reraise_exception():
--- a/heat_integrationtests/scenario/test_aodh_alarm.py
+++ b/heat_integrationtests/scenario/test_aodh_alarm.py
@ -55,8 +55,3 @@ class AodhAlarmTest(scenario_base.ScenarioTestsBase):
        # Note: there is little point waiting more than 60s+time to scale up.
        self.assertTrue(test.call_until_true(
            120, 2, self.check_instance_count, stack_identifier, 2))
-
-        # Temporarily avoids a race condition, addressed in the
-        # next change https://review.openstack.org/#/c/449351/
-        import time
-        time.sleep(3)