Make the DB retry mechanism more robust

Currently, in the worst case, the retry intervals are:
[0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 10, 10, 10]
This may cause some neutron DB actions to fail and return an
API 500/40X error; for instance, IP allocation has a large
chance of failure with this config.

After this change, the worst-case retry intervals will look
something like this:
[0.5, 1, 2, 4, 8, 10, 10, ... 10]
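
Both lists follow the capped exponential backoff used by oslo.db:
the sleep doubles after every failure until it hits
max_retry_interval (10 seconds by default). A minimal sketch
reproducing both sequences, assuming that doubling-with-cap
behaviour:

    def backoff_intervals(retry_interval, max_retries,
                          max_retry_interval=10):
        # Doubling backoff capped at max_retry_interval, as produced
        # by oslo_db.api.wrap_db_retry with inc_retry_interval=True.
        interval = retry_interval
        for _ in range(max_retries):
            yield min(interval, max_retry_interval)
            interval *= 2

    list(backoff_intervals(0.1, 10))  # old: [0.1, 0.2, ..., 6.4, 10, 10, 10]
    list(backoff_intervals(0.5, 20))  # new: [0.5, 1, 2, 4, 8, 10, ..., 10]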

So with the new values (20 retries, 0.5s initial interval), the
worst-case total time cost is 165.5s, versus 42.7s for the old
config. That is not too radical an increase for the new retry
parameters, and we get a higher rate of API success.
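
A quick check of those totals, just summing the two worst-case
interval lists above:

    old = [0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 10, 10, 10]
    new = [0.5, 1, 2, 4, 8] + [10] * 15
    print(sum(old))  # 42.7 (within float rounding)
    print(sum(new))  # 165.5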

Change-Id: I5ad139bdfb3ae125658b36d05f85f139a1b47bee
Closes-Bug: #1777968
LIU Yulong 2018-07-16 09:56:47 +08:00
parent c55478666e
commit ed62a06a53
3 changed files with 47 additions and 4 deletions

@@ -36,7 +36,7 @@ from neutron_lib import exceptions
 from neutron_lib.objects import exceptions as obj_exc
 
 
-MAX_RETRIES = 10
+MAX_RETRIES = 20
 OSPROFILER_TRACE_NAMES = {'neutron.db', 'neutron_lib.db'}
 LOG = logging.getLogger(__name__)
 _synchronized = lockutils.synchronized_with_prefix("neutron-")
@@ -148,7 +148,7 @@ def _copy_if_lds(item):
 
 _retry_db_errors = oslo_db_api.wrap_db_retry(
     max_retries=MAX_RETRIES,
-    retry_interval=0.1,
+    retry_interval=0.5,
     inc_retry_interval=True,
     exception_checker=is_retriable
 )
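
For reference, a hedged sketch of how this decorator is applied;
create_object is a hypothetical helper, and is_retriable below is
a simplified stand-in for neutron-lib's real checker:

    from oslo_db import api as oslo_db_api
    from oslo_db import exception as db_exc

    def is_retriable(exc):
        # Simplified stand-in: treat deadlocks and dropped
        # connections as retriable.
        return isinstance(exc, (db_exc.DBDeadlock,
                                db_exc.DBConnectionError))

    _retry_db_errors = oslo_db_api.wrap_db_retry(
        max_retries=20,           # was 10
        retry_interval=0.5,       # was 0.1
        inc_retry_interval=True,  # double the sleep after each failure
        exception_checker=is_retriable,
    )

    @_retry_db_errors
    def create_object(context, values):
        # Any retriable DB error raised here is retried up to
        # 20 times with the capped exponential backoff above.
        ...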

@@ -129,9 +129,42 @@ class TestDeadLockDecorator(_base.BaseTestCase):
         e = db_exc.DBConnectionError()
         mock.patch('time.sleep').start()
         with testtools.ExpectedException(db_exc.DBConnectionError):
-            # after 10 failures, the inner retry should give up and
+            # after 20 failures, the inner retry should give up and
             # the exception should be tagged to prevent the outer retry
-            self._alt_context_function(context, 11, e)
+            self._alt_context_function(context, db_api.MAX_RETRIES + 1, e)
+
+    def _test_retry_time_cost(self, exc_to_raise):
+        worst_case = [0.5, 1, 2, 4, 8,
+                      10, 10, 10, 10, 10,
+                      10, 10, 10, 10, 10,
+                      10, 10, 10, 10, 10]
+
+        class FakeTime(object):
+            def __init__(self):
+                self.counter = 0
+
+            def sleep(self, t):
+                self.counter += t
+
+        fake_timer = FakeTime()
+
+        def fake_sleep(t):
+            fake_timer.sleep(t)
+
+        e = exc_to_raise()
+        mock.patch('time.sleep', side_effect=fake_sleep).start()
+        with testtools.ExpectedException(exc_to_raise):
+            self._decorated_function(db_api.MAX_RETRIES + 1, e)
+        if exc_to_raise == db_exc.DBDeadlock:
+            self.assertEqual(True, (fake_timer.counter <= sum(worst_case)))
+        else:
+            self.assertEqual(sum(worst_case), fake_timer.counter)
+
+    def test_all_deadlock_time_elapsed(self):
+        self._test_retry_time_cost(db_exc.DBDeadlock)
+
+    def test_not_deadlock_time_elapsed(self):
+        self._test_retry_time_cost(db_exc.DBConnectionError)
+
     def test_retry_if_session_inactive_args_not_mutated_after_retries(self):
         context = mock.Mock()
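
The two assertions differ because oslo.db jitters DBDeadlock
retries: each deadlock sleep is drawn at random below the capped
interval, while other retriable errors sleep the full interval, so
only the non-deadlock path has a deterministic total. A minimal
sketch of that distinction, assuming uniform jitter:

    import random

    def sleep_times(retry_interval, retries, max_interval=10,
                    deadlock=False):
        # Doubling backoff capped at max_interval; deadlock retries
        # additionally draw each sleep uniformly from [0, interval].
        interval = retry_interval
        for _ in range(retries):
            capped = min(interval, max_interval)
            yield random.uniform(0, capped) if deadlock else capped
            interval *= 2

    assert sum(sleep_times(0.5, 20)) == 165.5                 # the test's ==
    assert sum(sleep_times(0.5, 20, deadlock=True)) <= 165.5  # the test's <=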

@@ -0,0 +1,10 @@
+---
+fixes:
+  - |
+    Increase the DB retry interval and max retry count for the
+    ``retry_db_errors`` decorator in ``neutron_lib.db.api`` to
+    0.5 seconds and 20 retries, respectively. Actions with a
+    higher chance of ``DBDeadlock`` should now have a higher
+    success rate due to the larger random sleep range and retry
+    count. For more information, see bug
+    `1777968 <https://bugs.launchpad.net/neutron/+bug/1777968>`_.