Make the DB retry mechanism more robust

Currently, in the worst case, the retry intervals are:
[0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 10, 10, 10]
This may cause some neutron DB actions to fail and return an
API 500/40X error; for instance, IP allocation has a large
chance of failure with this config.

After this change, the worst-case retry intervals will look
something like this:
[0.5, 1, 2, 4, 8, 10, 10, ... 10]
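
Both lists follow the capped exponential backoff used by oslo.db:
the sleep doubles after every failure until it hits
max_retry_interval (10 seconds by default). A minimal sketch
reproducing both sequences, assuming that doubling-with-cap
behaviour:

    def backoff_intervals(retry_interval, max_retries,
                          max_retry_interval=10):
        # Doubling backoff capped at max_retry_interval, as produced
        # by oslo_db.api.wrap_db_retry with inc_retry_interval=True.
        interval = retry_interval
        for _ in range(max_retries):
            yield min(interval, max_retry_interval)
            interval *= 2

    list(backoff_intervals(0.1, 10))  # old: [0.1, 0.2, ..., 6.4, 10, 10, 10]
    list(backoff_intervals(0.5, 20))  # new: [0.5, 1, 2, 4, 8, 10, ..., 10]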

So with the new values (20 retries, 0.5s initial interval), the
worst-case total time cost is 165.5s, versus 42.7s for the old
config. That is not too radical an increase for the new retry
parameters, and we get a higher rate of API success.
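
A quick check of those totals, just summing the two worst-case
interval lists above:

    old = [0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 10, 10, 10]
    new = [0.5, 1, 2, 4, 8] + [10] * 15
    print(sum(old))  # 42.7 (within float rounding)
    print(sum(new))  # 165.5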

Change-Id: I5ad139bdfb3ae125658b36d05f85f139a1b47bee
Closes-Bug: #1777968
LIU Yulong 2018-07-16 09:56:47 +08:00
parent c55478666e
commit ed62a06a53
3 changed files with 47 additions and 4 deletions

@@ -36,7 +36,7 @@ from neutron_lib import exceptions
 from neutron_lib.objects import exceptions as obj_exc
 
 
-MAX_RETRIES = 10
+MAX_RETRIES = 20
 OSPROFILER_TRACE_NAMES = {'neutron.db', 'neutron_lib.db'}
 LOG = logging.getLogger(__name__)
 _synchronized = lockutils.synchronized_with_prefix("neutron-")
@@ -148,7 +148,7 @@ def _copy_if_lds(item):
 
 _retry_db_errors = oslo_db_api.wrap_db_retry(
     max_retries=MAX_RETRIES,
-    retry_interval=0.1,
+    retry_interval=0.5,
     inc_retry_interval=True,
     exception_checker=is_retriable
 )
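
For reference, a hedged sketch of how this decorator is applied;
create_object is a hypothetical helper, and is_retriable below is
a simplified stand-in for neutron-lib's real checker:

    from oslo_db import api as oslo_db_api
    from oslo_db import exception as db_exc

    def is_retriable(exc):
        # Simplified stand-in: treat deadlocks and dropped
        # connections as retriable.
        return isinstance(exc, (db_exc.DBDeadlock,
                                db_exc.DBConnectionError))

    _retry_db_errors = oslo_db_api.wrap_db_retry(
        max_retries=20,           # was 10
        retry_interval=0.5,       # was 0.1
        inc_retry_interval=True,  # double the sleep after each failure
        exception_checker=is_retriable,
    )

    @_retry_db_errors
    def create_object(context, values):
        # Any retriable DB error raised here is retried up to
        # 20 times with the capped exponential backoff above.
        ...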

@@ -129,9 +129,42 @@ class TestDeadLockDecorator(_base.BaseTestCase):
         e = db_exc.DBConnectionError()
         mock.patch('time.sleep').start()
         with testtools.ExpectedException(db_exc.DBConnectionError):
-            # after 10 failures, the inner retry should give up and
+            # after 20 failures, the inner retry should give up and
             # the exception should be tagged to prevent the outer retry
-            self._alt_context_function(context, 11, e)
+            self._alt_context_function(context, db_api.MAX_RETRIES + 1, e)
+
+    def _test_retry_time_cost(self, exc_to_raise):
+        worst_case = [0.5, 1, 2, 4, 8,
+                      10, 10, 10, 10, 10,
+                      10, 10, 10, 10, 10,
+                      10, 10, 10, 10, 10]
+
+        class FakeTime(object):
+            def __init__(self):
+                self.counter = 0
+
+            def sleep(self, t):
+                self.counter += t
+
+        fake_timer = FakeTime()
+
+        def fake_sleep(t):
+            fake_timer.sleep(t)
+
+        e = exc_to_raise()
+        mock.patch('time.sleep', side_effect=fake_sleep).start()
+        with testtools.ExpectedException(exc_to_raise):
+            self._decorated_function(db_api.MAX_RETRIES + 1, e)
+        if exc_to_raise == db_exc.DBDeadlock:
+            self.assertEqual(True, (fake_timer.counter <= sum(worst_case)))
+        else:
+            self.assertEqual(sum(worst_case), fake_timer.counter)
+
+    def test_all_deadlock_time_elapsed(self):
+        self._test_retry_time_cost(db_exc.DBDeadlock)
+
+    def test_not_deadlock_time_elapsed(self):
+        self._test_retry_time_cost(db_exc.DBConnectionError)
+
     def test_retry_if_session_inactive_args_not_mutated_after_retries(self):
         context = mock.Mock()
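
The two assertions differ because oslo.db jitters DBDeadlock
retries: each deadlock sleep is drawn at random below the capped
interval, while other retriable errors sleep the full interval, so
only the non-deadlock path has a deterministic total. A minimal
sketch of that distinction, assuming uniform jitter:

    import random

    def sleep_times(retry_interval, retries, max_interval=10,
                    deadlock=False):
        # Doubling backoff capped at max_interval; deadlock retries
        # additionally draw each sleep uniformly from [0, interval].
        interval = retry_interval
        for _ in range(retries):
            capped = min(interval, max_interval)
            yield random.uniform(0, capped) if deadlock else capped
            interval *= 2

    assert sum(sleep_times(0.5, 20)) == 165.5                 # the test's ==
    assert sum(sleep_times(0.5, 20, deadlock=True)) <= 165.5  # the test's <=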

@@ -0,0 +1,10 @@
+---
+fixes:
+  - |
+    Increase the DB retry interval and max retry count for the
+    ``retry_db_errors`` decorator in ``neutron_lib.db.api`` to
+    0.5 seconds and 20 retries, respectively. Actions with a
+    higher chance of ``DBDeadlock`` should now have a higher
+    success rate due to the larger random sleep range and retry
+    count. For more information, see bug
+    `1777968 <https://bugs.launchpad.net/neutron/+bug/1777968>`_.