From 8f67ed7d530770417e462d569ef413a0300c0e66 Mon Sep 17 00:00:00 2001 From: melanie witt Date: Thu, 20 Apr 2017 15:46:17 +0000 Subject: [PATCH] Add min_interval to BackOffLoopingCall The backoff timer has a few issues that can cause it to get stuck in an infinite loop and never time out. 1. The random.gauss() function used to generate random jitter can return negative values, so when it does, it makes the elapsed time self._error_time go "backward." 2. The random jitter is used as a multiplier for the self._interval, so self._interval can deviate far away from the mean over time and walk to zero, causing self._interval to be 0, which will prevent the timer from making progress from that point on because idle will always evaluate to zero and the elapsed time won't increase. 3. The evaluated interval doesn't have a lower bound, so over time it can get extremely small if jitter (the mean) < 0.5. This adds a min_interval keyword argument to the BackOffLoopingCall start() function that defaults to 0.001s and uses it to lower bound the interval calculations. We'll also take the absolute value of the return from random.gauss() to prevent elapsed time going backward, and we'll calculate the running self._interval separately to make it track the desired growth rate of the backoff and not let it drift with the random.gauss() values. Closes-Bug: #1686159 Change-Id: Id17668a34d5cedbe870c9056350a7e9c7196faa7 --- oslo_service/loopingcall.py | 16 +++++-- oslo_service/tests/test_loopingcall.py | 64 ++++++++++++++++++-------- 2 files changed, 57 insertions(+), 23 deletions(-) diff --git a/oslo_service/loopingcall.py b/oslo_service/loopingcall.py index 1619d3c5..1747fda9 100644 --- a/oslo_service/loopingcall.py +++ b/oslo_service/loopingcall.py @@ -290,6 +290,8 @@ class BackOffLoopingCall(LoopingCallBase): random.gauss(jitter, 0.1), with jitter as the mean for the distribution. If set below .5, it can cause the calls to come more rapidly after each failure. + :param min_interval: The minimum interval in seconds between calls to + function. :raises: LoopingCallTimeout if time spent doing error retries would exceed timeout. """ @@ -305,7 +307,7 @@ class BackOffLoopingCall(LoopingCallBase): self._interval = 1 def start(self, initial_delay=None, starting_interval=1, timeout=300, - max_interval=300, jitter=0.75): + max_interval=300, jitter=0.75, min_interval=0.001): if self._thread is not None: raise RuntimeError(self._RUN_ONLY_ONE_MESSAGE) @@ -314,16 +316,20 @@ class BackOffLoopingCall(LoopingCallBase): self._interval = starting_interval def _idle_for(success, _elapsed): - random_jitter = self._RNG.gauss(jitter, 0.1) + random_jitter = abs(self._RNG.gauss(jitter, 0.1)) if success: # Reset error state now that it didn't error... self._interval = starting_interval self._error_time = 0 return self._interval * random_jitter else: - # Perform backoff - self._interval = idle = min( - self._interval * 2 * random_jitter, max_interval) + # Perform backoff, random jitter around the next interval + # bounded by min_interval and max_interval. + idle = max(self._interval * 2 * random_jitter, min_interval) + idle = min(idle, max_interval) + # Calculate the next interval based on the mean, so that the + # backoff grows at the desired rate. + self._interval = max(self._interval * 2 * jitter, min_interval) # Don't go over timeout, end early if necessary. If # timeout is 0, keep going. if timeout > 0 and self._error_time + idle > timeout: diff --git a/oslo_service/tests/test_loopingcall.py b/oslo_service/tests/test_loopingcall.py index 4a758639..7ac80252 100644 --- a/oslo_service/tests/test_loopingcall.py +++ b/oslo_service/tests/test_loopingcall.py @@ -326,16 +326,43 @@ class TestBackOffLoopingCall(test_base.BaseTestCase): loopingcall.BackOffLoopingCall(false).start() .wait) - expected_times = [mock.call(1.6000000000000001), - mock.call(2.5600000000000005), - mock.call(4.096000000000001), - mock.call(6.5536000000000021), - mock.call(10.485760000000004), - mock.call(16.777216000000006), - mock.call(26.843545600000013), - mock.call(42.949672960000022), - mock.call(68.719476736000033), - mock.call(109.95116277760006)] + expected_times = [mock.call(1.6), + mock.call(2.4000000000000004), + mock.call(3.6), + mock.call(5.4), + mock.call(8.1), + mock.call(12.15), + mock.call(18.225), + mock.call(27.337500000000002), + mock.call(41.00625), + mock.call(61.509375000000006), + mock.call(92.26406250000001)] + self.assertEqual(expected_times, sleep_mock.call_args_list) + + @mock.patch('random.SystemRandom.gauss') + @mock.patch('eventlet.greenthread.sleep') + def test_exponential_backoff_negative_value(self, sleep_mock, random_mock): + def false(): + return False + + # random.gauss() can return negative values + random_mock.return_value = -.8 + + self.assertRaises(loopingcall.LoopingCallTimeOut, + loopingcall.BackOffLoopingCall(false).start() + .wait) + + expected_times = [mock.call(1.6), + mock.call(2.4000000000000004), + mock.call(3.6), + mock.call(5.4), + mock.call(8.1), + mock.call(12.15), + mock.call(18.225), + mock.call(27.337500000000002), + mock.call(41.00625), + mock.call(61.509375000000006), + mock.call(92.26406250000001)] self.assertEqual(expected_times, sleep_mock.call_args_list) @mock.patch('random.SystemRandom.gauss') @@ -379,14 +406,15 @@ class TestBackOffLoopingCall(test_base.BaseTestCase): max_interval=60) .wait) - expected_times = [mock.call(1.6000000000000001), - mock.call(2.5600000000000005), - mock.call(4.096000000000001), - mock.call(6.5536000000000021), - mock.call(10.485760000000004), - mock.call(16.777216000000006), - mock.call(26.843545600000013), - mock.call(42.949672960000022), + expected_times = [mock.call(1.6), + mock.call(2.4000000000000004), + mock.call(3.6), + mock.call(5.4), + mock.call(8.1), + mock.call(12.15), + mock.call(18.225), + mock.call(27.337500000000002), + mock.call(41.00625), mock.call(60), mock.call(60), mock.call(60)]