diff --git a/ironic_python_agent/agent.py b/ironic_python_agent/agent.py index a290c827f..a3032bd0f 100644 --- a/ironic_python_agent/agent.py +++ b/ironic_python_agent/agent.py @@ -68,14 +68,24 @@ class IronicPythonAgentStatus(encoding.Serializable): self.version = version +def _with_jitter(value, min_multiplier, max_multiplier): + interval_multiplier = random.uniform(min_multiplier, max_multiplier) + return value * interval_multiplier + + class IronicPythonAgentHeartbeater(threading.Thread): """Thread that periodically heartbeats to Ironic.""" - # If we could wait at most N seconds between heartbeats (or in case of an - # error) we will instead wait r x N seconds, where r is a random value - # between these multipliers. + # If we could wait at most N seconds between heartbeats, we will instead + # wait r x N seconds, where r is a random value between these multipliers. min_jitter_multiplier = 0.3 max_jitter_multiplier = 0.6 + # Error retry between 5 and 10 seconds, at least 12 retries with + # the default ramdisk_heartbeat_timeout of 300 and the worst case interval + # jitter of 0.6. + min_heartbeat_interval = 5 + min_error_jitter_multiplier = 1.0 + max_error_jitter_multiplier = 2.0 def __init__(self, agent): """Initialize the heartbeat thread. @@ -97,19 +107,39 @@ class IronicPythonAgentHeartbeater(threading.Thread): LOG.info('Starting heartbeater') self.agent.set_agent_advertise_addr() - while not self.stop_event.wait(min(self.interval, 5)): - if self._heartbeat_expected(): - self.do_heartbeat() + while self._run_next(): eventlet.sleep(0) + def _run_next(self): + # The logic here makes sure we don't wait exactly 5 seconds more or + # less regardless of the current interval since it may cause a + # thundering herd problem when a lot of agents are heartbeating. + # Essentially, if the next heartbeat is due in 2 seconds, don't wait 5. + # But if the next one is scheduled in 2 minutes, do wait 5 to account + # for forced heartbeats. + wait = min( + self.min_heartbeat_interval, + # This operation checks how much of the initially planned interval + # we have still left. Compare with 0 in case we overshoot the goal. + max(0, self.interval - (_time() - self.previous_heartbeat)), + ) + if self.stop_event.wait(wait): + return False # done + + if self._heartbeat_expected(): + self.do_heartbeat() + + return True + def _heartbeat_expected(self): + elapsed = _time() - self.previous_heartbeat + # Normal heartbeating - if _time() > self.previous_heartbeat + self.interval: + if elapsed >= self.interval: return True # Forced heartbeating, but once in 5 seconds - if (self.heartbeat_forced - and _time() > self.previous_heartbeat + 5): + if self.heartbeat_forced and elapsed > self.min_heartbeat_interval: return True def do_heartbeat(self): @@ -121,20 +151,24 @@ class IronicPythonAgentHeartbeater(threading.Thread): advertise_protocol=self.agent.advertise_protocol, generated_cert=self.agent.generated_cert, ) - LOG.info('heartbeat successful') + except Exception as exc: + if isinstance(exc, errors.HeartbeatConflictError): + LOG.warning('conflict error sending heartbeat to %s', + self.agent.api_url) + else: + LOG.exception('error sending heartbeat to %s', + self.agent.api_url) + self.interval = _with_jitter(self.min_heartbeat_interval, + self.min_error_jitter_multiplier, + self.max_error_jitter_multiplier) + else: + LOG.debug('heartbeat successful') self.heartbeat_forced = False - self.previous_heartbeat = _time() - except errors.HeartbeatConflictError: - LOG.warning('conflict error sending heartbeat to %s', - self.agent.api_url) - except Exception: - LOG.exception('error sending heartbeat to %s', self.agent.api_url) - finally: - interval_multiplier = random.uniform(self.min_jitter_multiplier, - self.max_jitter_multiplier) - self.interval = self.agent.heartbeat_timeout * interval_multiplier - LOG.info('sleeping before next heartbeat, interval: %s', - self.interval) + self.interval = _with_jitter(self.agent.heartbeat_timeout, + self.min_jitter_multiplier, + self.max_jitter_multiplier) + self.previous_heartbeat = _time() + LOG.info('sleeping before next heartbeat, interval: %s', self.interval) def force_heartbeat(self): self.heartbeat_forced = True diff --git a/ironic_python_agent/tests/unit/test_agent.py b/ironic_python_agent/tests/unit/test_agent.py index 3f284a896..400a36eb5 100644 --- a/ironic_python_agent/tests/unit/test_agent.py +++ b/ironic_python_agent/tests/unit/test_agent.py @@ -50,6 +50,20 @@ class FakeExtension(base.BaseAgentExtension): pass +class FakeClock: + current = 0 + last_wait = None + wait_result = False + + def get(self): + return self.current + + def wait(self, interval): + self.last_wait = interval + self.current += interval + return self.wait_result + + class TestHeartbeater(ironic_agent_base.IronicAgentTest): def setUp(self): super(TestHeartbeater, self).setUp() @@ -64,65 +78,79 @@ class TestHeartbeater(ironic_agent_base.IronicAgentTest): @mock.patch('ironic_python_agent.agent._time', autospec=True) @mock.patch('random.uniform', autospec=True) def test_heartbeat(self, mock_uniform, mock_time): - time_responses = [] - uniform_responses = [] - heartbeat_responses = [] - wait_responses = [] - expected_stop_calls = [] + clock = FakeClock() + mock_time.side_effect = clock.get + self.heartbeater.stop_event.wait.side_effect = clock.wait - # FIRST RUN: - # initial delay is 0 - expected_stop_calls.append(mock.call(0)) - wait_responses.append(False) - # next heartbeat due at t=100 - heartbeat_responses.append(100) - # random interval multiplier is 0.5 - uniform_responses.append(0.5) - # time is now 50 - time_responses.append(50) + heartbeat_mock = self.heartbeater.api.heartbeat + self.mock_agent.heartbeat_timeout = 20 - # SECOND RUN: - expected_stop_calls.append(mock.call(5)) - wait_responses.append(False) - # next heartbeat due at t=180 - heartbeat_responses.append(180) - # random interval multiplier is 0.4 - uniform_responses.append(0.4) - # time is now 80 - time_responses.append(80) - # add one response for _time in _heartbeat_expected - time_responses.append(80) + # First run right after start + mock_uniform.return_value = 0.6 + self.assertTrue(self.heartbeater._run_next()) + self.assertEqual(0, clock.last_wait) + heartbeat_mock.assert_called_once_with( + uuid=self.mock_agent.get_node_uuid.return_value, + advertise_address=self.mock_agent.advertise_address, + advertise_protocol=self.mock_agent.advertise_protocol, + generated_cert=self.mock_agent.generated_cert) + heartbeat_mock.reset_mock() + self.assertEqual(12, self.heartbeater.interval) # 20*0.6 + self.assertEqual(0, self.heartbeater.previous_heartbeat) - # THIRD RUN: - expected_stop_calls.append(mock.call(5)) - wait_responses.append(False) - # this heartbeat attempt fails - heartbeat_responses.append(Exception('uh oh!')) - # random interval multiplier is 0.5 - uniform_responses.append(0.5) - # we check the time to generate a fake deadline, now t=125 - time_responses.append(125) - # time is now 125.5 - time_responses.append(125.5) + # A few empty runs before reaching the next heartbeat + for ts in [5, 10]: + self.assertTrue(self.heartbeater._run_next()) + self.assertEqual(5, clock.last_wait) + self.assertEqual(ts, clock.current) + heartbeat_mock.assert_not_called() + self.assertEqual(0, self.heartbeater.previous_heartbeat) - # FOURTH RUN: - expected_stop_calls.append(mock.call(5)) - # Stop now - wait_responses.append(True) + # Second run when the heartbeat is due + mock_uniform.return_value = 0.4 + self.assertTrue(self.heartbeater._run_next()) + self.assertEqual(2, clock.last_wait) # 12-2*5 + self.assertTrue(heartbeat_mock.called) + heartbeat_mock.reset_mock() + self.assertEqual(8, self.heartbeater.interval) # 20*0.4 + self.assertEqual(12, self.heartbeater.previous_heartbeat) - # Hook it up and run it - mock_time.side_effect = time_responses - mock_uniform.side_effect = uniform_responses - self.mock_agent.heartbeat_timeout = 50 - self.heartbeater.api.heartbeat.side_effect = heartbeat_responses - self.heartbeater.stop_event.wait.side_effect = wait_responses - self.heartbeater.run() + # One empty run before reaching the next heartbeat + self.assertTrue(self.heartbeater._run_next()) + self.assertEqual(5, clock.last_wait) + heartbeat_mock.assert_not_called() + self.assertEqual(12, self.heartbeater.previous_heartbeat) - # Validate expectations - self.assertEqual(expected_stop_calls, - self.heartbeater.stop_event.wait.call_args_list) - self.assertEqual(self.heartbeater.api.heartbeat.call_count, 2) - self.assertEqual(mock_time.call_count, 5) + # Failed run resulting in a fast retry + mock_uniform.return_value = 1.2 + heartbeat_mock.side_effect = Exception('uh oh!') + self.assertTrue(self.heartbeater._run_next()) + self.assertEqual(3, clock.last_wait) # 8-5 + self.assertTrue(heartbeat_mock.called) + heartbeat_mock.reset_mock(side_effect=True) + self.assertEqual(6, self.heartbeater.interval) # 5*1.2 + self.assertEqual(20, self.heartbeater.previous_heartbeat) + + # One empty run because 6>5 + self.assertTrue(self.heartbeater._run_next()) + self.assertEqual(5, clock.last_wait) + heartbeat_mock.assert_not_called() + self.assertEqual(20, self.heartbeater.previous_heartbeat) + + # Retry after the remaining 1 second + mock_uniform.return_value = 0.5 + self.assertTrue(self.heartbeater._run_next()) + self.assertEqual(1, clock.last_wait) + self.assertTrue(heartbeat_mock.called) + heartbeat_mock.reset_mock() + self.assertEqual(10, self.heartbeater.interval) # 20*0.5 + self.assertEqual(26, self.heartbeater.previous_heartbeat) + + # Stop on the next empty run + clock.wait_result = True + self.assertFalse(self.heartbeater._run_next()) + heartbeat_mock.assert_not_called() + self.assertEqual(26, self.heartbeater.previous_heartbeat) @mock.patch('ironic_python_agent.agent._time', autospec=True) def test__heartbeat_expected(self, mock_time): @@ -132,7 +160,7 @@ class TestHeartbeater(ironic_agent_base.IronicAgentTest): self.heartbeater.interval = 0 self.heartbeater.heartbeat_forced = False mock_time.return_value = 0 - self.assertFalse(self.heartbeater._heartbeat_expected()) + self.assertTrue(self.heartbeater._heartbeat_expected()) # 1st cadence self.heartbeater.previous_heartbeat = 0 diff --git a/releasenotes/notes/heartbeat-jitter-620bbcba591d2894.yaml b/releasenotes/notes/heartbeat-jitter-620bbcba591d2894.yaml new file mode 100644 index 000000000..58cec2433 --- /dev/null +++ b/releasenotes/notes/heartbeat-jitter-620bbcba591d2894.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Adds random jitter to retried heartbeats after Ironic returns an error. + Previously, heartbeats would be retried after 5 seconds, potentially + causing a thundering herd problem if many nodes fail to heartbeat at + the same time.