From 368ab136f0be343e7257f85f19fe4c8cc193e77d Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Wed, 25 Mar 2020 10:05:38 -0700 Subject: [PATCH] Add jitter to inspection command reporting Adds a jitter and backoff behavior to the inspector data collection command to prevent thundering heard sorts of issues. Change-Id: I00517010991cbe43d5958c7d76019ef6fe89c983 --- ironic_python_agent/inspect.py | 49 +++++++++++++++++-- ...r-inspection-command-5a226927757a0308.yaml | 13 +++++ 2 files changed, 57 insertions(+), 5 deletions(-) create mode 100644 releasenotes/notes/jitter-for-inspection-command-5a226927757a0308.yaml diff --git a/ironic_python_agent/inspect.py b/ironic_python_agent/inspect.py index b8b8fd73b..17a6b91d3 100644 --- a/ironic_python_agent/inspect.py +++ b/ironic_python_agent/inspect.py @@ -11,6 +11,7 @@ # limitations under the License. import os +import random import select import threading @@ -27,6 +28,18 @@ LOG = log.getLogger(__name__) class IronicInspection(threading.Thread): """Class for manual inspection functionality.""" + # If we could wait at most N seconds between heartbeats (or in case of an + # error) we will instead wait r x N seconds, where r is a random value + # between these multipliers. + min_jitter_multiplier = 0.7 + max_jitter_multiplier = 1.2 + + # Exponential backoff values used in case of an error. In reality we will + # only wait a portion of either of these delays based on the jitter + # multipliers. + max_delay = 4 * cfg.CONF.introspection_daemon_post_interval + backoff_factor = 2.7 + def __init__(self): super(IronicInspection, self).__init__() if bool(cfg.CONF.keyfile) != bool(cfg.CONF.certfile): @@ -36,7 +49,7 @@ class IronicInspection(threading.Thread): def _run(self): try: daemon_mode = cfg.CONF.introspection_daemon - post_interval = cfg.CONF.introspection_daemon_post_interval + interval = cfg.CONF.introspection_daemon_post_interval inspector.inspect() if not daemon_mode: @@ -46,29 +59,55 @@ class IronicInspection(threading.Thread): self.reader, self.writer = os.pipe() p = select.poll() p.register(self.reader) + exception_encountered = False try: while daemon_mode: - LOG.info('Sleeping until next check-in.') - # TODO(TheJulia): It would likely be good to introduce - # some jitter into this at some point... - if p.poll(post_interval * 1000): + interval_multiplier = random.uniform( + self.min_jitter_multiplier, + self.max_jitter_multiplier) + interval = interval * interval_multiplier + log_msg = 'sleeping before next inspection, interval: %s' + LOG.info(log_msg, interval) + + if p.poll(interval * 1000): if os.read(self.reader, 1).decode() == 'a': break try: inspector.inspect() + if exception_encountered: + interval = min( + interval, + cfg.CONF.introspection_daemon_post_interval) + exception_encountered = False except errors.InspectionError as e: # Failures happen, no reason to exit as # the failure could be intermittent. LOG.warning('Error reporting introspection ' 'data: %(err)s', {'err': e}) + exception_encountered = True + interval = min(interval * self.backoff_factor, + self.max_delay) + except exception.ServiceLookupFailure as e: # Likely a mDNS lookup failure. We should # keep retrying. LOG.error('Error looking up introspection ' 'endpoint: %(err)s', {'err': e}) + exception_encountered = True + interval = min(interval * self.backoff_factor, + self.max_delay) + except Exception as e: + # General failure such as requests ConnectionError + LOG.error('Error occured attempting to connect to ' + 'connect to the introspection service. ' + 'Error: %(err)s', + {'err': e}) + exception_encountered = True + interval = min(interval * self.backoff_factor, + self.max_delay) finally: os.close(self.reader) diff --git a/releasenotes/notes/jitter-for-inspection-command-5a226927757a0308.yaml b/releasenotes/notes/jitter-for-inspection-command-5a226927757a0308.yaml new file mode 100644 index 000000000..77edb00ac --- /dev/null +++ b/releasenotes/notes/jitter-for-inspection-command-5a226927757a0308.yaml @@ -0,0 +1,13 @@ +--- +fixes: + - | + Fixes risk of potential active node thundering heard by introducing + jitter handling into the ``ironic-collect-introspection-data``. + By default, the jitter will cause the + ``introspection_daemon_post_interval`` configuration parameter based + time value to be honored between in a range of 70% to 120% of the + desired time window. + + Should failures occur after the initial connection and start of the + daemon mode for introspection data collection, the fallback is a maximum + of 400% of the introspection daemon post interval.