Add jitter to inspection command reporting

Adds a jitter and backoff behavior to the inspector data
collection command to prevent thundering heard sorts of
issues.

Change-Id: I00517010991cbe43d5958c7d76019ef6fe89c983
This commit is contained in:
Julia Kreger 2020-03-25 10:05:38 -07:00
parent 68a71513f0
commit 368ab136f0
2 changed files with 57 additions and 5 deletions

View File

@ -11,6 +11,7 @@
# limitations under the License.
import os
import random
import select
import threading
@ -27,6 +28,18 @@ LOG = log.getLogger(__name__)
class IronicInspection(threading.Thread):
"""Class for manual inspection functionality."""
# If we could wait at most N seconds between heartbeats (or in case of an
# error) we will instead wait r x N seconds, where r is a random value
# between these multipliers.
min_jitter_multiplier = 0.7
max_jitter_multiplier = 1.2
# Exponential backoff values used in case of an error. In reality we will
# only wait a portion of either of these delays based on the jitter
# multipliers.
max_delay = 4 * cfg.CONF.introspection_daemon_post_interval
backoff_factor = 2.7
def __init__(self):
super(IronicInspection, self).__init__()
if bool(cfg.CONF.keyfile) != bool(cfg.CONF.certfile):
@ -36,7 +49,7 @@ class IronicInspection(threading.Thread):
def _run(self):
try:
daemon_mode = cfg.CONF.introspection_daemon
post_interval = cfg.CONF.introspection_daemon_post_interval
interval = cfg.CONF.introspection_daemon_post_interval
inspector.inspect()
if not daemon_mode:
@ -46,29 +59,55 @@ class IronicInspection(threading.Thread):
self.reader, self.writer = os.pipe()
p = select.poll()
p.register(self.reader)
exception_encountered = False
try:
while daemon_mode:
LOG.info('Sleeping until next check-in.')
# TODO(TheJulia): It would likely be good to introduce
# some jitter into this at some point...
if p.poll(post_interval * 1000):
interval_multiplier = random.uniform(
self.min_jitter_multiplier,
self.max_jitter_multiplier)
interval = interval * interval_multiplier
log_msg = 'sleeping before next inspection, interval: %s'
LOG.info(log_msg, interval)
if p.poll(interval * 1000):
if os.read(self.reader, 1).decode() == 'a':
break
try:
inspector.inspect()
if exception_encountered:
interval = min(
interval,
cfg.CONF.introspection_daemon_post_interval)
exception_encountered = False
except errors.InspectionError as e:
# Failures happen, no reason to exit as
# the failure could be intermittent.
LOG.warning('Error reporting introspection '
'data: %(err)s',
{'err': e})
exception_encountered = True
interval = min(interval * self.backoff_factor,
self.max_delay)
except exception.ServiceLookupFailure as e:
# Likely a mDNS lookup failure. We should
# keep retrying.
LOG.error('Error looking up introspection '
'endpoint: %(err)s',
{'err': e})
exception_encountered = True
interval = min(interval * self.backoff_factor,
self.max_delay)
except Exception as e:
# General failure such as requests ConnectionError
LOG.error('Error occured attempting to connect to '
'connect to the introspection service. '
'Error: %(err)s',
{'err': e})
exception_encountered = True
interval = min(interval * self.backoff_factor,
self.max_delay)
finally:
os.close(self.reader)

View File

@ -0,0 +1,13 @@
---
fixes:
- |
Fixes risk of potential active node thundering heard by introducing
jitter handling into the ``ironic-collect-introspection-data``.
By default, the jitter will cause the
``introspection_daemon_post_interval`` configuration parameter based
time value to be honored between in a range of 70% to 120% of the
desired time window.
Should failures occur after the initial connection and start of the
daemon mode for introspection data collection, the fallback is a maximum
of 400% of the introspection daemon post interval.