Add jitter to inspection command reporting
Adds jitter and backoff behavior to the inspector data collection command to prevent thundering herd sorts of issues. Change-Id: I00517010991cbe43d5958c7d76019ef6fe89c983
This commit is contained in:
parent
68a71513f0
commit
368ab136f0
@ -11,6 +11,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import random
|
||||
import select
|
||||
import threading
|
||||
|
||||
@ -27,6 +28,18 @@ LOG = log.getLogger(__name__)
|
||||
class IronicInspection(threading.Thread):
|
||||
"""Class for manual inspection functionality."""
|
||||
|
||||
# If we could wait at most N seconds between heartbeats (or in case of an
|
||||
# error) we will instead wait r x N seconds, where r is a random value
|
||||
# between these multipliers.
|
||||
min_jitter_multiplier = 0.7
|
||||
max_jitter_multiplier = 1.2
|
||||
|
||||
# Exponential backoff values used in case of an error. In reality we will
|
||||
# only wait a portion of either of these delays based on the jitter
|
||||
# multipliers.
|
||||
max_delay = 4 * cfg.CONF.introspection_daemon_post_interval
|
||||
backoff_factor = 2.7
|
||||
|
||||
def __init__(self):
|
||||
super(IronicInspection, self).__init__()
|
||||
if bool(cfg.CONF.keyfile) != bool(cfg.CONF.certfile):
|
||||
@ -36,7 +49,7 @@ class IronicInspection(threading.Thread):
|
||||
def _run(self):
|
||||
try:
|
||||
daemon_mode = cfg.CONF.introspection_daemon
|
||||
post_interval = cfg.CONF.introspection_daemon_post_interval
|
||||
interval = cfg.CONF.introspection_daemon_post_interval
|
||||
|
||||
inspector.inspect()
|
||||
if not daemon_mode:
|
||||
@ -46,29 +59,55 @@ class IronicInspection(threading.Thread):
|
||||
self.reader, self.writer = os.pipe()
|
||||
p = select.poll()
|
||||
p.register(self.reader)
|
||||
exception_encountered = False
|
||||
|
||||
try:
|
||||
while daemon_mode:
|
||||
LOG.info('Sleeping until next check-in.')
|
||||
# TODO(TheJulia): It would likely be good to introduce
|
||||
# some jitter into this at some point...
|
||||
if p.poll(post_interval * 1000):
|
||||
interval_multiplier = random.uniform(
|
||||
self.min_jitter_multiplier,
|
||||
self.max_jitter_multiplier)
|
||||
interval = interval * interval_multiplier
|
||||
log_msg = 'sleeping before next inspection, interval: %s'
|
||||
LOG.info(log_msg, interval)
|
||||
|
||||
if p.poll(interval * 1000):
|
||||
if os.read(self.reader, 1).decode() == 'a':
|
||||
break
|
||||
try:
|
||||
inspector.inspect()
|
||||
if exception_encountered:
|
||||
interval = min(
|
||||
interval,
|
||||
cfg.CONF.introspection_daemon_post_interval)
|
||||
exception_encountered = False
|
||||
except errors.InspectionError as e:
|
||||
# Failures happen, no reason to exit as
|
||||
# the failure could be intermittent.
|
||||
LOG.warning('Error reporting introspection '
|
||||
'data: %(err)s',
|
||||
{'err': e})
|
||||
exception_encountered = True
|
||||
interval = min(interval * self.backoff_factor,
|
||||
self.max_delay)
|
||||
|
||||
except exception.ServiceLookupFailure as e:
|
||||
# Likely a mDNS lookup failure. We should
|
||||
# keep retrying.
|
||||
LOG.error('Error looking up introspection '
|
||||
'endpoint: %(err)s',
|
||||
{'err': e})
|
||||
exception_encountered = True
|
||||
interval = min(interval * self.backoff_factor,
|
||||
self.max_delay)
|
||||
except Exception as e:
|
||||
# General failure such as requests ConnectionError
|
||||
LOG.error('Error occured attempting to connect to '
|
||||
'connect to the introspection service. '
|
||||
'Error: %(err)s',
|
||||
{'err': e})
|
||||
exception_encountered = True
|
||||
interval = min(interval * self.backoff_factor,
|
||||
self.max_delay)
|
||||
|
||||
finally:
|
||||
os.close(self.reader)
|
||||
|
@ -0,0 +1,13 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes risk of potential active node thundering herd by introducing
|
||||
jitter handling into the ``ironic-collect-introspection-data`` command.
|
||||
By default, the jitter will cause the
|
||||
``introspection_daemon_post_interval`` configuration parameter based
|
||||
time value to be honored within a range of 70% to 120% of the
|
||||
desired time window.
|
||||
|
||||
Should failures occur after the initial connection and start of the
|
||||
daemon mode for introspection data collection, the fallback is a maximum
|
||||
of 400% of the introspection daemon post interval.
|
Loading…
Reference in New Issue
Block a user