Add jitter to inspection command reporting
Adds a jitter and backoff behavior to the inspector data collection command to prevent thundering herd sorts of issues. Change-Id: I00517010991cbe43d5958c7d76019ef6fe89c983
This commit is contained in:
parent
68a71513f0
commit
368ab136f0
@ -11,6 +11,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
import select
|
import select
|
||||||
import threading
|
import threading
|
||||||
|
|
||||||
@ -27,6 +28,18 @@ LOG = log.getLogger(__name__)
|
|||||||
class IronicInspection(threading.Thread):
|
class IronicInspection(threading.Thread):
|
||||||
"""Class for manual inspection functionality."""
|
"""Class for manual inspection functionality."""
|
||||||
|
|
||||||
|
# If we could wait at most N seconds between heartbeats (or in case of an
|
||||||
|
# error) we will instead wait r x N seconds, where r is a random value
|
||||||
|
# between these multipliers.
|
||||||
|
min_jitter_multiplier = 0.7
|
||||||
|
max_jitter_multiplier = 1.2
|
||||||
|
|
||||||
|
# Exponential backoff values used in case of an error. In reality we will
|
||||||
|
# only wait a portion of either of these delays based on the jitter
|
||||||
|
# multipliers.
|
||||||
|
max_delay = 4 * cfg.CONF.introspection_daemon_post_interval
|
||||||
|
backoff_factor = 2.7
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super(IronicInspection, self).__init__()
|
super(IronicInspection, self).__init__()
|
||||||
if bool(cfg.CONF.keyfile) != bool(cfg.CONF.certfile):
|
if bool(cfg.CONF.keyfile) != bool(cfg.CONF.certfile):
|
||||||
@ -36,7 +49,7 @@ class IronicInspection(threading.Thread):
|
|||||||
def _run(self):
|
def _run(self):
|
||||||
try:
|
try:
|
||||||
daemon_mode = cfg.CONF.introspection_daemon
|
daemon_mode = cfg.CONF.introspection_daemon
|
||||||
post_interval = cfg.CONF.introspection_daemon_post_interval
|
interval = cfg.CONF.introspection_daemon_post_interval
|
||||||
|
|
||||||
inspector.inspect()
|
inspector.inspect()
|
||||||
if not daemon_mode:
|
if not daemon_mode:
|
||||||
@ -46,29 +59,55 @@ class IronicInspection(threading.Thread):
|
|||||||
self.reader, self.writer = os.pipe()
|
self.reader, self.writer = os.pipe()
|
||||||
p = select.poll()
|
p = select.poll()
|
||||||
p.register(self.reader)
|
p.register(self.reader)
|
||||||
|
exception_encountered = False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while daemon_mode:
|
while daemon_mode:
|
||||||
LOG.info('Sleeping until next check-in.')
|
interval_multiplier = random.uniform(
|
||||||
# TODO(TheJulia): It would likely be good to introduce
|
self.min_jitter_multiplier,
|
||||||
# some jitter into this at some point...
|
self.max_jitter_multiplier)
|
||||||
if p.poll(post_interval * 1000):
|
interval = interval * interval_multiplier
|
||||||
|
log_msg = 'sleeping before next inspection, interval: %s'
|
||||||
|
LOG.info(log_msg, interval)
|
||||||
|
|
||||||
|
if p.poll(interval * 1000):
|
||||||
if os.read(self.reader, 1).decode() == 'a':
|
if os.read(self.reader, 1).decode() == 'a':
|
||||||
break
|
break
|
||||||
try:
|
try:
|
||||||
inspector.inspect()
|
inspector.inspect()
|
||||||
|
if exception_encountered:
|
||||||
|
interval = min(
|
||||||
|
interval,
|
||||||
|
cfg.CONF.introspection_daemon_post_interval)
|
||||||
|
exception_encountered = False
|
||||||
except errors.InspectionError as e:
|
except errors.InspectionError as e:
|
||||||
# Failures happen, no reason to exit as
|
# Failures happen, no reason to exit as
|
||||||
# the failure could be intermittent.
|
# the failure could be intermittent.
|
||||||
LOG.warning('Error reporting introspection '
|
LOG.warning('Error reporting introspection '
|
||||||
'data: %(err)s',
|
'data: %(err)s',
|
||||||
{'err': e})
|
{'err': e})
|
||||||
|
exception_encountered = True
|
||||||
|
interval = min(interval * self.backoff_factor,
|
||||||
|
self.max_delay)
|
||||||
|
|
||||||
except exception.ServiceLookupFailure as e:
|
except exception.ServiceLookupFailure as e:
|
||||||
# Likely a mDNS lookup failure. We should
|
# Likely a mDNS lookup failure. We should
|
||||||
# keep retrying.
|
# keep retrying.
|
||||||
LOG.error('Error looking up introspection '
|
LOG.error('Error looking up introspection '
|
||||||
'endpoint: %(err)s',
|
'endpoint: %(err)s',
|
||||||
{'err': e})
|
{'err': e})
|
||||||
|
exception_encountered = True
|
||||||
|
interval = min(interval * self.backoff_factor,
|
||||||
|
self.max_delay)
|
||||||
|
except Exception as e:
|
||||||
|
# General failure such as requests ConnectionError
|
||||||
|
LOG.error('Error occured attempting to connect to '
|
||||||
|
'connect to the introspection service. '
|
||||||
|
'Error: %(err)s',
|
||||||
|
{'err': e})
|
||||||
|
exception_encountered = True
|
||||||
|
interval = min(interval * self.backoff_factor,
|
||||||
|
self.max_delay)
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
os.close(self.reader)
|
os.close(self.reader)
|
||||||
|
@ -0,0 +1,13 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Fixes risk of potential active node thundering herd by introducing
|
||||||
|
jitter handling into the ``ironic-collect-introspection-data`` command.
|
||||||
|
By default, the jitter will cause the
|
||||||
|
``introspection_daemon_post_interval`` configuration parameter based
|
||||||
|
time value to be honored within a range of 70% to 120% of the
|
||||||
|
desired time window.
|
||||||
|
|
||||||
|
Should failures occur after the initial connection and start of the
|
||||||
|
daemon mode for introspection data collection, the fallback is a maximum
|
||||||
|
of 400% of the introspection daemon post interval.
|
Loading…
x
Reference in New Issue
Block a user