368ab136f0
Adds a jitter and backoff behavior to the inspector data collection command to prevent thundering heard sorts of issues. Change-Id: I00517010991cbe43d5958c7d76019ef6fe89c983
126 lines
5.0 KiB
Python
126 lines
5.0 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import os
|
|
import random
|
|
import select
|
|
import threading
|
|
|
|
from ironic_lib import exception
|
|
from oslo_config import cfg
|
|
from oslo_log import log
|
|
|
|
from ironic_python_agent import errors
|
|
from ironic_python_agent import inspector
|
|
|
|
LOG = log.getLogger(__name__)
|
|
|
|
|
|
class IronicInspection(threading.Thread):
|
|
"""Class for manual inspection functionality."""
|
|
|
|
# If we could wait at most N seconds between heartbeats (or in case of an
|
|
# error) we will instead wait r x N seconds, where r is a random value
|
|
# between these multipliers.
|
|
min_jitter_multiplier = 0.7
|
|
max_jitter_multiplier = 1.2
|
|
|
|
# Exponential backoff values used in case of an error. In reality we will
|
|
# only wait a portion of either of these delays based on the jitter
|
|
# multipliers.
|
|
max_delay = 4 * cfg.CONF.introspection_daemon_post_interval
|
|
backoff_factor = 2.7
|
|
|
|
def __init__(self):
|
|
super(IronicInspection, self).__init__()
|
|
if bool(cfg.CONF.keyfile) != bool(cfg.CONF.certfile):
|
|
LOG.warning("Only one of 'keyfile' and 'certfile' options is "
|
|
"defined in config file. Its value will be ignored.")
|
|
|
|
def _run(self):
|
|
try:
|
|
daemon_mode = cfg.CONF.introspection_daemon
|
|
interval = cfg.CONF.introspection_daemon_post_interval
|
|
|
|
inspector.inspect()
|
|
if not daemon_mode:
|
|
# No reason to continue unless we're in daemon mode.
|
|
return
|
|
|
|
self.reader, self.writer = os.pipe()
|
|
p = select.poll()
|
|
p.register(self.reader)
|
|
exception_encountered = False
|
|
|
|
try:
|
|
while daemon_mode:
|
|
interval_multiplier = random.uniform(
|
|
self.min_jitter_multiplier,
|
|
self.max_jitter_multiplier)
|
|
interval = interval * interval_multiplier
|
|
log_msg = 'sleeping before next inspection, interval: %s'
|
|
LOG.info(log_msg, interval)
|
|
|
|
if p.poll(interval * 1000):
|
|
if os.read(self.reader, 1).decode() == 'a':
|
|
break
|
|
try:
|
|
inspector.inspect()
|
|
if exception_encountered:
|
|
interval = min(
|
|
interval,
|
|
cfg.CONF.introspection_daemon_post_interval)
|
|
exception_encountered = False
|
|
except errors.InspectionError as e:
|
|
# Failures happen, no reason to exit as
|
|
# the failure could be intermittent.
|
|
LOG.warning('Error reporting introspection '
|
|
'data: %(err)s',
|
|
{'err': e})
|
|
exception_encountered = True
|
|
interval = min(interval * self.backoff_factor,
|
|
self.max_delay)
|
|
|
|
except exception.ServiceLookupFailure as e:
|
|
# Likely a mDNS lookup failure. We should
|
|
# keep retrying.
|
|
LOG.error('Error looking up introspection '
|
|
'endpoint: %(err)s',
|
|
{'err': e})
|
|
exception_encountered = True
|
|
interval = min(interval * self.backoff_factor,
|
|
self.max_delay)
|
|
except Exception as e:
|
|
# General failure such as requests ConnectionError
|
|
LOG.error('Error occured attempting to connect to '
|
|
'connect to the introspection service. '
|
|
'Error: %(err)s',
|
|
{'err': e})
|
|
exception_encountered = True
|
|
interval = min(interval * self.backoff_factor,
|
|
self.max_delay)
|
|
|
|
finally:
|
|
os.close(self.reader)
|
|
os.close(self.writer)
|
|
self.reader = None
|
|
self.writer = None
|
|
except errors.InspectionError as e:
|
|
msg = "Inspection failed: %s" % e
|
|
raise errors.InspectionError(msg)
|
|
|
|
def run(self):
|
|
"""Run Inspection."""
|
|
if not cfg.CONF.inspection_callback_url:
|
|
cfg.CONF.set_override('inspection_callback_url', 'mdns')
|
|
self._run()
|