# Copyright 2013 Rackspace, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import collections import importlib.metadata import ipaddress import random import socket import threading import time from urllib import parse as urlparse import eventlet from ironic_lib import exception as lib_exc from ironic_lib import mdns from oslo_concurrency import processutils from oslo_config import cfg from oslo_log import log from ironic_python_agent.api import app from ironic_python_agent import config from ironic_python_agent import encoding from ironic_python_agent import errors from ironic_python_agent.extensions import base from ironic_python_agent import hardware from ironic_python_agent import inspector from ironic_python_agent import ironic_api_client from ironic_python_agent import utils LOG = log.getLogger(__name__) # Time(in seconds) to wait for any of the interfaces to be up # before lookup of the node is attempted NETWORK_WAIT_TIMEOUT = 60 # Time(in seconds) to wait before reattempt NETWORK_WAIT_RETRY = 5 cfg.CONF.import_group('metrics', 'ironic_lib.metrics_utils') cfg.CONF.import_group('metrics_statsd', 'ironic_lib.metrics_statsd') Host = collections.namedtuple('Host', ['hostname', 'port']) def _time(): """Wraps time.time() for simpler testing.""" return time.time() class IronicPythonAgentStatus(encoding.Serializable): """Represents the status of an agent.""" serializable_fields = ('started_at', 'version') def __init__(self, started_at, version): self.started_at = started_at self.version = version def _with_jitter(value, min_multiplier, max_multiplier): interval_multiplier = random.uniform(min_multiplier, max_multiplier) return value * interval_multiplier class IronicPythonAgentHeartbeater(threading.Thread): """Thread that periodically heartbeats to Ironic.""" # If we could wait at most N seconds between heartbeats, we will instead # wait r x N seconds, where r is a random value between these multipliers. min_jitter_multiplier = 0.3 max_jitter_multiplier = 0.6 # Error retry between 5 and 10 seconds, at least 12 retries with # the default ramdisk_heartbeat_timeout of 300 and the worst case interval # jitter of 0.6. min_heartbeat_interval = 5 min_error_jitter_multiplier = 1.0 max_error_jitter_multiplier = 2.0 def __init__(self, agent): """Initialize the heartbeat thread. :param agent: an :class:`ironic_python_agent.agent.IronicPythonAgent` instance. """ super(IronicPythonAgentHeartbeater, self).__init__() self.agent = agent self.stop_event = threading.Event() self.api = agent.api_client self.interval = 0 self.heartbeat_forced = False self.previous_heartbeat = 0 def run(self): """Start the heartbeat thread.""" # The first heartbeat happens immediately LOG.info('Starting heartbeater') self.agent.set_agent_advertise_addr() while self._run_next(): eventlet.sleep(0) def _run_next(self): # The logic here makes sure we don't wait exactly 5 seconds more or # less regardless of the current interval since it may cause a # thundering herd problem when a lot of agents are heartbeating. # Essentially, if the next heartbeat is due in 2 seconds, don't wait 5. # But if the next one is scheduled in 2 minutes, do wait 5 to account # for forced heartbeats. wait = min( self.min_heartbeat_interval, # This operation checks how much of the initially planned interval # we have still left. Compare with 0 in case we overshoot the goal. max(0, self.interval - (_time() - self.previous_heartbeat)), ) if self.stop_event.wait(wait): return False # done if self._heartbeat_expected(): self.do_heartbeat() return True def _heartbeat_expected(self): elapsed = _time() - self.previous_heartbeat # Normal heartbeating if elapsed >= self.interval: return True # Forced heartbeating, but once in 5 seconds if self.heartbeat_forced and elapsed > self.min_heartbeat_interval: return True def do_heartbeat(self): """Send a heartbeat to Ironic.""" try: self.api.heartbeat( uuid=self.agent.get_node_uuid(), advertise_address=self.agent.advertise_address, advertise_protocol=self.agent.advertise_protocol, generated_cert=self.agent.generated_cert, ) except Exception as exc: if isinstance(exc, errors.HeartbeatConflictError): LOG.warning('conflict error sending heartbeat to %s', self.agent.api_urls) else: LOG.exception('error sending heartbeat to %s', self.agent.api_urls) self.interval = _with_jitter(self.min_heartbeat_interval, self.min_error_jitter_multiplier, self.max_error_jitter_multiplier) else: LOG.debug('heartbeat successful') self.heartbeat_forced = False self.interval = _with_jitter(self.agent.heartbeat_timeout, self.min_jitter_multiplier, self.max_jitter_multiplier) self.previous_heartbeat = _time() LOG.info('sleeping before next heartbeat, interval: %s', self.interval) def force_heartbeat(self): self.heartbeat_forced = True def stop(self): """Stop the heartbeat thread.""" LOG.info('stopping heartbeater') self.stop_event.set() return self.join() class IronicPythonAgent(base.ExecuteCommandMixin): """Class for base agent functionality.""" @classmethod def from_config(cls, conf): return cls(conf.api_url, Host(hostname=conf.advertise_host, port=conf.advertise_port), Host(hostname=conf.listen_host, port=conf.listen_port), conf.ip_lookup_attempts, conf.ip_lookup_sleep, conf.network_interface, conf.lookup_timeout, conf.lookup_interval, False, conf.agent_token, conf.hardware_initialization_delay, conf.advertise_protocol) def __init__(self, api_url, advertise_address, listen_address, ip_lookup_attempts, ip_lookup_sleep, network_interface, lookup_timeout, lookup_interval, standalone, agent_token, hardware_initialization_delay=0, advertise_protocol='http'): super(IronicPythonAgent, self).__init__() if bool(cfg.CONF.keyfile) != bool(cfg.CONF.certfile): LOG.warning("Only one of 'keyfile' and 'certfile' options is " "defined in config file. Its value will be ignored.") self.ext_mgr = base.init_ext_manager(self) if (not api_url or api_url == 'mdns') and not standalone: try: api_url, params = mdns.get_endpoint('baremetal') except lib_exc.ServiceLookupFailure: if api_url: # mDNS explicitly requested, report failure. raise else: # implicit fallback to mDNS, do not fail (maybe we're only # running inspection). LOG.warning('Could not get baremetal endpoint from mDNS, ' 'will not heartbeat') else: config.override(params) if api_url: self.api_urls = list(filter(None, api_url.split(','))) else: self.api_urls = None if self.api_urls: self.api_client = ironic_api_client.APIClient(self.api_urls) self.heartbeater = IronicPythonAgentHeartbeater(self) self.listen_address = listen_address self.advertise_address = advertise_address self.advertise_protocol = advertise_protocol self.version = importlib.metadata.version('ironic-python-agent') self.api = app.Application(self, cfg.CONF) self.heartbeat_timeout = None self.started_at = None self.node = None # lookup timeout in seconds self.lookup_timeout = lookup_timeout self.lookup_interval = lookup_interval self.ip_lookup_attempts = ip_lookup_attempts self.ip_lookup_sleep = ip_lookup_sleep self.network_interface = network_interface self.standalone = standalone self.hardware_initialization_delay = hardware_initialization_delay # IPA will stop serving requests and exit after this is set to False self.serve_api = True self.agent_token = agent_token # Allows this to be turned on by the conductor while running, # in the event of long running ramdisks where the conductor # got upgraded somewhere along the way. self.agent_token_required = cfg.CONF.agent_token_required self.generated_cert = None def get_status(self): """Retrieve a serializable status. :returns: a :class:`ironic_python_agent.agent.IronicPythonAgent` instance describing the agent's status. """ return IronicPythonAgentStatus( started_at=self.started_at, version=self.version ) def validate_agent_token(self, token): # We did not get a token, i.e. None and # we've previously seen a token, which is # a mid-cluster upgrade case with long-running ramdisks. if (not token and self.agent_token and not self.agent_token_required): # TODO(TheJulia): Rip this out during or after the V cycle. LOG.warning('Agent token for requests are not required ' 'by the conductor, yet we received a token. ' 'Cluster may be mid-upgrade. Support to ' 'not fail in this condition will be removed in ' 'the Victoria development cycle.') # Tell the API everything is okay. return True return self.agent_token == token def _get_route_source(self, dest): """Get the IP address to send packages to destination.""" try: out, _err = utils.execute('ip', 'route', 'get', dest) except (EnvironmentError, processutils.ProcessExecutionError) as e: LOG.warning('Cannot get route to host %(dest)s: %(err)s', {'dest': dest, 'err': e}) return try: source = out.strip().split('\n')[0].split('src')[1].split()[0] except IndexError: LOG.warning('No route to host %(dest)s, route record: %(rec)s', {'dest': dest, 'rec': out}) return try: if ipaddress.ip_address(source).is_link_local: LOG.info('Ignoring link-local source to %(dest)s: %(rec)s', {'dest': dest, 'rec': out}) return except ValueError as exc: LOG.warning('Invalid IP address %(addr)s returned as a route ' 'to host %(dest)s: %(err)s', {'dest': dest, 'addr': source, 'err': exc}) return source def _find_routable_addr(self): ips = set() for api_url in self.api_urls: ironic_host = urlparse.urlparse(api_url).hostname # Try resolving it in case it's not an IP address try: addrs = socket.getaddrinfo(ironic_host, 0) except socket.gaierror: LOG.debug('Could not resolve %s, maybe no DNS', ironic_host) ips.add(ironic_host) continue ips.update(addr for _, _, _, _, (addr, *_) in addrs) for attempt in range(self.ip_lookup_attempts): for ironic_host in ips: found_ip = self._get_route_source(ironic_host) if found_ip: return found_ip time.sleep(self.ip_lookup_sleep) def set_agent_advertise_addr(self): """Set advertised IP address for the agent, if not already set. If agent's advertised IP address is still default (None), try to find a better one. If the agent's network interface is None, replace that as well. :raises: LookupAgentIPError if an IP address could not be found """ if self.advertise_address.hostname is not None: return found_ip = None if self.network_interface is not None: # TODO(dtantsur): deprecate this found_ip = hardware.dispatch_to_managers('get_ipv4_addr', self.network_interface) else: found_ip = self._find_routable_addr() if found_ip: self.advertise_address = Host(hostname=found_ip, port=self.advertise_address.port) else: raise errors.LookupAgentIPError('Agent could not find a valid IP ' 'address.') def get_node_uuid(self): """Get UUID for Ironic node. If the agent has not yet heartbeated to Ironic, it will not have the UUID and this will raise an exception. :returns: A string containing the UUID for the Ironic node. :raises: UnknownNodeError if UUID is unknown. """ if self.node is None or 'uuid' not in self.node: raise errors.UnknownNodeError() return self.node['uuid'] def list_command_results(self): """Get a list of command results. :returns: list of :class:`ironic_python_agent.extensions.base. BaseCommandResult` objects. """ return list(self.command_results.values()) def get_command_result(self, result_id): """Get a specific command result by ID. :returns: a :class:`ironic_python_agent.extensions.base. BaseCommandResult` object. :raises: RequestedObjectNotFoundError if command with the given ID is not found. """ try: return self.command_results[result_id] except KeyError: raise errors.RequestedObjectNotFoundError('Command Result', result_id) def force_heartbeat(self): if not self.standalone: self.heartbeater.force_heartbeat() def _wait_for_interface(self): """Wait until at least one interface is up.""" wait_till = time.time() + NETWORK_WAIT_TIMEOUT while time.time() < wait_till: interfaces = hardware.dispatch_to_managers( 'list_network_interfaces') if not any(ifc.mac_address for ifc in interfaces): LOG.debug('Network is not up yet. ' 'No valid interfaces found, retrying ...') time.sleep(NETWORK_WAIT_RETRY) else: break else: LOG.warning("No valid network interfaces found. " "Node lookup will probably fail.") def _start_auto_tls(self): # NOTE(dtantsur): if listen_tls is True, assume static TLS # configuration and don't auto-generate anything. if cfg.CONF.listen_tls or not cfg.CONF.enable_auto_tls: LOG.debug('Automated TLS is disabled') return None, None if not self.api_urls or not self.api_client.supports_auto_tls(): LOG.warning('Ironic does not support automated TLS') return None, None self.set_agent_advertise_addr() LOG.info('Generating TLS parameters automatically for IP %s', self.advertise_address.hostname) tls_info = hardware.dispatch_to_managers( 'generate_tls_certificate', self.advertise_address.hostname) self.generated_cert = tls_info.text self.advertise_protocol = 'https' return tls_info.path, tls_info.private_key_path def serve_ipa_api(self): """Serve the API until an extension terminates it.""" cert_file, key_file = self._start_auto_tls() self.api.start(cert_file, key_file) if not self.standalone and self.api_urls: # Don't start heartbeating until the server is listening self.heartbeater.start() try: while self.serve_api: eventlet.sleep(0.1) except KeyboardInterrupt: LOG.info('Caught keyboard interrupt, exiting') self.api.stop() def process_lookup_data(self, content): """Update agent configuration from lookup data.""" self.node = content['node'] LOG.info('Lookup succeeded, node UUID is %s', self.node['uuid']) hardware.cache_node(self.node) self.heartbeat_timeout = content['config']['heartbeat_timeout'] # Update config with values from Ironic config = content.get('config', {}) if config.get('metrics'): for opt, val in config.items(): setattr(cfg.CONF.metrics, opt, val) if config.get('metrics_statsd'): for opt, val in config.items(): setattr(cfg.CONF.metrics_statsd, opt, val) if config.get('disable_deep_image_inspection') is not None: cfg.CONF.set_override('disable_deep_image_inspection', config['disable_deep_image_inspection']) if config.get('permitted_image_formats') is not None: cfg.CONF.set_override('permitted_image_formats', config['permitted_image_formats']) md5_allowed = config.get('agent_md5_checksum_enable') if md5_allowed is not None: cfg.CONF.set_override('md5_enabled', md5_allowed) if config.get('agent_token_required'): self.agent_token_required = True token = config.get('agent_token') if token: if len(token) >= 32: LOG.debug('Agent token recorded as designated by ' 'the ironic installation.') self.agent_token = token # set with-in the API client. if not self.standalone: self.api_client.agent_token = token elif token == '******': LOG.warning('The agent token has already been ' 'retrieved. IPA may not operate as ' 'intended and the deployment may fail ' 'depending on settings in the ironic ' 'deployment.') if not self.agent_token and self.agent_token_required: LOG.error('Ironic is signaling that agent tokens ' 'are required, however we do not have ' 'a token on file. ' 'This is likely **FATAL**.') else: LOG.info('An invalid token was received.') if self.agent_token and not self.standalone: # Explicitly set the token in our API client before # starting heartbeat operations. self.api_client.agent_token = self.agent_token def run(self): """Run the Ironic Python Agent.""" LOG.info('Starting ironic-python-agent version: %s', self.version) # Get the UUID so we can heartbeat to Ironic. Raises LookupNodeError # if there is an issue (uncaught, restart agent) self.started_at = _time() # Attempt to sync the software clock utils.sync_clock(ignore_errors=True) # Cached hw managers at runtime, not load time. See bug 1490008. hardware.get_managers() # Operator-settable delay before hardware actually comes up. # Helps with slow RAID drivers - see bug 1582797. if self.hardware_initialization_delay > 0: LOG.info('Waiting %d seconds before proceeding', self.hardware_initialization_delay) time.sleep(self.hardware_initialization_delay) if not self.standalone: # Inspection should be started before call to lookup, otherwise # lookup will fail due to unknown MAC. uuid = None # We can't try to inspect or heartbeat until we have valid # interfaces to perform those actions over. self._wait_for_interface() if self.api_urls or cfg.CONF.inspection_callback_url: try: # Attempt inspection. This may fail, and previously # an error would be logged. uuid = inspector.inspect() except errors.InspectionError as e: LOG.error('Failed to perform inspection: %s', e) if self.api_urls: content = self.api_client.lookup_node( hardware_info=hardware.list_hardware_info(use_cache=True), timeout=self.lookup_timeout, starting_interval=self.lookup_interval, node_uuid=uuid) LOG.debug('Received lookup results: %s', content) self.process_lookup_data(content) # Save the API url in case we need it later. hardware.save_api_client( self.api_client, self.lookup_timeout, self.lookup_interval) elif cfg.CONF.inspection_callback_url: LOG.info('No ipa-api-url configured, Heartbeat and lookup ' 'skipped for inspector.') else: # NOTE(TheJulia): Once communication flow capability is # able to be driven solely from the conductor, this is no # longer a major issue. LOG.error('Neither ipa-api-url nor inspection_callback_url' 'found, please check your pxe append parameters.') self.serve_ipa_api() if not self.standalone and self.api_urls: self.heartbeater.stop()