# Copyright (c) 2012 NTT DOCOMO, INC. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain # a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. import base64 import gzip import math import os import re import shutil import socket import stat import tempfile import time from oslo_concurrency import processutils from oslo_config import cfg from oslo_serialization import jsonutils from oslo_utils import excutils from oslo_utils import units import requests import six from ironic.common import disk_partitioner from ironic.common import exception from ironic.common.i18n import _ from ironic.common.i18n import _LE from ironic.common import images from ironic.common import states from ironic.common import utils from ironic.conductor import utils as manager_utils from ironic.drivers.modules import image_cache from ironic.openstack.common import log as logging deploy_opts = [ cfg.StrOpt('dd_block_size', default='1M', help='Block size to use when writing to the nodes disk.'), cfg.IntOpt('iscsi_verify_attempts', default=3, help='Maximum attempts to verify an iSCSI connection is ' 'active, sleeping 1 second between attempts.'), ] CONF = cfg.CONF CONF.register_opts(deploy_opts, group='deploy') LOG = logging.getLogger(__name__) # All functions are called from deploy() directly or indirectly. # They are split for stub-out. 
def discovery(portal_address, portal_port):
    """Do iSCSI discovery on portal.

    :param portal_address: Address of the iSCSI portal.
    :param portal_port: Port of the iSCSI portal.
    """
    utils.execute('iscsiadm',
                  '-m', 'discovery',
                  '-t', 'st',
                  '-p', '%s:%s' % (portal_address, portal_port),
                  run_as_root=True,
                  check_exit_code=[0],
                  attempts=5,
                  delay_on_retry=True)


def login_iscsi(portal_address, portal_port, target_iqn):
    """Login to an iSCSI target.

    After the login command succeeds, wait for the session to become
    active, force a LUN rescan and wait for the block device to show up
    under /dev/disk/by-path.

    :param portal_address: Address of the iSCSI portal.
    :param portal_port: Port of the iSCSI portal.
    :param target_iqn: The iSCSI qualified name of the target.
    :raises: InstanceDeployFailure if the connection does not become
        active or the device is not seen by the file system.
    """
    utils.execute('iscsiadm',
                  '-m', 'node',
                  '-p', '%s:%s' % (portal_address, portal_port),
                  '-T', target_iqn,
                  '--login',
                  run_as_root=True,
                  check_exit_code=[0],
                  attempts=5,
                  delay_on_retry=True)
    # Ensure the login complete
    verify_iscsi_connection(target_iqn)
    # force iSCSI initiator to re-read luns
    force_iscsi_lun_update(target_iqn)
    # ensure file system sees the block device
    check_file_system_for_iscsi_device(portal_address,
                                       portal_port,
                                       target_iqn)


def check_file_system_for_iscsi_device(portal_address,
                                       portal_port,
                                       target_iqn):
    """Ensure the file system sees the iSCSI block device.

    Polls once per second, up to ``[deploy]iscsi_verify_attempts`` times,
    for the by-path entry of LUN 1 of the target.

    :param portal_address: Address of the iSCSI portal.
    :param portal_port: Port of the iSCSI portal.
    :param target_iqn: The iSCSI qualified name of the target.
    :raises: InstanceDeployFailure if the device never appears.
    """
    # Build the path with get_dev() so that the by-path format lives in a
    # single place. This check only ever looks at LUN 1.
    check_dir = get_dev(portal_address, portal_port, target_iqn, 1)
    total_checks = CONF.deploy.iscsi_verify_attempts
    for attempt in range(total_checks):
        if os.path.exists(check_dir):
            break
        time.sleep(1)
        LOG.debug("iSCSI connection not seen by file system. Rechecking. "
                  "Attempt %(attempt)d out of %(total)d",
                  {"attempt": attempt + 1,
                   "total": total_checks})
    else:
        # The loop ran out of attempts without hitting ``break``.
        msg = _("iSCSI connection was not seen by the file system after "
                "attempting to verify %d times.") % total_checks
        LOG.error(msg)
        raise exception.InstanceDeployFailure(msg)


def verify_iscsi_connection(target_iqn):
    """Verify iscsi connection.

    Polls the ``iscsiadm -m node -S`` output once per second, up to
    ``[deploy]iscsi_verify_attempts`` times, until it mentions the target.

    :param target_iqn: The iSCSI qualified name of the target.
    :raises: InstanceDeployFailure if the target never shows up.
    """
    LOG.debug("Checking for iSCSI target to become active.")

    total_checks = CONF.deploy.iscsi_verify_attempts
    for attempt in range(total_checks):
        out, _err = utils.execute('iscsiadm',
                                  '-m', 'node',
                                  '-S',
                                  run_as_root=True,
                                  check_exit_code=[0])
        if target_iqn in out:
            break
        time.sleep(1)
        LOG.debug("iSCSI connection not active. Rechecking. Attempt "
                  "%(attempt)d out of %(total)d",
                  {"attempt": attempt + 1,
                   "total": total_checks})
    else:
        # The loop ran out of attempts without hitting ``break``.
        msg = _("iSCSI connection did not become active after attempting to "
                "verify %d times.") % total_checks
        LOG.error(msg)
        raise exception.InstanceDeployFailure(msg)


def force_iscsi_lun_update(target_iqn):
    """force iSCSI initiator to re-read luns.

    :param target_iqn: The iSCSI qualified name of the target.
    """
    LOG.debug("Re-reading iSCSI luns.")

    utils.execute('iscsiadm',
                  '-m', 'node',
                  '-T', target_iqn,
                  '-R',
                  run_as_root=True,
                  check_exit_code=[0])


def logout_iscsi(portal_address, portal_port, target_iqn):
    """Logout from an iSCSI target.

    :param portal_address: Address of the iSCSI portal.
    :param portal_port: Port of the iSCSI portal.
    :param target_iqn: The iSCSI qualified name of the target.
    """
    utils.execute('iscsiadm',
                  '-m', 'node',
                  '-p', '%s:%s' % (portal_address, portal_port),
                  '-T', target_iqn,
                  '--logout',
                  run_as_root=True,
                  check_exit_code=[0],
                  attempts=5,
                  delay_on_retry=True)


def delete_iscsi(portal_address, portal_port, target_iqn):
    """Delete the iSCSI target.

    :param portal_address: Address of the iSCSI portal.
    :param portal_port: Port of the iSCSI portal.
    :param target_iqn: The iSCSI qualified name of the target.
    """
    # Retry delete until it succeeds (exit code 0) or until there is
    # no longer a target to delete (exit code 21).
    utils.execute('iscsiadm',
                  '-m', 'node',
                  '-p', '%s:%s' % (portal_address, portal_port),
                  '-T', target_iqn,
                  '-o', 'delete',
                  run_as_root=True,
                  check_exit_code=[0, 21],
                  attempts=5,
                  delay_on_retry=True)


def make_partitions(dev, root_mb, swap_mb, ephemeral_mb,
                    configdrive_mb, commit=True):
    """Partition the disk device.

    Create partitions for root, swap, ephemeral and configdrive on a
    disk device.

    :param dev: Path of the disk device to partition.
    :param root_mb: Size of the root partition in mebibytes (MiB).
    :param swap_mb: Size of the swap partition in mebibytes (MiB). If 0,
        no partition will be created.
    :param ephemeral_mb: Size of the ephemeral partition in mebibytes (MiB).
        If 0, no partition will be created.
    :param configdrive_mb: Size of the configdrive partition in
        mebibytes (MiB). If 0, no partition will be created.
    :param commit: True/False. Default for this setting is True. If False
        partitions will not be written to disk.
    :returns: A dictionary containing the partition type as Key and partition
        path as Value for the partitions created by this method.

    """
    LOG.debug("Starting to partition the disk device: %(dev)s",
              {'dev': dev})
    part_template = dev + '-part%d'
    part_dict = {}
    dp = disk_partitioner.DiskPartitioner(dev)

    if ephemeral_mb:
        LOG.debug("Add ephemeral partition (%(size)d MB) to device: %(dev)s",
                  {'dev': dev, 'size': ephemeral_mb})
        part_num = dp.add_partition(ephemeral_mb)
        part_dict['ephemeral'] = part_template % part_num

    if swap_mb:
        LOG.debug("Add Swap partition (%(size)d MB) to device: %(dev)s",
                  {'dev': dev, 'size': swap_mb})
        part_num = dp.add_partition(swap_mb, fs_type='linux-swap')
        part_dict['swap'] = part_template % part_num

    if configdrive_mb:
        LOG.debug("Add config drive partition (%(size)d MB) to device: "
                  "%(dev)s", {'dev': dev, 'size': configdrive_mb})
        part_num = dp.add_partition(configdrive_mb)
        part_dict['configdrive'] = part_template % part_num

    # NOTE(lucasagomes): Make the root partition the last partition. This
    # enables tools like cloud-init's growroot utility to expand the root
    # partition until the end of the disk.
    LOG.debug("Add root partition (%(size)d MB) to device: %(dev)s",
              {'dev': dev, 'size': root_mb})
    part_num = dp.add_partition(root_mb)
    part_dict['root'] = part_template % part_num

    if commit:
        # write to the disk
        dp.commit()
    return part_dict


def is_block_device(dev):
    """Check whether a device is block or not.

    The stat call is retried once per second, up to
    ``[deploy]iscsi_verify_attempts`` times, while it raises OSError.

    :param dev: Path of the device to check.
    :returns: True if the path is a block device, False otherwise.
    :raises: InstanceDeployFailure if the device cannot be stat'ed within
        the configured number of attempts.
    """
    attempts = CONF.deploy.iscsi_verify_attempts
    for attempt in range(attempts):
        try:
            s = os.stat(dev)
        except OSError as e:
            LOG.debug("Unable to stat device %(dev)s. Attempt %(attempt)d "
                      "out of %(total)d. Error: %(err)s",
                      {"dev": dev, "attempt": attempt + 1,
                       "total": attempts, "err": e})
            time.sleep(1)
        else:
            return stat.S_ISBLK(s.st_mode)
    msg = _("Unable to stat device %(dev)s after attempting to verify "
            "%(attempts)d times.") % {'dev': dev, 'attempts': attempts}
    LOG.error(msg)
    raise exception.InstanceDeployFailure(msg)


def dd(src, dst):
    """Execute dd from src to dst.

    The block size comes from ``[deploy]dd_block_size`` and the output is
    opened with ``oflag=direct``.

    :param src: Path of the input file.
    :param dst: Path of the output file or device.
    """
    utils.dd(src, dst, 'bs=%s' % CONF.deploy.dd_block_size, 'oflag=direct')


def populate_image(src, dst):
    """Write an image to a device, converting it to raw if needed.

    :param src: Path of the image file.
    :param dst: Path of the destination device.
    """
    data = images.qemu_img_info(src)
    if data.file_format == 'raw':
        # Raw images can be copied verbatim.
        dd(src, dst)
    else:
        images.convert_image(src, dst, 'raw', True)


def mkswap(dev, label='swap1'):
    """Execute mkswap on a device.

    :param dev: Path of the device to format as swap.
    :param label: Label to give to the swap area.
    """
    utils.mkfs('swap', dev, label)


def mkfs_ephemeral(dev, ephemeral_format, label="ephemeral0"):
    """Create a file system on the ephemeral device.

    :param dev: Path of the device to format.
    :param ephemeral_format: The type of file system to create.
    :param label: Label to give to the file system.
    """
    utils.mkfs(ephemeral_format, dev, label)


def block_uuid(dev):
    """Get UUID of a block device.

    :param dev: Path of the block device.
    :returns: The stripped output of ``blkid -s UUID -o value``.
    """
    out, _err = utils.execute('blkid', '-s', 'UUID', '-o', 'value', dev,
                              run_as_root=True,
                              check_exit_code=[0])
    return out.strip()


def switch_pxe_config(path, root_uuid, boot_mode):
    """Switch a pxe config from deployment mode to service mode.

    Rewrites the file in place: every ``{{ ROOT }}`` placeholder becomes
    ``UUID=<root_uuid>`` and the default entry line is pointed at ``boot``.

    :param path: Path of the PXE configuration file.
    :param root_uuid: UUID of the root partition.
    :param boot_mode: If 'uefi', the ``default=...`` line is rewritten;
        otherwise the ``goto``/``default`` line is, depending on
        ``[pxe]ipxe_enabled``.
    """
    with open(path) as f:
        lines = f.readlines()
    root = 'UUID=%s' % root_uuid
    rre = re.compile(r'\{\{ ROOT \}\}')

    if boot_mode == 'uefi':
        dre = re.compile('^default=.*$')
        boot_line = 'default=boot'
    else:
        pxe_cmd = 'goto' if CONF.pxe.ipxe_enabled else 'default'
        dre = re.compile('^%s .*$' % pxe_cmd)
        boot_line = '%s boot' % pxe_cmd

    with open(path, 'w') as f:
        for line in lines:
            line = rre.sub(root, line)
            line = dre.sub(boot_line, line)
            f.write(line)


def notify(address, port):
    """Notify a node that it becomes ready to reboot.

    :param address: The IP address of the node.
    :param port: The TCP port the node listens on.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((address, port))
        # sendall() keeps writing until the whole payload is out, and a
        # bytes literal is accepted by sockets on both Python 2 and 3.
        s.sendall(b'done')
    finally:
        s.close()


def get_dev(address, port, iqn, lun):
    """Returns a device path for given parameters.

    :param address: The iSCSI IP address.
    :param port: The iSCSI port number.
    :param iqn: The iSCSI qualified name.
    :param lun: The iSCSI logical unit number.
    :returns: The /dev/disk/by-path entry built from the parameters.
    """
    dev = ("/dev/disk/by-path/ip-%s:%s-iscsi-%s-lun-%s"
           % (address, port, iqn, lun))
    return dev


def get_image_mb(image_path, virtual_size=True):
    """Get size of an image in Megabyte.

    :param image_path: Path of the image file.
    :param virtual_size: If True, use the size reported by
        images.converted_size(); if False, use the size of the file itself.
    :returns: The size as an integer number of 1024 * 1024 byte units,
        rounded up.
    """
    mb = 1024 * 1024
    if not virtual_size:
        image_byte = os.path.getsize(image_path)
    else:
        image_byte = images.converted_size(image_path)
    # round up size to MB
    # Floor division keeps the arithmetic in integers; true division would
    # go through a float on Python 3 and can lose precision for huge sizes.
    image_mb = int((image_byte + mb - 1) // mb)
    return image_mb


def get_dev_block_size(dev):
    """Get the device size in 512 byte sectors.

    :param dev: Path of the block device.
    :returns: The integer printed by ``blockdev --getsz``.
    """
    block_sz, _err = utils.execute('blockdev', '--getsz', dev,
                                   run_as_root=True, check_exit_code=[0])
    return int(block_sz)


def destroy_disk_metadata(dev, node_uuid):
    """Destroy metadata structures on node's disk.

    Ensure that node's disk appears to be blank without zeroing the entire
    drive. To do this we will zero:
    - the first 18KiB to clear MBR / GPT data
    - the last 18KiB to clear GPT and other metadata like: LVM, veritas,
      MDADM, DMRAID, ...

    :param dev: Path of the disk device.
    :param node_uuid: Node's uuid. Used for logging.
    :raises: ProcessExecutionError if any of the commands fails; the
        failure is logged and the original exception is re-raised.
    """
    # NOTE(NobodyCam): This is needed to work around bug:
    # https://bugs.launchpad.net/ironic/+bug/1317647
    LOG.debug("Start destroy disk metadata for node %(node)s.",
              {'node': node_uuid})
    try:
        # 36 sectors of 512 bytes == 18KiB
        utils.execute('dd', 'if=/dev/zero', 'of=%s' % dev,
                      'bs=512', 'count=36', run_as_root=True,
                      check_exit_code=[0])
    except processutils.ProcessExecutionError as err:
        with excutils.save_and_reraise_exception():
            LOG.error(_LE("Failed to erase beginning of disk for node "
                          "%(node)s. Command: %(command)s. Error: %(error)s."),
                      {'node': node_uuid,
                       'command': err.cmd,
                       'error': err.stderr})

    # now wipe the end of the disk.
    # get end of disk seek value
    try:
        block_sz = get_dev_block_size(dev)
    except processutils.ProcessExecutionError as err:
        with excutils.save_and_reraise_exception():
            LOG.error(_LE("Failed to get disk block count for node %(node)s. "
                          "Command: %(command)s. Error: %(error)s."),
                      {'node': node_uuid,
                       'command': err.cmd,
                       'error': err.stderr})
    else:
        seek_value = block_sz - 36
        try:
            utils.execute('dd', 'if=/dev/zero', 'of=%s' % dev,
                          'bs=512', 'count=36', 'seek=%d' % seek_value,
                          run_as_root=True, check_exit_code=[0])
        except processutils.ProcessExecutionError as err:
            with excutils.save_and_reraise_exception():
                LOG.error(_LE("Failed to erase the end of the disk on node "
                              "%(node)s. Command: %(command)s. "
                              "Error: %(error)s."),
                          {'node': node_uuid,
                           'command': err.cmd,
                           'error': err.stderr})


def _get_configdrive(configdrive, node_uuid):
    """Get the information about size and location of the configdrive.

    :param configdrive: Base64 encoded Gzipped configdrive content or
        configdrive HTTP URL.
    :param node_uuid: Node's uuid. Used for logging.
    :raises: InstanceDeployFailure if it can't download or decode the
       config drive.
    :returns: A tuple with the size in MiB and path to the uncompressed
        configdrive file.

    """
    # Check if the configdrive option is a HTTP URL or the content directly
    is_url = utils.is_http_url(configdrive)
    if is_url:
        try:
            data = requests.get(configdrive).content
        except requests.exceptions.RequestException as e:
            raise exception.InstanceDeployFailure(
                _("Can't download the configdrive content for node %(node)s "
                  "from '%(url)s'. Reason: %(reason)s") %
                {'node': node_uuid, 'url': configdrive, 'reason': e})
    else:
        data = configdrive

    try:
        # The decoded payload is binary (gzip) data, so it needs a bytes
        # buffer. Malformed input raises TypeError on Python 2 and
        # binascii.Error (a ValueError subclass) on Python 3.
        data = six.BytesIO(base64.b64decode(data))
    except (TypeError, ValueError):
        error_msg = (_('Config drive for node %s is not base64 encoded '
                       'or the content is malformed.') % node_uuid)
        if is_url:
            error_msg += _(' Downloaded from "%s".') % configdrive
        raise exception.InstanceDeployFailure(error_msg)

    # delete=False: the file has to outlive this function, the caller is
    # responsible for removing it.
    configdrive_file = tempfile.NamedTemporaryFile(delete=False,
                                                   prefix='configdrive')
    configdrive_mb = 0
    with gzip.GzipFile('configdrive', 'rb', fileobj=data) as gunzipped:
        try:
            shutil.copyfileobj(gunzipped, configdrive_file)
        except EnvironmentError as e:
            # Delete the created file
            utils.unlink_without_raise(configdrive_file.name)
            raise exception.InstanceDeployFailure(
                _('Encountered error while decompressing and writing '
                  'config drive for node %(node)s. Error: %(exc)s') %
                {'node': node_uuid, 'exc': e})
        else:
            # Get the file size and convert to MiB
            configdrive_file.seek(0, os.SEEK_END)
            bytes_ = configdrive_file.tell()
            configdrive_mb = int(math.ceil(float(bytes_) / units.Mi))
        finally:
            configdrive_file.close()

    return (configdrive_mb, configdrive_file.name)


def work_on_disk(dev, root_mb, swap_mb, ephemeral_mb, ephemeral_format,
                 image_path, node_uuid, preserve_ephemeral=False,
                 configdrive=None):
    """Create partitions and copy an image to the root partition.

    :param dev: Path for the device to work on.
    :param root_mb: Size of the root partition in megabytes.
    :param swap_mb: Size of the swap partition in megabytes.
    :param ephemeral_mb: Size of the ephemeral partition in megabytes. If 0,
        no ephemeral partition will be created.
    :param ephemeral_format: The type of file system to format the ephemeral
        partition.
    :param image_path: Path for the instance's disk image.
    :param node_uuid: node's uuid. Used for logging.
    :param preserve_ephemeral: If True, no filesystem is written to the
        ephemeral block device, preserving whatever content it had (if the
        partition table has not changed).
    :param configdrive: Optional. Base64 encoded Gzipped configdrive content
                        or configdrive HTTP URL.
    :raises: InstanceDeployFailure if the parent device or any of the
        created partitions is not a block device.
    :raises: ProcessExecutionError if detecting the root device UUID fails.
    :returns: the UUID of the root partition.
    """
    if not is_block_device(dev):
        raise exception.InstanceDeployFailure(
            _("Parent device '%s' not found") % dev)

    # the only way for preserve_ephemeral to be set to true is if we are
    # rebuilding an instance with --preserve_ephemeral.
    commit = not preserve_ephemeral
    # now if we are committing the changes to disk clean first.
    if commit:
        destroy_disk_metadata(dev, node_uuid)

    # Set before entering the try block so the finally clause can always
    # inspect configdrive_file.
    configdrive_mb = 0
    configdrive_file = None
    try:
        # If requested, get the configdrive file and determine the size
        # of the configdrive partition
        if configdrive:
            configdrive_mb, configdrive_file = _get_configdrive(configdrive,
                                                                node_uuid)

        part_dict = make_partitions(dev, root_mb, swap_mb, ephemeral_mb,
                                    configdrive_mb, commit=commit)

        ephemeral_part = part_dict.get('ephemeral')
        swap_part = part_dict.get('swap')
        configdrive_part = part_dict.get('configdrive')
        root_part = part_dict.get('root')

        if not is_block_device(root_part):
            raise exception.InstanceDeployFailure(
                _("Root device '%s' not found") % root_part)

        for part in ('swap', 'ephemeral', 'configdrive'):
            part_device = part_dict.get(part)
            LOG.debug("Checking for %(part)s device (%(dev)s) on node "
                      "%(node)s.", {'part': part, 'dev': part_device,
                                    'node': node_uuid})
            if part_device and not is_block_device(part_device):
                raise exception.InstanceDeployFailure(
                    _("'%(partition)s' device '%(part_device)s' not found") %
                    {'partition': part, 'part_device': part_device})

        if configdrive_part:
            # Copy the configdrive content to the configdrive partition
            dd(configdrive_file, configdrive_part)

    finally:
        # If the configdrive was requested make sure we delete the file
        # after copying the content to the partition
        if configdrive_file:
            utils.unlink_without_raise(configdrive_file)

    populate_image(image_path, root_part)

    if swap_part:
        mkswap(swap_part)

    if ephemeral_part and not preserve_ephemeral:
        mkfs_ephemeral(ephemeral_part, ephemeral_format)

    try:
        root_uuid = block_uuid(root_part)
    except processutils.ProcessExecutionError:
        with excutils.save_and_reraise_exception():
            LOG.error(_LE("Failed to detect root device UUID."))
    return root_uuid


def deploy(address, port, iqn, lun, image_path,
           root_mb, swap_mb, ephemeral_mb, ephemeral_format, node_uuid,
           preserve_ephemeral=False, configdrive=None):
    """All-in-one function to deploy a node.

    The root partition is grown to the image size when the image is larger
    than root_mb. The iSCSI session is always logged out and deleted once
    the disk work is over, whether it succeeded or not.

    :param address: The iSCSI IP address.
    :param port: The iSCSI port number.
    :param iqn: The iSCSI qualified name.
    :param lun: The iSCSI logical unit number.
    :param image_path: Path for the instance's disk image.
    :param root_mb: Size of the root partition in megabytes.
    :param swap_mb: Size of the swap partition in megabytes.
    :param ephemeral_mb: Size of the ephemeral partition in megabytes. If 0,
        no ephemeral partition will be created.
    :param ephemeral_format: The type of file system to format the ephemeral
        partition.
    :param node_uuid: node's uuid. Used for logging.
    :param preserve_ephemeral: If True, no filesystem is written to the
        ephemeral block device, preserving whatever content it had (if the
        partition table has not changed).
    :param configdrive: Optional. Base64 encoded Gzipped configdrive content
                        or configdrive HTTP URL.
    :raises: ProcessExecutionError or InstanceDeployFailure if working on
        the disk fails; the failure is logged and re-raised.
    :returns: the UUID of the root partition.
    """
    dev = get_dev(address, port, iqn, lun)
    image_mb = get_image_mb(image_path)
    if image_mb > root_mb:
        root_mb = image_mb
    discovery(address, port)
    login_iscsi(address, port, iqn)
    try:
        root_uuid = work_on_disk(dev, root_mb, swap_mb, ephemeral_mb,
                                 ephemeral_format, image_path, node_uuid,
                                 preserve_ephemeral=preserve_ephemeral,
                                 configdrive=configdrive)
    except processutils.ProcessExecutionError as err:
        with excutils.save_and_reraise_exception():
            LOG.error(_LE("Deploy to address %s failed."), address)
            LOG.error(_LE("Command: %s"), err.cmd)
            LOG.error(_LE("StdOut: %r"), err.stdout)
            LOG.error(_LE("StdErr: %r"), err.stderr)
    except exception.InstanceDeployFailure as e:
        with excutils.save_and_reraise_exception():
            LOG.error(_LE("Deploy to address %s failed."), address)
            LOG.error(e)
    finally:
        logout_iscsi(address, port, iqn)
        delete_iscsi(address, port, iqn)

    return root_uuid


def notify_deploy_complete(address):
    """Notifies the completion of deployment to the baremetal node.

    :param address: The IP address of the node.
    """
    # Ensure the node started netcat on the port after POST the request.
    time.sleep(3)
    notify(address, 10000)


def check_for_missing_params(info_dict, error_msg, param_prefix=''):
    """Check for empty params in the provided dictionary.

    :param info_dict: The dictionary to inspect.
    :param error_msg: The error message to prefix before printing the
        information about missing parameters.
    :param param_prefix: Add this prefix to each parameter for error messages
    :raises: MissingParameterValue, if one or more parameters are
        empty in the provided dictionary.
    """
    # Any falsy value (None, '', 0, empty container) counts as missing.
    missing_info = [param_prefix + label
                    for label, value in info_dict.items()
                    if not value]

    if missing_info:
        exc_msg = _("%(error_msg)s. Missing are: %(missing_info)s")
        raise exception.MissingParameterValue(exc_msg %
            {'error_msg': error_msg, 'missing_info': missing_info})


def fetch_images(ctx, cache, images_info, force_raw=True):
    """Check for available disk space and fetch images using ImageCache.

    :param ctx: context
    :param cache: ImageCache instance to use for fetching
    :param images_info: list of tuples (image href, destination path)
    :param force_raw: boolean value, whether to convert the image to raw
                      format
    :raises: InstanceDeployFailure if unable to find enough disk space
    """

    try:
        image_cache.clean_up_caches(ctx, cache.master_dir, images_info)
    except exception.InsufficientDiskSpace as e:
        raise exception.InstanceDeployFailure(reason=e)

    # NOTE(dtantsur): This code can suffer from race condition,
    # if disk space is used between the check and actual download.
    # This is probably unavoidable, as we can't control other
    # (probably unrelated) processes
    for href, path in images_info:
        cache.fetch_image(href, path, ctx=ctx, force_raw=force_raw)


def set_failed_state(task, msg):
    """Sets the deploy status as failed with relevant messages.

    This method sets the deployment as fail with the given message.
    It sets node's provision_state to DEPLOYFAIL and updates last_error
    with the given error message. It also powers off the baremetal node.

    :param task: a TaskManager instance containing the node to act on.
    :param msg: the message to set in last_error of the node.
    :raises: InvalidState if the event is not allowed by the associated
             state machine.
    """
    task.process_event('fail')
    node = task.node
    try:
        manager_utils.node_power_action(task, states.POWER_OFF)
    except Exception:
        # A power-off failure is logged but deliberately not propagated,
        # so that last_error still gets recorded below.
        msg2 = (_LE('Node %s failed to power off while handling deploy '
                    'failure. This may be a serious condition. Node '
                    'should be removed from Ironic or put in maintenance '
                    'mode until the problem is resolved.') % node.uuid)
        LOG.exception(msg2)
    finally:
        # NOTE(deva): node_power_action() erases node.last_error
        #             so we need to set it again here.
        node.last_error = msg
        node.save()


def get_single_nic_with_vif_port_id(task):
    """Returns the MAC address of a port which has a VIF port id.

    :param task: a TaskManager instance containing the ports to act on.
    :returns: MAC address of the port connected to deployment network.
              None if it cannot find any port with vif id.
    """
    for port in task.ports:
        if port.extra.get('vif_port_id'):
            return port.address
    return None


def parse_instance_info_capabilities(node):
    """Parse the instance_info capabilities.

    One way of having these capabilities set is via Nova, where the
    capabilities are defined in the Flavor extra_spec and passed to
    Ironic by the Nova Ironic driver.

    NOTE: Although our API fully supports JSON fields, to maintain the
    backward compatibility with Juno the Nova Ironic driver is sending
    it as a string.

    :param node: a single Node.
    :raises: InvalidParameterValue if the capabilities string is not a
             dictionary or is malformed.
    :returns: A dictionary with the capabilities if found, otherwise an
              empty dictionary.
    """

    def parse_error():
        error_msg = (_('Error parsing capabilities from Node %s instance_info '
                       'field. A dictionary or a "jsonified" dictionary is '
                       'expected.') % node.uuid)
        raise exception.InvalidParameterValue(error_msg)

    capabilities = node.instance_info.get('capabilities', {})
    if isinstance(capabilities, six.string_types):
        try:
            capabilities = jsonutils.loads(capabilities)
        except (ValueError, TypeError):
            parse_error()

    if not isinstance(capabilities, dict):
        parse_error()

    return capabilities