684 lines
27 KiB
Python
684 lines
27 KiB
Python
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
"""
|
|
Ansible deploy interface
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import shlex
|
|
from urllib import parse as urlparse
|
|
|
|
from ironic_lib import metrics_utils
|
|
from ironic_lib import utils as irlib_utils
|
|
from oslo_concurrency import processutils
|
|
from oslo_log import log
|
|
from oslo_utils import strutils
|
|
from oslo_utils import units
|
|
import tenacity
|
|
import yaml
|
|
|
|
from ironic.common import dhcp_factory
|
|
from ironic.common import exception
|
|
from ironic.common import faults
|
|
from ironic.common.i18n import _
|
|
from ironic.common import images
|
|
from ironic.common import states
|
|
from ironic.common import utils
|
|
from ironic.conductor import steps as conductor_steps
|
|
from ironic.conductor import task_manager
|
|
from ironic.conductor import utils as manager_utils
|
|
from ironic.conf import CONF
|
|
from ironic.drivers import base
|
|
from ironic.drivers.modules import agent_base
|
|
from ironic.drivers.modules import deploy_utils
|
|
|
|
|
|
LOG = log.getLogger(__name__)

METRICS = metrics_utils.get_metrics_logger(__name__)

# driver_info properties a node may set; all optional, and surfaced to
# users through AnsibleDeploy.get_properties()
OPTIONAL_PROPERTIES = {
    'ansible_username': _('Deploy ramdisk username for Ansible. '
                          'This user must have passwordless sudo '
                          'permissions. Optional.'),
    'ansible_key_file': _('Full path to private SSH key file. '
                          'If not specified, default keys for user running '
                          'ironic-conductor process will be used. '
                          'Note that for keys with password, those '
                          'must be pre-loaded into ssh-agent. '
                          'Optional.'),
    'ansible_playbooks_path': _('Path to folder holding playbooks to use '
                                'for this node. Optional. '
                                'Default is set in ironic config.'),
    'ansible_deploy_playbook': _('Name of the Ansible playbook file inside '
                                 'the "ansible_playbooks_path" folder which '
                                 'is used for node deployment. Optional.'),
    'ansible_shutdown_playbook': _('Name of the Ansible playbook file inside '
                                   'the "ansible_playbooks_path" folder which '
                                   'is used for node shutdown. Optional.'),
    'ansible_clean_playbook': _('Name of the Ansible playbook file inside '
                                'the "ansible_playbooks_path" folder which '
                                'is used for node cleaning. Optional.'),
    'ansible_clean_steps_config': _('Name of the file inside the '
                                    '"ansible_playbooks_path" folder with '
                                    'cleaning steps configuration. Optional.'),
    'ansible_python_interpreter': _('Absolute path to the python interpreter '
                                    'on the managed machines. Optional.'),
}

# this interface has no required driver_info properties
COMMON_PROPERTIES = OPTIONAL_PROPERTIES
|
|
|
|
|
|
class PlaybookNotFound(exception.IronicException):
    """Raised when no playbook is configured for a requested action."""
    _msg_fmt = _('Failed to set ansible playbook for action %(action)s')
|
|
|
|
|
|
def _get_playbooks_path(node):
    """Return the playbooks folder to use for this node.

    A per-node 'ansible_playbooks_path' in driver_info takes precedence
    over the [ansible]playbooks_path configuration option.
    """
    driver_info = node.driver_info
    if 'ansible_playbooks_path' in driver_info:
        return driver_info['ansible_playbooks_path']
    return CONF.ansible.playbooks_path
|
|
|
|
|
|
def _parse_ansible_driver_info(node, action='deploy'):
    """Resolve SSH credentials and the playbook for the given action.

    Node driver_info values win over the [ansible] config defaults.

    :param node: node to read driver_info from
    :param action: action name, e.g. 'deploy', 'clean' or 'shutdown'
    :raises PlaybookNotFound: when no playbook is set for the action
    :returns: tuple of (playbook file name, ssh user, ssh key file)
    """
    driver_info = node.driver_info
    user = driver_info.get('ansible_username', CONF.ansible.default_username)
    key = driver_info.get('ansible_key_file', CONF.ansible.default_key_file)
    # not every action has a corresponding config option, hence getattr
    config_default = getattr(
        CONF.ansible, 'default_%s_playbook' % action, None)
    playbook = driver_info.get('ansible_%s_playbook' % action, config_default)
    if not playbook:
        raise PlaybookNotFound(action=action)
    # only the file name is used; it is joined with the playbooks path later
    return os.path.basename(playbook), user, key
|
|
|
|
|
|
def _get_python_interpreter(node):
    """Return the python interpreter path to use on the managed node."""
    driver_info = node.driver_info
    if 'ansible_python_interpreter' in driver_info:
        return driver_info['ansible_python_interpreter']
    return CONF.ansible.default_python_interpreter
|
|
|
|
|
|
def _get_configdrive_path(basename):
    """Return the temp-file path used to stage a node's configdrive."""
    filename = '%s.cndrive' % basename
    return os.path.join(CONF.tempdir, filename)
|
|
|
|
|
|
def _get_node_ip(task):
    """Extract the host part of the ramdisk agent callback URL.

    Returns an empty string when no agent_url has been recorded yet.
    """
    agent_url = task.node.driver_internal_info.get('agent_url', '')
    netloc = urlparse.urlparse(agent_url).netloc
    host, _sep, _port = netloc.partition(':')
    return host
|
|
|
|
|
|
def _prepare_extra_vars(host_list, variables=None):
    """Build the extra-vars dict passed to ansible-playbook.

    :param host_list: iterable of (node uuid, ip, ssh user, node.extra)
    :param variables: optional extra key/value pairs merged on top
    :returns: dict with a 'nodes' list plus any merged variables
    """
    extra_vars = {
        'nodes': [
            {'name': node_uuid, 'ip': ip, 'user': user, 'extra': extra}
            for node_uuid, ip, user, extra in host_list
        ]
    }
    if variables:
        extra_vars.update(variables)
    return extra_vars
|
|
|
|
|
|
def _run_playbook(node, name, extra_vars, key, tags=None, notags=None):
    """Execute ansible-playbook and return its (stdout, stderr).

    :param node: node the playbook is run for
    :param name: playbook file name inside the node's playbooks folder
    :param extra_vars: dict passed to ansible as the 'ironic' variable
    :param key: private SSH key file, or falsy to use the default keys
    :param tags: optional list of ansible tags to run
    :param notags: optional list of ansible tags to skip
    :raises InstanceDeployFailure: when ansible-playbook exits non-zero
    """
    playbooks_root = _get_playbooks_path(node)

    cmd = []
    # prefix with env(1) so ansible picks up our config file, if set
    if CONF.ansible.config_file_path:
        cmd += ['env', 'ANSIBLE_CONFIG=%s' % CONF.ansible.config_file_path]

    ironic_vars = {'ironic': extra_vars}
    interpreter = _get_python_interpreter(node)
    if interpreter:
        ironic_vars['ansible_python_interpreter'] = interpreter

    cmd += [CONF.ansible.ansible_playbook_script,
            os.path.join(playbooks_root, name),
            '-i', os.path.join(playbooks_root, 'inventory'),
            '-e', json.dumps(ironic_vars)]

    if tags:
        cmd.append('--tags=%s' % ','.join(tags))
    if notags:
        cmd.append('--skip-tags=%s' % ','.join(notags))
    if key:
        cmd.append('--private-key=%s' % key)

    verbosity = CONF.ansible.verbosity
    # inherit the conductor's debug setting when verbosity is unset
    if verbosity is None and CONF.debug:
        verbosity = 4
    if verbosity:
        cmd.append('-' + 'v' * verbosity)

    if CONF.ansible.ansible_extra_args:
        cmd.extend(shlex.split(CONF.ansible.ansible_extra_args))

    try:
        return utils.execute(*cmd)
    except processutils.ProcessExecutionError as e:
        raise exception.InstanceDeployFailure(reason=e)
|
|
|
|
|
|
def _calculate_memory_req(task):
    """Return memory requirement in MiB: image size plus configured extra."""
    src = task.node.instance_info['image_source']
    image_mib = images.download_size(task.context, src) // units.Mi
    return image_mib + CONF.ansible.extra_memory
|
|
|
|
|
|
def _parse_partitioning_info(node):
    """Build Ansible partitioning variables from the node's instance_info.

    Partitions are laid out in MiB in a fixed order: an optional 1MiB
    bios_grub partition (GPT only), then ephemeral, swap, a 64MiB
    configdrive partition, and finally root.

    :param node: the node being deployed
    :returns: dict of the form {'partition_info': <layout dict>}
    """

    info = node.instance_info
    # disk label defaults to 'msdos' when not set for the node
    i_info = {'label': deploy_utils.get_disk_label(node) or 'msdos'}
    is_gpt = i_info['label'] == 'gpt'
    unit = 'MiB'
    partitions = {}

    def add_partition(name, start, end):
        # partition numbers follow insertion order, starting at 1
        partitions[name] = {'number': len(partitions) + 1,
                            'part_start': '%i%s' % (start, unit),
                            'part_end': '%i%s' % (end, unit)}
        if is_gpt:
            # GPT partitions carry a name; msdos ones do not
            partitions[name]['name'] = name

    # the first partition starts at the 1MiB boundary
    end = 1
    if is_gpt:
        # prepend 1MiB bios_grub partition for GPT so that grub(2) installs
        start, end = end, end + 1
        add_partition('bios', start, end)
        partitions['bios']['flags'] = ['bios_grub']

    ephemeral_mb = info['ephemeral_mb']
    if ephemeral_mb:
        start, end = end, end + ephemeral_mb
        add_partition('ephemeral', start, end)
        i_info['ephemeral_format'] = info['ephemeral_format']
        # pass the flag as a yes/no string for the playbook
        i_info['preserve_ephemeral'] = (
            'yes' if info['preserve_ephemeral'] else 'no')

    swap_mb = info['swap_mb']
    if swap_mb:
        start, end = end, end + swap_mb
        add_partition('swap', start, end)

    configdrive = info.get('configdrive')
    if configdrive:
        # pre-create 64MiB partition for configdrive
        start, end = end, end + 64
        add_partition('configdrive', start, end)

    # NOTE(pas-ha) make the root partition last so that
    # e.g. cloud-init can grow it on first start
    start, end = end, end + info['root_mb']
    add_partition('root', start, end)
    if not is_gpt:
        partitions['root']['flags'] = ['boot']
    i_info['partitions'] = partitions
    return {'partition_info': i_info}
|
|
|
|
|
|
def _parse_root_device_hints(node):
    """Convert parsed root device hints into plain values for Ansible.

    Only exact-match hints are supported: the '== N' (integer) and
    's== V' (string) forms produced by the hint parser are unwrapped;
    any other operator form is treated as advanced and rejected.

    :raises InvalidParameterValue: when advanced hints are present
    :returns: dict of plain hint values, possibly empty
    """
    parsed_hints = deploy_utils.get_root_device_for_deploy(node)
    if not parsed_hints:
        return {}

    plain_hints = {}
    unsupported = {}
    for hint_name, value in parsed_hints.items():
        if not isinstance(value, str):
            plain_hints[hint_name] = value
        elif value.startswith('== '):
            # integer equality hint, e.g. '== 42'
            plain_hints[hint_name] = int(value[3:])
        elif value.startswith('s== '):
            # string equality hint; the parser url-quotes the value
            plain_hints[hint_name] = urlparse.unquote(value[4:])
        else:
            unsupported[hint_name] = value

    if unsupported:
        raise exception.InvalidParameterValue(
            _('Ansible-deploy does not support advanced root device hints '
              'based on oslo.utils operators. '
              'Present advanced hints for node %(node)s are %(hints)s.') % {
                'node': node.uuid, 'hints': unsupported})
    return plain_hints
|
|
|
|
|
|
def _add_ssl_image_options(image):
    """Inject TLS options for image download into the image dict in place."""
    insecure = CONF.ansible.image_store_insecure
    image['validate_certs'] = 'no' if insecure else 'yes'
    cafile = CONF.ansible.image_store_cafile
    if cafile:
        image['cafile'] = cafile
    certfile = CONF.ansible.image_store_certfile
    keyfile = CONF.ansible.image_store_keyfile
    # client certificate auth requires both the cert and its key
    if certfile and keyfile:
        image['client_cert'] = certfile
        image['client_key'] = keyfile
|
|
|
|
|
|
def _prepare_variables(task):
    """Collect Ansible variables describing image, configdrive and hints.

    :param task: a TaskManager instance with the node being deployed
    :returns: dict with an 'image' key and, when applicable,
        'configdrive' and 'root_device_hints' keys
    """
    node = task.node
    i_info = node.instance_info
    image = {}
    # copy all 'image_*' instance_info fields, stripping the prefix
    for i_key, i_value in i_info.items():
        if i_key.startswith('image_'):
            image[i_key[6:]] = i_value

    checksum = image.get('checksum')
    if checksum:
        # NOTE(pas-ha) checksum can be in <algo>:<checksum> format
        # as supported by various Ansible modules, mostly good for
        # standalone Ironic case when instance_info is populated manually.
        # With no <algo> we take that instance_info is populated from Glance,
        # where API reports checksum as MD5 always.
        if ':' not in checksum:
            image['checksum'] = 'md5:%s' % checksum
    _add_ssl_image_options(image)
    variables = {'image': image}
    configdrive = manager_utils.get_configdrive_image(task.node)
    if configdrive:
        if urlparse.urlparse(configdrive).scheme in ('http', 'https'):
            # pass the URL through as-is for the playbook to fetch
            cfgdrv_type = 'url'
            cfgdrv_location = configdrive
        else:
            # stage the configdrive content in a file under CONF.tempdir
            cfgdrv_location = _get_configdrive_path(node.uuid)
            with open(cfgdrv_location, 'w') as f:
                f.write(configdrive)
            cfgdrv_type = 'file'
        variables['configdrive'] = {'type': cfgdrv_type,
                                    'location': cfgdrv_location}

    root_device_hints = _parse_root_device_hints(node)
    if root_device_hints:
        variables['root_device_hints'] = root_device_hints

    return variables
|
|
|
|
|
|
def _validate_clean_steps(steps, node_uuid):
    """Sanity-check clean steps loaded from the steps config file.

    Each step must have a unique name, an interface, and a value for
    every required argument.

    :param steps: list of step dicts as loaded from YAML
    :param node_uuid: UUID of the node, used in error reporting
    :raises NodeCleaningFailure: when the steps file is malformed
    """
    problems = []
    for step in steps:
        step_name = step.get('name')
        if not step_name:
            problems.append({'name': 'undefined', 'field': 'name'})
            continue
        if 'interface' not in step:
            problems.append({'name': step_name, 'field': 'interface'})
        for arg_name, arg_spec in step.get('args', {}).items():
            if arg_spec.get('required', False) and 'value' not in arg_spec:
                problems.append({'name': step_name,
                                 'field': '%s.value' % arg_name})
    if problems:
        details = ', '.join('name %(name)s, field %(field)s' % p
                            for p in problems)
        msg = _("Malformed clean_steps file: %s") % details
        LOG.error(msg)
        raise exception.NodeCleaningFailure(node=node_uuid,
                                            reason=msg)
    unique_names = {s['name'] for s in steps}
    if len(unique_names) != len(steps):
        msg = _("Cleaning steps do not have unique names.")
        LOG.error(msg)
        raise exception.NodeCleaningFailure(node=node_uuid,
                                            reason=msg)
|
|
|
|
|
|
def _get_clean_steps(node, interface=None, override_priorities=None):
    """Load cleaning steps from the node's clean steps config file.

    :param node: node to load the steps for
    :param interface: when set, only steps for this interface are returned
    :param override_priorities: dict mapping step names to priorities
        that replace the ones from the file
    :raises NodeCleaningFailure: when the steps file cannot be loaded
    :returns: list of clean step dicts
    """
    steps_file = node.driver_info.get(
        'ansible_clean_steps_config', CONF.ansible.default_clean_steps_config)
    playbooks_dir = node.driver_info.get('ansible_playbooks_path',
                                         CONF.ansible.playbooks_path)
    path = os.path.join(playbooks_dir, os.path.basename(steps_file))
    try:
        with open(path) as f:
            internal_steps = yaml.safe_load(f)
    except Exception as e:
        msg = _('Failed to load clean steps from file '
                '%(file)s: %(exc)s') % {'file': path, 'exc': e}
        raise exception.NodeCleaningFailure(node=node.uuid, reason=msg)

    _validate_clean_steps(internal_steps, node.uuid)

    overrides = override_priorities or {}
    steps = []
    for params in internal_steps:
        step_name = params['name']
        step_iface = params['interface']
        if interface is not None and interface != step_iface:
            continue
        # an override priority wins over the one from the file
        priority = overrides.get(step_name)
        if priority is None:
            priority = params.get('priority', 0)
        argsinfo = params.get('args', {})
        # 'value' entries become the step's default args;
        # the remaining metadata stays in argsinfo
        args = {arg: info.pop('value', None)
                for arg, info in argsinfo.items()}
        steps.append({'interface': step_iface,
                      'step': step_name,
                      'priority': priority,
                      'abortable': False,
                      'argsinfo': argsinfo,
                      'args': args})

    return steps
|
|
|
|
|
|
class AnsibleDeploy(agent_base.HeartbeatMixin,
                    agent_base.AgentOobStepsMixin,
                    base.DeployInterface):
    """Interface for deploy-related actions."""

    # ramdisk log collection is not supported by this interface
    collect_deploy_logs = False

    def get_properties(self):
        """Return the properties of the interface."""
        props = COMMON_PROPERTIES.copy()
        # NOTE(pas-ha) this is to get the deploy_forces_oob_reboot property
        props.update(agent_base.VENDOR_PROPERTIES)
        return props

    @METRICS.timer('AnsibleDeploy.validate')
    def validate(self, task):
        """Validate the driver-specific Node deployment info.

        :param task: a TaskManager instance containing the node to validate
        :raises: InvalidParameterValue if the node has advanced
            (operator-based) root device hints
        """
        task.driver.boot.validate(task)

        node = task.node

        params = {}
        image_source = node.instance_info.get('image_source')
        params['instance_info.image_source'] = image_source
        error_msg = _('Node %s failed to validate deploy image info. Some '
                      'parameters were missing') % node.uuid
        deploy_utils.check_for_missing_params(params, error_msg)
        # validate root device hints, proper exceptions are raised from there
        _parse_root_device_hints(node)
        # TODO(pas-ha) validate that all playbooks and ssh key (if set)
        # are pointing to actual files

    def _ansible_deploy(self, task, node_address):
        """Internal function for deployment to a node.

        :param task: a TaskManager instance
        :param node_address: IP address of the deploy ramdisk
        """
        node = task.node
        LOG.debug('IP of node %(node)s is %(ip)s',
                  {'node': node.uuid, 'ip': node_address})
        variables = _prepare_variables(task)
        # partition images additionally need the partition layout
        if not node.driver_internal_info.get('is_whole_disk_image'):
            variables.update(_parse_partitioning_info(node))
        if node.target_raid_config:
            variables.update({'raid_config': node.target_raid_config})
        playbook, user, key = _parse_ansible_driver_info(node)
        node_list = [(node.uuid, node_address, user, node.extra)]
        extra_vars = _prepare_extra_vars(node_list, variables=variables)

        LOG.debug('Starting deploy on node %s', node.uuid)
        # any caller should manage exceptions raised from here
        _run_playbook(node, playbook, extra_vars, key)

    @METRICS.timer('AnsibleDeploy.deploy')
    @base.deploy_step(priority=100)
    @task_manager.require_exclusive_lock
    def deploy(self, task):
        """Perform a deployment to a node.

        :param task: a TaskManager instance
        :returns: states.DEPLOYWAIT; the actual image write happens in
            write_image() once the ramdisk heartbeats
        """
        self._required_image_info(task)
        manager_utils.node_power_action(task, states.REBOOT)
        return states.DEPLOYWAIT

    def in_core_deploy_step(self, task):
        """Check if we are in the deploy.deploy deploy step.

        Assumes that we are in the DEPLOYWAIT state.

        :param task: a TaskManager instance
        :returns: True if the current deploy step is deploy.deploy.
        """
        step = task.node.deploy_step
        return (step
                and step['interface'] == 'deploy'
                and step['step'] == 'deploy')

    def process_next_step(self, task, step_type):
        """Start the next clean/deploy step if the previous one is complete.

        :param task: a TaskManager instance
        :param step_type: "clean" or "deploy"
        """
        # Run the next step as soon as agent heartbeats in deploy.deploy
        if step_type == 'deploy' and self.in_core_deploy_step(task):
            manager_utils.notify_conductor_resume_deploy(task)

    @staticmethod
    def _required_image_info(task):
        """Gather and save needed image info while the context is good.

        Gather image info that will be needed later, during the
        write_image execution, where the context won't be the same
        anymore, since coming from the server's heartbeat.
        """
        node = task.node
        i_info = node.instance_info
        i_info['image_mem_req'] = _calculate_memory_req(task)
        node.instance_info = i_info
        node.save()

    @METRICS.timer('AnsibleDeploy.tear_down')
    @task_manager.require_exclusive_lock
    def tear_down(self, task):
        """Tear down a previous deployment on the task's node.

        :param task: a TaskManager instance
        :returns: states.DELETED
        """
        manager_utils.node_power_action(task, states.POWER_OFF)
        power_state_to_restore = manager_utils.power_on_node_if_needed(task)
        task.driver.network.unconfigure_tenant_networks(task)
        manager_utils.restore_power_state_if_needed(
            task, power_state_to_restore)
        return states.DELETED

    @METRICS.timer('AnsibleDeploy.prepare')
    def prepare(self, task):
        """Prepare the deployment environment for this node.

        :param task: a TaskManager instance
        """
        node = task.node
        # TODO(pas-ha) investigate takeover scenario
        if node.provision_state == states.DEPLOYING:
            # adding network-driver dependent provisioning ports
            fast_track = manager_utils.is_fast_track(task)
            power_state_to_restore = None
            if not fast_track:
                manager_utils.node_power_action(task, states.POWER_OFF)
                power_state_to_restore = (
                    manager_utils.power_on_node_if_needed(task))
            task.driver.network.add_provisioning_network(task)
            manager_utils.restore_power_state_if_needed(
                task, power_state_to_restore)
        if node.provision_state not in [states.ACTIVE, states.ADOPTING]:
            node.instance_info = deploy_utils.build_instance_info_for_deploy(
                task)
            node.save()
            deploy_utils.prepare_agent_boot(task)

    @METRICS.timer('AnsibleDeploy.clean_up')
    def clean_up(self, task):
        """Clean up the deployment environment for this node.

        :param task: a TaskManager instance
        """
        task.driver.boot.clean_up_ramdisk(task)
        provider = dhcp_factory.DHCPFactory()
        provider.clean_dhcp(task)
        # remove the staged configdrive file, if one was created
        irlib_utils.unlink_without_raise(
            _get_configdrive_path(task.node.uuid))

    def take_over(self, task):
        """Take over is not supported; log an error and do nothing."""
        LOG.error("Ansible deploy does not support take over. "
                  "You must redeploy the node %s explicitly.",
                  task.node.uuid)

    def get_clean_steps(self, task):
        """Get the list of clean steps from the file.

        :param task: a TaskManager object containing the node
        :returns: A list of clean step dictionaries
        """
        # priorities set in ironic config override the ones from the file
        new_priorities = {
            'erase_devices': CONF.deploy.erase_devices_priority,
            'erase_devices_metadata':
                CONF.deploy.erase_devices_metadata_priority
        }
        return _get_clean_steps(task.node, interface='deploy',
                                override_priorities=new_priorities)

    @METRICS.timer('AnsibleDeploy.execute_clean_step')
    def execute_clean_step(self, task, step):
        """Execute a clean step.

        :param task: a TaskManager object containing the node
        :param step: a clean step dictionary to execute
        :returns: None
        """
        node = task.node
        playbook, user, key = _parse_ansible_driver_info(
            task.node, action='clean')
        stepname = step['step']

        node_address = _get_node_ip(task)

        node_list = [(node.uuid, node_address, user, node.extra)]

        if node.target_raid_config:
            variables = {'raid_config': node.target_raid_config}
            extra_vars = _prepare_extra_vars(node_list, variables=variables)
        else:
            extra_vars = _prepare_extra_vars(node_list)

        LOG.debug('Starting cleaning step %(step)s on node %(node)s',
                  {'node': node.uuid, 'step': stepname})
        # steps map to ansible tags inside the cleaning playbook
        step_tags = step['args'].get('tags', [])
        LOG.debug("Detected tags from cleaning step: %(tags)s",
                  {'tags': step_tags})
        _run_playbook(node, playbook, extra_vars, key, tags=step_tags)
        LOG.info('Ansible completed cleaning step %(step)s '
                 'on node %(node)s.',
                 {'node': node.uuid, 'step': stepname})

    @METRICS.timer('AnsibleDeploy.prepare_cleaning')
    def prepare_cleaning(self, task):
        """Boot into the ramdisk to prepare for cleaning.

        :param task: a TaskManager object containing the node
        :raises NodeCleaningFailure: if the previous cleaning ports cannot
            be removed or if new cleaning ports cannot be created
        :returns: None or states.CLEANWAIT for async prepare.
        """
        node = task.node
        conductor_steps.set_node_cleaning_steps(task)
        if not node.driver_internal_info['clean_steps']:
            # no clean steps configured, nothing to do.
            return
        fast_track = manager_utils.is_fast_track(task)
        power_state_to_restore = None
        if not fast_track:
            power_state_to_restore = manager_utils.power_on_node_if_needed(
                task)
        task.driver.network.add_cleaning_network(task)
        manager_utils.restore_power_state_if_needed(
            task, power_state_to_restore)
        deploy_utils.prepare_agent_boot(task)
        if not fast_track:
            manager_utils.node_power_action(task, states.REBOOT)
        return states.CLEANWAIT

    @METRICS.timer('AnsibleDeploy.tear_down_cleaning')
    def tear_down_cleaning(self, task):
        """Clean up the PXE and DHCP files after cleaning.

        :param task: a TaskManager object containing the node
        :raises NodeCleaningFailure: if the cleaning ports cannot be
            removed
        """
        fast_track = manager_utils.is_fast_track(task)
        node = task.node
        cleaning_failure = (node.fault == faults.CLEAN_FAILURE)
        # keep the node powered on for fast-track or to aid debugging a
        # failed cleaning
        if not (fast_track or cleaning_failure):
            manager_utils.node_power_action(task, states.POWER_OFF)
        task.driver.boot.clean_up_ramdisk(task)
        power_state_to_restore = manager_utils.power_on_node_if_needed(task)
        task.driver.network.remove_cleaning_network(task)
        if not (fast_track or cleaning_failure):
            manager_utils.restore_power_state_if_needed(
                task, power_state_to_restore)

    @METRICS.timer('AnsibleDeploy.write_image')
    @base.deploy_step(priority=80)
    def write_image(self, task):
        """Deploy step that runs the deploy playbook against the ramdisk."""
        # NOTE(pas-ha) the lock should be already upgraded in heartbeat,
        # just setting its purpose for better logging
        task.upgrade_lock(purpose='deploy')
        # NOTE(pas-ha) this method is called from heartbeat processing only,
        # so we are sure we need this particular method, not the general one
        node_address = _get_node_ip(task)
        self._ansible_deploy(task, node_address)
        LOG.info('Ansible complete deploy on node %s', task.node.uuid)
        manager_utils.node_set_boot_device(task, 'disk', persistent=True)

    @METRICS.timer('AnsibleDeploy.tear_down_agent')
    @base.deploy_step(priority=40)
    @task_manager.require_exclusive_lock
    def tear_down_agent(self, task):
        """A deploy step to tear down the agent.

        Shuts down the machine and removes it from the provisioning
        network.

        :param task: a TaskManager object containing the node
        """
        wait = CONF.ansible.post_deploy_get_power_state_retry_interval
        attempts = CONF.ansible.post_deploy_get_power_state_retries + 1

        # retry both while the node is still powered on and on errors
        @tenacity.retry(
            stop=tenacity.stop_after_attempt(attempts),
            retry=(tenacity.retry_if_result(
                lambda state: state != states.POWER_OFF)
                | tenacity.retry_if_exception_type(Exception)),
            wait=tenacity.wait_fixed(wait),
            reraise=True)
        def _wait_until_powered_off(task):
            return task.driver.power.get_power_state(task)

        node = task.node
        oob_power_off = strutils.bool_from_string(
            node.driver_info.get('deploy_forces_oob_reboot', False))
        try:
            if not oob_power_off:
                try:
                    node_address = _get_node_ip(task)
                    playbook, user, key = _parse_ansible_driver_info(
                        node, action='shutdown')
                    node_list = [(node.uuid, node_address, user, node.extra)]
                    extra_vars = _prepare_extra_vars(node_list)
                    _run_playbook(node, playbook, extra_vars, key)
                    _wait_until_powered_off(task)
                except Exception as e:
                    # NOTE: tenacity.wait_fixed() takes seconds, so the
                    # total wait is wait * (attempts - 1) seconds already;
                    # the former "/ 1000" was a leftover from a
                    # milliseconds-based retry library and under-reported
                    # the timeout in this message.
                    LOG.warning('Failed to soft power off node %(node_uuid)s '
                                'in at least %(timeout)d seconds. '
                                'Error: %(error)s',
                                {'node_uuid': node.uuid,
                                 'timeout': wait * (attempts - 1),
                                 'error': e})
                    # NOTE(pas-ha) flush is a part of deploy playbook
                    # so if it finished successfully we can safely
                    # power off the node out-of-band
                    manager_utils.node_power_action(task, states.POWER_OFF)
            else:
                manager_utils.node_power_action(task, states.POWER_OFF)
        except Exception as e:
            msg = (_('Error rebooting node %(node)s after deploy. '
                     'Error: %(error)s') %
                   {'node': node.uuid, 'error': e})
            agent_base.log_and_raise_deployment_error(task, msg)