# (Listing metadata, not part of the module: 748 lines, 30 KiB)

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Ansible deploy driver"""
import json
import os
import shlex
from ironic_lib import utils as irlib_utils
from oslo_concurrency import processutils
from oslo_config import cfg
from oslo_log import log
from oslo_utils import excutils
from oslo_utils import strutils
from oslo_utils import units
import retrying
import six
import six.moves.urllib.parse as urlparse
import yaml
from ironic.common import dhcp_factory
from ironic.common import exception
from ironic.common.glance_service import service_utils
from ironic.common.i18n import _
from ironic.common.i18n import _LE
from ironic.common.i18n import _LI
from ironic.common.i18n import _LW
from ironic.common import image_service
from ironic.common import images
from ironic.common import states
from ironic.common import utils
from ironic.conductor import rpcapi
from ironic.conductor import task_manager
from ironic.conductor import utils as manager_utils
from ironic.conf import CONF
from ironic.drivers import base
from ironic.drivers.modules import deploy_utils
# Configuration options registered under the [ansible] group.
# NOTE(review): the option constructor lines (the opener of each option,
# carrying its name and, for most options, its default) are missing from
# this listing, and the list is never closed. Only the help texts and two
# default expressions remain. Presumably there is one option per help text
# below and the names match the CONF.ansible.* attributes read elsewhere in
# this file -- confirm against the upstream file before restoring.
ansible_opts = [
help=_('Extra arguments to pass on every '
'invocation of Ansible.')),
help=_('Set ansible verbosity level requested when invoking '
'"ansible-playbook" command. '
'4 includes detailed SSH session logging. '
'Default is 4 when global debug is enabled '
'and 0 otherwise.')),
help=_('Path to "ansible-playbook" script. '
'Default will search the $PATH configured for user '
'running ironic-conductor process. '
'Provide the full path when ansible-playbook is not in '
'$PATH or installed in not default location.')),
# Default playbooks directory: "playbooks" next to this module.
default=os.path.join(os.path.dirname(__file__), 'playbooks'),
help=_('Path to directory with playbooks, roles and '
'local inventory.')),
# Default Ansible config: "playbooks/ansible.cfg" next to this module.
os.path.dirname(__file__), 'playbooks', 'ansible.cfg'),
help=_('Path to ansible configuration file. If set to empty, '
'system default will be used.')),
help=_('Number of times to retry getting power state to check '
'if bare metal node has been powered off after a soft '
'power off.')),
help=_('Amount of time (in seconds) to wait between polling '
'power state after trigger soft poweroff.')),
help=_('Extra amount of memory in MiB expected to be consumed '
'by Ansible-related processes on the node. Affects '
'decision whether image will fit into RAM.')),
help=_('Use callback request from ramdisk for start deploy or '
'cleaning. Disable it when using custom ramdisk '
'without callback script. '
'When callback is disabled, Neutron is mandatory.')),
# Make the options available as CONF.ansible.<name>.
CONF.register_opts(ansible_opts, group='ansible')
# Module-level logger.
LOG = log.getLogger(__name__)
# Playbook file names used per driver action when the node's driver_info
# does not name one explicitly (looked up by action: 'deploy' or 'clean').
DEFAULT_PLAYBOOKS = {
    'deploy': 'deploy.yaml',
    'clean': 'clean.yaml',
}
# File name of the default cleaning steps configuration.
DEFAULT_CLEAN_STEPS = 'clean_steps.yaml'
# Human-readable descriptions of the optional driver_info keys understood by
# this driver.
# NOTE(review): the dict assignment that opens these entries, the closing of
# several string concatenations (the trailing "Optional." fragments and
# closing parentheses), the "% DEFAULT_..." operands of the last two entries
# and the dict terminator are missing from this listing. Restore them from
# the upstream file; the container name cannot be determined from here.
'ansible_deploy_username': _('Deploy ramdisk username for Ansible. '
'This user must have passwordless sudo '
'permissions. Default is "ansible". '
'ansible_deploy_key_file': _('Path to private key file. If not specified, '
'default keys for user running '
'ironic-conductor process will be used. '
'Note that for keys with password, those '
'must be pre-loaded into ssh-agent. '
'ansible_deploy_playbook': _('Name of the Ansible playbook used for '
'deployment. Default is %s. Optional.'
) % DEFAULT_PLAYBOOKS['deploy'],
'ansible_clean_playbook': _('Name of the Ansible playbook used for '
'cleaning. Default is %s. Optional.'
'ansible_clean_steps_config': _('Name of the file with default cleaning '
'steps configuration. Default is %s. '
# instance_info keys describing the disk layout; each one is converted to
# int in _parse_partitioning_info.
DISK_LAYOUT_PARAMS = ('root_gb', 'swap_mb', 'ephemeral_gb')
# "inventory" file inside the configured playbooks directory. Evaluated at
# import time, so it reflects CONF.ansible.playbooks_path as of module load.
INVENTORY_FILE = os.path.join(CONF.ansible.playbooks_path, 'inventory')
class PlaybookNotFound(exception.IronicException):
    """Raised when no Ansible playbook can be determined for an action.

    The message is formatted with an ``action`` keyword argument.
    """

    _msg_fmt = _('Failed to set ansible playbook for action %(action)s')
def _parse_ansible_driver_info(node, action='deploy'):
    """Read the Ansible settings for a node from its driver_info.

    :param node: node object; only ``node.driver_info`` is read.
    :param action: driver action the playbook is needed for. It selects the
        ``ansible_<action>_playbook`` key in driver_info and the fallback
        entry in DEFAULT_PLAYBOOKS.
    :returns: tuple ``(playbook, user, key)``. ``user`` defaults to
        ``'ansible'``; ``key`` is None when no key file is configured.
    :raises PlaybookNotFound: if neither driver_info nor DEFAULT_PLAYBOOKS
        yields a playbook for ``action``.
    """
    user = node.driver_info.get('ansible_deploy_username', 'ansible')
    key = node.driver_info.get('ansible_deploy_key_file')
    # Fall back to the built-in playbook for this action; .get() keeps an
    # unknown action from raising KeyError so the check below reports it.
    playbook = node.driver_info.get('ansible_%s_playbook' % action,
                                    DEFAULT_PLAYBOOKS.get(action))
    if not playbook:
        raise PlaybookNotFound(action=action)
    return playbook, user, key
def _get_configdrive_path(basename):
    """Return the path of the configdrive file for ``basename``.

    The file lives in ``CONF.tempdir`` and carries a ``.cndrive`` suffix.
    """
    filename = basename + '.cndrive'
    return os.path.join(CONF.tempdir, filename)
# NOTE(yuriyz): this is a copy from agent driver
def build_instance_info_for_deploy(task):
"""Build instance_info necessary for deploying to a node."""
# Returns the node's instance_info with 'image_url' set (and, for Glance
# images, 'image_checksum' and 'image_disk_format' as well).
# NOTE(review): several lines are missing from this listing: the remaining
# argument(s) of the GlanceImageService(...) call, the right-hand side of
# "image_info =", and the else/try structure that should enclose the
# "except ImageRefValidationFailed" handler for non-Glance sources.
# Restore from the upstream file; the validation call is not visible here.
node = task.node
instance_info = node.instance_info
image_source = instance_info['image_source']
if service_utils.is_glance_image(image_source):
glance = image_service.GlanceImageService(version=2,
image_info =
# Glance-backed image: deploy from a Swift temporary URL.
swift_temp_url = glance.swift_temp_url(image_info)
LOG.debug('Got image info: %(info)s for node %(node)s.',
{'info': image_info, 'node': node.uuid})
instance_info['image_url'] = swift_temp_url
instance_info['image_checksum'] = image_info['checksum']
instance_info['image_disk_format'] = image_info['disk_format']
except exception.ImageRefValidationFailed:
# Log the reason, then re-raise the original exception unchanged.
with excutils.save_and_reraise_exception():
LOG.error(_LE("Ansible deploy supports only HTTP(S) URLs as "
"instance_info['image_source']. Either %s "
"is not a valid HTTP(S) URL or "
"is not reachable."), image_source)
# Non-Glance source: the image_source itself is used as the URL.
instance_info['image_url'] = image_source
return instance_info
def _get_node_ip(task):
    """Return the single IP address assigned to the task's node.

    The addresses are obtained from the configured DHCP provider.

    :param task: a TaskManager object containing the node.
    :returns: the node's only IP address.
    :raises FailedToGetIPAddressOnPort: if the provider reports no address.
    :raises InstanceDeployFailure: if the provider reports more than one
        address, which this driver does not support.
    """
    api = dhcp_factory.DHCPFactory().provider
    ip_addrs = api.get_ip_addresses(task)
    if not ip_addrs:
        raise exception.FailedToGetIPAddressOnPort(_(
            "Failed to get IP address for any port on node %s.") %
            task.node.uuid)
    if len(ip_addrs) > 1:
        error = _("Ansible driver does not support multiple IP addresses "
                  "during deploy or cleaning")
        raise exception.InstanceDeployFailure(reason=error)
    return ip_addrs[0]
# some good code from agent
def _reboot_and_finish_deploy(task):
# Waits for the node to power off after deploy, forces a power-off if that
# fails, then powers the node back on.
# NOTE(review): the retry decorator opener above "retry_on_result=", its
# remaining arguments, and the "try:" block that triggers the soft power-off
# and calls _wait_until_powered_off are missing from this listing. Restore
# them from the upstream file.
# Polling interval in milliseconds (config value is in seconds).
wait = CONF.ansible.post_deploy_get_power_state_retry_interval * 1000
# One initial attempt plus the configured number of retries.
attempts = CONF.ansible.post_deploy_get_power_state_retries + 1
# Keep retrying while the reported power state is not POWER_OFF.
retry_on_result=lambda state: state != states.POWER_OFF,
def _wait_until_powered_off(task):
return task.driver.power.get_power_state(task)
except Exception as e:
LOG.warning(_LW('Failed to soft power off node %(node_uuid)s '
'in at least %(timeout)d seconds. Error: %(error)s'),
{'node_uuid': task.node.uuid,
'timeout': (wait * (attempts - 1)) / 1000,
'error': e})
# Soft power-off failed: force the node off.
manager_utils.node_power_action(task, states.POWER_OFF)
manager_utils.node_power_action(task, states.POWER_ON)
def _prepare_extra_vars(host_list, variables=None):
nodes_var = []
for node_uuid, ip, user, extra in host_list:
nodes_var.append(dict(name=node_uuid, ip=ip, user=user, extra=extra))
extra_vars = dict(ironic_nodes=nodes_var)
if variables:
return extra_vars
def _run_playbook(name, extra_vars, key, tags=None, notags=None):
    """Execute ansible-playbook.

    :param name: playbook file name, relative to CONF.ansible.playbooks_path.
    :param extra_vars: dict passed to Ansible as JSON via ``-e``.
    :param key: path to an SSH private key, or a false value to let Ansible
        use its defaults.
    :param tags: optional list of tags to run (``--tags``).
    :param notags: optional list of tags to skip (``--skip-tags``).
    :returns: tuple ``(stdout, stderr)`` of the ansible-playbook process.
    :raises InstanceDeployFailure: if the process exits with an error.
    """
    playbook = os.path.join(CONF.ansible.playbooks_path, name)
    # NOTE(review): the inventory argument was reconstructed from the
    # otherwise unused INVENTORY_FILE constant -- confirm against upstream.
    args = [CONF.ansible.ansible_playbook_script, playbook,
            '-e', json.dumps(extra_vars),
            '-i', INVENTORY_FILE]

    if CONF.ansible.config_file_path:
        # Point Ansible at the configured ansible.cfg through the
        # environment of the spawned process.
        env = ['env', 'ANSIBLE_CONFIG=%s' % CONF.ansible.config_file_path]
        args = env + args

    if tags:
        args.append('--tags=%s' % ','.join(tags))

    if notags:
        args.append('--skip-tags=%s' % ','.join(notags))

    if key:
        args.append('--private-key=%s' % key)

    verbosity = CONF.ansible.verbosity
    # An unset verbosity follows the global debug flag.
    if verbosity is None and CONF.debug:
        verbosity = 4
    if verbosity:
        args.append('-' + 'v' * verbosity)

    if CONF.ansible.ansible_extra_args:
        # Split shell-style so quoted arguments stay intact.
        args.extend(shlex.split(CONF.ansible.ansible_extra_args))

    try:
        out, err = utils.execute(*args)
        return out, err
    except processutils.ProcessExecutionError as e:
        raise exception.InstanceDeployFailure(reason=e)
def _calculate_memory_req(task):
    """Estimate the memory in MiB needed to hold the node's image.

    :param task: a TaskManager object containing the node.
    :returns: image download size in whole MiB plus
        CONF.ansible.extra_memory.
    """
    source = task.node.instance_info['image_source']
    size_mib = images.download_size(task.context, source) // units.Mi
    return size_mib + CONF.ansible.extra_memory
def _parse_partitioning_info(node):
    """Translate the node's disk layout into variables for the playbook.

    :param node: node object; only ``node.instance_info`` is read.
    :returns: dict with ``ironic_partitions`` (a list of partition dicts
        with the keys ``name``, ``size_mib``, ``boot`` and ``swap``) and,
        when an ephemeral partition is requested, ``ephemeral_format`` and
        ``preserve_ephemeral`` ('yes' or 'no').
    :raises InvalidParameterValue: if a size is not an integer or
        ``preserve_ephemeral`` is not a valid boolean string.
    """
    info = node.instance_info
    i_info = {}

    # check_for_missing_params raises when root_gb is absent; its exception
    # type is defined in deploy_utils and not visible here.
    i_info['root_gb'] = info.get('root_gb')
    error_msg = _("'root_gb' is missing in node's instance_info")
    deploy_utils.check_for_missing_params(i_info, error_msg)

    i_info['swap_mb'] = info.get('swap_mb', 0)
    i_info['ephemeral_gb'] = info.get('ephemeral_gb', 0)
    err_msg_invalid = _("Cannot validate parameter for deploy. Invalid "
                        "parameter %(param)s. Reason: %(reason)s")

    for param in DISK_LAYOUT_PARAMS:
        try:
            i_info[param] = int(i_info[param])
        except (TypeError, ValueError):
            # TypeError covers an explicit None (e.g. swap_mb set to null),
            # which int() rejects with TypeError rather than ValueError.
            reason = _("%s is not an integer value") % i_info[param]
            raise exception.InvalidParameterValue(err_msg_invalid %
                                                  {'param': param,
                                                   'reason': reason})

    # convert to sizes expected by 'parted' Ansible module
    root_mib = 1024 * i_info.pop('root_gb')
    swap_mib = i_info.pop('swap_mb')
    ephemeral_mib = 1024 * i_info.pop('ephemeral_gb')

    partitions = []
    root_partition = {'name': 'root',
                      'size_mib': root_mib,
                      'boot': 'yes',
                      'swap': 'no'}
    partitions.append(root_partition)

    if swap_mib:
        swap_partition = {'name': 'swap',
                          'size_mib': swap_mib,
                          'boot': 'no',
                          'swap': 'yes'}
        partitions.append(swap_partition)

    if ephemeral_mib:
        ephemeral_partition = {'name': 'ephemeral',
                               'size_mib': ephemeral_mib,
                               'boot': 'no',
                               'swap': 'no'}
        partitions.append(ephemeral_partition)

        # NOTE(review): the ephemeral settings are assumed to apply only
        # when an ephemeral partition is requested; the listing lost its
        # indentation, so confirm this nesting against upstream.
        i_info['ephemeral_format'] = info.get('ephemeral_format')
        if not i_info['ephemeral_format']:
            i_info['ephemeral_format'] = CONF.pxe.default_ephemeral_format
        preserve_ephemeral = info.get('preserve_ephemeral', False)
        try:
            i_info['preserve_ephemeral'] = (
                strutils.bool_from_string(preserve_ephemeral, strict=True))
        except ValueError as e:
            raise exception.InvalidParameterValue(
                err_msg_invalid % {'param': 'preserve_ephemeral', 'reason': e})
        # The playbook receives 'yes'/'no' rather than a boolean.
        i_info['preserve_ephemeral'] = (
            'yes' if i_info['preserve_ephemeral'] else 'no')

    i_info['ironic_partitions'] = partitions
    return i_info
def _prepare_variables(task):
    """Build the image and configdrive variables for the deploy playbook.

    :param task: a TaskManager object containing the node.
    :returns: dict with an ``image`` entry (``url``, ``mem_req``,
        ``disk_format`` and optionally ``checksum``) and, when the node has
        a configdrive, a ``configdrive`` entry with ``type`` ('url' or
        'file') and ``location``.
    """
    node = task.node
    i_info = node.instance_info
    image = {
        'url': i_info['image_url'],
        'mem_req': _calculate_memory_req(task),
        'disk_format': i_info.get('image_disk_format'),
    }
    checksum = i_info.get('image_checksum')
    if checksum:
        # NOTE(pas-ha) checksum can be in <algo>:<checksum> format
        # as supported by various Ansible modules, mostly good for
        # standalone Ironic case when instance_info is populated manually.
        # With no <algo> we take that instance_info is populated from Glance,
        # where API reports checksum as MD5 always.
        if ':' not in checksum:
            checksum = 'md5:%s' % checksum
        image['checksum'] = checksum
    variables = {'image': image}
    configdrive = i_info.get('configdrive')
    if configdrive:
        if urlparse.urlparse(configdrive).scheme in ('http', 'https'):
            cfgdrv_type = 'url'
            cfgdrv_location = configdrive
        else:
            # Inline configdrive content: write it to a local file so the
            # playbook can pick it up by path.
            cfgdrv_location = _get_configdrive_path(node.uuid)
            with open(cfgdrv_location, 'w') as f:
                f.write(configdrive)
            cfgdrv_type = 'file'
        variables['configdrive'] = {'type': cfgdrv_type,
                                    'location': cfgdrv_location}
    return variables
def _validate_clean_steps(steps, node_uuid):
    """Check that every clean step definition has its mandatory fields.

    A step must define ``interface``, and every argument marked
    ``required`` must carry a ``value``. Steps without a ``name`` are given
    the name ``'unnamed'`` in place.

    :param steps: list of step dicts loaded from the clean steps file.
    :param node_uuid: UUID of the node, used in the error raised.
    :raises NodeCleaningFailure: if any mandatory field is missing; the
        reason lists every offending step and field.
    """
    missing = []
    for step in steps:
        name = step.setdefault('name', 'unnamed')
        if 'interface' not in step:
            missing.append({'name': name, 'field': 'interface'})
        args = step.get('args', {})
        for arg_name, arg in args.items():
            if arg.get('required', False) and 'value' not in arg:
                missing.append({'name': name,
                                'field': '%s.value' % arg_name})
    if missing:
        err_string = ', '.join(
            'name %(name)s, field %(field)s' % i for i in missing)
        msg = _("Malformed clean_steps file: %s") % err_string
        raise exception.NodeCleaningFailure(node=node_uuid,
                                            reason=msg)
def _get_clean_steps(task, interface=None, override_priorities=None):
    """Get cleaning steps.

    Loads the step definitions from the YAML file named by the node's
    ``ansible_clean_steps_config`` driver_info entry (DEFAULT_CLEAN_STEPS
    when unset), resolved against CONF.ansible.playbooks_path.

    :param task: a TaskManager object containing the node.
    :param interface: when not None, only steps for this interface are
        returned.
    :param override_priorities: optional dict mapping step name to a
        priority; a non-None value replaces the priority from the file.
    :returns: list of clean step dicts with the keys ``interface``,
        ``step``, ``priority``, ``abortable``, ``argsinfo`` and ``args``.
    :raises NodeCleaningFailure: if the file cannot be loaded or is
        malformed.
    """
    clean_steps_file = task.node.driver_info.get('ansible_clean_steps_config',
                                                 DEFAULT_CLEAN_STEPS)
    path = os.path.join(CONF.ansible.playbooks_path, clean_steps_file)
    try:
        with open(path) as f:
            internal_steps = yaml.safe_load(f)
    except Exception as e:
        msg = _('Failed to load clean steps from file '
                '%(file)s: %(exc)s') % {'file': path, 'exc': e}
        raise exception.NodeCleaningFailure(node=task.node.uuid, reason=msg)

    _validate_clean_steps(internal_steps, task.node.uuid)

    steps = []
    override = override_priorities or {}
    for params in internal_steps:
        name = params['name']
        clean_if = params['interface']
        if interface is not None and interface != clean_if:
            # Step belongs to another interface.
            continue
        new_priority = override.get(name)
        priority = (new_priority if new_priority is not None else
                    params.get('priority', 0))
        args = {}
        argsinfo = params.get('args', {})
        for arg, arg_info in argsinfo.items():
            # Move the concrete value out of the argument description so
            # that 'argsinfo' keeps only the metadata.
            args[arg] = arg_info.pop('value', None)
        step = {
            'interface': clean_if,
            'step': name,
            'priority': priority,
            'abortable': False,
            'argsinfo': argsinfo,
            'args': args
        }
        steps.append(step)
    return steps
# taken from agent driver
def _notify_conductor_resume_clean(task):
# Asks the conductor over RPC to continue cleaning the task's node.
# NOTE(review): two pieces are missing from this listing: the argument that
# closes the LOG.debug(...) call (presumably the node UUID -- confirm), and
# the statement that releases the task's lock, which the comment further
# down announces. Restore both from the upstream file.
LOG.debug('Sending RPC to conductor to resume cleaning for node %s',
uuid = task.node.uuid
rpc = rpcapi.ConductorAPI()
# Look up the RPC topic for this node via the conductor API.
topic = rpc.get_topic_for(task.node)
# Need to release the lock to let the conductor take it
rpc.continue_node_clean(task.context, uuid, topic=topic)
def _deploy(task, node_address):
"""Internal function for deployment to a node."""
# NOTE(review): lines are missing from this listing: the body of
# "if iwdi:" (and any else branch, presumably where partitioning variables
# are added for partition images -- confirm), the "LOG.info(_LI(" prefix
# that was fused onto the _run_playbook(...) line, and any steps that follow
# setting the boot device. Restore from the upstream file.
# With ramdisk callback enabled, the playbook's 'wait' tag is skipped.
notags = ['wait'] if CONF.ansible.use_ramdisk_callback else []
node = task.node
LOG.debug('IP of node %(node)s is %(ip)s',
{'node': node.uuid, 'ip': node_address})
# Whole-disk-image flag read from driver_internal_info.
iwdi = node.driver_internal_info.get('is_whole_disk_image')
variables = _prepare_variables(task)
if iwdi:
playbook, user, key = _parse_ansible_driver_info(task.node)
# Single-host list: (uuid, ip, user, extra) as _prepare_extra_vars expects.
node_list = [(node.uuid, node_address, user, node.extra)]
extra_vars = _prepare_extra_vars(node_list, variables=variables)
LOG.debug('Starting deploy on node %s', node.uuid)
# any caller should manage exceptions raised from here
_run_playbook(playbook, extra_vars, key, notags=notags)'Ansible complete deploy on node %s'), node.uuid)
LOG.debug('Rebooting node %s to instance', node.uuid)
# Boot from local disk on subsequent boots.
manager_utils.node_set_boot_device(task, 'disk', persistent=True)
# NOTE(review): this class lost its indentation and a number of statements
# in this listing: "try:" openers, "return" lines, call continuations,
# docstring terminators, and the "LOG.info(_LI(" prefixes that were fused
# onto the preceding line. The notes in front of each method mark what looks
# missing. Restore from the upstream file rather than guessing.
class AnsibleDeploy(base.DeployInterface):
"""Interface for deploy-related actions."""
# NOTE(review): body missing -- presumably returns the driver properties
# mapping; confirm.
def get_properties(self):
"""Return the properties of the interface."""
# Rejects netboot for partition images and requires
# instance_info['image_source'] to be set.
def validate(self, task):
"""Validate the driver-specific Node deployment info."""
node = task.node
iwdi = node.driver_internal_info.get('is_whole_disk_image')
if not iwdi and deploy_utils.get_boot_option(node) == "netboot":
raise exception.InvalidParameterValue(_(
"Node %(node)s is configured to use the %(driver)s driver "
"which does not support netboot.") % {'node': node.uuid,
'driver': node.driver})
params = {}
image_source = node.instance_info.get('image_source')
params['instance_info.image_source'] = image_source
error_msg = _('Node %s failed to validate deploy image info. Some '
'parameters were missing') % node.uuid
deploy_utils.check_for_missing_params(params, error_msg)
# Reboots the node; with ramdisk callback enabled it returns DEPLOYWAIT and
# the deploy continues from heartbeat(), otherwise it deploys right away.
# NOTE(review): the "try:" opener before the _get_node_ip/_deploy calls and
# the "LOG.info(_LI(" prefix fused onto "else:" are missing.
def deploy(self, task):
"""Perform a deployment to a node."""
manager_utils.node_power_action(task, states.REBOOT)
if CONF.ansible.use_ramdisk_callback:
return states.DEPLOYWAIT
node = task.node
ip_addr = _get_node_ip(task)
_deploy(task, ip_addr)
except Exception as e:
error = _('Deploy failed for node %(node)s: '
'Error: %(exc)s') % {'node': node.uuid,
'exc': six.text_type(e)}
self._set_failed_state(task, error)
else:'Deployment to node %s done'), node.uuid)
return states.DEPLOYDONE
# Powers the node off and reports DELETED.
def tear_down(self, task):
"""Tear down a previous deployment on the task's node."""
manager_utils.node_power_action(task, states.POWER_OFF)
return states.DELETED
# NOTE(review): the statement(s) under "if node.provision_state ==
# states.DEPLOYING:" that follow the "adding network-driver dependent
# provisioning ports" comment appear to be missing; which of the following
# lines belong inside each "if" cannot be told without indentation.
def prepare(self, task):
"""Prepare the deployment environment for this node."""
node = task.node
# TODO(pas-ha) investigate takeover scenario
if node.provision_state == states.DEPLOYING:
# adding network-driver dependent provisioning ports
manager_utils.node_power_action(task, states.POWER_OFF)
if node.provision_state not in [states.ACTIVE, states.ADOPTING]:
node.instance_info = build_instance_info_for_deploy(task)
boot_opt = deploy_utils.build_agent_options(node)
task.driver.boot.prepare_ramdisk(task, boot_opt)
# NOTE(review): "provider" is created but not used in the visible code; the
# remaining clean-up calls appear to be missing.
def clean_up(self, task):
"""Clean up the deployment environment for this node."""
provider = dhcp_factory.DHCPFactory()
# NOTE(review): the LOG.error(...) call is not closed; its final argument
# (presumably the node UUID -- confirm) is missing.
def take_over(self, task):
LOG.error(_LE("Ansible deploy does not support take over. "
"You must redeploy the node %s explicitly."),
# NOTE(review): the docstring terminator, the closing of "new_priorities"
# and the remaining argument(s) of the _get_clean_steps(...) call are
# missing.
def get_clean_steps(self, task):
"""Get the list of clean steps from the file.
:param task: a TaskManager object containing the node
:returns: A list of clean step dictionaries
new_priorities = {
'erase_devices': CONF.deploy.erase_devices_priority,
return _get_clean_steps(task, interface='deploy',
# NOTE(review): missing here: the docstring terminator, the "try:" opener
# before reading 'ansible_cleaning_ip', the end of the NodeCleaningFailure
# reason string, the "try:" opener and remaining arguments of the
# _run_playbook(...) call, and the "LOG.info(_LI(" prefix fused onto "else:".
def execute_clean_step(self, task, step):
"""Execute a clean step.
:param task: a TaskManager object containing the node
:param step: a clean step dictionary to execute
:returns: None
node = task.node
playbook, user, key = _parse_ansible_driver_info(
task.node, action='clean')
stepname = step['step']
ip_addr = node.driver_internal_info['ansible_cleaning_ip']
except KeyError:
raise exception.NodeCleaningFailure(node=node.uuid,
reason='undefined node IP '
node_list = [(node.uuid, ip_addr, user, node.extra)]
extra_vars = _prepare_extra_vars(node_list)
LOG.debug('Starting cleaning step %(step)s on node %(node)s',
{'node': node.uuid, 'step': stepname})
step_tags = step['args'].get('tags', [])
_run_playbook(playbook, extra_vars, key,
except exception.InstanceDeployFailure as e:
LOG.error(_LE("Ansible failed cleaning step %(step)s "
"on node %(node)s."), {
'node': node.uuid, 'step': stepname})
manager_utils.cleaning_error_handler(task, six.text_type(e))
else:'Ansible completed cleaning step %(step)s '
'on node %(node)s.'),
{'node': node.uuid, 'step': stepname})
# NOTE(review): missing here: the docstring terminator, the statement under
# "nothing to do" (presumably an early return -- confirm), and the
# "LOG.info(_LI(" prefix fused onto the final _run_playbook(...) line.
def prepare_cleaning(self, task):
"""Boot into the ramdisk to prepare for cleaning.
:param task: a TaskManager object containing the node
:raises NodeCleaningFailure: if the previous cleaning ports cannot
be removed or if new cleaning ports cannot be created
:returns: None or states.CLEANWAIT for async prepare.
node = task.node
use_callback = CONF.ansible.use_ramdisk_callback
if use_callback:
if not node.driver_internal_info['clean_steps']:
# no clean steps configured, nothing to do.
boot_opt = deploy_utils.build_agent_options(node)
task.driver.boot.prepare_ramdisk(task, boot_opt)
manager_utils.node_power_action(task, states.REBOOT)
if use_callback:
return states.CLEANWAIT
ip_addr = _get_node_ip(task)
LOG.debug('IP of node %(node)s is %(ip)s',
{'node': node.uuid, 'ip': ip_addr})
driver_internal_info = node.driver_internal_info
driver_internal_info['ansible_cleaning_ip'] = ip_addr
node.driver_internal_info = driver_internal_info
playbook, user, key = _parse_ansible_driver_info(
task.node, action='clean')
node_list = [(node.uuid, ip_addr, user, node.extra)]
extra_vars = _prepare_extra_vars(node_list)
LOG.debug('Waiting ramdisk on node %s for cleaning', node.uuid)
_run_playbook(playbook, extra_vars, key, tags=['wait'])'Node %s is ready for cleaning'), node.uuid)
# Drops the stored 'ansible_cleaning_ip' and powers the node off.
# NOTE(review): the end of the ":raises" sentence and the docstring
# terminator are missing; further clean-up calls may be missing too.
def tear_down_cleaning(self, task):
"""Clean up the PXE and DHCP files after cleaning.
:param task: a TaskManager object containing the node
:raises NodeCleaningFailure: if the cleaning ports cannot be
node = task.node
driver_internal_info = node.driver_internal_info
driver_internal_info.pop('ansible_cleaning_ip', None)
node.driver_internal_info = driver_internal_info
manager_utils.node_power_action(task, states.POWER_OFF)
# FIXME(pas-ha): remove this workaround after nearest Ironic release
# that contains the specified commit (next after 6.1.0)
# and require this Ironic release
# NOTE(review): the "try:" block, the rest of the warning message (including
# the commit reference) and the fallback call are missing.
def _upgrade_lock(self, task, purpose=None):
except TypeError:
LOG.warning(_LW("To have better logging please update your "
"Ironic installation to contain commit "
# FIXME(pas-ha): remove this workaround after nearest Ironic release
# that contains the specified commit (next after 6.1.0)
# and require this Ironic release
# Calls set_failed_state with collect_logs=False and falls back to calling
# it without that keyword when a TypeError is raised.
# NOTE(review): the "try:" opener and the rest of the warning message are
# missing.
def _set_failed_state(self, task, error):
deploy_utils.set_failed_state(task, error, collect_logs=False)
except TypeError:
LOG.warning(_LW("To have proper error handling please update "
"your Ironic installation to contain commit "
deploy_utils.set_failed_state(task, error)
# Dispatches on the node's provision state: DEPLOYWAIT runs the deploy,
# CLEANWAIT records the cleaning IP, maintenance is ignored.
# NOTE(review): missing here: the "try:" openers in both branches, the
# "LOG.info(_LI(" prefix fused onto "else:", the final argument of the
# 'just booted to start cleaning' debug call, the statement that resumes
# cleaning after the IP is stored, and the "else:" before the final warning.
def heartbeat(self, task, callback_url):
"""Method for ansible ramdisk callback."""
node = task.node
address = urlparse.urlparse(callback_url).netloc.split(':')[0]
if node.maintenance:
# this shouldn't happen often, but skip the rest if it does.
LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
'not taking any action.', {'node': node.uuid})
elif node.provision_state == states.DEPLOYWAIT:
LOG.debug('Heartbeat from %(node)s.', {'node': node.uuid})
self._upgrade_lock(task, purpose='deploy')
node = task.node
_deploy(task, address)
except Exception as e:
error = _('Deploy failed for node %(node)s: '
'Error: %(exc)s') % {'node': node.uuid,
'exc': six.text_type(e)}
self._set_failed_state(task, error)
else:'Deployment to node %s done'), node.uuid)
elif node.provision_state == states.CLEANWAIT:
LOG.debug('Node %s just booted to start cleaning.',
self._upgrade_lock(task, purpose='clean')
node = task.node
driver_internal_info = node.driver_internal_info
driver_internal_info['ansible_cleaning_ip'] = address
node.driver_internal_info = driver_internal_info
except Exception as e:
error = _('cleaning failed for node %(node)s: '
'Error: %(exc)s') % {'node': node.uuid,
'exc': six.text_type(e)}
manager_utils.cleaning_error_handler(task, error)
LOG.warning(_LW('Call back from %(node)s in invalid provision '
'state %(state)s'),
{'node': node.uuid, 'state': node.provision_state})