397 lines
16 KiB
Python
397 lines
16 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
"""Functionality related to deploying and undeploying."""
|
|
|
|
import tempfile
|
|
|
|
from ironic_lib import metrics_utils
|
|
from oslo_db import exception as db_exception
|
|
from oslo_log import log
|
|
from oslo_utils import excutils
|
|
|
|
from ironic.common import exception
|
|
from ironic.common.glance_service import service_utils as glance_utils
|
|
from ironic.common.i18n import _
|
|
from ironic.common import images
|
|
from ironic.common import states
|
|
from ironic.common import swift
|
|
from ironic.conductor import notification_utils as notify_utils
|
|
from ironic.conductor import steps as conductor_steps
|
|
from ironic.conductor import task_manager
|
|
from ironic.conductor import utils
|
|
from ironic.conf import CONF
|
|
from ironic.objects import fields
|
|
|
|
LOG = log.getLogger(__name__)
|
|
|
|
METRICS = metrics_utils.get_metrics_logger(__name__)
|
|
|
|
|
|
def validate_node(task, event='deploy'):
    """Check that a node may be deployed to or rebuilt.

    Runs three guard checks in order: the node must not be in
    maintenance, a rebuild must not target a protected node, and the
    requested event must be actionable in the node's current provision
    state.

    :param task: a TaskManager instance.
    :param event: event to process: deploy or rebuild.
    :raises: NodeInMaintenance, NodeProtected, InvalidStateRequested
    """
    node = task.node

    if node.maintenance:
        raise exception.NodeInMaintenance(op=_('provisioning'),
                                          node=node.uuid)

    if event == 'rebuild' and node.protected:
        raise exception.NodeProtected(node=node.uuid)

    if not task.fsm.is_actionable_event(event):
        raise exception.InvalidStateRequested(
            action=event, node=node.uuid, state=node.provision_state)
|
|
|
|
|
|
@METRICS.timer('start_deploy')
@task_manager.require_exclusive_lock
def start_deploy(task, manager, configdrive=None, event='deploy'):
    """Start deployment or rebuilding on a node.

    This function does not check the node suitability for deployment
    (see :func:`validate_node` for that); it's left up to the caller.

    :param task: a TaskManager instance.
    :param manager: a ConductorManager to run tasks on.
    :param configdrive: a configdrive, if requested.
    :param event: event to process: deploy or rebuild.
    :raises: InstanceDeployFailure if the power or deploy interface
        validation fails.
    :raises: InvalidStateRequested if the event cannot be processed in
        the node's current provision state.
    """
    node = task.node

    if event == 'rebuild':
        # Note(gilliard) Clear these to force the driver to
        # check whether they have been changed in glance
        # NOTE(vdrok): If image_source is not from Glance we should
        # not clear kernel and ramdisk as they're input manually
        if glance_utils.is_glance_image(
                node.instance_info.get('image_source')):
            instance_info = node.instance_info
            instance_info.pop('kernel', None)
            instance_info.pop('ramdisk', None)
            node.instance_info = instance_info

    # Infer the image type to make sure the deploy driver
    # validates only the necessary variables for different
    # image types.
    # NOTE(sirushtim): The iwdi variable can be None. It's up to
    # the deploy driver to validate this.
    iwdi = images.is_whole_disk_image(task.context, node.instance_info)
    driver_internal_info = node.driver_internal_info
    driver_internal_info['is_whole_disk_image'] = iwdi
    node.driver_internal_info = driver_internal_info
    # Persist the (possibly cleared) instance_info and the iwdi flag
    # before validation, so the deploy driver sees the updated values.
    node.save()

    try:
        task.driver.power.validate(task)
        task.driver.deploy.validate(task)
        utils.validate_instance_info_traits(task.node)
        # skip_missing=True: in-band steps cannot be resolved before an
        # agent is running, so missing steps are tolerated at this point.
        conductor_steps.validate_deploy_templates(task, skip_missing=True)
    except exception.InvalidParameterValue as e:
        raise exception.InstanceDeployFailure(
            _("Failed to validate deploy or power info for node "
              "%(node_uuid)s. Error: %(msg)s") %
            {'node_uuid': node.uuid, 'msg': e}, code=e.code)

    try:
        # Advance the state machine; the actual deployment work runs in a
        # spawned worker so this RPC-facing call returns promptly.
        task.process_event(
            event,
            callback=manager._spawn_worker,
            call_args=(do_node_deploy, task,
                       manager.conductor.id, configdrive),
            err_handler=utils.provisioning_error_handler)
    except exception.InvalidState:
        raise exception.InvalidStateRequested(
            action=event, node=task.node.uuid,
            state=task.node.provision_state)
|
|
|
|
|
|
@METRICS.timer('do_node_deploy')
@task_manager.require_exclusive_lock
def do_node_deploy(task, conductor_id=None, configdrive=None):
    """Prepare the environment and deploy a node.

    Stores the configdrive (if any), runs the deploy interface's
    prepare step, collects the node's deploy steps and starts executing
    them from the first one. Any failure is routed through
    utils.deploying_error_handler, which moves the node to a failure
    state.

    :param task: a TaskManager instance with an exclusive lock.
    :param conductor_id: the ID of the conductor doing the deployment,
        recorded as the node's conductor_affinity after the first step.
    :param configdrive: a configdrive as a string, or a dict to build
        one from, if requested.
    :raises: InstanceDeployFailure if no deploy steps are available for
        the node. Exceptions from configdrive storage and the prepare
        step are re-raised after error handling.
    """
    node = task.node
    # Clear any leftover deploy-step bookkeeping from a previous attempt.
    utils.wipe_deploy_internal_info(task)
    try:
        if configdrive:
            if isinstance(configdrive, dict):
                configdrive = utils.build_configdrive(node, configdrive)
            _store_configdrive(node, configdrive)
    except (exception.SwiftOperationError, exception.ConfigInvalid) as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Error while uploading the configdrive for %(node)s '
                 'to Swift') % {'node': node.uuid},
                _('Failed to upload the configdrive to Swift. '
                  'Error: %s') % e,
                clean_up=False)
    except db_exception.DBDataError as e:
        with excutils.save_and_reraise_exception():
            # NOTE(hshiina): This error happens when the configdrive is
            # too large. Remove the configdrive from the object to update
            # DB successfully in handling the failure.
            node.obj_reset_changes()
            utils.deploying_error_handler(
                task,
                ('Error while storing the configdrive for %(node)s into '
                 'the database: %(err)s') % {'node': node.uuid, 'err': e},
                _("Failed to store the configdrive in the database. "
                  "%s") % e,
                clean_up=False)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Unexpected error while preparing the configdrive for '
                 'node %(node)s') % {'node': node.uuid},
                _("Failed to prepare the configdrive. Exception: %s") % e,
                traceback=True, clean_up=False)

    try:
        task.driver.deploy.prepare(task)
    except exception.IronicException as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Error while preparing to deploy to node %(node)s: '
                 '%(err)s') % {'node': node.uuid, 'err': e},
                _("Failed to prepare to deploy: %s") % e,
                clean_up=False)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                ('Unexpected error while preparing to deploy to node '
                 '%(node)s') % {'node': node.uuid},
                _("Failed to prepare to deploy. Exception: %s") % e,
                traceback=True, clean_up=False)

    try:
        # This gets the deploy steps (if any) and puts them in the node's
        # driver_internal_info['deploy_steps']. In-band steps are skipped
        # since we know that an agent is not running yet.
        conductor_steps.set_node_deployment_steps(task, skip_missing=True)
    except exception.InstanceDeployFailure as e:
        with excutils.save_and_reraise_exception():
            utils.deploying_error_handler(
                task,
                'Error while getting deploy steps; cannot deploy to node '
                '%(node)s. Error: %(err)s' % {'node': node.uuid, 'err': e},
                _("Cannot get deploy steps; failed to deploy: %s") % e)

    if not node.driver_internal_info.get('deploy_steps'):
        # A deployment with zero steps would silently do nothing; treat it
        # as a hard failure instead.
        msg = _('Error while getting deploy steps: no steps returned for '
                'node %s') % node.uuid
        utils.deploying_error_handler(
            task, msg,
            _("No deploy steps returned by the driver"))
        raise exception.InstanceDeployFailure(msg)

    # Kick off execution from the first step (index 0).
    do_next_deploy_step(task, 0, conductor_id)
|
|
|
|
|
|
@task_manager.require_exclusive_lock
def do_next_deploy_step(task, step_index, conductor_id):
    """Do deployment, starting from the specified deploy step.

    Executes steps synchronously one after another. Execution stops when
    a step reports it is asynchronous (returns states.DEPLOYWAIT), a step
    fails, or all steps are done.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first deploy step in the list to execute. This
        is the index (from 0) into the list of deploy steps in the node's
        driver_internal_info['deploy_steps']. Is None if there are no steps
        to execute.
    :param conductor_id: the ID of the conductor executing the steps,
        stored as the node's conductor_affinity after the first step runs.
    """
    node = task.node
    if step_index is None:
        steps = []
    else:
        steps = node.driver_internal_info['deploy_steps'][step_index:]

    LOG.info('Executing %(state)s on node %(node)s, remaining steps: '
             '%(steps)s', {'node': node.uuid, 'steps': steps,
                           'state': node.provision_state})

    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.deploy_step = step
        driver_internal_info = node.driver_internal_info
        driver_internal_info['deploy_step_index'] = step_index + ind
        node.driver_internal_info = driver_internal_info
        node.save()
        # The step dict names the driver interface ('deploy', 'raid', ...)
        # that implements it.
        interface = getattr(task.driver, step.get('interface'))
        LOG.info('Executing %(step)s on node %(node)s',
                 {'step': step, 'node': node.uuid})
        try:
            result = interface.execute_deploy_step(task, step)
        except exception.IronicException as e:
            if isinstance(e, exception.AgentConnectionFailed):
                if task.node.driver_internal_info.get('deployment_reboot'):
                    # A reboot was requested by a previous step; the agent
                    # will call back when it is up and this same step will
                    # be retried then, so just move to the wait state.
                    LOG.info('Agent is not yet running on node %(node)s after '
                             'deployment reboot, waiting for agent to come up '
                             'to run next deploy step %(step)s.',
                             {'node': node.uuid, 'step': step})
                    driver_internal_info['skip_current_deploy_step'] = False
                    node.driver_internal_info = driver_internal_info
                    task.process_event('wait')
                    return
            log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
                       '%(err)s' %
                       {'node': node.uuid, 'step': node.deploy_step, 'err': e})
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy: Deploy step %(step)s, "
                  "error: %(err)s.") % {
                    'step': node.deploy_step,
                    'err': e})
            return
        except Exception as e:
            log_msg = ('Node %(node)s failed deploy step %(step)s with '
                       'unexpected error: %(err)s' %
                       {'node': node.uuid, 'step': node.deploy_step, 'err': e})
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy. Exception: %s") % e, traceback=True)
            return

        if ind == 0:
            # We've done the very first deploy step.
            # Update conductor_affinity to reference this conductor's ID
            # since there may be local persistent state
            node.conductor_affinity = conductor_id
            node.save()

        # Check if the step is done or not. The step should return
        # states.DEPLOYWAIT if the step is still being executed, or
        # None if the step is done.
        # NOTE(tenbrae): Some drivers may return states.DEPLOYWAIT
        # eg. if they are waiting for a callback
        if result == states.DEPLOYWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_deploy() to continue deploying
            LOG.info('Deploy step %(step)s on node %(node)s being '
                     'executed asynchronously, waiting for driver.',
                     {'node': node.uuid, 'step': step})
            # The step may have already moved the node to DEPLOYWAIT
            # itself; avoid processing 'wait' twice.
            if task.node.provision_state != states.DEPLOYWAIT:
                task.process_event('wait')
            return
        elif result is not None:
            # NOTE(rloo): This is an internal/dev error; shouldn't happen.
            log_msg = (_('While executing deploy step %(step)s on node '
                         '%(node)s, step returned unexpected state: %(val)s')
                       % {'step': step, 'node': node.uuid, 'val': result})
            utils.deploying_error_handler(
                task, log_msg,
                _("Failed to deploy: %s") % node.deploy_step)
            return

        LOG.info('Node %(node)s finished deploy step %(step)s',
                 {'node': node.uuid, 'step': step})

    # Finished executing the steps. Clear deploy_step.
    node.deploy_step = None
    utils.wipe_deploy_internal_info(task)
    node.save()

    _start_console_in_deploy(task)

    task.process_event('done')
    LOG.info('Successfully deployed node %(node)s with '
             'instance %(instance)s.',
             {'node': node.uuid, 'instance': node.instance_uuid})
|
|
|
|
|
|
def _get_configdrive_obj_name(node):
|
|
"""Generate the object name for the config drive."""
|
|
return 'configdrive-%s' % node.uuid
|
|
|
|
|
|
def _store_configdrive(node, configdrive):
    """Handle the storage of the config drive.

    If configured to use the object store, the config drive data is
    uploaded to a Swift endpoint and replaced by a temporary URL.
    Either way, the node's instance_info is updated to include the
    temporary Swift URL from the upload, or, without an upload, the
    actual config drive data, and the node is saved.

    :param node: an Ironic node object.
    :param configdrive: A gzipped and base64 encoded configdrive.
    :raises: SwiftOperationError if an error occur when uploading the
        config drive to the swift endpoint.
    :raises: ConfigInvalid if required keystone authorization credentials
        with swift are missing.
    """
    if CONF.deploy.configdrive_use_object_store:
        # NOTE(lucasagomes): No reason to use a different timeout than
        # the one used for deploying the node
        expiry = (CONF.conductor.configdrive_swift_temp_url_duration
                  or CONF.conductor.deploy_callback_timeout
                  # The documented default in ironic.conf.conductor
                  or 1800)
        container = CONF.conductor.configdrive_swift_container
        object_name = _get_configdrive_obj_name(node)

        # Let Swift delete the object itself once the URL has expired.
        headers = {'X-Delete-After': str(expiry)}

        with tempfile.NamedTemporaryFile(dir=CONF.tempdir,
                                         mode="wt") as tmp:
            tmp.write(configdrive)
            tmp.flush()

            swift_api = swift.SwiftAPI()
            swift_api.create_object(container, object_name, tmp.name,
                                    object_headers=headers)
            # From here on, the configdrive is referenced by URL only.
            configdrive = swift_api.get_temp_url(container, object_name,
                                                 expiry)

    instance_info = node.instance_info
    instance_info['configdrive'] = configdrive
    node.instance_info = instance_info
    node.save()
|
|
|
|
|
|
def _start_console_in_deploy(task):
|
|
"""Start console at the end of deployment.
|
|
|
|
Console is stopped at tearing down not to be exposed to an instance user.
|
|
Then, restart at deployment.
|
|
|
|
:param task: a TaskManager instance with an exclusive lock
|
|
"""
|
|
|
|
if not task.node.console_enabled:
|
|
return
|
|
|
|
notify_utils.emit_console_notification(
|
|
task, 'console_restore', fields.NotificationStatus.START)
|
|
try:
|
|
task.driver.console.start_console(task)
|
|
except Exception as err:
|
|
msg = (_('Failed to start console while deploying the '
|
|
'node %(node)s: %(err)s.') % {'node': task.node.uuid,
|
|
'err': err})
|
|
LOG.error(msg)
|
|
task.node.last_error = msg
|
|
task.node.console_enabled = False
|
|
task.node.save()
|
|
notify_utils.emit_console_notification(
|
|
task, 'console_restore', fields.NotificationStatus.ERROR)
|
|
else:
|
|
notify_utils.emit_console_notification(
|
|
task, 'console_restore', fields.NotificationStatus.END)
|