mogan/mogan/engine/baremetal/ironic/driver.py

561 lines
23 KiB
Python

# Copyright 2016 Huawei Technologies Co.,LTD.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from ironicclient import exc as ironic_exc
from ironicclient import exceptions as client_e
from oslo_log import log as logging
from oslo_service import loopingcall
from oslo_utils import excutils
import six
from mogan.common import exception
from mogan.common.i18n import _
from mogan.common import ironic
from mogan.common import states
from mogan.conf import CONF
from mogan.engine.baremetal import driver as base_driver
from mogan.engine.baremetal.ironic import ironic_states
# Module-level logger shared by every helper and method in this file.
LOG = logging.getLogger(__name__)

# Lookup table used by map_power_state() to translate the power state
# reported by ironic into the engine's own power state constants.
_POWER_STATE_MAP = {
    ironic_states.POWER_ON: states.POWER_ON,
    ironic_states.NOSTATE: states.NOSTATE,
    ironic_states.POWER_OFF: states.POWER_OFF,
}

# Provision states for which destroy() asks ironic to unprovision the
# node; in any other state only the instance info is removed from it.
_UNPROVISION_STATES = (
    ironic_states.ACTIVE,
    ironic_states.DEPLOYFAIL,
    ironic_states.ERROR,
    ironic_states.DEPLOYWAIT,
    ironic_states.DEPLOYING,
)

# Node fields requested from ironic when a single node is looked up.
_NODE_FIELDS = (
    'uuid',
    'power_state',
    'target_power_state',
    'provision_state',
    'target_provision_state',
    'last_error',
    'maintenance',
    'properties',
    'instance_uuid',
)
def map_power_state(state):
    """Translate an ironic power state into the engine's power state.

    :param state: the power state as reported by ironic.
    :returns: the matching entry of ``_POWER_STATE_MAP``; when the state
              is not in the table a warning is logged and
              ``states.NOSTATE`` is returned.
    """
    if state in _POWER_STATE_MAP:
        return _POWER_STATE_MAP[state]

    LOG.warning("Power state %s not found.", state)
    return states.NOSTATE
def _log_ironic_polling(what, node, instance):
    """Emit a debug record describing a node that is still being polled.

    :param what: short description of the awaited transition; it is
                 interpolated after "to" in the log message.
    :param node: the ironic node; its uuid and its power/provision state
                 attributes are read.
    :param instance: the instance object, passed to the logger as the
                     ``instance`` keyword argument.
    """
    def _quoted(value):
        # Wrap real values in double quotes; leave None untouched so it
        # is rendered as a bare None in the log line.
        if value is None:
            return None
        return '"%s"' % value

    details = {
        'what': what,
        'node': node.uuid,
        'power_state': _quoted(node.power_state),
        'tgt_power_state': _quoted(node.target_power_state),
        'prov_state': _quoted(node.provision_state),
        'tgt_prov_state': _quoted(node.target_provision_state),
    }
    LOG.debug('Still waiting for ironic node %(node)s to %(what)s: '
              'power_state=%(power_state)s, '
              'target_power_state=%(tgt_power_state)s, '
              'provision_state=%(prov_state)s, '
              'target_provision_state=%(tgt_prov_state)s',
              details,
              instance=instance)
class IronicDriver(base_driver.BaseEngineDriver):
    """Bare metal engine driver that talks to ironic.

    Every request to ironic goes through ``self.ironicclient.call``.
    """

    def __init__(self):
        super(IronicDriver, self).__init__()
        self.ironicclient = ironic.IronicClientWrapper()

    def _get_node(self, node_uuid):
        """Get a node by its UUID.

        :param node_uuid: UUID of the ironic node.
        :returns: the node, restricted to ``_NODE_FIELDS``.
        """
        return self.ironicclient.call('node.get', node_uuid,
                                      fields=_NODE_FIELDS)

    def _validate_instance_and_node(self, instance):
        """Get the node associated with the instance.

        Check with the Ironic service that this instance is associated with a
        node, and return the node.

        :param instance: The instance object.
        :returns: the node, restricted to ``_NODE_FIELDS``.
        :raises: InstanceNotFound if ironic knows no node for the instance.
        """
        try:
            return self.ironicclient.call('node.get_by_instance_uuid',
                                          instance.uuid, fields=_NODE_FIELDS)
        except ironic_exc.NotFound:
            raise exception.InstanceNotFound(instance_id=instance.uuid)

    def _parse_node_properties(self, node):
        """Helper method to parse the node's properties.

        :param node: the ironic node whose ``properties`` dict is read.
        :returns: a dict with integer 'cpus', 'memory_mb' and 'local_gb'
                  (0 when missing or malformed) plus the raw
                  'capabilities', 'availability_zone' and 'node_type'
                  values (None when missing).
        """
        properties = {}

        for prop in ('cpus', 'memory_mb', 'local_gb'):
            try:
                properties[prop] = int(node.properties.get(prop, 0))
            except (TypeError, ValueError):
                LOG.warning('Node %(uuid)s has a malformed "%(prop)s". '
                            'It should be an integer.',
                            {'uuid': node.uuid, 'prop': prop})
                properties[prop] = 0

        # These values are passed through untouched.
        for prop in ('capabilities', 'availability_zone', 'node_type'):
            properties[prop] = node.properties.get(prop)

        return properties

    def _node_resource(self, node):
        """Helper method to create resource dict from node stats.

        :param node: the ironic node; ``node.ports`` must already be set
                     by the caller because it is copied into the result.
        :returns: a dict describing the node as a schedulable resource.
        """
        properties = self._parse_node_properties(node)

        nodes_extra_specs = {}

        # NOTE(gilliard): To assist with more precise scheduling, if the
        # node.properties contains a key 'capabilities', we expect the value
        # to be of the form "k1:v1,k2:v2,etc.." which we add directly as
        # key/value pairs into the node_extra_specs to be used by the
        # ComputeCapabilitiesFilter
        capabilities = properties['capabilities']
        if capabilities:
            for capability in str(capabilities).split(','):
                parts = capability.split(':')
                if len(parts) == 2 and parts[0] and parts[1]:
                    nodes_extra_specs[parts[0].strip()] = parts[1]
                else:
                    LOG.warning("Ignoring malformed capability '%s'. "
                                "Format should be 'key:val'.", capability)

        return {
            'cpus': properties['cpus'],
            'memory_mb': properties['memory_mb'],
            'hypervisor_type': self._get_hypervisor_type(),
            'availability_zone': str(properties['availability_zone']),
            'node_type': str(properties['node_type']),
            'extra_specs': nodes_extra_specs,
            'node_uuid': str(node.uuid),
            'ports': node.ports,
        }

    def _port_resource(self, port):
        """Helper method to create resource dict from port stats.

        :param port: the ironic port; its ``extra``, ``node_uuid`` and
                     ``uuid`` attributes are read.
        :returns: a dict with the stringified 'port_type', 'node_uuid'
                  and 'port_uuid'.
        """
        return {
            'port_type': str(port.extra.get('port_type')),
            'node_uuid': str(port.node_uuid),
            'port_uuid': str(port.uuid),
        }

    def _add_instance_info_to_node(self, node, instance):
        """Associate the node with the instance and set deploy parameters.

        :param node: the ironic node to update.
        :param instance: The instance object.
        :raises: InstanceDeployFailure if ironic rejects the update.
        """
        patch = [
            # Associate the node with an instance
            {'path': '/instance_uuid', 'op': 'add',
             'value': instance.uuid},
            # Add the required fields to deploy a node.
            {'path': '/instance_info/image_source', 'op': 'add',
             'value': instance.image_uuid},
            # TODO(zhenguo) Add partition support
            {'path': '/instance_info/root_gb', 'op': 'add',
             'value': str(node.properties.get('local_gb', 0))},
        ]

        try:
            # FIXME(lucasagomes): The "retry_on_conflict" parameter was added
            # to basically causes the deployment to fail faster in case the
            # node picked by the scheduler is already associated with another
            # instance due bug #1341420.
            self.ironicclient.call('node.update', node.uuid, patch,
                                   retry_on_conflict=False)
        except ironic_exc.BadRequest:
            msg = (_("Failed to add deploy parameters on node %(node)s "
                     "when provisioning the instance %(instance)s")
                   % {'node': node.uuid, 'instance': instance.uuid})
            LOG.error(msg)
            raise exception.InstanceDeployFailure(msg)

    def _remove_instance_info_from_node(self, node, instance):
        """Remove the instance association and deploy parameters.

        Best effort: a BadRequest from ironic is logged as a warning and
        not propagated.

        :param node: the ironic node to update.
        :param instance: The instance object (used for logging only).
        """
        patch = [{'path': '/instance_info', 'op': 'remove'},
                 {'path': '/instance_uuid', 'op': 'remove'}]
        try:
            self.ironicclient.call('node.update', node.uuid, patch)
        except ironic_exc.BadRequest as e:
            LOG.warning("Failed to remove deploy parameters from node "
                        "%(node)s when unprovisioning the instance "
                        "%(instance)s: %(reason)s",
                        {'node': node.uuid, 'instance': instance.uuid,
                         'reason': six.text_type(e)})

    def _wait_for_active(self, instance):
        """Wait for the node to be marked as ACTIVE in Ironic.

        Meant to be driven by a looping call: returning normally means
        "poll again", LoopingCallDone means the node is ACTIVE.

        :param instance: The instance object; it is refreshed on each call.
        :raises: InstanceDeployAborted if the instance is being deleted,
                 is deleted or is in error.
        :raises: InstanceNotFound if ironic is deleting or has already
                 deleted the node's deployment.
        :raises: InstanceDeployFailure if ironic failed to deploy.
        """
        instance.refresh()
        if instance.status in (states.DELETING, states.ERROR, states.DELETED):
            raise exception.InstanceDeployAborted(
                _("Instance %s provisioning was aborted") % instance.uuid)

        node = self._validate_instance_and_node(instance)

        if node.provision_state == ironic_states.ACTIVE:
            # job is done
            LOG.debug("Ironic node %(node)s is now ACTIVE",
                      dict(node=node.uuid), instance=instance)
            raise loopingcall.LoopingCallDone()

        if node.target_provision_state in (ironic_states.DELETED,
                                           ironic_states.AVAILABLE):
            # ironic is trying to delete it now
            raise exception.InstanceNotFound(instance_id=instance.uuid)

        if node.provision_state in (ironic_states.NOSTATE,
                                    ironic_states.AVAILABLE):
            # ironic already deleted it
            raise exception.InstanceNotFound(instance_id=instance.uuid)

        if node.provision_state == ironic_states.DEPLOYFAIL:
            # ironic failed to deploy
            msg = (_("Failed to provision instance %(inst)s: %(reason)s")
                   % {'inst': instance.uuid, 'reason': node.last_error})
            raise exception.InstanceDeployFailure(msg)

        _log_ironic_polling('become ACTIVE', node, instance)

    def _wait_for_power_state(self, instance, message):
        """Wait for the node to complete a power state change.

        Meant to be driven by a looping call; the change is considered
        complete once the node no longer has a target power state.

        :param instance: The instance object.
        :param message: description of the awaited change, used in the
                        polling log line.
        """
        node = self._validate_instance_and_node(instance)

        if node.target_power_state == ironic_states.NOSTATE:
            raise loopingcall.LoopingCallDone()

        _log_ironic_polling(message, node, instance)

    def _get_hypervisor_type(self):
        """Get hypervisor type."""
        return 'ironic'

    def get_ports_from_node(self, node_uuid, detail=True):
        """List the ports and the port groups of a node.

        :param node_uuid: UUID of the ironic node.
        :param detail: forwarded to both list calls as ``detail``.
        :returns: the node's ports followed by its port groups.
        """
        ports = self.ironicclient.call("node.list_ports",
                                       node_uuid, detail=detail)
        portgroups = self.ironicclient.call("portgroup.list", node=node_uuid,
                                            detail=detail)
        return ports + portgroups

    def plug_vif(self, ironic_port_id, port_id):
        """Record a VIF port id in the ``extra`` field of an ironic port.

        :param ironic_port_id: identifier of the ironic port to update.
        :param port_id: value stored under ``extra/vif_port_id``.
        """
        patch = [{'op': 'add',
                  'path': '/extra/vif_port_id',
                  'value': port_id}]
        self.ironicclient.call("port.update", ironic_port_id, patch)

    def unplug_vif(self, node_interface):
        """Remove the VIF port id from an ironic port, if one is set.

        Best effort: a BadRequest from ironic is logged at debug level
        and otherwise ignored.

        :param node_interface: the ironic port; its ``extra`` and ``uuid``
                               attributes are read.
        """
        if 'vif_port_id' not in node_interface.extra:
            return

        patch = [{'op': 'remove',
                  'path': '/extra/vif_port_id'}]
        try:
            self.ironicclient.call("port.update",
                                   node_interface.uuid, patch)
        except client_e.BadRequest as e:
            # Deliberately not fatal, but leave a trace instead of
            # swallowing the error silently.
            LOG.debug("Ignoring failure to unplug VIF from ironic port "
                      "%(port)s: %(reason)s",
                      {'port': node_interface.uuid,
                       'reason': six.text_type(e)})

    def spawn(self, context, instance):
        """Deploy an instance.

        :param context: The security context.
        :param instance: The instance object.
        :raises: ironic BadRequest if the instance carries no node uuid.
        :raises: ValidationError if ironic's deploy or power validation
                 of the node fails.
        """
        LOG.debug('Spawn called for instance', instance=instance)

        # The engine manager is meant to know the node uuid, so missing uuid
        # is a significant issue. It may mean we've been passed the wrong data.
        node_uuid = instance.node_uuid
        if not node_uuid:
            raise ironic_exc.BadRequest(
                _("Ironic node uuid not supplied to "
                  "driver for instance %s.") % instance.uuid)

        # add instance info to node
        node = self._get_node(node_uuid)
        self._add_instance_info_to_node(node, instance)

        # validate we are ready to do the deploy
        validate_chk = self.ironicclient.call("node.validate", node_uuid)
        if (not validate_chk.deploy.get('result')
                or not validate_chk.power.get('result')):
            # something is wrong. undo what we have done
            # NOTE(review): _cleanup_deploy is not defined in this class;
            # presumably it is inherited from the base driver - confirm.
            self._cleanup_deploy(node, instance)
            raise exception.ValidationError(_(
                "Ironic node: %(id)s failed to validate."
                " (deploy: %(deploy)s, power: %(power)s)")
                % {'id': instance.node_uuid,
                   'deploy': validate_chk.deploy,
                   'power': validate_chk.power})

        # trigger the node deploy
        try:
            self.ironicclient.call("node.set_provision_state", node_uuid,
                                   ironic_states.ACTIVE)
        except Exception as e:
            with excutils.save_and_reraise_exception():
                # Hand the format string and its arguments to the logger
                # separately so that the formatted message is logged.
                LOG.error("Failed to request Ironic to provision instance "
                          "%(inst)s: %(reason)s",
                          {'inst': instance.uuid,
                           'reason': six.text_type(e)})

        timer = loopingcall.FixedIntervalLoopingCall(self._wait_for_active,
                                                     instance)
        try:
            timer.start(interval=CONF.ironic.api_retry_interval).wait()
            LOG.info('Successfully provisioned Ironic node %s',
                     node.uuid, instance=instance)
        except Exception:
            with excutils.save_and_reraise_exception():
                LOG.error("Error deploying instance %(instance)s on "
                          "baremetal node %(node)s.",
                          {'instance': instance.uuid,
                           'node': node_uuid})

    def _unprovision(self, instance, node):
        """This method is called from destroy() to unprovision
        already provisioned node after required checks.

        :param instance: The instance object.
        :param node: the ironic node to unprovision.
        """
        try:
            self.ironicclient.call("node.set_provision_state", node.uuid,
                                   "deleted")
        except Exception as e:
            # if the node is already in a deprovisioned state, continue
            # This should be fixed in Ironic.
            # TODO(deva): This exception should be added to
            #             python-ironicclient and matched directly,
            #             rather than via __name__.
            # NOTE(review): exception instances normally carry no
            # __name__ attribute, so this re-raises unless the exception
            # object sets one itself - confirm the intended match.
            if getattr(e, '__name__', None) != 'InstanceDeployFailure':
                raise

        # using a dict because this is modified in the local method
        data = {'tries': 0}

        def _wait_for_provision_state():
            try:
                node = self._validate_instance_and_node(instance)
            except exception.InstanceNotFound:
                LOG.debug("Instance already removed from Ironic",
                          instance=instance)
                raise loopingcall.LoopingCallDone()

            if node.provision_state in (ironic_states.NOSTATE,
                                        ironic_states.CLEANING,
                                        ironic_states.CLEANWAIT,
                                        ironic_states.CLEANFAIL,
                                        ironic_states.AVAILABLE):
                # From a user standpoint, the node is unprovisioned. If a node
                # gets into CLEANFAIL state, it must be fixed in Ironic, but we
                # can consider the instance unprovisioned.
                LOG.debug("Ironic node %(node)s is in state %(state)s, "
                          "instance is now unprovisioned.",
                          dict(node=node.uuid, state=node.provision_state),
                          instance=instance)
                raise loopingcall.LoopingCallDone()

            if data['tries'] >= CONF.ironic.api_max_retries + 1:
                msg = (_("Error destroying the instance on node %(node)s. "
                         "Provision state still '%(state)s'.")
                       % {'state': node.provision_state,
                          'node': node.uuid})
                LOG.error(msg)
                # NOTE(review): NovaException looks like a leftover from
                # the Nova ironic driver; verify that
                # mogan.common.exception really defines it.
                raise exception.NovaException(msg)
            else:
                data['tries'] += 1

            _log_ironic_polling('unprovision', node, instance)

        # wait for the state transition to finish
        timer = loopingcall.FixedIntervalLoopingCall(_wait_for_provision_state)
        timer.start(interval=CONF.ironic.api_retry_interval).wait()

    def destroy(self, context, instance):
        """Destroy the specified instance, if it can be found.

        :param context: The security context.
        :param instance: The instance object.
        """
        LOG.debug('Destroy called for instance', instance=instance)
        try:
            node = self._validate_instance_and_node(instance)
        except exception.InstanceNotFound:
            LOG.warning("Destroy called on non-existing instance %s.",
                        instance.uuid)
            return

        if node.provision_state in _UNPROVISION_STATES:
            self._unprovision(instance, node)
        else:
            # NOTE(hshiina): if spawn() fails before ironic starts
            #                provisioning, instance information should be
            #                removed from ironic node.
            self._remove_instance_info_from_node(node, instance)

        LOG.info('Successfully unprovisioned Ironic node %s',
                 node.uuid, instance=instance)

    def get_available_resources(self):
        """Helper function to return the available node resources.

        Only unassociated nodes that are out of maintenance and in the
        AVAILABLE provision state are requested. If the nodes cannot be
        fetched from the ironic server, an empty dict is returned; if the
        ports cannot be fetched, every node gets an empty port list.

        :returns: a dict mapping node uuid to the node's resource dict
        """
        # Retrieve nodes
        params = {
            'maintenance': False,
            'detail': True,
            'provision_state': ironic_states.AVAILABLE,
            'associated': False,
            'limit': 0
        }
        try:
            node_list = self.ironicclient.call("node.list", **params)
        except client_e.ClientException as e:
            LOG.exception("Could not get nodes from ironic. Reason: "
                          "%(detail)s", {'detail': six.text_type(e)})
            node_list = []

        # Retrieve ports
        params = {
            'limit': 0,
            'fields': ('uuid', 'node_uuid', 'extra', 'address')
        }
        try:
            port_list = self.ironicclient.call("port.list", **params)
        except client_e.ClientException as e:
            LOG.exception("Could not get ports from ironic. Reason: "
                          "%(detail)s", {'detail': six.text_type(e)})
            port_list = []

        # Group the ports by owning node once, instead of rescanning the
        # whole port list for every node.
        ports_by_node = {}
        for port in port_list:
            ports_by_node.setdefault(port.node_uuid, []).append(port)

        # TODO(zhenguo): Add portgroups resources
        node_resources = {}
        for node in node_list:
            # Add ports to the associated node
            node.ports = [self._port_resource(port)
                          for port in ports_by_node.get(node.uuid, [])]
            node_resources[node.uuid] = self._node_resource(node)
        return node_resources

    def get_maintenance_node_list(self):
        """Helper function to return the list of maintenance nodes.

        All associated nodes are requested, with only their
        'instance_uuid' and 'maintenance' fields. If unable to connect
        ironic server, an empty list is returned.

        :returns: a list of maintenance node from ironic
        """
        params = {
            'associated': True,
            'fields': ('instance_uuid', 'maintenance'),
            'limit': 0
        }
        try:
            node_list = self.ironicclient.call("node.list", **params)
        except client_e.ClientException as e:
            LOG.exception("Could not get nodes from ironic. Reason: "
                          "%(detail)s", {'detail': six.text_type(e)})
            node_list = []
        return node_list

    def get_node_power_states(self):
        """Helper function to return the node power states.

        Associated nodes that are out of maintenance are requested, with
        only their 'instance_uuid', 'power_state' and
        'target_power_state' fields. If unable to connect ironic server,
        an empty list is returned.

        :returns: a list of node power states from ironic
        """
        params = {
            'maintenance': False,
            'associated': True,
            'fields': ('instance_uuid', 'power_state', 'target_power_state'),
            'limit': 0
        }
        try:
            node_list = self.ironicclient.call("node.list", **params)
        except client_e.ClientException as e:
            LOG.exception("Could not get nodes from ironic. Reason: "
                          "%(detail)s", {'detail': six.text_type(e)})
            node_list = []
        return node_list

    def get_power_state(self, context, instance_uuid):
        """Return the engine power state of the instance's node.

        :param context: The security context.
        :param instance_uuid: UUID of the instance.
        :returns: the mapped power state; the mapping of
                  ``ironic_states.NOSTATE`` when ironic has no node for
                  the instance.
        """
        try:
            node = self.ironicclient.call('node.get_by_instance_uuid',
                                          instance_uuid,
                                          fields=('power_state',))
            return map_power_state(node.power_state)
        except client_e.NotFound:
            return map_power_state(ironic_states.NOSTATE)

    def set_power_state(self, context, instance, state):
        """Set power state on the specified instance.

        Blocks until the node no longer has a target power state.

        :param context: The security context.
        :param instance: The instance object.
        :param state: the requested power state. "soft_off" and
                      "soft_reboot" are sent to ironic as "off" and
                      "reboot" with ``soft=True``; any other value is
                      passed through unchanged.
        """
        node = self._validate_instance_and_node(instance)
        if state == "soft_off":
            self.ironicclient.call("node.set_power_state",
                                   node.uuid, "off", soft=True)
        elif state == "soft_reboot":
            self.ironicclient.call("node.set_power_state",
                                   node.uuid, "reboot", soft=True)
        else:
            self.ironicclient.call("node.set_power_state",
                                   node.uuid, state)

        timer = loopingcall.FixedIntervalLoopingCall(
            self._wait_for_power_state, instance, state)
        timer.start(interval=CONF.ironic.api_retry_interval).wait()

    def rebuild(self, context, instance):
        """Rebuild/redeploy an instance.

        :param context: The security context.
        :param instance: The instance object.
        :raises: InstanceDeployFailure if ironic rejects the rebuild
                 request.
        """
        LOG.debug('Rebuild called for instance', instance=instance)

        # trigger the node rebuild
        try:
            self.ironicclient.call("node.set_provision_state",
                                   instance.node_uuid,
                                   ironic_states.REBUILD)
        except (ironic_exc.InternalServerError,
                ironic_exc.BadRequest) as e:
            msg = (_("Failed to request Ironic to rebuild instance "
                     "%(inst)s: %(reason)s") % {'inst': instance.uuid,
                                                'reason': six.text_type(e)})
            raise exception.InstanceDeployFailure(msg)

        # Although the target provision state is REBUILD, it will actually go
        # to ACTIVE once the redeploy is finished.
        timer = loopingcall.FixedIntervalLoopingCall(self._wait_for_active,
                                                     instance)
        timer.start(interval=CONF.ironic.api_retry_interval).wait()
        LOG.info('Instance was successfully rebuilt', instance=instance)