
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Functionality related to allocations."""
|
|
|
|
import random
|
|
|
|
from ironic_lib import metrics_utils
|
|
from oslo_config import cfg
|
|
from oslo_log import log
|
|
from oslo_utils import excutils
|
|
import retrying
|
|
|
|
from ironic.common import exception
|
|
from ironic.common.i18n import _
|
|
from ironic.common import states
|
|
from ironic.conductor import task_manager
|
|
from ironic import objects
|
|
|
|
|
|
CONF = cfg.CONF
|
|
LOG = log.getLogger(__name__)
|
|
METRICS = metrics_utils.get_metrics_logger(__name__)
|
|
|
|
|
|
def do_allocate(context, allocation):
    """Process the allocation.

    This call runs in a separate thread on a conductor. It finds suitable
    nodes for the allocation and reserves one of them.

    This call does not raise exceptions since it's designed to work
    asynchronously.

    :param context: an admin context
    :param allocation: an allocation object
    """
    try:
        nodes = _candidate_nodes(context, allocation)
        _allocate_node(context, allocation, nodes)
    except exception.AllocationFailed as exc:
        LOG.error(str(exc))
        _allocation_failed(allocation, exc)
    except Exception as exc:
        LOG.exception("Unexpected exception during processing of "
                      "allocation %s", allocation.uuid)
        reason = _("Unexpected exception during allocation: %s") % exc
        _allocation_failed(allocation, reason)


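# For illustration only, not part of the original module: the docstring
# above says do_allocate runs in a separate thread on a conductor, so a
# caller would normally spawn it rather than invoke it inline. A minimal
# sketch, assuming an eventlet-style worker (the actual dispatch lives in
# the conductor manager):
#
#     import eventlet
#     eventlet.spawn_n(do_allocate, context, allocation)

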
def verify_node_for_deallocation(node, allocation):
    """Verify that the allocation can be removed for the node.

    :param node: a node object
    :param allocation: an allocation object associated with the node
    """
    if node.maintenance:
        # Allocations can always be removed in maintenance mode.
        return

    if (node.target_provision_state
            and node.provision_state not in states.UPDATE_ALLOWED_STATES):
        msg = (_("Cannot remove allocation %(uuid)s for node %(node)s, "
                 "because the node is in state %(state)s where updates are "
                 "not allowed (and maintenance mode is off)") %
               {'node': node.uuid, 'uuid': allocation.uuid,
                'state': node.provision_state})
        raise exception.InvalidState(msg)

    if node.provision_state == states.ACTIVE:
        msg = (_("Cannot remove allocation %(uuid)s for node %(node)s, "
                 "because the node is active (and maintenance mode is off)") %
               {'node': node.uuid, 'uuid': allocation.uuid})
        raise exception.InvalidState(msg)


def _allocation_failed(allocation, reason):
    """Failure handler for the allocation."""
    try:
        allocation.node_id = None
        allocation.state = states.ERROR
        allocation.last_error = str(reason)
        allocation.save()
    except exception.AllocationNotFound as exc:
        LOG.debug('Not saving a failed allocation: %s', exc)
    except Exception:
        LOG.exception('Could not save the failed allocation %s',
                      allocation.uuid)


def _traits_match(traits, node):
    """Check whether the node has all of the requested traits."""
    return {t.trait for t in node.traits.objects}.issuperset(traits)


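# For illustration only, not part of the original module: _traits_match
# is a superset check, so the node must carry *all* requested traits but
# may have extras. With hypothetical trait names:
#
#     _traits_match({'CUSTOM_FOO'}, node_with_foo_and_bar)  # True
#     _traits_match({'CUSTOM_FOO'}, node_with_bar_only)     # False

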
def _candidate_nodes(context, allocation):
    """Get a list of candidate nodes for the allocation."""
    filters = {'resource_class': allocation.resource_class,
               'provision_state': states.AVAILABLE,
               'associated': False,
               'with_power_state': True,
               'maintenance': False}
    if allocation.candidate_nodes:
        # NOTE(dtantsur): we assume that candidate_nodes were converted to
        # UUIDs on the API level.
        filters['uuid_in'] = allocation.candidate_nodes

    nodes = objects.Node.list(context, filters=filters)

    if not nodes:
        if allocation.candidate_nodes:
            error = _("none of the requested nodes are available and match "
                      "the resource class %s") % allocation.resource_class
        else:
            error = _("no available nodes match the resource class %s") % (
                allocation.resource_class)
        raise exception.AllocationFailed(uuid=allocation.uuid, error=error)

    # TODO(dtantsur): database-level filtering?
    if allocation.traits:
        traits = set(allocation.traits)
        nodes = [n for n in nodes if _traits_match(traits, n)]
        if not nodes:
            error = (_("no suitable nodes have the requested traits %s") %
                     ', '.join(traits))
            raise exception.AllocationFailed(uuid=allocation.uuid,
                                             error=error)

    # NOTE(dtantsur): make sure that parallel allocations do not try the
    # nodes in the same order.
    random.shuffle(nodes)

    LOG.debug('%(count)d nodes are candidates for allocation %(uuid)s',
              {'count': len(nodes), 'uuid': allocation.uuid})
    return nodes


def _verify_node(node, allocation):
    """Check that the node still satisfies the request."""
    if node.maintenance:
        LOG.debug('Node %s is now in maintenance, skipping',
                  node.uuid)
        return False

    if node.instance_uuid:
        LOG.debug('Node %(node)s is already associated with instance '
                  '%(inst)s, skipping',
                  {'node': node.uuid, 'inst': node.instance_uuid})
        return False

    if node.provision_state != states.AVAILABLE:
        LOG.debug('Node %s is no longer available, skipping',
                  node.uuid)
        return False

    if node.resource_class != allocation.resource_class:
        LOG.debug('Resource class of node %(node)s no longer '
                  'matches requested resource class %(rsc)s for '
                  'allocation %(uuid)s, skipping',
                  {'node': node.uuid,
                   'rsc': allocation.resource_class,
                   'uuid': allocation.uuid})
        return False

    if allocation.traits and not _traits_match(set(allocation.traits), node):
        LOG.debug('List of traits of node %(node)s no longer '
                  'matches requested traits %(traits)s for '
                  'allocation %(uuid)s, skipping',
                  {'node': node.uuid,
                   'traits': allocation.traits,
                   'uuid': allocation.uuid})
        return False

    return True


# NOTE(dtantsur): instead of trying to allocate each node
# node_locked_retry_attempts times, we try to allocate *any* node the same
# number of times. This avoids getting stuck on a node reserved e.g. for the
# power sync periodic task.
@retrying.retry(
    retry_on_exception=lambda e: isinstance(e, exception.AllocationFailed),
    stop_max_attempt_number=CONF.conductor.node_locked_retry_attempts,
    wait_fixed=CONF.conductor.node_locked_retry_interval * 1000)
def _allocate_node(context, allocation, nodes):
    """Go through the list of nodes and try to allocate one of them."""
    retry_nodes = []
    for node in nodes:
        try:
            # NOTE(dtantsur): retries are done for all nodes above, so
            # disable per-node retry. Also disable loading the driver,
            # since the current conductor may not have the required
            # hardware type or interfaces (it's picked at random).
            with task_manager.acquire(context, node.uuid, shared=False,
                                      retry=False, load_driver=False,
                                      purpose='allocating') as task:
                # NOTE(dtantsur): double-check the node details, since they
                # could have changed before we acquired the lock.
                if not _verify_node(task.node, allocation):
                    continue

                allocation.node_id = task.node.id
                allocation.state = states.ACTIVE
                # NOTE(dtantsur): the node.instance_uuid and allocation_id
                # are updated inside of the save() call within the same
                # transaction to avoid races. NodeAssociated can be raised
                # if another process allocates this node first.
                allocation.save()
                LOG.info('Node %(node)s has been successfully reserved for '
                         'allocation %(uuid)s',
                         {'node': node.uuid, 'uuid': allocation.uuid})
                return allocation
        except exception.NodeLocked:
            LOG.debug('Node %s is currently locked, moving to the next one',
                      node.uuid)
            retry_nodes.append(node)
        except exception.NodeAssociated:
            LOG.debug('Node %s is already associated, moving to the next one',
                      node.uuid)

    # NOTE(dtantsur): rewrite the passed list to only contain the nodes that
    # are worth retrying. Do not include nodes that are no longer suitable.
    nodes[:] = retry_nodes

    if nodes:
        error = _('could not reserve any of %d suitable nodes') % len(nodes)
    else:
        error = _('all nodes were filtered out during reservation')

    raise exception.AllocationFailed(uuid=allocation.uuid, error=error)


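# For illustration only, not part of the original module: with the
# (assumed) settings node_locked_retry_attempts = 3 and
# node_locked_retry_interval = 1, the decorator above invokes
# _allocate_node at most 3 times, sleeping 1 * 1000 ms between attempts,
# and each retry only sees the locked nodes kept in the rewritten
# ``nodes`` list, since the same list object is passed in again.

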
def backfill_allocation(context, allocation, node_id):
    """Assign the previously allocated node to the node allocation.

    This is not the actual allocation process, but merely backfilling of
    allocation_uuid for a previously allocated node.

    :param context: an admin context
    :param allocation: an allocation object associated with the node
    :param node_id: An ID of the node.
    :raises: AllocationFailed if the node does not match the allocation
    :raises: NodeAssociated if the node is already associated with another
        instance or allocation.
    :raises: InstanceAssociated if the allocation's UUID is already used
        on another node as instance_uuid.
    :raises: NodeNotFound if the node with the provided ID cannot be found.
    """
    try:
        _do_backfill_allocation(context, allocation, node_id)
    except (exception.AllocationFailed,
            exception.InstanceAssociated,
            exception.NodeAssociated,
            exception.NodeNotFound) as exc:
        with excutils.save_and_reraise_exception():
            LOG.error(str(exc))
            _allocation_failed(allocation, exc)
    except Exception as exc:
        with excutils.save_and_reraise_exception():
            LOG.exception("Unexpected exception during backfilling of "
                          "allocation %s", allocation.uuid)
            reason = _("Unexpected exception during allocation: %s") % exc
            _allocation_failed(allocation, reason)


def _do_backfill_allocation(context, allocation, node_id):
    with task_manager.acquire(context, node_id,
                              purpose='allocation backfilling') as task:
        node = task.node

        errors = []

        # NOTE(dtantsur): this feature is not designed to bypass the
        # allocation mechanism, but to backfill allocations for active
        # nodes, hence this check.
        if node.provision_state != states.ACTIVE:
            errors.append(_('Node must be in the "active" state, but the '
                            'current state is "%s"') % node.provision_state)

        # NOTE(dtantsur): double-check that the node is still suitable.
        if (allocation.resource_class
                and node.resource_class != allocation.resource_class):
            errors.append(_('Resource class %(curr)s does not match '
                            'the requested resource class %(rsc)s')
                          % {'curr': node.resource_class,
                             'rsc': allocation.resource_class})
        if (allocation.traits
                and not _traits_match(set(allocation.traits), node)):
            errors.append(_('List of traits %(curr)s does not match '
                            'the requested traits %(traits)s')
                          % {'curr': node.traits,
                             'traits': allocation.traits})
        if (allocation.candidate_nodes
                and node.uuid not in allocation.candidate_nodes):
            errors.append(_('Candidate nodes must be empty or contain the '
                            'target node, but got %s')
                          % allocation.candidate_nodes)

        if errors:
            error = _('Cannot backfill an allocation for node %(node)s: '
                      '%(errors)s') % {'node': node.uuid,
                                       'errors': '; '.join(errors)}
            raise exception.AllocationFailed(uuid=allocation.uuid,
                                             error=error)

        allocation.node_id = task.node.id
        allocation.state = states.ACTIVE
        # NOTE(dtantsur): the node.instance_uuid and allocation_id are
        # updated inside of the save() call within the same
        # transaction to avoid races. NodeAssociated can be raised if
        # another process allocates this node first.
        allocation.save()
        LOG.info('Node %(node)s has been successfully reserved for '
                 'allocation %(uuid)s',
                 {'node': node.uuid, 'uuid': allocation.uuid})