nova/nova/conductor/tasks/migrate.py

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from oslo_log import log as logging
from oslo_serialization import jsonutils

from nova import availability_zones
from nova.compute import utils as compute_utils
from nova.conductor.tasks import base
from nova import exception
from nova.i18n import _
from nova import objects
from nova.scheduler.client import report
from nova.scheduler import utils as scheduler_utils

LOG = logging.getLogger(__name__)


def replace_allocation_with_migration(context, instance, migration):
"""Replace instance's allocation with one for a migration.
:raises: keystoneauth1.exceptions.base.ClientException on failure to
communicate with the placement API
:raises: ConsumerAllocationRetrievalFailed if reading the current
allocation from placement fails
:raises: ComputeHostNotFound if the host of the instance is not found in
the databse
:raises: AllocationMoveFailed if moving the allocation from the
instance.uuid to the migration.uuid fails due to parallel
placement operation on the instance consumer
:raises: NoValidHost if placement rejectes the update for other reasons
(e.g. not enough resources)
:returns: (source_compute_node, migration_allocation)
"""
try:
source_cn = objects.ComputeNode.get_by_host_and_nodename(
context, instance.host, instance.node)
except exception.ComputeHostNotFound:
LOG.error('Unable to find record for source '
'node %(node)s on %(host)s',
{'host': instance.host, 'node': instance.node},
instance=instance)
# A generic error like this will just error out the migration
# and do any rollback required
raise
reportclient = report.SchedulerReportClient()
orig_alloc = reportclient.get_allocs_for_consumer(
context, instance.uuid)['allocations']
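    # Check whether the instance actually holds allocations against the
    # source compute node's resource provider before trying to move them.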
root_alloc = orig_alloc.get(source_cn.uuid, {}).get('resources', {})
if not root_alloc:
LOG.debug('Unable to find existing allocations for instance on '
'source compute node: %s. This is normal if you are not '
'using the FilterScheduler.', source_cn.uuid,
instance=instance)
return None, None
    # FIXME(danms): This method is flawed in that it assumes allocations
    # against only one provider. So, this may overwrite allocations against
    # a shared provider, if we had one.
success = reportclient.move_allocations(context, instance.uuid,
migration.uuid)
if not success:
LOG.error('Unable to replace resource claim on source '
'host %(host)s node %(node)s for instance',
{'host': instance.host,
'node': instance.node},
instance=instance)
# Mimic the "no space" error that could have come from the
# scheduler. Once we have an atomic replace operation, this
# would be a severe error.
raise exception.NoValidHost(
reason=_('Unable to replace instance claim on source'))
else:
LOG.debug('Created allocations for migration %(mig)s on %(rp)s',
{'mig': migration.uuid, 'rp': source_cn.uuid})
return source_cn, orig_alloc


def revert_allocation_for_migration(context, source_cn, instance, migration):
"""Revert an allocation made for a migration back to the instance."""
reportclient = report.SchedulerReportClient()
    # FIXME(danms): This method is flawed in that it assumes allocations
    # against only one provider. So, this may overwrite allocations against
    # a shared provider, if we had one.
success = reportclient.move_allocations(context, migration.uuid,
instance.uuid)
if not success:
LOG.error('Unable to replace resource claim on source '
'host %(host)s node %(node)s for instance',
{'host': instance.host,
'node': instance.node},
instance=instance)
else:
LOG.debug('Created allocations for instance %(inst)s on %(rp)s',
{'inst': instance.uuid, 'rp': source_cn.uuid})


class MigrationTask(base.TaskBase):
def __init__(self, context, instance, flavor,
request_spec, clean_shutdown, compute_rpcapi,
query_client, report_client, host_list, network_api):
super(MigrationTask, self).__init__(context, instance)
self.clean_shutdown = clean_shutdown
self.request_spec = request_spec
self.flavor = flavor
self.compute_rpcapi = compute_rpcapi
self.query_client = query_client
self.reportclient = report_client
self.host_list = host_list
self.network_api = network_api
# Persist things from the happy path so we don't have to look
# them up if we need to roll back
self._migration = None
self._held_allocations = None
self._source_cn = None

    def _preallocate_migration(self):
# If this is a rescheduled migration, don't create a new record.
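        # A change of flavor means this is a resize; otherwise it is a cold
        # migration to another host with the same flavor.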
migration_type = ("resize" if self.instance.flavor.id != self.flavor.id
else "migration")
filters = {"instance_uuid": self.instance.uuid,
"migration_type": migration_type,
"status": "pre-migrating"}
migrations = objects.MigrationList.get_by_filters(self.context,
filters).objects
if migrations:
migration = migrations[0]
else:
migration = objects.Migration(context=self.context.elevated())
migration.old_instance_type_id = self.instance.flavor.id
migration.new_instance_type_id = self.flavor.id
migration.status = 'pre-migrating'
migration.instance_uuid = self.instance.uuid
migration.source_compute = self.instance.host
migration.source_node = self.instance.node
migration.migration_type = migration_type
migration.create()
self._migration = migration
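        # Move the source node allocations held by the instance over to the
        # migration record so the instance consumer is free to receive new
        # allocations on the destination node.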
self._source_cn, self._held_allocations = (
replace_allocation_with_migration(self.context,
self.instance,
self._migration))
return migration

    def _restrict_request_spec_to_cell(self, legacy_props):
# NOTE(danms): Right now we only support migrate to the same
# cell as the current instance, so request that the scheduler
# limit thusly.
instance_mapping = objects.InstanceMapping.get_by_instance_uuid(
self.context, self.instance.uuid)
LOG.debug('Requesting cell %(cell)s while migrating',
{'cell': instance_mapping.cell_mapping.identity},
instance=self.instance)
if ('requested_destination' in self.request_spec and
self.request_spec.requested_destination):
self.request_spec.requested_destination.cell = (
instance_mapping.cell_mapping)
            # NOTE(takashin): In the case that the target host is specified,
            # if the migration fails, it is not necessary to retry
            # the cold migration to the same host. So make sure that
            # a reschedule will not occur.
if 'host' in self.request_spec.requested_destination:
legacy_props.pop('retry', None)
self.request_spec.retry = None
else:
self.request_spec.requested_destination = objects.Destination(
cell=instance_mapping.cell_mapping)

    def _support_resource_request(self, selection):
        """Returns true if the host is new enough to support resource request
        during migration.
        """
svc = objects.Service.get_by_host_and_binary(
self.context, selection.service_host, 'nova-compute')
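        # Compute service version 39 is the first version able to handle a
        # migration of an instance whose ports carry resource requests.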
return svc.version >= 39

    # TODO(gibi): Remove this compat code when nova doesn't need to support
    # Train computes any more.
    def _get_host_supporting_request(self, selection_list):
        """Return the first compute selection from the selection_list where
        the service is new enough to support resource request during migration
        and the resources claimed successfully.

        :param selection_list: a list of Selection objects returned by the
            scheduler
        :return: A two-tuple. The first item is a Selection object
            representing the host that supports the request. The second item
            is a list of Selection objects representing the remaining
            alternate hosts.
        :raises MaxRetriesExceeded: if none of the hosts in the selection_list
            is new enough to support the request or we cannot claim resources
            on any of the hosts that are new enough.
        """
if not self.request_spec.requested_resources:
return selection_list[0], selection_list[1:]
# Scheduler allocated resources on the first host. So check if the
# first host is new enough
if self._support_resource_request(selection_list[0]):
return selection_list[0], selection_list[1:]
# First host is old, so we need to use an alternate. Therefore we have
# to remove the allocation from the first host.
self.reportclient.delete_allocation_for_instance(
self.context, self.instance.uuid)
LOG.debug(
'Scheduler returned host %(host)s as a possible migration target '
'but that host is not new enough to support the migration with '
'resource request %(request)s. Trying alternate hosts.',
{'host': selection_list[0].service_host,
'request': self.request_spec.requested_resources},
instance=self.instance)
alternates = selection_list[1:]
for i, selection in enumerate(alternates):
if self._support_resource_request(selection):
# this host is new enough so we need to try to claim resources
# on it
if selection.allocation_request:
alloc_req = jsonutils.loads(
selection.allocation_request)
resource_claimed = scheduler_utils.claim_resources(
self.context, self.reportclient, self.request_spec,
self.instance.uuid, alloc_req,
selection.allocation_request_version)
if not resource_claimed:
LOG.debug(
'Scheduler returned alternate host %(host)s as a '
'possible migration target but resource claim '
'failed on that host. Trying another alternate.',
{'host': selection.service_host},
instance=self.instance)
else:
return selection, alternates[i + 1:]
else:
# Some deployments use different schedulers that do not
# use Placement, so they will not have an
# allocation_request to claim with. For those cases,
# there is no concept of claiming, so just assume that
# the resources are available.
return selection, alternates[i + 1:]
else:
LOG.debug(
'Scheduler returned alternate host %(host)s as a possible '
'migration target but that host is not new enough to '
'support the migration with resource request %(request)s. '
'Trying another alternate.',
{'host': selection.service_host,
'request': self.request_spec.requested_resources},
instance=self.instance)
# if we reach this point then none of the hosts was new enough for the
# request or we failed to claim resources on every alternate
reason = ("Exhausted all hosts available during compute service level "
"check for instance %(instance_uuid)s." %
{"instance_uuid": self.instance.uuid})
raise exception.MaxRetriesExceeded(reason=reason)

    def _execute(self):
# TODO(sbauza): Remove once all the scheduler.utils methods accept a
# RequestSpec object in the signature.
legacy_props = self.request_spec.to_legacy_filter_properties_dict()
scheduler_utils.setup_instance_group(self.context, self.request_spec)
# If a target host is set in a requested destination,
# 'populate_retry' need not be executed.
if not ('requested_destination' in self.request_spec and
self.request_spec.requested_destination and
'host' in self.request_spec.requested_destination):
scheduler_utils.populate_retry(legacy_props,
self.instance.uuid)
# NOTE(sbauza): Force_hosts/nodes needs to be reset
# if we want to make sure that the next destination
# is not forced to be the original host
self.request_spec.reset_forced_destinations()
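        # Gather the resource requests attached to the instance's neutron
        # ports (for example QoS minimum bandwidth guarantees) so the
        # scheduler can take them into account when picking a destination.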
port_res_req = self.network_api.get_requested_resource_for_instance(
self.context, self.instance.uuid)
# NOTE(gibi): When cyborg or other module wants to handle similar
# non-nova resources then here we have to collect all the external
# resource requests in a single list and add them to the RequestSpec.
self.request_spec.requested_resources = port_res_req
self._restrict_request_spec_to_cell(legacy_props)
# Once _preallocate_migration() is done, the source node allocation is
# moved from the instance consumer to the migration record consumer,
# and the instance consumer doesn't have any allocations. If this is
# the first time through here (not a reschedule), select_destinations
# below will allocate resources on the selected destination node for
# the instance consumer. If we're rescheduling, host_list is not None
# and we'll call claim_resources for the instance and the selected
# alternate. If we exhaust our alternates and raise MaxRetriesExceeded,
# the rollback() method should revert the allocation swaparoo and move
# the source node allocation from the migration record back to the
# instance record.
migration = self._preallocate_migration()
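        # Backfill request spec fields that may be missing if the spec was
        # created by an older version of nova: project/user id, network
        # metadata and the boot-from-volume flag.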
self.request_spec.ensure_project_and_user_id(self.instance)
self.request_spec.ensure_network_metadata(self.instance)
compute_utils.heal_reqspec_is_bfv(
self.context, self.request_spec, self.instance)
# On an initial call to migrate, 'self.host_list' will be None, so we
# have to call the scheduler to get a list of acceptable hosts to
# migrate to. That list will consist of a selected host, along with
# zero or more alternates. On a reschedule, though, the alternates will
# be passed to this object and stored in 'self.host_list', so we can
# pop the first alternate from the list to use for the destination, and
# pass the remaining alternates to the compute.
if self.host_list is None:
selection = self._schedule()
else:
# This is a reschedule that will use the supplied alternate hosts
# in the host_list as destinations.
selection = self._reschedule()
scheduler_utils.populate_filter_properties(legacy_props, selection)
# context is not serializable
legacy_props.pop('context', None)
(host, node) = (selection.service_host, selection.nodename)
self.instance.availability_zone = (
availability_zones.get_host_availability_zone(
self.context, host))
LOG.debug("Calling prep_resize with selected host: %s; "
"Selected node: %s; Alternates: %s", host, node,
self.host_list, instance=self.instance)
# RPC cast to the destination host to start the migration process.
self.compute_rpcapi.prep_resize(
# NOTE(mriedem): Using request_spec.image here is potentially
# dangerous if it is not kept up to date (i.e. rebuild/unshelve);
# seems like the sane thing to do would be to pass the current
# instance.image_meta since that is what MoveClaim will use for
# any NUMA topology claims on the destination host...
self.context, self.instance, self.request_spec.image,
self.flavor, host, migration,
request_spec=self.request_spec, filter_properties=legacy_props,
node=node, clean_shutdown=self.clean_shutdown,
host_list=self.host_list)

    def _schedule(self):
selection_lists = self.query_client.select_destinations(
self.context, self.request_spec, [self.instance.uuid],
return_objects=True, return_alternates=True)
# Since there is only ever one instance to migrate per call, we
# just need the first returned element.
selection_list = selection_lists[0]
selection, self.host_list = self._get_host_supporting_request(
selection_list)
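        # Record which resource provider on the selected host fulfills each
        # requested resource group (e.g. a neutron port resource request) so
        # those providers can be referenced later, e.g. when binding ports.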
scheduler_utils.fill_provider_mapping(
self.context, self.reportclient, self.request_spec, selection)
return selection

    def _reschedule(self):
# Since the resources on these alternates may have been consumed and
# might not be able to support the migrated instance, we need to first
# claim the resources to verify the host still has sufficient
# available resources.
elevated = self.context.elevated()
host_available = False
selection = None
while self.host_list and not host_available:
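            # Try the next alternate host chosen by the scheduler during the
            # initial scheduling attempt.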
selection = self.host_list.pop(0)
if (self.request_spec.requested_resources and not
self._support_resource_request(selection)):
LOG.debug(
'Scheduler returned alternate host %(host)s as a possible '
'migration target for re-schedule but that host is not '
'new enough to support the migration with resource '
'request %(request)s. Trying another alternate.',
{'host': selection.service_host,
'request': self.request_spec.requested_resources},
instance=self.instance)
continue
if selection.allocation_request:
alloc_req = jsonutils.loads(selection.allocation_request)
else:
alloc_req = None
if alloc_req:
# If this call succeeds, the resources on the destination
# host will be claimed by the instance.
host_available = scheduler_utils.claim_resources(
elevated, self.reportclient, self.request_spec,
self.instance.uuid, alloc_req,
selection.allocation_request_version)
if host_available:
scheduler_utils.fill_provider_mapping(
self.context, self.reportclient, self.request_spec,
selection)
else:
# Some deployments use different schedulers that do not
# use Placement, so they will not have an
# allocation_request to claim with. For those cases,
# there is no concept of claiming, so just assume that
# the host is valid.
host_available = True
# There are no more available hosts. Raise a MaxRetriesExceeded
# exception in that case.
if not host_available:
reason = ("Exhausted all hosts available for retrying build "
"failures for instance %(instance_uuid)s." %
{"instance_uuid": self.instance.uuid})
raise exception.MaxRetriesExceeded(reason=reason)
return selection

    def rollback(self):
if self._migration:
self._migration.status = 'error'
self._migration.save()
if not self._held_allocations:
return
# NOTE(danms): We created new-style migration-based
# allocations for the instance, but failed before we kicked
# off the migration in the compute. Normally the latter would
# do that cleanup but we never got that far, so do it here and
# now.
revert_allocation_for_migration(self.context, self._source_cn,
self.instance, self._migration)