4a10f8eaa7
The MigrationTask in the conductor already checks the service version as old computes cannot support migration with QoS port. However it is still possible that every compute is new but the compute RPC API is pinned to < 5.2. In this case the migration still cannot be supported. This patch adds an extra RPC version check to the conductor. Change-Id: Ib4e0b9ab050a59ab5a290e6eecea01b87c3bd4c6 Closes-Bug: #1844993
456 lines
21 KiB
Python
456 lines
21 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from oslo_log import log as logging
|
|
from oslo_serialization import jsonutils
|
|
|
|
from nova import availability_zones
|
|
from nova.compute import utils as compute_utils
|
|
from nova.conductor.tasks import base
|
|
from nova import exception
|
|
from nova.i18n import _
|
|
from nova import objects
|
|
from nova.scheduler.client import report
|
|
from nova.scheduler import utils as scheduler_utils
|
|
|
|
LOG = logging.getLogger(__name__)
|
|
|
|
|
|
def replace_allocation_with_migration(context, instance, migration):
|
|
"""Replace instance's allocation with one for a migration.
|
|
|
|
:raises: keystoneauth1.exceptions.base.ClientException on failure to
|
|
communicate with the placement API
|
|
:raises: ConsumerAllocationRetrievalFailed if reading the current
|
|
allocation from placement fails
|
|
:raises: ComputeHostNotFound if the host of the instance is not found in
|
|
the databse
|
|
:raises: AllocationMoveFailed if moving the allocation from the
|
|
instance.uuid to the migration.uuid fails due to parallel
|
|
placement operation on the instance consumer
|
|
:raises: NoValidHost if placement rejectes the update for other reasons
|
|
(e.g. not enough resources)
|
|
:returns: (source_compute_node, migration_allocation)
|
|
"""
|
|
try:
|
|
source_cn = objects.ComputeNode.get_by_host_and_nodename(
|
|
context, instance.host, instance.node)
|
|
except exception.ComputeHostNotFound:
|
|
LOG.error('Unable to find record for source '
|
|
'node %(node)s on %(host)s',
|
|
{'host': instance.host, 'node': instance.node},
|
|
instance=instance)
|
|
# A generic error like this will just error out the migration
|
|
# and do any rollback required
|
|
raise
|
|
|
|
reportclient = report.SchedulerReportClient()
|
|
|
|
orig_alloc = reportclient.get_allocs_for_consumer(
|
|
context, instance.uuid)['allocations']
|
|
root_alloc = orig_alloc.get(source_cn.uuid, {}).get('resources', {})
|
|
if not root_alloc:
|
|
LOG.debug('Unable to find existing allocations for instance on '
|
|
'source compute node: %s. This is normal if you are not '
|
|
'using the FilterScheduler.', source_cn.uuid,
|
|
instance=instance)
|
|
return None, None
|
|
|
|
# FIXME(danms): This method is flawed in that it asssumes allocations
|
|
# against only one provider. So, this may overwite allocations against
|
|
# a shared provider, if we had one.
|
|
success = reportclient.move_allocations(context, instance.uuid,
|
|
migration.uuid)
|
|
if not success:
|
|
LOG.error('Unable to replace resource claim on source '
|
|
'host %(host)s node %(node)s for instance',
|
|
{'host': instance.host,
|
|
'node': instance.node},
|
|
instance=instance)
|
|
# Mimic the "no space" error that could have come from the
|
|
# scheduler. Once we have an atomic replace operation, this
|
|
# would be a severe error.
|
|
raise exception.NoValidHost(
|
|
reason=_('Unable to replace instance claim on source'))
|
|
else:
|
|
LOG.debug('Created allocations for migration %(mig)s on %(rp)s',
|
|
{'mig': migration.uuid, 'rp': source_cn.uuid})
|
|
|
|
return source_cn, orig_alloc
|
|
|
|
|
|
def revert_allocation_for_migration(context, source_cn, instance, migration):
|
|
"""Revert an allocation made for a migration back to the instance."""
|
|
|
|
reportclient = report.SchedulerReportClient()
|
|
|
|
# FIXME(danms): This method is flawed in that it asssumes allocations
|
|
# against only one provider. So, this may overwite allocations against
|
|
# a shared provider, if we had one.
|
|
success = reportclient.move_allocations(context, migration.uuid,
|
|
instance.uuid)
|
|
if not success:
|
|
LOG.error('Unable to replace resource claim on source '
|
|
'host %(host)s node %(node)s for instance',
|
|
{'host': instance.host,
|
|
'node': instance.node},
|
|
instance=instance)
|
|
else:
|
|
LOG.debug('Created allocations for instance %(inst)s on %(rp)s',
|
|
{'inst': instance.uuid, 'rp': source_cn.uuid})
|
|
|
|
|
|
class MigrationTask(base.TaskBase):
|
|
def __init__(self, context, instance, flavor,
|
|
request_spec, clean_shutdown, compute_rpcapi,
|
|
query_client, report_client, host_list, network_api):
|
|
super(MigrationTask, self).__init__(context, instance)
|
|
self.clean_shutdown = clean_shutdown
|
|
self.request_spec = request_spec
|
|
self.flavor = flavor
|
|
|
|
self.compute_rpcapi = compute_rpcapi
|
|
self.query_client = query_client
|
|
self.reportclient = report_client
|
|
self.host_list = host_list
|
|
self.network_api = network_api
|
|
|
|
# Persist things from the happy path so we don't have to look
|
|
# them up if we need to roll back
|
|
self._migration = None
|
|
self._held_allocations = None
|
|
self._source_cn = None
|
|
|
|
def _preallocate_migration(self):
|
|
# If this is a rescheduled migration, don't create a new record.
|
|
migration_type = ("resize" if self.instance.flavor.id != self.flavor.id
|
|
else "migration")
|
|
filters = {"instance_uuid": self.instance.uuid,
|
|
"migration_type": migration_type,
|
|
"status": "pre-migrating"}
|
|
migrations = objects.MigrationList.get_by_filters(self.context,
|
|
filters).objects
|
|
if migrations:
|
|
migration = migrations[0]
|
|
else:
|
|
migration = objects.Migration(context=self.context.elevated())
|
|
migration.old_instance_type_id = self.instance.flavor.id
|
|
migration.new_instance_type_id = self.flavor.id
|
|
migration.status = 'pre-migrating'
|
|
migration.instance_uuid = self.instance.uuid
|
|
migration.source_compute = self.instance.host
|
|
migration.source_node = self.instance.node
|
|
migration.migration_type = migration_type
|
|
migration.create()
|
|
|
|
self._migration = migration
|
|
|
|
self._source_cn, self._held_allocations = (
|
|
replace_allocation_with_migration(self.context,
|
|
self.instance,
|
|
self._migration))
|
|
|
|
return migration
|
|
|
|
def _restrict_request_spec_to_cell(self, legacy_props):
|
|
# NOTE(danms): Right now we only support migrate to the same
|
|
# cell as the current instance, so request that the scheduler
|
|
# limit thusly.
|
|
instance_mapping = objects.InstanceMapping.get_by_instance_uuid(
|
|
self.context, self.instance.uuid)
|
|
LOG.debug('Requesting cell %(cell)s while migrating',
|
|
{'cell': instance_mapping.cell_mapping.identity},
|
|
instance=self.instance)
|
|
if ('requested_destination' in self.request_spec and
|
|
self.request_spec.requested_destination):
|
|
self.request_spec.requested_destination.cell = (
|
|
instance_mapping.cell_mapping)
|
|
# NOTE(takashin): In the case that the target host is specified,
|
|
# if the migration is failed, it is not necessary to retry
|
|
# the cold migration to the same host. So make sure that
|
|
# reschedule will not occur.
|
|
if 'host' in self.request_spec.requested_destination:
|
|
legacy_props.pop('retry', None)
|
|
self.request_spec.retry = None
|
|
else:
|
|
self.request_spec.requested_destination = objects.Destination(
|
|
cell=instance_mapping.cell_mapping)
|
|
|
|
def _support_resource_request(self, selection):
|
|
"""Returns true if the host is new enough to support resource request
|
|
during migration and that the RPC API version is not pinned during
|
|
rolling upgrade.
|
|
"""
|
|
svc = objects.Service.get_by_host_and_binary(
|
|
self.context, selection.service_host, 'nova-compute')
|
|
return (svc.version >= 39 and
|
|
self.compute_rpcapi.supports_resize_with_qos_port(
|
|
self.context))
|
|
|
|
# TODO(gibi): Remove this compat code when nova doesn't need to support
|
|
# Train computes any more.
|
|
def _get_host_supporting_request(self, selection_list):
|
|
"""Return the first compute selection from the selection_list where
|
|
the service is new enough to support resource request during migration
|
|
and the resources claimed successfully.
|
|
|
|
:param selection_list: a list of Selection objects returned by the
|
|
scheduler
|
|
:return: A two tuple. The first item is a Selection object
|
|
representing the host that supports the request. The second item
|
|
is a list of Selection objects representing the remaining alternate
|
|
hosts.
|
|
:raises MaxRetriesExceeded: if none of the hosts in the selection_list
|
|
is new enough to support the request or we cannot claim resource
|
|
on any of the hosts that are new enough.
|
|
"""
|
|
|
|
if not self.request_spec.requested_resources:
|
|
return selection_list[0], selection_list[1:]
|
|
|
|
# Scheduler allocated resources on the first host. So check if the
|
|
# first host is new enough
|
|
if self._support_resource_request(selection_list[0]):
|
|
return selection_list[0], selection_list[1:]
|
|
|
|
# First host is old, so we need to use an alternate. Therefore we have
|
|
# to remove the allocation from the first host.
|
|
self.reportclient.delete_allocation_for_instance(
|
|
self.context, self.instance.uuid)
|
|
LOG.debug(
|
|
'Scheduler returned host %(host)s as a possible migration target '
|
|
'but that host is not new enough to support the migration with '
|
|
'resource request %(request)s or the compute RPC is pinned to '
|
|
'less than 5.2. Trying alternate hosts.',
|
|
{'host': selection_list[0].service_host,
|
|
'request': self.request_spec.requested_resources},
|
|
instance=self.instance)
|
|
|
|
alternates = selection_list[1:]
|
|
|
|
for i, selection in enumerate(alternates):
|
|
if self._support_resource_request(selection):
|
|
# this host is new enough so we need to try to claim resources
|
|
# on it
|
|
if selection.allocation_request:
|
|
alloc_req = jsonutils.loads(
|
|
selection.allocation_request)
|
|
resource_claimed = scheduler_utils.claim_resources(
|
|
self.context, self.reportclient, self.request_spec,
|
|
self.instance.uuid, alloc_req,
|
|
selection.allocation_request_version)
|
|
|
|
if not resource_claimed:
|
|
LOG.debug(
|
|
'Scheduler returned alternate host %(host)s as a '
|
|
'possible migration target but resource claim '
|
|
'failed on that host. Trying another alternate.',
|
|
{'host': selection.service_host},
|
|
instance=self.instance)
|
|
else:
|
|
return selection, alternates[i + 1:]
|
|
|
|
else:
|
|
# Some deployments use different schedulers that do not
|
|
# use Placement, so they will not have an
|
|
# allocation_request to claim with. For those cases,
|
|
# there is no concept of claiming, so just assume that
|
|
# the resources are available.
|
|
return selection, alternates[i + 1:]
|
|
|
|
else:
|
|
LOG.debug(
|
|
'Scheduler returned alternate host %(host)s as a possible '
|
|
'migration target but that host is not new enough to '
|
|
'support the migration with resource request %(request)s '
|
|
'or the compute RPC is pinned to less than 5.2. '
|
|
'Trying another alternate.',
|
|
{'host': selection.service_host,
|
|
'request': self.request_spec.requested_resources},
|
|
instance=self.instance)
|
|
|
|
# if we reach this point then none of the hosts was new enough for the
|
|
# request or we failed to claim resources on every alternate
|
|
reason = ("Exhausted all hosts available during compute service level "
|
|
"check for instance %(instance_uuid)s." %
|
|
{"instance_uuid": self.instance.uuid})
|
|
raise exception.MaxRetriesExceeded(reason=reason)
|
|
|
|
def _execute(self):
|
|
# TODO(sbauza): Remove once all the scheduler.utils methods accept a
|
|
# RequestSpec object in the signature.
|
|
legacy_props = self.request_spec.to_legacy_filter_properties_dict()
|
|
scheduler_utils.setup_instance_group(self.context, self.request_spec)
|
|
# If a target host is set in a requested destination,
|
|
# 'populate_retry' need not be executed.
|
|
if not ('requested_destination' in self.request_spec and
|
|
self.request_spec.requested_destination and
|
|
'host' in self.request_spec.requested_destination):
|
|
scheduler_utils.populate_retry(legacy_props,
|
|
self.instance.uuid)
|
|
|
|
# NOTE(sbauza): Force_hosts/nodes needs to be reset
|
|
# if we want to make sure that the next destination
|
|
# is not forced to be the original host
|
|
self.request_spec.reset_forced_destinations()
|
|
|
|
port_res_req = self.network_api.get_requested_resource_for_instance(
|
|
self.context, self.instance.uuid)
|
|
# NOTE(gibi): When cyborg or other module wants to handle similar
|
|
# non-nova resources then here we have to collect all the external
|
|
# resource requests in a single list and add them to the RequestSpec.
|
|
self.request_spec.requested_resources = port_res_req
|
|
|
|
self._restrict_request_spec_to_cell(legacy_props)
|
|
|
|
# Once _preallocate_migration() is done, the source node allocation is
|
|
# moved from the instance consumer to the migration record consumer,
|
|
# and the instance consumer doesn't have any allocations. If this is
|
|
# the first time through here (not a reschedule), select_destinations
|
|
# below will allocate resources on the selected destination node for
|
|
# the instance consumer. If we're rescheduling, host_list is not None
|
|
# and we'll call claim_resources for the instance and the selected
|
|
# alternate. If we exhaust our alternates and raise MaxRetriesExceeded,
|
|
# the rollback() method should revert the allocation swaparoo and move
|
|
# the source node allocation from the migration record back to the
|
|
# instance record.
|
|
migration = self._preallocate_migration()
|
|
|
|
self.request_spec.ensure_project_and_user_id(self.instance)
|
|
self.request_spec.ensure_network_metadata(self.instance)
|
|
compute_utils.heal_reqspec_is_bfv(
|
|
self.context, self.request_spec, self.instance)
|
|
# On an initial call to migrate, 'self.host_list' will be None, so we
|
|
# have to call the scheduler to get a list of acceptable hosts to
|
|
# migrate to. That list will consist of a selected host, along with
|
|
# zero or more alternates. On a reschedule, though, the alternates will
|
|
# be passed to this object and stored in 'self.host_list', so we can
|
|
# pop the first alternate from the list to use for the destination, and
|
|
# pass the remaining alternates to the compute.
|
|
if self.host_list is None:
|
|
selection = self._schedule()
|
|
|
|
else:
|
|
# This is a reschedule that will use the supplied alternate hosts
|
|
# in the host_list as destinations.
|
|
selection = self._reschedule()
|
|
|
|
scheduler_utils.populate_filter_properties(legacy_props, selection)
|
|
# context is not serializable
|
|
legacy_props.pop('context', None)
|
|
|
|
(host, node) = (selection.service_host, selection.nodename)
|
|
|
|
self.instance.availability_zone = (
|
|
availability_zones.get_host_availability_zone(
|
|
self.context, host))
|
|
|
|
LOG.debug("Calling prep_resize with selected host: %s; "
|
|
"Selected node: %s; Alternates: %s", host, node,
|
|
self.host_list, instance=self.instance)
|
|
# RPC cast to the destination host to start the migration process.
|
|
self.compute_rpcapi.prep_resize(
|
|
# NOTE(mriedem): Using request_spec.image here is potentially
|
|
# dangerous if it is not kept up to date (i.e. rebuild/unshelve);
|
|
# seems like the sane thing to do would be to pass the current
|
|
# instance.image_meta since that is what MoveClaim will use for
|
|
# any NUMA topology claims on the destination host...
|
|
self.context, self.instance, self.request_spec.image,
|
|
self.flavor, host, migration,
|
|
request_spec=self.request_spec, filter_properties=legacy_props,
|
|
node=node, clean_shutdown=self.clean_shutdown,
|
|
host_list=self.host_list)
|
|
|
|
def _schedule(self):
|
|
selection_lists = self.query_client.select_destinations(
|
|
self.context, self.request_spec, [self.instance.uuid],
|
|
return_objects=True, return_alternates=True)
|
|
# Since there is only ever one instance to migrate per call, we
|
|
# just need the first returned element.
|
|
selection_list = selection_lists[0]
|
|
|
|
selection, self.host_list = self._get_host_supporting_request(
|
|
selection_list)
|
|
|
|
scheduler_utils.fill_provider_mapping(
|
|
self.context, self.reportclient, self.request_spec, selection)
|
|
return selection
|
|
|
|
def _reschedule(self):
|
|
# Since the resources on these alternates may have been consumed and
|
|
# might not be able to support the migrated instance, we need to first
|
|
# claim the resources to verify the host still has sufficient
|
|
# available resources.
|
|
elevated = self.context.elevated()
|
|
host_available = False
|
|
selection = None
|
|
while self.host_list and not host_available:
|
|
selection = self.host_list.pop(0)
|
|
if (self.request_spec.requested_resources and not
|
|
self._support_resource_request(selection)):
|
|
LOG.debug(
|
|
'Scheduler returned alternate host %(host)s as a possible '
|
|
'migration target for re-schedule but that host is not '
|
|
'new enough to support the migration with resource '
|
|
'request %(request)s. Trying another alternate.',
|
|
{'host': selection.service_host,
|
|
'request': self.request_spec.requested_resources},
|
|
instance=self.instance)
|
|
continue
|
|
if selection.allocation_request:
|
|
alloc_req = jsonutils.loads(selection.allocation_request)
|
|
else:
|
|
alloc_req = None
|
|
if alloc_req:
|
|
# If this call succeeds, the resources on the destination
|
|
# host will be claimed by the instance.
|
|
host_available = scheduler_utils.claim_resources(
|
|
elevated, self.reportclient, self.request_spec,
|
|
self.instance.uuid, alloc_req,
|
|
selection.allocation_request_version)
|
|
if host_available:
|
|
scheduler_utils.fill_provider_mapping(
|
|
self.context, self.reportclient, self.request_spec,
|
|
selection)
|
|
else:
|
|
# Some deployments use different schedulers that do not
|
|
# use Placement, so they will not have an
|
|
# allocation_request to claim with. For those cases,
|
|
# there is no concept of claiming, so just assume that
|
|
# the host is valid.
|
|
host_available = True
|
|
# There are no more available hosts. Raise a MaxRetriesExceeded
|
|
# exception in that case.
|
|
if not host_available:
|
|
reason = ("Exhausted all hosts available for retrying build "
|
|
"failures for instance %(instance_uuid)s." %
|
|
{"instance_uuid": self.instance.uuid})
|
|
raise exception.MaxRetriesExceeded(reason=reason)
|
|
return selection
|
|
|
|
def rollback(self):
|
|
if self._migration:
|
|
self._migration.status = 'error'
|
|
self._migration.save()
|
|
|
|
if not self._held_allocations:
|
|
return
|
|
|
|
# NOTE(danms): We created new-style migration-based
|
|
# allocations for the instance, but failed before we kicked
|
|
# off the migration in the compute. Normally the latter would
|
|
# do that cleanup but we never got that far, so do it here and
|
|
# now.
|
|
|
|
revert_allocation_for_migration(self.context, self._source_cn,
|
|
self.instance, self._migration)
|