nova/nova/conductor/tasks/migrate.py
Balazs Gibizer 4a10f8eaa7 Reject migration with QoS port from conductor if RPC pinned
The MigrationTask in the conductor already checks the service version, as
old computes cannot support migration with QoS port. However, it is still
possible that every compute is new but the compute RPC API is pinned to
< 5.2. In this case the migration still cannot be supported.

This patch adds an extra RPC version check to the conductor.

Change-Id: Ib4e0b9ab050a59ab5a290e6eecea01b87c3bd4c6
Closes-Bug: #1844993
2019-09-24 10:12:05 +02:00


# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from oslo_log import log as logging
from oslo_serialization import jsonutils

from nova import availability_zones
from nova.compute import utils as compute_utils
from nova.conductor.tasks import base
from nova import exception
from nova.i18n import _
from nova import objects
from nova.scheduler.client import report
from nova.scheduler import utils as scheduler_utils

LOG = logging.getLogger(__name__)


def replace_allocation_with_migration(context, instance, migration):
    """Replace instance's allocation with one for a migration.

    :raises: keystoneauth1.exceptions.base.ClientException on failure to
        communicate with the placement API
    :raises: ConsumerAllocationRetrievalFailed if reading the current
        allocation from placement fails
    :raises: ComputeHostNotFound if the host of the instance is not found in
        the database
    :raises: AllocationMoveFailed if moving the allocation from the
        instance.uuid to the migration.uuid fails due to a parallel
        placement operation on the instance consumer
    :raises: NoValidHost if placement rejects the update for other reasons
        (e.g. not enough resources)
    :returns: (source_compute_node, migration_allocation)
    """
try:
source_cn = objects.ComputeNode.get_by_host_and_nodename(
context, instance.host, instance.node)
except exception.ComputeHostNotFound:
LOG.error('Unable to find record for source '
'node %(node)s on %(host)s',
{'host': instance.host, 'node': instance.node},
instance=instance)
# A generic error like this will just error out the migration
# and do any rollback required
raise
reportclient = report.SchedulerReportClient()
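    # Fetch the instance's current allocations from placement; the result is
    # keyed by resource provider UUID, so the source compute node's share can
    # be looked up directly below.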
orig_alloc = reportclient.get_allocs_for_consumer(
context, instance.uuid)['allocations']
root_alloc = orig_alloc.get(source_cn.uuid, {}).get('resources', {})
if not root_alloc:
LOG.debug('Unable to find existing allocations for instance on '
'source compute node: %s. This is normal if you are not '
'using the FilterScheduler.', source_cn.uuid,
instance=instance)
return None, None
    # FIXME(danms): This method is flawed in that it assumes allocations
    # against only one provider. So, this may overwrite allocations against
    # a shared provider, if we had one.
success = reportclient.move_allocations(context, instance.uuid,
migration.uuid)
if not success:
LOG.error('Unable to replace resource claim on source '
'host %(host)s node %(node)s for instance',
{'host': instance.host,
'node': instance.node},
instance=instance)
# Mimic the "no space" error that could have come from the
# scheduler. Once we have an atomic replace operation, this
# would be a severe error.
raise exception.NoValidHost(
reason=_('Unable to replace instance claim on source'))
else:
LOG.debug('Created allocations for migration %(mig)s on %(rp)s',
{'mig': migration.uuid, 'rp': source_cn.uuid})
return source_cn, orig_alloc


def revert_allocation_for_migration(context, source_cn, instance, migration):
    """Revert an allocation made for a migration back to the instance."""
reportclient = report.SchedulerReportClient()
    # FIXME(danms): This method is flawed in that it assumes allocations
    # against only one provider. So, this may overwrite allocations against
    # a shared provider, if we had one.
success = reportclient.move_allocations(context, migration.uuid,
instance.uuid)
if not success:
LOG.error('Unable to replace resource claim on source '
'host %(host)s node %(node)s for instance',
{'host': instance.host,
'node': instance.node},
instance=instance)
else:
LOG.debug('Created allocations for instance %(inst)s on %(rp)s',
{'inst': instance.uuid, 'rp': source_cn.uuid})


class MigrationTask(base.TaskBase):
    def __init__(self, context, instance, flavor,
                 request_spec, clean_shutdown, compute_rpcapi,
                 query_client, report_client, host_list, network_api):
super(MigrationTask, self).__init__(context, instance)
self.clean_shutdown = clean_shutdown
self.request_spec = request_spec
self.flavor = flavor
self.compute_rpcapi = compute_rpcapi
self.query_client = query_client
self.reportclient = report_client
self.host_list = host_list
self.network_api = network_api
# Persist things from the happy path so we don't have to look
# them up if we need to roll back
self._migration = None
self._held_allocations = None
self._source_cn = None

    def _preallocate_migration(self):
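        """Create (or reuse) a 'pre-migrating' Migration record and move the
        source node allocation from the instance consumer to it.
        """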
# If this is a rescheduled migration, don't create a new record.
migration_type = ("resize" if self.instance.flavor.id != self.flavor.id
else "migration")
filters = {"instance_uuid": self.instance.uuid,
"migration_type": migration_type,
"status": "pre-migrating"}
migrations = objects.MigrationList.get_by_filters(self.context,
filters).objects
if migrations:
migration = migrations[0]
else:
migration = objects.Migration(context=self.context.elevated())
migration.old_instance_type_id = self.instance.flavor.id
migration.new_instance_type_id = self.flavor.id
migration.status = 'pre-migrating'
migration.instance_uuid = self.instance.uuid
migration.source_compute = self.instance.host
migration.source_node = self.instance.node
migration.migration_type = migration_type
migration.create()
self._migration = migration
self._source_cn, self._held_allocations = (
replace_allocation_with_migration(self.context,
self.instance,
self._migration))
return migration

    def _restrict_request_spec_to_cell(self, legacy_props):
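        """Restrict the destination in the request spec to the cell the
        instance currently lives in.
        """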
# NOTE(danms): Right now we only support migrate to the same
# cell as the current instance, so request that the scheduler
# limit thusly.
instance_mapping = objects.InstanceMapping.get_by_instance_uuid(
self.context, self.instance.uuid)
LOG.debug('Requesting cell %(cell)s while migrating',
{'cell': instance_mapping.cell_mapping.identity},
instance=self.instance)
if ('requested_destination' in self.request_spec and
self.request_spec.requested_destination):
self.request_spec.requested_destination.cell = (
instance_mapping.cell_mapping)
            # NOTE(takashin): In the case that the target host is specified,
            # if the migration fails, it is not necessary to retry
            # the cold migration to the same host. So make sure that
            # a reschedule will not occur.
if 'host' in self.request_spec.requested_destination:
legacy_props.pop('retry', None)
self.request_spec.retry = None
else:
self.request_spec.requested_destination = objects.Destination(
cell=instance_mapping.cell_mapping)

    def _support_resource_request(self, selection):
        """Return True if the host is new enough to support resource
        requests during migration and the compute RPC API is not pinned
        below 5.2 during a rolling upgrade.
        """
svc = objects.Service.get_by_host_and_binary(
self.context, selection.service_host, 'nova-compute')
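        # Compute service version 39 is the minimum that can handle a
        # migration with a port resource request; the RPC check guards
        # against a compute RPC API pinned below 5.2 during a rolling
        # upgrade.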
return (svc.version >= 39 and
self.compute_rpcapi.supports_resize_with_qos_port(
self.context))

    # TODO(gibi): Remove this compat code when nova doesn't need to support
    # Train computes any more.
    def _get_host_supporting_request(self, selection_list):
        """Return the first compute selection from the selection_list where
        the service is new enough to support resource requests during
        migration and the resources were claimed successfully.

        :param selection_list: a list of Selection objects returned by the
            scheduler
        :return: A two tuple. The first item is a Selection object
            representing the host that supports the request. The second item
            is a list of Selection objects representing the remaining
            alternate hosts.
        :raises MaxRetriesExceeded: if none of the hosts in the
            selection_list is new enough to support the request or we cannot
            claim resources on any of the hosts that are new enough.
        """
if not self.request_spec.requested_resources:
return selection_list[0], selection_list[1:]
# Scheduler allocated resources on the first host. So check if the
# first host is new enough
if self._support_resource_request(selection_list[0]):
return selection_list[0], selection_list[1:]
# First host is old, so we need to use an alternate. Therefore we have
# to remove the allocation from the first host.
self.reportclient.delete_allocation_for_instance(
self.context, self.instance.uuid)
LOG.debug(
'Scheduler returned host %(host)s as a possible migration target '
'but that host is not new enough to support the migration with '
'resource request %(request)s or the compute RPC is pinned to '
'less than 5.2. Trying alternate hosts.',
{'host': selection_list[0].service_host,
'request': self.request_spec.requested_resources},
instance=self.instance)
alternates = selection_list[1:]
for i, selection in enumerate(alternates):
if self._support_resource_request(selection):
# this host is new enough so we need to try to claim resources
# on it
if selection.allocation_request:
alloc_req = jsonutils.loads(
selection.allocation_request)
resource_claimed = scheduler_utils.claim_resources(
self.context, self.reportclient, self.request_spec,
self.instance.uuid, alloc_req,
selection.allocation_request_version)
if not resource_claimed:
LOG.debug(
'Scheduler returned alternate host %(host)s as a '
'possible migration target but resource claim '
'failed on that host. Trying another alternate.',
{'host': selection.service_host},
instance=self.instance)
else:
return selection, alternates[i + 1:]
else:
# Some deployments use different schedulers that do not
# use Placement, so they will not have an
# allocation_request to claim with. For those cases,
# there is no concept of claiming, so just assume that
# the resources are available.
return selection, alternates[i + 1:]
else:
LOG.debug(
'Scheduler returned alternate host %(host)s as a possible '
'migration target but that host is not new enough to '
'support the migration with resource request %(request)s '
'or the compute RPC is pinned to less than 5.2. '
'Trying another alternate.',
{'host': selection.service_host,
'request': self.request_spec.requested_resources},
instance=self.instance)
# if we reach this point then none of the hosts was new enough for the
# request or we failed to claim resources on every alternate
reason = ("Exhausted all hosts available during compute service level "
"check for instance %(instance_uuid)s." %
{"instance_uuid": self.instance.uuid})
raise exception.MaxRetriesExceeded(reason=reason)

    def _execute(self):
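        """Select a destination (scheduling or rescheduling as appropriate)
        and cast prep_resize to it to kick off the migration.
        """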
# TODO(sbauza): Remove once all the scheduler.utils methods accept a
# RequestSpec object in the signature.
legacy_props = self.request_spec.to_legacy_filter_properties_dict()
scheduler_utils.setup_instance_group(self.context, self.request_spec)
# If a target host is set in a requested destination,
# 'populate_retry' need not be executed.
if not ('requested_destination' in self.request_spec and
self.request_spec.requested_destination and
'host' in self.request_spec.requested_destination):
scheduler_utils.populate_retry(legacy_props,
self.instance.uuid)
# NOTE(sbauza): Force_hosts/nodes needs to be reset
# if we want to make sure that the next destination
# is not forced to be the original host
self.request_spec.reset_forced_destinations()
port_res_req = self.network_api.get_requested_resource_for_instance(
self.context, self.instance.uuid)
# NOTE(gibi): When cyborg or other module wants to handle similar
# non-nova resources then here we have to collect all the external
# resource requests in a single list and add them to the RequestSpec.
self.request_spec.requested_resources = port_res_req
self._restrict_request_spec_to_cell(legacy_props)
# Once _preallocate_migration() is done, the source node allocation is
# moved from the instance consumer to the migration record consumer,
# and the instance consumer doesn't have any allocations. If this is
# the first time through here (not a reschedule), select_destinations
# below will allocate resources on the selected destination node for
# the instance consumer. If we're rescheduling, host_list is not None
# and we'll call claim_resources for the instance and the selected
# alternate. If we exhaust our alternates and raise MaxRetriesExceeded,
# the rollback() method should revert the allocation swaparoo and move
# the source node allocation from the migration record back to the
# instance record.
migration = self._preallocate_migration()
self.request_spec.ensure_project_and_user_id(self.instance)
self.request_spec.ensure_network_metadata(self.instance)
compute_utils.heal_reqspec_is_bfv(
self.context, self.request_spec, self.instance)
# On an initial call to migrate, 'self.host_list' will be None, so we
# have to call the scheduler to get a list of acceptable hosts to
# migrate to. That list will consist of a selected host, along with
# zero or more alternates. On a reschedule, though, the alternates will
# be passed to this object and stored in 'self.host_list', so we can
# pop the first alternate from the list to use for the destination, and
# pass the remaining alternates to the compute.
if self.host_list is None:
selection = self._schedule()
else:
# This is a reschedule that will use the supplied alternate hosts
# in the host_list as destinations.
selection = self._reschedule()
scheduler_utils.populate_filter_properties(legacy_props, selection)
# context is not serializable
legacy_props.pop('context', None)
(host, node) = (selection.service_host, selection.nodename)
self.instance.availability_zone = (
availability_zones.get_host_availability_zone(
self.context, host))
LOG.debug("Calling prep_resize with selected host: %s; "
"Selected node: %s; Alternates: %s", host, node,
self.host_list, instance=self.instance)
# RPC cast to the destination host to start the migration process.
self.compute_rpcapi.prep_resize(
# NOTE(mriedem): Using request_spec.image here is potentially
# dangerous if it is not kept up to date (i.e. rebuild/unshelve);
# seems like the sane thing to do would be to pass the current
# instance.image_meta since that is what MoveClaim will use for
# any NUMA topology claims on the destination host...
self.context, self.instance, self.request_spec.image,
self.flavor, host, migration,
request_spec=self.request_spec, filter_properties=legacy_props,
node=node, clean_shutdown=self.clean_shutdown,
host_list=self.host_list)

    def _schedule(self):
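        """Ask the scheduler for destination candidates and pick the first
        one that supports the resource request, if any.
        """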
selection_lists = self.query_client.select_destinations(
self.context, self.request_spec, [self.instance.uuid],
return_objects=True, return_alternates=True)
# Since there is only ever one instance to migrate per call, we
# just need the first returned element.
selection_list = selection_lists[0]
selection, self.host_list = self._get_host_supporting_request(
selection_list)
scheduler_utils.fill_provider_mapping(
self.context, self.reportclient, self.request_spec, selection)
return selection

    def _reschedule(self):
# Since the resources on these alternates may have been consumed and
# might not be able to support the migrated instance, we need to first
# claim the resources to verify the host still has sufficient
# available resources.
elevated = self.context.elevated()
host_available = False
selection = None
while self.host_list and not host_available:
selection = self.host_list.pop(0)
if (self.request_spec.requested_resources and not
self._support_resource_request(selection)):
LOG.debug(
'Scheduler returned alternate host %(host)s as a possible '
'migration target for re-schedule but that host is not '
'new enough to support the migration with resource '
'request %(request)s. Trying another alternate.',
{'host': selection.service_host,
'request': self.request_spec.requested_resources},
instance=self.instance)
continue
if selection.allocation_request:
alloc_req = jsonutils.loads(selection.allocation_request)
else:
alloc_req = None
if alloc_req:
# If this call succeeds, the resources on the destination
# host will be claimed by the instance.
host_available = scheduler_utils.claim_resources(
elevated, self.reportclient, self.request_spec,
self.instance.uuid, alloc_req,
selection.allocation_request_version)
if host_available:
scheduler_utils.fill_provider_mapping(
self.context, self.reportclient, self.request_spec,
selection)
else:
# Some deployments use different schedulers that do not
# use Placement, so they will not have an
# allocation_request to claim with. For those cases,
# there is no concept of claiming, so just assume that
# the host is valid.
host_available = True
# There are no more available hosts. Raise a MaxRetriesExceeded
# exception in that case.
if not host_available:
reason = ("Exhausted all hosts available for retrying build "
"failures for instance %(instance_uuid)s." %
{"instance_uuid": self.instance.uuid})
raise exception.MaxRetriesExceeded(reason=reason)
return selection

    def rollback(self):
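        """Mark the migration record as failed and return any held source
        node allocation to the instance.
        """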
if self._migration:
self._migration.status = 'error'
self._migration.save()
if not self._held_allocations:
return
# NOTE(danms): We created new-style migration-based
# allocations for the instance, but failed before we kicked
# off the migration in the compute. Normally the latter would
# do that cleanup but we never got that far, so do it here and
# now.
revert_allocation_for_migration(self.context, self._source_cn,
self.instance, self._migration)