395 lines
18 KiB
Python
395 lines
18 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from oslo_log import log as logging
|
|
import oslo_messaging as messaging
|
|
import six
|
|
|
|
from nova.compute import power_state
|
|
from nova.conductor.tasks import base
|
|
from nova.conductor.tasks import migrate
|
|
import nova.conf
|
|
from nova import exception
|
|
from nova.i18n import _
|
|
from nova import objects
|
|
from nova.scheduler import utils as scheduler_utils
|
|
from nova import utils
|
|
|
|
LOG = logging.getLogger(__name__)
|
|
CONF = nova.conf.CONF
|
|
|
|
|
|
def should_do_migration_allocation(context):
    """Return True when migration-based allocation handling can be used.

    Checks the minimum service version across all nova-compute services;
    version 25 is the first one that supports holding migration
    allocations in Placement.
    """
    compute_minimum = objects.Service.get_minimum_version_multi(
        context, ['nova-compute'])
    return compute_minimum >= 25
|
|
|
|
|
|
class LiveMigrationTask(base.TaskBase):
    """Orchestrates the live migration of a single instance.

    The task validates the source (and, if forced, destination) host,
    asks the scheduler for a destination when one was not forced,
    manages the instance's Placement allocations for the move, updates
    the Migration record, and finally kicks off the migration on the
    source compute service over RPC.
    """

    def __init__(self, context, instance, destination,
                 block_migration, disk_over_commit, migration, compute_rpcapi,
                 servicegroup_api, scheduler_client, request_spec=None):
        super(LiveMigrationTask, self).__init__(context, instance)
        self.destination = destination
        self.block_migration = block_migration
        self.disk_over_commit = disk_over_commit
        self.migration = migration
        # The migration source is always the host the instance currently
        # runs on.
        self.source = instance.host
        # Populated by _call_livem_checks_on_host() with the result of the
        # destination compute's pre-live-migration check.
        self.migrate_data = None

        self.compute_rpcapi = compute_rpcapi
        self.servicegroup_api = servicegroup_api
        self.scheduler_client = scheduler_client
        self.request_spec = request_spec
        # Set by _execute() when migration-based allocations are used;
        # consumed by rollback() to restore the source-node allocations.
        self._source_cn = None
        self._held_allocations = None

    def _execute(self):
        """Run the live migration task.

        :returns: whatever compute_rpcapi.live_migration() returns.
        :raises: InstanceInvalidState, ComputeServiceUnavailable,
            MigrationPreCheckError and scheduler errors from the various
            pre-checks below.
        """
        self._check_instance_is_active()
        self._check_host_is_up(self.source)

        if should_do_migration_allocation(self.context):
            self._source_cn, self._held_allocations = (
                # NOTE(danms): This may raise various exceptions, which will
                # propagate to the API and cause a 500. This is what we
                # want, as it would indicate internal data structure corruption
                # (such as missing migrations, compute nodes, etc).
                migrate.replace_allocation_with_migration(self.context,
                                                          self.instance,
                                                          self.migration))

        if not self.destination:
            # Either no host was specified in the API request and the user
            # wants the scheduler to pick a destination host, or a host was
            # specified but is not forcing it, so they want the scheduler
            # filters to run on the specified host, like a scheduler hint.
            self.destination, dest_node = self._find_destination()
        else:
            # This is the case that the user specified the 'force' flag when
            # live migrating with a specific destination host so the scheduler
            # is bypassed. There are still some minimal checks performed here
            # though.
            source_node, dest_node = self._check_requested_destination()
            # Now that we're semi-confident in the force specified host, we
            # need to copy the source compute node allocations in Placement
            # to the destination compute node. Normally select_destinations()
            # in the scheduler would do this for us, but when forcing the
            # target host we don't call the scheduler.
            # TODO(mriedem): In Queens, call select_destinations() with a
            # skip_filters=True flag so the scheduler does the work of claiming
            # resources on the destination in Placement but still bypass the
            # scheduler filters, which honors the 'force' flag in the API.
            # This raises NoValidHost which will be handled in
            # ComputeTaskManager.
            scheduler_utils.claim_resources_on_destination(
                self.scheduler_client.reportclient, self.instance,
                source_node, dest_node,
                source_node_allocations=self._held_allocations)

            # dest_node is a ComputeNode object, so we need to get the actual
            # node name off it to set in the Migration object below.
            dest_node = dest_node.hypervisor_hostname

        # Record where we are migrating from/to before starting so the
        # migration object reflects the decision made above.
        self.migration.source_node = self.instance.node
        self.migration.dest_node = dest_node
        self.migration.dest_compute = self.destination
        self.migration.save()

        # TODO(johngarbutt) need to move complexity out of compute manager
        # TODO(johngarbutt) disk_over_commit?
        return self.compute_rpcapi.live_migration(self.context,
                host=self.source,
                instance=self.instance,
                dest=self.destination,
                block_migration=self.block_migration,
                migration=self.migration,
                migrate_data=self.migrate_data)

    def rollback(self):
        """Undo the task's state changes on failure.

        Currently this only reverts migration-based allocations (if any
        were created in _execute()); see the note below about the rest.
        """
        # TODO(johngarbutt) need to implement the clean up operation
        # but this will make sense only once we pull in the compute
        # calls, since this class currently makes no state changes,
        # except to call the compute method, that has no matching
        # rollback call right now.
        if self._held_allocations:
            migrate.revert_allocation_for_migration(self._source_cn,
                                                    self.instance,
                                                    self.migration,
                                                    self._held_allocations)

    def _check_instance_is_active(self):
        """Raise InstanceInvalidState unless the instance's power state
        is RUNNING or PAUSED."""
        if self.instance.power_state not in (power_state.RUNNING,
                                             power_state.PAUSED):
            raise exception.InstanceInvalidState(
                    instance_uuid=self.instance.uuid,
                    attr='power_state',
                    state=self.instance.power_state,
                    method='live migrate')

    def _check_host_is_up(self, host):
        """Raise ComputeServiceUnavailable if the nova-compute service on
        *host* is not reported up by the servicegroup API."""
        service = objects.Service.get_by_compute_host(self.context, host)

        if not self.servicegroup_api.service_is_up(service):
            raise exception.ComputeServiceUnavailable(host=host)

    def _check_requested_destination(self):
        """Performs basic pre-live migration checks for the forced host.

        :returns: tuple of (source ComputeNode, destination ComputeNode)
        """
        self._check_destination_is_not_source()
        self._check_host_is_up(self.destination)
        self._check_destination_has_enough_memory()
        source_node, dest_node = self._check_compatible_with_source_hypervisor(
            self.destination)
        self._call_livem_checks_on_host(self.destination)
        # Make sure the forced destination host is in the same cell that the
        # instance currently lives in.
        # NOTE(mriedem): This can go away if/when the forced destination host
        # case calls select_destinations.
        source_cell_mapping = self._get_source_cell_mapping()
        dest_cell_mapping = self._get_destination_cell_mapping()
        if source_cell_mapping.uuid != dest_cell_mapping.uuid:
            raise exception.MigrationPreCheckError(
                reason=(_('Unable to force live migrate instance %s '
                          'across cells.') % self.instance.uuid))
        return source_node, dest_node

    def _check_destination_is_not_source(self):
        """Raise UnableToMigrateToSelf if the destination host is the same
        as the instance's current host."""
        if self.destination == self.source:
            raise exception.UnableToMigrateToSelf(
                    instance_id=self.instance.uuid, host=self.destination)

    def _check_destination_has_enough_memory(self):
        """Raise MigrationPreCheckError if the destination host does not
        have enough free RAM (after applying its allocation ratio) for the
        instance.

        Note an instance with unset/zero memory_mb also fails this check.
        """
        # TODO(mriedem): This method can be removed when the forced host
        # scenario is calling select_destinations() in the scheduler because
        # Placement will be used to filter allocation candidates by MEMORY_MB.
        # We likely can't remove it until the CachingScheduler is gone though
        # since the CachingScheduler does not use Placement.
        compute = self._get_compute_info(self.destination)
        free_ram_mb = compute.free_ram_mb
        total_ram_mb = compute.memory_mb
        mem_inst = self.instance.memory_mb
        # NOTE(sbauza): Now the ComputeNode object reports an allocation ratio
        # that can be provided by the compute_node if new or by the controller
        ram_ratio = compute.ram_allocation_ratio

        # NOTE(sbauza): Mimic the RAMFilter logic in order to have the same
        # ram validation
        avail = total_ram_mb * ram_ratio - (total_ram_mb - free_ram_mb)
        if not mem_inst or avail <= mem_inst:
            instance_uuid = self.instance.uuid
            dest = self.destination
            reason = _("Unable to migrate %(instance_uuid)s to %(dest)s: "
                       "Lack of memory(host:%(avail)s <= "
                       "instance:%(mem_inst)s)")
            raise exception.MigrationPreCheckError(reason=reason % dict(
                    instance_uuid=instance_uuid, dest=dest, avail=avail,
                    mem_inst=mem_inst))

    def _get_compute_info(self, host):
        """Return the (first) ComputeNode record for *host*."""
        return objects.ComputeNode.get_first_node_by_host_for_old_compat(
            self.context, host)

    def _check_compatible_with_source_hypervisor(self, destination):
        """Check hypervisor type/version compatibility between the source
        host and *destination*.

        :returns: tuple of (source ComputeNode, destination ComputeNode)
        :raises: InvalidHypervisorType if the hypervisor types differ.
        :raises: DestinationHypervisorTooOld if the destination hypervisor
            version is older than the source's.
        """
        source_info = self._get_compute_info(self.source)
        destination_info = self._get_compute_info(destination)

        source_type = source_info.hypervisor_type
        destination_type = destination_info.hypervisor_type
        if source_type != destination_type:
            raise exception.InvalidHypervisorType()

        source_version = source_info.hypervisor_version
        destination_version = destination_info.hypervisor_version
        if source_version > destination_version:
            raise exception.DestinationHypervisorTooOld()
        return source_info, destination_info

    def _call_livem_checks_on_host(self, destination):
        """Ask the destination compute whether it can accept this live
        migration; stores the result in self.migrate_data.

        :raises: MigrationPreCheckError on RPC timeout. Other exceptions
            raised by the destination's check propagate unchanged.
        """
        try:
            self.migrate_data = self.compute_rpcapi.\
                check_can_live_migrate_destination(self.context, self.instance,
                    destination, self.block_migration, self.disk_over_commit)
        except messaging.MessagingTimeout:
            msg = _("Timeout while checking if we can live migrate to host: "
                    "%s") % destination
            raise exception.MigrationPreCheckError(msg)

    def _get_source_cell_mapping(self):
        """Returns the CellMapping for the cell in which the instance lives

        :returns: nova.objects.CellMapping record for the cell where
            the instance currently lives.
        :raises: MigrationPreCheckError - in case a mapping is not found
        """
        try:
            return objects.InstanceMapping.get_by_instance_uuid(
                self.context, self.instance.uuid).cell_mapping
        except exception.InstanceMappingNotFound:
            raise exception.MigrationPreCheckError(
                reason=(_('Unable to determine in which cell '
                          'instance %s lives.') % self.instance.uuid))

    def _get_destination_cell_mapping(self):
        """Returns the CellMapping for the destination host

        :returns: nova.objects.CellMapping record for the cell where
            the destination host is mapped.
        :raises: MigrationPreCheckError - in case a mapping is not found
        """
        try:
            return objects.HostMapping.get_by_host(
                self.context, self.destination).cell_mapping
        except exception.HostMappingNotFound:
            raise exception.MigrationPreCheckError(
                reason=(_('Unable to determine in which cell '
                          'destination host %s lives.') % self.destination))

    def _get_request_spec_for_select_destinations(self, attempted_hosts=None):
        """Builds a RequestSpec that can be passed to select_destinations

        Used when calling the scheduler to pick a destination host for live
        migrating the instance.

        :param attempted_hosts: List of host names to ignore in the scheduler.
            This is generally at least seeded with the source host.
        :returns: nova.objects.RequestSpec object
        """
        image = utils.get_image_from_system_metadata(
            self.instance.system_metadata)
        filter_properties = {'ignore_hosts': attempted_hosts}
        if not self.request_spec:
            # NOTE(sbauza): We were unable to find an original RequestSpec
            # object - probably because the instance is old.
            # We need to mock that the old way
            request_spec = objects.RequestSpec.from_components(
                self.context, self.instance.uuid, image,
                self.instance.flavor, self.instance.numa_topology,
                self.instance.pci_requests,
                filter_properties, None, self.instance.availability_zone
            )
        else:
            request_spec = self.request_spec
            # NOTE(sbauza): Force_hosts/nodes needs to be reset
            # if we want to make sure that the next destination
            # is not forced to be the original host
            request_spec.reset_forced_destinations()
        scheduler_utils.setup_instance_group(self.context, request_spec)

        # We currently only support live migrating to hosts in the same
        # cell that the instance lives in, so we need to tell the scheduler
        # to limit the applicable hosts based on cell.
        cell_mapping = self._get_source_cell_mapping()
        LOG.debug('Requesting cell %(cell)s while live migrating',
                  {'cell': cell_mapping.identity},
                  instance=self.instance)
        if ('requested_destination' in request_spec and
                request_spec.requested_destination):
            request_spec.requested_destination.cell = cell_mapping
        else:
            request_spec.requested_destination = objects.Destination(
                cell=cell_mapping)

        request_spec.ensure_project_id(self.instance)

        return request_spec

    def _find_destination(self):
        """Ask the scheduler for a destination host, retrying with the
        failed host added to the ignore list when per-host pre-checks fail.

        :returns: tuple of (host name, node name) for the selected
            destination.
        :raises: MigrationSchedulerRPCError if the scheduler RPC fails.
        :raises: MaxRetriesExceeded (via _check_not_over_max_retries) when
            too many candidate hosts have been rejected.
        """
        # TODO(johngarbutt) this retry loop should be shared
        attempted_hosts = [self.source]
        request_spec = self._get_request_spec_for_select_destinations(
            attempted_hosts)

        host = None
        while host is None:
            self._check_not_over_max_retries(attempted_hosts)
            request_spec.ignore_hosts = attempted_hosts
            try:
                selection_lists = self.scheduler_client.select_destinations(
                        self.context, request_spec, [self.instance.uuid],
                        return_objects=True, return_alternates=False)
                # We only need the first item in the first list, as there is
                # only one instance, and we don't care about any alternates.
                selection = selection_lists[0][0]
                host = selection.service_host
            except messaging.RemoteError as ex:
                # TODO(ShaoHe Feng) There maybe multi-scheduler, and the
                # scheduling algorithm is R-R, we can let other scheduler try.
                # Note(ShaoHe Feng) There are types of RemoteError, such as
                # NoSuchMethod, UnsupportedVersion, we can distinguish it by
                # ex.exc_type.
                raise exception.MigrationSchedulerRPCError(
                    reason=six.text_type(ex))
            try:
                self._check_compatible_with_source_hypervisor(host)
                self._call_livem_checks_on_host(host)
            except (exception.Invalid, exception.MigrationPreCheckError) as e:
                LOG.debug("Skipping host: %(host)s because: %(e)s",
                          {"host": host, "e": e})
                attempted_hosts.append(host)
                # The scheduler would have created allocations against the
                # selected destination host in Placement, so we need to remove
                # those before moving on.
                self._remove_host_allocations(host, selection.nodename)
                # Reset so the loop asks the scheduler again.
                host = None
        return selection.service_host, selection.nodename

    def _remove_host_allocations(self, host, node):
        """Removes instance allocations against the given host from Placement

        :param host: The name of the host.
        :param node: The name of the node.
        """
        # Get the compute node object since we need the UUID.
        # TODO(mriedem): If the result of select_destinations eventually
        # returns the compute node uuid, we wouldn't need to look it
        # up via host/node and we can save some time.
        try:
            compute_node = objects.ComputeNode.get_by_host_and_nodename(
                self.context, host, node)
        except exception.ComputeHostNotFound:
            # This shouldn't happen, but we're being careful.
            LOG.info('Unable to remove instance allocations from host %s '
                     'and node %s since it was not found.', host, node,
                     instance=self.instance)
            return

        # Calculate the resource class amounts to subtract from the allocations
        # on the node based on the instance flavor.
        resources = scheduler_utils.resources_from_flavor(
            self.instance, self.instance.flavor)

        # Now remove the allocations for our instance against that node.
        # Note that this does not remove allocations against any other node
        # or shared resource provider, it's just undoing what the scheduler
        # allocated for the given (destination) node.
        self.scheduler_client.reportclient.\
            remove_provider_from_instance_allocation(
                self.instance.uuid, compute_node.uuid, self.instance.user_id,
                self.instance.project_id, resources)

    def _check_not_over_max_retries(self, attempted_hosts):
        """Raise MaxRetriesExceeded once more than CONF.migrate_max_retries
        hosts (beyond the source) have been attempted; also marks the
        migration as failed. A configured value of -1 disables the limit.
        """
        if CONF.migrate_max_retries == -1:
            return

        # The source host seeded in attempted_hosts doesn't count as a retry.
        retries = len(attempted_hosts) - 1
        if retries > CONF.migrate_max_retries:
            if self.migration:
                self.migration.status = 'failed'
                self.migration.save()
            msg = (_('Exceeded max scheduling retries %(max_retries)d for '
                     'instance %(instance_uuid)s during live migration')
                   % {'max_retries': retries,
                      'instance_uuid': self.instance.uuid})
            raise exception.MaxRetriesExceeded(reason=msg)
|