nova/nova/conductor/tasks/live_migrate.py

199 lines
8.4 KiB
Python

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging as messaging
from nova.compute import power_state
from nova.conductor.tasks import base
from nova import exception
from nova.i18n import _
from nova import objects
from nova.scheduler import utils as scheduler_utils
from nova import utils
LOG = logging.getLogger(__name__)
migrate_opt = cfg.IntOpt('migrate_max_retries',
default=-1,
help='Number of times to retry live-migration before failing. '
'If == -1, try until out of hosts. '
'If == 0, only try once, no retries.')
CONF = cfg.CONF
CONF.register_opt(migrate_opt)
class LiveMigrationTask(base.TaskBase):
def __init__(self, context, instance, destination,
block_migration, disk_over_commit, migration, compute_rpcapi,
servicegroup_api, scheduler_client):
super(LiveMigrationTask, self).__init__(context, instance)
self.destination = destination
self.block_migration = block_migration
self.disk_over_commit = disk_over_commit
self.migration = migration
self.source = instance.host
self.migrate_data = None
self.compute_rpcapi = compute_rpcapi
self.servicegroup_api = servicegroup_api
self.scheduler_client = scheduler_client
def _execute(self):
self._check_instance_is_active()
self._check_host_is_up(self.source)
if not self.destination:
self.destination = self._find_destination()
self.migration.dest_compute = self.destination
self.migration.save()
else:
self._check_requested_destination()
# TODO(johngarbutt) need to move complexity out of compute manager
# TODO(johngarbutt) disk_over_commit?
return self.compute_rpcapi.live_migration(self.context,
host=self.source,
instance=self.instance,
dest=self.destination,
block_migration=self.block_migration,
migration=self.migration,
migrate_data=self.migrate_data)
def rollback(self):
# TODO(johngarbutt) need to implement the clean up operation
# but this will make sense only once we pull in the compute
# calls, since this class currently makes no state changes,
# except to call the compute method, that has no matching
# rollback call right now.
pass
def _check_instance_is_active(self):
if self.instance.power_state not in (power_state.RUNNING,
power_state.PAUSED):
raise exception.InstanceInvalidState(
instance_uuid = self.instance.uuid,
attr = 'power_state',
state = self.instance.power_state,
method = 'live migrate')
def _check_host_is_up(self, host):
try:
service = objects.Service.get_by_compute_host(self.context, host)
except exception.NotFound:
raise exception.ComputeServiceUnavailable(host=host)
if not self.servicegroup_api.service_is_up(service):
raise exception.ComputeServiceUnavailable(host=host)
def _check_requested_destination(self):
self._check_destination_is_not_source()
self._check_host_is_up(self.destination)
self._check_destination_has_enough_memory()
self._check_compatible_with_source_hypervisor(self.destination)
self._call_livem_checks_on_host(self.destination)
def _check_destination_is_not_source(self):
if self.destination == self.source:
raise exception.UnableToMigrateToSelf(
instance_id=self.instance.uuid, host=self.destination)
def _check_destination_has_enough_memory(self):
compute = self._get_compute_info(self.destination)
free_ram_mb = compute.free_ram_mb
total_ram_mb = compute.memory_mb
mem_inst = self.instance.memory_mb
# NOTE(sbauza): Now the ComputeNode object reports an allocation ratio
# that can be provided by the compute_node if new or by the controller
ram_ratio = compute.ram_allocation_ratio
# NOTE(sbauza): Mimic the RAMFilter logic in order to have the same
# ram validation
avail = total_ram_mb * ram_ratio - (total_ram_mb - free_ram_mb)
if not mem_inst or avail <= mem_inst:
instance_uuid = self.instance.uuid
dest = self.destination
reason = _("Unable to migrate %(instance_uuid)s to %(dest)s: "
"Lack of memory(host:%(avail)s <= "
"instance:%(mem_inst)s)")
raise exception.MigrationPreCheckError(reason=reason % dict(
instance_uuid=instance_uuid, dest=dest, avail=avail,
mem_inst=mem_inst))
def _get_compute_info(self, host):
return objects.ComputeNode.get_first_node_by_host_for_old_compat(
self.context, host)
def _check_compatible_with_source_hypervisor(self, destination):
source_info = self._get_compute_info(self.source)
destination_info = self._get_compute_info(destination)
source_type = source_info['hypervisor_type']
destination_type = destination_info['hypervisor_type']
if source_type != destination_type:
raise exception.InvalidHypervisorType()
source_version = source_info['hypervisor_version']
destination_version = destination_info['hypervisor_version']
if source_version > destination_version:
raise exception.DestinationHypervisorTooOld()
def _call_livem_checks_on_host(self, destination):
try:
self.migrate_data = self.compute_rpcapi.\
check_can_live_migrate_destination(self.context, self.instance,
destination, self.block_migration, self.disk_over_commit)
except messaging.MessagingTimeout:
msg = _("Timeout while checking if we can live migrate to host: "
"%s") % destination
raise exception.MigrationPreCheckError(msg)
def _find_destination(self):
# TODO(johngarbutt) this retry loop should be shared
attempted_hosts = [self.source]
image = utils.get_image_from_system_metadata(
self.instance.system_metadata)
request_spec = scheduler_utils.build_request_spec(self.context, image,
[self.instance])
host = None
while host is None:
self._check_not_over_max_retries(attempted_hosts)
filter_properties = {'ignore_hosts': attempted_hosts}
scheduler_utils.setup_instance_group(self.context, request_spec,
filter_properties)
host = self.scheduler_client.select_destinations(self.context,
request_spec, filter_properties)[0]['host']
try:
self._check_compatible_with_source_hypervisor(host)
self._call_livem_checks_on_host(host)
except (exception.Invalid, exception.MigrationPreCheckError) as e:
LOG.debug("Skipping host: %(host)s because: %(e)s",
{"host": host, "e": e})
attempted_hosts.append(host)
host = None
return host
def _check_not_over_max_retries(self, attempted_hosts):
if CONF.migrate_max_retries == -1:
return
retries = len(attempted_hosts) - 1
if retries > CONF.migrate_max_retries:
msg = (_('Exceeded max scheduling retries %(max_retries)d for '
'instance %(instance_uuid)s during live migration')
% {'max_retries': retries,
'instance_uuid': self.instance.uuid})
raise exception.MaxRetriesExceeded(reason=msg)