Compute: add support to abort queued live migration

This patch modifies the compute manager and bumps the
nova-compute service version to make it possible to abort
a live migration while it is in ``queued`` or ``preparing`` status.
A follow-up patch will modify the API part.

Part of blueprint abort-live-migration-in-queued-status

Change-Id: I3689dd6544c387676983be47cf925c3fd7ce72f1
This commit is contained in:
Kevin_Zheng
2018-05-15 16:13:14 +08:00
committed by Matt Riedemann
parent 79dac41fee
commit d23794d25f
3 changed files with 136 additions and 20 deletions

View File

@@ -6175,9 +6175,6 @@ class ComputeManager(manager.Manager):
# done on source/destination. For now, this is just here for status
# reporting
self._set_migration_status(migration, 'preparing')
# NOTE(Kevin_Zheng): The migration is no longer in the `queued` status,
# so let's remove it from the mapping.
self._waiting_live_migrations.pop(instance.uuid)
class _BreakWaitForInstanceEvent(Exception):
"""Used as a signal to stop waiting for the network-vif-plugged
@@ -6251,8 +6248,22 @@ class ComputeManager(manager.Manager):
self._cleanup_pre_live_migration(
context, dest, instance, migration, migrate_data)
self._set_migration_status(migration, 'running')
# NOTE(Kevin_Zheng): Pop the migration from the waiting queue;
# if it exists in the queue, then we are good to move on. If
# not, some other process must have aborted it, and we should
# roll back.
try:
self._waiting_live_migrations.pop(instance.uuid)
except KeyError:
LOG.debug('Migration %s aborted by another process, rollback.',
migration.uuid, instance=instance)
migrate_data.migration = migration
self._rollback_live_migration(context, instance, dest,
migrate_data, 'cancelled')
self._notify_live_migrate_abort_end(context, instance)
return
self._set_migration_status(migration, 'running')
if migrate_data:
migrate_data.migration = migration
LOG.debug('live_migration data is %s', migrate_data)
@@ -6331,6 +6342,14 @@ class ComputeManager(manager.Manager):
action=fields.NotificationAction.LIVE_MIGRATION_FORCE_COMPLETE,
phase=fields.NotificationPhase.END)
def _notify_live_migrate_abort_end(self, context, instance):
    """Emit the end notifications for an aborted live migration.

    Sends the legacy unversioned ``live.migration.abort.end``
    notification followed by the versioned LIVE_MIGRATION_ABORT/END
    notification, in that order.

    :param context: request context
    :param instance: the instance whose live migration abort finished
    """
    legacy_event = 'live.migration.abort.end'
    self._notify_about_instance_usage(context, instance, legacy_event)
    compute_utils.notify_about_instance_action(
        context, instance, self.host,
        action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
        phase=fields.NotificationPhase.END)
@wrap_exception()
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
@@ -6342,26 +6361,51 @@ class ComputeManager(manager.Manager):
:param migration_id: ID of in-progress live migration
"""
migration = objects.Migration.get_by_id(context, migration_id)
if migration.status != 'running':
raise exception.InvalidMigrationState(migration_id=migration_id,
instance_uuid=instance.uuid,
state=migration.status,
method='abort live migration')
self._notify_about_instance_usage(
context, instance, 'live.migration.abort.start')
compute_utils.notify_about_instance_action(
context, instance, self.host,
action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
phase=fields.NotificationPhase.START)
self.driver.live_migration_abort(instance)
self._notify_about_instance_usage(
context, instance, 'live.migration.abort.end')
compute_utils.notify_about_instance_action(
context, instance, self.host,
action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
phase=fields.NotificationPhase.END)
# NOTE(Kevin_Zheng): Pop the migration out from the queue; this might
# lead to 3 scenarios:
# 1. The selected migration is still in the queue, and future.cancel()
#    succeeds, so the abort action succeeds; mark the migration
#    status as 'cancelled'.
# 2. The selected migration is still in the queue, but future.cancel()
#    fails, meaning _do_live_migration() has started executing and
#    the migration status is 'preparing'; we just pop it from the
#    queue, and the migration process will handle it later. The
#    migration status cannot be 'running' in this scenario, because
#    if _do_live_migration had started executing and had already
#    popped it from the queue and set the migration status to
#    'running' at this point, popping it here would raise KeyError, at
#    which point we check whether it is running and, if so, abort the
#    old way.
# 3. The selected migration is not in the queue, so the migration
#    status is 'running'; let the driver handle it.
try:
migration, future = (
self._waiting_live_migrations.pop(instance.uuid))
if future and future.cancel():
# If we got here, we've successfully aborted the queued
# migration and _do_live_migration won't run so we need
# to set the migration status to cancelled and send the
# notification. If Future.cancel() fails, it means
# _do_live_migration is running and the migration status
# is preparing, and _do_live_migration() itself will attempt
# to pop the queued migration, hit a KeyError, and rollback,
# set the migration to cancelled and send the
# live.migration.abort.end notification.
self._set_migration_status(migration, 'cancelled')
except KeyError:
migration = objects.Migration.get_by_id(context, migration_id)
if migration.status != 'running':
raise exception.InvalidMigrationState(
migration_id=migration_id, instance_uuid=instance.uuid,
state=migration.status, method='abort live migration')
self.driver.live_migration_abort(instance)
self._notify_live_migrate_abort_end(context, instance)
def _live_migration_cleanup_flags(self, migrate_data):
"""Determine whether disks or instance path need to be cleaned up after

View File

@@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__)
# NOTE(danms): This is the global service version counter
SERVICE_VERSION = 33
SERVICE_VERSION = 34
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -141,6 +141,8 @@ SERVICE_VERSION_HISTORY = (
# Version 33: Add support for check on the server group with
# 'max_server_per_host' rules
{'compute_rpc': '5.0'},
# Version 34: Adds support to abort queued/preparing live migrations.
{'compute_rpc': '5.0'},
)

View File

@@ -7279,6 +7279,47 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
'fake', self.instance, True, migration, {})
self.assertEqual('error', migration.status)
@mock.patch(
    'nova.compute.manager.ComputeManager._notify_about_instance_usage')
@mock.patch.object(compute_utils, 'notify_about_instance_action')
@mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration')
@mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration')
def test_live_migration_aborted_before_running(self, mock_rpc,
                                               mock_rollback,
                                               mock_action_notify,
                                               mock_usage_notify):
    """Test that _do_live_migration rolls back when the migration was
    already aborted (popped from the waiting queue) by another process
    before the migration transitioned to 'running'.
    """
    migrate_data = objects.LibvirtLiveMigrateData(
        wait_for_vif_plugged=True)
    mock_rpc.return_value = migrate_data
    # Two VIFs so we can assert below that both network-vif-plugged
    # events were waited on.
    self.instance.info_cache = objects.InstanceInfoCache(
        network_info=network_model.NetworkInfo([
            network_model.VIF(uuids.port1), network_model.VIF(uuids.port2)
        ]))
    # An empty waiting dict simulates another process having already
    # popped (aborted) this queued migration, so the pop inside
    # _do_live_migration hits KeyError and triggers the rollback path.
    self.compute._waiting_live_migrations = {}
    fake_migration = objects.Migration(
        uuid=uuids.migration, instance_uuid=self.instance.uuid)
    fake_migration.save = mock.MagicMock()
    with mock.patch.object(self.compute.virtapi,
                           'wait_for_instance_event') as wait_for_event:
        self.compute._do_live_migration(
            self.context, 'dest-host', self.instance, 'block_migration',
            fake_migration, migrate_data)
    # Both VIF plug events should have been awaited.
    self.assertEqual(2, len(wait_for_event.call_args[0][1]))
    mock_rpc.assert_called_once_with(
        self.context, self.instance, 'block_migration', None,
        'dest-host', migrate_data)
    # The aborted-by-another-process path must roll back as 'cancelled'
    # and emit only the abort end notifications.
    mock_rollback.assert_called_once_with(
        self.context, self.instance, 'dest-host', migrate_data,
        'cancelled')
    mock_usage_notify.assert_called_once_with(
        self.context, self.instance, 'live.migration.abort.end')
    mock_action_notify.assert_called_once_with(
        self.context, self.instance, self.compute.host,
        action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
        phase=fields.NotificationPhase.END)
def test_live_migration_force_complete_succeeded(self):
migration = objects.Migration()
migration.status = 'running'
@@ -7708,6 +7749,34 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
action='live_migration_abort', phase='end')]
)
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
@mock.patch.object(objects.Migration, 'get_by_id')
@mock.patch('nova.compute.utils.notify_about_instance_action')
def test_live_migration_abort_queued(self, mock_notify_action,
                                     mock_get_migration, mock_notify):
    """Test that aborting a still-queued live migration cancels its
    future, marks the migration 'cancelled' and emits the abort
    start/end notifications.
    """
    instance = objects.Instance(id=123, uuid=uuids.instance)
    migration = self._get_migration(10, 'queued', 'live-migration')
    migration.save = mock.MagicMock()
    mock_get_migration.return_value = migration
    # Register the migration as waiting in the queue with a mock future
    # so live_migration_abort can pop and cancel it.
    queued_future = mock.MagicMock()
    self.compute._waiting_live_migrations[instance.uuid] = (
        migration, queued_future)

    self.compute.live_migration_abort(self.context, instance, migration.id)

    expected_usage_calls = [
        mock.call(self.context, instance, 'live.migration.abort.start'),
        mock.call(self.context, instance, 'live.migration.abort.end'),
    ]
    mock_notify.assert_has_calls(expected_usage_calls)
    expected_action_calls = [
        mock.call(self.context, instance, 'fake-mini',
                  action='live_migration_abort', phase='start'),
        mock.call(self.context, instance, 'fake-mini',
                  action='live_migration_abort', phase='end'),
    ]
    mock_notify_action.assert_has_calls(expected_action_calls)
    self.assertEqual('cancelled', migration.status)
    queued_future.cancel.assert_called_once_with()
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
@mock.patch.object(objects.Migration, 'get_by_id')
@@ -7731,11 +7800,13 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
'fake-mini', action='live_migration_abort', phase='start')
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
@mock.patch.object(objects.Migration, 'get_by_id')
@mock.patch('nova.compute.utils.notify_about_instance_action')
def test_live_migration_abort_wrong_migration_state(self,
mock_notify_action,
mock_get_migration,
mock_notify,
mock_instance_fault):
instance = objects.Instance(id=123, uuid=uuids.instance)
migration = self._get_migration(10, 'completed', 'live-migration')
@@ -7745,7 +7816,6 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
self.context,
instance,
migration.id)
mock_notify_action.assert_not_called()
def test_live_migration_cleanup_flags_block_migrate_libvirt(self):
migrate_data = objects.LibvirtLiveMigrateData(