diff --git a/nova/compute/manager.py b/nova/compute/manager.py index d8e4abca6090..3994bb424399 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -6175,9 +6175,6 @@ class ComputeManager(manager.Manager): # done on source/destination. For now, this is just here for status # reporting self._set_migration_status(migration, 'preparing') - # NOTE(Kevin_Zheng): The migration is no longer in the `queued` status - # so lets remove it from the mapping. - self._waiting_live_migrations.pop(instance.uuid) class _BreakWaitForInstanceEvent(Exception): """Used as a signal to stop waiting for the network-vif-plugged @@ -6251,8 +6248,22 @@ class ComputeManager(manager.Manager): self._cleanup_pre_live_migration( context, dest, instance, migration, migrate_data) - self._set_migration_status(migration, 'running') + # NOTE(Kevin_Zheng): Pop the migration from the waiting queue: + # if it exists in the queue, then we are good to move on; if + # not, some other process must have aborted it, then we should + # rollback. 
+ try: + self._waiting_live_migrations.pop(instance.uuid) + except KeyError: + LOG.debug('Migration %s aborted by another process, rollback.', + migration.uuid, instance=instance) + migrate_data.migration = migration + self._rollback_live_migration(context, instance, dest, + migrate_data, 'cancelled') + self._notify_live_migrate_abort_end(context, instance) + return + self._set_migration_status(migration, 'running') if migrate_data: migrate_data.migration = migration LOG.debug('live_migration data is %s', migrate_data) @@ -6331,6 +6342,14 @@ class ComputeManager(manager.Manager): action=fields.NotificationAction.LIVE_MIGRATION_FORCE_COMPLETE, phase=fields.NotificationPhase.END) + def _notify_live_migrate_abort_end(self, context, instance): + self._notify_about_instance_usage( + context, instance, 'live.migration.abort.end') + compute_utils.notify_about_instance_action( + context, instance, self.host, + action=fields.NotificationAction.LIVE_MIGRATION_ABORT, + phase=fields.NotificationPhase.END) + @wrap_exception() @wrap_instance_event(prefix='compute') @wrap_instance_fault @@ -6342,26 +6361,51 @@ class ComputeManager(manager.Manager): :param migration_id: ID of in-progress live migration """ - migration = objects.Migration.get_by_id(context, migration_id) - if migration.status != 'running': - raise exception.InvalidMigrationState(migration_id=migration_id, - instance_uuid=instance.uuid, - state=migration.status, - method='abort live migration') - self._notify_about_instance_usage( context, instance, 'live.migration.abort.start') compute_utils.notify_about_instance_action( context, instance, self.host, action=fields.NotificationAction.LIVE_MIGRATION_ABORT, phase=fields.NotificationPhase.START) - self.driver.live_migration_abort(instance) - self._notify_about_instance_usage( - context, instance, 'live.migration.abort.end') - compute_utils.notify_about_instance_action( - context, instance, self.host, - action=fields.NotificationAction.LIVE_MIGRATION_ABORT, - 
phase=fields.NotificationPhase.END) + # NOTE(Kevin_Zheng): Pop the migration out from the queue, this might + # lead to 3 scenarios: + # 1. The selected migration is still in the queue, and the + # future.cancel() succeeds, then the abort action succeeds; mark + # the migration status as 'cancelled'. + # 2. The selected migration is still in queue, but the future.cancel() + # failed, then the _do_live_migration() has started executing, and + # the migration status is 'preparing', then we just pop it from the + # queue, and the migration process will handle it later. And the + # migration status couldn't be 'running' in this scenario because + # if _do_live_migration has started executing and we've already + # popped it from the queue and set the migration status to + # 'running' at this point, popping it here will raise KeyError at + # which point we check if it's running and if so, we abort the old + # way. + # 3. The selected migration is not in the queue, then the migration + # status is 'running', let the driver handle it. + try: + migration, future = ( + self._waiting_live_migrations.pop(instance.uuid)) + if future and future.cancel(): + # If we got here, we've successfully aborted the queued + # migration and _do_live_migration won't run so we need + # to set the migration status to cancelled and send the + # notification. If Future.cancel() fails, it means + # _do_live_migration is running and the migration status + # is preparing, and _do_live_migration() itself will attempt + # to pop the queued migration, hit a KeyError, and rollback, + # set the migration to cancelled and send the + # live.migration.abort.end notification. 
+ self._set_migration_status(migration, 'cancelled') + except KeyError: + migration = objects.Migration.get_by_id(context, migration_id) + if migration.status != 'running': + raise exception.InvalidMigrationState( + migration_id=migration_id, instance_uuid=instance.uuid, + state=migration.status, method='abort live migration') + self.driver.live_migration_abort(instance) + self._notify_live_migrate_abort_end(context, instance) def _live_migration_cleanup_flags(self, migrate_data): """Determine whether disks or instance path need to be cleaned up after diff --git a/nova/objects/service.py b/nova/objects/service.py index cf8827bb508a..ef5289958e58 100644 --- a/nova/objects/service.py +++ b/nova/objects/service.py @@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__) # NOTE(danms): This is the global service version counter -SERVICE_VERSION = 33 +SERVICE_VERSION = 34 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any @@ -141,6 +141,8 @@ SERVICE_VERSION_HISTORY = ( # Version 33: Add support for check on the server group with # 'max_server_per_host' rules {'compute_rpc': '5.0'}, + # Version 34: Adds support to abort queued/preparing live migrations. 
+ {'compute_rpc': '5.0'}, ) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index ef34d2349502..5a669219e03f 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -7279,6 +7279,47 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase): 'fake', self.instance, True, migration, {}) self.assertEqual('error', migration.status) + @mock.patch( + 'nova.compute.manager.ComputeManager._notify_about_instance_usage') + @mock.patch.object(compute_utils, 'notify_about_instance_action') + @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration') + @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration') + def test_live_migration_aborted_before_running(self, mock_rpc, + mock_rollback, + mock_action_notify, + mock_usage_notify): + migrate_data = objects.LibvirtLiveMigrateData( + wait_for_vif_plugged=True) + mock_rpc.return_value = migrate_data + self.instance.info_cache = objects.InstanceInfoCache( + network_info=network_model.NetworkInfo([ + network_model.VIF(uuids.port1), network_model.VIF(uuids.port2) + ])) + self.compute._waiting_live_migrations = {} + fake_migration = objects.Migration( + uuid=uuids.migration, instance_uuid=self.instance.uuid) + fake_migration.save = mock.MagicMock() + + with mock.patch.object(self.compute.virtapi, + 'wait_for_instance_event') as wait_for_event: + self.compute._do_live_migration( + self.context, 'dest-host', self.instance, 'block_migration', + fake_migration, migrate_data) + + self.assertEqual(2, len(wait_for_event.call_args[0][1])) + mock_rpc.assert_called_once_with( + self.context, self.instance, 'block_migration', None, + 'dest-host', migrate_data) + mock_rollback.assert_called_once_with( + self.context, self.instance, 'dest-host', migrate_data, + 'cancelled') + mock_usage_notify.assert_called_once_with( + self.context, self.instance, 'live.migration.abort.end') + 
mock_action_notify.assert_called_once_with( + self.context, self.instance, self.compute.host, + action=fields.NotificationAction.LIVE_MIGRATION_ABORT, + phase=fields.NotificationPhase.END) + def test_live_migration_force_complete_succeeded(self): migration = objects.Migration() migration.status = 'running' @@ -7708,6 +7749,34 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase): action='live_migration_abort', phase='end')] ) + @mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage') + @mock.patch.object(objects.Migration, 'get_by_id') + @mock.patch('nova.compute.utils.notify_about_instance_action') + def test_live_migration_abort_queued(self, mock_notify_action, + mock_get_migration, mock_notify): + instance = objects.Instance(id=123, uuid=uuids.instance) + migration = self._get_migration(10, 'queued', 'live-migration') + migration.save = mock.MagicMock() + mock_get_migration.return_value = migration + fake_future = mock.MagicMock() + self.compute._waiting_live_migrations[instance.uuid] = ( + migration, fake_future) + self.compute.live_migration_abort(self.context, instance, migration.id) + mock_notify.assert_has_calls( + [mock.call(self.context, instance, + 'live.migration.abort.start'), + mock.call(self.context, instance, + 'live.migration.abort.end')] + ) + mock_notify_action.assert_has_calls( + [mock.call(self.context, instance, 'fake-mini', + action='live_migration_abort', phase='start'), + mock.call(self.context, instance, 'fake-mini', + action='live_migration_abort', phase='end')] + ) + self.assertEqual('cancelled', migration.status) + fake_future.cancel.assert_called_once_with() + @mock.patch.object(compute_utils, 'add_instance_fault_from_exc') @mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage') @mock.patch.object(objects.Migration, 'get_by_id') @@ -7731,11 +7800,13 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase): 'fake-mini', action='live_migration_abort', phase='start') 
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc') + @mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage') @mock.patch.object(objects.Migration, 'get_by_id') @mock.patch('nova.compute.utils.notify_about_instance_action') def test_live_migration_abort_wrong_migration_state(self, mock_notify_action, mock_get_migration, + mock_notify, mock_instance_fault): instance = objects.Instance(id=123, uuid=uuids.instance) migration = self._get_migration(10, 'completed', 'live-migration') @@ -7745,7 +7816,6 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase): self.context, instance, migration.id) - mock_notify_action.assert_not_called() def test_live_migration_cleanup_flags_block_migrate_libvirt(self): migrate_data = objects.LibvirtLiveMigrateData(