Fix pre_live_migration rollback

During the pre live migration process, Nova performs most of the
tasks related to the creation and operation of the VM in the destination
host. That is done without interrupting any of the hardware in the source
host. If the pre_live_migration fails, those same operations should be
rolled back.

Currently nova shares _rollback_live_migration between live migration
and pre_live_migration rollbacks, and that causes the source host to
try to re-attach network interfaces that were never actually detached
from it.

This patch fixes that by adding a conditional so that nova takes
different paths for live migration and pre_live_migration rollbacks.

Closes-bug: #1944619
Change-Id: I784190ac356695dd508e0ad8ec31d8eaa3ebee56
This commit is contained in:
Erlon R. Cruz 2021-12-07 17:39:58 -03:00 committed by melanie witt
parent 2ddb8bf53f
commit 63ffba7496
4 changed files with 27 additions and 13 deletions

View File

@ -8413,7 +8413,8 @@ class ComputeManager(manager.Manager):
migrate_data.migration = migration
self._rollback_live_migration(context, instance, dest,
migrate_data=migrate_data,
source_bdms=source_bdms)
source_bdms=source_bdms,
pre_live_migration=True)
def _do_pre_live_migration_from_source(self, context, dest, instance,
block_migration, migration,
@ -9167,7 +9168,8 @@ class ComputeManager(manager.Manager):
def _rollback_live_migration(self, context, instance,
dest, migrate_data=None,
migration_status='failed',
source_bdms=None):
source_bdms=None,
pre_live_migration=False):
"""Recovers Instance/volume state from migrating -> running.
:param context: security context
@ -9217,8 +9219,14 @@ class ComputeManager(manager.Manager):
# for nova-network)
# NOTE(mriedem): This is a no-op for neutron.
self.network_api.setup_networks_on_host(context, instance, self.host)
self.driver.rollback_live_migration_at_source(context, instance,
migrate_data)
# NOTE(erlon): We should make sure that
# rollback_live_migration_at_source is not called in the
# pre_live_migration rollback as that will trigger the src host to
# re-attach interfaces which were not detached previously.
if not pre_live_migration:
self.driver.rollback_live_migration_at_source(context, instance,
migrate_data)
# NOTE(lyarwood): Fetch the current list of BDMs, disconnect any
# connected volumes from the dest and delete any volume attachments

View File

@ -72,11 +72,5 @@ class TestRollbackWithHWOffloadedOVS(
self._live_migrate(self.server,
migration_expected_state='failed',
server_expected_state='MIGRATING')
# FIXME(erlon): In the current behavior,
# rollback_live_migration_at_source is called if an error happens
# during the pre_live_migration phase on the destination and therefore
# triggers the observed bug. rollback_live_migration_at_source should
# *not* be called for when errors happen during pre_live_migration
# phase.
mlpr.assert_called_once()
mlpr.assert_not_called()
mlpp.assert_called_once()

View File

@ -9539,7 +9539,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.assertEqual('error', self.migration.status)
mock_rollback_live_mig.assert_called_once_with(
self.context, self.instance, 'dest-host',
migrate_data=migrate_data, source_bdms=source_bdms)
migrate_data=migrate_data, source_bdms=source_bdms,
pre_live_migration=True)
@mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration')
@mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration')
@ -9574,7 +9575,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.assertEqual('error', self.migration.status)
mock_rollback_live_mig.assert_called_once_with(
self.context, self.instance, 'dest-host',
migrate_data=migrate_data, source_bdms=source_bdms)
migrate_data=migrate_data, source_bdms=source_bdms,
pre_live_migration=True)
@mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration')
@mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration')

View File

@ -0,0 +1,10 @@
---
fixes:
- |
Instances with hardware offloaded ovs ports no longer lose connectivity
after failed live migrations. The driver.rollback_live_migration_at_source
function is no longer called during pre_live_migration rollback
which previously resulted in connectivity loss following a failed live
migration. See `Bug 1944619`_ for more details.
.. _Bug 1944619: https://bugs.launchpad.net/nova/+bug/1944619