libvirt: Abort live-migration job when monitoring fails

During the live migration process, a _live_migration_monitor thread
checks the progress of the migration on the source host. If for any
reason we hit an infrastructure issue involving a DB/RPC/libvirt-timeout
failure, an exception is raised to the nova-compute service and the
instance/migration is set to the ERROR state.

The issue is that we may leave the live-migration job running outside
of nova's control. At the end of the job, the guest is resumed on the
target host while nova still reports it on the source host, which may
lead to a split-brain situation if the instance is restarted.

This change aborts the live-migration job if an issue occurs during
_live_migration_monitor.

Change-Id: Ia593b500425c81e54eb401e38264db5cc5fc1f93
Closes-Bug: #1905944
Alexandre Arents
2020-11-26 15:24:19 +00:00
parent b241663b89
commit 39f0af5d18
2 changed files with 33 additions and 6 deletions
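
For context, the error-handling flow this change introduces can be summarized
with the following simplified, self-contained sketch. It is not the actual
nova implementation: the class, the stub methods, and the instance name are
stand-ins, and only the try/abort/re-raise pattern mirrors the diff below.

import logging

logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger(__name__)


class FakeDriver:
    """Minimal sketch of the abort-on-monitor-failure pattern (not nova code)."""

    def _live_migration_monitor(self, instance):
        # Stand-in for the real monitor loop; pretend infrastructure broke.
        raise RuntimeError("lost connectivity to the database/RPC/libvirt")

    def live_migration_abort(self, instance):
        # Stand-in for asking libvirt to cancel the running migration job.
        LOG.info("aborting live migration job for %s", instance)

    def _live_migration(self, instance):
        try:
            self._live_migration_monitor(instance)
        except Exception:
            LOG.warning("Error monitoring migration", exc_info=True)
            try:
                # Cancel the job so the guest cannot silently resume on the
                # destination after nova has already given up on the migration.
                self.live_migration_abort(instance)
            except Exception:
                LOG.warning("Could not abort the job, ignoring.")
            # Re-raise so the instance/migration still goes to ERROR.
            raise


if __name__ == "__main__":
    try:
        FakeDriver()._live_migration("instance-0001")
    except RuntimeError:
        LOG.info("monitor failure propagated after the abort attempt")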


@@ -13374,9 +13374,11 @@ class LibvirtConnTestCase(test.NoDBTestCase,
     @mock.patch.object(fakelibvirt.Connection, "_mark_running")
     @mock.patch.object(libvirt_driver.LibvirtDriver,
                        "_live_migration_copy_disk_paths")
-    def test_live_migration_main(self, mock_copy_disk_path, mock_running,
-                                 mock_guest, mock_monitor, mock_thread,
-                                 mock_conn):
+    @mock.patch.object(libvirt_driver.LibvirtDriver, "live_migration_abort")
+    def _test_live_migration_main(self, mock_abort, mock_copy_disk_path,
+                                  mock_running, mock_guest, mock_monitor,
+                                  mock_thread, mock_conn,
+                                  mon_side_effect=None):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         instance = objects.Instance(**self.test_instance)
@@ -13389,6 +13391,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         mock_copy_disk_path.return_value = disks_to_copy
         mock_guest.return_value = guest
+        mock_monitor.side_effect = mon_side_effect
         def fake_post():
             pass
@@ -13396,9 +13399,15 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         def fake_recover():
             pass
-        drvr._live_migration(self.context, instance, "fakehost",
-                             fake_post, fake_recover, True,
-                             migrate_data)
+        if mon_side_effect:
+            self.assertRaises(mon_side_effect, drvr._live_migration,
+                              self.context, instance, "fakehost", fake_post,
+                              fake_recover, True, migrate_data)
+            mock_abort.assert_called_once_with(instance)
+        else:
+            drvr._live_migration(self.context, instance, "fakehost", fake_post,
+                                 fake_recover, True, migrate_data)
         mock_copy_disk_path.assert_called_once_with(self.context, instance,
                                                     guest)
@@ -13415,6 +13424,12 @@ class LibvirtConnTestCase(test.NoDBTestCase,
             fake_post, fake_recover, True,
             migrate_data, AnyEventletEvent(), disks_to_copy[0])
+    def test_live_migration_main(self):
+        self._test_live_migration_main()
+
+    def test_live_migration_main_monitoring_failed(self):
+        self._test_live_migration_main(mon_side_effect=Exception)
     @mock.patch('os.path.exists', return_value=False)
     @mock.patch('nova.virt.libvirt.utils.create_image')
     @mock.patch.object(libvirt_driver.LibvirtDriver,

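The refactoring above turns the old direct-call test into a parameterized
helper: when mon_side_effect is set, the mocked monitor raises and the test
asserts both that the exception propagates and that the abort was requested.
A stripped-down, standalone version of that assertion pattern, using only
unittest.mock and a hypothetical Driver class defined here purely for
illustration, looks like this:

import unittest
from unittest import mock


class Driver:
    """Hypothetical stand-in for LibvirtDriver, used only for this example."""

    def _monitor(self):
        pass

    def live_migration_abort(self):
        pass

    def migrate(self):
        try:
            self._monitor()
        except Exception:
            self.live_migration_abort()
            raise


class MigrateTestCase(unittest.TestCase):
    @mock.patch.object(Driver, "live_migration_abort")
    @mock.patch.object(Driver, "_monitor", side_effect=Exception)
    def test_monitor_failure_aborts_job(self, mock_monitor, mock_abort):
        # The monitor failure must propagate and trigger exactly one abort.
        self.assertRaises(Exception, Driver().migrate)
        mock_abort.assert_called_once_with()


if __name__ == "__main__":
    unittest.main()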

@@ -9962,6 +9962,18 @@ class LibvirtDriver(driver.ComputeDriver):
         except Exception as ex:
             LOG.warning("Error monitoring migration: %(ex)s",
                         {"ex": ex}, instance=instance, exc_info=True)
+            # NOTE(aarents): Ensure the job is aborted, if it is still
+            # running, before raising the exception. This avoids the
+            # migration completing and the libvirt guest resuming on the
+            # target while the instance record still points at the source
+            # host.
+            try:
+                # If the migration is running in post-copy mode and the
+                # guest is already running on the dest host, libvirt will
+                # refuse to cancel the migration job.
+                self.live_migration_abort(instance)
+            except libvirt.libvirtError:
+                LOG.warning("Error occurred when trying to abort the live "
+                            "migration job, ignoring it.", instance=instance)
             raise
         finally:
             LOG.debug("Live migration monitoring is all done",
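
For reference, live_migration_abort itself is not part of this diff; at the
libvirt level, cancelling a running migration job boils down to
virDomainAbortJob(). A minimal illustrative sketch with the libvirt-python
bindings is shown below; it assumes a reachable hypervisor, and the function
name, URI, and domain name are placeholders, not nova code.

import libvirt


def abort_migration_job(uri, domain_name):
    """Best-effort cancel of a running migration job (illustrative only)."""
    conn = libvirt.open(uri)
    try:
        dom = conn.lookupByName(domain_name)
        try:
            # abortJob() cancels the active background job, e.g. a live
            # migration. In post-copy mode, once the guest runs on the
            # destination, libvirt refuses this and raises libvirtError.
            dom.abortJob()
        except libvirt.libvirtError as ex:
            print("could not abort job: %s" % ex)
    finally:
        conn.close()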