libvirt: Abort live-migration job when monitoring fails
During the live migration process, a _live_migration_monitor thread checks
the progress of the migration on the source host. If we hit an
infrastructure issue (a DB, RPC, or libvirt timeout failure), an exception
is raised to the nova-compute service and the instance and migration are
set to the ERROR state.

The problem is that the live-migration job may keep running outside of
nova's control. At the end of the job the guest is resumed on the target
host while nova still reports it on the source host, which may lead to a
split-brain situation if the instance is restarted.

This change aborts the live-migration job if an issue occurs during
_live_migration_monitor.

Change-Id: Ia593b500425c81e54eb401e38264db5cc5fc1f93
Closes-Bug: #1905944
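For readers skimming the diff, a minimal sketch of the control flow this change introduces (plain Python, not nova code; monitor, abort_job and log are illustrative stand-ins for _live_migration_monitor, live_migration_abort and the nova logger):

    def run_and_monitor_migration(monitor, abort_job, log):
        try:
            # Monitoring may raise on DB, RPC or libvirt timeouts.
            monitor()
        except Exception:
            log.warning("Error monitoring migration", exc_info=True)
            try:
                # Best effort: cancel the still-running libvirt job so the
                # guest is not resumed on the target behind nova's back.
                abort_job()
            except Exception:
                log.warning("Could not abort the live-migration job, "
                            "ignoring it.")
            # Re-raise so the caller still sees the original failure and
            # can set the instance/migration to ERROR.
            raise
        finally:
            log.debug("Live migration monitoring is all done")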
@@ -13374,9 +13374,11 @@ class LibvirtConnTestCase(test.NoDBTestCase,
     @mock.patch.object(fakelibvirt.Connection, "_mark_running")
     @mock.patch.object(libvirt_driver.LibvirtDriver,
                        "_live_migration_copy_disk_paths")
-    def test_live_migration_main(self, mock_copy_disk_path, mock_running,
-                                 mock_guest, mock_monitor, mock_thread,
-                                 mock_conn):
+    @mock.patch.object(libvirt_driver.LibvirtDriver, "live_migration_abort")
+    def _test_live_migration_main(self, mock_abort, mock_copy_disk_path,
+                                  mock_running, mock_guest, mock_monitor,
+                                  mock_thread, mock_conn,
+                                  mon_side_effect=None):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         instance = objects.Instance(**self.test_instance)
 
@@ -13389,6 +13391,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         mock_copy_disk_path.return_value = disks_to_copy
 
         mock_guest.return_value = guest
+        mock_monitor.side_effect = mon_side_effect
 
         def fake_post():
             pass
@@ -13396,9 +13399,15 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         def fake_recover():
             pass
 
-        drvr._live_migration(self.context, instance, "fakehost",
-                             fake_post, fake_recover, True,
-                             migrate_data)
+        if mon_side_effect:
+            self.assertRaises(mon_side_effect, drvr._live_migration,
+                              self.context, instance, "fakehost", fake_post,
+                              fake_recover, True, migrate_data)
+            mock_abort.assert_called_once_with(instance)
+        else:
+            drvr._live_migration(self.context, instance, "fakehost",
+                                 fake_post, fake_recover, True,
+                                 migrate_data)
 
         mock_copy_disk_path.assert_called_once_with(self.context, instance,
                                                     guest)
@@ -13415,6 +13424,12 @@ class LibvirtConnTestCase(test.NoDBTestCase,
                                              fake_post, fake_recover, True,
                                              migrate_data, AnyEventletEvent(), disks_to_copy[0])
 
+    def test_live_migration_main(self):
+        self._test_live_migration_main()
+
+    def test_live_migration_main_monitoring_failed(self):
+        self._test_live_migration_main(mon_side_effect=Exception)
+
     @mock.patch('os.path.exists', return_value=False)
     @mock.patch('nova.virt.libvirt.utils.create_image')
     @mock.patch.object(libvirt_driver.LibvirtDriver,
@@ -9962,6 +9962,18 @@ class LibvirtDriver(driver.ComputeDriver):
         except Exception as ex:
             LOG.warning("Error monitoring migration: %(ex)s",
                         {"ex": ex}, instance=instance, exc_info=True)
+            # NOTE(aarents): Abort the job if it is still running before
+            # raising the exception, so that the migration cannot complete
+            # and the guest cannot be resumed on the target host while the
+            # instance record still points to the source host.
+            try:
+                # If the migration is running in post-copy mode and the
+                # guest is already running on the destination host,
+                # libvirt will refuse to cancel the migration job.
+                self.live_migration_abort(instance)
+            except libvirt.libvirtError:
+                LOG.warning("Error occurred when trying to abort live "
+                            "migration job, ignoring it.", instance=instance)
             raise
         finally:
             LOG.debug("Live migration monitoring is all done",
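For context on the abort path itself, here is a hedged sketch of what cancelling the job looks like at the libvirt-python level (illustrative only: the connection URI and domain lookup are assumptions, and nova goes through its own Guest/Host wrappers rather than calling libvirt directly like this):

    import libvirt

    def abort_migration_job(conn_uri, domain_name):
        # Connect to the source hypervisor and look up the migrating guest.
        conn = libvirt.open(conn_uri)
        try:
            dom = conn.lookupByName(domain_name)
            # abortJob() cancels the active background job on the domain,
            # here the outgoing live migration. It raises libvirtError when
            # the job cannot be cancelled, e.g. if post-copy is active and
            # the guest is already running on the destination.
            dom.abortJob()
        finally:
            conn.close()

As the NOTE in the hunk above explains, such a libvirtError is logged and swallowed so that the original monitoring failure is the exception that propagates.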