Merge "Reserve mdevs to return to the source"

Zuul, 2024-02-23 15:46:08 +00:00 (committed by Gerrit Code Review)
commit 9a9ab2128b
4 changed files with 236 additions and 7 deletions
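In short, the libvirt driver now records which mediated devices it claims on the destination while validating an incoming live migration, and drops that record once the migration completes, is rolled back, or its residue is cleaned up. A minimal standalone sketch of that bookkeeping, with simplified names and invented identifiers (illustrative only, not the actual driver code):

# Illustrative model of the reservation bookkeeping added by this change.
class MdevReservations:
    def __init__(self):
        # instance UUID -> list of mdev UUIDs claimed on this host
        self.instance_claimed_mdevs = {}

    def claim(self, instance_uuid, mdev_uuids):
        # Done on the destination while checking the source migrate data,
        # so the claimed mdevs are not handed out to another instance.
        self.instance_claimed_mdevs[instance_uuid] = list(mdev_uuids)

    def release(self, instance_uuid):
        # Done when the migration finishes, is rolled back, or when
        # lingering resources are cleaned up; pop() keeps it idempotent.
        return self.instance_claimed_mdevs.pop(instance_uuid, None)

reservations = MdevReservations()
reservations.claim('inst-1', ['mdev-a'])
assert reservations.release('inst-1') == ['mdev-a']
assert reservations.release('inst-1') is None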


@@ -9161,7 +9161,7 @@ class ComputeManager(manager.Manager):
rollback process
There may be other resources which need cleanup; currently this is
- limited to vPMEM devices with the libvirt driver.
+ limited to vPMEM and mdev devices with the libvirt driver.
:param migrate_data: implementation specific data
:param migr_ctxt: specific resources stored in migration_context
@@ -9182,12 +9182,14 @@ class ComputeManager(manager.Manager):
objects.LibvirtVPMEMDevice)):
has_vpmem = True
break
+ has_mdevs = 'target_mdevs' in migrate_data
# No instance booting at source host, but instance dir
- # must be deleted for preparing next block migration
+ # must be deleted for preparing next live migration w/o shared
+ # storage
# vpmem must be cleaned
- do_cleanup = not migrate_data.is_shared_instance_path or has_vpmem
+ do_cleanup = (not migrate_data.is_shared_instance_path or
+               has_vpmem or has_mdevs)
destroy_disks = not migrate_data.is_shared_block_storage
elif isinstance(migrate_data, migrate_data_obj.HyperVLiveMigrateData):
# NOTE(claudiub): We need to cleanup any zombie Planned VM.
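A tiny standalone restatement of the cleanup decision changed above, with plain booleans standing in for the migrate_data fields (hypothetical helper, illustrative only):

def needs_dest_cleanup(is_shared_instance_path, has_vpmem, has_mdevs):
    # mdevs reserved on the destination must be released on rollback, so
    # their presence now forces the destination cleanup call as well.
    return (not is_shared_instance_path) or has_vpmem or has_mdevs

assert needs_dest_cleanup(True, False, True)       # reserved mdevs alone
assert not needs_dest_cleanup(True, False, False)  # shared path, nothing else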


@@ -582,6 +582,50 @@ class VGPULiveMigrationTests(base.LibvirtMigrationMixin, VGPUTestBase):
self.assertRaises(KeyError, self.assert_mdev_usage, self.dest, 0)
class VGPULiveMigrationTestsLMFailed(VGPULiveMigrationTests):
"""Tests that expect the live migration to fail, and exist to test the
rollback code. Stubs out fakelibvirt's migrateToURI3() with a stub that
"fails" the migration.
"""
def _migrate_stub(self, domain, destination, params, flags):
"""Designed to stub fakelibvirt's migrateToURI3 and "fail" the
live migration by monkeypatching jobStats() to return an error.
"""
# During the migration, we reserved a mdev in the dest
self.assert_mdev_usage(self.src, 1)
self.assert_mdev_usage(self.dest, 1)
# The resource update periodic task should not change the consumed
# mdevs, as the migration is still happening. As usual, running
# periodics is not necessary to make the test pass, but it's good to
# make sure it does the right thing.
self._run_periodics()
self.assert_mdev_usage(self.src, 1)
self.assert_mdev_usage(self.dest, 1)
source = self.computes['src']
conn = source.driver._host.get_connection()
dom = conn.lookupByUUIDString(self.server['id'])
dom.fail_job()
self.migrate_stub_ran = True
def test_live_migrate_server(self):
self.server = self._create_server(
image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
flavor_id=self.flavor, networks='auto', host=self.src.host)
inst = objects.Instance.get_by_uuid(self.context, self.server['id'])
mdevs = self.src.driver._get_all_assigned_mediated_devices(inst)
self.assertEqual(1, len(mdevs))
self._live_migrate(self.server, 'failed')
# We released the reserved mdev after the migration failed.
self.assert_mdev_usage(self.src, 1)
self.assert_mdev_usage(self.dest, 0)
class DifferentMdevClassesTests(VGPUTestBase):
def setUp(self):


@@ -11736,6 +11736,73 @@ class LibvirtConnTestCase(test.NoDBTestCase,
self.context, instance_ref, dest_check_data,
None, None, allocations)
@mock.patch.object(libvirt_driver.LibvirtDriver,
'_get_mdev_types_from_uuids')
@mock.patch.object(libvirt_driver.LibvirtDriver,
'_allocate_mdevs')
@mock.patch.object(libvirt_driver.LibvirtDriver,
'_get_supported_vgpu_types')
def test_check_source_migrate_data_at_dest_types_missing_mdevs(self,
mock_get,
mock_alloc,
mock_gmtfu):
"""Raises an exception if the destination doesn't allocate all the
necessary mediated devices.
"""
mock_get.return_value = ['type1']
mock_alloc.return_value = [uuids.dst_mdev1]
mock_gmtfu.return_value = {uuids.dst_mdev1: 'type1'}
instance_ref = objects.Instance(**self.test_instance)
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertEqual({}, drvr.instance_claimed_mdevs)
dest_check_data = objects.LibvirtLiveMigrateData(
filename="file",
block_migration=True,
disk_over_commit=False,
disk_available_mb=1024,
src_supports_mdev_live_migration=True,
source_mdev_types={uuids.src_mdev1: 'type1',
uuids.src_mdev2: 'type1'})
allocations = {uuids.rp1: {'resources': {orc.VGPU: 2}}}
self.assertRaises(exception.MigrationPreCheckError,
drvr.check_source_migrate_data_at_dest,
self.context, instance_ref, dest_check_data,
None, None, allocations)
@mock.patch.object(libvirt_driver.LibvirtDriver,
'_get_mdev_types_from_uuids')
@mock.patch.object(libvirt_driver.LibvirtDriver,
'_allocate_mdevs')
@mock.patch.object(libvirt_driver.LibvirtDriver,
'_get_supported_vgpu_types')
def test_check_source_migrate_data_at_dest_types_claim_mdevs(self,
mock_get,
mock_alloc,
mock_gmtfu):
mock_get.return_value = ['type1']
mock_alloc.return_value = [uuids.dst_mdev1]
mock_gmtfu.return_value = {uuids.dst_mdev1: 'type1'}
instance_ref = objects.Instance(**self.test_instance)
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertEqual({}, drvr.instance_claimed_mdevs)
dest_check_data = objects.LibvirtLiveMigrateData(
filename="file",
block_migration=True,
disk_over_commit=False,
disk_available_mb=1024,
src_supports_mdev_live_migration=True,
source_mdev_types={uuids.src_mdev1: 'type1'})
allocations = {uuids.rp1: {'resources': {orc.VGPU: 1}}}
drvr.check_source_migrate_data_at_dest(
self.context, instance_ref, dest_check_data, None, None,
allocations)
mock_alloc.assert_called_once_with(allocations)
mock_gmtfu.assert_called_once_with([uuids.dst_mdev1])
self.assertEqual({uuids.src_mdev1: uuids.dst_mdev1},
dest_check_data.target_mdevs)
self.assertEqual({instance_ref.uuid: [uuids.dst_mdev1]},
drvr.instance_claimed_mdevs)
@mock.patch.object(fakelibvirt.Connection, 'getLibVersion')
@mock.patch.object(fakelibvirt.Connection, 'getVersion')
def _test_host_can_support_mdev_lm(self, mock_getversion,
@@ -13426,12 +13493,13 @@ class LibvirtConnTestCase(test.NoDBTestCase,
mock_get_instance_path.return_value = fake_instance_path
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+ instance = objects.Instance(id=1, uuid=uuids.instance)
migrate_data = objects.LibvirtLiveMigrateData(
is_shared_instance_path=False,
instance_relative_path=False)
self.assertRaises(exception.Invalid,
drvr.rollback_live_migration_at_destination,
"context", "instance", [], None, True, migrate_data)
"context", instance, [], None, True, migrate_data)
mock_exist.assert_called_once_with(fake_instance_path)
mock_shutil.assert_called_once_with(fake_instance_path)
@@ -13453,17 +13521,24 @@ class LibvirtConnTestCase(test.NoDBTestCase,
mock_destroy.side_effect = fake_destroy
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+ self.assertEqual({}, drvr.instance_claimed_mdevs)
+ instance = objects.Instance(id=1, uuid=uuids.instance)
migrate_data = objects.LibvirtLiveMigrateData(
is_shared_instance_path=True,
instance_relative_path=False)
drvr.rollback_live_migration_at_destination("context", "instance", [],
drvr.instance_claimed_mdevs = {instance.uuid: [uuids.mdev1],
uuids.other_inst: [uuids.mdev2]}
drvr.rollback_live_migration_at_destination("context", instance, [],
None, True, migrate_data)
mock_destroy.assert_called_once_with("context", "instance", [],
mock_destroy.assert_called_once_with("context", instance, [],
None, True)
self.assertFalse(mock_get_instance_path.called)
self.assertFalse(mock_exist.called)
self.assertFalse(mock_shutil.called)
+ # Assert we delete existing claimed mdevs for the instance
+ self.assertEqual({uuids.other_inst: [uuids.mdev2]},
+                  drvr.instance_claimed_mdevs)
@mock.patch.object(fakelibvirt.Domain, "XMLDesc")
def test_live_migration_copy_disk_paths_tunnelled(self, mock_xml):
@@ -20257,15 +20332,20 @@ class LibvirtConnTestCase(test.NoDBTestCase,
mock_get_guest.return_value = guest
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertEqual({}, drvr.instance_claimed_mdevs)
net_info = network_model.NetworkInfo()
mock_get_interfaces.return_type = []
mock_image_meta.return_value = mock.sentinel.image_meta
drvr.instance_claimed_mdevs = {instance.uuid: [uuids.mdev1],
uuids.other_inst: [uuids.mdev2]}
drvr.post_live_migration_at_destination(mock.ANY, instance, net_info)
# Assert that we don't try to write anything to the destination node
# since the source live migrated with the VIR_MIGRATE_PERSIST_DEST flag
mock_write_instance_config.assert_not_called()
mock_attach.assert_not_called()
# Assert we delete existing claimed mdevs for the instance
self.assertEqual({uuids.other_inst: [uuids.mdev2]},
drvr.instance_claimed_mdevs)
vif = network_model.VIF(id=uuids.port_id,
vnic_type=network_model.VNIC_TYPE_NORMAL)
vif_direct = network_model.VIF(id=uuids.port_id,
@@ -27058,6 +27138,40 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
self.assertEqual({},
drvr._get_all_assigned_mediated_devices(fake_inst))
@mock.patch.object(host.Host, 'list_guests')
def test_get_all_assigned_mediated_devices_with_claimed_mdevs(self,
list_guests):
dom_with_vgpu = """
<domain type="kvm">
<devices>
<hostdev mode='subsystem' type='mdev' model='vfio-pci'>
<source>
<address uuid='%s'/>
</source>
</hostdev>
</devices>
</domain>
""" % uuids.mdev
guest1 = libvirt_guest.Guest(FakeVirtDomain())
guest2 = libvirt_guest.Guest(FakeVirtDomain(fake_xml=dom_with_vgpu))
list_guests.return_value = [guest1, guest2]
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertEqual({}, drvr.instance_claimed_mdevs)
# Just add some claimed mdevs by the live migration
drvr.instance_claimed_mdevs = {uuids.inst1: [uuids.mdev1, uuids.mdev2],
uuids.inst2: [uuids.mdev3]}
self.assertEqual({uuids.mdev: guest2.uuid,
uuids.mdev1: uuids.inst1,
uuids.mdev2: uuids.inst1,
uuids.mdev3: uuids.inst2},
drvr._get_all_assigned_mediated_devices())
# Just double-check we only return claimed mdevs from inst2 if we ask
# for it
fake_inst2 = objects.Instance(uuid=uuids.inst2)
self.assertEqual({uuids.mdev3: uuids.inst2},
drvr._get_all_assigned_mediated_devices(fake_inst2))
def test_allocate_mdevs_with_no_vgpu_allocations(self):
allocations = {
'rp1': {
@@ -30012,6 +30126,17 @@ class LibvirtPMEMNamespaceTests(test.NoDBTestCase):
</domain>'''
self.assertXmlEqual(expected, guest.to_xml())
def test_cleanup_lingering_instance_resources_deletes_mdevs(self):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertEqual({}, drvr.instance_claimed_mdevs)
# Adding some claimed mdevs
instance = objects.Instance(id=1, uuid=uuids.instance, resources=None)
drvr.instance_claimed_mdevs = {instance.uuid: [uuids.mdev1],
uuids.other_inst: [uuids.mdev2]}
drvr.cleanup_lingering_instance_resources(instance)
self.assertEqual({uuids.other_inst: [uuids.mdev2]},
drvr.instance_claimed_mdevs)
@ddt.ddt
class LibvirtDeviceRemoveEventTestCase(test.NoDBTestCase):


@@ -540,6 +540,11 @@ class LibvirtDriver(driver.ComputeDriver):
self.mdev_classes = set([])
self.supported_vgpu_types = self._get_supported_vgpu_types()
# This dict is for knowing which mdevs are already claimed by some
# instance. This is keyed by instance UUID and the value is a list
# of mediated device UUIDs.
self.instance_claimed_mdevs = {}
# Handles ongoing device manipulation in libvirt where we wait for the
# events about success or failure.
self._device_event_handler = AsyncDeviceEventsHandler()
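For illustration, a hypothetical value of the new attribute while two incoming live migrations hold reservations (invented identifiers):

# Hypothetical contents: two instances each holding reserved mdevs.
instance_claimed_mdevs = {
    'instance-uuid-1': ['mdev-uuid-a'],
    'instance-uuid-2': ['mdev-uuid-b', 'mdev-uuid-c'],
}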
@@ -1732,6 +1737,13 @@ class LibvirtDriver(driver.ComputeDriver):
vpmems = self._get_vpmems(instance)
if vpmems:
self._cleanup_vpmems(vpmems)
# we may have some claimed mdev residue, we need to delete it
mdevs = self.instance_claimed_mdevs.pop(instance.uuid, None)
if mdevs:
# The live migration was aborted, we need to remove the reserved
# values.
LOG.debug("Unclaiming mdevs %s from instance %s",
mdevs, instance.uuid)
def _cleanup_vpmems(self, vpmems):
for vpmem in vpmems:
@@ -8423,6 +8435,12 @@ class LibvirtDriver(driver.ComputeDriver):
found in the hypervisor.
"""
allocated_mdevs = {}
# Add the reserved mediated devices for live-migration
for instance_uuid, mdev_uuids in self.instance_claimed_mdevs.items():
if instance and instance.uuid != instance_uuid:
continue
for mdev in mdev_uuids:
allocated_mdevs[mdev] = instance_uuid
if instance:
# NOTE(sbauza): In some cases (like a migration issue), the
# instance can exist in the Nova database but libvirt doesn't know
@@ -8435,7 +8453,8 @@ class LibvirtDriver(driver.ComputeDriver):
except exception.InstanceNotFound:
# Bail out early if libvirt doesn't know about it since we
# can't know the existing mediated devices
- return {}
+ # Some mdevs could be claimed for that instance
+ return allocated_mdevs
guests = [guest]
else:
guests = self._host.list_guests(only_running=False)
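The loop added above folds those reservations into the assigned-mdev view before any libvirt domain is inspected. A standalone sketch of that merge, with plain dicts standing in for the driver state (hypothetical helper, not the actual method):

def merged_mdev_view(claimed, domain_assigned):
    # claimed: instance UUID -> [mdev UUIDs] reserved for live migration.
    # domain_assigned: mdev UUID -> instance UUID, read from libvirt guests.
    # Returns a combined mdev UUID -> instance UUID mapping.
    merged = {}
    for instance_uuid, mdev_uuids in claimed.items():
        for mdev in mdev_uuids:
            merged[mdev] = instance_uuid
    merged.update(domain_assigned)
    return merged

claimed = {'inst-2': ['mdev-3']}
domain_assigned = {'mdev-0': 'inst-1'}
assert merged_mdev_view(claimed, domain_assigned) == {
    'mdev-3': 'inst-2', 'mdev-0': 'inst-1'}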
@@ -9798,6 +9817,33 @@ class LibvirtDriver(driver.ComputeDriver):
'src_types': list(src_mdev_types.values()),
'dest_types': self.supported_vgpu_types}))
raise exception.MigrationPreCheckError(reason)
dst_mdevs = self._allocate_mdevs(allocs)
dst_mdev_types = self._get_mdev_types_from_uuids(dst_mdevs)
target_mdevs: ty.Dict[str, str] = {}
for src_mdev, src_type in src_mdev_types.items():
for dst_mdev, dst_type in dst_mdev_types.items():
# we want to associate by 1:1 between dst and src mdevs
if (src_type == dst_type and
src_type not in target_mdevs and
dst_mdev not in target_mdevs.values()):
target_mdevs[src_mdev] = dst_mdev
continue
if len(target_mdevs) != len(src_mdev_types):
reason = (_('Unable to migrate %(instance_uuid)s: '
'Source mdevs %(src_mdevs)s are not '
'fully mapped for this compute : %(targets)s ' %
{'instance_uuid': instance.uuid,
'src_mdevs': list(src_mdev_types.keys()),
'targets': target_mdevs}))
raise exception.MigrationPreCheckError(reason)
LOG.debug('Source mediated devices are now associated with those '
'existing mediated devices '
'(source uuid : dest uuid): %s', str(target_mdevs))
migrate_data.target_mdevs = target_mdevs
self.instance_claimed_mdevs[instance.uuid] = dst_mdevs
LOG.info("Current mediated devices reserved by this host "
"(instance UUID: list of reserved mdev UUIDs) : %s ",
self.instance_claimed_mdevs)
return migrate_data
def post_claim_migrate_data(self, context, instance, migrate_data, claim):
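The pairing above associates source and destination mdevs of the same type one-to-one and fails the pre-check when any source mdev is left unmapped. A simplified standalone sketch of that matching, assuming both inputs map mdev UUID to mdev type (hypothetical helper, not the exact driver condition):

def pair_mdevs_by_type(src_mdev_types, dst_mdev_types):
    # Returns a source mdev UUID -> destination mdev UUID mapping.
    target_mdevs = {}
    for src_mdev, src_type in src_mdev_types.items():
        for dst_mdev, dst_type in dst_mdev_types.items():
            if (src_type == dst_type
                    and src_mdev not in target_mdevs
                    and dst_mdev not in target_mdevs.values()):
                target_mdevs[src_mdev] = dst_mdev
                break
    if len(target_mdevs) != len(src_mdev_types):
        raise RuntimeError('not every source mdev could be paired')
    return target_mdevs

assert pair_mdevs_by_type(
    {'src-1': 'nvidia-35'}, {'dst-1': 'nvidia-35'}) == {'src-1': 'dst-1'}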
@@ -10916,6 +10962,12 @@ class LibvirtDriver(driver.ComputeDriver):
instance, migrate_data)
if os.path.exists(instance_dir):
shutil.rmtree(instance_dir)
mdevs = self.instance_claimed_mdevs.pop(instance.uuid, None)
if mdevs:
# The live migration is aborted, we need to remove the reserved
# values.
LOG.debug("Unclaiming mdevs %s from instance %s",
mdevs, instance.uuid)
def _pre_live_migration_plug_vifs(self, instance, network_info,
migrate_data):
@@ -11298,6 +11350,12 @@ class LibvirtDriver(driver.ComputeDriver):
"""
self._reattach_instance_vifs(context, instance, network_info)
self._qemu_monitor_announce_self(instance)
mdevs = self.instance_claimed_mdevs.pop(instance.uuid, None)
if mdevs:
# The live migration is done, the related mdevs are now associated
# to the domain XML so we can remove the reserved values.
LOG.debug("Unclaiming mdevs %s from instance %s",
mdevs, instance.uuid)
def _get_instance_disk_info_from_config(self, guest_config,
block_device_info):