diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 675a64f0cef4..984caeefea4d 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -9161,7 +9161,7 @@ class ComputeManager(manager.Manager): rollback process There may be other resources which need cleanup; currently this is - limited to vPMEM devices with the libvirt driver. + limited to vPMEM and mdev devices with the libvirt driver. :param migrate_data: implementation specific data :param migr_ctxt: specific resources stored in migration_context @@ -9182,12 +9182,14 @@ class ComputeManager(manager.Manager): objects.LibvirtVPMEMDevice)): has_vpmem = True break + has_mdevs = 'target_mdevs' in migrate_data # No instance booting at source host, but instance dir # must be deleted for preparing next block migration # must be deleted for preparing next live migration w/o shared # storage # vpmem must be cleaned - do_cleanup = not migrate_data.is_shared_instance_path or has_vpmem + do_cleanup = (not migrate_data.is_shared_instance_path or + has_vpmem or has_mdevs) destroy_disks = not migrate_data.is_shared_block_storage elif isinstance(migrate_data, migrate_data_obj.HyperVLiveMigrateData): # NOTE(claudiub): We need to cleanup any zombie Planned VM. diff --git a/nova/tests/functional/libvirt/test_vgpu.py b/nova/tests/functional/libvirt/test_vgpu.py index 8764dc5d84e3..7def9bc6d875 100644 --- a/nova/tests/functional/libvirt/test_vgpu.py +++ b/nova/tests/functional/libvirt/test_vgpu.py @@ -582,6 +582,50 @@ class VGPULiveMigrationTests(base.LibvirtMigrationMixin, VGPUTestBase): self.assertRaises(KeyError, self.assert_mdev_usage, self.dest, 0) +class VGPULiveMigrationTestsLMFailed(VGPULiveMigrationTests): + """Tests that expect the live migration to fail, and exist to test the + rollback code. Stubs out fakelibvirt's migrateToURI3() with a stub that + "fails" the migration. + """ + + def _migrate_stub(self, domain, destination, params, flags): + """Designed to stub fakelibvirt's migrateToURI3 and "fail" the + live migration by monkeypatching jobStats() to return an error. + """ + + # During the migration, we reserved a mdev in the dest + self.assert_mdev_usage(self.src, 1) + self.assert_mdev_usage(self.dest, 1) + + # The resource update periodic task should not change the consumed + # mdevs, as the migration is still happening. As usual, running + # periodics is not necessary to make the test pass, but it's good to + # make sure it does the right thing. + self._run_periodics() + self.assert_mdev_usage(self.src, 1) + self.assert_mdev_usage(self.dest, 1) + + source = self.computes['src'] + conn = source.driver._host.get_connection() + dom = conn.lookupByUUIDString(self.server['id']) + dom.fail_job() + self.migrate_stub_ran = True + + def test_live_migrate_server(self): + self.server = self._create_server( + image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', + flavor_id=self.flavor, networks='auto', host=self.src.host) + inst = objects.Instance.get_by_uuid(self.context, self.server['id']) + mdevs = self.src.driver._get_all_assigned_mediated_devices(inst) + self.assertEqual(1, len(mdevs)) + + self._live_migrate(self.server, 'failed') + + # We released the reserved mdev after the migration failed. 
+ self.assert_mdev_usage(self.src, 1) + self.assert_mdev_usage(self.dest, 0) + + class DifferentMdevClassesTests(VGPUTestBase): def setUp(self): diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 41cdf7618209..fa351a648c45 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -11736,6 +11736,73 @@ class LibvirtConnTestCase(test.NoDBTestCase, self.context, instance_ref, dest_check_data, None, None, allocations) + @mock.patch.object(libvirt_driver.LibvirtDriver, + '_get_mdev_types_from_uuids') + @mock.patch.object(libvirt_driver.LibvirtDriver, + '_allocate_mdevs') + @mock.patch.object(libvirt_driver.LibvirtDriver, + '_get_supported_vgpu_types') + def test_check_source_migrate_data_at_dest_types_missing_mdevs(self, + mock_get, + mock_alloc, + mock_gmtfu): + """Raises an exception if the destination doesn't allocate all the + necessary mediated devices. + """ + mock_get.return_value = ['type1'] + mock_alloc.return_value = [uuids.dst_mdev1] + mock_gmtfu.return_value = {uuids.dst_mdev1: 'type1'} + instance_ref = objects.Instance(**self.test_instance) + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + self.assertEqual({}, drvr.instance_claimed_mdevs) + dest_check_data = objects.LibvirtLiveMigrateData( + filename="file", + block_migration=True, + disk_over_commit=False, + disk_available_mb=1024, + src_supports_mdev_live_migration=True, + source_mdev_types={uuids.src_mdev1: 'type1', + uuids.src_mdev2: 'type1'}) + allocations = {uuids.rp1: {'resources': {orc.VGPU: 2}}} + self.assertRaises(exception.MigrationPreCheckError, + drvr.check_source_migrate_data_at_dest, + self.context, instance_ref, dest_check_data, + None, None, allocations) + + @mock.patch.object(libvirt_driver.LibvirtDriver, + '_get_mdev_types_from_uuids') + @mock.patch.object(libvirt_driver.LibvirtDriver, + '_allocate_mdevs') + @mock.patch.object(libvirt_driver.LibvirtDriver, + '_get_supported_vgpu_types') + def test_check_source_migrate_data_at_dest_types_claim_mdevs(self, + mock_get, + mock_alloc, + mock_gmtfu): + mock_get.return_value = ['type1'] + mock_alloc.return_value = [uuids.dst_mdev1] + mock_gmtfu.return_value = {uuids.dst_mdev1: 'type1'} + instance_ref = objects.Instance(**self.test_instance) + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + self.assertEqual({}, drvr.instance_claimed_mdevs) + dest_check_data = objects.LibvirtLiveMigrateData( + filename="file", + block_migration=True, + disk_over_commit=False, + disk_available_mb=1024, + src_supports_mdev_live_migration=True, + source_mdev_types={uuids.src_mdev1: 'type1'}) + allocations = {uuids.rp1: {'resources': {orc.VGPU: 1}}} + drvr.check_source_migrate_data_at_dest( + self.context, instance_ref, dest_check_data, None, None, + allocations) + mock_alloc.assert_called_once_with(allocations) + mock_gmtfu.assert_called_once_with([uuids.dst_mdev1]) + self.assertEqual({uuids.src_mdev1: uuids.dst_mdev1}, + dest_check_data.target_mdevs) + self.assertEqual({instance_ref.uuid: [uuids.dst_mdev1]}, + drvr.instance_claimed_mdevs) + @mock.patch.object(fakelibvirt.Connection, 'getLibVersion') @mock.patch.object(fakelibvirt.Connection, 'getVersion') def _test_host_can_support_mdev_lm(self, mock_getversion, @@ -13426,12 +13493,13 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_get_instance_path.return_value = fake_instance_path drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + instance = objects.Instance(id=1, 
uuid=uuids.instance)
         migrate_data = objects.LibvirtLiveMigrateData(
             is_shared_instance_path=False,
             instance_relative_path=False)
         self.assertRaises(exception.Invalid,
                           drvr.rollback_live_migration_at_destination,
-                          "context", "instance", [], None, True, migrate_data)
+                          "context", instance, [], None, True, migrate_data)
         mock_exist.assert_called_once_with(fake_instance_path)
         mock_shutil.assert_called_once_with(fake_instance_path)
@@ -13453,17 +13521,24 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         mock_destroy.side_effect = fake_destroy
 
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        self.assertEqual({}, drvr.instance_claimed_mdevs)
+        instance = objects.Instance(id=1, uuid=uuids.instance)
         migrate_data = objects.LibvirtLiveMigrateData(
             is_shared_instance_path=True,
             instance_relative_path=False)
-        drvr.rollback_live_migration_at_destination("context", "instance", [],
+        drvr.instance_claimed_mdevs = {instance.uuid: [uuids.mdev1],
+                                       uuids.other_inst: [uuids.mdev2]}
+        drvr.rollback_live_migration_at_destination("context", instance, [],
                                                     None, True, migrate_data)
-        mock_destroy.assert_called_once_with("context", "instance", [],
+        mock_destroy.assert_called_once_with("context", instance, [],
                                              None, True)
         self.assertFalse(mock_get_instance_path.called)
         self.assertFalse(mock_exist.called)
         self.assertFalse(mock_shutil.called)
+        # Assert we delete existing claimed mdevs for the instance
+        self.assertEqual({uuids.other_inst: [uuids.mdev2]},
+                         drvr.instance_claimed_mdevs)
 
     @mock.patch.object(fakelibvirt.Domain, "XMLDesc")
     def test_live_migration_copy_disk_paths_tunnelled(self, mock_xml):
@@ -20257,15 +20332,20 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         mock_get_guest.return_value = guest
 
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        self.assertEqual({}, drvr.instance_claimed_mdevs)
         net_info = network_model.NetworkInfo()
         mock_get_interfaces.return_type = []
         mock_image_meta.return_value = mock.sentinel.image_meta
+        drvr.instance_claimed_mdevs = {instance.uuid: [uuids.mdev1],
+                                       uuids.other_inst: [uuids.mdev2]}
         drvr.post_live_migration_at_destination(mock.ANY, instance, net_info)
         # Assert that we don't try to write anything to the destination node
         # since the source live migrated with the VIR_MIGRATE_PERSIST_DEST flag
         mock_write_instance_config.assert_not_called()
         mock_attach.assert_not_called()
-
+        # Assert we delete existing claimed mdevs for the instance
+        self.assertEqual({uuids.other_inst: [uuids.mdev2]},
+                         drvr.instance_claimed_mdevs)
     vif = network_model.VIF(id=uuids.port_id,
                             vnic_type=network_model.VNIC_TYPE_NORMAL)
     vif_direct = network_model.VIF(id=uuids.port_id,
@@ -27058,6 +27138,40 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         self.assertEqual({},
                          drvr._get_all_assigned_mediated_devices(fake_inst))
 
+    @mock.patch.object(host.Host, 'list_guests')
+    def test_get_all_assigned_mediated_devices_with_claimed_mdevs(self,
+                                                                  list_guests):
+        dom_with_vgpu = """
+              <domain type="kvm">
+                <devices>
+                 <hostdev mode='subsystem' type='mdev' model='vfio-pci'>
+                  <source>
+                   <address uuid='%s'/>
+                  </source>
+                 </hostdev>
+                </devices>
+              </domain>
+              """ % uuids.mdev
+        guest1 = libvirt_guest.Guest(FakeVirtDomain())
+        guest2 = libvirt_guest.Guest(FakeVirtDomain(fake_xml=dom_with_vgpu))
+        list_guests.return_value = [guest1, guest2]
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        self.assertEqual({}, drvr.instance_claimed_mdevs)
+        # Just add some claimed mdevs by the live migration
+        drvr.instance_claimed_mdevs = {uuids.inst1: [uuids.mdev1, uuids.mdev2],
+                                       uuids.inst2: [uuids.mdev3]}
+        self.assertEqual({uuids.mdev: guest2.uuid,
+                          uuids.mdev1: uuids.inst1,
+                          uuids.mdev2: uuids.inst1,
+                          uuids.mdev3: uuids.inst2},
+                         drvr._get_all_assigned_mediated_devices())
+
+        # Just double-check we only return claimed mdevs from inst2 if we ask
+        # for it
+        fake_inst2 = objects.Instance(uuid=uuids.inst2)
+        self.assertEqual({uuids.mdev3: uuids.inst2},
+                         drvr._get_all_assigned_mediated_devices(fake_inst2))
+
     def test_allocate_mdevs_with_no_vgpu_allocations(self):
         allocations = {
             'rp1': {
@@ -30012,6 +30126,17 @@ class LibvirtPMEMNamespaceTests(test.NoDBTestCase):
         '''
         self.assertXmlEqual(expected, guest.to_xml())
 
+    def test_cleanup_lingering_instance_resources_deletes_mdevs(self):
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        self.assertEqual({}, drvr.instance_claimed_mdevs)
+        # Adding some claimed mdevs
+        instance = objects.Instance(id=1, uuid=uuids.instance, resources=None)
+        drvr.instance_claimed_mdevs = {instance.uuid: [uuids.mdev1],
+                                       uuids.other_inst: [uuids.mdev2]}
+        drvr.cleanup_lingering_instance_resources(instance)
+        self.assertEqual({uuids.other_inst: [uuids.mdev2]},
+                         drvr.instance_claimed_mdevs)
+
 
 @ddt.ddt
 class LibvirtDeviceRemoveEventTestCase(test.NoDBTestCase):
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 43bd2a044290..87ba7efc8616 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -540,6 +540,11 @@ class LibvirtDriver(driver.ComputeDriver):
         self.mdev_classes = set([])
         self.supported_vgpu_types = self._get_supported_vgpu_types()
 
+        # This dict is for knowing which mdevs are already claimed by some
+        # instance. This is keyed by instance UUID and the value is a list
+        # of mediated device UUIDs.
+        self.instance_claimed_mdevs = {}
+
         # Handles ongoing device manipultion in libvirt where we wait for the
         # events about success or failure.
         self._device_event_handler = AsyncDeviceEventsHandler()
@@ -1732,6 +1737,13 @@ class LibvirtDriver(driver.ComputeDriver):
         vpmems = self._get_vpmems(instance)
         if vpmems:
             self._cleanup_vpmems(vpmems)
+        # we may have some claimed mdev residue, we need to delete it
+        mdevs = self.instance_claimed_mdevs.pop(instance.uuid, None)
+        if mdevs:
+            # The live migration was aborted, we need to remove the reserved
+            # values.
+            LOG.debug("Unclaiming mdevs %s from instance %s",
+                      mdevs, instance.uuid)
 
     def _cleanup_vpmems(self, vpmems):
         for vpmem in vpmems:
@@ -8423,6 +8435,12 @@ class LibvirtDriver(driver.ComputeDriver):
                  found in the hypervisor.
""" allocated_mdevs = {} + # Add the reserved mediated devices for live-migration + for instance_uuid, mdev_uuids in self.instance_claimed_mdevs.items(): + if instance and instance.uuid != instance_uuid: + continue + for mdev in mdev_uuids: + allocated_mdevs[mdev] = instance_uuid if instance: # NOTE(sbauza): In some cases (like a migration issue), the # instance can exist in the Nova database but libvirt doesn't know @@ -8435,7 +8453,8 @@ class LibvirtDriver(driver.ComputeDriver): except exception.InstanceNotFound: # Bail out early if libvirt doesn't know about it since we # can't know the existing mediated devices - return {} + # Some mdevs could be claimed for that instance + return allocated_mdevs guests = [guest] else: guests = self._host.list_guests(only_running=False) @@ -9798,6 +9817,33 @@ class LibvirtDriver(driver.ComputeDriver): 'src_types': list(src_mdev_types.values()), 'dest_types': self.supported_vgpu_types})) raise exception.MigrationPreCheckError(reason) + dst_mdevs = self._allocate_mdevs(allocs) + dst_mdev_types = self._get_mdev_types_from_uuids(dst_mdevs) + target_mdevs: ty.Dict[str, str] = {} + for src_mdev, src_type in src_mdev_types.items(): + for dst_mdev, dst_type in dst_mdev_types.items(): + # we want to associate by 1:1 between dst and src mdevs + if (src_type == dst_type and + src_type not in target_mdevs and + dst_mdev not in target_mdevs.values()): + target_mdevs[src_mdev] = dst_mdev + continue + if len(target_mdevs) != len(src_mdev_types): + reason = (_('Unable to migrate %(instance_uuid)s: ' + 'Source mdevs %(src_mdevs)s are not ' + 'fully mapped for this compute : %(targets)s ' % + {'instance_uuid': instance.uuid, + 'src_mdevs': list(src_mdev_types.keys()), + 'targets': target_mdevs})) + raise exception.MigrationPreCheckError(reason) + LOG.debug('Source mediated devices are now associated with those ' + 'existing mediated devices ' + '(source uuid : dest uuid): %s', str(target_mdevs)) + migrate_data.target_mdevs = target_mdevs + self.instance_claimed_mdevs[instance.uuid] = dst_mdevs + LOG.info("Current mediated devices reserved by this host " + "(instance UUID: list of reserved mdev UUIDs) : %s ", + self.instance_claimed_mdevs) return migrate_data def post_claim_migrate_data(self, context, instance, migrate_data, claim): @@ -10916,6 +10962,12 @@ class LibvirtDriver(driver.ComputeDriver): instance, migrate_data) if os.path.exists(instance_dir): shutil.rmtree(instance_dir) + mdevs = self.instance_claimed_mdevs.pop(instance.uuid, None) + if mdevs: + # The live migration is aborted, we need to remove the reserved + # values. + LOG.debug("Unclaiming mdevs %s from instance %s", + mdevs, instance.uuid) def _pre_live_migration_plug_vifs(self, instance, network_info, migrate_data): @@ -11298,6 +11350,12 @@ class LibvirtDriver(driver.ComputeDriver): """ self._reattach_instance_vifs(context, instance, network_info) self._qemu_monitor_announce_self(instance) + mdevs = self.instance_claimed_mdevs.pop(instance.uuid, None) + if mdevs: + # The live migration is done, the related mdevs are now associated + # to the domain XML so we can remove the reserved values. + LOG.debug("Unclaiming mdevs %s from instance %s", + mdevs, instance.uuid) def _get_instance_disk_info_from_config(self, guest_config, block_device_info):