Merge "libvirt: pass the mdevs when rebooting the guest"

Zuul 2018-01-27 15:29:32 +00:00, committed by Gerrit Code Review
commit ec8b9a59c0
3 changed files with 74 additions and 9 deletions


@@ -12371,7 +12371,9 @@ class LibvirtConnTestCase(test.NoDBTestCase,
     @mock.patch('nova.virt.libvirt.LibvirtDriver.'
                 '_get_instance_disk_info_from_config')
     @mock.patch('nova.virt.libvirt.LibvirtDriver.destroy')
-    def test_hard_reboot(self, mock_destroy, mock_get_disk_info,
+    @mock.patch('nova.virt.libvirt.LibvirtDriver.'
+                '_get_all_assigned_mediated_devices')
+    def test_hard_reboot(self, mock_get_mdev, mock_destroy, mock_get_disk_info,
                          mock_get_guest_xml, mock_create_domain_and_network,
                          mock_get_info):
         self.context.auth_token = True  # any non-None value will suffice
@@ -12389,6 +12391,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
                     "<target dev='vdb' bus='virtio'/></disk>"
                     "</devices></domain>")

+        mock_get_mdev.return_value = {uuids.mdev1: uuids.inst1}
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)

         return_values = [hardware.InstanceInfo(state=power_state.SHUTDOWN),
@@ -12421,10 +12424,14 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         for name in ('disk', 'disk.local'):
             self.assertTrue(disks[name].cache.called)

+        mock_get_mdev.assert_called_once_with(instance)
         mock_destroy.assert_called_once_with(self.context, instance,
                                              network_info, destroy_disks=False,
                                              block_device_info=block_device_info)
-        mock_get_guest_xml.assert_called_once_with(self.context, instance,
-            network_info, mock.ANY, mock.ANY,
-            block_device_info=block_device_info)
+        mock_get_guest_xml.assert_called_once_with(self.context, instance,
+            network_info, mock.ANY, mock.ANY,
+            block_device_info=block_device_info, mdevs=[uuids.mdev1])
         mock_create_domain_and_network.assert_called_once_with(self.context,
             dummyxml, instance, network_info,
             block_device_info=block_device_info)
@@ -12442,9 +12449,11 @@ class LibvirtConnTestCase(test.NoDBTestCase,
     @mock.patch('nova.virt.libvirt.LibvirtDriver._get_guest_config')
     @mock.patch('nova.virt.libvirt.blockinfo.get_disk_info')
     @mock.patch('nova.virt.libvirt.LibvirtDriver._destroy')
+    @mock.patch('nova.virt.libvirt.LibvirtDriver.'
+                '_get_all_assigned_mediated_devices')
     def test_hard_reboot_does_not_call_glance_show(self,
-            mock_destroy, mock_get_disk_info, mock_get_guest_config,
-            mock_get_instance_path, mock_write_to_file,
+            mock_get_mdev, mock_destroy, mock_get_disk_info,
+            mock_get_guest_config, mock_get_instance_path, mock_write_to_file,
             mock_get_instance_disk_info, mock_create_images_and_backing,
             mock_create_domand_and_network, mock_prepare_pci_devices_for_use,
             mock_get_instance_pci_devs, mock_looping_call, mock_ensure_tree):
@@ -12462,6 +12471,8 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         instance = objects.Instance(**self.test_instance)
+        mock_get_mdev.return_value = {}
+
         network_info = mock.MagicMock()
         block_device_info = mock.MagicMock()
         mock_get_disk_info.return_value = {}
@@ -18634,6 +18645,28 @@ class LibvirtDriverTestCase(test.NoDBTestCase):
         self.assertEqual({uuids.mdev: guest2.uuid},
                          drvr._get_all_assigned_mediated_devices())

+    @mock.patch.object(host.Host, 'get_guest')
+    def test_get_all_assigned_mediated_devices_for_an_instance(self,
+                                                               get_guest):
+        dom_with_vgpu = """
+              <domain type="kvm">
+                <devices>
+                 <hostdev mode='subsystem' type='mdev' model='vfio-pci'>
+                  <source>
+                   <address uuid='%s'/>
+                  </source>
+                 </hostdev>
+                </devices>
+              </domain>
+              """ % uuids.mdev
+        guest = libvirt_guest.Guest(FakeVirtDomain(fake_xml=dom_with_vgpu))
+        get_guest.return_value = guest
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        fake_inst = objects.Instance()
+        self.assertEqual({uuids.mdev: guest.uuid},
+                         drvr._get_all_assigned_mediated_devices(fake_inst))
+        get_guest.assert_called_once_with(fake_inst)
+
     def test_allocate_mdevs_with_no_vgpu_allocations(self):
         allocations = {
             'rp1': {

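Aside: what the new test above exercises boils down to pulling mdev UUIDs out of the guest's <hostdev type='mdev'> XML. Here is a minimal standalone sketch of that idea using plain lxml rather than Nova's LibvirtConfigGuestHostdevMDEV config class; the UUID is an arbitrary example value, not one from the patch.

    from lxml import etree

    DOM_WITH_VGPU = """<domain type="kvm">
      <devices>
        <hostdev mode='subsystem' type='mdev' model='vfio-pci'>
          <source>
            <address uuid='4b20d080-1b54-4048-85b3-a6a62d165c01'/>
          </source>
        </hostdev>
      </devices>
    </domain>"""

    # Find every mdev hostdev and collect the UUID of its source address.
    root = etree.fromstring(DOM_WITH_VGPU.encode())
    mdev_uuids = [address.get('uuid') for address in
                  root.findall("./devices/hostdev[@type='mdev']/source/address")]
    print(mdev_uuids)  # ['4b20d080-1b54-4048-85b3-a6a62d165c01']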

@@ -2589,6 +2589,10 @@ class LibvirtDriver(driver.ComputeDriver):
         re-creates the domain to ensure the reboot happens, as the guest
         OS cannot ignore this action.
         """
+        # NOTE(sbauza): Since we undefine the guest XML when destroying, we
+        # need to remember the existing mdevs for reusing them.
+        mdevs = self._get_all_assigned_mediated_devices(instance)
+        mdevs = list(mdevs.keys())
         # NOTE(mdbooth): In addition to performing a hard reboot of the domain,
         # the hard reboot operation is relied upon by operators to be an
         # automated attempt to fix as many things as possible about a
@@ -2617,7 +2621,8 @@ class LibvirtDriver(driver.ComputeDriver):
         # are in place.
         xml = self._get_guest_xml(context, instance, network_info, disk_info,
                                   instance.image_meta,
-                                  block_device_info=block_device_info)
+                                  block_device_info=block_device_info,
+                                  mdevs=mdevs)

         # NOTE(mdbooth): context.auth_token will not be set when we call
         # _hard_reboot from resume_state_on_host_boot()
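Read together, the two _hard_reboot hunks above amount to the abridged flow below. This is a sketch, not the full method: the destroy() and _get_guest_xml() calls match the assertions in the tests above, the disk_info line is approximated from the blockinfo.get_disk_info mock, and everything between the shown steps is elided.

    def _hard_reboot(self, context, instance, network_info,
                     block_device_info=None):
        # Remember the assigned mdevs *before* destroying the domain, since
        # destroy() undefines the guest XML and the assignment would be lost.
        mdevs = list(self._get_all_assigned_mediated_devices(instance).keys())

        self.destroy(context, instance, network_info, destroy_disks=False,
                     block_device_info=block_device_info)

        # Approximation of the call mocked in the tests above.
        disk_info = blockinfo.get_disk_info(CONF.libvirt.virt_type, instance,
                                            instance.image_meta,
                                            block_device_info)

        # Re-generate the guest XML with the remembered mdevs so the
        # rebooted domain keeps its vGPUs.
        xml = self._get_guest_xml(context, instance, network_info, disk_info,
                                  instance.image_meta,
                                  block_device_info=block_device_info,
                                  mdevs=mdevs)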
@@ -5767,15 +5772,22 @@ class LibvirtDriver(driver.ComputeDriver):
                     mediated_devices.append(device)
         return mediated_devices

-    def _get_all_assigned_mediated_devices(self):
+    def _get_all_assigned_mediated_devices(self, instance=None):
         """Lookup all instances from the host and return all the mediated
         devices that are assigned to a guest.

+        :param instance: Only return mediated devices for that instance.
+
         :returns: A dictionary of keys being mediated device UUIDs and their
                   respective values the instance UUID of the guest using it.
         """
         allocated_mdevs = {}
-        for guest in self._host.list_guests(only_running=False):
+        if instance:
+            guest = self._host.get_guest(instance)
+            guests = [guest]
+        else:
+            guests = self._host.list_guests(only_running=False)
+        for guest in guests:
             cfg = guest.get_config()
             for device in cfg.devices:
                 if isinstance(device, vconfig.LibvirtConfigGuestHostdevMDEV):
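Pieced together, the patched helper reads as below. The hunk truncates the body after the isinstance() check, so the final dictionary assignment is inferred from the {uuids.mdev: guest.uuid} shape the tests assert; treat that line as a reconstruction, not a verbatim quote.

    def _get_all_assigned_mediated_devices(self, instance=None):
        """Lookup all instances from the host and return all the mediated
        devices that are assigned to a guest.

        :param instance: Only return mediated devices for that instance.

        :returns: A dictionary of keys being mediated device UUIDs and their
                  respective values the instance UUID of the guest using it.
        """
        allocated_mdevs = {}
        if instance:
            # Scope the lookup to the single guest backing this instance.
            guest = self._host.get_guest(instance)
            guests = [guest]
        else:
            guests = self._host.list_guests(only_running=False)
        for guest in guests:
            cfg = guest.get_config()
            for device in cfg.devices:
                if isinstance(device, vconfig.LibvirtConfigGuestHostdevMDEV):
                    # Reconstructed: map mdev UUID -> owning guest UUID.
                    allocated_mdevs[device.uuid] = guest.uuid
        return allocated_mdevs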


@@ -30,9 +30,28 @@ features:
     different types but there is no possibility yet to specify in the flavor
     which specific type we want to use for that instance.
   * Suspending a guest having vGPUs doesn't work yet given a libvirt concern
-    (it can't hot-unplug mediated devices from a guest). For the moment,
-    please don't restart instances (or suspend/resume them) or the VGPU
-    related device will be removed from the guest.
+    (it can't hot-unplug mediated devices from a guest). Workarounds using
+    other instance actions (like snapshotting the instance or shelving it)
+    are recommended until libvirt supports that.
+  * Resizing an instance to a new flavor that has vGPU resources doesn't
+    allocate those vGPUs to the instance (the instance is created without
+    vGPU resources). The proposed workaround is to rebuild the instance
+    after it has been resized, so that it then has vGPUs allocated.
+  * Migrating an instance to another host has the same problem as resize.
+    If you want to migrate an instance, make sure to rebuild it afterwards.
+  * Rescuing an instance that has vGPUs means the rescue image won't use
+    the existing vGPUs. When unrescued, the instance will again use the
+    vGPUs that were originally allocated to it. That said, since Nova looks
+    at all allocated vGPUs when trying to find unallocated ones, there is a
+    race condition if an instance is rescued at the moment a new instance
+    asking for vGPUs is created: both instances could end up using the same
+    vGPUs. If you want to rescue an instance, make sure to disable the host
+    until this is fixed in Nova.
   * Mediated devices that are created by the libvirt driver are not persisted
     upon reboot. Consequently, a guest startup would fail since the virtual
@@ -45,7 +64,8 @@ features:
     the nvidia driver that prevents one guest from having more than one
     virtual GPU from the same physical card. One guest can have two or more
     virtual GPUs but then it requires each vGPU to be hosted by a separate
-    physical card.
+    physical card. Until that limitation is removed, please avoid creating
+    flavors asking for more than one vGPU.

 We are working actively to remove or work around those caveats, but please
 understand that for the moment this feature is experimental given all the