XenAPI: create vGPU for instance

This commit parses the allocation to check if VGPU is
allocated. If yes, maps the allocation to GPU group and
vGPU type which can be understood by XenAPI. Then creates
the vGPU for the instance.
Before booting an instance with a vGPU, we need to check whether
the GPU group has any vGPUs remaining. So this commit also updates
_get_vgpu_stats_in_group() to report the remaining vGPU capacity.
As the vGPU feature becomes usable with this commit, a release note
is also included.

blueprint: add-support-for-vgpu

Change-Id: Ie24dde0f1fd4b281d598f4040097d82ad251eb06
This commit is contained in:
Jianghua Wang 2017-11-01 05:18:53 +00:00
parent 07c925a532
commit 2cbca2c6dd
7 changed files with 243 additions and 17 deletions

View File

@ -18,8 +18,10 @@ import math
import mock
from oslo_utils import units
from nova import exception
from nova.objects import fields as obj_fields
from nova.tests.unit.virt.xenapi import stubs
from nova.tests import uuidsentinel as uuids
from nova.virt import driver
from nova.virt import fake
from nova.virt import xenapi
@ -71,9 +73,10 @@ class XenAPIDriverTestCase(stubs.XenAPITestBaseNoDB):
'vgpu_stats': {
'c8328467-badf-43d8-8e28-0e096b0f88b1':
{'uuid': '6444c6ee-3a49-42f5-bebb-606b52175e67',
'total': 7,
'max_heads': 1,
'type_name': 'Intel GVT-g',
'max_heads': 1,
'total': 7,
'remaining': 7,
},
}}
@ -322,3 +325,98 @@ class XenAPIDriverTestCase(stubs.XenAPITestBaseNoDB):
vgpu_total = drv._get_vgpu_total(vgpu_stats)
self.assertEqual(11, vgpu_total)
def test_get_vgpu_info_no_vgpu_alloc(self):
    # The allocation contains no VGPU resource class, so no vGPU
    # info should be derived from it.
    allocations = {
        'rp1': {
            'resources': {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1},
        },
    }
    driver = self._get_driver()
    self.assertIsNone(driver._get_vgpu_info(allocations))
@mock.patch.object(host.HostState, 'get_host_stats')
def test_get_vgpu_info_has_vgpu_alloc(self, mock_get_stats):
    # The allocation requests one vGPU.
    allocations = {
        'rp1': {
            'resources': {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1,
                          'VGPU': 1},
        },
    }
    # Two GPU groups supply the same vGPU type. The first group has
    # no remaining vGPUs; the second still has capacity. The driver
    # must return the uuid of the second group.
    base_stat = {'uuid': uuids.vgpu_type,
                 'type_name': 'GRID K180Q',
                 'max_heads': 4,
                 'total': 2}
    vgpu_stats = {
        uuids.gpu_group_1: dict(base_stat, remaining=0),
        uuids.gpu_group_2: dict(base_stat, remaining=2),
    }
    stats = self.host_stats()
    stats.update(vgpu_stats=vgpu_stats)
    mock_get_stats.return_value = stats
    driver = self._get_driver()
    expected_info = {'gpu_grp_uuid': uuids.gpu_group_2,
                     'vgpu_type_uuid': uuids.vgpu_type}
    self.assertEqual(expected_info, driver._get_vgpu_info(allocations))
@mock.patch.object(host.HostState, 'get_host_stats')
def test_get_vgpu_info_has_vgpu_alloc_except(self, mock_get_stats):
    # A vGPU is allocated, but the only GPU group has zero remaining
    # capacity, so resolving the allocation must raise.
    allocations = {
        'rp1': {
            'resources': {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1,
                          'VGPU': 1},
        },
    }
    stats = self.host_stats()
    stats.update(vgpu_stats={
        uuids.gpu_group: {
            'uuid': uuids.vgpu_type,
            'type_name': 'Intel GVT-g',
            'max_heads': 1,
            'total': 7,
            'remaining': 0,
        },
    })
    mock_get_stats.return_value = stats
    driver = self._get_driver()
    self.assertRaises(exception.ComputeResourcesUnavailable,
                      driver._get_vgpu_info,
                      allocations)

View File

@ -92,6 +92,7 @@ class VGPUTestCase(test.NoDBTestCase):
'type_name_2', # VGPU_type.get_model_name
'type_uuid_2', # VGPU_type.get_uuid
'4', # VGPU_type.get_max_heads
'6', # GPU_group.get_remaining_capacity
]
host_obj = host.HostState(session)
@ -102,8 +103,9 @@ class VGPUTestCase(test.NoDBTestCase):
'type_name': 'type_name_2',
'max_heads': 4,
'total': 7,
'remaining': 6,
}
self.assertEqual(session.call_xenapi.call_count, 5)
self.assertEqual(session.call_xenapi.call_count, 6)
# It should get_uuid for the vGPU type passed via *enabled_vgpu_types*
# (the arg for get_uuid should be 'type_ref_2').
get_uuid_call = [mock.call('VGPU_type.get_uuid', 'type_ref_2')]
@ -125,6 +127,7 @@ class VGPUTestCase(test.NoDBTestCase):
'type_name_2', # VGPU_type.get_model_name
'type_uuid_1', # VGPU_type.get_uuid
'4', # VGPU_type.get_max_heads
'6', # GPU_group.get_remaining_capacity
]
host_obj = host.HostState(session)
@ -136,8 +139,9 @@ class VGPUTestCase(test.NoDBTestCase):
'type_name': 'type_name_1',
'max_heads': 4,
'total': 7,
'remaining': 6,
}
self.assertEqual(session.call_xenapi.call_count, 5)
self.assertEqual(session.call_xenapi.call_count, 6)
# It should call get_uuid for the first vGPU type (the arg for get_uuid
# should be 'type_ref_1').
get_uuid_call = [mock.call('VGPU_type.get_uuid', 'type_ref_1')]

View File

@ -326,6 +326,7 @@ class SpawnTestCase(VMOpsTestBase):
'apply_instance_filter')
self.mox.StubOutWithMock(self.vmops, '_update_last_dom_id')
self.mox.StubOutWithMock(self.vmops._session, 'call_xenapi')
self.mox.StubOutWithMock(self.vmops, '_attach_vgpu')
@staticmethod
def _new_instance(obj):
@ -337,7 +338,7 @@ class SpawnTestCase(VMOpsTestBase):
def _test_spawn(self, name_label_param=None, block_device_info_param=None,
rescue=False, include_root_vdi=True, throw_exception=None,
attach_pci_dev=False, neutron_exception=False,
network_info=None):
network_info=None, vgpu_info=None):
self._stub_out_common()
instance = self._new_instance({"name": "dummy", "uuid": "fake_uuid",
@ -422,6 +423,9 @@ class SpawnTestCase(VMOpsTestBase):
"0/0000:00:00.0")
else:
pci_manager.get_instance_pci_devs(instance).AndReturn([])
self.vmops._attach_vgpu(vm_ref, vgpu_info, instance)
step += 1
self.vmops._update_instance_progress(context, instance, step, steps)
@ -491,8 +495,8 @@ class SpawnTestCase(VMOpsTestBase):
self.mox.ReplayAll()
self.vmops.spawn(context, instance, image_meta, injected_files,
admin_password, network_info,
block_device_info_param, name_label_param, rescue)
admin_password, network_info, block_device_info_param,
vgpu_info, name_label_param, rescue)
def test_spawn(self):
    # Spawn with all defaults: no PCI devices, no vGPU, no rescue,
    # no injected exception.
    self._test_spawn()
@ -505,6 +509,11 @@ class SpawnTestCase(VMOpsTestBase):
def test_spawn_with_pci_available_on_the_host(self):
    # attach_pci_dev=True makes the stubbed pci_manager report a PCI
    # device for the instance, exercising the PCI-attach path of spawn.
    self._test_spawn(attach_pci_dev=True)
def test_spawn_with_vgpu(self):
    # Use the same key names that XenAPIDriver._get_vgpu_info()
    # produces and VMOps._attach_vgpu() consumes ('gpu_grp_uuid' /
    # 'vgpu_type_uuid'), so the fixture matches real spawn input.
    # The original used 'grp_uuid', which only passed because
    # _attach_vgpu is stubbed out in this test.
    vgpu_info = {'gpu_grp_uuid': uuids.gpu_group_1,
                 'vgpu_type_uuid': uuids.vgpu_type_1}
    self._test_spawn(vgpu_info=vgpu_info)
def test_spawn_performs_rollback_and_throws_exception(self):
    # An exception injected into a spawn step must propagate back to
    # the caller (rollback behavior is set up inside _test_spawn).
    self.assertRaises(test.TestingException, self._test_spawn,
                      throw_exception=test.TestingException())
@ -645,7 +654,8 @@ class SpawnTestCase(VMOpsTestBase):
self._test_spawn, neutron_exception=True)
def _test_finish_migration(self, power_on=True, resize_instance=True,
throw_exception=None, booted_from_volume=False):
throw_exception=None, booted_from_volume=False,
vgpu_info=None):
self._stub_out_common()
self.mox.StubOutWithMock(volumeops.VolumeOps, "connect_volume")
self.mox.StubOutWithMock(vm_utils, "import_all_migrated_disks")
@ -704,6 +714,8 @@ class SpawnTestCase(VMOpsTestBase):
self.vmops._attach_mapped_block_devices(instance, block_device_info)
pci_manager.get_instance_pci_devs(instance).AndReturn([])
self.vmops._attach_vgpu(vm_ref, vgpu_info, instance)
self.vmops._inject_instance_metadata(instance, vm_ref)
self.vmops._inject_auto_disk_config(instance, vm_ref)
self.vmops._file_inject_vm_settings(instance, vm_ref, vdis,

View File

@ -168,12 +168,65 @@ class XenAPIDriver(driver.ComputeDriver):
"""
return self._vmops.list_instance_uuids()
def _is_vgpu_allocated(self, allocations):
    """Check whether the allocations include any vGPU resource.

    :param allocations: dict keyed by resource provider uuid, each
        value holding a 'resources' dict, as returned by
        SchedulerReportClient.get_allocations_for_consumer.
    :returns: True if at least one resource provider allocates a
        positive amount of VGPU; False otherwise (including when
        allocations is None or empty).
    """
    if not allocations:
        # If no allocations, there is no vGPU request.
        return False
    RC_VGPU = fields.ResourceClass.VGPU
    # Guarded get() so a provider whose 'resources' entry is missing
    # or None is simply skipped instead of raising.
    return any((alloc.get('resources') or {}).get(RC_VGPU, 0) > 0
               for alloc in allocations.values())
def _get_vgpu_info(self, allocations):
    """Get vGPU info based on the allocations.

    :param allocations: Information about resources allocated to the
                        instance via placement, of the form returned by
                        SchedulerReportClient.get_allocations_for_consumer.
    :returns: Dictionary with 'gpu_grp_uuid' and 'vgpu_type_uuid' keys
              if any vGPU is allocated; None otherwise.
    :raises: exception.ComputeResourcesUnavailable if a vGPU is
             allocated but no GPU group has remaining vGPU capacity.
    """
    if not self._is_vgpu_allocated(allocations):
        return None

    # NOTE(jianghuaw): At the moment, we associate all vGPUs resource to
    # the compute node regardless which GPU group the vGPUs belong to, so
    # we need search all GPU groups until we got one group which has
    # remaining capacity to supply one vGPU. Once we switch to the
    # nested resource providers, the allocations will contain the resource
    # provider which represents a particular GPU group. It's able to get
    # the GPU group and vGPU type directly by using the resource provider's
    # uuid. Then we can consider moving this function to vmops, as there is
    # no need to query host stats to get all GPU groups.
    host_stats = self.host_state.get_host_stats(refresh=True)
    vgpu_stats = host_stats['vgpu_stats']
    # Iterate items() once instead of re-indexing the dict per key.
    for grp_uuid, stat in vgpu_stats.items():
        if stat['remaining'] > 0:
            # NOTE(jianghuaw): As XenServer only supports single vGPU per
            # VM, we've restricted the inventory data having `max_unit` as
            # 1. If it reached here, surely only one GPU is allocated.
            # So just return the GPU group uuid and vGPU type uuid once
            # we got one group which still has remaining vGPUs.
            return dict(gpu_grp_uuid=grp_uuid,
                        vgpu_type_uuid=stat['uuid'])
    # No remaining vGPU available: e.g. the vGPU resource has been used
    # by other instances or the vGPU has been changed to be disabled.
    raise exception.ComputeResourcesUnavailable(
        reason='vGPU resource is not available')
def spawn(self, context, instance, image_meta, injected_files,
          admin_password, allocations, network_info=None,
          block_device_info=None):
    """Create VM instance.

    Resolves any vGPU allocation into the GPU group / vGPU type info
    XenAPI understands (which may raise ComputeResourcesUnavailable
    if no vGPU capacity remains), then delegates the actual VM
    creation to vmops.
    """
    vgpu_info = self._get_vgpu_info(allocations)
    self._vmops.spawn(context, instance, image_meta, injected_files,
                      admin_password, network_info, block_device_info,
                      vgpu_info)
def confirm_migration(self, context, migration, instance, network_info):
"""Confirms a resize, destroying the source VM."""

View File

@ -272,9 +272,10 @@ class HostState(object):
The return value is a dict. For example:
{'uuid': '6444c6ee-3a49-42f5-bebb-606b52175e67',
'total': 7,
'max_heads': '1',
'type_name': 'Intel GVT-g',
'max_heads': 1,
'total': 7,
'remaining': 7,
}
"""
type_refs_in_grp = self._session.call_xenapi(
@ -309,6 +310,10 @@ class HostState(object):
'VGPU_type.get_max_heads', type_ref))
stat['total'] = self._get_total_vgpu_in_grp(grp_ref, type_ref)
stat['remaining'] = int(self._session.call_xenapi(
'GPU_group.get_remaining_capacity',
grp_ref,
type_ref))
return stat
def _get_total_vgpu_in_grp(self, grp_ref, type_ref):

View File

@ -400,9 +400,26 @@ class VMOps(object):
other_config['last_dom_id'] = self._session.VM.get_domid(vm_ref)
self._session.VM.set_other_config(vm_ref, other_config)
def _attach_vgpu(self, vm_ref, vgpu_info, instance):
    """Create a vGPU for the VM when vgpu_info is provided.

    vgpu_info is the dict produced by the driver ('gpu_grp_uuid' and
    'vgpu_type_uuid' keys); a falsy value means no vGPU was allocated
    and this is a no-op.
    """
    if not vgpu_info:
        return
    call_xenapi = self._session.call_xenapi
    gpu_grp_ref = call_xenapi("GPU_group.get_by_uuid",
                              vgpu_info['gpu_grp_uuid'])
    vgpu_type_ref = call_xenapi("VGPU_type.get_by_uuid",
                                vgpu_info['vgpu_type_uuid'])
    # NOTE(jianghuaw): set other-config with "nova-instance-uuid" to
    # declare which nova instance owns this vGPU. That should be useful
    # for tracking purposes. '0' is the device id for VGPU. As we only
    # support one VGPU at the moment, so only '0' is the valid value.
    # Refer to https://xapi-project.github.io/xen-api/classes/vgpu.html
    # for this Xen API of 'VGPU.create'.
    owner_tag = {'nova-instance-uuid': instance['uuid']}
    call_xenapi('VGPU.create', vm_ref, gpu_grp_ref, '0', owner_tag,
                vgpu_type_ref)
def spawn(self, context, instance, image_meta, injected_files,
admin_password, network_info=None, block_device_info=None,
name_label=None, rescue=False):
vgpu_info=None, name_label=None, rescue=False):
if block_device_info:
LOG.debug("Block device information present: %s",
@ -432,12 +449,12 @@ class VMOps(object):
return vdis
self._spawn(context, instance, image_meta, step, create_disks_step,
True, injected_files, admin_password,
network_info, block_device_info, name_label, rescue)
True, injected_files, admin_password, network_info,
block_device_info, vgpu_info, name_label, rescue)
def _spawn(self, context, instance, image_meta, step, create_disks_step,
first_boot, injected_files=None, admin_password=None,
network_info=None, block_device_info=None,
network_info=None, block_device_info=None, vgpu_info=None,
name_label=None, rescue=False, power_on=True, resize=True,
completed_callback=None):
if name_label is None:
@ -515,9 +532,15 @@ class VMOps(object):
return vm_ref
@step
def attach_devices_step(undo_mgr, vm_ref, vdis, disk_image_type):
def attach_devices_step(undo_mgr, vm_ref, vdis, disk_image_type,
vgpu_info):
attach_disks(undo_mgr, vm_ref, vdis, disk_image_type)
attach_pci_devices(undo_mgr, vm_ref)
# NOTE(jianghuaw): in XAPI, the VGPU record is associated with a
# VM since creation. The record will be destroyed automatically
# once VM is destroyed. So there is no need to add any additional
# undo functions for VGPU.
self._attach_vgpu(vm_ref, vgpu_info, instance)
if rescue:
# NOTE(johannes): Attach disks from original VM to rescue VM now,
@ -583,7 +606,8 @@ class VMOps(object):
vm_ref = create_vm_record_step(undo_mgr, disk_image_type,
kernel_file, ramdisk_file)
attach_devices_step(undo_mgr, vm_ref, vdis, disk_image_type)
attach_devices_step(undo_mgr, vm_ref, vdis, disk_image_type,
vgpu_info)
inject_instance_data_step(undo_mgr, vm_ref, vdis)

View File

@ -0,0 +1,30 @@
---
features:
- |
When using XenAPI driver for XenServer, we can support booting instances
with a vGPU attached to get better graphics processing capability.
In order to use this feature, the operators should specify the enabled
vGPU types in the nova compute configuration file with the configuration
option - ``[devices]/enabled_vgpu_types``. Only the enabled vGPU types
can be used by instances.
XenServer automatically detects and groups together identical physical
GPUs. Although the physical GPUs may support multiple vGPU types, at
the moment nova only supports a single vGPU type for each compute node.
The operators can run the following CLI commands in XenServer to get
the available vGPU types if the host supports vGPU.
* xe vgpu-type-list
The values of "model-name ( RO):" from the output of the above command
are the vGPU type names from which you can choose the value for the
nova configuration option - ``[devices]/enabled_vgpu_types``. Please
choose only one vGPU type to be enabled.
The operators should specify a vGPU resource in the flavor's extra_specs:
* nova flavor-key <flavor-id> set resources:VGPU=1
Then users can use the flavor to boot instances with a vGPU attached.
At the moment, XenServer doesn't support multiple vGPUs for a single
instance, so ``resources:VGPU`` in the flavor's extra_specs should
always be ``1``.