Cyborg NVIDIA GPU Driver support vGPU management
The Cyborg NVIDIA GPU Driver implemented pGPU management in the Train release; this patch adds vGPU management support to the same driver. For the spec of this feature, please refer to [0]. [0]: https://specs.openstack.org/openstack/cyborg-specs/specs/wallaby/approved/vgpu-driver-proposal.html Co-Authored-By: Wenping Song <songwenping@inspur.com> implement blueprint enable-vgpu-in-cyborg Change-Id: I715fdad3e8601186b5c6e8c087f27fb91d679490
This commit is contained in:
parent
eafcc2fc64
commit
79e1928554
@ -14,12 +14,19 @@
|
|||||||
|
|
||||||
import collections
|
import collections
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
from oslo_serialization import jsonutils
|
from oslo_serialization import jsonutils
|
||||||
|
|
||||||
from cyborg.common import exception
|
from cyborg.common import exception
|
||||||
|
|
||||||
|
|
||||||
|
# Template for a full PCI address "dddd:bb:ss.f": four hex digits of domain,
# two of bus, two of slot and one octal digit of function. "hex" and "oct"
# are placeholders that are expanded into character classes below.
# The "." separator is escaped so that only a literal dot is accepted, and
# the hex class is spelled out in ASCII because "\d" also matches non-ASCII
# decimal digits in str patterns.
_PCI_ADDRESS_PATTERN = (r"^(hex{4}):(hex{2}):(hex{2})\.(oct{1})$".
                        replace("hex", "[0-9a-fA-F]").
                        replace("oct", "[0-7]"))
_PCI_ADDRESS_REGEX = re.compile(_PCI_ADDRESS_PATTERN)
|
||||||
|
|
||||||
|
|
||||||
def pci_str_to_json(pci_address, physnet=None):
|
def pci_str_to_json(pci_address, physnet=None):
|
||||||
dbs, func = pci_address.split('.')
|
dbs, func = pci_address.split('.')
|
||||||
domain, bus, slot = dbs.split(':')
|
domain, bus, slot = dbs.split(':')
|
||||||
@ -106,3 +113,24 @@ def get_vendor_maps():
|
|||||||
"1099": "samsung",
|
"1099": "samsung",
|
||||||
"1cf2": "zte"
|
"1cf2": "zte"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def mdev_str_to_json(pci_address, asked_type, vgpu_mark):
    """Serialize a mediated-device attach descriptor as a JSON string.

    :param pci_address: PCI address in ``domain:bus:device.function`` form,
                        eg. '0000:af:00.0'.
    :param asked_type: value stored under the "asked_type" key.
    :param vgpu_mark: value stored under the "vgpu_mark" key.
    :returns: a JSON object string whose keys are emitted in sorted order.
    """
    # Peel the function number off first, then break up what is left.
    location, function = pci_address.split('.')
    domain, bus, device = location.split(':')
    fields = {
        "domain": domain,
        "bus": bus,
        "device": device,
        "function": function,
        "asked_type": asked_type,
        "vgpu_mark": vgpu_mark,
    }
    # Sorting the items keeps the serialized key order deterministic.
    return jsonutils.dumps(collections.OrderedDict(sorted(fields.items())))
|
||||||
|
|
||||||
|
|
||||||
|
def parse_address(address):
    """Split a PCI address set in configuration into its components.

    :param address: PCI address string, matched against
                    _PCI_ADDRESS_REGEX.
    :returns: the captured groups as a tuple of strings:
              (domain, bus, slot, function).
    :raises: PciDeviceWrongAddressFormat if the address does not match.
    """
    matched = _PCI_ADDRESS_REGEX.match(address)
    if matched:
        return matched.groups()
    raise exception.PciDeviceWrongAddressFormat(address=address)
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
# Modifications Copyright (C) 2020 ZTE Corporation
|
||||||
# Copyright 2018 Beijing Lenovo Software Ltd.
|
# Copyright 2018 Beijing Lenovo Software Ltd.
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
@ -26,6 +27,7 @@ class NVIDIAGPUDriver(GPUDriver):
|
|||||||
Vendor should implement their specific drivers in this class.
|
Vendor should implement their specific drivers in this class.
|
||||||
"""
|
"""
|
||||||
VENDOR = "nvidia"
|
VENDOR = "nvidia"
|
||||||
|
VENDOR_ID = "10de"
|
||||||
|
|
||||||
def discover(self):
|
def discover(self):
|
||||||
return sysinfo.gpu_tree()
|
return sysinfo.discover(self.VENDOR_ID)
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
# Modifications Copyright (C) 2021 ZTE Corporation
|
||||||
# Copyright 2018 Beijing Lenovo Software Ltd.
|
# Copyright 2018 Beijing Lenovo Software Ltd.
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
@ -16,11 +17,271 @@
|
|||||||
"""
|
"""
|
||||||
Cyborg NVIDIA GPU driver implementation.
|
Cyborg NVIDIA GPU driver implementation.
|
||||||
"""
|
"""
|
||||||
|
from oslo_log import log as logging
|
||||||
|
from oslo_serialization import jsonutils
|
||||||
|
|
||||||
from cyborg.accelerator.drivers.gpu import utils
|
import collections
|
||||||
VENDOR_ID = "10de"
|
import os
|
||||||
|
|
||||||
|
import cyborg.conf
|
||||||
|
|
||||||
|
from cyborg.accelerator.common import utils
|
||||||
|
from cyborg.accelerator.drivers.gpu import utils as gpu_utils
|
||||||
|
from cyborg.common import constants
|
||||||
|
from cyborg.common import exception
|
||||||
|
from cyborg.conf import CONF
|
||||||
|
from cyborg.objects.driver_objects import driver_attach_handle
|
||||||
|
from cyborg.objects.driver_objects import driver_attribute
|
||||||
|
from cyborg.objects.driver_objects import driver_controlpath_id
|
||||||
|
from cyborg.objects.driver_objects import driver_deployable
|
||||||
|
from cyborg.objects.driver_objects import driver_device
|
||||||
|
|
||||||
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def gpu_tree():
|
def _get_traits(vendor_id, product_id, vgpu_type_name=None):
    """Generate traits for GPUs.

    :param vendor_id: vendor_id of PGPU/VGPU, eg."10de"
    :param product_id: product_id of PGPU/VGPU, eg."1eb8".
    :param vgpu_type_name: vgpu type name, eg."T4_1B".
    :returns: a dict with a single "traits" key holding the trait list.

    Example VGPU traits:
    {traits:["OWNER_CYBORG", "CUSTOM_NVIDIA_1EB8_T4_2B"]}
    Example PGPU traits:
    {traits:["OWNER_CYBORG", "CUSTOM_NVIDIA_1EB8"]}
    """
    # The pGPU trait is CUSTOM_<VENDOR>_<PRODUCT>; an unknown vendor id
    # contributes an empty vendor segment.
    vendor_name = gpu_utils.VENDOR_MAPS.get(vendor_id, "")
    segments = ['CUSTOM', vendor_name.upper(), product_id.upper()]
    # A vGPU trait carries the vGPU type name as one more segment.
    if vgpu_type_name:
        segments.append(vgpu_type_name.upper())
    return {"traits": ["OWNER_CYBORG", "_".join(segments)]}
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_attribute_list(gpu):
    """Build DriverAttribute objects from the "rc" and "traits" entries.

    :param gpu: dict describing one GPU; only its "rc" and "traits" keys
                are turned into attributes.
    :returns: a list of DriverAttribute objects, in the order the keys
              appear in ``gpu``; traits are keyed "trait0", "trait1", ...
    """
    attributes = []
    trait_count = 0
    for key, value in gpu.items():
        if key == "rc":
            rc_attr = driver_attribute.DriverAttribute()
            rc_attr.key = key
            rc_attr.value = value
            attributes.append(rc_attr)
        elif key == "traits":
            for trait in value:
                attributes.append(driver_attribute.DriverAttribute(
                    key="trait%d" % trait_count, value=trait))
                trait_count += 1
    return attributes
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_attach_handle(gpu, num=None):
    """Build the DriverAttachHandle for a pGPU or for one vGPU slot.

    :param gpu: dict describing the GPU; reads "rc", "devices" and, for
                anything other than a pGPU, "vGPU_type".
    :param num: index appended to the vGPU type to form "vgpu_mark";
                not used for a pGPU.
    :returns: a DriverAttachHandle with in_use set to False.
    """
    if gpu["rc"] != "PGPU":
        vgpu_type = gpu["vGPU_type"]
        attach_type = constants.AH_TYPE_MDEV
        attach_info = utils.mdev_str_to_json(
            gpu["devices"], vgpu_type, vgpu_type + '_' + str(num))
    else:
        attach_type = constants.AH_TYPE_PCI
        attach_info = utils.pci_str_to_json(gpu["devices"])

    handle = driver_attach_handle.DriverAttachHandle()
    handle.in_use = False
    handle.attach_type = attach_type
    handle.attach_info = attach_info
    return handle
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_dep_list(gpu):
    """Build the single-element deployable list for one GPU.

    :param gpu: dict describing the GPU; reads "rc", "devices",
                "vendor_id", optional "hostname" and, for a vGPU,
                "vGPU_type".
    :returns: a list holding one DriverDeployable.
    """
    deployable = driver_deployable.DriverDeployable()
    deployable.attribute_list = _generate_attribute_list(gpu)
    deployable.attach_handle_list = []
    # NOTE(wangzhh): The name of deployable should be unique, its format is
    # under discussion, may look like
    # <ComputeNodeName>_<NumaNodeName>_<CyborgName>_<NumInHost>
    # NOTE(yumeng) Since Wallaby release, the deployable_name is named as
    # <Compute_hostname>_<Device_address>
    deployable.name = gpu.get('hostname', '') + '_' + gpu["devices"]
    vendor_name = gpu_utils.VENDOR_MAPS.get(gpu["vendor_id"], '')
    deployable.driver_name = vendor_name.upper()

    # A pGPU is exactly one accelerator with one attach handle.
    if gpu["rc"] == "PGPU":
        deployable.num_accelerators = 1
        deployable.attach_handle_list = [_generate_attach_handle(gpu)]
        return [deployable]

    # For a vGPU, num_accelerators is the total capacity of the asked vGPU
    # type: instances still available plus those already created.
    type_dir = os.path.expandvars(
        '/sys/bus/pci/devices/{0}/mdev_supported_types/{1}/'
        .format(gpu["devices"], gpu["vGPU_type"]))
    with open(type_dir + 'available_instances', 'r') as instances_file:
        remaining = int(instances_file.read().strip())
    existing = len(os.listdir(type_dir + 'devices'))
    deployable.num_accelerators = remaining + existing
    # example: 1 pGPU has 16 vGPUs is represented as
    # 16 attach_handles, 1 deployable, 1 resource_provider
    # NOTE(yumeng): cyborg use attach_handle_uuid
    # to create each vGPU without the need to generate a new uuid
    # example: echo "attach_handle_uuid" > nvidia-223/create
    for slot in range(deployable.num_accelerators):
        deployable.attach_handle_list.append(
            _generate_attach_handle(gpu, slot))
    return [deployable]
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_controlpath_id(gpu):
    """Build the control path id of a GPU from its PCI address.

    :param gpu: dict describing the GPU; reads "devices".
    :returns: a DriverControlPathID whose cpid_type is "PCI".
    """
    cpid = driver_controlpath_id.DriverControlPathID()
    cpid.cpid_type = "PCI"
    pci_address = gpu["devices"]
    cpid.cpid_info = utils.pci_str_to_json(pci_address)
    return cpid
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_driver_device(gpu):
    """Assemble the DriverDevice object for one discovered GPU.

    :param gpu: dict describing the GPU; "vendor_id" is required, while
                "model", "product_id", "controller", "vendor_info" and
                "stub" are read with fallbacks.
    :returns: a DriverDevice with its control path id and deployable
              list populated.
    """
    device = driver_device.DriverDevice()
    device.vendor = gpu['vendor_id']
    device.model = gpu.get('model', 'miss model info')
    # Both board-info fields are stored as JSON strings.
    device.std_board_info = jsonutils.dumps({
        'product_id': gpu.get('product_id'),
        'controller': gpu.get('controller'),
    })
    device.vendor_board_info = jsonutils.dumps({
        'vendor_info': gpu.get('vendor_info', 'gpu_vb_info'),
    })
    device.type = constants.DEVICE_GPU
    device.stub = gpu.get('stub', False)
    device.controlpath_id = _generate_controlpath_id(gpu)
    device.deployable_list = _generate_dep_list(gpu)
    return device
|
||||||
|
|
||||||
|
|
||||||
|
def _get_supported_vgpu_types():
    """Gets supported vgpu_types and their pGPU addresses from cyborg.conf.

    Reads ``CONF.gpu_devices.enabled_vgpu_types`` and, for every enabled
    type, the ``device_addresses`` option of its ``vgpu_<type>`` group.

    :returns: a tuple ``(enabled_vgpu_types, pgpu_type_mapping)`` where
              ``pgpu_type_mapping`` maps each configured PCI address to
              its vGPU type. When no vGPU type is enabled the tuple is
              ``([], <empty mapping>)``.
    :raises: InvalidvGPUConfig if an enabled type has no device addresses
             configured, if the same PCI address is configured more than
             once, or if a PCI address is malformed.
    """
    pgpu_type_mapping = collections.defaultdict(str)
    enabled_types = CONF.gpu_devices.enabled_vgpu_types
    if not enabled_types:
        return [], pgpu_type_mapping

    for vgpu_type in enabled_types:
        group = getattr(CONF, 'vgpu_%s' % vgpu_type, None)
        if group is None or not group.device_addresses:
            # Device addresses must be configured explicitly now for every
            # enabled vgpu type. Will improve after the disable and enable
            # devices interfaces implemented.
            raise exception.InvalidvGPUConfig(
                reason="Missing device addresses config for vgpu type %s"
                       % vgpu_type
            )
        for device_address in group.device_addresses:
            if device_address in pgpu_type_mapping:
                raise exception.InvalidvGPUConfig(
                    reason="Duplicate types for PCI address %s"
                           % device_address
                )
            # Just checking whether the operator fat-fingered the address.
            try:
                # Validates whether it's a PCI ID...
                utils.parse_address(device_address)
            except exception.PciDeviceWrongAddressFormat as err:
                # Keep the original error attached as the cause.
                raise exception.InvalidvGPUConfig(
                    reason="Incorrect PCI address: %s" % device_address
                ) from err
            pgpu_type_mapping[device_address] = vgpu_type
    return enabled_types, pgpu_type_mapping
|
||||||
|
|
||||||
|
|
||||||
|
def _get_vgpu_type_per_pgpu(device_address, supported_vgpu_types,
                            pgpu_type_mapping):
    """Provides the vGPU type the pGPU supports.

    :param device_address: the PCI device address in config,
                           eg.'0000:af:00.0'
    :param supported_vgpu_types: the enabled vGPU types, as returned by
                                 _get_supported_vgpu_types(); re-read from
                                 the config when None.
    :param pgpu_type_mapping: mapping of PCI address to vGPU type, as
                              returned by _get_supported_vgpu_types();
                              re-read from the config when None.
    :returns: the vGPU type configured for ``device_address``, or None
              when no vGPU type is enabled, the address is malformed, or
              the address has no type configured.
    """
    # Use what the caller already loaded instead of re-parsing the config
    # for every device; only fall back to the config if nothing was passed.
    if supported_vgpu_types is None or pgpu_type_mapping is None:
        supported_vgpu_types, pgpu_type_mapping = \
            _get_supported_vgpu_types()
    # Bail out quickly if we don't support vGPUs
    if not supported_vgpu_types:
        LOG.error('Unable to load vGPU_type from [gpu_devices] '
                  'Ensure "enabled_vgpu_types" is set.')
        return None

    try:
        # Validates whether it's a PCI ID...
        utils.parse_address(device_address)
    except (exception.PciDeviceWrongAddressFormat, IndexError):
        # this is not a valid PCI address
        LOG.warning("The PCI address %s was invalid for getting the "
                    "related vGPU type", device_address)
        return None
    return pgpu_type_mapping.get(device_address)
|
||||||
|
|
||||||
|
|
||||||
|
def _discover_gpus(vendor_id):
    """Discover the GPUs of one vendor and build their driver objects.

    :param vendor_id: PCI vendor id passed to the lspci-based lookup,
                      eg. "10de" to only discover NVIDIA GPUs on the host.
    :returns: a list with one driver device object per GPU line that
              matches GPU_INFO_PATTERN.
    :raises: InvalidVGPUType if the configured vGPU type is not listed
             under the device's mdev_supported_types directory.
    """
    # init vGPU conf
    cyborg.conf.devices.register_dynamic_opts(CONF)
    supported_vgpu_types, pgpu_type_mapping = _get_supported_vgpu_types()
    discovered = []
    # discover gpu devices by "lspci"
    for line in gpu_utils.get_pci_devices(gpu_utils.GPU_FLAGS, vendor_id):
        matched = gpu_utils.GPU_INFO_PATTERN.match(line)
        if not matched:
            continue
        gpu_dict = matched.groupdict()
        # hostname is used for the deployable name
        gpu_dict['hostname'] = CONF.host
        # vgpu_type comes from cyborg.conf; it is None for a plain pGPU
        vgpu_type = _get_vgpu_type_per_pgpu(
            gpu_dict["devices"], supported_vgpu_types, pgpu_type_mapping)
        if vgpu_type:
            # rc and trait for a vGPU
            gpu_dict["rc"] = constants.RESOURCES["VGPU"]
            types_dir = os.path.expandvars(
                '/sys/bus/pci/devices/{0}/mdev_supported_types'.
                format(gpu_dict["devices"]))
            if vgpu_type not in os.listdir(types_dir):
                raise exception.InvalidVGPUType(name=vgpu_type)
            gpu_dict["vGPU_type"] = vgpu_type
            type_dir = os.path.expandvars(
                '/sys/bus/pci/devices/{0}/mdev_supported_types/{1}/'
                .format(gpu_dict["devices"], gpu_dict["vGPU_type"]))
            # transfer vgpu_type to vgpu_type_name.
            # eg. transfer 'nvidia-223' to 'T4_1B'
            with open(type_dir + 'name', 'r') as name_file:
                display_name = name_file.read().strip()
            vgpu_type_name = display_name.split(' ')[1].replace('-', '_')
            traits = _get_traits(gpu_dict["vendor_id"],
                                 gpu_dict["product_id"],
                                 vgpu_type_name)
        else:
            # rc and trait for a pGPU
            gpu_dict["rc"] = constants.RESOURCES["PGPU"]
            traits = _get_traits(gpu_dict["vendor_id"],
                                 gpu_dict["product_id"])
        gpu_dict.update(traits)
        discovered.append(_generate_driver_device(gpu_dict))
    return discovered
|
||||||
|
|
||||||
|
|
||||||
|
def discover(vendor_id):
    """Return the driver device objects found for ``vendor_id``.

    Thin public wrapper around _discover_gpus().
    """
    return _discover_gpus(vendor_id)
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
# Copyright 2018 Beijing Lenovo Software Ltd.
|
# Modifications Copyright (C) 2021 ZTE Corporation
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
# not use this file except in compliance with the License. You may obtain
|
# not use this file except in compliance with the License. You may obtain
|
||||||
# a copy of the License at
|
# a copy of the License at
|
||||||
@ -12,24 +11,15 @@
|
|||||||
# License for the specific language governing permissions and limitations
|
# License for the specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Utils for GPU driver.
|
Utils for GPU driver.
|
||||||
"""
|
"""
|
||||||
from oslo_concurrency import processutils
|
from oslo_concurrency import processutils
|
||||||
from oslo_log import log as logging
|
from oslo_log import log as logging
|
||||||
from oslo_serialization import jsonutils
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from cyborg.accelerator.common import utils
|
import cyborg.conf
|
||||||
from cyborg.common import constants
|
|
||||||
from cyborg.conf import CONF
|
|
||||||
from cyborg.objects.driver_objects import driver_attach_handle
|
|
||||||
from cyborg.objects.driver_objects import driver_attribute
|
|
||||||
from cyborg.objects.driver_objects import driver_controlpath_id
|
|
||||||
from cyborg.objects.driver_objects import driver_deployable
|
|
||||||
from cyborg.objects.driver_objects import driver_device
|
|
||||||
import cyborg.privsep
|
import cyborg.privsep
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
@ -42,6 +32,7 @@ GPU_INFO_PATTERN = re.compile(r"(?P<devices>[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:"
|
|||||||
r"{4}):(?P<product_id>[0-9a-fA-F]{4})].*")
|
r"{4}):(?P<product_id>[0-9a-fA-F]{4})].*")
|
||||||
|
|
||||||
VENDOR_MAPS = {"10de": "nvidia", "102b": "matrox"}
|
VENDOR_MAPS = {"10de": "nvidia", "102b": "matrox"}
|
||||||
|
PRODUCT_ID_MAPS = {"1eb8": "T4", "15f7": "P100_PCIE_12GB"}
|
||||||
|
|
||||||
|
|
||||||
@cyborg.privsep.sys_admin_pctxt.entrypoint
|
@cyborg.privsep.sys_admin_pctxt.entrypoint
|
||||||
@ -62,19 +53,6 @@ def get_pci_devices(pci_flags, vendor_id=None):
|
|||||||
return device_for_vendor_out if vendor_id else all_device_out
|
return device_for_vendor_out if vendor_id else all_device_out
|
||||||
|
|
||||||
|
|
||||||
def get_traits(vendor_id, product_id):
    """Generate traits for GPUs.

    :param vendor_id: vendor_id of PGPU/VGPU, eg."10de"
    :param product_id: product_id of PGPU/VGPU, eg."1eb8".
    :returns: a dict with a single "traits" key holding two traits.

    Example VGPU traits:
    {traits:["CUSTOM_GPU_NVIDIA", "CUSTOM_GPU_PRODUCT_ID_1EB8"]}
    """
    # An unknown vendor id contributes an empty vendor name.
    vendor_trait = "CUSTOM_GPU_" + VENDOR_MAPS.get(vendor_id, "").upper()
    product_trait = "CUSTOM_GPU_PRODUCT_ID_" + product_id.upper()
    return {"traits": [vendor_trait, product_trait]}
|
|
||||||
|
|
||||||
|
|
||||||
def discover_vendors():
|
def discover_vendors():
|
||||||
vendors = set()
|
vendors = set()
|
||||||
gpus = get_pci_devices(GPU_FLAGS)
|
gpus = get_pci_devices(GPU_FLAGS)
|
||||||
@ -84,93 +62,3 @@ def discover_vendors():
|
|||||||
vendor_id = m.groupdict().get("vendor_id")
|
vendor_id = m.groupdict().get("vendor_id")
|
||||||
vendors.add(vendor_id)
|
vendors.add(vendor_id)
|
||||||
return vendors
|
return vendors
|
||||||
|
|
||||||
|
|
||||||
def discover_gpus(vendor_id=None):
    """Discover GPUs and report each of them as a PGPU driver device.

    :param vendor_id: optional PCI vendor id passed to get_pci_devices().
    :returns: a list with one driver device object per GPU line that
              matches GPU_INFO_PATTERN.
    """
    found = []
    for entry in get_pci_devices(GPU_FLAGS, vendor_id):
        matched = GPU_INFO_PATTERN.match(entry)
        if not matched:
            continue
        gpu_dict = matched.groupdict()
        # hostname is used for the deployable name
        gpu_dict['hostname'] = CONF.host
        # TODO(yumeng) support and test VGPU rc generation soon.
        traits = get_traits(gpu_dict["vendor_id"], gpu_dict["product_id"])
        gpu_dict["rc"] = constants.RESOURCES["PGPU"]
        gpu_dict.update(traits)
        found.append(_generate_driver_device(gpu_dict))
    return found
|
|
||||||
|
|
||||||
|
|
||||||
def _generate_driver_device(gpu):
    """Assemble the DriverDevice object for one discovered GPU.

    :param gpu: dict describing the GPU; "vendor_id" is required, the
                other keys are read with fallbacks.
    :returns: a DriverDevice with its control path id and deployable
              list populated.
    """
    device = driver_device.DriverDevice()
    device.vendor = gpu["vendor_id"]
    device.model = gpu.get('model', 'miss model info')
    # Both board-info fields are stored as JSON strings.
    device.std_board_info = jsonutils.dumps({
        'product_id': gpu.get('product_id'),
        'controller': gpu.get('controller'),
    })
    device.vendor_board_info = jsonutils.dumps({
        'vendor_info': gpu.get('vendor_info', 'gpu_vb_info'),
    })
    device.type = constants.DEVICE_GPU
    device.stub = gpu.get('stub', False)
    device.controlpath_id = _generate_controlpath_id(gpu)
    device.deployable_list = _generate_dep_list(gpu)
    return device
|
|
||||||
|
|
||||||
|
|
||||||
def _generate_controlpath_id(gpu):
    """Build the control path id of a GPU from its PCI address.

    :param gpu: dict describing the GPU; reads "devices".
    :returns: a DriverControlPathID whose cpid_type is "PCI".
    """
    cpid = driver_controlpath_id.DriverControlPathID()
    # NOTE: GPUs (either pGPU or vGPU), they all report "PCI" as
    # their cpid_type, while attach_handle_type of them are different.
    cpid.cpid_type = "PCI"
    pci_address = gpu["devices"]
    cpid.cpid_info = utils.pci_str_to_json(pci_address)
    return cpid
|
|
||||||
|
|
||||||
|
|
||||||
def _generate_dep_list(gpu):
    """Build the single-element deployable list for one GPU.

    :param gpu: dict describing the GPU; reads "devices", "vendor_id"
                and optional "hostname".
    :returns: a list holding one DriverDeployable with one attach handle.
    """
    deployable = driver_deployable.DriverDeployable()
    deployable.attribute_list = _generate_attribute_list(gpu)
    deployable.attach_handle_list = []
    # NOTE(yumeng) Now simply named as <Compute_hostname>_<Device_address>
    # once cyborg needs to support GPU devices discovered from a baremetal
    # node, we might need to support more formats.
    deployable.name = gpu.get('hostname', '') + '_' + gpu["devices"]
    vendor_name = VENDOR_MAPS.get(gpu["vendor_id"], '')
    deployable.driver_name = vendor_name.upper()
    # num_accelerators for PGPU is 1, for VGPU should be the
    # available_instances of the vGPU device.
    # TODO(yumeng) support VGPU num report soon
    deployable.num_accelerators = 1
    deployable.attach_handle_list = [_generate_attach_handle(gpu)]
    return [deployable]
|
|
||||||
|
|
||||||
|
|
||||||
def _generate_attach_handle(gpu):
    """Build the DriverAttachHandle for a GPU.

    :param gpu: dict describing the GPU; reads "rc" and "devices".
    :returns: a DriverAttachHandle with in_use set to False; its
              attach_type is PCI for a "PGPU" and MDEV otherwise.
    """
    handle = driver_attach_handle.DriverAttachHandle()
    is_pgpu = gpu["rc"] == "PGPU"
    handle.attach_type = (
        constants.AH_TYPE_PCI if is_pgpu else constants.AH_TYPE_MDEV)
    handle.in_use = False
    handle.attach_info = utils.pci_str_to_json(gpu["devices"])
    return handle
|
|
||||||
|
|
||||||
|
|
||||||
def _generate_attribute_list(gpu):
    """Build DriverAttribute objects from the "rc" and "traits" entries.

    :param gpu: dict describing one GPU; only its "rc" and "traits" keys
                are turned into attributes.
    :returns: a list of DriverAttribute objects, in the order the keys
              appear in ``gpu``; traits are keyed "trait0", "trait1", ...
    """
    attributes = []
    for key, value in gpu.items():
        if key == "rc":
            rc_attr = driver_attribute.DriverAttribute()
            rc_attr.key = key
            rc_attr.value = value
            attributes.append(rc_attr)
        elif key == "traits":
            for position, trait in enumerate(value):
                attributes.append(driver_attribute.DriverAttribute(
                    key="trait%d" % position, value=trait))
    return attributes
|
|
||||||
|
@ -389,6 +389,18 @@ class InvalidDriver(Invalid):
|
|||||||
_msg_fmt = _("Found an invalid driver: %(name)s")
|
_msg_fmt = _("Found an invalid driver: %(name)s")
|
||||||
|
|
||||||
|
|
||||||
|
class InvalidVGPUType(Invalid):
    """Error reported for an invalid requested vGPU type (``name``)."""

    _msg_fmt = _("Invalid requested vGPU type: %(name)s")
|
||||||
|
|
||||||
|
|
||||||
|
class InvalidvGPUConfig(Invalid):
    """Error reported for an invalid vGPU configuration (``reason``)."""

    _msg_fmt = _("Invalid vGPU config: %(reason)s")
|
||||||
|
|
||||||
|
|
||||||
|
class PciDeviceWrongAddressFormat(Invalid):
    """Error reported for a PCI address (``address``) with a bad format."""

    _msg_fmt = _("The PCI address %(address)s has an incorrect format.")
|
||||||
|
|
||||||
|
|
||||||
class InvalidType(Invalid):
    """Error reported when an object's type differs from the expected one.

    The message takes ``obj``, ``type`` and ``expected``.
    """

    # The first literal ends with a space so the two sentences do not run
    # together as "...%(type)s.Expected: ..." in the rendered message.
    _msg_fmt = _("Invalid type for %(obj)s: %(type)s. "
                 "Expected: %(expected)s")
|
||||||
|
@ -12,21 +12,42 @@
|
|||||||
# License for the specific language governing permissions and limitations
|
# License for the specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
|
import sys
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
from oslo_serialization import jsonutils
|
from oslo_serialization import jsonutils
|
||||||
|
|
||||||
|
import cyborg
|
||||||
|
from cyborg.accelerator.drivers.gpu.nvidia.driver import NVIDIAGPUDriver
|
||||||
from cyborg.accelerator.drivers.gpu import utils
|
from cyborg.accelerator.drivers.gpu import utils
|
||||||
from cyborg.tests import base
|
from cyborg.tests import base
|
||||||
|
|
||||||
|
|
||||||
|
CONF = cyborg.conf.CONF
|
||||||
|
|
||||||
NVIDIA_GPU_INFO = "0000:00:06.0 3D controller [0302]: NVIDIA Corporation " \
|
NVIDIA_GPU_INFO = "0000:00:06.0 3D controller [0302]: NVIDIA Corporation " \
|
||||||
"GP100GL [Tesla P100 PCIe 12GB] [10de:15f7] (rev a1)"
|
"GP100GL [Tesla P100 PCIe 12GB] [10de:15f7] (rev a1)"
|
||||||
|
|
||||||
|
NVIDIA_T4_GPU_INFO = "0000:af:00.0 3D controller [0302]: NVIDIA Corporation "\
|
||||||
|
"TU104GL [Tesla T4] [10de:1eb8] (rev a1)"
|
||||||
|
|
||||||
|
NVIDIA_T4_SUPPORTED_MDEV_TYPES = ['nvidia-222', 'nvidia-223', 'nvidia-224',
|
||||||
|
'nvidia-225', 'nvidia-226', 'nvidia-227',
|
||||||
|
'nvidia-228', 'nvidia-229', 'nvidia-230',
|
||||||
|
'nvidia-231', 'nvidia-232', 'nvidia-233',
|
||||||
|
'nvidia-234', 'nvidia-252', 'nvidia-319',
|
||||||
|
'nvidia-320', 'nvidia-321']
|
||||||
|
|
||||||
|
# Name of the module that holds the built-in functions, for use as a
# mock.patch target: "__builtin__" on Python 2, "builtins" on Python 3.
BUILTIN = '__builtin__' if (sys.version_info[0] < 3) else 'builtins'
|
||||||
|
|
||||||
|
|
||||||
class stdout(object):
    """Fake stdout object that hands back canned lspci output lines."""

    def readlines(self):
        """Return the single output line stored in NVIDIA_GPU_INFO."""
        lines = [NVIDIA_GPU_INFO]
        return lines

    def readlines_T4(self):
        """Return the single output line stored in NVIDIA_T4_GPU_INFO."""
        lines = [NVIDIA_T4_GPU_INFO]
        return lines
|
||||||
|
|
||||||
|
|
||||||
class p(object):
|
class p(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -49,11 +70,14 @@ class TestGPUDriverUtils(base.TestCase):
|
|||||||
self.assertEqual(1, len(gpu_vendors))
|
self.assertEqual(1, len(gpu_vendors))
|
||||||
|
|
||||||
@mock.patch('cyborg.accelerator.drivers.gpu.utils.lspci_privileged')
|
@mock.patch('cyborg.accelerator.drivers.gpu.utils.lspci_privileged')
|
||||||
def test_discover_gpus(self, mock_devices_for_vendor):
|
def test_discover_gpus_report_pGPU(self, mock_devices_for_vendor):
|
||||||
|
"""test nvidia pGPU discover"""
|
||||||
mock_devices_for_vendor.return_value = self.p.stdout.readlines()
|
mock_devices_for_vendor.return_value = self.p.stdout.readlines()
|
||||||
self.set_defaults(host='host-192-168-32-195', debug=True)
|
self.set_defaults(host='host-192-168-32-195', debug=True)
|
||||||
vendor_id = '10de'
|
|
||||||
gpu_list = utils.discover_gpus(vendor_id)
|
nvidia = NVIDIAGPUDriver()
|
||||||
|
gpu_list = nvidia.discover()
|
||||||
|
|
||||||
self.assertEqual(1, len(gpu_list))
|
self.assertEqual(1, len(gpu_list))
|
||||||
attach_handle_list = [
|
attach_handle_list = [
|
||||||
{'attach_type': 'PCI',
|
{'attach_type': 'PCI',
|
||||||
@ -65,8 +89,8 @@ class TestGPUDriverUtils(base.TestCase):
|
|||||||
]
|
]
|
||||||
attribute_list = [
|
attribute_list = [
|
||||||
{'key': 'rc', 'value': 'PGPU'},
|
{'key': 'rc', 'value': 'PGPU'},
|
||||||
{'key': 'trait0', 'value': 'CUSTOM_GPU_NVIDIA'},
|
{'key': 'trait0', 'value': 'OWNER_CYBORG'},
|
||||||
{'key': 'trait1', 'value': 'CUSTOM_GPU_PRODUCT_ID_15F7'},
|
{'key': 'trait1', 'value': 'CUSTOM_NVIDIA_15F7'},
|
||||||
]
|
]
|
||||||
expected = {
|
expected = {
|
||||||
'vendor': '10de',
|
'vendor': '10de',
|
||||||
@ -118,3 +142,107 @@ class TestGPUDriverUtils(base.TestCase):
|
|||||||
self.assertEqual(attach_handle_list[0],
|
self.assertEqual(attach_handle_list[0],
|
||||||
gpu_attach_handle_list[0].as_dict())
|
gpu_attach_handle_list[0].as_dict())
|
||||||
self.assertEqual(attribute_list, attribute_actual_data)
|
self.assertEqual(attribute_list, attribute_actual_data)
|
||||||
|
|
||||||
|
@mock.patch('builtins.open')
@mock.patch('os.listdir')
@mock.patch('os.path.exists')
@mock.patch('cyborg.accelerator.drivers.gpu.utils.lspci_privileged')
def test_discover_gpus_report_vGPU(self, mock_devices_for_vendor,
                                   mock_path_exists,
                                   mock_supported_mdev_types,
                                   mock_open):
    """test nvidia vGPU discover"""
    mock_devices_for_vendor.return_value = self.p.stdout.readlines_T4()
    mock_path_exists.return_value = True
    # os.listdir is mocked for every path, so the same 17-entry list is
    # returned both for the supported types and for the created devices.
    mock_supported_mdev_types.return_value = NVIDIA_T4_SUPPORTED_MDEV_TYPES
    # Files are opened in this order: the vGPU type's "name" file first,
    # then its "available_instances" file.
    file_content_list = ['GRID T4-1B', '1']
    mock_open.side_effect = multi_mock_open(*file_content_list)
    self.set_defaults(host='host-192-168-32-195', debug=True)
    self.set_defaults(enabled_vgpu_types='nvidia-223', group='gpu_devices')
    cyborg.conf.devices.register_dynamic_opts(CONF)
    self.set_defaults(
        device_addresses=['0000:af:00.0'], group='vgpu_nvidia-223')
    nvidia = NVIDIAGPUDriver()
    gpu_list = nvidia.discover()

    self.assertEqual(1, len(gpu_list))
    attach_handle_list = [
        {'attach_type': 'MDEV',
         'attach_info': '{"asked_type": "nvidia-223", '
                        '"bus": "af", '
                        '"device": "00", '
                        '"domain": "0000", '
                        '"function": "0", '
                        '"vgpu_mark": "nvidia-223_0"}',
         'in_use': False}
    ] * 8
    attribute_list = [
        {'key': 'rc', 'value': 'VGPU'},
        {'key': 'trait0', 'value': 'OWNER_CYBORG'},
        {'key': 'trait1', 'value': 'CUSTOM_NVIDIA_1EB8_T4_1B'},
    ]
    expected = {
        'vendor': '10de',
        'type': 'GPU',
        'std_board_info':
            {"controller": "3D controller", "product_id": "1eb8"},
        'vendor_board_info': {"vendor_info": "gpu_vb_info"},
        'deployable_list':
            [
                {
                    # 1 available instance + 17 mocked directory entries
                    'num_accelerators': 18,
                    'driver_name': 'NVIDIA',
                    'name': 'host-192-168-32-195_0000:af:00.0',
                    'attach_handle_list': attach_handle_list,
                    'attribute_list': attribute_list
                },
            ],
        'controlpath_id': {'cpid_info': '{"bus": "af", '
                                        '"device": "00", '
                                        '"domain": "0000", '
                                        '"function": "0"}',
                           'cpid_type': 'PCI'}
    }
    expected_dep = expected['deployable_list'][0]

    gpu_dict = gpu_list[0].as_dict()
    # Convert the only deployable once and reuse it for every assertion.
    deployable = gpu_dict['deployable_list'][0].as_dict()
    gpu_attach_handle_list = deployable['attach_handle_list']
    # Build the list directly instead of appending from a throwaway
    # comprehension, then sort by key so the comparison is order-free.
    attri_obj_data = [
        attr.as_dict() for attr in deployable['attribute_list']]
    attribute_actual_data = sorted(attri_obj_data, key=lambda i: i['key'])

    self.assertEqual(expected['vendor'], gpu_dict['vendor'])
    self.assertEqual(expected['controlpath_id'],
                     gpu_dict['controlpath_id'])
    self.assertEqual(expected['std_board_info'],
                     jsonutils.loads(gpu_dict['std_board_info']))
    self.assertEqual(expected['vendor_board_info'],
                     jsonutils.loads(gpu_dict['vendor_board_info']))
    self.assertEqual(expected_dep['num_accelerators'],
                     deployable['num_accelerators'])
    self.assertEqual(expected_dep['name'], deployable['name'])
    self.assertEqual(expected_dep['driver_name'],
                     deployable['driver_name'])
    self.assertEqual(attach_handle_list[0],
                     gpu_attach_handle_list[0].as_dict())
    self.assertEqual(attribute_list, attribute_actual_data)
|
||||||
|
|
||||||
|
|
||||||
|
def multi_mock_open(*file_contents):
    """Create a mock "open" that will mock open multiple files in sequence.

    :param file_contents: the contents each successive open should yield
    :returns: (MagicMock) a mock opener that will return the contents of
              the first file when opened the first time, the second file
              when opened the second time, etc.
    """
    opener = mock.mock_open()
    handles = []
    for data in file_contents:
        # Each mock_open() carries its own file handle as return_value.
        handles.append(mock.mock_open(read_data=data).return_value)
    # A list side_effect hands out one handle per call, in order.
    opener.side_effect = handles
    return opener
|
||||||
|
Loading…
Reference in New Issue
Block a user