vGPU type management POC
1. Set a default vgpu_type for every GPU that is virtualized. 2. Support changing the vgpu_type of a GPU. Change-Id: I052d120036cd72c8224f7e1d06e96db44979f9ee
This commit is contained in:
parent
9df66a96fe
commit
70744c6012
@ -21,12 +21,14 @@ from oslo_log import log as logging
|
||||
from oslo_serialization import jsonutils
|
||||
|
||||
import collections
|
||||
import json
|
||||
import os
|
||||
|
||||
import cyborg.conf
|
||||
|
||||
from cyborg.accelerator.common import utils
|
||||
from cyborg.accelerator.drivers.gpu import utils as gpu_utils
|
||||
from cyborg.agent import rpcapi as agent_rpcapi
|
||||
from cyborg.common import constants
|
||||
from cyborg.common import exception
|
||||
from cyborg.conf import CONF
|
||||
@ -145,19 +147,19 @@ def _generate_driver_device(gpu):
|
||||
driver_device_obj.model = gpu.get('model', 'miss model info')
|
||||
std_board_info = {'product_id': gpu.get('product_id'),
|
||||
'controller': gpu.get('controller'), }
|
||||
vendor_board_info = {'vendor_info': gpu.get('vendor_info',
|
||||
'gpu_vb_info')}
|
||||
driver_device_obj.std_board_info = jsonutils.dumps(std_board_info)
|
||||
driver_device_obj.vendor_board_info = jsonutils.dumps(
|
||||
vendor_board_info)
|
||||
driver_device_obj.type = constants.DEVICE_GPU
|
||||
driver_device_obj.stub = gpu.get('stub', False)
|
||||
driver_device_obj.controlpath_id = _generate_controlpath_id(gpu)
|
||||
driver_device_obj.deployable_list = _generate_dep_list(gpu)
|
||||
vendor_board_info = {'device_address': gpu.get('devices'),
|
||||
'vgpu_type': gpu.get('vgpu_type')}
|
||||
driver_device_obj.vendor_board_info = jsonutils.dumps(
|
||||
vendor_board_info)
|
||||
return driver_device_obj
|
||||
|
||||
|
||||
def _get_supported_vgpu_types():
|
||||
def _get_supported_vgpu_types(hostname):
|
||||
"""Gets supported vgpu_types from cyborg.conf.
|
||||
|
||||
Retrieves supported vgpu_types set by the operator and generates a
|
||||
@ -172,85 +174,52 @@ def _get_supported_vgpu_types():
|
||||
"""
|
||||
pgpu_type_mapping = collections.defaultdict(str)
|
||||
pgpu_type_mapping.clear()
|
||||
if not CONF.gpu_devices.enabled_vgpu_types:
|
||||
return [], pgpu_type_mapping
|
||||
|
||||
for vgpu_type in CONF.gpu_devices.enabled_vgpu_types:
|
||||
group = getattr(CONF, 'vgpu_%s' % vgpu_type, None)
|
||||
if group is None or not group.device_addresses:
|
||||
# Device addresses must be configured explictly now for every
|
||||
# enabled vgpu type. Will improve after the disable and enable
|
||||
# devices interfaces implemented.
|
||||
raise exception.InvalidvGPUConfig(
|
||||
reason="Missing device addresses config for vgpu type %s"
|
||||
% vgpu_type
|
||||
)
|
||||
for device_address in group.device_addresses:
|
||||
if device_address in pgpu_type_mapping:
|
||||
raise exception.InvalidvGPUConfig(
|
||||
reason="Duplicate types for PCI address %s"
|
||||
% device_address
|
||||
)
|
||||
# Just checking whether the operator fat-fingered the address.
|
||||
# If it's wrong, it will return an exception
|
||||
try:
|
||||
# Validates whether it's a PCI ID...
|
||||
utils.parse_address(device_address)
|
||||
except exception.PciDeviceWrongAddressFormat:
|
||||
raise exception.InvalidvGPUConfig(
|
||||
reason="Incorrect PCI address: %s" % device_address
|
||||
)
|
||||
admin_context = context.get_admin_context()
|
||||
devices = agent_rpcapi.AgentAPI().get_devices(admin_context, hostname)
|
||||
for device in devices:
|
||||
if device.vendor_board_info == 'miss_vb_info':
|
||||
continue
|
||||
vbi = json.loads(device.vendor_board_info)
|
||||
vgpu_type = vbi.get('vgpu_type')
|
||||
device_address = vbi.get('device_address')
|
||||
if vgpu_type and device_address:
|
||||
pgpu_type_mapping[device_address] = vgpu_type
|
||||
return CONF.gpu_devices.enabled_vgpu_types, pgpu_type_mapping
|
||||
|
||||
|
||||
def _get_vgpu_type_per_pgpu(device_address, supported_vgpu_types,
|
||||
pgpu_type_mapping):
|
||||
"""Provides the vGPU type the pGPU supports.
|
||||
|
||||
:param device_address: the PCI device address in config,
|
||||
eg.'0000:af:00.0'
|
||||
"""
|
||||
supported_vgpu_types, pgpu_type_mapping = _get_supported_vgpu_types()
|
||||
# Bail out quickly if we don't support vGPUs
|
||||
if not supported_vgpu_types:
|
||||
LOG.warning('Unable to load vGPU_type from [gpu_devices] '
|
||||
'Ensure "enabled_vgpu_types" is set if the gpu'
|
||||
'is virtualized.')
|
||||
return
|
||||
|
||||
try:
|
||||
# Validates whether it's a PCI ID...
|
||||
utils.parse_address(device_address)
|
||||
except (exception.PciDeviceWrongAddressFormat, IndexError):
|
||||
# this is not a valid PCI address
|
||||
LOG.warning("The PCI address %s was invalid for getting the"
|
||||
"related vGPU type", device_address)
|
||||
return
|
||||
return pgpu_type_mapping.get(device_address)
|
||||
return pgpu_type_mapping
|
||||
|
||||
|
||||
def _discover_gpus(vendor_id):
|
||||
"""param: vendor_id=VENDOR_ID means only discover Nvidia GPU on the host
|
||||
"""
|
||||
# init vGPU conf
|
||||
cyborg.conf.devices.register_dynamic_opts(CONF)
|
||||
supported_vgpu_types, pgpu_type_mapping = _get_supported_vgpu_types()
|
||||
hostname = CONF.host
|
||||
pgpu_type_mapping = _get_supported_vgpu_types(hostname)
|
||||
# discover gpu devices by "lspci"
|
||||
gpu_list = []
|
||||
gpus = gpu_utils.get_pci_devices(gpu_utils.GPU_FLAGS, vendor_id)
|
||||
LOG.info('gpus raw info: %s', gpus)
|
||||
# report trait,rc and generate driver object
|
||||
for gpu in gpus:
|
||||
m = gpu_utils.GPU_INFO_PATTERN.match(gpu)
|
||||
if m:
|
||||
gpu_dict = m.groupdict()
|
||||
# get hostname for deployable_name usage
|
||||
gpu_dict['hostname'] = CONF.host
|
||||
gpu_dict['hostname'] = hostname
|
||||
# get vgpu_type from cyborg.conf, otherwise vgpu_type=None
|
||||
vgpu_type = _get_vgpu_type_per_pgpu(
|
||||
gpu_dict["devices"], supported_vgpu_types, pgpu_type_mapping)
|
||||
vgpu_type = pgpu_type_mapping.get(gpu_dict["devices"])
|
||||
LOG.info('vgpu_type is %s', vgpu_type)
|
||||
mdev_path = os.path.expandvars(
|
||||
'/sys/bus/pci/devices/{0}/mdev_supported_types'.
|
||||
format(gpu_dict["devices"]))
|
||||
valid_types = []
|
||||
try:
|
||||
valid_types = os.listdir(mdev_path)
|
||||
LOG.info("The GPU %(gpu)s on host %(host)s is virtualized.",
|
||||
{"gpu": gpu_dict['devices'], "host": hostname})
|
||||
except FileNotFoundError:
|
||||
LOG.info("The GPU %(gpu)s on host %(host)s is not virtualized.",
|
||||
{"gpu": gpu_dict['devices'], "host": hostname})
|
||||
|
||||
# generate rc and trait for pGPU
|
||||
if not vgpu_type:
|
||||
if not valid_types:
|
||||
gpu_dict["rc"] = constants.RESOURCES["PGPU"]
|
||||
traits = _get_traits(gpu_dict["vendor_id"],
|
||||
gpu_dict["product_id"])
|
||||
@ -258,12 +227,8 @@ def _discover_gpus(vendor_id):
|
||||
else:
|
||||
# get rc
|
||||
gpu_dict["rc"] = constants.RESOURCES["VGPU"]
|
||||
mdev_path = os.path.expandvars(
|
||||
'/sys/bus/pci/devices/{0}/mdev_supported_types'.
|
||||
format(gpu_dict["devices"]))
|
||||
valid_types = os.listdir(mdev_path)
|
||||
if vgpu_type not in valid_types:
|
||||
raise exception.InvalidVGPUType(name=vgpu_type)
|
||||
# default set the first vgpu_type in sorted(valid_types)
|
||||
vgpu_type = vgpu_type if vgpu_type else sorted(valid_types)[0]
|
||||
gpu_dict["vGPU_type"] = vgpu_type
|
||||
vGPU_path = os.path.expandvars(
|
||||
'/sys/bus/pci/devices/{0}/mdev_supported_types/{1}/'
|
||||
|
@ -89,3 +89,6 @@ class AgentManager(periodic_task.PeriodicTasks):
|
||||
def remove_vgpu_mdev(self, context, pci_addr, asked_type, ah_uuid):
    """Remove a vGPU mdev by delegating to the GPU utils helper.

    :param context: request context; not used by this method itself.
    :param pci_addr: forwarded unchanged to ``remove_mdev_privileged``.
    :param asked_type: forwarded unchanged to ``remove_mdev_privileged``.
    :param ah_uuid: forwarded unchanged to ``remove_mdev_privileged``.
    """
    LOG.debug('Remove a vgpu mdev')
    # The actual removal is done by the helper in the GPU driver utils.
    gpu_utils.remove_mdev_privileged(pci_addr, asked_type, ah_uuid)
|
||||
|
||||
def get_devices(self, context, hostname):
    """Return the devices of a host as reported by ``self.cond_api``.

    :param context: request context, forwarded to the conductor API.
    :param hostname: name of the host whose devices are requested.
    :returns: the result of ``cond_api.get_host_devices``.
    """
    host_devices = self.cond_api.get_host_devices(context, hostname)
    return host_devices
|
||||
|
@ -83,3 +83,9 @@ class AgentAPI(object):
|
||||
pci_addr=pci_addr,
|
||||
asked_type=asked_type,
|
||||
ah_uuid=ah_uuid)
|
||||
|
||||
def get_devices(self, context, hostname):
    """Call ``get_devices`` on the agent server of the given host.

    :param context: request context.
    :param hostname: host name; used both as the RPC server to address
        and as the ``hostname`` argument of the remote call.
    :returns: the result of the blocking ``get_devices`` RPC call.
    """
    LOG.info('Get devices by host: (%s)', hostname)
    # Address the agent server named after the host, pinned to RPC 1.0.
    prepared_client = self.client.prepare(server=hostname, version='1.0')
    return prepared_client.call(context, 'get_devices', hostname=hostname)
|
||||
|
@ -19,11 +19,15 @@ from wsme import types as wtypes
|
||||
|
||||
from oslo_log import log
|
||||
|
||||
from cyborg.accelerator.drivers.gpu import utils
|
||||
from cyborg.agent.rpcapi import AgentAPI
|
||||
from cyborg.api.controllers import base
|
||||
from cyborg.api.controllers import link
|
||||
from cyborg.api.controllers import types
|
||||
from cyborg.api import expose
|
||||
from cyborg.common import authorize_wsgi
|
||||
from cyborg.common import constants
|
||||
from cyborg.common import exception
|
||||
from cyborg.common import policy
|
||||
from cyborg import objects
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
@ -92,6 +96,12 @@ class DeviceCollection(base.APIBase):
|
||||
|
||||
class DevicesController(base.CyborgController):
|
||||
"""REST controller for Devices."""
|
||||
_custom_actions = {'update_type': ['PATCH'], 'update_mig': ['PATCH'],
|
||||
'get_vgpu_type': ['GET']}
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """Initialise the controller and create its agent RPC client.

    All positional and keyword arguments are handed to the parent
    constructor unchanged.
    """
    super(DevicesController, self).__init__(*args, **kwargs)
    # RPC client stored on the instance for use by the handlers.
    self.agent = AgentAPI()
|
||||
|
||||
@authorize_wsgi.authorize_wsgi("cyborg:device", "get_one")
|
||||
@expose.expose(Device, wtypes.text)
|
||||
@ -128,3 +138,74 @@ class DevicesController(base.CyborgController):
|
||||
obj_devices = objects.Device.list(context, filters=filters_dict)
|
||||
LOG.info('[devices:get_all] Returned: %s', obj_devices)
|
||||
return DeviceCollection.convert_with_links(obj_devices)
|
||||
|
||||
@policy.authorize_wsgi("cyborg:device", "get_vgpu_type", False)
@expose.expose('json', wtypes.text, body=types.jsontype, status_code=200)
def get_vgpu_type(self, uuid):
    """List the vGPU types supported by a GPU device.

    For products present in ``utils.MIG_PROFILES`` the profiles registered
    there are returned. Otherwise the ``mdev_supported_types`` directory
    of the device is listed over ssh on the host that owns the device.

    :param uuid: UUID of a device.
    :returns: a response whose body is ``{'vgpu_types': [...]}``.
    :raises: CyborgException if the device is not a GPU.
    """
    # Local import: only needed to quote the path for the remote shell.
    import shlex

    context = pecan.request.context
    device = objects.Device.get(context, uuid)
    if device.type != 'GPU':
        raise exception.CyborgException("Device must be GPU.")

    product_id = json.loads(device.std_board_info).get('product_id')
    if product_id in utils.MIG_PROFILES:
        vgpu_types = utils.MIG_PROFILES.get(product_id)
    else:
        hostname = device.hostname
        device_address = json.loads(
            device.vendor_board_info).get("device_address")
        remote_path = (
            '/sys/bus/pci/devices/{0}/mdev_supported_types/'.format(
                device_address))
        # Pass an argument vector instead of a shell string so that no
        # local shell interprets the values. ssh hands its trailing
        # arguments to a shell on the remote side, hence the quoting.
        command = ['nsenter', '-m', '-t1', 'ssh', hostname,
                   'ls', shlex.quote(remote_path)]
        p = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True)
        # communicate() drains both pipes and waits for the child, which
        # avoids a blocked or unreaped process.
        out, err = p.communicate()
        if p.returncode != 0:
            LOG.warning("Listing vGPU types of %s on host %s failed: %s",
                        device_address, hostname, err)
        # Text mode yields str entries, which can be serialised to JSON.
        vgpu_types = [line.strip() for line in out.splitlines()
                      if line.strip()]

    ret = {'vgpu_types': vgpu_types}
    return wsme.api.Response(ret, status_code=http_client.OK,
                             return_type=wsme.types.DictType)
|
||||
|
||||
@policy.authorize_wsgi("cyborg:device", "update_type", False)
@expose.expose('json', wtypes.text, body=types.jsontype, status_code=200)
def update_type(self, uuid, req_dev):
    """Update vgpu_type for a gpu device.

    The requested type is checked against the ``mdev_supported_types``
    directory of the device on its host, then stored in the device's
    ``vendor_board_info`` before the agent is asked to update mdevs.

    :param uuid: UUID of a device.
    :param req_dev: type of device to update, e.g.
        {"vgpu_type": "nvidia-182"}
    :returns: the updated device converted with links.
    :raises: VGPUTypeIsNeed if the request carries no vgpu_type.
    :raises: CyborgException if the device is not a GPU.
    :raises: DeviceInUse if any attach handle of the device is in use.
    :raises: VGPUTypeNotExist if the type is not available on the GPU.
    """
    # Local import: only needed to quote the path for the remote shell.
    import shlex

    LOG.info("[device update_type] PATCH request = (%s)", req_dev)
    vgpu_type = req_dev.get('vgpu_type')
    if not vgpu_type:
        raise exception.VGPUTypeIsNeed()

    context = pecan.request.context
    device = objects.Device.get(context, uuid)
    if device.type != 'GPU':
        raise exception.CyborgException("Device must be GPU")

    # Refuse to change the type while any attach handle is in use.
    deployables = objects.Deployable.get_list_by_device_id(
        context, device_id=device.id)
    for deployable in deployables:
        attach_handlers = objects.AttachHandle.get_ah_list_by_deployable_id(
            context, deployable.id)
        if any(handler.in_use for handler in attach_handlers):
            raise exception.DeviceInUse(device=uuid)

    LOG.info("[device.vendor_board_info = (%s)", device.vendor_board_info)
    vbi = json.loads(device.vendor_board_info)
    device_address = vbi.get("device_address")
    hostname = device.hostname

    # vgpu_type comes from the request body. It must name a single
    # directory entry, otherwise the check below would inspect (and
    # accept) some other path.
    type_name = str(vgpu_type)
    if '/' in type_name or type_name in ('.', '..'):
        raise exception.VGPUTypeNotExist(gpu=device_address,
                                         vgpu_type=vgpu_type)

    remote_path = (
        '/sys/bus/pci/devices/{0}/mdev_supported_types/{1}'.format(
            device_address, type_name))
    # Pass an argument vector instead of a shell string so that no local
    # shell interprets request data. ssh hands its trailing arguments to
    # a shell on the remote side, hence the quoting.
    command = ['nsenter', '-m', '-t1', 'ssh', hostname,
               'ls', shlex.quote(remote_path)]
    p = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    out, err = p.communicate()
    if not out:
        # An empty listing is treated as "type not available".
        LOG.warning("vGPU type %s not found for %s on host %s: %s",
                    vgpu_type, device_address, hostname, err)
        raise exception.VGPUTypeNotExist(gpu=device_address,
                                         vgpu_type=vgpu_type)

    vbi.update({'vgpu_type': vgpu_type})
    device.vendor_board_info = json.dumps(vbi)
    device.save(context)
    self.agent.update_mdev(context, hostname)
    return Device.convert_with_links(device)
|
||||
|
@ -104,6 +104,9 @@ class ConductorManager(object):
|
||||
"""
|
||||
ExtARQ.apply_patch(context, patch_list, valid_fields)
|
||||
|
||||
def get_host_devices(self, context, hostname):
    """Return the result of ``DriverDevice.list`` for the given host.

    :param context: request context.
    :param hostname: host name passed to ``DriverDevice.list``.
    """
    host_devices = DriverDevice.list(context, hostname)
    return host_devices
|
||||
|
||||
def report_data(self, context, hostname, driver_device_list):
|
||||
"""Update the Cyborg DB in one hostname according to the
|
||||
discovered device list.
|
||||
|
@ -118,3 +118,12 @@ class ConductorAPI(object):
|
||||
cctxt = self.client.prepare(topic=self.topic)
|
||||
return cctxt.call(context, 'arq_apply_patch', patch_list=patch_list,
|
||||
valid_fields=valid_fields)
|
||||
|
||||
def get_host_devices(self, context, hostname):
    """Ask the conductor service for the devices of one host.

    :param context: request context.
    :param hostname: host name
    :returns: the result of the blocking ``get_host_devices`` RPC call.
    """
    # Blocking RPC call sent on this API's topic.
    return self.client.prepare(topic=self.topic).call(
        context, 'get_host_devices', hostname=hostname)
|
Loading…
Reference in New Issue
Block a user