From afc3c1dcd22cfc1c107dc8d1f6119678fecfce89 Mon Sep 17 00:00:00 2001 From: songwenping Date: Fri, 28 Jul 2023 15:48:45 +0800 Subject: [PATCH] vgpu type management POC 1. set default vgpu_type for every gpu if virtualized 2. support changing the vgpu_type of a gpu Change-Id: I052d120036cd72c8224f7e1d06e96db44979f9ee --- .../accelerator/drivers/gpu/nvidia/sysinfo.py | 117 ++++++------------ cyborg/agent/manager.py | 6 + cyborg/agent/rpcapi.py | 12 ++ cyborg/api/controllers/v2/devices.py | 78 ++++++++++++ cyborg/common/exception.py | 13 ++ cyborg/common/placement_client.py | 15 ++- cyborg/common/policy.py | 6 + cyborg/conductor/manager.py | 3 + cyborg/conductor/rpcapi.py | 9 ++ 9 files changed, 176 insertions(+), 83 deletions(-) diff --git a/cyborg/accelerator/drivers/gpu/nvidia/sysinfo.py b/cyborg/accelerator/drivers/gpu/nvidia/sysinfo.py index c5db1e0f..cd165c54 100644 --- a/cyborg/accelerator/drivers/gpu/nvidia/sysinfo.py +++ b/cyborg/accelerator/drivers/gpu/nvidia/sysinfo.py @@ -21,15 +21,15 @@ from oslo_log import log as logging from oslo_serialization import jsonutils import collections +import json import os -import cyborg.conf - from cyborg.accelerator.common import utils from cyborg.accelerator.drivers.gpu import utils as gpu_utils +from cyborg.agent import rpcapi as agent_rpcapi from cyborg.common import constants -from cyborg.common import exception from cyborg.conf import CONF +from cyborg import context from cyborg.objects.driver_objects import driver_attach_handle from cyborg.objects.driver_objects import driver_attribute from cyborg.objects.driver_objects import driver_controlpath_id @@ -145,19 +145,19 @@ def _generate_driver_device(gpu): driver_device_obj.model = gpu.get('model', 'miss model info') std_board_info = {'product_id': gpu.get('product_id'), 'controller': gpu.get('controller'), } - vendor_board_info = {'vendor_info': gpu.get('vendor_info', - 'gpu_vb_info')} driver_device_obj.std_board_info = jsonutils.dumps(std_board_info) - 
driver_device_obj.vendor_board_info = jsonutils.dumps( - vendor_board_info) driver_device_obj.type = constants.DEVICE_GPU driver_device_obj.stub = gpu.get('stub', False) driver_device_obj.controlpath_id = _generate_controlpath_id(gpu) driver_device_obj.deployable_list = _generate_dep_list(gpu) + vendor_board_info = {'device_address': gpu.get('devices'), + 'vgpu_type': gpu.get('vgpu_type')} + driver_device_obj.vendor_board_info = jsonutils.dumps( + vendor_board_info) return driver_device_obj -def _get_supported_vgpu_types(): +def _get_supported_vgpu_types(hostname): """Gets supported vgpu_types from cyborg.conf. Retrieves supported vgpu_types set by the operator and generates a @@ -172,85 +172,52 @@ def _get_supported_vgpu_types(): """ pgpu_type_mapping = collections.defaultdict(str) pgpu_type_mapping.clear() - if not CONF.gpu_devices.enabled_vgpu_types: - return [], pgpu_type_mapping - - for vgpu_type in CONF.gpu_devices.enabled_vgpu_types: - group = getattr(CONF, 'vgpu_%s' % vgpu_type, None) - if group is None or not group.device_addresses: - # Device addresses must be configured explictly now for every - # enabled vgpu type. Will improve after the disable and enable - # devices interfaces implemented. - raise exception.InvalidvGPUConfig( - reason="Missing device addresses config for vgpu type %s" - % vgpu_type - ) - for device_address in group.device_addresses: - if device_address in pgpu_type_mapping: - raise exception.InvalidvGPUConfig( - reason="Duplicate types for PCI address %s" - % device_address - ) - # Just checking whether the operator fat-fingered the address. - # If it's wrong, it will return an exception - try: - # Validates whether it's a PCI ID... 
- utils.parse_address(device_address) - except exception.PciDeviceWrongAddressFormat: - raise exception.InvalidvGPUConfig( - reason="Incorrect PCI address: %s" % device_address - ) + admin_context = context.get_admin_context() + devices = agent_rpcapi.AgentAPI().get_devices(admin_context, hostname) + for device in devices: + if device.type != 'GPU' or device.vendor_board_info == 'miss_vb_info': + continue + vbi = json.loads(device.vendor_board_info) + vgpu_type = vbi.get('vgpu_type') + device_address = vbi.get('device_address') + if vgpu_type and device_address: pgpu_type_mapping[device_address] = vgpu_type - return CONF.gpu_devices.enabled_vgpu_types, pgpu_type_mapping - - -def _get_vgpu_type_per_pgpu(device_address, supported_vgpu_types, - pgpu_type_mapping): - """Provides the vGPU type the pGPU supports. - - :param device_address: the PCI device address in config, - eg.'0000:af:00.0' - """ - supported_vgpu_types, pgpu_type_mapping = _get_supported_vgpu_types() - # Bail out quickly if we don't support vGPUs - if not supported_vgpu_types: - LOG.warning('Unable to load vGPU_type from [gpu_devices] ' - 'Ensure "enabled_vgpu_types" is set if the gpu' - 'is virtualized.') - return - - try: - # Validates whether it's a PCI ID... 
- utils.parse_address(device_address) - except (exception.PciDeviceWrongAddressFormat, IndexError): - # this is not a valid PCI address - LOG.warning("The PCI address %s was invalid for getting the" - "related vGPU type", device_address) - return - return pgpu_type_mapping.get(device_address) + return pgpu_type_mapping def _discover_gpus(vendor_id): """param: vendor_id=VENDOR_ID means only discover Nvidia GPU on the host """ - # init vGPU conf - cyborg.conf.devices.register_dynamic_opts(CONF) - supported_vgpu_types, pgpu_type_mapping = _get_supported_vgpu_types() + hostname = CONF.host + pgpu_type_mapping = _get_supported_vgpu_types(hostname) # discover gpu devices by "lspci" gpu_list = [] gpus = gpu_utils.get_pci_devices(gpu_utils.GPU_FLAGS, vendor_id) + LOG.info('gpus raw info: %s', gpus) # report trait,rc and generate driver object for gpu in gpus: m = gpu_utils.GPU_INFO_PATTERN.match(gpu) if m: gpu_dict = m.groupdict() # get hostname for deployable_name usage - gpu_dict['hostname'] = CONF.host + gpu_dict['hostname'] = hostname # get vgpu_type from cyborg.conf, otherwise vgpu_type=None - vgpu_type = _get_vgpu_type_per_pgpu( - gpu_dict["devices"], supported_vgpu_types, pgpu_type_mapping) + vgpu_type = pgpu_type_mapping.get(gpu_dict["devices"]) + LOG.info('vgpu_type is %s', vgpu_type) + mdev_path = os.path.expandvars( + '/sys/bus/pci/devices/{0}/mdev_supported_types'. 
+ format(gpu_dict["devices"])) + valid_types = [] + try: + valid_types = os.listdir(mdev_path) + LOG.info("The GPU %(gpu)s on host %(host)s is virtualized.", + {"gpu": gpu_dict['devices'], "host": hostname}) + except FileNotFoundError: + LOG.info("The GPU %(gpu)s on host %(host)s is unvirtualized.", + {"gpu": gpu_dict['devices'], "host": hostname}) + # generate rc and trait for pGPU - if not vgpu_type: + if not valid_types: gpu_dict["rc"] = constants.RESOURCES["PGPU"] traits = _get_traits(gpu_dict["vendor_id"], gpu_dict["product_id"]) @@ -258,13 +225,9 @@ def _discover_gpus(vendor_id): else: # get rc gpu_dict["rc"] = constants.RESOURCES["VGPU"] - mdev_path = os.path.expandvars( - '/sys/bus/pci/devices/{0}/mdev_supported_types'. - format(gpu_dict["devices"])) - valid_types = os.listdir(mdev_path) - if vgpu_type not in valid_types: - raise exception.InvalidVGPUType(name=vgpu_type) - gpu_dict["vGPU_type"] = vgpu_type + # default set the first vgpu_type in sorted(valid_types) + vgpu_type = vgpu_type if vgpu_type else sorted(valid_types)[0] + gpu_dict["vgpu_type"] = vgpu_type vGPU_path = os.path.expandvars( '/sys/bus/pci/devices/{0}/mdev_supported_types/{1}/' .format(gpu_dict["devices"], gpu_dict["vGPU_type"])) diff --git a/cyborg/agent/manager.py b/cyborg/agent/manager.py index 153a7597..f484b9f3 100644 --- a/cyborg/agent/manager.py +++ b/cyborg/agent/manager.py @@ -89,3 +89,9 @@ class AgentManager(periodic_task.PeriodicTasks): def remove_vgpu_mdev(self, context, pci_addr, asked_type, ah_uuid): LOG.debug('Remove a vgpu mdev') gpu_utils.remove_mdev_privileged(pci_addr, asked_type, ah_uuid) + + def get_devices(self, context, hostname): + return self.cond_api.get_host_devices(context, hostname) + + def update_mdev(self, context): + self._rt.update_usage(context) diff --git a/cyborg/agent/rpcapi.py b/cyborg/agent/rpcapi.py index 29c66350..b2d3d5c9 100644 --- a/cyborg/agent/rpcapi.py +++ b/cyborg/agent/rpcapi.py @@ -83,3 +83,15 @@ class AgentAPI(object): pci_addr=pci_addr, 
asked_type=asked_type, ah_uuid=ah_uuid) + + def get_devices(self, context, hostname): + LOG.info('Get devices by host: (%s)', hostname) + version = '1.0' + cctxt = self.client.prepare(server=hostname, version=version) + return cctxt.call(context, 'get_devices', hostname=hostname) + + def update_mdev(self, context, hostname): + LOG.info('Agent update mdev for hostname: (%s)', hostname) + version = '1.0' + cctxt = self.client.prepare(server=hostname, version=version) + return cctxt.call(context, 'update_mdev') diff --git a/cyborg/api/controllers/v2/devices.py b/cyborg/api/controllers/v2/devices.py index 26006526..5a292550 100644 --- a/cyborg/api/controllers/v2/devices.py +++ b/cyborg/api/controllers/v2/devices.py @@ -13,17 +13,24 @@ # License for the specific language governing permissions and limitations # under the License. +from http import HTTPStatus +import json import pecan +import subprocess import wsme from wsme import types as wtypes from oslo_log import log +from cyborg.accelerator.drivers.gpu import utils +from cyborg.agent.rpcapi import AgentAPI from cyborg.api.controllers import base from cyborg.api.controllers import link from cyborg.api.controllers import types from cyborg.api import expose from cyborg.common import authorize_wsgi +from cyborg.common import exception +from cyborg.common import policy from cyborg import objects LOG = log.getLogger(__name__) @@ -92,6 +99,11 @@ class DeviceCollection(base.APIBase): class DevicesController(base.CyborgController): """REST controller for Devices.""" + _custom_actions = {'update_type': ['PATCH'], 'get_vgpu_type': ['GET']} + + def __init__(self, *args, **kwargs): + super(DevicesController, self).__init__(*args, **kwargs) + self.agent = AgentAPI() @authorize_wsgi.authorize_wsgi("cyborg:device", "get_one") @expose.expose(Device, wtypes.text) @@ -128,3 +140,69 @@ class DevicesController(base.CyborgController): obj_devices = objects.Device.list(context, filters=filters_dict) LOG.info('[devices:get_all] Returned: 
%s', obj_devices) return DeviceCollection.convert_with_links(obj_devices) + + @authorize_wsgi.authorize_wsgi("cyborg:device", "get_vgpu_type", False) + @expose.expose('json', wtypes.text, body=types.jsontype, status_code=200) + def get_vgpu_type(self, uuid): + """Update vgpu_types of a gpu device. + :param uuid: UUID of an device. + """ + context = pecan.request.context + device = objects.Device.get(context, uuid) + if device.type != 'GPU': + raise exception.CyborgException("Only GPU device has vgpu_type.") + hostname = device.hostname + device_address = json.loads(device.vendor_board_info).get("device_address") + command = 'nsenter -m -t1 ssh {1} ls /sys/bus/pci/devices/{0}/mdev_supported_types/'.format( + device_address, hostname) + p = subprocess.Popen( + command, + shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out = p.stdout.readlines() + vgpu_types = [] + for vgpu_type in out: + vgpu_types.append(vgpu_type.strip()) + ret = {'vgpu_types': vgpu_types} + return wsme.api.Response(ret, status_code=HTTPStatus.OK, + return_type=wsme.types.DictType) + + @authorize_wsgi.authorize_wsgi("cyborg:device", "update_type", False) + @expose.expose('json', wtypes.text, body=types.jsontype, status_code=200) + def update_type(self, uuid, req_dev): + """Update vgpu_type for a gpu device. + :param uuid: UUID of an device. 
+ :param req_dev: type of device to update.{"vgpu_type": "nvidia-182"} + """ + LOG.info("[device update_type] PUT request = (%s)", req_dev) + vgpu_type = req_dev.get('vgpu_type') + if not vgpu_type: + raise exception.VGPUTypeIsNeed() + context = pecan.request.context + device = objects.Device.get(context, uuid) + if device.type != 'GPU': + raise exception.CyborgException("Only GPU device can update vgpu_type") + deployables = objects.Deployable.get_list_by_device_id(context, device_id=device.id) + for deployable in deployables: + attach_handlers = objects.AttachHandle.get_ah_list_by_deployable_id(context, deployable.id) + for attach_handler in attach_handlers: + if attach_handler.in_use: + raise exception.DeviceInUse(device=uuid) + LOG.info("[device.vendor_board_info = (%s)", device.vendor_board_info) + vbi = json.loads(device.vendor_board_info) + device_address = vbi.get("device_address") + hostname = device.hostname + command = 'nsenter -m -t1 ssh {2} ls /sys/bus/pci/devices/{0}/mdev_supported_types/{1}'.format( + device_address, vgpu_type, hostname) + p = subprocess.Popen( + command, + shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = p.communicate() + if not out: + raise exception.VGPUTypeNotExist(gpu=device_address, vgpu_type=vgpu_type) + vbi.update({'vgpu_type': vgpu_type}) + device.vendor_board_info = json.dumps(vbi) + device.save(context) + self.agent.update_mdev(context, hostname) + return Device.convert_with_links(device) diff --git a/cyborg/common/exception.py b/cyborg/common/exception.py index ba4e1ba7..8c4a8464 100644 --- a/cyborg/common/exception.py +++ b/cyborg/common/exception.py @@ -425,3 +425,16 @@ class FPGAProgramError(CyborgException): class PciDeviceNotFoundById(NotFound): _msg_fmt = _("PCI device %(id)s not found") + + +class DeviceInUse(Conflict): + _msg_fmt = _("Device %(device)s is in use.") + + +class VGPUTypeIsNeed(Invalid): + _msg_fmt = _("The vgpu type is need.") + + +class VGPUTypeNotExist(Invalid): + _msg_fmt 
= _("The vgpu type %(vgpu_type)s is not available for " + "GPU %(gpu)s.") diff --git a/cyborg/common/placement_client.py b/cyborg/common/placement_client.py index 284044bd..874b8650 100644 --- a/cyborg/common/placement_client.py +++ b/cyborg/common/placement_client.py @@ -85,16 +85,19 @@ class PlacementClient(object): def _ensure_traits(self, trait_names): # TODO(Xinran): maintain a reference count of how many RPs use # this trait and do the deletion only when the last RP is deleted. - for trait in trait_names: - resp = self.put("/traits/%s" % trait, None, version='1.6') + for trait_name in trait_names: + trait = self.get("/traits/%s" % trait_name, version='1.6') + if trait: + LOG.info("Trait %(trait)s already existed", + {"trait": trait_name}) + continue + resp = self.put("/traits/%s" % trait_name, None, version='1.6') if resp.status_code == 201: - LOG.info("Created trait %(trait)s", {"trait": trait}) - elif resp.status_code == 204: - LOG.info("Trait %(trait)s already existed", {"trait": trait}) + LOG.info("Created trait %(trait)s", {"trait": trait_name}) else: raise Exception( "Failed to create trait %s: HTTP %d: %s" % - (trait, resp.status_code, resp.text)) + (trait_name, resp.status_code, resp.text)) def _put_rp_traits(self, rp_uuid, traits_json): generation = self.get_resource_provider( diff --git a/cyborg/common/policy.py b/cyborg/common/policy.py index d0798cfb..c5fe1d4a 100644 --- a/cyborg/common/policy.py +++ b/cyborg/common/policy.py @@ -48,6 +48,12 @@ device_policies = [ policy.RuleDefault('cyborg:device:get_all', 'rule:allow', description='Retrieve all device records'), + policy.RuleDefault('cyborg:device:update_type', + 'rule:allow', + description='Update vgpu_type of GPU device'), + policy.RuleDefault('cyborg:device:get_vgpu_type', + 'rule:allow', + description='Get vgpu_type of GPU device'), ] deployable_policies = [ diff --git a/cyborg/conductor/manager.py b/cyborg/conductor/manager.py index 45cfd625..b522c9b6 100644 --- 
a/cyborg/conductor/manager.py +++ b/cyborg/conductor/manager.py @@ -104,6 +104,9 @@ class ConductorManager(object): """ ExtARQ.apply_patch(context, patch_list, valid_fields) + def get_host_devices(self, context, hostname): + return DriverDevice.list(context, hostname) + def report_data(self, context, hostname, driver_device_list): """Update the Cyborg DB in one hostname according to the discovered device list. diff --git a/cyborg/conductor/rpcapi.py b/cyborg/conductor/rpcapi.py index 93c2034d..c6e9a341 100644 --- a/cyborg/conductor/rpcapi.py +++ b/cyborg/conductor/rpcapi.py @@ -118,3 +118,12 @@ class ConductorAPI(object): cctxt = self.client.prepare(topic=self.topic) return cctxt.call(context, 'arq_apply_patch', patch_list=patch_list, valid_fields=valid_fields) + + def get_host_devices(self, context, hostname): + """Signal to conductor service to get host devices. + + :param context: request context. + :param hostname: host name + """ + cctxt = self.client.prepare(topic=self.topic) + return cctxt.call(context, 'get_host_devices', hostname=hostname) \ No newline at end of file