From 4b34d897d206f30a5a0ac9a963b119242ded6488 Mon Sep 17 00:00:00 2001 From: Yumeng Bao Date: Thu, 21 Jan 2021 15:37:42 +0800 Subject: [PATCH] arq bind and unbind support vGPU This patch is part of the vGPU support feature in cyborg. It implements arq bind and unbind for vGPU resources. Co-Authored-By: Wenping Song Change-Id: I32c3b81345c6ce83834a83c64b88e37926724f16 --- cyborg/accelerator/drivers/gpu/utils.py | 22 +++++++++++++++++ cyborg/agent/manager.py | 9 +++++++ cyborg/agent/rpcapi.py | 22 +++++++++++++++++ cyborg/api/controllers/v2/arqs.py | 1 + cyborg/common/exception.py | 4 +++ cyborg/conductor/manager.py | 2 +- cyborg/objects/arq.py | 1 + cyborg/objects/ext_arq.py | 31 ++++++++++++++++++++++-- cyborg/tests/unit/objects/test_extarq.py | 6 +++-- 9 files changed, 93 insertions(+), 5 deletions(-) diff --git a/cyborg/accelerator/drivers/gpu/utils.py b/cyborg/accelerator/drivers/gpu/utils.py index a21615c0..1b9963fc 100644 --- a/cyborg/accelerator/drivers/gpu/utils.py +++ b/cyborg/accelerator/drivers/gpu/utils.py @@ -19,6 +19,7 @@ from oslo_log import log as logging import re +import cyborg.common.exception as exception import cyborg.conf import cyborg.privsep @@ -41,6 +42,27 @@ def lspci_privileged(): return processutils.execute(*cmd) +@cyborg.privsep.sys_admin_pctxt.entrypoint +def create_mdev_privileged(pci_addr, mdev_type, ah_uuid): + """Instantiate a mediated device.""" + if ah_uuid is None: + raise exception.AttachHandleUUIDNeeded() + fpath = '/sys/class/mdev_bus/{0}/mdev_supported_types/{1}/create' + fpath = fpath.format(pci_addr, mdev_type) + with open(fpath, 'w') as f: + f.write(ah_uuid) + return ah_uuid + + +@cyborg.privsep.sys_admin_pctxt.entrypoint +def remove_mdev_privileged(physical_device, mdev_type, medv_uuid): + fpath = ('/sys/class/mdev_bus/{0}/mdev_supported_types/' + '{1}/devices/{2}/remove') + fpath = fpath.format(physical_device, mdev_type, medv_uuid) + with open(fpath, 'w') as f: + f.write("1") + + def get_pci_devices(pci_flags, 
vendor_id=None): device_for_vendor_out = [] all_device_out = [] diff --git a/cyborg/agent/manager.py b/cyborg/agent/manager.py index e27039de..153a7597 100644 --- a/cyborg/agent/manager.py +++ b/cyborg/agent/manager.py @@ -21,6 +21,7 @@ from oslo_service import periodic_task from oslo_utils import uuidutils from cyborg.accelerator.drivers.fpga.base import FPGADriver +from cyborg.accelerator.drivers.gpu import utils as gpu_utils from cyborg.agent.resource_tracker import ResourceTracker from cyborg.agent.rpcapi import AgentAPI from cyborg.common import exception @@ -80,3 +81,11 @@ class AgentManager(periodic_task.PeriodicTasks): def update_available_resource(self, context, startup=True): """Update all kinds of accelerator resources from their drivers.""" self._rt.update_usage(context) + + def create_vgpu_mdev(self, context, pci_addr, asked_type, ah_uuid): + LOG.debug('Instantiate a mediated device') + gpu_utils.create_mdev_privileged(pci_addr, asked_type, ah_uuid) + + def remove_vgpu_mdev(self, context, pci_addr, asked_type, ah_uuid): + LOG.debug('Remove a vgpu mdev') + gpu_utils.remove_mdev_privileged(pci_addr, asked_type, ah_uuid) diff --git a/cyborg/agent/rpcapi.py b/cyborg/agent/rpcapi.py index 644ed591..29c66350 100644 --- a/cyborg/agent/rpcapi.py +++ b/cyborg/agent/rpcapi.py @@ -61,3 +61,25 @@ class AgentAPI(object): controlpath_id=controlpath_id, bitstream_uuid=bitstream_uuid, driver_name=driver_name) + + def create_vgpu_mdev(self, context, hostname, pci_addr, + asked_type, ah_uuid): + LOG.debug('Agent create_vgpu_mdev: hostname: (%s) , pci_address: (%s)' + 'gpu_id: (%s)', hostname, pci_addr, ah_uuid) + version = '1.0' + cctxt = self.client.prepare(server=hostname, version=version) + return cctxt.call(context, 'create_vgpu_mdev', + pci_addr=pci_addr, + asked_type=asked_type, + ah_uuid=ah_uuid) + + def remove_vgpu_mdev(self, context, hostname, pci_addr, + asked_type, ah_uuid): + LOG.debug('Agent remove_vgpu_mdev: hostname: (%s) ' + 'gpu_id: (%s)', hostname, 
ah_uuid) + version = '1.0' + cctxt = self.client.prepare(server=hostname, version=version) + return cctxt.call(context, 'remove_vgpu_mdev', + pci_addr=pci_addr, + asked_type=asked_type, + ah_uuid=ah_uuid) diff --git a/cyborg/api/controllers/v2/arqs.py b/cyborg/api/controllers/v2/arqs.py index 23ba110d..aea1dd0a 100644 --- a/cyborg/api/controllers/v2/arqs.py +++ b/cyborg/api/controllers/v2/arqs.py @@ -60,6 +60,7 @@ class ARQ(base.APIBase): """The UUID of the instance project_id associated with this ARQ, if any""" attach_handle_type = wtypes.text + attach_handle_uuid = wtypes.text attach_handle_info = {wtypes.text: wtypes.text} links = wsme.wsattr([link.Link], readonly=True) diff --git a/cyborg/common/exception.py b/cyborg/common/exception.py index c5f0edf5..ba4e1ba7 100644 --- a/cyborg/common/exception.py +++ b/cyborg/common/exception.py @@ -92,6 +92,10 @@ class AttachHandleAlreadyExists(CyborgException): _msg_fmt = _("AttachHandle with uuid %(uuid)s already exists.") +class AttachHandleUUIDNeeded(CyborgException): + _msg_fmt = _("Need to provide AttachHandle uuid.") + + class ControlpathIDAlreadyExists(CyborgException): _msg_fmt = _("ControlpathID with uuid %(uuid)s already exists.") diff --git a/cyborg/conductor/manager.py b/cyborg/conductor/manager.py index 895c7302..45cfd625 100644 --- a/cyborg/conductor/manager.py +++ b/cyborg/conductor/manager.py @@ -370,7 +370,7 @@ class ConductorManager(object): "resource_providers?name=" + hostname).json() pr_uuid = provider["resource_providers"][0]["uuid"] return pr_uuid - except IndexError: + except (IndexError, KeyError): raise exception.PlacementResourceProviderNotFound( resource_provider=hostname) diff --git a/cyborg/objects/arq.py b/cyborg/objects/arq.py index 033039ad..17e81a7c 100644 --- a/cyborg/objects/arq.py +++ b/cyborg/objects/arq.py @@ -48,6 +48,7 @@ class ARQ(base.CyborgObject, object_base.VersionedObjectDictCompat): # Fields populated by Cyborg after binding 'attach_handle_type': 
object_fields.StringField(nullable=True), + 'attach_handle_uuid': object_fields.StringField(nullable=True), 'attach_handle_info': object_fields.DictOfStringsField(nullable=True), } diff --git a/cyborg/objects/ext_arq.py b/cyborg/objects/ext_arq.py index ddfa7c09..4cc1e209 100644 --- a/cyborg/objects/ext_arq.py +++ b/cyborg/objects/ext_arq.py @@ -13,11 +13,14 @@ # License for the specific language governing permissions and limitations # under the License. +import json + from openstack import connection from oslo_log import log as logging from oslo_utils import versionutils from oslo_versionedobjects import base as object_base +from cyborg.agent.rpcapi import AgentAPI from cyborg.common import constants from cyborg.common.constants import ARQ_STATES_TRANSFORM_MATRIX from cyborg.common import exception @@ -78,6 +81,10 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat, if target_version < (1, 2) and 'deployable_id' in primitive: del primitive['deployable_id'] + def __init__(self, *args, **kwargs): + super(ExtARQ, self).__init__(*args, **kwargs) + self.agent = AgentAPI() + def create(self, context, device_profile_id=None): """Create an ExtARQ record in the DB.""" if 'device_profile_name' not in self.arq and not device_profile_id: @@ -213,6 +220,16 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat, try: ah = AttachHandle.allocate(context, deployable.id) self.attach_handle_id = ah.id + # if attach_handle is a vgpu, create the mdev in the sys path + if ah.attach_type == 'MDEV': + attach_info = json.loads(ah.attach_info) + pci_addr = "{}:{}:{}.{}".format( + attach_info['domain'], attach_info['bus'], + attach_info['device'], attach_info['function']) + hostname = self.arq.hostname + asked_type = attach_info['asked_type'] + self.agent.create_vgpu_mdev( + context, hostname, pci_addr, asked_type, ah.uuid) except Exception as e: LOG.error("Failed to allocate attach handle for ARQ %s" "from deployable %s. 
Reason: %s", @@ -237,9 +254,17 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat, # if (self.arq.state == constants.ARQ_DELETING # or self.arq.state == ARQ_UNBOUND): - def _deallocate_attach_handle(self, context, ah_id): + def _deallocate_attach_handle(self, context, ah_id, hostname): try: attach_handle = AttachHandle.get_by_id(context, ah_id) + if attach_handle.attach_type == 'MDEV': + attach_info = json.loads(attach_handle.attach_info) + pci_addr = "{}:{}:{}.{}".format( + attach_info['domain'], attach_info['bus'], + attach_info['device'], attach_info['function']) + self.agent.remove_vgpu_mdev( + context, hostname, pci_addr, + attach_info['asked_type'], attach_handle.uuid) attach_handle.deallocate(context) except Exception as e: LOG.error("Failed to deallocate attach handle %s for ARQ %s." @@ -252,6 +277,7 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat, def unbind(self, context): arq = self.arq + hostname = arq.hostname arq.hostname = None arq.device_rp_uuid = None arq.instance_uuid = None @@ -260,7 +286,7 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat, # Unbind: mark attach handles as freed ah_id = self.attach_handle_id if ah_id: - self._deallocate_attach_handle(context, ah_id) + self._deallocate_attach_handle(context, ah_id, hostname) self.attach_handle_id = None self.deployable_id = None self.save(context) @@ -285,6 +311,7 @@ class ExtARQ(base.CyborgObject, object_base.VersionedObjectDictCompat, if db_ah is not None: db_extarq['attach_handle_type'] = db_ah['attach_type'] db_extarq['attach_handle_info'] = db_ah['attach_info'] + db_extarq['attach_handle_uuid'] = db_ah['uuid'] else: raise exception.ResourceNotFound( resource='Attach Handle', diff --git a/cyborg/tests/unit/objects/test_extarq.py b/cyborg/tests/unit/objects/test_extarq.py index 45e09633..4cbf594e 100644 --- a/cyborg/tests/unit/objects/test_extarq.py +++ b/cyborg/tests/unit/objects/test_extarq.py @@ -352,7 +352,8 @@ class 
TestExtARQObject(base.DbTestCase): self, mock_deallocate, mock_ah, mock_check_state): obj_extarq = self.fake_obj_extarqs[0] mock_ah.return_value = self.fake_obj_ahs[0] - obj_extarq._deallocate_attach_handle(self.context, mock_ah.id) + obj_extarq._deallocate_attach_handle( + self.context, mock_ah.id, obj_extarq.arq.hostname) mock_check_state.assert_not_called() @mock.patch('logging.LoggerAdapter.error') @@ -370,7 +371,8 @@ class TestExtARQObject(base.DbTestCase): mock_deallocate.side_effect = e self.assertRaises( exception.ResourceNotFound, - obj_extarq._deallocate_attach_handle, self.context, mock_ah.id) + obj_extarq._deallocate_attach_handle, self.context, mock_ah.id, + obj_extarq.arq.hostname) mock_log.assert_called_once_with( msg, mock_ah.id, obj_extarq.arq.uuid, str(e)) mock_check_state.assert_called_once_with(