nova/nova/accelerator/cyborg.py

312 lines
12 KiB
Python

# Copyright 2019 Intel
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_log import log as logging
from keystoneauth1 import exceptions as ks_exc
from nova import exception
from nova.i18n import _
from nova import objects
from nova.scheduler import utils as schedutils
from nova import service_auth
from nova import utils
"""
Note on object relationships:
1 device profile (DP) has D >= 1 request groups (just as a flavor
has many request groups).
Each DP request group corresponds to exactly 1 numbered request
group (RG) in the request spec.
Each numbered RG corresponds to exactly one resource provider (RP).
A DP request group may request A >= 1 accelerators, and so result
in the creation of A ARQs.
Each ARQ corresponds to exactly 1 DP request group.
A device profile is a dictionary:
{ "name": "mydpname",
"uuid": <uuid>,
"groups": [ <device_profile_request_group> ]
}
A device profile group is a dictionary too:
{ "resources:CUSTOM_ACCELERATOR_FPGA": "2",
"resources:CUSTOM_LOCAL_MEMORY": "1",
"trait:CUSTOM_INTEL_PAC_ARRIA10": "required",
"trait:CUSTOM_FUNCTION_NAME_FALCON_GZIP_1_1": "required",
# 0 or more Cyborg properties
"accel:bitstream_id": "FB021995_BF21_4463_936A_02D49D4DB5E5"
}
See cyborg/cyborg/objects/device_profile.py for more details.
"""
# Module-level logger, named after this module via __name__.
LOG = logging.getLogger(__name__)
def get_client(context):
    """Build a Cyborg client for the given request context.

    :param context: request context passed to the _CyborgClient constructor
    :returns: a new _CyborgClient instance
    """
    client = _CyborgClient(context)
    return client
def get_device_profile_group_requester_id(dp_group_id):
    """Build the requester_id for a device profile request group.

    The resulting string is what objects.RequestGroup.requester_id is set
    to, so that device profile groups coming from Cyborg can be matched
    with the request groups in the request spec.

    :param dp_group_id: The index of the request group in the device profile.
    :returns: the string "device_profile_" followed by str(dp_group_id)
    """
    prefix = "device_profile_"
    return prefix + str(dp_group_id)
def get_device_profile_request_groups(context, dp_name):
    """Fetch the request groups of the named device profile.

    Convenience wrapper: obtains a client via get_client() and delegates to
    its get_device_profile_groups() method.

    :param context: request context used to build the client
    :param dp_name: device profile name
    :returns: whatever get_device_profile_groups() returns for dp_name
    """
    return get_client(context).get_device_profile_groups(dp_name)
class _CyborgClient(object):
    """Client for the Cyborg accelerator REST API.

    Each instance holds the adapter returned by utils.get_ksa_adapter()
    for the 'accelerator' service and routes every device profile and
    accelerator request (ARQ) call through it.
    """

    DEVICE_PROFILE_URL = "/device_profiles"
    ARQ_URL = "/accelerator_requests"

    def __init__(self, context):
        """Create the adapter used to talk to Cyborg.

        :param context: request context used to obtain the auth plugin
        """
        auth = service_auth.get_auth_plugin(context)
        self._client = utils.get_ksa_adapter('accelerator', ksa_auth=auth)

    def _call_cyborg(self, func, *args, **kwargs):
        """Invoke a client call and turn failures into an error message.

        :param func: callable to invoke (e.g. self._client.get); args and
            kwargs are passed through to it unchanged
        :returns: tuple (resp, err_msg). err_msg is None on success.
            If func raised ks_exc.ClientException, resp is None; if func
            returned a falsy response, that response is returned as resp.
        """
        resp = err_msg = None
        try:
            resp = func(*args, **kwargs)
            # A falsy response is reported as a failure. NOTE(review):
            # presumably the response object evaluates false for error
            # status codes -- confirm against the adapter's response type.
            if not resp:
                msg = _('Invalid response from Cyborg: ')
                err_msg = msg + str(resp)
        except ks_exc.ClientException as exc:
            err_msg = _('Could not communicate with Cyborg.')
            LOG.exception('%s: %s', err_msg, str(exc))

        return resp, err_msg

    def _get_device_profile_list(self, dp_name):
        """Query Cyborg for the device profiles matching a name.

        :param dp_name: device profile name used as the 'name' query param
        :returns: value of the 'device_profiles' key in the JSON response
            (None if the key is absent)
        :raises: DeviceProfileError if the call to Cyborg failed
        """
        query = {"name": dp_name}
        resp, err_msg = self._call_cyborg(
            self._client.get, self.DEVICE_PROFILE_URL, params=query)
        if err_msg:
            raise exception.DeviceProfileError(name=dp_name, msg=err_msg)

        return resp.json().get('device_profiles')

    def get_device_profile_groups(self, dp_name):
        """Get list of profile group objects from the device profile.

        Cyborg API returns: {"device_profiles": [<device_profile>]}

        :param dp_name: string: device profile name
            Expected to be valid, not None or ''.
        :returns: [objects.RequestGroup]
        :raises: DeviceProfileError
        """
        dp_list = self._get_device_profile_list(dp_name)
        if not dp_list:
            msg = _('Expected 1 device profile but got nothing.')
            raise exception.DeviceProfileError(name=dp_name, msg=msg)
        if len(dp_list) != 1:
            err = _('Expected 1 device profile but got %s.') % len(dp_list)
            raise exception.DeviceProfileError(name=dp_name, msg=err)

        dp_groups = dp_list[0]['groups']
        request_groups = []
        # The position of a group in the profile is its group id, which in
        # turn determines the requester_id of the request group.
        for dp_group_id, dp_group in enumerate(dp_groups):
            req_id = get_device_profile_group_requester_id(dp_group_id)
            rg = objects.RequestGroup(requester_id=req_id)
            for key, val in dp_group.items():
                match = schedutils.ResourceRequest.XS_KEYPAT.match(key)
                if not match:
                    continue  # could be 'accel:foo=bar', skip it
                prefix, _ignore, name = match.groups()
                if prefix == schedutils.ResourceRequest.XS_RES_PREFIX:
                    rg.add_resource(rclass=name, amount=val)
                elif prefix == schedutils.ResourceRequest.XS_TRAIT_PREFIX:
                    rg.add_trait(trait_name=name, trait_type=val)
            request_groups.append(rg)
        return request_groups

    def _create_arqs(self, dp_name):
        """Ask Cyborg to create ARQs for a device profile.

        :param dp_name: device profile name
        :returns: value of the 'arqs' key in the JSON response
            (None if the key is absent)
        :raises: AcceleratorRequestOpFailed if the call to Cyborg failed
        """
        data = {"device_profile_name": dp_name}
        resp, err_msg = self._call_cyborg(
            self._client.post, self.ARQ_URL, json=data)
        if err_msg:
            raise exception.AcceleratorRequestOpFailed(
                op=_('create'), msg=err_msg)

        return resp.json().get('arqs')

    def create_arqs_and_match_resource_providers(self, dp_name, rg_rp_map):
        """Create ARQs, match them with request groups and thereby
        determine their corresponding RPs.

        :param dp_name: Device profile name
        :param rg_rp_map: Request group - Resource Provider map
            {requester_id: [resource_provider_uuid]}
        :returns:
            [arq], with each ARQ associated with an RP
        :raises: AcceleratorRequestOpFailed
        """
        LOG.info('Creating ARQs for device profile %s', dp_name)
        arqs = self._create_arqs(dp_name)
        if not arqs:
            msg = _('device profile name %s') % dp_name
            raise exception.AcceleratorRequestOpFailed(op=_('create'), msg=msg)

        for arq in arqs:
            dp_group_id = arq['device_profile_group_id']
            requester_id = (
                get_device_profile_group_requester_id(dp_group_id))
            # NOTE(review): a requester_id missing from rg_rp_map, or one
            # mapped to an empty list, surfaces here as KeyError/IndexError
            # rather than AcceleratorRequestOpFailed -- confirm that callers
            # always pass a complete map.
            arq['device_rp_uuid'] = rg_rp_map[requester_id][0]
        return arqs

    def bind_arqs(self, bindings):
        """Initiate Cyborg bindings.

        Handles RFC 6902-compliant JSON patching, sparing
        calling Nova code from those details.

        :param bindings:
            { "$arq_uuid": {
                 "hostname": STRING,
                 "device_rp_uuid": UUID,
                 "instance_uuid": UUID
              },
              ...
            }
        :returns: nothing
        :raises: AcceleratorRequestBindingFailed
        """
        LOG.info('Binding ARQs.')
        # Create a JSON patch in RFC 6902 format
        patch_list = {
            arq_uuid: [{"path": "/" + field,
                        "op": "add",
                        "value": value
                        } for field, value in binding.items()]
            for arq_uuid, binding in bindings.items()
        }

        _resp, err_msg = self._call_cyborg(
            self._client.patch, self.ARQ_URL, json=patch_list)
        if err_msg:
            # Snapshot the UUIDs as a list so the exception does not carry
            # a live view onto the caller's bindings dict.
            arq_uuids = list(bindings)
            msg = _(' Binding failed for ARQ UUIDs: ')
            err_msg = err_msg + msg + ','.join(arq_uuids)
            raise exception.AcceleratorRequestBindingFailed(
                arqs=arq_uuids, msg=err_msg)

    def get_arqs_for_instance(self, instance_uuid, only_resolved=False):
        """Get ARQs for the instance.

        :param instance_uuid: Instance UUID
        :param only_resolved: flag to return only resolved ARQs
        :returns: List of ARQs for the instance:
            if only_resolved: only those ARQs whose state is 'Bound',
                'BindFailed' or 'Deleting'
            else: all ARQs
            The format of the returned data structure is as below:
            [
             {'uuid': $arq_uuid,
              'device_profile_name': $dp_name,
              'device_profile_group_id': $dp_request_group_index,
              'state': 'Bound',
              'device_rp_uuid': $resource_provider_uuid,
              'hostname': $host_nodename,
              'instance_uuid': $instance_uuid,
              'attach_handle_info': {  # PCI bdf
                  'bus': '0c', 'device': '0',
                  'domain': '0000', 'function': '0'},
              'attach_handle_type': 'PCI'
                   # or 'TEST_PCI' for Cyborg fake driver
             }
            ]
        :raises: AcceleratorRequestOpFailed
        """
        query = {"instance": instance_uuid}
        resp, err_msg = self._call_cyborg(
            self._client.get, self.ARQ_URL, params=query)
        if err_msg:
            err_msg = err_msg + _(' Instance %s') % instance_uuid
            raise exception.AcceleratorRequestOpFailed(
                op=_('get'), msg=err_msg)

        arqs = resp.json().get('arqs')
        if not arqs:
            err_msg = _('Cyborg returned no accelerator requests for '
                        'instance %s') % instance_uuid
            raise exception.AcceleratorRequestOpFailed(
                op=_('get'), msg=err_msg)

        if only_resolved:
            resolved_states = ('Bound', 'BindFailed', 'Deleting')
            arqs = [arq for arq in arqs if arq['state'] in resolved_states]
        return arqs

    def delete_arqs_for_instance(self, instance_uuid):
        """Delete ARQs for instance, after unbinding if needed.

        :param instance_uuid: Instance UUID
        :raises: AcceleratorRequestOpFailed
        """
        # Unbind and delete the ARQs
        params = {"instance": instance_uuid}
        _resp, err_msg = self._call_cyborg(
            self._client.delete, self.ARQ_URL, params=params)
        if err_msg:
            msg = err_msg + _(' Instance %s') % instance_uuid
            raise exception.AcceleratorRequestOpFailed(
                op=_('delete'), msg=msg)

    def delete_arqs_by_uuid(self, arq_uuids):
        """Delete the specified ARQs, unbinding them if needed.

        This is meant to be used to clean up ARQs that have failed to bind
        to an instance. So delete_arqs_for_instance() is not applicable.

        This Cyborg API call is NOT idempotent, i.e., if called more than
        once, the 2nd and later calls will throw errors.

        If this fails, an error is logged but no exception is raised
        because this cleans up Cyborg resources, but should otherwise
        not affect instance spawn.

        :param arq_uuids: iterable of ARQ UUID strings (e.g. a list or
            dict_keys())
        """
        arq_uuid_str = ','.join(arq_uuids)
        params = {'arqs': arq_uuid_str}
        _resp, err_msg = self._call_cyborg(
            self._client.delete, self.ARQ_URL, params=params)
        if err_msg:
            # No point raising an exception, but keep the reason in the log.
            LOG.error('Failed to delete ARQs %s: %s', arq_uuid_str, err_msg)

    def get_arq_uuids_for_instance(self, instance):
        """Get ARQ UUIDs for the instance.

        :param instance: Instance Object
        :return: ARQ UUIDs.
        :raises: AcceleratorRequestOpFailed (from get_arqs_for_instance)
        """
        return [arq['uuid']
                for arq in self.get_arqs_for_instance(instance.uuid)]