Identify accelerator devices during introspection

Identify accelerator devices by processing PCI devices and update the
ironic node when any are found. Currently only the Tesla T4 from NVIDIA is
supported.

Change-Id: Id702cb04cb2445d544965821680cd0cc5cfd37e5
Story: 2007971
Task: 40473
This commit is contained in:
Kaifeng Wang 2020-08-07 14:13:51 +08:00
parent 7ff52c732b
commit de2a27ad8b
9 changed files with 197 additions and 0 deletions

View File

@ -258,6 +258,19 @@ Here are some plugins that can be additionally enabled:
[port_physnet]
cidr_map = 10.10.10.0/24:physnet_a, 2001:db8::/64:physnet_b
``accelerators``
Processes PCI data returned from inspection and compares it with the
accelerator inventory. If any accelerator device is found, its
information is saved to the properties field of the ironic node,
for example::
{'local_gb': '1115', 'cpus': '40', 'cpu_arch': 'x86_64', 'memory_mb': '32768',
'capabilities': 'boot_mode:bios,cpu_vt:true,cpu_aes:true,cpu_hugepages:true,cpu_hugepages_1g:true,cpu_txt:true',
'accelerators': [{'vendor_id': '10de', 'device_id': '1eb8', 'type': 'GPU',
'pci_address': '0000:82:00.0',
'device_info': 'NVIDIA Corporation Tesla T4'}]
}
Refer to :ref:`contributing_link` for information on how to write your
own plugin.

View File

@ -12,6 +12,7 @@
from oslo_config import cfg
from ironic_inspector.conf import accelerators
from ironic_inspector.conf import capabilities
from ironic_inspector.conf import coordination
from ironic_inspector.conf import default
@ -31,6 +32,7 @@ from ironic_inspector.conf import swift
CONF = cfg.CONF
accelerators.register_opts(CONF)
capabilities.register_opts(CONF)
coordination.register_opts(CONF)
discovery.register_opts(CONF)

View File

@ -0,0 +1,35 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from oslo_config import cfg
from ironic_inspector.common.i18n import _
# Absolute path of ``known_accelerators.yaml`` resolved one directory above
# the directory holding this module; used as the option's default value.
_DEFAULT_KNOWN_DEVICES = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '../known_accelerators.yaml'))

# Options registered under the ``[accelerators]`` configuration group.
_OPTS = [
    cfg.StrOpt(
        'known_devices',
        default=_DEFAULT_KNOWN_DEVICES,
        help=_('The predefined accelerator devices which contains '
               'information used for identifying accelerators.'),
    ),
]
def register_opts(conf):
    """Register the accelerator options on *conf*.

    :param conf: configuration object exposing ``register_opts``; the
        options are placed in its ``accelerators`` group.
    """
    conf.register_opts(_OPTS, group='accelerators')
def list_opts():
    """Return the list of options defined by this module.

    The module-level list itself is returned, not a copy.
    """
    return _OPTS

View File

@ -0,0 +1,5 @@
pci_devices:
- vendor_id: "10de"
device_id: "1eb8"
type: GPU
device_info: NVIDIA Corporation Tesla T4

View File

@ -0,0 +1,78 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Gather and distinguish Accelerator PCI devices from inventory."""
from oslo_config import cfg
import yaml
from ironic_inspector.plugins import base
from ironic_inspector import utils
CONF = cfg.CONF
LOG = utils.getProcessingLogger(__name__)
class AccelDevicesHook(base.ProcessingHook):
    """Processing hook for distinguishing accelerator devices.

    The known accelerators are loaded once, at construction time, from the
    YAML file named by the ``[accelerators]known_devices`` option. During
    processing, every PCI device found in the introspection data is compared
    against that list by vendor ID and device ID; matches are saved to the
    node properties under the ``accelerators`` key.
    """

    def __init__(self):
        """Load and validate the known accelerator devices.

        :raises: RuntimeError if the loaded data fails validation.
        """
        super(AccelDevicesHook, self).__init__()
        self._known_devices = {}
        with open(CONF.accelerators.known_devices) as f:
            self._known_devices = yaml.safe_load(f)
        self._validate_datasource()

    def _validate_datasource(self):
        """Do a simple check against the data source.

        :raises: RuntimeError if the data is empty or not a mapping, has no
            ``pci_devices`` list, or contains an entry that is not a mapping
            with non-empty ``vendor_id`` and ``device_id``.
        """
        # An empty file loads as None and a malformed one may load as a
        # list or scalar; none of these can carry a 'pci_devices' key.
        if (not isinstance(self._known_devices, dict) or
                'pci_devices' not in self._known_devices):
            raise RuntimeError('Could not find pci_devices in the '
                               'configuration data')
        if not isinstance(self._known_devices['pci_devices'], list):
            raise RuntimeError('pci_devices should contain a list of devices')
        for device in self._known_devices['pci_devices']:
            # Reject non-mapping entries with the same error instead of
            # failing with AttributeError on .get().
            if (not isinstance(device, dict) or
                    not device.get('vendor_id') or
                    not device.get('device_id')):
                raise RuntimeError('one of devices is missing vendor_id or '
                                   'device_id')

    def _find_accelerator(self, vendor_id, device_id):
        """Look up a known accelerator by its PCI IDs.

        :param vendor_id: PCI vendor ID to match against ``vendor_id``.
        :param device_id: PCI device ID to match against ``device_id``.
        :returns: the first matching known device entry, or None.
        """
        for dev in self._known_devices['pci_devices']:
            if (dev['vendor_id'] == vendor_id and
                    dev['device_id'] == device_id):
                return dev
        return None

    def before_update(self, introspection_data, node_info, **kwargs):
        """Record known accelerator devices in the node properties.

        :param introspection_data: introspection data; its ``pci_devices``
            entry is read as a list of mappings with ``vendor_id``,
            ``product_id`` and ``bus`` keys.
        :param node_info: node object whose ``update_properties`` is called
            with ``accelerators=<list>`` when at least one device matches.
        :param kwargs: ignored.
        """
        pci_devices = introspection_data.get('pci_devices', [])
        if not pci_devices:
            LOG.warning('Unable to distinguish accelerator devices due to no '
                        'PCI devices information was received from the '
                        'ramdisk.')
            return

        accelerators = []
        for pci_dev in pci_devices:
            vendor_id = pci_dev.get('vendor_id')
            product_id = pci_dev.get('product_id')
            if not vendor_id or not product_id:
                # An entry without both IDs cannot be identified; skip it
                # rather than failing the whole hook with KeyError.
                LOG.debug('Skipping PCI device without vendor or product '
                          'ID: %s', pci_dev)
                continue

            dev = self._find_accelerator(vendor_id, product_id)
            if dev:
                # Copy the known entry so the shared inventory is never
                # mutated, then attach where this device was found.
                accelerators.append(
                    dict(dev, pci_address=pci_dev.get('bus')))

        if accelerators:
            node_info.update_properties(accelerators=accelerators)
            LOG.info('Found the following accelerator devices: %s',
                     accelerators)
        else:
            LOG.info('No known accelerator devices found')

View File

@ -0,0 +1,52 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest import mock
from ironic_inspector import node_cache
from ironic_inspector.plugins import accel_device
from ironic_inspector.test import base as test_base


class TestAccelDevicesHook(test_base.NodeTest):
    """Unit tests for the ``accelerators`` processing hook."""

    def setUp(self):
        super(TestAccelDevicesHook, self).setUp()
        # Build the hook per test rather than in the class body: the
        # constructor reads the known-devices file through CONF, which
        # should not happen as a side effect of importing this module.
        self.hook = accel_device.AccelDevicesHook()

    @mock.patch.object(node_cache.NodeInfo, 'update_properties',
                       autospec=True)
    def test_before_update(self, mock_update_props):
        """A known device is reported together with its PCI address."""
        self.data['pci_devices'] = [
            {"vendor_id": "10de", "product_id": "1eb8", "class": "1234",
             "bus": "0000:01:1f.0", "revision": "1"},
        ]
        expected_accels = [{'vendor_id': '10de', 'device_id': '1eb8',
                            'type': 'GPU', 'pci_address': '0000:01:1f.0',
                            'device_info': 'NVIDIA Corporation Tesla T4'}]
        self.hook.before_update(self.data, self.node_info)
        mock_update_props.assert_called_once_with(
            self.node_info, accelerators=expected_accels)

    @mock.patch.object(node_cache.NodeInfo, 'update_properties',
                       autospec=True)
    def test_before_update_no_pci_info_from_ipa(self, mock_update_props):
        """Nothing is updated when no ``pci_devices`` entry is set here."""
        self.hook.before_update(self.data, self.node_info)
        self.assertFalse(mock_update_props.called)

    @mock.patch.object(node_cache.NodeInfo, 'update_properties',
                       autospec=True)
    def test_before_update_no_match(self, mock_update_props):
        """Nothing is updated when no reported device is a known one."""
        self.data['pci_devices'] = [
            {"vendor_id": "1234", "product_id": "1234", "class": "1234",
             "bus": "0000:01:1f.0", "revision": "1"},
        ]
        self.hook.before_update(self.data, self.node_info)
        self.assertFalse(mock_update_props.called)

View File

@ -0,0 +1,10 @@
---
features:
- |
Adds an ``accelerators`` plugin to identify accelerator devices and update
the bare metal node for future scheduling. The accelerator devices will be
saved to node properties under the key ``accelerators``. Introduces a
configuration option ``[accelerators]known_devices`` to specify a
configuration file which contains the information required to identify
accelerator devices. By default, it uses the in-tree configuration file
named ``known_accelerators.yaml``.

View File

@ -15,6 +15,7 @@ keystonemiddleware>=4.18.0 # Apache-2.0
netaddr>=0.7.18 # BSD
pbr!=2.1.0,>=2.0.0 # Apache-2.0
pytz>=2013.6 # MIT
PyYAML>=5.3.1 # MIT
openstacksdk>=0.40.0 # Apache-2.0
oslo.concurrency>=3.26.0 # Apache-2.0
oslo.config>=5.2.0 # Apache-2.0

View File

@ -36,6 +36,7 @@ console_scripts =
wsgi_scripts =
ironic-inspector-api-wsgi = ironic_inspector.cmd.wsgi:initialize_wsgi_app
ironic_inspector.hooks.processing =
accelerators = ironic_inspector.plugins.accel_device:AccelDevicesHook
scheduler = ironic_inspector.plugins.standard:SchedulerHook
validate_interfaces = ironic_inspector.plugins.standard:ValidateInterfacesHook
ramdisk_error = ironic_inspector.plugins.standard:RamdiskErrorHook