Browse Source

Add GPU reporting to idrac-wsman inspect interface

This patch implements reporting the number of NVIDIA Tesla T4
devices connected to a system by discovering such devices
and reporting the count through the capability 'pci_gpu_devices'.

Change-Id: If713895f05f08a9827c4c085108abb3e388b2a2e
Story: 2008118
Task: 40839
Depends-On: https://review.opendev.org/#/c/750364/
tags/16.0.0
Mudit 1 month ago
committed by Richard Pioso
parent
commit
101fc29686
5 changed files with 145 additions and 5 deletions
  1. +1
    -0
      doc/source/admin/drivers/idrac.rst
  2. +1
    -1
      driver-requirements.txt
  3. +23
    -1
      ironic/drivers/modules/drac/inspect.py
  4. +112
    -3
      ironic/tests/unit/drivers/modules/drac/test_inspect.py
  5. +8
    -0
      releasenotes/notes/idrac-add-gpu-reporting-support-f4d80e2071f85f6a.yaml

+ 1
- 0
doc/source/admin/drivers/idrac.rst View File

@@ -259,6 +259,7 @@ The inspection discovers the following properties:
Extra capabilities:

* ``boot_mode``: UEFI or BIOS boot mode.
* ``pci_gpu_devices``: number of supported GPU devices (currently NVIDIA
  Tesla T4) connected to the bare metal node.

It also creates baremetal ports for each NIC port detected in the system.
The ``idrac-wsman`` inspect interface discovers which NIC ports are


+ 1
- 1
driver-requirements.txt View File

@@ -7,7 +7,7 @@
proliantutils>=2.10.0
pysnmp>=4.3.0,<5.0.0
python-scciclient>=0.8.0
python-dracclient>=3.1.0,<6.0.0
python-dracclient>=5.1.0,<6.0.0
python-xclarityclient>=0.1.6

# The Redfish hardware type uses the Sushy library


+ 23
- 1
ironic/drivers/modules/drac/inspect.py View File

@@ -49,6 +49,8 @@ class DracRedfishInspect(redfish_inspect.RedfishInspect):

class DracWSManInspect(base.InspectInterface):

_GPU_SUPPORTED_LIST = {"TU104GL [Tesla T4]"}

def get_properties(self):
"""Return the properties of the interface.

@@ -98,9 +100,12 @@ class DracWSManInspect(base.InspectInterface):
properties['cpu_arch'] = 'x86_64' if cpus[0].arch64 else 'x86'

bios_settings = client.list_bios_settings()
video_controllers = client.list_video_controllers()
current_capabilities = node.properties.get('capabilities', '')
new_capabilities = {
'boot_mode': bios_settings["BootMode"].current_value.lower()}
'boot_mode': bios_settings["BootMode"].current_value.lower(),
'pci_gpu_devices': self._calculate_gpus(video_controllers)}

capabilties = utils.get_updated_capabilities(current_capabilities,
new_capabilities)
properties['capabilities'] = capabilties
@@ -190,6 +195,23 @@ class DracWSManInspect(base.InspectInterface):
else:
return cpu.cores

def _calculate_gpus(self, video_controllers):
"""Find actual GPU count.

This method reports number of NVIDIA Tesla T4 GPU devices present
on the server.

:param video_controllers: list of video controllers.

:returns: returns total gpu count.
"""
gpu_cnt = 0
for video_controller in video_controllers:
for gpu in self._GPU_SUPPORTED_LIST:
if video_controller.description == gpu:
gpu_cnt += 1
return gpu_cnt

def _get_pxe_dev_nics(self, client, nics, node):
"""Get a list of pxe device interfaces.



+ 112
- 3
ironic/tests/unit/drivers/modules/drac/test_inspect.py View File

@@ -135,6 +135,23 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'PxeDev4Interface': None}
nic_settings = {'LegacyBootProto': {'current_value': 'PXE'},
'FQDD': 'NIC.Embedded.1-1-1'}
video_controllers = [
{'id': 'Video.Embedded.1-1',
'description': 'Integrated Matrox G200eW3 Graphics Controller',
'function_number': 0,
'manufacturer': 'Matrox Electronics Systems Ltd.',
'pci_device_id': '0536',
'pci_vendor_id': '102B',
'pci_subdevice_id': '0737',
'pci_subvendor_id': '1028'},
{'id': 'Video.Slot.7-1',
'description': 'TU104GL [Tesla T4]',
'function_number': 0,
'manufacturer': 'NVIDIA Corporation',
'pci_device_id': '1EB8',
'pci_vendor_id': '10DE',
'pci_subdevice_id': '12A2',
'pci_subvendor_id': '10DE'}]

self.memory = [test_utils.dict_to_namedtuple(values=m) for m in memory]
self.cpus = [test_utils.dict_to_namedtuple(values=c) for c in cpus]
@@ -146,6 +163,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
self.bios_boot_settings = test_utils.dict_of_object(bios_boot_settings)
self.uefi_boot_settings = test_utils.dict_of_object(uefi_boot_settings)
self.nic_settings = test_utils.dict_of_object(nic_settings)
self.video_controllers = [test_utils.dict_to_namedtuple(values=vc)
for vc in video_controllers]

def test_get_properties(self):
expected = drac_common.COMMON_PROPERTIES
@@ -161,7 +180,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'local_gb': 1116,
'cpus': 18,
'cpu_arch': 'x86_64',
'capabilities': 'boot_mode:uefi'}
'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'}
mock_client = mock.Mock()
mock_get_drac_client.return_value = mock_client
mock_client.list_memory.return_value = self.memory
@@ -169,6 +188,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_virtual_disks.return_value = self.virtual_disks
mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings
mock_client.list_video_controllers.return_value = \
self.video_controllers

with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
@@ -191,6 +212,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_virtual_disks.side_effect = (
drac_exceptions.BaseClientException('boom'))
mock_client.list_bios_settings.return_value = self.bios_boot_settings
mock_client.list_video_controllers.return_value = \
self.video_controllers

with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
@@ -207,7 +230,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'local_gb': 279,
'cpus': 18,
'cpu_arch': 'x86_64',
'capabilities': 'boot_mode:uefi'}
'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'}
mock_client = mock.Mock()
mock_get_drac_client.return_value = mock_client
mock_client.list_memory.return_value = self.memory
@@ -216,6 +239,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_physical_disks.return_value = self.physical_disks
mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings
mock_client.list_video_controllers.return_value = \
self.video_controllers

with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
@@ -239,12 +264,94 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_physical_disks.return_value = self.physical_disks
mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings
mock_client.list_video_controllers.return_value = \
self.video_controllers

with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
self.assertRaises(exception.HardwareInspectionFailure,
task.driver.inspect.inspect_hardware, task)

@mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
                   autospec=True)
@mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True)
def test_inspect_hardware_no_supported_gpu(self, mock_port_create,
                                           mock_get_drac_client):
    """Inspect a node whose only GPU is not in the supported list.

    The mocked client reports an embedded Matrox controller and a
    Tesla V100. Neither description is 'TU104GL [Tesla T4]', so the
    ``pci_gpu_devices`` capability is expected to be ``0``.
    """
    controllers = [
        {'id': 'Video.Embedded.1-1',
         'description': 'Integrated Matrox G200eW3 Graphics Controller',
         'function_number': 0,
         'manufacturer': 'Matrox Electronics Systems Ltd.',
         'pci_device_id': '0536',
         'pci_vendor_id': '102B',
         'pci_subdevice_id': '0737',
         'pci_subvendor_id': '1028'},
        {'id': 'Video.Slot.7-1',
         # Stray doubled closing bracket removed from the fixture string.
         'description': 'GV100GL [Tesla V100 PCIe 16GB]',
         'function_number': 0,
         'manufacturer': 'NVIDIA Corporation',
         'pci_device_id': '1DB4',
         'pci_vendor_id': '10DE',
         'pci_subdevice_id': '1214',
         'pci_subvendor_id': '10DE'}]

    expected_node_properties = {
        'memory_mb': 32768,
        'local_gb': 279,
        'cpus': 18,
        'cpu_arch': 'x86_64',
        'capabilities': 'boot_mode:uefi,pci_gpu_devices:0'}
    mock_client = mock.Mock()
    mock_get_drac_client.return_value = mock_client
    mock_client.list_memory.return_value = self.memory
    mock_client.list_cpus.return_value = self.cpus
    mock_client.list_virtual_disks.return_value = []
    mock_client.list_physical_disks.return_value = self.physical_disks
    mock_client.list_nics.return_value = self.nics
    mock_client.list_bios_settings.return_value = self.uefi_boot_settings
    # The inspect code reads ``description`` as an attribute, so the
    # fixture dicts are converted to namedtuples first.
    video_controllers = [test_utils.dict_to_namedtuple(values=vc)
                         for vc in controllers]
    mock_client.list_video_controllers.return_value = video_controllers

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=True) as task:
        return_value = task.driver.inspect.inspect_hardware(task)

    self.node.refresh()
    self.assertEqual(expected_node_properties, self.node.properties)
    self.assertEqual(states.MANAGEABLE, return_value)
    self.assertEqual(2, mock_port_create.call_count)

@mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
                   autospec=True)
@mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True)
def test_inspect_hardware_no_gpu(self, mock_port_create,
                                 mock_get_drac_client):
    """Inspect a node for which no video controllers are reported.

    The mocked client returns an empty video controller list, so the
    ``pci_gpu_devices`` capability is expected to be ``0``.
    """
    drac = mock.Mock()
    drac.list_video_controllers.return_value = []
    drac.list_bios_settings.return_value = self.uefi_boot_settings
    drac.list_nics.return_value = self.nics
    drac.list_physical_disks.return_value = self.physical_disks
    drac.list_virtual_disks.return_value = []
    drac.list_cpus.return_value = self.cpus
    drac.list_memory.return_value = self.memory
    mock_get_drac_client.return_value = drac

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=True) as task:
        final_state = task.driver.inspect.inspect_hardware(task)

    self.node.refresh()
    wanted_properties = {
        'memory_mb': 32768,
        'local_gb': 279,
        'cpus': 18,
        'cpu_arch': 'x86_64',
        'capabilities': 'boot_mode:uefi,pci_gpu_devices:0'}
    self.assertEqual(wanted_properties, self.node.properties)
    self.assertEqual(states.MANAGEABLE, final_state)
    self.assertEqual(2, mock_port_create.call_count)

@mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
autospec=True)
@mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True)
@@ -255,7 +362,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'local_gb': 1116,
'cpus': 18,
'cpu_arch': 'x86_64',
'capabilities': 'boot_mode:uefi'}
'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'}
mock_client = mock.Mock()
mock_get_drac_client.return_value = mock_client
mock_client.list_memory.return_value = self.memory
@@ -263,6 +370,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_virtual_disks.return_value = self.virtual_disks
mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings
mock_client.list_video_controllers.return_value = \
self.video_controllers

mock_port_create.side_effect = exception.MACAlreadyExists("boom")



+ 8
- 0
releasenotes/notes/idrac-add-gpu-reporting-support-f4d80e2071f85f6a.yaml View File

@@ -0,0 +1,8 @@
---
features:
- |
Adds support to the ``idrac-wsman`` inspect interface for reporting the
number of GPU devices connected to a system. This information is advertised
through the capability ``pci_gpu_devices``, which can be used to make
scheduling decisions for the node. Currently, only NVIDIA Tesla T4 GPU
devices are counted.

Loading…
Cancel
Save