Add GPU reporting to idrac-wsman inspect interface

This patch adds reporting of the number of NVIDIA Tesla T4
devices connected to a system by discovering such devices
and reporting the count through the 'pci_gpu_devices' capability.

Change-Id: If713895f05f08a9827c4c085108abb3e388b2a2e
Story: 2008118
Task: 40839
Depends-On: https://review.opendev.org/#/c/750364/
This commit is contained in:
Mudit 2020-09-10 10:29:47 -04:00 committed by Richard Pioso
parent e2d0f3fd07
commit 101fc29686
5 changed files with 145 additions and 5 deletions

View File

@ -259,6 +259,7 @@ The inspection discovers the following properties:
Extra capabilities: Extra capabilities:
* ``boot_mode``: UEFI or BIOS boot mode. * ``boot_mode``: UEFI or BIOS boot mode.
* ``pci_gpu_devices``: number of supported GPU devices (currently NVIDIA Tesla T4) connected to the bare metal node.
It also creates baremetal ports for each NIC port detected in the system. It also creates baremetal ports for each NIC port detected in the system.
The ``idrac-wsman`` inspect interface discovers which NIC ports are The ``idrac-wsman`` inspect interface discovers which NIC ports are

View File

@ -7,7 +7,7 @@
proliantutils>=2.10.0 proliantutils>=2.10.0
pysnmp>=4.3.0,<5.0.0 pysnmp>=4.3.0,<5.0.0
python-scciclient>=0.8.0 python-scciclient>=0.8.0
python-dracclient>=3.1.0,<6.0.0 python-dracclient>=5.1.0,<6.0.0
python-xclarityclient>=0.1.6 python-xclarityclient>=0.1.6
# The Redfish hardware type uses the Sushy library # The Redfish hardware type uses the Sushy library

View File

@ -49,6 +49,8 @@ class DracRedfishInspect(redfish_inspect.RedfishInspect):
class DracWSManInspect(base.InspectInterface): class DracWSManInspect(base.InspectInterface):
_GPU_SUPPORTED_LIST = {"TU104GL [Tesla T4]"}
def get_properties(self): def get_properties(self):
"""Return the properties of the interface. """Return the properties of the interface.
@ -98,9 +100,12 @@ class DracWSManInspect(base.InspectInterface):
properties['cpu_arch'] = 'x86_64' if cpus[0].arch64 else 'x86' properties['cpu_arch'] = 'x86_64' if cpus[0].arch64 else 'x86'
bios_settings = client.list_bios_settings() bios_settings = client.list_bios_settings()
video_controllers = client.list_video_controllers()
current_capabilities = node.properties.get('capabilities', '') current_capabilities = node.properties.get('capabilities', '')
new_capabilities = { new_capabilities = {
'boot_mode': bios_settings["BootMode"].current_value.lower()} 'boot_mode': bios_settings["BootMode"].current_value.lower(),
'pci_gpu_devices': self._calculate_gpus(video_controllers)}
capabilties = utils.get_updated_capabilities(current_capabilities, capabilties = utils.get_updated_capabilities(current_capabilities,
new_capabilities) new_capabilities)
properties['capabilities'] = capabilties properties['capabilities'] = capabilties
@ -190,6 +195,23 @@ class DracWSManInspect(base.InspectInterface):
else: else:
return cpu.cores return cpu.cores
def _calculate_gpus(self, video_controllers):
"""Find actual GPU count.
This method reports number of NVIDIA Tesla T4 GPU devices present
on the server.
:param video_controllers: list of video controllers.
:returns: returns total gpu count.
"""
gpu_cnt = 0
for video_controller in video_controllers:
for gpu in self._GPU_SUPPORTED_LIST:
if video_controller.description == gpu:
gpu_cnt += 1
return gpu_cnt
def _get_pxe_dev_nics(self, client, nics, node): def _get_pxe_dev_nics(self, client, nics, node):
"""Get a list of pxe device interfaces. """Get a list of pxe device interfaces.

View File

@ -135,6 +135,23 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'PxeDev4Interface': None} 'PxeDev4Interface': None}
nic_settings = {'LegacyBootProto': {'current_value': 'PXE'}, nic_settings = {'LegacyBootProto': {'current_value': 'PXE'},
'FQDD': 'NIC.Embedded.1-1-1'} 'FQDD': 'NIC.Embedded.1-1-1'}
video_controllers = [
{'id': 'Video.Embedded.1-1',
'description': 'Integrated Matrox G200eW3 Graphics Controller',
'function_number': 0,
'manufacturer': 'Matrox Electronics Systems Ltd.',
'pci_device_id': '0536',
'pci_vendor_id': '102B',
'pci_subdevice_id': '0737',
'pci_subvendor_id': '1028'},
{'id': 'Video.Slot.7-1',
'description': 'TU104GL [Tesla T4]',
'function_number': 0,
'manufacturer': 'NVIDIA Corporation',
'pci_device_id': '1EB8',
'pci_vendor_id': '10DE',
'pci_subdevice_id': '12A2',
'pci_subvendor_id': '10DE'}]
self.memory = [test_utils.dict_to_namedtuple(values=m) for m in memory] self.memory = [test_utils.dict_to_namedtuple(values=m) for m in memory]
self.cpus = [test_utils.dict_to_namedtuple(values=c) for c in cpus] self.cpus = [test_utils.dict_to_namedtuple(values=c) for c in cpus]
@ -146,6 +163,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
self.bios_boot_settings = test_utils.dict_of_object(bios_boot_settings) self.bios_boot_settings = test_utils.dict_of_object(bios_boot_settings)
self.uefi_boot_settings = test_utils.dict_of_object(uefi_boot_settings) self.uefi_boot_settings = test_utils.dict_of_object(uefi_boot_settings)
self.nic_settings = test_utils.dict_of_object(nic_settings) self.nic_settings = test_utils.dict_of_object(nic_settings)
self.video_controllers = [test_utils.dict_to_namedtuple(values=vc)
for vc in video_controllers]
def test_get_properties(self): def test_get_properties(self):
expected = drac_common.COMMON_PROPERTIES expected = drac_common.COMMON_PROPERTIES
@ -161,7 +180,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'local_gb': 1116, 'local_gb': 1116,
'cpus': 18, 'cpus': 18,
'cpu_arch': 'x86_64', 'cpu_arch': 'x86_64',
'capabilities': 'boot_mode:uefi'} 'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'}
mock_client = mock.Mock() mock_client = mock.Mock()
mock_get_drac_client.return_value = mock_client mock_get_drac_client.return_value = mock_client
mock_client.list_memory.return_value = self.memory mock_client.list_memory.return_value = self.memory
@ -169,6 +188,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_virtual_disks.return_value = self.virtual_disks mock_client.list_virtual_disks.return_value = self.virtual_disks
mock_client.list_nics.return_value = self.nics mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings mock_client.list_bios_settings.return_value = self.uefi_boot_settings
mock_client.list_video_controllers.return_value = \
self.video_controllers
with task_manager.acquire(self.context, self.node.uuid, with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task: shared=True) as task:
@ -191,6 +212,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_virtual_disks.side_effect = ( mock_client.list_virtual_disks.side_effect = (
drac_exceptions.BaseClientException('boom')) drac_exceptions.BaseClientException('boom'))
mock_client.list_bios_settings.return_value = self.bios_boot_settings mock_client.list_bios_settings.return_value = self.bios_boot_settings
mock_client.list_video_controllers.return_value = \
self.video_controllers
with task_manager.acquire(self.context, self.node.uuid, with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task: shared=True) as task:
@ -207,7 +230,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'local_gb': 279, 'local_gb': 279,
'cpus': 18, 'cpus': 18,
'cpu_arch': 'x86_64', 'cpu_arch': 'x86_64',
'capabilities': 'boot_mode:uefi'} 'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'}
mock_client = mock.Mock() mock_client = mock.Mock()
mock_get_drac_client.return_value = mock_client mock_get_drac_client.return_value = mock_client
mock_client.list_memory.return_value = self.memory mock_client.list_memory.return_value = self.memory
@ -216,6 +239,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_physical_disks.return_value = self.physical_disks mock_client.list_physical_disks.return_value = self.physical_disks
mock_client.list_nics.return_value = self.nics mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings mock_client.list_bios_settings.return_value = self.uefi_boot_settings
mock_client.list_video_controllers.return_value = \
self.video_controllers
with task_manager.acquire(self.context, self.node.uuid, with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task: shared=True) as task:
@ -239,12 +264,94 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_physical_disks.return_value = self.physical_disks mock_client.list_physical_disks.return_value = self.physical_disks
mock_client.list_nics.return_value = self.nics mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings mock_client.list_bios_settings.return_value = self.uefi_boot_settings
mock_client.list_video_controllers.return_value = \
self.video_controllers
with task_manager.acquire(self.context, self.node.uuid, with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task: shared=True) as task:
self.assertRaises(exception.HardwareInspectionFailure, self.assertRaises(exception.HardwareInspectionFailure,
task.driver.inspect.inspect_hardware, task) task.driver.inspect.inspect_hardware, task)
@mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
                   autospec=True)
@mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True)
def test_inspect_hardware_no_supported_gpu(self, mock_port_create,
                                           mock_get_drac_client):
    """Inspection reports zero GPUs when no controller is supported.

    The mocked client returns two video controllers whose descriptions
    differ from the supported one, so the expected capabilities contain
    ``pci_gpu_devices:0``.
    """
    controllers = [
        {'id': 'Video.Embedded.1-1',
         'description': 'Integrated Matrox G200eW3 Graphics Controller',
         'function_number': 0,
         'manufacturer': 'Matrox Electronics Systems Ltd.',
         'pci_device_id': '0536',
         'pci_vendor_id': '102B',
         'pci_subdevice_id': '0737',
         'pci_subvendor_id': '1028'},
        # An NVIDIA card that is deliberately not the supported model.
        {'id': 'Video.Slot.7-1',
         'description': 'GV100GL [Tesla V100 PCIe 16GB]',
         'function_number': 0,
         'manufacturer': 'NVIDIA Corporation',
         'pci_device_id': '1DB4',
         'pci_vendor_id': '10DE',
         'pci_subdevice_id': '1214',
         'pci_subvendor_id': '10DE'}]

    expected_node_properties = {
        'memory_mb': 32768,
        'local_gb': 279,
        'cpus': 18,
        'cpu_arch': 'x86_64',
        'capabilities': 'boot_mode:uefi,pci_gpu_devices:0'}
    mock_client = mock.Mock()
    mock_get_drac_client.return_value = mock_client
    mock_client.list_memory.return_value = self.memory
    mock_client.list_cpus.return_value = self.cpus
    mock_client.list_virtual_disks.return_value = []
    mock_client.list_physical_disks.return_value = self.physical_disks
    mock_client.list_nics.return_value = self.nics
    mock_client.list_bios_settings.return_value = self.uefi_boot_settings
    video_controllers = [test_utils.dict_to_namedtuple(values=vc)
                         for vc in controllers]
    mock_client.list_video_controllers.return_value = video_controllers

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=True) as task:
        return_value = task.driver.inspect.inspect_hardware(task)

    self.node.refresh()
    self.assertEqual(expected_node_properties, self.node.properties)
    self.assertEqual(states.MANAGEABLE, return_value)
    self.assertEqual(2, mock_port_create.call_count)
@mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
                   autospec=True)
@mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True)
def test_inspect_hardware_no_gpu(self, mock_port_create,
                                 mock_get_drac_client):
    """Inspection reports zero GPUs when there are no video controllers.

    The mocked client returns an empty video controller list, so the
    expected capabilities contain ``pci_gpu_devices:0``.
    """
    # Stub every inventory call made on the client; the video controller
    # list is empty and there are no virtual disks.
    drac_client = mock.Mock()
    drac_client.list_video_controllers.return_value = []
    drac_client.list_bios_settings.return_value = self.uefi_boot_settings
    drac_client.list_nics.return_value = self.nics
    drac_client.list_physical_disks.return_value = self.physical_disks
    drac_client.list_virtual_disks.return_value = []
    drac_client.list_cpus.return_value = self.cpus
    drac_client.list_memory.return_value = self.memory
    mock_get_drac_client.return_value = drac_client

    expected_node_properties = {
        'capabilities': 'boot_mode:uefi,pci_gpu_devices:0',
        'cpu_arch': 'x86_64',
        'cpus': 18,
        'local_gb': 279,
        'memory_mb': 32768,
    }

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=True) as task:
        result_state = task.driver.inspect.inspect_hardware(task)

    self.node.refresh()
    self.assertEqual(expected_node_properties, self.node.properties)
    self.assertEqual(states.MANAGEABLE, result_state)
    self.assertEqual(2, mock_port_create.call_count)
@mock.patch.object(drac_common, 'get_drac_client', spec_set=True, @mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
autospec=True) autospec=True)
@mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True) @mock.patch.object(objects.Port, 'create', spec_set=True, autospec=True)
@ -255,7 +362,7 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
'local_gb': 1116, 'local_gb': 1116,
'cpus': 18, 'cpus': 18,
'cpu_arch': 'x86_64', 'cpu_arch': 'x86_64',
'capabilities': 'boot_mode:uefi'} 'capabilities': 'boot_mode:uefi,pci_gpu_devices:1'}
mock_client = mock.Mock() mock_client = mock.Mock()
mock_get_drac_client.return_value = mock_client mock_get_drac_client.return_value = mock_client
mock_client.list_memory.return_value = self.memory mock_client.list_memory.return_value = self.memory
@ -263,6 +370,8 @@ class DracInspectionTestCase(test_utils.BaseDracTest):
mock_client.list_virtual_disks.return_value = self.virtual_disks mock_client.list_virtual_disks.return_value = self.virtual_disks
mock_client.list_nics.return_value = self.nics mock_client.list_nics.return_value = self.nics
mock_client.list_bios_settings.return_value = self.uefi_boot_settings mock_client.list_bios_settings.return_value = self.uefi_boot_settings
mock_client.list_video_controllers.return_value = \
self.video_controllers
mock_port_create.side_effect = exception.MACAlreadyExists("boom") mock_port_create.side_effect = exception.MACAlreadyExists("boom")

View File

@ -0,0 +1,8 @@
---
features:
- |
Adds support to the ``idrac-wsman`` inspect interface for reporting the
number of GPU devices connected to a system. This information is advertised
through the ``pci_gpu_devices`` capability, which can be used to make
scheduling decisions for the node. Currently, only NVIDIA Tesla T4 GPU
devices are counted.