Add inspection hooks

Adds these inspection hooks in the agent inspect interface for
processing data received from the ramdisk at the
/v1/continue_inspection endpoint: 'accelerators', 'boot-mode',
'cpu-capabilities', and 'extra-hardware'.

Change-Id: I63a528eba15391292c841693d6a0cc2f3b683720
Story: #2010275
This commit is contained in:
Mahnoor Asghar 2023-08-08 13:10:34 -04:00
parent bcfddda517
commit 609ccc9037
11 changed files with 615 additions and 2 deletions

View File

@ -12,6 +12,8 @@
# License for the specific language governing permissions and limitations
# under the License.
import os
from oslo_config import cfg
from ironic.common.i18n import _
@ -29,6 +31,14 @@ VALID_KEEP_PORTS_VALUES = {
'present': _('keep only ports with MAC\'s present in the inventory'),
'added': _('keep only ports determined by the add_ports option'),
}
# Default value of the ``cpu_capabilities`` option: maps a CPU flag to the
# name of the node capability to set when that flag is present.
DEFAULT_CPU_FLAGS_MAPPING = {
    # Both 'vmx' and 'svm' map to the same capability.
    'vmx': 'cpu_vt',
    'svm': 'cpu_vt',
    'aes': 'cpu_aes',
    'pse': 'cpu_hugepages',
    'pdpe1gb': 'cpu_hugepages_1g',
    'smx': 'cpu_txt',
}
opts = [
cfg.IntOpt('status_check_period', default=60,
@ -75,14 +85,33 @@ opts = [
'run by default. In most cases, the operators will not '
'modify this. The default (somewhat conservative) hooks '
'will raise an exception in case the ramdisk reports an '
'error, validate interfaces in the inventory, and create'
' ports.')),
'error, validate interfaces in the inventory, create '
'ports and set the node\'s cpu architecture property.')),
cfg.StrOpt('hooks',
default='$default_hooks',
help=_('Comma-separated list of enabled hooks for processing '
'pipeline. The default for this is $default_hooks. '
'Hooks can be added before or after the defaults '
'like this: "prehook,$default_hooks,posthook".')),
cfg.StrOpt('known_accelerators',
default=os.path.join(
'$pybasedir',
'drivers/modules/inspector/hooks/known_accelerators.yaml'),
help=_('Path to the file which contains the known accelerator '
'devices, to be used by the "accelerators" inspection '
'hook.')),
cfg.DictOpt('cpu_capabilities',
            default=DEFAULT_CPU_FLAGS_MAPPING,
            # The help text is wrapped in _() so that it is marked for
            # translation, consistent with the neighbouring options.
            help=_('Mapping between a CPU flag and a node capability to set '
                   'if this CPU flag is present. This configuration option '
                   'is used by the "cpu-capabilities" inspection hook.')),
cfg.BoolOpt('extra_hardware_strict',
default=False,
help=_('If True, refuse to parse extra data (in plugin_data) '
'if at least one record is too short. Additionally, '
'remove the incoming "data" even if parsing failed. '
'This configuration option is used by the '
'"extra-hardware" inspection hook.'))
]

View File

@ -0,0 +1,80 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_config import cfg
from oslo_log import log as logging
import yaml
from ironic.drivers.modules.inspector.hooks import base
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
class AcceleratorsHook(base.InspectionHook):
    """Hook to set the node's accelerators property.

    The known accelerator devices are loaded once, when the hook is
    instantiated, from the YAML file named by the
    ``[inspector]known_accelerators`` option.
    """

    def __init__(self):
        """Load and validate the known accelerator devices.

        :raises: RuntimeError if the loaded data is malformed.
        """
        super().__init__()
        self._known_devices = {}
        with open(CONF.inspector.known_accelerators) as f:
            self._known_devices = yaml.safe_load(f)
        self._validate_known_devices()

    def _validate_known_devices(self):
        """Do a simple check against the data source.

        :raises: RuntimeError if the data is not a mapping with a
            ``pci_devices`` list, or if an entry of that list is not a
            mapping providing both ``vendor_id`` and ``device_id``.
        """
        # An empty file loads as None and a top-level list or scalar is not
        # a mapping; checking the type up front reports all of these cases
        # as RuntimeError instead of leaking a TypeError from the lookups
        # below.
        if (not isinstance(self._known_devices, dict)
                or 'pci_devices' not in self._known_devices):
            raise RuntimeError('Could not find pci_devices in the '
                               'configuration data.')
        if not isinstance(self._known_devices['pci_devices'], list):
            raise RuntimeError('pci_devices in the configuration file should '
                               'contain a list of devices.')
        for device in self._known_devices['pci_devices']:
            # A non-mapping entry has no .get(); treat it like an entry
            # that lacks the mandatory IDs.
            if (not isinstance(device, dict)
                    or not device.get('vendor_id')
                    or not device.get('device_id')):
                raise RuntimeError('One of the PCI devices in the '
                                   'configuration file is missing vendor_id '
                                   'or device_id.')

    def _find_accelerator(self, vendor_id, device_id):
        """Return the known device matching both IDs, or None."""
        for dev in self._known_devices['pci_devices']:
            if (dev['vendor_id'] == vendor_id
                    and dev['device_id'] == device_id):
                return dev
        return None

    def __call__(self, task, inventory, plugin_data):
        """Record the known accelerators reported in plugin_data.

        Every entry of ``plugin_data['pci_devices']`` whose ``vendor_id``
        and ``product_id`` match a known device is stored, together with
        its ``bus`` value as ``pci_address``, in the node's ``accelerators``
        property. The node is saved only when at least one match is found.
        """
        pci_devices = plugin_data.get('pci_devices', [])

        if not pci_devices:
            LOG.warning('Unable to process accelerator devices because no PCI '
                        'device information was received from the ramdisk for '
                        'node %s.', task.node.uuid)
            return

        accelerators = []
        for pci_dev in pci_devices:
            known_device = self._find_accelerator(pci_dev['vendor_id'],
                                                  pci_dev['product_id'])
            if known_device:
                # Copy, so that the shared known-devices data is not
                # modified when the PCI address is added.
                accelerator = dict(known_device)
                accelerator['pci_address'] = pci_dev['bus']
                accelerators.append(accelerator)

        if accelerators:
            LOG.info('Found the following accelerator devices for node %s: %s',
                     task.node.uuid, accelerators)
            task.node.set_property('accelerators', accelerators)
            task.node.save()
        else:
            LOG.info('No known accelerator devices found for node %s',
                     task.node.uuid)

View File

@ -0,0 +1,42 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_config import cfg
from oslo_log import log as logging
from ironic.common import utils
from ironic.drivers.modules.inspector.hooks import base
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
class BootModeHook(base.InspectionHook):
    """Hook to set the node's boot_mode capability in node properties."""

    def __call__(self, task, inventory, plugin_data):
        """Copy the inventory's current boot mode into the capabilities.

        Nothing is changed when the inventory carries no
        ``boot.current_boot_mode`` value.
        """
        node = task.node
        detected_mode = inventory.get('boot', {}).get('current_boot_mode')

        if detected_mode is None:
            LOG.warning('No boot mode information available for node %s',
                        node.uuid)
            return

        LOG.info('Boot mode is %s for node %s', detected_mode, node.uuid)

        current = node.properties.get('capabilities')
        updated = utils.get_updated_capabilities(
            current, {'boot_mode': detected_mode})
        LOG.debug('New capabilities for node %s: %s', node.uuid, updated)

        node.set_property('capabilities', updated)
        node.save()

View File

@ -0,0 +1,47 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_config import cfg
from oslo_log import log as logging
from ironic.common import utils
from ironic.drivers.modules.inspector.hooks import base
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
class CPUCapabilitiesHook(base.InspectionHook):
    """Hook to set node's capabilities based on cpu flags in the inventory."""

    def __call__(self, task, inventory, plugin_data):
        """Set a capability for each configured CPU flag that is present.

        The flag-to-capability mapping is read from the
        ``[inspector]cpu_capabilities`` option. Nothing is changed when the
        inventory carries no CPU flags.
        """
        node = task.node
        reported_flags = inventory.get('cpu', {}).get('flags')
        if not reported_flags:
            LOG.warning('No CPU flags available for node %s.', node.uuid)
            return

        present = set(reported_flags)
        detected = {
            capability: 'true'
            for flag, capability in CONF.inspector.cpu_capabilities.items()
            if flag in present
        }
        LOG.info('CPU capabilities for node %s: %s', node.uuid, detected)

        updated = utils.get_updated_capabilities(
            node.properties.get('capabilities'), detected)
        LOG.debug('New capabilities for node %s: %s', node.uuid, updated)

        node.set_property('capabilities', updated)
        node.save()

View File

@ -0,0 +1,83 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_config import cfg
from oslo_log import log as logging
from ironic.drivers.modules.inspector.hooks import base
LOG = logging.getLogger(__name__)
_ITEM_SIZE = 4
CONF = cfg.CONF
class ExtraHardwareHook(base.InspectionHook):
    """Hook to gather extra information about the node hardware."""

    def __call__(self, task, inventory, plugin_data):
        """Store extra hardware information in plugin_data['extra']

        Convert the extra collected data from the format of the
        hardware-detect tool (list of lists) to a nested dictionary. Remove
        the original ``data`` field from plugin_data, and save the converted
        data into a new field ``extra`` instead.

        Data that is not in the expected format is left untouched, unless
        strict mode is enabled, in which case it is removed.
        """
        if 'data' not in plugin_data:
            LOG.warning('No extra hardware information was received from the '
                        'ramdisk for node %s', task.node.uuid)
            return

        records = plugin_data['data']

        if self._is_valid_data(records):
            # Valid data is converted to dictionaries for rules processing
            # and stored in plugin_data['extra']; the raw list is then
            # dropped, as it is assumed unusable by rules.
            plugin_data['extra'] = self._convert(records, task)
            LOG.debug('Deleting \"data\" key from plugin data of node %s as '
                      'it is assumed unusable by inspection rules.',
                      task.node.uuid)
            del plugin_data['data']
            return

        LOG.warning('Extra hardware data was not in a recognised format, '
                    'and will not be forwarded to inspection rules for '
                    'node %s', task.node.uuid)
        if CONF.inspector.extra_hardware_strict:
            LOG.debug('Deleting \"data\" key from plugin data of node %s '
                      'as it is malformed and strict mode is on.',
                      task.node.uuid)
            del plugin_data['data']

    def _convert(self, records, task):
        """Build the nested dictionary from a list of records.

        Each record is read as ``[level0, level1, key, value]``. Empty
        records are skipped silently; a record that cannot be processed is
        logged and ignored. Values that can be parsed as integers are
        converted (the record itself is updated in place).
        """
        nested = {}
        for record in records:
            if not record:
                continue
            try:
                outer = nested.setdefault(record[0], {})
                inner = outer.setdefault(record[1], {})
                try:
                    record[3] = int(record[3])
                except (ValueError, TypeError):
                    # Not an integer: keep the value as received.
                    pass
                inner[record[2]] = record[3]
            except Exception as e:
                LOG.warning('Ignoring invalid extra data item %s for node %s. '
                            'Error: %s', record, task.node.uuid, e)
        return nested

    def _is_valid_data(self, data):
        """Return whether data is a list of lists.

        In strict mode every inner list must additionally have exactly
        ``_ITEM_SIZE`` elements.
        """
        if not isinstance(data, list):
            return False
        for item in data:
            if not isinstance(item, list):
                return False
            if (CONF.inspector.extra_hardware_strict
                    and len(item) != _ITEM_SIZE):
                return False
        return True

View File

@ -0,0 +1,9 @@
# Known accelerator devices, used by the "accelerators" inspection hook.
# Every entry must provide a vendor_id and a device_id; these are compared
# with the vendor_id and product_id of the PCI devices reported by the
# ramdisk.
pci_devices:
- vendor_id: "10de"
device_id: "1eb8"
type: GPU
device_info: NVIDIA Corporation Tesla T4
- vendor_id: "10de"
device_id: "1df6"
type: GPU
device_info: NVIDIA Corporation GV100GL

View File

@ -0,0 +1,80 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from ironic.conductor import task_manager
from ironic.conf import CONF
from ironic.drivers.modules.inspector.hooks import accelerators as \
accelerators_hook
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.objects import utils as obj_utils
# Plugin data with two PCI devices; only the second one matches an entry
# of _KNOWN_DEVICES.
_PLUGIN_DATA = {
    'pci_devices': [
        {
            'vendor_id': '8086',
            'product_id': '2922',
            'class': '010601',
            'revision': '02',
            'bus': '0000:00:1f.2',
        },
        {
            'vendor_id': '0de',
            'product_id': '1eb8',
            'class': '060400',
            'revision': '00',
            'bus': '0000:00:01.2',
        },
    ],
}
# Known-devices data injected into the hook under test in place of the
# contents of the configured YAML file.
_KNOWN_DEVICES = {
    'pci_devices': [
        {
            'vendor_id': '0de',
            'device_id': '1eb8',
            'type': 'GPU',
            'device_info': 'NVIDIA Corporation Tesla T4',
        },
        {
            'vendor_id': '10de',
            'device_id': '1df6',
            'type': 'GPU',
            'device_info': 'NVIDIA Corporation GV100GL',
        },
    ],
}
class AcceleratorsTestCase(db_base.DbTestCase):
    """Tests for the "accelerators" inspection hook."""

    def setUp(self):
        super().setUp()
        CONF.set_override('enabled_inspect_interfaces',
                          ['agent', 'no-inspect'])
        self.node = obj_utils.create_test_node(self.context,
                                               inspect_interface='agent')
        self.inventory = {'inventory': 'test_inventory'}
        self.plugin_data = _PLUGIN_DATA
        # Replace whatever was loaded from the configured file with the
        # fixture, so the test does not depend on the file's contents.
        self.accelerators_hook = accelerators_hook.AcceleratorsHook()
        self.accelerators_hook._known_devices = _KNOWN_DEVICES

    def test_accelerators(self):
        matching_device = {
            'vendor_id': '0de',
            'device_id': '1eb8',
            'type': 'GPU',
            'device_info': 'NVIDIA Corporation Tesla T4',
            'pci_address': '0000:00:01.2',
        }
        with task_manager.acquire(self.context, self.node.id) as task:
            self.accelerators_hook(task, self.inventory, self.plugin_data)
            self.node.refresh()
            stored = self.node.properties.get('accelerators', [])
            self.assertEqual(stored, [matching_device])

View File

@ -0,0 +1,37 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from ironic.conductor import task_manager
from ironic.conf import CONF
from ironic.drivers.modules.inspector.hooks import boot_mode as boot_mode_hook
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.objects import utils as obj_utils
class BootModeTestCase(db_base.DbTestCase):
    """Tests for the "boot-mode" inspection hook."""

    def setUp(self):
        super().setUp()
        CONF.set_override('enabled_inspect_interfaces',
                          ['agent', 'no-inspect'])
        self.node = obj_utils.create_test_node(self.context,
                                               inspect_interface='agent')
        self.inventory = {'boot': {'current_boot_mode': 'test-boot-mode'}}
        self.plugin_data = {'plugin_data': 'fake-plugin-data'}

    def test_boot_mode(self):
        with task_manager.acquire(self.context, self.node.id) as task:
            run_hook = boot_mode_hook.BootModeHook()
            run_hook(task, self.inventory, self.plugin_data)
            self.node.refresh()
            capabilities = self.node.properties.get('capabilities', '')
            self.assertEqual(capabilities, 'boot_mode:test-boot-mode')

View File

@ -0,0 +1,37 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from ironic.conductor import task_manager
from ironic.conf import CONF
from ironic.drivers.modules.inspector.hooks import cpu_capabilities as cpu_hook
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.objects import utils as obj_utils
class CPUCapabilitiesTestCase(db_base.DbTestCase):
    """Tests for the "cpu-capabilities" inspection hook."""

    def setUp(self):
        super().setUp()
        CONF.set_override('enabled_inspect_interfaces',
                          ['agent', 'no-inspect'])
        self.node = obj_utils.create_test_node(self.context,
                                               inspect_interface='agent')
        # 'aes' is listed twice on purpose: duplicates must not matter.
        self.inventory = {'cpu': {'flags': ['aes', 'aes', 'pdpe1gb']}}
        self.plugin_data = {'plugin_data': 'fake-plugin-data'}

    def test_cpu_capabilities(self):
        with task_manager.acquire(self.context, self.node.id) as task:
            run_hook = cpu_hook.CPUCapabilitiesHook()
            run_hook(task, self.inventory, self.plugin_data)
            self.node.refresh()
            capabilities = self.node.properties.get('capabilities', '')
            self.assertEqual(capabilities,
                             'cpu_aes:true,cpu_hugepages_1g:true')

View File

@ -0,0 +1,165 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from unittest import mock
from ironic.conductor import task_manager
from ironic.conf import CONF
from ironic.drivers.modules.inspector.hooks import extra_hardware as hook
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.objects import utils as obj_utils
# Well-formed extra hardware data: a list of four-element records.
_PLUGIN_DATA = {
    'data': [
        ['disk', 'logical', 'count', '1'],
        ['disk', 'vda', 'size', '11'],
        ['disk', 'vda', 'vendor', '0x1af4'],
        ['disk', 'vda', 'physical_block_size', '512'],
        ['disk', 'vda', 'rotational', '1'],
        ['system', 'product', 'name', 'RHEL'],
        ['system', 'product', 'vendor', 'Red Hat'],
        ['system', 'product', 'version', 'RHEL-9.2.0 PC'],
        ['system', 'product', 'uuid', 'afdd3896-de8d-4585-8214-627071e13552'],
        ['system', 'motherboard', 'name', 'RHEL'],
        ['system', 'motherboard', 'vendor', 'Red Hat'],
        ['system', 'motherboard', 'version', 'RHEL-9.2.0 PC'],
    ],
}
# Expected plugin data after the hook has converted _PLUGIN_DATA: numeric
# strings become integers and the raw 'data' key is gone.
_EXPECTED_PLUGIN_DATA = {
    'extra': {
        'disk': {
            'logical': {
                'count': 1,
            },
            'vda': {
                'size': 11,
                'vendor': '0x1af4',
                'physical_block_size': 512,
                'rotational': 1,
            },
        },
        'system': {
            'product': {
                'name': 'RHEL',
                'vendor': 'Red Hat',
                'version': 'RHEL-9.2.0 PC',
                'uuid': 'afdd3896-de8d-4585-8214-627071e13552',
            },
            'motherboard': {
                'name': 'RHEL',
                'vendor': 'Red Hat',
                'version': 'RHEL-9.2.0 PC',
            },
        },
    },
}
@mock.patch.object(hook.LOG, 'warning', autospec=True)
class ExtraHardwareTestCase(db_base.DbTestCase):
    """Tests for the "extra-hardware" inspection hook.

    ``LOG.warning`` of the hook module is patched for every test method and
    passed in as ``mock_warn``.
    """

    def setUp(self):
        super().setUp()
        CONF.set_override('enabled_inspect_interfaces',
                          ['agent', 'no-inspect'])
        self.node = obj_utils.create_test_node(self.context,
                                               inspect_interface='agent')
        self.inventory = {'inventory': 'fake-inventory'}
        # The hook modifies plugin_data and its records in place, so every
        # test gets its own copy rather than the module-level fixture.
        self.plugin_data = {
            'data': [list(record) for record in _PLUGIN_DATA['data']]}

    def test_valid_extra_hardware(self, mock_warn):
        with task_manager.acquire(self.context, self.node.id) as task:
            hook.ExtraHardwareHook().__call__(task, self.inventory,
                                              self.plugin_data)
            self.assertFalse(mock_warn.called)
            self.assertEqual(self.plugin_data, _EXPECTED_PLUGIN_DATA)

    def test_no_data_received(self, mock_warn):
        self.plugin_data = {'cats': 'meow'}
        with task_manager.acquire(self.context, self.node.id) as task:
            hook.ExtraHardwareHook().__call__(task, self.inventory,
                                              self.plugin_data)
            mock_warn.assert_called_once_with(
                'No extra hardware information was received from the ramdisk '
                'for node %s', task.node.uuid)
            self.assertEqual(self.plugin_data, {'cats': 'meow'})

    @mock.patch.object(hook.LOG, 'debug', autospec=True)
    def test_extra_hardware_with_errors(self, mock_debug, mock_warn):
        self.plugin_data = {'data':
                            [['memory', 'total', 'size', '4294967296'],
                             [],
                             ['cpu', 'physical', 'number', '1'],
                             ['cpu', 'physical', 'WUT'],
                             ['cpu', 'logical', 'number', '1']]
                            }
        with task_manager.acquire(self.context, self.node.id) as task:
            hook.ExtraHardwareHook().__call__(task, self.inventory,
                                              self.plugin_data)
            expected = {'extra': {
                'memory': {
                    'total': {
                        'size': 4294967296
                    }
                },
                'cpu': {
                    'physical': {
                        'number': 1
                    },
                    'logical': {
                        'number': 1
                    },
                }
            }}

            self.assertEqual(expected, self.plugin_data)
            # An empty list is not a warning, a bad record is.
            self.assertEqual(1, mock_warn.call_count)
            mock_debug.assert_called_once_with(
                'Deleting \"data\" key from plugin data of node %s as it is '
                'assumed unusable by inspection rules.', task.node.uuid)

    def test_invalid_data_strict_mode_off(self, mock_warn):
        self.plugin_data = {
            'data': [['memory', 'total', 'size', '4294967296'],
                     ['cpu', 'physical', 'number', '1'],
                     {'interface': 'eth1'}]}
        # Built separately from the input: comparing the input object with
        # itself could never detect an unwanted modification.
        expected = {
            'data': [['memory', 'total', 'size', '4294967296'],
                     ['cpu', 'physical', 'number', '1'],
                     {'interface': 'eth1'}]}
        with task_manager.acquire(self.context, self.node.id) as task:
            hook.ExtraHardwareHook().__call__(task, self.inventory,
                                              self.plugin_data)
            self.assertEqual(expected, self.plugin_data)
            mock_warn.assert_called_once_with(
                'Extra hardware data was not in a recognised format, and will '
                'not be forwarded to inspection rules for node %s',
                task.node.uuid)

    @mock.patch.object(hook.LOG, 'debug', autospec=True)
    def test_invalid_data_strict_mode_on(self, mock_debug, mock_warn):
        CONF.set_override('extra_hardware_strict', True, group='inspector')
        self.plugin_data = {
            'data': [['memory', 'total', 'size', '4294967296'],
                     ['cpu', 'physical', 'WUT']]
        }
        with task_manager.acquire(self.context, self.node.id) as task:
            hook.ExtraHardwareHook().__call__(task, self.inventory,
                                              self.plugin_data)
            self.assertEqual({}, self.plugin_data)
            mock_warn.assert_called_once_with(
                'Extra hardware data was not in a recognised format, and will '
                'not be forwarded to inspection rules for node %s',
                task.node.uuid)
            mock_debug.assert_called_once_with(
                'Deleting \"data\" key from plugin data of node %s as it is '
                'malformed and strict mode is on.', task.node.uuid)

View File

@ -201,6 +201,10 @@ ironic.inspection.hooks =
validate-interfaces = ironic.drivers.modules.inspector.hooks.validate_interfaces:ValidateInterfacesHook
ports = ironic.drivers.modules.inspector.hooks.ports:PortsHook
architecture = ironic.drivers.modules.inspector.hooks.architecture:ArchitectureHook
accelerators = ironic.drivers.modules.inspector.hooks.accelerators:AcceleratorsHook
boot-mode = ironic.drivers.modules.inspector.hooks.boot_mode:BootModeHook
cpu-capabilities = ironic.drivers.modules.inspector.hooks.cpu_capabilities:CPUCapabilitiesHook
extra-hardware = ironic.drivers.modules.inspector.hooks.extra_hardware:ExtraHardwareHook
[egg_info]
tag_build =