From d3fd725ef98a0c8f3e1a2641381d95d335a0c9aa Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Thu, 13 Jun 2024 12:35:46 +0200 Subject: [PATCH] Fix and document redfish metrics Provides complete documentation for the metrics that the Redfish management interface can collect. The Power payload refers to InputRanges in a broken way: this field is a list, but the code treats it as a singular resource. No hardware I have access to provides it this way. Since input ranges are constants and thus arguably don't qualify as runtime metrics, they are removed instead of being fixed. Change-Id: Ida1be1341346df917073e649a23a2f116b262e66 --- doc/source/admin/drivers/redfish.rst | 4 +- doc/source/admin/drivers/redfish/metrics.rst | 181 ++++++++++++++++++ ironic/drivers/modules/redfish/management.py | 4 - .../modules/redfish/test_management.py | 13 -- 4 files changed, 184 insertions(+), 18 deletions(-) create mode 100644 doc/source/admin/drivers/redfish/metrics.rst diff --git a/doc/source/admin/drivers/redfish.rst b/doc/source/admin/drivers/redfish.rst index 609791adf3..4985796ba4 100644 --- a/doc/source/admin/drivers/redfish.rst +++ b/doc/source/admin/drivers/redfish.rst @@ -22,7 +22,8 @@ Redfish_ protocol. Supported features include: * Retrieving and changing :ref:`BIOS settings `. * Applying :doc:`firmware updates `. * Configuring :doc:`hardware RAID `. -* Hardware metrics and integration with `ironic-prometheus-exporter +* :doc:`Hardware metrics <redfish/metrics>` and integration with + `ironic-prometheus-exporter `_. * Event notifications configured via :doc:`redfish/passthru`. @@ -420,6 +421,7 @@ Further topics .. 
toctree:: + redfish/metrics redfish/passthru redfish/session-cache redfish/interop diff --git a/doc/source/admin/drivers/redfish/metrics.rst b/doc/source/admin/drivers/redfish/metrics.rst new file mode 100644 index 0000000000..9fc35b76f8 --- /dev/null +++ b/doc/source/admin/drivers/redfish/metrics.rst @@ -0,0 +1,181 @@ +Redfish hardware metrics +======================== + +The ``redfish`` hardware type supports sending hardware metrics via the +:doc:`notification system </admin/notifications>`. The ``event_type`` field of +a notification will be set to ``hardware.redfish.metrics`` (where ``redfish`` +may be replaced by a different driver name for hardware types derived from it). + +The payload of each notification is a mapping where keys are sensor types +(``Fan``, ``Temperature``, ``Power`` or ``Drive``) and values are also mappings +from sensor identifiers to the sensor data. + +Each ``Fan`` payload contains the following fields: + +* ``max_reading_range``, ``min_reading_range`` - the range of reading values. +* ``reading``, ``reading_units`` - the current reading and its units. +* ``serial_number`` - the serial number of the fan sensor. +* ``physical_context`` - the context of the sensor, such as ``SystemBoard``. + Can also be ``null`` or just ``Fan``. + +Each ``Temperature`` payload contains the following fields: + +* ``max_reading_range_temp``, ``min_reading_range_temp`` - the range of reading + values. +* ``reading_celsius`` - the current reading in degrees Celsius. +* ``sensor_number`` - the number of the temperature sensor. +* ``physical_context`` - the context of the sensor, usually reflecting its + location, such as ``CPU``, ``Memory``, ``Intake``, ``PowerSupply`` or + ``SystemBoard``. Can also be ``null``. + +Each ``Power`` payload contains the following fields: + +* ``power_capacity_watts``, ``line_input_voltage``, ``last_power_output_watts`` +* ``serial_number`` - the serial number of the power source. 
+* ``state`` - the power source state: ``enabled``, ``absent`` (``null`` if + unknown). +* ``health`` - the power source health status: ``ok``, ``warning``, + ``critical`` (``null`` if unknown). + +Each ``Drive`` payload contains the following fields: + +* ``name`` - the drive name in the BMC (this is **not** a Linux device name + like ``/dev/sda``). +* ``model`` - the drive model (if known). +* ``capacity_bytes`` - the drive capacity in bytes. +* ``state`` - the drive state: ``enabled``, ``absent`` (``null`` if unknown). +* ``health`` - the drive health status: ``ok``, ``warning``, ``critical`` + (``null`` if unknown). + +.. note:: + Drive payloads are often not available on real hardware. + +.. warning:: + Metrics collection works by polling several Redfish endpoints on the target + BMC. Some older BMC implementations may have hard rate limits or misbehave + under load. If this is the case for you, you need to reduce the metrics + collection frequency or completely disable it. + +Example (Dell) +-------------- + +.. 
code-block:: json + + { + "message_id": "578628d2-9967-4d33-97ca-7e7c27a76abc", + "publisher_id": "conductor-1.example.com", + "event_type": "hardware.redfish.metrics", + "priority": "INFO", + "payload": { + "message_id": "60653d54-87aa-43b8-a4ed-96d568dd4e96", + "instance_uuid": null, + "node_uuid": "aea161dc-2e96-4535-b003-ca70a4a7bb6d", + "timestamp": "2023-10-22T15:50:26.841964", + "node_name": "dell-430", + "event_type": "hardware.redfish.metrics.update", + "payload": { + "Fan": { + "0x17||Fan.Embedded.1A@System.Embedded.1": { + "identity": "0x17||Fan.Embedded.1A", + "max_reading_range": null, + "min_reading_range": 720, + "reading": 1680, + "reading_units": "RPM", + "serial_number": null, + "physical_context": "SystemBoard", + "state": "enabled", + "health": "ok" + }, + "0x17||Fan.Embedded.2A@System.Embedded.1": { + "identity": "0x17||Fan.Embedded.2A", + "max_reading_range": null, + "min_reading_range": 720, + "reading": 3120, + "reading_units": "RPM", + "serial_number": null, + "physical_context": "SystemBoard", + "state": "enabled", + "health": "ok" + }, + "0x17||Fan.Embedded.2B@System.Embedded.1": { + "identity": "0x17||Fan.Embedded.2B", + "max_reading_range": null, + "min_reading_range": 720, + "reading": 3000, + "reading_units": "RPM", + "serial_number": null, + "physical_context": "SystemBoard", + "state": "enabled", + "health": "ok" + } + }, + "Temperature": { + "iDRAC.Embedded.1#SystemBoardInletTemp@System.Embedded.1": { + "identity": "iDRAC.Embedded.1#SystemBoardInletTemp", + "max_reading_range_temp": 47, + "min_reading_range_temp": -7, + "reading_celsius": 28, + "physical_context": "SystemBoard", + "sensor_number": 4, + "state": "enabled", + "health": "ok" + }, + "iDRAC.Embedded.1#CPU1Temp@System.Embedded.1": { + "identity": "iDRAC.Embedded.1#CPU1Temp", + "max_reading_range_temp": 90, + "min_reading_range_temp": 3, + "reading_celsius": 63, + "physical_context": "CPU", + "sensor_number": 14, + "state": "enabled", + "health": "ok" + } + }, + "Power": 
{ + "PSU.Slot.1:Power@System.Embedded.1": { + "power_capacity_watts": null, + "line_input_voltage": 206, + "last_power_output_watts": null, + "serial_number": "CNLOD0075324D7", + "state": "enabled", + "health": "ok" + }, + "PSU.Slot.2:Power@System.Embedded.1": { + "power_capacity_watts": null, + "line_input_voltage": null, + "last_power_output_watts": null, + "serial_number": "CNLOD0075324E5", + "state": null, + "health": "critical" + } + }, + "Drive": { + "Solid State Disk 0:1:0:RAID.Integrated.1-1@System.Embedded.1": { + "name": "Solid State Disk 0:1:0", + "capacity_bytes": 479559942144, + "state": "enabled", + "health": "ok" + }, + "Physical Disk 0:1:1:RAID.Integrated.1-1@System.Embedded.1": { + "name": "Physical Disk 0:1:1", + "capacity_bytes": 1799725514752, + "state": "enabled", + "health": "ok" + }, + "Physical Disk 0:1:2:RAID.Integrated.1-1@System.Embedded.1": { + "name": "Physical Disk 0:1:2", + "capacity_bytes": 1799725514752, + "state": "enabled", + "health": "ok" + }, + "Backplane 1 on Connector 0 of Integrated RAID Controller 1:RAID.Integrated.1-1@System.Embedded.1": { + "name": "Backplane 1 on Connector 0 of Integrated RAID Controller 1", + "capacity_bytes": null, + "state": "enabled", + "health": "ok" + } + } + } + }, + "timestamp": "2023-10-22 15:50:36.700458" + } diff --git a/ironic/drivers/modules/redfish/management.py b/ironic/drivers/modules/redfish/management.py index e4715d95bc..682154719f 100644 --- a/ironic/drivers/modules/redfish/management.py +++ b/ironic/drivers/modules/redfish/management.py @@ -500,10 +500,6 @@ class RedfishManagement(base.ManagementInterface): 'line_input_voltage', 'last_power_output_watts', 'serial_number') sensor.update(cls._sensor2dict(power.status, 'state', 'health')) - sensor.update(cls._sensor2dict( - power.input_ranges, 'minimum_voltage', - 'maximum_voltage', 'minimum_frequency_hz', - 'maximum_frequency_hz', 'output_wattage')) unique_name = '%s:%s@%s' % ( power.identity, chassis.power.identity, chassis.identity) 
diff --git a/ironic/tests/unit/drivers/modules/redfish/test_management.py b/ironic/tests/unit/drivers/modules/redfish/test_management.py index a24b89bd6d..043f117641 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_management.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_management.py @@ -649,13 +649,6 @@ class RedfishManagementTestCase(db_base.DbTestCase): 'power_capacity_watts': 1450, 'last_power_output_watts': 650, 'line_input_voltage': 220, - 'input_ranges': { - 'minimum_voltage': 185, - 'maximum_voltage': 250, - 'minimum_frequency_hz': 47, - 'maximum_frequency_hz': 63, - 'output_wattage': 1450 - }, 'serial_number': 'SN010203040506', "status": { "state": "enabled", @@ -669,7 +662,6 @@ class RedfishManagementTestCase(db_base.DbTestCase): mock_psu = mock.MagicMock(**attributes) mock_psu.name = attributes['name'] mock_psu.status = mock.MagicMock(**attributes['status']) - mock_psu.input_ranges = mock.MagicMock(**attributes['input_ranges']) mock_power.power_supplies = [mock_psu] with task_manager.acquire(self.context, self.node.uuid, @@ -681,11 +673,6 @@ class RedfishManagementTestCase(db_base.DbTestCase): 'health': 'OK', 'last_power_output_watts': 650, 'line_input_voltage': 220, - 'maximum_frequency_hz': 63, - 'maximum_voltage': 250, - 'minimum_frequency_hz': 47, - 'minimum_voltage': 185, - 'output_wattage': 1450, 'power_capacity_watts': 1450, 'serial_number': 'SN010203040506', 'state': 'enabled'