Add Redfish metrics support

Added a parsing module to process Redfish-originated metrics
and submit them to Prometheus.

Change-Id: I1c751041488366304c92d4df07cb8a62dcb371fa
This commit is contained in:
Ilya Etingof 2019-09-06 17:31:42 +02:00
parent 46e50fb56f
commit 9959cf36bc
6 changed files with 501 additions and 12 deletions

View File

@ -15,6 +15,7 @@ import os
from ironic_prometheus_exporter.parsers import ipmi
from ironic_prometheus_exporter.parsers import header
from ironic_prometheus_exporter.parsers import redfish
from oslo_config import cfg
from oslo_messaging.notify import notifier
from prometheus_client import write_to_textfile, CollectorRegistry
@ -43,14 +44,23 @@ class PrometheusFileDriver(notifier.Driver):
def notify(self, ctxt, message, priority, retry):
try:
if message['event_type'] == 'hardware.ipmi.metrics':
registry = CollectorRegistry()
node_message = message['payload']
header.timestamp_registry(node_message, registry)
registry = CollectorRegistry()
event_type = message['event_type']
node_message = message['payload']
header.timestamp_registry(node_message, registry)
if event_type == 'hardware.ipmi.metrics':
ipmi.category_registry(node_message, registry)
nodeFile = os.path.join(self.location,
node_message['node_name'])
write_to_textfile(nodeFile, registry)
elif event_type == 'hardware.redfish.metrics':
redfish.category_registry(node_message, registry)
nodeFile = os.path.join(
self.location,
node_message['node_name'] + '-' + event_type)
write_to_textfile(nodeFile, registry)
except Exception as e:
LOG.error(e)

View File

@ -0,0 +1,256 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
import logging
from prometheus_client import Gauge
LOG = logging.getLogger(__name__)
def _build_labels(node_message):
    """Collect the node-identifying Prometheus labels from a message.

    :param node_message: Oslo notification message
    :returns: new dict holding the ``node_name``, ``node_uuid`` and
        ``instance_uuid`` values copied from the message
    :raises KeyError: if any of these keys is absent from the message
    """
    labels = {}
    for field in ('node_name', 'node_uuid', 'instance_uuid'):
        labels[field] = node_message[field]
    return labels
def build_temperature_metrics(node_message):
    """Build Prometheus temperature metrics from Oslo message.

    Takes Oslo notification message carrying Redfish sensor data and
    produces a data structure suitable for submitting to Prometheus.

    :param node_message: Oslo notification message
    :returns: mapping of metric name to a list of ``(value, labels)``
        tuples, one tuple per temperature sensor that has a reading

    Example:

    .. code-block:: python

        {
            # metric name
            'baremetal_temp_cpu_celsius': [
                (
                    # metric value
                    42,
                    # metric instance in form of Prometheus labels
                    {
                        'node_name': 'kninode',
                        'node_uuid': 'XXX-YYY-ZZZ',
                        'instance_uuid': 'ZZZ-YYY-XXX',
                        'entity_id': 'CPU',
                        'sensor_id': 1
                    }
                )
            ]
        }
    """
    payload = node_message
    for key in ('payload', 'Temperature'):
        payload = payload.get(key, {})

    metrics = collections.defaultdict(list)

    # The payload keys are not used for temperature sensors; the sensor is
    # identified by its ``sensor_number`` instead.
    for sensor_data in payload.values():
        value = sensor_data['reading_celsius']
        if value is None:
            # A gauge cannot hold "no reading"; skip this sensor rather
            # than fail the whole node's metrics when the value is set.
            continue

        physical_context = sensor_data['physical_context']
        metric = 'baremetal_temp_%s_celsius' % physical_context.lower()

        labels = _build_labels(node_message)
        labels['entity_id'] = physical_context
        labels['sensor_id'] = sensor_data['sensor_number']

        metrics[metric].append((value, labels))

    return metrics
def build_power_metrics(node_message):
    """Build Prometheus power metrics from Oslo message.

    Takes Oslo notification message carrying Redfish sensor data and
    produces a data structure suitable for submitting to Prometheus.

    :param node_message: Oslo notification message
    :returns: mapping of metric name to a list of ``(value, labels)``
        tuples, one tuple per power supply

    Example:

    .. code-block:: python

        {
            # metric name
            'baremetal_power_status': [
                (
                    # metric value (0 - OK, 1 - on fire)
                    0,
                    # metric instance in form of Prometheus labels
                    {
                        'node_name': 'kninode',
                        'node_uuid': 'XXX-YYY-ZZZ',
                        'instance_uuid': 'ZZZ-YYY-XXX',
                        'entity_id': 'PSU',
                        'sensor_id': '0:Power@ZZZ-YYY-XXX'
                    }
                )
            ]
        }
    """
    payload = node_message
    for key in ('payload', 'Power'):
        payload = payload.get(key, {})

    metrics = collections.defaultdict(list)

    for sensor_id, sensor_data in payload.items():
        labels = _build_labels(node_message)
        labels['entity_id'] = 'PSU'
        # Power supplies are identified by their key in the payload.
        labels['sensor_id'] = sensor_id

        # Any health other than 'OK' is reported as a failure.
        value = 0 if sensor_data['health'] == 'OK' else 1

        metrics['baremetal_power_status'].append((value, labels))

    return metrics
def build_fan_metrics(node_message):
    """Build Prometheus fan metrics from Oslo message.

    Takes Oslo notification message carrying Redfish sensor data and
    produces a data structure suitable for submitting to Prometheus.

    :param node_message: Oslo notification message
    :returns: mapping of metric name to a list of ``(value, labels)``
        tuples, one tuple per fan

    Example:

    .. code-block:: python

        {
            # metric name
            'baremetal_fan_status': [
                (
                    # metric value (0 - OK, 1 - on fire)
                    0,
                    # metric instance in form of Prometheus labels
                    {
                        'node_name': 'kninode',
                        'node_uuid': 'XXX-YYY-ZZZ',
                        'instance_uuid': 'ZZZ-YYY-XXX',
                        'entity_id': 'CPU',
                        'sensor_id': 'XXX-YYY-ZZZ'
                    }
                )
            ]
        }
    """
    payload = node_message
    for key in ('payload', 'Fan'):
        payload = payload.get(key, {})

    metrics = collections.defaultdict(list)

    # The payload keys are not used for fans; the fan is identified by its
    # ``identity`` field instead.
    for sensor_data in payload.values():
        labels = _build_labels(node_message)
        labels['entity_id'] = sensor_data['physical_context']
        labels['sensor_id'] = sensor_data['identity']

        # Any health other than 'OK' is reported as a failure.
        value = 0 if sensor_data['health'] == 'OK' else 1

        metrics['baremetal_fan_status'].append((value, labels))

    return metrics
def build_drive_metrics(node_message):
    """Build Prometheus drive metrics from Oslo message.

    Takes Oslo notification message carrying Redfish sensor data and
    produces a data structure suitable for submitting to Prometheus.

    :param node_message: Oslo notification message
    :returns: mapping of metric name to a list of ``(value, labels)``
        tuples, one tuple per drive

    Example:

    .. code-block:: python

        {
            # metric name
            'baremetal_drive_status': [
                (
                    # metric value (0 - OK, 1 - on fire)
                    0,
                    # metric instance in form of Prometheus labels
                    {
                        'node_name': 'kninode',
                        'node_uuid': 'XXX-YYY-ZZZ',
                        'instance_uuid': 'ZZZ-YYY-XXX',
                        'entity_id': 'HDD',
                        'sensor_id': '32ADF365C6C1B7BD:XXX-YYY-ZZZ@ZZZ-YYY-XXX'
                    }
                )
            ]
        }
    """
    payload = node_message
    for key in ('payload', 'Drive'):
        payload = payload.get(key, {})

    metrics = collections.defaultdict(list)

    for sensor_id, sensor_data in payload.items():
        labels = _build_labels(node_message)
        labels['entity_id'] = 'HDD'
        # Drives are identified by their key in the payload.
        labels['sensor_id'] = sensor_id

        # Any health other than 'OK' is reported as a failure.
        value = 0 if sensor_data['health'] == 'OK' else 1

        metrics['baremetal_drive_status'].append((value, labels))

    return metrics
def category_registry(node_message, metrics_registry):
    """Parse Redfish metrics and submit them to Prometheus

    Builds the temperature, power, fan and drive metrics from the message
    and records every sample in a gauge registered with the given registry.

    :param node_message: Oslo notification message
    :param metrics_registry: Prometheus registry
    """
    metrics = build_temperature_metrics(node_message)
    metrics.update(build_power_metrics(node_message))
    metrics.update(build_fan_metrics(node_message))
    metrics.update(build_drive_metrics(node_message))

    for metric, details in metrics.items():
        if not details:
            continue

        # Register exactly one gauge per metric name. Creating it per sample
        # would register the same name twice as soon as a metric has two
        # samples (e.g. two fans), which the registry rejects as a
        # duplicated timeseries.
        label_names = list(details[0][1])
        gauge = Gauge(metric, '', labelnames=label_names,
                      registry=metrics_registry)

        for value, labels in details:
            gauge.labels(**labels).set(value)

View File

@ -0,0 +1,67 @@
{
"priority": "INFO",
"event_type": "hardware.redfish.metrics",
"timestamp": "2019-03-29 20:12:26.885347",
"publisher_id": "None.localhost.localdomain",
"payload": {
"instance_uuid": "ac2aa2fd-6e1a-41c8-a114-2084c8705228",
"node_uuid": "ac2aa2fd-6e1a-41c8-a114-2084c8705228",
"event_type": "hardware.redfish.metrics.update",
"timestamp": "2019-03-29T20:12:22.989020",
"node_name": "knilab-master-u9",
"message_id": "85d6b2c8-fe57-432d-868a-330e0e28cf34",
"payload": {
"Temperature": {
"XXX-YYY-ZZZ@ZZZ-YYY-XXX": {
"identity": "XXX-YYY-ZZZ",
"max_reading_range_temp": 120,
"min_reading_range_temp": 0,
"physical_context": "CPU",
"reading_celsius": 62,
"sensor_number": 1,
"health": "OK",
"state": "enabled"
}
},
"Power": {
"0:Power@ZZZ-YYY-XXX": {
"health": "OK",
"last_power_output_watts": 650,
"line_input_voltage": 220,
"maximum_frequency_hz": 63,
"maximum_voltage": 250,
"minimum_frequency_hz": 47,
"minimum_voltage": 185,
"output_wattage": 1450,
"power_capacity_watts": 1450,
"serial_number": "SN010203040506",
"state": "enabled"
}
},
"Fan": {
"XXX-YYY-ZZZ@ZZZ-YYY-XXX": {
"identity": "XXX-YYY-ZZZ",
"max_reading_range": 10000,
"min_reading_range": 0,
"physical_context": "CPU",
"reading": 6000,
"reading_units": "RPM",
"serial_number": "SN010203040506",
"health": "OK",
"state": "enabled"
}
},
"Drive": {
"32ADF365C6C1B7BD:XXX-YYY-ZZZ@ZZZ-YYY-XXX": {
"capacity_bytes": 3750000000,
"failure_predicted": true,
"health": "OK",
"identity": "32ADF365C6C1B7BD",
"model": "IBM 350A",
"state": "enabled"
}
}
}
},
"message_id": "2c0da1e8-1958-484f-9bdd-9117d717f7fa"
}

View File

@ -25,7 +25,7 @@ class TestPrometheusFileNotifier(test_utils.BaseTestCase):
def setUp(self):
super(TestPrometheusFileNotifier, self).setUp()
def test_instanciate(self):
def test_instantiate(self):
temp_dir = self.useFixture(fixtures.TempDir()).path
self.config(location=temp_dir,
group='oslo_messaging_notifications')
@ -72,8 +72,8 @@ class TestPrometheusFileNotifier(test_utils.BaseTestCase):
if os.path.isfile(os.path.join(DIR, name))]
self.assertEqual(node1, node2)
self.assertEqual(len(all_files), 1)
self.assertIn(node1, all_files)
self.assertIn(node2, all_files)
self.assertIn(node1 + '-hardware.ipmi.metrics', all_files)
self.assertIn(node2 + '-hardware.ipmi.metrics', all_files)
def test_messages_from_different_nodes(self):
temp_dir = self.useFixture(fixtures.TempDir()).path
@ -104,5 +104,35 @@ class TestPrometheusFileNotifier(test_utils.BaseTestCase):
all_files = [name for name in os.listdir(DIR)
if os.path.isfile(os.path.join(DIR, name))]
self.assertEqual(len(all_files), 2)
self.assertIn(node1, all_files)
self.assertIn(node2, all_files)
self.assertIn(node1 + '-hardware.ipmi.metrics', all_files)
self.assertIn(node2 + '-hardware.ipmi.metrics', all_files)
def test_messages_of_different_types(self):
    """Check that IPMI and Redfish notifications go to separate files."""
    temp_dir = self.useFixture(fixtures.TempDir()).path
    self.config(location=temp_dir,
                group='oslo_messaging_notifications')
    transport = oslo_messaging.get_notification_transport(self.conf)
    driver = PrometheusFileDriver(self.conf, None, transport)

    sample_file_1 = os.path.join(
        os.path.dirname(ironic_prometheus_exporter.__file__),
        'tests', 'json_samples', 'notification-ipmi-1.json')
    sample_file_2 = os.path.join(
        os.path.dirname(ironic_prometheus_exporter.__file__),
        'tests', 'json_samples', 'notification-redfish.json')

    # Use context managers so the sample files are closed right after
    # being parsed instead of leaking the open handles.
    with open(sample_file_1) as fp:
        msg1 = json.load(fp)
    node1 = msg1['payload']['node_name']

    with open(sample_file_2) as fp:
        msg2 = json.load(fp)
    node2 = msg2['payload']['node_name']

    driver.notify(None, msg1, 'info', 0)
    driver.notify(None, msg2, 'info', 0)

    DIR = self.conf.oslo_messaging_notifications.location
    all_files = [name for name in os.listdir(DIR)
                 if os.path.isfile(os.path.join(DIR, name))]

    self.assertEqual(len(all_files), 2)
    self.assertIn(node1 + '-hardware.ipmi.metrics', all_files)
    self.assertIn(node2 + '-hardware.redfish.metrics', all_files)

View File

@ -0,0 +1,121 @@
import json
import os
import unittest
import ironic_prometheus_exporter
from ironic_prometheus_exporter.parsers import redfish
from prometheus_client import CollectorRegistry
# Path of the Redfish notification sample shipped with the test suite.
sample_file = os.path.join(
    os.path.dirname(ironic_prometheus_exporter.__file__),
    'tests', 'json_samples', 'notification-redfish.json')

# Parse the sample once at import time; the context manager closes the
# file right away instead of leaking the open handle.
with open(sample_file) as sample_fp:
    DATA = json.load(sample_fp)
class TestPayloadsParser(unittest.TestCase):
    """Exercise the Redfish parser against the sample notification."""

    # Labels every metric built from the sample notification must carry.
    _NODE_LABELS = {
        'instance_uuid': 'ac2aa2fd-6e1a-41c8-a114-2084c8705228',
        'node_name': 'knilab-master-u9',
        'node_uuid': 'ac2aa2fd-6e1a-41c8-a114-2084c8705228',
    }

    def setUp(self):
        payload = DATA['payload']
        self.node_message = payload
        self.node_name = payload['node_name']
        self.node_uuid = payload['node_uuid']
        self.instance_uuid = payload['instance_uuid']

    def _expected_labels(self, entity_id, sensor_id):
        """Return the full label set expected for one sensor."""
        labels = dict(self._NODE_LABELS)
        labels['entity_id'] = entity_id
        labels['sensor_id'] = sensor_id
        return labels

    def test_build_temperature_metrics(self):
        metrics = redfish.build_temperature_metrics(self.node_message)

        expected_metric = 'baremetal_temp_cpu_celsius'
        self.assertIn(expected_metric, metrics)

        first_sample = metrics[expected_metric][0]
        self.assertEqual(62, first_sample[0])
        self.assertEqual(self._expected_labels('CPU', 1), first_sample[1])

    def test_build_power_metrics(self):
        metrics = redfish.build_power_metrics(self.node_message)

        expected_metric = 'baremetal_power_status'
        self.assertIn(expected_metric, metrics)

        first_sample = metrics[expected_metric][0]
        self.assertEqual(0, first_sample[0])
        self.assertEqual(
            self._expected_labels('PSU', '0:Power@ZZZ-YYY-XXX'),
            first_sample[1])

    def test_build_fan_metrics(self):
        metrics = redfish.build_fan_metrics(self.node_message)

        expected_metric = 'baremetal_fan_status'
        self.assertIn(expected_metric, metrics)

        first_sample = metrics[expected_metric][0]
        self.assertEqual(0, first_sample[0])
        self.assertEqual(
            self._expected_labels('CPU', 'XXX-YYY-ZZZ'),
            first_sample[1])

    def test_build_drive_metrics(self):
        metrics = redfish.build_drive_metrics(self.node_message)

        expected_metric = 'baremetal_drive_status'
        self.assertIn(expected_metric, metrics)

        first_sample = metrics[expected_metric][0]
        self.assertEqual(0, first_sample[0])
        self.assertEqual(
            self._expected_labels(
                'HDD', '32ADF365C6C1B7BD:XXX-YYY-ZZZ@ZZZ-YYY-XXX'),
            first_sample[1])

    def test_category_registry(self):
        metrics_registry = CollectorRegistry()
        redfish.category_registry(self.node_message, metrics_registry)

        label = self._expected_labels(
            'HDD', '32ADF365C6C1B7BD:XXX-YYY-ZZZ@ZZZ-YYY-XXX')
        sensor_value = metrics_registry.get_sample_value(
            'baremetal_drive_status', label)

        self.assertEqual(0, sensor_value)

View File

@ -0,0 +1,5 @@
---
features:
- |
Adds support for handling Redfish-originated metrics alongside
IPMI ones.