From 53872f9af2842312f1a20ff5bad7a51eb49b8648 Mon Sep 17 00:00:00 2001 From: Alfredo Moralejo Date: Wed, 11 Jun 2025 14:40:23 +0200 Subject: [PATCH] Aggregate by label when querying instance cpu usage in prometheus Currently, when the prometheus datasource queries the ceilometer_cpu metric for instance cpu usage, it aggregates by instance and filters by the label containing the instance uuid. While this works fine in real scenarios, where a single metric is provided in a single instance, in some cases, such as CI jobs where metrics are directly injected, it leads to incorrect metric calculation. We applied a similar fix for the host metrics in [1] but we did not implement it for instance cpu. I am also converting the query formatting to the dict format to improve readability. [1] https://review.opendev.org/c/openstack/watcher/+/946049 Closes-Bug: #2113936 Change-Id: I3038dec20612162c411fc77446e86a47e0354423 (cherry picked from commit 3860de0b1efba2101615ae782ba6aab3de0964e0) --- watcher/decision_engine/datasources/prometheus.py | 10 ++++++---- .../datasources/test_prometheus_helper.py | 8 ++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/watcher/decision_engine/datasources/prometheus.py b/watcher/decision_engine/datasources/prometheus.py index 7b7572f3b..074e6a591 100644 --- a/watcher/decision_engine/datasources/prometheus.py +++ b/watcher/decision_engine/datasources/prometheus.py @@ -342,10 +342,12 @@ class PrometheusHelper(base.DataSourceBase): ) vcpus = 1 query_args = ( - "clamp_max((%s by (instance)(rate(%s{%s='%s'}[%ss]))/10e+8) " - "*(100/%s), 100)" % - (aggregate, meter, uuid_label_key, instance_label, period, - vcpus) + "clamp_max((%(agg)s by (%(label)s)" + "(rate(%(meter)s{%(label)s='%(label_value)s'}[%(period)ss]))" + "/10e+8) *(100/%(vcpus)s), 100)" + % {'label': uuid_label_key, 'label_value': instance_label, + 'agg': aggregate, 'meter': meter, 'period': period, + 'vcpus': vcpus} ) else: raise exception.InvalidParameter( diff --git 
a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py index 5aaccb0ec..4d66fa642 100644 --- a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py +++ b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py @@ -242,7 +242,7 @@ class TestPrometheusHelper(base.BaseTestCase): self.assertEqual(expected_cpu_usage, result_cpu) self.assertIsInstance(result_cpu, float) mock_prometheus_query.assert_called_once_with( - "clamp_max((avg by (instance)(rate(" + "clamp_max((avg by (resource)(rate(" "ceilometer_cpu{resource='uuid-0'}[300s]))" "/10e+8) *(100/2), 100)" ) @@ -644,7 +644,7 @@ class TestPrometheusHelper(base.BaseTestCase): def test_build_prometheus_query_instance_cpu_avg_agg(self): expected_query = ( - "clamp_max((avg by (instance)(rate(" + "clamp_max((avg by (resource)(rate(" "ceilometer_cpu{resource='uuid-0'}[222s]))" "/10e+8) *(100/2), 100)" ) @@ -655,7 +655,7 @@ class TestPrometheusHelper(base.BaseTestCase): def test_build_prometheus_query_instance_cpu_max_agg(self): expected_query = ( - "clamp_max((max by (instance)(rate(" + "clamp_max((max by (resource)(rate(" "ceilometer_cpu{resource='uuid-0'}[555s]))" "/10e+8) *(100/4), 100)" ) @@ -699,7 +699,7 @@ class TestPrometheusHelper(base.BaseTestCase): def test_prometheus_query_custom_uuid_label(self, mock_prometheus_get): cfg.CONF.prometheus_client.instance_uuid_label = 'custom_uuid_label' expected_query = ( - "clamp_max((max by (instance)" + "clamp_max((max by (custom_uuid_label)" "(rate(ceilometer_cpu{custom_uuid_label='uuid-0'}[555s]))" "/10e+8) *(100/4), 100)" )