From 53872f9af2842312f1a20ff5bad7a51eb49b8648 Mon Sep 17 00:00:00 2001
From: Alfredo Moralejo <amoralej@redhat.com>
Date: Wed, 11 Jun 2025 14:40:23 +0200
Subject: [PATCH] Aggregate by label when querying instance cpu usage in
 prometheus

Currently, when the prometheus datasource queries the ceilometer_cpu
metric for instance cpu usage, it aggregates by the `instance` label and
filters by the label containing the instance uuid. While this works fine
in real scenarios, where a single metric is provided in a single
instance, in some cases, such as the CI jobs where metrics are directly
injected, it leads to incorrect metric calculation.

We applied a similar fix for the host metrics in [1] but we did not
implement it for instance cpu.

I am also converting the query formatting to the dict format to improve
readability.

[1] https://review.opendev.org/c/openstack/watcher/+/946049

Closes-Bug: #2113936
Change-Id: I3038dec20612162c411fc77446e86a47e0354423
(cherry picked from commit 3860de0b1efba2101615ae782ba6aab3de0964e0)
---
 watcher/decision_engine/datasources/prometheus.py      | 10 ++++++----
 .../datasources/test_prometheus_helper.py              |  8 ++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/watcher/decision_engine/datasources/prometheus.py b/watcher/decision_engine/datasources/prometheus.py
index 7b7572f3b..074e6a591 100644
--- a/watcher/decision_engine/datasources/prometheus.py
+++ b/watcher/decision_engine/datasources/prometheus.py
@@ -342,10 +342,12 @@ class PrometheusHelper(base.DataSourceBase):
                 )
                 vcpus = 1
             query_args = (
-                "clamp_max((%s by (instance)(rate(%s{%s='%s'}[%ss]))/10e+8) "
-                "*(100/%s), 100)" %
-                (aggregate, meter, uuid_label_key, instance_label, period,
-                 vcpus)
+                "clamp_max((%(agg)s by (%(label)s)"
+                "(rate(%(meter)s{%(label)s='%(label_value)s'}[%(period)ss]))"
+                "/10e+8) *(100/%(vcpus)s), 100)"
+                % {'label': uuid_label_key, 'label_value': instance_label,
+                   'agg': aggregate, 'meter': meter, 'period': period,
+                   'vcpus': vcpus}
             )
         else:
             raise exception.InvalidParameter(
diff --git a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
index 5aaccb0ec..4d66fa642 100644
--- a/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
+++ b/watcher/tests/decision_engine/datasources/test_prometheus_helper.py
@@ -242,7 +242,7 @@ class TestPrometheusHelper(base.BaseTestCase):
         self.assertEqual(expected_cpu_usage, result_cpu)
         self.assertIsInstance(result_cpu, float)
         mock_prometheus_query.assert_called_once_with(
-            "clamp_max((avg by (instance)(rate("
+            "clamp_max((avg by (resource)(rate("
             "ceilometer_cpu{resource='uuid-0'}[300s]))"
             "/10e+8) *(100/2), 100)"
         )
@@ -644,7 +644,7 @@ class TestPrometheusHelper(base.BaseTestCase):
 
     def test_build_prometheus_query_instance_cpu_avg_agg(self):
         expected_query = (
-            "clamp_max((avg by (instance)(rate("
+            "clamp_max((avg by (resource)(rate("
             "ceilometer_cpu{resource='uuid-0'}[222s]))"
             "/10e+8) *(100/2), 100)"
         )
@@ -655,7 +655,7 @@ class TestPrometheusHelper(base.BaseTestCase):
 
     def test_build_prometheus_query_instance_cpu_max_agg(self):
         expected_query = (
-            "clamp_max((max by (instance)(rate("
+            "clamp_max((max by (resource)(rate("
             "ceilometer_cpu{resource='uuid-0'}[555s]))"
             "/10e+8) *(100/4), 100)"
         )
@@ -699,7 +699,7 @@ class TestPrometheusHelper(base.BaseTestCase):
     def test_prometheus_query_custom_uuid_label(self, mock_prometheus_get):
         cfg.CONF.prometheus_client.instance_uuid_label = 'custom_uuid_label'
         expected_query = (
-            "clamp_max((max by (instance)"
+            "clamp_max((max by (custom_uuid_label)"
             "(rate(ceilometer_cpu{custom_uuid_label='uuid-0'}[555s]))"
             "/10e+8) *(100/4), 100)"
         )