Use server_group in Prom autoscaling scenario

Thanks to the recent developments in ceilometer and sg-core,
we can use server_group for grouping instances from the same
stack for autoscaling purposes. This is how the instances are
grouped in gnocchi based autoscaling. It's much easier for the
users to configure and it should be the preferred option when using
autoscaling.

For backwards compatibility with current stable branches I
added a "autoscaling_instance_grouping" config option. The old
way ("prefix") of instance grouping is used by default and so
tempest tests will continue working on stable branches. By
setting the option to "metadata", the new way of instance
grouping will be used. I'll set this setting in .zuul.yaml
of all telemetry repositories on master branches in follow-up
patches.

Change-Id: I2770e9d47b914941f938f63d92ab7868fe09d7b9
This commit is contained in:
Jaromir Wysoglad
2024-09-25 11:02:14 -04:00
parent 4ec1f85148
commit afe631a2d0
4 changed files with 35 additions and 19 deletions

View File

@@ -96,7 +96,15 @@ TelemetryGroup = [
cfg.IntOpt('prometheus_scrape_interval',
default=15,
help="Scrape interval configured for prometheus. This can "
"be used in test cases to properly configure autoscaling")
"be used in test cases to properly configure autoscaling"),
cfg.StrOpt('autoscaling_instance_grouping',
default='prefix',
choices=['prefix', 'metadata'],
help="How to group instances for autoscaling testing. "
"'prefix' relies on the instances having a common string "
"at the start of their name. 'metadata' is a new and "
"prefered way of grouping since 2024.2 relying on "
"metering.server_group instance metadata")
]
telemetry_services_opts = [

View File

@@ -57,22 +57,6 @@ tests:
$.servers[1].status: ACTIVE
$.servers.`len`: 2
- name: check prometheus query for the servers count
desc: Check the Prometheus metric for the existence of servers
url: $ENVIRON['PROMETHEUS_SERVICE_URL']/api/v1/query
verbose: all
method: POST
request_headers:
content-type: application/x-www-form-urlencoded
data:
query=ceilometer_cpu{resource_name=~"te-$ENVIRON['RESOURCE_PREFIX'].*"}
poll:
count: 300
delay: 1
status: 200
response_json_paths:
$.data.result.`len`: 2
- name: check alarm cpu_alarm_high ALARM
verbose: all
desc: Check the aodh alarm and its state

View File

@@ -54,7 +54,7 @@
}
}
],
"query": "(rate(ceilometer_cpu{resource_name=~'te-$ENVIRON['RESOURCE_PREFIX'].*'}[$ENVIRON['PROMETHEUS_RATE_DURATION']s])) * 100"
"query": $ENVIRON["QUERY"]
}
},
"web_server_scaledown_policy": {
@@ -82,7 +82,7 @@
}
}
],
"query": "(rate(ceilometer_cpu{resource_name=~'te-$ENVIRON['RESOURCE_PREFIX'].*'}[$ENVIRON['PROMETHEUS_RATE_DURATION']s])) * 100"
"query": $ENVIRON["QUERY"]
}
}
}

View File

@@ -104,6 +104,28 @@ class PrometheusGabbiTest(manager.ScenarioTest):
super(PrometheusGabbiTest, cls).resource_cleanup()
def _prep_query(self, prometheus_rate_duration, resource_prefix):
    """Build the Prometheus CPU-rate query exported to the gabbi tests.

    :param prometheus_rate_duration: window in seconds for the rate()
        function (polling interval + scrape interval, per the caller).
    :param resource_prefix: common prefix of the instance names; used
        only by the legacy "prefix" grouping mode.
    :returns: a string placed into the "QUERY" environment variable and
        substituted into the autoscaling Heat template.
    """
    if config.CONF.telemetry.autoscaling_instance_grouping == "metadata":
        # New grouping mode (2024.2+): instances are grouped by the
        # metering.server_group metadata, exposed by ceilometer/sg-core
        # as the 'server_group' Prometheus label. 'stack_id' here is a
        # placeholder token, not the real id.
        query = ("\"(rate(ceilometer_cpu{{server_group=~'stack_id'}}"
                 "[{}s])) * 100\"").format(prometheus_rate_duration)
        # Wrap the query in a Heat str_replace intrinsic so the
        # template substitutes the actual OS::stack_id for the
        # 'stack_id' placeholder at stack-creation time.
        metadata_query = '''
{{
"str_replace": {{
"template": {},
"params": {{
"stack_id": {{ "get_param": "OS::stack_id" }}
}}
}}
}}
'''.format(query)
        return metadata_query
    else:
        # Legacy grouping mode: match instances whose resource_name
        # starts with the shared 'te-<prefix>' naming convention.
        prefix_query = '''
"(rate(ceilometer_cpu{{resource_name=~'te-{}.*'}}[{}s])) * 100"
'''.format(resource_prefix, prometheus_rate_duration)
        return prefix_query
def _prep_test(self, filename):
auth = self.os_primary.auth_provider.get_auth()
networks = self.os_primary.networks_client.list_networks(
@@ -115,6 +137,7 @@ class PrometheusGabbiTest(manager.ScenarioTest):
prometheus_rate_duration = (
config.CONF.telemetry.ceilometer_polling_interval
+ config.CONF.telemetry.prometheus_scrape_interval)
query = self._prep_query(prometheus_rate_duration, resource_prefix)
os.environ.update({
"USER_TOKEN": auth[0],
"AODH_THRESHOLD": str(config.CONF.telemetry.alarm_threshold),
@@ -136,6 +159,7 @@ class PrometheusGabbiTest(manager.ScenarioTest):
"RESOURCE_PREFIX": resource_prefix,
"PROMETHEUS_RATE_DURATION": str(prometheus_rate_duration),
"LOAD_LENGTH": str(prometheus_rate_duration * 2),
"QUERY": query,
})