Use server_group in Prom autoscaling scenario

Thanks to the recent developments in ceilometer and sg-core,
we can use server_group for grouping instances from the same
stack for autoscaling purposes. This is how the instances are
grouped in gnocchi based autoscaling. It's much easier for the
users to configure and it should be the preferred option when using
autoscaling.

For backwards compatibility with current stable branches I
added a "autoscaling_instance_grouping" config option. The old
way ("prefix") of instance grouping is used by default and so
tempest tests will continue working on stable branches. By
setting the option to "metadata", the new way of instance
grouping will be used. I'll set this setting in .zuul.yaml
of all telemetry repositories on master branches in follow-up
patches.

Change-Id: I2770e9d47b914941f938f63d92ab7868fe09d7b9
This commit is contained in:
Jaromir Wysoglad
2024-09-25 11:02:14 -04:00
parent 4ec1f85148
commit afe631a2d0
4 changed files with 35 additions and 19 deletions

View File

@@ -96,7 +96,15 @@ TelemetryGroup = [
cfg.IntOpt('prometheus_scrape_interval',
default=15,
help="Scrape interval configured for prometheus. This can "
"be used in test cases to properly configure autoscaling")
"be used in test cases to properly configure autoscaling"),
cfg.StrOpt('autoscaling_instance_grouping',
default='prefix',
choices=['prefix', 'metadata'],
help="How to group instances for autoscaling testing. "
"'prefix' relies on the instances having a common string "
"at the start of their name. 'metadata' is a new and "
"prefered way of grouping since 2024.2 relying on "
"metering.server_group instance metadata")
]
telemetry_services_opts = [

View File

@@ -57,22 +57,6 @@ tests:
$.servers[1].status: ACTIVE
$.servers.`len`: 2
- name: check prometheus query for the servers count
desc: Check the Prometheus metric for the existence of servers
url: $ENVIRON['PROMETHEUS_SERVICE_URL']/api/v1/query
verbose: all
method: POST
request_headers:
content-type: application/x-www-form-urlencoded
data:
query=ceilometer_cpu{resource_name=~"te-$ENVIRON['RESOURCE_PREFIX'].*"}
poll:
count: 300
delay: 1
status: 200
response_json_paths:
$.data.result.`len`: 2
- name: check alarm cpu_alarm_high ALARM
verbose: all
desc: Check the aodh alarm and its state

View File

@@ -54,7 +54,7 @@
}
}
],
"query": "(rate(ceilometer_cpu{resource_name=~'te-$ENVIRON['RESOURCE_PREFIX'].*'}[$ENVIRON['PROMETHEUS_RATE_DURATION']s])) * 100"
"query": $ENVIRON["QUERY"]
}
},
"web_server_scaledown_policy": {
@@ -82,7 +82,7 @@
}
}
],
"query": "(rate(ceilometer_cpu{resource_name=~'te-$ENVIRON['RESOURCE_PREFIX'].*'}[$ENVIRON['PROMETHEUS_RATE_DURATION']s])) * 100"
"query": $ENVIRON["QUERY"]
}
}
}

View File

@@ -104,6 +104,28 @@ class PrometheusGabbiTest(manager.ScenarioTest):
super(PrometheusGabbiTest, cls).resource_cleanup()
def _prep_query(self, prometheus_rate_duration, resource_prefix):
    """Build the Prometheus CPU-rate query exported to the gabbi tests.

    :param prometheus_rate_duration: window in seconds for the rate()
        function (polling interval + scrape interval, per the caller).
    :param resource_prefix: common prefix of the instance names; used
        only by the legacy "prefix" grouping mode.
    :returns: a string placed into the "QUERY" environment variable and
        substituted into the autoscaling Heat template.
    """
    if config.CONF.telemetry.autoscaling_instance_grouping == "metadata":
        # New grouping mode (2024.2+): instances are grouped by the
        # metering.server_group metadata, exposed by ceilometer/sg-core
        # as the 'server_group' Prometheus label. 'stack_id' here is a
        # placeholder token, not the real id.
        query = ("\"(rate(ceilometer_cpu{{server_group=~'stack_id'}}"
                 "[{}s])) * 100\"").format(prometheus_rate_duration)
        # Wrap the query in a Heat str_replace intrinsic so the
        # template substitutes the actual OS::stack_id for the
        # 'stack_id' placeholder at stack-creation time.
        metadata_query = '''
{{
"str_replace": {{
"template": {},
"params": {{
"stack_id": {{ "get_param": "OS::stack_id" }}
}}
}}
}}
'''.format(query)
        return metadata_query
    else:
        # Legacy grouping mode: match instances whose resource_name
        # starts with the shared 'te-<prefix>' naming convention.
        prefix_query = '''
"(rate(ceilometer_cpu{{resource_name=~'te-{}.*'}}[{}s])) * 100"
'''.format(resource_prefix, prometheus_rate_duration)
        return prefix_query
def _prep_test(self, filename):
auth = self.os_primary.auth_provider.get_auth()
networks = self.os_primary.networks_client.list_networks(
@@ -115,6 +137,7 @@ class PrometheusGabbiTest(manager.ScenarioTest):
prometheus_rate_duration = (
config.CONF.telemetry.ceilometer_polling_interval
+ config.CONF.telemetry.prometheus_scrape_interval)
query = self._prep_query(prometheus_rate_duration, resource_prefix)
os.environ.update({
"USER_TOKEN": auth[0],
"AODH_THRESHOLD": str(config.CONF.telemetry.alarm_threshold),
@@ -136,6 +159,7 @@ class PrometheusGabbiTest(manager.ScenarioTest):
"RESOURCE_PREFIX": resource_prefix,
"PROMETHEUS_RATE_DURATION": str(prometheus_rate_duration),
"LOAD_LENGTH": str(prometheus_rate_duration * 2),
"QUERY": query,
})