Improve StatsD metric precision

Make it possible to see timings and counts for invoked APIs also based on
the returned status code. This helps to make deeper investigations of
the cloud behaviour (it might be very useful to see every variety of a
particular status code occurrence, especially error ones). Since latency
of 404 and 202 may vary heavily this also helps to reduce such interference.
In addition to that, start emitting a StatsD metric for API timeouts.

Change-Id: I8eb0174afc36f9ff10e2bd434f803f63736c160c
This commit is contained in:
Artem Goncharov 2022-04-15 16:35:52 +02:00
parent 6b2f555ade
commit 360d517a1a
3 changed files with 103 additions and 23 deletions

View File

@ -50,6 +50,12 @@ def _check_resource(strict=False):
return wrap
def normalize_metric_name(name):
    """Sanitize *name* for use as a component of a StatsD metric key.

    Dots and colons are delimiters in the StatsD wire protocol, so both
    are mapped to underscores before the name is joined into a key.
    """
    return name.translate(str.maketrans('.:', '__'))
class Proxy(adapter.Adapter):
"""Represents a service."""
@ -204,22 +210,35 @@ class Proxy(adapter.Adapter):
self._report_stats_influxdb(response, url, method, exc)
def _report_stats_statsd(self, response, url=None, method=None, exc=None):
    """Report API call metrics to StatsD.

    Emits, under a key of the form
    ``<prefix>.<service_type>.<method>.<name>``:

    * ``<key>.<status_code>`` timing (milliseconds) and counter when a
      response was received — keying on the status code keeps e.g. 404
      and 202 latencies from polluting each other;
    * ``<key>.over_1000`` counter when the call took longer than 1s;
    * ``<key>.failed`` counter when the request raised (e.g. a timeout);
    * ``<key>.attempted`` counter for every call, success or not.

    :param response: The ``requests`` response object, or ``None`` when
        the request failed before a response was received.
    :param url: Request URL; derived from ``response`` when omitted.
    :param method: HTTP method; derived from ``response`` when omitted.
    :param exc: The exception raised by the request, if any.
    """
    try:
        if response is not None and not url:
            url = response.request.url
        if response is not None and not method:
            method = response.request.method
        # Normalize every name component: '.' and ':' are StatsD
        # delimiters and must not appear inside a key segment.
        name_parts = [
            normalize_metric_name(f) for f in
            self._extract_name(
                url, self.service_type, self.session.get_project_id())
        ]
        key = '.'.join(
            [self._statsd_prefix,
             normalize_metric_name(self.service_type), method,
             '_'.join(name_parts)
             ])
        with self._statsd_client.pipeline() as pipe:
            if response is not None:
                # requests' Response.elapsed is a timedelta; StatsD
                # timings are integer milliseconds.
                duration = int(response.elapsed.total_seconds() * 1000)
                metric_name = '%s.%s' % (key, str(response.status_code))
                pipe.timing(metric_name, duration)
                pipe.incr(metric_name)
                if duration > 1000:
                    pipe.incr('%s.over_1000' % key)
            elif exc is not None:
                pipe.incr('%s.failed' % key)
            # Always emitted, so success/failure rates can be derived.
            pipe.incr('%s.attempted' % key)
    except Exception:
        # We do not want errors in metric reporting ever break client
        self.log.exception("Exception reporting metrics")
def _report_stats_prometheus(self, response, url=None, method=None,
exc=None):
@ -253,9 +272,12 @@ class Proxy(adapter.Adapter):
method = response.request.method
tags = dict(
method=method,
name='_'.join(self._extract_name(
url, self.service_type,
self.session.get_project_id()))
name='_'.join([
normalize_metric_name(f) for f in
self._extract_name(
url, self.service_type,
self.session.get_project_id())
])
)
fields = dict(
attempted=1

View File

@ -23,7 +23,9 @@ import threading
import time
import fixtures
from keystoneauth1 import exceptions
import prometheus_client
from requests import exceptions as rexceptions
import testtools.content
from openstack.tests.unit import base
@ -175,7 +177,7 @@ class TestStats(base.TestCase):
self.assert_calls()
self.assert_reported_stat(
'openstack.api.identity.GET.projects', value='1', kind='c')
'openstack.api.identity.GET.projects.200', value='1', kind='c')
self.assert_prometheus_stat(
'openstack_http_requests_total', 1, dict(
service_type='identity',
@ -196,7 +198,7 @@ class TestStats(base.TestCase):
self.assert_calls()
self.assert_reported_stat(
'openstack.api.identity.GET.projects', value='1', kind='c')
'openstack.api.identity.GET.projects.200', value='1', kind='c')
self.assert_prometheus_stat(
'openstack_http_requests_total', 1, dict(
service_type='identity',
@ -217,7 +219,11 @@ class TestStats(base.TestCase):
self.assert_calls()
self.assert_reported_stat(
'openstack.api.compute.GET.servers.detail', value='1', kind='c')
'openstack.api.compute.GET.servers_detail.200',
value='1', kind='c')
self.assert_reported_stat(
'openstack.api.compute.GET.servers_detail.200',
value='0', kind='ms')
self.assert_prometheus_stat(
'openstack_http_requests_total', 1, dict(
service_type='compute',
@ -237,7 +243,11 @@ class TestStats(base.TestCase):
self.assert_calls()
self.assert_reported_stat(
'openstack.api.compute.GET.servers', value='1', kind='c')
'openstack.api.compute.GET.servers.200', value='1', kind='c')
self.assert_reported_stat(
'openstack.api.compute.GET.servers.200', value='0', kind='ms')
self.assert_reported_stat(
'openstack.api.compute.GET.servers.attempted', value='1', kind='c')
self.assert_prometheus_stat(
'openstack_http_requests_total', 1, dict(
service_type='compute',
@ -245,6 +255,49 @@ class TestStats(base.TestCase):
method='GET',
status_code='200'))
def test_servers_error(self):
# A 500 response must still be reported: the status code is appended to
# the StatsD metric name so error counts and latencies are tracked
# separately from successes, and the 'attempted' counter always fires.
mock_uri = 'https://compute.example.com/v2.1/servers'
self.register_uris([
dict(method='GET', uri=mock_uri, status_code=500,
json={})])
self.cloud.compute.get('/servers')
self.assert_calls()
# Counter and timing are both keyed on the '.500' suffix.
self.assert_reported_stat(
'openstack.api.compute.GET.servers.500', value='1', kind='c')
self.assert_reported_stat(
'openstack.api.compute.GET.servers.500', value='0', kind='ms')
self.assert_reported_stat(
'openstack.api.compute.GET.servers.attempted', value='1', kind='c')
# Prometheus reporting keeps the status code as a label, not in the name.
self.assert_prometheus_stat(
'openstack_http_requests_total', 1, dict(
service_type='compute',
endpoint=mock_uri,
method='GET',
status_code='500'))
def test_timeout(self):
# A connect timeout raises before any response exists, so no status-code
# or timing metric is possible; the proxy should emit the '.failed' and
# '.attempted' counters instead.
mock_uri = 'https://compute.example.com/v2.1/servers'
self.register_uris([
dict(method='GET', uri=mock_uri,
exc=rexceptions.ConnectTimeout)
])
try:
self.cloud.compute.get('/servers')
except exceptions.ConnectTimeout:
# Expected: the SDK re-raises the keystoneauth timeout to the caller.
pass
self.assert_reported_stat(
'openstack.api.compute.GET.servers.failed', value='1', kind='c')
self.assert_reported_stat(
'openstack.api.compute.GET.servers.attempted', value='1', kind='c')
class TestNoStats(base.TestCase):

View File

@ -0,0 +1,5 @@
---
upgrade:
- |
API metrics emitted by the OpenStack SDK to StatsD now include the
status code as part of the metric name in order to improve precision.