Improve StatsD metric precision
Make it possible to see timings and counts for invoked APIs broken down by the returned status code. This helps with deeper investigation of cloud behaviour (it can be very useful to see the full variety of status codes that occur, especially error ones). Since the latency of 404 and 202 responses may vary heavily, this also helps to reduce such interference. In addition, start emitting a StatsD metric for timeouts from the API. Change-Id: I8eb0174afc36f9ff10e2bd434f803f63736c160c
This commit is contained in:
parent
6b2f555ade
commit
360d517a1a
@ -50,6 +50,12 @@ def _check_resource(strict=False):
|
||||
return wrap
|
||||
|
||||
|
||||
def normalize_metric_name(name):
    """Return ``name`` with every '.' and ':' replaced by '_'.

    :param name: String to be used as one segment of a metric name.
    :returns: A copy of ``name`` in which '.' and ':' are replaced by '_'.
    """
    # One translation pass handles both characters at once.
    return name.translate(str.maketrans('.:', '__'))
|
||||
|
||||
|
||||
class Proxy(adapter.Adapter):
|
||||
"""Represents a service."""
|
||||
|
||||
@ -204,22 +210,35 @@ class Proxy(adapter.Adapter):
|
||||
self._report_stats_influxdb(response, url, method, exc)
|
||||
|
||||
def _report_stats_statsd(self, response, url=None, method=None, exc=None):
    """Emit StatsD counters and a timer for a single API call.

    The metric key has the form
    ``<statsd prefix>.<service type>.<method>.<name parts joined by '_'>``.
    The service type and every name part go through
    ``normalize_metric_name`` so that '.' and ':' inside them are
    replaced by '_' before the key is assembled.

    :param response: Response of the call, or ``None`` when there is no
        response to report (for example because the request raised).
    :param url: Request URL. When empty and a response is given, it is
        read from ``response.request.url``.
    :param method: HTTP method. When empty and a response is given, it is
        read from ``response.request.method``.
    :param exc: Exception raised by the request, if any. It is only
        consulted when ``response`` is ``None``.
    """
    try:
        if response is not None and not url:
            url = response.request.url
        if response is not None and not method:
            method = response.request.method
        name_parts = [
            normalize_metric_name(f) for f in
            self._extract_name(
                url, self.service_type, self.session.get_project_id())
        ]
        key = '.'.join(
            [self._statsd_prefix,
             normalize_metric_name(self.service_type), method,
             '_'.join(name_parts)
             ])
        with self._statsd_client.pipeline() as pipe:
            if response is not None:
                # Elapsed time as a whole number of milliseconds.
                duration = int(response.elapsed.total_seconds() * 1000)
                # Timer and counter are recorded per status code.
                metric_name = '%s.%s' % (key, str(response.status_code))
                pipe.timing(metric_name, duration)
                pipe.incr(metric_name)
                if duration > 1000:
                    pipe.incr('%s.over_1000' % key)
            elif exc is not None:
                pipe.incr('%s.failed' % key)
            # Counted for every call, whatever the branch taken above.
            pipe.incr('%s.attempted' % key)
    except Exception:
        # We do not want errors in metric reporting ever break client
        self.log.exception("Exception reporting metrics")
|
||||
|
||||
def _report_stats_prometheus(self, response, url=None, method=None,
|
||||
exc=None):
|
||||
@ -253,9 +272,12 @@ class Proxy(adapter.Adapter):
|
||||
method = response.request.method
|
||||
tags = dict(
|
||||
method=method,
|
||||
name='_'.join(self._extract_name(
|
||||
url, self.service_type,
|
||||
self.session.get_project_id()))
|
||||
name='_'.join([
|
||||
normalize_metric_name(f) for f in
|
||||
self._extract_name(
|
||||
url, self.service_type,
|
||||
self.session.get_project_id())
|
||||
])
|
||||
)
|
||||
fields = dict(
|
||||
attempted=1
|
||||
|
@ -23,7 +23,9 @@ import threading
|
||||
import time
|
||||
|
||||
import fixtures
|
||||
from keystoneauth1 import exceptions
|
||||
import prometheus_client
|
||||
from requests import exceptions as rexceptions
|
||||
import testtools.content
|
||||
|
||||
from openstack.tests.unit import base
|
||||
@ -175,7 +177,7 @@ class TestStats(base.TestCase):
|
||||
self.assert_calls()
|
||||
|
||||
self.assert_reported_stat(
|
||||
'openstack.api.identity.GET.projects', value='1', kind='c')
|
||||
'openstack.api.identity.GET.projects.200', value='1', kind='c')
|
||||
self.assert_prometheus_stat(
|
||||
'openstack_http_requests_total', 1, dict(
|
||||
service_type='identity',
|
||||
@ -196,7 +198,7 @@ class TestStats(base.TestCase):
|
||||
self.assert_calls()
|
||||
|
||||
self.assert_reported_stat(
|
||||
'openstack.api.identity.GET.projects', value='1', kind='c')
|
||||
'openstack.api.identity.GET.projects.200', value='1', kind='c')
|
||||
self.assert_prometheus_stat(
|
||||
'openstack_http_requests_total', 1, dict(
|
||||
service_type='identity',
|
||||
@ -217,7 +219,11 @@ class TestStats(base.TestCase):
|
||||
self.assert_calls()
|
||||
|
||||
self.assert_reported_stat(
|
||||
'openstack.api.compute.GET.servers.detail', value='1', kind='c')
|
||||
'openstack.api.compute.GET.servers_detail.200',
|
||||
value='1', kind='c')
|
||||
self.assert_reported_stat(
|
||||
'openstack.api.compute.GET.servers_detail.200',
|
||||
value='0', kind='ms')
|
||||
self.assert_prometheus_stat(
|
||||
'openstack_http_requests_total', 1, dict(
|
||||
service_type='compute',
|
||||
@ -237,7 +243,11 @@ class TestStats(base.TestCase):
|
||||
self.assert_calls()
|
||||
|
||||
self.assert_reported_stat(
|
||||
'openstack.api.compute.GET.servers', value='1', kind='c')
|
||||
'openstack.api.compute.GET.servers.200', value='1', kind='c')
|
||||
self.assert_reported_stat(
|
||||
'openstack.api.compute.GET.servers.200', value='0', kind='ms')
|
||||
self.assert_reported_stat(
|
||||
'openstack.api.compute.GET.servers.attempted', value='1', kind='c')
|
||||
self.assert_prometheus_stat(
|
||||
'openstack_http_requests_total', 1, dict(
|
||||
service_type='compute',
|
||||
@ -245,6 +255,49 @@ class TestStats(base.TestCase):
|
||||
method='GET',
|
||||
status_code='200'))
|
||||
|
||||
def test_servers_error(self):
    """A 500 response is reported under its own status-code metric."""
    mock_uri = 'https://compute.example.com/v2.1/servers'

    self.register_uris([
        {'method': 'GET', 'uri': mock_uri, 'status_code': 500,
         'json': {}},
    ])

    self.cloud.compute.get('/servers')
    self.assert_calls()

    # Counter and timer for the 500 status, then the attempt counter.
    expected_stats = (
        ('openstack.api.compute.GET.servers.500', '1', 'c'),
        ('openstack.api.compute.GET.servers.500', '0', 'ms'),
        ('openstack.api.compute.GET.servers.attempted', '1', 'c'),
    )
    for stat_name, stat_value, stat_kind in expected_stats:
        self.assert_reported_stat(
            stat_name, value=stat_value, kind=stat_kind)

    expected_labels = {
        'service_type': 'compute',
        'endpoint': mock_uri,
        'method': 'GET',
        'status_code': '500',
    }
    self.assert_prometheus_stat(
        'openstack_http_requests_total', 1, expected_labels)
|
||||
|
||||
def test_timeout(self):
    """A connect timeout is counted as a failed and an attempted call."""
    mock_uri = 'https://compute.example.com/v2.1/servers'

    self.register_uris([
        dict(method='GET', uri=mock_uri,
             exc=rexceptions.ConnectTimeout)
    ])

    # Require the timeout to actually be raised. A try/except that
    # swallows it would let the test pass even if no timeout occurred.
    self.assertRaises(
        exceptions.ConnectTimeout,
        self.cloud.compute.get, '/servers')

    self.assert_reported_stat(
        'openstack.api.compute.GET.servers.failed', value='1', kind='c')
    self.assert_reported_stat(
        'openstack.api.compute.GET.servers.attempted', value='1', kind='c')
|
||||
|
||||
|
||||
class TestNoStats(base.TestCase):
|
||||
|
||||
|
5
releasenotes/notes/improve-metrics-5d7ce70ce4021d72.yaml
Normal file
5
releasenotes/notes/improve-metrics-5d7ce70ce4021d72.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
upgrade:
|
||||
- |
|
||||
API metrics emitted by OpenStackSDK to StatsD now contain the status code as
|
||||
part of the metric name in order to improve information precision.
|
Loading…
Reference in New Issue
Block a user