From f932265290a4e923eac6111eb28578489c7dce33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=A1gr?= Date: Fri, 4 Aug 2023 16:26:47 +0200 Subject: [PATCH] Add Prometheus evaluator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a threshold evaluator for Prometheus metrics. Alarms are based on a Prometheus query only. The library python-observabilityclient is used for querying Prometheus. Co-authored-by: Jaromír Wysoglad Depends-On: Ie7477e0b11479449f7f13c7a784737ac70059964 Change-Id: I72e124cca4398b78f7ed12e1db3f66bdbfcb196e --- .../controllers/v2/alarm_rules/composite.py | 3 +- .../controllers/v2/alarm_rules/prometheus.py | 46 +++++++++++ aodh/evaluator/composite.py | 3 +- aodh/evaluator/prometheus.py | 78 +++++++++++++++++++ aodh/evaluator/threshold.py | 50 ++++++------ aodh/opts.py | 2 + aodh/tests/unit/evaluator/test_composite.py | 7 ++ aodh/tests/unit/test_evaluator.py | 59 ++++++++++++++ requirements.txt | 1 + setup.cfg | 2 + 10 files changed, 226 insertions(+), 25 deletions(-) create mode 100644 aodh/api/controllers/v2/alarm_rules/prometheus.py create mode 100644 aodh/evaluator/prometheus.py diff --git a/aodh/api/controllers/v2/alarm_rules/composite.py b/aodh/api/controllers/v2/alarm_rules/composite.py index c0dce88b6..1b7984259 100644 --- a/aodh/api/controllers/v2/alarm_rules/composite.py +++ b/aodh/api/controllers/v2/alarm_rules/composite.py @@ -41,7 +41,8 @@ class CompositeRule(wtypes.UserType): threshold_plugins = None def __init__(self): - threshold_rules = ('gnocchi_resources_threshold', + threshold_rules = ('prometheus', + 'gnocchi_resources_threshold', 'gnocchi_aggregation_by_metrics_threshold', 'gnocchi_aggregation_by_resources_threshold') CompositeRule.threshold_plugins = named.NamedExtensionManager( diff --git a/aodh/api/controllers/v2/alarm_rules/prometheus.py b/aodh/api/controllers/v2/alarm_rules/prometheus.py new file mode 100644 index 000000000..4e2d24539 --- /dev/null +++ 
b/aodh/api/controllers/v2/alarm_rules/prometheus.py @@ -0,0 +1,46 @@ +# +# Copyright 2023 Red Hat, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from oslo_log import log +import wsme +from wsme import types as wtypes + +from aodh.api.controllers.v2 import base + + +LOG = log.getLogger(__name__) + + +class PrometheusRule(base.AlarmRule): + comparison_operator = base.AdvEnum('comparison_operator', str, + 'lt', 'le', 'eq', 'ne', 'ge', 'gt', + default='eq') + "The comparison against the alarm threshold" + + threshold = wsme.wsattr(float, mandatory=True) + "The threshold of the alarm" + + query = wsme.wsattr(wtypes.text, mandatory=True) + "The Prometheus query" + + @staticmethod + def validate(rule): + # TODO(mmagr): Consider validating the Prometheus query here. 
+ return rule + + def as_dict(self): + rule = self.as_dict_from_keys(['comparison_operator', 'threshold', + 'query']) + return rule diff --git a/aodh/evaluator/composite.py b/aodh/evaluator/composite.py index 52686dd19..0e51895f7 100644 --- a/aodh/evaluator/composite.py +++ b/aodh/evaluator/composite.py @@ -116,7 +116,8 @@ class CompositeEvaluator(evaluator.Evaluator): @property def threshold_evaluators(self): if not self._threshold_evaluators: - threshold_types = ('gnocchi_resources_threshold', + threshold_types = ('prometheus', + 'gnocchi_resources_threshold', 'gnocchi_aggregation_by_metrics_threshold', 'gnocchi_aggregation_by_resources_threshold') self._threshold_evaluators = stevedore.NamedExtensionManager( diff --git a/aodh/evaluator/prometheus.py b/aodh/evaluator/prometheus.py new file mode 100644 index 000000000..6467867ef --- /dev/null +++ b/aodh/evaluator/prometheus.py @@ -0,0 +1,78 @@ +# +# Copyright 2023 Red Hat, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +from oslo_config import cfg +from oslo_log import log + +from observabilityclient import client + +from aodh.evaluator import threshold +from aodh import keystone_client + + +LOG = log.getLogger(__name__) +OPTS = [ + cfg.BoolOpt('prometheus_disable_rbac', + default=False, + help='Disable RBAC for Prometheus evaluator.'), +] + + +class PrometheusBase(threshold.ThresholdEvaluator): + def __init__(self, conf): + super(PrometheusBase, self).__init__(conf) + self._set_obsclient(conf) + self._no_rbac = conf.prometheus_disable_rbac + + def _set_obsclient(self, conf): + session = keystone_client.get_session(conf) + opts = {'interface': conf.service_credentials.interface, + 'region_name': conf.service_credentials.region_name} + self._prom = client.Client('1', session, adapter_options=opts) + + def _get_metric_data(self, query): + LOG.debug(f'Querying Prometheus instance on: {query}') + return self._prom.query.query(query, disable_rbac=self._no_rbac) + + +class PrometheusEvaluator(PrometheusBase): + + def _sanitize(self, metric_data): + sanitized = [float(m.value) for m in metric_data] + LOG.debug(f'Sanited Prometheus metric data: {metric_data}' + f' to statistics: {sanitized}') + return sanitized + + def evaluate_rule(self, alarm_rule): + """Evaluate alarm rule. 
+ + :returns: state, trending state, statistics, number of samples outside + threshold and reason + """ + metrics = self._get_metric_data(alarm_rule['query']) + if not metrics: + LOG.warning("Empty result fetched from Prometheus for query" + f" {alarm_rule['query']}") + + statistics = self._sanitize(metrics) + if not statistics: + raise threshold.InsufficientDataError('datapoints are unknown', + statistics) + return self._process_statistics(alarm_rule, statistics) + + def _unknown_reason_data(self, alarm, statistics): + LOG.warning(f'Transfering alarm {alarm} on unknown reason') + last = None if not statistics else statistics[-1] + return self._reason_data('unknown', len(statistics), last) diff --git a/aodh/evaluator/threshold.py b/aodh/evaluator/threshold.py index a035a4253..8d423b251 100644 --- a/aodh/evaluator/threshold.py +++ b/aodh/evaluator/threshold.py @@ -96,19 +96,7 @@ class ThresholdEvaluator(evaluator.Evaluator): ' %(disposition)s threshold, most recent: %(most_recent)s' % dict(reason_data, state=state), reason_data) - def evaluate_rule(self, alarm_rule): - """Evaluate alarm rule. - - :returns: state, trending state and statistics. - """ - start, end = self._bound_duration(alarm_rule) - statistics = self._statistics(alarm_rule, start, end) - statistics = self._sanitize(alarm_rule, statistics) - sufficient = len(statistics) >= alarm_rule['evaluation_periods'] - if not sufficient: - raise InsufficientDataError( - '%d datapoints are unknown' % alarm_rule['evaluation_periods'], - statistics) + def _process_statistics(self, alarm_rule, statistics): def _compare(value): op = COMPARATORS[alarm_rule['comparison_operator']] @@ -129,6 +117,31 @@ class ThresholdEvaluator(evaluator.Evaluator): trending_state = evaluator.ALARM if compared[-1] else evaluator.OK return None, trending_state, statistics, number_outside, None + def evaluate_rule(self, alarm_rule): + """Evaluate alarm rule. + + :returns: state, trending state and statistics. 
+ """ + start, end = self._bound_duration(alarm_rule) + statistics = self._statistics(alarm_rule, start, end) + statistics = self._sanitize(alarm_rule, statistics) + sufficient = len(statistics) >= alarm_rule['evaluation_periods'] + if not sufficient: + raise InsufficientDataError( + '%d datapoints are unknown' % alarm_rule['evaluation_periods'], + statistics) + + return self._process_statistics(alarm_rule, statistics) + + def _unknown_reason_data(self, alarm, statistics): + LOG.warning(f'Expecting {alarm.rule["evaluation_periods"]} datapoints' + f' but only get {len(statistics)}') + # Reason is not same as log message because we want to keep + # consistent since thirdparty software may depend on old format. + last = None if not statistics else statistics[-1] + return self._reason_data('unknown', alarm.rule['evaluation_periods'], + last) + def _transition_alarm(self, alarm, state, trending_state, statistics, outside_count, unknown_reason): unknown = alarm.state == evaluator.UNKNOWN @@ -143,16 +156,7 @@ class ThresholdEvaluator(evaluator.Evaluator): return if state == evaluator.UNKNOWN and not unknown: - LOG.warning('Expecting %(expected)d datapoints but only get ' - '%(actual)d' - % {'expected': alarm.rule['evaluation_periods'], - 'actual': len(statistics)}) - # Reason is not same as log message because we want to keep - # consistent since thirdparty software may depend on old format. 
- last = None if not statistics else statistics[-1] - reason_data = self._reason_data('unknown', - alarm.rule['evaluation_periods'], - last) + reason_data = self._unknown_reason_data(alarm, statistics) self._refresh(alarm, state, unknown_reason, reason_data) elif state and (alarm.state != state or continuous): diff --git a/aodh/opts.py b/aodh/opts.py index cf559fa7a..8c75a8bf1 100644 --- a/aodh/opts.py +++ b/aodh/opts.py @@ -23,6 +23,7 @@ import aodh.evaluator import aodh.evaluator.event import aodh.evaluator.gnocchi import aodh.evaluator.loadbalancer +import aodh.evaluator.prometheus import aodh.evaluator.threshold import aodh.event import aodh.keystone_client @@ -38,6 +39,7 @@ def list_opts(): itertools.chain( aodh.evaluator.OPTS, aodh.evaluator.event.OPTS, + aodh.evaluator.prometheus.OPTS, aodh.evaluator.threshold.OPTS, aodh.evaluator.loadbalancer.OPTS, aodh.notifier.rest.OPTS, diff --git a/aodh/tests/unit/evaluator/test_composite.py b/aodh/tests/unit/evaluator/test_composite.py index 54030c6c6..b7b5945aa 100644 --- a/aodh/tests/unit/evaluator/test_composite.py +++ b/aodh/tests/unit/evaluator/test_composite.py @@ -15,6 +15,7 @@ from unittest import mock import fixtures +import os from oslo_utils import timeutils from oslo_utils import uuidutils @@ -25,6 +26,12 @@ from aodh.tests import constants from aodh.tests.unit.evaluator import base +# NOTE(mmagr): Overriding PrometheusEvaluator settings to avoid +# complaints during init. 
+os.environ['PROMETHEUS_HOST'] = '127.0.0.1' +os.environ['PROMETHEUS_PORT'] = '666' + + class BaseCompositeEvaluate(base.TestEvaluatorBase): EVALUATOR = composite.CompositeEvaluator diff --git a/aodh/tests/unit/test_evaluator.py b/aodh/tests/unit/test_evaluator.py index f8deea9a3..3359933c4 100644 --- a/aodh/tests/unit/test_evaluator.py +++ b/aodh/tests/unit/test_evaluator.py @@ -18,11 +18,14 @@ import fixtures import time from unittest import mock +from observabilityclient import prometheus_client from oslo_config import fixture as fixture_config from stevedore import extension from aodh import evaluator from aodh import service + +from aodh.evaluator import prometheus from aodh.tests import base as tests_base @@ -190,3 +193,59 @@ class TestAlarmEvaluationService(tests_base.BaseTestCase): target = svc.partition_coordinator.extract_my_subset self.assertEqual(0, target.call_count) self.assertEqual(0, self.threshold_eval.evaluate.call_count) + + +class TestPrometheusEvaluator(tests_base.BaseTestCase): + def setUp(self): + super(TestPrometheusEvaluator, self).setUp() + conf = service.prepare_service(argv=[], config_files=[]) + self.CONF = self.useFixture(fixture_config.Config(conf)).conf + + def test_rule_evaluation(self): + metric_list = [ + prometheus_client.PrometheusMetric({'metric': 'mtr', + 'value': (0, 10)}), + prometheus_client.PrometheusMetric({'metric': 'mtr', + 'value': (1, 15)}), + prometheus_client.PrometheusMetric({'metric': 'mtr', + 'value': (2, 20)}), + prometheus_client.PrometheusMetric({'metric': 'mtr', + 'value': (3, 25)}), + prometheus_client.PrometheusMetric({'metric': 'mtr', + 'value': (4, 30)}), + prometheus_client.PrometheusMetric({'metric': 'mtr', + 'value': (5, 15)}), + ] + with mock.patch.object(prometheus.PrometheusEvaluator, + '_set_obsclient', return_value=None): + # mock Prometheus client + ev = prometheus.PrometheusEvaluator(self.CONF) + ev._get_metric_data = mock.Mock(return_value=metric_list) + + # test transfer to alarm state + 
state, trend, stats, outside, reason = ev.evaluate_rule( + {'query': 'mtr', 'threshold': 9, + 'comparison_operator': 'gt'}) + self.assertEqual('alarm', state) + self.assertEqual(6, outside) + + # test transfer to ok state + state, trend, stats, outside, reason = ev.evaluate_rule( + {'query': 'mtr', 'threshold': 31, + 'comparison_operator': 'gt'}) + self.assertEqual('ok', state) + self.assertEqual(0, outside) + + # test trending to alarm state + state, trend, stats, outside, reason = ev.evaluate_rule( + {'query': 'mtr', 'threshold': 14, + 'comparison_operator': 'gt'}) + self.assertEqual('alarm', trend) + self.assertEqual(5, outside) + + # test trending to ok state + state, trend, stats, outside, reason = ev.evaluate_rule( + {'query': 'mtr', 'threshold': 20, + 'comparison_operator': 'gt'}) + self.assertEqual('ok', trend) + self.assertEqual(2, outside) diff --git a/requirements.txt b/requirements.txt index f12f63836..a2629b19f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,6 +36,7 @@ cachetools>=1.1.6 cotyledon>=1.7.3 keystoneauth1>=2.1 debtcollector>=1.2.0 # Apache-2.0 +python-observabilityclient>=0.0.4 python-octaviaclient>=1.8.0 python-dateutil>=2.8.2 # BSD python-heatclient>=1.17.0 diff --git a/setup.cfg b/setup.cfg index c3c1fed6a..4d3e7166a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,7 @@ aodh.alarm.rule = event = aodh.api.controllers.v2.alarm_rules.event:AlarmEventRule composite = aodh.api.controllers.v2.alarm_rules.composite:composite_rule loadbalancer_member_health = aodh.api.controllers.v2.alarm_rules.loadbalancer:LoadBalancerMemberHealthRule + prometheus = aodh.api.controllers.v2.alarm_rules.prometheus:PrometheusRule aodh.evaluator = gnocchi_resources_threshold = aodh.evaluator.gnocchi:GnocchiResourceThresholdEvaluator @@ -64,6 +65,7 @@ aodh.evaluator = gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator composite = aodh.evaluator.composite:CompositeEvaluator 
loadbalancer_member_health = aodh.evaluator.loadbalancer:LoadBalancerMemberHealthEvaluator + prometheus = aodh.evaluator.prometheus:PrometheusEvaluator aodh.notifier = log = aodh.notifier.log:LogAlarmNotifier