Basic alarm threshold evaluation logic.
Partially addresses blueprint alarm-distributed-threshold-evaluation. Adds threshold evaluation logic encapsulating basic alarm statistics querying, threshold comparison, and state transition rules. Change-Id: I0f3a50809985d25ab0eceb990b142da8701a9616
This commit is contained in:
parent
1ba1bc7eda
commit
135612f0f9
0
ceilometer/alarm/__init__.py
Normal file
0
ceilometer/alarm/__init__.py
Normal file
219
ceilometer/alarm/threshold_evaluation.py
Normal file
219
ceilometer/alarm/threshold_evaluation.py
Normal file
@ -0,0 +1,219 @@
|
||||
# -*- encoding: utf-8 -*-
|
||||
#
|
||||
# Copyright © 2013 Red Hat, Inc
|
||||
#
|
||||
# Author: Eoghan Glynn <eglynn@redhat.com>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import datetime
|
||||
import operator
|
||||
|
||||
from oslo.config import cfg
|
||||
|
||||
from ceilometer.openstack.common import log
|
||||
from ceilometerclient import client as ceiloclient
|
||||
|
||||
# Module-level logger for threshold evaluation.
LOG = log.getLogger(__name__)

# Map alarm comparison-operator names (as stored on the alarm) to the
# corresponding Python comparison functions.
COMPARATORS = {
    'gt': operator.gt,
    'lt': operator.lt,
    'ge': operator.ge,
    'le': operator.le,
    'eq': operator.eq,
    'ne': operator.ne,
}

# Alarm state names, as understood by the alarm API.
UNKNOWN = 'insufficient data'
OK = 'ok'
ALARM = 'alarm'
||||
class Evaluator(object):
    """This class implements the basic alarm threshold evaluation
    logic.

    Alarms are assigned via assign_alarms() and evaluated in bulk via
    evaluate(): for each enabled alarm, statistics are queried over a
    bounded sliding window, pruned, compared against the alarm
    threshold, and the alarm state is transitioned if necessary.
    """

    # the sliding evaluation window is extended to allow
    # for reporting/ingestion lag
    look_back = 1

    # minimum number of datapoints within sliding window to
    # avoid unknown state
    quorum = 1

    def __init__(self, notifier):
        # notifier may be None, in which case state transitions are
        # applied via the API but not announced
        self.alarms = []
        self.notifier = notifier
        self.api_client = None

    def assign_alarms(self, alarms):
        """Assign alarms to be evaluated."""
        self.alarms = alarms

    @property
    def _client(self):
        """Construct or reuse an authenticated API client."""
        if not self.api_client:
            auth_config = cfg.CONF.service_credentials
            creds = dict(
                os_auth_url=auth_config.os_auth_url,
                os_tenant_name=auth_config.os_tenant_name,
                os_password=auth_config.os_password,
                os_username=auth_config.os_username
            )
            self.api_client = ceiloclient.get_client(2, **creds)
        return self.api_client

    @staticmethod
    def _constraints(alarm):
        """Assert the constraints on the statistics query.

        Each matching_metadata entry becomes an equality constraint.
        """
        # items() rather than the Python-2-only iteritems() keeps this
        # portable; behavior is identical
        return [dict(field=field, op='eq', value=value)
                for field, value in alarm.matching_metadata.items()]

    @classmethod
    def _bound_duration(cls, alarm, constraints):
        """Bound the duration of the statistics query."""
        now = datetime.datetime.utcnow()
        # widen the window by look_back periods to tolerate lag
        window = (alarm.period *
                  (alarm.evaluation_periods + cls.look_back))
        start = now - datetime.timedelta(seconds=window)
        # explicit mapping instead of locals() so an unrelated new
        # local cannot leak into (or collide with) the format args
        LOG.debug(_('query stats from %(start)s to %(now)s') %
                  {'start': start, 'now': now})
        after = dict(field='timestamp', op='ge', value=start.isoformat())
        before = dict(field='timestamp', op='le', value=now.isoformat())
        constraints.extend([before, after])
        return constraints

    @staticmethod
    def _sanitize(alarm, statistics):
        """Sanitize statistics.

        Ultimately this will be the hook for the exclusion of chaotic
        datapoints for example.
        """
        LOG.debug(_('sanitize stats %s') % statistics)
        # in practice statistics are always sorted by period start, not
        # strictly required by the API though
        statistics = statistics[:alarm.evaluation_periods]
        LOG.debug(_('pruned statistics to %d') % len(statistics))
        return statistics

    def _statistics(self, alarm, query):
        """Retrieve statistics over the current window.

        Returns an empty list on any API failure, so the evaluation
        degrades to the 'insufficient data' state instead of aborting
        the whole cycle.
        """
        LOG.debug(_('stats query %s') % query)
        try:
            return self._client.statistics.list(alarm.counter_name,
                                                q=query,
                                                period=alarm.period)
        except Exception:
            LOG.exception(_('alarm stats retrieval failed'))
            return []

    def _update(self, alarm, state, reason):
        """Refresh alarm state."""
        # named explicitly (rather than 'id') to avoid shadowing the
        # id() builtin
        alarm_id = alarm.alarm_id
        LOG.info(_('alarm %(id)s transitioning to %(state)s'
                   ' because %(reason)s') %
                 {'id': alarm_id, 'state': state, 'reason': reason})
        try:
            self._client.alarms.update(alarm_id, state=state)
            alarm.state = state
            if self.notifier:
                self.notifier.notify(alarm, state, reason)
        except Exception:
            # retry will occur naturally on the next evaluation
            # cycle (unless alarm state reverts in the meantime)
            LOG.exception(_('alarm state update failed'))

    def _sufficient(self, alarm, statistics):
        """Ensure there is sufficient data for evaluation,
        transitioning to unknown otherwise.
        """
        sufficient = len(statistics) >= self.quorum
        if not sufficient and alarm.state != UNKNOWN:
            reason = _('%d datapoints are unknown') % alarm.evaluation_periods
            self._update(alarm, UNKNOWN, reason)
        return sufficient

    @staticmethod
    def _reason(alarm, statistics, distilled, state):
        """Fabricate reason string."""
        count = len(statistics)
        disposition = 'inside' if state == OK else 'outside'
        last = getattr(statistics[-1], alarm.statistic)
        return (_('Transition to %(state)s due to %(count)d samples'
                  ' %(disposition)s threshold, most recent: %(last)s') %
                {'state': state, 'count': count,
                 'disposition': disposition, 'last': last})

    def _transition(self, alarm, statistics, compared):
        """Transition alarm state if necessary.

        The transition rules are currently hardcoded as:

        - transitioning from a known state requires an unequivocal
          set of datapoints

        - transitioning from unknown is on the basis of the most
          recent datapoint if equivocal

        Ultimately this will be policy-driven.
        """
        distilled = all(compared)
        unequivocal = distilled or not any(compared)
        if unequivocal:
            state = ALARM if distilled else OK
            if alarm.state != state:
                reason = self._reason(alarm, statistics, distilled, state)
                self._update(alarm, state, reason)
        elif alarm.state == UNKNOWN:
            # equivocal window: let the freshest datapoint decide
            state = ALARM if compared[-1] else OK
            reason = self._reason(alarm, statistics, distilled, state)
            self._update(alarm, state, reason)

    def evaluate(self):
        """Evaluate the alarms assigned to this evaluator."""

        LOG.info(_('initiating evaluation cycle on %d alarms') %
                 len(self.alarms))

        for alarm in self.alarms:

            if not alarm.enabled:
                LOG.debug(_('skipping alarm %s') % alarm.alarm_id)
                continue
            LOG.debug(_('evaluating alarm %s') % alarm.alarm_id)

            query = self._bound_duration(
                alarm,
                self._constraints(alarm)
            )

            statistics = self._sanitize(
                alarm,
                self._statistics(alarm, query)
            )

            if self._sufficient(alarm, statistics):

                def _compare(stat):
                    # compare one aggregate value against the threshold
                    op = COMPARATORS[alarm.comparison_operator]
                    value = getattr(stat, alarm.statistic)
                    limit = alarm.threshold
                    LOG.debug(_('comparing value %(value)s against threshold'
                                ' %(limit)s') %
                              {'value': value, 'limit': limit})
                    return op(value, limit)

                self._transition(alarm,
                                 statistics,
                                 list(map(_compare, statistics)))
|
@ -16,6 +16,7 @@ msgpack-python
|
||||
python-glanceclient
|
||||
python-novaclient>=2.6.10
|
||||
python-keystoneclient>=0.2,<0.3
|
||||
python-ceilometerclient>=1.0.1
|
||||
python-swiftclient
|
||||
lxml
|
||||
requests>=1.1,<1.2.1
|
||||
|
0
tests/alarm/__init__.py
Normal file
0
tests/alarm/__init__.py
Normal file
212
tests/alarm/test_threshold_evaluation.py
Normal file
212
tests/alarm/test_threshold_evaluation.py
Normal file
@ -0,0 +1,212 @@
|
||||
# -*- encoding: utf-8 -*-
|
||||
#
|
||||
# Copyright © 2013 Red Hat, Inc
|
||||
#
|
||||
# Author: Eoghan Glynn <eglynn@redhat.com>
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
"""Tests for ceilometer/alarm/threshold_evaluation.py
|
||||
"""
|
||||
import mock
|
||||
import uuid
|
||||
|
||||
from ceilometer.alarm import threshold_evaluation
|
||||
from ceilometer.storage import models
|
||||
from ceilometer.tests import base
|
||||
from ceilometerclient import exc
|
||||
from ceilometerclient.v2 import statistics
|
||||
|
||||
|
||||
class TestEvaluate(base.TestCase):
    """Exercise Evaluator.evaluate() against a mocked-out API client.

    Two alarms are configured: a 'gt'-based alarm on avg cpu_util
    (threshold 80.0, 5 evaluation periods) and a 'le'-based alarm on
    max cpu_util (threshold 10.0, 4 evaluation periods).
    """

    def setUp(self):
        super(TestEvaluate, self).setUp()
        # stands in for the authenticated ceilometer API client
        self.api_client = mock.Mock()
        self.notifier = mock.MagicMock()
        self.alarms = [
            models.Alarm(name='instance_running_hot',
                         counter_name='cpu_util',
                         comparison_operator='gt',
                         threshold=80.0,
                         evaluation_periods=5,
                         statistic='avg',
                         user_id='foobar',
                         project_id='snafu',
                         period=60,
                         alarm_id=str(uuid.uuid4()),
                         matching_metadata={'resource_id':
                                            'my_instance'}),
            models.Alarm(name='group_running_idle',
                         counter_name='cpu_util',
                         comparison_operator='le',
                         threshold=10.0,
                         statistic='max',
                         evaluation_periods=4,
                         user_id='foobar',
                         project_id='snafu',
                         period=300,
                         alarm_id=str(uuid.uuid4()),
                         matching_metadata={'metadata.user_metadata.AS':
                                            'my_group'}),
        ]
        self.evaluator = threshold_evaluation.Evaluator(self.notifier)
        self.evaluator.assign_alarms(self.alarms)

    @staticmethod
    def _get_stat(attr, value):
        # build a Statistics object exposing a single aggregate attribute
        return statistics.Statistics(None, {attr: value})

    def _set_all_alarms(self, state):
        # force every alarm into the given starting state
        for alarm in self.alarms:
            alarm.state = state

    def _assert_all_alarms(self, state):
        for alarm in self.alarms:
            self.assertEqual(alarm.state, state)

    def test_retry_transient_api_failure(self):
        # a failed stats query transitions to 'insufficient data';
        # the next (successful) cycle recovers to 'ok'
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            broken = exc.CommunicationError(message='broken')
            avgs = [self._get_stat('avg', self.alarms[0].threshold - v)
                    for v in xrange(5)]
            maxs = [self._get_stat('max', self.alarms[1].threshold + v)
                    for v in xrange(1, 4)]
            # first cycle: both stats queries fail; second cycle: both
            # succeed with in-threshold datapoints
            self.api_client.statistics.list.side_effect = [broken,
                                                           broken,
                                                           avgs,
                                                           maxs]
            self.evaluator.evaluate()
            self._assert_all_alarms('insufficient data')
            self.evaluator.evaluate()
            self._assert_all_alarms('ok')

    def test_simple_insufficient(self):
        # an empty statistics window transitions both alarms to
        # 'insufficient data' and notifies with the datapoint-count reason
        self._set_all_alarms('ok')
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            self.api_client.statistics.list.return_value = []
            self.evaluator.evaluate()
            self._assert_all_alarms('insufficient data')
            expected = [mock.call(alarm.alarm_id, state='insufficient data')
                        for alarm in self.alarms]
            update_calls = self.api_client.alarms.update.call_args_list
            self.assertEqual(update_calls, expected)
            expected = [mock.call(alarm,
                                  'insufficient data',
                                  ('%d datapoints are unknown' %
                                   alarm.evaluation_periods))
                        for alarm in self.alarms]
            self.assertEqual(self.notifier.notify.call_args_list, expected)

    def test_disabled_is_skipped(self):
        # a disabled alarm is neither evaluated nor transitioned
        self._set_all_alarms('ok')
        self.alarms[1].enabled = False
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            self.api_client.statistics.list.return_value = []
            self.evaluator.evaluate()
            self.assertEqual(self.alarms[0].state, 'insufficient data')
            self.assertEqual(self.alarms[1].state, 'ok')
            self.api_client.alarms.update.assert_called_once_with(
                self.alarms[0].alarm_id,
                state='insufficient data'
            )
            self.notifier.notify.assert_called_once_with(
                self.alarms[0],
                'insufficient data',
                mock.ANY
            )

    def test_simple_alarm_trip(self):
        # every datapoint outside threshold: unequivocal transition
        # from 'ok' to 'alarm', with exact reason strings
        self._set_all_alarms('ok')
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            avgs = [self._get_stat('avg', self.alarms[0].threshold + v)
                    for v in xrange(1, 6)]
            maxs = [self._get_stat('max', self.alarms[1].threshold - v)
                    for v in xrange(4)]
            self.api_client.statistics.list.side_effect = [avgs, maxs]
            self.evaluator.evaluate()
            self._assert_all_alarms('alarm')
            expected = [mock.call(alarm.alarm_id, state='alarm')
                        for alarm in self.alarms]
            update_calls = self.api_client.alarms.update.call_args_list
            self.assertEqual(update_calls, expected)
            reasons = ['Transition to alarm due to 5 samples outside'
                       ' threshold, most recent: 85.0',
                       'Transition to alarm due to 4 samples outside'
                       ' threshold, most recent: 7.0']
            expected = [mock.call(alarm, 'alarm', reason)
                        for alarm, reason in zip(self.alarms, reasons)]
            self.assertEqual(self.notifier.notify.call_args_list, expected)

    def test_simple_alarm_clear(self):
        # every datapoint inside threshold: unequivocal transition
        # from 'alarm' back to 'ok'
        self._set_all_alarms('alarm')
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            avgs = [self._get_stat('avg', self.alarms[0].threshold - v)
                    for v in xrange(5)]
            maxs = [self._get_stat('max', self.alarms[1].threshold + v)
                    for v in xrange(1, 5)]
            self.api_client.statistics.list.side_effect = [avgs, maxs]
            self.evaluator.evaluate()
            self._assert_all_alarms('ok')
            expected = [mock.call(alarm.alarm_id, state='ok')
                        for alarm in self.alarms]
            update_calls = self.api_client.alarms.update.call_args_list
            self.assertEqual(update_calls, expected)
            reasons = ['Transition to ok due to 5 samples inside'
                       ' threshold, most recent: 76.0',
                       'Transition to ok due to 4 samples inside'
                       ' threshold, most recent: 14.0']
            expected = [mock.call(alarm, 'ok', reason)
                        for alarm, reason in zip(self.alarms, reasons)]
            self.assertEqual(self.notifier.notify.call_args_list, expected)

    def test_equivocal_from_known_state(self):
        # a mixed (equivocal) window leaves a known state untouched:
        # no updates, no notifications
        self._set_all_alarms('ok')
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            avgs = [self._get_stat('avg', self.alarms[0].threshold + v)
                    for v in xrange(5)]
            maxs = [self._get_stat('max', self.alarms[1].threshold - v)
                    for v in xrange(-1, 3)]
            self.api_client.statistics.list.side_effect = [avgs, maxs]
            self.evaluator.evaluate()
            self._assert_all_alarms('ok')
            self.assertEqual(self.api_client.alarms.update.call_args_list,
                             [])
            self.assertEqual(self.notifier.notify.call_args_list, [])

    def test_equivocal_from_unknown(self):
        # from 'insufficient data' the most recent datapoint decides;
        # here both alarms trip to 'alarm'
        self._set_all_alarms('insufficient data')
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            avgs = [self._get_stat('avg', self.alarms[0].threshold + v)
                    for v in xrange(1, 6)]
            maxs = [self._get_stat('max', self.alarms[1].threshold - v)
                    for v in xrange(4)]
            self.api_client.statistics.list.side_effect = [avgs, maxs]
            self.evaluator.evaluate()
            self._assert_all_alarms('alarm')
            expected = [mock.call(alarm.alarm_id, state='alarm')
                        for alarm in self.alarms]
            update_calls = self.api_client.alarms.update.call_args_list
            self.assertEqual(update_calls, expected)
            reasons = ['Transition to alarm due to 5 samples outside'
                       ' threshold, most recent: 85.0',
                       'Transition to alarm due to 4 samples outside'
                       ' threshold, most recent: 7.0']
            expected = [mock.call(alarm, 'alarm', reason)
                        for alarm, reason in zip(self.alarms, reasons)]
            self.assertEqual(self.notifier.notify.call_args_list, expected)
|
Loading…
Reference in New Issue
Block a user