Basic alarm threshold evaluation logic.

Partially addresses BP alarm-distributed-threshold-evaluation. Threshold evaluation logic encapsulating basic alarm statistics querying, threshold comparison, and state transition rules. Change-Id: I0f3a50809985d25ab0eceb990b142da8701a9616
2013-06-25 22:27:48 +01:00 · 2013-06-25 22:27:48 +01:00 · 135612f0f9
commit 135612f0f9
parent 1ba1bc7eda
5 changed files with 432 additions and 0 deletions
--- a/ceilometer/alarm/__init__.py
+++ b/ceilometer/alarm/__init__.py
--- a/ceilometer/alarm/threshold_evaluation.py
+++ b/ceilometer/alarm/threshold_evaluation.py
@ -0,0 +1,219 @@
 # -*- encoding: utf-8 -*-
 #
 # Copyright © 2013 Red Hat, Inc
 #
 # Author: Eoghan Glynn <eglynn@redhat.com>
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
 # a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 # License for the specific language governing permissions and limitations
 # under the License.
 import datetime
 import operator
 from oslo.config import cfg
 from ceilometer.openstack.common import log
 from ceilometerclient import client as ceiloclient
 LOG = log.getLogger(__name__)
 # Alarm comparison operator names mapped onto the function of the same
 # name in the operator module.
 COMPARATORS = dict(
    (name, getattr(operator, name))
    for name in ('gt', 'lt', 'ge', 'le', 'eq', 'ne')
 )
 # Alarm state names used when updating alarms and notifying.
 UNKNOWN, OK, ALARM = 'insufficient data', 'ok', 'alarm'
 class Evaluator(object):
    """Basic alarm threshold evaluation logic.

    For each enabled alarm, statistics are queried over a sliding
    window, each datapoint is compared against the alarm threshold,
    and the alarm state is transitioned according to the outcome.
    """

    # the sliding evaluation window is extended to allow
    # for reporting/ingestion lag
    look_back = 1

    # minimum number of datapoints within sliding window to
    # avoid unknown state
    quorum = 1

    def __init__(self, notifier):
        """Create an evaluator with no alarms assigned.

        :param notifier: object whose notify(alarm, state, reason) is
                         called after each successful state update;
                         a falsy value disables notification
        """
        self.alarms = []
        self.notifier = notifier
        # constructed lazily on first access of _client
        self.api_client = None

    def assign_alarms(self, alarms):
        """Assign alarms to be evaluated."""
        self.alarms = alarms

    @property
    def _client(self):
        """Construct or reuse an authenticated API client."""
        if not self.api_client:
            auth_config = cfg.CONF.service_credentials
            creds = dict(
                os_auth_url=auth_config.os_auth_url,
                os_tenant_name=auth_config.os_tenant_name,
                os_password=auth_config.os_password,
                os_username=auth_config.os_username
            )
            self.api_client = ceiloclient.get_client(2, **creds)
        return self.api_client

    @staticmethod
    def _constraints(alarm):
        """Assert the constraints on the statistics query.

        :returns: one equality constraint per entry in the alarm's
                  matching_metadata
        """
        # items() rather than iteritems() so this also runs on Python 3
        return [dict(field=field, op='eq', value=value)
                for field, value in alarm.matching_metadata.items()]

    @classmethod
    def _bound_duration(cls, alarm, constraints):
        """Bound the duration of the statistics query.

        Appends timestamp constraints covering the last
        (evaluation_periods + look_back) periods up to the current UTC
        time. The passed list is extended in place and also returned.
        """
        now = datetime.datetime.utcnow()
        window = (alarm.period *
                  (alarm.evaluation_periods + cls.look_back))
        start = now - datetime.timedelta(seconds=window)
        LOG.debug(_('query stats from %(start)s to %(now)s') %
                  {'start': start, 'now': now})
        after = dict(field='timestamp', op='ge', value=start.isoformat())
        before = dict(field='timestamp', op='le', value=now.isoformat())
        constraints.extend([before, after])
        return constraints

    @staticmethod
    def _sanitize(alarm, statistics):
        """Sanitize statistics.

        Ultimately this will be the hook for the exclusion of chaotic
        datapoints for example.

        The query window is wider than the evaluation window (by
        look_back periods), so more than evaluation_periods datapoints
        may be returned. The trailing datapoints are the ones kept,
        because the rest of the evaluation treats the last element as
        the most recent.
        """
        LOG.debug(_('sanitize stats %s') % statistics)
        # in practice statistics are always sorted by period start, not
        # strictly required by the API though
        periods = alarm.evaluation_periods
        if periods:
            statistics = statistics[-periods:]
        else:
            # a [-0:] slice would keep everything, so fall back to the
            # head slice for falsy period counts
            statistics = statistics[:periods]
        LOG.debug(_('pruned statistics to %d') % len(statistics))
        return statistics

    def _statistics(self, alarm, query):
        """Retrieve statistics over the current window.

        Any failure is logged and reported as an empty list, so the
        caller sees it as insufficient data.
        """
        LOG.debug(_('stats query %s') % query)
        try:
            return self._client.statistics.list(alarm.counter_name,
                                                q=query,
                                                period=alarm.period)
        except Exception:
            LOG.exception(_('alarm stats retrieval failed'))
            return []

    def _update(self, alarm, state, reason):
        """Refresh alarm state.

        The new state is pushed through the API client first; only if
        that succeeds is the local alarm object updated and the
        notifier (if any) invoked.
        """
        alarm_id = alarm.alarm_id
        LOG.info(_('alarm %(id)s transitioning to %(state)s'
                   ' because %(reason)s') %
                 {'id': alarm_id, 'state': state, 'reason': reason})
        try:
            self._client.alarms.update(alarm_id, state=state)
            alarm.state = state
            if self.notifier:
                self.notifier.notify(alarm, state, reason)
        except Exception:
            # retry will occur naturally on the next evaluation
            # cycle (unless alarm state reverts in the meantime)
            LOG.exception(_('alarm state update failed'))

    def _sufficient(self, alarm, statistics):
        """Ensure there is sufficient data for evaluation,
           transitioning to unknown otherwise.

        :returns: True if at least quorum datapoints are available
        """
        sufficient = len(statistics) >= self.quorum
        if not sufficient and alarm.state != UNKNOWN:
            reason = _('%d datapoints are unknown') % alarm.evaluation_periods
            self._update(alarm, UNKNOWN, reason)
        return sufficient

    @staticmethod
    def _reason(alarm, statistics, distilled, state):
        """Fabricate reason string.

        The distilled argument is accepted but does not influence the
        message. The statistics must be non-empty, as the last element
        is reported as the most recent value.
        """
        disposition = 'inside' if state == OK else 'outside'
        details = {
            'state': state,
            'count': len(statistics),
            'disposition': disposition,
            'last': getattr(statistics[-1], alarm.statistic),
        }
        return (_('Transition to %(state)s due to %(count)d samples'
                  ' %(disposition)s threshold, most recent: %(last)s') %
                details)

    def _transition(self, alarm, statistics, compared):
        """Transition alarm state if necessary.

           The transition rules are currently hardcoded as:

           - transitioning from a known state requires an unequivocal
             set of datapoints

           - transitioning from unknown is on the basis of the most
             recent datapoint if equivocal

           Ultimately this will be policy-driven.
        """
        distilled = all(compared)
        unequivocal = distilled or not any(compared)
        if unequivocal:
            state = ALARM if distilled else OK
            if alarm.state != state:
                reason = self._reason(alarm, statistics, distilled, state)
                self._update(alarm, state, reason)
        elif alarm.state == UNKNOWN:
            state = ALARM if compared[-1] else OK
            reason = self._reason(alarm, statistics, distilled, state)
            self._update(alarm, state, reason)

    @staticmethod
    def _compare(alarm, stat):
        """Compare a single datapoint against the alarm threshold."""
        op = COMPARATORS[alarm.comparison_operator]
        value = getattr(stat, alarm.statistic)
        limit = alarm.threshold
        LOG.debug(_('comparing value %(value)s against threshold'
                    ' %(limit)s') % {'value': value, 'limit': limit})
        return op(value, limit)

    def evaluate(self):
        """Evaluate the alarms assigned to this evaluator."""
        LOG.info(_('initiating evaluation cycle on %d alarms') %
                 len(self.alarms))
        for alarm in self.alarms:
            if not alarm.enabled:
                LOG.debug(_('skipping alarm %s') % alarm.alarm_id)
                continue
            LOG.debug(_('evaluating alarm %s') % alarm.alarm_id)
            query = self._bound_duration(
                alarm,
                self._constraints(alarm)
            )
            statistics = self._sanitize(
                alarm,
                self._statistics(alarm, query)
            )
            if self._sufficient(alarm, statistics):
                compared = [self._compare(alarm, stat)
                            for stat in statistics]
                self._transition(alarm, statistics, compared)
--- a/requirements.txt
+++ b/requirements.txt
@ -16,6 +16,7 @@ msgpack-python
 python-glanceclient
 python-novaclient>=2.6.10
 python-keystoneclient>=0.2,<0.3
 python-ceilometerclient>=1.0.1
 python-swiftclient
 lxml
 requests>=1.1,<1.2.1
--- a/tests/alarm/__init__.py
+++ b/tests/alarm/__init__.py
--- a/tests/alarm/test_threshold_evaluation.py
+++ b/tests/alarm/test_threshold_evaluation.py
@ -0,0 +1,212 @@
 # -*- encoding: utf-8 -*-
 #
 # Copyright © 2013 Red Hat, Inc
 #
 # Author: Eoghan Glynn <eglynn@redhat.com>
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
 # a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 # License for the specific language governing permissions and limitations
 # under the License.
 """Tests for ceilometer/alarm/threshold_evaluation.py
 """
 import mock
 import uuid
 from ceilometer.alarm import threshold_evaluation
 from ceilometer.storage import models
 from ceilometer.tests import base
 from ceilometerclient import exc
 from ceilometerclient.v2 import statistics
 class TestEvaluate(base.TestCase):
    """Exercise Evaluator state transitions against a mocked API client.

    Two alarms are evaluated in every test: one tripping when the 'avg'
    statistic is greater than 80.0 over 5 periods, the other when the
    'max' statistic is less than or equal to 10.0 over 4 periods.
    """

    def setUp(self):
        super(TestEvaluate, self).setUp()
        self.api_client = mock.Mock()
        self.notifier = mock.MagicMock()
        self.alarms = [
            models.Alarm(name='instance_running_hot',
                         counter_name='cpu_util',
                         comparison_operator='gt',
                         threshold=80.0,
                         evaluation_periods=5,
                         statistic='avg',
                         user_id='foobar',
                         project_id='snafu',
                         period=60,
                         alarm_id=str(uuid.uuid4()),
                         matching_metadata={'resource_id':
                                            'my_instance'}),
            models.Alarm(name='group_running_idle',
                         counter_name='cpu_util',
                         comparison_operator='le',
                         threshold=10.0,
                         statistic='max',
                         evaluation_periods=4,
                         user_id='foobar',
                         project_id='snafu',
                         period=300,
                         alarm_id=str(uuid.uuid4()),
                         matching_metadata={'metadata.user_metadata.AS':
                                            'my_group'}),
        ]
        self.evaluator = threshold_evaluation.Evaluator(self.notifier)
        self.evaluator.assign_alarms(self.alarms)

    @staticmethod
    def _get_stat(attr, value):
        """Build a client-side statistics datapoint with one attribute."""
        return statistics.Statistics(None, {attr: value})

    def _set_all_alarms(self, state):
        for alarm in self.alarms:
            alarm.state = state

    def _assert_all_alarms(self, state):
        for alarm in self.alarms:
            self.assertEqual(alarm.state, state)

    def test_retry_transient_api_failure(self):
        """A failed statistics query is retried on the next cycle."""
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            broken = exc.CommunicationError(message='broken')
            avgs = [self._get_stat('avg', self.alarms[0].threshold - v)
                    for v in xrange(5)]
            maxs = [self._get_stat('max', self.alarms[1].threshold + v)
                    for v in xrange(1, 4)]
            # first cycle: both queries fail; second cycle: both succeed
            self.api_client.statistics.list.side_effect = [broken,
                                                           broken,
                                                           avgs,
                                                           maxs]
            self.evaluator.evaluate()
            self._assert_all_alarms('insufficient data')
            self.evaluator.evaluate()
            self._assert_all_alarms('ok')

    def test_simple_insufficient(self):
        """No datapoints moves a known state to insufficient data."""
        self._set_all_alarms('ok')
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            self.api_client.statistics.list.return_value = []
            self.evaluator.evaluate()
            self._assert_all_alarms('insufficient data')
            expected = [mock.call(alarm.alarm_id, state='insufficient data')
                        for alarm in self.alarms]
            update_calls = self.api_client.alarms.update.call_args_list
            self.assertEqual(update_calls, expected)
            expected = [mock.call(alarm,
                                  'insufficient data',
                                  ('%d datapoints are unknown' %
                                   alarm.evaluation_periods))
                        for alarm in self.alarms]
            self.assertEqual(self.notifier.notify.call_args_list, expected)

    def test_disabled_is_skipped(self):
        """A disabled alarm is neither updated nor notified."""
        self._set_all_alarms('ok')
        self.alarms[1].enabled = False
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            self.api_client.statistics.list.return_value = []
            self.evaluator.evaluate()
            self.assertEqual(self.alarms[0].state, 'insufficient data')
            self.assertEqual(self.alarms[1].state, 'ok')
            self.api_client.alarms.update.assert_called_once_with(
                self.alarms[0].alarm_id,
                state='insufficient data'
            )
            self.notifier.notify.assert_called_once_with(
                self.alarms[0],
                'insufficient data',
                mock.ANY
            )

    def test_simple_alarm_trip(self):
        """All datapoints breaching the threshold trips the alarm."""
        self._set_all_alarms('ok')
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            avgs = [self._get_stat('avg', self.alarms[0].threshold + v)
                    for v in xrange(1, 6)]
            maxs = [self._get_stat('max', self.alarms[1].threshold - v)
                    for v in xrange(4)]
            self.api_client.statistics.list.side_effect = [avgs, maxs]
            self.evaluator.evaluate()
            self._assert_all_alarms('alarm')
            expected = [mock.call(alarm.alarm_id, state='alarm')
                        for alarm in self.alarms]
            update_calls = self.api_client.alarms.update.call_args_list
            self.assertEqual(update_calls, expected)
            reasons = ['Transition to alarm due to 5 samples outside'
                       ' threshold, most recent: 85.0',
                       'Transition to alarm due to 4 samples outside'
                       ' threshold, most recent: 7.0']
            expected = [mock.call(alarm, 'alarm', reason)
                        for alarm, reason in zip(self.alarms, reasons)]
            self.assertEqual(self.notifier.notify.call_args_list, expected)

    def test_simple_alarm_clear(self):
        """No datapoint breaching the threshold clears the alarm."""
        self._set_all_alarms('alarm')
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            avgs = [self._get_stat('avg', self.alarms[0].threshold - v)
                    for v in xrange(5)]
            maxs = [self._get_stat('max', self.alarms[1].threshold + v)
                    for v in xrange(1, 5)]
            self.api_client.statistics.list.side_effect = [avgs, maxs]
            self.evaluator.evaluate()
            self._assert_all_alarms('ok')
            expected = [mock.call(alarm.alarm_id, state='ok')
                        for alarm in self.alarms]
            update_calls = self.api_client.alarms.update.call_args_list
            self.assertEqual(update_calls, expected)
            reasons = ['Transition to ok due to 5 samples inside'
                       ' threshold, most recent: 76.0',
                       'Transition to ok due to 4 samples inside'
                       ' threshold, most recent: 14.0']
            expected = [mock.call(alarm, 'ok', reason)
                        for alarm, reason in zip(self.alarms, reasons)]
            self.assertEqual(self.notifier.notify.call_args_list, expected)

    def test_equivocal_from_known_state(self):
        """Mixed datapoints leave a known state untouched."""
        self._set_all_alarms('ok')
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            # in each series the first datapoint sits inside the
            # threshold while the rest breach it
            avgs = [self._get_stat('avg', self.alarms[0].threshold + v)
                    for v in xrange(5)]
            maxs = [self._get_stat('max', self.alarms[1].threshold - v)
                    for v in xrange(-1, 3)]
            self.api_client.statistics.list.side_effect = [avgs, maxs]
            self.evaluator.evaluate()
            self._assert_all_alarms('ok')
            self.assertEqual(self.api_client.alarms.update.call_args_list,
                             [])
            self.assertEqual(self.notifier.notify.call_args_list, [])

    def test_equivocal_from_unknown(self):
        """Mixed datapoints from unknown follow the most recent one."""
        self._set_all_alarms('insufficient data')
        with mock.patch('ceilometerclient.client.get_client',
                        return_value=self.api_client):
            # genuinely equivocal series: 80.0 is not > 80.0 and 11.0 is
            # not <= 10.0, whereas every later datapoint (including the
            # most recent) breaches its threshold
            avgs = [self._get_stat('avg', self.alarms[0].threshold + v)
                    for v in xrange(5)]
            maxs = [self._get_stat('max', self.alarms[1].threshold - v)
                    for v in xrange(-1, 3)]
            self.api_client.statistics.list.side_effect = [avgs, maxs]
            self.evaluator.evaluate()
            self._assert_all_alarms('alarm')
            expected = [mock.call(alarm.alarm_id, state='alarm')
                        for alarm in self.alarms]
            update_calls = self.api_client.alarms.update.call_args_list
            self.assertEqual(update_calls, expected)
            reasons = ['Transition to alarm due to 5 samples outside'
                       ' threshold, most recent: 84.0',
                       'Transition to alarm due to 4 samples outside'
                       ' threshold, most recent: 8.0']
            expected = [mock.call(alarm, 'alarm', reason)
                        for alarm, reason in zip(self.alarms, reasons)]
            self.assertEqual(self.notifier.notify.call_args_list, expected)