kiloeyes/kiloeyes/microservice/threshold_processor.py

329 lines
13 KiB
Python
Executable File

# Copyright 2015 Carnegie Mellon University
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
import copy
import json
from oslo_log import log
import uuid
from kiloeyes.common import alarm_expr_calculator as calculator
from kiloeyes.common import alarm_expr_parser as parser
from kiloeyes.common import timeutils as tu
# Module-level logger, named after this module.
LOG = log.getLogger(__name__)
# Human-readable explanation for each overall alarm state; looked up by
# state name when an alarm message is built.
reasons = {
    'ALARM': ('The alarm threshold(s) have been exceeded '
              'for the sub-alarms'),
    'OK': ('The alarm threshold(s) have not been exceeded '
           'for the sub-alarms'),
    'UNDETERMINED': 'Unable to determine the alarm state',
}
class ThresholdProcessor(object):
    """Threshold processor for a single alarm definition.

    This processor is for alarm definitions with a short period: every
    metric value it needs is kept in memory, in nested dicts.

    ``self.expr_data_queue`` maps a queue name to ALARM_DATA.  The queue
    name is the concatenation of the metric's values for the ``match_by``
    dimensions, each followed by a comma.  When the definition has no
    ``match_by`` there is a single queue stored under the key ``None``.

    For example, an alarm def has "match_by": ["hostname", "os"]
    Metrics come in:
        Metrics_A -> 'dimensions': {'hostname': 'A', 'os': 'windows'}
        Metrics_B -> 'dimensions': {'hostname': 'B', 'os': 'unix'}
    Then:
        expr_data_queue = {'A,windows,': ALARM_DATA,
                           'B,unix,': ALARM_DATA}

    ALARM_DATA = {'data': SUB_ALARM_DATA,
                  'state': #alarm state#,
                  'create_timestamp': #timestamp#,
                  'update_timestamp': #timestamp#,
                  'state_update_timestamp': #timestamp#}
    It holds the overall info of the alarm: state, timestamps and the
    per sub alarm data.

    SUB_ALARM_DATA = {#sub alarm expr#: METRICS}
    The key is the ``fmtd_sub_expr_str`` of each parsed sub expression,
    e.g. for the alarm expr 'max(cpu)>10 and avg(memory)<10' there is one
    entry for 'max(cpu)>10' and one for 'avg(memory)<10'.

    METRICS = {'state': S,
               'metrics': deque([{'value': X, 'timestamp': T}, ...]),
               'values': [V, ...]}
    Other key/values of an incoming metric are not stored here.  'state'
    is the state of this sub alarm and 'values' holds the value computed
    for each period at the last evaluation.
    """

    def __init__(self, alarm_def):
        """One processor instance holds one alarm definition.

        :param alarm_def: alarm definition dict; 'expression' and
            'match_by' are read here, 'name' is read when logging.
        """
        LOG.debug('initializing ThresholdProcessor!')
        super(ThresholdProcessor, self).__init__()
        self.alarm_definition = alarm_def
        self.expression = self.alarm_definition['expression']
        self.match_by = self._normalize_match_by(
            self.alarm_definition['match_by'])
        self.expr_data_queue = {}
        self.related_metrics = {}
        alarm_parser = parser.AlarmExprParser(self.expression)
        self.parse_result = alarm_parser.parse_result
        self.sub_expr_list = alarm_parser.sub_expr_list
        # Metrics of the bare expression; per queue copies are derived
        # from this entry in get_matched_data_queue_name().
        self.related_metrics[None] = alarm_parser.related_metrics
        self.sub_alarm_expr = alarm_parser.sub_alarm_expressions
        LOG.debug('successfully initialize ThresholdProcessor!')

    @staticmethod
    def _normalize_match_by(match_by):
        """Return match_by without empty names, or None if none is left.

        A new list is returned so the alarm definition passed in by the
        caller is never modified.
        """
        cleaned = [m for m in (match_by or []) if m != '']
        return cleaned or None

    def update_thresh_processor(self, alarm_def):
        """Update the processor with an updated alarm definition.

        Every existing queue is rebuilt for the new expression with all
        states reset to 'UNDETERMINED'.  Stored samples are inherited by
        position: the i-th new sub expression takes over the samples of
        the i-th old one.  If the new expression has more sub expressions
        than the old one, IndexError is raised before the processor is
        modified.

        :param alarm_def: the updated alarm definition dict.
        :returns: True
        """
        LOG.debug('update ThresholdProcessor!')
        # Compute everything first so that a failure below leaves the
        # processor exactly as it was.
        new_expression = alarm_def['expression']
        alarm_parser = parser.AlarmExprParser(new_expression)
        new_sub_expr_list = alarm_parser.sub_expr_list
        new_match_by = self._normalize_match_by(alarm_def['match_by'])

        new_expr_data_queue = {}
        for name, old_item in self.expr_data_queue.items():
            ts = tu.utcnow_ts()
            sub_data = {}
            for i, new_expr in enumerate(new_sub_expr_list):
                old_key = self.sub_expr_list[i].fmtd_sub_expr_str
                sub_data[new_expr.fmtd_sub_expr_str] = {
                    'state': 'UNDETERMINED',
                    # inherit previously stored metrics values
                    'metrics': old_item['data'][old_key]['metrics'],
                    'values': []}
            new_expr_data_queue[name] = {
                'data': sub_data,
                'state': 'UNDETERMINED',
                'create_timestamp': old_item['create_timestamp'],
                'update_timestamp': ts,
                'state_update_timestamp':
                    old_item['state_update_timestamp']}

        # NOTE(review): self.related_metrics is left untouched by an
        # update -- confirm that this is intended.
        self.expr_data_queue = new_expr_data_queue
        self.sub_expr_list = new_sub_expr_list
        self.sub_alarm_expr = alarm_parser.sub_alarm_expressions
        self.parse_result = alarm_parser.parse_result
        self.alarm_definition = alarm_def
        self.expression = new_expression
        self.match_by = new_match_by
        LOG.debug('successfully update ThresholdProcessor!')
        return True

    def process_metrics(self, metrics):
        """Add new metrics to matched expr.

        :param metrics: JSON text of one metric.  Any error while
            decoding or storing it is logged and swallowed.
        """
        try:
            data = json.loads(metrics)
            self.add_expr_metrics(data)
        except Exception:
            LOG.exception('Received a wrong format metrics')

    def process_alarms(self):
        """Called to produce alarms.

        :returns: list with one alarm JSON string for every queue whose
            overall state changed; an empty list if evaluation failed.
        """
        try:
            alarm_list = []
            for name, expr_data in self.expr_data_queue.items():
                if self.update_state(expr_data):
                    alarm_list.append(self.build_alarm(name))
            return alarm_list
        except Exception:
            LOG.exception('process metrics error')
            return []

    def update_state(self, expr_data):
        """Update the state of each alarm under this alarm definition.

        :param expr_data: one ALARM_DATA dict; updated in place.
        :returns: True if the overall state changed, False otherwise.
        """
        def _calc_state(operand):
            # Inner nodes combine their children with the logic operator,
            # leaves read the state stored for their sub expression.
            if operand.logic_operator:
                subs = [_calc_state(o) for o in operand.sub_expr_list]
                return calculator.calc_logic(operand.logic_operator, subs)
            return expr_data['data'][operand.fmtd_sub_expr_str]['state']

        for sub_expr in self.sub_expr_list:
            self.update_sub_expr_state(sub_expr, expr_data)
        state_new = _calc_state(self.parse_result)
        if state_new == expr_data['state']:
            return False
        # One clock reading so both timestamps of a transition agree.
        now = tu.utcnow_ts()
        expr_data['state_update_timestamp'] = now
        expr_data['update_timestamp'] = now
        expr_data['state'] = state_new
        return True

    def update_sub_expr_state(self, expr, expr_data):
        """Re-evaluate one sub expression against its stored samples.

        Samples older than ``period * periods`` seconds are dropped, the
        rest are grouped into periods counted back from now, one value is
        computed per period and the values are compared to the threshold.

        :param expr: parsed sub expression.
        :param expr_data: one ALARM_DATA dict; the entry of this sub
            expression gets new 'values' and 'state'.
        """
        t_now = tu.utcnow_ts()
        period = float(expr.period)
        periods = int(expr.periods)
        sub_data = expr_data['data'][expr.fmtd_sub_expr_str]
        samples = sub_data['metrics']

        # Delete metrics not in period.
        start_time = t_now - period * periods
        while samples and samples[0]['timestamp'] < start_time:
            samples.popleft()

        # Walk from the newest sample to the oldest.  Iterating the deque
        # avoids indexing into its middle, which is linear per access.
        left = t_now - period
        bucket = []
        value_in_periods = []
        for sample in reversed(samples):
            # Close every period that ends after this sample; a period
            # without samples is evaluated on an empty list.
            while sample['timestamp'] < left:
                value_in_periods.append(
                    calculator.calc_value(expr.normalized_func, bucket))
                left -= period
                bucket = []
            bucket.append(float(sample['value']))
        value_in_periods.append(
            calculator.calc_value(expr.normalized_func, bucket))

        # Pad with empty periods up to the configured number of periods.
        while len(value_in_periods) < periods:
            value_in_periods.append(
                calculator.calc_value(expr.normalized_func, []))

        sub_data['values'] = value_in_periods
        sub_data['state'] = calculator.compare_thresh(
            value_in_periods,
            expr.normalized_operator,
            float(expr.threshold))

    def add_expr_metrics(self, data):
        """Add new metrics to matched place.

        :param data: decoded metric dict.
        """
        for sub_expr in self.sub_expr_list:
            self.add_sub_expr_metrics(sub_expr, data)

    def add_sub_expr_metrics(self, expr, data):
        """Add new metrics to sub expr place.

        The metric is stored when its name equals the sub expression's
        metric name and it carries every dimension of the sub expression
        with the same value (both compared case-insensitively).

        :param expr: parsed sub expression.
        :param data: decoded metric dict with 'name', 'value' and
            optionally 'dimensions'.
        """
        def _has_match_expr():
            if data['name'].lower() != expr.normalized_metric_name:
                return False
            metrics_dimensions = data.get('dimensions', {})
            for key, wanted in expr.dimensions_as_dict.items():
                if key not in metrics_dimensions:
                    return False
                if metrics_dimensions[key].lower() != wanted.lower():
                    return False
            return True

        def _add_metrics():
            if self.match_by:
                q_name = self.get_matched_data_queue_name(data)
                if not q_name:
                    # metric lacks one of the match_by dimensions
                    return False
            else:
                q_name = None
                if None not in self.expr_data_queue:
                    self.create_data_item(None)
            sub_data = self.expr_data_queue[q_name]['data']
            # The sample is stamped with the arrival time.
            sub_data[expr.fmtd_sub_expr_str]['metrics'].append(
                {'value': float(data['value']),
                 'timestamp': tu.utcnow_ts()})
            return True

        if _has_match_expr() and _add_metrics():
            LOG.debug("Alarm def: %s consumes the metrics!",
                      self.alarm_definition['name'])
        else:
            LOG.debug("Alarm def: %s don't need the metrics!",
                      self.alarm_definition['name'])

    def create_data_item(self, name):
        """If new match_up tuple, create new entry to store metrics value.

        :param name: queue name, or None for the single unnamed queue.
        """
        ts = tu.utcnow_ts()
        sub_data = {}
        for expr in self.sub_expr_list:
            sub_data[expr.fmtd_sub_expr_str] = {
                'state': 'UNDETERMINED',
                'metrics': collections.deque(),
                'values': []}
        self.expr_data_queue[name] = {
            'data': sub_data,
            'state': 'UNDETERMINED',
            'create_timestamp': ts,
            'update_timestamp': ts,
            'state_update_timestamp': ts}

    def get_matched_data_queue_name(self, data):
        """Use dimensions in match_up to generate a name.

        The first time a name is seen its queue is created, together with
        a copy of the related metrics carrying the match_by dimension
        values of this metric.

        :param data: decoded metric dict.
        :returns: the queue name, or None when the metric has no
            dimensions or lacks one of the match_by dimensions.
        """
        # A metric without dimensions simply matches nothing.
        dimensions = data.get('dimensions') or {}
        if any(m not in dimensions for m in self.match_by):
            return None
        name = ''.join(dimensions[m] + ',' for m in self.match_by)
        if name not in self.expr_data_queue:
            related = []
            for metric in self.related_metrics[None]:
                temp = copy.deepcopy(metric)
                for match in self.match_by:
                    temp['dimensions'][match] = dimensions[match]
                related.append(temp)
            self.related_metrics[name] = related
            self.create_data_item(name)
        return name

    def build_alarm(self, name):
        """Build alarm json.

        :param name: queue name the alarm is built for.
        :returns: JSON string describing the alarm; it gets a fresh
            random id on every call.
        """
        item = self.expr_data_queue[name]
        alarm = {}
        alarm_id = str(uuid.uuid4())
        alarm['id'] = alarm_id
        alarm['alarm_definition'] = self.alarm_definition
        alarm['metrics'] = self.related_metrics[name]
        alarm['state'] = item['state']
        alarm['reason'] = reasons[alarm['state']]
        alarm['reason_data'] = {}
        dt = item['data']
        alarm['sub_alarms'] = [
            {'sub_alarm_expression':
                self.sub_alarm_expr[expr.fmtd_sub_expr_str],
             'sub_alarm_state': dt[expr.fmtd_sub_expr_str]['state'],
             'current_values': dt[expr.fmtd_sub_expr_str]['values']}
            for expr in self.sub_expr_list]
        alarm['state_updated_timestamp'] = tu.iso8601_from_timestamp(
            item['state_update_timestamp'])
        alarm['updated_timestamp'] = tu.iso8601_from_timestamp(
            item['update_timestamp'])
        alarm['created_timestamp'] = tu.iso8601_from_timestamp(
            item['create_timestamp'])
        return json.dumps(alarm)