329 lines
13 KiB
Python
Executable File
329 lines
13 KiB
Python
Executable File
# Copyright 2015 Carnegie Mellon University
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
|
|
|
|
import collections
|
|
import copy
|
|
import json
|
|
from oslo_log import log
|
|
import uuid
|
|
|
|
from kiloeyes.common import alarm_expr_calculator as calculator
|
|
from kiloeyes.common import alarm_expr_parser as parser
|
|
from kiloeyes.common import timeutils as tu
|
|
|
|
# Module-level logger obtained from oslo_log, named after this module.
LOG = log.getLogger(__name__)
|
|
|
|
# Reason text attached to a built alarm, looked up by the alarm's state.
reasons = {
    'ALARM': 'The alarm threshold(s) have been exceeded for the sub-alarms',
    'OK': 'The alarm threshold(s) have not been exceeded for the sub-alarms',
    'UNDETERMINED': 'Unable to determine the alarm state',
}
|
|
|
|
|
|
class ThresholdProcessor(object):
    """Threshold processor for a single alarm definition.

    This processor is for alarm definitions with short period.
    It stores the metrics value/timestamp in memory using dicts.

    The basic data structure (``self.expr_data_queue``) is:
        ALL_DATA = {#match_by name#: ALARM_DATA}
    The name is built by ``get_matched_data_queue_name``: the metric's
    value for every ``match_by`` dimension, each followed by a comma.
    When the definition has no ``match_by``, one entry keyed by ``None``
    is used instead.
    For example, an alarm def has "match_by": ["hostname", "os"]
    Metrics come in:
        Metrics_A -> 'dimensions': {'hostname': 'A', 'os': 'windows'}
        Metrics_B -> 'dimensions': {'hostname': 'B', 'os': 'unix'}
    Then, ALL_DATA = {'A,windows,': ALARM_DATA, 'B,unix,': ALARM_DATA}

    ALARM_DATA = {'state': #alarm state#,
                  'create_timestamp': #timestamp#,
                  'update_timestamp': #timestamp#,
                  'state_update_timestamp': #timestamp#,
                  'data': SUB_ALARM_DATA}
    It holds the overall info of the alarm,
    like state, timestamps, and metrics data.

    SUB_ALARM_DATA = {#sub alarm expr#: METRICS}
    with one entry per sub expression, keyed by the sub expression's
    ``fmtd_sub_expr_str``.

    METRICS = {'state': S,
               'metrics': deque([{'value': X, 'timestamp': T}, ...]),
               'values': [V, ...]}
    Other key/values in a metrics will not be stored here.
    The state here is the state of this sub_alarm; 'values' holds the
    per-period values computed by the last ``update_sub_expr_state``,
    most recent period first.
    """

    def __init__(self, alarm_def):
        """One processor instance holds one alarm definition.

        :param alarm_def: mapping with at least the 'expression' and
            'match_by' keys ('name' is read when logging).
        """
        LOG.debug('initializing ThresholdProcessor!')
        super(ThresholdProcessor, self).__init__()
        self.alarm_definition = alarm_def
        self.expression = self.alarm_definition['expression']
        # Same normalisation as update_thresh_processor, so a definition
        # behaves the same whether it was created or updated.
        self.match_by = self._normalize_match_by(
            self.alarm_definition['match_by'])
        self.expr_data_queue = {}
        self.related_metrics = {}
        alarm_parser = parser.AlarmExprParser(self.expression)
        self.parse_result = alarm_parser.parse_result
        self.sub_expr_list = alarm_parser.sub_expr_list
        # The None key holds the metrics template that is specialised
        # per match_by name in get_matched_data_queue_name.
        self.related_metrics[None] = alarm_parser.related_metrics
        self.sub_alarm_expr = alarm_parser.sub_alarm_expressions
        LOG.debug('successfully initialize ThresholdProcessor!')

    @staticmethod
    def _normalize_match_by(match_by):
        """Return match_by without empty names, or None if nothing is left.

        The caller's list is not modified.
        """
        cleaned = [m for m in (match_by or []) if m != '']
        return cleaned or None

    def _inherit_expr_data(self, new_sub_expr_list):
        """Build a data queue for new sub expressions from the current one.

        Every existing entry keeps its creation and state-update timestamps
        and its stored metrics; states are reset to 'UNDETERMINED'.
        Sub expressions are paired with the current ones by position, so
        an IndexError is raised when the new expression has more sub
        expressions than the current one.
        """
        inherited = {}
        for name, old_entry in self.expr_data_queue.items():
            ts = tu.utcnow_ts()
            sub_data = {}
            for idx, new_expr in enumerate(new_sub_expr_list):
                old_key = self.sub_expr_list[idx].fmtd_sub_expr_str
                sub_data[new_expr.fmtd_sub_expr_str] = {
                    'state': 'UNDETERMINED',
                    # inherit previous stored metrics values
                    'metrics': old_entry['data'][old_key]['metrics'],
                    'values': []}
            inherited[name] = {
                'data': sub_data,
                'state': 'UNDETERMINED',
                'create_timestamp': old_entry['create_timestamp'],
                'update_timestamp': ts,
                'state_update_timestamp':
                    old_entry['state_update_timestamp']}
        return inherited

    def update_thresh_processor(self, alarm_def):
        """Update the processor with updated alarm definition.

        Everything derived from the new definition is computed before any
        attribute is replaced, so the processor is left untouched when the
        new definition cannot be applied (the exception propagates).

        :param alarm_def: the new alarm definition mapping.
        :returns: True once the processor has been updated.
        """
        LOG.debug('update ThresholdProcessor!')
        new_expression = alarm_def['expression']
        alarm_parser = parser.AlarmExprParser(new_expression)
        new_sub_expr_list = alarm_parser.sub_expr_list
        new_match_by = self._normalize_match_by(alarm_def['match_by'])
        new_expr_data_queue = self._inherit_expr_data(new_sub_expr_list)

        self.expr_data_queue = new_expr_data_queue
        self.sub_expr_list = new_sub_expr_list
        self.sub_alarm_expr = alarm_parser.sub_alarm_expressions
        self.parse_result = alarm_parser.parse_result
        self.alarm_definition = alarm_def
        self.expression = new_expression
        self.match_by = new_match_by
        LOG.debug('successfully update ThresholdProcessor!')
        return True

    def process_metrics(self, metrics):
        """Add new metrics to matched expr.

        :param metrics: JSON text describing one metric.

        Any failure while decoding or storing the metric is logged and
        swallowed; nothing is returned.
        """
        try:
            data = json.loads(metrics)
            self.add_expr_metrics(data)
        except Exception:
            LOG.exception('Received a wrong format metrics')

    def process_alarms(self):
        """Called to produce alarms.

        :returns: list of alarm JSON strings, one for every stored entry
            whose state changed; an empty list if anything fails (the
            failure is logged).
        """
        try:
            alarm_list = []
            # Iterate over a snapshot of the names so entries added to the
            # queue in the meantime cannot break the iteration.
            for name in list(self.expr_data_queue):
                if self.update_state(self.expr_data_queue[name]):
                    alarm_list.append(self.build_alarm(name))
            return alarm_list
        except Exception:
            LOG.exception('process alarms error')
            return []

    def update_state(self, expr_data):
        """Update the state of each alarm under this alarm definition.

        :param expr_data: one ALARM_DATA entry of the data queue.
        :returns: True if the overall state changed, False otherwise.
        """
        def _calc_state(operand):
            # Logic nodes combine their children; leaves read the state
            # stored for their sub expression.
            if operand.logic_operator:
                subs = [_calc_state(o) for o in operand.sub_expr_list]
                return calculator.calc_logic(operand.logic_operator, subs)
            return expr_data['data'][operand.fmtd_sub_expr_str]['state']

        for sub_expr in self.sub_expr_list:
            self.update_sub_expr_state(sub_expr, expr_data)
        state_new = _calc_state(self.parse_result)
        if state_new == expr_data['state']:
            return False
        # One clock reading so both timestamps of the change agree.
        ts = tu.utcnow_ts()
        expr_data['state_update_timestamp'] = ts
        expr_data['update_timestamp'] = ts
        expr_data['state'] = state_new
        return True

    def update_sub_expr_state(self, expr, expr_data):
        """Recompute the per-period values and state of one sub expression.

        Stored metrics older than ``period * periods`` are dropped, the
        remaining ones are grouped into windows of ``period`` counted back
        from now, and the resulting values (most recent window first) are
        stored under 'values' and compared against the threshold to give
        the new 'state'.

        :param expr: the sub expression to evaluate.
        :param expr_data: the ALARM_DATA entry holding its metrics.
        """
        t_now = tu.utcnow_ts()
        period = float(expr.period)
        periods = int(expr.periods)
        func = expr.normalized_func
        data_sub = expr_data['data'][expr.fmtd_sub_expr_str]
        data_list = data_sub['metrics']

        # Delete metrics not in period.
        start_time = t_now - period * periods
        while data_list and data_list[0]['timestamp'] < start_time:
            data_list.popleft()

        # Walk the metrics from newest to oldest, filling one window at a
        # time.
        left = t_now - period
        window = []
        value_in_periods = []
        idx = len(data_list) - 1
        while idx >= 0:
            point = data_list[idx]
            if point['timestamp'] >= left:
                window.append(float(point['value']))
                idx -= 1
            else:
                # Close the current window and test the same metric
                # against the next, older, window.
                value_in_periods.append(calculator.calc_value(func, window))
                left -= period
                window = []
        value_in_periods.append(calculator.calc_value(func, window))

        # Pad with the value of an empty window up to ``periods`` entries.
        for _ in range(len(value_in_periods), periods):
            value_in_periods.append(calculator.calc_value(func, []))

        data_sub['values'] = value_in_periods
        data_sub['state'] = calculator.compare_thresh(
            value_in_periods,
            expr.normalized_operator,
            float(expr.threshold))

    def add_expr_metrics(self, data):
        """Add new metrics to matched place.

        :param data: decoded metric (a dict with 'name', 'value' and,
            optionally, 'dimensions').
        """
        for sub_expr in self.sub_expr_list:
            self.add_sub_expr_metrics(sub_expr, data)

    def add_sub_expr_metrics(self, expr, data):
        """Add new metrics to sub expr place.

        The metric is stored when its name and the dimensions required by
        the sub expression match (case-insensitively) and, for definitions
        with ``match_by``, when it carries every match_by dimension.

        :param expr: the sub expression to test the metric against.
        :param data: decoded metric dict.
        """
        def _has_match_expr():
            if data['name'].lower() != expr.normalized_metric_name:
                return False
            metrics_dimensions = data.get('dimensions') or {}
            def_dimensions = expr.dimensions_as_dict
            for key, wanted in def_dimensions.items():
                if key not in metrics_dimensions:
                    return False
                if metrics_dimensions[key].lower() != wanted.lower():
                    return False
            return True

        def _add_metrics():
            if self.match_by:
                q_name = self.get_matched_data_queue_name(data)
                if not q_name:
                    return False
                entry = self.expr_data_queue[q_name]
            else:
                if None not in self.expr_data_queue:
                    self.create_data_item(None)
                entry = self.expr_data_queue[None]
            # The stored timestamp is the time of arrival, not a timestamp
            # taken from the metric itself.
            entry['data'][expr.fmtd_sub_expr_str]['metrics'].append(
                {'value': float(data['value']),
                 'timestamp': tu.utcnow_ts()})
            return True

        if _has_match_expr() and _add_metrics():
            LOG.debug("Alarm def: %s consumes the metrics!",
                      self.alarm_definition['name'])
        else:
            LOG.debug("Alarm def: %s don't need the metrics!",
                      self.alarm_definition['name'])

    def create_data_item(self, name):
        """If new match_up tuple, create new entry to store metrics value.

        :param name: data queue name (None for definitions without
            match_by). An existing entry with that name is replaced.
        """
        ts = tu.utcnow_ts()
        sub_data = {}
        for expr in self.sub_expr_list:
            sub_data[expr.fmtd_sub_expr_str] = {
                'state': 'UNDETERMINED',
                'metrics': collections.deque(),
                'values': []}
        self.expr_data_queue[name] = {
            'data': sub_data,
            'state': 'UNDETERMINED',
            'create_timestamp': ts,
            'update_timestamp': ts,
            'state_update_timestamp': ts}

    def get_matched_data_queue_name(self, data):
        """Use dimensions in match_up to generate a name.

        The name is the metric's value for every match_by dimension, each
        followed by a comma. The first time a name is seen, its data entry
        and its related metrics (the template metrics with the match_by
        dimensions filled in) are created.

        :param data: decoded metric dict.
        :returns: the queue name, or None when the metric lacks one of the
            match_by dimensions (including when it has no dimensions).
        """
        dimensions = data.get('dimensions') or {}
        if any(m not in dimensions for m in self.match_by):
            return None
        name = ''.join(dimensions[m] + ',' for m in self.match_by)
        if name in self.expr_data_queue:
            return name

        related = []
        for metric in self.related_metrics[None]:
            # Copy so the shared template is never modified.
            specialised = copy.deepcopy(metric)
            for match in self.match_by:
                specialised['dimensions'][match] = dimensions[match]
            related.append(specialised)
        self.related_metrics[name] = related
        self.create_data_item(name)
        return name

    def build_alarm(self, name):
        """Build alarm json.

        :param name: data queue name of the alarm to report.
        :returns: JSON text describing the alarm; a fresh UUID is used as
            the alarm id on every call.
        """
        entry = self.expr_data_queue[name]
        sub_data = entry['data']
        alarm = {}
        alarm['id'] = str(uuid.uuid4())
        alarm['alarm_definition'] = self.alarm_definition
        alarm['metrics'] = self.related_metrics[name]
        alarm['state'] = entry['state']
        alarm['reason'] = reasons[alarm['state']]
        alarm['reason_data'] = {}
        alarm['sub_alarms'] = [
            {'sub_alarm_expression':
                self.sub_alarm_expr[expr.fmtd_sub_expr_str],
             'sub_alarm_state': sub_data[expr.fmtd_sub_expr_str]['state'],
             'current_values': sub_data[expr.fmtd_sub_expr_str]['values']}
            for expr in self.sub_expr_list]
        alarm['state_updated_timestamp'] = tu.iso8601_from_timestamp(
            entry['state_update_timestamp'])
        alarm['updated_timestamp'] = tu.iso8601_from_timestamp(
            entry['update_timestamp'])
        alarm['created_timestamp'] = tu.iso8601_from_timestamp(
            entry['create_timestamp'])
        return json.dumps(alarm)
|