Add the AFD framework for threshold alarms

This patch provides Lua libraries to evaluate metrics against thresholds.
The AFD evaluates a list of alarms, with an alarm defined like the
following:

name: 'fs-warning'
description: 'Filesystem usage'
severity: 'warning'
trigger:
  logical_operator: 'or'
  rules:
    - metric: fs_space_percent_free
      fields:
        fs: '*'
      relational_operator: '<'
      threshold: 5
      window: 60
      periods: 1
      function: avg

where:
- *name* is required and must be unique,
- *description* is required,
- *severity* is one of 'okay', 'warning', 'critical', 'down', 'unknown'
  (this implementation only accepts 'warning', 'critical' and 'down'),
- *logical_operator* optional (can be 'or' or 'and', default 'or')
- *metric*, *relational_operator*, *threshold*, *window* and *function*
  are required,
- *fields* is optional

The AFD evaluates the alarms in the specified order and stops the
evaluation at the first triggered alarm.

This implementation doesn't support the full specification yet. The
current limitations are:

 - the supported aggregation functions are max, min, avg, sum, sd and
   variance; last, median, mww and mww_nonparametric are not supported.
 - the *periods* rule parameter is supported for these functions only in the
   sense that thresholds are compared over the entire interval
   "window * periods", not between each period. In other words, a rule with
   'window=300/periods=1|0' is equivalent to one with 'window=100/periods=3'.

Change-Id: Ia739ceb080971e3b7bb5a2212275d2a15d65d3e9
This commit is contained in:
Swann Croiset 2015-09-18 18:07:47 +02:00
parent fe40230efb
commit 933311a72b
8 changed files with 1153 additions and 2 deletions

View File

@ -3,3 +3,4 @@ spec/fixtures/manifests/*
Gemfile.lock Gemfile.lock
tests/lua/mocks/date_time.lua tests/lua/mocks/date_time.lua
.bundle .bundle
tests/lua/mocks/inspect.lua

View File

@ -26,7 +26,7 @@ task :test => [
LUA_TESTS = Rake::FileList.new("tests/**/test_*.lua") LUA_TESTS = Rake::FileList.new("tests/**/test_*.lua")
desc "Run Lua unit tests." desc "Run Lua unit tests."
task :lua_tests => [:lua_5_1, :lua_libraries] do |t| task :lua_tests => [:lua_5_1, :lua_libraries, :lib_cbuf] do |t|
LUA_TESTS.each do |f| LUA_TESTS.each do |f|
sh "lua5.1 #{f}" sh "lua5.1 #{f}"
end end
@ -42,7 +42,7 @@ end
# Need to pull date_time.lua from the lua_sandbox repository because some tests # Need to pull date_time.lua from the lua_sandbox repository because some tests
# depend on it indirectly # depend on it indirectly
task :lua_libraries => ['tests/lua/mocks/date_time.lua'] task :lua_libraries => ['tests/lua/mocks/date_time.lua', 'tests/lua/mocks/inspect.lua']
file 'tests/lua/mocks/date_time.lua' do |t| file 'tests/lua/mocks/date_time.lua' do |t|
sh "curl", "-s", "-o", t.name, "https://raw.githubusercontent.com/mozilla-services/lua_sandbox/dev/modules/date_time.lua" do |ok, res| sh "curl", "-s", "-o", t.name, "https://raw.githubusercontent.com/mozilla-services/lua_sandbox/dev/modules/date_time.lua" do |ok, res|
@ -51,3 +51,27 @@ file 'tests/lua/mocks/date_time.lua' do |t|
end end
end end
end end
# Download inspect.lua from its GitHub repository into the Lua mocks directory.
# curl is run with -f so that an HTTP error (e.g. 404) makes the command fail
# instead of silently saving the error page as inspect.lua.
file 'tests/lua/mocks/inspect.lua' do |t|
  sh "curl", "-s", "-f", "-o", t.name, "https://raw.githubusercontent.com/kikito/inspect.lua/master/inspect.lua" do |ok, res|
    if ! ok then
      raise "Fail to download inspect.lua from github repository!"
    end
  end
end
# Need to pull circular_buffer library from mozilla Github and compile it
task :lib_cbuf => ['./circular_buffer.so']
# Build circular_buffer.so: clone lua_circular_buffer under /tmp, compile it
# with cmake/make in release mode, then copy the shared library into the
# current directory.
file './circular_buffer.so' do |t|
  sh %{cd /tmp && rm -rf lua_circular_buffer && git clone https://github.com/mozilla-services/lua_circular_buffer.git && cd lua_circular_buffer && mkdir release && cd release && cmake -DCMAKE_BUILD_TYPE=release .. && make} do |ok, res|
    if ! ok then
      raise "failed to compile circular_buffer!"
    end
  end
  sh %{cp /tmp/lua_circular_buffer/release/circular_buffer.so .} do |ok, res|
    if ! ok then
      # This step only copies the library, so report a copy failure.
      raise "failed to copy circular_buffer.so!"
    end
  end
end

View File

@ -133,4 +133,9 @@ function inject_afd_service_metric(service, value, hostname, interval, source)
inject_message(msg) inject_message(msg)
end end
-- Result codes of a rule evaluation.
-- The computed value satisfies the threshold comparison.
MATCH = 1
-- The computed value doesn't satisfy the threshold comparison.
NO_MATCH = 2
-- No datapoint has ever been received for the rule.
NO_DATA = 3
-- No datapoint has been received within the rule's observation window.
MISSING_DATA = 4
return M return M

View File

@ -0,0 +1,190 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local assert = assert
local ipairs = ipairs
local pairs = pairs
local string = string
local setmetatable = setmetatable
-- LMA libs
local utils = require 'lma_utils'
local consts = require 'gse_constants'
local afd = require 'afd'
local Rule = require 'afd_rule'
-- Maps the lowercase severity names accepted in an alarm definition to the
-- matching gse_constants status. Any other name fails the assert in Alarm.new().
local SEVERITIES = {
warning = consts.WARN,
critical = consts.CRIT,
down = consts.DOWN,
}
local Alarm = {}
Alarm.__index = Alarm
setfenv(1, Alarm) -- Remove external access to contain everything in the module
-- Build an Alarm object from an alarm definition.
--
-- alarm: table with the keys 'name', 'description', 'severity' and 'trigger';
--   trigger.rules is the list of rule definitions handed to Rule.new() and
--   trigger.logical_operator is optional ('or' when absent).
-- Raises an error when the severity isn't one of the SEVERITIES keys.
function Alarm.new(alarm)
    local a = {}
    setmetatable(a, Alarm)
    a._metrics_list = nil
    a.alarm = alarm
    a.name = alarm.name
    a.description = alarm.description
    if alarm.trigger.logical_operator then
        a.logical_operator = string.lower(alarm.trigger.logical_operator)
    else
        a.logical_operator = 'or'
    end
    a.severity_str = string.upper(alarm.severity)
    a.severity = SEVERITIES[string.lower(alarm.severity)]
    assert(a.severity ~= nil)
    a.rules = {}
    -- Track the longest "window * periods" in seconds and convert it to
    -- nanoseconds only once, after the loop. Comparing seconds against an
    -- already converted value would let only the first rule set the wait.
    local longest_wait = 0
    for _, rule in ipairs(alarm.trigger.rules) do
        local r = Rule.new(rule)
        a.rules[#a.rules+1] = r
        local wait = r.window * r.periods
        if wait > longest_wait then
            longest_wait = wait
        end
    end
    -- delay (in nanoseconds) to wait after the start time before evaluating
    a.initial_wait = longest_wait * 1e9
    a.start_time_ns = 0
    return a
end
-- Return the list of distinct metric names used by the rules of the alarm.
-- The list is computed on the first call and cached in self._metrics_list.
function Alarm:get_metrics()
    if not self._metrics_list then
        local metrics = {}
        for _, rule in ipairs(self.rules) do
            -- look the name up in the list being built so that a metric
            -- shared by several rules is recorded only once
            if not utils.table_find(rule.metric, metrics) then
                metrics[#metrics+1] = rule.metric
            end
        end
        self._metrics_list = metrics
    end
    return self._metrics_list
end
-- Collect the field names declared by every rule watching metric_name.
-- A name appears once per rule declaring it, so duplicates are possible.
function Alarm:get_metric_fields(metric_name)
    local names = {}
    local count = 0
    for _, rule in ipairs(self.rules) do
        if rule.metric == metric_name then
            for field_name in pairs(rule.fields) do
                count = count + 1
                names[count] = field_name
            end
        end
    end
    return names
end
-- Return true when at least one rule of the alarm uses the metric, else false.
function Alarm:has_metric(metric)
    local found = utils.table_find(metric, self:get_metrics())
    return found and true or false
end
-- Forward a datapoint to every rule of the alarm that watches the metric;
-- each rule stores it in its own datastore.
function Alarm:add_value(ts, metric, value, fields)
    for _, rule in pairs(self.rules) do
        local watched = rule.metric == metric
        if watched then
            rule:add_value(ts, value, fields)
        end
    end
end
-- Turn a fields map into a list of name/value entries, e.g.
-- {foo="bar"} --> {{name="foo", value="bar"}}
-- A nil (or false) argument yields an empty list.
local function convert_field_list(fields)
    local entries = {}
    local count = 0
    for field_name, field_value in pairs(fields or {}) do
        count = count + 1
        entries[count] = {name=field_name, value=field_value}
    end
    return entries
end
-- Evaluate every rule of the alarm at time ns (nanoseconds).
--
-- Returns the state of the alarm and a list of alert details:
--  - logical operator 'and': UNKW as soon as one rule has no or missing data,
--    the alarm severity only when ALL the rules match, OKAY otherwise.
--  - any other operator (i.e. 'or'): the alarm severity when at least one
--    rule matches, else UNKW when one rule has no or missing data, else OKAY.
--
-- The list is empty when the state is OKAY, otherwise it looks like:
-- {
--   {
--     severity = <severity string>,
--     ['function'] = <rule function>,
--     metric = <metric name>,
--     operator = <relational operator>,
--     threshold = <rule threshold>,
--     window = <rule window>,
--     periods = <rule periods>,
--     value = <current value, -1 for no data, -2 for missing data>,
--     fields = <list of {name=..., value=...}>,
--     message = <string>,
--   },
-- }
function Alarm:evaluate(ns)
    local all_alerts = {}
    local function add_alarm(rule, value, message, fields)
        all_alerts[#all_alerts+1] = {
            severity = self.severity_str,
            ['function'] = rule.fct,
            metric = rule.metric,
            operator = rule.relational_operator,
            threshold = rule.threshold,
            window = rule.window,
            periods = rule.periods,
            value = value,
            fields = fields,
            message = message
        }
    end

    local matches = 0
    local one_unknown = false
    for _, rule in ipairs(self.rules) do
        local eval, context_list = rule:evaluate(ns)
        if eval == afd.MATCH then
            matches = matches + 1
            for _, context in ipairs(context_list) do
                add_alarm(rule, context.value, self.description,
                          convert_field_list(context.fields))
            end
        elseif eval == afd.MISSING_DATA then
            local msg = 'No datapoint have been received over the last ' .. rule.observation_window .. ' seconds'
            add_alarm(rule, -2, msg, convert_field_list(rule.fields))
            one_unknown = true
        elseif eval == afd.NO_DATA then
            add_alarm(rule, -1, 'No datapoint have been received ever', convert_field_list(rule.fields))
            one_unknown = true
        end
    end

    local state = consts.OKAY
    if self.logical_operator == 'and' then
        if one_unknown then
            state = consts.UNKW
        elseif matches > 0 and matches == #self.rules then
            -- 'and' requires every rule to match, a partial match is OKAY
            state = self.severity
        end
    elseif matches > 0 then
        state = self.severity
    elseif one_unknown then
        state = consts.UNKW
    end

    if state == consts.OKAY then
        -- drop the details collected for rules that matched on their own
        all_alerts = {}
    end
    return state, all_alerts
end
-- Record ns (nanoseconds) as the time from which the initial wait is counted.
function Alarm:set_start_time(ns)
    self.start_time_ns = ns
end
-- Return true once at least initial_wait nanoseconds have elapsed between
-- the start time and ns, else false.
function Alarm:is_evaluation_time(ns)
    return ns - self.start_time_ns >= self.initial_wait
end
return Alarm

View File

@ -0,0 +1,119 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local pairs = pairs
local ipairs = ipairs
local lma = require 'lma_utils'
local consts = require 'gse_constants'
local gse = require 'gse'
local Alarm = require 'afd_alarm'
local all_alarms = {}
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Return the distinct field names that the loaded alarms declare for the metric.
function get_metric_fields(metric_name)
    local unique = {}
    for _, alarm in pairs(all_alarms) do
        local names = alarm:get_metric_fields(metric_name)
        if names then
            for _, field_name in pairs(names) do
                local known = lma.table_find(field_name, unique)
                if not known then
                    unique[#unique+1] = field_name
                end
            end
        end
    end
    return unique
end
-- Return the list of loaded alarms having at least one rule on the metric.
function get_interested_alarms(metric)
    local selected = {}
    for _, candidate in pairs(all_alarms) do
        if candidate:has_metric(metric) then
            selected[#selected+1] = candidate
        end
    end
    return selected
end
-- Hand a datapoint over to every loaded alarm interested by the metric.
function add_value(ts, metric, value, fields)
    for _, alarm in ipairs(get_interested_alarms(metric)) do
        alarm:add_value(ts, metric, value, fields)
    end
end
-- Forget all the loaded alarms by replacing the list with an empty one.
function reset_alarms()
    all_alarms = {}
end
-- Evaluate the loaded alarms, in loading order, at time ns (nanoseconds).
--
-- Alarms whose initial wait hasn't elapsed yet are skipped. The evaluation
-- stops at the first alarm that brings the global state to something other
-- than OKAY or UNKW.
--
-- Returns the global state (nil when no alarm could be evaluated) and the
-- list of collected alerts, each one being { state=<alarm state>, alert=<alert> }.
function evaluate(ns)
    local global_state
    local all_alerts = {}
    -- ipairs guarantees that the alarms are visited in loading order
    for _, alarm in ipairs(all_alarms) do
        if alarm:is_evaluation_time(ns) then
            local state, alerts = alarm:evaluate(ns)
            global_state = gse.max_status(state, global_state)
            for _, a in ipairs(alerts) do
                all_alerts[#all_alerts+1] = { state=state, alert=a }
            end
            -- raise the first triggered alarm except for OKAY/UNKW states.
            -- Both inequalities must hold: with 'or' the test is always true
            -- and the loop would stop after the first evaluated alarm.
            if global_state ~= consts.UNKW and global_state ~= consts.OKAY then
                break
            end
        end
    end
    return global_state, all_alerts
end
-- Return the list of loaded alarms (the module's own table, not a copy).
function get_alarms()
    return all_alarms
end
-- Return the first loaded alarm named alarm_name, or nothing when none matches.
function get_alarm(alarm_name)
    for _, candidate in ipairs(all_alarms) do
        local same_name = candidate.name == alarm_name
        if same_name then
            return candidate
        end
    end
end
-- Build an Alarm from its definition and append it to the loaded alarms.
function load_alarm(alarm)
    local instance = Alarm.new(alarm)
    local slot = #all_alarms + 1
    all_alarms[slot] = instance
end
-- Load a list of alarm definitions, keeping the order of the list.
function load_alarms(alarms)
    for _, definition in ipairs(alarms) do
        load_alarm(definition)
    end
end
-- Becomes true once set_start_time() has been called.
local started = false
-- Set ns as the start time of every loaded alarm, then flag the module as started.
function set_start_time(ns)
    for _, loaded in ipairs(all_alarms) do
        loaded:set_start_time(ns)
    end
    started = true
end
-- Tell whether set_start_time() has already been called.
function is_started()
    return started
end
return M

View File

@ -0,0 +1,212 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local circular_buffer = require('circular_buffer')
local setmetatable = setmetatable
local ipairs = ipairs
local pairs = pairs
local math = require 'math'
local string = string
local table = table
-- LMA libs
local utils = require 'lma_utils'
local consts = require 'gse_constants'
local afd = require 'afd'
-- Window (in seconds) used when a rule has no 'window' or one that isn't > 0.
local MIN_WINDOW = 10
-- Number of periods used when a rule has no 'periods' or one that isn't > 0.
local MIN_PERIOD = 1
-- Passed as the third argument of circular_buffer.new(); "window * periods"
-- is also divided by it to compute the number of rows of the buffers.
local SECONDS_PER_ROW = 5
local Rule = {}
Rule.__index = Rule
setfenv(1, Rule) -- Remove external access to contain everything in the module
-- Build the identifier of a datastore: metric, function, window and periods
-- followed by one '(name=value)' item per field (fields visited in sorted
-- key order), all joined with '/'.
local function get_datastore_id(metric, fields, fct, window, periods)
    local parts = {metric, fct, window, periods}
    local field_map = fields or {}
    for field_name, field_value in utils.orderedPairs(field_map) do
        local item = string.format('(%s=%s)', field_name, field_value)
        parts[#parts+1] = item
    end
    return table.concat(parts, '/')
end
-- Build a Rule object from a rule definition.
--
-- 'window' and 'periods' are coerced to numbers and fall back to MIN_WINDOW
-- and MIN_PERIOD when absent or not strictly positive; 'threshold' is coerced
-- to a number; 'fields' defaults to an empty table.
function Rule.new(rule)
    -- coerce a raw setting to a number, or use the fallback when it is
    -- absent or not strictly positive
    local function positive_number(raw, fallback)
        if raw and raw + 0 > 0 then
            return raw + 0
        end
        return fallback
    end

    local window = positive_number(rule.window, MIN_WINDOW)
    local periods = positive_number(rule.periods, MIN_PERIOD)
    -- whole observation interval, in seconds
    local span = window * periods

    local r = setmetatable({}, Rule)
    r.window = window
    r.periods = periods
    r.relational_operator = rule.relational_operator
    r.metric = rule.metric
    r.fields = rule.fields or {}
    r.fct = rule['function']
    r.threshold = rule.threshold + 0
    r.ids_datastore = {}
    r.datastore = {}
    r.observation_window = math.ceil(span)
    r.cbuf_size = math.ceil(span / SECONDS_PER_ROW)
    return r
end
-- Tell whether a datapoint carrying these fields is relevant for the rule.
--
-- Returns true when the rule declares no field, or when at least one declared
-- field is present in the datapoint with the wanted value ('*' accepts any
-- value). Returns false as soon as a declared field is present with another
-- value, or when none of the declared fields is present.
function Rule:fields_accepted(fields)
    local received = fields or {}
    local rule_has_fields = false
    local matched = 0
    for name, wanted in pairs(self.fields) do
        rule_has_fields = true
        local got = received[name]
        if got ~= nil then
            if wanted == '*' or got == wanted then
                matched = matched + 1
            else
                return false
            end
        end
    end
    return (not rule_has_fields) or matched > 0
end
-- Create the circular buffer storing the datapoints of one datastore.
--
-- 'avg' needs two columns (sum of the values and number of values); every
-- other function only reads and writes column 1, so a single column is
-- allocated instead of carrying an unused second one.
function Rule:get_circular_buffer()
    local cbuf
    if self.fct == 'avg' then
        cbuf = circular_buffer.new(self.cbuf_size, 2, SECONDS_PER_ROW)
        cbuf:set_header(1, self.metric, 'sum', 'sum')
        cbuf:set_header(2, self.metric, 'count', 'sum')
    else
        cbuf = circular_buffer.new(self.cbuf_size, 1, SECONDS_PER_ROW)
        if self.fct == 'min' or self.fct == 'max' then
            cbuf:set_header(1, self.metric, self.fct)
        else
            cbuf:set_header(1, self.metric)
        end
    end
    return cbuf
end
-- Store a datapoint in the circular buffer of its datastore; the datastore
-- (one per distinct set of fields) is created on first use. Datapoints whose
-- fields aren't accepted by the rule are ignored.
function Rule:add_value(ts, value, fields)
    if not self:fields_accepted(fields) then
        return
    end

    local store_id = get_datastore_id(self.metric, fields, self.fct, self.window, self.periods)
    local store = self.datastore[store_id]
    if not store then
        store = {
            fields = fields,
            cbuf = self:get_circular_buffer()
        }
        self.datastore[store_id] = store
        self:add_datastore(store_id)
    end

    local cbuf = store.cbuf
    if self.fct == 'avg' then
        -- column 1 accumulates the values, column 2 counts them
        cbuf:add(ts, 1, value)
        cbuf:add(ts, 2, 1)
    elseif self.fct == 'min' or self.fct == 'max' then
        -- NOTE(review): add() presumably accumulates into the row, so several
        -- datapoints falling in the same row would be summed - confirm
        cbuf:add(ts, 1, value)
    else
        cbuf:set(ts, 1, value)
    end
end
-- Append id to the list of datastore identifiers unless it is already there.
function Rule:add_datastore(id)
    local known = utils.table_find(id, self.ids_datastore)
    if known then
        return
    end
    self.ids_datastore[#self.ids_datastore+1] = id
end
-- Compare value with threshold using the relational operator op, given either
-- as a symbol ('==', '!=', '>=', '>', '<=', '<') or as its alias ('eq', 'ne',
-- 'gte', 'gt', 'lte', 'lt').
-- Returns afd.MATCH when the comparison holds, afd.NO_MATCH otherwise
-- (including for an unknown operator).
local function compare_threshold(value, op, threshold)
    local holds
    if op == '==' or op == 'eq' then
        holds = (value == threshold)
    elseif op == '!=' or op == 'ne' then
        holds = (value ~= threshold)
    elseif op == '>=' or op == 'gte' then
        holds = (value >= threshold)
    elseif op == '>' or op == 'gt' then
        holds = (value > threshold)
    elseif op == '<=' or op == 'lte' then
        holds = (value <= threshold)
    elseif op == '<' or op == 'lt' then
        holds = (value < threshold)
    else
        holds = false
    end
    return holds and afd.MATCH or afd.NO_MATCH
end
-- Evaluate the rule against the datapoints of all its datastores at time ns
-- (nanoseconds).
--
-- Returns a result code and a context:
--  - afd.MATCH, {{value=<computed value>, fields=<datapoint fields>}, ...}
--    with one entry per datastore whose value satisfies the threshold; the
--    rule matches as soon as ONE datastore matches.
--  - afd.NO_MATCH, {} when values were computed but none satisfies the
--    threshold.
--  - afd.NO_DATA, {} when no value could be computed (no datapoint received
--    ever, or a function that isn't supported).
--  - afd.MISSING_DATA, {value=-1, fields=<datapoint fields>} as soon as one
--    datastore received no datapoint within the observation window.
--
-- There is a drawback with the MISSING_DATA state: it can lead to false
-- positives. For example when the monitored thing (e.g. a filesystem) has
-- been renamed or deleted, it is normal not to receive datapoints anymore.
function Rule:evaluate(ns)
    local contexts = {}
    local one_match = false
    local one_no_match = false
    for _, id in ipairs(self.ids_datastore) do
        local data = self.datastore[id]
        if data then
            local cbuf_time = data.cbuf:current_time()
            -- if we didn't receive datapoint within the observation window this means
            -- we don't receive anymore data and cannot compute the rule.
            if ns - cbuf_time > self.observation_window * 1e9 then
                return afd.MISSING_DATA, {value = -1, fields = data.fields}
            end

            if self.fct == 'avg' or self.fct == 'max' or self.fct == 'min' or self.fct == 'sum' or self.fct == 'sd' or self.fct == 'variance' then
                local result
                if self.fct == 'avg' then
                    local total = data.cbuf:compute('sum', 1)
                    local count = data.cbuf:compute('sum', 2)
                    -- no average without at least one counted datapoint
                    if total and count and count > 0 then
                        result = total / count
                    end
                else
                    result = data.cbuf:compute(self.fct, 1)
                end

                if result then
                    local m = compare_threshold(result, self.relational_operator, self.threshold)
                    if m == afd.MATCH then
                        -- remember the match: a later datastore that doesn't
                        -- match must not hide it
                        one_match = true
                        contexts[#contexts+1] = {value=result, fields=data.fields}
                    else
                        one_no_match = true
                    end
                end
            end
        end
    end

    if one_match then
        return afd.MATCH, contexts
    elseif one_no_match then
        return afd.NO_MATCH, {}
    end
    return afd.NO_DATA, {}
end
return Rule

View File

@ -15,7 +15,9 @@ local cjson = require 'cjson'
local string = require 'string' local string = require 'string'
local extra = require 'extra_fields' local extra = require 'extra_fields'
local patt = require 'patterns' local patt = require 'patterns'
local table = require 'table'
local pairs = pairs local pairs = pairs
local ipairs = ipairs
local inject_message = inject_message local inject_message = inject_message
local read_message = read_message local read_message = read_message
local pcall = pcall local pcall = pcall
@ -192,4 +194,60 @@ function deepcopy(t)
return t return t
end end
-- Return true if item is present in the list, else false.
-- false is also returned when list isn't a table (it used to be nil, which
-- didn't match the documented contract; callers only test the truthiness).
function table_find(item, list)
    if type(list) ~= 'table' then
        return false
    end
    for _, v in ipairs(list) do
        if v == item then
            return true
        end
    end
    return false
end
-- from http://lua-users.org/wiki/SortedIteration
-- Return the sorted list of the keys of t.
function __genOrderedIndex( t )
    local keys = {}
    for k in pairs(t) do
        table.insert( keys, k )
    end
    table.sort( keys )
    return keys
end
-- Equivalent of the next function, but returns the keys in sorted order.
-- We use a temporary ordered key table that is stored in the table being
-- iterated (t.__orderedIndex) and removed once the iteration is over.
--
-- t: the table being iterated
-- state: the key returned by the previous call, nil for the first call
-- Returns the next key and its value, or nothing when the iteration is over.
function orderedNext(t, state)
    -- 'key' must be local, otherwise it leaks into the module environment
    local key
    if state == nil then
        -- the first time, generate the index
        t.__orderedIndex = __genOrderedIndex( t )
        key = t.__orderedIndex[1]
    else
        -- fetch the next value
        local index = t.__orderedIndex
        for i = 1, #index do
            if index[i] == state then
                key = index[i+1]
                -- keys are unique, no need to scan the rest of the index
                break
            end
        end
    end

    if key then
        return key, t[key]
    end

    -- no more value to return, cleanup
    t.__orderedIndex = nil
    return
end
-- Equivalent of pairs() that lets a generic 'for' visit the keys of t in
-- sorted order: returns the orderedNext iterator, the table and a nil
-- initial state.
function orderedPairs(t)
    local initial_state = nil
    return orderedNext, t, initial_state
end
return M return M

View File

@ -0,0 +1,542 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
require('luaunit')
package.path = package.path .. ";files/plugins/common/?.lua;tests/lua/mocks/?.lua"
local lma_alarm = require('afd_alarms')
local consts = require('gse_constants')
local alarms = {
{ -- 1
name = 'FS_all_no_field',
description = 'FS all no field',
enabled = true,
trigger = {
rules = {
{
metric = 'fs_space_percent_free',
window = 120,
['function'] = 'avg',
relational_operator = '<=',
threshold = 11,
},
},
logical_operator = 'and',
},
severity = 'warning',
},
{ -- 2
name = 'RabbitMQ_Critical',
description = 'Number of messages in queue is critical',
enabled = true,
trigger = {
rules = {
{
relational_operator = '>=',
metric = 'rabbitmq_messages',
fields = {},
window = "300",
periods = "0",
['function'] = 'min',
threshold = "50",
},
},
logical_operator = 'or',
},
severity = 'critical',
},
{ -- 3
name = 'RabbitMQ_Warning',
description = 'Number of messages becomes high',
enabled = true,
trigger = {
rules = {
{
relational_operator = '>=',
metric = 'rabbitmq_queue_messages',
fields = { queue = '*'},
window = 120,
periods = 0,
['function'] = 'avg',
threshold = 120,
},
{
relational_operator = '>=',
metric = 'rabbitmq_queue_messages',
fields = { queue = 'nova'},
window = 60,
periods = 0,
['function'] = 'max',
threshold = 250,
},
},
},
severity = 'warning',
},
{ -- 4
name = 'CPU_Critical_Controller',
description = 'CPU is critical for the controller',
enabled = true,
trigger = {
rules = {
{
metric = 'cpu_idle',
window = 120,
periods = 2,
['function'] = 'avg',
relational_operator = '<=',
threshold = 5,
},
{
metric = 'cpu_wait',
window = 120,
periods = 1,
fields = { hostname = '*' },
['function'] = 'avg',
relational_operator = '>=',
threshold = 20,
},
},
logical_operator = 'or',
},
severity = 'critical',
},
{ -- 5
name = 'CPU_Warning_Controller',
description = 'CPU is warning for controller',
enabled = true,
trigger = {
rules = {
{
metric = 'cpu_idle',
window = 100,
periods = 2,
['function'] = 'avg',
relational_operator = '<=',
threshold = 15,
},
{
metric = 'cpu_wait',
window = 60,
periods = 0,
['function'] = 'avg',
relational_operator = '>=',
threshold = 25,
},
},
logical_operator = 'or',
},
severity = 'warning',
},
{ -- 6
name = 'CPU_Critical_Controller_AND',
description = 'CPU is critical for controller',
enabled = true,
trigger = {
rules = {
{
metric = 'cpu_idle',
window = 120,
periods = 2,
['function'] = 'avg',
relational_operator = '<=',
threshold = 3,
},
{
metric = 'cpu_wait',
window = 60,
periods = 1,
['function'] = 'avg',
relational_operator = '>=',
threshold = 30,
},
},
logical_operator = 'and',
},
severity = 'critical',
},
{ -- 7
name = 'FS_root',
description = 'FS root',
enabled = true,
trigger = {
rules = {
{
metric = 'fs_space_percent_free',
window = 120,
['function'] = 'avg',
fields = { fs='/'},
relational_operator = '<=',
threshold = 10,
},
},
logical_operator = 'and',
},
severity = 'critical',
},
{ -- 8
name = 'FS_all',
description = 'FS all',
enabled = true,
trigger = {
rules = {
{
metric = 'fs_space_percent_free',
window = 120,
['function'] = 'avg',
fields = { fs='*'},
relational_operator = '<=',
threshold = 10,
},
},
logical_operator = 'and',
},
severity = 'warning',
},
}
TestLMAAlarm = {}
local current_time = 0
-- Run after each test: rewind the fake clock and unload the alarms.
function TestLMAAlarm:tearDown()
    current_time = 0
    lma_alarm.reset_alarms()
end
-- Advance the fake clock by inc seconds (10 when omitted) and return the new
-- time in nanoseconds.
local function next_time(inc)
    local step = inc or 10
    current_time = current_time + (step*1e9)
    return current_time
end
-- An alarm must not be evaluated before its initial wait has elapsed since
-- the start time (240 seconds for this alarm).
function TestLMAAlarm:test_start_evaluation()
lma_alarm.load_alarm(alarms[4]) -- window=120 periods=2
lma_alarm.set_start_time(current_time)
local alarm = lma_alarm.get_alarm('CPU_Critical_Controller')
-- next_time() is cumulative: the trailing comments give the elapsed time
assertEquals(alarm:is_evaluation_time(next_time(10)), false) -- 10 seconds
assertEquals(alarm:is_evaluation_time(next_time(50)), false) -- 60 seconds
assertEquals(alarm:is_evaluation_time(next_time(60)), false) -- 120 seconds
assertEquals(alarm:is_evaluation_time(next_time(120)), true) -- 240 seconds
assertEquals(alarm:is_evaluation_time(next_time(240)), true) -- later
end
-- 10 seconds after the start no alarm can be evaluated yet, so the global
-- state is nil.
function TestLMAAlarm:test_not_the_time()
lma_alarm.load_alarms(alarms)
lma_alarm.set_start_time(current_time)
local state, _ = lma_alarm.evaluate(next_time()) -- no alarm w/ window <= 10s
assertEquals(state, nil)
end
-- Both rules on 'rabbitmq_queue_messages' declare the 'queue' field: the
-- name must be reported only once.
function TestLMAAlarm:test_lookup_fields_for_metric()
lma_alarm.load_alarms(alarms)
local fields_required = lma_alarm.get_metric_fields('rabbitmq_queue_messages')
assertItemsEquals(fields_required, {"queue"})
end
-- No rule on 'cpu_idle' declares a field, and only one rule on 'cpu_wait'
-- declares one ('hostname').
function TestLMAAlarm:test_lookup_empty_fields_for_metric()
lma_alarm.load_alarms(alarms)
local fields_required = lma_alarm.get_metric_fields('cpu_idle')
assertItemsEquals(fields_required, {})
local fields_required = lma_alarm.get_metric_fields('cpu_wait')
assertItemsEquals(fields_required, {'hostname'})
end
-- No alarm uses 'foometric'; three alarms (fixtures 4, 5 and 6) have a rule
-- on 'cpu_wait'.
function TestLMAAlarm:test_lookup_interested_alarms()
lma_alarm.load_alarms(alarms)
-- the local 'alarms' below shadows the fixture list for the rest of the test
local alarms = lma_alarm.get_interested_alarms('foometric')
assertEquals(#alarms, 0)
local alarms = lma_alarm.get_interested_alarms('cpu_wait')
assertEquals(#alarms, 3)
end
-- The 8 alarms of the fixture list must all be loaded.
function TestLMAAlarm:test_get_alarms()
lma_alarm.load_alarms(alarms)
local all_alarms = lma_alarm.get_alarms()
local num = 0
for _, _ in pairs(all_alarms) do
num = num + 1
end
assertEquals(num, 8)
end
-- Without any datapoint the global state is UNKW and every alert reports
-- that no datapoint has ever been received.
function TestLMAAlarm:test_no_datapoint()
lma_alarm.load_alarms(alarms)
lma_alarm.set_start_time(current_time)
local t = next_time(300) -- at this time all alarms can be evaluated
local state, results = lma_alarm.evaluate(t)
assertEquals(state, consts.UNKW)
for _, result in ipairs(results) do
assertEquals(result.alert.message, 'No datapoint have been received ever')
assertNotEquals(result.alert.fields, nil)
end
end
-- 'and' alarm where neither rule matches: avg(cpu_wait) is below 30 and
-- avg(cpu_idle) is above 3, so the state is OKAY without any alert.
function TestLMAAlarm:test_rules_logical_op_and_no_alert()
lma_alarm.load_alarm(alarms[6])
lma_alarm.set_start_time(current_time)
local t1 = next_time(60) -- 60s
local t2 = next_time(60) -- 120s
local t3 = next_time(60) -- 180s
local t4 = next_time(60) -- 240s
lma_alarm.add_value(t1, 'cpu_wait', 3)
lma_alarm.add_value(t2, 'cpu_wait', 10)
lma_alarm.add_value(t3, 'cpu_wait', 1)
lma_alarm.add_value(t4, 'cpu_wait', 10)
lma_alarm.add_value(t1, 'cpu_idle', 30)
lma_alarm.add_value(t2, 'cpu_idle', 10)
lma_alarm.add_value(t3, 'cpu_idle', 10)
lma_alarm.add_value(t4, 'cpu_idle', 20)
local state, result = lma_alarm.evaluate(t4)
assertEquals(#result, 0)
assertEquals(state, consts.OKAY)
end
-- 'and' alarm whose metrics stop being received: the state turns UNKW once
-- the observation window of a rule has elapsed without datapoint (60s for
-- cpu_wait, 240s for cpu_idle) and returns to OKAY when datapoints come back.
function TestLMAAlarm:test_rules_logical_missing_datapoint__op_and()
lma_alarm.load_alarm(alarms[6])
lma_alarm.set_start_time(current_time)
local t1 = next_time(60)
local t2 = next_time(60)
local t3 = next_time(60)
local t4 = next_time(60)
lma_alarm.add_value(t1, 'cpu_wait', 0) -- 60s
lma_alarm.add_value(t2, 'cpu_wait', 2) -- 120s
lma_alarm.add_value(t3, 'cpu_wait', 5) -- 180s
lma_alarm.add_value(t4, 'cpu_wait', 6) -- 240s
lma_alarm.add_value(t1, 'cpu_idle', 20) -- 60s
lma_alarm.add_value(t2, 'cpu_idle', 20) -- 120s
lma_alarm.add_value(t3, 'cpu_idle', 20) -- 180s
lma_alarm.add_value(t4, 'cpu_idle', 20) -- 240s
local state, result = lma_alarm.evaluate(t4) -- 240s we can evaluate
assertEquals(state, consts.OKAY)
assertEquals(#result, 0)
local state, result = lma_alarm.evaluate(next_time(60)) -- 60s w/o datapoint
assertEquals(state, consts.OKAY)
-- cpu_wait have no data within its observation period
local state, result = lma_alarm.evaluate(next_time(1)) -- 61s w/o datapoint
assertEquals(state, consts.UNKW)
assertEquals(#result, 1)
assertEquals(result[1].alert.metric, 'cpu_wait')
assertStrContains(result[1].alert.message, 'No datapoint have been received over the last')
-- both cpu_idle and cpu_wait have no data within their observation periods
local state, result = lma_alarm.evaluate(next_time(180)) -- 241s w/o datapoint
assertEquals(state, consts.UNKW)
assertEquals(#result, 2)
assertEquals(result[1].alert.metric, 'cpu_idle')
assertStrContains(result[1].alert.message, 'No datapoint have been received over the last')
assertEquals(result[2].alert.metric, 'cpu_wait')
assertStrContains(result[2].alert.message, 'No datapoint have been received over the last')
-- datapoints come back for both metrics
lma_alarm.add_value(next_time(), 'cpu_idle', 20)
lma_alarm.add_value(next_time(), 'cpu_idle', 20)
lma_alarm.add_value(next_time(), 'cpu_wait', 20)
lma_alarm.add_value(next_time(), 'cpu_wait', 20)
local state, result = lma_alarm.evaluate(next_time()) -- 10s after the last datapoint
assertEquals(state, consts.OKAY)
assertEquals(#result, 0)
end
-- Same scenario as the previous test, except that cpu_wait datapoints come
-- back before cpu_idle runs out of its observation window: only cpu_idle is
-- then reported as missing.
function TestLMAAlarm:test_rules_logical_missing_datapoint__op_and_2()
lma_alarm.load_alarm(alarms[6])
lma_alarm.set_start_time(current_time)
local t1 = next_time(60)
local t2 = next_time(60)
local t3 = next_time(60)
local t4 = next_time(60)
lma_alarm.add_value(t1, 'cpu_wait', 0) -- 60s
lma_alarm.add_value(t2, 'cpu_wait', 2) -- 120s
lma_alarm.add_value(t3, 'cpu_wait', 5) -- 180s
lma_alarm.add_value(t4, 'cpu_wait', 6) -- 240s
lma_alarm.add_value(t1, 'cpu_idle', 20) -- 60s
lma_alarm.add_value(t2, 'cpu_idle', 20) -- 120s
lma_alarm.add_value(t3, 'cpu_idle', 20) -- 180s
lma_alarm.add_value(t4, 'cpu_idle', 20) -- 240s
local state, result = lma_alarm.evaluate(t4) -- 240s we can evaluate
assertEquals(state, consts.OKAY)
assertEquals(#result, 0)
local state, result = lma_alarm.evaluate(next_time(60)) -- 60s w/o datapoint
assertEquals(state, consts.OKAY)
-- cpu_wait have no data within its observation period
local state, result = lma_alarm.evaluate(next_time(1)) -- 61s w/o datapoint
assertEquals(state, consts.UNKW)
assertEquals(#result, 1)
assertEquals(result[1].alert.metric, 'cpu_wait')
assertStrContains(result[1].alert.message, 'No datapoint have been received over the last')
lma_alarm.add_value(next_time(170), 'cpu_wait', 20)
-- cpu_idle have no data within its observation period
local state, result = lma_alarm.evaluate(next_time())
assertEquals(state, consts.UNKW)
assertEquals(#result, 1)
assertEquals(result[1].alert.metric, 'cpu_idle')
assertStrContains(result[1].alert.message, 'No datapoint have been received over the last')
-- datapoints come back for both metrics
lma_alarm.add_value(next_time(), 'cpu_idle', 20)
lma_alarm.add_value(next_time(), 'cpu_idle', 20)
lma_alarm.add_value(next_time(), 'cpu_wait', 20)
lma_alarm.add_value(next_time(), 'cpu_wait', 20)
local state, result = lma_alarm.evaluate(next_time()) -- 10s after the last datapoint
assertEquals(state, consts.OKAY)
assertEquals(#result, 0)
end
-- 'and' alarm where both rules match: the state is CRIT with one alert per rule.
function TestLMAAlarm:test_rules_logical_op_and_with_alerts()
lma_alarm.load_alarm(alarms[6])
local cpu_critical_and = lma_alarm.get_alarm('CPU_Critical_Controller_AND')
lma_alarm.add_value(next_time(1), 'cpu_wait', 30)
lma_alarm.add_value(next_time(1), 'cpu_wait', 30)
lma_alarm.add_value(next_time(1), 'cpu_wait', 35)
lma_alarm.add_value(next_time(2), 'cpu_idle', 0)
lma_alarm.add_value(next_time(2), 'cpu_idle', 1)
lma_alarm.add_value(next_time(2), 'cpu_idle', 7)
lma_alarm.add_value(next_time(2), 'cpu_idle', 2)
local state, result = cpu_critical_and:evaluate(current_time)
assertEquals(state, consts.CRIT)
assertEquals(#result, 2) -- avg(cpu_wait)>=30 and avg(cpu_idle)<=3
end
-- 'or' alarm where only the cpu_idle rule matches: the state is WARN with a
-- single alert.
function TestLMAAlarm:test_rules_logical_op_or_one_alert()
lma_alarm.load_alarm(alarms[5])
local cpu_warn_and = lma_alarm.get_alarm('CPU_Warning_Controller')
lma_alarm.add_value(next_time(), 'cpu_wait', 15)
lma_alarm.add_value(next_time(), 'cpu_wait', 10)
lma_alarm.add_value(next_time(), 'cpu_wait', 20)
lma_alarm.add_value(next_time(), 'cpu_idle', 11)
lma_alarm.add_value(next_time(), 'cpu_idle', 8)
lma_alarm.add_value(next_time(), 'cpu_idle', 7)
local state, result = cpu_warn_and:evaluate(current_time)
assertEquals(state, consts.WARN)
assertEquals(#result, 1) -- avg(cpu_wait) IS NOT >=25 but avg(cpu_idle)<=15
end
-- 'or' alarm where both rules match: the state is WARN with one alert per rule.
function TestLMAAlarm:test_rules_logical_op_or_all_alert()
lma_alarm.load_alarm(alarms[5])
local cpu_warn_and = lma_alarm.get_alarm('CPU_Warning_Controller')
lma_alarm.add_value(next_time(), 'cpu_wait', 35)
lma_alarm.add_value(next_time(), 'cpu_wait', 20)
lma_alarm.add_value(next_time(), 'cpu_wait', 32)
lma_alarm.add_value(next_time(), 'cpu_idle', 3)
lma_alarm.add_value(next_time(), 'cpu_idle', 2.5)
lma_alarm.add_value(next_time(), 'cpu_idle', 1.5)
local state, result = cpu_warn_and:evaluate(current_time)
assertEquals(state, consts.WARN)
assertEquals(#result, 2) -- avg(cpu_wait) >=25 and avg(cpu_idle)<=15
end
-- Loads every alarm, feeds four 'rabbitmq_messages' samples, and expects the
-- 'RabbitMQ_Critical' alarm to report CRIT with a single result whose value
-- is the smallest sample (50).
function TestLMAAlarm:test_min()
    lma_alarm.load_alarms(alarms)

    for _, count in ipairs({50, 100, 75, 81}) do
        lma_alarm.add_value(next_time(), 'rabbitmq_messages', count)
    end

    local alarm = lma_alarm.get_alarm('RabbitMQ_Critical')
    assertEquals(alarm.severity, consts.CRIT)

    local status, triggered = alarm:evaluate(current_time)
    assertEquals(status, consts.CRIT) -- min()>=50
    assertEquals(#triggered, 1)
    assertEquals(triggered[1].value, 50)
end
-- Loads every alarm, feeds 'rabbitmq_queue_messages' samples for two queues,
-- and expects the 'RabbitMQ_Warning' alarm to report WARN with three results:
-- one avg per queue, followed by one max.
function TestLMAAlarm:test_max()
    lma_alarm.load_alarms(alarms)
    local alarm = lma_alarm.get_alarm('RabbitMQ_Warning')

    -- {queue, value} pairs, pushed in this exact order
    local samples = {
        {'queue-XX', 0}, {'queue-XX', 260}, {'queue-XX', 200}, {'queue-XX', 152},
        {'nova', 152}, {'nova', 532},
    }
    for _, sample in ipairs(samples) do
        -- a fresh fields table per call, as add_value may keep a reference
        local fields = {queue = sample[1], hostname = 'node-x'}
        lma_alarm.add_value(next_time(), 'rabbitmq_queue_messages', sample[2], fields)
    end

    local status, triggered = alarm:evaluate(current_time)
    assertEquals(status, consts.WARN)
    assertEquals(#triggered, 3)

    -- {function, value} expected for each result, in order
    local expected = {
        {'avg', 153}, -- avg() > 120 for queue=queue-XX
        {'avg', 342}, -- avg() > 120 for queue=nova
        {'max', 532}, -- max() > 250 for queue=nova
    }
    for i, want in ipairs(expected) do
        assertEquals(triggered[i]['function'], want[1])
        assertEquals(triggered[i].value, want[2])
    end
end
-- Loads three filesystem alarms, feeds 'fs_space_percent_free' samples with
-- and without a 'fs' field, and expects the module-level evaluate() to return
-- WARN (per the original note: the severity of FS_all_no_field, the first
-- alarm loaded).
function TestLMAAlarm:test_alarm_first_match()
    local metric = 'fs_space_percent_free'

    lma_alarm.load_alarm(alarms[1]) -- FS_all_no_field
    lma_alarm.load_alarm(alarms[7]) -- FS_root
    lma_alarm.load_alarm(alarms[8]) -- FS_all
    lma_alarm.set_start_time(current_time)

    local first_tick = next_time() -- 10s
    lma_alarm.add_value(first_tick, metric, 6, {fs = '/'})
    lma_alarm.add_value(first_tick, metric, 6)
    lma_alarm.add_value(first_tick, metric, 6, {fs = '/'})
    lma_alarm.add_value(next_time(40), metric, 6, {fs = '/'}) -- 50s
    lma_alarm.add_value(next_time(40), metric, 6, {fs = 'foo'}) -- 90s

    local status = lma_alarm.evaluate(next_time(30)) -- 120s
    -- FS_all_no_field severity, the first of the alarm list
    assertEquals(status, consts.WARN)
end
-- Loads three filesystem alarms, feeds 'fs_space_percent_free' samples for
-- fs='/', fs='foo' and with no field at all, then evaluates each alarm at the
-- same timestamp and checks which field sets and values are reported.
function TestLMAAlarm:test_rules_fields()
    local metric = 'fs_space_percent_free'

    lma_alarm.load_alarm(alarms[1]) -- FS_all_no_field
    lma_alarm.load_alarm(alarms[7]) -- FS_root
    lma_alarm.load_alarm(alarms[8]) -- FS_all
    lma_alarm.set_start_time(current_time)

    local first_tick = next_time()
    lma_alarm.add_value(first_tick, metric, 6, {fs = '/'})
    lma_alarm.add_value(first_tick, metric, 6)
    lma_alarm.add_value(next_time(), metric, 12, {fs = '/'})
    lma_alarm.add_value(next_time(), metric, 12)
    lma_alarm.add_value(next_time(), metric, 6, {fs = '/'})
    lma_alarm.add_value(next_time(), metric, 6, {fs = 'foo'})
    lma_alarm.add_value(next_time(), metric, 3, {fs = 'foo'})

    local eval_time = next_time()

    -- Evaluate the named alarm at eval_time and return only its result list.
    local function results_of(alarm_name)
        local alarm = lma_alarm.get_alarm(alarm_name)
        local _, triggered = alarm:evaluate(eval_time)
        return triggered
    end

    local root_only = results_of('FS_root')
    assertEquals(#root_only, 1)
    assertItemsEquals(root_only[1].fields, {{name='fs', value='/'}})
    assertEquals(root_only[1].value, 8)

    local per_fs = results_of('FS_all')
    assertEquals(#per_fs, 2)
    assertItemsEquals(per_fs[1].fields, {{name='fs', value='/'}})
    assertItemsEquals(per_fs[2].fields, {{name='fs', value='foo'}})
    assertEquals(per_fs[2].value, 4.5)

    local unfiltered = results_of('FS_all_no_field')
    assertEquals(#unfiltered, 3)
    assertItemsEquals(unfiltered[1].fields, {{name='fs', value='/'}})
    assertEquals(unfiltered[1].value, 8)
    assertItemsEquals(unfiltered[2].fields, {})
    assertEquals(unfiltered[2].value, 9)
    assertItemsEquals(unfiltered[3].fields, {{name='fs', value='foo'}})
    assertEquals(unfiltered[3].value, 4.5)
end
-- Script entry point: bind the LuaUnit runner to the global `lu`, set its
-- verbosity to 1, run it, and pass whatever run() returns to os.exit.
-- NOTE(review): presumably run() returns a failure count used as the process
-- exit status -- confirm against the LuaUnit version in use.
lu = LuaUnit
lu:setVerbosity( 1 )
os.exit( lu:run() )