diff --git a/deployment_scripts/puppet/modules/lma_collector/.gitignore b/deployment_scripts/puppet/modules/lma_collector/.gitignore index cf17e0370..262d12a43 100644 --- a/deployment_scripts/puppet/modules/lma_collector/.gitignore +++ b/deployment_scripts/puppet/modules/lma_collector/.gitignore @@ -3,3 +3,4 @@ spec/fixtures/manifests/* Gemfile.lock tests/lua/mocks/date_time.lua .bundle +tests/lua/mocks/inspect.lua diff --git a/deployment_scripts/puppet/modules/lma_collector/Rakefile b/deployment_scripts/puppet/modules/lma_collector/Rakefile index 4d1d31cc3..3f32542c2 100644 --- a/deployment_scripts/puppet/modules/lma_collector/Rakefile +++ b/deployment_scripts/puppet/modules/lma_collector/Rakefile @@ -26,7 +26,7 @@ task :test => [ LUA_TESTS = Rake::FileList.new("tests/**/test_*.lua") desc "Run Lua unit tests." -task :lua_tests => [:lua_5_1, :lua_libraries] do |t| +task :lua_tests => [:lua_5_1, :lua_libraries, :lib_cbuf] do |t| LUA_TESTS.each do |f| sh "lua5.1 #{f}" end @@ -42,7 +42,7 @@ end # Need to pull date_time.lua from the lua_sandbox repository because some tests # depend on it indirectly -task :lua_libraries => ['tests/lua/mocks/date_time.lua'] +task :lua_libraries => ['tests/lua/mocks/date_time.lua', 'tests/lua/mocks/inspect.lua'] file 'tests/lua/mocks/date_time.lua' do |t| sh "curl", "-s", "-o", t.name, "https://raw.githubusercontent.com/mozilla-services/lua_sandbox/dev/modules/date_time.lua" do |ok, res| @@ -51,3 +51,27 @@ file 'tests/lua/mocks/date_time.lua' do |t| end end end + +file 'tests/lua/mocks/inspect.lua' do |t| + sh "curl", "-s", "-o", t.name, "https://raw.githubusercontent.com/kikito/inspect.lua/master/inspect.lua" do |ok, res| + if ! ok then + raise "Fail to download inspect.lua from github repository!" 
+ end + end +end + +# Need to pull the circular_buffer library from the Mozilla GitHub repository, compile it and copy circular_buffer.so into the current directory +task :lib_cbuf => ['./circular_buffer.so'] + +file './circular_buffer.so' do |t| + sh %{cd /tmp && rm -rf lua_circular_buffer && git clone https://github.com/mozilla-services/lua_circular_buffer.git && cd lua_circular_buffer && mkdir release && cd release && cmake -DCMAKE_BUILD_TYPE=release .. && make} do |ok, res| + if ! ok then + raise "failed to compile circular_buffer!" + end + end + sh %{cp /tmp/lua_circular_buffer/release/circular_buffer.so .} do |ok, res| + if ! ok then + raise "failed to copy circular_buffer.so!" + end + end +end diff --git a/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd.lua b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd.lua index cb814f029..b7b41b533 100644 --- a/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd.lua +++ b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd.lua @@ -133,4 +133,9 @@ function inject_afd_service_metric(service, value, hostname, interval, source) inject_message(msg) end +MATCH = 1 -- statuses returned by Rule:evaluate in afd_rule.lua +NO_MATCH = 2 +NO_DATA = 3 +MISSING_DATA = 4 + return M diff --git a/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_alarm.lua b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_alarm.lua new file mode 100644 index 000000000..14a85e881 --- /dev/null +++ b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_alarm.lua @@ -0,0 +1,190 @@ +-- Copyright 2015 Mirantis, Inc. +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. 
+-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +local assert = assert +local ipairs = ipairs +local pairs = pairs +local string = string +local setmetatable = setmetatable + +-- LMA libs +local utils = require 'lma_utils' +local consts = require 'gse_constants' +local afd = require 'afd' +local Rule = require 'afd_rule' + +local SEVERITIES = { + warning = consts.WARN, + critical = consts.CRIT, + down = consts.DOWN, +} + +local Alarm = {} +Alarm.__index = Alarm + +setfenv(1, Alarm) -- Remove external access to contain everything in the module + +function Alarm.new(alarm) -- build an Alarm from its definition table (name, description, severity, trigger.rules, optional trigger.logical_operator) + local a = {} + setmetatable(a, Alarm) + a._metrics_list = nil + a.alarm = alarm + a.name = alarm.name + a.description = alarm.description + if alarm.trigger.logical_operator then + a.logical_operator = string.lower(alarm.trigger.logical_operator) + else + a.logical_operator = 'or' + end + a.severity_str = string.upper(alarm.severity) + a.severity = SEVERITIES[string.lower(alarm.severity)] + assert(a.severity ~= nil) + a.rules = {} + a.initial_wait = 0 + for _, rule in ipairs(alarm.trigger.rules) do + local r = Rule.new(rule) + a.rules[#a.rules+1] = r + local wait = r.window * r.periods * 1e9 -- nanoseconds, the unit of initial_wait, so the longest rule wins whatever the rule order + if wait > a.initial_wait then + a.initial_wait = wait + end + end + a.start_time_ns = 0 + + return a +end + +-- return the list of distinct metric names used by the rules of the alarm (computed once, then cached) +function Alarm:get_metrics() + if not self._metrics_list then + self._metrics_list = {} + for _, rule in ipairs(self.rules) do + if not utils.table_find(rule.metric, self._metrics_list) then + self._metrics_list[#self._metrics_list+1] = rule.metric + end + end + end + return 
self._metrics_list +end + +-- return a list of field names used for the metric +-- (can have duplicate names) +function Alarm:get_metric_fields(metric_name) + local fields = {} + for _, rule in ipairs(self.rules) do + if rule.metric == metric_name then + for k, _ in pairs(rule.fields) do + fields[#fields+1] = k + end + end + end + return fields +end + +function Alarm:has_metric(metric) + if utils.table_find(metric, self:get_metrics()) then + return true + end + return false +end + +-- dispatch the datapoint to every rule of the alarm that watches this metric +function Alarm:add_value(ts, metric, value, fields) + local data + for id, rule in pairs(self.rules) do + if rule.metric == metric then + rule:add_value(ts, value, fields) + end + end +end + +-- convert a fields map to a list of named fields +-- {foo="bar"} --> {{name="foo", value="bar"}} +local function convert_field_list(fields) + local named_fields = {} + for name, value in pairs(fields or {}) do + named_fields[#named_fields+1] = {name=name, value=value} + end + return named_fields +end + +-- return: state of alarm and a list of alarm details. NOTE(review): a single matching rule sets the alarm severity even when logical_operator is 'and' -- confirm this is intended. 
+-- +-- with alarm list when state != OKAY (each entry also carries severity, function, metric, operator, threshold, window and periods): +-- { +-- { +-- value = <value computed by the rule, -1 when no data was ever received, -2 when data is missing>, +-- fields = <list of {name=..., value=...}>, +-- message = <alarm description or missing-data message>, +-- }, +-- } +function Alarm:evaluate(ns) + local state + local all_alerts = {} + local function add_alarm(rule, value, message, fields) + all_alerts[#all_alerts+1] = { + severity = self.severity_str, + ['function'] = rule.fct, + metric = rule.metric, + operator = rule.relational_operator, + threshold = rule.threshold, + window = rule.window, + periods = rule.periods, + value = value, + fields = fields, + message = message + } + end + local one_unknown = false + for _, rule in ipairs(self.rules) do + local eval, context_list = rule:evaluate(ns) + if eval == afd.MATCH then + state = self.severity + for _, context in ipairs(context_list) do + add_alarm(rule, context.value, self.description, + convert_field_list(context.fields)) + end + elseif eval == afd.MISSING_DATA then + local msg = 'No datapoint have been received over the last ' .. rule.observation_window .. ' seconds' + add_alarm(rule, -2, msg, convert_field_list(rule.fields)) + one_unknown = true + elseif eval == afd.NO_DATA then + add_alarm(rule, -1, 'No datapoint have been received ever', convert_field_list(rule.fields)) + one_unknown = true + end + end + if self.logical_operator == 'and' and one_unknown then + state = consts.UNKW + elseif not state and one_unknown then + state = consts.UNKW + elseif not state then + state = consts.OKAY + all_alerts = {} + end + return state, all_alerts +end + +function Alarm:set_start_time(ns) + self.start_time_ns = ns +end + +function Alarm:is_evaluation_time(ns) + local delta = ns - self.start_time_ns + if delta >= self.initial_wait then + return true + end + return false +end + +return Alarm diff --git a/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_alarms.lua b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_alarms.lua new file mode 100644 index 000000000..5ede2ed48 --- /dev/null +++ 
b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_alarms.lua @@ -0,0 +1,119 @@ +-- Copyright 2015 Mirantis, Inc. +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +local pairs = pairs +local ipairs = ipairs +local lma = require 'lma_utils' +local consts = require 'gse_constants' +local gse = require 'gse' +local Alarm = require 'afd_alarm' + +local all_alarms = {} + +local M = {} +setfenv(1, M) -- Remove external access to contain everything in the module + +-- return the list of distinct field names required for the metric, across all loaded alarms +function get_metric_fields(metric_name) + local fields = {} + for name, alarm in pairs(all_alarms) do + local mf = alarm:get_metric_fields(metric_name) + if mf then + for _, field in pairs(mf) do + if not lma.table_find(field, fields) then + fields[#fields+1] = field + end + end + end + end + return fields +end + +-- return the list of alarms having at least one rule on the metric +function get_interested_alarms(metric) + local interested_alarms = {} + for _, alarm in pairs(all_alarms) do + if alarm:has_metric(metric) then + + interested_alarms[#interested_alarms+1] = alarm + end + end + return interested_alarms +end + +function add_value(ts, metric, value, fields) + local interested_alarms = get_interested_alarms(metric) + for _, alarm in ipairs (interested_alarms) do + alarm:add_value(ts, metric, value, fields) + end +end + +function reset_alarms() + all_alarms = {} +end + +function evaluate(ns) -- evaluate the alarms that are due at time ns (nanoseconds); returns the global state (nil when no alarm is due) and a list of {state=, alert=} + local global_state + local all_alerts = {} 
+ for _, alarm in ipairs(all_alarms) do + if alarm:is_evaluation_time(ns) then + local state, alerts = alarm:evaluate(ns) + global_state = gse.max_status(state, global_state) + for _, a in ipairs(alerts)do + all_alerts[#all_alerts+1] = { state=state, alert=a } + end + -- stop at the first triggered alarm (in load order); keep going while the state is only OKAY or UNKW + if global_state ~= consts.UNKW and global_state ~= consts.OKAY then + break + end + end + end + return global_state, all_alerts +end + +function get_alarms() + return all_alarms +end +function get_alarm(alarm_name) + for _, a in ipairs(all_alarms) do + if a.name == alarm_name then + return a + end + end +end + +function load_alarm(alarm) + local A = Alarm.new(alarm) + all_alarms[#all_alarms+1] = A +end + +function load_alarms(alarms) + for _, alarm in ipairs(alarms) do + load_alarm(alarm) + end +end + +local started = false +function set_start_time(ns) + for _, alarm in ipairs(all_alarms) do + alarm:set_start_time(ns) + end + started = true +end + +function is_started() + return started +end + +return M diff --git a/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_rule.lua b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_rule.lua new file mode 100644 index 000000000..60d68accf --- /dev/null +++ b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_rule.lua @@ -0,0 +1,212 @@ +-- Copyright 2015 Mirantis, Inc. +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+ +local circular_buffer = require('circular_buffer') +local setmetatable = setmetatable +local ipairs = ipairs +local pairs = pairs +local math = require 'math' +local string = string +local table = table + +-- LMA libs +local utils = require 'lma_utils' +local consts = require 'gse_constants' +local afd = require 'afd' + +local MIN_WINDOW = 10 -- window (seconds) used when a rule gives none or a non-positive one +local MIN_PERIOD = 1 -- number of periods used when a rule gives none or a non-positive one +local SECONDS_PER_ROW = 5 -- row duration handed to circular_buffer.new + +local Rule = {} +Rule.__index = Rule + +setfenv(1, Rule) -- Remove external access to contain everything in the module + +local function get_datastore_id(metric, fields, fct, window, periods) -- build a unique key from the rule settings and the sorted field=value pairs + local arr = {metric, fct, window, periods} + for f, v in utils.orderedPairs(fields or {}) do + arr[#arr+1] = string.format('(%s=%s)', f, v) + end + return table.concat(arr, '/') +end + +function Rule.new(rule) -- build a Rule from its definition table; numeric settings may be strings ('+ 0' coerces them to numbers) + local r = {} + setmetatable(r, Rule) + + local win = MIN_WINDOW + if rule.window and rule.window + 0 > 0 then + win = rule.window + 0 + end + r.window = win + local periods = MIN_PERIOD + if rule.periods and rule.periods + 0 > 0 then + periods = rule.periods + 0 + end + r.periods = periods + r.relational_operator = rule.relational_operator + r.metric = rule.metric + r.fields = rule.fields or {} + r.fct = rule['function'] + r.threshold = rule.threshold + 0 + r.ids_datastore = {} + r.datastore = {} + r.observation_window = math.ceil(r.window * r.periods) + r.cbuf_size = math.ceil(r.window * r.periods / SECONDS_PER_ROW) + + return r +end + +function Rule:fields_accepted(fields) -- true when the rule constrains no field, or when at least one constrained field matches ('*' matches any value) and none mismatches + if not fields then + fields = {} + end + local matched_fields = 0 + local no_match_on_fields = true + for f, wanted in pairs(self.fields) do + no_match_on_fields = false + for k, v in pairs(fields) do + if k == f and wanted == '*' then + matched_fields = matched_fields + 1 + elseif k == f and v == wanted then + matched_fields = matched_fields + 1 + elseif k == f and v ~= wanted then + return false + end + end + end + return no_match_on_fields or matched_fields > 0 +end + +function 
Rule:get_circular_buffer() -- create the circular buffer backing one datastore; for 'avg', column 1 holds the sum and column 2 the count + local cbuf + if self.fct == 'avg' then + cbuf = circular_buffer.new(self.cbuf_size, 2, SECONDS_PER_ROW) + cbuf:set_header(1, self.metric, 'sum', 'sum') + cbuf:set_header(2, self.metric, 'count', 'sum') + elseif self.fct == 'min' or self.fct == 'max' then + cbuf = circular_buffer.new(self.cbuf_size, 2, SECONDS_PER_ROW) + cbuf:set_header(1, self.metric, self.fct) + else + cbuf = circular_buffer.new(self.cbuf_size, 2, SECONDS_PER_ROW) + cbuf:set_header(1, self.metric) + end + return cbuf +end + +-- store a datapoint in the circular buffer of its datastore (one per distinct set of fields), creating the datastore on first use; datapoints whose fields are not accepted are ignored +function Rule:add_value(ts, value, fields) + if not self:fields_accepted(fields) then + return + end + local data + local uniq_field_id = get_datastore_id(self.metric, fields, self.fct, self.window, self.periods) + if not self.datastore[uniq_field_id] then + self.datastore[uniq_field_id] = { + fields = fields, + cbuf = self:get_circular_buffer() + } + self:add_datastore(uniq_field_id) + end + data = self.datastore[uniq_field_id] + + if self.fct == 'avg' then + data.cbuf:add(ts, 1, value) + data.cbuf:add(ts, 2, 1) + elseif self.fct == 'min' or self.fct == 'max' then + data.cbuf:add(ts, 1, value) + else + data.cbuf:set(ts, 1, value) + end +end + +function Rule:add_datastore(id) -- remember the datastore id once, in insertion order + if not utils.table_find(id, self.ids_datastore) then + self.ids_datastore[#self.ids_datastore+1] = id + end +end + +local function compare_threshold(value, op, threshold) -- apply the relational operator; returns afd.MATCH or afd.NO_MATCH (an unknown operator never matches) + local rule_matches = false + if op == '==' or op == 'eq' then + rule_matches = value == threshold + elseif op == '!=' or op == 'ne' then + rule_matches = value ~= threshold + elseif op == '>=' or op == 'gte' then + rule_matches = value >= threshold + elseif op == '>' or op == 'gt' then + rule_matches = value > threshold + elseif op == '<=' or op == 'lte' then + rule_matches = value <= threshold + elseif op == '<' or op == 'lt' then + rule_matches = value < threshold + end + if rule_matches then + return afd.MATCH + end + return afd.NO_MATCH +end 
+ +-- evaluate the rule against the stored datapoints +-- return: status (afd.MATCH, afd.NO_MATCH, afd.NO_DATA or afd.MISSING_DATA), context +-- +-- examples: +-- afd.MATCH, {{value=100, fields={queue='nova'}}, {value=120, fields={queue='neutron'}}} +-- afd.NO_MATCH, {} +-- with 2 special cases: +-- - no datapoint was ever received +-- afd.NO_DATA, {} +-- - no datapoint received within the observation window for one datastore +-- afd.MISSING_DATA, {value=-1, fields=<fields of that datastore>} +-- There is a drawback with the MISSING_DATA status: it can lead to false positives. +-- For example when the monitored thing has been renamed/deleted, +-- it is normal not to receive datapoints anymore .. for example a filesystem. +function Rule:evaluate(ns) + local fields = {} + local match = afd.NO_DATA + for _, id in ipairs(self.ids_datastore) do + local data = self.datastore[id] + if data then + local cbuf_time = data.cbuf:current_time() + -- if we didn't receive datapoint within the observation window this means + -- we don't receive anymore data and cannot compute the rule. 
+ if ns - cbuf_time > self.observation_window * 1e9 then + return afd.MISSING_DATA, {value = -1, fields = data.fields} + end + + if self.fct == 'avg' or self.fct == 'max' or self.fct == 'min' or self.fct == 'sum' or self.fct == 'sd' or self.fct == 'variance' then + local result + local num_row = -1 + if self.fct == 'avg' then + local total + total, num_row = data.cbuf:compute('sum', 1) + local count = data.cbuf:compute('sum', 2) + result = total/count + else + result, num_row = data.cbuf:compute(self.fct, 1) + end + if result and compare_threshold(result, self.relational_operator, self.threshold) == afd.MATCH then + match = afd.MATCH -- one matching datastore is enough, whatever the iteration order + fields[#fields+1] = {value=result, fields=data.fields} + elseif result and match == afd.NO_DATA then + match = afd.NO_MATCH -- a value was computed but did not cross the threshold + end + end + end + end + return match, fields +end + +return Rule diff --git a/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/lma_utils.lua b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/lma_utils.lua index 58291c3a0..50e0132c3 100644 --- a/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/lma_utils.lua +++ b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/lma_utils.lua @@ -15,7 +15,9 @@ local cjson = require 'cjson' local string = require 'string' local extra = require 'extra_fields' local patt = require 'patterns' +local table = require 'table' local pairs = pairs +local ipairs = ipairs local inject_message = inject_message local read_message = read_message local pcall = pcall @@ -192,4 +194,60 @@ function deepcopy(t) return t end +-- return true if an item is present in the list, else false (nil when list is not a table) +function table_find(item, list) + if type(list) == 'table' then + for _, v in ipairs(list) do + if v == item then + return true + end + end + return false + end +end + +-- from http://lua-users.org/wiki/SortedIteration +function __genOrderedIndex( t ) + local orderedIndex = {} + for key in pairs(t) do + table.insert( orderedIndex, key ) + end + table.sort( orderedIndex ) + return orderedIndex +end + +function 
orderedNext(t, state) + -- Equivalent of the next function, but returns the keys in the alphabetic + -- order. We use a temporary ordered key table that is stored in the + -- table being iterated. + + local key -- must be local: a bare assignment would write 'key' into the enclosing environment + if state == nil then + -- the first time, generate the index + t.__orderedIndex = __genOrderedIndex( t ) + key = t.__orderedIndex[1] + else + -- fetch the next value + for i = 1,table.getn(t.__orderedIndex) do + if t.__orderedIndex[i] == state then + key = t.__orderedIndex[i+1] + end + end + end + + if key then + return key, t[key] + end + + -- no more value to return, cleanup + t.__orderedIndex = nil + return +end + +function orderedPairs(t) + -- Equivalent of the pairs() function on tables. Allows to iterate + -- in order + return orderedNext, t, nil +end + return M diff --git a/deployment_scripts/puppet/modules/lma_collector/tests/lua/test_afd_alarm.lua b/deployment_scripts/puppet/modules/lma_collector/tests/lua/test_afd_alarm.lua new file mode 100644 index 000000000..779b52a2f --- /dev/null +++ b/deployment_scripts/puppet/modules/lma_collector/tests/lua/test_afd_alarm.lua @@ -0,0 +1,542 @@ +-- Copyright 2015 Mirantis, Inc. +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + + +require('luaunit') +package.path = package.path .. 
";files/plugins/common/?.lua;tests/lua/mocks/?.lua" +local lma_alarm = require('afd_alarms') +local consts = require('gse_constants') + +local alarms = { + { -- 1 + name = 'FS_all_no_field', + description = 'FS all no field', + enabled = true, + trigger = { + rules = { + { + metric = 'fs_space_percent_free', + window = 120, + ['function'] = 'avg', + relational_operator = '<=', + threshold = 11, + }, + }, + logical_operator = 'and', + }, + severity = 'warning', + }, + { -- 2 + name = 'RabbitMQ_Critical', + description = 'Number of messages in queue is critical', + enabled = true, + trigger = { + rules = { + { + relational_operator = '>=', + metric = 'rabbitmq_messages', + fields = {}, + window = "300", + periods = "0", + ['function'] = 'min', + threshold = "50", + }, + }, + logical_operator = 'or', + }, + severity = 'critical', + }, + { -- 3 + name = 'RabbitMQ_Warning', + description = 'Number of messages becomes high', + enabled = true, + trigger = { + rules = { + { + relational_operator = '>=', + metric = 'rabbitmq_queue_messages', + fields = { queue = '*'}, + window = 120, + periods = 0, + ['function'] = 'avg', + threshold = 120, + }, + { + relational_operator = '>=', + metric = 'rabbitmq_queue_messages', + fields = { queue = 'nova'}, + window = 60, + periods = 0, + ['function'] = 'max', + threshold = 250, + }, + }, + }, + severity = 'warning', + }, + { -- 4 + name = 'CPU_Critical_Controller', + description = 'CPU is critical for the controller', + enabled = true, + trigger = { + rules = { + { + metric = 'cpu_idle', + window = 120, + periods = 2, + ['function'] = 'avg', + relational_operator = '<=', + threshold = 5, + }, + { + metric = 'cpu_wait', + window = 120, + periods = 1, + fields = { hostname = '*' }, + ['function'] = 'avg', + relational_operator = '>=', + threshold = 20, + }, + }, + logical_operator = 'or', + }, + severity = 'critical', + }, + { -- 5 + name = 'CPU_Warning_Controller', + description = 'CPU is warning for controller', + enabled = true, + 
trigger = { + rules = { + { + metric = 'cpu_idle', + window = 100, + periods = 2, + ['function'] = 'avg', + relational_operator = '<=', + threshold = 15, + }, + { + metric = 'cpu_wait', + window = 60, + periods = 0, + ['function'] = 'avg', + relational_operator = '>=', + threshold = 25, + }, + }, + logical_operator = 'or', + }, + severity = 'warning', + }, + { -- 6 + name = 'CPU_Critical_Controller_AND', + description = 'CPU is critical for controller', + enabled = true, + trigger = { + rules = { + { + metric = 'cpu_idle', + window = 120, + periods = 2, + ['function'] = 'avg', + relational_operator = '<=', + threshold = 3, + }, + { + metric = 'cpu_wait', + window = 60, + periods = 1, + ['function'] = 'avg', + relational_operator = '>=', + threshold = 30, + }, + }, + logical_operator = 'and', + }, + severity = 'critical', + }, + { -- 7 + name = 'FS_root', + description = 'FS root', + enabled = true, + trigger = { + rules = { + { + metric = 'fs_space_percent_free', + window = 120, + ['function'] = 'avg', + fields = { fs='/'}, + relational_operator = '<=', + threshold = 10, + }, + }, + logical_operator = 'and', + }, + severity = 'critical', + }, + { -- 8 + name = 'FS_all', + description = 'FS all', + enabled = true, + trigger = { + rules = { + { + metric = 'fs_space_percent_free', + window = 120, + ['function'] = 'avg', + fields = { fs='*'}, + relational_operator = '<=', + threshold = 10, + }, + }, + logical_operator = 'and', + }, + severity = 'warning', + }, +} + +TestLMAAlarm = {} + +local current_time = 0 + +function TestLMAAlarm:tearDown() + lma_alarm.reset_alarms() + current_time = 0 +end + +local function next_time(inc) + if not inc then inc = 10 end + current_time = current_time + (inc*1e9) + return current_time +end + +function TestLMAAlarm:test_start_evaluation() + lma_alarm.load_alarm(alarms[4]) -- window=120 period=2 + lma_alarm.set_start_time(current_time) + local alarm = lma_alarm.get_alarm('CPU_Critical_Controller') + 
assertEquals(alarm:is_evaluation_time(next_time(10)), false) -- 10 seconds + assertEquals(alarm:is_evaluation_time(next_time(50)), false) -- 60 seconds + assertEquals(alarm:is_evaluation_time(next_time(60)), false) -- 120 seconds + assertEquals(alarm:is_evaluation_time(next_time(120)), true) -- 240 seconds + assertEquals(alarm:is_evaluation_time(next_time(240)), true) -- later +end + +function TestLMAAlarm:test_not_the_time() + lma_alarm.load_alarms(alarms) + lma_alarm.set_start_time(current_time) + local state, _ = lma_alarm.evaluate(next_time()) -- no alarm w/ window <= 10s + assertEquals(state, nil) +end + +function TestLMAAlarm:test_lookup_fields_for_metric() + lma_alarm.load_alarms(alarms) + local fields_required = lma_alarm.get_metric_fields('rabbitmq_queue_messages') + assertItemsEquals(fields_required, {"queue"}) +end + +function TestLMAAlarm:test_lookup_empty_fields_for_metric() + lma_alarm.load_alarms(alarms) + local fields_required = lma_alarm.get_metric_fields('cpu_idle') + assertItemsEquals(fields_required, {}) + local fields_required = lma_alarm.get_metric_fields('cpu_wait') + assertItemsEquals(fields_required, {'hostname'}) +end + +function TestLMAAlarm:test_lookup_interested_alarms() + lma_alarm.load_alarms(alarms) + local alarms = lma_alarm.get_interested_alarms('foometric') + assertEquals(#alarms, 0) + local alarms = lma_alarm.get_interested_alarms('cpu_wait') + assertEquals(#alarms, 3) + +end + +function TestLMAAlarm:test_get_alarms() + lma_alarm.load_alarms(alarms) + local all_alarms = lma_alarm.get_alarms() + local num = 0 + for _, _ in pairs(all_alarms) do + num = num + 1 + end + assertEquals(num, 8) +end + +function TestLMAAlarm:test_no_datapoint() + lma_alarm.load_alarms(alarms) + lma_alarm.set_start_time(current_time) + local t = next_time(300) -- at this time all alarms can be evaluated + local state, results = lma_alarm.evaluate(t) + assertEquals(state, consts.UNKW) + for _, result in ipairs(results) do + 
assertEquals(result.alert.message, 'No datapoint have been received ever') + assertNotEquals(result.alert.fields, nil) + end +end + +function TestLMAAlarm:test_rules_logical_op_and_no_alert() + lma_alarm.load_alarm(alarms[6]) + lma_alarm.set_start_time(current_time) + local t1 = next_time(60) -- 60s + local t2 = next_time(60) -- 120s + local t3 = next_time(60) -- 180s + local t4 = next_time(60) -- 240s + lma_alarm.add_value(t1, 'cpu_wait', 3) + lma_alarm.add_value(t2, 'cpu_wait', 10) + lma_alarm.add_value(t3, 'cpu_wait', 1) + lma_alarm.add_value(t4, 'cpu_wait', 10) + + lma_alarm.add_value(t1, 'cpu_idle', 30) + lma_alarm.add_value(t2, 'cpu_idle', 10) + lma_alarm.add_value(t3, 'cpu_idle', 10) + lma_alarm.add_value(t4, 'cpu_idle', 20) + local state, result = lma_alarm.evaluate(t4) + assertEquals(#result, 0) + assertEquals(state, consts.OKAY) +end + +function TestLMAAlarm:test_rules_logical_missing_datapoint__op_and() + lma_alarm.load_alarm(alarms[6]) + lma_alarm.set_start_time(current_time) + local t1 = next_time(60) + local t2 = next_time(60) + local t3 = next_time(60) + local t4 = next_time(60) + lma_alarm.add_value(t1, 'cpu_wait', 0) -- 60s + lma_alarm.add_value(t2, 'cpu_wait', 2) -- 120s + lma_alarm.add_value(t3, 'cpu_wait', 5) -- 180s + lma_alarm.add_value(t4, 'cpu_wait', 6) -- 240s + lma_alarm.add_value(t1, 'cpu_idle', 20) -- 60s + lma_alarm.add_value(t2, 'cpu_idle', 20) -- 120s + lma_alarm.add_value(t3, 'cpu_idle', 20) -- 180s + lma_alarm.add_value(t4, 'cpu_idle', 20) -- 240s + local state, result = lma_alarm.evaluate(t4) -- 240s we can evaluate + assertEquals(state, consts.OKAY) + assertEquals(#result, 0) + local state, result = lma_alarm.evaluate(next_time(60)) -- 60s w/o datapoint + assertEquals(state, consts.OKAY) + -- cpu_wait have no data within its observation period + local state, result = lma_alarm.evaluate(next_time(1)) -- 61s w/o datapoint + assertEquals(state, consts.UNKW) + assertEquals(#result, 1) + assertEquals(result[1].alert.metric, 'cpu_wait') 
-- Tail of a test case whose opening lines precede this section; the
-- statements below are reproduced unchanged.
    assertStrContains(result[1].alert.message, 'No datapoint have been received over the last')

    -- both cpu_idle and cpu_wait have no data within their observation periods
    local state, result = lma_alarm.evaluate(next_time(180)) -- 241s w/o datapoint
    assertEquals(state, consts.UNKW)
    assertEquals(#result, 2)
    assertEquals(result[1].alert.metric, 'cpu_idle')
    assertStrContains(result[1].alert.message, 'No datapoint have been received over the last')
    assertEquals(result[2].alert.metric, 'cpu_wait')
    assertStrContains(result[2].alert.message, 'No datapoint have been received over the last')

    -- datapoints come back for both metrics
    lma_alarm.add_value(next_time(), 'cpu_idle', 20)
    lma_alarm.add_value(next_time(), 'cpu_idle', 20)
    lma_alarm.add_value(next_time(), 'cpu_wait', 20)
    lma_alarm.add_value(next_time(), 'cpu_wait', 20)
    local state, result = lma_alarm.evaluate(next_time()) -- 240s we can evaluate
    assertEquals(state, consts.OKAY)
    assertEquals(#result, 0)
end

-- Missing-datapoint handling for alarms[6]: both metrics are fed at the same
-- four timestamps, then each metric in turn stops receiving samples.  The
-- test expects OKAY while data is present, UNKW with a single alert naming
-- the starved metric, and OKAY again once both metrics are fed.
function TestLMAAlarm:test_rules_logical_missing_datapoint__op_and_2()
    lma_alarm.load_alarm(alarms[6])
    lma_alarm.set_start_time(current_time)
    local t1 = next_time(60)
    local t2 = next_time(60)
    local t3 = next_time(60)
    local t4 = next_time(60)
    lma_alarm.add_value(t1, 'cpu_wait', 0) -- 60s
    lma_alarm.add_value(t2, 'cpu_wait', 2) -- 120s
    lma_alarm.add_value(t3, 'cpu_wait', 5) -- 180s
    lma_alarm.add_value(t4, 'cpu_wait', 6) -- 240s
    lma_alarm.add_value(t1, 'cpu_idle', 20) -- 60s
    lma_alarm.add_value(t2, 'cpu_idle', 20) -- 120s
    lma_alarm.add_value(t3, 'cpu_idle', 20) -- 180s
    lma_alarm.add_value(t4, 'cpu_idle', 20) -- 240s

    -- Declared once and reassigned below, instead of re-declaring a new pair
    -- of shadowing locals for every evaluation.
    local state, result = lma_alarm.evaluate(t4) -- 240s we can evaluate
    assertEquals(state, consts.OKAY)
    assertEquals(#result, 0)

    state, result = lma_alarm.evaluate(next_time(60)) -- 60s w/o datapoint
    assertEquals(state, consts.OKAY)

    -- cpu_wait have no data within its observation period
    state, result = lma_alarm.evaluate(next_time(1)) -- 61s w/o datapoint
    assertEquals(state, consts.UNKW)
    assertEquals(#result, 1)
    assertEquals(result[1].alert.metric, 'cpu_wait')
    assertStrContains(result[1].alert.message, 'No datapoint have been received over the last')

    lma_alarm.add_value(next_time(170), 'cpu_wait', 20)
    -- cpu_idle have no data within its observation period
    state, result = lma_alarm.evaluate(next_time())
    assertEquals(state, consts.UNKW)
    assertEquals(#result, 1)
    assertEquals(result[1].alert.metric, 'cpu_idle')
    assertStrContains(result[1].alert.message, 'No datapoint have been received over the last')

    -- datapoints come back for both metrics
    lma_alarm.add_value(next_time(), 'cpu_idle', 20)
    lma_alarm.add_value(next_time(), 'cpu_idle', 20)
    lma_alarm.add_value(next_time(), 'cpu_wait', 20)
    lma_alarm.add_value(next_time(), 'cpu_wait', 20)
    state, result = lma_alarm.evaluate(next_time()) -- 240s we can evaluate
    assertEquals(state, consts.OKAY)
    assertEquals(#result, 0)
end

-- With alarms[6] loaded, the 'CPU_Critical_Controller_AND' alarm is expected
-- to evaluate to CRIT and to report two results for the samples below.
function TestLMAAlarm:test_rules_logical_op_and_with_alerts()
    lma_alarm.load_alarm(alarms[6])
    local cpu_critical_and = lma_alarm.get_alarm('CPU_Critical_Controller_AND')
    lma_alarm.add_value(next_time(1), 'cpu_wait', 30)
    lma_alarm.add_value(next_time(1), 'cpu_wait', 30)
    lma_alarm.add_value(next_time(1), 'cpu_wait', 35)

    lma_alarm.add_value(next_time(2), 'cpu_idle', 0)
    lma_alarm.add_value(next_time(2), 'cpu_idle', 1)
    lma_alarm.add_value(next_time(2), 'cpu_idle', 7)
    lma_alarm.add_value(next_time(2), 'cpu_idle', 2)
    local state, result = cpu_critical_and:evaluate(current_time)
    assertEquals(state, consts.CRIT)
    assertEquals(#result, 2) -- avg(cpu_wait)>=30 and avg(cpu_idle)<=15
end

-- With alarms[5] loaded, the 'CPU_Warning_Controller' alarm is expected to
-- evaluate to WARN with exactly one result for the samples below.
function TestLMAAlarm:test_rules_logical_op_or_one_alert()
    lma_alarm.load_alarm(alarms[5])
    local cpu_warning = lma_alarm.get_alarm('CPU_Warning_Controller')
    lma_alarm.add_value(next_time(), 'cpu_wait', 15)
    lma_alarm.add_value(next_time(), 'cpu_wait', 10)
    lma_alarm.add_value(next_time(), 'cpu_wait', 20)

    lma_alarm.add_value(next_time(), 'cpu_idle', 11)
    lma_alarm.add_value(next_time(), 'cpu_idle', 8)
    lma_alarm.add_value(next_time(), 'cpu_idle', 7)
    local state, result = cpu_warning:evaluate(current_time)
    assertEquals(state, consts.WARN)
    assertEquals(#result, 1) -- avg(cpu_wait) IS NOT >=25 and avg(cpu_idle)<=2
end

-- Same alarm as the previous test, but with samples for which two results
-- are expected alongside the WARN state.
function TestLMAAlarm:test_rules_logical_op_or_all_alert()
    lma_alarm.load_alarm(alarms[5])
    local cpu_warning = lma_alarm.get_alarm('CPU_Warning_Controller')
    lma_alarm.add_value(next_time(), 'cpu_wait', 35)
    lma_alarm.add_value(next_time(), 'cpu_wait', 20)
    lma_alarm.add_value(next_time(), 'cpu_wait', 32)

    lma_alarm.add_value(next_time(), 'cpu_idle', 3)
    lma_alarm.add_value(next_time(), 'cpu_idle', 2.5)
    lma_alarm.add_value(next_time(), 'cpu_idle', 1.5)
    local state, result = cpu_warning:evaluate(current_time)
    assertEquals(state, consts.WARN)
    assertEquals(#result, 2) -- avg(cpu_wait) >=25 and avg(cpu_idle)<=3
end

-- 'RabbitMQ_Critical' is expected to have CRIT severity, to evaluate to CRIT
-- and to report a single result whose value is 50, the smallest of the four
-- samples added below.
function TestLMAAlarm:test_min()
    lma_alarm.load_alarms(alarms)
    lma_alarm.add_value(next_time(), 'rabbitmq_messages', 50)
    lma_alarm.add_value(next_time(), 'rabbitmq_messages', 100)
    lma_alarm.add_value(next_time(), 'rabbitmq_messages', 75)
    lma_alarm.add_value(next_time(), 'rabbitmq_messages', 81)
    local rabbitmq_critical = lma_alarm.get_alarm('RabbitMQ_Critical')
    assertEquals(rabbitmq_critical.severity, consts.CRIT)
    local state_crit, result = rabbitmq_critical:evaluate(current_time)
    assertEquals(state_crit, consts.CRIT) -- min()>=50
    assertEquals(#result, 1)
    assertEquals(result[1].value, 50)
end

-- 'RabbitMQ_Warning' is fed samples for two queues and is expected to return
-- WARN with three results: two 'avg' results (153 and 342) followed by one
-- 'max' result (532).
function TestLMAAlarm:test_max()
    lma_alarm.load_alarms(alarms)
    local rabbitmq_warning = lma_alarm.get_alarm('RabbitMQ_Warning')
    lma_alarm.add_value(next_time(), 'rabbitmq_queue_messages', 0, {queue = 'queue-XX', hostname = 'node-x'})
    lma_alarm.add_value(next_time(), 'rabbitmq_queue_messages', 260, {queue = 'queue-XX', hostname = 'node-x'})
    lma_alarm.add_value(next_time(), 'rabbitmq_queue_messages', 200, {queue = 'queue-XX', hostname = 'node-x'})
    lma_alarm.add_value(next_time(), 'rabbitmq_queue_messages', 152, {queue = 'queue-XX', hostname = 'node-x'})
    lma_alarm.add_value(next_time(), 'rabbitmq_queue_messages', 152, {queue = 'nova', hostname = 'node-x'})
    lma_alarm.add_value(next_time(), 'rabbitmq_queue_messages', 532, {queue = 'nova', hostname = 'node-x'})
    local state_warn, result = rabbitmq_warning:evaluate(current_time)
    assertEquals(state_warn, consts.WARN)
    assertEquals(#result, 3)
    assertEquals(result[1]['function'], 'avg')
    assertEquals(result[1].value, 153) -- avg() > 120 for queue=queue-XX
    assertEquals(result[2]['function'], 'avg')
    assertEquals(result[2].value, 342) -- avg() > 120 for queue=nova
    assertEquals(result[3]['function'], 'max')
    assertEquals(result[3].value, 532) -- max() > 250 for queue=nova
end

-- Loads three filesystem alarms and checks that the module-level evaluate()
-- reports WARN, which the original author notes is the severity of the first
-- alarm in the list (FS_all_no_field).
function TestLMAAlarm:test_alarm_first_match()
    lma_alarm.load_alarm(alarms[1]) -- FS_all_no_field
    lma_alarm.load_alarm(alarms[7]) -- FS_root
    lma_alarm.load_alarm(alarms[8]) -- FS_all
    lma_alarm.set_start_time(current_time)

    local t = next_time() -- 10s
    lma_alarm.add_value(t, 'fs_space_percent_free', 6, {fs = '/'})
    lma_alarm.add_value(t, 'fs_space_percent_free', 6)
    lma_alarm.add_value(t, 'fs_space_percent_free', 6, {fs = '/'})
    lma_alarm.add_value(next_time(40), 'fs_space_percent_free', 6, {fs = '/'}) -- 50s
    lma_alarm.add_value(next_time(40), 'fs_space_percent_free', 6, {fs = 'foo'}) -- 90s
    -- Only the state is asserted, so the second return value is not captured.
    local state = lma_alarm.evaluate(next_time(30)) -- 120s
    assertEquals(state, consts.WARN) -- FS_all_no_field severity, the first of the alarm list
end

-- Checks the per-field results of the three filesystem alarms: FS_root is
-- expected to report only fs='/', FS_all to report fs='/' and fs='foo', and
-- FS_all_no_field to additionally report the samples added without fields.
function TestLMAAlarm:test_rules_fields()
    lma_alarm.load_alarm(alarms[1]) -- FS_all_no_field
    lma_alarm.load_alarm(alarms[7]) -- FS_root
    lma_alarm.load_alarm(alarms[8]) -- FS_all
    lma_alarm.set_start_time(current_time)

    local t_first = next_time()
    lma_alarm.add_value(t_first, 'fs_space_percent_free', 6, {fs = '/'})
    lma_alarm.add_value(t_first, 'fs_space_percent_free', 6)
    lma_alarm.add_value(next_time(), 'fs_space_percent_free', 12, {fs = '/'})
    lma_alarm.add_value(next_time(), 'fs_space_percent_free', 12)
    lma_alarm.add_value(next_time(), 'fs_space_percent_free', 6, {fs = '/'})
    lma_alarm.add_value(next_time(), 'fs_space_percent_free', 6, {fs = 'foo'})
    lma_alarm.add_value(next_time(), 'fs_space_percent_free', 3, {fs = 'foo'})
    -- All three alarms are evaluated at the same instant.
    local t_eval = next_time()

    -- The returned state is not asserted in this test, hence the `_`.
    local root_fs = lma_alarm.get_alarm('FS_root')
    local _, result = root_fs:evaluate(t_eval)
    assertEquals(#result, 1)
    assertItemsEquals(result[1].fields, {{name='fs', value='/'}})
    assertEquals(result[1].value, 8)

    local all_fs = lma_alarm.get_alarm('FS_all')
    _, result = all_fs:evaluate(t_eval)
    assertEquals(#result, 2)
    assertItemsEquals(result[1].fields, {{name='fs', value='/'}})
    assertItemsEquals(result[2].fields, {{name='fs', value='foo'}})
    assertEquals(result[2].value, 4.5)

    local all_no_field_fs = lma_alarm.get_alarm('FS_all_no_field')
    _, result = all_no_field_fs:evaluate(t_eval)
    assertEquals(#result, 3)

    assertItemsEquals(result[1].fields, {{name='fs', value='/'}})
    assertEquals(result[1].value, 8)

    assertItemsEquals(result[2].fields, {})
    assertEquals(result[2].value, 9)

    assertItemsEquals(result[3].fields, {{name='fs', value='foo'}})
    assertEquals(result[3].value, 4.5)
end

-- Run the suite and hand the value returned by run() to os.exit.
-- `lu` is kept local so the runner handle does not leak into the globals.
local lu = LuaUnit
lu:setVerbosity( 1 )
os.exit( lu:run() )