Add 'no_data_policy' attribute to alarm definition

This makes it possible to specify the behavior of the alarm when no
datapoint has been received in the defined window. The default value is
to send an AFD alert with 'UNKNOWN' to be backward-compatible with the
existing alarms.

The value for the 'no_data_policy' attribute can either be a severity
level (okay, warning, critical, down, unknown) or 'skip' to avoid
sending the AFD metric.

This change is required to define relevant alarms against "sporadic"
metrics which aren't guaranteed to be received at periodic intervals,
for instance "resource creation time" metrics. It is also needed to
support alarm definitions for cluster-wide metrics.

DocImpact

Change-Id: Iaa7fe63d321403ac2c5d4695a3346d912d3cc984
This commit is contained in:
Swann Croiset
2016-08-23 16:10:23 +02:00
committed by Simon Pasquier
parent 14cd2e1755
commit 16b4b02d9e
4 changed files with 133 additions and 11 deletions

View File

@@ -29,6 +29,8 @@ local SEVERITIES = {
warning = consts.WARN,
critical = consts.CRIT,
down = consts.DOWN,
unknown = consts.UNKW,
okay = consts.OKAY,
}
local Alarm = {}
@@ -50,6 +52,19 @@ function Alarm.new(alarm)
a.severity_str = string.upper(alarm.severity)
a.severity = SEVERITIES[string.lower(alarm.severity)]
assert(a.severity ~= nil)
a.skip_when_no_data = false
if alarm.no_data_policy then
if string.lower(alarm.no_data_policy) == 'skip' then
a.skip_when_no_data = true
else
a.no_data_severity = SEVERITIES[string.lower(alarm.no_data_policy)]
end
else
a.no_data_severity = consts.UNKW
end
assert(a.skip_when_no_data or a.no_data_severity ~= nil)
a.rules = {}
a.initial_wait = 0
for _, rule in ipairs(alarm.trigger.rules) do
@@ -130,7 +145,7 @@ end
-- },
-- }
function Alarm:evaluate(ns)
local state
local state = consts.OKAY
local matches = 0
local all_alerts = {}
local function add_alarm(rule, value, message, fields)
@@ -170,23 +185,29 @@ function Alarm:evaluate(ns)
if self.logical_operator == 'and' then
if one_unknown then
state = consts.UNKW
if self.skip_when_no_data then
state = nil
else
state = self.no_data_severity
end
elseif #self.rules == matches then
state = self.severity
else
state = consts.OKAY
all_alerts = {}
end
elseif self.logical_operator == 'or' then
if matches > 0 then
state = self.severity
elseif one_unknown then
state = consts.UNKW
else
state = consts.OKAY
all_alerts = {}
if self.skip_when_no_data then
state = nil
else
state = self.no_data_severity
end
end
end
if state == nil or state == consts.OKAY then
all_alerts = {}
end
return state, all_alerts
end

View File

@@ -25,9 +25,12 @@ local STATUS_WEIGHTS = {
[consts.DOWN]=4
}
function max_status(val1, val2)
if not val2 or STATUS_WEIGHTS[val1] > STATUS_WEIGHTS[val2] then
if not val1 then
return val2
elseif not val2 then
return val1
elseif STATUS_WEIGHTS[val1] > STATUS_WEIGHTS[val2] then
return val1
else
return val2

View File

@@ -10,6 +10,9 @@ local alarms = {
['name'] = '<%= alarm_name %>',
['description'] = '<%= alarm["description"].to_s().gsub("'"){"\\'"} %>',
['severity'] = '<%= alarm["severity"] %>',
<%- if alarm.key?("no_data_policy") -%>
['no_data_policy'] = '<%= alarm["no_data_policy"] %>',
<%- end -%>
['trigger'] = {
['logical_operator'] = '<%= alarm["trigger"]["logical_operator"] || 'or' %>',
['rules'] = {

View File

@@ -980,6 +980,101 @@ function TestLMAAlarm:test_group_by_missing_field_is_unknown()
assertEquals(state, consts.UNKW)
end
-- With no_data_policy = 'okay', an alarm whose rule metric receives no value
-- is expected to evaluate to OKAY and to report no alert.
function TestLMAAlarm:test_no_data_policy_okay()
    local rule = {
        metric = 'foo_metric_name',
        window = 30,
        periods = 1,
        ['function'] = 'avg',
        fields = { foo = 'bar', bar = 'foo' },
        group_by = {'fs'},
        relational_operator = '<=',
        threshold = 5,
    }
    local definition = {
        name = 'foo-alarm',
        description = 'foo description',
        enabled = true,
        trigger = { rules = { rule } },
        severity = 'warning',
        no_data_policy = 'okay',
    }

    lma_alarm.load_alarm(definition)
    lma_alarm.set_start_time(current_time)
    -- Only an unrelated metric is fed in, so 'foo_metric_name' gets no data.
    lma_alarm.add_value(next_time(100), 'another_metric', 5)

    local status, alerts = lma_alarm.evaluate(next_time())
    assertEquals(#alerts, 0)
    assertEquals(status, consts.OKAY)
end
-- With no_data_policy = 'critical', an alarm whose rule metric receives no
-- value is expected to evaluate to CRIT and to report exactly one alert.
function TestLMAAlarm:test_no_data_policy_critical()
    local rule = {
        metric = 'foo_metric_name',
        window = 30,
        periods = 1,
        ['function'] = 'avg',
        fields = { foo = 'bar', bar = 'foo' },
        group_by = {'fs'},
        relational_operator = '<=',
        threshold = 5,
    }
    local definition = {
        name = 'foo-alarm',
        description = 'foo description',
        enabled = true,
        trigger = { rules = { rule } },
        severity = 'critical',
        no_data_policy = 'critical',
    }

    lma_alarm.load_alarm(definition)
    lma_alarm.set_start_time(current_time)
    -- Only an unrelated metric is fed in, so 'foo_metric_name' gets no data.
    lma_alarm.add_value(next_time(100), 'another_metric', 5)

    local status, alerts = lma_alarm.evaluate(next_time())
    assertEquals(#alerts, 1)
    assertEquals(status, consts.CRIT)
end
-- With no_data_policy = 'skip', an alarm whose rule metric receives no value
-- is expected to evaluate to a nil state.
function TestLMAAlarm:test_no_data_policy_skip()
    local rule = {
        metric = 'foo_metric_name',
        window = 30,
        periods = 1,
        ['function'] = 'avg',
        fields = { foo = 'bar', bar = 'foo' },
        group_by = {'fs'},
        relational_operator = '<=',
        threshold = 5,
    }
    local definition = {
        name = 'foo-alarm',
        description = 'foo description',
        enabled = true,
        trigger = { rules = { rule } },
        severity = 'critical',
        no_data_policy = 'skip',
    }

    lma_alarm.load_alarm(definition)
    lma_alarm.set_start_time(current_time)
    -- Only an unrelated metric is fed in, so 'foo_metric_name' gets no data.
    lma_alarm.add_value(next_time(100), 'another_metric', 5)

    -- The second return value is captured but not checked by this test.
    local status, alerts = lma_alarm.evaluate(next_time())
    assertEquals(status, nil)
end
-- Test runner entry point: set LuaUnit's verbosity to 1, run it, and pass the
-- value returned by lu:run() to os.exit.
lu = LuaUnit
lu:setVerbosity( 1 )
os.exit( lu:run() )