Add 'no_data_policy' attribute to alarm definition
This allows specifying the behavior of the alarm when no datapoints have been received in the defined window. The default is to send an AFD alert with 'UNKNOWN', which is backward-compatible with the existing alarms. The value of the 'no_data_policy' attribute can be either a severity level (okay, warning, critical, down, unknown) or 'skip' to avoid sending the AFD metric. This change is required to define relevant alarms against "sporadic" metrics which aren't guaranteed to be received at periodic intervals, such as "resource creation time" metrics. It is also needed to support alarm definitions for cluster-wide metrics. DocImpact Change-Id: Iaa7fe63d321403ac2c5d4695a3346d912d3cc984
This commit is contained in:
committed by
Simon Pasquier
parent
14cd2e1755
commit
16b4b02d9e
@@ -29,6 +29,8 @@ local SEVERITIES = {
|
||||
warning = consts.WARN,
|
||||
critical = consts.CRIT,
|
||||
down = consts.DOWN,
|
||||
unknown = consts.UNKW,
|
||||
okay = consts.OKAY,
|
||||
}
|
||||
|
||||
local Alarm = {}
|
||||
@@ -50,6 +52,19 @@ function Alarm.new(alarm)
|
||||
a.severity_str = string.upper(alarm.severity)
|
||||
a.severity = SEVERITIES[string.lower(alarm.severity)]
|
||||
assert(a.severity ~= nil)
|
||||
|
||||
a.skip_when_no_data = false
|
||||
if alarm.no_data_policy then
|
||||
if string.lower(alarm.no_data_policy) == 'skip' then
|
||||
a.skip_when_no_data = true
|
||||
else
|
||||
a.no_data_severity = SEVERITIES[string.lower(alarm.no_data_policy)]
|
||||
end
|
||||
else
|
||||
a.no_data_severity = consts.UNKW
|
||||
end
|
||||
assert(a.skip_when_no_data or a.no_data_severity ~= nil)
|
||||
|
||||
a.rules = {}
|
||||
a.initial_wait = 0
|
||||
for _, rule in ipairs(alarm.trigger.rules) do
|
||||
@@ -130,7 +145,7 @@ end
|
||||
-- },
|
||||
-- }
|
||||
function Alarm:evaluate(ns)
|
||||
local state
|
||||
local state = consts.OKAY
|
||||
local matches = 0
|
||||
local all_alerts = {}
|
||||
local function add_alarm(rule, value, message, fields)
|
||||
@@ -170,23 +185,29 @@ function Alarm:evaluate(ns)
|
||||
|
||||
if self.logical_operator == 'and' then
|
||||
if one_unknown then
|
||||
state = consts.UNKW
|
||||
if self.skip_when_no_data then
|
||||
state = nil
|
||||
else
|
||||
state = self.no_data_severity
|
||||
end
|
||||
elseif #self.rules == matches then
|
||||
state = self.severity
|
||||
else
|
||||
state = consts.OKAY
|
||||
all_alerts = {}
|
||||
end
|
||||
elseif self.logical_operator == 'or' then
|
||||
if matches > 0 then
|
||||
state = self.severity
|
||||
elseif one_unknown then
|
||||
state = consts.UNKW
|
||||
else
|
||||
state = consts.OKAY
|
||||
all_alerts = {}
|
||||
if self.skip_when_no_data then
|
||||
state = nil
|
||||
else
|
||||
state = self.no_data_severity
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if state == nil or state == consts.OKAY then
|
||||
all_alerts = {}
|
||||
end
|
||||
return state, all_alerts
|
||||
end
|
||||
|
||||
|
||||
@@ -25,9 +25,12 @@ local STATUS_WEIGHTS = {
|
||||
[consts.DOWN]=4
|
||||
}
|
||||
|
||||
|
||||
function max_status(val1, val2)
|
||||
if not val2 or STATUS_WEIGHTS[val1] > STATUS_WEIGHTS[val2] then
|
||||
if not val1 then
|
||||
return val2
|
||||
elseif not val2 then
|
||||
return val1
|
||||
elseif STATUS_WEIGHTS[val1] > STATUS_WEIGHTS[val2] then
|
||||
return val1
|
||||
else
|
||||
return val2
|
||||
|
||||
@@ -10,6 +10,9 @@ local alarms = {
|
||||
['name'] = '<%= alarm_name %>',
|
||||
['description'] = '<%= alarm["description"].to_s().gsub("'"){"\\'"} %>',
|
||||
['severity'] = '<%= alarm["severity"] %>',
|
||||
<%- if alarm.key?("no_data_policy") -%>
|
||||
['no_data_policy'] = '<%= alarm["no_data_policy"] %>',
|
||||
<%- end -%>
|
||||
['trigger'] = {
|
||||
['logical_operator'] = '<%= alarm["trigger"]["logical_operator"] || 'or' %>',
|
||||
['rules'] = {
|
||||
|
||||
@@ -980,6 +980,101 @@ function TestLMAAlarm:test_group_by_missing_field_is_unknown()
|
||||
assertEquals(state, consts.UNKW)
|
||||
end
|
||||
|
||||
function TestLMAAlarm:test_no_data_policy_okay()
|
||||
local alarm = {
|
||||
name = 'foo-alarm',
|
||||
description = 'foo description',
|
||||
enabled = true,
|
||||
trigger = {
|
||||
rules = {
|
||||
{
|
||||
metric = 'foo_metric_name',
|
||||
window = 30,
|
||||
periods = 1,
|
||||
['function'] = 'avg',
|
||||
fields = { foo = 'bar', bar = 'foo' },
|
||||
group_by = {'fs'},
|
||||
relational_operator = '<=',
|
||||
threshold = 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
severity = 'warning',
|
||||
no_data_policy = 'okay',
|
||||
}
|
||||
lma_alarm.load_alarm(alarm)
|
||||
lma_alarm.set_start_time(current_time)
|
||||
|
||||
lma_alarm.add_value(next_time(100), 'another_metric', 5)
|
||||
|
||||
local state, result = lma_alarm.evaluate(next_time())
|
||||
assertEquals(#result, 0)
|
||||
assertEquals(state, consts.OKAY)
|
||||
end
|
||||
|
||||
function TestLMAAlarm:test_no_data_policy_critical()
|
||||
local alarm = {
|
||||
name = 'foo-alarm',
|
||||
description = 'foo description',
|
||||
enabled = true,
|
||||
trigger = {
|
||||
rules = {
|
||||
{
|
||||
metric = 'foo_metric_name',
|
||||
window = 30,
|
||||
periods = 1,
|
||||
['function'] = 'avg',
|
||||
fields = { foo = 'bar', bar = 'foo' },
|
||||
group_by = {'fs'},
|
||||
relational_operator = '<=',
|
||||
threshold = 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
severity = 'critical',
|
||||
no_data_policy = 'critical',
|
||||
}
|
||||
lma_alarm.load_alarm(alarm)
|
||||
lma_alarm.set_start_time(current_time)
|
||||
|
||||
lma_alarm.add_value(next_time(100), 'another_metric', 5)
|
||||
|
||||
local state, result = lma_alarm.evaluate(next_time())
|
||||
assertEquals(#result, 1)
|
||||
assertEquals(state, consts.CRIT)
|
||||
end
|
||||
|
||||
function TestLMAAlarm:test_no_data_policy_skip()
|
||||
local alarm = {
|
||||
name = 'foo-alarm',
|
||||
description = 'foo description',
|
||||
enabled = true,
|
||||
trigger = {
|
||||
rules = {
|
||||
{
|
||||
metric = 'foo_metric_name',
|
||||
window = 30,
|
||||
periods = 1,
|
||||
['function'] = 'avg',
|
||||
fields = { foo = 'bar', bar = 'foo' },
|
||||
group_by = {'fs'},
|
||||
relational_operator = '<=',
|
||||
threshold = 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
severity = 'critical',
|
||||
no_data_policy = 'skip',
|
||||
}
|
||||
lma_alarm.load_alarm(alarm)
|
||||
lma_alarm.set_start_time(current_time)
|
||||
|
||||
lma_alarm.add_value(next_time(100), 'another_metric', 5)
|
||||
|
||||
local state, result = lma_alarm.evaluate(next_time())
|
||||
assertEquals(state, nil)
|
||||
end
|
||||
|
||||
lu = LuaUnit
|
||||
lu:setVerbosity( 1 )
|
||||
os.exit( lu:run() )
|
||||
|
||||
Reference in New Issue
Block a user