From 8794ee5b3bf9ff368e3eea1b56c9cec089159610 Mon Sep 17 00:00:00 2001
From: Swann Croiset
Date: Tue, 11 Oct 2016 09:32:18 +0200
Subject: [PATCH] Revert "Remove the no_data_policy=skip for AFD"

This reverts commit 1612638e62aee7b547271dea4b1c4126dfa97394.

Change-Id: I9ed3f4c48835e799a08442b5ba8470ca6f676922
---
 .../templates/alarming.yaml.erb             | 51 ++++++++++++-------
 .../files/plugins/common/afd_alarm.lua      | 26 +++++++---
 .../templates/lma_alarms.lua.erb            |  4 +-
 .../tests/lua/test_afd_alarm.lua            | 39 ++++++++++++--
 4 files changed, 91 insertions(+), 29 deletions(-)

diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb
index 8e1bdcafb..d9b31f413 100644
--- a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb
+++ b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb
@@ -223,7 +223,7 @@ lma_collector:
       severity: 'critical'
       # If the local RabbitMQ instance is down, it will be caught by the
       # rabbitmq-check alarm
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -239,7 +239,7 @@ lma_collector:
       severity: 'warning'
       # If the local RabbitMQ instance is down, it will be caught by the
       # rabbitmq-check alarm
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -255,7 +255,7 @@ lma_collector:
       severity: 'critical'
       # If the local RabbitMQ instance is down, it will be caught by the
       # rabbitmq-check alarm
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -271,7 +271,7 @@ lma_collector:
       severity: 'warning'
       # If the local RabbitMQ instance is down, it will be caught by the
       # rabbitmq-check alarm
-      no_data_severity: okay
+      no_data_policy: 'okay'
      enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -287,7 +287,7 @@ lma_collector:
       severity: 'warning'
       # If the local RabbitMQ instance is down, it will be caught by the
       # rabbitmq-check alarm
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -301,6 +301,7 @@ lma_collector:
     - name: 'rabbitmq-pacemaker-down'
       description: 'The RabbitMQ cluster is down'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the DC node
       enabled: 'true'
       trigger:
         logical_operator: 'and'
@@ -317,6 +318,7 @@ lma_collector:
     - name: 'rabbitmq-pacemaker-critical'
       description: 'The RabbitMQ cluster is critical because less than half of the nodes are up'
       severity: 'critical'
+      no_data_policy: 'skip' # the metric is only collected from the DC node
       enabled: 'true'
       trigger:
         logical_operator: 'and'
@@ -333,6 +335,7 @@ lma_collector:
     - name: 'rabbitmq-pacemaker-warning'
      description: 'The RabbitMQ cluster is degraded because some RabbitMQ nodes are missing'
       severity: 'warning'
+      no_data_policy: 'skip' # the metric is only collected from the DC node
       enabled: 'true'
       trigger:
         logical_operator: 'and'
@@ -487,7 +490,7 @@ lma_collector:
       description: "The filesystem's free space is low"
       severity: 'warning'
       enabled: 'true'
-      no_data_severity: okay
+      no_data_policy: 'okay'
       trigger:
         rules:
           - metric: fs_space_percent_free
@@ -503,7 +506,7 @@ lma_collector:
       description: "The filesystem's free space is too low"
       severity: 'critical'
       enabled: 'true'
-      no_data_severity: okay
+      no_data_policy: 'okay'
       trigger:
         rules:
           - metric: fs_space_percent_free
@@ -549,7 +552,7 @@ lma_collector:
     - name: 'nova-logs-error'
       description: 'Too many errors have been detected in Nova logs'
       severity: 'warning'
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -581,7 +584,7 @@ lma_collector:
     - name: 'heat-logs-error'
       description: 'Too many errors have been detected in Heat logs'
       severity: 'warning'
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -613,7 +616,7 @@ lma_collector:
     - name: 'swift-logs-error'
       description: 'Too many errors have been detected in Swift logs'
       severity: 'warning'
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -645,7 +648,7 @@ lma_collector:
     - name: 'cinder-logs-error'
       description: 'Too many errors have been detected in Cinder logs'
       severity: 'warning'
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -677,7 +680,7 @@ lma_collector:
     - name: 'glance-logs-error'
       description: 'Too many errors have been detected in Glance logs'
       severity: 'warning'
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -709,7 +712,7 @@ lma_collector:
     - name: 'neutron-logs-error'
       description: 'Too many errors have been detected in Neutron logs'
       severity: 'warning'
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -726,7 +729,7 @@ lma_collector:
     - name: 'keystone-response-time-duration'
       description: 'Keystone API is too slow'
       severity: 'warning'
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -789,7 +792,7 @@ lma_collector:
     - name: 'keystone-logs-error'
       description: 'Too many errors have been detected in Keystone logs'
       severity: 'warning'
-      no_data_severity: okay
+      no_data_policy: 'okay'
       enabled: 'true'
       trigger:
         logical_operator: 'or'
@@ -1125,7 +1128,7 @@ lma_collector:
     - name: 'instance-creation-time-warning'
       description: "Instance creation takes too much time"
       severity: 'warning'
-      no_data_severity: okay # This is a sporadic metric
+      no_data_policy: 'okay' # This is a sporadic metric
       enabled: 'true'
       trigger:
         rules:
@@ -1139,7 +1142,7 @@ lma_collector:
       description: 'Errors on hard drive(s) have been detected'
       severity: 'critical'
       enabled: 'true'
-      no_data_severity: okay
+      no_data_policy: okay
       trigger:
         rules:
           - metric: hdd_errors_rate
@@ -1153,6 +1156,7 @@ lma_collector:
       description: 'There is none VCPU available for new instances'
       severity: 'warning'
       enabled: 'true'
+      no_data_policy: skip # the metric is only collected from the aggregator node
       trigger:
         rules:
           - metric: openstack_nova_total_free_vcpus
@@ -1165,6+1169,7 @@ lma_collector:
       description: 'There is none memory available for new instances'
       severity: 'warning'
       enabled: 'true'
+      no_data_policy: skip # the metric is only collected from the aggregator node
       trigger:
         rules:
           - metric: openstack_nova_total_free_ram
@@ -1293,6 +1298,7 @@ lma_collector:
     - name: 'influxdb-api-check-failed'
       description: 'Endpoint check for InfluxDB is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1307,6 +1313,7 @@ lma_collector:
     - name: 'nova-api-check-failed'
       description: 'Endpoint check for nova-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1321,6 +1328,7 @@ lma_collector:
     - name: 'neutron-api-check-failed'
       description: 'Endpoint check for neutron-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1335,6 +1343,7 @@ lma_collector:
     - name: 'cinder-api-check-failed'
       description: 'Endpoint check for cinder-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1349,6 +1358,7 @@ lma_collector:
     - name: 'cinder-v2-api-check-failed'
       description: 'Endpoint check for cinder-v2-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1363,6 +1373,7 @@ lma_collector:
     - name: 'glance-api-check-failed'
       description: 'Endpoint check for glance-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1377,6 +1388,7 @@ lma_collector:
     - name: 'heat-api-check-failed'
       description: 'Endpoint check for heat-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1391,6 +1403,7 @@ lma_collector:
     - name: 'heat-cfn-api-check-failed'
       description: 'Endpoint check for heat-cfn-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1405,6 +1418,7 @@ lma_collector:
     - name: 'swift-api-check-failed'
       description: 'Endpoint check for swift-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1419,6 +1433,7 @@ lma_collector:
     - name: 'swift-s3-api-check-failed'
       description: 'Endpoint check for swift-s3-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1433,6 +1448,7 @@ lma_collector:
     - name: 'keystone-public-api-check-failed'
       description: 'Endpoint check for keystone-public-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
@@ -1447,6 +1463,7 @@ lma_collector:
     - name: 'ceilometer-api-check-failed'
       description: 'Endpoint check for ceilometer-api is failed'
       severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
       enabled: 'true'
       trigger:
         rules:
diff --git a/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_alarm.lua b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_alarm.lua
index 5d5fa2b4b..4fd660f0a 100644
--- a/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_alarm.lua
+++ b/deployment_scripts/puppet/modules/lma_collector/files/plugins/common/afd_alarm.lua
@@ -52,12 +52,18 @@ function Alarm.new(alarm)
     a.severity_str = string.upper(alarm.severity)
     a.severity = SEVERITIES[string.lower(alarm.severity)]
     assert(a.severity ~= nil)
-    if alarm.no_data_severity then
-        a.no_data_severity = SEVERITIES[string.lower(alarm.no_data_severity)]
-    end
-    if not a.no_data_severity then
+
+    a.skip_when_no_data = false
+    if alarm.no_data_policy then
+        if string.lower(alarm.no_data_policy) == 'skip' then
+            a.skip_when_no_data = true
+        else
+            a.no_data_severity = SEVERITIES[string.lower(alarm.no_data_policy)]
+        end
+    else
         a.no_data_severity = consts.UNKW
     end
+    assert(a.skip_when_no_data or a.no_data_severity ~= nil)
 
     a.rules = {}
     a.initial_wait = 0
@@ -169,7 +175,11 @@ function Alarm:evaluate(ns)
 
     if self.logical_operator == 'and' then
         if one_unknown then
-            state = self.no_data_severity
+            if self.skip_when_no_data then
+                state = nil
+            else
+                state = self.no_data_severity
+            end
         elseif #self.rules == matches then
             state = self.severity
         end
@@ -177,7 +187,11 @@
         if matches > 0 then
             state = self.severity
         elseif one_unknown then
-            state = self.no_data_severity
+            if self.skip_when_no_data then
+                state = nil
+            else
+                state = self.no_data_severity
+            end
         end
     end
 
diff --git a/deployment_scripts/puppet/modules/lma_collector/templates/lma_alarms.lua.erb b/deployment_scripts/puppet/modules/lma_collector/templates/lma_alarms.lua.erb
index cf968d98c..6ee5b3fd3 100644
--- a/deployment_scripts/puppet/modules/lma_collector/templates/lma_alarms.lua.erb
+++ b/deployment_scripts/puppet/modules/lma_collector/templates/lma_alarms.lua.erb
@@ -10,8 +10,8 @@ local alarms = {
     ['name'] = '<%= alarm_name %>',
     ['description'] = '<%= alarm["description"].to_s().gsub("'"){"\\'"} %>',
     ['severity'] = '<%= alarm["severity"] %>',
-<%- if alarm.key?("no_data_severity") -%>
-    ['no_data_severity'] = '<%= alarm["no_data_severity"] %>',
+<%- if alarm.key?("no_data_policy") -%>
+    ['no_data_policy'] = '<%= alarm["no_data_policy"] %>',
 <%- end -%>
     ['trigger'] = {
       ['logical_operator'] = '<%= alarm["trigger"]["logical_operator"] || 'or' %>',
diff --git a/deployment_scripts/puppet/modules/lma_collector/tests/lua/test_afd_alarm.lua b/deployment_scripts/puppet/modules/lma_collector/tests/lua/test_afd_alarm.lua
index 38be4dfd2..55588f615 100644
--- a/deployment_scripts/puppet/modules/lma_collector/tests/lua/test_afd_alarm.lua
+++ b/deployment_scripts/puppet/modules/lma_collector/tests/lua/test_afd_alarm.lua
@@ -980,7 +980,7 @@ function TestLMAAlarm:test_group_by_missing_field_is_unknown()
     assertEquals(state, consts.UNKW)
 end
 
-function TestLMAAlarm:test_no_data_severity_okay()
+function TestLMAAlarm:test_no_data_policy_okay()
     local alarm = {
         name = 'foo-alarm',
         description = 'foo description',
@@ -1000,7 +1000,7 @@
             },
         },
         severity = 'warning',
-        no_data_severity = 'okay',
+        no_data_policy = 'okay',
     }
     lma_alarm.load_alarm(alarm)
     lma_alarm.set_start_time(current_time)
@@ -1012,7 +1012,7 @@
     assertEquals(state, consts.OKAY)
 end
 
-function TestLMAAlarm:test_no_data_severity_critical()
+function TestLMAAlarm:test_no_data_policy_critical()
     local alarm = {
         name = 'foo-alarm',
         description = 'foo description',
@@ -1032,7 +1032,7 @@
             },
         },
         severity = 'critical',
-        no_data_severity = 'critical',
+        no_data_policy = 'critical',
     }
     lma_alarm.load_alarm(alarm)
     lma_alarm.set_start_time(current_time)
@@ -1044,6 +1044,37 @@
     assertEquals(state, consts.CRIT)
 end
 
+function TestLMAAlarm:test_no_data_policy_skip()
+    local alarm = {
+        name = 'foo-alarm',
+        description = 'foo description',
+        enabled = true,
+        trigger = {
+            rules = {
+                {
+                    metric = 'foo_metric_name',
+                    window = 30,
+                    periods = 1,
+                    ['function'] = 'avg',
+                    fields = { foo = 'bar', bar = 'foo' },
+                    group_by = {'fs'},
+                    relational_operator = '<=',
+                    threshold = 5,
+                },
+            },
+        },
+        severity = 'critical',
+        no_data_policy = 'skip',
+    }
+    lma_alarm.load_alarm(alarm)
+    lma_alarm.set_start_time(current_time)
+
+    lma_alarm.add_value(next_time(100), 'another_metric', 5)
+
+    local state, result = lma_alarm.evaluate(next_time())
+    assertEquals(state, nil)
+end
+
 lu = LuaUnit
 lu:setVerbosity( 1 )
 os.exit( lu:run() )
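
Note (outside the patch): a minimal standalone Lua sketch of the no_data_policy semantics this revert restores, for readers who do not want to trace afd_alarm.lua. 'skip' suppresses any alarm state when the metric has no data, any other value maps to that severity, and an absent policy defaults to UNKNOWN. The SEVERITIES table and helper name below are illustrative placeholders, not the plugin's actual API.

    -- Illustrative severity codes; the real plugin takes its constants from its own modules.
    local SEVERITIES = { okay = 0, unknown = 1, warning = 2, critical = 3, down = 4 }

    -- Return the state to report when an alarm's metrics have no data:
    -- nil means the alarm is skipped (nothing is emitted), otherwise a severity code.
    local function no_data_state(no_data_policy)
        if no_data_policy == nil then
            return SEVERITIES.unknown                      -- default when no policy is set
        elseif string.lower(no_data_policy) == 'skip' then
            return nil                                     -- 'skip': emit no state at all
        else
            return SEVERITIES[string.lower(no_data_policy)] -- e.g. 'okay' -> 0
        end
    end

    print(no_data_state(nil))      --> 1 (unknown)
    print(no_data_state('skip'))   --> nil
    print(no_data_state('okay'))   --> 0 (okay)

This mirrors the behaviour exercised by test_no_data_policy_skip above, where evaluate() returns nil instead of a severity when the policy is 'skip'.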