Alarm definition refactoring
DocImpact
blueprint: alarming-refactoring
Change-Id: I8c053f2fbc4b4b85958be8413919f9bf1b168027
This commit is contained in:
parent
385da2a160
commit
7deace8726
@ -38,8 +38,7 @@ if $is_controller or $is_rabbitmq or $is_mysql_server {
|
||||
|
||||
class { 'fuel_lma_collector::afds':
|
||||
roles => hiera('roles'),
|
||||
node_cluster_roles => $lma['node_cluster_roles'],
|
||||
service_cluster_roles => $lma['service_cluster_roles'],
|
||||
node_profiles => $lma['node_profiles'],
|
||||
node_cluster_alarms => $lma['node_cluster_alarms'],
|
||||
service_cluster_alarms => $lma['service_cluster_alarms'],
|
||||
alarms => $alarms_definitions,
|
||||
|
@ -239,11 +239,15 @@ lma::collector::infrastructure_alerting::password: <%= @nagios_password %>
|
||||
$detach_database = hiera('detach-database', {})
|
||||
$detach_database_enabled = $detach_database['metadata'] and $detach_database['metadata']['enabled']
|
||||
|
||||
fuel_lma_collector::hiera_data { 'gse_filters':
|
||||
content => template('fuel_lma_collector/gse_filters.yaml.erb')
|
||||
fuel_lma_collector::hiera_data { 'clusters':
|
||||
content => template('fuel_lma_collector/clusters.yaml.erb')
|
||||
}
|
||||
|
||||
fuel_lma_collector::hiera_data { 'alarming':
|
||||
content => template('fuel_lma_collector/alarming.yaml.erb')
|
||||
}
|
||||
|
||||
fuel_lma_collector::hiera_data { 'node_profiles':
|
||||
content => template('fuel_lma_collector/node_profiles.yaml.erb')
|
||||
}
|
||||
}
|
||||
|
@ -23,10 +23,10 @@
|
||||
# Ex:
|
||||
#
|
||||
# ARG0:
|
||||
# {"rabbitmq"=>{"queue"=>["rabbitmq-queue-warning"]},
|
||||
# "apache"=>{"worker"=>["apache-warning"]},
|
||||
# "memcached"=>{"all"=>["memcached-warning"]},
|
||||
# "haproxy"=>{"alive"=>["haproxy-warning"]}}
|
||||
# {"rabbitmq"=>{"apply_to_node" => "controller", "alarms" => {"queue"=>["rabbitmq-queue-warning"]}},
|
||||
# "apache"=>{"apply_to_node" => "controller", "alarms" => {"worker"=>["apache-warning"]}},
|
||||
# "memcached"=>{"apply_to_node"=>"controller", "alarms" => {"all"=>["memcached-warning"]}},
|
||||
# "haproxy"=>{"apply_to_node" => "controller", "alarms" => {"alive"=>["haproxy-warning"]}}}
|
||||
#
|
||||
# ARG1:
|
||||
#
|
||||
@ -63,7 +63,7 @@
|
||||
# "function"=>"min"}]}}
|
||||
# ]
|
||||
#
|
||||
# ARG2: ["rabbitmq", "apache"]
|
||||
# ARG2: ["controller", "compute"]
|
||||
#
|
||||
# ARG3: type (node|service)
|
||||
#
|
||||
@ -96,31 +96,47 @@ module Puppet::Parser::Functions
|
||||
afd_filters = {}
|
||||
|
||||
afd_profiles.each do |afd_profile|
|
||||
next unless afd_alarms.has_key?(afd_profile)
|
||||
|
||||
afd_alarms[afd_profile].each do |afd_name, alarms|
|
||||
# Collect the metrics which are required by this AFD filter
|
||||
metrics = Set.new([])
|
||||
alarms.each do |a_name|
|
||||
alarm_definitions.each do |alarm_def|
|
||||
if alarm_def['name'] == a_name
|
||||
alarm_def['trigger']['rules'].each do |r|
|
||||
metrics << r['metric']
|
||||
end
|
||||
end
|
||||
afds = afd_alarms.select {|k,v| v.has_key?('apply_to_node') and v['apply_to_node'] == afd_profile }
|
||||
afds.each do |k, v|
|
||||
activate_alerting=true
|
||||
if v.has_key?('activate_alerting')
|
||||
if v['activate_alerting'] == false
|
||||
activate_alerting=false
|
||||
end
|
||||
end
|
||||
enable_notification=false
|
||||
if v.has_key?('enable_notification')
|
||||
if v['enable_notification'] == true
|
||||
enable_notification=true
|
||||
end
|
||||
end
|
||||
afd_cluster_name = k
|
||||
v['alarms'].each do |afd_name, alarms|
|
||||
# Collect the metrics which are required by this AFD filter
|
||||
metrics = Set.new([])
|
||||
alarms.each do |a_name|
|
||||
alarm_definitions.each do |alarm_def|
|
||||
if alarm_def['name'] == a_name
|
||||
alarm_def['trigger']['rules'].each do |r|
|
||||
metrics << r['metric']
|
||||
end
|
||||
end
|
||||
|
||||
message_matcher = metrics.collect{|x| "Fields[name] == \'#{x}\'" }.join(' || ')
|
||||
end
|
||||
end
|
||||
message_matcher = metrics.collect{|x| "Fields[name] == \'#{x}\'" }.join(' || ')
|
||||
|
||||
afd_filters["#{afd_profile}_#{afd_name}"] = {
|
||||
'type' => type,
|
||||
'cluster_name' => afd_profile,
|
||||
'logical_name' => afd_name,
|
||||
'alarms' => alarms,
|
||||
'alarms_definitions' => alarm_definitions,
|
||||
'message_matcher' => message_matcher
|
||||
}
|
||||
afd_filters["#{afd_cluster_name}_#{afd_name}"] = {
|
||||
'type' => type,
|
||||
'cluster_name' => afd_cluster_name,
|
||||
'logical_name' => afd_name,
|
||||
'alarms' => alarms,
|
||||
'alarms_definitions' => alarm_definitions,
|
||||
'message_matcher' => message_matcher,
|
||||
'activate_alerting' => activate_alerting,
|
||||
'enable_notification' => enable_notification,
|
||||
}
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -44,7 +44,7 @@ module Puppet::Parser::Functions
|
||||
|
||||
roles.each do |role|
|
||||
data.each do |k,v|
|
||||
cluster_names << k if v.include?(role)
|
||||
cluster_names << k if v['roles'].include?(role)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -15,37 +15,28 @@
|
||||
|
||||
class fuel_lma_collector::afds (
|
||||
$roles = undef,
|
||||
$node_cluster_roles = undef,
|
||||
$service_cluster_roles = undef,
|
||||
$node_profiles = undef,
|
||||
$node_cluster_alarms = undef,
|
||||
$service_cluster_alarms = undef,
|
||||
$alarms = undef,
|
||||
){
|
||||
|
||||
validate_array($roles)
|
||||
validate_hash($node_cluster_roles)
|
||||
validate_hash($service_cluster_roles)
|
||||
validate_hash($node_profiles)
|
||||
validate_hash($node_cluster_alarms)
|
||||
validate_hash($service_cluster_alarms)
|
||||
validate_array($alarms)
|
||||
|
||||
$node_cluster_names_tmp = get_cluster_names($node_cluster_roles, $roles)
|
||||
$service_cluster_names = get_cluster_names($service_cluster_roles, $roles)
|
||||
|
||||
if size($node_cluster_names_tmp) == 0 and $node_cluster_alarms['default'] {
|
||||
$node_cluster_names = ['default']
|
||||
} else {
|
||||
$node_cluster_names = $node_cluster_names_tmp
|
||||
}
|
||||
$clusters = get_cluster_names($node_profiles, $roles)
|
||||
|
||||
$node_afd_filters = get_afd_filters($node_cluster_alarms,
|
||||
$alarms,
|
||||
$node_cluster_names,
|
||||
$clusters,
|
||||
'node')
|
||||
|
||||
$service_afd_filters = get_afd_filters($service_cluster_alarms,
|
||||
$alarms,
|
||||
$service_cluster_names,
|
||||
$clusters,
|
||||
'service')
|
||||
|
||||
create_resources(lma_collector::afd_filter, $node_afd_filters)
|
||||
|
@ -22,10 +22,24 @@ describe 'fuel_lma_collector::afds' do
|
||||
describe 'with defaults' do
|
||||
let(:params) do
|
||||
{:roles => ['primary-controller'],
|
||||
:node_cluster_roles => {'controller' => ['primary-controller']},
|
||||
:service_cluster_roles => {'mysql' => ['primary-controller']},
|
||||
:node_cluster_alarms => {'controller' => {'cpu' => ['cpu_warning']}},
|
||||
:service_cluster_alarms => {'mysql' => {'all' => ['db_warning']}},
|
||||
:node_profiles => {'controller' => {'roles' => ['primary-controller']}},
|
||||
:node_cluster_alarms => {
|
||||
'controller' =>
|
||||
{
|
||||
'apply_to_node' => 'controller',
|
||||
'alarms' => {
|
||||
'cpu' => ['cpu_warning']
|
||||
}
|
||||
}
|
||||
},
|
||||
:service_cluster_alarms => {
|
||||
'mysql' => {
|
||||
'apply_to_node' => 'controller',
|
||||
'alarms' => {
|
||||
'all' => ['db_warning']
|
||||
}
|
||||
}
|
||||
},
|
||||
:alarms => [
|
||||
{"name"=>"cpu_warning",
|
||||
"description"=>"Fake alarm",
|
||||
@ -63,9 +77,15 @@ describe 'fuel_lma_collector::afds' do
|
||||
describe 'with enabled false' do
|
||||
let(:params) do
|
||||
{:roles => ['primary-controller'],
|
||||
:node_cluster_roles => {'controller' => ['primary-controller']},
|
||||
:service_cluster_roles => {},
|
||||
:node_cluster_alarms => {'controller' => {'cpu' => ['cpu_warning']}},
|
||||
:node_profiles => {'controller' => {'roles' => ['primary-controller']}},
|
||||
:node_cluster_alarms => {
|
||||
'controller' => {
|
||||
'apply_to_node' => 'controller',
|
||||
'alarms' => {
|
||||
'cpu' => ['cpu_warning']
|
||||
}
|
||||
}
|
||||
},
|
||||
:service_cluster_alarms => {},
|
||||
:alarms => [
|
||||
{"name"=>"cpu_warning",
|
||||
|
@ -0,0 +1,288 @@
|
||||
# Copyright 2015 Mirantis, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
require 'spec_helper'
|
||||
|
||||
# Specs for the get_afd_filters parser function.
#
# Each example calls the function with four arguments:
#   1. the AFD definitions (cluster name => {apply_to_node, alarms, ...}),
#   2. the list of alarm definitions,
#   3. the list of node profiles to select AFDs for,
#   4. the filter type ('node' or 'service'),
# and expects a hash keyed by "<cluster name>_<logical name>" describing one
# AFD filter per entry of the selected clusters' 'alarms' maps.
describe 'get_afd_filters' do
  # Alarm definitions used by the node-level examples.
  alarms_nodes = [
    {"name"=>"cpu-critical-controller",
     "description"=>"The CPU usage is too high (controller node)",
     "severity"=>"critical",
     "trigger"=>
      {"logical_operator"=>"or",
       "rules"=>
        [
          {"metric"=>"cpu_idle",
           "relational_operator"=>"<=",
           "threshold"=>5,
           "window"=>120,
           "periods"=>0,
           "function"=>"avg"},
          {"metric"=>"cpu_wait",
           "relational_operator"=>">=",
           "threshold"=>35,
           "window"=>120,
           "periods"=>0,
           "function"=>"avg"},
        ]}},
    {"name"=>"cpu-warning-controller",
     "description"=>"The CPU usage is high (controller node)",
     "severity"=>"warning",
     "trigger"=>
      {"logical_operator"=>"or",
       "rules"=>
        [
          {"metric"=>"cpu_idle",
           "relational_operator"=>"<=",
           "threshold"=>15,
           "window"=>120,
           "periods"=>0,
           "function"=>"avg"},
          {"metric"=>"cpu_wait",
           "relational_operator"=>">=",
           "threshold"=>25,
           "window"=>120,
           "periods"=>0,
           "function"=>"avg"},
        ]}},
    {"name"=>"cpu-critical-compute",
     # Description fixed: it previously read "(critical node)", a copy/paste slip.
     "description"=>"The CPU usage is too high (compute node)",
     "severity"=>"critical",
     "trigger"=>
      {"logical_operator"=>"or",
       "rules"=>
        [
          {"metric"=>"cpu_idle",
           "relational_operator"=>"<=",
           "threshold"=>30,
           "window"=>120,
           "periods"=>0,
           "function"=>"avg"},
        ]}},
    {"name"=>"cpu-warning-compute",
     "description"=>"The CPU usage is high (compute node)",
     "severity"=>"warning",
     "trigger"=>
      {"logical_operator"=>"or",
       "rules"=>
        [
          {"metric"=>"cpu_idle",
           "relational_operator"=>"<=",
           "threshold"=>20,
           "window"=>120,
           "periods"=>0,
           "function"=>"avg"},
        ]}},
    {"name"=>"fs-critical",
     "description"=>"The FS usage is critical",
     "severity"=>"critical",
     "trigger"=>
      {"logical_operator"=>"or",
       "rules"=>
        [
          {"metric"=>"fs_percent_free",
           "relational_operator"=>"<=",
           "threshold"=>8,
           "window"=>120,
           "periods"=>0,
           "function"=>"avg"},
        ]}},
  ]

  # Node AFD definitions: one cluster per node profile.
  afds_nodes = {
    "controller" => {
      "apply_to_node" => "controller",
      "enable_notification" => true,
      "activate_alerting" => true,
      "alarms" => {
        "system" => ["cpu-critical-controller", "cpu-warning-controller"],
      },
    },
    "compute" => {
      "apply_to_node" => "compute",
      "enable_notification" => true,
      "activate_alerting" => true,
      "alarms" => {
        "system" => ["cpu-critical-compute", "cpu-warning-compute"],
        "fs" => ["fs-critical"],
      },
    }
  }

  describe 'For controller nodes' do
    # Only the 'controller' cluster applies; both of its alarms contribute
    # their metrics (cpu_idle, cpu_wait) to the message matcher.
    expected_filters = {
      "controller_system"=>
        {"type"=>"node",
         "cluster_name"=>"controller",
         "logical_name"=>"system",
         "alarms"=>["cpu-critical-controller", "cpu-warning-controller"],
         "alarms_definitions"=> alarms_nodes,
         "message_matcher"=>"Fields[name] == 'cpu_idle' || Fields[name] == 'cpu_wait'",
         "enable_notification" => true,
         "activate_alerting" => true,
        }
    }

    it do
      should run.with_params(afds_nodes, alarms_nodes, ['controller'], 'node')
        .and_return(expected_filters)
    end
  end

  describe 'For compute nodes' do
    # The 'compute' cluster declares two logical groups, hence two filters.
    expected_filters = {
      "compute_system"=>
        {"type"=>"node",
         "cluster_name"=>"compute",
         "logical_name"=>"system",
         "alarms"=>["cpu-critical-compute", "cpu-warning-compute"],
         "alarms_definitions"=> alarms_nodes,
         "message_matcher"=>"Fields[name] == 'cpu_idle'",
         "activate_alerting" => true,
         "enable_notification" => true,
        },
      "compute_fs"=>
        {"type"=>"node",
         "cluster_name"=>"compute",
         "logical_name"=>"fs",
         "alarms"=>["fs-critical"],
         "alarms_definitions"=> alarms_nodes,
         "message_matcher"=>"Fields[name] == 'fs_percent_free'",
         "activate_alerting" => true,
         "enable_notification" => true,
        }
    }

    it do
      should run.with_params(afds_nodes, alarms_nodes, ['compute'], 'node')
        .and_return(expected_filters)
    end
  end

  describe 'For compute and controller nodes' do
    # With both profiles selected, the filters of both clusters are returned.
    expected_filters = {
      "compute_system"=>
        {"type"=>"node",
         "cluster_name"=>"compute",
         "logical_name"=>"system",
         "alarms"=>["cpu-critical-compute", "cpu-warning-compute"],
         "alarms_definitions"=> alarms_nodes,
         "message_matcher"=>"Fields[name] == 'cpu_idle'",
         "activate_alerting" => true,
         "enable_notification" => true,
        },
      "compute_fs"=>
        {"type"=>"node",
         "cluster_name"=>"compute",
         "logical_name"=>"fs",
         "alarms"=>["fs-critical"],
         "alarms_definitions"=> alarms_nodes,
         "message_matcher"=>"Fields[name] == 'fs_percent_free'",
         "activate_alerting" => true,
         "enable_notification" => true,
        },
      "controller_system"=>
        {"type"=>"node",
         "cluster_name"=>"controller",
         "logical_name"=>"system",
         "alarms"=>["cpu-critical-controller", "cpu-warning-controller"],
         "alarms_definitions"=> alarms_nodes,
         "message_matcher"=>"Fields[name] == 'cpu_idle' || Fields[name] == 'cpu_wait'",
         "activate_alerting" => true,
         "enable_notification" => true,
        }
    }

    it do
      should run.with_params(afds_nodes, alarms_nodes, ['compute', 'controller'], 'node')
        .and_return(expected_filters)
    end
  end

  # Alarm definitions used by the service-level example.
  alarms_services = [
    {"name"=>"rabbitmq-queue-warning",
     "description"=>"Number of message in queues too high",
     "severity"=>"warning",
     "trigger"=>
      {"logical_operator"=>"or",
       "rules"=>
        [{"metric"=>"rabbitmq_messages",
          "relational_operator"=>">=",
          "threshold"=>200,
          "window"=>120,
          "periods"=>0,
          "function"=>"avg"}]}},
    {"name"=>"apache-warning",
     "description"=>"",
     "severity"=>"warning",
     "trigger"=>
      {"logical_operator"=>"or",
       "rules"=>
        [{"metric"=>"apache_idle_workers",
          "relational_operator"=>"=",
          "threshold"=>0,
          "window"=>60,
          "periods"=>0,
          "function"=>"min"},
         {"metric"=>"apache_status",
          "relational_operator"=>"=",
          "threshold"=>0,
          "window"=>60,
          "periods"=>0,
          "function"=>"min"}]}}
  ]

  # Service AFD definitions. Both services are attached to the 'controller'
  # profile through 'apply_to_node'. Each option key appears exactly once:
  # a repeated key in a hash literal is silently overwritten and triggers a
  # Ruby "duplicated key" warning.
  afds_services = {
    "rabbitmq" => {
      "apply_to_node" => "controller",
      "activate_alerting" => true,
      "enable_notification" => false,
      "alarms" => {
        # "pacemaker" => ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical'],
        "queue" => ["rabbitmq-queue-warning"]
      },
    },
    "apache" => {
      "apply_to_node" => "controller",
      "activate_alerting" => true,
      "enable_notification" => false,
      "alarms" => {
        "worker" => ['apache-warning'],
      },
    },
  }

  describe 'For services' do
    # The filters are keyed and named after the service cluster (rabbitmq,
    # apache), not after the node profile they apply to.
    expected_filters = {
      "rabbitmq_queue"=>
        {
          "type"=>"service",
          "cluster_name"=>"rabbitmq",
          "logical_name"=>"queue",
          "alarms_definitions"=> alarms_services,
          "alarms"=>["rabbitmq-queue-warning"],
          "message_matcher"=>"Fields[name] == 'rabbitmq_messages'",
          "activate_alerting" => true,
          "enable_notification" => false,
        },
      "apache_worker"=>
        {
          "type"=>"service",
          "cluster_name"=>"apache",
          "logical_name"=>"worker",
          "alarms_definitions"=> alarms_services,
          "alarms"=>["apache-warning"],
          "message_matcher"=>"Fields[name] == 'apache_idle_workers' || Fields[name] == 'apache_status'",
          "activate_alerting" => true,
          "enable_notification" => false,
        }
    }

    it do
      should run.with_params(afds_services, alarms_services, ['controller'], 'service')
        .and_return(expected_filters)
    end
  end
end
|
||||
|
@ -1090,200 +1090,289 @@ lma_collector:
|
||||
periods: 0
|
||||
function: max
|
||||
|
||||
# Mapping between the Fuel roles and the AFD node filters
|
||||
node_cluster_roles:
|
||||
controller: ['primary-controller', 'controller']
|
||||
<% if @detach_database_enabled -%>
|
||||
mysql-nodes: ['primary-standalone-database', 'standalone-database']
|
||||
<% else -%>
|
||||
mysql-nodes: ['primary-controller', 'controller']
|
||||
<% end -%>
|
||||
compute: ['compute']
|
||||
storage: ['cinder', 'ceph-osd']
|
||||
elasticsearch-nodes: ['primary-elasticsearch_kibana', 'elasticsearch_kibana']
|
||||
influxdb-nodes: ['primary-influxdb_grafana', 'influxdb_grafana']
|
||||
|
||||
# Mapping between the Fuel roles and the AFD service filters
|
||||
service_cluster_roles:
|
||||
<% if @detach_rabbitmq_enabled -%>
|
||||
rabbitmq-cluster: ['primary-standalone-rabbitmq', 'standalone-rabbitmq']
|
||||
rabbitmq-service: ['primary-standalone-rabbitmq', 'standalone-rabbitmq']
|
||||
<% else -%>
|
||||
rabbitmq-cluster: ['primary-controller', 'controller']
|
||||
rabbitmq-service: ['primary-controller', 'controller']
|
||||
<% end -%>
|
||||
<% if @detach_database_enabled -%>
|
||||
mysql: ['primary-standalone-database', 'standalone-database']
|
||||
<% else -%>
|
||||
mysql: ['primary-controller', 'controller']
|
||||
<% end -%>
|
||||
apache: ['primary-controller', 'controller']
|
||||
nova-api: ['primary-controller', 'controller']
|
||||
nova-logs: ['primary-controller', 'controller', 'compute']
|
||||
heat-api: ['primary-controller', 'controller']
|
||||
heat-logs: ['primary-controller', 'controller']
|
||||
<% if not @storage_options["objects_ceph"] then -%>
|
||||
swift-api: ['primary-controller', 'controller']
|
||||
<% end -%>
|
||||
cinder-api: ['primary-controller', 'controller']
|
||||
cinder-logs: ['primary-controller', 'controller', 'cinder']
|
||||
glance-api: ['primary-controller', 'controller']
|
||||
glance-logs: ['primary-controller', 'controller']
|
||||
neutron-api: ['primary-controller', 'controller']
|
||||
neutron-logs: ['primary-controller', 'controller', 'compute']
|
||||
keystone-response-time: ['primary-controller', 'controller']
|
||||
keystone-public-api: ['primary-controller', 'controller']
|
||||
keystone-admin-api: ['primary-controller', 'controller']
|
||||
keystone-logs: ['primary-controller', 'controller']
|
||||
nova-instances: ['primary-controller', 'controller']
|
||||
<% if @storage_options["volumes_ceph"] then -%>
|
||||
ceph-mon-cluster: ['primary-controller', 'controller']
|
||||
ceph-mon-service: ['primary-controller', 'controller']
|
||||
ceph-osd-service: ['ceph-osd']
|
||||
<% end -%>
|
||||
elasticsearch-cluster: ['primary-elasticsearch_kibana', 'elasticsearch_kibana']
|
||||
elasticsearch-service: ['primary-elasticsearch_kibana', 'elasticsearch_kibana']
|
||||
influxdb-service: ['primary-influxdb_grafana', 'influxdb_grafana']
|
||||
pacemaker-service: ['primary-controller', 'controller']
|
||||
haproxy-openstack: ['primary-controller', 'controller']
|
||||
libvirt-service: ['compute']
|
||||
memcached-service: ['primary-controller', 'controller']
|
||||
|
||||
# Definition of the AFD node filters
|
||||
node_cluster_alarms:
|
||||
controller:
|
||||
cpu: ['cpu-critical-controller', 'cpu-warning-controller']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
log-fs: ['log-fs-critical', 'log-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
cpu: ['cpu-critical-controller', 'cpu-warning-controller']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
log-fs: ['log-fs-critical', 'log-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
<% if @detach_rabbitmq_enabled -%>
|
||||
rabbitmq-nodes:
|
||||
cpu: ['cpu-critical-rabbitmq', 'cpu-warning-rabbitmq']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
apply_to_node: rabbitmq-nodes
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
cpu: ['cpu-critical-rabbitmq', 'cpu-warning-rabbitmq']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
<% end -%>
|
||||
mysql-nodes:
|
||||
apply_to_node: mysql-nodes
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
<% if @detach_database_enabled -%>
|
||||
cpu: ['cpu-critical-mysql', 'cpu-warning-mysql']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
cpu: ['cpu-critical-mysql', 'cpu-warning-mysql']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
<% end -%>
|
||||
mysql-fs: ['mysql-fs-critical', 'mysql-fs-warning']
|
||||
mysql-fs: ['mysql-fs-critical', 'mysql-fs-warning']
|
||||
compute:
|
||||
cpu: ['cpu-critical-compute', 'cpu-warning-compute']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
nova-fs: ['nova-fs-critical', 'nova-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
apply_to_node: compute
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
cpu: ['cpu-critical-compute', 'cpu-warning-compute']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
nova-fs: ['nova-fs-critical', 'nova-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
storage:
|
||||
cpu: ['cpu-critical-storage', 'cpu-warning-storage']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
default:
|
||||
cpu: ['cpu-critical-default']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
apply_to_node: storage
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
cpu: ['cpu-critical-storage', 'cpu-warning-storage']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
elasticsearch-nodes:
|
||||
cpu: ['cpu-critical-default']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
data-fs: ['elasticsearch-fs-critical', 'elasticsearch-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
apply_to_node: elasticsearch-nodes
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
cpu: ['cpu-critical-default']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
data-fs: ['elasticsearch-fs-critical', 'elasticsearch-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
influxdb-nodes:
|
||||
cpu: ['cpu-critical-default']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
data-fs: ['influxdb-fs-critical', 'influxdb-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
apply_to_node: influxdb-nodes
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
cpu: ['cpu-critical-default']
|
||||
network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
|
||||
network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
|
||||
root-fs: ['root-fs-critical', 'root-fs-warning']
|
||||
data-fs: ['influxdb-fs-critical', 'influxdb-fs-warning']
|
||||
swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
|
||||
hdd-errors: ['hdd-errors-critical']
|
||||
|
||||
# Definition of the AFD service filters
|
||||
service_cluster_alarms:
|
||||
rabbitmq-cluster:
|
||||
pacemaker: ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical', 'rabbitmq-pacemaker-warning']
|
||||
queue: ['rabbitmq-queue-warning']
|
||||
memory: ['rabbitmq-memory-limit-critical', 'rabbitmq-memory-limit-warning']
|
||||
disk: ['rabbitmq-disk-limit-critical', 'rabbitmq-disk-limit-warning']
|
||||
apply_to_node: rabbitmq-nodes
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
pacemaker: ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical', 'rabbitmq-pacemaker-warning']
|
||||
queue: ['rabbitmq-queue-warning']
|
||||
memory: ['rabbitmq-memory-limit-critical', 'rabbitmq-memory-limit-warning']
|
||||
disk: ['rabbitmq-disk-limit-critical', 'rabbitmq-disk-limit-warning']
|
||||
rabbitmq-service:
|
||||
check: ['rabbitmq-check']
|
||||
apply_to_node: rabbitmq-nodes
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
check: ['rabbitmq-check']
|
||||
mysql:
|
||||
node-status: ['mysql-node-connected', 'mysql-node-ready']
|
||||
check: ['mysql-check']
|
||||
apply_to_node: mysql-nodes
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
node-status: ['mysql-node-connected', 'mysql-node-ready']
|
||||
check: ['mysql-check']
|
||||
apache:
|
||||
worker: ['apache-warning']
|
||||
check: ['apache-check']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
worker: ['apache-warning']
|
||||
check: ['apache-check']
|
||||
nova-api:
|
||||
http_errors: ['nova-api-http-errors']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
http_errors: ['nova-api-http-errors']
|
||||
nova-logs:
|
||||
error: ['nova-logs-error']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
error: ['nova-logs-error']
|
||||
heat-api:
|
||||
http_errors: ['heat-api-http-errors']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
http_errors: ['heat-api-http-errors']
|
||||
heat-logs:
|
||||
error: ['heat-logs-error']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
error: ['heat-logs-error']
|
||||
<% if not @storage_options["objects_ceph"] then -%>
|
||||
swift-api:
|
||||
http_errors: ['swift-api-http-errors']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
http_errors: ['swift-api-http-errors']
|
||||
swift-logs:
|
||||
error: ['swift-logs-error']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
error: ['swift-logs-error']
|
||||
<% end -%>
|
||||
cinder-api:
|
||||
http_errors: ['cinder-api-http-errors']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
http_errors: ['cinder-api-http-errors']
|
||||
cinder-logs:
|
||||
error: ['cinder-logs-error']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
error: ['cinder-logs-error']
|
||||
glance-api:
|
||||
http_errors: ['glance-api-http-errors']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
http_errors: ['glance-api-http-errors']
|
||||
glance-logs:
|
||||
error: ['glance-logs-error']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
error: ['glance-logs-error']
|
||||
neutron-api:
|
||||
http_errors: ['neutron-api-http-errors']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
http_errors: ['neutron-api-http-errors']
|
||||
neutron-logs:
|
||||
error: ['neutron-logs-error']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
error: ['neutron-logs-error']
|
||||
keystone-response-time:
|
||||
duration: ['keystone-response-time-duration']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
duration: ['keystone-response-time-duration']
|
||||
keystone-public-api:
|
||||
http_errors: ['keystone-public-api-http-errors']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
http_errors: ['keystone-public-api-http-errors']
|
||||
keystone-logs:
|
||||
error: ['keystone-logs-error']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
error: ['keystone-logs-error']
|
||||
keystone-admin-api:
|
||||
http_errors: ['keystone-admin-api-http-errors']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
http_errors: ['keystone-admin-api-http-errors']
|
||||
nova-instances:
|
||||
creation-time: ['instance-creation-time-warning']
|
||||
#TODO(scroiset): apply on compute nodes
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
creation-time: ['instance-creation-time-warning']
|
||||
ceph-mon-cluster:
|
||||
health: ['ceph-health-critical', 'ceph-health-warning']
|
||||
capacity: ['ceph-capacity-critical', 'ceph-capacity-warning']
|
||||
apply_to_node: ceph-mon
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
health: ['ceph-health-critical', 'ceph-health-warning']
|
||||
capacity: ['ceph-capacity-critical', 'ceph-capacity-warning']
|
||||
ceph-mon-service:
|
||||
check: ['ceph-mon-check']
|
||||
apply_to_node: ceph-mon
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
check: ['ceph-mon-check']
|
||||
ceph-osd-service:
|
||||
check: ['ceph-osd-check']
|
||||
apply_to_node: ceph-osd
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
check: ['ceph-osd-check']
|
||||
elasticsearch-cluster:
|
||||
health: ['elasticsearch-health-critical', 'elasticsearch-health-warning']
|
||||
apply_to_node: elasticsearch-nodes
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
health: ['elasticsearch-health-critical', 'elasticsearch-health-warning']
|
||||
elasticsearch-service:
|
||||
check: ['elasticsearch-check']
|
||||
apply_to_node: elasticsearch-nodes
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
check: ['elasticsearch-check']
|
||||
influxdb-service:
|
||||
check: ['influxdb-check']
|
||||
apply_to_node: influxdb-nodes
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
check: ['influxdb-check']
|
||||
haproxy-openstack:
|
||||
check: ['haproxy-check']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
check: ['haproxy-check']
|
||||
pacemaker-service:
|
||||
check: ['pacemaker-check']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
check: ['pacemaker-check']
|
||||
libvirt-service:
|
||||
check: ['libvirt-check']
|
||||
apply_to_node: compute
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
check: ['libvirt-check']
|
||||
memcached-service:
|
||||
check: ['memcached-check']
|
||||
apply_to_node: controller
|
||||
enable_notification: false
|
||||
activate_alerting: true
|
||||
alarms:
|
||||
check: ['memcached-check']
|
||||
|
@ -0,0 +1,34 @@
|
||||
---
|
||||
lma_collector:
|
||||
# Fuel roles mapping to alarm evaluator key.
|
||||
node_profiles:
|
||||
controller:
|
||||
roles: ['primary-controller', 'controller']
|
||||
<% if @detach_database_enabled -%>
|
||||
mysql-nodes:
|
||||
roles: ['primary-standalone-database', 'standalone-database']
|
||||
<% else -%>
|
||||
mysql-nodes:
|
||||
roles: ['primary-controller', 'controller']
|
||||
<% end -%>
|
||||
<% if @detach_rabbitmq_enabled -%>
|
||||
rabbitmq-nodes:
|
||||
roles: ['primary-standalone-rabbitmq', 'standalone-rabbitmq']
|
||||
<% else -%>
|
||||
rabbitmq-nodes:
|
||||
roles: ['primary-controller', 'controller']
|
||||
<% end -%>
|
||||
<% if @storage_options["volumes_ceph"] then -%>
|
||||
ceph-mon:
|
||||
roles: ['primary-controller', 'controller']
|
||||
ceph-osd:
|
||||
roles: ['ceph-osd']
|
||||
<% end -%>
|
||||
compute:
|
||||
roles: ['compute']
|
||||
storage:
|
||||
roles: ['cinder']
|
||||
elasticsearch-nodes:
|
||||
roles: ['primary-elasticsearch_kibana', 'elasticsearch_kibana']
|
||||
influxdb-nodes:
|
||||
roles: ['primary-influxdb_grafana', 'influxdb_grafana']
|
@ -20,6 +20,8 @@ define lma_collector::afd_filter (
|
||||
$alarms,
|
||||
$alarms_definitions,
|
||||
$message_matcher,
|
||||
$activate_alerting = true,
|
||||
$enable_notification = false,
|
||||
) {
|
||||
include lma_collector::params
|
||||
include lma_collector::service::metric
|
||||
@ -44,11 +46,13 @@ define lma_collector::afd_filter (
|
||||
message_matcher => "(Type == \'metric\' || Type == \'heka.sandbox.metric\') && (${message_matcher})",
|
||||
ticker_interval => 10,
|
||||
config => {
|
||||
hostname => $::hostname,
|
||||
afd_type => $type,
|
||||
afd_file => $afd_file,
|
||||
afd_cluster_name => $cluster_name,
|
||||
afd_logical_name => $logical_name,
|
||||
hostname => $::hostname,
|
||||
afd_type => $type,
|
||||
afd_file => $afd_file,
|
||||
afd_cluster_name => $cluster_name,
|
||||
afd_logical_name => $logical_name,
|
||||
activate_alerting => $activate_alerting,
|
||||
enable_notification => $enable_notification,
|
||||
},
|
||||
module_directory => $lua_modules_dir,
|
||||
require => File[$afd_filename],
|
||||
|
Loading…
Reference in New Issue
Block a user