From 731265cdc8ff7506b82cf507cd38ccc37287ac2e Mon Sep 17 00:00:00 2001 From: Swann Croiset Date: Tue, 11 Oct 2016 00:52:47 +0200 Subject: [PATCH] Support alerting attribute per AFD Change-Id: I29aba65d35a12cc56a91c10f893e38a35ea3abf9 --- .../parser/functions/get_afd_filters.rb | 38 +- .../classes/fuel_lma_collector_afds_spec.rb | 26 +- .../spec/functions/get_afd_filters_spec.rb | 75 +- .../templates/alarming.yaml.erb | 717 +++++++++++------- 4 files changed, 528 insertions(+), 328 deletions(-) diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_afd_filters.rb b/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_afd_filters.rb index 0d7441f39..8cb7745c8 100644 --- a/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_afd_filters.rb +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/lib/puppet/parser/functions/get_afd_filters.rb @@ -24,10 +24,10 @@ # Ex: # # ARG0: -# {"rabbitmq"=>{"apply_to_node" => "controller", "alarms" => {"queue"=>["rabbitmq-queue-warning"]}}, -# "apache"=>{"apply_to_node" => "controller", "alarms" => {"worker"=>["apache-warning"]}}, -# "memcached"=>{"apply_to_node"=>"controller", "alarms" => {"all"=>["memcached-warning"]}}, -# "haproxy"=>{"apply_to_node" => "controller", "alarms" => {"alive"=>["haproxy-warning"]}}} +# {"rabbitmq"=>{"apply_to_node" => "controller", "members" => {"queue"=> {"alarms" => ["rabbitmq-queue-warning"]}}}, +# "apache"=>{"apply_to_node" => "controller", "members" => {"worker"=> {"alarms" => ["apache-warning"]}}}, +# "memcached"=>{"apply_to_node"=>"controller", "members" => {"all"=> {"alarms" => ["memcached-warning"]}}}, +# "haproxy"=>{"apply_to_node" => "controller", "members" => {"alive"=> {"alarms" => ["haproxy-warning"]}}}} # # ARG1: # @@ -114,28 +114,30 @@ module Puppet::Parser::Functions default_profile = false end - activate_alerting=true - enable_notification=false + default_activate_alerting=true + default_enable_notification=false if afds.has_key?('alerting') if afds['alerting'] == 'disabled' - activate_alerting=false + default_activate_alerting=false elsif afds['alerting'] == 'enabled_with_notification' - enable_notification = true + default_enable_notification = true end end - afds['alarms'].each do |afd_name, alarms| + afds['members'].each do |afd_name, alarms| metrics = Set.new([]) matches = false - alarms.each do |a_name| + activate_alerting = default_activate_alerting + enable_notification = default_enable_notification + if alarms.has_key?('alerting') + if alarms['alerting'] == 'disabled' + activate_alerting=false + elsif alarms['alerting'] == 'enabled_with_notification' + enable_notification = true + end + end + alarms['alarms'].each do |a_name| afd = alarm_definitions.select {|defi| defi['name'] == a_name} next if afd.empty? # user mention an unknown alarm for this AFD - #if afd[0].has_key('alerting') - # if afd[0]['alerting'] == 'disabled' - # activate_alerting=false - # elsif afd[0]['alerting'] == 'enabled_with_notification' - # enable_notification = true - # end - #end afd[0]['trigger']['rules'].each do |r| if metric_defs.has_key?(r['metric']) and metric_defs[r['metric']].has_key?('collected_on') and afd_profiles.include? metric_defs[r['metric']]['collected_on'] @@ -154,7 +156,7 @@ module Puppet::Parser::Functions 'type' => type, 'cluster_name' => cluster_name, 'logical_name' => afd_name, - 'alarms' => alarms, + 'alarms' => alarms['alarms'], 'alarms_definitions' => alarm_definitions, 'message_matcher' => message_matcher, 'activate_alerting' => activate_alerting, diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/spec/classes/fuel_lma_collector_afds_spec.rb b/deployment_scripts/puppet/modules/fuel_lma_collector/spec/classes/fuel_lma_collector_afds_spec.rb index 2738dd9d7..f558afd3a 100644 --- a/deployment_scripts/puppet/modules/fuel_lma_collector/spec/classes/fuel_lma_collector_afds_spec.rb +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/spec/classes/fuel_lma_collector_afds_spec.rb @@ -27,16 +27,20 @@ describe 'fuel_lma_collector::afds' do 'controller' => { 'apply_to_node' => 'controller', - 'alarms' => { - 'cpu' => ['cpu_warning'] + 'members' => { + 'cpu' => { + "alarms" => ['cpu_warning'] + } } } }, - :service_cluster_alarms => { + :service_cluster_alarms=> { 'mysql' => { 'apply_to_node' => 'controller', - 'alarms' => { - 'all' => ['db_warning'] + 'members' => { + 'all' => { + "alarms" => ['db_warning'] + } } } }, @@ -81,8 +85,10 @@ describe 'fuel_lma_collector::afds' do :node_cluster_alarms => { 'controller' => { 'apply_to_node' => 'controller', - 'alarms' => { - 'cpu' => ['cpu_warning'] + 'members' => { + 'cpu' => { + "alarms" => ['cpu_warning'] + } } } }, @@ -115,8 +121,10 @@ describe 'fuel_lma_collector::afds' do 'others' => { 'apply_to_node' => 'default', - 'alarms' => { - 'cpu' => ['cpu_warning'] + 'members' => { + 'cpu' => { + "alarms" => ['cpu_warning'] + } } } }, diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/spec/functions/get_afd_filters_spec.rb b/deployment_scripts/puppet/modules/fuel_lma_collector/spec/functions/get_afd_filters_spec.rb index 07cb3cb54..14dc3fe51 100644 --- a/deployment_scripts/puppet/modules/fuel_lma_collector/spec/functions/get_afd_filters_spec.rb +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/spec/functions/get_afd_filters_spec.rb @@ -104,17 +104,27 @@ describe 'get_afd_filters' do afds_nodes = { "controller" => { "apply_to_node" => "controller", - "alerting" => 'enabled_with_notification', - "alarms" => { - "system" => ["cpu-critical-controller", "cpu-warning-controller"], + "alerting" => 'enabled', + "members" => { + "system" => { + "alerting" => 'enabled_with_notification', + "alarms" => ["cpu-critical-controller", "cpu-warning-controller"], + }, + "foo" => { + "alarms" => ["cpu-critical-controller", "cpu-warning-controller"], + } }, }, "compute" => { "apply_to_node" => "compute", "alerting" => 'enabled_with_notification', - "alarms" => { - "system" => ["cpu-critical-compute", "cpu-warning-compute"], - "fs" => ["fs-critical"], + "members" => { + "system" => { + "alarms" => ["cpu-critical-compute", "cpu-warning-compute"], + }, + "fs" => { + "alarms" => ["fs-critical"], + } }, } } @@ -131,6 +141,16 @@ describe 'get_afd_filters' do "message_matcher"=>"Fields[name] == 'cpu_idle' || Fields[name] == 'cpu_wait'", "enable_notification" => true, "activate_alerting" => true, + }, + "controller_foo"=> + {"type"=>"node", + "cluster_name"=>"controller", + "logical_name"=>"foo", + "alarms"=>["cpu-critical-controller", "cpu-warning-controller"], + "alarms_definitions"=> alarms_nodes, + "message_matcher"=>"Fields[name] == 'cpu_idle' || Fields[name] == 'cpu_wait'", + "enable_notification" => false, + "activate_alerting" => true, } }) @@ -194,6 +214,16 @@ describe 'get_afd_filters' do "message_matcher"=>"Fields[name] == 'cpu_idle' || Fields[name] == 'cpu_wait'", "activate_alerting" => true, "enable_notification" => true, + }, + "controller_foo"=> + {"type"=>"node", + "cluster_name"=>"controller", + "logical_name"=>"foo", + "alarms"=>["cpu-critical-controller", "cpu-warning-controller"], + "alarms_definitions"=> alarms_nodes, + "message_matcher"=>"Fields[name] == 'cpu_idle' || Fields[name] == 'cpu_wait'", + "enable_notification" => false, + "activate_alerting" => true, } }) } @@ -235,15 +265,19 @@ describe 'get_afd_filters' do "rabbitmq" => { "apply_to_node" => "controller", "alerting" => 'enabled', - "alarms" => { - "queue" => ["rabbitmq-queue-warning"] + "members" => { + "queue" => { + "alarms" => ["rabbitmq-queue-warning"] + } }, }, "apache" => { "apply_to_node" => "controller", "alerting" => 'enabled', - "alarms" => { - "worker" => ['apache-warning'], + "members" => { + "worker" => { + "alarms" => ['apache-warning'], + } }, }, } @@ -339,21 +373,28 @@ describe 'get_afd_filters' do "nova-free-resources" => { "apply_to_node" => "compute", "alerting" => 'enabled', - "alarms" => { - "free-vcpu" => ['free_vcpu_warning'], + "members" => { + "free-vcpu" => { + "alerting" => 'disabled', + "alarms" => ['free_vcpu_warning'], + } }, }, "nova-total-free-resources" => { "alerting" => 'enabled', - "alarms" => { - "total-free-vcpu" => ['total_free_vcpu_warning'], + "members" => { + "total-free-vcpu" => { + "alarms" => ['total_free_vcpu_warning'], + } }, }, "controller" => { "apply_to_node" => "controller", "alerting" => 'enabled_with_notification', - "alarms" => { - "system" => ["cpu-critical-controller", "cpu-warning-controller"], + "members" => { + "system" => { + "alarms" => ["cpu-critical-controller", "cpu-warning-controller"], + } }, }, } @@ -376,7 +417,7 @@ describe 'get_afd_filters' do "alarms_definitions"=> alarms_services_o, "alarms"=>["free_vcpu_warning"], "message_matcher"=>"Fields[name] == 'free_vcpu'", - "activate_alerting" => true, + "activate_alerting" => false, "enable_notification" => false, }, "nova-total-free-resources_total-free-vcpu"=> diff --git a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb index ba6b5af55..d15da268e 100644 --- a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb +++ b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb @@ -2915,459 +2915,589 @@ lma_collector: controller: apply_to_node: controller alerting: enabled - alarms: - cpu: ['cpu-critical-controller', 'cpu-warning-controller'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - log-fs: ['log-fs-critical', 'log-fs-warning'] - other-fs: ['other-fs-critical', 'other-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + members: + cpu: + alarms: ['cpu-critical-controller', 'cpu-warning-controller'] + network-rx: + alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: + alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: + alarms: ['root-fs-critical', 'root-fs-warning'] + log-fs: + alarms: ['log-fs-critical', 'log-fs-warning'] + other-fs: + alarms: ['other-fs-critical', 'other-fs-warning'] + swap: + alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: + alarms: ['hdd-errors-critical'] <% if @detach_rabbitmq_enabled -%> rabbitmq-nodes: apply_to_node: rabbitmq-nodes alerting: enabled - alarms: - cpu: ['cpu-critical-rabbitmq', 'cpu-warning-rabbitmq'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - other-fs: ['other-fs-critical', 'other-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + members: + cpu: + alarms: ['cpu-critical-rabbitmq', 'cpu-warning-rabbitmq'] + network-rx: + alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: + alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: + alarms: ['root-fs-critical', 'root-fs-warning'] + other-fs: + alarms: ['other-fs-critical', 'other-fs-warning'] + swap: + alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: + alarms: ['hdd-errors-critical'] <% end -%> mysql-nodes: apply_to_node: mysql-nodes alerting: enabled - alarms: + members: <% if @detach_database_enabled -%> - cpu: ['cpu-critical-mysql', 'cpu-warning-mysql'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - other-fs: ['other-fs-critical', 'other-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + cpu: + alarms: ['cpu-critical-mysql', 'cpu-warning-mysql'] + network-rx: + alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: + alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: + alarms: ['root-fs-critical', 'root-fs-warning'] + other-fs: + alarms: ['other-fs-critical', 'other-fs-warning'] + swap: + alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: + alarms: ['hdd-errors-critical'] <% end -%> - mysql-fs: ['mysql-fs-critical', 'mysql-fs-warning'] + mysql-fs: + alarms: ['mysql-fs-critical', 'mysql-fs-warning'] compute: apply_to_node: compute alerting: enabled - alarms: - cpu: ['cpu-critical-compute', 'cpu-warning-compute'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - nova-fs: ['nova-fs-critical', 'nova-fs-warning'] - other-fs: ['other-fs-critical', 'other-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + members: + cpu: + alarms: ['cpu-critical-compute', 'cpu-warning-compute'] + network-rx: + alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: + alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: + alarms: ['root-fs-critical', 'root-fs-warning'] + nova-fs: + alarms: ['nova-fs-critical', 'nova-fs-warning'] + other-fs: + alarms: ['other-fs-critical', 'other-fs-warning'] + swap: + alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: + alarms: ['hdd-errors-critical'] storage: apply_to_node: storage alerting: enabled - alarms: - cpu: ['cpu-critical-storage', 'cpu-warning-storage'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - other-fs: ['other-fs-critical', 'other-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + members: + cpu: + alarms: ['cpu-critical-storage', 'cpu-warning-storage'] + network-rx: + alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: + alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: + alarms: ['root-fs-critical', 'root-fs-warning'] + other-fs: + alarms: ['other-fs-critical', 'other-fs-warning'] + swap: + alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: + alarms: ['hdd-errors-critical'] <% if @storage_options["volumes_ceph"] then -%> - osd-disk: ['osd-disk-critical'] + osd-disk: + alarms: ['osd-disk-critical'] <% end -%> elasticsearch-nodes: apply_to_node: elasticsearch-nodes alerting: enabled - alarms: - cpu: ['cpu-critical-default'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - data-fs: ['elasticsearch-fs-critical', 'elasticsearch-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + members: + cpu: + alarms: ['cpu-critical-default'] + network-rx: + alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: + alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: + alarms: ['root-fs-critical', 'root-fs-warning'] + data-fs: + alarms: ['elasticsearch-fs-critical', 'elasticsearch-fs-warning'] + swap: + alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: + alarms: ['hdd-errors-critical'] influxdb-nodes: apply_to_node: influxdb-nodes alerting: enabled - alarms: - cpu: ['cpu-critical-default'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - data-fs: ['influxdb-fs-critical', 'influxdb-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] - # This is the default alarms configured for all nodes with unknown roles + members: + cpu: + alarms: ['cpu-critical-default'] + network-rx: + alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: + alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: + alarms: ['root-fs-critical', 'root-fs-warning'] + data-fs: + alarms: ['influxdb-fs-critical', 'influxdb-fs-warning'] + swap: + alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: + alarms: ['hdd-errors-critical'] + # This is the default members configured for all nodes with unknown roles default: apply_to_node: default # Operator wants to receive alert notification for individual nodes alerting: enabled_with_notification - alarms: - cpu: ['cpu-critical-default'] - network-rx: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] - network-tx: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] - root-fs: ['root-fs-critical', 'root-fs-warning'] - other-fs: ['other-fs-critical', 'other-fs-warning'] - swap: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] - hdd-errors: ['hdd-errors-critical'] + members: + cpu: + alarms: ['cpu-critical-default'] + network-rx: + alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx'] + network-tx: + alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx'] + root-fs: + alarms: ['root-fs-critical', 'root-fs-warning'] + other-fs: + alarms: ['other-fs-critical', 'other-fs-warning'] + swap: + alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning'] + hdd-errors: + alarms: ['hdd-errors-critical'] # Definition of the AFD service filters service_cluster_alarms: rabbitmq-cluster: apply_to_node: rabbitmq-nodes alerting: enabled - alarms: - pacemaker: ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical', 'rabbitmq-pacemaker-warning'] - queue: ['rabbitmq-queue-warning'] - memory: ['rabbitmq-memory-limit-critical', 'rabbitmq-memory-limit-warning'] - disk: ['rabbitmq-disk-limit-critical', 'rabbitmq-disk-limit-warning'] + members: + pacemaker: + alarms: ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical', 'rabbitmq-pacemaker-warning'] + queue: + alarms: ['rabbitmq-queue-warning'] + memory: + alarms: ['rabbitmq-memory-limit-critical', 'rabbitmq-memory-limit-warning'] + disk: + alarms: ['rabbitmq-disk-limit-critical', 'rabbitmq-disk-limit-warning'] rabbitmq-service: apply_to_node: rabbitmq-nodes alerting: enabled - alarms: - check: ['rabbitmq-check'] + members: + check: + alarms: ['rabbitmq-check'] mysql: apply_to_node: mysql-nodes alerting: enabled - alarms: - node-status: ['mysql-node-connected', 'mysql-node-ready'] - check: ['mysql-check'] + members: + node-status: + alarms: ['mysql-node-connected', 'mysql-node-ready'] + check: + alarms: ['mysql-check'] apache: apply_to_node: controller alerting: enabled - alarms: - worker: ['apache-warning'] - check: ['apache-check'] + members: + worker: + alarms: ['apache-warning'] + check: + alarms: ['apache-check'] nova-api: apply_to_node: controller alerting: enabled - alarms: - http_errors: ['nova-api-http-errors'] + members: + http_errors: + alarms: ['nova-api-http-errors'] backends: - - 'nova-api-backends-all-down' - - 'nova-api-backends-majority-down' - - 'nova-api-backends-one-down' + alarms: + - 'nova-api-backends-all-down' + - 'nova-api-backends-majority-down' + - 'nova-api-backends-one-down' nova-api-check: alerting: enabled - alarms: - vip: ['nova-api-check-failed'] + members: + vip: + alarms: ['nova-api-check-failed'] nova-metadata-api: apply_to_node: controller alerting: enabled - alarms: + members: backends: - - 'nova-metadata-api-backends-all-down' - - 'nova-metadata-api-backends-majority-down' - - 'nova-metadata-api-backends-one-down' + alarms: + - 'nova-metadata-api-backends-all-down' + - 'nova-metadata-api-backends-majority-down' + - 'nova-metadata-api-backends-one-down' nova-novncproxy-websocket: apply_to_node: controller alerting: enabled - alarms: + members: backends: - - 'nova-novncproxy-websocket-api-backends-all-down' - - 'nova-novncproxy-websocket-api-backends-majority-down' - - 'nova-novncproxy-websocket-api-backends-one-down' + alarms: + - 'nova-novncproxy-websocket-api-backends-all-down' + - 'nova-novncproxy-websocket-api-backends-majority-down' + - 'nova-novncproxy-websocket-api-backends-one-down' nova-api-endpoint: apply_to_node: controller alerting: enabled - alarms: - endpoint: ['nova-api-local-endpoint'] + members: + endpoint: + alarms: ['nova-api-local-endpoint'] nova-logs: apply_to_node: controller alerting: enabled - alarms: - error: ['nova-logs-error'] + members: + error: + alarms: ['nova-logs-error'] nova-logs-compute: apply_to_node: compute alerting: enabled - alarms: - error: ['nova-logs-error'] + members: + error: + alarms: ['nova-logs-error'] nova-cert: alerting: enabled - alarms: + members: workers: - - 'nova-cert-all-down' - - 'nova-cert-majority-down' - - 'nova-cert-one-down' + alarms: + - 'nova-cert-all-down' + - 'nova-cert-majority-down' + - 'nova-cert-one-down' nova-consoleauth: alerting: enabled - alarms: + members: workers: - - 'nova-consoleauth-all-down' - - 'nova-consoleauth-majority-down' - - 'nova-consoleauth-one-down' + alarms: + - 'nova-consoleauth-all-down' + - 'nova-consoleauth-majority-down' + - 'nova-consoleauth-one-down' nova-compute: alerting: enabled - alarms: + members: workers: - - 'nova-compute-all-down' - - 'nova-compute-majority-down' - - 'nova-compute-one-down' + alarms: + - 'nova-compute-all-down' + - 'nova-compute-majority-down' + - 'nova-compute-one-down' nova-conductor: alerting: enabled - alarms: + members: workers: - - 'nova-conductor-all-down' - - 'nova-conductor-majority-down' - - 'nova-conductor-one-down' + alarms: + - 'nova-conductor-all-down' + - 'nova-conductor-majority-down' + - 'nova-conductor-one-down' nova-scheduler: alerting: enabled - alarms: + members: workers: - - 'nova-scheduler-all-down' - - 'nova-scheduler-majority-down' - - 'nova-scheduler-one-down' + alarms: + - 'nova-scheduler-all-down' + - 'nova-scheduler-majority-down' + - 'nova-scheduler-one-down' heat-api: apply_to_node: controller alerting: enabled - alarms: - http_errors: ['heat-api-http-errors'] + members: + http_errors: + alarms: ['heat-api-http-errors'] backends: - - 'heat-api-backends-all-down' - - 'heat-api-backends-majority-down' - - 'heat-api-backends-one-down' + alarms: + - 'heat-api-backends-all-down' + - 'heat-api-backends-majority-down' + - 'heat-api-backends-one-down' heat-cfn-api: apply_to_node: controller alerting: enabled - alarms: + members: backends: - - 'heat-cfn-api-backends-all-down' - - 'heat-cfn-api-backends-majority-down' - - 'heat-cfn-api-backends-one-down' + alarms: + - 'heat-cfn-api-backends-all-down' + - 'heat-cfn-api-backends-majority-down' + - 'heat-cfn-api-backends-one-down' heat-cloudwatch-api: apply_to_node: controller alerting: enabled - alarms: + members: backends: - - 'heat-cloudwatch-api-backends-all-down' - - 'heat-cloudwatch-api-backends-majority-down' - - 'heat-cloudwatch-api-backends-one-down' + alarms: + - 'heat-cloudwatch-api-backends-all-down' + - 'heat-cloudwatch-api-backends-majority-down' + - 'heat-cloudwatch-api-backends-one-down' heat-api-check: alerting: enabled - alarms: - vip: ['heat-api-check-failed'] + members: + vip: + alarms: ['heat-api-check-failed'] heat-cfn-api-check: alerting: enabled - alarms: - vip: ['heat-cfn-api-check-failed'] + members: + vip: + alarms: ['heat-cfn-api-check-failed'] heat-api-endpoint: apply_to_node: controller alerting: enabled - alarms: - endpoint: ['heat-api-local-endpoint'] + members: + endpoint: + alarms: ['heat-api-local-endpoint'] heat-cfn-api-endpoint: apply_to_node: controller alerting: enabled - alarms: - endpoint: ['heat-cfn-api-local-endpoint'] + members: + endpoint: + alarms: ['heat-cfn-api-local-endpoint'] heat-logs: apply_to_node: controller alerting: enabled - alarms: - error: ['heat-logs-error'] + members: + error: + alarms: ['heat-logs-error'] <% if not @storage_options["objects_ceph"] then -%> swift-api: apply_to_node: controller alerting: enabled - alarms: - http_errors: ['swift-api-http-errors'] + members: + http_errors: + alarms: ['swift-api-http-errors'] backends: - - 'swift-api-backends-all-down' - - 'swift-api-backends-majority-down' - - 'swift-api-backends-one-down' + alarms: + - 'swift-api-backends-all-down' + - 'swift-api-backends-majority-down' + - 'swift-api-backends-one-down' swift-api-check: alerting: enabled - alarms: - vip: ['swift-api-check-failed'] + members: + vip: + alarms: ['swift-api-check-failed'] swift-api-endpoint: apply_to_node: controller alerting: enabled - alarms: - endpoint: ['swift-api-local-endpoint'] + members: + endpoint: + alarms: ['swift-api-local-endpoint'] swift-s3-api-check: alerting: enabled - alarms: - vip: ['swift-s3-api-check-failed'] + members: + vip: + alarms: ['swift-s3-api-check-failed'] swift-logs: apply_to_node: controller alerting: enabled - alarms: - error: ['swift-logs-error'] + members: + error: + alarms: ['swift-logs-error'] <% end -%> cinder-api: apply_to_node: controller alerting: enabled - alarms: - http_errors: ['cinder-api-http-errors'] + members: + http_errors: + alarms: ['cinder-api-http-errors'] backends: - - 'cinder-api-backends-all-down' - - 'cinder-api-backends-majority-down' - - 'cinder-api-backends-one-down' + alarms: + - 'cinder-api-backends-all-down' + - 'cinder-api-backends-majority-down' + - 'cinder-api-backends-one-down' cinder-api-check: alerting: enabled - alarms: - vip: ['cinder-api-check-failed'] + members: + vip: + alarms: ['cinder-api-check-failed'] cinder-v2-api-check: alerting: enabled - alarms: - vip: ['cinder-v2-api-check-failed'] + members: + vip: + alarms: ['cinder-v2-api-check-failed'] cinder-api-endpoint: apply_to_node: controller alerting: enabled - alarms: - endpoint: ['cinder-api-local-endpoint'] + members: + endpoint: + alarms: ['cinder-api-local-endpoint'] cinder-logs: apply_to_node: controller alerting: enabled - alarms: - error: ['cinder-logs-error'] + members: + error: + alarms: ['cinder-logs-error'] cinder-scheduler: alerting: enabled - alarms: + members: workers: - - 'cinder-scheduler-all-down' - - 'cinder-scheduler-majority-down' - - 'cinder-scheduler-one-down' + alarms: + - 'cinder-scheduler-all-down' + - 'cinder-scheduler-majority-down' + - 'cinder-scheduler-one-down' cinder-volume: alerting: enabled - alarms: + members: workers: - - 'cinder-volume-all-down' - - 'cinder-volume-majority-down' - - 'cinder-volume-one-down' + alarms: + - 'cinder-volume-all-down' + - 'cinder-volume-majority-down' + - 'cinder-volume-one-down' <% if not @storage_options["volumes_ceph"] then -%> cinder-volume-logs: apply_to_node: storage alerting: enabled - alarms: - error: ['cinder-logs-error'] + members: + error: + alarms: ['cinder-logs-error'] <% end -%> glance-api: apply_to_node: controller alerting: enabled - alarms: - http_errors: ['glance-api-http-errors'] + members: + http_errors: + alarms: ['glance-api-http-errors'] backends: - - 'glance-api-backends-all-down' - - 'glance-api-backends-majority-down' - - 'glance-api-backends-one-down' + alarms: + - 'glance-api-backends-all-down' + - 'glance-api-backends-majority-down' + - 'glance-api-backends-one-down' glance-registry-api: apply_to_node: controller alerting: enabled - alarms: + members: backends: - - 'glance-registry-api-backends-all-down' - - 'glance-registry-api-backends-majority-down' - - 'glance-registry-api-backends-one-down' + alarms: + - 'glance-registry-api-backends-all-down' + - 'glance-registry-api-backends-majority-down' + - 'glance-registry-api-backends-one-down' glance-api-check: alerting: enabled - alarms: - vip: ['glance-api-check-failed'] + members: + vip: + alarms: ['glance-api-check-failed'] glance-api-endpoint: apply_to_node: controller alerting: enabled - alarms: - endpoint: ['glance-api-local-endpoint'] + members: + endpoint: + alarms: ['glance-api-local-endpoint'] glance-logs: apply_to_node: controller alerting: enabled - alarms: - error: ['glance-logs-error'] + members: + error: + alarms: ['glance-logs-error'] neutron-api: apply_to_node: controller alerting: enabled - alarms: - http_errors: ['neutron-api-http-errors'] + members: + http_errors: + alarms: ['neutron-api-http-errors'] backends: - - 'neutron-api-backends-all-down' - - 'neutron-api-backends-majority-down' - - 'neutron-api-backends-one-down' + alarms: + - 'neutron-api-backends-all-down' + - 'neutron-api-backends-majority-down' + - 'neutron-api-backends-one-down' neutron-api-check: alerting: enabled - alarms: - vip: ['neutron-api-check-failed'] + members: + vip: + alarms: ['neutron-api-check-failed'] neutron-api-endpoint: apply_to_node: controller alerting: enabled - alarms: - endpoint: ['neutron-api-local-endpoint'] + members: + endpoint: + alarms: ['neutron-api-local-endpoint'] neutron-logs: apply_to_node: controller alerting: enabled - alarms: - error: ['neutron-logs-error'] + members: + error: + alarms: ['neutron-logs-error'] neutron-l3: alerting: enabled - alarms: + members: workers: - - 'neutron-l3-all-down' - - 'neutron-l3-majority-down' - - 'neutron-l3-one-down' + alarms: + - 'neutron-l3-all-down' + - 'neutron-l3-majority-down' + - 'neutron-l3-one-down' neutron-dhcp: alerting: enabled - alarms: + members: workers: - - 'neutron-dhcp-all-down' - - 'neutron-dhcp-majority-down' - - 'neutron-dhcp-one-down' + alarms: + - 'neutron-dhcp-all-down' + - 'neutron-dhcp-majority-down' + - 'neutron-dhcp-one-down' neutron-metadata: alerting: enabled - alarms: + members: workers: - - 'neutron-metadata-all-down' - - 'neutron-metadata-majority-down' - - 'neutron-metadata-one-down' + alarms: + - 'neutron-metadata-all-down' + - 'neutron-metadata-majority-down' + - 'neutron-metadata-one-down' neutron-openvswitch: alerting: enabled - alarms: + members: workers: - - 'neutron-openvswitch-all-down' - - 'neutron-openvswitch-majority-down' - - 'neutron-openvswitch-one-down' + alarms: + - 'neutron-openvswitch-all-down' + - 'neutron-openvswitch-majority-down' + - 'neutron-openvswitch-one-down' neutron-logs-compute: apply_to_node: compute alerting: enabled - alarms: - error: ['neutron-logs-error'] + members: + error: + alarms: ['neutron-logs-error'] keystone-response-time: apply_to_node: controller alerting: enabled - alarms: - duration: ['keystone-response-time-duration'] + members: + duration: + alarms: ['keystone-response-time-duration'] keystone-public-api: apply_to_node: controller alerting: enabled - alarms: - http_errors: ['keystone-public-api-http-errors'] + members: + http_errors: + alarms: ['keystone-public-api-http-errors'] backends: - - 'keystone-public-api-backends-all-down' - - 'keystone-public-api-backends-majority-down' - - 'keystone-public-api-backends-one-down' + alarms: + - 'keystone-public-api-backends-all-down' + - 'keystone-public-api-backends-majority-down' + - 'keystone-public-api-backends-one-down' keystone-public-api-check: alerting: enabled - alarms: - vip: ['keystone-public-api-check-failed'] + members: + vip: + alarms: ['keystone-public-api-check-failed'] keystone-public-api-endpoint: apply_to_node: controller alerting: enabled - alarms: - endpoint: ['keystone-public-api-local-endpoint'] + members: + endpoint: + alarms: ['keystone-public-api-local-endpoint'] keystone-logs: apply_to_node: controller alerting: enabled - alarms: - error: ['keystone-logs-error'] + members: + error: + alarms: ['keystone-logs-error'] keystone-admin-api: apply_to_node: controller alerting: enabled - alarms: - http_errors: ['keystone-admin-api-http-errors'] + members: + http_errors: + alarms: ['keystone-admin-api-http-errors'] backends: - - 'keystone-admin-api-backends-all-down' - - 'keystone-admin-api-backends-majority-down' - - 'keystone-admin-api-backends-one-down' + alarms: + - 'keystone-admin-api-backends-all-down' + - 'keystone-admin-api-backends-majority-down' + - 'keystone-admin-api-backends-one-down' <% if @tls_enabled then -%> horizon-https: <% else -%> @@ -3375,92 +3505,111 @@ lma_collector: <% end -%> apply_to_node: controller alerting: enabled - alarms: - http_errors: ['horizon-web-http-errors'] + members: + http_errors: + alarms: ['horizon-web-http-errors'] backends: - - 'horizon-web-api-backends-all-down' - - 'horizon-web-api-backends-majority-down' - - 'horizon-web-api-backends-one-down' + alarms: + - 'horizon-web-api-backends-all-down' + - 'horizon-web-api-backends-majority-down' + - 'horizon-web-api-backends-one-down' nova-instances: #TODO(scroiset): apply on compute nodes apply_to_node: controller alerting: enabled - alarms: - creation-time: ['instance-creation-time-warning'] + members: + creation-time: + alarms: ['instance-creation-time-warning'] nova-free-vcpu: alerting: enabled - alarms: - nova-free-vcpu: ['total-nova-free-vcpu-warning'] + members: + nova-free-vcpu: + alarms: ['total-nova-free-vcpu-warning'] nova-free-memory: alerting: enabled - alarms: - nova-free-memory: ['total-nova-free-memory-warning'] + members: + nova-free-memory: + alarms: ['total-nova-free-memory-warning'] ceph-mon-cluster: apply_to_node: ceph-mon alerting: enabled - alarms: - health: ['ceph-health-critical', 'ceph-health-warning'] - capacity: ['ceph-capacity-critical', 'ceph-capacity-warning'] + members: + health: + alarms: ['ceph-health-critical', 'ceph-health-warning'] + capacity: + alarms: ['ceph-capacity-critical', 'ceph-capacity-warning'] ceph-mon-service: apply_to_node: ceph-mon alerting: enabled - alarms: - check: ['ceph-mon-check'] + members: + check: + alarms: ['ceph-mon-check'] <% if @storage_options["volumes_ceph"] then -%> ceph-osd-service: apply_to_node: storage alerting: enabled - alarms: - check: ['ceph-osd-check'] + members: + check: + alarms: ['ceph-osd-check'] <% end -%> elasticsearch-cluster: apply_to_node: elasticsearch-nodes alerting: enabled - alarms: - health: ['elasticsearch-health-critical', 'elasticsearch-health-warning'] + members: + health: + alarms: ['elasticsearch-health-critical', 'elasticsearch-health-warning'] elasticsearch-service: apply_to_node: elasticsearch-nodes alerting: enabled - alarms: - check: ['elasticsearch-check'] + members: + check: + alarms: ['elasticsearch-check'] influxdb-service: apply_to_node: influxdb-nodes alerting: enabled - alarms: - check: ['influxdb-check'] + members: + check: + alarms: ['influxdb-check'] influxdb-api-check: alerting: enabled - alarms: - vip: ['influxdb-api-check-failed'] + members: + vip: + alarms: ['influxdb-api-check-failed'] haproxy-openstack: apply_to_node: controller alerting: enabled - alarms: - check: ['haproxy-check'] + members: + check: + alarms: ['haproxy-check'] pacemaker-service: apply_to_node: controller alerting: enabled - alarms: - check: ['pacemaker-check'] + members: + check: + alarms: ['pacemaker-check'] libvirt-service: apply_to_node: compute alerting: enabled - alarms: - check: ['libvirt-check'] + members: + check: + alarms: ['libvirt-check'] memcached-service: apply_to_node: controller alerting: enabled - alarms: - check: ['memcached-check'] + members: + check: + alarms: ['memcached-check'] ceilometer-api-check: alerting: enabled - alarms: - vip: ['ceilometer-api-check-failed'] + members: + vip: + alarms: ['ceilometer-api-check-failed'] mysqld-tcp: apply_to_node: controller alerting: enabled - alarms: + members: backends: - - 'mysqld-tcp-api-backends-all-down' - - 'mysqld-tcp-api-backends-majority-down' - - 'mysqld-tcp-api-backends-one-down' + alarms: + - 'mysqld-tcp-api-backends-all-down' + - 'mysqld-tcp-api-backends-majority-down' + - 'mysqld-tcp-api-backends-one-down'