# fuel-plugin-lma-collector/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb
---
lma_collector:
  alarms:
    - name: 'cpu-critical-controller'
      description: 'The CPU usage is too high (controller node)'
      severity: 'critical'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_idle
            relational_operator: '<='
            threshold: 5
            window: 120
            periods: 0
            function: avg
          - metric: cpu_wait
            relational_operator: '>='
            threshold: 35
            window: 120
            periods: 0
            function: avg
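    # Reading a rule (a summary inferred from the keys used throughout this
    # file, not an authoritative spec): the datapoints received over `window`
    # seconds are aggregated with `function` (avg, min, max, last or diff),
    # the result is compared to `threshold` using `relational_operator`, and
    # `periods` extra windows must also match before the rule fires. When a
    # trigger has several rules, `logical_operator` combines their results.
    # A minimal sketch with a hypothetical metric name:
    #
    #   - name: 'example-load-warning'
    #     description: 'Example: the 2-minute average of example_load is above 10'
    #     severity: 'warning'
    #     enabled: 'true'
    #     trigger:
    #       rules:
    #         - metric: example_load
    #           relational_operator: '>'
    #           threshold: 10
    #           window: 120
    #           periods: 0
    #           function: avg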
    - name: 'cpu-warning-controller'
      description: 'The CPU usage is high (controller node)'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_idle
            relational_operator: '<='
            threshold: 15
            window: 120
            periods: 0
            function: avg
          - metric: cpu_wait
            relational_operator: '>='
            threshold: 25
            window: 120
            periods: 0
            function: avg
    - name: 'swap-usage-critical'
      description: 'There is no free swap space left'
      severity: 'critical'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: swap_free
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: max
    - name: 'swap-activity-warning'
      description: 'The swap activity is high'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: swap_io_in
            relational_operator: '>='
            threshold: 1048576 # 1 MB/s
            window: 120
            periods: 0
            function: avg
          - metric: swap_io_out
            relational_operator: '>='
            threshold: 1048576 # 1 MB/s
            window: 120
            periods: 0
            function: avg
    - name: 'swap-usage-warning'
      description: 'The swap free space is low'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: swap_percent_used
            relational_operator: '>='
            threshold: 0.8
            window: 60
            periods: 0
            function: avg
    - name: 'cpu-critical-compute'
      description: 'The CPU usage is too high (compute node)'
      severity: 'critical'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_wait
            relational_operator: '>='
            threshold: 30
            window: 120
            periods: 0
            function: avg
    - name: 'cpu-warning-compute'
      description: 'The CPU usage is high (compute node)'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_wait
            relational_operator: '>='
            threshold: 20
            window: 120
            periods: 0
            function: avg
    - name: 'cpu-critical-rabbitmq'
      description: 'The CPU usage is too high (RabbitMQ node)'
      severity: 'critical'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_idle
            relational_operator: '<='
            threshold: 5
            window: 120
            periods: 0
            function: avg
    - name: 'cpu-warning-rabbitmq'
      description: 'The CPU usage is high (RabbitMQ node)'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_idle
            relational_operator: '<='
            threshold: 15
            window: 120
            periods: 0
            function: avg
    - name: 'cpu-critical-mysql'
      description: 'The CPU usage is too high (MySQL node)'
      severity: 'critical'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_idle
            relational_operator: '<='
            threshold: 5
            window: 120
            periods: 0
            function: avg
    - name: 'cpu-warning-mysql'
      description: 'The CPU usage is high (MySQL node)'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_idle
            relational_operator: '<='
            threshold: 15
            window: 120
            periods: 0
            function: avg
    - name: 'cpu-critical-storage'
      description: 'The CPU usage is too high (storage node)'
      severity: 'critical'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_wait
            relational_operator: '>='
            threshold: 40
            window: 120
            periods: 0
            function: avg
          - metric: cpu_idle
            relational_operator: '<='
            threshold: 5
            window: 120
            periods: 0
            function: avg
    - name: 'cpu-warning-storage'
      description: 'The CPU usage is high (storage node)'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_wait
            relational_operator: '>='
            threshold: 30
            window: 120
            periods: 0
            function: avg
          - metric: cpu_idle
            relational_operator: '<='
            threshold: 15
            window: 120
            periods: 0
            function: avg
    - name: 'cpu-critical-default'
      description: 'The CPU usage is too high'
      severity: 'critical'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: cpu_wait
            relational_operator: '>='
            threshold: 35
            window: 120
            periods: 0
            function: avg
          - metric: cpu_idle
            relational_operator: '<='
            threshold: 5
            window: 120
            periods: 0
            function: avg
    - name: 'rabbitmq-disk-limit-critical'
      description: 'RabbitMQ has reached the free disk threshold. All producers are blocked'
      severity: 'critical'
      # If the local RabbitMQ instance is down, it will be caught by the
      # rabbitmq-check alarm
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: rabbitmq_remaining_disk
            relational_operator: '<='
            threshold: 0
            window: 20
            periods: 0
            function: min
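    # A note on no_data_policy (as used in this file): it tells the alarm
    # what to report when no datapoint arrives within the window. 'okay'
    # treats missing data as healthy, used above because the rabbitmq-check
    # alarm already catches a down instance; 'skip' emits nothing, used when
    # the metric is only collected on a single node. Alarms without the key
    # fall back to the plugin's default no-data behaviour.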
    - name: 'rabbitmq-disk-limit-warning'
      description: 'RabbitMQ is getting close to the free disk threshold'
      severity: 'warning'
      # If the local RabbitMQ instance is down, it will be caught by the
      # rabbitmq-check alarm
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: rabbitmq_remaining_disk
            relational_operator: '<='
            threshold: 104857600 # 100MB
            window: 20
            periods: 0
            function: min
    - name: 'rabbitmq-memory-limit-critical'
      description: 'RabbitMQ has reached the memory threshold. All producers are blocked'
      severity: 'critical'
      # If the local RabbitMQ instance is down, it will be caught by the
      # rabbitmq-check alarm
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: rabbitmq_remaining_memory
            relational_operator: '<='
            threshold: 0
            window: 20
            periods: 0
            function: min
    - name: 'rabbitmq-memory-limit-warning'
      description: 'RabbitMQ is getting close to the memory threshold'
      severity: 'warning'
      # If the local RabbitMQ instance is down, it will be caught by the
      # rabbitmq-check alarm
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: rabbitmq_remaining_memory
            relational_operator: '<='
            threshold: 104857600 # 100MB
            window: 20
            periods: 0
            function: min
    - name: 'rabbitmq-queue-warning'
      description: 'The number of outstanding messages is too high'
      severity: 'warning'
      # If the local RabbitMQ instance is down, it will be caught by the
      # rabbitmq-check alarm
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: rabbitmq_messages
            relational_operator: '>='
            threshold: 200
            window: 120
            periods: 0
            function: avg
    - name: 'rabbitmq-pacemaker-down'
      description: 'The RabbitMQ cluster is down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        logical_operator: 'and'
        rules:
          - metric: pacemaker_resource_percent
            fields:
              resource: rabbitmq
              status: up
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'rabbitmq-pacemaker-critical'
      description: 'The RabbitMQ cluster is critical because less than half of the nodes are up'
      severity: 'critical'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        logical_operator: 'and'
        rules:
          - metric: pacemaker_resource_percent
            fields:
              resource: rabbitmq
              status: up
            relational_operator: '<'
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'rabbitmq-pacemaker-warning'
      description: 'The RabbitMQ cluster is degraded because some RabbitMQ nodes are missing'
      severity: 'warning'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        logical_operator: 'and'
        rules:
          - metric: pacemaker_resource_percent
            fields:
              resource: rabbitmq
              status: up
            relational_operator: '<'
            threshold: 100
            window: 60
            periods: 0
            function: last
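    # The `fields` block restricts a rule to the datapoints whose field
    # values match: the pacemaker alarms above only look at the percentage of
    # 'rabbitmq' resources in the 'up' state. A hypothetical sketch of a rule
    # filtered on exact field values:
    #
    #   - metric: pacemaker_resource_percent
    #     fields:
    #       resource: some-resource   # exact match on the 'resource' field
    #       status: up                # exact match on the 'status' field
    #     relational_operator: '=='
    #     threshold: 0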
    - name: 'apache-warning'
      description: 'There are no idle Apache workers available'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: apache_idle_workers
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: min
    - name: 'apache-check'
      description: 'Apache cannot be checked'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: apache_check
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'log-fs-warning'
      description: "The log filesystem's free space is low"
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/var/log'
            relational_operator: '<'
            threshold: 10
            window: 60
            periods: 0
            function: min
    - name: 'log-fs-critical'
      description: "The log filesystem's free space is too low"
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/var/log'
            relational_operator: '<'
            threshold: 5
            window: 60
            periods: 0
            function: min
    - name: 'root-fs-warning'
      description: "The root filesystem's free space is low"
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/'
            relational_operator: '<'
            threshold: 10
            window: 60
            periods: 0
            function: min
    - name: 'root-fs-critical'
      description: "The root filesystem's free space is too low"
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/'
            relational_operator: '<'
            threshold: 5
            window: 60
            periods: 0
            function: min
    - name: 'mysql-fs-warning'
      description: "The MySQL filesystem's free space is low"
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/var/lib/mysql'
            relational_operator: '<'
            threshold: 10
            window: 60
            periods: 0
            function: min
    - name: 'mysql-fs-critical'
      description: "The MySQL filesystem's free space is too low"
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/var/lib/mysql'
            relational_operator: '<'
            threshold: 5
            window: 60
            periods: 0
            function: min
    - name: 'nova-fs-warning'
      description: "The filesystem's free space is low (compute node)"
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/var/lib/nova'
            relational_operator: '<'
            threshold: 10
            window: 60
            periods: 0
            function: min
    - name: 'nova-fs-critical'
      description: "The filesystem's free space is too low (compute node)"
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/var/lib/nova'
            relational_operator: '<'
            threshold: 5
            window: 60
            periods: 0
            function: min
    - name: 'other-fs-warning'
      description: "The filesystem's free space is low"
      severity: 'warning'
      enabled: 'true'
      no_data_policy: 'okay'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '!= /var/lib/nova && != /var/log && != /var/lib/mysql && != / && !~ ceph%-%d+$'
            group_by: [fs]
            relational_operator: '<'
            threshold: 10
            window: 60
            periods: 0
            function: min
    - name: 'other-fs-critical'
      description: "The filesystem's free space is too low"
      severity: 'critical'
      enabled: 'true'
      no_data_policy: 'okay'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '!= /var/lib/nova && != /var/log && != /var/lib/mysql && != / && !~ ceph%-%d+$'
            group_by: [fs]
            relational_operator: '<'
            threshold: 5
            window: 60
            periods: 0
            function: min
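    # Field values also accept match expressions, as in the two alarms above:
    # '==' / '!=' for (in)equality, '=~' / '!~' for Lua pattern matches
    # ('%d+' is one or more digits, '%-' a literal dash), and '&&' / '||' to
    # combine terms. `group_by: [fs]` makes the alarm evaluate and report
    # each distinct 'fs' value separately instead of folding all filesystems
    # into one aggregate. (A summary inferred from the expressions used in
    # this file, not an authoritative spec.)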
    - name: 'osd-disk-critical'
      description: "The filesystem's free space is too low (OSD disk)"
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              # Real FS is /var/lib/ceph/osd/ceph-0 but Collectd substituted '/' by '-'
              fs: '=~ ceph/%d+$'
            group_by: [fs]
            relational_operator: '<'
            threshold: 5
            window: 60
            periods: 0
            function: min
    - name: 'nova-api-http-errors'
      description: 'Too many 5xx HTTP errors have been detected on nova-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: haproxy_backend_response_5xx
            fields:
              backend: 'nova-api'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 1
            function: diff
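    # `function: diff` with `periods: 1` compares the aggregated value of the
    # current window with the previous one, so the rule above fires as soon
    # as the 5xx counter has increased between two consecutive 60-second
    # windows (a reading of the parameters, not an authoritative spec).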
    - name: 'nova-logs-error'
      description: 'Too many errors have been detected in Nova logs'
      severity: 'warning'
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: log_messages
            fields:
              service: 'nova'
              level: 'error'
            relational_operator: '>'
            threshold: 0.1
            window: 70
            periods: 0
            function: max
    - name: 'heat-api-http-errors'
      description: 'Too many 5xx HTTP errors have been detected on heat-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: haproxy_backend_response_5xx
            fields:
              backend: 'heat-api'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 1
            function: diff
    - name: 'heat-logs-error'
      description: 'Too many errors have been detected in Heat logs'
      severity: 'warning'
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: log_messages
            fields:
              service: 'heat'
              level: 'error'
            relational_operator: '>'
            threshold: 0.1
            window: 70
            periods: 0
            function: max
    - name: 'swift-api-http-errors'
      description: 'Too many 5xx HTTP errors have been detected on swift-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: haproxy_backend_response_5xx
            fields:
              backend: 'swift-api || object-storage'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 1
            function: diff
    - name: 'swift-logs-error'
      description: 'Too many errors have been detected in Swift logs'
      severity: 'warning'
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: log_messages
            fields:
              service: 'swift'
              level: 'error'
            relational_operator: '>'
            threshold: 0.1
            window: 70
            periods: 0
            function: max
    - name: 'cinder-api-http-errors'
      description: 'Too many 5xx HTTP errors have been detected on cinder-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: haproxy_backend_response_5xx
            fields:
              backend: 'cinder-api'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 1
            function: diff
    - name: 'cinder-logs-error'
      description: 'Too many errors have been detected in Cinder logs'
      severity: 'warning'
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: log_messages
            fields:
              service: 'cinder'
              level: 'error'
            relational_operator: '>'
            threshold: 0.1
            window: 70
            periods: 0
            function: max
    - name: 'glance-api-http-errors'
      description: 'Too many 5xx HTTP errors have been detected on glance-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: haproxy_backend_response_5xx
            fields:
              backend: 'glance-api'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 1
            function: diff
    - name: 'glance-logs-error'
      description: 'Too many errors have been detected in Glance logs'
      severity: 'warning'
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: log_messages
            fields:
              service: 'glance'
              level: 'error'
            relational_operator: '>'
            threshold: 0.1
            window: 70
            periods: 0
            function: max
    - name: 'neutron-api-http-errors'
      description: 'Too many 5xx HTTP errors have been detected on neutron-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: haproxy_backend_response_5xx
            fields:
              backend: 'neutron-api'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 1
            function: diff
    - name: 'neutron-logs-error'
      description: 'Too many errors have been detected in Neutron logs'
      severity: 'warning'
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: log_messages
            fields:
              service: 'neutron'
              level: 'error'
            relational_operator: '>'
            threshold: 0.1
            window: 70
            periods: 0
            function: max
    - name: 'keystone-response-time-duration'
      description: 'Keystone API is too slow'
      severity: 'warning'
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: openstack_keystone_http_response_times
            fields:
              http_method: '== GET || == POST'
              http_status: '!= 5xx'
            relational_operator: '>'
            threshold: 0.3
            window: 60
            periods: 0
            value: upper_90
            function: max
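    # The `value: upper_90` key above selects which statistic of the
    # multi-value response-times metric is compared, here what appears to be
    # its 90th percentile rather than the default value field (an assumption
    # based on the key name; the metric carries several aggregates).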
    - name: 'keystone-public-api-http-errors'
      description: 'Too many 5xx HTTP errors have been detected on keystone-public-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: haproxy_backend_response_5xx
            fields:
              backend: 'keystone-public-api'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 1
            function: diff
    - name: 'keystone-admin-api-http-errors'
      description: 'Too many 5xx HTTP errors have been detected on keystone-admin-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: haproxy_backend_response_5xx
            fields:
              backend: 'keystone-admin-api'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 1
            function: diff
    - name: 'horizon-web-http-errors'
      description: 'Too many 5xx HTTP errors have been detected on horizon'
      severity: 'warning'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: haproxy_backend_response_5xx
            fields:
              backend: 'horizon-web || horizon-https'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 1
            function: diff
    - name: 'keystone-logs-error'
      description: 'Too many errors have been detected in Keystone logs'
      severity: 'warning'
      no_data_policy: 'okay'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: log_messages
            fields:
              service: 'keystone'
              level: 'error'
            relational_operator: '>'
            threshold: 0.1
            window: 70
            periods: 0
            function: max
    - name: 'mysql-node-connected'
      description: 'The MySQL service has lost connectivity with the other nodes'
      severity: 'critical'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: mysql_cluster_connected
            relational_operator: '=='
            threshold: 0
            window: 30
            periods: 1
            function: min
    - name: 'mysql-node-ready'
      description: "The MySQL service isn't ready to serve queries"
      severity: 'critical'
      enabled: 'true'
      trigger:
        logical_operator: 'or'
        rules:
          - metric: mysql_cluster_ready
            relational_operator: '=='
            threshold: 0
            window: 30
            periods: 1
            function: min
    - name: 'ceph-health-critical'
      description: 'Ceph health is critical'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: ceph_health
            relational_operator: '=='
            threshold: 3 # HEALTH_ERR
            window: 60
            function: max
    - name: 'ceph-health-warning'
      description: 'Ceph health is warning'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: ceph_health
            relational_operator: '=='
            threshold: 2 # HEALTH_WARN
            window: 60
            function: max
    - name: 'ceph-capacity-critical'
      description: 'Ceph free capacity is too low'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: ceph_pool_total_percent_free
            relational_operator: '<'
            threshold: 2
            window: 60
            function: max
    - name: 'ceph-capacity-warning'
      description: 'Ceph free capacity is low'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: ceph_pool_total_percent_free
            relational_operator: '<'
            threshold: 5
            window: 60
            function: max
    - name: 'elasticsearch-health-critical'
      description: 'Elasticsearch cluster health is critical'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: elasticsearch_cluster_health
            relational_operator: '=='
            threshold: 3 # red
            window: 60
            function: min
    - name: 'elasticsearch-health-warning'
      description: 'Elasticsearch health is warning'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: elasticsearch_cluster_health
            relational_operator: '=='
            threshold: 2 # yellow
            window: 60
            function: min
    - name: 'elasticsearch-fs-warning'
      description: "The filesystem's free space is low (Elasticsearch node)"
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/opt/es/data' # Real FS is /opt/es-data but Collectd substituted '/' by '-'
            relational_operator: '<'
            threshold: 20 # The low watermark for disk usage is 85% by default
            window: 60
            periods: 0
            function: min
    - name: 'elasticsearch-fs-critical'
      description: "The filesystem's free space is too low (Elasticsearch node)"
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/opt/es/data' # Real FS is /opt/es-data but Collectd substituted '/' by '-'
            relational_operator: '<'
            threshold: 15 # The high watermark for disk usage is 90% by default
            window: 60
            periods: 0
            function: min
    - name: 'influxdb-fs-warning'
      description: "The filesystem's free space is low (InfluxDB node)"
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/var/lib/influxdb'
            relational_operator: '<'
            threshold: 10
            window: 60
            periods: 0
            function: min
    - name: 'influxdb-fs-critical'
      description: "The filesystem's free space is too low (InfluxDB node)"
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: fs_space_percent_free
            fields:
              fs: '/var/lib/influxdb'
            relational_operator: '<'
            threshold: 5
            window: 60
            periods: 0
            function: min
    - name: 'haproxy-check'
      description: "HAProxy cannot be checked"
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_check
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'rabbitmq-check'
      description: "RabbitMQ cannot be checked"
      # This alarm's severity is warning because the effective status of the
      # RabbitMQ cluster is computed by rabbitmq-pacemaker-* alarms.
      # This alarm is still useful because it will report the node(s) on which
      # RabbitMQ isn't running.
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: rabbitmq_check
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'ceph-mon-check'
      description: "Ceph monitor cannot be checked"
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: ceph_mon_check
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'ceph-osd-check'
      description: "Ceph OSD cannot be checked"
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: ceph_osd_check
            relational_operator: '=='
            threshold: 0
            window: 80 # The metric collection interval is 60s
            periods: 0
            function: last
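    # Note the 80-second window above: it is deliberately larger than the
    # 60-second collection interval so that `function: last` always has at
    # least one datapoint to evaluate.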
    - name: 'pacemaker-check'
      description: "Pacemaker cannot be checked"
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: pacemaker_check
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'elasticsearch-check'
      description: "Elasticsearch cannot be checked"
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: elasticsearch_check
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'influxdb-check'
      description: "InfluxDB cannot be checked"
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: influxdb_check
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'libvirt-check'
      description: "Libvirt cannot be checked"
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: libvirt_check
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'memcached-check'
      description: "memcached cannot be checked"
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: memcached_check
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'mysql-check'
      description: "MySQL cannot be checked"
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: mysql_check
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'network-warning-dropped-rx'
      description: "Some received packets have been dropped"
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: if_dropped_rx
            relational_operator: '>'
            threshold: 100
            window: 60
            periods: 0
            function: avg
    - name: 'network-critical-dropped-rx'
      description: "Too many received packets have been dropped"
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: if_dropped_rx
            relational_operator: '>'
            threshold: 1000
            window: 60
            periods: 0
            function: avg
    - name: 'network-warning-dropped-tx'
      description: "Some transmitted packets have been dropped"
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: if_dropped_tx
            relational_operator: '>'
            threshold: 100
            window: 60
            periods: 0
            function: avg
    - name: 'network-critical-dropped-tx'
      description: "Too many transmitted packets have been dropped"
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: if_dropped_tx
            relational_operator: '>'
            threshold: 1000
            window: 60
            function: avg
    - name: 'instance-creation-time-warning'
      description: "Instance creation takes too much time"
      severity: 'warning'
      no_data_policy: 'okay' # This is a sporadic metric
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_instance_creation_time
            relational_operator: '>'
            threshold: 20
            window: 600
            periods: 0
            function: avg
    - name: 'hdd-errors-critical'
      description: 'Errors on hard drive(s) have been detected'
      severity: 'critical'
      enabled: 'true'
      no_data_policy: 'okay'
      trigger:
        rules:
          - metric: hdd_errors_rate
            group_by: ['device']
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: max
    - name: 'total-nova-free-vcpu-warning'
      description: 'There are no VCPUs available for new instances'
      severity: 'warning'
      enabled: 'true'
      no_data_policy: 'skip' # the metric is only collected from the aggregator node
      trigger:
        rules:
          - metric: openstack_nova_total_free_vcpus
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: max
    - name: 'total-nova-free-memory-warning'
      description: 'There is no memory available for new instances'
      severity: 'warning'
      enabled: 'true'
      no_data_policy: 'skip' # the metric is only collected from the aggregator node
      trigger:
        rules:
          - metric: openstack_nova_total_free_ram
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: max
    - name: 'nova-aggregates-free-memory-warning'
      description: "The free memory percentage of a Nova aggregate is low"
      severity: 'warning'
      enabled: 'true'
      no_data_policy: 'skip' # the metric is only collected from the aggregator node
      trigger:
        rules:
          - metric: openstack_nova_aggregate_free_ram_percent
            group_by: [aggregate]
            relational_operator: '<'
            threshold: 10.0
            window: 60
            periods: 0
            function: min
    - name: 'nova-aggregates-free-memory-critical'
      description: "The free memory percentage of a Nova aggregate is too low"
      severity: 'critical'
      enabled: 'true'
      no_data_policy: 'skip' # the metric is only collected from the aggregator node
      trigger:
        rules:
          - metric: openstack_nova_aggregate_free_ram_percent
            group_by: [aggregate]
            relational_operator: '<'
            threshold: 1.0
            window: 60
            periods: 0
            function: min
    # Alarms on the local checks of the OpenStack service endpoints
    - name: 'cinder-api-local-endpoint'
      description: 'Cinder API is locally down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_local_api
            fields:
              service: 'cinder-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'glance-api-local-endpoint'
      description: 'Glance API is locally down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_local_api
            fields:
              service: 'glance-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'heat-api-local-endpoint'
      description: 'Heat API is locally down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_local_api
            fields:
              service: 'heat-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'heat-cfn-api-local-endpoint'
      description: 'Heat CFN API is locally down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_local_api
            fields:
              service: 'heat-cfn-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'keystone-public-api-local-endpoint'
      description: 'Keystone public API is locally down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_local_api
            fields:
              service: 'keystone-public-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-api-local-endpoint'
      description: 'Neutron API is locally down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_local_api
            fields:
              service: 'neutron-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-api-local-endpoint'
      description: 'Nova API is locally down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_local_api
            fields:
              service: 'nova-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'swift-api-local-endpoint'
      description: 'Swift API is locally down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_local_api
            fields:
              service: 'swift-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    # The following are the API check definitions for the OpenStack services
    # and for InfluxDB
    - name: 'influxdb-api-check-failed'
      description: 'The endpoint check for InfluxDB has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: http_check
            fields:
              service: 'influxdb-cluster'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-api-check-failed'
      description: 'The endpoint check for nova-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'nova-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-api-check-failed'
      description: 'The endpoint check for neutron-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'neutron-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'cinder-api-check-failed'
      description: 'The endpoint check for cinder-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'cinder-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'cinder-v2-api-check-failed'
      description: 'The endpoint check for cinder-v2-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'cinder-v2-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'glance-api-check-failed'
      description: 'The endpoint check for glance-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'glance-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'heat-api-check-failed'
      description: 'The endpoint check for heat-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'heat-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'heat-cfn-api-check-failed'
      description: 'The endpoint check for heat-cfn-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'heat-cfn-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'swift-api-check-failed'
      description: 'The endpoint check for swift-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'swift-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'swift-s3-api-check-failed'
      description: 'The endpoint check for swift-s3-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'swift-s3-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'keystone-public-api-check-failed'
      description: 'The endpoint check for keystone-public-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'keystone-public-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'ceilometer-api-check-failed'
      description: 'The endpoint check for ceilometer-api has failed'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_check_api
            fields:
              service: 'ceilometer-api'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    # The following are the AFD alarms generated to check the API backends
    # All backends are down
    - name: 'elasticsearch-api-backends-all-down'
      description: 'All Elasticsearch backends are down'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'elasticsearch-rest'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
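    # The backend alarms in this section form a three-level ladder per
    # HAProxy backend: all servers down => 'down', at least one server down
    # => 'warning', and 50% or less up => 'critical' (the latter two series
    # follow further below). The sketch for a hypothetical backend
    # 'example-api':
    #
    #   - metric: haproxy_backend_servers
    #     fields:
    #       backend: 'example-api'
    #       state: 'up'
    #     relational_operator: '=='
    #     threshold: 0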
    - name: 'kibana-api-backends-all-down'
      description: 'All API backends are down for Kibana'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'kibana'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'influxdb-api-backends-all-down'
      description: 'All API backends are down for InfluxDB'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'influxdb'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'grafana-api-backends-all-down'
      description: 'All API backends are down for Grafana'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'grafana'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'glance-registry-api-backends-all-down'
      description: 'All API backends are down for glance-registry-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'glance-registry-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-api-backends-all-down'
      description: 'All API backends are down for nova-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'nova-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'cinder-api-backends-all-down'
      description: 'All API backends are down for cinder-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'cinder-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'object-storage-api-backends-all-down'
      description: 'All API backends are down for object-storage'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'object-storage'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'heat-cfn-api-backends-all-down'
      description: 'All API backends are down for heat-cfn-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'heat-cfn-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'horizon-web-api-backends-all-down'
      description: 'All API backends are down for horizon-web'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'horizon-web || horizon-https'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-novncproxy-websocket-api-backends-all-down'
      description: 'All API backends are down for nova-novncproxy-websocket'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'nova-novncproxy-websocket'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'heat-api-backends-all-down'
      description: 'All API backends are down for heat-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'heat-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'keystone-public-api-backends-all-down'
      description: 'All API backends are down for keystone-public-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'keystone-public-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'heat-cloudwatch-api-backends-all-down'
      description: 'All API backends are down for heat-cloudwatch-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'heat-cloudwatch-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-metadata-api-backends-all-down'
      description: 'All API backends are down for nova-metadata-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'nova-metadata-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'mysqld-tcp-api-backends-all-down'
      description: 'All API backends are down for mysqld-tcp'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'mysqld-tcp'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'keystone-admin-api-backends-all-down'
      description: 'All API backends are down for keystone-admin-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'keystone-admin-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'glance-api-backends-all-down'
      description: 'All API backends are down for glance-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'glance-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-api-backends-all-down'
      description: 'All API backends are down for neutron-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'neutron-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'swift-api-backends-all-down'
      description: 'All API backends are down for swift-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'swift-api || object-storage'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'ceilometer-api-backends-all-down'
      description: 'All API backends are down for ceilometer-api'
      severity: 'down'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'ceilometer-api'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    # At least one backend is down
    - name: 'elasticsearch-api-backends-one-down'
      description: 'At least one API backend is down for elasticsearch'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'elasticsearch-rest'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'kibana-api-backends-one-down'
      description: 'At least one API backend is down for kibana'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'kibana'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'influxdb-api-backends-one-down'
      description: 'At least one API backend is down for influxdb'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'influxdb'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'grafana-api-backends-one-down'
      description: 'At least one API backend is down for grafana'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'grafana'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'glance-registry-api-backends-one-down'
      description: 'At least one API backend is down for glance-registry-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'glance-registry-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-api-backends-one-down'
      description: 'At least one API backend is down for nova-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'nova-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'cinder-api-backends-one-down'
      description: 'At least one API backend is down for cinder-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'cinder-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'object-storage-api-backends-one-down'
      description: 'At least one API backend is down for object-storage'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'object-storage'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'heat-cfn-api-backends-one-down'
      description: 'At least one API backend is down for heat-cfn-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'heat-cfn-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'horizon-web-api-backends-one-down'
      description: 'At least one API backend is down for horizon-web'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'horizon-web || horizon-https'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-novncproxy-websocket-api-backends-one-down'
      description: 'At least one API backend is down for nova-novncproxy-websocket'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'nova-novncproxy-websocket'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'heat-api-backends-one-down'
      description: 'At least one API backend is down for heat-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'heat-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'keystone-public-api-backends-one-down'
      description: 'At least one API backend is down for keystone-public-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'keystone-public-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'heat-cloudwatch-api-backends-one-down'
      description: 'At least one API backend is down for heat-cloudwatch-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'heat-cloudwatch-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-metadata-api-backends-one-down'
      description: 'At least one API backend is down for nova-metadata-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'nova-metadata-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'mysqld-tcp-api-backends-one-down'
      description: 'At least one API backend is down for mysqld-tcp'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'mysqld-tcp'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'keystone-admin-api-backends-one-down'
      description: 'At least one API backend is down for keystone-admin-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'keystone-admin-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'glance-api-backends-one-down'
      description: 'At least one API backend is down for glance-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'glance-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-api-backends-one-down'
      description: 'At least one API backend is down for neutron-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'neutron-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'swift-api-backends-one-down'
      description: 'At least one API backend is down for swift-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'swift-api || object-storage'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'ceilometer-api-backends-one-down'
      description: 'At least one API backend is down for ceilometer-api'
      severity: 'warning'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers
            fields:
              backend: 'ceilometer-api'
              state: 'down'
            relational_operator: '>'
            threshold: 0
            window: 60
            periods: 0
            function: last
    # Less than 50% of backends are up
    - name: 'elasticsearch-api-backends-majority-down'
      description: 'Less than 50% of backends are up for elasticsearch'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'elasticsearch-rest'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'kibana-api-backends-majority-down'
      description: 'Less than 50% of backends are up for kibana'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'kibana'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'influxdb-api-backends-majority-down'
      description: 'Less than 50% of backends are up for influxdb'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'influxdb'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'grafana-api-backends-majority-down'
      description: 'Less than 50% of backends are up for grafana'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'grafana'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'glance-registry-api-backends-majority-down'
      description: 'Less than 50% of backends are up for glance-registry-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'glance-registry-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'nova-api-backends-majority-down'
      description: 'Less than 50% of backends are up for nova-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'nova-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'cinder-api-backends-majority-down'
      description: 'Less than 50% of backends are up for cinder-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'cinder-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'object-storage-api-backends-majority-down'
      description: 'Less than 50% of backends are up for object-storage'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'object-storage'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'heat-cfn-api-backends-majority-down'
      description: 'Less than 50% of backends are up for heat-cfn-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'heat-cfn-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'horizon-web-api-backends-majority-down'
      description: 'Less than 50% of backends are up for horizon-web'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'horizon-web || horizon-https'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'nova-novncproxy-websocket-api-backends-majority-down'
      description: 'Less than 50% of backends are up for nova-novncproxy-websocket'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'nova-novncproxy-websocket'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'heat-api-backends-majority-down'
      description: 'Less than 50% of backends are up for heat-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'heat-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'keystone-public-api-backends-majority-down'
      description: 'Less than 50% of backends are up for keystone-public-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'keystone-public-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'heat-cloudwatch-api-backends-majority-down'
      description: 'Less than 50% of backends are up for heat-cloudwatch-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'heat-cloudwatch-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'nova-metadata-api-backends-majority-down'
      description: 'Less than 50% of backends are up for nova-metadata-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'nova-metadata-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'mysqld-tcp-api-backends-majority-down'
      description: 'Less than 50% of backends are up for mysqld-tcp'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'mysqld-tcp'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'keystone-admin-api-backends-majority-down'
      description: 'Less than 50% of backends are up for keystone-admin-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'keystone-admin-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'glance-api-backends-majority-down'
      description: 'Less than 50% of backends are up for glance-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'glance-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'neutron-api-backends-majority-down'
      description: 'Less than 50% of backends are up for neutron-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'neutron-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'swift-api-backends-majority-down'
      description: 'Less than 50% of backends are up for swift-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'swift-api || object-storage'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    - name: 'ceilometer-api-backends-majority-down'
      description: 'Less than 50% of backends are up for ceilometer-api'
      severity: 'critical'
      enabled: 'true'
      trigger:
        rules:
          - metric: haproxy_backend_servers_percent
            fields:
              backend: 'ceilometer-api'
              state: 'up'
            relational_operator: '<='
            threshold: 50
            window: 60
            periods: 0
            function: last
    # The following are the AFD alarms generated to check the workers
    # All workers are down
    - name: 'nova-scheduler-all-down'
      description: 'All Nova schedulers are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'scheduler'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
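    # The worker alarms in this section apply the same ladder as the HAProxy
    # backend alarms to the service states reported by
    # openstack_nova_services, openstack_cinder_services and
    # openstack_neutron_agents: all workers down => 'down', at least one
    # down => 'warning'. All of them are collected from the DC node only,
    # hence no_data_policy 'skip'.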
    - name: 'nova-cert-all-down'
      description: 'All Nova certs are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'cert'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-consoleauth-all-down'
      description: 'All Nova consoleauths are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'consoleauth'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-compute-all-down'
      description: 'All Nova computes are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'compute'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'nova-conductor-all-down'
      description: 'All Nova conductors are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_nova_services
            fields:
              service: 'conductor'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'cinder-scheduler-all-down'
      description: 'All Cinder schedulers are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_cinder_services
            fields:
              service: 'scheduler'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'cinder-volume-all-down'
      description: 'All Cinder volumes are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_cinder_services
            fields:
              service: 'volume'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-l3-all-down'
      description: 'All Neutron L3 agents are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'l3'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-dhcp-all-down'
      description: 'All Neutron DHCP agents are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'dhcp'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-metadata-all-down'
      description: 'All Neutron metadata agents are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'metadata'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
    - name: 'neutron-openvswitch-all-down'
      description: 'All Neutron openvswitch agents are down'
      severity: 'down'
      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        rules:
          - metric: openstack_neutron_agents
            fields:
              service: 'openvswitch'
              state: 'up'
            relational_operator: '=='
            threshold: 0
            window: 60
            periods: 0
            function: last
  # At least one service or agent is down
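  # These alarms reuse the same service metrics as above but match the 'down'
  # state with '>' 0, so a single failed worker is enough to fire a warning.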
- name: 'nova-scheduler-one-down'
description: 'At least one Nova scheduler is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_nova_services
fields:
service: 'scheduler'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- name: 'nova-cert-one-down'
description: 'At least one Nova cert is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_nova_services
fields:
service: 'cert'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- name: 'nova-consoleauth-one-down'
description: 'At least one Nova consoleauth is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_nova_services
fields:
service: 'consoleauth'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- name: 'nova-compute-one-down'
description: 'At least one Nova compute is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_nova_services
fields:
service: 'compute'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- name: 'nova-conductor-one-down'
description: 'At least one Nova conductor is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_nova_services
fields:
service: 'conductor'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- name: 'cinder-scheduler-one-down'
description: 'At least one Cinder scheduler is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_cinder_services
fields:
service: 'scheduler'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- name: 'cinder-volume-one-down'
    description: 'At least one Cinder volume service is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_cinder_services
fields:
service: 'volume'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- name: 'neutron-l3-one-down'
    description: 'At least one Neutron L3 agent is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_neutron_agents
fields:
service: 'l3'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- name: 'neutron-dhcp-one-down'
    description: 'At least one Neutron DHCP agent is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_neutron_agents
fields:
service: 'dhcp'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- name: 'neutron-metadata-one-down'
    description: 'At least one Neutron metadata agent is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_neutron_agents
fields:
service: 'metadata'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- name: 'neutron-openvswitch-one-down'
    description: 'At least one Neutron openvswitch agent is down'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
rules:
- metric: openstack_neutron_agents
fields:
service: 'openvswitch'
state: 'down'
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
  # Less than 50% of the services are up (as a share of services in the up and down states).
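  # Worked example, assuming the *_percent metrics report the share of
  # workers in the given state: with 1 of 3 Nova schedulers up, the value of
  # openstack_nova_services_percent (service='scheduler', state='up') is
  # ~33.3, which satisfies '<= 50' and triggers the critical alarm.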
- name: 'nova-scheduler-majority-down'
description: 'Less than 50% of Nova schedulers are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_nova_services_percent
fields:
service: 'scheduler'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
- name: 'nova-cert-majority-down'
description: 'Less than 50% of Nova certs are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_nova_services_percent
fields:
service: 'cert'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
- name: 'nova-consoleauth-majority-down'
description: 'Less than 50% of Nova consoleauths are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_nova_services_percent
fields:
service: 'consoleauth'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
- name: 'nova-compute-majority-down'
description: 'Less than 50% of Nova computes are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_nova_services_percent
fields:
service: 'compute'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
- name: 'nova-conductor-majority-down'
description: 'Less than 50% of Nova conductors are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_nova_services_percent
fields:
service: 'conductor'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
- name: 'cinder-scheduler-majority-down'
description: 'Less than 50% of Cinder schedulers are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_cinder_services_percent
fields:
service: 'scheduler'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
- name: 'cinder-volume-majority-down'
    description: 'Less than 50% of Cinder volume services are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_cinder_services_percent
fields:
service: 'volume'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
- name: 'neutron-l3-majority-down'
description: 'Less than 50% of Neutron L3 agents are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_neutron_agents_percent
fields:
service: 'l3'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
- name: 'neutron-dhcp-majority-down'
description: 'Less than 50% of Neutron DHCP agents are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_neutron_agents_percent
fields:
service: 'dhcp'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
- name: 'neutron-metadata-majority-down'
description: 'Less than 50% of Neutron metadata agents are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_neutron_agents_percent
fields:
service: 'metadata'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
- name: 'neutron-openvswitch-majority-down'
description: 'Less than 50% of Neutron openvswitch agents are up'
severity: 'critical'
enabled: 'true'
trigger:
rules:
- metric: openstack_neutron_agents_percent
fields:
service: 'openvswitch'
state: 'up'
relational_operator: '<='
threshold: 50
window: 60
periods: 0
function: last
# Definition of the AFD node filters
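  # Each cluster below is keyed by a name and declares:
  #   apply_to_node - the node role the alarms are evaluated on
  #   alerting      - 'enabled' or 'enabled_with_notification'
  #   members       - a mapping from a source name to the alarms it runs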
node_cluster_alarms:
controller:
apply_to_node: controller
alerting: enabled
members:
cpu:
alarms: ['cpu-critical-controller', 'cpu-warning-controller']
network-rx:
alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx:
alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs:
alarms: ['root-fs-critical', 'root-fs-warning']
log-fs:
alarms: ['log-fs-critical', 'log-fs-warning']
other-fs:
alarms: ['other-fs-critical', 'other-fs-warning']
swap:
alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors:
alerting: enabled_with_notification
alarms: ['hdd-errors-critical']
<% if @detach_rabbitmq_enabled -%>
rabbitmq-nodes:
apply_to_node: rabbitmq-nodes
alerting: enabled
members:
cpu:
alarms: ['cpu-critical-rabbitmq', 'cpu-warning-rabbitmq']
network-rx:
alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx:
alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs:
alarms: ['root-fs-critical', 'root-fs-warning']
other-fs:
alarms: ['other-fs-critical', 'other-fs-warning']
swap:
alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors:
alerting: enabled_with_notification
alarms: ['hdd-errors-critical']
<% end -%>
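    # Blocks guarded by ERB conditionals (@detach_rabbitmq_enabled,
    # @detach_database_enabled, @storage_options) are rendered only when the
    # corresponding Fuel environment option is set.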
mysql-nodes:
apply_to_node: mysql-nodes
alerting: enabled
members:
<% if @detach_database_enabled -%>
cpu:
alarms: ['cpu-critical-mysql', 'cpu-warning-mysql']
network-rx:
alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx:
alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs:
alarms: ['root-fs-critical', 'root-fs-warning']
other-fs:
alarms: ['other-fs-critical', 'other-fs-warning']
swap:
alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors:
alerting: enabled_with_notification
alarms: ['hdd-errors-critical']
<% end -%>
mysql-fs:
alarms: ['mysql-fs-critical', 'mysql-fs-warning']
compute:
apply_to_node: compute
alerting: enabled
members:
cpu:
alarms: ['cpu-critical-compute', 'cpu-warning-compute']
network-rx:
alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx:
alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs:
alarms: ['root-fs-critical', 'root-fs-warning']
nova-fs:
alarms: ['nova-fs-critical', 'nova-fs-warning']
other-fs:
alarms: ['other-fs-critical', 'other-fs-warning']
swap:
alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors:
alerting: enabled_with_notification
alarms: ['hdd-errors-critical']
storage:
apply_to_node: storage
alerting: enabled
members:
cpu:
alarms: ['cpu-critical-storage', 'cpu-warning-storage']
network-rx:
alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx:
alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs:
alarms: ['root-fs-critical', 'root-fs-warning']
other-fs:
alarms: ['other-fs-critical', 'other-fs-warning']
swap:
alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors:
alerting: enabled_with_notification
alarms: ['hdd-errors-critical']
<% if @storage_options["volumes_ceph"] then -%>
osd-disk:
alarms: ['osd-disk-critical']
<% end -%>
elasticsearch-nodes:
apply_to_node: elasticsearch-nodes
alerting: enabled
members:
cpu:
alarms: ['cpu-critical-default']
network-rx:
alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx:
alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs:
alarms: ['root-fs-critical', 'root-fs-warning']
data-fs:
alarms: ['elasticsearch-fs-critical', 'elasticsearch-fs-warning']
swap:
alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors:
alerting: enabled_with_notification
alarms: ['hdd-errors-critical']
influxdb-nodes:
apply_to_node: influxdb-nodes
alerting: enabled
members:
cpu:
alarms: ['cpu-critical-default']
network-rx:
alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx:
alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs:
alarms: ['root-fs-critical', 'root-fs-warning']
data-fs:
alarms: ['influxdb-fs-critical', 'influxdb-fs-warning']
swap:
alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors:
alerting: enabled_with_notification
alarms: ['hdd-errors-critical']
    # Default members configured for all nodes with unknown roles
default:
apply_to_node: default
      # The operator wants to receive alert notifications for individual nodes
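      # 'enabled_with_notification' appears to also emit a notification per
      # state change, whereas plain 'enabled' only feeds the overall cluster
      # status.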
alerting: enabled_with_notification
members:
cpu:
alarms: ['cpu-critical-default']
network-rx:
alarms: ['network-critical-dropped-rx', 'network-warning-dropped-rx']
network-tx:
alarms: ['network-critical-dropped-tx', 'network-warning-dropped-tx']
root-fs:
alarms: ['root-fs-critical', 'root-fs-warning']
other-fs:
alarms: ['other-fs-critical', 'other-fs-warning']
swap:
alarms: ['swap-usage-critical', 'swap-activity-warning', 'swap-usage-warning']
hdd-errors:
alarms: ['hdd-errors-critical']
# Definition of the AFD service filters
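  # Same layout as the node filters above; entries without apply_to_node are
  # presumably evaluated at the default location (the DC node mentioned in
  # the alarm comments), since their metrics are cluster-wide.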
service_cluster_alarms:
rabbitmq-cluster:
apply_to_node: rabbitmq-nodes
alerting: enabled
members:
pacemaker:
alarms: ['rabbitmq-pacemaker-down', 'rabbitmq-pacemaker-critical', 'rabbitmq-pacemaker-warning']
queue:
alarms: ['rabbitmq-queue-warning']
memory:
alarms: ['rabbitmq-memory-limit-critical', 'rabbitmq-memory-limit-warning']
disk:
alarms: ['rabbitmq-disk-limit-critical', 'rabbitmq-disk-limit-warning']
rabbitmq-service:
apply_to_node: rabbitmq-nodes
alerting: enabled
members:
check:
alarms: ['rabbitmq-check']
mysql:
apply_to_node: mysql-nodes
alerting: enabled
members:
node-status:
alarms: ['mysql-node-connected', 'mysql-node-ready']
check:
alarms: ['mysql-check']
apache:
apply_to_node: controller
alerting: enabled
members:
worker:
alarms: ['apache-warning']
check:
alarms: ['apache-check']
nova-api:
apply_to_node: controller
alerting: enabled
members:
http_errors:
alarms: ['nova-api-http-errors']
backends:
alarms:
- 'nova-api-backends-all-down'
- 'nova-api-backends-majority-down'
- 'nova-api-backends-one-down'
nova-api-check:
alerting: enabled
members:
vip:
alarms: ['nova-api-check-failed']
nova-metadata-api:
apply_to_node: controller
alerting: enabled
members:
backends:
alarms:
- 'nova-metadata-api-backends-all-down'
- 'nova-metadata-api-backends-majority-down'
- 'nova-metadata-api-backends-one-down'
nova-novncproxy-websocket:
apply_to_node: controller
alerting: enabled
members:
backends:
alarms:
- 'nova-novncproxy-websocket-api-backends-all-down'
- 'nova-novncproxy-websocket-api-backends-majority-down'
- 'nova-novncproxy-websocket-api-backends-one-down'
nova-api-endpoint:
apply_to_node: controller
alerting: enabled
members:
endpoint:
alarms: ['nova-api-local-endpoint']
nova-logs:
apply_to_node: controller
alerting: enabled
members:
error:
alarms: ['nova-logs-error']
nova-logs-compute:
apply_to_node: compute
alerting: enabled
members:
error:
alarms: ['nova-logs-error']
nova-cert:
alerting: enabled
members:
workers:
alarms:
- 'nova-cert-all-down'
- 'nova-cert-majority-down'
- 'nova-cert-one-down'
nova-consoleauth:
alerting: enabled
members:
workers:
alarms:
- 'nova-consoleauth-all-down'
- 'nova-consoleauth-majority-down'
- 'nova-consoleauth-one-down'
nova-compute:
alerting: enabled
members:
workers:
alarms:
- 'nova-compute-all-down'
- 'nova-compute-majority-down'
- 'nova-compute-one-down'
nova-conductor:
alerting: enabled
members:
workers:
alarms:
- 'nova-conductor-all-down'
- 'nova-conductor-majority-down'
- 'nova-conductor-one-down'
nova-scheduler:
alerting: enabled
members:
workers:
alarms:
- 'nova-scheduler-all-down'
- 'nova-scheduler-majority-down'
- 'nova-scheduler-one-down'
heat-api:
apply_to_node: controller
alerting: enabled
members:
http_errors:
alarms: ['heat-api-http-errors']
backends:
alarms:
- 'heat-api-backends-all-down'
- 'heat-api-backends-majority-down'
- 'heat-api-backends-one-down'
heat-cfn-api:
apply_to_node: controller
alerting: enabled
members:
backends:
alarms:
- 'heat-cfn-api-backends-all-down'
- 'heat-cfn-api-backends-majority-down'
- 'heat-cfn-api-backends-one-down'
heat-cloudwatch-api:
apply_to_node: controller
alerting: enabled
members:
backends:
alarms:
- 'heat-cloudwatch-api-backends-all-down'
- 'heat-cloudwatch-api-backends-majority-down'
- 'heat-cloudwatch-api-backends-one-down'
heat-api-check:
alerting: enabled
members:
vip:
alarms: ['heat-api-check-failed']
heat-cfn-api-check:
alerting: enabled
members:
vip:
alarms: ['heat-cfn-api-check-failed']
heat-api-endpoint:
apply_to_node: controller
alerting: enabled
members:
endpoint:
alarms: ['heat-api-local-endpoint']
heat-cfn-api-endpoint:
apply_to_node: controller
alerting: enabled
members:
endpoint:
alarms: ['heat-cfn-api-local-endpoint']
heat-logs:
apply_to_node: controller
alerting: enabled
members:
error:
alarms: ['heat-logs-error']
<% if not @storage_options["objects_ceph"] then -%>
swift-api:
apply_to_node: controller
alerting: enabled
members:
http_errors:
alarms: ['swift-api-http-errors']
backends:
alarms:
- 'swift-api-backends-all-down'
- 'swift-api-backends-majority-down'
- 'swift-api-backends-one-down'
swift-api-check:
alerting: enabled
members:
vip:
alarms: ['swift-api-check-failed']
swift-api-endpoint:
apply_to_node: controller
alerting: enabled
members:
endpoint:
alarms: ['swift-api-local-endpoint']
swift-s3-api-check:
alerting: enabled
members:
vip:
alarms: ['swift-s3-api-check-failed']
swift-logs:
apply_to_node: controller
alerting: enabled
members:
error:
alarms: ['swift-logs-error']
<% end -%>
cinder-api:
apply_to_node: controller
alerting: enabled
members:
http_errors:
alarms: ['cinder-api-http-errors']
backends:
alarms:
- 'cinder-api-backends-all-down'
- 'cinder-api-backends-majority-down'
- 'cinder-api-backends-one-down'
cinder-api-check:
alerting: enabled
members:
vip:
alarms: ['cinder-api-check-failed']
cinder-v2-api-check:
alerting: enabled
members:
vip:
alarms: ['cinder-v2-api-check-failed']
cinder-api-endpoint:
apply_to_node: controller
alerting: enabled
members:
endpoint:
alarms: ['cinder-api-local-endpoint']
cinder-logs:
apply_to_node: controller
alerting: enabled
members:
error:
alarms: ['cinder-logs-error']
cinder-scheduler:
alerting: enabled
members:
workers:
alarms:
- 'cinder-scheduler-all-down'
- 'cinder-scheduler-majority-down'
- 'cinder-scheduler-one-down'
cinder-volume:
alerting: enabled
members:
workers:
alarms:
- 'cinder-volume-all-down'
- 'cinder-volume-majority-down'
- 'cinder-volume-one-down'
<% if not @storage_options["volumes_ceph"] then -%>
cinder-volume-logs:
apply_to_node: storage
alerting: enabled
members:
error:
alarms: ['cinder-logs-error']
<% end -%>
glance-api:
apply_to_node: controller
alerting: enabled
members:
http_errors:
alarms: ['glance-api-http-errors']
backends:
alarms:
- 'glance-api-backends-all-down'
- 'glance-api-backends-majority-down'
- 'glance-api-backends-one-down'
glance-registry-api:
apply_to_node: controller
alerting: enabled
members:
backends:
alarms:
- 'glance-registry-api-backends-all-down'
- 'glance-registry-api-backends-majority-down'
- 'glance-registry-api-backends-one-down'
glance-api-check:
alerting: enabled
members:
vip:
alarms: ['glance-api-check-failed']
glance-api-endpoint:
apply_to_node: controller
alerting: enabled
members:
endpoint:
alarms: ['glance-api-local-endpoint']
glance-logs:
apply_to_node: controller
alerting: enabled
members:
error:
alarms: ['glance-logs-error']
neutron-api:
apply_to_node: controller
alerting: enabled
members:
http_errors:
alarms: ['neutron-api-http-errors']
backends:
alarms:
- 'neutron-api-backends-all-down'
- 'neutron-api-backends-majority-down'
- 'neutron-api-backends-one-down'
neutron-api-check:
alerting: enabled
members:
vip:
alarms: ['neutron-api-check-failed']
neutron-api-endpoint:
apply_to_node: controller
alerting: enabled
members:
endpoint:
alarms: ['neutron-api-local-endpoint']
neutron-logs:
apply_to_node: controller
alerting: enabled
members:
error:
alarms: ['neutron-logs-error']
neutron-l3:
alerting: enabled
members:
workers:
alarms:
- 'neutron-l3-all-down'
- 'neutron-l3-majority-down'
- 'neutron-l3-one-down'
neutron-dhcp:
alerting: enabled
members:
workers:
alarms:
- 'neutron-dhcp-all-down'
- 'neutron-dhcp-majority-down'
- 'neutron-dhcp-one-down'
neutron-metadata:
alerting: enabled
members:
workers:
alarms:
- 'neutron-metadata-all-down'
- 'neutron-metadata-majority-down'
- 'neutron-metadata-one-down'
neutron-openvswitch:
alerting: enabled
members:
workers:
alarms:
- 'neutron-openvswitch-all-down'
- 'neutron-openvswitch-majority-down'
- 'neutron-openvswitch-one-down'
neutron-logs-compute:
apply_to_node: compute
alerting: enabled
members:
error:
alarms: ['neutron-logs-error']
keystone-response-time:
apply_to_node: controller
alerting: enabled
members:
duration:
alarms: ['keystone-response-time-duration']
keystone-public-api:
apply_to_node: controller
alerting: enabled
members:
http_errors:
alarms: ['keystone-public-api-http-errors']
backends:
alarms:
- 'keystone-public-api-backends-all-down'
- 'keystone-public-api-backends-majority-down'
- 'keystone-public-api-backends-one-down'
keystone-public-api-check:
alerting: enabled
members:
vip:
alarms: ['keystone-public-api-check-failed']
keystone-public-api-endpoint:
apply_to_node: controller
alerting: enabled
members:
endpoint:
alarms: ['keystone-public-api-local-endpoint']
keystone-logs:
apply_to_node: controller
alerting: enabled
members:
error:
alarms: ['keystone-logs-error']
keystone-admin-api:
apply_to_node: controller
alerting: enabled
members:
http_errors:
alarms: ['keystone-admin-api-http-errors']
backends:
alarms:
- 'keystone-admin-api-backends-all-down'
- 'keystone-admin-api-backends-majority-down'
- 'keystone-admin-api-backends-one-down'
<% if @tls_enabled then -%>
horizon-https:
<% else -%>
horizon-web:
<% end -%>
apply_to_node: controller
alerting: enabled
members:
http_errors:
alarms: ['horizon-web-http-errors']
backends:
alarms:
- 'horizon-web-api-backends-all-down'
- 'horizon-web-api-backends-majority-down'
- 'horizon-web-api-backends-one-down'
nova-instances:
      # TODO(scroiset): apply to compute nodes
apply_to_node: controller
alerting: enabled
members:
creation-time:
alarms: ['instance-creation-time-warning']
nova-free-vcpu:
alerting: enabled
members:
nova-free-vcpu:
alarms: ['total-nova-free-vcpu-warning']
nova-free-memory:
alerting: enabled
members:
nova-free-memory:
alarms: ['total-nova-free-memory-warning']
nova-aggregates-free-memory:
alarms: ['nova-aggregates-free-memory-critical', 'nova-aggregates-free-memory-warning']
ceph-mon-cluster:
apply_to_node: ceph-mon
alerting: enabled
members:
health:
alarms: ['ceph-health-critical', 'ceph-health-warning']
capacity:
alarms: ['ceph-capacity-critical', 'ceph-capacity-warning']
ceph-mon-service:
apply_to_node: ceph-mon
alerting: enabled
members:
check:
alarms: ['ceph-mon-check']
<% if @storage_options["volumes_ceph"] then -%>
ceph-osd-service:
apply_to_node: storage
alerting: enabled
members:
check:
alarms: ['ceph-osd-check']
<% end -%>
elasticsearch-cluster:
apply_to_node: elasticsearch-nodes
alerting: enabled
members:
health:
alarms: ['elasticsearch-health-critical', 'elasticsearch-health-warning']
elasticsearch-service:
apply_to_node: elasticsearch-nodes
alerting: enabled
members:
check:
alarms: ['elasticsearch-check']
influxdb-service:
apply_to_node: influxdb-nodes
alerting: enabled
members:
check:
alarms: ['influxdb-check']
influxdb-api-check:
alerting: enabled
members:
vip:
alarms: ['influxdb-api-check-failed']
haproxy-openstack:
apply_to_node: controller
alerting: enabled
members:
check:
alarms: ['haproxy-check']
pacemaker-service:
apply_to_node: controller
alerting: enabled
members:
check:
alarms: ['pacemaker-check']
libvirt-service:
apply_to_node: compute
alerting: enabled
members:
check:
alarms: ['libvirt-check']
memcached-service:
apply_to_node: controller
alerting: enabled
members:
check:
alarms: ['memcached-check']
ceilometer-api-check:
alerting: enabled
members:
vip:
alarms: ['ceilometer-api-check-failed']
mysqld-tcp:
apply_to_node: controller
alerting: enabled
members:
backends:
alarms:
- 'mysqld-tcp-api-backends-all-down'
- 'mysqld-tcp-api-backends-majority-down'
- 'mysqld-tcp-api-backends-one-down'
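    # A minimal, commented-out sketch of declaring an additional service
    # cluster; the cluster and alarm names are hypothetical and must match
    # alarms defined in the 'alarms' section above:
    #
    # my-service-check:
    #   apply_to_node: controller
    #   alerting: enabled
    #   members:
    #     check:
    #       alarms: ['my-service-check-failed']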