openstack-helm-infra/prometheus/values_overrides/openstack.yaml
Steven Fitzpatrick cdd0f33d0c Revert "Prometheus: Render Rules as Templates"
This reverts commit fb7fc87d23.

I first submitted that as a way to add dynamic capability to the
prometheus rules (they infamously don't support ENV variable
substitution there). However this be done easily with another solution,
and would clean up the prometheus chart values significantly.

Change-Id: Ibec512d92490798ae5522468b915b49e7746806a
2020-10-06 15:21:18 +00:00

326 lines
15 KiB
YAML

---
conf:
prometheus:
rules:
openstack:
groups:
- name: mariadb.rules
rules:
- alert: prom_exporter_mariadb_openstack_unavailable
expr: avg_over_time(up{job="mysql-exporter",kubernetes_namespace="openstack"}[5m]) == 0
for: 5m
labels:
severity: warning
annotations:
description: MariaDB exporter in {{ $labels.kubernetes_namespace }} is not collecting metrics or is not available for past 10 minutes
title: MariaDB exporter is not collecting metrics or is not available
- alert: prom_exporter_mariadb_osh_infra_unavailable
expr: avg_over_time(up{job="mysql-exporter",kubernetes_namespace="osh-infra"}[5m]) == 0
for: 5m
labels:
severity: warning
annotations:
description: MariaDB exporter in {{ $labels.kubernetes_namespace }} is not collecting metrics or is not available for past 10 minutes
title: MariaDB exporter is not collecting metrics or is not available
- alert: mariadb_table_lock_wait_high
expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30
for: 10m
labels:
severity: warning
annotations:
description: 'Mariadb has high table lock waits of {{ $value }} percentage'
summary: 'Mariadb table lock waits are high'
- alert: mariadb_node_not_ready
expr: mysql_global_status_wsrep_ready != 1
for: 10m
labels:
severity: warning
annotations:
description: '{{$labels.job}} on {{$labels.instance}} is not ready.'
summary: 'Galera cluster node not ready'
- alert: mariadb_galera_node_out_of_sync
expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0
for: 10m
labels:
severity: warning
annotations:
description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)'
summary: 'Galera cluster node out of sync'
- alert: mariadb_innodb_replication_fallen_behind
expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0)
for: 10m
labels:
severity: warning
annotations:
description: 'The mysql innodb replication has fallen behind and is not recovering'
summary: 'MySQL innodb replication is lagging'
- name: openstack.rules
rules:
- alert: prom_exporter_openstack_unavailable
expr: avg_over_time(up{job="openstack-metrics"}[5m]) == 0
for: 5m
labels:
severity: warning
annotations:
description: Openstack exporter is not collecting metrics or is not available for past 10 minutes
title: Openstack exporter is not collecting metrics or is not available
- alert: os_glance_api_availability
expr: openstack_check_glance_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Glance API is not available at {{$labels.url}}'
- alert: os_nova_api_availability
expr: openstack_check_nova_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Nova API is not available at {{$labels.url}}'
- alert: os_keystone_api_availability
expr: openstack_check_keystone_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Keystone API is not available at {{$labels.url}}'
- alert: os_neutron_api_availability
expr: openstack_check_neutron_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Neutron API is not available at {{$labels.url}}'
- alert: os_neutron_metadata_agent_availability
expr: openstack_services_neutron_metadata_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron metadata_agents are not available for more than 5 minutes'
summary: 'One or more neutron metadata_agents are not available'
- alert: os_neutron_openvswitch_agent_availability
expr: openstack_services_neutron_openvswitch_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron openvswitch agents are not available for more than 5 minutes'
summary: 'One or more neutron openvswitch agents are not available'
- alert: os_neutron_dhcp_agent_availability
expr: openstack_services_neutron_dhcp_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron dhcp agents are not available for more than 5 minutes'
summary: 'One or more neutron dhcp agents are not available'
- alert: os_neutron_l3_agent_availability
expr: openstack_services_neutron_l3_agent_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'One or more neutron L3 agents are not available for more than 5 minutes'
summary: 'One or more neutron L3 agents are not available'
- alert: os_swift_api_availability
expr: openstack_check_swift_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Swift API is not available at {{$labels.url}}'
- alert: os_cinder_api_availability
expr: openstack_check_cinder_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Cinder API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Cinder API is not available at {{$labels.url}}'
- alert: os_cinder_scheduler_availability
expr: openstack_services_cinder_cinder_scheduler != 1
for: 5m
labels:
severity: page
annotations:
description: 'Cinder scheduler is not available for more than 5 minutes'
summary: 'Cinder scheduler is not available'
- alert: os_heat_api_availability
expr: openstack_check_heat_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Heat API is not available at {{$labels.url}}'
- alert: os_nova_compute_disabled
expr: openstack_services_nova_compute_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-compute is disabled on some hosts'
- alert: os_nova_conductor_disabled
expr: openstack_services_nova_conductor_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-conductor is disabled on some hosts'
- alert: os_nova_consoleauth_disabled
expr: openstack_services_nova_consoleauth_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
- alert: os_nova_scheduler_disabled
expr: openstack_services_nova_scheduler_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
- alert: os_nova_compute_down
expr: openstack_services_nova_compute_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-compute is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-compute is down on some hosts'
- alert: os_nova_conductor_down
expr: openstack_services_nova_conductor_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-conductor is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-conductor is down on some hosts'
- alert: os_nova_consoleauth_down
expr: openstack_services_nova_consoleauth_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-consoleauth is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-consoleauth is down on some hosts'
- alert: os_nova_scheduler_down
expr: openstack_services_nova_scheduler_down_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-scheduler is down on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-scheduler is down on some hosts'
- alert: os_vm_vcpu_usage_high
expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM vcpu usage is hight at {{$value}} percent'
summary: 'Openstack VM vcpu usage is high'
- alert: os_vm_ram_usage_high
expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM RAM usage is hight at {{$value}} percent'
summary: 'Openstack VM RAM usage is high'
- alert: os_vm_disk_usage_high
expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80
for: 5m
labels:
severity: page
annotations:
description: 'Openstack VM Disk usage is hight at {{$value}} percent'
summary: 'Openstack VM Disk usage is high'
- name: rabbitmq.rules
rules:
- alert: rabbitmq_network_pratitions_detected
expr: min(partitions) by(instance) > 0
for: 10m
labels:
severity: warning
annotations:
description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions'
summary: 'RabbitMQ Network partitions detected'
- alert: rabbitmq_down
expr: min(rabbitmq_up) by(instance) != 1
for: 10m
labels:
severity: page
annotations:
description: 'RabbitMQ Server instance {{ $labels.instance }} is down'
summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down the last 10 mins'
- alert: rabbitmq_file_descriptor_usage_high
expr: fd_used * 100 /fd_total > 80
for: 10m
labels:
severity: warning
annotations:
description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.'
summary: 'RabbitMQ file descriptors usage is high for last 10 mins'
- alert: rabbitmq_node_disk_free_alarm
expr: node_disk_free_alarm > 0
for: 10m
labels:
severity: warning
annotations:
description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.'
summary: 'RabbitMQ disk space usage is high'
- alert: rabbitmq_node_memory_alarm
expr: node_mem_alarm > 0
for: 10m
labels:
severity: warning
annotations:
description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.'
summary: 'RabbitMQ memory usage is high'
- alert: rabbitmq_less_than_3_nodes
expr: running < 3
for: 10m
labels:
severity: warning
annotations:
description: 'RabbitMQ Server has less than 3 nodes running.'
summary: 'RabbitMQ server is at risk of loosing data'
- alert: rabbitmq_queue_messages_returned_high
expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50
for: 5m
labels:
severity: warning
annotations:
description: 'RabbitMQ Server is returing more than 50 percent of messages received.'
summary: 'RabbitMQ server is returning more than 50 percent of messages received.'
- alert: rabbitmq_consumers_low_utilization
expr: queue_consumer_utilisation < .4
for: 5m
labels:
severity: warning
annotations:
description: 'RabbitMQ consumers message consumption speed is low'
summary: 'RabbitMQ consumers message consumption speed is low'
- alert: rabbitmq_high_message_load
expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000
for: 5m
labels:
severity: warning
annotations:
description: 'RabbitMQ has high message load. Total Queue depth > 17000 or growth more than 4000 messages.'
summary: 'RabbitMQ has high message load'
...