Merge "Revert "Prometheus: Render Rules as Templates""

commit 72658e2712
Zuul authored 2020-10-07 15:11:14 +00:00, committed by Gerrit Code Review
9 changed files with 242 additions and 224 deletions

View File

@@ -15,7 +15,7 @@ apiVersion: v1
 appVersion: v2.12.0
 description: OpenStack-Helm Prometheus
 name: prometheus
-version: 0.1.2
+version: 0.1.3
 home: https://prometheus.io/
 sources:
 - https://github.com/prometheus/prometheus

View File

@@ -22,9 +22,8 @@ metadata:
 type: Opaque
 data:
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.prometheus.scrape_configs.template "key" "prometheus.yml" "format" "Secret") | indent 2 }}
-{{ range $name, $config := .Values.conf.prometheus.rules }}
-{{- $filename := printf "%s.rules" $name}}
-{{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" $config "key" $filename "format" "Secret") | indent 2 }}
+{{ range $key, $value := .Values.conf.prometheus.rules }}
+{{ $key }}.rules: {{ toYaml $value | b64enc }}
 {{ end }}
 # NOTE(srwilkers): this must be last, to work round helm ~2.7 bug.
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.httpd "key" "httpd.conf" "format" "Secret") | indent 2 }}
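For context, a minimal sketch (values hypothetical, not part of this change) of what the restored loop renders: each key under .Values.conf.prometheus.rules becomes a literal, base64-encoded <key>.rules entry in the Secret, instead of being passed through the helm-toolkit values_template_renderer snippet as a template.

# Hypothetical values entry (sketch only):
conf:
  prometheus:
    rules:
      alertmanager:
        groups:
        - name: alertmanager.rules
          rules: []
# The restored "{{ $key }}.rules: {{ toYaml $value | b64enc }}" line would then emit, roughly:
#   alertmanager.rules: <base64 of the YAML under the "alertmanager" key>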

View File

@@ -12,7 +12,7 @@ conf:
 labels:
 severity: critical
 annotations:
-description: "{{`The configuration of the instances of the Alertmanager cluster {{$labels.service}} are out of sync.`}}"
+description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
 summary: Alertmanager configurations are inconsistent
 - alert: AlertmanagerDownOrMissing
 expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
@@ -20,7 +20,7 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.`}}"
+description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
 summary: Alertmanager down or not discovered
 - alert: FailedReload
 expr: alertmanager_config_last_reload_successful == 0
@@ -28,6 +28,6 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.`}}"
+description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
 summary: Alertmanager configuration reload has failed
 ...
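A note on the quoting reverted above and in the files that follow (an explanatory sketch, not part of the diff): in a Helm template, {{` ... `}} emits the enclosed text verbatim, so the wrapped form protected Prometheus's own {{ $labels.* }} placeholders while the rules were rendered through values_template_renderer. With the rules stored as plain values again, the unescaped strings on the right-hand side are sufficient, for example:

# Old form, needed only while rules were rendered as Helm templates:
description: "{{`Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.`}}"
# Restored form, stored as-is so Prometheus receives the {{ $labels.* }} text directly:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.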

View File

@@ -29,56 +29,56 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`no ceph active mgr is present or all ceph mgr are down`}}"
-summary: "{{`no ceph active mgt is present`}}"
+description: 'no ceph active mgr is present or all ceph mgr are down'
+summary: 'no ceph active mgt is present'
 - alert: ceph_monitor_quorum_low
 expr: ceph_mon_quorum_count < 3
 for: 5m
 labels:
 severity: page
 annotations:
-description: "{{`ceph monitor quorum has been less than 3 for more than 5 minutes`}}"
-summary: "{{`ceph high availability is at risk`}}"
+description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
+summary: 'ceph high availability is at risk'
 - alert: ceph_monitor_quorum_absent
 expr: absent(avg_over_time(ceph_mon_quorum_status[5m]))
 labels:
 severity: page
 annotations:
-description: "{{`ceph monitor quorum has been gone for more than 5 minutes`}}"
-summary: "{{`ceph high availability is at risk`}}"
+description: 'ceph monitor quorum has been gone for more than 5 minutes'
+summary: 'ceph high availability is at risk'
 - alert: ceph_cluster_usage_high
 expr: avg_over_time(ceph_cluster_usage_percent[5m]) > 80
 labels:
 severity: page
 annotations:
-description: "{{`ceph cluster capacity usage more than 80 percent`}}"
-summary: "{{`ceph cluster usage is more than 80 percent`}}"
+description: 'ceph cluster capacity usage more than 80 percent'
+summary: 'ceph cluster usage is more than 80 percent'
 - alert: ceph_placement_group_degrade_pct_high
 expr: avg_over_time(ceph_placement_group_degrade_percent[5m]) > 80
 labels:
 severity: critical
 annotations:
-description: "{{`ceph placement group degradation is more than 80 percent`}}"
-summary: "{{`ceph placement groups degraded`}}"
+description: 'ceph placement group degradation is more than 80 percent'
+summary: 'ceph placement groups degraded'
 - alert: ceph_osd_down_pct_high
 expr: avg_over_time(ceph_osd_down_percent[5m]) > 80
 labels:
 severity: critical
 annotations:
-description: "{{`ceph OSDs down percent is more than 80 percent`}}"
-summary: "{{`ceph OSDs down percent is high`}}"
+description: 'ceph OSDs down percent is more than 80 percent'
+summary: 'ceph OSDs down percent is high'
 - alert: ceph_osd_down
 expr: avg_over_time(ceph_osd_up[5m]) == 0
 labels:
 severity: critical
 annotations:
-description: "{{`ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.`}}"
-summary: "{{`ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.`}}"
+description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.'
+summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.'
 - alert: ceph_osd_out
 expr: avg_over_time(ceph_osd_in[5m]) == 0
 labels:
 severity: page
 annotations:
-description: "{{`ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.`}}"
-summary: "{{`ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.`}}"
+description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.'
+summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.'
 ...

View File

@@ -20,72 +20,72 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`Elasticsearch at {{ $labels.host }} has more than 64000 process open file count.`}}"
-summary: Elasticsearch has a very high process open file count.
+description: 'Elasticsearch at {{ $labels.host }} has more than 64000 process open file count.'
+summary: 'Elasticsearch has a very high process open file count.'
 - alert: es_high_process_cpu_percent
 expr: elasticsearch_process_cpu_percent > 95
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.`}}"
-summary: Elasticsearch process cpu usage is more than 95 percent.
+description: 'Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.'
+summary: 'Elasticsearch process cpu usage is more than 95 percent.'
 - alert: es_fs_usage_high
 expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.`}}"
-summary: Elasticsearch filesystem usage is high.
+description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.'
+summary: 'Elasticsearch filesystem usage is high.'
 - alert: es_unassigned_shards
 expr: elasticsearch_cluster_health_unassigned_shards > 0
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`Elasticsearch has {{ $value }} unassigned shards.`}}"
-summary: Elasticsearch has unassigned shards and hence a unhealthy cluster state.
+description: 'Elasticsearch has {{ $value }} unassigned shards.'
+summary: 'Elasticsearch has unassigned shards and hence a unhealthy cluster state.'
 - alert: es_cluster_health_timed_out
 expr: elasticsearch_cluster_health_timed_out > 0
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`Elasticsearch cluster health status call timedout {{ $value }} times.`}}"
-summary: Elasticsearch cluster health status calls are timing out.
+description: 'Elasticsearch cluster health status call timedout {{ $value }} times.'
+summary: 'Elasticsearch cluster health status calls are timing out.'
 - alert: es_cluster_health_status_alert
 expr: (sum(elasticsearch_cluster_health_status{color="green"})*2)+sum(elasticsearch_cluster_health_status{color="yellow"}) < 2
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.`}}"
-summary: Elasticsearch cluster health status is not green.
+description: 'Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.'
+summary: 'Elasticsearch cluster health status is not green.'
 - alert: es_cluster_health_too_few_nodes_running
 expr: elasticsearch_cluster_health_number_of_nodes < 3
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`There are only {{$value}} < 3 ElasticSearch nodes running`}}"
-summary: ElasticSearch running on less than 3 nodes
+description: 'There are only {{$value}} < 3 ElasticSearch nodes running'
+summary: 'ElasticSearch running on less than 3 nodes'
 - alert: es_cluster_health_too_few_data_nodes_running
 expr: elasticsearch_cluster_health_number_of_data_nodes < 3
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`There are only {{$value}} < 3 ElasticSearch data nodes running`}}"
-summary: ElasticSearch running on less than 3 data nodes
+description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
+summary: 'ElasticSearch running on less than 3 data nodes'
 - alert: es_cluster_health_too_few_data_nodes_running
 expr: elasticsearch_cluster_health_number_of_data_nodes < 3
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`There are only {{$value}} < 3 ElasticSearch data nodes running`}}"
-summary: ElasticSearch running on less than 3 data nodes
+description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
+summary: 'ElasticSearch running on less than 3 data nodes'
 fluentd:
 groups:
 - name: fluentd.alerting_rules

View File

@@ -19,45 +19,45 @@ conf:
 labels:
 severity: page
 annotations:
-description: "{{`Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour`}}"
-summary: A high number of dataplane failures within Felix are happening
+description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour'
+summary: 'A high number of dataplane failures within Felix are happening'
 - alert: calico_datapane_address_msg_batch_size_high_5m
 expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5
 for: 5m
 labels:
 severity: page
 annotations:
-description: "{{`Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size`}}"
-summary: Felix address message batch size is higher
+description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size'
+summary: 'Felix address message batch size is higher'
 - alert: calico_datapane_iface_msg_batch_size_high_5m
 expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5
 for: 5m
 labels:
 severity: page
 annotations:
-description: "{{`Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size`}}"
-summary: Felix interface message batch size is higher
+description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size'
+summary: 'Felix interface message batch size is higher'
 - alert: calico_ipset_errors_high_1h
 expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5
 labels:
 severity: page
 annotations:
-description: "{{`Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour`}}"
-summary: A high number of ipset errors within Felix are happening
+description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour'
+summary: 'A high number of ipset errors within Felix are happening'
 - alert: calico_iptable_save_errors_high_1h
 expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5
 labels:
 severity: page
 annotations:
-description: "{{`Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour`}}"
-summary: A high number of iptable save errors within Felix are happening
+description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour'
+summary: 'A high number of iptable save errors within Felix are happening'
 - alert: calico_iptable_restore_errors_high_1h
 expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5
 labels:
 severity: page
 annotations:
-description: "{{`Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour`}}"
-summary: A high number of iptable restore errors within Felix are happening
+description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
+summary: 'A high number of iptable restore errors within Felix are happening'
 - name: etcd3.rules
 rules:
 - alert: etcd_InsufficientMembers
@@ -74,14 +74,14 @@ conf:
 labels:
 severity: critical
 annotations:
-description: "{{`etcd member {{ $labels.instance }} has no leader`}}"
+description: etcd member {{ $labels.instance }} has no leader
 summary: etcd member has no leader
 - alert: etcd_HighNumberOfLeaderChanges
 expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
 labels:
 severity: warning
 annotations:
-description: "{{`etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour`}}"
+description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
 summary: a high number of leader changes within the etcd cluster are happening
 - alert: etcd_HighNumberOfFailedGRPCRequests
 expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
@@ -89,7 +89,7 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}`}}"
+description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
 summary: a high number of gRPC requests are failing
 - alert: etcd_HighNumberOfFailedGRPCRequests
 expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
@@ -97,7 +97,7 @@ conf:
 labels:
 severity: critical
 annotations:
-description: "{{`{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}`}}"
+description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
 summary: a high number of gRPC requests are failing
 - alert: etcd_GRPCRequestsSlow
 expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
@@ -105,7 +105,7 @@ conf:
 labels:
 severity: critical
 annotations:
-description: "{{`on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow`}}"
+description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
 summary: slow gRPC requests
 - alert: etcd_HighNumberOfFailedHTTPRequests
 expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
@@ -113,7 +113,7 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}`}}"
+description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
 summary: a high number of HTTP requests are failing
 - alert: etcd_HighNumberOfFailedHTTPRequests
 expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
@@ -121,7 +121,7 @@ conf:
 labels:
 severity: critical
 annotations:
-description: "{{`{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}`}}"
+description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
 summary: a high number of HTTP requests are failing
 - alert: etcd_HTTPRequestsSlow
 expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
@@ -129,7 +129,7 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow`}}"
+description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
 summary: slow HTTP requests
 - alert: etcd_EtcdMemberCommunicationSlow
 expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
@@ -137,14 +137,14 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow`}}"
+description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
 summary: etcd member communication is slow
 - alert: etcd_HighNumberOfFailedProposals
 expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
 labels:
 severity: warning
 annotations:
-description: "{{`etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour`}}"
+description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
 summary: a high number of proposals within the etcd cluster are failing
 - alert: etcd_HighFsyncDurations
 expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
@@ -152,7 +152,7 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`etcd instance {{ $labels.instance }} fync durations are high`}}"
+description: etcd instance {{ $labels.instance }} fync durations are high
 summary: high fsync durations
 - alert: etcd_HighCommitDurations
 expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
@@ -160,7 +160,7 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`etcd instance {{ $labels.instance }} commit durations are high`}}"
+description: etcd instance {{ $labels.instance }} commit durations are high
 summary: high commit durations
 - name: kubelet.rules
 rules:
@@ -170,15 +170,15 @@ conf:
 labels:
 severity: critical
 annotations:
-description: "{{`The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than a minute`}}"
-summary: "{{`{{ $labels.node }} Node status is NotReady and {{ $labels.status }}`}}"
+description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than a minute
+summary: '{{ $labels.node }} Node status is NotReady and {{ $labels.status }}'
 - alert: K8SManyNodesNotReady
 expr: count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) / count(kube_node_status_condition{condition="Ready", status="unknown"})) > 0.2
 for: 1m
 labels:
 severity: critical
 annotations:
-description: "{{`{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).`}}"
+description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
 summary: Many Kubernetes nodes are Not Ready
 - alert: K8SManyNodesNotReady
 expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="false"} == 1) / count(kube_node_status_condition{condition="Ready", status="false"})) > 0.2
@@ -186,7 +186,7 @@ conf:
 labels:
 severity: critical
 annotations:
-description: "{{`{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).`}}"
+description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
 summary: Many Kubernetes nodes are Not Ready
 - alert: K8SNodesNotReady
 expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 0 or count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 0
@@ -194,7 +194,7 @@ conf:
 labels:
 severity: critical
 annotations:
-description: "{{`{{ $value }} nodes are notReady state.`}}"
+description: '{{ $value }} nodes are notReady state.'
 summary: One or more Kubernetes nodes are Not Ready
 - alert: K8SKubeletDown
 expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
@@ -202,7 +202,7 @@ conf:
 labels:
 severity: critical
 annotations:
-description: "{{`Prometheus failed to scrape {{ $value }}% of kubelets.`}}"
+description: Prometheus failed to scrape {{ $value }}% of kubelets.
 summary: Many Kubelets cannot be scraped
 - alert: K8SKubeletDown
 expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
@@ -210,14 +210,14 @@ conf:
 labels:
 severity: critical
 annotations:
-description: "{{`Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.`}}"
+description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
 summary: Many Kubelets cannot be scraped
 - alert: K8SKubeletTooManyPods
 expr: kubelet_running_pod_count > 100
 labels:
 severity: warning
 annotations:
-description: "{{`Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110`}}"
+description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
 summary: Kubelet is close to pod limit
 - name: kube-apiserver.rules
 rules:
@@ -235,7 +235,7 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.`}}"
+description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
 summary: Kubernetes apiserver latency is high
 - name: kube-controller-manager.rules
 rules:
@@ -264,118 +264,118 @@ conf:
 labels:
 severity: page
 annotations:
-description: "{{`statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired`}}"
-summary: "{{`{{$labels.statefulset}}: has inssuficient replicas.`}}"
+description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
+summary: '{{$labels.statefulset}}: has inssuficient replicas.'
 - alert: daemonsets_misscheduled
 expr: kube_daemonset_status_number_misscheduled > 0
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`Daemonset {{$labels.daemonset}} is running where it is not supposed to run`}}"
-summary: Daemonsets not scheduled correctly
+description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
+summary: 'Daemonsets not scheduled correctly'
 - alert: daemonsets_not_scheduled
 expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number`}}"
-summary: Less than desired number of daemonsets scheduled
+description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
+summary: 'Less than desired number of daemonsets scheduled'
 - alert: daemonset_pods_unavailable
 expr: kube_daemonset_status_number_unavailable > 0
 for: 10m
 labels:
 severity: warning
 annotations:
-description: "{{`Daemonset {{$labels.daemonset}} currently has pods unavailable`}}"
-summary: Daemonset pods unavailable, due to one of many reasons
+description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
+summary: 'Daemonset pods unavailable, due to one of many reasons'
 - alert: deployment_replicas_unavailable
 expr: kube_deployment_status_replicas_unavailable > 0
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`deployment {{$labels.deployment}} has {{$value}} replicas unavailable`}}"
-summary: "{{`{{$labels.deployment}}: has inssuficient replicas.`}}"
+description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
+summary: '{{$labels.deployment}}: has inssuficient replicas.'
 - alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
 expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update`}}"
-summary: "{{`{{$labels.deployment}}: has inssuficient replicas during a rolling update.`}}"
+description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
+summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.'
 - alert: job_status_failed
 expr: kube_job_status_failed > 0
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`Job {{$labels.exported_job}} is in failed status`}}"
-summary: "{{`{{$labels.exported_job}} has failed status`}}"
+description: 'Job {{$labels.exported_job}} is in failed status'
+summary: '{{$labels.exported_job}} has failed status'
 - alert: pod_status_pending
 expr: kube_pod_status_phase{phase="Pending"} == 1
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes`}}"
-summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status`}}"
+description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
+summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
 - alert: pod_status_error_image_pull
 expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes`}}"
-summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status`}}"
+description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
+summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
 - alert: pod_status_error_image_pull_backoff
 expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes`}}"
-summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status`}}"
+description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
+summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
 - alert: pod_error_crash_loop_back_off
 expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes`}}"
-summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status`}}"
+description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes'
+summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
 - alert: pod_error_config_error
 expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes`}}"
-summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status`}}"
+description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
+summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
 - alert: replicaset_missing_replicas
 expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes`}}"
-summary: "{{`Replicaset {{$labels.replicaset}} is missing replicas`}}"
+description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
+summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
 - alert: pod_container_terminated
 expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes`}}"
-summary: "{{`Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status`}}"
+description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
+summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
 - alert: volume_claim_capacity_high_utilization
 expr: 100 * kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 80
 for: 5m
 labels:
 severity: page
 annotations:
-description: "{{`volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity`}}"
-summary: "{{`{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.`}}"
+description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity'
+summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'
 ...

View File

@@ -28,71 +28,80 @@ conf:
 labels:
 severity: page
 annotations:
-description: "{{`{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} has less than 20% free space left.`}}"
-summary: "{{`{{$labels.alias}}: Filesystem is running out of space soon.`}}"
+description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
+  has less than 20% free space left.'
+summary: '{{$labels.alias}}: Filesystem is running out of space soon.'
 - alert: node_filesystem_full_in_4h
 expr: predict_linear(node_filesystem_free{fstype =~ "xfs|ext[34]"}[1h], 4 * 3600) <= 0
 for: 5m
 labels:
 severity: page
 annotations:
-description: "{{`{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space of in approx. 4 hours`}}"
-summary: "{{`{{$labels.alias}}: Filesystem is running out of space in 4 hours.`}}"
+description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
+  is running out of space of in approx. 4 hours'
+summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.'
 - alert: node_filedescriptors_full_in_3h
 expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum
 for: 20m
 labels:
 severity: page
 annotations:
-description: "{{`{{$labels.alias}} is running out of available file descriptors in approx. 3 hours`}}"
-summary: "{{`{{$labels.alias}} is running out of available file descriptors in 3 hours.`}}"
+description: '{{$labels.alias}} is running out of available file descriptors
+  in approx. 3 hours'
+summary: '{{$labels.alias}} is running out of available file descriptors in
+  3 hours.'
 - alert: node_load1_90percent
 expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9
 for: 1h
 labels:
 severity: page
 annotations:
-description: "{{`{{$labels.alias}} is running with > 90% total load for at least 1h.`}}"
-summary: "{{`{{$labels.alias}}: Running on high load.`}}"
+description: '{{$labels.alias}} is running with > 90% total load for at least
+  1h.'
+summary: '{{$labels.alias}}: Running on high load.'
 - alert: node_cpu_util_90percent
 expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90
 for: 1h
 labels:
 severity: page
 annotations:
-description: "{{`{{$labels.alias}} has total CPU utilization over 90% for at least 1h.`}}"
-summary: "{{`{{$labels.alias}}: High CPU utilization.`}}"
+description: '{{$labels.alias}} has total CPU utilization over 90% for at least
+  1h.'
+summary: '{{$labels.alias}}: High CPU utilization.'
 - alert: node_ram_using_90percent
 expr: avg_over_time(node_ram_usage_percent[2m]) > 90
 for: 30m
 labels:
 severity: page
 annotations:
-description: "{{`{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.`}}"
-summary: "{{`{{$labels.alias}}: Using lots of RAM.`}}"
+description: '{{$labels.alias}} is using at least 90% of its RAM for at least
+  30 minutes now.'
+summary: '{{$labels.alias}}: Using lots of RAM.'
 - alert: node_swap_using_80percent
 expr: avg_over_time(node_swap_usage_percent[2m]) > 80
 for: 10m
 labels:
 severity: page
 annotations:
-description: "{{`{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now.`}}"
-summary: "{{`{{$labels.alias}}: Running out of swap soon.`}}"
+description: '{{$labels.alias}} is using 80% of its swap space for at least
+  10 minutes now.'
+summary: '{{$labels.alias}}: Running out of swap soon.'
 - alert: node_high_cpu_load
 expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0
 for: 1m
 labels:
 severity: warning
 annotations:
-description: "{{`{{$labels.alias}} is running with load15 > 1 for at least 5 minutes: {{$value}}`}}"
-summary: "{{`{{$labels.alias}}: Running on high load: {{$value}}`}}"
+description: '{{$labels.alias}} is running with load15 > 1 for at least 5 minutes: {{$value}}'
+summary: '{{$labels.alias}}: Running on high load: {{$value}}'
 - alert: node_high_memory_load
 expr: avg_over_time(node_ram_usage_percent[2m]) > 85
 for: 1m
 labels:
 severity: warning
 annotations:
-description: "{{`Host memory usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.`}}"
+description: Host memory usage is {{ humanize $value }}%. Reported by
+  instance {{ $labels.instance }} of job {{ $labels.job }}.
 summary: Server memory is almost full
 - alert: node_high_storage_load
 expr: avg_over_time(node_storage_usage_percent{mountpoint="/"}[2m]) > 85
@@ -100,7 +109,8 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`Host storage usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.`}}"
+description: Host storage usage is {{ humanize $value }}%. Reported by
+  instance {{ $labels.instance }} of job {{ $labels.job }}.
 summary: Server storage is almost full
 - alert: node_high_swap
 expr: (node_memory_SwapTotal - node_memory_SwapFree) < (node_memory_SwapTotal
@@ -109,7 +119,8 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`Host system has a high swap usage of {{ humanize $value }}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.`}}"
+description: Host system has a high swap usage of {{ humanize $value }}. Reported
+  by instance {{ $labels.instance }} of job {{ $labels.job }}.
 summary: Server has a high swap usage
 - alert: node_high_network_drop_rcv
 expr: node_network_receive_drop{device!="lo"} > 3000
@@ -117,7 +128,9 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`Host system has an unusally high drop in network reception ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}`}}"
+description: Host system has an unusally high drop in network reception ({{
+  humanize $value }}). Reported by instance {{ $labels.instance }} of job {{
+  $labels.job }}
 summary: Server has a high receive drop
 - alert: node_high_network_drop_send
 expr: node_network_transmit_drop{device!="lo"} > 3000
@@ -125,7 +138,9 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`Host system has an unusally high drop in network transmission ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{$labels.job }}`}}"
+description: Host system has an unusally high drop in network transmission ({{
+  humanize $value }}). Reported by instance {{ $labels.instance }} of job {{
+  $labels.job }}
 summary: Server has a high transmit drop
 - alert: node_high_network_errs_rcv
 expr: node_network_receive_errs{device!="lo"} > 3000
@@ -133,7 +148,9 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`Host system has an unusally high error rate in network reception ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}`}}"
+description: Host system has an unusally high error rate in network reception
+  ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job
+  {{ $labels.job }}
 summary: Server has unusual high reception errors
 - alert: node_high_network_errs_send
 expr: node_network_transmit_errs{device!="lo"} > 3000
@@ -141,7 +158,9 @@ conf:
 labels:
 severity: warning
 annotations:
-description: "{{`Host system has an unusally high error rate in network transmission ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}`}}"
+description: Host system has an unusally high error rate in network transmission
+  ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job
+  {{ $labels.job }}
 summary: Server has unusual high transmission errors
 - alert: node_network_conntrack_usage_80percent
 expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8)
@ -149,78 +168,78 @@ conf:
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit`}}" description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit'
summary: "{{`{{$labels.instance}}: available network conntrack entries are low.`}}" summary: '{{$labels.instance}}: available network conntrack entries are low.'
- alert: node_entropy_available_low - alert: node_entropy_available_low
expr: node_entropy_available_bits < 300 expr: node_entropy_available_bits < 300
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`{{$labels.instance}} has available entropy bits of {{ $value }} which is less than required of 300`}}" description: '{{$labels.instance}} has available entropy bits of {{ $value }} which is less than required of 300'
summary: "{{`{{$labels.instance}}: is low on entropy bits.`}}" summary: '{{$labels.instance}}: is low on entropy bits.'
- alert: node_hwmon_high_cpu_temp - alert: node_hwmon_high_cpu_temp
expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0 expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}`}}" description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}'
summary: "{{`{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}`}}" summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}'
- alert: node_vmstat_paging_rate_high - alert: node_vmstat_paging_rate_high
expr: irate(node_vmstat_pgpgin[5m]) > 80 expr: irate(node_vmstat_pgpgin[5m]) > 80
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}`}}" description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}'
summary: "{{`{{$labels.alias}}: memory paging rate is high: {{$value}}`}}" summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}'
- alert: node_xfs_block_allocation_high - alert: node_xfs_block_allocation_high
expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80 expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}`}}" description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}'
summary: "{{`{{$labels.alias}}: xfs block allocation high: {{$value}}`}}" summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}'
- alert: node_network_bond_slaves_down - alert: node_network_bond_slaves_down
expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0 expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`{{ $labels.master }} is missing {{ $value }} slave interface(s).`}}" description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).'
summary: "{{`Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)`}}" summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)'
- alert: node_numa_memory_used - alert: node_numa_memory_used
expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80 expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}`}}" description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}'
summary: "{{`{{$labels.alias}}: has high NUMA memory usage: {{$value}}`}}" summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}'
- alert: node_ntp_clock_skew_high - alert: node_ntp_clock_skew_high
expr: abs(node_ntp_drift_seconds) > 2 expr: abs(node_ntp_drift_seconds) > 2
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`{{$labels.alias}} has a time difference of more than 2 seconds compared to the NTP server: {{ $value }}`}}" description: '{{$labels.alias}} has a time difference of more than 2 seconds compared to the NTP server: {{ $value }}'
summary: "{{`{{$labels.alias}}: time is skewed by {{$value}} seconds`}}" summary: '{{$labels.alias}}: time is skewed by {{$value}} seconds'
- alert: node_disk_read_latency - alert: node_disk_read_latency
expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 40 expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 40
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`{{$labels.device}} has a high read latency of {{ $value }}`}}" description: '{{$labels.device}} has a high read latency of {{ $value }}'
summary: "{{`High read latency observed for device {{ $labels.device }}`}}" summary: 'High read latency observed for device {{ $labels.device }}'
- alert: node_disk_write_latency - alert: node_disk_write_latency
expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 40 expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 40
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`{{$labels.device}} has a high write latency of {{ $value }}`}}" description: '{{$labels.device}} has a high write latency of {{ $value }}'
summary: "{{`High write latency observed for device {{ $labels.device }}`}}" summary: 'High write latency observed for device {{ $labels.device }}'
... ...
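The pattern of change in this file is visible in every annotation above: on the removed (left) side each Prometheus template reference is wrapped in Helm's backtick raw-string syntax so that Helm's renderer emits it verbatim, while the restored (right) side keeps the plain Prometheus syntax. A minimal sketch of the two forms, using a hypothetical annotation:

# Escaped form (removed side): the backtick raw string keeps {{ ... }} away
# from Helm's template engine, so Prometheus receives it unchanged.
summary: "{{`{{ $labels.instance }}: value is {{ $value }}`}}"

# Plain form (restored side): written out as ordinary YAML, so the Prometheus
# template syntax is passed through as-is.
summary: '{{ $labels.instance }}: value is {{ $value }}'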
View File
@ -12,7 +12,7 @@ conf:
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: "{{`MariaDB exporter in {{ $labels.kubernetes_namespace }} is not collecting metrics or is not available for the past 10 minutes`}}" description: MariaDB exporter in {{ $labels.kubernetes_namespace }} is not collecting metrics or is not available for the past 10 minutes
title: MariaDB exporter is not collecting metrics or is not available title: MariaDB exporter is not collecting metrics or is not available
- alert: prom_exporter_mariadb_osh_infra_unavailable - alert: prom_exporter_mariadb_osh_infra_unavailable
expr: avg_over_time(up{job="mysql-exporter",kubernetes_namespace="osh-infra"}[5m]) == 0 expr: avg_over_time(up{job="mysql-exporter",kubernetes_namespace="osh-infra"}[5m]) == 0
@ -20,7 +20,7 @@ conf:
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: "{{`MariaDB exporter in {{ $labels.kubernetes_namespace }} is not collecting metrics or is not available for the past 10 minutes`}}" description: MariaDB exporter in {{ $labels.kubernetes_namespace }} is not collecting metrics or is not available for the past 10 minutes
title: MariaDB exporter is not collecting metrics or is not available title: MariaDB exporter is not collecting metrics or is not available
- alert: mariadb_table_lock_wait_high - alert: mariadb_table_lock_wait_high
expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30 expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30
@ -28,32 +28,32 @@ conf:
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: "{{`Mariadb has high table lock waits of {{ $value }} percent`}}" description: 'Mariadb has high table lock waits of {{ $value }} percent'
summary: Mariadb table lock waits are high summary: 'Mariadb table lock waits are high'
- alert: mariadb_node_not_ready - alert: mariadb_node_not_ready
expr: mysql_global_status_wsrep_ready != 1 expr: mysql_global_status_wsrep_ready != 1
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: "{{`{{$labels.job}} on {{$labels.instance}} is not ready.`}}" description: '{{$labels.job}} on {{$labels.instance}} is not ready.'
summary: Galera cluster node not ready summary: 'Galera cluster node not ready'
- alert: mariadb_galera_node_out_of_sync - alert: mariadb_galera_node_out_of_sync
expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0 expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: "{{`{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)`}}" description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)'
summary: Galera cluster node out of sync summary: 'Galera cluster node out of sync'
- alert: mariadb_innodb_replication_fallen_behind - alert: mariadb_innodb_replication_fallen_behind
expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0) expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0)
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: The mysql innodb replication has fallen behind and is not recovering description: 'The mysql innodb replication has fallen behind and is not recovering'
summary: MySQL innodb replication is lagging summary: 'MySQL innodb replication is lagging'
- name: openstack.rules - name: openstack.rules
rules: rules:
- alert: prom_exporter_openstack_unavailable - alert: prom_exporter_openstack_unavailable
@ -70,184 +70,184 @@ conf:
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`Glance API is not available at {{$labels.url}} for more than 5 minutes`}}" description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
summary: "{{`Glance API is not available at {{$labels.url}}`}}" summary: 'Glance API is not available at {{$labels.url}}'
- alert: os_nova_api_availability - alert: os_nova_api_availability
expr: openstack_check_nova_api != 1 expr: openstack_check_nova_api != 1
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`Nova API is not available at {{$labels.url}} for more than 5 minutes`}}" description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
summary: "{{`Nova API is not available at {{$labels.url}}`}}" summary: 'Nova API is not available at {{$labels.url}}'
- alert: os_keystone_api_availability - alert: os_keystone_api_availability
expr: openstack_check_keystone_api != 1 expr: openstack_check_keystone_api != 1
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`Keystone API is not available at {{$labels.url}} for more than 5 minutes`}}" description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
summary: "{{`Keystone API is not available at {{$labels.url}}`}}" summary: 'Keystone API is not available at {{$labels.url}}'
- alert: os_neutron_api_availability - alert: os_neutron_api_availability
expr: openstack_check_neutron_api != 1 expr: openstack_check_neutron_api != 1
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`Neutron API is not available at {{$labels.url}} for more than 5 minutes`}}" description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
summary: "{{`Neutron API is not available at {{$labels.url}}`}}" summary: 'Neutron API is not available at {{$labels.url}}'
- alert: os_neutron_metadata_agent_availability - alert: os_neutron_metadata_agent_availability
expr: openstack_services_neutron_metadata_agent_down_total > 0 expr: openstack_services_neutron_metadata_agent_down_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: One or more neutron metadata_agents are not available for more than 5 minutes description: 'One or more neutron metadata_agents are not available for more than 5 minutes'
summary: One or more neutron metadata_agents are not available summary: 'One or more neutron metadata_agents are not available'
- alert: os_neutron_openvswitch_agent_availability - alert: os_neutron_openvswitch_agent_availability
expr: openstack_services_neutron_openvswitch_agent_down_total > 0 expr: openstack_services_neutron_openvswitch_agent_down_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: One or more neutron openvswitch agents are not available for more than 5 minutes description: 'One or more neutron openvswitch agents are not available for more than 5 minutes'
summary: One or more neutron openvswitch agents are not available summary: 'One or more neutron openvswitch agents are not available'
- alert: os_neutron_dhcp_agent_availability - alert: os_neutron_dhcp_agent_availability
expr: openstack_services_neutron_dhcp_agent_down_total > 0 expr: openstack_services_neutron_dhcp_agent_down_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: One or more neutron dhcp agents are not available for more than 5 minutes description: 'One or more neutron dhcp agents are not available for more than 5 minutes'
summary: One or more neutron dhcp agents are not available summary: 'One or more neutron dhcp agents are not available'
- alert: os_neutron_l3_agent_availability - alert: os_neutron_l3_agent_availability
expr: openstack_services_neutron_l3_agent_down_total > 0 expr: openstack_services_neutron_l3_agent_down_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: One or more neutron L3 agents are not available for more than 5 minutes description: 'One or more neutron L3 agents are not available for more than 5 minutes'
summary: One or more neutron L3 agents are not available summary: 'One or more neutron L3 agents are not available'
- alert: os_swift_api_availability - alert: os_swift_api_availability
expr: openstack_check_swift_api != 1 expr: openstack_check_swift_api != 1
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`Swift API is not available at {{$labels.url}} for more than 5 minutes`}}" description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes'
summary: "{{`Swift API is not available at {{$labels.url}}`}}" summary: 'Swift API is not available at {{$labels.url}}'
- alert: os_cinder_api_availability - alert: os_cinder_api_availability
expr: openstack_check_cinder_api != 1 expr: openstack_check_cinder_api != 1
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`Cinder API is not available at {{$labels.url}} for more than 5 minutes`}}" description: 'Cinder API is not available at {{$labels.url}} for more than 5 minutes'
summary: "{{`Cinder API is not available at {{$labels.url}}`}}" summary: 'Cinder API is not available at {{$labels.url}}'
- alert: os_cinder_scheduler_availability - alert: os_cinder_scheduler_availability
expr: openstack_services_cinder_cinder_scheduler != 1 expr: openstack_services_cinder_cinder_scheduler != 1
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: Cinder scheduler is not available for more than 5 minutes description: 'Cinder scheduler is not available for more than 5 minutes'
summary: Cinder scheduler is not available summary: 'Cinder scheduler is not available'
- alert: os_heat_api_availability - alert: os_heat_api_availability
expr: openstack_check_heat_api != 1 expr: openstack_check_heat_api != 1
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`Heat API is not available at {{$labels.url}} for more than 5 minutes`}}" description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes'
summary: "{{`Heat API is not available at {{$labels.url}}`}}" summary: 'Heat API is not available at {{$labels.url}}'
- alert: os_nova_compute_disabled - alert: os_nova_compute_disabled
expr: openstack_services_nova_compute_disabled_total > 0 expr: openstack_services_nova_compute_disabled_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: nova-compute is disabled on certain hosts for more than 5 minutes description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
summary: Openstack compute service nova-compute is disabled on some hosts summary: 'Openstack compute service nova-compute is disabled on some hosts'
- alert: os_nova_conductor_disabled - alert: os_nova_conductor_disabled
expr: openstack_services_nova_conductor_disabled_total > 0 expr: openstack_services_nova_conductor_disabled_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: nova-conductor is disabled on certain hosts for more than 5 minutes description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
summary: Openstack compute service nova-conductor is disabled on some hosts summary: 'Openstack compute service nova-conductor is disabled on some hosts'
- alert: os_nova_consoleauth_disabled - alert: os_nova_consoleauth_disabled
expr: openstack_services_nova_consoleauth_disabled_total > 0 expr: openstack_services_nova_consoleauth_disabled_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: nova-consoleauth is disabled on certain hosts for more than 5 minutes description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
summary: Openstack compute service nova-consoleauth is disabled on some hosts summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
- alert: os_nova_scheduler_disabled - alert: os_nova_scheduler_disabled
expr: openstack_services_nova_scheduler_disabled_total > 0 expr: openstack_services_nova_scheduler_disabled_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: nova-scheduler is disabled on certain hosts for more than 5 minutes description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
summary: Openstack compute service nova-scheduler is disabled on some hosts summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
- alert: os_nova_compute_down - alert: os_nova_compute_down
expr: openstack_services_nova_compute_down_total > 0 expr: openstack_services_nova_compute_down_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: nova-compute is down on certain hosts for more than 5 minutes description: 'nova-compute is down on certain hosts for more than 5 minutes'
summary: Openstack compute service nova-compute is down on some hosts summary: 'Openstack compute service nova-compute is down on some hosts'
- alert: os_nova_conductor_down - alert: os_nova_conductor_down
expr: openstack_services_nova_conductor_down_total > 0 expr: openstack_services_nova_conductor_down_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: nova-conductor is down on certain hosts for more than 5 minutes description: 'nova-conductor is down on certain hosts for more than 5 minutes'
summary: Openstack compute service nova-conductor is down on some hosts summary: 'Openstack compute service nova-conductor is down on some hosts'
- alert: os_nova_consoleauth_down - alert: os_nova_consoleauth_down
expr: openstack_services_nova_consoleauth_down_total > 0 expr: openstack_services_nova_consoleauth_down_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: nova-consoleauth is down on certain hosts for more than 5 minutes description: 'nova-consoleauth is down on certain hosts for more than 5 minutes'
summary: Openstack compute service nova-consoleauth is down on some hosts summary: 'Openstack compute service nova-consoleauth is down on some hosts'
- alert: os_nova_scheduler_down - alert: os_nova_scheduler_down
expr: openstack_services_nova_scheduler_down_total > 0 expr: openstack_services_nova_scheduler_down_total > 0
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: nova-scheduler is down on certain hosts for more than 5 minutes description: 'nova-scheduler is down on certain hosts for more than 5 minutes'
summary: Openstack compute service nova-scheduler is down on some hosts summary: 'Openstack compute service nova-scheduler is down on some hosts'
- alert: os_vm_vcpu_usage_high - alert: os_vm_vcpu_usage_high
expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80 expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`Openstack VM vcpu usage is high at {{$value}} percent`}}" description: 'Openstack VM vcpu usage is high at {{$value}} percent'
summary: Openstack VM vcpu usage is high summary: 'Openstack VM vcpu usage is high'
- alert: os_vm_ram_usage_high - alert: os_vm_ram_usage_high
expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80 expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`Openstack VM RAM usage is high at {{$value}} percent`}}" description: 'Openstack VM RAM usage is high at {{$value}} percent'
summary: Openstack VM RAM usage is high summary: 'Openstack VM RAM usage is high'
- alert: os_vm_disk_usage_high - alert: os_vm_disk_usage_high
expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80 expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80
for: 5m for: 5m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`Openstack VM Disk usage is high at {{$value}} percent`}}" description: 'Openstack VM Disk usage is high at {{$value}} percent'
summary: Openstack VM Disk usage is high summary: 'Openstack VM Disk usage is high'
- name: rabbitmq.rules - name: rabbitmq.rules
rules: rules:
- alert: rabbitmq_network_pratitions_detected - alert: rabbitmq_network_pratitions_detected
@ -256,70 +256,70 @@ conf:
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: "{{`RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions`}}" description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions'
summary: RabbitMQ Network partitions detected summary: 'RabbitMQ Network partitions detected'
- alert: rabbitmq_down - alert: rabbitmq_down
expr: min(rabbitmq_up) by(instance) != 1 expr: min(rabbitmq_up) by(instance) != 1
for: 10m for: 10m
labels: labels:
severity: page severity: page
annotations: annotations:
description: "{{`RabbitMQ Server instance {{ $labels.instance }} is down`}}" description: 'RabbitMQ Server instance {{ $labels.instance }} is down'
summary: "{{`The RabbitMQ Server instance at {{ $labels.instance }} has been down the last 10 mins`}}" summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down the last 10 mins'
- alert: rabbitmq_file_descriptor_usage_high - alert: rabbitmq_file_descriptor_usage_high
expr: fd_used * 100 /fd_total > 80 expr: fd_used * 100 /fd_total > 80
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: "{{`RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.`}}" description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.'
summary: RabbitMQ file descriptors usage is high for last 10 mins summary: 'RabbitMQ file descriptors usage is high for last 10 mins'
- alert: rabbitmq_node_disk_free_alarm - alert: rabbitmq_node_disk_free_alarm
expr: node_disk_free_alarm > 0 expr: node_disk_free_alarm > 0
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: "{{`RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.`}}" description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.'
summary: RabbitMQ disk space usage is high summary: 'RabbitMQ disk space usage is high'
- alert: rabbitmq_node_memory_alarm - alert: rabbitmq_node_memory_alarm
expr: node_mem_alarm > 0 expr: node_mem_alarm > 0
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: "{{`RabbitMQ Server instance {{ $labels.instance }} has low free memory.`}}" description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.'
summary: RabbitMQ memory usage is high summary: 'RabbitMQ memory usage is high'
- alert: rabbitmq_less_than_3_nodes - alert: rabbitmq_less_than_3_nodes
expr: running < 3 expr: running < 3
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: RabbitMQ Server has less than 3 nodes running. description: 'RabbitMQ Server has less than 3 nodes running.'
summary: RabbitMQ server is at risk of losing data summary: 'RabbitMQ server is at risk of losing data'
- alert: rabbitmq_queue_messages_returned_high - alert: rabbitmq_queue_messages_returned_high
expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50 expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: RabbitMQ Server is returning more than 50 percent of messages received. description: 'RabbitMQ Server is returning more than 50 percent of messages received.'
summary: RabbitMQ server is returning more than 50 percent of messages received. summary: 'RabbitMQ server is returning more than 50 percent of messages received.'
- alert: rabbitmq_consumers_low_utilization - alert: rabbitmq_consumers_low_utilization
expr: queue_consumer_utilisation < .4 expr: queue_consumer_utilisation < .4
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: RabbitMQ consumer message consumption speed is low description: 'RabbitMQ consumer message consumption speed is low'
summary: RabbitMQ consumer message consumption speed is low summary: 'RabbitMQ consumer message consumption speed is low'
- alert: rabbitmq_high_message_load - alert: rabbitmq_high_message_load
expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000 expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: RabbitMQ has high message load. Total queue depth > 17000 or queue depth grew by more than 4000 messages over the last 5 minutes. description: 'RabbitMQ has high message load. Total queue depth > 17000 or queue depth grew by more than 4000 messages over the last 5 minutes.'
summary: RabbitMQ has high message load summary: 'RabbitMQ has high message load'
... ...
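With the rules back in plain YAML, a rendered group such as rabbitmq.rules above can be exercised with promtool's rule unit tests. Below is a minimal sketch against the rabbitmq_down rule; the rabbitmq.rules file name, the rmq-0 instance label and the timings are illustrative, and it assumes the rendered file keeps the groups: layout shown in these values.

# Hypothetical unit test, run with: promtool test rules rabbitmq_down_test.yaml
rule_files:
  - rabbitmq.rules
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # rabbitmq_up stays at 0 from 0m through 15m (16 samples)
      - series: 'rabbitmq_up{instance="rmq-0"}'
        values: '0x15'
    alert_rule_test:
      - eval_time: 15m
        alertname: rabbitmq_down
        exp_alerts:
          - exp_labels:
              severity: page
              instance: rmq-0
            exp_annotations:
              description: 'RabbitMQ Server instance rmq-0 is down'
              summary: 'The RabbitMQ Server instance at rmq-0 has been down the last 10 mins'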
View File
@ -20,7 +20,7 @@ conf:
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: "{{`Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }}`}}" description: Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }}
title: Postgres Replication lag is over 2 minutes title: Postgres Replication lag is over 2 minutes
- alert: pg_connections_too_high - alert: pg_connections_too_high
expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95 expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95
@ -29,13 +29,13 @@ conf:
severity: warn severity: warn
channel: database channel: database
annotations: annotations:
description: "{{`Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum`}}" title: Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum
- alert: pg_deadlocks_detected - alert: pg_deadlocks_detected
expr: sum by(datname) (rate(pg_stat_database_deadlocks[1m])) > 0 expr: sum by(datname) (rate(pg_stat_database_deadlocks[1m])) > 0
for: 5m for: 5m
labels: labels:
severity: warn severity: warn
annotations: annotations:
description: "{{`postgresql at {{$labels.instance}} is showing a deadlock rate of {{$value}} for database {{$labels.datname}}`}}" description: postgresql at {{$labels.instance}} is showing a deadlock rate of {{$value}} for database {{$labels.datname}}
title: Postgres server is experiencing deadlocks title: Postgres server is experiencing deadlocks
... ...
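For pg_connections_too_high above, the comparison is made host by host through on(fqdn): each host's summed session count is checked against 95% of that same host's max_connections. A hedged walk-through of one evaluation, with invented numbers:

# sum(pg_stat_activity_count) by (environment, fqdn)
#   {environment="prod", fqdn="pg-0"}  =>  970
# pg_settings_max_connections * 0.95
#   {fqdn="pg-0"}                      =>  950   (max_connections = 1000)
# Matched on fqdn, 970 > 950 holds for pg-0, so pg_connections_too_high becomes
# active for that host and fires once any configured "for" window has elapsed.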