openstack-helm-infra/prometheus/values_overrides/ceph.yaml
Steven Fitzpatrick cdd0f33d0c Revert "Prometheus: Render Rules as Templates"
This reverts commit fb7fc87d23.

I first submitted that as a way to add dynamic capability to the
prometheus rules (they infamously don't support ENV variable
substitution there). However, this can be done easily with another solution,
and doing so would clean up the prometheus chart values significantly.

Change-Id: Ibec512d92490798ae5522468b915b49e7746806a
2020-10-06 15:21:18 +00:00

85 lines
3.7 KiB
YAML

---
# Values override for the prometheus chart: defines the Ceph rule file under
# conf.prometheus.rules.ceph. It holds two rule groups: recording rules that
# precompute percentage series, and alerting rules, several of which are
# built on those recorded series.
#
# The `{{ $labels.* }}` placeholders in annotations are Prometheus alert
# templates. NOTE(review): this assumes the chart passes these values through
# verbatim (i.e. does not render them as Helm templates) - confirm.
conf:
  prometheus:
    rules:
      ceph:
        groups:
          - name: ceph.recording_rules
            rules:
              # Used bytes as a percentage of total bytes.
              - record: ceph_cluster_usage_percent
                expr: 100 * (ceph_cluster_total_used_bytes / ceph_cluster_total_bytes)
              # Degraded placement groups as a percentage of all placement groups.
              - record: ceph_placement_group_degrade_percent
                expr: 100 * (ceph_pg_degraded / ceph_pg_total)
              # OSDs reporting up == 0 as a percentage of all OSDs with metadata.
              # NOTE(review): count() over an empty match yields no sample, so
              # this series is presumably absent (not 0) while every OSD is up.
              - record: ceph_osd_down_percent
                expr: 100 * (count(ceph_osd_up == 0) / count(ceph_osd_metadata))
              # OSDs reporting in == 0 as a percentage of all OSDs with metadata.
              # NOTE(review): same empty-count caveat as ceph_osd_down_percent.
              - record: ceph_osd_out_percent
                expr: 100 * (count(ceph_osd_in == 0) / count(ceph_osd_metadata))
          - name: ceph.alerting_rules
            rules:
              # Fires when no ceph_health_status series exists for 10 minutes.
              - alert: prom_exporter_ceph_unavailable
                expr: absent(ceph_health_status)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Ceph exporter is not collecting metrics or is not available for past 10 minutes
                  # `title` is kept for any existing consumer of that key;
                  # `summary` is added so this alert carries the same
                  # annotation keys as every other alert in this group.
                  title: Ceph exporter is not collecting metrics or is not available
                  summary: Ceph exporter is not collecting metrics or is not available
              # The alerts below that have no `for:` clause use a 5m
              # avg_over_time window inside the expression instead.
              - alert: no_active_ceph_mgr
                expr: avg_over_time(up{job="ceph-mgr"}[5m]) == 0
                labels:
                  severity: warning
                annotations:
                  description: 'no ceph active mgr is present or all ceph mgr are down'
                  summary: 'no ceph active mgr is present'
              - alert: ceph_monitor_quorum_low
                expr: ceph_mon_quorum_count < 3
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
                  summary: 'ceph high availability is at risk'
              # Fires when ceph_mon_quorum_status has had no samples in the last 5m.
              - alert: ceph_monitor_quorum_absent
                expr: absent(avg_over_time(ceph_mon_quorum_status[5m]))
                labels:
                  severity: page
                annotations:
                  description: 'ceph monitor quorum has been gone for more than 5 minutes'
                  summary: 'ceph high availability is at risk'
              - alert: ceph_cluster_usage_high
                expr: avg_over_time(ceph_cluster_usage_percent[5m]) > 80
                labels:
                  severity: page
                annotations:
                  description: 'ceph cluster capacity usage more than 80 percent'
                  summary: 'ceph cluster usage is more than 80 percent'
              - alert: ceph_placement_group_degrade_pct_high
                expr: avg_over_time(ceph_placement_group_degrade_percent[5m]) > 80
                labels:
                  severity: critical
                annotations:
                  description: 'ceph placement group degradation is more than 80 percent'
                  summary: 'ceph placement groups degraded'
              - alert: ceph_osd_down_pct_high
                expr: avg_over_time(ceph_osd_down_percent[5m]) > 80
                labels:
                  severity: critical
                annotations:
                  description: 'ceph OSDs down percent is more than 80 percent'
                  summary: 'ceph OSDs down percent is high'
              # Per-OSD alerts: evaluated per ceph_osd_up / ceph_osd_in series,
              # whose labels fill the annotation templates.
              - alert: ceph_osd_down
                expr: avg_over_time(ceph_osd_up[5m]) == 0
                labels:
                  severity: critical
                annotations:
                  description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.'
                  summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}.'
              - alert: ceph_osd_out
                expr: avg_over_time(ceph_osd_in[5m]) == 0
                labels:
                  severity: page
                annotations:
                  description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.'
                  summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}.'
...