From 24fccea832af75f8416ba9e3e2df3f298d510632 Mon Sep 17 00:00:00 2001
From: Peter Sabaini
Date: Wed, 17 Jan 2024 16:50:37 +0100
Subject: [PATCH] Add alerting rules for RGW multisite deployments

Add default prometheus alerting rules for RadosGW multisite
deployments based on the built-in Ceph RGW multisite metrics.

Note that the prometheus_alerts.yml.default rule file is included
for reference only. The ceph-mon charm will utilize the resource file
from https://charmhub.io/ceph-mon/resources/alert-rules for deployment
so that operators can easily customize these rules.

Change-Id: I5a12162d73686963132a952bddd85ec205964de4
---
 .../prometheus_alerts.yml.default             | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/files/prometheus_alert_rules/prometheus_alerts.yml.default b/files/prometheus_alert_rules/prometheus_alerts.yml.default
index a544d41e..e0914e49 100644
--- a/files/prometheus_alert_rules/prometheus_alerts.yml.default
+++ b/files/prometheus_alert_rules/prometheus_alerts.yml.default
@@ -633,3 +633,54 @@ groups:
           oid: "1.3.6.1.4.1.50495.1.2.1.1.2"
           severity: "critical"
           type: "ceph_default"
+  # RadosGW multisite replication alerts, based on the built-in Ceph RGW
+  # multisite (data sync) metrics.
+  - name: "rgwmultisite"
+    rules:
+      - alert: "CephRGWMultisiteFetchError"
+        annotations:
+          description: "Unsuccessful Object Replications from source zone threshold has been exceeded. The threshold is defined as 2 errors per 15min"
+          summary: "Unsuccessful Object Replications from Source Zone Threshold Exceeded"
+        expr: "increase(ceph_data_sync_from_zone_fetch_errors[15m]) > 2"
+        for: "5m"
+        labels:
+          severity: "warning"
+          type: "ceph_default"
+      - alert: "CephRGWMultisitePollError"
+        annotations:
+          description: "Unsuccessful replication log poll requests to source zone threshold has been exceeded. The threshold is defined as 2 errors per 15min"
+          summary: "Unsuccessful Replication Log Poll Requests to Source Zone Threshold Exceeded"
+        expr: "increase(ceph_data_sync_from_zone_poll_errors[15m]) > 2"
+        for: "5m"
+        labels:
+          severity: "warning"
+          type: "ceph_default"
+      - alert: "CephRGWMultisiteFetchErrorCritical"
+        annotations:
+          description: "Critical: Unsuccessful Object Replications from source zone threshold has been exceeded. The threshold is defined as 50 errors per 15min"
+          summary: "Critical: Unsuccessful Object Replications from Source Zone Threshold Exceeded"
+        expr: "increase(ceph_data_sync_from_zone_fetch_errors[15m]) > 50"
+        for: "5m"
+        labels:
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "CephRGWMultisitePollErrorCritical"
+        annotations:
+          description: "Critical: Unsuccessful replication log poll requests to source zone threshold has been exceeded. The threshold is defined as 50 errors per 15min"
+          summary: "Critical: Unsuccessful Replication Log Poll Requests to Source Zone Threshold Exceeded"
+        expr: "increase(ceph_data_sync_from_zone_poll_errors[15m]) > 50"
+        for: "5m"
+        labels:
+          severity: "critical"
+          type: "ceph_default"
+      # NOTE(review): this compares the total poll latency accumulated over
+      # the 15m window, not a per-request average - confirm this is intended.
+      - alert: "CephRGWMultisitePollLatency"
+        annotations:
+          description: "Latency for poll request threshold exceeded. The threshold is defined as 600s latency per 15min"
+          summary: "Poll Request Latency Threshold Exceeded"
+        expr: "increase(ceph_data_sync_from_zone_poll_latency_sum[15m]) > 600"
+        for: "5m"
+        labels:
+          severity: "warning"
+          type: "ceph_default"