From 47565d2d191f22ad3c09340c9d393d0561b164eb Mon Sep 17 00:00:00 2001 From: "Pai, Radhika (rp592h)" Date: Thu, 11 Jul 2019 20:16:07 +0000 Subject: [PATCH] Nagios: Updated the alert for Ceph OSD Down Earlier, the Nagios alert for Ceph OSDs was percentage based: it was sent only when more than 80 percent of the OSDs were down. >check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent Updated the check in nagios values.yaml to send an alert when even 1 OSD is down: >check_prom_alert!ceph_osd_down!CRITICAL- One or more CEPH OSDs are down for more than 5 minutes!OK- All the CEPH OSDs are up Change-Id: Id24c4a0cca64674890dae3599edc0c90d9534e90 --- nagios/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nagios/values.yaml b/nagios/values.yaml index 6350fcafbf..6865dbd049 100644 --- a/nagios/values.yaml +++ b/nagios/values.yaml @@ -944,7 +944,7 @@ conf: } define service { - check_command check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent + check_command check_prom_alert!ceph_osd_down!CRITICAL- One or more CEPH OSDs are down for more than 5 minutes!OK- All the CEPH OSDs are up check_interval 60 hostgroup_name prometheus-hosts service_description CEPH_OSDs-down