From d408bed90d44666169d47e39b9288f060ba6edc3 Mon Sep 17 00:00:00 2001
From: Steven Fitzpatrick
Date: Fri, 8 Nov 2019 11:20:29 -0600
Subject: [PATCH] Prometheus: Status Alerts Scalar/Vector Conversion

This change converts the exporter-availability alert expressions, which
relied on absent(<metric>) over instant vectors, to use range aggregate
functions of the form avg_over_time(up{job="..."}[5m]) == 0 instead.

Also in this change:
- The `for` clause of the converted alerts drops from 10m to 5m
  (prom_exporter_calico_unavailable keeps 10m).
- The elasticsearch rule group is renamed to
  elasticsearch.alerting_rules.
- A prom_exporter_fluentd_unavailable alert is added in a new
  fluentd.alerting_rules group.
- prom_exporter_mariadb_unavailable is split into one alert per
  namespace (openstack and osh-infra).

Change-Id: I4df757f961524bed23b6a6ad361779c1749ca2c5
Co-Authored-By: Meghan Heisler
---
Review notes (ignored by git am; not part of the commit message):
- NOTE(review): prom_exporter_calico_unavailable keeps `for: 10m` while
  every other alert converted here moves to `for: 5m`. Confirm this is
  intentional.
- NOTE(review): `up{...} == 0` only matches targets that are still being
  scraped. Unlike absent(), it returns nothing when the job has no
  targets at all. Confirm that case is covered elsewhere.

 .../values_overrides/elasticsearch.yaml       | 18 ++++++++++++++---
 prometheus/values_overrides/kubernetes.yaml   |  6 +++---
 prometheus/values_overrides/nodes.yaml        |  4 ++--
 prometheus/values_overrides/openstack.yaml    | 20 +++++++++++++------
 prometheus/values_overrides/postgresql.yaml   |  4 ++--
 5 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/prometheus/values_overrides/elasticsearch.yaml b/prometheus/values_overrides/elasticsearch.yaml
index ca185a2e1..d009eba1e 100644
--- a/prometheus/values_overrides/elasticsearch.yaml
+++ b/prometheus/values_overrides/elasticsearch.yaml
@@ -3,11 +3,11 @@ conf:
     rules:
       elasticsearch:
         groups:
-        - name: elasticsearch.rules
+        - name: elasticsearch.alerting_rules
           rules:
           - alert: prom_exporter_elasticsearch_unavailable
-            expr: absent(elasticsearch_cluster_health_status)
-            for: 10m
+            expr: avg_over_time(up{job="elasticsearch-exporter"}[5m]) == 0
+            for: 5m
             labels:
               severity: warning
             annotations:
@@ -85,3 +85,15 @@ conf:
             annotations:
               description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
               summary: 'ElasticSearch running on less than 3 data nodes'
+      fluentd:
+        groups:
+        - name: fluentd.alerting_rules
+          rules:
+          - alert: prom_exporter_fluentd_unavailable
+            expr: avg_over_time(up{job="fluentd-daemonset-exporter"}[5m]) == 0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes
+              title: Fluentd exporter is not collecting metrics or is not available
diff --git a/prometheus/values_overrides/kubernetes.yaml b/prometheus/values_overrides/kubernetes.yaml
index 638722a82..fb4b75325 100644
--- a/prometheus/values_overrides/kubernetes.yaml
+++ b/prometheus/values_overrides/kubernetes.yaml
@@ -6,7 +6,7 @@ conf:
         - name: calico.rules
           rules:
           - alert: prom_exporter_calico_unavailable
-            expr: absent(felix_host)
+            expr: avg_over_time(up{job="kubernetes-pods",application="calico"}[5m]) == 0
             for: 10m
             labels:
               severity: warning
@@ -250,8 +250,8 @@ conf:
         - name: kubernetes-object.rules
           rules:
           - alert: prom_exporter_kube_state_metrics_unavailable
-            expr: absent(kube_node_info)
-            for: 10m
+            expr: avg_over_time(up{job="kube-state-metrics"}[5m]) == 0
+            for: 5m
             labels:
               severity: warning
             annotations:
diff --git a/prometheus/values_overrides/nodes.yaml b/prometheus/values_overrides/nodes.yaml
index dbde76075..81497bf66 100644
--- a/prometheus/values_overrides/nodes.yaml
+++ b/prometheus/values_overrides/nodes.yaml
@@ -6,8 +6,8 @@ conf:
         - name: nodes.rules
           rules:
           - alert: prom_exporter_node_unavailable
-            expr: absent(node_uname_info)
-            for: 10m
+            expr: avg_over_time(up{job="node-exporter"}[5m]) == 0
+            for: 5m
             labels:
               severity: warning
             annotations:
diff --git a/prometheus/values_overrides/openstack.yaml b/prometheus/values_overrides/openstack.yaml
index 4c38a6a5d..da8e6702e 100644
--- a/prometheus/values_overrides/openstack.yaml
+++ b/prometheus/values_overrides/openstack.yaml
@@ -5,13 +5,21 @@ conf:
         groups:
         - name: mariadb.rules
           rules:
-          - alert: prom_exporter_mariadb_unavailable
-            expr: absent(mysql_up)
-            for: 10m
+          - alert: prom_exporter_mariadb_openstack_unavailable
+            expr: avg_over_time(up{job="mysql-exporter",kubernetes_namespace="openstack"}[5m]) == 0
+            for: 5m
             labels:
               severity: warning
             annotations:
-              description: MariaDB exporter is not collecting metrics or is not available for past 10 minutes
+              description: MariaDB exporter in {{ $labels.kubernetes_namespace }} is not collecting metrics or is not available for past 10 minutes
+              title: MariaDB exporter is not collecting metrics or is not available
+          - alert: prom_exporter_mariadb_osh_infra_unavailable
+            expr: avg_over_time(up{job="mysql-exporter",kubernetes_namespace="osh-infra"}[5m]) == 0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              description: MariaDB exporter in {{ $labels.kubernetes_namespace }} is not collecting metrics or is not available for past 10 minutes
               title: MariaDB exporter is not collecting metrics or is not available
           - alert: mariadb_table_lock_wait_high
             expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30
@@ -48,8 +56,8 @@ conf:
         - name: openstack.rules
           rules:
           - alert: prom_exporter_openstack_unavailable
-            expr: absent(openstack_exporter_cache_refresh_duration_seconds)
-            for: 10m
+            expr: avg_over_time(up{job="openstack-metrics"}[5m]) == 0
+            for: 5m
             labels:
               severity: warning
             annotations:
diff --git a/prometheus/values_overrides/postgresql.yaml b/prometheus/values_overrides/postgresql.yaml
index 9e83ee92a..22fe481e1 100644
--- a/prometheus/values_overrides/postgresql.yaml
+++ b/prometheus/values_overrides/postgresql.yaml
@@ -6,8 +6,8 @@ conf:
         - name: postgresql.rules
           rules:
           - alert: prom_exporter_postgresql_unavailable
-            expr: absent(pg_static)
-            for: 10m
+            expr: avg_over_time(up{job="postgresql-exporter"}[5m]) == 0
+            for: 5m
             labels:
               severity: warning
             annotations: