From 87ff958fb828661412290853c5f2528ae0ff9019 Mon Sep 17 00:00:00 2001
From: Steve Wilkerson
Date: Fri, 18 Jan 2019 09:54:18 -0600
Subject: [PATCH] Prometheus: Update pod container status alerts

This updates the Prometheus pod container status alerts. This
ensures there are alerts defined for ImagePullBackOff, ErrImagePull,
and CreateContainerConfigError errors, and adds an alert for
daemonsets that have unavailable pods. This also updates the Nagios
service checks to include correct checks for those alerts.

Change-Id: I91544e7dff8c6aac8c79cd8aa7d8f7bc03adaa9a
---
Review notes (text between the "---" separator and the first diff
header is not part of the commit message):
- The two new pod status checks each get their own
  service_description; both previously reused
  "Pod_status-error-image-pull" from the ErrImagePull check.
- Whitespace after "check_prom_alert!" was removed in the new
  check_command values so the alert name argument has no leading
  space, matching the existing checks.
- "errpr" was corrected to "error" in the added lines only. The
  existing ErrImagePull check still carries the typo; it is a context
  line here and needs a separate change.
- NOTE(review): the YAML indentation in the hunks below was
  reconstructed after the patch had been whitespace-collapsed. Confirm
  with `git apply --check`, or apply with `--ignore-whitespace`.

 nagios/values.yaml     | 18 ++++++++++++++++++
 prometheus/values.yaml | 22 +++++++++++++++++++---
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/nagios/values.yaml b/nagios/values.yaml
index e6daf7609..74e2da9fe 100644
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
@@ -526,6 +526,12 @@ conf:
           service_description: "Daemonset_not-scheduled"
           check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
           check_interval: 60
+      - check_daemonset_unavailable:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Daemonset_pods-unavailable"
+          check_command: check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available
+          check_interval: 60
       - check_deployment_replicas_unavailable:
           use: notifying_service
           hostgroup_name: prometheus-hosts
@@ -562,6 +568,18 @@ conf:
           service_description: "Pod_status-error-image-pull"
           check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
           check_interval: 60
+      - check_pod_status_error_image_pull_backoff:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Pod_status-error-image-pull-backoff"
+          check_command: check_prom_alert!pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status
+          check_interval: 60
+      - check_pod_status_error_container_config_error:
+          use: notifying_service
+          hostgroup_name: prometheus-hosts
+          service_description: "Pod_status-error-container-config-error"
+          check_command: check_prom_alert!pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status
+          check_interval: 60
       - check_pod_error_crash_loop_back_off:
           use: notifying_service
           hostgroup_name: prometheus-hosts
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 28ce99e46..e3675b507 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -1300,6 +1300,14 @@ conf:
             annotations:
               description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
               summary: 'Less than desired number of daemonsets scheduled'
+          - alert: daemonset_pods_unavailable
+            expr: kube_daemonset_status_number_unavailable > 0
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
+              summary: 'Daemonset pods unavailable, due to one of many reasons'
           - alert: deployment_replicas_unavailable
             expr: kube_deployment_status_replicas_unavailable > 0
             for: 10m
@@ -1340,13 +1348,13 @@ conf:
             annotations:
               description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
-          - alert: pod_status_error_image_pull
-            expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
+          - alert: pod_status_error_image_pull_backoff
+            expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
             for: 10m
             labels:
               severity: page
             annotations:
-              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
+              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
           - alert: pod_error_crash_loop_back_off
             expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
@@ -1356,6 +1364,14 @@ conf:
             annotations:
               description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes'
               summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+          - alert: pod_error_config_error
+            expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
+            for: 10m
+            labels:
+              severity: page
+            annotations:
+              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
+              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
           - alert: replicaset_missing_replicas
             expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
             for: 10m