Merge "Prometheus: Update pod container status alerts"

This commit is contained in:
Zuul 2019-01-25 19:39:47 +00:00 committed by Gerrit Code Review
commit d1b77b2bea
2 changed files with 37 additions and 3 deletions

View File

@@ -526,6 +526,12 @@ conf:
service_description: "Daemonset_not-scheduled"
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
check_interval: 60
# Service check driven by the "daemonset_pods_unavailable" alert.
# check_command fields are "!"-separated: the command, the alert name, then
# what appear to be the CRITICAL and OK messages (confirm against the
# check_prom_alert command definition).
- check_daemonset_unavailable:
    hostgroup_name: prometheus-hosts
    use: notifying_service
    check_interval: 60
    service_description: 'Daemonset_pods-unavailable'
    check_command: >-
      check_prom_alert!daemonset_pods_unavailable!CRITICAL- Daemonset {daemonset} has pods unavailable!OK- All daemonset pods available
- check_deployment_replicas_unavailable:
use: notifying_service
hostgroup_name: prometheus-hosts
@@ -562,6 +568,18 @@ conf:
service_description: "Pod_status-error-image-pull"
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
check_interval: 60
# Service check for the "pod_status_error_image_pull_backoff" alert
# (containers waiting with reason ImagePullBackOff).
- check_pod_status_error_image_pull_backoff:
    use: notifying_service
    hostgroup_name: prometheus-hosts
    # Distinct from the ErrImagePull check's "Pod_status-error-image-pull" so
    # the two services on this hostgroup do not share one description.
    service_description: "Pod_status-error-image-pull-backoff"
    # "!"-separated fields: command, alert name, then what appear to be the
    # CRITICAL and OK messages. No whitespace after "!" so the alert name is
    # passed without a leading space, matching the neighbouring checks.
    check_command: check_prom_alert!pod_status_error_image_pull_backoff!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of ImagePullBackOff for more than 10 minutes!OK- No pods in error status
    check_interval: 60
# Service check for the "pod_error_config_error" alert
# (containers waiting with reason CreateContainerConfigError).
- check_pod_status_error_container_config_error:
    use: notifying_service
    hostgroup_name: prometheus-hosts
    # Distinct from the ErrImagePull check's "Pod_status-error-image-pull" so
    # this service does not share a description with the image-pull checks.
    service_description: "Pod_status-error-container-config-error"
    # "!"-separated fields: command, alert name, then what appear to be the
    # CRITICAL and OK messages. No whitespace after "!" so the alert name is
    # passed without a leading space, matching the neighbouring checks.
    check_command: check_prom_alert!pod_error_config_error!CRITICAL- Pod {pod} in namespace {namespace} has been in error status of CreateContainerConfigError for more than 10 minutes!OK- No pods in error status
    check_interval: 60
- check_pod_error_crash_loop_back_off:
use: notifying_service
hostgroup_name: prometheus-hosts

View File

@@ -1300,6 +1300,14 @@ conf:
annotations:
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
summary: 'Less than desired number of daemonsets scheduled'
# Alert on kube_daemonset_status_number_unavailable staying above zero for
# the 10m "for" window; labelled severity "warning".
- alert: daemonset_pods_unavailable
  expr: 'kube_daemonset_status_number_unavailable > 0'
  for: 10m
  annotations:
    summary: "Daemonset pods unavailable, due to one of many reasons"
    description: "Daemonset {{$labels.daemonset}} currently has pods unavailable"
  labels:
    severity: warning
- alert: deployment_replicas_unavailable
expr: kube_deployment_status_replicas_unavailable > 0
for: 10m
@@ -1340,13 +1348,13 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
# NOTE(review): this span looks like a rendered diff hunk whose +/- markers
# were lost: "alert", "expr" and "description" each appear twice. Presumably
# the first of each pair is the removed ErrImagePull line and the second is
# its ImagePullBackOff replacement. Confirm against the commit before reading
# this as one YAML mapping, because duplicate keys are invalid in YAML.
- alert: pod_status_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
- alert: pod_status_error_image_pull_backoff
expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: pod_error_crash_loop_back_off
expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
@@ -1356,6 +1364,14 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
# Alert on a container whose waiting reason is CreateContainerConfigError
# (series value == 1) for the 10m "for" window; labelled severity "page".
- alert: pod_error_config_error
  expr: 'kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1'
  for: 10m
  annotations:
    summary: "Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status"
    description: "Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes"
  labels:
    severity: page
- alert: replicaset_missing_replicas
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
for: 10m