Prometheus: Update values to be yaml rather than freeform text

This PS updates the Prometheus values to use YAML rather than freeform text.

It also consolidates all configuration into a single `etc` configmap,
in line with other OSH charts.

Change-Id: I162d4817a2b1b842499ef27d754707f8fce23bf3
This commit is contained in:
portdirect 2017-12-20 00:03:56 -05:00
parent 558ed8cd27
commit 3b6596c56e
6 changed files with 165 additions and 247 deletions

View File

@ -22,8 +22,8 @@ kind: ConfigMap
metadata: metadata:
name: alertmanager-etc name: alertmanager-etc
data: data:
alertmanager.yml: alertmanager.yml: |+
{{- toYaml .Values.conf.alertmanager | indent 4 }} {{ toYaml .Values.conf.alertmanager | indent 4 }}
alert-templates.tmpl: alert-templates.tmpl: |+
{{- toYaml .Values.conf.alert_templates | indent 4 }} {{ toYaml .Values.conf.alert_templates | indent 4 }}
{{- end }} {{- end }}

View File

@ -162,7 +162,7 @@ conf:
path: /var/lib/alertmanager/data path: /var/lib/alertmanager/data
mesh: mesh:
listen_address: "0.0.0.0:6783" listen_address: "0.0.0.0:6783"
alertmanager: | alertmanager:
global: global:
# The smarthost and SMTP sender used for mail notifications. # The smarthost and SMTP sender used for mail notifications.
smtp_smarthost: 'localhost:25' smtp_smarthost: 'localhost:25'
@ -181,7 +181,10 @@ conf:
# The labels by which incoming alerts are grouped together. For example, # The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group. # be batched into a single group.
group_by: ['alertname', 'cluster', 'service'] group_by:
- alertname
- cluster
- service
# When a new group of alerts is created by an incoming alert, wait at # When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification. # least 'group_wait' to send the initial notification.
# This way ensures that you get multiple alerts for the same group that start # This way ensures that you get multiple alerts for the same group that start
@ -225,7 +228,10 @@ conf:
service: database service: database
receiver: team-DB-pager receiver: team-DB-pager
# Also group alerts by affected database. # Also group alerts by affected database.
group_by: [alertname, cluster, database] group_by:
- alertname
- cluster
- database
routes: routes:
- match: - match:
owner: team-X owner: team-X
@ -243,7 +249,10 @@ conf:
target_match: target_match:
severity: 'warning' severity: 'warning'
# Apply inhibition if the alertname is the same. # Apply inhibition if the alertname is the same.
equal: ['alertname', 'cluster', 'service'] equal:
- alertname
- cluster
- service
receivers: receivers:
- name: 'team-X-mails' - name: 'team-X-mails'
email_configs: email_configs:

View File

@ -22,6 +22,28 @@ kind: ConfigMap
metadata: metadata:
name: prometheus-etc name: prometheus-etc
data: data:
prometheus.yml: prometheus.yml: |+
{{- toYaml .Values.conf.prometheus.scrape_configs | indent 4 }} {{ toYaml .Values.conf.prometheus.scrape_configs | indent 4 }}
alertmanager.rules: |+
{{ toYaml .Values.conf.prometheus.rules.alertmanager | indent 4 }}
etcd3.rules: |+
{{ toYaml .Values.conf.prometheus.rules.etcd3 | indent 4 }}
kube-apiserver.rules: |+
{{ toYaml .Values.conf.prometheus.rules.kube_apiserver | indent 4 }}
kube-controller-manager.rules: |+
{{ toYaml .Values.conf.prometheus.rules.kube_controller_manager | indent 4 }}
kubelet.rules: |+
{{ toYaml .Values.conf.prometheus.rules.kubelet | indent 4 }}
kubernetes.rules: |+
{{ toYaml .Values.conf.prometheus.rules.kubernetes | indent 4 }}
rabbitmq.rules: |+
{{ toYaml .Values.conf.prometheus.rules.rabbitmq | indent 4 }}
mysql.rules: |+
{{ toYaml .Values.conf.prometheus.rules.mysql | indent 4 }}
ceph.rules: |+
{{ toYaml .Values.conf.prometheus.rules.ceph | indent 4 }}
openstack.rules: |+
{{ toYaml .Values.conf.prometheus.rules.openstack | indent 4 }}
custom.rules: |+
{{ toYaml .Values.conf.prometheus.rules.custom | indent 4 }}
{{- end }} {{- end }}

View File

@ -1,47 +0,0 @@
{{/*
Copyright 2017 The Openstack-Helm Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
{{- if .Values.manifests.configmap_rules }}
{{- $envAll := . }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-rules
data:
alertmanager.rules:
{{ toYaml .Values.conf.prometheus.rules.alertmanager | indent 4 }}
etcd3.rules:
{{ toYaml .Values.conf.prometheus.rules.etcd3 | indent 4 }}
kube-apiserver.rules:
{{ toYaml .Values.conf.prometheus.rules.kube_apiserver | indent 4 }}
kube-controller-manager.rules:
{{ toYaml .Values.conf.prometheus.rules.kube_controller_manager | indent 4 }}
kubelet.rules:
{{ toYaml .Values.conf.prometheus.rules.kubelet | indent 4 }}
kubernetes.rules:
{{ toYaml .Values.conf.prometheus.rules.kubernetes | indent 4 }}
rabbitmq.rules:
{{ toYaml .Values.conf.prometheus.rules.rabbitmq | indent 4 }}
mysql.rules:
{{ toYaml .Values.conf.prometheus.rules.mysql | indent 4 }}
ceph.rules:
{{ toYaml .Values.conf.prometheus.rules.ceph | indent 4 }}
openstack.rules:
{{ toYaml .Values.conf.prometheus.rules.openstack | indent 4 }}
custom.rules:
{{ toYaml .Values.conf.prometheus.rules.custom | indent 4 }}
{{- end }}

View File

@ -42,7 +42,6 @@ spec:
annotations: annotations:
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }} configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }} configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
configmap-rules-hash: {{ tuple "configmap-rules.yaml" . | include "helm-toolkit.utils.hash" }}
spec: spec:
serviceAccountName: {{ $serviceAccountName }} serviceAccountName: {{ $serviceAccountName }}
affinity: affinity:
@ -79,47 +78,47 @@ spec:
mountPath: /etc/config mountPath: /etc/config
- name: rulesprometheus - name: rulesprometheus
mountPath: /etc/config/rules mountPath: /etc/config/rules
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/alertmanager.rules mountPath: /etc/config/rules/alertmanager.rules
subPath: alertmanager.rules subPath: alertmanager.rules
readOnly: true readOnly: true
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/etcd3.rules mountPath: /etc/config/rules/etcd3.rules
subPath: etcd3.rules subPath: etcd3.rules
readOnly: true readOnly: true
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/kubernetes.rules mountPath: /etc/config/rules/kubernetes.rules
subPath: kubernetes.rules subPath: kubernetes.rules
readOnly: true readOnly: true
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/kube-apiserver.rules mountPath: /etc/config/rules/kube-apiserver.rules
subPath: kube-apiserver.rules subPath: kube-apiserver.rules
readOnly: true readOnly: true
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/kube-controller-manager.rules mountPath: /etc/config/rules/kube-controller-manager.rules
subPath: kube-controller-manager.rules subPath: kube-controller-manager.rules
readOnly: true readOnly: true
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/kubelet.rules mountPath: /etc/config/rules/kubelet.rules
subPath: kubelet.rules subPath: kubelet.rules
readOnly: true readOnly: true
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/rabbitmq.rules mountPath: /etc/config/rules/rabbitmq.rules
subPath: rabbitmq.rules subPath: rabbitmq.rules
readOnly: true readOnly: true
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/mysql.rules mountPath: /etc/config/rules/mysql.rules
subPath: mysql.rules subPath: mysql.rules
readOnly: true readOnly: true
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/ceph.rules mountPath: /etc/config/rules/ceph.rules
subPath: ceph.rules subPath: ceph.rules
readOnly: true readOnly: true
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/openstack.rules mountPath: /etc/config/rules/openstack.rules
subPath: openstack.rules subPath: openstack.rules
readOnly: true readOnly: true
- name: prometheus-rules - name: prometheus-etc
mountPath: /etc/config/rules/custom.rules mountPath: /etc/config/rules/custom.rules
subPath: custom.rules subPath: custom.rules
readOnly: true readOnly: true
@ -139,9 +138,6 @@ spec:
emptyDir: {} emptyDir: {}
- name: rulesprometheus - name: rulesprometheus
emptyDir: {} emptyDir: {}
- name: prometheus-rules
configMap:
name: prometheus-rules
- name: prometheus-etc - name: prometheus-etc
configMap: configMap:
name: prometheus-etc name: prometheus-etc

View File

@ -171,7 +171,6 @@ manifests:
clusterrolebinding: true clusterrolebinding: true
configmap_bin: true configmap_bin: true
configmap_etc: true configmap_etc: true
configmap_rules: true
ingress_prometheus: true ingress_prometheus: true
helm_tests: true helm_tests: true
job_image_repo_sync: true job_image_repo_sync: true
@ -194,7 +193,7 @@ conf:
timeout: 2m timeout: 2m
web_admin_api: web_admin_api:
enabled: true enabled: true
scrape_configs: | scrape_configs:
global: global:
scrape_interval: 25s scrape_interval: 25s
evaluation_interval: 10s evaluation_interval: 10s
@ -231,11 +230,13 @@ conf:
regex: __meta_kubernetes_node_label_(.+) regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__ - target_label: __address__
replacement: kubernetes.default.svc:443 replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name] - source_labels:
- __meta_kubernetes_node_name
regex: (.+) regex: (.+)
target_label: __metrics_path__ target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics replacement: /api/v1/nodes/${1}/proxy/metrics
- source_labels: [__meta_kubernetes_node_name] - source_labels:
- __meta_kubernetes_node_name
action: replace action: replace
target_label: kubernetes_io_hostname target_label: kubernetes_io_hostname
# Scrape config for Kubelet cAdvisor. # Scrape config for Kubelet cAdvisor.
@ -273,21 +274,25 @@ conf:
regex: __meta_kubernetes_node_label_(.+) regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__ - target_label: __address__
replacement: kubernetes.default.svc:443 replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name] - source_labels:
- __meta_kubernetes_node_name
regex: (.+) regex: (.+)
target_label: __metrics_path__ target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- source_labels: [__meta_kubernetes_node_name] - source_labels:
- __meta_kubernetes_node_name
action: replace action: replace
target_label: kubernetes_io_hostname target_label: kubernetes_io_hostname
metric_relabel_configs: metric_relabel_configs:
- action: replace - action: replace
source_labels: [id] source_labels:
- id
regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$' regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
target_label: rkt_container_name target_label: rkt_container_name
replacement: '${2}-${1}' replacement: '${2}-${1}'
- action: replace - action: replace
source_labels: [id] source_labels:
- id
regex: '^/system\.slice/(.+)\.service$' regex: '^/system\.slice/(.+)\.service$'
target_label: systemd_service_name target_label: systemd_service_name
replacement: '${1}' replacement: '${1}'
@ -325,7 +330,10 @@ conf:
# will add targets for each API server which Kubernetes adds an endpoint to # will add targets for each API server which Kubernetes adds an endpoint to
# the default/kubernetes service. # the default/kubernetes service.
relabel_configs: relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
action: keep action: keep
regex: default;kubernetes;https regex: default;kubernetes;https
# Scrape config for service endpoints. # Scrape config for service endpoints.
@ -344,28 +352,35 @@ conf:
- role: endpoints - role: endpoints
scrape_interval: 60s scrape_interval: 60s
relabel_configs: relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] - source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
action: keep action: keep
regex: true regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
action: replace action: replace
target_label: __scheme__ target_label: __scheme__
regex: (https?) regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] - source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
action: replace action: replace
target_label: __metrics_path__ target_label: __metrics_path__
regex: (.+) regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] - source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
action: replace action: replace
target_label: __address__ target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+) regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2 replacement: $1:$2
- action: labelmap - action: labelmap
regex: __meta_kubernetes_service_label_(.+) regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace] - source_labels:
- __meta_kubernetes_namespace
action: replace action: replace
target_label: kubernetes_namespace target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name] - source_labels:
- __meta_kubernetes_service_name
action: replace action: replace
target_label: kubernetes_name target_label: kubernetes_name
- source_labels: - source_labels:
@ -411,40 +426,38 @@ conf:
- kubernetes_sd_configs: - kubernetes_sd_configs:
- role: pod - role: pod
relabel_configs: relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_name] - source_labels:
- __meta_kubernetes_pod_label_name
regex: alertmanager regex: alertmanager
action: keep action: keep
- source_labels: [__meta_kubernetes_namespace] - source_labels:
- __meta_kubernetes_namespace
regex: openstack regex: openstack
action: keep action: keep
- source_labels: [__meta_kubernetes_pod_container_port_number] - source_labels:
- __meta_kubernetes_pod_container_port_number
regex: regex:
action: drop action: drop
rules: rules:
alertmanager: |- alertmanager:
groups: groups:
- name: alertmanager.rules - name: alertmanager.rules
rules: rules:
- alert: AlertmanagerConfigInconsistent - alert: AlertmanagerConfigInconsistent
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
"alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
description: The configuration of the instances of the Alertmanager cluster description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
`{{$labels.service}}` are out of sync.
summary: Alertmanager configurations are inconsistent summary: Alertmanager configurations are inconsistent
- alert: AlertmanagerDownOrMissing - alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
disappeared from discovery.
summary: Alertmanager down or not discovered summary: Alertmanager down or not discovered
- alert: FailedReload - alert: FailedReload
expr: alertmanager_config_last_reload_successful == 0 expr: alertmanager_config_last_reload_successful == 0
@ -452,11 +465,9 @@ conf:
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
}}/{{ $labels.pod}}.
summary: Alertmanager configuration reload has failed summary: Alertmanager configuration reload has failed
etcd3:
etcd3: |-
groups: groups:
- name: etcd3.rules - name: etcd3.rules
rules: rules:
@ -481,90 +492,73 @@ conf:
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests - alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
/ sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests - alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
/ sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow - alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
> 0.15
for: 10m for: 10m
labels: labels:
severity: critical severity: critical
annotations: annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
}} are slow
summary: slow gRPC requests summary: slow gRPC requests
- alert: HighNumberOfFailedHTTPRequests - alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
BY (method) > 0.01
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing summary: a high number of HTTP requests are failing
- alert: HighNumberOfFailedHTTPRequests - alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
BY (method) > 0.05
for: 5m for: 5m
labels: labels:
severity: critical severity: critical
annotations: annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing summary: a high number of HTTP requests are failing
- alert: HTTPRequestsSlow - alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
> 0.15
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
}} are slow
summary: slow HTTP requests summary: slow HTTP requests
- alert: EtcdMemberCommunicationSlow - alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
> 0.15
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: etcd instance {{ $labels.instance }} member communication with description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
{{ $labels.To }} is slow
summary: etcd member communication is slow summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals - alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
failures within the last hour
summary: a high number of proposals within the etcd cluster are failing summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations - alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
> 0.5
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -572,16 +566,14 @@ conf:
description: etcd instance {{ $labels.instance }} fync durations are high description: etcd instance {{ $labels.instance }} fync durations are high
summary: high fsync durations summary: high fsync durations
- alert: HighCommitDurations - alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
> 0.25
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: etcd instance {{ $labels.instance }} commit durations are high description: etcd instance {{ $labels.instance }} commit durations are high
summary: high commit durations summary: high commit durations
kube_apiserver:
kube_apiserver: |-
groups: groups:
- name: kube-apiserver.rules - name: kube-apiserver.rules
rules: rules:
@ -591,21 +583,17 @@ conf:
labels: labels:
severity: critical severity: critical
annotations: annotations:
description: Prometheus failed to scrape API server(s), or all API servers have description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
disappeared from service discovery.
summary: API server unreachable summary: API server unreachable
- alert: K8SApiServerLatency - alert: K8SApiServerLatency
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
WITHOUT (instance, resource)) / 1e+06 > 1
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: 99th percentile Latency for {{ $labels.verb }} requests to the description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
kube-apiserver is higher than 1s.
summary: Kubernetes apiserver latency is high summary: Kubernetes apiserver latency is high
kube_controller_manager:
kube_controller_manager: |-
groups: groups:
- name: kube-controller-manager.rules - name: kube-controller-manager.rules
rules: rules:
@ -615,12 +603,10 @@ conf:
labels: labels:
severity: critical severity: critical
annotations: annotations:
description: There is no running K8S controller manager. Deployments and replication description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down summary: Controller manager is down
kubelet:
kubelet: |-
groups: groups:
- name: kubelet.rules - name: kubelet.rules
rules: rules:
@ -630,18 +616,15 @@ conf:
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: The Kubelet on {{ $labels.node }} has not checked in with the API, description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
or has set itself to NotReady, for more than an hour
summary: Node status is NotReady summary: Node status is NotReady
- alert: K8SManyNodesNotReady - alert: K8SManyNodesNotReady
expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
== 0) / count(kube_node_status_ready{condition="true"})) > 0.2
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
annotations: annotations:
description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
state).'
summary: Many Kubernetes nodes are Not Ready summary: Many Kubernetes nodes are Not Ready
- alert: K8SKubeletDown - alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
@ -652,147 +635,102 @@ conf:
description: Prometheus failed to scrape {{ $value }}% of kubelets. description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: Many Kubelets cannot be scraped summary: Many Kubelets cannot be scraped
- alert: K8SKubeletDown - alert: K8SKubeletDown
expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
> 0.1
for: 1h for: 1h
labels: labels:
severity: critical severity: critical
annotations: annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
have disappeared from service discovery.
summary: Many Kubelets cannot be scraped summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods - alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100 expr: kubelet_running_pod_count > 100
labels: labels:
severity: warning severity: warning
annotations: annotations:
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
to the limit of 110
summary: Kubelet is close to pod limit summary: Kubelet is close to pod limit
kubernetes:
kubernetes: |-
groups: groups:
- name: kubernetes.rules - name: kubernetes.rules
rules: rules:
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:spec_cpu_shares - record: cluster_namespace_controller_pod_container:spec_cpu_shares
expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
- record: cluster_namespace_controller_pod_container:cpu_usage:rate - record: cluster_namespace_controller_pod_container:cpu_usage:rate
expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_usage:bytes - record: cluster_namespace_controller_pod_container:memory_usage:bytes
expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
- record: cluster_namespace_controller_pod_container:memory_working_set:bytes - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_rss:bytes - record: cluster_namespace_controller_pod_container:memory_rss:bytes
expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
- record: cluster_namespace_controller_pod_container:memory_cache:bytes - record: cluster_namespace_controller_pod_container:memory_cache:bytes
expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
- record: cluster_namespace_controller_pod_container:disk_usage:bytes - record: cluster_namespace_controller_pod_container:disk_usage:bytes
expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name, scope, type)
- record: cluster_namespace_controller_pod_container:memory_oom:rate - record: cluster_namespace_controller_pod_container:memory_oom:rate
expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name, scope, type)
- record: cluster:memory_allocation:percent - record: cluster:memory_allocation:percent
expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
/ sum(machine_memory_bytes) BY (cluster)
- record: cluster:memory_used:percent - record: cluster:memory_used:percent
expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
BY (cluster)
- record: cluster:cpu_allocation:percent - record: cluster:cpu_allocation:percent
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
* ON(cluster, instance) machine_cpu_cores) BY (cluster)
- record: cluster:node_cpu_use:percent - record: cluster:node_cpu_use:percent
expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
BY (cluster)
- record: cluster_resource_verb:apiserver_latency:quantile_seconds - record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
cluster, job, resource, verb)) / 1e+06
labels: labels:
quantile: "0.99" quantile: "0.99"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds - record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
cluster, job, resource, verb)) / 1e+06
labels: labels:
quantile: "0.9" quantile: "0.9"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds - record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
cluster, job, resource, verb)) / 1e+06
labels: labels:
quantile: "0.5" quantile: "0.5"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
BY (le, cluster)) / 1e+06
labels: labels:
quantile: "0.99" quantile: "0.99"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
BY (le, cluster)) / 1e+06
labels: labels:
quantile: "0.9" quantile: "0.9"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
BY (le, cluster)) / 1e+06
labels: labels:
quantile: "0.5" quantile: "0.5"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
BY (le, cluster)) / 1e+06
labels: labels:
quantile: "0.99" quantile: "0.99"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
BY (le, cluster)) / 1e+06
labels: labels:
quantile: "0.9" quantile: "0.9"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
BY (le, cluster)) / 1e+06
labels: labels:
quantile: "0.5" quantile: "0.5"
- record: cluster:scheduler_binding_latency:quantile_seconds - record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
BY (le, cluster)) / 1e+06
labels: labels:
quantile: "0.99" quantile: "0.99"
- record: cluster:scheduler_binding_latency:quantile_seconds - record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
BY (le, cluster)) / 1e+06
labels: labels:
quantile: "0.9" quantile: "0.9"
- record: cluster:scheduler_binding_latency:quantile_seconds - record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
BY (le, cluster)) / 1e+06
labels: labels:
quantile: "0.5" quantile: "0.5"
rabbitmq: null
rabbitmq: |- mysql: null
ceph: null
mysql: |- openstack: null
custom: null
ceph: |-
openstack: |-
custom: |-