Prometheus: Update values to be yaml rather than freeform text
This PS updates the Prometheus values to use YAML rather than freeform text. It also consolidates all configuration into a single `etc` configmap, in line with other OSH charts.

Change-Id: I162d4817a2b1b842499ef27d754707f8fce23bf3
This commit is contained in:
parent 558ed8cd27
commit 3b6596c56e
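In practice, the chart's configuration now lives as structured YAML under `conf` and is serialized back into file content by the configmap template. A minimal sketch of the pattern, using keys that appear in the diff below (the excerpt is illustrative, not the full chart):

```yaml
# values.yaml -- configuration held as structured YAML instead of a freeform text block
conf:
  alertmanager:
    global:
      # The smarthost and SMTP sender used for mail notifications.
      smtp_smarthost: 'localhost:25'

# configmap-etc.yaml -- the structured values are rendered back into a text
# document with toYaml; the "|+" block scalar preserves trailing newlines.
data:
  alertmanager.yml: |+
{{ toYaml .Values.conf.alertmanager | indent 4 }}
```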
@@ -22,8 +22,8 @@ kind: ConfigMap
metadata:
name: alertmanager-etc
data:
alertmanager.yml:
{{- toYaml .Values.conf.alertmanager | indent 4 }}
alert-templates.tmpl:
{{- toYaml .Values.conf.alert_templates | indent 4 }}
alertmanager.yml: |+
{{ toYaml .Values.conf.alertmanager | indent 4 }}
alert-templates.tmpl: |+
{{ toYaml .Values.conf.alert_templates | indent 4 }}
{{- end }}

@@ -162,7 +162,7 @@ conf:
path: /var/lib/alertmanager/data
mesh:
listen_address: "0.0.0.0:6783"
alertmanager: |
alertmanager:
global:
# The smarthost and SMTP sender used for mail notifications.
smtp_smarthost: 'localhost:25'
@@ -181,7 +181,10 @@ conf:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
group_by: ['alertname', 'cluster', 'service']
group_by:
- alertname
- cluster
- service
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This way ensures that you get multiple alerts for the same group that start
@@ -225,7 +228,10 @@ conf:
service: database
receiver: team-DB-pager
# Also group alerts by affected database.
group_by: [alertname, cluster, database]
group_by:
- alertname
- cluster
- database
routes:
- match:
owner: team-X
@@ -243,7 +249,10 @@ conf:
target_match:
severity: 'warning'
# Apply inhibition if the alertname is the same.
equal: ['alertname', 'cluster', 'service']
equal:
- alertname
- cluster
- service
receivers:
- name: 'team-X-mails'
email_configs:
@@ -22,6 +22,28 @@ kind: ConfigMap
metadata:
name: prometheus-etc
data:
prometheus.yml:
{{- toYaml .Values.conf.prometheus.scrape_configs | indent 4 }}
prometheus.yml: |+
{{ toYaml .Values.conf.prometheus.scrape_configs | indent 4 }}
alertmanager.rules: |+
{{ toYaml .Values.conf.prometheus.rules.alertmanager | indent 4 }}
etcd3.rules: |+
{{ toYaml .Values.conf.prometheus.rules.etcd3 | indent 4 }}
kube-apiserver.rules: |+
{{ toYaml .Values.conf.prometheus.rules.kube_apiserver | indent 4 }}
kube-controller-manager.rules: |+
{{ toYaml .Values.conf.prometheus.rules.kube_controller_manager | indent 4 }}
kubelet.rules: |+
{{ toYaml .Values.conf.prometheus.rules.kubelet | indent 4 }}
kubernetes.rules: |+
{{ toYaml .Values.conf.prometheus.rules.kubernetes | indent 4 }}
rabbitmq.rules: |+
{{ toYaml .Values.conf.prometheus.rules.rabbitmq | indent 4 }}
mysql.rules: |+
{{ toYaml .Values.conf.prometheus.rules.mysql | indent 4 }}
ceph.rules: |+
{{ toYaml .Values.conf.prometheus.rules.ceph | indent 4 }}
openstack.rules: |+
{{ toYaml .Values.conf.prometheus.rules.openstack | indent 4 }}
custom.rules: |+
{{ toYaml .Values.conf.prometheus.rules.custom | indent 4 }}
{{- end }}
@@ -1,47 +0,0 @@
{{/*
Copyright 2017 The Openstack-Helm Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}

{{- if .Values.manifests.configmap_rules }}
{{- $envAll := . }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-rules
data:
alertmanager.rules:
{{ toYaml .Values.conf.prometheus.rules.alertmanager | indent 4 }}
etcd3.rules:
{{ toYaml .Values.conf.prometheus.rules.etcd3 | indent 4 }}
kube-apiserver.rules:
{{ toYaml .Values.conf.prometheus.rules.kube_apiserver | indent 4 }}
kube-controller-manager.rules:
{{ toYaml .Values.conf.prometheus.rules.kube_controller_manager | indent 4 }}
kubelet.rules:
{{ toYaml .Values.conf.prometheus.rules.kubelet | indent 4 }}
kubernetes.rules:
{{ toYaml .Values.conf.prometheus.rules.kubernetes | indent 4 }}
rabbitmq.rules:
{{ toYaml .Values.conf.prometheus.rules.rabbitmq | indent 4 }}
mysql.rules:
{{ toYaml .Values.conf.prometheus.rules.mysql | indent 4 }}
ceph.rules:
{{ toYaml .Values.conf.prometheus.rules.ceph | indent 4 }}
openstack.rules:
{{ toYaml .Values.conf.prometheus.rules.openstack | indent 4 }}
custom.rules:
{{ toYaml .Values.conf.prometheus.rules.custom | indent 4 }}
{{- end }}
@@ -42,7 +42,6 @@ spec:
annotations:
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
configmap-rules-hash: {{ tuple "configmap-rules.yaml" . | include "helm-toolkit.utils.hash" }}
spec:
serviceAccountName: {{ $serviceAccountName }}
affinity:
@@ -79,47 +78,47 @@ spec:
mountPath: /etc/config
- name: rulesprometheus
mountPath: /etc/config/rules
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/alertmanager.rules
subPath: alertmanager.rules
readOnly: true
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/etcd3.rules
subPath: etcd3.rules
readOnly: true
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/kubernetes.rules
subPath: kubernetes.rules
readOnly: true
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/kube-apiserver.rules
subPath: kube-apiserver.rules
readOnly: true
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/kube-controller-manager.rules
subPath: kube-controller-manager.rules
readOnly: true
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/kubelet.rules
subPath: kubelet.rules
readOnly: true
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/rabbitmq.rules
subPath: rabbitmq.rules
readOnly: true
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/mysql.rules
subPath: mysql.rules
readOnly: true
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/ceph.rules
subPath: ceph.rules
readOnly: true
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/openstack.rules
subPath: openstack.rules
readOnly: true
- name: prometheus-rules
- name: prometheus-etc
mountPath: /etc/config/rules/custom.rules
subPath: custom.rules
readOnly: true
@@ -139,9 +138,6 @@ spec:
emptyDir: {}
- name: rulesprometheus
emptyDir: {}
- name: prometheus-rules
configMap:
name: prometheus-rules
- name: prometheus-etc
configMap:
name: prometheus-etc
@@ -171,7 +171,6 @@ manifests:
clusterrolebinding: true
configmap_bin: true
configmap_etc: true
configmap_rules: true
ingress_prometheus: true
helm_tests: true
job_image_repo_sync: true
@@ -194,7 +193,7 @@ conf:
timeout: 2m
web_admin_api:
enabled: true
scrape_configs: |
scrape_configs:
global:
scrape_interval: 25s
evaluation_interval: 10s
@@ -231,11 +230,13 @@ conf:
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
- source_labels:
- __meta_kubernetes_node_name
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
- source_labels: [__meta_kubernetes_node_name]
- source_labels:
- __meta_kubernetes_node_name
action: replace
target_label: kubernetes_io_hostname
# Scrape config for Kubelet cAdvisor.
@@ -273,21 +274,25 @@ conf:
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
- source_labels:
- __meta_kubernetes_node_name
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- source_labels: [__meta_kubernetes_node_name]
- source_labels:
- __meta_kubernetes_node_name
action: replace
target_label: kubernetes_io_hostname
metric_relabel_configs:
- action: replace
source_labels: [id]
source_labels:
- id
regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
target_label: rkt_container_name
replacement: '${2}-${1}'
- action: replace
source_labels: [id]
source_labels:
- id
regex: '^/system\.slice/(.+)\.service$'
target_label: systemd_service_name
replacement: '${1}'
@@ -325,7 +330,10 @@ conf:
# will add targets for each API server which Kubernetes adds an endpoint to
# the default/kubernetes service.
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
- source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
action: keep
regex: default;kubernetes;https
# Scrape config for service endpoints.
@@ -344,32 +352,39 @@ conf:
- role: endpoints
scrape_interval: 60s
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
- source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
- source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
- source_labels:
- __meta_kubernetes_namespace
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
- source_labels:
- __meta_kubernetes_service_name
action: replace
target_label: kubernetes_name
- source_labels:
- __meta_kubernetes_service_name
- __meta_kubernetes_service_name
target_label: job
replacement: ${1}
- job_name: calico-etcd
@@ -382,25 +397,25 @@ conf:
regex: __meta_kubernetes_service_label_(.+)
- action: keep
source_labels:
- __meta_kubernetes_service_name
- __meta_kubernetes_service_name
regex: "calico-etcd"
- action: keep
source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_namespace
regex: kube-system
target_label: namespace
- source_labels:
- __meta_kubernetes_pod_name
- __meta_kubernetes_pod_name
target_label: pod
- source_labels:
- __meta_kubernetes_service_name
- __meta_kubernetes_service_name
target_label: service
- source_labels:
- __meta_kubernetes_service_name
- __meta_kubernetes_service_name
target_label: job
replacement: ${1}
- source_labels:
- __meta_kubernetes_service_label
- __meta_kubernetes_service_label
target_label: job
regex: calico-etcd
replacement: ${1}
@@ -411,40 +426,38 @@ conf:
- kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_name]
- source_labels:
- __meta_kubernetes_pod_label_name
regex: alertmanager
action: keep
- source_labels: [__meta_kubernetes_namespace]
- source_labels:
- __meta_kubernetes_namespace
regex: openstack
action: keep
- source_labels: [__meta_kubernetes_pod_container_port_number]
- source_labels:
- __meta_kubernetes_pod_container_port_number
regex:
action: drop
rules:
alertmanager: |-
alertmanager:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
"alertmanager-$1", "alertmanager", "(.*)") != 1
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
severity: critical
annotations:
description: The configuration of the instances of the Alertmanager cluster
`{{$labels.service}}` are out of sync.
description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
summary: Alertmanager configurations are inconsistent
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
for: 5m
labels:
severity: warning
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers
disappeared from discovery.
description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
summary: Alertmanager down or not discovered
- alert: FailedReload
expr: alertmanager_config_last_reload_successful == 0
@@ -452,11 +465,9 @@ conf:
labels:
severity: warning
annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
summary: Alertmanager configuration reload has failed

etcd3: |-
etcd3:
groups:
- name: etcd3.rules
rules:
@@ -481,90 +492,73 @@ conf:
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
changes within the last hour
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
/ sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
/ sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m]))
> 0.15
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
}} are slow
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
summary: slow gRPC requests
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
BY (method) > 0.01
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
BY (method) > 0.05
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
summary: slow HTTP requests
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
> 0.15
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} member communication with
{{ $labels.To }} is slow
description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
failures within the last hour
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
> 0.5
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
for: 10m
labels:
severity: warning
@@ -572,16 +566,14 @@ conf:
description: etcd instance {{ $labels.instance }} fync durations are high
summary: high fsync durations
- alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
> 0.25
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} commit durations are high
summary: high commit durations

kube_apiserver: |-
kube_apiserver:
groups:
- name: kube-apiserver.rules
rules:
@@ -591,21 +583,17 @@ conf:
labels:
severity: critical
annotations:
description: Prometheus failed to scrape API server(s), or all API servers have
disappeared from service discovery.
description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
summary: API server unreachable
- alert: K8SApiServerLatency
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
WITHOUT (instance, resource)) / 1e+06 > 1
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
for: 10m
labels:
severity: warning
annotations:
description: 99th percentile Latency for {{ $labels.verb }} requests to the
kube-apiserver is higher than 1s.
description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
summary: Kubernetes apiserver latency is high

kube_controller_manager: |-
kube_controller_manager:
groups:
- name: kube-controller-manager.rules
rules:
@@ -615,12 +603,10 @@ conf:
labels:
severity: critical
annotations:
description: There is no running K8S controller manager. Deployments and replication
controllers are not making progress.
description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down

kubelet: |-
kubelet:
groups:
- name: kubelet.rules
rules:
@@ -630,18 +616,15 @@ conf:
labels:
severity: warning
annotations:
description: The Kubelet on {{ $labels.node }} has not checked in with the API,
or has set itself to NotReady, for more than an hour
description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
summary: Node status is NotReady
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"}
== 0) / count(kube_node_status_ready{condition="true"})) > 0.2
expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady
state).'
description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
summary: Many Kubernetes nodes are Not Ready
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
@@ -652,147 +635,102 @@ conf:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletDown
expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})
> 0.1
expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
for: 1h
labels:
severity: critical
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
have disappeared from service discovery.
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
labels:
severity: warning
annotations:
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
to the limit of 110
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
summary: Kubelet is close to pod limit

kubernetes: |-
kubernetes:
groups:
- name: kubernetes.rules
rules:
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name)
expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:spec_cpu_shares
expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:cpu_usage:rate
expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name)
expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_usage:bytes
expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_working_set:bytes
expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name)
expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_rss:bytes
expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_cache:bytes
expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:disk_usage:bytes
expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
container_name)
expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name, scope, type)
expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
- record: cluster_namespace_controller_pod_container:memory_oom:rate
expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
controller, pod_name, container_name, scope, type)
expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
- record: cluster:memory_allocation:percent
expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
/ sum(machine_memory_bytes) BY (cluster)
expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:memory_used:percent
expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
BY (cluster)
expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:cpu_allocation:percent
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
* ON(cluster, instance) machine_cpu_cores) BY (cluster)
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
- record: cluster:node_cpu_use:percent
expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
BY (cluster)
expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
cluster, job, resource, verb)) / 1e+06
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.99"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
cluster, job, resource, verb)) / 1e+06
expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.9"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
cluster, job, resource, verb)) / 1e+06
expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"

rabbitmq: |-

mysql: |-

ceph: |-

openstack: |-

custom: |-
rabbitmq: null
mysql: null
ceph: null
openstack: null
custom: null
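Because the rule files now live under `conf.prometheus.rules` as structured YAML, a deployer can override or extend them with an ordinary values override rather than editing a text blob. A minimal, hypothetical sketch (the alert shown is illustrative and not part of the chart):

```yaml
# overrides.yaml (hypothetical) -- supply an extra rule group as structured YAML
conf:
  prometheus:
    rules:
      custom:
        groups:
          - name: custom.rules
            rules:
              - alert: ExampleAlwaysFiring   # illustrative rule only
                expr: vector(1)
                labels:
                  severity: warning
```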