Prometheus: Update values to be yaml rather than freeform text
This PS updates the Prometheus values to use yaml rather than freeform text. It also consolidates all configuration into a single `etc` configmap, in line with other OSH charts. Change-Id: I162d4817a2b1b842499ef27d754707f8fce23bf3
This commit is contained in:
parent
558ed8cd27
commit
3b6596c56e
@ -22,8 +22,8 @@ kind: ConfigMap
|
|||||||
metadata:
|
metadata:
|
||||||
name: alertmanager-etc
|
name: alertmanager-etc
|
||||||
data:
|
data:
|
||||||
alertmanager.yml:
|
alertmanager.yml: |+
|
||||||
{{- toYaml .Values.conf.alertmanager | indent 4 }}
|
{{ toYaml .Values.conf.alertmanager | indent 4 }}
|
||||||
alert-templates.tmpl:
|
alert-templates.tmpl: |+
|
||||||
{{- toYaml .Values.conf.alert_templates | indent 4 }}
|
{{ toYaml .Values.conf.alert_templates | indent 4 }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
@ -162,7 +162,7 @@ conf:
|
|||||||
path: /var/lib/alertmanager/data
|
path: /var/lib/alertmanager/data
|
||||||
mesh:
|
mesh:
|
||||||
listen_address: "0.0.0.0:6783"
|
listen_address: "0.0.0.0:6783"
|
||||||
alertmanager: |
|
alertmanager:
|
||||||
global:
|
global:
|
||||||
# The smarthost and SMTP sender used for mail notifications.
|
# The smarthost and SMTP sender used for mail notifications.
|
||||||
smtp_smarthost: 'localhost:25'
|
smtp_smarthost: 'localhost:25'
|
||||||
@ -181,7 +181,10 @@ conf:
|
|||||||
# The labels by which incoming alerts are grouped together. For example,
|
# The labels by which incoming alerts are grouped together. For example,
|
||||||
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
|
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
|
||||||
# be batched into a single group.
|
# be batched into a single group.
|
||||||
group_by: ['alertname', 'cluster', 'service']
|
group_by:
|
||||||
|
- alertname
|
||||||
|
- cluster
|
||||||
|
- service
|
||||||
# When a new group of alerts is created by an incoming alert, wait at
|
# When a new group of alerts is created by an incoming alert, wait at
|
||||||
# least 'group_wait' to send the initial notification.
|
# least 'group_wait' to send the initial notification.
|
||||||
# This way ensures that you get multiple alerts for the same group that start
|
# This way ensures that you get multiple alerts for the same group that start
|
||||||
@ -225,7 +228,10 @@ conf:
|
|||||||
service: database
|
service: database
|
||||||
receiver: team-DB-pager
|
receiver: team-DB-pager
|
||||||
# Also group alerts by affected database.
|
# Also group alerts by affected database.
|
||||||
group_by: [alertname, cluster, database]
|
group_by:
|
||||||
|
- alertname
|
||||||
|
- cluster
|
||||||
|
- database
|
||||||
routes:
|
routes:
|
||||||
- match:
|
- match:
|
||||||
owner: team-X
|
owner: team-X
|
||||||
@ -243,7 +249,10 @@ conf:
|
|||||||
target_match:
|
target_match:
|
||||||
severity: 'warning'
|
severity: 'warning'
|
||||||
# Apply inhibition if the alertname is the same.
|
# Apply inhibition if the alertname is the same.
|
||||||
equal: ['alertname', 'cluster', 'service']
|
equal:
|
||||||
|
- alertname
|
||||||
|
- cluster
|
||||||
|
- service
|
||||||
receivers:
|
receivers:
|
||||||
- name: 'team-X-mails'
|
- name: 'team-X-mails'
|
||||||
email_configs:
|
email_configs:
|
||||||
|
@ -22,6 +22,28 @@ kind: ConfigMap
|
|||||||
metadata:
|
metadata:
|
||||||
name: prometheus-etc
|
name: prometheus-etc
|
||||||
data:
|
data:
|
||||||
prometheus.yml:
|
prometheus.yml: |+
|
||||||
{{- toYaml .Values.conf.prometheus.scrape_configs | indent 4 }}
|
{{ toYaml .Values.conf.prometheus.scrape_configs | indent 4 }}
|
||||||
|
alertmanager.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.alertmanager | indent 4 }}
|
||||||
|
etcd3.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.etcd3 | indent 4 }}
|
||||||
|
kube-apiserver.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.kube_apiserver | indent 4 }}
|
||||||
|
kube-controller-manager.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.kube_controller_manager | indent 4 }}
|
||||||
|
kubelet.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.kubelet | indent 4 }}
|
||||||
|
kubernetes.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.kubernetes | indent 4 }}
|
||||||
|
rabbitmq.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.rabbitmq | indent 4 }}
|
||||||
|
mysql.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.mysql | indent 4 }}
|
||||||
|
ceph.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.ceph | indent 4 }}
|
||||||
|
openstack.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.openstack | indent 4 }}
|
||||||
|
custom.rules: |+
|
||||||
|
{{ toYaml .Values.conf.prometheus.rules.custom | indent 4 }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
@ -1,47 +0,0 @@
|
|||||||
{{/*
|
|
||||||
Copyright 2017 The Openstack-Helm Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/}}
|
|
||||||
|
|
||||||
{{- if .Values.manifests.configmap_rules }}
|
|
||||||
{{- $envAll := . }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: prometheus-rules
|
|
||||||
data:
|
|
||||||
alertmanager.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.alertmanager | indent 4 }}
|
|
||||||
etcd3.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.etcd3 | indent 4 }}
|
|
||||||
kube-apiserver.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.kube_apiserver | indent 4 }}
|
|
||||||
kube-controller-manager.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.kube_controller_manager | indent 4 }}
|
|
||||||
kubelet.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.kubelet | indent 4 }}
|
|
||||||
kubernetes.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.kubernetes | indent 4 }}
|
|
||||||
rabbitmq.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.rabbitmq | indent 4 }}
|
|
||||||
mysql.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.mysql | indent 4 }}
|
|
||||||
ceph.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.ceph | indent 4 }}
|
|
||||||
openstack.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.openstack | indent 4 }}
|
|
||||||
custom.rules:
|
|
||||||
{{ toYaml .Values.conf.prometheus.rules.custom | indent 4 }}
|
|
||||||
{{- end }}
|
|
@ -42,7 +42,6 @@ spec:
|
|||||||
annotations:
|
annotations:
|
||||||
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
|
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
|
||||||
configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
|
configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
|
||||||
configmap-rules-hash: {{ tuple "configmap-rules.yaml" . | include "helm-toolkit.utils.hash" }}
|
|
||||||
spec:
|
spec:
|
||||||
serviceAccountName: {{ $serviceAccountName }}
|
serviceAccountName: {{ $serviceAccountName }}
|
||||||
affinity:
|
affinity:
|
||||||
@ -79,47 +78,47 @@ spec:
|
|||||||
mountPath: /etc/config
|
mountPath: /etc/config
|
||||||
- name: rulesprometheus
|
- name: rulesprometheus
|
||||||
mountPath: /etc/config/rules
|
mountPath: /etc/config/rules
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/alertmanager.rules
|
mountPath: /etc/config/rules/alertmanager.rules
|
||||||
subPath: alertmanager.rules
|
subPath: alertmanager.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/etcd3.rules
|
mountPath: /etc/config/rules/etcd3.rules
|
||||||
subPath: etcd3.rules
|
subPath: etcd3.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/kubernetes.rules
|
mountPath: /etc/config/rules/kubernetes.rules
|
||||||
subPath: kubernetes.rules
|
subPath: kubernetes.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/kube-apiserver.rules
|
mountPath: /etc/config/rules/kube-apiserver.rules
|
||||||
subPath: kube-apiserver.rules
|
subPath: kube-apiserver.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/kube-controller-manager.rules
|
mountPath: /etc/config/rules/kube-controller-manager.rules
|
||||||
subPath: kube-controller-manager.rules
|
subPath: kube-controller-manager.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/kubelet.rules
|
mountPath: /etc/config/rules/kubelet.rules
|
||||||
subPath: kubelet.rules
|
subPath: kubelet.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/rabbitmq.rules
|
mountPath: /etc/config/rules/rabbitmq.rules
|
||||||
subPath: rabbitmq.rules
|
subPath: rabbitmq.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/mysql.rules
|
mountPath: /etc/config/rules/mysql.rules
|
||||||
subPath: mysql.rules
|
subPath: mysql.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/ceph.rules
|
mountPath: /etc/config/rules/ceph.rules
|
||||||
subPath: ceph.rules
|
subPath: ceph.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/openstack.rules
|
mountPath: /etc/config/rules/openstack.rules
|
||||||
subPath: openstack.rules
|
subPath: openstack.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- name: prometheus-rules
|
- name: prometheus-etc
|
||||||
mountPath: /etc/config/rules/custom.rules
|
mountPath: /etc/config/rules/custom.rules
|
||||||
subPath: custom.rules
|
subPath: custom.rules
|
||||||
readOnly: true
|
readOnly: true
|
||||||
@ -139,9 +138,6 @@ spec:
|
|||||||
emptyDir: {}
|
emptyDir: {}
|
||||||
- name: rulesprometheus
|
- name: rulesprometheus
|
||||||
emptyDir: {}
|
emptyDir: {}
|
||||||
- name: prometheus-rules
|
|
||||||
configMap:
|
|
||||||
name: prometheus-rules
|
|
||||||
- name: prometheus-etc
|
- name: prometheus-etc
|
||||||
configMap:
|
configMap:
|
||||||
name: prometheus-etc
|
name: prometheus-etc
|
||||||
|
@ -171,7 +171,6 @@ manifests:
|
|||||||
clusterrolebinding: true
|
clusterrolebinding: true
|
||||||
configmap_bin: true
|
configmap_bin: true
|
||||||
configmap_etc: true
|
configmap_etc: true
|
||||||
configmap_rules: true
|
|
||||||
ingress_prometheus: true
|
ingress_prometheus: true
|
||||||
helm_tests: true
|
helm_tests: true
|
||||||
job_image_repo_sync: true
|
job_image_repo_sync: true
|
||||||
@ -194,7 +193,7 @@ conf:
|
|||||||
timeout: 2m
|
timeout: 2m
|
||||||
web_admin_api:
|
web_admin_api:
|
||||||
enabled: true
|
enabled: true
|
||||||
scrape_configs: |
|
scrape_configs:
|
||||||
global:
|
global:
|
||||||
scrape_interval: 25s
|
scrape_interval: 25s
|
||||||
evaluation_interval: 10s
|
evaluation_interval: 10s
|
||||||
@ -231,11 +230,13 @@ conf:
|
|||||||
regex: __meta_kubernetes_node_label_(.+)
|
regex: __meta_kubernetes_node_label_(.+)
|
||||||
- target_label: __address__
|
- target_label: __address__
|
||||||
replacement: kubernetes.default.svc:443
|
replacement: kubernetes.default.svc:443
|
||||||
- source_labels: [__meta_kubernetes_node_name]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_node_name
|
||||||
regex: (.+)
|
regex: (.+)
|
||||||
target_label: __metrics_path__
|
target_label: __metrics_path__
|
||||||
replacement: /api/v1/nodes/${1}/proxy/metrics
|
replacement: /api/v1/nodes/${1}/proxy/metrics
|
||||||
- source_labels: [__meta_kubernetes_node_name]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_node_name
|
||||||
action: replace
|
action: replace
|
||||||
target_label: kubernetes_io_hostname
|
target_label: kubernetes_io_hostname
|
||||||
# Scrape config for Kubelet cAdvisor.
|
# Scrape config for Kubelet cAdvisor.
|
||||||
@ -273,21 +274,25 @@ conf:
|
|||||||
regex: __meta_kubernetes_node_label_(.+)
|
regex: __meta_kubernetes_node_label_(.+)
|
||||||
- target_label: __address__
|
- target_label: __address__
|
||||||
replacement: kubernetes.default.svc:443
|
replacement: kubernetes.default.svc:443
|
||||||
- source_labels: [__meta_kubernetes_node_name]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_node_name
|
||||||
regex: (.+)
|
regex: (.+)
|
||||||
target_label: __metrics_path__
|
target_label: __metrics_path__
|
||||||
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
|
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
|
||||||
- source_labels: [__meta_kubernetes_node_name]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_node_name
|
||||||
action: replace
|
action: replace
|
||||||
target_label: kubernetes_io_hostname
|
target_label: kubernetes_io_hostname
|
||||||
metric_relabel_configs:
|
metric_relabel_configs:
|
||||||
- action: replace
|
- action: replace
|
||||||
source_labels: [id]
|
source_labels:
|
||||||
|
- id
|
||||||
regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
|
regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
|
||||||
target_label: rkt_container_name
|
target_label: rkt_container_name
|
||||||
replacement: '${2}-${1}'
|
replacement: '${2}-${1}'
|
||||||
- action: replace
|
- action: replace
|
||||||
source_labels: [id]
|
source_labels:
|
||||||
|
- id
|
||||||
regex: '^/system\.slice/(.+)\.service$'
|
regex: '^/system\.slice/(.+)\.service$'
|
||||||
target_label: systemd_service_name
|
target_label: systemd_service_name
|
||||||
replacement: '${1}'
|
replacement: '${1}'
|
||||||
@ -325,7 +330,10 @@ conf:
|
|||||||
# will add targets for each API server which Kubernetes adds an endpoint to
|
# will add targets for each API server which Kubernetes adds an endpoint to
|
||||||
# the default/kubernetes service.
|
# the default/kubernetes service.
|
||||||
relabel_configs:
|
relabel_configs:
|
||||||
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_namespace
|
||||||
|
- __meta_kubernetes_service_name
|
||||||
|
- __meta_kubernetes_endpoint_port_name
|
||||||
action: keep
|
action: keep
|
||||||
regex: default;kubernetes;https
|
regex: default;kubernetes;https
|
||||||
# Scrape config for service endpoints.
|
# Scrape config for service endpoints.
|
||||||
@ -344,28 +352,35 @@ conf:
|
|||||||
- role: endpoints
|
- role: endpoints
|
||||||
scrape_interval: 60s
|
scrape_interval: 60s
|
||||||
relabel_configs:
|
relabel_configs:
|
||||||
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_scrape
|
||||||
action: keep
|
action: keep
|
||||||
regex: true
|
regex: true
|
||||||
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_scheme
|
||||||
action: replace
|
action: replace
|
||||||
target_label: __scheme__
|
target_label: __scheme__
|
||||||
regex: (https?)
|
regex: (https?)
|
||||||
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_path
|
||||||
action: replace
|
action: replace
|
||||||
target_label: __metrics_path__
|
target_label: __metrics_path__
|
||||||
regex: (.+)
|
regex: (.+)
|
||||||
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
|
- source_labels:
|
||||||
|
- __address__
|
||||||
|
- __meta_kubernetes_service_annotation_prometheus_io_port
|
||||||
action: replace
|
action: replace
|
||||||
target_label: __address__
|
target_label: __address__
|
||||||
regex: ([^:]+)(?::\d+)?;(\d+)
|
regex: ([^:]+)(?::\d+)?;(\d+)
|
||||||
replacement: $1:$2
|
replacement: $1:$2
|
||||||
- action: labelmap
|
- action: labelmap
|
||||||
regex: __meta_kubernetes_service_label_(.+)
|
regex: __meta_kubernetes_service_label_(.+)
|
||||||
- source_labels: [__meta_kubernetes_namespace]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_namespace
|
||||||
action: replace
|
action: replace
|
||||||
target_label: kubernetes_namespace
|
target_label: kubernetes_namespace
|
||||||
- source_labels: [__meta_kubernetes_service_name]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_service_name
|
||||||
action: replace
|
action: replace
|
||||||
target_label: kubernetes_name
|
target_label: kubernetes_name
|
||||||
- source_labels:
|
- source_labels:
|
||||||
@ -411,40 +426,38 @@ conf:
|
|||||||
- kubernetes_sd_configs:
|
- kubernetes_sd_configs:
|
||||||
- role: pod
|
- role: pod
|
||||||
relabel_configs:
|
relabel_configs:
|
||||||
- source_labels: [__meta_kubernetes_pod_label_name]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_pod_label_name
|
||||||
regex: alertmanager
|
regex: alertmanager
|
||||||
action: keep
|
action: keep
|
||||||
- source_labels: [__meta_kubernetes_namespace]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_namespace
|
||||||
regex: openstack
|
regex: openstack
|
||||||
action: keep
|
action: keep
|
||||||
- source_labels: [__meta_kubernetes_pod_container_port_number]
|
- source_labels:
|
||||||
|
- __meta_kubernetes_pod_container_port_number
|
||||||
regex:
|
regex:
|
||||||
action: drop
|
action: drop
|
||||||
rules:
|
rules:
|
||||||
alertmanager: |-
|
alertmanager:
|
||||||
groups:
|
groups:
|
||||||
- name: alertmanager.rules
|
- name: alertmanager.rules
|
||||||
rules:
|
rules:
|
||||||
- alert: AlertmanagerConfigInconsistent
|
- alert: AlertmanagerConfigInconsistent
|
||||||
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
|
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
|
||||||
GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
|
|
||||||
"alertmanager-$1", "alertmanager", "(.*)") != 1
|
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: The configuration of the instances of the Alertmanager cluster
|
description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
|
||||||
`{{$labels.service}}` are out of sync.
|
|
||||||
summary: Alertmanager configurations are inconsistent
|
summary: Alertmanager configurations are inconsistent
|
||||||
- alert: AlertmanagerDownOrMissing
|
- alert: AlertmanagerDownOrMissing
|
||||||
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
|
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
|
||||||
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
|
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: An unexpected number of Alertmanagers are scraped or Alertmanagers
|
description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
|
||||||
disappeared from discovery.
|
|
||||||
summary: Alertmanager down or not discovered
|
summary: Alertmanager down or not discovered
|
||||||
- alert: FailedReload
|
- alert: FailedReload
|
||||||
expr: alertmanager_config_last_reload_successful == 0
|
expr: alertmanager_config_last_reload_successful == 0
|
||||||
@ -452,11 +465,9 @@ conf:
|
|||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
|
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
|
||||||
}}/{{ $labels.pod}}.
|
|
||||||
summary: Alertmanager configuration reload has failed
|
summary: Alertmanager configuration reload has failed
|
||||||
|
etcd3:
|
||||||
etcd3: |-
|
|
||||||
groups:
|
groups:
|
||||||
- name: etcd3.rules
|
- name: etcd3.rules
|
||||||
rules:
|
rules:
|
||||||
@ -481,90 +492,73 @@ conf:
|
|||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
|
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
|
||||||
changes within the last hour
|
|
||||||
summary: a high number of leader changes within the etcd cluster are happening
|
summary: a high number of leader changes within the etcd cluster are happening
|
||||||
- alert: HighNumberOfFailedGRPCRequests
|
- alert: HighNumberOfFailedGRPCRequests
|
||||||
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
|
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
|
||||||
/ sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
|
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
|
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
|
||||||
on etcd instance {{ $labels.instance }}'
|
|
||||||
summary: a high number of gRPC requests are failing
|
summary: a high number of gRPC requests are failing
|
||||||
- alert: HighNumberOfFailedGRPCRequests
|
- alert: HighNumberOfFailedGRPCRequests
|
||||||
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
|
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
|
||||||
/ sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
|
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
|
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
|
||||||
on etcd instance {{ $labels.instance }}'
|
|
||||||
summary: a high number of gRPC requests are failing
|
summary: a high number of gRPC requests are failing
|
||||||
- alert: GRPCRequestsSlow
|
- alert: GRPCRequestsSlow
|
||||||
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m]))
|
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
|
||||||
> 0.15
|
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
|
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
|
||||||
}} are slow
|
|
||||||
summary: slow gRPC requests
|
summary: slow gRPC requests
|
||||||
- alert: HighNumberOfFailedHTTPRequests
|
- alert: HighNumberOfFailedHTTPRequests
|
||||||
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
|
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
|
||||||
BY (method) > 0.01
|
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
|
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
|
||||||
instance {{ $labels.instance }}'
|
|
||||||
summary: a high number of HTTP requests are failing
|
summary: a high number of HTTP requests are failing
|
||||||
- alert: HighNumberOfFailedHTTPRequests
|
- alert: HighNumberOfFailedHTTPRequests
|
||||||
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
|
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
|
||||||
BY (method) > 0.05
|
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
|
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
|
||||||
instance {{ $labels.instance }}'
|
|
||||||
summary: a high number of HTTP requests are failing
|
summary: a high number of HTTP requests are failing
|
||||||
- alert: HTTPRequestsSlow
|
- alert: HTTPRequestsSlow
|
||||||
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
|
||||||
> 0.15
|
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
|
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
|
||||||
}} are slow
|
|
||||||
summary: slow HTTP requests
|
summary: slow HTTP requests
|
||||||
- alert: EtcdMemberCommunicationSlow
|
- alert: EtcdMemberCommunicationSlow
|
||||||
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
|
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
|
||||||
> 0.15
|
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: etcd instance {{ $labels.instance }} member communication with
|
description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
|
||||||
{{ $labels.To }} is slow
|
|
||||||
summary: etcd member communication is slow
|
summary: etcd member communication is slow
|
||||||
- alert: HighNumberOfFailedProposals
|
- alert: HighNumberOfFailedProposals
|
||||||
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
|
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
|
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
|
||||||
failures within the last hour
|
|
||||||
summary: a high number of proposals within the etcd cluster are failing
|
summary: a high number of proposals within the etcd cluster are failing
|
||||||
- alert: HighFsyncDurations
|
- alert: HighFsyncDurations
|
||||||
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
|
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
|
||||||
> 0.5
|
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -572,16 +566,14 @@ conf:
|
|||||||
description: etcd instance {{ $labels.instance }} fync durations are high
|
description: etcd instance {{ $labels.instance }} fync durations are high
|
||||||
summary: high fsync durations
|
summary: high fsync durations
|
||||||
- alert: HighCommitDurations
|
- alert: HighCommitDurations
|
||||||
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
|
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
|
||||||
> 0.25
|
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: etcd instance {{ $labels.instance }} commit durations are high
|
description: etcd instance {{ $labels.instance }} commit durations are high
|
||||||
summary: high commit durations
|
summary: high commit durations
|
||||||
|
kube_apiserver:
|
||||||
kube_apiserver: |-
|
|
||||||
groups:
|
groups:
|
||||||
- name: kube-apiserver.rules
|
- name: kube-apiserver.rules
|
||||||
rules:
|
rules:
|
||||||
@ -591,21 +583,17 @@ conf:
|
|||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: Prometheus failed to scrape API server(s), or all API servers have
|
description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
|
||||||
disappeared from service discovery.
|
|
||||||
summary: API server unreachable
|
summary: API server unreachable
|
||||||
- alert: K8SApiServerLatency
|
- alert: K8SApiServerLatency
|
||||||
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
|
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
|
||||||
WITHOUT (instance, resource)) / 1e+06 > 1
|
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: 99th percentile Latency for {{ $labels.verb }} requests to the
|
description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
|
||||||
kube-apiserver is higher than 1s.
|
|
||||||
summary: Kubernetes apiserver latency is high
|
summary: Kubernetes apiserver latency is high
|
||||||
|
kube_controller_manager:
|
||||||
kube_controller_manager: |-
|
|
||||||
groups:
|
groups:
|
||||||
- name: kube-controller-manager.rules
|
- name: kube-controller-manager.rules
|
||||||
rules:
|
rules:
|
||||||
@ -615,12 +603,10 @@ conf:
|
|||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: There is no running K8S controller manager. Deployments and replication
|
description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
|
||||||
controllers are not making progress.
|
|
||||||
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
|
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
|
||||||
summary: Controller manager is down
|
summary: Controller manager is down
|
||||||
|
kubelet:
|
||||||
kubelet: |-
|
|
||||||
groups:
|
groups:
|
||||||
- name: kubelet.rules
|
- name: kubelet.rules
|
||||||
rules:
|
rules:
|
||||||
@ -630,18 +616,15 @@ conf:
|
|||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: The Kubelet on {{ $labels.node }} has not checked in with the API,
|
description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
|
||||||
or has set itself to NotReady, for more than an hour
|
|
||||||
summary: Node status is NotReady
|
summary: Node status is NotReady
|
||||||
- alert: K8SManyNodesNotReady
|
- alert: K8SManyNodesNotReady
|
||||||
expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"}
|
expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
|
||||||
== 0) / count(kube_node_status_ready{condition="true"})) > 0.2
|
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady
|
description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
|
||||||
state).'
|
|
||||||
summary: Many Kubernetes nodes are Not Ready
|
summary: Many Kubernetes nodes are Not Ready
|
||||||
- alert: K8SKubeletDown
|
- alert: K8SKubeletDown
|
||||||
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
|
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
|
||||||
@ -652,147 +635,102 @@ conf:
|
|||||||
description: Prometheus failed to scrape {{ $value }}% of kubelets.
|
description: Prometheus failed to scrape {{ $value }}% of kubelets.
|
||||||
summary: Many Kubelets cannot be scraped
|
summary: Many Kubelets cannot be scraped
|
||||||
- alert: K8SKubeletDown
|
- alert: K8SKubeletDown
|
||||||
expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})
|
expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
|
||||||
> 0.1
|
|
||||||
for: 1h
|
for: 1h
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
|
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
|
||||||
have disappeared from service discovery.
|
|
||||||
summary: Many Kubelets cannot be scraped
|
summary: Many Kubelets cannot be scraped
|
||||||
- alert: K8SKubeletTooManyPods
|
- alert: K8SKubeletTooManyPods
|
||||||
expr: kubelet_running_pod_count > 100
|
expr: kubelet_running_pod_count > 100
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
|
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
|
||||||
to the limit of 110
|
|
||||||
summary: Kubelet is close to pod limit
|
summary: Kubelet is close to pod limit
|
||||||
|
kubernetes:
|
||||||
kubernetes: |-
|
|
||||||
groups:
|
groups:
|
||||||
- name: kubernetes.rules
|
- name: kubernetes.rules
|
||||||
rules:
|
rules:
|
||||||
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
|
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
|
||||||
expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
|
expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
|
||||||
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
|
|
||||||
controller, pod_name, container_name)
|
|
||||||
- record: cluster_namespace_controller_pod_container:spec_cpu_shares
|
- record: cluster_namespace_controller_pod_container:spec_cpu_shares
|
||||||
expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
|
expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
|
||||||
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
|
|
||||||
container_name)
|
|
||||||
- record: cluster_namespace_controller_pod_container:cpu_usage:rate
|
- record: cluster_namespace_controller_pod_container:cpu_usage:rate
|
||||||
expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
|
expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
|
||||||
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
|
|
||||||
controller, pod_name, container_name)
|
|
||||||
- record: cluster_namespace_controller_pod_container:memory_usage:bytes
|
- record: cluster_namespace_controller_pod_container:memory_usage:bytes
|
||||||
expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
|
expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
|
||||||
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
|
|
||||||
container_name)
|
|
||||||
- record: cluster_namespace_controller_pod_container:memory_working_set:bytes
|
- record: cluster_namespace_controller_pod_container:memory_working_set:bytes
|
||||||
expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
|
expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
|
||||||
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
|
|
||||||
controller, pod_name, container_name)
|
|
||||||
- record: cluster_namespace_controller_pod_container:memory_rss:bytes
|
- record: cluster_namespace_controller_pod_container:memory_rss:bytes
|
||||||
expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
|
expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
|
||||||
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
|
|
||||||
container_name)
|
|
||||||
- record: cluster_namespace_controller_pod_container:memory_cache:bytes
|
- record: cluster_namespace_controller_pod_container:memory_cache:bytes
|
||||||
expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
|
expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
|
||||||
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
|
|
||||||
container_name)
|
|
||||||
- record: cluster_namespace_controller_pod_container:disk_usage:bytes
|
- record: cluster_namespace_controller_pod_container:disk_usage:bytes
|
||||||
expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
|
expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
|
||||||
"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
|
|
||||||
container_name)
|
|
||||||
- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
|
- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
|
||||||
expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
|
expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
|
||||||
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
|
|
||||||
controller, pod_name, container_name, scope, type)
|
|
||||||
- record: cluster_namespace_controller_pod_container:memory_oom:rate
|
- record: cluster_namespace_controller_pod_container:memory_oom:rate
|
||||||
expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
|
expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
|
||||||
"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
|
|
||||||
controller, pod_name, container_name, scope, type)
|
|
||||||
- record: cluster:memory_allocation:percent
|
- record: cluster:memory_allocation:percent
|
||||||
expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
|
expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
|
||||||
/ sum(machine_memory_bytes) BY (cluster)
|
|
||||||
- record: cluster:memory_used:percent
|
- record: cluster:memory_used:percent
|
||||||
expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
|
expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
|
||||||
BY (cluster)
|
|
||||||
- record: cluster:cpu_allocation:percent
|
- record: cluster:cpu_allocation:percent
|
||||||
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
|
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
|
||||||
* ON(cluster, instance) machine_cpu_cores) BY (cluster)
|
|
||||||
- record: cluster:node_cpu_use:percent
|
- record: cluster:node_cpu_use:percent
|
||||||
expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
|
expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
|
||||||
BY (cluster)
|
|
||||||
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
|
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
|
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
|
||||||
cluster, job, resource, verb)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.99"
|
quantile: "0.99"
|
||||||
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
|
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
|
expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
|
||||||
cluster, job, resource, verb)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.9"
|
quantile: "0.9"
|
||||||
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
|
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
|
expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
|
||||||
cluster, job, resource, verb)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.5"
|
quantile: "0.5"
|
||||||
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
|
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
|
||||||
BY (le, cluster)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.99"
|
quantile: "0.99"
|
||||||
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
|
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
|
||||||
BY (le, cluster)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.9"
|
quantile: "0.9"
|
||||||
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
|
- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
|
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
|
||||||
BY (le, cluster)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.5"
|
quantile: "0.5"
|
||||||
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
|
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
|
||||||
BY (le, cluster)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.99"
|
quantile: "0.99"
|
||||||
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
|
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
|
||||||
BY (le, cluster)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.9"
|
quantile: "0.9"
|
||||||
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
|
- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
|
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
|
||||||
BY (le, cluster)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.5"
|
quantile: "0.5"
|
||||||
- record: cluster:scheduler_binding_latency:quantile_seconds
|
- record: cluster:scheduler_binding_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
|
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
|
||||||
BY (le, cluster)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.99"
|
quantile: "0.99"
|
||||||
- record: cluster:scheduler_binding_latency:quantile_seconds
|
- record: cluster:scheduler_binding_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
|
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
|
||||||
BY (le, cluster)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.9"
|
quantile: "0.9"
|
||||||
- record: cluster:scheduler_binding_latency:quantile_seconds
|
- record: cluster:scheduler_binding_latency:quantile_seconds
|
||||||
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
|
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
|
||||||
BY (le, cluster)) / 1e+06
|
|
||||||
labels:
|
labels:
|
||||||
quantile: "0.5"
|
quantile: "0.5"
|
||||||
|
rabbitmq: null
|
||||||
rabbitmq: |-
|
mysql: null
|
||||||
|
ceph: null
|
||||||
mysql: |-
|
openstack: null
|
||||||
|
custom: null
|
||||||
ceph: |-
|
|
||||||
|
|
||||||
openstack: |-
|
|
||||||
|
|
||||||
custom: |-
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user