diff --git a/prometheus-alertmanager/templates/configmap-etc.yaml b/prometheus-alertmanager/templates/configmap-etc.yaml
index 602a9b990..177b0fb91 100644
--- a/prometheus-alertmanager/templates/configmap-etc.yaml
+++ b/prometheus-alertmanager/templates/configmap-etc.yaml
@@ -22,8 +22,8 @@ kind: ConfigMap
 metadata:
   name: alertmanager-etc
 data:
-  alertmanager.yml:
-{{- toYaml .Values.conf.alertmanager | indent 4 }}
-  alert-templates.tmpl:
-{{- toYaml .Values.conf.alert_templates | indent 4 }}
+  alertmanager.yml: |+
+{{ toYaml .Values.conf.alertmanager | indent 4 }}
+  alert-templates.tmpl: |+
+{{ toYaml .Values.conf.alert_templates | indent 4 }}
 {{- end }}
diff --git a/prometheus-alertmanager/values.yaml b/prometheus-alertmanager/values.yaml
index 7987e968c..6b5b49504 100644
--- a/prometheus-alertmanager/values.yaml
+++ b/prometheus-alertmanager/values.yaml
@@ -162,7 +162,7 @@ conf:
     path: /var/lib/alertmanager/data
   mesh:
     listen_address: "0.0.0.0:6783"
-  alertmanager: |
+  alertmanager:
     global:
       # The smarthost and SMTP sender used for mail notifications.
       smtp_smarthost: 'localhost:25'
@@ -181,7 +181,10 @@ conf:
       # The labels by which incoming alerts are grouped together. For example,
       # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
       # be batched into a single group.
-      group_by: ['alertname', 'cluster', 'service']
+      group_by:
+        - alertname
+        - cluster
+        - service
       # When a new group of alerts is created by an incoming alert, wait at
       # least 'group_wait' to send the initial notification.
       # This way ensures that you get multiple alerts for the same group that start
@@ -225,7 +228,10 @@ conf:
           service: database
         receiver: team-DB-pager
         # Also group alerts by affected database.
-        group_by: [alertname, cluster, database]
+        group_by:
+          - alertname
+          - cluster
+          - database
         routes:
         - match:
             owner: team-X
@@ -243,7 +249,10 @@ conf:
       target_match:
         severity: 'warning'
       # Apply inhibition if the alertname is the same.
-      equal: ['alertname', 'cluster', 'service']
+      equal:
+        - alertname
+        - cluster
+        - service
     receivers:
     - name: 'team-X-mails'
       email_configs:
diff --git a/prometheus/templates/configmap-etc.yaml b/prometheus/templates/configmap-etc.yaml
index 29c472822..5885046b4 100644
--- a/prometheus/templates/configmap-etc.yaml
+++ b/prometheus/templates/configmap-etc.yaml
@@ -22,6 +22,28 @@ kind: ConfigMap
 metadata:
   name: prometheus-etc
 data:
-  prometheus.yml:
-{{- toYaml .Values.conf.prometheus.scrape_configs | indent 4 }}
+  prometheus.yml: |+
+{{ toYaml .Values.conf.prometheus.scrape_configs | indent 4 }}
+  alertmanager.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.alertmanager | indent 4 }}
+  etcd3.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.etcd3 | indent 4 }}
+  kube-apiserver.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.kube_apiserver | indent 4 }}
+  kube-controller-manager.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.kube_controller_manager | indent 4 }}
+  kubelet.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.kubelet | indent 4 }}
+  kubernetes.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.kubernetes | indent 4 }}
+  rabbitmq.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.rabbitmq | indent 4 }}
+  mysql.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.mysql | indent 4 }}
+  ceph.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.ceph | indent 4 }}
+  openstack.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.openstack | indent 4 }}
+  custom.rules: |+
+{{ toYaml .Values.conf.prometheus.rules.custom | indent 4 }}
 {{- end }}
diff --git a/prometheus/templates/configmap-rules.yaml b/prometheus/templates/configmap-rules.yaml
deleted file mode 100644
index d3ed93a02..000000000
--- a/prometheus/templates/configmap-rules.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-{{/*
-Copyright 2017 The Openstack-Helm Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/}}
-
-{{- if .Values.manifests.configmap_rules }}
-{{- $envAll := . }}
----
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: prometheus-rules
-data:
-  alertmanager.rules:
-{{ toYaml .Values.conf.prometheus.rules.alertmanager | indent 4 }}
-  etcd3.rules:
-{{ toYaml .Values.conf.prometheus.rules.etcd3 | indent 4 }}
-  kube-apiserver.rules:
-{{ toYaml .Values.conf.prometheus.rules.kube_apiserver | indent 4 }}
-  kube-controller-manager.rules:
-{{ toYaml .Values.conf.prometheus.rules.kube_controller_manager | indent 4 }}
-  kubelet.rules:
-{{ toYaml .Values.conf.prometheus.rules.kubelet | indent 4 }}
-  kubernetes.rules:
-{{ toYaml .Values.conf.prometheus.rules.kubernetes | indent 4 }}
-  rabbitmq.rules:
-{{ toYaml .Values.conf.prometheus.rules.rabbitmq | indent 4 }}
-  mysql.rules:
-{{ toYaml .Values.conf.prometheus.rules.mysql | indent 4 }}
-  ceph.rules:
-{{ toYaml .Values.conf.prometheus.rules.ceph | indent 4 }}
-  openstack.rules:
-{{ toYaml .Values.conf.prometheus.rules.openstack | indent 4 }}
-  custom.rules:
-{{ toYaml .Values.conf.prometheus.rules.custom | indent 4 }}
-{{- end }}
diff --git a/prometheus/templates/statefulset.yaml b/prometheus/templates/statefulset.yaml
index 0d13dc7e8..9bb2955ef 100644
--- a/prometheus/templates/statefulset.yaml
+++ b/prometheus/templates/statefulset.yaml
@@ -42,7 +42,6 @@ spec:
       annotations:
         configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
         configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
-        configmap-rules-hash: {{ tuple "configmap-rules.yaml" . | include "helm-toolkit.utils.hash" }}
     spec:
       serviceAccountName: {{ $serviceAccountName }}
       affinity:
@@ -79,47 +78,47 @@ spec:
               mountPath: /etc/config
             - name: rulesprometheus
               mountPath: /etc/config/rules
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/alertmanager.rules
               subPath: alertmanager.rules
               readOnly: true
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/etcd3.rules
               subPath: etcd3.rules
               readOnly: true
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/kubernetes.rules
               subPath: kubernetes.rules
               readOnly: true
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/kube-apiserver.rules
               subPath: kube-apiserver.rules
               readOnly: true
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/kube-controller-manager.rules
               subPath: kube-controller-manager.rules
               readOnly: true
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/kubelet.rules
               subPath: kubelet.rules
               readOnly: true
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/rabbitmq.rules
               subPath: rabbitmq.rules
               readOnly: true
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/mysql.rules
               subPath: mysql.rules
               readOnly: true
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/ceph.rules
               subPath: ceph.rules
               readOnly: true
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/openstack.rules
               subPath: openstack.rules
               readOnly: true
-            - name: prometheus-rules
+            - name: prometheus-etc
               mountPath: /etc/config/rules/custom.rules
               subPath: custom.rules
               readOnly: true
@@ -139,9 +138,6 @@ spec:
           emptyDir: {}
         - name: rulesprometheus
           emptyDir: {}
-        - name: prometheus-rules
-          configMap:
-            name: prometheus-rules
         - name: prometheus-etc
           configMap:
             name: prometheus-etc
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 1446c692e..cb8835783 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -171,7 +171,6 @@ manifests:
   clusterrolebinding: true
   configmap_bin: true
   configmap_etc: true
-  configmap_rules: true
   ingress_prometheus: true
   helm_tests: true
   job_image_repo_sync: true
@@ -194,7 +193,7 @@ conf:
       timeout: 2m
     web_admin_api:
       enabled: true
-    scrape_configs: |
+    scrape_configs:
       global:
         scrape_interval: 25s
        evaluation_interval: 10s
@@ -231,11 +230,13 @@ conf:
               regex: __meta_kubernetes_node_label_(.+)
             - target_label: __address__
               replacement: kubernetes.default.svc:443
-            - source_labels: [__meta_kubernetes_node_name]
+            - source_labels:
+                - __meta_kubernetes_node_name
               regex: (.+)
               target_label: __metrics_path__
               replacement: /api/v1/nodes/${1}/proxy/metrics
-            - source_labels: [__meta_kubernetes_node_name]
+            - source_labels:
+                - __meta_kubernetes_node_name
               action: replace
               target_label: kubernetes_io_hostname
         # Scrape config for Kubelet cAdvisor.
@@ -273,21 +274,25 @@ conf:
               regex: __meta_kubernetes_node_label_(.+)
             - target_label: __address__
               replacement: kubernetes.default.svc:443
-            - source_labels: [__meta_kubernetes_node_name]
+            - source_labels:
+                - __meta_kubernetes_node_name
               regex: (.+)
               target_label: __metrics_path__
               replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
-            - source_labels: [__meta_kubernetes_node_name]
+            - source_labels:
+                - __meta_kubernetes_node_name
               action: replace
               target_label: kubernetes_io_hostname
           metric_relabel_configs:
             - action: replace
-              source_labels: [id]
+              source_labels:
+                - id
               regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
               target_label: rkt_container_name
               replacement: '${2}-${1}'
             - action: replace
-              source_labels: [id]
+              source_labels:
+                - id
               regex: '^/system\.slice/(.+)\.service$'
               target_label: systemd_service_name
               replacement: '${1}'
@@ -325,7 +330,10 @@ conf:
          # will add targets for each API server which Kubernetes adds an endpoint to
          # the default/kubernetes service.
          relabel_configs:
-            - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+            - source_labels:
+                - __meta_kubernetes_namespace
+                - __meta_kubernetes_service_name
+                - __meta_kubernetes_endpoint_port_name
               action: keep
               regex: default;kubernetes;https
        # Scrape config for service endpoints.
@@ -344,32 +352,39 @@ conf:
             - role: endpoints
          scrape_interval: 60s
          relabel_configs:
-            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+            - source_labels:
+                - __meta_kubernetes_service_annotation_prometheus_io_scrape
               action: keep
               regex: true
-            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+            - source_labels:
+                - __meta_kubernetes_service_annotation_prometheus_io_scheme
               action: replace
               target_label: __scheme__
               regex: (https?)
-            - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+            - source_labels:
+                - __meta_kubernetes_service_annotation_prometheus_io_path
               action: replace
               target_label: __metrics_path__
               regex: (.+)
-            - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+            - source_labels:
+                - __address__
+                - __meta_kubernetes_service_annotation_prometheus_io_port
               action: replace
               target_label: __address__
               regex: ([^:]+)(?::\d+)?;(\d+)
               replacement: $1:$2
             - action: labelmap
               regex: __meta_kubernetes_service_label_(.+)
-            - source_labels: [__meta_kubernetes_namespace]
+            - source_labels:
+                - __meta_kubernetes_namespace
               action: replace
               target_label: kubernetes_namespace
-            - source_labels: [__meta_kubernetes_service_name]
+            - source_labels:
+                - __meta_kubernetes_service_name
               action: replace
               target_label: kubernetes_name
             - source_labels:
-              - __meta_kubernetes_service_name
+                - __meta_kubernetes_service_name
               target_label: job
               replacement: ${1}
         - job_name: calico-etcd
@@ -382,25 +397,25 @@ conf:
               regex: __meta_kubernetes_service_label_(.+)
             - action: keep
               source_labels:
-              - __meta_kubernetes_service_name
+                - __meta_kubernetes_service_name
               regex: "calico-etcd"
             - action: keep
               source_labels:
-              - __meta_kubernetes_namespace
+                - __meta_kubernetes_namespace
               regex: kube-system
               target_label: namespace
             - source_labels:
-              - __meta_kubernetes_pod_name
+                - __meta_kubernetes_pod_name
               target_label: pod
             - source_labels:
-              - __meta_kubernetes_service_name
+                - __meta_kubernetes_service_name
               target_label: service
             - source_labels:
-              - __meta_kubernetes_service_name
+                - __meta_kubernetes_service_name
               target_label: job
               replacement: ${1}
             - source_labels:
-              - __meta_kubernetes_service_label
+                - __meta_kubernetes_service_label
               target_label: job
               regex: calico-etcd
               replacement: ${1}
@@ -411,40 +426,38 @@ conf:
         - kubernetes_sd_configs:
             - role: pod
           relabel_configs:
-            - source_labels: [__meta_kubernetes_pod_label_name]
+            - source_labels:
+                - __meta_kubernetes_pod_label_name
               regex: alertmanager
               action: keep
-            - source_labels: [__meta_kubernetes_namespace]
+            - source_labels:
+                - __meta_kubernetes_namespace
               regex: openstack
               action: keep
-            - source_labels: [__meta_kubernetes_pod_container_port_number]
+            - source_labels:
+                - __meta_kubernetes_pod_container_port_number
               regex:
               action: drop
     rules:
-      alertmanager: |-
+      alertmanager:
        groups:
        - name: alertmanager.rules
          rules:
          - alert: AlertmanagerConfigInconsistent
-            expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
-              GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
-              "alertmanager-$1", "alertmanager", "(.*)") != 1
+            expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
            for: 5m
            labels:
              severity: critical
            annotations:
-              description: The configuration of the instances of the Alertmanager cluster
-                `{{$labels.service}}` are out of sync.
+              description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
              summary: Alertmanager configurations are inconsistent
          - alert: AlertmanagerDownOrMissing
-            expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
-              "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
+            expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
            for: 5m
            labels:
              severity: warning
            annotations:
-              description: An unexpected number of Alertmanagers are scraped or Alertmanagers
-                disappeared from discovery.
+              description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
              summary: Alertmanager down or not discovered
          - alert: FailedReload
            expr: alertmanager_config_last_reload_successful == 0
@@ -452,11 +465,9 @@ conf:
            labels:
              severity: warning
            annotations:
-              description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
-              }}/{{ $labels.pod}}.
+              description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
              summary: Alertmanager configuration reload has failed
-
-      etcd3: |-
+      etcd3:
        groups:
        - name: etcd3.rules
          rules:
@@ -481,90 +492,73 @@ conf:
            labels:
              severity: warning
            annotations:
-              description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
-                changes within the last hour
+              description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
              summary: a high number of leader changes within the etcd cluster are happening
          - alert: HighNumberOfFailedGRPCRequests
-            expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
-              / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
+            expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
            for: 10m
            labels:
              severity: warning
            annotations:
-              description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
-                on etcd instance {{ $labels.instance }}'
+              description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
              summary: a high number of gRPC requests are failing
          - alert: HighNumberOfFailedGRPCRequests
-            expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
-              / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
+            expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
            for: 5m
            labels:
              severity: critical
            annotations:
-              description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
-                on etcd instance {{ $labels.instance }}'
+              description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
              summary: a high number of gRPC requests are failing
          - alert: GRPCRequestsSlow
-            expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m]))
-              > 0.15
+            expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
            for: 10m
            labels:
              severity: critical
            annotations:
-              description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
-                }} are slow
+              description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
              summary: slow gRPC requests
          - alert: HighNumberOfFailedHTTPRequests
-            expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
-              BY (method) > 0.01
+            expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
            for: 10m
            labels:
              severity: warning
            annotations:
-              description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
-                instance {{ $labels.instance }}'
+              description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
              summary: a high number of HTTP requests are failing
          - alert: HighNumberOfFailedHTTPRequests
-            expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
-              BY (method) > 0.05
+            expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
            for: 5m
            labels:
              severity: critical
            annotations:
-              description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
-                instance {{ $labels.instance }}'
+              description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
              summary: a high number of HTTP requests are failing
          - alert: HTTPRequestsSlow
-            expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
-              > 0.15
+            expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
            for: 10m
            labels:
              severity: warning
            annotations:
-              description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
-                }} are slow
+              description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
              summary: slow HTTP requests
          - alert: EtcdMemberCommunicationSlow
-            expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
-              > 0.15
+            expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
            for: 10m
            labels:
              severity: warning
            annotations:
-              description: etcd instance {{ $labels.instance }} member communication with
-                {{ $labels.To }} is slow
+              description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
              summary: etcd member communication is slow
          - alert: HighNumberOfFailedProposals
            expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
            labels:
              severity: warning
            annotations:
-              description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
-                failures within the last hour
+              description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
              summary: a high number of proposals within the etcd cluster are failing
          - alert: HighFsyncDurations
-            expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
-              > 0.5
+            expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
            for: 10m
            labels:
              severity: warning
@@ -572,16 +566,14 @@ conf:
              description: etcd instance {{ $labels.instance }} fync durations are high
              summary: high fsync durations
          - alert: HighCommitDurations
-            expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
-              > 0.25
+            expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
            for: 10m
            labels:
              severity: warning
            annotations:
              description: etcd instance {{ $labels.instance }} commit durations are high
              summary: high commit durations
-
-      kube_apiserver: |-
+      kube_apiserver:
        groups:
        - name: kube-apiserver.rules
          rules:
@@ -591,21 +583,17 @@ conf:
            labels:
              severity: critical
            annotations:
-              description: Prometheus failed to scrape API server(s), or all API servers have
-                disappeared from service discovery.
+              description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
              summary: API server unreachable
          - alert: K8SApiServerLatency
-            expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
-              WITHOUT (instance, resource)) / 1e+06 > 1
+            expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
            for: 10m
            labels:
              severity: warning
            annotations:
-              description: 99th percentile Latency for {{ $labels.verb }} requests to the
-                kube-apiserver is higher than 1s.
+              description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
              summary: Kubernetes apiserver latency is high
-
-      kube_controller_manager: |-
+      kube_controller_manager:
        groups:
        - name: kube-controller-manager.rules
          rules:
@@ -615,12 +603,10 @@ conf:
            labels:
              severity: critical
            annotations:
-              description: There is no running K8S controller manager. Deployments and replication
-                controllers are not making progress.
+              description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
              runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
              summary: Controller manager is down
-
-      kubelet: |-
+      kubelet:
        groups:
        - name: kubelet.rules
          rules:
@@ -630,18 +616,15 @@ conf:
            labels:
              severity: warning
            annotations:
-              description: The Kubelet on {{ $labels.node }} has not checked in with the API,
-                or has set itself to NotReady, for more than an hour
+              description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
              summary: Node status is NotReady
          - alert: K8SManyNodesNotReady
-            expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"}
-              == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
+            expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
            for: 1m
            labels:
              severity: critical
            annotations:
-              description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady
-                state).'
+              description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
              summary: Many Kubernetes nodes are Not Ready
          - alert: K8SKubeletDown
            expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
@@ -652,147 +635,102 @@ conf:
            labels:
              severity: warning
            annotations:
              description: Prometheus failed to scrape {{ $value }}% of kubelets.
              summary: Many Kubelets cannot be scraped
          - alert: K8SKubeletDown
-            expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})
-              > 0.1
+            expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
            for: 1h
            labels:
              severity: critical
            annotations:
-              description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
-                have disappeared from service discovery.
+              description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
              summary: Many Kubelets cannot be scraped
          - alert: K8SKubeletTooManyPods
            expr: kubelet_running_pod_count > 100
            labels:
              severity: warning
            annotations:
-              description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
-                to the limit of 110
+              description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
              summary: Kubelet is close to pod limit
-
-      kubernetes: |-
+      kubernetes:
        groups:
        - name: kubernetes.rules
          rules:
          - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
-            expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
-              "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-              controller, pod_name, container_name)
+            expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:spec_cpu_shares
-            expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
-              "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-              container_name)
+            expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:cpu_usage:rate
-            expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
-              "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-              controller, pod_name, container_name)
+            expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:memory_usage:bytes
-            expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
-              "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-              container_name)
+            expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
-            expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
-              "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-              controller, pod_name, container_name)
+            expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:memory_rss:bytes
-            expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
-              "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-              container_name)
+            expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:memory_cache:bytes
-            expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
-              "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-              container_name)
+            expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:disk_usage:bytes
-            expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
-              "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-              container_name)
+            expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
-            expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
-              "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-              controller, pod_name, container_name, scope, type)
+            expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
          - record: cluster_namespace_controller_pod_container:memory_oom:rate
-            expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
-              "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-              controller, pod_name, container_name, scope, type)
+            expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
          - record: cluster:memory_allocation:percent
-            expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
-              / sum(machine_memory_bytes) BY (cluster)
+            expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
          - record: cluster:memory_used:percent
-            expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
-              BY (cluster)
+            expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
          - record: cluster:cpu_allocation:percent
-            expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
-              * ON(cluster, instance) machine_cpu_cores) BY (cluster)
+            expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
          - record: cluster:node_cpu_use:percent
-            expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
-              BY (cluster)
+            expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
          - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-            expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
-              cluster, job, resource, verb)) / 1e+06
+            expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
            labels:
              quantile: "0.99"
          - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-            expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
-              cluster, job, resource, verb)) / 1e+06
+            expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
            labels:
              quantile: "0.9"
          - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-            expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
-              cluster, job, resource, verb)) / 1e+06
+            expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
            labels:
              quantile: "0.5"
          - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-            expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-              BY (le, cluster)) / 1e+06
+            expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.99"
          - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-            expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-              BY (le, cluster)) / 1e+06
+            expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.9"
          - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-            expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-              BY (le, cluster)) / 1e+06
+            expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.5"
          - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-            expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-              BY (le, cluster)) / 1e+06
+            expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.99"
          - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-            expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-              BY (le, cluster)) / 1e+06
+            expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.9"
          - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-            expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-              BY (le, cluster)) / 1e+06
+            expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.5"
          - record: cluster:scheduler_binding_latency:quantile_seconds
-            expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
-              BY (le, cluster)) / 1e+06
+            expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.99"
          - record: cluster:scheduler_binding_latency:quantile_seconds
-            expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
-              BY (le, cluster)) / 1e+06
+            expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.9"
          - record: cluster:scheduler_binding_latency:quantile_seconds
-            expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
-              BY (le, cluster)) / 1e+06
+            expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.5"
-
-      rabbitmq: |-
-
-      mysql: |-
-
-      ceph: |-
-
-      openstack: |-
-
-      custom: |-
+      rabbitmq: null
+      mysql: null
+      ceph: null
+      openstack: null
+      custom: null
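
Illustrative values override (not part of the patch above; the file name, group name, and alert are placeholders). Because every rule tree now lives under conf.prometheus.rules as structured YAML, toYaml ... | indent 4 in configmap-etc.yaml re-serializes each tree into its *.rules key of the prometheus-etc ConfigMap, so an operator can replace a rule group with a plain values override instead of editing templates:

# custom-rules.yaml -- hypothetical override, e.g. `helm upgrade prometheus ./prometheus --values custom-rules.yaml`
conf:
  prometheus:
    rules:
      custom:
        groups:
          - name: custom.rules
            rules:
              - alert: ExampleTargetDown          # placeholder alert name
                expr: up == 0                      # any PromQL expression
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: '{{ $labels.instance }} has been unreachable for 10 minutes'

Under this scheme the rendered custom.rules key is mounted by the statefulset at /etc/config/rules/custom.rules, the same path previously served by the removed prometheus-rules ConfigMap.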