# openstack-helm-infra/prometheus/values.yaml

# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Default values for prometheus.
# This is a YAML-formatted file.
# Declare name/value pairs to be passed into your templates.
# name: value

# Container image references used by this chart. Each key under `tags` names
# the image for one component or job.
images:
  pull_policy: IfNotPresent
  tags:
    prometheus: 'docker.io/prom/prometheus:v2.0.0'
    helm_tests: 'docker.io/kolla/ubuntu-source-heat-engine:3.0.3'
    dep_check: 'quay.io/stackanetes/kubernetes-entrypoint:v0.3.1'
    image_repo_sync: 'docker.io/docker:17.07.0'
  local_registry:
    active: false
    # Tag keys listed under `exclude`; presumably these images bypass the
    # local registry even when it is active -- confirm in the chart templates.
    exclude: [dep_check, image_repo_sync]

# Node selector key/value pairs: one pair for the prometheus workload and one
# for the chart's jobs. Both default to the same selector.
labels:
  job:
    node_selector_key: 'openstack-control-plane'
    node_selector_value: 'enabled'
  prometheus:
    node_selector_key: 'openstack-control-plane'
    node_selector_value: 'enabled'

# Pod-level settings: anti-affinity, extra mounts, replica count, upgrade and
# termination behaviour, and resource requests/limits.
pod:
  affinity:
    anti:
      type:
        default: preferredDuringSchedulingIgnoredDuringExecution
      topologyKey:
        default: kubernetes.io/hostname
  mounts:
    prometheus:
      # Both entries default to null, i.e. nothing is declared here.
      prometheus: null
      init_container: null
  replicas:
    prometheus: 1
  lifecycle:
    upgrades:
      revision_history: 3
      pod_replacement_strategy: RollingUpdate
      rolling_update:
        max_unavailable: 1
        max_surge: 3
    termination_grace_period:
      prometheus:
        timeout: 30
  resources:
    # Master switch for the values below; off by default.
    enabled: false
    prometheus:
      requests:
        cpu: '500m'
        memory: '128Mi'
      limits:
        cpu: '2000m'
        memory: '1024Mi'
    jobs:
      image_repo_sync:
        requests:
          cpu: '100m'
          memory: '128Mi'
        limits:
          cpu: '2000m'
          memory: '1024Mi'
      tests:
        requests:
          cpu: '100m'
          memory: '128Mi'
        limits:
          cpu: '2000m'
          memory: '1024Mi'

# Service endpoint definitions (name, namespace, host names, scheme, path and
# ports) for the image registry, prometheus itself and alertmanager.
endpoints:
  cluster_domain_suffix: cluster.local

  # Local docker registry.
  local_image_registry:
    name: docker-registry
    namespace: docker-registry
    hosts:
      default: localhost
      internal: docker-registry
      node: localhost
    host_fqdn_override:
      default: null
    port:
      registry:
        node: 5000

  # Prometheus: port 9090 by default, port 80 for the public host.
  monitoring:
    name: prometheus
    namespace: null
    hosts:
      default: prom-metrics
      public: prometheus
    host_fqdn_override:
      default: null
    scheme:
      default: 'http'
    path:
      default: null
    port:
      api:
        default: 9090
        public: 80

  # Alertmanager: API on 9093 (80 for the public host), mesh on 6783.
  alerts:
    name: alertmanager
    namespace: null
    hosts:
      default: alerts-engine
      public: alertmanager
      discovery: alertmanager-discovery
    host_fqdn_override:
      default: null
    scheme:
      default: 'http'
    path:
      default: null
    port:
      api:
        default: 9093
        public: 80
      mesh:
        default: 6783

# Jobs and services that components depend on. `dynamic.common` entries are
# keyed by feature; `static` entries are keyed by component.
dependencies:
  dynamic:
    common:
      local_image_registry:
        jobs:
          - prometheus-image-repo-sync
        services:
          - service: local_image_registry
            endpoint: node
  static:
    image_repo_sync:
      services:
        - service: local_image_registry
          endpoint: internal
    # No service dependencies are declared for prometheus itself.
    prometheus:
      services: null

# Monitoring switches for this chart; both default to true.
monitoring:
  prometheus:
    enabled: true
    prometheus:
      scrape: true

# Network exposure of prometheus: ingress settings and an optional node port.
network:
  prometheus:
    ingress:
      public: true
      classes:
        namespace: 'nginx'
        cluster: 'nginx-cluster'
      annotations:
        nginx.ingress.kubernetes.io/rewrite-target: '/'
    # Disabled by default; `port` is the node port value to use.
    node_port:
      enabled: false
      port: 30900

# Persistent storage settings: claim name, access mode, requested size and
# storage class.
storage:
  enabled: true
  pvc:
    name: prometheus-pvc
    access_mode:
      - 'ReadWriteOnce'
  requests:
    storage: 5Gi
  storage_class: general

# One boolean per manifest; every manifest is enabled by default.
manifests:
  configmap_bin: true
  configmap_etc: true
  helm_tests: true
  ingress: true
  job_image_repo_sync: true
  service: true
  service_ingress: true
  statefulset_prometheus: true

conf:
  prometheus:
    # These key/value pairs are consumed by a prometheus helper function, which
    # generates the command line flags that configure the prometheus service.
    command_line_flags:
      log.level: 'info'
      query.max_concurrency: 20
      query.timeout: '2m'
      storage.tsdb.path: '/var/lib/prometheus/data'
      storage.tsdb.retention: '7d'
      storage.tsdb.min_block_duration: '2h'
      storage.tsdb.max_block_duration: '2h'
      web.enable_admin_api: false
    scrape_configs:
      # Global intervals; several jobs below set their own scrape_interval.
      global:
        evaluation_interval: '60s'
        scrape_interval: '60s'
      scrape_configs:
        - job_name: kubelet
          # Nodes are discovered through the Kubernetes API and scraped over
          # https via the address and path set in relabel_configs below.
          scheme: https
          scrape_interval: 45s
          kubernetes_sd_configs:
            - role: node
          # The CA file and bearer token below authenticate the scrape
          # requests themselves. Discovery auth is a separate concern: it is
          # automatic when Prometheus runs inside the cluster, otherwise it
          # has to be configured within the <kubernetes_sd_config>.
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          relabel_configs:
            # Copy every discovered node label onto the target.
            - action: labelmap
              regex: __meta_kubernetes_node_label_(.+)
            # Point every target at the in-cluster API server address.
            - target_label: __address__
              replacement: kubernetes.default.svc:443
            # Build the per-node proxy metrics path from the node name.
            - source_labels: [__meta_kubernetes_node_name]
              regex: (.+)
              target_label: __metrics_path__
              replacement: /api/v1/nodes/${1}/proxy/metrics
            # Expose the node name as the kubernetes_io_hostname label.
            - source_labels: [__meta_kubernetes_node_name]
              action: replace
              target_label: kubernetes_io_hostname
          # Scrape config for Kubelet cAdvisor.
          #
          # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
          # (those whose names begin with 'container_') have been removed from the
          # Kubelet metrics endpoint.  This job scrapes the cAdvisor endpoint to
          # retrieve those metrics.
          #
          # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor
          # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics"
          # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with
          # the --cadvisor-port=0 Kubelet flag).
          #
          # This job is not necessary and should be removed in Kubernetes 1.6 and
          # earlier versions, or it will cause the metrics to be scraped twice.
        - job_name: 'kubernetes-cadvisor'
          # Scraped over https by default; disable this or change it to
          # `http` if required.
          scheme: https
          scrape_interval: 45s
          kubernetes_sd_configs:
            - role: node
          # The CA file and bearer token below authenticate the scrape
          # requests themselves. Discovery auth is a separate concern: it is
          # automatic when Prometheus runs inside the cluster, otherwise it
          # has to be configured within the <kubernetes_sd_config>.
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          relabel_configs:
            # Copy every discovered node label onto the target.
            - action: labelmap
              regex: __meta_kubernetes_node_label_(.+)
            # Point every target at the in-cluster API server address.
            - target_label: __address__
              replacement: kubernetes.default.svc:443
            # Build the per-node cAdvisor proxy path from the node name.
            - source_labels: [__meta_kubernetes_node_name]
              regex: (.+)
              target_label: __metrics_path__
              replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
            # Expose the node name as the kubernetes_io_hostname label.
            - source_labels: [__meta_kubernetes_node_name]
              action: replace
              target_label: kubernetes_io_hostname
          metric_relabel_configs:
            # Derive rkt_container_name from matching values of the `id` label.
            - action: replace
              source_labels: [id]
              regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
              target_label: rkt_container_name
              replacement: '${2}-${1}'
            # Derive systemd_service_name from matching values of `id`.
            - action: replace
              source_labels: [id]
              regex: '^/system\.slice/(.+)\.service$'
              target_label: systemd_service_name
              replacement: '${1}'
          # Scrape config for API servers.
          #
          # Kubernetes exposes API servers as endpoints to the default/kubernetes
          # service so this uses `endpoints` role and uses relabelling to only keep
          # the endpoints associated with the default/kubernetes service using the
          # default named port `https`. This works for single API server deployments as
          # well as HA API server deployments.
        - job_name: 'apiserver'
          # Scraped over https by default; disable this or change it to
          # `http` if required.
          scheme: https
          scrape_interval: 45s
          kubernetes_sd_configs:
            - role: endpoints
          # The CA file and bearer token below authenticate the scrape
          # requests themselves. Discovery auth is a separate concern: it is
          # automatic when Prometheus runs inside the cluster, otherwise it
          # has to be configured within the <kubernetes_sd_config>.
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            # If node certificates are self-signed or issued by a CA other
            # than the master CA, certificate verification can be disabled by
            # uncommenting the line below. Verification is an integral part of
            # a secure infrastructure, so only do this in a controlled
            # environment.
            #
            # insecure_skip_verify: true
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          relabel_configs:
            # Keep only the `https` port of the default/kubernetes service, so
            # one target remains for every API server that Kubernetes adds as
            # an endpoint of that service.
            - action: keep
              source_labels:
                - __meta_kubernetes_namespace
                - __meta_kubernetes_service_name
                - __meta_kubernetes_endpoint_port_name
              regex: default;kubernetes;https
        # Scrape config for service endpoints.
        #
        # The relabeling allows the actual service scrape endpoint to be configured
        # via the following annotations:
        #
        # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
        # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
        # to set this to `https` & most likely set the `tls_config` of the scrape config.
        # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
        # * `prometheus.io/port`: If the metrics are exposed on a different port to the
        # service then set this appropriately.
        - job_name: 'kubernetes-service-endpoints'
          kubernetes_sd_configs:
          - role: endpoints
          scrape_interval: 60s
          relabel_configs:
          # Keep only services annotated with prometheus.io/scrape=true.
          # The pattern is quoted so that YAML loads it as the string "true"
          # instead of resolving it to a boolean.
          - source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scrape
            action: keep
            regex: 'true'
          # Take the scheme from prometheus.io/scheme when it is http or https.
          - source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scheme
            action: replace
            target_label: __scheme__
            regex: (https?)
          # Take the metrics path from prometheus.io/path when it is set.
          - source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_path
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # Replace the port of the address with prometheus.io/port when set.
          - source_labels:
              - __address__
              - __meta_kubernetes_service_annotation_prometheus_io_port
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          # Copy every service label onto the target.
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels:
              - __meta_kubernetes_namespace
            action: replace
            target_label: kubernetes_namespace
          - source_labels:
              - __meta_kubernetes_service_name
            action: replace
            target_label: kubernetes_name
          # Use the service name as the job label.
          - source_labels:
              - __meta_kubernetes_service_name
            target_label: job
            replacement: ${1}
        # Example scrape config for pods
        #
        # The relabeling allows the actual pod scrape endpoint to be configured via the
        # following annotations:
        #
        # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
        # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
        # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
        # pod's declared ports (default is a port-free target if none are declared).
        - job_name: 'kubernetes-pods'
          kubernetes_sd_configs:
          - role: pod
          relabel_configs:
          # Keep only pods annotated with prometheus.io/scrape=true.
          # The pattern is quoted so that YAML loads it as the string "true"
          # instead of resolving it to a boolean.
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          # Take the metrics path from prometheus.io/path when it is set.
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # Replace the port of the address with prometheus.io/port when set.
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          # Copy every pod label onto the target.
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
        - job_name: calico-etcd
          honor_labels: false
          kubernetes_sd_configs:
          - role: service
          scrape_interval: 20s
          relabel_configs:
          # Copy every service label onto the target.
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          # Keep only the service named calico-etcd ...
          - action: keep
            source_labels:
              - __meta_kubernetes_service_name
            regex: "calico-etcd"
          # ... in the kube-system namespace. A `keep` rule only filters
          # targets and writes no label, so it carries no target_label.
          - action: keep
            source_labels:
              - __meta_kubernetes_namespace
            regex: kube-system
          # NOTE(review): `role: service` discovery does not appear to provide
          # __meta_kubernetes_pod_name, so this rule likely never sets `pod`
          # -- confirm against the kubernetes_sd_config documentation.
          - source_labels:
              - __meta_kubernetes_pod_name
            target_label: pod
          - source_labels:
              - __meta_kubernetes_service_name
            target_label: service
          - source_labels:
              - __meta_kubernetes_service_name
            target_label: job
            replacement: ${1}
          # NOTE(review): service labels are exposed with a per-label suffix
          # (__meta_kubernetes_service_label_<name>); the bare name used here
          # is probably always empty, making this rule a no-op -- confirm.
          - source_labels:
              - __meta_kubernetes_service_label
            target_label: job
            regex: calico-etcd
            replacement: ${1}
          # Static endpoint label for every target of this job.
          - target_label: endpoint
            replacement: "calico-etcd"
      # Alertmanager discovery: pods are discovered through the Kubernetes API
      # and filtered down by the relabel rules below.
      alerting:
        alertmanagers:
          - kubernetes_sd_configs:
              - role: pod
            tls_config:
              ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
            relabel_configs:
              # Keep pods whose `application` label is `alertmanager`.
              - action: keep
                source_labels:
                  - __meta_kubernetes_pod_label_application
                regex: alertmanager
              # Keep the container port named `alerts-api` ...
              - action: keep
                source_labels:
                  - __meta_kubernetes_pod_container_port_name
                regex: alerts-api
              # ... and drop the one named `peer-mesh`.
              - action: drop
                source_labels:
                  - __meta_kubernetes_pod_container_port_name
                regex: peer-mesh
              # Keep only pods in the `openstack` namespace.
              - action: keep
                source_labels:
                  - __meta_kubernetes_namespace
                regex: openstack
    rules:
      # Alerting rules about the Alertmanager instances themselves. Long
      # expressions and descriptions use folded scalars (`>-`), which join the
      # lines with single spaces and add no trailing newline.
      alertmanager:
        groups:
          - name: alertmanager.rules
            rules:
              - alert: AlertmanagerConfigInconsistent
                expr: >-
                  count_values("config_hash", alertmanager_config_hash) BY (service)
                  / ON(service) GROUP_LEFT()
                  label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)")
                  != 1
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: Alertmanager configurations are inconsistent
                  description: >-
                    The configuration of the instances of the Alertmanager cluster
                    `{{$labels.service}}` are out of sync.
              - alert: AlertmanagerDownOrMissing
                expr: >-
                  label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
                  / ON(job) GROUP_RIGHT() sum(up) BY (job)
                  != 1
                for: 5m
                labels:
                  severity: warning
                annotations:
                  summary: Alertmanager down or not discovered
                  description: >-
                    An unexpected number of Alertmanagers are scraped or
                    Alertmanagers disappeared from discovery.
              - alert: FailedReload
                expr: alertmanager_config_last_reload_successful == 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  summary: Alertmanager configuration reload has failed
                  description: >-
                    Reloading Alertmanager's configuration has failed for
                    {{ $labels.namespace }}/{{ $labels.pod}}.
      # Alerting rules for etcd. All job-scoped selectors use job="etcd".
      etcd3:
        groups:
        - name: etcd3.rules
          rules:
          - alert: etcd_InsufficientMembers
            expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
            for: 3m
            labels:
              severity: critical
            annotations:
              description: If one more etcd member goes down the cluster will be unavailable
              summary: etcd cluster insufficient members
          - alert: etcd_NoLeader
            expr: etcd_server_has_leader{job="etcd"} == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              description: etcd member {{ $labels.instance }} has no leader
              summary: etcd member has no leader
          - alert: etcd_HighNumberOfLeaderChanges
            expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
            labels:
              severity: warning
            annotations:
              description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
              summary: a high number of leader changes within the etcd cluster are happening
          # The four failed-request alerts below scale the failure ratio by 100
          # so that {{ $value }} really is the percentage the description
          # prints. The firing thresholds are unchanged (1% warning, 5%
          # critical).
          - alert: etcd_HighNumberOfFailedGRPCRequests
            expr: 100 * sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 1
            for: 10m
            labels:
              severity: warning
            annotations:
              description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
              summary: a high number of gRPC requests are failing
          - alert: etcd_HighNumberOfFailedGRPCRequests
            expr: 100 * sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 5
            for: 5m
            labels:
              severity: critical
            annotations:
              description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
              summary: a high number of gRPC requests are failing
          - alert: etcd_GRPCRequestsSlow
            expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
            for: 10m
            labels:
              severity: critical
            annotations:
              description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
              summary: slow gRPC requests
          - alert: etcd_HighNumberOfFailedHTTPRequests
            expr: 100 * sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 1
            for: 10m
            labels:
              severity: warning
            annotations:
              description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
              summary: a high number of HTTP requests are failing
          - alert: etcd_HighNumberOfFailedHTTPRequests
            expr: 100 * sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 5
            for: 5m
            labels:
              severity: critical
            annotations:
              description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
              summary: a high number of HTTP requests are failing
          - alert: etcd_HTTPRequestsSlow
            expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
            for: 10m
            labels:
              severity: warning
            annotations:
              description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
              summary: slow HTTP requests
          - alert: etcd_EtcdMemberCommunicationSlow
            expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
            for: 10m
            labels:
              severity: warning
            annotations:
              description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
              summary: etcd member communication is slow
          - alert: etcd_HighNumberOfFailedProposals
            expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
            labels:
              severity: warning
            annotations:
              description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
              summary: a high number of proposals within the etcd cluster are failing
          - alert: etcd_HighFsyncDurations
            expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
            for: 10m
            labels:
              severity: warning
            annotations:
              description: etcd instance {{ $labels.instance }} fsync durations are high
              summary: high fsync durations
          - alert: etcd_HighCommitDurations
            expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
            for: 10m
            labels:
              severity: warning
            annotations:
              description: etcd instance {{ $labels.instance }} commit durations are high
              summary: high commit durations
      # Alerting rules for the Kubernetes API server: one availability alert
      # on up{job="apiserver"} and one 99th-percentile latency alert.
      kube_apiserver:
        groups:
          - name: kube-apiserver.rules
            rules:
              - alert: K8SApiserverDown
                expr: 'absent(up{job="apiserver"} == 1)'
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: API server unreachable
                  description: >-
                    Prometheus failed to scrape API server(s), or all API servers
                    have disappeared from service discovery.
              - alert: K8SApiServerLatency
                # The quantile is divided by 1e+06 before the comparison with 1.
                expr: >-
                  histogram_quantile(0.99,
                  sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"})
                  WITHOUT (instance, resource)) / 1e+06 > 1
                for: 10m
                labels:
                  severity: warning
                annotations:
                  summary: Kubernetes apiserver latency is high
                  description: >-
                    99th percentile Latency for {{ $labels.verb }} requests to the
                    kube-apiserver is higher than 1s.
      # Alerting rule for the Kubernetes controller manager: fires when no
      # up{job="kube-controller-manager-discovery"} series equals 1.
      kube_controller_manager:
        groups:
          - name: kube-controller-manager.rules
            rules:
              - alert: K8SControllerManagerDown
                expr: 'absent(up{job="kube-controller-manager-discovery"} == 1)'
                for: 5m
                labels:
                  severity: critical
                annotations:
                  summary: Controller manager is down
                  description: >-
                    There is no running K8S controller manager. Deployments and
                    replication controllers are not making progress.
                  runbook: 'https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager'
      # Alerting rules for nodes and kubelets. The kubelet availability alerts
      # select up{job="kubelet"}.
      kubelet:
        groups:
        - name: kubelet.rules
          rules:
          - alert: K8SNodeNotReady
            expr: kube_node_status_ready{condition="true"} == 0
            for: 1h
            labels:
              severity: warning
            annotations:
              description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
              summary: Node status is NotReady
          # Fires when more than one node and more than 20% of all nodes are
          # NotReady. {{ $value }} is the NotReady node count (left-hand side
          # of the `and`). The description now states the 20% that the
          # expression enforces.
          - alert: K8SManyNodesNotReady
            expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
            for: 1m
            labels:
              severity: critical
            annotations:
              description: '{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state.'
              summary: Many Kubernetes nodes are Not Ready
          # Both K8SKubeletDown alerts scale the failure ratio by 100 so that
          # {{ $value }} really is the percentage the description prints. The
          # firing thresholds are unchanged (3% warning, 10% critical).
          - alert: K8SKubeletDown
            expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
            for: 1h
            labels:
              severity: warning
            annotations:
              description: Prometheus failed to scrape {{ $value }}% of kubelets.
              summary: Many Kubelets cannot be scraped
          # absent() yields 1 when no kubelet is up, which is reported as 100%.
          - alert: K8SKubeletDown
            expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) * 100 > 10
            for: 1h
            labels:
              severity: critical
            annotations:
              description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
              summary: Many Kubelets cannot be scraped
          - alert: K8SKubeletTooManyPods
            expr: kubelet_running_pod_count > 100
            labels:
              severity: warning
            annotations:
              description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
              summary: Kubelet is close to pod limit
      kubernetes:
        groups:
        - name: kubernetes.rules
          rules:
          - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
            expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:spec_cpu_shares
            expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:cpu_usage:rate
            expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:memory_usage:bytes
            expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
            expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:memory_rss:bytes
            expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:memory_cache:bytes
            expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:disk_usage:bytes
            expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
          - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
            expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
          - record: cluster_namespace_controller_pod_container:memory_oom:rate
            expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
          - record: cluster:memory_allocation:percent
            expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
          - record: cluster:memory_used:percent
            expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
          - record: cluster:cpu_allocation:percent
            expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
          - record: cluster:node_cpu_use:percent
            expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
          # Latency quantile recording rules. Each metric is recorded three times
          # (0.99, 0.9 and 0.5) under the same record name, distinguished by the
          # static "quantile" label. Every result is divided by 1e+06; the
          # scheduler metric names indicate microseconds, so the recorded value is
          # in seconds (presumably the apiserver metric is in microseconds too --
          # TODO confirm).
          # NOTE(review): histogram_quantile is applied to summed bucket values
          # without rate(); confirm that quantiles over the raw cumulative buckets
          # are what is intended.
          #
          # API server request latency, grouped by cluster, job, resource and verb.
          - record: cluster_resource_verb:apiserver_latency:quantile_seconds
            expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
            labels:
              quantile: "0.99"
          - record: cluster_resource_verb:apiserver_latency:quantile_seconds
            expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
            labels:
              quantile: "0.9"
          - record: cluster_resource_verb:apiserver_latency:quantile_seconds
            expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
            labels:
              quantile: "0.5"
          # Scheduler end-to-end scheduling latency, grouped by cluster.
          - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
            expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.99"
          - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
            expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.9"
          - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
            expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.5"
          # Scheduler scheduling-algorithm latency, grouped by cluster.
          - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
            expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.99"
          - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
            expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.9"
          - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
            expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.5"
          # Scheduler binding latency, grouped by cluster.
          - record: cluster:scheduler_binding_latency:quantile_seconds
            expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.99"
          - record: cluster:scheduler_binding_latency:quantile_seconds
            expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.9"
          - record: cluster:scheduler_binding_latency:quantile_seconds
            expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
            labels:
              quantile: "0.5"
          # Pages when a statefulset has reported fewer replicas than its desired
          # count for 5 minutes. $value is the reported (status) replica count.
          - alert: kube_statefulset_replicas_unavailable
            expr: kube_statefulset_status_replicas < kube_statefulset_replicas
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
              summary: '{{$labels.statefulset}}: has insufficient replicas.'
          # Warns when a daemonset has reported a non-zero misscheduled count for
          # 10 minutes.
          - alert: kube_daemonsets_misscheduled
            expr: kube_daemonset_status_number_misscheduled > 0
            for: 10m
            labels:
              severity: warning
            annotations:
              description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
              summary: 'Daemonsets not scheduled correctly'
          # Warns when a daemonset has had fewer pods scheduled than desired for
          # 10 minutes. $value is (desired - current), i.e. the number of pods
          # still missing, so the description reports it as a shortfall.
          - alert: kube_daemonsets_not_scheduled
            expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
            for: 10m
            labels:
              severity: warning
            annotations:
              description: 'Daemonset {{$labels.daemonset}} has {{ $value }} fewer pods scheduled than the desired number'
              summary: 'Less than desired number of daemonsets scheduled'
          # Pages when a deployment has reported unavailable replicas for
          # 10 minutes.
          - alert: kube_deployment_replicas_unavailable
            expr: kube_deployment_status_replicas_unavailable > 0
            for: 10m
            labels:
              severity: page
            annotations:
              description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
              summary: '{{$labels.deployment}}: has insufficient replicas.'
          # Pages when the available replica count has been lower than the
          # rolling-update maxUnavailable setting for 10 minutes.
          # NOTE(review): $value here is (available - max_unavailable), a negative
          # number, while the description presents it as the available replica
          # count; consider rewording the description.
          - alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
            expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
            for: 10m
            labels:
              severity: page
            annotations:
              description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
              summary: '{{$labels.deployment}}: has insufficient replicas during a rolling update.'
          # Pages when a job has reported a failed status for 10 minutes. The job
          # name is read from the exported_job label.
          - alert: kube_job_status_failed
            expr: kube_job_status_failed > 0
            for: 10m
            labels:
              severity: page
            annotations:
              description: 'Job {{$labels.exported_job}} is in failed status'
              summary: '{{$labels.exported_job}} has failed status'
          # Pages when a pod has been in the Pending phase for 10 minutes.
          - alert: kube_pod_status_pending
            expr: kube_pod_status_phase{phase="Pending"} == 1
            for: 10m
            labels:
              severity: page
            annotations:
              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
          # Pages when a container has been waiting with reason ErrImagePull for
          # 10 minutes.
          - alert: kube_pod_error_image_pull
            expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
            for: 10m
            labels:
              severity: page
            annotations:
              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
          # NOTE(review): apart from its name, this rule is identical to
          # kube_pod_error_image_pull above, so every image pull error pages twice.
          # Confirm whether one of the two should be removed or target a different
          # waiting reason.
          - alert: kube_pod_status_error_image_pull
            expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
            for: 10m
            labels:
              severity: page
            annotations:
              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
          # Pages when a replicaset has had fewer ready replicas than its spec
          # requests for 10 minutes.
          - alert: kube_replicaset_missing_replicas
            expr:  kube_replicaset_spec_replicas -  kube_replicaset_status_ready_replicas > 0
            for: 10m
            labels:
              severity: page
            annotations:
              description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
              summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
          # Pages when a container has reported a terminated status for 10 minutes.
          - alert: kube_pod_container_terminated
            expr: kube_pod_container_status_terminated > 0
            for: 10m
            labels:
              severity: page
            annotations:
              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
              summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
      basic_linux:
        groups:
        - name: basic_linux.rules
          rules:
          # Pages when free space has been below 20% of the filesystem size (i.e.
          # the filesystem is more than 80% full) for 5 minutes. ramfs devices are
          # excluded. The description states 20% to match the 0.2 factor in the
          # expression.
          - alert: node_filesystem_full_80percent
            expr: sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"}
              * 0.2) / 1024 ^ 3
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
                got less than 20% space left on its filesystem.'
              summary: '{{$labels.alias}}: Filesystem is running out of space soon.'
          # Pages when a linear extrapolation of the last hour of free-space
          # samples predicts zero (or less) free space within 4 hours, sustained
          # for 5 minutes. ramfs devices are excluded.
          - alert: node_filesystem_full_in_4h
            expr: predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4 * 3600) <= 0
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
                is running out of space in approx. 4 hours'
              summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.'
          # Pages when a linear extrapolation of the last hour of allocated file
          # descriptors reaches the maximum within 3 hours, sustained for 20m.
          - alert: node_filedescriptors_full_in_3h
            expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum
            for: 20m
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} is running out of available file descriptors in approx. 3 hours'
              summary: '{{$labels.alias}} is running out of available file descriptors in 3 hours.'
          # Pages when load1 divided by the number of CPUs (counted from the
          # node_cpu mode="system" series per alias) stays at or above 0.9 for 1h.
          - alert: node_load1_90percent
            expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9
            for: 1h
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} is running with > 90% total load for at least 1h.'
              summary: '{{$labels.alias}}: Running on high load.'
          # Pages when 100 minus the average idle CPU percentage per alias stays at
          # or above 90 for 1h.
          - alert: node_cpu_util_90percent
            expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90
            for: 1h
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} has total CPU utilization over 90% for at least 1h.'
              summary: '{{$labels.alias}}: High CPU utilization.'
          # Pages when free + buffers + cached memory stays below 10% of total
          # memory for 30 minutes.
          - alert: node_ram_using_90percent
            expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1
            for: 30m
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.'
              summary: '{{$labels.alias}}: Using lots of RAM.'
          # Pages when swap in use (total minus free and cached swap) stays above
          # 80% of total swap for 10 minutes.
          - alert: node_swap_using_80percent
            expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8
            for: 10m
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now.'
              summary: '{{$labels.alias}}: Running out of swap soon.'
          # Warns when load15 divided by the number of CPUs (counted from the
          # node_cpu mode="system" series per alias) stays above 1 for 1 minute.
          # The threshold was previously ">= 0", which is true for every node and
          # made the alert fire permanently; it now matches the "> 1" stated in
          # the description, and the description duration matches "for: 1m".
          - alert: node_high_cpu_load
            expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) > 1
            for: 1m
            labels:
              severity: warning
            annotations:
              description: '{{$labels.alias}} is running with load15 > 1 for at least 1 minute: {{$value}}'
              summary: '{{$labels.alias}}: Running on high load: {{$value}}'
          # Warns when a node's used memory (total minus free, buffers and cached)
          # stays above 85% of its total for 1 minute. The expression is evaluated
          # per series rather than wrapped in sum(): an ungrouped sum() aggregates
          # every node into one label-less value, which hides individual hot nodes
          # and leaves the instance/job labels used in the description empty.
          - alert: node_high_memory_load
            expr: (node_memory_MemTotal - (node_memory_MemFree + node_memory_Buffers
              + node_memory_Cached)) / node_memory_MemTotal * 100 > 85
            for: 1m
            labels:
              severity: warning
            annotations:
              description: Host memory usage is {{ humanize $value }}%. Reported by
                instance {{ $labels.instance }} of job {{ $labels.job }}.
              summary: Server memory is almost full
          # Warns when the root filesystem (mountpoint "/") stays more than 85%
          # used for 30 seconds.
          - alert: node_high_storage_load
            expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) / node_filesystem_size{mountpoint="/"} * 100 > 85
            for: 30s
            labels:
              severity: warning
            annotations:
              description: 'Host storage usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.'
              summary: Server storage is almost full
          # Warns when swap in use (total minus free) stays above 40% of total
          # swap for 1 minute. The comparison was previously "<", which fired on
          # hosts with LOW swap usage and stayed silent on hosts with high usage.
          - alert: node_high_swap
            expr: (node_memory_SwapTotal - node_memory_SwapFree) > (node_memory_SwapTotal
              * 0.4)
            for: 1m
            labels:
              severity: warning
            annotations:
              description: Host system has a high swap usage of {{ humanize $value }}. Reported
                by instance {{ $labels.instance }} of job {{ $labels.job }}.
              summary: Server has a high swap usage
          # Network drop/error alerts for every device except loopback. Each warns
          # when the metric value stays above 3000 for 30 seconds.
          # NOTE(review): the raw metric value is compared against the threshold.
          # If these metrics are cumulative counters (TODO confirm against the
          # exporter), the alert keeps firing once the lifetime total passes 3000;
          # consider rate()/increase() over a window instead.
          - alert: node_high_network_drop_rcv
            expr: node_network_receive_drop{device!="lo"} > 3000
            for: 30s
            labels:
              severity: warning
            annotations:
              description: Host system has an unusually high drop in network reception ({{
                humanize $value }}). Reported by instance {{ $labels.instance }} of job {{
                $labels.job }}
              summary: Server has a high receive drop
          - alert: node_high_network_drop_send
            expr: node_network_transmit_drop{device!="lo"} > 3000
            for: 30s
            labels:
              severity: warning
            annotations:
              description: Host system has an unusually high drop in network transmission ({{
                humanize $value }}). Reported by instance {{ $labels.instance }} of job {{
                $labels.job }}
              summary: Server has a high transmit drop
          - alert: node_high_network_errs_rcv
            expr: node_network_receive_errs{device!="lo"} > 3000
            for: 30s
            labels:
              severity: warning
            annotations:
              description: Host system has an unusually high error rate in network reception
                ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job
                {{ $labels.job }}
              summary: Server has unusually high reception errors
          - alert: node_high_network_errs_send
            expr: node_network_transmit_errs{device!="lo"} > 3000
            for: 30s
            labels:
              severity: warning
            annotations:
              description: Host system has an unusually high error rate in network transmission
                ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job
                {{ $labels.job }}
              summary: Server has unusually high transmission errors
          # Pages when conntrack entries stay above 80% of the configured limit for
          # 5 minutes (node-exporter job only).
          - alert: node_network_conntrack_usage_80percent
            expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"}  * 0.8)
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit'
              summary: '{{$labels.instance}}: available network conntrack entries are low.'
          # Pages when available entropy stays below 300 bits for 5 minutes.
          - alert: node_entropy_available_low
            expr: node_entropy_available_bits < 300
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.instance}} has available entropy bits of {{ $value }} which is less than required of 300'
              summary: '{{$labels.instance}}: is low on entropy bits.'
          # Pages when a hwmon temperature stays above 90% of its critical value,
          # or above 95% of its max value, for 5 minutes.
          - alert: node_hwmon_high_cpu_temp
            expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}'
              summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}'
          # Pages when the instantaneous rate of node_vmstat_pgpgin stays above 80
          # for 5 minutes.
          # NOTE(review): the expression compares a rate against the constant 80,
          # not a percentage, although the description says "80%"; confirm the
          # intended unit.
          - alert: node_vmstat_paging_rate_high
            expr: irate(node_vmstat_pgpgin[5m]) > 80
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}'
              summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}'
          # Pages when allocated XFS extent blocks exceed 80% of
          # (allocated + freed) for 5 minutes.
          # NOTE(review): every selector is restricted to instances matching
          # "172.17.0.1.*", so other hosts are never evaluated. This looks like an
          # environment-specific leftover; confirm before relying on this alert.
          - alert: node_xfs_block_allocation_high
            expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}'
              summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}'
          # Pages when a bond has had fewer active slaves than configured slaves
          # for 5 minutes. $value is the number of missing slave interfaces.
          - alert: node_network_bond_slaves_down
            expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).'
              summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)'
          # Pages when NUMA memory usage stays above 80% for 5 minutes.
          - alert: node_numa_memory_used
            expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}'
              summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}'
          # Pages when the absolute NTP drift stays above 2 seconds for 5 minutes.
          - alert: node_ntp_clock_skew_high
            expr: abs(node_ntp_drift_seconds) > 2
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}'
              summary: '{{$labels.alias}}: time is skewed by : {{$value}} seconds'
          # Pages when the average time per completed read (read time rate divided
          # by completed reads rate, both over 5m) stays above 10 for 5 minutes.
          # The metric name indicates milliseconds.
          - alert: node_disk_read_latency
            expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 10
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.device}} has a high read latency of {{ $value }}'
              summary: 'High read latency observed for device {{ $labels.device }}'
          # Write-side counterpart of the read latency alert above.
          - alert: node_disk_write_latency
            expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 10
            for: 5m
            labels:
              severity: page
            annotations:
              description: '{{$labels.device}} has a high write latency of {{ $value }}'
              summary: 'High write latency observed for device {{ $labels.device }}'
      # Alerting rules for OpenStack services. Every rule pages after its
      # condition has held for 5 minutes.
      openstack:
        groups:
        - name: openstack.rules
          rules:
          # API availability: each check_<service>_api series is expected to
          # equal 1; any other value raises the alert for the URL in the "url"
          # label.
          - alert: os_glance_api_availability
            expr: check_glance_api != 1
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
              summary: 'Glance API is not available at {{$labels.url}}'
          - alert: os_nova_api_availability
            expr: check_nova_api != 1
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
              summary: 'Nova API is not available at {{$labels.url}}'
          - alert: os_keystone_api_availability
            expr: check_keystone_api != 1
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
              summary: 'Keystone API is not available at {{$labels.url}}'
          - alert: os_neutron_api_availability
            expr: check_neutron_api != 1
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
              summary: 'Neutron API is not available at {{$labels.url}}'
          - alert: os_swift_api_availability
            expr: check_swift_api != 1
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes'
              summary: 'Swift API is not available at {{$labels.url}}'
          # Nova service state: each rule fires when the matching
          # services_nova_<service>_disabled_total value is above zero.
          - alert: os_nova_compute_disabled
            expr: services_nova_compute_disabled_total > 0
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
              summary: 'Openstack compute service nova-compute is disabled on some hosts'
          - alert: os_nova_conductor_disabled
            expr: services_nova_conductor_disabled_total > 0
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
              summary: 'Openstack compute service nova-conductor is disabled on some hosts'
          - alert: os_nova_consoleauth_disabled
            expr: services_nova_consoleauth_disabled_total > 0
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
              summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
          - alert: os_nova_scheduler_disabled
            expr: services_nova_scheduler_disabled_total > 0
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
              summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
      # Alerting rules for Ceph. Every rule pages after its condition has held
      # for 5 minutes.
      ceph:
        groups:
        - name: ceph.rules
          rules:
          # Monitor quorum count below 3.
          - alert: ceph_monitor_quorum_low
            expr: ceph_monitor_quorum_count < 3
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
              summary: 'ceph high availability is at risk'
          # Used bytes above 80% of cluster capacity.
          - alert: ceph_cluster_usage_high
            expr: 100* ceph_cluster_used_bytes/ceph_cluster_capacity_bytes > 80
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'ceph cluster capacity usage more than 80 percent'
              summary: 'ceph cluster usage is more than 80 percent'
          # Degraded placement groups above 80% of all placement groups.
          - alert: ceph_placement_group_degrade_pct_high
            expr: 100*ceph_degraded_pgs/ceph_total_pgs > 80
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'ceph placement group degradation is more than 80 percent'
              summary: 'ceph placement groups degraded'
          # OSDs down above 80% of (down + up) OSDs.
          - alert: ceph_osd_down_pct_high
            expr: 100* ceph_osds_down/(ceph_osds_down+ceph_osds_up) > 80
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'ceph OSDs down percent is more than 80 percent'
              summary: 'ceph OSDs down percent is high'
          # Monitor clock skew above 2 seconds.
          - alert: ceph_monitor_clock_skew_high
            expr: ceph_monitor_clock_skew_seconds > 2
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'ceph monitors clock skew on {{$labels.instance}} is more than 2 seconds'
              summary: 'ceph monitor clock skew high'
      # Alerting rules for fluentd.
      fluentd:
        groups:
        - name: fluentd.rules
          rules:
          # Pages when fluentd_up has been 0 on an instance for 5 minutes.
          - alert: fluentd_not_running
            expr: fluentd_up == 0
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes'
              summary: 'Fluentd is down'
      calico:
        groups:
        - name: calico.rules
          rules:
          # Felix (Calico) alerts. Each expression fires either when the metric is
          # absent altogether or when its value crosses the threshold of 5.
          # NOTE(review): "datapane" in the alert names looks like a misspelling
          # of "dataplane"; the names are left as-is because anything keyed on the
          # alert name would be affected by a rename.
          # NOTE(review): when the absent() branch fires, the result presumably
          # carries no instance label, so the description would render with an
          # empty instance -- confirm.
          #
          # More than 5 dataplane failures within the last hour. No "for" clause is
          # set on this rule.
          - alert: calico_datapane_failures_high_1h
            expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5
            labels:
              severity: page
            annotations:
              description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour'
              summary: 'A high number of dataplane failures within Felix are happening'
          # Mean address message batch size (sum / count) above 5, sustained for
          # 5 minutes.
          - alert: calico_datapane_address_msg_batch_size_high_5m
            expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size'
              summary: 'Felix address message batch size is higher'
          # Mean interface message batch size (sum / count) above 5, sustained for
          # 5 minutes.
          - alert: calico_datapane_iface_msg_batch_size_high_5m
            expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5
            for: 5m
            labels:
              severity: page
            annotations:
              description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size'
              summary: 'Felix interface message batch size is higher'
          # More than 5 ipset errors within the last hour. No "for" clause is set.
          - alert: calico_ipset_errors_high_1h
            expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5
            labels:
              severity: page
            annotations:
              description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour'
              summary: 'A high number of ipset errors within Felix are happening'
          # More than 5 iptables save errors within the last hour. No "for" clause
          # is set.
          - alert: calico_iptable_save_errors_high_1h
            expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5
            labels:
              severity: page
            annotations:
              description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour'
              summary: 'A high number of iptable save errors within Felix are happening'
          # More than 5 iptables restore errors within the last hour. No "for"
          # clause is set.
          - alert: calico_iptable_restore_errors_high_1h
            expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5
            labels:
              severity: page
            annotations:
              description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
              summary: 'A high number of iptable restore errors within Felix are happening'