Prometheus: Update chart to support federation

This updates the Prometheus chart to support federation. The
Prometheus configuration file is now defined via a template in the
values.yaml file instead of through raw YAML, which allows the
chart's default configuration to be overridden wholesale, as is
required for a hierarchical federated setup. For the same reason,
all of the default rules defined in the chart are stripped out.
Example rules for the various aspects of OSH's infrastructure are
defined in the prometheus/values_overrides directory and are
exercised as part of the normal CI jobs. This also adds a
non-voting federated-monitoring job that vets the ability to
federate Prometheus in a hierarchical fashion with extremely basic
overrides.

Change-Id: I0f121ad5e4f80be4c790dc869955c6b299ca9f26
Signed-off-by: Steve Wilkerson <sw5822@att.com>
This commit is contained in:
Steve Wilkerson 2019-11-19 12:02:31 -06:00 committed by Steve Wilkerson
parent 0edd3e18de
commit fbd34421f2
26 changed files with 1983 additions and 1818 deletions

View File

@ -20,7 +20,7 @@ limitations under the License.
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-bin
name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-bin" | quote }}
data:
apache.sh: |
{{ tuple "bin/_apache.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}

View File

@ -16,34 +16,14 @@ limitations under the License.
{{- if .Values.manifests.configmap_etc }}
{{- $envAll := . }}
{{- if empty $envAll.Values.conf.prometheus.scrape_configs.rule_files -}}
{{- $_ := set $envAll.Values "__rule_files" ( list ) }}
{{- $rulesKeys := keys $envAll.Values.conf.prometheus.rules -}}
{{- range $rule := $rulesKeys }}
{{- $rulesFile := printf "/etc/config/rules/%s.rules" $rule }}
{{- $__rule_files := append $envAll.Values.__rule_files $rulesFile }}
{{- $_ := set $envAll.Values "__rule_files" $__rule_files }}
{{ end }}
{{- $_ := set .Values.conf.prometheus.scrape_configs "rule_files" $envAll.Values.__rule_files -}}
{{- end -}}
{{- if not (empty $envAll.Values.conf.prometheus.scrape_configs.scrape_configs) }}
{{- $_ := set $envAll.Values "__updated_scrape_configs" ( list ) }}
{{- $promScrapeTarget := first $envAll.Values.conf.prometheus.scrape_configs.scrape_configs }}
{{- if (empty $promScrapeTarget.basic_auth) }}
{{- $_ := set $promScrapeTarget "basic_auth" $envAll.Values.endpoints.monitoring.auth.admin }}
{{- end }}
{{- end }}
---
apiVersion: v1
kind: Secret
metadata:
name: prometheus-etc
name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-etc" | quote }}
type: Opaque
data:
prometheus.yml: {{ toYaml .Values.conf.prometheus.scrape_configs | b64enc }}
{{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.prometheus.scrape_configs.template "key" "prometheus.yml" "format" "Secret") | indent 2 }}
{{ range $key, $value := .Values.conf.prometheus.rules }}
{{ $key }}.rules: {{ toYaml $value | b64enc }}
{{ end }}

View File

@ -16,7 +16,6 @@ limitations under the License.
{{- if .Values.manifests.helm_tests }}
{{- $envAll := . }}
{{- $promUserSecret := .Values.secrets.prometheus.admin }}
{{- $serviceAccountName := print .Release.Name "-test" }}
{{ tuple $envAll "tests" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
@ -47,12 +46,12 @@ spec:
- name: PROMETHEUS_ADMIN_USERNAME
valueFrom:
secretKeyRef:
name: {{ $promUserSecret }}
name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }}
key: PROMETHEUS_ADMIN_USERNAME
- name: PROMETHEUS_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: {{ $promUserSecret }}
name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }}
key: PROMETHEUS_ADMIN_PASSWORD
- name: PROMETHEUS_ENDPOINT
value: {{ tuple "monitoring" "internal" "http" $envAll | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }}
@ -68,6 +67,6 @@ spec:
emptyDir: {}
- name: prometheus-bin
configMap:
name: prometheus-bin
name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-bin" | quote }}
defaultMode: 0555
{{- end }}

View File

@ -16,12 +16,11 @@ limitations under the License.
{{- if .Values.manifests.secret_prometheus }}
{{- $envAll := . }}
{{- $secretName := index $envAll.Values.secrets.prometheus.admin }}
---
apiVersion: v1
kind: Secret
metadata:
name: {{ $secretName }}
name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }}
type: Opaque
data:
PROMETHEUS_ADMIN_USERNAME: {{ .Values.endpoints.monitoring.auth.admin.username | b64enc }}

View File

@ -19,15 +19,14 @@ limitations under the License.
{{- $mounts_prometheus := .Values.pod.mounts.prometheus.prometheus }}
{{- $mounts_prometheus_init := .Values.pod.mounts.prometheus.init_container }}
{{- $promUserSecret := .Values.secrets.prometheus.admin }}
{{- $serviceAccountName := printf "%s-%s" .Release.Name "prometheus" }}
{{ tuple $envAll "prometheus" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
{{- $rcControllerName := printf "%s-%s" $envAll.Release.Name "prometheus" }}
{{ tuple $envAll "prometheus" $rcControllerName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
name: {{ $serviceAccountName }}
name: {{ $rcControllerName | quote }}
rules:
- apiGroups:
- ""
@ -55,20 +54,20 @@ rules:
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
name: {{ $serviceAccountName }}
name: {{ $rcControllerName | quote }}
subjects:
- kind: ServiceAccount
name: {{ $serviceAccountName }}
name: {{ $rcControllerName | quote }}
namespace: {{ .Release.Namespace }}
roleRef:
kind: ClusterRole
name: {{ $serviceAccountName }}
name: {{ $rcControllerName | quote }}
apiGroup: rbac.authorization.k8s.io
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: prometheus
name: {{ $rcControllerName | quote }}
annotations:
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
labels:
@ -90,7 +89,7 @@ spec:
configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
spec:
{{ dict "envAll" $envAll "application" "api" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
serviceAccountName: {{ $serviceAccountName }}
serviceAccountName: {{ $rcControllerName | quote }}
affinity:
{{ tuple $envAll "prometheus" "api" | include "helm-toolkit.snippets.kubernetes_pod_anti_affinity" | indent 8 }}
nodeSelector:
@ -129,12 +128,12 @@ spec:
- name: PROMETHEUS_ADMIN_USERNAME
valueFrom:
secretKeyRef:
name: {{ $promUserSecret }}
name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }}
key: PROMETHEUS_ADMIN_USERNAME
- name: PROMETHEUS_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: {{ $promUserSecret }}
name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }}
key: PROMETHEUS_ADMIN_PASSWORD
volumeMounts:
- name: pod-tmp
@ -169,6 +168,10 @@ spec:
port: {{ tuple "monitoring" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
initialDelaySeconds: 30
timeoutSeconds: 30
env:
{{- if .Values.pod.env.prometheus }}
{{ include "helm-toolkit.utils.to_k8s_env_vars" .Values.pod.env.prometheus | indent 12 }}
{{- end }}
volumeMounts:
- name: pod-tmp
mountPath: /tmp
@ -202,11 +205,11 @@ spec:
emptyDir: {}
- name: prometheus-etc
secret:
secretName: prometheus-etc
secretName: {{ printf "%s-%s" $envAll.Release.Name "prometheus-etc" | quote }}
defaultMode: 0444
- name: prometheus-bin
configMap:
name: prometheus-bin
name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-bin" | quote }}
defaultMode: 0555
{{ if $mounts_prometheus.volumes }}{{ toYaml $mounts_prometheus.volumes | indent 8 }}{{ end }}
{{- if not .Values.storage.enabled }}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,31 @@
# Example Prometheus alerting rules for Alertmanager, expressed as a Helm
# values override under conf.prometheus.rules.alertmanager.
#
# The {{ ... }} sequences inside annotations are alert-template text and are
# kept verbatim as string data.
---
conf:
  prometheus:
    rules:
      alertmanager:
        groups:
          - name: alertmanager.rules
            rules:
              # Per-service count of instances sharing a config hash, divided
              # by the operator-reported replica count; alerts when the ratio
              # is not 1 for 5 minutes.
              - alert: AlertmanagerConfigInconsistent
                expr: >-
                  count_values("config_hash", alertmanager_config_hash) BY (service)
                  / ON(service) GROUP_LEFT()
                  label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)")
                  != 1
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: 'The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.'
                  summary: Alertmanager configurations are inconsistent
              # Expected replica count divided by the number of "up" targets
              # per job; alerts when the ratio is not 1 for 5 minutes.
              - alert: AlertmanagerDownOrMissing
                expr: >-
                  label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)")
                  / ON(job) GROUP_RIGHT() sum(up) BY (job)
                  != 1
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
                  summary: Alertmanager down or not discovered
              # Alerts when the last configuration reload has reported failure
              # for 10 minutes.
              - alert: FailedReload
                expr: alertmanager_config_last_reload_successful == 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}."
                  summary: Alertmanager configuration reload has failed

View File

@ -0,0 +1,71 @@
# Example Prometheus alerting rules for Ceph, expressed as a Helm values
# override under conf.prometheus.rules.ceph.
#
# The {{ ... }} sequences inside annotations are alert-template text and are
# kept verbatim as string data.
---
conf:
  prometheus:
    rules:
      ceph:
        groups:
          - name: ceph.rules
            rules:
              # Exporter health: the ceph_health_status series is absent.
              - alert: prom_exporter_ceph_unavailable
                expr: absent(ceph_health_status)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Ceph exporter is not collecting metrics or is not available for past 10 minutes
                  title: Ceph exporter is not collecting metrics or is not available
              - alert: no_active_ceph_mgr
                expr: count(up{job="ceph-mgr"} == 1) == 0
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: 'no ceph active mgr is present or all ceph mgr are down'
                  summary: 'no ceph active mgr is present'
              - alert: ceph_mon_quorum_low
                expr: ceph_mon_quorum_count < 3
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
                  summary: 'ceph high availability is at risk'
              # Used bytes as a percentage of total cluster bytes.
              - alert: ceph_cluster_usage_high
                expr: 100* ceph_cluster_total_used_bytes/ceph_cluster_total_bytes > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'ceph cluster capacity usage more than 80 percent'
                  summary: 'ceph cluster usage is more than 80 percent'
              - alert: ceph_placement_group_degrade_pct_high
                expr: 100 * sum(ceph_pg_degraded)/sum(ceph_osd_numpg) > 80
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: 'ceph placement group degradation is more than 80 percent'
                  summary: 'ceph placement groups degraded'
              - alert: ceph_osd_down_pct_high
                expr: 100 * count(ceph_osd_up==0)/count(ceph_osd_metadata) > 80
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: 'ceph OSDs down percent is more than 80 percent'
                  summary: 'ceph OSDs down percent is high'
              # Per-OSD alerts; annotations read the ceph_daemon and instance
              # labels of the firing series.
              - alert: ceph_osd_down
                expr: ceph_osd_up == 0
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
                  summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}'
              - alert: ceph_osd_out
                expr: ceph_osd_in == 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'
                  summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}'

View File

@ -0,0 +1,379 @@
# Example Prometheus alerting rules for Kubernetes infrastructure (Calico,
# etcd, kubelet, API server, controller manager and kube-state-metrics
# objects), expressed as a Helm values override under
# conf.prometheus.rules.kubernetes.
#
# The {{ ... }} sequences inside annotations are alert-template text and are
# kept verbatim as string data.
---
conf:
  prometheus:
    rules:
      kubernetes:
        groups:
          # Calico/Felix: each rule also fires when its metric is absent.
          - name: calico.rules
            rules:
              - alert: prom_exporter_calico_unavailable
                expr: absent(felix_host)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Calico exporter is not collecting metrics or is not available for past 10 minutes
                  title: Calico exporter is not collecting metrics or is not available
              - alert: calico_datapane_failures_high_1h
                expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5
                labels:
                  severity: page
                annotations:
                  description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour'
                  summary: 'A high number of dataplane failures within Felix are happening'
              - alert: calico_datapane_address_msg_batch_size_high_5m
                expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size'
                  summary: 'Felix address message batch size is higher'
              - alert: calico_datapane_iface_msg_batch_size_high_5m
                expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size'
                  summary: 'Felix interface message batch size is higher'
              - alert: calico_ipset_errors_high_1h
                expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5
                labels:
                  severity: page
                annotations:
                  description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour'
                  summary: 'A high number of ipset errors within Felix are happening'
              - alert: calico_iptable_save_errors_high_1h
                expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5
                labels:
                  severity: page
                annotations:
                  description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour'
                  summary: 'A high number of iptable save errors within Felix are happening'
              - alert: calico_iptable_restore_errors_high_1h
                expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5
                labels:
                  severity: page
                annotations:
                  description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
                  summary: 'A high number of iptable restore errors within Felix are happening'
          # etcd: the failed-request alerts come in warning/critical pairs
          # that share an alert name and differ in threshold, duration and
          # severity.
          - name: etcd3.rules
            rules:
              - alert: etcd_InsufficientMembers
                expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
                for: 3m
                labels:
                  severity: critical
                annotations:
                  description: If one more etcd member goes down the cluster will be unavailable
                  summary: etcd cluster insufficient members
              - alert: etcd_NoLeader
                expr: etcd_server_has_leader{job="etcd"} == 0
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: etcd member {{ $labels.instance }} has no leader
                  summary: etcd member has no leader
              - alert: etcd_HighNumberOfLeaderChanges
                expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
                labels:
                  severity: warning
                annotations:
                  description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
                  summary: a high number of leader changes within the etcd cluster are happening
              - alert: etcd_HighNumberOfFailedGRPCRequests
                expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
                  summary: a high number of gRPC requests are failing
              - alert: etcd_HighNumberOfFailedGRPCRequests
                expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
                  summary: a high number of gRPC requests are failing
              - alert: etcd_GRPCRequestsSlow
                expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
                for: 10m
                labels:
                  severity: critical
                annotations:
                  description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
                  summary: slow gRPC requests
              - alert: etcd_HighNumberOfFailedHTTPRequests
                expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
                  summary: a high number of HTTP requests are failing
              - alert: etcd_HighNumberOfFailedHTTPRequests
                expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
                  summary: a high number of HTTP requests are failing
              - alert: etcd_HTTPRequestsSlow
                expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
                  summary: slow HTTP requests
              - alert: etcd_EtcdMemberCommunicationSlow
                expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
                  summary: etcd member communication is slow
              - alert: etcd_HighNumberOfFailedProposals
                expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
                labels:
                  severity: warning
                annotations:
                  description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
                  summary: a high number of proposals within the etcd cluster are failing
              - alert: etcd_HighFsyncDurations
                expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: etcd instance {{ $labels.instance }} fsync durations are high
                  summary: high fsync durations
              - alert: etcd_HighCommitDurations
                expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: etcd instance {{ $labels.instance }} commit durations are high
                  summary: high commit durations
          - name: kubelet.rules
            rules:
              - alert: K8SNodeNotReady
                expr: kube_node_status_condition{condition="Ready", status="unknown"} == 1 or kube_node_status_condition{condition="Ready", status="false"} == 1
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than a minute
                  summary: '{{ $labels.node }} Node status is NotReady and {{ $labels.status }}'
              # The two K8SManyNodesNotReady rules cover Ready=unknown and
              # Ready=false separately. Both require more than one such node
              # and a ratio above 0.2, so the description states 20%.
              - alert: K8SManyNodesNotReady
                expr: count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) / count(kube_node_status_condition{condition="Ready", status="unknown"})) > 0.2
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: '{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state.'
                  summary: Many Kubernetes nodes are Not Ready
              - alert: K8SManyNodesNotReady
                expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="false"} == 1) / count(kube_node_status_condition{condition="Ready", status="false"})) > 0.2
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: '{{ $value }} Kubernetes nodes (more than 20%) are in the NotReady state.'
                  summary: Many Kubernetes nodes are Not Ready
              - alert: K8SNodesNotReady
                expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 0 or count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 0
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: '{{ $value }} nodes are in the NotReady state.'
                  summary: One or more Kubernetes nodes are Not Ready
              # Lower tier (ratio above 0.03): kept at warning so it does not
              # share the same alertname + severity label set as the
              # higher-threshold rule below, whose result carries no other
              # distinguishing labels.
              - alert: K8SKubeletDown
                expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
                for: 1m
                labels:
                  severity: warning
                annotations:
                  description: Prometheus failed to scrape {{ $value }}% of kubelets.
                  summary: Many Kubelets cannot be scraped
              # Upper tier: no kubelet is up at all, or the ratio is above 0.1.
              - alert: K8SKubeletDown
                expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
                  summary: Many Kubelets cannot be scraped
              - alert: K8SKubeletTooManyPods
                expr: kubelet_running_pod_count > 100
                labels:
                  severity: warning
                annotations:
                  description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
                  summary: Kubelet is close to pod limit
          - name: kube-apiserver.rules
            rules:
              - alert: K8SApiserverDown
                expr: absent(up{job="apiserver"} == 1)
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
                  summary: API server unreachable
              - alert: K8SApiServerLatency
                expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
                  summary: Kubernetes apiserver latency is high
          - name: kube-controller-manager.rules
            rules:
              - alert: K8SControllerManagerDown
                expr: absent(up{job="kube-controller-manager-discovery"} == 1)
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
                  runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
                  summary: Controller manager is down
          # Object-state alerts built on kube_* and kubelet_volume_* series.
          - name: kubernetes-object.rules
            rules:
              - alert: prom_exporter_kube_state_metrics_unavailable
                expr: absent(kube_node_info)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes
                  title: kube-state-metrics exporter is not collecting metrics or is not available
              - alert: kube_statefulset_replicas_unavailable
                expr: kube_statefulset_status_replicas < kube_statefulset_replicas
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
                  summary: '{{$labels.statefulset}}: has insufficient replicas.'
              - alert: daemonsets_misscheduled
                expr: kube_daemonset_status_number_misscheduled > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
                  summary: 'Daemonsets not scheduled correctly'
              - alert: daemonsets_not_scheduled
                expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
                  summary: 'Less than desired number of daemonsets scheduled'
              - alert: daemonset_pods_unavailable
                expr: kube_daemonset_status_number_unavailable > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable'
                  summary: 'Daemonset pods unavailable, due to one of many reasons'
              - alert: deployment_replicas_unavailable
                expr: kube_deployment_status_replicas_unavailable > 0
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
                  summary: '{{$labels.deployment}}: has insufficient replicas.'
              - alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable
                expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
                  summary: '{{$labels.deployment}}: has insufficient replicas during a rolling update.'
              - alert: job_status_failed
                expr: kube_job_status_failed > 0
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'Job {{$labels.exported_job}} is in failed status'
                  summary: '{{$labels.exported_job}} has failed status'
              - alert: pod_status_pending
                expr: kube_pod_status_phase{phase="Pending"} == 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
                  summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
              - alert: pod_error_image_pull
                expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
                  summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
              - alert: pod_status_error_image_pull_backoff
                expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes'
                  summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
              - alert: pod_error_crash_loop_back_off
                expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes'
                  summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
              - alert: pod_error_config_error
                expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes'
                  summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
              - alert: replicaset_missing_replicas
                expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
                  summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
              - alert: pod_container_terminated
                expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
                  summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
              - alert: volume_claim_capacity_high_utilization
                expr: 100 * kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity'
                  summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'

View File

@ -0,0 +1,105 @@
# Example Prometheus alerting rules for the logging stack (Fluentd and
# Elasticsearch), expressed as a Helm values override under
# conf.prometheus.rules.logging.
#
# The {{ ... }} sequences inside annotations are alert-template text and are
# kept verbatim as string data.
---
conf:
  prometheus:
    rules:
      logging:
        groups:
          - name: fluentd.rules
            rules:
              - alert: prom_exporter_fluentd_unavailable
                expr: absent(fluentd_up)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes
                  title: Fluentd exporter is not collecting metrics or is not available
              - alert: fluentd_not_running
                expr: fluentd_up == 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes'
                  summary: 'Fluentd is down'
          - name: elasticsearch.rules
            rules:
              - alert: prom_exporter_elasticsearch_unavailable
                expr: absent(elasticsearch_cluster_health_status)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes
                  title: Elasticsearch exporter is not collecting metrics or is not available
              - alert: es_high_process_open_files_count
                expr: sum(elasticsearch_process_open_files_count) by (host) > 64000
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch at {{ $labels.host }} has more than 64000 process open file count.'
                  summary: 'Elasticsearch has a very high process open file count.'
              - alert: es_high_process_cpu_percent
                expr: elasticsearch_process_cpu_percent > 95
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.'
                  summary: 'Elasticsearch process cpu usage is more than 95 percent.'
              # Used space (size minus free) as a percentage of the data
              # filesystem size.
              - alert: es_fs_usage_high
                expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.'
                  summary: 'Elasticsearch filesystem usage is high.'
              - alert: es_unassigned_shards
                expr: elasticsearch_cluster_health_unassigned_shards > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch has {{ $value }} unassigned shards.'
                  summary: 'Elasticsearch has unassigned shards and hence an unhealthy cluster state.'
              - alert: es_cluster_health_timed_out
                expr: elasticsearch_cluster_health_timed_out > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch cluster health status call timed out {{ $value }} times.'
                  summary: 'Elasticsearch cluster health status calls are timing out.'
              # Scores the cluster as 2 * green + yellow and alerts below 2.
              - alert: es_cluster_health_status_alert
                expr: (sum(elasticsearch_cluster_health_status{color="green"})*2)+sum(elasticsearch_cluster_health_status{color="yellow"}) < 2
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.'
                  summary: 'Elasticsearch cluster health status is not green.'
              - alert: es_cluster_health_too_few_nodes_running
                expr: elasticsearch_cluster_health_number_of_nodes < 3
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'There are only {{$value}} < 3 ElasticSearch nodes running'
                  summary: 'ElasticSearch running on less than 3 nodes'
              - alert: es_cluster_health_too_few_data_nodes_running
                expr: elasticsearch_cluster_health_number_of_data_nodes < 3
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
                  summary: 'ElasticSearch running on less than 3 data nodes'

View File

@ -0,0 +1,240 @@
# Example node-exporter alerting rules, supplied as a values override for the
# Prometheus chart (conf.prometheus.rules.<name>.groups).
conf:
  prometheus:
    rules:
      nodes:
        groups:
          - name: nodes.rules
            rules:
              - alert: prom_exporter_node_unavailable
                expr: absent(node_uname_info)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: node exporter is not collecting metrics or is not available for past 10 minutes
                  title: node exporter is not collecting metrics or is not available
              # Fires when free space drops below 20% of the filesystem size
              # (i.e. the filesystem is more than 80% full).
              - alert: node_filesystem_full_80percent
                expr: >-
                  sort(node_filesystem_free{fstype =~ "xfs|ext[34]"} < node_filesystem_size{fstype =~ "xfs|ext[34]"}
                  * 0.2) / 1024 ^ 3
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: >-
                    {{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
                    got less than 20% space left on its filesystem.
                  summary: '{{$labels.alias}}: Filesystem is running out of space soon.'
              - alert: node_filesystem_full_in_4h
                expr: predict_linear(node_filesystem_free{fstype =~ "xfs|ext[34]"}[1h], 4 * 3600) <= 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: >-
                    {{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
                    is running out of space in approx. 4 hours
                  summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.'
              - alert: node_filedescriptors_full_in_3h
                expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum
                for: 20m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.alias}} is running out of available file descriptors in approx. 3 hours'
                  summary: '{{$labels.alias}} is running out of available file descriptors in 3 hours.'
              - alert: node_load1_90percent
                expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9
                for: 1h
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.alias}} is running with > 90% total load for at least 1h.'
                  summary: '{{$labels.alias}}: Running on high load.'
              - alert: node_cpu_util_90percent
                expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90
                for: 1h
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.alias}} has total CPU utilization over 90% for at least 1h.'
                  summary: '{{$labels.alias}}: High CPU utilization.'
              - alert: node_ram_using_90percent
                expr: >-
                  node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal
                  * 0.1
                for: 30m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.'
                  summary: '{{$labels.alias}}: Using lots of RAM.'
              - alert: node_swap_using_80percent
                expr: >-
                  node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached)
                  > node_memory_SwapTotal * 0.8
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now.'
                  summary: '{{$labels.alias}}: Running out of swap soon.'
              # load15 is divided by the number of CPUs reported for the host, so
              # 1 means the run queue matches the CPU count. The threshold must be
              # above zero, otherwise the rule matches every node permanently.
              - alert: node_high_cpu_load
                expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 1
                for: 1m
                labels:
                  severity: warning
                annotations:
                  description: '{{$labels.alias}} is running with load15 per CPU >= 1 for at least 1 minute: {{$value}}'
                  summary: '{{$labels.alias}}: Running on high load: {{$value}}'
              - alert: node_high_memory_load
                expr: >-
                  (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers
                  + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85
                for: 1m
                labels:
                  severity: warning
                annotations:
                  description: 'Host memory usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.'
                  summary: Server memory is almost full
              - alert: node_high_storage_load
                expr: >-
                  (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})
                  / node_filesystem_size{mountpoint="/"} * 100 > 85
                for: 30s
                labels:
                  severity: warning
                annotations:
                  description: 'Host storage usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.'
                  summary: Server storage is almost full
              # Used swap (total - free) above 40% of total swap. The comparison
              # must be ">" so the alert fires on high usage, not on low usage.
              - alert: node_high_swap
                expr: (node_memory_SwapTotal - node_memory_SwapFree) > (node_memory_SwapTotal * 0.4)
                for: 1m
                labels:
                  severity: warning
                annotations:
                  description: 'Host system has a high swap usage of {{ humanize $value }}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.'
                  summary: Server has a high swap usage
              - alert: node_high_network_drop_rcv
                expr: node_network_receive_drop{device!="lo"} > 3000
                for: 30s
                labels:
                  severity: warning
                annotations:
                  description: 'Host system has an unusually high drop in network reception ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}'
                  summary: Server has a high receive drop
              - alert: node_high_network_drop_send
                expr: node_network_transmit_drop{device!="lo"} > 3000
                for: 30s
                labels:
                  severity: warning
                annotations:
                  description: 'Host system has an unusually high drop in network transmission ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}'
                  summary: Server has a high transmit drop
              - alert: node_high_network_errs_rcv
                expr: node_network_receive_errs{device!="lo"} > 3000
                for: 30s
                labels:
                  severity: warning
                annotations:
                  description: 'Host system has an unusually high error rate in network reception ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}'
                  summary: Server has unusual high reception errors
              - alert: node_high_network_errs_send
                expr: node_network_transmit_errs{device!="lo"} > 3000
                for: 30s
                labels:
                  severity: warning
                annotations:
                  description: 'Host system has an unusually high error rate in network transmission ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}'
                  summary: Server has unusual high transmission errors
              - alert: node_network_conntrack_usage_80percent
                expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8)
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit'
                  summary: '{{$labels.instance}}: available network conntrack entries are low.'
              - alert: node_entropy_available_low
                expr: node_entropy_available_bits < 300
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.instance}} has available entropy bits of {{ $value }} which is less than required of 300'
                  summary: '{{$labels.instance}}: is low on entropy bits.'
              - alert: node_hwmon_high_cpu_temp
                expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}'
                  summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}'
              - alert: node_vmstat_paging_rate_high
                expr: irate(node_vmstat_pgpgin[5m]) > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}'
                  summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}'
              # NOTE(review): the instance matcher is hard-coded to addresses
              # beginning with 172.17.0.1 (the dots are regex wildcards), so this
              # rule only sees targets in that range - confirm that is intended.
              - alert: node_xfs_block_allocation_high
                expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}'
                  summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}'
              - alert: node_network_bond_slaves_down
                expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).'
                  summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)'
              - alert: node_numa_memory_used
                expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}'
                  summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}'
              - alert: node_ntp_clock_skew_high
                expr: abs(node_ntp_drift_seconds) > 2
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}'
                  summary: '{{$labels.alias}}: time is skewed by : {{$value}} seconds'
              - alert: node_disk_read_latency
                expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 40
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.device}} has a high read latency of {{ $value }}'
                  summary: 'High read latency observed for device {{ $labels.device }}'
              - alert: node_disk_write_latency
                expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 40
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: '{{$labels.device}} has a high write latency of {{ $value }}'
                  summary: 'High write latency observed for device {{ $labels.device }}'

View File

@ -0,0 +1,315 @@
# Example alerting rules for the OpenStack control plane (MariaDB, the
# OpenStack exporter and RabbitMQ), supplied as a values override for the
# Prometheus chart (conf.prometheus.rules.<name>.groups).
conf:
  prometheus:
    rules:
      openstack:
        groups:
          - name: mariadb.rules
            rules:
              - alert: prom_exporter_mariadb_unavailable
                expr: absent(mysql_up)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: MariaDB exporter is not collecting metrics or is not available for past 10 minutes
                  title: MariaDB exporter is not collecting metrics or is not available
              - alert: mariadb_table_lock_wait_high
                expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Mariadb has high table lock waits of {{ $value }} percentage'
                  summary: 'Mariadb table lock waits are high'
              - alert: mariadb_node_not_ready
                expr: mysql_global_status_wsrep_ready != 1
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: '{{$labels.job}} on {{$labels.instance}} is not ready.'
                  summary: 'Galera cluster node not ready'
              - alert: mariadb_galera_node_out_of_sync
                expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)'
                  summary: 'Galera cluster node out of sync'
              - alert: mariadb_innodb_replication_fallen_behind
                expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'The mysql innodb replication has fallen behind and is not recovering'
                  summary: 'MySQL innodb replication is lagging'
          - name: openstack.rules
            rules:
              - alert: prom_exporter_openstack_unavailable
                expr: absent(openstack_exporter_cache_refresh_duration_seconds)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Openstack exporter is not collecting metrics or is not available for past 10 minutes
                  title: Openstack exporter is not collecting metrics or is not available
              - alert: os_glance_api_availability
                expr: openstack_check_glance_api != 1
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
                  summary: 'Glance API is not available at {{$labels.url}}'
              - alert: os_nova_api_availability
                expr: openstack_check_nova_api != 1
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
                  summary: 'Nova API is not available at {{$labels.url}}'
              - alert: os_keystone_api_availability
                expr: openstack_check_keystone_api != 1
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
                  summary: 'Keystone API is not available at {{$labels.url}}'
              - alert: os_neutron_api_availability
                expr: openstack_check_neutron_api != 1
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
                  summary: 'Neutron API is not available at {{$labels.url}}'
              - alert: os_neutron_metadata_agent_availability
                expr: openstack_services_neutron_metadata_agent_down_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'One or more neutron metadata_agents are not available for more than 5 minutes'
                  summary: 'One or more neutron metadata_agents are not available'
              - alert: os_neutron_openvswitch_agent_availability
                expr: openstack_services_neutron_openvswitch_agent_down_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'One or more neutron openvswitch agents are not available for more than 5 minutes'
                  summary: 'One or more neutron openvswitch agents are not available'
              - alert: os_neutron_dhcp_agent_availability
                expr: openstack_services_neutron_dhcp_agent_down_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'One or more neutron dhcp agents are not available for more than 5 minutes'
                  summary: 'One or more neutron dhcp agents are not available'
              - alert: os_neutron_l3_agent_availability
                expr: openstack_services_neutron_l3_agent_down_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'One or more neutron L3 agents are not available for more than 5 minutes'
                  summary: 'One or more neutron L3 agents are not available'
              - alert: os_swift_api_availability
                expr: openstack_check_swift_api != 1
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes'
                  summary: 'Swift API is not available at {{$labels.url}}'
              - alert: os_cinder_api_availability
                expr: openstack_check_cinder_api != 1
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Cinder API is not available at {{$labels.url}} for more than 5 minutes'
                  summary: 'Cinder API is not available at {{$labels.url}}'
              - alert: os_cinder_scheduler_availability
                expr: openstack_services_cinder_cinder_scheduler != 1
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Cinder scheduler is not available for more than 5 minutes'
                  summary: 'Cinder scheduler is not available'
              - alert: os_heat_api_availability
                expr: openstack_check_heat_api != 1
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes'
                  summary: 'Heat API is not available at {{$labels.url}}'
              - alert: os_nova_compute_disabled
                expr: openstack_services_nova_compute_disabled_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
                  summary: 'Openstack compute service nova-compute is disabled on some hosts'
              - alert: os_nova_conductor_disabled
                expr: openstack_services_nova_conductor_disabled_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
                  summary: 'Openstack compute service nova-conductor is disabled on some hosts'
              - alert: os_nova_consoleauth_disabled
                expr: openstack_services_nova_consoleauth_disabled_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
                  summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
              - alert: os_nova_scheduler_disabled
                expr: openstack_services_nova_scheduler_disabled_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
                  summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
              - alert: os_nova_compute_down
                expr: openstack_services_nova_compute_down_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'nova-compute is down on certain hosts for more than 5 minutes'
                  summary: 'Openstack compute service nova-compute is down on some hosts'
              - alert: os_nova_conductor_down
                expr: openstack_services_nova_conductor_down_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'nova-conductor is down on certain hosts for more than 5 minutes'
                  summary: 'Openstack compute service nova-conductor is down on some hosts'
              - alert: os_nova_consoleauth_down
                expr: openstack_services_nova_consoleauth_down_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'nova-consoleauth is down on certain hosts for more than 5 minutes'
                  summary: 'Openstack compute service nova-consoleauth is down on some hosts'
              - alert: os_nova_scheduler_down
                expr: openstack_services_nova_scheduler_down_total > 0
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'nova-scheduler is down on certain hosts for more than 5 minutes'
                  summary: 'Openstack compute service nova-scheduler is down on some hosts'
              # The three usage alerts below compute used / (used + free) as a percentage.
              - alert: os_vm_vcpu_usage_high
                expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Openstack VM vcpu usage is high at {{$value}} percent'
                  summary: 'Openstack VM vcpu usage is high'
              - alert: os_vm_ram_usage_high
                expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Openstack VM RAM usage is high at {{$value}} percent'
                  summary: 'Openstack VM RAM usage is high'
              - alert: os_vm_disk_usage_high
                expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80
                for: 5m
                labels:
                  severity: page
                annotations:
                  description: 'Openstack VM Disk usage is high at {{$value}} percent'
                  summary: 'Openstack VM Disk usage is high'
          - name: rabbitmq.rules
            rules:
              # NOTE(review): "pratitions" is a misspelling of "partitions". The
              # alert name is left unchanged because anything matching on the
              # alertname (routes, silences, dashboards) may depend on it.
              - alert: rabbitmq_network_pratitions_detected
                expr: min(partitions) by(instance) > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions'
                  summary: 'RabbitMQ Network partitions detected'
              - alert: rabbitmq_down
                expr: min(rabbitmq_up) by(instance) != 1
                for: 10m
                labels:
                  severity: page
                annotations:
                  description: 'RabbitMQ Server instance {{ $labels.instance }} is down'
                  summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down the last 10 mins'
              - alert: rabbitmq_file_descriptor_usage_high
                expr: fd_used * 100 /fd_total > 80
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.'
                  summary: 'RabbitMQ file descriptors usage is high for last 10 mins'
              - alert: rabbitmq_node_disk_free_alarm
                expr: node_disk_free_alarm > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.'
                  summary: 'RabbitMQ disk space usage is high'
              - alert: rabbitmq_node_memory_alarm
                expr: node_mem_alarm > 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.'
                  summary: 'RabbitMQ memory usage is high'
              - alert: rabbitmq_less_than_3_nodes
                expr: running < 3
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'RabbitMQ Server has less than 3 nodes running.'
                  summary: 'RabbitMQ server is at risk of losing data'
              - alert: rabbitmq_queue_messages_returned_high
                expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: 'RabbitMQ Server is returning more than 50 percent of messages received.'
                  summary: 'RabbitMQ server is returning more than 50 percent of messages received.'
              - alert: rabbitmq_consumers_low_utilization
                expr: queue_consumer_utilisation < .4
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: 'RabbitMQ consumers message consumption speed is low'
                  summary: 'RabbitMQ consumers message consumption speed is low'
              - alert: rabbitmq_high_message_load
                expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: 'RabbitMQ has high message load. Total Queue depth > 17000 or growth more than 4000 messages.'
                  summary: 'RabbitMQ has high message load'

View File

@ -0,0 +1,39 @@
# Example PostgreSQL exporter alerting rules, supplied as a values override
# for the Prometheus chart (conf.prometheus.rules.<name>.groups).
conf:
  prometheus:
    rules:
      postgresql:
        groups:
          - name: postgresql.rules
            rules:
              - alert: prom_exporter_postgresql_unavailable
                expr: absent(pg_static)
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: postgresql exporter is not collecting metrics or is not available for past 10 minutes
                  title: postgresql exporter is not collecting metrics or is not available
              # Only evaluated on replicas: the lag series is joined on instance
              # with pg_replication_is_replica == 1.
              - alert: pg_replication_fallen_behind
                expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica == 1)
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: 'Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }}'
                  title: Postgres Replication lag is over 2 minutes
              - alert: pg_connections_too_high
                expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95
                for: 5m
                labels:
                  # "warning" matches the severity value used by the other rules
                  severity: warning
                  channel: database
                annotations:
                  title: 'Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum'
              - alert: pg_deadlocks_detected
                # instance is kept in the aggregation because the description
                # below reads $labels.instance; aggregating by datname alone
                # would drop that label and render it empty.
                expr: sum by(instance, datname) (rate(pg_stat_database_deadlocks[1m])) > 0
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: 'postgresql at {{$labels.instance}} is showing {{$value}} rate of deadlocks for database {{$labels.datname}}'
                  title: Postgres server is experiencing deadlocks

View File

@ -0,0 +1 @@
../common/000-install-packages.sh

View File

@ -0,0 +1 @@
../common/005-deploy-k8s.sh

View File

@ -0,0 +1 @@
../common/020-ingress.sh

View File

@ -0,0 +1 @@
../osh-infra-monitoring/030-nfs-provisioner.sh

View File

@ -0,0 +1 @@
../common/040-ldap.sh

View File

@ -0,0 +1 @@
../common/070-kube-state-metrics.sh

View File

@ -0,0 +1 @@
../common/080-node-exporter.sh

View File

@ -0,0 +1,68 @@
#!/bin/bash
# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
set -xe

#NOTE: Lint and package chart
make prometheus

#NOTE: Write one values override per Prometheus instance. The files differ
# only in the endpoint host names, so they are generated from one template.
for instance in one two three; do
  tee "/tmp/prometheus-${instance}.yaml" << EOF
endpoints:
  monitoring:
    hosts:
      default: prom-metrics-${instance}
      public: prometheus-${instance}
manifests:
  network_policy: false
EOF
done

#NOTE: Collect the example rule overrides once; the same set is passed to
# every release. A glob is used instead of parsing `ls` output, and an array
# keeps each --values flag as a single argument.
rules_overrides=()
for rules_file in ./prometheus/values_overrides/*; do
  # An unmatched glob is left as the literal pattern; skip it rather than
  # handing helm a path that does not exist.
  [ -e "$rules_file" ] || continue
  rules_overrides+=("--values=$rules_file")
done

#NOTE: Deploy command
# The Helm release name is "prometheus-" plus the loop value
# (e.g. prometheus-prometheus-one); the values file is /tmp/<loop value>.yaml.
for release in prometheus-one prometheus-two prometheus-three; do
  helm upgrade --install "prometheus-$release" ./prometheus \
    --namespace=osh-infra \
    --values="/tmp/$release.yaml" \
    "${rules_overrides[@]}"

  #NOTE: Wait for deploy
  ./tools/deployment/common/wait-for-pods.sh osh-infra

  #NOTE: Validate Deployment info
  helm status "prometheus-$release"
  helm test "prometheus-$release"
done

View File

@ -0,0 +1,66 @@
#!/bin/bash
# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
set -xe

release_name=federated-prometheus
values_file="/tmp/${release_name}.yaml"

# Values for the federating Prometheus: the chart's default configuration
# template is replaced with a single scrape job that pulls every series
# ('match[]' on any metric name) from the /federate endpoint of the three
# Prometheus services listed under static_configs.
tee "${values_file}" << 'EOF'
endpoints:
  monitoring:
    hosts:
      default: prom-metrics-federate
      public: prometheus-federate
manifests:
  network_policy: false
conf:
  prometheus:
    scrape_configs:
      template: |
        global:
          scrape_interval: 60s
          evaluation_interval: 60s
        scrape_configs:
          - job_name: 'federate'
            scrape_interval: 15s
            honor_labels: true
            metrics_path: '/federate'
            params:
              'match[]':
                - '{__name__=~".+"}'
            static_configs:
              - targets:
                  - 'prometheus-one.osh-infra.svc.cluster.local:80'
                  - 'prometheus-two.osh-infra.svc.cluster.local:80'
                  - 'prometheus-three.osh-infra.svc.cluster.local:80'
EOF

#NOTE: Lint and package chart
make prometheus

#NOTE: Deploy command
helm upgrade --install "${release_name}" ./prometheus \
  --namespace=osh-infra \
  --values="${values_file}"

#NOTE: Wait for deploy
./tools/deployment/common/wait-for-pods.sh osh-infra

#NOTE: Validate Deployment info
helm status "${release_name}"
helm test "${release_name}"

View File

@ -0,0 +1,33 @@
#!/bin/bash
set -xe

export CHROMEDRIVER="${CHROMEDRIVER:="/etc/selenium/chromedriver"}"
export ARTIFACTS_DIR="${ARTIFACTS_DIR:="/tmp/artifacts/"}"
export PROMETHEUS_USER="admin"
export PROMETHEUS_PASSWORD="changeme"

# Runs the Selenium check against one Prometheus endpoint, then renames the
# three screenshots found in ARTIFACTS_DIR so their file names carry the
# instance label (presumably so the next run does not overwrite them).
#   $1 - host name exported as PROMETHEUS_URI for the Selenium script
#   $2 - label inserted into the screenshot file names (e.g. One, Federated)
capture_prometheus_screenshots() {
  local uri="$1"
  local label="$2"
  local page

  export PROMETHEUS_URI="$uri"
  python3 tools/gate/selenium/prometheusSelenium.py

  for page in Command_Line_Flags Dashboard Runtime_Info; do
    # Always join with "/": the directory may or may not end in a slash, and
    # a doubled slash is harmless while a missing one yields a wrong path.
    mv "${ARTIFACTS_DIR}/Prometheus_${page}.png" \
      "${ARTIFACTS_DIR}/Prometheus_${label}_${page}.png"
  done
}

capture_prometheus_screenshots "prometheus-one.osh-infra.svc.cluster.local" One
capture_prometheus_screenshots "prometheus-two.osh-infra.svc.cluster.local" Two
capture_prometheus_screenshots "prometheus-three.osh-infra.svc.cluster.local" Three
capture_prometheus_screenshots "prometheus-federate.osh-infra.svc.cluster.local" Federated

View File

@ -19,9 +19,15 @@ set -xe
#NOTE: Lint and package chart
make prometheus
# Build one --values flag for every file in the chart's values_overrides
# directory. The directory is globbed directly instead of parsing `ls` output.
rules_overrides=""
for rules_file in ./prometheus/values_overrides/*; do
  # An unmatched glob is left as the literal pattern; skip it so helm is not
  # given a path that does not exist.
  [ -e "$rules_file" ] || continue
  rules_overrides="$rules_overrides --values=$rules_file"
done
#NOTE: Deploy command
helm upgrade --install prometheus ./prometheus \
--namespace=osh-infra
--namespace=osh-infra \
$rules_overrides
#NOTE: Wait for deploy
./tools/deployment/common/wait-for-pods.sh osh-infra

View File

@ -169,6 +169,29 @@
- ./tools/deployment/osh-infra-monitoring/610-prometheus-selenium.sh || true
- ./tools/deployment/osh-infra-monitoring/620-nagios-selenium.sh || true
# Gate job that deploys several Prometheus releases plus a federating
# Prometheus on a single node, then runs the Selenium screenshots.
- job:
    name: openstack-helm-infra-federated-monitoring
    parent: openstack-helm-infra-functional
    timeout: 7200
    pre-run:
      - playbooks/osh-infra-upgrade-host.yaml
      - playbooks/osh-infra-deploy-selenium.yaml
    run: playbooks/osh-infra-gate-runner.yaml
    post-run: playbooks/osh-infra-collect-logs.yaml
    nodeset: openstack-helm-single-node
    vars:
      # Scripts are listed in execution order.
      gate_scripts:
        - ./tools/deployment/federated-monitoring/000-install-packages.sh
        - ./tools/deployment/federated-monitoring/005-deploy-k8s.sh
        - ./tools/deployment/federated-monitoring/010-ingress.sh
        - ./tools/deployment/federated-monitoring/020-nfs-provisioner.sh
        - ./tools/deployment/federated-monitoring/030-ldap.sh
        - ./tools/deployment/federated-monitoring/040-kube-state-metrics.sh
        - ./tools/deployment/federated-monitoring/050-node-exporter.sh
        - ./tools/deployment/federated-monitoring/060-prometheus.sh
        - ./tools/deployment/federated-monitoring/070-federated-prometheus.sh
        # "|| true": a failing Selenium run does not fail the job
        - ./tools/deployment/federated-monitoring/100-prometheus-selenium.sh || true
- job:
name: openstack-helm-infra-aio-network-policy
parent: openstack-helm-infra-functional

View File

@ -21,6 +21,8 @@
- openstack-helm-lint
- openstack-helm-infra-aio-logging
- openstack-helm-infra-aio-monitoring
- openstack-helm-infra-federated-monitoring:
voting: false
- openstack-helm-infra-aio-network-policy:
voting: false
- openstack-helm-infra-openstack-support