Alert rules in Prometheus to support Nagios-based monitoring via alert metric queries
Change-Id: I425dbc1b33d7dcb1aa20a7b2a22bd6b5adfbfa5a
commit e8da761ccc
parent 6921006103
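The "alert metric queries" mentioned in the commit message rely on Prometheus exporting every evaluated alerting rule as the built-in ALERTS time series (labelled with alertname, alertstate and the rule's own labels), which an external poller such as a Nagios plugin can read through the HTTP query API at /api/v1/query. A minimal sketch of such a query, assuming one of the rule names introduced below:

    ALERTS{alertname="etcd_NoLeader", alertstate="firing"}

A non-empty result means the alert is currently firing and the check can report CRITICAL; an empty result maps to OK.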
@@ -495,7 +495,7 @@ conf:
groups:
- name: etcd3.rules
rules:
- alert: InsufficientMembers
- alert: etcd_InsufficientMembers
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
for: 3m
labels:
@@ -503,7 +503,7 @@ conf:
annotations:
description: If one more etcd member goes down the cluster will be unavailable
summary: etcd cluster insufficient members
- alert: NoLeader
- alert: etcd_NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
for: 1m
labels:
@@ -511,14 +511,14 @@ conf:
annotations:
description: etcd member {{ $labels.instance }} has no leader
summary: etcd member has no leader
- alert: HighNumberOfLeaderChanges
- alert: etcd_HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
- alert: etcd_HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
for: 10m
labels:
@@ -526,7 +526,7 @@ conf:
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
- alert: etcd_HighNumberOfFailedGRPCRequests
expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
for: 5m
labels:
@@ -534,7 +534,7 @@ conf:
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
- alert: etcd_GRPCRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
@@ -542,7 +542,7 @@ conf:
annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
summary: slow gRPC requests
- alert: HighNumberOfFailedHTTPRequests
- alert: etcd_HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
for: 10m
labels:
@@ -550,7 +550,7 @@ conf:
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HighNumberOfFailedHTTPRequests
- alert: etcd_HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
for: 5m
labels:
@@ -558,7 +558,7 @@ conf:
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HTTPRequestsSlow
- alert: etcd_HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
@@ -566,7 +566,7 @@ conf:
annotations:
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
summary: slow HTTP requests
- alert: EtcdMemberCommunicationSlow
- alert: etcd_EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
for: 10m
labels:
@@ -574,14 +574,14 @@ conf:
annotations:
description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
- alert: etcd_HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations
- alert: etcd_HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
for: 10m
labels:
@@ -589,7 +589,7 @@ conf:
annotations:
description: etcd instance {{ $labels.instance }} fsync durations are high
summary: high fsync durations
- alert: HighCommitDurations
- alert: etcd_HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
for: 10m
labels:
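The hunks above only rename the existing etcd3.rules alerts by adding an etcd_ prefix; expressions, durations and thresholds are unchanged. A plausible benefit for the polling model described in the commit message is that one alert metric query can now cover the whole group, for example (a sketch, not part of the chart itself):

    ALERTS{alertname=~"etcd_.*", alertstate="firing"}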
@@ -753,3 +753,500 @@ conf:
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- alert: kube_statefulset_replicas_unavailable
expr: kube_statefulset_status_replicas < kube_statefulset_replicas
for: 5m
labels:
severity: page
annotations:
description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
summary: '{{$labels.statefulset}}: has insufficient replicas.'
- alert: kube_daemonsets_misscheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
severity: warning
annotations:
description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
summary: 'Daemonsets not scheduled correctly'
- alert: kube_daemonsets_not_scheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
summary: 'Less than desired number of daemonsets scheduled'
- alert: kube_deployment_replicas_unavailable
expr: kube_deployment_status_replicas_unavailable > 0
for: 10m
labels:
severity: page
annotations:
description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
summary: '{{$labels.deployment}}: has insufficient replicas.'
- alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
for: 10m
labels:
severity: page
annotations:
description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update'
summary: '{{$labels.deployment}}: has insufficient replicas during a rolling update.'
- alert: kube_job_status_failed
expr: kube_job_status_failed > 0
for: 10m
labels:
severity: page
annotations:
description: 'Job {{$labels.exported_job}} is in failed status'
summary: '{{$labels.exported_job}} has failed status'
- alert: kube_pod_status_pending
expr: kube_pod_status_phase{phase="Pending"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
- alert: kube_pod_error_image_pull
expr: kube_pod_container_status_waiting_reason{reason="ErrImagePull"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: kube_pod_status_error_image_pull
expr: kube_pod_container_status_waiting_reason{reason="ErrImagePull"} == 1
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
- alert: kube_replicaset_missing_replicas
expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
for: 10m
labels:
severity: page
annotations:
description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
- alert: kube_pod_container_terminated
expr: kube_pod_container_status_terminated > 0
for: 10m
labels:
severity: page
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
basic_linux:
groups:
- name: basic_linux.rules
rules:
- alert: node_filesystem_full_80percent
expr: sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.2) / 1024 ^ 3
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.'
summary: '{{$labels.alias}}: Filesystem is running out of space soon.'
- alert: node_filesystem_full_in_4h
expr: predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4 * 3600) <= 0
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space in approx. 4 hours'
summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.'
- alert: node_filedescriptors_full_in_3h
expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum
for: 20m
labels:
severity: page
annotations:
description: '{{$labels.alias}} is running out of available file descriptors in approx. 3 hours'
summary: '{{$labels.alias}} is running out of available file descriptors in 3 hours.'
- alert: node_load1_90percent
expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9
for: 1h
labels:
severity: page
annotations:
description: '{{$labels.alias}} is running with > 90% total load for at least 1h.'
summary: '{{$labels.alias}}: Running on high load.'
- alert: node_cpu_util_90percent
expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90
for: 1h
labels:
severity: page
annotations:
description: '{{$labels.alias}} has total CPU utilization over 90% for at least 1h.'
summary: '{{$labels.alias}}: High CPU utilization.'
- alert: node_ram_using_90percent
expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1
for: 30m
labels:
severity: page
annotations:
description: '{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.'
summary: '{{$labels.alias}}: Using lots of RAM.'
- alert: node_swap_using_80percent
expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8
for: 10m
labels:
severity: page
annotations:
description: '{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now.'
summary: '{{$labels.alias}}: Running out of swap soon.'
- alert: node_high_cpu_load
expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) > 1
for: 1m
labels:
severity: warning
annotations:
description: '{{$labels.alias}} is running with load15 > 1 per core for at least 1 minute: {{$value}}'
summary: '{{$labels.alias}}: Running on high load: {{$value}}'
- alert: node_high_memory_load
expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85
for: 1m
labels:
severity: warning
annotations:
description: Host memory usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.
summary: Server memory is almost full
- alert: node_high_storage_load
expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) / node_filesystem_size{mountpoint="/"} * 100 > 85
for: 30s
labels:
severity: warning
annotations:
description: Host storage usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.
summary: Server storage is almost full
- alert: node_high_swap
expr: (node_memory_SwapTotal - node_memory_SwapFree) > (node_memory_SwapTotal * 0.4)
for: 1m
labels:
severity: warning
annotations:
description: Host system has a high swap usage of {{ humanize $value }}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.
summary: Server has a high swap usage
- alert: node_high_network_drop_rcv
expr: node_network_receive_drop{device!="lo"} > 3000
for: 30s
labels:
severity: warning
annotations:
description: Host system has an unusually high drop in network reception ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}
summary: Server has a high receive drop
- alert: node_high_network_drop_send
expr: node_network_transmit_drop{device!="lo"} > 3000
for: 30s
labels:
severity: warning
annotations:
description: Host system has an unusually high drop in network transmission ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}
summary: Server has a high transmit drop
- alert: node_high_network_errs_rcv
expr: node_network_receive_errs{device!="lo"} > 3000
for: 30s
labels:
severity: warning
annotations:
description: Host system has an unusually high error rate in network reception ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}
summary: Server has unusually high reception errors
- alert: node_high_network_errs_send
expr: node_network_transmit_errs{device!="lo"} > 3000
for: 30s
labels:
severity: warning
annotations:
description: Host system has an unusually high error rate in network transmission ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ $labels.job }}
summary: Server has unusually high transmission errors
- alert: node_network_conntrack_usage_80percent
expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8)
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit'
summary: '{{$labels.instance}}: available network conntrack entries are low.'
- alert: node_entropy_available_low
expr: node_entropy_available_bits < 300
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.instance}} has available entropy bits of {{ $value }} which is less than required of 300'
summary: '{{$labels.instance}}: is low on entropy bits.'
- alert: node_hwmon_high_cpu_temp
expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}'
summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}'
- alert: node_vmstat_paging_rate_high
expr: irate(node_vmstat_pgpgin[5m]) > 80
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}'
summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}'
- alert: node_xfs_block_allocation_high
expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}'
summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}'
- alert: node_network_bond_slaves_down
expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0
for: 5m
labels:
severity: page
annotations:
description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).'
summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)'
- alert: node_numa_memory_used
expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}'
summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}'
- alert: node_ntp_clock_skew_high
expr: abs(node_ntp_drift_seconds) > 2
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}'
summary: '{{$labels.alias}}: time is skewed by {{$value}} seconds'
- alert: node_disk_read_latency
expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 10
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.device}} has a high read latency of {{ $value }}'
summary: 'High read latency observed for device {{ $labels.device }}'
- alert: node_disk_write_latency
expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 10
for: 5m
labels:
severity: page
annotations:
description: '{{$labels.device}} has a high write latency of {{ $value }}'
summary: 'High write latency observed for device {{ $labels.device }}'
openstack:
groups:
- name: openstack.rules
rules:
- alert: os_glance_api_availability
expr: check_glance_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Glance API is not available at {{$labels.url}}'
- alert: os_nova_api_availability
expr: check_nova_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Nova API is not available at {{$labels.url}}'
- alert: os_keystone_api_availability
expr: check_keystone_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Keystone API is not available at {{$labels.url}}'
- alert: os_neutron_api_availability
expr: check_neutron_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Neutron API is not available at {{$labels.url}}'
- alert: os_swift_api_availability
expr: check_swift_api != 1
for: 5m
labels:
severity: page
annotations:
description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes'
summary: 'Swift API is not available at {{$labels.url}}'
- alert: os_nova_compute_disabled
expr: services_nova_compute_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-compute is disabled on some hosts'
- alert: os_nova_conductor_disabled
expr: services_nova_conductor_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-conductor is disabled on some hosts'
- alert: os_nova_consoleauth_disabled
expr: services_nova_consoleauth_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-consoleauth is disabled on some hosts'
- alert: os_nova_scheduler_disabled
expr: services_nova_scheduler_disabled_total > 0
for: 5m
labels:
severity: page
annotations:
description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
summary: 'Openstack compute service nova-scheduler is disabled on some hosts'
ceph:
groups:
- name: ceph.rules
rules:
- alert: ceph_monitor_quorum_low
expr: ceph_monitor_quorum_count < 3
for: 5m
labels:
severity: page
annotations:
description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
summary: 'ceph high availability is at risk'
- alert: ceph_cluster_usage_high
expr: 100* ceph_cluster_used_bytes/ceph_cluster_capacity_bytes > 80
for: 5m
labels:
severity: page
annotations:
description: 'ceph cluster capacity usage more than 80 percent'
summary: 'ceph cluster usage is more than 80 percent'
- alert: ceph_placement_group_degrade_pct_high
expr: 100*ceph_degraded_pgs/ceph_total_pgs > 80
for: 5m
labels:
severity: page
annotations:
description: 'ceph placement group degradation is more than 80 percent'
summary: 'ceph placement groups degraded'
- alert: ceph_osd_down_pct_high
expr: 100* ceph_osds_down/(ceph_osds_down+ceph_osds_up) > 80
for: 5m
labels:
severity: page
annotations:
description: 'ceph OSDs down percent is more than 80 percent'
summary: 'ceph OSDs down percent is high'
- alert: ceph_monitor_clock_skew_high
expr: ceph_monitor_clock_skew_seconds > 2
for: 5m
labels:
severity: page
annotations:
description: 'ceph monitors clock skew on {{$labels.instance}} is more than 2 seconds'
summary: 'ceph monitor clock skew high'
fluentd:
groups:
- name: fluentd.rules
rules:
- alert: fluentd_not_running
expr: fluentd_up == 0
for: 5m
labels:
severity: page
annotations:
description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes'
summary: 'Fluentd is down'
calico:
groups:
- name: calico.rules
rules:
- alert: calico_datapane_failures_high_1h
expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour'
summary: 'A high number of dataplane failures within Felix are happening'
- alert: calico_datapane_address_msg_batch_size_high_5m
expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5
for: 5m
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size'
summary: 'Felix address message batch size is high'
- alert: calico_datapane_iface_msg_batch_size_high_5m
expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5
for: 5m
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size'
summary: 'Felix interface message batch size is high'
- alert: calico_ipset_errors_high_1h
expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour'
summary: 'A high number of ipset errors within Felix are happening'
- alert: calico_iptable_save_errors_high_1h
expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour'
summary: 'A high number of iptable save errors within Felix are happening'
- alert: calico_iptable_restore_errors_high_1h
expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5
labels:
severity: page
annotations:
description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
summary: 'A high number of iptable restore errors within Felix are happening'
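Every new rule carries a severity label of either page or warning, which gives an external consumer a ready-made dimension for mapping alerts onto Nagios states. As a sketch under that assumption, a check that should only react to paging alerts could query:

    ALERTS{alertstate="firing", severity="page"}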