Rakesh Patnaik adab0e1e30 Nagios chart modifications to use prometheus alert metric for monitoring
Change-Id: I6bb3c7176a725d8f26f3c11ebfb1f6d1d430ab96
2018-04-19 10:55:44 -05:00

665 lines
33 KiB
YAML

# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default values for nagios.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
images:
tags:
nagios: quay.io/attcomdev/nagios:931116b88c54931c616dfa66f424be38f74d8ad2
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.2.1
image_repo_sync: docker.io/docker:17.07.0
pull_policy: IfNotPresent
local_registry:
active: false
exclude:
- dep_check
- image_repo_sync
labels:
nagios:
node_selector_key: openstack-control-plane
node_selector_value: enabled
job:
node_selector_key: openstack-control-plane
node_selector_value: enabled
dependencies:
dynamic:
common:
jobs:
- nagios-image-repo-sync
services:
- service: local_image_registry
endpoint: node
static:
image_repo_sync:
services:
- service: local_image_registry
endpoint: internal
nagios:
services: null
secrets:
nagios:
admin: nagios-admin-creds
endpoints:
cluster_domain_suffix: cluster.local
local_image_registry:
name: docker-registry
namespace: docker-registry
hosts:
default: localhost
internal: docker-registry
node: localhost
host_fqdn_override:
default: null
port:
registry:
node: 5000
monitoring:
name: prometheus
hosts:
default: prom-metrics
public: prometheus
host_fqdn_override:
default: null
path:
default: null
scheme:
default: http
port:
api:
default: 9090
public: 80
nagios:
name: nagios
namespace: null
auth:
admin:
username: admin
password: changeme
hosts:
default: nagios-metrics
public: nagios
host_fqdn_override:
default: null
path:
default: null
scheme:
default: http
port:
http:
default: 80
network:
nagios:
ingress:
public: true
classes:
namespace: "nginx"
cluster: "nginx-cluster"
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /
node_port:
enabled: false
port: 30925
pod:
lifecycle:
upgrades:
revision_history: 3
pod_replacement_strategy: RollingUpdate
rolling_update:
max_unavailable: 1
max_surge: 3
termination_grace_period:
nagios:
timeout: 30
replicas:
nagios: 1
resources:
enabled: false
nagios:
limits:
memory: "1024Mi"
cpu: "2000m"
requests:
memory: "128Mi"
cpu: "100m"
jobs:
image_repo_sync:
limits:
memory: "1024Mi"
cpu: "2000m"
requests:
memory: "128Mi"
cpu: "100m"
manifests:
configmap_bin: true
configmap_etc: true
deployment: true
ingress: true
job_image_repo_sync: true
secret_nagios: true
service: true
service_ingress: true
conf:
nagios:
hosts:
- prometheus:
use: linux-server
host_name: prometheus
alias: "Prometheus Monitoring"
address: 127.0.0.1
hostgroups: prometheus-hosts
check_command: check-prometheus-host-alive
host_groups:
- prometheus-hosts:
hostgroup_name: prometheus-hosts
alias: "Prometheus Virtual Host"
- all:
hostgroup_name: all
alias: "all"
- base-os:
hostgroup_name: base-os
alias: "base-os"
commands:
- check_prometheus_host_alive:
command_name: check-prometheus-host-alive
command_line: "$USER1$/check_rest_get_api.py --url $USER2$ --warning_response_seconds 5 --critical_response_seconds 10"
- check_prom_alert_with_labels:
command_name: check_prom_alert_with_labels
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --labels_csv '$ARG2$' --msg_format '$ARG3$' --ok_message '$ARG4$'"
- check_prom_alert:
command_name: check_prom_alert
command_line: "$USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname '$ARG1$' --msg_format '$ARG2$' --ok_message '$ARG3$'"
- check_filespace_mounts-usage-rate-fullin4hrs:
command_name: check_filespace_mounts-usage-rate-fullin4hrs
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_in_4h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} will be full in four hours' --ok_message 'OK- All mountpoints usage rate is normal'
- check_filespace_mounts-usage:
command_name: check_filespace_mounts-usage
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filesystem_full_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Mountpoint {mountpoint} is more than 80 pecent full' --ok_message 'OK- All mountpoints usage is normal'
- check_node_loadavg:
command_name: check_node_loadavg
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_load1_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node load average has been more than 90% for the pash hour' --ok_message 'OK- Node load average is normal'
- check_node_cpu_util:
command_name: check_node_cpu_util
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_cpu_util_90percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node CPU utilization has been more than 90% for the pash hour' --ok_message 'OK- Node cpu utilization is normal'
- check_network_connections:
command_name: check_network_connections
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_conntrack_usage_80percent' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node network connections are more than 90% in use' --ok_message 'OK- Network connection utilization is normal'
- check_memory_usage:
command_name: check_memory_usage
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
- check_disk_write_latency:
command_name: check_disk_write_latency
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_write_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk write latency is high' --ok_message 'OK- Node disk write latency is normal'
- check_disk_read_latency:
command_name: check_disk_read_latency
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_disk_read_latency' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Disk read latency is high' --ok_message 'OK- Node disk read latency is normal'
- check_entropy_availability:
command_name: check_entropy_availability
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_entropy_available_low' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- System has low entropy availability' --ok_message 'OK- System entropy availability is sufficient'
- check_filedescriptor_usage_rate:
command_name: check_filedescriptor_usage_rate
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_filedescriptors_full_in_3h' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- at current consumption rate no free file descriptors will be available in 3hrs.' --ok_message 'OK- System file descriptor consumption is ok.'
- check_hwmon_high_cpu_temp:
command_name: check_hwmon_high_cpu_temp
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_hwmon_high_cpu_temp' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- CPU temperature is 90 percent of critical temperature.' --ok_message 'OK- CPU temperatures are normal.'
- check_network_receive_drop_high:
command_name: check_network_receive_drop_high
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
- check_network_transmit_drop_high:
command_name: check_network_transmit_drop_high
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.'
- check_network_receive_errors_high:
command_name: check_network_receive_errors_high
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
- check_network_transmit_errors_high:
command_name: check_network_transmit_errors_high
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
- check_vmstat_paging_rate:
command_name: check_vmstat_paging_rate
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_vmstat_paging_rate_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Memory paging rate over 5 minutes is high.' --ok_message 'OK- Memory paging rate over 5 minutes is ok.'
- check_xfs_block_allocation:
command_name: check_xfs_block_allocation
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_xfs_block_allocation_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- XFS block allocation is more than 80 percent of available.' --ok_message 'OK- XFS block allocation is less than 80 percent of available.'
- check_network_bond_status:
command_name: check_network_bond_status
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_network_bond_slaves_down' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- {master} is missing slave interfaces.' --ok_message 'OK- Network bonds have slave interfaces functional.'
- check_numa_memory_usage:
command_name: check_numa_memory_usage
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_numa_memory_used' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NUMA memory usage is more than 80 percent of available.' --ok_message 'OK- NUMA memory usage is normal.'
- check_ntp_sync:
command_name: check_ntp_sync
command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.'
services:
- check_prometheus_replicas:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Prometheus_replica-count"
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas
check_interval: 1
- check_alertmanager_replicas:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "PrometheusAlertmanager_replica-count"
check_command: check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas
check_interval: 1
- check_statefulset_replicas:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Statefulset_replica-count"
check_command: check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas
check_interval: 1
- check_daemonset_misscheduled:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Daemonset_misscheduled"
check_command: check_prom_alert!daemonsets_misscheduled!CRITICAL- Daemonset {daemonset} is incorrectly scheudled!OK- No daemonset misscheduling detected
check_interval: 1
- check_daemonset_not-scheduled:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Daemonset_not-scheduled"
check_command: check_prom_alert!daemonsets_not_scheduled!CRITICAL- Daemonset {daemonset} is missing to be scheduled in some nodes!OK- All daemonset scheduling is as desired
check_interval: 1
- check_deployment_replicas_unavailable:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Deployment_replicas-unavailable"
check_command: check_prom_alert!deployment_replicas_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas!OK- All deployments have desired replicas
check_interval: 1
- check_deployment_rollingupdate_replicas_unavailable:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "RollingUpdate_Deployment-replicas-unavailable"
check_command: check_prom_alert!rollingupdate_deployment_replica_less_than_spec_max_unavailable!CRITICAL- Deployment {deployment} has less than desired replicas during a rolling update!OK- All deployments have desired replicas
check_interval: 1
- check_job_status_failed:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Job_status-failed"
check_command: check_prom_alert!job_status_failed!CRITICAL- Job {exported_job} has failed!OK- No Job failures
check_interval: 1
- check_pod_status_pending:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-pending"
check_command: check_prom_alert!pod_status_pending!CRITICAL- Pod {pod} in namespace {namespace} has been in pending status for more than 10 minutes!OK- No pods in pending status
check_interval: 1
- check_pod_status_error_image_pull:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-error-image-pull"
check_command: check_prom_alert!pod_status_error_image_pull!CRITICAL- Pod {pod} in namespace {namespace} has been in errpr status of ErrImagePull for more than 10 minutes!OK- No pods in error status
check_interval: 1
- check_replicaset_missing_replicas:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Replicaset_missing-replicas"
check_command: check_prom_alert!replicaset_missing_replicas!CRITICAL- Replicaset {replicaset} is missing replicas!OK- No replicas missing from replicaset
check_interval: 1
- check_pod_container_terminated:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Pod_status-container-terminated"
check_command: check_prom_alert!pod_container_terminated!CRITICAL- pod {pod} in namespace {namespace} has a container in terminated state!OK- pod container status looks good
check_interval: 1
- check_glance_api:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "API_glance"
check_command: check_prom_alert!glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
check_interval: 1
- check_nova_api:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "API_nova"
check_command: check_prom_alert!nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
check_interval: 1
- check_keystone_api:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "API_keystone"
check_command: check_prom_alert!keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
check_interval: 1
- check_neutron_api:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "API_neutron"
check_command: check_prom_alert!neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
check_interval: 1
- check_swift_api:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "API_swift"
check_command: check_prom_alert!swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
check_interval: 1
- check_service_nova_compute:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-compute"
check_command: check_prom_alert!openstack_nova_compute_disabled!CRITICAL- nova-compute services are disabled on certain hosts!OK- nova-compute services are enabled on all hosts
check_interval: 1
- check_service_nova_conductor:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-conductor"
check_command: check_prom_alert!openstack_nova_conductor_disabled!CRITICAL- nova-conductor services are disabled on certain hosts!OK- nova-conductor services are enabled on all hosts
check_interval: 1
- check_service_nova_consoleauth:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-consoleauth"
check_command: check_prom_alert!openstack_nova_consoleauth_disabled!CRITICAL- nova-consoleauth services are disabled on certain hosts!OK- nova-consoleauth services are enabled on all hosts
check_interval: 1
- check_service_nova_scheduler:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Service_nova-scheduler"
check_command: check_prom_alert!openstack_nova_scheduler_disabled!CRITICAL- nova-scheduler services are disabled on certain hosts!OK- nova-scheduler services are enabled on all hosts
check_interval: 1
- check_ceph_monitor_quorum:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "CEPH_quorum"
check_command: check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
check_interval: 1
- check_ceph_storage_usage:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "CEPH_storage-usage"
check_command: check_prom_alert!ceph_cluster_usage_high!CRITICAL- ceph cluster storage is more than 80 percent!OK- ceph storage is less than 80 percent
check_interval: 1
- check_ceph_pgs_degradation:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "CEPH_PGs-degradation"
check_command: check_prom_alert!ceph_placement_group_degrade_pct_high!CRITICAL- ceph cluster PGs down are more than 80 percent!OK- ceph PG degradation is less than 80 percent
check_interval: 1
- check_ceph_osds_down:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "CEPH_OSDs-down"
check_command: check_prom_alert!ceph_osd_down_pct_high!CRITICAL- CEPH OSDs down are more than 80 percent!OK- CEPH OSDs down is less than 80 percent
check_interval: 1
- check_ceph_monitor_clock_skew:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "CEPH_Clock-skew"
check_command: check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
check_interval: 1
- check_fluentd_up:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: "Fluentd_status"
check_command: check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Flunetd is working on all nodes
check_interval: 1
- check_etcd_high_http_deletes_failed:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: ETCD_high-http-delete-failures
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="DELETE"!CRITICAL- ETCD {instance} has a high HTTP DELETE operations failure!OK- ETCD at {instance} has low or no failures for HTTP DELETE
check_interval: 1
- check_etcd_high_http_get_failed:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: ETCD_high-http-get-failures
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method=~"GET|QGET"!CRITICAL- ETCD {instance} has a high HTTP GET operations failure!OK- ETCD at {instance} has low or no failures for HTTP GET
check_interval: 1
- check_etcd_high_http_updates_failed:
use: generic-service
hostgroup_name: prometheus-hosts
service_description: ETCD_high-http-update-failures
check_command: check_prom_alert_with_labels!etcd_HighNumberOfFailedHTTPRequests!method="PUT"!CRITICAL- ETCD {instance} has a high HTTP PUT operations failure!OK- ETCD at {instance} has low or no failures for HTTP PUT
check_interval: 1
- check_felix_iptables_save_errors:
use: generic-service
service_description: Calico_iptables-save-errors
check_command: check_prom_alert!calico_iptable_save_errors_high_1h!CRITICAL- Felix instance {instance} has seen high iptable save errors within the last hour!OK- iptables save errors are none or low
hostgroup_name: prometheus-hosts
- check_felix_ipset_errors:
use: generic-service
service_description: Calico_ipset-errors
check_command: check_prom_alert!calico_ipset_errors_high_1h!CRITICAL- Felix instance {instance} has seen high ipset errors within the last hour!OK- ipset errors are none or low
hostgroup_name: prometheus-hosts
- check_felix_int_dataplane_iface_msg_batch_size:
use: generic-service
service_description: Calico_interface-message-batch-size
check_command: check_prom_alert!calico_datapane_iface_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane interface message batch size!OK- dataplane interface message batch size are low
hostgroup_name: prometheus-hosts
- check_felix_int_dataplane_addr_msg_batch_size:
use: generic-service
service_description: Calico_address-message-batch-size
check_command: check_prom_alert!calico_datapane_address_msg_batch_size_high_5m!CRITICAL- Felix instance {instance} has seen a high value of dataplane address message batch size!OK- dataplane address message batch size are low
hostgroup_name: prometheus-hosts
- check_felix_int_dataplane_failures:
use: generic-service
service_description: Calico_datapane_failures_high
check_command: check_prom_alert!calico_datapane_failures_high_1h!CRITICAL- Felix instance {instance} has seen high dataplane failures within the last hour!OK- datapane failures are none or low
hostgroup_name: prometheus-hosts
- check_filespace_mounts-usage-rate-fullin4hrs:
use: generic-service
hostgroup_name: base-os
service_description: "Filespace_mounts-usage-rate-fullin4hrs"
check_command: check_filespace_mounts-usage-rate-fullin4hrs
check_interval: 1
- check_filespace_mounts-usage:
use: generic-service
hostgroup_name: base-os
service_description: "Filespace_mounts-usage"
check_command: check_filespace_mounts-usage
check_interval: 1
- check_node_loadavg:
use: generic-service
service_description: CPU_Load-average
check_command: check_node_loadavg
hostgroup_name: base-os
- check_node_cpu_util:
use: generic-service
service_description: CPU_utilization
check_command: check_node_cpu_util
hostgroup_name: base-os
- check_network_connections:
use: generic-service
service_description: Network_connections
check_command: check_network_connections
hostgroup_name: base-os
- check_memory_usage:
use: generic-service
service_description: Memory_usage
check_command: check_memory_usage
hostgroup_name: base-os
- check_disk_write_latency:
use: generic-service
service_description: Disk_write-latency
check_command: check_disk_write_latency
hostgroup_name: base-os
- check_disk_read_latency:
use: generic-service
service_description: Disk_read-latency
check_command: check_disk_read_latency
hostgroup_name: base-os
- check_entropy_availability:
use: generic-service
service_description: Entropy_availability
check_command: check_entropy_availability
hostgroup_name: base-os
- check_filedescriptor_usage_rate:
use: generic-service
service_description: FileDescriptors_usage-rate-high
check_command: check_filedescriptor_usage_rate
hostgroup_name: base-os
- check_hwmon_high_cpu_temp:
use: generic-service
service_description: HW_cpu-temp-high
check_command: check_hwmon_high_cpu_temp
hostgroup_name: base-os
- check_network_receive_drop_high:
use: generic-service
service_description: Network_receive-drop-high
check_command: check_network_receive_drop_high
hostgroup_name: base-os
- check_network_transmit_drop_high:
use: generic-service
service_description: Network_transmit-drop-high
check_command: check_network_transmit_drop_high
hostgroup_name: base-os
- check_network_receive_errors_high:
use: generic-service
service_description: Network_receive-errors-high
check_command: check_network_receive_errors_high
hostgroup_name: base-os
- check_network_transmit_errors_high:
use: generic-service
service_description: Network_transmit-errors-high
check_command: check_network_transmit_errors_high
hostgroup_name: base-os
- check_vmstat_paging_rate:
use: generic-service
service_description: Memory_vmstat-paging-rate
check_command: check_vmstat_paging_rate
hostgroup_name: base-os
- check_xfs_block_allocation:
use: generic-service
service_description: XFS_block-allocation
check_command: check_xfs_block_allocation
hostgroup_name: base-os
- check_network_bond_status:
use: generic-service
service_description: Network_bondstatus
check_command: check_network_bond_status
hostgroup_name: base-os
- check_numa_memory_usage:
use: generic-service
service_description: Memory_NUMA-usage
check_command: check_numa_memory_usage
hostgroup_name: base-os
- check_ntp_sync:
use: generic-service
service_description: NTP_sync
check_command: check_ntp_sync
hostgroup_name: base-os
config:
log_file: /opt/nagios/var/nagios.log
cfg_file:
- /opt/nagios/etc/nagios_objects.cfg
- /opt/nagios/etc/objects/commands.cfg
- /opt/nagios/etc/objects/contacts.cfg
- /opt/nagios/etc/objects/timeperiods.cfg
- /opt/nagios/etc/objects/templates.cfg
- /opt/nagios/etc/objects/prometheus_discovery_objects.cfg
object_cache_file: /opt/nagios/var/objects.cache
precached_object_file: /opt/nagios/var/objects.precache
resource_file: /opt/nagios/etc/resource.cfg
status_file: /opt/nagios/var/status.dat
status_update_interval: 10
nagios_user: nagios
nagios_group: nagios
check_external_commands: 1
command_file: /opt/nagios/var/rw/nagios.cmd
lock_file: /var/run/nagios.lock
temp_file: /opt/nagios/var/nagios.tmp
temp_path: /tmp
event_broker_options: -1
log_rotation_method: d
log_archive_path: /opt/nagios/var/archives
use_syslog: 1
log_service_retries: 1
log_host_retries: 1
log_event_handlers: 1
log_initial_states: 0
log_current_states: 1
log_external_commands: 1
log_passive_checks: 1
service_inter_check_delay_method: s
max_service_check_spread: 30
service_interleave_factor: s
host_inter_check_delay_method: s
max_host_check_spread: 30
max_concurrent_checks: 0
check_result_reaper_frequency: 10
max_check_result_reaper_time: 30
check_result_path: /opt/nagios/var/spool/checkresults
max_check_result_file_age: 3600
cached_host_check_horizon: 15
cached_service_check_horizon: 15
enable_predictive_host_dependency_checks: 1
enable_predictive_service_dependency_checks: 1
soft_state_dependencies: 0
auto_reschedule_checks: 0
auto_rescheduling_interval: 30
auto_rescheduling_window: 180
service_check_timeout: 60
host_check_timeout: 30
event_handler_timeout: 30
notification_timeout: 30
ocsp_timeout: 5
perfdata_timeout: 5
retain_state_information: 1
state_retention_file: /opt/nagios/var/retention.dat
retention_update_interval: 60
use_retained_program_state: 1
use_retained_scheduling_info: 1
retained_host_attribute_mask: 0
retained_service_attribute_mask: 0
retained_process_host_attribute_mask: 0
retained_process_service_attribute_mask: 0
retained_contact_host_attribute_mask: 0
retained_contact_service_attribute_mask: 0
interval_length: 60
check_for_updates: 1
bare_update_check: 0
use_aggressive_host_checking: 0
execute_service_checks: 1
accept_passive_service_checks: 1
execute_host_checks: 1
accept_passive_host_checks: 1
enable_notifications: 1
enable_event_handlers: 1
process_performance_data: 0
obsess_over_services: 0
obsess_over_hosts: 0
translate_passive_host_checks: 0
passive_host_checks_are_soft: 0
check_for_orphaned_services: 1
check_for_orphaned_hosts: 1
check_service_freshness: 1
service_freshness_check_interval: 60
check_host_freshness: 0
host_freshness_check_interval: 60
additional_freshness_latency: 15
enable_flap_detection: 1
low_service_flap_threshold: 5.0
high_service_flap_threshold: 20.0
low_host_flap_threshold: 5.0
high_host_flap_threshold: 20.0
date_format: us
use_regexp_matching: 0
use_true_regexp_matching: 0
daemon_dumps_core: 0
use_large_installation_tweaks: 0
enable_environment_macros: 0
debug_level: 0
debug_verbosity: 1
debug_file: /opt/nagios/var/nagios.debug
max_debug_file_size: 1000000
allow_empty_hostgroup_assignment: 1