# Source: openstack-helm-infra/nagios/values_overrides/openstack-objects.yaml
# Listing metadata: 273 lines, 12 KiB, YAML

---
# Values override that supplies Nagios object definitions as text templates.
# NOTE(review): indentation looks stripped in this copy; `nagios` and `objects`
# are presumably nested under `conf`, with the object groups that follow
# nested under `objects` — confirm against the chart's default values.
conf:
nagios:
objects:
mariadb:
  # Nagios service objects for MariaDB / Galera health.
  # Every check_command invokes check_prom_alert with three `!`-separated
  # arguments: an alert name, the text reported for CRITICAL, and the text
  # reported for OK.
  # NOTE(review): the alert names presumably must match Prometheus alert rule
  # names, and `{instance}` is presumably filled in by the check plugin from
  # the alert's labels — confirm against the check_prom_alert command.
  template: |
    define service {
      check_command check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
      hostgroup_name prometheus-hosts
      service_description Prometheus-exporter_MariaDB
      use generic-service
    }
    define service {
      check_command check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits.
      hostgroup_name prometheus-hosts
      service_description Mariadb_table-lock-waits-high
      use generic-service
    }
    define service {
      check_command check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
      hostgroup_name prometheus-hosts
      service_description Mariadb_node-ready
      use generic-service
    }
    define service {
      check_command check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
      hostgroup_name prometheus-hosts
      service_description Mariadb_node-synchronized
      use generic-service
    }
    define service {
      check_command check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
      hostgroup_name prometheus-hosts
      service_description Mariadb_innodb-replication-lag
      use generic-service
    }
rabbitmq:
  # Nagios service objects for RabbitMQ health.
  # Every check_command invokes check_prom_alert with three `!`-separated
  # arguments: an alert name, the text reported for CRITICAL, and the text
  # reported for OK.
  # NOTE(review): the alert name `rabbitmq_network_pratitions_detected` is
  # spelled "pratitions". It is kept as-is because it presumably has to match
  # the alert name defined in the Prometheus rules — confirm before renaming.
  # NOTE(review): `{instance}` is presumably filled in by the check plugin
  # from the alert's labels — confirm against the check_prom_alert command.
  template: |
    define service {
      check_command check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
      hostgroup_name prometheus-hosts
      service_description Rabbitmq_network-partitions-exist
      use generic-service
    }
    define service {
      check_command check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
      hostgroup_name prometheus-hosts
      service_description Rabbitmq_up
      use generic-service
    }
    define service {
      check_command check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file descriptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal
      hostgroup_name prometheus-hosts
      service_description Rabbitmq_file-descriptor-usage
      use generic-service
    }
    define service {
      check_command check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
      hostgroup_name prometheus-hosts
      service_description Rabbitmq_node-disk-alarm
      use generic-service
    }
    define service {
      check_command check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
      hostgroup_name prometheus-hosts
      service_description Rabbitmq_node-memory-alarm
      use generic-service
    }
    define service {
      check_command check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has at least 3 nodes serving
      hostgroup_name prometheus-hosts
      service_description Rabbitmq_high-availability
      use generic-service
    }
    define service {
      check_command check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
      hostgroup_name prometheus-hosts
      service_description Rabbitmq_message-return-percent
      use generic-service
    }
    define service {
      check_command check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
      hostgroup_name prometheus-hosts
      service_description Rabbitmq_consumer-utilization
      use generic-service
    }
    define service {
      check_command check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is normal
      hostgroup_name prometheus-hosts
      service_description Rabbitmq_rabbitmq-queue-health
      use generic-service
    }
openstack:
  # Nagios service objects for OpenStack API and service availability, plus
  # aggregate VM quota usage and the OpenStack exporter itself.
  # Every check_command invokes check_prom_alert with three `!`-separated
  # arguments: an alert name, the text reported for CRITICAL, and the text
  # reported for OK.
  # All checks except the exporter check inherit from `notifying_service` and
  # set `check_interval 60`; the exporter check inherits from
  # `generic-service` and keeps that template's interval.
  # Each service_description appears exactly once: Nagios identifies a service
  # by host plus service_description, so API_cinder must not be repeated.
  # NOTE(review): the nova-scheduler check uses the alert name
  # `openstack_nova_scheduler_down`, while the sibling nova checks use the
  # `os_` prefix. It is kept as-is because it presumably has to match the
  # Prometheus rule name — confirm against the alert rules.
  # NOTE(review): `{url}` is presumably filled in by the check plugin from the
  # alert's labels — confirm against the check_prom_alert command.
  template: |
    define service {
      check_command check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description API_glance
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description API_nova
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description API_keystone
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description API_neutron
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description Service_neutron-metadata-agent
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description Service_neutron-openvswitch-agent
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description Service_neutron-dhcp-agent
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description Service_neutron-l3-agent
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description API_swift
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description API_heat
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description API_cinder
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_cinder_scheduler_availability!CRITICAL- Cinder scheduler is not available!OK- Cinder scheduler is available
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description Service_cinder-scheduler
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description Service_nova-compute
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description Service_nova-conductor
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description Service_nova-consoleauth
      use notifying_service
    }
    define service {
      check_command check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description Service_nova-scheduler
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description OS-Total-Quota_VCPU-usage
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description OS-Total-Quota_RAM-usage
      use notifying_service
    }
    define service {
      check_command check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
      check_interval 60
      hostgroup_name prometheus-hosts
      service_description OS-Total-Quota_Disk-usage
      use notifying_service
    }
    define service {
      check_command check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
      hostgroup_name prometheus-hosts
      service_description Prometheus-exporter_Openstack
      use generic-service
    }
...