Merge "Nagios: Add support for arbitrary object definitions via overrides"

2019-12-03 21:09:55 +00:00 · 2019-12-03 21:09:55 +00:00 · 9632d8719f
parent 6c4404ee4d 6f7790e451
commit 9632d8719f
8 changed files with 1090 additions and 1048 deletions
--- a/nagios/templates/configmap-etc.yaml
+++ b/nagios/templates/configmap-etc.yaml
@ -28,7 +28,10 @@ data:
  {{- end }}
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.nagios.nagios.template "key" "nagios.cfg" "format" "Secret") | indent 2 }}
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.nagios.cgi.template "key" "cgi.cfg" "format" "Secret") | indent 2 }}
-{{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.nagios.objects.template "key" "nagios_objects.cfg" "format" "Secret") | indent 2 }}
+{{- /* Emit one "<name>.cfg" entry for every key found under conf.nagios.objects. */}}
+{{- range $name, $object := $envAll.Values.conf.nagios.objects }}
+{{- $rendererArgs := dict "envAll" $envAll "template" $object.template "key" (printf "%s.cfg" $name) "format" "Secret" }}
+{{- include "helm-toolkit.snippets.values_template_renderer" $rendererArgs | indent 2 }}
+{{- end }}
 #NOTE(portdirect): this must be last, to work round helm ~2.7 bug.
 {{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.httpd "key" "httpd.conf" "format" "Secret") | indent 2 }}
 {{- end }}
--- a/nagios/templates/deployment.yaml
+++ b/nagios/templates/deployment.yaml
@ -203,10 +203,13 @@ spec:
              mountPath: /opt/nagios/etc/cgi.cfg
              subPath: cgi.cfg
              readOnly: true
+            {{- /*
+            One read-only subPath mount per key in conf.nagios.objects, using the
+            same "<key>.cfg" file names that configmap-etc.yaml renders.
+            Ranging over the map itself visits the keys in sorted order, so the
+            rendered volumeMounts list is identical from one render to the next;
+            Sprig's "keys" gives no ordering guarantee.
+            */}}
+            {{- range $objectType, $config := $envAll.Values.conf.nagios.objects }}
             - name: nagios-etc
-              mountPath: /opt/nagios/etc/nagios_objects.cfg
-              subPath: nagios_objects.cfg
+              mountPath: /opt/nagios/etc/{{ $objectType }}.cfg
+              subPath: {{ $objectType }}.cfg
               readOnly: true
+            {{- end }}
            - name: nagios-bin
              mountPath: /tmp/nagios-readiness.sh
              subPath: nagios-readiness.sh
--- a/nagios/values.yaml
+++ b/nagios/values.yaml
--- a/nagios/values_overrides/elasticsearch-objects.yaml
+++ b/nagios/values_overrides/elasticsearch-objects.yaml
@ -0,0 +1,93 @@
+conf:
+  nagios:
+    objects:
+      # Fluentd checks. The configmap-etc template renders this key to
+      # "fluent.cfg" (one "<key>.cfg" file per key under conf.nagios.objects).
+      # Each check_command hands "!"-separated arguments to check_prom_alert:
+      # the alert to look up (presumably a Prometheus alert name), then the
+      # CRITICAL message, then the OK message.
+      fluent:
+        template: |
+          define service {
+            check_command check_prom_alert!fluentd_not_running!CRITICAL- fluentd is not running on {instance}!OK- Fluentd is working on all nodes
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Fluentd_status
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!prom_exporter_fluentd_unavailable!CRITICAL- Fluentd exporter is not collecting metrics for alerting!OK- Fluentd exporter metrics are available.
+            hostgroup_name prometheus-hosts
+            service_description Prometheus-exporter_Fluentd
+            use generic-service
+          }
+      # Elasticsearch commands and checks. The configmap-etc template renders
+      # this key to "elasticsearch.cfg".
+      # The two "define command" entries wrap query_elasticsearch.py; the
+      # "_w_file" variant additionally reads query clauses from
+      # /opt/nagios/etc/objects/query_es_clauses.json.
+      # The "define service" entries hand "!"-separated arguments to
+      # check_prom_alert: the alert to look up (presumably a Prometheus alert
+      # name), then the CRITICAL message, then the OK message.
+      elasticsearch:
+        template: |
+          define command {
+            command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --match '$ARG8$' --range '$ARG9$'
+            command_name check_es_query
+          }
+
+          define command {
+            command_line $USER1$/query_elasticsearch.py $USER9$ '$ARG1$' '$ARG2$' '$ARG3$' '$ARG4$' '$ARG5$' --simple_query '$ARG6$' --simple_query_fields '$ARG7$' --query_file '/opt/nagios/etc/objects/query_es_clauses.json' --query_clause '$ARG8$' --match '$ARG9$' --range '$ARG10$'
+            command_name check_es_query_w_file
+          }
+
+          define service {
+            check_command check_prom_alert!prom_exporter_elasticsearch_unavailable!CRITICAL- Elasticsearch exporter is not collecting metrics for alerting!OK- Elasticsearch exporter metrics are available.
+            hostgroup_name prometheus-hosts
+            service_description Prometheus-exporter_Elasticsearch
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_high_process_open_files_count!CRITICAL- Elasticsearch {host} has high process open file count!OK- Elasticsearch process open file count is normal.
+            hostgroup_name prometheus-hosts
+            service_description ES_high-process-open-file-count
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_high_process_cpu_percent!CRITICAL- Elasticsearch {instance} has high process CPU percent!OK- Elasticsearch process cpu usage is normal.
+            hostgroup_name prometheus-hosts
+            service_description ES_high-process-cpu-percent
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_fs_usage_high!CRITICAL- Elasticsearch {instance} has high filesystem usage!OK- Elasticsearch filesystem usage is normal.
+            hostgroup_name prometheus-hosts
+            service_description ES_high-filesystem-usage
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_unassigned_shards!CRITICAL- Elasticsearch has unassigned shards!OK- Elasticsearch has no unassigned shards.
+            hostgroup_name prometheus-hosts
+            service_description ES_unassigned-shards
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_cluster_health_timed_out!CRITICAL- Elasticsearch Cluster health status call timedout!OK- Elasticsearch cluster health is retrievable.
+            hostgroup_name prometheus-hosts
+            service_description ES_cluster-health-timedout
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_cluster_health_status_alert!CRITICAL- Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated!OK- Elasticsearch cluster health is green.
+            hostgroup_name prometheus-hosts
+            service_description ES_cluster-health-status
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_cluster_health_too_few_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 nodes running!OK- Elasticsearch cluster has 3 or more nodes running.
+            hostgroup_name prometheus-hosts
+            service_description ES_cluster-running-node-count
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!es_cluster_health_too_few_data_nodes_running!CRITICAL- Elasticsearch Cluster has < 3 data nodes running!OK- Elasticsearch cluster has 3 or more data nodes running.
+            hostgroup_name prometheus-hosts
+            service_description ES_cluster-running-data-node-count
+            use generic-service
+          }
--- a/nagios/values_overrides/openstack-objects.yaml
+++ b/nagios/values_overrides/openstack-objects.yaml
@ -0,0 +1,270 @@
+conf:
+  nagios:
+    objects:
+      # MariaDB / Galera checks. The configmap-etc template renders this key to
+      # "mariadb.cfg" (one "<key>.cfg" file per key under conf.nagios.objects).
+      # Each check_command hands "!"-separated arguments to check_prom_alert:
+      # the alert to look up (presumably a Prometheus alert name), then the
+      # CRITICAL message, then the OK message.
+      mariadb:
+        template: |
+          define service {
+            check_command check_prom_alert!prom_exporter_mariadb_unavailable!CRITICAL- MariaDB exporter is not collecting metrics for alerting!OK- MariaDB exporter metrics are available.
+            hostgroup_name prometheus-hosts
+            service_description Prometheus-exporter_MariaDB
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!mariadb_table_lock_wait_high!CRITICAL- Mariadb has high number of table lock waits!OK- No issues found with table lock waits.
+            hostgroup_name prometheus-hosts
+            service_description Mariadb_table-lock-waits-high
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!mariadb_node_not_ready!CRITICAL- Mariadb {instance} is not ready!OK- All galera cluster nodes are ready.
+            hostgroup_name prometheus-hosts
+            service_description Mariadb_node-ready
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!mariadb_galera_node_out_of_sync!CRITICAL- Mariadb {instance} is out of sync!OK- All galera cluster nodes are in sync
+            hostgroup_name prometheus-hosts
+            service_description Mariadb_node-synchronized
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!mariadb_innodb_replication_fallen_behind!CRITICAL- Innodb replication has fallen behind and not recovering!OK- innodb replication lag is nominal.
+            hostgroup_name prometheus-hosts
+            service_description Mariadb_innodb-replication-lag
+            use generic-service
+          }
+      # RabbitMQ checks. The configmap-etc template renders this key to
+      # "rabbitmq.cfg".
+      # Each check_command hands "!"-separated arguments to check_prom_alert:
+      # the alert to look up (presumably a Prometheus alert name), then the
+      # CRITICAL message, then the OK message.
+      # NOTE(review): "rabbitmq_network_pratitions_detected" looks misspelled
+      # but is an identifier, not display text; it is left unchanged because it
+      # presumably has to match the alert rule name defined elsewhere - confirm
+      # before renaming.
+      rabbitmq:
+        template: |
+          define service {
+            check_command check_prom_alert!rabbitmq_network_pratitions_detected!CRITICAL- Rabbitmq instance {instance} has network partitions!OK- no network partitions detected in rabbitmq
+            hostgroup_name prometheus-hosts
+            service_description Rabbitmq_network-partitions-exist
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!rabbitmq_down!CRITICAL- Rabbitmq instance {instance} is down!OK- rabbitmq is available
+            hostgroup_name prometheus-hosts
+            service_description Rabbitmq_up
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!rabbitmq_file_descriptor_usage_high!CRITICAL- Rabbitmq instance {instance} has file descriptor usage more than 80 percent!OK- rabbitmq file descriptor usage is normal
+            hostgroup_name prometheus-hosts
+            service_description Rabbitmq_file-descriptor-usage
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!rabbitmq_node_disk_free_alarm!CRITICAL- Rabbitmq instance {instance} has a disk usage alarm!OK- rabbitmq node disk has no alarms
+            hostgroup_name prometheus-hosts
+            service_description Rabbitmq_node-disk-alarm
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!rabbitmq_node_memory_alarm!CRITICAL- Rabbitmq instance {instance} has a memory alarm!OK- rabbitmq node memory has no alarms
+            hostgroup_name prometheus-hosts
+            service_description Rabbitmq_node-memory-alarm
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!rabbitmq_less_than_3_nodes!CRITICAL- Rabbitmq has less than 3 nodes to serve!OK- rabbitmq has at least 3 nodes serving
+            hostgroup_name prometheus-hosts
+            service_description Rabbitmq_high-availability
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!rabbitmq_queue_messages_returned_high!CRITICAL- Rabbitmq has high percent of messages being returned!OK- rabbitmq messages are consumed and low or no returns exist.
+            hostgroup_name prometheus-hosts
+            service_description Rabbitmq_message-return-percent
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!rabbitmq_consumers_low_utilization!CRITICAL- Rabbitmq consumer message consumption rate is slow!OK- rabbitmq message consumption speed is normal
+            hostgroup_name prometheus-hosts
+            service_description Rabbitmq_consumer-utilization
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!rabbitmq_high_message_load!CRITICAL- Rabbitmq unacknowledged message count is high!OK- rabbitmq unacknowledged message count is normal
+            hostgroup_name prometheus-hosts
+            service_description Rabbitmq_rabbitmq-queue-health
+            use generic-service
+          }
+      # OpenStack API, agent, service and quota checks. The configmap-etc
+      # template renders this key to "openstack.cfg".
+      # Each check_command hands "!"-separated arguments to check_prom_alert:
+      # the alert to look up (presumably a Prometheus alert name), then the
+      # CRITICAL message, then the OK message.
+      # Every service_description below is unique; API_cinder is defined once,
+      # with the same 60 check_interval as the other API checks.
+      # NOTE(review): the nova-scheduler check uses the "openstack_" prefix
+      # while its siblings use "os_"; left unchanged because it presumably has
+      # to match the alert rule name defined elsewhere - confirm.
+      openstack:
+        template: |
+          define service {
+            check_command check_prom_alert!os_glance_api_availability!CRITICAL- Glance API at {url} is not available!OK- Glance API is available
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description API_glance
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_nova_api_availability!CRITICAL- Nova API at {url} is not available!OK- Nova API is available
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description API_nova
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_keystone_api_availability!CRITICAL- Keystone API at {url} is not available!OK- Keystone API is available
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description API_keystone
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_neutron_api_availability!CRITICAL- Neutron API at {url} is not available!OK- Neutron API is available
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description API_neutron
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_neutron_metadata_agent_availability!CRITICAL- Some Neutron metadata agents are not available!OK- All the neutron metadata agents are up
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Service_neutron-metadata-agent
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_neutron_openvswitch_agent_availability!CRITICAL- Some Neutron openvswitch agents are not available!OK- All the neutron openvswitch agents are up
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Service_neutron-openvswitch-agent
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_neutron_dhcp_agent_availability!CRITICAL- Some Neutron dhcp agents are not available!OK- All the neutron dhcp agents are up
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Service_neutron-dhcp-agent
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_neutron_l3_agent_availability!CRITICAL- Some Neutron l3 agents are not available!OK- All the neutron l3 agents are up
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Service_neutron-l3-agent
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_swift_api_availability!CRITICAL- Swift API at {url} is not available!OK- Swift API is available
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description API_swift
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_heat_api_availability!CRITICAL- Heat API at {url} is not available!OK- Heat API is available
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description API_heat
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_cinder_api_availability!CRITICAL- Cinder API at {url} is not available!OK- Cinder API is available
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description API_cinder
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_cinder_scheduler_availability!CRITICAL- Cinder scheduler is not available!OK- Cinder scheduler is available
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Service_cinder-scheduler
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_nova_compute_down!CRITICAL- nova-compute services are down on certain hosts!OK- nova-compute services are up on all hosts
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Service_nova-compute
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_nova_conductor_down!CRITICAL- nova-conductor services are down on certain hosts!OK- nova-conductor services are up on all hosts
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Service_nova-conductor
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_nova_consoleauth_down!CRITICAL- nova-consoleauth services are down on certain hosts!OK- nova-consoleauth services are up on all hosts
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Service_nova-consoleauth
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!openstack_nova_scheduler_down!CRITICAL- nova-scheduler services are down on certain hosts!OK- nova-scheduler services are up on all hosts
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description Service_nova-scheduler
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_vm_vcpu_usage_high!CRITICAL- vcpu usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs vcpu usage is less than 80 percent of available.
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description OS-Total-Quota_VCPU-usage
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_vm_ram_usage_high!CRITICAL- RAM usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs RAM usage is less than 80 percent of available.
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description OS-Total-Quota_RAM-usage
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!os_vm_disk_usage_high!CRITICAL- Disk usage for openstack VMs is more than 80 percent of available!OK- Openstack VMs Disk usage is less than 80 percent of available.
+            check_interval 60
+            hostgroup_name prometheus-hosts
+            service_description OS-Total-Quota_Disk-usage
+            use notifying_service
+          }
+
+          define service {
+            check_command check_prom_alert!prom_exporter_openstack_unavailable!CRITICAL- Openstack exporter is not collecting metrics for alerting!OK- Openstack exporter metrics are available.
+            hostgroup_name prometheus-hosts
+            service_description Prometheus-exporter_Openstack
+            use generic-service
+          }
--- a/nagios/values_overrides/postgresql-objects.yaml
+++ b/nagios/values_overrides/postgresql-objects.yaml
@ -0,0 +1,32 @@
+conf:
+  nagios:
+    objects:
+      # PostgreSQL checks. The configmap-etc template renders this key to
+      # "postgresql.cfg" (one "<key>.cfg" file per key under
+      # conf.nagios.objects).
+      # Each check_command hands "!"-separated arguments to check_prom_alert:
+      # the alert to look up (presumably a Prometheus alert name), then the
+      # CRITICAL message, then the OK message.
+      postgresql:
+        template: |
+          define service {
+            check_command check_prom_alert!prom_exporter_postgresql_unavailable!CRITICAL- Postgresql exporter is not collecting metrics for alerting!OK- Postgresql exporter metrics are available.
+            hostgroup_name prometheus-hosts
+            service_description Prometheus-exporter_Postgresql
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!pg_replication_fallen_behind!CRITICAL- Postgres Replication lag is over 2 minutes!OK- postgresql replication lag is nominal.
+            hostgroup_name prometheus-hosts
+            service_description Postgresql_replication-lag
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!pg_connections_too_high!CRITICAL- Postgres has more than 95% of available connections in use.!OK- postgresql open connections are within bounds.
+            hostgroup_name prometheus-hosts
+            service_description Postgresql_connections
+            use generic-service
+          }
+
+          define service {
+            check_command check_prom_alert!pg_deadlocks_detected!CRITICAL- Postgres server is experiencing deadlocks!OK- postgresql is not showing any deadlocks.
+            hostgroup_name prometheus-hosts
+            service_description Postgresql_deadlocks
+            use generic-service
+          }
--- a/tools/deployment/common/nagios.sh
+++ b/tools/deployment/common/nagios.sh
@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Copyright 2017 The Openstack-Helm Authors.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+# Installs (or upgrades) the nagios chart in the osh-infra namespace with the
+# OpenStack, PostgreSQL and Elasticsearch object overrides, waits for the
+# pods, and checks that the query clauses file inside the pod is valid JSON.
+
+# pipefail makes the final "kubectl exec ... | python" check fail when either
+# side of the pipe fails, instead of reporting only json.tool's exit status.
+set -xeo pipefail
+
+#NOTE: Lint and package chart
+make nagios
+
+#NOTE: Deploy command
+# Extra values exercising conf.nagios.query_es_clauses with a sample clause.
+tee /tmp/nagios.yaml << EOF
+conf:
+  nagios:
+    query_es_clauses:
+      test_es_query:
+        hello: world
+EOF
+helm upgrade --install nagios ./nagios \
+    --namespace=osh-infra \
+    --values=/tmp/nagios.yaml \
+    --values=nagios/values_overrides/openstack-objects.yaml \
+    --values=nagios/values_overrides/postgresql-objects.yaml \
+    --values=nagios/values_overrides/elasticsearch-objects.yaml
+
+#NOTE: Wait for deploy
+./tools/deployment/common/wait-for-pods.sh osh-infra
+
+#NOTE: Validate Deployment info
+helm status nagios
+
+#NOTE: Verify elasticsearch query clauses are functional by execing into pod
+NAGIOS_POD=$(kubectl -n osh-infra get pods -l='application=nagios,component=monitoring' --output=jsonpath='{.items[0].metadata.name}')
+# Quote the pod name so the expansion is never word-split or globbed.
+kubectl exec "$NAGIOS_POD" -n osh-infra -c nagios -- cat /opt/nagios/etc/objects/query_es_clauses.json | python -m json.tool
--- a/tools/deployment/osh-infra-monitoring/120-nagios.sh
+++ b/tools/deployment/osh-infra-monitoring/120-nagios.sh
@ -1,32 +0,0 @@
-#!/bin/bash
-
-# Copyright 2017 The Openstack-Helm Authors.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
-set -xe
-
-#NOTE: Lint and package chart
-make nagios
-
-#NOTE: Deploy command
-helm upgrade --install nagios ./nagios \
-    --namespace=osh-infra
-
-#NOTE: Wait for deploy
-./tools/deployment/common/wait-for-pods.sh osh-infra
-
-#NOTE: Validate Deployment info
-helm status nagios
-
-helm test nagios
--- a/tools/deployment/osh-infra-monitoring/120-nagios.sh
+++ b/tools/deployment/osh-infra-monitoring/120-nagios.sh
@ -0,0 +1 @@
+../common/nagios.sh