- Snapped binary packages of Filebeat, NRPE and Telegraf (disabled by default)
- Added a workaround for the Telegraf segfault after ELF patching by snapcraft
- Implemented IPMI input tuning for Telegraf
- Allowed running NRPE as root:root (from custom PPA)
- Implemented Filebeat, NRPE and Telegraf control scripts and config on top of snap-overlay
- Added support for checking Microstack systemd services by NRPE
- Added a few generic and Microstack-specific NRPE checks
- Added the possibility to override default config paths for the daemons
- Added support for in-band IPMI input to Telegraf
- Aligned LMA wrapper and service naming with Microstack conventions
- Increased build timeout in .zuul conf by 30min

Change-Id: I68dbdb11248cf0c1e22e9333af3cf0f88954f557
changes/61/697661/4
@ -0,0 +1,38 @@ | |||
#!/usr/bin/python3 | |||
# | |||
# Copyright 2016 Canonical Ltd | |||
# | |||
# Author: Brad Marshall <brad.marshall@canonical.com> | |||
# | |||
# Based on check_upstart_job and https://zignar.net/2014/09/08/getting-started-with-dbus-python-systemd/ | |||
# | |||
import sys

# Nagios/NRPE plugin exit codes used by this check.
STATE_OK = 0
STATE_CRITICAL = 2
STATE_UNKNOWN = 3

SYSTEMD_BUS_NAME = 'org.freedesktop.systemd1'
SYSTEMD_OBJECT_PATH = '/org/freedesktop/systemd1'
SYSTEMD_MANAGER_IFACE = 'org.freedesktop.systemd1.Manager'
SYSTEMD_UNIT_IFACE = 'org.freedesktop.systemd1.Unit'
DBUS_PROPERTIES_IFACE = 'org.freedesktop.DBus.Properties'


def unit_name(service_arg):
    """Return the systemd unit name for *service_arg* ('.service' appended)."""
    return "%s.service" % service_arg


def evaluate_sub_state(service_name, sub_state):
    """Map a unit's SubState to a ``(exit_code, message)`` pair.

    Only the 'running' sub-state is reported as OK; anything else is
    CRITICAL.
    """
    if sub_state == 'running':
        return STATE_OK, 'OK: %s is running' % service_name
    return STATE_CRITICAL, 'CRITICAL: %s is not running' % service_name


def check_service(service_name):
    """Query systemd over the system D-Bus and return ``(exit_code, message)``.

    A failure to reach systemd and a failure to load or inspect the unit are
    both reported as CRITICAL, each with its own message.
    """
    # Imported here so that missing bindings produce a plugin status line
    # instead of an import traceback.
    try:
        import dbus
    except ImportError:
        return STATE_UNKNOWN, 'UNKNOWN: python3 dbus module is not available'

    try:
        bus = dbus.SystemBus()
        systemd = bus.get_object(SYSTEMD_BUS_NAME, SYSTEMD_OBJECT_PATH)
        manager = dbus.Interface(systemd, dbus_interface=SYSTEMD_MANAGER_IFACE)
    except dbus.DBusException:
        return (STATE_CRITICAL,
                'CRITICAL: unable to connect to system for %s' % service_name)

    try:
        # LoadUnit returns the object path of the unit, which is then
        # queried for its SubState property.
        unit_path = manager.LoadUnit(service_name)
        unit_proxy = bus.get_object(SYSTEMD_BUS_NAME, str(unit_path))
        sub_state = unit_proxy.Get(SYSTEMD_UNIT_IFACE, 'SubState',
                                   dbus_interface=DBUS_PROPERTIES_IFACE)
    except dbus.DBusException:
        return (STATE_CRITICAL,
                'CRITICAL: unable to find %s in systemd' % service_name)

    return evaluate_sub_state(service_name, sub_state)


def main(argv=None):
    """Run the check for the service named in ``argv[1]``.

    Prints a single status line and returns the matching exit code. A missing
    service argument yields UNKNOWN rather than an uncaught IndexError.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'check_systemd.py'
        print('UNKNOWN: usage: %s SERVICE_NAME' % prog)
        return STATE_UNKNOWN

    code, message = check_service(unit_name(argv[1]))
    print(message)
    return code


if __name__ == '__main__':
    sys.exit(main())
@ -0,0 +1,32 @@ | |||
-----BEGIN PGP PUBLIC KEY BLOCK----- | |||
Version: GnuPG v1 | |||
mQENBFI3HsoBCADXDtbNJnxbPqB1vDNtCsqhe49vFYsZN9IOZsZXgp7aHjh6CJBD | |||
A+bGFOwyhbd7at35jQjWAw1O3cfYsKAmFy+Ar3LHCMkV3oZspJACTIgCrwnkic/9 | |||
CUliQe324qvObU2QRtP4Fl0zWcfb/S8UYzWXWIFuJqMvE9MaRY1bwUBvzoqavLGZ | |||
j3SF1SPO+TB5QrHkrQHBsmX+Jda6d4Ylt8/t6CvMwgQNlrlzIO9WT+YN6zS+sqHd | |||
1YK/aY5qhoLNhp9G/HxhcSVCkLq8SStj1ZZ1S9juBPoXV1ZWNbxFNGwOh/NYGldD | |||
2kmBf3YgCqeLzHahsAEpvAm8TBa7Q9W21C8vABEBAAG0RUVsYXN0aWNzZWFyY2gg | |||
KEVsYXN0aWNzZWFyY2ggU2lnbmluZyBLZXkpIDxkZXZfb3BzQGVsYXN0aWNzZWFy | |||
Y2gub3JnPokBOAQTAQIAIgUCUjceygIbAwYLCQgHAwIGFQgCCQoLBBYCAwECHgEC | |||
F4AACgkQ0n1mbNiOQrRzjAgAlTUQ1mgo3nK6BGXbj4XAJvuZDG0HILiUt+pPnz75 | |||
nsf0NWhqR4yGFlmpuctgCmTD+HzYtV9fp9qW/bwVuJCNtKXk3sdzYABY+Yl0Cez/ | |||
7C2GuGCOlbn0luCNT9BxJnh4mC9h/cKI3y5jvZ7wavwe41teqG14V+EoFSn3NPKm | |||
TxcDTFrV7SmVPxCBcQze00cJhprKxkuZMPPVqpBS+JfDQtzUQD/LSFfhHj9eD+Xe | |||
8d7sw+XvxB2aN4gnTlRzjL1nTRp0h2/IOGkqYfIG9rWmSLNlxhB2t+c0RsjdGM4/ | |||
eRlPWylFbVMc5pmDpItrkWSnzBfkmXL3vO2X3WvwmSFiQbkBDQRSNx7KAQgA5JUl | |||
zcMW5/cuyZR8alSacKqhSbvoSqqbzHKcUQZmlzNMKGTABFG1yRx9r+wa/fvqP6OT | |||
RzRDvVS/cycws8YX7Ddum7x8uI95b9ye1/Xy5noPEm8cD+hplnpU+PBQZJ5XJ2I+ | |||
1l9Nixx47wPGXeClLqcdn0ayd+v+Rwf3/XUJrvccG2YZUiQ4jWZkoxsA07xx7Bj+ | |||
Lt8/FKG7sHRFvePFU0ZS6JFx9GJqjSBbHRRkam+4emW3uWgVfZxuwcUCn1ayNgRt | |||
KiFv9jQrg2TIWEvzYx9tywTCxc+FFMWAlbCzi+m4WD+QUWWfDQ009U/WM0ks0Kww | |||
EwSk/UDuToxGnKU2dQARAQABiQEfBBgBAgAJBQJSNx7KAhsMAAoJENJ9ZmzYjkK0 | |||
c3MIAIE9hAR20mqJWLcsxLtrRs6uNF1VrpB+4n/55QU7oxA1iVBO6IFu4qgsF12J | |||
TavnJ5MLaETlggXY+zDef9syTPXoQctpzcaNVDmedwo1SiL03uMoblOvWpMR/Y0j | |||
6rm7IgrMWUDXDPvoPGjMl2q1iTeyHkMZEyUJ8SKsaHh4jV9wp9KmC8C+9CwMukL7 | |||
vM5w8cgvJoAwsp3Fn59AxWthN3XJYcnMfStkIuWgR7U2r+a210W6vnUxU4oN0PmM | |||
cursYPyeV0NX/KQeUeNMwGTFB6QHS/anRaGQewijkrYYoTNtfllxIu9XYmiBERQ/ | |||
qPDlGRlOgVTd9xUfHFkzB52c70E= | |||
=92oX | |||
-----END PGP PUBLIC KEY BLOCK----- | |||
@ -0,0 +1,43 @@ | |||
filebeat: | |||
prospectors: | |||
- paths: | |||
- /var/log/syslog | |||
- /var/log/*/*.log | |||
- {{ snap_common }}/log | |||
- {{ snap_common }}/log/mysql | |||
- {{ snap_common }}/log/openvswitch | |||
- {{ snap_common }}/log/rabbitmq | |||
input_type: log | |||
exclude_files: ["/filebeat.*", ".*gz$"] | |||
exclude_lines: [] | |||
scan_frequency: 10s | |||
harvester_buffer_size: 16384 | |||
max_bytes: 10485760 | |||
registry_file: filebeat/registry | |||
logging: | |||
to_syslog: true | |||
to_files: false | |||
level: info | |||
metrics.enabled: false | |||
files: | |||
path: {{ snap_common }}/log | |||
name: filebeat | |||
keepfiles: 7 | |||
permissions: 0644 | |||
output: | |||
logstash: | |||
hosts: {{ ( logging_host.split(';') if logging_host else [] ) | tojson }} | |||
worker: 1 | |||
compression_level: 3 | |||
loadbalance: true | |||
{% if logging_tag %} | |||
# if name is empty, hostname will be used | |||
name: {{logging_tag}} | |||
{% endif %} |
@ -0,0 +1,56 @@ | |||
log_facility=daemon | |||
log_file={{ snap_common }}/log/nrpe.log | |||
server_port=5666 | |||
# WARNING: 0.0.0.0/0 is not supported, just comment out 'allowed_hosts' | |||
#allowed_hosts=0.0.0.0/0 | |||
#allowed_hosts=10.0.0.0/8,127.0.0.1 | |||
nrpe_user=root | |||
nrpe_group=root | |||
dont_blame_nrpe=0 | |||
debug=0 | |||
pid_file={{ snap_common }}/run/nrpe.pid | |||
# Additional configuration snippets are read from the include_dir set at the end of this file
command[check_users]={{ snap }}/usr/lib/nagios/plugins/check_users -w 5 -c 10 | |||
command[check_load]={{ snap }}/usr/lib/nagios/plugins/check_load -r -w 2.0,1.0,0.5 -c 4.0,2.0,1.0 | |||
#command[check_sda1]={{ snap }}/usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /dev/sda1 | |||
# Check all local filesystems (-l), excluding the pseudo/virtual filesystem types listed with -X.
# NOTE(review): no -w/-c thresholds are passed here - confirm this check is able to raise an alert.
command[check_all_fs]={{ snap }}/usr/lib/nagios/plugins/check_disk -l -X tmpfs -X squashfs -X proc -X sysfs -X devtmpfs -X lxcfs -X hugetlbfs
command[check_swap]={{ snap }}/usr/lib/nagios/plugins/check_swap -n ok -w 5 -c 1 | |||
command[check_zombie_procs]={{ snap }}/usr/lib/nagios/plugins/check_procs -w 5 -c 10 -s Z | |||
command[check_total_procs]={{ snap }}/usr/lib/nagios/plugins/check_procs -w 220 -c 300 | |||
command[check_rabbitmq_server]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.rabbitmq-server | |||
command[check_cluster_server]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.cluster-server | |||
#command[check_external_bridge]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.external-bridge
command[check_glance_api]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.glance-api | |||
command[check_horizon_uwsgi]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.horizon-uwsgi | |||
command[check_keystone_uwsgi]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.keystone-uwsgi | |||
command[check_libvirtd]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.libvirtd | |||
command[check_memcached]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.memcached | |||
command[check_mysqld]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.mysqld | |||
command[check_neutron_api]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.neutron-api | |||
command[check_neutron_dhcp_agent]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.neutron-dhcp-agent | |||
command[check_neutron_l3_agent]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.neutron-l3-agent | |||
command[check_neutron_metadata_agent]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.neutron-metadata-agent | |||
command[check_neutron_openvswitch_agent]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.neutron-openvswitch-agent | |||
command[check_nginx]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.nginx | |||
command[check_nova_api]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.nova-api | |||
command[check_nova_api_metadata]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.nova-api-metadata | |||
command[check_nova_compute]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.nova-compute | |||
command[check_nova_conductor]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.nova-conductor | |||
command[check_nova_scheduler]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.nova-scheduler | |||
command[check_nova_uwsgi]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.nova-uwsgi | |||
command[check_ovs_vswitchd]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.ovs-vswitchd | |||
command[check_ovsdb_server]=python3 {{ snap }}/usr/lib/nagios/plugins/check_systemd.py snap.microstack.ovsdb-server | |||
#command[check_cpu_stats]={{ snap }}/usr/lib/nagios/plugins/check_cpu_stats.sh $ARG1$
#command[check_users]={{ snap }}/usr/lib/nagios/plugins/check_users $ARG1$
#command[check_load]={{ snap }}/usr/lib/nagios/plugins/check_load $ARG1$
#command[check_disk]={{ snap }}/usr/lib/nagios/plugins/check_disk $ARG1$
#command[check_swap]={{ snap }}/usr/lib/nagios/plugins/check_swap $ARG1$
#command[check_mem]={{ snap }}/usr/lib/nagios/plugins/custom_check_mem -n $ARG1$
include_dir={{ snap_common }}/nrpe/nrpe.conf.d |
@ -0,0 +1,131 @@ | |||
# Telegraf configuration | |||
# Telegraf is entirely plugin driven. All metrics are gathered from the | |||
# declared inputs, and sent to the declared outputs. | |||
# Plugins must be declared in here to be active. | |||
# To deactivate a plugin, comment out the name and any variables. | |||
# Use 'telegraf -config telegraf.conf -test' to see what metrics a config | |||
# file would generate. | |||
# Global tags can be specified here in key="value" format. | |||
[tags] | |||
# dc = "us-east-1" # will tag all metrics with dc=us-east-1 | |||
# rack = "1a" | |||
# Configuration for telegraf agent | |||
[agent] | |||
# Default data collection interval for all plugins | |||
interval = "10s" | |||
# Rounds collection interval to 'interval' | |||
# ie, if interval="10s" then always collect on :00, :10, :20, etc. | |||
round_interval = true | |||
# Telegraf will cache metric_buffer_limit metrics for each output, and will | |||
# flush this buffer on a successful write. | |||
metric_buffer_limit = 10000 | |||
# Collection jitter is used to jitter the collection by a random amount. | |||
# Each plugin will sleep for a random time within jitter before collecting. | |||
# This can be used to avoid many plugins querying things like sysfs at the | |||
# same time, which can have a measurable effect on the system. | |||
collection_jitter = "0s" | |||
# Default data flushing interval for all outputs. You should not set this below | |||
# interval. Maximum flush_interval will be flush_interval + flush_jitter | |||
flush_interval = "10s" | |||
# Jitter the flush interval by a random amount. This is primarily to avoid | |||
# large write spikes for users running a large number of telegraf instances. | |||
# ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s | |||
flush_jitter = "0s" | |||
# Run telegraf in debug mode | |||
debug = false | |||
# Run telegraf in quiet mode | |||
quiet = false | |||
# Override default hostname, if empty use os.Hostname() | |||
hostname = "" | |||
############################################################################### | |||
# OUTPUTS # | |||
############################################################################### | |||
[[outputs.prometheus_client]] | |||
listen = ":9103" | |||
############################################################################### | |||
# INPUTS # | |||
############################################################################### | |||
# Read metrics about cpu usage | |||
[[inputs.cpu]] | |||
# Whether to report per-cpu stats or not | |||
percpu = false | |||
# Whether to report total system cpu stats or not | |||
totalcpu = true | |||
# Comment this line if you want the raw CPU time metrics | |||
drop = ["time_*"] | |||
# Read metrics about disk usage by mount point | |||
[[inputs.disk]] | |||
# By default, telegraf gather stats for all mountpoints. | |||
# Setting mountpoints will restrict the stats to the specified mountpoints. | |||
# mount_points=["/"] | |||
# Read metrics about disk IO by device | |||
[[inputs.diskio]] | |||
# By default, telegraf will gather stats for all devices including | |||
# disk partitions. | |||
# Setting devices will restrict the stats to the specified devices. | |||
# devices = ["sda", "sdb"] | |||
# Uncomment the following line if you do not need disk serial numbers. | |||
# skip_serial_number = true | |||
# Read metrics about memory usage | |||
[[inputs.mem]] | |||
# no configuration | |||
# Read metrics about network interface usage | |||
[[inputs.net]] | |||
# By default, telegraf gathers stats from any up interface (excluding loopback) | |||
# Setting interfaces will tell it to gather these explicit interfaces, | |||
# regardless of status. | |||
# | |||
# interfaces = ["eth0", ... ] | |||
# Read metrics about TCP status such as established, time wait etc and UDP sockets counts. | |||
[[inputs.netstat]] | |||
# no configuration | |||
# Read metrics about swap memory usage | |||
[[inputs.swap]] | |||
# no configuration | |||
# Read metrics about system load & uptime | |||
[[inputs.system]] | |||
# no configuration | |||
[[inputs.bond]] | |||
# no configuration | |||
[[inputs.cgroup]] | |||
# no configuration | |||
[[inputs.exec]] | |||
commands = [ | |||
"/usr/bin/awk '{ print $1 }' /proc/sys/fs/file-nr" | |||
] | |||
data_format = "value" | |||
{% if monitoring_ipmi %} | |||
[[inputs.ipmi_sensor]] | |||
path = "{{snap}}/usr/bin/ipmitool" | |||
interval = "60s" | |||
timeout = "60s" | |||
metric_version = 2 | |||
privilege = "ADMINISTRATOR" | |||
{% if monitoring_ipmi != 'in-band' %} | |||
servers = [ "{{ monitoring_ipmi }}" ] | |||
{% endif %} | |||
{% endif %} |
@ -0,0 +1,11 @@ | |||
#!/bin/bash
# Launcher for the snapped Filebeat daemon.
# The main config file path is read from the snap option
# config.logging.custom-config; data, logs and extra config live under
# $SNAP_COMMON.

# Make sure the writable directories passed to filebeat below exist.
mkdir -p "$SNAP_COMMON/var/filebeat"
mkdir -p "$SNAP_COMMON/var/log/filebeat"
mkdir -p "$SNAP_COMMON/etc/filebeat/filebeat.conf.d"

# exec replaces this shell with filebeat, so no wrapper process is left behind.
# All expansions are quoted so paths containing whitespace stay intact.
exec "$SNAP/usr/share/filebeat/bin/filebeat" -c "$(snapctl get config.logging.custom-config)" \
    -path.home "$SNAP/usr/share/filebeat/" \
    -path.config "$SNAP_COMMON/etc/filebeat/filebeat.conf.d" \
    -path.data "$SNAP_COMMON/var/filebeat" \
    -path.logs "$SNAP_COMMON/var/log/filebeat"
@ -0,0 +1,6 @@ | |||
#!/bin/bash
# Launcher for the snapped NRPE daemon.
# The config file path is read from the snap option
# config.alerting.custom-config.

# this directory doesn't need to exist for nrpe to start
mkdir -p "$SNAP_COMMON/nrpe/nrpe.conf.d"

# exec replaces this shell with nrpe, so no wrapper process is left behind.
# The snapctl output is quoted, matching the Filebeat and Telegraf launchers.
exec "$SNAP/usr/sbin/nrpe" -c "$(snapctl get config.alerting.custom-config)" -d -f
@ -0,0 +1,8 @@ | |||
#!/bin/bash
# Launcher for the snapped Telegraf daemon.
# The main config file path is read from the snap option
# config.monitoring.custom-config; drop-in configs are read from
# $SNAP_COMMON/etc/telegraf/telegraf.d.

mkdir -p "$SNAP_COMMON/etc/telegraf/telegraf.d"
# The pidfile passed below lives in this directory, so make sure it exists.
mkdir -p "$SNAP_COMMON/run"

# FIXME (carried over): set -path.config to correct folder
# NOTE(review): -path.config is the flag used by the Filebeat launcher;
# confirm whether --config-directory below already covers this.

# exec replaces this shell with telegraf, so no wrapper process is left behind.
exec "$SNAP/usr/bin/telegraf" --config "$(snapctl get config.monitoring.custom-config)" \
    --config-directory "$SNAP_COMMON/etc/telegraf/telegraf.d" \
    --pidfile "$SNAP_COMMON/run/telegraf.pid"
@ -0,0 +1,28 @@ | |||
-----BEGIN PGP PUBLIC KEY BLOCK----- | |||
Version: SKS 1.1.6 | |||
Comment: Hostname: keyserver.ubuntu.com | |||
mQINBFcVSuIBEAC80aj0tAQ6+NhGV/bkSwu6Oj+BpDR50Be3uBv7ttdtvChL5zHTnaxjdK3h | |||
LKSyrDLlmSOkffQ2uO7CxvqeF09MsHhyvrDDx0EY54//xxoAB++PoB2OQqmqldg3Al5Hp4Dz | |||
rllV5CIX5PD8NGX8UpO3HXk5wEwn9G81l8cia3vPveU82EIkHMiJGpk6+L86OMlwXzxkSI3M | |||
xXgNFKQc+ELDYLvGSseYC9vPN3kdmFoo/UjznPPE4fxr4bXit3N8Abl1jYjBa0x6SWkK1BAb | |||
s8w3BXtvyk90z9Oyme69wPD4zAYfFp+kN2nDmTDBMtNCyMu9oatdI5SukMNK4Lcm8eAE6VNs | |||
04j7BKvGk9+17M8WP9Pw8nIisOwScS9gUlJlLUpnBaJ+sxoOvGQ4mzZxYMKzJh0E58aEX3bS | |||
AyzQfsae8bZLNOTcgotyzzIDJFF9npzu3wmKjeOt/706p4LiDqKUbQK6cI+QcJ/y80ZUK8pB | |||
M043ttSHWLmTBFX2drp6zQGae9+02fX89ZD+5c+MPlubJMYCCKkvQT4OssHfC+dVDQ66rwUy | |||
OObrzsVgikdpIxQVitL3J+Dms56xAkdFfoo+qdxxdv9S/eakc5mfavc/4WVvmFDaJiqJnJRR | |||
Ryw1zApRtuweEEdVn8niy1mahoKpWaw1pTI4AazjWI6xJH1JyQARAQABtB9MYXVuY2hwYWQg | |||
UFBBIGZvciBUZWxlZ3JhZiBEZXZziQI4BBMBAgAiBQJXFUriAhsDBgsJCAcDAgYVCAIJCgsE | |||
FgIDAQIeAQIXgAAKCRDxDL4ByUQG9UgbEACa4IzdeYxH/S5I6MrZfvWNo/JTZ/MZWDD+QlMW | |||
60ThAemCUSE+NJvZZ1q7ovGFpYnHJT9GQXOwJAX1quDUqyM1uXNmLlOyIVNnmjUTINoLhw2V | |||
iC8E7dMWC9w4Na2fKezmNHH00kNl43ncstIjjZ3pLnDGYm1y0ItiCUcTRgHhx2cUZ/vStz1S | |||
Pdqj4P3i8vuspoYJ2T3VPlM/0G+u9Yjuy3Uzu9RugOyO3UJPoi3+4O2VTNosSBy5MILVCp49 | |||
eigyFVGpq5sT/c86qd1zqmsNWEubrlzDfETS4LMj9epr46ZKPXGQkeryt1m2Oe0HkIdNZ+IQ | |||
5p+i9fnEy7/1uKTXWQYsg2UWsLA2PvTvwY8JxxMhUFgv12q2w7STntqJyi9PLItYNtbtKoS3 | |||
XZCCMqQLCWMXHY+2ol6rRSfs06H/wzlR8LjDaEXkDVuDmqMtcbgTboZYblsGxst7I/Y4Wgfi | |||
J52uiIyobQ69uJbG0XeRTLZ3WyrBkopEsTX/+sQjVqbADXYU4hBVDgnCf2uN/5dcwSEvDj8/ | |||
+WsToAfEJkscRBsQjTLVzf+eFqHLrbqz/yoYIqBc//IJMBSbxIf5mrOHHLdbOuMCB6PVwpTI | |||
vLFOSDNPuVDX+S1goA8KJTnXpm8jWDynn3XaXx3AlYw4iZ0ETSgQLQLRd6JuPOEGXsGdBA== | |||
=ufaX | |||
-----END PGP PUBLIC KEY BLOCK----- | |||