[thedac, r=gnuoy] This change adds support for queue monitoring by nagios.
This commit is contained in:
commit
c0a6ee8b6e
2
Makefile
2
Makefile
@ -39,4 +39,4 @@ unit_test: .venv
|
||||
|
||||
functional_test:
|
||||
@echo Starting amulet tests...
|
||||
@juju test -v -p AMULET_HTTP_PROXY --timeout 900
|
||||
@juju test -v -p AMULET_HTTP_PROXY,OS_USERNAME,OS_TENANT_NAME,OS_REGION_NAME,OS_PASSWORD,OS_AUTH_URL --timeout 900
|
||||
|
15
config.yaml
15
config.yaml
@ -172,3 +172,18 @@ options:
|
||||
description: |
|
||||
Minimum number of units expected to exist before charm will attempt to
|
||||
form a rabbitmq cluster.
|
||||
stats_cron_schedule:
|
||||
type: string
|
||||
default: '*/5 * * * *'
|
||||
description: |
|
||||
Cron schedule used to generate rabbitmq stats. If unset
|
||||
no stats will be generated
|
||||
queue_thresholds:
|
||||
type: string
|
||||
default: "[['\\*', '\\*', 100, 200]]"
|
||||
description: |
|
||||
List of RabbitMQ queue size check thresholds. Interpreted as YAML
|
||||
in format [<vhost>, <queue>, <warn>, <crit>]
|
||||
- ['/', 'queue1', 10, 20]
|
||||
- ['/', 'queue2', 200, 300]
|
||||
Wildcards '*' are accepted to monitor all vhosts and/or queues
|
||||
|
@ -6,6 +6,7 @@ import sys
|
||||
import subprocess
|
||||
import glob
|
||||
import socket
|
||||
import yaml
|
||||
|
||||
import rabbit_utils as rabbit
|
||||
from lib.utils import (
|
||||
@ -50,7 +51,8 @@ from charmhelpers.core.hookenv import (
|
||||
unit_get,
|
||||
is_relation_made,
|
||||
Hooks,
|
||||
UnregisteredHookError
|
||||
UnregisteredHookError,
|
||||
charm_dir
|
||||
)
|
||||
from charmhelpers.core.host import (
|
||||
cmp_pkgrevno,
|
||||
@ -58,6 +60,7 @@ from charmhelpers.core.host import (
|
||||
rsync,
|
||||
service_stop,
|
||||
service_restart,
|
||||
write_file,
|
||||
)
|
||||
from charmhelpers.contrib.charmsupport import nrpe
|
||||
from charmhelpers.contrib.ssl.service import ServiceCA
|
||||
@ -80,6 +83,11 @@ RABBIT_DIR = '/var/lib/rabbitmq'
|
||||
RABBIT_USER = 'rabbitmq'
|
||||
RABBIT_GROUP = 'rabbitmq'
|
||||
NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
|
||||
SCRIPTS_DIR = '/usr/local/bin'
|
||||
STATS_CRONFILE = '/etc/cron.d/rabbitmq-stats'
|
||||
STATS_DATAFILE = os.path.join(RABBIT_DIR, 'data',
|
||||
'{}_queue_stats.dat'
|
||||
''.format(socket.gethostname()))
|
||||
|
||||
|
||||
@hooks.hook('install')
|
||||
@ -489,6 +497,17 @@ def update_nrpe_checks():
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
|
||||
'check_rabbitmq.py'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py'))
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
|
||||
'check_rabbitmq_queues.py'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_queues.py'))
|
||||
if config('stats_cron_schedule'):
|
||||
script = os.path.join(SCRIPTS_DIR, 'collect_rabbitmq_stats.sh')
|
||||
cronjob = "{} root {}\n".format(config('stats_cron_schedule'), script)
|
||||
rsync(os.path.join(charm_dir(), 'scripts',
|
||||
'collect_rabbitmq_stats.sh'), script)
|
||||
write_file(STATS_CRONFILE, cronjob)
|
||||
elif os.path.isfile(STATS_CRONFILE):
|
||||
os.remove(STATS_CRONFILE)
|
||||
|
||||
# Find out if nrpe set nagios_hostname
|
||||
hostname = nrpe.get_nagios_hostname()
|
||||
@ -511,6 +530,17 @@ def update_nrpe_checks():
|
||||
check_cmd='{}/check_rabbitmq.py --user {} --password {} --vhost {}'
|
||||
''.format(NAGIOS_PLUGINS, user, password, vhost)
|
||||
)
|
||||
if config('queue_thresholds'):
|
||||
cmd = ""
|
||||
# If value of queue_thresholds is incorrect we want the hook to fail
|
||||
for item in yaml.safe_load(config('queue_thresholds')):
|
||||
cmd += ' -c "{}" "{}" {} {}'.format(*item)
|
||||
nrpe_compat.add_check(
|
||||
shortname=rabbit.RABBIT_USER + '_queue',
|
||||
description='Check RabbitMQ Queues',
|
||||
check_cmd='{}/check_rabbitmq_queues.py{} {}'.format(
|
||||
NAGIOS_PLUGINS, cmd, STATS_DATAFILE)
|
||||
)
|
||||
nrpe_compat.write()
|
||||
|
||||
|
||||
|
99
scripts/check_rabbitmq_queues.py
Executable file
99
scripts/check_rabbitmq_queues.py
Executable file
@ -0,0 +1,99 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# Copyright (C) 2011, 2012, 2014 Canonical
|
||||
# All Rights Reserved
|
||||
# Author: Liam Young, Jacek Nykis
|
||||
|
||||
from collections import defaultdict
|
||||
from fnmatch import fnmatchcase
|
||||
from itertools import chain
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
def gen_data_lines(filename):
    """Yield non-comment lines from a queue stats file.

    Lines beginning with '#' (the header written by the stats cron
    script) are skipped; every other line is yielded verbatim.
    """
    # Open in text mode: the '#' prefix test below is a str comparison,
    # so binary mode ("rb") would stop filtering entirely under
    # Python 3 (bytes never match a str prefix).
    with open(filename, "r") as fin:
        for line in fin:
            if not line.startswith("#"):
                yield line
|
||||
|
||||
def gen_stats(data_lines):
    """Parse stats lines into (vhost, queue, message_count) tuples.

    Each data line must contain at least six whitespace-separated
    fields:
        vhost queue msgs_ready msgs_unacked msgs_total consumers ...
    Only the vhost, queue name and total message count are yielded.

    Exits the process with status 2 (nagios CRITICAL) on malformed
    input instead of raising, so the plugin output stays parseable.
    """
    for line in data_lines:
        try:
            vhost, queue, _, _, m_all, _ = line.split(None, 5)
        except ValueError:
            print("ERROR: problem parsing the stats file")
            sys.exit(2)
        # Was an assert, but asserts are stripped under 'python -O'
        # and an AssertionError traceback is not valid nagios plugin
        # output; report and exit CRITICAL like the parse failure.
        if not m_all.isdigit():
            print("ERROR: message count is not a number: %r" % m_all)
            sys.exit(2)
        yield vhost, queue, int(m_all)
|
||||
|
||||
def collate_stats(stats, limits):
    """Aggregate raw queue stats according to the limit definitions.

    Each (vhost, queue, count) stat is summed under the first limit
    entry whose vhost/queue glob patterns match it; a stat matching
    no limit entry is accumulated under its own (vhost, queue) key.
    """
    totals = defaultdict(int)
    for vhost, queue, count in stats:
        # Default to the stat's own identity; a matching limit
        # pattern overrides it so wildcard entries are collated.
        key = (vhost, queue)
        for l_vhost, l_queue, _, _ in limits:
            if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue):
                key = (l_vhost, l_queue)
                break
        totals[key] += count
    return totals
|
||||
|
||||
def check_stats(stats_collated, limits):
    """Yield (queue, vhost, count, status) for stats breaching limits.

    Thresholds come from *limits* entries of the form
    (vhost, queue, warn, crit).  An empty stats dict yields a single
    CRIT entry; queues with no matching threshold are reported as
    UNKNOWN; queues below their warning threshold are not reported.
    """
    # Threshold lookup keyed by (vhost, queue).
    thresholds = {}
    for l_vhost, l_queue, t_warning, t_critical in limits:
        thresholds[(l_vhost, l_queue)] = (int(t_warning), int(t_critical))
    if not stats_collated:
        yield 'No Queues Found', 'No Vhosts Found', None, "CRIT"
    # Compare each collated stat against its limits, if any.
    for vhost, queue in sorted(stats_collated):
        count = stats_collated[vhost, queue]
        if (vhost, queue) not in thresholds:
            yield queue, vhost, count, "UNKNOWN"
            continue
        warn_at, crit_at = thresholds[(vhost, queue)]
        if count >= crit_at:
            yield queue, vhost, count, "CRIT"
        elif count >= warn_at:
            yield queue, vhost, count, "WARN"
|
||||
|
||||
if __name__ == "__main__":
    # Command-line driver: parse thresholds and stats files, collate
    # and check, then emit a single nagios-style status line and exit
    # with the conventional code (0=OK, 1=WARNING, 2=CRITICAL).
    parser = argparse.ArgumentParser(description='RabbitMQ queue size nagios check.')
    parser.add_argument('-c', nargs=4, action='append', required=True,
                        metavar=('vhost', 'queue', 'warn', 'crit'),
                        help=('Vhost and queue to check. Can be used multiple times'))
    parser.add_argument('stats_file', nargs='*', type=str, help='file containing queue stats')
    args = parser.parse_args()

    # Start generating stats from all files given on the command line.
    stats = gen_stats(
        chain.from_iterable(
            gen_data_lines(filename) for filename in args.stats_file))
    # Collate stats according to limit definitions and check.
    stats_collated = collate_stats(stats, args.c)
    stats_checked = check_stats(stats_collated, args.c)
    # Bucket the results; only CRIT and WARN are reported.
    # NOTE(review): UNKNOWN results from check_stats are silently
    # dropped here — confirm that is intended for this plugin.
    criticals, warnings = [], []
    for queue, vhost, message_no, status in stats_checked:
        if status == "CRIT":
            criticals.append(
                "%s in %s has %s messages" % (queue, vhost, message_no))
        elif status == "WARN":
            warnings.append(
                "%s in %s has %s messages" % (queue, vhost, message_no))
    if len(criticals) > 0:
        print "CRITICALS: %s" % ", ".join(criticals)
        sys.exit(2)
    # XXX: No warnings if there are criticals?
    elif len(warnings) > 0:
        print "WARNINGS: %s" % ", ".join(warnings)
        sys.exit(1)
    else:
        print "OK"
        sys.exit(0)
|
49
scripts/collect_rabbitmq_stats.sh
Executable file
49
scripts/collect_rabbitmq_stats.sh
Executable file
@ -0,0 +1,49 @@
|
||||
#!/bin/bash
# Copyright (C) 2011, 2014 Canonical
# All Rights Reserved
# Author: Liam Young, Jacek Nykis

# Produce a queue data for a given vhost. Useful for graphing and Nagios checks
# Writes per-queue message counts to a per-host .dat file consumed by
# check_rabbitmq_queues.py, plus mnesia/RSS sizes to a general stats file.
LOCK=/var/lock/rabbitmq-gather-metrics.lock
# Check for a lock file and if not, create one
# (-r2: retry twice, then give up so overlapping cron runs exit quietly)
lockfile-create -r2 --lock-name $LOCK > /dev/null 2>&1
if [ $? -ne 0 ]; then
    exit 1
fi
# Remove the lock on any exit path.
trap "rm -f $LOCK > /dev/null 2>&1" exit

# Required to fix the bug about start-stop-daemon not being found in
# rabbitmq-server 2.7.1-0ubuntu4.
# '/usr/sbin/rabbitmqctl: 33: /usr/sbin/rabbitmqctl: start-stop-daemon: not found'
export PATH=${PATH}:/sbin/

# Locate the broker PID: older packages record it in /var/lib/rabbitmq/pids,
# newer ones in /var/run/rabbitmq/pid.
# NOTE(review): ${HOSTNAME} here is the shell's inherited (full) hostname;
# HOSTNAME is reassigned to `hostname -s` further down but never used after
# that — confirm the grep is matching the node name as intended.
if [ -f /var/lib/rabbitmq/pids ]; then
    RABBIT_PID=$(grep "{rabbit\@${HOSTNAME}," /var/lib/rabbitmq/pids | sed -e 's!^.*,\([0-9]*\).*!\1!')
elif [ -f /var/run/rabbitmq/pid ]; then
    RABBIT_PID=$(cat /var/run/rabbitmq/pid)
else
    echo "No PID file found"
    exit 3
fi
DATA_DIR="/var/lib/rabbitmq/data"
DATA_FILE="${DATA_DIR}/$(hostname -s)_queue_stats.dat"
LOG_DIR="/var/lib/rabbitmq/logs"
RABBIT_STATS_DATA_FILE="${DATA_DIR}/$(hostname -s)_general_stats.dat"
NOW=$(date +'%s')
HOSTNAME=$(hostname -s)
# Mnesia database size in MB and broker resident set size in KB.
MNESIA_DB_SIZE=$(du -sm /var/lib/rabbitmq/mnesia | cut -f1)
RABBIT_RSS=$(ps -p $RABBIT_PID -o rss=)
if [ ! -d $DATA_DIR ]; then
    mkdir -p $DATA_DIR
fi
if [ ! -d $LOG_DIR ]; then
    mkdir -p $LOG_DIR
fi
# Rewrite the queue stats file from scratch each run: a '#' header line
# (skipped by the check plugin) followed by one row per vhost/queue.
echo "#Vhost Name Messages_ready Messages_unacknowledged Messages Consumers Memory Time" > $DATA_FILE
/usr/sbin/rabbitmqctl -q list_vhosts | \
while read VHOST; do
    /usr/sbin/rabbitmqctl -q list_queues -p $VHOST name messages_ready messages_unacknowledged messages consumers memory | \
    awk "{print \"$VHOST \" \$0 \" $(date +'%s') \"}" >> $DATA_FILE 2>${LOG_DIR}/list_queues.log
done
# General broker stats in "key: value@epoch" form.
echo "mnesia_size: ${MNESIA_DB_SIZE}@$NOW" > $RABBIT_STATS_DATA_FILE
echo "rss_size: ${RABBIT_RSS}@$NOW" >> $RABBIT_STATS_DATA_FILE
|
83
tests/50_test_monitoring.py
Executable file
83
tests/50_test_monitoring.py
Executable file
@ -0,0 +1,83 @@
|
||||
#!/usr/bin/python3

# This Amulet test performs a basic deploy and checks if rabbitmq is running.
# It deploys rabbitmq-server related to nrpe-external-master, then executes
# the generated nagios check commands on the unit and expects exit code 0.

import amulet
import os
import time

# The number of seconds to wait for the environment to setup.
seconds = 900
# Get the directory in this way to load the files from the tests directory.
path = os.path.abspath(os.path.dirname(__file__))

key_path = os.path.join(path, 'rabbit-server-privkey.pem')
# Read the private key file.
with open(key_path) as f:
    privateKey = f.read()
# Read the certificate file.
cert_path = os.path.join(path, 'rabbit-server-cert.pem')
with open(cert_path) as f:
    certificate = f.read()
# NOTE(review): privateKey and certificate are read but never used below —
# either pass them into the charm configuration or drop these reads (they
# will fail the test if the .pem fixtures are missing).

# Create a dictionary for the rabbitmq configuration.
# '*/1 * * * *' makes the stats cron run every minute so the queue check
# has data shortly after deploy.
rabbitmq_configuration = {
    'stats_cron_schedule': '*/1 * * * *'
}
d = amulet.Deployment(series='trusty')
# Add the rabbitmq-server charm to the deployment.
d.add('rabbitmq-server')
# Configure options on the rabbitmq-server.
d.configure('rabbitmq-server', rabbitmq_configuration)
# Expose the server so we can connect.
d.expose('rabbitmq-server')
# XXX Remove charm= once this branch lands in the charm store
d.add('nrpe-external-master',
      charm='lp:~gnuoy/charms/trusty/nrpe/services-rewrite')
d.relate('rabbitmq-server:nrpe-external-master',
         'nrpe-external-master:nrpe-external-master')

try:
    # Execute the deployer with the current mapping.
    d.setup(timeout=seconds)
except amulet.helpers.TimeoutError:
    message = 'The environment did not setup in %d seconds.' % seconds
    # The SKIP status enables skip or fail the test based on configuration.
    amulet.raise_status(amulet.SKIP, msg=message)
except:
    # NOTE(review): a bare 'except: raise' is a no-op — any other
    # exception would propagate anyway; this clause can be removed.
    raise
print('The rabbitmq-server has been successfully deployed and related '
      'to nrpe-external-master.')

###############################################################################
# # Verify nagios checks
###############################################################################
rabbitmq_sentry = d.sentry.unit['rabbitmq-server/0']

# Extract the check command from the nrpe config file and run it directly
# on the unit; nagios checks exit 0 on OK.
command = 'bash -c "$(egrep -oh /usr/local.* ' \
          '/etc/nagios/nrpe.d/check_rabbitmq.cfg)"'
print(command)
output, code = rabbitmq_sentry.run(command)
print(output)
if (code != 0):
    message = 'The ' + command + ' did not return the expected code of 0.'
    amulet.raise_status(amulet.FAIL, msg=message)
else:
    print('The rabbitmq-server check_rabbitmq is OK')

# The queue check reads the stats file produced by the per-minute cron,
# so wait for at least one cron run before executing it.
print('Sleeping 70 seconds to make sure the monitoring cron has run')
time.sleep(70)

# NOTE(review): file name 'check_rabbitmq_queue.cfg' — confirm it matches
# the shortname the charm registers for the queue check.
command = 'bash -c "$(egrep -oh /usr/local.* ' \
          '/etc/nagios/nrpe.d/check_rabbitmq_queue.cfg)"'
print(command)
output, code = rabbitmq_sentry.run(command)
print(output)
if (code != 0):
    message = 'The ' + command + ' did not return the expected code of 0.'
    amulet.raise_status(amulet.FAIL, msg=message)
else:
    print('The rabbitmq-server check_rabbitmq_queue is OK')

# Success!
print('The rabbitmq-server passed the monitoring tests!')
|
Loading…
Reference in New Issue
Block a user