Added stats cronjob and queue monitoring nagios plugin

This commit is contained in:
Jacek Nykis
2014-05-07 10:52:24 +01:00
parent 1f831eca17
commit 21bc24944e
5 changed files with 192 additions and 5 deletions

View File

@@ -133,3 +133,15 @@ options:
description: | description: |
Key ID to import to the apt keyring to support use with arbitrary source Key ID to import to the apt keyring to support use with arbitrary source
configuration from outside of Launchpad archives or PPA's. configuration from outside of Launchpad archives or PPA's.
stats_cron_schedule:
type: string
description: |
Cron schedule used to generate rabbitmq stats. If unset
no stats will be generated
queue_thresholds:
type: string
description: |
List of RabbitMQ queue size check thresholds. Interpreted as YAML
in format [<vhost>, <queue>, <warn>, <crit>]
- ['/', 'queue1', 10, 20]
- ['/', 'queue2', 200, 300]

View File

@@ -5,6 +5,7 @@ import shutil
import sys import sys
import subprocess import subprocess
import glob import glob
import yaml
import rabbit_utils as rabbit import rabbit_utils as rabbit
from lib.utils import ( from lib.utils import (
@@ -41,7 +42,7 @@ from charmhelpers.core.hookenv import (
UnregisteredHookError UnregisteredHookError
) )
from charmhelpers.core.host import ( from charmhelpers.core.host import (
rsync, service_stop, service_restart rsync, service_stop, service_restart, write_file
) )
from charmhelpers.contrib.charmsupport.nrpe import NRPE from charmhelpers.contrib.charmsupport.nrpe import NRPE
from charmhelpers.contrib.ssl.service import ServiceCA from charmhelpers.contrib.ssl.service import ServiceCA
@@ -60,6 +61,10 @@ RABBIT_DIR = '/var/lib/rabbitmq'
RABBIT_USER = 'rabbitmq' RABBIT_USER = 'rabbitmq'
RABBIT_GROUP = 'rabbitmq' RABBIT_GROUP = 'rabbitmq'
NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins' NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
SCRIPTS_DIR = '/usr/local/bin'
STATS_CRONFILE = '/etc/cron.d/rabbitmq-stats'
STATS_DATAFILE = os.path.join(RABBIT_DIR, 'data',
subprocess.check_output(['hostname', '-s']).strip() + '_queue_stats.dat')
@hooks.hook('install') @hooks.hook('install')
@@ -334,10 +339,10 @@ def ceph_changed():
rbd_img=rbd_img, sizemb=sizemb, rbd_img=rbd_img, sizemb=sizemb,
fstype='ext4', mount_point=RABBIT_DIR, fstype='ext4', mount_point=RABBIT_DIR,
blk_device=blk_device, blk_device=blk_device,
system_services=['rabbitmq-server'])#, system_services=['rabbitmq-server']) # ,
#rbd_pool_replicas=rbd_pool_rep_count) #rbd_pool_replicas=rbd_pool_rep_count)
subprocess.check_call(['chown', '-R', '%s:%s' % subprocess.check_call(['chown', '-R', '%s:%s' %
(RABBIT_USER,RABBIT_GROUP), RABBIT_DIR]) (RABBIT_USER, RABBIT_GROUP), RABBIT_DIR])
else: else:
log('This is not the peer leader. Not configuring RBD.') log('This is not the peer leader. Not configuring RBD.')
log('Stopping rabbitmq-server.') log('Stopping rabbitmq-server.')
@@ -360,9 +365,20 @@ def update_nrpe_checks():
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts', rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
'check_rabbitmq.py'), 'check_rabbitmq.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py')) os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py'))
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
'check_rabbitmq_queues.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_queues.py'))
if config('stats_cron_schedule'):
script = os.path.join(SCRIPTS_DIR, 'collect_rabbitmq_stats.sh')
cronjob = "{} root {}\n".format(config('stats_cron_schedule'), script)
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
'collect_rabbitmq_stats.sh'), script)
write_file(STATS_CRONFILE, cronjob)
elif os.path.isfile(STATS_CRONFILE):
os.remove(STATS_CRONFILE)
# Find out if nrpe set nagios_hostname # Find out if nrpe set nagios_hostname
hostname=None hostname = None
for rel in relations_of_type('nrpe-external-master'): for rel in relations_of_type('nrpe-external-master'):
if 'nagios_hostname' in rel: if 'nagios_hostname' in rel:
hostname = rel['nagios_hostname'] hostname = rel['nagios_hostname']
@@ -384,6 +400,17 @@ def update_nrpe_checks():
check_cmd='{}/check_rabbitmq.py --user {} --password {} --vhost {}' check_cmd='{}/check_rabbitmq.py --user {} --password {} --vhost {}'
''.format(NAGIOS_PLUGINS, user, password, vhost) ''.format(NAGIOS_PLUGINS, user, password, vhost)
) )
if config('queue_thresholds'):
cmd = ""
# If value of queue_thresholds is incorrect we want the hook to fail
for item in yaml.safe_load(config('queue_thresholds')):
cmd += ' -c {} {} {} {}'.format(*item)
nrpe_compat.add_check(
shortname=rabbit.RABBIT_USER + '_queue',
description='Check RabbitMQ Queues',
check_cmd='{}/check_rabbitmq_queues.py{} {}'.format(
NAGIOS_PLUGINS, cmd, STATS_DATAFILE)
)
nrpe_compat.write() nrpe_compat.write()

View File

@@ -1 +1 @@
128 150

View File

@@ -0,0 +1,99 @@
#!/usr/bin/python
# Copyright (C) 2011, 2012, 2014 Canonical
# All Rights Reserved
# Author: Liam Young, Jacek Nykis
from collections import defaultdict
from fnmatch import fnmatchcase
from itertools import chain
import argparse
import sys
def gen_data_lines(filename):
    """Yield non-comment lines from the stats file *filename*.

    Opened in text mode: the stats file is plain text written by the
    collector script, and binary mode would yield ``bytes`` on Python 3,
    breaking the ``str`` comparisons downstream.
    """
    with open(filename, "r") as fin:
        for line in fin:
            # Lines starting with "#" are the header rows written by
            # collect_rabbitmq_stats.sh; skip them.
            if not line.startswith("#"):
                yield line
def gen_stats(data_lines):
    """Parse stats lines into ``(vhost, queue, message_count)`` tuples.

    Each line must contain at least six whitespace-separated fields:
    vhost, queue, ready, unacked, all, remainder.  Exits with status 2
    (nagios CRITICAL) on malformed input so the check fails loudly.
    """
    for line in data_lines:
        try:
            vhost, queue, _, _, m_all, _ = line.split(None, 5)
        except ValueError:
            print("ERROR: problem parsing the stats file")
            sys.exit(2)
        # Explicit check instead of `assert`: asserts are stripped when
        # python runs with -O, which would silently skip validation.
        if not m_all.isdigit():
            print("ERROR: message count is not a number: %r" % m_all)
            sys.exit(2)
        yield vhost, queue, int(m_all)
def collate_stats(stats, limits):
    """Aggregate raw per-queue stats into buckets defined by *limits*.

    Each limits entry is (vhost_pattern, queue_pattern, warn, crit) where
    the patterns are fnmatch-style.  A stat is added to the first limit
    bucket whose patterns match; stats matching no definition are kept
    under their own (vhost, queue) key.
    """
    collated = defaultdict(int)
    for vhost, queue, m_all in stats:
        bucket = next(
            ((l_vhost, l_queue)
             for l_vhost, l_queue, _, _ in limits
             if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue)),
            (vhost, queue))
        collated[bucket] += m_all
    return collated
def check_stats(stats_collated, limits):
    """Yield ``(queue, vhost, message_count, status)`` for collated stats.

    Statuses: CRIT/WARN when a configured threshold is reached, UNKNOWN
    for stats with no matching limit definition; stats below the warning
    threshold yield nothing.  An empty stats dict yields a single CRIT.
    """
    # Threshold lookup keyed by (vhost, queue).
    thresholds = dict(
        ((l_vhost, l_queue), (int(t_warn), int(t_crit)))
        for l_vhost, l_queue, t_warn, t_crit in limits)
    if not stats_collated:
        yield 'No Queues Found', 'No Vhosts Found', None, "CRIT"
    for vhost, queue in sorted(stats_collated):
        count = stats_collated[vhost, queue]
        limit = thresholds.get((vhost, queue))
        if limit is None:
            yield queue, vhost, count, "UNKNOWN"
            continue
        warn, crit = limit
        if count >= crit:
            yield queue, vhost, count, "CRIT"
        elif count >= warn:
            yield queue, vhost, count, "WARN"
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='RabbitMQ queue size nagios check.')
    parser.add_argument('-c', nargs=4, action='append', required=True,
                        metavar=('vhost', 'queue', 'warn', 'crit'),
                        help=('Vhost and queue to check. Can be used multiple times'))
    parser.add_argument('stats_file', nargs='*', type=str, help='file containing queue stats')
    args = parser.parse_args()
    # Lazily read data lines from every stats file on the command line,
    # parse them, collate per the limit definitions, then apply thresholds.
    data_lines = chain.from_iterable(
        gen_data_lines(path) for path in args.stats_file)
    results = check_stats(collate_stats(gen_stats(data_lines), args.c), args.c)
    criticals = []
    warnings = []
    for queue, vhost, count, status in results:
        message = "%s in %s has %s messages" % (queue, vhost, count)
        if status == "CRIT":
            criticals.append(message)
        elif status == "WARN":
            warnings.append(message)
    if criticals:
        print("CRITICALS: %s" % ", ".join(criticals))
        sys.exit(2)
    # XXX: No warnings if there are criticals?
    elif warnings:
        print("WARNINGS: %s" % ", ".join(warnings))
        sys.exit(1)
    else:
        print("OK")
        sys.exit(0)

View File

@@ -0,0 +1,49 @@
#!/bin/bash
# Copyright (C) 2011, 2014 Canonical
# All Rights Reserved
# Author: Liam Young, Jacek Nykis
# Produce queue data for each vhost. Useful for graphing and Nagios checks
LOCK=/var/lock/rabbitmq-gather-metrics.lock
# Check for a lock file and if not, create one
lockfile-create -r2 --lock-name $LOCK > /dev/null 2>&1
if [ $? -ne 0 ]; then
exit 1
fi
# Release the lock on any exit, normal or signalled.
trap "rm -f $LOCK > /dev/null 2>&1" exit
# Required to fix the bug about start-stop-daemon not being found in
# rabbitmq-server 2.7.1-0ubuntu4.
# '/usr/sbin/rabbitmqctl: 33: /usr/sbin/rabbitmqctl: start-stop-daemon: not found'
export PATH=${PATH}:/sbin/
# Locate the rabbit PID: newer packages write a pids file under
# /var/lib/rabbitmq, older ones a plain pid under /var/run/rabbitmq.
# NOTE(review): this grep uses ${HOSTNAME} from the environment (usually the
# full hostname); HOSTNAME is reassigned to `hostname -s` further down —
# confirm the ordering and value are intentional.
if [ -f /var/lib/rabbitmq/pids ]; then
RABBIT_PID=$(grep "{rabbit\@${HOSTNAME}," /var/lib/rabbitmq/pids | sed -e 's!^.*,\([0-9]*\).*!\1!')
elif [ -f /var/run/rabbitmq/pid ]; then
RABBIT_PID=$(cat /var/run/rabbitmq/pid)
else
echo "No PID file found"
exit 3
fi
# Output locations: per-queue stats, general stats and error logs all live
# under /var/lib/rabbitmq, keyed by the short hostname (matches the
# STATS_DATAFILE path the charm's nagios check reads).
DATA_DIR="/var/lib/rabbitmq/data"
DATA_FILE="${DATA_DIR}/$(hostname -s)_queue_stats.dat"
LOG_DIR="/var/lib/rabbitmq/logs"
RABBIT_STATS_DATA_FILE="${DATA_DIR}/$(hostname -s)_general_stats.dat"
NOW=$(date +'%s')
HOSTNAME=$(hostname -s)
# General health figures: mnesia database size (MB) and broker RSS (KB).
MNESIA_DB_SIZE=$(du -sm /var/lib/rabbitmq/mnesia | cut -f1)
RABBIT_RSS=$(ps -p $RABBIT_PID -o rss=)
if [ ! -d $DATA_DIR ]; then
mkdir -p $DATA_DIR
fi
if [ ! -d $LOG_DIR ]; then
mkdir -p $LOG_DIR
fi
# Rewrite the queue stats file from scratch each run: a header comment line
# followed by one row per queue across every vhost, timestamped per row.
echo "#Vhost Name Messages_ready Messages_unacknowledged Messages Consumers Memory Time" > $DATA_FILE
/usr/sbin/rabbitmqctl -q list_vhosts | \
while read VHOST; do
/usr/sbin/rabbitmqctl -q list_queues -p $VHOST name messages_ready messages_unacknowledged messages consumers memory | \
awk "{print \"$VHOST \" \$0 \" $(date +'%s') \"}" >> $DATA_FILE 2>${LOG_DIR}/list_queues.log
done
# General stats are recorded as "key: value@timestamp" pairs.
echo "mnesia_size: ${MNESIA_DB_SIZE}@$NOW" > $RABBIT_STATS_DATA_FILE
echo "rss_size: ${RABBIT_RSS}@$NOW" >> $RABBIT_STATS_DATA_FILE