From 21bc24944efeccd6e399a3c47d35e4a261edb51b Mon Sep 17 00:00:00 2001 From: Jacek Nykis Date: Wed, 7 May 2014 10:52:24 +0100 Subject: [PATCH] Added stats cronjob and queue monitoring nagios plugin --- config.yaml | 12 ++++ hooks/rabbitmq_server_relations.py | 35 +++++++++-- revision | 2 +- scripts/check_rabbitmq_queues.py | 99 ++++++++++++++++++++++++++++++ scripts/collect_rabbitmq_stats.sh | 49 +++++++++++++++ 5 files changed, 192 insertions(+), 5 deletions(-) create mode 100755 scripts/check_rabbitmq_queues.py create mode 100755 scripts/collect_rabbitmq_stats.sh diff --git a/config.yaml b/config.yaml index 468fd552..eb39b5c7 100644 --- a/config.yaml +++ b/config.yaml @@ -133,3 +133,15 @@ options: description: | Key ID to import to the apt keyring to support use with arbitary source configuration from outside of Launchpad archives or PPA's. + stats_cron_schedule: + type: string + description: | + Cron schedule used to generate rabbitmq stats. If unset + no stats will be generated + queue_thresholds: + type: string + description: | + List of RabbitMQ queue size check thresholds. 
Interpreted as YAML + in format [<vhost>, <queue>, <warn>, <crit>] + - ['/', 'queue1', 10, 20] + - ['/', 'queue2', 200, 300] diff --git a/hooks/rabbitmq_server_relations.py b/hooks/rabbitmq_server_relations.py index 2a9444a0..45604561 100755 --- a/hooks/rabbitmq_server_relations.py +++ b/hooks/rabbitmq_server_relations.py @@ -5,6 +5,7 @@ import shutil import sys import subprocess import glob +import yaml import rabbit_utils as rabbit from lib.utils import ( @@ -41,7 +42,7 @@ from charmhelpers.core.hookenv import ( UnregisteredHookError ) from charmhelpers.core.host import ( - rsync, service_stop, service_restart + rsync, service_stop, service_restart, write_file ) from charmhelpers.contrib.charmsupport.nrpe import NRPE from charmhelpers.contrib.ssl.service import ServiceCA @@ -60,6 +61,10 @@ RABBIT_DIR = '/var/lib/rabbitmq' RABBIT_USER = 'rabbitmq' RABBIT_GROUP = 'rabbitmq' NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins' +SCRIPTS_DIR = '/usr/local/bin' +STATS_CRONFILE = '/etc/cron.d/rabbitmq-stats' +STATS_DATAFILE = os.path.join(RABBIT_DIR, 'data', + subprocess.check_output(['hostname', '-s']).strip() + '_queue_stats.dat') @hooks.hook('install') @@ -334,10 +339,10 @@ def ceph_changed(): rbd_img=rbd_img, sizemb=sizemb, fstype='ext4', mount_point=RABBIT_DIR, blk_device=blk_device, - system_services=['rabbitmq-server'])#, + system_services=['rabbitmq-server']) # , #rbd_pool_replicas=rbd_pool_rep_count) subprocess.check_call(['chown', '-R', '%s:%s' % - (RABBIT_USER,RABBIT_GROUP), RABBIT_DIR]) + (RABBIT_USER, RABBIT_GROUP), RABBIT_DIR]) else: log('This is not the peer leader. 
Not configuring RBD.') log('Stopping rabbitmq-server.') @@ -360,9 +365,20 @@ def update_nrpe_checks(): rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts', 'check_rabbitmq.py'), os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py')) + rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts', + 'check_rabbitmq_queues.py'), + os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_queues.py')) + if config('stats_cron_schedule'): + script = os.path.join(SCRIPTS_DIR, 'collect_rabbitmq_stats.sh') + cronjob = "{} root {}\n".format(config('stats_cron_schedule'), script) + rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts', + 'collect_rabbitmq_stats.sh'), script) + write_file(STATS_CRONFILE, cronjob) + elif os.path.isfile(STATS_CRONFILE): + os.remove(STATS_CRONFILE) # Find out if nrpe set nagios_hostname - hostname=None + hostname = None for rel in relations_of_type('nrpe-external-master'): if 'nagios_hostname' in rel: hostname = rel['nagios_hostname'] @@ -384,6 +400,17 @@ def update_nrpe_checks(): check_cmd='{}/check_rabbitmq.py --user {} --password {} --vhost {}' ''.format(NAGIOS_PLUGINS, user, password, vhost) ) + if config('queue_thresholds'): + cmd = "" + # If value of queue_thresholds is incorrect we want the hook to fail + for item in yaml.safe_load(config('queue_thresholds')): + cmd += ' -c {} {} {} {}'.format(*item) + nrpe_compat.add_check( + shortname=rabbit.RABBIT_USER + '_queue', + description='Check RabbitMQ Queues', + check_cmd='{}/check_rabbitmq_queues.py{} {}'.format( + NAGIOS_PLUGINS, cmd, STATS_DATAFILE) + ) nrpe_compat.write() diff --git a/revision b/revision index a949a93d..fa8f08cb 100644 --- a/revision +++ b/revision @@ -1 +1 @@ -128 +150 diff --git a/scripts/check_rabbitmq_queues.py b/scripts/check_rabbitmq_queues.py new file mode 100755 index 00000000..a81b954a --- /dev/null +++ b/scripts/check_rabbitmq_queues.py @@ -0,0 +1,99 @@ +#!/usr/bin/python + +# Copyright (C) 2011, 2012, 2014 Canonical +# All Rights Reserved +# Author: Liam Young, Jacek Nykis + +from 
collections import defaultdict +from fnmatch import fnmatchcase +from itertools import chain +import argparse +import sys + +def gen_data_lines(filename): + with open(filename, "rb") as fin: + for line in fin: + if not line.startswith("#"): + yield line + + +def gen_stats(data_lines): + for line in data_lines: + try: + vhost, queue, _, _, m_all, _ = line.split(None, 5) + except ValueError: + print "ERROR: problem parsing the stats file" + sys.exit(2) + assert m_all.isdigit(), "Message count is not a number: %r" % m_all + yield vhost, queue, int(m_all) + + +def collate_stats(stats, limits): + # Create a dict with stats collated according to the definitions in the + # limits file. If none of the definitions in the limits file is matched, + # store the stat without collating. + collated = defaultdict(lambda: 0) + for vhost, queue, m_all in stats: + for l_vhost, l_queue, _, _ in limits: + if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue): + collated[l_vhost, l_queue] += m_all + break + else: + collated[vhost, queue] += m_all + return collated + + +def check_stats(stats_collated, limits): + # Create a limits lookup dict with keys of the form (vhost, queue). + limits_lookup = dict( + ((l_vhost, l_queue), (int(t_warning), int(t_critical))) + for l_vhost, l_queue, t_warning, t_critical in limits) + if not (stats_collated): + yield 'No Queues Found', 'No Vhosts Found', None, "CRIT" + # Go through the stats and compare again limits, if any. 
+ for l_vhost, l_queue in sorted(stats_collated): + m_all = stats_collated[l_vhost, l_queue] + try: + t_warning, t_critical = limits_lookup[l_vhost, l_queue] + except KeyError: + yield l_queue, l_vhost, m_all, "UNKNOWN" + else: + if m_all >= t_critical: + yield l_queue, l_vhost, m_all, "CRIT" + elif m_all >= t_warning: + yield l_queue, l_vhost, m_all, "WARN" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='RabbitMQ queue size nagios check.') + parser.add_argument('-c', nargs=4, action='append', required=True, + metavar=('vhost', 'queue', 'warn', 'crit'), + help=('Vhost and queue to check. Can be used multiple times')) + parser.add_argument('stats_file', nargs='*', type=str, help='file containing queue stats') + args = parser.parse_args() + + # Start generating stats from all files given on the command line. + stats = gen_stats( + chain.from_iterable( + gen_data_lines(filename) for filename in args.stats_file)) + # Collate stats according to limit definitions and check. + stats_collated = collate_stats(stats, args.c) + stats_checked = check_stats(stats_collated, args.c) + criticals, warnings = [], [] + for queue, vhost, message_no, status in stats_checked: + if status == "CRIT": + criticals.append( + "%s in %s has %s messages" % (queue, vhost, message_no)) + elif status == "WARN": + warnings.append( + "%s in %s has %s messages" % (queue, vhost, message_no)) + if len(criticals) > 0: + print "CRITICALS: %s" % ", ".join(criticals) + sys.exit(2) + # XXX: No warnings if there are criticals? 
+ elif len(warnings) > 0: + print "WARNINGS: %s" % ", ".join(warnings) + sys.exit(1) + else: + print "OK" + sys.exit(0) diff --git a/scripts/collect_rabbitmq_stats.sh b/scripts/collect_rabbitmq_stats.sh new file mode 100755 index 00000000..ff53f07b --- /dev/null +++ b/scripts/collect_rabbitmq_stats.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright (C) 2011, 2014 Canonical +# All Rights Reserved +# Author: Liam Young, Jacek Nykis + +# Produce a queue data for a given vhost. Useful for graphing and Nagios checks +LOCK=/var/lock/rabbitmq-gather-metrics.lock +# Check for a lock file and if not, create one +lockfile-create -r2 --lock-name $LOCK > /dev/null 2>&1 +if [ $? -ne 0 ]; then + exit 1 +fi +trap "rm -f $LOCK > /dev/null 2>&1" exit + +# Required to fix the bug about start-stop-daemon not being found in +# rabbitmq-server 2.7.1-0ubuntu4. +# '/usr/sbin/rabbitmqctl: 33: /usr/sbin/rabbitmqctl: start-stop-daemon: not found' +export PATH=${PATH}:/sbin/ + +if [ -f /var/lib/rabbitmq/pids ]; then + RABBIT_PID=$(grep "{rabbit\@${HOSTNAME}," /var/lib/rabbitmq/pids | sed -e 's!^.*,\([0-9]*\).*!\1!') +elif [ -f /var/run/rabbitmq/pid ]; then + RABBIT_PID=$(cat /var/run/rabbitmq/pid) +else + echo "No PID file found" + exit 3 +fi +DATA_DIR="/var/lib/rabbitmq/data" +DATA_FILE="${DATA_DIR}/$(hostname -s)_queue_stats.dat" +LOG_DIR="/var/lib/rabbitmq/logs" +RABBIT_STATS_DATA_FILE="${DATA_DIR}/$(hostname -s)_general_stats.dat" +NOW=$(date +'%s') +HOSTNAME=$(hostname -s) +MNESIA_DB_SIZE=$(du -sm /var/lib/rabbitmq/mnesia | cut -f1) +RABBIT_RSS=$(ps -p $RABBIT_PID -o rss=) +if [ ! -d $DATA_DIR ]; then + mkdir -p $DATA_DIR +fi +if [ ! 
-d $LOG_DIR ]; then + mkdir -p $LOG_DIR +fi +echo "#Vhost Name Messages_ready Messages_unacknowledged Messages Consumers Memory Time" > $DATA_FILE +/usr/sbin/rabbitmqctl -q list_vhosts | \ +while read VHOST; do + /usr/sbin/rabbitmqctl -q list_queues -p $VHOST name messages_ready messages_unacknowledged messages consumers memory | \ + awk "{print \"$VHOST \" \$0 \" $(date +'%s') \"}" >> $DATA_FILE 2>${LOG_DIR}/list_queues.log +done +echo "mnesia_size: ${MNESIA_DB_SIZE}@$NOW" > $RABBIT_STATS_DATA_FILE +echo "rss_size: ${RABBIT_RSS}@$NOW" >> $RABBIT_STATS_DATA_FILE