[thedac, r=gnuoy] This change adds support for queue monitoring by nagios.
This commit is contained in:
commit
c0a6ee8b6e
2
Makefile
2
Makefile
@ -39,4 +39,4 @@ unit_test: .venv
|
||||
|
||||
functional_test:
|
||||
@echo Starting amulet tests...
|
||||
@juju test -v -p AMULET_HTTP_PROXY --timeout 900
|
||||
@juju test -v -p AMULET_HTTP_PROXY,OS_USERNAME,OS_TENANT_NAME,OS_REGION_NAME,OS_PASSWORD,OS_AUTH_URL --timeout 900
|
||||
|
15
config.yaml
15
config.yaml
@ -172,3 +172,18 @@ options:
|
||||
description: |
|
||||
Minimum number of units expected to exist before charm will attempt to
|
||||
form a rabbitmq cluster.
|
||||
stats_cron_schedule:
|
||||
type: string
|
||||
default: '*/5 * * * *'
|
||||
description: |
|
||||
Cron schedule used to generate rabbitmq stats. If unset
|
||||
no stats will be generated
|
||||
queue_thresholds:
|
||||
type: string
|
||||
default: "[['\\*', '\\*', 100, 200]]"
|
||||
description: |
|
||||
List of RabbitMQ queue size check thresholds. Interpreted as YAML
|
||||
in format [<vhost>, <queue>, <warn>, <crit>]
|
||||
- ['/', 'queue1', 10, 20]
|
||||
- ['/', 'queue2', 200, 300]
|
||||
Wildcards '*' are accepted to monitor all vhosts and/or queues
|
||||
|
@ -6,6 +6,7 @@ import sys
|
||||
import subprocess
|
||||
import glob
|
||||
import socket
|
||||
import yaml
|
||||
|
||||
import rabbit_utils as rabbit
|
||||
from lib.utils import (
|
||||
@ -50,7 +51,8 @@ from charmhelpers.core.hookenv import (
|
||||
unit_get,
|
||||
is_relation_made,
|
||||
Hooks,
|
||||
UnregisteredHookError
|
||||
UnregisteredHookError,
|
||||
charm_dir
|
||||
)
|
||||
from charmhelpers.core.host import (
|
||||
cmp_pkgrevno,
|
||||
@ -58,6 +60,7 @@ from charmhelpers.core.host import (
|
||||
rsync,
|
||||
service_stop,
|
||||
service_restart,
|
||||
write_file,
|
||||
)
|
||||
from charmhelpers.contrib.charmsupport import nrpe
|
||||
from charmhelpers.contrib.ssl.service import ServiceCA
|
||||
@ -80,6 +83,11 @@ RABBIT_DIR = '/var/lib/rabbitmq'
|
||||
RABBIT_USER = 'rabbitmq'
|
||||
RABBIT_GROUP = 'rabbitmq'
|
||||
NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
|
||||
SCRIPTS_DIR = '/usr/local/bin'
|
||||
STATS_CRONFILE = '/etc/cron.d/rabbitmq-stats'
|
||||
STATS_DATAFILE = os.path.join(RABBIT_DIR, 'data',
|
||||
'{}_queue_stats.dat'
|
||||
''.format(socket.gethostname()))
|
||||
|
||||
|
||||
@hooks.hook('install')
|
||||
@ -489,6 +497,17 @@ def update_nrpe_checks():
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
|
||||
'check_rabbitmq.py'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py'))
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'scripts',
|
||||
'check_rabbitmq_queues.py'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_queues.py'))
|
||||
if config('stats_cron_schedule'):
|
||||
script = os.path.join(SCRIPTS_DIR, 'collect_rabbitmq_stats.sh')
|
||||
cronjob = "{} root {}\n".format(config('stats_cron_schedule'), script)
|
||||
rsync(os.path.join(charm_dir(), 'scripts',
|
||||
'collect_rabbitmq_stats.sh'), script)
|
||||
write_file(STATS_CRONFILE, cronjob)
|
||||
elif os.path.isfile(STATS_CRONFILE):
|
||||
os.remove(STATS_CRONFILE)
|
||||
|
||||
# Find out if nrpe set nagios_hostname
|
||||
hostname = nrpe.get_nagios_hostname()
|
||||
@ -511,6 +530,17 @@ def update_nrpe_checks():
|
||||
check_cmd='{}/check_rabbitmq.py --user {} --password {} --vhost {}'
|
||||
''.format(NAGIOS_PLUGINS, user, password, vhost)
|
||||
)
|
||||
if config('queue_thresholds'):
|
||||
cmd = ""
|
||||
# If value of queue_thresholds is incorrect we want the hook to fail
|
||||
for item in yaml.safe_load(config('queue_thresholds')):
|
||||
cmd += ' -c "{}" "{}" {} {}'.format(*item)
|
||||
nrpe_compat.add_check(
|
||||
shortname=rabbit.RABBIT_USER + '_queue',
|
||||
description='Check RabbitMQ Queues',
|
||||
check_cmd='{}/check_rabbitmq_queues.py{} {}'.format(
|
||||
NAGIOS_PLUGINS, cmd, STATS_DATAFILE)
|
||||
)
|
||||
nrpe_compat.write()
|
||||
|
||||
|
||||
|
99
scripts/check_rabbitmq_queues.py
Executable file
99
scripts/check_rabbitmq_queues.py
Executable file
@ -0,0 +1,99 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# Copyright (C) 2011, 2012, 2014 Canonical
|
||||
# All Rights Reserved
|
||||
# Author: Liam Young, Jacek Nykis
|
||||
|
||||
from collections import defaultdict
|
||||
from fnmatch import fnmatchcase
|
||||
from itertools import chain
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
def gen_data_lines(filename):
    """Yield non-comment lines from a queue stats file.

    Lines beginning with '#' (the header written by the stats cron
    script) are skipped; every other line is yielded verbatim.
    """
    # Open in text mode: the '#' prefix test below is a str comparison,
    # so binary mode ("rb") would stop filtering entirely under
    # Python 3 (bytes never match a str prefix).
    with open(filename, "r") as fin:
        for line in fin:
            if not line.startswith("#"):
                yield line
|
||||
|
||||
def gen_stats(data_lines):
    """Parse stats lines into (vhost, queue, message_count) tuples.

    Each data line must contain at least six whitespace-separated
    fields:
        vhost queue msgs_ready msgs_unacked msgs_total consumers ...
    Only the vhost, queue name and total message count are yielded.

    Exits the process with status 2 (nagios CRITICAL) on malformed
    input instead of raising, so the plugin output stays parseable.
    """
    for line in data_lines:
        try:
            vhost, queue, _, _, m_all, _ = line.split(None, 5)
        except ValueError:
            print("ERROR: problem parsing the stats file")
            sys.exit(2)
        # Was an assert, but asserts are stripped under 'python -O'
        # and an AssertionError traceback is not valid nagios plugin
        # output; report and exit CRITICAL like the parse failure.
        if not m_all.isdigit():
            print("ERROR: message count is not a number: %r" % m_all)
            sys.exit(2)
        yield vhost, queue, int(m_all)
|
||||
|
||||
def collate_stats(stats, limits):
    """Aggregate raw queue stats according to the limit definitions.

    Each (vhost, queue, count) stat is summed under the first limit
    entry whose vhost/queue glob patterns match it; a stat matching
    no limit entry is accumulated under its own (vhost, queue) key.
    """
    totals = defaultdict(int)
    for vhost, queue, count in stats:
        # Default to the stat's own identity; a matching limit
        # pattern overrides it so wildcard entries are collated.
        key = (vhost, queue)
        for l_vhost, l_queue, _, _ in limits:
            if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue):
                key = (l_vhost, l_queue)
                break
        totals[key] += count
    return totals
|
||||
|
||||
def check_stats(stats_collated, limits):
    """Yield (queue, vhost, count, status) for stats breaching limits.

    Thresholds come from *limits* entries of the form
    (vhost, queue, warn, crit).  An empty stats dict yields a single
    CRIT entry; queues with no matching threshold are reported as
    UNKNOWN; queues below their warning threshold are not reported.
    """
    # Threshold lookup keyed by (vhost, queue).
    thresholds = {}
    for l_vhost, l_queue, t_warning, t_critical in limits:
        thresholds[(l_vhost, l_queue)] = (int(t_warning), int(t_critical))
    if not stats_collated:
        yield 'No Queues Found', 'No Vhosts Found', None, "CRIT"
    # Compare each collated stat against its limits, if any.
    for vhost, queue in sorted(stats_collated):
        count = stats_collated[vhost, queue]
        if (vhost, queue) not in thresholds:
            yield queue, vhost, count, "UNKNOWN"
            continue
        warn_at, crit_at = thresholds[(vhost, queue)]
        if count >= crit_at:
            yield queue, vhost, count, "CRIT"
        elif count >= warn_at:
            yield queue, vhost, count, "WARN"
|
||||
|
||||
if __name__ == "__main__":
    # Command-line driver: parse thresholds and stats files, collate
    # and check, then emit a single nagios-style status line and exit
    # with the conventional code (0=OK, 1=WARNING, 2=CRITICAL).
    parser = argparse.ArgumentParser(description='RabbitMQ queue size nagios check.')
    parser.add_argument('-c', nargs=4, action='append', required=True,
                        metavar=('vhost', 'queue', 'warn', 'crit'),
                        help=('Vhost and queue to check. Can be used multiple times'))
    parser.add_argument('stats_file', nargs='*', type=str, help='file containing queue stats')
    args = parser.parse_args()

    # Start generating stats from all files given on the command line.
    stats = gen_stats(
        chain.from_iterable(
            gen_data_lines(filename) for filename in args.stats_file))
    # Collate stats according to limit definitions and check.
    stats_collated = collate_stats(stats, args.c)
    stats_checked = check_stats(stats_collated, args.c)
    # Bucket the results; only CRIT and WARN are reported.
    # NOTE(review): UNKNOWN results from check_stats are silently
    # dropped here — confirm that is intended for this plugin.
    criticals, warnings = [], []
    for queue, vhost, message_no, status in stats_checked:
        if status == "CRIT":
            criticals.append(
                "%s in %s has %s messages" % (queue, vhost, message_no))
        elif status == "WARN":
            warnings.append(
                "%s in %s has %s messages" % (queue, vhost, message_no))
    if len(criticals) > 0:
        print "CRITICALS: %s" % ", ".join(criticals)
        sys.exit(2)
    # XXX: No warnings if there are criticals?
    elif len(warnings) > 0:
        print "WARNINGS: %s" % ", ".join(warnings)
        sys.exit(1)
    else:
        print "OK"
        sys.exit(0)
|
49
scripts/collect_rabbitmq_stats.sh
Executable file
49
scripts/collect_rabbitmq_stats.sh
Executable file
@ -0,0 +1,49 @@
|
||||
#!/bin/bash
# Copyright (C) 2011, 2014 Canonical
# All Rights Reserved
# Author: Liam Young, Jacek Nykis

# Produce a queue data for a given vhost. Useful for graphing and Nagios checks
# Writes per-queue message counts to a per-host .dat file consumed by
# check_rabbitmq_queues.py, plus mnesia/RSS sizes to a general stats file.
LOCK=/var/lock/rabbitmq-gather-metrics.lock
# Check for a lock file and if not, create one
# (-r2: retry twice, then give up so overlapping cron runs exit quietly)
lockfile-create -r2 --lock-name $LOCK > /dev/null 2>&1
if [ $? -ne 0 ]; then
    exit 1
fi
# Remove the lock on any exit path.
trap "rm -f $LOCK > /dev/null 2>&1" exit

# Required to fix the bug about start-stop-daemon not being found in
# rabbitmq-server 2.7.1-0ubuntu4.
# '/usr/sbin/rabbitmqctl: 33: /usr/sbin/rabbitmqctl: start-stop-daemon: not found'
export PATH=${PATH}:/sbin/

# Locate the broker PID: older packages record it in /var/lib/rabbitmq/pids,
# newer ones in /var/run/rabbitmq/pid.
# NOTE(review): ${HOSTNAME} here is the shell's inherited (full) hostname;
# HOSTNAME is reassigned to `hostname -s` further down but never used after
# that — confirm the grep is matching the node name as intended.
if [ -f /var/lib/rabbitmq/pids ]; then
    RABBIT_PID=$(grep "{rabbit\@${HOSTNAME}," /var/lib/rabbitmq/pids | sed -e 's!^.*,\([0-9]*\).*!\1!')
elif [ -f /var/run/rabbitmq/pid ]; then
    RABBIT_PID=$(cat /var/run/rabbitmq/pid)
else
    echo "No PID file found"
    exit 3
fi
DATA_DIR="/var/lib/rabbitmq/data"
DATA_FILE="${DATA_DIR}/$(hostname -s)_queue_stats.dat"
LOG_DIR="/var/lib/rabbitmq/logs"
RABBIT_STATS_DATA_FILE="${DATA_DIR}/$(hostname -s)_general_stats.dat"
NOW=$(date +'%s')
HOSTNAME=$(hostname -s)
# Mnesia database size in MB and broker resident set size in KB.
MNESIA_DB_SIZE=$(du -sm /var/lib/rabbitmq/mnesia | cut -f1)
RABBIT_RSS=$(ps -p $RABBIT_PID -o rss=)
if [ ! -d $DATA_DIR ]; then
    mkdir -p $DATA_DIR
fi
if [ ! -d $LOG_DIR ]; then
    mkdir -p $LOG_DIR
fi
# Rewrite the queue stats file from scratch each run: a '#' header line
# (skipped by the check plugin) followed by one row per vhost/queue.
echo "#Vhost Name Messages_ready Messages_unacknowledged Messages Consumers Memory Time" > $DATA_FILE
/usr/sbin/rabbitmqctl -q list_vhosts | \
while read VHOST; do
    /usr/sbin/rabbitmqctl -q list_queues -p $VHOST name messages_ready messages_unacknowledged messages consumers memory | \
    awk "{print \"$VHOST \" \$0 \" $(date +'%s') \"}" >> $DATA_FILE 2>${LOG_DIR}/list_queues.log
done
# General broker stats in "key: value@epoch" form.
echo "mnesia_size: ${MNESIA_DB_SIZE}@$NOW" > $RABBIT_STATS_DATA_FILE
echo "rss_size: ${RABBIT_RSS}@$NOW" >> $RABBIT_STATS_DATA_FILE
|
83
tests/50_test_monitoring.py
Executable file
83
tests/50_test_monitoring.py
Executable file
@ -0,0 +1,83 @@
|
||||
#!/usr/bin/python3

# This Amulet test performs a basic deploy and checks if rabbitmq is running.
# It deploys rabbitmq-server related to nrpe-external-master, then executes
# the generated nagios check commands on the unit and expects exit code 0.

import amulet
import os
import time

# The number of seconds to wait for the environment to setup.
seconds = 900
# Get the directory in this way to load the files from the tests directory.
path = os.path.abspath(os.path.dirname(__file__))

key_path = os.path.join(path, 'rabbit-server-privkey.pem')
# Read the private key file.
with open(key_path) as f:
    privateKey = f.read()
# Read the certificate file.
cert_path = os.path.join(path, 'rabbit-server-cert.pem')
with open(cert_path) as f:
    certificate = f.read()
# NOTE(review): privateKey and certificate are read but never used below —
# either pass them into the charm configuration or drop these reads (they
# will fail the test if the .pem fixtures are missing).

# Create a dictionary for the rabbitmq configuration.
# '*/1 * * * *' makes the stats cron run every minute so the queue check
# has data shortly after deploy.
rabbitmq_configuration = {
    'stats_cron_schedule': '*/1 * * * *'
}
d = amulet.Deployment(series='trusty')
# Add the rabbitmq-server charm to the deployment.
d.add('rabbitmq-server')
# Configure options on the rabbitmq-server.
d.configure('rabbitmq-server', rabbitmq_configuration)
# Expose the server so we can connect.
d.expose('rabbitmq-server')
# XXX Remove charm= once this branch lands in the charm store
d.add('nrpe-external-master',
      charm='lp:~gnuoy/charms/trusty/nrpe/services-rewrite')
d.relate('rabbitmq-server:nrpe-external-master',
         'nrpe-external-master:nrpe-external-master')

try:
    # Execute the deployer with the current mapping.
    d.setup(timeout=seconds)
except amulet.helpers.TimeoutError:
    message = 'The environment did not setup in %d seconds.' % seconds
    # The SKIP status enables skip or fail the test based on configuration.
    amulet.raise_status(amulet.SKIP, msg=message)
except:
    # NOTE(review): a bare 'except: raise' is a no-op — any other
    # exception would propagate anyway; this clause can be removed.
    raise
print('The rabbitmq-server has been successfully deployed and related '
      'to nrpe-external-master.')

###############################################################################
# # Verify nagios checks
###############################################################################
rabbitmq_sentry = d.sentry.unit['rabbitmq-server/0']

# Extract the check command from the nrpe config file and run it directly
# on the unit; nagios checks exit 0 on OK.
command = 'bash -c "$(egrep -oh /usr/local.* ' \
          '/etc/nagios/nrpe.d/check_rabbitmq.cfg)"'
print(command)
output, code = rabbitmq_sentry.run(command)
print(output)
if (code != 0):
    message = 'The ' + command + ' did not return the expected code of 0.'
    amulet.raise_status(amulet.FAIL, msg=message)
else:
    print('The rabbitmq-server check_rabbitmq is OK')

# The queue check reads the stats file produced by the per-minute cron,
# so wait for at least one cron run before executing it.
print('Sleeping 70 seconds to make sure the monitoring cron has run')
time.sleep(70)

# NOTE(review): file name 'check_rabbitmq_queue.cfg' — confirm it matches
# the shortname the charm registers for the queue check.
command = 'bash -c "$(egrep -oh /usr/local.* ' \
          '/etc/nagios/nrpe.d/check_rabbitmq_queue.cfg)"'
print(command)
output, code = rabbitmq_sentry.run(command)
print(output)
if (code != 0):
    message = 'The ' + command + ' did not return the expected code of 0.'
    amulet.raise_status(amulet.FAIL, msg=message)
else:
    print('The rabbitmq-server check_rabbitmq_queue is OK')

# Success!
print('The rabbitmq-server passed the monitoring tests!')
|
Loading…
Reference in New Issue
Block a user