9376aeb8e6
RabbitMQ server sometimes creates non-uniform outputs that nrpe can't parse. Instead of breaking the check, this commit outputs the error messages and continues the check. This problem is most likely caused by queue state being "down" [1]. However, because the current charm doesn't show such information and the bug is hard to reproduce manually, this commit adds the state attribute when creating the queue_state file for future debugging. [1] https://www.rabbitmq.com/rabbitmqctl.8.html#state_2 Closes-Bug: #1850948 Change-Id: Iaa493c8270f344cde8ad7c89bd2bb548f0ad71bd
212 lines
6.7 KiB
Python
Executable File
212 lines
6.7 KiB
Python
Executable File
#!/usr/bin/python3
|
|
|
|
# Copyright (C) 2011, 2012, 2014 Canonical
|
|
# All Rights Reserved
|
|
# Author: Liam Young, Jacek Nykis
|
|
|
|
from collections import defaultdict
|
|
from datetime import datetime, timedelta
|
|
from fnmatch import fnmatchcase
|
|
from itertools import chain
|
|
import argparse
|
|
import os
|
|
import sys
|
|
|
|
|
|
# Parse /etc/lsb-release into a key/value mapping (e.g. DISTRIB_RELEASE).
lsb_dict = {}
with open("/etc/lsb-release") as release_file:
    pairs = [entry.split("=") for entry in release_file]
    lsb_dict = {key: value.strip() for key, value in pairs}
|
|
|
|
|
|
def gen_data_lines(filename):
    """Yield each line of *filename* that is not a '#' comment line."""
    with open(filename, "rt") as data_file:
        for raw_line in data_file:
            if raw_line.startswith("#"):
                continue
            yield raw_line
|
|
|
|
|
|
def gen_stats(data_lines):
    """Yield (vhost, queue, message_count) tuples parsed from stats lines.

    Each line is expected to hold at least seven whitespace-separated
    fields, of which field 1 is the vhost, field 2 the queue name and
    field 5 the total message count.  Malformed lines are reported on
    stdout and skipped so a single bad line does not abort the whole
    check (see LP#1850948 — rabbitmq sometimes emits non-uniform output).

    :param data_lines: iterable of raw stats lines
    """
    for line in data_lines:
        try:
            vhost, queue, _, _, m_all, _, _ = line.split(None, 6)
        except ValueError:
            print("ERROR: problem parsing the line {}".format(line))
            continue
        # Previously an `assert`, which is stripped under `python -O` and
        # would abort the entire check otherwise; report and skip instead,
        # consistent with the parse-error handling above.
        if not m_all.isdigit():
            print("ERROR: message count is not a number: {0!r}".format(m_all))
            continue
        yield vhost, queue, int(m_all)
|
|
|
|
|
|
def collate_stats(stats, limits, exclude, busiest_queues):
    """Aggregate raw per-queue stats according to the limit definitions.

    Stats matching a (possibly wildcarded) entry in *limits* are summed
    under that entry's key; stats matching no entry are stored under
    their own (vhost, queue) key.  Entries matching *exclude* patterns
    are dropped entirely.  When *busiest_queues* is positive, concrete
    (vhost, queue) totals are also recorded for limit-matched stats so
    the busiest queues can be reported later.

    :param stats: iterable of (vhost, queue, message_count)
    :param limits: list of (vhost, queue, warn, crit) definitions
    :param exclude: list of (vhost, queue) patterns to skip
    :param busiest_queues: number of busiest queues to track (<= 0 disables)
    :return: defaultdict mapping (vhost, queue) -> message total
    """
    collated = defaultdict(int)
    for vhost, queue, m_all in stats:
        # Drop anything matching an exclusion pattern.
        excluded = any(
            fnmatchcase(vhost, e_vhost) and fnmatchcase(queue, e_queue)
            for e_vhost, e_queue in exclude)
        if excluded:
            continue

        matched = False
        for l_vhost, l_queue, _, _ in limits:
            if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue):
                matched = True
                collated[l_vhost, l_queue] += m_all
                # Also keep the concrete vhost/queue names when the
                # busiest-queues report is enabled (limits may be wildcards).
                if busiest_queues > 0:
                    collated[vhost, queue] += m_all
                break
        if not matched:
            collated[vhost, queue] += m_all
    return collated
|
|
|
|
|
|
def check_stats(stats_collated, limits):
    """Yield (queue, vhost, message_count, status) for each collated stat.

    Status is "CRIT"/"WARN" when the configured thresholds are reached,
    "UNKNOWN" when no threshold matches the stat's key.  Stats below the
    warning threshold yield nothing.  Empty input yields one "UNKNOWN"
    marker entry so the caller can tell nothing was found.

    :param stats_collated: mapping of (vhost, queue) -> message total
    :param limits: list of (vhost, queue, warn, crit) definitions
    """
    # Threshold lookup keyed by (vhost, queue).
    thresholds = {
        (l_vhost, l_queue): (int(t_warning), int(t_critical))
        for l_vhost, l_queue, t_warning, t_critical in limits
    }
    if not stats_collated:
        yield 'No Queues Found', 'No Vhosts Found', None, "UNKNOWN"
    for vhost, queue in sorted(stats_collated):
        m_all = stats_collated[vhost, queue]
        limit = thresholds.get((vhost, queue))
        if limit is None:
            # Collated key has no matching limit definition.
            yield queue, vhost, m_all, "UNKNOWN"
            continue
        t_warning, t_critical = limit
        if m_all >= t_critical:
            yield queue, vhost, m_all, "CRIT"
        elif m_all >= t_warning:
            yield queue, vhost, m_all, "WARN"
|
|
|
|
|
|
def check_stats_file_freshness(stats_file, oldest_timestamp):
    """Check if a rabbitmq stats file is fresh

    Fresh here is defined as modified within the last 2* cron job intervals

    :param stats_file: file name to check
    :param oldest_timestamp: oldest timestamp the file can be last modified
    :return: tuple (status, message)
    """
    mtime = datetime.fromtimestamp(os.path.getmtime(stats_file))
    if mtime >= oldest_timestamp:
        return ("OK", "")
    return (
        "CRIT",
        "Rabbit stats file not updated since {}".format(mtime),
    )
|
|
|
|
|
|
def top_n_queues(stats, busiest_queues):
    """Build the report lines for the *busiest_queues* largest queues.

    :param stats: mapping of (vhost, queue) -> message count
    :param busiest_queues: number of queues to report; <= 0 disables the
        report entirely
    :return: list of report lines headed by " - Top Queues", or [] when
        disabled
    """
    if busiest_queues <= 0:
        return []
    ranked = sorted(stats.items(), key=lambda item: item[1], reverse=True)
    report = [" - Top Queues"]
    report.extend(
        "{0}:{1} -> {2}".format(vhost, queue, messages)
        for (vhost, queue), messages in ranked[:busiest_queues])
    return report
|
|
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='RabbitMQ queue size nagios check.')
    # -c: one (vhost, queue, warn, crit) threshold definition; repeatable,
    # collected into a list of 4-tuples.  vhost/queue may be fnmatch
    # wildcards (see collate_stats).
    parser.add_argument(
        '-c',
        nargs=4,
        action='append',
        required=True,
        metavar=('vhost', 'queue', 'warn', 'crit'),
        help='Vhost and queue to check. Can be used multiple times'
    )
    # -e: (vhost, queue) patterns to exclude entirely; repeatable.
    parser.add_argument(
        '-e',
        nargs=2,
        action='append',
        required=False,
        default=[],
        metavar=('vhost', 'queue'),
        help=(
            'Vhost and queue to exclude from checks. Can be used multiple '
            'times'
        )
    )
    # -m: maximum stats-file age in seconds; 0 (the default) disables the
    # freshness check below.  NOTE(review): with nargs='?', a bare `-m`
    # with no value yields None, which would crash the timedelta below —
    # presumably callers always pass a value; verify against the cron job.
    parser.add_argument(
        '-m',
        nargs='?',
        action='store',
        required=False,
        default=0,
        type=int,
        help=(
            'Maximum age (in seconds) the stats files can be before a crit is '
            'raised'
        )
    )
    # -d: report the n busiest queues in the check output; 0 disables.
    parser.add_argument(
        '-d',
        type=int,
        required=False,
        default=0,
        metavar=('n'),
        help='Display the n busiest queues'
    )
    # Positional: zero or more stats files produced out-of-band (by cron).
    parser.add_argument(
        'stats_file',
        nargs='*',
        type=str,
        help='file containing queue stats')
    args = parser.parse_args()

    # Start generating stats from all files given on the command line.
    stats = gen_stats(
        chain.from_iterable(
            gen_data_lines(filename) for filename in args.stats_file))
    # Collate stats according to limit definitions and check.
    stats_collated = collate_stats(stats, args.c, args.e, args.d)
    stats_checked = check_stats(stats_collated, args.c)
    # Partition results into critical and warning messages; UNKNOWN
    # results are silently ignored here.
    criticals, warnings = [], []
    for queue, vhost, message_no, status in stats_checked:
        if status == "CRIT":
            criticals.append(
                "%s in %s has %s messages" % (queue, vhost, message_no))
        elif status == "WARN":
            warnings.append(
                "%s in %s has %s messages" % (queue, vhost, message_no))

    # Stale stats files (not rewritten by cron within -m seconds) are
    # themselves a critical condition.
    if args.m:
        oldest = datetime.now() - timedelta(seconds=args.m)
        freshness_results = [check_stats_file_freshness(f, oldest)
                             for f in args.stats_file]
        criticals.extend(
            msg for status, msg in freshness_results if status == "CRIT"
        )

    tqueues = top_n_queues(stats_collated, args.d)

    # Exit codes follow the usual monitoring convention:
    # 2 = critical, 1 = warning, 0 = OK.
    if len(criticals) > 0:
        print("CRITICAL: {0} {1}".format(", ".join(criticals),
                                         " | ".join(tqueues)))
        sys.exit(2)
    # XXX: No warnings if there are criticals?
    elif len(warnings) > 0:
        print("WARNING: {0} {1}".format(", ".join(warnings),
                                        " | ".join(tqueues)))
        sys.exit(1)
    else:
        print("OK")
        sys.exit(0)
|