charm-rabbitmq-server/files/check_rabbitmq_queues.py
Tianqi 9376aeb8e6 Handle non-uniform queue stats output
RabbitMQ server sometimes creates non-uniform output that nrpe
can't parse. Instead of breaking the check, this commit outputs
the error messages and continues the check.

This problem is most likely caused by the queue state being
"down" [1]. However, because the current charm doesn't expose such
information and the bug is hard to reproduce manually, this
commit adds the state attribute when creating the queue_state file
for future debugging.

[1] https://www.rabbitmq.com/rabbitmqctl.8.html#state_2

Closes-Bug: #1850948
Change-Id: Iaa493c8270f344cde8ad7c89bd2bb548f0ad71bd
2022-04-13 21:53:33 +00:00
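
For illustration, a minimal sketch of the behaviour this commit
introduces (queue names, counts, and the trailing fields are invented;
gen_stats() below only reads the vhost, the queue, and the fifth
column, the total message count):

    lines = [
        "/ good_queue 1 0 5 0 running",  # seven fields: parsed normally
        "/ bad_queue down",              # short line: reported, then skipped
    ]
    for vhost, queue, m_all in gen_stats(lines):
        print(vhost, queue, m_all)       # prints only the well-formed line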


#!/usr/bin/python3
# Copyright (C) 2011, 2012, 2014 Canonical
# All Rights Reserved
# Author: Liam Young, Jacek Nykis
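#
# Nagios check for RabbitMQ queue sizes: reads queue statistics files
# (written out of band, e.g. by the charm's cron job) and raises
# WARN/CRIT when per-queue message counts cross the configured limits.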
from collections import defaultdict
from datetime import datetime, timedelta
from fnmatch import fnmatchcase
from itertools import chain
import argparse
import os
import sys

# Distro information from /etc/lsb-release (parsed here but not used below).
lsb_dict = {}
with open("/etc/lsb-release") as f:
    lsb = [s.split("=") for s in f.readlines()]
    lsb_dict = dict([(k, v.strip()) for k, v in lsb])


def gen_data_lines(filename):
    # Yield data lines from the stats file, skipping "#" comment lines.
    with open(filename, "rt") as fin:
        for line in fin:
            if not line.startswith("#"):
                yield line


def gen_stats(data_lines):
    # Parse each stats line into (vhost, queue, total message count); lines
    # that do not have the expected seven fields are reported and skipped.
    for line in data_lines:
        try:
            vhost, queue, _, _, m_all, _, _ = line.split(None, 6)
        except ValueError:
            print("ERROR: problem parsing the line {}".format(line))
            continue
        assert m_all.isdigit(), ("Message count is not a number: {0!r}"
                                 .format(m_all))
        yield vhost, queue, int(m_all)


def collate_stats(stats, limits, exclude, busiest_queues):
    # Create a dict with stats collated according to the definitions in the
    # limits file. If none of the definitions in the limits file is matched,
    # store the stat without collating.
    collated = defaultdict(lambda: 0)
    for vhost, queue, m_all in stats:
        skip = False
        for e_vhost, e_queue in exclude:
            if fnmatchcase(vhost, e_vhost) and fnmatchcase(queue, e_queue):
                skip = True
                break
        if skip:
            continue
        for l_vhost, l_queue, _, _ in limits:
            if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue):
                collated[l_vhost, l_queue] += m_all
                # Save vhost and queue names when using wildcards as
                # arguments.
                if busiest_queues > 0:
                    collated[vhost, queue] += m_all
                break
        else:
            collated[vhost, queue] += m_all
    return collated
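
# A small illustration of collate_stats() (names and counts invented):
#   limits = [("/", "nova_*", "10", "20")]
#   stats = [("/", "nova_a", 4), ("/", "nova_b", 5), ("/", "other", 1)]
#   collate_stats(stats, limits, exclude=[], busiest_queues=0)
#   # -> {("/", "nova_*"): 9, ("/", "other"): 1}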


def check_stats(stats_collated, limits):
    # Create a limits lookup dict with keys of the form (vhost, queue).
    limits_lookup = dict(
        ((l_vhost, l_queue), (int(t_warning), int(t_critical)))
        for l_vhost, l_queue, t_warning, t_critical in limits)
    if not stats_collated:
        yield 'No Queues Found', 'No Vhosts Found', None, "UNKNOWN"
    # Go through the stats and compare against limits, if any.
    for l_vhost, l_queue in sorted(stats_collated):
        m_all = stats_collated[l_vhost, l_queue]
        try:
            t_warning, t_critical = limits_lookup[l_vhost, l_queue]
        except KeyError:
            yield l_queue, l_vhost, m_all, "UNKNOWN"
        else:
            if m_all >= t_critical:
                yield l_queue, l_vhost, m_all, "CRIT"
            elif m_all >= t_warning:
                yield l_queue, l_vhost, m_all, "WARN"
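
# Illustration of check_stats() (numbers invented): with limits
# [("/", "q", "10", "20")] and collated stats {("/", "q"): 15} it yields
# ("q", "/", 15, "WARN"); at 25 it would yield "CRIT", and a queue with
# no matching limit yields "UNKNOWN".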


def check_stats_file_freshness(stats_file, oldest_timestamp):
    """Check if a rabbitmq stats file is fresh.

    Fresh here is defined as modified within the last two cron job intervals.

    :param stats_file: file name to check
    :param oldest_timestamp: oldest timestamp the file can be last modified
    :return: tuple (status, message)
    """
    file_mtime = datetime.fromtimestamp(os.path.getmtime(stats_file))
    if file_mtime < oldest_timestamp:
        return (
            "CRIT",
            "Rabbit stats file not updated since {}".format(
                file_mtime
            ),
        )
    return ("OK", "")


def top_n_queues(stats, busiest_queues):
    # Return a header plus the busiest_queues queues with the most messages.
    if busiest_queues <= 0:
        return []
    tqueues = [" - Top Queues"]
    sorted_messages_stats = sorted(stats.items(),
                                   key=lambda y: y[1],
                                   reverse=True)
    for stat in sorted_messages_stats[:busiest_queues]:
        tqueues.append("{0}:{1} -> {2}".format(stat[0][0],  # vhost
                                               stat[0][1],  # queue
                                               stat[1]))    # messages
    return tqueues
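
# e.g. top_n_queues({("/", "a"): 5, ("/", "b"): 9}, busiest_queues=1)
#      -> [" - Top Queues", "/:b -> 9"]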


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='RabbitMQ queue size nagios check.')
    parser.add_argument(
        '-c',
        nargs=4,
        action='append',
        required=True,
        metavar=('vhost', 'queue', 'warn', 'crit'),
        help='Vhost and queue to check. Can be used multiple times'
    )
    parser.add_argument(
        '-e',
        nargs=2,
        action='append',
        required=False,
        default=[],
        metavar=('vhost', 'queue'),
        help=(
            'Vhost and queue to exclude from checks. Can be used multiple '
            'times'
        )
    )
    parser.add_argument(
        '-m',
        nargs='?',
        action='store',
        required=False,
        default=0,
        type=int,
        help=(
            'Maximum age (in seconds) the stats files can be before a crit '
            'is raised'
        )
    )
    parser.add_argument(
        '-d',
        type=int,
        required=False,
        default=0,
        metavar='n',
        help='Display the n busiest queues'
    )
    parser.add_argument(
        'stats_file',
        nargs='*',
        type=str,
        help='file containing queue stats')
    args = parser.parse_args()

    # Start generating stats from all files given on the command line.
    stats = gen_stats(
        chain.from_iterable(
            gen_data_lines(filename) for filename in args.stats_file))
    # Collate stats according to limit definitions and check.
    stats_collated = collate_stats(stats, args.c, args.e, args.d)
    stats_checked = check_stats(stats_collated, args.c)
    criticals, warnings = [], []
    for queue, vhost, message_no, status in stats_checked:
        if status == "CRIT":
            criticals.append(
                "%s in %s has %s messages" % (queue, vhost, message_no))
        elif status == "WARN":
            warnings.append(
                "%s in %s has %s messages" % (queue, vhost, message_no))

    if args.m:
        oldest = datetime.now() - timedelta(seconds=args.m)
        freshness_results = [check_stats_file_freshness(f, oldest)
                             for f in args.stats_file]
        criticals.extend(
            msg for status, msg in freshness_results if status == "CRIT"
        )

    tqueues = top_n_queues(stats_collated, args.d)
    if len(criticals) > 0:
        print("CRITICAL: {0} {1}".format(", ".join(criticals),
                                         " | ".join(tqueues)))
        sys.exit(2)
    # XXX: No warnings if there are criticals?
    elif len(warnings) > 0:
        print("WARNING: {0} {1}".format(", ".join(warnings),
                                        " | ".join(tqueues)))
        sys.exit(1)
    else:
        print("OK")
        sys.exit(0)
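
# Hypothetical invocation (thresholds, -d count, and the stats file path
# are made-up values; in the charm this is typically wired up as an nrpe
# check):
#   ./check_rabbitmq_queues.py -c / '*' 100 200 -m 600 -d 3 \
#       /tmp/queue_stats.dat
# Exit status follows nagios semantics: 0 OK, 1 WARNING, 2 CRITICAL.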