NRPE: Allow excluding queues from queue-size checks

An option '-e <vhost> <queue>' was added to the 'check_rabbitmq_queues.py' NRPE script to allow excluding selected queues when checking queue sizes. A corresponding option 'exclude_queues' was added to the charm config. By default, the following queues are excluded: * event.sample * notifications_designate.info * notifications_designate.error * versioned_notifications.info * versioned_notifications.error Closes-Bug: #1811433 Change-Id: I57e297bb4323a3ab98da020bfcb1630889aac6d7
2020-12-08 15:51:32 +01:00 · 2020-12-08 15:51:32 +01:00 · 7acad5fdaa
parent 07ec03b5d7
commit 7acad5fdaa
5 changed files with 57 additions and 8 deletions
--- a/.zuul.yaml
+++ b/.zuul.yaml
@ -1,5 +1,4 @@
 - project:
    templates:
-      - python35-charm-jobs
+      - openstack-python3-charm-jobs
      - openstack-python3-ussuri-jobs
      - openstack-cover-jobs
--- a/config.yaml
+++ b/config.yaml
@ -106,6 +106,20 @@ options:
      Wildcards '*' are accepted to monitor all vhosts and/or queues.
      In case of multiple matches, only the first will apply: wildcards should
      therefore be used last in order to avoid unexpected behavior.
  exclude_queues:
    type: string
    default: "[]"
    description: |
      List of RabbitMQ queues that should be skipped when checking thresholds.
      Interpreted as YAML in format [<vhost>, <queue>]
      Excluded queues can be expressed as a multi-line YAML array:
      - ['/', 'queue1']
      - ['/', 'queue2']
      Or as a list of lists:
      [['/', 'queue1'], ['/', 'queue2']]
      Wildcards '*' are accepted to exclude, for example, a single queue on all
      vhosts. Note that the wildcard asterisk must be double-escaped. Example:
      [['\\*', 'queue1']]
  connection-backlog:
    type: int
    default:
--- a/files/check_rabbitmq_queues.py
+++ b/files/check_rabbitmq_queues.py
@ -38,12 +38,22 @@ def gen_stats(data_lines):
        yield vhost, queue, int(m_all)
-def collate_stats(stats, limits):
+def collate_stats(stats, limits, exclude):
    # Create a dict with stats collated according to the definitions in the
    # limits file. If none of the definitions in the limits file is matched,
    # store the stat without collating.
    collated = defaultdict(lambda: 0)
    for vhost, queue, m_all in stats:
        skip = False
        for e_vhost, e_queue in exclude:
            if fnmatchcase(vhost, e_vhost) and fnmatchcase(queue, e_queue):
                skip = True
                break
        if skip:
            continue
        for l_vhost, l_queue, _, _ in limits:
            if fnmatchcase(vhost, l_vhost) and fnmatchcase(queue, l_queue):
                collated[l_vhost, l_queue] += m_all
@ -120,7 +130,18 @@ if __name__ == "__main__":
        action='append',
        required=True,
        metavar=('vhost', 'queue', 'warn', 'crit'),
-        help=('Vhost and queue to check. Can be used multiple times'))
+        help='Vhost and queue to check. Can be used multiple times'
    )
    parser.add_argument(
        '-e',
        nargs=2,
        action='append',
        required=False,
        default=[],
        metavar=('vhost', 'queue'),
        help='Vhost and queue to exclude from checks. Can be used multiple \
        times'
    )
    parser.add_argument(
        'stats_file',
        nargs='*',
@ -133,7 +154,7 @@ if __name__ == "__main__":
        chain.from_iterable(
            gen_data_lines(filename) for filename in args.stats_file))
    # Collate stats according to limit definitions and check.
-    stats_collated = collate_stats(stats, args.c)
+    stats_collated = collate_stats(stats, args.c, args.e)
    stats_checked = check_stats(stats_collated, args.c)
    criticals, warnings = [], []
    for queue, vhost, message_no, status in stats_checked:
--- a/hooks/rabbit_utils.py
+++ b/hooks/rabbit_utils.py
@ -1403,6 +1403,9 @@ def nrpe_update_queues_check(nrpe_compat, rabbit_dir):
        # If value of queue_thresholds is incorrect we want the hook to fail
        for item in yaml.safe_load(config('queue_thresholds')):
            cmd += ' -c "{}" "{}" {} {}'.format(*item)
        for item in yaml.safe_load(config('exclude_queues')):
            cmd += ' -e "{}" "{}"'.format(*item)
        nrpe_compat.add_check(
            shortname=RABBIT_USER + '_queue',
            description='Check RabbitMQ Queues',
--- a/unit_tests/test_rabbit_utils.py
+++ b/unit_tests/test_rabbit_utils.py
@ -1157,13 +1157,25 @@ class UtilsTests(CharmTestCase):
        # call with stats_cron_schedule set to '*/5 * * * *'
        self.test_config.set('stats_cron_schedule', '*/5 * * * *')
        # set some queues to exclude to test proper command generation
        # with '-e' parameter
        self.test_config.set('exclude_queues',
                             "[['\\*', 'event.sample'], "
                             "['\\*', 'notifications_designate.info']]")
        rabbit_utils.nrpe_update_queues_check(self.nrpe_compat, self.tmp_dir)
        default_excludes = [
            ('\\*', 'event.sample'),
            ('\\*', 'notifications_designate.info'),
        ]
        exclude_queues = ''
        for vhost, queue in default_excludes:
            exclude_queues += '-e "{}" "{}" '.format(vhost, queue)
        self.nrpe_compat.add_check.assert_called_with(
            shortname='rabbitmq_queue',
            description='Check RabbitMQ Queues',
-            check_cmd='{}/check_rabbitmq_queues.py -c "\\*" "\\*" 100 200 '
+            check_cmd='{0}/check_rabbitmq_queues.py -c "\\*" "\\*" 100 200 {1}'
-                      '{}/data/test_queue_stats.dat'.format(self.tmp_dir,
+                      '{0}/data/test_queue_stats.dat'.format(self.tmp_dir,
-                                                            self.tmp_dir))
+                                                             exclude_queues))
        self.nrpe_compat.remove_check.assert_not_called()
        self.nrpe_compat.reset_mock()