From 1c43773cd6c25e5d0cf8c9c70fa23bf6ce4cea34 Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Mon, 5 Oct 2015 11:18:19 +0200 Subject: [PATCH] Add new metrics for RabbitMQ The following new metrics are added: - The total number of queues that are not mirrored - The total memory used reported by RabbitMQ - The VM memory limit - The remaining memory before reaching VM memory limit - The disk free limit - The disk free space per node - The remaining disk space before reaching the disk free limit Change-Id: I1d50d3cb9035d60fe915afe465e6c35ff01b6bb6 --- .../files/collectd/rabbitmq_info.py | 47 ++++++++++++++++++- .../files/plugins/decoders/collectd.lua | 7 +++ doc/source/dev/metrics/rabbitmq.rst | 10 +++- 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/deployment_scripts/puppet/modules/lma_collector/files/collectd/rabbitmq_info.py b/deployment_scripts/puppet/modules/lma_collector/files/collectd/rabbitmq_info.py index e443e4cb9..cf7d5ee6e 100644 --- a/deployment_scripts/puppet/modules/lma_collector/files/collectd/rabbitmq_info.py +++ b/deployment_scripts/puppet/modules/lma_collector/files/collectd/rabbitmq_info.py @@ -71,10 +71,40 @@ class RabbitMqPlugin(base.Base): stats['memory'] = 0 stats['consumers'] = 0 stats['queues'] = 0 + stats['unmirrored_queues'] = 0 stats['pmap_mapped'] = 0 stats['pmap_used'] = 0 stats['pmap_shared'] = 0 + out, err = self.execute([self.rabbitmqctl_bin, '-q', 'status'], + shell=False) + if not out: + self.logger.error('%s: Failed to get the status' % + self.rabbitmqctl_bin) + return + + for v in ('vm_memory_limit', 'disk_free_limit', 'disk_free'): + try: + stats[v] = int(re.findall('{%s,([0-9]+)}' % v, out)[0]) + except: + self.logger.error('%s: Failed to get %s' % + (self.rabbitmqctl_bin, v)) + + mem_str = re.findall('{memory,\s+\[([^\]]+)\]\}', out) + # We are only interested by the total of memory used + # TODO: Get all informations about memory usage from mem_str + try: + stats['used_memory'] = 
int(re.findall('total,([0-9]+)', + mem_str[0])[0]) + except: + self.logger.error('%s: Failed to get the memory used by rabbitmq' % + self.rabbitmqctl_bin) + + if 'vm_memory_limit' in stats and 'used_memory' in stats: + stats['remaining_memory'] = stats['vm_memory_limit'] - stats['used_memory'] + if 'disk_free' in stats and 'disk_free_limit' in stats: + stats['remaining_disk'] = stats['disk_free'] - stats['disk_free_limit'] + out, err = self.execute([self.rabbitmqctl_bin, '-q', 'cluster_status'], shell=False) if not out: @@ -109,14 +139,15 @@ class RabbitMqPlugin(base.Base): out, err = self.execute([self.rabbitmqctl_bin, '-q', '-p', self.vhost, 'list_queues', 'name', 'messages', 'memory', - 'consumers'], shell=False) + 'consumers', 'policy', 'slave_pids', + 'synchronised_slave_pids'], shell=False) if not out: self.logger.error('%s: Failed to get the list of queues' % self.rabbitmqctl_bin) return for line in out.split('\n'): - ctl_stats = line.split() + ctl_stats = line.split('\t') try: ctl_stats[1] = int(ctl_stats[1]) ctl_stats[2] = int(ctl_stats[2]) @@ -131,6 +162,18 @@ class RabbitMqPlugin(base.Base): stats['%s.messages' % queue_name] = ctl_stats[1] stats['%s.memory' % queue_name] = ctl_stats[2] stats['%s.consumers' % queue_name] = ctl_stats[3] + # a queue is unmirrored if its policy is not ha-all + if 'ha-all' not in ctl_stats[4]: + stats['unmirrored_queues'] += 1 + else: + # we need to check if the list of synchronised slaves is + # equal to the list of slaves. 
+ slaves = re.findall('<([a-zA-Z@\-.0-9]+)>', ctl_stats[5]) + for s in slaves: + if s not in ctl_stats[6]: + stats['unmirrored_queues'] += 1 + break + if not stats['memory'] > 0: self.logger.warning( diff --git a/deployment_scripts/puppet/modules/lma_collector/files/plugins/decoders/collectd.lua b/deployment_scripts/puppet/modules/lma_collector/files/plugins/decoders/collectd.lua index c8b591bde..2048cb46b 100644 --- a/deployment_scripts/puppet/modules/lma_collector/files/plugins/decoders/collectd.lua +++ b/deployment_scripts/puppet/modules/lma_collector/files/plugins/decoders/collectd.lua @@ -191,6 +191,13 @@ function process_message () if sample['type_instance'] ~= 'consumers' and sample['type_instance'] ~= 'messages' and sample['type_instance'] ~= 'memory' and + sample['type_instance'] ~= 'used_memory' and + sample['type_instance'] ~= 'unmirrored_queues' and + sample['type_instance'] ~= 'vm_memory_limit' and + sample['type_instance'] ~= 'disk_free_limit' and + sample['type_instance'] ~= 'disk_free' and + sample['type_instance'] ~= 'remaining_memory' and + sample['type_instance'] ~= 'remaining_disk' and (string.match(sample['type_instance'], '%.consumers$') or string.match(sample['type_instance'], '%.messages$') or string.match(sample['type_instance'], '%.memory$')) then diff --git a/doc/source/dev/metrics/rabbitmq.rst b/doc/source/dev/metrics/rabbitmq.rst index a04801318..4c55d4b9e 100644 --- a/doc/source/dev/metrics/rabbitmq.rst +++ b/doc/source/dev/metrics/rabbitmq.rst @@ -13,10 +13,18 @@ Cluster * ``rabbitmq_consumers``, total number of consumers. * ``rabbitmq_exchanges``, total number of exchanges. * ``rabbitmq_memory``, bytes of memory consumed by the Erlang process associated with all queues, including stack, heap and internal structures. +* ``rabbitmq_used_memory``, bytes of memory used by the whole RabbitMQ process. +* ``rabbitmq_remaining_memory``, the difference between ``rabbitmq_vm_memory_limit`` and ``rabbitmq_used_memory``. 
* ``rabbitmq_messages``, total number of messages which are ready to be consumed or not yet acknowledged. * ``rabbitmq_total_nodes``, total number of nodes in the cluster. * ``rabbitmq_running_nodes``, total number of running nodes in the cluster. * ``rabbitmq_queues``, total number of queues. +* ``rabbitmq_unmirrored_queues``, total number of queues that are not mirrored or whose mirrors are not all synchronised. +* ``rabbitmq_vm_memory_limit``, the maximum amount of memory allocated for RabbitMQ. When ``rabbitmq_used_memory`` exceeds this value, all producers are blocked. +* ``rabbitmq_disk_free_limit``, the minimum amount of free disk space required by RabbitMQ. When ``rabbitmq_disk_free`` drops below this value, all producers are blocked. +* ``rabbitmq_disk_free``, the free disk space. +* ``rabbitmq_remaining_disk``, the difference between ``rabbitmq_disk_free`` and ``rabbitmq_disk_free_limit``. + Queues ^^^^^^ @@ -24,5 +32,5 @@ Queues All metrics have a ``queue`` field which contains the name of the RabbitMQ queue. * ``rabbitmq_queue_consumers``, number of consumers for a given queue. -* ``rabbitmq_queue_memory``, bytes of memory consumed by the Erlang process associated with the queue, including stack, heap and internal structures. +* ``rabbitmq_queue_memory``, bytes of memory consumed by the Erlang process associated with the queue, including stack, heap and internal structures. * ``rabbitmq_queue_messages``, number of messages which are ready to be consumed or not yet acknowledged for the given queue.