Add Pacemaker collectd plugin

This change adds a collectd plugin that gets metrics from the Pacemaker
cluster:

  - cluster's metrics
  - node's metrics
  - resource's metrics

Most of the metrics are only collected from the node that is the
designated controller except pacemaker_resource_local_active and
pacemaker_dc_local_active.

The plugin also replaces the 'pacemaker_resource' plugin since it provides
the exact same metrics and notifications for the other collectd plugins.

Finally the plugin is also installed on the standalone-rabbitmq and
standalone-database nodes if they are present.

Change-Id: I8b5b987704f69c6a60b13e8ea982f27924f488d1
This commit is contained in:
Simon Pasquier 2016-07-29 12:24:08 +02:00
parent 79a906d619
commit 3a3ef6f2e3
10 changed files with 476 additions and 176 deletions

View File

@ -357,6 +357,30 @@ if hiera('lma::collector::influxdb::server', false) {
}
}
if ($is_rabbitmq or $is_mysql_server) and ! $is_controller {
if $is_mysql_server {
$mysql_resource = {
'p_mysqld' => 'mysqld',
}
}
else {
$mysql_resource = {}
}
if $is_rabbitmq {
$rabbitmq_resource = {
'p_rabbitmq-server' => 'rabbitmq',
}
}
else {
$rabbitmq_resource = {}
}
class { 'lma_collector::collectd::pacemaker':
resources => merge($rabbitmq_resource, $mysql_resource),
hostname => $::hostname,
}
}
class { 'lma_collector::influxdb':
server => hiera('lma::collector::influxdb::server'),
port => hiera('lma::collector::influxdb::port'),

View File

@ -21,6 +21,7 @@ $network_metadata = hiera_hash('network_metadata')
$node_profiles = hiera_hash('lma::collector::node_profiles')
$is_rabbitmq = $node_profiles['rabbitmq']
$is_mysql_server = $node_profiles['mysql']
$ceilometer = hiera_hash('ceilometer', {})
$lma_collector = hiera_hash('lma_collector')
@ -304,15 +305,34 @@ if hiera('lma::collector::influxdb::server', false) {
}
$pacemaker_master_resource = 'vip__management'
# Deal with detach-* plugins
if $is_mysql_server {
$mysql_resource = {
'p_mysqld' => 'mysqld',
}
}
else {
$mysql_resource = {}
}
if $is_rabbitmq {
$rabbitmq_resource = {
'p_rabbitmq-server' => 'rabbitmq',
}
}
else {
$rabbitmq_resource = {}
}
class { 'lma_collector::collectd::pacemaker':
resources => [
'vip__public',
'vip__management',
'vip__vrouter_pub',
'vip__vrouter',
],
master_resource => $pacemaker_master_resource,
resources => merge({
'vip__public' => 'vip__public',
'vip__management' => 'vip__management',
'vip__vrouter_pub' => 'vip__vrouter_pub',
'vip__vrouter' => 'vip__vrouter',
'p_haproxy' => 'haproxy',
}, $mysql_resource, $rabbitmq_resource),
notify_resource => $pacemaker_master_resource,
hostname => $::fqdn,
}

View File

@ -737,15 +737,14 @@ which uses Pacemaker's `crm_resource` command to get statistics from Pacemaker.
##### Parameters
* `resources`: *Required*. The Pacemaker resources to get statistics for. Valid
options: an array of strings.
* `master_resource`: *Optional*. If this is set a collectd `PostCache` chain is
created to generate a collectd notification each time the Python plugin
generates a metric for the Pacemaker resource identified to by
`master_resource`. Users of
options: a hash of strings.
* `notify_resource`: *Optional*. If this is set, the collectd plugin generates
a collectd notification reporting the state of the Pacemaker resource
identified by `notify_resource`. Users of
[`lma_collector::collectd::openstack`](#define-lma_collectorcollectdopenstack),
[`lma_collector::collectd::openstack_checks`](#class-lma_collectorcollectdopenstackchecks) and
[`lma_collector::collectd::hypervisor`](#class-lma_collectorcollectdhypervisor)
with the `pacemaker_resource_master` parameter needs to declare the
with the `notify_resource` parameter need to declare the
`lma_collector::collectd::pacemaker` class and use that parameter.
Valid options: a string. Default: `undef`.
* `hostname`: *Optional*. If this is set it will be used to identify the local

View File

@ -109,13 +109,14 @@ class Base(object):
"""Iterate over the collected metrics
This class must be implemented by the subclass and should yield dict
objects that represent the collected values. Each dict has 3 keys:
objects that represent the collected values. Each dict has 6 keys:
- 'values', a scalar number or a list of numbers if the type
defines several datasources.
- 'type_instance' (optional)
- 'plugin_instance' (optional)
- 'type' (optional, default='gauge')
- 'meta' (optional)
- 'hostname' (optional)
For example:
@ -141,6 +142,7 @@ class Base(object):
v = self.collectd.Values(
plugin=self.plugin,
host=metric.get('hostname', ''),
type=metric.get('type', 'gauge'),
plugin_instance=self.plugin_instance,
type_instance=type_instance,

View File

@ -0,0 +1,306 @@
#!/usr/bin/python
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collectd
from collections import Counter
from collections import defaultdict
from sets import Set
import socket
import xml.etree.ElementTree as ET
import collectd_base as base
# Name of the collectd plugin
NAME = 'pacemaker'
# Default location of the crm_mon binary (can be overridden with the
# 'CrmMonBinary' configuration directive)
CRM_MON_BINARY = '/usr/sbin/crm_mon'
# Node status values reported by the 'node_status' metric
OFFLINE_STATUS = 0
MAINTENANCE_STATUS = 1
ONLINE_STATUS = 2
class CrmMonitorPlugin(base.Base):
    """Collectd plugin reading Pacemaker metrics from crm_mon.

    The plugin parses the XML output of 'crm_mon --as-xml -r -f' and yields
    cluster, node and resource metrics. Most metrics are only emitted by the
    node which is the cluster's Designated Controller (DC) to avoid
    duplicating datapoints across the cluster.
    """

    def __init__(self, *args, **kwargs):
        super(CrmMonitorPlugin, self).__init__(*args, **kwargs)
        self.plugin = NAME
        self.crm_mon_binary = CRM_MON_BINARY
        # May be overridden by the 'Hostname' configuration directive
        self.hostname = socket.getfqdn()
        self.notify_resource = None
        # Maps Pacemaker resource ids to the names reported in the metrics
        self.resources = {}
        # Operations history kept across collection cycles, keyed by node
        # hostname then by resource id
        self.history = {}

    def config_callback(self, conf):
        """Apply the plugin's collectd configuration directives."""
        super(CrmMonitorPlugin, self).config_callback(conf)
        for node in conf.children:
            if node.key == 'Hostname':
                self.hostname = node.values[0]
            elif node.key == 'CrmMonBinary':
                self.crm_mon_binary = node.values[0]
            elif node.key == 'Resource':
                # With a single value, the metric name is the resource id
                self.resources[node.values[0]] = node.values[-1]
            elif node.key == 'NotifyResource':
                self.notify_resource = node.values[0]

    def itermetrics(self):
        """Yield dicts describing the collected Pacemaker metrics.

        Raises:
            base.CheckException: when crm_mon cannot be executed or its
                output cannot be parsed as XML.
        """
        def str_to_bool(v):
            return str(v).lower() == 'true'

        def str_to_boolint(v):
            return 1 if str_to_bool(v) else 0

        def shorten_hostname(v):
            return v.split('.')[0]

        def same_hostname(v):
            if v is not None and v.get('name') == self.hostname:
                return 1
            return 0

        out, err = self.execute([self.crm_mon_binary, '--as-xml', '-r', '-f'],
                                shell=False)
        if not out:
            raise base.CheckException(
                "Failed to execute crm_mon '{}'".format(err))

        try:
            root = ET.fromstring(out)
        except ET.ParseError:
            raise base.CheckException(
                "Failed to parse XML '{}'".format(out[:64]))

        if self.notify_resource:
            # Notify the other collectd plugins whether the resource runs
            # locally or not
            node = root.find('resources/resource[@id="{}"]/node'.format(
                self.notify_resource))
            self.collectd.Notification(
                type='gauge',
                message='{{"resource":"{}","value":{}}}'.format(
                    self.notify_resource, same_hostname(node)),
                severity=self.collectd.NOTIF_OKAY
            ).dispatch()
            # The metric needs to be emitted too for the Lua plugins executed
            # by the metric_collector service
            yield {
                'type_instance': 'resource_local_active',
                'values': same_hostname(node),
                'meta': {'resource': self.notify_resource}
            }

        summary = root.find('summary')
        current_dc = summary.find('current_dc')
        # The metric needs to be emitted for the alarms that leverage the
        # other metrics emitted by the plugin
        yield {
            'type_instance': 'dc_local_active',
            'values': same_hostname(current_dc),
        }

        if current_dc.get('name') != self.hostname:
            # The other metrics are only collected from the cluster's DC
            return

        # Report global cluster metrics
        yield {
            'type_instance': 'dc',
            'values': str_to_boolint(current_dc.get('present', 'false'))
        }
        yield {
            'type_instance': 'quorum_status',
            'values': str_to_boolint(current_dc.get('with_quorum', 'false'))
        }
        yield {
            'type_instance': 'configured_nodes',
            'values': int(summary.find('nodes_configured').get('number'))
        }
        yield {
            'type_instance': 'configured_resources',
            'values': int(summary.find('resources_configured').get('number'))
        }

        # Report node status metrics
        cluster_nodes = []
        aggregated_nodes_status = {'online': 0, 'offline': 0, 'maintenance': 0}
        nodes_total = 0
        for node in root.find('nodes').iter('node'):
            nodes_total += 1
            hostname = shorten_hostname(node.get('name'))
            cluster_nodes.append(node.get('name'))
            if str_to_bool(node.get('online')):
                if str_to_bool(node.get('maintenance')):
                    status, value = 'maintenance', MAINTENANCE_STATUS
                else:
                    status, value = 'online', ONLINE_STATUS
            else:
                status, value = 'offline', OFFLINE_STATUS
            aggregated_nodes_status[status] += 1
            yield {
                'type_instance': 'node_status',
                'values': value,
                'hostname': hostname,
                'meta': {'status': status}
            }

        for status, cnt in aggregated_nodes_status.items():
            yield {
                'type_instance': 'nodes_count',
                'values': cnt,
                'meta': {'status': status}
            }
            # Guard against a division by zero if crm_mon reported no node
            if nodes_total > 0:
                yield {
                    'type_instance': 'nodes_percent',
                    'values': 100.0 * cnt / nodes_total,
                    'meta': {'status': status}
                }

        # Report the number of resources per status
        #
        # Clone resources can run on multiple nodes while "simple" resources
        # run on only one node at a time
        aggregated_resources = defaultdict(Counter)
        resources = root.find('resources')
        for resource_id, resource_name in self.resources.items():
            resource_elts = []
            simple_resource = None
            clone_resource = resources.find(
                'clone/resource[@id="{}"]/..'.format(resource_id))
            # Explicit comparisons with None are required here: an
            # ElementTree Element with no children evaluates to False, so a
            # plain truth test would wrongly treat a stopped simple resource
            # (which has no <node> child) as missing
            if clone_resource is None:
                simple_resource = resources.find('resource[@id="{}"]'.format(
                    resource_id))
                if simple_resource is not None:
                    resource_elts = [simple_resource]
            else:
                resource_elts = clone_resource.findall('resource')

            if not resource_elts:
                self.logger.error("{}: Couldn't find resource '{}'".format(
                    self.plugin, resource_id))
                continue

            total = 0
            for item in resource_elts:
                total += 1
                if (item.get('role') in ('Slave', 'Master') and
                        not str_to_bool(item.get('failed'))):
                    # Multi-master resource
                    aggregated_resources[resource_name]['up'] += 1
                elif item.get('role') == 'Started':
                    aggregated_resources[resource_name]['up'] += 1
                else:
                    aggregated_resources[resource_name]['down'] += 1

            if simple_resource is not None:
                # Report on which node the "simple" resource is running. A
                # stopped resource has no <node> child; every node then
                # reports 0.
                # NOTE(review): this emits 'local_resource_active' while the
                # notify branch above emits 'resource_local_active' — confirm
                # the naming difference is intentional.
                location = simple_resource.find('node')
                running_on = (location.get('name')
                              if location is not None else None)
                for node in cluster_nodes:
                    yield {
                        'type_instance': 'local_resource_active',
                        'values': str_to_boolint(node == running_on),
                        'hostname': shorten_hostname(node),
                        'meta': {'resource': resource_name}
                    }

            for status in ('up', 'down'):
                cnt = aggregated_resources[resource_name][status]
                yield {
                    'type_instance': 'resource_count',
                    'values': cnt,
                    'meta': {'status': status, 'resource': resource_name}
                }
                yield {
                    'type_instance': 'resource_percent',
                    'values': 100.0 * cnt / total,
                    'meta': {'status': status, 'resource': resource_name}
                }

        # Collect operations' history metrics for the monitored resources
        #
        # The reported count for the resource's operations is an approximate
        # value because crm_mon doesn't provide the exact number. To estimate
        # the number of operations applied to a resource, the plugin keeps a
        # copy of call_ids and compares it with the current value.
        for node in root.find('node_history').iter('node'):
            hostname = shorten_hostname(node.get('name'))
            if hostname not in self.history:
                self.history[hostname] = {}

            for resource_id, resource_name in self.resources.items():
                if resource_id not in self.history[hostname]:
                    self.history[hostname][resource_id] = {
                        'fail_count': 0,
                        'ops_count': 0,
                        # builtin set() replaces the deprecated sets.Set
                        'call_ids': set()
                    }
                v = self.history[hostname][resource_id]
                res_history = node.find('resource_history[@id="{}"]'.format(
                    resource_id))
                if res_history is not None:
                    # For simple resources, the resource_history element only
                    # exists for the node that runs the resource
                    v['fail_count'] += int(res_history.get('fail-count', 0))
                    call_ids = set(
                        i.get('call') for i in res_history.findall(
                            'operation_history'))
                    if call_ids:
                        v['ops_count'] += len(call_ids - v['call_ids'])
                        v['call_ids'] = call_ids

                yield {
                    'type_instance': 'resource_failures',
                    'values': v['fail_count'],
                    'hostname': hostname,
                    'meta': {'resource': resource_name}
                }
                yield {
                    'type_instance': 'resource_operations',
                    'values': v['ops_count'],
                    'hostname': hostname,
                    'meta': {'resource': resource_name}
                }
# Single module-level plugin instance shared by all collectd callbacks
plugin = CrmMonitorPlugin(collectd)


def init_callback():
    # Restore default SIGCHLD handling so that the subprocesses spawned by
    # execute() can be reaped
    plugin.restore_sigchld()


def config_callback(conf):
    plugin.config_callback(conf)


def read_callback():
    plugin.read_callback()


# Bug fix: init_callback was defined but never registered (the previous
# pacemaker_resource plugin did register it), so restore_sigchld() was
# never invoked
collectd.register_init(init_callback)
collectd.register_config(config_callback)
collectd.register_read(read_callback)

View File

@ -1,80 +0,0 @@
#!/usr/bin/python
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collectd
import socket
import collectd_base as base
# Name of the collectd plugin
NAME = 'pacemaker_resource'
# Default location of the crm_resource binary (can be overridden with the
# 'CrmResourceBin' configuration directive)
CRM_RESOURCE_BIN = '/usr/sbin/crm_resource'
class PacemakerResourcePlugin(base.Base):
    """Collectd plugin reporting where Pacemaker resources are running.

    For every configured resource, the plugin runs 'crm_resource --locate'
    and yields 1 when the resource is located on the local host, 0
    otherwise.
    """

    def __init__(self, *args, **kwargs):
        super(PacemakerResourcePlugin, self).__init__(*args, **kwargs)
        self.plugin = NAME
        self.crm_resource_bin = CRM_RESOURCE_BIN
        # May be overridden by the 'Hostname' configuration directive
        self.hostname = socket.getfqdn()
        self.resources = []

    def config_callback(self, conf):
        """Apply the plugin's collectd configuration directives."""
        super(PacemakerResourcePlugin, self).config_callback(conf)
        for node in conf.children:
            if node.key == 'Resource':
                self.resources.extend(node.values)
            elif node.key == 'Hostname':
                self.hostname = node.values[0]
            elif node.key == 'CrmResourceBin':
                self.crm_resource_bin = node.values[0]

    def itermetrics(self):
        """Yield one metric per configured resource.

        Raises:
            base.CheckException: when crm_resource produces no output.
        """
        for resource in self.resources:
            out, err = self.execute([self.crm_resource_bin, '--locate',
                                     '--quiet', '--resource', resource],
                                    shell=False)
            if not out:
                # Bug fix: the original template mixed printf-style '%s'
                # with str.format(), so the resource name was silently
                # dropped from the error message
                msg = "{}: Failed to get the status for '{}'".format(
                    self.plugin, resource)
                raise base.CheckException(msg)

            value = 1 if self.hostname == out.lstrip("\n") else 0
            yield {
                'type_instance': resource,
                'values': value
            }
# Single module-level plugin instance shared by all collectd callbacks.
# NOTE(review): the second argument is presumably a name/prefix consumed by
# collectd_base.Base — confirm against collectd_base.
plugin = PacemakerResourcePlugin(collectd, 'pacemaker')


def init_callback():
    # restore_sigchld() suggests default SIGCHLD handling must be restored
    # before execute() spawns subprocesses — see collectd_base for details
    plugin.restore_sigchld()


def config_callback(conf):
    plugin.config_callback(conf)


def read_callback():
    plugin.read_callback()


collectd.register_init(init_callback)
collectd.register_config(config_callback)
collectd.register_read(read_callback)

View File

@ -306,10 +306,20 @@ function process_message ()
msg['Fields'][additional_tag] = sample['type_instance']
end
end
elseif metric_source == 'pacemaker_resource' then
msg['Fields']['name'] = 'pacemaker_local_resource_active'
msg['Fields']['tag_fields'] = { 'resource' }
msg['Fields']['resource'] = sample['type_instance']
elseif metric_source == 'pacemaker' then
msg['Fields']['name'] = metric_source .. sep .. sample['type_instance']
-- add dimension fields
local t = {}
for _, v in ipairs({'status', 'resource'}) do
if sample['meta'] and sample['meta'][v] then
t[#t+1] = v
msg['Fields'][v] = sample['meta'][v]
end
end
if #t > 0 then
msg['Fields']['tag_fields'] = t
end
elseif metric_source == 'users' then
-- 'users' is a reserved name for InfluxDB v0.9
msg['Fields']['name'] = 'logged_users'

View File

@ -15,71 +15,40 @@
class lma_collector::collectd::pacemaker (
$resources,
$master_resource = undef,
$notify_resource = undef,
$hostname = undef,
) {
validate_array($resources)
validate_hash($resources)
# Add quotes around the array values
$real_resources = suffix(prefix($resources, '"'), '"')
# Add quotes around the hash keys and values
$resources_keys = suffix(prefix(keys($resources), '"'), '"')
$resources_values = suffix(prefix(values($resources), '"'), '"')
$real_resources = hash(flatten(zip($resources_keys, $resources_values)))
if $hostname {
$config = {
'Resource' => $real_resources,
'Hostname' => "\"${hostname}\"",
}
$_hostname = {'Hostname' => "\"${hostname}\""}
} else {
$config = {
'Resource' => $real_resources,
}
$_hostname = {}
}
if $notify_resource {
$_notify_resource = {'NotifyResource' => "\"${notify_resource}\""}
} else {
$_notify_resource = {}
}
lma_collector::collectd::python { 'pacemaker_resource':
config => $config
lma_collector::collectd::python { 'collectd_pacemaker':
config => merge({'Resource' => $real_resources}, $_hostname, $_notify_resource)
}
if $master_resource {
if ! member($resources, $master_resource) {
fail("${master_resource} not a member of ${resources}")
}
# Configure a PostCache chain to create a collectd notification each time
# the pacemaker_resource plugin generates a metric whose "type instance"
# matches the resource specified by the $master_resource parameter.
#
# The notifications are caught by other plugins to know the state of that
# Pacemaker resource.
collectd::plugin { 'target_notification': }
collectd::plugin { 'match_regex': }
class { 'collectd::plugin::chain':
chainname => 'PostCache',
defaulttarget => 'write',
rules => [
{
'match' => {
'type' => 'regex',
'matches' => {
'Plugin' => '^pacemaker_resource$',
'TypeInstance' => "^${master_resource}$",
},
},
'targets' => [
{
'type' => 'notification',
'attributes' => {
'Message' => '{\"resource\":\"%{type_instance}\",\"value\":%{ds:value}}',
'Severity' => 'OKAY',
},
},
],
},
],
}
# Remove configuration bits from versions < 1.0
collectd::plugin { 'target_notification':
ensure => absent
}
collectd::plugin { 'match_regex':
ensure => absent
}
class { 'collectd::plugin::chain':
ensure => absent
}
}

View File

@ -20,34 +20,26 @@ describe 'lma_collector::collectd::pacemaker' do
end
describe 'with "resources" param' do
let(:params) {{:resources => ['vip__public', 'vip__management']}}
it { is_expected.to contain_lma_collector__collectd__python('pacemaker_resource') \
.with_config({'Resource' => ['"vip__public"', '"vip__management"']}) }
it { is_expected.not_to contain_collectd__plugin('target_notification') }
it { is_expected.not_to contain_collectd__plugin('match_regex') }
it { is_expected.not_to contain_class('collectd::plugin::chain') }
let(:params) {{:resources => {'vip__public' => 'public', 'vip__management' => 'mgmt'}}}
it { is_expected.to contain_lma_collector__collectd__python('collectd_pacemaker') \
.with_config({'Resource' => {'"vip__public"' => '"public"', '"vip__management"' => '"mgmt"'}}) }
end
describe 'with "hostname" param' do
let(:params) {{:resources => ['vip__public', 'vip__management'],
let(:params) {{:resources => {'vip__public' => 'public', 'vip__management' => 'mgmt'},
:hostname => 'foo.example.com'}}
it { is_expected.to contain_lma_collector__collectd__python('pacemaker_resource') \
.with_config({'Resource' => ['"vip__public"', '"vip__management"'],
it { is_expected.to contain_lma_collector__collectd__python('collectd_pacemaker') \
.with_config({'Resource' => {'"vip__public"' => '"public"', '"vip__management"' => '"mgmt"'},
'Hostname' => '"foo.example.com"'}) }
it { is_expected.not_to contain_collectd__plugin('target_notification') }
it { is_expected.not_to contain_collectd__plugin('match_regex') }
it { is_expected.not_to contain_class('collectd::plugin::chain') }
end
describe 'with "master_resource" param' do
describe 'with "notify_resource" param' do
let(:params) do
{:resources => ['vip__public', 'vip__management'],
:master_resource => 'vip__management'}
{:resources => {'vip__public' => 'public', 'vip__management' => 'mgmt'},
:notify_resource => 'vip__management'}
end
it { is_expected.to contain_lma_collector__collectd__python('pacemaker_resource') \
.with_config({'Resource' => ['"vip__public"', '"vip__management"'],}) }
it { is_expected.to contain_collectd__plugin('target_notification') }
it { is_expected.to contain_collectd__plugin('match_regex') }
it { is_expected.to contain_class('collectd::plugin::chain') }
it { is_expected.to contain_lma_collector__collectd__python('collectd_pacemaker') \
.with_config({'Resource' => {'"vip__public"' => '"public"', '"vip__management"' => '"mgmt"'},
"NotifyResource"=>"\"vip__management\""}) }
end
end

View File

@ -1,9 +1,67 @@
.. _pacemaker-metrics:
Resource location
^^^^^^^^^^^^^^^^^
Cluster
^^^^^^^
* ``pacemaker_resource_local_active``, ``1`` when the resource is located on
* ``pacemaker_dc_local_active``, ``1`` when the Designated Controller (DC) is
the local host, if not, then ``0``.
* ``pacemaker_dc`` [#f1]_, ``1`` when the Designated Controller (DC) is
present, if not, then ``0``.
* ``pacemaker_quorum_status`` [#f1]_, ``1`` when the cluster's quorum is
reached, if not, then ``0``.
* ``pacemaker_configured_nodes`` [#f1]_, the number of configured nodes in the
cluster.
* ``pacemaker_configured_resources`` [#f1]_, the number of configured resources
in the cluster.
.. [#f1] this metric is only emitted from the node that is the Designated
Controller (DC) of the Pacemaker cluster.
Node
^^^^
The following metrics are only emitted from the node that is the Designated
Controller (DC) of the Pacemaker cluster. They have a ``status`` field which is
one of 'offline', 'maintenance', or 'online':
* ``pacemaker_node_status``, the status of the node, ``0`` when offline, ``1``
when in maintenance or ``2`` when online.
* ``pacemaker_nodes_count``, the total number of nodes with the given
``status``.
* ``pacemaker_nodes_percent``, the percentage of nodes with the given
``status``.
Resource
^^^^^^^^
* ``pacemaker_resource_local_active``, ``1`` when the resource is located on
the host reporting the metric, if not, then ``0``. The metric contains a
``resource`` field which is one of 'vip__public', 'vip__management',
'vip__vrouter_pub', or 'vip__vrouter'.
* ``pacemaker_resource_failures`` [#f2]_, the total number of failures that
Pacemaker detected for the ``resource``. The counter is reset every time the
collector restarts. The metric contains a ``resource`` field which is one of
'vip__management', 'vip__public', 'vip__vrouter_pub', 'vip__vrouter',
'rabbitmq', 'mysqld' or 'haproxy'.
* ``pacemaker_resource_operations`` [#f2]_, the total number of operations that
Pacemaker applied to the ``resource``. The counter is reset every time the
collector restarts. The metric contains a ``resource`` field which is one of
'vip__management', 'vip__public', 'vip__vrouter_pub', 'vip__vrouter',
'rabbitmq', 'mysqld' or 'haproxy'.
The following metrics have ``resource`` and ``status`` fields.
``status`` is one of 'up' or 'down'.
``resource`` is one of 'vip__management', 'vip__public', 'vip__vrouter_pub',
'vip__vrouter', 'rabbitmq', 'mysqld' or 'haproxy'.
* ``pacemaker_resource_count`` [#f2]_, the total number of instances for the given
``status`` and ``resource``.
* ``pacemaker_resource_percent`` [#f2]_, the percentage of instances for the given
``status`` and ``resource``.
.. [#f2] this metric is only emitted from the node that is the Designated
Controller (DC) of the Pacemaker cluster.