Add Pacemaker collectd plugin
This change adds a collectd plugin that gets metrics from the Pacemaker cluster: - cluster's metrics - node's metrics - resource's metrics Most of the metrics are only collected from the node that is the designated controller except pacemaker_resource_local_active and pacemaker_dc_local_active. The plugin also removes the 'pacemaker_resource' plugin by providing the exact same metrics and notifications for the other collectd plugins. Finally the plugin is also installed on the standalone-rabbitmq and standalone-database nodes if they are present. Change-Id: I8b5b987704f69c6a60b13e8ea982f27924f488d1
This commit is contained in:
parent
79a906d619
commit
3a3ef6f2e3
|
@ -357,6 +357,30 @@ if hiera('lma::collector::influxdb::server', false) {
|
|||
}
|
||||
}
|
||||
|
||||
if ($is_rabbitmq or $is_mysql_server) and ! $is_controller {
|
||||
if $is_mysql_server {
|
||||
$mysql_resource = {
|
||||
'p_mysqld' => 'mysqld',
|
||||
}
|
||||
}
|
||||
else {
|
||||
$mysql_resource = {}
|
||||
}
|
||||
if $is_rabbitmq {
|
||||
$rabbitmq_resource = {
|
||||
'p_rabbitmq-server' => 'rabbitmq',
|
||||
}
|
||||
}
|
||||
else {
|
||||
$rabbitmq_resource = {}
|
||||
}
|
||||
|
||||
class { 'lma_collector::collectd::pacemaker':
|
||||
resources => merge($rabbitmq_resource, $mysql_resource),
|
||||
hostname => $::hostname,
|
||||
}
|
||||
}
|
||||
|
||||
class { 'lma_collector::influxdb':
|
||||
server => hiera('lma::collector::influxdb::server'),
|
||||
port => hiera('lma::collector::influxdb::port'),
|
||||
|
|
|
@ -21,6 +21,7 @@ $network_metadata = hiera_hash('network_metadata')
|
|||
|
||||
$node_profiles = hiera_hash('lma::collector::node_profiles')
|
||||
$is_rabbitmq = $node_profiles['rabbitmq']
|
||||
$is_mysql_server = $node_profiles['mysql']
|
||||
|
||||
$ceilometer = hiera_hash('ceilometer', {})
|
||||
$lma_collector = hiera_hash('lma_collector')
|
||||
|
@ -304,15 +305,34 @@ if hiera('lma::collector::influxdb::server', false) {
|
|||
}
|
||||
|
||||
$pacemaker_master_resource = 'vip__management'
|
||||
# Deal with detach-* plugins
|
||||
if $is_mysql_server {
|
||||
$mysql_resource = {
|
||||
'p_mysqld' => 'mysqld',
|
||||
}
|
||||
}
|
||||
else {
|
||||
$mysql_resource = {}
|
||||
}
|
||||
if $is_rabbitmq {
|
||||
$rabbitmq_resource = {
|
||||
'p_rabbitmq-server' => 'rabbitmq',
|
||||
}
|
||||
}
|
||||
else {
|
||||
$rabbitmq_resource = {}
|
||||
}
|
||||
|
||||
|
||||
class { 'lma_collector::collectd::pacemaker':
|
||||
resources => [
|
||||
'vip__public',
|
||||
'vip__management',
|
||||
'vip__vrouter_pub',
|
||||
'vip__vrouter',
|
||||
],
|
||||
master_resource => $pacemaker_master_resource,
|
||||
resources => merge({
|
||||
'vip__public' => 'vip__public',
|
||||
'vip__management' => 'vip__management',
|
||||
'vip__vrouter_pub' => 'vip__vrouter_pub',
|
||||
'vip__vrouter' => 'vip__vrouter',
|
||||
'p_haproxy' => 'haproxy',
|
||||
}, $mysql_resource, $rabbitmq_resource),
|
||||
notify_resource => $pacemaker_master_resource,
|
||||
hostname => $::fqdn,
|
||||
}
|
||||
|
||||
|
|
|
@ -737,15 +737,14 @@ which uses Pacemaker's `crm_resource` command to get statistics from Pacemaker.
|
|||
##### Parameters
|
||||
|
||||
* `resources`: *Required*. The Pacemaker resources to get statistics for. Valid
|
||||
options: an array of strings.
|
||||
* `master_resource`: *Optional*. If this is set a collectd `PostCache` chain is
|
||||
created to generate a collectd notification each time the Python plugin
|
||||
generates a metric for the Pacemaker resource identified to by
|
||||
`master_resource`. Users of
|
||||
options: a hash of strings.
|
||||
* `notify_resource`: *Optional*. If this is set, the collectd plugin generates
|
||||
a collectd notification reporting the state of the Pacemaker resource
|
||||
identified by `notify_resource`. Users of
|
||||
[`lma_collector::collectd::openstack`](#define-lma_collectorcollectdopenstack),
|
||||
[`lma_collector::collectd::openstack_checks`](#class-lma_collectorcollectdopenstackchecks) and
|
||||
[`lma_collector::collectd::hypervisor`](#class-lma_collectorcollectdhypervisor)
|
||||
with the `pacemaker_resource_master` parameter needs to declare the
|
||||
with the `notify_resource` parameter need to declare the
|
||||
`lma_collector::collectd::pacemaker` class and use that parameter.
|
||||
Valid options: a string. Default: `undef`.
|
||||
* `hostname`: *Optional*. If this is set it will be used to identify the local
|
||||
|
|
|
@ -109,13 +109,14 @@ class Base(object):
|
|||
"""Iterate over the collected metrics
|
||||
|
||||
This class must be implemented by the subclass and should yield dict
|
||||
objects that represent the collected values. Each dict has 3 keys:
|
||||
objects that represent the collected values. Each dict has 6 keys:
|
||||
- 'values', a scalar number or a list of numbers if the type
|
||||
defines several datasources.
|
||||
- 'type_instance' (optional)
|
||||
- 'plugin_instance' (optional)
|
||||
- 'type' (optional, default='gauge')
|
||||
- 'meta' (optional)
|
||||
- 'hostname' (optional)
|
||||
|
||||
For example:
|
||||
|
||||
|
@ -141,6 +142,7 @@ class Base(object):
|
|||
|
||||
v = self.collectd.Values(
|
||||
plugin=self.plugin,
|
||||
host=metric.get('hostname', ''),
|
||||
type=metric.get('type', 'gauge'),
|
||||
plugin_instance=self.plugin_instance,
|
||||
type_instance=type_instance,
|
||||
|
|
|
@ -0,0 +1,306 @@
|
|||
#!/usr/bin/python
|
||||
# Copyright 2016 Mirantis, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import collectd
|
||||
from collections import Counter
|
||||
from collections import defaultdict
|
||||
from sets import Set
|
||||
import socket
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
import collectd_base as base
|
||||
|
||||
NAME = 'pacemaker'
|
||||
CRM_MON_BINARY = '/usr/sbin/crm_mon'
|
||||
|
||||
# Node status
|
||||
OFFLINE_STATUS = 0
|
||||
MAINTENANCE_STATUS = 1
|
||||
ONLINE_STATUS = 2
|
||||
|
||||
|
||||
class CrmMonitorPlugin(base.Base):
    """Collect Pacemaker cluster metrics from the crm_mon command.

    The plugin parses the XML status produced by ``crm_mon --as-xml -r -f``
    and yields cluster, node and resource metrics.  Apart from
    ``resource_local_active`` and ``dc_local_active``, metrics are only
    emitted from the node that is the cluster's Designated Controller (DC).
    """

    def __init__(self, *args, **kwargs):
        super(CrmMonitorPlugin, self).__init__(*args, **kwargs)
        self.plugin = NAME
        self.crm_mon_binary = CRM_MON_BINARY
        # Defaults to the FQDN, may be overridden by the 'Hostname' option
        self.hostname = socket.getfqdn()
        # Resource whose local state is reported as a collectd notification
        self.notify_resource = None
        # Maps Pacemaker resource ids to the names used in the metrics
        self.resources = {}
        # Per-host operation history used to estimate operation counts
        self.history = {}

    def config_callback(self, conf):
        """Read the collectd plugin configuration.

        Supported options: Hostname, CrmMonBinary, Resource (repeatable,
        maps a Pacemaker resource id to a metric name) and NotifyResource.
        """
        super(CrmMonitorPlugin, self).config_callback(conf)

        for node in conf.children:
            if node.key == 'Hostname':
                self.hostname = node.values[0]
            elif node.key == 'CrmMonBinary':
                self.crm_mon_binary = node.values[0]
            elif node.key == 'Resource':
                # 'Resource <id> <name>' maps the Pacemaker id to the metric
                # name; with a single value the id is mapped to itself
                self.resources[node.values[0]] = node.values[-1]
            elif node.key == 'NotifyResource':
                self.notify_resource = node.values[0]

    def itermetrics(self):
        """Yield the metric dicts parsed from the crm_mon XML output.

        Raises base.CheckException when crm_mon fails or returns
        unparseable XML.
        """
        def str_to_bool(v):
            return str(v).lower() == 'true'

        def str_to_boolint(v):
            if str_to_bool(v):
                return 1
            else:
                return 0

        def shorten_hostname(v):
            return v.split('.')[0]

        def same_hostname(v):
            # v is an XML element (or None); returns 1 when its 'name'
            # attribute matches the local hostname
            if v is not None and v.get('name') == self.hostname:
                return 1
            return 0

        out, err = self.execute([self.crm_mon_binary, '--as-xml', '-r', '-f'],
                                shell=False)
        if not out:
            raise base.CheckException(
                "Failed to execute crm_mon '{}'".format(err))

        try:
            root = ET.fromstring(out)
        except ET.ParseError:
            raise base.CheckException(
                "Failed to parse XML '{}'".format(out[:64]))

        # NOTE: an ElementTree element with no children evaluates to False
        # even when find() succeeded, so all find() results below must be
        # compared against None explicitly.

        if self.notify_resource:
            # Notify the other collectd plugins whether the resource runs
            # locally or not
            node = root.find('resources/resource[@id="{}"]/node'.format(
                self.notify_resource))
            self.collectd.Notification(
                type='gauge',
                message='{{"resource":"{}","value":{}}}'.format(
                    self.notify_resource, same_hostname(node)),
                severity=self.collectd.NOTIF_OKAY
            ).dispatch()
            # The metric needs to be emitted too for the Lua plugins executed
            # by the metric_collector service
            yield {
                'type_instance': 'resource_local_active',
                'values': same_hostname(node),
                'meta': {'resource': self.notify_resource}
            }

        summary = root.find('summary')
        current_dc = summary.find('current_dc')
        # The metric needs to be emitted for the alarms that leverage the
        # other metrics emitted by the plugin
        yield {
            'type_instance': 'dc_local_active',
            'values': same_hostname(current_dc),
        }

        if current_dc is None or current_dc.get('name') != self.hostname:
            # The other metrics are only collected from the cluster's DC
            return

        # Report global cluster metrics
        yield {
            'type_instance': 'dc',
            'values': str_to_boolint(current_dc.get('present', 'false'))
        }

        yield {
            'type_instance': 'quorum_status',
            'values': str_to_boolint(current_dc.get('with_quorum', 'false'))
        }
        yield {
            'type_instance': 'configured_nodes',
            'values': int(summary.find('nodes_configured').get('number'))
        }
        yield {
            'type_instance': 'configured_resources',
            'values': int(summary.find('resources_configured').get('number'))
        }

        # Report node status metrics
        cluster_nodes = []
        aggregated_nodes_status = {'online': 0, 'offline': 0, 'maintenance': 0}
        nodes_total = 0
        for node in root.find('nodes').iter('node'):
            nodes_total += 1
            hostname = shorten_hostname(node.get('name'))
            cluster_nodes.append(node.get('name'))
            if str_to_bool(node.get('online')):
                if str_to_bool(node.get('maintenance')):
                    aggregated_nodes_status['maintenance'] += 1
                    yield {
                        'type_instance': 'node_status',
                        'values': MAINTENANCE_STATUS,
                        'hostname': hostname,
                        'meta': {'status': 'maintenance'}
                    }
                else:
                    aggregated_nodes_status['online'] += 1
                    yield {
                        'type_instance': 'node_status',
                        'values': ONLINE_STATUS,
                        'hostname': hostname,
                        'meta': {'status': 'online'}
                    }
            else:
                aggregated_nodes_status['offline'] += 1
                yield {
                    'type_instance': 'node_status',
                    'values': OFFLINE_STATUS,
                    'hostname': hostname,
                    'meta': {'status': 'offline'}
                }

        for status, cnt in aggregated_nodes_status.items():
            yield {
                'type_instance': 'nodes_count',
                'values': cnt,
                'meta': {'status': status}
            }
            # Guard against a pathological empty <nodes> element to avoid
            # a ZeroDivisionError
            if nodes_total > 0:
                yield {
                    'type_instance': 'nodes_percent',
                    'values': 100.0 * cnt / nodes_total,
                    'meta': {'status': status}
                }

        # Report the number of resources per status
        # Clone resources can run on multiple nodes while "simple" resources
        # run on only one node at the same time
        aggregated_resources = defaultdict(Counter)
        resources = root.find('resources')
        for resource_id, resource_name in self.resources.iteritems():
            resource_elts = []
            simple_resource = None
            clone_resource = resources.find(
                'clone/resource[@id="{}"]/..'.format(resource_id))
            if clone_resource is None:
                simple_resource = resources.find('resource[@id="{}"]'.format(
                    resource_id))
                if simple_resource is not None:
                    resource_elts = [simple_resource]
            else:
                resource_elts = clone_resource.findall('resource')

            if not resource_elts:
                self.logger.error("{}: Couldn't find resource '{}'".format(
                    self.plugin, resource_id))
                continue

            total = 0
            for item in resource_elts:
                total += 1
                if (item.get('role') in ('Slave', 'Master') and
                        not str_to_bool(item.get('failed'))):
                    # Multi-master resource
                    aggregated_resources[resource_name]['up'] += 1
                elif item.get('role') == 'Started':
                    aggregated_resources[resource_name]['up'] += 1
                else:
                    aggregated_resources[resource_name]['down'] += 1

            if simple_resource is not None:
                # Report on which node the "simple" resource is running; a
                # stopped resource has no <node> child, in which case no
                # node matches
                running_on = simple_resource.find('node')
                active_node = (running_on.get('name')
                               if running_on is not None else None)
                for node in cluster_nodes:
                    yield {
                        'type_instance': 'local_resource_active',
                        'values': str_to_boolint(node == active_node),
                        'hostname': shorten_hostname(node),
                        'meta': {'resource': resource_name}
                    }

            for status in ('up', 'down'):
                cnt = aggregated_resources[resource_name][status]
                yield {
                    'type_instance': 'resource_count',
                    'values': cnt,
                    'meta': {'status': status, 'resource': resource_name}
                }
                yield {
                    'type_instance': 'resource_percent',
                    'values': 100.0 * cnt / total,
                    'meta': {'status': status, 'resource': resource_name}
                }

        # Collect operations' history metrics for the monitored resources
        #
        # The reported count for the resource's operations is an approximate
        # value because crm_mon doesn't provide the exact number. To estimate
        # the number of operations applied to a resource, the plugin keeps a
        # copy of call_ids and compares it with the current value.
        node_history = root.find('node_history')
        if node_history is None:
            return
        for node in node_history.iter('node'):
            hostname = shorten_hostname(node.get('name'))
            if hostname not in self.history:
                self.history[hostname] = {}

            for resource_id, resource_name in self.resources.iteritems():
                if resource_id not in self.history[hostname]:
                    self.history[hostname][resource_id] = {
                        'fail_count': 0,
                        'ops_count': 0,
                        # builtin set() replaces the long-deprecated
                        # sets.Set
                        'call_ids': set()
                    }
                v = self.history[hostname][resource_id]

                res_history = node.find('resource_history[@id="{}"]'.format(
                    resource_id))
                if res_history is not None:
                    # For simple resources, the resource_history element only
                    # exists for the node that runs the resource
                    v['fail_count'] += int(res_history.get('fail-count', 0))
                    call_ids = set([
                        i.get('call') for i in res_history.findall(
                            'operation_history')])
                    if call_ids:
                        v['ops_count'] += len(call_ids - v['call_ids'])
                        v['call_ids'] = call_ids

                yield {
                    'type_instance': 'resource_failures',
                    'values': v['fail_count'],
                    'hostname': hostname,
                    'meta': {'resource': resource_name}
                }
                yield {
                    'type_instance': 'resource_operations',
                    'values': v['ops_count'],
                    'hostname': hostname,
                    'meta': {'resource': resource_name}
                }
|
||||
|
||||
|
||||
plugin = CrmMonitorPlugin(collectd)


def init_callback():
    """Called once by collectd at startup."""
    plugin.restore_sigchld()


def config_callback(conf):
    """Forward the collectd configuration to the plugin."""
    plugin.config_callback(conf)


def read_callback():
    """Called by collectd at every collection interval."""
    plugin.read_callback()

# Bug fix: init_callback was defined but never registered, so
# restore_sigchld() was never invoked (the replaced pacemaker_resource
# plugin did register it).
collectd.register_init(init_callback)
collectd.register_config(config_callback)
collectd.register_read(read_callback)
|
|
@ -1,80 +0,0 @@
|
|||
#!/usr/bin/python
|
||||
# Copyright 2015 Mirantis, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import collectd
|
||||
import socket
|
||||
|
||||
import collectd_base as base
|
||||
|
||||
NAME = 'pacemaker_resource'
|
||||
CRM_RESOURCE_BIN = '/usr/sbin/crm_resource'
|
||||
|
||||
|
||||
class PacemakerResourcePlugin(base.Base):
    """Report whether Pacemaker resources are located on the local host.

    For every configured resource the plugin runs ``crm_resource --locate``
    and emits ``1`` when the resource runs on this host, ``0`` otherwise.
    """

    def __init__(self, *args, **kwargs):
        super(PacemakerResourcePlugin, self).__init__(*args, **kwargs)
        self.plugin = NAME
        self.crm_resource_bin = CRM_RESOURCE_BIN
        # Defaults to the FQDN, may be overridden by the 'Hostname' option
        self.hostname = socket.getfqdn()
        self.resources = []

    def config_callback(self, conf):
        """Read the collectd plugin configuration.

        Supported options: Resource (repeatable), Hostname and
        CrmResourceBin.
        """
        super(PacemakerResourcePlugin, self).config_callback(conf)

        for node in conf.children:
            if node.key == 'Resource':
                self.resources.extend(node.values)
            elif node.key == 'Hostname':
                self.hostname = node.values[0]
            elif node.key == 'CrmResourceBin':
                self.crm_resource_bin = node.values[0]

    def itermetrics(self):
        """Yield one metric per configured resource.

        Raises base.CheckException when crm_resource returns no output.
        """
        for resource in self.resources:
            out, err = self.execute([self.crm_resource_bin, '--locate',
                                     '--quiet', '--resource', resource],
                                    shell=False)
            if not out:
                # Bug fix: the original mixed a '%s' placeholder with
                # str.format(), so the resource name was never interpolated
                # into the message
                msg = "{}: Failed to get the status for '{}'".format(
                    self.plugin, resource)
                raise base.CheckException(msg)

            else:
                value = 0
                # strip() removes the trailing newline printed by
                # crm_resource; the original lstrip('\n') only removed
                # leading newlines so the comparison could never match
                # (assumes execute() doesn't strip already — TODO confirm)
                if self.hostname == out.strip():
                    value = 1
                yield {
                    'type_instance': resource,
                    'values': value
                }
|
||||
|
||||
# Instantiate the plugin; 'pacemaker' is passed as a second constructor
# argument (presumably consumed by base.Base — see collectd_base)
plugin = PacemakerResourcePlugin(collectd, 'pacemaker')


def init_callback():
    """Called once by collectd at startup."""
    plugin.restore_sigchld()


def config_callback(conf):
    """Forward the collectd configuration to the plugin."""
    plugin.config_callback(conf)


def read_callback():
    """Called by collectd at every collection interval."""
    plugin.read_callback()

# Hook the callbacks into the collectd daemon
collectd.register_init(init_callback)
collectd.register_config(config_callback)
collectd.register_read(read_callback)
|
|
@ -306,10 +306,20 @@ function process_message ()
|
|||
msg['Fields'][additional_tag] = sample['type_instance']
|
||||
end
|
||||
end
|
||||
elseif metric_source == 'pacemaker_resource' then
|
||||
msg['Fields']['name'] = 'pacemaker_local_resource_active'
|
||||
msg['Fields']['tag_fields'] = { 'resource' }
|
||||
msg['Fields']['resource'] = sample['type_instance']
|
||||
elseif metric_source == 'pacemaker' then
|
||||
msg['Fields']['name'] = metric_source .. sep .. sample['type_instance']
|
||||
|
||||
-- add dimension fields
|
||||
local t = {}
|
||||
for _, v in ipairs({'status', 'resource'}) do
|
||||
if sample['meta'] and sample['meta'][v] then
|
||||
t[#t+1] = v
|
||||
msg['Fields'][v] = sample['meta'][v]
|
||||
end
|
||||
end
|
||||
if #t > 0 then
|
||||
msg['Fields']['tag_fields'] = t
|
||||
end
|
||||
elseif metric_source == 'users' then
|
||||
-- 'users' is a reserved name for InfluxDB v0.9
|
||||
msg['Fields']['name'] = 'logged_users'
|
||||
|
|
|
@ -15,71 +15,40 @@
|
|||
|
||||
class lma_collector::collectd::pacemaker (
|
||||
$resources,
|
||||
$master_resource = undef,
|
||||
$notify_resource = undef,
|
||||
$hostname = undef,
|
||||
) {
|
||||
|
||||
validate_array($resources)
|
||||
validate_hash($resources)
|
||||
|
||||
# Add quotes around the array values
|
||||
$real_resources = suffix(prefix($resources, '"'), '"')
|
||||
# Add quotes around the hash keys and values
|
||||
$resources_keys = suffix(prefix(keys($resources), '"'), '"')
|
||||
$resources_values = suffix(prefix(values($resources), '"'), '"')
|
||||
$real_resources = hash(flatten(zip($resources_keys, $resources_values)))
|
||||
|
||||
if $hostname {
|
||||
$config = {
|
||||
'Resource' => $real_resources,
|
||||
'Hostname' => "\"${hostname}\"",
|
||||
}
|
||||
$_hostname = {'Hostname' => "\"${hostname}\""}
|
||||
} else {
|
||||
$config = {
|
||||
'Resource' => $real_resources,
|
||||
}
|
||||
$_hostname = {}
|
||||
}
|
||||
if $notify_resource {
|
||||
$_notify_resource = {'NotifyResource' => "\"${notify_resource}\""}
|
||||
} else {
|
||||
$_notify_resource = {}
|
||||
}
|
||||
|
||||
lma_collector::collectd::python { 'pacemaker_resource':
|
||||
config => $config
|
||||
lma_collector::collectd::python { 'collectd_pacemaker':
|
||||
config => merge({'Resource' => $real_resources}, $_hostname, $_notify_resource)
|
||||
}
|
||||
|
||||
if $master_resource {
|
||||
|
||||
if ! member($resources, $master_resource) {
|
||||
fail("${master_resource} not a member of ${resources}")
|
||||
}
|
||||
|
||||
# Configure a PostCache chain to create a collectd notification each time
|
||||
# the pacemaker_resource plugin generates a metric whose "type instance"
|
||||
# matches the resource specified by the $master_resource parameter.
|
||||
#
|
||||
# The notifications are caught by other plugins to know the state of that
|
||||
# Pacemaker resource.
|
||||
|
||||
collectd::plugin { 'target_notification': }
|
||||
collectd::plugin { 'match_regex': }
|
||||
|
||||
class { 'collectd::plugin::chain':
|
||||
chainname => 'PostCache',
|
||||
defaulttarget => 'write',
|
||||
rules => [
|
||||
{
|
||||
'match' => {
|
||||
'type' => 'regex',
|
||||
'matches' => {
|
||||
'Plugin' => '^pacemaker_resource$',
|
||||
'TypeInstance' => "^${master_resource}$",
|
||||
},
|
||||
},
|
||||
'targets' => [
|
||||
{
|
||||
'type' => 'notification',
|
||||
'attributes' => {
|
||||
'Message' => '{\"resource\":\"%{type_instance}\",\"value\":%{ds:value}}',
|
||||
'Severity' => 'OKAY',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
# Remove configuration bits from versions < 1.0
|
||||
collectd::plugin { 'target_notification':
|
||||
ensure => absent
|
||||
}
|
||||
collectd::plugin { 'match_regex':
|
||||
ensure => absent
|
||||
}
|
||||
class { 'collectd::plugin::chain':
|
||||
ensure => absent
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,34 +20,26 @@ describe 'lma_collector::collectd::pacemaker' do
|
|||
end
|
||||
|
||||
describe 'with "resources" param' do
|
||||
let(:params) {{:resources => ['vip__public', 'vip__management']}}
|
||||
it { is_expected.to contain_lma_collector__collectd__python('pacemaker_resource') \
|
||||
.with_config({'Resource' => ['"vip__public"', '"vip__management"']}) }
|
||||
it { is_expected.not_to contain_collectd__plugin('target_notification') }
|
||||
it { is_expected.not_to contain_collectd__plugin('match_regex') }
|
||||
it { is_expected.not_to contain_class('collectd::plugin::chain') }
|
||||
let(:params) {{:resources => {'vip__public' => 'public', 'vip__management' => 'mgmt'}}}
|
||||
it { is_expected.to contain_lma_collector__collectd__python('collectd_pacemaker') \
|
||||
.with_config({'Resource' => {'"vip__public"' => '"public"', '"vip__management"' => '"mgmt"'}}) }
|
||||
end
|
||||
|
||||
describe 'with "hostname" param' do
|
||||
let(:params) {{:resources => ['vip__public', 'vip__management'],
|
||||
let(:params) {{:resources => {'vip__public' => 'public', 'vip__management' => 'mgmt'},
|
||||
:hostname => 'foo.example.com'}}
|
||||
it { is_expected.to contain_lma_collector__collectd__python('pacemaker_resource') \
|
||||
.with_config({'Resource' => ['"vip__public"', '"vip__management"'],
|
||||
it { is_expected.to contain_lma_collector__collectd__python('collectd_pacemaker') \
|
||||
.with_config({'Resource' => {'"vip__public"' => '"public"', '"vip__management"' => '"mgmt"'},
|
||||
'Hostname' => '"foo.example.com"'}) }
|
||||
it { is_expected.not_to contain_collectd__plugin('target_notification') }
|
||||
it { is_expected.not_to contain_collectd__plugin('match_regex') }
|
||||
it { is_expected.not_to contain_class('collectd::plugin::chain') }
|
||||
end
|
||||
|
||||
describe 'with "master_resource" param' do
|
||||
describe 'with "notify_resource" param' do
|
||||
let(:params) do
|
||||
{:resources => ['vip__public', 'vip__management'],
|
||||
:master_resource => 'vip__management'}
|
||||
{:resources => {'vip__public' => 'public', 'vip__management' => 'mgmt'},
|
||||
:notify_resource => 'vip__management'}
|
||||
end
|
||||
it { is_expected.to contain_lma_collector__collectd__python('pacemaker_resource') \
|
||||
.with_config({'Resource' => ['"vip__public"', '"vip__management"'],}) }
|
||||
it { is_expected.to contain_collectd__plugin('target_notification') }
|
||||
it { is_expected.to contain_collectd__plugin('match_regex') }
|
||||
it { is_expected.to contain_class('collectd::plugin::chain') }
|
||||
it { is_expected.to contain_lma_collector__collectd__python('collectd_pacemaker') \
|
||||
.with_config({'Resource' => {'"vip__public"' => '"public"', '"vip__management"' => '"mgmt"'},
|
||||
"NotifyResource"=>"\"vip__management\""}) }
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,9 +1,67 @@
|
|||
.. _pacemaker-metrics:
|
||||
|
||||
Resource location
|
||||
^^^^^^^^^^^^^^^^^
|
||||
Cluster
|
||||
^^^^^^^
|
||||
|
||||
* ``pacemaker_resource_local_active``, ``1`` when the resource is located on
|
||||
* ``pacemaker_dc_local_active``, ``1`` when the Designated Controller (DC) is
|
||||
the local host, if not, then ``0``.
|
||||
|
||||
* ``pacemaker_dc`` [#f1]_, ``1`` when the Designated Controller (DC) is
|
||||
present, if not, then ``0``.
|
||||
* ``pacemaker_quorum_status`` [#f1]_, ``1`` when the cluster's quorum is
|
||||
reached, if not, then ``0``.
|
||||
* ``pacemaker_configured_nodes`` [#f1]_, the number of configured nodes in the
|
||||
cluster.
|
||||
* ``pacemaker_configured_resources`` [#f1]_, the number of configured resources
in the cluster.
|
||||
|
||||
.. [#f1] this metric is only emitted from the node that is the Designated
|
||||
Controller (DC) of the Pacemaker cluster.
|
||||
|
||||
Node
|
||||
^^^^
|
||||
The following metrics are only emitted from the node that is the Designated
|
||||
Controller (DC) of the Pacemaker cluster. They have a ``status`` field which is
|
||||
one of 'offline', 'maintenance', or 'online':
|
||||
|
||||
* ``pacemaker_node_status``, the status of the node, ``0`` when offline, ``1``
|
||||
when in maintenance or ``2`` when online.
|
||||
* ``pacemaker_node_count``, the total number of nodes with the given
|
||||
``status``.
|
||||
* ``pacemaker_node_percent``, the percentage of nodes with the given
|
||||
``status``.
|
||||
|
||||
Resource
|
||||
^^^^^^^^
|
||||
|
||||
* ``pacemaker_resource_local_active``, ``1`` when the resource is located on
|
||||
the host reporting the metric, if not, then ``0``. The metric contains a
|
||||
``resource`` field which is one of 'vip__public', 'vip__management',
|
||||
'vip__vrouter_pub', or 'vip__vrouter'.
|
||||
|
||||
* ``pacemaker_resource_failures`` [#f2]_, the total number of failures that
|
||||
Pacemaker detected for the ``resource``. The counter is reset every time the
|
||||
collector restarts. The metric contains a ``resource`` field which is one of
|
||||
'vip__management', 'vip__public', 'vip__vrouter_pub', 'vip__vrouter',
|
||||
'rabbitmq', 'mysqld' or 'haproxy'.
|
||||
|
||||
* ``pacemaker_resource_operations`` [#f2]_, the total number of operations that
|
||||
Pacemaker applied to the ``resource``. The counter is reset every time the
|
||||
collector restarts. The metric contains a ``resource`` field which is one of
|
||||
'vip__management', 'vip__public', 'vip__vrouter_pub', 'vip__vrouter',
|
||||
'rabbitmq', 'mysqld' or 'haproxy'.
|
||||
|
||||
The following metrics have ``resource`` and ``status`` fields.
|
||||
|
||||
``status`` is one of 'offline', 'maintenance', or 'online'.
|
||||
|
||||
``resource`` is one of 'vip__management', 'vip__public', 'vip__vrouter_pub',
|
||||
'vip__vrouter', 'rabbitmq', 'mysqld' or 'haproxy'.
|
||||
|
||||
* ``pacemaker_resource_count`` [#f2]_, the total number of instances for the given
|
||||
``status`` and ``resource``.
|
||||
* ``pacemaker_resource_percent`` [#f2]_, the percentage of instances for the given
|
||||
``status`` and ``resource``.
|
||||
|
||||
.. [#f2] this metric is only emitted from the node that is the Designated
|
||||
Controller (DC) of the Pacemaker cluster.
|
||||
|
|
Loading…
Reference in New Issue