304 lines
11 KiB
Python
304 lines
11 KiB
Python
#!/usr/bin/python
|
|
# Copyright 2016 Mirantis, Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import collectd
|
|
from collections import Counter
|
|
from collections import defaultdict
|
|
from sets import Set
|
|
import socket
|
|
import xml.etree.ElementTree as ET
|
|
|
|
import collectd_base as base
|
|
|
|
# Value assigned to the plugin's ``plugin`` attribute.
NAME = 'pacemaker'
# Default crm_mon executable; the 'CrmMonBinary' configuration key
# overrides it.
CRM_MON_BINARY = '/usr/sbin/crm_mon'

# Values reported by the 'node_status' metric
OFFLINE_STATUS, MAINTENANCE_STATUS, ONLINE_STATUS = 0, 1, 2
|
|
|
|
|
|
class CrmMonitorPlugin(base.Base):
    """Collectd plugin that turns the XML output of ``crm_mon`` into metrics.

    Each read runs ``crm_mon --as-xml -r -f`` and reports:

    * whether the resource named by ``NotifyResource`` and the cluster's DC
      are located on the local host (always);
    * cluster, node, resource and operation history metrics (only when the
      local host is the cluster's DC).
    """

    def __init__(self, *args, **kwargs):
        super(CrmMonitorPlugin, self).__init__(*args, **kwargs)
        self.plugin = NAME
        self.crm_mon_binary = CRM_MON_BINARY
        # Name compared with the node names found in the crm_mon output.
        self.hostname = socket.getfqdn()
        # Id of the resource whose local activity is notified, if any.
        self.notify_resource = None
        # Maps a Pacemaker resource id to the name used in the metrics.
        self.resources = {}
        # Running totals kept between reads:
        # {short hostname: {resource id: {'fail_count', 'ops_count',
        # 'call_ids'}}}
        self.history = {}

    def config_callback(self, conf):
        """Read the plugin settings from the collectd configuration.

        Recognized keys:

        * ``Hostname``: name to compare with the crm_mon node names.
        * ``CrmMonBinary``: path of the crm_mon executable.
        * ``Resource``: first value is the resource id, last value is the
          name reported in the metrics (may be given several times).
        * ``NotifyResource``: id of the resource to send notifications for.
        """
        super(CrmMonitorPlugin, self).config_callback(conf)

        for node in conf.children:
            if node.key == 'Hostname':
                self.hostname = node.values[0]
            elif node.key == 'CrmMonBinary':
                self.crm_mon_binary = node.values[0]
            elif node.key == 'Resource':
                self.resources[node.values[0]] = node.values[-1]
            elif node.key == 'NotifyResource':
                self.notify_resource = node.values[0]

    @staticmethod
    def _str_to_bool(value):
        """Return True when ``value`` reads 'true' (case-insensitive)."""
        return str(value).lower() == 'true'

    @staticmethod
    def _shorten_hostname(name):
        """Return the part of ``name`` that precedes the first dot."""
        return name.split('.')[0]

    def _same_hostname(self, element):
        """Return 1 if ``element`` exists and is named like this host."""
        if element is not None and element.get('name') == self.hostname:
            return 1
        return 0

    def _read_cluster_status(self):
        """Run crm_mon and return the root element of its XML output.

        Raises base.CheckException if the command exits with a non-zero
        code or if its output isn't well-formed XML.
        """
        retcode, out, err = self.execute(
            [self.crm_mon_binary, '--as-xml', '-r', '-f'], shell=False)
        if retcode != 0:
            raise base.CheckException(
                "Failed to execute crm_mon '{}'".format(err))

        try:
            return ET.fromstring(out)
        except ET.ParseError:
            raise base.CheckException(
                "Failed to parse XML '{}'".format(out[:64]))

    def itermetrics(self):
        """Yield the metrics computed from the current cluster status.

        Raises base.CheckException when crm_mon fails or returns output
        that can't be parsed.
        """
        root = self._read_cluster_status()
        short_hostname = self._shorten_hostname(self.hostname)

        if self.notify_resource:
            # Notify the other collectd plugins whether the resource runs
            # locally or not
            node = root.find('resources/resource[@id="{}"]/node'.format(
                self.notify_resource))
            is_local = self._same_hostname(node)
            self.collectd.Notification(
                type='gauge',
                message='{{"resource":"{}","value":{}}}'.format(
                    self.notify_resource, is_local),
                severity=self.collectd.NOTIF_OKAY
            ).dispatch()
            # The metric needs to be emitted too for the Lua plugins
            # executed by the metric_collector service
            yield {
                'type_instance': 'local_resource_active',
                'values': is_local,
                'meta': {'resource': self.notify_resource,
                         'host': short_hostname}
            }

        summary = root.find('summary')
        current_dc = None
        if summary is not None:
            current_dc = summary.find('current_dc')
        # The metric needs to be emitted for the alarms that leverage the
        # other metrics emitted by the plugin
        yield {
            'type_instance': 'local_dc_active',
            'values': self._same_hostname(current_dc),
            'meta': {'host': short_hostname}
        }

        if current_dc is None or current_dc.get('name') != self.hostname:
            # The other metrics are only collected from the cluster's DC.
            # A missing current_dc element means this host can't be the DC.
            return

        # Report global cluster metrics
        yield {
            'type_instance': 'dc',
            'values': int(self._str_to_bool(
                current_dc.get('present', 'false')))
        }
        yield {
            'type_instance': 'quorum_status',
            'values': int(self._str_to_bool(
                current_dc.get('with_quorum', 'false')))
        }
        yield {
            'type_instance': 'configured_nodes',
            'values': int(summary.find('nodes_configured').get('number'))
        }
        yield {
            'type_instance': 'configured_resources',
            'values': int(
                summary.find('resources_configured').get('number'))
        }

        nodes = root.find('nodes')
        node_elts = [] if nodes is None else list(nodes.iter('node'))
        for metric in self._iter_node_metrics(node_elts):
            yield metric

        cluster_nodes = [elt.get('name') for elt in node_elts]
        for metric in self._iter_resource_metrics(root, cluster_nodes):
            yield metric

        for metric in self._iter_history_metrics(root):
            yield metric

    def _iter_node_metrics(self, node_elts):
        """Yield the status of every node and the per-status aggregates."""
        counts = {'online': 0, 'offline': 0, 'maintenance': 0}
        for node in node_elts:
            if not self._str_to_bool(node.get('online')):
                status, value = 'offline', OFFLINE_STATUS
            elif self._str_to_bool(node.get('maintenance')):
                status, value = 'maintenance', MAINTENANCE_STATUS
            else:
                status, value = 'online', ONLINE_STATUS
            counts[status] += 1
            yield {
                'type_instance': 'node_status',
                'values': value,
                'meta': {'status': status,
                         'host': self._shorten_hostname(node.get('name'))}
            }

        nodes_total = len(node_elts)
        for status, cnt in counts.items():
            yield {
                'type_instance': 'nodes_count',
                'values': cnt,
                'meta': {'status': status}
            }
            # A percentage is undefined when no node is listed
            if nodes_total:
                yield {
                    'type_instance': 'nodes_percent',
                    'values': 100.0 * cnt / nodes_total,
                    'meta': {'status': status}
                }

    def _iter_resource_metrics(self, root, cluster_nodes):
        """Yield the up/down counters of the monitored resources.

        Clone resources can run on multiple nodes while "simple" resources
        run on only one node at a time. For the latter, one
        'local_resource_active' metric is also emitted per cluster node.
        """
        aggregated_resources = defaultdict(Counter)
        resources = root.find('resources')
        for resource_id, resource_name in self.resources.items():
            simple_resource = None
            resource_elts = []
            if resources is not None:
                # Elements are compared with None explicitly because an
                # element without children (eg a stopped resource)
                # evaluates to False.
                clone_resource = resources.find(
                    'clone/resource[@id="{}"]/..'.format(resource_id))
                if clone_resource is not None:
                    resource_elts = clone_resource.findall('resource')
                else:
                    simple_resource = resources.find(
                        'resource[@id="{}"]'.format(resource_id))
                    if simple_resource is not None:
                        resource_elts = [simple_resource]

            if not resource_elts:
                self.logger.error("{}: Couldn't find resource '{}'".format(
                    self.plugin, resource_id))
                continue

            counters = aggregated_resources[resource_name]
            for item in resource_elts:
                role = item.get('role')
                # 'Slave' and 'Master' are the roles of multi-master
                # resources
                if role == 'Started' or (
                        role in ('Slave', 'Master') and
                        not self._str_to_bool(item.get('failed'))):
                    counters['up'] += 1
                else:
                    counters['down'] += 1

            if simple_resource is not None:
                # Report on which node the "simple" resource is running;
                # a resource that isn't running has no node element.
                running_on = simple_resource.find('node')
                active_node = None
                if running_on is not None:
                    active_node = running_on.get('name')
                for node in cluster_nodes:
                    yield {
                        'type_instance': 'local_resource_active',
                        'values': int(node == active_node),
                        'meta': {'resource': resource_name,
                                 'host': self._shorten_hostname(node)}
                    }

            total = len(resource_elts)
            for status in ('up', 'down'):
                cnt = counters[status]
                yield {
                    'type_instance': 'resource_count',
                    'values': cnt,
                    'meta': {'status': status, 'resource': resource_name}
                }
                yield {
                    'type_instance': 'resource_percent',
                    'values': 100.0 * cnt / total,
                    'meta': {'status': status, 'resource': resource_name}
                }

    def _iter_history_metrics(self, root):
        """Yield the failure and operation counters per node and resource.

        The reported count for the resource's operations is an approximate
        value because crm_mon doesn't provide the exact number. To estimate
        the number of operations applied to a resource, the plugin keeps a
        copy of call_ids and compares it with the current value.
        """
        node_history = root.find('node_history')
        if node_history is None:
            return

        for node in node_history.iter('node'):
            hostname = self._shorten_hostname(node.get('name'))
            host_history = self.history.setdefault(hostname, {})

            for resource_id, resource_name in self.resources.items():
                if resource_id not in host_history:
                    host_history[resource_id] = {
                        'fail_count': 0,
                        'ops_count': 0,
                        'call_ids': set()
                    }
                v = host_history[resource_id]

                res_history = node.find('resource_history[@id="{}"]'.format(
                    resource_id))
                if res_history is not None:
                    # For simple resources, the resource_history element
                    # only exists for the node that runs the resource
                    #
                    # NOTE(review): fail-count is added on every read, so
                    # the reported value keeps growing for as long as
                    # crm_mon reports a non-zero fail-count. Confirm that
                    # this is the intended semantics.
                    v['fail_count'] += int(res_history.get('fail-count', 0))
                    call_ids = set(
                        op.get('call')
                        for op in res_history.findall('operation_history'))
                    if call_ids:
                        v['ops_count'] += len(call_ids - v['call_ids'])
                        v['call_ids'] = call_ids

                yield {
                    'type_instance': 'resource_failures',
                    'values': v['fail_count'],
                    'meta': {'resource': resource_name, 'host': hostname}
                }
                yield {
                    'type_instance': 'resource_operations',
                    'values': v['ops_count'],
                    'meta': {'resource': resource_name, 'host': hostname}
                }
|
|
|
|
|
|
# Single plugin instance used by the module-level collectd callbacks.
plugin = CrmMonitorPlugin(collectd)
|
|
|
|
|
|
def init_callback():
    """Delegate to the plugin's ``restore_sigchld()`` method.

    NOTE(review): presumably required so that the external crm_mon command
    can be waited on from within collectd -- confirm in collectd_base.
    """
    plugin.restore_sigchld()
|
|
|
|
|
|
def config_callback(conf):
    """Hand the collectd configuration object ``conf`` over to the plugin."""
    plugin.config_callback(conf)
|
|
|
|
|
|
def read_callback():
    """Delegate a collectd read to the plugin's ``read_callback()``."""
    plugin.read_callback()
|
|
|
|
# init_callback() was defined but never registered, hence
# plugin.restore_sigchld() was never invoked although the plugin executes
# an external command on every read.
collectd.register_init(init_callback)
collectd.register_config(config_callback)
collectd.register_read(read_callback)
|