fuel-plugin-lma-collector/deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_pacemaker.py

304 lines
11 KiB
Python

#!/usr/bin/python
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collectd
from collections import Counter
from collections import defaultdict
from sets import Set
import socket
import xml.etree.ElementTree as ET
import collectd_base as base
NAME = 'pacemaker'
CRM_MON_BINARY = '/usr/sbin/crm_mon'
# Node status
OFFLINE_STATUS = 0
MAINTENANCE_STATUS = 1
ONLINE_STATUS = 2
class CrmMonitorPlugin(base.Base):
def __init__(self, *args, **kwargs):
super(CrmMonitorPlugin, self).__init__(*args, **kwargs)
self.plugin = NAME
self.crm_mon_binary = CRM_MON_BINARY
self.hostname = socket.getfqdn()
self.notify_resource = None
self.resources = {}
self.history = {}
def config_callback(self, conf):
super(CrmMonitorPlugin, self).config_callback(conf)
for node in conf.children:
if node.key == 'Hostname':
self.hostname = node.values[0]
elif node.key == 'CrmMonBinary':
self.crm_mon_binary = node.values[0]
elif node.key == 'Resource':
self.resources[node.values[0]] = node.values[-1]
elif node.key == 'NotifyResource':
self.notify_resource = node.values[0]
def itermetrics(self):
def str_to_bool(v):
return str(v).lower() == 'true'
def str_to_boolint(v):
if str_to_bool(v):
return 1
else:
return 0
def shorten_hostname(v):
return v.split('.')[0]
def same_hostname(v):
if v is not None and v.get('name') == self.hostname:
return 1
return 0
retcode, out, err = self.execute(
[self.crm_mon_binary, '--as-xml', '-r', '-f'], shell=False)
if retcode != 0:
raise base.CheckException(
"Failed to execute crm_mon '{}'".format(err))
try:
root = ET.fromstring(out)
except ET.ParseError:
raise base.CheckException(
"Failed to parse XML '{}'".format(out[:64]))
if self.notify_resource:
# Notify the other collectd plugins whether the resource runs
# locally or not
node = root.find('resources/resource[@id="{}"]/node'.format(
self.notify_resource))
self.collectd.Notification(
type='gauge',
message='{{"resource":"{}","value":{}}}'.format(
self.notify_resource, same_hostname(node)),
severity=self.collectd.NOTIF_OKAY
).dispatch()
# The metric needs to be emitted too for the Lua plugins executed
# by the metric_collector service
yield {
'type_instance': 'local_resource_active',
'values': same_hostname(node),
'meta': {'resource': self.notify_resource,
'host': shorten_hostname(self.hostname)}
}
summary = root.find('summary')
current_dc = summary.find('current_dc')
# The metric needs to be emitted for the alarms that leverage the other
# metrics emitted by the plugin
yield {
'type_instance': 'local_dc_active',
'values': same_hostname(current_dc),
'meta': {'host': shorten_hostname(self.hostname)}
}
if current_dc.get('name') != self.hostname:
# The other metrics are only collected from the cluster's DC
return
# Report global cluster metrics
yield {
'type_instance': 'dc',
'values': str_to_boolint(current_dc.get('present', 'false'))
}
yield {
'type_instance': 'quorum_status',
'values': str_to_boolint(current_dc.get('with_quorum', 'false'))
}
yield {
'type_instance': 'configured_nodes',
'values': int(summary.find('nodes_configured').get('number'))
}
yield {
'type_instance': 'configured_resources',
'values': int(summary.find('resources_configured').get('number'))
}
# Report node status metrics
cluster_nodes = []
aggregated_nodes_status = {'online': 0, 'offline': 0, 'maintenance': 0}
nodes_total = 0
for node in root.find('nodes').iter('node'):
nodes_total += 1
hostname = shorten_hostname(node.get('name'))
cluster_nodes.append(node.get('name'))
if str_to_bool(node.get('online')):
if str_to_bool(node.get('maintenance')):
aggregated_nodes_status['maintenance'] += 1
yield {
'type_instance': 'node_status',
'values': MAINTENANCE_STATUS,
'meta': {'status': 'maintenance', 'host': hostname}
}
else:
aggregated_nodes_status['online'] += 1
yield {
'type_instance': 'node_status',
'values': ONLINE_STATUS,
'meta': {'status': 'online', 'host': hostname}
}
else:
aggregated_nodes_status['offline'] += 1
yield {
'type_instance': 'node_status',
'values': OFFLINE_STATUS,
'meta': {'status': 'offline', 'host': hostname}
}
for status, cnt in aggregated_nodes_status.items():
yield {
'type_instance': 'nodes_count',
'values': cnt,
'meta': {'status': status}
}
yield {
'type_instance': 'nodes_percent',
'values': 100.0 * cnt / nodes_total,
'meta': {'status': status}
}
# Report the number of resources per status
# Clone resources can run on multipe nodes while "simple" resources run
# only one node at the same time
aggregated_resources = defaultdict(Counter)
resources = root.find('resources')
for resource_id, resource_name in self.resources.iteritems():
resource_elts = []
simple_resource = None
clone_resource = resources.find(
'clone/resource[@id="{}"]/..'.format(resource_id))
if not clone_resource:
simple_resource = resources.find('resource[@id="{}"]'.format(
resource_id))
if simple_resource:
resource_elts = [simple_resource]
else:
resource_elts = clone_resource.findall('resource')
if not resource_elts:
self.logger.error("{}: Couldn't find resource '{}'".format(
self.plugin, resource_id))
continue
total = 0
for item in resource_elts:
total += 1
if (item.get('role') in ('Slave', 'Master') and
not str_to_bool(item.get('failed'))):
# Multi-master resource
aggregated_resources[resource_name]['up'] += 1
elif item.get('role') == 'Started':
aggregated_resources[resource_name]['up'] += 1
else:
aggregated_resources[resource_name]['down'] += 1
if simple_resource:
# Report on which node the "simple" resource is running
for node in cluster_nodes:
yield {
'type_instance': 'local_resource_active',
'values': str_to_boolint(
node == simple_resource.find('node').get('name')),
'meta': {'resource': resource_name,
'host': shorten_hostname(node)}
}
for status in ('up', 'down'):
cnt = aggregated_resources[resource_name][status]
yield {
'type_instance': 'resource_count',
'values': cnt,
'meta': {'status': status, 'resource': resource_name}
}
yield {
'type_instance': 'resource_percent',
'values': 100.0 * cnt / total,
'meta': {'status': status, 'resource': resource_name}
}
# Collect operations' history metrics for the monitored resources
#
# The reported count for the resource's operations is an approximate
# value because crm_mon doesn't provide the exact number. To estimate
# the number of operations applied to a resource, the plugin keeps a
# copy of call_ids and compares it with the current value.
for node in root.find('node_history').iter('node'):
hostname = shorten_hostname(node.get('name'))
if hostname not in self.history:
self.history[hostname] = {}
for resource_id, resource_name in self.resources.iteritems():
if resource_id not in self.history[hostname]:
self.history[hostname][resource_id] = {
'fail_count': 0,
'ops_count': 0,
'call_ids': Set([])
}
v = self.history[hostname][resource_id]
res_history = node.find('resource_history[@id="{}"]'.format(
resource_id))
if res_history:
# For simple resources, the resource_history element only
# exists for the node that runs the resource
v['fail_count'] += int(res_history.get('fail-count', 0))
call_ids = Set([
i.get('call') for i in res_history.findall(
'operation_history')])
if call_ids:
v['ops_count'] += len(call_ids - v['call_ids'])
v['call_ids'] = call_ids
yield {
'type_instance': 'resource_failures',
'values': v['fail_count'],
'meta': {'resource': resource_name, 'host': hostname}
}
yield {
'type_instance': 'resource_operations',
'values': v['ops_count'],
'meta': {'resource': resource_name, 'host': hostname}
}
plugin = CrmMonitorPlugin(collectd)
def init_callback():
plugin.restore_sigchld()
def config_callback(conf):
plugin.config_callback(conf)
def read_callback():
plugin.read_callback()
collectd.register_config(config_callback)
collectd.register_read(read_callback)