Monitor Elasticsearch cluster

This patch adds a new manifest, executed on both InfluxDB and Elasticsearch
nodes, that configures collectd specifically for them. The related
configuration is moved into it from base.pp.

Implements: blueprint elasticsearch-clustering

Change-Id: I0e75446dd97e8c7108be87513a2b13e6909fcf44
This commit is contained in:
Swann Croiset 2016-01-06 18:01:41 +01:00
parent 33bfb1a6cf
commit 3479f09192
8 changed files with 220 additions and 16 deletions

View File

@ -146,18 +146,6 @@ case $influxdb_mode {
$influxdb_password = $influxdb_grafana['influxdb_userpass']
}
if member($current_roles, 'influxdb_grafana') or member($current_roles, 'primary-influxdb_grafana'){
$processes = ['influxd', 'grafana-server', 'hekad', 'collectd']
} else {
$processes = ['hekad', 'collectd']
}
if member($current_roles, 'elasticsearch_kibana') or member($current_roles, 'primary-elasticsearch_kibana') {
$process_matches = [{name => 'elasticsearch', regex => 'java'}]
}else{
$process_matches = undef
}
if $is_controller {
# plugins on the controllers do many network I/O operations so it is
# recommended to increase this value.
@ -167,11 +155,13 @@ case $influxdb_mode {
$collectd_read_threads = 5
}
# TODO(all): this class could be executed several times by other puppet runs,
# which is wasteful and needs to be fixed by using a single collectd.pp
# manifest that configures collectd for all roles.
class { 'lma_collector::collectd::base':
processes => $processes,
process_matches => $process_matches,
read_threads => $collectd_read_threads,
require => Class['lma_collector'],
processes => ['hekad', 'collectd'],
read_threads => $collectd_read_threads,
require => Class['lma_collector'],
}
class { 'lma_collector::influxdb':

View File

@ -0,0 +1,47 @@
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Configures collectd on the backend nodes: selects the processes to watch
# from the node roles and, on Elasticsearch nodes, enables the
# Elasticsearch cluster plugin.
$lma_collector_hash = hiera_hash('lma_collector')

# Everything below is skipped when influxdb_mode is 'disabled'.
if $lma_collector_hash['influxdb_mode'] != 'disabled' {

  # NOTE(review): $network_metadata is looked up but not referenced anywhere
  # in this manifest.
  $network_metadata = hiera('network_metadata')
  $current_roles    = hiera('roles')

  # Role flags (primary and non-primary variants are treated alike).
  $is_elasticsearch_node = member($current_roles, 'elasticsearch_kibana') or member($current_roles, 'primary-elasticsearch_kibana')
  $is_influxdb_node      = member($current_roles, 'influxdb_grafana') or member($current_roles, 'primary-influxdb_grafana')

  # Elasticsearch is matched through the 'java' regex rather than by
  # process name.
  if $is_elasticsearch_node {
    $process_matches = [{name => 'elasticsearch', regex => 'java'}]
  } else {
    $process_matches = undef
  }

  # InfluxDB nodes watch the InfluxDB and Grafana daemons in addition to
  # the collector processes.
  if $is_influxdb_node {
    $processes = ['influxd', 'grafana-server', 'hekad', 'collectd']
  } else {
    $processes = ['hekad', 'collectd']
  }

  class { 'lma_collector::collectd::base':
    processes       => $processes,
    process_matches => $process_matches,
  }

  # The cluster plugin is pointed at the address stored under the
  # 'lma::elasticsearch::vip' hiera key.
  if $is_elasticsearch_node {
    class { 'lma_collector::collectd::elasticsearch':
      address => hiera('lma::elasticsearch::vip'),
    }
  }
}

View File

@ -0,0 +1,99 @@
#!/usr/bin/python
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collectd
import base
import requests
# Name under which the plugin reports its metrics.
NAME = 'elasticsearch_cluster'

# Numeric encoding of the textual cluster status returned by Elasticsearch.
HEALTH_MAP = {
    'green': 1,
    'yellow': 2,
    'red': 3,
}

# Fields of the cluster health response that are reported as-is, one metric
# per field. Each field must appear only once, otherwise the same metric is
# emitted several times per read.
METRICS = ['number_of_nodes', 'active_primary_shards', 'active_shards',
           'relocating_shards', 'unassigned_shards',
           'number_of_pending_tasks', 'initializing_shards']
class ElasticsearchClusterHealthPlugin(base.Base):
    """Collectd plugin reporting the health of an Elasticsearch cluster.

    Each read issues a GET on the ``_cluster/health`` endpoint and emits the
    cluster status (encoded through ``HEALTH_MAP``) followed by one metric
    per field listed in ``METRICS``.
    """

    def __init__(self, *args, **kwargs):
        super(ElasticsearchClusterHealthPlugin, self).__init__(*args, **kwargs)
        self.plugin = NAME
        # Defaults, overridden by the 'Address' and 'Port' options.
        self.address = '127.0.0.1'
        self.port = 9200
        self.max_retries = 3
        # Built by config_callback() once address and port are known.
        self.url = None
        self.session = requests.Session()
        self.session.mount(
            'http://',
            requests.adapters.HTTPAdapter(max_retries=self.max_retries)
        )

    def config_callback(self, conf):
        """Apply the plugin configuration and build the health URL.

        Recognized options (in addition to whatever the parent class
        handles): ``Address`` and ``Port``.
        """
        super(ElasticsearchClusterHealthPlugin, self).config_callback(conf)
        for node in conf.children:
            if node.key == 'Address':
                self.address = node.values[0]
            if node.key == 'Port':
                self.port = node.values[0]
        # int() normalizes the port before it is put in the URL.
        self.url = "http://{address}:{port}/_cluster/health".format(
            address=self.address,
            port=int(self.port),
        )

    def itermetrics(self):
        """Yield one dict per metric read from the cluster health endpoint.

        Nothing is yielded when the request fails, the server does not
        answer with a 200 or the body is not valid JSON. A missing or
        unknown field is logged and skipped so that the other metrics are
        still reported.
        """
        # NOTE(review): no timeout is passed to the request, so an
        # unresponsive server can block this read; consider adding one.
        try:
            r = self.session.get(self.url)
        except Exception as e:
            self.logger.error("Got exception for '{}': {}".format(self.url, e))
            return

        if r.status_code != 200:
            self.logger.error("{} responded with code {}".format(
                self.url, r.status_code))
            return

        try:
            data = r.json()
        except ValueError as e:
            self.logger.error("Got invalid JSON from '{}': {}".format(
                self.url, e))
            return
        self.logger.debug("Got response from Elasticsearch: '%s'" % data)

        status = data.get('status')
        if status in HEALTH_MAP:
            yield {
                'type_instance': 'health',
                'values': HEALTH_MAP[status]
            }
        else:
            self.logger.error("Unknown cluster status '{}' from {}".format(
                status, self.url))

        for metric in METRICS:
            if metric not in data:
                self.logger.error("No '{}' in response from {}".format(
                    metric, self.url))
                continue
            yield {
                'type_instance': metric,
                'values': data[metric]
            }
# Single plugin instance shared by the collectd callbacks below.
plugin = ElasticsearchClusterHealthPlugin()


def init_callback():
    """collectd init hook: delegates to the plugin's restore_sigchld()."""
    plugin.restore_sigchld()


def config_callback(conf):
    """collectd config hook: hands the configuration block to the plugin."""
    plugin.config_callback(conf)


def read_callback():
    """collectd read hook: delegates to the plugin's read_callback()."""
    plugin.read_callback()


# Hook the module-level wrappers into collectd.
collectd.register_init(init_callback)
collectd.register_config(config_callback)
collectd.register_read(read_callback)

View File

@ -352,6 +352,8 @@ function process_message ()
else
msg['Fields']['name'] = 'virt' .. sep .. metric_name
end
elseif metric_source == 'elasticsearch_cluster' then
msg['Fields']['name'] = metric_source .. sep .. sample['type_instance']
else
msg['Fields']['name'] = replace_dot_by_sep(metric_name)
end

View File

@ -0,0 +1,29 @@
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# == Class: lma_collector::collectd::elasticsearch
#
# Declares the 'elasticsearch_cluster' collectd Python plugin.
#
# === Parameters
#
# [*address*]
#   Passed to the plugin as its 'Address' option. Required.
#
# [*port*]
#   Passed to the plugin as its 'Port' option. Defaults to 9200.
#
class lma_collector::collectd::elasticsearch (
  $address,
  $port = 9200,
) {

  validate_integer($port)
  validate_string($address)

  # Options handed to the Python plugin.
  $plugin_config = {
    'Address' => $address,
    'Port'    => $port,
  }

  lma_collector::collectd::python { 'elasticsearch_cluster':
    config => $plugin_config,
  }
}

View File

@ -146,3 +146,9 @@ LMA self-monitoring
-------------------
.. include:: metrics/lma.rst
Elasticsearch
-------------
.. include:: metrics/elasticsearch.rst

View File

@ -0,0 +1,19 @@
.. _Elasticsearch:
The following metrics give a simple overview of the health of the cluster.
See `cluster health`_ for further details.
* ``elasticsearch_cluster_health``, the health status of the entire cluster
where the values ``1``, ``2`` and ``3`` represent ``green``, ``yellow`` and
``red`` respectively.
* ``elasticsearch_cluster_active_primary_shards``, the number of active primary
shards.
* ``elasticsearch_cluster_active_shards``, the number of active shards.
* ``elasticsearch_cluster_initializing_shards``, the number of initializing
shards.
* ``elasticsearch_cluster_number_of_nodes``, the number of nodes in the cluster.
* ``elasticsearch_cluster_number_of_pending_tasks``, the number of pending tasks.
* ``elasticsearch_cluster_relocating_shards``, the number of relocating shards.
* ``elasticsearch_cluster_unassigned_shards``, the number of unassigned shards.
.. _cluster health: https://www.elastic.co/guide/en/elasticsearch/reference/1.7/cluster-health.html

View File

@ -86,6 +86,18 @@
puppet_modules: puppet/modules
timeout: 600
- role:
- primary-elasticsearch_kibana
- elasticsearch_kibana
- primary-influxdb_grafana
- influxdb_grafana
stage: post_deployment/8200
type: puppet
parameters:
puppet_manifest: puppet/manifests/lma_backends.pp
puppet_modules: puppet/modules
timeout: 600
- role: '*'
stage: post_deployment/8200
type: puppet