Generate alarms in Lua and Heka filters from YAML

This commit creates two new functions:

    - get_cluster_names()
    - get_afd_filters()

Change-Id: Ifc660de86a952dbfeccafffd35491a7acccde16d
This commit is contained in:
Guillaume Thouvenin 2015-10-07 13:30:40 +02:00 committed by Simon Pasquier
parent d49b5fb1c8
commit 216926e555
11 changed files with 486 additions and 25 deletions

View File

@ -0,0 +1,30 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Read the 'lma_collector' settings from Hiera; an empty hash is used when
# the key is absent.
$lma = hiera_hash('lma_collector', {})
$alarms_definitions = $lma['alarms']
# Stop the catalog compilation when no alarm definitions are available.
if $alarms_definitions == undef {
fail('Alarms definitions not found. Check files in /etc/hiera/override.')
}
# Declare the AFD filters from the roles and the cluster/alarm mappings
# found in the 'lma_collector' hash.
class { 'lma_collector::afds':
roles => hiera('roles'),
node_cluster_roles => $lma['node_cluster_roles'],
service_cluster_roles => $lma['service_cluster_roles'],
node_cluster_alarms => $lma['node_cluster_alarms'],
service_cluster_alarms => $lma['service_cluster_alarms'],
alarms => $alarms_definitions,
}

View File

@ -18,12 +18,20 @@ local afd = require 'afd'
-- node or service
local afd_type = read_config('afd_type') or error('afd_type must be specified!')
local msg_type = string.format('afd_%s_metric', afd_type)
local msg_field_name = string.format('%s_status', afd_type)
local msg_type
local msg_field_name
local afd_entity
local afd_entity = 'role'
if afd_type == 'service' then
if afd_type == 'node' then
msg_type = 'afd_node_metric'
msg_field_name = 'node_status'
afd_entity = 'node_role'
elseif afd_type == 'service' then
msg_type = 'afd_service_metric'
msg_field_name = 'service_status'
afd_entity = 'service'
else
error('invalid afd_type value')
end
-- ie: controller for node AFD / rabbitmq for service AFD

View File

@ -0,0 +1,148 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
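# This returns a hash keyed by "<cluster name>_<logical name>" whose values
# describe one AFD filter and the alarms associated with it.
#
# ARG0: An array of hashes mapping each cluster name to its alarms
# ARG1: An array of alarm definitions
# ARG2: An array of cluster names for which filters must be built
# ARG3: The type of AFD (node or service)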
#
# Ex:
#
# ARG0: cluster alarms
# [{"rabbitmq"=>[{"queue"=>["rabbitmq-queue-warning"]}]},
# {"apache"=>[{"worker"=>["apache-warning"]}]},
# {"memcached"=>[{"all"=>["memcached-warning"]}]},
# {"haproxy"=>[{"alive"=>["haproxy-warning"]}]}]
#
# ARG1: array of alarms
#
# [
# {"name"=>"rabbitmq-queue-warning",
# "description"=>"Number of message in queues too high",
# "severity"=>"warning",
# "trigger"=>
# {"logical_operator"=>"or",
# "rules"=>
# [{"metric"=>"rabbitmq_messages",
# "relational_operator"=>">=",
# "threshold"=>200,
# "window"=>120,
# "periods"=>0,
# "function"=>"avg"}]}},
# {"name"=>"apache-warning",
# "description"=>"",
# "severity"=>"warning",
# "trigger"=>
# {"logical_operator"=>"or",
# "rules"=>
# [{"metric"=>"apache_idle_workers",
# "relational_operator"=>"=",
# "threshold"=>0,
# "window"=>60,
# "periods"=>0,
# "function"=>"min"},
# {"metric"=>"apache_status",
# "relational_operator"=>"=",
# "threshold"=>0,
# "window"=>60,
# "periods"=>0,
# "function"=>"min"}]}}
# ]
#
# ARG2: ["rabbitmq", "apache"]
#
# ARG3: type (node|service)
#
# Results -> {
# 'rabbitmq_queue' => {
# 'type' => 'service',
# 'cluster_name' => 'rabbitmq',
# 'logical_name' => 'queue',
# 'alarms' => ['rabbitmq-queue-warning'],
# 'alarms_definitions' => {...},
# 'message_matcher' => "Fields[name] == 'rabbitmq_messages'"
# },
# 'apache_worker' => {
# 'type' => 'service',
# 'cluster_name' => 'apache',
# 'logical_name' => 'worker',
# 'alarms' => ['apache-warning'],
# 'alarms_definitions' => {...},
# 'message_matcher' => "Fields[name] == 'apache_idle_workers' || Fields[name] == 'apache_status'"
# }
# }
module Puppet::Parser::Functions
  # get_afd_filters(cluster_alarms, alarms_definitions, cluster_names, type)
  #
  # Builds, for every cluster listed in cluster_names, one entry per logical
  # group of alarms declared for that cluster in cluster_alarms. The entry is
  # keyed by "<cluster name>_<logical name>" and carries the type, the cluster
  # and logical names, the alarm names, all the alarm definitions and a
  # message matcher selecting the metrics used by the alarm rules.
  #
  # Raises Puppet::ParseError when the arguments are malformed or when a
  # referenced alarm definition has no trigger rules.
  newfunction(:get_afd_filters, :type => :rvalue) do |args|
    unless args.size == 4
      raise Puppet::ParseError, "get_afd_filters(): wrong number of arguments (#{args.size} instead of 4)"
    end

    cluster_alarms, alarms_definitions, cluster_names, type = args

    raise Puppet::ParseError, "cluster alarms passed to get_afd_filters is not a list" unless cluster_alarms.is_a?(Array)
    raise Puppet::ParseError, "alarms definitions passed to get_afd_filters is not a list" unless alarms_definitions.is_a?(Array)
    raise Puppet::ParseError, "cluster names passed to get_afd_filters is not a list" unless cluster_names.is_a?(Array)

    afd_filters = {}

    cluster_names.each do |cluster_name|
      # Find the alarms that belong to cluster_name
      cluster_alarms.each do |cluster_alarm|
        cluster_alarm.each do |name, alarms_list|
          next unless name == cluster_name

          alarms_list.each do |alarm|
            alarm.each do |alarm_name, alarm_list|
              # Collect the metrics referenced by the rules of every alarm of
              # the group; they are needed to build the message matcher.
              metrics = []
              alarm_list.each do |a_name|
                alarms_definitions.each do |definition|
                  next unless definition['name'] == a_name

                  trigger = definition['trigger']
                  unless trigger.is_a?(Hash) && trigger['rules'].is_a?(Array)
                    raise Puppet::ParseError, "get_afd_filters(): alarm '#{a_name}' has no trigger rules"
                  end
                  trigger['rules'].each { |rule| metrics << rule['metric'] }
                end
              end

              # A metric shared by several rules must appear only once
              message_matcher = metrics.uniq.map { |m| "Fields[name] == '#{m}'" }.join(' || ')

              afd_filters["#{name}_#{alarm_name}"] = {
                'type' => type,
                'cluster_name' => cluster_name,
                'logical_name' => alarm_name,
                'alarms' => alarm_list,
                'alarms_definitions' => alarms_definitions,
                'message_matcher' => message_matcher
              }
            end
          end
          # The keys of a hash are unique: no other entry can match
          break
        end
      end
    end

    return afd_filters
  end
end

View File

@ -0,0 +1,64 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# This returns an array that contains the list of services or nodes related
# to a role.
#
# ARG0: An array of hash table that contains relation between node/service and
# roles.
# ARG1: An array of roles
#
# Ex:
#
# ARG0:
# [{"controller"=>["primary-controller", "controller"]},
# {"compute"=>["compute"]},
# {"storage"=>["cinder", "ceph-osd"]},
# {"influxdb"=>["influxdb-grafana"]}]
#
# ARG1: ['primary-controller']
#
# Results -> ['controller']
#
module Puppet::Parser::Functions
  # get_cluster_names(data, roles)
  #
  # Returns the names of the clusters (without duplicates) whose list of roles
  # contains at least one of the given roles. When no cluster matches any of
  # the roles and data contains a cluster named 'default', ['default'] is
  # returned instead.
  #
  # Raises Puppet::ParseError when data or roles is not an array.
  newfunction(:get_cluster_names, :type => :rvalue) do |args|
    data = args[0]
    roles = args[1]

    raise Puppet::ParseError, "data passed to get_cluster_names is not a list" unless data.is_a?(Array)
    raise Puppet::ParseError, "roles passed to get_cluster_names is not a list" unless roles.is_a?(Array)

    cluster_names = []
    roles.each do |role|
      data.each do |mapping|
        mapping.each do |name, cluster_roles|
          cluster_names << name if cluster_roles.include?(role)
        end
      end
    end

    # The presence of a 'default' cluster must not depend on its position in
    # the list, so it is computed over the whole data.
    has_default = data.any? do |mapping|
      mapping.any? { |name, _| name == 'default' }
    end

    # Fall back to the 'default' cluster only once all the roles have been
    # examined and none of them matched a cluster.
    cluster_names << 'default' if cluster_names.empty? && has_default

    return cluster_names.uniq
  end
end

View File

@ -0,0 +1,56 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Declares one AFD filter: the Lua module describing the alarms (rendered from
# the lma_alarms.erb template, which uses $alarms and $alarms_definitions) and
# the Heka sandbox filter that loads it.
#
# $type               - value passed to the filter as afd_type
# $cluster_name       - value passed to the filter as afd_cluster_name
# $logical_name       - value passed to the filter as afd_logical_name
# $alarms             - names of the alarms rendered in the Lua module
# $alarms_definitions - definitions in which the alarm names are looked up
# $message_matcher    - expression selecting the metrics handled by the filter
define lma_collector::afd_filter (
  $type,
  $cluster_name,
  $logical_name,
  $alarms,
  $alarms_definitions,
  $message_matcher,
) {
  include lma_collector::params
  include lma_collector::service
  include heka::params

  $alarms_dir = $heka::params::lua_modules_dir
  $afd_file = "${alarms_dir}/lma_alarms_${name}.lua"

  # Create the Lua structures that describe alarms. The service is notified
  # because a change of the alarm definitions modifies only this file, not
  # the configuration of the filter.
  file { $afd_file:
    ensure  => present,
    content => template('lma_collector/lma_alarms.erb'),
    notify  => Class['lma_collector::service'],
  }

  # Create the configuration file for Heka
  heka::filter::sandbox { "afd_${type}_${cluster_name}_${logical_name}":
    config_dir      => $lma_collector::params::config_dir,
    filename        => "${lma_collector::params::plugins_dir}/filters/afd.lua",
    message_matcher => "(Type == \'metric\' || Type == \'heka.sandbox.metric\') && (${message_matcher})",
    ticker_interval => 10,
    config          => {
      hostname         => $::hostname,
      afd_type         => $type,
      afd_file         => "lma_alarms_${name}",
      afd_cluster_name => $cluster_name,
      afd_logical_name => $logical_name,
    },
    require         => File[$afd_file],
    notify          => Class['lma_collector::service'],
  }
}

View File

@ -0,0 +1,47 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Creates the lma_collector::afd_filter resources for the node clusters and
# the service clusters matching the given roles.
#
# Every parameter defaults to undef but must be an array: validate_array()
# is applied to each of them.
class lma_collector::afds (
$roles = undef,
$node_cluster_roles = undef,
$service_cluster_roles = undef,
$node_cluster_alarms = undef,
$service_cluster_alarms = undef,
$alarms = undef,
){
validate_array($roles)
validate_array($node_cluster_roles)
validate_array($service_cluster_roles)
validate_array($node_cluster_alarms)
validate_array($service_cluster_alarms)
validate_array($alarms)
# Resolve the roles into node cluster names and service cluster names
$node_cluster_names = get_cluster_names($node_cluster_roles, $roles)
$service_cluster_names = get_cluster_names($service_cluster_roles, $roles)
# Build the hashes of filter parameters, one call per AFD type
$node_afd_filters = get_afd_filters($node_cluster_alarms,
$alarms,
$node_cluster_names,
'node')
$service_afd_filters = get_afd_filters($service_cluster_alarms,
$alarms,
$service_cluster_names,
'service')
# Each key of the hashes becomes the title of an afd_filter resource
create_resources(lma_collector::afd_filter, $node_afd_filters)
create_resources(lma_collector::afd_filter, $service_afd_filters)
}

View File

@ -0,0 +1,62 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
require 'spec_helper'
# Checks that lma_collector::afds declares one Heka sandbox filter and one
# Lua alarms file per node cluster and per service cluster.
describe 'lma_collector::afds' do
let(:facts) do
{:kernel => 'Linux', :operatingsystem => 'Ubuntu',
:osfamily => 'Debian'}
end
describe 'with defaults' do
# One role mapped to one node cluster ('controller') and one service
# cluster ('mysql'), each with a single fake alarm.
let(:params) do
{:roles => ['primary-controller'],
:node_cluster_roles => [{'controller' => ['primary-controller']}],
:service_cluster_roles => [{'mysql' => ['primary-controller']}],
:node_cluster_alarms => [{'controller' => [{'cpu' => ['cpu_warning']}]}],
:service_cluster_alarms => [{'mysql' => [{'all' => ['db_warning']}]}],
:alarms => [
{"name"=>"cpu_warning",
"description"=>"Fake alarm",
"severity"=>"warning",
"trigger"=>
{"logical_operator"=>"or",
"rules"=>
[{"metric"=>"fake_cpu",
"relational_operator"=>">=",
"threshold"=>200,
"window"=>120,
"periods"=>0,
"function"=>"avg"}]}},
{"name"=>"db_warning",
"description"=>"Fake alarm",
"severity"=>"warning",
"trigger"=>
{"logical_operator"=>"or",
"rules"=>
[{"metric"=>"db-warning",
"relational_operator"=>">=",
"threshold"=>200,
"window"=>120,
"periods"=>0,
"function"=>"avg"}]}}]}
end
# Filters are named afd_<type>_<cluster>_<logical name>; the Lua files are
# named lma_alarms_<cluster>_<logical name>.lua
it { is_expected.to contain_heka__filter__sandbox('afd_node_controller_cpu') }
it { is_expected.to contain_file('/usr/share/heka/lua_modules/lma_alarms_controller_cpu.lua') }
it { is_expected.to contain_heka__filter__sandbox('afd_service_mysql_all') }
it { is_expected.to contain_file('/usr/share/heka/lua_modules/lma_alarms_mysql_all.lua') }
end
end

View File

@ -32,7 +32,7 @@ lma_collector:
periods: 0
function: avg
- metric: cpu_wait
relational_operator: '<='
relational_operator: '>='
threshold: 25
window: 120
periods: 0
@ -46,11 +46,11 @@ lma_collector:
- metric: cpu_wait
relational_operator: '>='
threshold: 30
window: 300
window: 120
periods: 0
function: avg
- name: 'cpu-warning-compute'
description: 'CPU critical on compute'
description: 'CPU warning on compute'
severity: 'warning'
trigger:
logical_operator: 'or'
@ -164,7 +164,7 @@ lma_collector:
periods: 0
function: min
- name: 'rabbitmq-queue-warning'
description: 'Number of message in queues too high'
description: 'Number of messages sitting in queues is too high'
severity: 'warning'
trigger:
logical_operator: 'or'
@ -224,7 +224,7 @@ lma_collector:
periods: 0
function: avg
- name: 'fs-warning'
description: 'Filesystem usage'
description: 'Filesystem free space is low'
severity: 'warning'
trigger:
rules:
@ -235,9 +235,9 @@ lma_collector:
threshold: 5
window: 60
periods: 0
function: avg
function: min
- name: 'fs-warning-storage'
description: 'Filesystem usage'
description: 'Filesystem free space is low'
severity: 'warning'
trigger:
rules:
@ -248,9 +248,9 @@ lma_collector:
threshold: 10
window: 60
periods: 0
function: avg
function: min
- name: 'fs-critical-storage'
description: 'Filesystem usage'
description: 'Filesystem free space is low'
severity: 'critical'
trigger:
rules:
@ -261,9 +261,9 @@ lma_collector:
threshold: 5
window: 60
periods: 0
function: avg
function: min
- name: 'fs-critical-compute'
description: 'Filesystem usage'
description: 'Filesystem free space is low'
severity: 'critical'
trigger:
rules:
@ -274,9 +274,9 @@ lma_collector:
threshold: 7
window: 60
periods: 0
function: avg
function: min
- name: 'fs-critical'
description: 'Filesystem usage'
description: 'Filesystem free space is low'
severity: 'critical'
trigger:
rules:
@ -285,9 +285,9 @@ lma_collector:
fs: '*'
relational_operator: '<'
threshold: 2
window: 30
window: 60
periods: 0
function: avg
function: min
node_cluster_roles:
- controller: ['primary-controller', 'controller']
- compute: ['compute']
@ -302,16 +302,16 @@ lma_collector:
node_cluster_alarms:
- controller:
- system: ['cpu-critical-controller', 'cpu-warning-controller']
- fs: ['fs-warning', 'fs-critical']
- fs: ['fs-critical', 'fs-warning']
- compute:
- system: ['cpu-critical-compute', 'cpu-warning-compute']
- fs: ['fs-warning', 'fs-critical-compute']
- fs: ['fs-critical', 'fs-critical-compute', 'fs-warning']
- storage:
- system: ['cpu-critical-storage', 'cpu-warning-storage']
- fs: ['fs-warning-storage', 'fs-critical-storage']
- _default:
- fs: ['fs-critical-storage', 'fs-warning-storage']
- default:
- cpu: ['cpu-critical-default']
- fs: ['fs-warning', 'fs-critical']
- fs: ['fs-critical', 'fs-warning']
service_cluster_alarms:
- rabbitmq:
- queue: ['rabbitmq-queue-warning']

View File

@ -127,7 +127,7 @@ lma_collector:
- afd_node_metric
aggregator_flag: true
# the field in the input messages to identify the cluster
cluster_field: hostname
cluster_field: node_role
# the field in the input messages to identify the cluster's member
member_field: source
output_message_type: gse_node_cluster_metric

View File

@ -0,0 +1,38 @@
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module

<% lua_escape = lambda { |s| s.to_s.gsub(/[\\'\n]/) { |c| "\\#{c}" } } -%>
-- The description is free text: backslashes, single quotes and newlines are
-- escaped so that it always forms a valid Lua string.
local alarms = {
<% @alarms.each do |alarm_name| -%>
<% @alarms_definitions.each do |alarm| -%>
<% if alarm_name == alarm["name"] -%>
  {
    ['name'] = '<%= alarm_name %>',
    ['description'] = '<%= lua_escape.call(alarm["description"]) %>',
    ['severity'] = '<%= alarm["severity"] %>',
    ['trigger'] = {
      ['logical_operator'] = '<%= alarm["trigger"]["logical_operator"] || 'or' %>',
      ['rules'] = {
<% alarm["trigger"]["rules"].each do |rule| -%>
        {
          ['metric'] = '<%= rule["metric"] %>',
          ['fields'] = {
<% (rule["fields"] || []).each do |k, v| -%>
            ['<%= k %>'] = '<%= v %>',
<% end -%>
          },
          ['relational_operator'] = '<%= rule["relational_operator"] %>',
          ['threshold'] = '<%= rule["threshold"] %>',
          ['window'] = '<%= rule["window"] %>',
          ['periods'] = '<%= rule["periods"] || 0 %>',
          ['function'] = '<%= rule["function"] %>',
        },
<% end -%>
      },
    },
  },
<% end -%>
<% end -%>
<% end -%>
}

return alarms

View File

@ -76,6 +76,14 @@
puppet_modules: puppet/modules
timeout: 600
- role: '*'
stage: post_deployment/8200
type: puppet
parameters:
puppet_manifest: puppet/manifests/configure_afd_filters.pp
puppet_modules: puppet/modules
timeout: 600
- role: '*'
stage: post_deployment/8200
type: puppet