From 216926e555755219ea8aa1201f107681cc3f4c0d Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Wed, 7 Oct 2015 13:30:40 +0200 Subject: [PATCH] Generate alarms in Lua and Heka filters from YAML This commit creates two new functions: - get_cluster_names() - get_afd_filters() Change-Id: Ifc660de86a952dbfeccafffd35491a7acccde16d --- .../puppet/manifests/configure_afd_filters.pp | 30 ++++ .../files/plugins/filters/afd.lua | 16 +- .../parser/functions/get_afd_filters.rb | 148 ++++++++++++++++++ .../parser/functions/get_cluster_names.rb | 64 ++++++++ .../lma_collector/manifests/afd_filter.pp | 56 +++++++ .../modules/lma_collector/manifests/afds.pp | 47 ++++++ .../spec/classes/lma_collector_afds_spec.rb | 62 ++++++++ .../lma_collector/templates/alarming.yaml.erb | 40 ++--- .../templates/gse_filters.yaml.erb | 2 +- .../lma_collector/templates/lma_alarms.erb | 38 +++++ tasks.yaml | 8 + 11 files changed, 486 insertions(+), 25 deletions(-) create mode 100644 deployment_scripts/puppet/manifests/configure_afd_filters.pp create mode 100644 deployment_scripts/puppet/modules/lma_collector/lib/puppet/parser/functions/get_afd_filters.rb create mode 100644 deployment_scripts/puppet/modules/lma_collector/lib/puppet/parser/functions/get_cluster_names.rb create mode 100644 deployment_scripts/puppet/modules/lma_collector/manifests/afd_filter.pp create mode 100644 deployment_scripts/puppet/modules/lma_collector/manifests/afds.pp create mode 100644 deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_afds_spec.rb create mode 100644 deployment_scripts/puppet/modules/lma_collector/templates/lma_alarms.erb diff --git a/deployment_scripts/puppet/manifests/configure_afd_filters.pp b/deployment_scripts/puppet/manifests/configure_afd_filters.pp new file mode 100644 index 000000000..441673697 --- /dev/null +++ b/deployment_scripts/puppet/manifests/configure_afd_filters.pp @@ -0,0 +1,30 @@ +# Copyright 2015 Mirantis, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + +$lma = hiera_hash('lma_collector', {}) + +$alarms_definitions = $lma['alarms'] +if $alarms_definitions == undef { + fail('Alarms definitions not found. Check files in /etc/hiera/override.') +} + +class { 'lma_collector::afds': + roles => hiera('roles'), + node_cluster_roles => $lma['node_cluster_roles'], + service_cluster_roles => $lma['service_cluster_roles'], + node_cluster_alarms => $lma['node_cluster_alarms'], + service_cluster_alarms => $lma['service_cluster_alarms'], + alarms => $alarms_definitions, +} diff --git a/deployment_scripts/puppet/modules/lma_collector/files/plugins/filters/afd.lua b/deployment_scripts/puppet/modules/lma_collector/files/plugins/filters/afd.lua index cc5cafdcf..e7fe88c1c 100644 --- a/deployment_scripts/puppet/modules/lma_collector/files/plugins/filters/afd.lua +++ b/deployment_scripts/puppet/modules/lma_collector/files/plugins/filters/afd.lua @@ -18,12 +18,20 @@ local afd = require 'afd' -- node or service local afd_type = read_config('afd_type') or error('afd_type must be specified!') -local msg_type = string.format('afd_%s_metric', afd_type) -local msg_field_name = string.format('%s_status', afd_type) +local msg_type +local msg_field_name +local afd_entity -local afd_entity = 'role' -if afd_type == 'service' then +if afd_type == 'node' then + msg_type = 'afd_node_metric' + msg_field_name = 'node_status' + afd_entity = 'node_role' +elseif afd_type == 'service' then + msg_type = 
'afd_service_metric' + msg_field_name = 'service_status' afd_entity = 'service' +else + error('invalid afd_type value') end -- ie: controller for node AFD / rabbitmq for service AFD diff --git a/deployment_scripts/puppet/modules/lma_collector/lib/puppet/parser/functions/get_afd_filters.rb b/deployment_scripts/puppet/modules/lma_collector/lib/puppet/parser/functions/get_afd_filters.rb new file mode 100644 index 000000000..343f128c4 --- /dev/null +++ b/deployment_scripts/puppet/modules/lma_collector/lib/puppet/parser/functions/get_afd_filters.rb @@ -0,0 +1,148 @@ +# Copyright 2015 Mirantis, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + +# This returns a hash that contains the filename of the alarm as key and +# list of alarms associated. 
+# +# ARG0: cluster alarms (cluster name => logical name => alarm names); ARG1: alarm definitions +# ARG2: cluster names to build filters for; ARG3: AFD type (node|service) +# +# Ex: +# +# ARG0: cluster alarms +# [{"rabbitmq"=>[{"queue"=>["rabbitmq-queue-warning"]}]}, +# {"apache"=>[{"worker"=>["apache-warning"]}]}, +# {"memcached"=>[{"all"=>["memcached-warning"]}]}, +# {"haproxy"=>[{"alive"=>["haproxy-warning"]}]}] +# +# ARG1: array of alarms +# +# [ +# {"name"=>"rabbitmq-queue-warning", +# "description"=>"Number of message in queues too high", +# "severity"=>"warning", +# "trigger"=> +# {"logical_operator"=>"or", +# "rules"=> +# [{"metric"=>"rabbitmq_messages", +# "relational_operator"=>">=", +# "threshold"=>200, +# "window"=>120, +# "periods"=>0, +# "function"=>"avg"}]}}, +# {"name"=>"apache-warning", +# "description"=>"", +# "severity"=>"warning", +# "trigger"=> +# {"logical_operator"=>"or", +# "rules"=> +# [{"metric"=>"apache_idle_workers", +# "relational_operator"=>"=", +# "threshold"=>0, +# "window"=>60, +# "periods"=>0, +# "function"=>"min"}, +# {"metric"=>"apache_status", +# "relational_operator"=>"=", +# "threshold"=>0, +# "window"=>60, +# "periods"=>0, +# "function"=>"min"}]}} +# ] +# +# ARG2: ["rabbitmq", "apache"] +# +# ARG3: type (node|service) +# +# Results -> { +# 'rabbitmq_queue' => { +# 'type' => 'service', +# 'cluster_name' => 'rabbitmq', +# 'logical_name' => 'queue', +# 'alarms' => ['rabbitmq-queue-warning'], +# 'alarms_definitions' => [...], +# 'message_matcher' => "Fields[name] == 'rabbitmq_messages'" +# }, +# 'apache_worker' => { +# 'type' => 'service', +# 'cluster_name' => 'apache', +# 'logical_name' => 'worker', +# 'alarms' => ['apache-warning'], +# 'alarms_definitions' => [...], +# 'message_matcher' => "Fields[name] == 'apache_idle_workers' || Fields[name] == 'apache_status'" +# } +# } + +module Puppet::Parser::Functions + newfunction(:get_afd_filters, :type => :rvalue) do |args| + + cluster_alarms = args[0] + alarms_definitions = args[1] + cluster_names = args[2] + type = args[3] + 
afd_filters = {} + + cluster_names.each do |cluster_name| + # find the alarms that belong to the cluster_name + cluster_alarms.each do |cluster_alarm| + cluster_alarm.each do |name, alarms_list| + if name == cluster_name + # We need to get the list of metrics associated with the alarms + alarms_list.each do |alarm| + alarm.each do |alarm_name, alarm_list| + + # Collect the metrics used by the rules of every alarm in + # alarm_list to build the message matcher + metrics = [].to_set + alarm_list.each do |a_name| + alarms_definitions.each do |definition| + if definition['name'] == a_name + rules = definition['trigger']['rules'] + rules.each do |r| + metrics.add(r['metric']) + end + end + end + end + + message_matcher = "" + metrics.each do |m| + if message_matcher.empty? + message_matcher = "Fields[name] == \'#{m}\'" + else + message_matcher = message_matcher + " || Fields[name] == \'#{m}\'" + end + end + + afd_filters["#{name}_#{alarm_name}"] = { + 'type' => type, + 'cluster_name' => cluster_name, + 'logical_name' => alarm_name, + 'alarms' => alarm_list, + 'alarms_definitions' => alarms_definitions, + 'message_matcher' => message_matcher + } + end + end + + break + end + end + end + end + + return afd_filters + end +end diff --git a/deployment_scripts/puppet/modules/lma_collector/lib/puppet/parser/functions/get_cluster_names.rb b/deployment_scripts/puppet/modules/lma_collector/lib/puppet/parser/functions/get_cluster_names.rb new file mode 100644 index 000000000..9e96bcfbb --- /dev/null +++ b/deployment_scripts/puppet/modules/lma_collector/lib/puppet/parser/functions/get_cluster_names.rb @@ -0,0 +1,64 @@ +# Copyright 2015 Mirantis, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + +# This returns an array that contains the list of services or nodes related +# to a role. +# +# ARG0: An array of hash table that contains relation between node/service and +# roles. +# ARG1: An array of roles +# +# Ex: +# +# ARG0: +# [{"controller"=>["primary-controller", "controller"]}, +# {"compute"=>["compute"]}, +# {"storage"=>["cinder", "ceph-osd"]}, +# {"influxdb"=>["influxdb-grafana"]}] +# +# ARG1: ['primary-controller'] +# +# Results -> ['controller'] +# + +module Puppet::Parser::Functions + newfunction(:get_cluster_names, :type => :rvalue) do |args| + + data = args[0] + roles = args[1] + + raise Puppet::ParseError, "data passed to get_cluster_names is not a list" unless data.is_a?(Array) + raise Puppet::ParseError, "roles passed to get_cluster_names is not a list" unless roles.is_a?(Array) + + cluster_names = [].to_set + has_default = false + + roles.each do |role| + data.each do |v| + v.each { |name, t| + cluster_names.add(name) if t.include?(role) + has_default = (name == 'default') + } + end + + # if cluster_names["node"] is empty, it means that we didn't find a cluster + # name that matches with role. So add "default" name if there is a default + # value + cluster_names.add("default") if cluster_names.empty? 
and has_default + end + + return cluster_names.to_a() + end +end diff --git a/deployment_scripts/puppet/modules/lma_collector/manifests/afd_filter.pp b/deployment_scripts/puppet/modules/lma_collector/manifests/afd_filter.pp new file mode 100644 index 000000000..db8c45eb6 --- /dev/null +++ b/deployment_scripts/puppet/modules/lma_collector/manifests/afd_filter.pp @@ -0,0 +1,56 @@ +# Copyright 2015 Mirantis, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + +define lma_collector::afd_filter ( + $type, + $cluster_name, + $logical_name, + $alarms, + $alarms_definitions, + $message_matcher, +) { + include lma_collector::params + include lma_collector::service + include heka::params + + $alarms_dir = $heka::params::lua_modules_dir + $afd_file = "${alarms_dir}/lma_alarms_${name}.lua" + + + # Create the Lua structures that describe alarms + file { $afd_file: + ensure => present, + content => template('lma_collector/lma_alarms.erb'), + } + + # Create the confguration file for Heka + heka::filter::sandbox { "afd_${type}_${cluster_name}_${logical_name}": + config_dir => $lma_collector::params::config_dir, + filename => "${lma_collector::params::plugins_dir}/filters/afd.lua", + message_matcher => "(Type == \'metric\' || Type == \'heka.sandbox.metric\') && (${message_matcher})", + ticker_interval => 10, + config => { + hostname => $::hostname, + afd_type => $type, + afd_file => "lma_alarms_${name}", + afd_cluster_name => $cluster_name, + afd_logical_name => $logical_name, + 
}, + require => File[$afd_file], + notify => Class['lma_collector::service'], + } +} + + diff --git a/deployment_scripts/puppet/modules/lma_collector/manifests/afds.pp b/deployment_scripts/puppet/modules/lma_collector/manifests/afds.pp new file mode 100644 index 000000000..2728861bb --- /dev/null +++ b/deployment_scripts/puppet/modules/lma_collector/manifests/afds.pp @@ -0,0 +1,47 @@ +# Copyright 2015 Mirantis, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + +class lma_collector::afds ( + $roles = undef, + $node_cluster_roles = undef, + $service_cluster_roles = undef, + $node_cluster_alarms = undef, + $service_cluster_alarms = undef, + $alarms = undef, +){ + + validate_array($roles) + validate_array($node_cluster_roles) + validate_array($service_cluster_roles) + validate_array($node_cluster_alarms) + validate_array($service_cluster_alarms) + validate_array($alarms) + + $node_cluster_names = get_cluster_names($node_cluster_roles, $roles) + $service_cluster_names = get_cluster_names($service_cluster_roles, $roles) + + $node_afd_filters = get_afd_filters($node_cluster_alarms, + $alarms, + $node_cluster_names, + 'node') + + $service_afd_filters = get_afd_filters($service_cluster_alarms, + $alarms, + $service_cluster_names, + 'service') + + create_resources(lma_collector::afd_filter, $node_afd_filters) + create_resources(lma_collector::afd_filter, $service_afd_filters) +} diff --git 
a/deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_afds_spec.rb b/deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_afds_spec.rb new file mode 100644 index 000000000..aca81019f --- /dev/null +++ b/deployment_scripts/puppet/modules/lma_collector/spec/classes/lma_collector_afds_spec.rb @@ -0,0 +1,62 @@ +# Copyright 2015 Mirantis, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +require 'spec_helper' + +describe 'lma_collector::afds' do + let(:facts) do + {:kernel => 'Linux', :operatingsystem => 'Ubuntu', + :osfamily => 'Debian'} + end + + describe 'with defaults' do + let(:params) do + {:roles => ['primary-controller'], + :node_cluster_roles => [{'controller' => ['primary-controller']}], + :service_cluster_roles => [{'mysql' => ['primary-controller']}], + :node_cluster_alarms => [{'controller' => [{'cpu' => ['cpu_warning']}]}], + :service_cluster_alarms => [{'mysql' => [{'all' => ['db_warning']}]}], + :alarms => [ + {"name"=>"cpu_warning", + "description"=>"Fake alarm", + "severity"=>"warning", + "trigger"=> + {"logical_operator"=>"or", + "rules"=> + [{"metric"=>"fake_cpu", + "relational_operator"=>">=", + "threshold"=>200, + "window"=>120, + "periods"=>0, + "function"=>"avg"}]}}, + {"name"=>"db_warning", + "description"=>"Fake alarm", + "severity"=>"warning", + "trigger"=> + {"logical_operator"=>"or", + "rules"=> + [{"metric"=>"db-warning", + "relational_operator"=>">=", + "threshold"=>200, + "window"=>120, + 
"periods"=>0, + "function"=>"avg"}]}}]} + end + + it { is_expected.to contain_heka__filter__sandbox('afd_node_controller_cpu') } + it { is_expected.to contain_file('/usr/share/heka/lua_modules/lma_alarms_controller_cpu.lua') } + + it { is_expected.to contain_heka__filter__sandbox('afd_service_mysql_all') } + it { is_expected.to contain_file('/usr/share/heka/lua_modules/lma_alarms_mysql_all.lua') } + end +end diff --git a/deployment_scripts/puppet/modules/lma_collector/templates/alarming.yaml.erb b/deployment_scripts/puppet/modules/lma_collector/templates/alarming.yaml.erb index ed4d30eb6..ab02e92da 100644 --- a/deployment_scripts/puppet/modules/lma_collector/templates/alarming.yaml.erb +++ b/deployment_scripts/puppet/modules/lma_collector/templates/alarming.yaml.erb @@ -32,7 +32,7 @@ lma_collector: periods: 0 function: avg - metric: cpu_wait - relational_operator: '<=' + relational_operator: '>=' threshold: 25 window: 120 periods: 0 @@ -46,11 +46,11 @@ lma_collector: - metric: cpu_wait relational_operator: '>=' threshold: 30 - window: 300 + window: 120 periods: 0 function: avg - name: 'cpu-warning-compute' - description: 'CPU critical on compute' + description: 'CPU warning on compute' severity: 'warning' trigger: logical_operator: 'or' @@ -164,7 +164,7 @@ lma_collector: periods: 0 function: min - name: 'rabbitmq-queue-warning' - description: 'Number of message in queues too high' + description: 'Number of messages sitting in queues is too high' severity: 'warning' trigger: logical_operator: 'or' @@ -224,7 +224,7 @@ lma_collector: periods: 0 function: avg - name: 'fs-warning' - description: 'Filesystem usage' + description: 'Filesystem free space is low' severity: 'warning' trigger: rules: @@ -235,9 +235,9 @@ lma_collector: threshold: 5 window: 60 periods: 0 - function: avg + function: min - name: 'fs-warning-storage' - description: 'Filesystem usage' + description: 'Filesystem free space is low' severity: 'warning' trigger: rules: @@ -248,9 +248,9 @@ 
lma_collector: threshold: 10 window: 60 periods: 0 - function: avg + function: min - name: 'fs-critical-storage' - description: 'Filesystem usage' + description: 'Filesystem free space is low' severity: 'critical' trigger: rules: @@ -261,9 +261,9 @@ lma_collector: threshold: 5 window: 60 periods: 0 - function: avg + function: min - name: 'fs-critical-compute' - description: 'Filesystem usage' + description: 'Filesystem free space is low' severity: 'critical' trigger: rules: @@ -274,9 +274,9 @@ lma_collector: threshold: 7 window: 60 periods: 0 - function: avg + function: min - name: 'fs-critical' - description: 'Filesystem usage' + description: 'Filesystem free space is low' severity: 'critical' trigger: rules: @@ -285,9 +285,9 @@ lma_collector: fs: '*' relational_operator: '<' threshold: 2 - window: 30 + window: 60 periods: 0 - function: avg + function: min node_cluster_roles: - controller: ['primary-controller', 'controller'] - compute: ['compute'] @@ -302,16 +302,16 @@ lma_collector: node_cluster_alarms: - controller: - system: ['cpu-critical-controller', 'cpu-warning-controller'] - - fs: ['fs-warning', 'fs-critical'] + - fs: ['fs-critical', 'fs-warning'] - compute: - system: ['cpu-critical-compute', 'cpu-warning-compute'] - - fs: ['fs-warning', 'fs-critical-compute'] + - fs: ['fs-critical', 'fs-critical-compute', 'fs-warning'] - storage: - system: ['cpu-critical-storage', 'cpu-warning-storage'] - - fs: ['fs-warning-storage', 'fs-critical-storage'] - - _default: + - fs: ['fs-critical-storage', 'fs-warning-storage'] + - default: - cpu: ['cpu-critical-default'] - - fs: ['fs-warning', 'fs-critical'] + - fs: ['fs-critical', 'fs-warning'] service_cluster_alarms: - rabbitmq: - queue: ['rabbitmq-queue-warning'] diff --git a/deployment_scripts/puppet/modules/lma_collector/templates/gse_filters.yaml.erb b/deployment_scripts/puppet/modules/lma_collector/templates/gse_filters.yaml.erb index 13fe84903..17dc1cc47 100644 --- 
a/deployment_scripts/puppet/modules/lma_collector/templates/gse_filters.yaml.erb +++ b/deployment_scripts/puppet/modules/lma_collector/templates/gse_filters.yaml.erb @@ -127,7 +127,7 @@ lma_collector: - afd_node_metric aggregator_flag: true # the field in the input messages to identify the cluster - cluster_field: hostname + cluster_field: node_role # the field in the input messages to identify the cluster's member member_field: source output_message_type: gse_node_cluster_metric diff --git a/deployment_scripts/puppet/modules/lma_collector/templates/lma_alarms.erb b/deployment_scripts/puppet/modules/lma_collector/templates/lma_alarms.erb new file mode 100644 index 000000000..809996042 --- /dev/null +++ b/deployment_scripts/puppet/modules/lma_collector/templates/lma_alarms.erb @@ -0,0 +1,38 @@ +local M = {} +setfenv(1, M) -- Remove external access to contain everything in the module + +local alarms = { +<% @alarms.each do |alarm_name| -%> +<% @alarms_definitions.each do |alarm| -%> +<% if alarm_name == alarm["name"] -%> + { + ['name'] = '<%= alarm_name %>', + ['description'] = '<%= alarm["description"] %>', + ['severity'] = '<%= alarm["severity"] %>', + ['trigger'] = { + ['logical_operator'] = '<%= alarm["trigger"]["logical_operator"] || 'or' %>', + ['rules'] = { +<% alarm["trigger"]["rules"].each do |rule| -%> + { + ['metric'] = '<%= rule["metric"] %>', + ['fields'] = { +<% (rule["fields"] || []).each do |k, v| -%> + ['<%= k %>'] = '<%= v %>', +<% end -%> + }, + ['relational_operator'] = '<%= rule["relational_operator"] %>', + ['threshold'] = '<%= rule["threshold"] %>', + ['window'] = '<%= rule["window"] %>', + ['periods'] = '<%= rule["periods"] || 0 %>', + ['function'] = '<%= rule["function"] %>', + }, +<% end -%> + }, + }, + }, +<% end -%> +<% end -%> +<% end -%> +} + +return alarms diff --git a/tasks.yaml b/tasks.yaml index 95b520569..fe0712cb4 100644 --- a/tasks.yaml +++ b/tasks.yaml @@ -76,6 +76,14 @@ puppet_modules: puppet/modules timeout: 600 +- role: '*' + 
stage: post_deployment/8200 + type: puppet + parameters: + puppet_manifest: puppet/manifests/configure_afd_filters.pp + puppet_modules: puppet/modules + timeout: 600 + - role: '*' stage: post_deployment/8200 type: puppet