Update service status filter for InfluxDB 0.9

This change modifies the service accumulator and global service status
filters to support the new data model implemented for InfluxDB 0.9.

Change-Id: Id437d1111660d2d55bb7ba8dc46ed8c8eb8e41b7
Implements: blueprint upgrade-influxdb-grafana
Simon Pasquier
2015-08-05 10:50:33 +02:00
parent 8f268f04ff
commit 098c3189f5
9 changed files with 585 additions and 93 deletions
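The "new data model" referenced above moves dimensions out of dot-delimited metric names and into message fields that the InfluxDB 0.9 encoder can emit as tags. A minimal Lua sketch with illustrative values, contrasting the legacy naming with the new flat names plus fields:

-- Sketch with illustrative values: the same worker-count sample expressed
-- in the legacy naming scheme and in the InfluxDB 0.9 data model.

-- Legacy: every dimension is baked into the dot-delimited metric name and
-- downstream filters recover them with string patterns.
local legacy_sample = {
    name  = 'openstack.nova.services.scheduler.up',
    value = 2,
}

-- New model: a flat metric name plus message fields carrying the dimensions,
-- which the InfluxDB encoder can turn into tags.
local new_sample = {
    name    = 'openstack_nova_services',
    value   = 2,
    service = 'scheduler',
    state   = 'up',
}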

View File

@@ -162,7 +162,12 @@ if $lma_collector['influxdb_mode'] != 'disabled' {
}
# Service status metrics and annotations
class { 'lma_collector::metrics::service_status': }
if $lma_collector['influxdb_legacy'] {
class { 'lma_collector::metrics::service_status_legacy': }
} else {
class { 'lma_collector::metrics::service_status': }
}
}
$alerting_mode = $lma_collector['alerting_mode']

View File

@@ -157,6 +157,11 @@ function process_message ()
elseif metric_source == 'check_openstack_api' then
-- For OpenStack API metrics, plugin_instance = <service name>
msg['Fields']['name'] = 'openstack' .. sep .. sample['plugin_instance'] .. sep .. 'check_api'
-- 'service' isn't used as a tag because there is no point in
-- aggregating data across all services. It is still stored in the
-- Fields table because it simplifies the work of the
-- downstream filters consuming this data.
msg['Fields']['service'] = sample['plugin_instance']
if sample['type_instance'] ~= nil and sample['type_instance'] ~= '' then
msg['Fields']['os_region'] = sample['type_instance']
end
@@ -270,7 +275,9 @@ function process_message ()
msg['Fields']['service'] = service
msg['Fields']['state'] = state
elseif metric_source == 'pacemaker_resource' then
msg['Fields']['name'] = 'pacemaker_resource' .. sep .. sample['type_instance'] .. sep .. 'active'
msg['Fields']['name'] = 'pacemaker_local_resource_active'
msg['Fields']['tag_fields'] = { 'resource' }
msg['Fields']['resource'] = sample['type_instance']
elseif metric_source == 'users' then
-- 'users' is a reserved name for InfluxDB v0.9
msg['Fields']['name'] = 'logged_users'
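To illustrate the decoder changes above, here is a small sketch of the message produced for a check_openstack_api sample and how a downstream filter can derive the service name from Fields[service]. The separator is assumed to be '_' and the sample values are made up:

-- Assumed: sep = '_' and an illustrative collectd sample.
local sep = '_'
local sample = { plugin_instance = 'nova-api', type_instance = 'RegionOne', value = 1 }

local msg = { Fields = {} }
msg.Fields.name      = 'openstack' .. sep .. sample.plugin_instance .. sep .. 'check_api'
msg.Fields.service   = sample.plugin_instance   -- plain field, not a tag
msg.Fields.os_region = sample.type_instance
msg.Fields.value     = sample.value

-- Downstream, the status filters can split "<name>-<item>" without
-- parsing the metric name itself:
local item_name = msg.Fields.service                   -- 'nova-api'
local name      = string.match(item_name, '^([^-]+)')  -- 'nova'
assert(name == 'nova' and msg.Fields.name == 'openstack_nova-api_check_api')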

View File

@@ -77,24 +77,29 @@ require 'math'
local floor = math.floor
local utils = require 'lma_utils'
_PRESERVATION_VERSION = 1
_PRESERVATION_VERSION = 2
-- variables with global scope are preserved between restarts
services = {}
vip_active_at = 0
local payload_name = read_config('inject_payload_name') or 'service_status'
local state_label_map = {
up = utils.state_map.UP,
down = utils.state_map.DOWN,
disabled = utils.state_map.DISABLED,
}
function process_message ()
local ts = floor(read_message("Timestamp")/1e6) -- ms
local ts = read_message("Timestamp")
local metric_name = read_message("Fields[name]")
local value = read_message("Fields[value]")
local state = state_label_map[read_message('Fields[state]')]
local name
local top_entry
local item_name
local group_name
local state
if string.find(metric_name, '^pacemaker.resource.vip__public') then
if metric_name == 'pacemaker_local_resource_active' and read_message("Fields[resource]") == 'vip__public' then
if value == 1 then
vip_active_at = ts
else
@@ -103,42 +108,34 @@ function process_message ()
return 0
end
if string.find(metric_name, '%.up$') then
state = utils.state_map.UP
elseif string.find(metric_name, '%.down$') then
state = utils.state_map.DOWN
elseif string.find(metric_name, '%.disabled$') then
state = utils.state_map.DISABLED
end
if string.find(metric_name, '^openstack') then
name, group_name, item_name = string.match(metric_name, '^openstack%.([^._]+)%.([^._]+)%.([^._]+)')
if string.find(metric_name, '^openstack_([^._]+)_services$') or string.find(metric_name, '^openstack_([^._]+)_agents$') then
name, group_name = string.match(metric_name, '([^_]+)_([^_]+)$')
top_entry = 'workers'
if not item_name then
-- A service can have several API checks; by convention the service name
-- is written as "<name>-<item>" or just "<name>".
item_name = string.match(metric_name, '^openstack%.([^.]+)%.check_api$')
name, _ = string.match(item_name, '^([^-]+)\-(.*)')
if not name then
name = item_name
end
item_name = read_message('Fields[service]')
top_entry = 'check_api'
group_name = 'endpoint'
-- retrieve the current state
state = utils.check_api_status_to_state_map[value]
-- and always override value to 1
value = 1
end
elseif string.find(metric_name, '_check_api$') then
-- A service can have several API checks; by convention the service name
-- is written as "<name>-<item>" or just "<name>".
name = string.match(read_message('Fields[service]'), '^([^-]+)')
top_entry = 'check_api'
group_name = 'endpoint'
item_name = read_message('Fields[service]')
elseif string.find(metric_name, '^haproxy%.backend') then
-- convert 0/1 value to up/down state
state = utils.check_api_status_to_state_map[value]
-- and always override value to 1
value = 1
elseif metric_name == 'haproxy_backend_servers' then
name = string.match(read_message('Fields[backend]'), '^([^-]+)')
top_entry = 'haproxy'
group_name = 'pool'
item_name = string.match(metric_name, '^haproxy%.backend%.([^.]+)%.servers')
name = string.match(item_name, '^([^-]+)')
item_name = read_message('Fields[backend]')
end
if not name or not item_name then
return -1
end
-- table initialization for the first time we see a service
@@ -165,6 +162,7 @@ function process_message ()
service[invert_state] = {last_seen=ts, value=0, group_name=group_name}
end
end
return 0
end
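The net effect of the rewrite above is that the accumulator no longer parses the state out of the metric-name suffix; it reads Fields[state], Fields[service] and Fields[backend], and only derives the service and group names from the flat metric name. A standalone sketch of that logic, where the utils.state_map values are assumed:

-- Standalone sketch; the real filter gets these values via read_message().
local state_map = { UP = 1, DOWN = 2, DISABLED = 3 }   -- assumed numeric values
local state_label_map = { up = state_map.UP, down = state_map.DOWN, disabled = state_map.DISABLED }

-- Worker/agent counts: service and group come from the metric name.
local metric_name = 'openstack_nova_services'
local name, group_name = string.match(metric_name, '([^_]+)_([^_]+)$')
local state = state_label_map['down']                   -- from Fields[state]
assert(name == 'nova' and group_name == 'services' and state == state_map.DOWN)

-- HAProxy backends: the pool name now travels in Fields[backend].
local backend = 'nova-api'                              -- from Fields[backend]
assert(string.match(backend, '^([^-]+)') == 'nova')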

View File

@@ -0,0 +1,176 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
-- The filter accumulates data into a table and regularly emits one message
-- per service with a payload like this:
-- {
-- "vip_active_at": 1435829917607,
-- "name": "nova",
-- "states": {
-- "check_api":{
-- "nova":{
-- "down":{
-- "value":0,
-- "group_name":"endpoint",
-- "last_seen":1433252000524
-- },
-- "up":{
-- "value":1,
-- "group_name":"endpoint",
-- "last_seen":1433252000524
-- }
-- },
-- ...
-- },
-- "workers":{
-- "scheduler":{
-- "down":{
-- "value":0,
-- "group_name":"services",
-- "last_seen":1433251999229
-- },
-- "disabled":{
-- "value":1,
-- "group_name":"services",
-- "last_seen":1433251999226
-- },
-- "up":{
-- "value":2,
-- "group_name":"services",
-- "last_seen":1433251999227
-- }
-- },
-- ...
-- },
-- "haproxy":{
-- "nova-api":{
-- "down":{
-- "value":0,
-- "group_name":"pool",
-- "last_seen":1433252000957
-- },
-- "up":{
-- "value":3,
-- "group_name":"pool",
-- "last_seen":1433252000954
-- }
-- }
-- }
-- ...
-- }
-- }
require 'cjson'
require 'string'
require 'math'
local floor = math.floor
local utils = require 'lma_utils'
_PRESERVATION_VERSION = 1
-- variables with global scope are preserved between restarts
services = {}
vip_active_at = 0
local payload_name = read_config('inject_payload_name') or 'service_status'
function process_message ()
local ts = floor(read_message("Timestamp")/1e6) -- ms
local metric_name = read_message("Fields[name]")
local value = read_message("Fields[value]")
local name
local top_entry
local item_name
local group_name
local state
if string.find(metric_name, '^pacemaker.resource.vip__public') then
if value == 1 then
vip_active_at = ts
else
vip_active_at = 0
end
return 0
end
if string.find(metric_name, '%.up$') then
state = utils.state_map.UP
elseif string.find(metric_name, '%.down$') then
state = utils.state_map.DOWN
elseif string.find(metric_name, '%.disabled$') then
state = utils.state_map.DISABLED
end
if string.find(metric_name, '^openstack') then
name, group_name, item_name = string.match(metric_name, '^openstack%.([^._]+)%.([^._]+)%.([^._]+)')
top_entry = 'workers'
if not item_name then
-- A service can have several API checks; by convention the service name
-- is written as "<name>-<item>" or just "<name>".
item_name = string.match(metric_name, '^openstack%.([^.]+)%.check_api$')
name, _ = string.match(item_name, '^([^-]+)\-(.*)')
if not name then
name = item_name
end
top_entry = 'check_api'
group_name = 'endpoint'
-- retrieve the current state
state = utils.check_api_status_to_state_map[value]
-- and always override value to 1
value = 1
end
elseif string.find(metric_name, '^haproxy%.backend') then
top_entry = 'haproxy'
group_name = 'pool'
item_name = string.match(metric_name, '^haproxy%.backend%.([^.]+)%.servers')
name = string.match(item_name, '^([^-]+)')
end
if not name or not item_name then
return -1
end
-- table initialization for the first time we see a service
if not services[name] then services[name] = {} end
if not services[name][top_entry] then services[name][top_entry] = {} end
if not services[name][top_entry][item_name] then services[name][top_entry][item_name] = {} end
local service = services[name][top_entry][item_name]
service[state] = {last_seen=ts, value=value, group_name=group_name}
-- To treat check_api results like the other checks, group them by up/down
-- state and reset the counterpart state with value=0
if top_entry == 'check_api' then
local invert_state
if state == utils.state_map.UP then
invert_state = utils.state_map.DOWN
elseif state == utils.state_map.DOWN then
invert_state = utils.state_map.UP
end
if invert_state then
if not service[invert_state] then
service[invert_state] = {}
end
service[invert_state] = {last_seen=ts, value=0, group_name=group_name}
end
end
return 0
end
function timer_event(ns)
for name, states in pairs(services) do
inject_payload('json', payload_name,
cjson.encode({vip_active_at=vip_active_at, name=name, states=states}))
end
end

View File

@@ -14,18 +14,15 @@
require 'cjson'
require 'string'
require 'math'
local floor = math.floor
local max = math.max
local utils = require 'lma_utils'
_PRESERVATION_VERSION = 2
_PRESERVATION_VERSION = 3
-- variables with global scope are preserved between restarts
all_service_status = {}
-- local scope variables
local timeout = read_config("timeout") or 60
local hostname
local datapoints = {}
local timeout = (read_config("timeout") or 60) * 1e9
function process_message ()
local ok, data = pcall(cjson.decode, read_message("Payload"))
@@ -33,8 +30,7 @@ function process_message ()
return -1
end
local timestamp = read_message('Timestamp')
local ts = floor(timestamp/1e6) -- in ms
hostname = read_message("Hostname")
local hostname = read_message("Hostname")
local service_name = data.name
local states = data.states
@@ -49,23 +45,25 @@ function process_message ()
if not all_service_status[service_name] then all_service_status[service_name] = {} end
if states.workers then
worker_status = compute_status(events, not_up_status, ts, 'workers', service_name, states.workers, true)
worker_status = compute_status(events, not_up_status, timestamp, 'workers', service_name, states.workers, true)
end
if states.check_api then
check_api_status = compute_status(events, not_up_status, ts, 'check_api', service_name, states.check_api, false)
check_api_status = compute_status(events, not_up_status, timestamp, 'check_api', service_name, states.check_api, false)
end
if states.haproxy then
haproxy_server_status = compute_status(events, not_up_status, ts, 'haproxy', service_name, states.haproxy, true)
haproxy_server_status = compute_status(events, not_up_status, timestamp, 'haproxy', service_name, states.haproxy, true)
end
global_status = max(worker_status, check_api_status, haproxy_server_status)
-- global service status
utils.add_metric(datapoints,
string.format('%s.openstack.%s.status', hostname, service_name),
{ts, global_status})
utils.add_to_bulk_metric(
string.format('openstack_%s_status', service_name),
global_status
)
utils.inject_bulk_metric(timestamp, hostname, 'service_status_filter')
-- only emit status if the public vip is active
if not expired(ts, data.vip_active_at) then
if not expired(data.vip_active_at) then
local prev = all_service_status[service_name].global_status or utils.global_status_map.UNKNOWN
local updated
updated = (prev ~= global_status or #events > 0)
@@ -76,17 +74,13 @@ function process_message ()
if #events > 0 then
details = cjson.encode(events)
end
utils.inject_status_message(timestamp, service_name,
global_status, prev,
updated, details)
utils.inject_status_message(
timestamp, service_name, global_status, prev, updated, details
)
end
all_service_status[service_name].global_status = global_status
if #datapoints > 0 then
inject_payload("json", "influxdb", cjson.encode(datapoints))
datapoints = {}
end
return 0
end
@@ -183,10 +177,13 @@ function compute_status(events, not_up_status, current_time, elts_name, name, st
utils.service_status_to_label_map[DOWN],
event_detail)
end
utils.add_metric(datapoints, string.format('%s.openstack.%s.%s.%s.status',
hostname, name, worker.group_name, worker_name),
{current_time, utils.service_status_map.DOWN})
utils.add_to_bulk_metric(
string.format('openstack_%s_%s_status', name, worker.group_name),
utils.service_status_map.DOWN,
{ service = worker_name}
)
end
-- elements down or degraded
for worker_name, worker in pairs(down_elts) do
local prev = get_previous_status(name, elts_name, worker_name)
@@ -198,11 +195,11 @@ function compute_status(events, not_up_status, current_time, elts_name, name, st
new_status = utils.service_status_map.DOWN
end
set_status(name, elts_name, worker_name, new_status)
utils.add_metric(datapoints,
string.format("%s.openstack.%s.%s.%s.status",
hostname, name, worker.group_name, worker_name),
{current_time, new_status})
utils.add_to_bulk_metric(
string.format('openstack_%s_%s_status', name, worker.group_name),
new_status,
{ service = worker_name}
)
if display_num then
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
total_elements[worker_name])
@@ -235,17 +232,16 @@ function compute_status(events, not_up_status, current_time, elts_name, name, st
utils.service_status_to_label_map[prev],
utils.service_status_to_label_map[UP])
end
utils.add_metric(datapoints, string.format("%s.openstack.%s.%s.%s.status",
hostname, name, worker.group_name, worker_name),
{current_time, utils.service_status_map.UP})
utils.add_to_bulk_metric(
string.format('openstack_%s_%s_status', name, worker.group_name),
utils.service_status_map.UP,
{ service = worker_name}
)
end
end
return service_status
end
function expired(current_time, last_time)
if last_time > 0 and current_time - last_time <= timeout * 1000 then
return false
end
return true
function expired(last_time)
return not (last_time > 0 and (read_message('Timestamp') - last_time) <= timeout)
end
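The diff above replaces the per-datapoint add_metric/inject_payload("influxdb", ...) path with the bulk-metric helpers and keeps timestamps in nanoseconds throughout. A condensed sketch of the new emission pattern, assuming the Heka sandbox API (read_message, read_config) and the lma_utils helpers used in the diff (add_to_bulk_metric, inject_bulk_metric); metric names and values are illustrative:

-- Sketch only; runs inside a Heka sandbox where read_message/read_config exist.
local utils = require 'lma_utils'

local timestamp = read_message('Timestamp')        -- nanoseconds
local hostname  = read_message('Hostname')

-- Per-item and global statuses are buffered into one bulk metric...
utils.add_to_bulk_metric('openstack_nova_services_status', 0, { service = 'scheduler' })
utils.add_to_bulk_metric('openstack_nova_status', 0)
-- ...and flushed as a single message tagged with the emitting filter.
utils.inject_bulk_metric(timestamp, hostname, 'service_status_filter')

-- With nanosecond timestamps end to end, the timeout is scaled once:
local timeout = (read_config('timeout') or 60) * 1e9
local function expired(last_time)
    return not (last_time > 0 and (read_message('Timestamp') - last_time) <= timeout)
end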

View File

@@ -0,0 +1,251 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
require 'cjson'
require 'string'
require 'math'
local floor = math.floor
local max = math.max
local utils = require 'lma_utils'
_PRESERVATION_VERSION = 2
-- variables with global scope are preserved between restarts
all_service_status = {}
-- local scope variables
local timeout = read_config("timeout") or 60
local hostname
local datapoints = {}
function process_message ()
local ok, data = pcall(cjson.decode, read_message("Payload"))
if not ok then
return -1
end
local timestamp = read_message('Timestamp')
local ts = floor(timestamp/1e6) -- in ms
hostname = read_message("Hostname")
local service_name = data.name
local states = data.states
local worker_status = -1
local check_api_status = -1
local haproxy_server_status = -1
local global_status
local events = {}
local not_up_status = {}
local msg_event
if not all_service_status[service_name] then all_service_status[service_name] = {} end
if states.workers then
worker_status = compute_status(events, not_up_status, ts, 'workers', service_name, states.workers, true)
end
if states.check_api then
check_api_status = compute_status(events, not_up_status, ts, 'check_api', service_name, states.check_api, false)
end
if states.haproxy then
haproxy_server_status = compute_status(events, not_up_status, ts, 'haproxy', service_name, states.haproxy, true)
end
global_status = max(worker_status, check_api_status, haproxy_server_status)
-- global service status
utils.add_metric(datapoints,
string.format('%s.openstack.%s.status', hostname, service_name),
{ts, global_status})
-- only emit status if the public vip is active
if not expired(ts, data.vip_active_at) then
local prev = all_service_status[service_name].global_status or utils.global_status_map.UNKNOWN
local updated
updated = (prev ~= global_status or #events > 0)
-- always append the non-UP status elements to the details
for k, v in pairs(not_up_status) do events[#events+1] = v end
local details = ''
if #events > 0 then
details = cjson.encode(events)
end
utils.inject_status_message(timestamp, service_name,
global_status, prev,
updated, details)
end
all_service_status[service_name].global_status = global_status
if #datapoints > 0 then
inject_payload("json", "influxdb", cjson.encode(datapoints))
datapoints = {}
end
return 0
end
function get_previous_status(service_name, top_entry, name)
if not all_service_status[service_name] then
all_service_status[service_name] = {}
end
if not all_service_status[service_name][top_entry] then
all_service_status[service_name][top_entry] = {}
end
if not all_service_status[service_name][top_entry][name] then
all_service_status[service_name][top_entry][name] = utils.service_status_map.UNKNOWN
end
return all_service_status[service_name][top_entry][name]
end
function set_status(service_name, top_entry, name, status)
all_service_status[service_name][top_entry][name] = status
end
function compute_status(events, not_up_status, current_time, elts_name, name, states, display_num)
local down_elts = {}
local down_elts_count = 0
local zero_up = {}
local zero_up_count = 0
local one_up = {}
local one_disabled = {}
local one_disabled_count = 0
local service_status = utils.service_status_map.UNKNOWN
local up_elements = {}
local total_elements = {}
for worker, worker_data in pairs(states) do
if not total_elements[worker] then
total_elements[worker] = 0
end
if not up_elements[worker] then
up_elements[worker] = 0
end
for state, data in pairs(worker_data) do
if not expired(current_time, data.last_seen) then
total_elements[worker] = total_elements[worker] + data.value
if state == utils.state_map.DOWN and data.value > 0 then
down_elts[worker] = data
down_elts_count = down_elts_count + 1
end
if state == utils.state_map.UP then
if data.value > 0 then
one_up[worker] = data
else
zero_up[worker] = data
zero_up_count = zero_up_count + 1
end
up_elements[worker] = data.value
end
if state == utils.state_map.DISABLED and data.value > 0 then
one_disabled[worker] = data
one_disabled_count = one_disabled_count + 1
end
end
end
end
-- general element status
if zero_up_count > 0 then
service_status = utils.service_status_map.DOWN
elseif down_elts_count > 0 then
service_status = utils.service_status_map.DEGRADED
elseif down_elts_count == 0 then
service_status = utils.service_status_map.UP
end
-- elements clearly down
for worker_name, worker in pairs(zero_up) do
local prev = get_previous_status(name, elts_name, worker_name)
local DOWN = utils.service_status_map.DOWN
local event_detail = ""
set_status(name, elts_name, worker_name, DOWN)
if display_num then
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
total_elements[worker_name])
end
if prev and prev ~= DOWN then
events[#events+1] = string.format("%s %s %s -> %s %s", worker_name,
worker.group_name,
utils.service_status_to_label_map[prev],
utils.service_status_to_label_map[DOWN],
event_detail)
else
not_up_status[#not_up_status+1] = string.format("%s %s %s %s",
worker_name,
worker.group_name,
utils.service_status_to_label_map[DOWN],
event_detail)
end
utils.add_metric(datapoints, string.format('%s.openstack.%s.%s.%s.status',
hostname, name, worker.group_name, worker_name),
{current_time, utils.service_status_map.DOWN})
end
-- elements down or degraded
for worker_name, worker in pairs(down_elts) do
local prev = get_previous_status(name, elts_name, worker_name)
local new_status
local event_detail
if one_up[worker_name] then
new_status = utils.service_status_map.DEGRADED
else
new_status = utils.service_status_map.DOWN
end
set_status(name, elts_name, worker_name, new_status)
utils.add_metric(datapoints,
string.format("%s.openstack.%s.%s.%s.status",
hostname, name, worker.group_name, worker_name),
{current_time, new_status})
if display_num then
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
total_elements[worker_name])
else
event_detail = ""
end
if prev ~= new_status then
events[#events+1] = string.format("%s %s %s -> %s %s", worker_name,
worker.group_name,
utils.service_status_to_label_map[prev],
utils.service_status_to_label_map[new_status],
event_detail)
elseif not zero_up[worker_name] then
not_up_status[#not_up_status+1] = string.format("%s %s %s %s", worker_name,
worker.group_name,
utils.service_status_to_label_map[new_status],
event_detail)
end
end
-- elements up
for worker_name, worker in pairs(one_up) do
if not zero_up[worker_name] and not down_elts[worker_name] then
local prev = get_previous_status(name, elts_name, worker_name)
local UP = utils.service_status_map.UP
set_status(name, elts_name, worker_name, UP)
if prev and prev ~= utils.service_status_map.UP then
events[#events+1] = string.format("%s %s %s -> %s", worker_name,
worker.group_name,
utils.service_status_to_label_map[prev],
utils.service_status_to_label_map[UP])
end
utils.add_metric(datapoints, string.format("%s.openstack.%s.%s.%s.status",
hostname, name, worker.group_name, worker_name),
{current_time, utils.service_status_map.UP})
end
end
return service_status
end
function expired(current_time, last_time)
if last_time > 0 and current_time - last_time <= timeout * 1000 then
return false
end
return true
end

View File

@@ -13,26 +13,26 @@
# under the License.
#
class lma_collector::metrics::service_status (
$metrics_regexp = $lma_collector::params::service_status_metrics_regexp,
$payload_name = $lma_collector::params::service_status_payload_name,
$metrics_matcher = $lma_collector::params::service_status_metrics_matcher,
$timeout = $lma_collector::params::service_status_timeout,
){
include heka::params
) inherits lma_collector::params {
validate_array($metrics_regexp)
validate_string($metrics_matcher)
$payload_name = $lma_collector::params::service_status_payload_name
if (size($metrics_regexp) > 0){
heka::filter::sandbox { 'service_accumulator_states':
config_dir => $lma_collector::params::config_dir,
filename => "${lma_collector::params::plugins_dir}/filters/service_accumulator_states.lua",
message_matcher => inline_template('<%= @metrics_regexp.collect{|x| "Fields[name] =~ /%s/" % x}.join(" || ") %>'),
ticker_interval => $lma_collector::params::service_status_interval,
preserve_data => true,
config => {
config_dir => $lma_collector::params::config_dir,
filename => "${lma_collector::params::plugins_dir}/filters/service_accumulator_states.lua",
message_matcher => $metrics_matcher,
ticker_interval => $lma_collector::params::service_status_interval,
preserve_data => true,
config => {
inject_payload_name => $payload_name,
},
notify => Class['lma_collector::service'],
notify => Class['lma_collector::service'],
}
heka::filter::sandbox { 'service_status':

View File

@@ -0,0 +1,49 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
class lma_collector::metrics::service_status_legacy (
$metrics_regexp = $lma_collector::params::service_status_metrics_regexp_legacy,
$payload_name = $lma_collector::params::service_status_payload_name,
$timeout = $lma_collector::params::service_status_timeout,
){
include heka::params
validate_array($metrics_regexp)
if (size($metrics_regexp) > 0){
heka::filter::sandbox { 'service_accumulator_states':
config_dir => $lma_collector::params::config_dir,
filename => "${lma_collector::params::plugins_dir}/filters/service_accumulator_states_legacy.lua",
message_matcher => inline_template('<%= @metrics_regexp.collect{|x| "Fields[name] =~ /%s/" % x}.join(" || ") %>'),
ticker_interval => $lma_collector::params::service_status_interval,
preserve_data => true,
config => {
inject_payload_name => $payload_name,
},
notify => Class['lma_collector::service'],
}
heka::filter::sandbox { 'service_status':
config_dir => $lma_collector::params::config_dir,
filename => "${lma_collector::params::plugins_dir}/filters/service_status_legacy.lua",
message_matcher => "Fields[payload_type] == 'json' && Fields[payload_name] == '${payload_name}'",
preserve_data => true,
config => {
timeout => $timeout,
},
notify => Class['lma_collector::service'],
}
}
}

View File

@@ -105,15 +105,25 @@ class lma_collector::params {
$service_status_payload_name = 'service_status'
# Catch all metrics used to compute the OpenStack service statuses
$service_status_metrics_regexp = [
'^openstack.(nova|cinder|neutron).(services|agents).*(up|down|disabled)$',
# Exception for the mysqld backend because the MySQL service status is
# computed by a dedicated filter and this avoids sending a spurious
# status Heka message.
'^haproxy.backend.(horizon|nova|cinder|neutron|ceilometer|keystone|swift|heat|glance|radosgw)(-.+)?.servers.(down|up)$',
'^pacemaker.resource.vip__public.active$',
'^openstack.*check_api$'
]
$service_status_metrics_regexp_legacy = [
'^openstack.(nova|cinder|neutron).(services|agents).*(up|down|disabled)$',
# Exception for the mysqld backend because the MySQL service status is
# computed by a dedicated filter and this avoids sending a spurious
# status Heka message.
'^haproxy.backend.(horizon|nova|cinder|neutron|ceilometer|keystone|swift|heat|glance|radosgw)(-.+)?.servers.(down|up)$',
'^pacemaker.resource.vip__public.active$',
'^openstack.*check_api$'
]
$service_status_metrics_matcher = join([
'(Type == \'metric\' || Type == \'heka.sandbox.metric\') && ',
'(Fields[name] =~ /^openstack_(nova|cinder|neutron)_(services|agents)$/ || ',
# Exception for the mysqld backend because the MySQL service status is
# computed by a dedicated filter and this avoids sending a spurious
# status Heka message.
'(Fields[name] == \'haproxy_backend_servers\' && Fields[backend] !~ /mysql/) || ',
'(Fields[name] == \'pacemaker_local_resource_active\' && Fields[resource] == \'vip__public\') || ',
'Fields[name] =~ /^openstack.*check_api$/)'
], '')
$worker_report_interval = 60
$worker_downtime_factor = 2
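For readability, the joined $service_status_metrics_matcher above evaluates to the following single Heka message matcher (line breaks added here only):

(Type == 'metric' || Type == 'heka.sandbox.metric') &&
  (Fields[name] =~ /^openstack_(nova|cinder|neutron)_(services|agents)$/ ||
   (Fields[name] == 'haproxy_backend_servers' && Fields[backend] !~ /mysql/) ||
   (Fields[name] == 'pacemaker_local_resource_active' && Fields[resource] == 'vip__public') ||
   Fields[name] =~ /^openstack.*check_api$/)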