Update service status filter for InfluxDB 0.9
This change modifies the service accumulator and global service status filters to support the new data model implemented for InfluxDB 0.9. Change-Id: Id437d1111660d2d55bb7ba8dc46ed8c8eb8e41b7 Implements: blueprint upgrade-influxdb-grafana
This commit is contained in:
@@ -162,7 +162,12 @@ if $lma_collector['influxdb_mode'] != 'disabled' {
|
||||
}
|
||||
|
||||
# Service status metrics and annotations
|
||||
class { 'lma_collector::metrics::service_status': }
|
||||
if $lma_collector['influxdb_legacy'] {
|
||||
class { 'lma_collector::metrics::service_status_legacy': }
|
||||
} else {
|
||||
class { 'lma_collector::metrics::service_status': }
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
$alerting_mode = $lma_collector['alerting_mode']
|
||||
|
||||
@@ -157,6 +157,11 @@ function process_message ()
|
||||
elseif metric_source == 'check_openstack_api' then
|
||||
-- For OpenStack API metrics, plugin_instance = <service name>
|
||||
msg['Fields']['name'] = 'openstack' .. sep .. sample['plugin_instance'] .. sep .. 'check_api'
|
||||
-- 'service' isn't used as a tag because there is no point to
|
||||
-- aggregate data across all services. It is stored in the
|
||||
-- Fields table though because it simplifies the life of
|
||||
-- downstream filters consuming this data.
|
||||
msg['Fields']['service'] = sample['plugin_instance']
|
||||
if sample['type_instance'] ~= nil and sample['type_instance'] ~= '' then
|
||||
msg['Fields']['os_region'] = sample['type_instance']
|
||||
end
|
||||
@@ -270,7 +275,9 @@ function process_message ()
|
||||
msg['Fields']['service'] = service
|
||||
msg['Fields']['state'] = state
|
||||
elseif metric_source == 'pacemaker_resource' then
|
||||
msg['Fields']['name'] = 'pacemaker_resource' .. sep .. sample['type_instance'] .. sep .. 'active'
|
||||
msg['Fields']['name'] = 'pacemaker_local_resource_active'
|
||||
msg['Fields']['tag_fields'] = { 'resource' }
|
||||
msg['Fields']['resource'] = sample['type_instance']
|
||||
elseif metric_source == 'users' then
|
||||
-- 'users' is a reserved name for InfluxDB v0.9
|
||||
msg['Fields']['name'] = 'logged_users'
|
||||
|
||||
@@ -77,24 +77,29 @@ require 'math'
|
||||
local floor = math.floor
|
||||
local utils = require 'lma_utils'
|
||||
|
||||
_PRESERVATION_VERSION = 1
|
||||
_PRESERVATION_VERSION = 2
|
||||
-- variables with global scope are preserved between restarts
|
||||
services = {}
|
||||
vip_active_at = 0
|
||||
|
||||
local payload_name = read_config('inject_payload_name') or 'service_status'
|
||||
|
||||
local state_label_map = {
|
||||
up = utils.state_map.UP,
|
||||
down = utils.state_map.DOWN,
|
||||
disabled = utils.state_map.DISABLED,
|
||||
}
|
||||
function process_message ()
|
||||
local ts = floor(read_message("Timestamp")/1e6) -- ms
|
||||
local ts = read_message("Timestamp")
|
||||
local metric_name = read_message("Fields[name]")
|
||||
local value = read_message("Fields[value]")
|
||||
local state = state_label_map[read_message('Fields[state]')]
|
||||
local name
|
||||
local top_entry
|
||||
local item_name
|
||||
local group_name
|
||||
local state
|
||||
|
||||
if string.find(metric_name, '^pacemaker.resource.vip__public') then
|
||||
if metric_name == 'pacemaker_local_resource_active' and read_message("Fields[resource]") == 'vip__public' then
|
||||
if value == 1 then
|
||||
vip_active_at = ts
|
||||
else
|
||||
@@ -103,42 +108,34 @@ function process_message ()
|
||||
return 0
|
||||
end
|
||||
|
||||
if string.find(metric_name, '%.up$') then
|
||||
state = utils.state_map.UP
|
||||
elseif string.find(metric_name, '%.down$') then
|
||||
state = utils.state_map.DOWN
|
||||
elseif string.find(metric_name, '%.disabled$') then
|
||||
state = utils.state_map.DISABLED
|
||||
end
|
||||
|
||||
if string.find(metric_name, '^openstack') then
|
||||
name, group_name, item_name = string.match(metric_name, '^openstack%.([^._]+)%.([^._]+)%.([^._]+)')
|
||||
if string.find(metric_name, '^openstack_([^._]+)_services$') or string.find(metric_name, '^openstack_([^._]+)_agents$') then
|
||||
name, group_name = string.match(metric_name, '([^_]+)_([^_]+)$')
|
||||
top_entry = 'workers'
|
||||
if not item_name then
|
||||
-- A service can have several API checks, by convention the service name
|
||||
-- is written down "<name>-<item>" or just "<name>".
|
||||
item_name = string.match(metric_name, '^openstack%.([^.]+)%.check_api$')
|
||||
name, _ = string.match(item_name, '^([^-]+)\-(.*)')
|
||||
if not name then
|
||||
name = item_name
|
||||
end
|
||||
item_name = read_message('Fields[service]')
|
||||
|
||||
top_entry = 'check_api'
|
||||
group_name = 'endpoint'
|
||||
-- retrieve the current state
|
||||
state = utils.check_api_status_to_state_map[value]
|
||||
-- and always override value to 1
|
||||
value = 1
|
||||
end
|
||||
elseif string.find(metric_name, '_check_api$') then
|
||||
-- A service can have several API checks, by convention the service name
|
||||
-- is written down "<name>-<item>" or just "<name>".
|
||||
name = string.match(read_message('Fields[service]'), '^([^-]+)')
|
||||
top_entry = 'check_api'
|
||||
group_name = 'endpoint'
|
||||
item_name = read_message('Fields[service]')
|
||||
|
||||
elseif string.find(metric_name, '^haproxy%.backend') then
|
||||
-- convert 0/1 value to up/down state
|
||||
state = utils.check_api_status_to_state_map[value]
|
||||
-- and always override value to 1
|
||||
value = 1
|
||||
|
||||
elseif metric_name == 'haproxy_backend_servers' then
|
||||
name = string.match(read_message('Fields[backend]'), '^([^-]+)')
|
||||
top_entry = 'haproxy'
|
||||
group_name = 'pool'
|
||||
item_name = string.match(metric_name, '^haproxy%.backend%.([^.]+)%.servers')
|
||||
name = string.match(item_name, '^([^-]+)')
|
||||
item_name = read_message('Fields[backend]')
|
||||
|
||||
end
|
||||
|
||||
if not name or not item_name then
|
||||
return -1
|
||||
return -1
|
||||
end
|
||||
|
||||
-- table initialization for the first time we see a service
|
||||
@@ -165,6 +162,7 @@ function process_message ()
|
||||
service[invert_state] = {last_seen=ts, value=0, group_name=group_name}
|
||||
end
|
||||
end
|
||||
|
||||
return 0
|
||||
end
|
||||
|
||||
|
||||
@@ -0,0 +1,176 @@
|
||||
-- Copyright 2015 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
-- The filter accumulates data into a table and emits regularly a message per
|
||||
-- service with a payload like this:
|
||||
-- {
|
||||
-- "vip_active_at": 1435829917607,
|
||||
-- "name": "nova",
|
||||
-- "states": {
|
||||
-- "check_api":{
|
||||
-- "nova":{
|
||||
-- "down":{
|
||||
-- "value":0,
|
||||
-- "group_name":"endpoint",
|
||||
-- "last_seen":1433252000524
|
||||
-- },
|
||||
-- "up":{
|
||||
-- "value":1,
|
||||
-- "group_name":"endpoint",
|
||||
-- "last_seen":1433252000524
|
||||
-- }
|
||||
-- },
|
||||
-- ...
|
||||
-- },
|
||||
-- "workers":{
|
||||
-- "scheduler":{
|
||||
-- "down":{
|
||||
-- "value":0,
|
||||
-- "group_name":"services",
|
||||
-- "last_seen":1433251999229
|
||||
-- },
|
||||
-- "disabled":{
|
||||
-- "value":1,
|
||||
-- "group_name":"services",
|
||||
-- "last_seen":1433251999226
|
||||
-- },
|
||||
-- "up":{
|
||||
-- "value":2,
|
||||
-- "group_name":"services",
|
||||
-- "last_seen":1433251999227
|
||||
-- }
|
||||
-- },
|
||||
-- ...
|
||||
-- },
|
||||
-- "haproxy":{
|
||||
-- "nova-api":{
|
||||
-- "down":{
|
||||
-- "value":0,
|
||||
-- "group_name":"pool",
|
||||
-- "last_seen":1433252000957
|
||||
-- },
|
||||
-- "up":{
|
||||
-- "value":3,
|
||||
-- "group_name":"pool",
|
||||
-- "last_seen":1433252000954
|
||||
-- }
|
||||
-- }
|
||||
-- }
|
||||
-- ...
|
||||
-- }
|
||||
-- }
|
||||
|
||||
require 'cjson'
|
||||
require 'string'
|
||||
require 'math'
|
||||
local floor = math.floor
|
||||
local utils = require 'lma_utils'
|
||||
|
||||
_PRESERVATION_VERSION = 1
|
||||
-- variables with global scope are preserved between restarts
|
||||
services = {}
|
||||
vip_active_at = 0
|
||||
|
||||
local payload_name = read_config('inject_payload_name') or 'service_status'
|
||||
|
||||
function process_message ()
|
||||
local ts = floor(read_message("Timestamp")/1e6) -- ms
|
||||
local metric_name = read_message("Fields[name]")
|
||||
local value = read_message("Fields[value]")
|
||||
local name
|
||||
local top_entry
|
||||
local item_name
|
||||
local group_name
|
||||
local state
|
||||
|
||||
if string.find(metric_name, '^pacemaker.resource.vip__public') then
|
||||
if value == 1 then
|
||||
vip_active_at = ts
|
||||
else
|
||||
vip_active_at = 0
|
||||
end
|
||||
return 0
|
||||
end
|
||||
|
||||
if string.find(metric_name, '%.up$') then
|
||||
state = utils.state_map.UP
|
||||
elseif string.find(metric_name, '%.down$') then
|
||||
state = utils.state_map.DOWN
|
||||
elseif string.find(metric_name, '%.disabled$') then
|
||||
state = utils.state_map.DISABLED
|
||||
end
|
||||
|
||||
if string.find(metric_name, '^openstack') then
|
||||
name, group_name, item_name = string.match(metric_name, '^openstack%.([^._]+)%.([^._]+)%.([^._]+)')
|
||||
top_entry = 'workers'
|
||||
if not item_name then
|
||||
-- A service can have several API checks, by convention the service name
|
||||
-- is written down "<name>-<item>" or just "<name>".
|
||||
item_name = string.match(metric_name, '^openstack%.([^.]+)%.check_api$')
|
||||
name, _ = string.match(item_name, '^([^-]+)\-(.*)')
|
||||
if not name then
|
||||
name = item_name
|
||||
end
|
||||
|
||||
top_entry = 'check_api'
|
||||
group_name = 'endpoint'
|
||||
-- retrieve the current state
|
||||
state = utils.check_api_status_to_state_map[value]
|
||||
-- and always override value to 1
|
||||
value = 1
|
||||
end
|
||||
|
||||
elseif string.find(metric_name, '^haproxy%.backend') then
|
||||
top_entry = 'haproxy'
|
||||
group_name = 'pool'
|
||||
item_name = string.match(metric_name, '^haproxy%.backend%.([^.]+)%.servers')
|
||||
name = string.match(item_name, '^([^-]+)')
|
||||
end
|
||||
if not name or not item_name then
|
||||
return -1
|
||||
end
|
||||
|
||||
-- table initialization for the first time we see a service
|
||||
if not services[name] then services[name] = {} end
|
||||
if not services[name][top_entry] then services[name][top_entry] = {} end
|
||||
if not services[name][top_entry][item_name] then services[name][top_entry][item_name] = {} end
|
||||
|
||||
local service = services[name][top_entry][item_name]
|
||||
service[state] = {last_seen=ts, value=value, group_name=group_name}
|
||||
|
||||
-- In the logic to treat check_api results like others, group by up/down
|
||||
-- and reset the counterpart w/ value=0
|
||||
if top_entry == 'check_api' then
|
||||
local invert_state
|
||||
if state == utils.state_map.UP then
|
||||
invert_state = utils.state_map.DOWN
|
||||
elseif state == utils.state_map.DOWN then
|
||||
invert_state = utils.state_map.UP
|
||||
end
|
||||
if invert_state then
|
||||
if not service[invert_state] then
|
||||
service[invert_state] = {}
|
||||
end
|
||||
service[invert_state] = {last_seen=ts, value=0, group_name=group_name}
|
||||
end
|
||||
end
|
||||
return 0
|
||||
end
|
||||
|
||||
function timer_event(ns)
|
||||
for name, states in pairs(services) do
|
||||
inject_payload('json', payload_name,
|
||||
cjson.encode({vip_active_at=vip_active_at, name=name, states=states}))
|
||||
end
|
||||
end
|
||||
@@ -14,18 +14,15 @@
|
||||
require 'cjson'
|
||||
require 'string'
|
||||
require 'math'
|
||||
local floor = math.floor
|
||||
local max = math.max
|
||||
local utils = require 'lma_utils'
|
||||
|
||||
_PRESERVATION_VERSION = 2
|
||||
_PRESERVATION_VERSION = 3
|
||||
-- variables with global scope are preserved between restarts
|
||||
all_service_status = {}
|
||||
|
||||
-- local scope variables
|
||||
local timeout = read_config("timeout") or 60
|
||||
local hostname
|
||||
local datapoints = {}
|
||||
local timeout = (read_config("timeout") or 60) * 1e9
|
||||
|
||||
function process_message ()
|
||||
local ok, data = pcall(cjson.decode, read_message("Payload"))
|
||||
@@ -33,8 +30,7 @@ function process_message ()
|
||||
return -1
|
||||
end
|
||||
local timestamp = read_message('Timestamp')
|
||||
local ts = floor(timestamp/1e6) -- in ms
|
||||
hostname = read_message("Hostname")
|
||||
local hostname = read_message("Hostname")
|
||||
local service_name = data.name
|
||||
local states = data.states
|
||||
|
||||
@@ -49,23 +45,25 @@ function process_message ()
|
||||
if not all_service_status[service_name] then all_service_status[service_name] = {} end
|
||||
|
||||
if states.workers then
|
||||
worker_status = compute_status(events, not_up_status, ts, 'workers', service_name, states.workers, true)
|
||||
worker_status = compute_status(events, not_up_status, timestamp, 'workers', service_name, states.workers, true)
|
||||
end
|
||||
|
||||
if states.check_api then
|
||||
check_api_status = compute_status(events, not_up_status, ts, 'check_api', service_name, states.check_api, false)
|
||||
check_api_status = compute_status(events, not_up_status, timestamp, 'check_api', service_name, states.check_api, false)
|
||||
end
|
||||
if states.haproxy then
|
||||
haproxy_server_status = compute_status(events, not_up_status, ts, 'haproxy', service_name, states.haproxy, true)
|
||||
haproxy_server_status = compute_status(events, not_up_status, timestamp, 'haproxy', service_name, states.haproxy, true)
|
||||
end
|
||||
global_status = max(worker_status, check_api_status, haproxy_server_status)
|
||||
-- global service status
|
||||
utils.add_metric(datapoints,
|
||||
string.format('%s.openstack.%s.status', hostname, service_name),
|
||||
{ts, global_status})
|
||||
utils.add_to_bulk_metric(
|
||||
string.format('openstack_%s_status', service_name),
|
||||
global_status
|
||||
)
|
||||
utils.inject_bulk_metric(timestamp, hostname, 'service_status_filter')
|
||||
|
||||
-- only emit status if the public vip is active
|
||||
if not expired(ts, data.vip_active_at) then
|
||||
if not expired(data.vip_active_at) then
|
||||
local prev = all_service_status[service_name].global_status or utils.global_status_map.UNKNOWN
|
||||
local updated
|
||||
updated = (prev ~= global_status or #events > 0)
|
||||
@@ -76,17 +74,13 @@ function process_message ()
|
||||
if #events > 0 then
|
||||
details = cjson.encode(events)
|
||||
end
|
||||
utils.inject_status_message(timestamp, service_name,
|
||||
global_status, prev,
|
||||
updated, details)
|
||||
utils.inject_status_message(
|
||||
timestamp, service_name, global_status, prev, updated, details
|
||||
)
|
||||
end
|
||||
|
||||
all_service_status[service_name].global_status = global_status
|
||||
|
||||
if #datapoints > 0 then
|
||||
inject_payload("json", "influxdb", cjson.encode(datapoints))
|
||||
datapoints = {}
|
||||
end
|
||||
return 0
|
||||
end
|
||||
|
||||
@@ -183,10 +177,13 @@ function compute_status(events, not_up_status, current_time, elts_name, name, st
|
||||
utils.service_status_to_label_map[DOWN],
|
||||
event_detail)
|
||||
end
|
||||
utils.add_metric(datapoints, string.format('%s.openstack.%s.%s.%s.status',
|
||||
hostname, name, worker.group_name, worker_name),
|
||||
{current_time, utils.service_status_map.DOWN})
|
||||
utils.add_to_bulk_metric(
|
||||
string.format('openstack_%s_%s_status', name, worker.group_name),
|
||||
utils.service_status_map.DOWN,
|
||||
{ service = worker_name}
|
||||
)
|
||||
end
|
||||
|
||||
-- elements down or degraded
|
||||
for worker_name, worker in pairs(down_elts) do
|
||||
local prev = get_previous_status(name, elts_name, worker_name)
|
||||
@@ -198,11 +195,11 @@ function compute_status(events, not_up_status, current_time, elts_name, name, st
|
||||
new_status = utils.service_status_map.DOWN
|
||||
end
|
||||
set_status(name, elts_name, worker_name, new_status)
|
||||
utils.add_metric(datapoints,
|
||||
string.format("%s.openstack.%s.%s.%s.status",
|
||||
hostname, name, worker.group_name, worker_name),
|
||||
{current_time, new_status})
|
||||
|
||||
utils.add_to_bulk_metric(
|
||||
string.format('openstack_%s_%s_status', name, worker.group_name),
|
||||
new_status,
|
||||
{ service = worker_name}
|
||||
)
|
||||
if display_num then
|
||||
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
|
||||
total_elements[worker_name])
|
||||
@@ -235,17 +232,16 @@ function compute_status(events, not_up_status, current_time, elts_name, name, st
|
||||
utils.service_status_to_label_map[prev],
|
||||
utils.service_status_to_label_map[UP])
|
||||
end
|
||||
utils.add_metric(datapoints, string.format("%s.openstack.%s.%s.%s.status",
|
||||
hostname, name, worker.group_name, worker_name),
|
||||
{current_time, utils.service_status_map.UP})
|
||||
utils.add_to_bulk_metric(
|
||||
string.format('openstack_%s_%s_status', name, worker.group_name),
|
||||
utils.service_status_map.UP,
|
||||
{ service = worker_name}
|
||||
)
|
||||
end
|
||||
end
|
||||
return service_status
|
||||
end
|
||||
|
||||
function expired(current_time, last_time)
|
||||
if last_time > 0 and current_time - last_time <= timeout * 1000 then
|
||||
return false
|
||||
end
|
||||
return true
|
||||
function expired(last_time)
|
||||
return not (last_time > 0 and (read_message('Timestamp') - last_time) <= timeout)
|
||||
end
|
||||
|
||||
@@ -0,0 +1,251 @@
|
||||
-- Copyright 2015 Mirantis, Inc.
|
||||
--
|
||||
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||
-- you may not use this file except in compliance with the License.
|
||||
-- You may obtain a copy of the License at
|
||||
--
|
||||
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||
--
|
||||
-- Unless required by applicable law or agreed to in writing, software
|
||||
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
require 'cjson'
|
||||
require 'string'
|
||||
require 'math'
|
||||
local floor = math.floor
|
||||
local max = math.max
|
||||
local utils = require 'lma_utils'
|
||||
|
||||
_PRESERVATION_VERSION = 2
|
||||
-- variables with global scope are preserved between restarts
|
||||
all_service_status = {}
|
||||
|
||||
-- local scope variables
|
||||
local timeout = read_config("timeout") or 60
|
||||
local hostname
|
||||
local datapoints = {}
|
||||
|
||||
function process_message ()
|
||||
local ok, data = pcall(cjson.decode, read_message("Payload"))
|
||||
if not ok then
|
||||
return -1
|
||||
end
|
||||
local timestamp = read_message('Timestamp')
|
||||
local ts = floor(timestamp/1e6) -- in ms
|
||||
hostname = read_message("Hostname")
|
||||
local service_name = data.name
|
||||
local states = data.states
|
||||
|
||||
local worker_status = -1
|
||||
local check_api_status = -1
|
||||
local haproxy_server_status = -1
|
||||
local global_status
|
||||
local events = {}
|
||||
local not_up_status = {}
|
||||
local msg_event
|
||||
|
||||
if not all_service_status[service_name] then all_service_status[service_name] = {} end
|
||||
|
||||
if states.workers then
|
||||
worker_status = compute_status(events, not_up_status, ts, 'workers', service_name, states.workers, true)
|
||||
end
|
||||
|
||||
if states.check_api then
|
||||
check_api_status = compute_status(events, not_up_status, ts, 'check_api', service_name, states.check_api, false)
|
||||
end
|
||||
if states.haproxy then
|
||||
haproxy_server_status = compute_status(events, not_up_status, ts, 'haproxy', service_name, states.haproxy, true)
|
||||
end
|
||||
global_status = max(worker_status, check_api_status, haproxy_server_status)
|
||||
-- global service status
|
||||
utils.add_metric(datapoints,
|
||||
string.format('%s.openstack.%s.status', hostname, service_name),
|
||||
{ts, global_status})
|
||||
|
||||
-- only emit status if the public vip is active
|
||||
if not expired(ts, data.vip_active_at) then
|
||||
local prev = all_service_status[service_name].global_status or utils.global_status_map.UNKNOWN
|
||||
local updated
|
||||
updated = (prev ~= global_status or #events > 0)
|
||||
-- always append not UP status elements in details
|
||||
for k, v in pairs(not_up_status) do events[#events+1] = v end
|
||||
|
||||
local details = ''
|
||||
if #events > 0 then
|
||||
details = cjson.encode(events)
|
||||
end
|
||||
utils.inject_status_message(timestamp, service_name,
|
||||
global_status, prev,
|
||||
updated, details)
|
||||
end
|
||||
|
||||
all_service_status[service_name].global_status = global_status
|
||||
|
||||
if #datapoints > 0 then
|
||||
inject_payload("json", "influxdb", cjson.encode(datapoints))
|
||||
datapoints = {}
|
||||
end
|
||||
return 0
|
||||
end
|
||||
|
||||
function get_previous_status(service_name, top_entry, name)
|
||||
if not all_service_status[service_name] then
|
||||
all_service_status[service_name] = {}
|
||||
end
|
||||
if not all_service_status[service_name][top_entry] then
|
||||
all_service_status[service_name][top_entry] = {}
|
||||
end
|
||||
if not all_service_status[service_name][top_entry][name] then
|
||||
all_service_status[service_name][top_entry][name] = utils.service_status_map.UNKNOWN
|
||||
end
|
||||
return all_service_status[service_name][top_entry][name]
|
||||
end
|
||||
|
||||
function set_status(service_name, top_entry, name, status)
|
||||
all_service_status[service_name][top_entry][name] = status
|
||||
end
|
||||
|
||||
function compute_status(events, not_up_status, current_time, elts_name, name, states, display_num)
|
||||
local down_elts = {}
|
||||
local down_elts_count = 0
|
||||
local zero_up = {}
|
||||
local zero_up_count = 0
|
||||
local one_up = {}
|
||||
local one_disabled = {}
|
||||
local one_disabled_count = 0
|
||||
local service_status = utils.service_status_map.UNKNOWN
|
||||
local up_elements = {}
|
||||
local total_elements = {}
|
||||
|
||||
for worker, worker_data in pairs(states) do
|
||||
if not total_elements[worker] then
|
||||
total_elements[worker] = 0
|
||||
end
|
||||
if not up_elements[worker] then
|
||||
up_elements[worker] = 0
|
||||
end
|
||||
for state, data in pairs(worker_data) do
|
||||
if not expired(current_time, data.last_seen) then
|
||||
total_elements[worker] = total_elements[worker] + data.value
|
||||
if state == utils.state_map.DOWN and data.value > 0 then
|
||||
down_elts[worker] = data
|
||||
down_elts_count = down_elts_count + 1
|
||||
end
|
||||
if state == utils.state_map.UP then
|
||||
if data.value > 0 then
|
||||
one_up[worker] = data
|
||||
else
|
||||
zero_up[worker] = data
|
||||
zero_up_count = zero_up_count + 1
|
||||
end
|
||||
up_elements[worker] = data.value
|
||||
end
|
||||
if state == utils.state_map.DISABLED and data.value > 0 then
|
||||
one_disabled[worker] = data
|
||||
one_disabled_count = one_disabled_count + 1
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
-- general element status
|
||||
if zero_up_count > 0 then
|
||||
service_status = utils.service_status_map.DOWN
|
||||
elseif down_elts_count > 0 then
|
||||
service_status = utils.service_status_map.DEGRADED
|
||||
elseif down_elts_count == 0 then
|
||||
service_status = utils.service_status_map.UP
|
||||
end
|
||||
|
||||
-- elements clearly down
|
||||
for worker_name, worker in pairs(zero_up) do
|
||||
local prev = get_previous_status(name, elts_name, worker_name)
|
||||
local DOWN = utils.service_status_map.DOWN
|
||||
local event_detail = ""
|
||||
set_status(name, elts_name, worker_name, DOWN)
|
||||
if display_num then
|
||||
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
|
||||
total_elements[worker_name])
|
||||
end
|
||||
|
||||
if prev and prev ~= DOWN then
|
||||
events[#events+1] = string.format("%s %s %s -> %s %s", worker_name,
|
||||
worker.group_name,
|
||||
utils.service_status_to_label_map[prev],
|
||||
utils.service_status_to_label_map[DOWN],
|
||||
event_detail)
|
||||
|
||||
else
|
||||
not_up_status[#not_up_status+1] = string.format("%s %s %s %s",
|
||||
worker_name,
|
||||
worker.group_name,
|
||||
utils.service_status_to_label_map[DOWN],
|
||||
event_detail)
|
||||
end
|
||||
utils.add_metric(datapoints, string.format('%s.openstack.%s.%s.%s.status',
|
||||
hostname, name, worker.group_name, worker_name),
|
||||
{current_time, utils.service_status_map.DOWN})
|
||||
end
|
||||
-- elements down or degraded
|
||||
for worker_name, worker in pairs(down_elts) do
|
||||
local prev = get_previous_status(name, elts_name, worker_name)
|
||||
local new_status
|
||||
local event_detail
|
||||
if one_up[worker_name] then
|
||||
new_status = utils.service_status_map.DEGRADED
|
||||
else
|
||||
new_status = utils.service_status_map.DOWN
|
||||
end
|
||||
set_status(name, elts_name, worker_name, new_status)
|
||||
utils.add_metric(datapoints,
|
||||
string.format("%s.openstack.%s.%s.%s.status",
|
||||
hostname, name, worker.group_name, worker_name),
|
||||
{current_time, new_status})
|
||||
|
||||
if display_num then
|
||||
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
|
||||
total_elements[worker_name])
|
||||
else
|
||||
event_detail = ""
|
||||
end
|
||||
if prev ~= new_status then
|
||||
events[#events+1] = string.format("%s %s %s -> %s %s", worker_name,
|
||||
worker.group_name,
|
||||
utils.service_status_to_label_map[prev],
|
||||
utils.service_status_to_label_map[new_status],
|
||||
event_detail)
|
||||
elseif not zero_up[worker_name] then
|
||||
not_up_status[#not_up_status+1] = string.format("%s %s %s %s", worker_name,
|
||||
worker.group_name,
|
||||
utils.service_status_to_label_map[new_status],
|
||||
event_detail)
|
||||
end
|
||||
end
|
||||
|
||||
-- elements up
|
||||
for worker_name, worker in pairs(one_up) do
|
||||
if not zero_up[worker_name] and not down_elts[worker_name] then
|
||||
local prev = get_previous_status(name, elts_name, worker_name)
|
||||
local UP = utils.service_status_map.UP
|
||||
set_status(name, elts_name, worker_name, UP)
|
||||
if prev and prev ~= utils.service_status_map.UP then
|
||||
events[#events+1] = string.format("%s %s %s -> %s", worker_name,
|
||||
worker.group_name,
|
||||
utils.service_status_to_label_map[prev],
|
||||
utils.service_status_to_label_map[UP])
|
||||
end
|
||||
utils.add_metric(datapoints, string.format("%s.openstack.%s.%s.%s.status",
|
||||
hostname, name, worker.group_name, worker_name),
|
||||
{current_time, utils.service_status_map.UP})
|
||||
end
|
||||
end
|
||||
return service_status
|
||||
end
|
||||
|
||||
function expired(current_time, last_time)
|
||||
if last_time > 0 and current_time - last_time <= timeout * 1000 then
|
||||
return false
|
||||
end
|
||||
return true
|
||||
end
|
||||
@@ -13,26 +13,26 @@
|
||||
# under the License.
|
||||
#
|
||||
class lma_collector::metrics::service_status (
|
||||
$metrics_regexp = $lma_collector::params::service_status_metrics_regexp,
|
||||
$payload_name = $lma_collector::params::service_status_payload_name,
|
||||
$metrics_matcher = $lma_collector::params::service_status_metrics_matcher,
|
||||
$timeout = $lma_collector::params::service_status_timeout,
|
||||
){
|
||||
include heka::params
|
||||
) inherits lma_collector::params {
|
||||
|
||||
validate_array($metrics_regexp)
|
||||
validate_string($metrics_regexp)
|
||||
|
||||
$payload_name = $lma_collector::params::service_status_payload_name
|
||||
|
||||
if (size(metrics_regexp) > 0){
|
||||
|
||||
heka::filter::sandbox { 'service_accumulator_states':
|
||||
config_dir => $lma_collector::params::config_dir,
|
||||
filename => "${lma_collector::params::plugins_dir}/filters/service_accumulator_states.lua",
|
||||
message_matcher => inline_template('<%= @metrics_regexp.collect{|x| "Fields[name] =~ /%s/" % x}.join(" || ") %>'),
|
||||
ticker_interval => $lma_collector::params::service_status_interval,
|
||||
preserve_data => true,
|
||||
config => {
|
||||
config_dir => $lma_collector::params::config_dir,
|
||||
filename => "${lma_collector::params::plugins_dir}/filters/service_accumulator_states.lua",
|
||||
message_matcher => $metrics_matcher,
|
||||
ticker_interval => $lma_collector::params::service_status_interval,
|
||||
preserve_data => true,
|
||||
config => {
|
||||
inject_payload_name => $payload_name,
|
||||
},
|
||||
notify => Class['lma_collector::service'],
|
||||
notify => Class['lma_collector::service'],
|
||||
}
|
||||
|
||||
heka::filter::sandbox { 'service_status':
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
# Copyright 2015 Mirantis, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
class lma_collector::metrics::service_status_legacy (
|
||||
$metrics_regexp = $lma_collector::params::service_status_metrics_regexp_legacy,
|
||||
$payload_name = $lma_collector::params::service_status_payload_name,
|
||||
$timeout = $lma_collector::params::service_status_timeout,
|
||||
){
|
||||
include heka::params
|
||||
|
||||
validate_array($metrics_regexp)
|
||||
|
||||
if (size(metrics_regexp) > 0){
|
||||
|
||||
heka::filter::sandbox { 'service_accumulator_states':
|
||||
config_dir => $lma_collector::params::config_dir,
|
||||
filename => "${lma_collector::params::plugins_dir}/filters/service_accumulator_states_legacy.lua",
|
||||
message_matcher => inline_template('<%= @metrics_regexp.collect{|x| "Fields[name] =~ /%s/" % x}.join(" || ") %>'),
|
||||
ticker_interval => $lma_collector::params::service_status_interval,
|
||||
preserve_data => true,
|
||||
config => {
|
||||
inject_payload_name => $payload_name,
|
||||
},
|
||||
notify => Class['lma_collector::service'],
|
||||
}
|
||||
|
||||
heka::filter::sandbox { 'service_status':
|
||||
config_dir => $lma_collector::params::config_dir,
|
||||
filename => "${lma_collector::params::plugins_dir}/filters/service_status_legacy.lua",
|
||||
message_matcher => "Fields[payload_type] == 'json' && Fields[payload_name] == '${payload_name}'",
|
||||
preserve_data => true,
|
||||
config => {
|
||||
timeout => $timeout,
|
||||
},
|
||||
notify => Class['lma_collector::service'],
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -105,15 +105,25 @@ class lma_collector::params {
|
||||
$service_status_payload_name = 'service_status'
|
||||
|
||||
# Catch all metrics used to compute OpenStack service statutes
|
||||
$service_status_metrics_regexp = [
|
||||
'^openstack.(nova|cinder|neutron).(services|agents).*(up|down|disabled)$',
|
||||
# Exception for mysqld backend because the MySQL service status is
|
||||
# computed by a dedicated filter and this avoids to send an annoying
|
||||
# status Heka message.
|
||||
'^haproxy.backend.(horizon|nova|cinder|neutron|ceilometer|keystone|swift|heat|glance|radosgw)(-.+)?.servers.(down|up)$',
|
||||
'^pacemaker.resource.vip__public.active$',
|
||||
'^openstack.*check_api$'
|
||||
]
|
||||
$service_status_metrics_regexp_legacy = [
|
||||
'^openstack.(nova|cinder|neutron).(services|agents).*(up|down|disabled)$',
|
||||
# Exception for mysqld backend because the MySQL service status is
|
||||
# computed by a dedicated filter and this avoids to send an annoying
|
||||
# status Heka message.
|
||||
'^haproxy.backend.(horizon|nova|cinder|neutron|ceilometer|keystone|swift|heat|glance|radosgw)(-.+)?.servers.(down|up)$',
|
||||
'^pacemaker.resource.vip__public.active$',
|
||||
'^openstack.*check_api$'
|
||||
]
|
||||
$service_status_metrics_matcher = join([
|
||||
'(Type == \'metric\' || Type == \'heka.sandbox.metric\') && ',
|
||||
'(Fields[name] =~ /^openstack_(nova|cinder|neutron)_(services|agents)$/ || ',
|
||||
# Exception for mysqld backend because the MySQL service status is
|
||||
# computed by a dedicated filter and this avoids to send an annoying
|
||||
# status Heka message.
|
||||
'(Fields[name] == \'haproxy_backend_servers\' && Fields[backend] !~ /mysql/) || ',
|
||||
'(Fields[name] == \'pacemaker_local_resource_active\' && Fields[resource] == \'vip__public\') || ',
|
||||
'Fields[name] =~ /^openstack.*check_api$/)'
|
||||
], '')
|
||||
$worker_report_interval = 60
|
||||
$worker_downtime_factor = 2
|
||||
|
||||
|
||||
Reference in New Issue
Block a user