Update service status filter for InfluxDB 0.9
This change modifies the service accumulator and global service status filters to support the new data model implemented for InfluxDB 0.9. Change-Id: Id437d1111660d2d55bb7ba8dc46ed8c8eb8e41b7 Implements: blueprint upgrade-influxdb-grafana
This commit is contained in:
@@ -162,9 +162,14 @@ if $lma_collector['influxdb_mode'] != 'disabled' {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Service status metrics and annotations
|
# Service status metrics and annotations
|
||||||
|
if $lma_collector['influxdb_legacy'] {
|
||||||
|
class { 'lma_collector::metrics::service_status_legacy': }
|
||||||
|
} else {
|
||||||
class { 'lma_collector::metrics::service_status': }
|
class { 'lma_collector::metrics::service_status': }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
$alerting_mode = $lma_collector['alerting_mode']
|
$alerting_mode = $lma_collector['alerting_mode']
|
||||||
if $alerting_mode != 'disabled' {
|
if $alerting_mode != 'disabled' {
|
||||||
|
|
||||||
|
|||||||
@@ -157,6 +157,11 @@ function process_message ()
|
|||||||
elseif metric_source == 'check_openstack_api' then
|
elseif metric_source == 'check_openstack_api' then
|
||||||
-- For OpenStack API metrics, plugin_instance = <service name>
|
-- For OpenStack API metrics, plugin_instance = <service name>
|
||||||
msg['Fields']['name'] = 'openstack' .. sep .. sample['plugin_instance'] .. sep .. 'check_api'
|
msg['Fields']['name'] = 'openstack' .. sep .. sample['plugin_instance'] .. sep .. 'check_api'
|
||||||
|
-- 'service' isn't used as a tag because there is no point to
|
||||||
|
-- aggregate data across all services. It is stored in the
|
||||||
|
-- Fields table though because it simplifies the life of
|
||||||
|
-- downstream filters consuming this data.
|
||||||
|
msg['Fields']['service'] = sample['plugin_instance']
|
||||||
if sample['type_instance'] ~= nil and sample['type_instance'] ~= '' then
|
if sample['type_instance'] ~= nil and sample['type_instance'] ~= '' then
|
||||||
msg['Fields']['os_region'] = sample['type_instance']
|
msg['Fields']['os_region'] = sample['type_instance']
|
||||||
end
|
end
|
||||||
@@ -270,7 +275,9 @@ function process_message ()
|
|||||||
msg['Fields']['service'] = service
|
msg['Fields']['service'] = service
|
||||||
msg['Fields']['state'] = state
|
msg['Fields']['state'] = state
|
||||||
elseif metric_source == 'pacemaker_resource' then
|
elseif metric_source == 'pacemaker_resource' then
|
||||||
msg['Fields']['name'] = 'pacemaker_resource' .. sep .. sample['type_instance'] .. sep .. 'active'
|
msg['Fields']['name'] = 'pacemaker_local_resource_active'
|
||||||
|
msg['Fields']['tag_fields'] = { 'resource' }
|
||||||
|
msg['Fields']['resource'] = sample['type_instance']
|
||||||
elseif metric_source == 'users' then
|
elseif metric_source == 'users' then
|
||||||
-- 'users' is a reserved name for InfluxDB v0.9
|
-- 'users' is a reserved name for InfluxDB v0.9
|
||||||
msg['Fields']['name'] = 'logged_users'
|
msg['Fields']['name'] = 'logged_users'
|
||||||
|
|||||||
@@ -77,24 +77,29 @@ require 'math'
|
|||||||
local floor = math.floor
|
local floor = math.floor
|
||||||
local utils = require 'lma_utils'
|
local utils = require 'lma_utils'
|
||||||
|
|
||||||
_PRESERVATION_VERSION = 1
|
_PRESERVATION_VERSION = 2
|
||||||
-- variables with global scope are preserved between restarts
|
-- variables with global scope are preserved between restarts
|
||||||
services = {}
|
services = {}
|
||||||
vip_active_at = 0
|
vip_active_at = 0
|
||||||
|
|
||||||
local payload_name = read_config('inject_payload_name') or 'service_status'
|
local payload_name = read_config('inject_payload_name') or 'service_status'
|
||||||
|
|
||||||
|
local state_label_map = {
|
||||||
|
up = utils.state_map.UP,
|
||||||
|
down = utils.state_map.DOWN,
|
||||||
|
disabled = utils.state_map.DISABLED,
|
||||||
|
}
|
||||||
function process_message ()
|
function process_message ()
|
||||||
local ts = floor(read_message("Timestamp")/1e6) -- ms
|
local ts = read_message("Timestamp")
|
||||||
local metric_name = read_message("Fields[name]")
|
local metric_name = read_message("Fields[name]")
|
||||||
local value = read_message("Fields[value]")
|
local value = read_message("Fields[value]")
|
||||||
|
local state = state_label_map[read_message('Fields[state]')]
|
||||||
local name
|
local name
|
||||||
local top_entry
|
local top_entry
|
||||||
local item_name
|
local item_name
|
||||||
local group_name
|
local group_name
|
||||||
local state
|
|
||||||
|
|
||||||
if string.find(metric_name, '^pacemaker.resource.vip__public') then
|
if metric_name == 'pacemaker_local_resource_active' and read_message("Fields[resource]") == 'vip__public' then
|
||||||
if value == 1 then
|
if value == 1 then
|
||||||
vip_active_at = ts
|
vip_active_at = ts
|
||||||
else
|
else
|
||||||
@@ -103,40 +108,32 @@ function process_message ()
|
|||||||
return 0
|
return 0
|
||||||
end
|
end
|
||||||
|
|
||||||
if string.find(metric_name, '%.up$') then
|
if string.find(metric_name, '^openstack_([^._]+)_services$') or string.find(metric_name, '^openstack_([^._]+)_agents$') then
|
||||||
state = utils.state_map.UP
|
name, group_name = string.match(metric_name, '([^_]+)_([^_]+)$')
|
||||||
elseif string.find(metric_name, '%.down$') then
|
|
||||||
state = utils.state_map.DOWN
|
|
||||||
elseif string.find(metric_name, '%.disabled$') then
|
|
||||||
state = utils.state_map.DISABLED
|
|
||||||
end
|
|
||||||
|
|
||||||
if string.find(metric_name, '^openstack') then
|
|
||||||
name, group_name, item_name = string.match(metric_name, '^openstack%.([^._]+)%.([^._]+)%.([^._]+)')
|
|
||||||
top_entry = 'workers'
|
top_entry = 'workers'
|
||||||
if not item_name then
|
item_name = read_message('Fields[service]')
|
||||||
|
|
||||||
|
elseif string.find(metric_name, '_check_api$') then
|
||||||
-- A service can have several API checks, by convention the service name
|
-- A service can have several API checks, by convention the service name
|
||||||
-- is written down "<name>-<item>" or just "<name>".
|
-- is written down "<name>-<item>" or just "<name>".
|
||||||
item_name = string.match(metric_name, '^openstack%.([^.]+)%.check_api$')
|
name = string.match(read_message('Fields[service]'), '^([^-]+)')
|
||||||
name, _ = string.match(item_name, '^([^-]+)\-(.*)')
|
|
||||||
if not name then
|
|
||||||
name = item_name
|
|
||||||
end
|
|
||||||
|
|
||||||
top_entry = 'check_api'
|
top_entry = 'check_api'
|
||||||
group_name = 'endpoint'
|
group_name = 'endpoint'
|
||||||
-- retrieve the current state
|
item_name = read_message('Fields[service]')
|
||||||
|
|
||||||
|
-- convert 0/1 value to up/down state
|
||||||
state = utils.check_api_status_to_state_map[value]
|
state = utils.check_api_status_to_state_map[value]
|
||||||
-- and always override value to 1
|
-- and always override value to 1
|
||||||
value = 1
|
value = 1
|
||||||
end
|
|
||||||
|
|
||||||
elseif string.find(metric_name, '^haproxy%.backend') then
|
elseif metric_name == 'haproxy_backend_servers' then
|
||||||
|
name = string.match(read_message('Fields[backend]'), '^([^-]+)')
|
||||||
top_entry = 'haproxy'
|
top_entry = 'haproxy'
|
||||||
group_name = 'pool'
|
group_name = 'pool'
|
||||||
item_name = string.match(metric_name, '^haproxy%.backend%.([^.]+)%.servers')
|
item_name = read_message('Fields[backend]')
|
||||||
name = string.match(item_name, '^([^-]+)')
|
|
||||||
end
|
end
|
||||||
|
|
||||||
if not name or not item_name then
|
if not name or not item_name then
|
||||||
return -1
|
return -1
|
||||||
end
|
end
|
||||||
@@ -165,6 +162,7 @@ function process_message ()
|
|||||||
service[invert_state] = {last_seen=ts, value=0, group_name=group_name}
|
service[invert_state] = {last_seen=ts, value=0, group_name=group_name}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,176 @@
|
|||||||
|
-- Copyright 2015 Mirantis, Inc.
|
||||||
|
--
|
||||||
|
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
-- you may not use this file except in compliance with the License.
|
||||||
|
-- You may obtain a copy of the License at
|
||||||
|
--
|
||||||
|
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
--
|
||||||
|
-- Unless required by applicable law or agreed to in writing, software
|
||||||
|
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
-- See the License for the specific language governing permissions and
|
||||||
|
-- limitations under the License.
|
||||||
|
|
||||||
|
-- The filter accumulates data into a table and emits regularly a message per
|
||||||
|
-- service with a payload like this:
|
||||||
|
-- {
|
||||||
|
-- "vip_active_at": 1435829917607,
|
||||||
|
-- "name": "nova",
|
||||||
|
-- "states": {
|
||||||
|
-- "check_api":{
|
||||||
|
-- "nova":{
|
||||||
|
-- "down":{
|
||||||
|
-- "value":0,
|
||||||
|
-- "group_name":"endpoint",
|
||||||
|
-- "last_seen":1433252000524
|
||||||
|
-- },
|
||||||
|
-- "up":{
|
||||||
|
-- "value":1,
|
||||||
|
-- "group_name":"endpoint",
|
||||||
|
-- "last_seen":1433252000524
|
||||||
|
-- }
|
||||||
|
-- },
|
||||||
|
-- ...
|
||||||
|
-- },
|
||||||
|
-- "workers":{
|
||||||
|
-- "scheduler":{
|
||||||
|
-- "down":{
|
||||||
|
-- "value":0,
|
||||||
|
-- "group_name":"services",
|
||||||
|
-- "last_seen":1433251999229
|
||||||
|
-- },
|
||||||
|
-- "disabled":{
|
||||||
|
-- "value":1,
|
||||||
|
-- "group_name":"services",
|
||||||
|
-- "last_seen":1433251999226
|
||||||
|
-- },
|
||||||
|
-- "up":{
|
||||||
|
-- "value":2,
|
||||||
|
-- "group_name":"services",
|
||||||
|
-- "last_seen":1433251999227
|
||||||
|
-- }
|
||||||
|
-- },
|
||||||
|
-- ...
|
||||||
|
-- },
|
||||||
|
-- "haproxy":{
|
||||||
|
-- "nova-api":{
|
||||||
|
-- "down":{
|
||||||
|
-- "value":0,
|
||||||
|
-- "group_name":"pool",
|
||||||
|
-- "last_seen":1433252000957
|
||||||
|
-- },
|
||||||
|
-- "up":{
|
||||||
|
-- "value":3,
|
||||||
|
-- "group_name":"pool",
|
||||||
|
-- "last_seen":1433252000954
|
||||||
|
-- }
|
||||||
|
-- }
|
||||||
|
-- }
|
||||||
|
-- ...
|
||||||
|
-- }
|
||||||
|
-- }
|
||||||
|
|
||||||
|
require 'cjson'
|
||||||
|
require 'string'
|
||||||
|
require 'math'
|
||||||
|
local floor = math.floor
|
||||||
|
local utils = require 'lma_utils'
|
||||||
|
|
||||||
|
_PRESERVATION_VERSION = 1
|
||||||
|
-- variables with global scope are preserved between restarts
|
||||||
|
services = {}
|
||||||
|
vip_active_at = 0
|
||||||
|
|
||||||
|
local payload_name = read_config('inject_payload_name') or 'service_status'
|
||||||
|
|
||||||
|
function process_message ()
|
||||||
|
local ts = floor(read_message("Timestamp")/1e6) -- ms
|
||||||
|
local metric_name = read_message("Fields[name]")
|
||||||
|
local value = read_message("Fields[value]")
|
||||||
|
local name
|
||||||
|
local top_entry
|
||||||
|
local item_name
|
||||||
|
local group_name
|
||||||
|
local state
|
||||||
|
|
||||||
|
if string.find(metric_name, '^pacemaker.resource.vip__public') then
|
||||||
|
if value == 1 then
|
||||||
|
vip_active_at = ts
|
||||||
|
else
|
||||||
|
vip_active_at = 0
|
||||||
|
end
|
||||||
|
return 0
|
||||||
|
end
|
||||||
|
|
||||||
|
if string.find(metric_name, '%.up$') then
|
||||||
|
state = utils.state_map.UP
|
||||||
|
elseif string.find(metric_name, '%.down$') then
|
||||||
|
state = utils.state_map.DOWN
|
||||||
|
elseif string.find(metric_name, '%.disabled$') then
|
||||||
|
state = utils.state_map.DISABLED
|
||||||
|
end
|
||||||
|
|
||||||
|
if string.find(metric_name, '^openstack') then
|
||||||
|
name, group_name, item_name = string.match(metric_name, '^openstack%.([^._]+)%.([^._]+)%.([^._]+)')
|
||||||
|
top_entry = 'workers'
|
||||||
|
if not item_name then
|
||||||
|
-- A service can have several API checks, by convention the service name
|
||||||
|
-- is written down "<name>-<item>" or just "<name>".
|
||||||
|
item_name = string.match(metric_name, '^openstack%.([^.]+)%.check_api$')
|
||||||
|
name, _ = string.match(item_name, '^([^-]+)\-(.*)')
|
||||||
|
if not name then
|
||||||
|
name = item_name
|
||||||
|
end
|
||||||
|
|
||||||
|
top_entry = 'check_api'
|
||||||
|
group_name = 'endpoint'
|
||||||
|
-- retrieve the current state
|
||||||
|
state = utils.check_api_status_to_state_map[value]
|
||||||
|
-- and always override value to 1
|
||||||
|
value = 1
|
||||||
|
end
|
||||||
|
|
||||||
|
elseif string.find(metric_name, '^haproxy%.backend') then
|
||||||
|
top_entry = 'haproxy'
|
||||||
|
group_name = 'pool'
|
||||||
|
item_name = string.match(metric_name, '^haproxy%.backend%.([^.]+)%.servers')
|
||||||
|
name = string.match(item_name, '^([^-]+)')
|
||||||
|
end
|
||||||
|
if not name or not item_name then
|
||||||
|
return -1
|
||||||
|
end
|
||||||
|
|
||||||
|
-- table initialization for the first time we see a service
|
||||||
|
if not services[name] then services[name] = {} end
|
||||||
|
if not services[name][top_entry] then services[name][top_entry] = {} end
|
||||||
|
if not services[name][top_entry][item_name] then services[name][top_entry][item_name] = {} end
|
||||||
|
|
||||||
|
local service = services[name][top_entry][item_name]
|
||||||
|
service[state] = {last_seen=ts, value=value, group_name=group_name}
|
||||||
|
|
||||||
|
-- In the logic to treat check_api results like others, group by up/down
|
||||||
|
-- and reset the counterpart w/ value=0
|
||||||
|
if top_entry == 'check_api' then
|
||||||
|
local invert_state
|
||||||
|
if state == utils.state_map.UP then
|
||||||
|
invert_state = utils.state_map.DOWN
|
||||||
|
elseif state == utils.state_map.DOWN then
|
||||||
|
invert_state = utils.state_map.UP
|
||||||
|
end
|
||||||
|
if invert_state then
|
||||||
|
if not service[invert_state] then
|
||||||
|
service[invert_state] = {}
|
||||||
|
end
|
||||||
|
service[invert_state] = {last_seen=ts, value=0, group_name=group_name}
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return 0
|
||||||
|
end
|
||||||
|
|
||||||
|
function timer_event(ns)
|
||||||
|
for name, states in pairs(services) do
|
||||||
|
inject_payload('json', payload_name,
|
||||||
|
cjson.encode({vip_active_at=vip_active_at, name=name, states=states}))
|
||||||
|
end
|
||||||
|
end
|
||||||
@@ -14,18 +14,15 @@
|
|||||||
require 'cjson'
|
require 'cjson'
|
||||||
require 'string'
|
require 'string'
|
||||||
require 'math'
|
require 'math'
|
||||||
local floor = math.floor
|
|
||||||
local max = math.max
|
local max = math.max
|
||||||
local utils = require 'lma_utils'
|
local utils = require 'lma_utils'
|
||||||
|
|
||||||
_PRESERVATION_VERSION = 2
|
_PRESERVATION_VERSION = 3
|
||||||
-- variables with global scope are preserved between restarts
|
-- variables with global scope are preserved between restarts
|
||||||
all_service_status = {}
|
all_service_status = {}
|
||||||
|
|
||||||
-- local scope variables
|
-- local scope variables
|
||||||
local timeout = read_config("timeout") or 60
|
local timeout = (read_config("timeout") or 60) * 1e9
|
||||||
local hostname
|
|
||||||
local datapoints = {}
|
|
||||||
|
|
||||||
function process_message ()
|
function process_message ()
|
||||||
local ok, data = pcall(cjson.decode, read_message("Payload"))
|
local ok, data = pcall(cjson.decode, read_message("Payload"))
|
||||||
@@ -33,8 +30,7 @@ function process_message ()
|
|||||||
return -1
|
return -1
|
||||||
end
|
end
|
||||||
local timestamp = read_message('Timestamp')
|
local timestamp = read_message('Timestamp')
|
||||||
local ts = floor(timestamp/1e6) -- in ms
|
local hostname = read_message("Hostname")
|
||||||
hostname = read_message("Hostname")
|
|
||||||
local service_name = data.name
|
local service_name = data.name
|
||||||
local states = data.states
|
local states = data.states
|
||||||
|
|
||||||
@@ -49,23 +45,25 @@ function process_message ()
|
|||||||
if not all_service_status[service_name] then all_service_status[service_name] = {} end
|
if not all_service_status[service_name] then all_service_status[service_name] = {} end
|
||||||
|
|
||||||
if states.workers then
|
if states.workers then
|
||||||
worker_status = compute_status(events, not_up_status, ts, 'workers', service_name, states.workers, true)
|
worker_status = compute_status(events, not_up_status, timestamp, 'workers', service_name, states.workers, true)
|
||||||
end
|
end
|
||||||
|
|
||||||
if states.check_api then
|
if states.check_api then
|
||||||
check_api_status = compute_status(events, not_up_status, ts, 'check_api', service_name, states.check_api, false)
|
check_api_status = compute_status(events, not_up_status, timestamp, 'check_api', service_name, states.check_api, false)
|
||||||
end
|
end
|
||||||
if states.haproxy then
|
if states.haproxy then
|
||||||
haproxy_server_status = compute_status(events, not_up_status, ts, 'haproxy', service_name, states.haproxy, true)
|
haproxy_server_status = compute_status(events, not_up_status, timestamp, 'haproxy', service_name, states.haproxy, true)
|
||||||
end
|
end
|
||||||
global_status = max(worker_status, check_api_status, haproxy_server_status)
|
global_status = max(worker_status, check_api_status, haproxy_server_status)
|
||||||
-- global service status
|
-- global service status
|
||||||
utils.add_metric(datapoints,
|
utils.add_to_bulk_metric(
|
||||||
string.format('%s.openstack.%s.status', hostname, service_name),
|
string.format('openstack_%s_status', service_name),
|
||||||
{ts, global_status})
|
global_status
|
||||||
|
)
|
||||||
|
utils.inject_bulk_metric(timestamp, hostname, 'service_status_filter')
|
||||||
|
|
||||||
-- only emit status if the public vip is active
|
-- only emit status if the public vip is active
|
||||||
if not expired(ts, data.vip_active_at) then
|
if not expired(data.vip_active_at) then
|
||||||
local prev = all_service_status[service_name].global_status or utils.global_status_map.UNKNOWN
|
local prev = all_service_status[service_name].global_status or utils.global_status_map.UNKNOWN
|
||||||
local updated
|
local updated
|
||||||
updated = (prev ~= global_status or #events > 0)
|
updated = (prev ~= global_status or #events > 0)
|
||||||
@@ -76,17 +74,13 @@ function process_message ()
|
|||||||
if #events > 0 then
|
if #events > 0 then
|
||||||
details = cjson.encode(events)
|
details = cjson.encode(events)
|
||||||
end
|
end
|
||||||
utils.inject_status_message(timestamp, service_name,
|
utils.inject_status_message(
|
||||||
global_status, prev,
|
timestamp, service_name, global_status, prev, updated, details
|
||||||
updated, details)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
all_service_status[service_name].global_status = global_status
|
all_service_status[service_name].global_status = global_status
|
||||||
|
|
||||||
if #datapoints > 0 then
|
|
||||||
inject_payload("json", "influxdb", cjson.encode(datapoints))
|
|
||||||
datapoints = {}
|
|
||||||
end
|
|
||||||
return 0
|
return 0
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -183,10 +177,13 @@ function compute_status(events, not_up_status, current_time, elts_name, name, st
|
|||||||
utils.service_status_to_label_map[DOWN],
|
utils.service_status_to_label_map[DOWN],
|
||||||
event_detail)
|
event_detail)
|
||||||
end
|
end
|
||||||
utils.add_metric(datapoints, string.format('%s.openstack.%s.%s.%s.status',
|
utils.add_to_bulk_metric(
|
||||||
hostname, name, worker.group_name, worker_name),
|
string.format('openstack_%s_%s_status', name, worker.group_name),
|
||||||
{current_time, utils.service_status_map.DOWN})
|
utils.service_status_map.DOWN,
|
||||||
|
{ service = worker_name}
|
||||||
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
-- elements down or degraded
|
-- elements down or degraded
|
||||||
for worker_name, worker in pairs(down_elts) do
|
for worker_name, worker in pairs(down_elts) do
|
||||||
local prev = get_previous_status(name, elts_name, worker_name)
|
local prev = get_previous_status(name, elts_name, worker_name)
|
||||||
@@ -198,11 +195,11 @@ function compute_status(events, not_up_status, current_time, elts_name, name, st
|
|||||||
new_status = utils.service_status_map.DOWN
|
new_status = utils.service_status_map.DOWN
|
||||||
end
|
end
|
||||||
set_status(name, elts_name, worker_name, new_status)
|
set_status(name, elts_name, worker_name, new_status)
|
||||||
utils.add_metric(datapoints,
|
utils.add_to_bulk_metric(
|
||||||
string.format("%s.openstack.%s.%s.%s.status",
|
string.format('openstack_%s_%s_status', name, worker.group_name),
|
||||||
hostname, name, worker.group_name, worker_name),
|
new_status,
|
||||||
{current_time, new_status})
|
{ service = worker_name}
|
||||||
|
)
|
||||||
if display_num then
|
if display_num then
|
||||||
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
|
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
|
||||||
total_elements[worker_name])
|
total_elements[worker_name])
|
||||||
@@ -235,17 +232,16 @@ function compute_status(events, not_up_status, current_time, elts_name, name, st
|
|||||||
utils.service_status_to_label_map[prev],
|
utils.service_status_to_label_map[prev],
|
||||||
utils.service_status_to_label_map[UP])
|
utils.service_status_to_label_map[UP])
|
||||||
end
|
end
|
||||||
utils.add_metric(datapoints, string.format("%s.openstack.%s.%s.%s.status",
|
utils.add_to_bulk_metric(
|
||||||
hostname, name, worker.group_name, worker_name),
|
string.format('openstack_%s_%s_status', name, worker.group_name),
|
||||||
{current_time, utils.service_status_map.UP})
|
utils.service_status_map.UP,
|
||||||
|
{ service = worker_name}
|
||||||
|
)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
return service_status
|
return service_status
|
||||||
end
|
end
|
||||||
|
|
||||||
function expired(current_time, last_time)
|
function expired(last_time)
|
||||||
if last_time > 0 and current_time - last_time <= timeout * 1000 then
|
return not (last_time > 0 and (read_message('Timestamp') - last_time) <= timeout)
|
||||||
return false
|
|
||||||
end
|
|
||||||
return true
|
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -0,0 +1,251 @@
|
|||||||
|
-- Copyright 2015 Mirantis, Inc.
|
||||||
|
--
|
||||||
|
-- Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
-- you may not use this file except in compliance with the License.
|
||||||
|
-- You may obtain a copy of the License at
|
||||||
|
--
|
||||||
|
-- http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
--
|
||||||
|
-- Unless required by applicable law or agreed to in writing, software
|
||||||
|
-- distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
-- See the License for the specific language governing permissions and
|
||||||
|
-- limitations under the License.
|
||||||
|
require 'cjson'
|
||||||
|
require 'string'
|
||||||
|
require 'math'
|
||||||
|
local floor = math.floor
|
||||||
|
local max = math.max
|
||||||
|
local utils = require 'lma_utils'
|
||||||
|
|
||||||
|
_PRESERVATION_VERSION = 2
|
||||||
|
-- variables with global scope are preserved between restarts
|
||||||
|
all_service_status = {}
|
||||||
|
|
||||||
|
-- local scope variables
|
||||||
|
local timeout = read_config("timeout") or 60
|
||||||
|
local hostname
|
||||||
|
local datapoints = {}
|
||||||
|
|
||||||
|
function process_message ()
|
||||||
|
local ok, data = pcall(cjson.decode, read_message("Payload"))
|
||||||
|
if not ok then
|
||||||
|
return -1
|
||||||
|
end
|
||||||
|
local timestamp = read_message('Timestamp')
|
||||||
|
local ts = floor(timestamp/1e6) -- in ms
|
||||||
|
hostname = read_message("Hostname")
|
||||||
|
local service_name = data.name
|
||||||
|
local states = data.states
|
||||||
|
|
||||||
|
local worker_status = -1
|
||||||
|
local check_api_status = -1
|
||||||
|
local haproxy_server_status = -1
|
||||||
|
local global_status
|
||||||
|
local events = {}
|
||||||
|
local not_up_status = {}
|
||||||
|
local msg_event
|
||||||
|
|
||||||
|
if not all_service_status[service_name] then all_service_status[service_name] = {} end
|
||||||
|
|
||||||
|
if states.workers then
|
||||||
|
worker_status = compute_status(events, not_up_status, ts, 'workers', service_name, states.workers, true)
|
||||||
|
end
|
||||||
|
|
||||||
|
if states.check_api then
|
||||||
|
check_api_status = compute_status(events, not_up_status, ts, 'check_api', service_name, states.check_api, false)
|
||||||
|
end
|
||||||
|
if states.haproxy then
|
||||||
|
haproxy_server_status = compute_status(events, not_up_status, ts, 'haproxy', service_name, states.haproxy, true)
|
||||||
|
end
|
||||||
|
global_status = max(worker_status, check_api_status, haproxy_server_status)
|
||||||
|
-- global service status
|
||||||
|
utils.add_metric(datapoints,
|
||||||
|
string.format('%s.openstack.%s.status', hostname, service_name),
|
||||||
|
{ts, global_status})
|
||||||
|
|
||||||
|
-- only emit status if the public vip is active
|
||||||
|
if not expired(ts, data.vip_active_at) then
|
||||||
|
local prev = all_service_status[service_name].global_status or utils.global_status_map.UNKNOWN
|
||||||
|
local updated
|
||||||
|
updated = (prev ~= global_status or #events > 0)
|
||||||
|
-- always append not UP status elements in details
|
||||||
|
for k, v in pairs(not_up_status) do events[#events+1] = v end
|
||||||
|
|
||||||
|
local details = ''
|
||||||
|
if #events > 0 then
|
||||||
|
details = cjson.encode(events)
|
||||||
|
end
|
||||||
|
utils.inject_status_message(timestamp, service_name,
|
||||||
|
global_status, prev,
|
||||||
|
updated, details)
|
||||||
|
end
|
||||||
|
|
||||||
|
all_service_status[service_name].global_status = global_status
|
||||||
|
|
||||||
|
if #datapoints > 0 then
|
||||||
|
inject_payload("json", "influxdb", cjson.encode(datapoints))
|
||||||
|
datapoints = {}
|
||||||
|
end
|
||||||
|
return 0
|
||||||
|
end
|
||||||
|
|
||||||
|
function get_previous_status(service_name, top_entry, name)
|
||||||
|
if not all_service_status[service_name] then
|
||||||
|
all_service_status[service_name] = {}
|
||||||
|
end
|
||||||
|
if not all_service_status[service_name][top_entry] then
|
||||||
|
all_service_status[service_name][top_entry] = {}
|
||||||
|
end
|
||||||
|
if not all_service_status[service_name][top_entry][name] then
|
||||||
|
all_service_status[service_name][top_entry][name] = utils.service_status_map.UNKNOWN
|
||||||
|
end
|
||||||
|
return all_service_status[service_name][top_entry][name]
|
||||||
|
end
|
||||||
|
|
||||||
|
function set_status(service_name, top_entry, name, status)
|
||||||
|
all_service_status[service_name][top_entry][name] = status
|
||||||
|
end
|
||||||
|
|
||||||
|
function compute_status(events, not_up_status, current_time, elts_name, name, states, display_num)
|
||||||
|
local down_elts = {}
|
||||||
|
local down_elts_count = 0
|
||||||
|
local zero_up = {}
|
||||||
|
local zero_up_count = 0
|
||||||
|
local one_up = {}
|
||||||
|
local one_disabled = {}
|
||||||
|
local one_disabled_count = 0
|
||||||
|
local service_status = utils.service_status_map.UNKNOWN
|
||||||
|
local up_elements = {}
|
||||||
|
local total_elements = {}
|
||||||
|
|
||||||
|
for worker, worker_data in pairs(states) do
|
||||||
|
if not total_elements[worker] then
|
||||||
|
total_elements[worker] = 0
|
||||||
|
end
|
||||||
|
if not up_elements[worker] then
|
||||||
|
up_elements[worker] = 0
|
||||||
|
end
|
||||||
|
for state, data in pairs(worker_data) do
|
||||||
|
if not expired(current_time, data.last_seen) then
|
||||||
|
total_elements[worker] = total_elements[worker] + data.value
|
||||||
|
if state == utils.state_map.DOWN and data.value > 0 then
|
||||||
|
down_elts[worker] = data
|
||||||
|
down_elts_count = down_elts_count + 1
|
||||||
|
end
|
||||||
|
if state == utils.state_map.UP then
|
||||||
|
if data.value > 0 then
|
||||||
|
one_up[worker] = data
|
||||||
|
else
|
||||||
|
zero_up[worker] = data
|
||||||
|
zero_up_count = zero_up_count + 1
|
||||||
|
end
|
||||||
|
up_elements[worker] = data.value
|
||||||
|
end
|
||||||
|
if state == utils.state_map.DISABLED and data.value > 0 then
|
||||||
|
one_disabled[worker] = data
|
||||||
|
one_disabled_count = one_disabled_count + 1
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
-- general element status
|
||||||
|
if zero_up_count > 0 then
|
||||||
|
service_status = utils.service_status_map.DOWN
|
||||||
|
elseif down_elts_count > 0 then
|
||||||
|
service_status = utils.service_status_map.DEGRADED
|
||||||
|
elseif down_elts_count == 0 then
|
||||||
|
service_status = utils.service_status_map.UP
|
||||||
|
end
|
||||||
|
|
||||||
|
-- elements clearly down
|
||||||
|
for worker_name, worker in pairs(zero_up) do
|
||||||
|
local prev = get_previous_status(name, elts_name, worker_name)
|
||||||
|
local DOWN = utils.service_status_map.DOWN
|
||||||
|
local event_detail = ""
|
||||||
|
set_status(name, elts_name, worker_name, DOWN)
|
||||||
|
if display_num then
|
||||||
|
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
|
||||||
|
total_elements[worker_name])
|
||||||
|
end
|
||||||
|
|
||||||
|
if prev and prev ~= DOWN then
|
||||||
|
events[#events+1] = string.format("%s %s %s -> %s %s", worker_name,
|
||||||
|
worker.group_name,
|
||||||
|
utils.service_status_to_label_map[prev],
|
||||||
|
utils.service_status_to_label_map[DOWN],
|
||||||
|
event_detail)
|
||||||
|
|
||||||
|
else
|
||||||
|
not_up_status[#not_up_status+1] = string.format("%s %s %s %s",
|
||||||
|
worker_name,
|
||||||
|
worker.group_name,
|
||||||
|
utils.service_status_to_label_map[DOWN],
|
||||||
|
event_detail)
|
||||||
|
end
|
||||||
|
utils.add_metric(datapoints, string.format('%s.openstack.%s.%s.%s.status',
|
||||||
|
hostname, name, worker.group_name, worker_name),
|
||||||
|
{current_time, utils.service_status_map.DOWN})
|
||||||
|
end
|
||||||
|
-- elements down or degraded
|
||||||
|
for worker_name, worker in pairs(down_elts) do
|
||||||
|
local prev = get_previous_status(name, elts_name, worker_name)
|
||||||
|
local new_status
|
||||||
|
local event_detail
|
||||||
|
if one_up[worker_name] then
|
||||||
|
new_status = utils.service_status_map.DEGRADED
|
||||||
|
else
|
||||||
|
new_status = utils.service_status_map.DOWN
|
||||||
|
end
|
||||||
|
set_status(name, elts_name, worker_name, new_status)
|
||||||
|
utils.add_metric(datapoints,
|
||||||
|
string.format("%s.openstack.%s.%s.%s.status",
|
||||||
|
hostname, name, worker.group_name, worker_name),
|
||||||
|
{current_time, new_status})
|
||||||
|
|
||||||
|
if display_num then
|
||||||
|
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
|
||||||
|
total_elements[worker_name])
|
||||||
|
else
|
||||||
|
event_detail = ""
|
||||||
|
end
|
||||||
|
if prev ~= new_status then
|
||||||
|
events[#events+1] = string.format("%s %s %s -> %s %s", worker_name,
|
||||||
|
worker.group_name,
|
||||||
|
utils.service_status_to_label_map[prev],
|
||||||
|
utils.service_status_to_label_map[new_status],
|
||||||
|
event_detail)
|
||||||
|
elseif not zero_up[worker_name] then
|
||||||
|
not_up_status[#not_up_status+1] = string.format("%s %s %s %s", worker_name,
|
||||||
|
worker.group_name,
|
||||||
|
utils.service_status_to_label_map[new_status],
|
||||||
|
event_detail)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
-- elements up
|
||||||
|
for worker_name, worker in pairs(one_up) do
|
||||||
|
if not zero_up[worker_name] and not down_elts[worker_name] then
|
||||||
|
local prev = get_previous_status(name, elts_name, worker_name)
|
||||||
|
local UP = utils.service_status_map.UP
|
||||||
|
set_status(name, elts_name, worker_name, UP)
|
||||||
|
if prev and prev ~= utils.service_status_map.UP then
|
||||||
|
events[#events+1] = string.format("%s %s %s -> %s", worker_name,
|
||||||
|
worker.group_name,
|
||||||
|
utils.service_status_to_label_map[prev],
|
||||||
|
utils.service_status_to_label_map[UP])
|
||||||
|
end
|
||||||
|
utils.add_metric(datapoints, string.format("%s.openstack.%s.%s.%s.status",
|
||||||
|
hostname, name, worker.group_name, worker_name),
|
||||||
|
{current_time, utils.service_status_map.UP})
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return service_status
|
||||||
|
end
|
||||||
|
|
||||||
|
function expired(current_time, last_time)
|
||||||
|
if last_time > 0 and current_time - last_time <= timeout * 1000 then
|
||||||
|
return false
|
||||||
|
end
|
||||||
|
return true
|
||||||
|
end
|
||||||
@@ -13,20 +13,20 @@
|
|||||||
# under the License.
|
# under the License.
|
||||||
#
|
#
|
||||||
class lma_collector::metrics::service_status (
|
class lma_collector::metrics::service_status (
|
||||||
$metrics_regexp = $lma_collector::params::service_status_metrics_regexp,
|
$metrics_matcher = $lma_collector::params::service_status_metrics_matcher,
|
||||||
$payload_name = $lma_collector::params::service_status_payload_name,
|
|
||||||
$timeout = $lma_collector::params::service_status_timeout,
|
$timeout = $lma_collector::params::service_status_timeout,
|
||||||
){
|
) inherits lma_collector::params {
|
||||||
include heka::params
|
|
||||||
|
|
||||||
validate_array($metrics_regexp)
|
validate_string($metrics_regexp)
|
||||||
|
|
||||||
|
$payload_name = $lma_collector::params::service_status_payload_name
|
||||||
|
|
||||||
if (size(metrics_regexp) > 0){
|
if (size(metrics_regexp) > 0){
|
||||||
|
|
||||||
heka::filter::sandbox { 'service_accumulator_states':
|
heka::filter::sandbox { 'service_accumulator_states':
|
||||||
config_dir => $lma_collector::params::config_dir,
|
config_dir => $lma_collector::params::config_dir,
|
||||||
filename => "${lma_collector::params::plugins_dir}/filters/service_accumulator_states.lua",
|
filename => "${lma_collector::params::plugins_dir}/filters/service_accumulator_states.lua",
|
||||||
message_matcher => inline_template('<%= @metrics_regexp.collect{|x| "Fields[name] =~ /%s/" % x}.join(" || ") %>'),
|
message_matcher => $metrics_matcher,
|
||||||
ticker_interval => $lma_collector::params::service_status_interval,
|
ticker_interval => $lma_collector::params::service_status_interval,
|
||||||
preserve_data => true,
|
preserve_data => true,
|
||||||
config => {
|
config => {
|
||||||
|
|||||||
@@ -0,0 +1,49 @@
|
|||||||
|
# Copyright 2015 Mirantis, Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License. You may obtain
|
||||||
|
# a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
# License for the specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
#
|
||||||
|
class lma_collector::metrics::service_status_legacy (
|
||||||
|
$metrics_regexp = $lma_collector::params::service_status_metrics_regexp_legacy,
|
||||||
|
$payload_name = $lma_collector::params::service_status_payload_name,
|
||||||
|
$timeout = $lma_collector::params::service_status_timeout,
|
||||||
|
){
|
||||||
|
include heka::params
|
||||||
|
|
||||||
|
validate_array($metrics_regexp)
|
||||||
|
|
||||||
|
if (size(metrics_regexp) > 0){
|
||||||
|
|
||||||
|
heka::filter::sandbox { 'service_accumulator_states':
|
||||||
|
config_dir => $lma_collector::params::config_dir,
|
||||||
|
filename => "${lma_collector::params::plugins_dir}/filters/service_accumulator_states_legacy.lua",
|
||||||
|
message_matcher => inline_template('<%= @metrics_regexp.collect{|x| "Fields[name] =~ /%s/" % x}.join(" || ") %>'),
|
||||||
|
ticker_interval => $lma_collector::params::service_status_interval,
|
||||||
|
preserve_data => true,
|
||||||
|
config => {
|
||||||
|
inject_payload_name => $payload_name,
|
||||||
|
},
|
||||||
|
notify => Class['lma_collector::service'],
|
||||||
|
}
|
||||||
|
|
||||||
|
heka::filter::sandbox { 'service_status':
|
||||||
|
config_dir => $lma_collector::params::config_dir,
|
||||||
|
filename => "${lma_collector::params::plugins_dir}/filters/service_status_legacy.lua",
|
||||||
|
message_matcher => "Fields[payload_type] == 'json' && Fields[payload_name] == '${payload_name}'",
|
||||||
|
preserve_data => true,
|
||||||
|
config => {
|
||||||
|
timeout => $timeout,
|
||||||
|
},
|
||||||
|
notify => Class['lma_collector::service'],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -105,7 +105,7 @@ class lma_collector::params {
|
|||||||
$service_status_payload_name = 'service_status'
|
$service_status_payload_name = 'service_status'
|
||||||
|
|
||||||
# Catch all metrics used to compute OpenStack service statutes
|
# Catch all metrics used to compute OpenStack service statutes
|
||||||
$service_status_metrics_regexp = [
|
$service_status_metrics_regexp_legacy = [
|
||||||
'^openstack.(nova|cinder|neutron).(services|agents).*(up|down|disabled)$',
|
'^openstack.(nova|cinder|neutron).(services|agents).*(up|down|disabled)$',
|
||||||
# Exception for mysqld backend because the MySQL service status is
|
# Exception for mysqld backend because the MySQL service status is
|
||||||
# computed by a dedicated filter and this avoids to send an annoying
|
# computed by a dedicated filter and this avoids to send an annoying
|
||||||
@@ -114,6 +114,16 @@ class lma_collector::params {
|
|||||||
'^pacemaker.resource.vip__public.active$',
|
'^pacemaker.resource.vip__public.active$',
|
||||||
'^openstack.*check_api$'
|
'^openstack.*check_api$'
|
||||||
]
|
]
|
||||||
|
$service_status_metrics_matcher = join([
|
||||||
|
'(Type == \'metric\' || Type == \'heka.sandbox.metric\') && ',
|
||||||
|
'(Fields[name] =~ /^openstack_(nova|cinder|neutron)_(services|agents)$/ || ',
|
||||||
|
# Exception for mysqld backend because the MySQL service status is
|
||||||
|
# computed by a dedicated filter and this avoids to send an annoying
|
||||||
|
# status Heka message.
|
||||||
|
'(Fields[name] == \'haproxy_backend_servers\' && Fields[backend] !~ /mysql/) || ',
|
||||||
|
'(Fields[name] == \'pacemaker_local_resource_active\' && Fields[resource] == \'vip__public\') || ',
|
||||||
|
'Fields[name] =~ /^openstack.*check_api$/)'
|
||||||
|
], '')
|
||||||
$worker_report_interval = 60
|
$worker_report_interval = 60
|
||||||
$worker_downtime_factor = 2
|
$worker_downtime_factor = 2
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user