Support several APIs per service for status determination

Change-Id: Idd45613194db6a08644d8a387e24945bbdd99993
This commit is contained in:
Swann Croiset 2015-05-28 19:07:24 +02:00
parent 3c451a0c9f
commit 1631d18891
3 changed files with 125 additions and 37 deletions

View File

@ -83,10 +83,17 @@ check_api_to_status_map = {
[2] = 3, -- UNKNOWN
}
-- Lookup table translating a numeric check_api result (0, 1 or 2) into the
-- name of the corresponding state ('down', 'up' or 'unknown').
-- A value that is not one of these keys yields nil.
check_api_status_to_state_map = {
[0] = 'down',
[1] = 'up',
[2] = 'unknown',
}
-- Symbolic constants for the state names, so that code can refer to
-- state_map.UP, state_map.DOWN, ... instead of repeating string literals.
state_map = {
UP = 'up',
DOWN = 'down',
DISABLED = 'disabled',
UNKNOWN = 'unknown'
}
function add_metric(datapoints, name, points)

View File

@ -11,6 +11,65 @@
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
-- The filter accumulates data into a table and regularly emits a message
-- whose payload looks like this:
-- { ...
-- "nova":{
-- "check_api":{
-- "nova":{
-- "down":{
-- "value":0,
-- "group_name":"endpoint",
-- "last_seen":1433252000524
-- },
-- "up":{
-- "value":1,
-- "group_name":"endpoint",
-- "last_seen":1433252000524
-- }
-- },
-- ...
-- },
-- "workers":{
-- "scheduler":{
-- "down":{
-- "value":0,
-- "group_name":"services",
-- "last_seen":1433251999229
-- },
-- "disabled":{
-- "value":1,
-- "group_name":"services",
-- "last_seen":1433251999226
-- },
-- "up":{
-- "value":2,
-- "group_name":"services",
-- "last_seen":1433251999227
-- }
-- },
-- ...
-- },
-- "haproxy":{
-- "nova-api":{
-- "down":{
-- "value":0,
-- "group_name":"pool",
-- "last_seen":1433252000957
-- },
-- "up":{
-- "value":3,
-- "group_name":"pool",
-- "last_seen":1433252000954
-- }
-- }
-- }
-- ...
-- },
-- ..
-- }
require 'cjson'
require 'string'
require 'math'
@ -55,8 +114,20 @@ function process_message ()
name, group_name, item_name = string.match(metric_name, '^openstack%.([^._]+)%.([^._]+)%.([^._]+)')
top_entry = 'workers'
if not item_name then
name = string.match(metric_name, '^openstack%.([^.]+)%.check.api$')
-- A service can have several API checks. By convention, the service name
-- is written as "<name>-<item>" or just "<name>".
item_name = string.match(metric_name, '^openstack%.([^.]+)%.check_api$')
name, _ = string.match(item_name, '^([^-]+)\-(.*)')
if not name then
name = item_name
end
top_entry = 'check_api'
group_name = 'endpoint'
-- retrieve the current state
state = utils.check_api_status_to_state_map[value]
-- and always override value to 1
value = 1
end
elseif string.find(metric_name, '^haproxy%.backend') then
@ -65,24 +136,33 @@ function process_message ()
item_name = string.match(metric_name, '^haproxy%.backend%.([^.]+)%.servers')
name = string.match(item_name, '^([^-]+)')
end
if not name then
if not name or not item_name then
return -1
end
-- table initialization for the first time we see a service
if not services[name] then services[name] = {} end
if not services[name][top_entry] then services[name][top_entry] = {} end
if not services[name][top_entry][item_name] then services[name][top_entry][item_name] = {} end
local service = services[name]
local item = {last_seen=ts, value=value}
if item_name then
if not service[top_entry][item_name] then
service[top_entry][item_name] = {}
local service = services[name][top_entry][item_name]
service[state] = {last_seen=ts, value=value, group_name=group_name}
-- To treat check_api results like the other checks, group them by up/down
-- state and reset the opposite state with value=0.
if top_entry == 'check_api' then
local invert_state
if state == utils.state_map.UP then
invert_state = utils.state_map.DOWN
elseif state == utils.state_map.DOWN then
invert_state = utils.state_map.UP
end
if invert_state then
if not service[invert_state] then
service[invert_state] = {}
end
service[invert_state] = {last_seen=ts, value=0, group_name=group_name}
end
item.group_name = group_name
service[top_entry][item_name][state] = item
else
service[top_entry] = item
end
return 0
end

View File

@ -18,7 +18,7 @@ local floor = math.floor
local max = math.max
local utils = require 'lma_utils'
_PRESERVATION_VERSION = 1
_PRESERVATION_VERSION = 2
-- variables with global scope are preserved between restarts
all_service_status = {}
@ -48,26 +48,14 @@ function process_message ()
if not all_service_status[service_name] then all_service_status[service_name] = {} end
if service.workers then
worker_status = compute_status(events, not_up_status, ts, 'workers', service_name, service.workers)
worker_status = compute_status(events, not_up_status, ts, 'workers', service_name, service.workers, true)
end
if service.check_api and service.check_api.value then
check_api_status = utils.check_api_to_status_map[service.check_api.value]
local prev_check_api_status = utils.service_status_map.UNKNOWN
if all_service_status[service_name].check_api then
prev_check_api_status = all_service_status[service_name].check_api
end
all_service_status[service_name].check_api = check_api_status
if prev_check_api_status and prev_check_api_status ~= check_api_status then
events[#events+1] = string.format("endpoint %s -> %s",
utils.service_status_to_label_map[prev_check_api_status],
utils.service_status_to_label_map[check_api_status])
elseif check_api_status == utils.service_status_map.DOWN then
not_up_status[#not_up_status+1] = string.format("API status DOWN")
end
if service.check_api then
check_api_status = compute_status(events, not_up_status, ts, 'check_api', service_name, service.check_api, false)
end
if service.haproxy then
haproxy_server_status = compute_status(events, not_up_status, ts, 'haproxy', service_name, service.haproxy)
haproxy_server_status = compute_status(events, not_up_status, ts, 'haproxy', service_name, service.haproxy, true)
end
global_status = max(worker_status, check_api_status, haproxy_server_status)
-- global service status
@ -116,7 +104,7 @@ function set_status(service_name, top_entry, name, status)
all_service_status[service_name][top_entry][name] = status
end
function compute_status(events, not_up_status, current_time, elts_name, name, service)
function compute_status(events, not_up_status, current_time, elts_name, name, service, display_num)
local down_elts = {}
local down_elts_count = 0
local zero_up = {}
@ -171,20 +159,26 @@ function compute_status(events, not_up_status, current_time, elts_name, name, se
for worker_name, worker in pairs(zero_up) do
local prev = get_previous_status(name, elts_name, worker_name)
local DOWN = utils.service_status_map.DOWN
local event_detail = ""
set_status(name, elts_name, worker_name, DOWN)
if display_num then
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
total_elements[worker_name])
end
if prev and prev ~= DOWN then
events[#events+1] = string.format("%s %s %s -> %s (%s/%s UP)", worker_name,
events[#events+1] = string.format("%s %s %s -> %s %s", worker_name,
worker.group_name,
utils.service_status_to_label_map[prev],
utils.service_status_to_label_map[DOWN],
up_elements[worker_name], total_elements[worker_name])
event_detail)
else
not_up_status[#not_up_status+1] = string.format("%s %s %s (%s/%s UP)",
not_up_status[#not_up_status+1] = string.format("%s %s %s %s",
worker_name,
worker.group_name,
utils.service_status_to_label_map[DOWN],
up_elements[worker_name], total_elements[worker_name])
event_detail)
end
utils.add_metric(datapoints, string.format('%s.openstack.%s.%s.%s.status',
hostname, name, worker.group_name, worker_name),
@ -194,6 +188,7 @@ function compute_status(events, not_up_status, current_time, elts_name, name, se
for worker_name, worker in pairs(down_elts) do
local prev = get_previous_status(name, elts_name, worker_name)
local new_status
local event_detail
if one_up[worker_name] then
new_status = utils.service_status_map.DEGRADED
else
@ -205,17 +200,23 @@ function compute_status(events, not_up_status, current_time, elts_name, name, se
hostname, name, worker.group_name, worker_name),
{current_time, new_status})
if display_num then
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
total_elements[worker_name])
else
event_detail = ""
end
if prev ~= new_status then
events[#events+1] = string.format("%s %s %s -> %s (%s/%s UP)", worker_name,
events[#events+1] = string.format("%s %s %s -> %s %s", worker_name,
worker.group_name,
utils.service_status_to_label_map[prev],
utils.service_status_to_label_map[new_status],
up_elements[worker_name], total_elements[worker_name])
event_detail)
elseif not zero_up[worker_name] then
not_up_status[#not_up_status+1] = string.format("%s %s %s (%s/%s UP)", worker_name,
not_up_status[#not_up_status+1] = string.format("%s %s %s %s", worker_name,
worker.group_name,
utils.service_status_to_label_map[new_status],
up_elements[worker_name], total_elements[worker_name])
event_detail)
end
end