Support several APIs per service for status determination
Change-Id: Idd45613194db6a08644d8a387e24945bbdd99993
This commit is contained in:
parent
3c451a0c9f
commit
1631d18891
|
@ -83,10 +83,17 @@ check_api_to_status_map = {
|
|||
[2] = 3, -- UNKNOWN
|
||||
}
|
||||
|
||||
check_api_status_to_state_map = {
|
||||
[0] = 'down',
|
||||
[1] = 'up',
|
||||
[2] = 'unknown',
|
||||
}
|
||||
|
||||
state_map = {
|
||||
UP = 'up',
|
||||
DOWN = 'down',
|
||||
DISABLED = 'disabled',
|
||||
UNKNOWN = 'unknown'
|
||||
}
|
||||
|
||||
function add_metric(datapoints, name, points)
|
||||
|
|
|
@ -11,6 +11,65 @@
|
|||
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
-- See the License for the specific language governing permissions and
|
||||
-- limitations under the License.
|
||||
|
||||
-- The filter accumulates data into a table and regularly emits a message with
|
||||
-- a payload like this:
|
||||
-- { ...
|
||||
-- "nova":{
|
||||
-- "check_api":{
|
||||
-- "nova":{
|
||||
-- "down":{
|
||||
-- "value":0,
|
||||
-- "group_name":"endpoint",
|
||||
-- "last_seen":1433252000524
|
||||
-- },
|
||||
-- "up":{
|
||||
-- "value":1,
|
||||
-- "group_name":"endpoint",
|
||||
-- "last_seen":1433252000524
|
||||
-- }
|
||||
-- },
|
||||
-- ...
|
||||
-- },
|
||||
-- "workers":{
|
||||
-- "scheduler":{
|
||||
-- "down":{
|
||||
-- "value":0,
|
||||
-- "group_name":"services",
|
||||
-- "last_seen":1433251999229
|
||||
-- },
|
||||
-- "disabled":{
|
||||
-- "value":1,
|
||||
-- "group_name":"services",
|
||||
-- "last_seen":1433251999226
|
||||
-- },
|
||||
-- "up":{
|
||||
-- "value":2,
|
||||
-- "group_name":"services",
|
||||
-- "last_seen":1433251999227
|
||||
-- }
|
||||
-- },
|
||||
-- ...
|
||||
-- },
|
||||
-- "haproxy":{
|
||||
-- "nova-api":{
|
||||
-- "down":{
|
||||
-- "value":0,
|
||||
-- "group_name":"pool",
|
||||
-- "last_seen":1433252000957
|
||||
-- },
|
||||
-- "up":{
|
||||
-- "value":3,
|
||||
-- "group_name":"pool",
|
||||
-- "last_seen":1433252000954
|
||||
-- }
|
||||
-- }
|
||||
-- }
|
||||
-- ...
|
||||
-- },
|
||||
-- ..
|
||||
-- }
|
||||
|
||||
require 'cjson'
|
||||
require 'string'
|
||||
require 'math'
|
||||
|
@ -55,8 +114,20 @@ function process_message ()
|
|||
name, group_name, item_name = string.match(metric_name, '^openstack%.([^._]+)%.([^._]+)%.([^._]+)')
|
||||
top_entry = 'workers'
|
||||
if not item_name then
|
||||
name = string.match(metric_name, '^openstack%.([^.]+)%.check.api$')
|
||||
-- A service can have several API checks; by convention the service name
|
||||
-- is written as "<name>-<item>" or just "<name>".
|
||||
item_name = string.match(metric_name, '^openstack%.([^.]+)%.check_api$')
|
||||
name, _ = string.match(item_name, '^([^-]+)\-(.*)')
|
||||
if not name then
|
||||
name = item_name
|
||||
end
|
||||
|
||||
top_entry = 'check_api'
|
||||
group_name = 'endpoint'
|
||||
-- retrieve the current state
|
||||
state = utils.check_api_status_to_state_map[value]
|
||||
-- and always override value to 1
|
||||
value = 1
|
||||
end
|
||||
|
||||
elseif string.find(metric_name, '^haproxy%.backend') then
|
||||
|
@ -65,24 +136,33 @@ function process_message ()
|
|||
item_name = string.match(metric_name, '^haproxy%.backend%.([^.]+)%.servers')
|
||||
name = string.match(item_name, '^([^-]+)')
|
||||
end
|
||||
if not name then
|
||||
if not name or not item_name then
|
||||
return -1
|
||||
end
|
||||
|
||||
-- table initialization for the first time we see a service
|
||||
if not services[name] then services[name] = {} end
|
||||
if not services[name][top_entry] then services[name][top_entry] = {} end
|
||||
if not services[name][top_entry][item_name] then services[name][top_entry][item_name] = {} end
|
||||
|
||||
local service = services[name]
|
||||
local item = {last_seen=ts, value=value}
|
||||
if item_name then
|
||||
if not service[top_entry][item_name] then
|
||||
service[top_entry][item_name] = {}
|
||||
local service = services[name][top_entry][item_name]
|
||||
service[state] = {last_seen=ts, value=value, group_name=group_name}
|
||||
|
||||
-- To treat check_api results like the other entries, group them by up/down
|
||||
-- and reset the opposite state with value=0
|
||||
if top_entry == 'check_api' then
|
||||
local invert_state
|
||||
if state == utils.state_map.UP then
|
||||
invert_state = utils.state_map.DOWN
|
||||
elseif state == utils.state_map.DOWN then
|
||||
invert_state = utils.state_map.UP
|
||||
end
|
||||
if invert_state then
|
||||
if not service[invert_state] then
|
||||
service[invert_state] = {}
|
||||
end
|
||||
service[invert_state] = {last_seen=ts, value=0, group_name=group_name}
|
||||
end
|
||||
item.group_name = group_name
|
||||
service[top_entry][item_name][state] = item
|
||||
else
|
||||
service[top_entry] = item
|
||||
end
|
||||
return 0
|
||||
end
|
||||
|
|
|
@ -18,7 +18,7 @@ local floor = math.floor
|
|||
local max = math.max
|
||||
local utils = require 'lma_utils'
|
||||
|
||||
_PRESERVATION_VERSION = 1
|
||||
_PRESERVATION_VERSION = 2
|
||||
-- variables with global scope are preserved between restarts
|
||||
all_service_status = {}
|
||||
|
||||
|
@ -48,26 +48,14 @@ function process_message ()
|
|||
if not all_service_status[service_name] then all_service_status[service_name] = {} end
|
||||
|
||||
if service.workers then
|
||||
worker_status = compute_status(events, not_up_status, ts, 'workers', service_name, service.workers)
|
||||
worker_status = compute_status(events, not_up_status, ts, 'workers', service_name, service.workers, true)
|
||||
end
|
||||
|
||||
if service.check_api and service.check_api.value then
|
||||
check_api_status = utils.check_api_to_status_map[service.check_api.value]
|
||||
local prev_check_api_status = utils.service_status_map.UNKNOWN
|
||||
if all_service_status[service_name].check_api then
|
||||
prev_check_api_status = all_service_status[service_name].check_api
|
||||
end
|
||||
all_service_status[service_name].check_api = check_api_status
|
||||
if prev_check_api_status and prev_check_api_status ~= check_api_status then
|
||||
events[#events+1] = string.format("endpoint %s -> %s",
|
||||
utils.service_status_to_label_map[prev_check_api_status],
|
||||
utils.service_status_to_label_map[check_api_status])
|
||||
elseif check_api_status == utils.service_status_map.DOWN then
|
||||
not_up_status[#not_up_status+1] = string.format("API status DOWN")
|
||||
end
|
||||
if service.check_api then
|
||||
check_api_status = compute_status(events, not_up_status, ts, 'check_api', service_name, service.check_api, false)
|
||||
end
|
||||
if service.haproxy then
|
||||
haproxy_server_status = compute_status(events, not_up_status, ts, 'haproxy', service_name, service.haproxy)
|
||||
haproxy_server_status = compute_status(events, not_up_status, ts, 'haproxy', service_name, service.haproxy, true)
|
||||
end
|
||||
global_status = max(worker_status, check_api_status, haproxy_server_status)
|
||||
-- global service status
|
||||
|
@ -116,7 +104,7 @@ function set_status(service_name, top_entry, name, status)
|
|||
all_service_status[service_name][top_entry][name] = status
|
||||
end
|
||||
|
||||
function compute_status(events, not_up_status, current_time, elts_name, name, service)
|
||||
function compute_status(events, not_up_status, current_time, elts_name, name, service, display_num)
|
||||
local down_elts = {}
|
||||
local down_elts_count = 0
|
||||
local zero_up = {}
|
||||
|
@ -171,20 +159,26 @@ function compute_status(events, not_up_status, current_time, elts_name, name, se
|
|||
for worker_name, worker in pairs(zero_up) do
|
||||
local prev = get_previous_status(name, elts_name, worker_name)
|
||||
local DOWN = utils.service_status_map.DOWN
|
||||
local event_detail = ""
|
||||
set_status(name, elts_name, worker_name, DOWN)
|
||||
if display_num then
|
||||
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
|
||||
total_elements[worker_name])
|
||||
end
|
||||
|
||||
if prev and prev ~= DOWN then
|
||||
events[#events+1] = string.format("%s %s %s -> %s (%s/%s UP)", worker_name,
|
||||
events[#events+1] = string.format("%s %s %s -> %s %s", worker_name,
|
||||
worker.group_name,
|
||||
utils.service_status_to_label_map[prev],
|
||||
utils.service_status_to_label_map[DOWN],
|
||||
up_elements[worker_name], total_elements[worker_name])
|
||||
event_detail)
|
||||
|
||||
else
|
||||
not_up_status[#not_up_status+1] = string.format("%s %s %s (%s/%s UP)",
|
||||
not_up_status[#not_up_status+1] = string.format("%s %s %s %s",
|
||||
worker_name,
|
||||
worker.group_name,
|
||||
utils.service_status_to_label_map[DOWN],
|
||||
up_elements[worker_name], total_elements[worker_name])
|
||||
event_detail)
|
||||
end
|
||||
utils.add_metric(datapoints, string.format('%s.openstack.%s.%s.%s.status',
|
||||
hostname, name, worker.group_name, worker_name),
|
||||
|
@ -194,6 +188,7 @@ function compute_status(events, not_up_status, current_time, elts_name, name, se
|
|||
for worker_name, worker in pairs(down_elts) do
|
||||
local prev = get_previous_status(name, elts_name, worker_name)
|
||||
local new_status
|
||||
local event_detail
|
||||
if one_up[worker_name] then
|
||||
new_status = utils.service_status_map.DEGRADED
|
||||
else
|
||||
|
@ -205,17 +200,23 @@ function compute_status(events, not_up_status, current_time, elts_name, name, se
|
|||
hostname, name, worker.group_name, worker_name),
|
||||
{current_time, new_status})
|
||||
|
||||
if display_num then
|
||||
event_detail = string.format("(%s/%s UP)", up_elements[worker_name],
|
||||
total_elements[worker_name])
|
||||
else
|
||||
event_detail = ""
|
||||
end
|
||||
if prev ~= new_status then
|
||||
events[#events+1] = string.format("%s %s %s -> %s (%s/%s UP)", worker_name,
|
||||
events[#events+1] = string.format("%s %s %s -> %s %s", worker_name,
|
||||
worker.group_name,
|
||||
utils.service_status_to_label_map[prev],
|
||||
utils.service_status_to_label_map[new_status],
|
||||
up_elements[worker_name], total_elements[worker_name])
|
||||
event_detail)
|
||||
elseif not zero_up[worker_name] then
|
||||
not_up_status[#not_up_status+1] = string.format("%s %s %s (%s/%s UP)", worker_name,
|
||||
not_up_status[#not_up_status+1] = string.format("%s %s %s %s", worker_name,
|
||||
worker.group_name,
|
||||
utils.service_status_to_label_map[new_status],
|
||||
up_elements[worker_name], total_elements[worker_name])
|
||||
event_detail)
|
||||
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue