Rework the GSE filters
This change modifies the implementation of the GSE filters. The main differences are: level-1 dependencies now define the members of a cluster, and the status of a cluster is determined by the highest severity among all of its members; level-2 dependencies are now known as 'hints' — they define relationships between clusters (e.g., Nova depends on Keystone) but have no influence on the status of a cluster. Change-Id: I58bd79463de78b04b9bad92d02e3fb0da4bacdf4
This commit is contained in:
parent
5820faee0f
commit
d49b5fb1c8
|
@ -14,6 +14,7 @@
|
|||
local cjson = require 'cjson'
|
||||
local consts = require 'gse_constants'
|
||||
local string = require 'string'
|
||||
local table = require 'table'
|
||||
local lma = require 'lma_utils'
|
||||
|
||||
local pairs = pairs
|
||||
|
@ -26,9 +27,9 @@ local read_message = read_message
|
|||
local M = {}
|
||||
setfenv(1, M) -- Remove external access to contain everything in the module
|
||||
|
||||
local facts = {}
|
||||
local level_1_deps = {}
|
||||
local level_2_deps = {}
|
||||
local clusters = {}
|
||||
local reverse_cluster_index = {}
|
||||
local ordered_clusters = {}
|
||||
|
||||
local VALID_STATUSES = {
|
||||
[consts.OKAY]=true,
|
||||
|
@ -38,7 +39,7 @@ local VALID_STATUSES = {
|
|||
[consts.UNKW]=true
|
||||
}
|
||||
|
||||
local STATUS_MAPPING_FOR_LEVEL_1 = {
|
||||
local STATUS_MAPPING_FOR_CLUSTERS = {
|
||||
[consts.OKAY]=consts.OKAY,
|
||||
[consts.WARN]=consts.WARN,
|
||||
[consts.CRIT]=consts.CRIT,
|
||||
|
@ -54,87 +55,164 @@ local STATUS_WEIGHTS = {
|
|||
[consts.DOWN]=4
|
||||
}
|
||||
|
||||
local function dependency(deps, superior, subordinate)
|
||||
if not deps[superior] then
|
||||
deps[superior] = {}
|
||||
function add_cluster(cluster_id, members, hints, group_by_hostname)
|
||||
assert(type(members) == 'table')
|
||||
assert(type(hints) == 'table')
|
||||
|
||||
if not clusters[cluster_id] then
|
||||
clusters[cluster_id] = {}
|
||||
end
|
||||
local cluster = clusters[cluster_id]
|
||||
|
||||
cluster.members = members
|
||||
cluster.hints = hints
|
||||
cluster.facts = {}
|
||||
cluster.status = consts.UNKW
|
||||
cluster.alarms={}
|
||||
if group_by_hostname then
|
||||
cluster.group_by_hostname = true
|
||||
else
|
||||
cluster.group_by_hostname = false
|
||||
end
|
||||
|
||||
-- update the reverse index
|
||||
for _, member in ipairs(members) do
|
||||
if not reverse_cluster_index[member] then
|
||||
reverse_cluster_index[member] = {}
|
||||
end
|
||||
local reverse_table = reverse_cluster_index[member]
|
||||
if not lma.table_find(cluster_id, reverse_table) then
|
||||
reverse_table[#reverse_table+1] = cluster_id
|
||||
end
|
||||
end
|
||||
|
||||
if not lma.table_find(cluster_id, ordered_clusters) then
|
||||
local after_index = 1
|
||||
for current_pos, id in ipairs(ordered_clusters) do
|
||||
if lma.table_find(id, cluster.hints) then
|
||||
after_index = current_pos + 1
|
||||
end
|
||||
end
|
||||
|
||||
local index = after_index
|
||||
for _, item in pairs(clusters) do
|
||||
for _, hint in pairs(item.hints) do
|
||||
if hint == cluster_id then
|
||||
local pos = lma.table_pos(hint, cluster_orderings)
|
||||
if pos and pos <= index then
|
||||
index = pos
|
||||
elseif index > after_index then
|
||||
error('circular dependency between clusters!')
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
table.insert(ordered_clusters, index, cluster_id)
|
||||
end
|
||||
local subordinates = deps[superior]
|
||||
subordinates[#subordinates+1] = subordinate
|
||||
end
|
||||
|
||||
-- define a first degree dependency between 2 entities.
|
||||
function level_1_dependency(superior, subordinate)
|
||||
return dependency(level_1_deps, superior, subordinate)
|
||||
function get_ordered_clusters()
|
||||
return ordered_clusters
|
||||
end
|
||||
|
||||
-- define a second degree dependency between 2 entities.
|
||||
function level_2_dependency(superior, subordinate)
|
||||
return dependency(level_2_deps, superior, subordinate)
|
||||
function cluster_exists(cluster_id)
|
||||
return clusters[cluster_id] ~= nil
|
||||
end
|
||||
|
||||
-- store the status of a service and a list of alarms
|
||||
function set_status(service, value, alarms)
|
||||
-- return the list of clusters which depends on a given member
|
||||
function find_cluster_memberships(member_id)
|
||||
return reverse_cluster_index[member_id] or {}
|
||||
end
|
||||
|
||||
-- store the status of a cluster's member and its current alarms
|
||||
function set_member_status(cluster_id, member, value, alarms, hostname)
|
||||
assert(VALID_STATUSES[value])
|
||||
assert(type(alarms) == 'table')
|
||||
facts[service] = {
|
||||
|
||||
local cluster = clusters[cluster_id]
|
||||
if not cluster then
|
||||
return
|
||||
end
|
||||
|
||||
local group_key = '__all_hosts__'
|
||||
if cluster.group_by_hostname then
|
||||
group_key = hostname
|
||||
else
|
||||
hostname = ''
|
||||
end
|
||||
|
||||
if not cluster.facts[member] then
|
||||
cluster.facts[member] = {}
|
||||
end
|
||||
cluster.facts[member][group_key] = {
|
||||
status=value,
|
||||
alarms=alarms
|
||||
alarms=alarms,
|
||||
hostname=hostname
|
||||
}
|
||||
end
|
||||
|
||||
function max_status(current, status)
|
||||
if not status or STATUS_WEIGHTS[current] > STATUS_WEIGHTS[status] then
|
||||
return current
|
||||
else
|
||||
return status
|
||||
end
|
||||
if not status or STATUS_WEIGHTS[current] > STATUS_WEIGHTS[status] then
|
||||
return current
|
||||
else
|
||||
return status
|
||||
end
|
||||
end
|
||||
|
||||
-- The service status depends on the status of the level-1 dependencies.
|
||||
-- The status of the level-2 dependencies don't modify the overall status
|
||||
-- but their alarms are returned.
|
||||
function resolve_status(name)
|
||||
local service_status = consts.UNKW
|
||||
-- The cluster status depends on the status of its members.
|
||||
-- The status of the related clusters (defined by cluster.hints) doesn't modify
|
||||
-- the overall status but their alarms are returned.
|
||||
function resolve_status(cluster_id)
|
||||
local cluster = clusters[cluster_id]
|
||||
assert(cluster)
|
||||
|
||||
cluster.status = consts.UNKW
|
||||
local alarms = {}
|
||||
|
||||
for _, level_1_dep in ipairs(level_1_deps[name] or {}) do
|
||||
if facts[level_1_dep] then
|
||||
local status = STATUS_MAPPING_FOR_LEVEL_1[facts[level_1_dep].status]
|
||||
for _, member in ipairs(cluster.members) do
|
||||
for _, fact in pairs(cluster.facts[member] or {}) do
|
||||
local status = STATUS_MAPPING_FOR_CLUSTERS[fact.status]
|
||||
if status ~= consts.OKAY then
|
||||
for _, v in ipairs(facts[level_1_dep].alarms) do
|
||||
-- append alarms when the member's status isn't okay
|
||||
for _, v in ipairs(fact.alarms) do
|
||||
alarms[#alarms+1] = lma.deepcopy(v)
|
||||
if not alarms[#alarms]['tags'] then
|
||||
alarms[#alarms]['tags'] = {}
|
||||
end
|
||||
alarms[#alarms].tags['dependency'] = level_1_dep
|
||||
alarms[#alarms].tags['dependency_name'] = member
|
||||
alarms[#alarms].tags['dependency_level'] = 'direct'
|
||||
if fact.hostname then
|
||||
alarms[#alarms].hostname = fact.hostname
|
||||
end
|
||||
end
|
||||
end
|
||||
service_status = max_status(service_status, status)
|
||||
cluster.status = max_status(cluster.status, status)
|
||||
end
|
||||
end
|
||||
cluster.alarms = lma.deepcopy(alarms)
|
||||
|
||||
for _, level_2_dep in ipairs(level_2_deps[level_1_dep] or {}) do
|
||||
if facts[level_2_dep] then
|
||||
local status = facts[level_2_dep].status
|
||||
if status ~= consts.OKAY then
|
||||
for _, v in ipairs(facts[level_2_dep].alarms) do
|
||||
alarms[#alarms+1] = lma.deepcopy(v)
|
||||
if not alarms[#alarms]['tags'] then
|
||||
alarms[#alarms]['tags'] = {}
|
||||
end
|
||||
alarms[#alarms].tags['dependency'] = level_2_dep
|
||||
alarms[#alarms].tags['dependency_level'] = 'indirect'
|
||||
if cluster.status ~= consts.OKAY then
|
||||
-- add hints if the cluster isn't healthy
|
||||
for _, member in ipairs(cluster.hints or {}) do
|
||||
local other_cluster = clusters[member]
|
||||
if other_cluster and other_cluster.status ~= OKAY and #other_cluster.alarms > 0 then
|
||||
for _, v in ipairs(other_cluster.alarms) do
|
||||
alarms[#alarms+1] = lma.deepcopy(v)
|
||||
if not alarms[#alarms]['tags'] then
|
||||
alarms[#alarms]['tags'] = {}
|
||||
end
|
||||
alarms[#alarms].tags['dependency_name'] = member
|
||||
alarms[#alarms].tags['dependency_level'] = 'hint'
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
return service_status, alarms
|
||||
return cluster.status, alarms
|
||||
end
|
||||
|
||||
-- compute the cluster metric and inject it into the Heka pipeline
|
||||
-- the metric's value is computed using the status of the subordinates
|
||||
-- the metric's value is computed using the status of its members
|
||||
function inject_cluster_metric(msg_type, cluster_name, metric_name, hostname, interval, source)
|
||||
local payload
|
||||
local status, alarms = resolve_status(cluster_name)
|
||||
|
|
|
@ -194,18 +194,22 @@ function deepcopy(t)
|
|||
return t
|
||||
end
|
||||
|
||||
-- return true if an item is present in the list, else false
|
||||
function table_find(item, list)
|
||||
-- return the position (index) of an item in a list, nil if not found
|
||||
function table_pos(item, list)
|
||||
if type(list) == 'table' then
|
||||
for _, v in ipairs(list) do
|
||||
for i, v in ipairs(list) do
|
||||
if v == item then
|
||||
return true
|
||||
return i
|
||||
end
|
||||
end
|
||||
return false
|
||||
end
|
||||
end
|
||||
|
||||
-- return true if an item is present in the list, else false
|
||||
function table_find(item, list)
|
||||
return table_pos(item, list) ~= nil
|
||||
end
|
||||
|
||||
-- from http://lua-users.org/wiki/SortedIteration
|
||||
function __genOrderedIndex( t )
|
||||
local orderedIndex = {}
|
||||
|
|
|
@ -77,11 +77,11 @@ function process_message()
|
|||
string.format("At least one %s backend is down", service))
|
||||
end
|
||||
|
||||
afd.inject_afd_service_metric(service .. '-backends',
|
||||
afd.inject_afd_service_metric(service,
|
||||
state,
|
||||
read_message('Fields[hostname]'),
|
||||
0,
|
||||
'afd_api_backends')
|
||||
'backends')
|
||||
|
||||
-- reset the cache for this service
|
||||
haproxy_backend_states[service] = {}
|
||||
|
|
|
@ -40,11 +40,11 @@ function process_message()
|
|||
string.format("Endpoint check for %s is failed", service))
|
||||
end
|
||||
|
||||
afd.inject_afd_service_metric(service .. '-endpoint',
|
||||
afd.inject_afd_service_metric(service,
|
||||
state,
|
||||
read_message('Fields[hostname]'),
|
||||
0,
|
||||
'afd_api_endpoint')
|
||||
'endpoint')
|
||||
|
||||
return 0
|
||||
end
|
||||
|
|
|
@ -85,7 +85,7 @@ function process_message()
|
|||
state,
|
||||
read_message('Fields[hostname]'),
|
||||
0,
|
||||
'afd_workers')
|
||||
'workers')
|
||||
|
||||
-- reset the cache for this worker
|
||||
worker_states[worker_key] = {}
|
||||
|
|
|
@ -16,35 +16,31 @@ local cjson = require 'cjson'
|
|||
|
||||
local afd = require 'afd'
|
||||
local gse = require 'gse'
|
||||
local lma = require 'lma_utils'
|
||||
|
||||
local output_message_type = read_config('output_message_type') or error('output_message_type must be specified!')
|
||||
local entity_field = read_config('entity_field') or error('entity_field must be specified!')
|
||||
local cluster_field = read_config('cluster_field')
|
||||
local member_field = read_config('member_field') or error('member_field must be specified!')
|
||||
local output_metric_name = read_config('output_metric_name') or error('output_metric_name must be specified!')
|
||||
local hostname = read_config('hostname') or error('hostname must be specified!')
|
||||
local source = read_config('source') or error('source must be specified!')
|
||||
local topology_file = read_config('topology_file') or error('topology_file must be specified!')
|
||||
local interval = (read_config('interval') or error('interval must be specified!')) + 0
|
||||
local max_inject = (read_config('max_inject') or 10) + 0
|
||||
local interval_in_ns = interval * 1e9
|
||||
|
||||
local is_active = false
|
||||
local last_tick = 0
|
||||
local entities = {}
|
||||
local last_index = nil
|
||||
local topology = require(topology_file)
|
||||
|
||||
for parent, children in pairs(topology.level_1_dependencies) do
|
||||
entities[#entities+1] = parent
|
||||
for _, v in ipairs(children) do
|
||||
gse.level_1_dependency(parent, v)
|
||||
end
|
||||
end
|
||||
for parent, children in pairs(topology.level_2_dependencies) do
|
||||
for _, v in ipairs(children) do
|
||||
gse.level_2_dependency(parent, v)
|
||||
end
|
||||
for cluster_name, attributes in pairs(topology.clusters) do
|
||||
gse.add_cluster(cluster_name, attributes.members, attributes.hints, attributes.group_by_hostname)
|
||||
end
|
||||
|
||||
function process_message()
|
||||
local name = read_message('Fields[name]')
|
||||
local hostname = read_message('Fields[hostname]')
|
||||
if name and name == 'pacemaker_local_resource_active' and read_message("Fields[resource]") == 'vip__management' then
|
||||
if read_message('Fields[value]') == 1 then
|
||||
is_active = true
|
||||
|
@ -54,37 +50,67 @@ function process_message()
|
|||
return 0
|
||||
end
|
||||
|
||||
name = afd.get_entity_name(entity_field)
|
||||
local status = afd.get_status()
|
||||
local alarms = afd.extract_alarms()
|
||||
if not name then
|
||||
return -1, "Cannot find entity's name in the AFD event message"
|
||||
end
|
||||
if not status then
|
||||
return -1, "Cannot find status in the AFD event message"
|
||||
end
|
||||
if not alarms then
|
||||
return -1, "Cannot find alarms in the AFD event message"
|
||||
local member_id = afd.get_entity_name(member_field)
|
||||
if not member_id then
|
||||
return -1, "Cannot find entity's name in the AFD/GSE message"
|
||||
end
|
||||
|
||||
gse.set_status(name, status, alarms)
|
||||
local status = afd.get_status()
|
||||
if not status then
|
||||
return -1, "Cannot find status in the AFD/GSE message"
|
||||
end
|
||||
|
||||
local alarms = afd.extract_alarms()
|
||||
if not alarms then
|
||||
return -1, "Cannot find alarms in the AFD/GSE message"
|
||||
end
|
||||
|
||||
local cluster_ids
|
||||
if cluster_field then
|
||||
local cluster_id = afd.get_entity_name(cluster_field)
|
||||
if not cluster_id then
|
||||
return -1, "Cannot find the cluster's name in the AFD/GSE message"
|
||||
elseif not gse.cluster_exists(cluster_id) then
|
||||
-- Just ignore AFD/GSE messages which aren't part of a cluster's definition
|
||||
return 0
|
||||
end
|
||||
cluster_ids = { cluster_id }
|
||||
else
|
||||
cluster_ids = gse.find_cluster_memberships(member_id)
|
||||
end
|
||||
|
||||
-- update all clusters that depend on this entity
|
||||
for _, cluster_id in ipairs(cluster_ids) do
|
||||
gse.set_member_status(cluster_id, member_id, status, alarms, hostname)
|
||||
end
|
||||
return 0
|
||||
end
|
||||
|
||||
function timer_event(ns)
|
||||
if not is_active or (ns - last_tick) < interval_in_ns then
|
||||
if not is_active or (last_index == nil and (ns - last_tick) < interval_in_ns) then
|
||||
return
|
||||
end
|
||||
last_tick = ns
|
||||
|
||||
for _, cluster_name in ipairs(entities) do
|
||||
gse.inject_cluster_metric(
|
||||
output_message_type,
|
||||
cluster_name,
|
||||
output_metric_name,
|
||||
hostname,
|
||||
interval,
|
||||
source
|
||||
)
|
||||
local injected = 0
|
||||
for i, cluster_name in ipairs(gse.get_ordered_clusters()) do
|
||||
if last_index == nil or i > last_index then
|
||||
gse.inject_cluster_metric(
|
||||
output_message_type,
|
||||
cluster_name,
|
||||
output_metric_name,
|
||||
hostname,
|
||||
interval,
|
||||
source
|
||||
)
|
||||
last_index = i
|
||||
injected = injected + 1
|
||||
|
||||
if injected >= max_inject then
|
||||
return
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
last_index = nil
|
||||
end
|
||||
|
|
|
@ -14,19 +14,20 @@
|
|||
define lma_collector::gse_cluster_filter (
|
||||
$input_message_types,
|
||||
$aggregator_flag,
|
||||
$entity_field,
|
||||
$member_field,
|
||||
$output_message_type,
|
||||
$output_metric_name,
|
||||
$interval = 10,
|
||||
$level_1_dependencies = {},
|
||||
$level_2_dependencies = {},
|
||||
$cluster_field = undef,
|
||||
$clusters = {},
|
||||
$ensure = present,
|
||||
) {
|
||||
include lma_collector::params
|
||||
include heka::params
|
||||
|
||||
validate_array($input_message_types)
|
||||
validate_string($entity_field)
|
||||
validate_string($cluster_field)
|
||||
validate_string($member_field)
|
||||
validate_string($output_metric_name)
|
||||
if size($input_message_types) == 0 {
|
||||
fail('input_message_types cannot be empty')
|
||||
|
@ -59,7 +60,9 @@ define lma_collector::gse_cluster_filter (
|
|||
source => "gse_${title}_filter",
|
||||
interval => $interval,
|
||||
topology_file => $topology_file,
|
||||
entity_field => $entity_field,
|
||||
cluster_field => $cluster_field,
|
||||
member_field => $member_field,
|
||||
max_inject => $lma_collector::params::hekad_max_timer_inject,
|
||||
},
|
||||
require => File[$topology_file],
|
||||
notify => Class['lma_collector::service']
|
||||
|
|
|
@ -83,9 +83,8 @@ class lma_collector::params {
|
|||
# Heka's default value is 1
|
||||
$hekad_max_process_inject = 1
|
||||
|
||||
# The GSE filters can inject up to 20 messages per timer_event() call
|
||||
# Heka's default value is 10
|
||||
$hekad_max_timer_inject = 20
|
||||
$hekad_max_timer_inject = 10
|
||||
|
||||
# Parameters for OpenStack notifications
|
||||
$rabbitmq_host = false
|
||||
|
|
|
@ -24,25 +24,36 @@ describe 'lma_collector::gse_cluster_filter' do
|
|||
let(:params) do
|
||||
{:input_message_types => ['afd_service_metric'],
|
||||
:aggregator_flag => true,
|
||||
:entity_field => 'service',
|
||||
:cluster_field => 'service',
|
||||
:member_field => 'source',
|
||||
:output_message_type => 'gse_service_cluster_metric',
|
||||
:output_metric_name => 'cluster_service_status'}
|
||||
end
|
||||
it { is_expected.to contain_heka__filter__sandbox('gse_service').with_message_matcher("(Fields[name] == 'pacemaker_local_resource_active' && Fields[resource] == 'vip__management') || (Fields[aggregator] != NIL && (Type =~ /afd_service_metric$/))") }
|
||||
it { is_expected.to contain_file('gse_service_topology') }
|
||||
end
|
||||
|
||||
describe 'with dependencies' do
|
||||
let(:params) do
|
||||
{:input_message_types => ['gse_service_cluster_metric', 'gse_node_cluster_metric'],
|
||||
:aggregator_flag => false,
|
||||
:entity_field => 'cluster_name',
|
||||
:member_field => 'cluster_name',
|
||||
:output_message_type => 'gse_cluster_metric',
|
||||
:output_metric_name => 'cluster_status',
|
||||
:level_1_dependencies => {'nova' => ['nova-api','nova-scheduler'],
|
||||
'cinder' => ['cinder-api']},
|
||||
:level_2_dependencies => {'nova-api' => ['neutron-api']}
|
||||
:clusters => {
|
||||
'nova' => {
|
||||
'members' => ['nova-api', 'nova-scheduler', 'controller_nodes'],
|
||||
'group_by_hostname' => false,
|
||||
'hints' => ['keystone']
|
||||
},
|
||||
'keystone' => {
|
||||
'members' => ['keystone-public-api', 'keystone-admin-api', 'controller_nodes'],
|
||||
'group_by_hostname' => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
end
|
||||
it { is_expected.to contain_heka__filter__sandbox('gse_service').with_message_matcher("(Fields[name] == 'pacemaker_local_resource_active' && Fields[resource] == 'vip__management') || (Fields[aggregator] == NIL && (Type =~ /gse_service_cluster_metric$/ || Type =~ /gse_node_cluster_metric$/))") }
|
||||
it { is_expected.to contain_file('gse_service_topology') }
|
||||
end
|
||||
end
|
||||
|
|
|
@ -4,135 +4,250 @@ lma_collector:
|
|||
input_message_types:
|
||||
- afd_service_metric
|
||||
aggregator_flag: true
|
||||
entity_field: service
|
||||
# the field in the input messages to identify the cluster
|
||||
cluster_field: service
|
||||
# the field in the input messages to identify the cluster's member
|
||||
member_field: source
|
||||
output_message_type: gse_service_cluster_metric
|
||||
output_metric_name: cluster_service_status
|
||||
interval: 10
|
||||
level_1_dependencies:
|
||||
clusters:
|
||||
nova-api:
|
||||
- nova-api-backends
|
||||
- nova-ec2-api-backends
|
||||
- nova-endpoint
|
||||
nova-novncproxy:
|
||||
- nova-novncproxy-websocket-backends
|
||||
nova-metadata:
|
||||
- nova-api-metadata-backends
|
||||
members:
|
||||
- backends
|
||||
- endpoint
|
||||
nova-ec2-api:
|
||||
members:
|
||||
- backends
|
||||
nova-novncproxy-websocket:
|
||||
members:
|
||||
- backends
|
||||
nova-metadata-api:
|
||||
members:
|
||||
- backends
|
||||
nova-scheduler:
|
||||
- nova-scheduler
|
||||
members:
|
||||
- workers
|
||||
nova-cert:
|
||||
members:
|
||||
- workers
|
||||
nova-consoleauth:
|
||||
members:
|
||||
- workers
|
||||
nova-compute:
|
||||
- nova-compute
|
||||
members:
|
||||
- workers
|
||||
nova-conductor:
|
||||
- nova-conductor
|
||||
members:
|
||||
- workers
|
||||
cinder-api:
|
||||
- cinder-api-backends
|
||||
- cinder-endpoint
|
||||
- cinder-v2-endpoint
|
||||
members:
|
||||
- backends
|
||||
- endpoint
|
||||
cinder-v2-api:
|
||||
members:
|
||||
# Cinder V2 backends are in fact the same as the Cinder backends
|
||||
- endpoint
|
||||
cinder-scheduler:
|
||||
- cinder-scheduler
|
||||
members:
|
||||
- workers
|
||||
cinder-volume:
|
||||
- cinder-volume
|
||||
members:
|
||||
- workers
|
||||
neutron-api:
|
||||
- neutron-api-backends
|
||||
- neutron-endpoint
|
||||
members:
|
||||
- backends
|
||||
- endpoint
|
||||
neutron-l3:
|
||||
- l3
|
||||
members:
|
||||
- workers
|
||||
neutron-dhcp:
|
||||
- dhcp
|
||||
neutron-ovs:
|
||||
- openvswitch
|
||||
keystone-api:
|
||||
- keystone-public-api-backends
|
||||
- keystone-admin-api-backends
|
||||
- keystone-endpoint
|
||||
members:
|
||||
- workers
|
||||
neutron-metadata:
|
||||
members:
|
||||
- workers
|
||||
neutron-openvswitch:
|
||||
members:
|
||||
- workers
|
||||
keystone-public-api:
|
||||
members:
|
||||
- backends
|
||||
- endpoint
|
||||
keystone-admin-api:
|
||||
members:
|
||||
# TODO(pasquier-s): add a metric reporting the status of the keystone-admin-api endpoint
|
||||
- backends
|
||||
glance-api:
|
||||
- glance-api-backends
|
||||
- glance-endpoint
|
||||
glance-registry:
|
||||
- glance-registry-api-backends
|
||||
members:
|
||||
- backends
|
||||
- endpoint
|
||||
glance-registry-api:
|
||||
members:
|
||||
- backends
|
||||
heat-api:
|
||||
- heat-api-backends
|
||||
- heat-cfn-api-backends
|
||||
- heat-endpoint
|
||||
horizon-ui:
|
||||
members:
|
||||
- backends
|
||||
- endpoint
|
||||
heat-cfn-api:
|
||||
members:
|
||||
- backends
|
||||
- endpoint
|
||||
heat-cloudwatch-api:
|
||||
members:
|
||||
- backends
|
||||
<% if @tls_enabled then -%>
|
||||
- horizon-https-backends
|
||||
horizon-https:
|
||||
members:
|
||||
- backends
|
||||
<% else -%>
|
||||
- horizon-web-backends
|
||||
horizon-ui:
|
||||
members:
|
||||
- backends
|
||||
<% end -%>
|
||||
<% if not @storage_options["objects_ceph"] then -%>
|
||||
swift-api:
|
||||
- swift-api-backends
|
||||
- swift-endpoint
|
||||
- swift-s3-endpoint
|
||||
members:
|
||||
- backends
|
||||
- endpoint
|
||||
swift-s3-api:
|
||||
members:
|
||||
# Swift S3 backends are in fact the same as the Swift backends
|
||||
- endpoint
|
||||
<% end -%>
|
||||
<% if @ceilometer_enabled -%>
|
||||
ceilometer-api:
|
||||
- ceilometer-api-backends
|
||||
- ceilometer-endpoint
|
||||
members:
|
||||
- backends
|
||||
- endpoint
|
||||
<% end -%>
|
||||
level_2_dependencies: {}
|
||||
|
||||
gse_cluster_node:
|
||||
input_message_types:
|
||||
- afd_node_metric
|
||||
aggregator_flag: true
|
||||
entity_field: hostname
|
||||
# the field in the input messages to identify the cluster
|
||||
cluster_field: hostname
|
||||
# the field in the input messages to identify the cluster's member
|
||||
member_field: source
|
||||
output_message_type: gse_node_cluster_metric
|
||||
output_metric_name: cluster_node_status
|
||||
interval: 10
|
||||
level_1_dependencies: {}
|
||||
level_2_dependencies: {}
|
||||
clusters:
|
||||
controller:
|
||||
group_by_hostname: true
|
||||
members:
|
||||
- system
|
||||
- fs
|
||||
compute:
|
||||
group_by_hostname: true
|
||||
members:
|
||||
- system
|
||||
- fs
|
||||
storage:
|
||||
group_by_hostname: true
|
||||
members:
|
||||
- system
|
||||
- fs
|
||||
|
||||
gse_cluster_global:
|
||||
input_message_types:
|
||||
- gse_service_cluster_metric
|
||||
- gse_node_cluster_metric
|
||||
aggregator_flag: false
|
||||
entity_field: cluster_name
|
||||
# the field in the input messages to identify the cluster's member
|
||||
member_field: cluster_name
|
||||
output_message_type: gse_cluster_metric
|
||||
output_metric_name: cluster_status
|
||||
interval: 10
|
||||
level_1_dependencies:
|
||||
clusters:
|
||||
nova:
|
||||
- nova-api
|
||||
- nova-scheduler
|
||||
- nova-compute
|
||||
- nova-conductor
|
||||
- nova-novncproxy
|
||||
- nova-metadata
|
||||
members:
|
||||
- nova-api
|
||||
- nova-ec2-api
|
||||
- nova-metadata-api
|
||||
- nova-scheduler
|
||||
- nova-compute
|
||||
- nova-conductor
|
||||
- nova-cert
|
||||
- nova-consoleauth
|
||||
- nova-novncproxy-websocket
|
||||
- controller
|
||||
- compute
|
||||
hints:
|
||||
- cinder
|
||||
- glance
|
||||
- keystone
|
||||
- neutron
|
||||
cinder:
|
||||
- cinder-api
|
||||
- cinder-scheduler
|
||||
- cinder-volume
|
||||
members:
|
||||
- cinder-api
|
||||
- cinder-v2-api
|
||||
- cinder-scheduler
|
||||
- cinder-volume
|
||||
- controller
|
||||
- storage
|
||||
hints:
|
||||
- keystone
|
||||
neutron:
|
||||
- neutron-api
|
||||
- neutron-l3
|
||||
- neutron-dhcp
|
||||
- neutron-metadata
|
||||
- neutron-ovs
|
||||
members:
|
||||
- neutron-api
|
||||
- neutron-l3
|
||||
- neutron-dhcp
|
||||
- neutron-metadata
|
||||
- neutron-openvswitch
|
||||
- controller
|
||||
hints:
|
||||
- keystone
|
||||
keystone:
|
||||
- keystone-api
|
||||
members:
|
||||
- keystone-public-api
|
||||
- keystone-admin-api
|
||||
- controller
|
||||
hints: []
|
||||
glance:
|
||||
- glance-api
|
||||
- glance-registry
|
||||
members:
|
||||
- glance-api
|
||||
- glance-registry-api
|
||||
- controller
|
||||
hints:
|
||||
- keystone
|
||||
heat:
|
||||
- heat-api
|
||||
members:
|
||||
- heat-api
|
||||
- heat-cfn-api
|
||||
- heat-cloudwatch-api
|
||||
- controller
|
||||
hints:
|
||||
- cinder
|
||||
- glance
|
||||
- keystone
|
||||
- neutron
|
||||
- nova
|
||||
horizon:
|
||||
- horizon-ui
|
||||
members:
|
||||
<% if @tls_enabled then -%>
|
||||
- horizon-https
|
||||
<% else -%>
|
||||
- horizon-ui
|
||||
<% end -%>
|
||||
- controller
|
||||
hints:
|
||||
- keystone
|
||||
<% if not @storage_options["objects_ceph"] then -%>
|
||||
swift:
|
||||
- swift-api
|
||||
members:
|
||||
- swift-api
|
||||
- swift-s3-api
|
||||
- controller
|
||||
hints:
|
||||
- keystone
|
||||
<% end -%>
|
||||
<% if @ceilometer_enabled -%>
|
||||
ceilometer:
|
||||
members:
|
||||
- ceilometer-api
|
||||
- controller
|
||||
hints:
|
||||
- keystone
|
||||
<% end -%>
|
||||
level_2_dependencies:
|
||||
nova-api:
|
||||
- neutron-api
|
||||
- keystone-api
|
||||
- cinder-api
|
||||
- glance-api
|
||||
cinder-api:
|
||||
- keystone-api
|
||||
neutron-api:
|
||||
- keystone-api
|
||||
glance-api:
|
||||
- keystone-api
|
||||
heat-api:
|
||||
- keystone-api
|
||||
|
||||
|
|
|
@ -14,15 +14,13 @@
|
|||
local M = {}
|
||||
setfenv(1, M) -- Remove external access to contain everything in the module
|
||||
|
||||
level_1_dependencies = {
|
||||
<% @level_1_dependencies.keys().sort().each do |k| -%>
|
||||
['<%= k.to_s().gsub("'"){"\\'"} %>']={<%= @level_1_dependencies[k].collect{ |x| "'" + x.to_s().gsub("'"){"\\'"} + "'"}.join(',') %>},
|
||||
<% end -%>
|
||||
}
|
||||
|
||||
level_2_dependencies = {
|
||||
<% @level_2_dependencies.keys().sort().each do |k| -%>
|
||||
['<%= k.to_s().gsub("'"){"\\'"} %>']={<%= @level_2_dependencies[k].collect{ |x| "'" + x.to_s().gsub("'"){"\\'"} + "'"}.join(',') %>},
|
||||
clusters = {
|
||||
<% @clusters.keys().sort().each do |cluster_id| -%>
|
||||
['<%= cluster_id.to_s().gsub("'"){"\\'"} %>']={
|
||||
['members']={<%= @clusters[cluster_id]['members'].sort().collect{ |x| "'" + x.to_s().gsub("'"){"\\'"} + "'"}.join(',') %>},
|
||||
['hints']={<%= (@clusters[cluster_id]['hints'] || []).sort().collect{ |x| "'" + x.to_s().gsub("'"){"\\'"} + "'"}.join(',') %>},
|
||||
['group_by_hostname']=<%= @clusters[cluster_id]['group_by_hostname'] ? 'true' : 'false' %>
|
||||
},
|
||||
<% end -%>
|
||||
}
|
||||
|
||||
|
|
|
@ -21,78 +21,105 @@ function inject_message(msg)
|
|||
last_injected_msg = msg
|
||||
end
|
||||
|
||||
local gse = require('gse')
|
||||
--local gse = require('gse')
|
||||
local consts = require('gse_constants')
|
||||
|
||||
-- configure relations and dependencies
|
||||
gse.level_1_dependency("keystone", "keystone_admin")
|
||||
gse.level_1_dependency("keystone", "keystone_main")
|
||||
gse.level_1_dependency("neutron", "neutron_api")
|
||||
gse.level_1_dependency("nova", "nova_api")
|
||||
gse.level_1_dependency("nova", "keystone_api")
|
||||
gse.level_1_dependency("nova", "nova_ec2_api")
|
||||
gse.level_1_dependency("nova", "nova_scheduler")
|
||||
gse.level_1_dependency("glance", "glance_api")
|
||||
gse.level_1_dependency("glance", "glance_registry")
|
||||
local gse = require('gse')
|
||||
|
||||
gse.level_2_dependency("nova_api", "neutron_api")
|
||||
gse.level_2_dependency("nova_scheduler", "rabbitmq")
|
||||
-- define clusters
|
||||
gse.add_cluster("heat", {'heat-api'}, {'nova', 'glance', 'neutron', 'keystone', 'rabbitmq'}, false)
|
||||
gse.add_cluster("nova", {'nova-api', 'nova-ec2-api', 'nova-scheduler'}, {'glance', 'neutron', 'keystone', 'rabbitmq'}, false)
|
||||
gse.add_cluster("neutron", {'neutron-api'}, {'keystone', 'rabbitmq'}, false)
|
||||
gse.add_cluster("keystone", {'keystone-admin-api', 'keystone-public-api'}, {}, false)
|
||||
gse.add_cluster("glance", {'glance-api', 'glance-registry-api', 'foobar'}, {'keystone'}, false)
|
||||
gse.add_cluster("rabbitmq", {'rabbitmq-cluster', 'foobar'}, {}, true)
|
||||
|
||||
-- provision facts
|
||||
gse.set_status("keystone_admin", consts.OKAY, {})
|
||||
gse.set_status("neutron_api", consts.DOWN, {{message="All neutron endpoints are down"}})
|
||||
gse.set_status("keystone_api", consts.CRIT, {{message="All keystone endpoints are critical"}})
|
||||
gse.set_status("nova_api", consts.OKAY, {})
|
||||
gse.set_status("nova_ec2_api", consts.OKAY, {})
|
||||
gse.set_status("nova_scheduler", consts.OKAY, {})
|
||||
gse.set_status("rabbitmq", consts.WARN, {{message="1 RabbitMQ node out of 3 is down"}})
|
||||
gse.set_status("glance_api", consts.WARN, {{message="glance-api endpoint is down on node-1"}})
|
||||
gse.set_status("glance_registry", consts.DOWN, {{message='glance-registry endpoints are down'}})
|
||||
gse.set_member_status("neutron", "neutron-api", consts.DOWN, {{message="All neutron endpoints are down"}}, 'node-1')
|
||||
gse.set_member_status('keystone', 'keystone-admin-api', consts.OKAY, {}, 'node-1')
|
||||
gse.set_member_status('glance', "glance-api", consts.WARN, {{message="glance-api endpoint is down on node-1"}}, 'node-1')
|
||||
gse.set_member_status('glance', "glance-registry-api", consts.DOWN, {{message='glance-registry endpoints are down'}}, 'node-1')
|
||||
gse.set_member_status("rabbitmq", 'rabbitmq-cluster', consts.WARN, {{message="1 RabbitMQ node out of 3 is down"}}, 'node-2')
|
||||
gse.set_member_status("rabbitmq", 'rabbitmq-cluster', consts.OKAY, {}, 'node-1')
|
||||
gse.set_member_status("rabbitmq", 'rabbitmq-cluster', consts.OKAY, {}, 'node-3')
|
||||
gse.set_member_status('heat', "heat-api", consts.WARN, {{message='5xx errors detected'}}, 'node-1')
|
||||
gse.set_member_status('nova', "nova-api", consts.OKAY, {}, 'node-1')
|
||||
gse.set_member_status('nova', "nova-ec2_api", consts.OKAY, {}, 'node-1')
|
||||
gse.set_member_status('nova', "nova-scheduler", consts.OKAY, {}, 'node-1')
|
||||
|
||||
TestGse = {}
|
||||
|
||||
-- Verify the iteration order of the clusters; it appears to follow the
-- hint relationships (a cluster comes after the ones it is hinted by) —
-- TODO confirm against gse.get_ordered_clusters().
-- NOTE(review): a stale 'test_keystone_is_okay' header preceded this
-- function (leftover from the pre-rework suite) and has been dropped.
function TestGse:test_ordered_clusters()
    local ordered_clusters = gse.get_ordered_clusters()
    assertEquals(#ordered_clusters, 6)
    assertEquals(ordered_clusters[1], 'rabbitmq')
    assertEquals(ordered_clusters[2], 'keystone')
    assertEquals(ordered_clusters[3], 'glance')
    assertEquals(ordered_clusters[4], 'neutron')
    assertEquals(ordered_clusters[5], 'nova')
    assertEquals(ordered_clusters[6], 'heat')
end
|
||||
|
||||
|
||||
function TestGse:test_01_rabbitmq_is_warning()
    -- rabbitmq-cluster was reported WARN on node-2 and OKAY on node-1/node-3;
    -- the highest severity among members wins, with a single alarm attached.
    local cluster_status, cluster_alarms = gse.resolve_status('rabbitmq')
    assertEquals(cluster_status, consts.WARN)
    assertEquals(#cluster_alarms, 1)
    local alarm = cluster_alarms[1]
    assertEquals(alarm.hostname, 'node-2')
    assertEquals(alarm.tags.dependency_name, 'rabbitmq-cluster')
    assertEquals(alarm.tags.dependency_level, 'direct')
end
|
||||
|
||||
function TestGse:test_02_keystone_is_okay()
    -- Every provisioned keystone member is OKAY, so no alarm is attached.
    local cluster_status, cluster_alarms = gse.resolve_status('keystone')
    assertEquals(cluster_status, consts.OKAY)
    assertEquals(#cluster_alarms, 0)
end
|
||||
|
||||
function TestGse:test_cinder_is_unknown()
    -- No status fact was ever provisioned for cinder in the fixtures above,
    -- so the cluster resolves to UNKW with no alarms.
    local cluster_status, cluster_alarms = gse.resolve_status('cinder')
    assertEquals(cluster_status, consts.UNKW)
    assertEquals(#cluster_alarms, 0)
end
|
||||
|
||||
-- NOTE(review): this test targets the pre-rework API (a single alarm with a
-- 'dependency' tag) and appears superseded by test_04_neutron_is_down below,
-- which checks 'dependency_name' and the extra 'hint' alarm — confirm whether
-- this one should be removed.
function TestGse:test_neutron_is_down()
    local status, alarms = gse.resolve_status('neutron')
    assertEquals(status, consts.DOWN)
    assertEquals(#alarms, 1)
    assertEquals(alarms[1].tags.dependency, 'neutron_api')
    assertEquals(alarms[1].tags.dependency_level, 'direct')
end
|
||||
|
||||
-- NOTE(review): pre-rework expectations — nova marked CRIT through direct and
-- indirect dependencies. Under the reworked semantics (status derived only
-- from members, hints carry no influence) this appears superseded by
-- test_05_nova_is_okay below — confirm whether it should be removed.
function TestGse:test_nova_is_critical()
    local status, alarms = gse.resolve_status('nova')
    assertEquals(status, consts.CRIT)
    assertEquals(#alarms, 3)
    assertEquals(alarms[1].tags.dependency, 'neutron_api')
    assertEquals(alarms[1].tags.dependency_level, 'indirect')
    assertEquals(alarms[2].tags.dependency, 'keystone_api')
    assertEquals(alarms[2].tags.dependency_level, 'direct')
    assertEquals(alarms[3].tags.dependency, 'rabbitmq')
    assertEquals(alarms[3].tags.dependency_level, 'indirect')
end
|
||||
|
||||
-- glance aggregates a WARN member (glance-api) and a DOWN member
-- (glance-registry-api); the highest severity wins, so the cluster is DOWN
-- and both member alarms are reported with an empty hostname (not grouped
-- by host).
-- NOTE(review): the stale 'test_glance_is_down' header and the pre-rework
-- 'tags.dependency' assertions that were interleaved here have been dropped;
-- 'test_03_glance_is_down' with 'tags.dependency_name' is the current form.
function TestGse:test_03_glance_is_down()
    local status, alarms = gse.resolve_status('glance')
    assertEquals(status, consts.DOWN)
    assertEquals(#alarms, 2)
    assertEquals(alarms[1].hostname, '')
    assertEquals(alarms[1].tags.dependency_name, 'glance-api')
    assertEquals(alarms[1].tags.dependency_level, 'direct')
    assertEquals(alarms[2].hostname, '')
    assertEquals(alarms[2].tags.dependency_name, 'glance-registry-api')
    assertEquals(alarms[2].tags.dependency_level, 'direct')
end
|
||||
|
||||
function TestGse:test_04_neutron_is_down()
    -- The neutron-api member is DOWN (direct alarm); the rabbitmq hint
    -- contributes its alarm without affecting the cluster status.
    local cluster_status, cluster_alarms = gse.resolve_status('neutron')
    assertEquals(cluster_status, consts.DOWN)
    assertEquals(#cluster_alarms, 2)
    local direct_alarm, hint_alarm = cluster_alarms[1], cluster_alarms[2]
    assertEquals(direct_alarm.tags.dependency_name, 'neutron-api')
    assertEquals(direct_alarm.tags.dependency_level, 'direct')
    assertEquals(hint_alarm.tags.dependency_name, 'rabbitmq')
    assertEquals(hint_alarm.tags.dependency_level, 'hint')
end
|
||||
|
||||
function TestGse:test_05_nova_is_okay()
    -- All nova members were reported OKAY; per the reworked semantics the
    -- status depends only on members, so the cluster is OKAY with no alarms.
    local cluster_status, cluster_alarms = gse.resolve_status('nova')
    assertEquals(cluster_status, consts.OKAY)
    assertEquals(#cluster_alarms, 0)
end
|
||||
|
||||
function TestGse:test_06_heat_is_warning_with_hints()
    -- heat itself is only WARN (heat-api member), while the alarms of the
    -- hinted clusters (glance twice, neutron, rabbitmq) ride along tagged as
    -- 'hint' without degrading the status.
    local cluster_status, cluster_alarms = gse.resolve_status('heat')
    assertEquals(cluster_status, consts.WARN)
    assertEquals(#cluster_alarms, 5)
    local expected_alarms = {
        {name = 'heat-api', level = 'direct'},
        {name = 'glance',   level = 'hint'},
        {name = 'glance',   level = 'hint'},
        {name = 'neutron',  level = 'hint'},
        {name = 'rabbitmq', level = 'hint'},
    }
    for i, expected in ipairs(expected_alarms) do
        assertEquals(cluster_alarms[i].tags.dependency_name, expected.name)
        assertEquals(cluster_alarms[i].tags.dependency_level, expected.level)
    end
end
|
||||
|
||||
function TestGse:test_inject_cluster_metric_for_nova()
|
||||
gse.inject_cluster_metric(
|
||||
'gse_service_cluster_metric',
|
||||
|
@ -106,10 +133,10 @@ TestGse = {}
|
|||
assertEquals(metric.Type, 'gse_service_cluster_metric')
|
||||
assertEquals(metric.Fields.cluster_name, 'nova')
|
||||
assertEquals(metric.Fields.name, 'service_cluster_status')
|
||||
assertEquals(metric.Fields.value, consts.CRIT)
|
||||
assertEquals(metric.Fields.value, consts.OKAY)
|
||||
assertEquals(metric.Fields.hostname, 'node-1')
|
||||
assertEquals(metric.Fields.interval, 10)
|
||||
assert(metric.Payload:match("All neutron endpoints are down"))
|
||||
assertEquals(metric.Payload, '{"alarms":[]}')
|
||||
end
|
||||
|
||||
function TestGse:test_inject_cluster_metric_for_glance()
|
||||
|
@ -132,10 +159,10 @@ TestGse = {}
|
|||
assert(metric.Payload:match("glance%-api endpoint is down on node%-1"))
|
||||
end
|
||||
|
||||
function TestGse:test_inject_cluster_metric_for_keystone()
|
||||
function TestGse:test_inject_cluster_metric_for_heat()
|
||||
gse.inject_cluster_metric(
|
||||
'gse_service_cluster_metric',
|
||||
'keystone',
|
||||
'heat',
|
||||
'service_cluster_status',
|
||||
'node-1',
|
||||
10,
|
||||
|
@ -143,12 +170,13 @@ TestGse = {}
|
|||
)
|
||||
local metric = last_injected_msg
|
||||
assertEquals(metric.Type, 'gse_service_cluster_metric')
|
||||
assertEquals(metric.Fields.cluster_name, 'keystone')
|
||||
assertEquals(metric.Fields.cluster_name, 'heat')
|
||||
assertEquals(metric.Fields.name, 'service_cluster_status')
|
||||
assertEquals(metric.Fields.value, consts.OKAY)
|
||||
assertEquals(metric.Fields.value, consts.WARN)
|
||||
assertEquals(metric.Fields.hostname, 'node-1')
|
||||
assertEquals(metric.Fields.interval, 10)
|
||||
assertEquals(metric.Payload, '{"alarms":[]}')
|
||||
assert(metric.Payload:match("5xx errors detected"))
|
||||
assert(metric.Payload:match("1 RabbitMQ node out of 3 is down"))
|
||||
end
|
||||
|
||||
function TestGse:test_max_status()
|
||||
|
@ -162,7 +190,13 @@ TestGse = {}
|
|||
assertEquals(consts.DOWN, status)
|
||||
end
|
||||
|
||||
lu = LuaUnit
|
||||
function TestGse:test_reverse_index()
    -- 'foobar' was declared as a member of the rabbitmq cluster above
    -- (and presumably of glance earlier in the fixtures); the reverse index
    -- must map the member back to both clusters.
    local memberships = gse.find_cluster_memberships('foobar')
    assertEquals(#memberships, 2)
    assertEquals(memberships[1], 'glance')
    assertEquals(memberships[2], 'rabbitmq')
end
|
||||
|
||||
-- Run the whole suite at minimal verbosity and exit with LuaUnit's
-- result code so CI can detect failures.
lu = LuaUnit
lu:setVerbosity( 1 )
os.exit( lu:run() )
|
||||
|
|
Loading…
Reference in New Issue