Rework the GSE filters

This change modifies the implementation of the GSE filters. The main
differences are:

- level-1 dependencies now define the members of a cluster, and the
  status of a cluster is the highest severity reported among all of
  its members.
- level-2 dependencies are now known as 'hints': they define
  relationships between clusters (e.g. Nova depends on Keystone) but
  have no influence on the status of a cluster (see the sketch below).
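
For illustration, a minimal sketch of the new API, based on the
add_cluster() signature introduced by this change (names abridged from
the shipped topology):

    -- add_cluster(cluster_id, members, hints, group_by_hostname)
    gse.add_cluster('nova', {'nova-api', 'nova-scheduler'}, {'keystone'}, false)
    -- 'nova-api' and 'nova-scheduler' drive the nova status (the
    -- highest severity wins); the 'keystone' hint only contributes
    -- extra alarms to the output, never the status.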

Change-Id: I58bd79463de78b04b9bad92d02e3fb0da4bacdf4
Simon Pasquier 2015-10-05 18:44:05 +02:00
parent 5820faee0f
commit d49b5fb1c8
12 changed files with 524 additions and 256 deletions

View File

@@ -14,6 +14,7 @@
local cjson = require 'cjson'
local consts = require 'gse_constants'
local string = require 'string'
local table = require 'table'
local lma = require 'lma_utils'
local pairs = pairs
@@ -26,9 +27,9 @@ local read_message = read_message
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
local facts = {}
local level_1_deps = {}
local level_2_deps = {}
local clusters = {}
local reverse_cluster_index = {}
local ordered_clusters = {}
local VALID_STATUSES = {
[consts.OKAY]=true,
@@ -38,7 +39,7 @@ local VALID_STATUSES = {
[consts.UNKW]=true
}
local STATUS_MAPPING_FOR_LEVEL_1 = {
local STATUS_MAPPING_FOR_CLUSTERS = {
[consts.OKAY]=consts.OKAY,
[consts.WARN]=consts.WARN,
[consts.CRIT]=consts.CRIT,
@@ -54,87 +55,164 @@ local STATUS_WEIGHTS = {
[consts.DOWN]=4
}
local function dependency(deps, superior, subordinate)
if not deps[superior] then
deps[superior] = {}
function add_cluster(cluster_id, members, hints, group_by_hostname)
assert(type(members) == 'table')
assert(type(hints) == 'table')
if not clusters[cluster_id] then
clusters[cluster_id] = {}
end
local cluster = clusters[cluster_id]
cluster.members = members
cluster.hints = hints
cluster.facts = {}
cluster.status = consts.UNKW
cluster.alarms={}
if group_by_hostname then
cluster.group_by_hostname = true
else
cluster.group_by_hostname = false
end
-- update the reverse index
for _, member in ipairs(members) do
if not reverse_cluster_index[member] then
reverse_cluster_index[member] = {}
end
local reverse_table = reverse_cluster_index[member]
if not lma.table_find(cluster_id, reverse_table) then
reverse_table[#reverse_table+1] = cluster_id
end
end
if not lma.table_find(cluster_id, ordered_clusters) then
local after_index = 1
for current_pos, id in ipairs(ordered_clusters) do
if lma.table_find(id, cluster.hints) then
after_index = current_pos + 1
end
end
local index = after_index
for _, item in pairs(clusters) do
for _, hint in pairs(item.hints) do
if hint == cluster_id then
local pos = lma.table_pos(hint, ordered_clusters)
if pos and pos <= index then
index = pos
elseif index > after_index then
error('circular dependency between clusters!')
end
end
end
end
table.insert(ordered_clusters, index, cluster_id)
end
local subordinates = deps[superior]
subordinates[#subordinates+1] = subordinate
end
-- define a first degree dependency between 2 entities.
function level_1_dependency(superior, subordinate)
return dependency(level_1_deps, superior, subordinate)
function get_ordered_clusters()
return ordered_clusters
end
-- define a second degree dependency between 2 entities.
function level_2_dependency(superior, subordinate)
return dependency(level_2_deps, superior, subordinate)
function cluster_exists(cluster_id)
return clusters[cluster_id] ~= nil
end
-- store the status of a service and a list of alarms
function set_status(service, value, alarms)
-- return the list of clusters which depend on a given member
function find_cluster_memberships(member_id)
return reverse_cluster_index[member_id] or {}
end
-- store the status of a cluster's member and its current alarms
function set_member_status(cluster_id, member, value, alarms, hostname)
assert(VALID_STATUSES[value])
assert(type(alarms) == 'table')
facts[service] = {
local cluster = clusters[cluster_id]
if not cluster then
return
end
local group_key = '__all_hosts__'
if cluster.group_by_hostname then
group_key = hostname
else
hostname = ''
end
if not cluster.facts[member] then
cluster.facts[member] = {}
end
cluster.facts[member][group_key] = {
status=value,
alarms=alarms
alarms=alarms,
hostname=hostname
}
end
function max_status(current, status)
if not status or STATUS_WEIGHTS[current] > STATUS_WEIGHTS[status] then
return current
else
return status
end
end
-- The service status depends on the status of the level-1 dependencies.
-- The status of the level-2 dependencies doesn't modify the overall status
-- but their alarms are returned.
function resolve_status(name)
local service_status = consts.UNKW
-- The cluster status depends on the status of its members.
-- The status of the related clusters (defined by cluster.hints) doesn't modify
-- the overall status but their alarms are returned.
function resolve_status(cluster_id)
local cluster = clusters[cluster_id]
assert(cluster)
cluster.status = consts.UNKW
local alarms = {}
for _, level_1_dep in ipairs(level_1_deps[name] or {}) do
if facts[level_1_dep] then
local status = STATUS_MAPPING_FOR_LEVEL_1[facts[level_1_dep].status]
for _, member in ipairs(cluster.members) do
for _, fact in pairs(cluster.facts[member] or {}) do
local status = STATUS_MAPPING_FOR_CLUSTERS[fact.status]
if status ~= consts.OKAY then
for _, v in ipairs(facts[level_1_dep].alarms) do
-- append alarms when the member's status isn't okay
for _, v in ipairs(fact.alarms) do
alarms[#alarms+1] = lma.deepcopy(v)
if not alarms[#alarms]['tags'] then
alarms[#alarms]['tags'] = {}
end
alarms[#alarms].tags['dependency'] = level_1_dep
alarms[#alarms].tags['dependency_name'] = member
alarms[#alarms].tags['dependency_level'] = 'direct'
if fact.hostname then
alarms[#alarms].hostname = fact.hostname
end
end
end
service_status = max_status(service_status, status)
cluster.status = max_status(cluster.status, status)
end
end
cluster.alarms = lma.deepcopy(alarms)
for _, level_2_dep in ipairs(level_2_deps[level_1_dep] or {}) do
if facts[level_2_dep] then
local status = facts[level_2_dep].status
if status ~= consts.OKAY then
for _, v in ipairs(facts[level_2_dep].alarms) do
alarms[#alarms+1] = lma.deepcopy(v)
if not alarms[#alarms]['tags'] then
alarms[#alarms]['tags'] = {}
end
alarms[#alarms].tags['dependency'] = level_2_dep
alarms[#alarms].tags['dependency_level'] = 'indirect'
if cluster.status ~= consts.OKAY then
-- add hints if the cluster isn't healthy
for _, member in ipairs(cluster.hints or {}) do
local other_cluster = clusters[member]
if other_cluster and other_cluster.status ~= consts.OKAY and #other_cluster.alarms > 0 then
for _, v in ipairs(other_cluster.alarms) do
alarms[#alarms+1] = lma.deepcopy(v)
if not alarms[#alarms]['tags'] then
alarms[#alarms]['tags'] = {}
end
alarms[#alarms].tags['dependency_name'] = member
alarms[#alarms].tags['dependency_level'] = 'hint'
end
end
end
end
return service_status, alarms
return cluster.status, alarms
end
-- compute the cluster metric and inject it into the Heka pipeline
-- the metric's value is computed using the status of the subordinates
-- the metric's value is computed using the status of its members
function inject_cluster_metric(msg_type, cluster_name, metric_name, hostname, interval, source)
local payload
local status, alarms = resolve_status(cluster_name)
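
Taken together, the new aggregation logic boils down to: a cluster's
status is the worst status reported by its members, and hints only
enrich the alarm list. A minimal sketch, with made-up cluster and
member names, of how the reworked API is exercised:

    local gse = require 'gse'
    local consts = require 'gse_constants'

    gse.add_cluster('web', {'api', 'workers'}, {'db'}, false)
    gse.set_member_status('web', 'api', consts.OKAY, {}, 'node-1')
    gse.set_member_status('web', 'workers', consts.CRIT,
                          {{message='all workers are down'}}, 'node-1')

    local status, alarms = gse.resolve_status('web')
    -- status == consts.CRIT: the highest severity among members wins.
    -- alarms holds the workers' alarms tagged dependency_level='direct';
    -- if the 'db' cluster were unhealthy, its alarms would be appended
    -- with dependency_level='hint' without changing the status.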

View File

@@ -194,18 +194,22 @@ function deepcopy(t)
return t
end
-- return true if an item is present in the list, else false
function table_find(item, list)
-- return the position (index) of an item in a list, nil if not found
function table_pos(item, list)
if type(list) == 'table' then
for _, v in ipairs(list) do
for i, v in ipairs(list) do
if v == item then
return true
return i
end
end
return false
end
end
-- return true if an item is present in the list, else false
function table_find(item, list)
return table_pos(item, list) ~= nil
end
-- from http://lua-users.org/wiki/SortedIteration
function __genOrderedIndex( t )
local orderedIndex = {}
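
The split into table_pos() and table_find() keeps the old boolean
helper while exposing the index needed by the cluster-ordering code in
gse.lua. For example:

    local lma = require 'lma_utils'
    lma.table_pos('b', {'a', 'b', 'c'})   -- returns 2
    lma.table_pos('z', {'a', 'b', 'c'})   -- returns nil
    lma.table_find('b', {'a', 'b', 'c'})  -- returns true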

View File

@@ -77,11 +77,11 @@ function process_message()
string.format("At least one %s backend is down", service))
end
afd.inject_afd_service_metric(service .. '-backends',
afd.inject_afd_service_metric(service,
state,
read_message('Fields[hostname]'),
0,
'afd_api_backends')
'backends')
-- reset the cache for this service
haproxy_backend_states[service] = {}

View File

@@ -40,11 +40,11 @@ function process_message()
string.format("Endpoint check for %s is failed", service))
end
afd.inject_afd_service_metric(service .. '-endpoint',
afd.inject_afd_service_metric(service,
state,
read_message('Fields[hostname]'),
0,
'afd_api_endpoint')
'endpoint')
return 0
end

View File

@@ -85,7 +85,7 @@ function process_message()
state,
read_message('Fields[hostname]'),
0,
'afd_workers')
'workers')
-- reset the cache for this worker
worker_states[worker_key] = {}
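
These renames matter for the GSE configuration below: the AFD service
metrics now carry the bare service name (e.g. 'nova-api' instead of
'nova-api-backends') and report their source as plain 'backends',
'endpoint' or 'workers', which is exactly what the members lists of the
new cluster topology refer to. A sketch with hypothetical field values:

    -- an afd_service_metric emitted by the haproxy filter now looks like:
    --   Fields[service] = 'nova-api', Fields[source] = 'backends'
    -- and matches the 'backends' entry of the nova-api members list.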

View File

@@ -16,35 +16,31 @@ local cjson = require 'cjson'
local afd = require 'afd'
local gse = require 'gse'
local lma = require 'lma_utils'
local output_message_type = read_config('output_message_type') or error('output_message_type must be specified!')
local entity_field = read_config('entity_field') or error('entity_field must be specified!')
local cluster_field = read_config('cluster_field')
local member_field = read_config('member_field') or error('member_field must be specified!')
local output_metric_name = read_config('output_metric_name') or error('output_metric_name must be specified!')
local hostname = read_config('hostname') or error('hostname must be specified!')
local source = read_config('source') or error('source must be specified!')
local topology_file = read_config('topology_file') or error('topology_file must be specified!')
local interval = (read_config('interval') or error('interval must be specified!')) + 0
local max_inject = (read_config('max_inject') or 10) + 0
local interval_in_ns = interval * 1e9
local is_active = false
local last_tick = 0
local entities = {}
local last_index = nil
local topology = require(topology_file)
for parent, children in pairs(topology.level_1_dependencies) do
entities[#entities+1] = parent
for _, v in ipairs(children) do
gse.level_1_dependency(parent, v)
end
end
for parent, children in pairs(topology.level_2_dependencies) do
for _, v in ipairs(children) do
gse.level_2_dependency(parent, v)
end
for cluster_name, attributes in pairs(topology.clusters) do
gse.add_cluster(cluster_name, attributes.members, attributes.hints, attributes.group_by_hostname)
end
function process_message()
local name = read_message('Fields[name]')
local hostname = read_message('Fields[hostname]')
if name and name == 'pacemaker_local_resource_active' and read_message("Fields[resource]") == 'vip__management' then
if read_message('Fields[value]') == 1 then
is_active = true
@@ -54,37 +50,67 @@ function process_message()
return 0
end
name = afd.get_entity_name(entity_field)
local status = afd.get_status()
local alarms = afd.extract_alarms()
if not name then
return -1, "Cannot find entity's name in the AFD event message"
end
if not status then
return -1, "Cannot find status in the AFD event message"
end
if not alarms then
return -1, "Cannot find alarms in the AFD event message"
local member_id = afd.get_entity_name(member_field)
if not member_id then
return -1, "Cannot find entity's name in the AFD/GSE message"
end
gse.set_status(name, status, alarms)
local status = afd.get_status()
if not status then
return -1, "Cannot find status in the AFD/GSE message"
end
local alarms = afd.extract_alarms()
if not alarms then
return -1, "Cannot find alarms in the AFD/GSE message"
end
local cluster_ids
if cluster_field then
local cluster_id = afd.get_entity_name(cluster_field)
if not cluster_id then
return -1, "Cannot find the cluster's name in the AFD/GSE message"
elseif not gse.cluster_exists(cluster_id) then
-- Just ignore AFD/GSE messages which aren't part of a cluster's definition
return 0
end
cluster_ids = { cluster_id }
else
cluster_ids = gse.find_cluster_memberships(member_id)
end
-- update all clusters that depend on this member
for _, cluster_id in ipairs(cluster_ids) do
gse.set_member_status(cluster_id, member_id, status, alarms, hostname)
end
return 0
end
function timer_event(ns)
if not is_active or (ns - last_tick) < interval_in_ns then
if not is_active or (last_index == nil and (ns - last_tick) < interval_in_ns) then
return
end
last_tick = ns
for _, cluster_name in ipairs(entities) do
gse.inject_cluster_metric(
output_message_type,
cluster_name,
output_metric_name,
hostname,
interval,
source
)
local injected = 0
for i, cluster_name in ipairs(gse.get_ordered_clusters()) do
if last_index == nil or i > last_index then
gse.inject_cluster_metric(
output_message_type,
cluster_name,
output_metric_name,
hostname,
interval,
source
)
last_index = i
injected = injected + 1
if injected >= max_inject then
return
end
end
end
last_index = nil
end
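
The rewritten timer_event() spreads the metric injection across ticker
calls: clusters are emitted in topological order, at most max_inject
per call, and last_index records where to resume so the interval check
is skipped until the batch completes. This is also why
hekad_max_timer_inject can drop back to Heka's default of 10 in the
params class below. A sketch of the resulting behaviour, assuming 25
clusters and max_inject = 10:

    -- timer_event(t0): injects clusters 1..10, then returns (last_index = 10)
    -- timer_event(t1): last_index ~= nil, so the interval check is skipped;
    --                  injects clusters 11..20 (last_index = 20)
    -- timer_event(t2): injects clusters 21..25, resets last_index to nil
    -- timer_event(t3): waits for the next interval before starting over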

View File

@@ -14,19 +14,20 @@
define lma_collector::gse_cluster_filter (
$input_message_types,
$aggregator_flag,
$entity_field,
$member_field,
$output_message_type,
$output_metric_name,
$interval = 10,
$level_1_dependencies = {},
$level_2_dependencies = {},
$cluster_field = undef,
$clusters = {},
$ensure = present,
) {
include lma_collector::params
include heka::params
validate_array($input_message_types)
validate_string($entity_field)
validate_string($cluster_field)
validate_string($member_field)
validate_string($output_metric_name)
if size($input_message_types) == 0 {
fail('input_message_types cannot be empty')
@@ -59,7 +60,9 @@ define lma_collector::gse_cluster_filter (
source => "gse_${title}_filter",
interval => $interval,
topology_file => $topology_file,
entity_field => $entity_field,
cluster_field => $cluster_field,
member_field => $member_field,
max_inject => $lma_collector::params::hekad_max_timer_inject,
},
require => File[$topology_file],
notify => Class['lma_collector::service']

View File

@@ -83,9 +83,8 @@ class lma_collector::params {
# Heka's default value is 1
$hekad_max_process_inject = 1
# The GSE filters can inject up to 20 messages per timer_event() call
# Heka's default value is 10
$hekad_max_timer_inject = 20
$hekad_max_timer_inject = 10
# Parameters for OpenStack notifications
$rabbitmq_host = false

View File

@@ -24,25 +24,36 @@ describe 'lma_collector::gse_cluster_filter' do
let(:params) do
{:input_message_types => ['afd_service_metric'],
:aggregator_flag => true,
:entity_field => 'service',
:cluster_field => 'service',
:member_field => 'source',
:output_message_type => 'gse_service_cluster_metric',
:output_metric_name => 'cluster_service_status'}
end
it { is_expected.to contain_heka__filter__sandbox('gse_service').with_message_matcher("(Fields[name] == 'pacemaker_local_resource_active' && Fields[resource] == 'vip__management') || (Fields[aggregator] != NIL && (Type =~ /afd_service_metric$/))") }
it { is_expected.to contain_file('gse_service_topology') }
end
describe 'with dependencies' do
let(:params) do
{:input_message_types => ['gse_service_cluster_metric', 'gse_node_cluster_metric'],
:aggregator_flag => false,
:entity_field => 'cluster_name',
:member_field => 'cluster_name',
:output_message_type => 'gse_cluster_metric',
:output_metric_name => 'cluster_status',
:level_1_dependencies => {'nova' => ['nova-api','nova-scheduler'],
'cinder' => ['cinder-api']},
:level_2_dependencies => {'nova-api' => ['neutron-api']}
:clusters => {
'nova' => {
'members' => ['nova-api', 'nova-scheduler', 'controller_nodes'],
'group_by_hostname' => false,
'hints' => ['keystone']
},
'keystone' => {
'members' => ['keystone-public-api', 'keystone-admin-api', 'controller_nodes'],
'group_by_hostname' => false,
}
}
}
end
it { is_expected.to contain_heka__filter__sandbox('gse_service').with_message_matcher("(Fields[name] == 'pacemaker_local_resource_active' && Fields[resource] == 'vip__management') || (Fields[aggregator] == NIL && (Type =~ /gse_service_cluster_metric$/ || Type =~ /gse_node_cluster_metric$/))") }
it { is_expected.to contain_file('gse_service_topology') }
end
end

View File

@@ -4,135 +4,250 @@ lma_collector:
input_message_types:
- afd_service_metric
aggregator_flag: true
entity_field: service
# the field in the input messages to identify the cluster
cluster_field: service
# the field in the input messages to identify the cluster's member
member_field: source
output_message_type: gse_service_cluster_metric
output_metric_name: cluster_service_status
interval: 10
level_1_dependencies:
clusters:
nova-api:
- nova-api-backends
- nova-ec2-api-backends
- nova-endpoint
nova-novncproxy:
- nova-novncproxy-websocket-backends
nova-metadata:
- nova-api-metadata-backends
members:
- backends
- endpoint
nova-ec2-api:
members:
- backends
nova-novncproxy-websocket:
members:
- backends
nova-metadata-api:
members:
- backends
nova-scheduler:
- nova-scheduler
members:
- workers
nova-cert:
members:
- workers
nova-consoleauth:
members:
- workers
nova-compute:
- nova-compute
members:
- workers
nova-conductor:
- nova-conductor
members:
- workers
cinder-api:
- cinder-api-backends
- cinder-endpoint
- cinder-v2-endpoint
members:
- backends
- endpoint
cinder-v2-api:
members:
# Cinder V2 backends are in fact the same as the Cinder backends
- endpoint
cinder-scheduler:
- cinder-scheduler
members:
- workers
cinder-volume:
- cinder-volume
members:
- workers
neutron-api:
- neutron-api-backends
- neutron-endpoint
members:
- backends
- endpoint
neutron-l3:
- l3
members:
- workers
neutron-dhcp:
- dhcp
neutron-ovs:
- openvswitch
keystone-api:
- keystone-public-api-backends
- keystone-admin-api-backends
- keystone-endpoint
members:
- workers
neutron-metadata:
members:
- workers
neutron-openvswitch:
members:
- workers
keystone-public-api:
members:
- backends
- endpoint
keystone-admin-api:
members:
# TODO(pasquier-s): add a metric reporting the status of the keystone-admin-api endpoint
- backends
glance-api:
- glance-api-backends
- glance-endpoint
glance-registry:
- glance-registry-api-backends
members:
- backends
- endpoint
glance-registry-api:
members:
- backends
heat-api:
- heat-api-backends
- heat-cfn-api-backends
- heat-endpoint
horizon-ui:
members:
- backends
- endpoint
heat-cfn-api:
members:
- backends
- endpoint
heat-cloudwatch-api:
members:
- backends
<% if @tls_enabled then -%>
- horizon-https-backends
horizon-https:
members:
- backends
<% else -%>
- horizon-web-backends
horizon-ui:
members:
- backends
<% end -%>
<% if not @storage_options["objects_ceph"] then -%>
swift-api:
- swift-api-backends
- swift-endpoint
- swift-s3-endpoint
members:
- backends
- endpoint
swift-s3-api:
members:
# Swift S3 backends are in fact the same as the Swift backends
- endpoint
<% end -%>
<% if @ceilometer_enabled -%>
ceilometer-api:
- ceilometer-api-backends
- ceilometer-endpoint
members:
- backends
- endpoint
<% end -%>
level_2_dependencies: {}
gse_cluster_node:
input_message_types:
- afd_node_metric
aggregator_flag: true
entity_field: hostname
# the field in the input messages to identify the cluster
cluster_field: hostname
# the field in the input messages to identify the cluster's member
member_field: source
output_message_type: gse_node_cluster_metric
output_metric_name: cluster_node_status
interval: 10
level_1_dependencies: {}
level_2_dependencies: {}
clusters:
controller:
group_by_hostname: true
members:
- system
- fs
compute:
group_by_hostname: true
members:
- system
- fs
storage:
group_by_hostname: true
members:
- system
- fs
gse_cluster_global:
input_message_types:
- gse_service_cluster_metric
- gse_node_cluster_metric
aggregator_flag: false
entity_field: cluster_name
# the field in the input messages to identify the cluster's member
member_field: cluster_name
output_message_type: gse_cluster_metric
output_metric_name: cluster_status
interval: 10
level_1_dependencies:
clusters:
nova:
- nova-api
- nova-scheduler
- nova-compute
- nova-conductor
- nova-novncproxy
- nova-metadata
members:
- nova-api
- nova-ec2-api
- nova-metadata-api
- nova-scheduler
- nova-compute
- nova-conductor
- nova-cert
- nova-consoleauth
- nova-novncproxy-websocket
- controller
- compute
hints:
- cinder
- glance
- keystone
- neutron
cinder:
- cinder-api
- cinder-scheduler
- cinder-volume
members:
- cinder-api
- cinder-v2-api
- cinder-scheduler
- cinder-volume
- controller
- storage
hints:
- keystone
neutron:
- neutron-api
- neutron-l3
- neutron-dhcp
- neutron-metadata
- neutron-ovs
members:
- neutron-api
- neutron-l3
- neutron-dhcp
- neutron-metadata
- neutron-openvswitch
- controller
hints:
- keystone
keystone:
- keystone-api
members:
- keystone-public-api
- keystone-admin-api
- controller
hints: []
glance:
- glance-api
- glance-registry
members:
- glance-api
- glance-registry-api
- controller
hints:
- keystone
heat:
- heat-api
members:
- heat-api
- heat-cfn-api
- heat-cloudwatch-api
- controller
hints:
- cinder
- glance
- keystone
- neutron
- nova
horizon:
- horizon-ui
members:
<% if @tls_enabled then -%>
- horizon-https
<% else -%>
- horizon-ui
<% end -%>
- controller
hints:
- keystone
<% if not @storage_options["objects_ceph"] then -%>
swift:
- swift-api
members:
- swift-api
- swift-s3-api
- controller
hints:
- keystone
<% end -%>
<% if @ceilometer_enabled -%>
ceilometer:
members:
- ceilometer-api
- controller
hints:
- keystone
<% end -%>
level_2_dependencies:
nova-api:
- neutron-api
- keystone-api
- cinder-api
- glance-api
cinder-api:
- keystone-api
neutron-api:
- keystone-api
glance-api:
- keystone-api
heat-api:
- keystone-api
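
Tying the configuration to the filter logic: with cluster_field set
(as in gse_cluster_service above), the cluster is read from the message
and the member from member_field; without it (gse_cluster_global), the
member is looked up in the reverse index to find every cluster listing
it. A sketch with hypothetical field values:

    -- gse_cluster_service: cluster_field = 'service', member_field = 'source'
    --   Fields[service] = 'nova-api', Fields[source] = 'backends'
    --   -> gse.set_member_status('nova-api', 'backends', status, alarms, hostname)
    -- gse_cluster_global: member_field = 'cluster_name', no cluster_field
    --   Fields[cluster_name] = 'nova-api'
    --   -> updates every cluster whose members list contains 'nova-api'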

View File

@@ -14,15 +14,13 @@
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
level_1_dependencies = {
<% @level_1_dependencies.keys().sort().each do |k| -%>
['<%= k.to_s().gsub("'"){"\\'"} %>']={<%= @level_1_dependencies[k].collect{ |x| "'" + x.to_s().gsub("'"){"\\'"} + "'"}.join(',') %>},
<% end -%>
}
level_2_dependencies = {
<% @level_2_dependencies.keys().sort().each do |k| -%>
['<%= k.to_s().gsub("'"){"\\'"} %>']={<%= @level_2_dependencies[k].collect{ |x| "'" + x.to_s().gsub("'"){"\\'"} + "'"}.join(',') %>},
clusters = {
<% @clusters.keys().sort().each do |cluster_id| -%>
['<%= cluster_id.to_s().gsub("'"){"\\'"} %>']={
['members']={<%= @clusters[cluster_id]['members'].sort().collect{ |x| "'" + x.to_s().gsub("'"){"\\'"} + "'"}.join(',') %>},
['hints']={<%= (@clusters[cluster_id]['hints'] || []).sort().collect{ |x| "'" + x.to_s().gsub("'"){"\\'"} + "'"}.join(',') %>},
['group_by_hostname']=<%= @clusters[cluster_id]['group_by_hostname'] ? 'true' : 'false' %>
},
<% end -%>
}
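
For a clusters hash containing a single entry such as
{'nova' => {'members' => ['nova-api'], 'hints' => ['keystone'],
'group_by_hostname' => false}}, the template above would render
roughly:

    clusters = {
        ['nova']={
            ['members']={'nova-api'},
            ['hints']={'keystone'},
            ['group_by_hostname']=false
        },
    }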

View File

@@ -21,78 +21,105 @@ function inject_message(msg)
last_injected_msg = msg
end
local gse = require('gse')
local consts = require('gse_constants')
-- configure relations and dependencies
gse.level_1_dependency("keystone", "keystone_admin")
gse.level_1_dependency("keystone", "keystone_main")
gse.level_1_dependency("neutron", "neutron_api")
gse.level_1_dependency("nova", "nova_api")
gse.level_1_dependency("nova", "keystone_api")
gse.level_1_dependency("nova", "nova_ec2_api")
gse.level_1_dependency("nova", "nova_scheduler")
gse.level_1_dependency("glance", "glance_api")
gse.level_1_dependency("glance", "glance_registry")
local gse = require('gse')
gse.level_2_dependency("nova_api", "neutron_api")
gse.level_2_dependency("nova_scheduler", "rabbitmq")
-- define clusters
gse.add_cluster("heat", {'heat-api'}, {'nova', 'glance', 'neutron', 'keystone', 'rabbitmq'}, false)
gse.add_cluster("nova", {'nova-api', 'nova-ec2-api', 'nova-scheduler'}, {'glance', 'neutron', 'keystone', 'rabbitmq'}, false)
gse.add_cluster("neutron", {'neutron-api'}, {'keystone', 'rabbitmq'}, false)
gse.add_cluster("keystone", {'keystone-admin-api', 'keystone-public-api'}, {}, false)
gse.add_cluster("glance", {'glance-api', 'glance-registry-api', 'foobar'}, {'keystone'}, false)
gse.add_cluster("rabbitmq", {'rabbitmq-cluster', 'foobar'}, {}, true)
-- provision facts
gse.set_status("keystone_admin", consts.OKAY, {})
gse.set_status("neutron_api", consts.DOWN, {{message="All neutron endpoints are down"}})
gse.set_status("keystone_api", consts.CRIT, {{message="All keystone endpoints are critical"}})
gse.set_status("nova_api", consts.OKAY, {})
gse.set_status("nova_ec2_api", consts.OKAY, {})
gse.set_status("nova_scheduler", consts.OKAY, {})
gse.set_status("rabbitmq", consts.WARN, {{message="1 RabbitMQ node out of 3 is down"}})
gse.set_status("glance_api", consts.WARN, {{message="glance-api endpoint is down on node-1"}})
gse.set_status("glance_registry", consts.DOWN, {{message='glance-registry endpoints are down'}})
gse.set_member_status("neutron", "neutron-api", consts.DOWN, {{message="All neutron endpoints are down"}}, 'node-1')
gse.set_member_status('keystone', 'keystone-admin-api', consts.OKAY, {}, 'node-1')
gse.set_member_status('glance', "glance-api", consts.WARN, {{message="glance-api endpoint is down on node-1"}}, 'node-1')
gse.set_member_status('glance', "glance-registry-api", consts.DOWN, {{message='glance-registry endpoints are down'}}, 'node-1')
gse.set_member_status("rabbitmq", 'rabbitmq-cluster', consts.WARN, {{message="1 RabbitMQ node out of 3 is down"}}, 'node-2')
gse.set_member_status("rabbitmq", 'rabbitmq-cluster', consts.OKAY, {}, 'node-1')
gse.set_member_status("rabbitmq", 'rabbitmq-cluster', consts.OKAY, {}, 'node-3')
gse.set_member_status('heat', "heat-api", consts.WARN, {{message='5xx errors detected'}}, 'node-1')
gse.set_member_status('nova', "nova-api", consts.OKAY, {}, 'node-1')
gse.set_member_status('nova', "nova-ec2_api", consts.OKAY, {}, 'node-1')
gse.set_member_status('nova', "nova-scheduler", consts.OKAY, {}, 'node-1')
TestGse = {}
function TestGse:test_keystone_is_okay()
function TestGse:test_ordered_clusters()
local ordered_clusters = gse.get_ordered_clusters()
assertEquals(#ordered_clusters, 6)
assertEquals(ordered_clusters[1], 'rabbitmq')
assertEquals(ordered_clusters[2], 'keystone')
assertEquals(ordered_clusters[3], 'glance')
assertEquals(ordered_clusters[4], 'neutron')
assertEquals(ordered_clusters[5], 'nova')
assertEquals(ordered_clusters[6], 'heat')
end
function TestGse:test_01_rabbitmq_is_warning()
local status, alarms = gse.resolve_status('rabbitmq')
assertEquals(status, consts.WARN)
assertEquals(#alarms, 1)
assertEquals(alarms[1].hostname, 'node-2')
assertEquals(alarms[1].tags.dependency_name, 'rabbitmq-cluster')
assertEquals(alarms[1].tags.dependency_level, 'direct')
end
function TestGse:test_02_keystone_is_okay()
local status, alarms = gse.resolve_status('keystone')
assertEquals(status, consts.OKAY)
assertEquals(#alarms, 0)
end
function TestGse:test_cinder_is_unknown()
local status, alarms = gse.resolve_status('cinder')
assertEquals(status, consts.UNKW)
assertEquals(#alarms, 0)
end
function TestGse:test_neutron_is_down()
local status, alarms = gse.resolve_status('neutron')
assertEquals(status, consts.DOWN)
assertEquals(#alarms, 1)
assertEquals(alarms[1].tags.dependency, 'neutron_api')
assertEquals(alarms[1].tags.dependency_level, 'direct')
end
function TestGse:test_nova_is_critical()
local status, alarms = gse.resolve_status('nova')
assertEquals(status, consts.CRIT)
assertEquals(#alarms, 3)
assertEquals(alarms[1].tags.dependency, 'neutron_api')
assertEquals(alarms[1].tags.dependency_level, 'indirect')
assertEquals(alarms[2].tags.dependency, 'keystone_api')
assertEquals(alarms[2].tags.dependency_level, 'direct')
assertEquals(alarms[3].tags.dependency, 'rabbitmq')
assertEquals(alarms[3].tags.dependency_level, 'indirect')
end
function TestGse:test_glance_is_down()
function TestGse:test_03_glance_is_down()
local status, alarms = gse.resolve_status('glance')
assertEquals(status, consts.DOWN)
assertEquals(#alarms, 2)
assertEquals(alarms[1].tags.dependency, 'glance_api')
assertEquals(alarms[1].hostname, '')
assertEquals(alarms[1].tags.dependency_name, 'glance-api')
assertEquals(alarms[1].tags.dependency_level, 'direct')
assertEquals(alarms[2].tags.dependency, 'glance_registry')
assertEquals(alarms[2].hostname, '')
assertEquals(alarms[2].tags.dependency_name, 'glance-registry-api')
assertEquals(alarms[2].tags.dependency_level, 'direct')
end
function TestGse:test_04_neutron_is_down()
local status, alarms = gse.resolve_status('neutron')
assertEquals(status, consts.DOWN)
assertEquals(#alarms, 2)
assertEquals(alarms[1].tags.dependency_name, 'neutron-api')
assertEquals(alarms[1].tags.dependency_level, 'direct')
assertEquals(alarms[2].tags.dependency_name, 'rabbitmq')
assertEquals(alarms[2].tags.dependency_level, 'hint')
end
function TestGse:test_05_nova_is_okay()
local status, alarms = gse.resolve_status('nova')
assertEquals(status, consts.OKAY)
assertEquals(#alarms, 0)
end
function TestGse:test_06_heat_is_warning_with_hints()
local status, alarms = gse.resolve_status('heat')
assertEquals(status, consts.WARN)
assertEquals(#alarms, 5)
assertEquals(alarms[1].tags.dependency_name, 'heat-api')
assertEquals(alarms[1].tags.dependency_level, 'direct')
assertEquals(alarms[2].tags.dependency_name, 'glance')
assertEquals(alarms[2].tags.dependency_level, 'hint')
assertEquals(alarms[3].tags.dependency_name, 'glance')
assertEquals(alarms[3].tags.dependency_level, 'hint')
assertEquals(alarms[4].tags.dependency_name, 'neutron')
assertEquals(alarms[4].tags.dependency_level, 'hint')
assertEquals(alarms[5].tags.dependency_name, 'rabbitmq')
assertEquals(alarms[5].tags.dependency_level, 'hint')
end
function TestGse:test_inject_cluster_metric_for_nova()
gse.inject_cluster_metric(
'gse_service_cluster_metric',
@@ -106,10 +133,10 @@ TestGse = {}
assertEquals(metric.Type, 'gse_service_cluster_metric')
assertEquals(metric.Fields.cluster_name, 'nova')
assertEquals(metric.Fields.name, 'service_cluster_status')
assertEquals(metric.Fields.value, consts.CRIT)
assertEquals(metric.Fields.value, consts.OKAY)
assertEquals(metric.Fields.hostname, 'node-1')
assertEquals(metric.Fields.interval, 10)
assert(metric.Payload:match("All neutron endpoints are down"))
assertEquals(metric.Payload, '{"alarms":[]}')
end
function TestGse:test_inject_cluster_metric_for_glance()
@@ -132,10 +159,10 @@ TestGse = {}
assert(metric.Payload:match("glance%-api endpoint is down on node%-1"))
end
function TestGse:test_inject_cluster_metric_for_keystone()
function TestGse:test_inject_cluster_metric_for_heat()
gse.inject_cluster_metric(
'gse_service_cluster_metric',
'keystone',
'heat',
'service_cluster_status',
'node-1',
10,
@@ -143,12 +170,13 @@ TestGse = {}
)
local metric = last_injected_msg
assertEquals(metric.Type, 'gse_service_cluster_metric')
assertEquals(metric.Fields.cluster_name, 'keystone')
assertEquals(metric.Fields.cluster_name, 'heat')
assertEquals(metric.Fields.name, 'service_cluster_status')
assertEquals(metric.Fields.value, consts.OKAY)
assertEquals(metric.Fields.value, consts.WARN)
assertEquals(metric.Fields.hostname, 'node-1')
assertEquals(metric.Fields.interval, 10)
assertEquals(metric.Payload, '{"alarms":[]}')
assert(metric.Payload:match("5xx errors detected"))
assert(metric.Payload:match("1 RabbitMQ node out of 3 is down"))
end
function TestGse:test_max_status()
@@ -162,7 +190,13 @@ TestGse = {}
assertEquals(consts.DOWN, status)
end
lu = LuaUnit
function TestGse:test_reverse_index()
local clusters = gse.find_cluster_memberships('foobar')
assertEquals(#clusters, 2)
assertEquals(clusters[1], 'glance')
assertEquals(clusters[2], 'rabbitmq')
end
lu = LuaUnit
lu:setVerbosity( 1 )
os.exit( lu:run() )