Refactor the GSE Lua library

This change splits the Lua GSE library to make it easier to implement
flexible GSE policies. There are now two pieces:

- The existing GSE library, which is used primarily by the GSE filter.
- A new class named GseCluster that encapsulates the logic for
  deriving the cluster's status and alarms.
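
As a minimal sketch of the resulting API (adapted from the updated test
suite; the cluster, member and alarm values below are illustrative), a
caller now passes a group_by mode of 'hostname', 'member' or nil instead
of the former group_by_hostname boolean:

    local gse = require 'gse'
    local consts = require 'gse_constants'

    gse.add_cluster('keystone', {'keystone-admin-api', 'keystone-public-api'}, {}, 'member')
    gse.set_member_status('keystone', 'keystone-public-api', consts.WARN,
                          {{message='5xx errors on the public endpoint'}}, 'node-1')
    -- resolve_status() delegates to GseCluster:refresh_status() and returns
    -- the cluster's status plus its alarms, tagged with
    -- dependency_name/dependency_level
    local status, alarms = gse.resolve_status('keystone')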

Change-Id: I57806a3ce2f2ea3f52106737fd9bd89630aa12dd
Simon Pasquier 2015-10-26 18:34:25 +01:00
parent c1df262766
commit 4412859649
7 changed files with 302 additions and 146 deletions

@@ -17,7 +17,7 @@ local ipairs = ipairs
local lma = require 'lma_utils'
local table_utils = require 'table_utils'
local consts = require 'gse_constants'
local gse = require 'gse'
local gse_utils = require 'gse_utils'
local Alarm = require 'afd_alarm'
local all_alarms = {}
@@ -70,7 +70,7 @@ function evaluate(ns)
for _, alarm in pairs(all_alarms) do
if alarm:is_evaluation_time(ns) then
local state, alerts = alarm:evaluate(ns)
global_state = gse.max_status(state, global_state)
global_state = gse_utils.max_status(state, global_state)
for _, a in ipairs(alerts) do
all_alerts[#all_alerts+1] = { state=state, alert=a }
end

@@ -14,6 +14,7 @@
local consts = require 'gse_constants'
local string = require 'string'
local table = require 'table'
local GseCluster = require 'gse_cluster'
local lma = require 'lma_utils'
local table_utils = require 'table_utils'
@@ -26,53 +27,19 @@ local read_message = read_message
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
-- Hash of GseCluster instances organized by name
local clusters = {}
-- Reverse index table mapping cluster members to their clusters
local reverse_cluster_index = {}
-- Array of cluster names ordered by dependency
local ordered_clusters = {}
local VALID_STATUSES = {
[consts.OKAY]=true,
[consts.WARN]=true,
[consts.CRIT]=true,
[consts.DOWN]=true,
[consts.UNKW]=true
}
local STATUS_MAPPING_FOR_CLUSTERS = {
[consts.OKAY]=consts.OKAY,
[consts.WARN]=consts.WARN,
[consts.CRIT]=consts.CRIT,
[consts.DOWN]=consts.DOWN,
[consts.UNKW]=consts.UNKW
}
local STATUS_WEIGHTS = {
[consts.UNKW]=0,
[consts.OKAY]=1,
[consts.WARN]=2,
[consts.CRIT]=3,
[consts.DOWN]=4
}
function add_cluster(cluster_id, members, hints, group_by_hostname)
function add_cluster(cluster_id, members, hints, group_by)
assert(type(members) == 'table')
assert(type(hints) == 'table')
if not clusters[cluster_id] then
clusters[cluster_id] = {}
end
local cluster = clusters[cluster_id]
cluster.members = members
cluster.hints = hints
cluster.facts = {}
cluster.status = consts.UNKW
cluster.alarms={}
if group_by_hostname then
cluster.group_by_hostname = true
else
cluster.group_by_hostname = false
end
local cluster = GseCluster.new(members, hints, group_by)
clusters[cluster_id] = cluster
-- update the reverse index
for _, member in ipairs(members) do
@@ -125,36 +92,9 @@ end
-- store the status of a cluster's member and its current alarms
function set_member_status(cluster_id, member, value, alarms, hostname)
assert(VALID_STATUSES[value])
assert(type(alarms) == 'table')
local cluster = clusters[cluster_id]
if not cluster then
return
end
local group_key = '__all_hosts__'
if cluster.group_by_hostname then
group_key = hostname
end
if not cluster.facts[member] then
cluster.facts[member] = {}
end
cluster.facts[member][group_key] = {
status=value,
alarms=alarms
}
if cluster.group_by_hostname then
cluster.facts[member][group_key].hostname = hostname
end
end
function max_status(current, status)
if not status or STATUS_WEIGHTS[current] > STATUS_WEIGHTS[status] then
return current
else
return status
if cluster then
cluster:update_fact(member, hostname, value, alarms)
end
end
@@ -165,50 +105,16 @@ function resolve_status(cluster_id)
local cluster = clusters[cluster_id]
assert(cluster)
cluster.status = consts.UNKW
local alarms = {}
local members_with_alarms = {}
for _, member in ipairs(cluster.members) do
for _, fact in pairs(cluster.facts[member] or {}) do
local status = STATUS_MAPPING_FOR_CLUSTERS[fact.status]
if status ~= consts.OKAY then
members_with_alarms[member] = true
-- append alarms only if the member affects the healthiness
-- of the cluster
for _, v in ipairs(fact.alarms) do
alarms[#alarms+1] = table_utils.deepcopy(v)
if not alarms[#alarms]['tags'] then
alarms[#alarms]['tags'] = {}
end
alarms[#alarms].tags['dependency_name'] = member
alarms[#alarms].tags['dependency_level'] = 'direct'
if fact.hostname then
alarms[#alarms].hostname = fact.hostname
end
end
end
cluster.status = max_status(cluster.status, status)
end
end
cluster.alarms = table_utils.deepcopy(alarms)
cluster:refresh_status()
local alarms = table_utils.deepcopy(cluster.alarms)
if cluster.status ~= consts.OKAY then
-- add hints if the cluster isn't healthy
for _, member in ipairs(cluster.hints or {}) do
local other_cluster = clusters[member]
if other_cluster and other_cluster.status ~= consts.OKAY and #other_cluster.alarms > 0 then
for _, v in ipairs(other_cluster.alarms) do
if not (v.tags and v.tags.dependency_name and members_with_alarms[v.tags.dependency_name]) then
-- this isn't an alarm related to a member of the cluster itself
alarms[#alarms+1] = table_utils.deepcopy(v)
if not alarms[#alarms]['tags'] then
alarms[#alarms]['tags'] = {}
end
alarms[#alarms].tags['dependency_name'] = member
alarms[#alarms].tags['dependency_level'] = 'hint'
end
end
for _, other_id in ipairs(cluster.hints or {}) do
for _, v in pairs(cluster:subtract_alarms(clusters[other_id])) do
alarms[#alarms+1] = table_utils.deepcopy(v)
alarms[#alarms].tags['dependency_name'] = other_id
alarms[#alarms].tags['dependency_level'] = 'hint'
end
end
end

@@ -0,0 +1,141 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local consts = require 'gse_constants'
local gse_utils = require 'gse_utils'
local table_utils = require 'table_utils'
local ipairs = ipairs
local pairs = pairs
local setmetatable = setmetatable
local assert = assert
local type = type
local GseCluster = {}
GseCluster.__index = GseCluster
setfenv(1, GseCluster) -- Remove external access to contain everything in the module
local VALID_STATUSES = {
[consts.OKAY]=true,
[consts.WARN]=true,
[consts.CRIT]=true,
[consts.DOWN]=true,
[consts.UNKW]=true
}
-- TODO(pasquier-s): pass the cluster's policy
function GseCluster.new(members, hints, group_by)
assert(type(members) == 'table')
assert(type(hints) == 'table')
local cluster = {}
setmetatable(cluster, GseCluster)
cluster.members = members
cluster.hints = hints
-- when group_by is 'hostname', facts are stored by hostname then member
-- when group_by is 'member', facts are stored by member only
-- otherwise facts are stored by member then hostname
if group_by == 'hostname' or group_by == 'member' then
cluster.group_by = group_by
else
cluster.group_by = 'none'
end
cluster.status = consts.UNKW
cluster.facts = {}
cluster.alarms = {}
cluster.member_index = {}
for _, v in ipairs(members) do
cluster.member_index[v] = true
end
return cluster
end
function GseCluster:has_member(member)
return self.member_index[member]
end
-- Update the facts table for a cluster's member
function GseCluster:update_fact(member, hostname, value, alarms)
assert(VALID_STATUSES[value])
assert(type(alarms) == 'table')
local key1, key2 = member, hostname
if self.group_by == 'hostname' then
key1 = hostname
key2 = member
elseif self.group_by == 'member' then
key2 = '__anyhost__'
end
if not self.facts[key1] then
self.facts[key1] = {}
end
self.facts[key1][key2] = {
status=value,
alarms=table_utils.deepcopy(alarms),
member=member
}
if self.group_by == 'hostname' then
-- store the hostname for later reference in the alarms
self.facts[key1][key2].hostname = hostname
end
end
-- Compute the status and alarms of the cluster according to the current facts
-- and the cluster's policy
function GseCluster:refresh_status()
local status = consts.UNKW
local alarms = {}
for group_key, _ in table_utils.orderedPairs(self.facts) do
for sub_key, fact in table_utils.orderedPairs(self.facts[group_key]) do
if fact.status ~= consts.OKAY then
for _, v in ipairs(fact.alarms) do
alarms[#alarms+1] = table_utils.deepcopy(v)
if not alarms[#alarms]['tags'] then
alarms[#alarms]['tags'] = {}
end
alarms[#alarms].tags['dependency_name'] = fact.member
alarms[#alarms].tags['dependency_level'] = 'direct'
if fact.hostname then
alarms[#alarms].hostname = fact.hostname
end
end
end
status = gse_utils.max_status(status, fact.status)
end
end
self.status = status
self.alarms = alarms
return self.status
end
-- Return the alarms from another cluster which aren't already known by this
-- cluster
function GseCluster:subtract_alarms(cluster)
local subset = {}
if cluster then
for _, alarm in ipairs(cluster.alarms) do
if alarm.tags and alarm.tags['dependency_name'] and not self:has_member(alarm.tags['dependency_name']) then
subset[#subset+1] = alarm
end
end
end
return subset
end
return GseCluster

@@ -0,0 +1,37 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local consts = require 'gse_constants'
local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module
local STATUS_WEIGHTS = {
[consts.UNKW]=0,
[consts.OKAY]=1,
[consts.WARN]=2,
[consts.CRIT]=3,
[consts.DOWN]=4
}
function max_status(val1, val2)
if not val2 or STATUS_WEIGHTS[val1] > STATUS_WEIGHTS[val2] then
return val1
else
return val2
end
end
return M

@@ -118,31 +118,32 @@ lma_collector:
clusters:
mysqld-tcp:
policy: highest_severity
group_by: member
members:
- backends
mysql:
policy: majority_of_members
group_by_hostname: true
group_by: hostname
members:
- heartbeat
haproxy:
policy: majority_of_members
group_by_hostname: true
group_by: hostname
members:
- heartbeat
apache:
policy: majority_of_members
group_by_hostname: true
group_by: hostname
members:
- heartbeat
memcached:
policy: majority_of_members
group_by_hostname: true
group_by: hostname
members:
- heartbeat
rabbitmq:
policy: highest_severity
group_by_hostname: true
group_by: member
members:
- heartbeat
- memory
@@ -150,140 +151,169 @@ lma_collector:
- queue
nova-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
nova-ec2-api:
policy: highest_severity
group_by: member
members:
- backends
nova-novncproxy-websocket:
policy: highest_severity
group_by: member
members:
- backends
nova-metadata-api:
policy: highest_severity
group_by: member
members:
- backends
nova-scheduler:
policy: highest_severity
group_by: member
members:
- workers
nova-cert:
policy: highest_severity
group_by: member
members:
- workers
nova-consoleauth:
policy: highest_severity
group_by: member
members:
- workers
nova-compute:
policy: highest_severity
group_by: member
members:
- workers
nova-conductor:
policy: highest_severity
group_by: member
members:
- workers
cinder-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
cinder-v2-api:
policy: highest_severity
group_by: member
members:
# Cinder V2 backends are in fact the same as the Cinder backends
- endpoint
cinder-scheduler:
policy: highest_severity
group_by: member
members:
- workers
cinder-volume:
policy: highest_severity
group_by: member
members:
- workers
neutron-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
neutron-l3:
policy: highest_severity
group_by: member
members:
- workers
neutron-dhcp:
policy: highest_severity
group_by: member
members:
- workers
neutron-metadata:
policy: highest_severity
group_by: member
members:
- workers
neutron-openvswitch:
policy: highest_severity
group_by: member
members:
- workers
keystone-public-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
keystone-admin-api:
policy: highest_severity
group_by: member
members:
# TODO(pasquier-s): add a metric reporting the status of the keystone-admin-api endpoint
- backends
- http_errors
glance-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
glance-registry-api:
policy: highest_severity
group_by: member
members:
- backends
heat-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
heat-cfn-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
heat-cloudwatch-api:
policy: highest_severity
group_by: member
members:
- backends
<% if @tls_enabled then -%>
horizon-https:
policy: highest_severity
group_by: member
members:
- backends
<% else -%>
horizon-ui:
policy: highest_severity
group_by: member
members:
- backends
<% end -%>
<% if not @storage_options["objects_ceph"] then -%>
swift-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
- http_errors
swift-s3-api:
policy: highest_severity
group_by: member
members:
# Swift S3 backends are in fact the same as the Swift backends
- endpoint
@@ -291,6 +321,7 @@ lma_collector:
<% if @ceilometer_enabled -%>
ceilometer-api:
policy: highest_severity
group_by: member
members:
- backends
- endpoint
@@ -298,6 +329,7 @@ lma_collector:
<% if @storage_options["volumes_ceph"] then -%>
ceph-mon:
policy: highest_severity
group_by: member
members:
- health
<% end -%>
@@ -317,19 +349,19 @@ lma_collector:
clusters:
controller:
policy: majority_of_members
group_by_hostname: true
group_by: hostname
members:
- cpu
- fs
compute:
policy: majority_of_members
group_by_hostname: true
group_by: hostname
members:
- cpu
- fs
storage:
policy: majority_of_members
group_by_hostname: true
group_by: hostname
members:
- cpu
- fs
@@ -348,32 +380,38 @@ lma_collector:
clusters:
mysql:
policy: highest_severity
group_by: member
members:
- mysqld-tcp
- mysqld
- mysql
- controller
haproxy:
policy: highest_severity
group_by: member
members:
- haproxy
- controller
apache:
policy: highest_severity
group_by: member
members:
- apache
- controller
memcached:
policy: highest_severity
group_by: member
members:
- memcached
- controller
rabbitmq:
policy: highest_severity
group_by: member
members:
- rabbitmq
- controller
nova:
policy: highest_severity
group_by: member
members:
- nova-api
- nova-ec2-api
@@ -393,6 +431,7 @@ lma_collector:
- neutron
cinder:
policy: highest_severity
group_by: member
members:
- cinder-api
- cinder-v2-api
@@ -404,6 +443,7 @@ lma_collector:
- keystone
neutron:
policy: highest_severity
group_by: member
members:
- neutron-api
- neutron-l3
@@ -415,6 +455,7 @@ lma_collector:
- keystone
keystone:
policy: highest_severity
group_by: member
members:
- keystone-public-api
- keystone-admin-api
@@ -422,6 +463,7 @@ lma_collector:
hints: []
glance:
policy: highest_severity
group_by: member
members:
- glance-api
- glance-registry-api
@@ -430,6 +472,7 @@ lma_collector:
- keystone
heat:
policy: highest_severity
group_by: member
members:
- heat-api
- heat-cfn-api
@@ -443,6 +486,7 @@ lma_collector:
- nova
horizon:
policy: highest_severity
group_by: member
members:
<% if @tls_enabled then -%>
- horizon-https
@@ -455,6 +499,7 @@ lma_collector:
<% if not @storage_options["objects_ceph"] then -%>
swift:
policy: highest_severity
group_by: member
members:
- swift-api
- swift-s3-api
@@ -465,6 +510,7 @@ lma_collector:
<% if @ceilometer_enabled -%>
ceilometer:
policy: highest_severity
group_by: member
members:
- ceilometer-api
- controller
@@ -474,6 +520,7 @@ lma_collector:
<% if @storage_options["volumes_ceph"] then -%>
ceph:
policy: highest_severity
group_by: member
members:
- ceph-mon
- controller

@@ -27,12 +27,12 @@ local consts = require('gse_constants')
local gse = require('gse')
-- define clusters
gse.add_cluster("heat", {'heat-api', 'controller'}, {'nova', 'glance', 'neutron', 'keystone', 'rabbitmq'}, false)
gse.add_cluster("nova", {'nova-api', 'nova-ec2-api', 'nova-scheduler'}, {'glance', 'neutron', 'keystone', 'rabbitmq'}, false)
gse.add_cluster("neutron", {'neutron-api'}, {'keystone', 'rabbitmq'}, false)
gse.add_cluster("keystone", {'keystone-admin-api', 'keystone-public-api'}, {}, false)
gse.add_cluster("glance", {'glance-api', 'glance-registry-api'}, {'keystone'}, false)
gse.add_cluster("rabbitmq", {'rabbitmq-cluster', 'controller'}, {}, true)
gse.add_cluster("heat", {'heat-api', 'controller'}, {'nova', 'glance', 'neutron', 'keystone', 'rabbitmq'}, 'member')
gse.add_cluster("nova", {'nova-api', 'nova-ec2-api', 'nova-scheduler'}, {'glance', 'neutron', 'keystone', 'rabbitmq'}, 'member')
gse.add_cluster("neutron", {'neutron-api'}, {'keystone', 'rabbitmq'}, 'member')
gse.add_cluster("keystone", {'keystone-admin-api', 'keystone-public-api'}, {}, 'member')
gse.add_cluster("glance", {'glance-api', 'glance-registry-api'}, {'keystone'}, 'member')
gse.add_cluster("rabbitmq", {'rabbitmq-cluster', 'controller'}, {}, 'hostname')
-- provision facts
gse.set_member_status("neutron", "neutron-api", consts.DOWN, {{message="All neutron endpoints are down"}}, 'node-1')
@@ -70,11 +70,11 @@ TestGse = {}
local status, alarms = gse.resolve_status('rabbitmq')
assertEquals(status, consts.WARN)
assertEquals(#alarms, 2)
assertEquals(alarms[1].hostname, 'node-2')
assertEquals(alarms[1].tags.dependency_name, 'rabbitmq-cluster')
assertEquals(alarms[1].hostname, 'node-1')
assertEquals(alarms[1].tags.dependency_name, 'controller')
assertEquals(alarms[1].tags.dependency_level, 'direct')
assertEquals(alarms[2].hostname, 'node-1')
assertEquals(alarms[2].tags.dependency_name, 'controller')
assertEquals(alarms[2].hostname, 'node-2')
assertEquals(alarms[2].tags.dependency_name, 'rabbitmq-cluster')
assertEquals(alarms[2].tags.dependency_level, 'direct')
end
@@ -105,10 +105,10 @@ TestGse = {}
assert(alarms[1].hostname == nil)
assertEquals(alarms[2].tags.dependency_name, 'rabbitmq')
assertEquals(alarms[2].tags.dependency_level, 'hint')
assertEquals(alarms[2].hostname, 'node-2')
assertEquals(alarms[2].hostname, 'node-1')
assertEquals(alarms[3].tags.dependency_name, 'rabbitmq')
assertEquals(alarms[3].tags.dependency_level, 'hint')
assertEquals(alarms[3].hostname, 'node-1')
assertEquals(alarms[3].hostname, 'node-2')
end
function TestGse:test_05_nova_is_okay()
@@ -121,10 +121,10 @@ TestGse = {}
local status, alarms = gse.resolve_status('heat')
assertEquals(status, consts.WARN)
assertEquals(#alarms, 6)
assertEquals(alarms[1].tags.dependency_name, 'heat-api')
assertEquals(alarms[1].tags.dependency_name, 'controller')
assertEquals(alarms[1].tags.dependency_level, 'direct')
assert(alarms[1].hostname == nil)
assertEquals(alarms[2].tags.dependency_name, 'controller')
assertEquals(alarms[2].tags.dependency_name, 'heat-api')
assertEquals(alarms[2].tags.dependency_level, 'direct')
assert(alarms[2].hostname == nil)
assertEquals(alarms[3].tags.dependency_name, 'glance')
@@ -194,17 +194,6 @@ TestGse = {}
assert(metric.Payload:match("1 RabbitMQ node out of 3 is down"))
end
function TestGse:test_max_status()
local status = gse.max_status(consts.DOWN, consts.WARN)
assertEquals(consts.DOWN, status)
local status = gse.max_status(consts.OKAY, consts.WARN)
assertEquals(consts.WARN, status)
local status = gse.max_status(consts.OKAY, consts.DOWN)
assertEquals(consts.DOWN, status)
local status = gse.max_status(consts.UNKW, consts.DOWN)
assertEquals(consts.DOWN, status)
end
function TestGse:test_reverse_index()
local clusters = gse.find_cluster_memberships('controller')
assertEquals(#clusters, 2)

@@ -0,0 +1,36 @@
-- Copyright 2015 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
require('luaunit')
package.path = package.path .. ";files/plugins/common/?.lua;tests/lua/mocks/?.lua"
local gse_utils = require('gse_utils')
local consts = require('gse_constants')
TestGseUtils = {}
function TestGseUtils:test_max_status()
local status = gse_utils.max_status(consts.DOWN, consts.WARN)
assertEquals(consts.DOWN, status)
local status = gse_utils.max_status(consts.OKAY, consts.WARN)
assertEquals(consts.WARN, status)
local status = gse_utils.max_status(consts.OKAY, consts.DOWN)
assertEquals(consts.DOWN, status)
local status = gse_utils.max_status(consts.UNKW, consts.DOWN)
assertEquals(consts.DOWN, status)
end
lu = LuaUnit
lu:setVerbosity( 1 )
os.exit( lu:run() )