diff --git a/docker/hindsight/Dockerfile.j2 b/docker/hindsight/Dockerfile.j2 index 1014dc3..c8d0edd 100644 --- a/docker/hindsight/Dockerfile.j2 +++ b/docker/hindsight/Dockerfile.j2 @@ -19,7 +19,9 @@ RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 1FA22B08 \ ADD output/*.lua /var/lib/hindsight/run/output/ ADD input/*.lua /var/lib/hindsight/run/input/ +ADD analysis/*.lua /var/lib/hindsight/run/analysis/ ADD modules/*.lua /opt/ccp/lua/modules/stacklight/ +ADD modules_alarms/afd_node_default_cpu_alarms.lua /opt/ccp/lua/modules/stacklight_alarms/ RUN useradd --user-group hindsight \ && usermod -a -G microservices hindsight \ diff --git a/docker/hindsight/analysis/afd.lua b/docker/hindsight/analysis/afd.lua new file mode 100644 index 0000000..ed9db37 --- /dev/null +++ b/docker/hindsight/analysis/afd.lua @@ -0,0 +1,120 @@ +-- Copyright 2015-2016 Mirantis, Inc. +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
local string = require 'string'

local message = require 'stacklight.message'
local afd = require 'stacklight.afd'
local afd_annotation = require 'stacklight.afd_annotation'

-- Message settings for each supported value of the 'afd_type' option.
local AFD_SETTINGS = {
    node = {
        msg_type = 'afd_node_metric',
        msg_field_name = 'node_status',
        afd_entity = 'node_role',
    },
    service = {
        msg_type = 'afd_service_metric',
        msg_field_name = 'service_status',
        afd_entity = 'service',
    },
}

-- node or service
local afd_type = read_config('afd_type') or error('afd_type must be specified!')
local settings = AFD_SETTINGS[afd_type] or error('invalid afd_type value')
local msg_type = settings.msg_type
local msg_field_name = settings.msg_field_name
local afd_entity = settings.afd_entity

-- ie: controller for node AFD / rabbitmq for service AFD
local afd_entity_value = read_config('afd_cluster_name') or
    error('afd_cluster_name must be specified!')

-- ie: cpu for node AFD / queue for service AFD
local msg_field_source = read_config('afd_logical_name') or
    error('afd_logical_name must be specified!')

local hostname = read_config('hostname') or error('hostname must be specified')

-- The alarm definitions are loaded from the stacklight_alarms.<afd_file> module
local afd_file = read_config('afd_file') or error('afd_file must be specified')
local all_alarms = require('stacklight_alarms.' .. afd_file)
local A = require 'stacklight.afd_alarms'
A.load_alarms(all_alarms)

-- Feed the datapoint carried by the current message to the loaded alarms.
-- Returns 0 on success, or -1 and an error message when the value cannot be
-- read or when a field needed by the alarms is absent from the message.
function process_message()
    local name = read_message('Fields[name]')
    local timestamp = read_message('Timestamp')

    local value, err_msg = message.read_values()
    if not value then
        return -1, err_msg
    end

    -- collect the value of every field the alarms need for this metric
    local collected = {}
    for _, field_name in ipairs(A.get_metric_fields(name)) do
        local field_value = read_message(string.format('Fields[%s]', field_name))
        if not field_value then
            return -1, "Cannot find Fields[" .. field_name .. "] for the metric " .. name
        end
        collected[field_name] = field_value
    end

    A.add_value(timestamp, name, value, collected)
    return 0
end

-- Evaluate the alarms and, when at least one of them was due, emit the AFD
-- metric followed by the matching annotation.
-- The very first call only records the start time of the alarms.
function timer_event(ns)
    if not A.is_started() then
        A.set_start_time(ns)
        return
    end

    local state, alarms = A.evaluate(ns)
    if not state then
        -- no alarm was due for evaluation
        return
    end

    for _, alarm in ipairs(alarms) do
        local alert = alarm.alert
        afd.add_to_alarms(
            alarm.state,
            alert['function'],
            alert.metric,
            alert.fields,
            {}, -- tags
            alert.operator,
            alert.value,
            alert.threshold,
            alert.window,
            alert.periods,
            alert.message)
    end

    -- Message example:
    --  msg = {
    --      Type = 'afd_node_metric',
    --      Payload = '{"alarms":[...]}',
    --      Fields = {
    --          name = 'node_status',
    --          value = 0,
    --          hostname = 'node1',
    --          source = 'cpu',
    --          node_role = 'controller',
    --          dimensions = {'node_role', 'hostname', 'source'},
    --      }
    --  }
    local msg = afd.inject_afd_metric(
        msg_type, afd_entity, afd_entity_value, msg_field_name,
        state, hostname, msg_field_source)

    if msg then
        afd_annotation.inject_afd_annotation(msg)
    end
end

-- ----------------------------------------------------------------------------
-- diff --git a/docker/hindsight/modules/afd.lua b/docker/hindsight/modules/afd.lua
-- new file mode 100644
-- index 0000000..1db84ac
-- --- /dev/null
-- +++ b/docker/hindsight/modules/afd.lua
-- @@ -0,0 +1,181 @@
-- ----------------------------------------------------------------------------
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
--     http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local cjson = require 'cjson'
local string = require 'string'
local table = require 'table'

local utils = require 'stacklight.utils'
local constants = require 'stacklight.constants'

-- Globals must be captured before setfenv() hides them from the module
local read_message = read_message
local assert = assert
local ipairs = ipairs
local pcall = pcall
local type = type

local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module

-- return the value of msg.Fields[name]
local function read_field(msg, name)
    return msg.Fields[name]
end

-- return the 'value' field of an AFD message
function read_status(msg)
    return read_field(msg, 'value')
end

-- return the 'source' field of an AFD message
function read_source(msg)
    return read_field(msg, 'source')
end

-- return the 'hostname' field of an AFD message
function read_hostname(msg)
    return read_field(msg, 'hostname')
end

-- return the 'alarms' entry of the JSON payload of an AFD message.
-- Returns nil when the payload cannot be decoded, when it doesn't decode to
-- a table (a bare number, boolean or null can't be indexed) or when it has
-- no 'alarms' entry.
function extract_alarms(msg)
    local ok, payload = pcall(cjson.decode, msg.Payload)
    if not ok or type(payload) ~= 'table' or not payload.alarms then
        return nil
    end
    return payload.alarms
end

-- return a human-readable message from an alarm table
-- for instance: "CPU load too high (WARNING, rule='last(load_midterm)>=5', current=7.00)"
function get_alarm_for_human(alarm)
    local metric
    -- tolerate alarms without a 'fields' entry
    local alarm_fields = alarm.fields or {}
    if #alarm_fields > 0 then
        local fields = {}
        for _, field in ipairs(alarm_fields) do
            fields[#fields+1] = field.name .. '="' .. field.value .. '"'
        end
        metric = string.format('%s[%s]', alarm.metric, table.concat(fields, ','))
    else
        metric = alarm.metric
    end

    local host = ''
    if alarm.hostname then
        host = string.format(', host=%s', alarm.hostname)
    end

    return string.format(
        "%s (%s, rule='%s(%s)%s%s', current=%.2f%s)",
        alarm.message,
        alarm.severity,
        alarm['function'],
        metric,
        alarm.operator,
        alarm.threshold,
        alarm.value,
        host
    )
end

-- return the list of human-readable messages for a list of alarms.
-- Alarms tagged with dependency_level == 'hint' are listed last, after an
-- "Other related alarms:" separator line.
function alarms_for_human(alarms)
    local alarm_messages = {}
    local hint_messages = {}

    for _, v in ipairs(alarms) do
        if v.tags and v.tags.dependency_level and v.tags.dependency_level == 'hint' then
            hint_messages[#hint_messages+1] = get_alarm_for_human(v)
        else
            alarm_messages[#alarm_messages+1] = get_alarm_for_human(v)
        end
    end

    if #hint_messages > 0 then
        alarm_messages[#alarm_messages+1] = "Other related alarms:"
    end
    for _, v in ipairs(hint_messages) do
        alarm_messages[#alarm_messages+1] = v
    end

    return alarm_messages
end

-- list of pending alarms, flushed by inject_afd_metric()
local alarms = {}

-- append an alarm to the list of pending alarms
-- the list is sent when inject_afd_metric is called
function add_to_alarms(status, fn, metric, fields, tags, operator, value, threshold, window, periods, message)
    local severity = constants.status_label(status)
    -- an unknown status has no label
    assert(severity)
    alarms[#alarms+1] = {
        severity=severity,
        ['function']=fn,
        metric=metric,
        fields=fields or {},
        tags=tags or {},
        operator=operator,
        value=value,
        threshold=threshold,
        window=window or 0,
        periods=periods or 0,
        message=message
    }
end

-- return the list of pending alarms
function get_alarms()
    return alarms
end

-- drop all the pending alarms
function reset_alarms()
    alarms = {}
end

-- inject an AFD event into the Heka pipeline
-- The pending alarms are serialized in the payload and then dropped.
-- Returns the injected message on success, nothing when the payload can't be
-- encoded, or nil and an error message when the injection fails.
function inject_afd_metric(msg_type, msg_tag_name, msg_tag_value, metric_name,
                           value, hostname, source)
    local payload

    if #alarms > 0 then
        payload = utils.safe_json_encode({alarms=alarms})
        reset_alarms()
        if not payload then
            return
        end
    else
        -- because cjson encodes empty tables as objects instead of arrays
        payload = '{"alarms":[]}'
    end

    local msg = {
        Type = msg_type,
        Payload = payload,
        Fields = {
            name = metric_name,
            value = value,
            hostname = hostname,
            source = source,
            dimensions = {msg_tag_name, 'hostname', 'source'},
        }
    }
    msg.Fields[msg_tag_name] = msg_tag_value

    local err_code, err_msg = utils.safe_inject_message(msg)

    if err_code ~= 0 then
        return nil, err_msg
    end

    return msg
end

-- Result codes of a rule evaluation
MATCH = 1
NO_MATCH = 2
NO_DATA = 3
MISSING_DATA = 4

return M

-- ----------------------------------------------------------------------------
-- diff --git a/docker/hindsight/modules/afd_alarm.lua b/docker/hindsight/modules/afd_alarm.lua
-- new file mode 100644
-- index 0000000..8d36e17
-- --- /dev/null
-- +++ b/docker/hindsight/modules/afd_alarm.lua
-- @@ -0,0 +1,224 @@
-- ----------------------------------------------------------------------------
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
--     http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local assert = assert
local ipairs = ipairs
local pairs = pairs
local string = string
local setmetatable = setmetatable

local table_utils = require 'stacklight.table_utils'
local constants = require 'stacklight.constants'
local afd = require 'stacklight.afd'
local Rule = require 'stacklight.afd_rule'

-- Map the severity names accepted in alarm definitions to status constants
local SEVERITIES = {
    warning = constants.WARN,
    critical = constants.CRIT,
    down = constants.DOWN,
    unknown = constants.UNKW,
    okay = constants.OKAY,
}

local Alarm = {}
Alarm.__index = Alarm

setfenv(1, Alarm) -- Remove external access to contain everything in the module

-- Build an Alarm object from an alarm definition.
-- The definition provides: name, description, severity, an optional
-- no_data_policy ('skip' or a severity name) and trigger.rules with an
-- optional trigger.logical_operator ('or' by default).
-- Raises an error when the severity or the no_data_policy is unknown.
function Alarm.new(alarm)
    local a = {}
    setmetatable(a, Alarm)
    a._metrics_list = nil
    a.name = alarm.name
    a.description = alarm.description
    if alarm.trigger.logical_operator then
        a.logical_operator = string.lower(alarm.trigger.logical_operator)
    else
        a.logical_operator = 'or'
    end
    a.severity_str = string.upper(alarm.severity)
    a.severity = SEVERITIES[string.lower(alarm.severity)]
    assert(a.severity ~= nil)

    a.skip_when_no_data = false
    if alarm.no_data_policy then
        if string.lower(alarm.no_data_policy) == 'skip' then
            a.skip_when_no_data = true
        else
            a.no_data_severity = SEVERITIES[string.lower(alarm.no_data_policy)]
        end
    else
        a.no_data_severity = constants.UNKW
    end
    assert(a.skip_when_no_data or a.no_data_severity ~= nil)

    a.rules = {}
    -- longest observation window among the rules, in nanoseconds
    a.initial_wait = 0
    for _, rule in ipairs(alarm.trigger.rules) do
        local r = Rule.new(rule)
        a.rules[#a.rules+1] = r
        -- convert to nanoseconds before comparing: initial_wait is stored
        -- in nanoseconds while window * periods is a number of seconds
        local wait_ns = r.window * r.periods * 1e9
        if wait_ns > a.initial_wait then
            a.initial_wait = wait_ns
        end
    end
    a.start_time_ns = 0

    return a
end

-- return the Set of metrics used by the alarm
-- The list is computed once and cached in self._metrics_list.
function Alarm:get_metrics()
    if not self._metrics_list then
        self._metrics_list = {}
        for _, rule in ipairs(self.rules) do
            -- check against the list being built to avoid duplicates
            if not table_utils.item_find(rule.metric, self._metrics_list) then
                self._metrics_list[#self._metrics_list+1] = rule.metric
            end
        end
    end
    return self._metrics_list
end

-- return a list of field names used for the metric
-- (can have duplicate names)
function Alarm:get_metric_fields(metric_name)
    local fields = {}
    for _, rule in ipairs(self.rules) do
        if rule.metric == metric_name then
            for k, _ in pairs(rule.fields) do
                fields[#fields+1] = k
            end
            for _, g in ipairs(rule.group_by) do
                fields[#fields+1] = g
            end
        end
    end
    return fields
end

-- return true when at least one rule of the alarm uses the metric
function Alarm:has_metric(metric)
    return table_utils.item_find(metric, self:get_metrics())
end

-- dispatch datapoint in datastores
function Alarm:add_value(ts, metric, value, fields)
    for _, rule in ipairs(self.rules) do
        if rule.metric == metric then
            rule:add_value(ts, value, fields)
        end
    end
end

-- convert fields to fields map
-- {foo="bar"} --> {name="foo", value="bar"}
local function convert_field_list(fields)
    local named_fields = {}
    for name, value in pairs(fields or {}) do
        named_fields[#named_fields+1] = {name=name, value=value}
    end
    return named_fields
end

-- return: state of alarm and a list of alarm details.
--
-- The state is nil when data is missing and the no-data policy is 'skip'.
-- The list is empty when the state is nil or OKAY, otherwise every entry has
-- the keys: severity, function, metric, operator, threshold, window,
-- periods, value, fields and message.
function Alarm:evaluate(ns)
    local state = constants.OKAY
    local matches = 0
    local all_alerts = {}
    local function add_alarm(rule, value, message, fields)
        all_alerts[#all_alerts+1] = {
            severity = self.severity_str,
            ['function'] = rule.fct,
            metric = rule.metric,
            operator = rule.relational_operator,
            threshold = rule.threshold,
            window = rule.window,
            periods = rule.periods,
            value = value,
            fields = fields,
            message = message
        }
    end
    local one_unknown = false
    local msg

    for _, rule in ipairs(self.rules) do
        local eval, context_list = rule:evaluate(ns)
        if eval == afd.MATCH then
            matches = matches + 1
            msg = self.description
        elseif eval == afd.MISSING_DATA then
            msg = 'No datapoint have been received over the last ' .. rule.observation_window .. ' seconds'
            one_unknown = true
        elseif eval == afd.NO_DATA then
            msg = 'No datapoint have been received ever'
            one_unknown = true
        end
        for _, context in ipairs(context_list) do
            add_alarm(rule, context.value, msg,
                      convert_field_list(context.fields))
        end
    end

    if self.logical_operator == 'and' then
        -- every rule must match; a rule without data takes precedence
        if one_unknown then
            if self.skip_when_no_data then
                state = nil
            else
                state = self.no_data_severity
            end
        elseif #self.rules == matches then
            state = self.severity
        end
    elseif self.logical_operator == 'or' then
        -- a single matching rule is enough, even if other rules have no data
        if matches > 0 then
            state = self.severity
        elseif one_unknown then
            if self.skip_when_no_data then
                state = nil
            else
                state = self.no_data_severity
            end
        end
    end

    if state == nil or state == constants.OKAY then
        all_alerts = {}
    end
    return state, all_alerts
end

-- record the time (in nanoseconds) from which the initial wait is counted
function Alarm:set_start_time(ns)
    self.start_time_ns = ns
end

-- return true once the initial wait has elapsed since the start time
function Alarm:is_evaluation_time(ns)
    local delta = ns - self.start_time_ns
    if delta >= self.initial_wait then
        return true
    end
    return false
end

return Alarm

-- ----------------------------------------------------------------------------
-- diff --git a/docker/hindsight/modules/afd_alarms.lua b/docker/hindsight/modules/afd_alarms.lua
-- new file mode 100644
-- index 0000000..465241d
-- --- /dev/null
-- +++ b/docker/hindsight/modules/afd_alarms.lua
-- @@ -0,0 +1,118 @@
-- ----------------------------------------------------------------------------
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
--     http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local pairs = pairs
local ipairs = ipairs
local table_utils = require 'stacklight.table_utils'
local constants = require 'stacklight.constants'
local Alarm = require 'stacklight.afd_alarm'

-- list of Alarm objects, in loading order
local all_alarms = {}

local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module

-- return a list of field names required for the metric
-- (without duplicates)
function get_metric_fields(metric_name)
    local fields = {}
    for name, alarm in pairs(all_alarms) do
        local mf = alarm:get_metric_fields(metric_name)
        if mf then
            for _, field in pairs(mf) do
                if not table_utils.item_find(field, fields) then
                    fields[#fields+1] = field
                end
            end
        end
    end
    return fields
end

-- return list of alarms interested by a metric
function get_interested_alarms(metric)
    local interested_alarms = {}
    for _, alarm in pairs(all_alarms) do
        if alarm:has_metric(metric) then
            interested_alarms[#interested_alarms+1] = alarm
        end
    end
    return interested_alarms
end

-- dispatch a datapoint to every alarm using the metric
function add_value(ts, metric, value, fields)
    local interested_alarms = get_interested_alarms(metric)
    for _, alarm in ipairs(interested_alarms) do
        alarm:add_value(ts, metric, value, fields)
    end
end

-- forget all the loaded alarms
function reset_alarms()
    all_alarms = {}
end

-- Evaluate the alarms that are due and return the resulting global state
-- with the list of {state=..., alert=...} details.
-- The global state is nil when no alarm was due or when every evaluated alarm
-- was skipped. The evaluation stops at the first alarm that raises the
-- global state to something else than OKAY or UNKW.
function evaluate(ns)
    local global_state
    local all_alerts = {}
    -- ipairs() guarantees the loading order, which matters since the loop
    -- stops at the first triggered alarm
    for _, alarm in ipairs(all_alarms) do
        if alarm:is_evaluation_time(ns) then
            local state, alerts = alarm:evaluate(ns)
            global_state = constants.max_status(state, global_state)
            for _, a in ipairs(alerts) do
                all_alerts[#all_alerts+1] = { state=state, alert=a }
            end
            -- raise the first triggered alarm except for OKAY/UNKW states
            -- (a nil state means that nothing has been triggered yet, the
            -- remaining alarms still have to be evaluated)
            if global_state ~= nil and global_state ~= constants.UNKW and
                    global_state ~= constants.OKAY then
                break
            end
        end
    end
    return global_state, all_alerts
end

-- return the list of loaded alarms
function get_alarms()
    return all_alarms
end

-- return the alarm with the given name, nil when there is none
function get_alarm(alarm_name)
    for _, a in ipairs(all_alarms) do
        if a.name == alarm_name then
            return a
        end
    end
end

-- build an Alarm from its definition and append it to the loaded alarms
function load_alarm(alarm)
    local A = Alarm.new(alarm)
    all_alarms[#all_alarms+1] = A
end

-- load a list of alarm definitions
function load_alarms(alarms)
    for _, alarm in ipairs(alarms) do
        load_alarm(alarm)
    end
end

local started = false

-- set the start time of every loaded alarm and mark the module as started
function set_start_time(ns)
    for _, alarm in ipairs(all_alarms) do
        alarm:set_start_time(ns)
    end
    started = true
end

-- return true once set_start_time() has been called
function is_started()
    return started
end

return M

-- ----------------------------------------------------------------------------
-- diff --git a/docker/hindsight/modules/afd_annotation.lua b/docker/hindsight/modules/afd_annotation.lua
-- new file mode 100644
-- index 0000000..1ede301
-- --- /dev/null
-- +++ b/docker/hindsight/modules/afd_annotation.lua
-- @@ -0,0 +1,99 @@
-- ----------------------------------------------------------------------------
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
--     http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
local string = require 'string'
local table = require 'table'

local utils = require 'stacklight.utils'
local consts = require 'stacklight.constants'
local afd = require 'stacklight.afd'

local M = {}
setfenv(1, M)

-- last status and alarm text seen for every source
local statuses = {}

-- message reused for every annotation, the nil fields are filled in by
-- inject_afd_annotation()
local annotation_msg = {
    Type = 'metric',
    Fields = {
        name = 'annotation',
        dimensions = {'source', 'hostname'},
        value_fields = {'title', 'tags', 'text'},
        title = nil,
        tags = nil,
        text = nil,
        source = nil,
        hostname = nil,
    }
}

-- Inject an annotation when the status carried by the AFD message differs
-- from the last status seen for the same source.
-- Returns -1 when the message has no source, status or alarms, 0 when no
-- annotation is needed, otherwise the result of utils.safe_inject_message().
function inject_afd_annotation(msg)
    local previous
    local text
    -- keep the title local, a bare assignment would leak it into the module
    -- table because of setfenv()
    local title

    local source = afd.read_source(msg)
    local status = afd.read_status(msg)
    local hostname = afd.read_hostname(msg)
    local alarms = afd.extract_alarms(msg)

    if not source or not status or not alarms then
        return -1
    end

    if not statuses[source] then
        statuses[source] = {}
    end
    previous = statuses[source]

    -- NOTE(review): the separator literal was garbled into a raw line break
    -- (an unfinished string in Lua); it is presumably an HTML line break --
    -- confirm against the original file.
    text = table.concat(afd.alarms_for_human(alarms), '<br />')

    -- build the title
    if not previous.status and status == consts.OKAY then
        -- don't send an annotation when we detect a new cluster which is OKAY
        return 0
    elseif not previous.status then
        title = string.format('General status is %s',
                              consts.status_label(status))
    elseif previous.status ~= status then
        title = string.format('General status %s -> %s',
                              consts.status_label(previous.status),
                              consts.status_label(status))

    -- TODO(pasquier-s): generate an annotation when the set of alarms has
    -- changed. the following code generated an annotation whenever at least
    -- one value associated to an alarm was changing. This led to way too
    -- many annotations with alarms monitoring the CPU usage for instance.

--    elseif previous.text ~= text then
--        title = string.format('General status remains %s',
--                              consts.status_label(status))
    else
        -- nothing has changed since the last message
        return 0
    end

    annotation_msg.Fields.title = title
    annotation_msg.Fields.tags = source
    annotation_msg.Fields.text = text
    annotation_msg.Fields.source = source
    annotation_msg.Fields.hostname = hostname

    -- store the last status and alarm text for future messages
    previous.status = status
    previous.text = text

    return utils.safe_inject_message(annotation_msg)
end

return M

-- ----------------------------------------------------------------------------
-- diff --git a/docker/hindsight/modules/afd_rule.lua b/docker/hindsight/modules/afd_rule.lua
-- new file mode 100644
-- index 0000000..f61a2b0
-- --- /dev/null
-- +++ b/docker/hindsight/modules/afd_rule.lua
-- @@ -0,0 +1,279 @@
-- ----------------------------------------------------------------------------
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
--     http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

local circular_buffer = require 'circular_buffer'
local stats = require 'lsb.stats'
local setmetatable = setmetatable
local ipairs = ipairs
local pairs = pairs
local math = require 'math'
local string = string
local table = table
local assert = assert
local type = type

-- StackLight libs
local table_utils = require 'stacklight.table_utils'
local constants = require 'stacklight.constants'
local afd = require 'stacklight.afd'
local matching = require 'stacklight.value_matching'

-- defaults used when the rule gives no (or a non-positive) window/periods
local MIN_WINDOW = 10
local MIN_PERIOD = 1
-- duration covered by one row of the circular buffers
local SECONDS_PER_ROW = 5

local Rule = {}
Rule.__index = Rule

setfenv(1, Rule) -- Remove external access to contain everything in the module

-- Build a Rule object from a rule definition.
-- The definition provides: metric, function, relational_operator, threshold
-- and optionally window, periods, fields, group_by and value.
function Rule.new(rule)
    local r = {}
    setmetatable(r, Rule)

    -- 'x + 0' converts numeric strings to numbers
    local win = MIN_WINDOW
    if rule.window and rule.window + 0 > 0 then
        win = rule.window + 0
    end
    r.window = win
    local periods = MIN_PERIOD
    if rule.periods and rule.periods + 0 > 0 then
        periods = rule.periods + 0
    end
    r.periods = periods
    r.relational_operator = rule.relational_operator
    r.metric = rule.metric
    r.fields = rule.fields or {}

    -- build field matching
    r.field_matchers = {}
    for f, expression in pairs(r.fields) do
        r.field_matchers[f] = matching.new(expression)
    end

    r.fct = rule['function']
    r.threshold = rule.threshold + 0
    r.value_index = rule.value or nil -- Can be nil

    -- build unique rule id
    local arr = {r.metric, r.fct, r.window, r.periods}
    for f, v in table_utils.orderedPairs(r.fields) do
        arr[#arr+1] = string.format('(%s=%s)', f, v)
    end
    r.rule_id = table.concat(arr, '/')

    r.group_by = rule.group_by or {}

    r.cbuf_size = math.ceil(r.window * r.periods / SECONDS_PER_ROW)

    r.ids_datastore = {}
    r.datastore = {}
    r.observation_window = math.ceil(r.window * r.periods)

    return r
end

-- return the id of the datastore receiving the datapoints with these fields:
-- the rule id, followed by the values of the group_by fields if any
function Rule:get_datastore_id(fields)
    if #self.group_by == 0 or fields == nil then
        return self.rule_id
    end

    local arr = {}
    arr[#arr + 1] = self.rule_id
    for _, g in ipairs(self.group_by) do
        arr[#arr + 1] = fields[g]
    end
    return table.concat(arr, '/')
end

-- return true when the fields of a datapoint are accepted by the rule:
-- either the rule has no field matcher, or at least one of the matched
-- fields is present and none of the present ones is rejected
function Rule:fields_accepted(fields)
    if not fields then
        fields = {}
    end
    local matched_fields = 0
    local no_match_on_fields = true
    for f, expression in pairs(self.field_matchers) do
        no_match_on_fields = false
        -- direct lookup instead of scanning all the fields of the datapoint
        local v = fields[f]
        if v ~= nil then
            if expression:matches(v) then
                matched_fields = matched_fields + 1
            else
                return false
            end
        end
    end
    return no_match_on_fields or matched_fields > 0
end

-- return a new single-column circular buffer sized for the rule
function Rule:get_circular_buffer()
    local fct
    if self.fct == 'min' or self.fct == 'max' then
        fct = self.fct
    else
        fct = 'sum'
    end
    local cbuf = circular_buffer.new(self.cbuf_size, 1, SECONDS_PER_ROW)
    cbuf:set_header(1, self.metric, fct, fct)
    return cbuf
end

-- store datapoints in cbuf, create the cbuf if not exists.
-- value can be a table where the index to choose is referenced by self.value_index
function Rule:add_value(ts, value, fields)
    if not self:fields_accepted(fields) then
        return
    end
    if type(value) == 'table' then
        value = value[self.value_index]
    end
    if value == nil then
        return
    end

    local data
    local uniq_field_id = self:get_datastore_id(fields)
    if not self.datastore[uniq_field_id] then
        self.datastore[uniq_field_id] = {
            fields = self.fields,
            cbuf = self:get_circular_buffer()
        }
        -- with group_by, remember the fields of the datapoint itself
        if #self.group_by > 0 then
            self.datastore[uniq_field_id].fields = fields
        end

        self:add_datastore(uniq_field_id)
    end
    data = self.datastore[uniq_field_id]

    -- 'avg' accumulates the datapoints in the row, the other functions
    -- overwrite it
    if self.fct == 'avg' then
        data.cbuf:add(ts, 1, value)
    else
        data.cbuf:set(ts, 1, value)
    end
end

-- register a datastore id, ignoring the ids already known
function Rule:add_datastore(id)
    if not table_utils.item_find(id, self.ids_datastore) then
        self.ids_datastore[#self.ids_datastore+1] = id
    end
end

-- compare a value to the threshold with the relational operator of the rule
function Rule:compare_threshold(value)
    return constants.compare_threshold(value, self.relational_operator, self.threshold)
end

-- return true when value is neither nil nor NaN
local function isnumber(value)
    return value ~= nil and not (value ~= value)
end

local available_functions = {last=true, avg=true, max=true, min=true, sum=true,
                             variance=true, sd=true, diff=true}

-- evaluate the rule against datapoints
-- return a list: result code (afd.MATCH, afd.NO_MATCH, afd.MISSING_DATA or
-- afd.NO_DATA), context ({value=v, fields=fields of the datastore})
--
-- examples:
--   afd.MATCH, { {value=100, fields={queue='nova'}}, ..}
--   afd.NO_MATCH, {}
-- with 2 special cases:
--   - no datapoint has ever been received
--     afd.NO_DATA, { {value=-1, fields=<fields of the rule>} }
--   - no datapoint has been received over the observation window
--     afd.MISSING_DATA, { {value=-1, fields=..}, ..}
-- There is a drawback with the missing data state: it can lead to false
-- positives. For example when the monitored thing has been renamed/deleted,
-- it's normal not to receive datapoints anymore (a filesystem for instance).
-- Evaluate the rule against every datastore at time ns (nanoseconds).
-- Returns a result code and a list of {value=..., fields=...} contexts:
--   afd.MATCH, contexts         at least one datastore matches the threshold
--                               (the list may also hold value=-1 entries of
--                               datastores with missing data)
--   afd.MISSING_DATA, contexts  no match and at least one datastore has no
--                               usable datapoint in the observation window
--   afd.NO_MATCH, {}            values were computed and none matches
--   afd.NO_DATA, {{value=-1, fields=self.fields}}  nothing else applies
function Rule:evaluate(ns)
    local fields = {}
    local one_match, one_no_match, one_missing_data = false, false, false
    for _, id in ipairs(self.ids_datastore) do
        local data = self.datastore[id]
        if data then
            local cbuf_time = data.cbuf:current_time()
            -- if we didn't receive datapoint within the observation window this means
            -- we don't receive anymore data and cannot compute the rule.
            if ns - cbuf_time > self.observation_window * 1e9 then
                one_missing_data = true
                fields[#fields+1] = {value = -1, fields = data.fields}
            else
                assert(available_functions[self.fct])
                local result

                if self.fct == 'last' then
                    -- walk back from ns, one row at a time, until a number
                    -- is found or the observation window is exhausted
                    local last
                    local t = ns
                    while (not isnumber(last)) and t >= ns - self.observation_window * 1e9 do
                        last = data.cbuf:get(t, 1)
                        t = t - SECONDS_PER_ROW * 1e9
                    end
                    if isnumber(last) then
                        result = last
                    else
                        one_missing_data = true
                        fields[#fields+1] = {value = -1, fields = data.fields}
                    end
                elseif self.fct == 'diff' then
                    local first, last

                    -- most recent number of the observation window
                    local t = ns
                    while (not isnumber(last)) and t >= ns - self.observation_window * 1e9 do
                        last = data.cbuf:get(t, 1)
                        t = t - SECONDS_PER_ROW * 1e9
                    end

                    -- oldest number of the observation window
                    if isnumber(last) then
                        t = ns - self.observation_window * 1e9
                        while (not isnumber(first)) and t <= ns do
                            first = data.cbuf:get(t, 1)
                            t = t + SECONDS_PER_ROW * 1e9
                        end
                    end

                    if not isnumber(last) or not isnumber(first) then
                        one_missing_data = true
                        fields[#fields+1] = {value = -1, fields = data.fields}
                    else
                        result = last - first
                    end
                else
                    -- the remaining functions are looked up by name in
                    -- lsb.stats and applied to the whole column
                    local values = data.cbuf:get_range(1)
                    result = stats[self.fct](values)
                end

                if result then
                    local m = self:compare_threshold(result)
                    if m then
                        one_match = true
                        fields[#fields+1] = {value=result, fields=data.fields}
                    else
                        one_no_match = true
                    end
                end
            end
        end
    end
    -- a match wins over missing data, which wins over no match
    if one_match then
        return afd.MATCH, fields
    elseif one_missing_data then
        return afd.MISSING_DATA, fields
    elseif one_no_match then
        return afd.NO_MATCH, {}
    else
        return afd.NO_DATA, {{value=-1, fields=self.fields}}
    end
end

return Rule

-- ----------------------------------------------------------------------------
-- diff --git a/docker/hindsight/modules/constants.lua b/docker/hindsight/modules/constants.lua
-- new file mode 100644
-- index 0000000..50e9bae
-- --- /dev/null
-- +++ b/docker/hindsight/modules/constants.lua
-- @@ -0,0 +1,78 @@
-- ----------------------------------------------------------------------------
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
--     http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module

-- The status values were chosen to match with the Grafana constraints:
-- OKAY => green
-- WARN & UNKW => orange
-- CRIT & DOWN => red
OKAY=0
WARN=1
UNKW=2
CRIT=3
DOWN=4

-- human-readable label of every status
local STATUS_LABELS = {
    [OKAY]='OKAY',
    [WARN]='WARN',
    [UNKW]='UNKNOWN',
    [CRIT]='CRITICAL',
    [DOWN]='DOWN'
}

-- return the label of a status, nil when the status is unknown
function status_label(v)
    return STATUS_LABELS[v]
end

-- weights used to compare statuses; unlike the status values, UNKW is the
-- lowest one
local STATUS_WEIGHTS = {
    [UNKW]=0,
    [OKAY]=1,
    [WARN]=2,
    [CRIT]=3,
    [DOWN]=4
}

-- return the status with the highest weight; a nil (or false) argument is
-- ignored and val2 is returned when both weights are equal
function max_status(val1, val2)
    if not val1 then
        return val2
    elseif not val2 then
        return val1
    elseif STATUS_WEIGHTS[val1] > STATUS_WEIGHTS[val2] then
        return val1
    else
        return val2
    end
end

-- return the result of "value op threshold", where op is either a symbol
-- ('==', '!=', '>=', '>', '<=', '<') or its alias ('eq', 'ne', 'gte', 'gt',
-- 'lte', 'lt'); any other operator yields false
function compare_threshold(value, op, threshold)
    local rule_matches = false
    if op == '==' or op == 'eq' then
        rule_matches = value == threshold
    elseif op == '!=' or op == 'ne' then
        rule_matches = value ~= threshold
    elseif op == '>=' or op == 'gte' then
        rule_matches = value >= threshold
    elseif op == '>' or op == 'gt' then
        rule_matches = value > threshold
    elseif op == '<=' or op == 'lte' then
        rule_matches = value <= threshold
    elseif op == '<' or op == 'lt' then
        rule_matches = value < threshold
    end
    return rule_matches
end

return M

-- ----------------------------------------------------------------------------
-- diff --git a/docker/hindsight/modules/patterns.lua b/docker/hindsight/modules/patterns.lua
-- new file mode 100644
-- index 0000000..f6580e7
-- --- /dev/null
-- +++ b/docker/hindsight/modules/patterns.lua
-- @@ -0,0 +1,34 @@
-- ----------------------------------------------------------------------------
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
--     http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

local l = require 'lpeg'
l.locale(l)

local tonumber = tonumber

local M = {}
setfenv(1, M) -- Remove external access to contain everything in the module

-- return a grammar that tries patt at the current position and otherwise
-- skips one character and tries again
function anywhere (patt)
    return l.P {
        patt + 1 * l.V(1)
    }
end

sp = l.space

-- Pattern used to match a number
-- NOTE(review): xdigit also accepts the letters a-f and the separator may be
-- a comma; tonumber() presumably returns nil for such matches -- confirm
-- whether plain decimal digits were intended.
Number = l.P"-"^-1 * l.xdigit^1 * (l.S(".,") * l.xdigit^1 )^-1 / tonumber

return M

-- ----------------------------------------------------------------------------
-- diff --git a/docker/hindsight/modules/table_utils.lua b/docker/hindsight/modules/table_utils.lua
-- new file mode 100644
-- index 0000000..177e457
-- --- /dev/null
-- +++ b/docker/hindsight/modules/table_utils.lua
-- @@ -0,0 +1,83 @@
-- ----------------------------------------------------------------------------
-- Copyright 2015-2016 Mirantis, Inc.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+local table = require 'table'
+local ipairs = ipairs
+local pairs = pairs
+local type = type
+
+local M = {}
+setfenv(1, M) -- Remove external access to contain everything in the module
+
+-- return the position (index) of an item in a list, nil if not found
+function item_pos(item, list)
+    if type(list) == 'table' then  -- anything but a table yields nil
+        for i, v in ipairs(list) do
+            if v == item then
+                return i
+            end
+        end
+    end
+end
+
+-- return true if an item is present in the list, false otherwise
+function item_find(item, list)
+    return item_pos(item, list) ~= nil
+end
+
+-- from http://lua-users.org/wiki/SortedIteration
+function __genOrderedIndex( t )  -- returns the sorted list of the keys of t
+    local orderedIndex = {}
+    for key in pairs(t) do
+        table.insert( orderedIndex, key )
+    end
+    table.sort( orderedIndex )  -- default '<': keys must be comparable
+    return orderedIndex
+end
+
+function orderedNext(t, state)
+    -- Equivalent of the next function, but returns the keys in sorted
+    -- order. The sorted key list is cached in t.__orderedIndex while the
+    -- iteration runs and removed once the last key has been returned.
+    local key  -- local: a bare 'key' would be stored in the module table
+    if state == nil then
+        -- the first time, generate the index
+        t.__orderedIndex = nil  -- discard the index of an aborted iteration
+        t.__orderedIndex = __genOrderedIndex( t )
+        key = t.__orderedIndex[1]
+    else
+        -- fetch the key following 'state'; keys are unique, so stop there
+        for i, k in ipairs(t.__orderedIndex) do
+            if k == state then
+                key = t.__orderedIndex[i+1]
+                break
+            end
+        end
+    end
+
+    if key then
+        return key, t[key]
+    end
+
+    -- no more value to return, cleanup
+    t.__orderedIndex = nil
+end
+
+function orderedPairs(t)
+    -- Equivalent of the pairs() function on tables. Allows to iterate
+    -- in order
+    return orderedNext, t, nil
+end
+
+return M
diff --git a/docker/hindsight/modules/utils.lua b/docker/hindsight/modules/utils.lua
new file mode 100644
index 0000000..8a512b8
--- /dev/null
+++ b/docker/hindsight/modules/utils.lua
@@ -0,0 +1,46 @@
+-- Copyright 2015-2016 Mirantis, Inc.
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+local cjson = require 'cjson'
+
+local inject_message = inject_message
+local read_message = read_message
+local string = string
+local pcall = pcall
+
+local M = {}
+setfenv(1, M) -- Remove external access to contain everything in the module
+
+-- Encode a Lua variable as JSON without raising an exception if the encoding
+-- fails for some reason (for instance, the encoded buffer exceeds the sandbox
+-- limit). Returns the encoded string, or nothing when the encoding fails.
+function safe_json_encode(v)
+    local ok, data = pcall(cjson.encode, v)
+    if not ok then
+        return
+    end
+    return data
+end
+
+-- Call inject_message() wrapped by pcall(); returns 0 or -1, err_msg
+function safe_inject_message(msg)
+    local ok, err_msg = pcall(inject_message, msg)
+    if not ok then
+        return -1, err_msg
+    else
+        return 0
+    end
+end
+
+return M
diff --git a/docker/hindsight/modules/value_matching.lua b/docker/hindsight/modules/value_matching.lua
new file mode 100644
index 0000000..3b1fd5c
--- /dev/null
+++ b/docker/hindsight/modules/value_matching.lua
@@ -0,0 +1,171 @@
+-- Copyright 2016 Mirantis, Inc.
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+
+local l = require "lpeg"
+l.locale(l)
+local pcall = pcall
+local string = require 'string'
+
+local patterns = require 'stacklight.patterns'
+local error = error
+local setmetatable = setmetatable
+local tonumber = tonumber
+
+local C = l.C
+local P = l.P
+local S = l.S
+local V = l.V
+local Ct = l.Ct
+local Cc = l.Cc
+
+local Optional_space = patterns.sp^0  -- zero or more whitespace characters
+local Only_spaces = patterns.sp^1 * -1  -- whitespace only, up to the end of the subject
+
+local function space(pat)  -- pat with optional whitespace on both sides
+    return Optional_space * pat * Optional_space
+end
+
+local EQ = P'=='
+local NEQ = P'!='
+local GT = P'>'
+local LT = P'<'
+local GTE = P'>='
+local LTE = P'<='
+local MATCH = P'=~'
+local NO_MATCH = P'!~'
+
+local OR = P'||'
+local AND = P'&&'
+
+local function get_operator(op)  -- an omitted operator stands for '=='
+    if op == '' then
+        return '=='
+    end
+    return op
+end
+
+local numerical_operator = (EQ + NEQ + LTE + GTE + GT + LT )^-1 / get_operator  -- '<='/'>=' are tried before '<'/'>'
+local sub_numerical_expression = space(numerical_operator) * patterns.Number * Optional_space
+local is_plain_numeric = (sub_numerical_expression * ((OR^1 + AND^1) * sub_numerical_expression)^0) * -1  -- whole subject: numeric tests joined by ||/&&
+
+local quoted_string = (P'"' * C((P(1) - (P'"'))^1) * P'"' + C((P(1) - patterns.sp)^1))  -- "quoted text" or a run of non-space characters
+local string_operator = (EQ + NEQ + MATCH + NO_MATCH)^-1 / get_operator
+local sub_string_expression = space(string_operator) * quoted_string * Optional_space
+local is_plain_string = (sub_string_expression * ((OR^1 + AND^1) * sub_string_expression)^0) * -1  -- whole subject: string tests joined by ||/&&
+
+local numerical_expression = P {  -- grammar producing nested tables; an OR node is made of AND nodes
+    'OR';
+    AND = Ct(Cc('and') * V'SUB' * space(AND) * V'AND' + V'SUB'),
+    OR = Ct(Cc('or') * V'AND' * space(OR) * V'OR' + V'AND'),
+    SUB = Ct(sub_numerical_expression)
+} * -1
+
+local string_expression = P {  -- same grammar shape, with string sub-expressions as leaves
+    'OR';
+    AND = Ct(Cc('and') * V'SUB' * space(AND) * V'AND' + V'SUB'),
+    OR = Ct(Cc('or') * V'AND' * space(OR) * V'OR' + V'AND'),
+    SUB = Ct(sub_string_expression)
+} * -1
+
+local is_complex = patterns.anywhere(EQ + NEQ + LTE + GTE + GT + LT + MATCH + NO_MATCH + OR + AND)  -- succeeds if an operator occurs anywhere
+
+local function eval_tree(tree, value)  -- returns whether value satisfies the parsed tree
+    local match = false
+
+    if type(tree[1]) == 'table' then  -- wrapper node: only tree[1] is evaluated
+        match = eval_tree(tree[1], value)
+    else
+        local operator = tree[1]
+        if operator == 'and' or operator == 'or' then  -- node: {'and'|'or', child, child, ...}
+            match = eval_tree(tree[2], value)
+            for i=3, #tree, 1 do
+                local m = eval_tree(tree[i], value)
+                if operator == 'or' then
+                    match = match or m
+                else
+                    match = match and m
+                end
+            end
+        else  -- leaf: {operator, operand}
+            local matcher = tree[2]
+            if operator == '==' then
+                return value == matcher
+            elseif operator == '!=' then
+                return value ~= matcher
+            elseif operator == '>' then
+                return value > matcher
+            elseif operator == '<' then
+                return value < matcher
+            elseif operator == '>=' then
+                return value >= matcher
+            elseif operator == '<=' then
+                return value <= matcher
+            elseif operator == '=~' then
+                local ok, m = pcall(string.find, value, matcher)  -- matcher is used as a Lua pattern
+                return ok and m ~= nil
+            elseif operator == '!~' then
+                local ok, m = pcall(string.find, value, matcher)  -- an error raised by find yields false here too
+                return ok and m == nil
+            end
+        end
+    end
+    return match
+end
+
+local MatchExpression = {}
+MatchExpression.__index = MatchExpression
+
+setfenv(1, MatchExpression) -- Remove external access to contain everything in the module
+
+function MatchExpression.new(expression)  -- parse expression; raises an error when it is empty or invalid
+    local r = {}
+    setmetatable(r, MatchExpression)
+    if is_complex:match(expression) then  -- at least one operator is present
+        r.is_plain_numeric_exp = is_plain_numeric:match(expression) ~= nil
+
+        if r.is_plain_numeric_exp then
+            r.tree = numerical_expression:match(expression)
+        elseif is_plain_string:match(expression) ~= nil then
+            r.tree = string_expression:match(expression)
+        end
+        if r.tree == nil then
+            error('Invalid expression: ' .. expression)
+        end
+    else
+        if expression == '' or Only_spaces:match(expression) then
+            error('Expression is empty')
+        end
+        r.is_simple_equality_matching = true  -- no operator: plain equality test
+    end
+    r.expression = expression
+
+    return r
+end
+
+function MatchExpression:matches(value)  -- returns whether value satisfies the expression
+    if self.is_simple_equality_matching then
+        return self.expression == value or
+               tonumber(self.expression) == value or
+               tonumber(value) == self.expression
+    end
+    if self.is_plain_numeric_exp then  -- numeric expression: a non-numeric value never matches
+        value = tonumber(value)
+        if value == nil then
+            return false
+        end
+    end
+    return eval_tree(self.tree, value)
+end
+
+return MatchExpression
diff --git a/docker/hindsight/modules_alarms/afd_node_default_cpu_alarms.lua b/docker/hindsight/modules_alarms/afd_node_default_cpu_alarms.lua
new file mode 100644
index 0000000..e3775bd
--- /dev/null
+++ b/docker/hindsight/modules_alarms/afd_node_default_cpu_alarms.lua
@@ -0,0 +1,71 @@
+local M = {}
+setfenv(1, M) -- Remove external access to contain everything in the module
+
+local alarms = {  -- list of alarm definitions; this table is the module's return value
+    {
+        ['name'] = 'cpu-critical',
+        ['description'] = 'The CPU usage is too high',
+        ['severity'] = 'critical',
+        ['trigger'] = {
+            ['logical_operator'] = 'or',
+            ['rules'] = {
+                {
+                    ['metric'] = 'intel.procfs.cpu.idle_percentage',
+                    ['fields'] = {
+                        ['cpuID'] = 'all'
+                    },
+                    ['relational_operator'] = '<=',
+                    ['threshold'] = '5',  -- NOTE(review): numbers are given as strings; presumably converted by the alarm loader, confirm
+                    ['window'] = '120',
+                    ['periods'] = '0',
+                    ['function'] = 'avg',
+                },
+                {
+                    ['metric'] = 'intel.procfs.cpu.iowait_percentage',
+                    ['fields'] = {
+                        ['cpuID'] = 'all'
+                    },
+                    ['relational_operator'] = '>=',
+                    ['threshold'] = '35',
+                    ['window'] = '120',
+                    ['periods'] = '0',
+                    ['function'] = 'avg',
+                },
+            },
+        },
+    },
+    {
+        ['name'] = 'cpu-warning',
+        ['description'] = 'The CPU usage is high',
+        ['severity'] = 'warning',
+        ['trigger'] = {
+            ['logical_operator'] = 'or',
+            ['rules'] = {
+                {
+                    ['metric'] = 'intel.procfs.cpu.idle_percentage',
+                    ['fields'] = {
+                        ['cpuID'] = 'all'
+                    },
+                    ['relational_operator'] = '<=',
+                    ['threshold'] = '15',
+                    ['window'] = '120',
+                    ['periods'] = '0',
+                    ['function'] = 'avg',
+                },
+                {
+                    ['metric'] = 'intel.procfs.cpu.iowait_percentage',
+                    ['fields'] = {
+                        ['cpuID'] = 'all'
+                    },
+                    ['relational_operator'] = '>=',
+                    ['threshold'] = '25',
+                    ['window'] = '120',
+                    ['periods'] = '0',
+                    ['function'] = 'avg',
+                },
+            },
+        },
+    },
+}
+
+return alarms
diff --git a/service/files/hindsight_afd_node_default_cpu_alarms.cfg.j2 b/service/files/hindsight_afd_node_default_cpu_alarms.cfg.j2
new file mode 100644
index 0000000..a75ff2f
--- /dev/null
+++ b/service/files/hindsight_afd_node_default_cpu_alarms.cfg.j2
@@ -0,0 +1,9 @@
+filename = "afd.lua"
+log_level = 7
+message_matcher = "TRUE"
+ticker_interval = 10
+afd_type = "node"
+afd_file = "afd_node_default_cpu_alarms"
+afd_cluster_name = "default"
+afd_logical_name = "cpu"
+hostname = "{{ CCP_HINDSIGHT_NODE_NAME }}"
diff --git a/service/stacklight-collector.yaml b/service/stacklight-collector.yaml index 177bd52..07286bc 100644 --- a/service/stacklight-collector.yaml +++ b/service/stacklight-collector.yaml @@ -15,6 +15,7 @@ service: - prune-input.cfg - influxdb-tcp.cfg - kubelet-stats.cfg + - afd-node-default-cpu-alarms.cfg volumes: - name: hindsight-output type: empty-dir @@ -70,6 +71,10 @@ files: path: /var/lib/hindsight/run/input/kubelet_stats.cfg content: hindsight_kubelet_stats.cfg.j2 perm: "0600" + afd-node-default-cpu-alarms.cfg: + path: /var/lib/hindsight/run/analysis/afd_node_default_cpu_alarms.cfg + content: hindsight_afd_node_default_cpu_alarms.cfg.j2 + perm: "0600" snap.conf: path: /etc/snap/snap.conf content: snap.conf.j2