Add alarm debounce support to collectd alarm notifier

This update adds a 3 minute alarm debounce feature
to the existing alarm notifier.

To ensure proper alarm/degrade accounting, the mtce degrade
notifier was merged into the alarm notifier.

This update renames the existing 'update_alarm' function to
'debounce', which returns True once the resource has been
debounced and the alarm/degrade settings need to be updated
to the current notification severity.
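
In rough pseudo-Python, condensed from the notifier_func changes
below (illustrative only ; error handling and ordering simplified):

    if obj.debounce(base_obj, obj.entity_id,
                    severity_str, obj.value) is False:
        # no severity change, or debounce still in progress
        return 0
    # debounce complete ; act on the new severity
    mtcDegradeObj.manage_degrade_list(nObject)    # degrade accounting
    mtcDegradeObj.mtce_degrade_notifier(nObject)  # message mtcAgent
    base_obj.manage_alarm_lists(obj.entity_id, severity_str)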

Test Plan:

PASS: Verify debounce from ok to major
PASS: Verify debounce from ok to critical
PASS: Verify debounce from major to ok
PASS: Verify debounce from major to critical
PASS: Verify debounce from critical to ok
PASS: Verify debounce from critical to major
PASS: Verify major to major alarm persists
PASS: Verify critical to critical alarm persists
PASS: Verify handling of major startup alarm that escalates to critical
PASS: Verify handling of critical startup alarm that drops to major threshold
PASS: Verify handling of critical startup alarm that drops below alarming threshold
PASS: Verify clear of major alarmed fs over swact
PASS: Verify clear of critical alarmed/degraded fs over swact
PASS: Verify end to end degrade handling with single source
PASS: Verify end to end degrade handling with multiple sources
PASS: Verify end to end filesystem alarm/degrade management
PASS: Verify end to end interface alarm/degrade management
PASS: Verify debounce handling with random value/wait script loop
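
The last test above drives the notifier with randomized readings. A
minimal driver along these lines is sufficient (hypothetical sketch ;
it assumes the fault-insertion hook shown commented out in the cpu.py
hunk below has been re-enabled so the plugin reads its sample from
/var/run/fit/cpu_data):

    import os
    import random
    import time

    if not os.path.isdir('/var/run/fit'):
        os.makedirs('/var/run/fit')
    while True:
        # random platform cpu reading spanning all alarm thresholds
        with open('/var/run/fit/cpu_data', 'w') as f:
            f.write("%.2f" % random.uniform(0, 100))
        # random wait, per the 'random value/wait script loop' test
        time.sleep(random.choice([10, 30, 60]))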

Change-Id: Ibb9461ce027c5ab5accb64507c7141f10f0d1a88
Partial-Bug: 1848580
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Eric MacDonald 2019-10-24 00:06:14 -04:00
parent c52e8f11ba
commit b330b1bd5c
13 changed files with 556 additions and 455 deletions

View File

@ -4,7 +4,6 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/collectd.conf.pmon \
$PKG_BASE/src/collectd.service \
$PKG_BASE/src/fm_notifier.py \
$PKG_BASE/src/mtce_notifier.py \
$PKG_BASE/src/plugin_common.py \
$PKG_BASE/src/python_plugins.conf \
$PKG_BASE/src/cpu.py \
@ -22,4 +21,4 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/ptp.conf \
$PKG_BASE/src/example.py \
$PKG_BASE/src/example.conf"
TIS_PATCH_VER=13
TIS_PATCH_VER=14

View File

@ -14,7 +14,6 @@ Source2: collectd.conf.pmon
# collectd python plugin files - notifiers
Source3: fm_notifier.py
Source4: mtce_notifier.py
Source5: plugin_common.py
# collectd python plugin files - resource plugins
@ -76,7 +75,6 @@ install -m 600 %{SOURCE2} %{buildroot}%{local_config_extensions_dir}
# collectd python plugin files - notifiers
install -m 700 %{SOURCE3} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE4} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE5} %{buildroot}%{local_python_extensions_dir}
# collectd python plugin files - resource plugins

View File

@ -15,7 +15,6 @@
PersistOK true
WarningMax 90.00
FailureMax 95.00
Hits 2
Invert false
</Type>
</Plugin>

View File

@ -64,7 +64,7 @@ class CPU_object:
# Platform CPU monitor
now = time.time() # epoch time in floating seconds
self._t0 = {} # cputime state information at start of sample interval
self._t0 = {} # cputime state info at start of sample interval
self._t0[TIMESTAMP] = now
self._t0_cpuacct = {}
@ -478,6 +478,16 @@ def read_func():
obj._data[pc.GROUP_K8S_SYSTEM],
obj._data[pc.GROUP_K8S_ADDON]))
# Fault insertion code to assist in regression UT
#
# if os.path.exists('/var/run/fit/cpu_data'):
# with open('/var/run/fit/cpu_data', 'r') as infile:
# for line in infile:
# obj._data[PLATFORM_CPU_PERCENT] = float(line)
# collectd.info("%s using FIT data:%.2f" %
# (PLUGIN, obj._data[PLATFORM_CPU_PERCENT] ))
# break
# Dispatch overall platform cpu usage percent value
val = collectd.Values(host=obj.hostname)
val.plugin = 'cpu'

View File

@ -34,7 +34,6 @@
FailureMax 90.00
Persist true
PersistOK true
Hits 2
Invert false
</Type>
</Plugin>

View File

@ -6,7 +6,6 @@
PersistOK true
WarningMax 49.00
FailureMax 74.00
Hits 1
Invert false
</Type>
</Plugin>

View File

@ -9,18 +9,19 @@
#
# This file is the collectd 'FM Alarm' Notifier.
#
# This notifier manages raising and clearing alarms based on collectd
# notifications ; i.e. automatic collectd calls to this handler/notifier.
# This notifier debounces and then manages raising and clearing alarms
# and sending degrade assert and clear messages to maintenance based on
# Collectd resource usage severity notifications.
#
# Collectd process startup automatically calls this module's init_func which
# declares and initializes a plugObject class for plugin type in preparation
# for periodic ongoing monitoring where collectd calls notify_func for each
# plugin and instance of that plugin.
# for periodic ongoing monitoring where Collectd calls notify_func for each
# plugin and instance of that plugin every audit interval.
#
# All other class or common member functions implemented herein exist in
# support of that aforementioned initialization and periodic monitoring.
#
# Collects provides information about each event as an object passed to the
# Collectd provides information about each event as an object passed to the
# notification handler ; the notification object.
#
# object.host - the hostname.
@ -38,22 +39,22 @@
# This notifier uses the notification object to manage plugin/instance alarms.
#
# To avoid stuck alarms or missing alarms the plugin thresholds should be
# configured with Persist = true and persistOK = true. Thes controls tell
# collectd to always send notifications regardless of state change ; which
# would be the case with these cobtrols set to false.
# configured with Persist = true and persistOK = true. These controls tell
# Collectd to send notifications every audit interval regardless of state
# change.
#
# Persist = false ; only send notifications on 'okay' to 'not okay' change.
# PersistOK = false ; only send notifications on 'not okay' to 'okay' change.
# Persist = False ; only send notifications on 'okay' to 'not okay' change.
# PersistOK = False ; only send notifications on 'not okay' to 'okay' change.
#
# With these both set to true in the threshold spec for the plugin then
# collectd will call this notifier for each audit plugin/instance audit.
# With these both set to True in the threshold spec for the plugin then
# Collectd will call this notifier for each audit plugin/instance audit.
#
# Collectd supports only 2 threshold severities ; warning and failure.
# The 'failure' maps to 'critical' while 'warning' maps to 'major' in FM.
#
# To avoid unnecessary load on FM, this notifier maintains current alarm
# state and only makes an FM call on alarm state changes. Current alarm state
# is queried by the init function called by collectd on process startup.
# is queried by the init function called by Collectd on process startup.
#
# Current alarm state is maintained by two severity lists for each plugin,
# a warnings list and a failures list.
@ -84,7 +85,7 @@
# UT imports
import os
import re
import uuid
import socket
import collectd
from threading import RLock as Lock
from fm_api import constants as fm_constants
@ -110,12 +111,17 @@ DEBUG_AUDIT = 2
# write a 'value' log on a resource sample change of more than this amount
LOG_STEP = 10
# Number of back to back database update misses
MAX_NO_UPDATE_B4_ALARM = 5
# Same state message throttle count.
# Only send the degrade message once every 'this' number of
# notifications while the state, assert or clear, remains the same.
ONE_EVERY = 20
# This plugin name
PLUGIN = 'alarm notifier'
# This plugin's degrade function
PLUGIN_DEGRADE = 'degrade notifier'
# Path to the plugin's drop dir
PLUGIN_PATH = '/etc/collectd.d/'
@ -127,6 +133,12 @@ READING_TYPE__PERCENT_USAGE = '% usage'
# Default invalid threshold value
INVALID_THRESHOLD = float(-1)
# 3 minute alarm assertion debounce
# 1 minute alarm clear debounce
# assuming 30 second interval
DEBOUNCE_FROM_CLEAR_THLD = 7 # (((3 * 60) / 30) + 1)
DEBOUNCE_FROM_ASSERT_THLD = 3
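# Example: at the assumed 30 second audit interval it takes 7
# consecutive out-of-tolerance notifications (~3 minutes) to debounce
# an alarm assertion from clear, but only 3 notifications (~1 minute)
# to debounce a severity change away from an already asserted alarm.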
# collectd severity definitions ;
# Note: can't seem to pull them in symbolically with a header
NOTIF_FAILURE = 1
@ -136,6 +148,11 @@ NOTIF_OKAY = 4
PASS = 0
FAIL = 1
# Maintenance Degrade Service definitions
# default mtce port.
# ... with configuration override
MTCE_CMD_RX_PORT = 2101
# Some plugin_instances are mangled by collectd.
# The filesystem plugin is especially bad for this.
@ -210,6 +227,325 @@ PLUGIN_NAME_LIST = [PLUGIN__CPU,
PLUGIN__VSWITCH_IFACE,
PLUGIN__EXAMPLE]
# Used to find plugin name based on alarm id
# for managing degrade for startup alarms.
ALARM_ID__TO__PLUGIN_DICT = {ALARM_ID__CPU: PLUGIN__CPU,
ALARM_ID__MEM: PLUGIN__MEM,
ALARM_ID__DF: PLUGIN__DF,
ALARM_ID__VSWITCH_CPU: PLUGIN__VSWITCH_CPU,
ALARM_ID__VSWITCH_MEM: PLUGIN__VSWITCH_MEM,
ALARM_ID__VSWITCH_PORT: PLUGIN__VSWITCH_PORT,
ALARM_ID__VSWITCH_IFACE: PLUGIN__VSWITCH_IFACE}
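# e.g. a startup alarm found with id ALARM_ID__DF maps back to the
# 'df' plugin so its degrade eligibility can be re-assessed below.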
#########################################
# The collectd Maintenance Degrade Object
#########################################
class DegradeObject:
def __init__(self, port):
"""DegradeObject Class constructor"""
# maintenance port for degrade messages
self.port = port
# controller floating address
self.addr = None
# specifies the protocol family to use when messaging maintenance.
# if system is IPV6, then that is learned and this 'protocol' is
# updated with AF_INET6
self.protocol = socket.AF_INET
self.resource = ""
# List of plugin names that require degrade for specified severity.
self.degrade_list__failure = [PLUGIN__DF,
PLUGIN__MEM,
PLUGIN__CPU,
PLUGIN__INTERFACE]
self.degrade_list__warning = [PLUGIN__INTERFACE]
# The running list of resources that require degrade.
# a degrade clear message is sent whenever this list is empty.
# a degrade assert message is sent whenever this list is not empty.
self.degrade_list = []
# throttle down sending of duplicate degrade assert/clear messages
self.last_state = "undef"
self.msg_throttle = 0
##########################################################################
#
# Name : _get_active_controller_ip
#
# Purpose : Lookup the active controller's ip address.
#
# Updates : self.addr with the active controller's address or
# None if lookup fails.
#
# Returns : Nothing
#
##########################################################################
def _get_active_controller_ip(self):
"""Get the active controller host IP"""
try:
self.addr = socket.getaddrinfo('controller', None)[0][4][0]
collectd.info("%s controller ip: %s" %
(PLUGIN_DEGRADE, self.addr))
except Exception as ex:
self.addr = None
collectd.error("%s failed to get controller ip ; %s" %
(PLUGIN_DEGRADE, str(ex)))
##########################################################################
#
# Name : mtce_degrade_notifier
#
# Purpose : Message mtcAgent with its requested degrade state of
# the host.
#
# Description: If the degrade list is empty then a clear state is sent to
# maintenance.
#
# If degrade list is NOT empty then an assert state is sent
# to maintenance.
#
# For logging and to ease debug, the code below creates a list of
# degraded resource instances that is included in the message to
# maintenance so that mtcAgent can optionally log it.
#
# Updates : Preserves this state as last state
#
# Returns : Nothing
#
##########################################################################
def mtce_degrade_notifier(self, nObject):
"""Message mtcAgent with collectd degrade state of the host"""
resources = ""
if self.degrade_list:
# loop over the list,
# limit the degraded resource list being sent to mtce to 5
for r in self.degrade_list[0:5]:
resources += r + ','
resources = resources[:-1]
state = "assert"
else:
state = "clear"
# Degrade message throttling ....
#
# Avoid sending the same last state message for up to ONE_EVERY count.
# Degrade state is refreshed every 10 minutes with audit at 30 seconds.
# Just reduce load on mtcAgent.
if self.last_state == state and self.msg_throttle < ONE_EVERY:
self.msg_throttle += 1
return 0
else:
# Clear the message throttle counter
self.msg_throttle = 0
# if the degrade state has changed then log it and proceed
if self.last_state != state:
if self.last_state != "undef":
collectd.info("%s degrade %s %s" %
(PLUGIN_DEGRADE,
state,
self.degrade_list))
# Save state for next time
self.last_state = state
# Send the degrade state ; assert or clear message to mtcAgent.
# If we get a send failure then log it and set the addr to None
# so it forces us to refresh the controller address on the next
# notification
try:
mtce_socket = socket.socket(self.protocol, socket.SOCK_DGRAM)
if mtce_socket:
if self.addr is None:
self._get_active_controller_ip()
if self.addr is None:
collectd.error("%s cannot send degrade notification ; "
"controller address lookup failed" %
PLUGIN_DEGRADE)
return 0
# Create the Maintenance message.
message = "{\"service\":\"collectd_notifier\","
message += "\"hostname\":\"" + nObject.host + "\","
message += "\"degrade\":\"" + state + "\","
message += "\"resource\":\"" + resources + "\"}"
collectd.info("%s: %s" % (PLUGIN_DEGRADE, message))
mtce_socket.settimeout(1.0)
mtce_socket.sendto(message, (self.addr, self.port))
mtce_socket.close()
else:
collectd.error("%s %s failed to open socket (%s)" %
(PLUGIN_DEGRADE, self.resource, self.addr))
except socket.error as e:
if e.args[0] == socket.EAI_ADDRFAMILY:
# Handle IPV4 to IPV6 switchover:
self.protocol = socket.AF_INET6
collectd.info("%s %s ipv6 addressing (%s)" %
(PLUGIN_DEGRADE, self.resource, self.addr))
else:
collectd.error("%s %s socket error (%s) ; %s" %
(PLUGIN_DEGRADE,
self.resource,
self.addr,
str(e)))
# try self correction
self.addr = None
self.protocol = socket.AF_INET
##########################################################################
#
# Name : _df_instance_to_path
#
# Purpose : Convert filesystem instance to path
#
# Returns : Created path
#
##########################################################################
def _df_instance_to_path(self, df_inst):
"""Convert a df instance name to a mountpoint"""
# df_root is not a dynamic file system. Ignore that one.
if df_inst == 'df_root':
return '/'
else:
# For all others replace all '-' with '/'
return('/' + df_inst[3:].replace('-', '/'))
##########################################################################
#
# Name : remove_degrade_for_missing_filesystems
#
# Purpose : Removes degraded filesystems that are no longer mounted.
#
# Updates : might update self.degrade_list
#
# Returns : Nothing
#
##########################################################################
def remove_degrade_for_missing_filesystems(self):
"""Remove file systems that are no longer mounted"""
for df_inst in self.degrade_list:
# Only file system plugins are looked at.
# File system plugin instance names are prefixed with 'df_'
# as the first 3 chars in the instance name.
if df_inst[0:3] == 'df_':
path = self._df_instance_to_path(df_inst)
# check the mount point.
# if the mount point no longer exists then remove
# this instance from the degrade list.
if os.path.ismount(path) is False:
collectd.info("%s clearing degrade for missing %s ; %s" %
(PLUGIN_DEGRADE, path, self.degrade_list))
self.degrade_list.remove(df_inst)
##########################################################################
#
# Name : manage_degrade_list
#
# Purpose : Track the resources that require this host to be degraded.
#
# Description: Manages the 'degrade_list' based on collectd notifications.
#
# Updates : self.degrade list with resource names that have severity
# levels that require the host to be degraded.
#
# Returns : Nothing
#
###########################################################################
def manage_degrade_list(self, nObject):
"""Collectd Mtce Notifier Handler Function"""
remove = False
add = False
# Create the resource name from the notifier object.
# format: <plugin name>_<plugin_instance_name>
resource = nObject.plugin
if nObject.plugin_instance:
resource += "_" + nObject.plugin_instance
# This block looks at the current notification severity
# and manages the degrade_list.
# If the specified plugin name exists in each of the warnings
# or failure lists and there is a current severity match then
# add that resource instance to the degrade list.
# Conversely, if this notification is OKAY then make sure this
# resource instance is not in the degrade list (remove it if it is)
if nObject.severity is NOTIF_OKAY:
if self.degrade_list and resource in self.degrade_list:
remove = True
elif nObject.severity is NOTIF_FAILURE:
if self.degrade_list__failure:
if nObject.plugin in self.degrade_list__failure:
if resource not in self.degrade_list:
# handle dynamic filesystems going missing over a swact
# or unmount and being reported as a transient error by
# the df plugin. Don't add it to the failed list if the
# mountpoint is gone.
add = True
if nObject.plugin == PLUGIN__DF:
path = self._df_instance_to_path(resource)
add = os.path.ismount(path)
else:
# If severity is failure and no failures cause degrade
# then make sure this plugin is not in the degrade list.
# Should never occur.
if resource in self.degrade_list:
remove = True
elif nObject.severity is NOTIF_WARNING:
if self.degrade_list__warning:
if nObject.plugin in self.degrade_list__warning:
if resource not in self.degrade_list:
# handle dynamic filesystems going missing over a swact
# or unmount and being reported as a transient error by
# the df plugin. Don't add it to the failed list if the
# mountpoint is gone.
add = True
if nObject.plugin == PLUGIN__DF:
path = self._df_instance_to_path(resource)
add = os.path.ismount(path)
elif resource in self.degrade_list:
remove = True
else:
# If severity is warning and no warnings cause degrade
# then make sure this plugin is not in the degrade list.
if resource in self.degrade_list:
remove = True
else:
collectd.info("%s unsupported severity %d" %
(PLUGIN_DEGRADE, nObject.severity))
if remove is True:
self.degrade_list.remove(resource)
collectd.info("%s %s removed from degrade list" %
(PLUGIN_DEGRADE, resource))
elif add is True:
self.degrade_list.append(resource)
collectd.info("%s %s added to degrade list" %
(PLUGIN_DEGRADE, resource))
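# e.g. a 'failure' notification from the 'df' plugin for instance
# 'var-log' adds 'df_var-log' to the degrade list, provided /var/log
# is still mounted ; a later 'okay' notification removes it again.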
# Instantiate the maintenance degrade object
# This object persists from notification to notification
mtcDegradeObj = DegradeObject(MTCE_CMD_RX_PORT)
# PluginObject Class
class PluginObject:
@ -269,6 +605,10 @@ class PluginObject:
self.warnings = []
self.failures = []
# alarm debounce control
self.warnings_debounce_counter = 0
self.failures_debounce_counter = 0
# total notification count
self.count = 0
@ -332,7 +672,7 @@ class PluginObject:
##########################################################################
#
# Name : _manage_change
# Name : manage_change
#
# Purpose : Manage sample value change.
#
@ -344,7 +684,7 @@ class PluginObject:
#
##########################################################################
def _manage_change(self, nObject):
def manage_change(self, nObject):
"""Log resource instance value on step state change"""
# filter out messages to ignore ; notifications that have no value
@ -430,6 +770,7 @@ class PluginObject:
#
# Note: only usage type so far
if logit:
resource = self.resource_name
# setup resource name for filesystem instance usage log
@ -469,48 +810,157 @@ class PluginObject:
self.reading_type,
resource))
# update last logged value
self.last_value = round(self.value, 2)
##########################################################################
#
# Name : _update_alarm
# Name : debounce
#
# Purpose : Compare current severity to instance severity lists to
# facilitate early 'do nothing' exit from a notification.
# Purpose : Debounce alarm and degrade action handling based on
# severity notifications from plugins.
#
# Description: Avoid clearing an already cleared alarm.
# Refresh asserted alarm data for usage reading type alarms
# Description: Clear to assert has a 3 minute debounce.
# All other state changes have a 1 minute debounce.
# A true return indicates that debounce is complete and the
# current alarm severity needs to be acted upon.
#
# Returns : True if the alarm needs refresh, otherwise false.
# A false return means that there is no severity change or
# that debouncing a severity change is in progress and the
# caller should not take action on the current notification.
#
# Returns : True if the alarm needs state change.
# False during debounce or if no alarm state change is needed.
#
##########################################################################
def _update_alarm(self, entity_id, severity, this_value, last_value):
def debounce(self, base_obj, entity_id, severity, this_value):
"""Check for need to update alarm data"""
if entity_id in self.warnings:
rc = False
logit = False
# Only % Usage readings are debounced and alarmed
if base_obj.reading_type != READING_TYPE__PERCENT_USAGE:
return False
if entity_id in base_obj.warnings:
self._llog(entity_id + " is already in warnings list")
current_severity_str = "warning"
elif entity_id in self.failures:
elif entity_id in base_obj.failures:
self._llog(entity_id + " is already in failures list")
current_severity_str = "failure"
else:
self._llog(entity_id + " is already OK")
current_severity_str = "okay"
# Compare to current state to previous state.
# If they are the same then return done.
# No severity change case
# Always clear debounce counters with no severity level change
if severity == current_severity_str:
if severity == "okay":
return False
if self.reading_type != READING_TYPE__PERCENT_USAGE:
return False
elif round(last_value, 2) == round(this_value, 2):
return False
return True
self.warnings_debounce_counter = 0
self.failures_debounce_counter = 0
# From Okay -> Warning Case - PASS
elif current_severity_str == "okay" and severity == "warning":
logit = True
self.warnings_debounce_counter += 1
if self.warnings_debounce_counter >= DEBOUNCE_FROM_CLEAR_THLD:
rc = True
# Special Case: the failures debounce counter should clear in this
# case so that several failures followed by a warning and then
# more failures do not qualify for the failure alarm assertion.
# It takes back to back DEBOUNCE_FROM_CLEAR_THLD failures to
# constitute a failure alarm.
self.failures_debounce_counter = 0
# From Okay -> Failure
elif current_severity_str == "okay" and severity == "failure":
logit = True
self.failures_debounce_counter += 1
if self.failures_debounce_counter >= DEBOUNCE_FROM_CLEAR_THLD:
rc = True
# Special Case: the warning debounce counter should track failures
# so that, say, 2 failures followed by a warning still constitute
# a valid okay to warning alarm assertion.
self.warnings_debounce_counter += 1
# From Failure -> Okay Case
elif current_severity_str == "failure" and severity == "okay":
logit = True
self.failures_debounce_counter += 1
if self.failures_debounce_counter >= DEBOUNCE_FROM_ASSERT_THLD:
rc = True
# Special Case: recovery from failure can be to okay or warning.
# Say at failure we get 2 okays and then a warning ; that should
# be allowed as a valid debounce from failure to warning.
self.warnings_debounce_counter += 1
# From Failure -> Warning Case
elif current_severity_str == "failure" and severity == "warning":
logit = True
self.failures_debounce_counter += 1
if self.failures_debounce_counter >= DEBOUNCE_FROM_ASSERT_THLD:
rc = True
# From Warning -> Okay Case
elif current_severity_str == "warning" and severity == "okay":
logit = True
self.warnings_debounce_counter += 1
if self.warnings_debounce_counter >= DEBOUNCE_FROM_ASSERT_THLD:
rc = True
# Special Case: Any previously thresholded failure count
# should be cleared. Say we are at this warning level but
# started debouncing a failure severity. Then before the
# failure debounce completed we got an okay (this clause).
# Then on the next audit we get another failure event.
# Without clearing the failure count on this okay we would
# mistakenly qualify for a failure debounce by continuing
# to count up the failures debounce count.
self.failures_debounce_counter = 0
# From Warning -> Failure Case
elif current_severity_str == "warning" and severity == "failure":
logit = True
self.failures_debounce_counter += 1
if self.failures_debounce_counter >= DEBOUNCE_FROM_ASSERT_THLD:
rc = True
# Special Case: while in warning severity and debouncing to okay,
# a failure reading means the warning debounce count needs to be
# cleared. Otherwise the next okay would qualify the clear, which
# it should not because a failure was seen during the warning to
# okay debounce.
self.warnings_debounce_counter = 0
if logit is True:
collectd.info("%s %s %s debounce '%s -> %s' (%2.2f) (%d:%d) %s" % (
PLUGIN,
base_obj.resource_name,
self.instance,
current_severity_str,
severity,
this_value,
self.warnings_debounce_counter,
self.failures_debounce_counter,
rc))
if rc is True:
# clear both debounce counters on every state change
self.warnings_debounce_counter = 0
self.failures_debounce_counter = 0
return rc
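# Worked example, assuming the 30 second audit interval: starting
# from 'okay', seven consecutive 'warning' notifications are needed
# before this function returns True and the major alarm is asserted
# (~3 minutes) ; from that asserted severity, three consecutive
# 'okay' notifications (~1 minute) complete the clear debounce.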
########################################################################
#
# Name : _manage_alarm
# Name : manage_alarm_lists
#
# Putpose : Alarm Severity Tracking
# Purpose : Alarm Severity Tracking
#
# This class member function accepts a severity level and entity id.
# It manages the content of the current alarm object's 'failures' and
@ -550,7 +1000,7 @@ class PluginObject:
#
#########################################################################
def _manage_alarm(self, entity_id, severity):
def manage_alarm_lists(self, entity_id, severity):
"""Manage the alarm severity lists and report state change"""
collectd.debug("%s manage alarm %s %s %s" %
@ -728,13 +1178,13 @@ class PluginObject:
##########################################################################
#
# Name : _create_instance_object
# Name : create_instance_object
#
# Purpose : Create a new instance object and tack it on the supplied base
# object's instance object dictionary.
#
##########################################################################
def _create_instance_object(self, instance):
def create_instance_object(self, instance):
try:
# create a new plugin object
@ -767,7 +1217,7 @@ class PluginObject:
##########################################################################
#
# Name : _create_instance_objects
# Name : create_instance_objects
#
# Purpose : Create a list of instance objects for 'self' type plugin and
# add those objects to the parent's instance_objects dictionary.
@ -776,7 +1226,7 @@ class PluginObject:
# All other instance creations/allocations are done on-demand.
#
##########################################################################
def _create_instance_objects(self):
def create_instance_objects(self):
"""Create, initialize and add an instance object to this/self plugin"""
# Create the File System subordinate instance objects.
@ -882,7 +1332,7 @@ def clear_alarm(alarm_id, eid):
return False
def _get_base_object(alarm_id):
def get_base_object(alarm_id):
"""Get the alarm object for the specified alarm id"""
for plugin in PLUGIN_NAME_LIST:
if PLUGINS[plugin].id == alarm_id:
@ -890,10 +1340,10 @@ def _get_base_object(alarm_id):
return None
def _get_object(alarm_id, eid):
def get_object(alarm_id, eid):
"""Get the plugin object for the specified alarm id and eid"""
base_obj = _get_base_object(alarm_id)
base_obj = get_base_object(alarm_id)
if len(base_obj.instance_objects):
try:
return(base_obj.instance_objects[eid])
@ -1143,7 +1593,7 @@ def _clear_alarm_for_missing_filesystems():
# At this point we don't care about severity, we just need to
# determine if an any-severity' alarmed filesystem no longer exists
# so we can cleanup by clearing its alarm.
# Note: the 2 lists shpould always contain unique data between them
# Note: the 2 lists should always contain unique data between them
alarm_list = df_base_obj.warnings + df_base_obj.failures
if len(alarm_list):
for eid in alarm_list:
@ -1162,7 +1612,7 @@ def _clear_alarm_for_missing_filesystems():
if clear_alarm(df_base_obj.id, obj.entity_id) is True:
collectd.info("%s cleared alarm for missing %s" %
(PLUGIN, path))
df_base_obj._manage_alarm(obj.entity_id, "okay")
df_base_obj.manage_alarm_lists(obj.entity_id, "okay")
else:
collectd.debug("%s maintaining alarm for %s" %
(PLUGIN, path))
@ -1174,6 +1624,10 @@ def _clear_alarm_for_missing_filesystems():
def init_func():
"""Collectd FM Notifier Initialization Function"""
mtcDegradeObj.port = MTCE_CMD_RX_PORT
collectd.error("%s mtce port %d" %
(PLUGIN, mtcDegradeObj.port))
PluginObject.lock = Lock()
PluginObject.host = os.uname()[1]
@ -1211,7 +1665,7 @@ def init_func():
# The FileSystem (DF) plugin has multiple instances
# One instance per file system mount point being monitored.
# Create one DF instance object per mount point
obj._create_instance_objects()
obj.create_instance_objects()
# ntp query is for controllers only
if want_vswitch is False:
@ -1323,9 +1777,10 @@ def notifier_func(nObject):
if PluginObject.host not in eid:
continue
base_obj = _get_base_object(alarm_id)
if base_obj is None:
# might be a plugin instance - clear it
base_obj = get_base_object(alarm_id)
inst_obj = get_object(alarm_id, eid)
if base_obj != inst_obj:
# is a plugin instance - clear it
want_alarm_clear = True
collectd.info('%s found %s %s alarm [%s]' %
@ -1359,7 +1814,25 @@ def notifier_func(nObject):
# Load the alarm severity by plugin/instance lookup.
if base_obj is not None:
base_obj._manage_alarm(eid, sev)
base_obj.manage_alarm_lists(eid, sev)
# handle degrade for alarmed resources
# over process startup.
# Note: 'ap' stands for alarmed_plugin
ap = ALARM_ID__TO__PLUGIN_DICT[alarm_id]
add = False
if alarm.severity == "critical" and\
ap in mtcDegradeObj.degrade_list__failure:
add = True
elif alarm.severity == "major" and\
ap in mtcDegradeObj.degrade_list__warning:
add = True
if add is True:
mtcDegradeObj.degrade_list.append(ap)
collectd.info("%s '%s' plugin added to "
"degrade list due to found "
"startup alarm %s" %
(PLUGIN_DEGRADE, ap, alarm_id))
PluginObject.fm_connectivity = True
collectd.info("%s initialization complete" % PLUGIN)
@ -1443,7 +1916,7 @@ def notifier_func(nObject):
need_instance_object_create = True
if need_instance_object_create is True:
base_obj._create_instance_object(nObject.plugin_instance)
base_obj.create_instance_object(nObject.plugin_instance)
inst_obj = base_obj._get_instance_object(eid)
if inst_obj:
collectd.debug("%s %s:%s inst object created" %
@ -1479,7 +1952,6 @@ def notifier_func(nObject):
# If want_state_audit is True then run the audit.
# Primarily used for debug
# default state is False
# TODO: comment out for production code.
if want_state_audit:
obj.audit_threshold += 1
if obj.audit_threshold == DEBUG_AUDIT:
@ -1487,7 +1959,7 @@ def notifier_func(nObject):
obj._state_audit("audit")
# manage reading value change ; store last and log if gt obj.step
action = obj._manage_change(nObject)
action = obj.manage_change(nObject)
if action == "done":
return 0
@ -1499,15 +1971,25 @@ def notifier_func(nObject):
# always be there.
if obj.instance == '/':
_clear_alarm_for_missing_filesystems()
if len(mtcDegradeObj.degrade_list):
mtcDegradeObj.remove_degrade_for_missing_filesystems()
# exit early if there is no alarm update to be made
if base_obj._update_alarm(obj.entity_id,
severity_str,
obj.value,
obj.last_value) is False:
if obj.debounce(base_obj,
obj.entity_id,
severity_str,
obj.value) is False:
# Call the degrade notifier at steady state,
# degrade or clear, so that the required collectd
# degrade state is periodically refreshed.
# However, rather than do this refresh on every notification,
# just do it for the root filesystem instance case.
if obj.instance == '/':
mtcDegradeObj.mtce_degrade_notifier(nObject)
return 0
obj.last_value = round(obj.value, 2)
mtcDegradeObj.manage_degrade_list(nObject)
mtcDegradeObj.mtce_degrade_notifier(nObject)
if _alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
if clear_alarm(obj.id, obj.entity_id) is False:
@ -1565,7 +2047,7 @@ def notifier_func(nObject):
return 0
# update the lists now that
base_obj._manage_alarm(obj.entity_id, severity_str)
base_obj.manage_alarm_lists(obj.entity_id, severity_str)
collectd.info("%s %s alarm %s:%s %s:%s value:%2.2f" % (
PLUGIN,

View File

@ -6,7 +6,6 @@
PersistOK true
WarningMin 51
FailureMin 1
# Hits 2
Invert false
</Type>
</Plugin>

View File

@ -14,7 +14,6 @@
PersistOK true
WarningMax 80.00
FailureMax 90.00
Hits 2
Invert false
</Type>
</Plugin>

View File

@ -1,380 +0,0 @@
#
# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#############################################################################
#
# This file is the collectd 'Maintenance' Notifier.
#
# Collects provides information about each event as an object passed to the
# notification handler ; the notification object.
#
# object.host - the hostname
#
# object.plugin - the name of the plugin aka resource
# object.plugin_instance - plugin instance string i.e. say mountpoint
# for df plugin
# object.type, - the unit i.e. percent or absolute
# object.type_instance - the attribute i.e. free, used, etc
#
# object.severity - a integer value 0=OK , 1=warning, 2=failure
# object.message - a log-able message containing the above along
# with the value
#
# This notifier manages requesting mtce to assert or clear its collectd
# host-degrade-cause flag based on notification messages sent from collectd.
#
# Messages to maintenance are throttled ONE_EVERY while this state is the
# same as last state.
#
# Message is sent on every state change
# from clear to assert or
# from assert to clear
#
# See code comments for details.
#
############################################################################
#
# Import list
import os
import socket
import collectd
import tsconfig.tsconfig as tsc
# This plugin name
PLUGIN = 'degrade notifier'
# collectd severity definitions ;
# Note: can't seem to pull then in symbolically with a header
NOTIF_FAILURE = 1
NOTIF_WARNING = 2
NOTIF_OKAY = 4
# default mtce port.
# ... with configuration override
MTCE_CMD_RX_PORT = 2101
# same state message throttle count.
# ... only send the degrade message every 'this' number
# while the state of assert or clear remains the same.
ONE_EVERY = 10
PLUGIN__DF = 'df'
PLUGIN__MEM = 'memory'
PLUGIN__CPU = 'cpu'
PLUGIN__VSWITCH_MEM = 'vswitch_mem'
PLUGIN__VSWITCH_CPU = 'vswitch_cpu'
PLUGIN__VSWITCH_PORT = "vswitch_port"
PLUGIN__VSWITCH_IFACE = "vswitch_iface"
PLUGIN_INTERFACE = 'interface'
PLUGIN__EXAMPLE = 'example'
# The collectd Maintenance Notifier Object
class collectdMtceNotifierObject:
def __init__(self, port):
"""collectdMtceNotifierObject Class constructor"""
# default maintenance port
self.port = port
self.addr = None
# specifies the protocol family to use when messaging maintenance.
# if system is IPV6, then that is learned and this 'protocol' is
# updated with AF_INET6
self.protocol = socket.AF_INET
# List of plugin names that require degrade for specified severity.
self.degrade_list__failure = [PLUGIN__DF,
PLUGIN__MEM,
PLUGIN__CPU,
PLUGIN__VSWITCH_MEM,
PLUGIN__VSWITCH_CPU,
PLUGIN__VSWITCH_PORT,
PLUGIN__VSWITCH_IFACE,
PLUGIN_INTERFACE,
PLUGIN__EXAMPLE]
self.degrade_list__warning = [PLUGIN_INTERFACE]
# the running list of resources that require degrade.
# a degrade clear message is sent whenever this list is empty.
# a degrade assert message is sent whenever this list is not empty.
self.degrade_list = []
# throttle down sending of duplicate degrade assert/clear messages
self.last_state = "undef"
self.msg_throttle = 0
# Instantiate the mtce_notifier object
# This object persists from notificaiton to notification
obj = collectdMtceNotifierObject(MTCE_CMD_RX_PORT)
def _get_active_controller_ip():
"""Get the active controller host IP"""
try:
obj.addr = socket.getaddrinfo('controller', None)[0][4][0]
collectd.info("%s controller ip: %s" % (PLUGIN, obj.addr))
except Exception as ex:
obj.addr = None
collectd.error("%s failed to get controller ip ; %s" %
(PLUGIN, str(ex)))
return 0
def _df_instance_to_path(df_inst):
"""Convert a df instance name to a mountpoint"""
# df_root is not a dynamic file system. Ignore that one.
if df_inst == 'df_root':
return '/'
else:
# For all others replace all '-' with '/'
return('/' + df_inst[3:].replace('-', '/'))
# This function removes degraded file systems that are no longer present.
def _clear_degrade_for_missing_filesystems():
"""Remove degraded file systems that are no longer mounted or present"""
for df_inst in obj.degrade_list:
# Only file system plugins are looked at.
# File system plugin instance names are prefixed with 'df_'
# as the first 3 chars in the instance name.
if df_inst[0:3] == 'df_':
path = _df_instance_to_path(df_inst)
# check the mount point.
# if the mount point no longer exists then remove
# this instance from the degrade list.
if os.path.ismount(path) is False:
collectd.info("%s clearing degrade for missing %s ; %s" %
(PLUGIN, path, obj.degrade_list))
obj.degrade_list.remove(df_inst)
return 0
# The collectd configuration interface
#
# Used to configure the maintenance port.
# key = 'port'
# val = port number
#
def config_func(config):
"""Configure the maintenance degrade notifier plugin"""
collectd.debug('%s config function' % PLUGIN)
for node in config.children:
key = node.key.lower()
val = node.values[0]
if key == 'port':
obj.port = int(val)
collectd.info("%s configured mtce port: %d" %
(PLUGIN, obj.port))
return 0
obj.port = MTCE_CMD_RX_PORT
collectd.error("%s no mtce port provided ; defaulting to %d" %
(PLUGIN, obj.port))
# Collectd calls this function on startup.
def init_func():
"""Collectd Mtce Notifier Initialization Function"""
obj.host = os.uname()[1]
collectd.info("%s %s:%s sending to mtce port %d" %
(PLUGIN, tsc.nodetype, obj.host, obj.port))
collectd.debug("%s init function" % PLUGIN)
# This is the Notifier function that is called by collectd.
#
# Handling steps are
#
# 1. build resource name from notification object.
# 2. check resource against severity lists.
# 3. manage this instance's degrade state.
# 4. send mtcAgent the degrade state message.
#
def notifier_func(nObject):
"""Collectd Mtce Notifier Handler Function"""
# Create the resource name from the notifier object.
# format: <plugin name>_<plugin_instance_name>
resource = nObject.plugin
if nObject.plugin_instance:
resource += "_" + nObject.plugin_instance
# This block looks at the current notification severity
# and manages the degrade_list.
# If the specified plugin name exists in each of the warnings
# or failure lists and there is a current severity match then
# add that resource instance to the degrade list.
# Conversly if this notification is OKAY then make sure this
# resource instance is not in the degrade list (remove it if it is)
if nObject.severity is NOTIF_OKAY:
if obj.degrade_list and resource in obj.degrade_list:
obj.degrade_list.remove(resource)
elif nObject.severity is NOTIF_FAILURE:
if obj.degrade_list__failure:
if nObject.plugin in obj.degrade_list__failure:
if resource not in obj.degrade_list:
# handle dynamic filesystems going missing over a swact
# or unmount and being reported as a transient error by
# the df plugin. Don't add it to the failed list if the
# mountpoint is gone.
add = True
if nObject.plugin == PLUGIN__DF:
path = _df_instance_to_path(resource)
add = os.path.ismount(path)
if add is True:
collectd.info("%s %s added to degrade list" %
(PLUGIN, resource))
obj.degrade_list.append(resource)
else:
# If severity is failure and no failures cause degrade
# then make sure this plugin is not in the degrade list,
# Should never occur.
if resource in obj.degrade_list:
obj.degrade_list.remove(resource)
elif nObject.severity is NOTIF_WARNING:
if obj.degrade_list__warning:
if nObject.plugin in obj.degrade_list__warning:
if resource not in obj.degrade_list:
# handle dynamic filesystems going missing over a swact
# or unmount and being reported as a transient error by
# the df plugin. Don't add it to the failed list if the
# mountpoint is gone.
add = True
if nObject.plugin == PLUGIN__DF:
path = _df_instance_to_path(resource)
add = os.path.ismount(path)
if add is True:
collectd.info("%s %s added to degrade list" %
(PLUGIN, resource))
obj.degrade_list.append(resource)
else:
# If severity is warning and no warnings cause degrade
# then make sure this plugin is not in the degrade list.
# Should never occur..
if resource in obj.degrade_list:
obj.degrade_list.remove(resource)
else:
collectd.info("%s unsupported severity %d" %
(PLUGIN, nObject.severity))
return 0
# running counter of notifications.
obj.msg_throttle += 1
# Support for Dynamic File Systems
# --------------------------------
# Some active controller mounted filesystems can become
# unmounted under the watch of collectd. This can occur
# as a result of a Swact. If an 'degrade' is raised at the
# time an fs disappears then that state can become stuck
# active until the next Swact. This call handles this case.
#
# Audit file system presence every time we get the
# notification for the root file system.
# Depending on the root filesystem always being there.
if nObject.plugin == 'df' \
and nObject.plugin_instance == 'root' \
and len(obj.degrade_list):
_clear_degrade_for_missing_filesystems()
# If degrade list is empty then a clear state is sent to maintenance.
# If degrade list is NOT empty then an assert state is sent to maintenance
# For logging and to ease debug the code below will create a list of
# degraded resource instances to be included in the message to maintenance
# for mtcAgent to optionally log it.
resources = ""
if obj.degrade_list:
# loop over the list,
# limit the degraded resource list being sent to mtce to 5
for r in obj.degrade_list[0:1:5]:
resources += r + ','
resources = resources[:-1]
state = "assert"
else:
state = "clear"
# Message throttling ....
# Avoid sending the same last state message for up to ONE_EVERY count.
# Just reduce load on mtcAgent
if obj.last_state == state and obj.msg_throttle < ONE_EVERY:
return 0
# if the degrade state has changed then log it and proceed
if obj.last_state != state:
if obj.last_state != "undef":
collectd.info("%s degrade %s %s" %
(PLUGIN,
state,
obj.degrade_list))
# Save state for next time
obj.last_state = state
# Clear the message throttle counter
obj.msg_throttle = 0
# Send the degrade state ; assert or clear message to mtcAgent.
# If we get a send failure then log it and set the addr to None
# so it forces us to refresh the controller address on the next
# notification
try:
mtce_socket = socket.socket(obj.protocol, socket.SOCK_DGRAM)
if mtce_socket:
if obj.addr is None:
_get_active_controller_ip()
if obj.addr is None:
return 0
# Create the Maintenance message.
message = "{\"service\":\"collectd_notifier\","
message += "\"hostname\":\"" + nObject.host + "\","
message += "\"degrade\":\"" + state + "\","
message += "\"resource\":\"" + resources + "\"}"
collectd.debug("%s: %s" % (PLUGIN, message))
mtce_socket.settimeout(1.0)
mtce_socket.sendto(message, (obj.addr, obj.port))
mtce_socket.close()
else:
collectd.error("%s %s failed to open socket (%s)" %
(PLUGIN, resource, obj.addr))
except socket.error as e:
if e.args[0] == socket.EAI_ADDRFAMILY:
# Handle IPV4 to IPV6 switchover:
obj.protocol = socket.AF_INET6
collectd.info("%s %s ipv6 addressing (%s)" %
(PLUGIN, resource, obj.addr))
else:
collectd.error("%s %s socket error (%s) ; %s" %
(PLUGIN, resource, obj.addr, str(e)))
# try self correction
obj.addr = None
obj.protocol = socket.AF_INET
return 0
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_notification(notifier_func)

View File

@ -6,7 +6,6 @@
PersistOK true
WarningMin 1
FailureMin 0
Hits 2
Invert false
</Type>
</Plugin>

View File

@ -8,7 +8,6 @@
FailureMax 1000000
WarningMin -1000
FailureMin -1000000
Hits 2
Invert false
</Type>
</Plugin>

View File

@ -6,7 +6,6 @@
PersistOK true
WarningMin 1
FailureMin 0
Hits 2
Invert false
</Type>
</Plugin>