Add alarm audit to starlingx collectd fm notifier plugin

This update adds common plugin support for alarm state auditing.
The audit is able to detect and correct the following alarm
state errors:

   Error Case                Correction Action
   -----------------------   -----------------
 - stale alarm             ; delete alarm
 - missing alarm           ; assert alarm
 - alarm severity mismatch ; refresh alarm

The common audit is enabled for the fm_notifier plugin that supports
alarm management for the following resources.

 - CPU with alarm id 100.101
 - Memory with alarm id 100.103
 - Filesystem with alarm id 100.104

Other plugins may use this common audit in the future but only the
above resources have the audit enabled for them by this update.

Test Plan:

PASS: Verify stale alarm detection/correction handling
PASS: Verify missing alarm detection/correction handling
PASS: Verify alarm severity mismatch detection/correction handling
PASS: Verify each host only audits its own specified alarms
PASS: Verify success path of monitoring a single and mix
      of base and instance alarms of varying severity while
      such alarm conditions come and go
PASS: Verify alarm audit of mix of base and instance alarms
      over a collectd process restart
PASS: Verify audit handling of alarm that migrates from
      major to critical to major to clear
PASS: Verify audit handling transition between alarm and
      no alarm conditions
PASS: Verify soak of random cpu, memory and filesystem
      overage alarm assertions and clears that also involve
      manual alarm deletions, assertions and severity changes
      that exercise new audit features

Regression:

PASS: Verify alarm and audit handling over Swact with mounted
      filesystem that has active alarm
PASS: Verify collectd logs following a system install and
      while alarms are managed during above soak
PASS: Verify behavior while FM is killed or stopped/started
PASS: Verify Standard system install with Sanity and Regression
PASS: Verify AIO DX/DC systems install with Sanity and Regression

Closes-Bug: 1925210
Change-Id: I1cafd17ad07ec769240de92ae4e67cb1357f0992
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2021-04-20 10:03:07 -04:00
parent 3628db6e77
commit d37490b814
2 changed files with 330 additions and 25 deletions

View File

@ -105,8 +105,9 @@ debug_lists = False
want_state_audit = False
want_vswitch = False
# number of notifier loops before the state is object dumped
DEBUG_AUDIT = 2
# Number of notifier loops between each audit.
# @ 30 sec interval audit rate is every 5 minutes
AUDIT_RATE = 10
# write a 'value' log on a resource sample change of more than this amount
LOG_STEP = 10
@ -207,6 +208,10 @@ ALARM_ID_LIST = [ALARM_ID__CPU,
ALARM_ID__VSWITCH_PORT,
ALARM_ID__VSWITCH_IFACE]
AUDIT_ALARM_ID_LIST = [ALARM_ID__CPU,
ALARM_ID__MEM,
ALARM_ID__DF]
# ADD_NEW_PLUGIN: add plugin name definition
# WARNING: This must line up exactly with the plugin
# filename without the extension.
@ -616,15 +621,17 @@ class fmAlarmObject:
# total notification count
self.count = 0
# Debug: state audit controls
self.audit_threshold = 0
self.audit_count = 0
# audit counters
self.alarm_audit_threshold = 0
self.state_audit_count = 0
# For plugins that have multiple instances like df (filesystem plugin)
# we need to create an instance of this object for each one.
# This dictionary is used to associate an instance with its object.
self.instance_objects = {}
self.fault = None
def _ilog(self, string):
"""Create a collectd notifier info log with the string param"""
collectd.info('%s %s : %s' % (PLUGIN, self.plugin, string))
@ -658,18 +665,18 @@ class fmAlarmObject:
if self.id == ALARM_ID__CPU:
_print_state()
self.audit_count += 1
self.state_audit_count += 1
if self.warnings:
collectd.info("%s AUDIT %d: %s warning list %s:%s" %
(PLUGIN,
self.audit_count,
self.state_audit_count,
self.plugin,
location,
self.warnings))
if self.failures:
collectd.info("%s AUDIT %d: %s failure list %s:%s" %
(PLUGIN,
self.audit_count,
self.state_audit_count,
self.plugin,
location,
self.failures))
@ -1461,7 +1468,7 @@ def _print_obj(obj):
collectd.info("%s %s %s - %s - %s\n" %
(PLUGIN, prefix, obj.resource_name, obj.plugin, obj.id))
collectd.info("%s %s fault obj: %s\n" % (PLUGIN, prefix, obj.fault))
collectd.info("%s %s entity id: %s\n" % (PLUGIN, prefix, obj.entity_id))
collectd.info("%s %s degrade_id: %s\n" % (PLUGIN, prefix, obj.degrade_id))
@ -1817,7 +1824,7 @@ def notifier_func(nObject):
if eid.split(base_eid)[1]:
want_alarm_clear = True
collectd.info('%s found %s %s alarm [%s]' %
collectd.info('%s alarm %s:%s:%s found at startup' %
(PLUGIN,
alarm.severity,
alarm_id,
@ -1825,8 +1832,9 @@ def notifier_func(nObject):
if want_alarm_clear is True:
if clear_alarm(alarm_id, eid) is False:
collectd.error("%s %s:%s clear failed" %
(PLUGIN,
collectd.error("%s alarm %s:%s:%s clear "
"failed" %
(PLUGIN, alarm.severity,
alarm_id,
eid))
continue
@ -1982,15 +1990,6 @@ def notifier_func(nObject):
# if obj.warnings or obj.failures:
# _print_state(obj)
# If want_state_audit is True then run the audit.
# Primarily used for debug
# default state is False
if want_state_audit:
obj.audit_threshold += 1
if obj.audit_threshold == DEBUG_AUDIT:
obj.audit_threshold = 0
obj._state_audit("audit")
# manage reading value change ; store last and log if gt obj.step
action = obj.manage_change(nObject)
if action == "done":
@ -2013,6 +2012,83 @@ def notifier_func(nObject):
if len(mtcDegradeObj.degrade_list):
mtcDegradeObj.remove_degrade_for_missing_filesystems()
obj.alarm_audit_threshold += 1
if obj.alarm_audit_threshold >= AUDIT_RATE:
if want_state_audit:
obj._state_audit("audit")
obj.alarm_audit_threshold = 0
#################################################################
#
# Audit Asserted Alarms
#
# Loop over the list of auditable alarm ids building two
# dictionaries, one containing warning (major) and the other
# failure (critical) with alarm info needed to detect and
# correct stale, missing or severity mismatched alarms for
# the listed alarm ids <100.xxx>.
#
# Note: Conversion in terminology from
# warning -> major and
# failures -> critical
# is done because fm speaks in terms of major and critical
# while the plugin speaks in terms of warning and failure.
#
major_alarm_dict = {}
critical_alarm_dict = {}
for alarm_id in AUDIT_ALARM_ID_LIST:
tmp_base_obj = get_base_object(alarm_id)
if tmp_base_obj is None:
collectd.error("%s audit %s base object lookup failed" %
(PLUGIN, alarm_id))
continue
# Build 2 dictionaries containing current alarmed info.
# Dictionary entries are indexed by entity id to fetch the
# alarm id and last fault object used to create the alarm
# for the mismatch and missing case handling.
#
# { eid : { alarm : <alarm id>, fault : <fault obj> }}, ... }
# major list for base object from warnings list
if tmp_base_obj.entity_id in tmp_base_obj.warnings:
info = {}
info[pc.AUDIT_INFO_ALARM] = alarm_id
info[pc.AUDIT_INFO_FAULT] = tmp_base_obj.fault
major_alarm_dict[tmp_base_obj.entity_id] = info
# major list for instance objects from warnings list
for _inst_obj in tmp_base_obj.instance_objects:
inst_obj = tmp_base_obj.instance_objects[_inst_obj]
if inst_obj.entity_id in tmp_base_obj.warnings:
info = {}
info[pc.AUDIT_INFO_ALARM] = alarm_id
info[pc.AUDIT_INFO_FAULT] = inst_obj.fault
major_alarm_dict[inst_obj.entity_id] = info
# critical list for base object from failures list
if tmp_base_obj.entity_id in tmp_base_obj.failures:
info = {}
info[pc.AUDIT_INFO_ALARM] = alarm_id
info[pc.AUDIT_INFO_FAULT] = tmp_base_obj.fault
critical_alarm_dict[tmp_base_obj.entity_id] = info
# critical list for instance objects from failures list
for _inst_obj in tmp_base_obj.instance_objects:
inst_obj = tmp_base_obj.instance_objects[_inst_obj]
if inst_obj.entity_id in tmp_base_obj.failures:
info = {}
info[pc.AUDIT_INFO_ALARM] = alarm_id
info[pc.AUDIT_INFO_FAULT] = inst_obj.fault
critical_alarm_dict[inst_obj.entity_id] = info
pluginObject.alarms_audit(api, AUDIT_ALARM_ID_LIST,
major_alarm_dict,
critical_alarm_dict)
# end alarms audit
#################################################################
# exit early if there is no alarm update to be made
if obj.debounce(base_obj,
obj.entity_id,
@ -2053,7 +2129,7 @@ def notifier_func(nObject):
reason = obj.reason_warning
# build the alarm object
fault = fm_api.Fault(
obj.fault = fm_api.Fault(
alarm_id=obj.id,
alarm_state=_alarm_state,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
@ -2067,7 +2143,7 @@ def notifier_func(nObject):
suppression=base_obj.suppression)
try:
alarm_uuid = api.set_fault(fault)
alarm_uuid = api.set_fault(obj.fault)
if pc.is_uuid_like(alarm_uuid) is False:
collectd.error("%s 'set_fault' failed ; %s:%s ; %s" %
(PLUGIN,

View File

@ -98,6 +98,9 @@ RESERVED_CPULIST_KEY = 'PLATFORM_CPU_LIST'
PLUGIN_PASS = 0
PLUGIN_FAIL = 1
AUDIT_INFO_ALARM = 'alarm'
AUDIT_INFO_FAULT = 'fault'
class PluginObject(object):
@ -162,8 +165,10 @@ class PluginObject(object):
def init_completed(self):
"""Declare plugin init complete"""
collectd.info("%s initialization completed" % self.plugin)
self.hostname = self.gethostname()
self.base_eid = 'host=' + self.hostname
collectd.info("%s %s initialization completed" %
(self.plugin, self.hostname))
self.init_complete = True
###########################################################################
@ -349,6 +354,230 @@ class PluginObject(object):
return True
#####################################################################
#
# Name : clear_alarm
#
# Description: Clear the specified alarm.
#
# Returns : True if operation succeeded
# False if there was an error exception.
#
# Assumptions: Caller can decide to retry based on return status.
#
#####################################################################
def clear_alarm(self, fm, alarm_id, eid):
    """Ask FM to clear the alarm identified by alarm_id and eid.

    Both a fresh clear and an 'already cleared' response from FM are
    treated as success ; only the log wording differs.

    :param fm        The Fault Manager's API Object
    :param alarm_id  The alarm identifier , ie 100.103
    :param eid       The entity identifier ; host=<hostname>.<instance>
    :returns True if the clear request completed without an exception,
             False if an exception was raised (the error is logged) so
             the caller can decide whether to retry.
    """
    try:
        # clear_fault returning True means the alarm was cleared by this
        # call ; anything else is logged as already cleared.
        if fm.clear_fault(alarm_id, eid) is True:
            log_format = "%s %s:%s alarm cleared"
        else:
            log_format = "%s %s:%s alarm already cleared"
        collectd.info(log_format % (self.plugin, alarm_id, eid))
        result = True

    except Exception as ex:
        collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" %
                       (self.plugin, alarm_id, eid, ex))
        result = False

    return result
#########################################################################
#
# Name : __missing_or_mismatch_alarm_handler
#
# Purpose: Find and correct missing or mismatch alarms
#
# Scope: Private
#
#########################################################################
def __missing_or_mismatch_alarm_handler(self,
                                        fm,
                                        alarms,
                                        alarm_id,
                                        severity,
                                        sev_alarm_dict):
    """Find and correct missing or severity mismatched alarms.

    Every entry of sev_alarm_dict that belongs to alarm_id is compared
    against the database alarms. An entry with no database alarm for its
    entity id is 'missing' ; an entry whose database alarm has a different
    severity is a 'mismatch'. In both cases the stored fault object, when
    there is one, is re-sent to FM with set_fault to refresh the alarm.

    A set_fault exception is logged and the audit moves on to the next
    entry ; the correction is attempted again on a later audit as long as
    the error condition persists.

    :param fm              The Fault Manager's API Object
    :param alarms          List of database alarms for alarm id and this
                           host ; may be empty or None
    :param alarm_id        The alarm id in context
    :param severity        Specifies the severity level of sev_alarm_dict
    :param sev_alarm_dict  An alarm dictionary for either (not both) major
                           or critical alarms, keyed by entity id with an
                           info dictionary (AUDIT_INFO_ALARM and
                           AUDIT_INFO_FAULT) as value
    """
    plugin_prefix = self.plugin + ' audit'

    for eid, info in sev_alarm_dict.items():

        # only service the entries that belong to the alarm id in context
        if info.get(AUDIT_INFO_ALARM) != alarm_id:
            continue

        # first database alarm raised against this entity, if any
        db_alarm = next((alarm for alarm in alarms or ()
                         if alarm.entity_instance_id == eid), None)

        if db_alarm is not None and db_alarm.severity == severity:
            collectd.info("%s alarm %s:%s:%s is correct" %
                          (plugin_prefix, severity, alarm_id, eid))
            continue

        error_case = "missing" if db_alarm is None else "mismatch"

        fault = info.get(AUDIT_INFO_FAULT)
        if not fault:
            # nothing to refresh the alarm with ; just report the error
            collectd.info("%s alarm %s:%s:%s %s" %
                          (plugin_prefix,
                           severity, alarm_id, eid, error_case))
            continue

        collectd.info("%s alarm %s:%s:%s %s ; refreshing" %
                      (plugin_prefix,
                       severity, alarm_id, eid, error_case))
        try:
            fm.set_fault(fault)
        except Exception as ex:
            # Do not let an FM API failure abort the rest of the audit.
            collectd.error("%s 'set_fault' exception ; %s:%s ; %s" %
                           (plugin_prefix, alarm_id, eid, ex))
#########################################################################
#
# Name: alarms_audit
#
# Purpose: Ensure the alarm state in the FM database matches the plugin
#
# Description: Query FM for the specified alarm id list. Handle missing,
# stale or severity mismatched alarms.
#
# Algorithm : Each alarm id is queried and the response is filtered by
# current host. The plugin's running state takes precedence.
# This audit will only ever raise, modify or clear alarms in
# the database, never change the alarm state of the plugin.
#
# - clear any asserted alarms that have a clear state
# in the plugin.
# - raise an alarm that is cleared in fm but asserted
# in the plugin.
# - correct alarm severity in fm database to align with
# the plugin.
#
# Assumptions: The severity dictionary arguments (major and critical)
# are used to detect severity mismatches and support alarm
# ids with varying entity ids.
#
# The dictionaries are keyed by entity id ('eid') ; each value
# is an info dictionary holding
# - the alarm id as AUDIT_INFO_ALARM
# - the last fault object as AUDIT_INFO_FAULT
#
# No need to check for fm api call success and retry on
# failure. Stale alarm clear will be retried on next audit.
#
#########################################################################
def alarms_audit(self,
                 fm,
                 audit_alarm_id_list,
                 major_alarm_dict,
                 critical_alarm_dict):
    """Reconcile the fm database with this plugin's asserted alarms.

    For each alarm id the database alarms are queried and filtered down
    to those whose entity id starts with this host's base entity id.
    Database alarms whose entity id is in neither severity dictionary are
    cleared as stale ; the remaining ones are then checked against each
    severity dictionary for missing or severity mismatched alarms.

    A failed query for one alarm id is logged and that alarm id is
    skipped for this audit.

    :param fm                   The Fault Manager's API Object
    :param audit_alarm_id_list  A list of alarm ids to query
    :param major_alarm_dict     Major alarms asserted by the plugin,
                                keyed by entity id
    :param critical_alarm_dict  Critical alarms asserted by the plugin,
                                keyed by entity id
    """
    if not audit_alarm_id_list:
        return

    plugin_prefix = self.plugin + ' audit'

    if major_alarm_dict:
        collectd.debug("%s major_alarm_dict: %s" %
                       (plugin_prefix, major_alarm_dict))

    if critical_alarm_dict:
        collectd.debug("%s critical_alarm_dict: %s" %
                       (plugin_prefix, critical_alarm_dict))

    # severity name paired with the plugin's alarms of that severity,
    # listed in the order they are audited
    severity_dicts = (('major', major_alarm_dict),
                      ('critical', critical_alarm_dict))

    for alarm_id in audit_alarm_id_list:
        collectd.debug("%s searching for all '%s' alarms" %
                       (plugin_prefix, alarm_id))
        try:
            database_alarms = fm.get_faults_by_id(alarm_id)
            if database_alarms is None:
                database_alarms = []

            # database alarms might contain same alarm id for other
            # hosts and needs to be filtered
            alarms = []
            for alarm in database_alarms:
                if alarm.entity_instance_id.split('.')[0] != self.base_eid:
                    continue
                collectd.debug("%s alarm %s:%s:%s in fm" %
                               (plugin_prefix,
                                alarm.severity, alarm_id,
                                alarm.entity_instance_id))
                alarms.append(alarm)

        except Exception as ex:
            collectd.error("%s get_faults_by_id %s failed "
                           "with exception ; %s" %
                           (plugin_prefix, alarm_id, ex))
            continue

        # Stale database alarms handling case
        if alarms:
            still_asserted = []
            for alarm in alarms:
                entity = alarm.entity_instance_id
                if entity in major_alarm_dict or \
                        entity in critical_alarm_dict:
                    # the plugin still has this entity alarmed ; keep it
                    # for the missing / mismatch checks below
                    still_asserted.append(alarm)
                    continue

                collectd.info("%s alarm %s:%s:%s is stale ; clearing" %
                              (plugin_prefix,
                               alarm.severity, alarm_id, entity))

                # clear stale alarm ; result is not checked because a
                # failed clear is seen as stale again on the next audit
                self.clear_alarm(fm, alarm_id, entity)

            alarms = still_asserted
        else:
            collectd.debug("%s database has no %s alarms" %
                           (plugin_prefix, alarm_id))

        # For each severity the plugin has alarms for, check for
        # missing or mismatch state in fm database
        for severity, sev_alarm_dict in severity_dicts:
            if sev_alarm_dict:
                self.__missing_or_mismatch_alarm_handler(fm,
                                                         alarms,
                                                         alarm_id,
                                                         severity,
                                                         sev_alarm_dict)
###########################################################################
#
# Name : make_http_request