From d37490b81408ca53b1b8fd61992c6c9337dbcaed Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Tue, 20 Apr 2021 10:03:07 -0400 Subject: [PATCH] Add alarm audit to starlingx collectd fm notifier plugin This update adds common plugin support for alarm state auditing. The audit is able to detect and correct the following alarm state errors: Error Case Correction Action ----------------------- ----------------- - stale alarm ; delete alarm - missing alarm ; assert alarm - alarm severity mismatch ; refresh alarm The common audit is enabled for the fm_notifier plugin that supports alarm management for the following resources: - CPU with alarm id 100.101 - Memory with alarm id 100.103 - Filesystem with alarm id 100.104 Other plugins may use this common audit in the future but only the above resources have the audit enabled for them by this update. Test Plan: PASS: Verify stale alarm detection/correction handling PASS: Verify missing alarm detection/correction handling PASS: Verify alarm severity mismatch detection/correction handling PASS: Verify host only audits its own specified alarms PASS: Verify success path of monitoring a single and mix of base and instance alarms of varying severity while such alarm conditions come and go PASS: Verify alarm audit of mix of base and instance alarms over a collectd process restart PASS: Verify audit handling of alarm that migrates from major to critical to major to clear PASS: Verify audit handling transition between alarm and no alarm conditions PASS: Verify soak of random cpu, memory and filesystem overage alarm assertions and clears that also involve manual alarm deletions, assertions and severity changes that exercise new audit features Regression: PASS: Verify alarm and audit handling over Swact with mounted filesystem that has active alarm PASS: Verify collectd logs following a system install and while alarms are managed during above soak PASS: Verify behavior while FM is killed or stopped/started PASS: Verify Standard
system install with Sanity and Regression PASS: Verify AIO DX/DC systems install with Sanity and Regression Closes-Bug: 1925210 Change-Id: I1cafd17ad07ec769240de92ae4e67cb1357f0992 Signed-off-by: Eric MacDonald --- collectd-extensions/src/fm_notifier.py | 122 +++++++++--- collectd-extensions/src/plugin_common.py | 233 ++++++++++++++++++++++- 2 files changed, 330 insertions(+), 25 deletions(-) diff --git a/collectd-extensions/src/fm_notifier.py b/collectd-extensions/src/fm_notifier.py index 8948c24..b4a3776 100755 --- a/collectd-extensions/src/fm_notifier.py +++ b/collectd-extensions/src/fm_notifier.py @@ -105,8 +105,9 @@ debug_lists = False want_state_audit = False want_vswitch = False -# number of notifier loops before the state is object dumped -DEBUG_AUDIT = 2 +# Number of notifier loop between each audit. +# @ 30 sec interval audit rate is every 5 minutes +AUDIT_RATE = 10 # write a 'value' log on a the resource sample change of more than this amount LOG_STEP = 10 @@ -207,6 +208,10 @@ ALARM_ID_LIST = [ALARM_ID__CPU, ALARM_ID__VSWITCH_PORT, ALARM_ID__VSWITCH_IFACE] +AUDIT_ALARM_ID_LIST = [ALARM_ID__CPU, + ALARM_ID__MEM, + ALARM_ID__DF] + # ADD_NEW_PLUGIN: add plugin name definition # WARNING: This must line up exactly with the plugin # filename without the extension. @@ -616,15 +621,17 @@ class fmAlarmObject: # total notification count self.count = 0 - # Debug: state audit controls - self.audit_threshold = 0 - self.audit_count = 0 + # audit counters + self.alarm_audit_threshold = 0 + self.state_audit_count = 0 # For plugins that have multiple instances like df (filesystem plugin) # we need to create an instance of this object for each one. # This dictionary is used to associate an instance with its object. 
self.instance_objects = {} + self.fault = None + def _ilog(self, string): """Create a collectd notifier info log with the string param""" collectd.info('%s %s : %s' % (PLUGIN, self.plugin, string)) @@ -658,18 +665,18 @@ class fmAlarmObject: if self.id == ALARM_ID__CPU: _print_state() - self.audit_count += 1 + self.state_audit_count += 1 if self.warnings: collectd.info("%s AUDIT %d: %s warning list %s:%s" % (PLUGIN, - self.audit_count, + self.state_audit_count, self.plugin, location, self.warnings)) if self.failures: collectd.info("%s AUDIT %d: %s failure list %s:%s" % (PLUGIN, - self.audit_count, + self.state_audit_count, self.plugin, location, self.failures)) @@ -1461,7 +1468,7 @@ def _print_obj(obj): collectd.info("%s %s %s - %s - %s\n" % (PLUGIN, prefix, obj.resource_name, obj.plugin, obj.id)) - + collectd.info("%s %s fault obj: %s\n" % (PLUGIN, prefix, obj.fault)) collectd.info("%s %s entity id: %s\n" % (PLUGIN, prefix, obj.entity_id)) collectd.info("%s %s degrade_id: %s\n" % (PLUGIN, prefix, obj.degrade_id)) @@ -1817,7 +1824,7 @@ def notifier_func(nObject): if eid.split(base_eid)[1]: want_alarm_clear = True - collectd.info('%s found %s %s alarm [%s]' % + collectd.info('%s alarm %s:%s:%s found at startup' % (PLUGIN, alarm.severity, alarm_id, @@ -1825,8 +1832,9 @@ def notifier_func(nObject): if want_alarm_clear is True: if clear_alarm(alarm_id, eid) is False: - collectd.error("%s %s:%s clear failed" % - (PLUGIN, + collectd.error("%s alarm %s:%s:%s clear " + "failed" % + (PLUGIN, alarm.severity, alarm_id, eid)) continue @@ -1982,15 +1990,6 @@ def notifier_func(nObject): # if obj.warnings or obj.failures: # _print_state(obj) - # If want_state_audit is True then run the audit. 
- # Primarily used for debug - # default state is False - if want_state_audit: - obj.audit_threshold += 1 - if obj.audit_threshold == DEBUG_AUDIT: - obj.audit_threshold = 0 - obj._state_audit("audit") - # manage reading value change ; store last and log if gt obj.step action = obj.manage_change(nObject) if action == "done": @@ -2013,6 +2012,83 @@ def notifier_func(nObject): if len(mtcDegradeObj.degrade_list): mtcDegradeObj.remove_degrade_for_missing_filesystems() + obj.alarm_audit_threshold += 1 + if obj.alarm_audit_threshold >= AUDIT_RATE: + if want_state_audit: + obj._state_audit("audit") + obj.alarm_audit_threshold = 0 + + ################################################################# + # + # Audit Asserted Alarms + # + # Loop over the list of auditable alarm ids building two + # dictionaries, one containing warning (major) and the other + # failure (critical) with alarm info needed to detect and + # correct stale, missing or severity mismatched alarms for + # the listed alarm ids <100.xxx>. + # + # Note: Conversion in terminology from + # warning -> major and + # failures -> critical + # is done because fm speaks in terms of major and critical + # while the plugin speaks in terms of warning and failure. + # + major_alarm_dict = {} + critical_alarm_dict = {} + for alarm_id in AUDIT_ALARM_ID_LIST: + + tmp_base_obj = get_base_object(alarm_id) + if tmp_base_obj is None: + collectd.error("%s audit %s base object lookup failed" % + (PLUGIN, alarm_id)) + continue + + # Build 2 dictionaries containing current alarmed info. + # Dictionary entries are indexed by entity id to fetch the + # alarm id and last fault object used to create the alarm + # for the mismatch and missing case handling. + # + # { eid : { alarm : , fault : }}, ... 
} + + # major list for base object from warnings list + if tmp_base_obj.entity_id in tmp_base_obj.warnings: + info = {} + info[pc.AUDIT_INFO_ALARM] = alarm_id + info[pc.AUDIT_INFO_FAULT] = tmp_base_obj.fault + major_alarm_dict[tmp_base_obj.entity_id] = info + + # major list for instance objects from warnings list + for _inst_obj in tmp_base_obj.instance_objects: + inst_obj = tmp_base_obj.instance_objects[_inst_obj] + if inst_obj.entity_id in tmp_base_obj.warnings: + info = {} + info[pc.AUDIT_INFO_ALARM] = alarm_id + info[pc.AUDIT_INFO_FAULT] = inst_obj.fault + major_alarm_dict[inst_obj.entity_id] = info + + # critical list for base object from failures list + if tmp_base_obj.entity_id in tmp_base_obj.failures: + info = {} + info[pc.AUDIT_INFO_ALARM] = alarm_id + info[pc.AUDIT_INFO_FAULT] = tmp_base_obj.fault + critical_alarm_dict[tmp_base_obj.entity_id] = info + + # critical list for instance objects from failures list + for _inst_obj in tmp_base_obj.instance_objects: + inst_obj = tmp_base_obj.instance_objects[_inst_obj] + if inst_obj.entity_id in tmp_base_obj.failures: + info = {} + info[pc.AUDIT_INFO_ALARM] = alarm_id + info[pc.AUDIT_INFO_FAULT] = inst_obj.fault + critical_alarm_dict[inst_obj.entity_id] = info + + pluginObject.alarms_audit(api, AUDIT_ALARM_ID_LIST, + major_alarm_dict, + critical_alarm_dict) + # end alarms audit + ################################################################# + # exit early if there is no alarm update to be made if obj.debounce(base_obj, obj.entity_id, @@ -2053,7 +2129,7 @@ def notifier_func(nObject): reason = obj.reason_warning # build the alarm object - fault = fm_api.Fault( + obj.fault = fm_api.Fault( alarm_id=obj.id, alarm_state=_alarm_state, entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, @@ -2067,7 +2143,7 @@ def notifier_func(nObject): suppression=base_obj.suppression) try: - alarm_uuid = api.set_fault(fault) + alarm_uuid = api.set_fault(obj.fault) if pc.is_uuid_like(alarm_uuid) is False: collectd.error("%s 'set_fault' 
failed ; %s:%s ; %s" % (PLUGIN, diff --git a/collectd-extensions/src/plugin_common.py b/collectd-extensions/src/plugin_common.py index fdb3568..5242f3a 100644 --- a/collectd-extensions/src/plugin_common.py +++ b/collectd-extensions/src/plugin_common.py @@ -98,6 +98,9 @@ RESERVED_CPULIST_KEY = 'PLATFORM_CPU_LIST' PLUGIN_PASS = 0 PLUGIN_FAIL = 1 +AUDIT_INFO_ALARM = 'alarm' +AUDIT_INFO_FAULT = 'fault' + class PluginObject(object): @@ -162,8 +165,10 @@ class PluginObject(object): def init_completed(self): """Declare plugin init complete""" - - collectd.info("%s initialization completed" % self.plugin) + self.hostname = self.gethostname() + self.base_eid = 'host=' + self.hostname + collectd.info("%s %s initialization completed" % + (self.plugin, self.hostname)) self.init_complete = True ########################################################################### @@ -349,6 +354,230 @@ class PluginObject(object): return True + ##################################################################### + # + # Name : clear_alarm + # + # Description: Clear the specified alarm. + # + # Returns : True if operation succeeded + # False if there was an error exception. + # + # Assumptions: Caller can decide to retry based on return status. + # + ##################################################################### + def clear_alarm(self, fm, alarm_id, eid): + """Clear the specified alarm:eid + + :param fm The Fault Manager's API Object + :param alarm_id The alarm identifier , ie 100.103 + :param eid The entity identifier ; host=. 
+ """ + + try: + if fm.clear_fault(alarm_id, eid) is True: + collectd.info("%s %s:%s alarm cleared" % + (self.plugin, alarm_id, eid)) + else: + collectd.info("%s %s:%s alarm already cleared" % + (self.plugin, alarm_id, eid)) + return True + + except Exception as ex: + collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % + (self.plugin, alarm_id, eid, ex)) + return False + + ######################################################################### + # + # Name : __missing_or_mismatch_alarm_handler + # + # Purpose: Find and correct missing or mismatch alarms + # + # Scope: Private + # + ######################################################################### + def __missing_or_mismatch_alarm_handler(self, + fm, + alarms, + alarm_id, + severity, + sev_alarm_dict): + """Find and correct missing or mismatch alarms + + :param fm The Fault Manager's API Object + :param alarms List of database alarms for alarm id and this host + :param alarm_id The alarm id in context + :param severity Specifies the severity level of sev_alarm_dict + :param sev_alarm_dict An alarm dictionary for either (not both) major + or critical alarms + """ + plugin_prefix = self.plugin + ' audit' + for eid in sev_alarm_dict: + found = False + if alarm_id == sev_alarm_dict[eid].get(AUDIT_INFO_ALARM): + error_case = "missing" + if alarms: + for alarm in alarms: + if alarm.entity_instance_id == eid: + if alarm.severity == severity: + collectd.info("%s alarm %s:%s:%s is correct" % + (plugin_prefix, severity, + alarm_id, eid)) + found = True + else: + error_case = "mismatch" + break + + if found is False: + + fault = sev_alarm_dict[eid].get(AUDIT_INFO_FAULT) + if fault: + collectd.info("%s alarm %s:%s:%s %s ; refreshing" % + (plugin_prefix, + severity, alarm_id, eid, error_case)) + fm.set_fault(fault) + else: + collectd.info("%s alarm %s:%s:%s %s" % + (plugin_prefix, + severity, alarm_id, eid, error_case)) + + ######################################################################### + # + # Name: 
alarms_audit + # + # Purpose: Ensure the alarm state in the FM database matches the plugin + # + # Description: Query FM for the specified alarm id list. Handle missing, + # stale or severity mismatched alarms. + # + # Algorithm : Each alarm id is queried and the response is filtered by + # current host. The plugin's running state takes precedence. + # This audit will only ever raise, modify or clear alarms in + # the database, never change the alarm state of the plugin. + # + # - clear any asserted alarms that have a clear state + # in the plugin. + # - raise an alarm that is cleared in fm but asserted + # in the plugin. + # - correct alarm severity in fm database to align with + # the plugin. + # + # Assumptions: The severity dictionary arguments (major and critical) + # are used to detect severity mismatches and support alarm + # ids with varying entity ids. + # + # The dictionaries are a list of key value pairs ; aid:eid + # - alarm id as 'aid' + # - entity_id as 'eid' + # + # No need to check for fm api call success and retry on + # failure. Stale alarm clear will be retried on next audit. 
+ # + ######################################################################### + def alarms_audit(self, + fm, + audit_alarm_id_list, + major_alarm_dict, + critical_alarm_dict): + """Audit the fm database for this plugin's alarms state + + :param fm The Fault Manager's API Object + :param audit_alarm_id_list A list of alarm ids to query + :param major_alarm_dict A dictionary of major alarms by aid:eid + :param critical_alarm_dict A dictionary of critical alarms by aid:eid + """ + + if len(audit_alarm_id_list) == 0: + return + + plugin_prefix = self.plugin + ' audit' + + if len(major_alarm_dict): + collectd.debug("%s major_alarm_dict: %s" % + (plugin_prefix, major_alarm_dict)) + + if len(critical_alarm_dict): + collectd.debug("%s critical_alarm_dict: %s" % + (plugin_prefix, critical_alarm_dict)) + + for alarm_id in audit_alarm_id_list: + collectd.debug("%s searching for all '%s' alarms" % + (plugin_prefix, alarm_id)) + try: + database_alarms = [] + tmp = fm.get_faults_by_id(alarm_id) + if tmp is not None: + database_alarms = tmp + + # database alarms might contain same alarm id for other + # hosts and needs to be filtered + alarms = [] + for alarm in database_alarms: + base_eid = alarm.entity_instance_id.split('.')[0] + if self.base_eid == base_eid: + collectd.debug("%s alarm %s:%s:%s in fm" % + (plugin_prefix, + alarm.severity, alarm_id, + alarm.entity_instance_id)) + alarms.append(alarm) + + except Exception as ex: + collectd.error("%s get_faults_by_id %s failed " + "with exception ; %s" % + (plugin_prefix, alarm_id, ex)) + continue + + # Service database alarms case + + # Stale database alarms handling case + remove_alarms_list = [] + if alarms: + for alarm in alarms: + found = False + for eid in major_alarm_dict: + if alarm.entity_instance_id == eid: + found = True + break + if found is False: + for eid in critical_alarm_dict: + if alarm.entity_instance_id == eid: + found = True + break + + if found is False: + collectd.info("%s alarm %s:%s:%s is stale ; 
clearing" % + (plugin_prefix, + alarm.severity, alarm_id, + alarm.entity_instance_id)) + + # clear stale alarm. + self.clear_alarm(fm, alarm_id, + alarm.entity_instance_id) + remove_alarms_list.append(alarm) + for alarm in remove_alarms_list: + alarms.remove(alarm) + else: + collectd.debug("%s database has no %s alarms" % + (plugin_prefix, alarm_id)) + + # If major alarms exist then check for + # missing or mismatch state in fm database + if len(major_alarm_dict): + self.__missing_or_mismatch_alarm_handler(fm, + alarms, + alarm_id, + 'major', + major_alarm_dict) + # If critical alarms exist then check for + # missing or mismatch state in fm database. + if len(critical_alarm_dict): + self.__missing_or_mismatch_alarm_handler(fm, + alarms, + alarm_id, + 'critical', + critical_alarm_dict) + ########################################################################### # # Name : make_http_request