Fix memory instance handling over collectd process restart

With a critical memory alarm raised, the collectd plugin fault
notifier's degrade list is injected with the reporting plugin's
name over a collectd process restart.

The recent introduction of multiple instance based memory alarms
has exposed a limitation in the management and content of the
degrade list that can lead to both stuck degrade (this case)
as well as missing degrade due to the lack of uniqueness of the
content injected into the degrade list based on degradable events.

This update modifies the content of the degrade list to ensure
all entries are unique by using an alarm's entity id rather than
the more generic plugin name.
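The uniqueness fix can be sketched as follows; the hostname and helper name here are illustrative stand-ins, not the plugin's actual symbols:

```python
# Sketch: degrade-list entries keyed by a unique degrade id rather
# than the bare plugin name. Two instance alarms from one plugin
# collapse into a single ambiguous entry under the old scheme but
# remain distinct under the new one.
hostname = "controller-0"  # stands in for os.uname()[1]

def make_degrade_id(plugin, instance=None):
    """Build a unique degrade-list entry for a plugin/instance alarm."""
    did = plugin + ":host=" + hostname
    if instance:
        did += "." + plugin + "=" + instance
    return did

old_style = {"memory", "memory"}                     # plugin name only
new_style = {make_degrade_id("memory"),
             make_degrade_id("memory", "platform")}  # unique per instance
```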

An additional issue was identified with respect to how filesystem
usage overage alarms are managed, due to recent additions to the
list of monitored filesystems. Filesystem overage alarms are also
degrade list candidates, so the aforementioned degrade list
change needed to account for filesystems as well.

One recently added monitored filesystem name conflicted with
how filesystem instances were tracked, which led to a bouncing
alarm if that filesystem experienced overage. Given that there
was already special case handling for the root fs, rather
than add an additional special case to remedy this issue,
the method of mapping filesystem instance to mountpoint was
changed from a list to a dictionary. With that change there
is no longer a limitation or special case handling required for
filesystem mountpoints that conflicted with how the stock
collectd plugin reports filesystem instances.
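The conflict can be illustrated with a short sketch; the naive '-' to '/' substitution the old list relied on is ambiguous for mountpoints that contain a literal '-' (mapping subset below is illustrative):

```python
# collectd's df plugin mangles "/var/lib/docker-distribution" to
# "var-lib-docker-distribution"; replacing every '-' with '/' then
# yields the wrong path, while a dictionary lookup does not.
DF_MAP = {
    'var-log': '/var/log',
    'var-lib-docker-distribution': '/var/lib/docker-distribution',
}

inst = 'var-lib-docker-distribution'
naive = '/' + inst.replace('-', '/')   # wrong: splits the literal '-'
mapped = DF_MAP[inst]                  # correct mountpoint
```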

Test Plan:

PASS: Verify existing alarm and degrade management of
      instance and non-instance based alarms at both normal
      runtime as well as over a collectd process restart.

PASS: Verify handling of non-instance based alarm(s)
      over process restart when the alarm condition
      no longer exists following the process restart.

PASS: Verify degrade list management and content.

PASS: Verify filesystem instance to mountpoint mapping.

PASS: Verify data model content using state audit and
      list management with debug options turned on.

PASS: Verify alarm and degrade handling of a filesystem
      overage that follows the active controller.

PASS: Verify update as a patch.

Regression:

PASS: Verify alarm and degrade handling of 'all' collectd
      plugins including over collectd process restarts.

PASS: Verify alarm and degrade management stress soak
      that involved multiple plugins asserting/clearing
      multiple alarm and degradable conditions over a
      24 hour period.

Change-Id: I5ea389fb092a6404616d7ea0e8d54daa64ad7ea2
Closes-Bug: 1903731
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Eric MacDonald 2020-11-16 12:07:16 -05:00
parent ee7ae99d41
commit c6cab97ee0
1 changed file with 169 additions and 145 deletions


@ -154,32 +154,41 @@ FAIL = 1
# ... with configuration override
MTCE_CMD_RX_PORT = 2101
# Some plugin_instances are mangled by collectd.
# The filesystem plugin is especially bad for this.
# Filesystem plugin_instances are mangled by collectd.
# For instance the "/var/log" MountPoint instance is
# reported as "var-log".
# The following is a list of mangled instances list
# that need the '-' replaced with '/'.
#
# The following is a dictionary that provides mapping between the
# stock df plugin instance name and the linux filesystem path where the
# key = mangled filesystem instance from stock df plugin
# val = actual filesystem mountpoint path
#
# ADD_NEW_PLUGIN: if there are new file systems being added that
# have subdirectories in the name then they will need to be added
# to the mangled dictionary below.
mangled_list = {"dev-shm",
"var-log",
"var-run",
"var-lock",
"var-lib-rabbitmq",
"var-lib-postgresql",
"var-lib-ceph-mon",
"var-lib-docker",
"var-lib-docker-distribution"
"var-lib-kubelet",
"var-lib-nova-instances",
"opt-platform",
"opt-cgcs",
"opt-etcd",
"opt-extension",
"opt-backups"}
DF_MANGLED_DICT = {
# instance : path
'root': '/',
'dev': '/dev',
'tmp': '/tmp',
'boot': '/boot',
'scratch': '/scratch',
'dev-shm': '/dev/shm',
'var-log': '/var/log',
'var-run': '/var/run',
'var-lock': '/var/lock',
'var-lib-rabbitmq': '/var/lib/rabbitmq',
'var-lib-postgresql': '/var/lib/postgresql',
'var-lib-ceph-mon': '/var/lib/ceph/mon',
'var-lib-docker': '/var/lib/docker',
'var-lib-docker-distribution': '/var/lib/docker-distribution',
'var-lib-kubelet': '/var/lib/kubelet',
'var-lib-nova-instances': '/var/lib/nova/instances',
'opt-platform': '/opt/platform',
'opt-etcd': '/opt/etcd',
'opt-extension': '/opt/extension',
'opt-backups': '/opt/backups'}
# ADD_NEW_PLUGIN: add new alarm id definition
ALARM_ID__CPU = "100.101"
@ -330,8 +339,8 @@ class DegradeObject:
resources = ""
if self.degrade_list:
# loop over the list,
# limit the degraded resource list being sent to mtce to 5
for r in self.degrade_list[0:1:5]:
# limit the degraded resource list being sent to mtce to 2
for r in self.degrade_list[0:2]:
resources += r + ','
resources = resources[:-1]
state = "assert"
@ -350,13 +359,9 @@ class DegradeObject:
# Clear the message throttle counter
self.msg_throttle = 0
# if the degrade state has changed then log it and proceed
if self.last_state != state:
if self.last_state != "undef":
collectd.info("%s degrade %s %s" %
(PLUGIN_DEGRADE,
state,
self.degrade_list))
if self.degrade_list:
collectd.info("%s degrade list: %s" %
(PLUGIN_DEGRADE, self.degrade_list))
# Save state for next time
self.last_state = state
@ -405,25 +410,6 @@ class DegradeObject:
self.addr = None
self.protocol = socket.AF_INET
##########################################################################
#
# Name : _df_instance_to_path
#
# Purpose : Convert filesystem instance to path
#
# Returns : Created path
#
##########################################################################
def _df_instance_to_path(self, df_inst):
"""Convert a df instance name to a mountpoint"""
# df_root is not a dynamic file system. Ignore that one.
if df_inst == 'df_root':
return '/'
else:
# For all others replace all '-' with '/'
return('/' + df_inst[3:].replace('-', '/'))
##########################################################################
#
# Name : remove_degrade_for_missing_filesystems
@ -441,10 +427,10 @@ class DegradeObject:
for df_inst in self.degrade_list:
# Only file system plugins are looked at.
# File system plugin instance names are prefixed with 'df_'
# File system plugin instance names are prefixed with 'df:'
# as the first 3 chars in the instance name.
if df_inst[0:3] == 'df_':
path = self._df_instance_to_path(df_inst)
if df_inst[0:3] == 'df:':
path = df_inst.split('filesystem=')[1]
# check the mount point.
# if the mount point no longer exists then remove
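The new 'df:' prefixed degrade ids carry the mountpoint inline, so the path for the ismount check can be recovered by splitting on 'filesystem=' (sketch with an illustrative entry):

```python
df_inst = 'df:host=controller-0.filesystem=/var/log'

is_df = df_inst[0:3] == 'df:'           # df entries are prefixed 'df:'
path = df_inst.split('filesystem=')[1]  # mountpoint to verify with ismount
```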
@ -474,11 +460,21 @@ class DegradeObject:
remove = False
add = False
# Create the resource name from the notifier object.
# format: <plugin name>_<plugin_instance_name>
resource = nObject.plugin
if nObject.plugin_instance:
resource += "_" + nObject.plugin_instance
# Create the degrade id from the notifier object.
# Format: <plugin name>:host=<hostname>.<plugin_instance_name>
resource = nObject.plugin + ':' + 'host=' + os.uname()[1]
if nObject.plugin == PLUGIN__DF:
df_inst = DF_MANGLED_DICT.get(nObject.plugin_instance)
if df_inst:
resource += ".filesystem="
resource += df_inst
else:
collectd.error("%s df instance '%s' lookup failed; ignoring" %
(PLUGIN_DEGRADE, nObject.plugin_instance))
return
elif nObject.plugin_instance:
resource += '.' + nObject.plugin + '=' + nObject.plugin_instance
# This block looks at the current notification severity
# and manages the degrade_list.
@ -501,7 +497,7 @@ class DegradeObject:
# mountpoint is gone.
add = True
if nObject.plugin == PLUGIN__DF:
path = self._df_instance_to_path(resource)
path = DF_MANGLED_DICT.get(nObject.plugin_instance)
add = os.path.ismount(path)
else:
@ -521,7 +517,7 @@ class DegradeObject:
# mountpoint is gone.
add = True
if nObject.plugin == PLUGIN__DF:
path = self._df_instance_to_path(resource)
path = DF_MANGLED_DICT.get(nObject.plugin_instance)
add = os.path.ismount(path)
elif resource in self.degrade_list:
@ -537,11 +533,11 @@ class DegradeObject:
if remove is True:
self.degrade_list.remove(resource)
collectd.info("%s %s removed from degrade list" %
collectd.info("%s '%s' removed from degrade list" %
(PLUGIN_DEGRADE, resource))
elif add is True:
self.degrade_list.append(resource)
collectd.info("%s %s added to degrade list" %
collectd.info("%s '%s' added to degrade list" %
(PLUGIN_DEGRADE, resource))
@ -573,6 +569,20 @@ class fmAlarmObject:
self.resource_name = "" # The top level name of the resource
self.instance_name = "" # The instance name
# Unique identifier used in the degrade list to represent
# this alarm object.
#
# Base Object:
#
# Format : PLUGIN:host=<hostname>
# Example: memory:host=controller-0
#
# Instance Object:
#
# Format : <Base Object>.<instance>
# Example: memory:host=controller-0.memory=platform
self.degrade_id = plugin + ':' + 'host=' + os.uname()[1]
# Instance specific learned static class members.
self.entity_id = "" # fm entity id host=<hostname>.<instance>
self.instance = "" # <plugin>_<instance>
@ -778,12 +788,11 @@ class fmAlarmObject:
# setup resource name for filesystem instance usage log
if self.plugin == PLUGIN__DF:
resource = self.instance
resource = self.instance_name
elif self.plugin == PLUGIN__MEM:
if self.instance_name:
if self.instance_name != 'platform':
resource += ' ' + self.instance_name
resource = self.instance_name
# setup resource name for vswitch process instance name
elif self.plugin == PLUGIN__VSWITCH_MEM:
@ -944,7 +953,7 @@ class fmAlarmObject:
collectd.info("%s %s %s debounce '%s -> %s' (%2.2f) (%d:%d) %s" % (
PLUGIN,
base_obj.resource_name,
self.instance,
entity_id,
current_severity_str,
severity,
this_value,
@ -1121,6 +1130,8 @@ class fmAlarmObject:
:param eid: the index for the instance object dictionary
:return: object or None
"""
if eid is None:
return None
try:
collectd.debug("%s %s Get Lock ..." % (PLUGIN, self.plugin))
@ -1167,7 +1178,6 @@ class fmAlarmObject:
"""Copy select members of self object to target object"""
object.resource_name = self.resource_name
object.instance_name = self.instance_name
object.reading_type = self.reading_type
object.reason_warning = self.reason_warning
@ -1196,9 +1206,8 @@ class fmAlarmObject:
# initialize the object with instance specific data
inst_obj.instance_name = instance
inst_obj.entity_id = _build_entity_id(self.plugin,
instance)
inst_obj.degrade_id += '.' + self.plugin + '=' + instance
inst_obj.entity_id = _build_entity_id(self.plugin, instance)
self._add_instance_object(inst_obj, inst_obj.entity_id)
collectd.debug("%s created %s instance (%s) object %s" %
@ -1214,7 +1223,7 @@ class fmAlarmObject:
return inst_obj
except:
collectd.error("%s %s:%s inst object create failed" %
collectd.error("%s %s:%s inst object create failed [exception]" %
(PLUGIN, inst_obj.resource_name, instance))
return None
@ -1266,20 +1275,25 @@ class fmAlarmObject:
# initialize the object with instance specific data
inst_obj.resource_name = self.resource_name
self._copy_instance_object(inst_obj)
inst_obj.instance_name = mp
inst_obj.instance = mp
# build the plugin instance name from the mount point
if mp == '/':
inst_obj.plugin_instance = 'root'
else:
inst_obj.plugin_instance = mp[1:].replace('/', '-')
inst_obj.degrade_id += '.' + 'filesystem=' + mp
for plugin_instance in DF_MANGLED_DICT:
if DF_MANGLED_DICT[plugin_instance] == mp:
inst_obj.plugin_instance = plugin_instance
break
else:
collectd.debug("%s no %s mountpoint" %
(PLUGIN, mp))
continue
inst_obj.entity_id = _build_entity_id(PLUGIN__DF,
inst_obj.plugin_instance)
# add this subordinate object to the parent's
# instance object list
self._add_instance_object(inst_obj, inst_obj.entity_id)
inst_obj.instance = inst_obj.instance_name
collectd.info("%s monitoring %s usage" %
(PLUGIN, inst_obj.instance))
@ -1362,8 +1376,14 @@ def _build_entity_id(plugin, plugin_instance):
entity_id += fmAlarmObject.host
if plugin == PLUGIN__MEM:
if plugin_instance != 'platform':
if 'node' in plugin_instance:
entity_id += '.numa=' + plugin_instance
elif plugin_instance:
entity_id += '.' + PLUGIN__MEM + '=' + plugin_instance
elif plugin == PLUGIN__CPU:
if plugin_instance:
entity_id += '.' + PLUGIN__CPU + '=' + plugin_instance
elif plugin == PLUGIN__VSWITCH_MEM:
@ -1393,22 +1413,17 @@ def _build_entity_id(plugin, plugin_instance):
# host=<hostname>.filesystem=<mountpoint>
if plugin_instance:
instance = plugin_instance
# build the entity_id for this plugin
entity_id += '.filesystem=/'
# collectd replaces the instance '/' with the word 'root'
# So skip over "root" as '/' is already part of the
# entity_id
if instance != 'root':
# Look for other instances that are in the mangled list
if instance in mangled_list:
instance = instance.replace('-', '/')
entity_id += instance
path = DF_MANGLED_DICT.get(plugin_instance)
if path:
entity_id += ".filesystem="
entity_id += path
else:
inst_error = True
if inst_error is True:
collectd.error("%s eid build failed ; missing instance" % plugin)
collectd.error("%s eid build failed; bad or missing instance '%s'" %
(plugin, plugin_instance))
return None
return entity_id
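Pulling the hunk's pieces together, a simplified model of the entity id construction (memory numa handling and the error flag are elided; the function name, hostname, and mapping subset are assumptions for illustration):

```python
DF_MAP = {'root': '/', 'var-log': '/var/log'}  # illustrative subset

def build_entity_id(plugin, plugin_instance, host='controller-0'):
    """Simplified sketch of entity id construction."""
    eid = 'host=' + host
    if plugin == 'df':
        path = DF_MAP.get(plugin_instance)
        if path is None:
            return None  # bad or missing instance
        eid += '.filesystem=' + path
    elif plugin_instance:
        eid += '.' + plugin + '=' + plugin_instance
    return eid
```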
@ -1450,41 +1465,46 @@ def _print_obj(obj):
num = len(obj.instance_objects)
if num > 0 or base_object is True:
prefix = "PLUGIN "
if num:
prefix += str(num)
else:
prefix += " "
prefix = "BASE " + str(num)
else:
prefix = "INSTANCE"
prefix = "......."
collectd.info("%s %s %s - %s - %s\n" %
(PLUGIN, prefix, obj.resource_name, obj.plugin, obj.id))
collectd.info("%s %s entity id: %s\n" % (PLUGIN, prefix, obj.entity_id))
collectd.info("%s %s degrade_id: %s\n" % (PLUGIN, prefix, obj.degrade_id))
collectd.info("%s %s instance : %s\n" %
(PLUGIN, prefix, obj.instance_name))
if obj.plugin_instance:
resource = obj.plugin + ":" + obj.plugin_instance
else:
resource = obj.plugin
collectd.info("%s %s Plugin Ins: %s\n" %
(PLUGIN, prefix, obj.plugin_instance))
if obj.warnings:
collectd.info("%s %s warnings: %s" %
(PLUGIN, prefix, obj.warnings))
if obj.failures:
collectd.info("%s %s failures: %s" %
(PLUGIN, prefix, obj.failures))
if obj.repair:
collectd.info("%s %s repair: %s" % (PLUGIN, prefix, obj.repair))
collectd.info("%s %s res: %s name: %s\n" %
(PLUGIN, prefix, resource, obj.resource_name))
collectd.info("%s eid : %s\n" % (PLUGIN, obj.entity_id))
collectd.info("%s inst: %s name: %s\n" %
(PLUGIN, obj.instance, obj.instance_name))
collectd.info("%s value:%2.1f thld:%2.1f cause:%s (%d) type:%s" %
(PLUGIN,
if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50:
collectd.info("%s %s reason: w: %s\n" %
(PLUGIN, prefix, obj.reason_warning))
collectd.info("%s %s reason: f: %s\n" %
(PLUGIN, prefix, obj.reason_failure))
collectd.info("%s %s value:%2.1f thld:%2.1f cause:%s count:%d type:%s\n" %
(PLUGIN, prefix,
obj.value,
obj.threshold,
obj.cause,
obj.count,
obj.reading_type))
collectd.info("%s warn:%s fail:%s" %
(PLUGIN, obj.warnings, obj.failures))
collectd.info("%s repair:t: %s" %
(PLUGIN, obj.repair))
if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50:
collectd.info("%s reason:w: %s\n"
"%s reason:f: %s\n" %
(PLUGIN, obj.reason_warning,
PLUGIN, obj.reason_failure))
# collectd.info(" ")
collectd.info("\n")
def _print_state(obj=None):
@ -1604,18 +1624,16 @@ def _clear_alarm_for_missing_filesystems():
if obj is not None and \
obj.plugin == PLUGIN__DF and \
obj.entity_id == eid and \
obj.plugin_instance != 'root':
obj.instance_name != '':
# For all others replace all '-' with '/'
path = '/' + obj.plugin_instance.replace('-', '/')
if os.path.ismount(path) is False:
if os.path.ismount(obj.instance_name) is False:
if clear_alarm(df_base_obj.id, obj.entity_id) is True:
collectd.info("%s cleared alarm for missing %s" %
(PLUGIN, path))
(PLUGIN, obj.instance_name))
df_base_obj.manage_alarm_lists(obj.entity_id, "okay")
else:
collectd.debug("%s maintaining alarm for %s" %
(PLUGIN, path))
(PLUGIN, obj.instance_name))
# Collectd calls this function on startup.
@ -1646,7 +1664,7 @@ def init_func():
# Constant Memory Plugin Object settings
obj = PLUGINS[PLUGIN__MEM]
obj.resource_name = "Platform Memory"
obj.resource_name = "Memory"
obj.instance_name = PLUGIN__MEM
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support; "
@ -1785,10 +1803,11 @@ def notifier_func(nObject):
if fmAlarmObject.host not in eid:
continue
base_obj = get_base_object(alarm_id)
inst_obj = get_object(alarm_id, eid)
if base_obj != inst_obj:
# is a plugin instance - clear it
# get the instance part of the eid
# instance based alarms are cleared over a process
# restart to avoid the potential for stuck alarms.
base_eid = 'host=' + os.uname()[1]
if eid.split(base_eid)[1]:
want_alarm_clear = True
collectd.info('%s found %s %s alarm [%s]' %
@ -1798,18 +1817,11 @@ def notifier_func(nObject):
eid))
if want_alarm_clear is True:
if clear_alarm(alarm_id, eid) is False:
collectd.error("%s %s:%s clear failed" %
(PLUGIN,
alarm_id,
eid))
else:
collectd.info("%s clear %s %s alarm %s" %
(PLUGIN,
alarm.severity,
alarm_id,
eid))
continue
if alarm.severity == "critical":
@ -1821,26 +1833,32 @@ def notifier_func(nObject):
continue
# Load the alarm severity by plugin/instance lookup.
base_obj = get_base_object(alarm_id)
if base_obj is not None:
base_obj.manage_alarm_lists(eid, sev)
# the eid at this point is really the plugin id
pid = eid
# here the eid is used to represent the degrade id
eid = base_obj.degrade_id
# handle degrade for alarmed resources
# over process startup.
# Note: 'ap' stands for alarmed_plugin
ap = ALARM_ID__TO__PLUGIN_DICT[alarm_id]
add = False
if alarm.severity == "critical" and\
ap in mtcDegradeObj.degrade_list__failure:
pid in mtcDegradeObj.degrade_list__failure:
add = True
elif alarm.severity == "major" and\
ap in mtcDegradeObj.degrade_list__warning:
pid in mtcDegradeObj.degrade_list__warning:
add = True
if add is True:
mtcDegradeObj.degrade_list.append(ap)
mtcDegradeObj.degrade_list.append(eid)
collectd.info("%s '%s' plugin added to "
"degrade list due to found "
"startup alarm %s" %
(PLUGIN_DEGRADE, ap, alarm_id))
(PLUGIN_DEGRADE, eid, alarm_id))
fmAlarmObject.fm_connectivity = True
collectd.info("%s connectivity with fm complete" % PLUGIN)
@ -1904,7 +1922,6 @@ def notifier_func(nObject):
elif nObject.plugin_instance:
need_instance_object_create = False
# Build the entity_id from the parent object if needed
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
try:
# Need lock when reading/writing any obj.instance_objects list
@ -1927,15 +1944,16 @@ def notifier_func(nObject):
base_obj.create_instance_object(nObject.plugin_instance)
inst_obj = base_obj._get_instance_object(eid)
if inst_obj:
inst_obj.instance_name = nObject.plugin_instance
collectd.debug("%s %s:%s inst object created" %
(PLUGIN,
inst_obj.plugin,
inst_obj.instance))
inst_obj.instance_name))
else:
collectd.error("%s %s:%s inst object create failed" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
nObject.plugin_instance_name))
return 0
# re-assign the object
@ -1971,13 +1989,19 @@ def notifier_func(nObject):
if action == "done":
return 0
# Handle degrade state update early in process start.
# Ensure that a degrade condition that clears over a collectd
# collectd process restart is cleared as soon as possible.
if obj.count == 0:
mtcDegradeObj.mtce_degrade_notifier(nObject)
# increment just before any possible return for a valid sample
obj.count += 1
# audit file system presence every time we get the
# notification for the root file system ; which will
# always be there.
if obj.instance == '/':
if obj.instance_name == '/':
_clear_alarm_for_missing_filesystems()
if len(mtcDegradeObj.degrade_list):
mtcDegradeObj.remove_degrade_for_missing_filesystems()
@ -1992,7 +2016,7 @@ def notifier_func(nObject):
# degrade state is periodically refreshed.
# However, rather than do this refresh on every notification,
# just do it for the root filesystem instance case.
if obj.instance == '/':
if obj.instance_name == '/':
mtcDegradeObj.mtce_degrade_notifier(nObject)
return 0
@ -2062,7 +2086,7 @@ def notifier_func(nObject):
_alarm_state,
base_obj.id,
severity_str,
obj.instance,
obj.instance_name,
obj.entity_id,
obj.value))