diff --git a/collectd-extensions/src/fm_notifier.py b/collectd-extensions/src/fm_notifier.py index c84e136..d2a5297 100755 --- a/collectd-extensions/src/fm_notifier.py +++ b/collectd-extensions/src/fm_notifier.py @@ -154,32 +154,41 @@ FAIL = 1 # ... with configuration override MTCE_CMD_RX_PORT = 2101 -# Some plugin_instances are mangled by collectd. -# The filesystem plugin is especially bad for this. +# Filesystem plugin_instances are mangled by collectd. # For instance the "/var/log" MountPoint instance is # reported as "var-log". -# The following is a list of mangled instances list -# that need the '-' replaced with '/'. +# +# The following is a dictionary that provides mapping between the +# stock df plugin instance name and the linux filesystem path where the +# key = mangled filesystem instance from stock df plugin +# val = actual filesystem mountpoint path # # ADD_NEW_PLUGIN if there are new file systems being added that # have subdirectories in the name then they will need to be added # to the mangled list -mangled_list = {"dev-shm", - "var-log", - "var-run", - "var-lock", - "var-lib-rabbitmq", - "var-lib-postgresql", - "var-lib-ceph-mon", - "var-lib-docker", - "var-lib-docker-distribution" - "var-lib-kubelet", - "var-lib-nova-instances", - "opt-platform", - "opt-cgcs", - "opt-etcd", - "opt-extension", - "opt-backups"} +DF_MANGLED_DICT = { + # instance : path + 'root': '/', + 'dev': '/dev', + 'tmp': '/tmp', + 'boot': '/boot', + 'scratch': '/scratch', + 'dev-shm': '/dev/shm', + 'var-log': '/var/log', + 'var-run': '/var/run', + 'var-lock': '/var/lock', + 'var-lib-rabbitmq': '/var/lib/rabbitmq', + 'var-lib-postgresql': '/var/lib/postgresql', + 'var-lib-ceph-mon': '/var/lib/ceph/mon', + 'var-lib-docker': '/var/lib/docker', + 'var-lib-docker-distribution': '/var/lib/docker-distribution', + 'var-lib-kubelet': '/var/lib/kubelet', + 'var-lib-nova-instances': '/var/lib/nova/instances', + 'opt-platform': '/opt/platform', + 'opt-etcd': '/opt/etcd', + 'opt-extension': 
'/opt/extension', + 'opt-backups': '/opt/backups'} + # ADD_NEW_PLUGIN: add new alarm id definition ALARM_ID__CPU = "100.101" @@ -330,8 +339,8 @@ class DegradeObject: resources = "" if self.degrade_list: # loop over the list, - # limit the degraded resource list being sent to mtce to 5 - for r in self.degrade_list[0:1:5]: + # limit the degraded resource list being sent to mtce to 2 + for r in self.degrade_list[0:2]: resources += r + ',' resources = resources[:-1] state = "assert" @@ -350,13 +359,9 @@ class DegradeObject: # Clear the message throttle counter self.msg_throttle = 0 - # if the degrade state has changed then log it and proceed - if self.last_state != state: - if self.last_state != "undef": - collectd.info("%s degrade %s %s" % - (PLUGIN_DEGRADE, - state, - self.degrade_list)) + if self.degrade_list: + collectd.info("%s degrade list: %s" % + (PLUGIN_DEGRADE, self.degrade_list)) # Save state for next time self.last_state = state @@ -405,25 +410,6 @@ class DegradeObject: self.addr = None self.protocol = socket.AF_INET - ########################################################################## - # - # Name : _df_instance_to_path - # - # Purpose : Convert filesystem instance to path - # - # Returns : Created path - # - ########################################################################## - def _df_instance_to_path(self, df_inst): - """Convert a df instance name to a mountpoint""" - - # df_root is not a dynamic file system. Ignore that one. - if df_inst == 'df_root': - return '/' - else: - # For all others replace all '-' with '/' - return('/' + df_inst[3:].replace('-', '/')) - ########################################################################## # # Name : remove_degrade_for_missing_filesystems @@ -441,10 +427,10 @@ class DegradeObject: for df_inst in self.degrade_list: # Only file system plugins are looked at. 
- # File system plugin instance names are prefixed with 'df_' + # File system plugin instance names are prefixed with 'df:' # as the first 3 chars in the instance name. - if df_inst[0:3] == 'df_': - path = self._df_instance_to_path(df_inst) + if df_inst[0:3] == 'df:': + path = df_inst.split('filesystem=')[1] # check the mount point. # if the mount point no longer exists then remove @@ -474,11 +460,21 @@ class DegradeObject: remove = False add = False - # Create the resource name from the notifier object. - # format: _ - resource = nObject.plugin - if nObject.plugin_instance: - resource += "_" + nObject.plugin_instance + # Create the degrade id from the notifier object. + # Format: :host=. + resource = nObject.plugin + ':' + 'host=' + os.uname()[1] + if nObject.plugin == PLUGIN__DF: + df_inst = DF_MANGLED_DICT.get(nObject.plugin_instance) + if df_inst: + resource += ".filesystem=" + resource += df_inst + else: + collectd.error("%s df instance '%s' lookup failed; ignoring" % + (PLUGIN_DEGRADE, nObject.plugin_instance)) + return + + elif nObject.plugin_instance: + resource += '.' + nObject.plugin + '=' + nObject.plugin_instance # This block looks at the current notification severity # and manages the degrade_list. @@ -501,7 +497,7 @@ class DegradeObject: # mountpoint is gone. add = True if nObject.plugin == PLUGIN__DF: - path = self._df_instance_to_path(resource) + path = DF_MANGLED_DICT.get(nObject.plugin_instance) add = os.path.ismount(path) else: @@ -521,7 +517,7 @@ class DegradeObject: # mountpoint is gone. 
add = True if nObject.plugin == PLUGIN__DF: - path = self._df_instance_to_path(resource) + path = DF_MANGLED_DICT.get(nObject.plugin_instance) add = os.path.ismount(path) elif resource in self.degrade_list: @@ -537,11 +533,11 @@ class DegradeObject: if remove is True: self.degrade_list.remove(resource) - collectd.info("%s %s removed from degrade list" % + collectd.info("%s '%s' removed from degrade list" % (PLUGIN_DEGRADE, resource)) elif add is True: self.degrade_list.append(resource) - collectd.info("%s %s added to degrade list" % + collectd.info("%s '%s' added to degrade list" % (PLUGIN_DEGRADE, resource)) @@ -573,6 +569,20 @@ class fmAlarmObject: self.resource_name = "" # The top level name of the resource self.instance_name = "" # The instance name + # Unique identifier used in the degrade list to represent + # this alarm object. + # + # Base Object: + # + # Format : PLUGIN:host= + # Example: memory:host=controller-0 + # + # Instance Object: + # + # Format: .instance> + # Example: memory:host=controller-0.memory=platform + self.degrade_id = plugin + ':' + 'host=' + os.uname()[1] + # Instance specific learned static class members. self.entity_id = "" # fm entity id host=. 
self.instance = "" # _ @@ -778,12 +788,11 @@ class fmAlarmObject: # setup resource name for filesystem instance usage log if self.plugin == PLUGIN__DF: - resource = self.instance + resource = self.instance_name elif self.plugin == PLUGIN__MEM: if self.instance_name: - if self.instance_name != 'platform': - resource += ' ' + self.instance_name + resource = self.instance_name # setup resource name for vswitch process instance name elif self.plugin == PLUGIN__VSWITCH_MEM: @@ -944,7 +953,7 @@ class fmAlarmObject: collectd.info("%s %s %s debounce '%s -> %s' (%2.2f) (%d:%d) %s" % ( PLUGIN, base_obj.resource_name, - self.instance, + entity_id, current_severity_str, severity, this_value, @@ -1121,6 +1130,8 @@ class fmAlarmObject: :param eid: the index for the instance object dictionary :return: object or None """ + if eid is None: + return None try: collectd.debug("%s %s Get Lock ..." % (PLUGIN, self.plugin)) @@ -1167,7 +1178,6 @@ class fmAlarmObject: """Copy select members of self object to target object""" object.resource_name = self.resource_name - object.instance_name = self.instance_name object.reading_type = self.reading_type object.reason_warning = self.reason_warning @@ -1196,9 +1206,8 @@ class fmAlarmObject: # initialize the object with instance specific data inst_obj.instance_name = instance - inst_obj.entity_id = _build_entity_id(self.plugin, - instance) - + inst_obj.degrade_id += '.' 
+ self.plugin + '=' + instance + inst_obj.entity_id = _build_entity_id(self.plugin, instance) self._add_instance_object(inst_obj, inst_obj.entity_id) collectd.debug("%s created %s instance (%s) object %s" % @@ -1214,7 +1223,7 @@ class fmAlarmObject: return inst_obj except: - collectd.error("%s %s:%s inst object create failed" % + collectd.error("%s %s:%s inst object create failed [exception]" % (PLUGIN, inst_obj.resource_name, instance)) return None @@ -1266,20 +1275,25 @@ class fmAlarmObject: # initialize the object with instance specific data inst_obj.resource_name = self.resource_name + self._copy_instance_object(inst_obj) inst_obj.instance_name = mp - inst_obj.instance = mp - # build the plugin instance name from the mount point - if mp == '/': - inst_obj.plugin_instance = 'root' - else: - inst_obj.plugin_instance = mp[1:].replace('/', '-') + inst_obj.degrade_id += '.' + 'filesystem=' + mp + for plugin_instance in DF_MANGLED_DICT: + if DF_MANGLED_DICT[plugin_instance] == mp: + inst_obj.plugin_instance = plugin_instance + break + else: + collectd.debug("%s no %s mountpoint" % + (PLUGIN, mp)) + continue inst_obj.entity_id = _build_entity_id(PLUGIN__DF, inst_obj.plugin_instance) # add this subordinate object to the parent's # instance object list self._add_instance_object(inst_obj, inst_obj.entity_id) + inst_obj.instance = inst_obj.instance_name collectd.info("%s monitoring %s usage" % (PLUGIN, inst_obj.instance)) @@ -1362,8 +1376,14 @@ def _build_entity_id(plugin, plugin_instance): entity_id += fmAlarmObject.host if plugin == PLUGIN__MEM: - if plugin_instance != 'platform': + if 'node' in plugin_instance: entity_id += '.numa=' + plugin_instance + elif plugin_instance: + entity_id += '.' + PLUGIN__MEM + '=' + plugin_instance + + elif plugin == PLUGIN__CPU: + if plugin_instance: + entity_id += '.' 
+ PLUGIN__CPU + '=' + plugin_instance elif plugin == PLUGIN__VSWITCH_MEM: @@ -1393,22 +1413,17 @@ def _build_entity_id(plugin, plugin_instance): # host=.filesystem= if plugin_instance: - instance = plugin_instance - # build the entity_id for this plugin - entity_id += '.filesystem=/' - - # collectd replaces the instance '/' with the word 'root' - # So skip over "root" as '/' is already part of the - # entity_id - if instance != 'root': - # Look for other instances that are in the mangled list - if instance in mangled_list: - instance = instance.replace('-', '/') - entity_id += instance + path = DF_MANGLED_DICT.get(plugin_instance) + if path: + entity_id += ".filesystem=" + entity_id += path + else: + inst_error = True if inst_error is True: - collectd.error("%s eid build failed ; missing instance" % plugin) + collectd.error("%s eid build failed; bad or missing instance '%s'" % + (plugin, plugin_instance)) return None return entity_id @@ -1450,41 +1465,46 @@ def _print_obj(obj): num = len(obj.instance_objects) if num > 0 or base_object is True: - prefix = "PLUGIN " - if num: - prefix += str(num) - else: - prefix += " " + prefix = "BASE " + str(num) else: - prefix = "INSTANCE" + prefix = "......." 
+ + collectd.info("%s %s %s - %s - %s\n" % + (PLUGIN, prefix, obj.resource_name, obj.plugin, obj.id)) + + collectd.info("%s %s entity id: %s\n" % (PLUGIN, prefix, obj.entity_id)) + collectd.info("%s %s degrade_id: %s\n" % (PLUGIN, prefix, obj.degrade_id)) + + collectd.info("%s %s instance : %s\n" % + (PLUGIN, prefix, obj.instance_name)) if obj.plugin_instance: - resource = obj.plugin + ":" + obj.plugin_instance - else: - resource = obj.plugin + collectd.info("%s %s Plugin Ins: %s\n" % + (PLUGIN, prefix, obj.plugin_instance)) + if obj.warnings: + collectd.info("%s %s warnings: %s" % + (PLUGIN, prefix, obj.warnings)) + if obj.failures: + collectd.info("%s %s failures: %s" % + (PLUGIN, prefix, obj.failures)) + if obj.repair: + collectd.info("%s %s repair: %s" % (PLUGIN, prefix, obj.repair)) - collectd.info("%s %s res: %s name: %s\n" % - (PLUGIN, prefix, resource, obj.resource_name)) - collectd.info("%s eid : %s\n" % (PLUGIN, obj.entity_id)) - collectd.info("%s inst: %s name: %s\n" % - (PLUGIN, obj.instance, obj.instance_name)) - collectd.info("%s value:%2.1f thld:%2.1f cause:%s (%d) type:%s" % - (PLUGIN, + if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50: + collectd.info("%s %s reason: w: %s\n" % + (PLUGIN, prefix, obj.reason_warning)) + collectd.info("%s %s reason: f: %s\n" % + (PLUGIN, prefix, obj.reason_failure)) + + collectd.info("%s %s value:%2.1f thld:%2.1f cause:%s count:%d type:%s\n" % + (PLUGIN, prefix, obj.value, obj.threshold, obj.cause, obj.count, obj.reading_type)) - collectd.info("%s warn:%s fail:%s" % - (PLUGIN, obj.warnings, obj.failures)) - collectd.info("%s repair:t: %s" % - (PLUGIN, obj.repair)) - if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50: - collectd.info("%s reason:w: %s\n" - "%s reason:f: %s\n" % - (PLUGIN, obj.reason_warning, - PLUGIN, obj.reason_failure)) - # collectd.info(" ") + + collectd.info("\n") def _print_state(obj=None): @@ -1604,18 +1624,16 @@ def _clear_alarm_for_missing_filesystems(): if obj is not None and \ obj.plugin 
== PLUGIN__DF and \ obj.entity_id == eid and \ - obj.plugin_instance != 'root': + obj.instance_name != '': - # For all others replace all '-' with '/' - path = '/' + obj.plugin_instance.replace('-', '/') - if os.path.ismount(path) is False: + if os.path.ismount(obj.instance_name) is False: if clear_alarm(df_base_obj.id, obj.entity_id) is True: collectd.info("%s cleared alarm for missing %s" % - (PLUGIN, path)) + (PLUGIN, obj.instance_name)) df_base_obj.manage_alarm_lists(obj.entity_id, "okay") else: collectd.debug("%s maintaining alarm for %s" % - (PLUGIN, path)) + (PLUGIN, obj.instance_name)) # Collectd calls this function on startup. @@ -1646,7 +1664,7 @@ def init_func(): # Constant Memory Plugin Object settings obj = PLUGINS[PLUGIN__MEM] - obj.resource_name = "Platform Memory" + obj.resource_name = "Memory" obj.instance_name = PLUGIN__MEM obj.repair = "Monitor and if condition persists, " obj.repair += "contact next level of support; " @@ -1785,10 +1803,11 @@ def notifier_func(nObject): if fmAlarmObject.host not in eid: continue - base_obj = get_base_object(alarm_id) - inst_obj = get_object(alarm_id, eid) - if base_obj != inst_obj: - # is a plugin instance - clear it + # get the instance part of the eid + # instance based alarms are cleared over a process + # restart to avoid the potential for stuck alarms. + base_eid = 'host=' + os.uname()[1] + if eid.split(base_eid)[1]: want_alarm_clear = True collectd.info('%s found %s %s alarm [%s]' % @@ -1798,18 +1817,11 @@ def notifier_func(nObject): eid)) if want_alarm_clear is True: - if clear_alarm(alarm_id, eid) is False: collectd.error("%s %s:%s clear failed" % (PLUGIN, alarm_id, eid)) - else: - collectd.info("%s clear %s %s alarm %s" % - (PLUGIN, - alarm.severity, - alarm_id, - eid)) continue if alarm.severity == "critical": @@ -1821,26 +1833,32 @@ def notifier_func(nObject): continue # Load the alarm severity by plugin/instance lookup. 
+ base_obj = get_base_object(alarm_id) if base_obj is not None: base_obj.manage_alarm_lists(eid, sev) + # the eid at this point is really the plugin id + pid = eid + + # here the eid is used to represent the degrade id + eid = base_obj.degrade_id + # handle degrade for alarmed resources # over process startup. - # Note: 'ap' stands for alarmed_plugin - ap = ALARM_ID__TO__PLUGIN_DICT[alarm_id] add = False if alarm.severity == "critical" and\ - ap in mtcDegradeObj.degrade_list__failure: + pid in mtcDegradeObj.degrade_list__failure: add = True elif alarm.severity == "major" and\ - ap in mtcDegradeObj.degrade_list__warning: + pid in mtcDegradeObj.degrade_list__warning: add = True if add is True: - mtcDegradeObj.degrade_list.append(ap) + + mtcDegradeObj.degrade_list.append(eid) collectd.info("%s '%s' plugin added to " "degrade list due to found " "startup alarm %s" % - (PLUGIN_DEGRADE, ap, alarm_id)) + (PLUGIN_DEGRADE, eid, alarm_id)) fmAlarmObject.fm_connectivity = True collectd.info("%s connectivity with fm complete" % PLUGIN) @@ -1904,7 +1922,6 @@ def notifier_func(nObject): elif nObject.plugin_instance: need_instance_object_create = False # Build the entity_id from the parent object if needed - # Build the entity_id from the parent object if needed eid = _build_entity_id(nObject.plugin, nObject.plugin_instance) try: # Need lock when reading/writing any obj.instance_objects list @@ -1927,15 +1944,16 @@ def notifier_func(nObject): base_obj.create_instance_object(nObject.plugin_instance) inst_obj = base_obj._get_instance_object(eid) if inst_obj: + inst_obj.instance_name = nObject.plugin_instance collectd.debug("%s %s:%s inst object created" % (PLUGIN, inst_obj.plugin, - inst_obj.instance)) + inst_obj.instance_name)) else: collectd.error("%s %s:%s inst object create failed" % (PLUGIN, nObject.plugin, - nObject.plugin_instance)) + nObject.plugin_instance_name)) return 0 # re-assign the object @@ -1971,13 +1989,19 @@ def notifier_func(nObject): if action == "done": return 
0 + # Handle degrade state update early in process start. + # Ensure that a degrade condition that clears over a collectd + # collectd process restart is cleared as soon as possible. + if obj.count == 0: + mtcDegradeObj.mtce_degrade_notifier(nObject) + # increment just before any possible return for a valid sample obj.count += 1 # audit file system presence every time we get the # notification for the root file system ; which will # always be there. - if obj.instance == '/': + if obj.instance_name == '/': _clear_alarm_for_missing_filesystems() if len(mtcDegradeObj.degrade_list): mtcDegradeObj.remove_degrade_for_missing_filesystems() @@ -1992,7 +2016,7 @@ def notifier_func(nObject): # degrade state is periodically refreshed. # However, rather than do this refresh on every notification, # just do it for the root filesystem instance case. - if obj.instance == '/': + if obj.instance_name == '/': mtcDegradeObj.mtce_degrade_notifier(nObject) return 0 @@ -2062,7 +2086,7 @@ def notifier_func(nObject): _alarm_state, base_obj.id, severity_str, - obj.instance, + obj.instance_name, obj.entity_id, obj.value))