Merge "Fix memory instance handling over collectd process restart"

This commit is contained in:
Zuul 2020-12-10 14:23:24 +00:00 committed by Gerrit Code Review
commit 1e951176df
1 changed files with 169 additions and 145 deletions

View File

@ -154,32 +154,41 @@ FAIL = 1
# ... with configuration override
MTCE_CMD_RX_PORT = 2101
# Some plugin_instances are mangled by collectd.
# The filesystem plugin is especially bad for this.
# Filesystem plugin_instances are mangled by collectd.
# For instance the "/var/log" MountPoint instance is
# reported as "var-log".
# The following is the list of mangled instances
# that need the '-' replaced with '/'.
#
# The following dictionary provides the mapping between the stock
# df plugin instance name and the Linux filesystem path, where
# key = mangled filesystem instance from stock df plugin
# val = actual filesystem mountpoint path
#
# ADD_NEW_PLUGIN if there are new file systems being added that
# have subdirectories in the name then they will need to be added
# to the mangled list
# Set of df plugin instance names, as reported by collectd, whose '-'
# characters stand in for the '/' separators of the real mountpoint.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python, which would fuse two names into one
# bogus entry and drop both real names from the set.
mangled_list = {
    "dev-shm",
    "var-log",
    "var-run",
    "var-lock",
    "var-lib-rabbitmq",
    "var-lib-postgresql",
    "var-lib-ceph-mon",
    "var-lib-docker",
    "var-lib-docker-distribution",
    "var-lib-kubelet",
    "var-lib-nova-instances",
    "opt-platform",
    "opt-cgcs",
    "opt-etcd",
    "opt-extension",
    "opt-backups",
}
# Lookup table: mangled df plugin instance name -> real mountpoint path.
#
# Each key is derived from its mountpoint: the leading '/' is dropped and
# every remaining '/' becomes '-'; the bare '/' mountpoint, which would
# yield an empty name, is keyed as 'root'.
# Supported mountpoints are listed explicitly, one per line, in lookup order.
DF_MANGLED_DICT = {
    (mountpoint[1:].replace('/', '-') or 'root'): mountpoint
    for mountpoint in (
        '/',
        '/dev',
        '/tmp',
        '/boot',
        '/scratch',
        '/dev/shm',
        '/var/log',
        '/var/run',
        '/var/lock',
        '/var/lib/rabbitmq',
        '/var/lib/postgresql',
        '/var/lib/ceph/mon',
        '/var/lib/docker',
        '/var/lib/docker-distribution',
        '/var/lib/kubelet',
        '/var/lib/nova/instances',
        '/opt/platform',
        '/opt/etcd',
        '/opt/extension',
        '/opt/backups',
    )
}
# ADD_NEW_PLUGIN: add new alarm id definition
ALARM_ID__CPU = "100.101"
@ -330,8 +339,8 @@ class DegradeObject:
resources = ""
if self.degrade_list:
# loop over the list,
# limit the degraded resource list being sent to mtce to 5
for r in self.degrade_list[0:1:5]:
# limit the degraded resource list being sent to mtce to 2
for r in self.degrade_list[0:2]:
resources += r + ','
resources = resources[:-1]
state = "assert"
@ -350,13 +359,9 @@ class DegradeObject:
# Clear the message throttle counter
self.msg_throttle = 0
# if the degrade state has changed then log it and proceed
if self.last_state != state:
if self.last_state != "undef":
collectd.info("%s degrade %s %s" %
(PLUGIN_DEGRADE,
state,
self.degrade_list))
if self.degrade_list:
collectd.info("%s degrade list: %s" %
(PLUGIN_DEGRADE, self.degrade_list))
# Save state for next time
self.last_state = state
@ -405,25 +410,6 @@ class DegradeObject:
self.addr = None
self.protocol = socket.AF_INET
##########################################################################
#
# Name : _df_instance_to_path
#
# Purpose : Convert filesystem instance to path
#
# Returns : Created path
#
##########################################################################
def _df_instance_to_path(self, df_inst):
    """Convert a df instance name to a mountpoint.

    :param df_inst: instance name carrying a 3 character prefix
                    (e.g. 'df_var-log'); the prefix is dropped
                    without being checked.
    :return: '/' for 'df_root', otherwise '/' followed by the rest of
             the name with every '-' replaced by '/'.
    """
    # df_root is not a dynamic file system; it maps straight to '/'.
    if df_inst == 'df_root':
        return '/'

    # For all others strip the prefix and replace all '-' with '/'.
    # Note: a '-' that is genuinely part of a directory name is
    # converted too, since the name alone cannot tell them apart.
    return '/' + df_inst[3:].replace('-', '/')
##########################################################################
#
# Name : remove_degrade_for_missing_filesystems
@ -441,10 +427,10 @@ class DegradeObject:
for df_inst in self.degrade_list:
# Only file system plugins are looked at.
# File system plugin instance names are prefixed with 'df_'
# File system plugin instance names are prefixed with 'df:'
# as the first 3 chars in the instance name.
if df_inst[0:3] == 'df_':
path = self._df_instance_to_path(df_inst)
if df_inst[0:3] == 'df:':
path = df_inst.split('filesystem=')[1]
# check the mount point.
# if the mount point no longer exists then remove
@ -474,11 +460,21 @@ class DegradeObject:
remove = False
add = False
# Create the resource name from the notifier object.
# format: <plugin name>_<plugin_instance_name>
resource = nObject.plugin
if nObject.plugin_instance:
resource += "_" + nObject.plugin_instance
# Create the degrade id from the notifier object.
# Format: <plugin name>:host=<hostname>[.<plugin name>=<plugin_instance_name>]
# df only: <plugin name>:host=<hostname>.filesystem=<mountpoint path>
resource = nObject.plugin + ':' + 'host=' + os.uname()[1]
if nObject.plugin == PLUGIN__DF:
df_inst = DF_MANGLED_DICT.get(nObject.plugin_instance)
if df_inst:
resource += ".filesystem="
resource += df_inst
else:
collectd.error("%s df instance '%s' lookup failed; ignoring" %
(PLUGIN_DEGRADE, nObject.plugin_instance))
return
elif nObject.plugin_instance:
resource += '.' + nObject.plugin + '=' + nObject.plugin_instance
# This block looks at the current notification severity
# and manages the degrade_list.
@ -501,7 +497,7 @@ class DegradeObject:
# mountpoint is gone.
add = True
if nObject.plugin == PLUGIN__DF:
path = self._df_instance_to_path(resource)
path = DF_MANGLED_DICT.get(nObject.plugin_instance)
add = os.path.ismount(path)
else:
@ -521,7 +517,7 @@ class DegradeObject:
# mountpoint is gone.
add = True
if nObject.plugin == PLUGIN__DF:
path = self._df_instance_to_path(resource)
path = DF_MANGLED_DICT.get(nObject.plugin_instance)
add = os.path.ismount(path)
elif resource in self.degrade_list:
@ -537,11 +533,11 @@ class DegradeObject:
if remove is True:
self.degrade_list.remove(resource)
collectd.info("%s %s removed from degrade list" %
collectd.info("%s '%s' removed from degrade list" %
(PLUGIN_DEGRADE, resource))
elif add is True:
self.degrade_list.append(resource)
collectd.info("%s %s added to degrade list" %
collectd.info("%s '%s' added to degrade list" %
(PLUGIN_DEGRADE, resource))
@ -573,6 +569,20 @@ class fmAlarmObject:
self.resource_name = "" # The top level name of the resource
self.instance_name = "" # The instance name
# Unique identifier used in the degrade list to represent
# this alarm object.
#
# Base Object:
#
# Format : PLUGIN:host=<hostname>
# Example: memory:host=controller-0
#
# Instance Object:
#
# Format : <Base Object>.<instance>
# Example: memory:host=controller-0.memory=platform
self.degrade_id = plugin + ':' + 'host=' + os.uname()[1]
# Instance specific learned static class members.
self.entity_id = "" # fm entity id host=<hostname>.<instance>
self.instance = "" # <plugin>_<instance>
@ -778,12 +788,11 @@ class fmAlarmObject:
# setup resource name for filesystem instance usage log
if self.plugin == PLUGIN__DF:
resource = self.instance
resource = self.instance_name
elif self.plugin == PLUGIN__MEM:
if self.instance_name:
if self.instance_name != 'platform':
resource += ' ' + self.instance_name
resource = self.instance_name
# setup resource name for vswitch process instance name
elif self.plugin == PLUGIN__VSWITCH_MEM:
@ -944,7 +953,7 @@ class fmAlarmObject:
collectd.info("%s %s %s debounce '%s -> %s' (%2.2f) (%d:%d) %s" % (
PLUGIN,
base_obj.resource_name,
self.instance,
entity_id,
current_severity_str,
severity,
this_value,
@ -1121,6 +1130,8 @@ class fmAlarmObject:
:param eid: the index for the instance object dictionary
:return: object or None
"""
if eid is None:
return None
try:
collectd.debug("%s %s Get Lock ..." % (PLUGIN, self.plugin))
@ -1167,7 +1178,6 @@ class fmAlarmObject:
"""Copy select members of self object to target object"""
object.resource_name = self.resource_name
object.instance_name = self.instance_name
object.reading_type = self.reading_type
object.reason_warning = self.reason_warning
@ -1196,9 +1206,8 @@ class fmAlarmObject:
# initialize the object with instance specific data
inst_obj.instance_name = instance
inst_obj.entity_id = _build_entity_id(self.plugin,
instance)
inst_obj.degrade_id += '.' + self.plugin + '=' + instance
inst_obj.entity_id = _build_entity_id(self.plugin, instance)
self._add_instance_object(inst_obj, inst_obj.entity_id)
collectd.debug("%s created %s instance (%s) object %s" %
@ -1214,7 +1223,7 @@ class fmAlarmObject:
return inst_obj
except:
collectd.error("%s %s:%s inst object create failed" %
collectd.error("%s %s:%s inst object create failed [exception]" %
(PLUGIN, inst_obj.resource_name, instance))
return None
@ -1266,20 +1275,25 @@ class fmAlarmObject:
# initialize the object with instance specific data
inst_obj.resource_name = self.resource_name
self._copy_instance_object(inst_obj)
inst_obj.instance_name = mp
inst_obj.instance = mp
# build the plugin instance name from the mount point
if mp == '/':
inst_obj.plugin_instance = 'root'
else:
inst_obj.plugin_instance = mp[1:].replace('/', '-')
inst_obj.degrade_id += '.' + 'filesystem=' + mp
for plugin_instance in DF_MANGLED_DICT:
if DF_MANGLED_DICT[plugin_instance] == mp:
inst_obj.plugin_instance = plugin_instance
break
else:
collectd.debug("%s no %s mountpoint" %
(PLUGIN, mp))
continue
inst_obj.entity_id = _build_entity_id(PLUGIN__DF,
inst_obj.plugin_instance)
# add this subordinate object to the parent's
# instance object list
self._add_instance_object(inst_obj, inst_obj.entity_id)
inst_obj.instance = inst_obj.instance_name
collectd.info("%s monitoring %s usage" %
(PLUGIN, inst_obj.instance))
@ -1362,8 +1376,14 @@ def _build_entity_id(plugin, plugin_instance):
entity_id += fmAlarmObject.host
if plugin == PLUGIN__MEM:
if plugin_instance != 'platform':
if 'node' in plugin_instance:
entity_id += '.numa=' + plugin_instance
elif plugin_instance:
entity_id += '.' + PLUGIN__MEM + '=' + plugin_instance
elif plugin == PLUGIN__CPU:
if plugin_instance:
entity_id += '.' + PLUGIN__CPU + '=' + plugin_instance
elif plugin == PLUGIN__VSWITCH_MEM:
@ -1393,22 +1413,17 @@ def _build_entity_id(plugin, plugin_instance):
# host=<hostname>.filesystem=<mountpoint>
if plugin_instance:
instance = plugin_instance
# build the entity_id for this plugin
entity_id += '.filesystem=/'
# collectd replaces the instance '/' with the word 'root'
# So skip over "root" as '/' is already part of the
# entity_id
if instance != 'root':
# Look for other instances that are in the mangled list
if instance in mangled_list:
instance = instance.replace('-', '/')
entity_id += instance
path = DF_MANGLED_DICT.get(plugin_instance)
if path:
entity_id += ".filesystem="
entity_id += path
else:
inst_error = True
if inst_error is True:
collectd.error("%s eid build failed ; missing instance" % plugin)
collectd.error("%s eid build failed; bad or missing instance '%s'" %
(plugin, plugin_instance))
return None
return entity_id
@ -1450,41 +1465,46 @@ def _print_obj(obj):
num = len(obj.instance_objects)
if num > 0 or base_object is True:
prefix = "PLUGIN "
if num:
prefix += str(num)
else:
prefix += " "
prefix = "BASE " + str(num)
else:
prefix = "INSTANCE"
prefix = "......."
collectd.info("%s %s %s - %s - %s\n" %
(PLUGIN, prefix, obj.resource_name, obj.plugin, obj.id))
collectd.info("%s %s entity id: %s\n" % (PLUGIN, prefix, obj.entity_id))
collectd.info("%s %s degrade_id: %s\n" % (PLUGIN, prefix, obj.degrade_id))
collectd.info("%s %s instance : %s\n" %
(PLUGIN, prefix, obj.instance_name))
if obj.plugin_instance:
resource = obj.plugin + ":" + obj.plugin_instance
else:
resource = obj.plugin
collectd.info("%s %s Plugin Ins: %s\n" %
(PLUGIN, prefix, obj.plugin_instance))
if obj.warnings:
collectd.info("%s %s warnings: %s" %
(PLUGIN, prefix, obj.warnings))
if obj.failures:
collectd.info("%s %s failures: %s" %
(PLUGIN, prefix, obj.failures))
if obj.repair:
collectd.info("%s %s repair: %s" % (PLUGIN, prefix, obj.repair))
collectd.info("%s %s res: %s name: %s\n" %
(PLUGIN, prefix, resource, obj.resource_name))
collectd.info("%s eid : %s\n" % (PLUGIN, obj.entity_id))
collectd.info("%s inst: %s name: %s\n" %
(PLUGIN, obj.instance, obj.instance_name))
collectd.info("%s value:%2.1f thld:%2.1f cause:%s (%d) type:%s" %
(PLUGIN,
if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50:
collectd.info("%s %s reason: w: %s\n" %
(PLUGIN, prefix, obj.reason_warning))
collectd.info("%s %s reason: f: %s\n" %
(PLUGIN, prefix, obj.reason_failure))
collectd.info("%s %s value:%2.1f thld:%2.1f cause:%s count:%d type:%s\n" %
(PLUGIN, prefix,
obj.value,
obj.threshold,
obj.cause,
obj.count,
obj.reading_type))
collectd.info("%s warn:%s fail:%s" %
(PLUGIN, obj.warnings, obj.failures))
collectd.info("%s repair:t: %s" %
(PLUGIN, obj.repair))
if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50:
collectd.info("%s reason:w: %s\n"
"%s reason:f: %s\n" %
(PLUGIN, obj.reason_warning,
PLUGIN, obj.reason_failure))
# collectd.info(" ")
collectd.info("\n")
def _print_state(obj=None):
@ -1604,18 +1624,16 @@ def _clear_alarm_for_missing_filesystems():
if obj is not None and \
obj.plugin == PLUGIN__DF and \
obj.entity_id == eid and \
obj.plugin_instance != 'root':
obj.instance_name != '':
# For all others replace all '-' with '/'
path = '/' + obj.plugin_instance.replace('-', '/')
if os.path.ismount(path) is False:
if os.path.ismount(obj.instance_name) is False:
if clear_alarm(df_base_obj.id, obj.entity_id) is True:
collectd.info("%s cleared alarm for missing %s" %
(PLUGIN, path))
(PLUGIN, obj.instance_name))
df_base_obj.manage_alarm_lists(obj.entity_id, "okay")
else:
collectd.debug("%s maintaining alarm for %s" %
(PLUGIN, path))
(PLUGIN, obj.instance_name))
# Collectd calls this function on startup.
@ -1646,7 +1664,7 @@ def init_func():
# Constant Memory Plugin Object settings
obj = PLUGINS[PLUGIN__MEM]
obj.resource_name = "Platform Memory"
obj.resource_name = "Memory"
obj.instance_name = PLUGIN__MEM
obj.repair = "Monitor and if condition persists, "
obj.repair += "contact next level of support; "
@ -1785,10 +1803,11 @@ def notifier_func(nObject):
if fmAlarmObject.host not in eid:
continue
base_obj = get_base_object(alarm_id)
inst_obj = get_object(alarm_id, eid)
if base_obj != inst_obj:
# is a plugin instance - clear it
# get the instance part of the eid
# instance based alarms are cleared over a process
# restart to avoid the potential for stuck alarms.
base_eid = 'host=' + os.uname()[1]
if eid.split(base_eid)[1]:
want_alarm_clear = True
collectd.info('%s found %s %s alarm [%s]' %
@ -1798,18 +1817,11 @@ def notifier_func(nObject):
eid))
if want_alarm_clear is True:
if clear_alarm(alarm_id, eid) is False:
collectd.error("%s %s:%s clear failed" %
(PLUGIN,
alarm_id,
eid))
else:
collectd.info("%s clear %s %s alarm %s" %
(PLUGIN,
alarm.severity,
alarm_id,
eid))
continue
if alarm.severity == "critical":
@ -1821,26 +1833,32 @@ def notifier_func(nObject):
continue
# Load the alarm severity by plugin/instance lookup.
base_obj = get_base_object(alarm_id)
if base_obj is not None:
base_obj.manage_alarm_lists(eid, sev)
# the eid at this point is really the plugin id
pid = eid
# here the eid is used to represent the degrade id
eid = base_obj.degrade_id
# handle degrade for alarmed resources
# over process startup.
# Note: 'ap' stands for alarmed_plugin
ap = ALARM_ID__TO__PLUGIN_DICT[alarm_id]
add = False
if alarm.severity == "critical" and\
ap in mtcDegradeObj.degrade_list__failure:
pid in mtcDegradeObj.degrade_list__failure:
add = True
elif alarm.severity == "major" and\
ap in mtcDegradeObj.degrade_list__warning:
pid in mtcDegradeObj.degrade_list__warning:
add = True
if add is True:
mtcDegradeObj.degrade_list.append(ap)
mtcDegradeObj.degrade_list.append(eid)
collectd.info("%s '%s' plugin added to "
"degrade list due to found "
"startup alarm %s" %
(PLUGIN_DEGRADE, ap, alarm_id))
(PLUGIN_DEGRADE, eid, alarm_id))
fmAlarmObject.fm_connectivity = True
collectd.info("%s connectivity with fm complete" % PLUGIN)
@ -1904,7 +1922,6 @@ def notifier_func(nObject):
elif nObject.plugin_instance:
need_instance_object_create = False
# Build the entity_id from the parent object if needed
# Build the entity_id from the parent object if needed
eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
try:
# Need lock when reading/writing any obj.instance_objects list
@ -1927,15 +1944,16 @@ def notifier_func(nObject):
base_obj.create_instance_object(nObject.plugin_instance)
inst_obj = base_obj._get_instance_object(eid)
if inst_obj:
inst_obj.instance_name = nObject.plugin_instance
collectd.debug("%s %s:%s inst object created" %
(PLUGIN,
inst_obj.plugin,
inst_obj.instance))
inst_obj.instance_name))
else:
collectd.error("%s %s:%s inst object create failed" %
(PLUGIN,
nObject.plugin,
nObject.plugin_instance))
nObject.plugin_instance_name))
return 0
# re-assign the object
@ -1971,13 +1989,19 @@ def notifier_func(nObject):
if action == "done":
return 0
# Handle degrade state update early in process start.
# Ensure that a degrade condition that clears over a
# collectd process restart is cleared as soon as possible.
if obj.count == 0:
mtcDegradeObj.mtce_degrade_notifier(nObject)
# increment just before any possible return for a valid sample
obj.count += 1
# audit file system presence every time we get the
# notification for the root file system ; which will
# always be there.
if obj.instance == '/':
if obj.instance_name == '/':
_clear_alarm_for_missing_filesystems()
if len(mtcDegradeObj.degrade_list):
mtcDegradeObj.remove_degrade_for_missing_filesystems()
@ -1992,7 +2016,7 @@ def notifier_func(nObject):
# degrade state is periodically refreshed.
# However, rather than do this refresh on every notification,
# just do it for the root filesystem instance case.
if obj.instance == '/':
if obj.instance_name == '/':
mtcDegradeObj.mtce_degrade_notifier(nObject)
return 0
@ -2062,7 +2086,7 @@ def notifier_func(nObject):
_alarm_state,
base_obj.id,
severity_str,
obj.instance,
obj.instance_name,
obj.entity_id,
obj.value))