Merge remote-tracking branch 'starlingx/master' into HEAD

Change-Id: Ic0dcbf0f9548cb23f48565d702a00d3b07480735
Signed-off-by: Scott Little <scott.little@windriver.com>
This commit is contained in:
Scott Little 2019-02-14 12:29:21 -05:00
commit 01c473bb5e
5 changed files with 188 additions and 99 deletions

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -102,6 +102,7 @@ api = fm_api.FaultAPIs()
debug = False
debug_lists = False
want_state_audit = False
want_vswitch = False
# number of notifier loops before the state is object dumped
DEBUG_AUDIT = 2
@ -123,6 +124,8 @@ DATABASE_NAME = 'collectd samples'
READING_TYPE__PERCENT_USAGE = '% usage'
# Default invalid threshold value
INVALID_THRESHOLD = float(-1)
# collectd severity definitions ;
# Note: can't seem to pull them in symbolically with a header
@ -231,9 +234,11 @@ class PluginObject:
# [ 'float value string','float threshold string]
self.values = []
self.threshold = float(0) # float value of threshold
self.value = float(0) # float value of reading
# float value of threshold
self.threshold = float(INVALID_THRESHOLD)
# Common static class members.
self.reason_warning = ""
self.reason_failure = ""
@ -333,7 +338,8 @@ class PluginObject:
# Purpose : Manage sample value change.
#
# Handle no sample update case.
# Parse the notification log
# Parse the notification log.
# Handle base object instances.
# Generate a log entry if the sample value changes more than
# step value.
#
@ -385,6 +391,18 @@ class PluginObject:
# get the threshold if it's there.
if len(self.values) > 1:
self.threshold = float(self.values[1])
if nObject.plugin == PLUGIN__MEM:
if self.reading_type == READING_TYPE__PERCENT_USAGE:
# Note: add one to % usage reading types so that it
# matches how rmond did it. In collectd an
# overage is over the specified threshold
# whereas in rmon an overage is at threshold
# or above.
self.threshold = float(self.values[1]) + 1
else:
self.threshold = float(self.values[1])
else:
self.threshold = float(INVALID_THRESHOLD) # invalid value
except ValueError as ex:
collectd.error("%s %s value not integer or float (%s) (%s)" %
@ -429,6 +447,11 @@ class PluginObject:
if self.plugin == PLUGIN__DF:
resource = self.instance
elif self.plugin == PLUGIN__MEM:
if self.instance_name:
if self.instance_name != 'platform':
resource += ' ' + self.instance_name
# setup resource name for vswitch process instance name
elif self.plugin == PLUGIN__VSWITCH_MEM:
resource += ' Processor '
@ -754,7 +777,7 @@ class PluginObject:
(PLUGIN, inst_obj.resource_name,
inst_obj.entity_id, inst_obj))
collectd.debug("%s monitoring %s %s %s" %
collectd.info("%s monitoring %s %s %s" %
(PLUGIN,
inst_obj.resource_name,
inst_obj.instance_name,
@ -891,7 +914,11 @@ def _build_entity_id(plugin, plugin_instance):
entity_id = 'host='
entity_id += PluginObject.host
if plugin == PLUGIN__VSWITCH_MEM:
if plugin == PLUGIN__MEM:
if plugin_instance != 'platform':
entity_id += '.numa=' + plugin_instance
elif plugin == PLUGIN__VSWITCH_MEM:
# host=<hostname>.processor=<socket-id>
if plugin_instance:
@ -933,15 +960,6 @@ def _build_entity_id(plugin, plugin_instance):
instance = instance.replace('-', '/')
entity_id += instance
# Will be uncommented when the numa memory monitor is added
# to the platform memory plugin.
#
#elif plugin == PLUGIN__MEM:
# if plugin_instance is not 'platform':
# # host=controller-0.numa=node0
# entity_id += '.numa='
# entity_id += plugin_instance
if inst_error is True:
collectd.error("%s eid build failed ; missing instance" % plugin)
return None
@ -1211,7 +1229,9 @@ def init_func():
obj._create_instance_objects()
# ntp query is for controllers only
if tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:
if want_vswitch is False:
collectd.debug("%s vSwitch monitoring disabled" % PLUGIN)
elif tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:
#######################################################################
@ -1507,9 +1527,9 @@ def notifier_func(nObject):
# if this is a threshold alarm then build the reason text that
# includes the threshold and the reading that caused the assertion.
reason = obj.resource_name
reason += " threshold exceeded"
if obj.threshold:
reason += "; threshold {:2.0f} ".format(obj.threshold) + "%, "
reason += " threshold exceeded ;"
if obj.threshold != INVALID_THRESHOLD:
reason += " threshold {:2.0f}".format(obj.threshold) + "%,"
if obj.value:
reason += " actual {:2.0f}".format(obj.value) + "%"
@ -1542,14 +1562,13 @@ def notifier_func(nObject):
# update the lists now that
base_obj._manage_alarm(obj.entity_id, severity_str)
collectd.info("%s %s alarm %s:%s %s:%s thld:%2.2f value:%2.2f" % (
collectd.info("%s %s alarm %s:%s %s:%s value:%2.2f" % (
PLUGIN,
_alarm_state,
base_obj.id,
severity_str,
obj.instance,
obj.entity_id,
obj.threshold,
obj.value))
# Debug only: comment out for production code.

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -19,11 +19,9 @@ import collectd
debug = False
# general return codes
PASS = 0
FAIL = 1
PLUGIN = 'platform memory usage'
PLUGIN_NUMA = 'numa memory usage'
PLUGIN_HUGE = 'hugepage memory usage'
# Memory Control class
@ -41,8 +39,10 @@ class MEM:
CommitLimit = 0
Committed_AS = 0
HugePages_Total = 0
HugePages_Free = 0
Hugepagesize = 0
AnonPages = 0
FilePages = 0
# derived values
avail = 0
@ -54,6 +54,27 @@ class MEM:
obj = MEM()
def log_meminfo(plugin, name, meminfo):
    """ Dump the supplied meminfo sample to the collectd info log.

    Nothing is logged while the module-level 'debug' flag is False.

    plugin  -- label placed at the start of every log line
    name    -- title logged ahead of the field dump
    meminfo -- object whose memory attributes are logged
    """

    if debug is not False:
        # (log format, meminfo attribute) pairs, in the order they are logged
        field_formats = (
            ("%s memTotal_kB : %f", 'memTotal_kB'),
            ("%s memFree_kB : %f", 'memFree_kB'),
            ("%s Buffers : %f", 'buffers'),
            ("%s Cached : %f", 'cached'),
            ("%s SReclaimable : %f", 'SReclaimable'),
            ("%s CommitLimit : %f", 'CommitLimit'),
            ("%s Committed_AS : %f", 'Committed_AS'),
            ("%s HugePages_Total: %f", 'HugePages_Total'),
            ("%s HugePages_Free : %f", 'HugePages_Free'),
            ("%s Hugepagesize : %f", 'Hugepagesize'),
            ("%s AnonPages : %f", 'AnonPages'),
        )

        # header: sample title followed by a separator line
        collectd.info("%s %s" % (plugin, name))
        collectd.info("%s ---------------------------" % plugin)

        # one log line per field
        for line_format, attribute in field_formats:
            collectd.info(line_format % (plugin, getattr(meminfo, attribute)))
def config_func(config):
"""
Configure the memory usage plugin
@ -110,7 +131,12 @@ def read_func():
except EnvironmentError as e:
collectd.error("%s unable to read from %s ; str(e)" %
(PLUGIN, str(e)))
return FAIL
return 0
# setup the sample structure
val = collectd.Values(host=obj.hostname)
val.type = 'percent'
val.type_instance = 'used'
# remove the 'unit' (kB) suffix that might be on some of the lines
for line in meminfo:
@ -130,20 +156,11 @@ def read_func():
obj.CommitLimit = float(meminfo['CommitLimit'])
obj.Committed_AS = float(meminfo['Committed_AS'])
obj.HugePages_Total = float(meminfo['HugePages_Total'])
obj.HugePages_Free = float(meminfo['HugePages_Free'])
obj.Hugepagesize = float(meminfo['Hugepagesize'])
obj.AnonPages = float(meminfo['AnonPages'])
# collectd.info("%s /proc/meminfo: %s" % (PLUGIN, meminfo))
# collectd.info("%s ---------------------------" % PLUGIN)
# collectd.info("%s memTotal_kB : %f" % (PLUGIN, obj.memTotal_kB))
# collectd.info("%s memFree_kB : %f" % (PLUGIN, obj.memFree_kB))
# collectd.info("%s Buffers : %f" % (PLUGIN, obj.buffers))
# collectd.info("%s Cached : %f" % (PLUGIN, obj.cached))
# collectd.info("%s SReclaimable : %f" % (PLUGIN, obj.SReclaimable))
# collectd.info("%s CommitLimit : %f" % (PLUGIN, obj.CommitLimit))
# collectd.info("%s Committed_AS : %f" % (PLUGIN, obj.Committed_AS))
# collectd.info("%s HugePages_Total: %f" % (PLUGIN, obj.HugePages_Total))
# collectd.info("%s AnonPages : %f" % (PLUGIN, obj.AnonPages))
log_meminfo(PLUGIN, "/proc/meminfo", obj)
obj.avail = float(float(obj.memFree_kB) +
float(obj.buffers) +
@ -152,38 +169,93 @@ def read_func():
obj.total = float(float(obj.avail) +
float(obj.AnonPages))
# collectd.info("%s ---------------------------" % PLUGIN)
# collectd.info("%s memTotal: %d" % (PLUGIN, obj.avail))
# collectd.info("%s memAvail: %d" % (PLUGIN, obj.total))
if obj.strict == 1:
obj.value = float(float(obj.Committed_AS) / float(obj.CommitLimit))
else:
obj.value = float(float(obj.AnonPages) / float(obj.total))
obj.value = float(float(obj.value) * 100)
# get numa node memory
# numa_node_files = []
# fn = "/sys/devices/system/node/"
# files = os.listdir(fn)
# for file in files:
# if 'node' in file:
# numa_node_files.append(fn + file)
# collectd.info("%s numa node files: %s" %
# (PLUGIN, numa_node_files))
collectd.debug('%s reports %.2f %% usage' %
(PLUGIN, obj.value))
if debug is True:
collectd.info("%s ---------------------------" % PLUGIN)
collectd.info("%s memAvail: %d" % (PLUGIN, obj.avail))
collectd.info("%s memTotal: %d" % (PLUGIN, obj.total))
collectd.info('%s reports %.2f %% usage' % (PLUGIN, obj.value))
# Dispatch usage value to collectd
val = collectd.Values(host=obj.hostname)
val.plugin = 'memory'
val.type = 'percent'
val.type_instance = 'used'
val.plugin_instance = 'platform'
val.dispatch(values=[obj.value])
return PASS
#####################################################################
# Now get the Numa Node Memory Usage
#####################################################################
numa_node_files = []
fn = "/sys/devices/system/node/"
files = os.listdir(fn)
for file in files:
if 'node' in file:
numa_node_files.append(fn + file + '/meminfo')
for numa_node in numa_node_files:
meminfo = {}
try:
with open(numa_node) as fd:
for line in fd:
meminfo[line.split()[2][0:-1]] = line.split()[3].strip()
obj.memFree_kB = float(meminfo['MemFree'])
obj.FilePages = float(meminfo['FilePages'])
obj.SReclaimable = float(meminfo['SReclaimable'])
obj.AnonPages = float(meminfo['AnonPages'])
obj.HugePages_Total = float(meminfo['HugePages_Total'])
obj.HugePages_Free = float(meminfo['HugePages_Free'])
log_meminfo(PLUGIN, numa_node, obj)
avail = float(float(obj.memFree_kB) +
float(obj.FilePages) +
float(obj.SReclaimable))
total = float(float(avail) +
float(obj.AnonPages))
obj.value = float(float(obj.AnonPages)) / float(total)
obj.value = float(float(obj.value) * 100)
# Dispatch usage value to collectd for this numa node
val.plugin_instance = numa_node.split('/')[5]
val.dispatch(values=[obj.value])
collectd.debug('%s reports %s at %.2f %% usage (%s)' %
(PLUGIN_NUMA,
val.plugin,
obj.value,
val.plugin_instance))
# Numa Node Huge Page Memory Monitoring
#
# Only monitor if there is Huge Page Memory
if obj.HugePages_Total > 0:
obj.value = \
float(float(obj.HugePages_Total -
obj.HugePages_Free)) / \
float(obj.HugePages_Total)
obj.value = float(float(obj.value) * 100)
# Dispatch huge page memory usage value
# to collectd for this numa node.
val.plugin_instance = numa_node.split('/')[5] + '_hugepages'
val.dispatch(values=[obj.value])
collectd.debug('%s reports %s at %.2f %% usage (%s)' %
(PLUGIN_HUGE,
val.plugin,
obj.value,
val.plugin_instance))
except EnvironmentError as e:
collectd.error("%s unable to read from %s ; str(e)" %
(PLUGIN_NUMA, str(e)))
return 0
collectd.register_config(config_func)

View File

@ -1,8 +1,10 @@
#
# Copyright (c) 2018 Wind River Systems, Inc.
# Copyright (c) 2018-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#############################################################################
#
# This file is the collectd 'Maintenance' Notifier.
#
# Collectd provides information about each event as an object passed to the
@ -50,10 +52,6 @@ NOTIF_FAILURE = 1
NOTIF_WARNING = 2
NOTIF_OKAY = 4
# generic return codes
PASS = 0
FAIL = 1
# default mtce port.
# ... with configuration override
MTCE_CMD_RX_PORT = 2101
@ -292,7 +290,7 @@ def notifier_func(nObject):
else:
collectd.info("%s unsupported severity %d" %
(PLUGIN, nObject.severity))
return FAIL
return 0
# running counter of notifications.
obj.msg_throttle += 1