e8c9676d98
This update introduces interface monitoring for oam, mgmt and infra networks as a collectd plugin. The interface plugin runs and queries the new maintenance Link Monitor daemon for Link Model and Information every 10 seconds. The plugin then manages alarms based on the link model similar to how rmon did in the past ; port and interface alarms. Severity: Interface and Port levels Alarm Level Minor Major Critical ----------- ----- --------------------- ---------------------------- Interface N/A One of lag pair is Up All Interface ports are Down Port N/A Physical Link is Down N/A Degrade support for interface monitoring is add to the mtce degrade notifier. Any link down condition results in a host degrade condition like was in rmon. Sample Data: represented as % of total links Up for that network interface 100 or 100% percent used - all links of interface are up. 50 or 50% percent used - one of lag pair is Up and the other is Down 0 or 0% percent used - all ports for that network are Down The plugin documents all of this in its header. This update also 1. Adds the new lmond process to syslog-ng config file. 2. Adds the new lmond process to the mtce patch script. 3. Modifies the cpu, df and memory threshold settings by -1. rmon thresholds were precise whereas collectd requires that the samples cross the thresholds, not just meet them. So for example, in terms of a 90% usage action the threshold needs to be 89. 
Test Plan: (WIP but almost complete) PASS: Verify interface plugin startup PASS: Verify interface plugin logging PASS: Verify interface plugin Link Status Query and response handling PASS: Verify monitor, sample storage and grafana display PASS: verify port and interface alarm matches what rmon produced PASS: Verify lmon port config from manifest configured plugin PASS: Verify lmon port config from lmon.conf PASS: Verify single interface failure handling and recovery PASS: Verify lagged interface failure handling and recovery PASS: Verify link loss of lagged interface shared between mgmt and oam (hp380) PASS: Verify network interface failure handling ; single port PASS: Verify network interface degrade handling ; lagged interface PEND: Verify network interface degrade handling ; vlan interface PASS: Verify HTTP request timeout period and handling PASS: Verify link status query failure handling - invalid uri (timeout) PASS: Verify link status query failure handling - missing uri (timeout) PASS: Verify link status query failure handling - status fail PASS: Verify link status query failure handling - bad json resp Change-Id: I2e2dfe6ddfa06a46770245540c7153d330bdf196 Story: 2002823 Task: 28635 Depends-On: https://review.openstack.org/#/c/633264 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
396 lines
14 KiB
Python
Executable File
396 lines
14 KiB
Python
Executable File
#
|
|
# Copyright (c) 2018 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
# This file is the collectd 'Maintenance' Notifier.
|
|
#
|
|
# Collectd provides information about each event as an object passed to the
|
|
# notification handler ; the notification object.
|
|
#
|
|
# object.host - the hostname
|
|
#
|
|
# object.plugin - the name of the plugin aka resource
|
|
# object.plugin_instance - plugin instance string i.e. say mountpoint
|
|
# for df plugin
|
|
# object.type - the unit i.e. percent or absolute
|
|
# object.type_instance - the attribute i.e. free, used, etc
|
|
#
|
|
# object.severity - an integer value 1=failure, 2=warning, 4=okay
|
|
# object.message - a log-able message containing the above along
|
|
# with the value
|
|
#
|
|
# This notifier manages requesting mtce to assert or clear its collectd
|
|
# host-degrade-cause flag based on notification messages sent from collectd.
|
|
#
|
|
# Messages to maintenance are throttled ONE_EVERY while this state is the
|
|
# same as last state.
|
|
#
|
|
# Message is sent on every state change
|
|
# from clear to assert or
|
|
# from assert to clear
|
|
#
|
|
# See code comments for details.
|
|
#
|
|
############################################################################
|
|
#
|
|
# Import list
|
|
|
|
import os
|
|
import socket
|
|
import collectd
|
|
import tsconfig.tsconfig as tsc
|
|
|
|
# Name of this plugin ; used as the prefix of the logs it produces
PLUGIN = 'degrade notifier'

# collectd notification severity values.
# Note: defined locally ; there seems to be no way to pull them in
#       symbolically from a collectd header.
NOTIF_FAILURE = 1
NOTIF_WARNING = 2
NOTIF_OKAY = 4

# generic return codes
PASS = 0
FAIL = 1

# default mtce port.
# ... can be overridden through the plugin configuration
MTCE_CMD_RX_PORT = 2101

# same state message throttle count.
# ... while the degrade state (assert or clear) stays the same, the
#     degrade message is only sent once every 'this' many notifications.
ONE_EVERY = 10

# Names of the plugins this notifier knows about
PLUGIN__DF = 'df'
PLUGIN__MEM = 'memory'
PLUGIN__CPU = 'cpu'

PLUGIN__VSWITCH_MEM = 'vswitch_mem'
PLUGIN__VSWITCH_CPU = 'vswitch_cpu'
PLUGIN__VSWITCH_PORT = 'vswitch_port'
PLUGIN__VSWITCH_IFACE = 'vswitch_iface'

PLUGIN_INTERFACE = 'interface'
PLUGIN__EXAMPLE = 'example'
|
|
|
|
|
|
# The collectd Maintenance Notifier Object
|
|
class collectdMtceNotifierObject:
    """
    The collectd Maintenance Notifier state.

    Holds where degrade messages are sent, which plugins cause degrade
    and the running degrade / message throttle state.
    """

    def __init__(self, port):
        """
        collectdMtceNotifierObject Class constructor

        port - the maintenance port that degrade messages are sent to
        """
        # Messaging target ; the address starts out unset (None) while
        # the port is whatever the caller supplied.
        self.addr = None
        self.port = port

        # The protocol family used when messaging maintenance.
        # Starts as IPV4 ; updated with AF_INET6 if the system is
        # learned to be IPV6.
        self.protocol = socket.AF_INET

        # Plugin names that require degrade on a 'warning' severity.
        self.degrade_list__warning = [PLUGIN_INTERFACE]

        # Plugin names that require degrade on a 'failure' severity.
        self.degrade_list__failure = [
            PLUGIN__DF,
            PLUGIN__MEM,
            PLUGIN__CPU,
            PLUGIN__VSWITCH_MEM,
            PLUGIN__VSWITCH_CPU,
            PLUGIN__VSWITCH_PORT,
            PLUGIN__VSWITCH_IFACE,
            PLUGIN_INTERFACE,
            PLUGIN__EXAMPLE,
        ]

        # The running list of resources that currently require degrade.
        #  - empty     : a degrade clear message is sent.
        #  - not empty : a degrade assert message is sent.
        self.degrade_list = []

        # Used to throttle down the sending of duplicate degrade
        # assert/clear messages.
        self.msg_throttle = 0
        self.last_state = "undef"
|
|
|
|
|
|
# Instantiate the mtce_notifier object
|
|
# This object persists from notification to notification
|
|
obj = collectdMtceNotifierObject(MTCE_CMD_RX_PORT)
|
|
|
|
|
|
def _get_active_controller_ip():
    """
    Get the active controller host IP

    Resolves the 'controller' name and stores the address of the first
    result in obj.addr. On any failure obj.addr is set to None and an
    error is logged. Always returns 0.
    """

    try:
        addr_info = socket.getaddrinfo('controller', None)
        # first result -> its sockaddr tuple -> the address string
        obj.addr = addr_info[0][4][0]
        collectd.info("%s controller ip: %s" % (PLUGIN, obj.addr))
    except Exception as err:
        obj.addr = None
        collectd.error("%s failed to get controller ip ; %s" %
                       (PLUGIN, str(err)))

    return 0
|
|
|
|
|
|
def _df_instance_to_path(df_inst):
|
|
"""
|
|
Convert a df instance name to a mountpoint
|
|
"""
|
|
|
|
# df_root is not a dynamic file system. Ignore that one.
|
|
if df_inst == 'df_root':
|
|
return '/'
|
|
else:
|
|
# For all others replace all '-' with '/'
|
|
return('/' + df_inst[3:].replace('-', '/'))
|
|
|
|
|
|
# This function removes degraded file systems that are no longer present.
|
|
def _clear_degrade_for_missing_filesystems():
    """
    Remove degraded file systems that are no longer mounted or present.

    Walks obj.degrade_list and removes every file system resource
    whose mountpoint no longer exists. Always returns 0.
    """

    # Iterate over a copy of the list. Removing entries from the list
    # that is being iterated would skip the entry that follows each
    # removed one and leave its stale degrade behind.
    for df_inst in list(obj.degrade_list):

        # Only file system plugins are looked at.
        # File system plugin instance names are prefixed with 'df_'
        # as the first 3 chars in the instance name.
        if not df_inst.startswith('df_'):
            continue

        path = _df_instance_to_path(df_inst)

        # check the mount point.
        # if the mount point no longer exists then remove
        # this instance from the degrade list.
        if not os.path.ismount(path):
            collectd.info("%s clearing degrade for missing %s ; %s" %
                          (PLUGIN, path, obj.degrade_list))
            obj.degrade_list.remove(df_inst)

    return 0
|
|
|
|
|
|
# The collectd configuration interface
|
|
#
|
|
# Used to configure the maintenance port.
|
|
# key = 'port'
|
|
# val = port number
|
|
#
|
|
def config_func(config):
    """
    Configure the maintenance degrade notifier plugin.

    config - the collectd configuration object ; its children are
             searched for a 'port' key.

    Returns 0 once a 'port' key has been applied. If no 'port' key is
    found the default maintenance port is used and an error is logged.
    """

    collectd.debug('%s config function' % PLUGIN)

    for child in config.children:
        option = child.key.lower()
        value = child.values[0]

        # 'port' is the only key this plugin looks for.
        if option != 'port':
            continue

        obj.port = int(value)
        collectd.info("%s configured mtce port: %d" %
                      (PLUGIN, obj.port))
        return 0

    # No port was configured ; fall back to the default.
    obj.port = MTCE_CMD_RX_PORT
    collectd.error("%s no mtce port provided ; defaulting to %d" %
                   (PLUGIN, obj.port))
|
|
|
|
|
|
# Collectd calls this function on startup.
|
|
def init_func():
    """
    Collectd Mtce Notifier Initialization Function

    Saves this host's name in obj.host and logs the node type,
    host name and the mtce port that messages are sent to.
    """

    # second field of uname is the node (host) name
    hostname = os.uname()[1]
    obj.host = hostname

    startup_log = "%s %s:%s sending to mtce port %d" % (
        PLUGIN, tsc.nodetype, hostname, obj.port)
    collectd.info(startup_log)

    collectd.debug("%s init function" % PLUGIN)
|
|
|
|
|
|
# This is the Notifier function that is called by collectd.
|
|
#
|
|
# Handling steps are
|
|
#
|
|
# 1. build resource name from notification object.
|
|
# 2. check resource against severity lists.
|
|
# 3. manage this instance's degrade state.
|
|
# 4. send mtcAgent the degrade state message.
|
|
#
|
|
def notifier_func(nObject):
    """
    Collectd Mtce Notifier Handler Function

    Updates the running degrade list from this notification and then
    sends the resulting degrade state ; 'assert' or 'clear' ; to
    maintenance, subject to same-state message throttling.

    nObject - the collectd notification object ; its plugin,
              plugin_instance, severity and host attributes are read.

    Returns FAIL for an unsupported severity, otherwise 0.
    """

    # Create the resource name from the notifier object.
    # format: <plugin name>_<plugin_instance_name>
    resource = nObject.plugin
    if nObject.plugin_instance:
        resource += "_" + nObject.plugin_instance

    # This block looks at the current notification severity
    # and manages the degrade_list.
    # If the specified plugin name exists in the warning or failure
    # list and there is a current severity match then add that
    # resource instance to the degrade list.
    # Conversely if this notification is OKAY then make sure this
    # resource instance is not in the degrade list (remove it if it is)
    #
    # Note: severities are compared by value (==). Comparing integers
    #       by identity (is) only works by accident of small integer
    #       caching in the interpreter.
    if nObject.severity == NOTIF_OKAY:
        if resource in obj.degrade_list:
            obj.degrade_list.remove(resource)

    elif nObject.severity == NOTIF_FAILURE:
        _manage_degrade_list(nObject, resource, obj.degrade_list__failure)

    elif nObject.severity == NOTIF_WARNING:
        _manage_degrade_list(nObject, resource, obj.degrade_list__warning)

    else:
        collectd.info("%s unsupported severity %d" %
                      (PLUGIN, nObject.severity))
        return FAIL

    # running counter of notifications.
    obj.msg_throttle += 1

    # Support for Dynamic File Systems
    # --------------------------------
    # Some active controller mounted filesystems can become
    # unmounted under the watch of collectd. This can occur
    # as a result of a Swact. If a 'degrade' is raised at the
    # time an fs disappears then that state can become stuck
    # active until the next Swact. This call handles this case.
    #
    # Audit file system presence every time we get the
    # notification for the root file system.
    # Depending on the root filesystem always being there.
    if nObject.plugin == PLUGIN__DF \
            and nObject.plugin_instance == 'root' \
            and obj.degrade_list:
        _clear_degrade_for_missing_filesystems()

    # If degrade list is empty then a clear state is sent to maintenance.
    # If degrade list is NOT empty then an assert state is sent to maintenance
    # For logging and to ease debug the code below will create a list of
    # degraded resource instances to be included in the message to maintenance
    # for mtcAgent to optionally log it.
    resources = ""
    if obj.degrade_list:
        # limit the degraded resource list being sent to mtce to 5
        # Note: a [0:1:5] slice only ever yields the first entry.
        resources = ','.join(obj.degrade_list[:5])
        state = "assert"
    else:
        state = "clear"

    # Message throttling ....

    # Avoid sending the same last state message for up to ONE_EVERY count.
    # Just reduce load on mtcAgent
    if obj.last_state == state and obj.msg_throttle < ONE_EVERY:
        return 0

    # if the degrade state has changed then log it and proceed
    if obj.last_state != state and obj.last_state != "undef":
        collectd.info("%s degrade %s %s" %
                      (PLUGIN,
                       state,
                       obj.degrade_list))

    # Save state for next time
    obj.last_state = state

    # Clear the message throttle counter
    obj.msg_throttle = 0

    # (Re)learn the controller address if it is not known.
    # Done before the socket is opened so that giving up here
    # does not leave an open socket behind.
    if obj.addr is None:
        _get_active_controller_ip()
        if obj.addr is None:
            return 0

    # Send the degrade state ; assert or clear message to mtcAgent.
    # If we get a send failure then log it and set the addr to None
    # so it forces us to refresh the controller address on the next
    # notification
    mtce_socket = None
    try:
        mtce_socket = socket.socket(obj.protocol, socket.SOCK_DGRAM)

        # Create the Maintenance message.
        message = "{\"service\":\"collectd_notifier\","
        message += "\"hostname\":\"" + nObject.host + "\","
        message += "\"degrade\":\"" + state + "\","
        message += "\"resource\":\"" + resources + "\"}"
        collectd.debug("%s: %s" % (PLUGIN, message))

        # sendto requires bytes on python3 ; text is encoded while a
        # message that already is bytes (python2 str) is sent as is.
        if not isinstance(message, bytes):
            message = message.encode('utf-8')

        mtce_socket.settimeout(1.0)
        mtce_socket.sendto(message, (obj.addr, obj.port))

    except socket.error as e:
        if e.args and e.args[0] == socket.EAI_ADDRFAMILY:
            # Handle IPV4 to IPV6 switchover:
            obj.protocol = socket.AF_INET6
            collectd.info("%s %s ipv6 addressing (%s)" %
                          (PLUGIN, resource, obj.addr))
        else:
            collectd.error("%s %s socket error (%s) ; %s" %
                           (PLUGIN, resource, obj.addr, str(e)))
            # try self correction
            obj.addr = None
            obj.protocol = socket.AF_INET

    finally:
        # always release the socket ; success or failure
        if mtce_socket is not None:
            mtce_socket.close()

    return 0


def _manage_degrade_list(nObject, resource, plugin_list):
    """
    Add 'resource' to the degrade list if its plugin degrades at the
    notification's severity.

    nObject     - the collectd notification object
    resource    - the resource name built from the notification
    plugin_list - the plugin names that cause degrade at the severity
                  of this notification
    """

    if not plugin_list:
        # Nothing causes degrade at this severity so make sure this
        # resource is not in the degrade list. Should never occur.
        if resource in obj.degrade_list:
            obj.degrade_list.remove(resource)
        return

    if nObject.plugin not in plugin_list:
        return

    if resource in obj.degrade_list:
        return

    # handle dynamic filesystems going missing over a swact
    # or unmount and being reported as a transient error by
    # the df plugin. Don't add it to the degrade list if the
    # mountpoint is gone.
    if nObject.plugin == PLUGIN__DF:
        if not os.path.ismount(_df_instance_to_path(resource)):
            return

    collectd.info("%s %s added to degrade list" %
                  (PLUGIN, resource))
    obj.degrade_list.append(resource)
|
|
|
|
# Register this plugin's configuration, initialization and
# notification handlers with collectd.
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_notification(notifier_func)
|