Add Remote Logging Server connectivity monitoring to collectd

This update adds Remote Logging Server connectivity monitoring
to the StarlingX set of collectd monitoring plugins.

This update excludes monitoring of IPV6 remote logging servers.
Only IPV4 remote logging servers are supported.

Story: 2002823
Task: 28636

Test Plan:
PASS: Verify monitoring on controller nodes only
PASS: Verify system install
PASS: Verify plugin logging is value added
PASS: Verify connectivity failure to success handling
PASS: Verify connectivity success to failure handling
PASS: Verify connected / not connected logging on service state change
PASS: Verify connected / not connected logging on connectivity state change
PASS: Verify service enabled to disabled state transition with alarm asserted
PASS: Verify service enabled to disabled state transition while connected
PASS: Verify service disabled to enabled state transition with connectivity
PASS: Verify service disabled to enabled state transition without connectivity
PASS: Verify plugin audit interval is every 60 seconds
PASS: Verify plugin alarm assert debounce of 2
PASS: Verify plugin alarm clear with no debounce
PASS: Verify plugin alarm assert over process start on TCP conn failure
PASS: Verify plugin alarm severity as Minor
PASS: Verify plugin alarm clear over process restart
PASS: Verify plugin alarm is cleared on service disable transition
PASS: Verify plugin sample data

Change-Id: I73cd35170ed19abce17bb4f511f0c5e04bc101c6
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2019-03-10 15:19:48 -04:00
parent 01c473bb5e
commit 8be6e4846a
6 changed files with 369 additions and 1 deletions

View File

@ -16,7 +16,9 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/ntpq.conf \
$PKG_BASE/src/interface.py \
$PKG_BASE/src/interface.conf \
$PKG_BASE/src/remotels.py \
$PKG_BASE/src/remotels.conf \
$PKG_BASE/src/example.py \
$PKG_BASE/src/example.conf"
TIS_PATCH_VER=7
TIS_PATCH_VER=8

View File

@ -23,6 +23,7 @@ Source12: memory.py
Source14: example.py
Source15: ntpq.py
Source16: interface.py
Source17: remotels.py
# collectd plugin conf files into /etc/collectd.d
Source100: python_plugins.conf
@ -32,6 +33,7 @@ Source103: df.conf
Source104: example.conf
Source105: ntpq.conf
Source106: interface.conf
Source107: remotels.conf
BuildRequires: systemd-devel
@ -75,6 +77,7 @@ install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir}
# collectd plugin conf files into /etc/collectd.d
@ -85,6 +88,7 @@ install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE107} %{buildroot}%{local_plugin_dir}
%clean
rm -rf $RPM_BUILD_ROOT

View File

@ -33,6 +33,7 @@ class PluginObject(object):
self.plugin = plugin # the name of this plugin
self.hostname = '' # the name of this host
self.port = 0 # the port number for this plugin
self.base_eid = '' # the base entity id host=<hostname>
# dynamic gate variables
self.config_complete = False # set to True once config is complete
@ -42,6 +43,8 @@ class PluginObject(object):
# dynamic variables set in read_func
self.usage = float(0) # last usage value recorded as float
self.audits = 0 # number of audit since init
self.enabled = False # tracks a plugin's enabled state
self.alarmed = False # tracks the current alarmed state
# http and json specific variables
self.url = url # target url

View File

@ -14,6 +14,7 @@ LoadPlugin python
<Module "interface">
Port 2122
</Module>
Import "remotels"
LogTraces = true
Encoding "utf-8"
</Plugin>

View File

@ -0,0 +1,13 @@
# Threshold settings for the 'remotels' plugin 'reachable' sample.
# Option notes below follow the collectd-threshold(5) man page ;
# NOTE(review): confirm against the installed collectd version.
<Plugin "threshold">
<Plugin "remotels">
<Type "absolute">
Instance "reachable"
# Persist / PersistOK: generate a notification for each sample that is
# out of / in the acceptable range, not only on a state change
Persist true
PersistOK true
# lower bounds of the acceptable range for the warning and failure states
WarningMin 1
FailureMin 0
# number of times the threshold must be passed before notifying
Hits 2
# do not invert the acceptable range
Invert false
</Type>
</Plugin>
</Plugin>

View File

@ -0,0 +1,345 @@
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
############################################################################
#
# This is the Remote Logging Server plugin for collectd.
#
# The Remote Logging Server is enabled if /etc/syslog-ng/syslog-ng.conf
# contains '@include remotelogging.conf'
#
# There is no asynchronous notification of remote logging server
# configuration enable/disable state changes. Therefore, each audit
# interval needs to check whether it's enabled or not.
#
# every audit interval ...
#
# read_func:
# check enabled:
# if disabled and alarmed:
# clear alarm
# if enabled:
# get ip and port
# query status
# if connected and alarmed:
# clear alarm
# if not connected and not alarmed:
# raise alarm
#
# system remotelogging-modify --ip_address <ip address>
# --transport tcp
# --enabled True
#
############################################################################
import os
import collectd
import tsconfig.tsconfig as tsc
import plugin_common as pc
from fm_api import constants as fm_constants
from oslo_concurrency import processutils
from fm_api import fm_api
# Fault manager API Object ; used to set and clear this plugin's alarm
api = fm_api.FaultAPIs()
# name of the plugin ; used as the 'plugin' field of dispatched samples
PLUGIN_NAME = 'remotels'
# all logs produced by this plugin are prefixed with this
PLUGIN = 'remote logging server'
# Remote logging server audit interval in seconds ; the read function
# is registered with collectd using this interval
PLUGIN_AUDIT_INTERVAL = 60
# Sample Data 'type' and 'instance' database field values.
# These are the 'type' and 'type_instance' of every dispatched sample.
PLUGIN_TYPE = 'absolute'
PLUGIN_TYPE_INSTANCE = 'reachable'
# Remote Logging Connectivity Alarm ID
PLUGIN_ALARMID = '100.118'
# The file where this plugin learns if remote logging is enabled
# and, when enabled, the protocol, address and port of the server
SYSLOG_CONF_FILE = '/etc/syslog-ng/syslog-ng.conf'
# Plugin Control Object ; holds the enabled / alarmed / usage / audits state
# NOTE(review): the empty second argument looks like the unused 'url' ; confirm
obj = pc.PluginObject(PLUGIN, "")
# Raise Remote Logging Server Alarm
def raise_alarm():
    """Raise the Remote Logging Server connectivity alarm.

    Builds a minor severity, non service affecting fault for alarm
    PLUGIN_ALARMID against entity obj.base_eid and submits it through
    api.set_fault.

    obj.alarmed is set to True only when set_fault returns a uuid-like
    value. Any failure is logged and never propagated to the caller, so
    obj.alarmed stays unchanged and the raise can be attempted again.
    """

    repair = ('Ensure Remote Log Server IP is reachable from '
              'Controller through OAM interface; otherwise '
              'contact next level of support.')

    reason = ('Controller cannot establish connection with '
              'remote logging server.')

    try:
        fault = fm_api.Fault(
            alarm_id=PLUGIN_ALARMID,
            alarm_state=fm_constants.FM_ALARM_STATE_SET,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
            entity_instance_id=obj.base_eid,
            severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
            reason_text=reason,
            alarm_type=fm_constants.FM_ALARM_TYPE_1,
            probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_6,
            proposed_repair_action=repair,
            service_affecting=False,
            suppression=False)

        alarm_uuid = api.set_fault(fault)
        if pc.is_uuid_like(alarm_uuid) is False:
            # set_fault did not hand back an alarm uuid ; treat as failure
            collectd.error("%s %s:%s set_fault failed:%s" %
                           (PLUGIN, PLUGIN_ALARMID,
                            obj.base_eid, alarm_uuid))
        else:
            collectd.info("%s %s:%s alarm raised" %
                          (PLUGIN, PLUGIN_ALARMID, obj.base_eid))
            obj.alarmed = True

    # 'Exception' rather than a bare except so that SystemExit and
    # KeyboardInterrupt are not swallowed by the alarm handling.
    except Exception:
        collectd.error("%s %s:%s set_fault exception" %
                       (PLUGIN, PLUGIN_ALARMID, obj.base_eid))
# Clear remote logging server alarm
def clear_alarm():
    """Clear the Remote Logging Server connectivity alarm.

    Issues api.clear_fault for PLUGIN_ALARMID against obj.base_eid.

    Returns:
        True if clear_fault completed without raising ; obj.alarmed is
        then set to False.
        False if clear_fault raised ; the failure is logged and
        obj.alarmed is left unchanged so the clear can be retried.

    NOTE(review): the success path is reconstructed so that any
    non-raising clear_fault counts as success, whether or not an alarm
    was actually cleared ; confirm against the intended indentation.
    """

    try:
        if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is True:
            collectd.info("%s alarm cleared" % PLUGIN)
        obj.alarmed = False
        return True

    # 'Exception' rather than a bare except so that SystemExit and
    # KeyboardInterrupt are not swallowed by the alarm handling.
    except Exception:
        collectd.error("%s %s:%s clear failed ; will retry" %
                       (PLUGIN, PLUGIN_ALARMID, obj.base_eid))
        return False
# The config function - called once on collectd process startup
def config_func(config):
    """Record that plugin configuration has completed.

    The supplied 'config' is not consulted ; everything this plugin
    needs is learned during normal monitoring.

    Returns 0.
    """

    # flag the control object so later stages see config as complete
    obj.config_done = True
    return 0
# The init function - called once on collectd process startup
def init_func():
    """Initialise the plugin control object.

    Returns:
        0 on non-controller nodes, where this plugin does nothing.
        False when init has not been done and the control object
        reports that it is not ready yet.
        True once hostname and base entity id are set and init is done.
    """

    # only controllers monitor the remote logging server
    if tsc.nodetype != 'controller':
        return 0

    # not initialised yet and not ready to be ; let the caller retry
    if obj.init_done is False and obj.init_ready() is False:
        return False

    host = obj.gethostname()
    obj.hostname = host
    obj.base_eid = 'host=' + host
    obj.init_done = True
    collectd.info("%s initialization complete" % PLUGIN)

    return True
def _read_syslog_conf():
    """Return the lines of SYSLOG_CONF_FILE ; [] if the file is missing."""
    if os.path.exists(SYSLOG_CONF_FILE) is not True:
        return []
    with open(SYSLOG_CONF_FILE, 'r') as infile:
        return infile.readlines()


def _remote_logging_enabled(conf_lines):
    """Return True if a line includes "remotelogging.conf"."""
    for line in conf_lines:
        if line.startswith('@include '):
            # split() rather than split(' ')[1] so that an '@include '
            # line with no argument cannot raise IndexError
            fields = line.split()
            if len(fields) > 1 and fields[1] == '"remotelogging.conf"':
                return True
    return False


def _find_remote_server(conf_lines):
    """Return (protocol, address, port) strings of the remote log server.

    Parses the line that looks like this

                  tag                proto     address      port
     ----------------------------- ---   --------------  ---
     destination remote_log_server {tcp("128.224.186.65" port(514));};

    Returns None, after logging the reason, if no such line could be
    parsed.
    """
    candidate_seen = False
    for line in conf_lines:
        if not line.startswith('destination remote_log_server'):
            continue
        candidate_seen = True

        fields = line.split('{')
        if len(fields) < 2:
            collectd.error("%s remote log server line parse error"
                           " ; %s" % (PLUGIN, line))
            continue

        spec = fields[1]
        try:
            protocol = spec[0:3]
            address = spec.split('"')[1]
            port = spec.split('(')[2].split(')')[0]
        except IndexError:
            # the quote or bracket delimiters are missing from the line
            collectd.error("%s remote log server credentials "
                           "parse exception ; (%s)" % (PLUGIN, line))
            continue

        if not protocol or not address or not port:
            collectd.error("%s remote log server credentials "
                           "parse error ; (%s:%s:%s)" %
                           (PLUGIN, protocol, address, port))
            return None

        # line parsed ; move on ...
        return protocol, address, port

    if candidate_seen is False:
        collectd.error("%s remote log server destination not found ; %s" %
                       (PLUGIN, SYSLOG_CONF_FILE))
    return None


def _proc_net_endpoint(address, port):
    """Return 'address:port' as the upper case hex string to grep for.

    The dotted ipv4 octets are written in reverse order, two hex digits
    each, followed by ':' and the port as four hex digits.
    Example: 128.224.186.65 port 514 -> 41BAE080:0202

    Raises ValueError if address is not four integer octets or port is
    not an integer.
    """
    octets = address.split('.')
    if len(octets) != 4:
        raise ValueError("expected 4 octets in %r" % address)
    hex_ip = ''.join('%02X' % int(octet) for octet in reversed(octets))
    return '%s:%04X' % (hex_ip, int(port))


# The sample read function - called on every audit interval
def read_func():
    """Remote logging server connectivity plugin read function.

    Every audit interval, on controllers only:
      - learn from SYSLOG_CONF_FILE whether remote logging is enabled
      - on the first audit issue a blind alarm clear
      - if disabled, clear the alarm if it is raised and stop
      - if enabled, parse the server protocol, address and port and
        look up the socket state of that remote endpoint in
        /proc/net/<protocol>
      - state '01' is treated as connected ; the alarm is cleared
        immediately
      - any other result is treated as not connected ; the alarm is
        raised once the previous reading was also not connected
      - dispatch obj.usage (1 connected, 0 not connected) to collectd

    Returns 0, or 1 if the server credentials cannot be parsed.
    IPV6 servers are not monitored ; 0 is returned without a sample.
    """

    # remote logging server monitoring is for controllers only
    if tsc.nodetype != 'controller':
        return 0

    if obj.init_done is False:
        init_func()
        return 0

    # read the config once ; it feeds both the enabled check and the
    # server credentials parse below
    conf_lines = _read_syslog_conf()

    # check to see if remote logging is enabled
    previously_enabled = obj.enabled
    obj.enabled = _remote_logging_enabled(conf_lines)

    if previously_enabled == obj.enabled:
        logit = False
    else:
        if obj.enabled is False:
            collectd.info("%s is disabled" % PLUGIN)
        else:
            collectd.info("%s is enabled" % PLUGIN)
        logit = True

    # Handle startup case by clearing existing alarm if it's raised.
    # It's runtime cheaper and simpler to issue a blind clear than query.
    if obj.audits == 0:
        if clear_alarm() is False:
            # if clear fails then retry next time
            return 0
        if obj.enabled is False:
            collectd.info("%s is disabled" % PLUGIN)
        obj.audits = 1

    if obj.enabled is False:
        if obj.alarmed is True:
            clear_alarm()
        return 0

    # If we get here then the server is enabled ...
    # Need to query it
    server = _find_remote_server(conf_lines)
    if server is None:
        # reason already logged ; nothing to query this audit
        return 1
    protocol, address, port = server

    if ':' in address:
        # Monitoring of IPV6 is not currently supported
        return 0
    ipv = 4

    # This plugin detects server connectivity through its socket status.
    # The files being looked at (/proc/net/tcp(udp)) use hex values,
    # so build the caps hex form of the server address and port.
    try:
        hex_endpoint = _proc_net_endpoint(address, port)
    except ValueError:
        collectd.error("%s remote log server credentials "
                       "parse error ; (%s:%s:%s)" %
                       (PLUGIN, protocol, address, port))
        return 1

    # log example   tcp:ipv4:128.224.186.65:514 : IP:41BAE080:0202
    collectd.debug("%s %s:ipv%d:%s:%s : IP:%s" %
                   (PLUGIN, protocol, ipv, address, port, hex_endpoint))

    # cmd example:
    # cat /proc/net/tcp | awk '{print $3 " " $4}'
    #                   | grep 41BAE080:0202
    #                   | awk '{print $2}'
    cmd = "cat /proc/net/" + protocol
    cmd += " | awk '{print $3 \" \" $4}' | grep " + hex_endpoint
    cmd += " | awk '{print $2}'"
    res, err = processutils.execute(cmd, shell=True)
    if err:
        collectd.error("%s processutils error:%s" % (PLUGIN, err))
        collectd.debug("%s Cmd:%s" % (PLUGIN, cmd))
        return 0

    if res and res.rstrip() == '01':
        # connected state reads 01
        # Example log: Res:[01]

        # clear alarm if currently alarmed ; the clear is not debounced
        if obj.alarmed is True:
            clear_alarm()

        # Only log on state change
        if obj.usage != 1:
            logit = True

        obj.usage = 1
        conn = ''

    else:
        # res typically reads 02 when not connected
        # Example log: Res:[02]
        collectd.debug("%s Res:[%s] " % (PLUGIN, res.rstrip()))

        # raise alarm if
        #  - not already alarmed
        #  - debounced by 1 ; need 2 failures in a row
        if obj.alarmed is False and obj.usage == 0:
            raise_alarm()

        # only log on state change
        if obj.usage == 1 or obj.audits == 1:
            logit = True

        obj.usage = 0
        conn = 'not '

    if logit is True:
        collectd.info("%s is %sconnected [%s ipv%d %s:%s]" %
                      (PLUGIN, conn, protocol, ipv, address, port))
    obj.audits += 1

    # Dispatch usage value to collectd
    val = collectd.Values(host=obj.hostname)
    val.plugin = PLUGIN_NAME
    val.type = PLUGIN_TYPE
    val.type_instance = PLUGIN_TYPE_INSTANCE
    val.dispatch(values=[obj.usage])
    return 0
# register the config, init and read functions with collectd ; the read
# function is registered with an interval of PLUGIN_AUDIT_INTERVAL seconds
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_read(read_func, interval=PLUGIN_AUDIT_INTERVAL)