Add node ready check to collectd plugins

This update adds a second collectd plugin
initialization enhancement. The first update
added a config complete gate:

https://review.opendev.org/c/starlingx/monitoring/+/736817

It turns out that not all plugins are ready to sample
immediately following the config complete state.
For example, FM on the active controller needs
time to get going before plugins can query their
alarms on startup. Also, some plugins need more
time than others.

To account for both cases, this update adds a
thresholded node ready gate that can be tailored
per plugin to hold off FM access and sampling
until that plugin's ready threshold is reached.

Test Plan:

PASS: Verify AIO SX and DX system install
PROG: Verify Storage system install
PASS: Verify AIO SX node lock and unlock
PASS: Verify AIO Standby controller lock and unlock
PASS: Verify Standard controller lock and unlock
PASS: Verify Compute and Storage node lock and unlock
PASS: Verify Dead-Office-Recovery (AIO DX)
PASS: Verify collectd sampling and logs

Partial-Bug: 1872979
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Change-Id: I044d812542a4222214c7d13e231ac4024cca9800
This commit is contained in:
Eric MacDonald 2021-01-25 09:40:06 -05:00
parent b66c85287d
commit ea4b515f91
12 changed files with 98 additions and 34 deletions

View File

@ -19,7 +19,6 @@ Source5: plugin_common.py
# collectd python plugin files - resource plugins
Source11: cpu.py
Source12: memory.py
Source14: example.py
Source15: ntpq.py
Source16: interface.py
Source17: remotels.py
@ -31,7 +30,6 @@ Source100: python_plugins.conf
Source101: cpu.conf
Source102: memory.conf
Source103: df.conf
Source104: example.conf
Source105: ntpq.conf
Source106: interface.conf
Source107: remotels.conf
@ -82,7 +80,6 @@ install -m 700 %{SOURCE5} %{buildroot}%{local_python_extensions_dir}
# collectd python plugin files - resource plugins
install -m 700 %{SOURCE11} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir}
@ -95,13 +92,20 @@ install -m 600 %{SOURCE100} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE101} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE102} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE107} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE108} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE109} %{buildroot}%{local_plugin_dir}
%pre
rm -f /etc/collectd.d/90-default-plugins-syslog.conf
rm -f /etc/collectd.d/90-default-plugins-memory.conf
rm -f /etc/collectd.d/90-default-plugins-load.conf
rm -f /etc/collectd.d/90-default-plugins-interface.conf
rm -f /etc/collectd.d/90-default-plugins-cpu.conf
%clean
rm -rf $RPM_BUILD_ROOT

View File

@ -3,6 +3,7 @@ Description=Collectd statistics daemon and extension services
Documentation=man:collectd(1) man:collectd.conf(5)
Before=pmon.service
After=local-fs.target network-online.target
After=config.service syslog.service
Requires=local-fs.target network-online.target
[Service]

View File

@ -22,6 +22,7 @@ import plugin_common as pc
import re
import socket
import time
import tsconfig.tsconfig as tsc
PLUGIN = 'platform cpu usage plugin'
PLUGIN_DEBUG = 'DEBUG platform cpu'
@ -413,7 +414,11 @@ def init_func():
# do nothing till config is complete.
if obj.config_complete() is False:
return False
return pc.PLUGIN_PASS
if obj._node_ready is False:
obj.node_ready()
return pc.PLUGIN_PASS
obj.hostname = socket.gethostname()
@ -472,7 +477,7 @@ def read_func():
if obj.init_complete is False:
init_func()
return 0
return pc.PLUGIN_PASS
# epoch time in floating seconds
now0 = time.time()

View File

@ -61,6 +61,10 @@ def read_func():
init_func()
return 0
if obj._node_ready is False:
obj.node_ready()
return 0
# do the work to create the sample
low = int(obj.plugin_data[0])
high = int(obj.plugin_data[1])

View File

@ -194,7 +194,6 @@ DF_MANGLED_DICT = {
ALARM_ID__CPU = "100.101"
ALARM_ID__MEM = "100.103"
ALARM_ID__DF = "100.104"
ALARM_ID__EXAMPLE = "100.113"
ALARM_ID__VSWITCH_CPU = "100.102"
ALARM_ID__VSWITCH_MEM = "100.115"
@ -209,8 +208,7 @@ ALARM_ID_LIST = [ALARM_ID__CPU,
ALARM_ID__VSWITCH_CPU,
ALARM_ID__VSWITCH_MEM,
ALARM_ID__VSWITCH_PORT,
ALARM_ID__VSWITCH_IFACE,
ALARM_ID__EXAMPLE]
ALARM_ID__VSWITCH_IFACE]
# ADD_NEW_PLUGIN: add plugin name definition
# WARNING: This must line up exactly with the plugin
@ -224,7 +222,6 @@ PLUGIN__VSWITCH_PORT = "vswitch_port"
PLUGIN__VSWITCH_CPU = "vswitch_cpu"
PLUGIN__VSWITCH_MEM = "vswitch_mem"
PLUGIN__VSWITCH_IFACE = "vswitch_iface"
PLUGIN__EXAMPLE = "example"
# ADD_NEW_PLUGIN: add plugin name to list
PLUGIN_NAME_LIST = [PLUGIN__CPU,
@ -233,8 +230,7 @@ PLUGIN_NAME_LIST = [PLUGIN__CPU,
PLUGIN__VSWITCH_CPU,
PLUGIN__VSWITCH_MEM,
PLUGIN__VSWITCH_PORT,
PLUGIN__VSWITCH_IFACE,
PLUGIN__EXAMPLE]
PLUGIN__VSWITCH_IFACE]
# Used to find plugin name based on alarm id
# for managing degrade for startup alarms.
@ -555,8 +551,6 @@ class fmAlarmObject:
database_setup = False # state of database setup
database_setup_in_progress = False # connection mutex
# Set to True once FM connectivity is verified
# Used to ensure alarms are queried on startup
fm_connectivity = False
def __init__(self, id, plugin):
@ -1312,8 +1306,7 @@ PLUGINS = {
PLUGIN__VSWITCH_PORT: fmAlarmObject(ALARM_ID__VSWITCH_PORT,
PLUGIN__VSWITCH_PORT),
PLUGIN__VSWITCH_IFACE: fmAlarmObject(ALARM_ID__VSWITCH_IFACE,
PLUGIN__VSWITCH_IFACE),
PLUGIN__EXAMPLE: fmAlarmObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)}
PLUGIN__VSWITCH_IFACE)}
#####################################################################
@ -1744,12 +1737,6 @@ def init_func():
###########################################################################
obj = PLUGINS[PLUGIN__EXAMPLE]
obj.resource_name = "Example"
obj.instance_name = PLUGIN__EXAMPLE
obj.repair = "Not Applicable"
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
# ...
# ADD_NEW_PLUGIN: Add new plugin object initialization here ...
# ...
@ -1772,8 +1759,17 @@ def notifier_func(nObject):
if pluginObject.config_complete() is False:
return 0
if fmAlarmObject.fm_connectivity is False:
if pluginObject._node_ready is False:
collectd.info("%s %s not ready ; from:%s:%s:%s" %
(PLUGIN,
fmAlarmObject.host,
nObject.host,
nObject.plugin,
nObject.plugin_instance))
pluginObject.node_ready()
return 0
if fmAlarmObject.fm_connectivity is False:
# handle multi threading startup
with fmAlarmObject.lock:
if fmAlarmObject.fm_connectivity is True:
@ -1791,8 +1787,12 @@ def notifier_func(nObject):
try:
alarms = api.get_faults_by_id(alarm_id)
except Exception as ex:
collectd.error("%s 'get_faults_by_id' exception ; %s" %
(PLUGIN, ex))
collectd.warning("%s 'get_faults_by_id' exception ; %s" %
(PLUGIN, ex))
# if fm is not responding then the node is not ready
pluginObject._node_ready = False
pluginObject.node_ready_count = 0
return 0
if alarms:
@ -1861,7 +1861,7 @@ def notifier_func(nObject):
(PLUGIN_DEGRADE, eid, alarm_id))
fmAlarmObject.fm_connectivity = True
collectd.info("%s connectivity with fm complete" % PLUGIN)
collectd.info("%s node ready" % PLUGIN)
collectd.debug('%s notification: %s %s:%s - %s %s %s [%s]' % (
PLUGIN,

View File

@ -829,6 +829,10 @@ def read_func():
init_func()
return 0
if obj._node_ready is False:
obj.node_ready()
return 0
if obj.phase < RUN_PHASE__ALARMS_CLEARED:
# clear all alarms on first audit

View File

@ -373,6 +373,9 @@ def init_func():
if obj.config_complete() is False:
return 0
# override node ready threshold for this plugin
obj.node_ready_threshold = 1
obj.hostname = socket.gethostname()
collectd.info('%s: init function for %s' % (PLUGIN, obj.hostname))
@ -398,6 +401,10 @@ def read_func():
init_func()
return 0
if obj._node_ready is False:
obj.node_ready()
return 0
# Get epoch time in floating seconds
now0 = time.time()

View File

@ -555,6 +555,10 @@ def init_func():
if obj.config_complete() is False:
return 0
if obj._node_ready is False:
obj.node_ready()
return 0
# get current hostname
obj.hostname = obj.gethostname()
if not obj.hostname:

View File

@ -903,6 +903,10 @@ def read_func():
init_func()
return 0
if obj._node_ready is False:
obj.node_ready()
return 0
if obj.phase < RUN_PHASE__ALARMS_CLEARED:
# clear all alarms on first audit

View File

@ -114,7 +114,7 @@ class PluginObject(object):
self._config_complete = False # set to True once config is complete
self.config_done = False # set true if config_func completed ok
self.init_complete = False # set true if init_func completed ok
self.fm_connectivity = False # set true when fm connectivity ok
self._node_ready = False # set true when node is ready
self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL
self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS
@ -146,9 +146,8 @@ class PluginObject(object):
self.http_retry_count = 0 # track http error cases
self.HTTP_RETRY_THROTTLE = 6 # http retry threshold
self.phase = 0 # tracks current phase; init, sampling
collectd.debug("%s Common PluginObject constructor [%s]" %
(plugin, url))
self.node_ready_threshold = 3 # wait for node ready before sampling
self.node_ready_count = 0 # node ready count up counter
###########################################################################
#
@ -206,6 +205,35 @@ class PluginObject(object):
return True
###########################################################################
#
# Name : node_ready
#
# Description: Test for node ready condition.
# Currently that's just a thresholded count.
#
# Parameters : None
#
# Returns : False if node is not ready
# True if node is ready
#
###########################################################################
def node_ready(self):
"""Check for node ready state.

Compares self.node_ready_count against self.node_ready_threshold.
While the count is below the threshold an info log showing the
current count and threshold is emitted and False is returned.
Otherwise self._node_ready is set to True and True is returned.

NOTE(review): indentation is not visible in this view. Presumably
the count increment and threshold test are nested under the
tsc.nodetype == 'controller' check, so other node types are
reported ready on the first call - confirm against the file.

Returns:
    False if the node is not ready yet, True if it is ready.
"""
# the thresholded hold-off is keyed on the controller node type
if tsc.nodetype == 'controller':
# one more readiness poll has been seen
self.node_ready_count += 1
# still short of the threshold ; log progress and hold off
if self.node_ready_count < self.node_ready_threshold:
collectd.info("%s node ready count %d of %d" %
(self.plugin,
self.node_ready_count,
self.node_ready_threshold))
return False
# record the ready state on the object ; callers test _node_ready
self._node_ready = True
return True
###########################################################################
#
# Name : gethostname

View File

@ -661,7 +661,9 @@ def read_func():
init_func()
return 0
if obj.fm_connectivity is False:
if obj._node_ready is False:
if obj.node_ready() is False:
return 0
try:
# query FM for existing alarms.
@ -723,9 +725,6 @@ def read_func():
else:
collectd.info("%s no startup alarms found" % PLUGIN)
obj.fm_connectivity = True
# assert_all_alarms()
# This plugin supports PTP in-service state change by checking
# service state on every audit ; every 5 minutes.
data = subprocess.check_output([SYSTEMCTL,

View File

@ -169,6 +169,10 @@ def read_func():
init_func()
return 0
if obj._node_ready is False:
obj.node_ready()
return 0
# get current state
current_enabled_state = obj.enabled