Add node ready check to collectd plugins
This update adds a second collectd plugin initialization enhancement.

The first update added a config complete gate:
https://review.opendev.org/c/starlingx/monitoring/+/736817

It turns out that not all plugins are ready to sample immediately following the config complete state. One example is that FM on the active controller needs time to get going before plugins can query their alarms on startup. Also, some plugins need more time than others.

To account for both cases, this update adds a thresholded node ready gate that can be tailored to a plugin to hold off FM access and sampling until that plugin's ready threshold is reached.

Test Plan:
PASS: Verify AIO SX and DX system install
PROG: Verify Storage system install
PASS: Verify AIO SX node lock and unlock
PASS: Verify AIO Standby controller lock and unlock
PASS: Verify Standard controller lock and unlock
PASS: Verify Compute and Storage node lock and unlock
PASS: Verify Dead-Office-Recovery (AIO DX)
PASS: Verify collectd sampling and logs

Partial-Bug: 1872979
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Change-Id: I044d812542a4222214c7d13e231ac4024cca9800
This commit is contained in:
parent
b66c85287d
commit
ea4b515f91
@ -19,7 +19,6 @@ Source5: plugin_common.py
|
||||
# collectd python plugin files - resource plugins
|
||||
Source11: cpu.py
|
||||
Source12: memory.py
|
||||
Source14: example.py
|
||||
Source15: ntpq.py
|
||||
Source16: interface.py
|
||||
Source17: remotels.py
|
||||
@ -31,7 +30,6 @@ Source100: python_plugins.conf
|
||||
Source101: cpu.conf
|
||||
Source102: memory.conf
|
||||
Source103: df.conf
|
||||
Source104: example.conf
|
||||
Source105: ntpq.conf
|
||||
Source106: interface.conf
|
||||
Source107: remotels.conf
|
||||
@ -82,7 +80,6 @@ install -m 700 %{SOURCE5} %{buildroot}%{local_python_extensions_dir}
|
||||
# collectd python plugin files - resource plugins
|
||||
install -m 700 %{SOURCE11} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir}
|
||||
@ -95,13 +92,20 @@ install -m 600 %{SOURCE100} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE101} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE102} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE107} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE108} %{buildroot}%{local_plugin_dir}
|
||||
install -m 600 %{SOURCE109} %{buildroot}%{local_plugin_dir}
|
||||
|
||||
%pre
|
||||
rm -f /etc/collectd.d/90-default-plugins-syslog.conf
|
||||
rm -f /etc/collectd.d/90-default-plugins-memory.conf
|
||||
rm -f /etc/collectd.d/90-default-plugins-load.conf
|
||||
rm -f /etc/collectd.d/90-default-plugins-interface.conf
|
||||
rm -f /etc/collectd.d/90-default-plugins-cpu.conf
|
||||
|
||||
|
||||
%clean
|
||||
rm -rf $RPM_BUILD_ROOT
|
||||
|
||||
|
@ -3,6 +3,7 @@ Description=Collectd statistics daemon and extension services
|
||||
Documentation=man:collectd(1) man:collectd.conf(5)
|
||||
Before=pmon.service
|
||||
After=local-fs.target network-online.target
|
||||
After=config.service syslog.service
|
||||
Requires=local-fs.target network-online.target
|
||||
|
||||
[Service]
|
||||
|
@ -22,6 +22,7 @@ import plugin_common as pc
|
||||
import re
|
||||
import socket
|
||||
import time
|
||||
import tsconfig.tsconfig as tsc
|
||||
|
||||
PLUGIN = 'platform cpu usage plugin'
|
||||
PLUGIN_DEBUG = 'DEBUG platform cpu'
|
||||
@ -413,7 +414,11 @@ def init_func():
|
||||
|
||||
# do nothing till config is complete.
|
||||
if obj.config_complete() is False:
|
||||
return False
|
||||
return pc.PLUGIN_PASS
|
||||
|
||||
if obj._node_ready is False:
|
||||
obj.node_ready()
|
||||
return pc.PLUGIN_PASS
|
||||
|
||||
obj.hostname = socket.gethostname()
|
||||
|
||||
@ -472,7 +477,7 @@ def read_func():
|
||||
|
||||
if obj.init_complete is False:
|
||||
init_func()
|
||||
return 0
|
||||
return pc.PLUGIN_PASS
|
||||
|
||||
# epoch time in floating seconds
|
||||
now0 = time.time()
|
||||
|
@ -61,6 +61,10 @@ def read_func():
|
||||
init_func()
|
||||
return 0
|
||||
|
||||
if obj._node_ready is False:
|
||||
obj.node_ready()
|
||||
return 0
|
||||
|
||||
# do the work to create the sample
|
||||
low = int(obj.plugin_data[0])
|
||||
high = int(obj.plugin_data[1])
|
||||
|
@ -194,7 +194,6 @@ DF_MANGLED_DICT = {
|
||||
ALARM_ID__CPU = "100.101"
|
||||
ALARM_ID__MEM = "100.103"
|
||||
ALARM_ID__DF = "100.104"
|
||||
ALARM_ID__EXAMPLE = "100.113"
|
||||
|
||||
ALARM_ID__VSWITCH_CPU = "100.102"
|
||||
ALARM_ID__VSWITCH_MEM = "100.115"
|
||||
@ -209,8 +208,7 @@ ALARM_ID_LIST = [ALARM_ID__CPU,
|
||||
ALARM_ID__VSWITCH_CPU,
|
||||
ALARM_ID__VSWITCH_MEM,
|
||||
ALARM_ID__VSWITCH_PORT,
|
||||
ALARM_ID__VSWITCH_IFACE,
|
||||
ALARM_ID__EXAMPLE]
|
||||
ALARM_ID__VSWITCH_IFACE]
|
||||
|
||||
# ADD_NEW_PLUGIN: add plugin name definition
|
||||
# WARNING: This must line up exactly with the plugin
|
||||
@ -224,7 +222,6 @@ PLUGIN__VSWITCH_PORT = "vswitch_port"
|
||||
PLUGIN__VSWITCH_CPU = "vswitch_cpu"
|
||||
PLUGIN__VSWITCH_MEM = "vswitch_mem"
|
||||
PLUGIN__VSWITCH_IFACE = "vswitch_iface"
|
||||
PLUGIN__EXAMPLE = "example"
|
||||
|
||||
# ADD_NEW_PLUGIN: add plugin name to list
|
||||
PLUGIN_NAME_LIST = [PLUGIN__CPU,
|
||||
@ -233,8 +230,7 @@ PLUGIN_NAME_LIST = [PLUGIN__CPU,
|
||||
PLUGIN__VSWITCH_CPU,
|
||||
PLUGIN__VSWITCH_MEM,
|
||||
PLUGIN__VSWITCH_PORT,
|
||||
PLUGIN__VSWITCH_IFACE,
|
||||
PLUGIN__EXAMPLE]
|
||||
PLUGIN__VSWITCH_IFACE]
|
||||
|
||||
# Used to find plugin name based on alarm id
|
||||
# for managing degrade for startup alarms.
|
||||
@ -555,8 +551,6 @@ class fmAlarmObject:
|
||||
database_setup = False # state of database setup
|
||||
database_setup_in_progress = False # connection mutex
|
||||
|
||||
# Set to True once FM connectivity is verified
|
||||
# Used to ensure alarms are queried on startup
|
||||
fm_connectivity = False
|
||||
|
||||
def __init__(self, id, plugin):
|
||||
@ -1312,8 +1306,7 @@ PLUGINS = {
|
||||
PLUGIN__VSWITCH_PORT: fmAlarmObject(ALARM_ID__VSWITCH_PORT,
|
||||
PLUGIN__VSWITCH_PORT),
|
||||
PLUGIN__VSWITCH_IFACE: fmAlarmObject(ALARM_ID__VSWITCH_IFACE,
|
||||
PLUGIN__VSWITCH_IFACE),
|
||||
PLUGIN__EXAMPLE: fmAlarmObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)}
|
||||
PLUGIN__VSWITCH_IFACE)}
|
||||
|
||||
|
||||
#####################################################################
|
||||
@ -1744,12 +1737,6 @@ def init_func():
|
||||
|
||||
###########################################################################
|
||||
|
||||
obj = PLUGINS[PLUGIN__EXAMPLE]
|
||||
obj.resource_name = "Example"
|
||||
obj.instance_name = PLUGIN__EXAMPLE
|
||||
obj.repair = "Not Applicable"
|
||||
collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))
|
||||
|
||||
# ...
|
||||
# ADD_NEW_PLUGIN: Add new plugin object initialization here ...
|
||||
# ...
|
||||
@ -1772,8 +1759,17 @@ def notifier_func(nObject):
|
||||
if pluginObject.config_complete() is False:
|
||||
return 0
|
||||
|
||||
if fmAlarmObject.fm_connectivity is False:
|
||||
if pluginObject._node_ready is False:
|
||||
collectd.info("%s %s not ready ; from:%s:%s:%s" %
|
||||
(PLUGIN,
|
||||
fmAlarmObject.host,
|
||||
nObject.host,
|
||||
nObject.plugin,
|
||||
nObject.plugin_instance))
|
||||
pluginObject.node_ready()
|
||||
return 0
|
||||
|
||||
if fmAlarmObject.fm_connectivity is False:
|
||||
# handle multi threading startup
|
||||
with fmAlarmObject.lock:
|
||||
if fmAlarmObject.fm_connectivity is True:
|
||||
@ -1791,8 +1787,12 @@ def notifier_func(nObject):
|
||||
try:
|
||||
alarms = api.get_faults_by_id(alarm_id)
|
||||
except Exception as ex:
|
||||
collectd.error("%s 'get_faults_by_id' exception ; %s" %
|
||||
(PLUGIN, ex))
|
||||
collectd.warning("%s 'get_faults_by_id' exception ; %s" %
|
||||
(PLUGIN, ex))
|
||||
|
||||
# if fm is not responding then the node is not ready
|
||||
pluginObject._node_ready = False
|
||||
pluginObject.node_ready_count = 0
|
||||
return 0
|
||||
|
||||
if alarms:
|
||||
@ -1861,7 +1861,7 @@ def notifier_func(nObject):
|
||||
(PLUGIN_DEGRADE, eid, alarm_id))
|
||||
|
||||
fmAlarmObject.fm_connectivity = True
|
||||
collectd.info("%s connectivity with fm complete" % PLUGIN)
|
||||
collectd.info("%s node ready" % PLUGIN)
|
||||
|
||||
collectd.debug('%s notification: %s %s:%s - %s %s %s [%s]' % (
|
||||
PLUGIN,
|
||||
|
@ -829,6 +829,10 @@ def read_func():
|
||||
init_func()
|
||||
return 0
|
||||
|
||||
if obj._node_ready is False:
|
||||
obj.node_ready()
|
||||
return 0
|
||||
|
||||
if obj.phase < RUN_PHASE__ALARMS_CLEARED:
|
||||
|
||||
# clear all alarms on first audit
|
||||
|
@ -373,6 +373,9 @@ def init_func():
|
||||
if obj.config_complete() is False:
|
||||
return 0
|
||||
|
||||
# override node ready threshold for this plugin
|
||||
obj.node_ready_threshold = 1
|
||||
|
||||
obj.hostname = socket.gethostname()
|
||||
collectd.info('%s: init function for %s' % (PLUGIN, obj.hostname))
|
||||
|
||||
@ -398,6 +401,10 @@ def read_func():
|
||||
init_func()
|
||||
return 0
|
||||
|
||||
if obj._node_ready is False:
|
||||
obj.node_ready()
|
||||
return 0
|
||||
|
||||
# Get epoch time in floating seconds
|
||||
now0 = time.time()
|
||||
|
||||
|
@ -555,6 +555,10 @@ def init_func():
|
||||
if obj.config_complete() is False:
|
||||
return 0
|
||||
|
||||
if obj._node_ready is False:
|
||||
obj.node_ready()
|
||||
return 0
|
||||
|
||||
# get current hostname
|
||||
obj.hostname = obj.gethostname()
|
||||
if not obj.hostname:
|
||||
|
@ -903,6 +903,10 @@ def read_func():
|
||||
init_func()
|
||||
return 0
|
||||
|
||||
if obj._node_ready is False:
|
||||
obj.node_ready()
|
||||
return 0
|
||||
|
||||
if obj.phase < RUN_PHASE__ALARMS_CLEARED:
|
||||
|
||||
# clear all alarms on first audit
|
||||
|
@ -114,7 +114,7 @@ class PluginObject(object):
|
||||
self._config_complete = False # set to True once config is complete
|
||||
self.config_done = False # set true if config_func completed ok
|
||||
self.init_complete = False # set true if init_func completed ok
|
||||
self.fm_connectivity = False # set true when fm connectivity ok
|
||||
self._node_ready = False # set true when node is ready
|
||||
|
||||
self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL
|
||||
self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS
|
||||
@ -146,9 +146,8 @@ class PluginObject(object):
|
||||
self.http_retry_count = 0 # track http error cases
|
||||
self.HTTP_RETRY_THROTTLE = 6 # http retry threshold
|
||||
self.phase = 0 # tracks current phase; init, sampling
|
||||
|
||||
collectd.debug("%s Common PluginObject constructor [%s]" %
|
||||
(plugin, url))
|
||||
self.node_ready_threshold = 3 # wait for node ready before sampling
|
||||
self.node_ready_count = 0 # node ready count up counter
|
||||
|
||||
###########################################################################
|
||||
#
|
||||
@ -206,6 +205,35 @@ class PluginObject(object):
|
||||
|
||||
return True
|
||||
|
||||
###########################################################################
|
||||
#
|
||||
# Name : node_ready
|
||||
#
|
||||
# Description: Test for node ready condition.
|
||||
# Currently that's just a thresholded count, which is
# only applied when the node type is 'controller'.
|
||||
#
|
||||
# Parameters : None
|
||||
#
|
||||
# Returns : False if node is not ready
|
||||
# True if node is ready
|
||||
#
|
||||
###########################################################################
|
||||
|
||||
def node_ready(self):
|
||||
"""Check for node ready state"""
|
||||
|
||||
if tsc.nodetype == 'controller':
|
||||
self.node_ready_count += 1
|
||||
if self.node_ready_count < self.node_ready_threshold:
|
||||
collectd.info("%s node ready count %d of %d" %
|
||||
(self.plugin,
|
||||
self.node_ready_count,
|
||||
self.node_ready_threshold))
|
||||
return False
|
||||
|
||||
self._node_ready = True
|
||||
return True
|
||||
|
||||
###########################################################################
|
||||
#
|
||||
# Name : gethostname
|
||||
|
@ -661,7 +661,9 @@ def read_func():
|
||||
init_func()
|
||||
return 0
|
||||
|
||||
if obj.fm_connectivity is False:
|
||||
if obj._node_ready is False:
|
||||
if obj.node_ready() is False:
|
||||
return 0
|
||||
|
||||
try:
|
||||
# query FM for existing alarms.
|
||||
@ -723,9 +725,6 @@ def read_func():
|
||||
else:
|
||||
collectd.info("%s no startup alarms found" % PLUGIN)
|
||||
|
||||
obj.fm_connectivity = True
|
||||
# assert_all_alarms()
|
||||
|
||||
# This plugin supports PTP in-service state change by checking
|
||||
# service state on every audit ; every 5 minutes.
|
||||
data = subprocess.check_output([SYSTEMCTL,
|
||||
|
@ -169,6 +169,10 @@ def read_func():
|
||||
init_func()
|
||||
return 0
|
||||
|
||||
if obj._node_ready is False:
|
||||
obj.node_ready()
|
||||
return 0
|
||||
|
||||
# get current state
|
||||
current_enabled_state = obj.enabled
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user