From ea4b515f91f38523a22e877ebba9d552962153b2 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Mon, 25 Jan 2021 09:40:06 -0500 Subject: [PATCH] Add node ready check to collectd plugins This update adds a second collectd plugin initialization enhancement. The first update added a config complete gate: https://review.opendev.org/c/starlingx/monitoring/+/736817 It turns out that not all plugins are ready to sample immediately following the config complete state. One example is that FM on the active controller needs time to get going before plugins can query their alarms on startup. Also, some plugins need more time than others. To account for both cases, this update adds a thresholded node ready gate that can be tailored to a plugin to hold off FM access and sampling until its ready threshold is reached. Test Plan: PASS: Verify AIO SX and DX system install PROG: Verify Storage system install PASS: Verify AIO SX node lock and unlock PASS: Verify AIO Standby controller lock and unlock PASS: Verify Standard controller lock and unlock PASS: Verify Compute and Storage node lock and unlock PASS: Verify Dead-Office-Recovery (AIO DX) PASS: Verify collectd sampling and logs Partial-Bug: 1872979 Signed-off-by: Eric MacDonald Change-Id: I044d812542a4222214c7d13e231ac4024cca9800 --- .../centos/collectd-extensions.spec | 12 ++++-- collectd-extensions/src/collectd.service | 1 + collectd-extensions/src/cpu.py | 9 ++++- collectd-extensions/src/example.py | 4 ++ collectd-extensions/src/fm_notifier.py | 40 +++++++++---------- collectd-extensions/src/interface.py | 4 ++ collectd-extensions/src/memory.py | 7 ++++ collectd-extensions/src/ntpq.py | 4 ++ collectd-extensions/src/ovs_interface.py | 4 ++ collectd-extensions/src/plugin_common.py | 36 +++++++++++++++-- collectd-extensions/src/ptp.py | 7 ++-- collectd-extensions/src/remotels.py | 4 ++ 12 files changed, 98 insertions(+), 34 deletions(-) diff --git a/collectd-extensions/centos/collectd-extensions.spec 
b/collectd-extensions/centos/collectd-extensions.spec index 8c76983..cbbbc45 100644 --- a/collectd-extensions/centos/collectd-extensions.spec +++ b/collectd-extensions/centos/collectd-extensions.spec @@ -19,7 +19,6 @@ Source5: plugin_common.py # collectd python plugin files - resource plugins Source11: cpu.py Source12: memory.py -Source14: example.py Source15: ntpq.py Source16: interface.py Source17: remotels.py @@ -31,7 +30,6 @@ Source100: python_plugins.conf Source101: cpu.conf Source102: memory.conf Source103: df.conf -Source104: example.conf Source105: ntpq.conf Source106: interface.conf Source107: remotels.conf @@ -82,7 +80,6 @@ install -m 700 %{SOURCE5} %{buildroot}%{local_python_extensions_dir} # collectd python plugin files - resource plugins install -m 700 %{SOURCE11} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir} -install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir} @@ -95,13 +92,20 @@ install -m 600 %{SOURCE100} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE101} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE102} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir} -install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE107} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE108} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE109} %{buildroot}%{local_plugin_dir} +%pre +rm -f /etc/collectd.d/90-default-plugins-syslog.conf +rm -f /etc/collectd.d/90-default-plugins-memory.conf +rm -f /etc/collectd.d/90-default-plugins-load.conf +rm -f 
/etc/collectd.d/90-default-plugins-interface.conf +rm -f /etc/collectd.d/90-default-plugins-cpu.conf + + %clean rm -rf $RPM_BUILD_ROOT diff --git a/collectd-extensions/src/collectd.service b/collectd-extensions/src/collectd.service index 1ac7cb0..f2b62cd 100644 --- a/collectd-extensions/src/collectd.service +++ b/collectd-extensions/src/collectd.service @@ -3,6 +3,7 @@ Description=Collectd statistics daemon and extension services Documentation=man:collectd(1) man:collectd.conf(5) Before=pmon.service After=local-fs.target network-online.target +After=config.service syslog.service Requires=local-fs.target network-online.target [Service] diff --git a/collectd-extensions/src/cpu.py b/collectd-extensions/src/cpu.py index cc05323..123cfa5 100755 --- a/collectd-extensions/src/cpu.py +++ b/collectd-extensions/src/cpu.py @@ -22,6 +22,7 @@ import plugin_common as pc import re import socket import time +import tsconfig.tsconfig as tsc PLUGIN = 'platform cpu usage plugin' PLUGIN_DEBUG = 'DEBUG platform cpu' @@ -413,7 +414,11 @@ def init_func(): # do nothing till config is complete. 
if obj.config_complete() is False: - return False + return pc.PLUGIN_PASS + + if obj._node_ready is False: + obj.node_ready() + return pc.PLUGIN_PASS obj.hostname = socket.gethostname() @@ -472,7 +477,7 @@ def read_func(): if obj.init_complete is False: init_func() - return 0 + return pc.PLUGIN_PASS # epoch time in floating seconds now0 = time.time() diff --git a/collectd-extensions/src/example.py b/collectd-extensions/src/example.py index 50ad8c4..11767da 100755 --- a/collectd-extensions/src/example.py +++ b/collectd-extensions/src/example.py @@ -61,6 +61,10 @@ def read_func(): init_func() return 0 + if obj._node_ready is False: + obj.node_ready() + return 0 + # do the work to create the sample low = int(obj.plugin_data[0]) high = int(obj.plugin_data[1]) diff --git a/collectd-extensions/src/fm_notifier.py b/collectd-extensions/src/fm_notifier.py index d2a5297..4e06d89 100755 --- a/collectd-extensions/src/fm_notifier.py +++ b/collectd-extensions/src/fm_notifier.py @@ -194,7 +194,6 @@ DF_MANGLED_DICT = { ALARM_ID__CPU = "100.101" ALARM_ID__MEM = "100.103" ALARM_ID__DF = "100.104" -ALARM_ID__EXAMPLE = "100.113" ALARM_ID__VSWITCH_CPU = "100.102" ALARM_ID__VSWITCH_MEM = "100.115" @@ -209,8 +208,7 @@ ALARM_ID_LIST = [ALARM_ID__CPU, ALARM_ID__VSWITCH_CPU, ALARM_ID__VSWITCH_MEM, ALARM_ID__VSWITCH_PORT, - ALARM_ID__VSWITCH_IFACE, - ALARM_ID__EXAMPLE] + ALARM_ID__VSWITCH_IFACE] # ADD_NEW_PLUGIN: add plugin name definition # WARNING: This must line up exactly with the plugin @@ -224,7 +222,6 @@ PLUGIN__VSWITCH_PORT = "vswitch_port" PLUGIN__VSWITCH_CPU = "vswitch_cpu" PLUGIN__VSWITCH_MEM = "vswitch_mem" PLUGIN__VSWITCH_IFACE = "vswitch_iface" -PLUGIN__EXAMPLE = "example" # ADD_NEW_PLUGIN: add plugin name to list PLUGIN_NAME_LIST = [PLUGIN__CPU, @@ -233,8 +230,7 @@ PLUGIN_NAME_LIST = [PLUGIN__CPU, PLUGIN__VSWITCH_CPU, PLUGIN__VSWITCH_MEM, PLUGIN__VSWITCH_PORT, - PLUGIN__VSWITCH_IFACE, - PLUGIN__EXAMPLE] + PLUGIN__VSWITCH_IFACE] # Used to find plugin name based on alarm id # 
for managing degrade for startup alarms. @@ -555,8 +551,6 @@ class fmAlarmObject: database_setup = False # state of database setup database_setup_in_progress = False # connection mutex - # Set to True once FM connectivity is verified - # Used to ensure alarms are queried on startup fm_connectivity = False def __init__(self, id, plugin): @@ -1312,8 +1306,7 @@ PLUGINS = { PLUGIN__VSWITCH_PORT: fmAlarmObject(ALARM_ID__VSWITCH_PORT, PLUGIN__VSWITCH_PORT), PLUGIN__VSWITCH_IFACE: fmAlarmObject(ALARM_ID__VSWITCH_IFACE, - PLUGIN__VSWITCH_IFACE), - PLUGIN__EXAMPLE: fmAlarmObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)} + PLUGIN__VSWITCH_IFACE)} ##################################################################### @@ -1744,12 +1737,6 @@ def init_func(): ########################################################################### - obj = PLUGINS[PLUGIN__EXAMPLE] - obj.resource_name = "Example" - obj.instance_name = PLUGIN__EXAMPLE - obj.repair = "Not Applicable" - collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name)) - # ... # ADD_NEW_PLUGIN: Add new plugin object initialization here ... # ... 
@@ -1772,8 +1759,17 @@ def notifier_func(nObject): if pluginObject.config_complete() is False: return 0 - if fmAlarmObject.fm_connectivity is False: + if pluginObject._node_ready is False: + collectd.info("%s %s not ready ; from:%s:%s:%s" % + (PLUGIN, + fmAlarmObject.host, + nObject.host, + nObject.plugin, + nObject.plugin_instance)) + pluginObject.node_ready() + return 0 + if fmAlarmObject.fm_connectivity is False: # handle multi threading startup with fmAlarmObject.lock: if fmAlarmObject.fm_connectivity is True: @@ -1791,8 +1787,12 @@ def notifier_func(nObject): try: alarms = api.get_faults_by_id(alarm_id) except Exception as ex: - collectd.error("%s 'get_faults_by_id' exception ; %s" % - (PLUGIN, ex)) + collectd.warning("%s 'get_faults_by_id' exception ; %s" % + (PLUGIN, ex)) + + # if fm is not responding then the node is not ready + pluginObject._node_ready = False + pluginObject.node_ready_count = 0 return 0 if alarms: @@ -1861,7 +1861,7 @@ def notifier_func(nObject): (PLUGIN_DEGRADE, eid, alarm_id)) fmAlarmObject.fm_connectivity = True - collectd.info("%s connectivity with fm complete" % PLUGIN) + collectd.info("%s node ready" % PLUGIN) collectd.debug('%s notification: %s %s:%s - %s %s %s [%s]' % ( PLUGIN, diff --git a/collectd-extensions/src/interface.py b/collectd-extensions/src/interface.py index 0a2022c..e705329 100755 --- a/collectd-extensions/src/interface.py +++ b/collectd-extensions/src/interface.py @@ -829,6 +829,10 @@ def read_func(): init_func() return 0 + if obj._node_ready is False: + obj.node_ready() + return 0 + if obj.phase < RUN_PHASE__ALARMS_CLEARED: # clear all alarms on first audit diff --git a/collectd-extensions/src/memory.py b/collectd-extensions/src/memory.py index a163a7f..25f5915 100755 --- a/collectd-extensions/src/memory.py +++ b/collectd-extensions/src/memory.py @@ -373,6 +373,9 @@ def init_func(): if obj.config_complete() is False: return 0 + # override node ready threshold for this plugin + obj.node_ready_threshold = 1 + 
obj.hostname = socket.gethostname() collectd.info('%s: init function for %s' % (PLUGIN, obj.hostname)) @@ -398,6 +401,10 @@ def read_func(): init_func() return 0 + if obj._node_ready is False: + obj.node_ready() + return 0 + # Get epoch time in floating seconds now0 = time.time() diff --git a/collectd-extensions/src/ntpq.py b/collectd-extensions/src/ntpq.py index 166f513..e6f31ef 100755 --- a/collectd-extensions/src/ntpq.py +++ b/collectd-extensions/src/ntpq.py @@ -555,6 +555,10 @@ def init_func(): if obj.config_complete() is False: return 0 + if obj._node_ready is False: + obj.node_ready() + return 0 + # get current hostname obj.hostname = obj.gethostname() if not obj.hostname: diff --git a/collectd-extensions/src/ovs_interface.py b/collectd-extensions/src/ovs_interface.py index d2877c6..7033f66 100755 --- a/collectd-extensions/src/ovs_interface.py +++ b/collectd-extensions/src/ovs_interface.py @@ -903,6 +903,10 @@ def read_func(): init_func() return 0 + if obj._node_ready is False: + obj.node_ready() + return 0 + if obj.phase < RUN_PHASE__ALARMS_CLEARED: # clear all alarms on first audit diff --git a/collectd-extensions/src/plugin_common.py b/collectd-extensions/src/plugin_common.py index 934de28..33082c0 100644 --- a/collectd-extensions/src/plugin_common.py +++ b/collectd-extensions/src/plugin_common.py @@ -114,7 +114,7 @@ class PluginObject(object): self._config_complete = False # set to True once config is complete self.config_done = False # set true if config_func completed ok self.init_complete = False # set true if init_func completed ok - self.fm_connectivity = False # set true when fm connectivity ok + self._node_ready = False # set true when node is ready self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS @@ -146,9 +146,8 @@ class PluginObject(object): self.http_retry_count = 0 # track http error cases self.HTTP_RETRY_THROTTLE = 6 # http retry threshold self.phase = 0 # tracks 
current phase; init, sampling - - collectd.debug("%s Common PluginObject constructor [%s]" % - (plugin, url)) + self.node_ready_threshold = 3 # wait for node ready before sampling + self.node_ready_count = 0 # node ready count up counter ########################################################################### # @@ -206,6 +205,35 @@ class PluginObject(object): return True + ########################################################################### + # + # Name : node_ready + # + # Description: Test for node ready condition. + # Currently that's just a thresholded count + # + # Parameters : plugin name + # + # Returns : False if node is not ready + # True if node is ready + # + ########################################################################### + + def node_ready(self): + """Check for node ready state""" + + if tsc.nodetype == 'controller': + self.node_ready_count += 1 + if self.node_ready_count < self.node_ready_threshold: + collectd.info("%s node ready count %d of %d" % + (self.plugin, + self.node_ready_count, + self.node_ready_threshold)) + return False + + self._node_ready = True + return True + ########################################################################### # # Name : gethostname diff --git a/collectd-extensions/src/ptp.py b/collectd-extensions/src/ptp.py index 5d3ad93..f083061 100755 --- a/collectd-extensions/src/ptp.py +++ b/collectd-extensions/src/ptp.py @@ -661,7 +661,9 @@ def read_func(): init_func() return 0 - if obj.fm_connectivity is False: + if obj._node_ready is False: + if obj.node_ready() is False: + return 0 try: # query FM for existing alarms. @@ -723,9 +725,6 @@ def read_func(): else: collectd.info("%s no startup alarms found" % PLUGIN) - obj.fm_connectivity = True - # assert_all_alarms() - # This plugin supports PTP in-service state change by checking # service state on every audit ; every 5 minutes. 
data = subprocess.check_output([SYSTEMCTL, diff --git a/collectd-extensions/src/remotels.py b/collectd-extensions/src/remotels.py index 1330220..7207fab 100755 --- a/collectd-extensions/src/remotels.py +++ b/collectd-extensions/src/remotels.py @@ -169,6 +169,10 @@ def read_func(): init_func() return 0 + if obj._node_ready is False: + obj.node_ready() + return 0 + # get current state current_enabled_state = obj.enabled