Add PTP monitoring to collectd
This update adds Precision Time Protocol (PTP) monitoring to the current list of in-house developed collectd plugins. Refer to the ptp.py header for a description of the monitoring service algorithm and inline comments for detailed behavior. Test Plan: Usability: ----------- PASS: Verify monitoring behavior around ptp service enable and disable PASS: Verify ptp monitoring behavior over lock and unlock PASS: Verify behavior with bonded interfaces (skew oot alarm) PASS: Verify no-lock hosts lock to remote grandmaster when available PASS: Verify AIO SX PTP Enable over Lock/Unlock System Level: ------------- PASS: Verify large system install PASS: Verify AIO SX system install Host Level: ----------- PASS: Verify controller monitoring PASS: Verify worker monitoring PASS: Verify storage monitoring PASS: Verify worker/storage behavior when the only controller is rebooted. PASS: Verify startup handling of fm calls while fm is not running PASS: Verify runtime handling of fm calls while fm is not running Config Level: ------------- PASS: Verify PTP Enable and auto start monitoring PASS: Verify PTP Disable and auto stop monitoring PASS: Verify audit interval is every 60 seconds PASS: Verify hardware timestamp monitoring PASS: Verify software timestamp monitoring PASS: Verify legacy timestamp monitoring PASS: Verify hardware to software config change PASS: Verify software to legacy config change PASS: Verify legacy to hardware config change PASS: Verify software to hardware config change Alarm Management: ----------------- PASS: Verify end-to-end handling of 'nolock' alarm management PASS: Verify end-to-end handling of 'out-of-tolerance' alarm management PASS: Verify end-to-end handling of 'process' alarm management PASS: Verify end-to-end handling of 'unsupported mode' alarm management PASS: Verify all ptp alarms get cleared on collectd process start PASS: Verify plugin startup behavior when FM is not running PASS: Verify plugin with FM V2 API PASS: Verify thresholded 
out-of-tolerance alarm handling PASS: Verify plugin logging is value added PASS: Verify alarm assert debounce of 2 PASS: Verify alarm clear with no debounce PASS: Verify only major out-of-tolerance alarm for software mode PASS: Verify only major out-of-tolerance alarm for legacy mode PASS: Verify minor/major out-of-tolerance alarm for hardware mode PASS: Verify no-lock alarm if compute GM ID is the same as its own PASS: Verify no-lock alarm is not raised on GM reboot PASS: Verify GM switches to alternate when GM host is rebooted Change-Id: If36aece94dd5511bf9deba0753f3863237e2a7fe Story: 2002823 Task: 29492 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
a2b1bc6f05
commit
b4a23c57aa
@ -18,7 +18,8 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
|
|||||||
$PKG_BASE/src/interface.conf \
|
$PKG_BASE/src/interface.conf \
|
||||||
$PKG_BASE/src/remotels.py \
|
$PKG_BASE/src/remotels.py \
|
||||||
$PKG_BASE/src/remotels.conf \
|
$PKG_BASE/src/remotels.conf \
|
||||||
|
$PKG_BASE/src/ptp.py \
|
||||||
|
$PKG_BASE/src/ptp.conf \
|
||||||
$PKG_BASE/src/example.py \
|
$PKG_BASE/src/example.py \
|
||||||
$PKG_BASE/src/example.conf"
|
$PKG_BASE/src/example.conf"
|
||||||
|
TIS_PATCH_VER=9
|
||||||
TIS_PATCH_VER=8
|
|
||||||
|
@ -24,6 +24,7 @@ Source14: example.py
|
|||||||
Source15: ntpq.py
|
Source15: ntpq.py
|
||||||
Source16: interface.py
|
Source16: interface.py
|
||||||
Source17: remotels.py
|
Source17: remotels.py
|
||||||
|
Source18: ptp.py
|
||||||
|
|
||||||
# collectd plugin conf files into /etc/collectd.d
|
# collectd plugin conf files into /etc/collectd.d
|
||||||
Source100: python_plugins.conf
|
Source100: python_plugins.conf
|
||||||
@ -34,6 +35,7 @@ Source104: example.conf
|
|||||||
Source105: ntpq.conf
|
Source105: ntpq.conf
|
||||||
Source106: interface.conf
|
Source106: interface.conf
|
||||||
Source107: remotels.conf
|
Source107: remotels.conf
|
||||||
|
Source108: ptp.conf
|
||||||
|
|
||||||
BuildRequires: systemd-devel
|
BuildRequires: systemd-devel
|
||||||
|
|
||||||
@ -78,6 +80,7 @@ install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir}
|
|||||||
install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir}
|
install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir}
|
||||||
install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
|
install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
|
||||||
install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir}
|
install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir}
|
||||||
|
install -m 700 %{SOURCE18} %{buildroot}%{local_python_extensions_dir}
|
||||||
|
|
||||||
|
|
||||||
# collectd plugin conf files into /etc/collectd.d
|
# collectd plugin conf files into /etc/collectd.d
|
||||||
@ -89,6 +92,7 @@ install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir}
|
|||||||
install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir}
|
install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir}
|
||||||
install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir}
|
install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir}
|
||||||
install -m 600 %{SOURCE107} %{buildroot}%{local_plugin_dir}
|
install -m 600 %{SOURCE107} %{buildroot}%{local_plugin_dir}
|
||||||
|
install -m 600 %{SOURCE108} %{buildroot}%{local_plugin_dir}
|
||||||
|
|
||||||
%clean
|
%clean
|
||||||
rm -rf $RPM_BUILD_ROOT
|
rm -rf $RPM_BUILD_ROOT
|
||||||
|
@ -15,6 +15,7 @@ import uuid
|
|||||||
import httplib2
|
import httplib2
|
||||||
import socket
|
import socket
|
||||||
import os
|
import os
|
||||||
|
from oslo_concurrency import processutils
|
||||||
from fm_api import constants as fm_constants
|
from fm_api import constants as fm_constants
|
||||||
import tsconfig.tsconfig as tsc
|
import tsconfig.tsconfig as tsc
|
||||||
|
|
||||||
@ -34,28 +35,42 @@ class PluginObject(object):
|
|||||||
self.hostname = '' # the name of this host
|
self.hostname = '' # the name of this host
|
||||||
self.port = 0 # the port number for this plugin
|
self.port = 0 # the port number for this plugin
|
||||||
self.base_eid = '' # the base entity id host=<hostname>
|
self.base_eid = '' # the base entity id host=<hostname>
|
||||||
|
self.controller = False # set true if node is controller
|
||||||
|
|
||||||
# dynamic gate variables
|
# dynamic gate variables
|
||||||
|
self.virtual = False # set to True if host is virtual
|
||||||
self.config_complete = False # set to True once config is complete
|
self.config_complete = False # set to True once config is complete
|
||||||
self.config_done = False # set true if config_func completed ok
|
self.config_done = False # set true if config_func completed ok
|
||||||
self.init_done = False # set true if init_func completed ok
|
self.init_done = False # set true if init_func completed ok
|
||||||
|
self.fm_connectivity = False # set true when fm connectivity ok
|
||||||
|
|
||||||
|
self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL
|
||||||
|
self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS
|
||||||
|
self.suppression = True
|
||||||
|
self.service_affecting = False
|
||||||
|
|
||||||
# dynamic variables set in read_func
|
# dynamic variables set in read_func
|
||||||
self.usage = float(0) # last usage value recorded as float
|
self.usage = float(0) # last usage value recorded as float
|
||||||
|
self.value = float(0) # last read value
|
||||||
self.audits = 0 # number of audit since init
|
self.audits = 0 # number of audit since init
|
||||||
self.enabled = False # tracks a plugin's enabled state
|
self.enabled = False # tracks a plugin's enabled state
|
||||||
self.alarmed = False # tracks the current alarmed state
|
self.alarmed = False # tracks the current alarmed state
|
||||||
|
self.mode = '' # mode specific to plugin
|
||||||
|
|
||||||
# http and json specific variables
|
# http and json specific variables
|
||||||
self.url = url # target url
|
self.url = url # target url
|
||||||
self.jresp = None # used to store the json response
|
self.jresp = None # used to store the json response
|
||||||
self.resp = ''
|
self.resp = ''
|
||||||
|
|
||||||
|
self.objects = [] # list of plugin specific objects
|
||||||
|
self.cmd = '' # plugin specific command string
|
||||||
|
|
||||||
# Log controls
|
# Log controls
|
||||||
self.config_logged = False # used to log once the plugin config
|
self.config_logged = False # used to log once the plugin config
|
||||||
self.error_logged = False # used to prevent log flooding
|
self.error_logged = False # used to prevent log flooding
|
||||||
self.log_throttle_count = 0 # used to count throttle logs
|
self.log_throttle_count = 0 # used to count throttle logs
|
||||||
self.INIT_LOG_THROTTLE = 10 # the init log throttle threshold
|
self.INIT_LOG_THROTTLE = 10 # the init log throttle threshold
|
||||||
|
self.phase = 0 # tracks current phase; init, sampling
|
||||||
|
|
||||||
collectd.debug("%s Common PluginObject constructor [%s]" %
|
collectd.debug("%s Common PluginObject constructor [%s]" %
|
||||||
(plugin, url))
|
(plugin, url))
|
||||||
@ -114,6 +129,39 @@ class PluginObject(object):
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
###########################################################################
|
||||||
|
#
|
||||||
|
# Name : is_virtual
|
||||||
|
#
|
||||||
|
# Description: Execute facter command with output filter on 'is_virtual'
|
||||||
|
#
|
||||||
|
# Parameters : None
|
||||||
|
#
|
||||||
|
# Returns : True if current host is virtual.
|
||||||
|
# False if current host is NOT virtual
|
||||||
|
#
|
||||||
|
###########################################################################
|
||||||
|
def is_virtual(self):
    """Check whether this host is virtual.

    Runs '/usr/bin/facter is_virtual' and inspects its output.

    Returns:
        True only when the command produced no stderr output and its
        stdout, stripped of surrounding whitespace, is 'true'.
        False in every other case, including when the command could not
        be executed (the failure is logged, not raised).
    """

    try:
        cmd = '/usr/bin/facter is_virtual'
        res, err = processutils.execute(cmd, shell=True)
        if err:
            return False
        elif res:
            # remove the trailing '\n' with strip()
            if res.strip() == 'true':
                collectd.info("%s %s is virtual" %
                              (self.plugin, self.hostname))
                return True

    except Exception as ex:
        # The format arguments must be passed as a tuple. Without the
        # parentheses the '%' operator binds to self.plugin alone and
        # raises TypeError from inside this handler.
        collectd.info("%s failed to execute '/usr/bin/facter' ; %s" %
                      (self.plugin, ex))

    return False
|
||||||
|
|
||||||
###########################################################################
|
###########################################################################
|
||||||
#
|
#
|
||||||
# Name : check_for_fit
|
# Name : check_for_fit
|
||||||
|
15
monitoring/collectd-extensions/src/ptp.conf
Normal file
15
monitoring/collectd-extensions/src/ptp.conf
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
<Plugin "threshold">
|
||||||
|
<Plugin "ptp">
|
||||||
|
<Type "time_offset">
|
||||||
|
Instance "nsec"
|
||||||
|
Persist true
|
||||||
|
PersistOK true
|
||||||
|
WarningMax 1000
|
||||||
|
FailureMax 1000000
|
||||||
|
WarningMin -1000
|
||||||
|
FailureMin -1000000
|
||||||
|
Hits 2
|
||||||
|
Invert false
|
||||||
|
</Type>
|
||||||
|
</Plugin>
|
||||||
|
</Plugin>
|
971
monitoring/collectd-extensions/src/ptp.py
Executable file
971
monitoring/collectd-extensions/src/ptp.py
Executable file
@ -0,0 +1,971 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2019 Wind River Systems, Inc.
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
#
|
||||||
|
############################################################################
|
||||||
|
#
|
||||||
|
# This file is the collectd 'Precision Time Protocol' Service Monitor.
|
||||||
|
#
|
||||||
|
# Algorithm:
|
||||||
|
#
|
||||||
|
# while not config ; check again
|
||||||
|
# while not init ; retry
|
||||||
|
# if startup
|
||||||
|
# clear all ptp alarms
|
||||||
|
# if ptp enabled
|
||||||
|
# if ptp not running
|
||||||
|
# raise 'process' alarm
|
||||||
|
# else
|
||||||
|
# read grand master and current skew
|
||||||
|
# if not controller and is grand master
|
||||||
|
# raise 'no lock' alarm
|
||||||
|
# if skew is out-of-tolerance
|
||||||
|
# raise out-of-tolerance alarm
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# manage alarm state throughout
|
||||||
|
# retry on alarm state change failures
|
||||||
|
# only make raise/clear alarm calls on severity state changes
|
||||||
|
#
|
||||||
|
############################################################################
|
||||||
|
import os
|
||||||
|
import collectd
|
||||||
|
import subprocess
|
||||||
|
import tsconfig.tsconfig as tsc
|
||||||
|
import plugin_common as pc
|
||||||
|
from fm_api import constants as fm_constants
|
||||||
|
from fm_api import fm_api
|
||||||
|
|
||||||
|
debug = False
|
||||||
|
|
||||||
|
# Fault manager API Object
|
||||||
|
api = fm_api.FaultAPIsV2()
|
||||||
|
|
||||||
|
PLUGIN_ALARMID = "100.119"
|
||||||
|
|
||||||
|
# name of the plugin - all logs produced by this plugin are prefixed with this
|
||||||
|
PLUGIN = 'ptp plugin'
|
||||||
|
|
||||||
|
# Service name
|
||||||
|
PTP = 'Precision Time Protocol (PTP)'
|
||||||
|
|
||||||
|
# Interface Monitoring Interval in seconds
|
||||||
|
PLUGIN_AUDIT_INTERVAL = 60
|
||||||
|
|
||||||
|
# Sample Data 'type' and 'instance' database field values.
|
||||||
|
PLUGIN_TYPE = 'time_offset'
|
||||||
|
PLUGIN_TYPE_INSTANCE = 'nsec'
|
||||||
|
|
||||||
|
# Primary PTP service name
|
||||||
|
PLUGIN_SERVICE = 'ptp4l.service'
|
||||||
|
|
||||||
|
# Plugin configuration file
|
||||||
|
#
|
||||||
|
# This plugin looks for the timestamping mode in the ptp4l config file.
|
||||||
|
# time_stamping hardware
|
||||||
|
#
|
||||||
|
PLUGIN_CONF_FILE = '/etc/ptp4l.conf'
|
||||||
|
PLUGIN_CONF_TIMESTAMPING = 'time_stamping'
|
||||||
|
|
||||||
|
# Tools used by plugin
|
||||||
|
SYSTEMCTL = '/usr/bin/systemctl'
|
||||||
|
ETHTOOL = '/usr/sbin/ethtool'
|
||||||
|
PLUGIN_STATUS_QUERY_EXEC = '/usr/sbin/pmc'
|
||||||
|
|
||||||
|
# Query PTP service administrative (enabled/disabled) state
|
||||||
|
#
|
||||||
|
# > systemctl is-enabled ptp4l
|
||||||
|
# enabled
|
||||||
|
# > systemctl disable ptp4l
|
||||||
|
# > systemctl is-enabled ptp4l
|
||||||
|
# disabled
|
||||||
|
|
||||||
|
SYSTEMCTL_IS_ENABLED_OPTION = 'is-enabled'
|
||||||
|
SYSTEMCTL_IS_ENABLED_RESPONSE = 'enabled'
|
||||||
|
SYSTEMCTL_IS_DISABLED_RESPONSE = 'disabled'
|
||||||
|
|
||||||
|
# Query PTP service activity (active=running / inactive) state
|
||||||
|
#
|
||||||
|
# > systemctl is-active ptp4l
|
||||||
|
# active
|
||||||
|
# > systemctl stop ptp4l
|
||||||
|
# > systemctl is-active ptp4l
|
||||||
|
# inactive
|
||||||
|
|
||||||
|
SYSTEMCTL_IS_ACTIVE_OPTION = 'is-active'
|
||||||
|
SYSTEMCTL_IS_ACTIVE_RESPONSE = 'active'
|
||||||
|
SYSTEMCTL_IS_INACTIVE_RESPONSE = 'inactive'
|
||||||
|
|
||||||
|
# Alarm Cause codes ; used to specify what alarm EID to assert or clear.
|
||||||
|
ALARM_CAUSE__NONE = 0
|
||||||
|
ALARM_CAUSE__PROCESS = 1
|
||||||
|
ALARM_CAUSE__OOT = 2
|
||||||
|
ALARM_CAUSE__NO_LOCK = 3
|
||||||
|
ALARM_CAUSE__UNSUPPORTED_HW = 4
|
||||||
|
ALARM_CAUSE__UNSUPPORTED_SW = 5
|
||||||
|
ALARM_CAUSE__UNSUPPORTED_LEGACY = 6
|
||||||
|
|
||||||
|
# Run Phase
|
||||||
|
RUN_PHASE__INIT = 0
|
||||||
|
RUN_PHASE__DISABLED = 1
|
||||||
|
RUN_PHASE__NOT_RUNNING = 2
|
||||||
|
RUN_PHASE__SAMPLING = 3
|
||||||
|
|
||||||
|
# Clock Sync Out-Of-Tolerance thresholds
|
||||||
|
OOT_MINOR_THRESHOLD = int(1000)
|
||||||
|
OOT_MAJOR_THRESHOLD = int(1000000)
|
||||||
|
|
||||||
|
# Instantiate the common plugin control object
|
||||||
|
obj = pc.PluginObject(PLUGIN, "")
|
||||||
|
|
||||||
|
|
||||||
|
# Create an alarm management class
|
||||||
|
class PTP_alarm_object(object):
    """State and descriptive text for a single PTP alarm.

    Declared as a new-style class so it behaves the same on Python 2 as
    the common PluginObject class, which also derives from object.
    """

    def __init__(self, interface=None):
        """Create a cleared, not yet raised alarm object.

        Parameters:
            interface: name of the interface the alarm applies to ;
                       None when the alarm is not interface specific.
        """

        # fault manager severity and probable cause for this alarm
        self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
        self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50

        # one of the ALARM_CAUSE__xxx codes ; selects the alarm
        self.alarm = ALARM_CAUSE__NONE
        self.interface = interface

        # set True once the alarm has been successfully raised
        self.raised = False

        # reason text, repair action text and entity instance id
        self.reason = ''
        self.repair = ''
        self.eid = ''
|
||||||
|
|
||||||
|
|
||||||
|
# Plugin specific control class and object.
|
||||||
|
class PTP_ctrl_object(object):
    """Plugin specific control data.

    Declared as a new-style class so it behaves the same on Python 2 as
    the common PluginObject class, which also derives from object.
    """

    def __init__(self):
        """Start with a zeroed log throttle and no alarm objects."""

        # throttle counter for grand master related logs
        self.gm_log_throttle = 0

        # references to the interface independent alarm objects ; these
        # are None until create_interface_alarm_objects assigns them.
        self.nolock_alarm_object = None
        self.process_alarm_object = None
        self.oot_alarm_object = None
|
||||||
|
|
||||||
|
|
||||||
|
ctrl = PTP_ctrl_object()
|
||||||
|
|
||||||
|
|
||||||
|
# Alarm object list, one entry for each interface and alarm cause case
|
||||||
|
ALARM_OBJ_LIST = []
|
||||||
|
|
||||||
|
|
||||||
|
# UT verification utilities
|
||||||
|
def assert_all_alarms():
    """UT aid ; raise every alarm in the alarm object list with data 0."""
    for alarm_obj in ALARM_OBJ_LIST:
        raise_alarm(alarm_obj.alarm, alarm_obj.interface, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def clear_all_alarms():
    """UT aid ; clear every alarm in the alarm object list and log result."""
    for alarm_obj in ALARM_OBJ_LIST:
        cleared = clear_alarm(alarm_obj.eid) is True
        msg = 'cleared' if cleared else 'clear failed'
        collectd.info("%s %s:%s alarm %s" %
                      (PLUGIN, PLUGIN_ALARMID, alarm_obj.eid, msg))
|
||||||
|
|
||||||
|
|
||||||
|
def print_alarm_object(o):
    """Log the fields of the specified alarm object at info level."""
    summary = (PLUGIN, o.interface, o.alarm, o.severity, o.raised)
    collectd.info(
        "%s Interface:%s Cause: %d Severity:%s Raised:%d" % summary)

    # one log line each for the entity id, reason and repair text
    collectd.info("%s Entity:[%s]" % (PLUGIN, o.eid))
    collectd.info("%s Reason:[%s]" % (PLUGIN, o.reason))
    collectd.info("%s Repair:[%s]" % (PLUGIN, o.repair))
|
||||||
|
|
||||||
|
|
||||||
|
def print_alarm_objects():
    """Log every alarm object in the alarm object list."""
    for alarm_obj in ALARM_OBJ_LIST:
        print_alarm_object(alarm_obj)
|
||||||
|
|
||||||
|
|
||||||
|
# Interface:Supported Modes dictionary. key:value
|
||||||
|
#
|
||||||
|
# interface:modes
|
||||||
|
#
|
||||||
|
interfaces = {}
|
||||||
|
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
#
|
||||||
|
# Name : _get_supported_modes
|
||||||
|
#
|
||||||
|
# Description: Invoke ethtool -T <interface> and load its
|
||||||
|
# time stamping capabilities.
|
||||||
|
#
|
||||||
|
# hardware, software or legacy.
|
||||||
|
#
|
||||||
|
# Parameters : The name of the physical interface to query the
|
||||||
|
# supported modes for.
|
||||||
|
#
|
||||||
|
# Interface Capabilities Output Examples:
|
||||||
|
#
|
||||||
|
# vbox prints this as it only supports software timestamping
|
||||||
|
# software-transmit (SOF_TIMESTAMPING_TX_SOFTWARE)
|
||||||
|
# software-receive (SOF_TIMESTAMPING_RX_SOFTWARE)
|
||||||
|
#
|
||||||
|
# full support output looks like this
|
||||||
|
# hardware-transmit (SOF_TIMESTAMPING_TX_HARDWARE)
|
||||||
|
# software-transmit (SOF_TIMESTAMPING_TX_SOFTWARE)
|
||||||
|
# hardware-receive (SOF_TIMESTAMPING_RX_HARDWARE)
|
||||||
|
# software-receive (SOF_TIMESTAMPING_RX_SOFTWARE)
|
||||||
|
# hardware-raw-clock (SOF_TIMESTAMPING_RAW_HARDWARE)
|
||||||
|
#
|
||||||
|
# Only legacy support output looks like this
|
||||||
|
# hardware-raw-clock (SOF_TIMESTAMPING_RAW_HARDWARE)
|
||||||
|
#
|
||||||
|
# Provisionable PTP Modes are
|
||||||
|
# hardware -> hardware-transmit/receive
|
||||||
|
# software -> software-transmit/receive
|
||||||
|
# legacy -> hardware-raw-clock
|
||||||
|
|
||||||
|
TIMESTAMP_MODE__HW = 'hardware'
|
||||||
|
TIMESTAMP_MODE__SW = 'software'
|
||||||
|
TIMESTAMP_MODE__LEGACY = 'legacy'
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Returns : a list of supported modes
|
||||||
|
#
|
||||||
|
#####################################################################
|
||||||
|
def _get_supported_modes(interface):
    """Get the supported time stamping modes for the specified interface.

    Runs 'ethtool -T <interface>' and scans the lines that follow the
    'Capabilities' label, stopping at the 'PTP Hardware Clock' label.

    Parameters:
        interface: name of the interface to query.

    Returns:
        A list holding any of TIMESTAMP_MODE__LEGACY, TIMESTAMP_MODE__SW
        and TIMESTAMP_MODE__HW. Software and hardware modes are reported
        only when both the transmit and receive capability are listed.
        None is returned only if the split output is empty.

    Note: exceptions raised by subprocess.check_output, such as a
    non-zero ethtool exit status, are not handled here.
    """

    hw_tx = hw_rx = sw_tx = sw_rx = False
    modes = []

    # universal_newlines=True makes check_output return text instead of
    # bytes so that the split on '\n' also works under Python 3.
    data = subprocess.check_output([ETHTOOL, '-T', interface],
                                   universal_newlines=True).split('\n')

    # str.split always returns at least one element, so this guard is
    # purely defensive ; it is kept to preserve the original contract.
    if not data:
        collectd.info("%s no ethtool output for %s" % (PLUGIN, interface))
        return None

    collectd.debug("%s 'ethtool -T %s' output:%s\n" %
                   (PLUGIN, interface, data))
    check_for_modes = False
    for i, line in enumerate(data):
        collectd.debug("%s data[%d]:%s\n" % (PLUGIN, i, line))
        if 'Capabilities' in line:

            # start of capabilities list
            check_for_modes = True

        elif check_for_modes is True:

            if 'PTP Hardware Clock' in line:
                # no more modes after this label
                break
            elif 'hardware-transmit' in line:
                hw_tx = True
            elif 'hardware-receive' in line:
                hw_rx = True
            elif 'software-transmit' in line:
                sw_tx = True
            elif 'software-receive' in line:
                sw_rx = True
            elif 'hardware-raw-clock' in line:
                modes.append(TIMESTAMP_MODE__LEGACY)

    if sw_tx is True and sw_rx is True:
        modes.append(TIMESTAMP_MODE__SW)

    if hw_tx is True and hw_rx is True:
        modes.append(TIMESTAMP_MODE__HW)

    if modes:
        collectd.debug("%s %s interface PTP capabilities: %s" %
                       (PLUGIN, interface, modes))
    else:
        collectd.info("%s no capabilities advertised for %s" %
                      (PLUGIN, interface))

    return modes
|
||||||
|
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
#
|
||||||
|
# Name : get_alarm_object
|
||||||
|
#
|
||||||
|
# Description: Search the alarm list based on the alarm cause
|
||||||
|
# code and interface.
|
||||||
|
#
|
||||||
|
# Returns : Alarm object if found ; otherwise None
|
||||||
|
#
|
||||||
|
#####################################################################
|
||||||
|
def get_alarm_object(alarm, interface=None):
    """Find the alarm object for the given cause code and interface.

    When interface is None the first object with a matching cause code
    is returned ; otherwise both the interface and cause must match.
    A failed lookup is logged and None is returned.
    """

    for candidate in ALARM_OBJ_LIST:
        if candidate.alarm != alarm:
            continue
        if interface is None or candidate.interface == interface:
            return candidate

    collectd.info("%s alarm object lookup failed ; %d:%s" %
                  (PLUGIN, alarm, interface))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
#
|
||||||
|
# Name : clear_alarm
|
||||||
|
#
|
||||||
|
# Description: Clear the ptp alarm with the specified entity ID.
|
||||||
|
#
|
||||||
|
# Returns : True if operation succeeded
|
||||||
|
# False if there was an error exception.
|
||||||
|
#
|
||||||
|
# Assumptions: Caller can decide to retry based on return status.
|
||||||
|
#
|
||||||
|
#####################################################################
|
||||||
|
def clear_alarm(eid):
    """Clear the ptp alarm that has the specified entity ID.

    Returns True when the clear_fault call completed, whether or not an
    alarm was found, and False when it raised an exception.
    """

    try:
        # select the log text based on whether an alarm was found
        if api.clear_fault(PLUGIN_ALARMID, eid) is True:
            outcome = "%s %s:%s alarm cleared"
        else:
            outcome = "%s %s:%s alarm clear ; None found"
        collectd.info(outcome % (PLUGIN, PLUGIN_ALARMID, eid))
        return True

    except Exception as ex:
        collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" %
                       (PLUGIN, PLUGIN_ALARMID, eid, ex))
        return False
|
||||||
|
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
#
|
||||||
|
# Name : raise_alarm
|
||||||
|
#
|
||||||
|
# Description: Assert a specific PTP alarm based on the alarm cause
|
||||||
|
# code and interface.
|
||||||
|
#
|
||||||
|
# Handle special case cause codes
|
||||||
|
# Handle failure to raise fault
|
||||||
|
#
|
||||||
|
# Assumptions: Short circuited Success return if the alarm is
|
||||||
|
# already known to be asserted.
|
||||||
|
#
|
||||||
|
# Returns : False on Failure
|
||||||
|
# True on Success
|
||||||
|
#
|
||||||
|
#####################################################################
|
||||||
|
def raise_alarm(alarm_cause, interface=None, data=0):
    """Assert a cause based PTP alarm.

    Parameters:
        alarm_cause: one of the ALARM_CAUSE__xxx codes.
        interface:   interface name used for the alarm object lookup ;
                     None selects on the cause code alone.
        data:        the sample reading ; only used for the
                     out-of-tolerance alarm where it is appended to the
                     reason text.

    Returns:
        True if the alarm was raised, was already raised (all causes
        except out-of-tolerance) or no alarm object was found.
        False if the set_fault call failed or raised an exception.
    """

    collectd.debug("%s Raising Alarm %d" % (PLUGIN, alarm_cause))

    alarm = get_alarm_object(alarm_cause, interface)
    if alarm is None:
        # log created for None case in the get_alarm_object util
        return True

    # copy the reason as it might be updated for the OOT,
    # most typical, case.
    reason = alarm.reason

    # Handle some special cases
    #

    if alarm_cause == ALARM_CAUSE__OOT:
        # If this is an out of tolerance alarm then add the
        # out of tolerance reading to the reason string before
        # asserting the alarm.
        #
        # Keep the alarm updated with the latest sample reading
        # and severity even if its already asserted.
        offset = abs(float(data))
        if offset > 100000000000:
            reason += 'more than 100 seconds'
        elif offset > 10000000000:
            reason += 'more than 10 seconds'
        elif offset > 1000000000:
            reason += 'more than 1 second'
        elif offset > 1000000:
            # floor division keeps the value integral on Python 3 and
            # gives the same result as '/' on Python 2 integers.
            reason += str(abs(int(data)) // 1000000)
            reason += ' millisecs'
        elif offset > 1000:
            reason += str(abs(int(data)) // 1000)
            reason += ' microsecs'
        else:
            reason += str(float(data))
            reason += ' ' + PLUGIN_TYPE_INSTANCE

    elif alarm.raised is True:
        # If alarm already raised then exit.
        #
        # All other alarms are a Major so there is no need to
        # track a change in severity and update accordingly.
        return True

    elif alarm_cause == ALARM_CAUSE__PROCESS:
        # Use string formatting rather than concatenation ; obj.mode is
        # set to None by read_timestamp_mode when the ptp4l config file
        # is missing and 'str + None' would raise TypeError here.
        reason = 'Provisioned %s \'%s' % (PTP, obj.mode)
        reason += '\' time stamping mode seems to be unsupported by this host'

    try:
        fault = fm_api.Fault(
            alarm_id=PLUGIN_ALARMID,
            alarm_state=fm_constants.FM_ALARM_STATE_SET,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
            entity_instance_id=alarm.eid,
            severity=alarm.severity,
            reason_text=reason,
            alarm_type=obj.alarm_type,
            probable_cause=alarm.cause,
            proposed_repair_action=alarm.repair,
            service_affecting=False,  # obj.service_affecting,
            suppression=True)         # obj.suppression)

        alarm_uuid = api.set_fault(fault)
        if pc.is_uuid_like(alarm_uuid) is False:

            # The fm call failed. alarm.raised is left unchanged and
            # False is returned so the caller can retry at a later time.
            collectd.error("%s %s:%s set_fault failed:%s" %
                           (PLUGIN, PLUGIN_ALARMID, alarm.eid, alarm_uuid))
            return False

        else:
            collectd.info("%s %s:%s:%s alarm raised" %
                          (PLUGIN, PLUGIN_ALARMID, alarm.eid, alarm.severity))
            alarm.raised = True
            return True

    except Exception as ex:
        collectd.error("%s 'set_fault' exception ; %s:%s:%s ; %s" %
                       (PLUGIN,
                        PLUGIN_ALARMID,
                        alarm.eid,
                        alarm.severity,
                        ex))
    return False
|
||||||
|
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
#
|
||||||
|
# Name : create_interface_alarm_objects
|
||||||
|
#
|
||||||
|
# Description: Create alarm objects for specified interface
|
||||||
|
#
|
||||||
|
#####################################################################
|
||||||
|
def create_interface_alarm_objects(interface=None):
    """Create alarm objects and append them to the alarm object list.

    With no interface the three interface independent alarms (process,
    out-of-tolerance and no-lock) are created and also referenced from
    the ctrl object. With an interface the three 'unsupported
    timestamping mode' alarms for that interface are created.
    """

    collectd.debug("%s Alarm Object Create: Interface:%s " %
                   (PLUGIN, interface))

    if interface is not None:
        # cause code, reason suffix, repair suffix, entity id suffix
        unsupported = (
            (ALARM_CAUSE__UNSUPPORTED_HW,
             ' Hardware timestamping',
             'Hardware timestamping is supported by this interface',
             '.unsupported=hardware-timestamping'),
            (ALARM_CAUSE__UNSUPPORTED_SW,
             ' Software timestamping',
             'Software timestamping is supported by this interface',
             '.unsupported=software-timestamping'),
            (ALARM_CAUSE__UNSUPPORTED_LEGACY,
             " Legacy timestamping",
             'Legacy or Raw Clock is supported by this host',
             '.unsupported=legacy-timestamping'),
        )
        for cause_code, what, repair_tail, eid_tail in unsupported:
            entry = PTP_alarm_object(interface)
            entry.alarm = cause_code
            entry.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
            entry.reason = (obj.hostname + " '" + interface +
                            "' does not support " + PTP + what)
            entry.repair = ('Check host hardware reference manual '
                            'to verify PTP ' + repair_tail)
            entry.eid = obj.base_eid + '.ptp=' + interface + eid_tail
            entry.cause = fm_constants.ALARM_PROBABLE_CAUSE_7  # 'config error'
            ALARM_OBJ_LIST.append(entry)
        return

    # process alarm
    process = PTP_alarm_object()
    process.alarm = ALARM_CAUSE__PROCESS
    process.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
    process.reason = (obj.hostname + ' does not support the provisioned ' +
                      PTP + ' mode ')
    process.repair = ('Check host hardware reference manual ' +
                      'to verify that the selected PTP mode is supported')
    process.eid = obj.base_eid + '.ptp'
    process.cause = fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN  # 'unknown'
    ALARM_OBJ_LIST.append(process)
    ctrl.process_alarm_object = process

    # out-of-tolerance alarm ; created with a 'clear' severity
    oot = PTP_alarm_object()
    oot.alarm = ALARM_CAUSE__OOT
    oot.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
    oot.reason = (obj.hostname + ' ' +
                  PTP + " clocking is out of tolerance by ")
    oot.repair = "Check quality of the clocking network"
    oot.eid = obj.base_eid + '.ptp=out-of-tolerance'
    oot.cause = fm_constants.ALARM_PROBABLE_CAUSE_50  # THRESHOLD CROSS
    ALARM_OBJ_LIST.append(oot)
    ctrl.oot_alarm_object = oot

    # no-lock alarm ; only applies to storage and worker nodes
    nolock = PTP_alarm_object()
    nolock.alarm = ALARM_CAUSE__NO_LOCK
    nolock.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
    nolock.reason = obj.hostname + ' is not locked to remote PTP Grand Master'
    nolock.repair = 'Check network'
    nolock.eid = obj.base_eid + '.ptp=no-lock'
    nolock.cause = fm_constants.ALARM_PROBABLE_CAUSE_51  # timing-problem
    ALARM_OBJ_LIST.append(nolock)
    ctrl.nolock_alarm_object = nolock
|
||||||
|
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
#
|
||||||
|
# Name : read_timestamp_mode
|
||||||
|
#
|
||||||
|
# Description: Refresh the timestamping mode if it changes
|
||||||
|
#
|
||||||
|
#####################################################################
|
||||||
|
def read_timestamp_mode():
    """Refresh obj.mode from the timestamping setting in PLUGIN_CONF_FILE.

    The first line of the file that contains PLUGIN_CONF_TIMESTAMPING and
    carries a value supplies the new mode. A change of mode is logged.

    If the file does not exist an error is logged and obj.mode is reset
    to None. If no usable setting is found, obj.mode keeps its previous
    value and an error is logged only when that value is empty.

    Returns None.
    """

    if not os.path.exists(PLUGIN_CONF_FILE):
        collectd.error("%s failed to load ptp4l configuration" % PLUGIN)
        obj.mode = None
        return

    current_mode = obj.mode
    with open(PLUGIN_CONF_FILE, 'r') as infile:
        for line in infile:
            if PLUGIN_CONF_TIMESTAMPING not in line:
                continue

            # A line that names the setting but carries no value would
            # otherwise raise IndexError and abort the calling audit ;
            # skip it and keep scanning.
            fields = line.split()
            if len(fields) > 1:
                # split() already discards the trailing newline
                obj.mode = fields[1]
                break

    if obj.mode:
        # only log when the mode actually changed
        if obj.mode != current_mode:
            collectd.info("%s Timestamping Mode: %s" %
                          (PLUGIN, obj.mode))
    else:
        collectd.error("%s failed to get Timestamping Mode" % PLUGIN)
|
||||||
|
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
#
|
||||||
|
# Name : init_func
|
||||||
|
#
|
||||||
|
# Description: The collectd initialization entrypoint for
|
||||||
|
# this plugin
|
||||||
|
#
|
||||||
|
# Assumptions: called only once
|
||||||
|
#
|
||||||
|
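# Algorithm  : return early until obj.init_ready() reports ready ; then
#              record the hostname and base eid, create the alarm
#              objects and load the monitored interfaces and the
#              timestamping mode from the ptp4l configuration file.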
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#####################################################################
|
||||||
|
def init_func():
    """Collectd initialization entrypoint for the PTP plugin.

    Returns False without doing any work while obj.init_ready() reports
    False, so the caller can retry later. Otherwise it:

      - records the hostname and the base alarm entity id
      - creates the interface independent alarm objects
      - reads PLUGIN_CONF_FILE to learn the monitored interfaces, their
        supported timestamping modes and the configured timestamping mode
      - records whether this host is a controller and whether it is virtual
      - marks initialization done and resets the log throttle counter

    Returns None when initialization completes.
    """

    if obj.init_ready() is False:
        return False

    obj.hostname = obj.gethostname()
    obj.base_eid = 'host=' + obj.hostname

    # Create the interface independent alarm objects.
    create_interface_alarm_objects()

    # load monitored interfaces and supported modes
    if os.path.exists(PLUGIN_CONF_FILE):
        with open(PLUGIN_CONF_FILE, 'r') as infile:
            for line in infile:
                # The PTP interfaces used are specified in the ptp4l.conf
                # file as [interface]. There may be more than one.
                # Presently there is no need to track the function of the
                # interface ; namely mgmnt or oam.
                #
                # Require the closing bracket so a malformed header does
                # not produce an interface name with a trailing newline.
                if line.startswith('[') and ']' in line:
                    interface = line.split(']')[0].split('[')[1]
                    if interface and interface != 'global':
                        interfaces[interface] = \
                            _get_supported_modes(interface)
                        create_interface_alarm_objects(interface)

                if PLUGIN_CONF_TIMESTAMPING in line:
                    # Guard against a line that names the setting but has
                    # no value ; indexing it would raise IndexError and
                    # abort initialization.
                    fields = line.split()
                    if len(fields) > 1:
                        # split() already discards the trailing newline
                        obj.mode = fields[1]

        if obj.mode:
            collectd.info("%s Timestamping Mode: %s" %
                          (PLUGIN, obj.mode))
        else:
            collectd.error("%s failed to get Timestamping Mode" % PLUGIN)
    else:
        collectd.error("%s failed to load ptp4l configuration" % PLUGIN)
        obj.mode = None

    for key, value in interfaces.items():
        collectd.info("%s interface %s supports timestamping modes: %s" %
                      (PLUGIN, key, value))

    # remove the leading '#' below to dump alarm object data
    # print_alarm_objects()

    if tsc.nodetype == 'controller':
        obj.controller = True

    obj.virtual = obj.is_virtual()
    obj.init_done = True
    obj.log_throttle_count = 0
    collectd.info("%s initialization complete" % PLUGIN)
|
||||||
|
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
#
|
||||||
|
# Name : read_func
|
||||||
|
#
|
||||||
|
# Description: The collectd audit entrypoint for PTP Monitoring
|
||||||
|
#
|
||||||
|
# Assumptions: collectd calls init_func one time.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# retry init if needed
|
||||||
|
# retry fm connect if needed
|
||||||
|
# check service enabled state
|
||||||
|
# check service running state
|
||||||
|
# error -> alarm host=<hostname>.ptp
|
||||||
|
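#    check Grand Master lock state and master offset (skew)
#    dispatch the skew sample and manage the out-of-tolerance alarm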
|
||||||
|
#
|
||||||
|
#####################################################################
|
||||||
|
def read_func():
    """Collectd audit entrypoint for PTP monitoring.

    Each call runs the following stages in order and returns 0 from
    whichever stage ends the audit:

      1. do nothing on virtual hosts
      2. re-run init_func until obj.init_done is set
      3. on first FM connectivity, query and clear existing alarms
      4. if the PTP service is disabled, clear all raised alarms
      5. if the PTP service is not running, manage the 'process' alarm
      6. periodically refresh the timestamping mode
      7. query pmc for TIME_STATUS_NP and parse identity / offset fields
      8. manage the 'nolock' alarm based on Grand Master identity
      9. dispatch the master offset sample and manage the
         out-of-tolerance alarm
    """

    if obj.virtual is True:
        return 0

    # check and run init until it reports init_done True
    if obj.init_done is False:
        if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
            collectd.info("%s re-running init" % PLUGIN)
        obj.log_throttle_count += 1
        init_func()
        return 0

    if obj.fm_connectivity is False:

        try:
            # query FM for existing alarms.
            alarms = api.get_faults_by_id(PLUGIN_ALARMID)
        except Exception as ex:
            collectd.error("%s 'get_faults_by_id' exception ; %s" %
                           (PLUGIN, ex))
            # fm_connectivity stays False so the query is retried
            # on the next audit
            return 0

        if alarms:
            for alarm in alarms:
                collectd.debug("%s found startup alarm '%s'" %
                               (PLUGIN, alarm.entity_instance_id))

                eid = alarm.entity_instance_id
                if eid is None:
                    collectd.error("%s startup alarm query error ; no eid" %
                                   PLUGIN)
                    continue

                # get the hostname host=<hostname>.stuff
                # split over base eid and then
                # compare that to this plugin's base eid
                # ignore alarms not for this host
                if eid.split('.')[0] != obj.base_eid:
                    continue
                else:
                    # load the state of the specific alarm
                    # NOTE(review): an eid with no '.' separator would
                    # raise IndexError here - confirm FM never returns one.
                    instance = eid.split('.')[1].split('=')
                    if instance[0] == 'ptp':
                        # clear all ptp alarms on process startup
                        # just in case interface names have changed
                        # since the alarm was raised.
                        if clear_alarm(eid) is False:
                            # if we can't clear the alarm now then error out.
                            collectd.error("%s failed to clear startup "
                                           "alarm %s:%s" %
                                           (PLUGIN, PLUGIN_ALARMID, eid))
                            # try again next time around
                            return 0
                        else:
                            collectd.info("%s cleared startup alarm '%s'" %
                                          (PLUGIN, alarm.entity_instance_id))
                    else:

                        # alarm for this host whose second eid component
                        # is not 'ptp' ; treated as invalid and cleared
                        if clear_alarm(eid) is False:
                            collectd.error("%s failed to clear invalid PTP "
                                           "alarm %s:%s" %
                                           (PLUGIN, PLUGIN_ALARMID,
                                            alarm.entity_instance_id))
                            return 0
                        else:
                            collectd.info("%s cleared found invalid startup"
                                          " alarm %s:%s" %
                                          (PLUGIN,
                                           PLUGIN_ALARMID,
                                           alarm.entity_instance_id))
        else:
            collectd.info("%s no startup alarms found" % PLUGIN)

        obj.config_complete = True
        obj.fm_connectivity = True
        # assert_all_alarms()

    # This plugin supports PTP in-service state change by checking
    # service state on every audit ; the audit interval is the
    # PLUGIN_AUDIT_INTERVAL passed to collectd.register_read.
    #
    # NOTE(review): subprocess.check_output raises CalledProcessError on a
    # non-zero exit status and systemctl is-enabled / is-active normally
    # exit non-zero for disabled / inactive units - confirm how the exit
    # status is reported when running inside collectd.
    data = subprocess.check_output([SYSTEMCTL,
                                    SYSTEMCTL_IS_ENABLED_OPTION,
                                    PLUGIN_SERVICE])
    collectd.debug("%s PTP admin state:%s" % (PLUGIN, data.rstrip()))

    if data.rstrip() == SYSTEMCTL_IS_DISABLED_RESPONSE:

        # Manage execution phase
        if obj.phase != RUN_PHASE__DISABLED:
            obj.phase = RUN_PHASE__DISABLED
            obj.log_throttle_count = 0

        if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
            collectd.info("%s PTP Service Disabled" % PLUGIN)
        obj.log_throttle_count += 1

        # clear every alarm this plugin has raised while disabled ;
        # failed clears leave 'raised' set so they retry next audit
        for o in ALARM_OBJ_LIST:
            if o.raised is True:
                if clear_alarm(o.eid) is True:
                    o.raised = False
                else:
                    collectd.error("%s %s:%s clear alarm failed "
                                   "; will retry" %
                                   (PLUGIN, PLUGIN_ALARMID, o.eid))
        return 0

    data = subprocess.check_output([SYSTEMCTL,
                                    SYSTEMCTL_IS_ACTIVE_OPTION,
                                    PLUGIN_SERVICE])

    if data.rstrip() == SYSTEMCTL_IS_INACTIVE_RESPONSE:

        # Manage execution phase
        if obj.phase != RUN_PHASE__NOT_RUNNING:
            obj.phase = RUN_PHASE__NOT_RUNNING
            obj.log_throttle_count = 0

        if ctrl.process_alarm_object.alarm == ALARM_CAUSE__PROCESS:
            if ctrl.process_alarm_object.raised is False:
                collectd.error("%s PTP service enabled but not running" %
                               PLUGIN)
                if raise_alarm(ALARM_CAUSE__PROCESS) is True:
                    ctrl.process_alarm_object.raised = True

            # clear all other alarms if the 'process' alarm is raised
            # NOTE(review): the nesting of this elif was reconstructed ;
            # as written it clears the 'process' alarm itself while the
            # service is still not running - confirm intent.
            elif ctrl.process_alarm_object.raised is True:
                if clear_alarm(ctrl.process_alarm_object.eid) is True:
                    msg = 'cleared'
                    ctrl.process_alarm_object.raised = False
                else:
                    msg = 'failed to clear'
                collectd.info("%s %s %s:%s" %
                              (PLUGIN, msg, PLUGIN_ALARMID,
                               ctrl.process_alarm_object.eid))
        return 0

    # Handle clearing the 'process' alarm if it is asserted and
    # the process is now running
    if ctrl.process_alarm_object.raised is True:
        if clear_alarm(ctrl.process_alarm_object.eid) is True:
            ctrl.process_alarm_object.raised = False
            collectd.info("%s PTP service enabled and running" % PLUGIN)

    # Auto refresh the timestamping mode in case collectd runs
    # before the ptp manifest or the mode changes on the fly by
    # an in-service manifest.
    # Every 4 audits.
    obj.audits += 1
    if not obj.audits % 4:
        read_timestamp_mode()

    # Manage execution phase
    if obj.phase != RUN_PHASE__SAMPLING:
        obj.phase = RUN_PHASE__SAMPLING
        obj.log_throttle_count = 0

    # Let's read the clock info, Grand Master sig and skew
    #
    # sudo /usr/sbin/pmc -u -b 0 'GET TIME_STATUS_NP'
    #
    data = subprocess.check_output([PLUGIN_STATUS_QUERY_EXEC,
                                    '-u', '-b', '0', 'GET TIME_STATUS_NP'])

    got_master_offset = False
    master_offset = 0
    my_identity = ''
    gm_identity = ''
    gm_present = False
    # NOTE(review): splitting on a str separator assumes check_output
    # returned str (Python 2) rather than bytes - confirm interpreter.
    obj.resp = data.split('\n')
    for line in obj.resp:
        if 'RESPONSE MANAGEMENT TIME_STATUS_NP' in line:
            collectd.debug("%s key       : %s" %
                           (PLUGIN, line.split()[0].split('-')[0]))
            # this host's identity is the first token up to the first '-'
            my_identity = line.split()[0].split('-')[0]
        if 'master_offset' in line:
            collectd.debug("%s Offset    : %s" % (PLUGIN, line.split()[1]))
            master_offset = float(line.split()[1])
            got_master_offset = True
        if 'gmPresent' in line:
            collectd.debug("%s gmPresent : %s" % (PLUGIN, line.split()[1]))
            # kept as the raw string token ; only used for logging below
            gm_present = line.split()[1]
        if 'gmIdentity' in line:
            collectd.debug("%s gmIdentity: %s" % (PLUGIN, line.split()[1]))
            gm_identity = line.split()[1]

    # Handle case where this host is the Grand Master
    #   ... or assumes it is.
    if my_identity == gm_identity:

        if obj.controller is False:

            # Compute and storage nodes should not be the Grand Master
            if ctrl.nolock_alarm_object.raised is False:
                if raise_alarm(ALARM_CAUSE__NO_LOCK, None, 0) is True:
                    ctrl.nolock_alarm_object.raised = True

            # produce a throttled log while this host is not locked to the GM
            if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
                collectd.info("%s %s not locked to remote Grand Master "
                              "(%s)" % (PLUGIN, obj.hostname, gm_identity))
            obj.log_throttle_count += 1

            # No samples for storage and compute nodes that are not
            # locked to a Grand Master
            return 0

        else:
            # Controllers can be a Grand Master ; throttle the log
            if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
                collectd.info("%s %s is Grand Master:%s" %
                              (PLUGIN, obj.hostname, gm_identity))
            obj.log_throttle_count += 1

            # The Grand Master will always be 0 so there is no point
            # creating a sample for it.
            return 0

    # Handle clearing nolock alarm for computes and storage nodes
    elif obj.controller is False:
        if ctrl.nolock_alarm_object.raised is True:
            if clear_alarm(ctrl.nolock_alarm_object.eid) is True:
                ctrl.nolock_alarm_object.raised = False

    # Keep this FIT test code but make it commented out for security
    # if os.path.exists('/var/run/fit/ptp_data'):
    #     master_offset = 0
    #     with open('/var/run/fit/ptp_data', 'r') as infile:
    #         for line in infile:
    #             master_offset = int(line)
    #             got_master_offset = True
    #             collectd.info("%s using ptp FIT data skew:%d" %
    #                           (PLUGIN, master_offset))
    #             break

    # Send sample and Manage the Out-Of-Tolerance alarm
    if got_master_offset is True:

        if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
            collectd.info("%s %s is collecting samples [%5d] "
                          "with Grand Master %s" %
                          (PLUGIN, obj.hostname,
                           float(master_offset), gm_identity))

        obj.log_throttle_count += 1

        # setup the sample structure and dispatch
        val = collectd.Values(host=obj.hostname)
        val.type = PLUGIN_TYPE
        val.type_instance = PLUGIN_TYPE_INSTANCE
        val.plugin = 'ptp'
        val.dispatch(values=[float(master_offset)])

        # Manage the sample OOT alarm severity
        # the magnitude of the offset is compared ; sign is ignored
        severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
        if abs(master_offset) > OOT_MAJOR_THRESHOLD:
            severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
        elif abs(master_offset) > OOT_MINOR_THRESHOLD:
            severity = fm_constants.FM_ALARM_SEVERITY_MINOR

        # Handle clearing of Out-Of-Tolerance alarm
        if severity == fm_constants.FM_ALARM_SEVERITY_CLEAR:
            if ctrl.oot_alarm_object.raised is True:
                if clear_alarm(ctrl.oot_alarm_object.eid) is True:
                    ctrl.oot_alarm_object.severity = \
                        fm_constants.FM_ALARM_SEVERITY_CLEAR
                    ctrl.oot_alarm_object.raised = False

        else:
            # Special Case:
            # -------------
            # Don't raise minor alarm when in software timestamping mode.
            # Too much skew in software or legacy mode ; alarm would bounce.
            # TODO: Consider making ptp a real time process
            if severity == fm_constants.FM_ALARM_SEVERITY_MINOR \
                    and obj.mode != 'hardware':
                return 0

            # Handle debounce of the OOT alarm.
            # Debounce by 1 for the same severity level.
            # The first audit at a new severity only records it ; the
            # alarm is raised on a later audit at that same severity.
            if ctrl.oot_alarm_object.severity != severity:
                ctrl.oot_alarm_object.severity = severity

            # This will keep refreshing the alarm text with the current
            # skew value while still debounce on state transitions.
            #
            # Precision ... (PTP) clocking is out of tolerance by 1004 nsec
            #
            elif severity == fm_constants.FM_ALARM_SEVERITY_MINOR:
                # Handle raising the Minor OOT Alarm.
                rc = raise_alarm(ALARM_CAUSE__OOT, None, master_offset)
                if rc is True:
                    ctrl.oot_alarm_object.raised = True

            elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR:
                # Handle raising the Major OOT Alarm.
                rc = raise_alarm(ALARM_CAUSE__OOT, None, master_offset)
                if rc is True:
                    ctrl.oot_alarm_object.raised = True

        # Record the value that is alarmable
        if severity != fm_constants.FM_ALARM_SEVERITY_CLEAR:
            collectd.info("%s Grand Master ID: %s ; "
                          "HOST ID: %s ; "
                          "GM Present:%s ; "
                          "Skew:%5d" % (PLUGIN,
                                        gm_identity,
                                        my_identity,
                                        gm_present,
                                        master_offset))
    else:
        # no 'master_offset' field was found in the pmc response
        collectd.info("%s No Clock Sync" % PLUGIN)

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Register this plugin's entrypoints with collectd: init_func as the
# init callback and read_func as the read (audit) callback, with the
# read interval set to PLUGIN_AUDIT_INTERVAL.
collectd.register_init(init_func)
collectd.register_read(read_func, interval=PLUGIN_AUDIT_INTERVAL)
|
@ -10,6 +10,7 @@ LoadPlugin python
|
|||||||
Path "/proc/meminfo"
|
Path "/proc/meminfo"
|
||||||
</Module>
|
</Module>
|
||||||
Import "ntpq"
|
Import "ntpq"
|
||||||
|
Import "ptp"
|
||||||
Import "interface"
|
Import "interface"
|
||||||
<Module "interface">
|
<Module "interface">
|
||||||
Port 2122
|
Port 2122
|
||||||
|
Loading…
Reference in New Issue
Block a user