Add PTP monitoring to collectd

This update adds Precision Time Protocol (PTP) monitoring
to the current list of in-house developed collectd plugins.

Refer to the ptp.py header for a description of the monitoring
service algorithm and inline comments for detailed behavior.

Test Plan:

Usability:
-----------
PASS: Verify monitoring behavior around ptp service enable and disable
PASS: Verify ptp monitoring behavior over lock and unlock
PASS: Verify behavior with bonded interfaces (skew oot alarm)
PASS: Verify no-lock hosts lock to remote grandmaster when available
PASS: Verify AIO SX PTP Enable over Lock/Unlock

System Level:
-------------
PASS: Verify large system install
PASS: Verify AIO SX system install

Host Level:
-----------
PASS: Verify controller monitoring
PASS: Verify worker monitoring
PASS: Verify storage monitoring
PASS: Verify worker/storage behavior when the only controller is rebooted.
PASS: Verify startup handling of fm calls while fm is not running
PASS: Verify runtime handling of fm calls while fm is not running

Config Level:
-------------
PASS: Verify PTP Enable and auto start monitoring
PASS: Verify PTP Disable and auto stop monitoring
PASS: Verify audit interval is every 60 seconds
PASS: Verify hardware timestamp monitoring
PASS: Verify software timestamp monitoring
PASS: Verify legacy   timestamp monitoring
PASS: Verify hardware to software config change
PASS: Verify software to legacy   config change
PASS: Verify   legacy to hardware config change
PASS: Verify software to hardware config change

Alarm Management:
-----------------
PASS: Verify end-to-end handling of 'nolock' alarm management
PASS: Verify end-to-end handling of 'out-of-tolerance' alarm management
PASS: Verify end-to-end handling of 'process' alarm management
PASS: Verify end-to-end handling of 'unsupported mode' alarm management
PASS: Verify all ptp alarms get cleared on collectd process start
PASS: Verify plugin startup behavior when FM is not running
PASS: Verify plugin with FM V2 API
PASS: Verify thresholded out-of-tolerance alarm handling
PASS: Verify plugin logging is value added
PASS: Verify alarm assert debounce of 2
PASS: Verify alarm clear with no debounce
PASS: Verify only major out-of-tolerance alarm for software mode
PASS: Verify only major out-of-tolerance alarm for legacy mode
PASS: Verify minor/major out-of-tolerance alarm for hardware mode
PASS: Verify no-lock alarm if compute GM ID is the same as its own
PASS: Verify no-lock alarm is not raised on GM reboot
PASS: Verify GM switches to alternate when GM host is rebooted

Change-Id: If36aece94dd5511bf9deba0753f3863237e2a7fe
Story: 2002823
Task: 29492
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2019-03-25 11:56:48 -04:00
parent a2b1bc6f05
commit b4a23c57aa
6 changed files with 1042 additions and 2 deletions

View File

@ -18,7 +18,8 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/interface.conf \ $PKG_BASE/src/interface.conf \
$PKG_BASE/src/remotels.py \ $PKG_BASE/src/remotels.py \
$PKG_BASE/src/remotels.conf \ $PKG_BASE/src/remotels.conf \
$PKG_BASE/src/ptp.py \
$PKG_BASE/src/ptp.conf \
$PKG_BASE/src/example.py \ $PKG_BASE/src/example.py \
$PKG_BASE/src/example.conf" $PKG_BASE/src/example.conf"
TIS_PATCH_VER=9
TIS_PATCH_VER=8

View File

@ -24,6 +24,7 @@ Source14: example.py
Source15: ntpq.py Source15: ntpq.py
Source16: interface.py Source16: interface.py
Source17: remotels.py Source17: remotels.py
Source18: ptp.py
# collectd plugin conf files into /etc/collectd.d # collectd plugin conf files into /etc/collectd.d
Source100: python_plugins.conf Source100: python_plugins.conf
@ -34,6 +35,7 @@ Source104: example.conf
Source105: ntpq.conf Source105: ntpq.conf
Source106: interface.conf Source106: interface.conf
Source107: remotels.conf Source107: remotels.conf
Source108: ptp.conf
BuildRequires: systemd-devel BuildRequires: systemd-devel
@ -78,6 +80,7 @@ install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir} install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE18} %{buildroot}%{local_python_extensions_dir}
# collectd plugin conf files into /etc/collectd.d # collectd plugin conf files into /etc/collectd.d
@ -89,6 +92,7 @@ install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE107} %{buildroot}%{local_plugin_dir} install -m 600 %{SOURCE107} %{buildroot}%{local_plugin_dir}
install -m 600 %{SOURCE108} %{buildroot}%{local_plugin_dir}
%clean %clean
rm -rf $RPM_BUILD_ROOT rm -rf $RPM_BUILD_ROOT

View File

@ -15,6 +15,7 @@ import uuid
import httplib2 import httplib2
import socket import socket
import os import os
from oslo_concurrency import processutils
from fm_api import constants as fm_constants from fm_api import constants as fm_constants
import tsconfig.tsconfig as tsc import tsconfig.tsconfig as tsc
@ -34,28 +35,42 @@ class PluginObject(object):
self.hostname = '' # the name of this host self.hostname = '' # the name of this host
self.port = 0 # the port number for this plugin self.port = 0 # the port number for this plugin
self.base_eid = '' # the base entity id host=<hostname> self.base_eid = '' # the base entity id host=<hostname>
self.controller = False # set true if node is controller
# dynamic gate variables # dynamic gate variables
self.virtual = False # set to True if host is virtual
self.config_complete = False # set to True once config is complete self.config_complete = False # set to True once config is complete
self.config_done = False # set true if config_func completed ok self.config_done = False # set true if config_func completed ok
self.init_done = False # set true if init_func completed ok self.init_done = False # set true if init_func completed ok
self.fm_connectivity = False # set true when fm connectivity ok
self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL
self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS
self.suppression = True
self.service_affecting = False
# dynamic variables set in read_func # dynamic variables set in read_func
self.usage = float(0) # last usage value recorded as float self.usage = float(0) # last usage value recorded as float
self.value = float(0) # last read value
self.audits = 0 # number of audit since init self.audits = 0 # number of audit since init
self.enabled = False # tracks a plugin's enabled state self.enabled = False # tracks a plugin's enabled state
self.alarmed = False # tracks the current alarmed state self.alarmed = False # tracks the current alarmed state
self.mode = '' # mode specific to plugin
# http and json specific variables # http and json specific variables
self.url = url # target url self.url = url # target url
self.jresp = None # used to store the json response self.jresp = None # used to store the json response
self.resp = '' self.resp = ''
self.objects = [] # list of plugin specific objects
self.cmd = '' # plugin specific command string
# Log controls # Log controls
self.config_logged = False # used to log once the plugin config self.config_logged = False # used to log once the plugin config
self.error_logged = False # used to prevent log flooding self.error_logged = False # used to prevent log flooding
self.log_throttle_count = 0 # used to count throttle logs self.log_throttle_count = 0 # used to count throttle logs
self.INIT_LOG_THROTTLE = 10 # the init log throttle threshold self.INIT_LOG_THROTTLE = 10 # the init log throttle threshold
self.phase = 0 # tracks current phase; init, sampling
collectd.debug("%s Common PluginObject constructor [%s]" % collectd.debug("%s Common PluginObject constructor [%s]" %
(plugin, url)) (plugin, url))
@ -114,6 +129,39 @@ class PluginObject(object):
return None return None
###########################################################################
#
# Name : is_virtual
#
# Description: Execute facter command with output filter on 'is_virtual'
#
# Parameters : None
#
# Returns : True if current host is virtual.
# False if current host is NOT virtual
#
###########################################################################
def is_virtual(self):
    """Check whether this host is virtual.

    Runs '/usr/bin/facter is_virtual' and inspects its output.

    Returns:
        True if facter printed 'true' and nothing on stderr.
        False otherwise, including when the command could not be
        executed (the failure is logged, not raised).
    """

    try:
        cmd = '/usr/bin/facter is_virtual'
        res, err = processutils.execute(cmd, shell=True)
        if err:
            return False
        elif res:
            # remove the trailing '\n' with strip()
            if res.strip() == 'true':
                collectd.info("%s %s is virtual" %
                              (self.plugin, self.hostname))
                return True

    except Exception as ex:
        # The format arguments must be a tuple ; without the parentheses
        # '%' binds to self.plugin alone and the log call itself raises
        # "TypeError: not enough arguments for format string".
        collectd.info("%s failed to execute '/usr/bin/facter' ; %s" %
                      (self.plugin, ex))

    return False
########################################################################### ###########################################################################
# #
# Name : check_for_fit # Name : check_for_fit

View File

@ -0,0 +1,15 @@
<Plugin "threshold">
<Plugin "ptp">
<Type "time_offset">
Instance "nsec"
Persist true
PersistOK true
WarningMax 1000
FailureMax 1000000
WarningMin -1000
FailureMin -1000000
Hits 2
Invert false
</Type>
</Plugin>
</Plugin>

View File

@ -0,0 +1,971 @@
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
############################################################################
#
# This file is the collectd 'Precision Time Protocol' Service Monitor.
#
# Algorithm:
#
# while not config ; check again
# while not init ; retry
# if startup
# clear all ptp alarms
# if ptp enabled
# if ptp not running
# raise 'process' alarm
# else
# read grand master and current skew
# if not controller and is grand master
# raise 'no lock' alarm
# if skew is out-of-tolerance
# raise out-of-tolerance alarm
#
#
# manage alarm state throughout
# retry on alarm state change failures
# only make raise/clear alarm calls on severity state changes
#
############################################################################
import os
import collectd
import subprocess
import tsconfig.tsconfig as tsc
import plugin_common as pc
from fm_api import constants as fm_constants
from fm_api import fm_api
# Debug switch ; not referenced in the visible portion of this file.
debug = False

# Fault manager API Object
api = fm_api.FaultAPIsV2()

# FM alarm ID passed to every clear_fault / Fault call made by this plugin
PLUGIN_ALARMID = "100.119"

# name of the plugin - all logs produced by this plugin are prefixed with this
PLUGIN = 'ptp plugin'

# Service name
PTP = 'Precision Time Protocol (PTP)'

# Plugin audit (monitoring) interval in seconds
PLUGIN_AUDIT_INTERVAL = 60

# Sample Data 'type' and 'instance' database field values.
PLUGIN_TYPE = 'time_offset'
PLUGIN_TYPE_INSTANCE = 'nsec'

# Primary PTP service name
PLUGIN_SERVICE = 'ptp4l.service'

# Plugin configuration file
#
# This plugin looks for the timestamping mode in the ptp4l config file.
# time_stamping hardware
#
PLUGIN_CONF_FILE = '/etc/ptp4l.conf'
PLUGIN_CONF_TIMESTAMPING = 'time_stamping'

# Tools used by plugin
SYSTEMCTL = '/usr/bin/systemctl'
ETHTOOL = '/usr/sbin/ethtool'
PLUGIN_STATUS_QUERY_EXEC = '/usr/sbin/pmc'

# Query PTP service administrative (enabled/disabled) state
#
# > systemctl is-enabled ptp4l
# enabled
# > systemctl disable ptp4l
# > systemctl is-enabled ptp4l
# disabled
SYSTEMCTL_IS_ENABLED_OPTION = 'is-enabled'
SYSTEMCTL_IS_ENABLED_RESPONSE = 'enabled'
SYSTEMCTL_IS_DISABLED_RESPONSE = 'disabled'

# Query PTP service activity (active=running / inactive) state
#
# > systemctl is-active ptp4l
# active
# > systemctl stop ptp4l
# > systemctl is-active ptp4l
# inactive
SYSTEMCTL_IS_ACTIVE_OPTION = 'is-active'
SYSTEMCTL_IS_ACTIVE_RESPONSE = 'active'
SYSTEMCTL_IS_INACTIVE_RESPONSE = 'inactive'

# Alarm Cause codes ; used to specify what alarm EID to assert or clear.
# The three UNSUPPORTED_* codes are the per-interface alarms ; the others
# are created once per host (see create_interface_alarm_objects).
ALARM_CAUSE__NONE = 0
ALARM_CAUSE__PROCESS = 1
ALARM_CAUSE__OOT = 2
ALARM_CAUSE__NO_LOCK = 3
ALARM_CAUSE__UNSUPPORTED_HW = 4
ALARM_CAUSE__UNSUPPORTED_SW = 5
ALARM_CAUSE__UNSUPPORTED_LEGACY = 6

# Run Phase ; stored in obj.phase by read_func to track which stage of the
# audit the plugin is in.
RUN_PHASE__INIT = 0
RUN_PHASE__DISABLED = 1
RUN_PHASE__NOT_RUNNING = 2
RUN_PHASE__SAMPLING = 3

# Clock Sync Out-Of-Tolerance thresholds
# NOTE(review): presumably in nanoseconds, matching PLUGIN_TYPE_INSTANCE
# and the 1000 / 1000000 limits in ptp.conf - confirm.
OOT_MINOR_THRESHOLD = int(1000)
OOT_MAJOR_THRESHOLD = int(1000000)

# Instantiate the common plugin control object
obj = pc.PluginObject(PLUGIN, "")
# Create an alarm management class
class PTP_alarm_object:
    """State record for a single PTP alarm.

    One instance exists per alarm cause and, for the interface specific
    causes, per interface. Instances start out cleared, with no cause
    code and empty text fields ; the creator fills them in.
    """

    def __init__(self, interface=None):
        # What the alarm is about
        self.interface = interface
        self.alarm = ALARM_CAUSE__NONE
        self.eid = ''

        # What gets reported to FM when the alarm is raised
        self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
        self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50
        self.reason = ''
        self.repair = ''

        # Whether the plugin currently considers the alarm asserted
        self.raised = False
# Plugin specific control class and object.
class PTP_ctrl_object:
    """Plugin specific control data.

    Holds a log throttle counter and direct references to the three
    host level alarm objects so they need not be looked up by cause.
    """

    def __init__(self):
        # Host level alarm object handles ; unset until created.
        self.process_alarm_object = None
        self.oot_alarm_object = None
        self.nolock_alarm_object = None

        # Grand Master log throttle counter
        self.gm_log_throttle = 0
# The single plugin control object instance
ctrl = PTP_ctrl_object()

# Alarm object list, one entry for each interface and alarm cause case
ALARM_OBJ_LIST = []
# UT verification utilities
def assert_all_alarms():
    """UT helper ; request a raise of every alarm in ALARM_OBJ_LIST.

    Each alarm is raised with a sample reading of 0.
    """
    for entry in ALARM_OBJ_LIST:
        raise_alarm(entry.alarm, entry.interface, 0)
def clear_all_alarms():
    """UT helper ; clear every alarm in ALARM_OBJ_LIST and log each result."""
    for entry in ALARM_OBJ_LIST:
        outcome = 'cleared' if clear_alarm(entry.eid) is True \
            else 'clear failed'
        collectd.info("%s %s:%s alarm %s" %
                      (PLUGIN, PLUGIN_ALARMID, entry.eid, outcome))
def print_alarm_object(o):
    """UT helper ; dump one alarm object to the collectd info log.

    Logs a summary line followed by the entity id, reason text and
    repair text, one log per field.
    """
    summary = (PLUGIN, o.interface, o.alarm, o.severity, o.raised)
    collectd.info("%s Interface:%s Cause: %d Severity:%s Raised:%d" % summary)

    detail_logs = (("%s Entity:[%s]", o.eid),
                   ("%s Reason:[%s]", o.reason),
                   ("%s Repair:[%s]", o.repair))
    for fmt, text in detail_logs:
        collectd.info(fmt % (PLUGIN, text))
def print_alarm_objects():
    """UT helper ; dump every alarm object in ALARM_OBJ_LIST to the log."""
    for entry in ALARM_OBJ_LIST:
        print_alarm_object(entry)
# Interface:Supported Modes dictionary. key:value
#
# interface:modes
#
# Populated by init_func with the result of _get_supported_modes for each
# [interface] section found in the ptp4l config file ; a value may be None.
interfaces = {}
#####################################################################
#
# Name : _get_supported_modes
#
# Description: Invoke ethtool -T <interface> and load its
# time stamping capabilities.
#
# hardware, software or legacy.
#
# Parameters : The name of the physical interface to query the
# supported modes for.
#
# Interface Capabilities Output Examples:
#
# vbox prints this as it only supports software timestamping
# software-transmit (SOF_TIMESTAMPING_TX_SOFTWARE)
# software-receive (SOF_TIMESTAMPING_RX_SOFTWARE)
#
# full support output looks like this
# hardware-transmit (SOF_TIMESTAMPING_TX_HARDWARE)
# software-transmit (SOF_TIMESTAMPING_TX_SOFTWARE)
# hardware-receive (SOF_TIMESTAMPING_RX_HARDWARE)
# software-receive (SOF_TIMESTAMPING_RX_SOFTWARE)
# hardware-raw-clock (SOF_TIMESTAMPING_RAW_HARDWARE)
#
# Only legacy support output looks like this
# hardware-raw-clock (SOF_TIMESTAMPING_RAW_HARDWARE)
#
# Provisionable PTP Modes are
# hardware -> hardware-transmit/receive
# software -> software-transmit/receive
# legacy -> hardware-raw-clock
# Mode names appended to the list returned by _get_supported_modes
TIMESTAMP_MODE__HW = 'hardware'
TIMESTAMP_MODE__SW = 'software'
TIMESTAMP_MODE__LEGACY = 'legacy'
#
# Returns : a list of supported modes
#
#####################################################################
def _get_supported_modes(interface):
    """Get the supported timestamping modes for the specified interface.

    Runs 'ethtool -T <interface>' and scans the lines that follow the
    'Capabilities' label, stopping at the 'PTP Hardware Clock' label.

    Parameters:
        interface: name of the interface to query.

    Returns:
        A list of TIMESTAMP_MODE__* names ; empty if no capability was
        advertised. None if ethtool could not be run or printed nothing.
    """

    hw_tx = hw_rx = sw_tx = sw_rx = False
    modes = []

    try:
        # universal_newlines yields text rather than bytes so the
        # substring checks below behave the same on Python 2 and 3.
        output = subprocess.check_output([ETHTOOL, '-T', interface],
                                         universal_newlines=True)
    except (subprocess.CalledProcessError, OSError) as ex:
        # A missing tool or unknown interface should not abort the caller.
        collectd.error("%s 'ethtool -T %s' failed ; %s" %
                       (PLUGIN, interface, ex))
        return None

    # Emptiness has to be tested before splitting ; ''.split('\n') is ['']
    # which is truthy and would hide the no-output case.
    if not output.strip():
        collectd.info("%s no ethtool output for %s" % (PLUGIN, interface))
        return None

    data = output.split('\n')
    collectd.debug("%s 'ethtool -T %s' output:%s\n" %
                   (PLUGIN, interface, data))

    check_for_modes = False
    for i, line in enumerate(data):
        collectd.debug("%s data[%d]:%s\n" % (PLUGIN, i, line))
        if 'Capabilities' in line:
            # start of capabilities list
            check_for_modes = True

        elif check_for_modes is True:
            if 'PTP Hardware Clock' in line:
                # no more modes after this label
                break
            elif 'hardware-transmit' in line:
                hw_tx = True
            elif 'hardware-receive' in line:
                hw_rx = True
            elif 'software-transmit' in line:
                sw_tx = True
            elif 'software-receive' in line:
                sw_rx = True
            elif 'hardware-raw-clock' in line:
                modes.append(TIMESTAMP_MODE__LEGACY)

    # A mode is only supported if both its transmit and receive
    # capabilities were advertised.
    if sw_tx is True and sw_rx is True:
        modes.append(TIMESTAMP_MODE__SW)

    if hw_tx is True and hw_rx is True:
        modes.append(TIMESTAMP_MODE__HW)

    if modes:
        collectd.debug("%s %s interface PTP capabilities: %s" %
                       (PLUGIN, interface, modes))
    else:
        collectd.info("%s no capabilities advertised for %s" %
                      (PLUGIN, interface))

    return modes
#####################################################################
#
# Name : get_alarm_object
#
# Description: Search the alarm list based on the alarm cause
# code and interface.
#
# Returns : Alarm object if found ; otherwise None
#
#####################################################################
def get_alarm_object(alarm, interface=None):
    """Look up an alarm object by cause code and, optionally, interface.

    Parameters:
        alarm: the ALARM_CAUSE__* code to search for.
        interface: when not None the object's interface must match too.

    Returns:
        The first matching object in ALARM_OBJ_LIST ; None, after
        logging the failed lookup, if there is no match.
    """

    any_interface = interface is None
    for candidate in ALARM_OBJ_LIST:
        # print_alarm_object(candidate)
        if not any_interface and candidate.interface != interface:
            continue
        if candidate.alarm == alarm:
            return candidate

    collectd.info("%s alarm object lookup failed ; %d:%s" %
                  (PLUGIN, alarm, interface))
    return None
#####################################################################
#
# Name : clear_alarm
#
# Description: Clear the ptp alarm with the specified entity ID.
#
# Returns : True if operation succeeded
# False if there was an error exception.
#
# Assumptions: Caller can decide to retry based on return status.
#
#####################################################################
def clear_alarm(eid):
    """Clear the ptp alarm with the specified entity ID.

    Returns:
        True if the clear_fault call completed, whether or not an alarm
        was actually found. False if the call raised an exception ; the
        caller can decide to retry.
    """

    try:
        was_cleared = api.clear_fault(PLUGIN_ALARMID, eid) is True
        if not was_cleared:
            collectd.info("%s %s:%s alarm clear ; None found" %
                          (PLUGIN, PLUGIN_ALARMID, eid))
        else:
            collectd.info("%s %s:%s alarm cleared" %
                          (PLUGIN, PLUGIN_ALARMID, eid))
        return True

    except Exception as ex:
        collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" %
                       (PLUGIN, PLUGIN_ALARMID, eid, ex))
        return False
#####################################################################
#
# Name : raise_alarm
#
# Description: Assert a specific PTP alarm based on the alarm cause
# code and interface.
#
# Handle special case cause codes
# Handle failure to raise fault
#
# Assumptions: Short circuited Success return if the alarm is
# already known to be asserted.
#
# Returns : False on Failure
# True on Success
#
#####################################################################
def raise_alarm(alarm_cause, interface=None, data=0):
    """Assert a cause based PTP alarm.

    Parameters:
        alarm_cause: ALARM_CAUSE__* code selecting the alarm to assert.
        interface: interface name for the interface specific alarms ;
                   None selects on the cause code alone.
        data: latest sample reading ; only used by ALARM_CAUSE__OOT to
              build the reason text.

    Returns:
        True if the fault was set, if a non out-of-tolerance alarm is
        already raised, or if no alarm object matches the request.
        False if the set_fault call failed or raised an exception.
    """

    collectd.debug("%s Raising Alarm %d" % (PLUGIN, alarm_cause))

    alarm = get_alarm_object(alarm_cause, interface)
    if alarm is None:
        # log created for None case in the get_alarm_object util
        return True

    # copy the reason as it might be updated for the OOT,
    # most typical, case.
    reason = alarm.reason

    # Handle some special cases
    #
    if alarm_cause == ALARM_CAUSE__OOT:
        # If this is an out of tolerance alarm then add the
        # out of tolerance reading to the reason string before
        # asserting the alarm.
        #
        # Keep the alarm updated with the latest sample reading
        # and severity even if its already asserted.
        skew = abs(float(data))
        if skew > 100000000000:
            reason += 'more than 100 seconds'
        elif skew > 10000000000:
            reason += 'more than 10 seconds'
        elif skew > 1000000000:
            reason += 'more than 1 second'
        elif skew > 1000000:
            # Floor division keeps a whole number reading on Python 3
            # as well ; '/' would yield a float there.
            reason += str(abs(int(data)) // 1000000)
            reason += ' millisecs'
        elif skew > 1000:
            reason += str(abs(int(data)) // 1000)
            reason += ' microsecs'
        else:
            reason += str(float(data))
            reason += ' ' + PLUGIN_TYPE_INSTANCE

    elif alarm.raised is True:
        # If alarm already raised then exit.
        #
        # All other alarms are a Major so there is no need to
        # track a change in severity and update accordingly.
        return True

    elif alarm_cause == ALARM_CAUSE__PROCESS:
        # obj.mode is set to None when the ptp4l config file cannot be
        # loaded ; concatenating None would raise a TypeError here.
        mode = obj.mode if obj.mode is not None else 'unknown'
        reason = 'Provisioned ' + PTP + ' \'' + mode
        reason += '\' time stamping mode seems to be unsupported by this host'

    try:
        fault = fm_api.Fault(
            alarm_id=PLUGIN_ALARMID,
            alarm_state=fm_constants.FM_ALARM_STATE_SET,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
            entity_instance_id=alarm.eid,
            severity=alarm.severity,
            reason_text=reason,
            alarm_type=obj.alarm_type,
            probable_cause=alarm.cause,
            proposed_repair_action=alarm.repair,
            service_affecting=False,  # obj.service_affecting,
            suppression=True)  # obj.suppression)

        alarm_uuid = api.set_fault(fault)
        if pc.is_uuid_like(alarm_uuid) is False:
            # Report the failure so the caller does not mark the alarm
            # raised ; that way it will be retried at a later time.
            collectd.error("%s %s:%s set_fault failed:%s" %
                           (PLUGIN, PLUGIN_ALARMID, alarm.eid, alarm_uuid))
            return False
        else:
            collectd.info("%s %s:%s:%s alarm raised" %
                          (PLUGIN, PLUGIN_ALARMID, alarm.eid, alarm.severity))
            alarm.raised = True
            return True

    except Exception as ex:
        collectd.error("%s 'set_fault' exception ; %s:%s:%s ; %s" %
                       (PLUGIN,
                        PLUGIN_ALARMID,
                        alarm.eid,
                        alarm.severity,
                        ex))
    return False
#####################################################################
#
# Name : create_interface_alarm_objects
#
# Description: Create alarm objects for specified interface
#
#####################################################################
def create_interface_alarm_objects(interface=None):
    """Create alarm objects and append them to ALARM_OBJ_LIST.

    With no interface the three host level objects (process,
    out-of-tolerance, no-lock) are created and also referenced from
    ctrl. With an interface name the three 'unsupported timestamping
    mode' objects for that interface are created.
    """

    collectd.debug("%s Alarm Object Create: Interface:%s " %
                   (PLUGIN, interface))

    def _add(alarm, severity, reason, repair, eid, cause):
        # Build one alarm object, register it and hand it back.
        entry = PTP_alarm_object() if interface is None \
            else PTP_alarm_object(interface)
        entry.alarm = alarm
        entry.severity = severity
        entry.reason = reason
        entry.repair = repair
        entry.eid = eid
        entry.cause = cause
        ALARM_OBJ_LIST.append(entry)
        return entry

    if interface is not None:
        # Interface specific 'unsupported mode' alarms ; all three share
        # the reason prefix, the entity id prefix and the probable cause.
        for alarm, label, eid_tail, repair_tail in (
                (ALARM_CAUSE__UNSUPPORTED_HW,
                 ' Hardware timestamping',
                 '.unsupported=hardware-timestamping',
                 'Hardware timestamping is supported by this interface'),
                (ALARM_CAUSE__UNSUPPORTED_SW,
                 ' Software timestamping',
                 '.unsupported=software-timestamping',
                 'Software timestamping is supported by this interface'),
                (ALARM_CAUSE__UNSUPPORTED_LEGACY,
                 " Legacy timestamping",
                 '.unsupported=legacy-timestamping',
                 'Legacy or Raw Clock is supported by this host')):
            _add(alarm,
                 fm_constants.FM_ALARM_SEVERITY_MAJOR,
                 obj.hostname + " '" + interface + "' does not support " +
                 PTP + label,
                 'Check host hardware reference manual to verify PTP ' +
                 repair_tail,
                 obj.base_eid + '.ptp=' + interface + eid_tail,
                 fm_constants.ALARM_PROBABLE_CAUSE_7)  # 'config error'
        return

    # Host level alarms
    ctrl.process_alarm_object = _add(
        ALARM_CAUSE__PROCESS,
        fm_constants.FM_ALARM_SEVERITY_MAJOR,
        obj.hostname + ' does not support the provisioned ' +
        PTP + ' mode ',
        'Check host hardware reference manual ' +
        'to verify that the selected PTP mode is supported',
        obj.base_eid + '.ptp',
        fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN)  # 'unknown'

    ctrl.oot_alarm_object = _add(
        ALARM_CAUSE__OOT,
        fm_constants.FM_ALARM_SEVERITY_CLEAR,
        obj.hostname + ' ' + PTP + " clocking is out of tolerance by ",
        "Check quality of the clocking network",
        obj.base_eid + '.ptp=out-of-tolerance',
        fm_constants.ALARM_PROBABLE_CAUSE_50)  # THRESHOLD CROSS

    # Only applies to storage and worker nodes
    ctrl.nolock_alarm_object = _add(
        ALARM_CAUSE__NO_LOCK,
        fm_constants.FM_ALARM_SEVERITY_MAJOR,
        obj.hostname + ' is not locked to remote PTP Grand Master',
        'Check network',
        obj.base_eid + '.ptp=no-lock',
        fm_constants.ALARM_PROBABLE_CAUSE_51)  # timing-problem
#####################################################################
#
# Name : read_timestamp_mode
#
# Description: Refresh the timestamping mode if it changes
#
#####################################################################
def read_timestamp_mode():
    """Reload the timestamping mode from the ptp4l config file.

    Stores the value that follows the first usable 'time_stamping'
    setting in obj.mode and logs the mode when it differs from the
    previous one. Sets obj.mode to None if the config file is missing.
    """

    if not os.path.exists(PLUGIN_CONF_FILE):
        collectd.error("%s failed to load ptp4l configuration" % PLUGIN)
        obj.mode = None
        return

    current_mode = obj.mode
    with open(PLUGIN_CONF_FILE, 'r') as infile:
        for line in infile:
            if PLUGIN_CONF_TIMESTAMPING in line:
                fields = line.split()
                # A setting line with no value would otherwise raise
                # an IndexError ; skip it and keep looking.
                if len(fields) > 1:
                    obj.mode = fields[1]
                    break

    if obj.mode:
        if obj.mode != current_mode:
            collectd.info("%s Timestamping Mode: %s" %
                          (PLUGIN, obj.mode))
    else:
        collectd.error("%s failed to get Timestamping Mode" % PLUGIN)
#####################################################################
#
# Name : init_func
#
# Description: The collectd initialization entrypoint for
# this plugin
#
# Assumptions: called only once
#
# Algorithm : check for no
#
#
#####################################################################
def init_func():
    """The collectd initialization entrypoint for this plugin.

    Sets the host identity on the common plugin object, creates the host
    level alarm objects, then reads the ptp4l config file to learn the
    monitored interfaces (one [interface] section each) and the
    provisioned timestamping mode. Finishes by recording whether this
    node is a controller and whether it is virtual.

    Returns:
        False if the common plugin object reports it is not ready ;
        None otherwise, with obj.init_done set to True.
    """

    if obj.init_ready() is False:
        return False

    obj.hostname = obj.gethostname()
    obj.base_eid = 'host=' + obj.hostname

    # Create the interface independent alarm objects.
    create_interface_alarm_objects()

    # load monitored interfaces and supported modes
    if os.path.exists(PLUGIN_CONF_FILE):
        with open(PLUGIN_CONF_FILE, 'r') as infile:
            for line in infile:
                # The PTP interfaces used are specified in the ptp4l.conf
                # file as [interface]. There may be more than one.
                # Presently there is no need to track the function of the
                # interface ; namely mgmnt or oam.
                if line[0] == '[':
                    interface = line.split(']')[0].split('[')[1]
                    if interface and interface != 'global':
                        interfaces[interface] = _get_supported_modes(interface)
                        create_interface_alarm_objects(interface)

                if PLUGIN_CONF_TIMESTAMPING in line:
                    fields = line.split()
                    # Guard against a setting line with no value, which
                    # would otherwise abort init with an IndexError.
                    if len(fields) > 1:
                        obj.mode = fields[1]

        if obj.mode:
            collectd.info("%s Timestamping Mode: %s" %
                          (PLUGIN, obj.mode))
        else:
            collectd.error("%s failed to get Timestamping Mode" % PLUGIN)
    else:
        collectd.error("%s failed to load ptp4l configuration" % PLUGIN)
        obj.mode = None

    for key, value in interfaces.items():
        collectd.info("%s interface %s supports timestamping modes: %s" %
                      (PLUGIN, key, value))

    # remove '# to dump alarm object data
    # print_alarm_objects()

    if tsc.nodetype == 'controller':
        obj.controller = True

    obj.virtual = obj.is_virtual()
    obj.init_done = True
    obj.log_throttle_count = 0
    collectd.info("%s initialization complete" % PLUGIN)
#####################################################################
#
# Name : read_func
#
# Description: The collectd audit entrypoint for PTP Monitoring
#
# Assumptions: collectd calls init_func one time.
#
#
# retry init if needed
# retry fm connect if needed
# check service enabled state
# check service running state
# error -> alarm host=<hostname>.ptp
# check
#
#####################################################################
def read_func():
if obj.virtual is True:
return 0
# check and run init until it reports init_done True
if obj.init_done is False:
if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
collectd.info("%s re-running init" % PLUGIN)
obj.log_throttle_count += 1
init_func()
return 0
if obj.fm_connectivity is False:
try:
# query FM for existing alarms.
alarms = api.get_faults_by_id(PLUGIN_ALARMID)
except Exception as ex:
collectd.error("%s 'get_faults_by_id' exception ; %s" %
(PLUGIN, ex))
return 0
if alarms:
for alarm in alarms:
collectd.debug("%s found startup alarm '%s'" %
(PLUGIN, alarm.entity_instance_id))
eid = alarm.entity_instance_id
if eid is None:
collectd.error("%s startup alarm query error ; no eid" %
PLUGIN)
continue
# get the hostname host=<hostname>.stuff
# split over base eid and then
# compare that to this plugin's base eid
# ignore alarms not for this host
if eid.split('.')[0] != obj.base_eid:
continue
else:
# load the state of the specific alarm
instance = eid.split('.')[1].split('=')
if instance[0] == 'ptp':
# clear all ptp alarms on process startup
# just in case interface names have changed
# since the alarm was raised.
if clear_alarm(eid) is False:
# if we can't clear the alarm now then error out.
collectd.error("%s failed to clear startup "
"alarm %s:%s" %
(PLUGIN, PLUGIN_ALARMID, eid))
# try again next time around
return 0
else:
collectd.info("%s cleared startup alarm '%s'" %
(PLUGIN, alarm.entity_instance_id))
else:
if clear_alarm(eid) is False:
collectd.error("%s failed to clear invalid PTP "
"alarm %s:%s" %
(PLUGIN, PLUGIN_ALARMID,
alarm.entity_instance_id))
return 0
else:
collectd.info("%s cleared found invalid startup"
" alarm %s:%s" %
(PLUGIN,
PLUGIN_ALARMID,
alarm.entity_instance_id))
else:
collectd.info("%s no startup alarms found" % PLUGIN)
obj.config_complete = True
obj.fm_connectivity = True
# assert_all_alarms()
# This plugin supports PTP in-service state change by checking
# service state on every audit ; every 5 minutes.
data = subprocess.check_output([SYSTEMCTL,
SYSTEMCTL_IS_ENABLED_OPTION,
PLUGIN_SERVICE])
collectd.debug("%s PTP admin state:%s" % (PLUGIN, data.rstrip()))
if data.rstrip() == SYSTEMCTL_IS_DISABLED_RESPONSE:
# Manage execution phase
if obj.phase != RUN_PHASE__DISABLED:
obj.phase = RUN_PHASE__DISABLED
obj.log_throttle_count = 0
if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
collectd.info("%s PTP Service Disabled" % PLUGIN)
obj.log_throttle_count += 1
for o in ALARM_OBJ_LIST:
if o.raised is True:
if clear_alarm(o.eid) is True:
o.raised = False
else:
collectd.error("%s %s:%s clear alarm failed "
"; will retry" %
(PLUGIN, PLUGIN_ALARMID, o.eid))
return 0
data = subprocess.check_output([SYSTEMCTL,
SYSTEMCTL_IS_ACTIVE_OPTION,
PLUGIN_SERVICE])
if data.rstrip() == SYSTEMCTL_IS_INACTIVE_RESPONSE:
# Manage execution phase
if obj.phase != RUN_PHASE__NOT_RUNNING:
obj.phase = RUN_PHASE__NOT_RUNNING
obj.log_throttle_count = 0
if ctrl.process_alarm_object.alarm == ALARM_CAUSE__PROCESS:
if ctrl.process_alarm_object.raised is False:
collectd.error("%s PTP service enabled but not running" %
PLUGIN)
if raise_alarm(ALARM_CAUSE__PROCESS) is True:
ctrl.process_alarm_object.raised = True
# clear all other alarms if the 'process' alarm is raised
elif ctrl.process_alarm_object.raised is True:
if clear_alarm(ctrl.process_alarm_object.eid) is True:
msg = 'cleared'
ctrl.process_alarm_object.raised = False
else:
msg = 'failed to clear'
collectd.info("%s %s %s:%s" %
(PLUGIN, msg, PLUGIN_ALARMID,
ctrl.process_alarm_object.eid))
return 0
# Handle clearing the 'process' alarm if it is asserted and
# the process is now running
if ctrl.process_alarm_object.raised is True:
if clear_alarm(ctrl.process_alarm_object.eid) is True:
ctrl.process_alarm_object.raised = False
collectd.info("%s PTP service enabled and running" % PLUGIN)
# Auto refresh the timestamping mode in case collectd runs
# before the ptp manifest or the mode changes on the fly by
# an in-service manifest.
# Every 4 audits.
obj.audits += 1
if not obj.audits % 4:
read_timestamp_mode()
# Manage execution phase
if obj.phase != RUN_PHASE__SAMPLING:
obj.phase = RUN_PHASE__SAMPLING
obj.log_throttle_count = 0
# Let's read the clock info, Grand Master sig and skew
#
# sudo /usr/sbin/pmc -u -b 0 'GET TIME_STATUS_NP'
#
data = subprocess.check_output([PLUGIN_STATUS_QUERY_EXEC,
'-u', '-b', '0', 'GET TIME_STATUS_NP'])
got_master_offset = False
master_offset = 0
my_identity = ''
gm_identity = ''
gm_present = False
obj.resp = data.split('\n')
for line in obj.resp:
if 'RESPONSE MANAGEMENT TIME_STATUS_NP' in line:
collectd.debug("%s key : %s" %
(PLUGIN, line.split()[0].split('-')[0]))
my_identity = line.split()[0].split('-')[0]
if 'master_offset' in line:
collectd.debug("%s Offset : %s" % (PLUGIN, line.split()[1]))
master_offset = float(line.split()[1])
got_master_offset = True
if 'gmPresent' in line:
collectd.debug("%s gmPresent : %s" % (PLUGIN, line.split()[1]))
gm_present = line.split()[1]
if 'gmIdentity' in line:
collectd.debug("%s gmIdentity: %s" % (PLUGIN, line.split()[1]))
gm_identity = line.split()[1]
# Handle case where this host is the Grand Master
# ... or assumes it is.
if my_identity == gm_identity:
if obj.controller is False:
# Compute and storage nodes should not be the Grand Master
if ctrl.nolock_alarm_object.raised is False:
if raise_alarm(ALARM_CAUSE__NO_LOCK, None, 0) is True:
ctrl.nolock_alarm_object.raised = True
# produce a throttled log while this host is not locked to the GM
if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
collectd.info("%s %s not locked to remote Grand Master "
"(%s)" % (PLUGIN, obj.hostname, gm_identity))
obj.log_throttle_count += 1
# No samples for storage and compute nodes that are not
# locked to a Grand Master
return 0
else:
# Controllers can be a Grand Master ; throttle the log
if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
collectd.info("%s %s is Grand Master:%s" %
(PLUGIN, obj.hostname, gm_identity))
obj.log_throttle_count += 1
# The Grand Master's skew will always be 0, so there is no point
# in creating a sample for it.
return 0
# Handle clearing nolock alarm for computes and storage nodes
elif obj.controller is False:
if ctrl.nolock_alarm_object.raised is True:
if clear_alarm(ctrl.nolock_alarm_object.eid) is True:
ctrl.nolock_alarm_object.raised = False
# Keep this FIT test code but make it commented out for security
# if os.path.exists('/var/run/fit/ptp_data'):
# master_offset = 0
# with open('/var/run/fit/ptp_data', 'r') as infile:
# for line in infile:
# master_offset = int(line)
# got_master_offset = True
# collectd.info("%s using ptp FIT data skew:%d" %
# (PLUGIN, master_offset))
# break
# Send sample and Manage the Out-Of-Tolerance alarm
if got_master_offset is True:
if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
collectd.info("%s %s is collecting samples [%5d] "
"with Grand Master %s" %
(PLUGIN, obj.hostname,
float(master_offset), gm_identity))
obj.log_throttle_count += 1
# setup the sample structure and dispatch
val = collectd.Values(host=obj.hostname)
val.type = PLUGIN_TYPE
val.type_instance = PLUGIN_TYPE_INSTANCE
val.plugin = 'ptp'
val.dispatch(values=[float(master_offset)])
# Manage the sample OOT alarm severity
severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
if abs(master_offset) > OOT_MAJOR_THRESHOLD:
severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
elif abs(master_offset) > OOT_MINOR_THRESHOLD:
severity = fm_constants.FM_ALARM_SEVERITY_MINOR
# Handle clearing of Out-Of-Tolerance alarm
if severity == fm_constants.FM_ALARM_SEVERITY_CLEAR:
if ctrl.oot_alarm_object.raised is True:
if clear_alarm(ctrl.oot_alarm_object.eid) is True:
ctrl.oot_alarm_object.severity = \
fm_constants.FM_ALARM_SEVERITY_CLEAR
ctrl.oot_alarm_object.raised = False
else:
# Special Case:
# -------------
# Don't raise minor alarm when in software timestamping mode.
# Too much skew in software or legacy mode ; alarm would bounce.
# TODO: Consider making ptp a real time process
if severity == fm_constants.FM_ALARM_SEVERITY_MINOR \
and obj.mode != 'hardware':
return 0
# Handle debounce of the OOT alarm.
# Debounce by 1 for the same severity level.
if ctrl.oot_alarm_object.severity != severity:
ctrl.oot_alarm_object.severity = severity
# This keeps refreshing the alarm text with the current skew
# value while still debouncing on state transitions.
#
# Precision ... (PTP) clocking is out of tolerance by 1004 nsec
#
elif severity == fm_constants.FM_ALARM_SEVERITY_MINOR:
# Handle raising the Minor OOT Alarm.
rc = raise_alarm(ALARM_CAUSE__OOT, None, master_offset)
if rc is True:
ctrl.oot_alarm_object.raised = True
elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR:
# Handle raising the Major OOT Alarm.
rc = raise_alarm(ALARM_CAUSE__OOT, None, master_offset)
if rc is True:
ctrl.oot_alarm_object.raised = True
# Record the value that is alarmable
if severity != fm_constants.FM_ALARM_SEVERITY_CLEAR:
collectd.info("%s Grand Master ID: %s ; "
"HOST ID: %s ; "
"GM Present:%s ; "
"Skew:%5d" % (PLUGIN,
gm_identity,
my_identity,
gm_present,
master_offset))
else:
collectd.info("%s No Clock Sync" % PLUGIN)
return 0
# Register this plugin's callbacks with collectd: init_func as the init
# callback and read_func as the read callback, with the read callback's
# interval set to PLUGIN_AUDIT_INTERVAL.
collectd.register_init(init_func)
collectd.register_read(read_func, interval=PLUGIN_AUDIT_INTERVAL)

View File

@ -10,6 +10,7 @@ LoadPlugin python
Path "/proc/meminfo" Path "/proc/meminfo"
</Module> </Module>
Import "ntpq" Import "ntpq"
Import "ptp"
Import "interface" Import "interface"
<Module "interface"> <Module "interface">
Port 2122 Port 2122