############################################################################ # Copyright (c) 2018-2019 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # ############################################################################# # # This is the NTP connectivity monitor plugin for collectd. # # This plugin uses the industry standard ntpq exec to query NTP attributes. # # This plugin executes 'ntpq -np' to determined which provisioned servers # are reachable. The ntpq output includes Tally Code. The tally Code is # represented by the first character in each server's line item. # # The only ntpq output looked at by this plugin are the Tally Codes and # associated IPs. # # Tally Code Summary: # # A server is considered reachable only when the Tally Code is a * or a +. # A server is considered unreachable if the Tally Code is a ' ' (space) # A server with a '*' Tally Code is the 'selected' server. # # Here is an example of the ntpq command output # # remote refid st t when poll reach delay offset jitter # ============================================================================= # +192.168.204.104 206.108.0.133 2 u 203 1024 377 0.226 -3.443 1.137 # +97.107.129.217 200.98.196.212 2 u 904 1024 377 21.677 5.577 0.624 # 192.95.27.155 24.150.203.150 2 u 226 1024 377 15.867 0.381 1.124 # -97.107.129.217 200.98.196.212 2 u 904 1024 377 21.677 5.577 0.624 # *182.95.27.155 24.150.203.150 2 u 226 1024 377 15.867 0.381 1.124 # # The local controller node is not to be considered a reachable server and is # never alarmed if it is not reachable. # # Normal running modes with no alarms include # # 0 - All NTP servers are reachable and one is selected # 1 - No NTP servers are provisioned # # Failure modes that warrant alarms include # # 2 - None of the NTP servers are reachable - major alarm # 3 - Some NTP servers reachable and one is selected - server IP minor alarm # 4 - Some NTP servers reachable but none is selected - major alarm # # None of these failures result in a host being degraded. # # This script will only be run on the controller nodes. # # This script logs to daemon.log with the 'collectd' process label # ############################################################################### import os import subprocess import uuid import collectd from fm_api import constants as fm_constants from fm_api import fm_api import tsconfig.tsconfig as tsc import socket api = fm_api.FaultAPIsV2() PLUGIN = 'NTP query plugin' PLUGIN_INTERVAL = 600 # audit interval in secs PLUGIN_CONF = '/etc/ntp.conf' PLUGIN_EXEC = '/usr/sbin/ntpq' PLUGIN_EXEC_OPTIONS = '-pn' PLUGIN_ALARMID = "100.114" # define a class here that will persist over read calls class NtpqObject: # static variables set in init hostname = '' # the name of this host base_eid = '' # the eid for the major alarm init_complete = False # set to true once config is complete alarm_raised = False # True when the major alarm is asserted server_list_conf = [] # list of servers in the /etc/ntp.conf file server_list_ntpq = [] # list of servers in the ntpq -np output unreachable_servers = [] # list of unreachable servers reachable_servers = [] # list of reachable servers selected_server = 'None' # the ip address of the selected server selected_server_save = 'None' # the last selected server ; note change peer_selected = False # true when peer is selected # variables used to raise alarms to FM suppression = True service_affecting = False name = "NTP" alarm_type = fm_constants.FM_ALARM_TYPE_1 cause = fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN repair = "Monitor and if condition persists, " repair += "contact next level of support." # This plugin's class object - persists over read calls obj = NtpqObject() ############################################################################### # # Name : _add_unreachable_server # # Description: This private interface is used to add an ip to the # unreachable servers list. # # Parameters : IP address # ############################################################################### def _add_unreachable_server(ip=None): """Add ip to unreachable_servers list""" if ip: if ip not in obj.unreachable_servers: collectd.debug("%s adding '%s' to unreachable servers list: %s" % (PLUGIN, ip, obj.unreachable_servers)) obj.unreachable_servers.append(ip) collectd.info("%s added '%s' to unreachable servers list: %s" % (PLUGIN, ip, obj.unreachable_servers)) else: collectd.debug("%s ip '%s' already in unreachable_servers list" % (PLUGIN, ip)) else: collectd.error("%s _add_unreachable_server called with no IP" % PLUGIN) ############################################################################### # # Name : _raise_alarm # # Description: This private interface is used to raise NTP alarms. # # Parameters : Optional IP address # # If called with no or empty IP then a generic major alarm is raised. # If called with an IP then an IP specific minor alarm is raised. # # Returns : Error indication. # # True : is error. FM call failed to set the # alarm and needs to be retried. # # False: no error. FM call succeeds # ############################################################################### def _raise_alarm(ip=None): """Assert an NTP alarm""" if not ip: # Don't re-raise the alarm if its already raised if obj.alarm_raised is True: return False if obj.peer_selected: reason = "NTP cannot reach external time source; " \ "syncing with peer controller only" fm_severity = fm_constants.FM_ALARM_SEVERITY_MINOR else: reason = "NTP configuration does not contain any valid " reason += "or reachable NTP servers." fm_severity = fm_constants.FM_ALARM_SEVERITY_MAJOR eid = obj.base_eid else: reason = "NTP address " reason += ip reason += " is not a valid or a reachable NTP server." eid = obj.base_eid + '=' + ip fm_severity = fm_constants.FM_ALARM_SEVERITY_MINOR try: fault = fm_api.Fault( alarm_id=PLUGIN_ALARMID, alarm_state=fm_constants.FM_ALARM_STATE_SET, entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, entity_instance_id=eid, severity=fm_severity, reason_text=reason, alarm_type=obj.alarm_type, probable_cause=obj.cause, proposed_repair_action=obj.repair, service_affecting=obj.service_affecting, suppression=obj.suppression) alarm_uuid = api.set_fault(fault) if _is_uuid_like(alarm_uuid) is False: # Don't _add_unreachable_server list if the fm call failed. # That way it will be retried at a later time. collectd.error("%s 'set_fault' failed ; %s:%s ; %s" % (PLUGIN, PLUGIN_ALARMID, eid, alarm_uuid)) return 0 else: collectd.info("%s raised alarm %s:%s" % (PLUGIN, PLUGIN_ALARMID, eid)) if ip: _add_unreachable_server(ip) else: obj.alarm_raised = True except Exception as ex: collectd.error("%s 'set_fault' exception ; %s:%s:%s ; %s" % (PLUGIN, PLUGIN_ALARMID, eid, fm_severity, ex)) return 0 ############################################################################### # # Name : _clear_base_alarm # # Description: This private interface is used to clear the NTP base alarm. # # Parameters : None # # Returns : Error indication. # # False: is error. FM call failed to clear the # alarm and needs to be retried. # # True : no error. FM call succeeds # ############################################################################### def _clear_base_alarm(): """Clear the NTP base alarm""" try: if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is False: collectd.info("%s %s:%s alarm already cleared" % (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) else: collectd.info("%s %s:%s alarm cleared" % (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) obj.alarm_raised = False return True except Exception as ex: collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % (PLUGIN, PLUGIN_ALARMID, obj.base_eid, ex)) return False ############################################################################### # # Name : _remove_ip_from_unreachable_list # # Description: This private interface is used to remove the specified IP # from the unreachable servers list and clear its alarm if raised. # # Parameters : IP address # ############################################################################### def _remove_ip_from_unreachable_list(ip): """Remove an IP address from the unreachable list and clear its NTP alarms""" # remove from unreachable list if its there if ip and ip in obj.unreachable_servers: eid = obj.base_eid + '=' + ip collectd.debug("%s trying to clear alarm %s" % (PLUGIN, eid)) try: # clear the alarm if its asserted if api.clear_fault(PLUGIN_ALARMID, eid) is True: collectd.info("%s %s:%s alarm cleared " % (PLUGIN, PLUGIN_ALARMID, eid)) else: # alarm does not exist collectd.info("%s %s:%s alarm clear" % (PLUGIN, PLUGIN_ALARMID, eid)) obj.unreachable_servers.remove(ip) except Exception as ex: collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % (PLUGIN, PLUGIN_ALARMID, eid, ex)) ############################################################################### # # Name : _add_ip_to_ntpq_server_list # # Description: This private interface is used to create a list if servers # found in the ntpq output. # # This list is used to detect and handle servers that might come # and go between readings that might otherwise result in stuck # alarms. # # Parameters : IP address # # Returns : nothing # ############################################################################### def _add_ip_to_ntpq_server_list(ip): """Add this IP to the list of servers that ntpq reports against""" if ip not in obj.server_list_ntpq: obj.server_list_ntpq.append(ip) ############################################################################## # # Name : _cleanup_stale_servers # # Description: This private interface walks through each server tracking list # removing any that it finds that are not in the ntpq server list. # # Alarms are cleared as needed to avoid stale alarms # # Parameters : None # # Returns : nothing # ############################################################################### def _cleanup_stale_servers(): """Cleanup the server IP tracking lists""" collectd.debug("%s CLEANUP REACHABLE: %s %s" % (PLUGIN, obj.server_list_ntpq, obj.reachable_servers)) for ip in obj.reachable_servers: if ip not in obj.server_list_ntpq: collectd.info("%s removing missing '%s' server from reachable " "server list" % (PLUGIN, ip)) obj.reachable_servers.remove(ip) collectd.debug("%s CLEANUP UNREACHABLE: %s %s" % (PLUGIN, obj.server_list_ntpq, obj.unreachable_servers)) for ip in obj.unreachable_servers: if ip not in obj.server_list_ntpq: collectd.info("%s removing missing '%s' server from unreachable " "server list" % (PLUGIN, ip)) _remove_ip_from_unreachable_list(ip) ############################################################################### # # Name : _get_ntp_servers # # Description: This private interface reads the list of ntp servers from the # ntp.conf file # # Parameters : None # # Returns : nothing # # Updates : server_list_conf # ############################################################################### def _get_ntp_servers(): """Read the provisioned servers from the ntp conf file""" with open(PLUGIN_CONF, 'r') as infile: for line in infile: if line.startswith('server '): ip = line.rstrip().split(' ')[1] if ip not in obj.server_list_conf: obj.server_list_conf.append(ip) if len(obj.server_list_conf): collectd.info("%s server list: %s" % (PLUGIN, obj.server_list_conf)) else: ################################################################## # # Handle NTP_NOT_PROVISIONED (1) case # # There is no alarming for this case. # Clear any that may have been raised. # ################################################################## collectd.info("%s NTP Service Disabled ; no provisioned servers" % PLUGIN) # clear all alarms if obj.alarm_raised: _clear_base_alarm() if obj.unreachable_servers: for ip in obj.unreachable_servers: _remove_ip_from_unreachable_list(ip) ############################################################################### # # Name : is_controller # # Description: This private interface returns a True if the specified ip is # associated with a local controller. # # Parameters : IP address # # Returns : True or False # ############################################################################### def _is_controller(ip): """Returns True if this IP corresponds to one of the controllers""" collectd.debug("%s check if '%s' is a controller ip" % (PLUGIN, ip)) with open('/etc/hosts', 'r') as infile: for line in infile: # skip over file comment lines prefixed with '#' if line[0] == '#': continue # line format is 'ip' 'name' .... split_line = line.split() if len(split_line) >= 2: # look for exact match ip that contains controller in its name if split_line[0] == ip and 'controller' in line: collectd.debug("%s %s is a controller" % (PLUGIN, ip)) return True return False ############################################################################### # # Name : _is_ip_address # # Description: This private interface returns: # AF_INET if val is ipv4 # AF_INET6 if val is ipv6 # False if val is not a valid ip address # # Parameters : val is a uuid string # # Returns : socket.AF_INET for ipv4, socket.AF_INET6 for ipv6 # or False for invalid # ############################################################################### def _is_ip_address(val): try: socket.inet_pton(socket.AF_INET, val) return socket.AF_INET except socket.error: pass try: socket.inet_pton(socket.AF_INET6, val) return socket.AF_INET6 except socket.error: pass return False ############################################################################### # # Name : is_uuid_like # # Description: This private interface returns a True if the specified value is # a valid uuid. # # Parameters : val is a uuid string # # Returns : True or False # ############################################################################### def _is_uuid_like(val): """Returns validation of a value as a UUID""" try: return str(uuid.UUID(val)) == val except (TypeError, ValueError, AttributeError): return False ############################################################################### # # Name : config_func # # Description: The configuration interface this plugin publishes to collectd. # # collectd calls this interface one time on its process startup # when it loads this plugin. # # There is currently no specific configuration options to parse # for this plugin. # # Parameters : collectd config object # # Returns : zero # ############################################################################### def config_func(config): """Configure the plugin""" collectd.debug('%s config function' % PLUGIN) return 0 ############################################################################### # # Name : init_func # # Description: The initialization interface this plugin publishes to collectd. # # collectd calls this interface one time on its process startup # when it loads this plugin. # # 1. get hostname # 2. build base entity id for the NTP alarm # 3. query FM for existing NTP alarms # - base alarm is maintained and state loaded if it exists # - ntp ip minor alalrms are cleared on init. This is done to # auto correct ntp server IP address changes over process # restart ; avoid stuck alarms. # # Parameters : None # # Returns : zero # ############################################################################### def init_func(): # ntp query is for controllers only if tsc.nodetype != 'controller': return 0 # do nothing till config is complete. # init_func will be called again by read_func once config is complete. if os.path.exists(tsc.VOLATILE_CONTROLLER_CONFIG_COMPLETE) is False: return 0 # get current hostname obj.hostname = os.uname()[1] if not obj.hostname: collectd.error("%s failed to get hostname" % PLUGIN) return 1 obj.base_eid = 'host=' + obj.hostname + '.ntp' collectd.debug("%s on %s with entity id '%s'" % (PLUGIN, obj.hostname, obj.base_eid)) # get a list of provisioned ntp servers _get_ntp_servers() # manage existing alarms. try: alarms = api.get_faults_by_id(PLUGIN_ALARMID) except Exception as ex: collectd.error("%s 'get_faults_by_id' exception ; %s ; %s" % (PLUGIN, PLUGIN_ALARMID, ex)) return 0 if alarms: for alarm in alarms: eid = alarm.entity_instance_id # ignore alarms not for this host if obj.hostname not in eid: continue # maintain only the base alarm. if alarm.entity_instance_id != obj.base_eid: # clear any ntp server specific alarms over process restart # this is done to avoid the potential for stuck ntp ip alarms collectd.info("%s clearing found startup alarm '%s'" % (PLUGIN, alarm.entity_instance_id)) try: api.clear_fault(PLUGIN_ALARMID, alarm.entity_instance_id) except Exception as ex: collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % (PLUGIN, PLUGIN_ALARMID, alarm.entity_instance_id, ex)) return 0 else: obj.alarm_raised = True collectd.info("%s found alarm %s:%s" % (PLUGIN, PLUGIN_ALARMID, alarm.entity_instance_id)) # ensure the base alarm is cleared if there are no # provisioned servers. if not obj.server_list_conf: _clear_base_alarm() else: collectd.info("%s no major startup alarms found" % PLUGIN) obj.init_complete = True return 0 ############################################################################### # # Name : read_func # # Description: The sample read interface this plugin publishes to collectd. # # collectd calls this interface every audit interval. # # Runs ntpq -np to query NTP status and manages alarms based on # the result. # # See file header (above) for more specific behavioral detail. # # Should only run on a controller ; both # # Parameters : None # # Returns : zero or non-zero on significant error # ############################################################################### def read_func(): # ntp query is for controllers only if tsc.nodetype != 'controller': return 0 if obj.init_complete is False: if os.path.exists(tsc.VOLATILE_CONTROLLER_CONFIG_COMPLETE) is True: collectd.info("%s re-running init" % PLUGIN) init_func() return 0 # get a list if provisioned ntp servers _get_ntp_servers() # nothing to do while there are no provisioned NTP servers if len(obj.server_list_conf) == 0: return 0 # Do NTP Query data = subprocess.check_output([PLUGIN_EXEC, PLUGIN_EXEC_OPTIONS]) # Keep this FIT test code but make it commented out for security # # if os.path.exists('/var/run/fit/ntpq_data'): # data = '' # collectd.info("%s using ntpq FIT data" % PLUGIN) # with open('/var/run/fit/ntpq_data', 'r') as infile: # for line in infile: # data += line if not data: collectd.error("%s no data from query" % PLUGIN) return 0 # Get the ntp query output into a list of lines obj.ntpq = data.split('\n') # keep track of changes ; only log on changes reachable_list_changed = False unreachable_list_changed = False # Manage the selected server name # # save the old value so we can print a log if the selected server changes if obj.selected_server: obj.selected_server_save = obj.selected_server # always assume no selected server ; till its learned obj.selected_server = '' # start with a fresh empty list for this new run to populate obj.server_list_ntpq = [] # Loop through the ntpq output. # Ignore the first 2 lines ; just header data. for i in range(2, len(obj.ntpq)): # ignore empty or lines that are not long enough if len(obj.ntpq[i]) < 10: continue # log the ntpq output ; minus the 2 lines of header collectd.info("NTPQ: %s" % obj.ntpq[i]) # Unreachable servers are ones whose line start with a space ip = '' if obj.ntpq[i][0] == ' ': # get the ip address # example format of line:['', '132.163.4.102', '', '', '.INIT.', # get ip from index [1] of the list unreachable = obj.ntpq[i].split(' ')[1] if unreachable: # check to see if its a controller ip # we skip over controller ips if _is_controller(unreachable) is False: _add_ip_to_ntpq_server_list(unreachable) if unreachable not in obj.unreachable_servers: if _raise_alarm(unreachable) is False: unreachable_list_changed = True # if the FM call to raise the alarm worked then # add this ip to the unreachable list if its not # already in it _add_unreachable_server(unreachable) # Reachable servers are ones whose line start with a '+' elif obj.ntpq[i][0] == '+': # remove the '+' and get the ip ip = obj.ntpq[i].split(' ')[0][1:] elif obj.ntpq[i][0] == '*': # remove the '*' and get the ip cols = obj.ntpq[i].split(' ') ip = cols[0][1:] if ip: ip_family = _is_ip_address(ip) obj.peer_selected = _is_controller(ip) if ip != obj.selected_server and obj.alarm_raised is True: # a new ntp server is selected, old alarm may not be # valid _clear_base_alarm() obj.alarm_raised = False if obj.peer_selected is False: if obj.selected_server: # done update the selected server if more selections # are found. go with the first one found. collectd.info("%s additional selected server found" " '%s'; current selection is '%s'" % (PLUGIN, ip, obj.selected_server)) else: # update the selected server list obj.selected_server = ip collectd.debug("%s selected server is '%s'" % (PLUGIN, obj.selected_server)) else: # refer to peer refid = '' for i in range(1, len(cols)): if cols[i] != '': refid = cols[i] break if refid not in ('', '127.0.0.1') and \ not _is_controller(refid) and \ socket.AF_INET == ip_family: # ipv4, peer controller refer to a time source is not # itself or a controller (this node) obj.selected_server = ip collectd.debug("peer controller has a reliable " "source") # anything else is unreachable else: unreachable = obj.ntpq[i][1:].split(' ')[0] if _is_controller(unreachable) is False: _add_ip_to_ntpq_server_list(unreachable) if unreachable not in obj.unreachable_servers: if _raise_alarm(unreachable) is False: unreachable_list_changed = True # if the FM call to raise the alarm worked then # add this ip to the unreachable list if its not # already in it _add_unreachable_server(unreachable) if ip: # if the ip is valid then manage it if _is_controller(ip) is False: _add_ip_to_ntpq_server_list(ip) # add the ip to the reachable servers list # if its not already there if ip not in obj.reachable_servers: obj.reachable_servers.append(ip) reachable_list_changed = True # make sure this IP is no longer in the unreachable # list and that alarms for it are cleared _remove_ip_from_unreachable_list(ip) _cleanup_stale_servers() if obj.selected_server: if obj.selected_server != obj.selected_server_save: collectd.info("%s selected server changed from '%s' to '%s'" % (PLUGIN, obj.selected_server_save, obj.selected_server)) obj.selected_server_save = obj.selected_server if obj.alarm_raised is True: _clear_base_alarm() elif obj.alarm_raised is False: if obj.peer_selected: collectd.info("%s peer is selected" % PLUGIN) else: collectd.error("%s no selected server" % PLUGIN) if _raise_alarm() is False: obj.selected_server_save = 'None' # only log and act on changes if reachable_list_changed is True: if obj.reachable_servers: collectd.info("%s reachable servers: %s" % (PLUGIN, obj.reachable_servers)) if obj.alarm_raised is True: if obj.selected_server and obj.reachable_servers: _clear_base_alarm() else: collectd.error("%s no reachable servers" % PLUGIN) _raise_alarm() # only log changes if unreachable_list_changed is True: if obj.unreachable_servers: collectd.info("%s unreachable servers: %s" % (PLUGIN, obj.unreachable_servers)) else: collectd.info("%s all servers are reachable" % PLUGIN) # The sample published to the database is simply the number # of reachable servers if one is selected if not obj.selected_server: sample = 0 else: sample = len(obj.reachable_servers) # Dispatch usage value to collectd val = collectd.Values(host=obj.hostname) val.plugin = 'ntpq' val.type = 'absolute' val.type_instance = 'reachable' val.dispatch(values=[sample]) return 0 # register the config, init and read functions collectd.register_config(config_func) collectd.register_init(init_func) collectd.register_read(read_func, interval=PLUGIN_INTERVAL)