monitoring/collectd-extensions/src/fm_notifier.py

#
# Copyright (c) 2018-2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Version 1.0
#
############################################################################
#
# This file is the collectd 'FM Alarm' Notifier.
#
# This notifier debounces and then manages raising and clearing alarms
# and sending degrade assert and clear messages to maintenance based on
# Collectd resource usage severity notifications.
#
# Collectd process startup automatically calls this module's init_func which
# declares and initializes a plugObject class for plugin type in preparation
# for periodic ongoing monitoring where Collectd calls notify_func for each
# plugin and instance of that plugin every audit interval.
#
# All other class or common member functions implemented herein exist in
# support of that aformentioned initialization and periodic monitoring.
#
# Collectd provides information about each event as an object passed to the
# notification handler ; the notification object.
#
#    object.host              - the hostname.
#
#    object.plugin            - the name of the plugin aka resource.
#    object.plugin_instance   - plugin instance string i.e. say mountpoint
#                               for df plugin or numa? node for memory.
#    object.type,             - the unit i.e. percent or absolute.
#    object.type_instance     - the attribute i.e. free, used, etc.
#
#    object.severity          - a integer value 0=OK , 1=warning, 2=failure.
#    object.message           - a log-able message containing the above along
#                               with the value.
#
# This notifier uses the notification object to manage plugin/instance alarms.
#
# To avoid stuck alarms or missing alarms the plugin thresholds should be
# configured with Persist = true and persistOK = true. These controls tell
# Collectd to send notifications every audit interval regardless of state
# change.
#
# Persist   = False ; only send notifications on 'okay' to 'not okay' change.
# PersistOK = False ; only send notifications on 'not okay' to 'okay' change.
#
# With these both set to True in the threshold spec for the plugin then
# Collectd will call this notifier for each audit plugin/instance audit.
#
# Collectd supports only 2 threshold severities ; warning and failure.
# The 'failure' maps to 'critical' while 'warning' maps to 'major' in FM.
#
# To avoid unnecessary load on FM, this notifier maintains current alarm
# state and only makes an FM call on alarm state changes. Current alarm state
# is queried by the init function called by Collectd on process startup.
#
# Current alarm state is maintained by two severity lists for each plugin,
# a warnings list and a failures list.
#
# When a failure is reported against a specific plugin then that resources's
# entity_id is added to that plugin's alarm object's failures list. Similarly,
# warning assertions get their entity id added to plugin's alarm object's
# warnings list. Any entity id should only exist in one of the lists at one
# time or in none at all if the notification condition is 'okay' and the alarm
# is cleared.
#
# Adding Plugins:
#
# To add new plugin support just search for ADD_NEW_PLUGIN and add the data
# requested in that area.
#
# Example commands to read samples from the influx database
#
# SELECT * FROM df_value WHERE instance='root' AND type='percent_bytes' AND
#                                                      type_instance='used'
# SELECT * FROM cpu_value WHERE type='percent' AND type_instance='used'
# SELECT * FROM memory_value WHERE type='percent' AND type_instance='used'
#
############################################################################
#
# Import list

# UT imports
import os
import re
import socket
import collectd
from threading import RLock as Lock
from oslo_utils import encodeutils
from fm_api import constants as fm_constants
from fm_api import fm_api
import tsconfig.tsconfig as tsc
import plugin_common as pc

# only load influxdb on the controller
if tsc.nodetype == 'controller':
    from influxdb import InfluxDBClient

api = fm_api.FaultAPIsV2()

# Debug control
debug = False
debug_lists = False
want_state_audit = False
want_vswitch = False

# Number of notifier loop between each audit.
# @ 30 sec interval audit rate is every 5 minutes
AUDIT_RATE = 10

# write a 'value' log on a the resource sample change of more than this amount
LOG_STEP = 10

# Same state message throttle count.
# Only send the degrade message every 'this' number
# while the state of assert or clear remains the same.
ONE_EVERY = 20

# This plugin name
PLUGIN = 'alarm notifier'

# This plugin's degrade function
PLUGIN_DEGRADE = 'degrade notifier'

# the name of the collectd samples database
DATABASE_NAME = 'collectd samples'

READING_TYPE__PERCENT_USAGE = '% usage'

# Default invalid threshold value
INVALID_THRESHOLD = float(-1)

# 3 minute alarm assertion debounce
# 1 minute alarm clear debounce
#    assuming 30 second interval
DEBOUNCE_FROM_CLEAR_THLD = 7  # (((3 * 60) / 30) + 1)
DEBOUNCE_FROM_ASSERT_THLD = 3

# collectd severity definitions ;
# Note: can't seem to pull then in symbolically with a header
NOTIF_FAILURE = 1
NOTIF_WARNING = 2
NOTIF_OKAY = 4

PASS = 0
FAIL = 1

# Maintenance Degrade Service definitions

# default mtce port.
# ... with configuration override
MTCE_CMD_RX_PORT = 2101

# Filesystem plugin_instances are mangled by collectd.
# For instance the "/var/log" MountPoint instance is
# reported as "var-log".
#
# The following is a dictionary that provides mapping between the
# stock df plugin instance name and the linux filesystem pATH where the
# key = mangled filesystem instance from stock df plugin
# val = actual filesystem mountpoint path
#
# ADD_NEW_PLUGIN if there are new file systems being added that
# have subdirectories in the name then they will need to be added
# to the mangled list
DF_MANGLED_DICT = {
    # instance : path
    'root': '/',
    'dev': '/dev',
    'tmp': '/tmp',
    'boot': '/boot',
    'scratch': '/scratch',
    'dev-shm': '/dev/shm',
    'var-log': '/var/log',
    'var-run': '/var/run',
    'var-lock': '/var/lock',
    'var-lib-rabbitmq': '/var/lib/rabbitmq',
    'var-lib-postgresql': '/var/lib/postgresql',
    'var-lib-ceph-mon': '/var/lib/ceph/mon',
    'var-lib-docker': '/var/lib/docker',
    'var-lib-docker-distribution': '/var/lib/docker-distribution',
    'var-lib-kubelet': '/var/lib/kubelet',
    'var-lib-nova-instances': '/var/lib/nova/instances',
    'opt-platform': '/opt/platform',
    'opt-etcd': '/opt/etcd',
    'opt-extension': '/opt/extension',
    'opt-backups': '/opt/backups'}


# ADD_NEW_PLUGIN: add new alarm id definition
ALARM_ID__CPU = "100.101"
ALARM_ID__MEM = "100.103"
ALARM_ID__DF = "100.104"

ALARM_ID__VSWITCH_CPU = "100.102"
ALARM_ID__VSWITCH_MEM = "100.115"
ALARM_ID__VSWITCH_PORT = "300.001"
ALARM_ID__VSWITCH_IFACE = "300.002"


# ADD_NEW_PLUGIN: add new alarm id to the list
ALARM_ID_LIST = [ALARM_ID__CPU,
                 ALARM_ID__MEM,
                 ALARM_ID__DF,
                 ALARM_ID__VSWITCH_CPU,
                 ALARM_ID__VSWITCH_MEM,
                 ALARM_ID__VSWITCH_PORT,
                 ALARM_ID__VSWITCH_IFACE]

AUDIT_ALARM_ID_LIST = [ALARM_ID__CPU,
                       ALARM_ID__MEM,
                       ALARM_ID__DF]

# ADD_NEW_PLUGIN: add plugin name definition
# WARNING: This must line up exactly with the plugin
#          filename without the extension.
PLUGIN__DF = "df"
PLUGIN__CPU = "cpu"
PLUGIN__MEM = "memory"
PLUGIN__INTERFACE = "interface"
PLUGIN__NTP_QUERY = "ntpq"
PLUGIN__VSWITCH_PORT = "vswitch_port"
PLUGIN__VSWITCH_CPU = "vswitch_cpu"
PLUGIN__VSWITCH_MEM = "vswitch_mem"
PLUGIN__VSWITCH_IFACE = "vswitch_iface"

# ADD_NEW_PLUGIN: add plugin name to list
PLUGIN_NAME_LIST = [PLUGIN__CPU,
                    PLUGIN__MEM,
                    PLUGIN__DF,
                    PLUGIN__VSWITCH_CPU,
                    PLUGIN__VSWITCH_MEM,
                    PLUGIN__VSWITCH_PORT,
                    PLUGIN__VSWITCH_IFACE]

# Used to find plugin name based on alarm id
# for managing degrade for startup alarms.
ALARM_ID__TO__PLUGIN_DICT = {ALARM_ID__CPU: PLUGIN__CPU,
                             ALARM_ID__MEM: PLUGIN__MEM,
                             ALARM_ID__DF: PLUGIN__DF,
                             ALARM_ID__VSWITCH_CPU: PLUGIN__VSWITCH_CPU,
                             ALARM_ID__VSWITCH_MEM: PLUGIN__VSWITCH_MEM,
                             ALARM_ID__VSWITCH_PORT: PLUGIN__VSWITCH_PORT,
                             ALARM_ID__VSWITCH_IFACE: PLUGIN__VSWITCH_IFACE}

# Common plugin object
pluginObject = pc.PluginObject(PLUGIN, '')


#########################################
# The collectd Maintenance Degrade Object
#########################################
class DegradeObject:

    def __init__(self, port):
        """DegradeObject Class constructor"""

        # maintenance port for degrade messages
        self.port = port

        # controller floating address
        self.addr = None

        # specifies the protocol family to use when messaging maintenance.
        # if system is IPV6, then that is learned and this 'protocol' is
        # updated with AF_INET6
        self.protocol = socket.AF_INET

        self.resource = ""

        # List of plugin names that require degrade for specified severity.
        self.degrade_list__failure = [PLUGIN__DF,
                                      PLUGIN__MEM,
                                      PLUGIN__CPU,
                                      PLUGIN__INTERFACE]
        self.degrade_list__warning = [PLUGIN__INTERFACE]

        # The running list of resources that require degrade.
        # a degrade clear message is sent whenever this list is empty.
        # a degrade assert message is sent whenever this list is not empty.
        self.degrade_list = []

        # throttle down sending of duplicate degrade assert/clear messages
        self.last_state = "undef"
        self.msg_throttle = 0

    ##########################################################################
    #
    # Name    : _get_active_controller_ip
    #
    # Purpose : Lookup the active controller's ip address.
    #
    # Updates : self.addr with the active controller's address or
    #           None if lookup fails.
    #
    # Returns : Nothing
    #
    ##########################################################################
    def _get_active_controller_ip(self):
        """Get the active controller host IP"""

        try:
            self.addr = socket.getaddrinfo('controller', None)[0][4][0]
            collectd.info("%s controller ip: %s" %
                          (PLUGIN_DEGRADE, self.addr))
        except Exception as ex:
            self.addr = None
            collectd.error("%s failed to get controller ip ; %s" %
                           (PLUGIN_DEGRADE, str(ex)))

    ##########################################################################
    #
    # Name       : mtce_degrade_notifier
    #
    # Purpose    : Message mtcAgent with its requested degrade state of
    #              the host.
    #
    # Description: If the degrade list is empty then a clear state is sent to
    #              maintenance.
    #
    #              If degrade list is NOT empty then an assert state is sent
    #              to maintenance.
    #
    # For logging and to ease debug the code below will create a list of
    # degraded resource instances to be included in the message to maintenance
    # for mtcAgent to optionally log it.
    #
    # Updates    : Preserves this state as last state
    #
    # Returns    : Nothing
    #
    ##########################################################################
    def mtce_degrade_notifier(self, nObject):
        """Message mtcAgent with collectd degrade state of the host"""

        resources = ""
        if self.degrade_list:
            # loop over the list,
            # limit the degraded resource list being sent to mtce to 2
            for r in self.degrade_list[0:2]:
                resources += r + ','
            resources = resources[:-1]
            state = "assert"
        else:
            state = "clear"

        # Degrade message throttling ....
        #
        # Avoid sending the same last state message for up to ONE_EVERY count.
        # Degrade state is refreshed every 10 minutes with audit at 30 seconds.
        # Just reduce load on mtcAgent.
        if self.last_state == state and self.msg_throttle < ONE_EVERY:
            self.msg_throttle += 1
            return 0
        else:
            # Clear the message throttle counter
            self.msg_throttle = 0

        if self.degrade_list:
            collectd.info("%s degrade list: %s" %
                          (PLUGIN_DEGRADE, self.degrade_list))

        # Save state for next time
        self.last_state = state

        # Send the degrade state ; assert or clear message to mtcAgent.
        # If we get a send failure then log it and set the addr to None
        # so it forces us to refresh the controller address on the next
        # notification
        try:
            mtce_socket = socket.socket(self.protocol, socket.SOCK_DGRAM)
            if mtce_socket:
                if self.addr is None:
                    self._get_active_controller_ip()
                    if self.addr is None:
                        collectd.error("%s cannot send degrade notification ; "
                                       "controller address lookup failed" %
                                       PLUGIN_DEGRADE)
                        return 0

                # Create the Maintenance message.
                message = "{\"service\":\"collectd_notifier\","
                message += "\"hostname\":\"" + nObject.host + "\","
                message += "\"degrade\":\"" + state + "\","
                message += "\"resource\":\"" + resources + "\"}"
                collectd.info("%s: %s" % (PLUGIN_DEGRADE, message))

                mtce_socket.settimeout(1.0)
                mtce_socket.sendto(encodeutils.safe_encode(message), (self.addr, self.port))
                mtce_socket.close()
            else:
                collectd.error("%s %s failed to open socket (%s)" %
                               (PLUGIN_DEGRADE, self.resource, self.addr))
        except socket.error as e:
            if e.args[0] == socket.EAI_ADDRFAMILY:
                # Handle IPV4 to IPV6 switchover:
                self.protocol = socket.AF_INET6
                collectd.info("%s %s ipv6 addressing (%s)" %
                              (PLUGIN_DEGRADE, self.resource, self.addr))
            else:
                collectd.error("%s %s socket error (%s) ; %s" %
                               (PLUGIN_DEGRADE,
                                self.resource,
                                self.addr,
                                str(e)))
                # try self correction
                self.addr = None
                self.protocol = socket.AF_INET

    ##########################################################################
    #
    # Name    : remove_degrade_for_missing_filesystems
    #
    # Purpose : Removes degraded filesystems that are no longer mounted.
    #
    # Updates : might update self.degrade_list
    #
    # Returns : Nothing
    #
    ##########################################################################
    def remove_degrade_for_missing_filesystems(self):
        """Remove file systems that are no longer mounted"""

        for df_inst in self.degrade_list:

            # Only file system plugins are looked at.
            # File system plugin instance names are prefixed with 'df:'
            # as the first 3 chars in the instance name.
            if df_inst[0:3] == 'df:':
                path = df_inst.split('filesystem=')[1]

                # check the mount point.
                # if the mount point no longer exists then remove
                # this instance from the degrade list.
                if os.path.ismount(path) is False:
                    collectd.info("%s clearing degrade for missing %s ; %s" %
                                  (PLUGIN_DEGRADE, path, self.degrade_list))
                    self.degrade_list.remove(df_inst)

    ##########################################################################
    #
    # Name       : manage_degrade_list
    #
    # Purpose    : Track the resources that require this host to be degraded.
    #
    # Description: Manages the 'degrade_list' based on collectd notifications.
    #
    # Updates    : self.degrade list with resource names that have severity
    #              levels that require the host to be degraded.
    #
    # Returns    : Nothing
    #
    ###########################################################################
    def manage_degrade_list(self, nObject):
        """Collectd Mtce Notifier Handler Function"""

        remove = False
        add = False

        # Create the degrade id from the notifier object.
        # Format: <plugin name>:host=<hostname>.<plugin_instance_name>
        resource = nObject.plugin + ':' + 'host=' + os.uname()[1]
        if nObject.plugin == PLUGIN__DF:
            df_inst = DF_MANGLED_DICT.get(nObject.plugin_instance)
            if df_inst:
                resource += ".filesystem="
                resource += df_inst
            else:
                collectd.error("%s df instance '%s' lookup failed; ignoring" %
                               (PLUGIN_DEGRADE, nObject.plugin_instance))
                return

        elif nObject.plugin_instance:
            resource += '.' + nObject.plugin + '=' + nObject.plugin_instance

        # This block looks at the current notification severity
        # and manages the degrade_list.
        # If the specified plugin name exists in each of the warnings
        # or failure lists and there is a current severity match then
        # add that resource instance to the degrade list.
        # Conversely, if this notification is OKAY then make sure this
        # resource instance is not in the degrade list (remove it if it is)
        if nObject.severity is NOTIF_OKAY:
            if self.degrade_list and resource in self.degrade_list:
                remove = True

        elif nObject.severity is NOTIF_FAILURE:
            if self.degrade_list__failure:
                if nObject.plugin in self.degrade_list__failure:
                    if resource not in self.degrade_list:
                        # handle dynamic filesystems going missing over a swact
                        # or unmount and being reported as a transient error by
                        # the df plugin. Don't add it to the failed list if the
                        # mountpoint is gone.
                        add = True
                        if nObject.plugin == PLUGIN__DF:
                            path = DF_MANGLED_DICT.get(nObject.plugin_instance)
                            add = os.path.ismount(path)

            else:
                # If severity is failure and no failures cause degrade
                # then make sure this plugin is not in the degrade list,
                # Should never occur.
                if resource in self.degrade_list:
                    remove = True

        elif nObject.severity is NOTIF_WARNING:
            if self.degrade_list__warning:
                if nObject.plugin in self.degrade_list__warning:
                    if resource not in self.degrade_list:
                        # handle dynamic filesystems going missing over a swact
                        # or unmount and being reported as a transient error by
                        # the df plugin. Don't add it to the failed list if the
                        # mountpoint is gone.
                        add = True
                        if nObject.plugin == PLUGIN__DF:
                            path = DF_MANGLED_DICT.get(nObject.plugin_instance)
                            add = os.path.ismount(path)

                elif resource in self.degrade_list:
                    remove = True
            else:
                # If severity is warning and no warnings cause degrade
                # then make sure this plugin is not in the degrade list.
                if resource in self.degrade_list:
                    remove = True
        else:
            collectd.info("%s unsupported severity %d" %
                          (PLUGIN_DEGRADE, nObject.severity))

        if remove is True:
            self.degrade_list.remove(resource)
            collectd.info("%s '%s' removed from degrade list" %
                          (PLUGIN_DEGRADE, resource))
        elif add is True:
            self.degrade_list.append(resource)
            collectd.info("%s '%s' added to degrade list" %
                          (PLUGIN_DEGRADE, resource))


# Instantiate the maintenance degrade object
# This object persists from notification to notification
mtcDegradeObj = DegradeObject(MTCE_CMD_RX_PORT)


# fmAlarmObject Class
class fmAlarmObject:

    dbObj = None                           # shared database connection obj
    host = None                            # saved hostname
    lock = None                            # global lock for mread_func mutex
    database_setup = False                 # state of database setup
    database_setup_in_progress = False     # connection mutex
    plugin_path = None
    fm_connectivity = False

    def __init__(self, id, plugin):
        """fmAlarmObject Class constructor"""

        # plugin specific static class members.
        self.id = id               # alarm id ; 100.1??
        self.plugin = plugin       # name of the plugin ; df, cpu, memory ...
        self.plugin_instance = ""  # the instance name for the plugin
        self.resource_name = ""    # The top level name of the resource
        self.instance_name = ""    # The instance name

        # Unique identifier used in the degrade list to represent
        # this alarm object.
        #
        # Base Object:
        #
        # Format : PLUGIN:host=<hostname>
        # Example: memory:host=controller-0
        #
        # Instance Object:
        #
        # Format: <Base Object>.instance>
        # Example: memory:host=controller-0.memory=platform
        self.degrade_id = plugin + ':' + 'host=' + os.uname()[1]

        # Instance specific learned static class members.
        self.entity_id = ""        # fm entity id host=<hostname>.<instance>
        self.instance = ""         # <plugin>_<instance>

        # [ 'float value string','float threshold string]
        self.values = []
        self.value = float(0)       # float value of reading

        # This member is used to help log change values using the
        # LOG_STEP threshold consant
        self.last_value = float(0)

        # float value of threshold
        self.threshold = float(INVALID_THRESHOLD)

        # Common static class members.
        self.reason_warning = ""
        self.reason_failure = ""
        self.repair = ""
        self.alarm_type = fm_constants.FM_ALARM_TYPE_7     # OPERATIONAL
        self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50  # THRESHOLD CROSS
        self.suppression = True
        self.service_affecting = False

        # default most reading types are usage
        self.reading_type = READING_TYPE__PERCENT_USAGE

        # Severity tracking lists.
        # Maintains severity state between notifications.
        # Each is a list of entity ids for severity asserted alarms.
        # As alarms are cleared so is the entry in these lists.
        # The entity id should only be in one lists for any given raised alarm.
        self.warnings = []
        self.failures = []

        # alarm debounce control
        self.warnings_debounce_counter = 0
        self.failures_debounce_counter = 0

        # total notification count
        self.count = 0

        # audit counters
        self.alarm_audit_threshold = 0
        self.state_audit_count = 0

        # For plugins that have multiple instances like df (filesystem plugin)
        # we need to create an instance of this object for each one.
        # This dictionary is used to associate an instance with its object.
        self.instance_objects = {}

        self.fault = None

    def _ilog(self, string):
        """Create a collectd notifier info log with the string param"""
        collectd.info('%s %s : %s' % (PLUGIN, self.plugin, string))

    def _llog(self, string):
        """Create a collectd notifier info log when debug_lists not empty"""
        if debug_lists:
            collectd.info('%s %s : %s' % (PLUGIN, self.plugin, string))

    def _elog(self, string):
        """Create a collectd notifier error log with the string param"""
        collectd.error('%s %s : %s' % (PLUGIN, self.plugin, string))

    ##########################################################################
    #
    # Name    : _state_audit
    #
    # Purpose : Debug Tool to log plugin object info.
    #
    #           Not called in production code.
    #
    # Only the severity lists are dumped for now.
    # Other info can be added as needed.
    # Can be run as an audit or called directly.
    #
    ##########################################################################

    def _state_audit(self, location):
        """Log the state of the specified object"""

        if self.id == ALARM_ID__CPU:
            _print_state()

        self.state_audit_count += 1
        if self.warnings:
            collectd.info("%s AUDIT %d: %s warning list %s:%s" %
                          (PLUGIN,
                           self.state_audit_count,
                           self.plugin,
                           location,
                           self.warnings))
        if self.failures:
            collectd.info("%s AUDIT %d: %s failure list %s:%s" %
                          (PLUGIN,
                           self.state_audit_count,
                           self.plugin,
                           location,
                           self.failures))

    ##########################################################################
    #
    # Name    : manage_change
    #
    # Purpose : Manage sample value change.
    #
    #           Handle no sample update case.
    #           Parse the notification log.
    #           Handle base object instances.
    #           Generate a log entry if the sample value changes more than
    #             step value.
    #
    ##########################################################################

    def manage_change(self, nObject):
        """Log resource instance value on step state change"""

        # filter out messages to ignore ; notifications that have no value
        if "has not been updated for" in nObject.message:
            collectd.info("%s %s %s (%s)" %
                          (PLUGIN,
                           self.entity_id,
                           nObject.message,
                           nObject.severity))
            return "done"

        # Get the value from the notification message.
        # The location in the message is different based on the message type ;
        #   normal reading or overage reading
        #
        # message: Host controller-0, plugin memory type percent   ... [snip]
        #          All data sources are within range again.
        #          Current value of "value" is 51.412038.            <------
        #
        # message: Host controller-0, plugin df (instance scratch) ... [snip]
        #          Data source "value" is currently 97.464027.       <------
        #          That is above the failure threshold of 90.000000. <------

        # recognized strings - value only     value and threshold
        #                      ------------   -------------------
        value_sig_list = ['Current value of', 'is currently']

        # list of parsed 'string version' float values ['value','threshold']
        self.values = []
        for sig in value_sig_list:
            index = nObject.message.find(sig)
            if index != -1:
                self.values = \
                    re.findall(r"[-+]?\d*\.\d+|\d+", nObject.message[index:-1])

        # contains string versions of the float values extracted from
        # the notification message. The threshold value is included for
        # readings that are out of threshold.
        if len(self.values):
            # validate the reading
            try:
                self.value = round(float(self.values[0]), 2)
                # get the threshold if its there.
                if len(self.values) > 1:
                    self.threshold = float(self.values[1])
                else:
                    self.threshold = float(INVALID_THRESHOLD)  # invalid value

            except ValueError as ex:
                collectd.error("%s %s value not integer or float (%s) (%s)" %
                               (PLUGIN, self.entity_id, self.value, str(ex)))
                return "done"
            except TypeError as ex:
                collectd.info("%s %s value has no type (%s)" %
                              (PLUGIN, self.entity_id, str(ex)))
                return "done"
        else:
            collectd.info("%s %s reported no value (%s)" %
                          (PLUGIN, self.entity_id, nObject.message))
            return "done"

        # get the last reading
        if self.last_value:
            last = float(self.last_value)
        else:
            last = float(0)

        # Determine if the change is large enough to log and save the new value
        logit = False
        if self.count == 0 or LOG_STEP == 0:
            logit = True
        elif self.reading_type == "connections":
            if self.value != last:
                logit = True
        elif self.value > last:
            if (last + LOG_STEP) < self.value:
                logit = True
        elif last > self.value:
            if (self.value + LOG_STEP) < last:
                logit = True

        # Case on types.
        #
        # Note: only usage type so far
        if logit:

            resource = self.resource_name

            # setup resource name for filesystem instance usage log
            if self.plugin == PLUGIN__DF:
                resource = self.instance_name

            elif self.plugin == PLUGIN__MEM:
                if self.instance_name:
                    resource = self.instance_name

            # setup resource name for vswitch process instance name
            elif self.plugin == PLUGIN__VSWITCH_MEM:
                resource += ' Processor '
                resource += self.instance_name

            if self.reading_type == READING_TYPE__PERCENT_USAGE:
                tmp = str(self.value).split('.')
                if len(tmp[0]) == 1:
                    pre = ':  '
                else:
                    pre = ': '
                collectd.info("%s reading%s%2.2f %s - %s" %
                              (PLUGIN,
                               pre,
                               self.value,
                               self.reading_type,
                               resource))

            elif self.reading_type == "connections" and \
                    self.instance_objects and \
                    self.value != self.last_value:
                if self.instance_objects:
                    collectd.info("%s monitor: %2d %s - %s" %
                                  (PLUGIN,
                                   self.value,
                                   self.reading_type,
                                   resource))

            # update last logged value
            self.last_value = round(self.value, 2)

    ##########################################################################
    #
    # Name       : debounce
    #
    # Purpose    : Debounce alarm and degrade action handling based on
    #              severity notifications from plugins.
    #
    # Description: Clear to assert has a 3 minute debounce
    #              All other state changes have 1 minute debounce.

    #              A true return indicates that debounce is complete and the
    #              current alarm severity needs to be acted upon.
    #
    #              A false return means that there is no severity change or
    #              that debouning a severity change is in progress and the
    #              caller should not take action on the current notification.
    #
    # Returns    : True if the alarm needs state change.
    #              False during debounce of if no alarm state change needed.
    #
    ##########################################################################

    def debounce(self, base_obj, entity_id, severity, this_value):
        """Check for need to update alarm data"""

        rc = False
        logit = False

        # Only % Usage readings are debounced and alarmed
        if base_obj.reading_type != READING_TYPE__PERCENT_USAGE:
            return False

        if entity_id in base_obj.warnings:
            self._llog(entity_id + " is already in warnings list")
            current_severity_str = "warning"
        elif entity_id in base_obj.failures:
            self._llog(entity_id + " is already in failures list")
            current_severity_str = "failure"
        else:
            self._llog(entity_id + " is already OK")
            current_severity_str = "okay"

        # No severity change case
        # Always clear debounce counters with no severity level change
        if severity == current_severity_str:
            self.warnings_debounce_counter = 0
            self.failures_debounce_counter = 0

        # From Okay -> Warning Case - PASS
        elif current_severity_str == "okay" and severity == "warning":
            logit = True
            self.warnings_debounce_counter += 1
            if self.warnings_debounce_counter >= DEBOUNCE_FROM_CLEAR_THLD:
                rc = True

            # Special Case: failures debounce counter should clear in this case
            # so that ; max-x failures and then a warning followed by more
            # failures should not allow the failure alarm assertion.
            # Need back to back DEBOUNCE_FROM_CLEAR_THLD failures to
            # constitute a failure alarm.
            self.failures_debounce_counter = 0

        # From Okay -> Failure
        elif current_severity_str == "okay" and severity == "failure":
            logit = True
            self.failures_debounce_counter += 1
            if self.failures_debounce_counter >= DEBOUNCE_FROM_CLEAR_THLD:
                rc = True

            # Special Case: warning debounce counter should track failure
            # so that ; say 2 failures and then a warning would constitute
            # a valid okay to warning alarm assertion.
            self.warnings_debounce_counter += 1

        # From Failure -> Okay Case
        elif current_severity_str == "failure" and severity == "okay":
            logit = True
            self.failures_debounce_counter += 1
            if self.failures_debounce_counter >= DEBOUNCE_FROM_ASSERT_THLD:
                rc = True

            # Special Case: Recovery from failure can be to okay or warning
            # so that ; say at failure and we get 2 okay's and a warning
            # we should allow that as a valid debounce from failure to warning.
            self.warnings_debounce_counter += 1

        # From Failure -> Warning Case
        elif current_severity_str == "failure" and severity == "warning":
            logit = True
            self.failures_debounce_counter += 1
            if self.failures_debounce_counter >= DEBOUNCE_FROM_ASSERT_THLD:
                rc = True

        # From Warning -> Okay Case
        elif current_severity_str == "warning" and severity == "okay":
            logit = True
            self.warnings_debounce_counter += 1
            if self.warnings_debounce_counter >= DEBOUNCE_FROM_ASSERT_THLD:
                rc = True

            # Special Case: Any previously thresholded failure count
            # should be cleared. Say we are at this warning level but
            # started debouncing a failure severity. Then before the
            # failure debounce completed we got an okay (this clause).
            # Then on the next audit get another failure event.
            # Without clearing the failure count on this okay we would
            # mistakenly qualify for a failure debounce by continuing
            # to count up the failures debounce count.
            self.failures_debounce_counter = 0

        # From Warning -> Failure Case
        elif current_severity_str == "warning" and severity == "failure":
            logit = True
            self.failures_debounce_counter += 1
            if self.failures_debounce_counter >= DEBOUNCE_FROM_ASSERT_THLD:
                rc = True

            # Special Case: While in warning severity and debouncing to okay
            # we get a failure reading then we need to clear the warning
            # debounce count. Otherwise the next okay would qualify the clear
            # which it should not because we got a failure while the warning
            # to okay debounce.
            self.warnings_debounce_counter = 0

        if logit is True:
            collectd.info("%s %s %s debounce '%s -> %s' (%2.2f) (%d:%d) %s" % (
                          PLUGIN,
                          base_obj.resource_name,
                          entity_id,
                          current_severity_str,
                          severity,
                          this_value,
                          self.warnings_debounce_counter,
                          self.failures_debounce_counter,
                          rc))

        if rc is True:
            # clear both debounce counters on every state change
            self.warnings_debounce_counter = 0
            self.failures_debounce_counter = 0

        return rc

    ########################################################################
    #
    # Name    : manage_alarm_lists
    #
    # Purpose : Alarm Severity Tracking
    #
    # This class member function accepts a severity level and entity id.
    # It manages the content of the current alarm object's 'failures' and
    # 'warnings' lists ; aka Severity Lists.
    #
    # These Severity Lists are used to record current alarmed state for
    # each instance of a plugin.
    # If an alarm is raised then its entity id is added to the appropriate
    # severity list.
    #
    # A failure notification or critical alarm goes in the failures list.
    # A warning notification or major alarm goes into the warnings list.
    #
    # These lists are used to avoid making unnecessary calls to FM.
    #
    # Startup Behavior:
    #
    # The collectd daemon runs the init function of every plugin on startup.
    # That includes this notifier plugin. The init function queries the FM
    # database for any active alarms.
    #
    # This member function is called for any active alarms that are found.
    # The entity id for active alarms is added to the appropriate
    # Severity List. This way existing alarms are maintained over collectd
    # process startup.
    #
    # Runtime Behavior:
    #
    # The current severity state is first queried and compared to the
    # newly reported severity level. If they are the same then a "done"
    # is returned telling the caller that there is no further work to do.
    # Otherwise, the lists are managed in a way that has the entity id
    # of a raised alarm in the corresponding severity list.
    #
    # See inline comments below for each specific severity and state
    # transition case.
    #
    #########################################################################

    def manage_alarm_lists(self, entity_id, severity):
        """Manage the alarm severity lists and report state change"""

        collectd.debug("%s manage alarm %s %s %s" %
                       (PLUGIN,
                        self.id,
                        severity,
                        entity_id))

        # Get the instance's current state
        if entity_id in self.warnings:
            current_severity_str = "warning"
        elif entity_id in self.failures:
            current_severity_str = "failure"
        else:
            current_severity_str = "okay"

        # Compare to current state to previous state.
        # If they are the same then return done.
        if severity == current_severity_str:
            return "done"

        # Otherwise, manage the severity lists ; case by case.
        warnings_list_change = False
        failures_list_change = False

        # Case 1: Handle warning to failure severity change.
        if severity == "warning" and current_severity_str == "failure":

            if entity_id in self.failures:
                self.failures.remove(entity_id)
                failures_list_change = True
                self._llog(entity_id + " is removed from failures list")
            else:
                self._elog(entity_id + " UNEXPECTEDLY not in failures list")

            # Error detection
            if entity_id in self.warnings:
                self.warnings.remove(entity_id)
                self._elog(entity_id + " UNEXPECTEDLY in warnings list")

            self.warnings.append(entity_id)
            warnings_list_change = True
            self._llog(entity_id + " is added to warnings list")

        # Case 2: Handle failure to warning alarm severity change.
        elif severity == "failure" and current_severity_str == "warning":

            if entity_id in self.warnings:
                self.warnings.remove(entity_id)
                warnings_list_change = True
                self._llog(entity_id + " is removed from warnings list")
            else:
                self._elog(entity_id + " UNEXPECTEDLY not in warnings list")

            # Error detection
            if entity_id in self.failures:
                self.failures.remove(entity_id)
                self._elog(entity_id + " UNEXPECTEDLY in failures list")

            self.failures.append(entity_id)
            failures_list_change = True
            self._llog(entity_id + " is added to failures list")

        # Case 3: Handle new alarm.
        elif severity != "okay" and current_severity_str == "okay":
            if severity == "warning":
                self.warnings.append(entity_id)
                warnings_list_change = True
                self._llog(entity_id + " added to warnings list")
            elif severity == "failure":
                self.failures.append(entity_id)
                failures_list_change = True
                self._llog(entity_id + " added to failures list")

        # Case 4: Handle alarm clear.
        else:
            # plugin is okay, ensure this plugin's entity id
            # is not in either list
            if entity_id in self.warnings:
                self.warnings.remove(entity_id)
                warnings_list_change = True
                self._llog(entity_id + " removed from warnings list")
            if entity_id in self.failures:
                self.failures.remove(entity_id)
                failures_list_change = True
                self._llog(entity_id + " removed from failures list")

        if warnings_list_change is True:
            if self.warnings:
                collectd.info("%s %s warnings %s" %
                              (PLUGIN, self.plugin, self.warnings))
            else:
                collectd.info("%s %s no warnings" %
                              (PLUGIN, self.plugin))

        if failures_list_change is True:
            if self.failures:
                collectd.info("%s %s failures %s" %
                              (PLUGIN, self.plugin, self.failures))
            else:
                collectd.info("%s %s no failures" %
                              (PLUGIN, self.plugin))

    ##########################################################################
    #
    # Name    : _get_instance_object
    #
    # Purpose : Safely get an object from the self instance object list
    #           indexed by eid.
    #
    ##########################################################################
    def _get_instance_object(self, eid):
        """Safely get an object from the self instance object dict while locked

        :param eid: the index for the instance object dictionary
        :return: object or None
        """
        if eid is None:
            return None

        try:
            collectd.debug("%s %s Get   Lock ..." % (PLUGIN, self.plugin))
            with fmAlarmObject.lock:
                obj = self.instance_objects[eid]
                return obj
        except:
            collectd.error("%s failed to get instance from %s object list" %
                           (PLUGIN, self.plugin))
            return None

    ##########################################################################
    #
    # Name    : _add_instance_object
    #
    # Purpose : Safely add an object to the self instance object list
    #           indexed by eid while locked. if found locked the instance
    #           add will be re-attempted on next sample.
    #
    ##########################################################################
    def _add_instance_object(self, obj, eid):
        """Update self instance_objects list while locked

        :param obj: the object to add
        :param eid: index for instance_objects
        :return: nothing
        """
        try:
            collectd.debug("%s %s Add   Lock ..." % (PLUGIN, self.plugin))
            with fmAlarmObject.lock:
                self.instance_objects[eid] = obj
        except:
            collectd.error("%s failed to add instance to %s object list" %
                           (PLUGIN, self.plugin))

    ##########################################################################
    #
    # Name    : _copy_instance_object
    #
    # Purpose : Copy select members of self object to target object.
    #
    ##########################################################################
    def _copy_instance_object(self, object):
        """Copy select members of self object to target object"""

        object.resource_name = self.resource_name
        object.reading_type = self.reading_type

        object.reason_warning = self.reason_warning
        object.reason_failure = self.reason_failure
        object.repair = self.repair

        object.alarm_type = self.alarm_type
        object.cause = self.cause
        object.suppression = self.suppression
        object.service_affecting = self.service_affecting

    ##########################################################################
    #
    # Name    : create_instance_object
    #
    # Purpose : Create a new instance object and tack it on the supplied base
    #           object's instance object dictionary.
    #
    ##########################################################################
    def create_instance_object(self, instance):

        try:
            # create a new plugin object
            inst_obj = fmAlarmObject(self.id, self.plugin)
            self._copy_instance_object(inst_obj)

            # initialize the object with instance specific data
            inst_obj.instance_name = instance
            inst_obj.degrade_id += '.' + self.plugin + '=' + instance
            inst_obj.entity_id = _build_entity_id(self.plugin, instance)
            self._add_instance_object(inst_obj, inst_obj.entity_id)

            collectd.debug("%s created %s instance (%s) object %s" %
                           (PLUGIN, inst_obj.resource_name,
                            inst_obj.entity_id, inst_obj))

            collectd.info("%s monitoring %s %s %s" %
                          (PLUGIN,
                           inst_obj.resource_name,
                           inst_obj.instance_name,
                           inst_obj.reading_type))

            return inst_obj

        except:
            collectd.error("%s %s:%s inst object create failed [exception]" %
                           (PLUGIN, inst_obj.resource_name, instance))
        return None

    ##########################################################################
    #
    # Name    : create_instance_objects
    #
    # Purpose : Create a list of instance objects for 'self' type plugin and
    #           add those objects to the parent's instance_objects dictionary.
    #
    # Note    : This is currently only used for the DF (filesystem) plugin.
    #           All other instance creations/allocations are done on-demand.
    #
    ##########################################################################
    def create_instance_objects(self):
        """Create, initialize and add an instance object to this/self plugin"""

        # Create the File System subordinate instance objects.
        if self.id == ALARM_ID__DF:

            # read the df.conf file and return/get a list of mount points
            conf_file = fmAlarmObject.plugin_path + 'df.conf'
            if not os.path.exists(conf_file):
                collectd.error("%s cannot create filesystem "
                               "instance objects ; missing : %s" %
                               (PLUGIN, conf_file))
                return FAIL

            mountpoints = []
            with open(conf_file, 'r') as infile:
                for line in infile:
                    if 'MountPoint ' in line:

                        # get the mountpoint path from the line
                        try:
                            mountpoint = line.split('MountPoint ')[1][1:-2]
                            mountpoints.append(mountpoint)
                        except:
                            collectd.error("%s skipping invalid '%s' "
                                           "mountpoint line: %s" %
                                           (PLUGIN, conf_file, line))

            collectd.debug("%s MountPoints: %s" % (PLUGIN, mountpoints))

            # loop over the mount points
            for mp in mountpoints:
                # create a new plugin object
                inst_obj = fmAlarmObject(ALARM_ID__DF, PLUGIN__DF)

                # initialize the object with instance specific data
                inst_obj.resource_name = self.resource_name
                self._copy_instance_object(inst_obj)
                inst_obj.instance_name = mp
                inst_obj.degrade_id += '.' + 'filesystem=' + mp

                for plugin_instance in DF_MANGLED_DICT:
                    if DF_MANGLED_DICT[plugin_instance] == mp:
                        inst_obj.plugin_instance = plugin_instance
                        break
                else:
                    collectd.debug("%s no %s mountpoint" %
                                   (PLUGIN, mp))
                    continue
                inst_obj.entity_id = _build_entity_id(PLUGIN__DF,
                                                      inst_obj.plugin_instance)

                # add this subordinate object to the parent's
                # instance object list
                self._add_instance_object(inst_obj, inst_obj.entity_id)
                inst_obj.instance = inst_obj.instance_name

                collectd.info("%s monitoring %s usage" %
                              (PLUGIN, inst_obj.instance))


# ADD_NEW_PLUGIN: add plugin to this table
# This instantiates the plugin objects
PLUGINS = {
    PLUGIN__CPU: fmAlarmObject(ALARM_ID__CPU, PLUGIN__CPU),
    PLUGIN__MEM: fmAlarmObject(ALARM_ID__MEM, PLUGIN__MEM),
    PLUGIN__DF: fmAlarmObject(ALARM_ID__DF, PLUGIN__DF),
    PLUGIN__VSWITCH_CPU: fmAlarmObject(ALARM_ID__VSWITCH_CPU,
                                       PLUGIN__VSWITCH_CPU),
    PLUGIN__VSWITCH_MEM: fmAlarmObject(ALARM_ID__VSWITCH_MEM,
                                       PLUGIN__VSWITCH_MEM),
    PLUGIN__VSWITCH_PORT: fmAlarmObject(ALARM_ID__VSWITCH_PORT,
                                        PLUGIN__VSWITCH_PORT),
    PLUGIN__VSWITCH_IFACE: fmAlarmObject(ALARM_ID__VSWITCH_IFACE,
                                         PLUGIN__VSWITCH_IFACE)}


#####################################################################
#
# Name       : clear_alarm
#
# Description: Clear the specified alarm with the specified entity ID.
#
# Returns    : True if operation succeeded
#              False if there was an error exception.
#
# Assumptions: Caller can decide to retry based on return status.
#
#####################################################################
def clear_alarm(alarm_id, eid):
    """Clear the specified alarm:eid"""

    try:
        if api.clear_fault(alarm_id, eid) is True:
            collectd.info("%s %s:%s alarm cleared" %
                          (PLUGIN, alarm_id, eid))
        else:
            collectd.info("%s %s:%s alarm already cleared" %
                          (PLUGIN, alarm_id, eid))
        return True

    except Exception as ex:
        collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" %
                       (PLUGIN, alarm_id, eid, ex))
        return False


def get_base_object(alarm_id):
    """Get the alarm object for the specified alarm id"""
    for plugin in PLUGIN_NAME_LIST:
        if PLUGINS[plugin].id == alarm_id:
            return PLUGINS[plugin]
    return None


def get_object(alarm_id, eid):
    """Get the plugin object for the specified alarm id and eid"""

    base_obj = get_base_object(alarm_id)
    if len(base_obj.instance_objects):
        try:
            return(base_obj.instance_objects[eid])
        except:
            collectd.debug("%s %s has no instance objects" %
                           (PLUGIN, base_obj.plugin))
    return base_obj


def _build_entity_id(plugin, plugin_instance):
    """Builds an entity id string based on the collectd notification object"""

    inst_error = False

    entity_id = 'host='
    entity_id += fmAlarmObject.host

    if plugin == PLUGIN__MEM:
        if 'node' in plugin_instance:
            entity_id += '.numa=' + plugin_instance
        elif plugin_instance:
            entity_id += '.' + PLUGIN__MEM + '=' + plugin_instance

    elif plugin == PLUGIN__CPU:
        if plugin_instance:
            entity_id += '.' + PLUGIN__CPU + '=' + plugin_instance

    elif plugin == PLUGIN__VSWITCH_MEM:

        # host=<hostname>.processor=<socket-id>
        if plugin_instance:
            entity_id += '.processor=' + plugin_instance
        else:
            inst_error = True

    elif plugin == PLUGIN__VSWITCH_IFACE:

        # host=<hostname>.interface=<if-uuid>
        if plugin_instance:
            entity_id += '.interface=' + plugin_instance
        else:
            inst_error = True

    elif plugin == PLUGIN__VSWITCH_PORT:

        # host=<hostname>.port=<port-uuid>
        if plugin_instance:
            entity_id += '.port=' + plugin_instance
        else:
            inst_error = True

    elif plugin == PLUGIN__DF:

        # host=<hostname>.filesystem=<mountpoint>
        if plugin_instance:
            # build the entity_id for this plugin
            path = DF_MANGLED_DICT.get(plugin_instance)
            if path:
                entity_id += ".filesystem="
                entity_id += path
            else:
                inst_error = True

    if inst_error is True:
        collectd.error("%s eid build failed; bad or missing instance '%s'" %
                       (plugin, plugin_instance))
        return None

    return entity_id


def _get_df_mountpoints():

    conf_file = fmAlarmObject.plugin_path + 'df.conf'
    if not os.path.exists(conf_file):
        collectd.error("%s cannot create filesystem "
                       "instance objects ; missing : %s" %
                       (PLUGIN, conf_file))
        return FAIL

    mountpoints = []
    with open(conf_file, 'r') as infile:
        for line in infile:
            if 'MountPoint ' in line:

                # get the mountpoint path from the line
                try:
                    mountpoint = line.split('MountPoint ')[1][1:-2]
                    mountpoints.append(mountpoint)
                except:
                    collectd.error("%s skipping invalid '%s' "
                                   "mountpoint line: %s" %
                                   (PLUGIN, conf_file, line))

    return(mountpoints)


def _print_obj(obj):
    """Print a single object"""
    base_object = False
    for plugin in PLUGIN_NAME_LIST:
        if PLUGINS[plugin] == obj:
            base_object = True
            break

    num = len(obj.instance_objects)
    if num > 0 or base_object is True:
        prefix = "BASE " + str(num)
    else:
        prefix = "......."

    collectd.info("%s %s %s - %s - %s\n" %
                  (PLUGIN, prefix, obj.resource_name, obj.plugin, obj.id))
    collectd.info("%s %s  fault obj: %s\n" % (PLUGIN, prefix, obj.fault))
    collectd.info("%s %s  entity id: %s\n" % (PLUGIN, prefix, obj.entity_id))
    collectd.info("%s %s degrade_id: %s\n" % (PLUGIN, prefix, obj.degrade_id))

    collectd.info("%s %s instance  : %s\n" %
                  (PLUGIN, prefix, obj.instance_name))

    if obj.plugin_instance:
        collectd.info("%s %s Plugin Ins: %s\n" %
                      (PLUGIN, prefix, obj.plugin_instance))
    if obj.warnings:
        collectd.info("%s %s   warnings: %s" %
                      (PLUGIN, prefix, obj.warnings))
    if obj.failures:
        collectd.info("%s %s   failures: %s" %
                      (PLUGIN, prefix, obj.failures))
    if obj.repair:
        collectd.info("%s %s     repair: %s" % (PLUGIN, prefix, obj.repair))

    if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50:
        collectd.info("%s %s  reason: w: %s\n" %
                      (PLUGIN, prefix, obj.reason_warning))
        collectd.info("%s %s  reason: f: %s\n" %
                      (PLUGIN, prefix, obj.reason_failure))

    collectd.info("%s %s value:%2.1f thld:%2.1f cause:%s count:%d type:%s\n" %
                  (PLUGIN, prefix,
                   obj.value,
                   obj.threshold,
                   obj.cause,
                   obj.count,
                   obj.reading_type))

    collectd.info("\n")


def _print_state(obj=None):
    """Print the current object state"""
    try:
        objs = []
        if obj is None:
            for plugin in PLUGIN_NAME_LIST:
                objs.append(PLUGINS[plugin])
        else:
            objs.append(obj)

        collectd.debug("%s _print_state Lock ..." % PLUGIN)
        with fmAlarmObject.lock:
            for o in objs:
                _print_obj(o)
                if len(o.instance_objects):
                    for inst_obj in o.instance_objects:
                        _print_obj(o.instance_objects[inst_obj])

    except Exception as ex:
        collectd.error("%s _print_state exception ; %s" %
                       (PLUGIN, ex))


def _database_setup(database):
    """Setup the influx database for collectd resource samples"""

    collectd.info("%s setting up influxdb:%s database" %
                  (PLUGIN, database))

    error_str = ""

    # http://influxdb-python.readthedocs.io/en/latest/examples.html
    # http://influxdb-python.readthedocs.io/en/latest/api-documentation.html
    fmAlarmObject.dbObj = InfluxDBClient('127.0.0.1', '8086', database)
    if fmAlarmObject.dbObj:
        try:
            fmAlarmObject.dbObj.create_database('collectd')

            ############################################################
            #
            # TODO: Read current retention period from service parameter
            #       Make it a puppet implementation.
            #
            # Create a '1 week' samples retention policy
            # -----------------------------------------
            # name     = 'collectd samples'
            # duration = set retention period in time
            #               xm - minutes
            #               xh - hours
            #               xd - days
            #               xw - weeks
            #               xy - years
            # database = 'collectd'
            # default  = True ; make it the default
            #
            ############################################################

            fmAlarmObject.dbObj.create_retention_policy(
                DATABASE_NAME, '1w', 1, database, True)
        except Exception as ex:
            if str(ex) == 'database already exists':
                try:
                    collectd.info("%s influxdb:collectd %s" %
                                  (PLUGIN, str(ex)))
                    fmAlarmObject.dbObj.create_retention_policy(
                        DATABASE_NAME, '1w', 1, database, True)
                except Exception as ex:
                    if str(ex) == 'retention policy already exists':
                        collectd.info("%s influxdb:collectd %s" %
                                      (PLUGIN, str(ex)))
                    else:
                        error_str = "failure from influxdb ; "
                        error_str += str(ex)
            else:
                error_str = "failed to create influxdb:" + database
    else:
        error_str = "failed to connect to influxdb:" + database

    if not error_str:
            found = False
            retention = \
                fmAlarmObject.dbObj.get_list_retention_policies(database)
            for r in range(len(retention)):
                if retention[r]["name"] == DATABASE_NAME:
                    collectd.info("%s influxdb:%s samples retention "
                                  "policy: %s" %
                                  (PLUGIN, database, retention[r]))
                    found = True
            if found is True:
                collectd.info("%s influxdb:%s is setup" % (PLUGIN, database))
                fmAlarmObject.database_setup = True
            else:
                collectd.error("%s influxdb:%s retention policy NOT setup" %
                               (PLUGIN, database))


def _clear_alarm_for_missing_filesystems():
    """Clear alarmed file systems that are no longer mounted or present"""

    # get the DF (filesystem plugin) base object.
    df_base_obj = PLUGINS[PLUGIN__DF]
    # create a single alarm list from both wranings and failures list
    # to avoid having to duplicate the code below for each.
    # At this point we don't care about severity, we just need to
    # determine if an any-severity' alarmed filesystem no longer exists
    # so we can cleanup by clearing its alarm.
    # Note: the 2 lists should always contain unique data between them
    alarm_list = df_base_obj.warnings + df_base_obj.failures
    if len(alarm_list):
        for eid in alarm_list:
            # search for any of them that might be alarmed.
            obj = df_base_obj._get_instance_object(eid)

            # only care about df (file system plugins)
            if obj is not None and \
               obj.plugin == PLUGIN__DF and \
               obj.entity_id == eid and \
               obj.instance_name != '':

                if os.path.ismount(obj.instance_name) is False:
                    if clear_alarm(df_base_obj.id, obj.entity_id) is True:
                        collectd.info("%s cleared alarm for missing %s" %
                                      (PLUGIN, obj.instance_name))
                        df_base_obj.manage_alarm_lists(obj.entity_id, "okay")
                else:
                    collectd.debug("%s maintaining alarm for %s" %
                                   (PLUGIN, obj.instance_name))


# Collectd calls this function on startup.
# Initialize each plugin object with plugin specific data.
# Query FM for existing alarms and run with that starting state.
def init_func():
    """Collectd FM Notifier Initialization Function"""

    mtcDegradeObj.port = MTCE_CMD_RX_PORT
    collectd.info("%s mtce port %d" %
                  (PLUGIN, mtcDegradeObj.port))

    fmAlarmObject.lock = Lock()

    fmAlarmObject.host = pluginObject.gethostname()
    collectd.info("%s %s:%s init function" %
                  (PLUGIN, tsc.nodetype, fmAlarmObject.host))

    # The path to where collectd is looking for its plugins is specified
    # at the end of the /etc/collectd.conf file.
    # Because so we search for the 'Include' label in reverse order.
    for line in reversed(open("/etc/collectd.conf", 'r').readlines()):
        if line.startswith('Include'):
            plugin_path = line.split(' ')[1].strip("\n").strip('"') + '/'
            fmAlarmObject.plugin_path = plugin_path
            collectd.info("plugin path: %s" % fmAlarmObject.plugin_path)
            break

    # Constant CPU Plugin Object Settings
    obj = PLUGINS[PLUGIN__CPU]
    obj.resource_name = "Platform CPU"
    obj.instance_name = PLUGIN__CPU
    obj.repair = "Monitor and if condition persists, "
    obj.repair += "contact next level of support."
    collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

    ###########################################################################

    # Constant Memory Plugin Object settings
    obj = PLUGINS[PLUGIN__MEM]
    obj.resource_name = "Memory"
    obj.instance_name = PLUGIN__MEM
    obj.repair = "Monitor and if condition persists, "
    obj.repair += "contact next level of support; "
    obj.repair += "may require additional memory on Host."
    collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

    ###########################################################################

    # Constant FileSystem Plugin Object settings
    obj = PLUGINS[PLUGIN__DF]
    obj.resource_name = "File System"
    obj.instance_name = PLUGIN__DF
    obj.repair = "Monitor and if condition persists, "
    obj.repair += "contact next level of support."

    # The FileSystem (DF) plugin has multiple instances
    # One instance per file system mount point being monitored.
    # Create one DF instance object per mount point
    obj.create_instance_objects()

    # ntp query is for controllers only
    if want_vswitch is False:
        collectd.debug("%s vSwitch monitoring disabled" % PLUGIN)
    elif tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:

        #######################################################################

        # Constant vSwitch CPU Usage Plugin Object settings
        obj = PLUGINS[PLUGIN__VSWITCH_CPU]
        obj.resource_name = "vSwitch CPU"
        obj.instance_name = PLUGIN__VSWITCH_CPU
        obj.repair = "Monitor and if condition persists, "
        obj.repair += "contact next level of support."
        collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

        #######################################################################

        # Constant vSwitch Memory Usage Plugin Object settings
        obj = PLUGINS[PLUGIN__VSWITCH_MEM]
        obj.resource_name = "vSwitch Memory"
        obj.instance_name = PLUGIN__VSWITCH_MEM
        obj.repair = "Monitor and if condition persists, "
        obj.repair += "contact next level of support."
        collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

        #######################################################################

        # Constant vSwitch Port State Monitor Plugin Object settings
        obj = PLUGINS[PLUGIN__VSWITCH_PORT]
        obj.resource_name = "vSwitch Port"
        obj.instance_name = PLUGIN__VSWITCH_PORT
        obj.reading_type = "state"
        obj.reason_failure = "'Data' Port failed."
        obj.reason_warning = "'Data' Port failed."
        obj.repair = "Check cabling and far-end port configuration and "
        obj.repair += "status on adjacent equipment."
        obj.alarm_type = fm_constants.FM_ALARM_TYPE_4     # EQUIPMENT
        obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29  # LOSS_OF_SIGNAL
        obj.service_affecting = True
        collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name))

        #######################################################################

        # Constant vSwitch Interface State Monitor Plugin Object settings
        obj = PLUGINS[PLUGIN__VSWITCH_IFACE]
        obj.resource_name = "vSwitch Interface"
        obj.instance_name = PLUGIN__VSWITCH_IFACE
        obj.reading_type = "state"
        obj.reason_failure = "'Data' Interface failed."
        obj.reason_warning = "'Data' Interface degraded."
        obj.repair = "Check cabling and far-end port configuration and "
        obj.repair += "status on adjacent equipment."
        obj.alarm_type = fm_constants.FM_ALARM_TYPE_4     # EQUIPMENT
        obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29  # LOSS_OF_SIGNAL
        obj.service_affecting = True
        collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name))

    ###########################################################################

    # ...
    # ADD_NEW_PLUGIN: Add new plugin object initialization here ...
    # ...

    if tsc.nodetype == 'controller':
        fmAlarmObject.database_setup_in_progress = True
        _database_setup('collectd')
        fmAlarmObject.database_setup_in_progress = False

    pluginObject.init_completed()
    return 0


# The notifier function inspects the collectd notification and determines if
# the representative alarm needs to be asserted, severity changed, or cleared.
def notifier_func(nObject):

    # do nothing till config is complete.
    if pluginObject._config_complete is False:
        if pluginObject.config_complete() is False:
            return 0

    if pluginObject._node_ready is False:
        collectd.info("%s %s not ready ; from:%s:%s:%s" %
                      (PLUGIN,
                       fmAlarmObject.host,
                       nObject.host,
                       nObject.plugin,
                       nObject.plugin_instance))
        pluginObject.node_ready()
        return 0

    if fmAlarmObject.fm_connectivity is False:
        # handle multi threading startup
        with fmAlarmObject.lock:
            if fmAlarmObject.fm_connectivity is True:
                return 0

            ##################################################################
            #
            # With plugin objects initialized ...
            # Query FM for any resource alarms that may already be raised
            # Load the queries severity state into the appropriate
            # severity list for those that are.
            for alarm_id in ALARM_ID_LIST:
                collectd.debug("%s searching for all '%s' alarms " %
                               (PLUGIN, alarm_id))
                try:
                    alarms = api.get_faults_by_id(alarm_id)
                except Exception as ex:
                    collectd.warning("%s 'get_faults_by_id' exception ; %s" %
                                     (PLUGIN, ex))

                    # if fm is not responding then the node is not ready
                    pluginObject._node_ready = False
                    pluginObject.node_ready_count = 0
                    return 0

                if alarms:
                    for alarm in alarms:
                        want_alarm_clear = False
                        eid = alarm.entity_instance_id
                        # ignore alarms not for this host
                        if fmAlarmObject.host not in eid:
                            continue

                        # get the instance part of the eid
                        #  instance based alarms are cleared over a process
                        #  restart to avoid the potential for stuck alarms.
                        base_eid = 'host=' + os.uname()[1]
                        if eid.split(base_eid)[1]:
                            want_alarm_clear = True

                        collectd.info('%s alarm %s:%s:%s found at startup' %
                                      (PLUGIN,
                                       alarm.severity,
                                       alarm_id,
                                       eid))

                        if want_alarm_clear is True:
                            if clear_alarm(alarm_id, eid) is False:
                                collectd.error("%s alarm %s:%s:%s clear "
                                               "failed" %
                                               (PLUGIN, alarm.severity,
                                                alarm_id,
                                                eid))
                            continue

                        if alarm.severity == "critical":
                            sev = "failure"
                        elif alarm.severity == "major":
                            sev = "warning"
                        else:
                            sev = "okay"
                            continue

                        # Load the alarm severity by plugin/instance lookup.
                        base_obj = get_base_object(alarm_id)
                        if base_obj is not None:
                            base_obj.manage_alarm_lists(eid, sev)

                            # the eid at this point is really the plugin id
                            pid = eid

                            # here the eid is used to represent the degrade id
                            eid = base_obj.degrade_id

                            # handle degrade for alarmed resources
                            # over process startup.
                            add = False
                            if alarm.severity == "critical" and\
                                    pid in mtcDegradeObj.degrade_list__failure:
                                add = True
                            elif alarm.severity == "major" and\
                                    pid in mtcDegradeObj.degrade_list__warning:
                                add = True
                            if add is True:

                                mtcDegradeObj.degrade_list.append(eid)
                                collectd.info("%s '%s' plugin added to "
                                              "degrade list due to found "
                                              "startup alarm %s" %
                                              (PLUGIN_DEGRADE, eid, alarm_id))

        fmAlarmObject.fm_connectivity = True
        collectd.info("%s node ready" % PLUGIN)

    collectd.debug('%s notification: %s %s:%s - %s %s %s [%s]' % (
        PLUGIN,
        nObject.host,
        nObject.plugin,
        nObject.plugin_instance,
        nObject.type,
        nObject.type_instance,
        nObject.severity,
        nObject.message))

    # Load up severity variables and alarm actions based on
    # this notification's severity level.
    if nObject.severity == NOTIF_OKAY:
        severity_str = "okay"
        _severity_num = fm_constants.FM_ALARM_SEVERITY_CLEAR
        _alarm_state = fm_constants.FM_ALARM_STATE_CLEAR
    elif nObject.severity == NOTIF_FAILURE:
        severity_str = "failure"
        _severity_num = fm_constants.FM_ALARM_SEVERITY_CRITICAL
        _alarm_state = fm_constants.FM_ALARM_STATE_SET
    elif nObject.severity == NOTIF_WARNING:
        severity_str = "warning"
        _severity_num = fm_constants.FM_ALARM_SEVERITY_MAJOR
        _alarm_state = fm_constants.FM_ALARM_STATE_SET
    else:
        collectd.debug('%s with unsupported severity %d' %
                       (PLUGIN, nObject.severity))
        return 0

    if tsc.nodetype == 'controller':
        if fmAlarmObject.database_setup is False:
            if fmAlarmObject.database_setup_in_progress is False:
                fmAlarmObject.database_setup_in_progress = True
                _database_setup('collectd')
                fmAlarmObject.database_setup_in_progress = False

    # get plugin object
    if nObject.plugin in PLUGINS:
        base_obj = obj = PLUGINS[nObject.plugin]

        # if this notification is for a plugin instance then get that
        # instances's object instead.
        # If that object does not yet exists then create it.
        eid = ''

        # DF instances are statically allocated
        if nObject.plugin == PLUGIN__DF:
            eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)

            # get this instances object
            obj = base_obj._get_instance_object(eid)
            if obj is None:
                # path should never be hit since all DF instances
                # are statically allocated.
                return 0

        elif nObject.plugin_instance:
            need_instance_object_create = False
            # Build the entity_id from the parent object if needed
            eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
            try:
                # Need lock when reading/writing any obj.instance_objects list
                with fmAlarmObject.lock:

                    # we will take an exception if this object is not
                    # in the list. The exception handling code below will
                    # create and add this object for success path the next
                    # time around.
                    inst_obj = base_obj.instance_objects[eid]

                    collectd.debug("%s %s instance %s already exists %s" %
                                   (PLUGIN, nObject.plugin, eid, inst_obj))
                    # _print_state(inst_obj)

            except:
                need_instance_object_create = True

            if need_instance_object_create is True:
                base_obj.create_instance_object(nObject.plugin_instance)
                inst_obj = base_obj._get_instance_object(eid)
                if inst_obj:
                    inst_obj.instance_name = nObject.plugin_instance
                    collectd.debug("%s %s:%s inst object created" %
                                   (PLUGIN,
                                    inst_obj.plugin,
                                    inst_obj.instance_name))
                else:
                    collectd.error("%s %s:%s inst object create failed" %
                                   (PLUGIN,
                                    nObject.plugin,
                                    nObject.plugin_instance_name))
                    return 0

            # re-assign the object
            obj = inst_obj
        else:
            if not len(base_obj.entity_id):
                # Build the entity_id from the parent object if needed
                eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)

        # update the object with the eid if its not already set.
        if not len(obj.entity_id):
            obj.entity_id = eid

    else:
        collectd.debug("%s notification for unknown plugin: %s %s" %
                       (PLUGIN, nObject.plugin, nObject.plugin_instance))
        return 0

    # if obj.warnings or obj.failures:
    #     _print_state(obj)

    # manage reading value change ; store last and log if gt obj.step
    action = obj.manage_change(nObject)
    if action == "done":
        return 0

    # Handle degrade state update early in process start.
    # Ensure that a degrade condition that clears over a collectd
    # collectd process restart is cleared as soon as possible.
    if obj.count == 0:
        mtcDegradeObj.mtce_degrade_notifier(nObject)

    # increment just before any possible return for a valid sample
    obj.count += 1

    # audit file system presence every time we get the
    # notification for the root file system ; which will
    # always be there.
    if obj.instance_name == '/':
        _clear_alarm_for_missing_filesystems()
        if len(mtcDegradeObj.degrade_list):
            mtcDegradeObj.remove_degrade_for_missing_filesystems()

        obj.alarm_audit_threshold += 1
        if obj.alarm_audit_threshold >= AUDIT_RATE:
            if want_state_audit:
                obj._state_audit("audit")
            obj.alarm_audit_threshold = 0

            #################################################################
            #
            # Audit Asserted Alarms
            #
            # Loop over the list of auditable alarm ids building two
            # dictionaries, one containing warning (major) and the other
            # failure (critical) with alarm info needed to detect and
            # correct stale, missing or severity mismatched alarms for
            # the listed alarm ids <100.xxx>.
            #
            # Note: Conversion in terminology from
            #         warning -> major and
            #         failures -> critical
            #       is done because fm speaks in terms of major and critical
            #       while the plugin speaks in terms of warning and failure.
            #
            major_alarm_dict = {}
            critical_alarm_dict = {}
            for alarm_id in AUDIT_ALARM_ID_LIST:

                tmp_base_obj = get_base_object(alarm_id)
                if tmp_base_obj is None:
                    collectd.error("%s audit %s base object lookup failed" %
                                   (PLUGIN, alarm_id))
                    continue

                # Build 2 dictionaries containing current alarmed info.
                # Dictionary entries are indexed by entity id to fetch the
                # alarm id and last fault object used to create the alarm
                # for the mismatch and missing case handling.
                #
                # { eid : { alarm : <alarm id>, fault : <fault obj> }}, ... }

                # major list for base object from warnings list
                if tmp_base_obj.entity_id in tmp_base_obj.warnings:
                    info = {}
                    info[pc.AUDIT_INFO_ALARM] = alarm_id
                    info[pc.AUDIT_INFO_FAULT] = tmp_base_obj.fault
                    major_alarm_dict[tmp_base_obj.entity_id] = info

                # major list for instance objects from warnings list
                for _inst_obj in tmp_base_obj.instance_objects:
                    inst_obj = tmp_base_obj.instance_objects[_inst_obj]
                    if inst_obj.entity_id in tmp_base_obj.warnings:
                        info = {}
                        info[pc.AUDIT_INFO_ALARM] = alarm_id
                        info[pc.AUDIT_INFO_FAULT] = inst_obj.fault
                        major_alarm_dict[inst_obj.entity_id] = info

                # critical list for base object from failures list
                if tmp_base_obj.entity_id in tmp_base_obj.failures:
                    info = {}
                    info[pc.AUDIT_INFO_ALARM] = alarm_id
                    info[pc.AUDIT_INFO_FAULT] = tmp_base_obj.fault
                    critical_alarm_dict[tmp_base_obj.entity_id] = info

                # critical list for instance objects from failures list
                for _inst_obj in tmp_base_obj.instance_objects:
                    inst_obj = tmp_base_obj.instance_objects[_inst_obj]
                    if inst_obj.entity_id in tmp_base_obj.failures:
                        info = {}
                        info[pc.AUDIT_INFO_ALARM] = alarm_id
                        info[pc.AUDIT_INFO_FAULT] = inst_obj.fault
                        critical_alarm_dict[inst_obj.entity_id] = info

            pluginObject.alarms_audit(api, AUDIT_ALARM_ID_LIST,
                                      major_alarm_dict,
                                      critical_alarm_dict)
            # end alarms audit
            #################################################################

    # exit early if there is no alarm update to be made
    if obj.debounce(base_obj,
                    obj.entity_id,
                    severity_str,
                    obj.value) is False:
        # Call the degrade notifier at steady state,
        #  degrade or clear, so that the required collectd
        #  degrade state is periodically refreshed.
        # However, rather than do this refresh on every notification,
        #  just do it for the root filesystem instance case.
        if obj.instance_name == '/':
            mtcDegradeObj.mtce_degrade_notifier(nObject)
        return 0

    mtcDegradeObj.manage_degrade_list(nObject)
    mtcDegradeObj.mtce_degrade_notifier(nObject)

    if _alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
        if clear_alarm(obj.id, obj.entity_id) is False:
            return 0
    else:

        # manage addition of the failure reason text
        if obj.cause == fm_constants.ALARM_PROBABLE_CAUSE_50:
            # if this is a threshold alarm then build the reason text that
            # includes the threshold and the reading that caused the assertion.
            reason = obj.resource_name
            reason += " threshold exceeded ;"
            if obj.threshold != INVALID_THRESHOLD:
                reason += " threshold {:2.2f}".format(obj.threshold) + "%,"
            if obj.value:
                reason += " actual {:2.2f}".format(obj.value) + "%"

        elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
            reason = obj.reason_failure

        else:
            reason = obj.reason_warning

        # build the alarm object
        obj.fault = fm_api.Fault(
            alarm_id=obj.id,
            alarm_state=_alarm_state,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
            entity_instance_id=obj.entity_id,
            severity=_severity_num,
            reason_text=reason,
            alarm_type=base_obj.alarm_type,
            probable_cause=base_obj.cause,
            proposed_repair_action=base_obj.repair,
            service_affecting=base_obj.service_affecting,
            suppression=base_obj.suppression)

        try:
            alarm_uuid = api.set_fault(obj.fault)
            if pc.is_uuid_like(alarm_uuid) is False:
                collectd.error("%s 'set_fault' failed ; %s:%s ; %s" %
                               (PLUGIN,
                                base_obj.id,
                                obj.entity_id,
                                alarm_uuid))
                return 0

        except Exception as ex:
            collectd.error("%s 'set_fault' exception ; %s:%s:%s ; %s" %
                           (PLUGIN,
                            obj.id,
                            obj.entity_id,
                            _severity_num,
                            ex))
            return 0

    # update the lists now that
    base_obj.manage_alarm_lists(obj.entity_id, severity_str)

    collectd.info("%s %s alarm %s:%s %s:%s value:%2.2f" % (
                  PLUGIN,
                  _alarm_state,
                  base_obj.id,
                  severity_str,
                  obj.instance_name,
                  obj.entity_id,
                  obj.value))

    # Debug only: comment out for production code.
    # obj._state_audit("change")

    return 0


collectd.register_init(init_func)
collectd.register_notification(notifier_func)