integ/monitoring/collectd-extensions/src/mtce_notifier.py

#
# Copyright (c) 2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This file is the collectd 'Maintenance' Notifier.
#
# Collectd provides information about each event as an object passed to the
# notification handler ; the notification object.
#
#    object.host              - the hostname
#
#    object.plugin            - the name of the plugin aka resource
#    object.plugin_instance   - plugin instance string i.e. say mountpoint
#                               for df plugin
#    object.type,             - the unit i.e. percent or absolute
#    object.type_instance     - the attribute i.e. free, used, etc
#
#    object.severity          - an integer value 1=failure, 2=warning, 4=okay
#    object.message           - a log-able message containing the above along
#                               with the value
#
# This notifier manages requesting mtce to assert or clear its collectd
# host-degrade-cause flag based on notification messages sent from collectd.
#
# Messages to maintenance are throttled ONE_EVERY while this state is the
# same as last state.
#
# Message is sent on every state change
#  from clear  to assert or
#  from assert to clear
#
# See code comments for details.
#
############################################################################
#
# Import list

import os
import socket
import collectd
import tsconfig.tsconfig as tsc

# Name of this plugin ; used as the prefix of every log it produces
PLUGIN = 'degrade notifier'

# collectd notification severity values ;
# Note: defined numerically here because they can't seem to be
#       pulled in symbolically with a header
NOTIF_FAILURE = 1
NOTIF_WARNING = 2
NOTIF_OKAY = 4

# generic return codes
PASS = 0
FAIL = 1

# default mtce port.
# ... can be overridden through the plugin configuration
MTCE_CMD_RX_PORT = 2101

# same state message throttle count.
# ... while the assert or clear state remains unchanged the degrade
#     message is only sent once every 'this' number of notifications.
ONE_EVERY = 10

# Names of the plugins this notifier knows about
# ... platform resource plugins
PLUGIN__DF = 'df'
PLUGIN__MEM = 'memory'
PLUGIN__CPU = 'cpu'

# ... vswitch plugins
PLUGIN__VSWITCH_MEM = 'vswitch_mem'
PLUGIN__VSWITCH_CPU = 'vswitch_cpu'
PLUGIN__VSWITCH_PORT = 'vswitch_port'
PLUGIN__VSWITCH_IFACE = 'vswitch_iface'

# ... interface and example plugins
PLUGIN_INTERFACE = 'interface'
PLUGIN__EXAMPLE = 'example'


# The collectd Maintenance Notifier Object
class collectdMtceNotifierObject:
    """
    State container for the maintenance degrade notifier.

    Holds the messaging target, the lists of plugins that cause degrade
    for a given severity, the running list of degraded resources and the
    counters used to throttle messages sent to maintenance.
    """

    def __init__(self, port):
        """
        collectdMtceNotifierObject Class constructor

        port - the maintenance port that degrade messages are sent to
        """

        # Messaging target.
        # The address starts out unknown (None) ; the port is the one
        # supplied by the caller.
        self.addr = None
        self.port = port

        # Protocol family used when messaging maintenance.
        # Starts out as IPV4 ; if the system is IPV6 then that is learned
        # and this 'protocol' is updated with AF_INET6
        self.protocol = socket.AF_INET

        # Plugin names that require degrade on a warning notification.
        self.degrade_list__warning = [PLUGIN_INTERFACE]

        # Plugin names that require degrade on a failure notification.
        self.degrade_list__failure = [
            PLUGIN__DF,
            PLUGIN__MEM,
            PLUGIN__CPU,
            PLUGIN__VSWITCH_MEM,
            PLUGIN__VSWITCH_CPU,
            PLUGIN__VSWITCH_PORT,
            PLUGIN__VSWITCH_IFACE,
            PLUGIN_INTERFACE,
            PLUGIN__EXAMPLE,
        ]

        # The running list of resources that require degrade.
        #  - a degrade clear message is sent whenever this list is empty.
        #  - a degrade assert message is sent whenever it is not empty.
        self.degrade_list = []

        # Throttle down sending of duplicate degrade assert/clear messages
        self.msg_throttle = 0
        self.last_state = "undef"


# Instantiate the mtce_notifier object
# This object persists from notification to notification
obj = collectdMtceNotifierObject(MTCE_CMD_RX_PORT)


def _get_active_controller_ip():
    """
    Get the active controller host IP

    Resolves the 'controller' hostname and stores the first address
    returned in obj.addr. If the lookup fails for any reason the failure
    is logged and obj.addr is set to None ; callers detect a failed
    lookup by checking obj.addr.

    Returns 0 in all cases.
    """

    try:
        # first result ; sockaddr tuple ; address string
        obj.addr = socket.getaddrinfo('controller', None)[0][4][0]
        collectd.info("%s controller ip: %s" % (PLUGIN, obj.addr))
    except Exception as ex:
        # Best effort ; any failure just leaves the address unknown.
        obj.addr = None
        collectd.error("%s failed to get controller ip ; %s" %
                       (PLUGIN, str(ex)))

    # Same return value on both paths ; success is signalled by obj.addr
    return 0


def _df_instance_to_path(df_inst):
    """
    Translate a df resource instance name into its mountpoint path.

    The first 3 characters of the name (the 'df_' prefix) are dropped,
    every '-' in the remainder becomes a '/' and the result is prefixed
    with '/'. The root instance 'df_root' is special cased to '/'.
    """

    if df_inst != 'df_root':
        # Everything after the prefix is the '-' separated mount path.
        path_parts = df_inst[3:].split('-')
        return '/' + '/'.join(path_parts)

    # The root file system
    return '/'


# This function removes degraded file systems that are no longer present.
def _clear_degrade_for_missing_filesystems():
    """
    Remove degraded file systems that are no longer mounted or present.

    Walks the degrade list and removes every file system resource whose
    mount point is no longer a mount point.

    Returns 0 in all cases.
    """

    # Iterate over a snapshot of the list ; entries are removed from the
    # real list inside the loop and removing from a list while iterating
    # over it causes the entry following each removal to be skipped.
    for df_inst in list(obj.degrade_list):

        # Only file system plugins are looked at.
        # File system plugin instance names are prefixed with 'df_'
        # as the first 3 chars in the instance name.
        if not df_inst.startswith('df_'):
            continue

        # check the mount point.
        # if the mount point no longer exists then remove
        # this instance from the degrade list.
        path = _df_instance_to_path(df_inst)
        if not os.path.ismount(path):
            collectd.info("%s clearing degrade for missing %s ; %s" %
                          (PLUGIN, path, obj.degrade_list))
            obj.degrade_list.remove(df_inst)

    return 0


# The collectd configuration interface
#
# Used to configure the maintenance port.
#    key = 'port'
#    val = port number
#
def config_func(config):
    """
    Configure the maintenance degrade notifier plugin.

    Looks for a 'port' key (case insensitive) in the children of the
    collectd config object and uses the first value of the first such
    entry as the maintenance port. If there is no 'port' entry, or its
    value is missing or not a number, the default port is used instead.

    Returns 0 in all cases.
    """

    collectd.debug('%s config function' % PLUGIN)
    for node in config.children:
        # Only the 'port' setting is of interest. Values of other
        # settings are not touched so a setting that has no value
        # cannot break the configuration of this plugin.
        if node.key.lower() != 'port':
            continue

        try:
            obj.port = int(node.values[0])
        except (IndexError, TypeError, ValueError) as ex:
            # missing or non-numeric port ; fall back to the default
            collectd.error("%s invalid mtce port setting ; %s" %
                           (PLUGIN, str(ex)))
            break

        collectd.info("%s configured mtce port: %d" %
                      (PLUGIN, obj.port))
        return 0

    obj.port = MTCE_CMD_RX_PORT
    collectd.error("%s no mtce port provided ; defaulting to %d" %
                   (PLUGIN, obj.port))
    return 0


# Collectd calls this function on startup.
def init_func():
    """
    Collectd Mtce Notifier Initialization Function

    Records this host's name in the notifier object and logs the node
    type, hostname and maintenance port that messages will be sent to.
    """

    # the second field of uname is the node (host) name
    hostname = os.uname()[1]
    obj.host = hostname

    startup_log = "%s %s:%s sending to mtce port %d" % (
        PLUGIN, tsc.nodetype, hostname, obj.port)
    collectd.info(startup_log)

    collectd.debug("%s init function" % PLUGIN)


# Manage the degrade list for a failure or warning notification.
def _update_degrade_list(nObject, resource, degrade_plugins):
    """
    Add 'resource' to the degrade list if its plugin requires degrade.

    nObject         - the collectd notification object
    resource        - the resource name built from the notification object
    degrade_plugins - the plugin names that cause degrade for the
                      severity of this notification
    """

    if not degrade_plugins:
        # If no plugins cause degrade for this severity then make sure
        # this resource is not in the degrade list. Should never occur.
        if resource in obj.degrade_list:
            obj.degrade_list.remove(resource)
        return

    # A resource whose plugin is not in the list for this severity is
    # left untouched ; it is neither added to nor removed from the
    # degrade list. Same for a resource that is already degraded.
    if nObject.plugin not in degrade_plugins:
        return
    if resource in obj.degrade_list:
        return

    # handle dynamic filesystems going missing over a swact
    # or unmount and being reported as a transient error by
    # the df plugin. Don't add it to the degrade list if the
    # mountpoint is gone.
    if nObject.plugin == PLUGIN__DF:
        if not os.path.ismount(_df_instance_to_path(resource)):
            return

    collectd.info("%s %s added to degrade list" % (PLUGIN, resource))
    obj.degrade_list.append(resource)


# Send the degrade state ; assert or clear message to mtcAgent.
def _send_degrade_state(nObject, resource, state, resources):
    """
    Send one degrade state message to maintenance over UDP.

    nObject   - the collectd notification object ; supplies the hostname
    resource  - the resource of this notification ; used for logging only
    state     - "assert" or "clear"
    resources - comma separated degraded resource names to include

    If we get a send failure then log it and set the addr to None
    so it forces us to refresh the controller address on the next
    notification. Nothing is sent if the controller address cannot
    be learned.
    """

    mtce_socket = None
    try:
        # socket.socket() raises socket.error on failure rather than
        # returning a false value so there is no need to test the result.
        mtce_socket = socket.socket(obj.protocol, socket.SOCK_DGRAM)

        if obj.addr is None:
            _get_active_controller_ip()
            if obj.addr is None:
                return

        # Create the Maintenance message.
        message = "{\"service\":\"collectd_notifier\","
        message += "\"hostname\":\"" + nObject.host + "\","
        message += "\"degrade\":\"" + state + "\","
        message += "\"resource\":\"" + resources + "\"}"
        collectd.debug("%s: %s" % (PLUGIN, message))

        # sendto requires bytes on python 3 ; on python 2 a str
        # already is bytes and is sent unchanged.
        if not isinstance(message, bytes):
            message = message.encode('utf-8')

        mtce_socket.settimeout(1.0)
        mtce_socket.sendto(message, (obj.addr, obj.port))

    except socket.error as e:
        if e.args and e.args[0] == socket.EAI_ADDRFAMILY:
            # Handle IPV4 to IPV6 switchover:
            obj.protocol = socket.AF_INET6
            collectd.info("%s %s ipv6 addressing (%s)" %
                          (PLUGIN, resource, obj.addr))
        else:
            collectd.error("%s %s socket error (%s) ; %s" %
                           (PLUGIN, resource, obj.addr, str(e)))
            # try self correction
            obj.addr = None
            obj.protocol = socket.AF_INET

    finally:
        # Always release the socket ; including on the early return
        # above and when the send raises.
        if mtce_socket is not None:
            mtce_socket.close()


# This is the Notifier function that is called by collectd.
#
# Handling steps are
#
#  1. build resource name from notification object.
#  2. check resource against severity lists.
#  3. manage this instance's degrade state.
#  4. send mtcAgent the degrade state message.
#
def notifier_func(nObject):
    """
    Collectd Mtce Notifier Handler Function

    nObject - the collectd notification object ; the plugin,
              plugin_instance, severity and host attributes are used.

    Returns FAIL for an unsupported severity, otherwise 0.
    """

    # Create the resource name from the notifier object.
    # format: <plugin name>_<plugin_instance_name>
    resource = nObject.plugin
    if nObject.plugin_instance:
        resource += "_" + nObject.plugin_instance

    # This block looks at the current notification severity
    # and manages the degrade_list.
    # If the specified plugin name exists in each of the warnings
    # or failure lists and there is a current severity match then
    # add that resource instance to the degrade list.
    # Conversely if this notification is OKAY then make sure this
    # resource instance is not in the degrade list (remove it if it is)
    #
    # Severities are compared by value ; identity comparison of
    # integers only works by accident of the interpreter.
    severity = nObject.severity
    if severity == NOTIF_OKAY:
        if resource in obj.degrade_list:
            obj.degrade_list.remove(resource)

    elif severity == NOTIF_FAILURE:
        _update_degrade_list(nObject, resource, obj.degrade_list__failure)

    elif severity == NOTIF_WARNING:
        _update_degrade_list(nObject, resource, obj.degrade_list__warning)

    else:
        collectd.info("%s unsupported severity %d" %
                      (PLUGIN, nObject.severity))
        return FAIL

    # running counter of notifications.
    obj.msg_throttle += 1

    # Support for Dynamic File Systems
    # --------------------------------
    # Some active controller mounted filesystems can become
    # unmounted under the watch of collectd. This can occur
    # as a result of a Swact. If a 'degrade' is raised at the
    # time an fs disappears then that state can become stuck
    # active until the next Swact. This call handles this case.
    #
    # Audit file system presence every time we get the
    # notification for the root file system.
    # Depending on the root filesystem always being there.
    if nObject.plugin == PLUGIN__DF \
       and nObject.plugin_instance == 'root' \
       and obj.degrade_list:
        _clear_degrade_for_missing_filesystems()

    # If degrade list is empty then a clear state is sent to maintenance.
    # If degrade list is NOT empty then an assert state is sent to maintenance
    # For logging and to ease debug the code below will create a list of
    # degraded resource instances to be included in the message to maintenance
    # for mtcAgent to optionally log it.
    if obj.degrade_list:
        # limit the degraded resource list being sent to mtce to 5
        resources = ','.join(obj.degrade_list[:5])
        state = "assert"
    else:
        resources = ""
        state = "clear"

    # Message throttling ....

    # Avoid sending the same last state message for up to ONE_EVERY count.
    # Just reduce load on mtcAgent
    if obj.last_state == state and obj.msg_throttle < ONE_EVERY:
        return 0

    # if the degrade state has changed then log it and proceed ;
    # the very first state learned after startup is not logged.
    if obj.last_state != state and obj.last_state != "undef":
        collectd.info("%s degrade %s %s" %
                      (PLUGIN, state, obj.degrade_list))

    # Save state for next time
    obj.last_state = state

    # Clear the message throttle counter
    obj.msg_throttle = 0

    _send_degrade_state(nObject, resource, state, resources)

    return 0

# Register this plugin's config, init and notification
# handler functions with collectd.
collectd.register_config(config_func)
collectd.register_init(init_func)
collectd.register_notification(notifier_func)