monasca-agent/monasca_agent/common/check_status.py

"""
This module contains classes which are used to occasionally persist the status
of checks.
"""

# stdlib
import datetime
import logging
import os
import pickle
import platform
import sys
import tempfile
import time

# project
import collections
import config
import util
import yaml

# 3rd party
import ntplib

STATUS_OK = 'OK'
STATUS_ERROR = 'ERROR'
STATUS_WARNING = 'WARNING'

NTP_OFFSET_THRESHOLD = 600


log = logging.getLogger(__name__)


class Stylizer(object):

    STYLES = {
        'bold': 1,
        'grey': 30,
        'red': 31,
        'green': 32,
        'yellow': 33,
        'blue': 34,
        'magenta': 35,
        'cyan': 36,
        'white': 37,
    }

    HEADER = '\033[1m'
    UNDERLINE = '\033[2m'

    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    RESET = '\033[0m'

    ENABLED = False

    @classmethod
    def stylize(cls, text, *styles):
        """stylize the text. """
        if not cls.ENABLED:
            return text
        # don't bother about escaping, not that complicated.
        fmt = '\033[%dm%s'

        for style in styles or []:
            text = fmt % (cls.STYLES[style], text)

        return text + fmt % (0, '')  # reset


# a small convienence method
def style(*args):
    return Stylizer.stylize(*args)


def logger_info():
    loggers = []
    root_logger = logging.getLogger()
    if len(root_logger.handlers) > 0:
        for handler in root_logger.handlers:
            if isinstance(handler, logging.StreamHandler):
                loggers.append(handler.stream.name)
            if isinstance(handler, logging.handlers.SysLogHandler):
                if isinstance(handler.address, basestring):
                    loggers.append('syslog:%s' % handler.address)
                else:
                    loggers.append('syslog:(%s, %s)' % handler.address)
    else:
        loggers.append("No loggers configured")
    return ', '.join(loggers)


def get_ntp_info():
    ntp_offset = ntplib.NTPClient().request('pool.ntp.org', version=3).offset
    if abs(ntp_offset) > NTP_OFFSET_THRESHOLD:
        ntp_styles = ['red', 'bold']
    else:
        ntp_styles = []
    return ntp_offset, ntp_styles


class AgentStatus(object):
    """A small class used to load and save status messages to the filesystem.
    """

    NAME = None
    agent_config = config.Config()

    def __init__(self):
        self.created_at = datetime.datetime.now()
        self.created_by_pid = os.getpid()

    def has_error(self):
        raise NotImplementedError

    def persist(self):
        try:
            path = self._get_pickle_path()
            log.debug("Persisting status to %s" % path)
            f = open(path, 'w')
            try:
                pickle.dump(self, f)
            finally:
                f.close()
        except Exception:
            log.exception("Error persisting status")

    def created_seconds_ago(self):
        td = datetime.datetime.now() - self.created_at
        return td.seconds

    def render(self):
        indent = "  "
        lines = self._header_lines(indent) + [
            indent + l for l in self.body_lines()
        ] + ["", ""]
        return "\n".join(lines)

    @classmethod
    def _title_lines(cls):
        name_line = "%s (v %s)" % (cls.NAME, AgentStatus.agent_config.get_version())
        lines = [
            "=" * len(name_line),
            "%s" % name_line,
            "=" * len(name_line),
            "",
        ]
        return lines

    def _header_lines(self, indent):
        # Don't indent the header
        lines = self._title_lines()
        if self.created_seconds_ago() > 120:
            styles = ['red', 'bold']
        else:
            styles = []
        # We color it in red if the status is too old
        fields = [
            (
                style("Status date", *styles),
                style("%s (%ss ago)" %
                      (self.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                       self.created_seconds_ago()), *styles)
            )
        ]

        fields += [
            ("Pid", self.created_by_pid),
            ("Platform", platform.platform()),
            ("Python Version", platform.python_version()),
            ("Logs", logger_info()),
        ]

        for key, value in fields:
            l = indent + "%s: %s" % (key, value)
            lines.append(l)
        return lines + [""]

    def to_dict(self):
        return {
            'pid': self.created_by_pid,
            'status_date': "%s (%ss ago)" % (self.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                                             self.created_seconds_ago())}

    @classmethod
    def _not_running_message(cls):
        lines = cls._title_lines() + [
            style("  %s is not running." % cls.NAME, 'red'),
            style("""  You can get more details in the logs: %s""" % logger_info(), 'red'),
            "",
            ""
        ]
        return "\n".join(lines)

    @classmethod
    def remove_latest_status(cls):
        log.debug("Removing latest status")
        try:
            os.remove(cls._get_pickle_path())
        except OSError:
            pass

    @classmethod
    def load_latest_status(cls):
        try:
            f = open(cls._get_pickle_path())
            try:
                return pickle.load(f)
            finally:
                f.close()
        except IOError:
            log.info("Couldn't load latest status")
            return None

    @classmethod
    def print_latest_status(cls, verbose=False):
        cls.verbose = verbose
        Stylizer.ENABLED = False
        try:
            if sys.stdout.isatty():
                Stylizer.ENABLED = True
        except Exception:
            # Don't worry if we can't enable the
            # stylizer.
            pass

        message = cls._not_running_message()
        exit_code = -1

        module_status = cls.load_latest_status()
        if module_status:
            message = module_status.render()
            exit_code = 0
            if module_status.has_error():
                exit_code = 1

        sys.stdout.write(message)
        return exit_code

    @classmethod
    def _get_pickle_path(cls):
        return os.path.join(tempfile.gettempdir(), cls.__name__ + '.pickle')


class InstanceStatus(object):

    def __init__(self, instance_id, status, error=None, tb=None, warnings=None, metric_count=None):
        self.instance_id = instance_id
        self.status = status
        self.error = repr(error)
        self.traceback = tb
        self.warnings = warnings
        self.metric_count = metric_count

    def has_error(self):
        return self.status == STATUS_ERROR

    def has_warnings(self):
        return self.status == STATUS_WARNING


class CheckStatus(object):

    def __init__(self, check_name, instance_statuses, metric_count,
                 event_count, init_failed_error=None,
                 init_failed_traceback=None, library_versions=None):
        self.name = check_name
        self.instance_statuses = instance_statuses
        self.metric_count = metric_count
        self.event_count = event_count
        self.init_failed_error = init_failed_error
        self.init_failed_traceback = init_failed_traceback
        self.library_versions = library_versions

    @property
    def status(self):
        if self.init_failed_error:
            return STATUS_ERROR
        for instance_status in self.instance_statuses:
            if instance_status.status == STATUS_ERROR:
                return STATUS_ERROR
        return STATUS_OK

    def has_error(self):
        return self.status == STATUS_ERROR


class EmitterStatus(object):

    def __init__(self, name, error=None):
        self.name = name
        self.error = None
        if error:
            self.error = repr(error)

    @property
    def status(self):
        if self.error:
            return STATUS_ERROR
        else:
            return STATUS_OK

    def has_error(self):
        return self.status != STATUS_OK


class CollectorStatus(AgentStatus):

    NAME = 'Collector'

    def __init__(self, check_statuses=None, emitter_statuses=None):
        AgentStatus.__init__(self)
        self.check_statuses = check_statuses or []
        self.emitter_statuses = emitter_statuses or []

    @property
    def status(self):
        for check_status in self.check_statuses:
            if check_status.status == STATUS_ERROR:
                return STATUS_ERROR
        return STATUS_OK

    def has_error(self):
        return self.status != STATUS_OK

    def body_lines(self):
        lines = [
            'Clocks',
            '======',
            ''
        ]
        try:
            ntp_offset, ntp_styles = get_ntp_info()
            lines.append('  ' + style('NTP offset', *ntp_styles) + ': ' +
                         style('%s s' % round(ntp_offset, 4), *ntp_styles))
        except Exception as e:
            lines.append('  NTP offset: Unkwown (%s)' % str(e))
        lines.append('  System UTC time: ' + datetime.datetime.utcnow().__str__())
        lines.append('')

        # Paths to checks_d/conf.d
        lines += [
            'Paths',
            '=====',
            ''
        ]

        paths = util.Paths()
        try:
            confd_path = paths.get_confd_path()
        except util.PathNotFound:
            confd_path = 'Not found'

        try:
            checksd_path = paths.get_checksd_path()
        except util.PathNotFound:
            checksd_path = 'Not found'

        lines.append('  conf.d: ' + confd_path)
        lines.append('  checks_d: ' + checksd_path)
        lines.append('')

        # Hostnames
        lines += [
            'Hostnames',
            '=========',
            ''
        ]

        # Checks.d Status
        lines += [
            'Checks',
            '======',
            ''
        ]
        check_statuses = self.check_statuses + get_jmx_status()
        if not check_statuses:
            lines.append("  No checks have run yet.")
        else:
            for cs in check_statuses:
                check_lines = [
                    '  ' + cs.name,
                    '  ' + '-' * len(cs.name)
                ]
                if cs.init_failed_error:
                    check_lines.append("    - initialize check class [%s]: %s" %
                                       (style(STATUS_ERROR, 'red'),
                                        repr(cs.init_failed_error)))
                    if self.verbose and cs.init_failed_traceback:
                        check_lines.extend('      ' + line for line in
                                           cs.init_failed_traceback.split('\n'))
                else:
                    for s in cs.instance_statuses:
                        c = 'green'
                        if s.has_warnings():
                            c = 'yellow'
                        if s.has_error():
                            c = 'red'
                        line = "    - instance #%s [%s]" % (s.instance_id, style(s.status, c))
                        if s.has_error():
                            line += u": %s" % s.error
                        if s.metric_count is not None:
                            line += " collected %s metrics" % s.metric_count

                        check_lines.append(line)

                        if s.has_warnings():
                            for warning in s.warnings:
                                warn = warning.split('\n')
                                if not len(warn):
                                    continue
                                check_lines.append(u"        %s: %s" %
                                                   (style("Warning", 'yellow'), warn[0]))
                                check_lines.extend(u"        %s" % l for l in warn[1:])
                        if self.verbose and s.traceback is not None:
                            check_lines.extend('      ' + line for line in s.traceback.split('\n'))

                    check_lines += [
                        "    - Collected %s metrics & %s events" % (
                            cs.metric_count, cs.event_count),
                    ]

                    if cs.library_versions is not None:
                        check_lines += [
                            "    - Dependencies:"]
                        for library, version in cs.library_versions.iteritems():
                            check_lines += ["        - %s: %s" % (library, version)]

                    check_lines += [""]

                lines += check_lines

        # Emitter status
        lines += [
            "",
            "Emitters",
            "========",
            ""
        ]
        if not self.emitter_statuses:
            lines.append("  No emitters have run yet.")
        else:
            for es in self.emitter_statuses:
                c = 'green'
                if es.has_error():
                    c = 'red'
                line = "  - %s [%s]" % (es.name, style(es.status, c))
                if es.status != STATUS_OK:
                    line += ": %s" % es.error
                lines.append(line)

        return lines

    def to_dict(self):
        status_info = AgentStatus.to_dict(self)

        # Hostnames
        status_info['hostnames'] = {}

        # Checks.d Status
        status_info['checks'] = {}
        check_statuses = self.check_statuses + get_jmx_status()
        for cs in check_statuses:
            status_info['checks'][cs.name] = {'instances': {}}
            if cs.init_failed_error:
                status_info['checks'][cs.name]['init_failed'] = True
                status_info['checks'][cs.name]['traceback'] = cs.init_failed_traceback
            else:
                status_info['checks'][cs.name] = {'instances': {}}
                status_info['checks'][cs.name]['init_failed'] = False
                for s in cs.instance_statuses:
                    status_info['checks'][cs.name]['instances'][s.instance_id] = {
                        'status': s.status,
                        'has_error': s.has_error(),
                        'has_warnings': s.has_warnings(),
                    }
                    if s.has_error():
                        status_info['checks'][cs.name]['instances'][
                            s.instance_id]['error'] = s.error
                    if s.has_warnings():
                        status_info['checks'][cs.name]['instances'][
                            s.instance_id]['warnings'] = s.warnings
                status_info['checks'][cs.name]['metric_count'] = cs.metric_count
                status_info['checks'][cs.name]['event_count'] = cs.event_count

        # Emitter status
        status_info['emitter'] = []
        for es in self.emitter_statuses:
            check_status = {'name': es.name,
                            'status': es.status,
                            'has_error': es.has_error()}
            if es.has_error():
                check_status['error'] = es.error
            status_info['emitter'].append(check_status)

        paths = util.Paths()
        try:
            status_info['confd_path'] = paths.get_confd_path()
        except config.PathNotFound:
            status_info['confd_path'] = 'Not found'

        try:
            status_info['checksd_path'] = paths.get_checksd_path()
        except config.PathNotFound:
            status_info['checksd_path'] = 'Not found'

        return status_info


class MonascaStatsdStatus(AgentStatus):

    NAME = 'Monasca_Statsd'

    def __init__(self, flush_count=0, packet_count=0,
                 packets_per_second=0, metric_count=0, event_count=0):
        AgentStatus.__init__(self)
        self.flush_count = flush_count
        self.packet_count = packet_count
        self.packets_per_second = packets_per_second
        self.metric_count = metric_count
        self.event_count = event_count

    def has_error(self):
        return self.flush_count == 0 and self.packet_count == 0 and self.metric_count == 0

    def body_lines(self):
        lines = [
            "Flush count: %s" % self.flush_count,
            "Packet Count: %s" % self.packet_count,
            "Packets per second: %s" % self.packets_per_second,
            "Metric count: %s" % self.metric_count,
            "Event count: %s" % self.event_count,
        ]
        return lines

    def to_dict(self):
        status_info = AgentStatus.to_dict(self)
        status_info.update({
            'flush_count': self.flush_count,
            'packet_count': self.packet_count,
            'packets_per_second': self.packets_per_second,
            'metric_count': self.metric_count,
            'event_count': self.event_count,
        })
        return status_info


class ForwarderStatus(AgentStatus):

    NAME = 'Forwarder'

    def __init__(self, queue_length=0, queue_size=0, flush_count=0, transactions_received=0,
                 transactions_flushed=0):
        AgentStatus.__init__(self)
        self.queue_length = queue_length
        self.queue_size = queue_size
        self.flush_count = flush_count
        self.transactions_received = transactions_received
        self.transactions_flushed = transactions_flushed

    def body_lines(self):
        lines = [
            "Queue Size: %s bytes" % self.queue_size,
            "Queue Length: %s" % self.queue_length,
            "Flush Count: %s" % self.flush_count,
            "Transactions received: %s" % self.transactions_received,
            "Transactions flushed: %s" % self.transactions_flushed
        ]
        if self.transactions_flushed == 0:
            lines.append("[%s]: Unable to flush transactions\n           %s" %
                         (style(STATUS_ERROR, 'red'),
                          "Please verify monasca-api is running as configured"))
        elif self.transactions_flushed != self.transactions_received:
            lines.append("[%s]: Transactions out of sync\n             %s" %
                         (style(STATUS_WARNING, 'yellow'),
                          "Likely contact interruption with monasca-api"))
        else:
            lines.append("[%s]: Transactions up to date" %
                         style(STATUS_OK, 'green'))

        return lines

    def has_error(self):
        return self.flush_count == 0

    def to_dict(self):
        status_info = AgentStatus.to_dict(self)
        status_info.update({
            'flush_count': self.flush_count,
            'queue_length': self.queue_length,
            'queue_size': self.queue_size,
        })
        return status_info


def get_jmx_instance_status(instance_name, status, message, metric_count):
    if status == STATUS_ERROR:
        instance_status = InstanceStatus(
            instance_name, STATUS_ERROR, error=message, metric_count=metric_count)

    elif status == STATUS_WARNING:
        instance_status = InstanceStatus(
            instance_name, STATUS_WARNING, warnings=[message], metric_count=metric_count)

    elif status == STATUS_OK:
        instance_status = InstanceStatus(instance_name, STATUS_OK, metric_count=metric_count)

    return instance_status


def get_jmx_status():
    """This function tries to read the 2 jmxfetch status file which are yaml file
    located in the temp directory.

    There are 2 files:
        - One generated by the Agent itself, for jmx checks that can't be initialized because
        there are missing stuff.
        Its format is as following:

        ###
        invalid_checks:
              jmx: !!python/object/apply:jmxfetch.InvalidJMXConfiguration [You need to have at
                              least one instance defined in the YAML file for this check]
        timestamp: 1391040927.136523
        ###

        - One generated by jmxfetch that return information about the collection of metrics
        its format is as following:

        ###
        timestamp: 1391037347435
        checks:
          failed_checks:
            jmx:
            - {message: Unable to create instance. Please check your yaml file, status: ERROR}
          initialized_checks:
            tomcat:
            - {message: null, status: OK, metric_count: 7, instance_name: jmx-remihakim.fr-3000}
        ###
    """
    check_statuses = []
    java_status_path = os.path.join(tempfile.gettempdir(), "jmx_status.yaml")
    python_status_path = os.path.join(tempfile.gettempdir(), "jmx_status_python.yaml")
    if not os.path.exists(java_status_path) and not os.path.exists(python_status_path):
        log.debug("There is no jmx_status file at: %s or at: %s" %
                  (java_status_path, python_status_path))
        return []

    check_data = collections.defaultdict(lambda: collections.defaultdict(list))
    try:
        if os.path.exists(java_status_path):
            java_jmx_stats = yaml.load(file(java_status_path))

            # JMX timestamp is saved in milliseconds
            status_age = time.time() - java_jmx_stats.get('timestamp') / 1000
            jmx_checks = java_jmx_stats.get('checks', {})

            if status_age > 60:
                check_statuses.append(
                    CheckStatus(
                        "jmx", [
                            InstanceStatus(
                                0, STATUS_ERROR, error="JMXfetch didn't return any metrics during the last minute")], 0, 0))
            else:

                for check_name, instances in jmx_checks.get('failed_checks', {}).iteritems():
                    for info in instances:
                        message = info.get('message', None)
                        metric_count = info.get('metric_count', 0)
                        status = info.get('status')
                        instance_name = info.get('instance_name', None)
                        check_data[check_name]['statuses'].append(
                            get_jmx_instance_status(
                                instance_name,
                                status,
                                message,
                                metric_count))
                        check_data[check_name]['metric_count'].append(metric_count)

                for check_name, instances in jmx_checks.get('initialized_checks', {}).iteritems():
                    for info in instances:
                        message = info.get('message', None)
                        metric_count = info.get('metric_count', 0)
                        status = info.get('status')
                        instance_name = info.get('instance_name', None)
                        check_data[check_name]['statuses'].append(
                            get_jmx_instance_status(
                                instance_name,
                                status,
                                message,
                                metric_count))
                        check_data[check_name]['metric_count'].append(metric_count)

                for check_name, data in check_data.iteritems():
                    check_status = CheckStatus(
                        check_name, data['statuses'], sum(data['metric_count']), 0)
                    check_statuses.append(check_status)

        if os.path.exists(python_status_path):
            python_jmx_stats = yaml.load(file(python_status_path))
            jmx_checks = python_jmx_stats.get('invalid_checks', {})
            for check_name, excep in jmx_checks.iteritems():
                check_statuses.append(CheckStatus(check_name, [], 0, 0, init_failed_error=excep))

        return check_statuses

    except Exception:
        log.exception("Couldn't load latest jmx status")
        return []