# monasca-agent/monasca_agent/collector/checks/collector.py
# Core modules
import logging
import socket
import threading
import time
import monasca_agent.common.check_status as check_status
import monasca_agent.common.metrics as metrics
import monasca_agent.common.util as util
import system.win32 as w32
# Module-wide logger, also handed to the windows system checks below.
log = logging.getLogger(__name__)
# Warn when the agent's active thread count exceeds this (see collector_stats).
MAX_THREADS_COUNT = 50
# Log at INFO when a collection run takes longer than this many seconds.
MAX_COLLECTION_TIME = 30
# CPU-usage threshold (percent); not referenced in this chunk — presumably
# used elsewhere in the agent. TODO(review): confirm.
MAX_CPU_PCT = 10
# After the first FLUSH_LOGGING_INITIAL runs, run summaries are logged at
# INFO only every FLUSH_LOGGING_PERIOD runs (DEBUG otherwise) — see _set_status.
FLUSH_LOGGING_PERIOD = 10
# Number of initial runs whose summaries are always logged at INFO.
FLUSH_LOGGING_INITIAL = 5
class Collector(util.Dimensions):
    """The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """

    def __init__(self, agent_config, emitter, checksd=None):
        """Set up the system checks and any configured checks_d checks.

        :param agent_config: dict of agent settings; must contain
            'forwarder_url' (read by _emit).
        :param emitter: callable(payload, log, forwarder_url) that ships the
            collected metrics.
        :param checksd: optional dict with keys 'initialized_checks' (list of
            check instances) and 'init_failed_checks'
            ({check_name: {error, traceback}}).
        """
        super(Collector, self).__init__(agent_config)
        self.agent_config = agent_config
        self.os = util.get_os()
        self.plugins = None
        self.emitter = emitter
        # Avoid hanging forever on unresponsive network endpoints.
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.initialized_checks_d = []
        self.init_failed_checks_d = []
        if self.os == 'windows':
            # Windows uses the old-style system checks.
            self._checks = [w32.Disk(log),
                            w32.IO(log),
                            w32.Processes(log),
                            w32.Memory(log),
                            w32.Network(log),
                            w32.Cpu(log)]
        else:
            self._checks = []
        if checksd:
            # List of successfully initialized check instances.
            self.initialized_checks_d = checksd['initialized_checks']
            # {check_name: {error, traceback}} for checks that failed to init.
            self.init_failed_checks_d = checksd['init_failed_checks']

    def _emit(self, payload):
        """Send the payload via the emitter.

        Returns a list containing one EmitterStatus, or an empty list when
        the collector is stopping (we skip emitting in that case because the
        forwarder is likely already gone).
        """
        statuses = []
        if self.continue_running:
            name = self.emitter.__name__
            emitter_status = check_status.EmitterStatus(name)
            try:
                self.emitter(payload, log, self.agent_config['forwarder_url'])
            except Exception as e:
                log.exception("Error running emitter: %s", name)
                emitter_status = check_status.EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses

    def _set_status(self, check_statuses, emitter_statuses, collect_duration):
        """Persist the status of this collection run and log a summary.

        The first FLUSH_LOGGING_INITIAL runs and every FLUSH_LOGGING_PERIOD-th
        run thereafter are summarized at INFO; all other runs at DEBUG to keep
        the log quiet.
        """
        try:
            check_status.CollectorStatus(check_statuses, emitter_statuses).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss.",
                     self.run_count, round(collect_duration, 2))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes.",
                         FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss.",
                      self.run_count, round(collect_duration, 2))

    def collector_stats(self, num_metrics, num_events, collection_time):
        """Build self-monitoring metrics about this collection run.

        Returns a dict of {metric_name: value} with the agent thread count and
        the collection duration, logging when either exceeds its threshold.
        """
        # Named 'stats' (not 'metrics') so we don't shadow the imported
        # monasca_agent.common.metrics module.
        stats = {}
        thread_count = threading.active_count()
        stats['monasca.thread_count'] = thread_count
        if thread_count > MAX_THREADS_COUNT:
            # logging.warn is deprecated; use warning.
            log.warning("Collector thread count is high: %d", thread_count)

        stats['monasca.collection_time_sec'] = collection_time
        if collection_time > MAX_COLLECTION_TIME:
            log.info("Collection time (s) is high: %.1f, metrics count: %d, events count: %d",
                     collection_time, num_metrics, num_events)
        return stats

    def run(self):
        """Collect data from each check and submit their data.

        There are currently two types of checks: the system checks and the
        configured ones from checks_d.
        """
        timer = util.Timer()
        self.run_count += 1
        log.debug("Starting collection run #%s", self.run_count)

        metrics_list = []
        timestamp = time.time()
        events = {}

        if self.os == 'windows':
            # Windows uses old-style checks that return a plain
            # {name: value} dict instead of Measurement objects.
            for check_type in self._checks:
                try:
                    # .items() works on both Python 2 and 3 (iteritems() is
                    # Python-2-only).
                    for name, value in check_type.check().items():
                        metrics_list.append(metrics.Measurement(name,
                                                                timestamp,
                                                                value,
                                                                self._set_dimensions(None),
                                                                None))
                except Exception:
                    log.exception('Error running check.')
        else:
            for check_type in self._checks:
                metrics_list.extend(check_type.check())

        # checks_d checks
        checks_d_metrics, checks_d_events, checks_statuses = self.run_checks_d()
        metrics_list.extend(checks_d_metrics)
        events.update(checks_d_events)

        collect_duration = timer.step()

        # Add in metrics on the collector run itself.
        dimensions = {'component': 'monasca-agent'}
        for name, value in self.collector_stats(len(metrics_list), len(events),
                                                collect_duration).items():
            metrics_list.append(metrics.Measurement(name,
                                                    timestamp,
                                                    value,
                                                    self._set_dimensions(dimensions),
                                                    None))

        emitter_statuses = self._emit(metrics_list)

        # Persist the status of the collection run.
        self._set_status(checks_statuses, emitter_statuses, collect_duration)

    def run_checks_d(self):
        """Run defined checks_d checks.

        Returns a tuple of (list of Measurements, dict of events keyed by
        check name, list of CheckStatus objects). Always returns the tuple —
        even when stopping early — so run()'s three-value unpacking is safe.
        """
        measurements = []
        events = {}
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                # Bug fix: a bare `return` here used to yield None, which
                # broke the tuple unpacking in run() during shutdown.
                return measurements, events, check_statuses
            log.info("Running check %s", check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()

                # Save them for the payload.
                measurements.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)
            except Exception:
                log.exception("Error running check %s", check.name)

            status_check = check_status.CheckStatus(check.name, instance_statuses,
                                                    metric_count, event_count,
                                                    library_versions=check.get_library_info())
            check_statuses.append(status_check)

        for check_name, info in self.init_failed_checks_d.items():
            if not self.continue_running:
                # Same shutdown bug fix as above: return the partial tuple.
                return measurements, events, check_statuses
            status_check = check_status.CheckStatus(check_name, None, None, None,
                                                    init_failed_error=info['error'],
                                                    init_failed_traceback=info['traceback'])
            check_statuses.append(status_check)

        return measurements, events, check_statuses

    def stop(self):
        """Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()