# monasca-agent/monasca_agent/collector/checks/collector.py
# Core modules
import logging
import socket
import threading
import time
import monasca_agent.common.check_status as check_status
import monasca_agent.common.metrics as metrics
import monasca_agent.common.util as util
import system.win32 as w32
# Module-wide logger, also handed to the windows system checks below.
log = logging.getLogger(__name__)
# Warn when the agent's active thread count exceeds this (see collector_stats).
MAX_THREADS_COUNT = 50
# Log at INFO when a collection run takes longer than this many seconds.
MAX_COLLECTION_TIME = 30
# CPU-usage threshold (percent); not referenced in this chunk — presumably
# used elsewhere in the agent. TODO(review): confirm.
MAX_CPU_PCT = 10
# After the first FLUSH_LOGGING_INITIAL runs, run summaries are logged at
# INFO only every FLUSH_LOGGING_PERIOD runs (DEBUG otherwise) — see _set_status.
FLUSH_LOGGING_PERIOD = 10
# Number of initial runs whose summaries are always logged at INFO.
FLUSH_LOGGING_INITIAL = 5
class Collector(util.Dimensions):
    """The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """

    def __init__(self, agent_config, emitter, checksd=None):
        """Set up the system checks and any configured checks_d checks.

        :param agent_config: dict of agent settings; must contain
            'forwarder_url' (read by _emit).
        :param emitter: callable(payload, log, forwarder_url) that ships the
            collected metrics.
        :param checksd: optional dict with keys 'initialized_checks' (list of
            check instances) and 'init_failed_checks'
            ({check_name: {error, traceback}}).
        """
        super(Collector, self).__init__(agent_config)
        self.agent_config = agent_config
        self.os = util.get_os()
        self.plugins = None
        self.emitter = emitter
        # Avoid hanging forever on unresponsive network endpoints.
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.initialized_checks_d = []
        self.init_failed_checks_d = []
        if self.os == 'windows':
            # Windows uses the old-style system checks.
            self._checks = [w32.Disk(log),
                            w32.IO(log),
                            w32.Processes(log),
                            w32.Memory(log),
                            w32.Network(log),
                            w32.Cpu(log)]
        else:
            self._checks = []
        if checksd:
            # List of successfully initialized check instances.
            self.initialized_checks_d = checksd['initialized_checks']
            # {check_name: {error, traceback}} for checks that failed to init.
            self.init_failed_checks_d = checksd['init_failed_checks']

    def _emit(self, payload):
        """Send the payload via the emitter.

        Returns a list containing one EmitterStatus, or an empty list when
        the collector is stopping (we skip emitting in that case because the
        forwarder is likely already gone).
        """
        statuses = []
        if self.continue_running:
            name = self.emitter.__name__
            emitter_status = check_status.EmitterStatus(name)
            try:
                self.emitter(payload, log, self.agent_config['forwarder_url'])
            except Exception as e:
                log.exception("Error running emitter: %s", name)
                emitter_status = check_status.EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses

    def _set_status(self, check_statuses, emitter_statuses, collect_duration):
        """Persist the status of this collection run and log a summary.

        The first FLUSH_LOGGING_INITIAL runs and every FLUSH_LOGGING_PERIOD-th
        run thereafter are summarized at INFO; all other runs at DEBUG to keep
        the log quiet.
        """
        try:
            check_status.CollectorStatus(check_statuses, emitter_statuses).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss.",
                     self.run_count, round(collect_duration, 2))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes.",
                         FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss.",
                      self.run_count, round(collect_duration, 2))

    def collector_stats(self, num_metrics, num_events, collection_time):
        """Build self-monitoring metrics about this collection run.

        Returns a dict of {metric_name: value} with the agent thread count and
        the collection duration, logging when either exceeds its threshold.
        """
        # Named 'stats' (not 'metrics') so we don't shadow the imported
        # monasca_agent.common.metrics module.
        stats = {}
        thread_count = threading.active_count()
        stats['monasca.thread_count'] = thread_count
        if thread_count > MAX_THREADS_COUNT:
            # logging.warn is deprecated; use warning.
            log.warning("Collector thread count is high: %d", thread_count)

        stats['monasca.collection_time_sec'] = collection_time
        if collection_time > MAX_COLLECTION_TIME:
            log.info("Collection time (s) is high: %.1f, metrics count: %d, events count: %d",
                     collection_time, num_metrics, num_events)
        return stats

    def run(self):
        """Collect data from each check and submit their data.

        There are currently two types of checks: the system checks and the
        configured ones from checks_d.
        """
        timer = util.Timer()
        self.run_count += 1
        log.debug("Starting collection run #%s", self.run_count)

        metrics_list = []
        timestamp = time.time()
        events = {}

        if self.os == 'windows':
            # Windows uses old-style checks that return a plain
            # {name: value} dict instead of Measurement objects.
            for check_type in self._checks:
                try:
                    # .items() works on both Python 2 and 3 (iteritems() is
                    # Python-2-only).
                    for name, value in check_type.check().items():
                        metrics_list.append(metrics.Measurement(name,
                                                                timestamp,
                                                                value,
                                                                self._set_dimensions(None),
                                                                None))
                except Exception:
                    log.exception('Error running check.')
        else:
            for check_type in self._checks:
                metrics_list.extend(check_type.check())

        # checks_d checks
        checks_d_metrics, checks_d_events, checks_statuses = self.run_checks_d()
        metrics_list.extend(checks_d_metrics)
        events.update(checks_d_events)

        collect_duration = timer.step()

        # Add in metrics on the collector run itself.
        dimensions = {'component': 'monasca-agent'}
        for name, value in self.collector_stats(len(metrics_list), len(events),
                                                collect_duration).items():
            metrics_list.append(metrics.Measurement(name,
                                                    timestamp,
                                                    value,
                                                    self._set_dimensions(dimensions),
                                                    None))

        emitter_statuses = self._emit(metrics_list)

        # Persist the status of the collection run.
        self._set_status(checks_statuses, emitter_statuses, collect_duration)

    def run_checks_d(self):
        """Run defined checks_d checks.

        Returns a tuple of (list of Measurements, dict of events keyed by
        check name, list of CheckStatus objects). Always returns the tuple —
        even when stopping early — so run()'s three-value unpacking is safe.
        """
        measurements = []
        events = {}
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                # Bug fix: a bare `return` here used to yield None, which
                # broke the tuple unpacking in run() during shutdown.
                return measurements, events, check_statuses
            log.info("Running check %s", check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()

                # Save them for the payload.
                measurements.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)
            except Exception:
                log.exception("Error running check %s", check.name)

            status_check = check_status.CheckStatus(check.name, instance_statuses,
                                                    metric_count, event_count,
                                                    library_versions=check.get_library_info())
            check_statuses.append(status_check)

        for check_name, info in self.init_failed_checks_d.items():
            if not self.continue_running:
                # Same shutdown bug fix as above: return the partial tuple.
                return measurements, events, check_statuses
            status_check = check_status.CheckStatus(check_name, None, None, None,
                                                    init_failed_error=info['error'],
                                                    init_failed_traceback=info['traceback'])
            check_statuses.append(status_check)

        return measurements, events, check_statuses

    def stop(self):
        """Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()