220 lines
8.5 KiB
Python
220 lines
8.5 KiB
Python
# Core modules
|
|
import logging
|
|
import socket
|
|
import threading
|
|
import time
|
|
|
|
import monasca_agent.common.check_status as check_status
|
|
import monasca_agent.common.metrics as metrics
|
|
import monasca_agent.common.util as util
|
|
import system.win32 as w32
|
|
|
|
|
|
# Module-level logger shared by everything in this file.
log = logging.getLogger(__name__)

# Thread count above which Collector.collector_stats logs a warning.
MAX_THREADS_COUNT = 50
# Collection time, in seconds, above which Collector.collector_stats logs a
# "collection time is high" message.
MAX_COLLECTION_TIME = 30
# NOTE(review): not referenced anywhere in this file; presumably a CPU usage
# ceiling (percent) consumed by another module -- confirm before removing.
MAX_CPU_PCT = 10
# After the initial runs, "Finished run" is logged at INFO level only on
# every FLUSH_LOGGING_PERIOD-th run (DEBUG otherwise).
FLUSH_LOGGING_PERIOD = 10
# The first FLUSH_LOGGING_INITIAL runs are always logged at INFO level.
FLUSH_LOGGING_INITIAL = 5


class Collector(util.Dimensions):
    """Collect data from each check and pass it along to the emitter.

    The collector runs the built-in system checks (Windows only in this
    class) and the configured checks_d checks, wraps the results into
    measurements and hands them to the emitter, which sends them on to their
    final destination.
    """

    def __init__(self, agent_config, emitter, checksd=None):
        """Set up the collector.

        :param agent_config: agent configuration mapping; ``forwarder_url``
            is read from it when emitting.
        :param emitter: callable invoked as
            ``emitter(payload, log, forwarder_url)``; its ``__name__`` is
            used for status reporting.
        :param checksd: optional mapping with the keys
            ``'initialized_checks'`` and ``'init_failed_checks'``.
        """
        super(Collector, self).__init__(agent_config)
        self.agent_config = agent_config
        self.os = util.get_os()
        self.plugins = None
        self.emitter = emitter
        # Process-wide default timeout (seconds) for sockets created from
        # now on.
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        # Iterated as a sequence of check objects (see run_checks_d/stop).
        self.initialized_checks_d = []
        # Mapping {check_name: {'error': ..., 'traceback': ...}}.  This must
        # be a dict: run_checks_d iterates it with .items(), so the previous
        # list default raised AttributeError when no checksd was supplied.
        self.init_failed_checks_d = {}

        # add windows system checks
        if self.os == 'windows':
            self._checks = [w32.Disk(log),
                            w32.IO(log),
                            w32.Processes(log),
                            w32.Memory(log),
                            w32.Network(log),
                            w32.Cpu(log)]
        else:
            self._checks = []

        if checksd:
            # check objects, each exposing name/run/get_metrics/get_events
            self.initialized_checks_d = checksd['initialized_checks']
            # is of type {check_name: {error, traceback}}
            self.init_failed_checks_d = checksd['init_failed_checks']

    def _emit(self, payload):
        """Send the payload via the emitter.

        Nothing is sent once the collector has been asked to stop.

        :param payload: list of measurements to hand to the emitter.
        :returns: list of emitter statuses (empty when stopping).
        """
        statuses = []
        # Don't try to send to an emitter if we're stopping.
        if self.continue_running:
            name = self.emitter.__name__
            emitter_status = check_status.EmitterStatus(name)
            try:
                self.emitter(payload, log, self.agent_config['forwarder_url'])
            except Exception as e:
                # Best effort: record the failure in the status instead of
                # aborting the collection run.
                log.exception("Error running emitter: %s", name)
                emitter_status = check_status.EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses

    def _set_status(self, check_statuses, emitter_statuses, collect_duration):
        """Persist the collector status and log the run summary.

        The summary is logged at INFO level for the first
        FLUSH_LOGGING_INITIAL runs and for every FLUSH_LOGGING_PERIOD-th
        run, and at DEBUG level otherwise.

        :param check_statuses: statuses returned by run_checks_d.
        :param emitter_statuses: statuses returned by _emit.
        :param collect_duration: duration of the collection in seconds.
        """
        try:
            check_status.CollectorStatus(check_statuses, emitter_statuses).persist()
        except Exception:
            log.exception("Error persisting collector status")

        rounded_duration = round(collect_duration, 2)
        if (self.run_count <= FLUSH_LOGGING_INITIAL or
                self.run_count % FLUSH_LOGGING_PERIOD == 0):
            log.info("Finished run #%s. Collection time: %ss.",
                     self.run_count, rounded_duration)
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes.",
                         FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss.",
                      self.run_count, rounded_duration)

    def collector_stats(self, num_metrics, num_events, collection_time):
        """Build the collector's own metrics and log unusual values.

        :param num_metrics: number of metrics collected (used for logging).
        :param num_events: number of events collected (used for logging).
        :param collection_time: duration of the collection in seconds.
        :returns: dict with the keys ``'monasca.thread_count'`` and
            ``'monasca.collection_time_sec'``.
        """
        # Named `stats` so the imported `metrics` module is not shadowed.
        stats = {}
        thread_count = threading.active_count()
        stats['monasca.thread_count'] = thread_count
        if thread_count > MAX_THREADS_COUNT:
            log.warning("Collector thread count is high: %d", thread_count)

        stats['monasca.collection_time_sec'] = collection_time
        if collection_time > MAX_COLLECTION_TIME:
            log.info("Collection time (s) is high: %.1f, metrics count: %d, events count: %d",
                     collection_time, num_metrics, num_events)

        return stats

    def run(self):
        """Collect data from each check and submit their data.

        There are currently two types of checks: the system checks and the
        configured ones from checks_d.
        """
        timer = util.Timer()
        self.run_count += 1
        log.debug("Starting collection run #%s", self.run_count)

        metrics_list = []

        timestamp = time.time()
        events = {}

        if self.os == 'windows':  # Windows uses old style checks.
            for check_type in self._checks:
                try:
                    # Old style checks return a {name: value} mapping.
                    for name, value in check_type.check().items():
                        metrics_list.append(metrics.Measurement(name,
                                                                timestamp,
                                                                value,
                                                                self._set_dimensions(None),
                                                                None))
                except Exception:
                    log.exception('Error running check.')
        else:
            for check_type in self._checks:
                metrics_list.extend(check_type.check())

        # checks_d checks
        checks_d_metrics, checks_d_events, checks_statuses = self.run_checks_d()
        metrics_list.extend(checks_d_metrics)
        events.update(checks_d_events)

        # Store the metrics and events in the payload.
        collect_duration = timer.step()

        dimensions = {'component': 'monasca-agent'}
        # Add in metrics on the collector run
        run_stats = self.collector_stats(len(metrics_list), len(events),
                                         collect_duration)
        for name, value in run_stats.items():
            metrics_list.append(metrics.Measurement(name,
                                                    timestamp,
                                                    value,
                                                    self._set_dimensions(dimensions),
                                                    None))
        emitter_statuses = self._emit(metrics_list)

        # Persist the status of the collection run.
        self._set_status(checks_statuses, emitter_statuses, collect_duration)

    def run_checks_d(self):
        """Run defined checks_d checks.

        Always returns a 3-tuple, even when the collector is asked to stop
        part-way through (the results gathered so far are returned), because
        run() unpacks the result unconditionally.

        :returns: a list of Measurements, a dictionary of events and a list
            of check statuses.
        """
        measurements = []
        events = {}
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                # Stopping: hand back what we have instead of a bare None,
                # which used to make run() fail with a TypeError on unpacking.
                return measurements, events, check_statuses
            log.info("Running check %s", check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()

                # Save them for the payload.
                measurements.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                # The events value is treated as possibly falsy above, so do
                # not call len() on it unless it is non-empty.
                event_count = len(current_check_events) if current_check_events else 0
            except Exception:
                log.exception("Error running check %s", check.name)

            status_check = check_status.CheckStatus(check.name, instance_statuses,
                                                    metric_count, event_count,
                                                    library_versions=check.get_library_info())
            check_statuses.append(status_check)

        # Report the checks that failed to initialize as well.
        for check_name, info in self.init_failed_checks_d.items():
            if not self.continue_running:
                return measurements, events, check_statuses
            status_check = check_status.CheckStatus(check_name, None, None, None,
                                                    init_failed_error=info['error'],
                                                    init_failed_traceback=info['traceback'])
            check_statuses.append(status_check)

        return measurements, events, check_statuses

    def stop(self):
        """Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()