Adjusted whitespace for pep8 compliance

Jonathan Halterman 2014-07-01 14:27:12 -07:00
parent 9014f20a6d
commit 194c3625dd
116 changed files with 3236 additions and 2794 deletions
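The hunks below are mechanical pep8 cleanups: long lines are wrapped under the 79-character limit, doubled comment markers (##) are reduced to a single #, arithmetic operators gain surrounding spaces, default-argument equals signs lose theirs, and single-line if statements are split. A tiny self-contained before/after sketch of those patterns (the function is made up for illustration; the individual lines mirror ones changed in this commit, and the code is Python 2 like the agent itself):

    # Before: the kinds of lines flagged by the pep8 tool
    def halve(lines, kwds = None):
        if not kwds: kwds = dict()
        return lines[len(lines)/2:], 10**6

    # After: equivalent code, reformatted as in this commit
    def halve(lines, kwds=None):
        if not kwds:
            kwds = dict()
        return lines[len(lines) / 2:], 10 ** 6

    print(halve([1, 2, 3, 4]))    # ([3, 4], 1000000) from either version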

View File

@ -24,6 +24,7 @@ FLUSH_LOGGING_INITIAL = 5
class Collector(object):
"""
The collector is responsible for collecting data from each check and
passing it along to the emitters, who send it to their final destination.
@ -43,12 +44,13 @@ class Collector(object):
self._checks = []
self._legacy_checks = [
# todo dogstreams should be removed or moved over to a standard output type
# Dogstreams.init(log, self.agent_config) # dogstreams
# todo dogstreams should be removed or moved over to a standard output type
# Dogstreams.init(log, self.agent_config) # dogstreams
]
# add system checks
# todo all these (legacy and system) should be moved to the newer AgentCheck class rather than check
# todo all these (legacy and system) should be moved to the newer
# AgentCheck class rather than check
if self.os == 'windows':
legacy_checks = [w32.Disk(log),
w32.IO(log),
@ -66,8 +68,10 @@ class Collector(object):
self._legacy_checks.extend(legacy_checks)
if checksd:
self.initialized_checks_d = checksd['initialized_checks'] # is of type {check_name: check}
self.init_failed_checks_d = checksd['init_failed_checks'] # is of type {check_name: {error, traceback}}
# is of type {check_name: check}
self.initialized_checks_d = checksd['initialized_checks']
# is of type {check_name: {error, traceback}}
self.init_failed_checks_d = checksd['init_failed_checks']
def _emit(self, payload):
""" Send the payload via the emitter. """
@ -94,7 +98,8 @@ class Collector(object):
log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
(self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
if self.run_count == FLUSH_LOGGING_INITIAL:
log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
log.info("First flushes done, next flushes will be logged every %s flushes." %
FLUSH_LOGGING_PERIOD)
else:
log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
@ -110,12 +115,12 @@ class Collector(object):
metrics['monagent.collector.collection.time'] = collection_time
if collection_time > MAX_COLLECTION_TIME:
log.info("Collection time (s) is high: %.1f, metrics count: %d, events count: %d" %
(collection_time, num_metrics, num_events))
(collection_time, num_metrics, num_events))
metrics['monagent.collector.emit.time'] = emit_time
if emit_time is not None and emit_time > MAX_EMIT_TIME:
log.info("Emit time (s) is high: %.1f, metrics count: %d, events count: %d" %
(emit_time, num_metrics, num_events))
(emit_time, num_metrics, num_events))
return metrics

View File

@ -33,6 +33,7 @@ class EventDefaults(object):
class Dogstreams(object):
@classmethod
def init(cls, logger, config):
dogstreams_config = config.get('dogstreams', None)
@ -87,7 +88,8 @@ class Dogstreams(object):
try:
result = dogstream.check(agentConfig, move_end)
# result may contain {"dogstream": [new]}.
# If output contains {"dogstream": [old]}, that old value will get concatenated with the new value
# If output contains {"dogstream": [old]}, that old value will get
# concatenated with the new value
assert type(result) == type(output), "dogstream.check must return a dictionary"
for k in result:
if k in output:
@ -128,7 +130,8 @@ class Dogstream(object):
parser_spec,
os.environ.get('PYTHONPATH', ''))
)
logger.info("dogstream: parsing %s with %s (requested %s)" % (log_path, parse_func, parser_spec))
logger.info("dogstream: parsing %s with %s (requested %s)" %
(log_path, parse_func, parser_spec))
else:
logger.info("dogstream: parsing %s with default parser" % log_path)
@ -221,7 +224,8 @@ class Dogstream(object):
# FIXME when the backend treats those as true synonyms, we can
# deprecate event_object.
if 'event_object' in datum or 'aggregation_key' in datum:
datum['aggregation_key'] = datum.get('event_object', datum.get('aggregation_key'))
datum['aggregation_key'] = datum.get(
'event_object', datum.get('aggregation_key'))
else:
datum['aggregation_key'] = EventDefaults.EVENT_OBJECT
datum['event_object'] = datum['aggregation_key']
@ -511,4 +515,4 @@ class NagiosServicePerfData(NagiosPerfData):
return metric
if __name__ == '__main__':
testddForwarder()
testddForwarder()

View File

@ -1,4 +1,4 @@
## {{{ http://code.activestate.com/recipes/576519/ (r9)
# {{{ http://code.activestate.com/recipes/576519/ (r9)
# Author: David Decotigny, Oct 1 2008
# @brief Pool of threads similar to multiprocessing.Pool
# See http://docs.python.org/dev/library/multiprocessing.html
@ -26,7 +26,7 @@ import threading
import traceback
## Item pushed on the work queue to tell the worker threads to terminate
# Item pushed on the work queue to tell the worker threads to terminate
SENTINEL = "QUIT"
@ -37,12 +37,15 @@ def is_sentinel(obj):
class TimeoutError(Exception):
"""Raised when a result is not available within the given timeout"""
pass
class PoolWorker(threading.Thread):
"""Thread that consumes WorkUnits from a queue to process them"""
def __init__(self, workq, *args, **kwds):
"""\param workq: Queue object to consume the work units from"""
threading.Thread.__init__(self, *args, **kwds)
@ -64,6 +67,7 @@ class PoolWorker(threading.Thread):
class Pool(object):
"""
The Pool class represents a pool of worker threads. It has methods
which allow tasks to be offloaded to the worker processes in a
@ -75,8 +79,8 @@ class Pool(object):
\param nworkers (integer) number of worker threads to start
\param name (string) prefix for the worker threads' name
"""
self._workq = Queue.Queue()
self._closed = False
self._workq = Queue.Queue()
self._closed = False
self._workers = []
for idx in xrange(nworkers):
thr = PoolWorker(self._workq, name="Worker-%s-%d" % (name, idx))
@ -95,7 +99,8 @@ class Pool(object):
def apply(self, func, args=(), kwds=None):
"""Equivalent of the apply() builtin function. It blocks till
the result is ready."""
if not kwds: kwds = dict()
if not kwds:
kwds = dict()
return self.apply_async(func, args, kwds).get()
def map(self, func, iterable, chunksize=None):
@ -134,7 +139,7 @@ class Pool(object):
collector = UnorderedResultCollector()
self._create_sequences(func, iterable, chunksize, collector)
return iter(collector)
def apply_async(self, func, args=(), kwds=None, callback=None):
"""A variant of the apply() method which returns an
ApplyResult object.
@ -144,8 +149,9 @@ class Pool(object):
callback is applied to it (unless the call failed). callback
should complete immediately since otherwise the thread which
handles the results will get blocked."""
if not kwds: kwds = dict()
assert not self._closed # No lock here. We assume it's atomic...
if not kwds:
kwds = dict()
assert not self._closed # No lock here. We assume it's atomic...
apply_result = ApplyResult(callback=callback)
job = Job(func, args, kwds, apply_result)
self._workq.put(job)
@ -161,7 +167,7 @@ class Pool(object):
should complete immediately since otherwise the thread which
handles the results will get blocked."""
apply_result = ApplyResult(callback=callback)
collector = OrderedResultCollector(apply_result, as_iterator=False)
collector = OrderedResultCollector(apply_result, as_iterator=False)
self._create_sequences(func, iterable, chunksize, collector)
return apply_result
@ -176,7 +182,7 @@ class Pool(object):
failed). callback should complete immediately since otherwise
the thread which handles the results will get blocked."""
apply_result = ApplyResult(callback=callback)
collector = OrderedResultCollector(apply_result, as_iterator=True)
collector = OrderedResultCollector(apply_result, as_iterator=True)
self._create_sequences(func, iterable, chunksize, collector)
return apply_result
@ -192,7 +198,7 @@ class Pool(object):
failed). callback should complete immediately since otherwise
the thread which handles the results will get blocked."""
apply_result = ApplyResult(callback=callback)
collector = UnorderedResultCollector(apply_result)
collector = UnorderedResultCollector(apply_result)
self._create_sequences(func, iterable, chunksize, collector)
return apply_result
@ -227,7 +233,7 @@ class Pool(object):
for thr in self._workers:
thr.join()
def _create_sequences(self, func, iterable, chunksize, collector = None):
def _create_sequences(self, func, iterable, chunksize, collector=None):
"""
Create the WorkUnit objects to process and pushes them on the
work queue. Each work unit is meant to process a slice of
@ -238,9 +244,9 @@ class Pool(object):
\return the list of WorkUnit objects (basically: JobSequences)
pushed onto the work queue
"""
assert not self._closed # No lock here. We assume it's atomic...
assert not self._closed # No lock here. We assume it's atomic...
sequences = []
results = []
results = []
it_ = iter(iterable)
exit_loop = False
while not exit_loop:
@ -264,15 +270,19 @@ class Pool(object):
class WorkUnit(object):
"""ABC for a unit of work submitted to the worker threads. It's
basically just an object equipped with a process() method"""
def process(self):
"""Do the work. Shouldn't raise any exception"""
raise NotImplementedError("Children must override Process")
class Job(WorkUnit):
"""A work unit that corresponds to the execution of a single function"""
def __init__(self, func, args, kwds, apply_result):
"""
\param func/args/kwds used to call the function
@ -280,9 +290,9 @@ class Job(WorkUnit):
of the function call
"""
WorkUnit.__init__(self)
self._func = func
self._args = args
self._kwds = kwds
self._func = func
self._args = args
self._kwds = kwds
self._result = apply_result
def process(self):
@ -300,8 +310,10 @@ class Job(WorkUnit):
class JobSequence(WorkUnit):
"""A work unit that corresponds to the processing of a continuous
sequence of Job objects"""
def __init__(self, jobs):
WorkUnit.__init__(self)
self._jobs = jobs
@ -315,6 +327,7 @@ class JobSequence(WorkUnit):
class ApplyResult(object):
"""An object associated with a Job object that holds its result:
it's available during the whole life of the Job and after, even when
the Job hasn't been processed yet. It's possible to use this object to
@ -322,6 +335,7 @@ class ApplyResult(object):
The result objects returns by the Pool::*_async() methods are of
this type"""
def __init__(self, collector=None, callback=None):
"""
\param collector when not None, the notify_ready() method of
@ -331,11 +345,11 @@ class ApplyResult(object):
result becomes available (this is the parameter passed to the
Pool::*_async() methods.
"""
self._success = False
self._event = threading.Event()
self._data = None
self._success = False
self._event = threading.Event()
self._data = None
self._collector = None
self._callback = callback
self._callback = callback
if collector is not None:
collector.register_result(self)
@ -354,7 +368,7 @@ class ApplyResult(object):
return self._data
raise self._data[0], self._data[1], self._data[2]
def wait(self, timeout = None):
def wait(self, timeout=None):
"""Waits until the result is available or until timeout
seconds pass."""
self._event.wait(timeout)
@ -377,7 +391,7 @@ class ApplyResult(object):
ready and successful. The collector's notify_ready() method
will be called, and the callback method too"""
assert not self.ready()
self._data = value
self._data = value
self._success = True
self._event.set()
if self._collector is not None:
@ -394,7 +408,7 @@ class ApplyResult(object):
ready but not successful. The collector's notify_ready()
method will be called, but NOT the callback method"""
assert not self.ready()
self._data = sys.exc_info()
self._data = sys.exc_info()
self._success = False
self._event.set()
if self._collector is not None:
@ -402,6 +416,7 @@ class ApplyResult(object):
class AbstractResultCollector(object):
"""ABC to define the interface of a ResultCollector object. It is
basically an object which knows which results it's waiting for,
and which is able to get notified when they become available. It is
@ -414,7 +429,7 @@ class AbstractResultCollector(object):
results we're waiting for become available. Can be None.
"""
self._to_notify = to_notify
def register_result(self, apply_result):
"""Used to identify which results we're waiting for. Will
always be called BEFORE the Jobs get submitted to the work
@ -433,7 +448,7 @@ class AbstractResultCollector(object):
"""
raise NotImplementedError("Children classes must implement it")
def _get_result(self, idx, timeout = None):
def _get_result(self, idx, timeout=None):
"""Called by the CollectorIterator object to retrieve the
result's values one after another (order defined by the
implementation)
@ -450,19 +465,21 @@ class AbstractResultCollector(object):
class CollectorIterator(object):
"""An iterator that allows to iterate over the result values
available in the given collector object. Equipped with an extended
next() method accepting a timeout argument. Created by the
AbstractResultCollector::__iter__() method"""
def __init__(self, collector):
"""\param AbstractResultCollector instance"""
self._collector = collector
self._idx = 0
self._idx = 0
def __iter__(self):
return self
def next(self, timeout = None):
def next(self, timeout=None):
"""Return the next result value in the sequence. Raise
StopIteration at the end. Can raise the exception raised by
the Job"""
@ -481,20 +498,21 @@ class CollectorIterator(object):
class UnorderedResultCollector(AbstractResultCollector):
"""An AbstractResultCollector implementation that collects the
values of the ApplyResult objects in the order they become ready. The
CollectorIterator object returned by __iter__() will iterate over
them in the order they become ready"""
def __init__(self, to_notify = None):
def __init__(self, to_notify=None):
"""
\param to_notify ApplyResult object to notify when all the
results we're waiting for become available. Can be None.
"""
AbstractResultCollector.__init__(self, to_notify)
self._cond = threading.Condition()
self._cond = threading.Condition()
self._collection = []
self._expected = 0
self._expected = 0
def register_result(self, apply_result):
"""Used to identify which results we're waiting for. Will
@ -505,7 +523,7 @@ class UnorderedResultCollector(AbstractResultCollector):
"""
self._expected += 1
def _get_result(self, idx, timeout = None):
def _get_result(self, idx, timeout=None):
"""Called by the CollectorIterator object to retrieve the
result's values one after another, in the order the results have
become available.
@ -549,18 +567,19 @@ class UnorderedResultCollector(AbstractResultCollector):
self._cond.notifyAll()
finally:
self._cond.release()
if first_item and self._to_notify is not None:
self._to_notify._set_value(iter(self))
class OrderedResultCollector(AbstractResultCollector):
"""An AbstractResultCollector implementation that collects the
values of the ApplyResult objects in the order they have been
submitted. The CollectorIterator object returned by __iter__()
will iterate over them in the order they have been submitted"""
def __init__(self, to_notify = None, as_iterator = True):
def __init__(self, to_notify=None, as_iterator=True):
"""
\param to_notify ApplyResult object to notify when all the
results we're waiting for become available. Can be None.
@ -570,9 +589,9 @@ class OrderedResultCollector(AbstractResultCollector):
result arrived)
"""
AbstractResultCollector.__init__(self, to_notify)
self._results = []
self._lock = threading.Lock()
self._remaining = 0
self._results = []
self._lock = threading.Lock()
self._remaining = 0
self._as_iterator = as_iterator
def register_result(self, apply_result):
@ -585,7 +604,7 @@ class OrderedResultCollector(AbstractResultCollector):
self._results.append(apply_result)
self._remaining += 1
def _get_result(self, idx, timeout = None):
def _get_result(self, idx, timeout=None):
"""Called by the CollectorIterator object to retrieve the
result's values one after another (order defined by the
implementation)
@ -606,13 +625,13 @@ class OrderedResultCollector(AbstractResultCollector):
has been processed
"""
got_first = False
got_last = False
got_last = False
self._lock.acquire()
try:
assert self._remaining > 0
got_first = (len(self._results) == self._remaining)
self._remaining -= 1
got_last = (self._remaining == 0)
got_last = (self._remaining == 0)
finally:
self._lock.release()
@ -630,18 +649,19 @@ class OrderedResultCollector(AbstractResultCollector):
def _test():
"""Some tests"""
import thread, time
import thread
import time
def f(x):
return x*x
return x * x
def work(seconds):
print "[%d] Start to work for %fs..." % (thread.get_ident(), seconds)
time.sleep(seconds)
print "[%d] Work done (%fs)." % (thread.get_ident(), seconds)
return "%d slept %fs" % (thread.get_ident(), seconds)
### Test copy/pasted from multiprocessing
# Test copy/pasted from multiprocessing
pool = Pool(9) # start 4 worker threads
result = pool.apply_async(f, (10,)) # evaluate "f(10)" asynchronously
@ -749,4 +769,4 @@ def _test():
if __name__ == "__main__":
_test()
## end of http://code.activestate.com/recipes/576519/ }}}
# end of http://code.activestate.com/recipes/576519/ }}}
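ServicesCheck, further down in this commit, drives a pool like this one via apply_async. A minimal usage sketch, assuming an import path and that close()/join() behave like their multiprocessing.Pool counterparts as the recipe's docstrings suggest:

    # Minimal usage of the Pool above (Python 2; import path is assumed).
    from thread_pool import Pool

    def square(x):
        return x * x

    pool = Pool(4, name="example")
    result = pool.apply_async(square, (10,))   # returns an ApplyResult right away
    print(result.get())                        # blocks until a worker returns 100
    pool.close()
    pool.join()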

View File

@ -25,16 +25,16 @@ class ServicesCheck(AgentCheck):
This class should never be directly instantiated.
Work flow:
The main agent loop will call the check function for each instance for
The main agent loop will call the check function for each instance for
each iteration of the loop.
The check method will make an asynchronous call to the _process method in
The check method will make an asynchronous call to the _process method in
one of the threads initiated in the thread pool created in this class constructor.
The _process method will call the _check method of the inherited class
which will perform the actual check.
The _check method must return a tuple which first element is either
Status.UP or Status.DOWN.
The second element is a short error message that will be displayed
The second element is a short error message that will be displayed
when the service turns down.
"""
@ -81,9 +81,11 @@ class ServicesCheck(AgentCheck):
def check(self, instance):
if not self.pool_started:
self.start_pool()
# On Windows the agent runs on multiple threads so we need to have an offset of 5 in case the pool_size is 1
# On Windows the agent runs on multiple threads so we need to have an
# offset of 5 in case the pool_size is 1
if threading.activeCount() > 5 * self.pool_size + 5:
raise Exception("Thread number (%s) is exploding. Skipping this check" % threading.activeCount())
raise Exception("Thread number (%s) is exploding. Skipping this check" %
threading.activeCount())
self._process_results()
self._clean()
name = instance.get('name', None)
@ -91,7 +93,7 @@ class ServicesCheck(AgentCheck):
self.log.error('Each service check must have a name')
return
if name not in self.jobs_status:
if name not in self.jobs_status:
# A given instance should be processed one at a time
self.jobs_status[name] = time.time()
self.pool.apply_async(self._process, args=(instance,))
@ -175,5 +177,3 @@ class ServicesCheck(AgentCheck):
if now - start_time > TIMEOUT:
self.log.critical("Restarting Pool. One check is stuck.")
self.restart_pool()
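Concretely, the contract described in the docstring above means a subclass only implements _check and returns a (Status, message) tuple; the thread pool dispatch and result processing are inherited. A hypothetical subclass as a sketch (the import path, the 'url' instance key and the HTTP probe are assumptions, not part of this file):

    # Hypothetical ServicesCheck subclass illustrating the _check contract (Python 2).
    import urllib2
    from services_checks import ServicesCheck, Status   # assumed import path

    class HttpUp(ServicesCheck):
        def _check(self, instance):
            url = instance.get('url')            # instance key assumed for this sketch
            try:
                urllib2.urlopen(url, timeout=5)
            except Exception, e:
                return Status.DOWN, str(e)       # short message shown when the service turns down
            return Status.UP, None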

View File

@ -22,6 +22,7 @@ to_float = lambda s: float(s.replace(",", "."))
class Disk(Check):
""" Collects metrics about the machine's disks. """
def check(self):
@ -67,7 +68,7 @@ class Disk(Check):
def parse_df_output(self, df_output, platform_name, inodes=False, use_mount=False, blacklist_re=None):
"""
Parse the output of the df command. If use_volume is true the volume
is used to anchor the metric, otherwise false the mount
is used to anchor the metric, otherwise false the mount
point is used. Returns a tuple of (disk, inode).
"""
usage_data = {}
@ -181,7 +182,7 @@ class Disk(Check):
if blacklist_re and blacklist_re.match(device[0]):
return False
return True
devices = filter(keep_device, flattened_devices)
return devices
@ -233,7 +234,7 @@ class IO(Check):
io_stats[device][self.xlate(header_name, "linux")] = values[header_index]
return io_stats
@staticmethod
def _parse_darwin(output):
lines = [l.split() for l in output.split("\n") if len(l) > 0]
@ -243,10 +244,10 @@ class IO(Check):
for idx, disk in enumerate(disks):
kb_t, tps, mb_s = map(float, lastline[(3 * idx):(3 * idx) + 3]) # 3 cols at a time
io[disk] = {
'system.io.bytes_per_s': mb_s * 10**6,
'system.io.bytes_per_s': mb_s * 10 ** 6,
}
return io
@staticmethod
def xlate(metric_name, os_name):
"""Standardize on linux metric names"""
@ -284,7 +285,7 @@ class IO(Check):
stdout=sp.PIPE,
close_fds=True).communicate()[0]
# Linux 2.6.32-343-ec2 (ip-10-35-95-10) 12/11/2012 _x86_64_ (2 CPU)
# Linux 2.6.32-343-ec2 (ip-10-35-95-10) 12/11/2012 _x86_64_ (2 CPU)
#
# Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await svctm %util
# sda1 0.00 17.61 0.26 32.63 4.23 201.04 12.48 0.16 4.81 0.53 1.73
@ -298,7 +299,8 @@ class IO(Check):
# sdb 0.00 0.00 0.00 2.97 0.00 11.88 8.00 0.00 0.00 0.00 0.00
# sdg 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
# sdf 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
# md0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
# md0 0.00 0.00 0.00 0.00 0.00 0.00 0.00
# 0.00 0.00 0.00 0.00
io.update(self._parse_linux2(stdout))
elif sys.platform == "sunos5":
@ -315,12 +317,12 @@ class IO(Check):
# device r/s w/s kr/s kw/s wait actv svc_t %w %b
# ramdisk1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0
# sd0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0
# sd1 0.0 139.0 0.0 1850.6 0.0 0.0 0.1 0 1
# sd1 0.0 139.0 0.0 1850.6 0.0 0.0 0.1 0 1
# discard the first half of the display (stats since boot)
lines = [l for l in iostat.split("\n") if len(l) > 0]
lines = lines[len(lines)/2:]
lines = lines[len(lines) / 2:]
assert "extended device statistics" in lines[0]
headers = lines[1].split()
assert "device" in headers
@ -331,25 +333,25 @@ class IO(Check):
io[cols[0]] = {}
for i in range(1, len(cols)):
io[cols[0]][self.xlate(headers[i], "sunos")] = cols[i]
elif sys.platform.startswith("freebsd"):
iostat = sp.Popen(["iostat", "-x", "-d", "1", "2"],
stdout=sp.PIPE,
close_fds=True).communicate()[0]
# Be careful!
# Be careful!
# It looks like SunOS, but some columns (wait, svc_t) have different meaning
# extended device statistics
# device r/s w/s kr/s kw/s wait svc_t %b
# extended device statistics
# device r/s w/s kr/s kw/s wait svc_t %b
# ad0 3.1 1.3 49.9 18.8 0 0.7 0
# extended device statistics
# device r/s w/s kr/s kw/s wait svc_t %b
# extended device statistics
# device r/s w/s kr/s kw/s wait svc_t %b
# ad0 0.0 2.0 0.0 31.8 0 0.2 0
# discard the first half of the display (stats since boot)
lines = [l for l in iostat.split("\n") if len(l) > 0]
lines = lines[len(lines)/2:]
lines = lines[len(lines) / 2:]
assert "extended device statistics" in lines[0]
headers = lines[1].split()
assert "device" in headers
@ -361,12 +363,12 @@ class IO(Check):
for i in range(1, len(cols)):
io[cols[0]][self.xlate(headers[i], "freebsd")] = cols[i]
elif sys.platform == 'darwin':
iostat = sp.Popen(['iostat', '-d', '-c', '2', '-w', '1'],
iostat = sp.Popen(['iostat', '-d', '-c', '2', '-w', '1'],
stdout=sp.PIPE,
close_fds=True).communicate()[0]
# disk0 disk1 <-- number of disks
# KB/t tps MB/s KB/t tps MB/s
# 21.11 23 0.47 20.01 0 0.00
# KB/t tps MB/s KB/t tps MB/s
# 21.11 23 0.47 20.01 0 0.00
# 6.67 3 0.02 0.00 0 0.00 <-- line of interest
io = self._parse_darwin(iostat)
else:
@ -388,7 +390,8 @@ class IO(Check):
measurements = []
timestamp = time.time()
for dev_name, stats in filtered_io.iteritems():
filtered_stats = {stat: stats[stat] for stat in stats.iterkeys() if stat not in self.stat_blacklist}
filtered_stats = {stat: stats[stat]
for stat in stats.iterkeys() if stat not in self.stat_blacklist}
m_list = [Measurement(key, timestamp, value, {'device': dev_name})
for key, value in filtered_stats.iteritems()]
measurements.extend(m_list)
@ -401,7 +404,7 @@ class IO(Check):
class Load(Check):
def check(self):
if Platform.is_linux():
try:
@ -411,9 +414,9 @@ class Load(Check):
except Exception:
self.logger.exception('Cannot extract load')
return {}
uptime = uptime[0] # readlines() provides a list but we want a string
elif sys.platform in ('darwin', 'sunos5') or sys.platform.startswith("freebsd"):
# Get output from uptime
try:
@ -423,7 +426,7 @@ class Load(Check):
except Exception:
self.logger.exception('Cannot extract load')
return {}
# Split out the 3 load average values
load = [res.replace(',', '.') for res in re.findall(r'([0-9]+[\.,]\d+)', uptime)]
return {'load_avg_1_min': float(load[0]),
@ -433,13 +436,14 @@ class Load(Check):
class Memory(Check):
def __init__(self, logger):
Check.__init__(self, logger)
macV = None
if sys.platform == 'darwin':
macV = platform.mac_ver()
macV_minor_version = int(re.match(r'10\.(\d+)\.?.*', macV[0]).group(1))
# Output from top is slightly modified on OS X 10.6 (case #28239) and greater
if macV and (macV_minor_version >= 6):
self.topIndex = 6
@ -456,7 +460,7 @@ class Memory(Check):
except Exception:
# No page size available
pass
def check(self):
if Platform.is_linux():
try:
@ -466,7 +470,7 @@ class Memory(Check):
except Exception:
self.logger.exception('Cannot get memory metrics from /proc/meminfo')
return {}
# $ cat /proc/meminfo
# MemTotal: 7995360 kB
# MemFree: 1045120 kB
@ -509,8 +513,9 @@ class Memory(Check):
# Hugepagesize: 2048 kB
# DirectMap4k: 10112 kB
# DirectMap2M: 8243200 kB
regexp = re.compile(r'^(\w+):\s+([0-9]+)') # We run this several times so one-time compile now
# We run this several times so one-time compile now
regexp = re.compile(r'^(\w+):\s+([0-9]+)')
meminfo = {}
for line in lines:
@ -520,7 +525,7 @@ class Memory(Check):
meminfo[match.group(1)] = match.group(2)
except Exception:
self.logger.exception("Cannot parse /proc/meminfo")
memData = {}
# Physical memory
@ -534,43 +539,48 @@ class Memory(Check):
memData['mem_usable_perc'] = memData['mem_total_mb'] - memData['mem_free_mb']
# Usable is relative since cached and buffers are actually used to speed things up.
memData['mem_usable_mb'] = memData['mem_free_mb'] + memData['memphysBuffers'] + memData['memphysCached']
memData['mem_usable_mb'] = memData['mem_free_mb'] + \
memData['memphysBuffers'] + memData['memphysCached']
if memData['mem_total_mb'] > 0:
memData['mem_usable_perc'] = float(memData['mem_usable_mb']) / float(memData['mem_total_mb'])
memData['mem_usable_perc'] = float(
memData['mem_usable_mb']) / float(memData['mem_total_mb'])
except Exception:
self.logger.exception('Cannot compute stats from /proc/meminfo')
# Swap
# FIXME units are in MB, we should use bytes instead
try:
memData['mem_swap_total_mb'] = int(meminfo.get('SwapTotal', 0)) / 1024
memData['mem_swap_free_mb'] = int(meminfo.get('SwapFree', 0)) / 1024
memData['mem_swap_used_mb'] = memData['mem_swap_total_mb'] - memData['mem_swap_free_mb']
memData['mem_swap_used_mb'] = memData[
'mem_swap_total_mb'] - memData['mem_swap_free_mb']
if memData['mem_swap_total_mb'] > 0:
memData['mem_swap_free_perc'] = float(memData['mem_swap_free_mb']) / float(memData['mem_swap_total_mb'])
memData['mem_swap_free_perc'] = float(
memData['mem_swap_free_mb']) / float(memData['mem_swap_total_mb'])
except Exception:
self.logger.exception('Cannot compute swap stats')
return memData
return memData
elif sys.platform == 'darwin':
macV = platform.mac_ver()
macV_minor_version = int(re.match(r'10\.(\d+)\.?.*', macV[0]).group(1))
try:
top = sp.Popen(['top', '-l 1'], stdout=sp.PIPE, close_fds=True).communicate()[0]
sysctl = sp.Popen(['sysctl', 'vm.swapusage'], stdout=sp.PIPE, close_fds=True).communicate()[0]
sysctl = sp.Popen(
['sysctl', 'vm.swapusage'], stdout=sp.PIPE, close_fds=True).communicate()[0]
except StandardError:
self.logger.exception('getMemoryUsage')
return {}
# Deal with top
lines = top.split('\n')
physParts = re.findall(r'([0-9]\d+)', lines[self.topIndex])
# Deal with sysctl
swapParts = re.findall(r'([0-9]+\.\d+)', sysctl)
@ -585,10 +595,11 @@ class Memory(Check):
'physFree': physParts[physFreePartIndex],
'swapUsed': swapParts[1],
'swapFree': swapParts[2]}
elif sys.platform.startswith("freebsd"):
try:
sysctl = sp.Popen(['sysctl', 'vm.stats.vm'], stdout=sp.PIPE, close_fds=True).communicate()[0]
sysctl = sp.Popen(
['sysctl', 'vm.stats.vm'], stdout=sp.PIPE, close_fds=True).communicate()[0]
except Exception:
self.logger.exception('getMemoryUsage')
return {}
@ -638,13 +649,15 @@ class Memory(Check):
pageSize) / 1048576
if memData['physTotal'] > 0:
memData['physPctUsable'] = float(memData['physUsable']) / float(memData['physTotal'])
memData['physPctUsable'] = float(
memData['physUsable']) / float(memData['physTotal'])
except Exception:
self.logger.exception('Cannot compute stats from /proc/meminfo')
# Swap
try:
sysctl = sp.Popen(['swapinfo', '-m'], stdout=sp.PIPE, close_fds=True).communicate()[0]
sysctl = sp.Popen(
['swapinfo', '-m'], stdout=sp.PIPE, close_fds=True).communicate()[0]
except Exception:
self.logger.exception('getMemoryUsage')
return {}
@ -669,7 +682,7 @@ class Memory(Check):
memData['swapUsed'] += int(line[2])
except Exception:
self.logger.exception('Cannot compute stats from swapinfo')
return memData
elif sys.platform == 'sunos5':
try:
@ -694,14 +707,15 @@ class Memory(Check):
# memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:snaptime 16787393.9439095
# memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swap 91828224 <--
# memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swapcap 1073741824 <--
# memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:zonename 53aa9b7e-48ba-4152-a52b-a6368c3d9e7c
# memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:zonename
# 53aa9b7e-48ba-4152-a52b-a6368c3d9e7c
# turn memory_cap:360:zone_name:key value
# into { "key": value, ...}
kv = [l.strip().split() for l in kmem.split("\n") if len(l) > 0]
entries = dict([(k.split(":")[-1], v) for (k, v) in kv])
# extract rss, physcap, swap, swapcap, turn into MB
convert = lambda v: int(long(v))/2**20
convert = lambda v: int(long(v)) / 2 ** 20
memData["physTotal"] = convert(entries["physcap"])
memData["physUsed"] = convert(entries["rss"])
memData["physFree"] = memData["physTotal"] - memData["physUsed"]
@ -710,7 +724,8 @@ class Memory(Check):
memData["swapFree"] = memData["swapTotal"] - memData["swapUsed"]
if memData['swapTotal'] > 0:
memData['swapPctFree'] = float(memData['swapFree']) / float(memData['swapTotal'])
memData['swapPctFree'] = float(
memData['swapFree']) / float(memData['swapTotal'])
return memData
except Exception:
self.logger.exception("Cannot compute mem stats from kstat -c zone_memory_cap")
@ -778,7 +793,7 @@ class Cpu(Check):
data = avg[0].split()
# Userland
# Debian lenny says %user so we look for both
# Debian lenny says %user so we look for both
# One of them will be 0
cpu_metrics = {"%usr": None, "%user": None, "%nice": None,
"%iowait": None, "%idle": None, "%sys": None,
@ -798,16 +813,17 @@ class Cpu(Check):
return format_results(cpu_user,
cpu_system,
cpu_wait,
cpu_wait,
cpu_idle,
cpu_stolen)
else:
return {}
elif sys.platform == 'darwin':
# generate 3 seconds of data
# [' disk0 disk1 cpu load average', ' KB/t tps MB/s KB/t tps MB/s us sy id 1m 5m 15m', ' 21.23 13 0.27 17.85 7 0.13 14 7 79 1.04 1.27 1.31', ' 4.00 3 0.01 5.00 8 0.04 12 10 78 1.04 1.27 1.31', '']
iostats = sp.Popen(['iostat', '-C', '-w', '3', '-c', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
iostats = sp.Popen(
['iostat', '-C', '-w', '3', '-c', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
lines = [l for l in iostats.split("\n") if len(l) > 0]
legend = [l for l in lines if "us" in l]
if len(legend) == 1:
@ -830,7 +846,8 @@ class Cpu(Check):
# tin tout KB/t tps MB/s KB/t tps MB/s KB/t tps MB/s us ni sy in id
# 0 69 26.71 0 0.01 0.00 0 0.00 0.00 0 0.00 2 0 0 1 97
# 0 78 0.00 0 0.00 0.00 0 0.00 0.00 0 0.00 0 0 0 0 100
iostats = sp.Popen(['iostat', '-w', '3', '-c', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
iostats = sp.Popen(
['iostat', '-w', '3', '-c', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
lines = [l for l in iostats.split("\n") if len(l) > 0]
legend = [l for l in lines if "us" in l]
if len(legend) == 1:
@ -862,10 +879,11 @@ class Cpu(Check):
#
# Will aggregate over all processor sets
try:
mpstat = sp.Popen(['mpstat', '-aq', '1', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
mpstat = sp.Popen(
['mpstat', '-aq', '1', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
lines = [l for l in mpstat.split("\n") if len(l) > 0]
# discard the first len(lines)/2 lines
lines = lines[len(lines)/2:]
lines = lines[len(lines) / 2:]
legend = [l for l in lines if "SET" in l]
assert len(legend) == 1
if len(legend) == 1:
@ -879,7 +897,7 @@ class Cpu(Check):
idle = [get_value(headers, l.split(), "idl") for l in d_lines]
size = [get_value(headers, l.split(), "sze") for l in d_lines]
count = sum(size)
rel_size = [s/count for s in size]
rel_size = [s / count for s in size]
dot = lambda v1, v2: reduce(operator.add, map(operator.mul, v1, v2))
return format_results(dot(user, rel_size),
dot(kern, rel_size),
@ -929,5 +947,3 @@ if __name__ == '__main__':
print(mem.check(config))
print("\n\n\n")
time.sleep(1)
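As an aside on the Linux memory branch above: Memory.check() reads /proc/meminfo and applies the one-time-compiled regular expression from the hunk to every line. Stripped of the agent plumbing, the parsing pattern is roughly:

    # Standalone sketch of the /proc/meminfo parsing shown above (Linux only;
    # buffers/cache/swap arithmetic and error handling omitted).
    import re

    regexp = re.compile(r'^(\w+):\s+([0-9]+)')   # compiled once, reused per line

    meminfo = {}
    with open('/proc/meminfo') as f:
        for line in f:
            match = re.search(regexp, line)
            if match is not None:
                meminfo[match.group(1)] = match.group(2)

    mem_total_mb = int(meminfo.get('MemTotal', 0)) / 1024   # /proc/meminfo reports kB
    mem_free_mb = int(meminfo.get('MemFree', 0)) / 1024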

View File

@ -16,6 +16,7 @@ KB2MB = B2KB = float(1024)
class Processes(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.gauge('system.proc.queue_length')
@ -44,6 +45,7 @@ class Processes(Check):
class Memory(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.logger = logger
@ -83,6 +85,7 @@ class Memory(Check):
class Cpu(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.logger = logger
@ -140,6 +143,7 @@ class Cpu(Check):
class Network(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.logger = logger
@ -166,6 +170,7 @@ class Network(Check):
class Disk(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.logger = logger
@ -199,6 +204,7 @@ class Disk(Check):
class IO(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.logger = logger

View File

@ -41,11 +41,11 @@ class TailFile(object):
self._crc = None
self._log = logger
self._callback = callback
def _open_file(self, move_end=False, pos=False):
already_open = False
#close and reopen to handle logrotate
# close and reopen to handle logrotate
if self._f is not None:
self._f.close()
self._f = None

View File

@ -6,6 +6,7 @@ from monagent.collector.checks.utils import add_basic_auth
class Apache(AgentCheck):
"""Tracks basic connection/requests/workers metrics
See http://httpd.apache.org/docs/2.2/mod/mod_status.html for more details
@ -70,12 +71,12 @@ class Apache(AgentCheck):
if metric_count == 0:
if self.assumed_url.get(instance['apache_status_url'], None) is None and url[-5:] != '?auto':
self.assumed_url[instance['apache_status_url']]= '%s?auto' % url
self.assumed_url[instance['apache_status_url']] = '%s?auto' % url
self.warning("Assuming url was not correct. Trying to add ?auto suffix to the url")
self.check(instance)
else:
raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url." % instance['apache_status_url'])
raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url." % instance[
'apache_status_url'])
@staticmethod
def parse_agent_config(agentConfig):

View File

@ -27,6 +27,7 @@ CACTI_TO_DD = {
class Cacti(AgentCheck):
def __init__(self, name, init_config, agent_config):
AgentCheck.__init__(self, name, init_config, agent_config)
self.last_ts = {}
@ -41,10 +42,10 @@ class Cacti(AgentCheck):
except AttributeError:
version = "Unknown"
return {"rrdtool": version}
return {"rrdtool": version}
def check(self, instance):
# Load the instance config
config = self._get_config(instance)
@ -52,13 +53,15 @@ class Cacti(AgentCheck):
try:
import rrdtool
except ImportError, e:
raise Exception("Cannot import rrdtool module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/cacti")
raise Exception(
"Cannot import rrdtool module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/cacti")
# Try importing MySQL
try:
import MySQLdb
except ImportError, e:
raise Exception("Cannot import MySQLdb module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/cacti")
raise Exception(
"Cannot import MySQLdb module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/cacti")
connection = MySQLdb.connect(config.host, config.user, config.password, config.db)
@ -132,7 +135,7 @@ class Cacti(AgentCheck):
return metric_count
# Find the consolidation functions for the RRD metrics
c_funcs = set([v for k,v in info.items() if k.endswith('.cf')])
c_funcs = set([v for k, v in info.items() if k.endswith('.cf')])
for c in list(c_funcs):
last_ts_key = '%s.%s' % (rrd_path, c)
@ -165,7 +168,7 @@ class Cacti(AgentCheck):
# Save this metric as a gauge
val = self._transform_metric(m_name, p[k])
self.gauge(m_name, val, hostname=hostname,
device_name=device_name, timestamp=ts)
device_name=device_name, timestamp=ts)
metric_count += 1
last_ts = (ts + interval)
@ -178,7 +181,7 @@ class Cacti(AgentCheck):
tuples of (hostname, device_name, rrd_path)
'''
def _in_whitelist(rrd):
path = rrd.replace('<path_rra>/','')
path = rrd.replace('<path_rra>/', '')
for p in whitelist:
if fnmatch(path, p):
return True
@ -186,7 +189,8 @@ class Cacti(AgentCheck):
c = connection.cursor()
and_parameters = " OR ".join(["hsc.field_name = '%s'" % field_name for field_name in field_names])
and_parameters = " OR ".join(
["hsc.field_name = '%s'" % field_name for field_name in field_names])
# Check for the existence of the `host_snmp_cache` table
rrd_query = """
@ -202,7 +206,7 @@ class Cacti(AgentCheck):
WHERE dt.data_source_path IS NOT NULL
AND dt.data_source_path != ''
AND (%s OR hsc.field_name is NULL) """ % and_parameters
c.execute(rrd_query)
res = []
for hostname, device_name, rrd_path in c.fetchall():
@ -244,7 +248,6 @@ class Cacti(AgentCheck):
return val / 1024
return val
'''
For backwards compatibility with pre-checks_d configuration.
Convert old-style config to new-style config.

View File

@ -6,9 +6,11 @@ from monagent.collector.checks import AgentCheck
class CouchDb(AgentCheck):
"""Extracts stats from CouchDB via its REST API
http://wiki.apache.org/couchdb/Runtime_Statistics
"""
def _create_metric(self, data, dimensions=None):
overall_stats = data.get('stats', {})
for key, stats in overall_stats.items():
@ -16,7 +18,7 @@ class CouchDb(AgentCheck):
if val['current'] is not None:
metric_name = '.'.join(['couchdb', key, metric])
self.gauge(metric_name, val['current'], dimensions=dimensions)
for db_name, db_stats in data.get('databases', {}).items():
for name, val in db_stats.items():
if name in ['doc_count', 'disk_size'] and val is not None:
@ -80,7 +82,6 @@ class CouchDb(AgentCheck):
if not agentConfig.get('couchdb_server'):
return False
return {
'instances': [{
'server': agentConfig.get('couchdb_server'),

View File

@ -8,13 +8,13 @@ from monagent.collector.checks import AgentCheck
from monagent.collector.checks.utils import add_basic_auth
#Constants
# Constants
COUCHBASE_STATS_PATH = '/pools/nodes'
DEFAULT_TIMEOUT = 10
class Couchbase(AgentCheck):
"""Extracts stats from Couchbase via its REST API
http://docs.couchbase.com/couchbase-manual-2.0/#using-the-rest-api
"""
@ -24,25 +24,29 @@ class Couchbase(AgentCheck):
for key, storage_type in storage_totals.items():
for metric_name, val in storage_type.items():
if val is not None:
metric_name = '.'.join(['couchbase', key, self.camel_case_to_joined_lower(metric_name)])
metric_name = '.'.join(
['couchbase', key, self.camel_case_to_joined_lower(metric_name)])
self.gauge(metric_name, val, dimensions=dimensions)
for bucket_name, bucket_stats in data['buckets'].items():
for metric_name, val in bucket_stats.items():
if val is not None:
metric_name = '.'.join(['couchbase', 'by_bucket', self.camel_case_to_joined_lower(metric_name)])
metric_name = '.'.join(
['couchbase', 'by_bucket', self.camel_case_to_joined_lower(metric_name)])
metric_dimensions = dimensions.copy()
metric_dimensions['bucket'] = bucket_name
self.gauge(metric_name, val[0], dimensions=metric_dimensions, device_name=bucket_name)
self.gauge(
metric_name, val[0], dimensions=metric_dimensions, device_name=bucket_name)
for node_name, node_stats in data['nodes'].items():
for metric_name, val in node_stats['interestingStats'].items():
if val is not None:
metric_name = '.'.join(['couchbase', 'by_node', self.camel_case_to_joined_lower(metric_name)])
metric_name = '.'.join(
['couchbase', 'by_node', self.camel_case_to_joined_lower(metric_name)])
metric_dimensions = dimensions.copy()
metric_dimensions['node'] = node_name
self.gauge(metric_name, val, dimensions=metric_dimensions, device_name=node_name)
self.gauge(
metric_name, val, dimensions=metric_dimensions, device_name=node_name)
def _get_stats(self, url, instance):
"Hit a given URL and return the parsed json"
@ -52,8 +56,8 @@ class Couchbase(AgentCheck):
add_basic_auth(req, instance['user'], instance['password'])
if instance['is_recent_python']:
timeout = instance.get('timeout' , DEFAULT_TIMEOUT)
request = urllib2.urlopen(req,timeout=timeout)
timeout = instance.get('timeout', DEFAULT_TIMEOUT)
request = urllib2.urlopen(req, timeout=timeout)
else:
request = urllib2.urlopen(req)
@ -73,9 +77,9 @@ class Couchbase(AgentCheck):
def get_data(self, server, instance):
# The dictionary to be returned.
couchbase = {'stats': None,
'buckets': {},
'nodes': {}
}
'buckets': {},
'nodes': {}
}
# build couchbase stats entry point
url = '%s%s' % (server, COUCHBASE_STATS_PATH)
@ -84,7 +88,7 @@ class Couchbase(AgentCheck):
# No overall stats? bail out now
if overall_stats is None:
raise Exception("No data returned from couchbase endpoint: %s" % url)
couchbase['stats'] = overall_stats
nodes = overall_stats['nodes']
@ -104,7 +108,8 @@ class Couchbase(AgentCheck):
for bucket in buckets:
bucket_name = bucket['name']
# We have to manually build the URI for the stats bucket, as this is not auto discoverable
# We have to manually build the URI for the stats bucket, as this is not
# auto discoverable
url = '%s/pools/nodes/buckets/%s/stats' % (server, bucket_name)
bucket_stats = self._get_stats(url, instance)
bucket_samples = bucket_stats['op']['samples']
@ -124,9 +129,8 @@ class Couchbase(AgentCheck):
# remove duplicate _
converted_variable = re.sub('_+', '_', converted_variable)
# handle special case of starting/ending underscores
converted_variable = re.sub('^_|_$', '', converted_variable)
return converted_variable
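Only the tail of camel_case_to_joined_lower is visible in the hunk above (the underscore cleanups), so the first substitution below is an assumption about how the method starts. A self-contained sketch of the whole conversion:

    import re

    def camel_case_to_joined_lower(variable):
        # split camelCase on capitals and lower-case the result (assumed step)
        converted_variable = re.sub(r'([A-Z])', r'_\1', variable).lower()
        # remove duplicate _
        converted_variable = re.sub('_+', '_', converted_variable)
        # handle special case of starting/ending underscores
        converted_variable = re.sub('^_|_$', '', converted_variable)
        return converted_variable

    print(camel_case_to_joined_lower('memUsedByData'))   # mem_used_by_data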

View File

@ -7,6 +7,7 @@ from monagent.collector.checks import AgentCheck
class DirectoryCheck(AgentCheck):
"""This check is for monitoring and reporting metrics on the files for a provided directory
WARNING: the user/group that dd-agent runs as must have access to stat the files in the desired directory
@ -17,6 +18,7 @@ class DirectoryCheck(AgentCheck):
"pattern" - string, the `fnmatch` pattern to use when reading the "directory"'s files. default "*"
"recursive" - boolean, when true the stats will recurse into directories. default False
"""
def check(self, instance):
if "directory" not in instance:
raise Exception('DirectoryCheck: missing "directory" in config')
@ -51,7 +53,8 @@ class DirectoryCheck(AgentCheck):
directory_files += 1
directory_bytes += file_stat.st_size
# file specific metrics
self.histogram("system.disk.directory.file.bytes", file_stat.st_size, dimensions=dimensions)
self.histogram(
"system.disk.directory.file.bytes", file_stat.st_size, dimensions=dimensions)
self.histogram("system.disk.directory.file.modified_sec_ago", time.time() - file_stat.st_mtime,
dimensions=dimensions)
self.histogram("system.disk.directory.file.created_sec_ago", time.time() - file_stat.st_ctime,

View File

@ -68,8 +68,10 @@ DOCKER_TAGS = [
class UnixHTTPConnection(httplib.HTTPConnection, object):
"""Class used in conjuction with UnixSocketHandler to make urllib2
compatible with Unix sockets."""
def __init__(self, unix_socket):
self._unix_socket = unix_socket
@ -84,8 +86,10 @@ class UnixHTTPConnection(httplib.HTTPConnection, object):
class UnixSocketHandler(urllib2.AbstractHTTPHandler):
"""Class that makes Unix sockets work with urllib2 without any additional
dependencies."""
def unix_open(self, req):
full_path = "%s%s" % urlsplit(req.get_full_url())[1:3]
path = os.path.sep
@ -104,6 +108,7 @@ class UnixSocketHandler(urllib2.AbstractHTTPHandler):
class Docker(AgentCheck):
def __init__(self, *args, **kwargs):
super(Docker, self).__init__(*args, **kwargs)
urllib2.install_opener(urllib2.build_opener(UnixSocketHandler()))
@ -121,7 +126,8 @@ class Docker(AgentCheck):
if not instance.get("exclude") or not instance.get("include"):
if len(containers) > max_containers:
self.warning("Too many containers to collect. Please refine the containers to collect by editing the configuration file. Truncating to %s containers" % max_containers)
self.warning(
"Too many containers to collect. Please refine the containers to collect by editing the configuration file. Truncating to %s containers" % max_containers)
containers = containers[:max_containers]
collected_containers = 0
@ -136,19 +142,22 @@ class Docker(AgentCheck):
collected_containers += 1
if collected_containers > max_containers:
self.warning("Too many containers are matching the current configuration. Some containers will not be collected. Please refine your configuration")
self.warning(
"Too many containers are matching the current configuration. Some containers will not be collected. Please refine your configuration")
break
for key, (dd_key, metric_type) in DOCKER_METRICS.items():
if key in container:
getattr(self, metric_type)(dd_key, int(container[key]), dimensions=container_dimensions)
getattr(self, metric_type)(
dd_key, int(container[key]), dimensions=container_dimensions)
for metric in LXC_METRICS:
mountpoint = self._mounpoints[metric["cgroup"]]
stat_file = os.path.join(mountpoint, metric["file"] % container["Id"])
stats = self._parse_cgroup_file(stat_file)
for key, (dd_key, metric_type) in metric["metrics"].items():
if key in stats:
getattr(self, metric_type)(dd_key, int(stats[key]), dimensions=container_dimensions)
getattr(self, metric_type)(
dd_key, int(stats[key]), dimensions=container_dimensions)
@staticmethod
def _make_tag(key, value):
@ -187,7 +196,8 @@ class Docker(AgentCheck):
request = urllib2.urlopen(req)
except urllib2.URLError, e:
if "Errno 13" in str(e):
raise Exception("Unable to connect to socket. dd-agent user must be part of the 'docker' group")
raise Exception(
"Unable to connect to socket. dd-agent user must be part of the 'docker' group")
raise
response = request.read()
return json.loads(response)
@ -217,7 +227,8 @@ class Docker(AgentCheck):
try:
fp = open(file_)
except IOError:
raise IOError("Can't open %s. If you are using Docker 0.9.0 or higher, the Datadog agent is not yet compatible with these versions. Please get in touch with Datadog Support for more information" % file_)
raise IOError(
"Can't open %s. If you are using Docker 0.9.0 or higher, the Datadog agent is not yet compatible with these versions. Please get in touch with Datadog Support for more information" % file_)
return dict(map(lambda x: x.split(), fp.read().splitlines()))
finally:

View File

@ -21,35 +21,35 @@ class ElasticSearch(AgentCheck):
"elasticsearch.docs.deleted": ("gauge", "indices.docs.deleted"),
"elasticsearch.store.size": ("gauge", "indices.store.size_in_bytes"),
"elasticsearch.indexing.index.total": ("gauge", "indices.indexing.index_total"),
"elasticsearch.indexing.index.time": ("gauge", "indices.indexing.index_time_in_millis", lambda v: float(v)/1000),
"elasticsearch.indexing.index.time": ("gauge", "indices.indexing.index_time_in_millis", lambda v: float(v) / 1000),
"elasticsearch.indexing.index.current": ("gauge", "indices.indexing.index_current"),
"elasticsearch.indexing.delete.total": ("gauge", "indices.indexing.delete_total"),
"elasticsearch.indexing.delete.time": ("gauge", "indices.indexing.delete_time_in_millis", lambda v: float(v)/1000),
"elasticsearch.indexing.delete.time": ("gauge", "indices.indexing.delete_time_in_millis", lambda v: float(v) / 1000),
"elasticsearch.indexing.delete.current": ("gauge", "indices.indexing.delete_current"),
"elasticsearch.get.total": ("gauge", "indices.get.total"),
"elasticsearch.get.time": ("gauge", "indices.get.time_in_millis", lambda v: float(v)/1000),
"elasticsearch.get.time": ("gauge", "indices.get.time_in_millis", lambda v: float(v) / 1000),
"elasticsearch.get.current": ("gauge", "indices.get.current"),
"elasticsearch.get.exists.total": ("gauge", "indices.get.exists_total"),
"elasticsearch.get.exists.time": ("gauge", "indices.get.exists_time_in_millis", lambda v: float(v)/1000),
"elasticsearch.get.exists.time": ("gauge", "indices.get.exists_time_in_millis", lambda v: float(v) / 1000),
"elasticsearch.get.missing.total": ("gauge", "indices.get.missing_total"),
"elasticsearch.get.missing.time": ("gauge", "indices.get.missing_time_in_millis", lambda v: float(v)/1000),
"elasticsearch.get.missing.time": ("gauge", "indices.get.missing_time_in_millis", lambda v: float(v) / 1000),
"elasticsearch.search.query.total": ("gauge", "indices.search.query_total"),
"elasticsearch.search.query.time": ("gauge", "indices.search.query_time_in_millis", lambda v: float(v)/1000),
"elasticsearch.search.query.time": ("gauge", "indices.search.query_time_in_millis", lambda v: float(v) / 1000),
"elasticsearch.search.query.current": ("gauge", "indices.search.query_current"),
"elasticsearch.search.fetch.total": ("gauge", "indices.search.fetch_total"),
"elasticsearch.search.fetch.time": ("gauge", "indices.search.fetch_time_in_millis", lambda v: float(v)/1000),
"elasticsearch.search.fetch.time": ("gauge", "indices.search.fetch_time_in_millis", lambda v: float(v) / 1000),
"elasticsearch.search.fetch.current": ("gauge", "indices.search.fetch_current"),
"elasticsearch.merges.current": ("gauge", "indices.merges.current"),
"elasticsearch.merges.current.docs": ("gauge", "indices.merges.current_docs"),
"elasticsearch.merges.current.size": ("gauge", "indices.merges.current_size_in_bytes"),
"elasticsearch.merges.total": ("gauge", "indices.merges.total"),
"elasticsearch.merges.total.time": ("gauge", "indices.merges.total_time_in_millis", lambda v: float(v)/1000),
"elasticsearch.merges.total.time": ("gauge", "indices.merges.total_time_in_millis", lambda v: float(v) / 1000),
"elasticsearch.merges.total.docs": ("gauge", "indices.merges.total_docs"),
"elasticsearch.merges.total.size": ("gauge", "indices.merges.total_size_in_bytes"),
"elasticsearch.refresh.total": ("gauge", "indices.refresh.total"),
"elasticsearch.refresh.total.time": ("gauge", "indices.refresh.total_time_in_millis", lambda v: float(v)/1000),
"elasticsearch.refresh.total.time": ("gauge", "indices.refresh.total_time_in_millis", lambda v: float(v) / 1000),
"elasticsearch.flush.total": ("gauge", "indices.flush.total"),
"elasticsearch.flush.total.time": ("gauge", "indices.flush.total_time_in_millis", lambda v: float(v)/1000),
"elasticsearch.flush.total.time": ("gauge", "indices.flush.total_time_in_millis", lambda v: float(v) / 1000),
"elasticsearch.process.open_fd": ("gauge", "process.open_file_descriptors"),
"elasticsearch.transport.rx_count": ("gauge", "transport.rx_count"),
"elasticsearch.transport.tx_count": ("gauge", "transport.tx_count"),
@ -92,9 +92,9 @@ class ElasticSearch(AgentCheck):
"elasticsearch.http.current_open": ("gauge", "http.current_open"),
"elasticsearch.http.total_opened": ("gauge", "http.total_opened"),
"jvm.gc.concurrent_mark_sweep.count": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_count"),
"jvm.gc.concurrent_mark_sweep.collection_time": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_time_in_millis", lambda v: float(v)/1000),
"jvm.gc.concurrent_mark_sweep.collection_time": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_time_in_millis", lambda v: float(v) / 1000),
"jvm.gc.par_new.count": ("gauge", "jvm.gc.collectors.ParNew.collection_count"),
"jvm.gc.par_new.collection_time": ("gauge", "jvm.gc.collectors.ParNew.collection_time_in_millis", lambda v: float(v)/1000),
"jvm.gc.par_new.collection_time": ("gauge", "jvm.gc.collectors.ParNew.collection_time_in_millis", lambda v: float(v) / 1000),
"jvm.mem.heap_committed": ("gauge", "jvm.mem.heap_committed_in_bytes"),
"jvm.mem.heap_used": ("gauge", "jvm.mem.heap_used_in_bytes"),
"jvm.mem.non_heap_committed": ("gauge", "jvm.mem.non_heap_committed_in_bytes"),
@ -153,7 +153,6 @@ class ElasticSearch(AgentCheck):
health_data = self._get_data(url, auth)
self._process_health_data(config_url, health_data, dimensions=dimensions)
def _get_es_version(self, config_url, auth=None):
"""
Get the running version of Elastic Search
@ -163,7 +162,8 @@ class ElasticSearch(AgentCheck):
data = self._get_data(config_url, auth)
version = map(int, data['version']['number'].split('.'))
except Exception, e:
self.warning("Error while trying to get Elasticsearch version from %s %s" % (config_url, str(e)))
self.warning("Error while trying to get Elasticsearch version from %s %s" %
(config_url, str(e)))
version = [0, 0, 0]
self.log.debug("Elasticsearch version is %s" % version)
@ -174,7 +174,7 @@ class ElasticSearch(AgentCheck):
Define the set of URLs and METRICS to use depending on the running ES version
"""
if version >= [0,90,10]:
if version >= [0, 90, 10]:
# ES versions 0.90.10 and above
# Metrics architecture changed starting with version 0.90.10
self.HEALTH_URL = "/_cluster/health?pretty=true"
@ -185,9 +185,9 @@ class ElasticSearch(AgentCheck):
"elasticsearch.search.fetch.open_contexts": ("gauge", "indices.search.open_contexts"),
"elasticsearch.cache.filter.evictions": ("gauge", "indices.filter_cache.evictions"),
"elasticsearch.cache.filter.size": ("gauge", "indices.filter_cache.memory_size_in_bytes"),
"elasticsearch.id_cache.size": ("gauge","indices.id_cache.memory_size_in_bytes"),
"elasticsearch.fielddata.size": ("gauge","indices.fielddata.memory_size_in_bytes"),
"elasticsearch.fielddata.evictions": ("gauge","indices.fielddata.evictions")
"elasticsearch.id_cache.size": ("gauge", "indices.id_cache.memory_size_in_bytes"),
"elasticsearch.fielddata.size": ("gauge", "indices.fielddata.memory_size_in_bytes"),
"elasticsearch.fielddata.evictions": ("gauge", "indices.fielddata.evictions")
}
else:
@ -206,9 +206,9 @@ class ElasticSearch(AgentCheck):
"elasticsearch.thread_pool.cache.threads": ("gauge", "thread_pool.cache.threads"),
"elasticsearch.thread_pool.cache.queue": ("gauge", "thread_pool.cache.queue"),
"jvm.gc.collection_count": ("gauge", "jvm.gc.collection_count"),
"jvm.gc.collection_time": ("gauge", "jvm.gc.collection_time_in_millis", lambda v: float(v)/1000),
"jvm.gc.collection_time": ("gauge", "jvm.gc.collection_time_in_millis", lambda v: float(v) / 1000),
"jvm.gc.copy.count": ("gauge", "jvm.gc.collectors.Copy.collection_count"),
"jvm.gc.copy.collection_time": ("gauge", "jvm.gc.collectors.Copy.collection_time_in_millis", lambda v: float(v)/1000)
"jvm.gc.copy.collection_time": ("gauge", "jvm.gc.collectors.Copy.collection_time_in_millis", lambda v: float(v) / 1000)
}
self.METRICS.update(additional_metrics)
@ -278,8 +278,8 @@ class ElasticSearch(AgentCheck):
if node_name in data['nodes']:
node = data['nodes'][node_name]
if 'network' in node\
and 'primary_interface' in node['network']\
and 'address' in node['network']['primary_interface']:
and 'primary_interface' in node['network']\
and 'address' in node['network']['primary_interface']:
return node['network']['primary_interface']['address']
raise NodeNotFound()
@ -295,7 +295,7 @@ class ElasticSearch(AgentCheck):
else:
ifaces = subprocess.Popen(['ip', 'addr'], stdout=subprocess.PIPE)
grepper = subprocess.Popen(['grep', 'inet'], stdin=ifaces.stdout,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
ifaces.stdout.close()
out, err = grepper.communicate()
@ -305,7 +305,7 @@ class ElasticSearch(AgentCheck):
for iface in out.split("\n"):
iface = iface.strip()
if iface:
ips.append( iface.split(' ')[1].split('/')[0] )
ips.append(iface.split(' ')[1].split('/')[0])
# Check the interface addresses against the primary address
return primary_addrs in ips
@ -326,7 +326,8 @@ class ElasticSearch(AgentCheck):
break
if value is not None:
if xform: value = xform(value)
if xform:
value = xform(value)
if self.METRICS[metric][0] == "gauge":
self.gauge(metric, value, dimensions=dimensions)
else:
@ -346,7 +347,6 @@ class ElasticSearch(AgentCheck):
event = self._create_event(data['status'])
self.event(event)
def process_metric(metric, xtype, path, xform=None):
# closure over data
self._process_metric(data, metric, path, xform, dimensions=dimensions)
@ -356,7 +356,6 @@ class ElasticSearch(AgentCheck):
desc = self.METRICS[metric]
process_metric(metric, *desc)
def _metric_not_found(self, metric, path):
self.log.debug("Metric not found: %s -> %s", path, metric)
@ -377,15 +376,15 @@ class ElasticSearch(AgentCheck):
msg = "ElasticSearch: %s just reported as %s" % (hostname, status)
return { 'timestamp': int(time.time()),
'event_type': 'elasticsearch',
'host': hostname,
'msg_text':msg,
'msg_title': msg_title,
"alert_type": alert_type,
"source_type_name": "elasticsearch",
"event_object": hostname
}
return {'timestamp': int(time.time()),
'event_type': 'elasticsearch',
'host': hostname,
'msg_text': msg,
'msg_title': msg_title,
"alert_type": alert_type,
"source_type_name": "elasticsearch",
"event_object": hostname
}
@staticmethod
def parse_agent_config(agentConfig):
@ -397,4 +396,3 @@ class ElasticSearch(AgentCheck):
'url': agentConfig.get('elasticsearch'),
}]
}

View File

@ -30,7 +30,7 @@ class Gearman(AgentCheck):
running = 0
queued = 0
workers = 0
for stat in data:
running += stat['running']
queued += stat['queued']

View File

@ -81,11 +81,11 @@ class GUnicornCheck(AgentCheck):
except psutil.NoSuchProcess:
self.warning('Process %s disappeared while scanning' % proc.name)
continue
# Let them do a little bit more work.
time.sleep(self.CPU_SLEEP_SECS)
# Processes which have used more CPU are considered active (this is a very
# Processes which have used more CPU are considered active (this is a very
# naive check, but gunicorn exposes no stats API)
for proc in worker_procs:
if proc.pid not in cpu_time_by_pid:
@ -109,11 +109,13 @@ class GUnicornCheck(AgentCheck):
def _get_master_proc_by_name(name):
""" Return a psutil process for the master gunicorn process with the given name. """
master_name = GUnicornCheck._get_master_proc_name(name)
master_procs = [p for p in psutil.process_iter() if p.cmdline and p.cmdline[0] == master_name]
master_procs = [
p for p in psutil.process_iter() if p.cmdline and p.cmdline[0] == master_name]
if len(master_procs) == 0:
raise GUnicornCheckError("Found no master process with name: %s" % master_name)
elif len(master_procs) > 1:
raise GUnicornCheckError("Found more than one master process with name: %s" % master_name)
raise GUnicornCheckError(
"Found more than one master process with name: %s" % master_name)
else:
return master_procs[0]
@ -124,9 +126,8 @@ class GUnicornCheck(AgentCheck):
# root 22976 0.1 0.1 60364 13424 ? Ss 19:30 0:00 gunicorn: master [web1]
# web 22984 20.7 2.3 521924 176136 ? Sl 19:30 1:58 gunicorn: worker [web1]
# web 22985 26.4 6.1 795288 449596 ? Sl 19:30 2:32 gunicorn: worker [web1]
return "gunicorn: master [%s]" % name
return "gunicorn: master [%s]" % name
class GUnicornCheckError(Exception):
pass
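
For context, the gunicorn hunks above wrap a `psutil.process_iter()` scan that matches the master process by its rewritten command line ("gunicorn: master [app]"). A hedged sketch of the same idea, assuming a recent psutil where `cmdline` is a method (the code in the diff targets older releases where it was a property):

    import psutil

    def find_gunicorn_master(app_name):
        """Return the psutil.Process for 'gunicorn: master [app_name]', or None."""
        master_name = "gunicorn: master [%s]" % app_name
        for proc in psutil.process_iter():
            try:
                cmdline = proc.cmdline()  # list of argv strings
            except psutil.NoSuchProcess:
                continue  # process exited while we were scanning
            if cmdline and cmdline[0] == master_name:
                return proc
        return None
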

View File

@ -17,6 +17,7 @@ class Services(object):
class HAProxy(AgentCheck):
def __init__(self, name, init_config, agent_config):
AgentCheck.__init__(self, name, init_config, agent_config)
@ -60,7 +61,8 @@ class HAProxy(AgentCheck):
process_events = instance.get('status_check', self.init_config.get('status_check', False))
self._process_data(data, collect_aggregates_only, process_events, url=url, collect_status_metrics=collect_status_metrics)
self._process_data(data, collect_aggregates_only, process_events,
url=url, collect_status_metrics=collect_status_metrics)
def _fetch_data(self, url, username, password):
''' Hit a given URL and return the parsed json '''
@ -95,7 +97,7 @@ class HAProxy(AgentCheck):
# Holds a list of dictionaries describing each system
data_list = []
for line in data[1:]: # Skip the first line
for line in data[1:]: # Skip the first line
if not line.strip():
continue
data_dict = {}
@ -123,7 +125,6 @@ class HAProxy(AgentCheck):
if collect_status_metrics and 'status' in data_dict and 'pxname' in data_dict:
hosts_statuses[(data_dict['pxname'], data_dict['status'])] += 1
if data_dict['svname'] in Services.ALL:
data_list.append(data_dict)
@ -143,7 +144,7 @@ class HAProxy(AgentCheck):
return data
def _process_status_metric(self, hosts_statuses):
agg_statuses = defaultdict(lambda:{'available':0, 'unavailable':0})
agg_statuses = defaultdict(lambda: {'available': 0, 'unavailable': 0})
for (service, status), count in hosts_statuses.iteritems():
status = status.lower()
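
The `defaultdict` above collapses per-host HAProxy statuses into available/unavailable counts per service without pre-seeding any keys. A small standalone sketch of that aggregation step, using made-up sample data; the mapping of raw statuses to the two buckets here is illustrative, not the check's exact rule:

    from collections import defaultdict

    # (service, status) -> number of backends seen in that status
    hosts_statuses = {('web', 'UP'): 3, ('web', 'DOWN'): 1, ('api', 'no check'): 2}

    agg_statuses = defaultdict(lambda: {'available': 0, 'unavailable': 0})
    for (service, status), count in hosts_statuses.items():
        if status.lower() in ('up', 'open'):
            agg_statuses[service]['available'] += count
        else:
            agg_statuses[service]['unavailable'] += count

    print(dict(agg_statuses))
    # e.g. {'web': {'available': 3, 'unavailable': 1}, 'api': {'available': 0, 'unavailable': 2}}
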
@ -196,7 +197,7 @@ class HAProxy(AgentCheck):
for data in data_list:
hostname = data['svname']
service_name = data['pxname']
key = "%s:%s" % (hostname,service_name)
key = "%s:%s" % (hostname, service_name)
status = self.host_status[url][key]
if status is None:
@ -227,17 +228,18 @@ class HAProxy(AgentCheck):
alert_type = "success"
else:
alert_type = "info"
title = "HAProxy %s front-end reported %s back and %s" % (service_name, hostname, status)
title = "HAProxy %s front-end reported %s back and %s" % (
service_name, hostname, status)
return {
'timestamp': int(time.time() - lastchg),
'event_type': EVENT_TYPE,
'host': hostname,
'msg_title': title,
'alert_type': alert_type,
"source_type_name": SOURCE_TYPE_NAME,
"event_object": hostname,
"dimensions": {"frontend": service_name, "host": hostname}
'timestamp': int(time.time() - lastchg),
'event_type': EVENT_TYPE,
'host': hostname,
'msg_title': title,
'alert_type': alert_type,
"source_type_name": SOURCE_TYPE_NAME,
"event_object": hostname,
"dimensions": {"frontend": service_name, "host": hostname}
}
@staticmethod

View File

@ -2,6 +2,7 @@ from monagent.collector.checks import AgentCheck
class HDFSCheck(AgentCheck):
"""Report on free space and space used in HDFS.
"""

View File

@ -9,6 +9,7 @@ from monagent.collector.checks.services_checks import ServicesCheck, Status
class HostAlive(ServicesCheck):
"""Inherit ServicesCheck class to test if a host is alive or not"""
def __init__(self, name, init_config, agent_config, instances=None):
@ -84,11 +85,11 @@ class HostAlive(ServicesCheck):
if instance['alive_test'] == 'ssh':
success = self._test_ssh(instance['host_name'],
self.init_config.get('ssh_port'),
self.init_config.get('ssh_timeout'))
self.init_config.get('ssh_port'),
self.init_config.get('ssh_timeout'))
elif instance['alive_test'] == 'ping':
success = self._test_ping(instance['host_name'],
self.init_config.get('ping_timeout'))
self.init_config.get('ping_timeout'))
else:
self.log.info("Unrecognized alive_test " + instance['alive_test'])
@ -99,4 +100,3 @@ class HostAlive(ServicesCheck):
self.gauge('host_alive', 1, dimensions=dimensions)
self.log.error("Host down: " + instance['host_name'])
return Status.DOWN, "DOWN"

View File

@ -89,8 +89,8 @@ class IIS(AgentCheck):
for metric, mtype, wmi_val in self.METRICS:
if not hasattr(iis_site, wmi_val):
self.warning('Unable to fetch metric %s. Missing %s in Win32_PerfFormattedData_W3SVC_WebService' \
% (metric, wmi_val))
self.warning('Unable to fetch metric %s. Missing %s in Win32_PerfFormattedData_W3SVC_WebService'
% (metric, wmi_val))
continue
# Submit the metric value with the correct type

View File

@ -22,6 +22,7 @@ class Skip(Exception):
Raised by :class:`Jenkins` when it comes across
a build or job that should be excluded from being checked.
"""
def __init__(self, reason, dir_name):
message = 'skipping build or job at %s because %s' % (dir_name, reason)
Exception.__init__(self, message)
@ -66,15 +67,15 @@ class Jenkins(AgentCheck):
d = dict([(k, v.text) for k, v in kv_pairs if v is not None])
try:
d['branch'] = tree.find('actions')\
.find('hudson.plugins.git.util.BuildData')\
.find('buildsByBranchName')\
.find('entry')\
.find('hudson.plugins.git.util.Build')\
.find('revision')\
.find('branches')\
.find('hudson.plugins.git.Branch')\
.find('name')\
d['branch'] = tree.find('actions') \
.find('hudson.plugins.git.util.BuildData') \
.find('buildsByBranchName') \
.find('entry') \
.find('hudson.plugins.git.util.Build') \
.find('revision') \
.find('branches') \
.find('hudson.plugins.git.Branch') \
.find('name') \
.text
except Exception:
pass
@ -104,9 +105,9 @@ class Jenkins(AgentCheck):
continue
output = {
'job_name': job_name,
'timestamp': timestamp,
'event_type': 'build result'
'job_name': job_name,
'timestamp': timestamp,
'event_type': 'build result'
}
output.update(build_metadata)
self.high_watermarks[instance_key][job_name] = timestamp
@ -123,8 +124,8 @@ class Jenkins(AgentCheck):
            # so that we only send events that occurred after the agent
# started.
# (Setting high_watermarks in the next statement prevents
# any kind of infinite loop (assuming nothing ever sets
# high_watermarks to None again!))
# any kind of infinite loop (assuming nothing ever sets
# high_watermarks to None again!))
self.high_watermarks[instance.get('name')] = defaultdict(lambda: 0)
self.check(instance, create_event=False)
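
The comment block above describes the Jenkins check's high-watermark trick: the first pass records the newest build timestamp per job in a `defaultdict(lambda: 0)` and emits nothing, so only builds that finish after the agent starts produce events. A standalone sketch of that pattern with hypothetical job data (function and variable names are illustrative):

    from collections import defaultdict

    high_watermarks = defaultdict(lambda: 0)  # job_name -> newest timestamp already seen

    def new_builds(builds, first_run=False):
        """Yield only builds newer than the stored watermark, then advance it."""
        for job_name, timestamp in builds:
            if not first_run and timestamp > high_watermarks[job_name]:
                yield job_name, timestamp
            high_watermarks[job_name] = max(high_watermarks[job_name], timestamp)

    # First pass just primes the watermarks; later passes report only newer builds.
    list(new_builds([('web', 100), ('api', 90)], first_run=True))
    print(list(new_builds([('web', 120), ('api', 85)])))   # [('web', 120)]
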
@ -150,7 +151,8 @@ class Jenkins(AgentCheck):
dimensions = {'job_name': output['job_name']}
if 'branch' in output:
dimensions['branch'] = output['branch']
self.gauge("jenkins.job.duration", float(output['duration'])/1000.0, dimensions=dimensions)
self.gauge("jenkins.job.duration", float(
output['duration']) / 1000.0, dimensions=dimensions)
if output['result'] == 'SUCCESS':
self.increment('jenkins.job.success', dimensions=dimensions)
@ -168,4 +170,3 @@ class Jenkins(AgentCheck):
'jenkins_home': agentConfig.get('hudson_home'),
}]
}

View File

@ -22,6 +22,7 @@ import random
class KafkaCheck(AgentCheck):
def check(self, instance):
consumer_groups = self.read_config(instance, 'consumer_groups',
cast=self._validate_consumer_groups)
@ -131,4 +132,5 @@ consumer_groups:
return host_ports
except Exception, e:
self.log.exception(e)
raise Exception('Could not parse %s. Must be in the form of `host0:port0,host1:port1,host2:port2`' % val)
raise Exception(
'Could not parse %s. Must be in the form of `host0:port0,host1:port1,host2:port2`' % val)
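
The error message above documents the expected `host0:port0,host1:port1` form for the Kafka hosts setting. A minimal sketch of a parser for that shape; the helper name is illustrative, not taken from the check:

    def parse_host_ports(val):
        """Turn 'host0:9092,host1:9092' into [('host0', 9092), ('host1', 9092)]."""
        host_ports = []
        for item in val.split(','):
            host, port = item.strip().rsplit(':', 1)
            host_ports.append((host, int(port)))
        return host_ports

    print(parse_host_ports('kafka1:9092, kafka2:9092'))
    # [('kafka1', 9092), ('kafka2', 9092)]
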

View File

@ -15,31 +15,31 @@ class KyotoTycoonCheck(AgentCheck):
"""
GAUGES = {
'repl_delay': 'replication.delay',
'serv_thread_count': 'threads',
'repl_delay': 'replication.delay',
'serv_thread_count': 'threads',
}
RATES = {
'serv_conn_count': 'connections',
'cnt_get': 'ops.get.hits',
'cnt_get_misses': 'ops.get.misses',
'cnt_set': 'ops.set.hits',
'cnt_set_misses': 'ops.set.misses',
'cnt_remove': 'ops.del.hits',
'cnt_remove_misses': 'ops.del.misses',
'serv_conn_count': 'connections',
'cnt_get': 'ops.get.hits',
'cnt_get_misses': 'ops.get.misses',
'cnt_set': 'ops.set.hits',
'cnt_set_misses': 'ops.set.misses',
'cnt_remove': 'ops.del.hits',
'cnt_remove_misses': 'ops.del.misses',
}
DB_GAUGES = {
'count': 'records',
'size': 'size',
'count': 'records',
'size': 'size',
}
TOTALS = {
'cnt_get': 'ops.get.total',
'cnt_get_misses': 'ops.get.total',
'cnt_set': 'ops.set.total',
'cnt_set_misses': 'ops.set.total',
'cnt_remove': 'ops.del.total',
'cnt_remove_misses': 'ops.del.total',
'cnt_get': 'ops.get.total',
'cnt_get_misses': 'ops.get.total',
'cnt_set': 'ops.set.total',
'cnt_set_misses': 'ops.set.total',
'cnt_remove': 'ops.del.total',
'cnt_remove_misses': 'ops.del.total',
}
def check(self, instance):
@ -65,7 +65,7 @@ class KyotoTycoonCheck(AgentCheck):
if key in self.GAUGES:
name = self.GAUGES[key]
self.gauge('kyototycoon.%s' % name, float(value), dimensions=dimensions)
elif key in self.RATES:
name = self.RATES[key]
self.rate('kyototycoon.%s_per_s' % name, float(value), dimensions=dimensions)

View File

@ -6,6 +6,7 @@ from monagent.collector.checks.utils import add_basic_auth
class Lighttpd(AgentCheck):
"""Tracks basic connection/requests/workers metrics
See http://redmine.lighttpd.net/projects/1/wiki/Docs_ModStatus for Lighttpd details
@ -114,10 +115,12 @@ class Lighttpd(AgentCheck):
url_suffix = self.URL_SUFFIX_PER_VERSION[server_version]
if self.assumed_url.get(instance['lighttpd_status_url'], None) is None and url[-len(url_suffix):] != url_suffix:
self.assumed_url[instance['lighttpd_status_url']] = '%s%s' % (url, url_suffix)
self.warning("Assuming url was not correct. Trying to add %s suffix to the url" % url_suffix)
self.warning(
"Assuming url was not correct. Trying to add %s suffix to the url" % url_suffix)
self.check(instance)
else:
raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url." % instance['lighttpd_status_url'])
raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url." % instance[
'lighttpd_status_url'])
def _get_server_version(self, headers):
for h in headers:
@ -133,4 +136,3 @@ class Lighttpd(AgentCheck):
self.log.debug("Lighttpd server version is Unknown")
return "Unknown"

View File

@ -9,34 +9,34 @@ from monagent.collector.checks import *
# version string Version string of this server
# pointer_size 32 Default size of pointers on the host OS
# (generally 32 or 64)
# rusage_user 32u:32u Accumulated user time for this process
# rusage_user 32u:32u Accumulated user time for this process
# (seconds:microseconds)
# rusage_system 32u:32u Accumulated system time for this process
# rusage_system 32u:32u Accumulated system time for this process
# (seconds:microseconds)
# curr_items 32u Current number of items stored by the server
# total_items 32u Total number of items stored by this server
# total_items 32u Total number of items stored by this server
# ever since it started
# bytes 64u Current number of bytes used by this server
# bytes 64u Current number of bytes used by this server
# to store items
# curr_connections 32u Number of open connections
# total_connections 32u Total number of connections opened since
# total_connections 32u Total number of connections opened since
# the server started running
# connection_structures 32u Number of connection structures allocated
# connection_structures 32u Number of connection structures allocated
# by the server
# cmd_get 64u Cumulative number of retrieval requests
# cmd_set 64u Cumulative number of storage requests
# get_hits 64u Number of keys that have been requested and
# get_hits 64u Number of keys that have been requested and
# found present
# get_misses 64u Number of items that have been requested
# get_misses 64u Number of items that have been requested
# and not found
# evictions 64u Number of valid items removed from cache
# to free memory for new items
# bytes_read 64u Total number of bytes read by this server
# bytes_read 64u Total number of bytes read by this server
# from network
# bytes_written 64u Total number of bytes sent by this server to
# bytes_written 64u Total number of bytes sent by this server to
# network
# limit_maxbytes 32u Number of bytes this server is allowed to
# use for storage.
# use for storage.
# threads 32u Number of worker threads requested.
# (see doc/threads.txt)
# >>> mc.get_stats()
@ -54,6 +54,7 @@ from monagent.collector.checks import *
# http://www.couchbase.org/wiki/display/membase/Membase+Statistics
# https://github.com/membase/ep-engine/blob/master/docs/stats.org
class Memcache(AgentCheck):
DEFAULT_PORT = 11211
@ -102,7 +103,8 @@ class Memcache(AgentCheck):
mc = memcache.Client(["%s:%d" % (server, port)])
raw_stats = mc.get_stats()
assert len(raw_stats) == 1 and len(raw_stats[0]) == 2, "Malformed response: %s" % raw_stats
assert len(raw_stats) == 1 and len(
raw_stats[0]) == 2, "Malformed response: %s" % raw_stats
# Access the dict
stats = raw_stats[0][1]
for metric in stats:
@ -147,7 +149,8 @@ class Memcache(AgentCheck):
except ZeroDivisionError:
pass
except AssertionError:
raise Exception("Unable to retrieve stats from memcache instance: " + server + ":" + str(port) + ". Please check your configuration")
raise Exception("Unable to retrieve stats from memcache instance: " +
server + ":" + str(port) + ". Please check your configuration")
if mc is not None:
mc.disconnect_all()
@ -162,7 +165,8 @@ class Memcache(AgentCheck):
try:
import memcache
except ImportError:
raise Exception("Cannot import memcache module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/mcache")
raise Exception(
"Cannot import memcache module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/mcache")
# Hacky monkeypatch to fix a memory leak in the memcache library.
# See https://github.com/DataDog/dd-agent/issues/278 for details.
@ -192,9 +196,9 @@ class Memcache(AgentCheck):
all_instances.append(instance)
# Load the conf according to the new schema
#memcache_instance_1: first_host:first_port:first_tag
#memcache_instance_2: second_host:second_port:second_tag
#memcache_instance_3: third_host:third_port:third_tag
# memcache_instance_1: first_host:first_port:first_tag
# memcache_instance_2: second_host:second_port:second_tag
# memcache_instance_3: third_host:third_port:third_tag
index = 1
instance = agentConfig.get("memcache_instance_%s" % index, None)
while instance:

View File

@ -6,7 +6,6 @@ from monagent.collector.checks import AgentCheck
from monagent.common.util import get_hostname
# When running with pymongo < 2.0
# Not the full spec for mongo URIs -- just extract username and password
# http://www.mongodb.org/display/DOCS/connections6
@ -113,16 +112,26 @@ class MongoDb(AgentCheck):
state of a mongo node"""
def get_state_description(state):
if state == 0: return 'Starting Up'
elif state == 1: return 'Primary'
elif state == 2: return 'Secondary'
elif state == 3: return 'Recovering'
elif state == 4: return 'Fatal'
elif state == 5: return 'Starting up (forking threads)'
elif state == 6: return 'Unknown'
elif state == 7: return 'Arbiter'
elif state == 8: return 'Down'
elif state == 9: return 'Rollback'
if state == 0:
return 'Starting Up'
elif state == 1:
return 'Primary'
elif state == 2:
return 'Secondary'
elif state == 3:
return 'Recovering'
elif state == 4:
return 'Fatal'
elif state == 5:
return 'Starting up (forking threads)'
elif state == 6:
return 'Unknown'
elif state == 7:
return 'Arbiter'
elif state == 8:
return 'Down'
elif state == 9:
return 'Rollback'
status = get_state_description(state)
hostname = get_hostname(agentConfig)
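
The reflowed `get_state_description` above is a plain if/elif ladder over MongoDB replica-set state codes. The same table reads naturally as a dict lookup; a sketch of that alternative formulation (not what the check does, just an equivalent way to express the mapping):

    REPL_STATES = {
        0: 'Starting Up',
        1: 'Primary',
        2: 'Secondary',
        3: 'Recovering',
        4: 'Fatal',
        5: 'Starting up (forking threads)',
        6: 'Unknown',
        7: 'Arbiter',
        8: 'Down',
        9: 'Rollback',
    }

    def get_state_description(state):
        # Fall back to 'Unknown' for codes the table does not cover.
        return REPL_STATES.get(state, 'Unknown')

    print(get_state_description(1))  # Primary
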
@ -152,7 +161,7 @@ class MongoDb(AgentCheck):
'ssl': instance.get('ssl', None),
'ssl_keyfile': instance.get('ssl_keyfile', None),
'ssl_certfile': instance.get('ssl_certfile', None),
'ssl_cert_reqs': instance.get('ssl_cert_reqs', None),
'ssl_cert_reqs': instance.get('ssl_cert_reqs', None),
'ssl_ca_certs': instance.get('ssl_ca_certs', None)
}
@ -166,8 +175,10 @@ class MongoDb(AgentCheck):
try:
from pymongo import Connection
except ImportError:
self.log.error('mongo.yaml exists but pymongo module can not be imported. Skipping check.')
raise Exception('Python PyMongo Module can not be imported. Please check the installation instruction on the Datadog Website')
self.log.error(
'mongo.yaml exists but pymongo module can not be imported. Skipping check.')
raise Exception(
'Python PyMongo Module can not be imported. Please check the installation instruction on the Datadog Website')
try:
from pymongo import uri_parser
@ -194,7 +205,7 @@ class MongoDb(AgentCheck):
do_auth = False
conn = Connection(server, network_timeout=DEFAULT_TIMEOUT,
**ssl_params)
**ssl_params)
db = conn[db_name]
if do_auth:
if not db.authenticate(username, password):
@ -204,7 +215,8 @@ class MongoDb(AgentCheck):
status['stats'] = db.command('dbstats')
# Handle replica data, if any
# See http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus
# See
# http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus
try:
data = {}
@ -224,11 +236,11 @@ class MongoDb(AgentCheck):
if current is not None and primary is not None:
lag = current['optimeDate'] - primary['optimeDate']
            # Python 2.7 has this built in, python < 2.7 doesn't...
if hasattr(lag,'total_seconds'):
if hasattr(lag, 'total_seconds'):
data['replicationLag'] = lag.total_seconds()
else:
data['replicationLag'] = (lag.microseconds + \
(lag.seconds + lag.days * 24 * 3600) * 10**6) / 10.0**6
data['replicationLag'] = (lag.microseconds +
(lag.seconds + lag.days * 24 * 3600) * 10 ** 6) / 10.0 ** 6
if current is not None:
data['health'] = current['health']

View File

@ -45,6 +45,7 @@ STATUS_VARS = {
class MySql(AgentCheck):
def __init__(self, name, init_config, agent_config):
AgentCheck.__init__(self, name, init_config, agent_config)
self.mysql_version = {}
@ -63,7 +64,8 @@ class MySql(AgentCheck):
return {"MySQLdb": version}
def check(self, instance):
host, port, user, password, mysql_sock, defaults_file, dimensions, options = self._get_config(instance)
host, port, user, password, mysql_sock, defaults_file, dimensions, options = self._get_config(
instance)
if (not host or not user) and not defaults_file:
raise Exception("Mysql host and user are needed.")
@ -92,23 +94,23 @@ class MySql(AgentCheck):
import MySQLdb
except ImportError:
raise Exception("Cannot import MySQLdb module. Check the instructions "
"to install this module at https://app.datadoghq.com/account/settings#integrations/mysql")
"to install this module at https://app.datadoghq.com/account/settings#integrations/mysql")
if defaults_file != '':
db = MySQLdb.connect(read_default_file=defaults_file)
elif mysql_sock != '':
elif mysql_sock != '':
db = MySQLdb.connect(unix_socket=mysql_sock,
user=user,
passwd=password)
user=user,
passwd=password)
elif port:
db = MySQLdb.connect(host=host,
port=port,
user=user,
passwd=password)
port=port,
user=user,
passwd=password)
else:
db = MySQLdb.connect(host=host,
user=user,
passwd=password)
user=user,
passwd=password)
self.log.debug("Connected to MySQL")
return db
@ -125,15 +127,21 @@ class MySql(AgentCheck):
# Be sure InnoDB is enabled
if 'Innodb_page_size' in results:
page_size = self._collect_scalar('Innodb_page_size', results)
innodb_buffer_pool_pages_total = self._collect_scalar('Innodb_buffer_pool_pages_total', results)
innodb_buffer_pool_pages_free = self._collect_scalar('Innodb_buffer_pool_pages_free', results)
innodb_buffer_pool_pages_total = self._collect_scalar(
'Innodb_buffer_pool_pages_total', results)
innodb_buffer_pool_pages_free = self._collect_scalar(
'Innodb_buffer_pool_pages_free', results)
innodb_buffer_pool_pages_total = innodb_buffer_pool_pages_total * page_size
innodb_buffer_pool_pages_free = innodb_buffer_pool_pages_free * page_size
innodb_buffer_pool_pages_used = innodb_buffer_pool_pages_total - innodb_buffer_pool_pages_free
innodb_buffer_pool_pages_used = innodb_buffer_pool_pages_total - \
innodb_buffer_pool_pages_free
self.gauge("mysql.innodb.buffer_pool_free", innodb_buffer_pool_pages_free, dimensions=dimensions)
self.gauge("mysql.innodb.buffer_pool_used", innodb_buffer_pool_pages_used, dimensions=dimensions)
self.gauge("mysql.innodb.buffer_pool_total", innodb_buffer_pool_pages_total, dimensions=dimensions)
self.gauge("mysql.innodb.buffer_pool_free",
innodb_buffer_pool_pages_free, dimensions=dimensions)
self.gauge("mysql.innodb.buffer_pool_used",
innodb_buffer_pool_pages_used, dimensions=dimensions)
self.gauge("mysql.innodb.buffer_pool_total",
innodb_buffer_pool_pages_total, dimensions=dimensions)
if 'galera_cluster' in options and options['galera_cluster']:
value = self._collect_scalar('wsrep_cluster_size', results)
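
The buffer-pool hunk above converts page counts into bytes before reporting: total and free come back as pages, both are multiplied by `Innodb_page_size`, and used is the difference. A tiny worked example with assumed values (16 KiB pages; the figures are illustrative):

    page_size = 16384                 # Innodb_page_size, bytes
    pages_total = 8192                # Innodb_buffer_pool_pages_total
    pages_free = 1024                 # Innodb_buffer_pool_pages_free

    buffer_pool_total = pages_total * page_size              # 134217728 bytes (128 MiB)
    buffer_pool_free = pages_free * page_size                #  16777216 bytes  (16 MiB)
    buffer_pool_used = buffer_pool_total - buffer_pool_free  # 117440512 bytes (112 MiB)

    print(buffer_pool_total, buffer_pool_free, buffer_pool_used)
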
@ -181,7 +189,8 @@ class MySql(AgentCheck):
greater_502 = True
except Exception, exception:
self.warning("Cannot compute mysql version, assuming older than 5.0.2: %s" % str(exception))
self.warning("Cannot compute mysql version, assuming older than 5.0.2: %s" %
str(exception))
self.greater_502[host] = greater_502
@ -250,7 +259,8 @@ class MySql(AgentCheck):
else:
self.log.debug("Received value is None for index %d" % col_idx)
except ValueError:
self.log.exception("Cannot find %s in the columns %s" % (field, cursor.description))
self.log.exception("Cannot find %s in the columns %s" %
(field, cursor.description))
cursor.close()
del cursor
except Exception:
@ -281,10 +291,13 @@ class MySql(AgentCheck):
# Convert time to s (number of second of CPU used by mysql)
# It's a counter, it will be divided by the period, multiply by 100
# to get the percentage of CPU used by mysql over the period
self.rate("mysql.performance.user_time", int((float(ucpu)/float(clk_tck)) * 100), dimensions=dimensions)
self.rate("mysql.performance.kernel_time", int((float(kcpu)/float(clk_tck)) * 100), dimensions=dimensions)
self.rate("mysql.performance.user_time", int(
(float(ucpu) / float(clk_tck)) * 100), dimensions=dimensions)
self.rate("mysql.performance.kernel_time", int(
(float(kcpu) / float(clk_tck)) * 100), dimensions=dimensions)
except Exception:
self.warning("Error while reading mysql (pid: %s) procfs data\n%s" % (pid, traceback.format_exc()))
self.warning("Error while reading mysql (pid: %s) procfs data\n%s" %
(pid, traceback.format_exc()))
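
The rate calls above turn `/proc/<pid>/stat` jiffy counters into CPU percentages: divide the user and system jiffies by the clock tick rate to get seconds, multiply by 100, and let the rate metric divide by the collection period. A hedged Linux-only sketch of the conversion (field positions follow the proc(5) layout and assume the process name contains no spaces):

    import os

    def mysql_cpu_times(pid):
        """Return (user, kernel) CPU counters, in percent-seconds, for a pid."""
        clk_tck = os.sysconf(os.sysconf_names['SC_CLK_TCK'])  # jiffies per second
        with open('/proc/%s/stat' % pid) as f:
            fields = f.read().split()
        ucpu, kcpu = float(fields[13]), float(fields[14])  # utime, stime in jiffies
        # Counter values; a rate over the collection interval yields percent CPU.
        return int(ucpu / clk_tck * 100), int(kcpu / clk_tck * 100)
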
def _get_server_pid(self, db):
pid = None
@ -331,10 +344,10 @@ class MySql(AgentCheck):
return {
'instances': [{
'server': agent_config.get('mysql_server',''),
'sock': agent_config.get('mysql_sock',''),
'user': agent_config.get('mysql_user',''),
'pass': agent_config.get('mysql_pass',''),
'server': agent_config.get('mysql_server', ''),
'sock': agent_config.get('mysql_sock', ''),
'user': agent_config.get('mysql_user', ''),
'pass': agent_config.get('mysql_pass', ''),
'options': {'replication': True},
}]
}

View File

@ -13,6 +13,7 @@ from monagent.collector.checks.services_checks import ServicesCheck, Status
class WrapNagios(ServicesCheck):
"""Inherit ServicesCheck class to process Nagios checks"""
def __init__(self, name, init_config, agent_config, instances=None):
@ -55,7 +56,8 @@ class WrapNagios(ServicesCheck):
if last_run_path.endswith('/') is False:
last_run_path += '/'
last_run_file = (last_run_path + 'nagios_wrapper_' + hashlib.md5(instance['service_name']).hexdigest() + '.pck')
last_run_file = (
last_run_path + 'nagios_wrapper_' + hashlib.md5(instance['service_name']).hexdigest() + '.pck')
# Load last-run data from shared memory file
last_run_data = {}

View File

@ -86,7 +86,7 @@ class Network(AgentCheck):
        # For reasons I don't understand only these metrics are skipped if a
# particular interface is in the `excluded_interfaces` config list.
# Not sure why the others aren't included. Until I understand why, I'm
# Not sure why the others aren't included. Until I understand why, I'm
# going to keep the same behaviour.
exclude_iface_metrics = [
'packets_in',
@ -104,7 +104,6 @@ class Network(AgentCheck):
count += 1
self.log.debug("tracked %s network metrics for interface %s" % (count, iface))
@staticmethod
def _parse_value(v):
if v == "-":
@ -150,7 +149,6 @@ class Network(AgentCheck):
for metric, value in metrics.iteritems():
self.gauge(metric, value)
proc = open('/proc/net/dev', 'r')
try:
lines = proc.readlines()
@ -160,7 +158,8 @@ class Network(AgentCheck):
# face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed
# lo:45890956 112797 0 0 0 0 0 0 45890956 112797 0 0 0 0 0 0
# eth0:631947052 1042233 0 19 0 184 0 1206 1208625538 1320529 0 0 0 0 0 0
# eth1: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# eth1: 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0
for l in lines[2:]:
cols = l.split(':', 1)
x = cols[1].split()
@ -198,7 +197,8 @@ class Network(AgentCheck):
# ham0 1404 <Link#6> 7a:79:05:4d:bf:f5 30100 0 6815204 18742 0 8494811 0
# ham0 1404 5 5.77.191.245 30100 - 6815204 18742 - 8494811 -
# ham0 1404 seneca.loca fe80:6::7879:5ff: 30100 - 6815204 18742 - 8494811 -
# ham0 1404 2620:9b::54 2620:9b::54d:bff5 30100 - 6815204 18742 - 8494811 -
# ham0 1404 2620:9b::54 2620:9b::54d:bff5 30100 - 6815204
# 18742 - 8494811 -
lines = netstat.split("\n")
headers = lines[0].split()

View File

@ -7,6 +7,7 @@ from monagent.collector.checks.utils import add_basic_auth
class Nginx(AgentCheck):
"""Tracks basic nginx metrics via the status module
* number of connections
    * number of requests per second
@ -21,6 +22,7 @@ class Nginx(AgentCheck):
Reading: 0 Writing: 2 Waiting: 6
"""
def check(self, instance):
if 'nginx_status_url' not in instance:
raise Exception('NginX instance missing "nginx_status_url" value.')
@ -37,7 +39,6 @@ class Nginx(AgentCheck):
request = urllib2.urlopen(req)
return request.read()
def _get_metrics(self, response, dimensions):
# Thanks to http://hostingfu.com/files/nginx/nginxstats.py for this code
# Connections
@ -85,7 +86,7 @@ class Nginx(AgentCheck):
'nginx_status_url': ":".join(instance[:-1]),
'dimensions': {'instance': instance[-1]}
})
load_conf(index+1)
load_conf(index + 1)
load_conf()

View File

@ -4,6 +4,7 @@ from monagent.collector.checks import AgentCheck
class PostfixCheck(AgentCheck):
"""This check provides metrics on the number of messages in a given postfix queue
WARNING: the user that dd-agent runs as must have sudo access for the 'find' command
@ -16,6 +17,7 @@ class PostfixCheck(AgentCheck):
"directory" - the value of 'postconf -h queue_directory'
"queues" - the postfix mail queues you would like to get message count totals for
"""
def check(self, instance):
config = self._get_config(instance)
@ -67,4 +69,3 @@ class PostfixCheck(AgentCheck):
# these can be retrieved in a single graph statement
# for example:
# sum:postfix.queue.size{instance:postfix-2,queue:incoming,host:hostname.domain.tld}

View File

@ -7,12 +7,13 @@ class ShouldRestartException(Exception):
class PostgreSql(AgentCheck):
"""Collects per-database, and optionally per-relation metrics
"""
RATE = AgentCheck.rate
GAUGE = AgentCheck.gauge
# turning columns into dimensions
DB_METRICS = {
'descriptors': [('datname', 'db')],
@ -133,7 +134,7 @@ SELECT relname,
metric_scope = (self.DB_METRICS,)
else:
metric_scope = (self.DB_METRICS, self.REL_METRICS, self.IDX_METRICS)
for scope in metric_scope:
# build query
cols = scope['metrics'].keys() # list of metrics to query, in some order
@ -143,7 +144,7 @@ SELECT relname,
except InterfaceError, e:
self.log.error("Connection seems broken: %s" % str(e))
raise ShouldRestartException
# if this is a relation-specific query, we need to list all relations last
if scope['relation'] and len(relations) > 0:
query = scope['query'] % (", ".join(cols), "%s") # Keep the last %s intact
@ -156,7 +157,7 @@ SELECT relname,
results = cursor.fetchall()
cursor.close()
# parse & submit results
# A row should look like this
# (descriptor, descriptor, ..., value, value, value, value, ...)
@ -166,7 +167,7 @@ SELECT relname,
desc = scope['descriptors']
# Check that all columns will be processed
assert len(row) == len(cols) + len(desc)
# Build dimensions
# descriptors are: (pg_name, dd_tag_name): value
# Special-case the "db" tag, which overrides the one that is passed as instance_dimensions
@ -181,13 +182,13 @@ SELECT relname,
# metric-map is: (dd_name, "rate"|"gauge")
# shift the results since the first columns will be the "descriptors"
values = zip([scope['metrics'][c] for c in cols], row[len(desc):])
# To submit simply call the function for each value v
# v[0] == (metric_name, submit_function)
# v[1] == the actual value
# dimensions are
[v[0][1](self, v[0][0], v[1], dimensions=dimensions) for v in values]
def get_connection(self, key, host, port, user, password, dbname, use_cached=True):
"Get and memoize connections to instances"
if key in self.dbs and use_cached:
@ -197,17 +198,18 @@ SELECT relname,
try:
import psycopg2 as pg
except ImportError:
raise ImportError("psycopg2 library cannot be imported. Please check the installation instruction on the Datadog Website.")
raise ImportError(
"psycopg2 library cannot be imported. Please check the installation instruction on the Datadog Website.")
if host == 'localhost' and password == '':
# Use ident method
connection = pg.connect("user=%s dbname=%s" % (user, dbname))
elif port != '':
connection = pg.connect(host=host, port=port, user=user,
password=password, database=dbname)
password=password, database=dbname)
else:
connection = pg.connect(host=host, user=user, password=password,
database=dbname)
database=dbname)
else:
if not host:
raise CheckException("Please specify a Postgres host to connect to.")
@ -220,7 +222,7 @@ SELECT relname,
# connection.autocommit was added in version 2.4.2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
self.dbs[key] = connection
return connection
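
The connection hunk above memoizes one psycopg2 connection per instance key and switches it to autocommit so the stat queries never sit in an open transaction. A minimal sketch of that setup, assuming psycopg2 is installed; host and credentials are placeholders:

    import psycopg2
    from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

    _connections = {}  # (host, port, dbname) -> live connection

    def get_connection(host, port, user, password, dbname):
        key = (host, port, dbname)
        if key not in _connections or _connections[key].closed:
            conn = psycopg2.connect(host=host, port=port, user=user,
                                    password=password, database=dbname)
            # Stat queries are read-only; autocommit avoids idle-in-transaction sessions.
            conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
            _connections[key] = conn
        return _connections[key]
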
@ -233,21 +235,21 @@ SELECT relname,
dbname = instance.get('dbname', 'postgres')
relations = instance.get('relations', [])
key = '%s:%s:%s' % (host, port,dbname)
key = '%s:%s:%s' % (host, port, dbname)
db = self.get_connection(key, host, port, user, password, dbname)
# Clean up dimensions in case there was a None entry in the instance
# e.g. if the yaml contains dimensions: but no actual dimensions
if dimensions is None:
dimensions = {}
# preset dimensions to the database name
dimensions["db"] = dbname
# Check version
version = self._get_version(key, db)
self.log.debug("Running check against version %s" % version)
# Collect metrics
try:
self._collect_stats(key, db, dimensions, relations)
@ -258,10 +260,10 @@ SELECT relname,
@staticmethod
def parse_agent_config(agentConfig):
server = agentConfig.get('postgresql_server','')
port = agentConfig.get('postgresql_port','')
user = agentConfig.get('postgresql_user','')
passwd = agentConfig.get('postgresql_pass','')
server = agentConfig.get('postgresql_server', '')
port = agentConfig.get('postgresql_port', '')
user = agentConfig.get('postgresql_user', '')
passwd = agentConfig.get('postgresql_pass', '')
if server != '' and user != '':
return {

View File

@ -93,7 +93,7 @@ class ProcessCheck(AgentCheck):
# process metrics available for psutil versions 0.5.0 and later on UNIX
extended_metrics_0_5_0_unix = self.is_psutil_version_later_than((0, 5, 0)) and \
Platform.is_unix()
Platform.is_unix()
if extended_metrics_0_5_0_unix:
open_file_descriptors = 0
else:
@ -158,11 +158,12 @@ class ProcessCheck(AgentCheck):
pass
if got_denied:
self.warning("The Monitoring Agent was denied access when trying to get the number of file descriptors")
self.warning(
"The Monitoring Agent was denied access when trying to get the number of file descriptors")
#Memory values are in Byte
# Memory values are in Byte
return (thr, cpu, rss, vms, real, open_file_descriptors,
read_count, write_count, read_bytes, write_bytes, voluntary_ctx_switches, involuntary_ctx_switches)
read_count, write_count, read_bytes, write_bytes, voluntary_ctx_switches, involuntary_ctx_switches)
def check(self, instance):
try:
@ -194,7 +195,7 @@ class ProcessCheck(AgentCheck):
self.gauge('processes_pid_count', len(pids), dimensions=dimensions)
metrics = dict(zip(ProcessCheck.PROCESS_GAUGE, self.get_process_metrics(pids,
psutil, cpu_check_interval)))
psutil, cpu_check_interval)))
for metric, value in metrics.iteritems():
if value is not None:

View File

@ -11,7 +11,8 @@ QUEUE_TYPE = 'queues'
NODE_TYPE = 'nodes'
MAX_DETAILED_QUEUES = 200
MAX_DETAILED_NODES = 100
# Post an event in the stream when the number of queues or nodes to collect is above 90% of the limit
# Post an event in the stream when the number of queues or nodes to
# collect is above 90% of the limit
ALERT_THRESHOLD = 0.9
QUEUE_ATTRIBUTES = ['active_consumers',
'consumers',
@ -39,6 +40,7 @@ METRIC_SUFFIX = {QUEUE_TYPE: "queue", NODE_TYPE: "node"}
class RabbitMQ(AgentCheck):
"""This check is for gathering statistics from the RabbitMQ
Management Plugin (http://www.rabbitmq.com/management.html)
"""
@ -67,7 +69,7 @@ class RabbitMQ(AgentCheck):
}
# List of queues/nodes to collect metrics from
specified = {
specified = {
QUEUE_TYPE: instance.get('queues', []),
NODE_TYPE: instance.get('nodes', []),
}
@ -78,16 +80,17 @@ class RabbitMQ(AgentCheck):
# setup urllib2 for Basic Auth
auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='RabbitMQ Management', uri=base_url, user=username, passwd=password)
auth_handler.add_password(
realm='RabbitMQ Management', uri=base_url, user=username, passwd=password)
opener = urllib2.build_opener(auth_handler)
urllib2.install_opener(opener)
return base_url, max_detailed, specified
def check(self, instance):
base_url, max_detailed, specified = self._get_config(instance)
self.get_stats(instance, base_url, QUEUE_TYPE, max_detailed[QUEUE_TYPE], specified[QUEUE_TYPE])
self.get_stats(
instance, base_url, QUEUE_TYPE, max_detailed[QUEUE_TYPE], specified[QUEUE_TYPE])
self.get_stats(instance, base_url, NODE_TYPE, max_detailed[NODE_TYPE], specified[NODE_TYPE])
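
The setup wrapped above installs a global urllib2 opener with HTTP basic auth before hitting the RabbitMQ management API. A standalone sketch of that pattern (Python 2 urllib2; the URL and credentials are placeholders, and the management plugin is assumed to be enabled):

    import json
    import urllib2
    import urlparse

    base_url = 'http://localhost:15672/api/'   # RabbitMQ management root (placeholder)

    auth_handler = urllib2.HTTPBasicAuthHandler()
    auth_handler.add_password(realm='RabbitMQ Management', uri=base_url,
                              user='guest', passwd='guest')
    urllib2.install_opener(urllib2.build_opener(auth_handler))

    # Every urllib2.urlopen call now reuses the installed opener and its credentials.
    queues = json.loads(urllib2.urlopen(urlparse.urljoin(base_url, 'queues')).read())
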
@staticmethod
@ -100,7 +103,6 @@ class RabbitMQ(AgentCheck):
raise Exception('Cannot parse JSON response from API url: %s %s' % (url, str(e)))
return data
def get_stats(self, instance, base_url, object_type, max_detailed, specified_list):
"""
instance: the check instance
@ -111,20 +113,23 @@ class RabbitMQ(AgentCheck):
"""
data = self._get_data(urlparse.urljoin(base_url, object_type))
specified_items = list(specified_list) # Make a copy of this list as we will remove items from it at each iteration
# Make a copy of this list as we will remove items from it at each iteration
specified_items = list(specified_list)
""" data is a list of nodes or queues:
data = [
{'status': 'running', 'node': 'rabbit@host', 'name': 'queue1', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
{'status': 'running', 'node': 'rabbit@host, 'name': 'queue10', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
{'status': 'running', 'node': 'rabbit@host', 'name': 'queue11', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
{'status': 'running', 'node': 'rabbit@host', 'name': 'queue1', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
{'status': 'running', 'node': 'rabbit@host, 'name': 'queue10', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
{'status': 'running', 'node': 'rabbit@host', 'name': 'queue11', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
...
]
"""
if len(specified_items) > max_detailed:
raise Exception("The maximum number of %s you can specify is %d." % (object_type, max_detailed))
raise Exception("The maximum number of %s you can specify is %d." %
(object_type, max_detailed))
if specified_items is not None and len(specified_items) > 0: # a list of queues/nodes is specified. We process only those
# a list of queues/nodes is specified. We process only those
if specified_items is not None and len(specified_items) > 0:
if object_type == NODE_TYPE:
for data_line in data:
name = data_line.get("name")
@ -132,7 +137,7 @@ class RabbitMQ(AgentCheck):
self._get_metrics(data_line, object_type)
specified_items.remove(name)
else: # object_type == QUEUE_TYPE
else: # object_type == QUEUE_TYPE
for data_line in data:
name = data_line.get("name")
absolute_name = '%s/%s' % (data_line.get("vhost"), name)
@ -143,14 +148,16 @@ class RabbitMQ(AgentCheck):
self._get_metrics(data_line, object_type)
specified_items.remove(absolute_name)
else: # No queues/node are specified. We will process every queue/node if it's under the limit
# No queues/node are specified. We will process every queue/node if it's under the limit
else:
if len(data) > ALERT_THRESHOLD * max_detailed:
# Post a message on the dogweb stream to warn
self.alert(base_url, max_detailed, len(data), object_type)
if len(data) > max_detailed:
# Display a warning in the info page
self.warning("Too many queues to fetch. You must choose the %s you are interested in by editing the rabbitmq.yaml configuration file or get in touch with Datadog Support" % object_type)
self.warning(
"Too many queues to fetch. You must choose the %s you are interested in by editing the rabbitmq.yaml configuration file or get in touch with Datadog Support" % object_type)
for data_line in data[:max_detailed]:
# We truncate the list of nodes/queues if it's above the limit
@ -168,9 +175,11 @@ class RabbitMQ(AgentCheck):
value = data.get(attribute, None)
if value is not None:
try:
self.gauge('rabbitmq.%s.%s' % (METRIC_SUFFIX[object_type], attribute), float(value), dimensions=dimensions)
self.gauge('rabbitmq.%s.%s' % (METRIC_SUFFIX[object_type], attribute), float(
value), dimensions=dimensions)
except ValueError:
self.log.debug("Caught ValueError for %s %s = %s with dimensions: %s" % (METRIC_SUFFIX[object_type], attribute, value, dimensions))
self.log.debug("Caught ValueError for %s %s = %s with dimensions: %s" % (
METRIC_SUFFIX[object_type], attribute, value, dimensions))
def alert(self, base_url, max_detailed, size, object_type):
key = "%s%s" % (base_url, object_type)
@ -180,20 +189,21 @@ class RabbitMQ(AgentCheck):
self.already_alerted.append(key)
title = "RabbitMQ integration is approaching the limit on the number of %s that can be collected from on %s" % (object_type, self.hostname)
msg = """%s %s are present. The limit is %s.
title = "RabbitMQ integration is approaching the limit on the number of %s that can be collected from on %s" % (
object_type, self.hostname)
msg = """%s %s are present. The limit is %s.
Please get in touch with Datadog support to increase the limit.""" % (size, object_type, max_detailed)
event = {
"timestamp": int(time.time()),
"event_type": EVENT_TYPE,
"msg_title": title,
"msg_text": msg,
"alert_type": 'warning',
"source_type_name": SOURCE_TYPE_NAME,
"host": self.hostname,
"dimensions": {"base_url": base_url, "host": self.hostname},
"event_object": "rabbitmq.limit.%s" % object_type,
}
"timestamp": int(time.time()),
"event_type": EVENT_TYPE,
"msg_title": title,
"msg_text": msg,
"alert_type": 'warning',
"source_type_name": SOURCE_TYPE_NAME,
"host": self.hostname,
"dimensions": {"base_url": base_url, "host": self.hostname},
"event_object": "rabbitmq.limit.%s" % object_type,
}
self.event(event)

View File

@ -12,59 +12,58 @@ class Redis(AgentCheck):
subkeys = ['keys', 'expires']
GAUGE_KEYS = {
# Append-only metrics
'aof_last_rewrite_time_sec': 'redis.aof.last_rewrite_time',
'aof_rewrite_in_progress': 'redis.aof.rewrite',
'aof_current_size': 'redis.aof.size',
'aof_buffer_length': 'redis.aof.buffer_length',
'aof_last_rewrite_time_sec': 'redis.aof.last_rewrite_time',
'aof_rewrite_in_progress': 'redis.aof.rewrite',
'aof_current_size': 'redis.aof.size',
'aof_buffer_length': 'redis.aof.buffer_length',
# Network
'connected_clients': 'redis.net.clients',
'connected_slaves': 'redis.net.slaves',
'rejected_connections': 'redis.net.rejected',
'connected_clients': 'redis.net.clients',
'connected_slaves': 'redis.net.slaves',
'rejected_connections': 'redis.net.rejected',
# clients
'blocked_clients': 'redis.clients.blocked',
'client_biggest_input_buf': 'redis.clients.biggest_input_buf',
'client_longest_output_list': 'redis.clients.longest_output_list',
'blocked_clients': 'redis.clients.blocked',
'client_biggest_input_buf': 'redis.clients.biggest_input_buf',
'client_longest_output_list': 'redis.clients.longest_output_list',
# Keys
'evicted_keys': 'redis.keys.evicted',
'expired_keys': 'redis.keys.expired',
'evicted_keys': 'redis.keys.evicted',
'expired_keys': 'redis.keys.expired',
# stats
'keyspace_hits': 'redis.stats.keyspace_hits',
'keyspace_misses': 'redis.stats.keyspace_misses',
'latest_fork_usec': 'redis.perf.latest_fork_usec',
'keyspace_hits': 'redis.stats.keyspace_hits',
'keyspace_misses': 'redis.stats.keyspace_misses',
'latest_fork_usec': 'redis.perf.latest_fork_usec',
# pubsub
'pubsub_channels': 'redis.pubsub.channels',
'pubsub_patterns': 'redis.pubsub.patterns',
'pubsub_channels': 'redis.pubsub.channels',
'pubsub_patterns': 'redis.pubsub.patterns',
# rdb
'rdb_bgsave_in_progress': 'redis.rdb.bgsave',
'rdb_changes_since_last_save': 'redis.rdb.changes_since_last',
'rdb_last_bgsave_time_sec': 'redis.rdb.last_bgsave_time',
'rdb_bgsave_in_progress': 'redis.rdb.bgsave',
'rdb_changes_since_last_save': 'redis.rdb.changes_since_last',
'rdb_last_bgsave_time_sec': 'redis.rdb.last_bgsave_time',
# memory
'mem_fragmentation_ratio': 'redis.mem.fragmentation_ratio',
'used_memory': 'redis.mem.used',
'used_memory_lua': 'redis.mem.lua',
'used_memory_peak': 'redis.mem.peak',
'used_memory_rss': 'redis.mem.rss',
'mem_fragmentation_ratio': 'redis.mem.fragmentation_ratio',
'used_memory': 'redis.mem.used',
'used_memory_lua': 'redis.mem.lua',
'used_memory_peak': 'redis.mem.peak',
'used_memory_rss': 'redis.mem.rss',
# replication
'master_last_io_seconds_ago': 'redis.replication.last_io_seconds_ago',
'master_sync_in_progress': 'redis.replication.sync',
'master_sync_left_bytes': 'redis.replication.sync_left_bytes',
'master_last_io_seconds_ago': 'redis.replication.last_io_seconds_ago',
'master_sync_in_progress': 'redis.replication.sync',
'master_sync_left_bytes': 'redis.replication.sync_left_bytes',
}
RATE_KEYS = {
# cpu
'used_cpu_sys': 'redis.cpu.sys',
'used_cpu_sys_children': 'redis.cpu.sys_children',
'used_cpu_user': 'redis.cpu.user',
'used_cpu_user_children': 'redis.cpu.user_children',
'used_cpu_sys': 'redis.cpu.sys',
'used_cpu_sys_children': 'redis.cpu.sys_children',
'used_cpu_user': 'redis.cpu.user',
'used_cpu_user_children': 'redis.cpu.user_children',
}
def __init__(self, name, init_config, agent_config):
@ -75,6 +74,7 @@ class Redis(AgentCheck):
def get_library_versions():
try:
import redis
version = redis.__version__
except ImportError:
version = "Not Found"
@ -107,20 +107,22 @@ class Redis(AgentCheck):
def _get_conn(self, instance):
import redis
key = self._generate_instance_key(instance)
if key not in self.connections:
try:
# Only send useful parameters to the redis client constructor
list_params = ['host', 'port', 'db', 'password', 'socket_timeout',
'connection_pool', 'charset', 'errors', 'unix_socket_path']
'connection_pool', 'charset', 'errors', 'unix_socket_path']
connection_params = dict((k, instance[k]) for k in list_params if k in instance)
self.connections[key] = redis.Redis(**connection_params)
except TypeError:
raise Exception("You need a redis library that supports authenticated connections. Try sudo easy_install redis.")
raise Exception(
"You need a redis library that supports authenticated connections. Try sudo easy_install redis.")
return self.connections[key]
@ -143,12 +145,13 @@ class Redis(AgentCheck):
try:
info = conn.info()
except ValueError, e:
            # This is likely a known issue with redis library 2.0.0
            # This is likely a known issue with redis library 2.0.0
# See https://github.com/DataDog/dd-agent/issues/374 for details
import redis
raise Exception("""Unable to run the info command. This is probably an issue with your version of the python-redis library.
Minimum required version: 2.4.11
Your current version: %s
Your current version: %s
Please upgrade to a newer version by running sudo easy_install redis""" % redis.__version__)
latency_ms = round((time.time() - start) * 1000, 2)
@ -172,8 +175,10 @@ class Redis(AgentCheck):
self.gauge(metric, val, dimensions=db_dimensions)
# Save a subset of db-wide statistics
[self.gauge(self.GAUGE_KEYS[k], info[k], dimensions=dimensions) for k in self.GAUGE_KEYS if k in info]
[self.rate (self.RATE_KEYS[k], info[k], dimensions=dimensions) for k in self.RATE_KEYS if k in info]
[self.gauge(self.GAUGE_KEYS[k], info[k], dimensions=dimensions)
for k in self.GAUGE_KEYS if k in info]
[self.rate(self.RATE_KEYS[k], info[k], dimensions=dimensions)
for k in self.RATE_KEYS if k in info]
# Save the number of commands.
self.rate('redis.net.commands', info['total_commands_processed'], dimensions=dimensions)
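
The list comprehensions above fan a single INFO snapshot out to metrics: every key present in `GAUGE_KEYS` is submitted as a gauge under its mapped name, every key in `RATE_KEYS` as a rate. A small sketch of that mapping step with a stubbed submit function and a trimmed-down key table (the data and names here are illustrative):

    GAUGE_KEYS = {'connected_clients': 'redis.net.clients',
                  'used_memory': 'redis.mem.used'}

    def gauge(name, value):
        print("gauge %s = %s" % (name, value))

    # 'info' stands in for redis.Redis(...).info(); only known keys are forwarded.
    info = {'connected_clients': 7, 'used_memory': 1048576, 'redis_version': '2.8.4'}
    for key, metric_name in GAUGE_KEYS.items():
        if key in info:
            gauge(metric_name, info[key])
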
@ -182,7 +187,8 @@ class Redis(AgentCheck):
try:
import redis
except ImportError:
raise Exception('Python Redis Module can not be imported. Please check the installation instruction on the Datadog Website')
raise Exception(
'Python Redis Module can not be imported. Please check the installation instruction on the Datadog Website')
if (not "host" in instance or not "port" in instance) and not "unix_socket_path" in instance:
raise Exception("You must specify a host/port couple or a unix_socket_path")

View File

@ -49,11 +49,10 @@ class Riak(AgentCheck):
self.prev_coord_redirs_total = -1
def check(self, instance):
url = instance['url']
url = instance['url']
default_timeout = self.init_config.get('default_timeout', 5)
timeout = float(instance.get('timeout', default_timeout))
timeout = float(instance.get('timeout', default_timeout))
aggregation_key = md5(url).hexdigest()

View File

@ -32,7 +32,7 @@ class SQLServer(AgentCheck):
if row['type'] not in VALID_METRIC_TYPES:
self.log.error('%s has an invalid metric type: %s' % (row['name'], row['type']))
self.METRICS.append((row['name'], row['type'], row['counter_name'],
row.get('instance_name', ''), row.get('tag_by', None)))
row.get('instance_name', ''), row.get('tag_by', None)))
# Cache connections
self.connections = {}
@ -76,8 +76,8 @@ class SQLServer(AgentCheck):
self.connections[conn_key] = conn
except Exception, e:
cx = "%s - %s" % (host, database)
raise Exception("Unable to connect to SQL Server for instance %s.\n %s" \
% (cx, traceback.format_exc()))
raise Exception("Unable to connect to SQL Server for instance %s.\n %s"
% (cx, traceback.format_exc()))
conn = self.connections[conn_key]
cursor = conn.cursor()
@ -137,4 +137,3 @@ class SQLServer(AgentCheck):
dimensions[tag_by] = instance_name.strip()
metric_func = getattr(self, mtype)
metric_func(mname, value, dimensions=dimensions)

View File

@ -61,9 +61,11 @@ class TCPCheck(ServicesCheck):
sock.close()
except socket.timeout, e:
# The connection timed out because it took more time than the specified value in the yaml config file
# The connection timed out because it took more time than the specified
# value in the yaml config file
length = int((time.time() - start) * 1000)
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" % (addr, port, str(e), length))
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" %
(addr, port, str(e), length))
return Status.DOWN, "%s. Connection failed after %s ms" % (str(e), length)
except socket.error, e:
@ -71,18 +73,21 @@ class TCPCheck(ServicesCheck):
if "timed out" in str(e):
                # The connection timed out because it took more time than the system tcp stack allows
self.log.warning("The connection timed out because it took more time than the system tcp stack allows. You might want to change this setting to allow longer timeouts")
self.log.warning(
"The connection timed out because it took more time than the system tcp stack allows. You might want to change this setting to allow longer timeouts")
self.log.info("System tcp timeout. Assuming that the checked system is down")
return Status.DOWN, """Socket error: %s.
The connection timed out after %s ms because it took more time than the system tcp stack allows.
You might want to change this setting to allow longer timeouts""" % (str(e), length)
else:
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" % (addr, port, str(e), length))
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" %
(addr, port, str(e), length))
return Status.DOWN, "%s. Connection failed after %s ms" % (str(e), length)
except Exception, e:
length = int((time.time() - start) * 1000)
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" % (addr, port, str(e), length))
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" %
(addr, port, str(e), length))
return Status.DOWN, "%s. Connection failed after %s ms" % (str(e), length)
if response_time:
@ -92,7 +97,6 @@ class TCPCheck(ServicesCheck):
self.log.debug("%s:%s is UP" % (addr, port))
return Status.UP, "UP"
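
The socket handling above times the connect attempt and reports DOWN with the elapsed milliseconds on any timeout or error. A compact sketch of just that timing logic; host, port, and the simplified status strings are placeholders, and only IPv4 is handled:

    import socket
    import time

    def tcp_probe(addr, port, timeout=3.0):
        """Return ('UP'|'DOWN', response_time_ms) for a plain TCP connect."""
        start = time.time()
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(timeout)
        try:
            sock.connect((addr, port))
        except (socket.timeout, socket.error):
            return 'DOWN', int((time.time() - start) * 1000)
        finally:
            sock.close()
        return 'UP', int((time.time() - start) * 1000)

    print(tcp_probe('127.0.0.1', 22))
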
def _create_status_event(self, status, msg, instance):
# Get the instance settings
host = instance.get('host', None)
@ -106,7 +110,6 @@ class TCPCheck(ServicesCheck):
if custom_message:
custom_message += " \n"
# Let the possibility to override the source type name
instance_source_type_name = instance.get('source_type', None)
if instance_source_type_name is None:
@ -123,29 +126,28 @@ class TCPCheck(ServicesCheck):
notify_list.append("@%s" % handle.strip())
notify_message = " ".join(notify_list) + " \n"
if status == Status.DOWN:
title = "[Alert] %s reported that %s is down" % (self.hostname, name)
alert_type = "error"
msg = """%s %s %s reported that %s (%s:%s) failed %s time(s) within %s last attempt(s).
Last error: %s""" % (notify_message,
custom_message, self.hostname, name, host, port, nb_failures, nb_tries, msg)
custom_message, self.hostname, name, host, port, nb_failures, nb_tries, msg)
event_type = EventType.DOWN
else: # Status is UP
else: # Status is UP
title = "[Recovered] %s reported that %s is up" % (self.hostname, name)
alert_type = "success"
msg = "%s %s %s reported that %s (%s:%s) recovered." % (notify_message,
custom_message, self.hostname, name, host, port)
custom_message, self.hostname, name, host, port)
event_type = EventType.UP
return {
'timestamp': int(time.time()),
'event_type': event_type,
'host': self.hostname,
'msg_text': msg,
'msg_title': title,
'alert_type': alert_type,
"source_type_name": source_type,
"event_object": name,
'timestamp': int(time.time()),
'event_type': event_type,
'host': self.hostname,
'msg_text': msg,
'msg_title': title,
'alert_type': alert_type,
"source_type_name": source_type,
"event_object": name,
}

View File

@ -7,6 +7,7 @@ from monagent.collector.checks import AgentCheck
class Varnish(AgentCheck):
# XML parsing bits, a.k.a. Kafka in Code
def _reset(self):
self._current_element = ""
self._current_metric = "varnish"
@ -92,7 +93,7 @@ class Varnish(AgentCheck):
# Assumptions regarding varnish's version
use_xml = True
arg = "-x" # varnishstat argument
arg = "-x" # varnishstat argument
version = 3
m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE)
@ -123,7 +124,7 @@ class Varnish(AgentCheck):
dimensions[u'varnish_name': 'default']
try:
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stderr=subprocess.PIPE)
output, error = proc.communicate()
except Exception:
self.log.error(u"Failed to run %s" % repr(cmd))

View File

@ -15,6 +15,7 @@ EVENT_TYPE = 'win32_log_event'
class Win32EventLog(AgentCheck):
def __init__(self, name, init_config, agent_config):
AgentCheck.__init__(self, name, init_config, agent_config)
self.last_ts = {}
@ -81,8 +82,9 @@ class Win32EventLog(AgentCheck):
class EventLogQuery(object):
def __init__(self, ltype=None, user=None, source_name=None, log_file=None,
start_ts=None, message_filters=None):
start_ts=None, message_filters=None):
self.filters = [
('Type', self._convert_event_types(ltype)),
('User', user),
@ -139,8 +141,8 @@ class EventLogQuery(object):
time struct.
'''
return wmi.from_time(year=dt.year, month=dt.month, day=dt.day,
hours=dt.hour, minutes=dt.minute, seconds=dt.second, microseconds=0,
timezone=0)
hours=dt.hour, minutes=dt.minute, seconds=dt.second, microseconds=0,
timezone=0)
@staticmethod
def _convert_event_types(types):
@ -149,7 +151,9 @@ class EventLogQuery(object):
'''
return types
class LogEvent(object):
def __init__(self, ev, api_key, hostname, tags, notify_list):
self.event = ev
self.api_key = api_key
@ -183,9 +187,9 @@ class LogEvent(object):
''' Convert a wmi formatted timestamp into an epoch using wmi.to_time().
'''
year, month, day, hour, minute, second, microsecond, tz = \
wmi.to_time(wmi_ts)
wmi.to_time(wmi_ts)
dt = datetime(year=year, month=month, day=day, hour=hour, minute=minute,
second=second, microsecond=microsecond)
second=second, microsecond=microsecond)
return int(calendar.timegm(dt.timetuple()))
@staticmethod

View File

@ -17,6 +17,7 @@ SEARCH_WILDCARD = '*'
class WMICheck(AgentCheck):
def __init__(self, name, init_config, agent_config):
AgentCheck.__init__(self, name, init_config, agent_config)
self.wmi_conns = {}
@ -30,7 +31,7 @@ class WMICheck(AgentCheck):
def check(self, instance):
if wmi is None:
raise Exception("Missing 'wmi' module")
host = instance.get('host', None)
user = instance.get('username', None)
password = instance.get('password', None)
@ -62,7 +63,8 @@ class WMICheck(AgentCheck):
def _extract_metrics(self, results, metrics, tag_by):
if len(results) > 1 and tag_by is None:
raise Exception('WMI query returned multiple rows but no `tag_by` value was given. metrics=%s' % metrics)
raise Exception(
'WMI query returned multiple rows but no `tag_by` value was given. metrics=%s' % metrics)
for wmi_property, name, mtype in metrics:
for res in results:

View File

@ -60,7 +60,8 @@ class Zookeeper(AgentCheck):
while chunk:
if num_reads > max_reads:
# Safeguard against an infinite loop
raise Exception("Read %s bytes before exceeding max reads of %s. " % (buf.tell(), max_reads))
raise Exception(
"Read %s bytes before exceeding max reads of %s. " % (buf.tell(), max_reads))
chunk = sock.recv(chunk_size)
buf.write(chunk)
num_reads += 1
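A self-contained sketch of the same bounded-read pattern against ZooKeeper's four-letter 'stat' command; the host, port, chunk size and read limit here are assumptions, not values taken from the check's configuration:

import socket
from StringIO import StringIO


def zk_stat(host='localhost', port=2181, chunk_size=1024, max_reads=10000):
    buf = StringIO()
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((host, port))
    sock.sendall('stat')                  # four-letter admin command
    chunk = sock.recv(chunk_size)
    num_reads = 1
    while chunk:
        if num_reads > max_reads:
            # Safeguard against an infinite loop, mirroring the check above
            raise Exception("Read %s bytes before exceeding max reads of %s." %
                            (buf.tell(), max_reads))
        buf.write(chunk)
        chunk = sock.recv(chunk_size)
        num_reads += 1
    sock.close()
    return buf.getvalue()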
@ -101,7 +102,7 @@ class Zookeeper(AgentCheck):
has_connections_val = version_tuple >= ('3', '4', '4')
# Clients:
buf.readline() # skip the Clients: header
buf.readline() # skip the Clients: header
connections = 0
client_line = buf.readline().strip()
if client_line:

View File

@ -51,8 +51,10 @@ START_COMMANDS = ['start', 'restart', 'foreground']
log = logging.getLogger('collector')
# todo the collector has daemon code but is always run in foreground mode from the supervisor, is there a reason for the daemon code then?
# todo the collector has daemon code but is always run in foreground mode
# from the supervisor, is there a reason for the daemon code then?
class CollectorDaemon(Daemon):
"""
The agent class is a daemon that runs the collector in a background process.
"""
@ -115,7 +117,7 @@ class CollectorDaemon(Daemon):
# Run the main loop.
while self.run_forever:
# enable profiler if needed
profiled = False
if config.get('profile', False) and config.get('profile').lower() == 'yes':
@ -127,7 +129,7 @@ class CollectorDaemon(Daemon):
log.debug("Agent profiling is enabled")
except Exception:
log.warn("Cannot enable profiler")
# Do the work.
self.collector.run()
@ -171,7 +173,7 @@ class CollectorDaemon(Daemon):
watchdog = None
if agentConfig.get("watchdog", True):
watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER,
max_mem_mb=agentConfig.get('limit_memory_consumption', None))
max_mem_mb=agentConfig.get('limit_memory_consumption', None))
watchdog.reset()
return watchdog
@ -186,6 +188,7 @@ class CollectorDaemon(Daemon):
self.collector.stop()
sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
def main():
options, args = get_parsed_args()
agentConfig = get_config(options=options)
@ -295,20 +298,20 @@ def main():
return 0
else:
print("Fix the invalid yaml files above in order to start the Monitoring agent. "
"A useful external tool for yaml parsing can be found at "
"http://yaml-online-parser.appspot.com/")
"A useful external tool for yaml parsing can be found at "
"http://yaml-online-parser.appspot.com/")
return 1
elif 'jmx' == command:
from collector.jmxfetch import JMX_LIST_COMMANDS, JMXFetch
if len(args) < 2 or args[1] not in JMX_LIST_COMMANDS.keys():
print "#" * 80
print "JMX tool to be used to help configuring your JMX checks."
print "See http://docs.datadoghq.com/integrations/java/ for more information"
print "#" * 80
print "\n"
print "You have to specify one of the following command:"
print "You have to specify one of the following command:"
for command, desc in JMX_LIST_COMMANDS.iteritems():
print " - %s [OPTIONAL: LIST OF CHECKS]: %s" % (command, desc)
print "Example: sudo /etc/init.d/mon-agent jmx list_matching_attributes tomcat jmx solr"
@ -318,12 +321,12 @@ def main():
jmx_command = args[1]
checks_list = args[2:]
confd_directory = get_confd_path(get_os())
should_run = JMXFetch.init(confd_directory, agentConfig, get_logging_config(), 15, jmx_command, checks_list, reporter="console")
should_run = JMXFetch.init(
confd_directory, agentConfig, get_logging_config(), 15, jmx_command, checks_list, reporter="console")
if not should_run:
print "Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory
print "Have you enabled any JMX check ?"
return 0

View File

@ -36,7 +36,7 @@ LOG_PATTERN = re.compile(r"".join([
r"\s*(?P<priority>%s)\s+" % "|".join("(%s)" % p for p in LOG4J_PRIORITY),
r"(\[CompactionExecutor:\d*\]\s+)?", # optional thread name and number
r"((?P<timestamp>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2},\d*)|",
r"(?P<time>\d{2}:\d{2}:\d{2},\d*))\s+",
r"(?P<time>\d{2}:\d{2}:\d{2},\d*))\s+",
r"(\w+\.java \(line \d+\)\s+)?", # optional source file and line
r"(?P<msg>Compact(ed|ing) .*)\s*",
]))
@ -50,6 +50,7 @@ def parse_date(timestamp):
timestamp, _ = timestamp.split(',')
return common.parse_date(timestamp, LEGACY_DATE_FORMAT)
def parse_cassandra(log, line):
matched = LOG_PATTERN.match(line)
if matched:
@ -58,7 +59,8 @@ def parse_cassandra(log, line):
# Convert the timestamp string into an epoch timestamp
time_val = event.get('time', None)
if time_val:
event['timestamp'] = parse_date("%s %s" % (datetime.utcnow().strftime("%Y-%m-%d"), time_val))
event['timestamp'] = parse_date(
"%s %s" % (datetime.utcnow().strftime("%Y-%m-%d"), time_val))
else:
try:
event['timestamp'] = parse_date(event['timestamp'])
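A standalone sketch of the timestamp handling above: the log4j millisecond suffix after the comma is dropped and the remainder is parsed with a fixed format. The format string and the UTC epoch conversion are assumptions based on the surrounding code, not the actual common.parse_date implementation:

import calendar
from datetime import datetime


def parse_date(timestamp, date_format="%Y-%m-%d %H:%M:%S"):
    # Drop the ',123' millisecond suffix used by log4j timestamps
    timestamp, _ = timestamp.split(',')
    dt = datetime.strptime(timestamp, date_format)
    return calendar.timegm(dt.timetuple())

# Time-only lines carry no date, so today's UTC date is prepended first
print(parse_date("%s %s" % (datetime.utcnow().strftime("%Y-%m-%d"), "12:34:56,789")))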

View File

@ -20,9 +20,9 @@ SUPERVISORD_LEVELS = [
'INFO', # normal informational output
# IGNORED...
#'DEBG', # messages useful for users trying to debug configurations
#'TRAC', # messages useful to developers trying to debug plugins
#'BLAT', # messages useful for developers trying to debug supervisor
# 'DEBG', # messages useful for users trying to debug configurations
# 'TRAC', # messages useful to developers trying to debug plugins
# 'BLAT', # messages useful for developers trying to debug supervisor
]
@ -35,6 +35,7 @@ ALERT_TYPES_MAPPING = {"CRIT": "error",
# regex to extract the 'program' supervisord is managing from the text
program_matcher = re.compile("^\w+:? '?(?P<program>\w+)'?")
def parse_supervisord(log, line):
"""
Parse the supervisord.log line into a dogstream event
@ -42,7 +43,8 @@ def parse_supervisord(log, line):
if len(line) == 0:
log.info("Skipping empty line of supervisord.log")
return None
if log: log.debug('PARSE supervisord:%s' % line)
if log:
log.debug('PARSE supervisord:%s' % line)
line_items = line.split(' ', 3)
timestamp = ' '.join(line_items[:2])
timestamp_parts = timestamp.split(',')
@ -52,7 +54,7 @@ def parse_supervisord(log, line):
event_type = line_items[2]
msg = line_items[3]
if event_type in SUPERVISORD_LEVELS:
alert_type=ALERT_TYPES_MAPPING.get(event_type, 'info')
alert_type = ALERT_TYPES_MAPPING.get(event_type, 'info')
if alert_type == 'info' and 'success' in msg:
alert_type = 'success'
event = dict(timestamp=date,
@ -62,7 +64,8 @@ def parse_supervisord(log, line):
program_result = program_matcher.match(msg)
if program_result:
event['event_object'] = program_result.groupdict()['program']
if log: log.debug('RESULT supervisord:%s' %event)
if log:
log.debug('RESULT supervisord:%s' % event)
return [event]
else:
return None
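To make the split-and-map logic above concrete, here is how a typical supervisord line breaks down; the sample line is made up, and the mapping dict is abbreviated to the "CRIT" entry visible in this diff plus one hypothetical entry:

line = "2014-07-01 14:27:12,345 INFO success: collector entered RUNNING state"

line_items = line.split(' ', 3)
timestamp = ' '.join(line_items[:2])   # "2014-07-01 14:27:12,345"
event_type = line_items[2]             # "INFO"
msg = line_items[3]                    # "success: collector entered RUNNING state"

alert_type = {"CRIT": "error", "ERRO": "error"}.get(event_type, 'info')
if alert_type == 'info' and 'success' in msg:
    alert_type = 'success'
print("%s | %s | %s" % (timestamp, event_type, alert_type))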

View File

@ -1,5 +1,6 @@
# std
import yaml
try:
from yaml import CLoader as Loader
except ImportError:
@ -37,13 +38,13 @@ JMX_CHECKS = [
]
JMX_COLLECT_COMMAND = 'collect'
JMX_LIST_COMMANDS = {
'list_everything': 'List every attributes available that has a type supported by JMXFetch',
'list_collected_attributes': 'List attributes that will actually be collected by your current instances configuration',
'list_matching_attributes': 'List attributes that match at least one of your instances configuration',
'list_not_matching_attributes': "List attributes that don't match any of your instances configuration",
'list_limited_attributes': "List attributes that do match one of your instances configuration but that are not being collected because it would exceed the number of metrics that can be collected",
JMX_COLLECT_COMMAND: "Start the collection of metrics based on your current configuration and display them in the console"
}
'list_everything': 'List every attributes available that has a type supported by JMXFetch',
'list_collected_attributes': 'List attributes that will actually be collected by your current instances configuration',
'list_matching_attributes': 'List attributes that match at least one of your instances configuration',
'list_not_matching_attributes': "List attributes that don't match any of your instances configuration",
'list_limited_attributes': "List attributes that do match one of your instances configuration but that are not being collected because it would exceed the number of metrics that can be collected",
JMX_COLLECT_COMMAND: "Start the collection of metrics based on your current configuration and display them in the console"
}
PYTHON_JMX_STATUS_FILE = 'jmx_status_python.yaml'
@ -55,7 +56,6 @@ class InvalidJMXConfiguration(Exception):
class JMXFetch(object):
pid_file = PidFile("jmxfetch")
pid_file_path = pid_file.get_path()
@ -64,7 +64,8 @@ class JMXFetch(object):
default_check_frequency, command=None, checks_list=None, reporter=None):
try:
command = command or JMX_COLLECT_COMMAND
jmx_checks, invalid_checks, java_bin_path, java_options = JMXFetch.should_run(confd_path, checks_list)
jmx_checks, invalid_checks, java_bin_path, java_options = JMXFetch.should_run(
confd_path, checks_list)
if len(invalid_checks) > 0:
try:
JMXFetch.write_status_file(invalid_checks)
@ -77,8 +78,8 @@ class JMXFetch(object):
JMXFetch.stop()
JMXFetch.start(confd_path, agentConfig, logging_config,
java_bin_path, java_options, default_check_frequency,
jmx_checks, command, reporter)
java_bin_path, java_options, default_check_frequency,
jmx_checks, command, reporter)
return True
except Exception:
log.exception("Error while initiating JMXFetch")
@ -86,7 +87,7 @@ class JMXFetch(object):
@classmethod
def write_status_file(cls, invalid_checks):
data = {
'timestamp': time.time(),
'timestamp': time.time(),
'invalid_checks': invalid_checks
}
stream = file(os.path.join(tempfile.gettempdir(), PYTHON_JMX_STATUS_FILE), 'w')
@ -138,7 +139,8 @@ class JMXFetch(object):
continue
try:
is_jmx, check_java_bin_path, check_java_options = JMXFetch.is_jmx_check(check_config, check_name, checks_list)
is_jmx, check_java_bin_path, check_java_options = JMXFetch.is_jmx_check(
check_config, check_name, checks_list)
if is_jmx:
jmx_checks.append(filename)
if java_bin_path is None and check_java_bin_path is not None:
@ -170,11 +172,13 @@ class JMXFetch(object):
if is_jmx:
instances = check_config.get('instances', [])
if type(instances) != list or len(instances) == 0:
raise InvalidJMXConfiguration('You need to have at least one instance defined in the YAML file for this check')
raise InvalidJMXConfiguration(
'You need to have at least one instance defined in the YAML file for this check')
for inst in instances:
if type(inst) != dict:
raise InvalidJMXConfiguration("Each instance should be a dictionary. %s" % LINK_TO_DOC)
raise InvalidJMXConfiguration(
"Each instance should be a dictionary. %s" % LINK_TO_DOC)
host = inst.get('host', None)
port = inst.get('port', None)
conf = inst.get('conf', init_config.get('conf', None))
@ -184,22 +188,28 @@ class JMXFetch(object):
raise InvalidJMXConfiguration("A numeric port must be specified")
if conf is None:
log.warning("%s doesn't have a 'conf' section. Only basic JVM metrics will be collected. %s" % (inst, LINK_TO_DOC))
log.warning(
"%s doesn't have a 'conf' section. Only basic JVM metrics will be collected. %s" % (
inst, LINK_TO_DOC))
else:
if type(conf) != list or len(conf) == 0:
raise InvalidJMXConfiguration("'conf' section should be a list of configurations %s" % LINK_TO_DOC)
raise InvalidJMXConfiguration(
"'conf' section should be a list of configurations %s" % LINK_TO_DOC)
for config in conf:
include = config.get('include', None)
if include is None:
raise InvalidJMXConfiguration("Each configuration must have an 'include' section. %s" % LINK_TO_DOC)
raise InvalidJMXConfiguration(
"Each configuration must have an 'include' section. %s" % LINK_TO_DOC)
if type(include) != dict:
raise InvalidJMXConfiguration("'include' section must be a dictionary %s" % LINK_TO_DOC)
raise InvalidJMXConfiguration(
"'include' section must be a dictionary %s" % LINK_TO_DOC)
if java_bin_path is None:
if init_config and init_config.get('java_bin_path'):
# We get the java bin path from the yaml file for backward compatibility purposes
# We get the java bin path from the yaml file for backward compatibility
# purposes
java_bin_path = init_config.get('java_bin_path')
else:
@ -235,12 +245,15 @@ class JMXFetch(object):
return True
except Exception, e:
if "Errno 3" not in str(e):
log.debug("Couldn't determine if JMXFetch is running. We suppose it's not. %s" % str(e))
log.debug(
"Couldn't determine if JMXFetch is running. We suppose it's not. %s" % str(
e))
return False
# Else we are on windows, we need another way to check if it's running
try:
import ctypes # Available from python2.5
import ctypes # Available from python2.5
kernel32 = ctypes.windll.kernel32
SYNCHRONIZE = 0x100000
@ -281,13 +294,16 @@ class JMXFetch(object):
@classmethod
def get_path_to_jmxfetch(cls):
if get_os() != 'windows':
return os.path.realpath(os.path.join(os.path.abspath(__file__), "..", "../collector/checks", "libs", JMX_FETCH_JAR_NAME))
return os.path.realpath(
os.path.join(os.path.abspath(__file__), "..", "../collector/checks", "libs",
JMX_FETCH_JAR_NAME))
return os.path.realpath(os.path.join(os.path.abspath(__file__), "..", "../../", "jmxfetch", JMX_FETCH_JAR_NAME))
return os.path.realpath(
os.path.join(os.path.abspath(__file__), "..", "../../", "jmxfetch", JMX_FETCH_JAR_NAME))
@classmethod
def start(cls, confd_path, agentConfig, logging_config, path_to_java, java_run_opts,
default_check_frequency, jmx_checks, command, reporter=None):
default_check_frequency, jmx_checks, command, reporter=None):
statsd_port = agentConfig.get('monstatsd_port', "8125")
if reporter is None:
@ -302,16 +318,21 @@ class JMXFetch(object):
path_to_status_file = os.path.join(tempfile.gettempdir(), "jmx_status.yaml")
subprocess_args = [
path_to_java, # Path to the java bin
path_to_java, # Path to the java bin
'-jar',
r"%s" % path_to_jmxfetch, # Path to the jmxfetch jar
'--check_period', str(default_check_frequency * 1000), # Period of the main loop of jmxfetch in ms
'--conf_directory', r"%s" % confd_path, # Path of the conf.d directory that will be read by jmxfetch,
'--log_level', JAVA_LOGGING_LEVEL.get(logging_config.get("log_level"), "INFO"), # Log Level: Mapping from Python log level to log4j log levels
'--log_location', r"%s" % logging_config.get('jmxfetch_log_file'), # Path of the log file
'--reporter', reporter, # Reporter to use
'--status_location', r"%s" % path_to_status_file, # Path to the status file to write
command, # Name of the command
r"%s" % path_to_jmxfetch, # Path to the jmxfetch jar
# Period of the main loop of jmxfetch in ms
'--check_period', str(default_check_frequency * 1000),
# Path of the conf.d directory that will be read by jmxfetch,
'--conf_directory', r"%s" % confd_path,
# Log Level: Mapping from Python log level to log4j log levels
'--log_level', JAVA_LOGGING_LEVEL.get(logging_config.get("log_level"), "INFO"),
# Path of the log file
'--log_location', r"%s" % logging_config.get('jmxfetch_log_file'),
'--reporter', reporter, # Reporter to use
# Path to the status file to write
'--status_location', r"%s" % path_to_status_file,
command, # Name of the command
]
subprocess_args.insert(3, '--check')
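For readers unfamiliar with the pattern, launching a jar with an argument vector like the one assembled above reduces to a plain subprocess call; every path and value below is a placeholder, not an agent default:

import subprocess

subprocess_args = [
    '/usr/bin/java',                           # path_to_java
    '-jar', '/opt/agent/jmxfetch.jar',         # path_to_jmxfetch
    '--check_period', '15000',                 # main loop period in ms
    '--conf_directory', '/etc/agent/conf.d',   # where the check YAML files live
    '--log_level', 'INFO',
    '--log_location', '/var/log/agent/jmxfetch.log',
    '--reporter', 'console',
    'collect',                                 # the jmxfetch command to run
]
proc = subprocess.Popen(subprocess_args, close_fds=True)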
@ -320,7 +341,7 @@ class JMXFetch(object):
if java_run_opts:
for opt in java_run_opts.split():
subprocess_args.insert(1,opt)
subprocess_args.insert(1, opt)
log.info("Running %s" % " ".join(subprocess_args))
if reporter != "console":

View File

@ -38,7 +38,7 @@ def load_qualified_module(full_module_name, path=None):
def module_name_for_filename(filename):
"""Given the name of a Python file, find an appropropriate module name.
This involves determining whether the file is within a package, and
determining the name of same."""
all_segments = filename.split(os.sep)
@ -54,7 +54,7 @@ def module_name_for_filename(filename):
def get_module(name):
"""Given either an absolute path to a Python file or a module name, load
and return a Python module.
If the module is already loaded, takes no action."""
if name.startswith('/'):
basename, modulename = module_name_for_filename(name)

View File

@ -22,6 +22,7 @@ RECENT_POINT_THRESHOLD_DEFAULT = 3600
class Aggregator(object):
"""
Abstract metric aggregator class.
"""
@ -57,7 +58,7 @@ class Aggregator(object):
def packets_per_second(self, interval):
if interval == 0:
return 0
return round(float(self.count)/interval, 2)
return round(float(self.count) / interval, 2)
def submit_metric(self, name, value, mtype, dimensions=None, hostname=None, device_name=None, timestamp=None,
sample_rate=1):
@ -108,12 +109,14 @@ class Aggregator(object):
class MetricsBucketAggregator(Aggregator):
"""
A metric aggregator class.
"""
def __init__(self, hostname, interval=1.0, expiry_seconds=300, recent_point_threshold=None):
super(MetricsBucketAggregator, self).__init__(hostname, interval, expiry_seconds, recent_point_threshold)
super(MetricsBucketAggregator, self).__init__(
hostname, interval, expiry_seconds, recent_point_threshold)
self.metric_by_bucket = {}
self.last_sample_time_by_context = {}
self.current_bucket = None
@ -171,7 +174,8 @@ class MetricsBucketAggregator(Aggregator):
# (Set, Gauge, Histogram) do not report if no data is submitted
for context, last_sample_time in sample_time_by_context.items():
if last_sample_time < expiry_timestamp:
log.debug("%s hasn't been submitted in %ss. Expiring." % (context, self.expiry_seconds))
log.debug("%s hasn't been submitted in %ss. Expiring." %
(context, self.expiry_seconds))
self.last_sample_time_by_context.pop(context, None)
else:
# The expiration currently only applies to Counters
@ -197,7 +201,8 @@ class MetricsBucketAggregator(Aggregator):
for context, metric in metric_by_context.items():
if metric.last_sample_time < expiry_timestamp:
# This should never happen
log.warning("%s hasn't been submitted in %ss. Expiring." % (context, self.expiry_seconds))
log.warning("%s hasn't been submitted in %ss. Expiring." %
(context, self.expiry_seconds))
not_sampled_in_this_bucket.pop(context, None)
self.last_sample_time_by_context.pop(context, None)
else:
@ -205,21 +210,24 @@ class MetricsBucketAggregator(Aggregator):
if isinstance(metric, Counter):
self.last_sample_time_by_context[context] = metric.last_sample_time
not_sampled_in_this_bucket.pop(context, None)
# We need to account for Metrics that have not expired and were not flushed for this bucket
# We need to account for Metrics that have not expired and were not
# flushed for this bucket
self.create_empty_metrics(not_sampled_in_this_bucket, expiry_timestamp, bucket_start_timestamp,
metrics)
del self.metric_by_bucket[bucket_start_timestamp]
else:
# Even if there are no metrics in this flush, there may be some non-expired counters
# We should only create these non-expired metrics if we've passed an interval since the last flush
# We should only create these non-expired metrics if we've passed an
# interval since the last flush
if flush_cutoff_time >= self.last_flush_cutoff_time + self.interval:
self.create_empty_metrics(self.last_sample_time_by_context.copy(), expiry_timestamp,
flush_cutoff_time-self.interval, metrics)
flush_cutoff_time - self.interval, metrics)
# Log a warning regarding metrics with old timestamps being submitted
if self.num_discarded_old_points > 0:
log.warn('%s points were discarded as a result of having an old timestamp' % self.num_discarded_old_points)
log.warn('%s points were discarded as a result of having an old timestamp' %
self.num_discarded_old_points)
self.num_discarded_old_points = 0
# Save some stats.
@ -233,12 +241,14 @@ class MetricsBucketAggregator(Aggregator):
class MetricsAggregator(Aggregator):
"""
A metric aggregator class.
"""
def __init__(self, hostname, interval=1.0, expiry_seconds=300, recent_point_threshold=None):
super(MetricsAggregator, self).__init__(hostname, interval, expiry_seconds, recent_point_threshold)
super(MetricsAggregator, self).__init__(
hostname, interval, expiry_seconds, recent_point_threshold)
self.metrics = {}
self.metric_type_to_class = {
'g': Gauge,
@ -294,14 +304,16 @@ class MetricsAggregator(Aggregator):
metrics = []
for context, metric in self.metrics.items():
if metric.last_sample_time < expiry_timestamp:
log.debug("%s hasn't been submitted in %ss. Expiring." % (context, self.expiry_seconds))
log.debug("%s hasn't been submitted in %ss. Expiring." %
(context, self.expiry_seconds))
del self.metrics[context]
else:
metrics += metric.flush(timestamp, self.interval)
# Log a warning regarding metrics with old timestamps being submitted
if self.num_discarded_old_points > 0:
log.warn('%s points were discarded as a result of having an old timestamp' % self.num_discarded_old_points)
log.warn('%s points were discarded as a result of having an old timestamp' %
self.num_discarded_old_points)
self.num_discarded_old_points = 0
# Save some stats.
@ -309,4 +321,3 @@ class MetricsAggregator(Aggregator):
self.total_count += self.count
self.count = 0
return metrics
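A usage sketch for the aggregators above, based only on the constructor and submit_metric signatures visible in this diff; the metric names, dimensions, import path and the flush() call are assumptions about how the class is driven:

# Import path is an assumption about the package layout
# from monagent.common.aggregator import MetricsAggregator

agg = MetricsAggregator('agent-host-01', interval=1.0, expiry_seconds=300)
agg.submit_metric('web.request_time', 0.25, 'g',        # 'g' maps to Gauge above
                  dimensions={'service': 'web'}, hostname='agent-host-01')
agg.submit_metric('web.request_time', 0.35, 'g',
                  dimensions={'service': 'web'}, hostname='agent-host-01')
for measurement in agg.flush():
    print(measurement)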

View File

@ -102,6 +102,7 @@ def get_ntp_info():
class AgentStatus(object):
"""
A small class used to load and save status messages to the filesystem.
"""
@ -162,7 +163,7 @@ class AgentStatus(object):
style("Status date", *styles),
style("%s (%ss ago)" %
(self.created_at.strftime('%Y-%m-%d %H:%M:%S'),
self.created_seconds_ago()), *styles)
self.created_seconds_ago()), *styles)
)
]
@ -188,15 +189,12 @@ class AgentStatus(object):
def _not_running_message(cls):
lines = cls._title_lines() + [
style(" %s is not running." % cls.NAME, 'red'),
style(""" You can get more details in the logs:
%s""" % logger_info(), 'red'),
style(""" You can get more details in the logs: %s""" % logger_info(), 'red'),
"",
""
]
return "\n".join(lines)
@classmethod
def remove_latest_status(cls):
log.debug("Removing latest status")
@ -205,7 +203,6 @@ class AgentStatus(object):
except OSError:
pass
@classmethod
def load_latest_status(cls):
try:
@ -218,7 +215,6 @@ class AgentStatus(object):
log.info("Couldn't load latest status")
return None
@classmethod
def print_latest_status(cls, verbose=False):
cls.verbose = verbose
@ -358,7 +354,7 @@ class CollectorStatus(AgentStatus):
confd_path = config.get_confd_path(osname)
except config.PathNotFound:
confd_path = 'Not found'
try:
checksd_path = config.get_checksd_path(osname)
except config.PathNotFound:
@ -393,7 +389,7 @@ class CollectorStatus(AgentStatus):
if cs.init_failed_error:
check_lines.append(" - initialize check class [%s]: %s" %
(style(STATUS_ERROR, 'red'),
repr(cs.init_failed_error)))
repr(cs.init_failed_error)))
if self.verbose and cs.init_failed_traceback:
check_lines.extend(' ' + line for line in
cs.init_failed_traceback.split('\n'))
@ -417,13 +413,15 @@ class CollectorStatus(AgentStatus):
warn = warning.split('\n')
if not len(warn):
continue
check_lines.append(u" %s: %s" % (style("Warning", 'yellow'), warn[0]))
check_lines.append(u" %s: %s" %
(style("Warning", 'yellow'), warn[0]))
check_lines.extend(u" %s" % l for l in warn[1:])
if self.verbose and s.traceback is not None:
check_lines.extend(' ' + line for line in s.traceback.split('\n'))
check_lines += [
" - Collected %s metrics & %s events" % (cs.metric_count, cs.event_count),
" - Collected %s metrics & %s events" % (
cs.metric_count, cs.event_count),
]
if cs.library_versions is not None:
@ -481,9 +479,11 @@ class CollectorStatus(AgentStatus):
'has_warnings': s.has_warnings(),
}
if s.has_error():
status_info['checks'][cs.name]['instances'][s.instance_id]['error'] = s.error
status_info['checks'][cs.name]['instances'][
s.instance_id]['error'] = s.error
if s.has_warnings():
status_info['checks'][cs.name]['instances'][s.instance_id]['warnings'] = s.warnings
status_info['checks'][cs.name]['instances'][
s.instance_id]['warnings'] = s.warnings
status_info['checks'][cs.name]['metric_count'] = cs.metric_count
status_info['checks'][cs.name]['event_count'] = cs.event_count
@ -503,7 +503,7 @@ class CollectorStatus(AgentStatus):
status_info['confd_path'] = config.get_confd_path(osname)
except config.PathNotFound:
status_info['confd_path'] = 'Not found'
try:
status_info['checksd_path'] = config.get_checksd_path(osname)
except config.PathNotFound:
@ -554,7 +554,7 @@ class ForwarderStatus(AgentStatus):
NAME = 'Forwarder'
def __init__(self, queue_length=0, queue_size=0, flush_count=0, transactions_received=0,
transactions_flushed=0):
transactions_flushed=0):
AgentStatus.__init__(self)
self.queue_length = queue_length
self.queue_size = queue_size
@ -587,10 +587,12 @@ class ForwarderStatus(AgentStatus):
def get_jmx_instance_status(instance_name, status, message, metric_count):
if status == STATUS_ERROR:
instance_status = InstanceStatus(instance_name, STATUS_ERROR, error=message, metric_count=metric_count)
instance_status = InstanceStatus(
instance_name, STATUS_ERROR, error=message, metric_count=metric_count)
elif status == STATUS_WARNING:
instance_status = InstanceStatus(instance_name, STATUS_WARNING, warnings=[message], metric_count=metric_count)
instance_status = InstanceStatus(
instance_name, STATUS_WARNING, warnings=[message], metric_count=metric_count)
elif status == STATUS_OK:
instance_status = InstanceStatus(instance_name, STATUS_OK, metric_count=metric_count)
@ -616,7 +618,7 @@ def get_jmx_status():
- One generated by jmxfetch that returns information about the collection of metrics;
its format is as follows:
###
timestamp: 1391037347435
checks:
@ -632,7 +634,8 @@ def get_jmx_status():
java_status_path = os.path.join(tempfile.gettempdir(), "jmx_status.yaml")
python_status_path = os.path.join(tempfile.gettempdir(), "jmx_status_python.yaml")
if not os.path.exists(java_status_path) and not os.path.exists(python_status_path):
log.debug("There is no jmx_status file at: %s or at: %s" % (java_status_path, python_status_path))
log.debug("There is no jmx_status file at: %s or at: %s" %
(java_status_path, python_status_path))
return []
check_data = defaultdict(lambda: defaultdict(list))
@ -640,12 +643,13 @@ def get_jmx_status():
if os.path.exists(java_status_path):
java_jmx_stats = yaml.load(file(java_status_path))
status_age = time.time() - java_jmx_stats.get('timestamp')/1000 # JMX timestamp is saved in milliseconds
# JMX timestamp is saved in milliseconds
status_age = time.time() - java_jmx_stats.get('timestamp') / 1000
jmx_checks = java_jmx_stats.get('checks', {})
if status_age > 60:
check_statuses.append(CheckStatus("jmx", [InstanceStatus(0, STATUS_ERROR,
error="JMXfetch didn't return any metrics during the last minute")],
error="JMXfetch didn't return any metrics during the last minute")],
0, 0))
else:
@ -658,7 +662,7 @@ def get_jmx_status():
check_data[check_name]['statuses'].append(get_jmx_instance_status(instance_name, status,
message, metric_count))
check_data[check_name]['metric_count'].append(metric_count)
for check_name, instances in jmx_checks.get('initialized_checks', {}).iteritems():
for info in instances:
message = info.get('message', None)
@ -670,7 +674,8 @@ def get_jmx_status():
check_data[check_name]['metric_count'].append(metric_count)
for check_name, data in check_data.iteritems():
check_status = CheckStatus(check_name, data['statuses'], sum(data['metric_count']), 0)
check_status = CheckStatus(
check_name, data['statuses'], sum(data['metric_count']), 0)
check_statuses.append(check_status)
if os.path.exists(python_status_path):

View File

@ -59,6 +59,7 @@ def get_parsed_args():
def get_version():
return __version__
def skip_leading_wsp(f):
"Works on a file, returns a file-like object"
return StringIO("\n".join(map(string.strip, f.readlines())))
@ -177,7 +178,8 @@ def get_config_path(cfg_path=None, os_name=None):
return os.path.join(path, AGENT_CONF)
# If all searches fail, exit the agent with an error
sys.stderr.write("Please supply a configuration file at %s or in the directory where the Agent is currently deployed.\n" % bad_path)
sys.stderr.write(
"Please supply a configuration file at %s or in the directory where the Agent is currently deployed.\n" % bad_path)
sys.exit(3)
@ -234,14 +236,16 @@ def get_config(parse_args=True, cfg_path=None, options=None):
# Concerns only Windows
if config.has_option('Main', 'use_web_info_page'):
agent_config['use_web_info_page'] = config.get('Main', 'use_web_info_page').lower() in ("yes", "true")
agent_config['use_web_info_page'] = config.get(
'Main', 'use_web_info_page').lower() in ("yes", "true")
else:
agent_config['use_web_info_page'] = True
# local traffic only? Default to no
agent_config['non_local_traffic'] = False
if config.has_option('Main', 'non_local_traffic'):
agent_config['non_local_traffic'] = config.get('Main', 'non_local_traffic').lower() in ("yes", "true")
agent_config['non_local_traffic'] = config.get(
'Main', 'non_local_traffic').lower() in ("yes", "true")
if config.has_option('Main', 'check_freq'):
try:
@ -267,14 +271,15 @@ def get_config(parse_args=True, cfg_path=None, options=None):
else:
agent_config[key] = value
#Forwarding to external statsd server
# Forwarding to external statsd server
if config.has_option('Main', 'statsd_forward_host'):
agent_config['statsd_forward_host'] = config.get('Main', 'statsd_forward_host')
if config.has_option('Main', 'statsd_forward_port'):
agent_config['statsd_forward_port'] = int(config.get('Main', 'statsd_forward_port'))
# normalize 'yes'/'no' to boolean
monstatsd_defaults['monstatsd_normalize'] = _is_affirmative(monstatsd_defaults['monstatsd_normalize'])
monstatsd_defaults['monstatsd_normalize'] = _is_affirmative(
monstatsd_defaults['monstatsd_normalize'])
# Optional config
# FIXME not the prettiest code ever...
@ -298,7 +303,8 @@ def get_config(parse_args=True, cfg_path=None, options=None):
# Older version, single log support
log_path = config.get("Main", "dogstream_log")
if config.has_option("Main", "dogstream_line_parser"):
agent_config["dogstreams"] = ':'.join([log_path, config.get("Main", "dogstream_line_parser")])
agent_config["dogstreams"] = ':'.join(
[log_path, config.get("Main", "dogstream_line_parser")])
else:
agent_config["dogstreams"] = log_path
@ -314,13 +320,15 @@ def get_config(parse_args=True, cfg_path=None, options=None):
agent_config['WMI'][key] = value
if config.has_option("Main", "limit_memory_consumption") and \
config.get("Main", "limit_memory_consumption") is not None:
agent_config["limit_memory_consumption"] = int(config.get("Main", "limit_memory_consumption"))
config.get("Main", "limit_memory_consumption") is not None:
agent_config["limit_memory_consumption"] = int(
config.get("Main", "limit_memory_consumption"))
else:
agent_config["limit_memory_consumption"] = None
if config.has_option("Main", "skip_ssl_validation"):
agent_config["skip_ssl_validation"] = _is_affirmative(config.get("Main", "skip_ssl_validation"))
agent_config["skip_ssl_validation"] = _is_affirmative(
config.get("Main", "skip_ssl_validation"))
agent_config['Api'] = get_mon_api_config(config)
@ -333,7 +341,8 @@ def get_config(parse_args=True, cfg_path=None, options=None):
sys.exit(2)
except ConfigParser.NoOptionError, e:
sys.stderr.write('There are some items missing from your config file, but nothing fatal [%s]' % e)
sys.stderr.write(
'There are some items missing from your config file, but nothing fatal [%s]' % e)
# Storing proxy settings in the agent_config
agent_config['proxy_settings'] = get_proxy(agent_config)
@ -357,11 +366,12 @@ def set_win32_cert_path():
else:
cur_path = os.path.dirname(__file__)
crt_path = os.path.join(cur_path, 'packaging', 'mon-agent', 'win32',
'install_files', 'ca-certificates.crt')
'install_files', 'ca-certificates.crt')
import tornado.simple_httpclient
log.info("Windows certificate path: %s" % crt_path)
tornado.simple_httpclient._DEFAULT_CA_CERTS = crt_path
def get_proxy(agent_config, use_system_settings=False):
proxy_settings = {}
@ -378,7 +388,8 @@ def get_proxy(agent_config, use_system_settings=False):
proxy_settings['user'] = agent_config.get('proxy_user', None)
proxy_settings['password'] = agent_config.get('proxy_password', None)
proxy_settings['system_settings'] = False
log.debug("Proxy Settings: %s:%s@%s:%s" % (proxy_settings['user'], "*****", proxy_settings['host'], proxy_settings['port']))
log.debug("Proxy Settings: %s:%s@%s:%s" %
(proxy_settings['user'], "*****", proxy_settings['host'], proxy_settings['port']))
return proxy_settings
# If no proxy configuration was specified in agent.conf
@ -404,11 +415,13 @@ def get_proxy(agent_config, use_system_settings=False):
if len(creds) == 2:
proxy_settings['password'] = creds[1]
log.debug("Proxy Settings: %s:%s@%s:%s" % (proxy_settings['user'], "*****", proxy_settings['host'], proxy_settings['port']))
log.debug("Proxy Settings: %s:%s@%s:%s" % (
proxy_settings['user'], "*****", proxy_settings['host'], proxy_settings['port']))
return proxy_settings
except Exception, e:
log.debug("Error while trying to fetch proxy settings using urllib %s. Proxy is probably not set" % str(e))
log.debug(
"Error while trying to fetch proxy settings using urllib %s. Proxy is probably not set" % str(e))
log.debug("No proxy configured")
@ -486,12 +499,14 @@ def check_yaml(conf_path):
valid_instances = False
break
if not valid_instances:
raise Exception('You need to have at least one instance defined in the YAML file for this check')
raise Exception(
'You need to have at least one instance defined in the YAML file for this check')
else:
return check_config
finally:
f.close()
def load_check_directory(agent_config):
''' Return the initialized checks from checks_d, and a mapping of checks that failed to
initialize. Only checks that have a configuration
@ -514,11 +529,13 @@ def load_check_directory(agent_config):
try:
confd_path = get_confd_path(osname)
except PathNotFound, e:
log.error("No conf.d folder found at '%s' or in the directory where the Agent is currently deployed.\n" % e.args[0])
log.error(
"No conf.d folder found at '%s' or in the directory where the Agent is currently deployed.\n" % e.args[0])
sys.exit(3)
# Start JMXFetch if needed
JMXFetch.init(confd_path, agent_config, get_logging_config(), DEFAULT_CHECK_FREQUENCY, JMX_COLLECT_COMMAND)
JMXFetch.init(confd_path, agent_config, get_logging_config(),
DEFAULT_CHECK_FREQUENCY, JMX_COLLECT_COMMAND)
# For backwards-compatability with old style checks, we have to load every
# checks_d module and check for a corresponding config OR check if the old
@ -529,7 +546,8 @@ def load_check_directory(agent_config):
for check in itertools.chain(*checks_paths):
check_name = os.path.basename(check).split('.')[0]
if check_name in initialized_checks or check_name in init_failed_checks:
log.debug('Skipping check %s because it has already been loaded from another location', check)
log.debug(
'Skipping check %s because it has already been loaded from another location', check)
continue
try:
check_module = imp.load_source('checksd_%s' % check_name, check)
@ -540,9 +558,9 @@ def load_check_directory(agent_config):
conf_path = os.path.join(confd_path, '%s.yaml' % check_name)
if os.path.exists(conf_path):
# There is a configuration file for that check but the module can't be imported
init_failed_checks[check_name] = {'error':e, 'traceback':traceback_message}
init_failed_checks[check_name] = {'error': e, 'traceback': traceback_message}
log.exception('Unable to import check module %s.py from checks_d' % check_name)
else: # There is no conf for that check. Let's not spam the logs for it.
else: # There is no conf for that check. Let's not spam the logs for it.
log.debug('Unable to import check module %s.py from checks_d' % check_name)
continue
@ -571,7 +589,7 @@ def load_check_directory(agent_config):
except Exception, e:
log.exception("Unable to parse yaml config in %s" % conf_path)
traceback_message = traceback.format_exc()
init_failed_checks[check_name] = {'error':e, 'traceback':traceback_message}
init_failed_checks[check_name] = {'error': e, 'traceback': traceback_message}
continue
elif hasattr(check_class, 'parse_agent_config'):
# FIXME: Remove this check once all old-style checks are gone
@ -618,7 +636,7 @@ def load_check_directory(agent_config):
except Exception, e:
log.exception('Unable to initialize check %s' % check_name)
traceback_message = traceback.format_exc()
init_failed_checks[check_name] = {'error':e, 'traceback':traceback_message}
init_failed_checks[check_name] = {'error': e, 'traceback': traceback_message}
else:
initialized_checks[check_name] = c
@ -633,8 +651,8 @@ def load_check_directory(agent_config):
log.info('initialized checks_d checks: %s' % initialized_checks.keys())
log.info('initialization failed checks_d checks: %s' % init_failed_checks.keys())
return {'initialized_checks':initialized_checks.values(),
'init_failed_checks':init_failed_checks,
return {'initialized_checks': initialized_checks.values(),
'init_failed_checks': init_failed_checks,
}
@ -644,6 +662,7 @@ def load_check_directory(agent_config):
def get_log_date_format():
return "%Y-%m-%d %H:%M:%S %Z"
def get_log_format(logger_name):
if get_os() != 'windows':
return '%%(asctime)s | %%(levelname)s | %s | %%(name)s(%%(filename)s:%%(lineno)s) | %%(message)s' % logger_name
@ -713,10 +732,12 @@ def get_logging_config(cfg_path=None):
logging_config['log_level'] = levels.get(config.get('Main', 'log_level'))
if config.has_option('Main', 'log_to_syslog'):
logging_config['log_to_syslog'] = config.get('Main', 'log_to_syslog').strip().lower() in ['yes', 'true', 1]
logging_config['log_to_syslog'] = config.get(
'Main', 'log_to_syslog').strip().lower() in ['yes', 'true', 1]
if config.has_option('Main', 'log_to_event_viewer'):
logging_config['log_to_event_viewer'] = config.get('Main', 'log_to_event_viewer').strip().lower() in ['yes', 'true', 1]
logging_config['log_to_event_viewer'] = config.get(
'Main', 'log_to_event_viewer').strip().lower() in ['yes', 'true', 1]
if config.has_option('Main', 'syslog_host'):
host = config.get('Main', 'syslog_host').strip()
@ -733,7 +754,8 @@ def get_logging_config(cfg_path=None):
logging_config['syslog_port'] = None
if config.has_option('Main', 'disable_file_logging'):
logging_config['disable_file_logging'] = config.get('Main', 'disable_file_logging').strip().lower() in ['yes', 'true', 1]
logging_config['disable_file_logging'] = config.get(
'Main', 'disable_file_logging').strip().lower() in ['yes', 'true', 1]
else:
logging_config['disable_file_logging'] = False
@ -760,7 +782,8 @@ def initialize_logging(logger_name):
# make sure the log directory is writeable
# NOTE: the entire directory needs to be writable so that rotation works
if os.access(os.path.dirname(log_file), os.R_OK | os.W_OK):
file_handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=LOGGING_MAX_BYTES, backupCount=1)
file_handler = logging.handlers.RotatingFileHandler(
log_file, maxBytes=LOGGING_MAX_BYTES, backupCount=1)
formatter = logging.Formatter(get_log_format(logger_name), get_log_date_format())
file_handler.setFormatter(formatter)
@ -783,7 +806,8 @@ def initialize_logging(logger_name):
sys_log_addr = "/var/run/syslog"
handler = SysLogHandler(address=sys_log_addr, facility=SysLogHandler.LOG_DAEMON)
handler.setFormatter(logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
handler.setFormatter(
logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
root_log = logging.getLogger()
root_log.addHandler(handler)
except Exception, e:
@ -794,8 +818,10 @@ def initialize_logging(logger_name):
if get_os() == 'windows' and logging_config['log_to_event_viewer']:
try:
from logging.handlers import NTEventLogHandler
nt_event_handler = NTEventLogHandler(logger_name,get_win32service_file('windows', 'win32service.pyd'), 'Application')
nt_event_handler.setFormatter(logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
nt_event_handler = NTEventLogHandler(
logger_name, get_win32service_file('windows', 'win32service.pyd'), 'Application')
nt_event_handler.setFormatter(
logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
nt_event_handler.setLevel(logging.ERROR)
app_log = logging.getLogger(logger_name)
app_log.addHandler(nt_event_handler)
@ -834,8 +860,7 @@ def get_mon_api_config(config):
dim_list = [dim.split(':') for dim in config.get('Main', 'dimensions').split(',')]
mon_api_config['dimensions'] = {key.strip(): value.strip() for key, value in dim_list}
except ValueError:
mon_api_config['dimensions'] = { }
mon_api_config['dimensions'] = {}
if config.has_section("Api"):
options = {"url": config.get,

View File

@ -2,11 +2,11 @@
***
Modified generic daemon class
***
Author: http://www.jejik.com/articles/2007/02/a_simple_unix_linux_daemon_in_python/
www.boxedice.com
www.datadoghq.com
License: http://creativecommons.org/licenses/by-sa/3.0/
"""
@ -23,6 +23,7 @@ log = logging.getLogger(__name__)
class AgentSupervisor(object):
''' A simple supervisor that restarts a child process on expected auto-restarts
'''
RESTART_EXIT_STATUS = 5
@ -71,40 +72,42 @@ class AgentSupervisor(object):
class Daemon(object):
"""
A generic daemon class.
Usage: subclass the Daemon class and override the run() method
"""
def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull, stderr=os.devnull, autorestart=False):
self.autorestart = autorestart
self.stdin = stdin
self.stdout = stdout
self.stderr = stderr
self.pidfile = pidfile
def daemonize(self):
"""
Do the UNIX double-fork magic, see Stevens' "Advanced
Do the UNIX double-fork magic, see Stevens' "Advanced
Programming in the UNIX Environment" for details (ISBN 0201563177)
http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
"""
try:
pid = os.fork()
try:
pid = os.fork()
if pid > 0:
# Exit first parent
sys.exit(0)
except OSError, e:
sys.exit(0)
except OSError, e:
msg = "fork #1 failed: %d (%s)" % (e.errno, e.strerror)
log.error(msg)
sys.stderr.write(msg + "\n")
sys.exit(1)
log.debug("Fork 1 ok")
log.debug("Fork 1 ok")
# Decouple from parent environment
os.chdir("/")
os.setsid()
os.chdir("/")
os.setsid()
if self.autorestart:
# Set up the supervisor callbacks and put a fork in it.
@ -123,7 +126,7 @@ class Daemon(object):
sys.stderr.write(msg + "\n")
sys.exit(1)
if sys.platform != 'darwin': # This block breaks on OS X
if sys.platform != 'darwin': # This block breaks on OS X
# Redirect standard file descriptors
sys.stdout.flush()
sys.stderr.flush()
@ -133,11 +136,11 @@ class Daemon(object):
os.dup2(si.fileno(), sys.stdin.fileno())
os.dup2(so.fileno(), sys.stdout.fileno())
os.dup2(se.fileno(), sys.stderr.fileno())
log.info("Daemon started")
# Write pidfile
atexit.register(self.delpid) # Make sure pid file is removed if we quit
atexit.register(self.delpid) # Make sure pid file is removed if we quit
pid = str(os.getpid())
try:
fp = open(self.pidfile, 'w+')
@ -150,11 +153,10 @@ class Daemon(object):
sys.stderr.write(msg + "\n")
sys.exit(1)
def start(self):
log.info("Starting daemon")
pid = self.pid()
if pid:
message = "pidfile %s already exists. Is it already running?\n"
log.error(message % self.pidfile)
@ -162,12 +164,11 @@ class Daemon(object):
sys.exit(1)
log.info("Daemon pidfile: %s" % self.pidfile)
self.daemonize()
self.daemonize()
self.run()
def stop(self):
log.info("Stopping daemon")
log.info("Stopping daemon")
pid = self.pid()
# Clear the pid file
@ -195,20 +196,18 @@ class Daemon(object):
message = "Pidfile %s does not exist. Not running?\n" % self.pidfile
log.info(message)
sys.stderr.write(message)
# A ValueError might occur if the PID file is empty but does actually exist
if os.path.exists(self.pidfile):
os.remove(self.pidfile)
return # Not an error in a restart
return # Not an error in a restart
def restart(self):
"Restart the daemon"
self.stop()
self.stop()
self.start()
def run(self):
"""
You should override this method when you subclass Daemon. It will be called after the process has been
@ -216,7 +215,6 @@ class Daemon(object):
"""
raise NotImplementedError
def info(self):
"""
You should override this method when you subclass Daemon. It will be
@ -224,7 +222,6 @@ class Daemon(object):
"""
raise NotImplementedError
def status(self):
"""
Get the status of the daemon. Exits with 0 if running, 1 if not.
@ -246,7 +243,8 @@ class Daemon(object):
os.kill(pid, 0)
except OSError, e:
if e.errno != errno.EPERM:
message = '%s pidfile contains pid %s, but no running process could be found' % (self.__class__.__name__, pid)
message = '%s pidfile contains pid %s, but no running process could be found' % (
self.__class__.__name__, pid)
exit_code = 1
else:
message = '%s is running with pid %s' % (self.__class__.__name__, pid)
@ -256,7 +254,6 @@ class Daemon(object):
sys.stdout.write(message + "\n")
sys.exit(exit_code)
def pid(self):
# Get the pid from the pidfile
try:
@ -269,7 +266,6 @@ class Daemon(object):
except ValueError:
return None
def delpid(self):
try:
os.remove(self.pidfile)

View File

@ -34,8 +34,10 @@ def http_emitter(message, log, url):
headers = post_headers(payload)
try:
proxy_handler = urllib2.ProxyHandler({}) # Make sure no proxy is autodetected for this localhost connection
opener = urllib2.build_opener(proxy_handler) # Should this be installed as the default opener and reused?
# Make sure no proxy is autodetected for this localhost connection
proxy_handler = urllib2.ProxyHandler({})
# Should this be installed as the default opener and reused?
opener = urllib2.build_opener(proxy_handler)
request = urllib2.Request(url, payload, headers)
response = opener.open(request)
try:

View File

@ -1,6 +1,7 @@
import json
import requests
class Keystone(object):
password_auth = {
@ -33,10 +34,11 @@ class Keystone(object):
# Make this a singleton class so we don't get the token every time
# the class is created
_instance = None
def __new__(cls, *args, **kwargs):
if not cls._instance:
cls._instance = super(Keystone, cls).__new__(
cls, *args, **kwargs)
cls, *args, **kwargs)
return cls._instance
def __init__(self, endpoint, user_id, password, project_name):
@ -57,7 +59,8 @@ class Keystone(object):
self.password_auth['auth']['scope']['project']['name'] = self.project_name
data = json.dumps(self.password_auth)
headers = {'Content-Type': 'application/json'}
response = requests.post(self.endpoint.rstrip('/') + '/auth/tokens', data=data, headers=headers)
response = requests.post(
self.endpoint.rstrip('/') + '/auth/tokens', data=data, headers=headers)
response.raise_for_status()
self.token = response.headers['X-Subject-Token']
return self.token
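Stripped of the singleton machinery, the request above is a standard Keystone v3 password authentication; a bare-bones equivalent, where the endpoint, credentials and the 'default' domain scope are placeholders:

import json

import requests


def get_token(endpoint, user_id, password, project_name):
    body = {'auth': {
        'identity': {'methods': ['password'],
                     'password': {'user': {'id': user_id, 'password': password}}},
        'scope': {'project': {'name': project_name,
                              'domain': {'id': 'default'}}}}}
    response = requests.post(endpoint.rstrip('/') + '/auth/tokens',
                             data=json.dumps(body),
                             headers={'Content-Type': 'application/json'})
    response.raise_for_status()
    # Keystone returns the token in a response header, not in the JSON body
    return response.headers['X-Subject-Token']

# token = get_token('http://keystone:5000/v3', 'some-user-id', 's3cret', 'monitoring')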

View File

@ -12,7 +12,8 @@ log = logging.getLogger(__name__)
# todo it would be best to implement a Measurement group/list container, it could then have methods for converting to json
# in the current setup both the emitter and the mon api are converting to json in for loops
# A Measurement is the standard format used to pass data from the collector and monstatsd to the forwarder
# A Measurement is the standard format used to pass data from the
# collector and monstatsd to the forwarder
Measurement = namedtuple('Measurement', ['name', 'timestamp', 'value', 'dimensions'])
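Since Measurement is a namedtuple, turning one into the dict/json shape an emitter needs is a one-liner; the field values here are invented:

import json
from collections import namedtuple

Measurement = namedtuple('Measurement', ['name', 'timestamp', 'value', 'dimensions'])

m = Measurement('cpu.idle_perc', 1404250032, 87.5, {'hostname': 'agent-host-01'})
print(json.dumps(m._asdict()))   # {"name": "cpu.idle_perc", "timestamp": 1404250032, ...}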
@ -23,6 +24,7 @@ class MetricTypes(object):
class Metric(object):
"""
A base metric class that accepts points, slices them into time intervals
and performs roll-ups within those intervals.
@ -38,6 +40,7 @@ class Metric(object):
class Gauge(Metric):
""" A metric that tracks a value at particular points in time. """
def __init__(self, formatter, name, dimensions, hostname, device_name):
@ -74,6 +77,7 @@ class Gauge(Metric):
class BucketGauge(Gauge):
""" A metric that tracks a value at particular points in time.
The difference beween this class and Gauge is that this class will
report that gauge sample time as the time that Metric is flushed, as
@ -100,6 +104,7 @@ class BucketGauge(Gauge):
class Counter(Metric):
""" A metric that tracks a counter value. """
def __init__(self, formatter, name, dimensions, hostname, device_name):
@ -133,6 +138,7 @@ class Counter(Metric):
class Histogram(Metric):
""" A metric to track the distribution of a set of values. """
def __init__(self, formatter, name, dimensions, hostname, device_name):
@ -159,26 +165,26 @@ class Histogram(Metric):
length = len(self.samples)
max_ = self.samples[-1]
med = self.samples[int(round(length/2 - 1))]
med = self.samples[int(round(length / 2 - 1))]
avg = sum(self.samples) / float(length)
metric_aggrs = [
('max', max_, MetricTypes.GAUGE),
('median', med, MetricTypes.GAUGE),
('avg', avg, MetricTypes.GAUGE),
('count', self.count/interval, MetricTypes.RATE)
('count', self.count / interval, MetricTypes.RATE)
]
metrics = [self.formatter(
hostname=self.hostname,
device_name=self.device_name,
dimensions=self.dimensions,
metric='%s.%s' % (self.name, suffix),
value=value,
timestamp=ts,
metric_type=metric_type,
interval=interval,
) for suffix, value, metric_type in metric_aggrs
hostname=self.hostname,
device_name=self.device_name,
dimensions=self.dimensions,
metric='%s.%s' % (self.name, suffix),
value=value,
timestamp=ts,
metric_type=metric_type,
interval=interval,
) for suffix, value, metric_type in metric_aggrs
]
for p in self.percentiles:
@ -202,6 +208,7 @@ class Histogram(Metric):
class Set(Metric):
""" A metric to track the number of unique elements in a set. """
def __init__(self, formatter, name, dimensions, hostname, device_name):
@ -236,6 +243,7 @@ class Set(Metric):
class Rate(Metric):
""" Track the rate of metrics over each flush interval """
def __init__(self, formatter, name, dimensions, hostname, device_name):
@ -285,4 +293,4 @@ class Rate(Metric):
interval=interval
)]
finally:
self.samples = self.samples[-1:]
self.samples = self.samples[-1:]
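As a sanity check on the Histogram roll-ups earlier in this file, the same aggregates computed by hand on an arbitrary sorted sample set:

samples = sorted([0.2, 0.4, 0.4, 0.9, 1.5])
interval = 10.0

length = len(samples)
max_ = samples[-1]                            # 1.5
med = samples[int(round(length / 2 - 1))]     # 0.4
avg = sum(samples) / float(length)            # 0.68
count_rate = length / interval                # 0.5 samples per second over the interval
print("max=%s median=%s avg=%s count=%s" % (max_, med, avg, count_rate))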

View File

@ -17,7 +17,8 @@ try:
except ImportError:
pass # We are likely running the agent without the forwarder and tornado is not installed
VALID_HOSTNAME_RFC_1123_PATTERN = re.compile(r"^(([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z0-9]|[A-Za-z0-9][A-Za-z0-9\-]*[A-Za-z0-9])$")
VALID_HOSTNAME_RFC_1123_PATTERN = re.compile(
r"^(([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z0-9]|[A-Za-z0-9][A-Za-z0-9\-]*[A-Za-z0-9])$")
MAX_HOSTNAME_LEN = 255
import logging
@ -94,7 +95,7 @@ def isnan(val):
# for py < 2.6, use a different check
# http://stackoverflow.com/questions/944700/how-to-check-for-nan-in-python
return str(val) == str(1e400*0)
return str(val) == str(1e400 * 0)
def cast_metric_val(val):
@ -118,7 +119,8 @@ def is_valid_hostname(hostname):
log.warning("Hostname: %s is local" % hostname)
return False
if len(hostname) > MAX_HOSTNAME_LEN:
log.warning("Hostname: %s is too long (max length is %s characters)" % (hostname, MAX_HOSTNAME_LEN))
log.warning("Hostname: %s is too long (max length is %s characters)" %
(hostname, MAX_HOSTNAME_LEN))
return False
if VALID_HOSTNAME_RFC_1123_PATTERN.match(hostname) is None:
log.warning("Hostname: %s is not complying with RFC 1123" % hostname)
@ -175,23 +177,27 @@ def get_hostname(config=None):
hostname = socket_hostname
if hostname is None:
log.critical('Unable to reliably determine host name. You can define one in agent.conf or in your hosts file')
raise Exception('Unable to reliably determine host name. You can define one in agent.conf or in your hosts file')
log.critical(
'Unable to reliably determine host name. You can define one in agent.conf or in your hosts file')
raise Exception(
'Unable to reliably determine host name. You can define one in agent.conf or in your hosts file')
else:
return hostname
class Watchdog(object):
"""Simple signal-based watchdog that will scuttle the current process
if it has not been reset every N seconds, or if the process exceeds
a specified memory threshold.
Can only be invoked once per process, so don't use with multiple threads.
If you instantiate more than one, you're also asking for trouble.
"""
def __init__(self, duration, max_mem_mb = None):
def __init__(self, duration, max_mem_mb=None):
import resource
#Set the duration
# Set the duration
self._duration = int(duration)
signal.signal(signal.SIGALRM, Watchdog.self_destruct)
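The mechanism is plain SIGALRM: arm an alarm for N seconds and re-arm it on every reset, so the handler only fires if reset() stops being called. A stripped-down sketch of the idea, not the agent's actual class:

import os
import signal

DURATION = 30


def self_destruct(signum, frame):
    # No reset arrived in time: kill the current process
    os.kill(os.getpid(), signal.SIGKILL)


def reset():
    signal.alarm(DURATION)   # push the deadline back another DURATION seconds

signal.signal(signal.SIGALRM, self_destruct)
signal.alarm(DURATION)       # start the countdown
# Call reset() from the main loop; if the loop hangs, the alarm fires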
@ -213,7 +219,6 @@ class Watchdog(object):
finally:
os.kill(os.getpid(), signal.SIGKILL)
def reset(self):
# self destruct if using too much memory, as tornado will swallow MemoryErrors
mem_usage_kb = int(os.popen('ps -p %d -o %s | tail -1' % (os.getpid(), 'rss')).read())
@ -225,6 +230,7 @@ class Watchdog(object):
class PidFile(object):
""" A small helper class for pidfiles. """
PID_DIR = '/var/run/mon-agent'
@ -283,6 +289,7 @@ class PidFile(object):
class LaconicFilter(logging.Filter):
"""
Filters messages, only print them once while keeping memory under control
"""
@ -312,6 +319,7 @@ class LaconicFilter(logging.Filter):
class Timer(object):
""" Helper class """
def __init__(self):
@ -328,7 +336,7 @@ class Timer(object):
def step(self):
now = self._now()
step = now - self.last
step = now - self.last
self.last = now
return step
@ -337,6 +345,7 @@ class Timer(object):
class Platform(object):
"""
Return information about the given platform.
"""
@ -373,7 +382,7 @@ class Platform(object):
return (Platform.is_darwin()
or Platform.is_linux()
or Platform.is_freebsd()
)
)
@staticmethod
def is_win32(name=None):
@ -384,6 +393,7 @@ class Platform(object):
Iterable Recipes
"""
def chunks(iterable, chunk_size):
"""Generate sequences of `chunk_size` elements from `iterable`."""
iterable = iter(iterable)

View File

@ -1 +0,0 @@

View File

@ -8,10 +8,12 @@ log = logging.getLogger(__name__)
class MonAPI(object):
"""Sends measurements to MonAPI
Any errors should raise an exception so that the calling transaction
is not committed
"""
def __init__(self, config):
"""
Initialize Mon api client connection.

View File

@ -54,7 +54,7 @@ MAX_WAIT_FOR_REPLAY = timedelta(seconds=90)
# Maximum queue size in bytes (when this is reached, old messages are dropped)
MAX_QUEUE_SIZE = 30 * 1024 * 1024 # 30MB
THROTTLING_DELAY = timedelta(microseconds=1000000/2) # 2 msg/second
THROTTLING_DELAY = timedelta(microseconds=1000000 / 2) # 2 msg/second
class MetricTransaction(Transaction):
@ -116,7 +116,8 @@ class StatusHandler(tornado.web.RequestHandler):
m = MetricTransaction.get_tr_manager()
self.write("<table><tr><td>Id</td><td>Size</td><td>Error count</td><td>Next flush</td></tr>")
self.write(
"<table><tr><td>Id</td><td>Size</td><td>Error count</td><td>Next flush</td></tr>")
transactions = m.get_transactions()
for tr in transactions:
self.write("<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" %
@ -135,7 +136,8 @@ class AgentInputHandler(tornado.web.RequestHandler):
The message is expected to follow the format:
"""
# read the message it should be a list of monagent.common.metrics.Measurements expressed as a dict
# read the message it should be a list of
# monagent.common.metrics.Measurements expressed as a dict
msg = tornado.escape.json_decode(self.request.body)
try:
log.debug(msg)
@ -167,14 +169,16 @@ class Forwarder(tornado.web.Application):
MetricTransaction.set_tr_manager(self._tr_manager)
self._watchdog = None
self.skip_ssl_validation = skip_ssl_validation or agent_config.get('skip_ssl_validation', False)
self.skip_ssl_validation = skip_ssl_validation or agent_config.get(
'skip_ssl_validation', False)
self.use_simple_http_client = use_simple_http_client
if self.skip_ssl_validation:
log.info("Skipping SSL hostname validation, useful when using a transparent proxy")
if watchdog:
watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
self._watchdog = Watchdog(watchdog_timeout, max_mem_mb=agent_config.get('limit_memory_consumption', None))
self._watchdog = Watchdog(
watchdog_timeout, max_mem_mb=agent_config.get('limit_memory_consumption', None))
def _post_metrics(self):
@ -225,7 +229,8 @@ class Forwarder(tornado.web.Application):
try:
http_server.listen(self._port, address="localhost")
except gaierror:
log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
log.warning(
"localhost seems undefined in your host file, using 127.0.0.1 instead")
http_server.listen(self._port, address="127.0.0.1")
except socket_error, e:
if "Errno 99" in str(e):
@ -234,7 +239,8 @@ class Forwarder(tornado.web.Application):
else:
raise
except socket_error, e:
log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
log.exception(
"Socket error %s. Is another application listening on the same port ? Exiting", e)
sys.exit(1)
except Exception:
log.exception("Uncaught exception. Forwarder is exiting.")
@ -253,7 +259,8 @@ class Forwarder(tornado.web.Application):
self._post_metrics()
self._tr_manager.flush()
tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)
tr_sched = tornado.ioloop.PeriodicCallback(
flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)
# Start everything
if self._watchdog:
@ -291,7 +298,8 @@ def init_forwarder(skip_ssl_validation=False, use_simple_http_client=False):
def main():
define("sslcheck", default=1, help="Verify SSL hostname, on by default")
define("use_simple_http_client", default=0, help="Use Tornado SimpleHTTPClient instead of CurlAsyncHTTPClient")
define("use_simple_http_client", default=0,
help="Use Tornado SimpleHTTPClient instead of CurlAsyncHTTPClient")
args = parse_command_line()
skip_ssl_validation = False
use_simple_http_client = False

View File

@ -21,7 +21,7 @@ class Transaction(object):
self._id = None
self._error_count = 0
self._next_flush = datetime.now()
self._next_flush = datetime.now()
self._size = None
def get_id(self):
@ -35,7 +35,7 @@ class Transaction(object):
self._error_count += 1
def get_error_count(self):
return self._error_count
return self._error_count
def get_size(self):
if self._size is None:
@ -64,6 +64,7 @@ class Transaction(object):
class TransactionManager(object):
"""Holds any transaction derived object list and make sure they
are all commited, without exceeding parameters (throttling, memory consumption) """
@ -76,7 +77,7 @@ class TransactionManager(object):
self._transactions = [] # List of all non-committed transactions
self._total_count = 0 # Maintain size/count so it is not recomputed every time
self._total_size = 0
self._total_size = 0
self._flush_count = 0
self._transactions_received = 0
self._transactions_flushed = 0
@ -96,7 +97,7 @@ class TransactionManager(object):
def print_queue_stats(self):
log.debug("Queue size: at %s, %s transaction(s), %s KB" %
(time.time(), self._total_count, (self._total_size/1024)))
(time.time(), self._total_count, (self._total_size / 1024)))
def get_tr_id(self):
self._counter += 1
@ -110,8 +111,8 @@ class TransactionManager(object):
# Check the size
tr_size = tr.get_size()
log.debug("New transaction to add, total size of queue would be: %s KB" %
((self._total_size + tr_size)/1024))
log.debug("New transaction to add, total size of queue would be: %s KB" %
((self._total_size + tr_size) / 1024))
if (self._total_size + tr_size) > self._MAX_QUEUE_SIZE:
log.warn("Queue is too big, removing old transactions...")
@ -147,7 +148,7 @@ class TransactionManager(object):
count = len(to_flush)
should_log = self._flush_count + 1 <= FLUSH_LOGGING_INITIAL or \
(self._flush_count + 1) % FLUSH_LOGGING_PERIOD == 0
(self._flush_count + 1) % FLUSH_LOGGING_PERIOD == 0
if count > 0:
if should_log:
log.info("Flushing %s transaction%s during flush #%s" %
@ -165,7 +166,8 @@ class TransactionManager(object):
log.debug("No transaction to flush during flush #%s" % str(self._flush_count + 1))
if self._flush_count + 1 == FLUSH_LOGGING_INITIAL:
log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
log.info("First flushes done, next flushes will be logged every %s flushes." %
FLUSH_LOGGING_PERIOD)
self._flush_count += 1
@ -185,7 +187,7 @@ class TransactionManager(object):
if hasattr(td, 'total_seconds'):
delay = td.total_seconds()
else:
delay = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10.0**6
delay = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 10.0 ** 6
if delay <= 0:
tr = self._trs_to_flush.pop()
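The else branch above is the usual fallback for timedeltas that predate total_seconds() (Python < 2.7); the manual formula produces the same value. A small illustration:

from datetime import timedelta

td = timedelta(days=1, seconds=3, microseconds=500000)
manual = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 10.0 ** 6
print(manual)              # 86403.5
print(td.total_seconds())  # 86403.5 on Python >= 2.7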
@ -203,7 +205,7 @@ class TransactionManager(object):
if tornado_ioloop._running:
tornado_ioloop.add_timeout(time.time() + delay, lambda: self.flush_next())
elif self._flush_without_ioloop:
# Tornado is not started (i.e. unittests), do it manually: BLOCKING
# Tornado is not started (i.e. unittests), do it manually: BLOCKING
time.sleep(delay)
self.flush_next()
else:
@ -213,7 +215,7 @@ class TransactionManager(object):
tr.inc_error_count()
tr.compute_next_flush(self._MAX_WAIT_FOR_REPLAY)
log.warn("Transaction %d in error (%s error%s), it will be replayed after %s" %
(tr.get_id(), tr.get_error_count(), plural(tr.get_error_count()), tr.get_next_flush()))
(tr.get_id(), tr.get_error_count(), plural(tr.get_error_count()), tr.get_next_flush()))
def tr_success(self, tr):
log.debug("Transaction %d completed" % tr.get_id())
@ -222,5 +224,3 @@ class TransactionManager(object):
self._total_size += - tr.get_size()
self._transactions_flushed += 1
self.print_queue_stats()

View File

@ -30,6 +30,7 @@ log = logging.getLogger('monstatsd')
class Monstatsd(Daemon):
""" This class is the monstatsd daemon. """
def __init__(self, pid_file, server, reporter, autorestart):
@ -107,7 +108,8 @@ def init_monstatsd(config_path=None, use_watchdog=False):
if non_local_traffic:
server_host = ''
server = Server(aggregator, server_host, port, forward_to_host=forward_to_host, forward_to_port=forward_to_port)
server = Server(aggregator, server_host, port, forward_to_host=forward_to_host,
forward_to_port=forward_to_port)
return reporter, server, c

View File

@ -19,6 +19,7 @@ EVENT_CHUNK_SIZE = 50
class Reporter(threading.Thread):
"""
The reporter periodically sends the aggregated metrics to the
server.

View File

@ -10,6 +10,7 @@ UDP_SOCKET_TIMEOUT = 5
class Server(object):
"""
A statsd udp server.
"""
@ -55,9 +56,9 @@ class Server(object):
event = {
'title': metadata[:title_length],
'text': (metadata[title_length+1:title_length+text_length+1]).replace('\\n', '\n')
'text': (metadata[title_length + 1:title_length + text_length + 1]).replace('\\n', '\n')
}
meta = metadata[title_length+text_length+1:]
meta = metadata[title_length + text_length + 1:]
for m in meta.split('|')[1:]:
if m[0] == u't':
event['alert_type'] = m[2:]
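To see how the slicing above carves up an event payload, assume title_length and text_length have already been read from the packet header (not shown in this hunk; in the dogstatsd-style wire format they come from the leading _e{<title_len>,<text_len>}: prefix). A hypothetical payload then decodes like this:

metadata = 'disk full|on host-1|t:error'
title_length, text_length = 9, 9   # assumed to be parsed from the packet header

event = {
    'title': metadata[:title_length],                                                         # 'disk full'
    'text': metadata[title_length + 1:title_length + text_length + 1].replace('\\n', '\n'),   # 'on host-1'
}
meta = metadata[title_length + text_length + 1:]                                              # '|t:error'
for m in meta.split('|')[1:]:
    if m[0] == u't':
        event['alert_type'] = m[2:]                                                           # 'error'
print(event)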
@ -137,7 +138,8 @@ class Server(object):
# todo it seems like this count should be done in the submit_metric method
self.aggregator.count += 1
name, value, mtype, dimensions, sample_rate = self._parse_metric_packet(packet)
self.aggregator.submit_metric(name, value, mtype, dimensions=dimensions, sample_rate=sample_rate)
self.aggregator.submit_metric(
name, value, mtype, dimensions=dimensions, sample_rate=sample_rate)
def start(self):
""" Run the server. """
@ -149,7 +151,8 @@ class Server(object):
open_socket.bind(self.address)
except socket.gaierror:
if self.address[0] == 'localhost':
log.warning("Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
log.warning(
"Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
self.address = ('127.0.0.1', self.address[1])
open_socket.bind(self.address)
@ -187,4 +190,4 @@ class Server(object):
log.exception('Error receiving datagram')
def stop(self):
self.running = False
self.running = False

View File

@ -24,6 +24,7 @@ from collector.jmxfetch import JMXFetch
log = logging.getLogger(__name__)
RESTART_INTERVAL = 24 * 60 * 60 # Defaults to 1 day
class AgentSvc(win32serviceutil.ServiceFramework):
_svc_name_ = "DatadogAgent"
_svc_display_name_ = "Datadog Agent"
@ -50,7 +51,7 @@ class AgentSvc(win32serviceutil.ServiceFramework):
'forwarder': DDForwarder(config),
'collector': DDAgent(agentConfig),
'dogstatsd': DogstatsdProcess(config),
'pup': PupProcess(config),
'pup': PupProcess(config),
}
def SvcStop(self):
@ -105,6 +106,7 @@ class AgentSvc(win32serviceutil.ServiceFramework):
class DDAgent(multiprocessing.Process):
def __init__(self, agentConfig, start_event=True):
multiprocessing.Process.__init__(self, name='ddagent')
self.config = agentConfig
@ -137,7 +139,7 @@ class DDAgent(multiprocessing.Process):
def get_emitters(self):
emitters = [http_emitter]
custom = [s.strip() for s in
self.config.get('custom_emitters', '').split(',')]
self.config.get('custom_emitters', '').split(',')]
for emitter_spec in custom:
if not emitter_spec:
continue
@ -145,7 +147,9 @@ class DDAgent(multiprocessing.Process):
return emitters
class DDForwarder(multiprocessing.Process):
def __init__(self, agentConfig):
multiprocessing.Process.__init__(self, name='ddforwarder')
self.config = agentConfig
@ -159,7 +163,7 @@ class DDForwarder(multiprocessing.Process):
port = 17123
else:
port = int(port)
app_config = get_config(parse_args = False)
app_config = get_config(parse_args=False)
self.forwarder = Application(port, app_config, watchdog=False)
self.forwarder.run()
@ -167,7 +171,9 @@ class DDForwarder(multiprocessing.Process):
log.debug("Windows Service - Stopping forwarder")
self.forwarder.stop()
class DogstatsdProcess(multiprocessing.Process):
def __init__(self, agentConfig):
multiprocessing.Process.__init__(self, name='dogstatsd')
self.config = agentConfig
@ -185,7 +191,9 @@ class DogstatsdProcess(multiprocessing.Process):
self.reporter.stop()
self.reporter.join()
class PupProcess(multiprocessing.Process):
def __init__(self, agentConfig):
multiprocessing.Process.__init__(self, name='pup')
self.config = agentConfig

View File

@ -2,7 +2,7 @@ import ctypes
def handle_exe_click(name):
''' When the executables are clicked directly in the UI, we must let the
''' When the executables are clicked directly in the UI, we must let the
user know that they have to install the program as a service instead of
running it directly. '''
message = """To use %(name)s, you must install it as a service.
@ -16,4 +16,4 @@ For all available options, including how to install the service for a particular
%(name)s.exe help
""" % ({'name': name})
MessageBox = ctypes.windll.user32.MessageBoxA
MessageBox(None, message, 'Install as a Service', 0)
MessageBox(None, message, 'Install as a Service', 0)

View File

@ -29,14 +29,14 @@ from spyderlib.widgets.sourcecode.codeeditor import CodeEditor
# Datadog
from common.util import get_os
from config import (get_confd_path, get_config_path, get_config,
_windows_commondata_path)
from config import (get_confd_path, get_config_path, get_config,
_windows_commondata_path)
EXCLUDED_WINDOWS_CHECKS = [
'cacti', 'directory', 'gearmand',
'hdfs', 'kafka_consumer', 'mcache', 'network',
'redis', 'postfix', 'process', 'gunicorn', 'zk',
]
]
MAIN_WINDOW_TITLE = "Datadog Agent Manager"
@ -81,8 +81,8 @@ def get_checks():
continue
agent_check = AgentCheck(filename, ext, conf_d_directory)
if (agent_check.enabled or agent_check.module_name not in checks or
(not agent_check.is_example and not checks[agent_check.module_name].enabled)):
if (agent_check.enabled or agent_check.module_name not in checks or
(not agent_check.is_example and not checks[agent_check.module_name].enabled)):
checks[agent_check.module_name] = agent_check
checks_list = checks.values()
@ -92,6 +92,7 @@ def get_checks():
class EditorFile(object):
def __init__(self, file_path, description):
self.file_path = file_path
self.description = description
@ -101,7 +102,7 @@ class EditorFile(object):
def save(self, content):
try:
f = open(self.file_path,'w')
f = open(self.file_path, 'w')
f.write(content)
self.content = content
info_popup("File saved.")
@ -111,9 +112,10 @@ class EditorFile(object):
class LogFile(EditorFile):
def __init__(self):
EditorFile.__init__(self, AGENT_LOG_FILE, "Agent log file")
class DatadogConf(EditorFile):
@ -128,7 +130,7 @@ class DatadogConf(EditorFile):
def check_api_key(self, editor):
if self.api_key is None:
api_key, ok = QInputDialog.getText(None, "Add your API KEY",
"You must first set your api key in this file. You can find it here: https://app.datadoghq.com/account/settings#api")
"You must first set your api key in this file. You can find it here: https://app.datadoghq.com/account/settings#api")
if ok and api_key:
new_content = []
for line in self.content.splitlines():
@ -149,12 +151,13 @@ class DatadogConf(EditorFile):
class AgentCheck(EditorFile):
def __init__(self, filename, ext, conf_d_directory):
file_path = osp.join(conf_d_directory, filename)
self.module_name = filename.split('.')[0]
EditorFile.__init__(self, file_path, description=self.module_name.replace("_", " ").title())
self.enabled = ext == '.yaml'
self.is_example = ext == '.example'
self.enabled_name = osp.join(conf_d_directory, "%s.yaml" % self.module_name)
@ -176,10 +179,11 @@ class AgentCheck(EditorFile):
class PropertiesWidget(QWidget):
def __init__(self, parent):
QWidget.__init__(self, parent)
font = QFont(get_family(MONOSPACE), 10, QFont.Normal)
info_icon = QLabel()
icon = get_std_icon('MessageBoxInformation').pixmap(24, 24)
info_icon.setPixmap(icon)
@ -201,10 +205,10 @@ class PropertiesWidget(QWidget):
layout.addWidget(info_icon)
layout.addWidget(self.desc_label)
layout.addStretch()
layout.addWidget(self.service_status_label )
layout.addWidget(self.service_status_label)
group_desc.setLayout(layout)
self.editor = CodeEditor(self)
self.editor.setup_editor(linenumbers=True, font=font)
self.editor.setReadOnly(False)
@ -212,27 +216,24 @@ class PropertiesWidget(QWidget):
layout = QVBoxLayout()
layout.addWidget(self.editor)
group_code.setLayout(layout)
self.enable_button = QPushButton(get_icon("apply.png"),
"Enable", self)
"Enable", self)
self.save_button = QPushButton(get_icon("filesave.png"),
"Save", self)
"Save", self)
self.edit_datadog_conf_button = QPushButton(get_icon("edit.png"),
"Edit agent settings", self)
"Edit agent settings", self)
self.disable_button = QPushButton(get_icon("delete.png"),
"Disable", self)
"Disable", self)
self.view_log_button = QPushButton(get_icon("txt.png"),
"View log", self)
self.view_log_button = QPushButton(get_icon("txt.png"),
"View log", self)
self.menu_button = QPushButton(get_icon("settings.png"),
"Manager", self)
"Manager", self)
hlayout = QHBoxLayout()
hlayout.addWidget(self.save_button)
@ -246,7 +247,7 @@ class PropertiesWidget(QWidget):
hlayout.addWidget(self.view_log_button)
hlayout.addStretch()
hlayout.addWidget(self.menu_button)
vlayout = QVBoxLayout()
vlayout.addWidget(group_desc)
vlayout.addWidget(group_code)
@ -254,7 +255,7 @@ class PropertiesWidget(QWidget):
self.setLayout(vlayout)
self.current_file = None
def set_item(self, check):
self.current_file = check
self.desc_label.setText(check.get_description())
@ -288,28 +289,32 @@ class PropertiesWidget(QWidget):
class MainWindow(QSplitter):
def __init__(self, parent=None):
QSplitter.__init__(self, parent)
self.setWindowTitle(MAIN_WINDOW_TITLE)
self.setWindowIcon(get_icon("agent.svg"))
self.sysTray = SystemTray(self)
self.connect(self.sysTray, SIGNAL("activated(QSystemTrayIcon::ActivationReason)"), self.__icon_activated)
self.connect(self.sysTray, SIGNAL(
"activated(QSystemTrayIcon::ActivationReason)"), self.__icon_activated)
checks = get_checks()
datadog_conf = DatadogConf(get_config_path(), description="Agent settings file: datadog.conf")
datadog_conf = DatadogConf(
get_config_path(), description="Agent settings file: datadog.conf")
self.log_file = LogFile()
listwidget = QListWidget(self)
listwidget.addItems([osp.basename(check.module_name).replace("_", " ").title() for check in checks])
listwidget.addItems(
[osp.basename(check.module_name).replace("_", " ").title() for check in checks])
self.properties = PropertiesWidget(self)
self.addWidget(listwidget)
self.addWidget(self.properties)
self.connect(self.properties.enable_button, SIGNAL("clicked()"),
lambda: enable_check(self.properties))
@ -330,11 +335,10 @@ class MainWindow(QSplitter):
self.manager_menu = Menu(self)
self.connect(self.properties.menu_button, SIGNAL("clicked()"),
lambda: self.manager_menu.popup(self.properties.menu_button.mapToGlobal(QPoint(0,0))))
lambda: self.manager_menu.popup(self.properties.menu_button.mapToGlobal(QPoint(0, 0))))
listwidget.setCurrentRow(0)
self.setSizes([150, 1])
self.setStretchFactor(1, 1)
self.resize(QSize(950, 600))
@ -355,13 +359,14 @@ class MainWindow(QSplitter):
def closeEvent(self, event):
self.hide()
self.sysTray.show()
self.sysTray.show()
event.ignore()
def __icon_activated(self, reason):
if reason == QSystemTrayIcon.DoubleClick:
self.show()
class Menu(QMenu):
def __init__(self, parent=None, ):
@ -375,7 +380,6 @@ class Menu(QMenu):
self.connect(self, SIGNAL("aboutToShow()"), lambda: self.update_options())
def update_options(self):
status = get_service_status()
if is_service_running(status):
@ -403,7 +407,7 @@ class SystemTray(QSystemTrayIcon):
menu = Menu(self.parent())
self.setContextMenu(menu)
def disable_check(properties):
check = properties.current_file
new_content = properties.editor.toPlainText().__str__()
@ -415,7 +419,8 @@ def disable_check(properties):
properties.enable_button.setEnabled(True)
properties.disable_button.setEnabled(False)
check.disable()
def enable_check(properties):
check = properties.current_file
@ -427,20 +432,22 @@ def enable_check(properties):
properties.enable_button.setEnabled(False)
properties.disable_button.setEnabled(True)
check.enable()
def save_file(properties):
current_file = properties.current_file
new_content = properties.editor.toPlainText().__str__()
current_file.save(new_content)
def check_yaml_syntax(content):
try:
yaml.load(content, Loader=Loader)
except Exception, e:
warning_popup("Unable to parse yaml: \n %s" % str(e))
raise
def _service_manager(action):
try:
if action == 'stop':
@ -452,36 +459,43 @@ def _service_manager(action):
except Exception, e:
warning_popup("Couldn't %s service: \n %s" % (action, str(e)))
def service_manager(action, async=True):
if not async:
_service_manager(action)
else:
thread.start_new_thread(_service_manager, (action,))
def get_service_status():
try:
return win32serviceutil.QueryServiceStatus(DATADOG_SERVICE)[1]
except Exception:
return "Unknown"
def is_service_running(status = None):
def is_service_running(status=None):
if status is None:
status = get_service_status()
return status == win32service.SERVICE_RUNNING
def is_service_pending(status = None):
def is_service_pending(status=None):
if status is None:
status = get_service_status()
return status in [win32service.SERVICE_STOP_PENDING, win32service.SERVICE_START_PENDING]
def is_service_stopped(status = None):
def is_service_stopped(status=None):
if status is None:
status = get_service_status()
return status == win32service.SERVICE_STOPPED
def warning_popup(message, parent=None):
QMessageBox.warning(parent, 'Message', message, QMessageBox.Ok)
def info_popup(message, parent=None):
QMessageBox.information(parent, 'Message', message, QMessageBox.Ok)
@ -491,4 +505,4 @@ if __name__ == '__main__':
app = QApplication([])
win = MainWindow()
win.show()
app.exec_()
app.exec_()

View File

@ -16,4 +16,4 @@ Datadog Agent v%s - Python Shell
print traceback.format_exc(e)
if __name__ == "__main__":
shell()
shell()

View File

@ -4,6 +4,7 @@ import collections
class Plugins(collections.defaultdict):
"""A container for the plugin configurations used by the mon-agent.
This is essentially a defaultdict(dict), but put into a class primarily to make the interface clear and
to add a couple of helper methods.
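Read literally, that makes Plugins little more than a defaultdict(dict) keyed by plugin name. A minimal sketch of the idea (merge() is used by the mysql plugin later in this changeset, but its body here and __init__ are assumptions):

import collections


class Plugins(collections.defaultdict):
    """defaultdict(dict) keyed by plugin name, e.g. config['mysql']['instances']."""

    def __init__(self):
        super(Plugins, self).__init__(dict)

    def merge(self, other):
        # assumed helper: fold another Plugins/dict into this one, with `other` winning
        for name, conf in other.items():
            self[name].update(conf)


config = Plugins()
config['mysql'] = {'init_config': None, 'instances': []}
config.merge({'mysql': {'instances': [{'server': 'localhost'}]}})
print(config['mysql']['instances'])   # [{'server': 'localhost'}]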

View File

@ -4,6 +4,7 @@
from monclient import exc as exc, client
class Plugin(object):
"""Abstract class implemented by the mon-agent plugin detection classes
"""
# todo these should include dependency detection
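For orientation, the detection plugins elsewhere in this changeset all implement _detect(), build_config() and dependencies_installed(); the shared interface looks roughly like the sketch below (the constructor details are assumptions, not taken verbatim from this file):

class Plugin(object):
    """Abstract base class for the mon-agent plugin detection classes."""

    def __init__(self, template_dir):
        # template_dir is referenced by the network/postfix plugins; the exact
        # constructor signature is an assumption
        self.template_dir = template_dir
        self.available = False

    def _detect(self):
        """Run detection; set self.available True if the service is detected."""
        raise NotImplementedError

    def build_config(self):
        """Build the config as a Plugins object and return it."""
        raise NotImplementedError

    def dependencies_installed(self):
        """Return True if the python deps needed by the check are importable."""
        raise NotImplementedError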

View File

@ -8,6 +8,7 @@ log = logging.getLogger(__name__)
class Kafka(Plugin):
"""Detect Kafka daemons and sets up configuration to monitor them.
This plugin configures the kafka_consumer plugin and does not configure any jmx based checks against kafka.
Note this plugin will pull the same information from kafka on each node in the cluster it runs on.
@ -32,7 +33,8 @@ class Kafka(Plugin):
import kazoo
from kazoo.client import KazooClient
logging.getLogger('kazoo').setLevel(logging.WARN) # kazoo fills up the console without this
# kazoo fills up the console without this
logging.getLogger('kazoo').setLevel(logging.WARN)
zk = KazooClient(hosts='127.0.0.1:2181', read_only=True)
zk.start()
@ -40,7 +42,8 @@ class Kafka(Plugin):
for topic in zk.get_children('/brokers/topics'):
topics[topic] = zk.get_children('/brokers/topics/%s/partitions' % topic)
consumers = collections.defaultdict(dict) # {'consumer_group_name': { 'topic1': [ 0, 1, 2] # partitions }}
# {'consumer_group_name': { 'topic1': [ 0, 1, 2] # partitions }}
consumers = collections.defaultdict(dict)
for consumer in zk.get_children('/consumers'):
try:
for topic in zk.get_children('/consumers/%s/offsets' % consumer):
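The hunk ends mid-loop; the nested shape described in the comment ({'consumer_group_name': {'topic1': [0, 1, 2]}}) could be filled in roughly as follows, assuming the usual Kafka 0.8 ZooKeeper layout of /consumers/<group>/offsets/<topic>/<partition> (a sketch, not the file's actual continuation):

import collections

from kazoo.client import KazooClient

zk = KazooClient(hosts='127.0.0.1:2181', read_only=True)
zk.start()

consumers = collections.defaultdict(dict)   # {'group': {'topic': [partition ids]}}
for consumer in zk.get_children('/consumers'):
    try:
        for topic in zk.get_children('/consumers/%s/offsets' % consumer):
            partitions = zk.get_children('/consumers/%s/offsets/%s' % (consumer, topic))
            consumers[consumer][topic] = sorted(int(p) for p in partitions)
    except Exception:   # consumer groups without recorded offsets are skipped
        continue

zk.stop()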

View File

@ -12,8 +12,10 @@ log = logging.getLogger(__name__)
class MonPersister(Plugin):
"""Detect mon_persister and setup monitoring.
"""
def _detect(self):
"""Run detection, set self.available True if the service is detected."""
if find_process_cmdline('mon-persister') is not None:
@ -26,16 +28,18 @@ class MonPersister(Plugin):
return dropwizard_health_check('mon-persister', 'http://localhost:8091/healthcheck')
# todo
#log.info("\tEnabling the mon persister metric collection")
#http://localhost:8091/metrics
# log.info("\tEnabling the mon persister metric collection")
# http://localhost:8091/metrics
def dependencies_installed(self):
return True
class MonAPI(Plugin):
"""Detect mon_api and setup monitoring.
"""
def _detect(self):
"""Run detection, set self.available True if the service is detected."""
if find_process_cmdline('mon-api') is not None:
@ -48,15 +52,17 @@ class MonAPI(Plugin):
return dropwizard_health_check('mon-api', 'http://localhost:8081/healthcheck')
# todo
#log.info("\tEnabling the mon api metric collection")
#http://localhost:8081/metrics
# log.info("\tEnabling the mon api metric collection")
# http://localhost:8081/metrics
def dependencies_installed(self):
return True
class MonThresh(Plugin):
"""Detect the running mon-thresh and monitor"""
def _detect(self):
"""Run detection, set self.available True if the service is detected."""
if find_process_cmdline('mon-thresh') is not None:

View File

@ -7,6 +7,7 @@ log = logging.getLogger(__name__)
class MySQL(Plugin):
"""Detect MySQL daemons and setup configuration to monitor them.
This plugin needs user/pass info for the mysql setup; this is best placed in /root/.my.cnf in a format such as
[client]
@ -27,7 +28,8 @@ class MySQL(Plugin):
config.merge(watch_process(['mysqld']))
log.info("\tWatching the mysqld process.")
# Attempt login, requires either an empty root password from localhost or relying on a configured .my.cnf
# Attempt login, requires either an empty root password from localhost or
# relying on a configured .my.cnf
if self.dependencies_installed(): # ensures MySQLdb is available
import MySQLdb
import _mysql_exceptions
@ -36,7 +38,8 @@ class MySQL(Plugin):
except _mysql_exceptions.MySQLError:
pass
else:
log.info("\tConfiguring MySQL plugin to connect with auth settings from /root/.my.cnf")
log.info(
"\tConfiguring MySQL plugin to connect with auth settings from /root/.my.cnf")
config['mysql'] = {'init_config': None, 'instances':
[{'server': 'localhost', 'user': 'root', 'defaults_file': '/root/.my.cnf'}]}
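The connection attempt itself falls between the two hunks shown here; with a configured /root/.my.cnf it would amount to something like the following (an illustrative sketch, not the file's literal code):

import MySQLdb

# relies on the [client] user/password entries in /root/.my.cnf
db = MySQLdb.connect(host='localhost', read_default_file='/root/.my.cnf')
db.close()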

View File

@ -6,6 +6,7 @@ from monsetup import agent_config
class Network(Plugin):
"""No configuration here, working networking is assumed so this is either on or off.
"""
@ -16,7 +17,8 @@ class Network(Plugin):
def build_config(self):
"""Build the config as a Plugins object and return.
"""
# A bit silly to parse the yaml only for it to be converted back but this plugin is the exception not the rule
# A bit silly to parse the yaml only for it to be converted back but this
# plugin is the exception not the rule
with open(os.path.join(self.template_dir, 'conf.d/network.yaml'), 'r') as network_template:
default_net_config = yaml.load(network_template.read())
config = agent_config.Plugins()

View File

@ -6,9 +6,11 @@ from monsetup import agent_config
class Postfix(Plugin):
"""If postfix is running install the default config
"""
# todo this is disabled as postfix requires passwordless sudo for the mon-agent user, a bad practice
# todo this is disabled as postfix requires passwordless sudo for the
# mon-agent user, a bad practice
def _detect(self):
"""Run detection, set self.available True if the service is detected."""
@ -18,7 +20,8 @@ class Postfix(Plugin):
def build_config(self):
"""Build the config as a Plugins object and return.
"""
# A bit silly to parse the yaml only for it to be converted back but this plugin is the exception not the rule
# A bit silly to parse the yaml only for it to be converted back but this
# plugin is the exception not the rule
with open(os.path.join(self.template_dir, 'conf.d/postfix.yaml.example'), 'r') as postfix_template:
default_net_config = yaml.load(postfix_template.read())
config = agent_config.Plugins()

View File

@ -9,8 +9,10 @@ log = logging.getLogger(__name__)
class Zookeeper(Plugin):
"""Detect Zookeeper daemons and setup configuration to monitor them.
"""
def _detect(self):
"""Run detection, set self.available True if the service is detected."""
if find_process_cmdline('zookeeper') is not None:
@ -32,4 +34,6 @@ class Zookeeper(Plugin):
return config
def dependencies_installed(self):
return True # The current plugin just does a simple socket connection to zookeeper and parses the stat command
# The current plugin just does a simple socket connection to zookeeper and
# parses the stat command
return True

View File

@ -29,15 +29,19 @@ log = logging.getLogger(__name__)
def main(argv=None):
parser = argparse.ArgumentParser(description='Detect running daemons then configure and start the agent.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-u', '--username', help="Keystone username used to post metrics", required=True)
parser.add_argument('-p', '--password', help="Keystone password used to post metrics", required=True)
parser.add_argument(
'-u', '--username', help="Keystone username used to post metrics", required=True)
parser.add_argument(
'-p', '--password', help="Keystone password used to post metrics", required=True)
parser.add_argument('--project_name', help="Keystone project/tenant name", required=True)
parser.add_argument('-s', '--service', help="Service this node is associated with.", required=True)
parser.add_argument(
'-s', '--service', help="Service this node is associated with.", required=True)
parser.add_argument('--keystone_url', help="Keystone url", required=True)
parser.add_argument('--mon_url', help="Mon API url", required=True)
parser.add_argument('--config_dir', help="Configuration directory", default='/etc/mon-agent')
parser.add_argument('--log_dir', help="mon-agent log directory", default='/var/log/mon-agent')
parser.add_argument('--template_dir', help="Alternative template directory", default='/usr/local/share/mon/agent')
parser.add_argument(
'--template_dir', help="Alternative template directory", default='/usr/local/share/mon/agent')
parser.add_argument('--headless', help="Run in a non-interactive mode", action="store_true")
parser.add_argument('--overwrite',
help="Overwrite existing plugin configuration." +
@ -58,7 +62,8 @@ def main(argv=None):
# Detect os
detected_os = 'linux' # todo add detection
# Service enable, includes setup of users/config directories so must be done before configuration
# Service enable, includes setup of users/config directories so must be
# done before configuration
agent_service = OS_SERVICE_MAP[detected_os](os.path.join(args.template_dir, 'mon-agent.init'), args.config_dir,
args.log_dir, username=args.user)
if not args.skip_enable:
@ -102,14 +107,15 @@ def main(argv=None):
if not detect.configure_alarms(args.mon_url, token):
log.warn('Unable to configure alarms for {0}'.format(detect.name))
#todo add option to install dependencies
# todo add option to install dependencies
# Write out the plugin config
for key, value in plugin_config.iteritems():
# todo if overwrite is set I should either warn or just delete any config files not in the new config
# todo add the ability to show a diff before overwriting or merging config
config_path = os.path.join(args.config_dir, 'conf.d', key + '.yaml')
if (not args.overwrite) and os.path.exists(config_path): # merge old and new config, new has precedence
# merge old and new config, new has precedence
if (not args.overwrite) and os.path.exists(config_path):
with open(config_path, 'r') as config_file:
old_config = yaml.load(config_file.read())
if old_config is not None:

View File

@ -1,2 +1 @@
from service import Service

View File

@ -4,7 +4,9 @@ import psutil
class Service(object):
"""Abstract base class implementing the interface for various service types."""
def __init__(self, config_dir, log_dir, name='mon-agent'):
self.config_dir = config_dir
self.log_dir = log_dir

View File

@ -12,6 +12,7 @@ log = logging.getLogger(__name__)
class SysV(Service):
def __init__(self, init_template, config_dir, log_dir, name='mon-agent', username='mon-agent'):
"""Setup this service with the given init template"""
super(SysV, self).__init__(config_dir, log_dir, name)
@ -84,4 +85,3 @@ class SysV(Service):
return True
else:
return False

View File

@ -11,7 +11,7 @@ setup_requires = [
]
# Prereqs of the install. Will install when deploying the egg.
install_requires=[
install_requires = [
'requests',
'gearman',
'httplib2',
@ -78,8 +78,9 @@ if sys.platform == 'win32':
]
class Target(object):
def __init__(self, **kw):
self.__dict__.update(kw)
self.__dict__.update(kw)
self.version = '1.0.0'
self.cmdline_style = 'pywin32'
@ -100,7 +101,8 @@ if sys.platform == 'win32':
'service': [agent_svc],
'windows': [{'script': 'win32\gui.py',
'dest_base': "agent-manager",
'uac_info': "requireAdministrator", # The manager needs to be administrator to stop/start the service
# The manager needs to be administrator to stop/start the service
'uac_info': "requireAdministrator",
'icon_resources': [(1, r"packaging\mon-agent\win32\install_files\dd_agent_win_256.ico")],
}],
'data_files': [

View File

@ -26,7 +26,8 @@ def load_check(name, config, agent_config):
else:
break
if check_class is None:
raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)
raise Exception(
"Unable to import check %s. Missing a class that inherits AgentCheck" % name)
init_config = config.get('init_config', None)
instances = config.get('instances')
@ -51,7 +52,7 @@ def kill_subprocess(process_obj):
import ctypes
PROCESS_TERMINATE = 1
handle = ctypes.windll.kernel32.OpenProcess(PROCESS_TERMINATE, False,
process_obj.pid)
process_obj.pid)
ctypes.windll.kernel32.TerminateProcess(handle, -1)
ctypes.windll.kernel32.CloseHandle(handle)
else:
@ -70,7 +71,8 @@ def get_check(name, config_str):
check_class = clsmember
break
if check_class is None:
raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)
raise Exception(
"Unable to import check %s. Missing a class that inherits AgentCheck" % name)
agent_config = {
'version': '0.1',

View File

@ -1,13 +1,3 @@
"""
Functional tests for dogstatsd.
"""

View File

@ -6,8 +6,6 @@ Performance tests for the agent/dogstatsd metrics aggregator.
from monagent.common.aggregator import MetricsAggregator
class TestAggregatorPerf(object):
FLUSH_COUNT = 10
@ -58,4 +56,4 @@ class TestAggregatorPerf(object):
if __name__ == '__main__':
t = TestAggregatorPerf()
t.test_dogstatsd_aggregation_perf()
#t.test_checksd_aggregation_perf()
# t.test_checksd_aggregation_perf()

View File

@ -17,7 +17,9 @@ instances:
rrd_whitelist: %s
""" % (os.path.join(os.path.dirname(__file__), "cacti", "whitelist.txt"))
class TestCacti(unittest.TestCase):
def setUp(self):
self.tmp_dir = '/tmp/cacti_test'
self.rrd_dir = os.path.join(os.path.dirname(__file__), "cacti")
@ -55,7 +57,7 @@ class TestCacti(unittest.TestCase):
# Bump the last timestamps back 20 minutes so we have some actual data
twenty_min = 20 * 60
for k,v in check.last_ts.items():
for k, v in check.last_ts.items():
check.last_ts[k] = v - twenty_min
# Do a first check

View File

@ -8,20 +8,25 @@ from collector.dogstream.cassandra import parse_cassandra
logger = logging.getLogger(__name__)
class TestCassandraDogstream(unittest.TestCase):
@attr('cassandra')
def testStart(self):
events = parse_cassandra(logger, " INFO [main] 2012-12-11 21:46:26,995 StorageService.java (line 687) Bootstrap/Replace/Move completed! Now serving reads.")
events = parse_cassandra(
logger, " INFO [main] 2012-12-11 21:46:26,995 StorageService.java (line 687) Bootstrap/Replace/Move completed! Now serving reads.")
self.assertTrue(events is None)
@attr('cassandra')
def testInfo(self):
events = parse_cassandra(logger, " INFO [CompactionExecutor:35] 2012-12-02 21:15:03,738 AutoSavingCache.java (line 268) Saved KeyCache (5 items) in 3 ms")
events = parse_cassandra(
logger, " INFO [CompactionExecutor:35] 2012-12-02 21:15:03,738 AutoSavingCache.java (line 268) Saved KeyCache (5 items) in 3 ms")
self.assertTrue(events is None)
@attr('cassandra')
def testWarn(self):
events = parse_cassandra(logger, " WARN [MemoryMeter:1] 2012-12-03 20:07:47,158 Memtable.java (line 197) setting live ratio to minimum of 1.0 instead of 0.9416553595658074")
events = parse_cassandra(
logger, " WARN [MemoryMeter:1] 2012-12-03 20:07:47,158 Memtable.java (line 197) setting live ratio to minimum of 1.0 instead of 0.9416553595658074")
self.assertTrue(events is None)
@attr('cassandra')
@ -55,13 +60,17 @@ java.util.concurrent.RejectedExecutionException
@attr('cassandra')
def testCompactionStart(self):
events = parse_cassandra(logger, " INFO [CompactionExecutor:2] 2012-12-11 21:46:27,012 CompactionTask.java (line 109) Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-11-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-9-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-12-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-10-Data.db')]")
self.assertEquals(events, [{'alert_type': 'info', 'event_type': 'cassandra.compaction', 'timestamp': 1355262387, 'msg_title': "Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-1", 'msg_text': "Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-11-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-9-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-12-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-10-Data.db')]", 'auto_priority': 0}])
events = parse_cassandra(
logger, " INFO [CompactionExecutor:2] 2012-12-11 21:46:27,012 CompactionTask.java (line 109) Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-11-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-9-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-12-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-10-Data.db')]")
self.assertEquals(events, [{'alert_type': 'info', 'event_type': 'cassandra.compaction', 'timestamp': 1355262387, 'msg_title': "Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-1", 'msg_text':
"Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-11-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-9-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-12-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-10-Data.db')]", 'auto_priority': 0}])
@attr('cassandra')
def testCompactionEnd(self):
events = parse_cassandra(logger, "INFO [CompactionExecutor:2] 2012-12-11 21:46:27,095 CompactionTask.java (line 221) Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 to 583 (~66% of original) bytes for 4 keys at 0.007831MB/s. Time: 71ms.")
self.assertEquals(events, [{'alert_type': 'info', 'event_type': 'cassandra.compaction', 'timestamp': 1355262387, 'msg_title': 'Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 ', 'msg_text': 'Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 to 583 (~66% of original) bytes for 4 keys at 0.007831MB/s. Time: 71ms.', 'auto_priority': 0}])
events = parse_cassandra(
logger, "INFO [CompactionExecutor:2] 2012-12-11 21:46:27,095 CompactionTask.java (line 221) Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 to 583 (~66% of original) bytes for 4 keys at 0.007831MB/s. Time: 71ms.")
self.assertEquals(events, [{'alert_type': 'info', 'event_type': 'cassandra.compaction', 'timestamp': 1355262387, 'msg_title': 'Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 ',
'msg_text': 'Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 to 583 (~66% of original) bytes for 4 keys at 0.007831MB/s. Time: 71ms.', 'auto_priority': 0}])
if __name__ == '__main__':
unittest.main()

View File

@ -13,7 +13,10 @@ from monagent.collector.jmxfetch import JMXFetch
STATSD_PORT = 8121
class DummyReporter(threading.Thread):
def __init__(self, metrics_aggregator):
threading.Thread.__init__(self)
self.finished = threading.Event()
@ -23,7 +26,6 @@ class DummyReporter(threading.Thread):
self.finished = False
self.start()
def run(self):
while not self.finished:
time.sleep(self.interval)
@ -34,26 +36,26 @@ class DummyReporter(threading.Thread):
if metrics:
self.metrics = metrics
class JMXTestCase(unittest.TestCase):
def setUp(self):
aggregator = MetricsAggregator("test_host")
self.server = Server(aggregator, "localhost", STATSD_PORT)
pid_file = PidFile('dogstatsd')
self.reporter = DummyReporter(aggregator)
self.t1 = threading.Thread(target=self.server.start)
self.t1.start()
confd_path = os.path.realpath(os.path.join(os.path.abspath(__file__), "..", "jmx_yamls"))
JMXFetch.init(confd_path, {'dogstatsd_port':STATSD_PORT}, get_logging_config(), 15)
JMXFetch.init(confd_path, {'dogstatsd_port': STATSD_PORT}, get_logging_config(), 15)
def tearDown(self):
self.server.stop()
self.reporter.finished = True
JMXFetch.stop()
def testCustomJMXMetric(self):
raise SkipTest('Requires JMX be setup')
count = 0
@ -67,7 +69,8 @@ class JMXTestCase(unittest.TestCase):
self.assertTrue(type(metrics) == type([]))
self.assertTrue(len(metrics) > 0)
self.assertTrue(len([t for t in metrics if "cassandra.db." in t['metric'] and "instance:cassandra_instance" in t['dimensions']]) > 40, metrics)
self.assertTrue(len([t for t in metrics if "cassandra.db." in t[
'metric'] and "instance:cassandra_instance" in t['dimensions']]) > 40, metrics)
if __name__ == "__main__":
unittest.main()

View File

@ -12,34 +12,34 @@ class DummyAgentCheck(AgentCheck):
raise Exception("failure")
def test_check_status_fail():
instances = [
{'pass':True},
{'pass':False},
{'pass':True}
{'pass': True},
{'pass': False},
{'pass': True}
]
check = DummyAgentCheck('dummy_agent_check', {}, {}, instances)
instance_statuses = check.run()
assert len(instance_statuses) == 3
assert instance_statuses[0].status == STATUS_OK
assert instance_statuses[1].status == STATUS_ERROR
assert instance_statuses[2].status == STATUS_OK
def test_check_status_pass():
instances = [
{'pass':True},
{'pass':True},
{'pass': True},
{'pass': True},
]
check = DummyAgentCheck('dummy_agent_check', {}, {}, instances)
instances_status = check.run()
assert len(instances_status) == 2
for i in instances_status:
assert i.status == STATUS_OK
def test_persistence():
i1 = InstanceStatus(1, STATUS_OK)
chk1 = CheckStatus("dummy", [i1], 1, 2)
@ -54,6 +54,7 @@ def test_persistence():
assert chk2.metric_count == 1
assert chk2.event_count == 2
def test_persistence_fail():
# Assert remove doesn't crap out if a file doesn't exist.

View File

@ -6,9 +6,11 @@ from monagent.common.exceptions import UnknownValue, CheckException, Infinity
from monagent.collector.checks import Check
from monagent.common.aggregator import MetricsAggregator
class TestCore(unittest.TestCase):
"Tests to validate the core check logic"
def setUp(self):
self.c = Check(logger)
self.c.gauge("test-metric")
@ -28,7 +30,8 @@ class TestCore(unittest.TestCase):
self.assertEquals(len(self.c._sample_store["test-metric"]), 1)
# with explicit timestamp
self.c.save_sample("test-metric", 3.0, 1298066183.607717)
self.assertEquals(self.c.get_sample_with_timestamp("test-metric"), (1298066183.607717, 3.0, None, None))
self.assertEquals(self.c.get_sample_with_timestamp(
"test-metric"), (1298066183.607717, 3.0, None, None))
# get_samples()
self.assertEquals(self.c.get_samples(), {"test-metric": 3.0})
@ -44,7 +47,8 @@ class TestCore(unittest.TestCase):
self.assertRaises(UnknownValue, self.c.get_sample, "test-counter", expire=False)
self.c.save_sample("test-counter", 2.0, 2.0)
self.assertEquals(self.c.get_sample("test-counter", expire=False), 1.0)
self.assertEquals(self.c.get_sample_with_timestamp("test-counter", expire=False), (2.0, 1.0, None, None))
self.assertEquals(self.c.get_sample_with_timestamp(
"test-counter", expire=False), (2.0, 1.0, None, None))
self.assertEquals(self.c.get_samples(expire=False), {"test-counter": 1.0})
self.c.save_sample("test-counter", -2.0, 3.0)
self.assertRaises(UnknownValue, self.c.get_sample_with_timestamp, "test-counter")
@ -53,21 +57,27 @@ class TestCore(unittest.TestCase):
# Test metric dimensions
now = int(time.time())
# dimensions metrics
self.c.save_sample("test-counter", 1.0, 1.0, dimensions={"dim1": "value1", "dim2": "value2"})
self.c.save_sample("test-counter", 2.0, 2.0, dimensions={"dim1": "value1", "dim2": "value2"})
self.c.save_sample(
"test-counter", 1.0, 1.0, dimensions={"dim1": "value1", "dim2": "value2"})
self.c.save_sample(
"test-counter", 2.0, 2.0, dimensions={"dim1": "value1", "dim2": "value2"})
# Only 1 point recording for this combination of dimensions, won't be sent
self.c.save_sample("test-counter", 3.0, 3.0, dimensions={"dim1": "value1", "dim3": "value3"})
self.c.save_sample(
"test-counter", 3.0, 3.0, dimensions={"dim1": "value1", "dim3": "value3"})
self.c.save_sample("test-metric", 3.0, now, dimensions={"dim3": "value3", "dim4": "value4"})
# Arg checks
self.assertRaises(CheckException, self.c.save_sample, "test-metric", 4.0, now + 5, dimensions="abc")
self.assertRaises(
CheckException, self.c.save_sample, "test-metric", 4.0, now + 5, dimensions="abc")
# This is a different combination of dimensions
self.c.save_sample("test-metric", 3.0, now, dimensions={"dim5": "value5", "dim3": "value3"})
results = self.c.get_metrics()
results.sort()
self.assertEquals(results,
[("test-counter", 2.0, 1.0, {"dimensions": {"dim1": "value1", "dim2": "value2"}}),
("test-metric", now, 3.0, {"dimensions": {"dim3": "value3", "dim4": "value4"}}),
("test-metric", now, 3.0, {"dimensions": {"dim3": "value3", "dim5": "value5"}})
("test-metric", now, 3.0,
{"dimensions": {"dim3": "value3", "dim4": "value4"}}),
("test-metric", now, 3.0,
{"dimensions": {"dim3": "value3", "dim5": "value5"}})
])
# dimensions metrics are not available through get_samples anymore
self.assertEquals(self.c.get_samples(), {})
@ -77,26 +87,34 @@ class TestCore(unittest.TestCase):
self.c.save_sample("test-metric", 1.0, 0.0) # value, ts
self.c.save_sample("test-counter", 1.0, 1.0) # value, ts
self.c.save_sample("test-counter", 4.0, 2.0) # value, ts
assert "test-metric" in self.c.get_samples_with_timestamps(expire=False), self.c.get_samples_with_timestamps(expire=False)
self.assertEquals(self.c.get_samples_with_timestamps(expire=False)["test-metric"], (0.0, 1.0, None, None))
assert "test-counter" in self.c.get_samples_with_timestamps(expire=False), self.c.get_samples_with_timestamps(expire=False)
self.assertEquals(self.c.get_samples_with_timestamps(expire=False)["test-counter"], (2.0, 3.0, None, None))
assert "test-metric" in self.c.get_samples_with_timestamps(
expire=False), self.c.get_samples_with_timestamps(expire=False)
self.assertEquals(self.c.get_samples_with_timestamps(
expire=False)["test-metric"], (0.0, 1.0, None, None))
assert "test-counter" in self.c.get_samples_with_timestamps(
expire=False), self.c.get_samples_with_timestamps(expire=False)
self.assertEquals(self.c.get_samples_with_timestamps(
expire=False)["test-counter"], (2.0, 3.0, None, None))
def test_name(self):
self.assertEquals(self.c.normalize("metric"), "metric")
self.assertEquals(self.c.normalize("metric", "prefix"), "prefix.metric")
self.assertEquals(self.c.normalize("__metric__", "prefix"), "prefix.metric")
self.assertEquals(self.c.normalize("abc.metric(a+b+c{}/5)", "prefix"), "prefix.abc.metric_a_b_c_5")
self.assertEquals(self.c.normalize("VBE.default(127.0.0.1,,8080).happy", "varnish"), "varnish.VBE.default_127.0.0.1_8080.happy")
self.assertEquals(
self.c.normalize("abc.metric(a+b+c{}/5)", "prefix"), "prefix.abc.metric_a_b_c_5")
self.assertEquals(self.c.normalize(
"VBE.default(127.0.0.1,,8080).happy", "varnish"), "varnish.VBE.default_127.0.0.1_8080.happy")
class TestAggregator(unittest.TestCase):
def setUp(self):
self.aggr = MetricsAggregator('test-aggr')
def test_dupe_tags(self):
self.aggr.increment('test-counter', 1, dimensions={'a': 'avalue', 'b': 'bvalue'})
self.aggr.increment('test-counter', 1, dimensions={'a': 'avalue', 'b': 'bvalue', 'b': 'bvalue'})
self.aggr.increment(
'test-counter', 1, dimensions={'a': 'avalue', 'b': 'bvalue', 'b': 'bvalue'})
self.assertEquals(len(self.aggr.metrics), 1, self.aggr.metrics)
metric = self.aggr.metrics.values()[0]
self.assertEquals(metric.value, 2)

View File

@ -1,4 +1,4 @@
## -*- coding: latin-1 -*-
# -*- coding: latin-1 -*-
import unittest
import os.path
import tempfile
@ -8,10 +8,12 @@ from monagent.common.util import PidFile, is_valid_hostname
class TestConfig(unittest.TestCase):
def testWhiteSpaceConfig(self):
"""Leading whitespace confuse ConfigParser
"""
agent_config = get_config(cfg_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "badconfig.conf"))
agent_config = get_config(
cfg_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "badconfig.conf"))
self.assertEquals(agent_config["api_key"], "1234")
def testGoodPidFie(self):
@ -38,10 +40,10 @@ class TestConfig(unittest.TestCase):
u'i-123445',
u'5dfsdfsdrrfsv',
u'432498234234A'
u'234234235235235235', # Couldn't find anything in the RFC saying it's not valid
u'234234235235235235', # Couldn't find anything in the RFC saying it's not valid
u'A45fsdff045-dsflk4dfsdc.ret43tjssfd',
u'4354sfsdkfj4TEfdlv56gdgdfRET.dsf-dg',
u'r'*255,
u'r' * 255,
]
not_valid_hostnames = [
@ -60,4 +62,3 @@ class TestConfig(unittest.TestCase):
if __name__ == '__main__':
unittest.main()

View File

@ -24,4 +24,5 @@ class CouchDBTestCase(unittest.TestCase):
metrics = self.check.get_metrics()
self.assertTrue(type(metrics) == type([]), metrics)
self.assertTrue(len(metrics) > 3)
self.assertTrue(len([k for k in metrics if "instance:http://localhost:5984" in k[3]['dimensions']]) > 3)
self.assertTrue(
len([k for k in metrics if "instance:http://localhost:5984" in k[3]['dimensions']]) > 3)

View File

@ -3,6 +3,8 @@ from tests.common import load_check
from nose.plugins.attrib import attr
from nose.plugins.skip import SkipTest
class CouchbaseTestCase(unittest.TestCase):
def setUp(self):
@ -22,22 +24,22 @@ class CouchbaseTestCase(unittest.TestCase):
@attr('couchbase')
def test_camel_case_to_joined_lower(self):
test_pairs = {
'camelCase' : 'camel_case',
'FirstCapital' : 'first_capital',
'joined_lower' : 'joined_lower',
'joined_Upper1' : 'joined_upper1',
'Joined_upper2' : 'joined_upper2',
'Joined_Upper3' : 'joined_upper3',
'_leading_Underscore' : 'leading_underscore',
'Trailing_Underscore_' : 'trailing_underscore',
'DOubleCAps' : 'd_ouble_c_aps',
'@@@super--$$-Funky__$__$$%' : 'super_funky',
'camelCase': 'camel_case',
'FirstCapital': 'first_capital',
'joined_lower': 'joined_lower',
'joined_Upper1': 'joined_upper1',
'Joined_upper2': 'joined_upper2',
'Joined_Upper3': 'joined_upper3',
'_leading_Underscore': 'leading_underscore',
'Trailing_Underscore_': 'trailing_underscore',
'DOubleCAps': 'd_ouble_c_aps',
'@@@super--$$-Funky__$__$$%': 'super_funky',
}
for test_input, expected_output in test_pairs.items():
test_output = self.check.camel_case_to_joined_lower(test_input)
self.assertEqual(test_output, expected_output,
'Input was %s, expected output was %s, actual output was %s' % (test_input, expected_output, test_output))
self.assertEqual(test_output, expected_output,
'Input was %s, expected output was %s, actual output was %s' % (test_input, expected_output, test_output))
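One regex-based conversion that satisfies every pair in the table above (shown for illustration; not necessarily the check's own implementation):

import re


def camel_case_to_joined_lower(name):
    # non-alphanumeric runs become single separators
    s = re.sub(r'[^A-Za-z0-9]+', '_', name)
    # break xxxYyy and XxxYyy style boundaries with underscores
    s = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', s)
    s = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s)
    return re.sub(r'_+', '_', s).strip('_').lower()


print(camel_case_to_joined_lower('DOubleCAps'))                  # d_ouble_c_aps
print(camel_case_to_joined_lower('@@@super--$$-Funky__$__$$%'))  # super_funky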
@attr('couchbase')
def test_metrics_casing(self):
@ -46,25 +48,28 @@ class CouchbaseTestCase(unittest.TestCase):
metrics = self.check.get_metrics()
camel_cased_metrics = [u'couchbase.hdd.used_by_data',
u'couchbase.ram.used_by_data',
u'couchbase.ram.quota_total',
u'couchbase.ram.quota_used',
]
camel_cased_metrics = [u'couchbase.hdd.used_by_data',
u'couchbase.ram.used_by_data',
u'couchbase.ram.quota_total',
u'couchbase.ram.quota_used',
]
found_metrics = [k[0] for k in metrics if k[0] in camel_cased_metrics]
self.assertEqual(found_metrics.sort(), camel_cased_metrics.sort())
@attr('couchbase')
def test_metrics(self):
raise SkipTest("Skipped for now as it's hard to configure couchbase on travis")
self.check.check(self.config['instances'][0])
metrics = self.check.get_metrics()
self.assertTrue(type(metrics) == type([]), metrics)
self.assertTrue(len(metrics) > 3)
self.assertTrue(len([k for k in metrics if "instance:http://localhost:8091" in k[3]['dimensions']]) > 3)
self.assertTrue(
len([k for k in metrics if "instance:http://localhost:8091" in k[3]['dimensions']]) > 3)
self.assertTrue(len([k for k in metrics if -1 != k[0].find('by_node')]) > 1, 'Unable to find any per node metrics')
self.assertTrue(len([k for k in metrics if -1 != k[0].find('by_bucket')]) > 1, 'Unable to find any per bucket metrics')
self.assertTrue(len([k for k in metrics if -1 != k[0].find('by_node')])
> 1, 'Unable to find any per node metrics')
self.assertTrue(len([k for k in metrics if -1 != k[0].find('by_bucket')])
> 1, 'Unable to find any per bucket metrics')

View File

@ -11,14 +11,16 @@ from collector.dogstream import cassandra, supervisord_log, common
log = logging.getLogger('datadog.test')
NAGIOS_TEST_HOST = os.path.join(os.path.dirname(__file__), "host-perfdata")
NAGIOS_TEST_SVC = os.path.join(os.path.dirname(__file__), "service-perfdata")
NAGIOS_TEST_HOST_TEMPLATE="[HOSTPERFDATA]\t$TIMET$\t$HOSTNAME$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$"
NAGIOS_TEST_SVC_TEMPLATE="[SERVICEPERFDATA]\t$TIMET$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$"
NAGIOS_TEST_HOST_TEMPLATE = "[HOSTPERFDATA]\t$TIMET$\t$HOSTNAME$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$"
NAGIOS_TEST_SVC_TEMPLATE = "[SERVICEPERFDATA]\t$TIMET$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$"
def parse_ancient_function_plugin(logger, line):
"""Ancient stateless parser"""
res = line.split()
res[3] = {'metric_type': 'gauge'}
def parse_function_plugin(logger, line, state):
"""Simple stateful parser"""
try:
@ -31,13 +33,16 @@ def parse_function_plugin(logger, line, state):
res[3] = {'metric_type': 'counter'}
return tuple(res)
class ParseClassPlugin(object):
"""Class-based stateful parser"""
def __init__(self, logger=None, user_args=(), **kwargs):
self.logger = logger
self.args = '.'.join(user_args)
self.acc = 0
self.logger.info('Completed initialization')
def parse_line(self, line):
self.logger.info('Parsing line %r; counter is %r', line, self.acc)
self.acc += 1
@ -47,21 +52,24 @@ class ParseClassPlugin(object):
res[3] = {'metric_type': 'counter'}
return tuple(res)
import time
from datetime import datetime
import calendar
log_event_pattern = re.compile("".join([
r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ", # iso timestamp
r"\[(?P<alert_type>(ERROR)|(RECOVERY))\] - ", # alert type
r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ", # iso timestamp
r"\[(?P<alert_type>(ERROR)|(RECOVERY))\] - ", # alert type
r"(?P<msg_title>(?P<host>[^ ]*).*)"
]))
alert_types = {
"ERROR": "error",
"RECOVERY": "success"
}
def parse_events(logger, line):
""" Expecting lines like this:
""" Expecting lines like this:
2012-05-14 12:46:01 [ERROR] - host0 is down (broke its collarbone)
"""
match = log_event_pattern.match(line)
@ -69,22 +77,25 @@ def parse_events(logger, line):
groups = match.groupdict()
groups.update({
'alert_type': alert_types.get(groups['alert_type'], ''),
'timestamp': calendar.timegm(datetime.strptime(groups['timestamp'], '%Y-%m-%d %H:%M:%S').timetuple()),
'timestamp': calendar.timegm(
datetime.strptime(groups['timestamp'], '%Y-%m-%d %H:%M:%S').timetuple()),
'msg_text': line
})
})
return groups
else:
return None
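Feeding the docstring's sample line through parse_events gives the normalized event dict (the parser is the function defined just above; values noted in comments):

import logging

line = '2012-05-14 12:46:01 [ERROR] - host0 is down (broke its collarbone)'
event = parse_events(logging.getLogger('test.dogstream'), line)
# event['alert_type'] == 'error'
# event['host'] == 'host0'
# event['msg_title'] == 'host0 is down (broke its collarbone)'
# event['msg_text'] == line
# event['timestamp'] is the UTC epoch for 2012-05-14 12:46:01
print(event)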
def repr_event_parser(logger, line):
return eval(line)
class TailTestCase(unittest.TestCase):
def setUp(self):
self.log_file = NamedTemporaryFile()
self.logger = logging.getLogger('test.dogstream')
def _write_log(self, log_data):
for data in log_data:
print >> self.log_file, data
@ -93,6 +104,7 @@ class TailTestCase(unittest.TestCase):
def tearDown(self):
self.log_file.close()
class TestDogstream(TailTestCase):
gauge = {'metric_type': 'gauge'}
counter = {'metric_type': 'counter'}
@ -107,7 +119,7 @@ class TestDogstream(TailTestCase):
log.info("Test config: %s" % self.config)
self.dogstream = Dogstreams.init(self.logger, self.config)
self.maxDiff = None
def test_dogstream_gauge(self):
log_data = [
# bucket 0
@ -122,21 +134,21 @@ class TestDogstream(TailTestCase):
('test.metric.a', '1000000006', '7', 'metric_type=gauge'),
('test.metric.a', '1000000007', '8', 'metric_type=gauge'),
]
expected_output = {
"dogstream": [
('test.metric.a', 1000000000, 5.0, self.gauge),
('test.metric.a', 1000000005, 8.0, self.gauge),
]
}
self._write_log((' '.join(data) for data in log_data))
actual_output = self.dogstream.check(self.config, move_end=False)
self.assertEquals(expected_output, actual_output)
for metric, timestamp, val, attr in expected_output['dogstream']:
assert isinstance(val, float)
def test_dogstream_counter(self):
log_data = [
# bucket 0
@ -151,14 +163,14 @@ class TestDogstream(TailTestCase):
('test.metric.a', '1000000006', '7', 'metric_type=counter'),
('test.metric.a', '1000000007', '8', 'metric_type=counter'),
]
expected_output = {
"dogstream": [
('test.metric.a', 1000000000, 42, self.counter),
('test.metric.a', 1000000005, 27, self.counter),
]
}
self._write_log((' '.join(data) for data in log_data))
actual_output = self.dogstream.check(self.config, move_end=False)
@ -173,12 +185,10 @@ class TestDogstream(TailTestCase):
('test_metric.e 1 1000000002 metric_type=gauge'),
('test_metric.e 1000000002 10 metric_type=gauge'),
]
expected_output = {"dogstream":
[('test_metric.e', 1000000000, 10, self.gauge)]
}
expected_output = {"dogstream": [('test_metric.e', 1000000000, 10, self.gauge)]}
self._write_log(log_data)
actual_output = self.dogstream.check(self.config, move_end=False)
self.assertEquals(expected_output, actual_output)
@ -194,7 +204,9 @@ class TestDogstream(TailTestCase):
('test.metric.simple', 1100000000, 1, self.gauge)]
}
self._write_log(log_data)
plugdog = Dogstreams.init(self.logger, {'dogstreams': '%s:tests.test_datadog:parse_ancient_function_plugin' % self.log_file.name})
plugdog = Dogstreams.init(
self.logger, {
'dogstreams': '%s:tests.test_datadog:parse_ancient_function_plugin' % self.log_file.name})
actual_output = plugdog.check(self.config, move_end=False)
def test_dogstream_function_plugin(self):
@ -210,7 +222,9 @@ class TestDogstream(TailTestCase):
}
self._write_log(log_data)
statedog = Dogstreams.init(self.logger, {'dogstreams': '%s:tests.test_datadog:parse_function_plugin' % self.log_file.name})
statedog = Dogstreams.init(
self.logger,
{'dogstreams': '%s:tests.test_datadog:parse_function_plugin' % self.log_file.name})
actual_output = statedog.check(self.config, move_end=False)
self.assertEquals(expected_output, actual_output)
@ -227,7 +241,9 @@ class TestDogstream(TailTestCase):
}
self._write_log(log_data)
statedog = Dogstreams.init(self.logger, {'dogstreams': '%s:tests.test_datadog:ParseClassPlugin:foo:bar' % self.log_file.name})
statedog = Dogstreams.init(
self.logger,
{'dogstreams': '%s:tests.test_datadog:ParseClassPlugin:foo:bar' % self.log_file.name})
actual_output = statedog.check(self.config, move_end=False)
self.assertEquals(expected_output, actual_output)
@ -289,7 +305,8 @@ class TestDogstream(TailTestCase):
self._write_log(log_data)
dogstream = Dogstreams.init(self.logger, {'dogstreams': '%s:tests.test_datadog:parse_events' % self.log_file.name})
dogstream = Dogstreams.init(
self.logger, {'dogstreams': '%s:tests.test_datadog:parse_events' % self.log_file.name})
actual_output = dogstream.check(self.config, move_end=False)
self.assertEquals(expected_output, actual_output)
@ -323,7 +340,9 @@ class TestDogstream(TailTestCase):
self._write_log([repr(d) for d in log_data])
dogstream = Dogstreams.init(self.logger, {'dogstreams': '%s:tests.test_datadog:repr_event_parser' % self.log_file.name})
dogstream = Dogstreams.init(
self.logger,
{'dogstreams': '%s:tests.test_datadog:repr_event_parser' % self.log_file.name})
actual_output = dogstream.check(self.config, move_end=False)
self.assertEquals(expected_output, actual_output)
@ -347,55 +366,60 @@ class TestDogstream(TailTestCase):
event_object = EventDefaults.EVENT_OBJECT
expected_output = {
"dogstreamEvents":[
{
"timestamp": cassandra.parse_date("2012-05-12 21:10:48,058"),
"msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]"[0:common.MAX_TITLE_LEN],
"msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]",
"alert_type": alert_type,
"auto_priority": 0,
"event_type": event_type,
"aggregation_key": event_object,
"event_object": event_object,
}, {
"timestamp": cassandra.parse_date("2012-05-12 21:10:54,851"),
"msg_title": "Compacted to [/var/cassandra/a-hc-65-Data.db,]. 102,079,134 to 101,546,397",
"alert_type": alert_type,
"auto_priority": 0,
"event_type": event_type,
"aggregation_key": event_object,
"event_object": event_object,
}, {
"timestamp": cassandra.parse_date("2012-05-13 13:15:01,927"),
"msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]"[0:common.MAX_TITLE_LEN],
"msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]",
"alert_type": alert_type,
"event_type": event_type,
"auto_priority": 0,
"aggregation_key": event_object,
"event_object": event_object,
}, {
"timestamp": cassandra.parse_date("2012-05-13 13:27:17,685"),
"msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
"alert_type": alert_type,
"event_type": event_type,
"auto_priority": 0,
"aggregation_key": event_object,
"event_object": event_object,
}, {
"timestamp": cassandra.parse_date(datetime.utcnow().strftime("%Y-%m-%d") + " 13:27:17,685"),
"msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
"alert_type": alert_type,
"event_type": event_type,
"auto_priority": 0,
"aggregation_key": event_object,
"event_object": event_object,
},
]}
"dogstreamEvents": [
{
"timestamp": cassandra.parse_date("2012-05-12 21:10:48,058"),
"msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]"[
0:common.MAX_TITLE_LEN],
"msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]",
"alert_type": alert_type,
"auto_priority": 0,
"event_type": event_type,
"aggregation_key": event_object,
"event_object": event_object,
}, {
"timestamp": cassandra.parse_date("2012-05-12 21:10:54,851"),
"msg_title": "Compacted to [/var/cassandra/a-hc-65-Data.db,]. 102,079,134 to 101,546,397",
"alert_type": alert_type,
"auto_priority": 0,
"event_type": event_type,
"aggregation_key": event_object,
"event_object": event_object,
}, {
"timestamp": cassandra.parse_date("2012-05-13 13:15:01,927"),
"msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]"[
0:common.MAX_TITLE_LEN],
"msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]",
"alert_type": alert_type,
"event_type": event_type,
"auto_priority": 0,
"aggregation_key": event_object,
"event_object": event_object,
}, {
"timestamp": cassandra.parse_date("2012-05-13 13:27:17,685"),
"msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
"alert_type": alert_type,
"event_type": event_type,
"auto_priority": 0,
"aggregation_key": event_object,
"event_object": event_object,
}, {
"timestamp": cassandra.parse_date(
datetime.utcnow().strftime("%Y-%m-%d") + " 13:27:17,685"),
"msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
"alert_type": alert_type,
"event_type": event_type,
"auto_priority": 0,
"aggregation_key": event_object,
"event_object": event_object,
},
]}
self._write_log(log_data.split("\n"))
dogstream = Dogstreams.init(self.logger, {'dogstreams': '%s:dogstream.cassandra:parse_cassandra' % self.log_file.name})
dogstream = Dogstreams.init(
self.logger,
{'dogstreams': '%s:dogstream.cassandra:parse_cassandra' % self.log_file.name})
actual_output = dogstream.check(self.config, move_end=False)
self.assertEquals(expected_output, actual_output)
@ -408,7 +432,7 @@ class TestDogstream(TailTestCase):
event_type = supervisord_log.EVENT_TYPE
expected_output = {
"dogstreamEvents":[
"dogstreamEvents": [
{
"alert_type": "info", "event_type": event_type,
"aggregation_key": "monitor",
@ -420,7 +444,7 @@ class TestDogstream(TailTestCase):
"aggregation_key": "foo_bar",
"event_object": "foo_bar",
"msg_title": "success: foo_bar entered RUNNING state, "
"process has stayed up for > than 2 seconds (startsecs)",
"process has stayed up for > than 2 seconds (startsecs)",
"timestamp": int(time.mktime(datetime(2012, 7, 14, 3, 2, 47).timetuple())),
}, {
"alert_type": "error", "event_type": event_type,
@ -438,10 +462,13 @@ class TestDogstream(TailTestCase):
]}
self._write_log(log_data.split("\n"))
dogstream = Dogstreams.init(self.logger, {'dogstreams': '%s:dogstream.supervisord_log:parse_supervisord' % self.log_file.name})
dogstream = Dogstreams.init(
self.logger,
{'dogstreams': '%s:dogstream.supervisord_log:parse_supervisord' % self.log_file.name})
actual_output = dogstream.check(self.config, move_end=False)
self.assertEquals(expected_output, actual_output)
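The cassandra and supervisord fixtures above both expect a {"dogstreamEvents": [...]} payload whose entries carry timestamp, msg_title, alert_type, event_type, aggregation_key and event_object keys (plus msg_text and auto_priority in some cases). A hedged sketch of assembling one such event dict follows; the helper name and its defaults are assumptions for illustration.

def build_dogstream_event(timestamp, msg_title, event_type, event_object,
                          alert_type='info', msg_text=None):
    # Hedged sketch: assemble an event dict with the keys the expected_output
    # fixtures use; the helper itself is hypothetical.
    event = {
        'timestamp': timestamp,           # epoch seconds as an int
        'msg_title': msg_title,
        'alert_type': alert_type,
        'event_type': event_type,
        'aggregation_key': event_object,  # the fixtures aggregate on the event object
        'event_object': event_object,
    }
    if msg_text is not None:
        event['msg_text'] = msg_text
    return event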
class TestNagiosPerfData(TailTestCase):
def setUp(self):
TailTestCase.setUp(self)
@ -457,7 +484,7 @@ class TestNagiosPerfData(TailTestCase):
for data in config_data:
print >> self.nagios_config, data
self.nagios_config.flush()
def tearDown(self):
TailTestCase.tearDown(self)
self.nagios_config.close()
@ -474,72 +501,73 @@ class TestNagiosPerfData(TailTestCase):
self.assertEquals([NagiosServicePerfData], [d.__class__ for d in dogstream.dogstreams])
log_data = [
("DATATYPE::SERVICEPERFDATA",
"TIMET::1000000000",
"HOSTNAME::myhost0",
"SERVICEDESC::Pgsql Backends",
"SERVICEPERFDATA::" + " ".join([
"time=0.06",
"db0=33;180;190;0;200",
"db1=1;150;190;0;200",
"db2=0;120;290;1;200",
"db3=0;110;195;5;100"
]),
"SERVICECHECKCOMMAND::check_nrpe_1arg!check_postgres_backends",
"HOSTSTATE::UP",
"HOSTSTATETYPE::HARD",
"SERVICESTATE::OK",
"SERVICESTATETYPE::HARD",
(
"DATATYPE::SERVICEPERFDATA",
"TIMET::1000000000",
"HOSTNAME::myhost0",
"SERVICEDESC::Pgsql Backends",
"SERVICEPERFDATA::" + " ".join([
"time=0.06",
"db0=33;180;190;0;200",
"db1=1;150;190;0;200",
"db2=0;120;290;1;200",
"db3=0;110;195;5;100"
]),
"SERVICECHECKCOMMAND::check_nrpe_1arg!check_postgres_backends",
"HOSTSTATE::UP",
"HOSTSTATETYPE::HARD",
"SERVICESTATE::OK",
"SERVICESTATETYPE::HARD",
),
]
expected_output = [
('nagios.pgsql_backends.time', 1000000000, 0.06, {
'metric_type': 'gauge',
'host_name': 'myhost0',
}),
('nagios.pgsql_backends.db0', 1000000000, 33., {
('nagios.pgsql_backends.db0', 1000000000, 33., {
'metric_type': 'gauge',
'host_name': 'myhost0',
'warn': '180',
'crit': '190',
'min': '0',
'max': '200',
'min': '0',
'max': '200',
}),
('nagios.pgsql_backends.db1', 1000000000, 1., {
('nagios.pgsql_backends.db1', 1000000000, 1., {
'metric_type': 'gauge',
'host_name': 'myhost0',
'warn': '150',
'crit': '190',
'min': '0',
'max': '200',
'min': '0',
'max': '200',
}),
('nagios.pgsql_backends.db2', 1000000000, 0., {
('nagios.pgsql_backends.db2', 1000000000, 0., {
'metric_type': 'gauge',
'host_name': 'myhost0',
'warn': '120',
'crit': '290',
'min': '1',
'max': '200',
'min': '1',
'max': '200',
}),
('nagios.pgsql_backends.db3', 1000000000, 0., {
('nagios.pgsql_backends.db3', 1000000000, 0., {
'metric_type': 'gauge',
'host_name': 'myhost0',
'warn': '110',
'crit': '195',
'min': '5',
'max': '100',
'min': '5',
'max': '100',
}),
]
expected_output.sort(key=point_sorter)
self._write_log(('\t'.join(data) for data in log_data))
self._write_log(('\t'.join(data) for data in log_data))
actual_output = dogstream.check(self.agent_config, move_end=False)['dogstream']
actual_output.sort(key=point_sorter)
self.assertEquals(expected_output, actual_output)
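Each SERVICEPERFDATA token above follows the Nagios perfdata convention label=value[;warn;crit;min;max], and the expected metrics turn every token into ('nagios.<service>.<label>', timestamp, value, attributes) with the thresholds carried as string attributes. The sketch below parses a single bare-numeric token under that assumption; unit suffixes such as 'ms' or '%', which the real NagiosServicePerfData parser handles, are ignored here.

def parse_perfdata_token(token):
    # Illustrative sketch, not the collector's parser: split a token such as
    # "db0=33;180;190;0;200" into a value plus warn/crit/min/max attributes.
    label, data = token.split('=', 1)
    parts = data.split(';')
    value = float(parts[0])  # assumes a bare numeric value (no unit suffix)
    attributes = {'metric_type': 'gauge'}
    for key, raw in zip(('warn', 'crit', 'min', 'max'), parts[1:5]):
        if raw != '':
            attributes[key] = raw
    return label, value, attributes

# parse_perfdata_token('db0=33;180;190;0;200')
# -> ('db0', 33.0, {'metric_type': 'gauge', 'warn': '180', 'crit': '190', 'min': '0', 'max': '200'})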
def test_service_perfdata_special_cases(self):
from collector.checks.datadog import NagiosServicePerfData
@ -552,7 +580,8 @@ class TestNagiosPerfData(TailTestCase):
self.assertEquals([NagiosServicePerfData], [d.__class__ for d in dogstream.dogstreams])
log_data = [
( "DATATYPE::SERVICEPERFDATA",
(
"DATATYPE::SERVICEPERFDATA",
"TIMET::1000000000",
"HOSTNAME::myhost2",
"SERVICEDESC::Disk Space",
@ -573,7 +602,7 @@ class TestNagiosPerfData(TailTestCase):
"SERVICESTATETYPE::HARD",
)
]
expected_output = [
('nagios.disk_space', 1000000000, 5477., {
'metric_type': 'gauge',
@ -658,12 +687,13 @@ class TestNagiosPerfData(TailTestCase):
]
expected_output.sort(key=point_sorter)
self._write_log(('\t'.join(data) for data in log_data))
self._write_log(('\t'.join(data) for data in log_data))
actual_output = dogstream.check(self.agent_config, move_end=False)['dogstream']
actual_output.sort(key=point_sorter)
self.assertEquals(expected_output, actual_output)
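Both the expected and actual point lists are sorted with point_sorter before comparison, so the assertions do not depend on emit order. The diff does not show point_sorter itself; one plausible stand-in key (an assumption, not the project's helper) orders points by metric name, then timestamp, then value.

def example_point_key(point):
    # Assumed stand-in for point_sorter, for illustration only.
    name, timestamp, value = point[0], point[1], point[2]
    return (name, timestamp, value)

# usage: expected_output.sort(key=example_point_key)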
def test_host_perfdata(self):
from collector.checks.datadog import NagiosHostPerfData
@ -676,19 +706,20 @@ class TestNagiosPerfData(TailTestCase):
self.assertEquals([NagiosHostPerfData], [d.__class__ for d in dogstream.dogstreams])
log_data = [
("DATATYPE::HOSTPERFDATA",
"TIMET::1000000010",
"HOSTNAME::myhost1",
"HOSTPERFDATA::" + " ".join([
"rta=0.978000ms;5000.000000;5000.000000;0.000000",
"pl=0%;100;100;0",
]),
"HOSTCHECKCOMMAND::check-host-alive",
"HOSTSTATE::UP",
"HOSTSTATETYPE::HARD",
(
"DATATYPE::HOSTPERFDATA",
"TIMET::1000000010",
"HOSTNAME::myhost1",
"HOSTPERFDATA::" + " ".join([
"rta=0.978000ms;5000.000000;5000.000000;0.000000",
"pl=0%;100;100;0",
]),
"HOSTCHECKCOMMAND::check-host-alive",
"HOSTSTATE::UP",
"HOSTSTATETYPE::HARD",
),
]
expected_output = [
('nagios.host.rta', 1000000010, 0.978, {
'metric_type': 'gauge',
@ -698,7 +729,7 @@ class TestNagiosPerfData(TailTestCase):
'crit': '5000.000000',
'min': '0.000000'
}),
('nagios.host.pl', 1000000010, 0., {
('nagios.host.pl', 1000000010, 0., {
'metric_type': 'gauge',
'host_name': 'myhost1',
'unit': '%',
@ -709,7 +740,7 @@ class TestNagiosPerfData(TailTestCase):
]
expected_output.sort(key=point_sorter)
self._write_log(('\t'.join(data) for data in log_data))
self._write_log(('\t'.join(data) for data in log_data))
actual_output = dogstream.check(self.agent_config, move_end=False)['dogstream']
actual_output.sort(key=point_sorter)
@ -728,7 +759,23 @@ class TestNagiosPerfData(TailTestCase):
self.assertEquals([NagiosServicePerfData], [d.__class__ for d in dogstream.dogstreams])
actual_output = dogstream.check(self.agent_config, move_end=False)
expected_output = {'dogstream': [('nagios.current_users.users', 1339511440, 1.0, {'metric_type': 'gauge', 'warn': '20', 'host_name': 'localhost', 'crit': '50', 'min': '0'}), ('nagios.ping.pl', 1339511500, 0.0, {'warn': '20', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0', 'crit': '60', 'unit': '%'}), ('nagios.ping.rta', 1339511500, 0.065, {'warn': '100.000000', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0.000000', 'crit': '500.000000', 'unit': 'ms'}), ('nagios.root_partition', 1339511560, 2470.0, {'min': '0', 'max': '7315', 'device_name': '/', 'warn': '5852', 'metric_type': 'gauge', 'host_name': 'localhost', 'crit': '6583', 'unit': 'MB'})]}
expected_output = {'dogstream': [('nagios.current_users.users', 1339511440, 1.0,
{'metric_type': 'gauge', 'warn': '20',
'host_name': 'localhost', 'crit': '50', 'min': '0'}),
('nagios.ping.pl', 1339511500, 0.0,
{'warn': '20', 'metric_type': 'gauge',
'host_name': 'localhost', 'min': '0', 'crit': '60',
'unit': '%'}),
('nagios.ping.rta', 1339511500, 0.065,
{'warn': '100.000000', 'metric_type': 'gauge',
'host_name': 'localhost',
'min': '0.000000', 'crit': '500.000000',
'unit': 'ms'}),
('nagios.root_partition', 1339511560, 2470.0,
{'min': '0', 'max': '7315', 'device_name': '/',
'warn': '5852', 'metric_type': 'gauge',
'host_name': 'localhost', 'crit': '6583',
'unit': 'MB'})]}
self.assertEquals(expected_output, actual_output)
def test_alt_host_perfdata(self):
@ -743,9 +790,17 @@ class TestNagiosPerfData(TailTestCase):
self.assertEquals([NagiosHostPerfData], [d.__class__ for d in dogstream.dogstreams])
actual_output = dogstream.check(self.agent_config, move_end=False)
expected_output = {'dogstream': [('nagios.host.pl', 1339511440, 0.0, {'warn': '80', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0', 'crit': '100', 'unit': '%'}), ('nagios.host.rta', 1339511440, 0.048, {'warn': '3000.000000', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0.000000', 'crit': '5000.000000', 'unit': 'ms'})]}
expected_output = {'dogstream': [('nagios.host.pl', 1339511440, 0.0,
{'warn': '80', 'metric_type': 'gauge',
'host_name': 'localhost', 'min': '0', 'crit': '100',
'unit': '%'}),
('nagios.host.rta', 1339511440, 0.048,
{'warn': '3000.000000', 'metric_type': 'gauge',
'host_name': 'localhost', 'min': '0.000000',
'crit': '5000.000000', 'unit': 'ms'})]}
self.assertEquals(expected_output, actual_output)
if __name__ == '__main__':
logging.basicConfig(format="%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s")
unittest.main()
unittest.main()

View File

@ -11,6 +11,7 @@ from tests.common import load_check
PORT = 9200
MAX_WAIT = 150
class TestElastic(unittest.TestCase):
def _wait(self, url):
@ -24,17 +25,16 @@ class TestElastic(unittest.TestCase):
time.sleep(0.5)
loop += 1
if loop >= MAX_WAIT:
break
break
def setUp(self):
self.process = None
try:
# Start elasticsearch
self.process = subprocess.Popen(["elasticsearch","-f","elasticsearch"],
executable="elasticsearch",
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
self.process = subprocess.Popen(["elasticsearch", "-f", "elasticsearch"],
executable="elasticsearch",
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
# Wait for it to really start
self._wait("http://localhost:%s" % PORT)
@ -44,16 +44,17 @@ class TestElastic(unittest.TestCase):
def tearDown(self):
if self.process is not None:
self.process.terminate()
def testElasticChecksD(self):
raise SkipTest("See https://github.com/DataDog/dd-agent/issues/825")
agent_config = {'elasticsearch': 'http://localhost:%s' % PORT, 'version': '0.1', 'api_key': 'toto'}
agent_config = {'elasticsearch': 'http://localhost:%s' %
PORT, 'version': '0.1', 'api_key': 'toto'}
# Initialize the check from checks_d
c = load_check('elastic', {'init_config': {}, 'instances': {}}, agent_config)
conf = c.parse_agent_config(agent_config)
self.check = load_check('elastic', conf, agent_config)
self.check.check(conf['instances'][0])
r = self.check.get_metrics()
@ -68,16 +69,17 @@ class TestElastic(unittest.TestCase):
self.assertEquals(len([t for t in r if t[0] == "jvm.threads.peak_count"]), 1, r)
self.assertEquals(len([t for t in r if t[0] == "elasticsearch.transport.rx_count"]), 1, r)
self.assertEquals(len([t for t in r if t[0] == "elasticsearch.transport.tx_size"]), 1, r)
self.assertEquals(len([t for t in r if t[0] == "elasticsearch.transport.server_open"]), 1, r)
self.assertEquals(len([t for t in r if t[0] == "elasticsearch.thread_pool.snapshot.queue"]), 1, r)
self.assertEquals(
len([t for t in r if t[0] == "elasticsearch.transport.server_open"]), 1, r)
self.assertEquals(
len([t for t in r if t[0] == "elasticsearch.thread_pool.snapshot.queue"]), 1, r)
self.assertEquals(len([t for t in r if t[0] == "elasticsearch.active_shards"]), 1, r)
self.check.cluster_status[conf['instances'][0].get('url')] = "red"
self.check.check(conf['instances'][0])
events = self.check.get_events()
self.assertEquals(len(events),1,events)
self.assertEquals(len(events), 1, events)
if __name__ == "__main__":
unittest.main()

View File

@ -2,6 +2,7 @@ import unittest
from tests.common import load_check
from nose.plugins.skip import SkipTest
class GearmanTestCase(unittest.TestCase):
def testMetrics(self):

View File

@ -18,6 +18,7 @@ HAPROXY_OPEN_CFG = os.path.realpath(os.path.join(os.path.dirname(__file__), "hap
class HaproxyTestCase(unittest.TestCase):
def _wait(self, url):
loop = 0
while True:
@ -28,13 +29,13 @@ class HaproxyTestCase(unittest.TestCase):
authhandler = urllib2.HTTPBasicAuthHandler(passman)
opener = urllib2.build_opener(authhandler)
urllib2.install_opener(opener)
url = "%s%s" % (url,STATS_URL)
url = "%s%s" % (url, STATS_URL)
req = urllib2.Request(url)
request = urllib2.urlopen(req)
break
except Exception:
time.sleep(0.5)
loop+=1
loop += 1
if loop >= MAX_WAIT:
break
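Like the Elasticsearch fixture earlier, this HAProxy test starts a real process with subprocess.Popen and then polls an HTTP endpoint every 0.5 s, up to MAX_WAIT attempts, before running the check. Below is a condensed sketch of that wait loop; the basic-auth handling from the test is omitted and the helper name is illustrative.

import time
import urllib2  # these tests are Python 2 style

def wait_for_http(url, max_wait=150):
    # Condensed sketch of the fixtures' wait-until-up loop: poll the URL every
    # 0.5 s and give up after max_wait attempts.
    for _ in range(max_wait):
        try:
            urllib2.urlopen(url)
            return True
        except Exception:
            time.sleep(0.5)
    return False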
@ -56,10 +57,10 @@ class HaproxyTestCase(unittest.TestCase):
self.cfg.write(open(config_fn).read())
self.cfg.flush()
# Start haproxy
self.process = subprocess.Popen(["haproxy","-d", "-f", self.cfg.name],
executable="haproxy",
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
self.process = subprocess.Popen(["haproxy", "-d", "-f", self.cfg.name],
executable="haproxy",
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
# Wait for it to really start
self._wait("http://localhost:3834/stats")
@ -95,9 +96,9 @@ class HaproxyTestCase(unittest.TestCase):
self.assertTrue(len(metrics) > 0)
self.assertEquals(len([t for t in metrics
if t[0] == "haproxy.backend.bytes.in_rate"]), 4, metrics)
if t[0] == "haproxy.backend.bytes.in_rate"]), 4, metrics)
self.assertEquals(len([t for t in metrics
if t[0] == "haproxy.frontend.session.current"]), 1, metrics)
if t[0] == "haproxy.frontend.session.current"]), 1, metrics)
inst = config['instances'][0]
data = self.check._fetch_data(inst['url'], inst['username'], inst['password'])
@ -158,9 +159,9 @@ class HaproxyTestCase(unittest.TestCase):
self.assertTrue(len(metrics) > 0)
self.assertEquals(len([t for t in metrics
if t[0] == "haproxy.backend.bytes.in_rate"]), 4, metrics)
if t[0] == "haproxy.backend.bytes.in_rate"]), 4, metrics)
self.assertEquals(len([t for t in metrics
if t[0] == "haproxy.frontend.session.current"]), 1, metrics)
if t[0] == "haproxy.frontend.session.current"]), 1, metrics)
def tearDown(self):
if self.process is not None:
@ -169,4 +170,3 @@ class HaproxyTestCase(unittest.TestCase):
if __name__ == "__main__":
unittest.main()

View File

@ -19,6 +19,7 @@ instances:
class IISTestCase(unittest.TestCase):
@attr('windows')
def testIIS(self):
raise SkipTest('Requires IIS and wmi')

View File

@ -13,7 +13,10 @@ from monagent.collector.jmxfetch import JMXFetch
STATSD_PORT = 8129
class DummyReporter(threading.Thread):
def __init__(self, metrics_aggregator):
threading.Thread.__init__(self)
self.finished = threading.Event()
@ -23,7 +26,6 @@ class DummyReporter(threading.Thread):
self.finished = False
self.start()
def run(self):
while not self.finished:
time.sleep(self.interval)
@ -34,26 +36,26 @@ class DummyReporter(threading.Thread):
if metrics:
self.metrics = metrics
class JMXTestCase(unittest.TestCase):
def setUp(self):
aggregator = MetricsAggregator("test_host")
self.server = Server(aggregator, "localhost", STATSD_PORT)
pid_file = PidFile('dogstatsd')
self.reporter = DummyReporter(aggregator)
self.t1 = threading.Thread(target=self.server.start)
self.t1.start()
confd_path = os.path.realpath(os.path.join(os.path.abspath(__file__), "..", "jmx_yamls"))
JMXFetch.init(confd_path, {'dogstatsd_port':STATSD_PORT}, get_logging_config(), 15)
JMXFetch.init(confd_path, {'dogstatsd_port': STATSD_PORT}, get_logging_config(), 15)
def tearDown(self):
self.server.stop()
self.reporter.finished = True
JMXFetch.stop()
def testCustomJMXMetric(self):
raise SkipTest('Requires running JMX')
count = 0
@ -67,11 +69,13 @@ class JMXTestCase(unittest.TestCase):
self.assertTrue(type(metrics) == type([]))
self.assertTrue(len(metrics) > 0)
self.assertEquals(len([t for t in metrics if t['metric'] == "my.metric.buf" and "instance:jmx_instance1" in t['dimensions']]), 2, metrics)
self.assertTrue(len([t for t in metrics if 'type:ThreadPool' in t['dimensions'] and "instance:jmx_instance1" in t['dimensions'] and "jmx.catalina" in t['metric']]) > 8, metrics)
self.assertTrue(len([t for t in metrics if "jvm." in t['metric'] and "instance:jmx_instance1" in t['dimensions']]) == 7, metrics)
self.assertEquals(len([t for t in metrics if t[
'metric'] == "my.metric.buf" and "instance:jmx_instance1" in t['dimensions']]), 2, metrics)
self.assertTrue(len([t for t in metrics if 'type:ThreadPool' in t[
'dimensions'] and "instance:jmx_instance1" in t['dimensions'] and "jmx.catalina" in t['metric']]) > 8, metrics)
self.assertTrue(len([t for t in metrics if "jvm." in t['metric']
and "instance:jmx_instance1" in t['dimensions']]) == 7, metrics)
if __name__ == "__main__":
unittest.main()

View File

@ -6,8 +6,10 @@ from collector.checks import LaconicFilter
class TestLaconic(unittest.TestCase):
"""Verify that we only output messages once
"""
def setUp(self):
self.l = logging.getLogger("test_laconic")
self.sio = StringIO()
@ -35,11 +37,13 @@ class TestLaconic(unittest.TestCase):
def testRepeatingErrors(self):
for i in range(10):
self.l.error("Cannot find nagios.log")
self.assertEquals(self.sio.getvalue().count("Cannot find nagios.log"), 1, self.sio.getvalue())
self.assertEquals(
self.sio.getvalue().count("Cannot find nagios.log"), 1, self.sio.getvalue())
for i in range(10):
self.l.warn("Cannot find ganglia.log")
self.assertEquals(self.sio.getvalue().count("Cannot find ganglia.log"), 1, self.sio.getvalue())
self.assertEquals(
self.sio.getvalue().count("Cannot find ganglia.log"), 1, self.sio.getvalue())
for i in range(10):
try:
@ -47,7 +51,8 @@ class TestLaconic(unittest.TestCase):
except Exception:
self.l.exception("Caught!")
self.assertEquals(self.sio.getvalue().count("Ka-boom"), 2) # once for the traceback, once for the message
# once for the traceback, once for the message
self.assertEquals(self.sio.getvalue().count("Ka-boom"), 2)
def testNonRepeat(self):
for i in range(10):
@ -55,7 +60,6 @@ class TestLaconic(unittest.TestCase):
self.assertEquals(self.sio.getvalue().count(" nagios.log"), 10)
self.assertEquals(self.sio.getvalue().count(" 7"), 1)
def testBlowUp(self):
"""Try to use a lot of memory"""
for i in range(2 * self.laconic.LACONIC_MEM_LIMIT + 7):
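The TestLaconic cases pin down LaconicFilter's behaviour: a repeated warning or error is emitted only once (an exception shows up twice in the output, once for the message and once inside the traceback), distinct messages all pass through, and LACONIC_MEM_LIMIT bounds how many message signatures are remembered. A hedged sketch of a deduplicating logging.Filter with that general shape follows; it is not the collector's actual class.

import logging

class ExampleLaconicFilter(logging.Filter):
    # Hedged sketch of a "log each distinct message only once" filter with a
    # bounded memory of seen messages; the real LaconicFilter lives in collector.checks.
    MEM_LIMIT = 1024  # assumed bound, analogous to LACONIC_MEM_LIMIT

    def __init__(self, name=''):
        logging.Filter.__init__(self, name)
        self.seen = set()

    def filter(self, record):
        message = record.getMessage()
        if message in self.seen:
            return 0  # drop exact repeats
        if len(self.seen) >= self.MEM_LIMIT:
            self.seen.clear()  # crude reset to cap memory use
        self.seen.add(message)
        return 1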

View File

@ -8,6 +8,7 @@ from nose.plugins.skip import SkipTest
class TestMemCache(unittest.TestCase):
def setUp(self):
self.agent_config = {
"memcache_server": "localhost",
@ -20,7 +21,7 @@ class TestMemCache(unittest.TestCase):
def _countConnections(self, port):
pid = os.getpid()
p1 = Popen(['lsof', '-a', '-p%s' %
pid, '-i4'], stdout=PIPE)
pid, '-i4'], stdout=PIPE)
p2 = Popen(["grep", ":%s" % port], stdin=p1.stdout, stdout=PIPE)
p3 = Popen(["wc", "-l"], stdin=p2.stdout, stdout=PIPE)
output = p3.communicate()[0]
@ -50,7 +51,8 @@ class TestMemCache(unittest.TestCase):
self.assertEquals(len([t for t in r if t[0] == "memcache.total_items"]), 3, r)
# Check that we got 21 metrics for a specific host
self.assertEquals(len([t for t in r if t[3].get('dimensions') == {"instance": mythirdtag}]), 21, r)
self.assertEquals(
len([t for t in r if t[3].get('dimensions') == {"instance": mythirdtag}]), 21, r)
def testDimensions(self):
raise SkipTest('Requires mcache')
@ -68,7 +70,8 @@ class TestMemCache(unittest.TestCase):
r = self.c.get_metrics()
# Check the dimensions
self.assertEquals(len([t for t in r if t[3].get('dimensions') == {"regular_old": "dimensions"}]), 21, r)
self.assertEquals(
len([t for t in r if t[3].get('dimensions') == {"regular_old": "dimensions"}]), 21, r)
conf = {
'memcache_server': 'localhost',
@ -84,7 +87,8 @@ class TestMemCache(unittest.TestCase):
r = self.c.get_metrics()
# Check the dimensions
self.assertEquals(len([t for t in r if t[3].get('dimensions') == {"instance": "localhost_11211"}]), 21, r)
self.assertEquals(
len([t for t in r if t[3].get('dimensions') == {"instance": "localhost_11211"}]), 21, r)
def testDummyHost(self):
new_conf = self.c.parse_agent_config({"memcache_instance_1": "dummy:11211:myothertag"})

View File

@ -12,21 +12,27 @@ default_target = 'DEFAULT'
specified_target = 'SPECIFIED'
has_been_mutated = False
class TestModuleLoad(unittest.TestCase):
def setUp(self):
sys.modules[__name__].has_been_mutated = True
if 'tests.target_module' in sys.modules:
del sys.modules['tests.target_module']
def tearDown(self):
sys.modules[__name__].has_been_mutated = False
def test_cached_module(self):
"""Modules already in the cache should be reused"""
self.assertTrue(modules.load('%s:has_been_mutated' % __name__))
def test_cache_population(self):
"""Python module cache should be populated"""
self.assertTrue(not 'tests.target_module' in sys.modules)
modules.load('tests.target_module')
self.assertTrue('tests.target_module' in sys.modules)
def test_modname_load_default(self):
"""When the specifier contains no module name, any provided default
should be used"""
@ -36,6 +42,7 @@ class TestModuleLoad(unittest.TestCase):
'default_target'),
'DEFAULT'
)
def test_modname_load_specified(self):
"""When the specifier contains a module name, any provided default
should be overridden"""
@ -45,6 +52,7 @@ class TestModuleLoad(unittest.TestCase):
'default_target'),
'SPECIFIED'
)
def test_pathname_load_finds_package(self):
""""Loading modules by absolute path should correctly set the name of
the loaded module to include any package containing it."""
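The module-loading tests above exercise a "package.module[:target]" specifier: when no target is given, the provided default attribute name is used (yielding 'DEFAULT'), an explicit target overrides it (yielding 'SPECIFIED'), and loading by dotted name populates sys.modules. The sketch below shows that resolution logic under the assumption that the specifier splits on a single colon; absolute-path loading, covered by test_pathname_load_finds_package, is not handled here.

import importlib

def example_load(specifier, default_attribute=None):
    # Rough sketch (assumption): resolve "package.module[:attribute]" to an
    # attribute, falling back to default_attribute when no target is given.
    module_name, _, attribute = specifier.partition(':')
    target = attribute or default_attribute
    module = importlib.import_module(module_name)
    return getattr(module, target) if target else module

# example_load('tests.target_module', 'default_target') would resolve the
# module's default_target attribute ('DEFAULT' in these tests).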

View File

@ -14,7 +14,9 @@ PORT1 = 37017
PORT2 = 37018
MAX_WAIT = 150
class TestMongo(unittest.TestCase):
def wait4mongo(self, process, port):
# Somehow process.communicate() hangs
out = process.stdout
@ -68,8 +70,10 @@ class TestMongo(unittest.TestCase):
def tearDown(self):
try:
if "p1" in dir(self): self.p1.terminate()
if "p2" in dir(self): self.p2.terminate()
if "p1" in dir(self):
self.p1.terminate()
if "p2" in dir(self):
self.p2.terminate()
except Exception:
logging.getLogger().exception("Cannot terminate mongod instances")
@ -79,7 +83,7 @@ class TestMongo(unittest.TestCase):
'instances': [{
'server': "mongodb://localhost:%s/test" % PORT1
},
{
{
'server': "mongodb://localhost:%s/test" % PORT2
}]
}
@ -111,7 +115,7 @@ class TestMongo(unittest.TestCase):
for m in metrics:
metric_name = m[0]
if metric_name in metric_val_checks:
self.assertTrue( metric_val_checks[metric_name]( m[2] ) )
self.assertTrue(metric_val_checks[metric_name](m[2]))
# Run the check against our running server
self.check.check(self.config['instances'][1])
@ -129,7 +133,7 @@ class TestMongo(unittest.TestCase):
for m in metrics:
metric_name = m[0]
if metric_name in metric_val_checks:
self.assertTrue( metric_val_checks[metric_name]( m[2] ) )
self.assertTrue(metric_val_checks[metric_name](m[2]))
def testMongoOldConfig(self):
raise SkipTest('Requires MongoDB')
@ -173,7 +177,7 @@ class TestMongo(unittest.TestCase):
for m in metrics:
metric_name = m[0]
if metric_name in metric_val_checks:
self.assertTrue( metric_val_checks[metric_name]( m[2] ) )
self.assertTrue(metric_val_checks[metric_name](m[2]))
# Test the second mongodb instance
self.check = load_check('mongo', conf2, self.agent_config2)
@ -194,7 +198,7 @@ class TestMongo(unittest.TestCase):
for m in metrics:
metric_name = m[0]
if metric_name in metric_val_checks:
self.assertTrue( metric_val_checks[metric_name]( m[2] ) )
self.assertTrue(metric_val_checks[metric_name](m[2]))
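Each Mongo test walks the collected metrics and applies metric_val_checks, a dict mapping metric names to predicates run against the point's value m[2]. The check table itself sits outside these hunks, so the entries below are placeholders that only illustrate the pattern.

metric_val_checks = {
    # placeholder metric names and predicates, for illustration only
    'mongodb.connections.current': lambda value: value >= 1,
    'mongodb.uptime': lambda value: value >= 0,
}

sample_metrics = [('mongodb.uptime', 1000000000, 3.0, {'metric_type': 'gauge'})]
for m in sample_metrics:
    metric_name = m[0]
    if metric_name in metric_val_checks:
        assert metric_val_checks[metric_name](m[2])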
if __name__ == '__main__':
unittest.main()

Some files were not shown because too many files have changed in this diff Show More