Merge pull request #17 from tkuhlman/feature/cleanup

Feature/cleanup
This commit is contained in:
Tim Kuhlman 2014-06-30 16:54:27 -06:00
commit 52ed313d0c
25 changed files with 1338 additions and 1324 deletions

View File

@ -1,578 +1 @@
"""Base class for Checks.
If you are writing your own checks you should subclass the AgentCheck class.
The Check class is being deprecated so don't write new checks with it.
"""
import logging
import re
import time
import os
import traceback
from pprint import pprint
from monagent.common import check_status
from monagent.common.exceptions import NaN, CheckException, Infinity, UnknownValue
from monagent.common.util import LaconicFilter, get_os, get_hostname
from monagent.common.config import get_confd_path
log = logging.getLogger(__name__)
# Constants
# todo convert all checks to the new interface then remove this. Is the LaconicFilter on logs used elsewhere?
#==============================================================================
# DEPRECATED
# ------------------------------
# If you are writing your own check, you should inherit from AgentCheck
# and not this class. This class will be removed in a future version
# of the agent.
#==============================================================================
class Check(object):
    """DEPRECATED (abstract) base class for checks.

    New checks should subclass AgentCheck instead.  This class can:
      * store 1 (and only 1) sample for gauges per metric/dimension combination
      * compute per-second rates for counters
      * only log error messages once (instead of each time they occur)
    """

    def __init__(self, logger, agent_config=None):
        # Samples are indexed by metric_name, then by a key combining the
        # dimensions and the device name.  Dimensions are stored as a sorted
        # tuple of items because dicts/lists are not hashable:
        #   metric_name: {((("dim", "val"), ...), device): [(ts, value, host, device), ...]}
        self.agent_config = agent_config
        self._sample_store = {}
        self._counters = {}  # metric_name -> True when the metric is a counter
        self.logger = logger
        try:
            self.logger.addFilter(LaconicFilter())
        except Exception:
            self.logger.exception("Trying to install laconic log filter and failed")

    @staticmethod
    def normalize(metric, prefix=None):
        """Turn a metric into a well-formed metric name: prefix.b.c

        :param metric: The metric name to normalize
        :param prefix: Optional prefix prepended to the normalized name
        """
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)
        if prefix is not None:
            return prefix + "." + name
        else:
            return name

    @staticmethod
    def normalize_device_name(device_name):
        """Lower-case a device name and replace spaces with underscores."""
        return device_name.strip().lower().replace(' ', '_')

    def counter(self, metric):
        """Treat the metric as a counter, i.e. compute its per second derivative.

        ACHTUNG: Resets previous values associated with this metric.
        """
        self._counters[metric] = True
        self._sample_store[metric] = {}

    def is_counter(self, metric):
        """Is this metric a counter?"""
        return metric in self._counters

    def gauge(self, metric):
        """Treat the metric as a gauge, i.e. keep the data as is.

        ACHTUNG: Resets previous values associated with this metric.
        """
        self._sample_store[metric] = {}

    def is_metric(self, metric):
        """Has this metric been declared (as a gauge or a counter)?"""
        return metric in self._sample_store

    def is_gauge(self, metric):
        """Is this metric a gauge (declared but not a counter)?"""
        return self.is_metric(metric) and not self.is_counter(metric)

    def get_metric_names(self):
        """Get all declared metric names."""
        return self._sample_store.keys()

    def save_gauge(self, metric, value, timestamp=None, dimensions=None, hostname=None, device_name=None):
        """Save a gauge value, declaring the metric as a gauge if needed."""
        if not self.is_gauge(metric):
            self.gauge(metric)
        self.save_sample(metric, value, timestamp, dimensions, hostname, device_name)

    def save_sample(self, metric, value, timestamp=None, dimensions=None, hostname=None, device_name=None):
        """Save a simple sample, evicting old values if needed.

        :raises CheckException: if the metric was never declared, or dimensions
            is not a dict
        :raises NaN: if the value cannot be cast to a number
        """
        # Fixed import path: the rest of this module imports from
        # monagent.common, not the bare "common" package.
        from monagent.common.util import cast_metric_val

        if dimensions is None:
            dimensions = {}
        if timestamp is None:
            timestamp = time.time()
        if metric not in self._sample_store:
            raise CheckException("Saving a sample for an undefined metric: %s" % metric)
        try:
            value = cast_metric_val(value)
        except ValueError as ve:
            raise NaN(ve)

        # Validate dimensions (they are sorted below so that equal dicts map
        # to equal keys).  The "is not None" guard was dead code: dimensions
        # has already been defaulted to {} above.
        if not isinstance(dimensions, dict):
            raise CheckException("Dimensions must be a dictionary")

        # Data eviction rules: gauges keep only the latest sample; counters
        # keep the last two so a rate can be computed.
        key = (tuple(sorted(dimensions.items())), device_name)
        if self.is_gauge(metric):
            self._sample_store[metric][key] = ((timestamp, value, hostname, device_name), )
        elif self.is_counter(metric):
            if self._sample_store[metric].get(key) is None:
                self._sample_store[metric][key] = [(timestamp, value, hostname, device_name)]
            else:
                self._sample_store[metric][key] = self._sample_store[metric][key][-1:] + \
                    [(timestamp, value, hostname, device_name)]
        else:
            raise CheckException("%s must be either gauge or counter, skipping sample at %s" %
                                 (metric, time.ctime(timestamp)))

        if self.is_gauge(metric):
            # store[metric][dimensions] = (ts, val) - only 1 value allowed
            assert len(self._sample_store[metric][key]) == 1, self._sample_store[metric]
        elif self.is_counter(metric):
            assert len(self._sample_store[metric][key]) in (1, 2), self._sample_store[metric]

    @classmethod
    def _rate(cls, sample1, sample2):
        """Simple rate between two (ts, value, hostname, device_name) samples.

        :raises Infinity: when both samples share the same timestamp
        :raises UnknownValue: when the counter went backwards
        :raises NaN: for any other failure
        """
        try:
            interval = sample2[0] - sample1[0]
            if interval == 0:
                raise Infinity()
            delta = sample2[1] - sample1[1]
            if delta < 0:
                raise UnknownValue()
            return (sample2[0], delta / interval, sample2[2], sample2[3])
        except Infinity:
            raise
        except UnknownValue:
            raise
        except Exception as e:
            raise NaN(e)

    def get_sample_with_timestamp(self, metric, dimensions=None, device_name=None, expire=True):
        """Get (timestamp-epoch-style, value, hostname, device_name).

        For counters the rate between the last two samples is returned and,
        when expire is True, the older sample is dropped.

        :raises UnknownValue: for unknown metrics or counters without enough samples
        """
        if dimensions is None:
            dimensions = {}
        # Build the same key used by save_sample
        key = (tuple(sorted(dimensions.items())), device_name)
        # Never seen this metric
        if metric not in self._sample_store:
            raise UnknownValue()
        # Not enough values to compute a rate
        elif self.is_counter(metric) and len(self._sample_store[metric][key]) < 2:
            raise UnknownValue()
        elif self.is_counter(metric) and len(self._sample_store[metric][key]) >= 2:
            res = self._rate(self._sample_store[metric][key][-2], self._sample_store[metric][key][-1])
            if expire:
                del self._sample_store[metric][key][:-1]
            return res
        elif self.is_gauge(metric) and len(self._sample_store[metric][key]) >= 1:
            return self._sample_store[metric][key][-1]
        else:
            raise UnknownValue()

    def get_sample(self, metric, dimensions=None, device_name=None, expire=True):
        """Return the last value for that metric."""
        x = self.get_sample_with_timestamp(metric, dimensions, device_name, expire)
        assert isinstance(x, tuple) and len(x) == 4, x
        return x[1]

    def get_samples_with_timestamps(self, expire=True):
        """Return all values {metric: (ts, value, ...)} for non-tagged metrics."""
        values = {}
        for m in self._sample_store:
            try:
                values[m] = self.get_sample_with_timestamp(m, expire=expire)
            except Exception:
                # Best effort: skip metrics without a usable sample
                pass
        return values

    def get_samples(self, expire=True):
        """Return all values {metric: value} for non-tagged metrics."""
        values = {}
        for m in self._sample_store:
            try:
                # Discard the timestamp
                values[m] = self.get_sample_with_timestamp(m, expire=expire)[1]
            except Exception:
                pass
        return values

    def get_metrics(self, expire=True):
        """Get all metrics, including the ones that are tagged.

        This is the preferred method to retrieve metrics.

        @return the list of samples
        @rtype [(metric_name, timestamp, value, {"dimensions": {"name1": "key1"}}), ...]
        """
        metrics = []
        for m in self._sample_store:
            try:
                for key in self._sample_store[m]:
                    dimensions_list, device_name = key
                    dimensions = dict(dimensions_list)
                    try:
                        ts, val, hostname, device_name = self.get_sample_with_timestamp(m, dimensions, device_name, expire)
                    except UnknownValue:
                        continue
                    attributes = {}
                    if dimensions_list:
                        attributes['dimensions'] = dimensions
                    if hostname:
                        attributes['host_name'] = hostname
                    if device_name:
                        attributes['device_name'] = device_name
                    metrics.append((m, int(ts), val, attributes))
            except Exception:
                # Best effort: one bad metric must not block the others
                pass
        return metrics
class AgentCheck(object):
    """Base class for agent checks; subclasses implement check()."""

    def __init__(self, name, init_config, agent_config, instances=None):
        """Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agent_config: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        # Imported lazily, only when a check is actually constructed
        # (presumably to avoid an import cycle -- TODO confirm).
        from monagent.common.aggregator import MetricsAggregator
        self.name = name
        self.init_config = init_config
        self.agent_config = agent_config
        self.hostname = get_hostname(agent_config)
        self.log = logging.getLogger('%s.%s' % (__name__, name))
        self.aggregator = MetricsAggregator(self.hostname,
                                            recent_point_threshold=agent_config.get('recent_point_threshold', None))
        # Events and warnings accumulate until get_events()/get_warnings()
        self.events = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None  # cache for get_library_info()

    def instance_count(self):
        """Return the number of instances that are configured for this check."""
        return len(self.instances)

    def gauge(self, metric, value, dimensions=None, hostname=None, device_name=None, timestamp=None):
        """Record the value of a gauge, with optional dimensions, hostname and
        device name.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, dimensions, hostname, device_name, timestamp)

    def increment(self, metric, value=1, dimensions=None, hostname=None, device_name=None):
        """Increment a counter with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to increment by
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.increment(metric, value, dimensions, hostname, device_name)

    def decrement(self, metric, value=-1, dimensions=None, hostname=None, device_name=None):
        """Decrement a counter with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to decrement by
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.decrement(metric, value, dimensions, hostname, device_name)

    def rate(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """Submit a point for a metric that will be calculated as a rate on flush.

        Values will persist across each call to `check` if there are not
        enough points to generate a rate on the flush.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.rate(metric, value, dimensions, hostname, device_name)

    def histogram(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """Sample a histogram value, with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.histogram(metric, value, dimensions, hostname, device_name)

    def set(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """Sample a set value, with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value for the set
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.set(metric, value, dimensions, hostname, device_name)

    def event(self, event):
        """Save an event.

        :param event: The event payload as a dictionary. Has the following
            structure:
            {
                "timestamp": int, the epoch timestamp for the event,
                "event_type": string, the event time name,
                "api_key": string, the api key of the account to associate the event with,
                "msg_title": string, the title of the event,
                "msg_text": string, the text body of the event,
                "alert_type": (optional) string, one of ('error', 'warning', 'success', 'info').
                    Defaults to 'info'.
                "source_type_name": (optional) string, the source type name,
                "host": (optional) string, the name of the host,
                "dimensions": (optional) a dictionary of dimensions to associate with this event
            }
        """
        # Default the api_key from the agent configuration when missing
        if event.get('api_key') is None:
            event['api_key'] = self.agent_config['api_key']
        self.events.append(event)

    def has_events(self):
        """Check whether the check has saved any events.

        @return whether or not the check has saved any events
        @rtype boolean
        """
        return len(self.events) > 0

    def get_metrics(self):
        """Get all metrics, including the ones that are tagged.

        @return the list of samples
        @rtype list of Measurement objects from monagent.common.metrics
        """
        return self.aggregator.flush()

    def get_events(self):
        """Return and clear the list of the events saved by the check, if any.

        @return the list of events saved by this check
        @rtype list of event dictionaries
        """
        events = self.events
        self.events = []
        return events

    def has_warnings(self):
        """Check whether the instance run created any warnings."""
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """Add a warning message that will be printed in the info page.

        :param warning_message: String. Warning message to be displayed
        """
        self.warnings.append(warning_message)

    def get_library_info(self):
        """Return the library version info for this check, caching the result.

        @return whatever get_library_versions() returns, or None when the
            check does not implement it
        """
        if self.library_versions is not None:
            return self.library_versions
        try:
            self.library_versions = self.get_library_versions()
        except NotImplementedError:
            pass
        # Bug fix: the freshly computed value was previously never returned,
        # so the first call always yielded None.
        return self.library_versions

    def get_library_versions(self):
        """Should return a string that shows which version
        of the needed libraries are used."""
        raise NotImplementedError

    def get_warnings(self):
        """Return and clear the list of warnings messages to be displayed in
        the info page."""
        warnings = self.warnings
        self.warnings = []
        return warnings

    def run(self):
        """Run all instances.

        @return a list of check_status.InstanceStatus, one per instance
        """
        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                self.check(instance)
                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(i,
                                                                  check_status.STATUS_WARNING,
                                                                  warnings=self.get_warnings())
                else:
                    instance_status = check_status.InstanceStatus(i, check_status.STATUS_OK)
            except Exception as e:
                # Record the failure but keep running the remaining instances
                self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
                instance_status = check_status.InstanceStatus(i,
                                                              check_status.STATUS_ERROR,
                                                              error=e,
                                                              tb=traceback.format_exc())
            instance_statuses.append(instance_status)
        return instance_statuses

    def check(self, instance):
        """Overridden by the check class. This will be called to run the check.

        :param instance: A dict with the instance information. This will vary
            depending on your config structure.
        """
        raise NotImplementedError()

    @staticmethod
    def stop():
        """To be executed when the agent is being stopped to clean resources."""
        pass

    @classmethod
    def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None, check_name=None):
        """A method used for testing your check without running the agent.

        Either path_to_yaml or yaml_text must be supplied.
        @return (check instance, list of instance configs)
        """
        import yaml
        try:
            from yaml import CLoader as Loader
        except ImportError:
            from yaml import Loader

        if path_to_yaml:
            check_name = os.path.basename(path_to_yaml).split('.')[0]
            try:
                f = open(path_to_yaml)
            except IOError:
                raise Exception('Unable to open yaml config: %s' % path_to_yaml)
            try:
                # Close the handle even if read() fails (previously leaked)
                yaml_text = f.read()
            finally:
                f.close()

        # NOTE: yaml.load is only safe on trusted config files
        config = yaml.load(yaml_text, Loader=Loader)
        check = cls(check_name, config.get('init_config') or {}, agentConfig or {})
        return check, config.get('instances', [])

    @staticmethod
    def normalize(metric, prefix=None):
        """Turn a metric into a well-formed metric name: prefix.b.c

        :param metric: The metric name to normalize
        :param prefix: A prefix to add to the normalized name, default None
        """
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)
        if prefix is not None:
            return prefix + "." + name
        else:
            return name

    @staticmethod
    def read_config(instance, key, message=None, cast=None):
        """Read and optionally cast a required key from an instance config.

        :param instance: the instance config dict
        :param key: the key that must be present
        :param message: optional error message when the key is missing
        :param cast: optional callable applied to the value before returning
        :raises Exception: when the key is missing or None
        """
        val = instance.get(key)
        if val is None:
            message = message or 'Must provide `%s` value in instance config' % key
            raise Exception(message)
        if cast is None:
            return val
        else:
            return cast(val)
def run_check(name, path=None):
    """Run a named check standalone (outside the agent), printing its output.

    :param name: the check name; also used to locate `<name>.yaml` in conf.d
    :param path: optional explicit path to the yaml config file
    :raises Exception: when the config cannot be opened or defines no instances
    """
    from tests.common import get_check

    # Read the config file
    confd_path = path or os.path.join(get_confd_path(get_os()), '%s.yaml' % name)
    try:
        f = open(confd_path)
    except IOError:
        raise Exception('Unable to open configuration at %s' % confd_path)
    try:
        # Close the handle even if read() fails (previously leaked)
        config_str = f.read()
    finally:
        f.close()

    # Run the check once per configured instance
    check, instances = get_check(name, config_str)
    if not instances:
        raise Exception('YAML configuration returned no instances.')
    for instance in instances:
        check.check(instance)
        if check.has_events():
            print("Events:\n")
            pprint(check.get_events(), indent=4)
        print("Metrics:\n")
        pprint(check.get_metrics(), indent=4)
from check import AgentCheck

View File

@ -0,0 +1,573 @@
"""Base class for Checks.
If you are writing your own checks you should subclass the AgentCheck class.
The Check class is being deprecated so don't write new checks with it.
"""
import logging
import os
from pprint import pprint
import re
import time
import traceback
from monagent.common import check_status
from monagent.common.config import get_confd_path
from monagent.common.exceptions import CheckException, NaN, Infinity, UnknownValue
from monagent.common.util import LaconicFilter, get_hostname, get_os
log = logging.getLogger(__name__)
# todo convert all checks to the new interface then remove this. Is the LaconicFilter on logs used elsewhere?
#==============================================================================
# DEPRECATED
# ------------------------------
# If you are writing your own check, you should inherit from AgentCheck
# and not this class. This class will be removed in a future version
# of the agent.
#==============================================================================
class Check(object):
    """DEPRECATED (abstract) base class for checks.

    New checks should subclass AgentCheck instead.  This class can:
      * store 1 (and only 1) sample for gauges per metric/dimension combination
      * compute per-second rates for counters
      * only log error messages once (instead of each time they occur)
    """

    def __init__(self, logger, agent_config=None):
        # Samples are indexed by metric_name, then by a key combining the
        # dimensions and the device name.  Dimensions are stored as a sorted
        # tuple of items because dicts/lists are not hashable:
        #   metric_name: {((("dim", "val"), ...), device): [(ts, value, host, device), ...]}
        self.agent_config = agent_config
        self._sample_store = {}
        self._counters = {}  # metric_name -> True when the metric is a counter
        self.logger = logger
        try:
            self.logger.addFilter(LaconicFilter())
        except Exception:
            self.logger.exception("Trying to install laconic log filter and failed")

    @staticmethod
    def normalize(metric, prefix=None):
        """Turn a metric into a well-formed metric name: prefix.b.c

        :param metric: The metric name to normalize
        :param prefix: Optional prefix prepended to the normalized name
        """
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)
        if prefix is not None:
            return prefix + "." + name
        else:
            return name

    @staticmethod
    def normalize_device_name(device_name):
        """Lower-case a device name and replace spaces with underscores."""
        return device_name.strip().lower().replace(' ', '_')

    def counter(self, metric):
        """Treat the metric as a counter, i.e. compute its per second derivative.

        ACHTUNG: Resets previous values associated with this metric.
        """
        self._counters[metric] = True
        self._sample_store[metric] = {}

    def is_counter(self, metric):
        """Is this metric a counter?"""
        return metric in self._counters

    def gauge(self, metric):
        """Treat the metric as a gauge, i.e. keep the data as is.

        ACHTUNG: Resets previous values associated with this metric.
        """
        self._sample_store[metric] = {}

    def is_metric(self, metric):
        """Has this metric been declared (as a gauge or a counter)?"""
        return metric in self._sample_store

    def is_gauge(self, metric):
        """Is this metric a gauge (declared but not a counter)?"""
        return self.is_metric(metric) and not self.is_counter(metric)

    def get_metric_names(self):
        """Get all declared metric names."""
        return self._sample_store.keys()

    def save_gauge(self, metric, value, timestamp=None, dimensions=None, hostname=None, device_name=None):
        """Save a gauge value, declaring the metric as a gauge if needed."""
        if not self.is_gauge(metric):
            self.gauge(metric)
        self.save_sample(metric, value, timestamp, dimensions, hostname, device_name)

    def save_sample(self, metric, value, timestamp=None, dimensions=None, hostname=None, device_name=None):
        """Save a simple sample, evicting old values if needed.

        :raises CheckException: if the metric was never declared, or dimensions
            is not a dict
        :raises NaN: if the value cannot be cast to a number
        """
        # Fixed import path: the rest of this module imports from
        # monagent.common, not the bare "common" package.
        from monagent.common.util import cast_metric_val

        if dimensions is None:
            dimensions = {}
        if timestamp is None:
            timestamp = time.time()
        if metric not in self._sample_store:
            raise CheckException("Saving a sample for an undefined metric: %s" % metric)
        try:
            value = cast_metric_val(value)
        except ValueError as ve:
            raise NaN(ve)

        # Validate dimensions (they are sorted below so that equal dicts map
        # to equal keys).  The "is not None" guard was dead code: dimensions
        # has already been defaulted to {} above.
        if not isinstance(dimensions, dict):
            raise CheckException("Dimensions must be a dictionary")

        # Data eviction rules: gauges keep only the latest sample; counters
        # keep the last two so a rate can be computed.
        key = (tuple(sorted(dimensions.items())), device_name)
        if self.is_gauge(metric):
            self._sample_store[metric][key] = ((timestamp, value, hostname, device_name), )
        elif self.is_counter(metric):
            if self._sample_store[metric].get(key) is None:
                self._sample_store[metric][key] = [(timestamp, value, hostname, device_name)]
            else:
                self._sample_store[metric][key] = self._sample_store[metric][key][-1:] + \
                    [(timestamp, value, hostname, device_name)]
        else:
            raise CheckException("%s must be either gauge or counter, skipping sample at %s" %
                                 (metric, time.ctime(timestamp)))

        if self.is_gauge(metric):
            # store[metric][dimensions] = (ts, val) - only 1 value allowed
            assert len(self._sample_store[metric][key]) == 1, self._sample_store[metric]
        elif self.is_counter(metric):
            assert len(self._sample_store[metric][key]) in (1, 2), self._sample_store[metric]

    @classmethod
    def _rate(cls, sample1, sample2):
        """Simple rate between two (ts, value, hostname, device_name) samples.

        :raises Infinity: when both samples share the same timestamp
        :raises UnknownValue: when the counter went backwards
        :raises NaN: for any other failure
        """
        try:
            interval = sample2[0] - sample1[0]
            if interval == 0:
                raise Infinity()
            delta = sample2[1] - sample1[1]
            if delta < 0:
                raise UnknownValue()
            return (sample2[0], delta / interval, sample2[2], sample2[3])
        except Infinity:
            raise
        except UnknownValue:
            raise
        except Exception as e:
            raise NaN(e)

    def get_sample_with_timestamp(self, metric, dimensions=None, device_name=None, expire=True):
        """Get (timestamp-epoch-style, value, hostname, device_name).

        For counters the rate between the last two samples is returned and,
        when expire is True, the older sample is dropped.

        :raises UnknownValue: for unknown metrics or counters without enough samples
        """
        if dimensions is None:
            dimensions = {}
        # Build the same key used by save_sample
        key = (tuple(sorted(dimensions.items())), device_name)
        # Never seen this metric
        if metric not in self._sample_store:
            raise UnknownValue()
        # Not enough values to compute a rate
        elif self.is_counter(metric) and len(self._sample_store[metric][key]) < 2:
            raise UnknownValue()
        elif self.is_counter(metric) and len(self._sample_store[metric][key]) >= 2:
            res = self._rate(self._sample_store[metric][key][-2], self._sample_store[metric][key][-1])
            if expire:
                del self._sample_store[metric][key][:-1]
            return res
        elif self.is_gauge(metric) and len(self._sample_store[metric][key]) >= 1:
            return self._sample_store[metric][key][-1]
        else:
            raise UnknownValue()

    def get_sample(self, metric, dimensions=None, device_name=None, expire=True):
        """Return the last value for that metric."""
        x = self.get_sample_with_timestamp(metric, dimensions, device_name, expire)
        assert isinstance(x, tuple) and len(x) == 4, x
        return x[1]

    def get_samples_with_timestamps(self, expire=True):
        """Return all values {metric: (ts, value, ...)} for non-tagged metrics."""
        values = {}
        for m in self._sample_store:
            try:
                values[m] = self.get_sample_with_timestamp(m, expire=expire)
            except Exception:
                # Best effort: skip metrics without a usable sample
                pass
        return values

    def get_samples(self, expire=True):
        """Return all values {metric: value} for non-tagged metrics."""
        values = {}
        for m in self._sample_store:
            try:
                # Discard the timestamp
                values[m] = self.get_sample_with_timestamp(m, expire=expire)[1]
            except Exception:
                pass
        return values

    def get_metrics(self, expire=True):
        """Get all metrics, including the ones that are tagged.

        This is the preferred method to retrieve metrics.

        @return the list of samples
        @rtype [(metric_name, timestamp, value, {"dimensions": {"name1": "key1"}}), ...]
        """
        metrics = []
        for m in self._sample_store:
            try:
                for key in self._sample_store[m]:
                    dimensions_list, device_name = key
                    dimensions = dict(dimensions_list)
                    try:
                        ts, val, hostname, device_name = self.get_sample_with_timestamp(m, dimensions, device_name, expire)
                    except UnknownValue:
                        continue
                    attributes = {}
                    if dimensions_list:
                        attributes['dimensions'] = dimensions
                    if hostname:
                        attributes['host_name'] = hostname
                    if device_name:
                        attributes['device_name'] = device_name
                    metrics.append((m, int(ts), val, attributes))
            except Exception:
                # Best effort: one bad metric must not block the others
                pass
        return metrics
class AgentCheck(object):
def __init__(self, name, init_config, agent_config, instances=None):
    """
    Initialize a new check.
    :param name: The name of the check
    :param init_config: The config for initializing the check
    :param agent_config: The global configuration for the agent
    :param instances: A list of configuration objects for each instance.
    """
    # Imported lazily, only when a check is constructed (presumably to
    # avoid an import cycle at module load time -- TODO confirm).
    from monagent.common.aggregator import MetricsAggregator
    self.name = name
    self.init_config = init_config
    self.agent_config = agent_config
    # Hostname attached to submitted metrics, derived from the agent config
    self.hostname = get_hostname(agent_config)
    # Per-check child logger, e.g. "<module>.<check name>"
    self.log = logging.getLogger('%s.%s' % (__name__, name))
    self.aggregator = MetricsAggregator(self.hostname,
                                        recent_point_threshold=agent_config.get('recent_point_threshold', None))
    # Events and warnings accumulate until get_events()/get_warnings() drain them
    self.events = []
    self.instances = instances or []
    self.warnings = []
    # Cache for get_library_info()
    self.library_versions = None
def instance_count(self):
""" Return the number of instances that are configured for this check. """
return len(self.instances)
def gauge(self, metric, value, dimensions=None, hostname=None, device_name=None, timestamp=None):
"""
Record the value of a gauge, with optional dimensions, hostname and device
name.
:param metric: The name of the metric
:param value: The value of the gauge
:param dimensions: (optional) A dictionary of dimensions for this metric
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
:param device_name: (optional) The device name for this metric
:param timestamp: (optional) The timestamp for this metric value
"""
self.aggregator.gauge(metric, value, dimensions, hostname, device_name, timestamp)
def increment(self, metric, value=1, dimensions=None, hostname=None, device_name=None):
"""
Increment a counter with optional dimensions, hostname and device name.
:param metric: The name of the metric
:param value: The value to increment by
:param dimensions: (optional) A dictionary of dimensions for this metric
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
:param device_name: (optional) The device name for this metric
"""
self.aggregator.increment(metric, value, dimensions, hostname, device_name)
def decrement(self, metric, value=-1, dimensions=None, hostname=None, device_name=None):
"""
Increment a counter with optional dimensions, hostname and device name.
:param metric: The name of the metric
:param value: The value to decrement by
:param dimensions: (optional) A dictionary of dimensions for this metric
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
:param device_name: (optional) The device name for this metric
"""
self.aggregator.decrement(metric, value, dimensions, hostname, device_name)
def rate(self, metric, value, dimensions=None, hostname=None, device_name=None):
"""
Submit a point for a metric that will be calculated as a rate on flush.
Values will persist across each call to `check` if there is not enough
point to generate a rate on the flush.
:param metric: The name of the metric
:param value: The value of the rate
:param dimensions: (optional) A dictionary of dimensions for this metric
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
:param device_name: (optional) The device name for this metric
"""
self.aggregator.rate(metric, value, dimensions, hostname, device_name)
def histogram(self, metric, value, dimensions=None, hostname=None, device_name=None):
"""
Sample a histogram value, with optional dimensions, hostname and device name.
:param metric: The name of the metric
:param value: The value to sample for the histogram
:param dimensions: (optional) A dictionary of dimensions for this metric
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
:param device_name: (optional) The device name for this metric
"""
self.aggregator.histogram(metric, value, dimensions, hostname, device_name)
def set(self, metric, value, dimensions=None, hostname=None, device_name=None):
"""
Sample a set value, with optional dimensions, hostname and device name.
:param metric: The name of the metric
:param value: The value for the set
:param dimensions: (optional) A dictionary of dimensions for this metric
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
:param device_name: (optional) The device name for this metric
"""
self.aggregator.set(metric, value, dimensions, hostname, device_name)
def event(self, event):
"""
Save an event.
:param event: The event payload as a dictionary. Has the following
structure:
{
"timestamp": int, the epoch timestamp for the event,
"event_type": string, the event time name,
"api_key": string, the api key of the account to associate the event with,
"msg_title": string, the title of the event,
"msg_text": string, the text body of the event,
"alert_type": (optional) string, one of ('error', 'warning', 'success', 'info').
Defaults to 'info'.
"source_type_name": (optional) string, the source type name,
"host": (optional) string, the name of the host,
"dimensions": (optional) a dictionary of dimensions to associate with this event
}
"""
if event.get('api_key') is None:
event['api_key'] = self.agent_config['api_key']
self.events.append(event)
def has_events(self):
"""
Check whether the check has saved any events
@return whether or not the check has saved any events
@rtype boolean
"""
return len(self.events) > 0
def get_metrics(self):
"""
Get all metrics, including the ones that are tagged.
@return the list of samples
@rtype list of Measurement objects from monagent.common.metrics
"""
return self.aggregator.flush()
def get_events(self):
"""
Return a list of the events saved by the check, if any
@return the list of events saved by this check
@rtype list of event dictionaries
"""
events = self.events
self.events = []
return events
def has_warnings(self):
"""
Check whether the instance run created any warnings
"""
return len(self.warnings) > 0
def warning(self, warning_message):
""" Add a warning message that will be printed in the info page
:param warning_message: String. Warning message to be displayed
"""
self.warnings.append(warning_message)
def get_library_info(self):
if self.library_versions is not None:
return self.library_versions
try:
self.library_versions = self.get_library_versions()
except NotImplementedError:
pass
def get_library_versions(self):
""" Should return a string that shows which version
of the needed libraries are used """
raise NotImplementedError
def get_warnings(self):
"""
Return the list of warnings messages to be displayed in the info page
"""
warnings = self.warnings
self.warnings = []
return warnings
    def run(self):
        """ Run all instances.

        Each configured instance is checked in turn; a failure in one
        instance does not stop the others.
        @return list of check_status.InstanceStatus, one per instance
        """
        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                self.check(instance)
                # Warnings recorded via self.warning() downgrade the status.
                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(i,
                                                                  check_status.STATUS_WARNING,
                                                                  warnings=self.get_warnings())
                else:
                    instance_status = check_status.InstanceStatus(i, check_status.STATUS_OK)
            except Exception, e:
                # Log the traceback and record the error so the collector
                # can surface it in the status page.
                self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
                instance_status = check_status.InstanceStatus(i,
                                                              check_status.STATUS_ERROR,
                                                              error=e,
                                                              tb=traceback.format_exc())
            instance_statuses.append(instance_status)
        return instance_statuses
    def check(self, instance):
        """
        Overridden by the check class. This will be called to run the check.

        :param instance: A dict with the instance information. This will vary
        depending on your config structure.
        :raises NotImplementedError: always, unless a subclass overrides it
        """
        raise NotImplementedError()
    @staticmethod
    def stop():
        """
        To be executed when the agent is being stopped to clean resources.

        Subclasses that hold on to sockets, files or background threads
        should override this; the default implementation does nothing.
        """
        pass
@classmethod
def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None, check_name=None):
"""
A method used for testing your check without running the agent.
"""
import yaml
try:
from yaml import CLoader as Loader
except ImportError:
from yaml import Loader
if path_to_yaml:
check_name = os.path.basename(path_to_yaml).split('.')[0]
try:
f = open(path_to_yaml)
except IOError:
raise Exception('Unable to open yaml config: %s' % path_to_yaml)
yaml_text = f.read()
f.close()
config = yaml.load(yaml_text, Loader=Loader)
check = cls(check_name, config.get('init_config') or {}, agentConfig or {})
return check, config.get('instances', [])
@staticmethod
def normalize(metric, prefix=None):
"""
Turn a metric into a well-formed metric name
prefix.b.c
:param metric The metric name to normalize
:param prefix A prefix to to add to the normalized name, default None
"""
name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
# Eliminate multiple _
name = re.sub(r"__+", "_", name)
# Don't start/end with _
name = re.sub(r"^_", "", name)
name = re.sub(r"_$", "", name)
# Drop ._ and _.
name = re.sub(r"\._", ".", name)
name = re.sub(r"_\.", ".", name)
if prefix is not None:
return prefix + "." + name
else:
return name
@staticmethod
def read_config(instance, key, message=None, cast=None):
val = instance.get(key)
if val is None:
message = message or 'Must provide `%s` value in instance config' % key
raise Exception(message)
if cast is None:
return val
else:
return cast(val)
def run_check(name, path=None):
    """Run every configured instance of check ``name`` and print the results.

    A development helper: loads the check's yaml config (from ``path`` or
    the standard confd directory), runs each instance, then pretty-prints
    any events and metrics the check produced.

    :param name: the check name, used to locate ``<name>.yaml``
    :param path: optional explicit path to the yaml configuration
    """
    from tests.common import get_check
    # Read the config file
    confd_path = path or os.path.join(get_confd_path(get_os()), '%s.yaml' % name)
    try:
        f = open(confd_path)
    except IOError:
        raise Exception('Unable to open configuration at %s' % confd_path)
    config_str = f.read()
    f.close()
    # Run the check
    check, instances = get_check(name, config_str)
    if not instances:
        raise Exception('YAML configuration returned no instances.')
    for instance in instances:
        check.check(instance)
    if check.has_events():
        print "Events:\n"
        pprint(check.get_events(), indent=4)
    print "Metrics:\n"
    pprint(check.get_metrics(), indent=4)

View File

@ -6,7 +6,7 @@ from datetime import datetime
from itertools import groupby # >= python 2.4
from utils import TailFile
from . import LaconicFilter
from monagent.common.util import LaconicFilter
from monagent.collector import modules

View File

@ -12,7 +12,7 @@ import sys
import time
# project
from .. import Check
from monagent.collector.checks.check import Check
from monagent.common.metrics import Measurement
from monagent.common.util import Platform

View File

@ -1,4 +1,4 @@
from .. import Check
from monagent.collector.checks.check import Check
try:
import wmi

View File

@ -1,327 +0,0 @@
#!/usr/bin/env python
"""
Datadog
www.datadoghq.com
----
Make sense of your IT Data
Licensed under Simplified BSD License (see LICENSE)
(C) Boxed Ice 2010 all rights reserved
(C) Datadog, Inc. 2010-2013 all rights reserved
"""
# set up logging before importing any other components
from monagent.common.config import initialize_logging
initialize_logging('forwarder')
from monagent.common.config import get_logging_config
import os
os.umask(022)
# Standard imports
import logging
import sys
from datetime import timedelta
import signal
from socket import gaierror, error as socket_error
# Tornado
import tornado.httpclient
import tornado.httpserver
import tornado.ioloop
import tornado.web
from tornado.escape import json_decode
from tornado.options import define, parse_command_line, options
# agent import
from api import MonAPI
from monagent.common.check_status import ForwarderStatus
from monagent.common.config import get_config
from monagent.common.metrics import Measurement
from monagent.common.util import Watchdog, get_tornado_ioloop
from transaction import Transaction, TransactionManager
log = logging.getLogger('forwarder')
log.setLevel(get_logging_config()['log_level'] or logging.INFO)
TRANSACTION_FLUSH_INTERVAL = 5000 # Every 5 seconds
WATCHDOG_INTERVAL_MULTIPLIER = 10 # 10x flush interval
# Maximum delay before replaying a transaction
MAX_WAIT_FOR_REPLAY = timedelta(seconds=90)
# Maximum queue size in bytes (when this is reached, old messages are dropped)
MAX_QUEUE_SIZE = 30 * 1024 * 1024 # 30MB
THROTTLING_DELAY = timedelta(microseconds=1000000/2) # 2 msg/second
class MetricTransaction(Transaction):
    """A transaction holding a batch of measurements bound for the API
    endpoints; queuing and retries are delegated to the TransactionManager."""

    # Shared class-level state: every MetricTransaction uses the same
    # application, transaction manager and endpoint list.
    _application = None
    _trManager = None
    _endpoints = []
    @classmethod
    def set_application(cls, app):
        # Record the tornado application this transaction class serves.
        cls._application = app
    @classmethod
    def set_tr_manager(cls, manager):
        # Install the TransactionManager used to queue/retry transactions.
        cls._trManager = manager
    @classmethod
    def get_tr_manager(cls):
        return cls._trManager
    @classmethod
    def set_endpoints(cls, endpoint):
        # todo we only have one endpoint option, generalize it better
        # the immediate use case for two endpoints could be our own monitoring boxes, they could send to
        # the main api and mini-mon api
        cls._endpoints.append(endpoint)
    def __init__(self, data, headers):
        """Queue this transaction with the manager and trigger a flush.

        :param data: the measurements to forward
        :param headers: HTTP headers from the originating request
        """
        self._data = data
        self._headers = headers
        # Call after data has been set (size is computed in Transaction's init)
        Transaction.__init__(self)
        # Insert the transaction in the Manager
        self._trManager.append(self)
        log.debug("Created transaction %d" % self.get_id())
        self._trManager.flush()
    def __sizeof__(self):
        # Size accounting for the queue limit is based on the payload only.
        return sys.getsizeof(self._data)
    def flush(self):
        """Post the payload to every configured endpoint, reporting the
        outcome to the manager so failed transactions are retried."""
        try:
            for endpoint in self._endpoints:
                endpoint.post_metrics(self._data)
        except Exception:
            log.exception('Error flushing metrics to remote endpoints')
            self._trManager.tr_error(self)
        else:
            self._trManager.tr_success(self)
        self._trManager.flush_next()
class StatusHandler(tornado.web.RequestHandler):
    """Renders a small HTML table of pending transactions.

    GET /status/?threshold=N responds 503 when more than N transactions
    are queued, so the endpoint can double as a health check.
    """
    def get(self):
        # threshold < 0 (the default) disables the health-check behaviour.
        threshold = int(self.get_argument('threshold', -1))
        m = MetricTransaction.get_tr_manager()
        self.write("<table><tr><td>Id</td><td>Size</td><td>Error count</td><td>Next flush</td></tr>")
        transactions = m.get_transactions()
        for tr in transactions:
            self.write("<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" %
                       (tr.get_id(), tr.get_size(), tr.get_error_count(), tr.get_next_flush()))
        self.write("</table>")
        if threshold >= 0:
            if len(transactions) > threshold:
                self.set_status(503)
class AgentInputHandler(tornado.web.RequestHandler):
    """Accepts measurement batches posted to the forwarder."""
    def post(self):
        """Read the message and forward it to the intake
        The message is expected to follow the format:
        """
        # read the message it should be a list of monagent.common.metrics.Measurements expressed as a dict
        msg = tornado.escape.json_decode(self.request.body)
        try:
            log.debug(msg)
            measurements = [Measurement(**m) for m in msg]
        except Exception:
            log.exception('Error parsing body of Agent Input')
            raise tornado.web.HTTPError(500)
        headers = self.request.headers
        if len(measurements) > 0:
            # Setup a transaction for this message
            # NOTE(review): constructing the transaction queues it for
            # delivery as a side effect of MetricTransaction.__init__.
            tr = MetricTransaction(measurements, headers)
        else:
            # An empty batch is rejected.
            raise tornado.web.HTTPError(500)
        self.write("Transaction: %s" % tr.get_id())
class Forwarder(tornado.web.Application):
    """Tornado application that queues measurements and forwards them to
    the monitoring API through a retrying TransactionManager."""
    def __init__(self, port, agent_config, watchdog=True, skip_ssl_validation=False, use_simple_http_client=False):
        """
        :param port: TCP port to listen on
        :param agent_config: parsed agent configuration dict; its 'Api'
            section configures the MonAPI endpoint
        :param watchdog: when True, install a Watchdog reset by the flush loop
        :param skip_ssl_validation: disable SSL hostname verification
        :param use_simple_http_client: use tornado's SimpleHTTPClient
        """
        self._port = int(port)
        self._agentConfig = agent_config
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints(MonAPI(agent_config['Api']))
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)
        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agent_config.get('skip_ssl_validation', False)
        self.use_simple_http_client = use_simple_http_client
        if self.skip_ssl_validation:
            log.info("Skipping SSL hostname validation, useful when using a transparent proxy")
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout, max_mem_mb=agent_config.get('limit_memory_consumption', None))
    def _post_metrics(self):
        # Wrap any internally accumulated metrics in a transaction so they
        # go through the same delivery/retry machinery, then reset the dict.
        if len(self._metrics) > 0:
            MetricTransaction(self._metrics, headers={'Content-Type': 'application/json'})
            self._metrics = {}
    # todo why is the tornado logging method overridden? Perhaps ditch this.
    def log_request(self, handler):
        """ Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code. """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error
        request_time = 1000.0 * handler.request.request_time()
        log_method("%d %s %.2fms", handler.get_status(),
                   handler._request_summary(), request_time)
    def run(self):
        """Bind the HTTP server, start the periodic flush callback and run
        the tornado IO loop until stop() is called."""
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", AgentInputHandler),
            (r"/status/?", StatusHandler),
        ]
        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request
        )
        non_local_traffic = self._agentConfig.get("non_local_traffic", False)
        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)
        try:
            # non_local_traffic must be == True to match, not just some non-false value
            if non_local_traffic is True:
                http_server.listen(self._port)
            else:
                # localhost in lieu of 127.0.0.1 to support IPv6
                try:
                    http_server.listen(self._port, address="localhost")
                except gaierror:
                    log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
                    http_server.listen(self._port, address="127.0.0.1")
                except socket_error, e:
                    if "Errno 99" in str(e):
                        log.warning("IPv6 doesn't seem to be fully supported. Falling back to IPv4")
                        http_server.listen(self._port, address="127.0.0.1")
                    else:
                        raise
        except socket_error, e:
            log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
            sys.exit(1)
        except Exception:
            log.exception("Uncaught exception. Forwarder is exiting.")
            sys.exit(1)
        log.info("Listening on port %d" % self._port)
        # Register callbacks
        self.mloop = get_tornado_ioloop()
        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)
        def flush_trs():
            # Reset the watchdog on every tick so a stalled loop is detected.
            if self._watchdog:
                self._watchdog.reset()
            self._post_metrics()
            self._tr_manager.flush()
        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)
        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()
        self.mloop.start()
        log.info("Stopped")
    def stop(self):
        # Stops the IO loop; run() then unwinds and logs "Stopped".
        self.mloop.stop()
def init_forwarder(skip_ssl_validation=False, use_simple_http_client=False):
    """Build the Forwarder application from the agent configuration and
    install SIGTERM/SIGINT handlers that stop it cleanly.

    :return: the configured (but not yet running) Forwarder
    """
    agent_config = get_config(parse_args=False)
    # listen_port may be present but explicitly null in the config file.
    port = agent_config.get('listen_port', 17123)
    if port is None:
        port = 17123
    else:
        port = int(port)
    app = Forwarder(port, agent_config, skip_ssl_validation=skip_ssl_validation,
                    use_simple_http_client=use_simple_http_client)
    def sigterm_handler(signum, frame):
        log.info("caught sigterm. stopping")
        app.stop()
    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGINT, sigterm_handler)
    return app
def main():
    """Command line entry point: run the forwarder, or handle the
    'info'/'help' subcommands.

    :return: process exit code
    """
    define("sslcheck", default=1, help="Verify SSL hostname, on by default")
    define("use_simple_http_client", default=0, help="Use Tornado SimpleHTTPClient instead of CurlAsyncHTTPClient")
    args = parse_command_line()
    skip_ssl_validation = False
    use_simple_http_client = False
    if unicode(options.sslcheck) == u"0":
        skip_ssl_validation = True
    if unicode(options.use_simple_http_client) == u"1":
        use_simple_http_client = True
    # If we don't have any arguments, run the server.
    if not args:
        app = init_forwarder(skip_ssl_validation, use_simple_http_client=use_simple_http_client)
        try:
            app.run()
        finally:
            # Runs even when app.run() raises.
            ForwarderStatus.remove_latest_status()
    else:
        usage = "%s [help|info]. Run with no commands to start the server" % (sys.argv[0])
        command = args[0]
        if command == 'info':
            # Quiet the root logger so only the status report is printed.
            logging.getLogger().setLevel(logging.ERROR)
            return ForwarderStatus.print_latest_status()
        elif command == 'help':
            print usage
        else:
            print "Unknown command: %s" % command
            print usage
            return -1
    return 0
if __name__ == "__main__":
sys.exit(main())

View File

@ -1,108 +1 @@
import json
import logging
from threading import Timer
from monagent.common.keystone import Keystone
from monagent.common.util import get_hostname
from monclient import client
import monclient.exc as exc
log = logging.getLogger(__name__)
class MonAPI(object):
    """Sends measurements to MonAPI

    Any errors should raise an exception so the transaction calling
    this is not committed
    """

    def __init__(self, config):
        """
        Initialize Mon api client connection.

        :param config: dict with 'url', 'dimensions', 'keystone_url',
            'username', 'password' and 'project_name' entries
        """
        self.config = config
        self.url = config['url']
        self.api_version = '2_0'
        self.default_dimensions = config['dimensions']
        self.token_expiration = 1438
        # Verify the hostname is set as a dimension
        if 'hostname' not in self.default_dimensions:
            self.default_dimensions['hostname'] = get_hostname()
        log.debug("Getting token from Keystone")
        self.keystone_url = config['keystone_url']
        self.username = config['username']
        self.password = config['password']
        self.project_name = config['project_name']
        self.keystone = Keystone(self.keystone_url,
                                 self.username,
                                 self.password,
                                 self.project_name)
        # Constructed lazily on first post.
        self.mon_client = None

    def _post(self, measurements):
        """Does the actual http post

        measurements is a list of Measurement
        """
        data = [m.__dict__ for m in measurements]
        kwargs = {
            'jsonbody': data
        }
        try:
            if not self.mon_client:
                # construct the mon client
                self.mon_client = self.get_client()
            done = False
            while not done:
                response = self.mon_client.metrics.create(**kwargs)
                if 200 <= response.status_code <= 299:
                    # Good status from web service
                    log.debug("Message sent successfully: {0}"
                              .format(str(data)))
                elif 400 <= response.status_code <= 499:
                    # Good status from web service but some type of issue
                    # with the data
                    if response.status_code == 401:
                        # Get a new token/client and retry.
                        # NOTE(review): if credentials stay invalid this
                        # retries forever; consider capping the retries.
                        self.mon_client = self.get_client()
                        continue
                    else:
                        # Bug fix: the last placeholder was {1}, which logged
                        # the status message twice and dropped response.text.
                        error_msg = "Successful web service call but there" + \
                                    " were issues (Status: {0}, Status Message: " + \
                                    "{1}, Message Content: {2})"
                        log.error(error_msg.format(response.status_code,
                                                   response.reason, response.text))
                        response.raise_for_status()
                else:  # Not a good status
                    response.raise_for_status()
                done = True
        except exc.HTTPException as he:
            log.error("Error sending message to mon-api: {0}"
                      .format(str(he.message)))

    def post_metrics(self, measurements):
        """post_metrics

        given [Measurement, ...], format the request and post to
        the monitoring api
        """
        # Add default dimensions without overriding ones already present.
        # (dict.has_key is Python-2-only and deprecated; use `in`.)
        for measurement in measurements:
            for dimension in self.default_dimensions.keys():
                if dimension not in measurement.dimensions:
                    measurement.dimensions.update({dimension: self.default_dimensions[dimension]})
        self._post(measurements)

    def get_client(self):
        """get_client

        get a new mon-client object
        """
        token = self.keystone.refresh_token()
        # Re-create the client. This is temporary until
        # the client is updated to be able to reset the
        # token.
        kwargs = {
            'token': token
        }
        return client.Client(self.api_version, self.url, **kwargs)

View File

@ -0,0 +1,105 @@
import logging
from monclient import exc as exc, client
from monagent.common.keystone import Keystone
from monagent.common.util import get_hostname
log = logging.getLogger(__name__)
class MonAPI(object):
    """Sends measurements to MonAPI

    Any errors should raise an exception so the transaction calling
    this is not committed
    """

    def __init__(self, config):
        """
        Initialize Mon api client connection.

        :param config: dict with 'url', 'dimensions', 'keystone_url',
            'username', 'password' and 'project_name' entries
        """
        self.config = config
        self.url = config['url']
        self.api_version = '2_0'
        self.default_dimensions = config['dimensions']
        self.token_expiration = 1438
        # Verify the hostname is set as a dimension
        if 'hostname' not in self.default_dimensions:
            self.default_dimensions['hostname'] = get_hostname()
        log.debug("Getting token from Keystone")
        self.keystone_url = config['keystone_url']
        self.username = config['username']
        self.password = config['password']
        self.project_name = config['project_name']
        self.keystone = Keystone(self.keystone_url,
                                 self.username,
                                 self.password,
                                 self.project_name)
        # Constructed lazily on first post.
        self.mon_client = None

    def _post(self, measurements):
        """Does the actual http post

        measurements is a list of Measurement
        """
        data = [m.__dict__ for m in measurements]
        kwargs = {
            'jsonbody': data
        }
        try:
            if not self.mon_client:
                # construct the mon client
                self.mon_client = self.get_client()
            done = False
            while not done:
                response = self.mon_client.metrics.create(**kwargs)
                if 200 <= response.status_code <= 299:
                    # Good status from web service
                    log.debug("Message sent successfully: {0}"
                              .format(str(data)))
                elif 400 <= response.status_code <= 499:
                    # Good status from web service but some type of issue
                    # with the data
                    if response.status_code == 401:
                        # Get a new token/client and retry.
                        # NOTE(review): if credentials stay invalid this
                        # retries forever; consider capping the retries.
                        self.mon_client = self.get_client()
                        continue
                    else:
                        # Bug fix: the last placeholder was {1}, which logged
                        # the status message twice and dropped response.text.
                        error_msg = "Successful web service call but there" + \
                                    " were issues (Status: {0}, Status Message: " + \
                                    "{1}, Message Content: {2})"
                        log.error(error_msg.format(response.status_code,
                                                   response.reason, response.text))
                        response.raise_for_status()
                else:  # Not a good status
                    response.raise_for_status()
                done = True
        except exc.HTTPException as he:
            log.error("Error sending message to mon-api: {0}"
                      .format(str(he.message)))

    def post_metrics(self, measurements):
        """post_metrics

        given [Measurement, ...], format the request and post to
        the monitoring api
        """
        # Add default dimensions without overriding ones already present.
        # (dict.has_key is Python-2-only and deprecated; use `in`.)
        for measurement in measurements:
            for dimension in self.default_dimensions.keys():
                if dimension not in measurement.dimensions:
                    measurement.dimensions.update({dimension: self.default_dimensions[dimension]})
        self._post(measurements)

    def get_client(self):
        """get_client

        get a new mon-client object
        """
        token = self.keystone.refresh_token()
        # Re-create the client. This is temporary until
        # the client is updated to be able to reset the
        # token.
        kwargs = {
            'token': token
        }
        return client.Client(self.api_version, self.url, **kwargs)

View File

@ -0,0 +1,328 @@
#!/usr/bin/env python
"""
Datadog
www.datadoghq.com
----
Make sense of your IT Data
Licensed under Simplified BSD License (see LICENSE)
(C) Boxed Ice 2010 all rights reserved
(C) Datadog, Inc. 2010-2013 all rights reserved
"""
# set up logging before importing any other components
from monagent.common.config import initialize_logging
from monagent.forwarder.api.mon import MonAPI
initialize_logging('forwarder')
from monagent.common.config import get_logging_config
import os
os.umask(022)
# Standard imports
import logging
import sys
from datetime import timedelta
import signal
from socket import gaierror, error as socket_error
# Tornado
import tornado.httpclient
import tornado.httpserver
import tornado.ioloop
import tornado.web
from tornado.escape import json_decode
from tornado.options import define, parse_command_line, options
# agent import
from monagent.common.check_status import ForwarderStatus
from monagent.common.config import get_config
from monagent.common.metrics import Measurement
from monagent.common.util import Watchdog, get_tornado_ioloop
from transaction import Transaction, TransactionManager
log = logging.getLogger('forwarder')
log.setLevel(get_logging_config()['log_level'] or logging.INFO)
TRANSACTION_FLUSH_INTERVAL = 5000 # Every 5 seconds
WATCHDOG_INTERVAL_MULTIPLIER = 10 # 10x flush interval
# Maximum delay before replaying a transaction
MAX_WAIT_FOR_REPLAY = timedelta(seconds=90)
# Maximum queue size in bytes (when this is reached, old messages are dropped)
MAX_QUEUE_SIZE = 30 * 1024 * 1024 # 30MB
THROTTLING_DELAY = timedelta(microseconds=1000000/2) # 2 msg/second
class MetricTransaction(Transaction):
    """A transaction holding a batch of measurements bound for the API
    endpoints; queuing and retries are delegated to the TransactionManager."""

    # Shared class-level state: every MetricTransaction uses the same
    # application, transaction manager and endpoint list.
    _application = None
    _trManager = None
    _endpoints = []
    @classmethod
    def set_application(cls, app):
        # Record the tornado application this transaction class serves.
        cls._application = app
    @classmethod
    def set_tr_manager(cls, manager):
        # Install the TransactionManager used to queue/retry transactions.
        cls._trManager = manager
    @classmethod
    def get_tr_manager(cls):
        return cls._trManager
    @classmethod
    def set_endpoints(cls, endpoint):
        # todo we only have one endpoint option, generalize it better
        # the immediate use case for two endpoints could be our own monitoring boxes, they could send to
        # the main api and mini-mon api
        cls._endpoints.append(endpoint)
    def __init__(self, data, headers):
        """Queue this transaction with the manager and trigger a flush.

        :param data: the measurements to forward
        :param headers: HTTP headers from the originating request
        """
        self._data = data
        self._headers = headers
        # Call after data has been set (size is computed in Transaction's init)
        Transaction.__init__(self)
        # Insert the transaction in the Manager
        self._trManager.append(self)
        log.debug("Created transaction %d" % self.get_id())
        self._trManager.flush()
    def __sizeof__(self):
        # Size accounting for the queue limit is based on the payload only.
        return sys.getsizeof(self._data)
    def flush(self):
        """Post the payload to every configured endpoint, reporting the
        outcome to the manager so failed transactions are retried."""
        try:
            for endpoint in self._endpoints:
                endpoint.post_metrics(self._data)
        except Exception:
            log.exception('Error flushing metrics to remote endpoints')
            self._trManager.tr_error(self)
        else:
            self._trManager.tr_success(self)
        self._trManager.flush_next()
class StatusHandler(tornado.web.RequestHandler):
    """Renders an HTML table of the pending transactions.

    With ``?threshold=N`` the response status becomes 503 when more than
    N transactions are queued, so the endpoint doubles as a health check.
    """
    def get(self):
        # A negative threshold (the default) disables the health check.
        threshold = int(self.get_argument('threshold', -1))
        manager = MetricTransaction.get_tr_manager()
        self.write("<table><tr><td>Id</td><td>Size</td><td>Error count</td><td>Next flush</td></tr>")
        pending = manager.get_transactions()
        for txn in pending:
            row = "<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" % (
                txn.get_id(), txn.get_size(), txn.get_error_count(), txn.get_next_flush())
            self.write(row)
        self.write("</table>")
        if threshold >= 0 and len(pending) > threshold:
            self.set_status(503)
class AgentInputHandler(tornado.web.RequestHandler):
    """Accepts measurement batches posted to the forwarder."""
    def post(self):
        """Read the message and forward it to the intake
        The message is expected to follow the format:
        """
        # read the message it should be a list of monagent.common.metrics.Measurements expressed as a dict
        msg = tornado.escape.json_decode(self.request.body)
        try:
            log.debug(msg)
            measurements = [Measurement(**m) for m in msg]
        except Exception:
            log.exception('Error parsing body of Agent Input')
            raise tornado.web.HTTPError(500)
        headers = self.request.headers
        if len(measurements) > 0:
            # Setup a transaction for this message
            # NOTE(review): constructing the transaction queues it for
            # delivery as a side effect of MetricTransaction.__init__.
            tr = MetricTransaction(measurements, headers)
        else:
            # An empty batch is rejected.
            raise tornado.web.HTTPError(500)
        self.write("Transaction: %s" % tr.get_id())
class Forwarder(tornado.web.Application):
    """Tornado application that queues measurements and forwards them to
    the monitoring API through a retrying TransactionManager."""
    def __init__(self, port, agent_config, watchdog=True, skip_ssl_validation=False, use_simple_http_client=False):
        """
        :param port: TCP port to listen on
        :param agent_config: parsed agent configuration dict; its 'Api'
            section configures the MonAPI endpoint
        :param watchdog: when True, install a Watchdog reset by the flush loop
        :param skip_ssl_validation: disable SSL hostname verification
        :param use_simple_http_client: use tornado's SimpleHTTPClient
        """
        self._port = int(port)
        self._agentConfig = agent_config
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints(MonAPI(agent_config['Api']))
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)
        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agent_config.get('skip_ssl_validation', False)
        self.use_simple_http_client = use_simple_http_client
        if self.skip_ssl_validation:
            log.info("Skipping SSL hostname validation, useful when using a transparent proxy")
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout, max_mem_mb=agent_config.get('limit_memory_consumption', None))
    def _post_metrics(self):
        # Wrap any internally accumulated metrics in a transaction so they
        # go through the same delivery/retry machinery, then reset the dict.
        if len(self._metrics) > 0:
            MetricTransaction(self._metrics, headers={'Content-Type': 'application/json'})
            self._metrics = {}
    # todo why is the tornado logging method overridden? Perhaps ditch this.
    def log_request(self, handler):
        """ Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code. """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error
        request_time = 1000.0 * handler.request.request_time()
        log_method("%d %s %.2fms", handler.get_status(),
                   handler._request_summary(), request_time)
    def run(self):
        """Bind the HTTP server, start the periodic flush callback and run
        the tornado IO loop until stop() is called."""
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", AgentInputHandler),
            (r"/status/?", StatusHandler),
        ]
        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request
        )
        non_local_traffic = self._agentConfig.get("non_local_traffic", False)
        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)
        try:
            # non_local_traffic must be == True to match, not just some non-false value
            if non_local_traffic is True:
                http_server.listen(self._port)
            else:
                # localhost in lieu of 127.0.0.1 to support IPv6
                try:
                    http_server.listen(self._port, address="localhost")
                except gaierror:
                    log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
                    http_server.listen(self._port, address="127.0.0.1")
                except socket_error, e:
                    if "Errno 99" in str(e):
                        log.warning("IPv6 doesn't seem to be fully supported. Falling back to IPv4")
                        http_server.listen(self._port, address="127.0.0.1")
                    else:
                        raise
        except socket_error, e:
            log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
            sys.exit(1)
        except Exception:
            log.exception("Uncaught exception. Forwarder is exiting.")
            sys.exit(1)
        log.info("Listening on port %d" % self._port)
        # Register callbacks
        self.mloop = get_tornado_ioloop()
        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)
        def flush_trs():
            # Reset the watchdog on every tick so a stalled loop is detected.
            if self._watchdog:
                self._watchdog.reset()
            self._post_metrics()
            self._tr_manager.flush()
        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)
        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()
        self.mloop.start()
        log.info("Stopped")
    def stop(self):
        # Stops the IO loop; run() then unwinds and logs "Stopped".
        self.mloop.stop()
def init_forwarder(skip_ssl_validation=False, use_simple_http_client=False):
    """Build the Forwarder application from the agent configuration and
    wire SIGTERM/SIGINT so either signal stops it cleanly.

    :return: the configured (but not yet running) Forwarder
    """
    agent_config = get_config(parse_args=False)
    # listen_port may be present but explicitly null in the config file.
    raw_port = agent_config.get('listen_port', 17123)
    port = 17123 if raw_port is None else int(raw_port)
    app = Forwarder(port, agent_config, skip_ssl_validation=skip_ssl_validation,
                    use_simple_http_client=use_simple_http_client)
    def _shutdown(signum, frame):
        log.info("caught sigterm. stopping")
        app.stop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, _shutdown)
    return app
def main():
    """Command line entry point: run the forwarder, or handle the
    'info'/'help' subcommands.

    :return: process exit code
    """
    define("sslcheck", default=1, help="Verify SSL hostname, on by default")
    define("use_simple_http_client", default=0, help="Use Tornado SimpleHTTPClient instead of CurlAsyncHTTPClient")
    args = parse_command_line()
    skip_ssl_validation = False
    use_simple_http_client = False
    if unicode(options.sslcheck) == u"0":
        skip_ssl_validation = True
    if unicode(options.use_simple_http_client) == u"1":
        use_simple_http_client = True
    # If we don't have any arguments, run the server.
    if not args:
        app = init_forwarder(skip_ssl_validation, use_simple_http_client=use_simple_http_client)
        try:
            app.run()
        finally:
            # Runs even when app.run() raises.
            ForwarderStatus.remove_latest_status()
    else:
        usage = "%s [help|info]. Run with no commands to start the server" % (sys.argv[0])
        command = args[0]
        if command == 'info':
            # Quiet the root logger so only the status report is printed.
            logging.getLogger().setLevel(logging.ERROR)
            return ForwarderStatus.print_latest_status()
        elif command == 'help':
            print usage
        else:
            print "Unknown command: %s" % command
            print usage
            return -1
    return 0
sys.exit(main())

View File

@ -1,154 +0,0 @@
#!/usr/bin/env python
"""
A Python Statsd implementation with dimensions added
"""
# set up logging before importing any other components
from monagent.common.config import initialize_logging
from monagent.monstatsd.reporter import Reporter
from monagent.monstatsd.udp import Server
initialize_logging('monstatsd')
import os
os.umask(022)
# stdlib
import logging
import optparse
import signal
import sys
# project
from monagent.common.aggregator import MetricsBucketAggregator
from monagent.common.check_status import MonstatsdStatus
from monagent.common.config import get_config
from monagent.common.daemon import Daemon, AgentSupervisor
from monagent.common.util import PidFile, get_hostname
log = logging.getLogger('monstatsd')
class Monstatsd(Daemon):
    """ This class is the monstatsd daemon. """

    def __init__(self, pid_file, server, reporter, autorestart):
        """
        :param pid_file: path of the daemon pid file
        :param server: the UDP server receiving statsd packets
        :param reporter: the reporter thread flushing the aggregator
        :param autorestart: when True, exit with the supervisor restart code
        """
        Daemon.__init__(self, pid_file, autorestart=autorestart)
        self.server = server
        self.reporter = reporter
    def _handle_sigterm(self, signum, frame):
        # Stopping the server unblocks run(), which then shuts down.
        log.debug("Caught sigterm. Stopping run loop.")
        self.server.stop()
    def run(self):
        """Start the reporter thread and run the server until stopped."""
        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)
        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)
        # Start the reporting thread before accepting data
        self.reporter.start()
        try:
            try:
                self.server.start()
            except Exception, e:
                log.exception('Error starting server')
                raise e
        finally:
            # The server will block until it's done. Once we're here, shutdown
            # the reporting thread.
            self.reporter.stop()
            self.reporter.join()
            log.info("Monstatsd is stopped")
            # Restart if asked to restart
            if self.autorestart:
                sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
    def info(self):
        # Quiet the root logger so only the status report is printed.
        logging.getLogger().setLevel(logging.ERROR)
        return MonstatsdStatus.print_latest_status()
def init_monstatsd(config_path=None, use_watchdog=False):
    """Configure the server and the reporting thread.

    :param config_path: optional explicit path to the agent config file
    :param use_watchdog: passed through to the Reporter
    :return: (reporter, server, config) tuple
    """
    c = get_config(parse_args=False, cfg_path=config_path)
    log.debug("Configuration monstatsd")
    port = c['monstatsd_port']
    interval = int(c['monstatsd_interval'])
    aggregator_interval = int(c['monstatsd_agregator_bucket_size'])
    non_local_traffic = c['non_local_traffic']
    forward_to_host = c.get('statsd_forward_host')
    forward_to_port = c.get('statsd_forward_port')
    event_chunk_size = c.get('event_chunk_size')
    target = c['forwarder_url']
    hostname = get_hostname(c)
    # Create the aggregator (which is the point of communication between the
    # server and reporting threads.
    assert 0 < interval
    aggregator = MetricsBucketAggregator(hostname, aggregator_interval,
                                         recent_point_threshold=c.get('recent_point_threshold', None))
    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, use_watchdog, event_chunk_size)
    # Start the server on an IPv4 stack
    # Default to loopback
    server_host = 'localhost'
    # If specified, bind to all addressses
    if non_local_traffic:
        server_host = ''
    server = Server(aggregator, server_host, port, forward_to_host=forward_to_host, forward_to_port=forward_to_port)
    return reporter, server, c
def main(config_path=None):
    """ The main entry point for the unix version of monstatsd.

    With no arguments the server runs in the foreground; otherwise the
    first argument is a daemon command (start/stop/restart/status/info).
    :return: process exit code
    """
    parser = optparse.OptionParser("%prog [start|stop|restart|status]")
    opts, args = parser.parse_args()
    reporter, server, cnf = init_monstatsd(config_path, use_watchdog=True)
    pid_file = PidFile('monstatsd')
    daemon = Monstatsd(pid_file.get_path(), server, reporter,
                       cnf.get('autorestart', False))
    # If no args were passed in, run the server in the foreground.
    # todo does this need to be a daemon even when it basically always runs in the foreground, if not
    # restructure and get rid of the silly init_function
    if not args:
        daemon.run()
        return 0
    # Otherwise, we're process the deamon command.
    else:
        command = args[0]
        if command == 'start':
            daemon.start()
        elif command == 'stop':
            daemon.stop()
        elif command == 'restart':
            daemon.restart()
        elif command == 'status':
            daemon.status()
        elif command == 'info':
            return daemon.info()
        else:
            sys.stderr.write("Unknown command: %s\n\n" % command)
            parser.print_help()
            return 1
        return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -0,0 +1,154 @@
#!/usr/bin/env python
"""
A Python Statsd implementation with dimensions added
"""
# set up logging before importing any other components
from monagent.common.config import initialize_logging
from monagent.monstatsd.reporter import Reporter
from monagent.monstatsd.udp import Server
initialize_logging('monstatsd')
import os
# Restrict default file permissions (no group/other write bits).
# 0o22 is the forward-compatible octal spelling: the old `022` literal is a
# syntax error on Python 3, while 0o22 is valid from Python 2.6 onwards.
os.umask(0o22)
# stdlib
import logging
import optparse
import signal
import sys
# project
from monagent.common.aggregator import MetricsBucketAggregator
from monagent.common.check_status import MonstatsdStatus
from monagent.common.config import get_config
from monagent.common.daemon import Daemon, AgentSupervisor
from monagent.common.util import PidFile, get_hostname
log = logging.getLogger('monstatsd')
class Monstatsd(Daemon):
    """The monstatsd daemon: a statsd-style UDP server plus a reporter.

    Wraps the UDP `server` (metric intake) and the `reporter` (periodic
    flush toward the forwarder) behind the common Daemon interface.
    """

    def __init__(self, pid_file, server, reporter, autorestart):
        Daemon.__init__(self, pid_file, autorestart=autorestart)
        self.server = server
        self.reporter = reporter

    def _handle_sigterm(self, signum, frame):
        # Stopping the server unblocks run(), which then shuts the reporter down.
        log.debug("Caught sigterm. Stopping run loop.")
        self.server.stop()

    def run(self):
        # Gracefully exit on sigterm; treat keyboard interrupt the same way.
        signal.signal(signal.SIGTERM, self._handle_sigterm)
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # Start the reporting thread before accepting data.
        self.reporter.start()

        try:
            try:
                self.server.start()
            except Exception:
                log.exception('Error starting server')
                # Bare raise preserves the original traceback (unlike `raise e`)
                # and avoids the Python-2-only `except Exception, e` syntax.
                raise
        finally:
            # The server will block until it's done. Once we're here, shutdown
            # the reporting thread.
            self.reporter.stop()
            self.reporter.join()
            log.info("Monstatsd is stopped")
            # Restart if asked to restart
            if self.autorestart:
                sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)

    def info(self):
        # Quiet the root logger so only the status report reaches the console.
        logging.getLogger().setLevel(logging.ERROR)
        return MonstatsdStatus.print_latest_status()
def init_monstatsd(config_path=None, use_watchdog=False):
    """Configure the statsd server and the reporting thread.

    Args:
        config_path: optional path to the agent config file; when None the
            default config search path is used.
        use_watchdog: passed through to the Reporter to enable its watchdog.

    Returns:
        A (reporter, server, config) tuple.

    Raises:
        ValueError: if the configured flush interval is not positive.
    """
    c = get_config(parse_args=False, cfg_path=config_path)
    log.debug("Configuration monstatsd")

    port = c['monstatsd_port']
    interval = int(c['monstatsd_interval'])
    # NOTE: the misspelled config key ('agregator') is historical — keep it,
    # renaming it would break existing config files.
    aggregator_interval = int(c['monstatsd_agregator_bucket_size'])
    non_local_traffic = c['non_local_traffic']
    forward_to_host = c.get('statsd_forward_host')
    forward_to_port = c.get('statsd_forward_port')
    event_chunk_size = c.get('event_chunk_size')
    target = c['forwarder_url']

    hostname = get_hostname(c)

    # A non-positive interval would make the reporter never flush; fail fast.
    # An explicit raise survives `python -O`, unlike the previous assert.
    if interval <= 0:
        raise ValueError('monstatsd_interval must be positive, got %r' % interval)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads).
    aggregator = MetricsBucketAggregator(hostname, aggregator_interval,
                                         recent_point_threshold=c.get('recent_point_threshold', None))

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, use_watchdog, event_chunk_size)

    # Start the server on an IPv4 stack. Default to loopback; if specified,
    # bind to all addresses.
    server_host = 'localhost'
    if non_local_traffic:
        server_host = ''

    server = Server(aggregator, server_host, port,
                    forward_to_host=forward_to_host, forward_to_port=forward_to_port)

    return reporter, server, c
def main(config_path=None):
    """The main entry point for the unix version of monstatsd.

    Runs the server in the foreground when invoked without arguments;
    otherwise dispatches the daemon control command given as the first
    positional argument. Returns the process exit status.
    """
    # The usage string now lists 'info', which the dispatch below accepts.
    parser = optparse.OptionParser("%prog [start|stop|restart|status|info]")
    _opts, args = parser.parse_args()

    reporter, server, cnf = init_monstatsd(config_path, use_watchdog=True)
    pid_file = PidFile('monstatsd')
    daemon = Monstatsd(pid_file.get_path(), server, reporter,
                       cnf.get('autorestart', False))

    # If no args were passed in, run the server in the foreground.
    # todo does this need to be a daemon even when it basically always runs in
    # the foreground, if not restructure and get rid of the silly init_function
    if not args:
        daemon.run()
        return 0

    # Otherwise, process the daemon command.
    command = args[0]
    if command == 'start':
        daemon.start()
    elif command == 'stop':
        daemon.stop()
    elif command == 'restart':
        daemon.restart()
    elif command == 'status':
        daemon.status()
    elif command == 'info':
        return daemon.info()
    else:
        sys.stderr.write("Unknown command: %s\n\n" % command)
        parser.print_help()
        return 1

    return 0
# Script entry point: exit with the status code returned by main().
if __name__ == '__main__':
    sys.exit(main())

View File

@ -17,7 +17,7 @@ class Plugins(collections.defaultdict):
# todo Possibly enforce the key being a string without .yaml in it.
def diff(self, other_plugins):
raise NotImplementedError
raise NotImplementedError
def merge(self, other):
"""Do a deep merge with precedence going to other (as is the case with update)

View File

@ -1,99 +1,3 @@
"""Classes and utilities for detection of running resources to be monitored.
Detection classes should be platform independent
"""
import psutil
from plugin import Plugin
from utils import find_process_cmdline, find_process_name, watch_process, service_api_check
from monsetup import agent_config
class Plugin(object):
    """Abstract base for the mon-agent plugin detection classes.

    Subclasses implement _detect(), build_config() and
    dependencies_installed(); detection runs from the constructor.
    """

    # todo these should include dependency detection
    def __init__(self, template_dir, overwrite=True):
        self.available = False
        self.template_dir = template_dir
        self.dependencies = ()
        self.overwrite = overwrite
        self._detect()

    def _detect(self):
        """Run detection, set self.available True if the service is detected."""
        raise NotImplementedError

    def build_config(self):
        """Build the config as a Plugins object and return."""
        raise NotImplementedError

    def dependencies_installed(self):
        """Return True if dependencies are installed."""
        raise NotImplementedError

    @property
    def name(self):
        """Return the instance's _name if set, otherwise the class name."""
        return self._name if '_name' in vars(self) else self.__class__.__name__
def find_process_cmdline(search_string):
    """Return the first running process whose cmdline contains search_string.

    Returns None when no process matches.
    """
    for process in psutil.process_iter():
        if any(search_string in arg for arg in process.cmdline()):
            return process
    return None
def find_process_name(pname):
    """Return the first running process whose name equals pname, else None."""
    matches = (proc for proc in psutil.process_iter() if proc.name() == pname)
    return next(matches, None)
def watch_process(search_strings, service=None):
    """Build a Plugins object configuring a watch on the given processes.

    Many detection plugins set up process watching, so this helper takes a
    list of process search strings and returns the ready-made config.
    """
    parameters = {
        'name': search_strings[0],
        'search_string': search_strings,
    }
    # A service set in the plugin config adds a service dimension, which
    # overrides the service in the agent config.
    if service:
        parameters['dimensions'] = {'service': service}

    config = agent_config.Plugins()
    config['process'] = {'init_config': None, 'instances': [parameters]}
    return config
def service_api_check(name, url, pattern, service=None):
    """Setup a service api to be watched by the http_check plugin."""
    parameters = {
        'name': name,
        'url': url,
        'match_pattern': pattern,
        'timeout': 10,
        'use_keystone': True,
    }
    # A service set in the plugin config adds a service dimension, which
    # overrides the service in the agent config.
    if service:
        parameters['dimensions'] = {'service': service}

    config = agent_config.Plugins()
    config['http_check'] = {'init_config': None, 'instances': [parameters]}
    return config

View File

@ -4,6 +4,7 @@ from monsetup import agent_config
log = logging.getLogger(__name__)
class Cinder(Plugin):
"""Detect cinder daemons and setup configuration to monitor them."""

View File

@ -4,6 +4,7 @@ from monsetup import agent_config
log = logging.getLogger(__name__)
class CinderAPI(Plugin):
"""Detect the Cinder-API daemon and setup configuration to monitor it."""
@ -21,11 +22,12 @@ class CinderAPI(Plugin):
config = agent_config.Plugins()
# First watch the Nova-API process
log.info("\tMonitoring the cinder API process.")
config.merge(watch_process([self.process_name], self.service_name))
config.merge(watch_process([self.process_name], self.service_name))
# Next setup an active http_status check on the API
log.info("\tConfiguring an http_check for the cinder API.")
config.merge(service_api_check(self.process_name, 'http://localhost:8776/v2.0', '.*version=1.*', self.service_name))
config.merge(
service_api_check(self.process_name, 'http://localhost:8776/v2.0', '.*version=1.*', self.service_name))
return config

View File

@ -31,6 +31,7 @@ class Kafka(Plugin):
import kazoo
from kazoo.client import KazooClient
logging.getLogger('kazoo').setLevel(logging.WARN) # kazoo fills up the console without this
zk = KazooClient(hosts='127.0.0.1:2181', read_only=True)
@ -48,7 +49,6 @@ class Kafka(Plugin):
except kazoo.exceptions.NoNodeError:
continue
log.info("\tInstalling kafka_consumer plugin.")
config['kafka_consumer'] = {'init_config': None,
'instances': [{'kafka_connect_str': 'localhost:9092',

View File

@ -4,6 +4,7 @@ from monsetup import agent_config
log = logging.getLogger(__name__)
class Nova(Plugin):
"""Detect Nova daemons and setup configuration to monitor them."""

View File

@ -4,6 +4,7 @@ from monsetup import agent_config
log = logging.getLogger(__name__)
class NovaAPI(Plugin):
"""Detect the Nova-API daemon and setup configuration to monitor it."""
@ -25,7 +26,8 @@ class NovaAPI(Plugin):
# Next setup an active http_status check on the API
log.info("\tConfiguring an http_check for the nova API.")
config.merge(service_api_check(self.process_name, 'http://localhost:8774/v2.0', '.*version=2.*', self.service_name))
config.merge(
service_api_check(self.process_name, 'http://localhost:8774/v2.0', '.*version=2.*', self.service_name))
return config

View File

@ -0,0 +1,40 @@
"""Classes for detection of running resources to be monitored.
Detection classes should be platform independent
"""
class Plugin(object):
    """Abstract class implemented by the mon-agent plugin detection classes.

    Construction triggers _detect(); subclasses override the abstract
    methods below.
    """

    # todo these should include dependency detection
    def __init__(self, template_dir, overwrite=True):
        self.available = False
        self.template_dir = template_dir
        self.dependencies = ()
        self.overwrite = overwrite
        self._detect()

    def _detect(self):
        """Run detection, set self.available True if the service is detected."""
        raise NotImplementedError

    def build_config(self):
        """Build the config as a Plugins object and return."""
        raise NotImplementedError

    def dependencies_installed(self):
        """Return True if dependencies are installed."""
        raise NotImplementedError

    @property
    def name(self):
        """Return _name if set on the instance, otherwise the class name."""
        try:
            return self.__dict__['_name']
        except KeyError:
            return type(self).__name__

View File

@ -4,18 +4,19 @@ from monsetup import agent_config
log = logging.getLogger(__name__)
class Swift(Plugin):
"""Detect Swift daemons and setup configuration to monitor them."""
def _detect(self):
"""Run detection"""
self.swift_processes = ['swift-container-updater', 'swift-account-auditor',
'swift-object-replicator', 'swift-container-replicator',
'swift-object-auditor', 'swift-container-auditor',
'swift-account-reaper', 'swift-container-sync',
'swift-account-replicator', 'swift-object-updater',
'swift-object-server', 'swift-account-server',
'swift-container-server']
'swift-object-replicator', 'swift-container-replicator',
'swift-object-auditor', 'swift-container-auditor',
'swift-account-reaper', 'swift-container-sync',
'swift-account-replicator', 'swift-object-updater',
'swift-object-server', 'swift-account-server',
'swift-container-server']
self.found_processes = []
for process in self.swift_processes:

View File

@ -4,6 +4,7 @@ from monsetup import agent_config
log = logging.getLogger(__name__)
class SwiftAPI(Plugin):
"""Detect the Swift-API daemon and setup configuration to monitor it."""
@ -25,7 +26,8 @@ class SwiftAPI(Plugin):
# Next setup an active http_status check on the API
log.info("\tConfiguring an http_check for the swift API.")
config.merge(service_api_check(self.process_name, 'http://localhost:8080/healthcheck', '.*OK.*', self.service_name))
config.merge(
service_api_check(self.process_name, 'http://localhost:8080/healthcheck', '.*OK.*', self.service_name))
return config

View File

@ -0,0 +1,64 @@
""" Util functions to assist in detection.
"""
import psutil
from monsetup import agent_config
def find_process_cmdline(search_string):
    """Scan running processes for one whose cmdline contains search_string.

    Returns the matching psutil process, or None when nothing matches.
    """
    for proc in psutil.process_iter():
        for argument in proc.cmdline():
            if search_string in argument:
                return proc
    return None
def find_process_name(pname):
    """Scan running processes for one named exactly pname; None if absent."""
    for proc in psutil.process_iter():
        if proc.name() == pname:
            return proc
    return None
def watch_process(search_strings, service=None):
    """Takes a list of process search strings and returns a Plugins object with the config set.

    This was built as a helper as many plugins setup process watching.
    """
    config = agent_config.Plugins()
    instance = {'name': search_strings[0],
                'search_string': search_strings}
    # If service parameter is set in the plugin config, add the service
    # dimension which will override the service in the agent config.
    if service:
        instance['dimensions'] = {'service': service}
    config['process'] = {'init_config': None,
                         'instances': [instance]}
    return config
def service_api_check(name, url, pattern, service=None):
    """Setup a service api to be watched by the http_check plugin."""
    config = agent_config.Plugins()
    instance = {'name': name,
                'url': url,
                'match_pattern': pattern,
                'timeout': 10,
                'use_keystone': True}
    # If service parameter is set in the plugin config, add the service
    # dimension which will override the service in the agent config.
    if service:
        instance['dimensions'] = {'service': service}
    config['http_check'] = {'init_config': None,
                            'instances': [instance]}
    return config

View File

@ -1,44 +1,2 @@
"""Classes implementing different methods for running mon-agent on startup as well as starting the process immediately
"""
import psutil
from service import Service
class Service(object):
    """Abstract base class implementing the interface for various service types."""

    def __init__(self, config_dir, log_dir, name='mon-agent'):
        self.config_dir = config_dir
        self.log_dir = log_dir
        self.name = name

    def enable(self):
        """Sets mon-agent to start on boot.

        Generally this requires running as super user.
        """
        raise NotImplementedError

    def start(self, restart=True):
        """Starts mon-agent.

        If the agent is running and restart is True, restart.
        """
        raise NotImplementedError

    def stop(self):
        """Stops mon-agent."""
        raise NotImplementedError

    def is_enabled(self):
        """Returns True if mon-agent is setup to start on boot, False otherwise."""
        raise NotImplementedError

    @staticmethod
    def is_running():
        """Returns True if mon-agent is running, False otherwise."""
        # Look for the supervisor process, not the individual components.
        supervisor_conf = '/etc/mon-agent/supervisor.conf'
        return any(supervisor_conf in proc.cmdline()
                   for proc in psutil.process_iter())

View File

@ -0,0 +1,44 @@
"""Classes implementing different methods for running mon-agent on startup as well as starting the process immediately
"""
import psutil
class Service(object):
    """Abstract base class implementing the interface for various service types."""

    def __init__(self, config_dir, log_dir, name='mon-agent'):
        self.config_dir = config_dir
        self.log_dir = log_dir
        self.name = name

    def enable(self):
        """Sets mon-agent to start on boot.

        Generally this requires running as super user.
        """
        raise NotImplementedError

    def start(self, restart=True):
        """Starts mon-agent.

        If the agent is running and restart is True, restart.
        """
        raise NotImplementedError

    def stop(self):
        """Stops mon-agent."""
        raise NotImplementedError

    def is_enabled(self):
        """Returns True if mon-agent is setup to start on boot, False otherwise."""
        raise NotImplementedError

    @staticmethod
    def is_running():
        """Returns True if mon-agent is running, False otherwise."""
        # Looking for the supervisor process not the individual components.
        for proc in psutil.process_iter():
            if '/etc/mon-agent/supervisor.conf' in proc.cmdline():
                return True
        return False

View File

@ -131,9 +131,9 @@ setup(
packages=find_packages(exclude=['tests', 'build*', 'packaging*']),
entry_points={
'console_scripts': [
'mon-forwarder = monagent.forwarder:main',
'mon-forwarder = monagent.forwarder.daemon:main',
'mon-collector = monagent.collector.daemon:main',
'monstatsd = monagent.monstatsd:main',
'monstatsd = monagent.monstatsd.daemon:main',
'mon-setup = monsetup.main:main'
],
},