Remove 'detail' dimension, fix a couple of bugs

The 'detail' dimension in http_check is not actually an appropriate
dimension since it does not contain data which _identifies_ a metric.
This data is valuable, though, and will be captured and made available
in a future enhancement.

Bug fixes:
- Fix unix.py crash on EFI mountpoints which return 0 available inodes
- Rename 'service_name' to 'name' in nagios_wrapper for ServicesCheck
  compatibility
- Update nagios_wrapper YAML example for naming convention

Change-Id: I44877a856fac84f54ca764a79e75ace9f90d9302
This commit is contained in:
David Schroeder
2014-09-17 13:31:04 -06:00
parent 70aa0d713c
commit bac28480bd
5 changed files with 22 additions and 35 deletions

View File

@@ -13,21 +13,10 @@ instances:
# The (optional) match_pattern parameter will instruct the check
# to match the HTTP response body against a regular-expression-
# compatible pattern. If the pattern matches, the check will
# return 0 for OK. Otherwise, it will return 1 for an error and
# include the response body as a 'detail' tag, as long as
# 'include_content' below is set to 'true'
# return 0 for OK. Otherwise, it will return 1 for an error.
# match_pattern: '.*OK.*OK.*OK.*OK.*OK'
# The (optional) include_content parameter will instruct the check
# to include the first 200 characters of the HTTP response body
# in notifications sent by this plugin. This is best used with
# "healthcheck"-type URLs, where the body contains a brief, human-
# readable summary of failure reasons in the case of errors. This
# defaults to false.
# include_content: true
# The (optional) collect_response_time parameter will instruct the
# check to create a metric 'network.http.response_time', tagged with
# the url, reporting the response time in seconds.

View File

@@ -5,19 +5,19 @@ init_config:
# Where to store last-run timestamps for each check (required)
# temp_file_path: /dev/shm/
# List a check name under 'service_name' and the full command-line
# List a check name under 'name' and the full command-line
# under 'check_command'. If the command exists in 'check_path' above,
# it is not necessary to specify the full path.
instances:
# - service_name: load
# - name: nagios.load
# check_command: check_load -r -w 2,1.5,1 -c 10,5,4
# - service_name: disk
# - name: nagios.disk
# check_command: check_disk -w 15\% -c 5\% -A -i /srv/node
# check_interval: 300
# - service_name: swap
# - name: nagios.swap
# check_command: /usr/lib/nagios/plugins/check_swap -w 50\% -c 10\%
# check_interval: 120
# dimensions: { 'group': 'memory' }

View File

@@ -114,6 +114,10 @@ class Disk(monagent.collector.checks.check.Check):
except IndexError:
self.logger.exception("Cannot parse %s" % (parts,))
# Some partitions (EFI boot) may appear to have 0 available inodes
if parts[1] == 0:
continue
if inodes:
usage_data['%s.disk_inode_utilization_perc' % parts[0]] = float(parts[2]) / parts[1] * 100
else:

View File

@@ -36,11 +36,10 @@ class HTTPCheck(ServicesCheck):
pattern = instance.get('match_pattern', None)
if url is None:
raise Exception("Bad configuration. You must specify a url")
include_content = instance.get('include_content', False)
ssl = instance.get('disable_ssl_validation', True)
token = AgentCheck.keystone.get_token()
return url, username, password, timeout, include_content, headers, response_time, dimensions, ssl, pattern, use_keystone, token
return url, username, password, timeout, headers, response_time, dimensions, ssl, pattern, use_keystone, token
def _create_status_event(self, status, msg, instance):
"""Does nothing: status events are not yet supported by Mon API.
@@ -49,7 +48,7 @@ class HTTPCheck(ServicesCheck):
return
def _check(self, instance):
addr, username, password, timeout, include_content, headers, response_time, dimensions, disable_ssl_validation, pattern, use_keystone, token = self._load_conf(
addr, username, password, timeout, headers, response_time, dimensions, disable_ssl_validation, pattern, use_keystone, token = self._load_conf(
instance)
content = ''
@@ -125,9 +124,7 @@ class HTTPCheck(ServicesCheck):
running_time = time.time() - start
self.gauge('http_response_time', running_time, dimensions=new_dimensions)
# Add a 'detail' tag if requested
if include_content:
new_dimensions['detail'] = json.dumps(content)
# TODO(dschroeder): Save/send content data when supported by API
if int(resp.status) >= 400:
if use_keystone and int(resp.status) == 401:

View File

@@ -1,5 +1,5 @@
#!/bin/env python
"""Monitoring Agent wrapper for Nagios checks.
"""Monasca Agent wrapper for Nagios checks.
"""
@@ -26,12 +26,11 @@ class WrapNagios(ServicesCheck):
@staticmethod
def _do_skip_check(instance, last_run_data):
"""Determine whether or not to skip a check depending on
the check's check_interval, if specified, and the last
time the check was run
"""
if instance['service_name'] in last_run_data and 'check_interval' in instance:
if time.time() < last_run_data[instance['service_name']] + instance['check_interval']:
if instance['name'] in last_run_data and 'check_interval' in instance:
if time.time() < last_run_data[instance['name']] + instance['check_interval']:
return True
else:
return False
@@ -70,7 +69,7 @@ class WrapNagios(ServicesCheck):
last_run_path +
'nagios_wrapper_' +
hashlib.md5(
instance['service_name']).hexdigest() +
instance['name']).hexdigest() +
'.pck')
# Load last-run data from shared memory file
@@ -92,22 +91,20 @@ class WrapNagios(ServicesCheck):
output = proc.communicate()
# The check detail is all the text before the pipe
detail = output[0].split('|')[0]
if detail != '':
# Serialize the output for JSON-friendliness and add to the dimensions
dimensions['detail'] = json.dumps(detail)
# TODO(dschroeder): Save/send 'detail' when supported by the API
except OSError:
# Return an UNKNOWN code (3) if I have landed here
self.gauge(instance['service_name'], 3, dimensions=dimensions)
self.gauge(instance['name'], 3, dimensions=dimensions)
self.log.info(instance['check_command'].split(" ")[0] + " is missing or unreadable")
return
status_code = proc.poll()
last_run_data[instance['service_name']] = time.time()
self.gauge(instance['service_name'], status_code, dimensions=dimensions)
last_run_data[instance['name']] = time.time()
self.gauge(instance['name'], status_code, dimensions=dimensions)
# Return DOWN on critical, UP otherwise
if status_code == "2":
return Status.DOWN, "DOWN: " + dimensions['detail']
return Status.UP, "UP: " + dimensions['detail']
return Status.DOWN, "DOWN: {}".format(detail)
return Status.UP, "UP: {}".format(detail)
# Save last-run data
file_w = open(last_run_file, "w")