Check for local crash dumps

Add a check that monitors crash dumps. The specified crash directory
is scanned for sub-directories named 'YYYYmmddHHMM' which contain the
dump files. The 'crash.dump_count' metric is the number of such
sub-directories (i.e., the number of crash dumps present) and the
'latest' entry of its 'value_meta' dict holds the date and time of the
most recent crash.

Implements: blueprint check-for-crashes

Change-Id: I20b584c68644ff2e76baabab78e965e682759aa0
This commit is contained in:
Juerg Haefliger 2015-06-26 12:51:23 +02:00
parent c189c45be7
commit 3dade6deb2
3 changed files with 107 additions and 0 deletions

View File

@ -0,0 +1,6 @@
init_config:
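# Optional: directory scanned for crash dump sub-directories.
# Defaults to /var/crash when not set.
# crash_dir: /var/crash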
instances:
# Crash check only supports one configured instance
- name: crash_stats

View File

@ -0,0 +1,46 @@
import logging
import os
from datetime import datetime
import monasca_agent.collector.checks as checks
# Module-level logger, named after this module; used for debug output.
log = logging.getLogger(__name__)
class Crash(checks.AgentCheck):
    """Report the number of crash dumps found in a local crash directory.

    Every sub-directory of the crash directory whose name parses with
    ``_DUMP_DIR_FORMAT`` ('YYYYmmddHHMM') counts as one crash dump. The
    count is published as the ``crash.dump_count`` gauge, and the date and
    time of the most recent dump is attached as ``value_meta['latest']``
    (an empty string when no dump is present).
    """

    # strptime() format a sub-directory name must match to count as a dump.
    _DUMP_DIR_FORMAT = '%Y%m%d%H%M'

    def __init__(self, name, init_config, agent_config):
        """Read the optional ``crash_dir`` setting from ``init_config``.

        :param name: check name, passed through to the base class
        :param init_config: check-wide settings; ``crash_dir`` defaults to
            ``/var/crash`` when absent
        :param agent_config: agent settings, passed through to the base class
        """
        super(Crash, self).__init__(name, init_config, agent_config)
        # An 'init_config:' section that holds nothing but comments is
        # loaded from YAML as None, so fall back to an empty mapping before
        # the lookup. This is harmless if the base class already does so.
        settings = self.init_config or {}
        self.crash_dir = settings.get('crash_dir', '/var/crash')
        log.debug('crash dir: %s', self.crash_dir)

    def check(self, instance):
        """Capture crash dump statistics.

        :param instance: the configured instance; only used here to derive
            the metric dimensions
        """
        dimensions = self._set_dimensions(None, instance)
        dump_times = self._find_dump_times()
        dump_count = len(dump_times)

        value_meta = {'latest': ''}
        if dump_times:
            # Take the chronological maximum rather than trusting the
            # lexical order of the names: strptime() also accepts fields
            # that are not zero-padded, for which the two orders differ.
            # u'%s' produces text on both Python 2 and Python 3, whereas
            # the unicode() builtin only exists on Python 2.
            value_meta = {'latest': u'%s' % max(dump_times)}

        log.debug('dump_count: %s', dump_count)
        self.gauge('crash.dump_count', dump_count, dimensions=dimensions,
                   value_meta=value_meta)

    def _find_dump_times(self):
        """Return the datetimes encoded in the names of all dump directories.

        Entries that are not directories, or whose name does not parse with
        ``_DUMP_DIR_FORMAT``, are ignored. A missing crash directory yields
        an empty list.
        """
        if not os.path.isdir(self.crash_dir):
            return []

        dump_times = []
        for entry in os.listdir(self.crash_dir):
            path = os.path.join(self.crash_dir, entry)
            if not os.path.isdir(path):
                continue
            try:
                dump_time = datetime.strptime(entry, self._DUMP_DIR_FORMAT)
            except ValueError:
                # Not named like a crash dump directory.
                continue
            log.debug('found crash dump dir: %s', path)
            dump_times.append(dump_time)
        return dump_times

55
tests/test_crash.py Normal file
View File

@ -0,0 +1,55 @@
import os
import shutil
import unittest
import uuid
from common import get_check
class TestCrash(unittest.TestCase):
    """Exercise the 'crash' check against a throw-away crash directory."""

    def setUp(self):
        # tempfile is only needed here, hence the function-scope import.
        import tempfile
        # mkdtemp() creates a uniquely named directory in the platform's
        # temp location, instead of assuming that /tmp exists.
        self.crash_dir = tempfile.mkdtemp(prefix='crash-test-')

    def tearDown(self):
        shutil.rmtree(self.crash_dir)

    def _first_metric(self, check, instance):
        """Run the check once and return the first metric it collected."""
        check.check(instance)
        return check.get_metrics()[0]

    def _add_dump(self, name):
        """Create an empty crash dump sub-directory called *name*."""
        os.mkdir(os.path.join(self.crash_dir, name))

    def test_checks(self):
        # 'crash_dir' has to be nested under 'init_config'; as a top-level
        # key it would be ignored and the check would scan its default
        # directory instead of the temporary one.
        config = (
            "init_config:\n"
            "  crash_dir: %s\n"
            "instances:\n"
            "  - name: crash_stats\n"
        ) % self.crash_dir
        (check, instances) = get_check('crash', config)
        instance = instances[0]

        # Baseline check: no dumps, no timestamp
        metric = self._first_metric(check, instance)
        self.assertEqual(metric.value, 0)
        self.assertEqual(metric.value_meta['latest'], '')

        # Add a crash and re-check
        self._add_dump('201504141011')
        metric = self._first_metric(check, instance)
        self.assertEqual(metric.value, 1)
        self.assertEqual(metric.value_meta['latest'],
                         '2015-04-14 10:11:00')

        # Add a second, more recent crash and re-check
        self._add_dump('201505222303')
        metric = self._first_metric(check, instance)
        self.assertEqual(metric.value, 2)
        self.assertEqual(metric.value_meta['latest'],
                         '2015-05-22 23:03:00')


if __name__ == "__main__":
    unittest.main()