tighter summary report with percentiles on times

2013-02-12 14:04:54 -06:00 · 2013-02-12 14:04:54 -06:00 · 9d40ce24dd
commit 9d40ce24dd
parent 56f65969e1
5 changed files with 219 additions and 10 deletions
--- a/migrations/003_populate_task_and_image.py
+++ b/migrations/003_populate_task_and_image.py
@ -54,7 +54,7 @@ def fix_chunk(hours, length):
    for kv in states.iteritems():
        print "%s = %d" % kv
-for day in xrange(2, 90):
+for day in xrange(0, 90):
    hours = day * 24
    steps = 12
    chunk = 24 / steps
--- a/reports/pretty.py
+++ b/reports/pretty.py
@ -0,0 +1,166 @@
 import datetime
 import json
 import sys
 import time
 import prettytable
 sys.path.append("/stacktach")
 from stacktach import datetime_to_decimal as dt
 from stacktach import image_type
 from stacktach import models
 if __name__ != '__main__':
    sys.exit(1)
 yesterday = datetime.datetime.utcnow().date() - datetime.timedelta(days=1)
 if len(sys.argv) == 2:
    try:
        t = time.strptime(sys.argv[1], "%Y-%m-%d")
        yesterday = datetime.datetime(*t[:6])
    except Exception, e:
        print e
        print "Usage: python requests.py YYYY-MM-DD (the end date)"
        sys.exit(1)
 percentile = 90
 hours = 24
 start = datetime.datetime(year=yesterday.year, month=yesterday.month, 
                          day=yesterday.day) 
 end = start + datetime.timedelta(hours=hours-1, minutes=59, seconds=59)
 print "Generating report for %s to %s" % (start, end)
 dstart = dt.dt_to_decimal(start)
 dend = dt.dt_to_decimal(end)
 codes = {}
 # Get all the instances that have changed in the last N hours ...
 updates = models.RawData.objects.filter(event='compute.instance.update',
                                        when__gt=dstart, when__lte=dend)\
                                .values('instance').distinct()
 expiry = 60 * 60  # 1 hour
 cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']
 failures = {}
 durations = {}
 attempts = {}
 for uuid_dict in updates:
    uuid = uuid_dict['instance']
    # All the unique Request ID's for this instance during that timespan.
    reqs = models.RawData.objects.filter(instance=uuid,
                                         when__gt=dstart, when__lte=dend) \
                                 .values('request_id').distinct()
    for req_dict in reqs:
        report = False
        req = req_dict['request_id']
        raws = models.RawData.objects.filter(request_id=req)\
                                     .exclude(event='compute.instance.exists')\
                                     .order_by('when')
        start = None
        err = None
        operation = "aux"
        image_type_num = 0
        for raw in raws:
            if not start:
                start = raw.when
            if 'error' in raw.routing_key:
                err = raw
                report = True
            for cmd in cmds:
                if cmd in raw.event:
                    operation = cmd
                    break
            if raw.image_type:
                image_type_num |= raw.image_type                 
        image = "?"
        if image_type.isset(image_type_num, image_type.BASE_IMAGE):
            image = "base"
        if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
            image = "snap"
        if not start:
            continue
        end = raw.when
        diff = end - start
        if diff > 3600:
            report = True
        key = (operation, image)
        # Track durations for all attempts, good and bad ...
        _durations = durations.get(key, [])
        _durations.append(diff)
        durations[key] = _durations
        attempts[key] = attempts.get(key, 0) + 1
        if report:
            failures[key] = failures.get(key, 0) + 1
 # Print the results ...
 cols = ["Operation", "Image", "Min*", "Max*", "Avg*",
        "Requests", "# Fail", "Fail %"]
 p = prettytable.PrettyTable(cols)
 for c in cols[2:]:
    p.align[c] = 'r'
 p.sortby = cols[0]
 pct = (float(100 - percentile) / 2.0) / 100.0
 print "* Using %d-th percentile for results (+/-%.1f%% cut)" % \
                            (percentile, pct * 100.0)
 total = 0
 failure_total = 0
 for key, count in attempts.iteritems():
    total += count
    operation, image = key
    failure_count = failures.get(key, 0)
    failure_total += failure_count
    failure_percentage = float(failure_count) / float(count)
    _failure_percentage = "%.1f%%" % (failure_percentage * 100.0)
    # N-th % of durations ...
    _values = durations[key]
    _values.sort()
    _outliers = int(float(len(_values)) * pct)
    if _outliers > 0:
        before = len(_values)
        _values = _values[_outliers:-_outliers]
        print "culling %d -> %d" % (before, len(_values))
    _min = 99999999
    _max = 0
    _total = 0.0
    for value in _values:
        _min = min(_min, value)
        _max = max(_max, value)
        _total += float(value)
    _avg = float(_total) / float(len(_values))
    _fmin = dt.sec_to_str(_min)
    _fmax = dt.sec_to_str(_max)
    _favg = dt.sec_to_str(_avg)
    p.add_row([operation, image, _fmin, _fmax, _favg, count, 
               failure_count, _failure_percentage])
 print p
 print "Total: %d, Failures: %d, Failure Rate: %.1f%%" % \
                (total, failure_total, 
                    (float(failure_total)/float(total)) * 100.0)
--- a/reports/requests.py
+++ b/reports/requests.py
@ -26,9 +26,10 @@ if len(sys.argv) == 2:
        sys.exit(1)
 hours = 0
-length = 24
+length = 6
-start = datetime.datetime(year=yesterday.year, month=yesterday.month, day=yesterday.day) 
+start = datetime.datetime(year=yesterday.year, month=yesterday.month, 
                          day=yesterday.day) 
 end = start + datetime.timedelta(hours=length-1, minutes=59, seconds=59)
 print "Generating report for %s to %s" % (start, end)
@ -48,6 +49,7 @@ cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']
 failures = {}
 causes = {}
 durations = {}
 error_messages = {}
 successes = {}
 tenant_issues = {}
@ -71,10 +73,10 @@ for uuid_dict in updates:
        start = None
        err = None
-        operation = "n/a"
+        operation = "aux"
        platform = 0
        tenant = 0
-        cell = "n/a"
+        cell = "unk"
        for raw in raws:
            if not start:
@ -107,6 +109,17 @@ for uuid_dict in updates:
            report = True
        key = (operation, platform, cell)
        # Track durations for all attempts, good and bad ...
        duration_min, duration_max, duration_count, duration_total = \
            durations.get(key, (9999999, 0, 0, 0))
        duration_min = min(duration_min, diff)
        duration_max = max(duration_max, diff)
        duration_count += 1
        duration_total += diff
        durations[key] = (duration_min, duration_max, duration_count,
                          duration_total)
        if not report:
            successes[key] = successes.get(key, 0) + 1
        else:
@ -124,14 +137,16 @@ for uuid_dict in updates:
                payload = body['payload']
                print "Error. EventID: %s, Tenant %s, Service %s, Host %s, "\
                      "Deployment %s, Event %s, When %s"\
-                    % (err.id, err.tenant, err.service, err.host, err.deployment.name, 
+                    % (err.id, err.tenant, err.service, err.host, 
                       err.deployment.name, 
                       err.event, dt.dt_from_decimal(err.when))
                exc = payload.get('exception')
                if exc:
                    # group the messages ...
                    exc_str = str(exc)
                    print exc_str
-                    error_messages[exc_str] = error_messages.get(exc_str, 0) + 1
+                    error_messages[exc_str] = \
                                        error_messages.get(exc_str, 0) + 1
                    # extract the code, if any ...
                    code = exc.get('kwargs', {}).get('code')
@ -151,9 +166,15 @@ def dump_breakdown(totals, label):
    print p
 def dump_summary(info, label):
    print "-- %s by operation by cell by platform --" % (label,)
-    p = prettytable.PrettyTable(["Operation", "Cell", "Platform", "Count"])
+    p = prettytable.PrettyTable(["Operation", "Cell", "Platform", "Count",
                                 "Min", "Max", "Avg"])
    for c in ["Count", "Min", "Max", "Avg"]:
        p.align[c] = 'r'
    total = 0
    op_totals = {}
    cell_totals = {}
@ -164,11 +185,18 @@ def dump_summary(info, label):
        text = "n/a"
        if readable:
            text = ", ".join(readable)
        _min, _max, _count, _total = durations[key]
        _avg = float(_total) / float(_count)
        _fmin = dt.sec_to_str(_min)
        _fmax = dt.sec_to_str(_max)
        _favg = dt.sec_to_str(_avg * 100.0)
        op_totals[operation] = op_totals.get(operation, 0) + count
        cell_totals[cell] = cell_totals.get(cell, 0) + count
        platform_totals[text] = platform_totals.get(text, 0) + count
-        p.add_row([operation, cell, text, count])
+        p.add_row([operation, cell, text, count, _fmin, _fmax, _favg])
        total += count
    p.sortby = 'Count'
    print p
--- a/stacktach/datetime_to_decimal.py
+++ b/stacktach/datetime_to_decimal.py
@ -21,6 +21,19 @@ def dt_from_decimal(dec):
    return daittyme.replace(microsecond=micro)
 def sec_to_str(sec):
    """Render a duration given in seconds as a short string.

    Any fractional part is truncated.  Durations under a minute come out
    as "42s", under an hour as "M:SSs", and anything longer as "HH:MM:SS".
    """
    whole = int(sec)
    if whole < 60:
        return "%ds" % whole
    mins, secs = divmod(whole, 60)
    if mins < 60:
        return "%d:%02ds" % (mins, secs)
    hrs, mins = divmod(mins, 60)
    return "%02d:%02d:%02d" % (hrs, mins, secs)
 if __name__ == '__main__':
    now = datetime.datetime.utcnow()
    d = dt_to_decimal(now)
--- a/stacktach/image_type.py
+++ b/stacktach/image_type.py
@ -9,6 +9,8 @@ OS_RHEL = 0x800
 def isset(num, flag):
    """Return True when `num` is truthy and shares at least one bit with `flag`."""
    # A missing/zero bitmask can never have the flag set.
    return bool(num) and (num & flag) > 0
@ -41,7 +43,7 @@ def get_numeric_code(payload, default=0):
    if image_type == 'snapshot':
        num |= SNAPSHOT_IMAGE
-    os_type = meta.get('os_type', '')
+    os_type = meta.get('os_type', payload.get('os_type', ''))
    if os_type == 'linux':
        num |= LINUX_IMAGE