Review notes (free text ahead of the first "diff --git" line is skipped by patch tools):
  * The line structure and Python indentation of this patch were restored; indentation
    inside context hunks is inferred from the control flow of the visible code.
  * reports/pretty.py: the usage text now names pretty.py, and the final failure-rate
    line no longer divides by zero when the window holds no requests (167 lines now).
  * reports/requests.py: dump_summary no longer multiplies the average duration by 100
    before formatting it.
  * The "index" lines are kept from the original commit and do not describe the edited
    content.

diff --git a/migrations/003_populate_task_and_image.py b/migrations/003_populate_task_and_image.py
index eb5eb4d..dd09d79 100644
--- a/migrations/003_populate_task_and_image.py
+++ b/migrations/003_populate_task_and_image.py
@@ -54,7 +54,7 @@ def fix_chunk(hours, length):
     for kv in states.iteritems():
         print "%s = %d" % kv
 
-for day in xrange(2, 90):
+for day in xrange(0, 90):
     hours = day * 24
     steps = 12
     chunk = 24 / steps
diff --git a/reports/pretty.py b/reports/pretty.py
new file mode 100644
index 0000000..d177867
--- /dev/null
+++ b/reports/pretty.py
@@ -0,0 +1,167 @@
+import datetime
+import json
+import sys
+import time
+
+import prettytable
+
+sys.path.append("/stacktach")
+
+from stacktach import datetime_to_decimal as dt
+from stacktach import image_type
+from stacktach import models
+
+
+if __name__ != '__main__':
+    sys.exit(1)
+
+yesterday = datetime.datetime.utcnow().date() - datetime.timedelta(days=1)
+if len(sys.argv) == 2:
+    try:
+        t = time.strptime(sys.argv[1], "%Y-%m-%d")
+        yesterday = datetime.datetime(*t[:6])
+    except Exception, e:
+        print e
+        print "Usage: python pretty.py YYYY-MM-DD (the end date)"
+        sys.exit(1)
+
+percentile = 90
+hours = 24
+
+start = datetime.datetime(year=yesterday.year, month=yesterday.month,
+                          day=yesterday.day)
+end = start + datetime.timedelta(hours=hours-1, minutes=59, seconds=59)
+
+print "Generating report for %s to %s" % (start, end)
+
+dstart = dt.dt_to_decimal(start)
+dend = dt.dt_to_decimal(end)
+
+codes = {}
+
+# Get all the instances that have changed in the last N hours ...
+updates = models.RawData.objects.filter(event='compute.instance.update',
+                                        when__gt=dstart, when__lte=dend)\
+                                .values('instance').distinct()
+
+expiry = 60 * 60  # 1 hour
+cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']
+
+failures = {}
+durations = {}
+attempts = {}
+
+for uuid_dict in updates:
+    uuid = uuid_dict['instance']
+
+    # All the unique Request ID's for this instance during that timespan.
+    reqs = models.RawData.objects.filter(instance=uuid,
+                                         when__gt=dstart, when__lte=dend) \
+                                 .values('request_id').distinct()
+
+
+    for req_dict in reqs:
+        report = False
+        req = req_dict['request_id']
+        raws = models.RawData.objects.filter(request_id=req)\
+                                     .exclude(event='compute.instance.exists')\
+                                     .order_by('when')
+
+        start = None
+        err = None
+
+        operation = "aux"
+        image_type_num = 0
+
+        for raw in raws:
+            if not start:
+                start = raw.when
+            if 'error' in raw.routing_key:
+                err = raw
+                report = True
+
+            for cmd in cmds:
+                if cmd in raw.event:
+                    operation = cmd
+                    break
+
+            if raw.image_type:
+                image_type_num |= raw.image_type
+
+        image = "?"
+        if image_type.isset(image_type_num, image_type.BASE_IMAGE):
+            image = "base"
+        if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
+            image = "snap"
+
+        if not start:
+            continue
+
+        end = raw.when
+        diff = end - start
+
+        if diff > 3600:
+            report = True
+
+        key = (operation, image)
+
+        # Track durations for all attempts, good and bad ...
+        _durations = durations.get(key, [])
+        _durations.append(diff)
+        durations[key] = _durations
+
+        attempts[key] = attempts.get(key, 0) + 1
+
+        if report:
+            failures[key] = failures.get(key, 0) + 1
+
+# Print the results ...
+cols = ["Operation", "Image", "Min*", "Max*", "Avg*",
+        "Requests", "# Fail", "Fail %"]
+p = prettytable.PrettyTable(cols)
+for c in cols[2:]:
+    p.align[c] = 'r'
+p.sortby = cols[0]
+
+pct = (float(100 - percentile) / 2.0) / 100.0
+print "* Using %d-th percentile for results (+/-%.1f%% cut)" % \
+    (percentile, pct * 100.0)
+total = 0
+failure_total = 0
+for key, count in attempts.iteritems():
+    total += count
+    operation, image = key
+
+    failure_count = failures.get(key, 0)
+    failure_total += failure_count
+    failure_percentage = float(failure_count) / float(count)
+    _failure_percentage = "%.1f%%" % (failure_percentage * 100.0)
+
+    # N-th % of durations ...
+    _values = durations[key]
+    _values.sort()
+    _outliers = int(float(len(_values)) * pct)
+    if _outliers > 0:
+        before = len(_values)
+        _values = _values[_outliers:-_outliers]
+        print "culling %d -> %d" % (before, len(_values))
+    _min = 99999999
+    _max = 0
+    _total = 0.0
+    for value in _values:
+        _min = min(_min, value)
+        _max = max(_max, value)
+        _total += float(value)
+    _avg = float(_total) / float(len(_values))
+    _fmin = dt.sec_to_str(_min)
+    _fmax = dt.sec_to_str(_max)
+    _favg = dt.sec_to_str(_avg)
+
+    p.add_row([operation, image, _fmin, _fmax, _favg, count,
+               failure_count, _failure_percentage])
+print p
+
+# No attempts in the window means total == 0; report 0.0% in that case.
+failure_rate = (float(failure_total) / float(total)) * 100.0 if total else 0.0
+print "Total: %d, Failures: %d, Failure Rate: %.1f%%" % \
+    (total, failure_total, failure_rate)
diff --git a/reports/requests.py b/reports/requests.py
index 48daeea..942fed8 100644
--- a/reports/requests.py
+++ b/reports/requests.py
@@ -26,9 +26,10 @@ if len(sys.argv) == 2:
         sys.exit(1)
 
 hours = 0
-length = 24
+length = 6
 
-start = datetime.datetime(year=yesterday.year, month=yesterday.month, day=yesterday.day)
+start = datetime.datetime(year=yesterday.year, month=yesterday.month,
+                          day=yesterday.day)
 end = start + datetime.timedelta(hours=length-1, minutes=59, seconds=59)
 
 print "Generating report for %s to %s" % (start, end)
@@ -48,6 +49,7 @@ cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']
 
 failures = {}
 causes = {}
+durations = {}
 error_messages = {}
 successes = {}
 tenant_issues = {}
@@ -71,10 +73,10 @@ for uuid_dict in updates:
         start = None
         err = None
 
-        operation = "n/a"
+        operation = "aux"
         platform = 0
         tenant = 0
-        cell = "n/a"
+        cell = "unk"
 
         for raw in raws:
             if not start:
@@ -107,6 +109,17 @@ for uuid_dict in updates:
             report = True
 
         key = (operation, platform, cell)
+
+        # Track durations for all attempts, good and bad ...
+        duration_min, duration_max, duration_count, duration_total = \
+            durations.get(key, (9999999, 0, 0, 0))
+        duration_min = min(duration_min, diff)
+        duration_max = max(duration_max, diff)
+        duration_count += 1
+        duration_total += diff
+        durations[key] = (duration_min, duration_max, duration_count,
+                          duration_total)
+
         if not report:
             successes[key] = successes.get(key, 0) + 1
         else:
@@ -124,14 +137,16 @@ for uuid_dict in updates:
                 payload = body['payload']
                 print "Error. EventID: %s, Tenant %s, Service %s, Host %s, "\
                       "Deployment %s, Event %s, When %s"\
-                    % (err.id, err.tenant, err.service, err.host, err.deployment.name,
+                    % (err.id, err.tenant, err.service, err.host,
+                       err.deployment.name,
                        err.event, dt.dt_from_decimal(err.when))
                 exc = payload.get('exception')
                 if exc:
                     # group the messages ...
                     exc_str = str(exc)
                     print exc_str
-                    error_messages[exc_str] = error_messages.get(exc_str, 0) + 1
+                    error_messages[exc_str] = \
+                                        error_messages.get(exc_str, 0) + 1
 
                     # extract the code, if any ...
                     code = exc.get('kwargs', {}).get('code')
@@ -151,9 +166,15 @@ def dump_breakdown(totals, label):
     print p
 
 
+
+
 def dump_summary(info, label):
     print "-- %s by operation by cell by platform --" % (label,)
-    p = prettytable.PrettyTable(["Operation", "Cell", "Platform", "Count"])
+    p = prettytable.PrettyTable(["Operation", "Cell", "Platform", "Count",
+                                 "Min", "Max", "Avg"])
+    for c in ["Count", "Min", "Max", "Avg"]:
+        p.align[c] = 'r'
+
     total = 0
     op_totals = {}
     cell_totals = {}
@@ -164,11 +185,18 @@ def dump_summary(info, label):
         text = "n/a"
         if readable:
             text = ", ".join(readable)
+
+        _min, _max, _count, _total = durations[key]
+        _avg = float(_total) / float(_count)
+        _fmin = dt.sec_to_str(_min)
+        _fmax = dt.sec_to_str(_max)
+        _favg = dt.sec_to_str(_avg)
+
         op_totals[operation] = op_totals.get(operation, 0) + count
         cell_totals[cell] = cell_totals.get(cell, 0) + count
         platform_totals[text] = platform_totals.get(text, 0) + count
 
-        p.add_row([operation, cell, text, count])
+        p.add_row([operation, cell, text, count, _fmin, _fmax, _favg])
         total += count
     p.sortby = 'Count'
     print p
diff --git a/stacktach/datetime_to_decimal.py b/stacktach/datetime_to_decimal.py
index 1781cb0..4c14caa 100644
--- a/stacktach/datetime_to_decimal.py
+++ b/stacktach/datetime_to_decimal.py
@@ -21,6 +21,19 @@ def dt_from_decimal(dec):
     return daittyme.replace(microsecond=micro)
 
 
+def sec_to_str(sec):
+    sec = int(sec)
+    if sec < 60:
+        return "%ds" % sec
+    minutes = sec / 60
+    sec = sec % 60
+    if minutes < 60:
+        return "%d:%02ds" % (minutes, sec)
+    hours = minutes / 60
+    minutes = minutes % 60
+    return "%02d:%02d:%02d" % (hours, minutes, sec)
+
+
 if __name__ == '__main__':
     now = datetime.datetime.utcnow()
     d = dt_to_decimal(now)
diff --git a/stacktach/image_type.py b/stacktach/image_type.py
index 33af6ad..83c4eb0 100644
--- a/stacktach/image_type.py
+++ b/stacktach/image_type.py
@@ -9,6 +9,8 @@ OS_RHEL = 0x800
 
 
 def isset(num, flag):
+    if not num:
+        return False
     return num & flag > 0
 
 
@@ -41,7 +43,7 @@ def get_numeric_code(payload, default=0):
     if image_type == 'snapshot':
         num |= SNAPSHOT_IMAGE
 
-    os_type = meta.get('os_type', '')
+    os_type = meta.get('os_type', payload.get('os_type', ''))
     if os_type == 'linux':
         num |= LINUX_IMAGE
 