tighter summary report with percentiles on times
This commit is contained in:
parent
56f65969e1
commit
9d40ce24dd
@ -54,7 +54,7 @@ def fix_chunk(hours, length):
|
|||||||
for kv in states.iteritems():
|
for kv in states.iteritems():
|
||||||
print "%s = %d" % kv
|
print "%s = %d" % kv
|
||||||
|
|
||||||
for day in xrange(2, 90):
|
for day in xrange(0, 90):
|
||||||
hours = day * 24
|
hours = day * 24
|
||||||
steps = 12
|
steps = 12
|
||||||
chunk = 24 / steps
|
chunk = 24 / steps
|
||||||
|
166
reports/pretty.py
Normal file
166
reports/pretty.py
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
import prettytable
|
||||||
|
|
||||||
|
sys.path.append("/stacktach")
|
||||||
|
|
||||||
|
from stacktach import datetime_to_decimal as dt
|
||||||
|
from stacktach import image_type
|
||||||
|
from stacktach import models
|
||||||
|
|
||||||
|
|
||||||
|
# Daily summary report: for every (operation, image kind) pair seen in the
# reporting window, print request counts, failure counts and min/max/avg
# request durations after trimming outliers from both ends.
#
# This is a report script, not an importable module: refuse to load as one.
if __name__ != '__main__':
    sys.exit(1)

# Default to reporting on yesterday (UTC); an optional YYYY-MM-DD argument
# selects a different day.
yesterday = datetime.datetime.utcnow().date() - datetime.timedelta(days=1)
if len(sys.argv) == 2:
    try:
        t = time.strptime(sys.argv[1], "%Y-%m-%d")
        yesterday = datetime.datetime(*t[:6])
    except ValueError as e:
        # strptime() / datetime() signal a malformed date with ValueError.
        print(e)
        print("Usage: python %s YYYY-MM-DD (the end date)" % sys.argv[0])
        sys.exit(1)

percentile = 90
hours = 24

# The reporting window runs from midnight to 23:59:59 of the chosen day.
start = datetime.datetime(year=yesterday.year, month=yesterday.month,
                          day=yesterday.day)
end = start + datetime.timedelta(hours=hours - 1, minutes=59, seconds=59)

print("Generating report for %s to %s" % (start, end))

# RawData.when is filtered against these converted values below.
dstart = dt.dt_to_decimal(start)
dend = dt.dt_to_decimal(end)

# Get all the instances that have changed in the reporting window ...
updates = models.RawData.objects.filter(event='compute.instance.update',
                                        when__gt=dstart, when__lte=dend)\
                                .values('instance').distinct()

expiry = 60 * 60  # 1 hour; requests running longer count as failures
cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']

failures = {}   # (operation, image) -> number of failed requests
durations = {}  # (operation, image) -> list of request durations
attempts = {}   # (operation, image) -> number of requests seen

for uuid_dict in updates:
    uuid = uuid_dict['instance']

    # All the unique Request ID's for this instance during that timespan.
    reqs = models.RawData.objects.filter(instance=uuid,
                                         when__gt=dstart, when__lte=dend)\
                                 .values('request_id').distinct()

    for req_dict in reqs:
        report = False
        req = req_dict['request_id']
        raws = models.RawData.objects.filter(request_id=req)\
                                     .exclude(event='compute.instance.exists')\
                                     .order_by('when')

        # Separate names so the report window's start/end are not clobbered.
        req_start = None
        req_end = None

        operation = "aux"
        image_type_num = 0

        for raw in raws:
            if not req_start:
                req_start = raw.when
            # Rows are ordered by 'when', so the last one seen ends the request.
            req_end = raw.when

            if 'error' in raw.routing_key:
                report = True

            # The first known command named in the event wins for this row;
            # later rows may override the operation.
            for cmd in cmds:
                if cmd in raw.event:
                    operation = cmd
                    break

            if raw.image_type:
                image_type_num |= raw.image_type

        image = "?"
        if image_type.isset(image_type_num, image_type.BASE_IMAGE):
            image = "base"
        if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
            image = "snap"

        # No usable rows for this request: nothing to measure.
        if not req_start:
            continue

        diff = req_end - req_start

        if diff > expiry:
            report = True

        key = (operation, image)

        # Track durations for all attempts, good and bad ...
        durations.setdefault(key, []).append(diff)

        attempts[key] = attempts.get(key, 0) + 1

        if report:
            failures[key] = failures.get(key, 0) + 1

# Print the results ...
cols = ["Operation", "Image", "Min*", "Max*", "Avg*",
        "Requests", "# Fail", "Fail %"]
p = prettytable.PrettyTable(cols)
for c in cols[2:]:
    p.align[c] = 'r'
p.sortby = cols[0]

# Fraction of samples trimmed from EACH end of the sorted durations.
pct = (float(100 - percentile) / 2.0) / 100.0
print("* Using %d-th percentile for results (+/-%.1f%% cut)" %
      (percentile, pct * 100.0))
total = 0
failure_total = 0
for key, count in attempts.items():
    total += count
    operation, image = key

    failure_count = failures.get(key, 0)
    failure_total += failure_count
    failure_percentage = float(failure_count) / float(count)
    _failure_percentage = "%.1f%%" % (failure_percentage * 100.0)

    # N-th % of durations ...
    _values = sorted(durations[key])
    _outliers = int(float(len(_values)) * pct)
    if _outliers > 0:
        before = len(_values)
        _values = _values[_outliers:-_outliers]
        print("culling %d -> %d" % (before, len(_values)))

    # Every key in attempts has at least one duration, and the trim removes
    # at most 2 * pct of the samples, so _values is never empty here.
    _min = min(_values)
    _max = max(_values)
    _avg = sum(float(value) for value in _values) / float(len(_values))
    _fmin = dt.sec_to_str(_min)
    _fmax = dt.sec_to_str(_max)
    _favg = dt.sec_to_str(_avg)

    p.add_row([operation, image, _fmin, _fmax, _favg, count,
               failure_count, _failure_percentage])
print(p)

# An empty reporting window must not crash the summary line.
failure_rate = 0.0
if total:
    failure_rate = (float(failure_total) / float(total)) * 100.0
print("Total: %d, Failures: %d, Failure Rate: %.1f%%" %
      (total, failure_total, failure_rate))
|
@ -26,9 +26,10 @@ if len(sys.argv) == 2:
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
hours = 0
|
hours = 0
|
||||||
length = 24
|
length = 6
|
||||||
|
|
||||||
start = datetime.datetime(year=yesterday.year, month=yesterday.month, day=yesterday.day)
|
start = datetime.datetime(year=yesterday.year, month=yesterday.month,
|
||||||
|
day=yesterday.day)
|
||||||
end = start + datetime.timedelta(hours=length-1, minutes=59, seconds=59)
|
end = start + datetime.timedelta(hours=length-1, minutes=59, seconds=59)
|
||||||
|
|
||||||
print "Generating report for %s to %s" % (start, end)
|
print "Generating report for %s to %s" % (start, end)
|
||||||
@ -48,6 +49,7 @@ cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']
|
|||||||
|
|
||||||
failures = {}
|
failures = {}
|
||||||
causes = {}
|
causes = {}
|
||||||
|
durations = {}
|
||||||
error_messages = {}
|
error_messages = {}
|
||||||
successes = {}
|
successes = {}
|
||||||
tenant_issues = {}
|
tenant_issues = {}
|
||||||
@ -71,10 +73,10 @@ for uuid_dict in updates:
|
|||||||
start = None
|
start = None
|
||||||
err = None
|
err = None
|
||||||
|
|
||||||
operation = "n/a"
|
operation = "aux"
|
||||||
platform = 0
|
platform = 0
|
||||||
tenant = 0
|
tenant = 0
|
||||||
cell = "n/a"
|
cell = "unk"
|
||||||
|
|
||||||
for raw in raws:
|
for raw in raws:
|
||||||
if not start:
|
if not start:
|
||||||
@ -107,6 +109,17 @@ for uuid_dict in updates:
|
|||||||
report = True
|
report = True
|
||||||
|
|
||||||
key = (operation, platform, cell)
|
key = (operation, platform, cell)
|
||||||
|
|
||||||
|
# Track durations for all attempts, good and bad ...
|
||||||
|
duration_min, duration_max, duration_count, duration_total = \
|
||||||
|
durations.get(key, (9999999, 0, 0, 0))
|
||||||
|
duration_min = min(duration_min, diff)
|
||||||
|
duration_max = max(duration_max, diff)
|
||||||
|
duration_count += 1
|
||||||
|
duration_total += diff
|
||||||
|
durations[key] = (duration_min, duration_max, duration_count,
|
||||||
|
duration_total)
|
||||||
|
|
||||||
if not report:
|
if not report:
|
||||||
successes[key] = successes.get(key, 0) + 1
|
successes[key] = successes.get(key, 0) + 1
|
||||||
else:
|
else:
|
||||||
@ -124,14 +137,16 @@ for uuid_dict in updates:
|
|||||||
payload = body['payload']
|
payload = body['payload']
|
||||||
print "Error. EventID: %s, Tenant %s, Service %s, Host %s, "\
|
print "Error. EventID: %s, Tenant %s, Service %s, Host %s, "\
|
||||||
"Deployment %s, Event %s, When %s"\
|
"Deployment %s, Event %s, When %s"\
|
||||||
% (err.id, err.tenant, err.service, err.host, err.deployment.name,
|
% (err.id, err.tenant, err.service, err.host,
|
||||||
|
err.deployment.name,
|
||||||
err.event, dt.dt_from_decimal(err.when))
|
err.event, dt.dt_from_decimal(err.when))
|
||||||
exc = payload.get('exception')
|
exc = payload.get('exception')
|
||||||
if exc:
|
if exc:
|
||||||
# group the messages ...
|
# group the messages ...
|
||||||
exc_str = str(exc)
|
exc_str = str(exc)
|
||||||
print exc_str
|
print exc_str
|
||||||
error_messages[exc_str] = error_messages.get(exc_str, 0) + 1
|
error_messages[exc_str] = \
|
||||||
|
error_messages.get(exc_str, 0) + 1
|
||||||
|
|
||||||
# extract the code, if any ...
|
# extract the code, if any ...
|
||||||
code = exc.get('kwargs', {}).get('code')
|
code = exc.get('kwargs', {}).get('code')
|
||||||
@ -151,9 +166,15 @@ def dump_breakdown(totals, label):
|
|||||||
print p
|
print p
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def dump_summary(info, label):
|
def dump_summary(info, label):
|
||||||
print "-- %s by operation by cell by platform --" % (label,)
|
print "-- %s by operation by cell by platform --" % (label,)
|
||||||
p = prettytable.PrettyTable(["Operation", "Cell", "Platform", "Count"])
|
p = prettytable.PrettyTable(["Operation", "Cell", "Platform", "Count",
|
||||||
|
"Min", "Max", "Avg"])
|
||||||
|
for c in ["Count", "Min", "Max", "Avg"]:
|
||||||
|
p.align[c] = 'r'
|
||||||
|
|
||||||
total = 0
|
total = 0
|
||||||
op_totals = {}
|
op_totals = {}
|
||||||
cell_totals = {}
|
cell_totals = {}
|
||||||
@ -164,11 +185,18 @@ def dump_summary(info, label):
|
|||||||
text = "n/a"
|
text = "n/a"
|
||||||
if readable:
|
if readable:
|
||||||
text = ", ".join(readable)
|
text = ", ".join(readable)
|
||||||
|
|
||||||
|
_min, _max, _count, _total = durations[key]
|
||||||
|
_avg = float(_total) / float(_count)
|
||||||
|
_fmin = dt.sec_to_str(_min)
|
||||||
|
_fmax = dt.sec_to_str(_max)
|
||||||
|
_favg = dt.sec_to_str(_avg * 100.0)
|
||||||
|
|
||||||
op_totals[operation] = op_totals.get(operation, 0) + count
|
op_totals[operation] = op_totals.get(operation, 0) + count
|
||||||
cell_totals[cell] = cell_totals.get(cell, 0) + count
|
cell_totals[cell] = cell_totals.get(cell, 0) + count
|
||||||
platform_totals[text] = platform_totals.get(text, 0) + count
|
platform_totals[text] = platform_totals.get(text, 0) + count
|
||||||
|
|
||||||
p.add_row([operation, cell, text, count])
|
p.add_row([operation, cell, text, count, _fmin, _fmax, _favg])
|
||||||
total += count
|
total += count
|
||||||
p.sortby = 'Count'
|
p.sortby = 'Count'
|
||||||
print p
|
print p
|
||||||
|
@ -21,6 +21,19 @@ def dt_from_decimal(dec):
|
|||||||
return daittyme.replace(microsecond=micro)
|
return daittyme.replace(microsecond=micro)
|
||||||
|
|
||||||
|
|
||||||
|
def sec_to_str(sec):
    """Format a duration in seconds as a short human-readable string.

    The value is truncated to whole seconds first. Durations under a
    minute render as "Ns", under an hour as "M:SSs", and anything longer
    as "HH:MM:SS".
    """
    sec = int(sec)
    if sec < 60:
        return "%ds" % sec
    # divmod keeps the arithmetic integral regardless of how "/" behaves.
    minutes, sec = divmod(sec, 60)
    if minutes < 60:
        return "%d:%02ds" % (minutes, sec)
    hours, minutes = divmod(minutes, 60)
    return "%02d:%02d:%02d" % (hours, minutes, sec)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
now = datetime.datetime.utcnow()
|
now = datetime.datetime.utcnow()
|
||||||
d = dt_to_decimal(now)
|
d = dt_to_decimal(now)
|
||||||
|
@ -9,6 +9,8 @@ OS_RHEL = 0x800
|
|||||||
|
|
||||||
|
|
||||||
def isset(num, flag):
|
def isset(num, flag):
|
||||||
|
if not num:
|
||||||
|
return False
|
||||||
return num & flag > 0
|
return num & flag > 0
|
||||||
|
|
||||||
|
|
||||||
@ -41,7 +43,7 @@ def get_numeric_code(payload, default=0):
|
|||||||
if image_type == 'snapshot':
|
if image_type == 'snapshot':
|
||||||
num |= SNAPSHOT_IMAGE
|
num |= SNAPSHOT_IMAGE
|
||||||
|
|
||||||
os_type = meta.get('os_type', '')
|
os_type = meta.get('os_type', payload.get('os_type', ''))
|
||||||
if os_type == 'linux':
|
if os_type == 'linux':
|
||||||
num |= LINUX_IMAGE
|
num |= LINUX_IMAGE
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user