515 lines
15 KiB
Python
Executable File
515 lines
15 KiB
Python
Executable File
#!/usr/bin/pypy
|
|
#-*- coding:utf-8 -*-
|
|
#
|
|
|
|
#
|
|
# This code is part of the LWN git data miner.
|
|
#
|
|
# Copyright 2007-12 Eklektix, Inc.
|
|
# Copyright 2007-12 Jonathan Corbet <corbet@lwn.net>
|
|
# Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
|
|
#
|
|
# This file may be distributed under the terms of the GNU General
|
|
# Public License, version 2.
|
|
|
|
|
|
import database, csvdump, ConfigFile, reports
|
|
import getopt, datetime
|
|
import os, re, sys, rfc822, string
|
|
import logparser
|
|
from patterns import patterns
|
|
|
|
# Date of this run; grabpatch() clamps commit dates that lie after it.
Today = datetime.date.today()

#
# Author names GripeAboutAuthorName() has already complained about,
# so that each one is reported only once.
#
GripedAuthorNames = []

#
# Control options.  These are the defaults; ParseOpts() overrides them
# from the command line.
#
MapUnknown = 0              # -u: handed to database.LookupEmployer()
DevReports = 1              # -d clears it: gates reports.DevReports()
DateStats = 0               # -D: gates PrintDateStats()
AuthorSOBs = 1              # -s clears it: record the author's own signoffs
FileFilter = None           # -r: compiled regex restricting the files counted
CSVFile = None              # -x: open file handed to csvdump.OutputCSV()
CSVPrefix = None            # -p: prefix handed to csvdump.save_csv()
AkpmOverLt = 0              # -a: enables TrimLTSOBs()
DumpDB = 0                  # -z: gates database.DumpDB()
CFName = 'gitdm.config'     # -c: configuration file name
DirName = ''                # -b: base directory for the configuration files
Aggregate = 'month'         # -w / -y switch this to 'week' / 'year'
Numstat = 0                 # -n: parse numstat lines instead of a diff
ReportByFileType = 0        # -t: gates reports.ReportByFileType()
ReportUnknowns = False      # -U: gates reports.ReportUnknowns()
|
|
|
|
#
|
|
# Options:
|
|
#
|
|
# -a Andrew Morton's signoffs shadow Linus's
|
|
# -b dir Specify the base directory to fetch the configuration files
|
|
# -c cfile Specify a configuration file
|
|
# -d Omit the individual developer reports
|
|
# -D Output date statistics
|
|
# -h hfile HTML output to hfile
|
|
# -l count Maximum length for output lists
|
|
# -n Use numstats instead of generated patch from git log
|
|
# -o file File for text output
|
|
# -p prefix Prefix for CSV output
|
|
# -r pattern Restrict to files matching pattern
|
|
# -s Ignore author SOB lines
# -t Report statistics by file type (only takes effect together with -n)
|
|
# -u Map unknown employers to '(Unknown)'
|
|
# -U Dump unknown hackers in report
|
|
# -x file.csv Export raw statistics as CSV
|
|
# -w Aggregate the raw statistics by weeks instead of months
|
|
# -y Aggregate the raw statistics by years instead of months
|
|
# -z Dump out the hacker database at completion
|
|
|
|
def ParseOpts ():
    """
    Read the command-line flags from sys.argv and store the result in
    the module-level control options.

    Flags that name an output file (-h, -o, -x) open that file for
    writing straight away; -h, -l and -o hand their value on to the
    reports module.  Non-option arguments are ignored.  Any
    getopt.GetoptError raised for a bad flag is not caught here.
    """
    global MapUnknown, DevReports, DateStats, AuthorSOBs, FileFilter
    global AkpmOverLt, DumpDB, CFName, CSVFile, CSVPrefix, DirName
    global Aggregate, Numstat, ReportByFileType, ReportUnknowns

    parsed, _leftover = getopt.getopt (sys.argv[1:],
                                       'ab:dc:Dh:l:no:p:r:stUuwx:yz')
    for flag, value in parsed:
        if flag == '-a':
            AkpmOverLt = 1
        elif flag == '-b':
            DirName = value
        elif flag == '-c':
            CFName = value
        elif flag == '-d':
            # Note: -d switches the developer reports *off*.
            DevReports = 0
        elif flag == '-D':
            DateStats = 1
        elif flag == '-h':
            reports.SetHTMLOutput (open (value, 'w'))
        elif flag == '-l':
            reports.SetMaxList (int (value))
        elif flag == '-n':
            Numstat = 1
        elif flag == '-o':
            reports.SetOutput (open (value, 'w'))
        elif flag == '-p':
            CSVPrefix = value
        elif flag == '-r':
            print ('Filter on "%s"' % (value))
            FileFilter = re.compile (value)
        elif flag == '-s':
            AuthorSOBs = 0
        elif flag == '-t':
            ReportByFileType = 1
        elif flag == '-u':
            MapUnknown = 1
        elif flag == '-U':
            ReportUnknowns = True
        elif flag == '-x':
            CSVFile = open (value, 'w')
            print ("open output file " + value + "\n")
        elif flag == '-w':
            Aggregate = 'week'
        elif flag == '-y':
            Aggregate = 'year'
        elif flag == '-z':
            DumpDB = 1
|
|
|
|
|
|
|
|
def LookupStoreHacker (name, email):
    """
    Return the hacker record for (name, email), creating it if needed.

    The address is first passed through database.RemapEmail().  A record
    already known under that address wins; failing that, a record known
    under the same name gains the new address; otherwise a new record is
    stored via database.StoreHacker().
    """
    email = database.RemapEmail (email)
    hacker = database.LookupEmail (email)
    if hacker:
        # This address has been seen before.
        return hacker

    employers = database.LookupEmployer (email, MapUnknown)
    hacker = database.LookupName (name)
    if not hacker:
        return database.StoreHacker (name, employers, email)

    # Known person, new address.
    hacker.addemail (email, employers)
    return hacker
|
|
|
|
#
|
|
# Date tracking.
|
|
#
|
|
|
|
# Maps a date to the number of changed lines recorded for it.
DateMap = {}

def AddDateLines(date, lines):
    """
    Add `lines` to the running count kept for `date` in DateMap.

    Patches of more than 1000000 lines are reported on stdout and
    left out of the count.
    """
    if lines > 1000000:
        print ('Skip big patch (%d)' % lines)
        return
    DateMap[date] = DateMap.get (date, 0) + lines
|
|
|
|
def PrintDateStats():
    """
    Write the per-date line counts held in DateMap to 'datelc.csv' in
    the current directory.

    The file gets a header row followed by one row per date in ascending
    order: the date as YYYY/MM/DD, the lines changed on that date, and
    the running total up to and including that date.
    """
    total = 0
    # The with block guarantees the file is flushed and closed, even if
    # formatting one of the rows raises.
    with open ('datelc.csv', 'w') as datef:
        datef.write ('Date,Changed,Total Changed\n')
        for date in sorted (DateMap):
            total += DateMap[date]
            datef.write ('%d/%02d/%02d,%d,%d\n' % (date.year, date.month,
                                                   date.day, DateMap[date],
                                                   total))
|
|
|
|
|
|
#
|
|
# Let's slowly try to move some smarts into this class.
|
|
#
|
|
class patch:
    """
    Everything gathered about one changeset.

    Attributes set by the constructor:
      commit    -- the commit identifier handed to the constructor
      merge     -- 0 until the changeset is marked as a merge
      added     -- count of added lines
      removed   -- count of removed lines
      author    -- hacker record; a placeholder until an author is set
      email     -- author address; a placeholder until an author is set
      sobs      -- signoffs collected for the changeset
      reviews   -- hackers credited as reviewers
      testers   -- hackers credited as testers
      reports   -- hackers credited as reporters
      filetypes -- maps a file type to its [added, removed] line counts
    """
    # Indexes into the [added, removed] lists stored in self.filetypes.
    (ADDED, REMOVED) = range (2)

    def __init__ (self, commit):
        self.commit = commit
        self.merge = self.added = self.removed = 0
        self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
        self.email = 'unknown@hacker.net'
        self.sobs = [ ]
        self.reviews = [ ]
        self.testers = [ ]
        self.reports = [ ]
        self.filetypes = {}

    def addreviewer (self, reviewer):
        """Record `reviewer` as a reviewer of this changeset."""
        self.reviews.append (reviewer)

    def addtester (self, tester):
        """Record `tester` as a tester of this changeset."""
        self.testers.append (tester)

    def addreporter (self, reporter):
        """Record `reporter` as a reporter for this changeset."""
        self.reports.append (reporter)

    def addfiletype (self, filetype, added, removed):
        """Add `added` / `removed` lines to the totals kept for `filetype`."""
        # "in" replaces the deprecated dict.has_key().
        if filetype in self.filetypes:
            counts = self.filetypes[filetype]
            counts[self.ADDED] += added
            counts[self.REMOVED] += removed
        else:
            self.filetypes[filetype] = [added, removed]
|
|
|
|
def parse_numstat(line, file_filter):
    """
    Try to read `line` as a numstat line.

    Returns (filename, filetype, added, removed).  All four are None
    when the line is not a numstat line, or when `file_filter` is set
    and does not match the file name found on the line.
    """
    nothing = (None, None, None, None)

    stat = patterns['numstat'].match (line)
    if not stat:
        return nothing

    filename = stat.group (3)
    # If we have a file filter, drop the files it does not select.
    if file_filter and not file_filter.search (filename):
        return nothing

    try:
        added = int (stat.group (1))
        removed = int (stat.group (2))
    except ValueError:
        # A binary file (image, etc.) is marked with '-'
        added = removed = 0

    # A rename line is reduced to a single path before the type is guessed.
    renamed = patterns['rename'].match (filename)
    if renamed:
        filename = '%s%s%s' % renamed.group (1, 3, 4)

    basename = os.path.basename (filename)
    filetype = database.FileTypes.guess_file_type (basename)
    return filename, filetype, added, removed
|
|
|
|
#
|
|
# The core hack for grabbing the information about a changeset.
|
|
#
|
|
def grabpatch(logpatch):
    """
    Build a patch object from the lines of one changeset.

    logpatch is a sequence of lines whose first entry must match the
    'commit' pattern; if it does not, None is returned.  Each remaining
    line is tried against the header and tag patterns in turn (author,
    signed-off-by, reviewed-by, tested-by, reported-by,
    reported-and-tested-by, merge, date).  A line matching none of them
    is counted either as a diff line or, when Numstat is set, as a
    numstat line.

    Commit dates later than Today are reported on stderr and clamped to
    Today.
    """
    m = patterns['commit'].match (logpatch[0])
    if not m:
        return None

    p = patch(m.group (1))
    # With a file filter, ignore diff lines until a matching file shows up.
    ignore = (FileFilter is not None)
    for Line in logpatch[1:]:
        #
        # Maybe it's an author line?
        #
        m = patterns['author'].match (Line)
        if m:
            p.email = database.RemapEmail (m.group (2))
            p.author = LookupStoreHacker(m.group (1), p.email)
            continue
        #
        # Could be a signed-off-by:
        #
        m = patterns['signed-off-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            sobber = LookupStoreHacker(m.group (1), email)
            # The author's own signoff only counts when AuthorSOBs is set.
            if sobber != p.author or AuthorSOBs:
                p.sobs.append ((email, LookupStoreHacker(m.group (1), m.group (2))))
            continue
        #
        # Various other tags of interest.
        #
        m = patterns['reviewed-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreviewer (LookupStoreHacker(m.group (1), email))
            continue
        m = patterns['tested-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addtester (LookupStoreHacker (m.group (1), email))
            # Credit goes for this changeset (p), not for the patch class.
            p.author.testcredit (p)
            continue
        # Reported-by:
        m = patterns['reported-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreporter (LookupStoreHacker (m.group (1), email))
            p.author.reportcredit (p)
            continue
        # Reported-and-tested-by:
        m = patterns['reported-and-tested-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            h = LookupStoreHacker (m.group (1), email)
            p.addreporter (h)
            p.addtester (h)
            p.author.reportcredit (p)
            p.author.testcredit (p)
            continue
        #
        # If this one is a merge, make note of the fact.
        #
        m = patterns['merge'].match (Line)
        if m:
            p.merge = 1
            continue
        #
        # See if it's the date.
        #
        m = patterns['date'].match (Line)
        if m:
            dt = rfc822.parsedate(m.group (2))
            p.date = datetime.date (dt[0], dt[1], dt[2])
            if p.date > Today:
                sys.stderr.write ('Funky date: %s\n' % p.date)
                p.date = Today
            continue
        if not Numstat:
            #
            # If we have a file filter, check for file lines.
            #
            if FileFilter:
                ignore = ApplyFileFilter (Line, ignore)
            #
            # OK, maybe it's part of the diff itself.
            #
            if not ignore:
                if patterns['add'].match (Line):
                    p.added += 1
                    continue
                if patterns['rem'].match (Line):
                    p.removed += 1
        else:
            # Get the statistics (lines added/removed) using numstat
            # output, without requiring a diff (--numstat instead of -p).
            (filename, filetype, added, removed) = parse_numstat (Line, FileFilter)
            if filename:
                p.added += added
                p.removed += removed
                p.addfiletype (filetype, added, removed)

    # An '@' in the author name suggests an address was used as the name.
    if '@' in p.author.name:
        GripeAboutAuthorName (p.author.name)

    return p
|
|
|
|
def GripeAboutAuthorName (name):
    """
    Complain on stdout, once per distinct name, that `name` is being
    used as an author name.
    """
    if name not in GripedAuthorNames:
        GripedAuthorNames.append (name)
        print ('%s is an author name, probably not what you want' % (name))
|
|
|
|
def ApplyFileFilter (line, ignore):
    """
    Work out whether diff lines should currently be ignored.

    Returns the new ignore state (0 or 1) when `line` is a file header
    line, and the unchanged `ignore` value for any other line.
    """
    #
    # The first file line (--- a/) sets ignore one way or the other.
    #
    old_side = patterns['filea'].match (line)
    if old_side:
        return 0 if FileFilter.search (old_side.group (1)) else 1
    #
    # The second line can turn ignore off, but not on.
    #
    new_side = patterns['fileb'].match (line)
    if new_side and FileFilter.search (new_side.group (1)):
        return 0
    return ignore
|
|
|
|
def is_svntag(logpatch):
    """
    Tell whether any line of the changeset matches the 'svn-tag'
    pattern, warning on stderr about the first line that does.

    This works around a bug in the Subversion-to-Git migration found
    in GNOME; it may happen in other repositories as well.
    """
    for text in logpatch:
        hit = patterns['svn-tag'].match (text.strip ())
        if not hit:
            continue
        sys.stderr.write ('(W) detected a commit on a svn tag: %s\n' %
                          (hit.group (0),))
        return True

    return False
|
|
|
|
#
|
|
# If this patch is signed off by both Andrew Morton and Linus Torvalds,
|
|
# remove the (redundant) Linus signoff.
|
|
#
|
|
def TrimLTSOBs (p):
    """
    Remove the Linus entry from p.sobs when the Akpm entry is there too.
    Does nothing unless AkpmOverLt is 1.
    """
    # Linus and Akpm only exist when the option is on, so test it first.
    if AkpmOverLt != 1:
        return
    if Linus in p.sobs and Akpm in p.sobs:
        p.sobs.remove (Linus)
|
|
|
|
|
|
#
|
|
# Here starts the real program.
|
|
#
|
|
ParseOpts ()

#
# Read the config files.
#
ConfigFile.ConfigFile (CFName, DirName)

#
# Pre-seed the database with the two hackers TrimLTSOBs() compares
# against.  Each is kept as an (email, hacker) pair, the same shape as
# the entries of a patch's sobs list.
#
if AkpmOverLt == 1:
    Linus = ('torvalds@linux-foundation.org',
             LookupStoreHacker ('Linus Torvalds',
                                'torvalds@linux-foundation.org'))
    Akpm = ('akpm@linux-foundation.org',
            LookupStoreHacker ('Andrew Morton',
                               'akpm@linux-foundation.org'))

# Running totals over all counted changesets.
TotalChanged = 0
TotalAdded = 0
TotalRemoved = 0
|
|
|
|
#
|
|
# Snarf changesets.
|
|
#
|
|
print >> sys.stderr, 'Grabbing changesets...\r',

patches = logparser.LogPatchSplitter(sys.stdin)
printcount = CSCount = 0

for logpatch in patches:
    # Progress indicator, refreshed every 50 changesets.
    if (printcount % 50) == 0:
        print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount,
    printcount += 1

    # Ignore commits on svn tags: in Subversion a tag is a copy of the
    # whole repository, which leads to wrong results.  Some migrations
    # from Subversion to Git do not catch all of these tags/copies and
    # import them as one new, big changeset.
    if is_svntag(logpatch):
        continue

    p = grabpatch(logpatch)
    if not p:
        break

    # With a file filter active, drop changesets that touched nothing
    # the filter selects.
    if FileFilter and p.added == 0 and p.removed == 0:
        continue

    #
    # skip over any OpenStack Jenkins automated commits
    #
    if p.email in ("jenkins@openstack.org",
                   "jenkins@review.openstack.org",
                   "openstack-infra@lists.openstack.org"):
        continue

    #
    # Record some global information - but only if this patch had
    # stuff which wasn't ignored.
    #
    counted = (p.added + p.removed) > 0 or not FileFilter
    if counted and not p.merge:
        changed = max (p.added, p.removed)
        TotalAdded += p.added
        TotalRemoved += p.removed
        TotalChanged += changed
        AddDateLines (p.date, changed)
        p.author.emailemployer (p.email, p.date).AddCSet (p)
        if AkpmOverLt:
            TrimLTSOBs (p)
        for sobemail, sobber in p.sobs:
            sobber.emailemployer (sobemail, p.date).AddSOB ()

    # Merges earn no per-hacker credit and are not counted as csets.
    if not p.merge:
        p.author.addpatch (p)
        for sobemail, sob in p.sobs:
            sob.addsob (p)
        for hacker in p.reviews:
            hacker.addreview (p)
        for hacker in p.testers:
            hacker.addtested (p)
        for hacker in p.reports:
            hacker.addreport (p)
        CSCount += 1

    csvdump.AccumulatePatch (p, Aggregate)
    csvdump.store_patch (p)

print >> sys.stderr, 'Grabbing changesets...done '
|
|
|
|
if DumpDB:
    database.DumpDB ()
database.MixVirtuals ()

#
# Say something
#
hlist = database.AllHackers ()
elist = database.AllEmployers ()

# Only developers with patches and employers with a nonzero count
# are included in the summary line.
ndev = sum (1 for hacker in hlist if len (hacker.patches) > 0)
nempl = sum (1 for employer in elist if employer.count > 0)

reports.Write ('Processed %d csets from %d developers\n' % (CSCount, ndev))
reports.Write ('%d employers found\n' % (nempl))
reports.Write ('A total of %d lines added, %d removed (delta %d)\n' %
               (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))

if TotalChanged == 0:
    TotalChanged = 1 # HACK to avoid div by zero

if DateStats:
    PrintDateStats ()

if CSVPrefix:
    csvdump.save_csv (CSVPrefix)

if CSVFile:
    csvdump.OutputCSV (CSVFile)
    CSVFile.close ()

if DevReports:
    reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved)
if ReportUnknowns:
    reports.ReportUnknowns(hlist, CSCount)
reports.EmplReports (elist, TotalChanged, CSCount)

# The file type report relies on data only collected in numstat mode.
if ReportByFileType and Numstat:
    reports.ReportByFileType (hlist)
|