#!/usr/bin/pypy
# -*- coding:utf-8 -*-
#
#
# This code is part of the LWN git data miner.
#
# Copyright 2007-12 Eklektix, Inc.
# Copyright 2007-12 Jonathan Corbet
# Copyright 2011 Germán Póo-Caamaño
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.

import database, csvdump, ConfigFile, reports
import getopt, datetime
import os, re, sys, rfc822
import logparser
from patterns import patterns

Today = datetime.date.today()

#
# Remember author names we have griped about.
#
GripedAuthorNames = [ ]

#
# Control options.
#
MapUnknown = 0
DevReports = 1
DateStats = 0
AuthorSOBs = 1
FileFilter = None
CSVFile = None
CSVPrefix = None
AkpmOverLt = 0
DumpDB = 0
CFName = 'gitdm.config'
DirName = ''
Aggregate = 'month'
Numstat = 0
ReportByFileType = 0
ReportUnknowns = False

#
# Options:
#
# -a		Andrew Morton's signoffs shadow Linus's
# -b dir	Specify the base directory to fetch the configuration files
# -c cfile	Specify a configuration file
# -d		Output individual developer stats
# -D		Output date statistics
# -h hfile	HTML output to hfile
# -l count	Maximum length for output lists
# -n		Use numstats instead of the generated patch from git log
# -o file	File for text output
# -p prefix	Prefix for CSV output
# -r pattern	Restrict to files matching pattern
# -s		Ignore author SOB lines
# -t		Report statistics by file type (requires -n)
# -u		Map unknown employers to '(Unknown)'
# -U		Dump unknown hackers in report
# -x file.csv	Export raw statistics as CSV
# -w		Aggregate the raw statistics by weeks instead of months
# -y		Aggregate the raw statistics by years instead of months
# -z		Dump out the hacker database at completion

def ParseOpts ():
    global MapUnknown, DevReports
    global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
    global CFName, CSVFile, CSVPrefix, DirName, Aggregate, Numstat
    global ReportByFileType, ReportUnknowns

    opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:no:p:r:stUuwx:yz')
    for opt in opts:
        if opt[0] == '-a':
            AkpmOverLt = 1
        elif opt[0] == '-b':
            DirName = opt[1]
        elif opt[0] == '-c':
            CFName = opt[1]
        elif opt[0] == '-d':
            DevReports = 0
        elif opt[0] == '-D':
            DateStats = 1
        elif opt[0] == '-h':
            reports.SetHTMLOutput (open (opt[1], 'w'))
        elif opt[0] == '-l':
            reports.SetMaxList (int (opt[1]))
        elif opt[0] == '-n':
            Numstat = 1
        elif opt[0] == '-o':
            reports.SetOutput (open (opt[1], 'w'))
        elif opt[0] == '-p':
            CSVPrefix = opt[1]
        elif opt[0] == '-r':
            print 'Filter on "%s"' % (opt[1])
            FileFilter = re.compile (opt[1])
        elif opt[0] == '-s':
            AuthorSOBs = 0
        elif opt[0] == '-t':
            ReportByFileType = 1
        elif opt[0] == '-u':
            MapUnknown = 1
        elif opt[0] == '-U':
            ReportUnknowns = True
        elif opt[0] == '-x':
            CSVFile = open (opt[1], 'w')
            print 'Opening CSV output file %s' % (opt[1])
        elif opt[0] == '-w':
            Aggregate = 'week'
        elif opt[0] == '-y':
            Aggregate = 'year'
        elif opt[0] == '-z':
            DumpDB = 1


def LookupStoreHacker (name, email):
    email = database.RemapEmail (email)
    h = database.LookupEmail (email)
    if h: # already there
        return h
    elist = database.LookupEmployer (email, MapUnknown)
    h = database.LookupName (name)
    if h: # new email
        h.addemail (email, elist)
        return h
    return database.StoreHacker (name, elist, email)
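#
# Illustrative sketch (the name and addresses here are made up): the
# first call creates a new hacker record; a later call with the same
# name but a different address attaches that address to the existing
# record, so both calls return the same object, assuming no conflicting
# email or employer mappings in the config files:
#
#    h1 = LookupStoreHacker ('Jane Dev', 'jane@example.com')
#    h2 = LookupStoreHacker ('Jane Dev', 'jane@example.org')
#    assert h1 is h2
#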
#
# Date tracking.
#
DateMap = { }

def AddDateLines (date, lines):
    if lines > 1000000:
        print 'Skip big patch (%d)' % lines
        return
    try:
        DateMap[date] += lines
    except KeyError:
        DateMap[date] = lines

def PrintDateStats ():
    dates = DateMap.keys ()
    dates.sort ()
    total = 0
    datef = open ('datelc.csv', 'w')
    datef.write ('Date,Changed,Total Changed\n')
    for date in dates:
        total += DateMap[date]
        datef.write ('%d/%02d/%02d,%d,%d\n' % (date.year, date.month, date.day,
                                               DateMap[date], total))
    datef.close ()


#
# Let's slowly try to move some smarts into this class.
#
class patch:
    (ADDED, REMOVED) = range (2)

    def __init__ (self, commit):
        self.commit = commit
        self.merge = self.added = self.removed = 0
        self.author = LookupStoreHacker ('Unknown hacker', 'unknown@hacker.net')
        self.email = 'unknown@hacker.net'
        self.date = Today  # default; overwritten when the Date: line is parsed
        self.sobs = [ ]
        self.reviews = [ ]
        self.testers = [ ]
        self.reports = [ ]
        self.filetypes = { }

    def addreviewer (self, reviewer):
        self.reviews.append (reviewer)

    def addtester (self, tester):
        self.testers.append (tester)

    def addreporter (self, reporter):
        self.reports.append (reporter)

    def addfiletype (self, filetype, added, removed):
        if filetype in self.filetypes:
            self.filetypes[filetype][self.ADDED] += added
            self.filetypes[filetype][self.REMOVED] += removed
        else:
            self.filetypes[filetype] = [added, removed]

def parse_numstat (line, file_filter):
    """
    Receive a line of text, determine whether it fits the numstat
    format, and parse out the added/removed line counts along with
    the file type.
    """
    m = patterns['numstat'].match (line)
    if m:
        filename = m.group (3)
        # If we have a file filter, check for file lines.
        if file_filter and not file_filter.search (filename):
            return None, None, None, None
        try:
            added = int (m.group (1))
            removed = int (m.group (2))
        except ValueError:
            # A binary file (image, etc.) is marked with '-'
            added = removed = 0
        m = patterns['rename'].match (filename)
        if m:
            filename = '%s%s%s' % (m.group (1), m.group (3), m.group (4))
        filetype = database.FileTypes.guess_file_type (os.path.basename (filename))
        return filename, filetype, added, removed
    else:
        return None, None, None, None
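#
# Illustrative example (the file name is made up): given a numstat
# line of the form
#
#    12	3	fs/foo/bar.c
#
# parse_numstat() returns ('fs/foo/bar.c', <guessed file type>, 12, 3).
# A binary file carries '-' counts and comes back with
# added = removed = 0.
#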
#
# The core hack for grabbing the information about a changeset.
#
def grabpatch (logpatch):
    m = patterns['commit'].match (logpatch[0])
    if not m:
        return None

    p = patch (m.group (1))
    ignore = (FileFilter is not None)
    for Line in logpatch[1:]:
        #
        # Maybe it's an author line?
        #
        m = patterns['author'].match (Line)
        if m:
            p.email = database.RemapEmail (m.group (2))
            p.author = LookupStoreHacker (m.group (1), p.email)
            continue
        #
        # Could be a signed-off-by:
        #
        m = patterns['signed-off-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            sobber = LookupStoreHacker (m.group (1), email)
            if sobber != p.author or AuthorSOBs:
                p.sobs.append ((email, sobber))
            continue
        #
        # Various other tags of interest.
        #
        m = patterns['reviewed-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreviewer (LookupStoreHacker (m.group (1), email))
            continue
        m = patterns['tested-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addtester (LookupStoreHacker (m.group (1), email))
            p.author.testcredit (p)
            continue
        # Reported-by:
        m = patterns['reported-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreporter (LookupStoreHacker (m.group (1), email))
            p.author.reportcredit (p)
            continue
        # Reported-and-tested-by:
        m = patterns['reported-and-tested-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            h = LookupStoreHacker (m.group (1), email)
            p.addreporter (h)
            p.addtester (h)
            p.author.reportcredit (p)
            p.author.testcredit (p)
            continue
        #
        # If this one is a merge, make note of the fact.
        #
        m = patterns['merge'].match (Line)
        if m:
            p.merge = 1
            continue
        #
        # See if it's the date.
        #
        m = patterns['date'].match (Line)
        if m:
            dt = rfc822.parsedate (m.group (2))
            p.date = datetime.date (dt[0], dt[1], dt[2])
            if p.date > Today:
                sys.stderr.write ('Funky date: %s\n' % p.date)
                p.date = Today
            continue
        if not Numstat:
            #
            # If we have a file filter, check for file lines.
            #
            if FileFilter:
                ignore = ApplyFileFilter (Line, ignore)
            #
            # OK, maybe it's part of the diff itself.
            #
            if not ignore:
                if patterns['add'].match (Line):
                    p.added += 1
                    continue
                if patterns['rem'].match (Line):
                    p.removed += 1
        else:
            #
            # Get the statistics (lines added/removed) from numstat
            # output, without requiring a diff (--numstat instead of -p).
            #
            (filename, filetype, added, removed) = parse_numstat (Line, FileFilter)
            if filename:
                p.added += added
                p.removed += removed
                p.addfiletype (filetype, added, removed)

    if '@' in p.author.name:
        GripeAboutAuthorName (p.author.name)
    return p

def GripeAboutAuthorName (name):
    if name in GripedAuthorNames:
        return
    GripedAuthorNames.append (name)
    print '%s is an author name, probably not what you want' % (name)

def ApplyFileFilter (line, ignore):
    #
    # If this is the first file line (--- a/), set ignore one way
    # or the other.
    #
    m = patterns['filea'].match (line)
    if m:
        file = m.group (1)
        if FileFilter.search (file):
            return 0
        return 1
    #
    # For the second line, we can turn ignore off, but not on.
    #
    m = patterns['fileb'].match (line)
    if m:
        file = m.group (1)
        if FileFilter.search (file):
            return 0
    return ignore

def is_svntag (logpatch):
    """
    This is a workaround for a bug in the migration from Subversion
    to Git observed in GNOME.  It may happen in other repositories
    as well.
    """
    for Line in logpatch:
        m = patterns['svn-tag'].match (Line.strip ())
        if m:
            sys.stderr.write ('(W) detected a commit on a svn tag: %s\n' %
                              (m.group (0),))
            return True
    return False

#
# If this patch is signed off by both Andrew Morton and Linus Torvalds,
# remove the (redundant) Linus signoff.
#
def TrimLTSOBs (p):
    if AkpmOverLt == 1 and Linus in p.sobs and Akpm in p.sobs:
        p.sobs.remove (Linus)


#
# Here starts the real program.
#
ParseOpts ()

#
# Read the config files.
#
ConfigFile.ConfigFile (CFName, DirName)

#
# Let's pre-seed the database with a couple of hackers
# we want to remember.
#
if AkpmOverLt == 1:
    Linus = ('torvalds@linux-foundation.org',
             LookupStoreHacker ('Linus Torvalds', 'torvalds@linux-foundation.org'))
    Akpm = ('akpm@linux-foundation.org',
            LookupStoreHacker ('Andrew Morton', 'akpm@linux-foundation.org'))

TotalChanged = TotalAdded = TotalRemoved = 0
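#
# The changeset stream is read from stdin.  A typical pipeline (a
# sketch; the option choices are illustrative) looks like:
#
#    git log -p -M | gitdm -u -s -o results -h results.html
#
# or, with -n, the cheaper numstat form:
#
#    git log --numstat -M | gitdm -n -o results
#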
#
# Snarf changesets.
#
print >> sys.stderr, 'Grabbing changesets...\r',

patches = logparser.LogPatchSplitter (sys.stdin)
printcount = CSCount = 0

for logpatch in patches:
    if (printcount % 50) == 0:
        print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount,
    printcount += 1

    # We want to ignore commits on svn tags, since in Subversion those
    # mean a copy of the whole repository, which leads to wrong results.
    # Some migrations from Subversion to Git do not catch all of these
    # tag copies and import them as one big changeset.
    if is_svntag (logpatch):
        continue

    p = grabpatch (logpatch)
    if not p:
        break
    #if p.added > 100000 or p.removed > 100000:
    #    print 'Skipping massive add', p.commit
    #    continue
    if FileFilter and p.added == 0 and p.removed == 0:
        continue

    #
    # Skip over any OpenStack Jenkins automated commits.
    #
    if p.email == 'jenkins@openstack.org':
        continue
    if p.email == 'jenkins@review.openstack.org':
        continue
    if p.email == 'openstack-infra@lists.openstack.org':
        continue

    #
    # Record some global information - but only if this patch had
    # stuff which wasn't ignored.
    #
    if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge:
        TotalAdded += p.added
        TotalRemoved += p.removed
        TotalChanged += max (p.added, p.removed)
        AddDateLines (p.date, max (p.added, p.removed))
        empl = p.author.emailemployer (p.email, p.date)
        empl.AddCSet (p)
        if AkpmOverLt:
            TrimLTSOBs (p)
        for sobemail, sobber in p.sobs:
            empl = sobber.emailemployer (sobemail, p.date)
            empl.AddSOB ()

    if not p.merge:
        p.author.addpatch (p)
        for sobemail, sob in p.sobs:
            sob.addsob (p)
        for hacker in p.reviews:
            hacker.addreview (p)
        for hacker in p.testers:
            hacker.addtested (p)
        for hacker in p.reports:
            hacker.addreport (p)
        CSCount += 1
    csvdump.AccumulatePatch (p, Aggregate)
    csvdump.store_patch (p)
print >> sys.stderr, 'Grabbing changesets...done       '

if DumpDB:
    database.DumpDB ()
database.MixVirtuals ()

#
# Say something.
#
hlist = database.AllHackers ()
elist = database.AllEmployers ()
ndev = nempl = 0
for h in hlist:
    if len (h.patches) > 0:
        ndev += 1
for e in elist:
    if e.count > 0:
        nempl += 1
reports.Write ('Processed %d csets from %d developers\n' % (CSCount, ndev))
reports.Write ('%d employers found\n' % (nempl))
reports.Write ('A total of %d lines added, %d removed (delta %d)\n' %
               (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))
if TotalChanged == 0:
    TotalChanged = 1 # HACK to avoid div by zero
if DateStats:
    PrintDateStats ()

if CSVPrefix:
    csvdump.save_csv (CSVPrefix)

if CSVFile:
    csvdump.OutputCSV (CSVFile)
    CSVFile.close ()

if DevReports:
    reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved)
if ReportUnknowns:
    reports.ReportUnknowns (hlist, CSCount)
reports.EmplReports (elist, TotalChanged, CSCount)
if ReportByFileType and Numstat:
    reports.ReportByFileType (hlist)
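#
# For reference, the textual summary written above takes this shape
# (the numbers here are invented for illustration):
#
#    Processed 1342 csets from 221 developers
#    97 employers found
#    A total of 50210 lines added, 17300 removed (delta 32910)
#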