gitdm/gitdm

#!/usr/bin/pypy
#-*- coding:utf-8 -*-
#

#
# This code is part of the LWN git data miner.
#
# Copyright 2007-12 Eklektix, Inc.
# Copyright 2007-12 Jonathan Corbet <corbet@lwn.net>
# Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.


import database, csvdump, ConfigFile, reports
import getopt, datetime
import os, re, sys, rfc822, string
import logparser
from patterns import patterns

# Date of this run; grabpatch() clamps any commit date later than this.
Today = datetime.date.today()

#
# Remember author names we have griped about, so that
# GripeAboutAuthorName() complains about each name only once.
#
GripedAuthorNames = [ ]

#
# Control options.  These are the defaults; ParseOpts() overrides
# them from the command line.
#
MapUnknown = 0          # -u: passed to database.LookupEmployer()
DevReports = 1          # cleared by -d; gates reports.DevReports()
DateStats = 0           # -D: write datelc.csv via PrintDateStats()
AuthorSOBs = 1          # cleared by -s; when 0 the author's own signoff is not recorded
FileFilter = None       # -r: compiled regex restricting which files are counted
CSVFile = None          # -x: open file handed to csvdump.OutputCSV()
CSVPrefix = None        # -p: prefix handed to csvdump.save_csv()
AkpmOverLt = 0          # -a: drop Linus's signoff when Andrew Morton also signed off
DumpDB = 0              # -z: call database.DumpDB() after processing
CFName = 'gitdm.config' # -c: configuration file name
DirName = ''            # -b: base directory for the configuration files
Aggregate = 'month'     # -w / -y switch this to 'week' / 'year'
Numstat = 0             # -n: input is --numstat output rather than a full patch
ReportByFileType = 0    # -t: file-type report (only produced together with -n)
ReportUnknowns = False  # -U: call reports.ReportUnknowns()

#
# Options:
#
# -a		Andrew Morton's signoffs shadow Linus's
# -b dir	Specify the base directory to fetch the configuration files
# -c cfile	Specify a configuration file
# -d		Suppress individual developer stats (output by default)
# -D		Output date statistics
# -h hfile	HTML output to hfile
# -l count	Maximum length for output lists
# -n        Use numstats instead of generated patch from git log
# -o file	File for text output
# -p prefix Prefix for CSV output
# -r pattern	Restrict to files matching pattern
# -s		Ignore author SOB lines
# -t		Report statistics by file type (only effective with -n)
# -u		Map unknown employers to '(Unknown)'
# -U 		Dump unknown hackers in report
# -x file.csv   Export raw statistics as CSV
# -w        Aggregate the raw statistics by weeks instead of months
# -y            Aggregate the raw statistics by years instead of months
# -z		Dump out the hacker database at completion

def ParseOpts ():
    """
        Parse sys.argv and record the result in the module-level
        control options.  The output-related options (-h, -l, -o) are
        handed straight to the reports module.  Positional arguments
        are accepted but ignored; getopt errors propagate to the caller.
    """
    global MapUnknown, DevReports
    global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
    global CFName, CSVFile, CSVPrefix, DirName, Aggregate, Numstat
    global ReportByFileType, ReportUnknowns

    parsed, _positional = getopt.getopt (sys.argv[1:],
                                         'ab:dc:Dh:l:no:p:r:stUuwx:yz')
    for flag, value in parsed:
        #
        # Plain switches.
        #
        if flag == '-a':
            AkpmOverLt = 1
        elif flag == '-d':
            DevReports = 0
        elif flag == '-D':
            DateStats = 1
        elif flag == '-n':
            Numstat = 1
        elif flag == '-s':
            AuthorSOBs = 0
        elif flag == '-t':
            ReportByFileType = 1
        elif flag == '-u':
            MapUnknown = 1
        elif flag == '-U':
            ReportUnknowns = True
        elif flag == '-w':
            Aggregate = 'week'
        elif flag == '-y':
            Aggregate = 'year'
        elif flag == '-z':
            DumpDB = 1
        #
        # Options which carry a value.
        #
        elif flag == '-b':
            DirName = value
        elif flag == '-c':
            CFName = value
        elif flag == '-h':
            reports.SetHTMLOutput (open (value, 'w'))
        elif flag == '-l':
            reports.SetMaxList (int (value))
        elif flag == '-o':
            reports.SetOutput (open (value, 'w'))
        elif flag == '-p':
            CSVPrefix = value
        elif flag == '-r':
            # Announce the filter before compiling it.
            print ('Filter on "%s"' % (value))
            FileFilter = re.compile (value)
        elif flag == '-x':
            CSVFile = open (value, 'w')
            print ("open output file " + value + "\n")


def LookupStoreHacker (name, email):
    """
        Return the hacker record for (name, email), creating it if
        needed.  The email is remapped first; a hacker already known
        under that email wins, then one known under that name (who
        gains the new email), and only then is a new record stored.
    """
    email = database.RemapEmail (email)
    known = database.LookupEmail (email)
    if known:
        # This address has been seen before.
        return known
    employers = database.LookupEmployer (email, MapUnknown)
    namesake = database.LookupName (name)
    if not namesake:
        return database.StoreHacker(name, employers, email)
    # Known developer, new address.
    namesake.addemail (email, employers)
    return namesake

#
# Date tracking.
#

# Maps a commit date to the total lines changed on that date; filled
# in by AddDateLines() and written out by PrintDateStats().
DateMap = { }

def AddDateLines(date, lines):
    """
        Add `lines` to the running total kept in DateMap for `date`.
        Patches of more than 1000000 lines are reported and left out.
    """
    if lines > 1000000:
        print ('Skip big patch (%d)' % lines)
        return
    DateMap[date] = DateMap.get (date, 0) + lines

def PrintDateStats():
    """
        Write the per-date change counts collected in DateMap to
        'datelc.csv' in the current directory, one row per date in
        ascending order, together with a running total.
    """
    total = 0
    datef = open ('datelc.csv', 'w')
    try:
        datef.write ('Date,Changed,Total Changed\n')
        for date in sorted (DateMap):
            total += DateMap[date]
            datef.write ('%d/%02d/%02d,%d,%d\n' % (date.year, date.month,
                                                   date.day, DateMap[date],
                                                   total))
    finally:
        # Close explicitly so the data is flushed even on interpreters
        # (such as pypy) which do not finalize objects promptly.
        datef.close ()


#
# Let's slowly try to move some smarts into this class.
#
class patch:
    """
        Everything recorded about a single changeset: line counts,
        author, tag credits and per-file-type statistics.  The `date`
        attribute is not set here; grabpatch() adds it when it sees a
        date line.
    """
    # Indices into the [added, removed] lists stored in self.filetypes.
    (ADDED, REMOVED) = range (2)

    def __init__ (self, commit):
        self.commit = commit
        self.merge = self.added = self.removed = 0
        # Placeholder author, replaced once an author line is parsed.
        self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
        self.email = 'unknown@hacker.net'
        self.sobs = [ ]         # (email, hacker) tuples from signoff lines
        self.reviews = [ ]      # hackers credited with a review
        self.testers = [ ]      # hackers credited with testing
        self.reports = [ ]      # hackers credited with a report
        self.filetypes = {}     # file type -> [added, removed]

    def addreviewer (self, reviewer):
        """Record `reviewer` as having reviewed this patch."""
        self.reviews.append (reviewer)

    def addtester (self, tester):
        """Record `tester` as having tested this patch."""
        self.testers.append (tester)

    def addreporter (self, reporter):
        """Record `reporter` as having reported the problem fixed here."""
        self.reports.append (reporter)

    def addfiletype (self, filetype, added, removed):
        """Accumulate added/removed line counts under `filetype`."""
        # 'in' replaces the deprecated dict.has_key().
        if filetype in self.filetypes:
            self.filetypes[filetype][self.ADDED] += added
            self.filetypes[filetype][self.REMOVED] += removed
        else:
            self.filetypes[filetype] = [added, removed]

def parse_numstat(line, file_filter):
    """
        Try to read `line` as one line of numstat output.

        Returns (filename, filetype, added, removed).  All four are
        None when the line is not a numstat line, or when `file_filter`
        is given and does not match the file name.
    """
    nothing = (None, None, None, None)

    stat = patterns['numstat'].match (line)
    if not stat:
        return nothing

    filename = stat.group (3)
    # If we have a file filter, drop files it does not select.
    if file_filter and not file_filter.search (filename):
        return nothing

    try:
        added = int (stat.group (1))
        removed = int (stat.group (2))
    except ValueError:
        # A binary file (image, etc.) is marked with '-'
        added = removed = 0

    # A rename line is reassembled into a single file name from the
    # groups captured by the 'rename' pattern.
    renamed = patterns['rename'].match (filename)
    if renamed:
        filename = '%s%s%s' % (renamed.group (1), renamed.group (3),
                               renamed.group (4))

    filetype = database.FileTypes.guess_file_type (os.path.basename(filename))
    return filename, filetype, added, removed

#
# The core hack for grabbing the information about a changeset.
#
def grabpatch(logpatch):
    """
        Build a patch object from the lines of one changeset.

        `logpatch` is a sequence of lines whose first entry must match
        the 'commit' pattern; None is returned when it does not.  The
        remaining lines are checked, in order, against the author,
        signoff, credit-tag, merge and date patterns; anything else is
        treated as diff content (or as a numstat line when Numstat is
        set) and counted.
    """
    m = patterns['commit'].match (logpatch[0])
    if not m:
        return None

    p = patch(m.group (1))
    # With a file filter in force, diff lines are skipped until
    # ApplyFileFilter() sees a matching file header.
    ignore = (FileFilter is not None)
    for Line in logpatch[1:]:
        #
        # Maybe it's an author line?
        #
        m = patterns['author'].match (Line)
        if m:
            p.email = database.RemapEmail (m.group (2))
            p.author = LookupStoreHacker(m.group (1), p.email)
            continue
        #
        # Could be a signed-off-by:
        #
        m = patterns['signed-off-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            sobber = LookupStoreHacker(m.group (1), email)
            # The author's own signoff only counts when AuthorSOBs is set.
            if sobber != p.author or AuthorSOBs:
                p.sobs.append ((email, LookupStoreHacker(m.group (1), m.group (2))))
            continue
        #
        # Various other tags of interest.  The credit calls are handed
        # this patch (p), not the patch class.
        #
        m = patterns['reviewed-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreviewer (LookupStoreHacker(m.group (1), email))
            continue
        m = patterns['tested-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addtester (LookupStoreHacker (m.group (1), email))
            p.author.testcredit (p)
            continue
        # Reported-by:
        m = patterns['reported-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreporter (LookupStoreHacker (m.group (1), email))
            p.author.reportcredit (p)
            continue
        # Reported-and-tested-by:
        m = patterns['reported-and-tested-by'].match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            h = LookupStoreHacker (m.group (1), email)
            p.addreporter (h)
            p.addtester (h)
            p.author.reportcredit (p)
            p.author.testcredit (p)
            continue
        #
        # If this one is a merge, make note of the fact.
        #
        m = patterns['merge'].match (Line)
        if m:
            p.merge = 1
            continue
        #
        # See if it's the date.
        #
        m = patterns['date'].match (Line)
        if m:
            dt = rfc822.parsedate(m.group (2))
            p.date = datetime.date (dt[0], dt[1], dt[2])
            # Dates in the future are reported and clamped to today.
            if p.date > Today:
                sys.stderr.write ('Funky date: %s\n' % p.date)
                p.date = Today
            continue
        if not Numstat:
            #
            # If we have a file filter, check for file lines.
            #
            if FileFilter:
                ignore = ApplyFileFilter (Line, ignore)
            #
            # OK, maybe it's part of the diff itself.
            #
            if not ignore:
                if patterns['add'].match (Line):
                    p.added += 1
                    continue
                if patterns['rem'].match (Line):
                    p.removed += 1
        else:
            # Get the statistics (lines added/removed) using numstats
            # and without requiring a diff (--numstat instead of -p)
            (filename, filetype, added, removed) = parse_numstat (Line, FileFilter)
            if filename:
                p.added += added
                p.removed += removed
                p.addfiletype (filetype, added, removed)

    # An '@' in the name suggests an email address was used as the name.
    if '@' in p.author.name:
        GripeAboutAuthorName (p.author.name)

    return p

def GripeAboutAuthorName (name):
    """
        Complain on stdout about a suspicious author name, but only
        the first time that name is seen.
    """
    if name not in GripedAuthorNames:
        GripedAuthorNames.append (name)
        print ('%s is an author name, probably not what you want' % (name))

def ApplyFileFilter (line, ignore):
    """
        Return the new value of the "ignore" flag after looking at
        `line`: 0 to count the diff lines which follow, 1 to skip
        them, or `ignore` unchanged when the line says nothing new.
    """
    #
    # The first file line (--- a/) decides the matter one way
    # or the other.
    #
    first = patterns['filea'].match (line)
    if first:
        if FileFilter.search (first.group (1)):
            return 0
        return 1
    #
    # The second file line can turn ignore off, but never on.
    #
    second = patterns['fileb'].match (line)
    if second and FileFilter.search (second.group (1)):
        return 0
    return ignore

def is_svntag(logpatch):
    """
        Tell whether any line of `logpatch` matches the 'svn-tag'
        pattern, warning on stderr about the first one found.

        This works around a bug in the migration from Subversion to
        Git seen in GNOME; other repositories may be affected too.
    """
    for raw in logpatch:
        hit = patterns['svn-tag'].match(raw.strip())
        if not hit:
            continue
        sys.stderr.write ('(W) detected a commit on a svn tag: %s\n' %
                          (hit.group (0),))
        return True

    return False

#
# If this patch is signed off by both Andrew Morton and Linus Torvalds,
# remove the (redundant) Linus signoff.
#
def TrimLTSOBs (p):
    """
        Drop Linus's signoff from p.sobs when Andrew Morton signed off
        as well.  Does nothing unless AkpmOverLt is 1; Linus and Akpm
        are not looked at otherwise.
    """
    if AkpmOverLt != 1:
        return
    signoffs = p.sobs
    if Linus in signoffs and Akpm in signoffs:
        signoffs.remove (Linus)


#
# Here starts the real program.
#
ParseOpts ()

#
# Read the config files.
#
ConfigFile.ConfigFile (CFName, DirName)

#
# Let's pre-seed the database with a couple of hackers
# we want to remember.  Each is kept as an (email, hacker) tuple,
# the same shape as the entries of patch.sobs, so that TrimLTSOBs()
# can look for them there.  The names only exist when -a was given.
#
if AkpmOverLt == 1:
    Linus = ('torvalds@linux-foundation.org',
         LookupStoreHacker ('Linus Torvalds', 'torvalds@linux-foundation.org'))
    Akpm = ('akpm@linux-foundation.org',
        LookupStoreHacker ('Andrew Morton', 'akpm@linux-foundation.org'))

# Running totals over the counted changesets; TotalChanged adds up
# max (added, removed) for each of them.
TotalChanged = TotalAdded = TotalRemoved = 0

#
# Snarf changesets.
#
# Progress goes to stderr; the trailing '\r' lets each update
# overwrite the previous one on a terminal.
print >> sys.stderr, 'Grabbing changesets...\r',

patches = logparser.LogPatchSplitter(sys.stdin)
# printcount counts every changeset read; CSCount only the non-merge
# ones which make it through the filters below.
printcount = CSCount = 0

for logpatch in patches:
    if (printcount % 50) == 0:
        print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount,
    printcount += 1

    # We want to ignore commits on svn tags, since in Subversion
    # a tag means a copy of the whole repository, which leads to
    # wrong results.  Some migrations from Subversion to Git do
    # not catch all of these tag copies and import each of them
    # as one big new changeset.
    if is_svntag(logpatch):
        continue

    p = grabpatch(logpatch)
    # grabpatch() returns None when the first line is not a commit
    # line; processing stops altogether at that point.
    if not p:
        break
#    if p.added > 100000 or p.removed > 100000:
#        print 'Skipping massive add', p.commit
#        continue
    # With a file filter, a patch with no counted lines touched
    # nothing of interest.
    if FileFilter and p.added == 0 and p.removed == 0:
        continue

    #
    # skip over any OpenStack Jenkins automated commits
    #
    if p.email == "jenkins@openstack.org":
        continue
    if p.email == "jenkins@review.openstack.org":
        continue
    if p.email == "openstack-infra@lists.openstack.org":
        continue

    #
    # Record some global information - but only if this patch had
    # stuff which wasn't ignored.  Merges are never counted here.
    #
    if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge:
        TotalAdded += p.added
        TotalRemoved += p.removed
        TotalChanged += max (p.added, p.removed)
        AddDateLines (p.date, max (p.added, p.removed))
        # Presumably the employer the author worked for on that date.
        empl = p.author.emailemployer (p.email, p.date)
        empl.AddCSet (p)
        if AkpmOverLt:
            TrimLTSOBs (p)
        for sobemail, sobber in p.sobs:
            empl = sobber.emailemployer (sobemail, p.date)
            empl.AddSOB()

    # Per-developer credit, again for non-merge changesets only.
    if not p.merge:
        p.author.addpatch (p)
        for sobemail, sob in p.sobs:
            sob.addsob (p)
        for hacker in p.reviews:
            hacker.addreview (p)
        for hacker in p.testers:
            hacker.addtested (p)
        for hacker in p.reports:
            hacker.addreport (p)
        CSCount += 1
    # The CSV accumulators see every patch which reaches this point,
    # merges included.
    csvdump.AccumulatePatch (p, Aggregate)
    csvdump.store_patch (p)
print >> sys.stderr, 'Grabbing changesets...done       '

#
# Optional database dump (-z).
#
if DumpDB:
    database.DumpDB ()
database.MixVirtuals ()

#
# Say something: count the developers with at least one patch and
# the employers with a non-zero count, then print the summary.
#
hlist = database.AllHackers ()
elist = database.AllEmployers ()
ndev = len ([h for h in hlist if len (h.patches) > 0])
nempl = len ([e for e in elist if e.count > 0])
reports.Write ('Processed %d csets from %d developers\n' % (CSCount, ndev))
reports.Write ('%d employers found\n' % nempl)
reports.Write ('A total of %d lines added, %d removed (delta %d)\n' %
               (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))
if not TotalChanged:
    TotalChanged = 1 # HACK to avoid div by zero

#
# Optional data files: date statistics (-D) and CSV output (-p, -x).
#
if DateStats:
    PrintDateStats ()

if CSVPrefix:
    csvdump.save_csv (CSVPrefix)

if CSVFile:
    csvdump.OutputCSV (CSVFile)
    CSVFile.close ()

#
# The reports proper.  The employer report is always produced.
#
if DevReports:
    reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved)
if ReportUnknowns:
    reports.ReportUnknowns(hlist, CSCount)
reports.EmplReports (elist, TotalChanged, CSCount)

# File-type data is only collected in numstat mode.
if ReportByFileType and Numstat:
    reports.ReportByFileType (hlist)