# gitdm/database.py
#
# 326 lines
# 9.1 KiB
# Python

#
# The "database".
#
# This code is part of the LWN git data miner.
#
# Copyright 2007-11 Eklektix, Inc.
# Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
#
import sys, datetime
class Hacker:
    """One contributor: identities, employer history and credited work.

    ``email`` and ``employer`` are parallel lists; ``employer[i]`` is the
    employer list that was registered together with ``email[i]``.  Each
    employer list is iterated as (date, employer) pairs.
    """

    def __init__(self, name, id, elist, email):
        """Create a hacker known by a single address and employer list."""
        self.name = name
        self.id = id
        self.email = [email]
        self.employer = [elist]
        # Line totals accumulated by addpatch().
        self.added = 0
        self.removed = 0
        # Counters bumped by testcredit() and reportcredit().
        self.testcred = 0
        self.repcred = 0
        # Per-category lists of the patches/bugs this hacker is tied to.
        self.patches = []
        self.signoffs = []
        self.reviews = []
        self.tested = []
        self.reports = []
        self.bugsfixed = []
        self.versions = []

    def addemail(self, email, elist):
        """Attach another address (with its employer list) to this hacker.

        The address is also registered in the module-level HackersByEmail
        index.
        """
        self.email.append(email)
        self.employer.append(elist)
        HackersByEmail[email] = self

    def emailemployer(self, email, date):
        """Return the employer recorded for *email* at *date*.

        Every employer list whose address equals *email* is scanned in
        order; the first entry whose date is later than *date* wins.  When
        nothing matches, a diagnostic is printed and None is returned.
        """
        for addr, history in zip(self.email, self.employer):
            if addr != email:
                continue
            for edate, empl in history:
                if edate > date:
                    return empl
        # Should not happen.  The fields are joined by hand so the line is
        # the same whether print is a statement or a function.
        fields = ('OOPS. ', self.name, self.employer, self.email, email, date)
        print(' '.join(['%s' % (field,) for field in fields]))
        return None

    def addpatch(self, patch):
        """Record *patch* and add its line counts to the running totals."""
        self.patches.append(patch)
        self.added += patch.added
        self.removed += patch.removed

    #
    # Note that the author is represented in this release.
    #
    def addversion(self, release):
        """Remember *release*, ignoring releases already recorded."""
        if release in self.versions:
            return
        self.versions.append(release)

    #
    # There's got to be a better way.
    #
    def addsob(self, patch):
        """Record a patch this hacker signed off on."""
        self.signoffs.append(patch)

    def addreview(self, patch):
        """Record a patch this hacker reviewed."""
        self.reviews.append(patch)

    def addtested(self, patch):
        """Record a patch this hacker tested."""
        self.tested.append(patch)

    def addreport(self, patch):
        """Record a patch this hacker reported."""
        self.reports.append(patch)

    def reportcredit(self, patch):
        """Bump the report-credit counter; *patch* itself is not stored."""
        self.repcred += 1

    def testcredit(self, patch):
        """Bump the test-credit counter; *patch* itself is not stored."""
        self.testcred += 1

    def addbugfixed(self, bug):
        """Record a bug fixed by this hacker."""
        self.bugsfixed.append(bug)
# Indexes over every known Hacker, keyed by name, email address and
# numeric id respectively.
HackersByName = {}
HackersByEmail = {}
HackersByID = {}
# Next id to hand out when a hacker is stored.
MaxID = 0
def StoreHacker(name, elist, email):
    """Create a Hacker, index it by name, email and id, and return it.

    The id is the current value of the module-level MaxID counter, which
    is then advanced by one.
    """
    global MaxID
    new_id = MaxID
    MaxID = new_id + 1
    hacker = Hacker(name, new_id, elist, email)
    HackersByName[name] = hacker
    HackersByEmail[email] = hacker
    HackersByID[new_id] = hacker
    return hacker
def LookupEmail(addr):
    """Return the Hacker indexed under *addr*, or None if there is none."""
    return HackersByEmail.get(addr)
def LookupName(name):
    """Return the Hacker indexed under *name*, or None if there is none."""
    return HackersByName.get(name)
def LookupID(id):
    """Return the Hacker indexed under numeric *id*, or None if unknown."""
    return HackersByID.get(id)
def AllHackers():
    """Return every stored Hacker (the values of HackersByID), unfiltered."""
    everyone = HackersByID.values()
    return everyone
def DumpDB():
    """Write a plain-text dump of every hacker to the file 'database.dump'.

    Hackers are listed in sorted-name order.  For each one the dump has a
    summary line (id, name, patch count, lines added/removed, signoff
    count), then every email address with its dated employer entries, and
    finally the versions list when it is non-empty.

    The file is opened in a ``with`` block so it is flushed and closed
    even if a write fails (this needs Python 2.6 or later); the original
    never closed the handle.
    """
    with open('database.dump', 'w') as out:
        # sorted() works whether the keys come back as a list or a view.
        for name in sorted(HackersByName):
            h = HackersByName[name]
            out.write('%4d %s %d p (+%d -%d) sob: %d\n'
                      % (h.id, h.name, len(h.patches),
                         h.added, h.removed, len(h.signoffs)))
            # h.email and h.employer are parallel lists.
            for email, history in zip(h.email, h.employer):
                out.write('\t%s -> \n' % (email,))
                for date, empl in history:
                    out.write('\t\t %d-%d-%d %s\n'
                              % (date.year, date.month, date.day, empl.name))
            if h.versions:
                out.write('\tVersions: %s\n' % ','.join(h.versions))
#
# Hack: The first visible tag comes a ways into the stream; when we see it,
# push it backward through the changes we've already seen.
#
def ApplyFirstTag(tag):
    """Replace every non-empty versions list with a fresh ``[tag]``.

    Hackers whose versions list is empty are left untouched.
    """
    for hacker in HackersByName.values():
        if hacker.versions:
            hacker.versions = [tag]
#
# Employer info.
#
class Employer:
    """Running totals of the work credited to one employer."""

    def __init__(self, name):
        """Start with zeroed counters and empty lists under *name*."""
        self.name = name
        # Changeset statistics maintained by AddCSet().
        self.added = 0
        self.removed = 0
        self.count = 0
        self.changed = 0
        # Number of signoffs, maintained by AddSOB().
        self.sobs = 0
        self.bugsfixed = []
        self.reviews = []
        # Distinct people seen so far, in first-seen order.
        self.hackers = []

    def _remember(self, person):
        """Append *person* to ``hackers`` unless already listed."""
        if person in self.hackers:
            return
        self.hackers.append(person)

    def AddCSet(self, patch):
        """Fold one changeset into the totals and note its author.

        ``changed`` grows by the larger of the patch's added and removed
        line counts.
        """
        self.added += patch.added
        self.removed += patch.removed
        self.changed += max(patch.added, patch.removed)
        self.count += 1
        self._remember(patch.author)

    def AddSOB(self):
        """Count one more signoff."""
        self.sobs += 1

    def AddBug(self, bug):
        """Record a fixed bug and note its owner."""
        self.bugsfixed.append(bug)
        self._remember(bug.owner)

    def AddReview(self, reviewer):
        """Record a review and note the reviewer."""
        self.reviews.append(reviewer)
        self._remember(reviewer)
# Every known employer object, keyed by employer name.
Employers = {}
def GetEmployer(name):
    """Return the employer stored under *name*, creating it on first use."""
    if name not in Employers:
        Employers[name] = Employer(name)
    return Employers[name]
def AllEmployers():
    """Return every stored employer object (the values of Employers)."""
    everyone = Employers.values()
    return everyone
#
# Certain obnoxious developers, who will remain nameless (because we
# would never want to run afoul of Thomas) want their work split among
# multiple companies. Let's try to cope with that. Let's also hope
# this doesn't spread.
#
class VirtualEmployer(Employer):
    """An employer whose totals are divided among real employers.

    ``splits`` holds (real employer name, fraction) pairs added through
    addsplit().
    """

    def __init__(self, name):
        """Create an employer named *name* with no splits yet."""
        Employer.__init__(self, name)
        self.splits = []

    def addsplit(self, name, fraction):
        """Send *fraction* of this employer's totals to employer *name*."""
        self.splits.append((name, fraction))

    def applysplits(self):
        """Go through and (destructively) apply our credits to the real
        employers.  Only one level of weirdness is supported.

        Each real employer's added/removed/changed/count totals grow by
        this employer's totals times the fraction, truncated with int().
        Afterwards this object is re-initialised, which zeroes its
        counters and also empties ``splits``.
        """
        for name, fraction in self.splits:
            real = GetEmployer(name)
            real.added += int(self.added * fraction)
            real.removed += int(self.removed * fraction)
            real.changed += int(self.changed * fraction)
            real.count += int(self.count * fraction)
        # Reset counts just in case.  Re-initialise under our OWN name:
        # the loop variable `name` would rename us to the last split, and
        # is unbound (NameError) when there are no splits at all.
        self.__init__(self.name)

    def store(self):
        """Register this employer in Employers under its name.

        Warnings go to stderr when an existing entry is replaced (that
        entry is also printed) and when no splits have been added.
        """
        if self.name in Employers:
            print(Employers[self.name])
            sys.stderr.write('WARNING: Virtual empl %s overwrites another\n'
                             % (self.name))
        if not self.splits:
            sys.stderr.write('WARNING: Virtual empl %s has no splits\n'
                             % (self.name))
        # Should check that they add up too, but I'm lazy
        Employers[self.name] = self
class FileType:
    """Guess a file's type by trying patterns against its name.

    ``patterns`` maps a type name to a sequence of objects that have a
    ``search(filename)`` method; ``order`` lists the type names in the
    order they are tried.
    """

    def __init__(self, patterns=None, order=None):
        """Store the pattern table and lookup order.

        Both default to a fresh empty container per instance.  The
        original used ``{}`` / ``[]`` literals as defaults, so every
        instance built without arguments shared (and could mutate) the
        same dict and list.
        """
        self.patterns = {} if patterns is None else patterns
        self.order = [] if order is None else order

    def guess_file_type(self, filename, patterns=None, order=None):
        """Return the first type in *order* with a pattern matching *filename*.

        *patterns* and *order* fall back to the instance's own tables when
        omitted or empty.  Type names in *order* that have no entry in
        *patterns* are skipped.  Returns 'unknown' when nothing matches.
        """
        patterns = patterns or self.patterns
        order = order or self.order
        for file_type in order:
            if file_type in patterns:
                for patt in patterns[file_type]:
                    if patt.search(filename):
                        return file_type
        return 'unknown'
#
# By default we recognize nothing.
#
# Module-wide FileType instance, created with no patterns and no order.
FileTypes = FileType(patterns={}, order=[])
#
# Mix all the virtual employers into their real destinations.
#
def MixVirtuals():
    """Call applysplits() on every stored VirtualEmployer."""
    virtuals = [empl for empl in AllEmployers()
                if isinstance(empl, VirtualEmployer)]
    for empl in virtuals:
        empl.applysplits()
#
# The email map.
#
# Maps a variant email address to its canonical form.
EmailAliases = {}
def AddEmailAlias(variant, canonical):
    """Map *variant* to *canonical* in EmailAliases.

    A variant that is already mapped gets a warning on stderr and is then
    overwritten with the new canonical address.
    """
    already_mapped = variant in EmailAliases
    if already_mapped:
        sys.stderr.write('Duplicate email alias for %s\n' % (variant))
    EmailAliases[variant] = canonical
def RemapEmail(email):
    """Lower-case *email* and return its alias target, if it has one.

    Addresses with no entry in EmailAliases come back lower-cased.
    """
    lowered = email.lower()
    return EmailAliases.get(lowered, lowered)
#
# Email-to-employer mapping.
#
# Maps a lower-cased address (or domain) to a list of
# (end date, employer) pairs.
EmailToEmployer = {}
# Default end date for a mapping: 365 days after the day this module loads.
nextyear = datetime.date.today() + datetime.timedelta(days=365)
def AddEmailEmployerMapping(email, employer, end = nextyear):
    """Record that *email* belongs to *employer* up to the date *end*.

    *email* is lower-cased and *employer* is resolved through
    GetEmployer().  An *end* of None means the module's ``nextyear``.
    The new (end, employer) pair goes in front of the first existing
    entry with a later date, or at the tail if there is none.  A warning
    is printed for each existing entry passed over that has the same end
    date.
    """
    if end is None:
        end = nextyear
    email = email.lower()
    entry = (end, GetEmployer(employer))
    history = EmailToEmployer.get(email)
    if history is None:
        # First mapping seen for this address.
        EmailToEmployer[email] = [entry]
        return
    for pos, (date, _) in enumerate(history):
        if date == end:  # probably both nextyear
            print('WARNING: duplicate email/empl for %s' % (email))
        if date > end:
            history.insert(pos, entry)
            return
    history.append(entry)
def MapToEmployer(email, unknown = 0):
    """Return the list of (end date, employer) pairs for *email*.

    Lookup order in EmailToEmployer: the full address, then the domain's
    suffixes starting with the last two labels and growing one label at
    a time up to the whole domain.  Failing that, the result is a
    one-entry list ending at ``nextyear``: the '(Unknown)' employer when
    *unknown* is true, otherwise an employer named after the address.
    An address with no '@' is reported and mapped to 'Funky'.
    """
    # Somebody sometimes does s/@/ at /; let's fix it.
    email = email.lower().replace(' at ', '@')
    if email in EmailToEmployer:
        return EmailToEmployer[email]

    pieces = email.split('@')
    if len(pieces) < 2:
        print('Oops...funky email %s' % email)
        return [(nextyear, GetEmployer('Funky'))]

    labels = pieces[1].split('.')
    for start in reversed(range(len(labels) - 1)):
        domain = '.'.join(labels[start:])
        if domain in EmailToEmployer:
            return EmailToEmployer[domain]
    #
    # We don't know who they work for.
    #
    fallback = '(Unknown)' if unknown else email
    return [(nextyear, GetEmployer(fallback))]
def LookupEmployer(email, mapunknown = 0):
    """Return MapToEmployer()'s employer list for *email*, unchanged.

    *mapunknown* is passed through as MapToEmployer's ``unknown`` flag.
    """
    return MapToEmployer(email, mapunknown)