track git sha for files of interest

We will use this in a future commit to avoid re-reading files that
haven't changed.
This commit is contained in:
Doug Hellmann 2014-11-13 13:07:13 +00:00
parent f0e635838f
commit 2898942696
3 changed files with 44 additions and 6 deletions

View File

@ -0,0 +1,22 @@
"""track file hash
Revision ID: 22e0aa22ab8e
Revises: 1fb08a62dd91
Create Date: 2014-11-13 00:32:24.909035
"""
# revision identifiers, used by Alembic.
revision = '22e0aa22ab8e'
down_revision = '1fb08a62dd91'
from alembic import op
import sqlalchemy as sa
def upgrade():
    """Add the 'sha' column to the 'file' table.

    Stores the git blob hash so unchanged files can be skipped on
    subsequent reads.
    """
    sha_column = sa.Column('sha', sa.String)
    op.add_column('file', sha_column)
def downgrade():
    """Remove the 'sha' column from the 'file' table."""
    table_name, column_name = 'file', 'sha'
    op.drop_column(table_name, column_name)

View File

@ -21,6 +21,7 @@ class File(Base):
project_id = Column(Integer, ForeignKey('project.id')) project_id = Column(Integer, ForeignKey('project.id'))
name = Column(String, nullable=False) name = Column(String, nullable=False)
path = Column(String) path = Column(String)
sha = Column(String)
lines = relationship('Line', lines = relationship('Line',
backref='file', backref='file',
cascade="all, delete, delete-orphan") cascade="all, delete, delete-orphan")

View File

@ -26,15 +26,23 @@ def discover(repo_root, organizations):
) )
def _find_files_in_project(path): def _find_files_in_project(path):
"""Return a list of the files managed in the project. """Return a list of the files managed in the project and their sha hash.
Uses 'git ls-files' Uses 'git ls-files -s'
""" """
with utils.working_dir(path): with utils.working_dir(path):
cmd = subprocess.Popen(['git', 'ls-files', '-z'], # Ask git to tell us the sha hash so we can tell if the file
# has changed since we looked at it last.
cmd = subprocess.Popen(['git', 'ls-files', '-z', '-s'],
stdout=subprocess.PIPE) stdout=subprocess.PIPE)
output = cmd.communicate()[0] output = cmd.communicate()[0]
return output.split('\0') entries = output.split('\0')
for e in entries:
if not e:
continue
metadata, ignore, filename = e.partition('\t')
sha = metadata.split(' ')[1]
yield (filename, sha)
class ProjectManager(object): class ProjectManager(object):
@ -104,6 +112,13 @@ class ProjectManager(object):
def _update_project_files(self, proj_obj): def _update_project_files(self, proj_obj):
"""Update the files stored for each project""" """Update the files stored for each project"""
LOG.debug('reading file contents in %s', proj_obj.name) LOG.debug('reading file contents in %s', proj_obj.name)
# FIXME: Need to be smarter about updating files here. We have
# the full file contents, so we could compute a hash to see if
# the file has changed. Then we only have to delete data for
# the files that have changed, and re-read those, rather than
# reloading all of the files.
# Delete any existing files in case the list of files being # Delete any existing files in case the list of files being
# managed has changed. This is naive, and we can do better, but as a # managed has changed. This is naive, and we can do better, but as a
# first version it's OK. # first version it's OK.
@ -113,11 +128,11 @@ class ProjectManager(object):
query.delete() query.delete()
# Now load the files currently being managed by git. # Now load the files currently being managed by git.
for filename in _find_files_in_project(proj_obj.path): for filename, sha in _find_files_in_project(proj_obj.path):
fullname = os.path.join(proj_obj.path, filename) fullname = os.path.join(proj_obj.path, filename)
if not os.path.isfile(fullname): if not os.path.isfile(fullname):
continue continue
new_file = File(project=proj_obj, name=filename, path=fullname) new_file = File(project=proj_obj, name=filename, path=fullname, sha=sha)
self.session.add(new_file) self.session.add(new_file)
if any(fnmatch.fnmatch(filename, dnr) for dnr in self._DO_NOT_READ): if any(fnmatch.fnmatch(filename, dnr) for dnr in self._DO_NOT_READ):
LOG.debug('ignoring contents of %s', fullname) LOG.debug('ignoring contents of %s', fullname)