From 289894269682b8aaf2abba48f9a93d343fcd279b Mon Sep 17 00:00:00 2001
From: Doug Hellmann
Date: Thu, 13 Nov 2014 13:07:13 +0000
Subject: [PATCH] track git sha for files of interest

we will use this in a future commit to avoid re-reading files that
haven't changed
---
 .../versions/22e0aa22ab8e_track_file_hash.py | 22 +++++++++++++++
 aeromancer/db/models.py                      |  1 +
 aeromancer/project.py                        | 27 ++++++++++++++-----
 3 files changed, 44 insertions(+), 6 deletions(-)
 create mode 100644 aeromancer/db/alembic/versions/22e0aa22ab8e_track_file_hash.py

diff --git a/aeromancer/db/alembic/versions/22e0aa22ab8e_track_file_hash.py b/aeromancer/db/alembic/versions/22e0aa22ab8e_track_file_hash.py
new file mode 100644
index 0000000..156b873
--- /dev/null
+++ b/aeromancer/db/alembic/versions/22e0aa22ab8e_track_file_hash.py
@@ -0,0 +1,22 @@
+"""track file hash
+
+Revision ID: 22e0aa22ab8e
+Revises: 1fb08a62dd91
+Create Date: 2014-11-13 00:32:24.909035
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = '22e0aa22ab8e'
+down_revision = '1fb08a62dd91'
+
+from alembic import op
+import sqlalchemy as sa
+
+
+def upgrade():
+    op.add_column('file', sa.Column('sha', sa.String))
+
+
+def downgrade():
+    op.drop_column('file', 'sha')
diff --git a/aeromancer/db/models.py b/aeromancer/db/models.py
index fb04478..d75f82a 100644
--- a/aeromancer/db/models.py
+++ b/aeromancer/db/models.py
@@ -21,6 +21,7 @@ class File(Base):
     project_id = Column(Integer, ForeignKey('project.id'))
     name = Column(String, nullable=False)
     path = Column(String)
+    sha = Column(String)
     lines = relationship('Line',
                          backref='file',
                          cascade="all, delete, delete-orphan")
diff --git a/aeromancer/project.py b/aeromancer/project.py
index ab574e2..4d6df1c 100644
--- a/aeromancer/project.py
+++ b/aeromancer/project.py
@@ -26,15 +26,23 @@ def discover(repo_root, organizations):
     )
 
 def _find_files_in_project(path):
-    """Return a list of the files managed in the project.
+    """Return a list of the files managed in the project and their sha hash.
 
-    Uses 'git ls-files'
+    Uses 'git ls-files -s'
     """
     with utils.working_dir(path):
-        cmd = subprocess.Popen(['git', 'ls-files', '-z'],
+        # Ask git to tell us the sha hash so we can tell if the file
+        # has changed since we looked at it last.
+        cmd = subprocess.Popen(['git', 'ls-files', '-z', '-s'],
                                stdout=subprocess.PIPE)
         output = cmd.communicate()[0]
-    return output.split('\0')
+    entries = output.split('\0')
+    for e in entries:
+        if not e:
+            continue
+        metadata, ignore, filename = e.partition('\t')
+        sha = metadata.split(' ')[1]
+        yield (filename, sha)
 
 
 class ProjectManager(object):
@@ -104,6 +112,13 @@ class ProjectManager(object):
     def _update_project_files(self, proj_obj):
         """Update the files stored for each project"""
         LOG.debug('reading file contents in %s', proj_obj.name)
+
+        # FIXME: Need to be smarter about updating files here. We have
+        # the full file contents, so we could compute a hash to see if
+        # the file has changed. Then we only have to delete data for
+        # the files that have changed, and re-read those, rather than
+        # reloading all of the files.
+
         # Delete any existing files in case the list of files being
         # managed has changed. This naive, and we can do better, but as a
         # first version it's OK.
@@ -113,11 +128,11 @@ class ProjectManager(object):
         query.delete()
 
         # Now load the files currently being managed by git.
-        for filename in _find_files_in_project(proj_obj.path):
+        for filename, sha in _find_files_in_project(proj_obj.path):
             fullname = os.path.join(proj_obj.path, filename)
             if not os.path.isfile(fullname):
                 continue
-            new_file = File(project=proj_obj, name=filename, path=fullname)
+            new_file = File(project=proj_obj, name=filename, path=fullname, sha=sha)
             self.session.add(new_file)
             if any(fnmatch.fnmatch(filename, dnr) for dnr in self._DO_NOT_READ):
                 LOG.debug('ignoring contents of %s', fullname)