From 4ba48d996728c30e653120ffd7891b14480b5281 Mon Sep 17 00:00:00 2001 From: Clark Boylan Date: Mon, 11 Nov 2013 18:03:53 -0800 Subject: [PATCH] GC git.Repo objects in merger.Repo operations. * zuul/merger.py: Zuul's merger.Repo is holding on to repo objects that keep many file descriptors open. Turn all git.Repo objects into method local references so that the git.Repo objects can be garbage collected once these methods are popped off the stack. GCing these objects frees the file descriptors associated with that repo. This adds overhead to repo operations as each operation must do initial setup IO. * tests/test_scheduler.py: Assert that no instances of git.Repo are known to the Python garbage collector. Do the assertion after running a full garbage collection run. This is a cheap check that repo objects are being garbage collected properly at the end of each test. Change-Id: I1d67981f32708a85af62ca622402de9fac0e1842 --- tests/test_scheduler.py | 8 ++++ zuul/merger.py | 87 ++++++++++++++++++----------------- 2 files changed, 47 insertions(+), 48 deletions(-) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index cdc67690b7..385fda76da 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -16,6 +16,7 @@ import ConfigParser from cStringIO import StringIO +import gc import hashlib import json import logging @@ -834,6 +835,13 @@ class TestScheduler(testtools.TestCase): def assertFinalState(self): # Make sure that the change cache is cleared self.assertEqual(len(self.gerrit._change_cache.keys()), 0) + # Make sure that git.Repo objects have been garbage collected. 
+ repos = [] + gc.collect() + for obj in gc.get_objects(): + if isinstance(obj, git.Repo): + repos.append(obj) + self.assertEqual(len(repos), 0) self.assertEmptyQueues() def shutdown(self): diff --git a/zuul/merger.py b/zuul/merger.py index 5704d617dd..1f3c547b0e 100644 --- a/zuul/merger.py +++ b/zuul/merger.py @@ -46,103 +46,96 @@ class Repo(object): self.log.debug("Cloning from %s to %s" % (self.remote_url, self.local_path)) git.Repo.clone_from(self.remote_url, self.local_path) - self.repo = git.Repo(self.local_path) + repo = git.Repo(self.local_path) if self.email: - self.repo.config_writer().set_value('user', 'email', - self.email) + repo.config_writer().set_value('user', 'email', + self.email) if self.username: - self.repo.config_writer().set_value('user', 'name', - self.username) - self.repo.config_writer().write() + repo.config_writer().set_value('user', 'name', + self.username) + repo.config_writer().write() self._initialized = True - def recreateRepoObject(self): - self._ensure_cloned() - self.repo = git.Repo(self.local_path) + def createRepoObject(self): + try: + self._ensure_cloned() + repo = git.Repo(self.local_path) + except: + self.log.exception("Unable to initialize repo for %s" % + self.local_path) + return repo def reset(self): - self._ensure_cloned() + repo = self.createRepoObject() self.log.debug("Resetting repository %s" % self.local_path) self.update() - origin = self.repo.remotes.origin + origin = repo.remotes.origin for ref in origin.refs: if ref.remote_head == 'HEAD': continue - self.repo.create_head(ref.remote_head, ref, force=True) + repo.create_head(ref.remote_head, ref, force=True) # Reset to remote HEAD (usually origin/master) - self.repo.head.reference = origin.refs['HEAD'] - self.repo.head.reset(index=True, working_tree=True) - self.repo.git.clean('-x', '-f', '-d') + repo.head.reference = origin.refs['HEAD'] + repo.head.reset(index=True, working_tree=True) + repo.git.clean('-x', '-f', '-d') def getBranchHead(self, branch): - return 
self.repo.heads[branch] + repo = self.createRepoObject() + branch_head = repo.heads[branch] + return branch_head def checkout(self, ref): - self._ensure_cloned() + repo = self.createRepoObject() self.log.debug("Checking out %s" % ref) - self.repo.head.reference = ref - self.repo.head.reset(index=True, working_tree=True) + repo.head.reference = ref + repo.head.reset(index=True, working_tree=True) def cherryPick(self, ref): - self._ensure_cloned() + repo = self.createRepoObject() self.log.debug("Cherry-picking %s" % ref) self.fetch(ref) - self.repo.git.cherry_pick("FETCH_HEAD") + repo.git.cherry_pick("FETCH_HEAD") def merge(self, ref, strategy=None): - self._ensure_cloned() + repo = self.createRepoObject() args = [] if strategy: args += ['-s', strategy] args.append('FETCH_HEAD') self.fetch(ref) self.log.debug("Merging %s with args %s" % (ref, args)) - self.repo.git.merge(*args) + repo.git.merge(*args) def fetch(self, ref): - self._ensure_cloned() + repo = self.createRepoObject() # The git.remote.fetch method may read in git progress info and # interpret it improperly causing an AssertionError. Because the # data was fetched properly subsequent fetches don't seem to fail. # So try again if an AssertionError is caught. - origin = self.repo.remotes.origin + origin = repo.remotes.origin try: origin.fetch(ref) except AssertionError: origin.fetch(ref) - # If the repository is packed, and we fetch a change that is - # also entirely packed, the cache may be out of date for the - # same reason as reset() above. Avoid these problems by - # recreating the repo object. 
- # https://bugs.launchpad.net/zuul/+bug/1078946 - self.repo = git.Repo(self.local_path) - def createZuulRef(self, ref, commit='HEAD'): - self._ensure_cloned() + repo = self.createRepoObject() self.log.debug("CreateZuulRef %s at %s " % (ref, commit)) - ref = ZuulReference.create(self.repo, ref, commit) + ref = ZuulReference.create(repo, ref, commit) return ref.commit def push(self, local, remote): - self._ensure_cloned() + repo = self.createRepoObject() self.log.debug("Pushing %s:%s to %s " % (local, remote, self.remote_url)) - self.repo.remotes.origin.push('%s:%s' % (local, remote)) + repo.remotes.origin.push('%s:%s' % (local, remote)) def update(self): - self._ensure_cloned() + repo = self.createRepoObject() self.log.debug("Updating repository %s" % self.local_path) - origin = self.repo.remotes.origin + origin = repo.remotes.origin origin.update() - # If the remote repository is repacked, the repo object's - # cache may be out of date. Specifically, it caches whether - # to check the loose or packed DB for a given SHA. Further, - # if there was no pack or lose directory to start with, the - # repo object may not even have a database for it. Avoid - # these problems by recreating the repo object. - self.repo = git.Repo(self.local_path) class Merger(object): @@ -180,9 +173,7 @@ class Merger(object): self.log.exception("Unable to add project %s" % project) def getRepo(self, project): - r = self.repos.get(project, None) - r.recreateRepoObject() - return r + return self.repos.get(project, None) def updateRepo(self, project): repo = self.getRepo(project)