Optionally disable disk_limit_per_job

Our current approach to enforce the disk limit per job can be very
expensive by running 'du' in a loop. When having many repos in the
cache and many running jobs this can poison the cache and induce a
large amount of IO load. This can influence overall performance
especially if zuul is running on a shared storage like ceph.

Change-Id: Ic03168e30e0cba4a4adb42eebf4709ceba0d8c3e
This commit is contained in:
Tobias Henkel 2019-02-22 08:10:59 +01:00
parent d60346181b
commit 0a394ad67e
No known key found for this signature in database
GPG Key ID: 03750DEC158E5FA2
3 changed files with 22 additions and 1 deletion

View File

@ -597,7 +597,7 @@ The following sections of ``zuul.conf`` are used by the executor:
This integer is the maximum number of megabytes that any one job
is allowed to consume on disk while it is running.  If a job's
scratch space has more than this much space consumed, it will be
aborted.  Set to -1 to disable the limit.
.. attr:: trusted_ro_paths

View File

@ -65,6 +65,18 @@ class TestDiskAccountant(BaseTestCase):
da.stop() da.stop()
self.assertFalse(da.thread.is_alive()) self.assertFalse(da.thread.is_alive())
def test_disk_accountant_no_limit(self):
    """A DiskAccountant created with a negative limit never runs.

    With the limit disabled (-1), start() must be a no-op and the
    accountant must report itself as not running both after start()
    and after stop().
    """
    work_root = os.environ.get("ZUUL_TEST_ROOT", None)
    job_root = tempfile.mkdtemp(dir=work_root)
    merge_root = tempfile.mkdtemp()
    fake_executor = FakeExecutor()
    accountant = DiskAccountant(
        job_root, -1, fake_executor.stopJobByJobDir, merge_root)
    accountant.start()
    # start() returns early for a disabled limit, so no thread runs.
    self.assertFalse(accountant.running)
    accountant.stop()
    self.assertFalse(accountant.running)
def test_cache_hard_links(self): def test_cache_hard_links(self):
root_dir = tempfile.mkdtemp( root_dir = tempfile.mkdtemp(
dir=os.environ.get("ZUUL_TEST_ROOT", None)) dir=os.environ.get("ZUUL_TEST_ROOT", None))

View File

@ -148,16 +148,25 @@ class DiskAccountant(object):
self.stop_event.wait(delay_time) self.stop_event.wait(delay_time)
def start(self):
    """Start the disk accountant's monitoring thread.

    If the configured limit is negative the per-job disk limit is
    disabled, so no monitoring thread is started at all; this avoids
    the expensive 'du' polling loop entirely.
    """
    if self.limit < 0:
        # No need to start if there is no limit.
        return
    self._running = True
    self.thread.start()
def stop(self):
    """Stop the disk accountant's monitoring thread, if it is running.

    Returns immediately when the accountant was never started (e.g.
    the limit is disabled), since there is then no thread to stop.
    """
    if not self.running:
        return
    self._running = False
    self.stop_event.set()
    # We join here to avoid whitelisting the thread -- if it takes more
    # than 5s to stop in tests, there's a problem.
    self.thread.join(timeout=5)
@property
def running(self):
    # Read-only view of the accountant's state: True between start()
    # and stop(); stays False when start() skipped launching the
    # thread because the limit is disabled.
    return self._running
class Watchdog(object): class Watchdog(object):
def __init__(self, timeout, function, args): def __init__(self, timeout, function, args):