Optionally disable disk_limit_per_job

Our current approach to enforcing the per-job disk limit can be very expensive because it runs 'du' in a loop. With many repos in the cache and many running jobs, this can poison the cache and induce a large amount of IO load, which can hurt overall performance, especially if zuul is running on shared storage like ceph. Allow setting disk_limit_per_job to -1 to disable the limit; in that case the disk accountant is not started. Change-Id: Ic03168e30e0cba4a4adb42eebf4709ceba0d8c3e
2019-02-22 08:10:59 +01:00 · 2019-02-22 08:10:59 +01:00 · 0a394ad67e
parent d60346181b
commit 0a394ad67e
3 changed files with 22 additions and 1 deletion
--- a/doc/source/admin/components.rst
+++ b/doc/source/admin/components.rst
@@ -597,7 +597,7 @@ The following sections of ``zuul.conf`` are used by the executor:
      This integer is the maximum number of megabytes that any one job
      is allowed to consume on disk while it is running. If a job's
      scratch space has more than this much space consumed, it will be
-      aborted.
+      aborted. Set to -1 to disable the limit.

   .. attr:: trusted_ro_paths

--- a/tests/unit/test_disk_accountant.py
+++ b/tests/unit/test_disk_accountant.py
@@ -65,6 +65,18 @@ class TestDiskAccountant(BaseTestCase):
            da.stop()
        self.assertFalse(da.thread.is_alive())

+    def test_disk_accountant_no_limit(self):
+        jobs_dir = tempfile.mkdtemp(
+            dir=os.environ.get("ZUUL_TEST_ROOT", None))
+        cache_dir = tempfile.mkdtemp()
+        executor_server = FakeExecutor()
+        da = DiskAccountant(jobs_dir, -1, executor_server.stopJobByJobDir,
+                            cache_dir)
+        da.start()
+        self.assertFalse(da.running)
+        da.stop()
+        self.assertFalse(da.running)
+
    def test_cache_hard_links(self):
        root_dir = tempfile.mkdtemp(
            dir=os.environ.get("ZUUL_TEST_ROOT", None))
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@@ -148,16 +148,25 @@ class DiskAccountant(object):
            self.stop_event.wait(delay_time)

    def start(self):
+        if self.limit < 0:
+            # No need to start if there is no limit.
+            return
        self._running = True
        self.thread.start()

    def stop(self):
+        if not self.running:
+            return
        self._running = False
        self.stop_event.set()
        # We join here to avoid whitelisting the thread -- if it takes more
        # than 5s to stop in tests, there's a problem.
        self.thread.join(timeout=5)

+    @property
+    def running(self):
+        return self._running
+

 class Watchdog(object):
    def __init__(self, timeout, function, args):