Optionally disable disk_limit_per_job

Our current approach to enforce the disk limit per job can be very
expensive by running 'du' in a loop. When having many repos in the
cache and many running jobs this can poison the cache and induce a
large amount of IO load. This can influence overall performance
especially if zuul is running on a shared storage like ceph.

Change-Id: Ic03168e30e0cba4a4adb42eebf4709ceba0d8c3e
This commit is contained in:
Tobias Henkel 2019-02-22 08:10:59 +01:00
parent d60346181b
commit 0a394ad67e
No known key found for this signature in database
GPG Key ID: 03750DEC158E5FA2
3 changed files with 22 additions and 1 deletion

View File

@ -597,7 +597,7 @@ The following sections of ``zuul.conf`` are used by the executor:
This integer is the maximum number of megabytes that any one job
is allowed to consume on disk while it is running.  If a job's
scratch space has more than this much space consumed, it will be
aborted.  Set to -1 to disable the limit.
.. attr:: trusted_ro_paths

View File

@ -65,6 +65,18 @@ class TestDiskAccountant(BaseTestCase):
da.stop() da.stop()
self.assertFalse(da.thread.is_alive()) self.assertFalse(da.thread.is_alive())
def test_disk_accountant_no_limit(self):
    """A DiskAccountant created with a negative limit never runs.

    With the limit disabled (-1), start() must be a no-op and the
    accountant must report itself as not running both after start()
    and after stop().
    """
    work_root = os.environ.get("ZUUL_TEST_ROOT", None)
    job_root = tempfile.mkdtemp(dir=work_root)
    merge_root = tempfile.mkdtemp()
    fake_executor = FakeExecutor()
    accountant = DiskAccountant(
        job_root, -1, fake_executor.stopJobByJobDir, merge_root)
    accountant.start()
    # start() returns early for a disabled limit, so no thread runs.
    self.assertFalse(accountant.running)
    accountant.stop()
    self.assertFalse(accountant.running)
def test_cache_hard_links(self): def test_cache_hard_links(self):
root_dir = tempfile.mkdtemp( root_dir = tempfile.mkdtemp(
dir=os.environ.get("ZUUL_TEST_ROOT", None)) dir=os.environ.get("ZUUL_TEST_ROOT", None))

View File

@ -148,16 +148,25 @@ class DiskAccountant(object):
self.stop_event.wait(delay_time) self.stop_event.wait(delay_time)
def start(self):
    """Start the disk accountant's monitoring thread.

    If the configured limit is negative the per-job disk limit is
    disabled, so no monitoring thread is started at all; this avoids
    the expensive 'du' polling loop entirely.
    """
    if self.limit < 0:
        # No need to start if there is no limit.
        return
    self._running = True
    self.thread.start()
def stop(self):
    """Stop the disk accountant's monitoring thread, if it is running.

    Returns immediately when the accountant was never started (e.g.
    the limit is disabled), since there is then no thread to stop.
    """
    if not self.running:
        return
    self._running = False
    self.stop_event.set()
    # We join here to avoid whitelisting the thread -- if it takes more
    # than 5s to stop in tests, there's a problem.
    self.thread.join(timeout=5)
@property
def running(self):
    # Read-only view of the accountant's state: True between start()
    # and stop(); stays False when start() skipped launching the
    # thread because the limit is disabled.
    return self._running
class Watchdog(object): class Watchdog(object):
def __init__(self, timeout, function, args): def __init__(self, timeout, function, args):