Browse Source

Optionally disable disk_limit_per_job

Our current approach to enforcing the per-job disk limit can be very
expensive because it runs 'du' in a loop. With many repos in the
cache and many running jobs, this can poison the cache and induce a
large amount of I/O load. This can hurt overall performance,
especially if Zuul is running on shared storage such as Ceph.

Change-Id: Ic03168e30e0cba4a4adb42eebf4709ceba0d8c3e
changes/96/638596/1
Tobias Henkel 2 years ago
parent
commit
0a394ad67e
No known key found for this signature in database GPG Key ID: 3750DEC158E5FA2
3 changed files with 22 additions and 1 deletions
  1. +1
    -1
      doc/source/admin/components.rst
  2. +12
    -0
      tests/unit/test_disk_accountant.py
  3. +9
    -0
      zuul/executor/server.py

+ 1
- 1
doc/source/admin/components.rst View File

@ -597,7 +597,7 @@ The following sections of ``zuul.conf`` are used by the executor:
This integer is the maximum number of megabytes that any one job
is allowed to consume on disk while it is running. If a job's
scratch space has more than this much space consumed, it will be
aborted.
aborted. Set to -1 to disable the limit.
.. attr:: trusted_ro_paths


+ 12
- 0
tests/unit/test_disk_accountant.py View File

@ -65,6 +65,18 @@ class TestDiskAccountant(BaseTestCase):
da.stop()
self.assertFalse(da.thread.is_alive())
def test_disk_accountant_no_limit(self):
    """Check that a DiskAccountant built with a limit of -1 never runs.

    The accountant must report ``running`` as false both after
    ``start()`` and after ``stop()``.
    """
    test_root = os.environ.get("ZUUL_TEST_ROOT", None)
    jobs_dir = tempfile.mkdtemp(dir=test_root)
    cache_dir = tempfile.mkdtemp()
    executor = FakeExecutor()

    # -1 is the "no limit" value under test.
    accountant = DiskAccountant(
        jobs_dir, -1, executor.stopJobByJobDir, cache_dir)

    accountant.start()
    self.assertFalse(accountant.running)

    accountant.stop()
    self.assertFalse(accountant.running)
def test_cache_hard_links(self):
root_dir = tempfile.mkdtemp(
dir=os.environ.get("ZUUL_TEST_ROOT", None))


+ 9
- 0
zuul/executor/server.py View File

@ -148,16 +148,25 @@ class DiskAccountant(object):
self.stop_event.wait(delay_time)
def start(self):
    """Mark the accountant as running and start its thread.

    Does nothing when ``self.limit`` is negative: the running flag
    stays untouched and the thread is not started.
    """
    limit_disabled = self.limit < 0
    if not limit_disabled:
        # No need to start if there is no limit, so the thread is only
        # launched on this branch.
        self._running = True
        self.thread.start()
def stop(self):
    """Clear the running flag, set the stop event and join the thread.

    Does nothing when ``self.running`` is false. The join waits at
    most 5 seconds.
    """
    if self.running:
        self._running = False
        self.stop_event.set()
        # We join here to avoid whitelisting the thread -- if it takes
        # more than 5s to stop in tests, there's a problem.
        self.thread.join(timeout=5)
@property
def running(self):
    """Return the internal ``_running`` flag (read-only view)."""
    return self._running
class Watchdog(object):
def __init__(self, timeout, function, args):


Loading…
Cancel
Save