diff --git a/doc/source/admin/components.rst b/doc/source/admin/components.rst index d6b0984d24..2e18b5131c 100644 --- a/doc/source/admin/components.rst +++ b/doc/source/admin/components.rst @@ -575,6 +575,16 @@ The following sections of ``zuul.conf`` are used by the executor: The executor will observe system load and determine whether to accept more jobs every 30 seconds. + .. attr:: min_avail_mem + :default: 5.0 + + This is the minimum percentage of system RAM that must remain + available. Below this threshold the executor will stop accepting + more than 1 job at a time until more memory is available. The + available memory percentage is the total available memory divided + by the total real memory, multiplied by 100. Buffers and cache + are considered available in the calculation. + .. attr:: hostname :default: hostname of the server diff --git a/requirements.txt b/requirements.txt index 39a2b0268d..3ab5850aec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,4 @@ pyjwt iso8601 aiohttp uvloop;python_version>='3.5' +psutil diff --git a/zuul/executor/server.py b/zuul/executor/server.py index 03a3a12cb3..1c72275b55 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -18,6 +18,7 @@ import json import logging import multiprocessing import os +import psutil import shutil import signal import shlex @@ -1949,6 +1950,7 @@ class ExecutorServer(object): ''' Apply some heuristics to decide whether or not we should be askign for more jobs ''' load_avg = os.getloadavg()[0] + avail_mem_pct = 100.0 - psutil.virtual_memory().percent if self.accepting_work: # Don't unregister if we don't have any active jobs. 
if load_avg > self.max_load_avg and self.job_workers: @@ -1956,10 +1958,19 @@ class ExecutorServer(object): "Unregistering due to high system load {} > {}".format( load_avg, self.max_load_avg)) self.unregister_work() - elif load_avg <= self.max_load_avg: + elif avail_mem_pct < self.min_avail_mem: + self.log.info( + "Unregistering due to low memory {:3.1f}% < {}".format( + avail_mem_pct, self.min_avail_mem)) + self.unregister_work() + elif (load_avg <= self.max_load_avg and + avail_mem_pct >= self.min_avail_mem): self.log.info( - "Re-registering as load is within limits {} <= {}".format( - load_avg, self.max_load_avg)) + "Re-registering as job is within limits " + "{} <= {} {:3.1f}% <= {}".format(load_avg, + self.max_load_avg, + avail_mem_pct, + self.min_avail_mem)) self.register_work() if self.statsd: base_key = 'zuul.executor.%s' % self.hostname