Add memory awareness to system load governor

This makes the executor stop accepting additional jobs whenever
available system memory drops below 5% (the default). Available
memory is derived from psutil.virtual_memory().percent, so buffers
and cache which could be reclaimed are counted as available. Users
can tune this up or down as necessary.

This is a very conservative default and will likely need tuning
once observed in production.

Change-Id: Iab6469c0173d9f5635769d4ab0e8034a41355cd4
Signed-off-by: Paul Belanger <pabelanger@redhat.com>
This commit is contained in:
Paul Belanger 2018-01-08 16:24:57 -05:00 committed by Tobias Henkel
parent 22c5b7155b
commit 1754b2caf0
3 changed files with 25 additions and 3 deletions

View File

@ -575,6 +575,16 @@ The following sections of ``zuul.conf`` are used by the executor:
The executor will observe system load and determine whether The executor will observe system load and determine whether
to accept more jobs every 30 seconds. to accept more jobs every 30 seconds.
.. attr:: min_avail_mem
:default: 5.0
This is the minimum percentage of system RAM that must remain
available. When available memory falls below this value, the
executor will stop accepting more than 1 job at a time until
more memory is available. The available memory percentage is
calculated as the total available memory divided by the
total real memory, multiplied by 100. Buffers and cache are
considered available in the calculation.
.. attr:: hostname .. attr:: hostname
:default: hostname of the server :default: hostname of the server

View File

@ -27,3 +27,4 @@ pyjwt
iso8601 iso8601
aiohttp aiohttp
uvloop;python_version>='3.5' uvloop;python_version>='3.5'
psutil

View File

@ -18,6 +18,7 @@ import json
import logging import logging
import multiprocessing import multiprocessing
import os import os
import psutil
import shutil import shutil
import signal import signal
import shlex import shlex
@ -1949,6 +1950,7 @@ class ExecutorServer(object):
''' Apply some heuristics to decide whether or not we should ''' Apply some heuristics to decide whether or not we should
be askign for more jobs ''' be askign for more jobs '''
load_avg = os.getloadavg()[0] load_avg = os.getloadavg()[0]
avail_mem_pct = 100.0 - psutil.virtual_memory().percent
if self.accepting_work: if self.accepting_work:
# Don't unregister if we don't have any active jobs. # Don't unregister if we don't have any active jobs.
if load_avg > self.max_load_avg and self.job_workers: if load_avg > self.max_load_avg and self.job_workers:
@ -1956,10 +1958,19 @@ class ExecutorServer(object):
"Unregistering due to high system load {} > {}".format( "Unregistering due to high system load {} > {}".format(
load_avg, self.max_load_avg)) load_avg, self.max_load_avg))
self.unregister_work() self.unregister_work()
elif load_avg <= self.max_load_avg: elif avail_mem_pct < self.min_avail_mem:
self.log.info(
"Unregistering due to low memory {:3.1f}% < {}".format(
avail_mem_pct, self.min_avail_mem))
self.unregister_work()
elif (load_avg <= self.max_load_avg and
avail_mem_pct >= self.min_avail_mem):
self.log.info( self.log.info(
"Re-registering as load is within limits {} <= {}".format( "Re-registering as job is within limits "
load_avg, self.max_load_avg)) "{} <= {} {:3.1f}% <= {}".format(load_avg,
self.max_load_avg,
avail_mem_pct,
self.min_avail_mem))
self.register_work() self.register_work()
if self.statsd: if self.statsd:
base_key = 'zuul.executor.%s' % self.hostname base_key = 'zuul.executor.%s' % self.hostname