Browse Source

Add cgroup support to ram sensor

When running within k8s the system memory statistics are useless as
soon as there are configured limits (which is strongly advised). In
this case we additionally need to check the cgroups.

Change-Id: Idebe5d7e60dc862e89d012594ab362a19f18708d
changes/06/549506/13
Tobias Henkel 3 years ago
parent
commit
145e62b568
No known key found for this signature in database GPG Key ID: 3750DEC158E5FA2
7 changed files with 216 additions and 2 deletions
  1. +6
    -0
      doc/source/admin/monitoring.rst
  2. +6
    -0
      releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml
  3. +34
    -0
      tests/fixtures/cgroup/memory.stat.bad
  4. +34
    -0
      tests/fixtures/cgroup/memory.stat.nolimit
  5. +34
    -0
      tests/fixtures/cgroup/memory.stat.ok
  6. +49
    -1
      tests/unit/test_executor.py
  7. +53
    -1
      zuul/executor/sensors/ram.py

+ 6
- 0
doc/source/admin/monitoring.rst View File

@ -188,6 +188,12 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
The used RAM (excluding buffers and cache) on this executor, as
a percentage multiplied by 100.
.. stat:: pct_used_ram_cgroup
:type: gauge
The used RAM (excluding buffers and cache) on this executor relative to
the memory limit imposed by the cgroup, as a percentage multiplied by 100.
.. stat:: zuul.nodepool.requests
Holds metrics related to Zuul requests and responses from Nodepool.


+ 6
- 0
releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml View File

@ -0,0 +1,6 @@
---
features:
- |
The :attr:`executor.min_avail_mem` setting now takes cgroup limits
into account. There is also a new metric
`zuul.executor.<executor>.pct_used_ram_cgroup` available.

+ 34
- 0
tests/fixtures/cgroup/memory.stat.bad View File

@ -0,0 +1,34 @@
cache 0
rss 561152
rss_huge 0
mapped_file 0
dirty 0
writeback 0
swap 0
pgpgin 654
pgpgout 517
pgfault 1089
pgmajfault 0
inactive_anon 0
active_anon 454656
inactive_file 0
active_file 0
unevictable 0
hierarchical_memory_limit 5368709120
hierarchical_memsw_limit 5368709120
total_cache 0
total_rss 5153960755
total_rss_huge 0
total_mapped_file 0
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 654
total_pgpgout 517
total_pgfault 1089
total_pgmajfault 0
total_inactive_anon 0
total_active_anon 454656
total_inactive_file 0
total_active_file 0
total_unevictable 0

+ 34
- 0
tests/fixtures/cgroup/memory.stat.nolimit View File

@ -0,0 +1,34 @@
cache 0
rss 561152
rss_huge 0
mapped_file 0
dirty 0
writeback 0
swap 0
pgpgin 654
pgpgout 517
pgfault 1089
pgmajfault 0
inactive_anon 0
active_anon 454656
inactive_file 0
active_file 0
unevictable 0
hierarchical_memory_limit 9223372036854771712
hierarchical_memsw_limit 9223372036854771712
total_cache 0
total_rss 561152
total_rss_huge 0
total_mapped_file 0
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 654
total_pgpgout 517
total_pgfault 1089
total_pgmajfault 0
total_inactive_anon 0
total_active_anon 454656
total_inactive_file 0
total_active_file 0
total_unevictable 0

+ 34
- 0
tests/fixtures/cgroup/memory.stat.ok View File

@ -0,0 +1,34 @@
cache 0
rss 561152
rss_huge 0
mapped_file 0
dirty 0
writeback 0
swap 0
pgpgin 654
pgpgout 517
pgfault 1089
pgmajfault 0
inactive_anon 0
active_anon 454656
inactive_file 0
active_file 0
unevictable 0
hierarchical_memory_limit 5368709120
hierarchical_memsw_limit 5368709120
total_cache 0
total_rss 1073741824
total_rss_huge 0
total_mapped_file 0
total_dirty 0
total_writeback 0
total_swap 0
total_pgpgin 654
total_pgpgout 517
total_pgfault 1089
total_pgmajfault 0
total_inactive_anon 0
total_active_anon 454656
total_inactive_file 0
total_active_file 0
total_unevictable 0

+ 49
- 1
tests/unit/test_executor.py View File

@ -31,6 +31,7 @@ from tests.base import (
)
from zuul.executor.sensors.startingbuilds import StartingBuildsSensor
from zuul.executor.sensors.ram import RAMSensor
class TestExecutorRepos(ZuulTestCase):
@ -466,15 +467,62 @@ class TestGovernor(ZuulTestCase):
pass
ram = Dummy()
ram.percent = 20.0 # 20% used
ram.total = 8 * 1024 * 1024 * 1024 # 8GiB
vm_mock.return_value = ram
loadavg_mock.return_value = (0.0, 0.0, 0.0)
self.executor_server.manageLoad()
self.assertTrue(self.executor_server.accepting_work)
ram.percent = 99.0 # 99% used
loadavg_mock.return_value = (100.0, 100.0, 100.0)
self.executor_server.manageLoad()
self.assertFalse(self.executor_server.accepting_work)
@mock.patch('os.getloadavg')
@mock.patch('psutil.virtual_memory')
def test_ram_governor(self, vm_mock, loadavg_mock):
# Verify that the executor stops accepting work on high system RAM usage
# alone: the patched load average stays at zero for the whole test, so
# only the faked psutil.virtual_memory() result changes between checks.
class Dummy(object):
pass
# Stand-in for the object returned by psutil.virtual_memory(); only the
# attributes set below are provided.
ram = Dummy()
ram.percent = 20.0 # 20% used
ram.total = 8 * 1024 * 1024 * 1024 # 8GiB
vm_mock.return_value = ram
loadavg_mock.return_value = (0.0, 0.0, 0.0)
# manageLoad() is expected to refresh accepting_work from the sensors
# (its implementation is not shown here).
self.executor_server.manageLoad()
self.assertTrue(self.executor_server.accepting_work)
# Only 1% of RAM is left now; presumably below the configured
# min_avail_mem threshold of the test executor -- confirm against config.
ram.percent = 99.0 # 99% used
self.executor_server.manageLoad()
self.assertFalse(self.executor_server.accepting_work)
@mock.patch('os.getloadavg')
@mock.patch('psutil.virtual_memory')
def test_ram_cgroup_governor(self, vm_mock, loadavg_mock):
# Verify that the RAM sensor honours cgroup memory limits. System RAM
# usage (20% of 8GiB) and load average (zero) stay constant; only the
# cgroup statistics fixture read by the RAMSensor is swapped between checks.
class Dummy(object):
pass
# Stand-in for the object returned by psutil.virtual_memory().
ram = Dummy()
ram.percent = 20.0 # 20% used
ram.total = 8 * 1024 * 1024 * 1024 # 8GiB
vm_mock.return_value = ram
loadavg_mock.return_value = (0.0, 0.0, 0.0)
# Set no cgroup limit
# (the fixture's hierarchical_memory_limit is far above the faked 8GiB
# total, which the sensor treats as "no limit")
ram_sensor = [x for x in self.executor_server.sensors
if isinstance(x, RAMSensor)][0]
ram_sensor.cgroup_stats_file = os.path.join(
FIXTURE_DIR, 'cgroup', 'memory.stat.nolimit')
self.executor_server.manageLoad()
self.assertTrue(self.executor_server.accepting_work)
# Set cgroup limit 5GiB and ram usage 20%
# (fixture: total_rss 1073741824 of hierarchical_memory_limit 5368709120)
ram_sensor.cgroup_stats_file = os.path.join(
FIXTURE_DIR, 'cgroup', 'memory.stat.ok')
self.executor_server.manageLoad()
self.assertTrue(self.executor_server.accepting_work)
# Set cgroup limit 5GiB and ram usage 96%
# (fixture: total_rss 5153960755 of hierarchical_memory_limit 5368709120;
# the remaining ~4% is presumably below the executor's min_avail_mem --
# confirm against the test configuration)
ram_sensor.cgroup_stats_file = os.path.join(
FIXTURE_DIR, 'cgroup', 'memory.stat.bad')
self.executor_server.manageLoad()
self.assertFalse(self.executor_server.accepting_work)
@mock.patch('os.statvfs')
def test_hdd_governor(self, statvfs_mock):
class Dummy(object):


+ 53
- 1
zuul/executor/sensors/ram.py View File

@ -13,11 +13,14 @@
# under the License.
import logging
import math
import psutil
from zuul.executor.sensors import SensorInterface
from zuul.lib.config import get_default
# Default location of the cgroup memory statistics file. RAMSensor copies this
# into its ``cgroup_stats_file`` attribute, which can be overridden per
# instance (e.g. to point at a fixture file).
CGROUP_STATS_FILE = '/sys/fs/cgroup/memory/memory.stat'
def get_avail_mem_pct():
avail_mem_pct = 100.0 - psutil.virtual_memory().percent
@ -30,6 +33,38 @@ class RAMSensor(SensorInterface):
def __init__(self, config=None):
    """Set up the sensor's threshold and cgroup statistics source.

    :param config: configuration object handed to ``get_default``; the
        ``min_avail_mem`` option of the ``executor`` section is looked up
        with ``'5.0'`` as the fallback value.
    """
    # Minimum percentage of memory that must remain available; stored as
    # a float so it can be compared against computed percentages.
    configured_threshold = get_default(
        config, 'executor', 'min_avail_mem', '5.0')
    self.min_avail_mem = float(configured_threshold)
    # Kept as an instance attribute so it can be redirected to another
    # file after construction.
    self.cgroup_stats_file = CGROUP_STATS_FILE
def _read_cgroup_stat(self):
stat = {}
try:
with open(self.cgroup_stats_file) as f:
for line in f.readlines():
key, value = line.split(' ')
stat[key] = int(value.strip())
except Exception:
pass
return stat
def _get_cgroup_limit(self):
    """Return the effective cgroup memory limit in bytes.

    :returns: the ``hierarchical_memory_limit`` value from the cgroup
        statistics when it is smaller than the total memory reported by
        ``psutil.virtual_memory()``; ``math.inf`` otherwise (including
        when the statistic is missing).
    """
    cgroup_limit = self._read_cgroup_stat().get(
        'hierarchical_memory_limit', math.inf)
    host_total = psutil.virtual_memory().total
    # A limit that is not below the machine's total memory does not
    # constrain anything, so report it as "no limit".
    return cgroup_limit if cgroup_limit < host_total else math.inf
def _get_avail_mem_pct_cgroup(self):
stat = self._read_cgroup_stat()
limit = stat.get('hierarchical_memory_limit', math.inf)
usage = stat.get('total_rss', math.inf)
if math.isinf(limit) or math.isinf(usage):
# pretend we have all memory available if we got infs
return 100
return 100.0 - usage / limit * 100
def isOk(self):
avail_mem_pct = get_avail_mem_pct()
@ -38,10 +73,27 @@ class RAMSensor(SensorInterface):
return False, "low memory {:3.1f}% < {}".format(
avail_mem_pct, self.min_avail_mem)
return True, "{:3.1f}% <= {}".format(avail_mem_pct, self.min_avail_mem)
if math.isinf(self._get_cgroup_limit()):
# we have no cgroup defined limit so we're done now
return True, "{:3.1f}% <= {}".format(
avail_mem_pct, self.min_avail_mem)
avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
if avail_mem_pct_cgroup < self.min_avail_mem:
return False, "low memory cgroup {:3.1f}% < {}".format(
avail_mem_pct_cgroup, self.min_avail_mem)
return True, "{:3.1f}% <= {}, {:3.1f}% <= {}".format(
avail_mem_pct, self.min_avail_mem,
avail_mem_pct_cgroup, self.min_avail_mem)
def reportStats(self, statsd, base_key):
    """Emit RAM usage gauges.

    :param statsd: object providing ``gauge(key, value)``.
    :param base_key: prefix prepended to each metric name.

    Always emits ``<base_key>.pct_used_ram``; additionally emits
    ``<base_key>.pct_used_ram_cgroup`` when a finite cgroup limit is in
    effect. Both values are the used percentage multiplied by 100 and
    truncated to an integer.
    """
    system_avail_pct = get_avail_mem_pct()
    system_used_scaled = int((100.0 - system_avail_pct) * 100)
    statsd.gauge(base_key + '.pct_used_ram', system_used_scaled)

    if not math.isfinite(self._get_cgroup_limit()):
        # No effective cgroup limit, so there is nothing more to report.
        return

    cgroup_avail_pct = self._get_avail_mem_pct_cgroup()
    cgroup_used_scaled = int((100.0 - cgroup_avail_pct) * 100)
    statsd.gauge(base_key + '.pct_used_ram_cgroup', cgroup_used_scaled)

Loading…
Cancel
Save