Browse Source

Add cgroup support to ram sensor

When running within k8s the system memory statistics are useless as
soon as there are configured limits (which is strongly advised). In this
case we additionally need to check the cgroups.

Change-Id: Idebe5d7e60dc862e89d012594ab362a19f18708d
tags/3.5.0
Tobias Henkel 1 year ago
parent
commit
145e62b568
No account linked to committer's email address

+ 6
- 0
doc/source/admin/monitoring.rst View File

@@ -188,6 +188,12 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
188 188
      The used RAM (excluding buffers and cache) on this executor, as
189 189
      a percentage multiplied by 100.
190 190
 
191
+  .. stat:: pct_used_ram_cgroup
192
+     :type: gauge
193
+
194
+     The used RAM (excluding buffers and cache) on this executor relative to
195
+     the cgroup memory limit, as a percentage multiplied by 100.
196
+
191 197
 .. stat:: zuul.nodepool.requests
192 198
 
193 199
    Holds metrics related to Zuul requests and responses from Nodepool.

+ 6
- 0
releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml View File

@@ -0,0 +1,6 @@
1
+---
2
+features:
3
+  - |
4
+    The :attr:`executor.min_avail_mem` setting now takes cgroup limits
5
+    into account. There is also a new metric
6
+    ``zuul.executor.<executor>.pct_used_ram_cgroup`` available.

+ 34
- 0
tests/fixtures/cgroup/memory.stat.bad View File

@@ -0,0 +1,34 @@
1
+cache 0
2
+rss 561152
3
+rss_huge 0
4
+mapped_file 0
5
+dirty 0
6
+writeback 0
7
+swap 0
8
+pgpgin 654
9
+pgpgout 517
10
+pgfault 1089
11
+pgmajfault 0
12
+inactive_anon 0
13
+active_anon 454656
14
+inactive_file 0
15
+active_file 0
16
+unevictable 0
17
+hierarchical_memory_limit 5368709120
18
+hierarchical_memsw_limit 5368709120
19
+total_cache 0
20
+total_rss 5153960755
21
+total_rss_huge 0
22
+total_mapped_file 0
23
+total_dirty 0
24
+total_writeback 0
25
+total_swap 0
26
+total_pgpgin 654
27
+total_pgpgout 517
28
+total_pgfault 1089
29
+total_pgmajfault 0
30
+total_inactive_anon 0
31
+total_active_anon 454656
32
+total_inactive_file 0
33
+total_active_file 0
34
+total_unevictable 0

+ 34
- 0
tests/fixtures/cgroup/memory.stat.nolimit View File

@@ -0,0 +1,34 @@
1
+cache 0
2
+rss 561152
3
+rss_huge 0
4
+mapped_file 0
5
+dirty 0
6
+writeback 0
7
+swap 0
8
+pgpgin 654
9
+pgpgout 517
10
+pgfault 1089
11
+pgmajfault 0
12
+inactive_anon 0
13
+active_anon 454656
14
+inactive_file 0
15
+active_file 0
16
+unevictable 0
17
+hierarchical_memory_limit 9223372036854771712
18
+hierarchical_memsw_limit 9223372036854771712
19
+total_cache 0
20
+total_rss 561152
21
+total_rss_huge 0
22
+total_mapped_file 0
23
+total_dirty 0
24
+total_writeback 0
25
+total_swap 0
26
+total_pgpgin 654
27
+total_pgpgout 517
28
+total_pgfault 1089
29
+total_pgmajfault 0
30
+total_inactive_anon 0
31
+total_active_anon 454656
32
+total_inactive_file 0
33
+total_active_file 0
34
+total_unevictable 0

+ 34
- 0
tests/fixtures/cgroup/memory.stat.ok View File

@@ -0,0 +1,34 @@
1
+cache 0
2
+rss 561152
3
+rss_huge 0
4
+mapped_file 0
5
+dirty 0
6
+writeback 0
7
+swap 0
8
+pgpgin 654
9
+pgpgout 517
10
+pgfault 1089
11
+pgmajfault 0
12
+inactive_anon 0
13
+active_anon 454656
14
+inactive_file 0
15
+active_file 0
16
+unevictable 0
17
+hierarchical_memory_limit 5368709120
18
+hierarchical_memsw_limit 5368709120
19
+total_cache 0
20
+total_rss 1073741824
21
+total_rss_huge 0
22
+total_mapped_file 0
23
+total_dirty 0
24
+total_writeback 0
25
+total_swap 0
26
+total_pgpgin 654
27
+total_pgpgout 517
28
+total_pgfault 1089
29
+total_pgmajfault 0
30
+total_inactive_anon 0
31
+total_active_anon 454656
32
+total_inactive_file 0
33
+total_active_file 0
34
+total_unevictable 0

+ 49
- 1
tests/unit/test_executor.py View File

@@ -31,6 +31,7 @@ from tests.base import (
31 31
 )
32 32
 
33 33
 from zuul.executor.sensors.startingbuilds import StartingBuildsSensor
34
+from zuul.executor.sensors.ram import RAMSensor
34 35
 
35 36
 
36 37
 class TestExecutorRepos(ZuulTestCase):
@@ -466,15 +467,62 @@ class TestGovernor(ZuulTestCase):
466 467
             pass
467 468
         ram = Dummy()
468 469
         ram.percent = 20.0  # 20% used
470
+        ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
469 471
         vm_mock.return_value = ram
470 472
         loadavg_mock.return_value = (0.0, 0.0, 0.0)
471 473
         self.executor_server.manageLoad()
472 474
         self.assertTrue(self.executor_server.accepting_work)
473
-        ram.percent = 99.0  # 99% used
474 475
         loadavg_mock.return_value = (100.0, 100.0, 100.0)
475 476
         self.executor_server.manageLoad()
476 477
         self.assertFalse(self.executor_server.accepting_work)
477 478
 
479
+    @mock.patch('os.getloadavg')
480
+    @mock.patch('psutil.virtual_memory')
481
+    def test_ram_governor(self, vm_mock, loadavg_mock):
482
+        class Dummy(object):
483
+            pass
484
+        ram = Dummy()
485
+        ram.percent = 20.0  # 20% used
486
+        ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
487
+        vm_mock.return_value = ram
488
+        loadavg_mock.return_value = (0.0, 0.0, 0.0)
489
+        self.executor_server.manageLoad()
490
+        self.assertTrue(self.executor_server.accepting_work)
491
+        ram.percent = 99.0  # 99% used
492
+        self.executor_server.manageLoad()
493
+        self.assertFalse(self.executor_server.accepting_work)
494
+
495
+    @mock.patch('os.getloadavg')
496
+    @mock.patch('psutil.virtual_memory')
497
+    def test_ram_cgroup_governor(self, vm_mock, loadavg_mock):
498
+        class Dummy(object):
499
+            pass
500
+        ram = Dummy()
501
+        ram.percent = 20.0  # 20% used
502
+        ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
503
+        vm_mock.return_value = ram
504
+        loadavg_mock.return_value = (0.0, 0.0, 0.0)
505
+
506
+        # Set no cgroup limit
507
+        ram_sensor = [x for x in self.executor_server.sensors
508
+                      if isinstance(x, RAMSensor)][0]
509
+        ram_sensor.cgroup_stats_file = os.path.join(
510
+            FIXTURE_DIR, 'cgroup', 'memory.stat.nolimit')
511
+        self.executor_server.manageLoad()
512
+        self.assertTrue(self.executor_server.accepting_work)
513
+
514
+        # Set cgroup limit 5GiB and ram usage 20%
515
+        ram_sensor.cgroup_stats_file = os.path.join(
516
+            FIXTURE_DIR, 'cgroup', 'memory.stat.ok')
517
+        self.executor_server.manageLoad()
518
+        self.assertTrue(self.executor_server.accepting_work)
519
+
520
+        # Set cgroup limit 5GiB and ram usage 96%
521
+        ram_sensor.cgroup_stats_file = os.path.join(
522
+            FIXTURE_DIR, 'cgroup', 'memory.stat.bad')
523
+        self.executor_server.manageLoad()
524
+        self.assertFalse(self.executor_server.accepting_work)
525
+
478 526
     @mock.patch('os.statvfs')
479 527
     def test_hdd_governor(self, statvfs_mock):
480 528
         class Dummy(object):

+ 53
- 1
zuul/executor/sensors/ram.py View File

@@ -13,11 +13,14 @@
13 13
 # under the License.
14 14
 
15 15
 import logging
16
+import math
16 17
 import psutil
17 18
 
18 19
 from zuul.executor.sensors import SensorInterface
19 20
 from zuul.lib.config import get_default
20 21
 
22
+CGROUP_STATS_FILE = '/sys/fs/cgroup/memory/memory.stat'
23
+
21 24
 
22 25
 def get_avail_mem_pct():
23 26
     avail_mem_pct = 100.0 - psutil.virtual_memory().percent
@@ -30,6 +33,38 @@ class RAMSensor(SensorInterface):
30 33
     def __init__(self, config=None):
31 34
         self.min_avail_mem = float(get_default(config, 'executor',
32 35
                                                'min_avail_mem', '5.0'))
36
+        self.cgroup_stats_file = CGROUP_STATS_FILE
37
+
38
+    def _read_cgroup_stat(self):
39
+        stat = {}
40
+        try:
41
+            with open(self.cgroup_stats_file) as f:
42
+                for line in f.readlines():
43
+                    key, value = line.split(' ')
44
+                    stat[key] = int(value.strip())
45
+        except Exception:
46
+            pass
47
+        return stat
48
+
49
+    def _get_cgroup_limit(self):
50
+        stat = self._read_cgroup_stat()
51
+        limit = stat.get('hierarchical_memory_limit', math.inf)
52
+        mem_total = psutil.virtual_memory().total
53
+        if limit < mem_total:
54
+            return limit
55
+        else:
56
+            return math.inf
57
+
58
+    def _get_avail_mem_pct_cgroup(self):
59
+        stat = self._read_cgroup_stat()
60
+        limit = stat.get('hierarchical_memory_limit', math.inf)
61
+        usage = stat.get('total_rss', math.inf)
62
+
63
+        if math.isinf(limit) or math.isinf(usage):
64
+            # pretend we have all memory available if we got infs
65
+            return 100
66
+
67
+        return 100.0 - usage / limit * 100
33 68
 
34 69
     def isOk(self):
35 70
         avail_mem_pct = get_avail_mem_pct()
@@ -38,10 +73,27 @@ class RAMSensor(SensorInterface):
38 73
             return False, "low memory {:3.1f}% < {}".format(
39 74
                 avail_mem_pct, self.min_avail_mem)
40 75
 
41
-        return True, "{:3.1f}% <= {}".format(avail_mem_pct, self.min_avail_mem)
76
+        if math.isinf(self._get_cgroup_limit()):
77
+            # we have no cgroup defined limit so we're done now
78
+            return True, "{:3.1f}% <= {}".format(
79
+                avail_mem_pct, self.min_avail_mem)
80
+
81
+        avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
82
+        if avail_mem_pct_cgroup < self.min_avail_mem:
83
+            return False, "low memory cgroup {:3.1f}% < {}".format(
84
+                avail_mem_pct_cgroup, self.min_avail_mem)
85
+
86
+        return True, "{:3.1f}% <= {}, {:3.1f}% <= {}".format(
87
+            avail_mem_pct, self.min_avail_mem,
88
+            avail_mem_pct_cgroup, self.min_avail_mem)
42 89
 
43 90
     def reportStats(self, statsd, base_key):
44 91
         avail_mem_pct = get_avail_mem_pct()
45 92
 
46 93
         statsd.gauge(base_key + '.pct_used_ram',
47 94
                      int((100.0 - avail_mem_pct) * 100))
95
+
96
+        if math.isfinite(self._get_cgroup_limit()):
97
+            avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
98
+            statsd.gauge(base_key + '.pct_used_ram_cgroup',
99
+                         int((100.0 - avail_mem_pct_cgroup) * 100))

Loading…
Cancel
Save