Browse Source

Merge "Add cgroup support to ram sensor"

tags/3.5.0
Zuul 5 months ago
parent
commit
97da909bd8

+ 6
- 0
doc/source/admin/monitoring.rst View File

@@ -210,6 +210,12 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
210 210
       The used RAM (excluding buffers and cache) on this executor, as
211 211
       a percentage multiplied by 100.
212 212
 
213
+  .. stat:: pct_used_ram_cgroup
214
+     :type: gauge
215
+
216
+     The used RAM (excluding buffers and cache) on this executor, relative to the memory limit set by
217
+     the cgroup, as a percentage multiplied by 100.
218
+
213 219
 .. stat:: zuul.nodepool.requests
214 220
 
215 221
    Holds metrics related to Zuul requests and responses from Nodepool.

+ 6
- 0
releasenotes/notes/cgroups-governor-430a565cae0ef104.yaml View File

@@ -0,0 +1,6 @@
1
+---
2
+features:
3
+  - |
4
+    The :attr:`executor.min_avail_mem` setting now takes cgroup limits
5
+    into account. There is also a new metric
6
+    ``zuul.executor.<executor>.pct_used_ram_cgroup`` available.

+ 34
- 0
tests/fixtures/cgroup/memory.stat.bad View File

@@ -0,0 +1,34 @@
1
+cache 0
2
+rss 561152
3
+rss_huge 0
4
+mapped_file 0
5
+dirty 0
6
+writeback 0
7
+swap 0
8
+pgpgin 654
9
+pgpgout 517
10
+pgfault 1089
11
+pgmajfault 0
12
+inactive_anon 0
13
+active_anon 454656
14
+inactive_file 0
15
+active_file 0
16
+unevictable 0
17
+hierarchical_memory_limit 5368709120
18
+hierarchical_memsw_limit 5368709120
19
+total_cache 0
20
+total_rss 5153960755
21
+total_rss_huge 0
22
+total_mapped_file 0
23
+total_dirty 0
24
+total_writeback 0
25
+total_swap 0
26
+total_pgpgin 654
27
+total_pgpgout 517
28
+total_pgfault 1089
29
+total_pgmajfault 0
30
+total_inactive_anon 0
31
+total_active_anon 454656
32
+total_inactive_file 0
33
+total_active_file 0
34
+total_unevictable 0

+ 34
- 0
tests/fixtures/cgroup/memory.stat.nolimit View File

@@ -0,0 +1,34 @@
1
+cache 0
2
+rss 561152
3
+rss_huge 0
4
+mapped_file 0
5
+dirty 0
6
+writeback 0
7
+swap 0
8
+pgpgin 654
9
+pgpgout 517
10
+pgfault 1089
11
+pgmajfault 0
12
+inactive_anon 0
13
+active_anon 454656
14
+inactive_file 0
15
+active_file 0
16
+unevictable 0
17
+hierarchical_memory_limit 9223372036854771712
18
+hierarchical_memsw_limit 9223372036854771712
19
+total_cache 0
20
+total_rss 561152
21
+total_rss_huge 0
22
+total_mapped_file 0
23
+total_dirty 0
24
+total_writeback 0
25
+total_swap 0
26
+total_pgpgin 654
27
+total_pgpgout 517
28
+total_pgfault 1089
29
+total_pgmajfault 0
30
+total_inactive_anon 0
31
+total_active_anon 454656
32
+total_inactive_file 0
33
+total_active_file 0
34
+total_unevictable 0

+ 34
- 0
tests/fixtures/cgroup/memory.stat.ok View File

@@ -0,0 +1,34 @@
1
+cache 0
2
+rss 561152
3
+rss_huge 0
4
+mapped_file 0
5
+dirty 0
6
+writeback 0
7
+swap 0
8
+pgpgin 654
9
+pgpgout 517
10
+pgfault 1089
11
+pgmajfault 0
12
+inactive_anon 0
13
+active_anon 454656
14
+inactive_file 0
15
+active_file 0
16
+unevictable 0
17
+hierarchical_memory_limit 5368709120
18
+hierarchical_memsw_limit 5368709120
19
+total_cache 0
20
+total_rss 1073741824
21
+total_rss_huge 0
22
+total_mapped_file 0
23
+total_dirty 0
24
+total_writeback 0
25
+total_swap 0
26
+total_pgpgin 654
27
+total_pgpgout 517
28
+total_pgfault 1089
29
+total_pgmajfault 0
30
+total_inactive_anon 0
31
+total_active_anon 454656
32
+total_inactive_file 0
33
+total_active_file 0
34
+total_unevictable 0

+ 49
- 1
tests/unit/test_executor.py View File

@@ -31,6 +31,7 @@ from tests.base import (
31 31
 )
32 32
 
33 33
 from zuul.executor.sensors.startingbuilds import StartingBuildsSensor
34
+from zuul.executor.sensors.ram import RAMSensor
34 35
 
35 36
 
36 37
 class TestExecutorRepos(ZuulTestCase):
@@ -466,15 +467,62 @@ class TestGovernor(ZuulTestCase):
466 467
             pass
467 468
         ram = Dummy()
468 469
         ram.percent = 20.0  # 20% used
470
+        ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
469 471
         vm_mock.return_value = ram
470 472
         loadavg_mock.return_value = (0.0, 0.0, 0.0)
471 473
         self.executor_server.manageLoad()
472 474
         self.assertTrue(self.executor_server.accepting_work)
473
-        ram.percent = 99.0  # 99% used
474 475
         loadavg_mock.return_value = (100.0, 100.0, 100.0)
475 476
         self.executor_server.manageLoad()
476 477
         self.assertFalse(self.executor_server.accepting_work)
477 478
 
479
+    @mock.patch('os.getloadavg')
480
+    @mock.patch('psutil.virtual_memory')
481
+    def test_ram_governor(self, vm_mock, loadavg_mock):
482
+        class Dummy(object):
483
+            pass
484
+        ram = Dummy()
485
+        ram.percent = 20.0  # 20% used
486
+        ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
487
+        vm_mock.return_value = ram
488
+        loadavg_mock.return_value = (0.0, 0.0, 0.0)
489
+        self.executor_server.manageLoad()
490
+        self.assertTrue(self.executor_server.accepting_work)
491
+        ram.percent = 99.0  # 99% used
492
+        self.executor_server.manageLoad()
493
+        self.assertFalse(self.executor_server.accepting_work)
494
+
495
+    @mock.patch('os.getloadavg')
496
+    @mock.patch('psutil.virtual_memory')
497
+    def test_ram_cgroup_governor(self, vm_mock, loadavg_mock):
498
+        class Dummy(object):
499
+            pass
500
+        ram = Dummy()
501
+        ram.percent = 20.0  # 20% used
502
+        ram.total = 8 * 1024 * 1024 * 1024  # 8GiB
503
+        vm_mock.return_value = ram
504
+        loadavg_mock.return_value = (0.0, 0.0, 0.0)
505
+
506
+        # Set no cgroup limit
507
+        ram_sensor = [x for x in self.executor_server.sensors
508
+                      if isinstance(x, RAMSensor)][0]
509
+        ram_sensor.cgroup_stats_file = os.path.join(
510
+            FIXTURE_DIR, 'cgroup', 'memory.stat.nolimit')
511
+        self.executor_server.manageLoad()
512
+        self.assertTrue(self.executor_server.accepting_work)
513
+
514
+        # Set cgroup limit 5GiB and ram usage 20%
515
+        ram_sensor.cgroup_stats_file = os.path.join(
516
+            FIXTURE_DIR, 'cgroup', 'memory.stat.ok')
517
+        self.executor_server.manageLoad()
518
+        self.assertTrue(self.executor_server.accepting_work)
519
+
520
+        # Set cgroup limit 5GiB and ram usage 96%
521
+        ram_sensor.cgroup_stats_file = os.path.join(
522
+            FIXTURE_DIR, 'cgroup', 'memory.stat.bad')
523
+        self.executor_server.manageLoad()
524
+        self.assertFalse(self.executor_server.accepting_work)
525
+
478 526
     @mock.patch('os.statvfs')
479 527
     def test_hdd_governor(self, statvfs_mock):
480 528
         class Dummy(object):

+ 53
- 1
zuul/executor/sensors/ram.py View File

@@ -13,11 +13,14 @@
13 13
 # under the License.
14 14
 
15 15
 import logging
16
+import math
16 17
 import psutil
17 18
 
18 19
 from zuul.executor.sensors import SensorInterface
19 20
 from zuul.lib.config import get_default
20 21
 
22
+CGROUP_STATS_FILE = '/sys/fs/cgroup/memory/memory.stat'
23
+
21 24
 
22 25
 def get_avail_mem_pct():
23 26
     avail_mem_pct = 100.0 - psutil.virtual_memory().percent
@@ -30,6 +33,38 @@ class RAMSensor(SensorInterface):
30 33
     def __init__(self, config=None):
31 34
         self.min_avail_mem = float(get_default(config, 'executor',
32 35
                                                'min_avail_mem', '5.0'))
36
+        self.cgroup_stats_file = CGROUP_STATS_FILE
37
+
38
+    def _read_cgroup_stat(self):
39
+        stat = {}
40
+        try:
41
+            with open(self.cgroup_stats_file) as f:
42
+                for line in f.readlines():
43
+                    key, value = line.split(' ')
44
+                    stat[key] = int(value.strip())
45
+        except Exception:
46
+            pass
47
+        return stat
48
+
49
+    def _get_cgroup_limit(self):
50
+        stat = self._read_cgroup_stat()
51
+        limit = stat.get('hierarchical_memory_limit', math.inf)
52
+        mem_total = psutil.virtual_memory().total
53
+        if limit < mem_total:
54
+            return limit
55
+        else:
56
+            return math.inf
57
+
58
+    def _get_avail_mem_pct_cgroup(self):
59
+        stat = self._read_cgroup_stat()
60
+        limit = stat.get('hierarchical_memory_limit', math.inf)
61
+        usage = stat.get('total_rss', math.inf)
62
+
63
+        if math.isinf(limit) or math.isinf(usage):
64
+            # pretend we have all memory available if we got infs
65
+            return 100
66
+
67
+        return 100.0 - usage / limit * 100
33 68
 
34 69
     def isOk(self):
35 70
         avail_mem_pct = get_avail_mem_pct()
@@ -38,10 +73,27 @@ class RAMSensor(SensorInterface):
38 73
             return False, "low memory {:3.1f}% < {}".format(
39 74
                 avail_mem_pct, self.min_avail_mem)
40 75
 
41
-        return True, "{:3.1f}% <= {}".format(avail_mem_pct, self.min_avail_mem)
76
+        if math.isinf(self._get_cgroup_limit()):
77
+            # we have no cgroup defined limit so we're done now
78
+            return True, "{:3.1f}% <= {}".format(
79
+                avail_mem_pct, self.min_avail_mem)
80
+
81
+        avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
82
+        if avail_mem_pct_cgroup < self.min_avail_mem:
83
+            return False, "low memory cgroup {:3.1f}% < {}".format(
84
+                avail_mem_pct_cgroup, self.min_avail_mem)
85
+
86
+        return True, "{:3.1f}% <= {}, {:3.1f}% <= {}".format(
87
+            avail_mem_pct, self.min_avail_mem,
88
+            avail_mem_pct_cgroup, self.min_avail_mem)
42 89
 
43 90
     def reportStats(self, statsd, base_key):
44 91
         avail_mem_pct = get_avail_mem_pct()
45 92
 
46 93
         statsd.gauge(base_key + '.pct_used_ram',
47 94
                      int((100.0 - avail_mem_pct) * 100))
95
+
96
+        if math.isfinite(self._get_cgroup_limit()):
97
+            avail_mem_pct_cgroup = self._get_avail_mem_pct_cgroup()
98
+            statsd.gauge(base_key + '.pct_used_ram_cgroup',
99
+                         int((100.0 - avail_mem_pct_cgroup) * 100))

Loading…
Cancel
Save