Browse Source

Merge "Improve resource usage with semaphores"

tags/3.3.1
Zuul 5 months ago
parent
commit
ccbe7b10df

+ 19
- 1
doc/source/user/config.rst View File

@@ -629,7 +629,25 @@ Here is an example of two job definitions:
629 629
       The name of a :ref:`semaphore` which should be acquired and
630 630
       released when the job begins and ends.  If the semaphore is at
631 631
       maximum capacity, then Zuul will wait until it can be acquired
632
-      before starting the job.
632
+      before starting the job. The format is either a string or a
633
+      dictionary. If it's a string it references a semaphore using the
634
+      default value for :attr:`job.semaphore.resources-first`.
635
+
636
+      .. attr:: name
637
+         :required:
638
+
639
+         The name of the referenced semaphore
640
+
641
+      .. attr:: resources-first
642
+         :default: False
643
+
644
+         By default a semaphore is acquired before the resources are
645
+         requested. However in some cases the user wants to run cheap
646
+         jobs as quickly as possible in a consecutive manner. In this
647
+         case :attr:`job.semaphore.resources-first` can be enabled to
648
+         request the resources before locking the semaphore. This can
649
+         lead to some amount of blocked resources while waiting for the
650
+         semaphore so this should be used with caution.
633 651
 
634 652
    .. attr:: tags
635 653
 

+ 14
- 0
releasenotes/notes/semaphore-resources-295dceaf7ddbab0d.yaml View File

@@ -0,0 +1,14 @@
1
+---
2
+features:
3
+  - |
4
+    A job using a semaphore can now configure whether it should acquire the
5
+    semaphore before requesting resources or just before running.
6
+upgrade:
7
+  - |
8
+    The acquiring behavior of jobs with semaphores has been changed. Up to now
9
+    a job requested resources and acquired the semaphore just before it started
10
+    to run. However this could lead to a high amount of resource waste. Instead
11
+    jobs now acquire the semaphore before requesting the resources by default.
12
+    This behavior can be overridden by jobs using
13
+    :attr:`job.semaphore.resources-first` if some waste of resources is
14
+    acceptable.

+ 26
- 0
tests/fixtures/config/semaphore/git/common-config/zuul.yaml View File

@@ -22,6 +22,10 @@
22 22
 - job:
23 23
     name: base
24 24
     parent: null
25
+    nodeset:
26
+      nodes:
27
+        - name: controller
28
+          label: label1
25 29
 
26 30
 - job:
27 31
     name: project-test1
@@ -56,6 +60,20 @@
56 60
         - name: controller
57 61
           label: label1
58 62
 
63
+- job:
64
+    name: semaphore-one-test1-resources-first
65
+    semaphore:
66
+      name: test-semaphore
67
+      resources-first: True
68
+    run: playbooks/semaphore-one-test1.yaml
69
+
70
+- job:
71
+    name: semaphore-one-test2-resources-first
72
+    semaphore:
73
+      name: test-semaphore
74
+      resources-first: True
75
+    run: playbooks/semaphore-one-test1.yaml
76
+
59 77
 - project:
60 78
     name: org/project
61 79
     check:
@@ -77,3 +95,11 @@
77 95
     check:
78 96
       jobs:
79 97
         - semaphore-one-test3
98
+
99
+- project:
100
+    name: org/project3
101
+    check:
102
+      jobs:
103
+        - project-test1
104
+        - semaphore-one-test1-resources-first
105
+        - semaphore-one-test2-resources-first

+ 1
- 0
tests/fixtures/config/semaphore/git/org_project3/README View File

@@ -0,0 +1 @@
1
+test

+ 1
- 0
tests/fixtures/config/semaphore/main.yaml View File

@@ -8,3 +8,4 @@
8 8
           - org/project
9 9
           - org/project1
10 10
           - org/project2
11
+          - org/project3

+ 58
- 0
tests/unit/test_scheduler.py View File

@@ -5849,6 +5849,10 @@ class TestSemaphore(ZuulTestCase):
5849 5849
 
5850 5850
         self.executor_server.hold_jobs_in_build = True
5851 5851
 
5852
+        # Pause nodepool so we can check the ordering of getting the nodes
5853
+        # and acquiring the semaphore.
5854
+        self.fake_nodepool.paused = True
5855
+
5852 5856
         A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
5853 5857
         B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
5854 5858
         self.assertFalse('test-semaphore' in
@@ -5858,6 +5862,13 @@ class TestSemaphore(ZuulTestCase):
5858 5862
         self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
5859 5863
         self.waitUntilSettled()
5860 5864
 
5865
+        # By default we first lock the semaphore and then get the nodes
5866
+        # so at this point the semaphore needs to be acquired.
5867
+        self.assertTrue('test-semaphore' in
5868
+                        tenant.semaphore_handler.semaphores)
5869
+        self.fake_nodepool.paused = False
5870
+        self.waitUntilSettled()
5871
+
5861 5872
         self.assertEqual(len(self.builds), 3)
5862 5873
         self.assertEqual(self.builds[0].name, 'project-test1')
5863 5874
         self.assertEqual(self.builds[1].name, 'semaphore-one-test1')
@@ -5993,6 +6004,53 @@ class TestSemaphore(ZuulTestCase):
5993 6004
         self.assertEqual(A.reported, 1)
5994 6005
         self.assertEqual(B.reported, 1)
5995 6006
 
6007
+    def test_semaphore_resources_first(self):
6008
+        "Test semaphores with max=1 (mutex) and get resources first"
6009
+        tenant = self.sched.abide.tenants.get('tenant-one')
6010
+
6011
+        self.executor_server.hold_jobs_in_build = True
6012
+
6013
+        # Pause nodepool so we can check the ordering of getting the nodes
6014
+        # and acquiring the semaphore.
6015
+        self.fake_nodepool.paused = True
6016
+
6017
+        A = self.fake_gerrit.addFakeChange('org/project3', 'master', 'A')
6018
+        B = self.fake_gerrit.addFakeChange('org/project3', 'master', 'B')
6019
+        self.assertFalse('test-semaphore' in
6020
+                         tenant.semaphore_handler.semaphores)
6021
+
6022
+        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
6023
+        self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
6024
+        self.waitUntilSettled()
6025
+
6026
+        # Here we first get the resources and then lock the semaphore
6027
+        # so at this point the semaphore should not be acquired.
6028
+        self.assertFalse('test-semaphore' in
6029
+                         tenant.semaphore_handler.semaphores)
6030
+        self.fake_nodepool.paused = False
6031
+        self.waitUntilSettled()
6032
+
6033
+        self.assertEqual(len(self.builds), 3)
6034
+        self.assertEqual(self.builds[0].name, 'project-test1')
6035
+        self.assertEqual(self.builds[1].name,
6036
+                         'semaphore-one-test1-resources-first')
6037
+        self.assertEqual(self.builds[2].name, 'project-test1')
6038
+
6039
+        self.executor_server.release('semaphore-one-test1')
6040
+        self.waitUntilSettled()
6041
+
6042
+        self.assertEqual(len(self.builds), 3)
6043
+        self.assertEqual(self.builds[0].name, 'project-test1')
6044
+        self.assertEqual(self.builds[1].name, 'project-test1')
6045
+        self.assertEqual(self.builds[2].name,
6046
+                         'semaphore-one-test2-resources-first')
6047
+        self.assertTrue('test-semaphore' in
6048
+                        tenant.semaphore_handler.semaphores)
6049
+
6050
+        self.executor_server.hold_jobs_in_build = False
6051
+        self.executor_server.release()
6052
+        self.waitUntilSettled()
6053
+
5996 6054
     def test_semaphore_zk_error(self):
5997 6055
         "Test semaphore release with zk error"
5998 6056
         tenant = self.sched.abide.tenants.get('tenant-one')

+ 13
- 2
zuul/configloader.py View File

@@ -517,6 +517,9 @@ class JobParser(object):
517 517
     secret = {vs.Required('name'): str,
518 518
               vs.Required('secret'): str}
519 519
 
520
+    semaphore = {vs.Required('name'): str,
521
+                 'resources-first': bool}
522
+
520 523
     # Attributes of a job that can also be used in Project and ProjectTemplate
521 524
     job_attributes = {'parent': vs.Any(str, None),
522 525
                       'final': bool,
@@ -528,7 +531,7 @@ class JobParser(object):
528 531
                       'success-url': str,
529 532
                       'hold-following-changes': bool,
530 533
                       'voting': bool,
531
-                      'semaphore': str,
534
+                      'semaphore': vs.Any(semaphore, str),
532 535
                       'tags': to_list(str),
533 536
                       'branches': to_list(str),
534 537
                       'files': to_list(str),
@@ -573,7 +576,6 @@ class JobParser(object):
573 576
         'workspace',
574 577
         'voting',
575 578
         'hold-following-changes',
576
-        'semaphore',
577 579
         'attempts',
578 580
         'failure-message',
579 581
         'success-message',
@@ -728,6 +730,15 @@ class JobParser(object):
728 730
                 new_projects[project.canonical_name] = job_project
729 731
             job.required_projects = new_projects
730 732
 
733
+        if 'semaphore' in conf:
734
+            semaphore = conf.get('semaphore')
735
+            if isinstance(semaphore, str):
736
+                job.semaphore = model.JobSemaphore(semaphore)
737
+            else:
738
+                job.semaphore = model.JobSemaphore(
739
+                    semaphore.get('name'),
740
+                    semaphore.get('resources-first', False))
741
+
731 742
         tags = conf.get('tags')
732 743
         if tags:
733 744
             job.tags = set(tags)

+ 1
- 1
zuul/manager/__init__.py View File

@@ -322,7 +322,7 @@ class PipelineManager(object):
322 322
         change.commit_needs_changes = dependencies
323 323
 
324 324
     def provisionNodes(self, item):
325
-        jobs = item.findJobsToRequest()
325
+        jobs = item.findJobsToRequest(item.pipeline.tenant.semaphore_handler)
326 326
         if not jobs:
327 327
             return False
328 328
         build_set = item.current_build_set

+ 54
- 8
zuul/model.py View File

@@ -1112,7 +1112,12 @@ class Job(ConfigObject):
1112 1112
         d['required_projects'] = []
1113 1113
         for project in self.required_projects.values():
1114 1114
             d['required_projects'].append(project.toDict())
1115
-        d['semaphore'] = self.semaphore
1115
+        if self.semaphore:
1116
+            # For now just leave the semaphore name here until we really need
1117
+            # more information in zuul-web about this
1118
+            d['semaphore'] = self.semaphore.name
1119
+        else:
1120
+            d['semaphore'] = None
1116 1121
         d['variables'] = self.variables
1117 1122
         d['final'] = self.final
1118 1123
         d['abstract'] = self.abstract
@@ -1511,6 +1516,21 @@ class JobProject(ConfigObject):
1511 1516
         return d
1512 1517
 
1513 1518
 
1519
+class JobSemaphore(ConfigObject):
1520
+    """ A reference to a semaphore from a job. """
1521
+
1522
+    def __init__(self, semaphore_name, resources_first=False):
1523
+        super().__init__()
1524
+        self.name = semaphore_name
1525
+        self.resources_first = resources_first
1526
+
1527
+    def toDict(self):
1528
+        d = dict()
1529
+        d['name'] = self.name
1530
+        d['resources_first'] = self.resources_first
1531
+        return d
1532
+
1533
+
1514 1534
 class JobList(ConfigObject):
1515 1535
     """ A list of jobs in a project's pipeline. """
1516 1536
 
@@ -2135,13 +2155,13 @@ class QueueItem(object):
2135 2155
                     # The nodes for this job are not ready, skip
2136 2156
                     # it for now.
2137 2157
                     continue
2138
-                if semaphore_handler.acquire(self, job):
2158
+                if semaphore_handler.acquire(self, job, False):
2139 2159
                     # If this job needs a semaphore, either acquire it or
2140 2160
                     # make sure that we have it before running the job.
2141 2161
                     torun.append(job)
2142 2162
         return torun
2143 2163
 
2144
-    def findJobsToRequest(self):
2164
+    def findJobsToRequest(self, semaphore_handler):
2145 2165
         build_set = self.current_build_set
2146 2166
         toreq = []
2147 2167
         if not self.live:
@@ -2177,7 +2197,10 @@ class QueueItem(object):
2177 2197
                     all_parent_jobs_successful = False
2178 2198
                     break
2179 2199
             if all_parent_jobs_successful:
2180
-                toreq.append(job)
2200
+                if semaphore_handler.acquire(self, job, True):
2201
+                    # If this job needs a semaphore, either acquire it or
2202
+                    # make sure that we have it before requesting the nodes.
2203
+                    toreq.append(job)
2181 2204
         return toreq
2182 2205
 
2183 2206
     def setResult(self, build):
@@ -3596,11 +3619,34 @@ class SemaphoreHandler(object):
3596 3619
     def __init__(self):
3597 3620
         self.semaphores = {}
3598 3621
 
3599
-    def acquire(self, item, job):
3622
+    def acquire(self, item, job, request_resources):
3623
+        """
3624
+        Aquires a semaphore for an item job combination. This gets called twice
3625
+        during the lifecycle of a job. The first call is before requesting
3626
+        build resources. The second call is before running the job. In which
3627
+        call we really acquire the semaphore is defined by the job.
3628
+
3629
+        :param item: The item
3630
+        :param job: The job
3631
+        :param request_resources: True if we want to acquire for the request
3632
+                                  resources phase, False if we want to acquire
3633
+                                  for the run phase.
3634
+        """
3600 3635
         if not job.semaphore:
3601 3636
             return True
3602 3637
 
3603
-        semaphore_key = job.semaphore
3638
+        if job.semaphore.resources_first and request_resources:
3639
+            # We're currently in the resource request phase and want to get the
3640
+            # resources before locking. So we don't need to do anything here.
3641
+            return True
3642
+        else:
3643
+            # As a safety net we want to acquire the semaphore at least in the
3644
+            # run phase so don't filter this here as re-acquiring the semaphore
3645
+            # is not a problem here if it has been already acquired before in
3646
+            # the resources phase.
3647
+            pass
3648
+
3649
+        semaphore_key = job.semaphore.name
3604 3650
 
3605 3651
         m = self.semaphores.get(semaphore_key)
3606 3652
         if not m:
@@ -3612,7 +3658,7 @@ class SemaphoreHandler(object):
3612 3658
             return True
3613 3659
 
3614 3660
         # semaphore is there, check max
3615
-        if len(m) < self._max_count(item, job.semaphore):
3661
+        if len(m) < self._max_count(item, job.semaphore.name):
3616 3662
             self._acquire(semaphore_key, item, job.name)
3617 3663
             return True
3618 3664
 
@@ -3622,7 +3668,7 @@ class SemaphoreHandler(object):
3622 3668
         if not job.semaphore:
3623 3669
             return
3624 3670
 
3625
-        semaphore_key = job.semaphore
3671
+        semaphore_key = job.semaphore.name
3626 3672
 
3627 3673
         m = self.semaphores.get(semaphore_key)
3628 3674
         if not m:

Loading…
Cancel
Save