Browse Source

Merge "Fix rare semaphore leak during reconfiguration"

tags/3.6.0
Zuul 2 months ago
parent
commit
b12d0d6164

+ 110
- 0
tests/fixtures/config/semaphore/git/common-config/zuul-remove-job.yaml View File

@@ -0,0 +1,110 @@
1
+- pipeline:
2
+    name: check
3
+    manager: independent
4
+    trigger:
5
+      gerrit:
6
+        - event: patchset-created
7
+    success:
8
+      gerrit:
9
+        Verified: 1
10
+    failure:
11
+      gerrit:
12
+        Verified: -1
13
+
14
+- semaphore:
15
+    name: test-semaphore-two
16
+    max: 2
17
+
18
+- job:
19
+    name: base
20
+    parent: null
21
+    nodeset:
22
+      nodes:
23
+        - name: controller
24
+          label: label1
25
+
26
+- job:
27
+    name: project-test1
28
+    run: playbooks/project-test1.yaml
29
+
30
+- job:
31
+    name: semaphore-one-test1
32
+    semaphore: test-semaphore
33
+    run: playbooks/semaphore-one-test1.yaml
34
+
35
+- job:
36
+    name: semaphore-one-test2
37
+    semaphore: test-semaphore
38
+    run: playbooks/semaphore-one-test2.yaml
39
+
40
+- job:
41
+    name: semaphore-two-test1
42
+    semaphore: test-semaphore-two
43
+    run: playbooks/semaphore-two-test1.yaml
44
+
45
+- job:
46
+    name: semaphore-two-test2
47
+    semaphore: test-semaphore-two
48
+    run: playbooks/semaphore-two-test2.yaml
49
+
50
+- job:
51
+    name: semaphore-one-test3
52
+    semaphore: test-semaphore
53
+    run: playbooks/semaphore-two-test1.yaml
54
+    nodeset:
55
+      nodes:
56
+        - name: controller
57
+          label: label1
58
+
59
+- job:
60
+    name: semaphore-one-test1-resources-first
61
+    semaphore:
62
+      name: test-semaphore
63
+      resources-first: True
64
+    run: playbooks/semaphore-one-test1.yaml
65
+
66
+- job:
67
+    name: semaphore-one-test2-resources-first
68
+    semaphore:
69
+      name: test-semaphore
70
+      resources-first: True
71
+    run: playbooks/semaphore-one-test1.yaml
72
+
73
+- project:
74
+    name: org/project
75
+    check:
76
+      jobs:
77
+        - project-test1
78
+        # This is the difference to zuul.yaml that is used in
79
+        # test_semaphore_reconfigure_job_removal:
80
+        # - semaphore-one-test1
81
+        # - semaphore-one-test2
82
+
83
+
84
+- project:
85
+    name: org/project1
86
+    check:
87
+      jobs:
88
+        - project-test1
89
+        - semaphore-two-test1
90
+        - semaphore-two-test2
91
+
92
+- project:
93
+    name: org/project2
94
+    check:
95
+      jobs:
96
+        - semaphore-one-test3
97
+
98
+- project:
99
+    name: org/project3
100
+    check:
101
+      jobs:
102
+        - project-test1
103
+        - semaphore-one-test1-resources-first
104
+        - semaphore-one-test2-resources-first
105
+
106
+- project:
107
+    name: org/project4
108
+    check:
109
+      jobs:
110
+        - semaphore-one-test1-resources-first

+ 90
- 0
tests/unit/test_scheduler.py View File

@@ -6584,6 +6584,96 @@ class TestSemaphore(ZuulTestCase):
6584 6584
         self.assertFalse('test-semaphore' in
6585 6585
                          tenant.semaphore_handler.semaphores)
6586 6586
 
6587
+    def test_semaphore_reconfigure_job_removal(self):
6588
+        "Test job removal during reconfiguration with semaphores"
6589
+        self.executor_server.hold_jobs_in_build = True
6590
+        tenant = self.sched.abide.tenants.get('tenant-one')
6591
+
6592
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
6593
+        self.assertFalse('test-semaphore' in
6594
+                         tenant.semaphore_handler.semaphores)
6595
+
6596
+        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
6597
+        self.waitUntilSettled()
6598
+
6599
+        self.assertTrue('test-semaphore' in
6600
+                        tenant.semaphore_handler.semaphores)
6601
+
6602
+        self.commitConfigUpdate(
6603
+            'common-config',
6604
+            'config/semaphore/git/common-config/zuul-remove-job.yaml')
6605
+        self.sched.reconfigure(self.config)
6606
+        self.waitUntilSettled()
6607
+
6608
+        # Release job project-test1 which should be the only job left
6609
+        self.executor_server.release('project-test1')
6610
+        self.waitUntilSettled()
6611
+
6612
+        # The check pipeline should be empty
6613
+        tenant = self.sched.abide.tenants.get('tenant-one')
6614
+        check_pipeline = tenant.layout.pipelines['check']
6615
+        items = check_pipeline.getAllItems()
6616
+        self.assertEqual(len(items), 0)
6617
+
6618
+        # The semaphore should be released
6619
+        self.assertFalse('test-semaphore' in
6620
+                         tenant.semaphore_handler.semaphores)
6621
+
6622
+        self.executor_server.hold_jobs_in_build = False
6623
+        self.executor_server.release()
6624
+        self.waitUntilSettled()
6625
+
6626
+    def test_semaphore_reconfigure_job_removal_pending_node_request(self):
6627
+        """
6628
+        Test job removal during reconfiguration with semaphores and pending
6629
+        node request.
6630
+        """
6631
+        self.executor_server.hold_jobs_in_build = True
6632
+
6633
+        # Pause nodepool so we can block the job in node request state during
6634
+        # reconfiguration.
6635
+        self.fake_nodepool.pause()
6636
+
6637
+        tenant = self.sched.abide.tenants.get('tenant-one')
6638
+
6639
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
6640
+        self.assertFalse('test-semaphore' in
6641
+                         tenant.semaphore_handler.semaphores)
6642
+
6643
+        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
6644
+        self.waitUntilSettled()
6645
+
6646
+        self.assertTrue('test-semaphore' in
6647
+                        tenant.semaphore_handler.semaphores)
6648
+
6649
+        self.commitConfigUpdate(
6650
+            'common-config',
6651
+            'config/semaphore/git/common-config/zuul-remove-job.yaml')
6652
+        self.sched.reconfigure(self.config)
6653
+        self.waitUntilSettled()
6654
+
6655
+        # Now we can unpause nodepool
6656
+        self.fake_nodepool.unpause()
6657
+        self.waitUntilSettled()
6658
+
6659
+        # Release job project-test1 which should be the only job left
6660
+        self.executor_server.release('project-test1')
6661
+        self.waitUntilSettled()
6662
+
6663
+        # The check pipeline should be empty
6664
+        tenant = self.sched.abide.tenants.get('tenant-one')
6665
+        check_pipeline = tenant.layout.pipelines['check']
6666
+        items = check_pipeline.getAllItems()
6667
+        self.assertEqual(len(items), 0)
6668
+
6669
+        # The semaphore should be released
6670
+        self.assertFalse('test-semaphore' in
6671
+                         tenant.semaphore_handler.semaphores)
6672
+
6673
+        self.executor_server.hold_jobs_in_build = False
6674
+        self.executor_server.release()
6675
+        self.waitUntilSettled()
6676
+
6587 6677
 
6588 6678
 class TestSemaphoreMultiTenant(ZuulTestCase):
6589 6679
     tenant_config_file = 'config/multi-tenant-semaphore/main.yaml'

+ 1
- 1
zuul/manager/__init__.py View File

@@ -836,7 +836,7 @@ class PipelineManager(object):
836 836
         self.log.debug("Item %s status is now:\n %s" %
837 837
                        (item, item.formatStatus()))
838 838
 
839
-        if build.retry:
839
+        if build.retry and build.build_set.getJobNodeSet(build.job.name):
840 840
             build.build_set.removeJobNodeSet(build.job.name)
841 841
 
842 842
         self._resumeBuilds(build.build_set)

+ 6
- 2
zuul/scheduler.py View File

@@ -830,6 +830,8 @@ class Scheduler(threading.Thread):
830 830
                     item.current_build_set.node_requests.items():
831 831
                     requests_to_cancel.append(
832 832
                         (item.current_build_set, request))
833
+
834
+            semaphores_to_release = []
833 835
             for build in builds_to_cancel:
834 836
                 self.log.info(
835 837
                     "Canceling build %s during reconfiguration" % (build,))
@@ -848,14 +850,16 @@ class Scheduler(threading.Thread):
848 850
                     self.log.exception(
849 851
                         "Exception while removing nodeset from build %s "
850 852
                         "for change %s" % (build, build.build_set.item.change))
851
-                tenant.semaphore_handler.release(
852
-                    build.build_set.item, build.job)
853
+                semaphores_to_release.append((build.build_set.item, build.job))
853 854
             for build_set, request in requests_to_cancel:
854 855
                 self.log.info(
855 856
                     "Canceling node request %s during reconfiguration",
856 857
                     request)
857 858
                 self.nodepool.cancelRequest(request)
858 859
                 build_set.removeJobNodeRequest(request.job.name)
860
+                semaphores_to_release.append((build_set.item, request.job))
861
+            for item, job in semaphores_to_release:
862
+                tenant.semaphore_handler.release(item, job)
859 863
 
860 864
     def _reconfigureTenant(self, tenant):
861 865
         # This is called from _doReconfigureEvent while holding the

Loading…
Cancel
Save