Browse Source

Merge "Fix missing semaphore release on node failure"

Zuul 2 months ago
parent
commit
069a550f81

+ 6
- 0
tests/fixtures/config/semaphore/git/common-config/zuul.yaml View File

@@ -103,3 +103,9 @@
103 103
         - project-test1
104 104
         - semaphore-one-test1-resources-first
105 105
         - semaphore-one-test2-resources-first
106
+
107
+- project:
108
+    name: org/project4
109
+    check:
110
+      jobs:
111
+        - semaphore-one-test1-resources-first

+ 1
- 0
tests/fixtures/config/semaphore/git/org_project4/README View File

@@ -0,0 +1 @@
1
+test

+ 1
- 0
tests/fixtures/config/semaphore/main.yaml View File

@@ -9,3 +9,4 @@
9 9
           - org/project1
10 10
           - org/project2
11 11
           - org/project3
12
+          - org/project4

+ 64
- 0
tests/unit/test_scheduler.py View File

@@ -6251,6 +6251,38 @@ class TestSemaphore(ZuulTestCase):
6251 6251
         self.assertEqual(A.reported, 1)
6252 6252
         self.assertEqual(B.reported, 1)
6253 6253
 
6254
+    def test_semaphore_node_failure(self):
6255
+        "Test semaphore and node failure"
6256
+        tenant = self.sched.abide.tenants.get('tenant-one')
6257
+
6258
+        # Pause nodepool so we can fail the node request later
6259
+        self.fake_nodepool.pause()
6260
+
6261
+        A = self.fake_gerrit.addFakeChange('org/project2', 'master', 'A')
6262
+        self.assertFalse('test-semaphore' in
6263
+                         tenant.semaphore_handler.semaphores)
6264
+        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
6265
+        self.waitUntilSettled()
6266
+
6267
+        # By default we first lock the semaphore and then get the nodes
6268
+        # so at this point the semaphore needs to be aquired.
6269
+        self.assertTrue('test-semaphore' in
6270
+                        tenant.semaphore_handler.semaphores)
6271
+
6272
+        # Fail the node request and unpause
6273
+        req = self.fake_nodepool.getNodeRequests()[0]
6274
+        self.fake_nodepool.addFailRequest(req)
6275
+        self.fake_nodepool.unpause()
6276
+        self.waitUntilSettled()
6277
+
6278
+        # At this point the job that holds the semaphore failed with
6279
+        # node_failure and the semaphore must be released.
6280
+        self.assertFalse('test-semaphore' in
6281
+                         tenant.semaphore_handler.semaphores)
6282
+        self.assertEquals(1, A.reported)
6283
+        self.assertIn('semaphore-one-test3 semaphore-one-test3 : NODE_FAILURE',
6284
+                      A.messages[0])
6285
+
6254 6286
     def test_semaphore_resources_first(self):
6255 6287
         "Test semaphores with max=1 (mutex) and get resources first"
6256 6288
         tenant = self.sched.abide.tenants.get('tenant-one')
@@ -6298,6 +6330,38 @@ class TestSemaphore(ZuulTestCase):
6298 6330
         self.executor_server.release()
6299 6331
         self.waitUntilSettled()
6300 6332
 
6333
+    def test_semaphore_resources_first_node_failure(self):
6334
+        "Test semaphore and node failure"
6335
+        tenant = self.sched.abide.tenants.get('tenant-one')
6336
+
6337
+        # Pause nodepool so we can fail the node request later
6338
+        self.fake_nodepool.pause()
6339
+
6340
+        A = self.fake_gerrit.addFakeChange('org/project4', 'master', 'A')
6341
+        self.assertFalse('test-semaphore' in
6342
+                         tenant.semaphore_handler.semaphores)
6343
+        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
6344
+        self.waitUntilSettled()
6345
+
6346
+        # With resources first we first get the nodes so at this point the
6347
+        # semaphore must not be aquired.
6348
+        self.assertFalse('test-semaphore' in
6349
+                         tenant.semaphore_handler.semaphores)
6350
+
6351
+        # Fail the node request and unpause
6352
+        req = self.fake_nodepool.getNodeRequests()[0]
6353
+        self.fake_nodepool.addFailRequest(req)
6354
+        self.fake_nodepool.unpause()
6355
+        self.waitUntilSettled()
6356
+
6357
+        # At this point the job should never have acuired a semaphore so check
6358
+        # that it still has not locked a semaphore.
6359
+        self.assertFalse('test-semaphore' in
6360
+                         tenant.semaphore_handler.semaphores)
6361
+        self.assertEquals(1, A.reported)
6362
+        self.assertIn('semaphore-one-test1-resources-first : NODE_FAILURE',
6363
+                      A.messages[0])
6364
+
6301 6365
     def test_semaphore_zk_error(self):
6302 6366
         "Test semaphore release with zk error"
6303 6367
         tenant = self.sched.abide.tenants.get('tenant-one')

+ 3
- 0
zuul/manager/__init__.py View File

@@ -864,6 +864,9 @@ class PipelineManager(object):
864 864
                           (request, request.job.name,))
865 865
             build_set.item.setNodeRequestFailure(request.job)
866 866
             self._resumeBuilds(request.build_set)
867
+            tenant = build_set.item.pipeline.tenant
868
+            tenant.semaphore_handler.release(build_set.item, request.job)
869
+
867 870
         self.log.info("Completed node request %s for job %s of item %s "
868 871
                       "with nodes %s" %
869 872
                       (request, request.job, build_set.item,

Loading…
Cancel
Save