Browse Source

Merge "Fix items stuck in queue pending node requests"

tags/3.4.0
Zuul 5 months ago
parent
commit
c1006610fd
3 changed files with 84 additions and 4 deletions
  1. 72
    3
      tests/unit/test_scheduler.py
  2. 6
    1
      zuul/model.py
  3. 6
    0
      zuul/scheduler.py

+ 72
- 3
tests/unit/test_scheduler.py View File

@@ -4484,7 +4484,7 @@ class TestScheduler(ZuulTestCase):
4484 4484
         tenant = self.sched.abide.tenants.get('tenant-one')
4485 4485
         queue = tenant.layout.pipelines['gate'].queues[0]
4486 4486
         self.assertEqual(queue.window, 2)
4487
-        self.assertTrue(len(self.builds), 4)
4487
+        self.assertEqual(len(self.builds), 4)
4488 4488
 
4489 4489
         self.waitUntilSettled()
4490 4490
         self.commitConfigUpdate('org/common-config',
@@ -4499,7 +4499,7 @@ class TestScheduler(ZuulTestCase):
4499 4499
         # B is outside the window, but still marked active until the
4500 4500
         # next pass through the queue processor, so its builds haven't
4501 4501
         # been canceled.
4502
-        self.assertTrue(len(self.builds), 4)
4502
+        self.assertEqual(len(self.builds), 4)
4503 4503
 
4504 4504
         self.sched.reconfigure(self.config)
4505 4505
         tenant = self.sched.abide.tenants.get('tenant-one')
@@ -4507,7 +4507,7 @@ class TestScheduler(ZuulTestCase):
4507 4507
         self.assertEqual(queue.window, 1)
4508 4508
         self.waitUntilSettled()
4509 4509
         # B's builds have been canceled now
4510
-        self.assertTrue(len(self.builds), 2)
4510
+        self.assertEqual(len(self.builds), 2)
4511 4511
 
4512 4512
         self.executor_server.hold_jobs_in_build = False
4513 4513
         self.executor_server.release()
@@ -4524,6 +4524,75 @@ class TestScheduler(ZuulTestCase):
4524 4524
             dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
4525 4525
         ], ordered=False)
4526 4526
 
4527
+    @simple_layout('layouts/reconfigure-window-fixed.yaml')
4528
+    def test_reconfigure_window_fixed_requests(self):
4529
+        # Test the active window shrinking during reconfiguration with
4530
+        # outstanding node requests
4531
+        self.executor_server.hold_jobs_in_build = True
4532
+
4533
+        # Start the jobs for A
4534
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
4535
+        A.addApproval('Code-Review', 2)
4536
+        self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
4537
+        self.waitUntilSettled()
4538
+        self.log.debug("A complete")
4539
+
4540
+        # Hold node requests for B
4541
+        self.fake_nodepool.pause()
4542
+        B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
4543
+        B.addApproval('Code-Review', 2)
4544
+        self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
4545
+        self.waitUntilSettled()
4546
+        self.log.debug("B complete")
4547
+
4548
+        tenant = self.sched.abide.tenants.get('tenant-one')
4549
+        queue = tenant.layout.pipelines['gate'].queues[0]
4550
+        self.assertEqual(queue.window, 2)
4551
+        self.assertEqual(len(self.builds), 2)
4552
+
4553
+        self.waitUntilSettled()
4554
+        self.commitConfigUpdate('org/common-config',
4555
+                                'layouts/reconfigure-window-fixed2.yaml')
4556
+        self.sched.reconfigure(self.config)
4557
+        self.waitUntilSettled()
4558
+        self.log.debug("Reconfiguration complete")
4559
+
4560
+        tenant = self.sched.abide.tenants.get('tenant-one')
4561
+        queue = tenant.layout.pipelines['gate'].queues[0]
4562
+        # Because we have configured a static window, it should
4563
+        # be allowed to shrink on reconfiguration.
4564
+        self.assertEqual(queue.window, 1)
4565
+        self.assertEqual(len(self.builds), 2)
4566
+
4567
+        # After the previous reconfig, the queue processor will have
4568
+        # run and marked B inactive; run another reconfiguration so
4569
+        # that we're testing what happens when we reconfigure after
4570
+        # the active window having shrunk.
4571
+        self.sched.reconfigure(self.config)
4572
+
4573
+        # Unpause the node requests now
4574
+        self.fake_nodepool.unpause()
4575
+        self.waitUntilSettled()
4576
+        self.log.debug("Nodepool unpause complete")
4577
+
4578
+        # Allow A to merge and B to enter the active window and complete
4579
+        self.executor_server.hold_jobs_in_build = False
4580
+        self.executor_server.release()
4581
+        self.waitUntilSettled()
4582
+        self.log.debug("Executor unpause complete")
4583
+
4584
+        tenant = self.sched.abide.tenants.get('tenant-one')
4585
+        queue = tenant.layout.pipelines['gate'].queues[0]
4586
+        self.assertEqual(queue.window, 1)
4587
+
4588
+        self.waitUntilSettled()
4589
+        self.assertHistory([
4590
+            dict(name='job1', result='SUCCESS', changes='1,1'),
4591
+            dict(name='job2', result='SUCCESS', changes='1,1'),
4592
+            dict(name='job1', result='SUCCESS', changes='1,1 2,1'),
4593
+            dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
4594
+        ], ordered=False)
4595
+
4527 4596
     @simple_layout('layouts/reconfigure-remove-add.yaml')
4528 4597
     def test_reconfigure_remove_add(self):
4529 4598
         # Test removing, then adding a job while in queue

+ 6
- 1
zuul/model.py View File

@@ -1849,7 +1849,7 @@ class BuildSet(object):
1849 1849
 
1850 1850
     def removeJobNodeSet(self, job_name):
1851 1851
         if job_name not in self.nodesets:
1852
-            raise Exception("No job set for %s" % (job_name))
1852
+            raise Exception("No job nodeset for %s" % (job_name))
1853 1853
         del self.nodesets[job_name]
1854 1854
 
1855 1855
     def setJobNodeRequest(self, job_name, req):
@@ -1860,6 +1860,11 @@ class BuildSet(object):
1860 1860
     def getJobNodeRequest(self, job_name):
1861 1861
         return self.node_requests.get(job_name)
1862 1862
 
1863
+    def removeJobNodeRequest(self, job_name):
1864
+        if job_name not in self.node_requests:
1865
+            raise Exception("No node request for %s" % (job_name))
1866
+        del self.node_requests[job_name]
1867
+
1863 1868
     def jobNodeRequestComplete(self, job_name, req, nodeset):
1864 1869
         if job_name in self.nodesets:
1865 1870
             raise Exception("Prior node request for %s" % (job_name))

+ 6
- 0
zuul/scheduler.py View File

@@ -1313,6 +1313,12 @@ class Scheduler(threading.Thread):
1313 1313
             self.log.warning("Item %s does not contain job %s "
1314 1314
                              "for node request %s",
1315 1315
                              build_set.item, request.job.name, request)
1316
+            try:
1317
+                build_set.removeJobNodeRequest(request.job.name)
1318
+            except Exception:
1319
+                self.log.exception("Unable to remove obsolete node request "
1320
+                                   "%s for %s job %s",
1321
+                                   request, build_set.item, request.job.name)
1316 1322
             if request.fulfilled:
1317 1323
                 self.nodepool.returnNodeSet(request.nodeset)
1318 1324
             return

Loading…
Cancel
Save