Browse Source

Fix items stuck in queue pending node requests

Given the following sequence:
* An item is within the active window
* A node request is submitted for a job for that item
* The active window shrinks
* A tenant reconfiguration is performed
* The node request completes

The nodes would be returned, however the record of the node request
was not removed from the item's buildset.  Once the item entered
the active window again, no further node requests would happen
for that job because of the retained node request record, and
therefore the item would be stuck in the queue.

To correct this, discard the node request record on the buildset
if we are going to return the nodes unused in that case.

Also clarify a debug message "job set" -> "job nodeset", and correct
some typos from a related unit test which were inadvertently simply
asserting true.

Change-Id: Ia6259b08ecda49f3123dae1c7d0e35dc46004561
tags/3.4.0
James E. Blair 5 months ago
parent
commit
31a7ddc791
3 changed files with 84 additions and 4 deletions
  1. 72
    3
      tests/unit/test_scheduler.py
  2. 6
    1
      zuul/model.py
  3. 6
    0
      zuul/scheduler.py

+ 72
- 3
tests/unit/test_scheduler.py View File

@@ -4484,7 +4484,7 @@ class TestScheduler(ZuulTestCase):
4484 4484
         tenant = self.sched.abide.tenants.get('tenant-one')
4485 4485
         queue = tenant.layout.pipelines['gate'].queues[0]
4486 4486
         self.assertEqual(queue.window, 2)
4487
-        self.assertTrue(len(self.builds), 4)
4487
+        self.assertEqual(len(self.builds), 4)
4488 4488
 
4489 4489
         self.waitUntilSettled()
4490 4490
         self.commitConfigUpdate('org/common-config',
@@ -4499,7 +4499,7 @@ class TestScheduler(ZuulTestCase):
4499 4499
         # B is outside the window, but still marked active until the
4500 4500
         # next pass through the queue processor, so its builds haven't
4501 4501
         # been canceled.
4502
-        self.assertTrue(len(self.builds), 4)
4502
+        self.assertEqual(len(self.builds), 4)
4503 4503
 
4504 4504
         self.sched.reconfigure(self.config)
4505 4505
         tenant = self.sched.abide.tenants.get('tenant-one')
@@ -4507,7 +4507,7 @@ class TestScheduler(ZuulTestCase):
4507 4507
         self.assertEqual(queue.window, 1)
4508 4508
         self.waitUntilSettled()
4509 4509
         # B's builds have been canceled now
4510
-        self.assertTrue(len(self.builds), 2)
4510
+        self.assertEqual(len(self.builds), 2)
4511 4511
 
4512 4512
         self.executor_server.hold_jobs_in_build = False
4513 4513
         self.executor_server.release()
@@ -4524,6 +4524,75 @@ class TestScheduler(ZuulTestCase):
4524 4524
             dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
4525 4525
         ], ordered=False)
4526 4526
 
4527
+    @simple_layout('layouts/reconfigure-window-fixed.yaml')
4528
+    def test_reconfigure_window_fixed_requests(self):
4529
+        # Test the active window shrinking during reconfiguration with
4530
+        # outstanding node requests
4531
+        self.executor_server.hold_jobs_in_build = True
4532
+
4533
+        # Start the jobs for A
4534
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
4535
+        A.addApproval('Code-Review', 2)
4536
+        self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
4537
+        self.waitUntilSettled()
4538
+        self.log.debug("A complete")
4539
+
4540
+        # Hold node requests for B
4541
+        self.fake_nodepool.pause()
4542
+        B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
4543
+        B.addApproval('Code-Review', 2)
4544
+        self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
4545
+        self.waitUntilSettled()
4546
+        self.log.debug("B complete")
4547
+
4548
+        tenant = self.sched.abide.tenants.get('tenant-one')
4549
+        queue = tenant.layout.pipelines['gate'].queues[0]
4550
+        self.assertEqual(queue.window, 2)
4551
+        self.assertEqual(len(self.builds), 2)
4552
+
4553
+        self.waitUntilSettled()
4554
+        self.commitConfigUpdate('org/common-config',
4555
+                                'layouts/reconfigure-window-fixed2.yaml')
4556
+        self.sched.reconfigure(self.config)
4557
+        self.waitUntilSettled()
4558
+        self.log.debug("Reconfiguration complete")
4559
+
4560
+        tenant = self.sched.abide.tenants.get('tenant-one')
4561
+        queue = tenant.layout.pipelines['gate'].queues[0]
4562
+        # Because we have configured a static window, it should
4563
+        # be allowed to shrink on reconfiguration.
4564
+        self.assertEqual(queue.window, 1)
4565
+        self.assertEqual(len(self.builds), 2)
4566
+
4567
+        # After the previous reconfig, the queue processor will have
4568
+        # run and marked B inactive; run another reconfiguration so
4569
+        # that we're testing what happens when we reconfigure after
4570
+        # the active window having shrunk.
4571
+        self.sched.reconfigure(self.config)
4572
+
4573
+        # Unpause the node requests now
4574
+        self.fake_nodepool.unpause()
4575
+        self.waitUntilSettled()
4576
+        self.log.debug("Nodepool unpause complete")
4577
+
4578
+        # Allow A to merge and B to enter the active window and complete
4579
+        self.executor_server.hold_jobs_in_build = False
4580
+        self.executor_server.release()
4581
+        self.waitUntilSettled()
4582
+        self.log.debug("Executor unpause complete")
4583
+
4584
+        tenant = self.sched.abide.tenants.get('tenant-one')
4585
+        queue = tenant.layout.pipelines['gate'].queues[0]
4586
+        self.assertEqual(queue.window, 1)
4587
+
4588
+        self.waitUntilSettled()
4589
+        self.assertHistory([
4590
+            dict(name='job1', result='SUCCESS', changes='1,1'),
4591
+            dict(name='job2', result='SUCCESS', changes='1,1'),
4592
+            dict(name='job1', result='SUCCESS', changes='1,1 2,1'),
4593
+            dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
4594
+        ], ordered=False)
4595
+
4527 4596
     @simple_layout('layouts/reconfigure-remove-add.yaml')
4528 4597
     def test_reconfigure_remove_add(self):
4529 4598
         # Test removing, then adding a job while in queue

+ 6
- 1
zuul/model.py View File

@@ -1849,7 +1849,7 @@ class BuildSet(object):
1849 1849
 
1850 1850
     def removeJobNodeSet(self, job_name):
1851 1851
         if job_name not in self.nodesets:
1852
-            raise Exception("No job set for %s" % (job_name))
1852
+            raise Exception("No job nodeset for %s" % (job_name))
1853 1853
         del self.nodesets[job_name]
1854 1854
 
1855 1855
     def setJobNodeRequest(self, job_name, req):
@@ -1860,6 +1860,11 @@ class BuildSet(object):
1860 1860
     def getJobNodeRequest(self, job_name):
1861 1861
         return self.node_requests.get(job_name)
1862 1862
 
1863
+    def removeJobNodeRequest(self, job_name):
1864
+        if job_name not in self.node_requests:
1865
+            raise Exception("No node request for %s" % (job_name))
1866
+        del self.node_requests[job_name]
1867
+
1863 1868
     def jobNodeRequestComplete(self, job_name, req, nodeset):
1864 1869
         if job_name in self.nodesets:
1865 1870
             raise Exception("Prior node request for %s" % (job_name))

+ 6
- 0
zuul/scheduler.py View File

@@ -1313,6 +1313,12 @@ class Scheduler(threading.Thread):
1313 1313
             self.log.warning("Item %s does not contain job %s "
1314 1314
                              "for node request %s",
1315 1315
                              build_set.item, request.job.name, request)
1316
+            try:
1317
+                build_set.removeJobNodeRequest(request.job.name)
1318
+            except Exception:
1319
+                self.log.exception("Unable to remove obsolete node request "
1320
+                                   "%s for %s job %s",
1321
+                                   request, build_set.item, request.job.name)
1316 1322
             if request.fulfilled:
1317 1323
                 self.nodepool.returnNodeSet(request.nodeset)
1318 1324
             return

Loading…
Cancel
Save