Browse Source

Be more aggressive in canceling node requests

During a reconfiguration, we may cancel builds if they are no longer
necessary; however, we do not do the same for node requests.  Currently
we let them run to completion and then return them unused.  It would
be more efficient (and behaviorally consistent) to cancel the request
as soon as we determine that it won't be used.

Also, change some warning log messages to info (as they don't indicate
a situation which may benefit from corrective action).

Change-Id: Ic3ef6b75437bf82bf9c8f426b23ea14d9aaa96b7
tags/3.4.0
James E. Blair 5 months ago
parent
commit
bb9ad84cb6
2 changed files with 22 additions and 11 deletions
  1. 2
    3
      zuul/model.py
  2. 20
    8
      zuul/scheduler.py

+ 2
- 3
zuul/model.py View File

@@ -1861,9 +1861,8 @@ class BuildSet(object):
1861 1861
         return self.node_requests.get(job_name)
1862 1862
 
1863 1863
     def removeJobNodeRequest(self, job_name):
1864
-        if job_name not in self.node_requests:
1865
-            raise Exception("No node request for %s" % (job_name))
1866
-        del self.node_requests[job_name]
1864
+        if job_name in self.node_requests:
1865
+            del self.node_requests[job_name]
1867 1866
 
1868 1867
     def jobNodeRequestComplete(self, job_name, req, nodeset):
1869 1868
         if job_name in self.nodesets:

+ 20
- 8
zuul/scheduler.py View File

@@ -770,6 +770,7 @@ class Scheduler(threading.Thread):
770 770
                                           new_pipeline.window_floor)
771 771
             items_to_remove = []
772 772
             builds_to_cancel = []
773
+            requests_to_cancel = []
773 774
             last_head = None
774 775
             for shared_queue in old_pipeline.queues:
775 776
                 # Attempt to keep window sizes from shrinking where possible
@@ -812,15 +813,25 @@ class Scheduler(threading.Thread):
812 813
                             else:
813 814
                                 item.removeBuild(build)
814 815
                                 builds_to_cancel.append(build)
816
+                        for request_job, request in \
817
+                            item.current_build_set.node_requests.items():
818
+                            new_job = item.getJob(request_job)
819
+                            if not new_job:
820
+                                requests_to_cancel.append(
821
+                                    (item.current_build_set, request))
815 822
                     else:
816 823
                         items_to_remove.append(item)
817 824
             for item in items_to_remove:
818
-                self.log.warning(
825
+                self.log.info(
819 826
                     "Removing item %s during reconfiguration" % (item,))
820 827
                 for build in item.current_build_set.getBuilds():
821 828
                     builds_to_cancel.append(build)
829
+                for request_job, request in \
830
+                    item.current_build_set.node_requests.items():
831
+                    requests_to_cancel.append(
832
+                        (item.current_build_set, request))
822 833
             for build in builds_to_cancel:
823
-                self.log.warning(
834
+                self.log.info(
824 835
                     "Canceling build %s during reconfiguration" % (build,))
825 836
                 try:
826 837
                     self.executor.cancel(build)
@@ -839,6 +850,12 @@ class Scheduler(threading.Thread):
839 850
                         "for change %s" % (build, build.build_set.item.change))
840 851
                 tenant.semaphore_handler.release(
841 852
                     build.build_set.item, build.job)
853
+            for build_set, request in requests_to_cancel:
854
+                self.log.info(
855
+                    "Canceling node request %s during reconfiguration",
856
+                    request)
857
+                self.nodepool.cancelRequest(request)
858
+                build_set.removeJobNodeRequest(request.job.name)
842 859
 
843 860
     def _reconfigureTenant(self, tenant):
844 861
         # This is called from _doReconfigureEvent while holding the
@@ -1313,12 +1330,7 @@ class Scheduler(threading.Thread):
1313 1330
             self.log.warning("Item %s does not contain job %s "
1314 1331
                              "for node request %s",
1315 1332
                              build_set.item, request.job.name, request)
1316
-            try:
1317
-                build_set.removeJobNodeRequest(request.job.name)
1318
-            except Exception:
1319
-                self.log.exception("Unable to remove obsolete node request "
1320
-                                   "%s for %s job %s",
1321
-                                   request, build_set.item, request.job.name)
1333
+            build_set.removeJobNodeRequest(request.job.name)
1322 1334
             if request.fulfilled:
1323 1335
                 self.nodepool.returnNodeSet(request.nodeset)
1324 1336
             return

Loading…
Cancel
Save