Browse Source

Ensure that completed handlers are removed frequently

On a busy system, _assignHandlers can take quite some time (we saw
occurrences of more than 10 minutes). During this time no node request
is marked as fulfilled, even if the nodes are there. A possible
solution is to return from _assignHandlers frequently during the
iteration so that we can remove completed handlers and then proceed
with assigning handlers.

Change-Id: I10f40504c81d532e6953d7af63c5c58fd5283573
Tobias Henkel 6 months ago
parent
commit
9296de9bf5
1 changed files with 24 additions and 6 deletions
  1. 24
    6
      nodepool/launcher.py

+ 24
- 6
nodepool/launcher.py View File

@@ -143,21 +143,26 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
143 143
     # Private methods
144 144
     # ---------------------------------------------------------------
145 145
 
146
-    def _assignHandlers(self):
146
+    def _assignHandlers(self, timeout=15):
147 147
         '''
148 148
         For each request we can grab, create a NodeRequestHandler for it.
149 149
 
150 150
         The NodeRequestHandler object will kick off any threads needed to
151 151
         satisfy the request, then return. We will need to periodically poll
152 152
         the handler for completion.
153
+
154
+        If it exceeds the timeout it stops further iteration and returns False
155
+        in order to give us time to call _removeCompletedHandlers. Otherwise
156
+        it returns True to signal that it is finished for now.
153 157
         '''
158
+        start = time.monotonic()
154 159
         provider = self.getProviderConfig()
155 160
         if not provider:
156 161
             self.log.info("Missing config. Deleted provider?")
157
-            return
162
+            return True
158 163
 
159 164
         if provider.max_concurrency == 0:
160
-            return
165
+            return True
161 166
 
162 167
         # Sort requests by queue priority, then, for all requests at
163 168
         # the same priority, use the relative_priority field to
@@ -168,8 +173,11 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
168 173
                                      r.id.split('-')[1]))
169 174
 
170 175
         for req in requests:
176
+            if not self.running:
177
+                return True
178
+
171 179
             if self.paused_handler:
172
-                return
180
+                return True
173 181
 
174 182
             # Get active threads for all pools for this provider
175 183
             active_threads = sum([
@@ -183,7 +191,7 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
183 191
                 self.log.debug("Request handling limited: %s active threads ",
184 192
                                "with max concurrency of %s",
185 193
                                active_threads, provider.max_concurrency)
186
-                return
194
+                return True
187 195
 
188 196
             req = self.zk.getNodeRequest(req.id)
189 197
             if not req:
@@ -217,6 +225,11 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
217 225
                 self.paused_handler = rh
218 226
             self.request_handlers.append(rh)
219 227
 
228
+            # if we exceeded the timeout stop iterating here
229
+            if time.monotonic() - start > timeout:
230
+                return False
231
+        return True
232
+
220 233
     def _removeCompletedHandlers(self):
221 234
         '''
222 235
         Poll handlers to see which have completed.
@@ -305,7 +318,12 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
305 318
 
306 319
             try:
307 320
                 if not self.paused_handler:
308
-                    self._assignHandlers()
321
+                    while not self._assignHandlers():
322
+                        # _assignHandlers can take quite some time on a busy
323
+                        # system so sprinkle _removeCompletedHandlers in
324
+                        # between such that we have a chance to fulfill
325
+                        # requests that already have all nodes.
326
+                        self._removeCompletedHandlers()
309 327
                 else:
310 328
                     # If we are paused, one request handler could not
311 329
                     # satisfy its assigned request, so give it

Loading…
Cancel
Save