Merge "Fix register race condition in static provider"

This commit is contained in:
Zuul 2019-10-17 16:04:36 +00:00 committed by Gerrit Code Review
commit cec796eaf2
1 changed files with 36 additions and 29 deletions

View File

@ -13,6 +13,7 @@
# under the License. # under the License.
import logging import logging
import threading
from operator import attrgetter from operator import attrgetter
from collections import Counter from collections import Counter
@ -46,6 +47,9 @@ class StaticNodeProvider(Provider):
def __init__(self, provider, *args): def __init__(self, provider, *args):
self.provider = provider self.provider = provider
# Lock to avoid data races when registering nodes from
# multiple threads (e.g. cleanup and deleted node worker).
self._register_lock = threading.Lock()
def checkHost(self, node): def checkHost(self, node):
'''Check node is reachable''' '''Check node is reachable'''
@ -371,18 +375,19 @@ class StaticNodeProvider(Provider):
return True return True
def cleanupLeakedResources(self): def cleanupLeakedResources(self):
registered = self.getRegisteredNodes() with self._register_lock:
for pool in self.provider.pools.values(): registered = self.getRegisteredNodes()
for node in pool.nodes: for pool in self.provider.pools.values():
try: for node in pool.nodes:
self.syncNodeCount(registered, node, pool) try:
except Exception: self.syncNodeCount(registered, node, pool)
self.log.exception("Couldn't sync node:") except Exception:
continue self.log.exception("Couldn't sync node:")
try: continue
self.assignReadyNodes(node, pool) try:
except Exception: self.assignReadyNodes(node, pool)
self.log.exception("Couldn't assign ready nodes:") except Exception:
self.log.exception("Couldn't assign ready nodes:")
def assignReadyNodes(self, node, pool): def assignReadyNodes(self, node, pool):
waiting_nodes = self.getWaitingNodesOfType(node["labels"]) waiting_nodes = self.getWaitingNodesOfType(node["labels"])
@ -413,22 +418,24 @@ class StaticNodeProvider(Provider):
if static_node is None: if static_node is None:
return return
try: with self._register_lock:
registered = self.getRegisteredNodes() try:
except Exception: registered = self.getRegisteredNodes()
self.log.exception( except Exception:
"Cannot get registered hostnames for node re-registration:") self.log.exception(
return "Cannot get registered hostnames for node re-registration:"
current_count = registered[nodeTuple(node)] )
return
current_count = registered[nodeTuple(node)]
# It's possible we were not able to de-register nodes due to a config # It's possible we were not able to de-register nodes due to a
# change (because they were in use). In that case, don't bother to # config change (because they were in use). In that case, don't
# reregister. # bother to reregister.
if current_count >= static_node["max-parallel-jobs"]: if current_count >= static_node["max-parallel-jobs"]:
return return
try: try:
self.registerNodeFromConfig( self.registerNodeFromConfig(
1, node.provider, node.pool, static_node) 1, node.provider, node.pool, static_node)
except Exception: except Exception:
self.log.exception("Cannot re-register deleted node %s", node) self.log.exception("Cannot re-register deleted node %s", node)