Merge "Fix register race condition in static provider"
This commit is contained in:
commit
cec796eaf2
|
@ -13,6 +13,7 @@
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import threading
|
||||||
from operator import attrgetter
|
from operator import attrgetter
|
||||||
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
@ -46,6 +47,9 @@ class StaticNodeProvider(Provider):
|
||||||
|
|
||||||
def __init__(self, provider, *args):
|
def __init__(self, provider, *args):
|
||||||
self.provider = provider
|
self.provider = provider
|
||||||
|
# Lock to avoid data races when registering nodes from
|
||||||
|
# multiple threads (e.g. cleanup and deleted node worker).
|
||||||
|
self._register_lock = threading.Lock()
|
||||||
|
|
||||||
def checkHost(self, node):
|
def checkHost(self, node):
|
||||||
'''Check node is reachable'''
|
'''Check node is reachable'''
|
||||||
|
@ -371,18 +375,19 @@ class StaticNodeProvider(Provider):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def cleanupLeakedResources(self):
|
def cleanupLeakedResources(self):
|
||||||
registered = self.getRegisteredNodes()
|
with self._register_lock:
|
||||||
for pool in self.provider.pools.values():
|
registered = self.getRegisteredNodes()
|
||||||
for node in pool.nodes:
|
for pool in self.provider.pools.values():
|
||||||
try:
|
for node in pool.nodes:
|
||||||
self.syncNodeCount(registered, node, pool)
|
try:
|
||||||
except Exception:
|
self.syncNodeCount(registered, node, pool)
|
||||||
self.log.exception("Couldn't sync node:")
|
except Exception:
|
||||||
continue
|
self.log.exception("Couldn't sync node:")
|
||||||
try:
|
continue
|
||||||
self.assignReadyNodes(node, pool)
|
try:
|
||||||
except Exception:
|
self.assignReadyNodes(node, pool)
|
||||||
self.log.exception("Couldn't assign ready nodes:")
|
except Exception:
|
||||||
|
self.log.exception("Couldn't assign ready nodes:")
|
||||||
|
|
||||||
def assignReadyNodes(self, node, pool):
|
def assignReadyNodes(self, node, pool):
|
||||||
waiting_nodes = self.getWaitingNodesOfType(node["labels"])
|
waiting_nodes = self.getWaitingNodesOfType(node["labels"])
|
||||||
|
@ -413,22 +418,24 @@ class StaticNodeProvider(Provider):
|
||||||
if static_node is None:
|
if static_node is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
with self._register_lock:
|
||||||
registered = self.getRegisteredNodes()
|
try:
|
||||||
except Exception:
|
registered = self.getRegisteredNodes()
|
||||||
self.log.exception(
|
except Exception:
|
||||||
"Cannot get registered hostnames for node re-registration:")
|
self.log.exception(
|
||||||
return
|
"Cannot get registered hostnames for node re-registration:"
|
||||||
current_count = registered[nodeTuple(node)]
|
)
|
||||||
|
return
|
||||||
|
current_count = registered[nodeTuple(node)]
|
||||||
|
|
||||||
# It's possible we were not able to de-register nodes due to a config
|
# It's possible we were not able to de-register nodes due to a
|
||||||
# change (because they were in use). In that case, don't bother to
|
# config change (because they were in use). In that case, don't
|
||||||
# reregister.
|
# bother to reregister.
|
||||||
if current_count >= static_node["max-parallel-jobs"]:
|
if current_count >= static_node["max-parallel-jobs"]:
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.registerNodeFromConfig(
|
self.registerNodeFromConfig(
|
||||||
1, node.provider, node.pool, static_node)
|
1, node.provider, node.pool, static_node)
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.exception("Cannot re-register deleted node %s", node)
|
self.log.exception("Cannot re-register deleted node %s", node)
|
||||||
|
|
Loading…
Reference in New Issue