Use cache more aggressively when searching for ready nodes

When assigning node requests, the pool main loop attempts to find
existing ready nodes for each request it accepts before creating a
NodeLauncher (the class that normally triggers asynchronous node
creation).

Even though we iterate over every ready node in the system looking
for a match, this should be fast (a simplified sketch follows this
list) because:

* We only consider nodes from our provider+pool.
* We only consider nodes which are not already assigned (and newly
  created nodes are created with assigned_to already set).
* We use the node cache.
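
In rough Python, that search looks something like the following
sketch (the function name, string states, and attribute spellings
here are illustrative stand-ins, not the exact nodepool code):

  # Simplified sketch of the ready-node search described above.
  def find_ready_node(zk, provider_name, pool_name, label):
      for node in zk.nodeIterator(cached=True):
          # Only consider ready nodes from our provider+pool.
          if node.state != "ready":
              continue
          if node.provider != provider_name:
              continue
          if node.pool != pool_name:
              continue
          # Skip nodes that are already assigned; newly created
          # nodes start life with an assignment already set.
          if node.allocated_to:
              continue
          if label not in node.type:
              continue
          return node
      return None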

However, there is room for improvement (sketched after this list):

* We could use cached_ids so that we don't have to list children.
* We can avoid trying to lock nodes that the cache says are already
  locked (this should generally be covered by the "assigned_to" check
  mentioned above, but it is a good extra check and is consistent
  with other new optimizations).
* We can ignore nodes in the cache with incomplete data.

The first and last items are likely to make the most difference.
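
Combined, the search would look roughly like this sketch (again
illustrative rather than the exact code; the real change is in the
diff below):

  # Rough sketch with all three improvements applied.
  def find_ready_node_optimized(zk, provider_name, pool_name, label):
      # cached_ids=True takes the ID list from the cache (no
      # get_children call) and skips nodes whose data is not yet
      # fully present in the cache.
      for node in zk.nodeIterator(cached=True, cached_ids=True):
          if node.state != "ready":
              continue
          if node.provider != provider_name or node.pool != pool_name:
              continue
          if node.allocated_to:
              continue
          # Skip if the cache says the node is (probably) locked.
          if node.lock_contenders:
              continue
          return node
      return None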

It has been observed that in a production system we can end up spending
much more time than expected looking for ready nodes.  The most likely
cause is cache misses, since if we are able to use cached data we would
expect all the other checks to complete quickly.  That leaves nodes
that appeared very recently (so they show up in a live get_children
call but aren't in the cache yet, or are in the cache with incomplete
(empty) data) and nodes that disappeared very recently (so they showed
up in our initial listing, but by the time we try to lock them they
have been removed from the cache).
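
To illustrate why misses are expensive, the lookup is essentially
cache-then-ZK; the helper below is hypothetical, but its shape
matches the getNode() change in the diff:

  def get_node(zk_client, cache, node_id, only_cached=False):
      # Illustrative cache-then-ZK lookup; not the actual nodepool
      # code.
      data = cache.get(node_id)   # in-memory, effectively free
      if data is not None:
          return data
      if only_cached:
          # New behavior: treat a miss as "no node" instead of
          # falling through to the slow path.
          return None
      # Cache miss: a synchronous ZooKeeper read, one network round
      # trip per missed node.  Many recently created or deleted
      # nodes means many round trips, which is where the time goes.
      return zk_client.get(node_id)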

This change also adds a log entry to the very chatty "cache event" log
handler on a cache miss, in order to aid future debugging.

Change-Id: I32a0f2d0c2f1b8bbbeae1441ee48c8320e3b9129
James E. Blair 2023-05-30 13:40:21 -07:00
parent 99d2a361a1
commit 218da4aa81
3 changed files with 23 additions and 8 deletions


@@ -463,6 +463,9 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
                         continue
                     if node.pool != self.pool.name:
                         continue
+                    # Skip if the node is *probably* locked.
+                    if node.lock_contenders:
+                        continue
                     # Check this driver reuse requirements
                     if not self.checkReusableNode(node):
                         continue


@@ -709,6 +709,13 @@ class CleanupWorker(BaseCleanupWorker):
             if (now - node.state_time) < label.max_ready_age:
                 continue
+            # We don't check the cached lock contenders here
+            # because it's unlikely a locked node will achieve
+            # max_ready_age, so the lock below will almost always
+            # succeed.  This helps protect against invalid cache
+            # data (ie, the cache saying it's locked even though
+            # it isn't).
             try:
                 zk_conn.lockNode(node, blocking=False)
             except exceptions.ZKLockException:


@@ -1111,6 +1111,7 @@ class ZooKeeper(ZooKeeperBase):
     '''
 
     log = logging.getLogger("nodepool.zk.ZooKeeper")
+    event_log = logging.getLogger("nodepool.zk.cache.event")
 
     IMAGE_ROOT = "/nodepool/images"
     LAUNCHER_ROOT = "/nodepool/launchers"
@@ -2446,19 +2447,23 @@
         except kze.NoNodeError:
             return []
 
-    def getNode(self, node, cached=False):
+    def getNode(self, node, cached=False, only_cached=False):
         '''
         Get the data for a specific node.
 
         :param str node: The node ID.
         :param bool cached: True if the data should be taken from the cache.
+        :param bool only_cached: True if we should ignore nodes not
+                                 fully in the cache.
 
         :returns: The node data, or None if the node was not found.
         '''
         if cached and self._node_cache:
             d = self._node_cache.getNode(node)
             if d:
                 return d
+            if only_cached:
+                self.event_log.debug("Cache miss for node %s", node)
+                return None
 
         # We got here we either didn't use the cache or the cache didn't
         # have the node (yet).  Note that even if we use caching we need to
@@ -2596,7 +2601,7 @@
         those labels.
         '''
         ret = {}
-        for node in self.nodeIterator(cached=cached):
+        for node in self.nodeIterator(cached=cached, cached_ids=cached):
             if node.state != READY or node.allocated_to:
                 continue
             for label in labels:
@@ -2673,21 +2678,21 @@
         return False
 
     def nodeIterator(self, cached=True, cached_ids=False):
-        '''
-        Utility generator method for iterating through all nodes.
+        '''Utility generator method for iterating through all nodes.
 
         :param bool cached: True if the data should be taken from the cache.
         :param bool cached_ids: True if the node IDs should be taken from the
-                                cache.
+                                cache; this will also avoid hitting ZK
+                                if the node data is missing from the cache.
         '''
-        if cached_ids:
+        if cached_ids and self._node_cache:
             node_ids = self._node_cache.getNodeIds()
         else:
             node_ids = self.getNodes()
 
         for node_id in node_ids:
-            node = self.getNode(node_id, cached=cached)
+            node = self.getNode(node_id, cached=cached, only_cached=cached_ids)
             if node:
                 yield node
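
With these signatures, a purely cache-backed iteration would look
something like this (illustrative usage; zk is an existing ZooKeeper
connection and READY is nodepool's ready-state constant):

  # No get_children call; nodes whose data has not yet landed in the
  # cache are skipped (logging a "Cache miss" debug entry) rather
  # than read synchronously from ZooKeeper.
  for node in zk.nodeIterator(cached=True, cached_ids=True):
      if node.state == READY and not node.allocated_to:
          ...  # candidate for reuse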