Use cached ids in node iterator more often
There are several places where it is now probably safe to use cached
ids when iterating over ZK nodes. The main reasons not to use cached
ids are race conditions, or the possibility that the tree cache missed
an event and is unaware of a node. We now have increased confidence in
the accuracy of our cache, so at least in the cases where we know races
are not an issue, we can switch to cached ids and save a ZK round trip
(a possibly significant one if there is a long list of children); a
sketch of the flag's behavior follows below.

This change adds the flag in the following places (with explanations of
why it is safe):

* State machine cleanup routines: leaked instances have to show up on
  two subsequent calls to be acted upon, so this is not sensitive to
  timing (a sketch of this two-pass pattern appears after the
  corresponding hunk below).
* Quota calculation: if we do get the quota wrong, drivers are expected
  to handle that gracefully anyway.
* Held node cleanup: worst case, we wait until the next iteration to
  clean up.
* Stats: they are a snapshot anyway, so a cache mismatch is really just
  a shift in the snapshot time.

Change-Id: Ie7af2f62188951bf302ffdb64827d868609a1e3c
commit 99d2a361a1
parent 0a858c4be8
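For context, here is a minimal sketch of how a node iterator with a
cached_ids option can skip the ZK round trip, assuming a kazoo client
and a tree cache kept as a dict keyed by node id. The class shape, the
attribute names (_cached_nodes), and the /nodepool/nodes path are
illustrative assumptions, not nodepool's exact internals:

    class ZKNodeStore:
        '''Illustrative sketch only; not nodepool's actual implementation.'''

        def __init__(self, client):
            self.client = client        # a started kazoo.client.KazooClient
            self._cached_nodes = {}     # tree cache: node id -> node object

        def getNodeIds(self):
            # Live ZK round trip; cost grows with the number of children.
            return self.client.get_children('/nodepool/nodes')

        def getNode(self, node_id):
            # Sketch: read from the tree cache; a real client would fall
            # back to a direct ZK read when the cache misses.
            return self._cached_nodes.get(node_id)

        def nodeIterator(self, cached_ids=False):
            if cached_ids:
                # Ids already known to the tree cache: no ZK call at all.
                node_ids = list(self._cached_nodes.keys())
            else:
                node_ids = self.getNodeIds()
            for node_id in node_ids:
                node = self.getNode(node_id)
                if node:
                    yield node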
@@ -767,7 +767,7 @@ class StateMachineProvider(Provider, QuotaSupport):
         '''
         used_quota = QuotaInformation()
 
-        node_ids = set([n.id for n in self._zk.nodeIterator()])
+        node_ids = set([n.id for n in self._zk.nodeIterator(cached_ids=True)])
 
         for instance in self.adapter.listInstances():
             meta = instance.metadata
@@ -806,7 +806,7 @@ class StateMachineProvider(Provider, QuotaSupport):
     def cleanupLeakedResources(self):
         known_nodes = set()
 
-        for node in self._zk.nodeIterator():
+        for node in self._zk.nodeIterator(cached_ids=True):
            if node.provider != self.provider.name:
                continue
            known_nodes.add(node.id)
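To make the safety argument for the state machine cleanup concrete,
here is a hedged sketch of the two-pass pattern the commit message
describes: an instance is deleted only if it still looks leaked on the
next pass, so a momentarily stale cached id list can at worst delay
cleanup by one pass, never trigger a premature delete. The adapter
calls, the metadata key, and the _suspect_instances attribute are
assumptions for illustration, not nodepool's actual names:

    def cleanupLeakedInstances(self):
        # Sketch of the two-pass pattern; names are illustrative.
        node_ids = set(n.id for n in self._zk.nodeIterator(cached_ids=True))
        newly_suspect = {}
        for instance in self.adapter.listInstances():
            node_id = instance.metadata.get('nodepool_node_id')
            if not node_id or node_id in node_ids:
                continue  # instance is accounted for; nothing to do
            if instance.external_id in self._suspect_instances:
                # Looked leaked on the previous pass too: act on it now.
                self.adapter.deleteInstance(instance.external_id)
            else:
                # First sighting: remember it and re-check next pass.
                newly_suspect[instance.external_id] = instance
        # Anything that vanished in the meantime (e.g. the cache caught
        # up) is simply dropped from the suspect set.
        self._suspect_instances = newly_suspect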
@@ -339,7 +339,7 @@ class QuotaSupport:
         '''
         used_quota = QuotaInformation()
 
-        for node in self._zk.nodeIterator():
+        for node in self._zk.nodeIterator(cached_ids=True):
            if node.provider == self.provider.name:
                try:
                    if pool and not node.pool == pool.name:
@@ -742,7 +742,8 @@ class CleanupWorker(BaseCleanupWorker):
         self.log.debug('Cleaning up held nodes...')
 
         zk_conn = self._nodepool.getZK()
-        held_nodes = [n for n in zk_conn.nodeIterator() if n.state == zk.HOLD]
+        held_nodes = [n for n in zk_conn.nodeIterator(cached_ids=True)
+                      if n.state == zk.HOLD]
         for node in held_nodes:
             # Can't do anything if we aren't configured for this provider.
             if node.provider not in self._nodepool.config.providers:
@@ -120,7 +120,7 @@ class StatsReporter(object):
             key = 'nodepool.label.%s.nodes.%s' % (label, state)
             states[key] = 0
 
-        for node in zk_conn.nodeIterator():
+        for node in zk_conn.nodeIterator(cached_ids=True):
            # nodepool.nodes.STATE
            key = 'nodepool.nodes.%s' % node.state
            states[key] += 1