Add idle state to driver providers

This change adds an idle state to driver providers. The idle state
indicates that a provider should stop performing actions that are not
safe to perform while we bootstrap a second, newer version of the
provider to handle a config update.
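
The intended ordering during a config update is roughly the following
(a minimal sketch; old_provider, new_provider, and make_provider are
illustrative names, not actual nodepool APIs):

    # Hypothetical sketch of the idle/stop ordering; names are
    # placeholders, not real nodepool calls.
    old_provider.idle()    # old provider halts unsafe background work
    new_provider = make_provider(new_config)
    new_provider.start()   # new provider takes over the new config
    old_provider.stop()    # only now is the old provider torn down
    old_provider.join()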

This is particularly important for the static driver because it
manages all of its state internally to nodepool rather than relying on
external cloud systems to track resources. It is therefore important
that an old static provider object not update zookeeper at the same
time as a new provider object. This was previously possible and
created situations where the resources in zookeeper did not reflect
our local config.

Since all other drivers rely on external state, the primary update
here is to the static driver: we simply stop performing config
synchronization if the idle flag is set on a static provider. This
allows the new provider to take over and reflect the new config
consistently.
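
Concretely, this means a guard at the top of the static provider's
periodic synchronization entry points, as the diff below shows; for
example:

    def cleanupLeakedResources(self):
        # Once idled, leave the zookeeper records alone; the
        # replacement provider is now responsible for keeping them
        # in sync with the config.
        if self._idle:
            return
        # ... existing cleanup logic ...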

Note that we don't take other approaches that would essentially create
a system specific to the static driver, because we're trying to avoid
significantly modifying the nodepool runtime to fix a problem that is
specific to the static driver.

Change-Id: I93519d0c6f4ddf8a417d837f6ae12a30a55870bb
Clark Boylan 2022-10-24 15:06:49 -07:00
parent 6cfda7de66
commit 2a231a08c9
8 changed files with 40 additions and 0 deletions

@@ -195,6 +195,17 @@ class Provider(ProviderNotifications):
         """
         pass
 
+    @abc.abstractmethod
+    def idle(self):
+        """Idle the provider
+
+        This is called before stop(). Providers should use this as a signal
+        to idle themselves and stop performing any actions that may interfere
+        with a new version of this provider starting up.
+        """
+        pass
+
     @abc.abstractmethod
     def join(self):
         """Wait for provider to finish

@@ -56,6 +56,9 @@ class KubernetesProvider(Provider, QuotaSupport):
         self.log.debug("Stopping")
         self.ready = False
 
+    def idle(self):
+        pass
+
     def listNodes(self):
         servers = []

@@ -52,6 +52,9 @@ class OpenshiftProvider(Provider, QuotaSupport):
     def stop(self):
         self.log.debug("Stopping")
 
+    def idle(self):
+        pass
+
     def listNodes(self):
         servers = []

@@ -69,6 +69,9 @@ class OpenStackProvider(Provider, QuotaSupport):
         self.running = False
         self._server_list_watcher_stop_event.set()
 
+    def idle(self):
+        pass
+
     def join(self):
         self._server_list_watcher.join()

@@ -510,6 +510,9 @@ class StateMachineProvider(Provider, QuotaSupport):
         self.adapter.stop()
         self.log.debug("Stopped")
 
+    def idle(self):
+        pass
+
     def join(self):
         self.log.debug("Joining")
         if self.state_machine_thread:

@@ -59,6 +59,9 @@ class StaticNodeProvider(Provider, QuotaSupport):
         # multiple threads (e.g. cleanup and deleted node worker).
         self._register_lock = threading.Lock()
         self._node_slots = {}  # nodeTuple -> [node]
+        # Flag to indicate we need to stop processing state that could
+        # interfere with a newer version of ourselves running.
+        self._idle = False
 
     def _getSlot(self, node):
         return self._node_slots[nodeTuple(node)].index(node)
@@ -412,6 +415,9 @@ class StaticNodeProvider(Provider, QuotaSupport):
     def stop(self):
         self.log.debug("Stopping")
 
+    def idle(self):
+        self._idle = True
+
     def poolNodes(self):
         return {
             nodeTuple(n): n
@@ -437,6 +443,8 @@ class StaticNodeProvider(Provider, QuotaSupport):
         return True
 
     def cleanupLeakedResources(self):
+        if self._idle:
+            return
         with self._register_lock:
             self.getRegisteredNodes()
             for pool in self.provider.pools.values():
@@ -458,6 +466,9 @@ class StaticNodeProvider(Provider, QuotaSupport):
         '''
         Re-register the deleted node.
         '''
+        if self._idle:
+            return
+
         # It's possible a deleted node no longer exists in our config, so
         # don't bother to reregister.
         node_tuple = nodeTuple(node)

@@ -29,6 +29,9 @@ class TestProvider(Provider):
     def stop(self):
         pass
 
+    def idle(self):
+        pass
+
     def join(self):
         pass

@@ -50,6 +50,9 @@ class ProviderManager(object):
         if old_config:
             oldmanager = old_config.provider_managers.get(p.name)
         if oldmanager and p != oldmanager.provider:
+            # Signal that actions not safe to run on both the old and
+            # new providers while we synchronize should cease to run.
+            oldmanager.idle()
             stop_managers.append(oldmanager)
             oldmanager = None
         if oldmanager: