Implement cleanup of leaked resources in k8s driver

As with other drivers, clean up resources (i.e. k8s namespaces) that are
managed by nodepool (i.e. carry nodepool-specific metadata) but no longer
have a corresponding zk node. We consider those to be leaked nodes and
clean them up periodically.

Change-Id: I6f8122861a42845a18e4d376e58ee4a48aa1df3a
Benjamin Schanzel
2022-08-03 14:41:23 +02:00
parent 95b3d4c302
commit bda63ad8ca
3 changed files with 91 additions and 8 deletions
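
In short, the driver now labels every namespace it creates with the owning
provider, pool, and node id, and a new cleanupLeakedResources() pass deletes
namespaces whose node id is no longer known to ZooKeeper. A minimal sketch of
that pattern follows; provider, zk and provider_name are stand-ins for the
objects nodepool wires up itself, and the real implementation is in the diff
below.

# Sketch only: "provider", "zk" and "provider_name" stand in for the provider
# manager, ZooKeeper client and provider name that nodepool supplies itself.
def cleanup_leaked_namespaces(provider, zk, provider_name):
    for server in provider.listNodes():
        meta = server.get('metadata', {})
        # Skip namespaces that do not carry this provider's nodepool labels.
        if meta.get('nodepool_provider_name') != provider_name:
            continue
        # A namespace whose node record is gone from ZooKeeper has leaked.
        if not zk.getNode(meta['nodepool_node_id']):
            provider.cleanupNode(server.id)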


@@ -20,7 +20,7 @@ import time
 from kubernetes import client as k8s_client

-from nodepool import exceptions
+from nodepool import exceptions, stats
 from nodepool.driver import Provider
 from nodepool.driver.kubernetes import handler
 from nodepool.driver.utils import QuotaInformation, QuotaSupport
@@ -37,6 +37,7 @@ class KubernetesProvider(Provider, QuotaSupport):
         super().__init__()
         self.provider = provider
         self._zk = None
+        self._statsd = stats.get_client()
         self.ready = False
         _, _, self.k8s_client, self.rbac_client = get_client(
             self.log, provider.context, k8s_client.RbacAuthorizationV1Api)
@@ -59,7 +60,7 @@ class KubernetesProvider(Provider, QuotaSupport):
         servers = []

         class FakeServer:
-            def __init__(self, namespace, provider, valid_names):
+            def __init__(self, namespace, valid_names):
                 self.id = namespace.metadata.name
                 self.name = namespace.metadata.name
                 self.metadata = {}
@@ -70,8 +71,7 @@ class KubernetesProvider(Provider, QuotaSupport):
                 try:
                     # Make sure last component of name is an id
                     int(node_id)
-                    self.metadata['nodepool_provider_name'] = provider
-                    self.metadata['nodepool_node_id'] = node_id
+                    self.metadata = namespace.metadata.labels
                 except Exception:
                     # Probably not a managed namespace, let's skip metadata
                     pass
@@ -81,8 +81,7 @@ class KubernetesProvider(Provider, QuotaSupport):
         if self.ready:
             for namespace in self.k8s_client.list_namespace().items:
-                servers.append(FakeServer(
-                    namespace, self.provider.name, self.namespace_names))
+                servers.append(FakeServer(namespace, self.namespace_names))
         return servers

     def labelReady(self, name):
@@ -93,7 +92,36 @@ class KubernetesProvider(Provider, QuotaSupport):
         pass

     def cleanupLeakedResources(self):
-        pass
+        '''
+        Delete any leaked server instances.
+
+        Remove any servers found in this provider that are not recorded in
+        the ZooKeeper data.
+        '''
+
+        for server in self.listNodes():
+            meta = server.get('metadata', {})
+            if 'nodepool_provider_name' not in meta:
+                continue
+
+            if meta['nodepool_provider_name'] != self.provider.name:
+                # Another launcher, sharing this provider but configured
+                # with a different name, owns this.
+                continue
+
+            if not self._zk.getNode(meta['nodepool_node_id']):
+                self.log.warning(
+                    "Deleting leaked instance %s (%s) in %s "
+                    "(unknown node id %s)",
+                    server.name, server.id, self.provider.name,
+                    meta['nodepool_node_id']
+                )
+                self.cleanupNode(server.id)
+
+                if self._statsd:
+                    key = ('nodepool.provider.%s.leaked.nodes'
+                           % self.provider.name)
+                    self._statsd.incr(key)

     def startNodeCleanup(self, node):
         t = NodeDeleter(self._zk, self, node)
@@ -136,7 +164,11 @@ class KubernetesProvider(Provider, QuotaSupport):
             'kind': 'Namespace',
             'metadata': {
                 'name': namespace,
-                'nodepool_node_id': name
+                'labels': {
+                    'nodepool_node_id': node.id,
+                    'nodepool_provider_name': self.provider.name,
+                    'nodepool_pool_name': pool,
+                }
             }
         }
         proj = self.k8s_client.create_namespace(ns_body)
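
Because managed namespaces now carry nodepool_* labels, they can also be
identified straight from the Kubernetes API. The driver itself keeps iterating
over all namespaces via listNodes(), but as a rough illustration (not part of
this change; the provider name kubespray is taken from the test fixture
below), a label-selector query with the kubernetes Python client could look
like this:

from kubernetes import client, config

config.load_kube_config()  # or load_incluster_config() when running in-cluster
v1 = client.CoreV1Api()

# List only namespaces created by this nodepool provider.
namespaces = v1.list_namespace(
    label_selector='nodepool_provider_name=kubespray')
for ns in namespaces.items:
    print(ns.metadata.name, ns.metadata.labels.get('nodepool_node_id'))

When cleanupLeakedResources() deletes such a namespace, it also increments a
nodepool.provider.<provider-name>.leaked.nodes counter, provided a statsd
client is configured.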


@@ -0,0 +1,24 @@
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+zookeeper-tls:
+  ca: {zookeeper_ca}
+  cert: {zookeeper_cert}
+  key: {zookeeper_key}
+
+labels:
+  - name: pod-fedora
+    min-ready: 1
+
+providers:
+  - name: kubespray
+    driver: kubernetes
+    context: admin-cluster.local
+    pools:
+      - name: main
+        labels:
+          - name: pod-fedora
+            type: pod
+            image: docker.io/fedora:28
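
This fixture keeps one pod-fedora node min-ready, so the test below always has
a nodepool-managed namespace whose ZooKeeper record it can delete to simulate
a leak.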


@@ -40,6 +40,7 @@ class FakeCoreClient(object):
         class FakeNamespace:
             class metadata:
                 name = ns_body['metadata']['name']
+                labels = ns_body['metadata']['labels']

         self.namespaces.append(FakeNamespace)
         return FakeNamespace
@@ -223,6 +224,32 @@ class TestDriverKubernetes(tests.DBTestCase):
         self._test_kubernetes_quota(
             'kubernetes-tenant-quota-ram.yaml', pause=False)

+    def test_kubernetes_leaked_node(self):
+        conf = self.setup_config('kubernetes-leaked-node.yaml')
+        pool = self.useNodepool(conf, watermark_sleep=1)
+        pool.cleanup_interval = 1
+        pool.start()
+
+        # wait for min-ready node to be available
+        nodes = self.waitForNodes('pod-fedora')
+        self.assertEqual(len(nodes), 1)
+
+        manager = pool.getProviderManager('kubespray')
+        servers = manager.listNodes()
+        self.assertEqual(len(servers), 1)
+
+        # delete node from zk so it becomes 'leaked'
+        self.zk.deleteNode(nodes[0])
+
+        # node gets replaced, wait for that
+        new_nodes = self.waitForNodes('pod-fedora')
+        self.assertEqual(len(new_nodes), 1)
+
+        # original node should get deleted eventually
+        self.waitForInstanceDeletion(manager, nodes[0].external_id)
+
+        servers = manager.listNodes()
+        self.assertEqual(len(servers), 1)
+
     def _test_kubernetes_quota(self, config, pause=True):
         configfile = self.setup_config(config)
         pool = self.useNodepool(configfile, watermark_sleep=1)