Implement cleanup of leaked resources in k8s driver

As with other drivers, clean up resources (i.e. k8s namespaces) that are
managed by nodepool (they carry nodepool-specific metadata) but no longer
have a corresponding ZooKeeper node. We consider those resources leaked and
clean them up periodically.

Change-Id: I6f8122861a42845a18e4d376e58ee4a48aa1df3a
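In short, the rule added here reduces to a predicate over a namespace's
nodepool labels. A minimal sketch for orientation (`zk` and `is_leaked` are
illustrative names, not part of the change; the authoritative logic is
`cleanupLeakedResources` in the diff below):

```python
# Illustrative only: summarizes the leak check added in this commit.
def is_leaked(labels, provider_name, zk):
    # Namespaces without nodepool metadata are not ours to touch.
    if 'nodepool_provider_name' not in labels:
        return False
    # A different launcher sharing this cluster under another provider
    # name owns this namespace; leave it alone.
    if labels['nodepool_provider_name'] != provider_name:
        return False
    # Leaked: nodepool metadata exists, but the ZooKeeper node is gone.
    return not zk.getNode(labels['nodepool_node_id'])
```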
@@ -20,7 +20,7 @@ import time
 
 from kubernetes import client as k8s_client
 
-from nodepool import exceptions
+from nodepool import exceptions, stats
 from nodepool.driver import Provider
 from nodepool.driver.kubernetes import handler
 from nodepool.driver.utils import QuotaInformation, QuotaSupport

@@ -37,6 +37,7 @@ class KubernetesProvider(Provider, QuotaSupport):
         super().__init__()
         self.provider = provider
         self._zk = None
+        self._statsd = stats.get_client()
         self.ready = False
         _, _, self.k8s_client, self.rbac_client = get_client(
             self.log, provider.context, k8s_client.RbacAuthorizationV1Api)
@@ -59,7 +60,7 @@ class KubernetesProvider(Provider, QuotaSupport):
         servers = []
 
         class FakeServer:
-            def __init__(self, namespace, provider, valid_names):
+            def __init__(self, namespace, valid_names):
                 self.id = namespace.metadata.name
                 self.name = namespace.metadata.name
                 self.metadata = {}

@@ -70,8 +71,7 @@ class KubernetesProvider(Provider, QuotaSupport):
                 try:
                     # Make sure last component of name is an id
                     int(node_id)
-                    self.metadata['nodepool_provider_name'] = provider
-                    self.metadata['nodepool_node_id'] = node_id
+                    self.metadata = namespace.metadata.labels
                 except Exception:
                     # Probably not a managed namespace, let's skip metadata
                     pass

@@ -81,8 +81,7 @@ class KubernetesProvider(Provider, QuotaSupport):
 
         if self.ready:
             for namespace in self.k8s_client.list_namespace().items:
-                servers.append(FakeServer(
-                    namespace, self.provider.name, self.namespace_names))
+                servers.append(FakeServer(namespace, self.namespace_names))
         return servers
 
     def labelReady(self, name):

@@ -93,7 +92,36 @@ class KubernetesProvider(Provider, QuotaSupport):
         pass
 
     def cleanupLeakedResources(self):
-        pass
+        '''
+        Delete any leaked server instances.
+
+        Remove any servers found in this provider that are not recorded in
+        the ZooKeeper data.
+        '''
+
+        for server in self.listNodes():
+            meta = server.get('metadata', {})
+
+            if 'nodepool_provider_name' not in meta:
+                continue
+
+            if meta['nodepool_provider_name'] != self.provider.name:
+                # Another launcher, sharing this provider but configured
+                # with a different name, owns this.
+                continue
+
+            if not self._zk.getNode(meta['nodepool_node_id']):
+                self.log.warning(
+                    "Deleting leaked instance %s (%s) in %s "
+                    "(unknown node id %s)",
+                    server.name, server.id, self.provider.name,
+                    meta['nodepool_node_id']
+                )
+                self.cleanupNode(server.id)
+                if self._statsd:
+                    key = ('nodepool.provider.%s.leaked.nodes'
+                           % self.provider.name)
+                    self._statsd.incr(key)
 
     def startNodeCleanup(self, node):
         t = NodeDeleter(self._zk, self, node)
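The counter incremented above follows the per-provider naming other nodepool
drivers use for leak accounting: nodepool.provider.<provider-name>.leaked.nodes.
The `if self._statsd:` guard keeps the cleanup path working when statsd
reporting is not configured, in which case stats.get_client() returns no
client.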
@@ -136,7 +164,11 @@ class KubernetesProvider(Provider, QuotaSupport):
             'kind': 'Namespace',
             'metadata': {
                 'name': namespace,
-                'nodepool_node_id': name
+                'labels': {
+                    'nodepool_node_id': node.id,
+                    'nodepool_provider_name': self.provider.name,
+                    'nodepool_pool_name': pool,
+                }
             }
         }
         proj = self.k8s_client.create_namespace(ns_body)
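Because namespaces are now labeled at creation time, nodepool-managed
namespaces can also be inspected out-of-band. A hypothetical snippet using
the same kubernetes client library the driver imports (the provider name
`kubespray` is taken from the test fixture below and is not part of the
change):

```python
# Hypothetical operator snippet: list namespaces carrying the nodepool
# labels added above, filtered by provider name.
from kubernetes import client, config

config.load_kube_config()  # assumes a local kubeconfig is available
v1 = client.CoreV1Api()
namespaces = v1.list_namespace(
    label_selector='nodepool_provider_name=kubespray')
for ns in namespaces.items:
    labels = ns.metadata.labels or {}
    print(ns.metadata.name, labels.get('nodepool_node_id'))
```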
nodepool/tests/fixtures/kubernetes-leaked-node.yaml (new file, 24 lines)
@@ -0,0 +1,24 @@
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+zookeeper-tls:
+  ca: {zookeeper_ca}
+  cert: {zookeeper_cert}
+  key: {zookeeper_key}
+
+labels:
+  - name: pod-fedora
+    min-ready: 1
+
+providers:
+  - name: kubespray
+    driver: kubernetes
+    context: admin-cluster.local
+    pools:
+      - name: main
+        labels:
+          - name: pod-fedora
+            type: pod
+            image: docker.io/fedora:28
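The `{zookeeper_host}`-style fields are placeholders that the test harness
substitutes when the fixture is loaded. A minimal sketch of that kind of
substitution, assuming plain str.format templating (nodepool's actual
setup_config helper may differ in detail; all values here are made up):

```python
# Minimal sketch of fixture templating with illustrative values.
with open('nodepool/tests/fixtures/kubernetes-leaked-node.yaml') as f:
    template = f.read()

config = template.format(
    zookeeper_host='127.0.0.1',
    zookeeper_port=2181,
    zookeeper_chroot='/nodepool',
    zookeeper_ca='/tmp/ca.pem',
    zookeeper_cert='/tmp/cert.pem',
    zookeeper_key='/tmp/key.pem',
)
```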
@@ -40,6 +40,7 @@ class FakeCoreClient(object):
         class FakeNamespace:
             class metadata:
                 name = ns_body['metadata']['name']
+                labels = ns_body['metadata']['labels']
         self.namespaces.append(FakeNamespace)
         return FakeNamespace
 
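The added `labels` attribute keeps the fake client in step with the provider
change above: listNodes now reads node metadata from
namespace.metadata.labels, so the fake namespaces must record the labels
they were created with for the leak-cleanup test below to pass.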
@@ -223,6 +224,32 @@ class TestDriverKubernetes(tests.DBTestCase):
         self._test_kubernetes_quota(
             'kubernetes-tenant-quota-ram.yaml', pause=False)
 
+    def test_kubernetes_leaked_node(self):
+        conf = self.setup_config('kubernetes-leaked-node.yaml')
+        pool = self.useNodepool(conf, watermark_sleep=1)
+        pool.cleanup_interval = 1
+        pool.start()
+
+        # wait for min-ready node to be available
+        nodes = self.waitForNodes('pod-fedora')
+        self.assertEqual(len(nodes), 1)
+        manager = pool.getProviderManager('kubespray')
+        servers = manager.listNodes()
+        self.assertEqual(len(servers), 1)
+
+        # delete node from zk so it becomes 'leaked'
+        self.zk.deleteNode(nodes[0])
+
+        # node gets replaced, wait for that
+        new_nodes = self.waitForNodes('pod-fedora')
+        self.assertEqual(len(new_nodes), 1)
+
+        # original node should get deleted eventually
+        self.waitForInstanceDeletion(manager, nodes[0].external_id)
+
+        servers = manager.listNodes()
+        self.assertEqual(len(servers), 1)
+
     def _test_kubernetes_quota(self, config, pause=True):
         configfile = self.setup_config(config)
         pool = self.useNodepool(configfile, watermark_sleep=1)