From 5849c3b9a7ab4f4e91bbb839672616afffb8601c Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 1 Dec 2023 08:03:24 -0800 Subject: [PATCH] Fix gpu parameter type in openshiftpods In config validation, the gpu parameter type was specified as str rather than float. This is corrected. This was not discovered in testing because the only tests which use the gpu parameter for the other k8s drivers are not present in the openshiftpods driver. This change also adds the missing tests for the default resource and resource limits feature which exercises the gpu limits. Change-Id: Ife932acaeb5a90ebc94ad36c3b4615a4469f0c40 --- nodepool/driver/openshiftpods/config.py | 2 +- nodepool/driver/openshiftpods/handler.py | 3 + .../openshiftpods-default-limits.yaml | 40 ++++ .../openshiftpods-default-resources.yaml | 42 ++++ .../tests/unit/test_driver_openshiftpods.py | 220 ++++++++++++++++++ 5 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 nodepool/tests/fixtures/openshiftpods-default-limits.yaml create mode 100644 nodepool/tests/fixtures/openshiftpods-default-resources.yaml diff --git a/nodepool/driver/openshiftpods/config.py b/nodepool/driver/openshiftpods/config.py index 3c853bb72..2f25e71af 100644 --- a/nodepool/driver/openshiftpods/config.py +++ b/nodepool/driver/openshiftpods/config.py @@ -72,7 +72,7 @@ class OpenshiftPodsProviderConfig(OpenshiftProviderConfig): 'cpu-limit': int, 'memory-limit': int, 'storage-limit': int, - 'gpu': str, + 'gpu': float, 'gpu-resource': str, 'python-path': str, 'shell-type': str, diff --git a/nodepool/driver/openshiftpods/handler.py b/nodepool/driver/openshiftpods/handler.py index 67d59f1d2..222b21c9f 100644 --- a/nodepool/driver/openshiftpods/handler.py +++ b/nodepool/driver/openshiftpods/handler.py @@ -50,6 +50,9 @@ class OpenshiftPodLauncher(OpenshiftLauncher): 'user': 'zuul-worker', } self.node.connection_type = "kubectl" + pool = self.handler.provider.pools.get(self.node.pool) + self.node.resources = 
self.handler.manager.quotaNeededByLabel( + self.node.type[0], pool).get_resources() self.node.cloud = self.provider_config.context self.node.host_id = pod_node_id self.zk.storeNode(self.node) diff --git a/nodepool/tests/fixtures/openshiftpods-default-limits.yaml b/nodepool/tests/fixtures/openshiftpods-default-limits.yaml new file mode 100644 index 000000000..79b8f5e83 --- /dev/null +++ b/nodepool/tests/fixtures/openshiftpods-default-limits.yaml @@ -0,0 +1,40 @@ +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +zookeeper-tls: + ca: {zookeeper_ca} + cert: {zookeeper_cert} + key: {zookeeper_key} + +labels: + - name: pod-default + - name: pod-custom-cpu + - name: pod-custom-mem + - name: pod-custom-storage + +providers: + - name: openshift + driver: openshiftpods + context: admin-cluster.local + pools: + - name: main + default-label-cpu: 2 + default-label-memory: 1024 + default-label-storage: 10 + default-label-cpu-limit: 8 + default-label-memory-limit: 4196 + default-label-storage-limit: 40 + labels: + - name: pod-default + image: test + - name: pod-custom-cpu + image: test + cpu-limit: 4 + - name: pod-custom-mem + image: test + memory-limit: 2048 + - name: pod-custom-storage + image: test + storage-limit: 20 diff --git a/nodepool/tests/fixtures/openshiftpods-default-resources.yaml b/nodepool/tests/fixtures/openshiftpods-default-resources.yaml new file mode 100644 index 000000000..78bb123a5 --- /dev/null +++ b/nodepool/tests/fixtures/openshiftpods-default-resources.yaml @@ -0,0 +1,42 @@ +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +zookeeper-tls: + ca: {zookeeper_ca} + cert: {zookeeper_cert} + key: {zookeeper_key} + +labels: + - name: pod-default + - name: pod-custom-cpu + - name: pod-custom-mem + - name: pod-custom-storage + - name: pod-custom-gpu + +providers: + - name: openshift + driver: openshiftpods + context: admin-cluster.local + pools: + - name: 
main + default-label-cpu: 2 + default-label-memory: 1024 + default-label-storage: 10 + labels: + - name: pod-default + image: test + - name: pod-custom-cpu + image: test + cpu: 4 + - name: pod-custom-mem + image: test + memory: 2048 + - name: pod-custom-storage + image: test + storage: 20 + - name: pod-custom-gpu + image: test + gpu-resource: gpu-vendor.example/example-gpu + gpu: 0.5 diff --git a/nodepool/tests/unit/test_driver_openshiftpods.py b/nodepool/tests/unit/test_driver_openshiftpods.py index f85046792..68ec538f9 100644 --- a/nodepool/tests/unit/test_driver_openshiftpods.py +++ b/nodepool/tests/unit/test_driver_openshiftpods.py @@ -24,6 +24,7 @@ from nodepool.zk import zookeeper as zk class FakeCoreClient(object): def __init__(self): + self._pod_requests = [] self.pods = [] class FakeApi: @@ -41,6 +42,7 @@ class FakeCoreClient(object): class FakePod: class metadata: name = pod_body['metadata']['name'] + self._pod_requests.append((ns, pod_body)) self.pods.append(FakePod) return FakePod @@ -221,3 +223,221 @@ class TestDriverOpenshiftPods(tests.DBTestCase): def test_openshift_pod_tenant_quota_extra(self): self._test_openshift_pod_quota( 'openshiftpods-tenant-quota-extra.yaml', pause=False) + + def test_openshift_pod_default_label_resources(self): + configfile = self.setup_config('openshiftpods-default-resources.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.startPool(pool) + + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('pod-default') + req.node_types.append('pod-custom-cpu') + req.node_types.append('pod-custom-mem') + req.node_types.append('pod-custom-storage') + req.node_types.append('pod-custom-gpu') + self.zk.storeNodeRequest(req) + + self.log.debug("Waiting for request %s", req.id) + req = self.waitForNodeRequest(req) + self.assertEqual(req.state, zk.FULFILLED) + + self.assertNotEqual(req.nodes, []) + node_default = self.zk.getNode(req.nodes[0]) + node_cust_cpu = self.zk.getNode(req.nodes[1]) + 
node_cust_mem = self.zk.getNode(req.nodes[2]) + node_cust_storage = self.zk.getNode(req.nodes[3]) + node_cust_gpu = self.zk.getNode(req.nodes[4]) + + resources_default = { + 'instances': 1, + 'cores': 2, + 'ram': 1024, + 'ephemeral-storage': 10, + } + resources_cust_cpu = { + 'instances': 1, + 'cores': 4, + 'ram': 1024, + 'ephemeral-storage': 10, + } + resources_cust_mem = { + 'instances': 1, + 'cores': 2, + 'ram': 2048, + 'ephemeral-storage': 10, + } + resources_cust_storage = { + 'instances': 1, + 'cores': 2, + 'ram': 1024, + 'ephemeral-storage': 20, + } + resources_cust_gpu = { + 'instances': 1, + 'cores': 2, + 'ram': 1024, + 'ephemeral-storage': 10, + 'gpu-vendor.example/example-gpu': 0.5 + } + + self.assertDictEqual(resources_default, node_default.resources) + self.assertDictEqual(resources_cust_cpu, node_cust_cpu.resources) + self.assertDictEqual(resources_cust_mem, node_cust_mem.resources) + self.assertDictEqual(resources_cust_storage, + node_cust_storage.resources) + self.assertDictEqual(resources_cust_gpu, node_cust_gpu.resources) + + ns, pod = self.fake_k8s_client._pod_requests[0] + self.assertEqual(pod['spec']['containers'][0]['resources'], { + 'limits': { + 'cpu': 2, + 'ephemeral-storage': '10M', + 'memory': '1024Mi' + }, + 'requests': { + 'cpu': 2, + 'ephemeral-storage': '10M', + 'memory': '1024Mi' + }, + }) + + ns, pod = self.fake_k8s_client._pod_requests[1] + self.assertEqual(pod['spec']['containers'][0]['resources'], { + 'limits': { + 'cpu': 4, + 'ephemeral-storage': '10M', + 'memory': '1024Mi' + }, + 'requests': { + 'cpu': 4, + 'ephemeral-storage': '10M', + 'memory': '1024Mi' + }, + }) + + ns, pod = self.fake_k8s_client._pod_requests[2] + self.assertEqual(pod['spec']['containers'][0]['resources'], { + 'limits': { + 'cpu': 2, + 'ephemeral-storage': '10M', + 'memory': '2048Mi' + }, + 'requests': { + 'cpu': 2, + 'ephemeral-storage': '10M', + 'memory': '2048Mi' + }, + }) + + ns, pod = self.fake_k8s_client._pod_requests[3] + 
self.assertEqual(pod['spec']['containers'][0]['resources'], { + 'limits': { + 'cpu': 2, + 'ephemeral-storage': '20M', + 'memory': '1024Mi' + }, + 'requests': { + 'cpu': 2, + 'ephemeral-storage': '20M', + 'memory': '1024Mi' + }, + }) + + ns, pod = self.fake_k8s_client._pod_requests[4] + self.assertEqual(pod['spec']['containers'][0]['resources'], { + 'limits': { + 'cpu': 2, + 'ephemeral-storage': '10M', + 'memory': '1024Mi', + 'gpu-vendor.example/example-gpu': '0.50' + }, + 'requests': { + 'cpu': 2, + 'ephemeral-storage': '10M', + 'memory': '1024Mi', + 'gpu-vendor.example/example-gpu': '0.50' + }, + }) + + for node in (node_default, + node_cust_cpu, + node_cust_mem, + node_cust_gpu): + node.state = zk.DELETING + self.zk.storeNode(node) + self.waitForNodeDeletion(node) + + def test_openshift_pod_default_label_limits(self): + configfile = self.setup_config('openshiftpods-default-limits.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.startPool(pool) + + req = zk.NodeRequest() + req.state = zk.REQUESTED + req.node_types.append('pod-default') + req.node_types.append('pod-custom-cpu') + req.node_types.append('pod-custom-mem') + req.node_types.append('pod-custom-storage') + self.zk.storeNodeRequest(req) + + self.log.debug("Waiting for request %s", req.id) + req = self.waitForNodeRequest(req) + self.assertEqual(req.state, zk.FULFILLED) + self.assertNotEqual(req.nodes, []) + + ns, pod = self.fake_k8s_client._pod_requests[0] + self.assertEqual(pod['spec']['containers'][0]['resources'], { + 'limits': { + 'cpu': 8, + 'ephemeral-storage': '40M', + 'memory': '4196Mi' + }, + 'requests': { + 'cpu': 2, + 'ephemeral-storage': '10M', + 'memory': '1024Mi' + }, + }) + + ns, pod = self.fake_k8s_client._pod_requests[1] + self.assertEqual(pod['spec']['containers'][0]['resources'], { + 'limits': { + 'cpu': 4, + 'ephemeral-storage': '40M', + 'memory': '4196Mi' + }, + 'requests': { + 'cpu': 2, + 'ephemeral-storage': '10M', + 'memory': '1024Mi' + }, + }) + + ns, pod = 
self.fake_k8s_client._pod_requests[2] + self.assertEqual(pod['spec']['containers'][0]['resources'], { + 'limits': { + 'cpu': 8, + 'ephemeral-storage': '40M', + 'memory': '2048Mi' + }, + 'requests': { + 'cpu': 2, + 'ephemeral-storage': '10M', + 'memory': '1024Mi' + }, + }) + + ns, pod = self.fake_k8s_client._pod_requests[3] + self.assertEqual(pod['spec']['containers'][0]['resources'], { + 'limits': { + 'cpu': 8, + 'ephemeral-storage': '20M', + 'memory': '4196Mi' + }, + 'requests': { + 'cpu': 2, + 'ephemeral-storage': '10M', + 'memory': '1024Mi' + }, + })