Add gpu support for k8s/openshift pods

This adds the option to request GPUs for kubernetes and openshift pods.

Since the resource name depends on the GPU vendor and the cluster
installation, this option is left for the user to define it in the
node pool.
To leverage the ability of some schedulers to use fractional GPUs,
the actual GPU value is read as a string.

For GPUs, requests and limits cannot be decoupled (cf.
https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/),
so the same value will be used for requests and limits.

Change-Id: Ibe33b06c374a431f164080edb34c3a501c360df7
This commit is contained in:
mbecker 2023-02-14 15:56:03 +01:00
parent 669552f6f9
commit e56f5594e4
13 changed files with 138 additions and 0 deletions

View File

@ -298,6 +298,25 @@ Selecting the kubernetes driver adds the following options to the
label type; specifies the ephemeral-storage limit in
MB for the pod.
.. attr:: gpu
:type: str
Only used by the
:value:`providers.[kubernetes].pools.labels.type.pod`
label type; specifies the amount of gpu allocated to the pod.
This will be used to set both requests and limits to the same
value, based on how kubernetes assigns gpu resources:
https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/.
.. attr:: gpu-resource
:type: str
Only used by the
:value:`providers.[kubernetes].pools.labels.type.pod`
label type; specifies the custom schedulable resource
associated with the installed gpu that is available
in the cluster.
.. attr:: env
:type: list
:default: []

View File

@ -209,6 +209,25 @@ Selecting the openshift pods driver adds the following options to the
Specifies the ephemeral-storage limit in MB for the pod.
.. attr:: gpu
:type: str
Only used by the
:value:`providers.[openshiftpods].pools.labels.type.pod`
label type; specifies the amount of gpu allocated to the pod.
This will be used to set both requests and limits to the same
value, based on how kubernetes assigns gpu resources:
https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/.
.. attr:: gpu-resource
:type: str
Only used by the
:value:`providers.[openshiftpods].pools.labels.type.pod`
label type; specifies the custom schedulable resource
associated with the installed gpu that is available
in the cluster.
.. attr:: python-path
:type: str
:default: auto

View File

@ -296,6 +296,25 @@ Selecting the openshift driver adds the following options to the
label type; specifies the ephemeral-storage limit in
MB for the pod.
.. attr:: gpu
:type: str
Only used by the
:value:`providers.[openshift].pools.labels.type.pod`
label type; specifies the amount of gpu allocated to the pod.
This will be used to set both requests and limits to the same
value, based on how kubernetes assigns gpu resources:
https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/.
.. attr:: gpu-resource
:type: str
Only used by the
:value:`providers.[openshift].pools.labels.type.pod`
label type; specifies the custom schedulable resource
associated with the installed gpu that is available
in the cluster.
.. attr:: env
:type: list
:default: []

View File

@ -72,6 +72,8 @@ class KubernetesPool(ConfigPool):
'memory-limit', default_memory_limit)
pl.storage_limit = label.get(
'storage-limit', default_storage_limit)
pl.gpu = label.get('gpu')
pl.gpu_resource = label.get('gpu-resource')
pl.env = label.get('env', [])
pl.node_selector = label.get('node-selector')
pl.privileged = label.get('privileged')
@ -126,6 +128,8 @@ class KubernetesProviderConfig(ProviderConfig):
'cpu-limit': int,
'memory-limit': int,
'storage-limit': int,
'gpu': str,
'gpu-resource': str,
'env': [env_var],
'node-selector': dict,
'privileged': bool,

View File

@ -332,6 +332,9 @@ class KubernetesProvider(Provider, QuotaSupport):
limits['memory'] = '%dMi' % int(label.memory_limit)
if label.storage_limit:
limits['ephemeral-storage'] = '%dM' % int(label.storage_limit)
if label.gpu_resource and label.gpu:
requests[label.gpu_resource] = label.gpu
limits[label.gpu_resource] = label.gpu
resources = {}
if requests:
resources['requests'] = requests

View File

@ -70,6 +70,8 @@ class OpenshiftPool(ConfigPool):
'memory-limit', default_memory_limit)
pl.storage_limit = label.get(
'storage-limit', default_storage_limit)
pl.gpu = label.get('gpu')
pl.gpu_resource = label.get('gpu-resource')
pl.python_path = label.get('python-path', 'auto')
pl.shell_type = label.get('shell-type')
pl.env = label.get('env', [])
@ -126,6 +128,8 @@ class OpenshiftProviderConfig(ProviderConfig):
'cpu-limit': int,
'memory-limit': int,
'storage-limit': int,
'gpu': str,
'gpu-resource': str,
'python-path': str,
'shell-type': str,
'env': [env_var],

View File

@ -248,6 +248,9 @@ class OpenshiftProvider(Provider, QuotaSupport):
limits['memory'] = '%dMi' % int(label.memory_limit)
if label.storage_limit:
limits['ephemeral-storage'] = '%dM' % int(label.storage_limit)
if label.gpu_resource and label.gpu:
requests[label.gpu_resource] = label.gpu
limits[label.gpu_resource] = label.gpu
resources = {}
if requests:
resources['requests'] = requests

View File

@ -60,6 +60,8 @@ class OpenshiftPodsProviderConfig(OpenshiftProviderConfig):
'cpu-limit': int,
'memory-limit': int,
'storage-limit': int,
'gpu': str,
'gpu-resource': str,
'python-path': str,
'shell-type': str,
'env': [env_var],

View File

@ -35,3 +35,7 @@ providers:
- name: pod-custom-storage
type: pod
storage: 20
- name: pod-custom-gpu
type: pod
gpu-resource: gpu-vendor.example/example-gpu
gpu: '1'

View File

@ -35,3 +35,7 @@ providers:
- name: pod-custom-storage
type: pod
storage: 20
- name: pod-custom-gpu
type: pod
gpu-resource: gpu-vendor.example/example-gpu
gpu: '1'

View File

@ -259,6 +259,7 @@ class TestDriverKubernetes(tests.DBTestCase):
req.node_types.append('pod-custom-cpu')
req.node_types.append('pod-custom-mem')
req.node_types.append('pod-custom-storage')
req.node_types.append('pod-custom-gpu')
self.zk.storeNodeRequest(req)
self.log.debug("Waiting for request %s", req.id)
@ -270,6 +271,7 @@ class TestDriverKubernetes(tests.DBTestCase):
node_cust_cpu = self.zk.getNode(req.nodes[1])
node_cust_mem = self.zk.getNode(req.nodes[2])
node_cust_storage = self.zk.getNode(req.nodes[3])
node_cust_gpu = self.zk.getNode(req.nodes[4])
resources_default = {
'instances': 1,
@ -295,12 +297,20 @@ class TestDriverKubernetes(tests.DBTestCase):
'ram': 1024,
'ephemeral-storage': 20,
}
resources_cust_gpu = {
'instances': 1,
'cores': 2,
'ram': 1024,
'ephemeral-storage': 10,
'gpu': 1
}
self.assertDictEqual(resources_default, node_default.resources)
self.assertDictEqual(resources_cust_cpu, node_cust_cpu.resources)
self.assertDictEqual(resources_cust_mem, node_cust_mem.resources)
self.assertDictEqual(resources_cust_storage,
node_cust_storage.resources)
self.assertDictEqual(resources_cust_gpu, node_cust_gpu.resources)
ns, pod = self.fake_k8s_client._pod_requests[0]
self.assertEqual(pod['spec']['containers'][0]['resources'], {
@ -358,6 +368,22 @@ class TestDriverKubernetes(tests.DBTestCase):
},
})
ns, pod = self.fake_k8s_client._pod_requests[3]
self.assertEqual(pod['spec']['containers'][0]['resources'], {
'limits': {
'cpu': 2,
'ephemeral-storage': '10M',
'memory': '1024Mi',
'gpu-vendor.example/example-gpu': 1
},
'requests': {
'cpu': 2,
'ephemeral-storage': '10M',
'memory': '1024Mi',
'gpu-vendor.example/example-gpu': 1
},
})
for node in (node_default, node_cust_cpu, node_cust_mem):
node.state = zk.DELETING
self.zk.storeNode(node)

View File

@ -271,6 +271,7 @@ class TestDriverOpenshift(tests.DBTestCase):
req.node_types.append('pod-custom-cpu')
req.node_types.append('pod-custom-mem')
req.node_types.append('pod-custom-storage')
req.node_types.append('pod-custom-gpu')
self.zk.storeNodeRequest(req)
self.log.debug("Waiting for request %s", req.id)
@ -282,6 +283,7 @@ class TestDriverOpenshift(tests.DBTestCase):
node_cust_cpu = self.zk.getNode(req.nodes[1])
node_cust_mem = self.zk.getNode(req.nodes[2])
node_cust_storage = self.zk.getNode(req.nodes[3])
node_cust_gpu = self.zk.getNode(req.nodes[4])
resources_default = {
'instances': 1,
@ -307,12 +309,20 @@ class TestDriverOpenshift(tests.DBTestCase):
'ram': 1024,
'ephemeral-storage': 20,
}
resources_cust_gpu = {
'instances': 1,
'cores': 2,
'ram': 1024,
'ephemeral-storage': 20,
'gpu': 1,
}
self.assertDictEqual(resources_default, node_default.resources)
self.assertDictEqual(resources_cust_cpu, node_cust_cpu.resources)
self.assertDictEqual(resources_cust_mem, node_cust_mem.resources)
self.assertDictEqual(resources_cust_storage,
node_cust_storage.resources)
self.assertDictEqual(resources_cust_gpu, node_cust_gpu.resources)
ns, pod = self.fake_k8s_client._pod_requests[0]
self.assertEqual(pod['spec']['containers'][0]['resources'], {
@ -370,6 +380,22 @@ class TestDriverOpenshift(tests.DBTestCase):
},
})
ns, pod = self.fake_k8s_client._pod_requests[3]
self.assertEqual(pod['spec']['containers'][0]['resources'], {
'limits': {
'cpu': 2,
'ephemeral-storage': '10M',
'memory': '1024Mi',
'gpu-vendor.example/example-gpu': 1
},
'requests': {
'cpu': 2,
'ephemeral-storage': '10M',
'memory': '1024Mi',
'gpu-vendor.example/example-gpu': 1
},
})
for node in (node_default, node_cust_cpu, node_cust_mem):
node.state = zk.DELETING
self.zk.storeNode(node)

View File

@ -0,0 +1,5 @@
---
features:
- |
Add support for requesting gpu resources
in kubernetes and openshift drivers.