fabc6822a0
This upgrade is needed in support of the A100 GPU, a kernel upgrade,
and bug 1948050. It eliminates the requirement to create an
nvidia-specific runtimeclass prior to installing the charts, by
pre-installing the toolkit through the toolkit-installer subchart.

This commit has been tested with the following:
driver: 470.57.02
toolkit: 1.7.1-ubi8
defaultRuntime: containerd

Test Plan:
PASS: Verify gpu-operator starts and adds nvidia.com/gpu to the node.
PASS: Verify nvidia-toolkit is removed with helm override of
      global.toolkit_force_clean=true.
PASS: Verify pods can access the gpu device and nvidia tools to
      monitor the GPU.
PASS: Verify a pod can build and execute cuda sample code.
PASS: Verify the driver pod prints a warning when building on a Low
      Latency kernel with helm override of:
      --set driver.env[0].name=IGNORE_PREEMPT_RT_PRESENCE

Closes-Bug: 1948050
Signed-off-by: Babak Sarashki <babak.sarashki@windriver.com>
Change-Id: I18dd2a0ab1adc6f9364314a22373aadc93cad27f
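For reference, the overrides exercised in the test plan can be passed
at install time. A minimal sketch, assuming the bundled chart is
installed directly with helm (the release name, chart path, and the
driver.env[0].value are placeholders; the build script only checks
that the variable is set):

$ helm install gpu-operator ./deployments/gpu-operator \
    --set global.toolkit_force_clean=true \
    --set driver.env[0].name=IGNORE_PREEMPT_RT_PRESENCE \
    --set driver.env[0].value="1"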
From 65ac63ca1bc8517f3f0c3560498de758149a3800 Mon Sep 17 00:00:00 2001
From: Babak Sarashki <babak.sarashki@windriver.com>
Date: Sun, 7 Mar 2021 17:19:08 +0000
Subject: [PATCH] enablement: support on starlingx cloud platform

StarlingX is a cloud infrastructure software stack for the edge.
It has an immutable file system and system configuration. For
instance, changes made by the gpu-operator to set the containerd
runtime will be overridden and must be avoided.

This commit enables gpu-operator on StarlingX (starlingx.io).
The changes to the gpu-operator include bundling modified assets
and a modified version of the nvidia-driver build script with the
helm charts.

The modifications include host-mounting the kernel headers and
kernel build directory onto the respective mount points inside
the driver pod namespace; modifying the nvidia-driver script to
account for pre-installed kernel packages; and pre-installing
nvidia-toolkit version 1.7.1-ubi8. The defaultRuntime is expected
to be containerd.

To load the operator on starlingx:

$ source /etc/platform/openrc
[...(keystone_admin)]$ system service-parameter-add \
    platform container_runtime \
    custom_container_runtime=nvidia:/path/to/nvidia-container-runtime

[...(keystone_admin)]$ system host-lock 1; system host-unlock 1
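The toolkit-installer subchart is enabled by default in values.yaml,
so a plain chart install pre-installs the toolkit. A minimal sketch,
assuming the bundled chart is installed directly with helm (release
name is a placeholder):

$ helm install gpu-operator ./deployments/gpu-operator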
Signed-off-by: Babak Sarashki <babak.sarashki@windriver.com>
---
 assets/state-driver/0500_daemonset.yaml        |  47 ++-
 .../0500_daemonset.yaml                        |  18 ++
 deployments/gpu-operator/Chart.yaml            |   3 +
 .../charts/stx-toolkit-installer/.helmignore   |  23 ++
 .../charts/stx-toolkit-installer/Chart.yaml    |   6 +
 .../templates/_helpers.tpl                     |   6 +
 .../templates/toolkit.yaml                     |  71 +++++
 .../charts/stx-toolkit-installer/values.yaml   |   8 +
 .../templates/build_configmap.yaml             | 291 ++++++++++++++++++
 .../gpu-operator/templates/clusterpolicy.yaml  |   4 +-
 .../gpu-operator/templates/operator.yaml       |  52 +++-
 .../templates/operator_confimap.yaml           |  61 ++++
 deployments/gpu-operator/values.yaml           |  15 +-
 13 files changed, 583 insertions(+), 22 deletions(-)
 create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
 create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
 create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
 create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
 create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
 create mode 100644 deployments/gpu-operator/templates/build_configmap.yaml
 create mode 100644 deployments/gpu-operator/templates/operator_confimap.yaml

diff --git a/assets/state-driver/0500_daemonset.yaml b/assets/state-driver/0500_daemonset.yaml
index 4cd1617..c8aefd2 100644
--- a/assets/state-driver/0500_daemonset.yaml
+++ b/assets/state-driver/0500_daemonset.yaml
@@ -35,7 +35,6 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- # always use runc for driver containers
- name: NVIDIA_VISIBLE_DEVICES
value: void
securityContext:
@@ -72,8 +71,14 @@ spec:
- image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
name: nvidia-driver-ctr
- command: ["nvidia-driver"]
- args: ["init"]
+ command: ["/bin/bash"]
+ args:
+ - "-c"
+ - "--"
+ - >
+ tar -C /usr/host-include -c . -f - | tar -C /usr/include -xvf -;
+ ln -rfs /usr/lib64/libelf.so.1 /usr/lib/libelf.so;
+ /usr/local/bin/nvidia-driver init;
securityContext:
privileged: true
seLinuxOptions:
@@ -94,6 +99,22 @@ spec:
- name: run-mellanox-drivers
mountPath: /run/mellanox/drivers
mountPropagation: HostToContainer
+ - name: host-modules
+ mountPath: /lib/modules
+ readOnly: false
+ - name: host-include
+ mountPath: /usr/host-include
+ readOnly: false
+ - name: host-kernel-devel
+ mountPath: /usr/src/kernels
+ readOnly: true
+ - name: host-usr-src
+ mountPath: /usr/host-src
+ readOnly: false
+ - name: vol11
+ mountPath: /usr/local/bin/nvidia-driver
+ subPath: nvidia-driver-build-script
+ readOnly: true
- image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
name: nvidia-peermem-ctr
@@ -157,4 +178,22 @@ spec:
hostPath:
path: /run/nvidia/validations
type: DirectoryOrCreate
-
+ - name: host-modules
+ hostPath:
+ path: /lib/modules
+ - name: host-kernel-devel
+ hostPath:
+ path: /usr/src/kernels/
+ - name: host-include
+ hostPath:
+ path: /usr/include
+ - name: host-usr-src
+ hostPath:
+ path: /usr/src
+ - name: vol11
+ configMap:
+ name: nvidia-driver
+ defaultMode: 0777
+ items:
+ - key: nvidia-driver-build-script
+ path: nvidia-driver-build-script
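The host mounts above assume the kernel headers and build tree are
already installed on the host. A quick sanity check on the node before
enabling the chart (paths taken from the hostPath volumes in this
daemonset):

$ ls /lib/modules/$(uname -r)
$ ls /usr/src/kernels/
$ ls /usr/include/linux/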
diff --git a/assets/state-operator-validation/0500_daemonset.yaml b/assets/state-operator-validation/0500_daemonset.yaml
index 266c9d6..ce226fa 100644
--- a/assets/state-operator-validation/0500_daemonset.yaml
+++ b/assets/state-operator-validation/0500_daemonset.yaml
@@ -75,6 +75,10 @@ spec:
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: Bidirectional
+ - name: vol12
+ mountPath: /var/nvidia/manifests/cuda-workload-validation.yaml
+ subPath: cuda-workload-validation.yaml
+ readOnly: true
- name: plugin-validation
image: "FILLED_BY_OPERATOR"
command: ['sh', '-c']
@@ -98,6 +102,10 @@ spec:
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: Bidirectional
+ - name: vol12
+ mountPath: /var/nvidia/manifests/plugin-workload-validation.yaml
+ subPath: plugin-workload-validation.yaml
+ readOnly: true
containers:
- image: "FILLED_BY_OPERATOR"
name: nvidia-operator-validator
@@ -113,6 +121,7 @@ spec:
- name: run-nvidia-validations
mountPath: "/run/nvidia/validations"
mountPropagation: Bidirectional
+ terminationGracePeriodSeconds: 60
volumes:
- name: run-nvidia-validations
hostPath:
@@ -121,3 +130,12 @@ spec:
- name: driver-install-path
hostPath:
path: /run/nvidia/driver
+ - name: vol12
+ configMap:
+ name: nvidia-validator
+ defaultMode: 0444
+ items:
+ - key: cuda-workload-validation.yaml
+ path: cuda-workload-validation.yaml
+ - key: plugin-workload-validation.yaml
+ path: plugin-workload-validation.yaml
diff --git a/deployments/gpu-operator/Chart.yaml b/deployments/gpu-operator/Chart.yaml
index 0b379a3..7b743e4 100644
--- a/deployments/gpu-operator/Chart.yaml
+++ b/deployments/gpu-operator/Chart.yaml
@@ -22,3 +22,6 @@ dependencies:
version: 0.8.2
repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
condition: nfd.enabled
+ - name: stx-toolkit-installer
+ version: 0.1.0
+ condition: toolkit-installer.enabled
diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore b/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
new file mode 100644
index 0000000..0e8a0eb
--- /dev/null
+++ b/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
new file mode 100644
index 0000000..c195c58
--- /dev/null
+++ b/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+appVersion: 1.7.1-ubi8
+name: stx-toolkit-installer
+description: "Standalone nvidia toolkit installer for starlingx"
+type: application
+version: 0.1.0
diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
new file mode 100644
index 0000000..b6f6274
--- /dev/null
+++ b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
@@ -0,0 +1,6 @@
+{{/*
+Full image name with tag
+*/}}
+{{- define "toolkit-installer.fullimage" -}}
+{{- .Values.toolkit.repository -}}/{{- .Values.toolkit.image -}}:{{- .Values.toolkit.version | default .Chart.AppVersion -}}
+{{- end }}
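With the subchart defaults (repository: nvidia, image:
container-toolkit, version: 1.7.1-ubi8), this helper renders the image
reference nvidia/container-toolkit:1.7.1-ubi8. A dry-run render is one
way to confirm:

$ helm template deployments/gpu-operator | grep 'container-toolkit'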
diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
new file mode 100644
index 0000000..3cbec11
--- /dev/null
+++ b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
@@ -0,0 +1,71 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: toolkit-installer
+ namespace: kube-system
+ labels:
+ app.kubernetes.io/component: "toolkit-installer"
+ {{ $.Release.labels }}
+spec:
+ selector:
+ matchLabels:
+ {{ $.Release.labels }}
+ app.kubernetes.io/component: "toolkit-installer"
+ app: "toolkit-installer"
+ template:
+ metadata:
+ labels:
+ {{ $.Release.labels }}
+ app.kubernetes.io/component: "toolkit-installer"
+ app: "toolkit-installer"
+ spec:
+ containers:
+ - name: toolkit-daemon
+ image: {{ include "toolkit-installer.fullimage" . }}
+ lifecycle:
+ preStop:
+ exec:
+ command:
+ - "/bin/sh"
+ - "-c"
+ - "--"
+ - >
+ if [ "$toolkit_force_clean" = "true" ]; then
+ while [[ -f /var/run/nvidia/validations/cuda-ready ]] ||
+ [[ -f /var/run/nvidia/validations/driver-ready ]] ||
+ [[ -f /var/run/nvidia/validations/plugin-ready ]] ||
+ [[ -f /var/run/nvidia/validations/toolkit-ready ]] ;
+ do
+ echo "waiting for gpu pods to exit"
+ sleep 10;
+ done;
+ sleep 60;
+ rm -rf /usr/local/nvidia/toolkit;
+ fi;
+ command: ["/bin/bash"]
+ args:
+ - "-c"
+ - "--"
+ - >
+ ./toolkit install /usr/local/nvidia/toolkit;
+ sleep infinity;
+ env:
+ - name: toolkit_force_clean
+ value: {{ quote .Values.global.toolkit_force_clean }}
+ volumeMounts:
+ - name: toolkitdest
+ mountPath: /usr/local/nvidia
+ readOnly: false
+ - name: varrunnvidia
+ mountPath: /var/run/nvidia
+ readOnly: true
+ {{ if eq (.Values.global.toolkit_force_clean | toString) "true" }}
+ terminationGracePeriodSeconds: 120
+ {{- end }}
+ volumes:
+ - name: toolkitdest
+ hostPath:
+ path: /usr/local/nvidia
+ - name: varrunnvidia
+ hostPath:
+ path: /var/run/nvidia
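Per the test plan, the preStop cleanup above is driven by a helm
override. A minimal sketch, assuming the release was installed directly
with helm (release name is a placeholder):

$ helm upgrade gpu-operator ./deployments/gpu-operator --set global.toolkit_force_clean=true
$ helm delete gpu-operator  # preStop waits for the gpu validations to clear, then removes /usr/local/nvidia/toolkit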
diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
new file mode 100644
index 0000000..b898dc2
--- /dev/null
+++ b/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
@@ -0,0 +1,8 @@
+toolkit:
+ repository: nvidia
+ image: container-toolkit
+ version: 1.7.1-ubi8
+ imagePullPolicy: IfNotPresent
+ imagePullSecrets: []
+ priorityClassName: system-node-critical
+ defaultRuntime: containerd
diff --git a/deployments/gpu-operator/templates/build_configmap.yaml b/deployments/gpu-operator/templates/build_configmap.yaml
new file mode 100644
index 0000000..a7453a4
--- /dev/null
+++ b/deployments/gpu-operator/templates/build_configmap.yaml
@@ -0,0 +1,291 @@
+{{ if eq (.Values.operator.include_assets | toString) "True" }}
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: "gpu-operator-resources"
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: nvidia-driver
+ namespace: gpu-operator-resources
+data:
+ nvidia-driver-build-script: |
+ #! /bin/bash
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ # Copyright (c) 2021 Wind River Systems, Inc. SPDX-License-Identifier:
+ # Apache-2.0.
+ # This script is from: https://gitlab.com/nvidia/container-images/driver.
+ # It is modified and included under configmap for platforms that require
+ # pre-installed packages. Such platforms have the option to modify the
+ # entrypoint in 0500_daemonset.yaml, or the nvidia-driver script here for
+ # further customizations.
+
+ set -eu
+
+ RUN_DIR=/run/nvidia
+ PID_FILE=${RUN_DIR}/${0##*/}.pid
+ DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
+ KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
+ KERNEL_VERSION="$(uname -r)"
+
+ _install_tools() {
+ yum clean all
+ yum install -y centos-release-scl
+ yum install -y epel-release
+ yum install -y devtoolset-8-build devtoolset-8-binutils devtoolset-8-gcc devtoolset-8-make
+ }
+
+ # Load the kernel modules and start persistenced.
+ _load_driver() {
+ echo "Loading IPMI kernel module..."
+ modprobe ipmi_msghandler
+
+ echo "Loading NVIDIA driver kernel modules..."
+ modprobe -a nvidia nvidia-uvm nvidia-modeset
+
+ echo "Starting NVIDIA persistence daemon..."
+ nvidia-persistenced --persistence-mode
+ }
+
+ # Stop persistenced and unload the kernel modules if they are currently loaded.
+ _unload_driver() {
+ local rmmod_args=()
+ local nvidia_deps=0
+ local nvidia_refs=0
+ local nvidia_uvm_refs=0
+ local nvidia_modeset_refs=0
+
+ echo "Stopping NVIDIA persistence daemon..."
+ if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then
+ local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid)
+
+ kill -SIGTERM "${pid}"
+ for i in $(seq 1 10); do
+ kill -0 "${pid}" 2> /dev/null || break
+ sleep 0.1
+ done
+ if [ $i -eq 10 ]; then
+ echo "Could not stop NVIDIA persistence daemon" >&2
+ return 1
+ fi
+ fi
+
+ echo "Unloading NVIDIA driver kernel modules..."
+ if [ -f /sys/module/nvidia_modeset/refcnt ]; then
+ nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
+ rmmod_args+=("nvidia-modeset")
+ ((++nvidia_deps))
+ fi
+ if [ -f /sys/module/nvidia_uvm/refcnt ]; then
+ nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt)
+ rmmod_args+=("nvidia-uvm")
+ ((++nvidia_deps))
+ fi
+ if [ -f /sys/module/nvidia/refcnt ]; then
+ nvidia_refs=$(< /sys/module/nvidia/refcnt)
+ rmmod_args+=("nvidia")
+ fi
+ if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then
+ echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
+ return 1
+ fi
+
+ if [ ${#rmmod_args[@]} -gt 0 ]; then
+ rmmod ${rmmod_args[@]}
+ fi
+ return 0
+ }
+
+ # Link and install the kernel modules from a precompiled package using the nvidia-installer.
+ _install_driver() {
+ local install_args=()
+
+ # Default is standard kernel.
+ if [ ! -z ${IGNORE_PREEMPT_RT_PRESENCE+x} ] ; then
+ echo "WARN: IGNORE_PREEMPT_RT_PRESENCE set"
+ echo "Build Target PREEMPT_RT best effort"
+ fi;
+
+ _install_tools
+ export PATH=/opt/rh/devtoolset-8/root/usr/bin${PATH:+:${PATH}}
+ export PCP_DIR=/opt/rh/devtoolset-8/root
+
+ echo "Installing NVIDIA driver kernel modules..."
+ cd /usr/src/nvidia-${DRIVER_VERSION}
+ # rm -rf /lib/modules/${KERNEL_VERSION}/video
+
+ if [ "${ACCEPT_LICENSE}" = "yes" ]; then
+ install_args+=("--accept-license")
+ fi
+ nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"}
+ }
+
+ # Mount the driver rootfs into the run directory with the exception of sysfs.
+ _mount_rootfs() {
+ echo "Mounting NVIDIA driver rootfs..."
+ mount --make-runbindable /sys
+ mount --make-private /sys
+ mkdir -p ${RUN_DIR}/driver
+ mount --rbind / ${RUN_DIR}/driver
+ }
+
+ # Unmount the driver rootfs from the run directory.
+ _unmount_rootfs() {
+ echo "Unmounting NVIDIA driver rootfs..."
+ if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then
+ umount -l -R ${RUN_DIR}/driver
+ fi
+ }
+
+ init() {
+ echo -e "\n========== NVIDIA Software Installer ==========\n"
+ echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
+
+ exec 3> ${PID_FILE}
+ if ! flock -n 3; then
+ echo "An instance of the NVIDIA driver is already running, aborting"
+ exit 1
+ fi
+ echo $$ >&3
+
+ trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
+ trap "_shutdown" EXIT
+
+ _unload_driver || exit 1
+ _unmount_rootfs
+
+ (
+ [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia.ko ] ||
+ [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia-uvm.ko ] ||
+ [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia-modeset.ko ]
+ ) && _install_driver
+
+ _load_driver
+ _mount_rootfs
+
+ echo "Done, now waiting for signal"
+ sleep infinity &
+ trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
+ trap - EXIT
+ while true; do wait $! || continue; done
+ exit 0
+ }
+
+ usage() {
+ cat >&2 <<EOF
+ Usage: $0 COMMAND [ARG...]
+
+ Commands:
+ init [-a | --accept-license]
+ EOF
+ exit 1
+ }
+
+ if [ $# -eq 0 ]; then
+ usage
+ fi
+ command=$1; shift
+ case "${command}" in
+ init) options=$(getopt -l accept-license -o a -- "$@") ;;
+ *) usage ;;
+ esac
+ if [ $? -ne 0 ]; then
+ usage
+ fi
+ eval set -- "${options}"
+
+ ACCEPT_LICENSE=""
+ KERNEL_VERSION=$(uname -r)
+ PRIVATE_KEY=""
+ PACKAGE_TAG=""
+
+ for opt in ${options}; do
+ case "$opt" in
+ -a | --accept-license) ACCEPT_LICENSE="yes"; shift 1 ;;
+ --) shift; break ;;
+ esac
+ done
+ if [ $# -ne 0 ]; then
+ usage
+ fi
+ $command;
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: nvidia-validator
+ namespace: gpu-operator-resources
+data:
+ cuda-workload-validation.yaml: |
+ apiVersion: v1
+ kind: Pod
+ metadata:
+ labels:
+ app: nvidia-cuda-validator
+ generateName: nvidia-cuda-validator-
+ namespace: gpu-operator-resources
+ spec:
+ tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+ readOnlyRootFilesystem: true
+ restartPolicy: OnFailure
+ serviceAccount: nvidia-operator-validator
+ runtimeClassName: nvidia
+ initContainers:
+ - name: cuda-validation
+ image: "FILLED_BY_VALIDATOR"
+ imagePullPolicy: IfNotPresent
+ command: ['sh', '-c']
+ args: ["vectorAdd"]
+ securityContext:
+ allowPrivilegeEscalation: false
+ containers:
+ - name: nvidia-cuda-validator
+ image: "FILLED_BY_VALIDATOR"
+ imagePullPolicy: IfNotPresent
+ # override command and args as validation is already done by initContainer
+ command: ['sh', '-c']
+ args: ["echo cuda workload validation is successful"]
+ securityContext:
+ allowPrivilegeEscalation: false
+ plugin-workload-validation.yaml: |
+ apiVersion: v1
+ kind: Pod
+ metadata:
+ labels:
+ app: nvidia-device-plugin-validator
+ generateName: nvidia-device-plugin-validator-
+ namespace: gpu-operator-resources
+ spec:
+ tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+ readOnlyRootFilesystem: true
+ restartPolicy: OnFailure
+ serviceAccount: nvidia-operator-validator
+ runtimeClassName: nvidia
+ initContainers:
+ - name: plugin-validation
+ image: "FILLED_BY_VALIDATOR"
+ imagePullPolicy: IfNotPresent
+ command: ['sh', '-c']
+ args: ["vectorAdd"]
+ securityContext:
+ allowPrivilegeEscalation: false
+ resources:
+ limits:
+ "FILLED_BY_VALIDATOR": 1
+ containers:
+ - name: nvidia-device-plugin-validator
+ image: "FILLED_BY_VALIDATOR"
+ imagePullPolicy: IfNotPresent
+ # override command and args as validation is already done by initContainer
+ command: ['sh', '-c']
+ args: ["echo device-plugin workload validation is successful"]
+ securityContext:
+ allowPrivilegeEscalation: false
+{{- end }}
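Once rendered with include_assets enabled, the two configmaps above
land in the gpu-operator-resources namespace. A quick check:

$ kubectl -n gpu-operator-resources get configmap nvidia-driver nvidia-validator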
diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml
index c819a2e..a33cffb 100644
--- a/deployments/gpu-operator/templates/clusterpolicy.yaml
+++ b/deployments/gpu-operator/templates/clusterpolicy.yaml
@@ -152,7 +152,7 @@ spec:
args: {{ toYaml .Values.driver.args | nindent 6 }}
{{- end }}
toolkit:
- enabled: {{ .Values.toolkit.enabled }}
+ enabled: false
{{- if .Values.toolkit.repository }}
repository: {{ .Values.toolkit.repository }}
{{- end }}
@@ -354,4 +354,4 @@ spec:
{{- end }}
{{- if .Values.nodeStatusExporter.args }}
args: {{ toYaml .Values.nodeStatusExporter.args | nindent 6 }}
- {{- end }}
\ No newline at end of file
+ {{- end }}
diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml
index c97b4b1..32234d8 100644
--- a/deployments/gpu-operator/templates/operator.yaml
+++ b/deployments/gpu-operator/templates/operator.yaml
@@ -50,29 +50,41 @@ spec:
mountPath: "/host-etc/os-release"
readOnly: true

- {{- if eq .Values.operator.include_assets "include_assets" }}
+ {{ if eq (.Values.operator.include_assets | toString) "True" }}
{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
- name: assets
mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }}
subPath: {{ printf "gfd_%s" (base $path) }}
{{- end }}

+ {{- range $path, $_ := .Files.Glob "assets/pre-requisites/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/pre-requisites/%s" (base $path) }}
+ subPath: {{ printf "pre_requisites_%s" (base $path) }}
+ {{- end }}
+
{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
- name: assets
mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }}
subPath: {{ printf "state_container_toolkit_%s" (base $path) }}
{{- end }}

- {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
+ {{- range $path, $_ := .Files.Glob "assets/state-dcgm-exporter/*" }}
- name: assets
- mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }}
- subPath: {{ printf "state_device_%s" (base $path) }}
+ mountPath: {{ printf "/opt/gpu-operator/state-dcgm-exporter/%s" (base $path) }}
+ subPath: {{ printf "state_dcgm_exporter_%s" (base $path) }}
{{- end }}

- {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
+ {{- range $path, $_ := .Files.Glob "assets/state-dcgm/*" }}
- name: assets
- mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }}
- subPath: {{ printf "state_device_validation_%s" (base $path) }}
+ mountPath: {{ printf "/opt/gpu-operator/state-dcgm/%s" (base $path) }}
+ subPath: {{ printf "state_dcgm_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }}
+ subPath: {{ printf "state_device_plugin_%s" (base $path) }}
{{- end }}

{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
@@ -81,10 +93,28 @@ spec:
subPath: {{ printf "state_driver_%s" (base $path) }}
{{- end }}

- {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
+ {{- range $path, $_ := .Files.Glob "assets/state-mig-manager/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-mig-manager/%s" (base $path) }}
+ subPath: {{ printf "state_mig_manager_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-node-status-exporter/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-node-status-exporter/%s" (base $path) }}
+ subPath: {{ printf "state_node_status_exporter_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-operator-metrics/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-operator-metrics/%s" (base $path) }}
+ subPath: {{ printf "state_operator_metrics_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-operator-validation/*" }}
- name: assets
- mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }}
- subPath: {{ printf "state_monitor_%s" (base $path) }}
+ mountPath: {{ printf "/opt/gpu-operator/state-operator-validation/%s" (base $path) }}
+ subPath: {{ printf "state_operator_validation_%s" (base $path) }}
{{- end }}
{{- end }}
livenessProbe:
@@ -110,7 +140,7 @@ spec:
- name: host-os-release
hostPath:
path: "/etc/os-release"
- {{- if eq .Values.operator.include_assets "include_assets" }}
+ {{ if eq (.Values.operator.include_assets | toString) "True" }}
- name: assets
configMap:
name: operator-configmap
diff --git a/deployments/gpu-operator/templates/operator_confimap.yaml b/deployments/gpu-operator/templates/operator_confimap.yaml
new file mode 100644
index 0000000..6303960
--- /dev/null
+++ b/deployments/gpu-operator/templates/operator_confimap.yaml
@@ -0,0 +1,61 @@
+{{ if eq (.Values.operator.include_assets | toString) "True" }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: operator-configmap
+data:
+{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
+{{ printf "gfd_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/pre-requisites/*" }}
+{{ printf "pre_requisites_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
+{{ printf "state_container_toolkit_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-dcgm-exporter/*" }}
+{{ printf "state_dcgm_exporter_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-dcgm/*" }}
+{{ printf "state_dcgm_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
+{{ printf "state_device_plugin_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
+{{ printf "state_driver_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-mig-manager/*" }}
+{{ printf "state_mig_manager_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-node-status-exporter/*" }}
+{{ printf "state_node_status_exporter_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-operator-metrics/*" }}
+{{ printf "state_operator_metrics_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-operator-validation/*" }}
+{{ printf "state_operator_validation_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+{{- end }}
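A dry-run render is one way to confirm the asset files get inlined into
the operator configmap (include_assets defaults to "True" in
values.yaml below):

$ helm template deployments/gpu-operator | grep -A2 'name: operator-configmap'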
diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
index 6689636..e8157a1 100644
--- a/deployments/gpu-operator/values.yaml
+++ b/deployments/gpu-operator/values.yaml
@@ -11,6 +11,9 @@ nfd:
psp:
enabled: false

+toolkit-installer:
+ enabled: true
+
daemonsets:
priorityClassName: system-node-critical
tolerations:
@@ -45,7 +48,7 @@ operator:
imagePullPolicy: IfNotPresent
imagePullSecrets: []
priorityClassName: system-node-critical
- defaultRuntime: docker
+ defaultRuntime: containerd
runtimeClass: nvidia
initContainer:
image: cuda
@@ -70,8 +73,7 @@ operator:
values: [""]
logging:
timeEncoding: epoch
- # Set "include_assets" true to include assets/gpu-operator with the helm chart
- include_assets: ""
+ include_assets: "True"
resources:
limits:
cpu: 500m
@@ -127,10 +129,10 @@ driver:
config: ""

toolkit:
- enabled: true
+ enabled: false
repository: nvcr.io/nvidia/k8s
image: container-toolkit
- version: 1.6.0-ubuntu18.04
+ version: 1.7.1-ubi8
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
@@ -255,3 +257,6 @@ node-feature-discovery:

serviceAccount:
name: node-feature-discovery
+
+global:
+ toolkit_force_clean: false
--
2.17.1