From fd5d9e694ba05021582c9250364c93d1c6a11f9c Mon Sep 17 00:00:00 2001
From: Andre Fernando Zanella Kantek
Date: Wed, 27 Jul 2022 10:56:43 -0300
Subject: [PATCH] Debian: Add package gpu-operator

This change adds the gpu-operator package to the Debian build. The
NVIDIA GPU Operator uses the operator framework within Kubernetes to
automate the management of all NVIDIA software components needed to
provision GPUs.

The provided patches come from the CentOS port done in
https://review.opendev.org/c/starlingx/integ/+/784144
https://review.opendev.org/c/starlingx/integ/+/817725

Test plan (Debian only)
PASS build ISO with the package installed
PASS execute helm install
PASS execute helm uninstall

Story: 2009968
Task: 45976

Signed-off-by: Andre Fernando Zanella Kantek
Change-Id: Ic656d764dc3e31dcd89e02b172c14eb6d32743a7
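
Editor's note: the helm install/uninstall steps from the test plan can be
reproduced roughly as below. Only the chart path comes from this change
(see gpu-operator.install further down); the release name is an
illustrative assumption, not part of the change.

    # Sketch of the test-plan steps, assuming the package is installed
    # on the host and delivered the chart under /opt/extracharts:
    helm install gpu-operator /opt/extracharts/gpu-operator-v3-1.8.1.tgz
    helm status gpu-operator      # verify the release deployed
    helm uninstall gpu-operator   # clean up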
---
 debian_pkg_dirs                               |   1 +
 gpu/gpu-operator/debian/deb_folder/changelog  |   5 +
 gpu/gpu-operator/debian/deb_folder/control    |  14 +
 gpu/gpu-operator/debian/deb_folder/copyright  |  29 +
 .../debian/deb_folder/gpu-operator.install    |   1 +
 gpu/gpu-operator/debian/deb_folder/rules      |  23 +
 gpu/gpu-operator/debian/meta_data.yaml        |  11 +
 ...p-configmap-with-assets-for-volumemo.patch | 136 +++
 ...-support-on-starlingx-cloud-platform.patch | 867 ++++++++++++++++++
 gpu/gpu-operator/debian/patches/series        |   2 +
 10 files changed, 1089 insertions(+)
 create mode 100644 gpu/gpu-operator/debian/deb_folder/changelog
 create mode 100644 gpu/gpu-operator/debian/deb_folder/control
 create mode 100644 gpu/gpu-operator/debian/deb_folder/copyright
 create mode 100644 gpu/gpu-operator/debian/deb_folder/gpu-operator.install
 create mode 100644 gpu/gpu-operator/debian/deb_folder/rules
 create mode 100644 gpu/gpu-operator/debian/meta_data.yaml
 create mode 100644 gpu/gpu-operator/debian/patches/deployments-setup-configmap-with-assets-for-volumemo.patch
 create mode 100644 gpu/gpu-operator/debian/patches/enablement-support-on-starlingx-cloud-platform.patch
 create mode 100644 gpu/gpu-operator/debian/patches/series

diff --git a/debian_pkg_dirs b/debian_pkg_dirs
index 449fb94de..de44351d9 100644
--- a/debian_pkg_dirs
+++ b/debian_pkg_dirs
@@ -45,6 +45,7 @@ golang-github-dev/golang-github-cilium-ebpf-dev
 golang-github-dev/golang-github-coreos-go-systemd-dev
 golang-github-dev/golang-github-opencontainers-specs-dev
 golang-github-dev/golang-github-vishvananda-netlink
+gpu/gpu-operator
 grub/grub2
 grub/grubby
 kubernetes/armada
diff --git a/gpu/gpu-operator/debian/deb_folder/changelog b/gpu/gpu-operator/debian/deb_folder/changelog
new file mode 100644
index 000000000..0fa199f69
--- /dev/null
+++ b/gpu/gpu-operator/debian/deb_folder/changelog
@@ -0,0 +1,5 @@
+gpu-operator (1.8.1) unstable; urgency=medium
+
+  * Initial release.
+
+ -- Andre Kantek  Thu, 27 Jul 2022 14:00:42 +0000
diff --git a/gpu/gpu-operator/debian/deb_folder/control b/gpu/gpu-operator/debian/deb_folder/control
new file mode 100644
index 000000000..5172d093e
--- /dev/null
+++ b/gpu/gpu-operator/debian/deb_folder/control
@@ -0,0 +1,14 @@
+Source: gpu-operator
+Section: admin
+Priority: optional
+Maintainer: StarlingX Developers
+Build-Depends: debhelper-compat (= 13), helm
+Standards-Version: 4.5.1
+Homepage: https://www.starlingx.io
+
+Package: gpu-operator
+Architecture: any
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: The NVIDIA GPU Operator uses the operator framework within
+ Kubernetes to automate the management of all NVIDIA software components
+ needed to provision GPUs
diff --git a/gpu/gpu-operator/debian/deb_folder/copyright b/gpu/gpu-operator/debian/deb_folder/copyright
new file mode 100644
index 000000000..7db1ad9a0
--- /dev/null
+++ b/gpu/gpu-operator/debian/deb_folder/copyright
@@ -0,0 +1,29 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+
+Upstream-Contact: https://github.com/NVIDIA/gpu-operator/
+Source: https://github.com/NVIDIA/gpu-operator/
+Files: *
+Copyright: (C) 2018-2022 https://github.com/NVIDIA/gpu-operator/
+License: Apache-2
+
+Upstream-Name: gpu-operator
+Upstream-Contact: StarlingX Developers
+Source: https://opendev.org/starlingx/integ/src/branch/master/gpu/gpu-operator/
+Files: debian/*
+Copyright: (c) 2022 Wind River Systems, Inc.
+License: Apache-2
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ .
+ http://www.apache.org/licenses/LICENSE-2.0
+ .
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ .
+ On Debian-based systems the full text of the Apache version 2.0 license
+ can be found in `/usr/share/common-licenses/Apache-2.0'.
+
diff --git a/gpu/gpu-operator/debian/deb_folder/gpu-operator.install b/gpu/gpu-operator/debian/deb_folder/gpu-operator.install
new file mode 100644
index 000000000..6fb208476
--- /dev/null
+++ b/gpu/gpu-operator/debian/deb_folder/gpu-operator.install
@@ -0,0 +1 @@
+opt/extracharts/gpu-operator-v3-1.8.1.tgz
\ No newline at end of file
diff --git a/gpu/gpu-operator/debian/deb_folder/rules b/gpu/gpu-operator/debian/deb_folder/rules
new file mode 100644
index 000000000..ea8fba3d2
--- /dev/null
+++ b/gpu/gpu-operator/debian/deb_folder/rules
@@ -0,0 +1,23 @@
+#!/usr/bin/make -f
+
+export HELM_VER = v3
+export PKG_VERSION = 1.8.1
+export DEBIAN_DESTDIR := $(CURDIR)/debian/tmp
+
+%:
+	dh $@
+
+override_dh_auto_build:
+	mkdir -p deployments/gpu-operator/assets/state-driver/
+	mkdir -p deployments/gpu-operator/assets/state-operator-validation/
+	cp assets/state-driver/0500_daemonset.yaml deployments/gpu-operator/assets/state-driver/0500_daemonset.yaml
+	cp assets/state-operator-validation/0500_daemonset.yaml deployments/gpu-operator/assets/state-operator-validation/0500_daemonset.yaml
+	helm lint deployments/gpu-operator
+	mkdir build_results
+	helm package --version ${HELM_VER}-${PKG_VERSION} --app-version v${PKG_VERSION} -d build_results deployments/gpu-operator
+
+override_dh_auto_install:
+	# Install the app tar file.
+	install -d -m 755 ${DEBIAN_DESTDIR}/opt/extracharts
+	install -p -D -m 644 build_results/gpu-operator-${HELM_VER}-${PKG_VERSION}.tgz ${DEBIAN_DESTDIR}/opt/extracharts
+	dh_install
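
Editor's note: the composite chart version assembled above (v3-1.8.1) is
what the gpu-operator.install entry matches. A quick local check of the
packaged artifact, assuming helm is on PATH, would be:

    # Inspect the chart produced by override_dh_auto_build; it should
    # report version v3-1.8.1 and appVersion v1.8.1:
    helm show chart build_results/gpu-operator-v3-1.8.1.tgz | grep -E '^(version|appVersion):'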
diff --git a/gpu/gpu-operator/debian/meta_data.yaml b/gpu/gpu-operator/debian/meta_data.yaml
new file mode 100644
index 000000000..a6974e191
--- /dev/null
+++ b/gpu/gpu-operator/debian/meta_data.yaml
@@ -0,0 +1,11 @@
+---
+debname: gpu-operator
+debver: 1.8.1
+dl_path:
+  name: gpu-operator-v1.8.1.tar.gz
+  url: https://github.com/NVIDIA/gpu-operator/archive/refs/tags/v1.8.1.tar.gz
+  md5sum: 03c7346c724774ecd63d33ba7d8e110a
+  sha256sum: 42e08c95ce5b558a296cb31c98a6beeef3b551d47d236fa082db7fa5c44ad471
+revision:
+  dist: $STX_DIST
+  PKG_GITREVCOUNT: true
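
Editor's note: meta_data.yaml pins the upstream tarball by both md5 and
sha256. When bumping debver, the sums can be verified or regenerated as
sketched below; these commands are illustrative and not part of the build:

    # Fetch the upstream tarball named by dl_path and compare checksums:
    curl -L -o gpu-operator-v1.8.1.tar.gz \
        https://github.com/NVIDIA/gpu-operator/archive/refs/tags/v1.8.1.tar.gz
    md5sum gpu-operator-v1.8.1.tar.gz     # expect 03c7346c724774ecd63d33ba7d8e110a
    sha256sum gpu-operator-v1.8.1.tar.gz  # expect 42e08c95ce5b558a296cb31c98a6beeef3b551d47d236fa082db7fa5c44ad471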
diff --git a/gpu/gpu-operator/debian/patches/deployments-setup-configmap-with-assets-for-volumemo.patch b/gpu/gpu-operator/debian/patches/deployments-setup-configmap-with-assets-for-volumemo.patch
new file mode 100644
index 000000000..2215c093c
--- /dev/null
+++ b/gpu/gpu-operator/debian/patches/deployments-setup-configmap-with-assets-for-volumemo.patch
@@ -0,0 +1,136 @@
+From 1094b6f1593ec454b3a6313ecf9fae53f8c66899 Mon Sep 17 00:00:00 2001
+From: Babak Sarashki
+Date: Sat, 6 Mar 2021 00:22:40 +0000
+Subject: [PATCH 1/2] deployments: setup configmap with assets for volumemounts
+
+This feature allows inclusion of assets/ in the helm chart and their
+export to the gpu-operator pod through configmap volumeMounts.
+
+Signed-off-by: Babak Sarashki
+---
+ .../gpu-operator/templates/operator.yaml     | 44 +++++++++++++++++++
+ .../templates/operator_configmap.yaml        | 36 +++++++++++++++
+ deployments/gpu-operator/values.yaml         |  2 +
+ 3 files changed, 82 insertions(+)
+ create mode 100644 deployments/gpu-operator/templates/operator_configmap.yaml
+
+diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml
+index 1d81f74..c97b4b1 100644
+--- a/deployments/gpu-operator/templates/operator.yaml
++++ b/deployments/gpu-operator/templates/operator.yaml
+@@ -49,6 +49,44 @@ spec:
+         - name: host-os-release
+           mountPath: "/host-etc/os-release"
+           readOnly: true
++
++        {{- if eq .Values.operator.include_assets "include_assets" }}
++        {{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
++        - name: assets
++          mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }}
++          subPath: {{ printf "gfd_%s" (base $path) }}
++        {{- end }}
++
++        {{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
++        - name: assets
++          mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }}
++          subPath: {{ printf "state_container_toolkit_%s" (base $path) }}
++        {{- end }}
++
++        {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
++        - name: assets
++          mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }}
++          subPath: {{ printf "state_device_%s" (base $path) }}
++        {{- end }}
++
++        {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
++        - name: assets
++          mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }}
++          subPath: {{ printf "state_device_validation_%s" (base $path) }}
++        {{- end }}
++
++        {{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
++        - name: assets
++          mountPath: {{ printf "/opt/gpu-operator/state-driver/%s" (base $path) }}
++          subPath: {{ printf "state_driver_%s" (base $path) }}
++        {{- end }}
++
++        {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
++        - name: assets
++          mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }}
++          subPath: {{ printf "state_monitor_%s" (base $path) }}
++        {{- end }}
++        {{- end }}
+         livenessProbe:
+           httpGet:
+             path: /healthz
+@@ -72,6 +110,12 @@ spec:
+       - name: host-os-release
+         hostPath:
+           path: "/etc/os-release"
++      {{- if eq .Values.operator.include_assets "include_assets" }}
++      - name: assets
++        configMap:
++          name: operator-configmap
++      {{- end }}
++
+       {{- with .Values.operator.nodeSelector }}
+       nodeSelector:
+         {{- toYaml . | nindent 8 }}
+diff --git a/deployments/gpu-operator/templates/operator_configmap.yaml b/deployments/gpu-operator/templates/operator_configmap.yaml
+new file mode 100644
+index 0000000..61f366e
+--- /dev/null
++++ b/deployments/gpu-operator/templates/operator_configmap.yaml
+@@ -0,0 +1,36 @@
++{{- if eq .Values.operator.include_assets "include_assets" }}
++apiVersion: v1
++kind: ConfigMap
++metadata:
++  name: operator-configmap
++data:
++{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
++{{ printf "gfd_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
++{{ printf "state_container_toolkit_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
++{{ printf "state_device_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
++{{ printf "state_device_validation_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
++{{ printf "state_driver_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++
++{{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
++{{ printf "state_monitor_%s" (base $path) | indent 2 }}: |-
++{{ $.Files.Get $path | indent 4 }}
++{{- end }}
++{{- end }}
+diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
+index 78a4757..6689636 100644
+--- a/deployments/gpu-operator/values.yaml
++++ b/deployments/gpu-operator/values.yaml
+@@ -70,6 +70,8 @@ operator:
+       values: [""]
+   logging:
+     timeEncoding: epoch
++  # Set to "include_assets" to include assets/gpu-operator with the helm chart
++  include_assets: ""
+   resources:
+     limits:
+       cpu: 500m
+-- 
+2.17.1
+
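Editor's note: the Files.Glob/ConfigMap wiring in the patch above can be
sanity-checked without a cluster by rendering the chart. The --set value
mirrors the sentinel string the templates test for; the grep targets are
illustrative:

    # Render the chart with assets enabled and confirm the ConfigMap keys
    # (gfd_*, state_driver_*, ...) and the matching subPath mounts appear:
    helm template deployments/gpu-operator \
        --set operator.include_assets=include_assets | grep -n 'operator-configmap\|subPath:'
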
diff --git a/gpu/gpu-operator/debian/patches/enablement-support-on-starlingx-cloud-platform.patch b/gpu/gpu-operator/debian/patches/enablement-support-on-starlingx-cloud-platform.patch
new file mode 100644
index 000000000..1c8b93809
--- /dev/null
+++ b/gpu/gpu-operator/debian/patches/enablement-support-on-starlingx-cloud-platform.patch
@@ -0,0 +1,867 @@
+From 65ac63ca1bc8517f3f0c3560498de758149a3800 Mon Sep 17 00:00:00 2001
+From: Babak Sarashki
+Date: Sun, 7 Mar 2021 17:19:08 +0000
+Subject: [PATCH] enablement: support on starlingx cloud platform
+
+StarlingX is a cloud infrastructure software stack for the edge.
+It has an immutable file system and system configuration. For
+instance, changes by the gpu-operator to set the containerd runtime
+would be overridden and must be avoided.
+
+This commit enables the gpu-operator on StarlingX (starlingx.io).
+The changes to the gpu-operator include bundling modified assets
+and a modified version of the nvidia-driver build script with the
+helm charts.
+
+The modifications include host-mounting the kernel headers and
+kernel build directory onto the respective mount points inside
+the driver pod namespace; modifying the nvidia-driver to account
+for pre-installed kernel packages; and pre-installing the
+nvidia-toolkit version 1.7.1-ubi8. The defaultRuntime is expected
+to be containerd.
+
+To load the operator on StarlingX:
+
+$ source /etc/platform/openrc
+[...(keystone_admin)]$ system service-parameter-add \
+    platform container_runtime \
+    custom_container_runtime=nvidia:/path/to/nvidia-container-runtime
+
+[...(keystone_admin)]$ system host-lock 1; system host-unlock 1
+
+Signed-off-by: Babak Sarashki
+---
+ assets/state-driver/0500_daemonset.yaml       |  47 ++-
+ .../0500_daemonset.yaml                       |  18 ++
+ deployments/gpu-operator/Chart.yaml           |   3 +
+ .../charts/stx-toolkit-installer/.helmignore  |  23 ++
+ .../charts/stx-toolkit-installer/Chart.yaml   |   6 +
+ .../templates/_helpers.tpl                    |   6 +
+ .../templates/toolkit.yaml                    |  71 +++++
+ .../charts/stx-toolkit-installer/values.yaml  |   8 +
+ .../templates/build_configmap.yaml            | 291 ++++++++++++++++++
+ .../gpu-operator/templates/clusterpolicy.yaml |   4 +-
+ .../gpu-operator/templates/operator.yaml      |  52 +++-
+ .../templates/operator_confimap.yaml          |  61 ++++
+ deployments/gpu-operator/values.yaml          |  15 +-
+ 13 files changed, 583 insertions(+), 22 deletions(-)
+ create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
+ create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
+ create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
+ create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
+ create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
+ create mode 100644 deployments/gpu-operator/templates/build_configmap.yaml
+ create mode 100644 deployments/gpu-operator/templates/operator_confimap.yaml
+
+diff --git a/assets/state-driver/0500_daemonset.yaml b/assets/state-driver/0500_daemonset.yaml
+index 4cd1617..c8aefd2 100644
+--- a/assets/state-driver/0500_daemonset.yaml
++++ b/assets/state-driver/0500_daemonset.yaml
+@@ -35,7 +35,6 @@ spec:
+           valueFrom:
+             fieldRef:
+               fieldPath: spec.nodeName
+-        # always use runc for driver containers
+         - name: NVIDIA_VISIBLE_DEVICES
+           value: void
+         securityContext:
+@@ -72,8 +71,14 @@ spec:
+       - image: "FILLED BY THE OPERATOR"
+         imagePullPolicy: IfNotPresent
+         name: nvidia-driver-ctr
+-        command: ["nvidia-driver"]
+-        args: ["init"]
++        command: ["/bin/bash"]
++        args:
++        - "-c"
++        - "--"
++        - >
++          tar -C /usr/host-include -c . -f - | tar -C /usr/include -xvf -;
++          ln -rfs /usr/lib64/libelf.so.1 /usr/lib/libelf.so;
++          /usr/local/bin/nvidia-driver init;
+         securityContext:
+           privileged: true
+           seLinuxOptions:
+@@ -94,6 +99,22 @@ spec:
+         - name: run-mellanox-drivers
+           mountPath: /run/mellanox/drivers
+           mountPropagation: HostToContainer
++        - name: host-modules
++          mountPath: /lib/modules
++          readOnly: false
++        - name: host-include
++          mountPath: /usr/host-include
++          readOnly: false
++        - name: host-kernel-devel
++          mountPath: /usr/src/kernels
++          readOnly: true
++        - name: host-usr-src
++          mountPath: /usr/host-src
++          readOnly: false
++        - name: vol11
++          mountPath: /usr/local/bin/nvidia-driver
++          subPath: nvidia-driver-build-script
++          readOnly: true
+       - image: "FILLED BY THE OPERATOR"
+         imagePullPolicy: IfNotPresent
+         name: nvidia-peermem-ctr
+@@ -157,4 +178,22 @@ spec:
+         hostPath:
+           path: /run/nvidia/validations
+           type: DirectoryOrCreate
+-
++      - name: host-modules
++        hostPath:
++          path: /lib/modules
++      - name: host-kernel-devel
++        hostPath:
++          path: /usr/src/kernels/
++      - name: host-include
++        hostPath:
++          path: /usr/include
++      - name: host-usr-src
++        hostPath:
++          path: /usr/src
++      - name: vol11
++        configMap:
++          name: nvidia-driver
++          defaultMode: 0777
++          items:
++          - key: nvidia-driver-build-script
++            path: nvidia-driver-build-script
+diff --git a/assets/state-operator-validation/0500_daemonset.yaml b/assets/state-operator-validation/0500_daemonset.yaml
+index 266c9d6..ce226fa 100644
+--- a/assets/state-operator-validation/0500_daemonset.yaml
++++ b/assets/state-operator-validation/0500_daemonset.yaml
+@@ -75,6 +75,10 @@ spec:
+         - name: run-nvidia-validations
+           mountPath: /run/nvidia/validations
+           mountPropagation: Bidirectional
++        - name: vol12
++          mountPath: /var/nvidia/manifests/cuda-workload-validation.yaml
++          subPath: cuda-workload-validation.yaml
++          readOnly: true
+       - name: plugin-validation
+         image: "FILLED_BY_OPERATOR"
+         command: ['sh', '-c']
+@@ -98,6 +102,10 @@ spec:
+         - name: run-nvidia-validations
+           mountPath: /run/nvidia/validations
+           mountPropagation: Bidirectional
++        - name: vol12
++          mountPath: /var/nvidia/manifests/plugin-workload-validation.yaml
++          subPath: plugin-workload-validation.yaml
++          readOnly: true
+       containers:
+         - image: "FILLED_BY_OPERATOR"
+           name: nvidia-operator-validator
+@@ -113,6 +121,7 @@ spec:
+           - name: run-nvidia-validations
+             mountPath: "/run/nvidia/validations"
+             mountPropagation: Bidirectional
++      terminationGracePeriodSeconds: 60
+       volumes:
+         - name: run-nvidia-validations
+           hostPath:
+@@ -121,3 +130,12 @@ spec:
+         - name: driver-install-path
+           hostPath:
+             path: /run/nvidia/driver
++        - name: vol12
++          configMap:
++            name: nvidia-validator
++            defaultMode: 0444
++            items:
++            - key: cuda-workload-validation.yaml
++              path: cuda-workload-validation.yaml
++            - key: plugin-workload-validation.yaml
++              path: plugin-workload-validation.yaml
+diff --git a/deployments/gpu-operator/Chart.yaml b/deployments/gpu-operator/Chart.yaml
+index 0b379a3..7b743e4 100644
+--- a/deployments/gpu-operator/Chart.yaml
++++ b/deployments/gpu-operator/Chart.yaml
+@@ -22,3 +22,6 @@ dependencies:
+     version: 0.8.2
+     repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
+     condition: nfd.enabled
++  - name: stx-toolkit-installer
++    version: 0.1.0
++    condition: toolkit-installer.enabled
+diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore b/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
+new file mode 100644
+index 0000000..0e8a0eb
+--- /dev/null
++++ b/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
+@@ -0,0 +1,23 @@
++# Patterns to ignore when building packages.
++# This supports shell glob matching, relative path matching, and
++# negation (prefixed with !). Only one pattern per line.
++.DS_Store
++# Common VCS dirs
++.git/
++.gitignore
++.bzr/
++.bzrignore
++.hg/
++.hgignore
++.svn/
++# Common backup files
++*.swp
++*.bak
++*.tmp
++*.orig
++*~
++# Various IDEs
++.project
++.idea/
++*.tmproj
++.vscode/
+diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
+new file mode 100644
+index 0000000..c195c58
+--- /dev/null
++++ b/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
+@@ -0,0 +1,6 @@
++apiVersion: v2
++appVersion: v0.1.0
++name: stx-toolkit-installer
++description: "Standalone nvidia toolkit installer for starlingx"
++type: application
++version: 1.7.1-ubi8
+diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
+new file mode 100644
+index 0000000..b6f6274
+--- /dev/null
++++ b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
+@@ -0,0 +1,6 @@
++{{/*
++Full image name with tag
++*/}}
++{{- define "toolkit-installer.fullimage" -}}
++{{- .Values.toolkit.repository -}}/{{- .Values.toolkit.image -}}:{{- .Values.toolkit.version | default .Chart.AppVersion -}}
++{{- end }}
+diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
+new file mode 100644
+index 0000000..3cbec11
+--- /dev/null
++++ b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
+@@ -0,0 +1,71 @@
++apiVersion: apps/v1
++kind: DaemonSet
++metadata:
++  name: toolkit-installer
++  namespace: kube-system
++  labels:
++    app.kubernetes.io/component: "toolkit-installer"
++    {{ $.Release.labels }}
++spec:
++  selector:
++    matchLabels:
++      {{ $.Release.labels }}
++      app.kubernetes.io/component: "toolkit-installer"
++      app: "toolkit-installer"
++  template:
++    metadata:
++      labels:
++        {{ $.Release.labels }}
++        app.kubernetes.io/component: "toolkit-installer"
++        app: "toolkit-installer"
++    spec:
++      containers:
++      - name: toolkit-daemon
++        image: {{ include "toolkit-installer.fullimage" . }}
++        lifecycle:
++          preStop:
++            exec:
++              command:
++              - "/bin/sh"
++              - "-c"
++              - "--"
++              - >
++                if [ $toolkit_force_clean == "true" ] ; then
++                  while [[ -f /var/run/nvidia/validations/cuda-ready ]] ||
++                        [[ -f /var/run/nvidia/validations/driver-ready ]] ||
++                        [[ -f /var/run/nvidia/validations/plugin-ready ]] ||
++                        [[ -f /var/run/nvidia/validations/toolkit-ready ]] ;
++                  do
++                    echo "waiting for gpu pods to exit"
++                    sleep 10;
++                  done;
++                  sleep 60;
++                  rm -rf /usr/local/nvidia/toolkit;
++                fi;
++        command: ["/bin/bash"]
++        args:
++        - "-c"
++        - "--"
++        - >
++          ./toolkit install /usr/local/nvidia/toolkit;
++          sleep infinity;
++        env:
++        - name: toolkit_force_clean
++          value: {{ quote .Values.global.toolkit_force_clean }}
++        volumeMounts:
++        - name: toolkitdest
++          mountPath: /usr/local/nvidia
++          readOnly: false
++        - name: varrunnvidia
++          mountPath: /var/run/nvidia
++          readOnly: true
++      {{- if and .Values.global.toolkit_force_clean (eq .Values.global.toolkit_force_clean "true") }}
++      terminationGracePeriodSeconds: 120
++      {{- end }}
++      volumes:
++      - name: toolkitdest
++        hostPath:
++          path: /usr/local/nvidia
++      - name: varrunnvidia
++        hostPath:
++          path: /var/run/nvidia
+diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
+new file mode 100644
+index 0000000..b898dc2
+--- /dev/null
++++ b/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
+@@ -0,0 +1,8 @@
++toolkit:
++  repository: nvidia
++  image: container-toolkit
++  version: 1.7.1-ubi8
++  imagePullPolicy: IfNotPresent
++  imagePullSecrets: []
++  priorityClassName: system-node-critical
++  defaultRuntime: containerd
+diff --git a/deployments/gpu-operator/templates/build_configmap.yaml b/deployments/gpu-operator/templates/build_configmap.yaml
+new file mode 100644
+index 0000000..a7453a4
+--- /dev/null
++++ b/deployments/gpu-operator/templates/build_configmap.yaml
+@@ -0,0 +1,291 @@
++{{- if and .Values.operator.include_assets (eq .Values.operator.include_assets "True") }}
++apiVersion: v1
++kind: Namespace
++metadata:
++  name: "gpu-operator-resources"
++---
++apiVersion: v1
++kind: ConfigMap
++metadata:
++  name: nvidia-driver
++  namespace: gpu-operator-resources
++data:
++  nvidia-driver-build-script: |
++    #! /bin/bash
++    # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
++    # Copyright (c) 2021 Wind River Systems, Inc. SPDX-License-Identifier:
++    # Apache-2.0.
++    # This script is from: https://gitlab.com/nvidia/container-images/driver.
++    # It is modified and included under configmap for platforms that require
++    # pre-installed packages. Such platforms have the option to modify the
++    # entrypoint in 0500_daemonset.yaml, or the nvidia-driver script here for
++    # further customizations.
++
++    set -eu
++
++    RUN_DIR=/run/nvidia
++    PID_FILE=${RUN_DIR}/${0##*/}.pid
++    DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
++    KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
++    KERNEL_VERSION="$(uname -r)"
++
++    _install_tools() {
++        yum clean all
++        yum install -y centos-release-scl
++        yum install -y epel-release
++        yum install -y devtoolset-8-build devtoolset-8-binutils devtoolset-8-gcc devtoolset-8-make
++    }
++
++    # Load the kernel modules and start persistenced.
++    _load_driver() {
++        echo "Loading IPMI kernel module..."
++        modprobe ipmi_msghandler
++
++        echo "Loading NVIDIA driver kernel modules..."
++        modprobe -a nvidia nvidia-uvm nvidia-modeset
++
++        echo "Starting NVIDIA persistence daemon..."
++        nvidia-persistenced --persistence-mode
++    }
++
++    # Stop persistenced and unload the kernel modules if they are currently loaded.
++    _unload_driver() {
++        local rmmod_args=()
++        local nvidia_deps=0
++        local nvidia_refs=0
++        local nvidia_uvm_refs=0
++        local nvidia_modeset_refs=0
++
++        echo "Stopping NVIDIA persistence daemon..."
++        if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then
++            local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid)
++
++            kill -SIGTERM "${pid}"
++            for i in $(seq 1 10); do
++                kill -0 "${pid}" 2> /dev/null || break
++                sleep 0.1
++            done
++            if [ $i -eq 10 ]; then
++                echo "Could not stop NVIDIA persistence daemon" >&2
++                return 1
++            fi
++        fi
++
++        echo "Unloading NVIDIA driver kernel modules..."
++        if [ -f /sys/module/nvidia_modeset/refcnt ]; then
++            nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
++            rmmod_args+=("nvidia-modeset")
++            ((++nvidia_deps))
++        fi
++        if [ -f /sys/module/nvidia_uvm/refcnt ]; then
++            nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt)
++            rmmod_args+=("nvidia-uvm")
++            ((++nvidia_deps))
++        fi
++        if [ -f /sys/module/nvidia/refcnt ]; then
++            nvidia_refs=$(< /sys/module/nvidia/refcnt)
++            rmmod_args+=("nvidia")
++        fi
++        if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then
++            echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
++            return 1
++        fi
++
++        if [ ${#rmmod_args[@]} -gt 0 ]; then
++            rmmod ${rmmod_args[@]}
++        fi
++        return 0
++    }
++
++    # Link and install the kernel modules from a precompiled package using the nvidia-installer.
++    _install_driver() {
++        local install_args=()
++
++        # Default is standard kernel.
++        if [ ! -z ${IGNORE_PREEMPT_RT_PRESENCE+x} ] ; then
++            echo "WARN: IGNORE_PREEMPT_RT_PRESENCE set"
++            echo "Build Target PREEMPT_RT best effort"
++        fi;
++
++        _install_tools
++        export PATH=/opt/rh/devtoolset-8/root/usr/bin${PATH:+:${PATH}}
++        export PCP_DIR=/opt/rh/devtoolset-8/root
++
++        echo "Installing NVIDIA driver kernel modules..."
++        cd /usr/src/nvidia-${DRIVER_VERSION}
++        # rm -rf /lib/modules/${KERNEL_VERSION}/video
++
++        if [ "${ACCEPT_LICENSE}" = "yes" ]; then
++            install_args+=("--accept-license")
++        fi
++        nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"}
++    }
++
++    # Mount the driver rootfs into the run directory with the exception of sysfs.
++    _mount_rootfs() {
++        echo "Mounting NVIDIA driver rootfs..."
++        mount --make-runbindable /sys
++        mount --make-private /sys
++        mkdir -p ${RUN_DIR}/driver
++        mount --rbind / ${RUN_DIR}/driver
++    }
++
++    # Unmount the driver rootfs from the run directory.
++    _unmount_rootfs() {
++        echo "Unmounting NVIDIA driver rootfs..."
++        if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then
++            umount -l -R ${RUN_DIR}/driver
++        fi
++    }
++
++    init() {
++        echo -e "\n========== NVIDIA Software Installer ==========\n"
++        echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
++
++        exec 3> ${PID_FILE}
++        if ! flock -n 3; then
++            echo "An instance of the NVIDIA driver is already running, aborting"
++            exit 1
++        fi
++        echo $$ >&3
++
++        trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
++        trap "_shutdown" EXIT
++
++        _unload_driver || exit 1
++        _unmount_rootfs
++
++        (
++          [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia.ko ] ||
++          [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia-uvm.ko ] ||
++          [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia-modeset.ko ]
++        ) && _install_driver
++
++        _load_driver
++        _mount_rootfs
++
++        echo "Done, now waiting for signal"
++        sleep infinity &
++        trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
++        trap - EXIT
++        while true; do wait $! || continue; done
++        exit 0
++    }
++
++    usage() {
++        cat >&2 <