Install NVIDIA vGPU software

This commit is contained in:
Aurelien Lourot 2021-11-18 10:41:20 +01:00
parent e8594f50a9
commit 01a4f6bfe4
5 changed files with 154 additions and 25 deletions

View File

@ -13,3 +13,11 @@ options:
and
https://docs.openstack.org/nova/ussuri/admin/virtual-gpu.html#how-to-discover-a-gpu-type
for more details.
force-install-nvidia-vgpu:
type: boolean
default: false
description: |
FOR TESTING ONLY. If true, the NVIDIA vGPU software will be installed and
set up on all units regardless of the presence of NVIDIA GPU hardware. If
false, the software will be installed and set up only on units where that
hardware is present.

View File

@ -8,9 +8,7 @@ description: |
tags:
- openstack
series:
- bionic
- focal
- groovy
- hirsute
- impish
subordinate: true
@ -22,3 +20,11 @@ requires:
juju-info:
interface: juju-info
scope: container
resources:
nvidia-vgpu-software:
type: file
filename: nvidia-vgpu.deb
description: |
Proprietary NVIDIA vGPU host software (to be installed on compute nodes).
.
See https://docs.nvidia.com/grid/

View File

@ -1,4 +1,5 @@
ops
git+https://opendev.org/openstack/charm-ops-openstack#egg=ops_openstack
ruamel.yaml
pylspci

View File

@ -17,34 +17,50 @@
import logging
from ops.charm import CharmBase
from charmhelpers.core.hookenv import cached
from charmhelpers.core.host import file_hash
from charmhelpers.fetch import (
apt_cache,
apt_install,
)
import ops_openstack.plugins.classes
from ops.main import main
from ops.model import ActiveStatus
from ops.model import (
ActiveStatus,
BlockedStatus,
ModelError,
)
from pylspci.parsers import SimpleParser
from ruamel.yaml import YAML
class NovaComputeNvidiaVgpuCharm(CharmBase):
class NovaComputeNvidiaVgpuCharm(ops_openstack.core.OSBaseCharm):
# NOTE(lourot): as of today (2021-11-25), OSBaseCharm doesn't make use of
# this dict's keys (config files) but only uses its values (service names):
RESTART_MAP = {
'/usr/share/nvidia/vgpu/vgpuConfig.xml': ['nvidia-vgpu-mgr'],
}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.framework.observe(self.on.install, self._on_install)
self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm)
super().register_status_check(self.__check_status)
self.framework.observe(self.on.config_changed, self._on_config_changed)
self.framework.observe(self.on.start, self._on_start)
def _on_install(self, _):
"""install hook."""
self.__set_ready_unit_status()
def _on_upgrade_charm(self, _):
"""upgrade-charm hook."""
self.__set_ready_unit_status()
# hash of the last successfully installed NVIDIA vGPU software passed
# as resource to the charm:
self._stored.set_default(last_installed_resource_hash=None)
def _on_config_changed(self, _):
"""config-changed hook."""
if not self._has_nvidia_gpu_hardware():
return
# NOTE(lourot): We want to re-install the software here if a new
# version has just been provided as a charm resource.
self.__install_nvidia_software_if_needed()
vgpu_device_mappings_str = self.config.get('vgpu-device-mappings')
if vgpu_device_mappings_str is not None:
@ -52,15 +68,113 @@ class NovaComputeNvidiaVgpuCharm(CharmBase):
logging.debug('vgpu-device-mappings={}'.format(
vgpu_device_mappings))
def __set_ready_unit_status(self):
"""Set the unit status to active/ready."""
unit_status_msg = (
'Unit is ready: '
+ ('no ' if not self._has_nvidia_gpu_hardware() else '')
+ 'NVIDIA GPU found')
self.unit.status = ActiveStatus(unit_status_msg)
self.update_status()
def _on_start(self, _):
    """Handle the `start` hook.

    Installs the NVIDIA vGPU software when relevant, flags the charm as
    started and then refreshes the unit status.
    """
    # NOTE(lourot): installation is done on `start` rather than on `install`
    # so that NVIDIA hardware added for the first time can still be picked
    # up after a reboot.
    self.__install_nvidia_software_if_needed()

    # NOTE(lourot): OSBaseCharm.update_status() makes use of this flag:
    self._stored.is_started = True
    self.update_status()
def services(self):
    """List the services expected to be running on this unit.

    OSBaseCharm.update_status() expects the services from the RESTART_MAP
    to run, which only makes sense on units where the NVIDIA software is
    expected to be installed. On any other unit nothing is expected to run.

    :returns: The services reported by the parent class if the NVIDIA
        software is to be installed on this unit, an empty list otherwise.
    """
    if self.__is_nvidia_software_to_be_installed():
        return super().services()
    return []
def __check_status(self):
    """Work out which status the unit should be put in.

    The status message always states whether NVIDIA GPU hardware was found
    and which NVIDIA software versions, if any, are installed. The unit is
    blocked when the software is expected on this unit but none is
    installed, and active in every other case.

    :rtype: StatusBase
    """
    hardware_part = (
        ('' if self._has_nvidia_gpu_hardware() else 'no ')
        + 'NVIDIA GPU found; ')

    versions = self.__installed_nvidia_software_versions()
    if versions:
        software_part = 'installed NVIDIA software: ' + ', '.join(versions)
    else:
        software_part = 'no NVIDIA software installed'

    message = hardware_part + software_part

    if self.__is_nvidia_software_to_be_installed() and not versions:
        return BlockedStatus(message)

    return ActiveStatus('Unit is ready: ' + message)
def __install_nvidia_software_if_needed(self):
    """Install the NVIDIA vGPU software on this unit when appropriate.

    Nothing is done if the software isn't meant to be installed on this
    unit, if no charm resource has been provided, or if the provided
    resource has the same hash as the one installed last. On a successful
    installation the hash of the resource is remembered in the stored
    state.
    """
    if not self.__is_nvidia_software_to_be_installed():
        return

    resource_path, resource_hash = self.__path_and_hash_nvidia_resource()
    if resource_path is None:
        # No charm resource to install from. The unit will be put in
        # blocked state by a later OSBaseCharm.update_status() run.
        return

    previous_hash = self._stored.last_installed_resource_hash
    if resource_hash == previous_hash:
        logging.info(
            'NVIDIA vGPU software with hash {} already installed, '
            'skipping'.format(resource_hash))
        return

    logging.info(
        'Installing NVIDIA vGPU software with hash {}'.format(
            resource_hash))
    apt_install([resource_path], fatal=True)
    # Only recorded once the installation above went through:
    self._stored.last_installed_resource_hash = resource_hash
@cached
def __is_nvidia_software_to_be_installed(self):
    """Tell whether the NVIDIA vGPU software belongs on this unit.

    That is the case when NVIDIA GPU hardware has been found, or when the
    `force-install-nvidia-vgpu` charm option is set.

    :returns: A truthy value if the software is to be installed and set up
        on the unit. Without hardware this is the raw value of the
        `force-install-nvidia-vgpu` option.
    :rtype: bool
    """
    hardware_found = self._has_nvidia_gpu_hardware()
    if hardware_found:
        return hardware_found
    # The charm option is only consulted when no hardware has been found:
    return self.config.get('force-install-nvidia-vgpu')
def __path_and_hash_nvidia_resource(self):
    """Locate the `nvidia-vgpu-software` charm resource and hash it.

    :returns: Pair made of the path to the resource and of its hash as
        computed by file_hash(). (None, None) if fetching the resource
        raised a ModelError, i.e. no charm resource has been provided.
    :rtype: Tuple[PosixPath, str]
    """
    resources = self.framework.model.resources
    try:
        resource_path = resources.fetch('nvidia-vgpu-software')
    except ModelError:
        return None, None
    else:
        return resource_path, file_hash(resource_path)
def __installed_nvidia_software_versions(self):
    """Collect the versions of the installed NVIDIA vGPU packages.

    Packages are looked up in the dpkg list by the
    `nvidia-vgpu-ubuntu-*` name pattern.

    :returns: List of versions, one per matching package
    :rtype: List[str]
    """
    matching_packages = apt_cache().dpkg_list(['nvidia-vgpu-ubuntu-*'])
    return [details['version'] for details in matching_packages.values()]
@staticmethod
@cached
def _has_nvidia_gpu_hardware():
"""Search for NVIDIA GPU hardware.

View File

@ -25,11 +25,11 @@ class TestNovaComputeNvidiaVgpuCharm(unittest.TestCase):
self.addCleanup(self.harness.cleanup)
self.harness.begin()
def test_install(self):
def test_start(self):
self.assertEqual(
self.harness.framework.model.app.name,
'nova-compute-nvidia-vgpu')
# Test that charm is active upon installation.
self.harness.charm.on.install.emit()
self.harness.charm.on.start.emit()
self.assertTrue(isinstance(
self.harness.model.unit.status, ActiveStatus))