Install NVIDIA vGPU software
This commit is contained in:
parent
e8594f50a9
commit
01a4f6bfe4
@ -13,3 +13,11 @@ options:
|
||||
and
|
||||
https://docs.openstack.org/nova/ussuri/admin/virtual-gpu.html#how-to-discover-a-gpu-type
|
||||
for more details.
|
||||
force-install-nvidia-vgpu:
|
||||
type: boolean
|
||||
default: false
|
||||
description: |
|
||||
FOR TESTING ONLY. If true, the NVIDIA vGPU software will be installed and
|
||||
set up on all units regardless of the presence of NVIDIA GPU hardware. If
|
||||
false, the software will be installed and set up only on units where that
|
||||
hardware is present.
|
||||
|
@ -8,9 +8,7 @@ description: |
|
||||
tags:
|
||||
- openstack
|
||||
series:
|
||||
- bionic
|
||||
- focal
|
||||
- groovy
|
||||
- hirsute
|
||||
- impish
|
||||
subordinate: true
|
||||
@ -22,3 +20,11 @@ requires:
|
||||
juju-info:
|
||||
interface: juju-info
|
||||
scope: container
|
||||
resources:
|
||||
nvidia-vgpu-software:
|
||||
type: file
|
||||
filename: nvidia-vgpu.deb
|
||||
description: |
|
||||
Proprietary NVIDIA vGPU host software (to be installed on compute nodes).
|
||||
.
|
||||
See https://docs.nvidia.com/grid/
|
||||
|
@ -1,4 +1,5 @@
|
||||
ops
|
||||
git+https://opendev.org/openstack/charm-ops-openstack#egg=ops_openstack
|
||||
|
||||
ruamel.yaml
|
||||
pylspci
|
||||
|
156
src/charm.py
156
src/charm.py
@ -17,34 +17,50 @@
|
||||
|
||||
import logging
|
||||
|
||||
from ops.charm import CharmBase
|
||||
from charmhelpers.core.hookenv import cached
|
||||
from charmhelpers.core.host import file_hash
|
||||
from charmhelpers.fetch import (
|
||||
apt_cache,
|
||||
apt_install,
|
||||
)
|
||||
|
||||
import ops_openstack.plugins.classes
|
||||
|
||||
from ops.main import main
|
||||
from ops.model import ActiveStatus
|
||||
from ops.model import (
|
||||
ActiveStatus,
|
||||
BlockedStatus,
|
||||
ModelError,
|
||||
)
|
||||
|
||||
from pylspci.parsers import SimpleParser
|
||||
from ruamel.yaml import YAML
|
||||
|
||||
|
||||
class NovaComputeNvidiaVgpuCharm(CharmBase):
|
||||
class NovaComputeNvidiaVgpuCharm(ops_openstack.core.OSBaseCharm):
|
||||
|
||||
# NOTE(lourot): as of today (2021-11-25), OSBaseCharm doesn't make use of
|
||||
# this dict's keys (config files) but only uses its values (service names):
|
||||
RESTART_MAP = {
|
||||
'/usr/share/nvidia/vgpu/vgpuConfig.xml': ['nvidia-vgpu-mgr'],
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.framework.observe(self.on.install, self._on_install)
|
||||
self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm)
|
||||
super().register_status_check(self.__check_status)
|
||||
|
||||
self.framework.observe(self.on.config_changed, self._on_config_changed)
|
||||
self.framework.observe(self.on.start, self._on_start)
|
||||
|
||||
def _on_install(self, _):
|
||||
"""install hook."""
|
||||
self.__set_ready_unit_status()
|
||||
|
||||
def _on_upgrade_charm(self, _):
|
||||
"""upgrade-charm hook."""
|
||||
self.__set_ready_unit_status()
|
||||
# hash of the last successfully installed NVIDIA vGPU software passed
|
||||
# as resource to the charm:
|
||||
self._stored.set_default(last_installed_resource_hash=None)
|
||||
|
||||
def _on_config_changed(self, _):
|
||||
"""config-changed hook."""
|
||||
if not self._has_nvidia_gpu_hardware():
|
||||
return
|
||||
# NOTE(lourot): We want to re-install the software here if a new
|
||||
# version has just been provided as a charm resource.
|
||||
self.__install_nvidia_software_if_needed()
|
||||
|
||||
vgpu_device_mappings_str = self.config.get('vgpu-device-mappings')
|
||||
if vgpu_device_mappings_str is not None:
|
||||
@ -52,15 +68,113 @@ class NovaComputeNvidiaVgpuCharm(CharmBase):
|
||||
logging.debug('vgpu-device-mappings={}'.format(
|
||||
vgpu_device_mappings))
|
||||
|
||||
def __set_ready_unit_status(self):
|
||||
"""Set the unit status to active/ready."""
|
||||
unit_status_msg = (
|
||||
'Unit is ready: '
|
||||
+ ('no ' if not self._has_nvidia_gpu_hardware() else '')
|
||||
+ 'NVIDIA GPU found')
|
||||
self.unit.status = ActiveStatus(unit_status_msg)
|
||||
self.update_status()
|
||||
|
||||
def _on_start(self, _):
|
||||
"""start hook."""
|
||||
# NOTE(lourot): We install software in the `start` hook instead of
|
||||
# the `install` hook because we want to be able to install software
|
||||
# after a reboot if NVIDIA hardware has then been added for the
|
||||
# first time.
|
||||
self.__install_nvidia_software_if_needed()
|
||||
|
||||
# NOTE(lourot): this is used by OSBaseCharm.update_status():
|
||||
self._stored.is_started = True
|
||||
|
||||
self.update_status()
|
||||
|
||||
def services(self):
    """List the services expected to be running on this unit.

    :returns: Whatever the parent class reports (the services from the
              RESTART_MAP) when the NVIDIA software is expected to be
              installed on this unit, otherwise an empty list.
    """
    # OSBaseCharm.update_status() must not expect any service to run on
    # a unit where no NVIDIA software is expected to be installed.
    if self.__is_nvidia_software_to_be_installed():
        return super().services()
    return []
|
||||
|
||||
def __check_status(self):
    """Work out which status the unit should report.

    The unit is reported as blocked when the NVIDIA software is expected
    on this unit but no installed version was found. In every other case
    it is reported as active.

    :returns: Status to be set on the unit.
    :rtype: StatusBase
    """
    gpu_prefix = '' if self._has_nvidia_gpu_hardware() else 'no '
    summary = gpu_prefix + 'NVIDIA GPU found; '

    versions = self.__installed_nvidia_software_versions()
    if versions:
        summary += 'installed NVIDIA software: ' + ', '.join(versions)
    else:
        summary += 'no NVIDIA software installed'

    # Evaluated unconditionally, as the first operand of the check below.
    software_expected = self.__is_nvidia_software_to_be_installed()
    if software_expected and not versions:
        return BlockedStatus(summary)

    return ActiveStatus('Unit is ready: ' + summary)
|
||||
|
||||
def __install_nvidia_software_if_needed(self):
    """Install the NVIDIA vGPU software provided as charm resource.

    Nothing happens when the software isn't meant to be installed on this
    unit, when no charm resource has been provided, or when the resource's
    hash equals the one stored after the last successful installation.
    """
    if not self.__is_nvidia_software_to_be_installed():
        return

    resource_path, resource_hash = self.__path_and_hash_nvidia_resource()
    if resource_path is None:
        # Without a charm resource there is nothing to install.
        # OSBaseCharm.update_status() will be executed later and put the
        # unit in blocked state.
        return

    if resource_hash == self._stored.last_installed_resource_hash:
        logging.info(
            'NVIDIA vGPU software with hash {} already installed, '
            'skipping'.format(resource_hash))
        return

    logging.info(
        'Installing NVIDIA vGPU software with hash {}'.format(
            resource_hash))
    apt_install([resource_path], fatal=True)
    # Record the hash only once the installation call has returned, so
    # that an identical resource is skipped on the next run.
    self._stored.last_installed_resource_hash = resource_hash
|
||||
|
||||
@cached
def __is_nvidia_software_to_be_installed(self):
    """Determine whether the NVIDIA vGPU software is to be installed.

    The software is wanted when NVIDIA GPU hardware has been found on the
    unit, or when the `force-install-nvidia-vgpu` option is set.

    :returns: True if the software is to be installed and set up on the
              unit.
    :rtype: bool
    """
    # `config.get()` yields None when the option is unset; coerce the
    # result so that callers really get the documented bool.
    return bool(self._has_nvidia_gpu_hardware() or
                self.config.get('force-install-nvidia-vgpu'))
|
||||
|
||||
def __path_and_hash_nvidia_resource(self):
    """Locate the software provided as charm resource and hash it.

    :returns: Pair of path and hash, or (None, None) if fetching the
              `nvidia-vgpu-software` resource raised a ModelError, i.e. no
              charm resource has been provided.
    :rtype: Tuple[Optional[PosixPath], Optional[str]]
    """
    resources = self.framework.model.resources
    try:
        resource_path = resources.fetch('nvidia-vgpu-software')
    except ModelError:
        return None, None
    else:
        return resource_path, file_hash(resource_path)
|
||||
|
||||
def __installed_nvidia_software_versions(self):
    """Collect the versions of the installed NVIDIA vGPU packages.

    Packages are looked up by the `nvidia-vgpu-ubuntu-*` name pattern.

    :returns: List of versions
    :rtype: List[str]
    """
    installed_packages = apt_cache().dpkg_list(['nvidia-vgpu-ubuntu-*'])
    return [info['version'] for info in installed_packages.values()]
|
||||
|
||||
@staticmethod
|
||||
@cached
|
||||
def _has_nvidia_gpu_hardware():
|
||||
"""Search for NVIDIA GPU hardware.
|
||||
|
||||
|
@ -25,11 +25,11 @@ class TestNovaComputeNvidiaVgpuCharm(unittest.TestCase):
|
||||
self.addCleanup(self.harness.cleanup)
|
||||
self.harness.begin()
|
||||
|
||||
def test_install(self):
|
||||
def test_start(self):
|
||||
self.assertEqual(
|
||||
self.harness.framework.model.app.name,
|
||||
'nova-compute-nvidia-vgpu')
|
||||
# Test that charm is active once started.
|
||||
self.harness.charm.on.install.emit()
|
||||
self.harness.charm.on.start.emit()
|
||||
self.assertTrue(isinstance(
|
||||
self.harness.model.unit.status, ActiveStatus))
|
||||
|
Loading…
Reference in New Issue
Block a user