From 01a4f6bfe44d2487be65fafb34b583d93affb9f9 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Thu, 18 Nov 2021 10:41:20 +0100 Subject: [PATCH] Install NVIDIA vGPU software --- config.yaml | 8 ++ metadata.yaml | 10 ++- requirements.txt | 1 + src/charm.py | 156 +++++++++++++++++++++++++++++++++------ unit_tests/test_charm.py | 4 +- 5 files changed, 154 insertions(+), 25 deletions(-) diff --git a/config.yaml b/config.yaml index bed315f..710cfc3 100644 --- a/config.yaml +++ b/config.yaml @@ -13,3 +13,11 @@ options: and https://docs.openstack.org/nova/ussuri/admin/virtual-gpu.html#how-to-discover-a-gpu-type for more details. + force-install-nvidia-vgpu: + type: boolean + default: false + description: | + FOR TESTING ONLY. If true, the NVIDIA vGPU software will be installed and + set up on all units regardless of the presence of NVIDIA GPU hardware. If + false, the software will be installed and set up only on units where that + hardware is present. diff --git a/metadata.yaml b/metadata.yaml index 8692543..2c4522d 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -8,9 +8,7 @@ description: | tags: - openstack series: -- bionic - focal -- groovy - hirsute - impish subordinate: true @@ -22,3 +20,11 @@ requires: juju-info: interface: juju-info scope: container +resources: + nvidia-vgpu-software: + type: file + filename: nvidia-vgpu.deb + description: | + Proprietary NVIDIA vGPU host software (to be installed on compute nodes). + . + See https://docs.nvidia.com/grid/ diff --git a/requirements.txt b/requirements.txt index d7d1378..dc9c74f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ ops +git+https://opendev.org/openstack/charm-ops-openstack#egg=ops_openstack ruamel.yaml pylspci diff --git a/src/charm.py b/src/charm.py index 924afdf..d86ebe2 100755 --- a/src/charm.py +++ b/src/charm.py @@ -17,34 +17,50 @@ import logging -from ops.charm import CharmBase +from charmhelpers.core.hookenv import cached +from charmhelpers.core.host import file_hash +from charmhelpers.fetch import ( + apt_cache, + apt_install, +) + +import ops_openstack.plugins.classes + from ops.main import main -from ops.model import ActiveStatus +from ops.model import ( + ActiveStatus, + BlockedStatus, + ModelError, +) from pylspci.parsers import SimpleParser from ruamel.yaml import YAML -class NovaComputeNvidiaVgpuCharm(CharmBase): +class NovaComputeNvidiaVgpuCharm(ops_openstack.core.OSBaseCharm): + + # NOTE(lourot): as of today (2021-11-25), OSBaseCharm doesn't make use of + # this dict's keys (config files) but only uses its values (service names): + RESTART_MAP = { + '/usr/share/nvidia/vgpu/vgpuConfig.xml': ['nvidia-vgpu-mgr'], + } def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.framework.observe(self.on.install, self._on_install) - self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm) + super().register_status_check(self.__check_status) + self.framework.observe(self.on.config_changed, self._on_config_changed) + self.framework.observe(self.on.start, self._on_start) - def _on_install(self, _): - """install hook.""" - self.__set_ready_unit_status() - - def _on_upgrade_charm(self, _): - """upgrade-charm hook.""" - self.__set_ready_unit_status() + # hash of the last successfully installed NVIDIA vGPU software passed + # as resource to the charm: + self._stored.set_default(last_installed_resource_hash=None) def _on_config_changed(self, _): """config-changed hook.""" - if not self._has_nvidia_gpu_hardware(): - return + # NOTE(lourot): We want to re-install the software here if a new + # version has just been provided as a charm resource. + self.__install_nvidia_software_if_needed() vgpu_device_mappings_str = self.config.get('vgpu-device-mappings') if vgpu_device_mappings_str is not None: @@ -52,15 +68,113 @@ class NovaComputeNvidiaVgpuCharm(CharmBase): logging.debug('vgpu-device-mappings={}'.format( vgpu_device_mappings)) - def __set_ready_unit_status(self): - """Set the unit status to active/ready.""" - unit_status_msg = ( - 'Unit is ready: ' - + ('no ' if not self._has_nvidia_gpu_hardware() else '') - + 'NVIDIA GPU found') - self.unit.status = ActiveStatus(unit_status_msg) + self.update_status() + + def _on_start(self, _): + """start hook.""" + # NOTE(lourot): We install software in the `start` hook instead of + # the `install` hook because we want to be able to install software + # after a reboot if NVIDIA hardware has then been added for the + # first time. + self.__install_nvidia_software_if_needed() + + # NOTE(lourot): this is used by OSBaseCharm.update_status(): + self._stored.is_started = True + + self.update_status() + + def services(self): + # If no NVIDIA software is expected to be installed on this particular + # unit, then no service should be expected to run by + # OSBaseCharm.update_status(). Otherwise the services from the + # RESTART_MAP are expected to run. + if not self.__is_nvidia_software_to_be_installed(): + return [] + return super().services() + + def __check_status(self): + """Determine the unit status to be set. + + :rtype: StatusBase + """ + unit_status_msg = ('no ' if not self._has_nvidia_gpu_hardware() + else '') + 'NVIDIA GPU found; ' + + installed_versions = self.__installed_nvidia_software_versions() + if len(installed_versions) > 0: + unit_status_msg += 'installed NVIDIA software: ' + unit_status_msg += ', '.join(installed_versions) + else: + unit_status_msg += 'no NVIDIA software installed' + + if self.__is_nvidia_software_to_be_installed() and len( + installed_versions) == 0: + return BlockedStatus(unit_status_msg) + + return ActiveStatus('Unit is ready: ' + unit_status_msg) + + def __install_nvidia_software_if_needed(self): + """Install the NVIDIA software on this unit if relevant.""" + if self.__is_nvidia_software_to_be_installed(): + nvidia_software_path, nvidia_software_hash = ( + self.__path_and_hash_nvidia_resource()) + + if nvidia_software_path is None: + # No software has been provided as charm resource. We can't + # install anything. OSBaseCharm.update_status() will be + # executed later and put the unit in blocked state. + return + + last_installed_hash = self._stored.last_installed_resource_hash + if nvidia_software_hash == last_installed_hash: + logging.info( + 'NVIDIA vGPU software with hash {} already installed, ' + 'skipping'.format(nvidia_software_hash)) + return + + logging.info( + 'Installing NVIDIA vGPU software with hash {}'.format( + nvidia_software_hash)) + apt_install([nvidia_software_path], fatal=True) + self._stored.last_installed_resource_hash = nvidia_software_hash + + @cached + def __is_nvidia_software_to_be_installed(self): + """Determine whether the NVIDIA vGPU software is to be installed. + + :returns: True if the software is to be installed and set up on the + unit. + :rtype: bool + """ + return (self._has_nvidia_gpu_hardware() or + self.config.get('force-install-nvidia-vgpu')) + + def __path_and_hash_nvidia_resource(self): + """Get path to and hash of software provided as charm resource. + + :returns: Pair of path and hash. (None, None) if no charm resource has + been provided. + :rtype: Tuple[PosixPath, str] + """ + try: + nvidia_vgpu_software_path = ( + self.framework.model.resources.fetch('nvidia-vgpu-software')) + except ModelError: + return None, None + + return nvidia_vgpu_software_path, file_hash(nvidia_vgpu_software_path) + + def __installed_nvidia_software_versions(self): + """Get a list of installed NVIDIA vGPU software versions. + + :returns: List of versions + :rtype: List[str] + """ + return [package['version'] for package in + apt_cache().dpkg_list(['nvidia-vgpu-ubuntu-*']).values()] @staticmethod + @cached def _has_nvidia_gpu_hardware(): """Search for NVIDIA GPU hardware. diff --git a/unit_tests/test_charm.py b/unit_tests/test_charm.py index 22f49fc..d44512f 100644 --- a/unit_tests/test_charm.py +++ b/unit_tests/test_charm.py @@ -25,11 +25,11 @@ class TestNovaComputeNvidiaVgpuCharm(unittest.TestCase): self.addCleanup(self.harness.cleanup) self.harness.begin() - def test_install(self): + def test_start(self): self.assertEqual( self.harness.framework.model.app.name, 'nova-compute-nvidia-vgpu') # Test that charm is active upon installation. - self.harness.charm.on.install.emit() + self.harness.charm.on.start.emit() self.assertTrue(isinstance( self.harness.model.unit.status, ActiveStatus))