From 37e0fe598295474fd8a1b1f00d2b0632dd652047 Mon Sep 17 00:00:00 2001 From: James Page Date: Thu, 12 May 2022 09:30:40 +0100 Subject: [PATCH] Refactor unit status output The unit status output for this charm was verbose compared to others; refactor to display minimal information when active and pertinent information when in a blocked state. Note that the NVIDIA driver version should really be in the application version; this will require further refactoring to expose this information to the operator framework. Change-Id: I26663ca8da99083a409e085ea239e43245ef6265 --- .gitignore | 1 - src/charm_utils.py | 31 ++++++++++----------------- src/nvidia_utils.py | 14 ++++++------ test-requirements.txt | 1 - unit_tests/test_charm_utils.py | 38 ++++++++++++++++++--------------- unit_tests/test_nvidia_utils.py | 10 +++++++-- 6 files changed, 47 insertions(+), 48 deletions(-) diff --git a/.gitignore b/.gitignore index cfb8fab..8b17d1e 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,4 @@ interfaces .stestr *__pycache__* *.pyc -/nova-compute-nvidia-vgpu.charm *.charm diff --git a/src/charm_utils.py b/src/charm_utils.py index ba92267..1a6b4f6 100644 --- a/src/charm_utils.py +++ b/src/charm_utils.py @@ -58,7 +58,8 @@ def is_nvidia_software_to_be_installed(charm_config): def is_nvidia_software_to_be_installed_notcached(charm_config): - return (nvidia_utils.has_nvidia_gpu_hardware() or + nvidia_gpu_hardware, _ = nvidia_utils.has_nvidia_gpu_hardware() + return (nvidia_gpu_hardware or charm_config.get('force-install-nvidia-vgpu')) @@ -110,34 +111,24 @@ def check_status(config, services): :type services: List[str] :rtype: ops.model.StatusBase """ - unit_status_msg = ('no ' if not nvidia_utils.has_nvidia_gpu_hardware() - else '') + 'NVIDIA GPU found; ' - installed_versions = nvidia_utils.installed_nvidia_software_versions() software_is_installed = len(installed_versions) > 0 + if (is_nvidia_software_to_be_installed(config) and + not software_is_installed): + return BlockedStatus("NVIDIA GPU detected, drivers not installed") + _, services_not_running_msg = ows_check_services_running(services, ports=[]) software_is_running = services_not_running_msg is None - if software_is_installed: - unit_status_msg += 'installed NVIDIA software: ' - unit_status_msg += ', '.join(installed_versions) - if not software_is_running: - # NOTE(lourot): the exact list of services not running that should - # be will be displayed in the principal's blocked status message - # already, so no need to repeat it here on the subordinate. - unit_status_msg += '; reboot required?' - else: - unit_status_msg += 'no NVIDIA software installed' + if software_is_installed and not software_is_running: + return BlockedStatus("manual reboot required") - if ((is_nvidia_software_to_be_installed(config) and - not software_is_installed) or - (software_is_installed and - not software_is_running)): - return BlockedStatus(unit_status_msg) + nvidia_gpu_hardware, num_gpus = nvidia_utils.has_nvidia_gpu_hardware() + unit_status_msg = "{} GPU".format(num_gpus) - return ActiveStatus('Unit is ready: ' + unit_status_msg) + return ActiveStatus('Unit is ready ({})'.format(unit_status_msg)) def set_principal_unit_relation_data(relation_data_to_be_set, config, diff --git a/src/nvidia_utils.py b/src/nvidia_utils.py index 40df85d..0d9fc97 100644 --- a/src/nvidia_utils.py +++ b/src/nvidia_utils.py @@ -102,15 +102,15 @@ def list_vgpu_types(): def has_nvidia_gpu_hardware(): """Search for NVIDIA GPU hardware. - :returns: True if some NVIDIA GPU hardware is found on the current - unit. - :rtype: bool + :returns: a tuple of (bool, int) indicating if NVIDIA GPU hardware + is found and how many GPU's where detected. + :rtype: (bool, int) """ return _has_nvidia_gpu_hardware_notcached() def _has_nvidia_gpu_hardware_notcached(): - nvidia_gpu_hardware_found = False + num_nvidia_devices = 0 for device in SimpleParser().run(): device_class = device.cls.name device_vendor = device.vendor.name @@ -124,12 +124,12 @@ def _has_nvidia_gpu_hardware_notcached(): logging.debug('NVIDIA GPU found: {}'.format(device)) # NOTE(lourot): we could `break` out here but it's interesting # for debugging purposes to print them all. - nvidia_gpu_hardware_found = True + num_nvidia_devices += 1 - if not nvidia_gpu_hardware_found: + if num_nvidia_devices == 0: logging.debug('No NVIDIA GPU found.') - return nvidia_gpu_hardware_found + return num_nvidia_devices > 0, num_nvidia_devices def _installed_nvidia_software_packages(): diff --git a/test-requirements.txt b/test-requirements.txt index e6f71d3..1af9893 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1,7 +1,6 @@ # This file is managed centrally. If you find the need to modify this as a # one-off, please don't. Intead, consult #openstack-charms and ask about # requirements management in charms via bot-control. Thank you. -charm-tools>=2.4.4 coverage>=3.6 mock>=1.2 flake8>=4.0.1; python_version >= '3.6' diff --git a/unit_tests/test_charm_utils.py b/unit_tests/test_charm_utils.py index e872898..0cd2611 100644 --- a/unit_tests/test_charm_utils.py +++ b/unit_tests/test_charm_utils.py @@ -32,12 +32,12 @@ class TestCharmUtils(unittest.TestCase): @patch('nvidia_utils.has_nvidia_gpu_hardware') def test_is_nvidia_software_to_be_installed(self, has_nvidia_gpu_hardware_mock): - has_nvidia_gpu_hardware_mock.return_value = True + has_nvidia_gpu_hardware_mock.return_value = True, 1 self.assertTrue( charm_utils.is_nvidia_software_to_be_installed_notcached({ 'force-install-nvidia-vgpu': False})) - has_nvidia_gpu_hardware_mock.return_value = False + has_nvidia_gpu_hardware_mock.return_value = False, 0 self.assertTrue( charm_utils.is_nvidia_software_to_be_installed_notcached({ 'force-install-nvidia-vgpu': True})) @@ -88,55 +88,59 @@ class TestCharmUtils(unittest.TestCase): def test_check_status( self, has_hw_mock, installed_sw_mock, is_sw_to_be_installed_mock, check_services_running_mock): - has_hw_mock.return_value = True + has_hw_mock.return_value = True, 1 installed_sw_mock.return_value = ['42', '43'] is_sw_to_be_installed_mock.return_value = True check_services_running_mock.return_value = (None, None) self.assertEqual( charm_utils.check_status(None, None), ActiveStatus( - 'Unit is ready: ' - 'NVIDIA GPU found; installed NVIDIA software: 42, 43')) + 'Unit is ready (1 GPU)' + ) + ) - has_hw_mock.return_value = False + has_hw_mock.return_value = False, 0 installed_sw_mock.return_value = ['42', '43'] is_sw_to_be_installed_mock.return_value = True check_services_running_mock.return_value = (None, None) self.assertEqual( charm_utils.check_status(None, None), ActiveStatus( - 'Unit is ready: ' - 'no NVIDIA GPU found; installed NVIDIA software: 42, 43')) + 'Unit is ready (0 GPU)' + ) + ) - has_hw_mock.return_value = True + has_hw_mock.return_value = True, 1 installed_sw_mock.return_value = [] is_sw_to_be_installed_mock.return_value = True check_services_running_mock.return_value = (None, None) self.assertEqual( charm_utils.check_status(None, None), BlockedStatus( - 'NVIDIA GPU found; no NVIDIA software installed')) + 'NVIDIA GPU detected, drivers not installed' + ) + ) - has_hw_mock.return_value = True + has_hw_mock.return_value = True, 2 installed_sw_mock.return_value = [] is_sw_to_be_installed_mock.return_value = False check_services_running_mock.return_value = (None, None) self.assertEqual( charm_utils.check_status(None, None), ActiveStatus( - 'Unit is ready: ' - 'NVIDIA GPU found; no NVIDIA software installed')) + 'Unit is ready (2 GPU)' + ) + ) - has_hw_mock.return_value = True + has_hw_mock.return_value = True, 1 installed_sw_mock.return_value = ['42', '43'] is_sw_to_be_installed_mock.return_value = True check_services_running_mock.return_value = ( None, 'Services not running that should be: nvidia-vgpu-mgr') self.assertEqual( charm_utils.check_status(None, None), - BlockedStatus( - 'NVIDIA GPU found; installed NVIDIA software: 42, 43; ' - 'reboot required?')) + BlockedStatus('manual reboot required') + ) @patch('nvidia_utils._installed_nvidia_software_packages') @patch('charm_utils.get_os_codename_package') diff --git a/unit_tests/test_nvidia_utils.py b/unit_tests/test_nvidia_utils.py index 54ff9db..774ecfa 100644 --- a/unit_tests/test_nvidia_utils.py +++ b/unit_tests/test_nvidia_utils.py @@ -54,13 +54,19 @@ class TestNvidiaUtils(unittest.TestCase): def test_has_nvidia_gpu_hardware_with_hw(self, lspci_parser_mock): lspci_parser_mock.return_value.run.return_value = ( self._PCI_DEVICES_LIST_WITH_NVIDIA_GPU) - self.assertTrue(nvidia_utils._has_nvidia_gpu_hardware_notcached()) + self.assertEqual( + nvidia_utils._has_nvidia_gpu_hardware_notcached(), + (True, 1) + ) @patch('nvidia_utils.SimpleParser') def test_has_nvidia_gpu_hardware_without_hw(self, lspci_parser_mock): lspci_parser_mock.return_value.run.return_value = ( self._PCI_DEVICES_LIST_WITHOUT_GPU) - self.assertFalse(nvidia_utils._has_nvidia_gpu_hardware_notcached()) + self.assertEqual( + nvidia_utils._has_nvidia_gpu_hardware_notcached(), + (False, 0) + ) @patch('nvidia_utils.Path') @patch('nvidia_utils.os.listdir')