Merge "Add support for burnin-gpu"

commit 0c35e7e2da
Authored by Zuul on 2025-01-29 19:20:10 +00:00, committed by Gerrit Code Review
5 changed files with 199 additions and 44 deletions

@ -28,71 +28,79 @@ NETWORK_BURNIN_ROLES = frozenset(['writer', 'reader'])
NETWORK_READER_CYCLE = 30
-def stress_ng_cpu(node):
-    """Burn-in the CPU with stress-ng
+def stress_ng(node, stressor_type, default_timeout=86400):
+    """Run stress-ng for different stressor types

-    Run stress-ng on a configurable number of CPUs for
-    a configurable amount of time. Without config use
-    all CPUs and stress them for 24 hours.
+    Burn-in a configurable number of CPU/VM with stress-ng,
+    for a configurable amount of time but default of 24 hours.

    :param node: Ironic node object
+    :param stressor_type: 'cpu' or 'vm'
+    :param default_timeout: Default timeout in seconds (default: 86400)
+    :raises: ValueError if an unknown stressor_type is provided
    :raises: CommandExecutionError if the execution of stress-ng fails.
    """
+    stressor_type = stressor_type.lower()
+    if stressor_type not in ['cpu', 'vm']:
+        raise ValueError("Unknown stressor type: %s" % stressor_type)
+
    info = node.get('driver_info', {})
-    cpu = info.get('agent_burnin_cpu_cpu', 0)
-    timeout = info.get('agent_burnin_cpu_timeout', 86400)
-    outputfile = info.get('agent_burnin_cpu_outputfile', None)
+
+    stressor_suffix = {'cpu': 'cpu'}
+    if stressor_type == 'vm':
+        count_key = 'agent_burnin_%s_vm' % stressor_type
+        bytes_key = 'agent_burnin_%s_vm-bytes' % stressor_type
+    else:
+        count_key = 'agent_burnin_%s_%s' % (stressor_type,
+                                            stressor_suffix[stressor_type])
+        bytes_key = None
+    timeout_key = 'agent_burnin_%s_timeout' % stressor_type
+    outputfile_key = 'agent_burnin_%s_outputfile' % stressor_type
+
+    count = info.get(count_key, 0)
+    timeout = info.get(timeout_key, default_timeout)
+    outputfile = info.get(outputfile_key)
+
+    args = ['stress-ng', '--%s' % stressor_type, count, '--timeout', timeout]
+    if stressor_type == 'vm':
+        vm_bytes = info.get(bytes_key, '98%')
+        args.extend(['--vm-bytes', vm_bytes])
+    args.extend(['--metrics-brief'])

-    args = ('stress-ng', '--cpu', cpu, '--timeout', timeout,
-            '--metrics-brief')
    if outputfile:
-        args += ('--log-file', outputfile,)
+        args.extend(['--log-file', outputfile])

-    LOG.debug('Burn-in stress_ng_cpu command: %s', args)
+    LOG.debug('Burn-in stress_ng_%s command: %s', stressor_type, args)

    try:
        _, err = utils.execute(*args)
        # stress-ng reports on stderr only
        LOG.info(err)
    except (processutils.ProcessExecutionError, OSError) as e:
-        error_msg = "stress-ng (cpu) failed with error %s" % e
+        error_msg = 'stress-ng (%s) failed with error %s' % (stressor_type, e)
        LOG.error(error_msg)
        raise errors.CommandExecutionError(error_msg)


+def stress_ng_cpu(node):
+    """Burn-in the CPU with stress-ng"""
+    stress_ng(node, 'cpu')
+
+
def stress_ng_vm(node):
-    """Burn-in the memory with the vm stressor in stress-ng
-
-    Run stress-ng with a configurable number of workers on
-    a configurable amount of the available memory for
-    a configurable amount of time. Without config use
-    as many workers as CPUs, 98% of the memory and stress
-    it for 24 hours.
-
-    :param node: Ironic node object
-    :raises: CommandExecutionError if the execution of stress-ng fails.
-    """
-    info = node.get('driver_info', {})
-    vm = info.get('agent_burnin_vm_vm', 0)
-    vm_bytes = info.get('agent_burnin_vm_vm-bytes', '98%')
-    timeout = info.get('agent_burnin_vm_timeout', 86400)
-    outputfile = info.get('agent_burnin_vm_outputfile', None)
-
-    args = ('stress-ng', '--vm', vm, '--vm-bytes', vm_bytes,
-            '--timeout', timeout, '--metrics-brief')
-    if outputfile:
-        args += ('--log-file', outputfile,)
-
-    LOG.debug('Burn-in stress_ng_vm command: %s', args)
-
-    try:
-        _, err = utils.execute(*args)
-        # stress-ng reports on stderr only
-        LOG.info(err)
-    except (processutils.ProcessExecutionError, OSError) as e:
-        error_msg = "stress-ng (vm) failed with error %s" % e
-        LOG.error(error_msg)
-        raise errors.CommandExecutionError(error_msg)
+    """Burn-in the memory with the vm stressor in stress-ng."""
+    stress_ng(node, 'vm')
def _smart_test_status(device):
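As an illustration only (not part of this change): a minimal sketch of how the refactored stress_ng() turns the agent_burnin_* keys in driver_info into a stress-ng command line. The node values and the mock-based harness are assumptions chosen to mirror the updated unit tests further below.

# Usage sketch for the refactored stress_ng() helper (illustrative only).
from unittest import mock

from ironic_python_agent import burnin

node = {'driver_info': {
    'agent_burnin_vm_vm': 2,            # two vm stressor workers
    'agent_burnin_vm_vm-bytes': '25%',  # stress 25% of the available memory
    'agent_burnin_vm_timeout': 120,     # run for two minutes instead of 24h
}}

# Patch out the real command execution so nothing is actually stressed.
with mock.patch.object(burnin.utils, 'execute',
                       return_value=('', '')) as mocked:
    burnin.stress_ng_vm(node)

# Expected invocation, matching the reordered arguments in the tests below:
#   stress-ng --vm 2 --timeout 120 --vm-bytes 25% --metrics-brief
print(mocked.call_args)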
@ -420,3 +428,84 @@ def fio_network(node):
irole = "reader" if (role == "writer") else "writer"
logfilename = outputfile + '.' + irole
_do_fio_network(not role == 'writer', runtime, partner, logfilename)
def _gpu_burn_check_count(install_dir, count):
"""Check the count of GPUs with gpu-burn
Run a check to confirm how many GPUs are seen by the OS.
:param install_dir: The location where gpu-burn has been installed.
:param count: The number of expected GPUs.
:raises: CleaningError if the incorrect number of GPUs found.
:raises: CommandExecutionError if the execution of gpu-burn fails.
"""
args = ['./gpu_burn', '-l']
LOG.debug('Burn-in gpu count command: %s', args)
try:
out, _ = utils.execute(*args, cwd=install_dir)
# gpu-burn reports on stdout
LOG.debug(out)
except (processutils.ProcessExecutionError, OSError) as e:
error_msg = 'gpu-burn failed with error %s' % e
LOG.error(error_msg)
raise errors.CommandExecutionError(error_msg)
gpu_data = [i for i in out.splitlines() if i.startswith('ID')]
gpu_count = len(gpu_data)
if gpu_count != count:
error_msg = ("gpu-burn failed to find the correct number of gpus. "
"%s found but %s expected." % (gpu_count, count))
LOG.error(error_msg)
raise errors.CleaningError(error_msg)
def _gpu_burn_run(install_dir, memory, timeout=86400):
"""Burn-in the GPU with gpu-burn
Run a GPU burn-in job for a configurable amount of time.
:param install_dir: The location where gpu-burn has been installed.
:param memory: Use N% or X MB of the available GPU memory.
:param timeout: Timeout in seconds (default: 86400).
:raises: CommandExecutionError if the execution of gpu-burn fails.
"""
args = ['./gpu_burn', '-m', memory, timeout]
LOG.debug('Burn-in gpu command: %s', args)
try:
out, _ = utils.execute(*args, cwd=install_dir)
# gpu-burn reports on stdout
LOG.debug(out)
except (processutils.ProcessExecutionError, OSError) as e:
error_msg = 'gpu-burn failed with error %s' % e
LOG.error(error_msg)
raise errors.CommandExecutionError(error_msg)
def gpu_burn(node):
"""Burn-in and check correct count of GPUs using gpu-burn
Check that the expected number of GPUs are available on the node
and run a GPU burn-in job for a configurable amount of time.
:param node: Ironic node object
"""
info = node.get('driver_info', {})
install_dir = info.get('agent_burnin_gpu_install_dir', '/opt/gpu-burn')
timeout = info.get('agent_burnin_gpu_timeout', 86400)
memory = info.get('agent_burnin_gpu_memory', '95%')
count = info.get('agent_burnin_gpu_count', 0)
# Only check count if an expected number of GPUs has been configured
if count > 0:
_gpu_burn_check_count(install_dir, count)
else:
LOG.debug("Burn-in gpu skipping expected number of GPUs check as "
"'agent_burnin_gpu_count' set to 0")
_gpu_burn_run(install_dir, memory, timeout)
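For illustration (assumed values, not part of the diff): the driver_info keys that gpu_burn() reads, and the commands it ends up running with them.

# Illustrative configuration for the gpu_burn() step above (assumed values).
node = {'driver_info': {
    'agent_burnin_gpu_install_dir': '/opt/gpu-burn',  # default shown above
    'agent_burnin_gpu_count': 2,       # fail cleaning unless 2 GPUs are seen
    'agent_burnin_gpu_memory': '95%',  # use 95% of the available GPU memory
    'agent_burnin_gpu_timeout': 3600,  # burn for one hour instead of 24
}}

# With this configuration the step runs, from /opt/gpu-burn:
#   ./gpu_burn -l            (GPU count check, since count > 0)
#   ./gpu_burn -m 95% 3600   (the burn-in itself)
gpu_burn(node)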

@ -2063,6 +2063,14 @@ class GenericHardwareManager(HardwareManager):
"""
burnin.stress_ng_cpu(node)
def burnin_gpu(self, node, ports):
"""Burn-in the GPU
:param node: Ironic node object
:param ports: list of Ironic port objects
"""
burnin.gpu_burn(node)
def burnin_disk(self, node, ports):
"""Burn-in the disk
@ -2656,6 +2664,13 @@ class GenericHardwareManager(HardwareManager):
'reboot_requested': False,
'abortable': True
},
{
'step': 'burnin_gpu',
'priority': 0,
'interface': 'deploy',
'reboot_requested': False,
'abortable': True
},
{
'step': 'burnin_disk',
'priority': 0,
@ -2752,6 +2767,13 @@ class GenericHardwareManager(HardwareManager):
'reboot_requested': False,
'abortable': True
},
{
'step': 'burnin_gpu',
'priority': 0,
'interface': 'deploy',
'reboot_requested': False,
'abortable': True
},
# NOTE(TheJulia): Burnin disk is explicitly not carried in this
# list because it would be destructive to data on a disk.
# If someone needs to do that, the machine should be
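As a usage note rather than part of this change (the exact invocation is an assumption about typical operator workflows): with the step registered above at priority 0, it does not run during automated cleaning and must be requested explicitly, for example during manual cleaning with a clean-steps document equivalent to the Python structure below.

# Hypothetical manual-cleaning request targeting the new burn-in step.
clean_steps = [
    {
        'interface': 'deploy',
        'step': 'burnin_gpu',
    },
]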

@ -11,6 +11,7 @@
# under the License.
from unittest import mock
from unittest.mock import call
from oslo_concurrency import processutils
from tooz import coordination
@ -94,7 +95,6 @@ class TestBurnin(base.IronicAgentTest):
burnin.stress_ng_cpu, node)
def test_stress_ng_vm_default(self, mock_execute):
node = {'driver_info': {}}
mock_execute.return_value = (['out', 'err'])
@ -102,8 +102,8 @@ class TestBurnin(base.IronicAgentTest):
mock_execute.assert_called_once_with(
-            'stress-ng', '--vm', 0, '--vm-bytes', '98%',
-            '--timeout', 86400, '--metrics-brief')
+            'stress-ng', '--vm', 0, '--timeout', 86400, '--vm-bytes', '98%',
+            '--metrics-brief')
def test_stress_ng_vm_non_default(self, mock_execute):
@ -117,9 +117,8 @@ class TestBurnin(base.IronicAgentTest):
burnin.stress_ng_vm(node)
mock_execute.assert_called_once_with(
-            'stress-ng', '--vm', 2, '--vm-bytes', '25%',
-            '--timeout', 120, '--metrics-brief',
-            '--log-file', '/var/log/burnin.vm')
+            'stress-ng', '--vm', 2, '--timeout', 120, '--vm-bytes', '25%',
+            '--metrics-brief', '--log-file', '/var/log/burnin.vm')
def test_stress_ng_vm_no_stress_ng(self, mock_execute):
@ -515,3 +514,38 @@ class TestBurnin(base.IronicAgentTest):
# get_members is called initially, then every second until the
# other node appears
self.assertEqual(3, mock_coordinator.get_members.call_count)
    def test_gpu_burn_default(self, mock_execute):
        node = {'driver_info': {}}
        mock_execute.return_value = (['out', 'err'])

        expected_calls = [call('./gpu_burn', '-l', cwd='/opt/gpu-burn'),
                          call('./gpu_burn', '-m', '95%', 86400,
                               cwd='/opt/gpu-burn')]

        burnin.gpu_burn(node)

        mock_execute.assert_has_calls(expected_calls, any_order=False)

    @mock.patch('ironic_python_agent.burnin.len', return_value=2,
                autospec=True)
    def test_gpu_burn_non_default(self, mock_gpu_burn_check_count,
                                  mock_execute):
        node = {'driver_info': {
            'agent_burnin_gpu_count': 2,
            'agent_burnin_gpu_timeout': 3600}}
        mock_execute.return_value = (['out', 'err'])

        expected_calls = [call('./gpu_burn', '-l', cwd='/opt/gpu-burn'),
                          call('./gpu_burn', '-m', '95%', 3600,
                               cwd='/opt/gpu-burn')]

        burnin.gpu_burn(node)

        mock_execute.assert_has_calls(expected_calls, any_order=False)

    def test_gpu_burn_no_package(self, mock_execute):
        node = {'driver_info': {}}
        mock_execute.side_effect = processutils.ProcessExecutionError()

        self.assertRaises(errors.CommandExecutionError,
                          burnin.gpu_burn, node)

@ -253,6 +253,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
'reboot_requested': False,
'abortable': True
},
{
'step': 'burnin_gpu',
'priority': 0,
'interface': 'deploy',
'reboot_requested': False,
'abortable': True
},
{
'step': 'burnin_disk',
'priority': 0,

@ -0,0 +1,3 @@
features:
  - |
    Add support for GPU burn-in testing using gpu-burn.