Merge "Add support for burnin-gpu"
@@ -28,71 +28,79 @@ NETWORK_BURNIN_ROLES = frozenset(['writer', 'reader'])
 NETWORK_READER_CYCLE = 30
 
 
-def stress_ng_cpu(node):
-    """Burn-in the CPU with stress-ng
+def stress_ng(node, stressor_type, default_timeout=86400):
+    """Run stress-ng for different stressor types
 
-    Run stress-ng on a configurable number of CPUs for
-    a configurable amount of time. Without config use
-    all CPUs and stress them for 24 hours.
+    Burn-in a configurable number of CPU/VM with stress-ng,
+    for a configurable amount of time but default of 24 hours.
 
     :param node: Ironic node object
+    :param stressor_type: 'cpu' or 'vm'
+    :param default_timeout: Default timeout in seconds (default: 86400)
 
+    :raises: ValueError if an unknown stressor_type is provided
     :raises: CommandExecutionError if the execution of stress-ng fails.
     """
+    stressor_type = stressor_type.lower()
+    if stressor_type not in ['cpu', 'vm']:
+        raise ValueError("Unknown stressor type: %s" % stressor_type)
+
     info = node.get('driver_info', {})
-    cpu = info.get('agent_burnin_cpu_cpu', 0)
-    timeout = info.get('agent_burnin_cpu_timeout', 86400)
-    outputfile = info.get('agent_burnin_cpu_outputfile', None)
-    args = ('stress-ng', '--cpu', cpu, '--timeout', timeout,
-            '--metrics-brief')
+    stressor_suffix = {'cpu': 'cpu'}
+    if stressor_type == 'vm':
+        count_key = 'agent_burnin_%s_vm' % stressor_type
+        bytes_key = 'agent_burnin_%s_vm-bytes' % stressor_type
+    else:
+        count_key = 'agent_burnin_%s_%s' % (stressor_type,
+                                            stressor_suffix[stressor_type])
+        bytes_key = None
+
+    timeout_key = 'agent_burnin_%s_timeout' % stressor_type
+    outputfile_key = 'agent_burnin_%s_outputfile' % stressor_type
+
+    count = info.get(count_key, 0)
+    timeout = info.get(timeout_key, default_timeout)
+    outputfile = info.get(outputfile_key)
+
+    args = ['stress-ng', '--%s' % stressor_type, count, '--timeout', timeout]
+
+    if stressor_type == 'vm':
+        vm_bytes = info.get(bytes_key, '98%')
+        args.extend(['--vm-bytes', vm_bytes])
+
+    args.extend(['--metrics-brief'])
+
     if outputfile:
-        args += ('--log-file', outputfile,)
+        args.extend(['--log-file', outputfile])
 
-    LOG.debug('Burn-in stress_ng_cpu command: %s', args)
+    LOG.debug('Burn-in stress_ng_%s command: %s', stressor_type, args)
 
     try:
         _, err = utils.execute(*args)
         # stress-ng reports on stderr only
         LOG.info(err)
     except (processutils.ProcessExecutionError, OSError) as e:
-        error_msg = "stress-ng (cpu) failed with error %s" % e
+        error_msg = 'stress-ng (%s) failed with error %s' % (stressor_type, e)
         LOG.error(error_msg)
         raise errors.CommandExecutionError(error_msg)
 
 
+def stress_ng_cpu(node):
+    """Burn-in the CPU with stress-ng"""
+    stress_ng(node, 'cpu')
+
+
 def stress_ng_vm(node):
-    """Burn-in the memory with the vm stressor in stress-ng
+    """Burn-in the memory with the vm stressor in stress-ng.
 
     Run stress-ng with a configurable number of workers on
     a configurable amount of the available memory for
     a configurable amount of time. Without config use
     as many workers as CPUs, 98% of the memory and stress
     it for 24 hours.
 
-    :param node: Ironic node object
-    :raises: CommandExecutionError if the execution of stress-ng fails.
     """
-    info = node.get('driver_info', {})
-    vm = info.get('agent_burnin_vm_vm', 0)
-    vm_bytes = info.get('agent_burnin_vm_vm-bytes', '98%')
-    timeout = info.get('agent_burnin_vm_timeout', 86400)
-    outputfile = info.get('agent_burnin_vm_outputfile', None)
-
-    args = ('stress-ng', '--vm', vm, '--vm-bytes', vm_bytes,
-            '--timeout', timeout, '--metrics-brief')
-    if outputfile:
-        args += ('--log-file', outputfile,)
-
-    LOG.debug('Burn-in stress_ng_vm command: %s', args)
-
-    try:
-        _, err = utils.execute(*args)
-        # stress-ng reports on stderr only
-        LOG.info(err)
-    except (processutils.ProcessExecutionError, OSError) as e:
-        error_msg = "stress-ng (vm) failed with error %s" % e
-        LOG.error(error_msg)
-        raise errors.CommandExecutionError(error_msg)
+    stress_ng(node, 'vm')
 
 
 def _smart_test_status(device):
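
For illustration, a minimal standalone sketch of the command line the refactored stress_ng() composes from driver_info (the node dict below is a hypothetical example, not part of the change):

    # Hypothetical driver_info for a vm burn-in; the agent_burnin_* keys
    # match the ones read by stress_ng() in the hunk above.
    node = {'driver_info': {'agent_burnin_vm_vm': 4,
                            'agent_burnin_vm_vm-bytes': '50%',
                            'agent_burnin_vm_timeout': 3600}}

    info = node['driver_info']
    args = ['stress-ng', '--vm', info.get('agent_burnin_vm_vm', 0),
            '--timeout', info.get('agent_burnin_vm_timeout', 86400)]
    args.extend(['--vm-bytes', info.get('agent_burnin_vm_vm-bytes', '98%')])
    args.extend(['--metrics-brief'])
    # -> ['stress-ng', '--vm', 4, '--timeout', 3600,
    #     '--vm-bytes', '50%', '--metrics-brief']
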
@@ -420,3 +428,84 @@ def fio_network(node):
     irole = "reader" if (role == "writer") else "writer"
     logfilename = outputfile + '.' + irole
     _do_fio_network(not role == 'writer', runtime, partner, logfilename)
+
+
+def _gpu_burn_check_count(install_dir, count):
+    """Check the count of GPUs with gpu-burn
+
+    Run a check to confirm how many GPUs are seen by the OS.
+
+    :param install_dir: The location where gpu-burn has been installed.
+    :param count: The number of expected GPUs.
+
+    :raises: CleaningError if the incorrect number of GPUs found.
+    :raises: CommandExecutionError if the execution of gpu-burn fails.
+
+    """
+    args = ['./gpu_burn', '-l']
+    LOG.debug('Burn-in gpu count command: %s', args)
+    try:
+        out, _ = utils.execute(*args, cwd=install_dir)
+        # gpu-burn reports on stdout
+        LOG.debug(out)
+    except (processutils.ProcessExecutionError, OSError) as e:
+        error_msg = 'gpu-burn failed with error %s' % e
+        LOG.error(error_msg)
+        raise errors.CommandExecutionError(error_msg)
+
+    gpu_data = [i for i in out.splitlines() if i.startswith('ID')]
+    gpu_count = len(gpu_data)
+    if gpu_count != count:
+        error_msg = ("gpu-burn failed to find the correct number of gpus. "
+                     "%s found but %s expected." % (gpu_count, count))
+        LOG.error(error_msg)
+        raise errors.CleaningError(error_msg)
+
+
+def _gpu_burn_run(install_dir, memory, timeout=86400):
+    """Burn-in the GPU with gpu-burn
+
+    Run a GPU burn-in job for a configurable amount of time.
+
+    :param install_dir: The location where gpu-burn has been installed.
+    :param memory: Use N% or X MB of the available GPU memory.
+    :param timeout: Timeout in seconds (default: 86400).
+
+    :raises: CommandExecutionError if the execution of gpu-burn fails.
+    """
+
+    args = ['./gpu_burn', '-m', memory, timeout]
+    LOG.debug('Burn-in gpu command: %s', args)
+    try:
+        out, _ = utils.execute(*args, cwd=install_dir)
+        # gpu-burn reports on stdout
+        LOG.debug(out)
+    except (processutils.ProcessExecutionError, OSError) as e:
+        error_msg = 'gpu-burn failed with error %s' % e
+        LOG.error(error_msg)
+        raise errors.CommandExecutionError(error_msg)
+
+
+def gpu_burn(node):
+    """Burn-in and check correct count of GPUs using gpu-burn
+
+    Check that the expected number of GPUs are available on the node
+    and run a GPU burn-in job for a configurable amount of time.
+
+    :param node: Ironic node object
+    """
+    info = node.get('driver_info', {})
+
+    install_dir = info.get('agent_burnin_gpu_install_dir', '/opt/gpu-burn')
+    timeout = info.get('agent_burnin_gpu_timeout', 86400)
+    memory = info.get('agent_burnin_gpu_memory', '95%')
+    count = info.get('agent_burnin_gpu_count', 0)
+
+    # Only check count if an expected number of GPUs has been configured
+    if count > 0:
+        _gpu_burn_check_count(install_dir, count)
+    else:
+        LOG.debug("Burn-in gpu skipping expected number of GPUs check as "
+                  "'agent_burnin_gpu_count' set to 0")
+
+    _gpu_burn_run(install_dir, memory, timeout)
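
_gpu_burn_check_count() derives the GPU count by filtering the './gpu_burn -l' output for lines starting with 'ID'. A minimal sketch of that parsing against hypothetical sample output (the exact text gpu-burn prints may differ):

    # Illustrative output only; what matters is one 'ID n: ...' line per GPU.
    out = ("Run length not specified in command line.\n"
           "ID 0: NVIDIA A100-SXM4-40GB\n"
           "ID 1: NVIDIA A100-SXM4-40GB\n")

    # Same filtering as _gpu_burn_check_count() in the hunk above.
    gpu_data = [i for i in out.splitlines() if i.startswith('ID')]
    assert len(gpu_data) == 2
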
@@ -2063,6 +2063,14 @@ class GenericHardwareManager(HardwareManager):
         """
         burnin.stress_ng_cpu(node)
 
+    def burnin_gpu(self, node, ports):
+        """Burn-in the GPU
+
+        :param node: Ironic node object
+        :param ports: list of Ironic port objects
+        """
+        burnin.gpu_burn(node)
+
     def burnin_disk(self, node, ports):
         """Burn-in the disk
 
@@ -2656,6 +2664,13 @@ class GenericHardwareManager(HardwareManager):
                 'reboot_requested': False,
                 'abortable': True
             },
+            {
+                'step': 'burnin_gpu',
+                'priority': 0,
+                'interface': 'deploy',
+                'reboot_requested': False,
+                'abortable': True
+            },
             {
                 'step': 'burnin_disk',
                 'priority': 0,
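
With the step registered at priority 0, it runs only when requested explicitly; a sketch of the manual clean-steps payload (how it is submitted, via API or client, is deployment-specific):

    import json

    # The node's driver_info supplies the agent_burnin_gpu_* settings read
    # by gpu_burn(); the step itself is addressed by interface and name.
    clean_steps = [{'interface': 'deploy', 'step': 'burnin_gpu'}]
    print(json.dumps(clean_steps))
    # -> [{"interface": "deploy", "step": "burnin_gpu"}]
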
@@ -2752,6 +2767,13 @@ class GenericHardwareManager(HardwareManager):
                 'reboot_requested': False,
                 'abortable': True
             },
+            {
+                'step': 'burnin_gpu',
+                'priority': 0,
+                'interface': 'deploy',
+                'reboot_requested': False,
+                'abortable': True
+            },
             # NOTE(TheJulia): Burnin disk is explicitly not carried in this
             # list because it would be destructive to data on a disk.
             # If someone needs to do that, the machine should be
@@ -11,6 +11,7 @@
 # under the License.
 
 from unittest import mock
+from unittest.mock import call
 
 from oslo_concurrency import processutils
 from tooz import coordination
@@ -94,7 +95,6 @@ class TestBurnin(base.IronicAgentTest):
                           burnin.stress_ng_cpu, node)
 
     def test_stress_ng_vm_default(self, mock_execute):
-
         node = {'driver_info': {}}
         mock_execute.return_value = (['out', 'err'])
 
@@ -102,8 +102,8 @@ class TestBurnin(base.IronicAgentTest):
         burnin.stress_ng_vm(node)
 
         mock_execute.assert_called_once_with(
-            'stress-ng', '--vm', 0, '--vm-bytes', '98%',
-            '--timeout', 86400, '--metrics-brief')
+            'stress-ng', '--vm', 0, '--timeout', 86400, '--vm-bytes', '98%',
+            '--metrics-brief')
 
     def test_stress_ng_vm_non_default(self, mock_execute):
 
@@ -117,9 +117,8 @@ class TestBurnin(base.IronicAgentTest):
         burnin.stress_ng_vm(node)
 
         mock_execute.assert_called_once_with(
-            'stress-ng', '--vm', 2, '--vm-bytes', '25%',
-            '--timeout', 120, '--metrics-brief',
-            '--log-file', '/var/log/burnin.vm')
+            'stress-ng', '--vm', 2, '--timeout', 120, '--vm-bytes', '25%',
+            '--metrics-brief', '--log-file', '/var/log/burnin.vm')
 
     def test_stress_ng_vm_no_stress_ng(self, mock_execute):
 
@@ -515,3 +514,38 @@ class TestBurnin(base.IronicAgentTest):
         # get_members is called initially, then every second until the
         # other node appears
         self.assertEqual(3, mock_coordinator.get_members.call_count)
+
+
+    def test_gpu_burn_default(self, mock_execute):
+        node = {'driver_info': {}}
+        mock_execute.return_value = (['out', 'err'])
+        expected_calls = [call('./gpu_burn', '-l', cwd='/opt/gpu-burn'),
+                          call('./gpu_burn', '-m', '95%', 86400,
+                               cwd='/opt/gpu-burn')]
+
+        burnin.gpu_burn(node)
+
+        mock_execute.assert_has_calls(expected_calls, any_order=False)
+
+
+    @mock.patch('ironic_python_agent.burnin.len', return_value=2, autospec=True)
+    def test_gpu_burn_non_default(self, mock_gpu_burn_check_count, mock_execute):
+        node = {'driver_info': {
+            'agent_burnin_gpu_count': 2,
+            'agent_burnin_gpu_timeout': 3600}}
+        mock_execute.return_value = (['out', 'err'])
+        expected_calls = [call('./gpu_burn', '-l', cwd='/opt/gpu-burn'),
+                          call('./gpu_burn', '-m', '95%', 3600,
+                               cwd='/opt/gpu-burn')]
+
+        burnin.gpu_burn(node)
+
+        mock_execute.assert_has_calls(expected_calls, any_order=False)
+
+
+    def test_gpu_burn_no_package(self, mock_execute):
+        node = {'driver_info': {}}
+        mock_execute.side_effect = processutils.ProcessExecutionError()
+
+        self.assertRaises(errors.CommandExecutionError,
+                          burnin.gpu_burn, node)
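
test_gpu_burn_non_default patches len through the burnin module namespace so that _gpu_burn_check_count() sees two GPUs even though the mocked execute() output contains no 'ID' lines. A toy sketch of why that works (hypothetical module and function names, not the real test):

    from unittest import mock
    import types

    # Build a throwaway module whose function resolves len() via the
    # module globals, like ironic_python_agent.burnin does.
    mod = types.ModuleType('toy_burnin')
    exec("def gpu_count(out):\n"
         "    return len([i for i in out.splitlines()"
         " if i.startswith('ID')])\n", mod.__dict__)

    # Patching 'len' on the module shadows the builtin for that module only.
    with mock.patch.object(mod, 'len', create=True, return_value=2):
        assert mod.gpu_count('no ID lines at all') == 2
    assert mod.gpu_count('no ID lines at all') == 0  # builtin len again
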
@@ -253,6 +253,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
                 'reboot_requested': False,
                 'abortable': True
             },
+            {
+                'step': 'burnin_gpu',
+                'priority': 0,
+                'interface': 'deploy',
+                'reboot_requested': False,
+                'abortable': True
+            },
             {
                 'step': 'burnin_disk',
                 'priority': 0,
@@ -0,0 +1,3 @@
+features:
+  - |
+    Add support for GPU burn-in testing using gpu-burn.