def stress_ng(node, stressor_type, default_timeout=86400):
    """Run stress-ng for different stressor types

    Burn-in a configurable number of CPU/VM workers with stress-ng,
    for a configurable amount of time (default: 24 hours).

    :param node: Ironic node object
    :param stressor_type: 'cpu' or 'vm'
    :param default_timeout: Default timeout in seconds (default: 86400)

    :raises: ValueError if an unknown stressor_type is provided
    :raises: CommandExecutionError if the execution of stress-ng fails.
    """
    stressor_type = stressor_type.lower()
    if stressor_type not in ('cpu', 'vm'):
        raise ValueError("Unknown stressor type: %s" % stressor_type)

    info = node.get('driver_info', {})

    # Driver-info keys follow the historic naming scheme
    # agent_burnin_<stressor>_<option>; the worker-count option is named
    # after the stressor itself (agent_burnin_cpu_cpu, agent_burnin_vm_vm),
    # so no per-stressor suffix table is needed.
    count = info.get('agent_burnin_%s_%s' % (stressor_type, stressor_type), 0)
    timeout = info.get('agent_burnin_%s_timeout' % stressor_type,
                       default_timeout)
    outputfile = info.get('agent_burnin_%s_outputfile' % stressor_type)

    args = ['stress-ng', '--%s' % stressor_type, count, '--timeout', timeout]

    if stressor_type == 'vm':
        # Without config, stress 98% of the available memory.
        vm_bytes = info.get('agent_burnin_vm_vm-bytes', '98%')
        args.extend(['--vm-bytes', vm_bytes])

    args.append('--metrics-brief')

    if outputfile:
        args.extend(['--log-file', outputfile])

    LOG.debug('Burn-in stress_ng_%s command: %s', stressor_type, args)

    try:
        _, err = utils.execute(*args)
        # stress-ng reports on stderr only
        LOG.info(err)
    except (processutils.ProcessExecutionError, OSError) as e:
        error_msg = 'stress-ng (%s) failed with error %s' % (stressor_type, e)
        LOG.error(error_msg)
        raise errors.CommandExecutionError(error_msg)


def stress_ng_cpu(node):
    """Burn-in the CPU with stress-ng

    Run stress-ng on a configurable number of CPUs for a configurable
    amount of time. Without config, use all CPUs and stress them for
    24 hours.

    :param node: Ironic node object
    :raises: CommandExecutionError if the execution of stress-ng fails.
    """
    stress_ng(node, 'cpu')


def stress_ng_vm(node):
    """Burn-in the memory with the vm stressor in stress-ng

    Run stress-ng with a configurable number of workers on a
    configurable amount of the available memory for a configurable
    amount of time. Without config, use as many workers as CPUs, 98%
    of the memory and stress it for 24 hours.

    :param node: Ironic node object
    :raises: CommandExecutionError if the execution of stress-ng fails.
    """
    stress_ng(node, 'vm')


def _gpu_burn_check_count(install_dir, count):
    """Check the count of GPUs with gpu-burn

    Run a check to confirm how many GPUs are seen by the OS.

    :param install_dir: The location where gpu-burn has been installed.
    :param count: The number of expected GPUs.

    :raises: CleaningError if the incorrect number of GPUs found.
    :raises: CommandExecutionError if the execution of gpu-burn fails.
    """
    args = ['./gpu_burn', '-l']
    LOG.debug('Burn-in gpu count command: %s', args)
    try:
        out, _ = utils.execute(*args, cwd=install_dir)
        # gpu-burn reports on stdout
        LOG.debug(out)
    except (processutils.ProcessExecutionError, OSError) as e:
        error_msg = 'gpu-burn failed with error %s' % e
        LOG.error(error_msg)
        raise errors.CommandExecutionError(error_msg)

    # gpu-burn -l prints one 'ID <n>: ...' line per detected GPU.
    gpu_count = len([line for line in out.splitlines()
                     if line.startswith('ID')])
    if gpu_count != count:
        error_msg = ("gpu-burn failed to find the correct number of gpus. "
                     "%s found but %s expected." % (gpu_count, count))
        LOG.error(error_msg)
        raise errors.CleaningError(error_msg)
" + "%s found but %s expected." % (gpu_count, count)) + LOG.error(error_msg) + raise errors.CleaningError(error_msg) + + +def _gpu_burn_run(install_dir, memory, timeout=86400): + """Burn-in the GPU with gpu-burn + + Run a GPU burn-in job for a configurable amount of time. + + :param install_dir: The location where gpu-burn has been installed. + :param memory: Use N% or X MB of the available GPU memory. + :param timeout: Timeout in seconds (default: 86400). + + :raises: CommandExecutionError if the execution of gpu-burn fails. + """ + + args = ['./gpu_burn', '-m', memory, timeout] + LOG.debug('Burn-in gpu command: %s', args) + try: + out, _ = utils.execute(*args, cwd=install_dir) + # gpu-burn reports on stdout + LOG.debug(out) + except (processutils.ProcessExecutionError, OSError) as e: + error_msg = 'gpu-burn failed with error %s' % e + LOG.error(error_msg) + raise errors.CommandExecutionError(error_msg) + + +def gpu_burn(node): + """Burn-in and check correct count of GPUs using gpu-burn + + Check that the expected number of GPUs are available on the node + and run a GPU burn-in job for a configurable amount of time. 
+ + :param node: Ironic node object + """ + info = node.get('driver_info', {}) + + install_dir = info.get('agent_burnin_gpu_install_dir', '/opt/gpu-burn') + timeout = info.get('agent_burnin_gpu_timeout', 86400) + memory = info.get('agent_burnin_gpu_memory', '95%') + count = info.get('agent_burnin_gpu_count', 0) + + # Only check count if an expected number of GPUs has been configured + if count > 0: + _gpu_burn_check_count(install_dir, count) + else: + LOG.debug("Burn-in gpu skipping expected number of GPUs check as " + "'agent_burnin_gpu_count' set to 0") + + _gpu_burn_run(install_dir, memory, timeout) diff --git a/ironic_python_agent/hardware.py b/ironic_python_agent/hardware.py index 9fd237323..78e805881 100644 --- a/ironic_python_agent/hardware.py +++ b/ironic_python_agent/hardware.py @@ -1941,6 +1941,14 @@ class GenericHardwareManager(HardwareManager): """ burnin.stress_ng_cpu(node) + def burnin_gpu(self, node, ports): + """Burn-in the GPU + + :param node: Ironic node object + :param ports: list of Ironic port objects + """ + burnin.gpu_burn(node) + def burnin_disk(self, node, ports): """Burn-in the disk @@ -2498,6 +2506,13 @@ class GenericHardwareManager(HardwareManager): 'reboot_requested': False, 'abortable': True }, + { + 'step': 'burnin_gpu', + 'priority': 0, + 'interface': 'deploy', + 'reboot_requested': False, + 'abortable': True + }, { 'step': 'burnin_disk', 'priority': 0, @@ -2594,6 +2609,13 @@ class GenericHardwareManager(HardwareManager): 'reboot_requested': False, 'abortable': True }, + { + 'step': 'burnin_gpu', + 'priority': 0, + 'interface': 'deploy', + 'reboot_requested': False, + 'abortable': True + }, # NOTE(TheJulia): Burnin disk is explicitly not carried in this # list because it would be destructive to data on a disk. 
# If someone needs to do that, the machine should be diff --git a/ironic_python_agent/tests/unit/test_burnin.py b/ironic_python_agent/tests/unit/test_burnin.py index f1b2f99cd..f0134d4c5 100644 --- a/ironic_python_agent/tests/unit/test_burnin.py +++ b/ironic_python_agent/tests/unit/test_burnin.py @@ -11,6 +11,7 @@ # under the License. from unittest import mock +from unittest.mock import call from ironic_lib import utils from oslo_concurrency import processutils @@ -94,7 +95,6 @@ class TestBurnin(base.IronicAgentTest): burnin.stress_ng_cpu, node) def test_stress_ng_vm_default(self, mock_execute): - node = {'driver_info': {}} mock_execute.return_value = (['out', 'err']) @@ -102,8 +102,8 @@ class TestBurnin(base.IronicAgentTest): mock_execute.assert_called_once_with( - 'stress-ng', '--vm', 0, '--vm-bytes', '98%', - '--timeout', 86400, '--metrics-brief') + 'stress-ng', '--vm', 0, '--timeout', 86400, '--vm-bytes', '98%', + '--metrics-brief') def test_stress_ng_vm_non_default(self, mock_execute): @@ -117,9 +117,8 @@ class TestBurnin(base.IronicAgentTest): burnin.stress_ng_vm(node) mock_execute.assert_called_once_with( - 'stress-ng', '--vm', 2, '--vm-bytes', '25%', - '--timeout', 120, '--metrics-brief', - '--log-file', '/var/log/burnin.vm') + 'stress-ng', '--vm', 2, '--timeout', 120, '--vm-bytes', '25%', + '--metrics-brief', '--log-file', '/var/log/burnin.vm') def test_stress_ng_vm_no_stress_ng(self, mock_execute): @@ -515,3 +514,38 @@ class TestBurnin(base.IronicAgentTest): # get_members is called initially, then every second until the # other node appears self.assertEqual(3, mock_coordinator.get_members.call_count) + + +def test_gpu_burn_default(self, mock_execute): + node = {'driver_info': {}} + mock_execute.return_value = (['out', 'err']) + expected_calls = [call('./gpu_burn', '-l', cwd='/opt/gpu-burn'), + call('./gpu_burn', '-m', '95%', 86400, + cwd='/opt/gpu-burn')] + + burnin.gpu_burn(node) + + mock_execute.assert_has_calls(expected_calls, any_order=False) + + 
+@mock.patch('ironic_python_agent.burnin.len', return_value=2, autospec=True) +def test_gpu_burn_non_default(self, mock_gpu_burn_check_count, mock_execute): + node = {'driver_info': { + 'agent_burnin_gpu_count': 2, + 'agent_burnin_gpu_timeout': 3600}} + mock_execute.return_value = (['out', 'err']) + expected_calls = [call('./gpu_burn', '-l', cwd='/opt/gpu-burn'), + call('./gpu_burn', '-m', '95%', 3600, + cwd='/opt/gpu-burn')] + + burnin.gpu_burn(node) + + mock_execute.assert_has_calls(expected_calls, any_order=False) + + +def test_gpu_burn_no_package(self, mock_execute): + node = {'driver_info': {}} + mock_execute.side_effect = processutils.ProcessExecutionError() + + self.assertRaises(errors.CommandExecutionError, + burnin.gpu_burn, node) diff --git a/ironic_python_agent/tests/unit/test_hardware.py b/ironic_python_agent/tests/unit/test_hardware.py index 75869971a..6b4ffd73e 100644 --- a/ironic_python_agent/tests/unit/test_hardware.py +++ b/ironic_python_agent/tests/unit/test_hardware.py @@ -244,6 +244,13 @@ class TestGenericHardwareManager(base.IronicAgentTest): 'reboot_requested': False, 'abortable': True }, + { + 'step': 'burnin_gpu', + 'priority': 0, + 'interface': 'deploy', + 'reboot_requested': False, + 'abortable': True + }, { 'step': 'burnin_disk', 'priority': 0, diff --git a/releasenotes/notes/add-support-for-burnin-gpu-76c8c267529a18bd.yaml b/releasenotes/notes/add-support-for-burnin-gpu-76c8c267529a18bd.yaml new file mode 100644 index 000000000..8d2bd03f4 --- /dev/null +++ b/releasenotes/notes/add-support-for-burnin-gpu-76c8c267529a18bd.yaml @@ -0,0 +1,3 @@ +features: + - | + Add support for GPU burn-in testing using gpu-burn.