Merge "Add support for burnin-gpu"
@@ -28,71 +28,79 @@ NETWORK_BURNIN_ROLES = frozenset(['writer', 'reader'])
 NETWORK_READER_CYCLE = 30
 
 
-def stress_ng_cpu(node):
-    """Burn-in the CPU with stress-ng
+def stress_ng(node, stressor_type, default_timeout=86400):
+    """Run stress-ng for different stressor types
 
-    Run stress-ng on a configurable number of CPUs for
-    a configurable amount of time. Without config use
-    all CPUs and stress them for 24 hours.
+    Burn-in a configurable number of CPU/VM with stress-ng,
+    for a configurable amount of time but default of 24 hours.
 
     :param node: Ironic node object
+    :param stressor_type: 'cpu' or 'vm'
+    :param default_timeout: Default timeout in seconds (default: 86400)
 
+    :raises: ValueError if an unknown stressor_type is provided
     :raises: CommandExecutionError if the execution of stress-ng fails.
     """
+    stressor_type = stressor_type.lower()
+    if stressor_type not in ['cpu', 'vm']:
+        raise ValueError("Unknown stressor type: %s" % stressor_type)
+
     info = node.get('driver_info', {})
-    cpu = info.get('agent_burnin_cpu_cpu', 0)
-    timeout = info.get('agent_burnin_cpu_timeout', 86400)
-    outputfile = info.get('agent_burnin_cpu_outputfile', None)
-    args = ('stress-ng', '--cpu', cpu, '--timeout', timeout,
-            '--metrics-brief')
+    stressor_suffix = {'cpu': 'cpu'}
+    if stressor_type == 'vm':
+        count_key = 'agent_burnin_%s_vm' % stressor_type
+        bytes_key = 'agent_burnin_%s_vm-bytes' % stressor_type
+    else:
+        count_key = 'agent_burnin_%s_%s' % (stressor_type,
+                                            stressor_suffix[stressor_type])
+        bytes_key = None
+
+    timeout_key = 'agent_burnin_%s_timeout' % stressor_type
+    outputfile_key = 'agent_burnin_%s_outputfile' % stressor_type
+
+    count = info.get(count_key, 0)
+    timeout = info.get(timeout_key, default_timeout)
+    outputfile = info.get(outputfile_key)
+
+    args = ['stress-ng', '--%s' % stressor_type, count, '--timeout', timeout]
+
+    if stressor_type == 'vm':
+        vm_bytes = info.get(bytes_key, '98%')
+        args.extend(['--vm-bytes', vm_bytes])
+
+    args.extend(['--metrics-brief'])
+
     if outputfile:
-        args += ('--log-file', outputfile,)
+        args.extend(['--log-file', outputfile])
 
-    LOG.debug('Burn-in stress_ng_cpu command: %s', args)
+    LOG.debug('Burn-in stress_ng_%s command: %s', stressor_type, args)
 
     try:
         _, err = utils.execute(*args)
         # stress-ng reports on stderr only
         LOG.info(err)
     except (processutils.ProcessExecutionError, OSError) as e:
-        error_msg = "stress-ng (cpu) failed with error %s" % e
+        error_msg = 'stress-ng (%s) failed with error %s' % (stressor_type, e)
         LOG.error(error_msg)
         raise errors.CommandExecutionError(error_msg)
 
 
+def stress_ng_cpu(node):
+    """Burn-in the CPU with stress-ng"""
+    stress_ng(node, 'cpu')
+
+
 def stress_ng_vm(node):
-    """Burn-in the memory with the vm stressor in stress-ng
+    """Burn-in the memory with the vm stressor in stress-ng.
 
     Run stress-ng with a configurable number of workers on
     a configurable amount of the available memory for
     a configurable amount of time. Without config use
     as many workers as CPUs, 98% of the memory and stress
     it for 24 hours.
 
-    :param node: Ironic node object
-    :raises: CommandExecutionError if the execution of stress-ng fails.
     """
-    info = node.get('driver_info', {})
-    vm = info.get('agent_burnin_vm_vm', 0)
-    vm_bytes = info.get('agent_burnin_vm_vm-bytes', '98%')
-    timeout = info.get('agent_burnin_vm_timeout', 86400)
-    outputfile = info.get('agent_burnin_vm_outputfile', None)
-
-    args = ('stress-ng', '--vm', vm, '--vm-bytes', vm_bytes,
-            '--timeout', timeout, '--metrics-brief')
-    if outputfile:
-        args += ('--log-file', outputfile,)
-
-    LOG.debug('Burn-in stress_ng_vm command: %s', args)
-
-    try:
-        _, err = utils.execute(*args)
-        # stress-ng reports on stderr only
-        LOG.info(err)
-    except (processutils.ProcessExecutionError, OSError) as e:
-        error_msg = "stress-ng (vm) failed with error %s" % e
-        LOG.error(error_msg)
-        raise errors.CommandExecutionError(error_msg)
+    stress_ng(node, 'vm')
 
 
 def _smart_test_status(device):
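
For illustration, a minimal standalone sketch of the command line the refactored stress_ng() composes from driver_info (the node dict below is a hypothetical example, not part of the change):

    # Hypothetical driver_info for a vm burn-in; the agent_burnin_* keys
    # match the ones read by stress_ng() in the hunk above.
    node = {'driver_info': {'agent_burnin_vm_vm': 4,
                            'agent_burnin_vm_vm-bytes': '50%',
                            'agent_burnin_vm_timeout': 3600}}

    info = node['driver_info']
    args = ['stress-ng', '--vm', info.get('agent_burnin_vm_vm', 0),
            '--timeout', info.get('agent_burnin_vm_timeout', 86400)]
    args.extend(['--vm-bytes', info.get('agent_burnin_vm_vm-bytes', '98%')])
    args.extend(['--metrics-brief'])
    # -> ['stress-ng', '--vm', 4, '--timeout', 3600,
    #     '--vm-bytes', '50%', '--metrics-brief']
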
@@ -420,3 +428,84 @@ def fio_network(node):
     irole = "reader" if (role == "writer") else "writer"
     logfilename = outputfile + '.' + irole
     _do_fio_network(not role == 'writer', runtime, partner, logfilename)
+
+
+def _gpu_burn_check_count(install_dir, count):
+    """Check the count of GPUs with gpu-burn
+
+    Run a check to confirm how many GPUs are seen by the OS.
+
+    :param install_dir: The location where gpu-burn has been installed.
+    :param count: The number of expected GPUs.
+
+    :raises: CleaningError if the incorrect number of GPUs found.
+    :raises: CommandExecutionError if the execution of gpu-burn fails.
+
+    """
+    args = ['./gpu_burn', '-l']
+    LOG.debug('Burn-in gpu count command: %s', args)
+    try:
+        out, _ = utils.execute(*args, cwd=install_dir)
+        # gpu-burn reports on stdout
+        LOG.debug(out)
+    except (processutils.ProcessExecutionError, OSError) as e:
+        error_msg = 'gpu-burn failed with error %s' % e
+        LOG.error(error_msg)
+        raise errors.CommandExecutionError(error_msg)
+
+    gpu_data = [i for i in out.splitlines() if i.startswith('ID')]
+    gpu_count = len(gpu_data)
+    if gpu_count != count:
+        error_msg = ("gpu-burn failed to find the correct number of gpus. "
+                     "%s found but %s expected." % (gpu_count, count))
+        LOG.error(error_msg)
+        raise errors.CleaningError(error_msg)
+
+
+def _gpu_burn_run(install_dir, memory, timeout=86400):
+    """Burn-in the GPU with gpu-burn
+
+    Run a GPU burn-in job for a configurable amount of time.
+
+    :param install_dir: The location where gpu-burn has been installed.
+    :param memory: Use N% or X MB of the available GPU memory.
+    :param timeout: Timeout in seconds (default: 86400).
+
+    :raises: CommandExecutionError if the execution of gpu-burn fails.
+    """
+
+    args = ['./gpu_burn', '-m', memory, timeout]
+    LOG.debug('Burn-in gpu command: %s', args)
+    try:
+        out, _ = utils.execute(*args, cwd=install_dir)
+        # gpu-burn reports on stdout
+        LOG.debug(out)
+    except (processutils.ProcessExecutionError, OSError) as e:
+        error_msg = 'gpu-burn failed with error %s' % e
+        LOG.error(error_msg)
+        raise errors.CommandExecutionError(error_msg)
+
+
+def gpu_burn(node):
+    """Burn-in and check correct count of GPUs using gpu-burn
+
+    Check that the expected number of GPUs are available on the node
+    and run a GPU burn-in job for a configurable amount of time.
+
+    :param node: Ironic node object
+    """
+    info = node.get('driver_info', {})
+
+    install_dir = info.get('agent_burnin_gpu_install_dir', '/opt/gpu-burn')
+    timeout = info.get('agent_burnin_gpu_timeout', 86400)
+    memory = info.get('agent_burnin_gpu_memory', '95%')
+    count = info.get('agent_burnin_gpu_count', 0)
+
+    # Only check count if an expected number of GPUs has been configured
+    if count > 0:
+        _gpu_burn_check_count(install_dir, count)
+    else:
+        LOG.debug("Burn-in gpu skipping expected number of GPUs check as "
+                  "'agent_burnin_gpu_count' set to 0")
+
+    _gpu_burn_run(install_dir, memory, timeout)
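
_gpu_burn_check_count() derives the GPU count by filtering the './gpu_burn -l' output for lines starting with 'ID'. A minimal sketch of that parsing against hypothetical sample output (the exact text gpu-burn prints may differ):

    # Illustrative output only; what matters is one 'ID n: ...' line per GPU.
    out = ("Run length not specified in command line.\n"
           "ID 0: NVIDIA A100-SXM4-40GB\n"
           "ID 1: NVIDIA A100-SXM4-40GB\n")

    # Same filtering as _gpu_burn_check_count() in the hunk above.
    gpu_data = [i for i in out.splitlines() if i.startswith('ID')]
    assert len(gpu_data) == 2
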
@@ -2063,6 +2063,14 @@ class GenericHardwareManager(HardwareManager):
         """
         burnin.stress_ng_cpu(node)
 
+    def burnin_gpu(self, node, ports):
+        """Burn-in the GPU
+
+        :param node: Ironic node object
+        :param ports: list of Ironic port objects
+        """
+        burnin.gpu_burn(node)
+
     def burnin_disk(self, node, ports):
         """Burn-in the disk
 
@@ -2656,6 +2664,13 @@ class GenericHardwareManager(HardwareManager):
                 'reboot_requested': False,
                 'abortable': True
             },
+            {
+                'step': 'burnin_gpu',
+                'priority': 0,
+                'interface': 'deploy',
+                'reboot_requested': False,
+                'abortable': True
+            },
             {
                 'step': 'burnin_disk',
                 'priority': 0,
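
With the step registered at priority 0, it runs only when requested explicitly; a sketch of the manual clean-steps payload (how it is submitted, via API or client, is deployment-specific):

    import json

    # The node's driver_info supplies the agent_burnin_gpu_* settings read
    # by gpu_burn(); the step itself is addressed by interface and name.
    clean_steps = [{'interface': 'deploy', 'step': 'burnin_gpu'}]
    print(json.dumps(clean_steps))
    # -> [{"interface": "deploy", "step": "burnin_gpu"}]
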
@@ -2752,6 +2767,13 @@ class GenericHardwareManager(HardwareManager):
                 'reboot_requested': False,
                 'abortable': True
             },
+            {
+                'step': 'burnin_gpu',
+                'priority': 0,
+                'interface': 'deploy',
+                'reboot_requested': False,
+                'abortable': True
+            },
             # NOTE(TheJulia): Burnin disk is explicitly not carried in this
             # list because it would be destructive to data on a disk.
             # If someone needs to do that, the machine should be
@@ -11,6 +11,7 @@
 # under the License.
 
 from unittest import mock
+from unittest.mock import call
 
 from oslo_concurrency import processutils
 from tooz import coordination
@@ -94,7 +95,6 @@ class TestBurnin(base.IronicAgentTest):
                           burnin.stress_ng_cpu, node)
 
     def test_stress_ng_vm_default(self, mock_execute):
-
         node = {'driver_info': {}}
         mock_execute.return_value = (['out', 'err'])
 
@@ -102,8 +102,8 @@ class TestBurnin(base.IronicAgentTest):
         burnin.stress_ng_vm(node)
 
         mock_execute.assert_called_once_with(
-            'stress-ng', '--vm', 0, '--vm-bytes', '98%',
-            '--timeout', 86400, '--metrics-brief')
+            'stress-ng', '--vm', 0, '--timeout', 86400, '--vm-bytes', '98%',
+            '--metrics-brief')
 
     def test_stress_ng_vm_non_default(self, mock_execute):
 
@@ -117,9 +117,8 @@ class TestBurnin(base.IronicAgentTest):
         burnin.stress_ng_vm(node)
 
         mock_execute.assert_called_once_with(
-            'stress-ng', '--vm', 2, '--vm-bytes', '25%',
-            '--timeout', 120, '--metrics-brief',
-            '--log-file', '/var/log/burnin.vm')
+            'stress-ng', '--vm', 2, '--timeout', 120, '--vm-bytes', '25%',
+            '--metrics-brief', '--log-file', '/var/log/burnin.vm')
 
     def test_stress_ng_vm_no_stress_ng(self, mock_execute):
 
@@ -515,3 +514,38 @@ class TestBurnin(base.IronicAgentTest):
         # get_members is called initially, then every second until the
         # other node appears
         self.assertEqual(3, mock_coordinator.get_members.call_count)
+
+
+    def test_gpu_burn_default(self, mock_execute):
+        node = {'driver_info': {}}
+        mock_execute.return_value = (['out', 'err'])
+        expected_calls = [call('./gpu_burn', '-l', cwd='/opt/gpu-burn'),
+                          call('./gpu_burn', '-m', '95%', 86400,
+                               cwd='/opt/gpu-burn')]
+
+        burnin.gpu_burn(node)
+
+        mock_execute.assert_has_calls(expected_calls, any_order=False)
+
+
+    @mock.patch('ironic_python_agent.burnin.len', return_value=2, autospec=True)
+    def test_gpu_burn_non_default(self, mock_gpu_burn_check_count, mock_execute):
+        node = {'driver_info': {
+            'agent_burnin_gpu_count': 2,
+            'agent_burnin_gpu_timeout': 3600}}
+        mock_execute.return_value = (['out', 'err'])
+        expected_calls = [call('./gpu_burn', '-l', cwd='/opt/gpu-burn'),
+                          call('./gpu_burn', '-m', '95%', 3600,
+                               cwd='/opt/gpu-burn')]
+
+        burnin.gpu_burn(node)
+
+        mock_execute.assert_has_calls(expected_calls, any_order=False)
+
+
+    def test_gpu_burn_no_package(self, mock_execute):
+        node = {'driver_info': {}}
+        mock_execute.side_effect = processutils.ProcessExecutionError()
+
+        self.assertRaises(errors.CommandExecutionError,
+                          burnin.gpu_burn, node)
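
test_gpu_burn_non_default patches len through the burnin module namespace so that _gpu_burn_check_count() sees two GPUs even though the mocked execute() output contains no 'ID' lines. A toy sketch of why that works (hypothetical module and function names, not the real test):

    from unittest import mock
    import types

    # Build a throwaway module whose function resolves len() via the
    # module globals, like ironic_python_agent.burnin does.
    mod = types.ModuleType('toy_burnin')
    exec("def gpu_count(out):\n"
         "    return len([i for i in out.splitlines()"
         " if i.startswith('ID')])\n", mod.__dict__)

    # Patching 'len' on the module shadows the builtin for that module only.
    with mock.patch.object(mod, 'len', create=True, return_value=2):
        assert mod.gpu_count('no ID lines at all') == 2
    assert mod.gpu_count('no ID lines at all') == 0  # builtin len again
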
@@ -253,6 +253,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
                 'reboot_requested': False,
                 'abortable': True
             },
+            {
+                'step': 'burnin_gpu',
+                'priority': 0,
+                'interface': 'deploy',
+                'reboot_requested': False,
+                'abortable': True
+            },
             {
                 'step': 'burnin_disk',
                 'priority': 0,
@@ -0,0 +1,3 @@
+features:
+  - |
+    Add support for GPU burn-in testing using gpu-burn.