Burn-in: Add network step

Add a clean step for network burn-in via fio. Get basic
run parameters from the node's driver_info.

Story: #2007523
Task: #42385

Change-Id: I2861696740b2de9ec38f7e9fc2c5e448c009d0bf
This commit is contained in:
Arne Wiebalck 2021-06-28 10:49:15 +02:00
parent 20e145e4da
commit cacdd9bab3
6 changed files with 179 additions and 0 deletions

View File

@ -82,6 +82,9 @@ Clean steps
``deploy.burnin_memory`` ``deploy.burnin_memory``
Stress-test the memory of a node via stress-ng for a configurable Stress-test the memory of a node via stress-ng for a configurable
amount of time. Disabled by default. amount of time. Disabled by default.
``deploy.burnin_network``
Stress-test the network of a pair of nodes via fio for a configurable
amount of time. Disabled by default.
``deploy.erase_devices`` ``deploy.erase_devices``
Securely erases all information from all recognized disk devices. Securely erases all information from all recognized disk devices.
Relatively fast when secure ATA erase is available, otherwise can take Relatively fast when secure ATA erase is available, otherwise can take

View File

@ -10,6 +10,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import time
from ironic_lib import utils from ironic_lib import utils
from oslo_concurrency import processutils from oslo_concurrency import processutils
from oslo_log import log from oslo_log import log
@ -19,6 +21,9 @@ from ironic_python_agent import hardware
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
# Role names for the two ends of an fio network burn-in pair.
NETWORK_BURNIN_ROLES = frozenset(['writer', 'reader'])
# Seconds the reader sleeps before retrying after a 'Connection' error.
NETWORK_READER_CYCLE = 30
def stress_ng_cpu(node): def stress_ng_cpu(node):
"""Burn-in the CPU with stress-ng """Burn-in the CPU with stress-ng
@ -115,3 +120,72 @@ def fio_disk(node):
{'err': e}) {'err': e})
LOG.error(error_msg) LOG.error(error_msg)
raise errors.CommandExecutionError(error_msg) raise errors.CommandExecutionError(error_msg)
def _do_fio_network(writer, runtime, partner):
    """Run fio's 'net' ioengine for one direction of the network burn-in.

    The writer listens and blocks inside fio until a reader connects. The
    reader connects to the partner and, as long as fio fails with a
    'Connection' error, retries every NETWORK_READER_CYCLE seconds.

    :param writer: True to run as the listening writer, False to run as
                   the reader.
    :param runtime: value passed to fio's --runtime (writer only).
    :param partner: value passed to fio's --hostname (reader only).
    :raises: CommandExecutionError if fio fails, except for the reader's
             'Connection' errors, which are retried.
    """
    args = ['fio', '--ioengine', 'net', '--port', '9000', '--fill_device', 1,
            '--group_reporting', '--gtod_reduce', 1, '--numjobs', 16]
    if writer:
        args.extend(['--name', 'writer', '--rw', 'write',
                     '--runtime', runtime, '--time_based', '--listen'])
    else:
        args.extend(['--name', 'reader', '--rw', 'read',
                     '--hostname', partner])

    while True:
        LOG.info('Burn-in fio network command: %s', ' '.join(map(str, args)))
        try:
            out, err = utils.execute(*args)
        except (processutils.ProcessExecutionError, OSError) as e:
            # Interpolate the message here: the previous code built a
            # (format, mapping) tuple, so the log line and the raised
            # exception carried the tuple instead of readable text.
            error_msg = "fio (network) failed with error %(err)s" % {'err': e}
            LOG.error(error_msg)
            # while the writer blocks in fio, the reader fails with
            # 'Connection {refused, timeout}' errors if the partner
            # is not ready, so we need to wait explicitly
            if writer or 'Connection' not in str(e):
                raise errors.CommandExecutionError(error_msg)
            LOG.info("fio (network): reader retrying in %s seconds ...",
                     NETWORK_READER_CYCLE)
            time.sleep(NETWORK_READER_CYCLE)
        else:
            # fio reports on stdout
            LOG.info(out)
            break
def fio_network(node):
    """Burn-in the network with fio

    Run an fio network job for a pair of nodes for a configurable
    amount of time. The pair is statically defined in driver_info
    via 'agent_burnin_fio_network_config', a mapping with the keys
    'role' (one of NETWORK_BURNIN_ROLES) and 'partner'.

    The writer will wait for the reader to connect, then write to the
    network. Upon completion, the roles are swapped.

    Note (arne_wiebalck): Initial version. The plan is to make the
                          match making dynamic by posting availability
                          on a distributed backend, e.g. via tooz.

    :param node: Ironic node object
    :raises: CommandExecutionError if the execution of fio fails.
    :raises: CleaningError if the configuration is incomplete, i.e. the
             config is missing, the role is unknown or no partner is set.
    """
    info = node.get('driver_info', {})
    runtime = info.get('agent_burnin_fio_network_runtime', 21600)

    # get our role and identify our partner
    config = info.get('agent_burnin_fio_network_config')
    if not config:
        error_msg = ("fio (network) failed to find "
                     "'agent_burnin_fio_network_config' in driver_info")
        raise errors.CleaningError(error_msg)
    LOG.debug("agent_burnin_fio_network_config is %s", str(config))

    # Reject a missing or misspelled role instead of silently treating
    # it as 'reader'.
    role = config.get('role')
    if role not in NETWORK_BURNIN_ROLES:
        error_msg = "fio (network) found an unknown role: %s" % role
        raise errors.CleaningError(error_msg)

    # Without a partner the reader would hand None to fio's --hostname.
    partner = config.get('partner')
    if not partner:
        error_msg = "fio (network) failed to find partner"
        raise errors.CleaningError(error_msg)

    is_writer = role == 'writer'
    _do_fio_network(is_writer, runtime, partner)
    LOG.debug("fio (network): first direction done, swapping roles ...")
    _do_fio_network(not is_writer, runtime, partner)

View File

@ -1418,6 +1418,14 @@ class GenericHardwareManager(HardwareManager):
""" """
burnin.stress_ng_vm(node) burnin.stress_ng_vm(node)
def burnin_network(self, node, ports):
    """Clean step: run the fio-based network burn-in for a node.

    Delegates to burnin.fio_network().

    :param node: Ironic node object
    :param ports: list of Ironic port objects (not used by this step)
    """
    burnin.fio_network(node)
def _shred_block_device(self, node, block_device): def _shred_block_device(self, node, block_device):
"""Erase a block device using shred. """Erase a block device using shred.
@ -1912,6 +1920,13 @@ class GenericHardwareManager(HardwareManager):
'reboot_requested': False, 'reboot_requested': False,
'abortable': True 'abortable': True
}, },
{
'step': 'burnin_network',
'priority': 0,
'interface': 'deploy',
'reboot_requested': False,
'abortable': True
},
] ]
def get_deploy_steps(self, node, ports): def get_deploy_steps(self, node, ports):

View File

@ -144,3 +144,77 @@ class TestBurnin(base.IronicAgentTest):
self.assertRaises(errors.CommandExecutionError, self.assertRaises(errors.CommandExecutionError,
burnin.fio_disk, node) burnin.fio_disk, node)
def test_fio_network_reader(self, mock_execute):
    """A node configured as reader runs fio as reader, then as writer."""
    mock_execute.return_value = ['out', 'err']
    pair_config = {'partner': 'host-002', 'role': 'reader'}
    driver_info = {'agent_burnin_fio_network_runtime': 600,
                   'agent_burnin_fio_network_config': pair_config}

    burnin.fio_network({'driver_info': driver_info})

    # arguments shared by both directions
    common = ('fio', '--ioengine', 'net', '--port', '9000',
              '--fill_device', 1, '--group_reporting',
              '--gtod_reduce', 1, '--numjobs', 16)
    as_reader = mock.call(*common, '--name', 'reader', '--rw', 'read',
                          '--hostname', 'host-002')
    as_writer = mock.call(*common, '--name', 'writer', '--rw', 'write',
                          '--runtime', 600, '--time_based', '--listen')
    mock_execute.assert_has_calls([as_reader, as_writer])
def test_fio_network_writer(self, mock_execute):
    """A node configured as writer runs fio as writer, then as reader."""
    mock_execute.return_value = ['out', 'err']
    pair_config = {'partner': 'host-001', 'role': 'writer'}
    driver_info = {'agent_burnin_fio_network_runtime': 600,
                   'agent_burnin_fio_network_config': pair_config}

    burnin.fio_network({'driver_info': driver_info})

    # arguments shared by both directions
    common = ('fio', '--ioengine', 'net', '--port', '9000',
              '--fill_device', 1, '--group_reporting',
              '--gtod_reduce', 1, '--numjobs', 16)
    as_writer = mock.call(*common, '--name', 'writer', '--rw', 'write',
                          '--runtime', 600, '--time_based', '--listen')
    as_reader = mock.call(*common, '--name', 'reader', '--rw', 'read',
                          '--hostname', 'host-001')
    mock_execute.assert_has_calls([as_writer, as_reader])
def test_fio_network_no_fio(self, mock_execute):
    """An fio failure without a 'Connection' error is raised."""
    mock_execute.side_effect = processutils.ProcessExecutionError('boom')
    pair_config = {'partner': 'host-003', 'role': 'reader'}
    node = {'driver_info': {'agent_burnin_fio_network_config': pair_config}}

    with self.assertRaises(errors.CommandExecutionError):
        burnin.fio_network(node)
@mock.patch('time.sleep', autospec=True)
def test_fio_network_reader_loop(self, mock_time, mock_execute):
    """The reader sleeps and retries after each 'Connection' error."""
    pair_config = {'partner': 'host-004', 'role': 'reader'}
    node = {'driver_info': {'agent_burnin_fio_network_config': pair_config}}

    # mock the infinite loop: three failed connection attempts ...
    outcomes = [processutils.ProcessExecutionError(reason)
                for reason in ('Connection timeout',
                               'Connection timeout',
                               'Connection refused')]
    outcomes.append(['out', 'err'])  # connected!
    outcomes.append(['out', 'err'])  # reversed roles
    mock_execute.side_effect = outcomes

    burnin.fio_network(node)

    # we loop 3 times, then do the 2 fio calls
    self.assertEqual(5, mock_execute.call_count)
    self.assertEqual(3, mock_time.call_count)

View File

@ -170,6 +170,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
'interface': 'deploy', 'interface': 'deploy',
'reboot_requested': False, 'reboot_requested': False,
'abortable': True 'abortable': True
},
{
'step': 'burnin_network',
'priority': 0,
'interface': 'deploy',
'reboot_requested': False,
'abortable': True
} }
] ]
clean_steps = self.hardware.get_clean_steps(self.node, []) clean_steps = self.hardware.get_clean_steps(self.node, [])

View File

@ -0,0 +1,6 @@
---
features:
- |
Adds a burn-in cleaning step 'burnin_network' to stress test the
network interface for a configurable amount of time with fio. To
use this step, fio needs to be installed on the RAM disk.