Burn-in: Add network step
Add a clean step for network burn-in via fio. Get basic run parameters from the node's driver_info. Story: #2007523 Task: #42385 Change-Id: I2861696740b2de9ec38f7e9fc2c5e448c009d0bf
This commit is contained in:
parent
20e145e4da
commit
cacdd9bab3
@ -82,6 +82,9 @@ Clean steps
|
|||||||
``deploy.burnin_memory``
|
``deploy.burnin_memory``
|
||||||
Stress-test the memory of a node via stress-ng for a configurable
|
Stress-test the memory of a node via stress-ng for a configurable
|
||||||
amount of time. Disabled by default.
|
amount of time. Disabled by default.
|
||||||
|
``deploy.burnin_network``
|
||||||
|
Stress-test the network of a pair of nodes via fio for a configurable
|
||||||
|
amount of time. Disabled by default.
|
||||||
``deploy.erase_devices``
|
``deploy.erase_devices``
|
||||||
Securely erases all information from all recognized disk devices.
|
Securely erases all information from all recognized disk devices.
|
||||||
Relatively fast when secure ATA erase is available, otherwise can take
|
Relatively fast when secure ATA erase is available, otherwise can take
|
||||||
|
@ -10,6 +10,8 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
from ironic_lib import utils
|
from ironic_lib import utils
|
||||||
from oslo_concurrency import processutils
|
from oslo_concurrency import processutils
|
||||||
from oslo_log import log
|
from oslo_log import log
|
||||||
@ -19,6 +21,9 @@ from ironic_python_agent import hardware
|
|||||||
|
|
||||||
LOG = log.getLogger(__name__)
|
LOG = log.getLogger(__name__)
|
||||||
|
|
||||||
|
NETWORK_BURNIN_ROLES = frozenset(['writer', 'reader'])
|
||||||
|
NETWORK_READER_CYCLE = 30
|
||||||
|
|
||||||
|
|
||||||
def stress_ng_cpu(node):
|
def stress_ng_cpu(node):
|
||||||
"""Burn-in the CPU with stress-ng
|
"""Burn-in the CPU with stress-ng
|
||||||
@ -115,3 +120,72 @@ def fio_disk(node):
|
|||||||
{'err': e})
|
{'err': e})
|
||||||
LOG.error(error_msg)
|
LOG.error(error_msg)
|
||||||
raise errors.CommandExecutionError(error_msg)
|
raise errors.CommandExecutionError(error_msg)
|
||||||
|
|
||||||
|
|
||||||
|
def _do_fio_network(writer, runtime, partner):
    """Run one direction of the fio network burn-in.

    The writer side starts fio in listening mode and blocks until fio
    returns. The reader side connects to ``partner`` and is restarted
    every NETWORK_READER_CYCLE seconds for as long as fio fails with a
    'Connection' error, i.e. while the partner is not listening yet.

    :param writer: True to run fio as the listening writer, False to run
                   it as the reader.
    :param runtime: value handed to fio's ``--runtime`` (writer only).
    :param partner: value handed to fio's ``--hostname`` (reader only).
    :raises: CommandExecutionError if the execution of fio fails for any
             reason other than a reader-side 'Connection' error.
    """

    args = ['fio', '--ioengine', 'net', '--port', '9000', '--fill_device', 1,
            '--group_reporting', '--gtod_reduce', 1, '--numjobs', 16]
    if writer:
        xargs = ['--name', 'writer', '--rw', 'write', '--runtime', runtime,
                 '--time_based', '--listen']
    else:
        xargs = ['--name', 'reader', '--rw', 'read', '--hostname', partner]
    args.extend(xargs)

    while True:
        LOG.info('Burn-in fio network command: %s', ' '.join(map(str, args)))
        try:
            out, err = utils.execute(*args)
            # fio reports on stdout
            LOG.info(out)
            break
        except (processutils.ProcessExecutionError, OSError) as e:
            # Interpolate the message here: building it with a comma
            # instead of '%' yields a (format, dict) tuple, which would
            # then be logged and raised verbatim.
            error_msg = "fio (network) failed with error %(err)s" % {'err': e}
            LOG.error(error_msg)
            # while the writer blocks in fio, the reader fails with
            # 'Connection {refused, timeout}' errors if the partner
            # is not ready, so we need to wait explicitly
            if not writer and 'Connection' in str(e):
                LOG.info("fio (network): reader retrying in %s seconds ...",
                         NETWORK_READER_CYCLE)
                time.sleep(NETWORK_READER_CYCLE)
            else:
                raise errors.CommandExecutionError(error_msg)
|
||||||
|
|
||||||
|
|
||||||
|
def fio_network(node):
    """Burn-in the network with fio

    Run an fio network job for a pair of nodes for a configurable
    amount of time. The pair is statically defined in driver_info
    via 'agent_burnin_fio_network_config', a mapping with the keys
    'role' (one of NETWORK_BURNIN_ROLES) and 'partner' (the host the
    reader connects to). The runtime is read from
    'agent_burnin_fio_network_runtime' and defaults to 21600.
    The writer will wait for the reader to connect, then write to the
    network. Upon completion, the roles are swapped.

    Note (arne_wiebalck): Initial version. The plan is to make the
                          match making dynamic by posting availability
                          on a distributed backend, e.g. via tooz.

    :param node: Ironic node object
    :raises: CommandExecutionError if the execution of fio fails.
    :raises: CleaningError if the configuration is incomplete, i.e. the
             config is missing, the role is unknown or no partner is set.
    """

    info = node.get('driver_info', {})
    runtime = info.get('agent_burnin_fio_network_runtime', 21600)

    # get our role and identify our partner
    config = info.get('agent_burnin_fio_network_config')
    if not config:
        error_msg = ("fio (network) failed to find "
                     "'agent_burnin_fio_network_config' in driver_info")
        raise errors.CleaningError(error_msg)
    LOG.debug("agent_burnin_fio_network_config is %s", str(config))

    # Fail early on a bad role: otherwise anything that is not 'writer'
    # would silently be treated as a reader.
    role = config.get('role')
    if role not in NETWORK_BURNIN_ROLES:
        error_msg = "fio (network) found an unknown role: %s" % role
        raise errors.CleaningError(error_msg)

    # Both directions need the partner (the node is reader in one of
    # them), so a missing partner can never lead to a successful run.
    partner = config.get('partner')
    if not partner:
        error_msg = ("fio (network) failed to find 'partner' in "
                     "'agent_burnin_fio_network_config'")
        raise errors.CleaningError(error_msg)

    _do_fio_network(role == 'writer', runtime, partner)
    LOG.debug("fio (network): first direction done, swapping roles ...")
    _do_fio_network(role != 'writer', runtime, partner)
|
||||||
|
@ -1418,6 +1418,14 @@ class GenericHardwareManager(HardwareManager):
|
|||||||
"""
|
"""
|
||||||
burnin.stress_ng_vm(node)
|
burnin.stress_ng_vm(node)
|
||||||
|
|
||||||
|
def burnin_network(self, node, ports):
    """Clean step: burn-in the network.

    Hands the node over to ``burnin.fio_network``; ``ports`` is accepted
    but not used here.

    :param node: Ironic node object
    :param ports: list of Ironic port objects
    """
    burnin.fio_network(node)
|
||||||
|
|
||||||
def _shred_block_device(self, node, block_device):
|
def _shred_block_device(self, node, block_device):
|
||||||
"""Erase a block device using shred.
|
"""Erase a block device using shred.
|
||||||
|
|
||||||
@ -1912,6 +1920,13 @@ class GenericHardwareManager(HardwareManager):
|
|||||||
'reboot_requested': False,
|
'reboot_requested': False,
|
||||||
'abortable': True
|
'abortable': True
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
'step': 'burnin_network',
|
||||||
|
'priority': 0,
|
||||||
|
'interface': 'deploy',
|
||||||
|
'reboot_requested': False,
|
||||||
|
'abortable': True
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
def get_deploy_steps(self, node, ports):
|
def get_deploy_steps(self, node, ports):
|
||||||
|
@ -144,3 +144,77 @@ class TestBurnin(base.IronicAgentTest):
|
|||||||
|
|
||||||
self.assertRaises(errors.CommandExecutionError,
|
self.assertRaises(errors.CommandExecutionError,
|
||||||
burnin.fio_disk, node)
|
burnin.fio_disk, node)
|
||||||
|
|
||||||
|
def test_fio_network_reader(self, mock_execute):
    """A node configured as reader runs the reader job, then the writer."""
    pair_config = {'partner': 'host-002', 'role': 'reader'}
    driver_info = {'agent_burnin_fio_network_runtime': 600,
                   'agent_burnin_fio_network_config': pair_config}
    mock_execute.return_value = ['out', 'err']

    burnin.fio_network({'driver_info': driver_info})

    # arguments shared by both directions
    common = ('fio', '--ioengine', 'net', '--port', '9000',
              '--fill_device', 1, '--group_reporting',
              '--gtod_reduce', 1, '--numjobs', 16)
    reader_args = common + ('--name', 'reader', '--rw', 'read',
                            '--hostname', 'host-002')
    writer_args = common + ('--name', 'writer', '--rw', 'write',
                            '--runtime', 600, '--time_based', '--listen')
    mock_execute.assert_has_calls([mock.call(*reader_args),
                                   mock.call(*writer_args)])
|
||||||
|
|
||||||
|
def test_fio_network_writer(self, mock_execute):
    """A node configured as writer runs the writer job, then the reader."""
    pair_config = {'partner': 'host-001', 'role': 'writer'}
    driver_info = {'agent_burnin_fio_network_runtime': 600,
                   'agent_burnin_fio_network_config': pair_config}
    mock_execute.return_value = ['out', 'err']

    burnin.fio_network({'driver_info': driver_info})

    # arguments shared by both directions
    common = ('fio', '--ioengine', 'net', '--port', '9000',
              '--fill_device', 1, '--group_reporting',
              '--gtod_reduce', 1, '--numjobs', 16)
    writer_args = common + ('--name', 'writer', '--rw', 'write',
                            '--runtime', 600, '--time_based', '--listen')
    reader_args = common + ('--name', 'reader', '--rw', 'read',
                            '--hostname', 'host-001')
    mock_execute.assert_has_calls([mock.call(*writer_args),
                                   mock.call(*reader_args)])
|
||||||
|
|
||||||
|
def test_fio_network_no_fio(self, mock_execute):
    """A failing fio execution surfaces as CommandExecutionError."""
    pair_config = {'partner': 'host-003', 'role': 'reader'}
    node = {'driver_info': {'agent_burnin_fio_network_config': pair_config}}
    mock_execute.side_effect = processutils.ProcessExecutionError('boom')

    with self.assertRaises(errors.CommandExecutionError):
        burnin.fio_network(node)
|
||||||
|
|
||||||
|
@mock.patch('time.sleep', autospec=True)
def test_fio_network_reader_loop(self, mock_time, mock_execute):
    """The reader sleeps and retries until the partner accepts."""
    pair_config = {'partner': 'host-004', 'role': 'reader'}
    node = {'driver_info': {'agent_burnin_fio_network_config': pair_config}}

    # mock the infinite loop: three connection failures ...
    failures = [processutils.ProcessExecutionError(reason)
                for reason in ('Connection timeout',
                               'Connection timeout',
                               'Connection refused')]
    # ... then the successful connection and the reversed roles
    successes = [['out', 'err'], ['out', 'err']]
    mock_execute.side_effect = failures + successes

    burnin.fio_network(node)

    # we loop 3 times, then do the 2 fio calls
    self.assertEqual(5, mock_execute.call_count)
    self.assertEqual(3, mock_time.call_count)
|
||||||
|
@ -170,6 +170,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
|
|||||||
'interface': 'deploy',
|
'interface': 'deploy',
|
||||||
'reboot_requested': False,
|
'reboot_requested': False,
|
||||||
'abortable': True
|
'abortable': True
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'step': 'burnin_network',
|
||||||
|
'priority': 0,
|
||||||
|
'interface': 'deploy',
|
||||||
|
'reboot_requested': False,
|
||||||
|
'abortable': True
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
clean_steps = self.hardware.get_clean_steps(self.node, [])
|
clean_steps = self.hardware.get_clean_steps(self.node, [])
|
||||||
|
@ -0,0 +1,6 @@
|
|||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
Adds a burn-in cleaning step 'burnin_network' to stress test the
|
||||||
|
network interface for a configurable amount of time with fio. To
|
||||||
|
use this step, fio needs to be installed on the RAM disk.
|
Loading…
x
Reference in New Issue
Block a user