Arne Wiebalck a86e21e4f4 Check the network burnin roles and partner
The network burnin roles are 'reader' and 'writer'. Raise an error
if the role is not provided or if the role is unknown. Equally,
raise an error if the partner is not provided.

Change-Id: I6259a7b0d15d62e68b1dc27f0cb511f8563c02ce
2021-08-10 16:37:40 +02:00

200 lines
7.2 KiB
Python

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from ironic_lib import utils
from oslo_concurrency import processutils
from oslo_log import log
from ironic_python_agent import errors
from ironic_python_agent import hardware
# Module-level logger, named after this module.
LOG = log.getLogger(__name__)
# Roles a node may take in the fio network burn-in; fio_network() rejects
# any role that is not a member of this set.
NETWORK_BURNIN_ROLES = frozenset(['writer', 'reader'])
# Number of seconds the reader sleeps before retrying after a
# 'Connection ...' failure in _do_fio_network().
NETWORK_READER_CYCLE = 30
def stress_ng_cpu(node):
    """Burn-in the CPU with stress-ng

    Run stress-ng on a configurable number of CPUs for
    a configurable amount of time. Without config use
    all CPUs and stress them for 24 hours.

    The settings are read from the node's ``driver_info``:
    ``agent_burnin_cpu_cpu`` (default 0) and
    ``agent_burnin_cpu_timeout`` (default 86400 seconds).

    :param node: Ironic node object
    :raises: CommandExecutionError if the execution of stress-ng fails.
    """
    info = node.get('driver_info', {})
    cpu = info.get('agent_burnin_cpu_cpu', 0)
    timeout = info.get('agent_burnin_cpu_timeout', 86400)

    args = ('stress-ng', '--cpu', cpu, '--timeout', timeout,
            '--metrics-brief')
    LOG.debug('Burn-in stress_ng_cpu command: %s', args)

    try:
        _, err = utils.execute(*args)
        # stress-ng reports on stderr only
        LOG.info(err)
    except (processutils.ProcessExecutionError, OSError) as e:
        # Interpolate the error into the message; passing the format string
        # and its arguments as a tuple would log and raise the raw tuple.
        error_msg = ("stress-ng (cpu) failed with error %(err)s"
                     % {'err': e})
        LOG.error(error_msg)
        raise errors.CommandExecutionError(error_msg)
def stress_ng_vm(node):
    """Burn-in the memory with the vm stressor in stress-ng

    Run stress-ng with a configurable number of workers on
    a configurable amount of the available memory for
    a configurable amount of time. Without config use
    as many workers as CPUs, 98% of the memory and stress
    it for 24 hours.

    The settings are read from the node's ``driver_info``:
    ``agent_burnin_vm_vm`` (default 0), ``agent_burnin_vm_vm-bytes``
    (default '98%') and ``agent_burnin_vm_timeout`` (default 86400
    seconds).

    :param node: Ironic node object
    :raises: CommandExecutionError if the execution of stress-ng fails.
    """
    info = node.get('driver_info', {})
    vm = info.get('agent_burnin_vm_vm', 0)
    vm_bytes = info.get('agent_burnin_vm_vm-bytes', '98%')
    timeout = info.get('agent_burnin_vm_timeout', 86400)

    args = ('stress-ng', '--vm', vm, '--vm-bytes', vm_bytes,
            '--timeout', timeout, '--metrics-brief')
    LOG.debug('Burn-in stress_ng_vm command: %s', args)

    try:
        _, err = utils.execute(*args)
        # stress-ng reports on stderr only
        LOG.info(err)
    except (processutils.ProcessExecutionError, OSError) as e:
        # Interpolate the error into the message; passing the format string
        # and its arguments as a tuple would log and raise the raw tuple.
        error_msg = ("stress-ng (vm) failed with error %(err)s"
                     % {'err': e})
        LOG.error(error_msg)
        raise errors.CommandExecutionError(error_msg)
def fio_disk(node):
    """Burn-in the disks with fio

    Run an fio randrw job for a configurable number of iterations
    or a given amount of time.

    The settings are read from the node's ``driver_info``:
    ``agent_burnin_fio_disk_loops`` (default 4) and
    ``agent_burnin_fio_disk_runtime`` (default 0). One fio job is
    added for every block device reported by the hardware manager.

    :param node: Ironic node object
    :raises: CommandExecutionError if the execution of fio fails.
    """
    info = node.get('driver_info', {})
    # 4 iterations, same as badblock's default
    loops = info.get('agent_burnin_fio_disk_loops', 4)
    runtime = info.get('agent_burnin_fio_disk_runtime', 0)

    args = ['fio', '--rw', 'readwrite', '--bs', '4k', '--direct', 1,
            '--ioengine', 'libaio', '--iodepth', '32', '--verify',
            'crc32c', '--verify_dump', 1, '--continue_on_error', 'verify',
            '--loops', loops, '--runtime', runtime, '--time_based']

    # one '--name <device>' job per block device
    devices = hardware.list_all_block_devices()
    for device in devices:
        args.extend(['--name', device.name])

    LOG.debug('Burn-in fio disk command: %s', ' '.join(map(str, args)))

    try:
        out, _ = utils.execute(*args)
        # fio reports on stdout
        LOG.info(out)
    except (processutils.ProcessExecutionError, OSError) as e:
        # Interpolate the error into the message; passing the format string
        # and its arguments as a tuple would log and raise the raw tuple.
        error_msg = ("fio (disk) failed with error %(err)s"
                     % {'err': e})
        LOG.error(error_msg)
        raise errors.CommandExecutionError(error_msg)
def _do_fio_network(writer, runtime, partner):
    """Run one direction of the fio network burn-in

    Build and execute the fio 'net' ioengine command for either the
    writer (which listens) or the reader (which connects to the partner).
    A reader whose execution fails with an error mentioning 'Connection'
    sleeps for NETWORK_READER_CYCLE seconds and retries; there is no
    upper bound on the number of retries.

    :param writer: True to run as the writer, False to run as the reader.
    :param runtime: value passed to fio's '--runtime' (writer only).
    :param partner: value passed to fio's '--hostname' (reader only).
    :raises: CommandExecutionError if the execution of fio fails for any
             reason other than a reader-side 'Connection' error.
    """
    args = ['fio', '--ioengine', 'net', '--port', '9000', '--fill_device', 1,
            '--group_reporting', '--gtod_reduce', 1, '--numjobs', 16]
    if writer:
        xargs = ['--name', 'writer', '--rw', 'write', '--runtime', runtime,
                 '--time_based', '--listen']
    else:
        xargs = ['--name', 'reader', '--rw', 'read', '--hostname', partner]
    args.extend(xargs)

    while True:
        LOG.info('Burn-in fio network command: %s', ' '.join(map(str, args)))
        try:
            out, _ = utils.execute(*args)
            # fio reports on stdout
            LOG.info(out)
            break
        except (processutils.ProcessExecutionError, OSError) as e:
            # Interpolate the error into the message; passing the format
            # string and its arguments as a tuple would log and raise the
            # raw tuple.
            error_msg = ("fio (network) failed with error %(err)s"
                         % {'err': e})
            LOG.error(error_msg)
            # while the writer blocks in fio, the reader fails with
            # 'Connection {refused, timeout}' errors if the partner
            # is not ready, so we need to wait explicitly
            if not writer and 'Connection' in str(e):
                LOG.info("fio (network): reader retrying in %s seconds ...",
                         NETWORK_READER_CYCLE)
                time.sleep(NETWORK_READER_CYCLE)
            else:
                raise errors.CommandExecutionError(error_msg)
def fio_network(node):
    """Burn-in the network with fio

    Run an fio network job for a pair of nodes for a configurable
    amount of time. The pair is statically defined in driver_info
    via 'agent_burnin_fio_network_config'.

    The writer will wait for the reader to connect, then write to the
    network. Upon completion, the roles are swapped.

    Note (arne_wiebalck): Initial version. The plan is to make the
                          match making dynamic by posting availability
                          on a distributed backend, e.g. via tooz.

    :param node: Ironic node object
    :raises: CommandExecutionError if the execution of fio fails.
    :raises: CleaningError if the configuration is incomplete, i.e. the
             config is missing, the role is missing or unknown, or the
             partner is missing.
    """
    info = node.get('driver_info', {})
    runtime = info.get('agent_burnin_fio_network_runtime', 21600)

    # get our role and identify our partner
    config = info.get('agent_burnin_fio_network_config')
    if not config:
        error_msg = ("fio (network) failed to find "
                     "'agent_burnin_fio_network_config' in driver_info")
        raise errors.CleaningError(error_msg)
    LOG.debug("agent_burnin_fio_network_config is %s", str(config))

    role = config.get('role')
    if role not in NETWORK_BURNIN_ROLES:
        # Interpolate the role into the message; a (format, arg) tuple
        # would be raised verbatim instead of a readable string.
        error_msg = "fio (network) found an unknown role: %s" % role
        raise errors.CleaningError(error_msg)

    partner = config.get('partner')
    if not partner:
        error_msg = "fio (network) failed to find partner"
        raise errors.CleaningError(error_msg)

    # first run in the configured role, then in the opposite one
    _do_fio_network(role == 'writer', runtime, partner)
    LOG.debug("fio (network): first direction done, swapping roles ...")
    _do_fio_network(role != 'writer', runtime, partner)