Burn-in: Dynamic network pairing

Pair nodes dynamically via a distributed coordination backend for
network burn-in. The algorithm uses a group to pair nodes: after
acquiring a lock, a first node joins the group, releases the lock,
waits for a second node, then they both leave, and release the lock
for the next pair.

Story: #2007523
Task: #42796

Change-Id: I572093b144bc90a49cd76929c7e8685ed45d9f6e
This commit is contained in:
Arne Wiebalck 2021-12-06 15:23:21 +01:00
parent fa5cccd137
commit 7f15455d8d
6 changed files with 293 additions and 16 deletions

View File

@ -11,11 +11,13 @@
# limitations under the License. # limitations under the License.
import json import json
import socket
import time import time
from ironic_lib import utils from ironic_lib import utils
from oslo_concurrency import processutils from oslo_concurrency import processutils
from oslo_log import log from oslo_log import log
from tooz import coordination
from ironic_python_agent import errors from ironic_python_agent import errors
from ironic_python_agent import hardware from ironic_python_agent import hardware
@ -252,45 +254,160 @@ def _do_fio_network(writer, runtime, partner, outputfile):
raise errors.CommandExecutionError(error_msg) raise errors.CommandExecutionError(error_msg)
def _find_network_burnin_partner_and_role(backend_url, group_name, timeout):
"""Find a partner node for network burn-in and get our role.
:param backend_url: The tooz backend url.
:param group_name: The tooz group name for pairing.
:param timeout:Timeout in seconds for a node to wait for a partner.
:returns: A set with the partner node and the role of the local node.
"""
member_id = socket.gethostname()
coordinator = coordination.get_coordinator(backend_url, member_id)
coordinator.start(start_heart=True)
groups = coordinator.get_groups()
for group in groups.get():
if group_name == group.decode('utf-8'):
LOG.debug("Found group %s", group_name)
break
else:
LOG.info("Creating group %s", group_name)
coordinator.create_group(group_name)
def join_group(group_name):
request = coordinator.join_group(group_name)
request.get()
def leave_group(group_name):
request = coordinator.leave_group(group_name)
request.get()
# Attempt to get the pairing lock. The lock is released when:
# a) a node enters the group and is the first to join, or
# b) a node enters second, finished pairing, sees
# the pairing node exiting, and left itself.
# The lock 'walls' all nodes willing to pair.
group_lock = coordinator.get_lock("group_lock")
with group_lock:
# we need the initial members in order to know the first
# node (which may leave quickly when we join)
init_members = coordinator.get_members(group_name)
LOG.info("Original group members are %s", init_members.get())
members_cnt = len(init_members.get())
join_group(group_name)
# we assign the first node the writer role since it will
# leave the group first, it may be ready once the second
# node leaves the group, and we save one wait cycle
if not members_cnt:
first = True
role = "writer"
group_lock.release() # allow second node to enter
else:
first = False
role = "reader"
partner = None
start_pairing = time.time()
while time.time() - start_pairing < timeout:
if first:
# we are the first and therefore need to wait
# for another node to join
members = coordinator.get_members(group_name)
members_cnt = len(members.get())
else:
# use the initial members in case the other
# node leaves before we get an updated list
members = init_members
assert members_cnt < 3
if members_cnt == 2 or not first:
LOG.info("Two members, start pairing...")
for member in members.get():
node = member.decode('utf-8')
if node != member_id:
partner = node
if not partner:
error_msg = ("fio (network) no partner to pair found")
raise errors.CleaningError(error_msg)
# if you are the second to enter, wait for the first to exit
if not first:
members = coordinator.get_members(group_name)
while (len(members.get()) == 2):
time.sleep(0.2)
members = coordinator.get_members(group_name)
leave_group(group_name)
group_lock.release()
else:
leave_group(group_name)
break
else:
LOG.info("One member, waiting for second node to join ...")
time.sleep(1)
else:
leave_group(group_name)
error_msg = ("fio (network) timed out to find partner")
raise errors.CleaningError(error_msg)
return (partner, role)
def fio_network(node): def fio_network(node):
"""Burn-in the network with fio """Burn-in the network with fio
Run an fio network job for a pair of nodes for a configurable Run an fio network job for a pair of nodes for a configurable
amount of time. The pair is statically defined in driver_info amount of time. The pair is either statically defined in
via 'agent_burnin_fio_network_config'. driver_info via 'agent_burnin_fio_network_config' or the role
and partner is found dynamically via a tooz backend.
The writer will wait for the reader to connect, then write to the The writer will wait for the reader to connect, then write to the
network. Upon completion, the roles are swapped. network. Upon completion, the roles are swapped.
Note (arne_wiebalck): Initial version. The plan is to make the
match making dynamic by posting availability
on a distributed backend, e.g. via tooz.
:param node: Ironic node object :param node: Ironic node object
:raises: CommandExecutionError if the execution of fio fails. :raises: CommandExecutionError if the execution of fio fails.
:raises: CleaningError if the configuration is incomplete. :raises: CleaningError if the configuration is incomplete.
""" """
info = node.get('driver_info', {}) info = node.get('driver_info', {})
runtime = info.get('agent_burnin_fio_network_runtime', 21600) runtime = info.get('agent_burnin_fio_network_runtime', 21600)
outputfile = info.get('agent_burnin_fio_network_outputfile', None) outputfile = info.get('agent_burnin_fio_network_outputfile', None)
# get our role and identify our partner # get our role and identify our partner
config = info.get('agent_burnin_fio_network_config') config = info.get('agent_burnin_fio_network_config')
if not config: if config:
error_msg = ("fio (network) failed to find " LOG.debug("static agent_burnin_fio_network_config is %s",
"'agent_burnin_fio_network_config' in driver_info") config)
raise errors.CleaningError(error_msg) role = config.get('role')
LOG.debug("agent_burnin_fio_network_config is %s", str(config)) partner = config.get('partner')
else:
timeout = info.get(
'agent_burnin_fio_network_pairing_timeout', 900)
group_name = info.get(
'agent_burnin_fio_network_pairing_group_name',
'ironic.network-burnin')
backend_url = info.get(
'agent_burnin_fio_network_pairing_backend_url', None)
if not backend_url:
msg = ('fio (network): dynamic pairing config is missing '
'agent_burnin_fio_network_pairing_backend_url')
raise errors.CleaningError(msg)
LOG.info("dynamic pairing for network burn-in ...")
(partner, role) = _find_network_burnin_partner_and_role(
backend_url=backend_url,
group_name=group_name,
timeout=timeout)
role = config.get('role')
if role not in NETWORK_BURNIN_ROLES: if role not in NETWORK_BURNIN_ROLES:
error_msg = "fio (network) found an unknown role: %s" % role error_msg = "fio (network) found an unknown role: %s" % role
raise errors.CleaningError(error_msg) raise errors.CleaningError(error_msg)
partner = config.get('partner')
if not partner: if not partner:
error_msg = ("fio (network) failed to find partner") error_msg = "fio (network) failed to find partner"
raise errors.CleaningError(error_msg) raise errors.CleaningError(error_msg)
LOG.info("fio (network): partner %s, role is %s", partner, role)
logfilename = None logfilename = None
if outputfile: if outputfile:

View File

@ -14,6 +14,7 @@ from unittest import mock
from ironic_lib import utils from ironic_lib import utils
from oslo_concurrency import processutils from oslo_concurrency import processutils
from tooz import coordination
from ironic_python_agent import burnin from ironic_python_agent import burnin
from ironic_python_agent import errors from ironic_python_agent import errors
@ -379,3 +380,138 @@ class TestBurnin(base.IronicAgentTest):
# we loop 3 times, then do the 2 fio calls # we loop 3 times, then do the 2 fio calls
self.assertEqual(5, mock_execute.call_count) self.assertEqual(5, mock_execute.call_count)
self.assertEqual(3, mock_time.call_count) self.assertEqual(3, mock_time.call_count)
def test_fio_network_dynamic_pairing_raise_missing_config(self,
mock_execute):
node = {'driver_info': {}}
self.assertRaises(errors.CleaningError, burnin.fio_network, node)
def test_fio_network_dynamic_pairing_raise_wrong_config(self,
mock_execute):
node = {'driver_info': {
'backend_url': 'zookeeper://zookeeper-host-01:2181',
'group_name': 'ironic.dynamic-network-burnin',
'timeout': 600}}
self.assertRaises(errors.CleaningError, burnin.fio_network, node)
@mock.patch.object(burnin, '_find_network_burnin_partner_and_role',
autospec=True)
def test_fio_network_dynamic_pairing_defaults(self, mock_find,
mock_execute):
node = {'driver_info': {
'agent_burnin_fio_network_pairing_backend_url':
'zookeeper://zookeeper-host-01:2181'}}
mock_find.return_value = ['partner-host', 'reader']
mock_execute.return_value = (['out', 'err'])
burnin.fio_network(node)
mock_find.assert_called_once_with(
backend_url='zookeeper://zookeeper-host-01:2181',
group_name='ironic.network-burnin',
timeout=900)
@mock.patch.object(burnin, '_find_network_burnin_partner_and_role',
autospec=True)
def test_fio_network_dynamic_pairing_no_defaults(self, mock_find,
mock_execute):
node = {'driver_info': {
'agent_burnin_fio_network_pairing_backend_url':
'zookeeper://zookeeper-host-01:2181',
'agent_burnin_fio_network_pairing_group_name':
'ironic.special-group',
'agent_burnin_fio_network_pairing_timeout': 600}}
mock_find.return_value = ['partner-host', 'reader']
mock_execute.return_value = (['out', 'err'])
burnin.fio_network(node)
mock_find.assert_called_once_with(
backend_url='zookeeper://zookeeper-host-01:2181',
group_name='ironic.special-group',
timeout=600)
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
def test_fio_network_dynamic_find_timeout(self, mock_get_coordinator,
mock_execute):
mock_coordinator = mock.MagicMock()
mock_get_coordinator.return_value = mock_coordinator
# timeout since no other node is joining
self.assertRaises(errors.CleaningError,
burnin._find_network_burnin_partner_and_role,
"zk://xyz", 'group', 2)
# group did not exist, so we created it
mock_coordinator.create_group.assert_called_once_with('group')
mock_coordinator.join_group.assert_called_once()
# get_members is called initially, then every second
# up to the timeout
self.assertEqual(3, mock_coordinator.get_members.call_count)
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
def test_fio_network_dynamic_find_pair_1st(self, mock_get_coordinator,
mock_execute):
mock_coordinator = mock.MagicMock()
mock_get_coordinator.return_value = mock_coordinator
class Members:
def __init__(self, members=[]):
self.members = members
def get(self):
return self.members
# we are the first node to enter, so no other host
# initially until the second one appears after some
# interations
mock_coordinator.get_members.side_effect = \
[Members(), Members([b'host1']), Members([b'host1']),
Members([b'host1']), Members([b'host1', b'host2'])]
(partner, role) = \
burnin._find_network_burnin_partner_and_role("zk://xyz",
"group", 10)
# ... so we will leave first and be the writer
self.assertEqual((partner, role), ("host2", "writer"))
# group did not exist, so we created it
mock_coordinator.create_group.assert_called_once_with('group')
mock_coordinator.join_group.assert_called_once()
# get_members is called initially, then every second
# up to the timeout
self.assertEqual(5, mock_coordinator.get_members.call_count)
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
def test_fio_network_dynamic_find_pair_2nd(self, mock_get_coordinator,
mock_execute):
mock_coordinator = mock.MagicMock()
mock_get_coordinator.return_value = mock_coordinator
class Members:
def __init__(self, members=[]):
self.members = members
def get(self):
return self.members
# we are the second node to enter, host1 is there before us ...
mock_coordinator.get_members.side_effect = \
[Members([b'host1']),
Members([b'host1', b'host2']),
Members([b'host2'])]
(partner, role) = \
burnin._find_network_burnin_partner_and_role("zk://xyz",
"group", 10)
# ... so we will leave second and be the reader
self.assertEqual((partner, role), ("host1", "reader"))
# group did not exist, so we created it
mock_coordinator.create_group.assert_called_once_with('group')
mock_coordinator.join_group.assert_called_once()
# get_members is called initially, then every second until the
# other node appears
self.assertEqual(3, mock_coordinator.get_members.call_count)

View File

@ -7,6 +7,7 @@ dogpile.cache==0.9.2
eventlet==0.18.2 eventlet==0.18.2
importlib_metadata==1.7.0;python_version<'3.8' importlib_metadata==1.7.0;python_version<'3.8'
ironic-lib==5.1.0 ironic-lib==5.1.0
kazoo==2.8.0
netifaces==0.10.4 netifaces==0.10.4
openstacksdk==0.49.0 openstacksdk==0.49.0
oslo.concurrency==3.26.0 oslo.concurrency==3.26.0
@ -24,3 +25,4 @@ stestr==1.0.0
stevedore==1.20.0 stevedore==1.20.0
tenacity==6.2.0 tenacity==6.2.0
testtools==2.2.0 testtools==2.2.0
tooz==2.7.2

View File

@ -0,0 +1,17 @@
---
features:
- |
For network burn-in, nodes can now be paired dynamically via a
distributed coordination backend (as an alternative to a static
configuration). This allows burn-in to proceed on a 'first come
first served' basis with the nodes available, rather than a node
being blocked since the static partner is currently delayed.
In order to configure this dynamic pairing, the nodes will need
at least 'agent_burnin_fio_network_pairing_backend_url' in their
driver-info (the URL for the coordination backend). In order to
separate different hardware types, which may be using different
networks and shall be burnt-in separately, the nodes can in
addition define 'agent_burnin_fio_network_pairing_group_name' to
have pairing only happening between nodes in the same group. An
additional 'agent_burnin_fio_network_pairing_timeout' allows to
limit the time given to the nodes to wait for a partner.

View File

@ -20,3 +20,4 @@ tenacity>=6.2.0 # Apache-2.0
ironic-lib>=5.1.0 # Apache-2.0 ironic-lib>=5.1.0 # Apache-2.0
Werkzeug>=1.0.1 # BSD License Werkzeug>=1.0.1 # BSD License
cryptography>=2.3 # BSD/Apache-2.0 cryptography>=2.3 # BSD/Apache-2.0
tooz>=2.7.2 # Apache-2.0

View File

@ -64,3 +64,7 @@ api_doc_dir = contributor/api
tag_build = tag_build =
tag_date = 0 tag_date = 0
tag_svn_revision = 0 tag_svn_revision = 0
[extras]
burnin-network-kazoo =
kazoo>=2.8.0 # Apache-2.0