Burn-in: Dynamic network pairing
Pair nodes dynamically via a distributed coordination backend for network burn-in. The algorithm uses a group to pair nodes: after acquiring a lock, a first node joins the group, releases the lock, waits for a second node, then they both leave, and release the lock for the next pair. Story: #2007523 Task: #42796 Change-Id: I572093b144bc90a49cd76929c7e8685ed45d9f6e
This commit is contained in:
parent
fa5cccd137
commit
7f15455d8d
@ -11,11 +11,13 @@
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import socket
|
||||
import time
|
||||
|
||||
from ironic_lib import utils
|
||||
from oslo_concurrency import processutils
|
||||
from oslo_log import log
|
||||
from tooz import coordination
|
||||
|
||||
from ironic_python_agent import errors
|
||||
from ironic_python_agent import hardware
|
||||
@ -252,45 +254,160 @@ def _do_fio_network(writer, runtime, partner, outputfile):
|
||||
raise errors.CommandExecutionError(error_msg)
|
||||
|
||||
|
||||
def _find_network_burnin_partner_and_role(backend_url, group_name, timeout):
    """Find a partner node for network burn-in and get our role.

    Nodes are paired via a tooz group which is guarded by a lock: the
    first node acquires the lock, joins the group and releases the lock
    so that a second node can enter. The second node keeps the lock
    until the first node has left the group and it has left itself, so
    no further node can enter while a pair is being formed.

    :param backend_url: The tooz backend url.
    :param group_name: The tooz group name for pairing.
    :param timeout: Timeout in seconds for a node to wait for a partner.
    :returns: A tuple ``(partner, role)`` with the partner node and the
              role of the local node: "writer" for the node which
              joined the group first, "reader" for the second one.
    :raises: CleaningError if no partner shows up within the timeout,
             if the group holds no node other than the local one, or
             if more than two nodes are found in the group.
    """

    member_id = socket.gethostname()
    coordinator = coordination.get_coordinator(backend_url, member_id)
    coordinator.start(start_heart=True)

    def join_group(group_name):
        request = coordinator.join_group(group_name)
        request.get()

    def leave_group(group_name):
        request = coordinator.leave_group(group_name)
        request.get()

    try:
        # for/else: the group is only created if the loop did not find it
        groups = coordinator.get_groups()
        for group in groups.get():
            if group_name == group.decode('utf-8'):
                LOG.debug("Found group %s", group_name)
                break
        else:
            LOG.info("Creating group %s", group_name)
            coordinator.create_group(group_name)

        # Attempt to get the pairing lock. The lock is released when:
        # a) a node enters the group and is the first to join, or
        # b) a node enters second, finishes pairing, sees the first
        #    node leaving the group, and has left the group itself.
        # The lock 'walls' all nodes willing to pair.
        # NOTE(review): leaving the 'with' block releases the lock once
        # more after the explicit releases below; presumably this is a
        # no-op for a lock which is no longer held -- confirm for the
        # tooz driver in use.
        group_lock = coordinator.get_lock("group_lock")
        with group_lock:
            # we need the initial members in order to know the first
            # node (which may leave quickly when we join)
            init_members = coordinator.get_members(group_name)
            LOG.info("Original group members are %s", init_members.get())
            members_cnt = len(init_members.get())

            join_group(group_name)

            # we assign the first node the writer role since it will
            # leave the group first, it may be ready once the second
            # node leaves the group, and we save one wait cycle
            if not members_cnt:
                first = True
                role = "writer"
                group_lock.release()  # allow second node to enter
            else:
                first = False
                role = "reader"

            partner = None
            start_pairing = time.time()
            while time.time() - start_pairing < timeout:
                if first:
                    # we are the first and therefore need to wait
                    # for another node to join
                    members = coordinator.get_members(group_name)
                    members_cnt = len(members.get())
                else:
                    # use the initial members in case the other
                    # node leaves before we get an updated list
                    members = init_members

                # explicit check rather than an assert, which would be
                # stripped when running python with -O
                if members_cnt >= 3:
                    error_msg = ("fio (network) found more than two "
                                 "nodes in pairing group %s" % group_name)
                    raise errors.CleaningError(error_msg)

                if members_cnt == 2 or not first:
                    LOG.info("Two members, start pairing...")
                    for member in members.get():
                        node = member.decode('utf-8')
                        if node != member_id:
                            partner = node
                    if not partner:
                        error_msg = ("fio (network) no partner to pair found")
                        raise errors.CleaningError(error_msg)

                    # if you are the second to enter, wait for the
                    # first to exit
                    if not first:
                        members = coordinator.get_members(group_name)
                        while (len(members.get()) == 2):
                            time.sleep(0.2)
                            members = coordinator.get_members(group_name)
                        leave_group(group_name)
                        group_lock.release()
                    else:
                        leave_group(group_name)
                    break
                else:
                    LOG.info("One member, waiting for second node to join ...")
                    time.sleep(1)
            else:
                # while/else: the timeout expired without a 'break'
                leave_group(group_name)
                error_msg = ("fio (network) timed out to find partner")
                raise errors.CleaningError(error_msg)

        return (partner, role)
    finally:
        # the coordinator was started with a heartbeat: stop it so the
        # heartbeat and the backend connection do not outlive pairing
        coordinator.stop()
|
||||
|
||||
|
||||
def fio_network(node):
|
||||
"""Burn-in the network with fio
|
||||
|
||||
Run an fio network job for a pair of nodes for a configurable
|
||||
amount of time. The pair is statically defined in driver_info
|
||||
via 'agent_burnin_fio_network_config'.
|
||||
amount of time. The pair is either statically defined in
|
||||
driver_info via 'agent_burnin_fio_network_config' or the role
|
||||
and partner is found dynamically via a tooz backend.
|
||||
|
||||
The writer will wait for the reader to connect, then write to the
|
||||
network. Upon completion, the roles are swapped.
|
||||
|
||||
Note (arne_wiebalck): Initial version. The plan is to make the
|
||||
match making dynamic by posting availability
|
||||
on a distributed backend, e.g. via tooz.
|
||||
|
||||
:param node: Ironic node object
|
||||
:raises: CommandExecutionError if the execution of fio fails.
|
||||
:raises: CleaningError if the configuration is incomplete.
|
||||
"""
|
||||
|
||||
info = node.get('driver_info', {})
|
||||
runtime = info.get('agent_burnin_fio_network_runtime', 21600)
|
||||
outputfile = info.get('agent_burnin_fio_network_outputfile', None)
|
||||
|
||||
# get our role and identify our partner
|
||||
config = info.get('agent_burnin_fio_network_config')
|
||||
if not config:
|
||||
error_msg = ("fio (network) failed to find "
|
||||
"'agent_burnin_fio_network_config' in driver_info")
|
||||
raise errors.CleaningError(error_msg)
|
||||
LOG.debug("agent_burnin_fio_network_config is %s", str(config))
|
||||
if config:
|
||||
LOG.debug("static agent_burnin_fio_network_config is %s",
|
||||
config)
|
||||
role = config.get('role')
|
||||
partner = config.get('partner')
|
||||
else:
|
||||
timeout = info.get(
|
||||
'agent_burnin_fio_network_pairing_timeout', 900)
|
||||
group_name = info.get(
|
||||
'agent_burnin_fio_network_pairing_group_name',
|
||||
'ironic.network-burnin')
|
||||
backend_url = info.get(
|
||||
'agent_burnin_fio_network_pairing_backend_url', None)
|
||||
if not backend_url:
|
||||
msg = ('fio (network): dynamic pairing config is missing '
|
||||
'agent_burnin_fio_network_pairing_backend_url')
|
||||
raise errors.CleaningError(msg)
|
||||
LOG.info("dynamic pairing for network burn-in ...")
|
||||
(partner, role) = _find_network_burnin_partner_and_role(
|
||||
backend_url=backend_url,
|
||||
group_name=group_name,
|
||||
timeout=timeout)
|
||||
|
||||
role = config.get('role')
|
||||
if role not in NETWORK_BURNIN_ROLES:
|
||||
error_msg = "fio (network) found an unknown role: %s" % role
|
||||
raise errors.CleaningError(error_msg)
|
||||
|
||||
partner = config.get('partner')
|
||||
if not partner:
|
||||
error_msg = ("fio (network) failed to find partner")
|
||||
error_msg = "fio (network) failed to find partner"
|
||||
raise errors.CleaningError(error_msg)
|
||||
LOG.info("fio (network): partner %s, role is %s", partner, role)
|
||||
|
||||
logfilename = None
|
||||
if outputfile:
|
||||
|
@ -14,6 +14,7 @@ from unittest import mock
|
||||
|
||||
from ironic_lib import utils
|
||||
from oslo_concurrency import processutils
|
||||
from tooz import coordination
|
||||
|
||||
from ironic_python_agent import burnin
|
||||
from ironic_python_agent import errors
|
||||
@ -379,3 +380,138 @@ class TestBurnin(base.IronicAgentTest):
|
||||
# we loop 3 times, then do the 2 fio calls
|
||||
self.assertEqual(5, mock_execute.call_count)
|
||||
self.assertEqual(3, mock_time.call_count)
|
||||
|
||||
def test_fio_network_dynamic_pairing_raise_missing_config(self,
                                                          mock_execute):
    """An empty driver_info makes the network burn-in fail cleaning."""
    with self.assertRaises(errors.CleaningError):
        burnin.fio_network({'driver_info': {}})
|
||||
|
||||
def test_fio_network_dynamic_pairing_raise_wrong_config(self,
                                                        mock_execute):
    """Pairing options under unexpected keys make cleaning fail."""
    # none of these keys carries the
    # 'agent_burnin_fio_network_pairing_' prefix
    driver_info = {
        'backend_url': 'zookeeper://zookeeper-host-01:2181',
        'group_name': 'ironic.dynamic-network-burnin',
        'timeout': 600,
    }
    with self.assertRaises(errors.CleaningError):
        burnin.fio_network({'driver_info': driver_info})
|
||||
|
||||
@mock.patch.object(burnin, '_find_network_burnin_partner_and_role',
                   autospec=True)
def test_fio_network_dynamic_pairing_defaults(self, mock_find,
                                              mock_execute):
    """With only a backend url, default group name and timeout apply."""
    backend_url = 'zookeeper://zookeeper-host-01:2181'
    driver_info = {
        'agent_burnin_fio_network_pairing_backend_url': backend_url,
    }
    mock_find.return_value = ['partner-host', 'reader']
    mock_execute.return_value = ['out', 'err']

    burnin.fio_network({'driver_info': driver_info})

    mock_find.assert_called_once_with(
        backend_url=backend_url,
        group_name='ironic.network-burnin',
        timeout=900)
|
||||
|
||||
@mock.patch.object(burnin, '_find_network_burnin_partner_and_role',
                   autospec=True)
def test_fio_network_dynamic_pairing_no_defaults(self, mock_find,
                                                 mock_execute):
    """Group name and timeout from driver_info are passed to pairing."""
    backend_url = 'zookeeper://zookeeper-host-01:2181'
    group_name = 'ironic.special-group'
    driver_info = {
        'agent_burnin_fio_network_pairing_backend_url': backend_url,
        'agent_burnin_fio_network_pairing_group_name': group_name,
        'agent_burnin_fio_network_pairing_timeout': 600,
    }
    mock_find.return_value = ['partner-host', 'reader']
    mock_execute.return_value = ['out', 'err']

    burnin.fio_network({'driver_info': driver_info})

    mock_find.assert_called_once_with(
        backend_url=backend_url,
        group_name=group_name,
        timeout=600)
|
||||
|
||||
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
def test_fio_network_dynamic_find_timeout(self, mock_get_coordinator,
                                          mock_execute):
    """Pairing raises a CleaningError if no second node ever joins."""
    coordinator = mock.MagicMock()
    mock_get_coordinator.return_value = coordinator

    # no other node is joining, so the 2 second timeout expires
    with self.assertRaises(errors.CleaningError):
        burnin._find_network_burnin_partner_and_role(
            "zk://xyz", 'group', 2)

    # the group did not exist, so it had to be created
    coordinator.create_group.assert_called_once_with('group')
    coordinator.join_group.assert_called_once()
    # get_members is called once initially, then once per second
    # until the timeout is reached
    self.assertEqual(3, coordinator.get_members.call_count)
|
||||
|
||||
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
def test_fio_network_dynamic_find_pair_1st(self, mock_get_coordinator,
                                           mock_execute):
    """The first node to join pairs with the late joiner as writer."""
    mock_coordinator = mock.MagicMock()
    mock_get_coordinator.return_value = mock_coordinator

    class Members:
        """Minimal stand-in for the result object of get_members."""

        def __init__(self, members=None):
            # no mutable default: each instance gets its own list
            self.members = [] if members is None else members

        def get(self):
            return self.members

    # we are the first node to enter, so no other host
    # initially until the second one appears after some
    # iterations
    mock_coordinator.get_members.side_effect = [
        Members(), Members([b'host1']), Members([b'host1']),
        Members([b'host1']), Members([b'host1', b'host2'])]

    # pin the local member id so that the outcome does not depend on
    # the hostname of the machine running the test
    with mock.patch('socket.gethostname', return_value='host1'):
        (partner, role) = \
            burnin._find_network_burnin_partner_and_role("zk://xyz",
                                                         "group", 10)

    # ... so we will leave first and be the writer
    self.assertEqual((partner, role), ("host2", "writer"))

    # group did not exist, so we created it
    mock_coordinator.create_group.assert_called_once_with('group')
    mock_coordinator.join_group.assert_called_once()
    # get_members is called initially, then every second until
    # the second node shows up in the group
    self.assertEqual(5, mock_coordinator.get_members.call_count)
|
||||
|
||||
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
def test_fio_network_dynamic_find_pair_2nd(self, mock_get_coordinator,
                                           mock_execute):
    """The second node to join pairs with the first one as reader."""
    mock_coordinator = mock.MagicMock()
    mock_get_coordinator.return_value = mock_coordinator

    class Members:
        """Minimal stand-in for the result object of get_members."""

        def __init__(self, members=None):
            # no mutable default: each instance gets its own list
            self.members = [] if members is None else members

        def get(self):
            return self.members

    # we are the second node to enter, host1 is there before us ...
    mock_coordinator.get_members.side_effect = [
        Members([b'host1']),
        Members([b'host1', b'host2']),
        Members([b'host2'])]

    # pin the local member id so that the outcome does not depend on
    # the hostname of the machine running the test
    with mock.patch('socket.gethostname', return_value='host2'):
        (partner, role) = \
            burnin._find_network_burnin_partner_and_role("zk://xyz",
                                                         "group", 10)

    # ... so we will leave second and be the reader
    self.assertEqual((partner, role), ("host1", "reader"))

    # group did not exist, so we created it
    mock_coordinator.create_group.assert_called_once_with('group')
    mock_coordinator.join_group.assert_called_once()
    # get_members is called initially, then polled until the first
    # node has left the group
    self.assertEqual(3, mock_coordinator.get_members.call_count)
|
||||
|
@ -7,6 +7,7 @@ dogpile.cache==0.9.2
|
||||
eventlet==0.18.2
|
||||
importlib_metadata==1.7.0;python_version<'3.8'
|
||||
ironic-lib==5.1.0
|
||||
kazoo==2.8.0
|
||||
netifaces==0.10.4
|
||||
openstacksdk==0.49.0
|
||||
oslo.concurrency==3.26.0
|
||||
@ -24,3 +25,4 @@ stestr==1.0.0
|
||||
stevedore==1.20.0
|
||||
tenacity==6.2.0
|
||||
testtools==2.2.0
|
||||
tooz==2.7.2
|
||||
|
@ -0,0 +1,17 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
For network burn-in, nodes can now be paired dynamically via a
|
||||
distributed coordination backend (as an alternative to a static
|
||||
configuration). This allows burn-in to proceed on a 'first come
|
||||
first served' basis with the nodes available, rather than a node
|
||||
being blocked since the static partner is currently delayed.
|
||||
In order to configure this dynamic pairing, the nodes will need
|
||||
at least 'agent_burnin_fio_network_pairing_backend_url' in their
|
||||
driver_info (the URL for the coordination backend). In order to
|
||||
separate different hardware types, which may be using different
|
||||
networks and shall be burnt-in separately, the nodes can in
|
||||
addition define 'agent_burnin_fio_network_pairing_group_name' to
|
||||
have pairing only happening between nodes in the same group. An
|
||||
additional 'agent_burnin_fio_network_pairing_timeout' allows you to
|
||||
limit the time given to the nodes to wait for a partner.
|
@ -20,3 +20,4 @@ tenacity>=6.2.0 # Apache-2.0
|
||||
ironic-lib>=5.1.0 # Apache-2.0
|
||||
Werkzeug>=1.0.1 # BSD License
|
||||
cryptography>=2.3 # BSD/Apache-2.0
|
||||
tooz>=2.7.2 # Apache-2.0
|
||||
|
Loading…
Reference in New Issue
Block a user