From 7f15455d8d106a7d7ecdbc90e96f06d7e322ac4f Mon Sep 17 00:00:00 2001 From: Arne Wiebalck Date: Mon, 6 Dec 2021 15:23:21 +0100 Subject: [PATCH] Burn-in: Dynamic network pairing Pair nodes dynamically via a distributed coordination backend for network burn-in. The algorithm uses a group to pair nodes: after acquiring a lock, a first node joins the group, releases the lock, waits for a second node, then they both leave, and release the lock for the next pair. Story: #2007523 Task: #42796 Change-Id: I572093b144bc90a49cd76929c7e8685ed45d9f6e --- ironic_python_agent/burnin.py | 149 ++++++++++++++++-- ironic_python_agent/tests/unit/test_burnin.py | 136 ++++++++++++++++ lower-constraints.txt | 2 + ...amic_network_pairing-33e398255050eb98.yaml | 17 ++ requirements.txt | 1 + setup.cfg | 4 + 6 files changed, 293 insertions(+), 16 deletions(-) create mode 100644 releasenotes/notes/add_burnin_dynamic_network_pairing-33e398255050eb98.yaml diff --git a/ironic_python_agent/burnin.py b/ironic_python_agent/burnin.py index 798445384..2af51b2ad 100644 --- a/ironic_python_agent/burnin.py +++ b/ironic_python_agent/burnin.py @@ -11,11 +11,13 @@ # limitations under the License. import json +import socket import time from ironic_lib import utils from oslo_concurrency import processutils from oslo_log import log +from tooz import coordination from ironic_python_agent import errors from ironic_python_agent import hardware @@ -252,45 +254,160 @@ def _do_fio_network(writer, runtime, partner, outputfile): raise errors.CommandExecutionError(error_msg) +def _find_network_burnin_partner_and_role(backend_url, group_name, timeout): + """Find a partner node for network burn-in and get our role. + + :param backend_url: The tooz backend url. + :param group_name: The tooz group name for pairing. + :param timeout:Timeout in seconds for a node to wait for a partner. + :returns: A set with the partner node and the role of the local node. + """ + + member_id = socket.gethostname() + coordinator = coordination.get_coordinator(backend_url, member_id) + coordinator.start(start_heart=True) + + groups = coordinator.get_groups() + for group in groups.get(): + if group_name == group.decode('utf-8'): + LOG.debug("Found group %s", group_name) + break + else: + LOG.info("Creating group %s", group_name) + coordinator.create_group(group_name) + + def join_group(group_name): + request = coordinator.join_group(group_name) + request.get() + + def leave_group(group_name): + request = coordinator.leave_group(group_name) + request.get() + + # Attempt to get the pairing lock. The lock is released when: + # a) a node enters the group and is the first to join, or + # b) a node enters second, finished pairing, sees + # the pairing node exiting, and left itself. + # The lock 'walls' all nodes willing to pair. + group_lock = coordinator.get_lock("group_lock") + with group_lock: + # we need the initial members in order to know the first + # node (which may leave quickly when we join) + init_members = coordinator.get_members(group_name) + LOG.info("Original group members are %s", init_members.get()) + members_cnt = len(init_members.get()) + + join_group(group_name) + + # we assign the first node the writer role since it will + # leave the group first, it may be ready once the second + # node leaves the group, and we save one wait cycle + if not members_cnt: + first = True + role = "writer" + group_lock.release() # allow second node to enter + else: + first = False + role = "reader" + + partner = None + start_pairing = time.time() + while time.time() - start_pairing < timeout: + if first: + # we are the first and therefore need to wait + # for another node to join + members = coordinator.get_members(group_name) + members_cnt = len(members.get()) + else: + # use the initial members in case the other + # node leaves before we get an updated list + members = init_members + + assert members_cnt < 3 + + if members_cnt == 2 or not first: + LOG.info("Two members, start pairing...") + for member in members.get(): + node = member.decode('utf-8') + if node != member_id: + partner = node + if not partner: + error_msg = ("fio (network) no partner to pair found") + raise errors.CleaningError(error_msg) + + # if you are the second to enter, wait for the first to exit + if not first: + members = coordinator.get_members(group_name) + while (len(members.get()) == 2): + time.sleep(0.2) + members = coordinator.get_members(group_name) + leave_group(group_name) + group_lock.release() + else: + leave_group(group_name) + break + else: + LOG.info("One member, waiting for second node to join ...") + time.sleep(1) + else: + leave_group(group_name) + error_msg = ("fio (network) timed out to find partner") + raise errors.CleaningError(error_msg) + + return (partner, role) + + def fio_network(node): """Burn-in the network with fio Run an fio network job for a pair of nodes for a configurable - amount of time. The pair is statically defined in driver_info - via 'agent_burnin_fio_network_config'. + amount of time. The pair is either statically defined in + driver_info via 'agent_burnin_fio_network_config' or the role + and partner is found dynamically via a tooz backend. + The writer will wait for the reader to connect, then write to the network. Upon completion, the roles are swapped. - Note (arne_wiebalck): Initial version. The plan is to make the - match making dynamic by posting availability - on a distributed backend, e.g. via tooz. - :param node: Ironic node object :raises: CommandExecutionError if the execution of fio fails. :raises: CleaningError if the configuration is incomplete. """ - info = node.get('driver_info', {}) runtime = info.get('agent_burnin_fio_network_runtime', 21600) outputfile = info.get('agent_burnin_fio_network_outputfile', None) # get our role and identify our partner config = info.get('agent_burnin_fio_network_config') - if not config: - error_msg = ("fio (network) failed to find " - "'agent_burnin_fio_network_config' in driver_info") - raise errors.CleaningError(error_msg) - LOG.debug("agent_burnin_fio_network_config is %s", str(config)) + if config: + LOG.debug("static agent_burnin_fio_network_config is %s", + config) + role = config.get('role') + partner = config.get('partner') + else: + timeout = info.get( + 'agent_burnin_fio_network_pairing_timeout', 900) + group_name = info.get( + 'agent_burnin_fio_network_pairing_group_name', + 'ironic.network-burnin') + backend_url = info.get( + 'agent_burnin_fio_network_pairing_backend_url', None) + if not backend_url: + msg = ('fio (network): dynamic pairing config is missing ' + 'agent_burnin_fio_network_pairing_backend_url') + raise errors.CleaningError(msg) + LOG.info("dynamic pairing for network burn-in ...") + (partner, role) = _find_network_burnin_partner_and_role( + backend_url=backend_url, + group_name=group_name, + timeout=timeout) - role = config.get('role') if role not in NETWORK_BURNIN_ROLES: error_msg = "fio (network) found an unknown role: %s" % role raise errors.CleaningError(error_msg) - - partner = config.get('partner') if not partner: - error_msg = ("fio (network) failed to find partner") + error_msg = "fio (network) failed to find partner" raise errors.CleaningError(error_msg) + LOG.info("fio (network): partner %s, role is %s", partner, role) logfilename = None if outputfile: diff --git a/ironic_python_agent/tests/unit/test_burnin.py b/ironic_python_agent/tests/unit/test_burnin.py index 32708130e..25d334e72 100644 --- a/ironic_python_agent/tests/unit/test_burnin.py +++ b/ironic_python_agent/tests/unit/test_burnin.py @@ -14,6 +14,7 @@ from unittest import mock from ironic_lib import utils from oslo_concurrency import processutils +from tooz import coordination from ironic_python_agent import burnin from ironic_python_agent import errors @@ -379,3 +380,138 @@ class TestBurnin(base.IronicAgentTest): # we loop 3 times, then do the 2 fio calls self.assertEqual(5, mock_execute.call_count) self.assertEqual(3, mock_time.call_count) + + def test_fio_network_dynamic_pairing_raise_missing_config(self, + mock_execute): + node = {'driver_info': {}} + self.assertRaises(errors.CleaningError, burnin.fio_network, node) + + def test_fio_network_dynamic_pairing_raise_wrong_config(self, + mock_execute): + node = {'driver_info': { + 'backend_url': 'zookeeper://zookeeper-host-01:2181', + 'group_name': 'ironic.dynamic-network-burnin', + 'timeout': 600}} + self.assertRaises(errors.CleaningError, burnin.fio_network, node) + + @mock.patch.object(burnin, '_find_network_burnin_partner_and_role', + autospec=True) + def test_fio_network_dynamic_pairing_defaults(self, mock_find, + mock_execute): + node = {'driver_info': { + 'agent_burnin_fio_network_pairing_backend_url': + 'zookeeper://zookeeper-host-01:2181'}} + mock_find.return_value = ['partner-host', 'reader'] + mock_execute.return_value = (['out', 'err']) + + burnin.fio_network(node) + + mock_find.assert_called_once_with( + backend_url='zookeeper://zookeeper-host-01:2181', + group_name='ironic.network-burnin', + timeout=900) + + @mock.patch.object(burnin, '_find_network_burnin_partner_and_role', + autospec=True) + def test_fio_network_dynamic_pairing_no_defaults(self, mock_find, + mock_execute): + node = {'driver_info': { + 'agent_burnin_fio_network_pairing_backend_url': + 'zookeeper://zookeeper-host-01:2181', + 'agent_burnin_fio_network_pairing_group_name': + 'ironic.special-group', + 'agent_burnin_fio_network_pairing_timeout': 600}} + mock_find.return_value = ['partner-host', 'reader'] + mock_execute.return_value = (['out', 'err']) + + burnin.fio_network(node) + + mock_find.assert_called_once_with( + backend_url='zookeeper://zookeeper-host-01:2181', + group_name='ironic.special-group', + timeout=600) + + @mock.patch.object(coordination, 'get_coordinator', autospec=True) + def test_fio_network_dynamic_find_timeout(self, mock_get_coordinator, + mock_execute): + mock_coordinator = mock.MagicMock() + mock_get_coordinator.return_value = mock_coordinator + + # timeout since no other node is joining + self.assertRaises(errors.CleaningError, + burnin._find_network_burnin_partner_and_role, + "zk://xyz", 'group', 2) + + # group did not exist, so we created it + mock_coordinator.create_group.assert_called_once_with('group') + mock_coordinator.join_group.assert_called_once() + # get_members is called initially, then every second + # up to the timeout + self.assertEqual(3, mock_coordinator.get_members.call_count) + + @mock.patch.object(coordination, 'get_coordinator', autospec=True) + def test_fio_network_dynamic_find_pair_1st(self, mock_get_coordinator, + mock_execute): + mock_coordinator = mock.MagicMock() + mock_get_coordinator.return_value = mock_coordinator + + class Members: + def __init__(self, members=[]): + self.members = members + + def get(self): + return self.members + + # we are the first node to enter, so no other host + # initially until the second one appears after some + # interations + mock_coordinator.get_members.side_effect = \ + [Members(), Members([b'host1']), Members([b'host1']), + Members([b'host1']), Members([b'host1', b'host2'])] + + (partner, role) = \ + burnin._find_network_burnin_partner_and_role("zk://xyz", + "group", 10) + + # ... so we will leave first and be the writer + self.assertEqual((partner, role), ("host2", "writer")) + + # group did not exist, so we created it + mock_coordinator.create_group.assert_called_once_with('group') + mock_coordinator.join_group.assert_called_once() + # get_members is called initially, then every second + # up to the timeout + self.assertEqual(5, mock_coordinator.get_members.call_count) + + @mock.patch.object(coordination, 'get_coordinator', autospec=True) + def test_fio_network_dynamic_find_pair_2nd(self, mock_get_coordinator, + mock_execute): + mock_coordinator = mock.MagicMock() + mock_get_coordinator.return_value = mock_coordinator + + class Members: + def __init__(self, members=[]): + self.members = members + + def get(self): + return self.members + + # we are the second node to enter, host1 is there before us ... + mock_coordinator.get_members.side_effect = \ + [Members([b'host1']), + Members([b'host1', b'host2']), + Members([b'host2'])] + + (partner, role) = \ + burnin._find_network_burnin_partner_and_role("zk://xyz", + "group", 10) + + # ... so we will leave second and be the reader + self.assertEqual((partner, role), ("host1", "reader")) + + # group did not exist, so we created it + mock_coordinator.create_group.assert_called_once_with('group') + mock_coordinator.join_group.assert_called_once() + # get_members is called initially, then every second until the + # other node appears + self.assertEqual(3, mock_coordinator.get_members.call_count) diff --git a/lower-constraints.txt b/lower-constraints.txt index 8fbb7f138..64365d598 100644 --- a/lower-constraints.txt +++ b/lower-constraints.txt @@ -7,6 +7,7 @@ dogpile.cache==0.9.2 eventlet==0.18.2 importlib_metadata==1.7.0;python_version<'3.8' ironic-lib==5.1.0 +kazoo==2.8.0 netifaces==0.10.4 openstacksdk==0.49.0 oslo.concurrency==3.26.0 @@ -24,3 +25,4 @@ stestr==1.0.0 stevedore==1.20.0 tenacity==6.2.0 testtools==2.2.0 +tooz==2.7.2 diff --git a/releasenotes/notes/add_burnin_dynamic_network_pairing-33e398255050eb98.yaml b/releasenotes/notes/add_burnin_dynamic_network_pairing-33e398255050eb98.yaml new file mode 100644 index 000000000..1802f7ec2 --- /dev/null +++ b/releasenotes/notes/add_burnin_dynamic_network_pairing-33e398255050eb98.yaml @@ -0,0 +1,17 @@ +--- +features: + - | + For network burn-in, nodes can now be paired dynamically via a + distributed coordination backend (as an alternative to a static + configuration). This allows burn-in to proceed on a 'first come + first served' basis with the nodes available, rather than a node + being blocked since the static partner is currently delayed. + In order to configure this dynamic pairing, the nodes will need + at least 'agent_burnin_fio_network_pairing_backend_url' in their + driver-info (the URL for the coordination backend). In order to + separate different hardware types, which may be using different + networks and shall be burnt-in separately, the nodes can in + addition define 'agent_burnin_fio_network_pairing_group_name' to + have pairing only happening between nodes in the same group. An + additional 'agent_burnin_fio_network_pairing_timeout' allows to + limit the time given to the nodes to wait for a partner. diff --git a/requirements.txt b/requirements.txt index cadb5c30b..3badcd0e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,4 @@ tenacity>=6.2.0 # Apache-2.0 ironic-lib>=5.1.0 # Apache-2.0 Werkzeug>=1.0.1 # BSD License cryptography>=2.3 # BSD/Apache-2.0 +tooz>=2.7.2 # Apache-2.0 diff --git a/setup.cfg b/setup.cfg index 5662cf1bc..06d224794 100644 --- a/setup.cfg +++ b/setup.cfg @@ -64,3 +64,7 @@ api_doc_dir = contributor/api tag_build = tag_date = 0 tag_svn_revision = 0 + +[extras] +burnin-network-kazoo = + kazoo>=2.8.0 # Apache-2.0