Merge "NVMe-oF: Fix attach when reconnecting"

This commit is contained in:
Zuul 2024-01-09 07:20:14 +00:00 committed by Gerrit Code Review
commit f1fcd9a75d
3 changed files with 194 additions and 6 deletions

View File

@ -133,6 +133,9 @@ class Portal(object):
"""Representation of an NVMe-oF Portal with some related operations."""
LIVE = 'live'
MISSING = None # Unkown or not present in the system
CONNECTING = 'connecting'
# Default value of reconnect_delay in sysfs
DEFAULT_RECONNECT_DELAY = 10
controller: Optional[str] = None # Don't know controller name on start
def __str__(self) -> str:
@ -176,6 +179,15 @@ class Portal(object):
return ctrl_property('state', self.controller)
return None
@property
def reconnect_delay(self) -> int:
# 10 seconds is the default value of reconnect_delay
if self.controller:
res = ctrl_property('reconnect_delay', self.controller)
if res is not None:
return int(res)
return self.DEFAULT_RECONNECT_DELAY
def get_device(self) -> Optional[str]:
"""Get a device path using available volume identification markers.
@ -683,6 +695,8 @@ class NVMeOFConnector(base.BaseLinuxConnector):
# Use a class attribute since a restart is needed to change it on the host
native_multipath_supported = None
# Time we think is more than reasonable to establish an NVMe-oF connection
TIME_TO_CONNECT = 10
def __init__(self,
root_helper: str,
@ -861,6 +875,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
"""
connected = False
missing_portals = []
reconnecting_portals = []
for portal in target.portals:
state = portal.state # store it so we read only once from sysfs
@ -871,6 +886,9 @@ class NVMeOFConnector(base.BaseLinuxConnector):
# Remember portals that are not present in the system
elif state == portal.MISSING:
missing_portals.append(portal)
elif state == portal.CONNECTING:
LOG.debug('%s is reconnecting', portal)
reconnecting_portals.append(portal)
# Ignore reconnecting/dead portals
else:
LOG.debug('%s exists but is %s', portal, state)
@ -905,11 +923,32 @@ class NVMeOFConnector(base.BaseLinuxConnector):
LOG.warning('Race condition with some other application '
'when connecting to %s, please check your '
'system configuration.', portal)
connected = True
state = portal.state
if state == portal.LIVE:
connected = True
elif state == portal.CONNECTING:
reconnecting_portals.append(portal)
else:
LOG.error('Ignoring %s due to unknown state (%s)',
portal, state)
if not do_multipath:
break # We are connected
if not connected and reconnecting_portals:
delay = self.TIME_TO_CONNECT + max(p.reconnect_delay
for p in reconnecting_portals)
LOG.debug('Waiting %s seconds for some nvme controllers to '
'reconnect', delay)
timeout = time.time() + delay
while time.time() < timeout:
time.sleep(1)
if any(p.is_live for p in reconnecting_portals):
LOG.debug('Reconnected')
connected = True
break
LOG.debug('No controller reconnected')
if not connected:
raise exception.VolumeDeviceNotFound(device=target.nqn)

View File

@ -160,6 +160,13 @@ class PortalTestCase(test_base.TestCase):
self.assertIs(expected, self.portal.is_live)
mock_state.assert_called_once_with()
@mock.patch.object(nvmeof, 'ctrl_property', return_value='10')
def test_reconnect_delay(self, mock_property):
"""Reconnect delay returns an int."""
self.portal.controller = 'nvme0'
self.assertIs(10, self.portal.reconnect_delay)
mock_property.assert_called_once_with('reconnect_delay', 'nvme0')
@mock.patch.object(nvmeof, 'ctrl_property')
def test_state(self, mock_property):
"""State uses sysfs to check the value."""
@ -1362,6 +1369,11 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
else:
mock_cli.assert_not_called()
@mock.patch('time.time', side_effect=[0, 1, 20] * 3)
@mock.patch.object(nvmeof.Portal, 'reconnect_delay',
new_callable=mock.PropertyMock, return_value=10)
@mock.patch.object(nvmeof.Portal, 'is_live',
new_callable=mock.PropertyMock, return_value=False)
@mock.patch.object(nvmeof.Target, 'find_device')
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
@mock.patch.object(nvmeof.NVMeOFConnector, 'run_nvme_cli')
@ -1369,7 +1381,7 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
@mock.patch.object(nvmeof.Portal, 'state', new_callable=mock.PropertyMock)
def test__connect_target_portals_down(
self, mock_state, mock_rescan, mock_cli, mock_set_ctrls,
mock_find_dev):
mock_find_dev, mock_is_live, mock_delay, mock_time):
"""Test connect target has all portal connections down."""
retries = 3
mock_state.side_effect = retries * 3 * ['connecting']
@ -1379,12 +1391,19 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
self.conn_props.targets[0])
self.assertEqual(retries * 3, mock_state.call_count)
self.assertEqual(retries * 3, mock_is_live.call_count)
self.assertEqual(retries * 3, mock_delay.call_count)
mock_state.assert_has_calls(retries * 3 * [mock.call()])
mock_rescan.assert_not_called()
mock_set_ctrls.assert_not_called()
mock_find_dev.assert_not_called()
mock_cli.assert_not_called()
@mock.patch('time.time', side_effect=[0, 1, 20] * 3)
@mock.patch.object(nvmeof.Portal, 'reconnect_delay',
new_callable=mock.PropertyMock, return_value=10)
@mock.patch.object(nvmeof.Portal, 'is_live',
new_callable=mock.PropertyMock, return_value=False)
@mock.patch.object(nvmeof.LOG, 'error')
@mock.patch.object(nvmeof.Target, 'find_device')
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
@ -1393,7 +1412,7 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
@mock.patch.object(nvmeof.Portal, 'state', new_callable=mock.PropertyMock)
def test__connect_target_no_portals_connect(
self, mock_state, mock_rescan, mock_cli, mock_set_ctrls,
mock_find_dev, mock_log):
mock_find_dev, mock_log, mock_is_live, mock_delay, mock_time):
"""Test connect target when fails to connect to any portal."""
retries = 3
mock_state.side_effect = retries * ['connecting', 'connecting', None]
@ -1416,6 +1435,9 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
retries * [mock.call(['connect', '-a', portal.address,
'-s', portal.port, '-t', portal.transport,
'-n', target.nqn, '-Q', '128', '-l', '-1'])])
# There are 2 in connecting state
self.assertEqual(retries * 2, mock_is_live.call_count)
self.assertEqual(retries * 2, mock_delay.call_count)
@mock.patch.object(nvmeof.Target, 'find_device')
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
@ -1492,7 +1514,7 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
self, exit_code, mock_state, mock_rescan, mock_cli,
mock_set_ctrls, mock_find_dev, mock_log):
"""Treat race condition with sysadmin as success."""
mock_state.side_effect = ['connecting', 'connecting', None]
mock_state.side_effect = ['connecting', 'connecting', None, 'live']
dev_path = '/dev/nvme0n1'
mock_find_dev.return_value = dev_path
mock_cli.side_effect = putils.ProcessExecutionError(
@ -1503,8 +1525,8 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
res = self.connector._connect_target(target)
self.assertEqual(dev_path, res)
self.assertEqual(3, mock_state.call_count)
mock_state.assert_has_calls(3 * [mock.call()])
self.assertEqual(4, mock_state.call_count)
mock_state.assert_has_calls(4 * [mock.call()])
mock_rescan.assert_not_called()
mock_set_ctrls.assert_called_once_with()
mock_find_dev.assert_called_once_with()
@ -1515,6 +1537,126 @@ class NVMeOFConnectorTestCase(test_connector.ConnectorTestCase):
portal.transport, '-n', target.nqn, '-Q', '128', '-l', '-1'])
self.assertEqual(1, mock_log.call_count)
@ddt.data(70, errno.EALREADY)
@mock.patch.object(nvmeof.LOG, 'warning')
@mock.patch('time.sleep')
@mock.patch('time.time', side_effect=[0, 0.1, 0.6])
@mock.patch.object(nvmeof.Portal, 'reconnect_delay',
new_callable=mock.PropertyMock, return_value=10)
@mock.patch.object(nvmeof.Portal, 'is_live',
new_callable=mock.PropertyMock)
@mock.patch.object(nvmeof.Target, 'find_device')
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
@mock.patch.object(nvmeof.NVMeOFConnector, 'run_nvme_cli')
@mock.patch.object(nvmeof.NVMeOFConnector, 'rescan')
@mock.patch.object(nvmeof.Portal, 'state', new_callable=mock.PropertyMock)
def test__connect_target_race_connecting(
self, exit_code, mock_state, mock_rescan, mock_cli, mock_set_ctrls,
mock_find_dev, mock_is_live, mock_delay, mock_time, mock_sleep,
mock_log):
"""Test connect target when portal is reconnecting after race."""
mock_cli.side_effect = putils.ProcessExecutionError(
exit_code=exit_code)
mock_state.side_effect = ['connecting', 'connecting', None,
'connecting']
mock_is_live.side_effect = [False, False, False, False, True]
target = self.conn_props.targets[0]
res = self.connector._connect_target(target)
self.assertEqual(mock_find_dev.return_value, res)
self.assertEqual(4, mock_state.call_count)
self.assertEqual(5, mock_is_live.call_count)
self.assertEqual(3, mock_delay.call_count)
self.assertEqual(2, mock_sleep.call_count)
mock_sleep.assert_has_calls(2 * [mock.call(1)])
mock_rescan.assert_not_called()
mock_set_ctrls.assert_called_once()
mock_find_dev.assert_called_once()
portal = target.portals[-1]
mock_cli.assert_called_once_with([
'connect', '-a', portal.address, '-s', portal.port, '-t',
portal.transport, '-n', target.nqn, '-Q', '128', '-l', '-1'])
self.assertEqual(1, mock_log.call_count)
@ddt.data(70, errno.EALREADY)
@mock.patch.object(nvmeof.LOG, 'warning')
@mock.patch.object(nvmeof.LOG, 'error')
@mock.patch('time.sleep')
@mock.patch('time.time', side_effect=[0, 0.1, 0.6])
@mock.patch.object(nvmeof.Portal, 'reconnect_delay',
new_callable=mock.PropertyMock, return_value=10)
@mock.patch.object(nvmeof.Portal, 'is_live',
new_callable=mock.PropertyMock)
@mock.patch.object(nvmeof.Target, 'find_device')
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
@mock.patch.object(nvmeof.NVMeOFConnector, 'run_nvme_cli')
@mock.patch.object(nvmeof.NVMeOFConnector, 'rescan')
@mock.patch.object(nvmeof.Portal, 'state', new_callable=mock.PropertyMock)
def test__connect_target_race_unknown(
self, exit_code, mock_state, mock_rescan, mock_cli, mock_set_ctrls,
mock_find_dev, mock_is_live, mock_delay, mock_time, mock_sleep,
mock_log_err, mock_log_warn):
"""Test connect target when portal is unknown after race."""
mock_cli.side_effect = putils.ProcessExecutionError(
exit_code=exit_code)
mock_state.side_effect = ['connecting', 'connecting', None,
'unknown']
mock_is_live.side_effect = [False, False, False, True]
target = self.conn_props.targets[0]
res = self.connector._connect_target(target)
self.assertEqual(mock_find_dev.return_value, res)
self.assertEqual(4, mock_state.call_count)
self.assertEqual(4, mock_is_live.call_count)
self.assertEqual(2, mock_delay.call_count)
self.assertEqual(2, mock_sleep.call_count)
mock_sleep.assert_has_calls(2 * [mock.call(1)])
mock_rescan.assert_not_called()
mock_set_ctrls.assert_called_once()
mock_find_dev.assert_called_once()
portal = target.portals[-1]
mock_cli.assert_called_once_with([
'connect', '-a', portal.address, '-s', portal.port, '-t',
portal.transport, '-n', target.nqn, '-Q', '128', '-l', '-1'])
self.assertEqual(1, mock_log_err.call_count)
self.assertEqual(1, mock_log_warn.call_count)
@mock.patch('time.sleep')
@mock.patch('time.time', side_effect=[0, 0.1, 0.6])
@mock.patch.object(nvmeof.Portal, 'reconnect_delay',
new_callable=mock.PropertyMock, return_value=10)
@mock.patch.object(nvmeof.Portal, 'is_live',
new_callable=mock.PropertyMock)
@mock.patch.object(nvmeof.Target, 'find_device')
@mock.patch.object(nvmeof.Target, 'set_portals_controllers')
@mock.patch.object(nvmeof.NVMeOFConnector, 'run_nvme_cli')
@mock.patch.object(nvmeof.NVMeOFConnector, 'rescan')
@mock.patch.object(nvmeof.Portal, 'state', new_callable=mock.PropertyMock,
return_value='connecting')
def test__connect_target_portals_connecting(
self, mock_state, mock_rescan, mock_cli, mock_set_ctrls,
mock_find_dev, mock_is_live, mock_delay, mock_time, mock_sleep):
"""Test connect target when portals reconnect."""
# First pass everything connecting, second pass the second portal is up
mock_is_live.side_effect = [False, False, False, False, True]
# Connecting state changing after 2 sleeps
target = self.conn_props.targets[0]
res = self.connector._connect_target(target)
self.assertEqual(mock_find_dev.return_value, res)
self.assertEqual(3, mock_state.call_count)
self.assertEqual(5, mock_is_live.call_count)
self.assertEqual(3, mock_delay.call_count)
self.assertEqual(2, mock_sleep.call_count)
mock_sleep.assert_has_calls(2 * [mock.call(1)])
mock_rescan.assert_not_called()
mock_set_ctrls.assert_called_once()
mock_find_dev.assert_called_once()
mock_cli.assert_not_called()
@mock.patch.object(nvmeof.NVMeOFConnector, 'stop_and_assemble_raid')
@mock.patch.object(nvmeof.NVMeOFConnector, '_is_device_in_raid')
def test_handle_replicated_volume_existing(

View File

@ -0,0 +1,7 @@
---
fixes:
- |
NVMe-oF connector `bug #2035695
<https://bugs.launchpad.net/os-brick/+bug/2035695>`_: Fixed
attaching volumes when all portals of a subsystem are reconnecting at the
time of the request.