From 639f953194cdc07b1e6aff1fc1ca2a7bc9d28536 Mon Sep 17 00:00:00 2001 From: Rajat Dhasmana Date: Mon, 27 May 2024 18:33:54 +0530 Subject: [PATCH] Wait for multipath device to be ready for I/O The "multipath -C " command waits for the multipath device map to be ready for I/O. This is useful in preventing race conditions when we are trying to write to the multipath device before it is ready for I/O. We added 2 new config options to make the wait time configurable, 1. wait_mpath_device_attempts - defaults to 4 attempts 2. wait_mpath_device_interval - defaults to 1 second Closes-Bug: #2067949 Change-Id: Ib075ec62a2bf993615c5c802f34acd7838bfa2af --- .../initiator/connectors/fibre_channel.py | 7 +++- os_brick/initiator/connectors/iscsi.py | 7 ++-- os_brick/initiator/linuxscsi.py | 19 ++++++++++ os_brick/opts.py | 18 ++++++++++ .../connectors/test_fibre_channel.py | 2 ++ os_brick/tests/initiator/test_linuxscsi.py | 36 +++++++++++++++++++ .../notes/wait-mpath-io-703605e74ee009ef.yaml | 16 +++++++++ 7 files changed, 102 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/wait-mpath-io-703605e74ee009ef.yaml diff --git a/os_brick/initiator/connectors/fibre_channel.py b/os_brick/initiator/connectors/fibre_channel.py index 36b0cd758..96abe7759 100644 --- a/os_brick/initiator/connectors/fibre_channel.py +++ b/os_brick/initiator/connectors/fibre_channel.py @@ -286,7 +286,12 @@ class FibreChannelConnector(base.BaseLinuxConnector): if multipath_id: # only set the multipath_id if we found one device_info['multipath_id'] = multipath_id - + if self.device_name: + device = os.path.basename(self.device_name) + mpath = self._linuxscsi.find_sysfs_multipath_dm( + [device]) + # Wait for multipath device to be ready for I/O + self._linuxscsi.wait_for_mpath_device(mpath) else: device_path = self.host_device diff --git a/os_brick/initiator/connectors/iscsi.py b/os_brick/initiator/connectors/iscsi.py index c051c1169..0eee607d3 100644 --- a/os_brick/initiator/connectors/iscsi.py +++ b/os_brick/initiator/connectors/iscsi.py @@ -787,8 +787,11 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector): if not mpath: LOG.warning('No dm was created, connection to volume is probably ' 'bad and will perform poorly.') - elif not wwn: - wwn = self._linuxscsi.get_sysfs_wwn(found, mpath) + else: + # Wait for multipath device to be ready for I/O + self._linuxscsi.wait_for_mpath_device(mpath) + if not wwn: + wwn = self._linuxscsi.get_sysfs_wwn(found, mpath) assert wwn is not None return self._get_connect_result(connection_properties, wwn, found, diff --git a/os_brick/initiator/linuxscsi.py b/os_brick/initiator/linuxscsi.py index 4bf53c304..38a9f1204 100644 --- a/os_brick/initiator/linuxscsi.py +++ b/os_brick/initiator/linuxscsi.py @@ -26,6 +26,7 @@ import time from typing import Optional from oslo_concurrency import processutils as putils +from oslo_config import cfg from oslo_log import log as logging from oslo_utils import excutils @@ -36,6 +37,7 @@ from os_brick.privileged import rootwrap as priv_rootwrap from os_brick import utils LOG = logging.getLogger(__name__) +CONF = cfg.CONF MULTIPATH_ERROR_REGEX = re.compile(r"\w{3} \d+ \d\d:\d\d:\d\d \|.*$") MULTIPATH_WWID_REGEX = re.compile(r"\((?P.+)\)") @@ -853,3 +855,20 @@ class LinuxSCSI(executor.Executor): if map_name and self.get_dm_name(mpath): raise exception.BrickException("Multipath doesn't go away") LOG.debug('Multipath %s no longer present', mpath) + + def wait_for_mpath_device(self, mpath): + """Wait for multipath device to become ready for I/O. + + mpath is the kernel name of the device (dm-*) which is the + expected argument for multipath -C command. + """ + try: + self._execute('multipath', '-C', mpath, + attempts=CONF.os_brick.wait_mpath_device_attempts, + interval=CONF.os_brick.wait_mpath_device_interval, + run_as_root=True, + root_helper=self._root_helper) + except putils.ProcessExecutionError as exc: + LOG.error("Failed to get mpath device %(mpath)s ready for " + "I/O: %(except)s", {'mpath': mpath, 'except': exc}) + raise diff --git a/os_brick/opts.py b/os_brick/opts.py index 61d90b724..64906f264 100644 --- a/os_brick/opts.py +++ b/os_brick/opts.py @@ -23,6 +23,24 @@ _opts = [ 'for compute nodes, but not for HCI deployments or ' 'controllers where Glance uses Cinder as a backend, as ' 'locks should use the same directory.'), + cfg.IntOpt('wait_mpath_device_attempts', + default=4, + min=1, + help='Number of attempts for the multipath device to be ready ' + 'for I/O after it was created. Readiness is checked with ' + '``multipath -C``. See related ' + '``wait_mpath_device_interval`` config option. Default ' + 'value is 4.'), + cfg.IntOpt('wait_mpath_device_interval', + default=1, + min=1, + help='Interval value to wait for multipath device to be ready ' + 'for I/O. Max number of attempts is set in ' + '``wait_mpath_device_attempts``. Time in seconds to wait ' + 'for each retry is ``base ^ attempt * interval``, so for ' + '4 attempts (1 attempt 3 retries) and 1 second interval ' + 'will yield: 2, 4 and 8 seconds. Note that there is no ' + 'wait before first attempt. Default value is 1.'), ] cfg.CONF.register_opts(_opts, group='os_brick') diff --git a/os_brick/tests/initiator/connectors/test_fibre_channel.py b/os_brick/tests/initiator/connectors/test_fibre_channel.py index 92f00b666..7054e29a1 100644 --- a/os_brick/tests/initiator/connectors/test_fibre_channel.py +++ b/os_brick/tests/initiator/connectors/test_fibre_channel.py @@ -291,6 +291,7 @@ class FibreChannelConnectorTestCase(test_connector.ConnectorTestCase): connection_info['data']) @mock.patch.object(linuxscsi.LinuxSCSI, 'find_multipath_device_path') + @mock.patch.object(linuxscsi.LinuxSCSI, 'wait_for_mpath_device') def _test_connect_volume_multipath(self, get_device_info_mock, get_scsi_wwn_mock, get_fc_hbas_info_mock, @@ -301,6 +302,7 @@ class FibreChannelConnectorTestCase(test_connector.ConnectorTestCase): find_mp_dev_mock, access_mode, should_wait_for_rw, + wait_mpath_device_mock, find_mp_device_path_mock): self.connector.use_multipath = True get_fc_hbas_mock.side_effect = self.fake_get_fc_hbas diff --git a/os_brick/tests/initiator/test_linuxscsi.py b/os_brick/tests/initiator/test_linuxscsi.py index a8d7bf817..b16bf179d 100644 --- a/os_brick/tests/initiator/test_linuxscsi.py +++ b/os_brick/tests/initiator/test_linuxscsi.py @@ -1443,3 +1443,39 @@ loop0 0""" def test_lun_for_addressing_sam3_flat(self, original_lun, expected_lun): lun = self.linuxscsi.lun_for_addressing(original_lun, 'SAM3-flat') self.assertEqual(expected_lun, lun) + + @mock.patch.object(linuxscsi, 'LOG') + @mock.patch.object(linuxscsi.LinuxSCSI, '_execute') + def test_wait_for_mpath_device(self, exec_mock, mock_log): + exec_mock.return_value = ( + "3293379.070675 | /dev/dm-7: path sdb is usable", + None, + ) + mpath_name = 'dm-7' + self.linuxscsi.wait_for_mpath_device(mpath_name) + + self.assertEqual(1, exec_mock.call_count) + exec_mock.assert_called_once_with( + 'multipath', '-C', mpath_name, + attempts=4, interval=1, + run_as_root=True, + root_helper=self.linuxscsi._root_helper) + mock_log.error.assert_not_called() + + @mock.patch.object(linuxscsi, 'LOG') + @mock.patch.object(linuxscsi.LinuxSCSI, '_execute') + def test_wait_for_mpath_device_fails(self, exec_mock, mock_log): + exec_mock.side_effect = putils.ProcessExecutionError + + mpath_name = 'dm-7' + exc = self.assertRaises(putils.ProcessExecutionError, + self.linuxscsi.wait_for_mpath_device, 'dm-7') + + exec_mock.assert_called_once_with( + 'multipath', '-C', mpath_name, + attempts=4, interval=1, + run_as_root=True, + root_helper=self.linuxscsi._root_helper) + mock_log.error.assert_called_once_with( + "Failed to get mpath device %(mpath)s ready for " + "I/O: %(except)s", {'mpath': mpath_name, 'except': exc}) diff --git a/releasenotes/notes/wait-mpath-io-703605e74ee009ef.yaml b/releasenotes/notes/wait-mpath-io-703605e74ee009ef.yaml new file mode 100644 index 000000000..0d8226b66 --- /dev/null +++ b/releasenotes/notes/wait-mpath-io-703605e74ee009ef.yaml @@ -0,0 +1,16 @@ +--- +fixes: + - | + `Bug #2067949 `_: + Fixed issue where we try to write into a multipath device and fail + since it is not ready for I/O. Now we wait until the I/O is likely + to succeed. + We introduced 2 new config options to make the wait time + configurable: + + * ``wait_mpath_device_attempts`` + * ``wait_mpath_device_interval`` + + These options defaults to 4 attempts and 1 second interval + respectively. See help text of the config options for more + information.