From cd9da93ee12b1ba6145ec5b196cc02f01b84db3f Mon Sep 17 00:00:00 2001 From: Gorka Eguileor Date: Mon, 6 Aug 2018 18:47:27 +0200 Subject: [PATCH] Fix multipath disconnect with path failure Under certain conditions detaching a multipath device may result on failure when flushing one of the individual paths, but the disconnect should have succeeded, because there were other paths available to flush all the data. OS-Brick is currently following standard recommended disconnect mechanism for multipath devices: - Release all device holders - Flush multipath - Flush single paths - Delete single devices The problem is that this procedure does an innecessary step, flushing individual single paths, that may result in an error. Originally it was thought that the individual flushes were necessary to prevent data loss, but upon further study of the multipath-tools and the device-mapper code it was discovered that this is not really the case. After the multipath flushing has been completed we can be sure that the data has been successfully sent and acknowledge by the device. Closes-Bug: #1785669 Change-Id: I10f7fea2d69d5d9011f0d5486863a8d9d8a9696e (cherry picked from commit d866ee75c23e05efee5688a948498831980d98e3) (cherry picked from commit b253e3169625a9617a528f9f989808fb041780db) --- .../initiator/connectors/fibre_channel.py | 13 ++- .../connectors/fibre_channel_s390x.py | 2 +- os_brick/initiator/connectors/iscsi.py | 18 ++-- os_brick/initiator/linuxscsi.py | 50 +++++++++-- .../connectors/test_fibre_channel.py | 24 +++++- .../connectors/test_fibre_channel_s390x.py | 3 +- .../tests/initiator/connectors/test_iscsi.py | 26 ++++-- os_brick/tests/initiator/test_linuxscsi.py | 86 ++++++++++++++++++- ...multipath-disconnect-819d01e6e981883e.yaml | 10 +++ 9 files changed, 204 insertions(+), 28 deletions(-) create mode 100644 releasenotes/notes/fix-multipath-disconnect-819d01e6e981883e.yaml diff --git a/os_brick/initiator/connectors/fibre_channel.py b/os_brick/initiator/connectors/fibre_channel.py index 4e2448f3e..849fb7e9a 100644 --- a/os_brick/initiator/connectors/fibre_channel.py +++ b/os_brick/initiator/connectors/fibre_channel.py @@ -270,14 +270,21 @@ class FibreChannelConnector(base.BaseLinuxConnector): devices.append(device_info) LOG.debug("devices to remove = %s", devices) - self._remove_devices(connection_properties, devices) + self._remove_devices(connection_properties, devices, device_info) - def _remove_devices(self, connection_properties, devices): + def _remove_devices(self, connection_properties, devices, device_info): # There may have been more than 1 device mounted # by the kernel for this volume. We have to remove # all of them + path_used = self._linuxscsi.get_dev_path(connection_properties, + device_info) + was_multipath = '/pci-' not in path_used for device in devices: - self._linuxscsi.remove_scsi_device(device["device"]) + device_path = device['device'] + flush = self._linuxscsi.requires_flush(device_path, + path_used, + was_multipath) + self._linuxscsi.remove_scsi_device(device_path, flush=flush) def _get_pci_num(self, hba): # NOTE(walter-boring) diff --git a/os_brick/initiator/connectors/fibre_channel_s390x.py b/os_brick/initiator/connectors/fibre_channel_s390x.py index 4d39b1b07..bfd6b3f70 100644 --- a/os_brick/initiator/connectors/fibre_channel_s390x.py +++ b/os_brick/initiator/connectors/fibre_channel_s390x.py @@ -87,7 +87,7 @@ class FibreChannelConnectorS390X(fibre_channel.FibreChannelConnector): ] return host_device - def _remove_devices(self, connection_properties, devices): + def _remove_devices(self, connection_properties, devices, device_info): hbas = self._linuxfc.get_fc_hbas_info() ports = connection_properties['target_wwn'] possible_devs = self._get_possible_devices(hbas, ports) diff --git a/os_brick/initiator/connectors/iscsi.py b/os_brick/initiator/connectors/iscsi.py index 5836c3753..fa21608c0 100644 --- a/os_brick/initiator/connectors/iscsi.py +++ b/os_brick/initiator/connectors/iscsi.py @@ -845,10 +845,12 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector): :type ignore_errors: bool """ return self._cleanup_connection(connection_properties, force=force, - ignore_errors=ignore_errors) + ignore_errors=ignore_errors, + device_info=device_info) def _cleanup_connection(self, connection_properties, ips_iqns_luns=None, - force=False, ignore_errors=False): + force=False, ignore_errors=False, + device_info=None): """Cleans up connection flushing and removing devices and multipath. :param connection_properties: The dictionary that describes all @@ -864,6 +866,7 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector): :param ignore_errors: When force is True, this will decide whether to ignore errors or raise an exception once finished the operation. Default is False. + :param device_info: Attached device information. :type ignore_errors: bool """ exc = exception.ExceptionChainer() @@ -880,9 +883,14 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector): remove_devices = set() for remove, __ in devices_map.values(): remove_devices.update(remove) - multipath_name = self._linuxscsi.remove_connection(remove_devices, - self.use_multipath, - force, exc) + + path_used = self._linuxscsi.get_dev_path(connection_properties, + device_info) + was_multipath = (path_used.startswith('/dev/dm-') or + 'mpath' in path_used) + multipath_name = self._linuxscsi.remove_connection( + remove_devices, self.use_multipath, force, exc, + path_used, was_multipath) # Disconnect sessions and remove nodes that are left without devices disconnect = [conn for conn, (__, keep) in devices_map.items() diff --git a/os_brick/initiator/linuxscsi.py b/os_brick/initiator/linuxscsi.py index 9c631dc64..a0f26a507 100644 --- a/os_brick/initiator/linuxscsi.py +++ b/os_brick/initiator/linuxscsi.py @@ -60,14 +60,16 @@ class LinuxSCSI(executor.Executor): else: return None - def remove_scsi_device(self, device, force=False, exc=None): + def remove_scsi_device(self, device, force=False, exc=None, + flush=True): """Removes a scsi device based upon /dev/sdX name.""" path = "/sys/block/%s/device/delete" % device.replace("/dev/", "") if os.path.exists(path): exc = exception.ExceptionChainer() if exc is None else exc - # flush any outstanding IO first - with exc.context(force, 'Flushing %s failed', device): - self.flush_device_io(device) + if flush: + # flush any outstanding IO first + with exc.context(force, 'Flushing %s failed', device): + self.flush_device_io(device) LOG.debug("Remove SCSI device %(device)s with %(path)s", {'device': device, 'path': path}) @@ -193,14 +195,48 @@ class LinuxSCSI(executor.Executor): return dm return None + @staticmethod + def get_dev_path(connection_properties, device_info): + """Determine what path was used by Nova/Cinder to access volume.""" + if device_info and device_info.get('path'): + return device_info.get('path') + + return connection_properties.get('device_path') or '' + + @staticmethod + def requires_flush(path, path_used, was_multipath): + """Check if a device needs to be flushed when detaching. + + A device representing a single path connection to a volume must only be + flushed if it has been used directly by Nova or Cinder to write data. + + If the path has been used via a multipath DM or if the device was part + of a multipath but a different single path was used for I/O (instead of + the multipath) then we don't need to flush. + """ + # No used path happens on failed attachs, when we don't care about + # individual flushes. + # When Nova used a multipath we don't need to do individual flushes. + if not path_used or was_multipath: + return False + + # We need to flush the single path that was used. + # For encrypted volumes the symlink has been replaced, so realpath + # won't return device under /dev but under /dev/disk/... + path = os.path.realpath(path) + path_used = os.path.realpath(path_used) + return path_used == path or '/dev' != os.path.split(path_used)[0] + def remove_connection(self, devices_names, is_multipath, force=False, - exc=None): + exc=None, path_used=None, was_multipath=False): """Remove LUNs and multipath associated with devices names. :param devices_names: Iterable with real device names ('sda', 'sdb') :param is_multipath: Whether this is a multipath connection or not :param force: Whether to forcefully disconnect even if flush fails. :param exc: ExceptionChainer where to add exceptions if forcing + :param path_used: What path was used by Nova/Cinder for I/O + :param was_multipath: If the path used for I/O was a multipath :returns: Multipath device map name if found and not flushed """ if not devices_names: @@ -220,7 +256,9 @@ class LinuxSCSI(executor.Executor): multipath_name = None for device_name in devices_names: - self.remove_scsi_device('/dev/' + device_name, force, exc) + dev_path = '/dev/' + device_name + flush = self.requires_flush(dev_path, path_used, was_multipath) + self.remove_scsi_device(dev_path, force, exc, flush) # Wait until the symlinks are removed with exc.context(force, 'Some devices remain from %s', devices_names): diff --git a/os_brick/tests/initiator/connectors/test_fibre_channel.py b/os_brick/tests/initiator/connectors/test_fibre_channel.py index 916a9bd3c..c8c0465e2 100644 --- a/os_brick/tests/initiator/connectors/test_fibre_channel.py +++ b/os_brick/tests/initiator/connectors/test_fibre_channel.py @@ -11,6 +11,7 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. +import ddt import mock import os import six @@ -23,6 +24,7 @@ from os_brick.initiator import linuxscsi from os_brick.tests.initiator import test_connector +@ddt.ddt class FibreChannelConnectorTestCase(test_connector.ConnectorTestCase): def setUp(self): @@ -249,9 +251,7 @@ class FibreChannelConnectorTestCase(test_connector.ConnectorTestCase): devices['devices'][0]) expected_commands = [ 'multipath -f ' + find_mp_device_path_mock.return_value, - 'blockdev --flushbufs /dev/sdb', 'tee -a /sys/block/sdb/device/delete', - 'blockdev --flushbufs /dev/sdc', 'tee -a /sys/block/sdc/device/delete', ] self.assertEqual(expected_commands, self.cmds) @@ -450,3 +450,23 @@ class FibreChannelConnectorTestCase(test_connector.ConnectorTestCase): find_mp_dev_mock, 'rw', True) + + @ddt.data(('/dev/mapper/', True), + ('/dev/mapper/mpath0', True), + ('/dev/disk/by-path/pci-1-fc-1-lun-1', False)) + @ddt.unpack + @mock.patch('os_brick.initiator.linuxscsi.LinuxSCSI.remove_scsi_device') + @mock.patch('os_brick.initiator.linuxscsi.LinuxSCSI.requires_flush') + @mock.patch('os_brick.initiator.linuxscsi.LinuxSCSI.get_dev_path') + def test__remove_devices(self, path_used, was_multipath, get_dev_path_mock, + flush_mock, remove_mock): + get_dev_path_mock.return_value = path_used + self.connector._remove_devices(mock.sentinel.con_props, + [{'device': '/dev/sda'}], + mock.sentinel.device_info) + get_dev_path_mock.assert_called_once_with(mock.sentinel.con_props, + mock.sentinel.device_info) + flush_mock.assert_called_once_with('/dev/sda', path_used, + was_multipath) + remove_mock.assert_called_once_with('/dev/sda', + flush=flush_mock.return_value) diff --git a/os_brick/tests/initiator/connectors/test_fibre_channel_s390x.py b/os_brick/tests/initiator/connectors/test_fibre_channel_s390x.py index 0f3f16eac..e02fdb709 100644 --- a/os_brick/tests/initiator/connectors/test_fibre_channel_s390x.py +++ b/os_brick/tests/initiator/connectors/test_fibre_channel_s390x.py @@ -68,7 +68,8 @@ class FibreChannelConnectorS390XTestCase(test_connector.ConnectorTestCase): def test_remove_devices(self, mock_deconfigure_scsi_device, mock_get_fc_hbas_info, mock_get_possible_devices): connection_properties = {'target_wwn': 5, 'target_lun': 2} - self.connector._remove_devices(connection_properties, devices=None) + self.connector._remove_devices(connection_properties, devices=None, + device_info=None) mock_deconfigure_scsi_device.assert_called_with(3, 5, "0x0002000000000000") mock_get_fc_hbas_info.assert_called_once_with() diff --git a/os_brick/tests/initiator/connectors/test_iscsi.py b/os_brick/tests/initiator/connectors/test_iscsi.py index f8af45256..e12f80303 100644 --- a/os_brick/tests/initiator/connectors/test_iscsi.py +++ b/os_brick/tests/initiator/connectors/test_iscsi.py @@ -538,7 +538,8 @@ class ISCSIConnectorTestCase(test_connector.ConnectorTestCase): cleanup_mock.assert_called_once_with( mock.sentinel.con_props, force=mock.sentinel.Force, - ignore_errors=mock.sentinel.ignore_errors) + ignore_errors=mock.sentinel.ignore_errors, + device_info=mock.sentinel.dev_info) @ddt.data(True, False) @mock.patch.object(iscsi.ISCSIConnector, '_get_transport') @@ -649,13 +650,21 @@ class ISCSIConnectorTestCase(test_connector.ConnectorTestCase): ['-m', 'discoverydb', '-o', 'show', '-P', 1]) transport_mock.assert_called_once_with() + @ddt.data(('/dev/sda', False), + ('/dev/disk/by-id/scsi-WWID', False), + ('/dev/dm-11', True), + ('/dev/disk/by-id/dm-uuid-mpath-MPATH', True)) + @ddt.unpack + @mock.patch.object(linuxscsi.LinuxSCSI, 'get_dev_path') @mock.patch.object(iscsi.ISCSIConnector, '_disconnect_connection') @mock.patch.object(iscsi.ISCSIConnector, '_get_connection_devices') @mock.patch.object(linuxscsi.LinuxSCSI, 'flush_multipath_device') @mock.patch.object(linuxscsi.LinuxSCSI, 'remove_connection', return_value=None) - def test_cleanup_connection(self, remove_mock, flush_mock, con_devs_mock, - discon_mock): + def test_cleanup_connection(self, path_used, was_multipath, remove_mock, + flush_mock, con_devs_mock, discon_mock, + get_dev_path_mock): + get_dev_path_mock.return_value = path_used # Return an ordered dicts instead of normal dict for discon_mock.assert con_devs_mock.return_value = collections.OrderedDict(( (('ip1:port1', 'tgt1'), ({'sda'}, set())), @@ -666,12 +675,16 @@ class ISCSIConnectorTestCase(test_connector.ConnectorTestCase): 'use_multipath') as use_mp_mock: self.connector._cleanup_connection( self.CON_PROPS, ips_iqns_luns=mock.sentinel.ips_iqns_luns, - force=False, ignore_errors=False) + force=False, ignore_errors=False, + device_info=mock.sentinel.device_info) + get_dev_path_mock.called_once_with(self.CON_PROPS, + mock.sentinel.device_info) con_devs_mock.assert_called_once_with(self.CON_PROPS, mock.sentinel.ips_iqns_luns) remove_mock.assert_called_once_with({'sda', 'sdb'}, use_mp_mock, - False, mock.ANY) + False, mock.ANY, + path_used, was_multipath) discon_mock.assert_called_once_with( self.CON_PROPS, [('ip1:port1', 'tgt1'), ('ip3:port3', 'tgt3')], @@ -707,7 +720,8 @@ class ISCSIConnectorTestCase(test_connector.ConnectorTestCase): con_devs_mock.assert_called_once_with(self.CON_PROPS, mock.sentinel.ips_iqns_luns) remove_mock.assert_called_once_with({'sda', 'sdb'}, use_mp_mock, - mock.sentinel.force, mock.ANY) + mock.sentinel.force, mock.ANY, + '', False) discon_mock.assert_called_once_with( self.CON_PROPS, [('ip1:port1', 'tgt1'), ('ip3:port3', 'tgt3')], diff --git a/os_brick/tests/initiator/test_linuxscsi.py b/os_brick/tests/initiator/test_linuxscsi.py index 0c5d65eb0..504f205a0 100644 --- a/os_brick/tests/initiator/test_linuxscsi.py +++ b/os_brick/tests/initiator/test_linuxscsi.py @@ -33,6 +33,7 @@ class LinuxSCSITestCase(base.TestCase): def setUp(self): super(LinuxSCSITestCase, self).setUp() self.cmds = [] + self.realpath = os.path.realpath self.mock_object(os.path, 'realpath', return_value='/dev/sdc') self.mock_object(os, 'stat', returns=os.stat(__file__)) self.linuxscsi = linuxscsi.LinuxSCSI(None, execute=self.fake_execute) @@ -91,6 +92,16 @@ class LinuxSCSITestCase(base.TestCase): flush_mock.assert_called_once_with(device) echo_mock.assert_called_once_with('/sys/block/sdc/device/delete', '1') + @mock.patch.object(os.path, 'exists', return_value=False) + def test_remove_scsi_device_no_flush(self, exists_mock): + self.linuxscsi.remove_scsi_device("/dev/sdc") + expected_commands = [] + self.assertEqual(expected_commands, self.cmds) + exists_mock.return_value = True + self.linuxscsi.remove_scsi_device("/dev/sdc", flush=False) + expected_commands = [('tee -a /sys/block/sdc/device/delete')] + self.assertEqual(expected_commands, self.cmds) + @mock.patch('time.sleep') @mock.patch('os.path.exists', return_value=True) def test_wait_for_volumes_removal_failure(self, exists_mock, sleep_mock): @@ -243,8 +254,8 @@ class LinuxSCSITestCase(base.TestCase): self.assertEqual(get_dm_name_mock.return_value if do_raise else None, mp_name) remove_mock.assert_has_calls([ - mock.call('/dev/sda', mock.sentinel.Force, exc), - mock.call('/dev/sdb', mock.sentinel.Force, exc)]) + mock.call('/dev/sda', mock.sentinel.Force, exc, False), + mock.call('/dev/sdb', mock.sentinel.Force, exc, False)]) wait_mock.assert_called_once_with(devices_names) self.assertEqual(do_raise, bool(exc)) remove_link_mock.assert_called_once_with(devices_names) @@ -285,8 +296,28 @@ class LinuxSCSITestCase(base.TestCase): force=mock.sentinel.Force, exc=exc) remove_mock.assert_has_calls( - [mock.call('/dev/sda', mock.sentinel.Force, exc), - mock.call('/dev/sdb', mock.sentinel.Force, exc)]) + [mock.call('/dev/sda', mock.sentinel.Force, exc, False), + mock.call('/dev/sdb', mock.sentinel.Force, exc, False)]) + wait_mock.assert_called_once_with(devices_names) + remove_link_mock.assert_called_once_with(devices_names) + + @mock.patch.object(linuxscsi.LinuxSCSI, '_remove_scsi_symlinks') + @mock.patch.object(linuxscsi.LinuxSCSI, 'wait_for_volumes_removal') + @mock.patch.object(linuxscsi.LinuxSCSI, 'remove_scsi_device') + def test_remove_connection_singlepath_used(self, remove_mock, wait_mock, + remove_link_mock): + devices_names = ('sda', 'sdb') + exc = exception.ExceptionChainer() + + # realpath was mocked on test setup + with mock.patch('os.path.realpath', side_effect=self.realpath): + self.linuxscsi.remove_connection(devices_names, is_multipath=True, + force=mock.sentinel.Force, + exc=exc, path_used='/dev/sdb', + was_multipath=False) + remove_mock.assert_has_calls( + [mock.call('/dev/sda', mock.sentinel.Force, exc, False), + mock.call('/dev/sdb', mock.sentinel.Force, exc, True)]) wait_mock.assert_called_once_with(devices_names) remove_link_mock.assert_called_once_with(devices_names) @@ -953,3 +984,50 @@ loop0 0""" def test_multipath_add_path(self): self.linuxscsi.multipath_add_path('/dev/sda') self.assertEqual(['multipathd add path /dev/sda'], self.cmds) + + @ddt.data({'con_props': {}, 'dev_info': {'path': mock.sentinel.path}}, + {'con_props': None, 'dev_info': {'path': mock.sentinel.path}}, + {'con_props': {'device_path': mock.sentinel.device_path}, + 'dev_info': {'path': mock.sentinel.path}}) + @ddt.unpack + def test_get_dev_path_device_info(self, con_props, dev_info): + self.assertEqual(mock.sentinel.path, + self.linuxscsi.get_dev_path(con_props, dev_info)) + + @ddt.data({'con_props': {'device_path': mock.sentinel.device_path}, + 'dev_info': {'path': None}}, + {'con_props': {'device_path': mock.sentinel.device_path}, + 'dev_info': {'path': ''}}, + {'con_props': {'device_path': mock.sentinel.device_path}, + 'dev_info': {}}, + {'con_props': {'device_path': mock.sentinel.device_path}, + 'dev_info': None}) + @ddt.unpack + def test_get_dev_path_conn_props(self, con_props, dev_info): + self.assertEqual(mock.sentinel.device_path, + self.linuxscsi.get_dev_path(con_props, dev_info)) + + @ddt.data({'con_props': {'device_path': ''}, 'dev_info': {'path': None}}, + {'con_props': {'device_path': None}, 'dev_info': {'path': ''}}, + {'con_props': {}, 'dev_info': {}}, + {'con_props': {}, 'dev_info': None}) + @ddt.unpack + def test_get_dev_path_no_path(self, con_props, dev_info): + self.assertEqual('', self.linuxscsi.get_dev_path(con_props, dev_info)) + + @ddt.data(('/dev/sda', '/dev/sda', True, False, None), + ('/dev/sda', '', True, False, None), + ('/dev/link_sda', '/dev/link_sdb', False, False, ('/dev/sda', + '/dev/sdb')), + ('/dev/link_sda', '/dev/link2_sda', False, True, ('/dev/sda', + '/dev/sda'))) + @ddt.unpack + def test_requires_flush(self, path, path_used, was_multipath, expected, + real_paths): + with mock.patch('os.path.realpath', side_effect=real_paths) as mocked: + self.assertEqual( + expected, + self.linuxscsi.requires_flush(path, path_used, was_multipath)) + if real_paths: + mocked.assert_has_calls([mock.call(path), + mock.call(path_used)]) diff --git a/releasenotes/notes/fix-multipath-disconnect-819d01e6e981883e.yaml b/releasenotes/notes/fix-multipath-disconnect-819d01e6e981883e.yaml new file mode 100644 index 000000000..7c33131e2 --- /dev/null +++ b/releasenotes/notes/fix-multipath-disconnect-819d01e6e981883e.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + Under certain conditions detaching a multipath device may result in + failure when flushing one of the individual paths, but the disconnect + should have succeeded, because there were other paths available to flush + all the data. + + The multipath disconnect mechanism is now more robust and will only fail + when disconnecting if multipath would lose data.