From d866ee75c23e05efee5688a948498831980d98e3 Mon Sep 17 00:00:00 2001 From: Gorka Eguileor Date: Mon, 6 Aug 2018 18:47:27 +0200 Subject: [PATCH] Fix multipath disconnect with path failure Under certain conditions detaching a multipath device may result on failure when flushing one of the individual paths, but the disconnect should have succeeded, because there were other paths available to flush all the data. OS-Brick is currently following standard recommended disconnect mechanism for multipath devices: - Release all device holders - Flush multipath - Flush single paths - Delete single devices The problem is that this procedure does an innecessary step, flushing individual single paths, that may result in an error. Originally it was thought that the individual flushes were necessary to prevent data loss, but upon further study of the multipath-tools and the device-mapper code it was discovered that this is not really the case. After the multipath flushing has been completed we can be sure that the data has been successfully sent and acknowledge by the device. Closes-Bug: #1785669 Change-Id: I10f7fea2d69d5d9011f0d5486863a8d9d8a9696e --- .../initiator/connectors/fibre_channel.py | 13 ++- .../connectors/fibre_channel_s390x.py | 2 +- os_brick/initiator/connectors/iscsi.py | 18 ++-- os_brick/initiator/linuxscsi.py | 50 +++++++++-- .../connectors/test_fibre_channel.py | 22 ++++- .../connectors/test_fibre_channel_s390x.py | 3 +- .../tests/initiator/connectors/test_iscsi.py | 26 ++++-- os_brick/tests/initiator/test_linuxscsi.py | 86 ++++++++++++++++++- ...multipath-disconnect-819d01e6e981883e.yaml | 10 +++ 9 files changed, 202 insertions(+), 28 deletions(-) create mode 100644 releasenotes/notes/fix-multipath-disconnect-819d01e6e981883e.yaml diff --git a/os_brick/initiator/connectors/fibre_channel.py b/os_brick/initiator/connectors/fibre_channel.py index 9742101fd..b66ce47fd 100644 --- a/os_brick/initiator/connectors/fibre_channel.py +++ b/os_brick/initiator/connectors/fibre_channel.py @@ -333,14 +333,21 @@ class FibreChannelConnector(base.BaseLinuxConnector): devices.append(device_info) LOG.debug("devices to remove = %s", devices) - self._remove_devices(connection_properties, devices) + self._remove_devices(connection_properties, devices, device_info) - def _remove_devices(self, connection_properties, devices): + def _remove_devices(self, connection_properties, devices, device_info): # There may have been more than 1 device mounted # by the kernel for this volume. We have to remove # all of them + path_used = self._linuxscsi.get_dev_path(connection_properties, + device_info) + was_multipath = '/pci-' not in path_used for device in devices: - self._linuxscsi.remove_scsi_device(device["device"]) + device_path = device['device'] + flush = self._linuxscsi.requires_flush(device_path, + path_used, + was_multipath) + self._linuxscsi.remove_scsi_device(device_path, flush=flush) def _get_pci_num(self, hba): # NOTE(walter-boring) diff --git a/os_brick/initiator/connectors/fibre_channel_s390x.py b/os_brick/initiator/connectors/fibre_channel_s390x.py index 4d39b1b07..bfd6b3f70 100644 --- a/os_brick/initiator/connectors/fibre_channel_s390x.py +++ b/os_brick/initiator/connectors/fibre_channel_s390x.py @@ -87,7 +87,7 @@ class FibreChannelConnectorS390X(fibre_channel.FibreChannelConnector): ] return host_device - def _remove_devices(self, connection_properties, devices): + def _remove_devices(self, connection_properties, devices, device_info): hbas = self._linuxfc.get_fc_hbas_info() ports = connection_properties['target_wwn'] possible_devs = self._get_possible_devices(hbas, ports) diff --git a/os_brick/initiator/connectors/iscsi.py b/os_brick/initiator/connectors/iscsi.py index 5836c3753..fa21608c0 100644 --- a/os_brick/initiator/connectors/iscsi.py +++ b/os_brick/initiator/connectors/iscsi.py @@ -845,10 +845,12 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector): :type ignore_errors: bool """ return self._cleanup_connection(connection_properties, force=force, - ignore_errors=ignore_errors) + ignore_errors=ignore_errors, + device_info=device_info) def _cleanup_connection(self, connection_properties, ips_iqns_luns=None, - force=False, ignore_errors=False): + force=False, ignore_errors=False, + device_info=None): """Cleans up connection flushing and removing devices and multipath. :param connection_properties: The dictionary that describes all @@ -864,6 +866,7 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector): :param ignore_errors: When force is True, this will decide whether to ignore errors or raise an exception once finished the operation. Default is False. + :param device_info: Attached device information. :type ignore_errors: bool """ exc = exception.ExceptionChainer() @@ -880,9 +883,14 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector): remove_devices = set() for remove, __ in devices_map.values(): remove_devices.update(remove) - multipath_name = self._linuxscsi.remove_connection(remove_devices, - self.use_multipath, - force, exc) + + path_used = self._linuxscsi.get_dev_path(connection_properties, + device_info) + was_multipath = (path_used.startswith('/dev/dm-') or + 'mpath' in path_used) + multipath_name = self._linuxscsi.remove_connection( + remove_devices, self.use_multipath, force, exc, + path_used, was_multipath) # Disconnect sessions and remove nodes that are left without devices disconnect = [conn for conn, (__, keep) in devices_map.items() diff --git a/os_brick/initiator/linuxscsi.py b/os_brick/initiator/linuxscsi.py index 432e6e666..221ef4372 100644 --- a/os_brick/initiator/linuxscsi.py +++ b/os_brick/initiator/linuxscsi.py @@ -60,14 +60,16 @@ class LinuxSCSI(executor.Executor): else: return None - def remove_scsi_device(self, device, force=False, exc=None): + def remove_scsi_device(self, device, force=False, exc=None, + flush=True): """Removes a scsi device based upon /dev/sdX name.""" path = "/sys/block/%s/device/delete" % device.replace("/dev/", "") if os.path.exists(path): exc = exception.ExceptionChainer() if exc is None else exc - # flush any outstanding IO first - with exc.context(force, 'Flushing %s failed', device): - self.flush_device_io(device) + if flush: + # flush any outstanding IO first + with exc.context(force, 'Flushing %s failed', device): + self.flush_device_io(device) LOG.debug("Remove SCSI device %(device)s with %(path)s", {'device': device, 'path': path}) @@ -193,14 +195,48 @@ class LinuxSCSI(executor.Executor): return dm return None + @staticmethod + def get_dev_path(connection_properties, device_info): + """Determine what path was used by Nova/Cinder to access volume.""" + if device_info and device_info.get('path'): + return device_info.get('path') + + return connection_properties.get('device_path') or '' + + @staticmethod + def requires_flush(path, path_used, was_multipath): + """Check if a device needs to be flushed when detaching. + + A device representing a single path connection to a volume must only be + flushed if it has been used directly by Nova or Cinder to write data. + + If the path has been used via a multipath DM or if the device was part + of a multipath but a different single path was used for I/O (instead of + the multipath) then we don't need to flush. + """ + # No used path happens on failed attachs, when we don't care about + # individual flushes. + # When Nova used a multipath we don't need to do individual flushes. + if not path_used or was_multipath: + return False + + # We need to flush the single path that was used. + # For encrypted volumes the symlink has been replaced, so realpath + # won't return device under /dev but under /dev/disk/... + path = os.path.realpath(path) + path_used = os.path.realpath(path_used) + return path_used == path or '/dev' != os.path.split(path_used)[0] + def remove_connection(self, devices_names, is_multipath, force=False, - exc=None): + exc=None, path_used=None, was_multipath=False): """Remove LUNs and multipath associated with devices names. :param devices_names: Iterable with real device names ('sda', 'sdb') :param is_multipath: Whether this is a multipath connection or not :param force: Whether to forcefully disconnect even if flush fails. :param exc: ExceptionChainer where to add exceptions if forcing + :param path_used: What path was used by Nova/Cinder for I/O + :param was_multipath: If the path used for I/O was a multipath :returns: Multipath device map name if found and not flushed """ if not devices_names: @@ -220,7 +256,9 @@ class LinuxSCSI(executor.Executor): multipath_name = None for device_name in devices_names: - self.remove_scsi_device('/dev/' + device_name, force, exc) + dev_path = '/dev/' + device_name + flush = self.requires_flush(dev_path, path_used, was_multipath) + self.remove_scsi_device(dev_path, force, exc, flush) # Wait until the symlinks are removed with exc.context(force, 'Some devices remain from %s', devices_names): diff --git a/os_brick/tests/initiator/connectors/test_fibre_channel.py b/os_brick/tests/initiator/connectors/test_fibre_channel.py index fc4418c31..2521d9459 100644 --- a/os_brick/tests/initiator/connectors/test_fibre_channel.py +++ b/os_brick/tests/initiator/connectors/test_fibre_channel.py @@ -259,9 +259,7 @@ class FibreChannelConnectorTestCase(test_connector.ConnectorTestCase): devices['devices'][0]) expected_commands = [ 'multipath -f ' + find_mp_device_path_mock.return_value, - 'blockdev --flushbufs /dev/sdb', 'tee -a /sys/block/sdb/device/delete', - 'blockdev --flushbufs /dev/sdc', 'tee -a /sys/block/sdc/device/delete', ] self.assertEqual(expected_commands, self.cmds) @@ -617,3 +615,23 @@ class FibreChannelConnectorTestCase(test_connector.ConnectorTestCase): self.assertIn('initiator_target_lun_map', connection_info) self.assertEqual(expected_map, connection_info['initiator_target_lun_map']) + + @ddt.data(('/dev/mapper/', True), + ('/dev/mapper/mpath0', True), + ('/dev/disk/by-path/pci-1-fc-1-lun-1', False)) + @ddt.unpack + @mock.patch('os_brick.initiator.linuxscsi.LinuxSCSI.remove_scsi_device') + @mock.patch('os_brick.initiator.linuxscsi.LinuxSCSI.requires_flush') + @mock.patch('os_brick.initiator.linuxscsi.LinuxSCSI.get_dev_path') + def test__remove_devices(self, path_used, was_multipath, get_dev_path_mock, + flush_mock, remove_mock): + get_dev_path_mock.return_value = path_used + self.connector._remove_devices(mock.sentinel.con_props, + [{'device': '/dev/sda'}], + mock.sentinel.device_info) + get_dev_path_mock.assert_called_once_with(mock.sentinel.con_props, + mock.sentinel.device_info) + flush_mock.assert_called_once_with('/dev/sda', path_used, + was_multipath) + remove_mock.assert_called_once_with('/dev/sda', + flush=flush_mock.return_value) diff --git a/os_brick/tests/initiator/connectors/test_fibre_channel_s390x.py b/os_brick/tests/initiator/connectors/test_fibre_channel_s390x.py index 0f3f16eac..e02fdb709 100644 --- a/os_brick/tests/initiator/connectors/test_fibre_channel_s390x.py +++ b/os_brick/tests/initiator/connectors/test_fibre_channel_s390x.py @@ -68,7 +68,8 @@ class FibreChannelConnectorS390XTestCase(test_connector.ConnectorTestCase): def test_remove_devices(self, mock_deconfigure_scsi_device, mock_get_fc_hbas_info, mock_get_possible_devices): connection_properties = {'target_wwn': 5, 'target_lun': 2} - self.connector._remove_devices(connection_properties, devices=None) + self.connector._remove_devices(connection_properties, devices=None, + device_info=None) mock_deconfigure_scsi_device.assert_called_with(3, 5, "0x0002000000000000") mock_get_fc_hbas_info.assert_called_once_with() diff --git a/os_brick/tests/initiator/connectors/test_iscsi.py b/os_brick/tests/initiator/connectors/test_iscsi.py index f8af45256..e12f80303 100644 --- a/os_brick/tests/initiator/connectors/test_iscsi.py +++ b/os_brick/tests/initiator/connectors/test_iscsi.py @@ -538,7 +538,8 @@ class ISCSIConnectorTestCase(test_connector.ConnectorTestCase): cleanup_mock.assert_called_once_with( mock.sentinel.con_props, force=mock.sentinel.Force, - ignore_errors=mock.sentinel.ignore_errors) + ignore_errors=mock.sentinel.ignore_errors, + device_info=mock.sentinel.dev_info) @ddt.data(True, False) @mock.patch.object(iscsi.ISCSIConnector, '_get_transport') @@ -649,13 +650,21 @@ class ISCSIConnectorTestCase(test_connector.ConnectorTestCase): ['-m', 'discoverydb', '-o', 'show', '-P', 1]) transport_mock.assert_called_once_with() + @ddt.data(('/dev/sda', False), + ('/dev/disk/by-id/scsi-WWID', False), + ('/dev/dm-11', True), + ('/dev/disk/by-id/dm-uuid-mpath-MPATH', True)) + @ddt.unpack + @mock.patch.object(linuxscsi.LinuxSCSI, 'get_dev_path') @mock.patch.object(iscsi.ISCSIConnector, '_disconnect_connection') @mock.patch.object(iscsi.ISCSIConnector, '_get_connection_devices') @mock.patch.object(linuxscsi.LinuxSCSI, 'flush_multipath_device') @mock.patch.object(linuxscsi.LinuxSCSI, 'remove_connection', return_value=None) - def test_cleanup_connection(self, remove_mock, flush_mock, con_devs_mock, - discon_mock): + def test_cleanup_connection(self, path_used, was_multipath, remove_mock, + flush_mock, con_devs_mock, discon_mock, + get_dev_path_mock): + get_dev_path_mock.return_value = path_used # Return an ordered dicts instead of normal dict for discon_mock.assert con_devs_mock.return_value = collections.OrderedDict(( (('ip1:port1', 'tgt1'), ({'sda'}, set())), @@ -666,12 +675,16 @@ class ISCSIConnectorTestCase(test_connector.ConnectorTestCase): 'use_multipath') as use_mp_mock: self.connector._cleanup_connection( self.CON_PROPS, ips_iqns_luns=mock.sentinel.ips_iqns_luns, - force=False, ignore_errors=False) + force=False, ignore_errors=False, + device_info=mock.sentinel.device_info) + get_dev_path_mock.called_once_with(self.CON_PROPS, + mock.sentinel.device_info) con_devs_mock.assert_called_once_with(self.CON_PROPS, mock.sentinel.ips_iqns_luns) remove_mock.assert_called_once_with({'sda', 'sdb'}, use_mp_mock, - False, mock.ANY) + False, mock.ANY, + path_used, was_multipath) discon_mock.assert_called_once_with( self.CON_PROPS, [('ip1:port1', 'tgt1'), ('ip3:port3', 'tgt3')], @@ -707,7 +720,8 @@ class ISCSIConnectorTestCase(test_connector.ConnectorTestCase): con_devs_mock.assert_called_once_with(self.CON_PROPS, mock.sentinel.ips_iqns_luns) remove_mock.assert_called_once_with({'sda', 'sdb'}, use_mp_mock, - mock.sentinel.force, mock.ANY) + mock.sentinel.force, mock.ANY, + '', False) discon_mock.assert_called_once_with( self.CON_PROPS, [('ip1:port1', 'tgt1'), ('ip3:port3', 'tgt3')], diff --git a/os_brick/tests/initiator/test_linuxscsi.py b/os_brick/tests/initiator/test_linuxscsi.py index 4c1f67c9a..96ab0da3f 100644 --- a/os_brick/tests/initiator/test_linuxscsi.py +++ b/os_brick/tests/initiator/test_linuxscsi.py @@ -33,6 +33,7 @@ class LinuxSCSITestCase(base.TestCase): def setUp(self): super(LinuxSCSITestCase, self).setUp() self.cmds = [] + self.realpath = os.path.realpath self.mock_object(os.path, 'realpath', return_value='/dev/sdc') self.mock_object(os, 'stat', returns=os.stat(__file__)) self.linuxscsi = linuxscsi.LinuxSCSI(None, execute=self.fake_execute) @@ -91,6 +92,16 @@ class LinuxSCSITestCase(base.TestCase): flush_mock.assert_called_once_with(device) echo_mock.assert_called_once_with('/sys/block/sdc/device/delete', '1') + @mock.patch.object(os.path, 'exists', return_value=False) + def test_remove_scsi_device_no_flush(self, exists_mock): + self.linuxscsi.remove_scsi_device("/dev/sdc") + expected_commands = [] + self.assertEqual(expected_commands, self.cmds) + exists_mock.return_value = True + self.linuxscsi.remove_scsi_device("/dev/sdc", flush=False) + expected_commands = [('tee -a /sys/block/sdc/device/delete')] + self.assertEqual(expected_commands, self.cmds) + @mock.patch('time.sleep') @mock.patch('os.path.exists', return_value=True) def test_wait_for_volumes_removal_failure(self, exists_mock, sleep_mock): @@ -243,8 +254,8 @@ class LinuxSCSITestCase(base.TestCase): self.assertEqual(get_dm_name_mock.return_value if do_raise else None, mp_name) remove_mock.assert_has_calls([ - mock.call('/dev/sda', mock.sentinel.Force, exc), - mock.call('/dev/sdb', mock.sentinel.Force, exc)]) + mock.call('/dev/sda', mock.sentinel.Force, exc, False), + mock.call('/dev/sdb', mock.sentinel.Force, exc, False)]) wait_mock.assert_called_once_with(devices_names) self.assertEqual(do_raise, bool(exc)) remove_link_mock.assert_called_once_with(devices_names) @@ -285,8 +296,28 @@ class LinuxSCSITestCase(base.TestCase): force=mock.sentinel.Force, exc=exc) remove_mock.assert_has_calls( - [mock.call('/dev/sda', mock.sentinel.Force, exc), - mock.call('/dev/sdb', mock.sentinel.Force, exc)]) + [mock.call('/dev/sda', mock.sentinel.Force, exc, False), + mock.call('/dev/sdb', mock.sentinel.Force, exc, False)]) + wait_mock.assert_called_once_with(devices_names) + remove_link_mock.assert_called_once_with(devices_names) + + @mock.patch.object(linuxscsi.LinuxSCSI, '_remove_scsi_symlinks') + @mock.patch.object(linuxscsi.LinuxSCSI, 'wait_for_volumes_removal') + @mock.patch.object(linuxscsi.LinuxSCSI, 'remove_scsi_device') + def test_remove_connection_singlepath_used(self, remove_mock, wait_mock, + remove_link_mock): + devices_names = ('sda', 'sdb') + exc = exception.ExceptionChainer() + + # realpath was mocked on test setup + with mock.patch('os.path.realpath', side_effect=self.realpath): + self.linuxscsi.remove_connection(devices_names, is_multipath=True, + force=mock.sentinel.Force, + exc=exc, path_used='/dev/sdb', + was_multipath=False) + remove_mock.assert_has_calls( + [mock.call('/dev/sda', mock.sentinel.Force, exc, False), + mock.call('/dev/sdb', mock.sentinel.Force, exc, True)]) wait_mock.assert_called_once_with(devices_names) remove_link_mock.assert_called_once_with(devices_names) @@ -991,3 +1022,50 @@ loop0 0""" def test_multipath_add_path(self): self.linuxscsi.multipath_add_path('/dev/sda') self.assertEqual(['multipathd add path /dev/sda'], self.cmds) + + @ddt.data({'con_props': {}, 'dev_info': {'path': mock.sentinel.path}}, + {'con_props': None, 'dev_info': {'path': mock.sentinel.path}}, + {'con_props': {'device_path': mock.sentinel.device_path}, + 'dev_info': {'path': mock.sentinel.path}}) + @ddt.unpack + def test_get_dev_path_device_info(self, con_props, dev_info): + self.assertEqual(mock.sentinel.path, + self.linuxscsi.get_dev_path(con_props, dev_info)) + + @ddt.data({'con_props': {'device_path': mock.sentinel.device_path}, + 'dev_info': {'path': None}}, + {'con_props': {'device_path': mock.sentinel.device_path}, + 'dev_info': {'path': ''}}, + {'con_props': {'device_path': mock.sentinel.device_path}, + 'dev_info': {}}, + {'con_props': {'device_path': mock.sentinel.device_path}, + 'dev_info': None}) + @ddt.unpack + def test_get_dev_path_conn_props(self, con_props, dev_info): + self.assertEqual(mock.sentinel.device_path, + self.linuxscsi.get_dev_path(con_props, dev_info)) + + @ddt.data({'con_props': {'device_path': ''}, 'dev_info': {'path': None}}, + {'con_props': {'device_path': None}, 'dev_info': {'path': ''}}, + {'con_props': {}, 'dev_info': {}}, + {'con_props': {}, 'dev_info': None}) + @ddt.unpack + def test_get_dev_path_no_path(self, con_props, dev_info): + self.assertEqual('', self.linuxscsi.get_dev_path(con_props, dev_info)) + + @ddt.data(('/dev/sda', '/dev/sda', True, False, None), + ('/dev/sda', '', True, False, None), + ('/dev/link_sda', '/dev/link_sdb', False, False, ('/dev/sda', + '/dev/sdb')), + ('/dev/link_sda', '/dev/link2_sda', False, True, ('/dev/sda', + '/dev/sda'))) + @ddt.unpack + def test_requires_flush(self, path, path_used, was_multipath, expected, + real_paths): + with mock.patch('os.path.realpath', side_effect=real_paths) as mocked: + self.assertEqual( + expected, + self.linuxscsi.requires_flush(path, path_used, was_multipath)) + if real_paths: + mocked.assert_has_calls([mock.call(path), + mock.call(path_used)]) diff --git a/releasenotes/notes/fix-multipath-disconnect-819d01e6e981883e.yaml b/releasenotes/notes/fix-multipath-disconnect-819d01e6e981883e.yaml new file mode 100644 index 000000000..7c33131e2 --- /dev/null +++ b/releasenotes/notes/fix-multipath-disconnect-819d01e6e981883e.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + Under certain conditions detaching a multipath device may result in + failure when flushing one of the individual paths, but the disconnect + should have succeeded, because there were other paths available to flush + all the data. + + The multipath disconnect mechanism is now more robust and will only fail + when disconnecting if multipath would lose data.