From f4ecdf0f606bfc5a5cb8f661b1f11540c0168ffe Mon Sep 17 00:00:00 2001 From: "Walter A. Boring IV" Date: Fri, 27 Mar 2015 15:37:19 -0700 Subject: [PATCH] Brick: Fix race in removing iSCSI device Some folks are seeing race conditions on overloaded or slow machines, where the time between issuing the remove volume command, and the volume being from /dev/disk/by-path by the kernel isn't long enough. This patch adds the Cinder retry decorator mechanism. This patch uses the new retry decorator to try 3 times to wait for the volume to disappear from the system, before moving on. Change-Id: I24cc18d188b8eeb3b7a1392b2bd8b520dc2a0e03 Closes-Bug: 1437441 --- os_brick/exception.py | 4 + os_brick/initiator/connector.py | 7 ++ os_brick/initiator/linuxscsi.py | 15 +++ os_brick/tests/initiator/test_connector.py | 102 ++++++++++++------- os_brick/tests/initiator/test_linuxscsi.py | 17 ++++ os_brick/tests/test_utils.py | 111 +++++++++++++++++++++ os_brick/utils.py | 58 +++++++++++ requirements.txt | 1 + 8 files changed, 279 insertions(+), 36 deletions(-) create mode 100644 os_brick/tests/test_utils.py create mode 100644 os_brick/utils.py diff --git a/os_brick/exception.py b/os_brick/exception.py index 49e5821a2..8120de0e2 100644 --- a/os_brick/exception.py +++ b/os_brick/exception.py @@ -98,5 +98,9 @@ class VolumeDeviceNotFound(BrickException): message = _("Volume device not found at %(device)s.") +class VolumePathNotRemoved(BrickException): + message = _("Volume path %(volume_path)s was not removed in time.") + + class ProtocolNotSupported(BrickException): message = _("Connect to volume via protocol %(protocol)s not supported.") diff --git a/os_brick/initiator/connector.py b/os_brick/initiator/connector.py index b584e9f98..4a5f46e27 100644 --- a/os_brick/initiator/connector.py +++ b/os_brick/initiator/connector.py @@ -412,6 +412,13 @@ class ISCSIConnector(InitiatorConnector): if dev_name: self._linuxscsi.remove_scsi_device(dev_name) + # NOTE(jdg): On busy systems we can have a race here + # where remove_iscsi_device is called before the device file + # has actually been removed. The result is an orphaned + # iscsi session that never gets logged out. The following + # call to wait addresses that issue. + self._linuxscsi.wait_for_volume_removal(host_device) + # NOTE(vish): Only disconnect from the target if no luns from the # target are in use. device_prefix = ("/dev/disk/by-path/ip-%(portal)s-iscsi-%(iqn)s-lun-" % diff --git a/os_brick/initiator/linuxscsi.py b/os_brick/initiator/linuxscsi.py index a529a6cdd..838dc1d73 100644 --- a/os_brick/initiator/linuxscsi.py +++ b/os_brick/initiator/linuxscsi.py @@ -22,8 +22,10 @@ import re from oslo_concurrency import processutils as putils from oslo_log import log as logging +from os_brick import exception from os_brick import executor from os_brick.i18n import _LW +from os_brick import utils LOG = logging.getLogger(__name__) @@ -67,6 +69,19 @@ class LinuxSCSI(executor.Executor): {'device': device, 'path': path}) self.echo_scsi_command(path, "1") + @utils.retry(exceptions=exception.VolumePathNotRemoved, retries=3, + backoff_rate=1) + def wait_for_volume_removal(self, volume_path): + """This is used to ensure that volumes are gone.""" + LOG.debug("Checking to see if SCSI volume %s has been removed.", + volume_path) + if os.path.exists(volume_path): + LOG.debug("%(path)s still exists.", {'path': volume_path}) + raise exception.VolumePathNotRemoved( + volume_path=volume_path) + else: + LOG.debug("SCSI volume %s has been removed.", volume_path) + def get_device_info(self, device): (out, _err) = self._execute('sg_scan', device, run_as_root=True, root_helper=self._root_helper) diff --git a/os_brick/tests/initiator/test_connector.py b/os_brick/tests/initiator/test_connector.py index b93107eb9..3039c942d 100644 --- a/os_brick/tests/initiator/test_connector.py +++ b/os_brick/tests/initiator/test_connector.py @@ -234,9 +234,13 @@ class ISCSIConnectorTestCase(ConnectorTestCase): initiator = self.connector.get_initiator() self.assertEqual(initiator, 'iqn.1234-56.foo.bar:01:23456789abc') - @mock.patch.object(os.path, 'exists', return_value=True) def _test_connect_volume(self, extra_props, additional_commands, - exists_mock): + disconnect_mock=None): + # for making sure the /dev/disk/by-path is gone + exists_mock = mock.Mock() + exists_mock.return_value = True + os.path.exists = exists_mock + location = '10.0.2.15:3260' name = 'volume-00000001' iqn = 'iqn.2010-10.org.openstack:%s' % name @@ -249,29 +253,40 @@ class ISCSIConnectorTestCase(ConnectorTestCase): self.assertEqual(device['type'], 'block') self.assertEqual(device['path'], dev_str) - self.connector.disconnect_volume(connection_info['data'], device) - expected_commands = [('iscsiadm -m node -T %s -p %s' % - (iqn, location)), - ('iscsiadm -m session'), - ('iscsiadm -m node -T %s -p %s --login' % - (iqn, location)), - ('iscsiadm -m node -T %s -p %s --op update' - ' -n node.startup -v automatic' - % (iqn, location)), - ('iscsiadm -m node --rescan'), - ('iscsiadm -m session --rescan'), - ('blockdev --flushbufs /dev/sdb'), - ('tee -a /sys/block/sdb/device/delete'), - ('iscsiadm -m node -T %s -p %s --op update' - ' -n node.startup -v manual' % (iqn, location)), - ('iscsiadm -m node -T %s -p %s --logout' % - (iqn, location)), - ('iscsiadm -m node -T %s -p %s --op delete' % - (iqn, location)), ] + additional_commands - LOG.debug("self.cmds = %s" % self.cmds) - LOG.debug("expected = %s" % expected_commands) + self.count = 0 - self.assertEqual(expected_commands, self.cmds) + def mock_exists_effect(*args, **kwargs): + self.count = self.count + 1 + if self.count == 4: + return False + else: + return True + + if disconnect_mock is None: + disconnect_mock = mock_exists_effect + + with mock.patch.object(os.path, 'exists', + side_effect=disconnect_mock): + self.connector.disconnect_volume(connection_info['data'], device) + expected_commands = [ + ('iscsiadm -m node -T %s -p %s' % (iqn, location)), + ('iscsiadm -m session'), + ('iscsiadm -m node -T %s -p %s --login' % (iqn, location)), + ('iscsiadm -m node -T %s -p %s --op update' + ' -n node.startup -v automatic' % (iqn, location)), + ('iscsiadm -m node --rescan'), + ('iscsiadm -m session --rescan'), + ('blockdev --flushbufs /dev/sdb'), + ('tee -a /sys/block/sdb/device/delete'), + ('iscsiadm -m node -T %s -p %s --op update' + ' -n node.startup -v manual' % (iqn, location)), + ('iscsiadm -m node -T %s -p %s --logout' % (iqn, location)), + ('iscsiadm -m node -T %s -p %s --op delete' % + (iqn, location)), ] + additional_commands + LOG.debug("self.cmds = %s", self.cmds) + LOG.debug("expected = %s", expected_commands) + + self.assertEqual(expected_commands, self.cmds) @testtools.skipUnless(os.path.exists('/dev/disk/by-path'), 'Test requires /dev/disk/by-path') @@ -297,7 +312,20 @@ class ISCSIConnectorTestCase(ConnectorTestCase): (iqn2, location2)), ('iscsiadm -m node -T %s -p %s --op delete' % (iqn2, location2))] - self._test_connect_volume(extra_props, additional_commands) + + def mock_exists_effect(*args, **kwargs): + self.count = self.count + 1 + # we have 2 targets in this test, so we need + # to make sure we remove and detect removal + # for both. + if (self.count == 4 or + self.count == 8): + return False + else: + return True + + self._test_connect_volume(extra_props, additional_commands, + disconnect_mock=mock_exists_effect) @testtools.skipUnless(os.path.exists('/dev/disk/by-path'), 'Test requires /dev/disk/by-path') @@ -338,17 +366,19 @@ class ISCSIConnectorTestCase(ConnectorTestCase): check_exit_code=[0, 255]) mock_iscsiadm.reset_mock() - self.connector.disconnect_volume(connection_info['data'], device) - props = connection_info['data'].copy() - for key in ('target_portals', 'target_iqns', 'target_luns'): - props.pop(key, None) - mock_iscsiadm.assert_any_call(props, ('--logout',), - check_exit_code=[0, 21, 255]) - props['target_portal'] = location2 - props['target_iqn'] = iqn2 - props['target_lun'] = 2 - mock_iscsiadm.assert_any_call(props, ('--logout',), - check_exit_code=[0, 21, 255]) + with mock.patch.object(os.path, 'exists', + return_value=False): + self.connector.disconnect_volume(connection_info['data'], device) + props = connection_info['data'].copy() + for key in ('target_portals', 'target_iqns', 'target_luns'): + props.pop(key, None) + mock_iscsiadm.assert_any_call(props, ('--logout',), + check_exit_code=[0, 21, 255]) + props['target_portal'] = location2 + props['target_iqn'] = iqn2 + props['target_lun'] = 2 + mock_iscsiadm.assert_any_call(props, ('--logout',), + check_exit_code=[0, 21, 255]) @mock.patch.object(connector.ISCSIConnector, '_run_iscsiadm_bare') @mock.patch.object(connector.ISCSIConnector, diff --git a/os_brick/tests/initiator/test_linuxscsi.py b/os_brick/tests/initiator/test_linuxscsi.py index cfc9fd05e..0d19e06eb 100644 --- a/os_brick/tests/initiator/test_linuxscsi.py +++ b/os_brick/tests/initiator/test_linuxscsi.py @@ -19,6 +19,7 @@ import string import mock from oslo_log import log as logging +from os_brick import exception from os_brick.initiator import linuxscsi from os_brick.tests import base @@ -67,6 +68,22 @@ class LinuxSCSITestCase(base.TestCase): ('tee -a /sys/block/sdc/device/delete')] self.assertEqual(expected_commands, self.cmds) + def test_wait_for_volume_removal(self): + fake_path = '/dev/disk/by-path/fake-iscsi-iqn-lun-0' + exists_mock = mock.Mock() + exists_mock.return_value = True + os.path.exists = exists_mock + self.assertRaises(exception.VolumePathNotRemoved, + self.linuxscsi.wait_for_volume_removal, + fake_path) + + exists_mock = mock.Mock() + exists_mock.return_value = False + os.path.exists = exists_mock + self.linuxscsi.wait_for_volume_removal(fake_path) + expected_commands = [] + self.assertEqual(expected_commands, self.cmds) + def test_flush_multipath_device(self): self.linuxscsi.flush_multipath_device('/dev/dm-9') expected_commands = [('multipath -f /dev/dm-9')] diff --git a/os_brick/tests/test_utils.py b/os_brick/tests/test_utils.py new file mode 100644 index 000000000..01229f33c --- /dev/null +++ b/os_brick/tests/test_utils.py @@ -0,0 +1,111 @@ +# (c) Copyright 2015 Hewlett-Packard Development Company, L.P. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import time + +import mock +from oslo_log import log as logging + +from os_brick import exception +from os_brick.tests import base +from os_brick import utils + + +LOG = logging.getLogger(__name__) + + +class WrongException(exception.BrickException): + pass + + +class TestRetryDecorator(base.TestCase): + + def test_no_retry_required(self): + self.counter = 0 + + with mock.patch.object(time, 'sleep') as mock_sleep: + @utils.retry(exceptions=exception.VolumeDeviceNotFound, + interval=2, + retries=3, + backoff_rate=2) + def succeeds(): + self.counter += 1 + return 'success' + + ret = succeeds() + self.assertFalse(mock_sleep.called) + self.assertEqual(ret, 'success') + self.assertEqual(self.counter, 1) + + def test_retries_once(self): + self.counter = 0 + interval = 2 + backoff_rate = 2 + retries = 3 + + with mock.patch.object(time, 'sleep') as mock_sleep: + @utils.retry(exception.VolumeDeviceNotFound, + interval, + retries, + backoff_rate) + def fails_once(): + self.counter += 1 + if self.counter < 2: + raise exception.VolumeDeviceNotFound(device='fake') + else: + return 'success' + + ret = fails_once() + self.assertEqual(ret, 'success') + self.assertEqual(self.counter, 2) + self.assertEqual(mock_sleep.call_count, 1) + mock_sleep.assert_called_with(interval * backoff_rate) + + def test_limit_is_reached(self): + self.counter = 0 + retries = 3 + interval = 2 + backoff_rate = 4 + + with mock.patch.object(time, 'sleep') as mock_sleep: + @utils.retry(exception.VolumeDeviceNotFound, + interval, + retries, + backoff_rate) + def always_fails(): + self.counter += 1 + raise exception.VolumeDeviceNotFound(device='fake') + + self.assertRaises(exception.VolumeDeviceNotFound, + always_fails) + self.assertEqual(retries, self.counter) + + expected_sleep_arg = [] + + for i in xrange(retries): + if i > 0: + interval *= backoff_rate + expected_sleep_arg.append(float(interval)) + + mock_sleep.assert_has_calls(map(mock.call, expected_sleep_arg)) + + def test_wrong_exception_no_retry(self): + + with mock.patch.object(time, 'sleep') as mock_sleep: + @utils.retry(exceptions=exception.VolumeDeviceNotFound) + def raise_unexpected_error(): + raise WrongException("wrong exception") + + self.assertRaises(WrongException, raise_unexpected_error) + self.assertFalse(mock_sleep.called) diff --git a/os_brick/utils.py b/os_brick/utils.py new file mode 100644 index 000000000..5bd7f612b --- /dev/null +++ b/os_brick/utils.py @@ -0,0 +1,58 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +"""Utilities and helper functions.""" + +from oslo_log import log as logging +import retrying +import six + +from os_brick.i18n import _ + + +LOG = logging.getLogger(__name__) + + +def retry(exceptions, interval=1, retries=3, backoff_rate=2): + + def _retry_on_exception(e): + return isinstance(e, exceptions) + + def _backoff_sleep(previous_attempt_number, delay_since_first_attempt_ms): + exp = backoff_rate ** previous_attempt_number + wait_for = max(0, interval * exp) + LOG.debug("Sleeping for %s seconds", wait_for) + return wait_for * 1000.0 + + def _print_stop(previous_attempt_number, delay_since_first_attempt_ms): + delay_since_first_attempt = delay_since_first_attempt_ms / 1000.0 + LOG.debug("Failed attempt %s", previous_attempt_number) + LOG.debug("Have been at this for %s seconds", + delay_since_first_attempt) + return previous_attempt_number == retries + + if retries < 1: + raise ValueError(_('Retries must be greater than or ' + 'equal to 1 (received: %s). ') % retries) + + def _decorator(f): + + @six.wraps(f) + def _wrapper(*args, **kwargs): + r = retrying.Retrying(retry_on_exception=_retry_on_exception, + wait_func=_backoff_sleep, + stop_func=_print_stop) + return r.call(f, *args, **kwargs) + + return _wrapper + + return _decorator diff --git a/requirements.txt b/requirements.txt index 783b51d17..267a4f548 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ oslo.log>=1.0.0,<1.1.0 oslo.serialization>=1.4.0,<1.5.0 oslo.i18n>=1.5.0,<1.6.0 oslo.utils>=1.4.0,<1.5.0 +retrying>=1.2.3,!=1.3.0 six>=1.9.0