Brick: Fix race in removing iSCSI device
Some folks are seeing race conditions on overloaded or slow machines, where the time between issuing the remove volume command, and the volume being from /dev/disk/by-path by the kernel isn't long enough. This patch adds the Cinder retry decorator mechanism. This patch uses the new retry decorator to try 3 times to wait for the volume to disappear from the system, before moving on. Change-Id: I24cc18d188b8eeb3b7a1392b2bd8b520dc2a0e03 Closes-Bug: 1437441
This commit is contained in:
parent
7d341f1543
commit
f4ecdf0f60
@ -98,5 +98,9 @@ class VolumeDeviceNotFound(BrickException):
|
||||
message = _("Volume device not found at %(device)s.")
|
||||
|
||||
|
||||
class VolumePathNotRemoved(BrickException):
|
||||
message = _("Volume path %(volume_path)s was not removed in time.")
|
||||
|
||||
|
||||
class ProtocolNotSupported(BrickException):
|
||||
message = _("Connect to volume via protocol %(protocol)s not supported.")
|
||||
|
@ -412,6 +412,13 @@ class ISCSIConnector(InitiatorConnector):
|
||||
if dev_name:
|
||||
self._linuxscsi.remove_scsi_device(dev_name)
|
||||
|
||||
# NOTE(jdg): On busy systems we can have a race here
|
||||
# where remove_iscsi_device is called before the device file
|
||||
# has actually been removed. The result is an orphaned
|
||||
# iscsi session that never gets logged out. The following
|
||||
# call to wait addresses that issue.
|
||||
self._linuxscsi.wait_for_volume_removal(host_device)
|
||||
|
||||
# NOTE(vish): Only disconnect from the target if no luns from the
|
||||
# target are in use.
|
||||
device_prefix = ("/dev/disk/by-path/ip-%(portal)s-iscsi-%(iqn)s-lun-" %
|
||||
|
@ -22,8 +22,10 @@ import re
|
||||
from oslo_concurrency import processutils as putils
|
||||
from oslo_log import log as logging
|
||||
|
||||
from os_brick import exception
|
||||
from os_brick import executor
|
||||
from os_brick.i18n import _LW
|
||||
from os_brick import utils
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
@ -67,6 +69,19 @@ class LinuxSCSI(executor.Executor):
|
||||
{'device': device, 'path': path})
|
||||
self.echo_scsi_command(path, "1")
|
||||
|
||||
@utils.retry(exceptions=exception.VolumePathNotRemoved, retries=3,
|
||||
backoff_rate=1)
|
||||
def wait_for_volume_removal(self, volume_path):
|
||||
"""This is used to ensure that volumes are gone."""
|
||||
LOG.debug("Checking to see if SCSI volume %s has been removed.",
|
||||
volume_path)
|
||||
if os.path.exists(volume_path):
|
||||
LOG.debug("%(path)s still exists.", {'path': volume_path})
|
||||
raise exception.VolumePathNotRemoved(
|
||||
volume_path=volume_path)
|
||||
else:
|
||||
LOG.debug("SCSI volume %s has been removed.", volume_path)
|
||||
|
||||
def get_device_info(self, device):
|
||||
(out, _err) = self._execute('sg_scan', device, run_as_root=True,
|
||||
root_helper=self._root_helper)
|
||||
|
@ -234,9 +234,13 @@ class ISCSIConnectorTestCase(ConnectorTestCase):
|
||||
initiator = self.connector.get_initiator()
|
||||
self.assertEqual(initiator, 'iqn.1234-56.foo.bar:01:23456789abc')
|
||||
|
||||
@mock.patch.object(os.path, 'exists', return_value=True)
|
||||
def _test_connect_volume(self, extra_props, additional_commands,
|
||||
exists_mock):
|
||||
disconnect_mock=None):
|
||||
# for making sure the /dev/disk/by-path is gone
|
||||
exists_mock = mock.Mock()
|
||||
exists_mock.return_value = True
|
||||
os.path.exists = exists_mock
|
||||
|
||||
location = '10.0.2.15:3260'
|
||||
name = 'volume-00000001'
|
||||
iqn = 'iqn.2010-10.org.openstack:%s' % name
|
||||
@ -249,29 +253,40 @@ class ISCSIConnectorTestCase(ConnectorTestCase):
|
||||
self.assertEqual(device['type'], 'block')
|
||||
self.assertEqual(device['path'], dev_str)
|
||||
|
||||
self.connector.disconnect_volume(connection_info['data'], device)
|
||||
expected_commands = [('iscsiadm -m node -T %s -p %s' %
|
||||
(iqn, location)),
|
||||
('iscsiadm -m session'),
|
||||
('iscsiadm -m node -T %s -p %s --login' %
|
||||
(iqn, location)),
|
||||
('iscsiadm -m node -T %s -p %s --op update'
|
||||
' -n node.startup -v automatic'
|
||||
% (iqn, location)),
|
||||
('iscsiadm -m node --rescan'),
|
||||
('iscsiadm -m session --rescan'),
|
||||
('blockdev --flushbufs /dev/sdb'),
|
||||
('tee -a /sys/block/sdb/device/delete'),
|
||||
('iscsiadm -m node -T %s -p %s --op update'
|
||||
' -n node.startup -v manual' % (iqn, location)),
|
||||
('iscsiadm -m node -T %s -p %s --logout' %
|
||||
(iqn, location)),
|
||||
('iscsiadm -m node -T %s -p %s --op delete' %
|
||||
(iqn, location)), ] + additional_commands
|
||||
LOG.debug("self.cmds = %s" % self.cmds)
|
||||
LOG.debug("expected = %s" % expected_commands)
|
||||
self.count = 0
|
||||
|
||||
self.assertEqual(expected_commands, self.cmds)
|
||||
def mock_exists_effect(*args, **kwargs):
|
||||
self.count = self.count + 1
|
||||
if self.count == 4:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
if disconnect_mock is None:
|
||||
disconnect_mock = mock_exists_effect
|
||||
|
||||
with mock.patch.object(os.path, 'exists',
|
||||
side_effect=disconnect_mock):
|
||||
self.connector.disconnect_volume(connection_info['data'], device)
|
||||
expected_commands = [
|
||||
('iscsiadm -m node -T %s -p %s' % (iqn, location)),
|
||||
('iscsiadm -m session'),
|
||||
('iscsiadm -m node -T %s -p %s --login' % (iqn, location)),
|
||||
('iscsiadm -m node -T %s -p %s --op update'
|
||||
' -n node.startup -v automatic' % (iqn, location)),
|
||||
('iscsiadm -m node --rescan'),
|
||||
('iscsiadm -m session --rescan'),
|
||||
('blockdev --flushbufs /dev/sdb'),
|
||||
('tee -a /sys/block/sdb/device/delete'),
|
||||
('iscsiadm -m node -T %s -p %s --op update'
|
||||
' -n node.startup -v manual' % (iqn, location)),
|
||||
('iscsiadm -m node -T %s -p %s --logout' % (iqn, location)),
|
||||
('iscsiadm -m node -T %s -p %s --op delete' %
|
||||
(iqn, location)), ] + additional_commands
|
||||
LOG.debug("self.cmds = %s", self.cmds)
|
||||
LOG.debug("expected = %s", expected_commands)
|
||||
|
||||
self.assertEqual(expected_commands, self.cmds)
|
||||
|
||||
@testtools.skipUnless(os.path.exists('/dev/disk/by-path'),
|
||||
'Test requires /dev/disk/by-path')
|
||||
@ -297,7 +312,20 @@ class ISCSIConnectorTestCase(ConnectorTestCase):
|
||||
(iqn2, location2)),
|
||||
('iscsiadm -m node -T %s -p %s --op delete' %
|
||||
(iqn2, location2))]
|
||||
self._test_connect_volume(extra_props, additional_commands)
|
||||
|
||||
def mock_exists_effect(*args, **kwargs):
|
||||
self.count = self.count + 1
|
||||
# we have 2 targets in this test, so we need
|
||||
# to make sure we remove and detect removal
|
||||
# for both.
|
||||
if (self.count == 4 or
|
||||
self.count == 8):
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
self._test_connect_volume(extra_props, additional_commands,
|
||||
disconnect_mock=mock_exists_effect)
|
||||
|
||||
@testtools.skipUnless(os.path.exists('/dev/disk/by-path'),
|
||||
'Test requires /dev/disk/by-path')
|
||||
@ -338,17 +366,19 @@ class ISCSIConnectorTestCase(ConnectorTestCase):
|
||||
check_exit_code=[0, 255])
|
||||
|
||||
mock_iscsiadm.reset_mock()
|
||||
self.connector.disconnect_volume(connection_info['data'], device)
|
||||
props = connection_info['data'].copy()
|
||||
for key in ('target_portals', 'target_iqns', 'target_luns'):
|
||||
props.pop(key, None)
|
||||
mock_iscsiadm.assert_any_call(props, ('--logout',),
|
||||
check_exit_code=[0, 21, 255])
|
||||
props['target_portal'] = location2
|
||||
props['target_iqn'] = iqn2
|
||||
props['target_lun'] = 2
|
||||
mock_iscsiadm.assert_any_call(props, ('--logout',),
|
||||
check_exit_code=[0, 21, 255])
|
||||
with mock.patch.object(os.path, 'exists',
|
||||
return_value=False):
|
||||
self.connector.disconnect_volume(connection_info['data'], device)
|
||||
props = connection_info['data'].copy()
|
||||
for key in ('target_portals', 'target_iqns', 'target_luns'):
|
||||
props.pop(key, None)
|
||||
mock_iscsiadm.assert_any_call(props, ('--logout',),
|
||||
check_exit_code=[0, 21, 255])
|
||||
props['target_portal'] = location2
|
||||
props['target_iqn'] = iqn2
|
||||
props['target_lun'] = 2
|
||||
mock_iscsiadm.assert_any_call(props, ('--logout',),
|
||||
check_exit_code=[0, 21, 255])
|
||||
|
||||
@mock.patch.object(connector.ISCSIConnector, '_run_iscsiadm_bare')
|
||||
@mock.patch.object(connector.ISCSIConnector,
|
||||
|
@ -19,6 +19,7 @@ import string
|
||||
import mock
|
||||
from oslo_log import log as logging
|
||||
|
||||
from os_brick import exception
|
||||
from os_brick.initiator import linuxscsi
|
||||
from os_brick.tests import base
|
||||
|
||||
@ -67,6 +68,22 @@ class LinuxSCSITestCase(base.TestCase):
|
||||
('tee -a /sys/block/sdc/device/delete')]
|
||||
self.assertEqual(expected_commands, self.cmds)
|
||||
|
||||
def test_wait_for_volume_removal(self):
|
||||
fake_path = '/dev/disk/by-path/fake-iscsi-iqn-lun-0'
|
||||
exists_mock = mock.Mock()
|
||||
exists_mock.return_value = True
|
||||
os.path.exists = exists_mock
|
||||
self.assertRaises(exception.VolumePathNotRemoved,
|
||||
self.linuxscsi.wait_for_volume_removal,
|
||||
fake_path)
|
||||
|
||||
exists_mock = mock.Mock()
|
||||
exists_mock.return_value = False
|
||||
os.path.exists = exists_mock
|
||||
self.linuxscsi.wait_for_volume_removal(fake_path)
|
||||
expected_commands = []
|
||||
self.assertEqual(expected_commands, self.cmds)
|
||||
|
||||
def test_flush_multipath_device(self):
|
||||
self.linuxscsi.flush_multipath_device('/dev/dm-9')
|
||||
expected_commands = [('multipath -f /dev/dm-9')]
|
||||
|
111
os_brick/tests/test_utils.py
Normal file
111
os_brick/tests/test_utils.py
Normal file
@ -0,0 +1,111 @@
|
||||
# (c) Copyright 2015 Hewlett-Packard Development Company, L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import time
|
||||
|
||||
import mock
|
||||
from oslo_log import log as logging
|
||||
|
||||
from os_brick import exception
|
||||
from os_brick.tests import base
|
||||
from os_brick import utils
|
||||
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WrongException(exception.BrickException):
|
||||
pass
|
||||
|
||||
|
||||
class TestRetryDecorator(base.TestCase):
|
||||
|
||||
def test_no_retry_required(self):
|
||||
self.counter = 0
|
||||
|
||||
with mock.patch.object(time, 'sleep') as mock_sleep:
|
||||
@utils.retry(exceptions=exception.VolumeDeviceNotFound,
|
||||
interval=2,
|
||||
retries=3,
|
||||
backoff_rate=2)
|
||||
def succeeds():
|
||||
self.counter += 1
|
||||
return 'success'
|
||||
|
||||
ret = succeeds()
|
||||
self.assertFalse(mock_sleep.called)
|
||||
self.assertEqual(ret, 'success')
|
||||
self.assertEqual(self.counter, 1)
|
||||
|
||||
def test_retries_once(self):
|
||||
self.counter = 0
|
||||
interval = 2
|
||||
backoff_rate = 2
|
||||
retries = 3
|
||||
|
||||
with mock.patch.object(time, 'sleep') as mock_sleep:
|
||||
@utils.retry(exception.VolumeDeviceNotFound,
|
||||
interval,
|
||||
retries,
|
||||
backoff_rate)
|
||||
def fails_once():
|
||||
self.counter += 1
|
||||
if self.counter < 2:
|
||||
raise exception.VolumeDeviceNotFound(device='fake')
|
||||
else:
|
||||
return 'success'
|
||||
|
||||
ret = fails_once()
|
||||
self.assertEqual(ret, 'success')
|
||||
self.assertEqual(self.counter, 2)
|
||||
self.assertEqual(mock_sleep.call_count, 1)
|
||||
mock_sleep.assert_called_with(interval * backoff_rate)
|
||||
|
||||
def test_limit_is_reached(self):
|
||||
self.counter = 0
|
||||
retries = 3
|
||||
interval = 2
|
||||
backoff_rate = 4
|
||||
|
||||
with mock.patch.object(time, 'sleep') as mock_sleep:
|
||||
@utils.retry(exception.VolumeDeviceNotFound,
|
||||
interval,
|
||||
retries,
|
||||
backoff_rate)
|
||||
def always_fails():
|
||||
self.counter += 1
|
||||
raise exception.VolumeDeviceNotFound(device='fake')
|
||||
|
||||
self.assertRaises(exception.VolumeDeviceNotFound,
|
||||
always_fails)
|
||||
self.assertEqual(retries, self.counter)
|
||||
|
||||
expected_sleep_arg = []
|
||||
|
||||
for i in xrange(retries):
|
||||
if i > 0:
|
||||
interval *= backoff_rate
|
||||
expected_sleep_arg.append(float(interval))
|
||||
|
||||
mock_sleep.assert_has_calls(map(mock.call, expected_sleep_arg))
|
||||
|
||||
def test_wrong_exception_no_retry(self):
|
||||
|
||||
with mock.patch.object(time, 'sleep') as mock_sleep:
|
||||
@utils.retry(exceptions=exception.VolumeDeviceNotFound)
|
||||
def raise_unexpected_error():
|
||||
raise WrongException("wrong exception")
|
||||
|
||||
self.assertRaises(WrongException, raise_unexpected_error)
|
||||
self.assertFalse(mock_sleep.called)
|
58
os_brick/utils.py
Normal file
58
os_brick/utils.py
Normal file
@ -0,0 +1,58 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
"""Utilities and helper functions."""
|
||||
|
||||
from oslo_log import log as logging
|
||||
import retrying
|
||||
import six
|
||||
|
||||
from os_brick.i18n import _
|
||||
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def retry(exceptions, interval=1, retries=3, backoff_rate=2):
|
||||
|
||||
def _retry_on_exception(e):
|
||||
return isinstance(e, exceptions)
|
||||
|
||||
def _backoff_sleep(previous_attempt_number, delay_since_first_attempt_ms):
|
||||
exp = backoff_rate ** previous_attempt_number
|
||||
wait_for = max(0, interval * exp)
|
||||
LOG.debug("Sleeping for %s seconds", wait_for)
|
||||
return wait_for * 1000.0
|
||||
|
||||
def _print_stop(previous_attempt_number, delay_since_first_attempt_ms):
|
||||
delay_since_first_attempt = delay_since_first_attempt_ms / 1000.0
|
||||
LOG.debug("Failed attempt %s", previous_attempt_number)
|
||||
LOG.debug("Have been at this for %s seconds",
|
||||
delay_since_first_attempt)
|
||||
return previous_attempt_number == retries
|
||||
|
||||
if retries < 1:
|
||||
raise ValueError(_('Retries must be greater than or '
|
||||
'equal to 1 (received: %s). ') % retries)
|
||||
|
||||
def _decorator(f):
|
||||
|
||||
@six.wraps(f)
|
||||
def _wrapper(*args, **kwargs):
|
||||
r = retrying.Retrying(retry_on_exception=_retry_on_exception,
|
||||
wait_func=_backoff_sleep,
|
||||
stop_func=_print_stop)
|
||||
return r.call(f, *args, **kwargs)
|
||||
|
||||
return _wrapper
|
||||
|
||||
return _decorator
|
@ -10,4 +10,5 @@ oslo.log>=1.0.0,<1.1.0
|
||||
oslo.serialization>=1.4.0,<1.5.0
|
||||
oslo.i18n>=1.5.0,<1.6.0
|
||||
oslo.utils>=1.4.0,<1.5.0
|
||||
retrying>=1.2.3,!=1.3.0
|
||||
six>=1.9.0
|
||||
|
Loading…
Reference in New Issue
Block a user