Succeed on iSCSI detach when path just went down

If the iSCSI connection to a device goes down right after we flush it,
or if one of the paths of a multipath device goes down right before we
start disconnecting, the detach will fail even though it should succeed.

We'll see a VolumePathNotRemoved exception listing volumes that had not
disappeared.

This happens because, under those circumstances, it may take up to 30
seconds for the SCSI device to be removed from /dev, but we expect it to
disappear within 6 seconds (the first check happens immediately, then
another 2 seconds later, and a final one 4 seconds after that).

Since the device will be removed if we wait a bit more, this patch makes
it so that we wait for up to 30 seconds for the removal.

To ensure we wait as little time as possible, we change the way we wait
for the devices to be removed.  Instead of checking, sleeping for 2
seconds, checking again, sleeping for 4 seconds, and checking one last
time, we now just sleep 500ms between checks and only emit the DEBUG
log every 5 seconds.

Change-Id: If801dfc2462c0d3f986eebd4108087139934610d
Closes-Bug: #1794829
(cherry-picked from commit b9c7bc2b59)
(cherry picked from commit b75411de2b)
This commit is contained in:
Gorka Eguileor 2018-09-27 17:55:00 +02:00 committed by Elise Gafford
parent d4a600748b
commit 9722aa7db8
2 changed files with 20 additions and 9 deletions

View File

@ -19,10 +19,11 @@
import glob
import os
import re
import six
import time
from oslo_concurrency import processutils as putils
from oslo_log import log as logging
import six
from os_brick import exception
from os_brick import executor
@ -76,18 +77,28 @@ class LinuxSCSI(executor.Executor):
with exc.context(force, 'Removing %s failed', device):
self.echo_scsi_command(path, "1")
@utils.retry(exceptions=exception.VolumePathNotRemoved)
def wait_for_volumes_removal(self, volumes_names):
"""Wait for device paths to be removed from the system."""
str_names = ', '.join(volumes_names)
LOG.debug('Checking to see if SCSI volumes %s have been removed.',
str_names)
exist = [volume_name for volume_name in volumes_names
if os.path.exists('/dev/' + volume_name)]
if exist:
LOG.debug('%s still exist.', ', '.join(exist))
raise exception.VolumePathNotRemoved(volume_path=exist)
LOG.debug("SCSI volumes %s have been removed.", str_names)
exist = ['/dev/' + volume_name for volume_name in volumes_names]
# It can take up to 30 seconds to remove a SCSI device if the path
# failed right before we start detaching, which is unlikely, but we
# still shouldn't fail in that case.
for i in range(61):
exist = [path for path in exist if os.path.exists(path)]
if not exist:
LOG.debug("SCSI volumes %s have been removed.", str_names)
return
# Don't sleep on the last try since we are quitting
if i < 60:
time.sleep(0.5)
# Log every 5 seconds
if i % 10 == 0:
LOG.debug('%s still exist.', ', '.join(exist))
raise exception.VolumePathNotRemoved(volume_path=exist)
def get_device_info(self, device):
(out, _err) = self._execute('sg_scan', device, run_as_root=True,

View File

@ -105,7 +105,7 @@ class LinuxSCSITestCase(base.TestCase):
@mock.patch('time.sleep')
@mock.patch('os.path.exists', return_value=True)
def test_wait_for_volumes_removal_failure(self, exists_mock, sleep_mock):
retries = 3
retries = 61
names = ('sda', 'sdb')
self.assertRaises(exception.VolumePathNotRemoved,
self.linuxscsi.wait_for_volumes_removal, names)