From b75411de2b2aadd1eafd2f8f8b1579df357bf09f Mon Sep 17 00:00:00 2001 From: Gorka Eguileor Date: Thu, 27 Sep 2018 17:55:00 +0200 Subject: [PATCH] Succeed on iSCSI detach when path just went down If the iSCSI connection to a device goes down right after we flush it, or if one of the paths of a multipath device goes down right before we start disconnecting, the detach will fail even though it should succeed. We'll see a VolumePathNotRemoved exception listing volumes that had not disappeared. This happens because, under those circumstances, it may take up to 30 seconds for the SCSI device to be removed from /dev, but we expect it to disappear within 6 seconds (the first check happens immediately, then another in 2 seconds, and another in 4 seconds). Since the device will be removed if we wait a bit more, this patch makes it so that we wait for up to 30 seconds for the removal. To ensure we wait as little time as possible, we change the way we wait for the devices to be removed. Instead of checking, sleeping for 2 and then for 4 seconds, and then checking again, we just sleep 500ms between checks, and we do the DEBUG log every 5 seconds. 
Change-Id: If801dfc2462c0d3f986eebd4108087139934610d Closes-Bug: #1794829 (cherry-picked from b9c7bc2b597d944cbc404d6bf5fedc35d095a897) --- os_brick/initiator/linuxscsi.py | 27 +++++++++++++++------- os_brick/tests/initiator/test_linuxscsi.py | 2 +- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/os_brick/initiator/linuxscsi.py b/os_brick/initiator/linuxscsi.py index a79962eaa..7c14ba5fb 100644 --- a/os_brick/initiator/linuxscsi.py +++ b/os_brick/initiator/linuxscsi.py @@ -19,10 +19,11 @@ import glob import os import re -import six +import time from oslo_concurrency import processutils as putils from oslo_log import log as logging +import six from os_brick import exception from os_brick import executor @@ -76,18 +77,28 @@ class LinuxSCSI(executor.Executor): with exc.context(force, 'Removing %s failed', device): self.echo_scsi_command(path, "1") - @utils.retry(exceptions=exception.VolumePathNotRemoved) def wait_for_volumes_removal(self, volumes_names): """Wait for device paths to be removed from the system.""" str_names = ', '.join(volumes_names) LOG.debug('Checking to see if SCSI volumes %s have been removed.', str_names) - exist = [volume_name for volume_name in volumes_names - if os.path.exists('/dev/' + volume_name)] - if exist: - LOG.debug('%s still exist.', ', '.join(exist)) - raise exception.VolumePathNotRemoved(volume_path=exist) - LOG.debug("SCSI volumes %s have been removed.", str_names) + exist = ['/dev/' + volume_name for volume_name in volumes_names] + + # It can take up to 30 seconds to remove a SCSI device if the path + # failed right before we start detaching, which is unlikely, but we + # still shouldn't fail in that case. 
+ for i in range(61): + exist = [path for path in exist if os.path.exists(path)] + if not exist: + LOG.debug("SCSI volumes %s have been removed.", str_names) + return + # Don't sleep on the last try since we are quitting + if i < 60: + time.sleep(0.5) + # Log every 5 seconds + if i % 10 == 0: + LOG.debug('%s still exist.', ', '.join(exist)) + raise exception.VolumePathNotRemoved(volume_path=exist) def get_device_info(self, device): (out, _err) = self._execute('sg_scan', device, run_as_root=True, diff --git a/os_brick/tests/initiator/test_linuxscsi.py b/os_brick/tests/initiator/test_linuxscsi.py index 9f02a7239..21d4beda0 100644 --- a/os_brick/tests/initiator/test_linuxscsi.py +++ b/os_brick/tests/initiator/test_linuxscsi.py @@ -106,7 +106,7 @@ class LinuxSCSITestCase(base.TestCase): @mock.patch('time.sleep') @mock.patch('os.path.exists', return_value=True) def test_wait_for_volumes_removal_failure(self, exists_mock, sleep_mock): - retries = 3 + retries = 61 names = ('sda', 'sdb') self.assertRaises(exception.VolumePathNotRemoved, self.linuxscsi.wait_for_volumes_removal, names)