Add/remove disk retry/timeout

Recently-added methods discover_vscsi_disk and remove_block_dev in
nova_powervm.virt.powervm.mgmt performed SCSI bus scans followed by
checks for the existence of device special files.  Turns out the bus
scan is asynchronous, meaning there may be a short delay between the
return of the scan command and the (dis)appearance of the device special
file.  This change set builds a retry and timeout into those methods to
accomodate.

Change-Id: I888c74b17f1c9819e94cb6dad889d32f16f6bbcb
This commit is contained in:
Eric Fried 2015-06-22 10:51:04 -05:00
parent cd972fca7d
commit e0af19bc6f
3 changed files with 78 additions and 20 deletions

View File

@ -67,9 +67,11 @@ class TestMgmt(test.TestCase):
process_input='- - -', run_as_root=True)
mock_realpath.assert_called_with(devlink)
@mock.patch('time.sleep')
@mock.patch('glob.glob')
@mock.patch('nova.utils.execute')
def test_discover_vscsi_disk_not_one_result(self, mock_exec, mock_glob):
def test_discover_vscsi_disk_not_one_result(self, mock_exec, mock_glob,
mock_sleep):
"""Zero or more than one disk is found by discover_vscsi_disk."""
udid = ('275b5d5f88fa5611e48be9000098be9400'
'13fb2aa55a2d7b8d150cb1b7b6bc04d6')
@ -77,18 +79,23 @@ class TestMgmt(test.TestCase):
mapping.client_adapter.slot_number = 5
mapping.backing_storage.udid = udid
# No disks found
mock_glob.side_effect = [['path'], []]
self.assertRaises(npvmex.UniqueDiskDiscoveryException,
mock_glob.side_effect = lambda path: []
self.assertRaises(npvmex.NoDiskDiscoveryException,
mgmt.discover_vscsi_disk, mapping)
self.assertTrue(mock_sleep.call_count)
# Multiple disks found
mock_sleep.reset_mock()
mock_glob.side_effect = [['path'], ['/dev/sde', '/dev/sdf']]
self.assertRaises(npvmex.UniqueDiskDiscoveryException,
mgmt.discover_vscsi_disk, mapping)
self.assertEqual(0, mock_sleep.call_count)
@mock.patch('time.sleep')
@mock.patch('os.path.realpath')
@mock.patch('os.stat')
@mock.patch('nova.utils.execute')
def test_remove_block_dev(self, mock_exec, mock_stat, mock_realpath):
def test_remove_block_dev(self, mock_exec, mock_stat, mock_realpath,
mock_sleep):
link = '/dev/link/foo'
realpath = '/dev/sde'
delpath = '/sys/block/sde/device/delete'
@ -102,6 +109,7 @@ class TestMgmt(test.TestCase):
mock.call(realpath)])
mock_exec.assert_called_with('tee', '-a', delpath, process_input='1',
run_as_root=True)
self.assertEqual(0, mock_sleep.call_count)
# Device param not found
mock_exec.reset_mock()
@ -126,9 +134,12 @@ class TestMgmt(test.TestCase):
# Deletion was attempted, but device is still there
mock_exec.reset_mock()
mock_stat.reset_mock()
mock_stat.side_effect = (None, None, None)
mock_sleep.reset_mock()
mock_stat.side_effect = lambda path: 1
self.assertRaises(
npvmex.DeviceDeletionException, mgmt.remove_block_dev, link)
# stat was called thrice; exec was called once
self.assertEqual(3, mock_stat.call_count)
# stat was called many times; exec was called once
self.assertTrue(mock_stat.call_count > 4)
self.assertEqual(1, mock_exec.call_count)
# sleep was called many times
self.assertTrue(mock_sleep.call_count)

View File

@ -45,8 +45,15 @@ class ManagementPartitionNotFoundException(nex.NovaException):
"%(count)d.")
class NoDiskDiscoveryException(nex.NovaException):
"""Failed to discover any disk."""
msg_fmt = _("Having scanned SCSI bus %(bus)x on the management partition, "
"disk with UDID %(udid)s failed to appear after %(polls)d "
"polls over %(timeout)d seconds.")
class UniqueDiskDiscoveryException(nex.NovaException):
"""Expected to discover exactly one disk, but discovered 0 or >1."""
"""Expected to discover exactly one disk, but discovered >1."""
msg_fmt = _("Expected to find exactly one disk on the management "
"partition at %(path_pattern)s; found %(count)d.")
@ -54,7 +61,8 @@ class UniqueDiskDiscoveryException(nex.NovaException):
class DeviceDeletionException(nex.NovaException):
"""Expected to delete a disk, but the disk is still present afterward."""
msg_fmt = _("Device %(devpath)s is still present on the management "
"partition after attempting to delete it.")
"partition after attempting to delete it. Polled %(polls)d "
"times over %(timeout)d seconds.")
class InstanceDiskMappingFailed(AbstractDiskException):

View File

@ -28,13 +28,13 @@ from nova.i18n import _LI
from nova.storage import linuxscsi
import os
from os import path
from oslo_log import log as logging
from pypowervm.wrappers import logical_partition as pvm_lpar
from nova_powervm.virt.powervm import exception as npvmex
from oslo_log import log as logging
import retrying
LOG = logging.getLogger(__name__)
@ -49,7 +49,7 @@ def get_mgmt_partition(adapter):
return wraps[0]
def discover_vscsi_disk(mapping):
def discover_vscsi_disk(mapping, scan_timeout=10):
"""Bring a mapped device into the management partition and find its name.
Based on a VSCSIMapping, scan the appropriate virtual SCSI host bus,
@ -64,7 +64,13 @@ def discover_vscsi_disk(mapping):
:param mapping: The pypowervm.wrappers.virtual_io_server.VSCSIMapping
representing the mapping of the desired disk to the
management partition.
:param scan_timeout: The maximum number of seconds after scanning to wait
for the specified device to appear.
:return: The udev-generated ("/dev/sdX") name of the discovered disk.
:raise NoDiskDiscoveryException: If the disk did not appear after the
specified timeout.
:raise UniqueDiskDiscoveryException: If more than one disk appears with the
expected UDID.
"""
# TODO(IBM): Support for other host platforms.
@ -87,7 +93,22 @@ def discover_vscsi_disk(mapping):
# Now see if our device showed up. If so, we can reliably match it based
# on its Linux ID, which ends with the disk's UDID.
dpathpat = '/dev/disk/by-id/*%s' % udid
disks = glob.glob(dpathpat)
# The bus scan is asynchronous. Need to poll, waiting for the device to
# spring into existence. Stop when glob finds at least one device, or
# after the specified timeout. Sleep 1/4 second between polls.
@retrying.retry(retry_on_result=lambda result: not result, wait_fixed=250,
stop_max_delay=scan_timeout * 1000)
def _poll_for_dev(globpat):
return glob.glob(globpat)
try:
disks = _poll_for_dev(dpathpat)
except retrying.RetryError as re:
raise npvmex.NoDiskDiscoveryException(
bus=lslot, udid=udid, polls=re.last_attempt.attempt_number,
timeout=scan_timeout)
# If we get here, _poll_for_dev returned a nonempty list. If not exactly
# one entry, this is an error.
if len(disks) != 1:
raise npvmex.UniqueDiskDiscoveryException(path_pattern=dpathpat,
count=len(disks))
@ -100,7 +121,7 @@ def discover_vscsi_disk(mapping):
return dpath
def remove_block_dev(devpath):
def remove_block_dev(devpath, scan_timeout=10):
"""Remove a block device from the management partition.
This method causes the operating system of the management partition to
@ -108,6 +129,8 @@ def remove_block_dev(devpath):
:param devpath: Any path to the block special file associated with the
device to be removed.
:param scan_timeout: The maximum number of seconds after scanning to wait
for the specified device to disappear.
:raise InvalidDevicePath: If the specified device or its 'delete' special
file cannot be found.
:raise DeviceDeletionException: If the deletion was attempted, but the
@ -130,9 +153,25 @@ def remove_block_dev(devpath):
"partition via special file %(delpath)s.",
{'devpath': devpath, 'delpath': delpath})
linuxscsi.echo_scsi_command(delpath, '1')
# The bus scan is asynchronous. Need to poll, waiting for the device to
# disappear. Stop when stat raises OSError (dev file not found) - which is
# success - or after the specified timeout (which is failure). Sleep 1/4
# second between polls.
@retrying.retry(retry_on_result=lambda result: result, wait_fixed=250,
stop_max_delay=scan_timeout * 1000)
def _poll_for_del(statpath):
try:
os.stat(statpath)
return True
except OSError:
# Device special file is absent, as expected
return False
try:
os.stat(devpath)
raise npvmex.DeviceDeletionException(devpath=devpath)
except OSError:
# Device special file is absent, as expected
pass
_poll_for_del(devpath)
except retrying.RetryError as re:
# stat just kept returning (dev file continued to exist).
raise npvmex.DeviceDeletionException(
devpath=devpath, polls=re.last_attempt.attempt_number,
timeout=scan_timeout)
# Else stat raised - the device disappeared - all done.