Add/remove disk retry/timeout
Recently-added methods discover_vscsi_disk and remove_block_dev in nova_powervm.virt.powervm.mgmt performed SCSI bus scans followed by checks for the existence of device special files. Turns out the bus scan is asynchronous, meaning there may be a short delay between the return of the scan command and the (dis)appearance of the device special file. This change set builds a retry and timeout into those methods to accomodate. Change-Id: I888c74b17f1c9819e94cb6dad889d32f16f6bbcb
This commit is contained in:
parent
cd972fca7d
commit
e0af19bc6f
@ -67,9 +67,11 @@ class TestMgmt(test.TestCase):
|
||||
process_input='- - -', run_as_root=True)
|
||||
mock_realpath.assert_called_with(devlink)
|
||||
|
||||
@mock.patch('time.sleep')
|
||||
@mock.patch('glob.glob')
|
||||
@mock.patch('nova.utils.execute')
|
||||
def test_discover_vscsi_disk_not_one_result(self, mock_exec, mock_glob):
|
||||
def test_discover_vscsi_disk_not_one_result(self, mock_exec, mock_glob,
|
||||
mock_sleep):
|
||||
"""Zero or more than one disk is found by discover_vscsi_disk."""
|
||||
udid = ('275b5d5f88fa5611e48be9000098be9400'
|
||||
'13fb2aa55a2d7b8d150cb1b7b6bc04d6')
|
||||
@ -77,18 +79,23 @@ class TestMgmt(test.TestCase):
|
||||
mapping.client_adapter.slot_number = 5
|
||||
mapping.backing_storage.udid = udid
|
||||
# No disks found
|
||||
mock_glob.side_effect = [['path'], []]
|
||||
self.assertRaises(npvmex.UniqueDiskDiscoveryException,
|
||||
mock_glob.side_effect = lambda path: []
|
||||
self.assertRaises(npvmex.NoDiskDiscoveryException,
|
||||
mgmt.discover_vscsi_disk, mapping)
|
||||
self.assertTrue(mock_sleep.call_count)
|
||||
# Multiple disks found
|
||||
mock_sleep.reset_mock()
|
||||
mock_glob.side_effect = [['path'], ['/dev/sde', '/dev/sdf']]
|
||||
self.assertRaises(npvmex.UniqueDiskDiscoveryException,
|
||||
mgmt.discover_vscsi_disk, mapping)
|
||||
self.assertEqual(0, mock_sleep.call_count)
|
||||
|
||||
@mock.patch('time.sleep')
|
||||
@mock.patch('os.path.realpath')
|
||||
@mock.patch('os.stat')
|
||||
@mock.patch('nova.utils.execute')
|
||||
def test_remove_block_dev(self, mock_exec, mock_stat, mock_realpath):
|
||||
def test_remove_block_dev(self, mock_exec, mock_stat, mock_realpath,
|
||||
mock_sleep):
|
||||
link = '/dev/link/foo'
|
||||
realpath = '/dev/sde'
|
||||
delpath = '/sys/block/sde/device/delete'
|
||||
@ -102,6 +109,7 @@ class TestMgmt(test.TestCase):
|
||||
mock.call(realpath)])
|
||||
mock_exec.assert_called_with('tee', '-a', delpath, process_input='1',
|
||||
run_as_root=True)
|
||||
self.assertEqual(0, mock_sleep.call_count)
|
||||
|
||||
# Device param not found
|
||||
mock_exec.reset_mock()
|
||||
@ -126,9 +134,12 @@ class TestMgmt(test.TestCase):
|
||||
# Deletion was attempted, but device is still there
|
||||
mock_exec.reset_mock()
|
||||
mock_stat.reset_mock()
|
||||
mock_stat.side_effect = (None, None, None)
|
||||
mock_sleep.reset_mock()
|
||||
mock_stat.side_effect = lambda path: 1
|
||||
self.assertRaises(
|
||||
npvmex.DeviceDeletionException, mgmt.remove_block_dev, link)
|
||||
# stat was called thrice; exec was called once
|
||||
self.assertEqual(3, mock_stat.call_count)
|
||||
# stat was called many times; exec was called once
|
||||
self.assertTrue(mock_stat.call_count > 4)
|
||||
self.assertEqual(1, mock_exec.call_count)
|
||||
# sleep was called many times
|
||||
self.assertTrue(mock_sleep.call_count)
|
||||
|
@ -45,8 +45,15 @@ class ManagementPartitionNotFoundException(nex.NovaException):
|
||||
"%(count)d.")
|
||||
|
||||
|
||||
class NoDiskDiscoveryException(nex.NovaException):
|
||||
"""Failed to discover any disk."""
|
||||
msg_fmt = _("Having scanned SCSI bus %(bus)x on the management partition, "
|
||||
"disk with UDID %(udid)s failed to appear after %(polls)d "
|
||||
"polls over %(timeout)d seconds.")
|
||||
|
||||
|
||||
class UniqueDiskDiscoveryException(nex.NovaException):
|
||||
"""Expected to discover exactly one disk, but discovered 0 or >1."""
|
||||
"""Expected to discover exactly one disk, but discovered >1."""
|
||||
msg_fmt = _("Expected to find exactly one disk on the management "
|
||||
"partition at %(path_pattern)s; found %(count)d.")
|
||||
|
||||
@ -54,7 +61,8 @@ class UniqueDiskDiscoveryException(nex.NovaException):
|
||||
class DeviceDeletionException(nex.NovaException):
|
||||
"""Expected to delete a disk, but the disk is still present afterward."""
|
||||
msg_fmt = _("Device %(devpath)s is still present on the management "
|
||||
"partition after attempting to delete it.")
|
||||
"partition after attempting to delete it. Polled %(polls)d "
|
||||
"times over %(timeout)d seconds.")
|
||||
|
||||
|
||||
class InstanceDiskMappingFailed(AbstractDiskException):
|
||||
|
@ -28,13 +28,13 @@ from nova.i18n import _LI
|
||||
from nova.storage import linuxscsi
|
||||
import os
|
||||
from os import path
|
||||
|
||||
from oslo_log import log as logging
|
||||
|
||||
from pypowervm.wrappers import logical_partition as pvm_lpar
|
||||
|
||||
from nova_powervm.virt.powervm import exception as npvmex
|
||||
|
||||
from oslo_log import log as logging
|
||||
import retrying
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ def get_mgmt_partition(adapter):
|
||||
return wraps[0]
|
||||
|
||||
|
||||
def discover_vscsi_disk(mapping):
|
||||
def discover_vscsi_disk(mapping, scan_timeout=10):
|
||||
"""Bring a mapped device into the management partition and find its name.
|
||||
|
||||
Based on a VSCSIMapping, scan the appropriate virtual SCSI host bus,
|
||||
@ -64,7 +64,13 @@ def discover_vscsi_disk(mapping):
|
||||
:param mapping: The pypowervm.wrappers.virtual_io_server.VSCSIMapping
|
||||
representing the mapping of the desired disk to the
|
||||
management partition.
|
||||
:param scan_timeout: The maximum number of seconds after scanning to wait
|
||||
for the specified device to appear.
|
||||
:return: The udev-generated ("/dev/sdX") name of the discovered disk.
|
||||
:raise NoDiskDiscoveryException: If the disk did not appear after the
|
||||
specified timeout.
|
||||
:raise UniqueDiskDiscoveryException: If more than one disk appears with the
|
||||
expected UDID.
|
||||
"""
|
||||
# TODO(IBM): Support for other host platforms.
|
||||
|
||||
@ -87,7 +93,22 @@ def discover_vscsi_disk(mapping):
|
||||
# Now see if our device showed up. If so, we can reliably match it based
|
||||
# on its Linux ID, which ends with the disk's UDID.
|
||||
dpathpat = '/dev/disk/by-id/*%s' % udid
|
||||
disks = glob.glob(dpathpat)
|
||||
|
||||
# The bus scan is asynchronous. Need to poll, waiting for the device to
|
||||
# spring into existence. Stop when glob finds at least one device, or
|
||||
# after the specified timeout. Sleep 1/4 second between polls.
|
||||
@retrying.retry(retry_on_result=lambda result: not result, wait_fixed=250,
|
||||
stop_max_delay=scan_timeout * 1000)
|
||||
def _poll_for_dev(globpat):
|
||||
return glob.glob(globpat)
|
||||
try:
|
||||
disks = _poll_for_dev(dpathpat)
|
||||
except retrying.RetryError as re:
|
||||
raise npvmex.NoDiskDiscoveryException(
|
||||
bus=lslot, udid=udid, polls=re.last_attempt.attempt_number,
|
||||
timeout=scan_timeout)
|
||||
# If we get here, _poll_for_dev returned a nonempty list. If not exactly
|
||||
# one entry, this is an error.
|
||||
if len(disks) != 1:
|
||||
raise npvmex.UniqueDiskDiscoveryException(path_pattern=dpathpat,
|
||||
count=len(disks))
|
||||
@ -100,7 +121,7 @@ def discover_vscsi_disk(mapping):
|
||||
return dpath
|
||||
|
||||
|
||||
def remove_block_dev(devpath):
|
||||
def remove_block_dev(devpath, scan_timeout=10):
|
||||
"""Remove a block device from the management partition.
|
||||
|
||||
This method causes the operating system of the management partition to
|
||||
@ -108,6 +129,8 @@ def remove_block_dev(devpath):
|
||||
|
||||
:param devpath: Any path to the block special file associated with the
|
||||
device to be removed.
|
||||
:param scan_timeout: The maximum number of seconds after scanning to wait
|
||||
for the specified device to disappear.
|
||||
:raise InvalidDevicePath: If the specified device or its 'delete' special
|
||||
file cannot be found.
|
||||
:raise DeviceDeletionException: If the deletion was attempted, but the
|
||||
@ -130,9 +153,25 @@ def remove_block_dev(devpath):
|
||||
"partition via special file %(delpath)s.",
|
||||
{'devpath': devpath, 'delpath': delpath})
|
||||
linuxscsi.echo_scsi_command(delpath, '1')
|
||||
|
||||
# The bus scan is asynchronous. Need to poll, waiting for the device to
|
||||
# disappear. Stop when stat raises OSError (dev file not found) - which is
|
||||
# success - or after the specified timeout (which is failure). Sleep 1/4
|
||||
# second between polls.
|
||||
@retrying.retry(retry_on_result=lambda result: result, wait_fixed=250,
|
||||
stop_max_delay=scan_timeout * 1000)
|
||||
def _poll_for_del(statpath):
|
||||
try:
|
||||
os.stat(statpath)
|
||||
return True
|
||||
except OSError:
|
||||
# Device special file is absent, as expected
|
||||
return False
|
||||
try:
|
||||
os.stat(devpath)
|
||||
raise npvmex.DeviceDeletionException(devpath=devpath)
|
||||
except OSError:
|
||||
# Device special file is absent, as expected
|
||||
pass
|
||||
_poll_for_del(devpath)
|
||||
except retrying.RetryError as re:
|
||||
# stat just kept returning (dev file continued to exist).
|
||||
raise npvmex.DeviceDeletionException(
|
||||
devpath=devpath, polls=re.last_attempt.attempt_number,
|
||||
timeout=scan_timeout)
|
||||
# Else stat raised - the device disappeared - all done.
|
||||
|
Loading…
Reference in New Issue
Block a user