Add/remove disk retry/timeout
Recently-added methods discover_vscsi_disk and remove_block_dev in nova_powervm.virt.powervm.mgmt performed SCSI bus scans followed by checks for the existence of device special files. Turns out the bus scan is asynchronous, meaning there may be a short delay between the return of the scan command and the (dis)appearance of the device special file. This change set builds a retry and timeout into those methods to accomodate. Change-Id: I888c74b17f1c9819e94cb6dad889d32f16f6bbcb
This commit is contained in:
@@ -67,9 +67,11 @@ class TestMgmt(test.TestCase):
|
|||||||
process_input='- - -', run_as_root=True)
|
process_input='- - -', run_as_root=True)
|
||||||
mock_realpath.assert_called_with(devlink)
|
mock_realpath.assert_called_with(devlink)
|
||||||
|
|
||||||
|
@mock.patch('time.sleep')
|
||||||
@mock.patch('glob.glob')
|
@mock.patch('glob.glob')
|
||||||
@mock.patch('nova.utils.execute')
|
@mock.patch('nova.utils.execute')
|
||||||
def test_discover_vscsi_disk_not_one_result(self, mock_exec, mock_glob):
|
def test_discover_vscsi_disk_not_one_result(self, mock_exec, mock_glob,
|
||||||
|
mock_sleep):
|
||||||
"""Zero or more than one disk is found by discover_vscsi_disk."""
|
"""Zero or more than one disk is found by discover_vscsi_disk."""
|
||||||
udid = ('275b5d5f88fa5611e48be9000098be9400'
|
udid = ('275b5d5f88fa5611e48be9000098be9400'
|
||||||
'13fb2aa55a2d7b8d150cb1b7b6bc04d6')
|
'13fb2aa55a2d7b8d150cb1b7b6bc04d6')
|
||||||
@@ -77,18 +79,23 @@ class TestMgmt(test.TestCase):
|
|||||||
mapping.client_adapter.slot_number = 5
|
mapping.client_adapter.slot_number = 5
|
||||||
mapping.backing_storage.udid = udid
|
mapping.backing_storage.udid = udid
|
||||||
# No disks found
|
# No disks found
|
||||||
mock_glob.side_effect = [['path'], []]
|
mock_glob.side_effect = lambda path: []
|
||||||
self.assertRaises(npvmex.UniqueDiskDiscoveryException,
|
self.assertRaises(npvmex.NoDiskDiscoveryException,
|
||||||
mgmt.discover_vscsi_disk, mapping)
|
mgmt.discover_vscsi_disk, mapping)
|
||||||
|
self.assertTrue(mock_sleep.call_count)
|
||||||
# Multiple disks found
|
# Multiple disks found
|
||||||
|
mock_sleep.reset_mock()
|
||||||
mock_glob.side_effect = [['path'], ['/dev/sde', '/dev/sdf']]
|
mock_glob.side_effect = [['path'], ['/dev/sde', '/dev/sdf']]
|
||||||
self.assertRaises(npvmex.UniqueDiskDiscoveryException,
|
self.assertRaises(npvmex.UniqueDiskDiscoveryException,
|
||||||
mgmt.discover_vscsi_disk, mapping)
|
mgmt.discover_vscsi_disk, mapping)
|
||||||
|
self.assertEqual(0, mock_sleep.call_count)
|
||||||
|
|
||||||
|
@mock.patch('time.sleep')
|
||||||
@mock.patch('os.path.realpath')
|
@mock.patch('os.path.realpath')
|
||||||
@mock.patch('os.stat')
|
@mock.patch('os.stat')
|
||||||
@mock.patch('nova.utils.execute')
|
@mock.patch('nova.utils.execute')
|
||||||
def test_remove_block_dev(self, mock_exec, mock_stat, mock_realpath):
|
def test_remove_block_dev(self, mock_exec, mock_stat, mock_realpath,
|
||||||
|
mock_sleep):
|
||||||
link = '/dev/link/foo'
|
link = '/dev/link/foo'
|
||||||
realpath = '/dev/sde'
|
realpath = '/dev/sde'
|
||||||
delpath = '/sys/block/sde/device/delete'
|
delpath = '/sys/block/sde/device/delete'
|
||||||
@@ -102,6 +109,7 @@ class TestMgmt(test.TestCase):
|
|||||||
mock.call(realpath)])
|
mock.call(realpath)])
|
||||||
mock_exec.assert_called_with('tee', '-a', delpath, process_input='1',
|
mock_exec.assert_called_with('tee', '-a', delpath, process_input='1',
|
||||||
run_as_root=True)
|
run_as_root=True)
|
||||||
|
self.assertEqual(0, mock_sleep.call_count)
|
||||||
|
|
||||||
# Device param not found
|
# Device param not found
|
||||||
mock_exec.reset_mock()
|
mock_exec.reset_mock()
|
||||||
@@ -126,9 +134,12 @@ class TestMgmt(test.TestCase):
|
|||||||
# Deletion was attempted, but device is still there
|
# Deletion was attempted, but device is still there
|
||||||
mock_exec.reset_mock()
|
mock_exec.reset_mock()
|
||||||
mock_stat.reset_mock()
|
mock_stat.reset_mock()
|
||||||
mock_stat.side_effect = (None, None, None)
|
mock_sleep.reset_mock()
|
||||||
|
mock_stat.side_effect = lambda path: 1
|
||||||
self.assertRaises(
|
self.assertRaises(
|
||||||
npvmex.DeviceDeletionException, mgmt.remove_block_dev, link)
|
npvmex.DeviceDeletionException, mgmt.remove_block_dev, link)
|
||||||
# stat was called thrice; exec was called once
|
# stat was called many times; exec was called once
|
||||||
self.assertEqual(3, mock_stat.call_count)
|
self.assertTrue(mock_stat.call_count > 4)
|
||||||
self.assertEqual(1, mock_exec.call_count)
|
self.assertEqual(1, mock_exec.call_count)
|
||||||
|
# sleep was called many times
|
||||||
|
self.assertTrue(mock_sleep.call_count)
|
||||||
|
|||||||
@@ -45,8 +45,15 @@ class ManagementPartitionNotFoundException(nex.NovaException):
|
|||||||
"%(count)d.")
|
"%(count)d.")
|
||||||
|
|
||||||
|
|
||||||
|
class NoDiskDiscoveryException(nex.NovaException):
|
||||||
|
"""Failed to discover any disk."""
|
||||||
|
msg_fmt = _("Having scanned SCSI bus %(bus)x on the management partition, "
|
||||||
|
"disk with UDID %(udid)s failed to appear after %(polls)d "
|
||||||
|
"polls over %(timeout)d seconds.")
|
||||||
|
|
||||||
|
|
||||||
class UniqueDiskDiscoveryException(nex.NovaException):
|
class UniqueDiskDiscoveryException(nex.NovaException):
|
||||||
"""Expected to discover exactly one disk, but discovered 0 or >1."""
|
"""Expected to discover exactly one disk, but discovered >1."""
|
||||||
msg_fmt = _("Expected to find exactly one disk on the management "
|
msg_fmt = _("Expected to find exactly one disk on the management "
|
||||||
"partition at %(path_pattern)s; found %(count)d.")
|
"partition at %(path_pattern)s; found %(count)d.")
|
||||||
|
|
||||||
@@ -54,7 +61,8 @@ class UniqueDiskDiscoveryException(nex.NovaException):
|
|||||||
class DeviceDeletionException(nex.NovaException):
|
class DeviceDeletionException(nex.NovaException):
|
||||||
"""Expected to delete a disk, but the disk is still present afterward."""
|
"""Expected to delete a disk, but the disk is still present afterward."""
|
||||||
msg_fmt = _("Device %(devpath)s is still present on the management "
|
msg_fmt = _("Device %(devpath)s is still present on the management "
|
||||||
"partition after attempting to delete it.")
|
"partition after attempting to delete it. Polled %(polls)d "
|
||||||
|
"times over %(timeout)d seconds.")
|
||||||
|
|
||||||
|
|
||||||
class InstanceDiskMappingFailed(AbstractDiskException):
|
class InstanceDiskMappingFailed(AbstractDiskException):
|
||||||
|
|||||||
@@ -28,13 +28,13 @@ from nova.i18n import _LI
|
|||||||
from nova.storage import linuxscsi
|
from nova.storage import linuxscsi
|
||||||
import os
|
import os
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
from oslo_log import log as logging
|
|
||||||
|
|
||||||
from pypowervm.wrappers import logical_partition as pvm_lpar
|
from pypowervm.wrappers import logical_partition as pvm_lpar
|
||||||
|
|
||||||
from nova_powervm.virt.powervm import exception as npvmex
|
from nova_powervm.virt.powervm import exception as npvmex
|
||||||
|
|
||||||
|
from oslo_log import log as logging
|
||||||
|
import retrying
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -49,7 +49,7 @@ def get_mgmt_partition(adapter):
|
|||||||
return wraps[0]
|
return wraps[0]
|
||||||
|
|
||||||
|
|
||||||
def discover_vscsi_disk(mapping):
|
def discover_vscsi_disk(mapping, scan_timeout=10):
|
||||||
"""Bring a mapped device into the management partition and find its name.
|
"""Bring a mapped device into the management partition and find its name.
|
||||||
|
|
||||||
Based on a VSCSIMapping, scan the appropriate virtual SCSI host bus,
|
Based on a VSCSIMapping, scan the appropriate virtual SCSI host bus,
|
||||||
@@ -64,7 +64,13 @@ def discover_vscsi_disk(mapping):
|
|||||||
:param mapping: The pypowervm.wrappers.virtual_io_server.VSCSIMapping
|
:param mapping: The pypowervm.wrappers.virtual_io_server.VSCSIMapping
|
||||||
representing the mapping of the desired disk to the
|
representing the mapping of the desired disk to the
|
||||||
management partition.
|
management partition.
|
||||||
|
:param scan_timeout: The maximum number of seconds after scanning to wait
|
||||||
|
for the specified device to appear.
|
||||||
:return: The udev-generated ("/dev/sdX") name of the discovered disk.
|
:return: The udev-generated ("/dev/sdX") name of the discovered disk.
|
||||||
|
:raise NoDiskDiscoveryException: If the disk did not appear after the
|
||||||
|
specified timeout.
|
||||||
|
:raise UniqueDiskDiscoveryException: If more than one disk appears with the
|
||||||
|
expected UDID.
|
||||||
"""
|
"""
|
||||||
# TODO(IBM): Support for other host platforms.
|
# TODO(IBM): Support for other host platforms.
|
||||||
|
|
||||||
@@ -87,7 +93,22 @@ def discover_vscsi_disk(mapping):
|
|||||||
# Now see if our device showed up. If so, we can reliably match it based
|
# Now see if our device showed up. If so, we can reliably match it based
|
||||||
# on its Linux ID, which ends with the disk's UDID.
|
# on its Linux ID, which ends with the disk's UDID.
|
||||||
dpathpat = '/dev/disk/by-id/*%s' % udid
|
dpathpat = '/dev/disk/by-id/*%s' % udid
|
||||||
disks = glob.glob(dpathpat)
|
|
||||||
|
# The bus scan is asynchronous. Need to poll, waiting for the device to
|
||||||
|
# spring into existence. Stop when glob finds at least one device, or
|
||||||
|
# after the specified timeout. Sleep 1/4 second between polls.
|
||||||
|
@retrying.retry(retry_on_result=lambda result: not result, wait_fixed=250,
|
||||||
|
stop_max_delay=scan_timeout * 1000)
|
||||||
|
def _poll_for_dev(globpat):
|
||||||
|
return glob.glob(globpat)
|
||||||
|
try:
|
||||||
|
disks = _poll_for_dev(dpathpat)
|
||||||
|
except retrying.RetryError as re:
|
||||||
|
raise npvmex.NoDiskDiscoveryException(
|
||||||
|
bus=lslot, udid=udid, polls=re.last_attempt.attempt_number,
|
||||||
|
timeout=scan_timeout)
|
||||||
|
# If we get here, _poll_for_dev returned a nonempty list. If not exactly
|
||||||
|
# one entry, this is an error.
|
||||||
if len(disks) != 1:
|
if len(disks) != 1:
|
||||||
raise npvmex.UniqueDiskDiscoveryException(path_pattern=dpathpat,
|
raise npvmex.UniqueDiskDiscoveryException(path_pattern=dpathpat,
|
||||||
count=len(disks))
|
count=len(disks))
|
||||||
@@ -100,7 +121,7 @@ def discover_vscsi_disk(mapping):
|
|||||||
return dpath
|
return dpath
|
||||||
|
|
||||||
|
|
||||||
def remove_block_dev(devpath):
|
def remove_block_dev(devpath, scan_timeout=10):
|
||||||
"""Remove a block device from the management partition.
|
"""Remove a block device from the management partition.
|
||||||
|
|
||||||
This method causes the operating system of the management partition to
|
This method causes the operating system of the management partition to
|
||||||
@@ -108,6 +129,8 @@ def remove_block_dev(devpath):
|
|||||||
|
|
||||||
:param devpath: Any path to the block special file associated with the
|
:param devpath: Any path to the block special file associated with the
|
||||||
device to be removed.
|
device to be removed.
|
||||||
|
:param scan_timeout: The maximum number of seconds after scanning to wait
|
||||||
|
for the specified device to disappear.
|
||||||
:raise InvalidDevicePath: If the specified device or its 'delete' special
|
:raise InvalidDevicePath: If the specified device or its 'delete' special
|
||||||
file cannot be found.
|
file cannot be found.
|
||||||
:raise DeviceDeletionException: If the deletion was attempted, but the
|
:raise DeviceDeletionException: If the deletion was attempted, but the
|
||||||
@@ -130,9 +153,25 @@ def remove_block_dev(devpath):
|
|||||||
"partition via special file %(delpath)s.",
|
"partition via special file %(delpath)s.",
|
||||||
{'devpath': devpath, 'delpath': delpath})
|
{'devpath': devpath, 'delpath': delpath})
|
||||||
linuxscsi.echo_scsi_command(delpath, '1')
|
linuxscsi.echo_scsi_command(delpath, '1')
|
||||||
|
|
||||||
|
# The bus scan is asynchronous. Need to poll, waiting for the device to
|
||||||
|
# disappear. Stop when stat raises OSError (dev file not found) - which is
|
||||||
|
# success - or after the specified timeout (which is failure). Sleep 1/4
|
||||||
|
# second between polls.
|
||||||
|
@retrying.retry(retry_on_result=lambda result: result, wait_fixed=250,
|
||||||
|
stop_max_delay=scan_timeout * 1000)
|
||||||
|
def _poll_for_del(statpath):
|
||||||
try:
|
try:
|
||||||
os.stat(devpath)
|
os.stat(statpath)
|
||||||
raise npvmex.DeviceDeletionException(devpath=devpath)
|
return True
|
||||||
except OSError:
|
except OSError:
|
||||||
# Device special file is absent, as expected
|
# Device special file is absent, as expected
|
||||||
pass
|
return False
|
||||||
|
try:
|
||||||
|
_poll_for_del(devpath)
|
||||||
|
except retrying.RetryError as re:
|
||||||
|
# stat just kept returning (dev file continued to exist).
|
||||||
|
raise npvmex.DeviceDeletionException(
|
||||||
|
devpath=devpath, polls=re.last_attempt.attempt_number,
|
||||||
|
timeout=scan_timeout)
|
||||||
|
# Else stat raised - the device disappeared - all done.
|
||||||
|
|||||||
Reference in New Issue
Block a user