Retry unmount operation on the ZFSOnLinux driver

When a share is mounted on the same host as the manila-share
process, zfs prevents us from destroying the underlying
dataset until the share has been cleanly unmounted from
the host. Kernel mounts can take a few seconds to get
unmounted fully especially when there are a lot of
linux namespaces that the mountpoint has been shared to.

Add a retry on these operations to harden the deletion
process and prevent spurious failures.

Change-Id: I4aba76b72df274d0a8cb90fe0ab8799523c260ef
Closes-Bug: #1903773
Related-Bug: #1896672
Signed-off-by: Goutham Pacha Ravi <gouthampravi@gmail.com>
This commit is contained in:
Goutham Pacha Ravi 2020-11-10 13:31:38 -08:00
parent 136a89937c
commit e3fea14788
3 changed files with 72 additions and 71 deletions

View File

@ -273,30 +273,27 @@ class ZFSonLinuxShareDriver(zfs_utils.ExecuteMixin, driver.ShareDriver):
msg=_("Could not destroy '%s' dataset, " msg=_("Could not destroy '%s' dataset, "
"because it had opened files.") % name) "because it had opened files.") % name)
# NOTE(vponomaryov): Now, when no file usages and mounts of dataset @utils.retry(exception.ProcessExecutionError)
# exist, destroy dataset. def _zfs_destroy_with_retry():
try: """Retry destroying dataset ten times with exponential backoff."""
self.zfs('destroy', '-f', name)
return
except exception.ProcessExecutionError:
LOG.info("Failed to destroy ZFS dataset, retrying one time")
# NOTE(bswartz): There appears to be a bug in ZFS when creating and # NOTE(bswartz): There appears to be a bug in ZFS when creating and
# destroying datasets concurrently where the filesystem remains mounted # destroying datasets concurrently where the filesystem remains
# even though ZFS thinks it's unmounted. The most reliable workaround # mounted even though ZFS thinks it's unmounted. The most reliable
# I've found is to force the unmount, then retry the destroy, with # workaround I've found is to force the unmount, then attempt the
# short pauses around the unmount. # destroy, with short pauses around the unmount. (See bug#1546723)
time.sleep(1)
try: try:
self.execute('sudo', 'umount', mountpoint) self.execute('sudo', 'umount', mountpoint)
except exception.ProcessExecutionError: except exception.ProcessExecutionError:
# Ignore failed umount, it's normal # Ignore failed umount, it's normal
pass pass
time.sleep(1) time.sleep(2)
# This time the destroy is expected to succeed. # NOTE(vponomaryov): Now, when no file usages and mounts of dataset
# exist, destroy dataset.
self.zfs('destroy', '-f', name) self.zfs('destroy', '-f', name)
_zfs_destroy_with_retry()
def _setup_helpers(self): def _setup_helpers(self):
"""Setups share helper for ZFS backend.""" """Setups share helper for ZFS backend."""
self._helpers = {} self._helpers = {}

View File

@ -152,6 +152,7 @@ class ZFSonLinuxShareDriverTestCase(test.TestCase):
self.driver = zfs_driver.ZFSonLinuxShareDriver( self.driver = zfs_driver.ZFSonLinuxShareDriver(
configuration=self.configuration, configuration=self.configuration,
private_storage=self.private_storage) private_storage=self.private_storage)
self.mock_object(zfs_driver.time, 'sleep')
def test_init(self): def test_init(self):
self.assertTrue(hasattr(self.driver, 'replica_snapshot_prefix')) self.assertTrue(hasattr(self.driver, 'replica_snapshot_prefix'))
@ -1174,7 +1175,6 @@ class ZFSonLinuxShareDriverTestCase(test.TestCase):
zfs_driver.share_types, zfs_driver.share_types,
'get_extra_specs_from_share', 'get_extra_specs_from_share',
mock.Mock(return_value={})) mock.Mock(return_value={}))
self.mock_object(zfs_driver.time, "sleep")
mock__get_dataset_name = self.mock_object( mock__get_dataset_name = self.mock_object(
self.driver, "_get_dataset_name", self.driver, "_get_dataset_name",
mock.Mock(return_value=new_dataset_name)) mock.Mock(return_value=new_dataset_name))
@ -1318,7 +1318,6 @@ class ZFSonLinuxShareDriverTestCase(test.TestCase):
zfs_driver.share_types, zfs_driver.share_types,
'get_extra_specs_from_share', 'get_extra_specs_from_share',
mock.Mock(return_value={})) mock.Mock(return_value={}))
self.mock_object(zfs_driver.time, "sleep")
mock__get_dataset_name = self.mock_object( mock__get_dataset_name = self.mock_object(
self.driver, "_get_dataset_name", self.driver, "_get_dataset_name",
mock.Mock(return_value=new_ds_name)) mock.Mock(return_value=new_ds_name))
@ -1451,24 +1450,41 @@ class ZFSonLinuxShareDriverTestCase(test.TestCase):
snapshot_instance["snapshot_id"]) snapshot_instance["snapshot_id"])
def test__delete_dataset_or_snapshot_with_retry_snapshot(self): def test__delete_dataset_or_snapshot_with_retry_snapshot(self):
self.mock_object(self.driver, 'get_zfs_option') umount_sideeffects = (exception.ProcessExecutionError,
self.mock_object(self.driver, 'zfs') exception.ProcessExecutionError,
exception.ProcessExecutionError)
zfs_destroy_sideeffects = (exception.ProcessExecutionError,
exception.ProcessExecutionError,
None)
self.mock_object(
self.driver, 'get_zfs_option', mock.Mock(return_value='/foo@bar'))
self.mock_object(
self.driver, 'execute', mock.Mock(side_effect=umount_sideeffects))
self.mock_object(
self.driver, 'zfs', mock.Mock(side_effect=zfs_destroy_sideeffects))
self.driver._delete_dataset_or_snapshot_with_retry('foo@bar') self.driver._delete_dataset_or_snapshot_with_retry('foo@bar')
self.driver.get_zfs_option.assert_called_once_with( self.driver.get_zfs_option.assert_called_once_with(
'foo@bar', 'mountpoint') 'foo@bar', 'mountpoint')
self.driver.zfs.assert_called_once_with( self.driver.execute.assert_has_calls(
'destroy', '-f', 'foo@bar') [mock.call('sudo', 'umount', '/foo@bar')] * 3)
self.driver.zfs.assert_has_calls(
[mock.call('destroy', '-f', 'foo@bar')] * 3)
self.assertEqual(3, self.driver.execute.call_count)
self.assertEqual(3, self.driver.zfs.call_count)
def test__delete_dataset_or_snapshot_with_retry_of(self): def test__delete_dataset_or_snapshot_with_retry_dataset_busy_fail(self):
self.mock_object(self.driver, 'get_zfs_option') # simulating local processes that have open files on the mount
lsof_sideeffects = ([('1001, 1002', '0')] * 30)
self.mock_object(self.driver, 'get_zfs_option',
mock.Mock(return_value='/fake/dataset/name'))
self.mock_object( self.mock_object(
self.driver, 'execute', mock.Mock(return_value=('a', 'b'))) self.driver, 'execute', mock.Mock(side_effect=lsof_sideeffects))
self.mock_object(zfs_driver.time, 'sleep')
self.mock_object(zfs_driver.LOG, 'debug') self.mock_object(zfs_driver.LOG, 'debug')
self.mock_object( self.mock_object(
zfs_driver.time, 'time', mock.Mock(side_effect=range(1, 70, 2))) zfs_driver.time, 'time', mock.Mock(side_effect=range(1, 70, 2)))
self.mock_object(self.driver, 'zfs')
dataset_name = 'fake/dataset/name' dataset_name = 'fake/dataset/name'
self.assertRaises( self.assertRaises(
@ -1480,55 +1496,39 @@ class ZFSonLinuxShareDriverTestCase(test.TestCase):
self.driver.get_zfs_option.assert_called_once_with( self.driver.get_zfs_option.assert_called_once_with(
dataset_name, 'mountpoint') dataset_name, 'mountpoint')
self.assertEqual(29, zfs_driver.LOG.debug.call_count) self.assertEqual(29, zfs_driver.LOG.debug.call_count)
# We should've bailed out before executing "zfs destroy"
self.driver.zfs.assert_not_called()
def test__delete_dataset_or_snapshot_with_retry_temp_of(self): def test__delete_dataset_or_snapshot_with_retry_dataset(self):
self.mock_object(self.driver, 'get_zfs_option') lsof_sideeffects = (('1001', '0'), exception.ProcessExecutionError)
self.mock_object(self.driver, 'zfs') umount_sideeffects = (exception.ProcessExecutionError,
self.mock_object( exception.ProcessExecutionError,
self.driver, 'execute', mock.Mock(side_effect=[ exception.ProcessExecutionError)
('a', 'b'), zfs_destroy_sideeffects = (exception.ProcessExecutionError,
exception.ProcessExecutionError( exception.ProcessExecutionError,
'FAKE lsof returns not found')])) None)
self.mock_object(zfs_driver.time, 'sleep')
self.mock_object(zfs_driver.LOG, 'debug')
self.mock_object(
zfs_driver.time, 'time', mock.Mock(side_effect=range(1, 70, 2)))
dataset_name = 'fake/dataset/name' dataset_name = 'fake/dataset/name'
self.mock_object(self.driver, 'get_zfs_option', mock.Mock(
self.driver._delete_dataset_or_snapshot_with_retry(dataset_name) return_value='/%s' % dataset_name))
self.driver.get_zfs_option.assert_called_once_with(
dataset_name, 'mountpoint')
self.assertEqual(2, self.driver.execute.call_count)
self.assertEqual(1, zfs_driver.LOG.debug.call_count)
zfs_driver.LOG.debug.assert_called_once_with(
mock.ANY, {'name': dataset_name, 'out': 'a'})
zfs_driver.time.sleep.assert_called_once_with(2)
self.driver.zfs.assert_called_once_with('destroy', '-f', dataset_name)
def test__delete_dataset_or_snapshot_with_retry_busy(self):
self.mock_object(self.driver, 'get_zfs_option')
self.mock_object( self.mock_object(
self.driver, 'execute', mock.Mock( self.driver, 'execute', mock.Mock(
side_effect=exception.ProcessExecutionError( side_effect=(lsof_sideeffects + umount_sideeffects)))
'FAKE lsof returns not found')))
self.mock_object( self.mock_object(
self.driver, 'zfs', mock.Mock(side_effect=[ self.driver, 'zfs', mock.Mock(side_effect=zfs_destroy_sideeffects))
exception.ProcessExecutionError(
'cannot destroy FAKE: dataset is busy\n'),
None, None]))
self.mock_object(zfs_driver.time, 'sleep')
self.mock_object(zfs_driver.LOG, 'info') self.mock_object(zfs_driver.LOG, 'info')
dataset_name = 'fake/dataset/name'
self.driver._delete_dataset_or_snapshot_with_retry(dataset_name) self.driver._delete_dataset_or_snapshot_with_retry(dataset_name)
self.driver.get_zfs_option.assert_called_once_with( self.driver.get_zfs_option.assert_called_once_with(
dataset_name, 'mountpoint') dataset_name, 'mountpoint')
self.assertEqual(2, zfs_driver.time.sleep.call_count) self.driver.execute.assert_has_calls(
self.assertEqual(2, self.driver.execute.call_count) [mock.call('lsof', '-w', '/%s' % dataset_name)] * 2 +
self.assertEqual(1, zfs_driver.LOG.info.call_count) [mock.call('sudo', 'umount', '/%s' % dataset_name)] * 3)
self.assertEqual(2, self.driver.zfs.call_count) self.driver.zfs.assert_has_calls(
[mock.call('destroy', '-f', dataset_name)] * 3)
self.assertEqual(6, zfs_driver.time.sleep.call_count)
self.assertEqual(5, self.driver.execute.call_count)
self.assertEqual(3, self.driver.zfs.call_count)
def test_create_replica(self): def test_create_replica(self):
active_replica = { active_replica = {
@ -2524,7 +2524,6 @@ class ZFSonLinuxShareDriverTestCase(test.TestCase):
'dataset_name': dst_dataset_name, 'dataset_name': dst_dataset_name,
'ssh_cmd': dst_ssh_cmd, 'ssh_cmd': dst_ssh_cmd,
}) })
self.mock_object(zfs_driver.time, 'sleep')
mock_delete_dataset = self.mock_object( mock_delete_dataset = self.mock_object(
self.driver, '_delete_dataset_or_snapshot_with_retry') self.driver, '_delete_dataset_or_snapshot_with_retry')
ps_output = ( ps_output = (
@ -2570,7 +2569,6 @@ class ZFSonLinuxShareDriverTestCase(test.TestCase):
'dataset_name': dst_dataset_name, 'dataset_name': dst_dataset_name,
'ssh_cmd': dst_ssh_cmd, 'ssh_cmd': dst_ssh_cmd,
}) })
self.mock_object(zfs_driver.time, 'sleep')
mock_delete_dataset = self.mock_object( mock_delete_dataset = self.mock_object(
self.driver, '_delete_dataset_or_snapshot_with_retry') self.driver, '_delete_dataset_or_snapshot_with_retry')
self.mock_object( self.mock_object(

View File

@ -0,0 +1,6 @@
---
fixes:
- |
Share cleanup for the ZFSOnLinux driver has been enhanced to retry on known
errors that could occur due to mount propagation. See `bug 1903773
<https://bugs.launchpad.net/manila/+bug/1903773>`_ for more details.