LVM: Fix delete volume error due to lvs failure

Sometimes we get an unexpected failure on the lvs command where it exits
with code 139, which is not one of the 5 predefined ones in the code
ECMD_PROCESSED, ENO_SUCH_CMD, EINVALID_CMD_LINE, EINIT_FAILED, or
ECMD_FAILED.

We've seen this happen mostly on CI jobs when deleting a volume, on the
check to see if the volume is present (``_volume_not_present``) and
makes the whole operation unexpectedly fail.

When looking at the logs we can find other instances of this same exit
code happening in other calls, but those are fortunately to be covered
by retries for other unexpected errors such as bug #1335905 that seem to
make the call eventually succeed.

The stderr of the failure is polluted with debug and warning messages
such as:

  [1] /dev/sda1: stat failed: No such file or directory

      This has been removed [2] from the LVM code indicating it's
      somewhat troublesome, but doesn't explain how.

  [3] Path /dev/sda1 no longer valid for device(8,1)

  [4] WARNING: Scan ignoring device 8:0 with no paths.

But the real error is most likely:

  [5]: Device open /dev/sda 8:0 failed errno 2

On failure we see that error twice, because the code retries it in LVM
trying to workaround some kind of unknown udev race [6].

Since the LVM code indicates that a retry can help, we'll retry on error
139 when calling ``get_lv_info``.

To narrow down the retry we'll only do it on error 139, so we modify the
existing ``retry`` decorator to accept the ``retry`` parameter (same as
the tenacity library) and create our own retry if the
ProcessExecutionError fails with a specific error.

This pattern seems better than blindly retrying all
ProcessExecutionError cases.

[1]: 17f5572bc9/lib/filters/filter-persistent.c (L132)
[2]: 22c5467add
[3]: b84a9927b7/lib/device/dev-cache.c (L1396-L1402)
[4]: b84a9927b7/lib/label/label.c (L798)
[5]: b84a9927b7/lib/label/label.c (L550)
[6]: b84a9927b7/lib/label/label.c (L562-L567)

Closes-Bug: #1901783
Change-Id: I6824ba4fbcb6fd8f57f8ff86ad7132446ac6c504
This commit is contained in:
Gorka Eguileor 2021-03-29 15:47:28 +02:00
parent 2a959e6645
commit 0b2c0d9238
8 changed files with 78 additions and 11 deletions

View File

@ -286,6 +286,8 @@ class LVM(executor.Executor):
return LVM._supports_pvs_ignoreskippedcluster
@staticmethod
@utils.retry(retry=utils.retry_if_exit_code, retry_param=139, interval=0.5,
backoff_rate=0.5) # Bug#1901783
def get_lv_info(root_helper, vg_name=None, lv_name=None):
"""Retrieve info about LVs (all, in a VG, or a single LV).
@ -653,7 +655,7 @@ class LVM(executor.Executor):
# order to prevent a race condition.
self._wait_for_volume_deactivation(name)
@utils.retry(exceptions=exception.VolumeNotDeactivated, retries=5,
@utils.retry(retry_param=exception.VolumeNotDeactivated, retries=5,
backoff_rate=2)
def _wait_for_volume_deactivation(self, name):
LOG.debug("Checking to see if volume %s has been deactivated.",

View File

@ -231,6 +231,27 @@ class BrickLvmTestCase(test.TestCase):
'sudo', vg_name='fake-vg')
)
@mock.patch('tenacity.nap.sleep', mock.Mock())
@mock.patch.object(brick.putils, 'execute')
def test_get_lv_info_retry(self, exec_mock):
exec_mock.side_effect = (
processutils.ProcessExecutionError('', '', exit_code=139),
('vg name size', ''),
)
self.assertEqual(
[{'name': 'name', 'vg': 'vg', 'size': 'size'}],
self.vg.get_lv_info('sudo', vg_name='vg', lv_name='name')
)
self.assertEqual(2, exec_mock.call_count)
args = ['env', 'LC_ALL=C', 'lvs', '--noheadings', '--unit=g', '-o',
'vg_name,name,size', '--nosuffix', '--readonly', 'vg/name']
if self.configuration.lvm_suppress_fd_warnings:
args.insert(2, 'LVM_SUPPRESS_FD_WARNINGS=1')
lvs_call = mock.call(*args, root_helper='sudo', run_as_root=True)
exec_mock.assert_has_calls([lvs_call, lvs_call])
def test_get_all_physical_volumes(self):
# Filtered VG version
pvs = self.vg.get_all_physical_volumes('sudo', 'fake-vg')

View File

@ -1056,6 +1056,33 @@ class TestRetryDecorator(test.TestCase):
self.assertRaises(WrongException, raise_unexpected_error)
self.assertFalse(mock_sleep.called)
@mock.patch('tenacity.nap.sleep')
def test_retry_exit_code(self, sleep_mock):
exit_code = 5
exception = utils.processutils.ProcessExecutionError
@utils.retry(retry=utils.retry_if_exit_code, retry_param=exit_code)
def raise_retriable_exit_code():
raise exception(exit_code=exit_code)
self.assertRaises(exception, raise_retriable_exit_code)
self.assertEqual(2, sleep_mock.call_count)
sleep_mock.assert_has_calls([mock.call(1), mock.call(2)])
@mock.patch('tenacity.nap.sleep')
def test_retry_exit_code_non_retriable(self, sleep_mock):
exit_code = 5
exception = utils.processutils.ProcessExecutionError
@utils.retry(retry=utils.retry_if_exit_code, retry_param=exit_code)
def raise_non_retriable_exit_code():
raise exception(exit_code=exit_code + 1)
self.assertRaises(exception, raise_non_retriable_exit_code)
sleep_mock.assert_not_called()
@ddt.ddt
class TestCalculateVirtualFree(test.TestCase):

View File

@ -575,8 +575,19 @@ class ComparableMixin(object):
return self._compare(other, lambda s, o: s != o)
def retry(exceptions, interval=1, retries=3, backoff_rate=2,
wait_random=False):
class retry_if_exit_code(tenacity.retry_if_exception):
"""Retry on ProcessExecutionError specific exit codes."""
def __init__(self, codes):
self.codes = (codes,) if isinstance(codes, int) else codes
super(retry_if_exit_code, self).__init__(self._check_exit_code)
def _check_exit_code(self, exc):
return (exc and isinstance(exc, processutils.ProcessExecutionError) and
exc.exit_code in self.codes)
def retry(retry_param, interval=1, retries=3, backoff_rate=2,
wait_random=False, retry=tenacity.retry_if_exception_type):
if retries < 1:
raise ValueError('Retries must be greater than or '
@ -598,7 +609,7 @@ def retry(exceptions, interval=1, retries=3, backoff_rate=2,
after=tenacity.after_log(LOG, logging.DEBUG),
stop=tenacity.stop_after_attempt(retries),
reraise=True,
retry=tenacity.retry_if_exception_type(exceptions),
retry=retry(retry_param),
wait=wait)
return r.call(f, *args, **kwargs)

View File

@ -232,7 +232,7 @@ class HttpClient(object):
raise exception.VolumeBackendAPIException(message=msg)
return rest_response
@utils.retry(exceptions=(requests.ConnectionError,
@utils.retry(retry_param=(requests.ConnectionError,
DellDriverRetryableException))
def get(self, url):
LOG.debug('get: %(url)s', {'url': url})
@ -247,7 +247,7 @@ class HttpClient(object):
raise DellDriverRetryableException()
return rest_response
@utils.retry(exceptions=(requests.ConnectionError,))
@utils.retry(retry_param=(requests.ConnectionError,))
def post(self, url, payload, async_call=False):
LOG.debug('post: %(url)s data: %(payload)s',
{'url': url,
@ -261,7 +261,7 @@ class HttpClient(object):
self.asynctimeout if async_call else self.synctimeout)),
async_call)
@utils.retry(exceptions=(requests.ConnectionError,))
@utils.retry(retry_param=(requests.ConnectionError,))
def put(self, url, payload, async_call=False):
LOG.debug('put: %(url)s data: %(payload)s',
{'url': url,
@ -275,7 +275,7 @@ class HttpClient(object):
self.asynctimeout if async_call else self.synctimeout)),
async_call)
@utils.retry(exceptions=(requests.ConnectionError,))
@utils.retry(retry_param=(requests.ConnectionError,))
def delete(self, url, payload=None, async_call=False):
LOG.debug('delete: %(url)s data: %(payload)s',
{'url': url, 'payload': payload})

View File

@ -200,7 +200,7 @@ class Client(object):
def modify_lun(self):
pass
@cinder_utils.retry(exceptions=const.VNXTargetNotReadyError,
@cinder_utils.retry(retry_param=const.VNXTargetNotReadyError,
interval=15,
retries=5, backoff_rate=1)
def migrate_lun(self, src_id, dst_id,

View File

@ -704,7 +704,7 @@ class AS13000Driver(san.SanISCSIDriver):
request_type=request_type)
@volume_utils.trace
@utils.retry(exceptions=exception.VolumeDriverException,
@utils.retry(retry_param=exception.VolumeDriverException,
interval=1,
retries=3)
def _add_lun_to_target(self, target_name, volume):

View File

@ -0,0 +1,6 @@
---
fixes:
- |
LVM driver `bug #1901783
<https://bugs.launchpad.net/cinder/+bug/1901783>`_: Fix unexpected delete
volume failure due to unexpected exit code 139 on ``lvs`` command call.