From 5fbde659aecddc50dd64709426cfc73ef828e1bb Mon Sep 17 00:00:00 2001
From: Gorka Eguileor
Date: Mon, 29 Mar 2021 15:47:28 +0200
Subject: [PATCH] LVM: Fix delete volume error due to lvs failure

Sometimes we get an unexpected failure on the lvs command, where it
exits with code 139, which is not one of the 5 exit codes predefined in
the code: ECMD_PROCESSED, ENO_SUCH_CMD, EINVALID_CMD_LINE, EINIT_FAILED,
or ECMD_FAILED.

We've seen this happen mostly on CI jobs when deleting a volume, on the
check to see if the volume is present (``_volume_not_present``), which
makes the whole operation unexpectedly fail.

Looking at the logs we can find other instances of this same exit code
happening in other calls, but those are fortunately covered by retries
for other unexpected errors, such as bug #1335905, which seem to make
the call eventually succeed.

The stderr of the failure is polluted with debug and warning messages
such as:

  [1] /dev/sda1: stat failed: No such file or directory

This check has been removed [2] from the LVM code, which suggests it is
somewhat troublesome, though the commit doesn't explain how.

  [3] Path /dev/sda1 no longer valid for device(8,1)
  [4] WARNING: Scan ignoring device 8:0 with no paths.

But the real error is most likely:

  [5] Device open /dev/sda 8:0 failed errno 2

On failure we see that error twice, because LVM itself retries the open,
trying to work around some kind of unknown udev race [6].

Since the LVM code indicates that a retry can help, we'll retry on error
139 when calling ``get_lv_info``.

To narrow down the retry we only do it on error 139: we modify the
existing ``retry`` decorator to accept a ``retry`` parameter (same as
the tenacity library) and create our own retry class that retries only
when a ProcessExecutionError has a specific exit code.  This pattern
seems better than blindly retrying all ProcessExecutionError cases.

[1]: https://github.com/lvmteam/lvm2/blob/17f5572bc972c932bcf62fc2bff3029268ae0109/lib/filters/filter-persistent.c#L132
[2]: https://github.com/lvmteam/lvm2/commit/22c5467add88268554a7d163a26892166dbc3104
[3]: https://github.com/lvmteam/lvm2/blob/b84a9927b78727efffbb257a61e9e95a648cdfab/lib/device/dev-cache.c#L1396-L1402
[4]: https://github.com/lvmteam/lvm2/blob/b84a9927b78727efffbb257a61e9e95a648cdfab/lib/label/label.c#L798
[5]: https://github.com/lvmteam/lvm2/blob/b84a9927b78727efffbb257a61e9e95a648cdfab/lib/label/label.c#L550
[6]: https://github.com/lvmteam/lvm2/blob/b84a9927b78727efffbb257a61e9e95a648cdfab/lib/label/label.c#L562-L567

Conflicts:
        cinder/volume/drivers/inspur/as13000/as13000_driver.py

Conflict caused by the rename of a (decorator) class in >=wallaby.
The test was also fixed to account for the lack of some later fixes.
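For reviewers, a minimal usage sketch of the extended decorator (the
``check_lvs`` function is hypothetical; only the decorator line mirrors
this patch):

    from cinder import utils

    @utils.retry(retry=utils.retry_if_exit_code, retry_param=139,
                 interval=0.5, backoff_rate=0.5)
    def check_lvs():
        # Retried only while ProcessExecutionError.exit_code == 139;
        # any other exit code or exception propagates immediately.
        ...

Existing callers keep the default exception-based behavior through the
renamed first argument, e.g.
``@utils.retry(retry_param=exception.VolumeNotDeactivated)``.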
Closes-Bug: #1901783
Change-Id: I6824ba4fbcb6fd8f57f8ff86ad7132446ac6c504
(cherry picked from commit 0b2c0d9238b6f84a02fcd82f97bc5a2d9f681247)
(cherry picked from commit 26919f3f843c301eac5625fc1745d0b3612d2814)
---
 cinder/brick/local_dev/lvm.py                 |  4 ++-
 cinder/tests/unit/brick/test_brick_lvm.py     | 21 +++++++++++++++
 cinder/tests/unit/test_utils.py               | 27 +++++++++++++++++++
 cinder/utils.py                               | 17 +++++++++---
 .../drivers/dell_emc/sc/storagecenter_api.py  | 10 +++----
 cinder/volume/drivers/dell_emc/vnx/client.py  |  2 +-
 .../drivers/inspur/as13000/as13000_driver.py  |  2 +-
 .../lvm-delete-error-f12da00c1b3859dc.yaml    |  6 +++++
 8 files changed, 78 insertions(+), 11 deletions(-)
 create mode 100644 releasenotes/notes/lvm-delete-error-f12da00c1b3859dc.yaml

diff --git a/cinder/brick/local_dev/lvm.py b/cinder/brick/local_dev/lvm.py
index 8e29b8f07d0..e4305eed656 100644
--- a/cinder/brick/local_dev/lvm.py
+++ b/cinder/brick/local_dev/lvm.py
@@ -278,6 +278,8 @@ class LVM(executor.Executor):
         return LVM._supports_pvs_ignoreskippedcluster
 
     @staticmethod
+    @utils.retry(retry=utils.retry_if_exit_code, retry_param=139, interval=0.5,
+                 backoff_rate=0.5)  # Bug#1901783
     def get_lv_info(root_helper, vg_name=None, lv_name=None):
         """Retrieve info about LVs (all, in a VG, or a single LV).
 
@@ -644,7 +646,7 @@ class LVM(executor.Executor):
             # order to prevent a race condition.
             self._wait_for_volume_deactivation(name)
 
-    @utils.retry(exceptions=exception.VolumeNotDeactivated, retries=5,
+    @utils.retry(retry_param=exception.VolumeNotDeactivated, retries=5,
                  backoff_rate=2)
     def _wait_for_volume_deactivation(self, name):
         LOG.debug("Checking to see if volume %s has been deactivated.",
diff --git a/cinder/tests/unit/brick/test_brick_lvm.py b/cinder/tests/unit/brick/test_brick_lvm.py
index 35fe5a4aaa8..b2079f6c24d 100644
--- a/cinder/tests/unit/brick/test_brick_lvm.py
+++ b/cinder/tests/unit/brick/test_brick_lvm.py
@@ -231,6 +231,27 @@ class BrickLvmTestCase(test.TestCase):
                 'sudo', vg_name='fake-vg')
         )
 
+    @mock.patch('tenacity.nap.sleep', mock.Mock())
+    @mock.patch.object(brick.putils, 'execute')
+    def test_get_lv_info_retry(self, exec_mock):
+        exec_mock.side_effect = (
+            processutils.ProcessExecutionError('', '', exit_code=139),
+            ('vg name size', ''),
+        )
+
+        self.assertEqual(
+            [{'name': 'name', 'vg': 'vg', 'size': 'size'}],
+            self.vg.get_lv_info('sudo', vg_name='vg', lv_name='name')
+        )
+
+        self.assertEqual(2, exec_mock.call_count)
+        args = ['env', 'LC_ALL=C', 'lvs', '--noheadings', '--unit=g', '-o',
+                'vg_name,name,size', '--nosuffix', 'vg/name']
+        if self.configuration.lvm_suppress_fd_warnings:
+            args.insert(2, 'LVM_SUPPRESS_FD_WARNINGS=1')
+        lvs_call = mock.call(*args, root_helper='sudo', run_as_root=True)
+        exec_mock.assert_has_calls([lvs_call, lvs_call])
+
     def test_get_all_physical_volumes(self):
         # Filtered VG version
         pvs = self.vg.get_all_physical_volumes('sudo', 'fake-vg')
diff --git a/cinder/tests/unit/test_utils.py b/cinder/tests/unit/test_utils.py
index 4069c7e9525..b44dcaa129f 100644
--- a/cinder/tests/unit/test_utils.py
+++ b/cinder/tests/unit/test_utils.py
@@ -1057,6 +1057,33 @@ class TestRetryDecorator(test.TestCase):
             self.assertRaises(WrongException, raise_unexpected_error)
             self.assertFalse(mock_sleep.called)
 
+    @mock.patch('tenacity.nap.sleep')
+    def test_retry_exit_code(self, sleep_mock):
+
+        exit_code = 5
+        exception = utils.processutils.ProcessExecutionError
+
+        @utils.retry(retry=utils.retry_if_exit_code, retry_param=exit_code)
+        def raise_retriable_exit_code():
+            raise exception(exit_code=exit_code)
+
+        self.assertRaises(exception, raise_retriable_exit_code)
+        self.assertEqual(2, sleep_mock.call_count)
+        sleep_mock.assert_has_calls([mock.call(1), mock.call(2)])
+
+    @mock.patch('tenacity.nap.sleep')
+    def test_retry_exit_code_non_retriable(self, sleep_mock):
+
+        exit_code = 5
+        exception = utils.processutils.ProcessExecutionError
+
+        @utils.retry(retry=utils.retry_if_exit_code, retry_param=exit_code)
+        def raise_non_retriable_exit_code():
+            raise exception(exit_code=exit_code + 1)
+
+        self.assertRaises(exception, raise_non_retriable_exit_code)
+        sleep_mock.assert_not_called()
+
 
 @ddt.ddt
 class LogTracingTestCase(test.TestCase):
diff --git a/cinder/utils.py b/cinder/utils.py
index ee81b5b4a93..4487a673d45 100644
--- a/cinder/utils.py
+++ b/cinder/utils.py
@@ -655,8 +655,19 @@ class ComparableMixin(object):
         return self._compare(other, lambda s, o: s != o)
 
 
-def retry(exceptions, interval=1, retries=3, backoff_rate=2,
-          wait_random=False):
+class retry_if_exit_code(tenacity.retry_if_exception):
+    """Retry on ProcessExecutionError specific exit codes."""
+    def __init__(self, codes):
+        self.codes = (codes,) if isinstance(codes, int) else codes
+        super(retry_if_exit_code, self).__init__(self._check_exit_code)
+
+    def _check_exit_code(self, exc):
+        return (exc and isinstance(exc, processutils.ProcessExecutionError) and
+                exc.exit_code in self.codes)
+
+
+def retry(retry_param, interval=1, retries=3, backoff_rate=2,
+          wait_random=False, retry=tenacity.retry_if_exception_type):
 
     if retries < 1:
         raise ValueError('Retries must be greater than or '
@@ -678,7 +689,7 @@
             after=tenacity.after_log(LOG, logging.DEBUG),
             stop=tenacity.stop_after_attempt(retries),
             reraise=True,
-            retry=tenacity.retry_if_exception_type(exceptions),
+            retry=retry(retry_param),
             wait=wait)
 
         return r.call(f, *args, **kwargs)
diff --git a/cinder/volume/drivers/dell_emc/sc/storagecenter_api.py b/cinder/volume/drivers/dell_emc/sc/storagecenter_api.py
index 19bdcb2f380..7c08a99d942 100644
--- a/cinder/volume/drivers/dell_emc/sc/storagecenter_api.py
+++ b/cinder/volume/drivers/dell_emc/sc/storagecenter_api.py
@@ -232,8 +232,8 @@ class HttpClient(object):
             raise exception.VolumeBackendAPIException(message=msg)
         return rest_response
 
-    @utils.retry(exceptions=(requests.ConnectionError,
-                             DellDriverRetryableException))
+    @utils.retry(retry_param=(requests.ConnectionError,
+                              DellDriverRetryableException))
     def get(self, url):
         LOG.debug('get: %(url)s', {'url': url})
         rest_response = self.session.get(self.__formatUrl(url),
@@ -247,7 +247,7 @@ class HttpClient(object):
                 raise DellDriverRetryableException()
         return rest_response
 
-    @utils.retry(exceptions=(requests.ConnectionError,))
+    @utils.retry(retry_param=(requests.ConnectionError,))
     def post(self, url, payload, async_call=False):
         LOG.debug('post: %(url)s data: %(payload)s',
                   {'url': url,
@@ -261,7 +261,7 @@ class HttpClient(object):
                 self.asynctimeout if async_call else self.synctimeout)),
             async_call)
 
-    @utils.retry(exceptions=(requests.ConnectionError,))
+    @utils.retry(retry_param=(requests.ConnectionError,))
     def put(self, url, payload, async_call=False):
         LOG.debug('put: %(url)s data: %(payload)s',
                   {'url': url,
@@ -275,7 +275,7 @@ class HttpClient(object):
                 self.asynctimeout if async_call else self.synctimeout)),
             async_call)
 
-    @utils.retry(exceptions=(requests.ConnectionError,))
+    @utils.retry(retry_param=(requests.ConnectionError,))
     def delete(self, url, payload=None, async_call=False):
         LOG.debug('delete: %(url)s data: %(payload)s',
                   {'url': url, 'payload': payload})
diff --git a/cinder/volume/drivers/dell_emc/vnx/client.py b/cinder/volume/drivers/dell_emc/vnx/client.py
index 35c73f9056c..94b2ccc272e 100644
--- a/cinder/volume/drivers/dell_emc/vnx/client.py
+++ b/cinder/volume/drivers/dell_emc/vnx/client.py
@@ -200,7 +200,7 @@ class Client(object):
     def modify_lun(self):
         pass
 
-    @cinder_utils.retry(exceptions=const.VNXTargetNotReadyError,
+    @cinder_utils.retry(retry_param=const.VNXTargetNotReadyError,
                         interval=15,
                         retries=5, backoff_rate=1)
     def migrate_lun(self, src_id, dst_id,
diff --git a/cinder/volume/drivers/inspur/as13000/as13000_driver.py b/cinder/volume/drivers/inspur/as13000/as13000_driver.py
index f144860a117..4c0d537918b 100644
--- a/cinder/volume/drivers/inspur/as13000/as13000_driver.py
+++ b/cinder/volume/drivers/inspur/as13000/as13000_driver.py
@@ -704,7 +704,7 @@ class AS13000Driver(san.SanISCSIDriver):
                                      request_type=request_type)
 
     @utils.trace
-    @utils.retry(exceptions=exception.VolumeDriverException,
+    @utils.retry(retry_param=exception.VolumeDriverException,
                  interval=1,
                  retries=3)
     def _add_lun_to_target(self, target_name, volume):
diff --git a/releasenotes/notes/lvm-delete-error-f12da00c1b3859dc.yaml b/releasenotes/notes/lvm-delete-error-f12da00c1b3859dc.yaml
new file mode 100644
index 00000000000..69eba8fd643
--- /dev/null
+++ b/releasenotes/notes/lvm-delete-error-f12da00c1b3859dc.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    LVM driver `bug #1901783
+    <https://bugs.launchpad.net/cinder/+bug/1901783>`_: Fix an unexpected
+    delete volume failure due to exit code 139 on the ``lvs`` command call.
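-- 
Reviewer note (below the signature marker, so git am ignores it): the
decorated ``get_lv_info`` call is roughly what you would get by building
the tenacity retry object by hand. A simplified sketch, assuming the
``retry_if_exit_code`` class added above and a hypothetical ``run_lvs``
callable; the real ``utils.retry`` also derives the wait strategy from
``interval``/``backoff_rate`` instead of the fixed wait used here:

    import tenacity

    r = tenacity.Retrying(
        stop=tenacity.stop_after_attempt(3),  # retries=3 in utils.retry
        wait=tenacity.wait_fixed(0.5),        # simplified wait strategy
        reraise=True,                         # surface the original error
        retry=retry_if_exit_code(139))        # only exit code 139 retries
    result = r.call(run_lvs)

Scoping the retry to exit code 139 keeps the extra delay on the ``lvs``
path small while leaving every other failure mode untouched.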