LVM: Retry lv calls on segfault due to lvs failure
Follow-up to 0b2c0d9238b6f84a02fcd82f97bc5a2d9f681247 and I6824ba4f. Sometimes we get an unexpected failure on the lvs command where it exits with code 139, which is not one of the 5 predefined ones in the code ECMD_PROCESSED, ENO_SUCH_CMD, EINVALID_CMD_LINE, EINIT_FAILED, or ECMD_FAILED. Since the LVM code indicates that a retry can help, we'll retry on error 139 when calling ``get_lv_info``. Retry lvdisplay, lvs and lvcreate calls on segfault. In addition, we only need readonly mode for lvdisplay. Co-Authored-By: Gorka Eguileor <geguileo@redhat.com> Co-Authored-By: Eric Harney <eharney@redhat.com> Closes-Bug: #1901783 Change-Id: I62fadd401591025ecf9f83fa095249008f75bbaa
This commit is contained in:
parent
1d444f0842
commit
80f0bc4f5d
|
@ -531,7 +531,7 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector):
|
|||
with excutils.save_and_reraise_exception():
|
||||
self._cleanup_connection(connection_properties, force=True)
|
||||
|
||||
@utils.retry(exceptions=(exception.VolumeDeviceNotFound))
|
||||
@utils.retry((exception.VolumeDeviceNotFound))
|
||||
def _get_device_link(self, wwn, device, mpath):
|
||||
# These are the default symlinks that should always be there
|
||||
if mpath:
|
||||
|
@ -571,7 +571,7 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector):
|
|||
result['multipath_id'] = wwn
|
||||
return result
|
||||
|
||||
@utils.retry(exceptions=(exception.VolumeDeviceNotFound))
|
||||
@utils.retry((exception.VolumeDeviceNotFound))
|
||||
def _connect_single_volume(self, connection_properties):
|
||||
"""Connect to a volume using a single path."""
|
||||
data = {'stop_connecting': False, 'num_logins': 0, 'failed_logins': 0,
|
||||
|
@ -698,7 +698,7 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector):
|
|||
data['just_added_devices'].append(device)
|
||||
data['stopped_threads'] += 1
|
||||
|
||||
@utils.retry(exceptions=(exception.VolumeDeviceNotFound))
|
||||
@utils.retry((exception.VolumeDeviceNotFound))
|
||||
def _connect_multipath_volume(self, connection_properties):
|
||||
"""Connect to a multipathed volume launching parallel login requests.
|
||||
|
||||
|
|
|
@ -198,7 +198,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
"when running nvme list.")
|
||||
raise exception.CommandExecutionFailed(message=msg)
|
||||
|
||||
@utils.retry(exceptions=exception.VolumePathsNotFound)
|
||||
@utils.retry(exception.VolumePathsNotFound)
|
||||
def _get_device_path(self, current_nvme_devices):
|
||||
all_nvme_devices = self._get_nvme_devices()
|
||||
LOG.debug("all_nvme_devices are %(all_nvme_devices)s",
|
||||
|
@ -208,7 +208,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
raise exception.VolumePathsNotFound()
|
||||
return list(path)
|
||||
|
||||
@utils.retry(exceptions=putils.ProcessExecutionError)
|
||||
@utils.retry(putils.ProcessExecutionError)
|
||||
def _try_connect_nvme(self, cmd):
|
||||
self._execute(*cmd, root_helper=self._root_helper,
|
||||
run_as_root=True)
|
||||
|
@ -240,7 +240,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
|
||||
return ret_val
|
||||
|
||||
@utils.retry(exceptions=exception.NotFound, retries=5)
|
||||
@utils.retry(exception.NotFound, retries=5)
|
||||
def _is_nvme_available(self, nvme_name):
|
||||
nvme_name_pattern = "/dev/%sn[0-9]+" % nvme_name
|
||||
for nvme_dev_name in self._get_nvme_devices():
|
||||
|
@ -594,7 +594,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
raise exception.VolumeDeviceNotFound(device=target_nqn)
|
||||
|
||||
@staticmethod
|
||||
@utils.retry(exceptions=exception.VolumeDeviceNotFound)
|
||||
@utils.retry(exception.VolumeDeviceNotFound)
|
||||
def get_nvme_device_path(executor, target_nqn, vol_uuid):
|
||||
nvme_ctrl = NVMeOFConnector._get_nvme_controller(executor, target_nqn)
|
||||
try:
|
||||
|
|
|
@ -153,7 +153,7 @@ class ScaleIOConnector(base.BaseLinuxConnector):
|
|||
# NOTE: Usually 3 retries is enough to find the volume.
|
||||
# If there are network issues, it could take much longer. Set
|
||||
# the max retries to 15 to make sure we can find the volume.
|
||||
@utils.retry(exceptions=exception.BrickException,
|
||||
@utils.retry(exception.BrickException,
|
||||
retries=15,
|
||||
backoff_rate=1)
|
||||
def _wait_for_volume_path(self, path):
|
||||
|
|
|
@ -387,7 +387,7 @@ class LinuxSCSI(executor.Executor):
|
|||
attempts=3, timeout=300, interval=10,
|
||||
root_helper=self._root_helper)
|
||||
|
||||
@utils.retry(exceptions=exception.VolumeDeviceNotFound)
|
||||
@utils.retry(exception.VolumeDeviceNotFound)
|
||||
def wait_for_path(self, volume_path):
|
||||
"""Wait for a path to show up."""
|
||||
LOG.debug("Checking to see if %s exists yet.",
|
||||
|
@ -399,7 +399,7 @@ class LinuxSCSI(executor.Executor):
|
|||
else:
|
||||
LOG.debug("%s has shown up.", volume_path)
|
||||
|
||||
@utils.retry(exceptions=exception.BlockDeviceReadOnly, retries=5)
|
||||
@utils.retry(exception.BlockDeviceReadOnly, retries=5)
|
||||
def wait_for_rw(self, wwn, device_path):
|
||||
"""Wait for block device to be Read-Write."""
|
||||
LOG.debug("Checking to see if %s is read-only.",
|
||||
|
|
|
@ -143,6 +143,24 @@ class LVM(executor.Executor):
|
|||
cmd = ['vgcreate', self.vg_name, ','.join(pv_list)]
|
||||
self._execute(*cmd, root_helper=self._root_helper, run_as_root=True)
|
||||
|
||||
@utils.retry(retry=utils.retry_if_exit_code, retry_param=139, interval=0.5,
|
||||
backoff_rate=0.5)
|
||||
def _run_lvm_command(self,
|
||||
cmd_arg_list: list,
|
||||
root_helper: str = None,
|
||||
run_as_root: bool = True) -> tuple:
|
||||
"""Run LVM commands with a retry on code 139 to work around LVM bugs.
|
||||
|
||||
Refer to LP bug 1901783, LP bug 1932188.
|
||||
"""
|
||||
if not root_helper:
|
||||
root_helper = self._root_helper
|
||||
|
||||
(out, err) = self._execute(*cmd_arg_list,
|
||||
root_helper=root_helper,
|
||||
run_as_root=run_as_root)
|
||||
return (out, err)
|
||||
|
||||
def _get_vg_uuid(self):
|
||||
cmd = LVM.LVM_CMD_PREFIX + ['vgs', '--noheadings',
|
||||
'-o', 'uuid', self.vg_name]
|
||||
|
@ -172,9 +190,8 @@ class LVM(executor.Executor):
|
|||
free_space = 0.0
|
||||
|
||||
try:
|
||||
(out, err) = self._execute(*cmd,
|
||||
root_helper=self._root_helper,
|
||||
run_as_root=True)
|
||||
(out, err) = self._run_lvm_command(cmd)
|
||||
|
||||
if out is not None:
|
||||
out = out.strip()
|
||||
data = out.split(':')
|
||||
|
@ -279,6 +296,8 @@ class LVM(executor.Executor):
|
|||
return False
|
||||
|
||||
@staticmethod
|
||||
@utils.retry(retry=utils.retry_if_exit_code, retry_param=139, interval=0.5,
|
||||
backoff_rate=0.5) # Bug#1901783
|
||||
def get_lv_info(root_helper, vg_name=None, lv_name=None):
|
||||
"""Retrieve info about LVs (all, in a VG, or a single LV).
|
||||
|
||||
|
@ -537,9 +556,7 @@ class LVM(executor.Executor):
|
|||
'size': size_args,
|
||||
'free': self.vg_free_space})
|
||||
|
||||
self._execute(*cmd,
|
||||
root_helper=self._root_helper,
|
||||
run_as_root=True)
|
||||
self._run_lvm_command(cmd)
|
||||
|
||||
self.vg_thin_pool = name
|
||||
|
||||
|
@ -574,9 +591,7 @@ class LVM(executor.Executor):
|
|||
cmd.extend(['-R', str(rsize)])
|
||||
|
||||
try:
|
||||
self._execute(*cmd,
|
||||
root_helper=self._root_helper,
|
||||
run_as_root=True)
|
||||
self._run_lvm_command(cmd)
|
||||
except putils.ProcessExecutionError as err:
|
||||
LOG.exception('Error creating Volume')
|
||||
LOG.error('Cmd :%s', err.cmd)
|
||||
|
@ -606,9 +621,7 @@ class LVM(executor.Executor):
|
|||
cmd.extend(['-L', '%sg' % (size)])
|
||||
|
||||
try:
|
||||
self._execute(*cmd,
|
||||
root_helper=self._root_helper,
|
||||
run_as_root=True)
|
||||
self._run_lvm_command(cmd)
|
||||
except putils.ProcessExecutionError as err:
|
||||
LOG.exception('Error creating snapshot')
|
||||
LOG.error('Cmd :%s', err.cmd)
|
||||
|
@ -626,9 +639,7 @@ class LVM(executor.Executor):
|
|||
def _lv_is_active(self, name):
|
||||
cmd = LVM.LVM_CMD_PREFIX + ['lvdisplay', '--noheading', '-C', '-o',
|
||||
'Attr', '%s/%s' % (self.vg_name, name)]
|
||||
out, _err = self._execute(*cmd,
|
||||
root_helper=self._root_helper,
|
||||
run_as_root=True)
|
||||
out, _err = self._run_lvm_command(cmd)
|
||||
if out:
|
||||
out = out.strip()
|
||||
# An example output might be '-wi-a----'; the 4th index specifies
|
||||
|
@ -656,8 +667,8 @@ class LVM(executor.Executor):
|
|||
# order to prevent a race condition.
|
||||
self._wait_for_volume_deactivation(name)
|
||||
|
||||
@utils.retry(exceptions=exception.VolumeNotDeactivated, retries=3,
|
||||
backoff_rate=1)
|
||||
@utils.retry(retry_param=exception.VolumeNotDeactivated, retries=5,
|
||||
backoff_rate=2)
|
||||
def _wait_for_volume_deactivation(self, name):
|
||||
LOG.debug("Checking to see if volume %s has been deactivated.",
|
||||
name)
|
||||
|
@ -768,10 +779,9 @@ class LVM(executor.Executor):
|
|||
|
||||
def lv_has_snapshot(self, name):
|
||||
cmd = LVM.LVM_CMD_PREFIX + ['lvdisplay', '--noheading', '-C', '-o',
|
||||
'Attr', '%s/%s' % (self.vg_name, name)]
|
||||
out, _err = self._execute(*cmd,
|
||||
root_helper=self._root_helper,
|
||||
run_as_root=True)
|
||||
'Attr', '--readonly',
|
||||
'%s/%s' % (self.vg_name, name)]
|
||||
out, _err = self._run_lvm_command(cmd)
|
||||
if out:
|
||||
out = out.strip()
|
||||
if (out[0] == 'o') or (out[0] == 'O'):
|
||||
|
|
|
@ -17,6 +17,7 @@ from unittest import mock
|
|||
from oslo_concurrency import processutils
|
||||
|
||||
from os_brick import exception
|
||||
from os_brick import executor as os_brick_executor
|
||||
from os_brick.local_dev import lvm as brick
|
||||
from os_brick.privileged import rootwrap as priv_rootwrap
|
||||
from os_brick.tests import base
|
||||
|
@ -227,6 +228,39 @@ class BrickLvmTestCase(base.TestCase):
|
|||
'sudo', vg_name='fake-vg')
|
||||
)
|
||||
|
||||
@mock.patch('tenacity.nap.sleep', mock.Mock())
|
||||
@mock.patch.object(brick.putils, 'execute')
|
||||
def test_get_lv_info_retry(self, exec_mock):
|
||||
exec_mock.side_effect = (
|
||||
processutils.ProcessExecutionError('', '', exit_code=139),
|
||||
('vg name size', ''),
|
||||
)
|
||||
self.assertEqual(
|
||||
[{'name': 'fake-1', 'size': '1.00g', 'vg': 'fake-vg'},
|
||||
{'name': 'fake-2', 'size': '1.00g', 'vg': 'fake-vg'}],
|
||||
self.vg.get_lv_info('sudo', vg_name='vg', lv_name='name')
|
||||
)
|
||||
|
||||
@mock.patch('tenacity.nap.sleep', mock.Mock())
|
||||
@mock.patch.object(os_brick_executor.Executor, '_execute')
|
||||
def test_get_thin_pool_free_space_retry(self, exec_mock):
|
||||
exec_mock.side_effect = (
|
||||
processutils.ProcessExecutionError('', '', exit_code=139),
|
||||
('15.84:50', ''),
|
||||
)
|
||||
self.assertEqual(
|
||||
7.92,
|
||||
self.vg._get_thin_pool_free_space('vg', 'thinpool')
|
||||
)
|
||||
self.assertEqual(2, exec_mock.call_count)
|
||||
args = ['env', 'LC_ALL=C', 'lvs', '--noheadings', '--unit=g', '-o',
|
||||
'size,data_percent', '--separator', ':', '--nosuffix',
|
||||
'/dev/vg/thinpool']
|
||||
if self.configuration.lvm_suppress_fd_warnings:
|
||||
args.insert(2, 'LVM_SUPPRESS_FD_WARNINGS=1')
|
||||
lvs_call = mock.call(*args, root_helper='sudo', run_as_root=True)
|
||||
exec_mock.assert_has_calls([lvs_call, lvs_call])
|
||||
|
||||
def test_get_all_physical_volumes(self):
|
||||
# Filtered VG version
|
||||
pvs = self.vg.get_all_physical_volumes('sudo', 'fake-vg')
|
||||
|
@ -373,7 +407,7 @@ class BrickLvmTestCase(base.TestCase):
|
|||
self.assertFalse(self.vg.deactivate_lv.called)
|
||||
|
||||
def test_lv_deactivate(self):
|
||||
with mock.patch.object(self.vg, '_execute'):
|
||||
with mock.patch.object(self.vg, '_execute', return_value=(0, 0)):
|
||||
is_active_mock = mock.Mock()
|
||||
is_active_mock.return_value = False
|
||||
self.vg._lv_is_active = is_active_mock
|
||||
|
@ -382,7 +416,7 @@ class BrickLvmTestCase(base.TestCase):
|
|||
|
||||
@mock.patch('os_brick.utils._time_sleep')
|
||||
def test_lv_deactivate_timeout(self, mock_sleep):
|
||||
with mock.patch.object(self.vg, '_execute'):
|
||||
with mock.patch.object(self.vg, '_execute', return_value=(0, 0)):
|
||||
is_active_mock = mock.Mock()
|
||||
is_active_mock.return_value = True
|
||||
self.vg._lv_is_active = is_active_mock
|
||||
|
|
|
@ -31,7 +31,7 @@ class TestRetryDecorator(base.TestCase):
|
|||
self.counter = 0
|
||||
|
||||
with mock.patch.object(utils, '_time_sleep') as mock_sleep:
|
||||
@utils.retry(exceptions=exception.VolumeDeviceNotFound,
|
||||
@utils.retry(exception.VolumeDeviceNotFound,
|
||||
interval=2,
|
||||
retries=3,
|
||||
backoff_rate=2)
|
||||
|
@ -100,13 +100,35 @@ class TestRetryDecorator(base.TestCase):
|
|||
def test_wrong_exception_no_retry(self):
|
||||
|
||||
with mock.patch.object(utils, '_time_sleep') as mock_sleep:
|
||||
@utils.retry(exceptions=exception.VolumeDeviceNotFound)
|
||||
@utils.retry(exception.VolumeDeviceNotFound)
|
||||
def raise_unexpected_error():
|
||||
raise WrongException("wrong exception")
|
||||
|
||||
self.assertRaises(WrongException, raise_unexpected_error)
|
||||
self.assertFalse(mock_sleep.called)
|
||||
|
||||
@mock.patch('tenacity.nap.sleep')
|
||||
def test_retry_exit_code(self, sleep_mock):
|
||||
exit_code = 5
|
||||
exception = utils.processutils.ProcessExecutionError
|
||||
|
||||
@utils.retry(retry=utils.retry_if_exit_code, retry_param=exit_code)
|
||||
def raise_retriable_exit_code():
|
||||
raise exception(exit_code=exit_code)
|
||||
self.assertRaises(exception, raise_retriable_exit_code)
|
||||
self.assertEqual(0, sleep_mock.call_count)
|
||||
|
||||
@mock.patch('tenacity.nap.sleep')
|
||||
def test_retry_exit_code_non_retriable(self, sleep_mock):
|
||||
exit_code = 5
|
||||
exception = utils.processutils.ProcessExecutionError
|
||||
|
||||
@utils.retry(retry=utils.retry_if_exit_code, retry_param=exit_code)
|
||||
def raise_non_retriable_exit_code():
|
||||
raise exception(exit_code=exit_code + 1)
|
||||
self.assertRaises(exception, raise_non_retriable_exit_code)
|
||||
sleep_mock.assert_not_called()
|
||||
|
||||
|
||||
class LogTracingTestCase(base.TestCase):
|
||||
"""Test out the log tracing."""
|
||||
|
|
|
@ -17,6 +17,7 @@ import inspect
|
|||
import logging as py_logging
|
||||
import time
|
||||
|
||||
from oslo_concurrency import processutils
|
||||
from oslo_log import log as logging
|
||||
from oslo_utils import strutils
|
||||
|
||||
|
@ -44,7 +45,19 @@ import tenacity # noqa
|
|||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def retry(exceptions, interval=1, retries=3, backoff_rate=2):
|
||||
class retry_if_exit_code(tenacity.retry_if_exception):
|
||||
"""Retry on ProcessExecutionError specific exit codes."""
|
||||
def __init__(self, codes):
|
||||
self.codes = (codes,) if isinstance(codes, int) else codes
|
||||
super(retry_if_exit_code, self).__init__(self._check_exit_code)
|
||||
|
||||
def _check_exit_code(self, exc):
|
||||
return (exc and isinstance(exc, processutils.ProcessExecutionError) and
|
||||
exc.exit_code in self.codes)
|
||||
|
||||
|
||||
def retry(retry_param, interval=1, retries=3, backoff_rate=2,
|
||||
retry=tenacity.retry_if_exception_type):
|
||||
|
||||
if retries < 1:
|
||||
raise ValueError(_('Retries must be greater than or '
|
||||
|
@ -59,7 +72,7 @@ def retry(exceptions, interval=1, retries=3, backoff_rate=2):
|
|||
after=tenacity.after_log(LOG, logging.DEBUG),
|
||||
stop=tenacity.stop_after_attempt(retries),
|
||||
reraise=True,
|
||||
retry=tenacity.retry_if_exception_type(exceptions),
|
||||
retry=retry(retry_param),
|
||||
wait=tenacity.wait_exponential(
|
||||
multiplier=interval, min=0, exp_base=backoff_rate))
|
||||
return r.call(f, *args, **kwargs)
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
LVM driver `bug #1901783
|
||||
<https://bugs.launchpad.net/cinder/+bug/1901783>`_: Fix unexpected delete
|
||||
volume failure due to unexpected exit code 139 on ``lvs`` command call.
|
||||
|
Loading…
Reference in New Issue