LVM: Retry lv calls on segfault due to lvs failure

Follow-up to 0b2c0d9238b6f84a02fcd82f97bc5a2d9f681247 and I6824ba4f.

Sometimes we get an unexpected failure on the lvs command where it exits
with code 139, which is not one of the 5 predefined ones in the code
ECMD_PROCESSED, ENO_SUCH_CMD, EINVALID_CMD_LINE, EINIT_FAILED, or
ECMD_FAILED.

Since the LVM code indicates that a retry can help, we'll retry on error
139 when calling ``get_lv_info``. Retry lvdisplay, lvs and lvcreate calls
on segfault.

In addition, we only need readonly mode for lvdisplay.

Co-Authored-By: Gorka Eguileor <geguileo@redhat.com>
Co-Authored-By: Eric Harney <eharney@redhat.com>
Closes-Bug: #1901783
Change-Id: I62fadd401591025ecf9f83fa095249008f75bbaa
This commit is contained in:
Sofia Enriquez 2021-06-29 19:57:54 +00:00
parent 1d444f0842
commit 80f0bc4f5d
9 changed files with 123 additions and 37 deletions

View File

@ -531,7 +531,7 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector):
with excutils.save_and_reraise_exception():
self._cleanup_connection(connection_properties, force=True)
@utils.retry(exceptions=(exception.VolumeDeviceNotFound))
@utils.retry((exception.VolumeDeviceNotFound))
def _get_device_link(self, wwn, device, mpath):
# These are the default symlinks that should always be there
if mpath:
@ -571,7 +571,7 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector):
result['multipath_id'] = wwn
return result
@utils.retry(exceptions=(exception.VolumeDeviceNotFound))
@utils.retry((exception.VolumeDeviceNotFound))
def _connect_single_volume(self, connection_properties):
"""Connect to a volume using a single path."""
data = {'stop_connecting': False, 'num_logins': 0, 'failed_logins': 0,
@ -698,7 +698,7 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector):
data['just_added_devices'].append(device)
data['stopped_threads'] += 1
@utils.retry(exceptions=(exception.VolumeDeviceNotFound))
@utils.retry((exception.VolumeDeviceNotFound))
def _connect_multipath_volume(self, connection_properties):
"""Connect to a multipathed volume launching parallel login requests.

View File

@ -198,7 +198,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
"when running nvme list.")
raise exception.CommandExecutionFailed(message=msg)
@utils.retry(exceptions=exception.VolumePathsNotFound)
@utils.retry(exception.VolumePathsNotFound)
def _get_device_path(self, current_nvme_devices):
all_nvme_devices = self._get_nvme_devices()
LOG.debug("all_nvme_devices are %(all_nvme_devices)s",
@ -208,7 +208,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
raise exception.VolumePathsNotFound()
return list(path)
@utils.retry(exceptions=putils.ProcessExecutionError)
@utils.retry(putils.ProcessExecutionError)
def _try_connect_nvme(self, cmd):
self._execute(*cmd, root_helper=self._root_helper,
run_as_root=True)
@ -240,7 +240,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
return ret_val
@utils.retry(exceptions=exception.NotFound, retries=5)
@utils.retry(exception.NotFound, retries=5)
def _is_nvme_available(self, nvme_name):
nvme_name_pattern = "/dev/%sn[0-9]+" % nvme_name
for nvme_dev_name in self._get_nvme_devices():
@ -594,7 +594,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
raise exception.VolumeDeviceNotFound(device=target_nqn)
@staticmethod
@utils.retry(exceptions=exception.VolumeDeviceNotFound)
@utils.retry(exception.VolumeDeviceNotFound)
def get_nvme_device_path(executor, target_nqn, vol_uuid):
nvme_ctrl = NVMeOFConnector._get_nvme_controller(executor, target_nqn)
try:

View File

@ -153,7 +153,7 @@ class ScaleIOConnector(base.BaseLinuxConnector):
# NOTE: Usually 3 retries is enough to find the volume.
# If there are network issues, it could take much longer. Set
# the max retries to 15 to make sure we can find the volume.
@utils.retry(exceptions=exception.BrickException,
@utils.retry(exception.BrickException,
retries=15,
backoff_rate=1)
def _wait_for_volume_path(self, path):

View File

@ -387,7 +387,7 @@ class LinuxSCSI(executor.Executor):
attempts=3, timeout=300, interval=10,
root_helper=self._root_helper)
@utils.retry(exceptions=exception.VolumeDeviceNotFound)
@utils.retry(exception.VolumeDeviceNotFound)
def wait_for_path(self, volume_path):
"""Wait for a path to show up."""
LOG.debug("Checking to see if %s exists yet.",
@ -399,7 +399,7 @@ class LinuxSCSI(executor.Executor):
else:
LOG.debug("%s has shown up.", volume_path)
@utils.retry(exceptions=exception.BlockDeviceReadOnly, retries=5)
@utils.retry(exception.BlockDeviceReadOnly, retries=5)
def wait_for_rw(self, wwn, device_path):
"""Wait for block device to be Read-Write."""
LOG.debug("Checking to see if %s is read-only.",

View File

@ -143,6 +143,24 @@ class LVM(executor.Executor):
cmd = ['vgcreate', self.vg_name, ','.join(pv_list)]
self._execute(*cmd, root_helper=self._root_helper, run_as_root=True)
@utils.retry(retry=utils.retry_if_exit_code, retry_param=139, interval=0.5,
backoff_rate=0.5)
def _run_lvm_command(self,
cmd_arg_list: list,
root_helper: str = None,
run_as_root: bool = True) -> tuple:
"""Run LVM commands with a retry on code 139 to work around LVM bugs.
Refer to LP bug 1901783, LP bug 1932188.
"""
if not root_helper:
root_helper = self._root_helper
(out, err) = self._execute(*cmd_arg_list,
root_helper=root_helper,
run_as_root=run_as_root)
return (out, err)
def _get_vg_uuid(self):
cmd = LVM.LVM_CMD_PREFIX + ['vgs', '--noheadings',
'-o', 'uuid', self.vg_name]
@ -172,9 +190,8 @@ class LVM(executor.Executor):
free_space = 0.0
try:
(out, err) = self._execute(*cmd,
root_helper=self._root_helper,
run_as_root=True)
(out, err) = self._run_lvm_command(cmd)
if out is not None:
out = out.strip()
data = out.split(':')
@ -279,6 +296,8 @@ class LVM(executor.Executor):
return False
@staticmethod
@utils.retry(retry=utils.retry_if_exit_code, retry_param=139, interval=0.5,
backoff_rate=0.5) # Bug#1901783
def get_lv_info(root_helper, vg_name=None, lv_name=None):
"""Retrieve info about LVs (all, in a VG, or a single LV).
@ -537,9 +556,7 @@ class LVM(executor.Executor):
'size': size_args,
'free': self.vg_free_space})
self._execute(*cmd,
root_helper=self._root_helper,
run_as_root=True)
self._run_lvm_command(cmd)
self.vg_thin_pool = name
@ -574,9 +591,7 @@ class LVM(executor.Executor):
cmd.extend(['-R', str(rsize)])
try:
self._execute(*cmd,
root_helper=self._root_helper,
run_as_root=True)
self._run_lvm_command(cmd)
except putils.ProcessExecutionError as err:
LOG.exception('Error creating Volume')
LOG.error('Cmd :%s', err.cmd)
@ -606,9 +621,7 @@ class LVM(executor.Executor):
cmd.extend(['-L', '%sg' % (size)])
try:
self._execute(*cmd,
root_helper=self._root_helper,
run_as_root=True)
self._run_lvm_command(cmd)
except putils.ProcessExecutionError as err:
LOG.exception('Error creating snapshot')
LOG.error('Cmd :%s', err.cmd)
@ -626,9 +639,7 @@ class LVM(executor.Executor):
def _lv_is_active(self, name):
cmd = LVM.LVM_CMD_PREFIX + ['lvdisplay', '--noheading', '-C', '-o',
'Attr', '%s/%s' % (self.vg_name, name)]
out, _err = self._execute(*cmd,
root_helper=self._root_helper,
run_as_root=True)
out, _err = self._run_lvm_command(cmd)
if out:
out = out.strip()
# An example output might be '-wi-a----'; the 4th index specifies
@ -656,8 +667,8 @@ class LVM(executor.Executor):
# order to prevent a race condition.
self._wait_for_volume_deactivation(name)
@utils.retry(exceptions=exception.VolumeNotDeactivated, retries=3,
backoff_rate=1)
@utils.retry(retry_param=exception.VolumeNotDeactivated, retries=5,
backoff_rate=2)
def _wait_for_volume_deactivation(self, name):
LOG.debug("Checking to see if volume %s has been deactivated.",
name)
@ -768,10 +779,9 @@ class LVM(executor.Executor):
def lv_has_snapshot(self, name):
cmd = LVM.LVM_CMD_PREFIX + ['lvdisplay', '--noheading', '-C', '-o',
'Attr', '%s/%s' % (self.vg_name, name)]
out, _err = self._execute(*cmd,
root_helper=self._root_helper,
run_as_root=True)
'Attr', '--readonly',
'%s/%s' % (self.vg_name, name)]
out, _err = self._run_lvm_command(cmd)
if out:
out = out.strip()
if (out[0] == 'o') or (out[0] == 'O'):

View File

@ -17,6 +17,7 @@ from unittest import mock
from oslo_concurrency import processutils
from os_brick import exception
from os_brick import executor as os_brick_executor
from os_brick.local_dev import lvm as brick
from os_brick.privileged import rootwrap as priv_rootwrap
from os_brick.tests import base
@ -227,6 +228,39 @@ class BrickLvmTestCase(base.TestCase):
'sudo', vg_name='fake-vg')
)
@mock.patch('tenacity.nap.sleep', mock.Mock())
@mock.patch.object(brick.putils, 'execute')
def test_get_lv_info_retry(self, exec_mock):
exec_mock.side_effect = (
processutils.ProcessExecutionError('', '', exit_code=139),
('vg name size', ''),
)
self.assertEqual(
[{'name': 'fake-1', 'size': '1.00g', 'vg': 'fake-vg'},
{'name': 'fake-2', 'size': '1.00g', 'vg': 'fake-vg'}],
self.vg.get_lv_info('sudo', vg_name='vg', lv_name='name')
)
@mock.patch('tenacity.nap.sleep', mock.Mock())
@mock.patch.object(os_brick_executor.Executor, '_execute')
def test_get_thin_pool_free_space_retry(self, exec_mock):
exec_mock.side_effect = (
processutils.ProcessExecutionError('', '', exit_code=139),
('15.84:50', ''),
)
self.assertEqual(
7.92,
self.vg._get_thin_pool_free_space('vg', 'thinpool')
)
self.assertEqual(2, exec_mock.call_count)
args = ['env', 'LC_ALL=C', 'lvs', '--noheadings', '--unit=g', '-o',
'size,data_percent', '--separator', ':', '--nosuffix',
'/dev/vg/thinpool']
if self.configuration.lvm_suppress_fd_warnings:
args.insert(2, 'LVM_SUPPRESS_FD_WARNINGS=1')
lvs_call = mock.call(*args, root_helper='sudo', run_as_root=True)
exec_mock.assert_has_calls([lvs_call, lvs_call])
def test_get_all_physical_volumes(self):
# Filtered VG version
pvs = self.vg.get_all_physical_volumes('sudo', 'fake-vg')
@ -373,7 +407,7 @@ class BrickLvmTestCase(base.TestCase):
self.assertFalse(self.vg.deactivate_lv.called)
def test_lv_deactivate(self):
with mock.patch.object(self.vg, '_execute'):
with mock.patch.object(self.vg, '_execute', return_value=(0, 0)):
is_active_mock = mock.Mock()
is_active_mock.return_value = False
self.vg._lv_is_active = is_active_mock
@ -382,7 +416,7 @@ class BrickLvmTestCase(base.TestCase):
@mock.patch('os_brick.utils._time_sleep')
def test_lv_deactivate_timeout(self, mock_sleep):
with mock.patch.object(self.vg, '_execute'):
with mock.patch.object(self.vg, '_execute', return_value=(0, 0)):
is_active_mock = mock.Mock()
is_active_mock.return_value = True
self.vg._lv_is_active = is_active_mock

View File

@ -31,7 +31,7 @@ class TestRetryDecorator(base.TestCase):
self.counter = 0
with mock.patch.object(utils, '_time_sleep') as mock_sleep:
@utils.retry(exceptions=exception.VolumeDeviceNotFound,
@utils.retry(exception.VolumeDeviceNotFound,
interval=2,
retries=3,
backoff_rate=2)
@ -100,13 +100,35 @@ class TestRetryDecorator(base.TestCase):
def test_wrong_exception_no_retry(self):
with mock.patch.object(utils, '_time_sleep') as mock_sleep:
@utils.retry(exceptions=exception.VolumeDeviceNotFound)
@utils.retry(exception.VolumeDeviceNotFound)
def raise_unexpected_error():
raise WrongException("wrong exception")
self.assertRaises(WrongException, raise_unexpected_error)
self.assertFalse(mock_sleep.called)
@mock.patch('tenacity.nap.sleep')
def test_retry_exit_code(self, sleep_mock):
exit_code = 5
exception = utils.processutils.ProcessExecutionError
@utils.retry(retry=utils.retry_if_exit_code, retry_param=exit_code)
def raise_retriable_exit_code():
raise exception(exit_code=exit_code)
self.assertRaises(exception, raise_retriable_exit_code)
self.assertEqual(0, sleep_mock.call_count)
@mock.patch('tenacity.nap.sleep')
def test_retry_exit_code_non_retriable(self, sleep_mock):
exit_code = 5
exception = utils.processutils.ProcessExecutionError
@utils.retry(retry=utils.retry_if_exit_code, retry_param=exit_code)
def raise_non_retriable_exit_code():
raise exception(exit_code=exit_code + 1)
self.assertRaises(exception, raise_non_retriable_exit_code)
sleep_mock.assert_not_called()
class LogTracingTestCase(base.TestCase):
"""Test out the log tracing."""

View File

@ -17,6 +17,7 @@ import inspect
import logging as py_logging
import time
from oslo_concurrency import processutils
from oslo_log import log as logging
from oslo_utils import strutils
@ -44,7 +45,19 @@ import tenacity # noqa
LOG = logging.getLogger(__name__)
def retry(exceptions, interval=1, retries=3, backoff_rate=2):
class retry_if_exit_code(tenacity.retry_if_exception):
"""Retry on ProcessExecutionError specific exit codes."""
def __init__(self, codes):
self.codes = (codes,) if isinstance(codes, int) else codes
super(retry_if_exit_code, self).__init__(self._check_exit_code)
def _check_exit_code(self, exc):
return (exc and isinstance(exc, processutils.ProcessExecutionError) and
exc.exit_code in self.codes)
def retry(retry_param, interval=1, retries=3, backoff_rate=2,
retry=tenacity.retry_if_exception_type):
if retries < 1:
raise ValueError(_('Retries must be greater than or '
@ -59,7 +72,7 @@ def retry(exceptions, interval=1, retries=3, backoff_rate=2):
after=tenacity.after_log(LOG, logging.DEBUG),
stop=tenacity.stop_after_attempt(retries),
reraise=True,
retry=tenacity.retry_if_exception_type(exceptions),
retry=retry(retry_param),
wait=tenacity.wait_exponential(
multiplier=interval, min=0, exp_base=backoff_rate))
return r.call(f, *args, **kwargs)

View File

@ -0,0 +1,7 @@
---
fixes:
- |
LVM driver `bug #1901783
<https://bugs.launchpad.net/cinder/+bug/1901783>`_: Fix unexpected delete
volume failure due to unexpected exit code 139 on ``lvs`` command call.