NetApp SolidFire: Fix clone and request timeout issues

Users are experiencing timeout issues in certain environments, mostly
when volumes are very large (e.g. multi-terabyte volumes), when network
performance is poor, or when upgrade issues affect the SolidFire
cluster. A viable solution is to make the driver timeout values
configurable in cinder.conf, so users can set these timeouts
according to their needs.

This patch adds two timeout settings to the SolidFire driver,
sf_volume_clone_timeout for clone operations and
sf_api_request_timeout for all API requests, to allow users to set
the appropriate timeouts for their environment.

Closes-Bug: #1898587
Change-Id: Ie330c76a5db0ea76d4fed5a6ae7b8736dadc8591
(cherry picked from commit c607a82a99)
(cherry picked from commit 3b2e7d662b)
(cherry picked from commit b790e81e74)
This commit is contained in:
Fernando Ferraz 2020-10-01 11:20:51 -03:00 committed by Fernando Ferraz Silva
parent 88133fae47
commit ddb88caad8
3 changed files with 65 additions and 20 deletions

View File

@ -180,8 +180,8 @@ class SolidFireVolumeTestCase(test.TestCase):
'volumeID': 6}]
def fake_issue_api_request(self, method, params, version='1.0',
endpoint=None):
if method is 'GetClusterCapacity':
endpoint=None, timeout=None):
if method == 'GetClusterCapacity':
data = {}
if version == '1.0':
data = {'result': {'clusterCapacity': {
@ -597,6 +597,12 @@ class SolidFireVolumeTestCase(test.TestCase):
'volume_type_id': None,
'created_at': timeutils.utcnow()}
fake_model_info = {
'provider_id': '%s %s cluster-id-01' % (
self.fake_sfvol['volumeID'],
self.fake_sfaccount['accountID'])
}
ctx = context.get_admin_context()
testvol = fake_volume.fake_volume_obj(ctx, **updates_vol_a)
testvol_b = fake_volume.fake_volume_obj(ctx, **updates_vol_b)
@ -616,7 +622,7 @@ class SolidFireVolumeTestCase(test.TestCase):
return_value=[]), \
mock.patch.object(sfv,
'_get_model_info',
return_value={}):
return_value=fake_model_info):
sfv.create_cloned_volume(testvol_b, testvol)
def test_initialize_connector_with_blocksizes(self):
@ -2948,6 +2954,7 @@ class SolidFireVolumeTestCase(test.TestCase):
'mvip': self.mvip,
'svip': self.svip}
self.configuration.sf_volume_clone_timeout = 1
sfv = solidfire.SolidFireDriver(configuration=self.configuration)
sfv.replication_enabled = False
@ -2992,7 +2999,7 @@ class SolidFireVolumeTestCase(test.TestCase):
mock_issue_api_request.assert_has_calls(calls)
mock_test_set_cluster_pairs.assert_not_called()
mock_update_attributes.assert_not_called()
mock_get_model_info.assert_called_once()
mock_get_model_info.assert_called()
mock_snapshot_discovery.assert_not_called()
reset_mocks()

View File

@ -24,6 +24,7 @@ import warnings
from oslo_config import cfg
from oslo_log import log as logging
from oslo_service import loopingcall
from oslo_utils import excutils
from oslo_utils import timeutils
from oslo_utils import units
@ -86,7 +87,18 @@ sf_opts = [
'provisioning calculations. If this parameter is set to '
'\'usedSpace\', the driver will report correct '
'values as expected by Cinder '
'thin provisioning.')]
'thin provisioning.'),
cfg.IntOpt('sf_api_request_timeout',
default=30,
min=30,
help='Sets time in seconds to wait for an api request to '
'complete.'),
cfg.IntOpt('sf_volume_clone_timeout',
default=600,
min=60,
help='Sets time in seconds to wait for a clone of a volume or '
'snapshot to complete.'
)]
CONF = cfg.CONF
CONF.register_opts(sf_opts, group=configuration.SHARED_CONF_GROUP)
@ -582,11 +594,14 @@ class SolidFireDriver(san.SanISCSIDriver):
return endpoint
@retry(retry_exc_tuple, tries=6)
def _issue_api_request(self, method, params, version='1.0', endpoint=None):
def _issue_api_request(self, method, params, version='1.0',
endpoint=None, timeout=None):
if params is None:
params = {}
if endpoint is None:
endpoint = self.active_cluster['endpoint']
if not timeout:
timeout = self.configuration.sf_api_request_timeout
payload = {'method': method, 'params': params}
url = '%s/json-rpc/%s/' % (endpoint['url'], version)
@ -598,7 +613,7 @@ class SolidFireDriver(san.SanISCSIDriver):
data=json.dumps(payload),
auth=(endpoint['login'], endpoint['passwd']),
verify=self.verify_ssl,
timeout=30)
timeout=timeout)
response = req.json()
req.close()
if (('error' in response) and
@ -785,15 +800,13 @@ class SolidFireDriver(san.SanISCSIDriver):
def _get_model_info(self, sfaccount, sf_volume_id, endpoint=None):
volume = None
iteration_count = 0
while not volume and iteration_count < 600:
volume_list = self._get_volumes_by_sfaccount(
sfaccount['accountID'], endpoint=endpoint)
for v in volume_list:
if v['volumeID'] == sf_volume_id:
volume = v
break
iteration_count += 1
volume_list = self._get_volumes_by_sfaccount(
sfaccount['accountID'], endpoint=endpoint)
for v in volume_list:
if v['volumeID'] == sf_volume_id:
volume = v
break
if not volume:
LOG.error('Failed to retrieve volume SolidFire-'
@ -863,10 +876,28 @@ class SolidFireDriver(san.SanISCSIDriver):
params['volumeID'] = sf_cloned_id
data = self._issue_api_request('ModifyVolume', params)
model_update = self._get_model_info(sf_account, sf_cloned_id)
if model_update is None:
mesg = _('Failed to get model update from clone')
raise SolidFireAPIException(mesg)
def _wait_volume_is_active():
try:
model_info = self._get_model_info(sf_account, sf_cloned_id)
if model_info:
raise loopingcall.LoopingCallDone(model_info)
except exception.VolumeNotFound:
LOG.debug('Waiting for cloned volume [%s] - [%s] to become '
'active', sf_cloned_id, vref.id)
pass
try:
timer = loopingcall.FixedIntervalWithTimeoutLoopingCall(
_wait_volume_is_active)
model_update = timer.start(
interval=1,
timeout=self.configuration.sf_volume_clone_timeout).wait()
except loopingcall.LoopingCallTimeOut:
msg = (_('Failed to get model update from clone '
'%(cloned_id)s - %(vref_id)s') %
{'cloned_id': sf_cloned_id, 'vref_id': vref.id})
LOG.error(msg)
raise SolidFireAPIException(msg)
rep_settings = self._retrieve_replication_settings(vref)
if self.replication_enabled and rep_settings:

View File

@ -0,0 +1,7 @@
---
fixes:
- |
`Bug #1898587 <https://bugs.launchpad.net/cinder/+bug/1898587>`_:
Address cloning and API request timeout issues users may hit in
certain environments, by allowing timeout values for these
operations to be configured through the Cinder configuration file.