NetApp SolidFire: Fix clone and request timeout issues

Users are experiencing timeout issues in certain environments, mostly
when volumes are too big (ie. multi-terabyte volumes), due to poor
network performance or upgrade issues that revolve around the SolidFire
cluster. A viable solution is to make driver timeout values
configurable in cinder.conf, so users can set these timeouts
according to their needs.

This patch adds two timeout settings to the SolidFire driver
(for cloning operation and globally to all api requests), to
allow users to set the appropriate timeouts for their environment.

Closes-Bug: #1898587
Change-Id: Ie330c76a5db0ea76d4fed5a6ae7b8736dadc8591
This commit is contained in:
Fernando Ferraz 2020-10-01 11:20:51 -03:00
parent 11fa011c86
commit c607a82a99
3 changed files with 62 additions and 19 deletions

View File

@ -221,7 +221,7 @@ class SolidFireVolumeTestCase(test.TestCase):
'volumeID': 6}]
def fake_issue_api_request(self, method, params, version='1.0',
endpoint=None):
endpoint=None, timeout=None):
if method == 'GetClusterCapacity':
data = {}
if version == '1.0':
@ -638,6 +638,12 @@ class SolidFireVolumeTestCase(test.TestCase):
'volume_type_id': None,
'created_at': timeutils.utcnow()}
fake_model_info = {
'provider_id': '%s %s cluster-id-01' % (
self.fake_sfvol['volumeID'],
self.fake_sfaccount['accountID'])
}
ctx = context.get_admin_context()
testvol = fake_volume.fake_volume_obj(ctx, **updates_vol_a)
testvol_b = fake_volume.fake_volume_obj(ctx, **updates_vol_b)
@ -657,7 +663,7 @@ class SolidFireVolumeTestCase(test.TestCase):
return_value=[]), \
mock.patch.object(sfv,
'_get_model_info',
return_value={}):
return_value=fake_model_info):
sfv.create_cloned_volume(testvol_b, testvol)
def test_initialize_connector_with_blocksizes(self):
@ -3041,6 +3047,7 @@ class SolidFireVolumeTestCase(test.TestCase):
'mvip': self.mvip,
'svip': self.svip}
self.configuration.sf_volume_clone_timeout = 1
sfv = solidfire.SolidFireDriver(configuration=self.configuration)
sfv.replication_enabled = False
@ -3085,7 +3092,7 @@ class SolidFireVolumeTestCase(test.TestCase):
mock_issue_api_request.assert_has_calls(calls)
mock_test_set_cluster_pairs.assert_not_called()
mock_update_attributes.assert_not_called()
mock_get_model_info.assert_called_once()
mock_get_model_info.assert_called()
mock_snapshot_discovery.assert_not_called()
reset_mocks()

View File

@ -100,7 +100,18 @@ sf_opts = [
default=3600,
min=30,
help='Sets time in seconds to wait for a migrating volume to '
'complete pairing and sync.')]
'complete pairing and sync.'),
cfg.IntOpt('sf_api_request_timeout',
default=30,
min=30,
help='Sets time in seconds to wait for an api request to '
'complete.'),
cfg.IntOpt('sf_volume_clone_timeout',
default=600,
min=60,
help='Sets time in seconds to wait for a clone of a volume or '
'snapshot to complete.'
)]
CONF = cfg.CONF
CONF.register_opts(sf_opts, group=configuration.SHARED_CONF_GROUP)
@ -656,11 +667,14 @@ class SolidFireDriver(san.SanISCSIDriver):
return endpoint
@retry(retry_exc_tuple, tries=6)
def _issue_api_request(self, method, params, version='1.0', endpoint=None):
def _issue_api_request(self, method, params, version='1.0',
endpoint=None, timeout=None):
if params is None:
params = {}
if endpoint is None:
endpoint = self.active_cluster['endpoint']
if not timeout:
timeout = self.configuration.sf_api_request_timeout
payload = {'method': method, 'params': params}
url = '%s/json-rpc/%s/' % (endpoint['url'], version)
@ -672,7 +686,7 @@ class SolidFireDriver(san.SanISCSIDriver):
data=json.dumps(payload),
auth=(endpoint['login'], endpoint['passwd']),
verify=self.verify_ssl,
timeout=30)
timeout=timeout)
response = req.json()
req.close()
if (('error' in response) and
@ -859,15 +873,13 @@ class SolidFireDriver(san.SanISCSIDriver):
def _get_model_info(self, sfaccount, sf_volume_id, endpoint=None):
volume = None
iteration_count = 0
while not volume and iteration_count < 600:
volume_list = self._get_volumes_by_sfaccount(
sfaccount['accountID'], endpoint=endpoint)
for v in volume_list:
if v['volumeID'] == sf_volume_id:
volume = v
break
iteration_count += 1
volume_list = self._get_volumes_by_sfaccount(
sfaccount['accountID'], endpoint=endpoint)
for v in volume_list:
if v['volumeID'] == sf_volume_id:
volume = v
break
if not volume:
LOG.error('Failed to retrieve volume SolidFire-'
@ -937,10 +949,27 @@ class SolidFireDriver(san.SanISCSIDriver):
params['volumeID'] = sf_cloned_id
data = self._issue_api_request('ModifyVolume', params)
model_update = self._get_model_info(sf_account, sf_cloned_id)
if model_update is None:
mesg = _('Failed to get model update from clone')
raise SolidFireAPIException(mesg)
def _wait_volume_is_active():
try:
model_info = self._get_model_info(sf_account, sf_cloned_id)
if model_info:
raise loopingcall.LoopingCallDone(model_info)
except exception.VolumeNotFound:
LOG.debug('Waiting for cloned volume [%s] - [%s] to become '
'active', sf_cloned_id, vref.id)
pass
try:
timer = loopingcall.FixedIntervalWithTimeoutLoopingCall(
_wait_volume_is_active)
model_update = timer.start(
interval=1,
timeout=self.configuration.sf_volume_clone_timeout).wait()
except loopingcall.LoopingCallTimeOut:
msg = _('Failed to get model update from clone [%s] - [%s]' %
(sf_cloned_id, vref.id))
LOG.error(msg)
raise SolidFireAPIException(msg)
rep_settings = self._retrieve_replication_settings(vref)
if self.replication_enabled and rep_settings:

View File

@ -0,0 +1,7 @@
---
fixes:
- |
`Bug #1898587 <https://bugs.launchpad.net/cinder/+bug/1898587>`_:
Address cloning and api request timeout issues users may hit in
certain environments, by allowing configuring timeout values for
these operations through cinder configuration file.