Browse Source

NetApp SolidFire: Fix duplicate volume when API response is lost

The SolidFire driver retries API requests when a connection
error occurs. When the network is unstable, the SolidFire
backend may successfully receive and process a create volume
operation, but fail to deliver the response back to the driver.

When this scenario occurs, the SolidFire driver automatically
resends the request, creating a second volume and leaving a
duplicate unused. Although this doesn't affect driver
functionality at first (the volume id from the cluster is
always correctly associated to the cinder provider id),
further operations may hit the unused volume, leading to
unexpected behavior.

This patch fixes this issue by:

1. Checking if the volume name already exists in the
backend before trying to create it. Volume creation will
raise an exception and abort if such a volume is found.

2. Checking for volume creation right after a read timeout is
detected, preventing invalid API calls.

3. Adding the option 'sf_volume_create_timeout' to the SolidFire
driver, to allow users to set the appropriate timeout value for
their environment.

Closes-Bug: #1896112
Change-Id: I4383b691a8cc4aacb046332e418aafb88ba8ba56
(cherry picked from commit 42c92cc407)
(cherry picked from commit deb31a0c4d)
(cherry picked from commit d73da73283)
changes/75/764275/9
Fernando Ferraz 7 months ago
parent
commit
f70bfbf711
3 changed files with 85 additions and 8 deletions
  1. +6
    -3
      cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
  2. +65
    -5
      cinder/volume/drivers/solidfire.py
  3. +14
    -0
      releasenotes/notes/sf-fix-duplicate-volume-request-lost-adefacda1298dc62.yaml

+ 6
- 3
cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py View File

@ -329,11 +329,14 @@ class SolidFireVolumeTestCase(test.TestCase):
'attributes': {'uuid': f_uuid[1]},
'qos': None,
'iqn': test_name}]}}
if params and params['startVolumeID']:
if params and params.get('startVolumeID', None):
volumes = result['result']['volumes']
selected_volumes = [v for v in volumes if v.get('volumeID')
!= params['startVolumeID']]
selected_volumes = [v for v in volumes if v.get('volumeID') !=
params['startVolumeID']]
result['result']['volumes'] = selected_volumes
else:
result = {'result': {'volumes': []}}
return result
elif method is 'DeleteSnapshot':
return {'result': {}}


+ 65
- 5
cinder/volume/drivers/solidfire.py View File

@ -98,7 +98,13 @@ sf_opts = [
min=60,
help='Sets time in seconds to wait for a clone of a volume or '
'snapshot to complete.'
)]
),
cfg.IntOpt('sf_volume_create_timeout',
default=60,
min=30,
help='Sets time in seconds to wait for a create volume '
'operation to complete.')]
CONF = cfg.CONF
CONF.register_opts(sf_opts, group=configuration.SHARED_CONF_GROUP)
@ -236,9 +242,11 @@ class SolidFireDriver(san.SanISCSIDriver):
SnapshotsOnly)
2.0.17 - Fix bug #1859653 SolidFire fails to failback when volume
service is restarted
2.0.18 - Fix bug #1896112 SolidFire Driver creates duplicate volume
when API response is lost
"""
VERSION = '2.0.17'
VERSION = '2.0.18'
# ThirdPartySystems wiki page
CI_WIKI_NAME = "NetApp_SolidFire_CI"
@ -928,10 +936,62 @@ class SolidFireDriver(san.SanISCSIDriver):
params['attributes'] = attributes
return self._issue_api_request('ModifyVolume', params)
def _list_volumes_by_name(self, sf_volume_name):
    """Return the backend volumes reported for ``sf_volume_name``.

    Issues a ``ListVolumes`` API request (version 8.0) with
    ``volumeName`` set to the given name.

    :param sf_volume_name: volume name to look up on the backend.
    :returns: the value found under ``['result']['volumes']`` in the
              API response.
    """
    params = {'volumeName': sf_volume_name}
    response = self._issue_api_request('ListVolumes', params, version='8.0')
    return response['result']['volumes']
def _wait_volume_is_active(self, sf_volume_name):
    """Poll the backend until the named volume reports status 'active'.

    The volume is looked up by name once per polling interval; the wait
    is bounded by the ``sf_volume_create_timeout`` configuration option.

    :param sf_volume_name: name of the volume to wait for.
    :returns: the first volume entry returned for that name, once its
              ``status`` is ``'active'``.
    :raises SolidFireAPIException: if the looping call times out before
                                   the volume becomes active.
    """
    def _wait():
        volumes = self._list_volumes_by_name(sf_volume_name)
        if not volumes:
            # Nothing by that name yet; keep polling.
            return

        sf_volume = volumes[0]
        LOG.debug("Found Volume [%s] in SolidFire backend. "
                  "Current status is [%s].",
                  sf_volume_name, sf_volume['status'])
        if sf_volume['status'] == 'active':
            # LoopingCallDone stops the timer and hands the volume back
            # to the caller of wait().
            raise loopingcall.LoopingCallDone(sf_volume)

    timer = loopingcall.FixedIntervalWithTimeoutLoopingCall(_wait)
    try:
        return timer.start(
            interval=1,
            timeout=self.configuration.sf_volume_create_timeout).wait()
    except loopingcall.LoopingCallTimeOut:
        msg = ("Timeout while waiting volume [%s] "
               "to be in active state." % sf_volume_name)
        LOG.error(msg)
        raise SolidFireAPIException(msg)
def _do_volume_create(self, sf_account, params, endpoint=None):
    """Create a volume on the backend, guarding against duplicates.

    The backend is first checked for an existing volume with the
    requested name; only when none is found is ``CreateVolume`` issued,
    and it is issued exactly once. If that request hits a read timeout,
    the backend may still have processed it, so the volume is looked up
    by name and waited on instead of sending the request again.

    :param sf_account: account dict; its ``accountID`` is copied into
                       ``params``.
    :param params: ``CreateVolume`` parameters; must contain ``name``.
                   Mutated in place (``accountID`` is set).
    :param endpoint: optional endpoint forwarded to the API request and
                     to ``_get_model_info``.
    :returns: the result of ``_get_model_info`` for the created volume.
    :raises DuplicateSfVolumeNames: if a volume with the requested name
                                    already exists on the backend.
    """
    sf_volume_name = params['name']

    # The name check must run BEFORE any CreateVolume request; creating
    # first would make this check trip on the volume just created.
    volumes_found = self._list_volumes_by_name(sf_volume_name)
    if volumes_found:
        msg = ('Volume name [%s] already exists '
               'in SolidFire backend.') % sf_volume_name
        LOG.error(msg)
        raise DuplicateSfVolumeNames(message=msg)

    params['accountID'] = sf_account['accountID']
    sf_volid = None
    try:
        response = self._issue_api_request(
            'CreateVolume', params, endpoint=endpoint)
        sf_volid = response['result']['volumeID']
    except requests.exceptions.ReadTimeout:
        LOG.debug("Read Timeout exception caught while creating "
                  "volume [%s].", sf_volume_name)
        # Check if volume was created for the given name,
        # in case the backend has processed the request but failed
        # to deliver the response before api request timeout.
        volume_created = self._wait_volume_is_active(sf_volume_name)
        sf_volid = volume_created['volumeID']

    return self._get_model_info(sf_account, sf_volid, endpoint=endpoint)
def _do_snapshot_create(self, params):


+ 14
- 0
releasenotes/notes/sf-fix-duplicate-volume-request-lost-adefacda1298dc62.yaml View File

@ -0,0 +1,14 @@
---
fixes:
- |
NetApp SolidFire driver `Bug #1896112
<https://bugs.launchpad.net/cinder/+bug/1896112>`_:
Fixes an issue that may duplicate volumes during creation, in case
the SolidFire backend successfully processes a request and creates
the volume, but fails to deliver the result back to the driver (the
response is lost). When this scenario occurs, the SolidFire driver
will retry the operation, which previously resulted in the creation
of a duplicate volume. This fix adds the ``sf_volume_create_timeout``
configuration option (default value: 60 seconds) which specifies an
additional length of time that the driver will wait for the volume to
become active on the backend before raising an exception.

Loading…
Cancel
Save