From d73da73283c783f90e878c5fbd53e114e1918f0a Mon Sep 17 00:00:00 2001 From: Fernando Ferraz Date: Mon, 5 Oct 2020 19:20:15 -0300 Subject: [PATCH] NetApp SolidFire: Fix duplicate volume when API response is lost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SolidFire driver retries API requests in case a connection error occurs. When the network is unstable, it is possible that the SolidFire backend successfully receives and processes a create volume operation, but fails to deliver the response back to the driver. When this scenario occurs, the SolidFire driver automatically resends the request, creating a second volume and leaving a duplicate unused. Although this doesn't affect driver functionality at first (the volume id from the cluster is always correctly associated with the cinder provider id), further operations may hit the unused volume, leading to unexpected behavior. This patch fixes this issue by: 1. Checking if the volume name already exists in the backend before trying to create it. Volume creation will raise an exception and abort if a volume with that name is found. 2. Checking for volume creation right after a read timeout is detected, preventing invalid API calls. 3. Adding option ´sf_volume_create_timeout´ to the SolidFire driver, to allow users to set the appropriate timeout value for their environment. 
Closes-Bug: #1896112 Change-Id: I4383b691a8cc4aacb046332e418aafb88ba8ba56 (cherry picked from commit 42c92cc407d475751bc61c98445c9c0740a71496) (cherry picked from commit deb31a0c4d35ea6c479bfe466b780ed07a489276) --- .../drivers/solidfire/test_solidfire.py | 9 ++- cinder/volume/drivers/solidfire.py | 71 +++++++++++++++++-- ...-volume-request-lost-adefacda1298dc62.yaml | 14 ++++ 3 files changed, 86 insertions(+), 8 deletions(-) create mode 100644 releasenotes/notes/sf-fix-duplicate-volume-request-lost-adefacda1298dc62.yaml diff --git a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py index 11d599c2dad..bfdf74e3cdc 100644 --- a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py +++ b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py @@ -349,11 +349,14 @@ class SolidFireVolumeTestCase(test.TestCase): 'attributes': {'uuid': f_uuid[1]}, 'qos': None, 'iqn': test_name}]}} - if params and params['startVolumeID']: + if params and params.get('startVolumeID', None): volumes = result['result']['volumes'] - selected_volumes = [v for v in volumes if v.get('volumeID') - != params['startVolumeID']] + selected_volumes = [v for v in volumes if v.get('volumeID') != + params['startVolumeID']] result['result']['volumes'] = selected_volumes + else: + result = {'result': {'volumes': []}} + return result elif method == 'DeleteSnapshot': return {'result': {}} diff --git a/cinder/volume/drivers/solidfire.py b/cinder/volume/drivers/solidfire.py index a8eb5349c55..c84258a6bfc 100644 --- a/cinder/volume/drivers/solidfire.py +++ b/cinder/volume/drivers/solidfire.py @@ -88,6 +88,7 @@ sf_opts = [ '\'usedSpace\', the driver will report correct ' 'values as expected by Cinder ' 'thin provisioning.'), + cfg.IntOpt('sf_api_request_timeout', default=30, min=30, @@ -98,7 +99,13 @@ sf_opts = [ min=60, help='Sets time in seconds to wait for a clone of a volume or ' 'snapshot to complete.' 
- )] + ), + + cfg.IntOpt('sf_volume_create_timeout', + default=60, + min=30, + help='Sets time in seconds to wait for a create volume ' + 'operation to complete.')] CONF = cfg.CONF CONF.register_opts(sf_opts, group=configuration.SHARED_CONF_GROUP) @@ -241,9 +248,11 @@ class SolidFireDriver(san.SanISCSIDriver): - Implement Active/Active replication support 2.2.1 - Fix bug #1891914 fix error on cluster workload rebalancing by adding xNotPrimary to the retryable exception list + 2.2.2 - Fix bug #1896112 SolidFire Driver creates duplicate volume + when API response is lost """ - VERSION = '2.2.1' + VERSION = '2.2.2' SUPPORTS_ACTIVE_ACTIVE = True @@ -939,10 +948,62 @@ class SolidFireDriver(san.SanISCSIDriver): params['attributes'] = attributes return self._issue_api_request('ModifyVolume', params) + def _list_volumes_by_name(self, sf_volume_name): + params = {'volumeName': sf_volume_name} + return self._issue_api_request( + 'ListVolumes', params, version='8.0')['result']['volumes'] + + def _wait_volume_is_active(self, sf_volume_name): + + def _wait(): + volumes = self._list_volumes_by_name(sf_volume_name) + if volumes: + LOG.debug("Found Volume [%s] in SolidFire backend. " + "Current status is [%s].", + sf_volume_name, volumes[0]['status']) + if volumes[0]['status'] == 'active': + raise loopingcall.LoopingCallDone(volumes[0]) + + try: + timer = loopingcall.FixedIntervalWithTimeoutLoopingCall( + _wait) + sf_volume = (timer.start( + interval=1, + timeout=self.configuration.sf_volume_create_timeout).wait()) + + return sf_volume + except loopingcall.LoopingCallTimeOut: + msg = ("Timeout while waiting volume [%s] " + "to be in active state." 
% sf_volume_name) + LOG.error(msg) + raise SolidFireAPIException(msg) + def _do_volume_create(self, sf_account, params, endpoint=None): - params['accountID'] = sf_account['accountID'] - sf_volid = self._issue_api_request( - 'CreateVolume', params, endpoint=endpoint)['result']['volumeID'] + + sf_volume_name = params['name'] + volumes_found = self._list_volumes_by_name(sf_volume_name) + if volumes_found: + msg = ('Volume name [%s] already exists ' + 'in SolidFire backend.') % sf_volume_name + LOG.error(msg) + raise DuplicateSfVolumeNames(message=msg) + + sf_volid = None + try: + params['accountID'] = sf_account['accountID'] + response = self._issue_api_request( + 'CreateVolume', params, endpoint=endpoint) + sf_volid = response['result']['volumeID'] + + except requests.exceptions.ReadTimeout: + LOG.debug("Read Timeout exception caught while creating " + "volume [%s].", sf_volume_name) + # Check if volume was created for the given name, + # in case the backend has processed the request but failed + # to deliver the response before api request timeout. + volume_created = self._wait_volume_is_active(sf_volume_name) + sf_volid = volume_created['volumeID'] + return self._get_model_info(sf_account, sf_volid, endpoint=endpoint) def _do_snapshot_create(self, params): diff --git a/releasenotes/notes/sf-fix-duplicate-volume-request-lost-adefacda1298dc62.yaml b/releasenotes/notes/sf-fix-duplicate-volume-request-lost-adefacda1298dc62.yaml new file mode 100644 index 00000000000..775bcd425f4 --- /dev/null +++ b/releasenotes/notes/sf-fix-duplicate-volume-request-lost-adefacda1298dc62.yaml @@ -0,0 +1,14 @@ +--- +fixes: + - | + NetApp SolidFire driver `Bug #1896112 + `_: + Fixes an issue that may duplicate volumes during creation, in case + the SolidFire backend successfully processes a request and creates + the volume, but fails to deliver the result back to the driver (the + response is lost). 
When this scenario occurs, the SolidFire driver + will retry the operation, which previously resulted in the creation + of a duplicate volume. This fix adds the ``sf_volume_create_timeout`` + configuration option (default value: 60 seconds) which specifies an + additional length of time that the driver will wait for the volume to + become active on the backend before raising an exception.