Handle TFlow shortcomings in SF driver

Taskflow automatically retries failed driver calls: the default
behavior is to retry on any exception unless that exception type has
been added to a blacklist. This has caused issues in the past and we
should probably fix it properly.
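
As a rough illustration only (not the actual Cinder flow): a Taskflow
linear flow with a Times retry controller simply re-runs a failing
task, so any side effect the task had before raising is repeated on
every attempt.

    # Sketch of Taskflow's retry-on-failure behavior; the task and flow
    # names here are made up and not part of Cinder.
    from taskflow import engines, retry, task
    from taskflow.patterns import linear_flow

    class CreateThenFail(task.Task):
        def execute(self):
            print("backend volume created")                  # side effect...
            raise RuntimeError("replication setup failed")   # ...then failure

    flow = linear_flow.Flow('create_volume', retry=retry.Times(3))
    flow.add(CreateThenFail())
    engines.run(flow)  # task executes (and orphans a volume) once per attempt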

For now, this patch addresses a combination of issues that
result from Taskflow's retry policy and the new object/HA
code changes that have been added to the cinder.volume.manager
code.

Currently it's possible for a create to be issued to a volume driver
and for the volume to actually be created on the backend, but then
something goes wrong in a subsequent step (like setting up
replication) and an exception is raised. The changes to the manager
code now actively clear the host column prior to the retry (first
problem), and then the whole create is repeated 3 times.

The result is that we end up with at least 3 orphaned volumes on the
backend device. Worse, when you then delete the error-state volume
that Cinder still knows about, the code checks the host column, sees
it's empty, and of course can't issue a call to a driver, so it just
pretends things are OK, deletes the DB entry and moves on.
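
In other words, the manager's delete path effectively does something
like this (a simplified sketch, not the real cinder.volume.manager
code):

    # Simplified sketch of the delete behavior described above.
    def delete_volume(context, volume):
        if not volume.host:
            # No host recorded, so there's no driver to call; any volume
            # that was actually created on the backend is silently orphaned.
            volume.destroy()              # just drop the DB record
            return
        driver.delete_volume(volume)      # normal path: delete on the backend
        volume.destroy()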

So now you have 3 orphaned volumes on the backend device that you
don't know about.  The logs don't contain any information to point
this out, and there's no mechanism in cinder to clean these
orphaned volumes up.

For now this patch just fixes things in the SF driver.
The only place we're really likely to hit this case in the SF driver
right now is when something fails during replication setup after the
create, so we detect that scenario ourselves and delete the volume
from the backend before re-raising the exception.
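
The cleanup relies on the usual oslo.utils pattern, roughly as below;
setup_replication and delete_backend_volume are placeholder names, the
real calls are in the diff that follows.

    # Sketch of the cleanup-on-reraise pattern: remove the just-created
    # backend volume, then let the original exception propagate unchanged.
    from oslo_utils import excutils

    try:
        setup_replication(volume)
    except exception.SolidFireAPIException:
        with excutils.save_and_reraise_exception():
            delete_backend_volume(volume)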

NOTE that this patch is part of the add_cheesecak_to_solidfire topic
chain of patches, and Gerrit seems aware of it, although we no longer
get the fancy "depends on" link.

Change-Id: Ib0c258046abc315993880eacfa190b957eaaed50

commit 689d94e04e (parent 7eb9580b01)
Author: John Griffith
Date:   2016-03-31 16:14:52 +00:00

2 changed files with 47 additions and 8 deletions


@@ -272,6 +272,7 @@ class SolidFireVolumeTestCase(test.TestCase):
             return test_qos_spec
 
         def _fake_do_volume_create(account, params):
+            params['provider_location'] = '1.1.1.1 iqn 0'
             return params
 
         sfv = solidfire.SolidFireDriver(configuration=self.configuration)
@@ -1789,3 +1790,24 @@ class SolidFireVolumeTestCase(test.TestCase):
                           {'initiatorSecret': 'shhh',
                            'targetSecret': 'dont-tell'},
                           {}))
+
+    def test_pythons_try_except(self):
+        def _fake_retrieve_rep(vol):
+            raise exception.SolidFireAPIException
+
+        sfv = solidfire.SolidFireDriver(configuration=self.configuration)
+        with mock.patch.object(sfv,
+                               '_get_create_account',
+                               return_value={'accountID': 5}),\
+            mock.patch.object(sfv,
+                              '_retrieve_qos_setting',
+                              return_value=None),\
+            mock.patch.object(sfv,
+                              '_do_volume_create',
+                              return_value={'provider_id': '1 2 xxxx'}),\
+            mock.patch.object(sfv,
+                              '_retrieve_replication_settings',
+                              side_effect=_fake_retrieve_rep):
+            self.assertRaises(exception.SolidFireAPIException,
+                              sfv.create_volume,
+                              self.mock_volume)


@@ -147,7 +147,8 @@ class SolidFireDriver(san.SanISCSIDriver):
         2.0.2 - Implement secondary account
         2.0.3 - Implement cluster pairing
         2.0.4 - Implement volume replication
+        2.0.5 - Try and deal with the stupid retry/clear issues from objects
+                and tflow
     """
 
     VERSION = '2.0.2'
@@ -1183,12 +1184,27 @@ class SolidFireDriver(san.SanISCSIDriver):
         params['name'] = vname
         params['attributes']['migration_uuid'] = volume['id']
         params['attributes']['uuid'] = v
         model_update = self._do_volume_create(sf_account, params)
-        rep_settings = self._retrieve_replication_settings(volume)
-        if self.replication_enabled and rep_settings:
-            volume['volumeID'] = int(model_update['provider_id'].split()[0])
-            self._replicate_volume(volume, params,
-                                   sf_account, rep_settings)
+        try:
+            rep_settings = self._retrieve_replication_settings(volume)
+            if self.replication_enabled and rep_settings:
+                volume['volumeID'] = (
+                    int(model_update['provider_id'].split()[0]))
+                self._replicate_volume(volume, params,
+                                       sf_account, rep_settings)
+        except exception.SolidFireAPIException:
+            # NOTE(jdg): Something went wrong after the source create, due to
+            # the way TFLOW works and its insistence on retrying the same
+            # command over and over coupled with the fact that the introduction
+            # of objects now sets host to None on failures, we'll end up with an
+            # orphaned volume on the backend for every one of these segments
+            # that fail, for n-retries.  Sad Sad Panda!!  We'll just do it
+            # ourselves until we can get a general fix in Cinder further up the
+            # line.
+            with excutils.save_and_reraise_exception():
+                sf_volid = int(model_update['provider_id'].split()[0])
+                self._issue_api_request('DeleteVolume', {'volumeID': sf_volid})
 
         return model_update
 
     def _retrieve_replication_settings(self, volume):
@@ -1316,8 +1332,9 @@ class SolidFireDriver(san.SanISCSIDriver):
                     self._issue_api_request('DeleteVolume', params,
                                             endpoint=cluster['endpoint'])
-            params = {'volumeID': sf_vol['volumeID']}
-            self._issue_api_request('DeleteVolume', params)
+            if sf_vol['status'] == 'active':
+                params = {'volumeID': sf_vol['volumeID']}
+                self._issue_api_request('DeleteVolume', params)
 
             if volume.get('multiattach'):
                 self._remove_volume_from_vags(sf_vol['volumeID'])
         else: