Handle TFlow shortcomings in SF driver

Taskflow automatically retries failed driver calls: the default
behavior is to retry on any exception unless that exception type has
been added to a blacklist. This has caused issues in the past and we
should probably fix it properly.
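
As a rough illustration only (not the actual Cinder flow): a Taskflow
linear flow with a Times retry controller simply re-runs a failing
task, so any side effect the task had before raising is repeated on
every attempt.

    # Sketch of Taskflow's retry-on-failure behavior; the task and flow
    # names here are made up and not part of Cinder.
    from taskflow import engines, retry, task
    from taskflow.patterns import linear_flow

    class CreateThenFail(task.Task):
        def execute(self):
            print("backend volume created")                  # side effect...
            raise RuntimeError("replication setup failed")   # ...then failure

    flow = linear_flow.Flow('create_volume', retry=retry.Times(3))
    flow.add(CreateThenFail())
    engines.run(flow)  # task executes (and orphans a volume) once per attempt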

For now, this patch addresses a combination of issues that
result from Taskflow's retry policy and the new object/HA
code changes that have been added to the cinder.volume.manager
code.

Currently it's possible for a create to be issued to a volume driver
and for the volume to actually be created on the backend, but then
something goes wrong in a subsequent step (like setting up
replication) and an exception is raised. The changes to the manager
code now actively clear the host column prior to the retry (first
problem), and then the whole create is repeated 3 times.

The result is that we end up with at least 3 orphaned volumes on the
backend device. Worse, when you then delete the error-state volume
that Cinder still knows about, the code checks the host column, sees
it's empty, and of course can't issue a call to a driver, so it just
pretends things are OK, deletes the DB entry and moves on.
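
In other words, the manager's delete path effectively does something
like this (a simplified sketch, not the real cinder.volume.manager
code):

    # Simplified sketch of the delete behavior described above.
    def delete_volume(context, volume):
        if not volume.host:
            # No host recorded, so there's no driver to call; any volume
            # that was actually created on the backend is silently orphaned.
            volume.destroy()              # just drop the DB record
            return
        driver.delete_volume(volume)      # normal path: delete on the backend
        volume.destroy()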

So now you have 3 orphaned volumes on the backend device that you
don't know about.  The logs don't contain any information to point
this out, and there's no mechanism in cinder to clean these
orphaned volumes up.

For now this patch just fixes things in the SF driver.
The only place we're really likely to hit this case in the SF driver
right now is when something fails during replication setup after the
create, so we detect that scenario ourselves and delete the volume
from the backend before re-raising the exception.
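
The cleanup relies on the usual oslo.utils pattern, roughly as below;
setup_replication and delete_backend_volume are placeholder names, the
real calls are in the diff that follows.

    # Sketch of the cleanup-on-reraise pattern: remove the just-created
    # backend volume, then let the original exception propagate unchanged.
    from oslo_utils import excutils

    try:
        setup_replication(volume)
    except exception.SolidFireAPIException:
        with excutils.save_and_reraise_exception():
            delete_backend_volume(volume)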

NOTE that this patch is part of the add_cheesecak_to_solidfire topic
chain of patches, and Gerrit seems aware of it, although we no longer
get the fancy "depends on" link.

Change-Id: Ib0c258046abc315993880eacfa190b957eaaed50

commit 689d94e04e (parent 7eb9580b01)
Author: John Griffith
Date:   2016-03-31 16:14:52 +00:00

2 changed files with 47 additions and 8 deletions


@@ -272,6 +272,7 @@ class SolidFireVolumeTestCase(test.TestCase):
             return test_qos_spec
 
         def _fake_do_volume_create(account, params):
+            params['provider_location'] = '1.1.1.1 iqn 0'
             return params
 
         sfv = solidfire.SolidFireDriver(configuration=self.configuration)
@@ -1789,3 +1790,24 @@ class SolidFireVolumeTestCase(test.TestCase):
                           {'initiatorSecret': 'shhh',
                            'targetSecret': 'dont-tell'},
                           {}))
+
+    def test_pythons_try_except(self):
+        def _fake_retrieve_rep(vol):
+            raise exception.SolidFireAPIException
+
+        sfv = solidfire.SolidFireDriver(configuration=self.configuration)
+        with mock.patch.object(sfv,
+                               '_get_create_account',
+                               return_value={'accountID': 5}),\
+            mock.patch.object(sfv,
+                              '_retrieve_qos_setting',
+                              return_value=None),\
+            mock.patch.object(sfv,
+                              '_do_volume_create',
+                              return_value={'provider_id': '1 2 xxxx'}),\
+            mock.patch.object(sfv,
+                              '_retrieve_replication_settings',
+                              side_effect=_fake_retrieve_rep):
+            self.assertRaises(exception.SolidFireAPIException,
+                              sfv.create_volume,
+                              self.mock_volume)


@@ -147,7 +147,8 @@ class SolidFireDriver(san.SanISCSIDriver):
         2.0.2 - Implement secondary account
         2.0.3 - Implement cluster pairing
         2.0.4 - Implement volume replication
+        2.0.5 - Try and deal with the stupid retry/clear issues from objects
+                and tflow
     """
 
     VERSION = '2.0.2'
@@ -1183,12 +1184,27 @@ class SolidFireDriver(san.SanISCSIDriver):
         params['name'] = vname
         params['attributes']['migration_uuid'] = volume['id']
         params['attributes']['uuid'] = v
         model_update = self._do_volume_create(sf_account, params)
-        rep_settings = self._retrieve_replication_settings(volume)
-        if self.replication_enabled and rep_settings:
-            volume['volumeID'] = int(model_update['provider_id'].split()[0])
-            self._replicate_volume(volume, params,
-                                   sf_account, rep_settings)
+        try:
+            rep_settings = self._retrieve_replication_settings(volume)
+            if self.replication_enabled and rep_settings:
+                volume['volumeID'] = (
+                    int(model_update['provider_id'].split()[0]))
+                self._replicate_volume(volume, params,
+                                       sf_account, rep_settings)
+        except exception.SolidFireAPIException:
+            # NOTE(jdg): Something went wrong after the source create, due to
+            # the way TFLOW works and its insistence on retrying the same
+            # command over and over coupled with the fact that the introduction
+            # of objects now sets host to None on failures, we'll end up with an
+            # orphaned volume on the backend for every one of these segments
+            # that fail, for n-retries.  Sad Sad Panda!!  We'll just do it
+            # ourselves until we can get a general fix in Cinder further up the
+            # line.
+            with excutils.save_and_reraise_exception():
+                sf_volid = int(model_update['provider_id'].split()[0])
+                self._issue_api_request('DeleteVolume', {'volumeID': sf_volid})
 
         return model_update
 
     def _retrieve_replication_settings(self, volume):
@@ -1316,8 +1332,9 @@ class SolidFireDriver(san.SanISCSIDriver):
                     self._issue_api_request('DeleteVolume', params,
                                             endpoint=cluster['endpoint'])
-            params = {'volumeID': sf_vol['volumeID']}
-            self._issue_api_request('DeleteVolume', params)
+            if sf_vol['status'] == 'active':
+                params = {'volumeID': sf_vol['volumeID']}
+                self._issue_api_request('DeleteVolume', params)
 
             if volume.get('multiattach'):
                 self._remove_volume_from_vags(sf_vol['volumeID'])
         else: