From eec1b92171367545bc2de3217830e9354e6c6def Mon Sep 17 00:00:00 2001
From: Fernando Ferraz
Date: Thu, 6 Feb 2020 21:22:33 +0000
Subject: [PATCH] NetApp SolidFire: Fix failback failing after service restart

After a successful failover, if the volume service was restarted, the
driver wouldn't be able to fail back to the primary cluster due to
incomplete cluster data. This patch addresses this problem by gathering
data from the primary cluster during the failback procedure.

Change-Id: I796fcf8969adae3338861a1b2f3310fafcacfbcb
Closes-bug: #1859653
(cherry picked from commit f24eb2fc630f32697fd9afdfdec97e0f61493a46)
---
 .../drivers/solidfire/test_solidfire.py       | 48 +++++++----
 cinder/volume/drivers/solidfire.py            | 84 +++++++++++--------
 ...fter-service-restart-77e5e4da45c9c1aa.yaml |  6 ++
 3 files changed, 85 insertions(+), 53 deletions(-)
 create mode 100644 releasenotes/notes/bug-1859653-solidfire-fix-failover-after-service-restart-77e5e4da45c9c1aa.yaml

diff --git a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
index 89d86777128..0e0ebc79e11 100644
--- a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
+++ b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
@@ -159,6 +159,7 @@ class SolidFireVolumeTestCase(test.TestCase):
                           'login': 'admin'},
             'name': 'AutoTest2-6AjG-FOR-TEST-ONLY',
             'clusterPairID': 33,
+            'clusterAPIVersion': '9.4',
             'uuid': '9c499d4b-8fff-48b4-b875-27601d5d9889',
             'svip': '10.10.23.2',
             'mvipNodeID': 1,
@@ -3166,7 +3167,17 @@ class SolidFireVolumeTestCase(test.TestCase):
             cinder_vols.append(vol)
 
         mock_map_sf_volumes.return_value = sf_vols
-        mock_create_cluster_reference.return_value = self.cluster_pairs[0]
+
+        self.configuration.replication_device = []
+
+        reset_mocks()
+        drv_args = {'active_backend_id': None}
+        sfv = solidfire.SolidFireDriver(configuration=self.configuration,
+                                        **drv_args)
+
+        self.assertRaises(exception.UnableToFailOver,
+                          sfv.failover_host, ctx, cinder_vols, 'fake', None)
+        mock_map_sf_volumes.assert_not_called()
 
         fake_replication_device = {'backend_id': 'fake',
                                    'mvip': '0.0.0.0',
@@ -3183,14 +3194,6 @@ class SolidFireVolumeTestCase(test.TestCase):
                           sfv.failover_host, ctx, cinder_vols, 'default', None)
         mock_map_sf_volumes.assert_not_called()
 
-        reset_mocks()
-        drv_args = {'active_backend_id': 'default'}
-        sfv = solidfire.SolidFireDriver(configuration=self.configuration,
-                                        **drv_args)
-        self.assertRaises(exception.UnableToFailOver,
-                          sfv.failover_host, ctx, cinder_vols, 'default', None)
-        mock_map_sf_volumes.assert_not_called()
-
         reset_mocks()
         drv_args = {'active_backend_id': None}
         sfv = solidfire.SolidFireDriver(configuration=self.configuration,
@@ -3200,15 +3203,28 @@ class SolidFireVolumeTestCase(test.TestCase):
                           sfv.failover_host, ctx, cinder_vols,
                           secondary_id='not_fake_id', groups=None)
         mock_map_sf_volumes.assert_not_called()
 
+        mock_create_cluster_reference.return_value = self.cluster_pairs[0]
+
         reset_mocks()
-        drv_args = {'active_backend_id': None}
+        drv_args = {'active_backend_id': 'secondary'}
         sfv = solidfire.SolidFireDriver(configuration=self.configuration,
                                         **drv_args)
-        sfv.cluster_pairs = [None]
-        self.assertRaises(exception.UnableToFailOver,
-                          sfv.failover_host, ctx, cinder_vols,
-                          secondary_id='fake', groups=None)
-        mock_map_sf_volumes.assert_not_called()
+        sfv.cluster_pairs = self.cluster_pairs
+        sfv.cluster_pairs[0]['backend_id'] = 'fake'
+        sfv.replication_enabled = True
+        cluster_id, updates, _ = sfv.failover_host(
+            ctx, cinder_vols, secondary_id='default', groups=None)
+        self.assertEqual(5, len(updates))
+        for update in updates:
+            self.assertEqual(fields.ReplicationStatus.ENABLED,
+                             update['updates']['replication_status'])
+        self.assertEqual('', cluster_id)
+        mock_get_create_account.assert_called()
+        mock_failover_volume.assert_called()
+        mock_map_sf_volumes.assert_called()
+        mock_update_cluster_status.assert_called()
+        mock_set_cluster_pairs.assert_called()
+        mock_create_cluster_reference.assert_called()
 
         reset_mocks()
         drv_args = {'active_backend_id': None}
         sfv = solidfire.SolidFireDriver(configuration=self.configuration,
                                         **drv_args)
@@ -3228,11 +3244,9 @@ class SolidFireVolumeTestCase(test.TestCase):
         mock_get_create_account.assert_called()
         mock_failover_volume.assert_called()
         mock_map_sf_volumes.assert_called()
-        mock_get_cluster_info.assert_not_called()
         mock_update_cluster_status.assert_called()
         mock_set_cluster_pairs.assert_called()
         mock_create_cluster_reference.assert_called()
-        mock_issue_api_request.assert_not_called()
 
     @mock.patch.object(solidfire.SolidFireDriver, '_issue_api_request')
     @mock.patch.object(solidfire.SolidFireDriver, '_create_cluster_reference')
diff --git a/cinder/volume/drivers/solidfire.py b/cinder/volume/drivers/solidfire.py
index e6b97a346d6..40c26fbb862 100644
--- a/cinder/volume/drivers/solidfire.py
+++ b/cinder/volume/drivers/solidfire.py
@@ -223,9 +223,11 @@ class SolidFireDriver(san.SanISCSIDriver):
         2.0.15 - Fix bug #1834013 NetApp SolidFire replication errors
         2.0.16 - Add options for replication mode (Async, Sync and
                  SnapshotsOnly)
+        2.0.17 - Fix bug #1859653 SolidFire fails to failback when volume
+                 service is restarted
     """
 
-    VERSION = '2.0.16'
+    VERSION = '2.0.17'
 
     # ThirdPartySystems wiki page
     CI_WIKI_NAME = "NetApp_SolidFire_CI"
@@ -300,15 +302,13 @@ class SolidFireDriver(san.SanISCSIDriver):
             self.active_cluster = self._create_cluster_reference(
                 remote_endpoint)
 
-            # When in failed-over state, we have only endpoint info from the
-            # primary cluster.
-            self.primary_cluster = {"endpoint": self._build_endpoint_info()}
             self.failed_over = True
+            self.replication_enabled = True
         else:
-            self.primary_cluster = self._create_cluster_reference()
-            self.active_cluster = self.primary_cluster
+            self.active_cluster = self._create_cluster_reference()
             if self.configuration.replication_device:
                 self._set_cluster_pairs()
+                self.replication_enabled = True
 
         LOG.debug("Active cluster: %s", self.active_cluster)
 
@@ -437,9 +437,11 @@ class SolidFireDriver(san.SanISCSIDriver):
             # clusterPairID in remote_info for us
             self._create_remote_pairing(remote_info)
 
+        if self.cluster_pairs:
+            self.cluster_pairs.clear()
+
         self.cluster_pairs.append(remote_info)
         LOG.debug("Available cluster pairs: %s", self.cluster_pairs)
-        self.replication_enabled = True
 
     def _create_cluster_reference(self, endpoint=None):
         cluster_ref = {}
@@ -2352,8 +2354,13 @@ class SolidFireDriver(san.SanISCSIDriver):
         failback = False
         volume_updates = []
 
-        LOG.info("Failing over. Secondary ID is: %s",
-                 secondary_id)
+        if not self.replication_enabled:
+            LOG.error("SolidFire driver received failover_host "
+                      "request, however replication is NOT "
+                      "enabled.")
+            raise exception.UnableToFailOver(reason=_("Failover requested "
+                                                      "on non replicated "
+                                                      "backend."))
 
         # NOTE(erlon): For now we only support one replication target device.
         # So, there are two cases we have to deal with here:
@@ -2371,8 +2378,10 @@ class SolidFireDriver(san.SanISCSIDriver):
                    "state.")
             raise exception.InvalidReplicationTarget(msg)
         elif secondary_id == "default" and self.failed_over:
-            remote = self.primary_cluster
+            LOG.info("Failing back to primary cluster.")
+            remote = self._create_cluster_reference()
             failback = True
+
         else:
             repl_configs = self.configuration.replication_device[0]
             if secondary_id and repl_configs['backend_id'] != secondary_id:
@@ -2380,25 +2389,24 @@ class SolidFireDriver(san.SanISCSIDriver):
                        "on cinder.conf.") % secondary_id
                 raise exception.InvalidReplicationTarget(msg)
 
+            LOG.info("Failing over to secondary cluster %s.", secondary_id)
             remote = self.cluster_pairs[0]
 
-        if not remote or not self.replication_enabled:
-            LOG.error("SolidFire driver received failover_host "
-                      "request, however replication is NOT "
-                      "enabled, or there are no available "
-                      "targets to fail-over to.")
-            raise exception.UnableToFailOver(reason=_("Failover requested "
-                                                      "on non replicated "
-                                                      "backend."))
+        LOG.debug("Target cluster to failover: %s.",
+                  {'name': remote['name'],
+                   'mvip': remote['mvip'],
+                   'clusterAPIVersion': remote['clusterAPIVersion']})
 
         target_vols = self._map_sf_volumes(volumes,
                                            endpoint=remote['endpoint'])
-        LOG.debug("Mapped target_vols: %s", target_vols)
+        LOG.debug("Total Cinder volumes found in target: %d",
+                  len(target_vols))
 
         primary_vols = None
         try:
             primary_vols = self._map_sf_volumes(volumes)
-            LOG.debug("Mapped Primary_vols: %s", target_vols)
+            LOG.debug("Total Cinder volumes found in primary cluster: %d",
+                      len(primary_vols))
         except SolidFireAPIException:
             # API Request failed on source. Failover/failback will skip next
             # calls to it.
@@ -2433,14 +2441,26 @@ class SolidFireDriver(san.SanISCSIDriver):
             else:
                 primary_vol = None
 
-            LOG.debug('Failing-over volume %s, target vol %s, '
-                      'primary vol %s', v, target_vol, primary_vol)
+            LOG.info('Failing-over volume %s.', v.id)
+            LOG.debug('Target vol: %s',
+                      {'access': target_vol['access'],
+                       'accountID': target_vol['accountID'],
+                       'name': target_vol['name'],
+                       'status': target_vol['status'],
+                       'volumeID': target_vol['volumeID']})
+            LOG.debug('Primary vol: %s',
+                      {'access': primary_vol['access'],
+                       'accountID': primary_vol['accountID'],
+                       'name': primary_vol['name'],
+                       'status': primary_vol['status'],
+                       'volumeID': primary_vol['volumeID']})
 
             try:
                 self._failover_volume(target_vol, remote, primary_vol)
 
                 sf_account = self._get_create_account(
                     v.project_id, endpoint=remote['endpoint'])
+                LOG.debug("Target account: %s", sf_account['accountID'])
 
                 conn_info = self._build_connection_info(
                     sf_account, target_vol, endpoint=remote['endpoint'])
@@ -2468,12 +2488,7 @@ class SolidFireDriver(san.SanISCSIDriver):
             except Exception as e:
                 volume_updates.append({'volume_id': v['id'],
                                        'updates': {'status': 'error', }})
-
-                if failback:
-                    LOG.error("Error trying to failback volume %s", v.id)
-                else:
-                    LOG.error("Error trying to failover volume %s", v.id)
-
+                LOG.error("Error trying to failover volume %s", v.id)
                 msg = e.message if hasattr(e, 'message') else e
                 LOG.exception(msg)
 
@@ -2481,20 +2496,17 @@ class SolidFireDriver(san.SanISCSIDriver):
                 volume_updates.append({'volume_id': v['id'],
                                        'updates': {'status': 'error', }})
 
-        # FIXME(jdg): This introduces a problem for us, up until now our driver
-        # has been pretty much stateless and has allowed customers to run
-        # active/active HA c-vol services with SolidFire.  The introduction of
-        # the active_cluster and failed_over attributes is going to break that
-        # but for now that's going to be the trade off of using replication
+        self.active_cluster = remote
+
         if failback:
-            active_cluster_id = None
+            active_cluster_id = ''
             self.failed_over = False
+            # Recreating cluster pairs after a successful failback
+            self._set_cluster_pairs()
         else:
             active_cluster_id = remote['backend_id']
             self.failed_over = True
 
-        self.active_cluster = remote
-
         return active_cluster_id, volume_updates, []
 
     def freeze_backend(self, context):
diff --git a/releasenotes/notes/bug-1859653-solidfire-fix-failover-after-service-restart-77e5e4da45c9c1aa.yaml b/releasenotes/notes/bug-1859653-solidfire-fix-failover-after-service-restart-77e5e4da45c9c1aa.yaml
new file mode 100644
index 00000000000..e1364ce7ec4
--- /dev/null
+++ b/releasenotes/notes/bug-1859653-solidfire-fix-failover-after-service-restart-77e5e4da45c9c1aa.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    NetApp SolidFire driver: Fixed an issue that causes failback
+    to fail after a volume service restart. This change fixes
+    bug `1859653 <https://bugs.launchpad.net/cinder/+bug/1859653>`_.
\ No newline at end of file
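
For reviewers who want a quick orientation before walking the full diff, below is a minimal, self-contained sketch of the control flow this patch moves to. It is not the driver's actual code: FakeReplicatedBackend, its stub methods, and the returned dictionaries are illustrative stand-ins. Only the shape of the logic mirrors the patch: refuse to fail over when replication is not enabled, rebuild the primary cluster reference on demand during failback instead of trusting data cached before a possible service restart, and re-create the cluster pairing after a successful failback.

# Illustrative sketch only -- NOT the SolidFire driver's real code.


class UnableToFailOver(Exception):
    """Stand-in for cinder.exception.UnableToFailOver."""


class FakeReplicatedBackend(object):
    def __init__(self, replication_enabled, failed_over):
        self.replication_enabled = replication_enabled
        self.failed_over = failed_over
        self.cluster_pairs = []
        self.active_cluster = None

    def _create_cluster_reference(self):
        # Stand-in for querying the primary cluster for fresh details
        # (name, mvip, clusterAPIVersion, endpoint, ...).
        return {'backend_id': 'primary', 'mvip': '10.0.0.1'}

    def _set_cluster_pairs(self):
        # Stand-in for re-reading replication_device from the config and
        # re-pairing the primary cluster with its replication target.
        self.cluster_pairs = [{'backend_id': 'fake', 'mvip': '10.0.0.2'}]

    def failover_host(self, secondary_id=None):
        # Fail early when replication was never configured.
        if not self.replication_enabled:
            raise UnableToFailOver('Failover requested on a non-replicated '
                                   'backend.')

        if secondary_id == 'default':
            if not self.failed_over:
                raise UnableToFailOver('Backend is not failed over; there is '
                                       'nothing to fail back to.')
            # Failback: query the primary cluster *now* rather than relying
            # on data cached at init time, which is incomplete after a
            # service restart in the failed-over state.
            remote = self._create_cluster_reference()
            failback = True
        else:
            if not self.cluster_pairs:
                self._set_cluster_pairs()
            remote = self.cluster_pairs[0]
            failback = False

        self.active_cluster = remote
        if failback:
            self.failed_over = False
            self._set_cluster_pairs()  # re-pair after a successful failback
            return ''                  # report primary as active again
        self.failed_over = True
        return remote['backend_id']


if __name__ == '__main__':
    drv = FakeReplicatedBackend(replication_enabled=True, failed_over=True)
    print(repr(drv.failover_host(secondary_id='default')))  # -> ''

The empty-string return on failback mirrors the diff, where active_cluster_id changes from None to '' so the service records the primary cluster as the active backend again.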