From e7498ca5bdd6e16b46e8a6d17dbc7492f6e710e9 Mon Sep 17 00:00:00 2001
From: John Griffith
Date: Tue, 19 Dec 2017 22:05:01 +0000
Subject: [PATCH] Enable failback in SolidFire driver

This change fixes up a few things in the SolidFire driver to enable
"failing back" to the original cluster in a replication scenario. We
still assume a cluster-wide failover, but an admin now has the ability
to issue subsequent `cinder failover-host` calls to switch back and
forth between their SolidFire clusters.

This change promotes the target on a failover and now also attempts to
set up replication back to the original source (assuming the original
is available and not a puddle of melted goo). This means that writes
to the new backend will be replicated back to the source, so if a user
so chooses they can fail back to the original and keep their new data.
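As a rough sketch of the admin workflow this enables (the host name and
MVIP below are made up for illustration):

    # initial failover to a configured replication target
    cinder failover-host cinder@solidfire --backend_id 10.10.10.10

    # subsequent call to fail back to the original (default) cluster
    cinder failover-host cinder@solidfire --backend_id default

The backend_id is matched against the MVIP of a configured cluster
pair; the literal `default` requests a failback.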
Change-Id: I678986e557755c2fe4183927c17342e430f5df0d
---
 .../drivers/solidfire/test_solidfire.py       |   4 +-
 cinder/volume/drivers/solidfire.py            | 192 +++++++++++++-----
 ...ailback_to_solidfire-82668c071f4fa91d.yaml |   7 +
 3 files changed, 155 insertions(+), 48 deletions(-)
 create mode 100644 releasenotes/notes/add_replication_failback_to_solidfire-82668c071f4fa91d.yaml

diff --git a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
index ec028497c4b..2a5809edce8 100644
--- a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
+++ b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
@@ -1104,7 +1104,7 @@ class SolidFireVolumeTestCase(test.TestCase):
         self.assertEqual('1.1.1.1:3260 0', v['provider_location'])

         configured_svip = '9.9.9.9:6500'
-        sfv.active_cluster_info['svip'] = configured_svip
+        sfv.active_cluster['svip'] = configured_svip
         v = sfv._get_model_info(sfaccount, 1)
         self.assertEqual('%s 0' % configured_svip, v['provider_location'])

@@ -1969,7 +1969,7 @@ class SolidFireVolumeTestCase(test.TestCase):
                          'fake-mvip'}]
         ctxt = None
         type_id = '290edb2a-f5ea-11e5-9ce9-5e5517507c66'
-        fake_type = {'extra_specs': {'replication': 'enabled'}}
+        fake_type = {'extra_specs': {'replication_enabled': '<is> True'}}
         with mock.patch.object(volume_types,
                                'get_volume_type',
                                return_value=fake_type):
diff --git a/cinder/volume/drivers/solidfire.py b/cinder/volume/drivers/solidfire.py
index 64836833807..edb3f9dd114 100644
--- a/cinder/volume/drivers/solidfire.py
+++ b/cinder/volume/drivers/solidfire.py
@@ -166,10 +166,11 @@ class SolidFireDriver(san.SanISCSIDriver):
         2.0.8 - Add active status filter to get volume ops
         2.0.9 - Always purge on delete volume
         2.0.10 - Add response to debug on retryable errors
+        2.0.11 - Add ability to fail back replicating volumes

     """

-    VERSION = '2.0.10'
+    VERSION = '2.0.11'

     # ThirdPartySystems wiki page
     CI_WIKI_NAME = "NetApp_SolidFire_CI"
@@ -210,7 +211,7 @@ class SolidFireDriver(san.SanISCSIDriver):
     def __init__(self, *args, **kwargs):
         super(SolidFireDriver, self).__init__(*args, **kwargs)
         self.failed_over_id = kwargs.get('active_backend_id', None)
-        self.active_cluster_info = {}
+        self.replication_status = kwargs.get('replication_status', "na")
         self.configuration.append_config_values(sf_opts)
         self.template_account_id = None
         self.max_volumes_per_account = 1990
@@ -220,17 +221,26 @@ class SolidFireDriver(san.SanISCSIDriver):
         self.failed_over = False
         self.target_driver = SolidFireISCSI(solidfire_driver=self,
                                             configuration=self.configuration)
+        self.default_cluster = self._create_cluster_reference()
+        self.active_cluster = self.default_cluster
+
+        # If we're failed over, we need to parse things out and set the
+        # active cluster appropriately
         if self.failed_over_id:
+            self.failed_over = True
             remote_info = self._get_remote_info_by_id(self.failed_over_id)
             if remote_info:
-                self._set_active_cluster_info(remote_info['endpoint'])
+                self.active_cluster = self._create_cluster_reference(
+                    remote_info['endpoint'])
             else:
                 LOG.error('Failed to initialize SolidFire driver to '
                           'a remote cluster specified at id: %s',
                           self.failed_over_id)
-        else:
-            self._set_active_cluster_info()

+        # NOTE(jdg): This works even in a failed-over state, because what
+        # we do is use self.active_cluster in issue_api_request, so by
+        # default we always use the currently active cluster; override
+        # that by providing an endpoint to issue_api_request if needed.
         try:
             self._update_cluster_status()
         except exception.SolidFireAPIException:
@@ -240,8 +250,7 @@ class SolidFireDriver(san.SanISCSIDriver):
             account = self.configuration.sf_template_account_name
             self.template_account_id = self._create_template_account(account)

-        if not self.failed_over_id:
-            self._set_cluster_pairs()
+        self._set_cluster_pairs()

     def locked_image_id_operation(f, external=False):
         def lvo_inner1(inst, *args, **kwargs):
@@ -348,7 +357,8 @@ class SolidFireDriver(san.SanISCSIDriver):
                     remote_info['clusterPairID'] = ep['clusterPairID']
                     break

-            if not remote_pair:
+            if (not remote_pair and
+                    remote_info['mvip'] != self.active_cluster['mvip']):
                 # NOTE(jdg): create_remote_pairing sets the
                 # clusterPairID in remote_info for us
                 self._create_remote_pairing(remote_info)
@@ -356,23 +366,51 @@ class SolidFireDriver(san.SanISCSIDriver):
         LOG.debug("Setting replication_enabled to True.")
         self.replication_enabled = True

-    def _set_active_cluster_info(self, endpoint=None):
+    def _create_cluster_reference(self, endpoint=None):
+        cluster_ref = {}
+        cluster_ref['endpoint'] = endpoint
         if not endpoint:
-            self.active_cluster_info['endpoint'] = self._build_endpoint_info()
+            cluster_ref['endpoint'] = self._build_endpoint_info()
+
+        cluster_info = (self._issue_api_request(
+            'GetClusterInfo', {}, endpoint=cluster_ref['endpoint'])
+            ['result']['clusterInfo'])
+
+        for k, v in cluster_info.items():
+            cluster_ref[k] = v
+
+        # Add a couple extra things that are handy for us
+        cluster_ref['clusterAPIVersion'] = (
+            self._issue_api_request('GetClusterVersionInfo',
+                                    {}, endpoint=cluster_ref['endpoint'])
+            ['result']['clusterAPIVersion'])
+
+        # FIXME(jdg): This is fine for the default/base cluster, but
+        # if we have a secondary configured and are using vlans etc.,
+        # we don't use what's in the config (that's the primary only);
+        # we need to set this from the replication_device config
+        if self.configuration.get('sf_svip', None):
+            cluster_ref['svip'] = (
+                self.configuration.get('sf_svip'))
+        return cluster_ref
+
+    def _set_active_cluster(self, endpoint=None):
+        if not endpoint:
+            self.active_cluster['endpoint'] = self._build_endpoint_info()
         else:
-            self.active_cluster_info['endpoint'] = endpoint
+            self.active_cluster['endpoint'] = endpoint

         for k, v in self._issue_api_request(
                 'GetClusterInfo', {})['result']['clusterInfo'].items():
-            self.active_cluster_info[k] = v
+            self.active_cluster[k] = v

         # Add a couple extra things that are handy for us
-        self.active_cluster_info['clusterAPIVersion'] = (
+        self.active_cluster['clusterAPIVersion'] = (
             self._issue_api_request('GetClusterVersionInfo',
                                     {})['result']['clusterAPIVersion'])
         if self.configuration.get('sf_svip', None):
-            self.active_cluster_info['svip'] = (
+            self.active_cluster['svip'] = (
                 self.configuration.get('sf_svip'))

     def _create_provider_id_string(self,
@@ -383,7 +421,7 @@ class SolidFireDriver(san.SanISCSIDriver):
         # swap that with the parent volume id
         return "%s %s %s" % (resource_id,
                              account_or_vol_id,
-                             self.active_cluster_info['uuid'])
+                             self.active_cluster['uuid'])

     def _init_snapshot_mappings(self, srefs):
         updates = []
@@ -470,7 +508,7 @@ class SolidFireDriver(san.SanISCSIDriver):
         if params is None:
             params = {}
         if endpoint is None:
-            endpoint = self.active_cluster_info['endpoint']
+            endpoint = self.active_cluster['endpoint']

         payload = {'method': method, 'params': params}
         url = '%s/json-rpc/%s/' % (endpoint['url'], version)
@@ -577,7 +615,7 @@ class SolidFireDriver(san.SanISCSIDriver):
         if endpoint:
             iscsi_portal = endpoint['svip']
         else:
-            iscsi_portal = self.active_cluster_info['svip']
+            iscsi_portal = self.active_cluster['svip']
         if ':' not in iscsi_portal:
             iscsi_portal += ':3260'

@@ -1343,7 +1381,11 @@ class SolidFireDriver(san.SanISCSIDriver):
             type_ref = volume_types.get_volume_type(ctxt, type_id)

         specs = type_ref.get('extra_specs')
-        if specs.get('replication', 'disabled').lower() == 'enabled':
+        # We use the replication_enabled flag both as the trigger in the
+        # driver and as a capability for the scheduler. Note we no
+        # longer require or check for the additional "replication:True|False"
+        # spec in the type.
+        if specs.get('replication_enabled', "") == "<is> True":
             rep_opts['targets'] = specs.get(
                 'solidfire:replication_targets', self.cluster_pairs[0])
         return rep_opts
@@ -1824,7 +1866,7 @@ class SolidFireDriver(san.SanISCSIDriver):
         data['replication_enabled'] = self.replication_enabled
         if self.replication_enabled:
             data['replication'] = 'enabled'
-        data['active_cluster_mvip'] = self.active_cluster_info['mvip']
+        data['active_cluster_mvip'] = self.active_cluster['mvip']
         data['reserved_percentage'] = self.configuration.reserved_percentage
         data['QoS_support'] = True

@@ -2058,35 +2100,88 @@ class SolidFireDriver(san.SanISCSIDriver):
         self._issue_api_request('ModifyVolume',
                                 params, version='5.0')

-    def _failover_volume(self, remote_vol, remote):
+    def _failover_volume(self, src_vol, tgt_vol, tgt_cluster):
         """Modify remote volume to R/W mode."""
-        self._issue_api_request(
-            'RemoveVolumePair',
-            {'volumeID': remote_vol['volumeID']},
-            endpoint=remote['endpoint'], version='7.0')
+        # Put the src in tgt mode, assuming it's still available;
+        # catch the exception if the cluster isn't available and
+        # continue on
+        params = {'volumeID': src_vol['volumeID'],
+                  'access': 'replicationTarget'}
+        try:
+            self._issue_api_request('ModifyVolume', params)
+        except exception.SolidFireAPIException:
+            # FIXME: src cluster may be unreachable; swallow and continue
            pass

-        params = {'volumeID': remote_vol['volumeID'],
+        # Now call out to the remote and make the tgt our new src
+        params = {'volumeID': tgt_vol['volumeID'],
                   'access': 'readWrite'}
         self._issue_api_request('ModifyVolume', params,
-                                endpoint=remote['endpoint'])
+                                endpoint=tgt_cluster['endpoint'])

     def failover_host(self, context, volumes, secondary_id=None, groups=None):
-        """Failover to replication target."""
+        """Failover to replication target.
+
+        In order to fail back, you MUST specify the original/default cluster
+        using the secondary_id option.
+        You can do this simply by specifying: `secondary_id=default`
+        """
+        failback = False
         volume_updates = []
         remote = None
+        secondary_id = secondary_id.lower() if secondary_id else None
+
+        # FIXME(jdg): There's an awful lot going on in this if/else block;
+        # it's pretty simple in terms of what it does, but it would be
+        # good to come back and clean it up and make it a bit more
+        # readable/maintainable.
+
+        # There are two cases we have to deal with:
+        # 1. Caller specified a backend target to fail to
+        # 2. Caller just wants to fail over to anything available
+        # In case `1` we need to check if they specified the default
+        # and want to fail back, so make sure we're even failed over
+        #
+        # In case `2` they didn't specify a target, but if we're failed
+        # over already, we can't just grab a target off the list; we
+        # might already be on that target, so check that and try to go
+        # back whence we came
         if secondary_id:
-            for rc in self.cluster_pairs:
-                if rc['mvip'] == secondary_id:
-                    remote = rc
-                    break
+            if secondary_id == "default" and not self.failed_over:
+                LOG.error("SolidFire driver received failover_host "
+                          "specifying failback to default, but the "
+                          "host is not in a `failed_over` state, "
+                          "so it can't fail back.")
+                raise exception.InvalidReplicationTarget
+            elif secondary_id == "default" and self.failed_over:
+                remote = self.default_cluster
+                failback = True
+                # TODO(jdg): Add a simple check here to make
+                # sure the default is online
+            else:
+                for rc in self.cluster_pairs:
+                    if rc['mvip'] == secondary_id:
+                        remote = rc
+                        break
             if not remote:
                 LOG.error("SolidFire driver received failover_host "
                           "but was unable to find specified replication "
                           "pair with id: %s.", secondary_id)
                 raise exception.InvalidReplicationTarget
         else:
-            remote = self.cluster_pairs[0]
+            # Otherwise, we just grab a target off the list;
+            # but beware, we may already be failed over and there
+            # may not be another target left, so recycle back to
+            # the default
+            if self.failed_over:
+                for cp in self.cluster_pairs:
+                    if cp['endpoint'] != self.active_cluster['endpoint']:
+                        remote = cp
+                if not remote:
+                    remote = self.default_cluster
+                    failback = True
+            else:
+                remote = self.cluster_pairs[0]

         if not remote or not self.replication_enabled:
             LOG.error("SolidFire driver received failover_host "
@@ -2097,24 +2192,25 @@ class SolidFireDriver(san.SanISCSIDriver):
                       "on non replicated "
                       "backend."))

-        remote_vols = self._map_sf_volumes(volumes,
+        # Ok, that was annoying; get on with it
+        target_vols = self._map_sf_volumes(volumes,
                                            endpoint=remote['endpoint'])
         primary_vols = self._map_sf_volumes(volumes)
         for v in volumes:
-            remote_vlist = [sfv for sfv in remote_vols
+            target_vlist = [sfv for sfv in target_vols
                             if sfv['cinder_id'] == v['id']]

-            if len(remote_vlist) > 0:
-                remote_vol = remote_vlist[0]
-                self._failover_volume(remote_vol, remote)
+            if len(target_vlist) > 0:
+                target_vol = target_vlist[0]
+                # BOOKMARK This fails on failback using 'default'
+                #
                 primary_vol = [sfv for sfv in primary_vols if
                                sfv['cinder_id'] == v['id']][0]
-                if len(primary_vol['volumePairs']) > 0:
-                    self._issue_api_request(
-                        'RemoveVolumePair',
-                        {'volumeID': primary_vol['volumeID']},
-                        version='7.0')
-                iqn = remote_vol['iqn']
+                self._failover_volume(primary_vol, target_vol, remote)
+
+                # Now we need to update the iqn of the volume to match
+                # the target SVIP etc.
+                iqn = target_vol['iqn']
                 volume_updates.append(
                     {'volume_id': v['id'],
                      'updates': {
@@ -2131,10 +2227,14 @@ class SolidFireDriver(san.SanISCSIDriver):
         # has been pretty much stateless and has allowed customers to run
         # active/active HA c-vol services with SolidFire. The introduction of
         # the active_cluster and failed_over attributes is going to break that
-        # but for now that's going to be the trade off of using replciation
-        self.active_cluster_info = remote
+        # but for now that's going to be the trade off of using replication
+        active_cluster_id = remote['mvip']
+        self.active_cluster = remote
         self.failed_over = True
-        return remote['mvip'], volume_updates, []
+        if failback:
+            active_cluster_id = 'default'
+
+        return active_cluster_id, volume_updates, []

     def freeze_backend(self, context):
         """Freeze backend notification."""
diff --git a/releasenotes/notes/add_replication_failback_to_solidfire-82668c071f4fa91d.yaml b/releasenotes/notes/add_replication_failback_to_solidfire-82668c071f4fa91d.yaml
new file mode 100644
index 00000000000..8f5c037f75c
--- /dev/null
+++ b/releasenotes/notes/add_replication_failback_to_solidfire-82668c071f4fa91d.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Added the ability to call ``cinder failover-host`` a second time
+    on a replication-enabled SolidFire backend, with ``default`` as
+    the backend id, to initiate a failback to the default configured
+    SolidFire cluster.
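
A quick usage note on opting volumes into replication: the driver now
keys on the `replication_enabled` extra spec shown in the updated unit
test (the old `replication: enabled` spec is no longer checked). A
minimal sketch, with a made-up type name, using the usual `<is> True`
boolean convention for extra specs:

    cinder type-create sf-repl
    cinder type-key sf-repl set replication_enabled='<is> True'
    cinder create 10 --volume-type sf-repl --name my-repl-vol

Volumes of that type are paired with the configured replication target
(or the type's `solidfire:replication_targets`, if set) and are the
ones acted on by the failover/failback calls above.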