From e7498ca5bdd6e16b46e8a6d17dbc7492f6e710e9 Mon Sep 17 00:00:00 2001
From: John Griffith
Date: Tue, 19 Dec 2017 22:05:01 +0000
Subject: [PATCH] Enable failback in SolidFire driver

This change fixes up a few things in the SolidFire driver to enable
"failing back" to the original cluster in a replication scenario. We
still assume a cluster-wide failover, but an admin now has the ability
to issue subsequent `cinder failover-host` calls to switch back and
forth between their SolidFire clusters.

This change promotes the target on a failover and now also attempts to
set up replication back to the original source (assuming the original
is available and not a puddle of melted goo). This means that writes
to the new backend will be replicated back to the source, so if a user
so chooses they can fail back to the original and keep their new data.
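As a rough sketch of the admin workflow this enables (the host name and
MVIP below are made up for illustration):

    # initial failover to a configured replication target
    cinder failover-host cinder@solidfire --backend_id 10.10.10.10

    # subsequent call to fail back to the original (default) cluster
    cinder failover-host cinder@solidfire --backend_id default

The backend_id is matched against the MVIP of a configured cluster
pair; the literal `default` requests a failback.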
Change-Id: I678986e557755c2fe4183927c17342e430f5df0d
---
 .../drivers/solidfire/test_solidfire.py       |   4 +-
 cinder/volume/drivers/solidfire.py            | 192 +++++++++++++-----
 ...ailback_to_solidfire-82668c071f4fa91d.yaml |   7 +
 3 files changed, 155 insertions(+), 48 deletions(-)
 create mode 100644 releasenotes/notes/add_replication_failback_to_solidfire-82668c071f4fa91d.yaml

diff --git a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
index ec028497c4b..2a5809edce8 100644
--- a/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
+++ b/cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
@@ -1104,7 +1104,7 @@ class SolidFireVolumeTestCase(test.TestCase):
         self.assertEqual('1.1.1.1:3260 0', v['provider_location'])

         configured_svip = '9.9.9.9:6500'
-        sfv.active_cluster_info['svip'] = configured_svip
+        sfv.active_cluster['svip'] = configured_svip
         v = sfv._get_model_info(sfaccount, 1)
         self.assertEqual('%s 0' % configured_svip, v['provider_location'])

@@ -1969,7 +1969,7 @@ class SolidFireVolumeTestCase(test.TestCase):
                          'fake-mvip'}]
         ctxt = None
         type_id = '290edb2a-f5ea-11e5-9ce9-5e5517507c66'
-        fake_type = {'extra_specs': {'replication': 'enabled'}}
+        fake_type = {'extra_specs': {'replication_enabled': '<is> True'}}
         with mock.patch.object(volume_types,
                                'get_volume_type',
                                return_value=fake_type):
diff --git a/cinder/volume/drivers/solidfire.py b/cinder/volume/drivers/solidfire.py
index 64836833807..edb3f9dd114 100644
--- a/cinder/volume/drivers/solidfire.py
+++ b/cinder/volume/drivers/solidfire.py
@@ -166,10 +166,11 @@ class SolidFireDriver(san.SanISCSIDriver):
         2.0.8 - Add active status filter to get volume ops
         2.0.9 - Always purge on delete volume
         2.0.10 - Add response to debug on retryable errors
+        2.0.11 - Add ability to fail back replicating volumes

     """

-    VERSION = '2.0.10'
+    VERSION = '2.0.11'

     # ThirdPartySystems wiki page
     CI_WIKI_NAME = "NetApp_SolidFire_CI"
@@ -210,7 +211,7 @@ class SolidFireDriver(san.SanISCSIDriver):
     def __init__(self, *args, **kwargs):
         super(SolidFireDriver, self).__init__(*args, **kwargs)
         self.failed_over_id = kwargs.get('active_backend_id', None)
-        self.active_cluster_info = {}
+        self.replication_status = kwargs.get('replication_status', "na")
         self.configuration.append_config_values(sf_opts)
         self.template_account_id = None
         self.max_volumes_per_account = 1990
@@ -220,17 +221,26 @@ class SolidFireDriver(san.SanISCSIDriver):
         self.failed_over = False
         self.target_driver = SolidFireISCSI(solidfire_driver=self,
                                             configuration=self.configuration)
+        self.default_cluster = self._create_cluster_reference()
+        self.active_cluster = self.default_cluster
+
+        # If we're failed over, we need to parse things out and set the
+        # active cluster appropriately
         if self.failed_over_id:
+            self.failed_over = True
             remote_info = self._get_remote_info_by_id(self.failed_over_id)
             if remote_info:
-                self._set_active_cluster_info(remote_info['endpoint'])
+                self.active_cluster = self._create_cluster_reference(
+                    remote_info['endpoint'])
             else:
                 LOG.error('Failed to initialize SolidFire driver to '
                           'a remote cluster specified at id: %s',
                           self.failed_over_id)
-        else:
-            self._set_active_cluster_info()

+        # NOTE(jdg): This works even in a failed-over state, because what
+        # we do is use self.active_cluster in issue_api_request, so by
+        # default we always use the currently active cluster; override
+        # that by providing an endpoint to issue_api_request if needed.
         try:
             self._update_cluster_status()
         except exception.SolidFireAPIException:
@@ -240,8 +250,7 @@ class SolidFireDriver(san.SanISCSIDriver):
             account = self.configuration.sf_template_account_name
             self.template_account_id = self._create_template_account(account)

-        if not self.failed_over_id:
-            self._set_cluster_pairs()
+        self._set_cluster_pairs()

     def locked_image_id_operation(f, external=False):
         def lvo_inner1(inst, *args, **kwargs):
@@ -348,7 +357,8 @@ class SolidFireDriver(san.SanISCSIDriver):
                     remote_info['clusterPairID'] = ep['clusterPairID']
                     break

-            if not remote_pair:
+            if (not remote_pair and
+                    remote_info['mvip'] != self.active_cluster['mvip']):
                 # NOTE(jdg): create_remote_pairing sets the
                 # clusterPairID in remote_info for us
                 self._create_remote_pairing(remote_info)
@@ -356,23 +366,51 @@ class SolidFireDriver(san.SanISCSIDriver):
         LOG.debug("Setting replication_enabled to True.")
         self.replication_enabled = True

-    def _set_active_cluster_info(self, endpoint=None):
+    def _create_cluster_reference(self, endpoint=None):
+        cluster_ref = {}
+        cluster_ref['endpoint'] = endpoint
         if not endpoint:
-            self.active_cluster_info['endpoint'] = self._build_endpoint_info()
+            cluster_ref['endpoint'] = self._build_endpoint_info()
+
+        cluster_info = (self._issue_api_request(
+            'GetClusterInfo', {}, endpoint=cluster_ref['endpoint'])
+            ['result']['clusterInfo'])
+
+        for k, v in cluster_info.items():
+            cluster_ref[k] = v
+
+        # Add a couple extra things that are handy for us
+        cluster_ref['clusterAPIVersion'] = (
+            self._issue_api_request('GetClusterVersionInfo',
+                                    {}, endpoint=cluster_ref['endpoint'])
+            ['result']['clusterAPIVersion'])
+
+        # FIXME(jdg): This is fine for the default/base cluster, but
+        # if we have a secondary configured and are using vlans etc.,
+        # we don't use what's in the config (that's the primary only);
+        # we need to set this from the replication_device config
+        if self.configuration.get('sf_svip', None):
+            cluster_ref['svip'] = (
+                self.configuration.get('sf_svip'))
+        return cluster_ref
+
+    def _set_active_cluster(self, endpoint=None):
+        if not endpoint:
+            self.active_cluster['endpoint'] = self._build_endpoint_info()
         else:
-            self.active_cluster_info['endpoint'] = endpoint
+            self.active_cluster['endpoint'] = endpoint

         for k, v in self._issue_api_request(
                 'GetClusterInfo', {})['result']['clusterInfo'].items():
-            self.active_cluster_info[k] = v
+            self.active_cluster[k] = v

         # Add a couple extra things that are handy for us
-        self.active_cluster_info['clusterAPIVersion'] = (
+        self.active_cluster['clusterAPIVersion'] = (
             self._issue_api_request('GetClusterVersionInfo',
                                     {})['result']['clusterAPIVersion'])
         if self.configuration.get('sf_svip', None):
-            self.active_cluster_info['svip'] = (
+            self.active_cluster['svip'] = (
                 self.configuration.get('sf_svip'))

     def _create_provider_id_string(self,
@@ -383,7 +421,7 @@ class SolidFireDriver(san.SanISCSIDriver):
         # swap that with the parent volume id
         return "%s %s %s" % (resource_id,
                              account_or_vol_id,
-                             self.active_cluster_info['uuid'])
+                             self.active_cluster['uuid'])

     def _init_snapshot_mappings(self, srefs):
         updates = []
@@ -470,7 +508,7 @@ class SolidFireDriver(san.SanISCSIDriver):
         if params is None:
             params = {}
         if endpoint is None:
-            endpoint = self.active_cluster_info['endpoint']
+            endpoint = self.active_cluster['endpoint']

         payload = {'method': method, 'params': params}
         url = '%s/json-rpc/%s/' % (endpoint['url'], version)
@@ -577,7 +615,7 @@ class SolidFireDriver(san.SanISCSIDriver):
         if endpoint:
             iscsi_portal = endpoint['svip']
         else:
-            iscsi_portal = self.active_cluster_info['svip']
+            iscsi_portal = self.active_cluster['svip']
         if ':' not in iscsi_portal:
             iscsi_portal += ':3260'

@@ -1343,7 +1381,11 @@ class SolidFireDriver(san.SanISCSIDriver):
             type_ref = volume_types.get_volume_type(ctxt, type_id)

         specs = type_ref.get('extra_specs')
-        if specs.get('replication', 'disabled').lower() == 'enabled':
+        # We use the replication_enabled flag both as the trigger in the
+        # driver and as a capability for the scheduler. Note we no
+        # longer require or check for the additional "replication:True|False"
+        # spec in the type.
+        if specs.get('replication_enabled', "") == "<is> True":
             rep_opts['targets'] = specs.get(
                 'solidfire:replication_targets', self.cluster_pairs[0])
         return rep_opts
@@ -1824,7 +1866,7 @@ class SolidFireDriver(san.SanISCSIDriver):
         data['replication_enabled'] = self.replication_enabled
         if self.replication_enabled:
             data['replication'] = 'enabled'
-        data['active_cluster_mvip'] = self.active_cluster_info['mvip']
+        data['active_cluster_mvip'] = self.active_cluster['mvip']
         data['reserved_percentage'] = self.configuration.reserved_percentage
         data['QoS_support'] = True

@@ -2058,35 +2100,88 @@ class SolidFireDriver(san.SanISCSIDriver):
         self._issue_api_request('ModifyVolume',
                                 params, version='5.0')

-    def _failover_volume(self, remote_vol, remote):
+    def _failover_volume(self, src_vol, tgt_vol, tgt_cluster):
         """Modify remote volume to R/W mode."""
-        self._issue_api_request(
-            'RemoveVolumePair',
-            {'volumeID': remote_vol['volumeID']},
-            endpoint=remote['endpoint'], version='7.0')
+        # Put the src in tgt mode, assuming it's still available;
+        # catch the exception if the cluster isn't available and
+        # continue on
+        params = {'volumeID': src_vol['volumeID'],
+                  'access': 'replicationTarget'}
+        try:
+            self._issue_api_request('ModifyVolume', params)
+        except exception.SolidFireAPIException:
+            # FIXME: src cluster may be unreachable; swallow and continue
            pass

-        params = {'volumeID': remote_vol['volumeID'],
+        # Now call out to the remote and make the tgt our new src
+        params = {'volumeID': tgt_vol['volumeID'],
                   'access': 'readWrite'}
         self._issue_api_request('ModifyVolume', params,
-                                endpoint=remote['endpoint'])
+                                endpoint=tgt_cluster['endpoint'])

     def failover_host(self, context, volumes, secondary_id=None, groups=None):
-        """Failover to replication target."""
+        """Failover to replication target.
+
+        In order to fail back, you MUST specify the original/default cluster
+        using the secondary_id option.
+        You can do this simply by specifying: `secondary_id=default`
+        """
+        failback = False
         volume_updates = []
         remote = None
+        secondary_id = secondary_id.lower() if secondary_id else None
+
+        # FIXME(jdg): There's an awful lot going on in this if/else block;
+        # it's pretty simple in terms of what it does, but it would be
+        # good to come back and clean it up and make it a bit more
+        # readable/maintainable.
+
+        # There are two cases we have to deal with:
+        # 1. Caller specified a backend target to fail to
+        # 2. Caller just wants to fail over to anything available
+        # In case `1` we need to check if they specified the default
+        # and want to fail back, so make sure we're even failed over
+        #
+        # In case `2` they didn't specify a target, but if we're failed
+        # over already, we can't just grab a target off the list; we
+        # might already be on that target, so check that and try to go
+        # back whence we came
         if secondary_id:
-            for rc in self.cluster_pairs:
-                if rc['mvip'] == secondary_id:
-                    remote = rc
-                    break
+            if secondary_id == "default" and not self.failed_over:
+                LOG.error("SolidFire driver received failover_host "
+                          "specifying failback to default, but the "
+                          "host is not in a `failed_over` state, "
+                          "so it can't fail back.")
+                raise exception.InvalidReplicationTarget
+            elif secondary_id == "default" and self.failed_over:
+                remote = self.default_cluster
+                failback = True
+                # TODO(jdg): Add a simple check here to make
+                # sure the default is online
+            else:
+                for rc in self.cluster_pairs:
+                    if rc['mvip'] == secondary_id:
+                        remote = rc
+                        break
             if not remote:
                 LOG.error("SolidFire driver received failover_host "
                           "but was unable to find specified replication "
                           "pair with id: %s.", secondary_id)
                 raise exception.InvalidReplicationTarget
         else:
-            remote = self.cluster_pairs[0]
+            # Otherwise, we just grab a target off the list;
+            # but beware, we may already be failed over and there
+            # may not be another target left, so recycle back to
+            # the default
+            if self.failed_over:
+                for cp in self.cluster_pairs:
+                    if cp['endpoint'] != self.active_cluster['endpoint']:
+                        remote = cp
+                if not remote:
+                    remote = self.default_cluster
+                    failback = True
+            else:
+                remote = self.cluster_pairs[0]

         if not remote or not self.replication_enabled:
             LOG.error("SolidFire driver received failover_host "
@@ -2097,24 +2192,25 @@ class SolidFireDriver(san.SanISCSIDriver):
                       "on non replicated "
                       "backend."))

-        remote_vols = self._map_sf_volumes(volumes,
+        # Ok, that was annoying; get on with it
+        target_vols = self._map_sf_volumes(volumes,
                                            endpoint=remote['endpoint'])
         primary_vols = self._map_sf_volumes(volumes)
         for v in volumes:
-            remote_vlist = [sfv for sfv in remote_vols
+            target_vlist = [sfv for sfv in target_vols
                             if sfv['cinder_id'] == v['id']]

-            if len(remote_vlist) > 0:
-                remote_vol = remote_vlist[0]
-                self._failover_volume(remote_vol, remote)
+            if len(target_vlist) > 0:
+                target_vol = target_vlist[0]
+                # BOOKMARK This fails on failback using 'default'
+                #
                 primary_vol = [sfv for sfv in primary_vols if
                                sfv['cinder_id'] == v['id']][0]
-                if len(primary_vol['volumePairs']) > 0:
-                    self._issue_api_request(
-                        'RemoveVolumePair',
-                        {'volumeID': primary_vol['volumeID']},
-                        version='7.0')
-                iqn = remote_vol['iqn']
+                self._failover_volume(primary_vol, target_vol, remote)
+
+                # Now we need to update the iqn of the volume to match
+                # the target SVIP etc.
+                iqn = target_vol['iqn']
                 volume_updates.append(
                     {'volume_id': v['id'],
                      'updates': {
@@ -2131,10 +2227,14 @@ class SolidFireDriver(san.SanISCSIDriver):
         # has been pretty much stateless and has allowed customers to run
         # active/active HA c-vol services with SolidFire. The introduction of
         # the active_cluster and failed_over attributes is going to break that
-        # but for now that's going to be the trade off of using replciation
-        self.active_cluster_info = remote
+        # but for now that's going to be the trade off of using replication
+        active_cluster_id = remote['mvip']
+        self.active_cluster = remote
         self.failed_over = True
-        return remote['mvip'], volume_updates, []
+        if failback:
+            active_cluster_id = 'default'
+
+        return active_cluster_id, volume_updates, []

     def freeze_backend(self, context):
         """Freeze backend notification."""
diff --git a/releasenotes/notes/add_replication_failback_to_solidfire-82668c071f4fa91d.yaml b/releasenotes/notes/add_replication_failback_to_solidfire-82668c071f4fa91d.yaml
new file mode 100644
index 00000000000..8f5c037f75c
--- /dev/null
+++ b/releasenotes/notes/add_replication_failback_to_solidfire-82668c071f4fa91d.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Added the ability to call ``cinder failover-host`` a second time
+    on a replication-enabled SolidFire backend, with ``default`` as
+    the backend id, to initiate a failback to the default configured
+    SolidFire cluster.
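
A quick usage note on opting volumes into replication: the driver now
keys on the `replication_enabled` extra spec shown in the updated unit
test (the old `replication: enabled` spec is no longer checked). A
minimal sketch, with a made-up type name, using the usual `<is> True`
boolean convention for extra specs:

    cinder type-create sf-repl
    cinder type-key sf-repl set replication_enabled='<is> True'
    cinder create 10 --volume-type sf-repl --name my-repl-vol

Volumes of that type are paired with the configured replication target
(or the type's `solidfire:replication_targets`, if set) and are the
ones acted on by the failover/failback calls above.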