Merge "NetApp SolidFire: Fix failback failing after service restart" into stable/train

Zuul 2020-06-28 21:20:38 +00:00 committed by Gerrit Code Review
commit 3b92f873ef
3 changed files with 85 additions and 53 deletions

Changed file: SolidFire driver unit tests (SolidFireVolumeTestCase)

@@ -159,6 +159,7 @@ class SolidFireVolumeTestCase(test.TestCase):
                            'login': 'admin'},
              'name': 'AutoTest2-6AjG-FOR-TEST-ONLY',
              'clusterPairID': 33,
+             'clusterAPIVersion': '9.4',
              'uuid': '9c499d4b-8fff-48b4-b875-27601d5d9889',
              'svip': '10.10.23.2',
              'mvipNodeID': 1,
@@ -3166,7 +3167,17 @@ class SolidFireVolumeTestCase(test.TestCase):
             cinder_vols.append(vol)
         mock_map_sf_volumes.return_value = sf_vols
+        mock_create_cluster_reference.return_value = self.cluster_pairs[0]
+
+        self.configuration.replication_device = []
+
+        reset_mocks()
+        drv_args = {'active_backend_id': None}
+        sfv = solidfire.SolidFireDriver(configuration=self.configuration,
+                                        **drv_args)
+        self.assertRaises(exception.UnableToFailOver,
+                          sfv.failover_host, ctx, cinder_vols, 'fake', None)
+        mock_map_sf_volumes.assert_not_called()

         fake_replication_device = {'backend_id': 'fake',
                                    'mvip': '0.0.0.0',
@@ -3183,14 +3194,6 @@ class SolidFireVolumeTestCase(test.TestCase):
                           sfv.failover_host, ctx, cinder_vols, 'default', None)
         mock_map_sf_volumes.assert_not_called()

-        reset_mocks()
-        drv_args = {'active_backend_id': 'default'}
-        sfv = solidfire.SolidFireDriver(configuration=self.configuration,
-                                        **drv_args)
-        self.assertRaises(exception.UnableToFailOver,
-                          sfv.failover_host, ctx, cinder_vols, 'default', None)
-        mock_map_sf_volumes.assert_not_called()
-
         reset_mocks()
         drv_args = {'active_backend_id': None}
         sfv = solidfire.SolidFireDriver(configuration=self.configuration,
@@ -3200,15 +3203,28 @@ class SolidFireVolumeTestCase(test.TestCase):
                           secondary_id='not_fake_id', groups=None)
         mock_map_sf_volumes.assert_not_called()

+        mock_create_cluster_reference.return_value = self.cluster_pairs[0]
+
         reset_mocks()
-        drv_args = {'active_backend_id': None}
+        drv_args = {'active_backend_id': 'secondary'}
         sfv = solidfire.SolidFireDriver(configuration=self.configuration,
                                         **drv_args)
-        sfv.cluster_pairs = [None]
-        self.assertRaises(exception.UnableToFailOver,
-                          sfv.failover_host, ctx, cinder_vols,
-                          secondary_id='fake', groups=None)
-        mock_map_sf_volumes.assert_not_called()
+        sfv.cluster_pairs = self.cluster_pairs
+        sfv.cluster_pairs[0]['backend_id'] = 'fake'
+        sfv.replication_enabled = True
+
+        cluster_id, updates, _ = sfv.failover_host(
+            ctx, cinder_vols, secondary_id='default', groups=None)
+
+        self.assertEqual(5, len(updates))
+        for update in updates:
+            self.assertEqual(fields.ReplicationStatus.ENABLED,
+                             update['updates']['replication_status'])
+        self.assertEqual('', cluster_id)
+
+        mock_get_create_account.assert_called()
+        mock_failover_volume.assert_called()
+        mock_map_sf_volumes.assert_called()
+        mock_update_cluster_status.assert_called()
+        mock_set_cluster_pairs.assert_called()
+        mock_create_cluster_reference.assert_called()

         reset_mocks()
         drv_args = {'active_backend_id': None}
@@ -3228,11 +3244,9 @@ class SolidFireVolumeTestCase(test.TestCase):
         mock_get_create_account.assert_called()
         mock_failover_volume.assert_called()
         mock_map_sf_volumes.assert_called()
-        mock_get_cluster_info.assert_not_called()
         mock_update_cluster_status.assert_called()
         mock_set_cluster_pairs.assert_called()
         mock_create_cluster_reference.assert_called()
-        mock_issue_api_request.assert_not_called()

     @mock.patch.object(solidfire.SolidFireDriver, '_issue_api_request')
     @mock.patch.object(solidfire.SolidFireDriver, '_create_cluster_reference')

Changed file: SolidFire volume driver (SolidFireDriver)

@@ -223,9 +223,11 @@ class SolidFireDriver(san.SanISCSIDriver):
         2.0.15 - Fix bug #1834013 NetApp SolidFire replication errors
         2.0.16 - Add options for replication mode (Async, Sync and
                  SnapshotsOnly)
+        2.0.17 - Fix bug #1859653 SolidFire fails to failback when volume
+                 service is restarted

     """

-    VERSION = '2.0.16'
+    VERSION = '2.0.17'

     # ThirdPartySystems wiki page
     CI_WIKI_NAME = "NetApp_SolidFire_CI"
@@ -300,15 +302,13 @@ class SolidFireDriver(san.SanISCSIDriver):
             self.active_cluster = self._create_cluster_reference(
                 remote_endpoint)
-            # When in failed-over state, we have only endpoint info from the
-            # primary cluster.
-            self.primary_cluster = {"endpoint": self._build_endpoint_info()}
             self.failed_over = True
+            self.replication_enabled = True
         else:
-            self.primary_cluster = self._create_cluster_reference()
-            self.active_cluster = self.primary_cluster
+            self.active_cluster = self._create_cluster_reference()
             if self.configuration.replication_device:
                 self._set_cluster_pairs()
-                self.replication_enabled = True

         LOG.debug("Active cluster: %s", self.active_cluster)
@@ -437,9 +437,11 @@ class SolidFireDriver(san.SanISCSIDriver):
             # clusterPairID in remote_info for us
             self._create_remote_pairing(remote_info)
+            if self.cluster_pairs:
+                self.cluster_pairs.clear()
             self.cluster_pairs.append(remote_info)
             LOG.debug("Available cluster pairs: %s", self.cluster_pairs)

+        self.replication_enabled = True

     def _create_cluster_reference(self, endpoint=None):
         cluster_ref = {}
@@ -2352,8 +2354,13 @@
         failback = False
         volume_updates = []

-        LOG.info("Failing over. Secondary ID is: %s",
-                 secondary_id)
+        if not self.replication_enabled:
+            LOG.error("SolidFire driver received failover_host "
+                      "request, however replication is NOT "
+                      "enabled.")
+            raise exception.UnableToFailOver(reason=_("Failover requested "
+                                                      "on non replicated "
+                                                      "backend."))

         # NOTE(erlon): For now we only support one replication target device.
         # So, there are two cases we have to deal with here:
@@ -2371,8 +2378,10 @@
                     "state.")
             raise exception.InvalidReplicationTarget(msg)
         elif secondary_id == "default" and self.failed_over:
-            remote = self.primary_cluster
+            LOG.info("Failing back to primary cluster.")
+            remote = self._create_cluster_reference()
             failback = True
         else:
             repl_configs = self.configuration.replication_device[0]
             if secondary_id and repl_configs['backend_id'] != secondary_id:
@@ -2380,25 +2389,24 @@
                         "on cinder.conf.") % secondary_id
                 raise exception.InvalidReplicationTarget(msg)

+            LOG.info("Failing over to secondary cluster %s.", secondary_id)
             remote = self.cluster_pairs[0]

-        if not remote or not self.replication_enabled:
-            LOG.error("SolidFire driver received failover_host "
-                      "request, however replication is NOT "
-                      "enabled, or there are no available "
-                      "targets to fail-over to.")
-            raise exception.UnableToFailOver(reason=_("Failover requested "
-                                                      "on non replicated "
-                                                      "backend."))
+        LOG.debug("Target cluster to failover: %s.",
+                  {'name': remote['name'],
+                   'mvip': remote['mvip'],
+                   'clusterAPIVersion': remote['clusterAPIVersion']})

         target_vols = self._map_sf_volumes(volumes,
                                            endpoint=remote['endpoint'])
-        LOG.debug("Mapped target_vols: %s", target_vols)
+        LOG.debug("Total Cinder volumes found in target: %d",
+                  len(target_vols))

         primary_vols = None
         try:
             primary_vols = self._map_sf_volumes(volumes)
-            LOG.debug("Mapped Primary_vols: %s", target_vols)
+            LOG.debug("Total Cinder volumes found in primary cluster: %d",
+                      len(primary_vols))
         except SolidFireAPIException:
             # API Request failed on source. Failover/failback will skip next
             # calls to it.
@@ -2433,14 +2441,26 @@
             else:
                 primary_vol = None

-            LOG.debug('Failing-over volume %s, target vol %s, '
-                      'primary vol %s', v, target_vol, primary_vol)
+            LOG.info('Failing-over volume %s.', v.id)
+            LOG.debug('Target vol: %s',
+                      {'access': target_vol['access'],
+                       'accountID': target_vol['accountID'],
+                       'name': target_vol['name'],
+                       'status': target_vol['status'],
+                       'volumeID': target_vol['volumeID']})
+            LOG.debug('Primary vol: %s',
+                      {'access': primary_vol['access'],
+                       'accountID': primary_vol['accountID'],
+                       'name': primary_vol['name'],
+                       'status': primary_vol['status'],
+                       'volumeID': primary_vol['volumeID']})

             try:
                 self._failover_volume(target_vol, remote, primary_vol)

                 sf_account = self._get_create_account(
                     v.project_id, endpoint=remote['endpoint'])
+                LOG.debug("Target account: %s", sf_account['accountID'])

                 conn_info = self._build_connection_info(
                     sf_account, target_vol, endpoint=remote['endpoint'])
@@ -2468,12 +2488,7 @@
             except Exception as e:
                 volume_updates.append({'volume_id': v['id'],
                                        'updates': {'status': 'error', }})
-                if failback:
-                    LOG.error("Error trying to failback volume %s", v.id)
-                else:
-                    LOG.error("Error trying to failover volume %s", v.id)
+                LOG.error("Error trying to failover volume %s", v.id)
                 msg = e.message if hasattr(e, 'message') else e
                 LOG.exception(msg)
@@ -2481,20 +2496,17 @@
                 volume_updates.append({'volume_id': v['id'],
                                        'updates': {'status': 'error', }})

-        # FIXME(jdg): This introduces a problem for us, up until now our driver
-        # has been pretty much stateless and has allowed customers to run
-        # active/active HA c-vol services with SolidFire. The introduction of
-        # the active_cluster and failed_over attributes is going to break that
-        # but for now that's going to be the trade off of using replication
+        self.active_cluster = remote
+
         if failback:
-            active_cluster_id = None
+            active_cluster_id = ''
             self.failed_over = False
+            # Recreating cluster pairs after a successful failback
+            self._set_cluster_pairs()
         else:
             active_cluster_id = remote['backend_id']
             self.failed_over = True
-            self.active_cluster = remote

         return active_cluster_id, volume_updates, []

     def freeze_backend(self, context):

New file: release note

@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    NetApp SolidFire driver: Fixed an issue that causes failback
+    to fail after a volume service restart. This change fixes
+    bug `1859653 <https://bugs.launchpad.net/cinder/+bug/1859653>`_.
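
For context, the failure mode addressed here occurred when the cinder-volume service was restarted while the backend was failed over: on restart the driver initialized from active_backend_id without marking replication as enabled, so a later failback request (secondary_id='default') hit the UnableToFailOver guard. The snippet below is a minimal sketch of that sequence, loosely modelled on the updated unit test; backend_config and cinder_volumes are hypothetical stand-ins for a real backend configuration object and the volumes being failed back, not objects defined by this change.

    # Sketch only -- mirrors the failback-after-restart scenario covered by
    # the new unit test; 'backend_config' and 'cinder_volumes' are placeholders.
    from cinder import context
    from cinder.volume.drivers import solidfire

    ctx = context.get_admin_context()

    # Simulated c-vol restart while failed over: the driver comes up with
    # active_backend_id pointing at the replication target. With this fix,
    # that initialization path also sets replication_enabled = True.
    driver = solidfire.SolidFireDriver(configuration=backend_config,
                                       active_backend_id='secondary')

    # Failback request. Before the fix this raised exception.UnableToFailOver
    # because the restarted driver never re-enabled replication.
    cluster_id, volume_updates, group_updates = driver.failover_host(
        ctx, cinder_volumes, secondary_id='default', groups=None)

    # On success cluster_id is '' (back on the primary) and each entry in
    # volume_updates carries replication_status ENABLED, as the test asserts.

After a successful failback the driver also recreates its cluster pairs (see the _set_cluster_pairs call added above), so a subsequent failover works without another restart.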