Browse Source

Merge "NetApp SolidFire: Fix failback failing after service restart" into stable/train

changes/86/738586/1
Zuul 2 weeks ago
committed by Gerrit Code Review
parent
commit
3b92f873ef
3 changed files with 85 additions and 53 deletions
  1. +31
    -17
      cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py
  2. +48
    -36
      cinder/volume/drivers/solidfire.py
  3. +6
    -0
      releasenotes/notes/bug-1859653-solidfire-fix-failover-after-service-restart-77e5e4da45c9c1aa.yaml

+ 31
- 17
cinder/tests/unit/volume/drivers/solidfire/test_solidfire.py View File

@@ -159,6 +159,7 @@ class SolidFireVolumeTestCase(test.TestCase):
'login': 'admin'},
'name': 'AutoTest2-6AjG-FOR-TEST-ONLY',
'clusterPairID': 33,
'clusterAPIVersion': '9.4',
'uuid': '9c499d4b-8fff-48b4-b875-27601d5d9889',
'svip': '10.10.23.2',
'mvipNodeID': 1,
@@ -3166,7 +3167,17 @@ class SolidFireVolumeTestCase(test.TestCase):
cinder_vols.append(vol)

mock_map_sf_volumes.return_value = sf_vols
mock_create_cluster_reference.return_value = self.cluster_pairs[0]

self.configuration.replication_device = []

reset_mocks()
drv_args = {'active_backend_id': None}
sfv = solidfire.SolidFireDriver(configuration=self.configuration,
**drv_args)

self.assertRaises(exception.UnableToFailOver,
sfv.failover_host, ctx, cinder_vols, 'fake', None)
mock_map_sf_volumes.assert_not_called()

fake_replication_device = {'backend_id': 'fake',
'mvip': '0.0.0.0',
@@ -3183,14 +3194,6 @@ class SolidFireVolumeTestCase(test.TestCase):
sfv.failover_host, ctx, cinder_vols, 'default', None)
mock_map_sf_volumes.assert_not_called()

reset_mocks()
drv_args = {'active_backend_id': 'default'}
sfv = solidfire.SolidFireDriver(configuration=self.configuration,
**drv_args)
self.assertRaises(exception.UnableToFailOver,
sfv.failover_host, ctx, cinder_vols, 'default', None)
mock_map_sf_volumes.assert_not_called()

reset_mocks()
drv_args = {'active_backend_id': None}
sfv = solidfire.SolidFireDriver(configuration=self.configuration,
@@ -3200,15 +3203,28 @@ class SolidFireVolumeTestCase(test.TestCase):
secondary_id='not_fake_id', groups=None)
mock_map_sf_volumes.assert_not_called()

mock_create_cluster_reference.return_value = self.cluster_pairs[0]

reset_mocks()
drv_args = {'active_backend_id': None}
drv_args = {'active_backend_id': 'secondary'}
sfv = solidfire.SolidFireDriver(configuration=self.configuration,
**drv_args)
sfv.cluster_pairs = [None]
self.assertRaises(exception.UnableToFailOver,
sfv.failover_host, ctx, cinder_vols,
secondary_id='fake', groups=None)
mock_map_sf_volumes.assert_not_called()
sfv.cluster_pairs = self.cluster_pairs
sfv.cluster_pairs[0]['backend_id'] = 'fake'
sfv.replication_enabled = True
cluster_id, updates, _ = sfv.failover_host(
ctx, cinder_vols, secondary_id='default', groups=None)
self.assertEqual(5, len(updates))
for update in updates:
self.assertEqual(fields.ReplicationStatus.ENABLED,
update['updates']['replication_status'])
self.assertEqual('', cluster_id)
mock_get_create_account.assert_called()
mock_failover_volume.assert_called()
mock_map_sf_volumes.assert_called()
mock_update_cluster_status.assert_called()
mock_set_cluster_pairs.assert_called()
mock_create_cluster_reference.assert_called()

reset_mocks()
drv_args = {'active_backend_id': None}
@@ -3228,11 +3244,9 @@ class SolidFireVolumeTestCase(test.TestCase):
mock_get_create_account.assert_called()
mock_failover_volume.assert_called()
mock_map_sf_volumes.assert_called()
mock_get_cluster_info.assert_not_called()
mock_update_cluster_status.assert_called()
mock_set_cluster_pairs.assert_called()
mock_create_cluster_reference.assert_called()
mock_issue_api_request.assert_not_called()

@mock.patch.object(solidfire.SolidFireDriver, '_issue_api_request')
@mock.patch.object(solidfire.SolidFireDriver, '_create_cluster_reference')


+ 48
- 36
cinder/volume/drivers/solidfire.py View File

@@ -223,9 +223,11 @@ class SolidFireDriver(san.SanISCSIDriver):
2.0.15 - Fix bug #1834013 NetApp SolidFire replication errors
2.0.16 - Add options for replication mode (Async, Sync and
SnapshotsOnly)
2.0.17 - Fix bug #1859653 SolidFire fails to failback when volume
service is restarted
"""

VERSION = '2.0.16'
VERSION = '2.0.17'

# ThirdPartySystems wiki page
CI_WIKI_NAME = "NetApp_SolidFire_CI"
@@ -300,15 +302,13 @@ class SolidFireDriver(san.SanISCSIDriver):
self.active_cluster = self._create_cluster_reference(
remote_endpoint)

# When in failed-over state, we have only endpoint info from the
# primary cluster.
self.primary_cluster = {"endpoint": self._build_endpoint_info()}
self.failed_over = True
self.replication_enabled = True
else:
self.primary_cluster = self._create_cluster_reference()
self.active_cluster = self.primary_cluster
self.active_cluster = self._create_cluster_reference()
if self.configuration.replication_device:
self._set_cluster_pairs()
self.replication_enabled = True

LOG.debug("Active cluster: %s", self.active_cluster)

@@ -437,9 +437,11 @@ class SolidFireDriver(san.SanISCSIDriver):
# clusterPairID in remote_info for us
self._create_remote_pairing(remote_info)

if self.cluster_pairs:
self.cluster_pairs.clear()

self.cluster_pairs.append(remote_info)
LOG.debug("Available cluster pairs: %s", self.cluster_pairs)
self.replication_enabled = True

def _create_cluster_reference(self, endpoint=None):
cluster_ref = {}
@@ -2352,8 +2354,13 @@ class SolidFireDriver(san.SanISCSIDriver):
failback = False
volume_updates = []

LOG.info("Failing over. Secondary ID is: %s",
secondary_id)
if not self.replication_enabled:
LOG.error("SolidFire driver received failover_host "
"request, however replication is NOT "
"enabled.")
raise exception.UnableToFailOver(reason=_("Failover requested "
"on non replicated "
"backend."))

# NOTE(erlon): For now we only support one replication target device.
# So, there are two cases we have to deal with here:
@@ -2371,8 +2378,10 @@ class SolidFireDriver(san.SanISCSIDriver):
"state.")
raise exception.InvalidReplicationTarget(msg)
elif secondary_id == "default" and self.failed_over:
remote = self.primary_cluster
LOG.info("Failing back to primary cluster.")
remote = self._create_cluster_reference()
failback = True

else:
repl_configs = self.configuration.replication_device[0]
if secondary_id and repl_configs['backend_id'] != secondary_id:
@@ -2380,25 +2389,24 @@ class SolidFireDriver(san.SanISCSIDriver):
"on cinder.conf.") % secondary_id
raise exception.InvalidReplicationTarget(msg)

LOG.info("Failing over to secondary cluster %s.", secondary_id)
remote = self.cluster_pairs[0]

if not remote or not self.replication_enabled:
LOG.error("SolidFire driver received failover_host "
"request, however replication is NOT "
"enabled, or there are no available "
"targets to fail-over to.")
raise exception.UnableToFailOver(reason=_("Failover requested "
"on non replicated "
"backend."))
LOG.debug("Target cluster to failover: %s.",
{'name': remote['name'],
'mvip': remote['mvip'],
'clusterAPIVersion': remote['clusterAPIVersion']})

target_vols = self._map_sf_volumes(volumes,
endpoint=remote['endpoint'])
LOG.debug("Mapped target_vols: %s", target_vols)
LOG.debug("Total Cinder volumes found in target: %d",
len(target_vols))

primary_vols = None
try:
primary_vols = self._map_sf_volumes(volumes)
LOG.debug("Mapped Primary_vols: %s", target_vols)
LOG.debug("Total Cinder volumes found in primary cluster: %d",
len(primary_vols))
except SolidFireAPIException:
# API Request failed on source. Failover/failback will skip next
# calls to it.
@@ -2433,14 +2441,26 @@ class SolidFireDriver(san.SanISCSIDriver):
else:
primary_vol = None

LOG.debug('Failing-over volume %s, target vol %s, '
'primary vol %s', v, target_vol, primary_vol)
LOG.info('Failing-over volume %s.', v.id)
LOG.debug('Target vol: %s',
{'access': target_vol['access'],
'accountID': target_vol['accountID'],
'name': target_vol['name'],
'status': target_vol['status'],
'volumeID': target_vol['volumeID']})
LOG.debug('Primary vol: %s',
{'access': primary_vol['access'],
'accountID': primary_vol['accountID'],
'name': primary_vol['name'],
'status': primary_vol['status'],
'volumeID': primary_vol['volumeID']})

try:
self._failover_volume(target_vol, remote, primary_vol)

sf_account = self._get_create_account(
v.project_id, endpoint=remote['endpoint'])
LOG.debug("Target account: %s", sf_account['accountID'])

conn_info = self._build_connection_info(
sf_account, target_vol, endpoint=remote['endpoint'])
@@ -2468,12 +2488,7 @@ class SolidFireDriver(san.SanISCSIDriver):
except Exception as e:
volume_updates.append({'volume_id': v['id'],
'updates': {'status': 'error', }})

if failback:
LOG.error("Error trying to failback volume %s", v.id)
else:
LOG.error("Error trying to failover volume %s", v.id)

LOG.error("Error trying to failover volume %s", v.id)
msg = e.message if hasattr(e, 'message') else e
LOG.exception(msg)

@@ -2481,20 +2496,17 @@ class SolidFireDriver(san.SanISCSIDriver):
volume_updates.append({'volume_id': v['id'],
'updates': {'status': 'error', }})

# FIXME(jdg): This introduces a problem for us, up until now our driver
# has been pretty much stateless and has allowed customers to run
# active/active HA c-vol services with SolidFire. The introduction of
# the active_cluster and failed_over attributes is going to break that
# but for now that's going to be the trade off of using replication
self.active_cluster = remote

if failback:
active_cluster_id = None
active_cluster_id = ''
self.failed_over = False
# Recreating cluster pairs after a successful failback
self._set_cluster_pairs()
else:
active_cluster_id = remote['backend_id']
self.failed_over = True

self.active_cluster = remote

return active_cluster_id, volume_updates, []

def freeze_backend(self, context):


+ 6
- 0
releasenotes/notes/bug-1859653-solidfire-fix-failover-after-service-restart-77e5e4da45c9c1aa.yaml View File

@@ -0,0 +1,6 @@
---
fixes:
- |
NetApp SolidFire driver: Fixed an issue that caused failback
to fail after the volume service was restarted. This change fixes
bug `1859653 <https://bugs.launchpad.net/cinder/+bug/1859653>`_.

Loading…
Cancel
Save