[SVF]: Fixed host and group failover issues

[Spectrum Virtualize Family] Fixed issue regarding host-failover and
group-failover that impacts IBM Spectrum Virtualize Family storage
back-end performance and applies to mirror replication types such as
metro, global and GMCV volume replication types.

closes bug: #1898746
Change-Id: I34f764aaebe9166110eb91cf2ac3b41f173d2a68
This commit is contained in:
venkatakrishnathumu 2020-10-09 11:23:32 +00:00
parent 8ea6f80319
commit d56b47d339
4 changed files with 79 additions and 274 deletions

View File

@ -2515,7 +2515,7 @@ port_speed!N/A
except KeyError:
return self._errors['CMMVC5753E']
function = 'stop_access' if force_access else 'stop'
function = 'stop'
self._rccg_state_transition(function, rccg)
for rcrel_info in self._rcrelationship_list.values():
if rcrel_info['consistency_group_name'] == rccg['name']:
@ -10021,133 +10021,46 @@ class StorwizeSVCReplicationTestCase(test.TestCase):
get_relationship_info.assert_called_once_with(fake_name)
delete_relationship.assert_called_once_with(fake_name)
@ddt.data((True, True, 1), (False, True, 2),
(True, False, 2), (False, False, 2))
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'delete_vdisk')
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'delete_relationship')
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'get_relationship_info')
def test_retain_target_volume(self, get_relationship_info,
delete_relationship,
delete_vdisk):
@ddt.unpack
def test_retain_target_volume(self, target_volume, retain_aux_vol,
call_count, get_relationship_info,
delete_relationship, delete_vdisk):
# Set replication target.
self.driver.configuration.set_override('replication_device',
[self.rep_target])
self.driver.do_setup(self.ctxt)
fake_name = 'volume-%s' % fake.VOLUME_ID
target_volume_fake_name = (
storwize_const.REPLICA_AUX_VOL_PREFIX + fake_name)
target_change_fake_name = (
storwize_const.REPLICA_CHG_VOL_PREFIX + target_volume_fake_name)
get_relationship_info.return_value = {'aux_vdisk_name':
fake_name}
self.driver._helpers.delete_rc_volume(fake_name,
target_vol=True,
retain_aux_volume=True)
get_relationship_info.assert_called_once_with(target_volume_fake_name)
delete_relationship.assert_called_once_with(target_volume_fake_name)
target_vol=target_volume,
retain_aux_volume=retain_aux_vol)
calls = [mock.call(target_change_fake_name, force_delete=False,
vol_name = fake_name
change_vol_name = (storwize_const.REPLICA_CHG_VOL_PREFIX + vol_name)
if target_volume:
vol_name = (storwize_const.REPLICA_AUX_VOL_PREFIX + fake_name)
change_vol_name = (
storwize_const.REPLICA_CHG_VOL_PREFIX + vol_name)
get_relationship_info.assert_called_once_with(vol_name)
delete_relationship.assert_called_once_with(vol_name)
calls = [mock.call(change_vol_name, force_delete=False,
force_unmap=True)]
if (target_volume and not retain_aux_vol) or not target_volume:
calls.extend([mock.call(vol_name, force_delete=False,
force_unmap=True)])
delete_vdisk.assert_has_calls(calls, any_order=True)
self.assertEqual(1, delete_vdisk.call_count)
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'delete_vdisk')
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'delete_relationship')
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'get_relationship_info')
def test_retain_target_volume_invalid_parameters_1(
self, get_relationship_info,
delete_relationship,
delete_vdisk):
# Set replication target.
self.driver.configuration.set_override('replication_device',
[self.rep_target])
self.driver.do_setup(self.ctxt)
fake_name = 'volume-%s' % fake.VOLUME_ID
master_change_fake_name = (
storwize_const.REPLICA_CHG_VOL_PREFIX + fake_name)
get_relationship_info.return_value = {'aux_vdisk_name':
fake_name}
self.driver._helpers.delete_rc_volume(fake_name,
target_vol=False,
retain_aux_volume=True)
get_relationship_info.assert_called_once_with(fake_name)
delete_relationship.assert_called_once_with(fake_name)
calls = [mock.call(master_change_fake_name, force_delete=False,
force_unmap=True),
mock.call(fake_name, force_delete=False, force_unmap=True)]
delete_vdisk.assert_has_calls(calls, any_order=True)
self.assertEqual(2, delete_vdisk.call_count)
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'delete_vdisk')
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'delete_relationship')
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'get_relationship_info')
def test_retain_target_volume_invalid_parameters_2(
self, get_relationship_info,
delete_relationship,
delete_vdisk):
# Set replication target.
self.driver.configuration.set_override('replication_device',
[self.rep_target])
self.driver.do_setup(self.ctxt)
fake_name = 'volume-%s' % fake.VOLUME_ID
target_volume_fake_name = (
storwize_const.REPLICA_AUX_VOL_PREFIX + fake_name)
target_change_fake_name = (
storwize_const.REPLICA_CHG_VOL_PREFIX + target_volume_fake_name)
get_relationship_info.return_value = {'aux_vdisk_name':
fake_name}
self.driver._helpers.delete_rc_volume(fake_name,
target_vol=True,
retain_aux_volume=False)
get_relationship_info.assert_called_once_with(target_volume_fake_name)
delete_relationship.assert_called_once_with(target_volume_fake_name)
calls = [mock.call(target_change_fake_name, force_delete=False,
force_unmap=True),
mock.call(target_volume_fake_name, force_delete=False,
force_unmap=True)]
delete_vdisk.assert_has_calls(calls, any_order=True)
self.assertEqual(2, delete_vdisk.call_count)
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'delete_vdisk')
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'delete_relationship')
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'get_relationship_info')
def test_retain_target_volume_invalid_parameters_3(
self, get_relationship_info,
delete_relationship,
delete_vdisk):
# Set replication target.
self.driver.configuration.set_override('replication_device',
[self.rep_target])
self.driver.do_setup(self.ctxt)
fake_name = 'volume-%s' % fake.VOLUME_ID
master_change_fake_name = (
storwize_const.REPLICA_CHG_VOL_PREFIX + fake_name)
get_relationship_info.return_value = {'aux_vdisk_name':
fake_name}
self.driver._helpers.delete_rc_volume(fake_name,
target_vol=False,
retain_aux_volume=False)
get_relationship_info.assert_called_once_with(fake_name)
delete_relationship.assert_called_once_with(fake_name)
calls = [mock.call(master_change_fake_name, force_delete=False,
force_unmap=True),
mock.call(fake_name, force_delete=False, force_unmap=True)]
delete_vdisk.assert_has_calls(calls, any_order=True)
self.assertEqual(2, delete_vdisk.call_count)
self.assertEqual(call_count, delete_vdisk.call_count)
def test_storwize_failover_host_backend_error(self):
self.driver.configuration.set_override('replication_device',
@ -10445,26 +10358,20 @@ class StorwizeSVCReplicationTestCase(test.TestCase):
self.driver.delete_volume(non_replica_vol)
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'switch_relationship')
'start_relationship')
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'stop_relationship')
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
'get_relationship_info')
def test_failover_host_by_force_access(self, get_relationship_info,
stop_relationship,
switch_relationship):
def test_failover_host_by_force_access(self, stop_relationship,
start_relationship):
replica_obj = self.driver._get_replica_obj(storwize_const.METRO)
mm_vol, model_update = self._create_test_volume(self.mm_type)
target_vol = storwize_const.REPLICA_AUX_VOL_PREFIX + mm_vol.name
context = mock.Mock
get_relationship_info.side_effect = [{
'aux_vdisk_name': 'replica-12345678-1234-5678-1234-567812345678',
'name': 'RC_name'}]
switch_relationship.side_effect = exception.VolumeDriverException
replica_obj.failover_volume_host(context, mm_vol)
get_relationship_info.assert_called_once_with(target_vol)
switch_relationship.assert_called_once_with('RC_name')
stop_relationship.assert_called_once_with(target_vol, access=True)
calls = [mock.call(mm_vol.name), mock.call(target_vol, 'aux')]
start_relationship.assert_has_calls(calls, any_order=True)
self.assertEqual(2, start_relationship.call_count)
@mock.patch.object(storwize_svc_common.StorwizeSVCCommonDriver,
'_update_volume_stats')
@ -11259,32 +11166,18 @@ class StorwizeSVCReplicationTestCase(test.TestCase):
self.driver.failover_replication, self.ctxt, group,
vols, self.fake_target['backend_id'])
self.assertRaises(exception.UnableToFailOver,
self.driver.failover_replication, self.ctxt, group,
vols, self.rep_target['backend_id'])
self.assertRaises(exception.UnableToFailOver,
self.driver.failover_replication, self.ctxt, group,
vols, storwize_const.FAILBACK_VALUE)
with mock.patch.object(storwize_svc_common.StorwizeHelpers,
'get_system_info') as get_sys_info:
get_sys_info.side_effect = [
exception.VolumeBackendAPIException(data='CMMVC6071E'),
exception.VolumeBackendAPIException(data='CMMVC6071E')]
with mock.patch.object(storwize_svc_common.StorwizeSSH,
'stoprcconsistgrp') as stoprccg:
stoprccg.side_effect = exception.VolumeBackendAPIException(
data='CMMVC6071E')
self.assertRaises(exception.UnableToFailOver,
self.driver.failover_replication, self.ctxt,
group, vols, self.rep_target['backend_id'])
self.driver._active_backend_id = self.rep_target['backend_id']
self.assertRaises(exception.UnableToFailOver,
self.driver.failover_replication, self.ctxt,
group, vols, 'default')
with mock.patch.object(storwize_svc_common.StorwizeSSH,
'lsrcconsistgrp', side_effect=[None]):
self.assertRaises(exception.UnableToFailOver,
self.driver.failover_replication, self.ctxt,
group, vols, self.rep_target['backend_id'])
self.driver.delete_group(self.ctxt, group, vols)
@mock.patch.object(storwize_svc_common.StorwizeHelpers,
@ -11376,9 +11269,7 @@ class StorwizeSVCReplicationTestCase(test.TestCase):
self.driver.delete_group(self.ctxt, group, vols)
@mock.patch.object(storwize_svc_common.StorwizeSSH,
'switchrcconsistgrp')
def test_failover_replica_group_by_force_access(self, switchrcconsistgrp):
def test_failover_replica_group_by_force_access(self):
self.driver.do_setup(self.ctxt)
group = self._create_test_rccg(self.rccg_type, [self.mm_type.id])
mm_vol1, model_update = self._create_test_volume(self.mm_type)
@ -11386,13 +11277,9 @@ class StorwizeSVCReplicationTestCase(test.TestCase):
rccg_name = self.driver._get_rccg_name(group)
self.sim._rccg_state_transition('wait',
self.sim._rcconsistgrp_list[rccg_name])
switchrcconsistgrp.side_effect = [
exception.VolumeBackendAPIException(data='CMMVC6071E'),
exception.VolumeBackendAPIException(data='CMMVC6071E')]
with mock.patch.object(storwize_svc_common.StorwizeSSH,
'startrcconsistgrp') as startrcconsistgrp:
self.driver.failover_replication(self.ctxt, group, [mm_vol1], None)
switchrcconsistgrp.assert_called_once_with(rccg_name, True)
startrcconsistgrp.assert_called_once_with(rccg_name, 'aux')
with mock.patch.object(storwize_svc_common.StorwizeSSH,

View File

@ -36,8 +36,24 @@ class StorwizeSVCReplication(object):
self.driver = driver
self.target = replication_target or {}
@utils.trace
def failover_volume_host(self, context, vref):
pass
# Make the aux volume writeable.
try:
tgt_volume = storwize_const.REPLICA_AUX_VOL_PREFIX + vref.name
self.target_helpers.stop_relationship(tgt_volume, access=True)
try:
self.target_helpers.start_relationship(tgt_volume, 'aux')
except exception.VolumeBackendAPIException as e:
LOG.error('Error running startrcrelationship due to %(err)s.',
{'err': e})
return
except Exception as e:
msg = (_('Unable to fail-over the volume %(id)s to the '
'secondary back-end, error: %(error)s') %
{"id": vref['id'], "error": six.text_type(e)})
LOG.exception(msg)
raise exception.VolumeDriverException(message=msg)
def replication_failback(self, volume):
pass
@ -91,40 +107,6 @@ class StorwizeSVCReplicationGlobalMirror(StorwizeSVCReplication):
raise exception.VolumeDriverException(message=msg)
LOG.debug('leave: volume_replication_setup:volume %s', vref['name'])
def failover_volume_host(self, context, vref):
LOG.debug('enter: failover_volume_host: vref=%(vref)s',
{'vref': vref['name']})
target_vol = storwize_const.REPLICA_AUX_VOL_PREFIX + vref['name']
try:
rel_info = self.target_helpers.get_relationship_info(target_vol)
# Reverse the role of the primary and secondary volumes
self.target_helpers.switch_relationship(rel_info['name'])
return
except Exception as e:
LOG.exception('Unable to fail-over the volume %(id)s to the '
'secondary back-end by switchrcrelationship '
'command, error: %(error)s',
{"id": vref['id'], "error": e})
# If the switch command fail, try to make the aux volume
# writeable again.
try:
self.target_helpers.stop_relationship(target_vol,
access=True)
try:
self.target_helpers.start_relationship(target_vol, 'aux')
except exception.VolumeBackendAPIException as e:
LOG.error(
'Error running startrcrelationship due to %(err)s.',
{'err': e})
return
except Exception as e:
msg = (_('Unable to fail-over the volume %(id)s to the '
'secondary back-end, error: %(error)s') %
{"id": vref['id'], "error": e})
LOG.exception(msg)
raise exception.VolumeDriverException(message=msg)
def replication_failback(self, volume):
tgt_volume = storwize_const.REPLICA_AUX_VOL_PREFIX + volume['name']
rel_info = self.target_helpers.get_relationship_info(tgt_volume)
@ -249,26 +231,6 @@ class StorwizeSVCReplicationGMCV(StorwizeSVCReplicationGlobalMirror):
raise exception.VolumeDriverException(message=msg)
LOG.debug('leave: volume_replication_setup:volume %s', vref['name'])
def failover_volume_host(self, context, vref):
LOG.debug('enter: failover_volume_host: vref=%(vref)s',
{'vref': vref['name']})
# Make the aux volume writeable.
try:
tgt_volume = storwize_const.REPLICA_AUX_VOL_PREFIX + vref.name
self.target_helpers.stop_relationship(tgt_volume, access=True)
try:
self.target_helpers.start_relationship(tgt_volume, 'aux')
except exception.VolumeBackendAPIException as e:
LOG.error('Error running startrcrelationship due to %(err)s.',
{'err': e})
return
except Exception as e:
msg = (_('Unable to fail-over the volume %(id)s to the '
'secondary back-end, error: %(error)s') %
{"id": vref['id'], "error": six.text_type(e)})
LOG.exception(msg)
raise exception.VolumeDriverException(message=msg)
def replication_failback(self, volume):
LOG.debug('enter: replication_failback: volume=%(volume)s',
{'volume': volume['name']})

View File

@ -154,7 +154,7 @@ storwize_svc_opts = [
'storage during delete of the volume on primary storage '
'or moving the primary volume from mirror to non-mirror '
'with replication enabled. This option is valid for '
'SVC.'),
'Spectrum Virtualize Family.'),
]
CONF = cfg.CONF
@ -2263,12 +2263,12 @@ class StorwizeHelpers(object):
storwize_const.REPLICA_CHG_VOL_PREFIX + vol_name,
force_unmap=force_unmap,
force_delete=False)
# We want to retain the aux volume after retyping
# from mirror to non mirror storage template or
# on delete of the primary volume based on user's
# We want to retain/remove the aux volume after retyping of
# primary volume from mirror to non-mirror storage template
# or on the delete of the primary volume based on user's
# choice of config value for storwize_svc_retain_aux_volume.
# Default value is False.
if (retain_aux_volume is False and target_vol) or not target_vol:
# The default value is False.
if (not retain_aux_volume and target_vol) or not target_vol:
self.delete_vdisk(vol_name,
force_unmap=force_unmap,
force_delete=False)
@ -3573,8 +3573,8 @@ class StorwizeSVCCommonDriver(san.SanDriver,
# In this case the administrator would like to fail back.
secondary_id, volumes_update, groups_update = self._host_failback(
context, volumes, groups)
elif (secondary_id == self._replica_target['backend_id']
or secondary_id is None):
elif (secondary_id == self._replica_target['backend_id'] or
secondary_id is None):
# In this case the administrator would like to fail over.
secondary_id, volumes_update, groups_update = self._host_failover(
context, volumes, groups)
@ -4205,8 +4205,8 @@ class StorwizeSVCCommonDriver(san.SanDriver,
if storwize_const.FAILBACK_VALUE == secondary_backend_id:
# In this case the administrator would like to group fail back.
model_update = self._rep_grp_failback(context, group)
elif (secondary_backend_id == self._replica_target['backend_id']
or secondary_backend_id is None):
elif (secondary_backend_id == self._replica_target['backend_id'] or
secondary_backend_id is None):
# In this case the administrator would like to group fail over.
model_update = self._rep_grp_failover(context, group)
else:
@ -4282,74 +4282,23 @@ class StorwizeSVCCommonDriver(san.SanDriver,
raise exception.UnableToFailOver(reason=msg)
return model_update
@cinder_utils.trace
def _rep_grp_failover(self, ctxt, group):
"""Fail over all the volume in the replication group."""
model_update = {
'replication_status': fields.ReplicationStatus.FAILED_OVER}
rccg_name = self._get_rccg_name(group)
try:
self._aux_backend_helpers.get_system_info()
except Exception as ex:
msg = (_("Unable to failover group %(rccg)s due to replication "
"target is not reachable. error=%(error)s"),
{'rccg': rccg_name, 'error': ex})
LOG.error(msg)
raise exception.UnableToFailOver(reason=msg)
rccg = self._aux_backend_helpers.get_rccg(rccg_name)
if not rccg:
msg = (_("Unable to failover group %(rccg)s due to replication "
"group does not exist on backend."),
{'rccg': rccg_name})
LOG.error(msg)
raise exception.UnableToFailOver(reason=msg)
if rccg['relationship_count'] == '0':
msg = (_("Unable to failover group %(rccg)s due to it is an "
"empty group."), {'rccg': rccg['name']})
LOG.error(msg)
raise exception.UnableToFailOver(reason=msg)
if rccg['primary'] == 'aux':
LOG.info("Do not need to fail over group %(rccg)s again due to "
"primary is already aux.", {'rccg': rccg['name']})
self._aux_backend_helpers.stop_rccg(rccg_name, access=True)
self._helpers.start_rccg(rccg_name, primary='aux')
return model_update
if rccg['cycling_mode'] == 'multi':
# This is a gmcv replication group
try:
self._aux_backend_helpers.stop_rccg(rccg['name'], access=True)
self._sync_with_aux_grp(ctxt, rccg['name'])
return model_update
except exception.VolumeBackendAPIException as e:
msg = (_('Unable to fail over the group %(rccg)s to the aux '
'back-end, error: %(error)s') %
{"rccg": rccg['name'], "error": e})
LOG.exception(msg)
raise exception.UnableToFailOver(reason=msg)
else:
try:
# Reverse the role of the primary and secondary volumes
self._helpers.switch_rccg(rccg['name'], aux=True)
return model_update
except exception.VolumeBackendAPIException as e:
LOG.exception('Unable to fail over the group %(rccg)s to the '
'aux back-end by switchrcconsistgrp command, '
'error: %(error)s',
{"rccg": rccg['name'], "error": e})
# If the switch command fail, try to make the aux group
# writeable again.
try:
self._aux_backend_helpers.stop_rccg(rccg['name'],
access=True)
self._sync_with_aux_grp(ctxt, rccg['name'])
return model_update
except exception.VolumeBackendAPIException as e:
msg = (_('Unable to fail over the group %(rccg)s to the '
'aux back-end, error: %(error)s') %
{"rccg": rccg['name'], "error": e})
LOG.exception(msg)
raise exception.UnableToFailOver(reason=msg)
except exception.VolumeBackendAPIException as e:
msg = (_('Unable to fail over the group %(rccg)s to the aux '
'back-end, error: %(error)s') %
{"rccg": rccg_name, "error": e})
LOG.exception(msg)
raise exception.UnableToFailOver(reason=msg)
@cinder_utils.trace
def _sync_with_aux_grp(self, ctxt, rccg_name):

View File

@ -0,0 +1,7 @@
---
fixes:
- |
IBM Spectrum Virtualize Family `Bug #1898746
<https://bugs.launchpad.net/cinder/+bug/1898746>`_:
Fixed issue regarding host-failover and group-failover
which impacts storage back-end performance.