From d56b47d339957383b015b2b02f41c57c4f8bcad5 Mon Sep 17 00:00:00 2001 From: venkatakrishnathumu Date: Fri, 9 Oct 2020 11:23:32 +0000 Subject: [PATCH] [SVF]: Fixed host and group failover issues [Spectrum Virtualize Family] Fixed issue regarding host-failover and group-failover that impacts IBM Spectrum Virtualize Family storage back-end performance and applies to mirror replication types such as metro, global and GMCV volume replication types. closes bug: #1898746 Change-Id: I34f764aaebe9166110eb91cf2ac3b41f173d2a68 --- .../volume/drivers/ibm/test_storwize_svc.py | 183 ++++-------------- .../drivers/ibm/storwize_svc/replication.py | 72 ++----- .../ibm/storwize_svc/storwize_svc_common.py | 91 ++------- ...-switch-relationship-9d3c58822a8c918c.yaml | 7 + 4 files changed, 79 insertions(+), 274 deletions(-) create mode 100644 releasenotes/notes/bug-1898746-ibm-svf-fix-host-failover-switch-relationship-9d3c58822a8c918c.yaml diff --git a/cinder/tests/unit/volume/drivers/ibm/test_storwize_svc.py b/cinder/tests/unit/volume/drivers/ibm/test_storwize_svc.py index 89fb42ff87e..d2ea15271f8 100644 --- a/cinder/tests/unit/volume/drivers/ibm/test_storwize_svc.py +++ b/cinder/tests/unit/volume/drivers/ibm/test_storwize_svc.py @@ -2515,7 +2515,7 @@ port_speed!N/A except KeyError: return self._errors['CMMVC5753E'] - function = 'stop_access' if force_access else 'stop' + function = 'stop' self._rccg_state_transition(function, rccg) for rcrel_info in self._rcrelationship_list.values(): if rcrel_info['consistency_group_name'] == rccg['name']: @@ -10021,133 +10021,46 @@ class StorwizeSVCReplicationTestCase(test.TestCase): get_relationship_info.assert_called_once_with(fake_name) delete_relationship.assert_called_once_with(fake_name) + @ddt.data((True, True, 1), (False, True, 2), + (True, False, 2), (False, False, 2)) @mock.patch.object(storwize_svc_common.StorwizeHelpers, 'delete_vdisk') @mock.patch.object(storwize_svc_common.StorwizeHelpers, 'delete_relationship') @mock.patch.object(storwize_svc_common.StorwizeHelpers, 'get_relationship_info') - def test_retain_target_volume(self, get_relationship_info, - delete_relationship, - delete_vdisk): + @ddt.unpack + def test_retain_target_volume(self, target_volume, retain_aux_vol, + call_count, get_relationship_info, + delete_relationship, delete_vdisk): # Set replication target. - self.driver.configuration.set_override('replication_device', [self.rep_target]) self.driver.do_setup(self.ctxt) fake_name = 'volume-%s' % fake.VOLUME_ID - target_volume_fake_name = ( - storwize_const.REPLICA_AUX_VOL_PREFIX + fake_name) - target_change_fake_name = ( - storwize_const.REPLICA_CHG_VOL_PREFIX + target_volume_fake_name) get_relationship_info.return_value = {'aux_vdisk_name': fake_name} self.driver._helpers.delete_rc_volume(fake_name, - target_vol=True, - retain_aux_volume=True) - get_relationship_info.assert_called_once_with(target_volume_fake_name) - delete_relationship.assert_called_once_with(target_volume_fake_name) + target_vol=target_volume, + retain_aux_volume=retain_aux_vol) - calls = [mock.call(target_change_fake_name, force_delete=False, + vol_name = fake_name + change_vol_name = (storwize_const.REPLICA_CHG_VOL_PREFIX + vol_name) + if target_volume: + vol_name = (storwize_const.REPLICA_AUX_VOL_PREFIX + fake_name) + change_vol_name = ( + storwize_const.REPLICA_CHG_VOL_PREFIX + vol_name) + + get_relationship_info.assert_called_once_with(vol_name) + delete_relationship.assert_called_once_with(vol_name) + + calls = [mock.call(change_vol_name, force_delete=False, force_unmap=True)] + if (target_volume and not retain_aux_vol) or not target_volume: + calls.extend([mock.call(vol_name, force_delete=False, + force_unmap=True)]) delete_vdisk.assert_has_calls(calls, any_order=True) - self.assertEqual(1, delete_vdisk.call_count) - - @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'delete_vdisk') - @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'delete_relationship') - @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'get_relationship_info') - def test_retain_target_volume_invalid_parameters_1( - self, get_relationship_info, - delete_relationship, - delete_vdisk): - # Set replication target. - - self.driver.configuration.set_override('replication_device', - [self.rep_target]) - self.driver.do_setup(self.ctxt) - fake_name = 'volume-%s' % fake.VOLUME_ID - master_change_fake_name = ( - storwize_const.REPLICA_CHG_VOL_PREFIX + fake_name) - get_relationship_info.return_value = {'aux_vdisk_name': - fake_name} - self.driver._helpers.delete_rc_volume(fake_name, - target_vol=False, - retain_aux_volume=True) - get_relationship_info.assert_called_once_with(fake_name) - delete_relationship.assert_called_once_with(fake_name) - calls = [mock.call(master_change_fake_name, force_delete=False, - force_unmap=True), - mock.call(fake_name, force_delete=False, force_unmap=True)] - delete_vdisk.assert_has_calls(calls, any_order=True) - self.assertEqual(2, delete_vdisk.call_count) - - @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'delete_vdisk') - @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'delete_relationship') - @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'get_relationship_info') - def test_retain_target_volume_invalid_parameters_2( - self, get_relationship_info, - delete_relationship, - delete_vdisk): - # Set replication target. - - self.driver.configuration.set_override('replication_device', - [self.rep_target]) - self.driver.do_setup(self.ctxt) - fake_name = 'volume-%s' % fake.VOLUME_ID - target_volume_fake_name = ( - storwize_const.REPLICA_AUX_VOL_PREFIX + fake_name) - target_change_fake_name = ( - storwize_const.REPLICA_CHG_VOL_PREFIX + target_volume_fake_name) - get_relationship_info.return_value = {'aux_vdisk_name': - fake_name} - self.driver._helpers.delete_rc_volume(fake_name, - target_vol=True, - retain_aux_volume=False) - get_relationship_info.assert_called_once_with(target_volume_fake_name) - delete_relationship.assert_called_once_with(target_volume_fake_name) - calls = [mock.call(target_change_fake_name, force_delete=False, - force_unmap=True), - mock.call(target_volume_fake_name, force_delete=False, - force_unmap=True)] - delete_vdisk.assert_has_calls(calls, any_order=True) - self.assertEqual(2, delete_vdisk.call_count) - - @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'delete_vdisk') - @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'delete_relationship') - @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'get_relationship_info') - def test_retain_target_volume_invalid_parameters_3( - self, get_relationship_info, - delete_relationship, - delete_vdisk): - # Set replication target. - - self.driver.configuration.set_override('replication_device', - [self.rep_target]) - self.driver.do_setup(self.ctxt) - fake_name = 'volume-%s' % fake.VOLUME_ID - master_change_fake_name = ( - storwize_const.REPLICA_CHG_VOL_PREFIX + fake_name) - get_relationship_info.return_value = {'aux_vdisk_name': - fake_name} - self.driver._helpers.delete_rc_volume(fake_name, - target_vol=False, - retain_aux_volume=False) - get_relationship_info.assert_called_once_with(fake_name) - delete_relationship.assert_called_once_with(fake_name) - calls = [mock.call(master_change_fake_name, force_delete=False, - force_unmap=True), - mock.call(fake_name, force_delete=False, force_unmap=True)] - delete_vdisk.assert_has_calls(calls, any_order=True) - self.assertEqual(2, delete_vdisk.call_count) + self.assertEqual(call_count, delete_vdisk.call_count) def test_storwize_failover_host_backend_error(self): self.driver.configuration.set_override('replication_device', @@ -10445,26 +10358,20 @@ class StorwizeSVCReplicationTestCase(test.TestCase): self.driver.delete_volume(non_replica_vol) @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'switch_relationship') + 'start_relationship') @mock.patch.object(storwize_svc_common.StorwizeHelpers, 'stop_relationship') - @mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'get_relationship_info') - def test_failover_host_by_force_access(self, get_relationship_info, - stop_relationship, - switch_relationship): + def test_failover_host_by_force_access(self, stop_relationship, + start_relationship): replica_obj = self.driver._get_replica_obj(storwize_const.METRO) mm_vol, model_update = self._create_test_volume(self.mm_type) target_vol = storwize_const.REPLICA_AUX_VOL_PREFIX + mm_vol.name context = mock.Mock - get_relationship_info.side_effect = [{ - 'aux_vdisk_name': 'replica-12345678-1234-5678-1234-567812345678', - 'name': 'RC_name'}] - switch_relationship.side_effect = exception.VolumeDriverException replica_obj.failover_volume_host(context, mm_vol) - get_relationship_info.assert_called_once_with(target_vol) - switch_relationship.assert_called_once_with('RC_name') stop_relationship.assert_called_once_with(target_vol, access=True) + calls = [mock.call(mm_vol.name), mock.call(target_vol, 'aux')] + start_relationship.assert_has_calls(calls, any_order=True) + self.assertEqual(2, start_relationship.call_count) @mock.patch.object(storwize_svc_common.StorwizeSVCCommonDriver, '_update_volume_stats') @@ -11259,32 +11166,18 @@ class StorwizeSVCReplicationTestCase(test.TestCase): self.driver.failover_replication, self.ctxt, group, vols, self.fake_target['backend_id']) - self.assertRaises(exception.UnableToFailOver, - self.driver.failover_replication, self.ctxt, group, - vols, self.rep_target['backend_id']) - self.assertRaises(exception.UnableToFailOver, self.driver.failover_replication, self.ctxt, group, vols, storwize_const.FAILBACK_VALUE) - with mock.patch.object(storwize_svc_common.StorwizeHelpers, - 'get_system_info') as get_sys_info: - get_sys_info.side_effect = [ - exception.VolumeBackendAPIException(data='CMMVC6071E'), - exception.VolumeBackendAPIException(data='CMMVC6071E')] + with mock.patch.object(storwize_svc_common.StorwizeSSH, + 'stoprcconsistgrp') as stoprccg: + stoprccg.side_effect = exception.VolumeBackendAPIException( + data='CMMVC6071E') self.assertRaises(exception.UnableToFailOver, self.driver.failover_replication, self.ctxt, group, vols, self.rep_target['backend_id']) - self.driver._active_backend_id = self.rep_target['backend_id'] - self.assertRaises(exception.UnableToFailOver, - self.driver.failover_replication, self.ctxt, - group, vols, 'default') - with mock.patch.object(storwize_svc_common.StorwizeSSH, - 'lsrcconsistgrp', side_effect=[None]): - self.assertRaises(exception.UnableToFailOver, - self.driver.failover_replication, self.ctxt, - group, vols, self.rep_target['backend_id']) self.driver.delete_group(self.ctxt, group, vols) @mock.patch.object(storwize_svc_common.StorwizeHelpers, @@ -11376,9 +11269,7 @@ class StorwizeSVCReplicationTestCase(test.TestCase): self.driver.delete_group(self.ctxt, group, vols) - @mock.patch.object(storwize_svc_common.StorwizeSSH, - 'switchrcconsistgrp') - def test_failover_replica_group_by_force_access(self, switchrcconsistgrp): + def test_failover_replica_group_by_force_access(self): self.driver.do_setup(self.ctxt) group = self._create_test_rccg(self.rccg_type, [self.mm_type.id]) mm_vol1, model_update = self._create_test_volume(self.mm_type) @@ -11386,13 +11277,9 @@ class StorwizeSVCReplicationTestCase(test.TestCase): rccg_name = self.driver._get_rccg_name(group) self.sim._rccg_state_transition('wait', self.sim._rcconsistgrp_list[rccg_name]) - switchrcconsistgrp.side_effect = [ - exception.VolumeBackendAPIException(data='CMMVC6071E'), - exception.VolumeBackendAPIException(data='CMMVC6071E')] with mock.patch.object(storwize_svc_common.StorwizeSSH, 'startrcconsistgrp') as startrcconsistgrp: self.driver.failover_replication(self.ctxt, group, [mm_vol1], None) - switchrcconsistgrp.assert_called_once_with(rccg_name, True) startrcconsistgrp.assert_called_once_with(rccg_name, 'aux') with mock.patch.object(storwize_svc_common.StorwizeSSH, diff --git a/cinder/volume/drivers/ibm/storwize_svc/replication.py b/cinder/volume/drivers/ibm/storwize_svc/replication.py index be672fbe7b1..54ae29c7f88 100644 --- a/cinder/volume/drivers/ibm/storwize_svc/replication.py +++ b/cinder/volume/drivers/ibm/storwize_svc/replication.py @@ -36,8 +36,24 @@ class StorwizeSVCReplication(object): self.driver = driver self.target = replication_target or {} + @utils.trace def failover_volume_host(self, context, vref): - pass + # Make the aux volume writeable. + try: + tgt_volume = storwize_const.REPLICA_AUX_VOL_PREFIX + vref.name + self.target_helpers.stop_relationship(tgt_volume, access=True) + try: + self.target_helpers.start_relationship(tgt_volume, 'aux') + except exception.VolumeBackendAPIException as e: + LOG.error('Error running startrcrelationship due to %(err)s.', + {'err': e}) + return + except Exception as e: + msg = (_('Unable to fail-over the volume %(id)s to the ' + 'secondary back-end, error: %(error)s') % + {"id": vref['id'], "error": six.text_type(e)}) + LOG.exception(msg) + raise exception.VolumeDriverException(message=msg) def replication_failback(self, volume): pass @@ -91,40 +107,6 @@ class StorwizeSVCReplicationGlobalMirror(StorwizeSVCReplication): raise exception.VolumeDriverException(message=msg) LOG.debug('leave: volume_replication_setup:volume %s', vref['name']) - def failover_volume_host(self, context, vref): - LOG.debug('enter: failover_volume_host: vref=%(vref)s', - {'vref': vref['name']}) - target_vol = storwize_const.REPLICA_AUX_VOL_PREFIX + vref['name'] - - try: - rel_info = self.target_helpers.get_relationship_info(target_vol) - # Reverse the role of the primary and secondary volumes - self.target_helpers.switch_relationship(rel_info['name']) - return - except Exception as e: - LOG.exception('Unable to fail-over the volume %(id)s to the ' - 'secondary back-end by switchrcrelationship ' - 'command, error: %(error)s', - {"id": vref['id'], "error": e}) - # If the switch command fail, try to make the aux volume - # writeable again. - try: - self.target_helpers.stop_relationship(target_vol, - access=True) - try: - self.target_helpers.start_relationship(target_vol, 'aux') - except exception.VolumeBackendAPIException as e: - LOG.error( - 'Error running startrcrelationship due to %(err)s.', - {'err': e}) - return - except Exception as e: - msg = (_('Unable to fail-over the volume %(id)s to the ' - 'secondary back-end, error: %(error)s') % - {"id": vref['id'], "error": e}) - LOG.exception(msg) - raise exception.VolumeDriverException(message=msg) - def replication_failback(self, volume): tgt_volume = storwize_const.REPLICA_AUX_VOL_PREFIX + volume['name'] rel_info = self.target_helpers.get_relationship_info(tgt_volume) @@ -249,26 +231,6 @@ class StorwizeSVCReplicationGMCV(StorwizeSVCReplicationGlobalMirror): raise exception.VolumeDriverException(message=msg) LOG.debug('leave: volume_replication_setup:volume %s', vref['name']) - def failover_volume_host(self, context, vref): - LOG.debug('enter: failover_volume_host: vref=%(vref)s', - {'vref': vref['name']}) - # Make the aux volume writeable. - try: - tgt_volume = storwize_const.REPLICA_AUX_VOL_PREFIX + vref.name - self.target_helpers.stop_relationship(tgt_volume, access=True) - try: - self.target_helpers.start_relationship(tgt_volume, 'aux') - except exception.VolumeBackendAPIException as e: - LOG.error('Error running startrcrelationship due to %(err)s.', - {'err': e}) - return - except Exception as e: - msg = (_('Unable to fail-over the volume %(id)s to the ' - 'secondary back-end, error: %(error)s') % - {"id": vref['id'], "error": six.text_type(e)}) - LOG.exception(msg) - raise exception.VolumeDriverException(message=msg) - def replication_failback(self, volume): LOG.debug('enter: replication_failback: volume=%(volume)s', {'volume': volume['name']}) diff --git a/cinder/volume/drivers/ibm/storwize_svc/storwize_svc_common.py b/cinder/volume/drivers/ibm/storwize_svc/storwize_svc_common.py index 893ac336f3e..1094270b335 100644 --- a/cinder/volume/drivers/ibm/storwize_svc/storwize_svc_common.py +++ b/cinder/volume/drivers/ibm/storwize_svc/storwize_svc_common.py @@ -154,7 +154,7 @@ storwize_svc_opts = [ 'storage during delete of the volume on primary storage ' 'or moving the primary volume from mirror to non-mirror ' 'with replication enabled. This option is valid for ' - 'SVC.'), + 'Spectrum Virtualize Family.'), ] CONF = cfg.CONF @@ -2263,12 +2263,12 @@ class StorwizeHelpers(object): storwize_const.REPLICA_CHG_VOL_PREFIX + vol_name, force_unmap=force_unmap, force_delete=False) - # We want to retain the aux volume after retyping - # from mirror to non mirror storage template or - # on delete of the primary volume based on user's + # We want to retain/remove the aux volume after retyping of + # primary volume from mirror to non-mirror storage template + # or on the delete of the primary volume based on user's # choice of config value for storwize_svc_retain_aux_volume. - # Default value is False. - if (retain_aux_volume is False and target_vol) or not target_vol: + # The default value is False. + if (not retain_aux_volume and target_vol) or not target_vol: self.delete_vdisk(vol_name, force_unmap=force_unmap, force_delete=False) @@ -3573,8 +3573,8 @@ class StorwizeSVCCommonDriver(san.SanDriver, # In this case the administrator would like to fail back. secondary_id, volumes_update, groups_update = self._host_failback( context, volumes, groups) - elif (secondary_id == self._replica_target['backend_id'] - or secondary_id is None): + elif (secondary_id == self._replica_target['backend_id'] or + secondary_id is None): # In this case the administrator would like to fail over. secondary_id, volumes_update, groups_update = self._host_failover( context, volumes, groups) @@ -4205,8 +4205,8 @@ class StorwizeSVCCommonDriver(san.SanDriver, if storwize_const.FAILBACK_VALUE == secondary_backend_id: # In this case the administrator would like to group fail back. model_update = self._rep_grp_failback(context, group) - elif (secondary_backend_id == self._replica_target['backend_id'] - or secondary_backend_id is None): + elif (secondary_backend_id == self._replica_target['backend_id'] or + secondary_backend_id is None): # In this case the administrator would like to group fail over. model_update = self._rep_grp_failover(context, group) else: @@ -4282,74 +4282,23 @@ class StorwizeSVCCommonDriver(san.SanDriver, raise exception.UnableToFailOver(reason=msg) return model_update + @cinder_utils.trace def _rep_grp_failover(self, ctxt, group): """Fail over all the volume in the replication group.""" model_update = { 'replication_status': fields.ReplicationStatus.FAILED_OVER} rccg_name = self._get_rccg_name(group) + try: - self._aux_backend_helpers.get_system_info() - except Exception as ex: - msg = (_("Unable to failover group %(rccg)s due to replication " - "target is not reachable. error=%(error)s"), - {'rccg': rccg_name, 'error': ex}) - LOG.error(msg) - raise exception.UnableToFailOver(reason=msg) - - rccg = self._aux_backend_helpers.get_rccg(rccg_name) - if not rccg: - msg = (_("Unable to failover group %(rccg)s due to replication " - "group does not exist on backend."), - {'rccg': rccg_name}) - LOG.error(msg) - raise exception.UnableToFailOver(reason=msg) - - if rccg['relationship_count'] == '0': - msg = (_("Unable to failover group %(rccg)s due to it is an " - "empty group."), {'rccg': rccg['name']}) - LOG.error(msg) - raise exception.UnableToFailOver(reason=msg) - - if rccg['primary'] == 'aux': - LOG.info("Do not need to fail over group %(rccg)s again due to " - "primary is already aux.", {'rccg': rccg['name']}) + self._aux_backend_helpers.stop_rccg(rccg_name, access=True) + self._helpers.start_rccg(rccg_name, primary='aux') return model_update - - if rccg['cycling_mode'] == 'multi': - # This is a gmcv replication group - try: - self._aux_backend_helpers.stop_rccg(rccg['name'], access=True) - self._sync_with_aux_grp(ctxt, rccg['name']) - return model_update - except exception.VolumeBackendAPIException as e: - msg = (_('Unable to fail over the group %(rccg)s to the aux ' - 'back-end, error: %(error)s') % - {"rccg": rccg['name'], "error": e}) - LOG.exception(msg) - raise exception.UnableToFailOver(reason=msg) - else: - try: - # Reverse the role of the primary and secondary volumes - self._helpers.switch_rccg(rccg['name'], aux=True) - return model_update - except exception.VolumeBackendAPIException as e: - LOG.exception('Unable to fail over the group %(rccg)s to the ' - 'aux back-end by switchrcconsistgrp command, ' - 'error: %(error)s', - {"rccg": rccg['name'], "error": e}) - # If the switch command fail, try to make the aux group - # writeable again. - try: - self._aux_backend_helpers.stop_rccg(rccg['name'], - access=True) - self._sync_with_aux_grp(ctxt, rccg['name']) - return model_update - except exception.VolumeBackendAPIException as e: - msg = (_('Unable to fail over the group %(rccg)s to the ' - 'aux back-end, error: %(error)s') % - {"rccg": rccg['name'], "error": e}) - LOG.exception(msg) - raise exception.UnableToFailOver(reason=msg) + except exception.VolumeBackendAPIException as e: + msg = (_('Unable to fail over the group %(rccg)s to the aux ' + 'back-end, error: %(error)s') % + {"rccg": rccg_name, "error": e}) + LOG.exception(msg) + raise exception.UnableToFailOver(reason=msg) @cinder_utils.trace def _sync_with_aux_grp(self, ctxt, rccg_name): diff --git a/releasenotes/notes/bug-1898746-ibm-svf-fix-host-failover-switch-relationship-9d3c58822a8c918c.yaml b/releasenotes/notes/bug-1898746-ibm-svf-fix-host-failover-switch-relationship-9d3c58822a8c918c.yaml new file mode 100644 index 00000000000..1c48806b7c7 --- /dev/null +++ b/releasenotes/notes/bug-1898746-ibm-svf-fix-host-failover-switch-relationship-9d3c58822a8c918c.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + IBM Spectrum Virtualize Family `Bug #1898746 + `_: + Fixed issue regarding host-failover and group-failover + which impacts storage back-end performance.