From 1199489d998a729d7fcf7cf7f88ab34b6464d46b Mon Sep 17 00:00:00 2001 From: danielarthurt Date: Mon, 6 Apr 2020 13:26:13 +0000 Subject: [PATCH] [NetApp] Fix falsely report migration cancelation success NetApp ONTAP share delete operation can fail sometimes when is triggered immediately after migration cancelation on a overloaded NetApp backend. Migration cancelation invokes "abort_volume_move" which is an asynchronous API. If share delete operation is requested immediately after call the former API, it fails because the "abort_volume_move" is still in progress. Now NetApp cDOT driver checks, for a period of time, if the ``volume-move-abort`` operation has ended before report migration cancelation success. This patch squash the following commit that improves the release note for this fix: [NetApp] Updating the release note for bugfix 1688620 (cherry picked from commit a0dd86a98788f7e2d1ca55be26c2a3dea4e36f57) Change-Id: I76e11fef27c9723f019cfdfdc6ea86878db78776 Closes-Bug: #1688620 (cherry picked from commit 0ee414082318d22b3ed19acad6a479cb105c30e5) (cherry picked from commit 71c3a08ead50e8aa68cef3a26c8f8485922895b6) (cherry picked from commit c915918f3c6ac608469263dd05e3f2b5eb61dc4e) --- .../netapp/dataontap/cluster_mode/lib_base.py | 29 +++++++++ manila/share/drivers/netapp/options.py | 8 ++- .../dataontap/cluster_mode/test_lib_base.py | 60 +++++++++++++++---- ...igration-cancelation-fb913131eb8eb82a.yaml | 20 +++++++ 4 files changed, 103 insertions(+), 14 deletions(-) create mode 100644 releasenotes/notes/bug-1688620-netapp-migration-cancelation-fb913131eb8eb82a.yaml diff --git a/manila/share/drivers/netapp/dataontap/cluster_mode/lib_base.py b/manila/share/drivers/netapp/dataontap/cluster_mode/lib_base.py index d0e0edf263..57320b5a23 100644 --- a/manila/share/drivers/netapp/dataontap/cluster_mode/lib_base.py +++ b/manila/share/drivers/netapp/dataontap/cluster_mode/lib_base.py @@ -2204,6 +2204,8 @@ class NetAppCmodeFileStorageLibrary(object): """Abort an ongoing migration.""" vserver, vserver_client = self._get_vserver(share_server=share_server) share_volume = self._get_backend_share_name(source_share['id']) + retries = (self.configuration.netapp_migration_cancel_timeout / 5 or + 1) try: self._get_volume_move_status(source_share, share_server) @@ -2213,6 +2215,33 @@ class NetAppCmodeFileStorageLibrary(object): self._client.abort_volume_move(share_volume, vserver) + @manila_utils.retry(exception.InUse, interval=5, + retries=retries, backoff_rate=1) + def wait_for_migration_cancel_complete(): + move_status = self._get_volume_move_status(source_share, + share_server) + if move_status['state'] == 'failed': + return + else: + msg = "Migration cancelation isn't finished yet." + raise exception.InUse(message=msg) + + try: + wait_for_migration_cancel_complete() + except exception.InUse: + move_status = self._get_volume_move_status(source_share, + share_server) + msg_args = { + 'share_move_state': move_status['state'] + } + msg = _("Migration cancelation was not successful. The share " + "migration state failed while transitioning from " + "%(share_move_state)s state to 'failed'. Retries " + "exhausted.") % msg_args + raise exception.NetAppException(message=msg) + except exception.NetAppException: + LOG.exception("Could not get volume move status.") + msg = ("Share volume move operation for share %(shr)s from host " "%(src)s to %(dest)s was successfully aborted.") msg_args = { diff --git a/manila/share/drivers/netapp/options.py b/manila/share/drivers/netapp/options.py index 534b32b26a..6173246ee0 100644 --- a/manila/share/drivers/netapp/options.py +++ b/manila/share/drivers/netapp/options.py @@ -150,7 +150,13 @@ netapp_data_motion_opts = [ default=3600, # One Hour, help='The maximum time in seconds to wait for the completion ' 'of a volume move operation after the cutover ' - 'was triggered.'), ] + 'was triggered.'), + cfg.IntOpt('netapp_migration_cancel_timeout', + min=0, + default=3600, # One Hour, + help='The maximum time in seconds that migration cancel ' + 'waits for all migration operations be completely ' + 'aborted.'), ] CONF = cfg.CONF CONF.register_opts(netapp_proxy_opts) diff --git a/manila/tests/share/drivers/netapp/dataontap/cluster_mode/test_lib_base.py b/manila/tests/share/drivers/netapp/dataontap/cluster_mode/test_lib_base.py index cdcb19ea39..520a4618b3 100644 --- a/manila/tests/share/drivers/netapp/dataontap/cluster_mode/test_lib_base.py +++ b/manila/tests/share/drivers/netapp/dataontap/cluster_mode/test_lib_base.py @@ -4725,35 +4725,69 @@ class NetAppFileStorageLibraryTestCase(test.TestCase): self.assertDictMatch(expected_progress, migration_progress) mock_info_log.assert_called_once() - @ddt.data(utils.annotated('already_canceled', (True, )), - utils.annotated('not_canceled_yet', (False, ))) - def test_migration_cancel(self, already_canceled): + @ddt.data({'state': 'failed'}, + {'state': 'healthy'}) + @ddt.unpack + def test_migration_cancel(self, state): source_snapshots = mock.Mock() snapshot_mappings = mock.Mock() - already_canceled = already_canceled[0] - mock_exception_log = self.mock_object(lib_base.LOG, 'exception') + self.library.configuration.netapp_migration_cancel_timeout = 15 mock_info_log = self.mock_object(lib_base.LOG, 'info') - vol_move_side_effect = (exception.NetAppException - if already_canceled else None) + self.mock_object(self.library, '_get_vserver', mock.Mock(return_value=(fake.VSERVER1, mock.Mock()))) self.mock_object(self.library, '_get_backend_share_name', mock.Mock(return_value=fake.SHARE_NAME)) self.mock_object(self.client, 'abort_volume_move') self.mock_object(self.client, 'get_volume_move_status', - mock.Mock(side_effect=vol_move_side_effect)) + mock.Mock(return_value={'state': state})) - retval = self.library.migration_cancel( + if state == 'failed': + retval = self.library.migration_cancel( + self.context, fake_share.fake_share_instance(), + fake_share.fake_share_instance(), source_snapshots, + snapshot_mappings, share_server=fake.SHARE_SERVER, + destination_share_server='dst_srv') + + self.assertIsNone(retval) + mock_info_log.assert_called_once() + else: + self.assertRaises( + (exception.NetAppException), + self.library.migration_cancel, self.context, + fake_share.fake_share_instance(), + fake_share.fake_share_instance, source_snapshots, + snapshot_mappings, share_server=fake.SHARE_SERVER, + destination_share_server='dst_srv') + + @ddt.data({'already_canceled': True, 'effect': exception.NetAppException}, + {'already_canceled': False, 'effect': + (None, exception.NetAppException)}) + @ddt.unpack + def test_migration_cancel_exception_volume_status(self, already_canceled, + effect): + source_snapshots = mock.Mock() + snapshot_mappings = mock.Mock() + self.library.configuration.netapp_migration_cancel_timeout = 1 + mock_exception_log = self.mock_object(lib_base.LOG, 'exception') + mock_info_log = self.mock_object(lib_base.LOG, 'info') + self.mock_object(self.library, '_get_vserver', + mock.Mock(return_value=(fake.VSERVER1, mock.Mock()))) + self.mock_object(self.library, '_get_backend_share_name', + mock.Mock(return_value=fake.SHARE_NAME)) + self.mock_object(self.client, 'abort_volume_move') + self.mock_object(self.client, 'get_volume_move_status', + mock.Mock(side_effect=effect)) + self.library.migration_cancel( self.context, fake_share.fake_share_instance(), fake_share.fake_share_instance(), source_snapshots, snapshot_mappings, share_server=fake.SHARE_SERVER, destination_share_server='dst_srv') - self.assertIsNone(retval) - if already_canceled: - mock_exception_log.assert_called_once() - else: + mock_exception_log.assert_called_once() + if not already_canceled: mock_info_log.assert_called_once() + self.assertEqual(not already_canceled, self.client.abort_volume_move.called) diff --git a/releasenotes/notes/bug-1688620-netapp-migration-cancelation-fb913131eb8eb82a.yaml b/releasenotes/notes/bug-1688620-netapp-migration-cancelation-fb913131eb8eb82a.yaml new file mode 100644 index 0000000000..4a7002630c --- /dev/null +++ b/releasenotes/notes/bug-1688620-netapp-migration-cancelation-fb913131eb8eb82a.yaml @@ -0,0 +1,20 @@ +--- +fixes: + - | + NetApp ONTAP share delete operation can fail sometimes when is triggered + immediately after migration cancelation on a overloaded NetApp backend. + Canceling an ongoing migration is an asynchronous operation on an + ONTAP storage system. + Now the NetApp driver checks if the asynchronous API has ended its + operation before reporting migration cancelation success. If the operation + of the asynchronous API did not end within the specified timeout, the + ``migration cancel`` cancel operation will be considered unsuccessful. + To do so, a new configuration option ``netapp_migration_cancel_timeout`` + has been added. +upgrade: + - | + The configuration option ``netapp_migration_cancel_timeout`` can be + specified in the NetApp backend section to redefine the amount of time + that the NetApp driver must attempt to wait on the asynchronous + operation to cancel an ongoing migration. This option is set to 3600 + seconds by default, which is sufficient time in most cases.