Browse Source

[NetApp] Fix falsely report migration cancelation success

NetApp ONTAP share delete operation can fail sometimes when is triggered
immediately after migration cancelation on a overloaded NetApp backend.
Migration cancelation invokes "abort_volume_move" which is an asynchronous
API. If share delete operation is requested immediately after call the
former API, it fails because the "abort_volume_move" is still in progress.
Now NetApp cDOT driver checks, for a period of time, if the
``volume-move-abort`` operation has ended before report migration
cancelation success.

This patch squash the following commit that improves the release note
for this fix:
[NetApp] Updating the release note for bugfix 1688620
(cherry picked from commit a0dd86a98788f7e2d1ca55be26c2a3dea4e36f57)

Change-Id: I76e11fef27c9723f019cfdfdc6ea86878db78776
Closes-Bug: #1688620
(cherry picked from commit 0ee4140823)
(cherry picked from commit 71c3a08ead)
(cherry picked from commit c915918f3c)
(cherry picked from commit 1199489d99)
(cherry picked from commit b8fc8feb07)
changes/24/744724/1
danielarthurt 6 months ago
committed by Douglas Viroel
parent
commit
826cbfa7ce
4 changed files with 103 additions and 14 deletions
  1. +29
    -0
      manila/share/drivers/netapp/dataontap/cluster_mode/lib_base.py
  2. +7
    -1
      manila/share/drivers/netapp/options.py
  3. +47
    -13
      manila/tests/share/drivers/netapp/dataontap/cluster_mode/test_lib_base.py
  4. +20
    -0
      releasenotes/notes/bug-1688620-netapp-migration-cancelation-fb913131eb8eb82a.yaml

+ 29
- 0
manila/share/drivers/netapp/dataontap/cluster_mode/lib_base.py View File

@@ -2174,6 +2174,8 @@ class NetAppCmodeFileStorageLibrary(object):
"""Abort an ongoing migration."""
vserver, vserver_client = self._get_vserver(share_server=share_server)
share_volume = self._get_backend_share_name(source_share['id'])
retries = (self.configuration.netapp_migration_cancel_timeout / 5 or
1)

try:
self._get_volume_move_status(source_share, share_server)
@@ -2183,6 +2185,33 @@ class NetAppCmodeFileStorageLibrary(object):

self._client.abort_volume_move(share_volume, vserver)

@manila_utils.retry(exception.InUse, interval=5,
retries=retries, backoff_rate=1)
def wait_for_migration_cancel_complete():
move_status = self._get_volume_move_status(source_share,
share_server)
if move_status['state'] == 'failed':
return
else:
msg = "Migration cancelation isn't finished yet."
raise exception.InUse(message=msg)

try:
wait_for_migration_cancel_complete()
except exception.InUse:
move_status = self._get_volume_move_status(source_share,
share_server)
msg_args = {
'share_move_state': move_status['state']
}
msg = _("Migration cancelation was not successful. The share "
"migration state failed while transitioning from "
"%(share_move_state)s state to 'failed'. Retries "
"exhausted.") % msg_args
raise exception.NetAppException(message=msg)
except exception.NetAppException:
LOG.exception("Could not get volume move status.")

msg = ("Share volume move operation for share %(shr)s from host "
"%(src)s to %(dest)s was successfully aborted.")
msg_args = {


+ 7
- 1
manila/share/drivers/netapp/options.py View File

@@ -138,7 +138,13 @@ netapp_data_motion_opts = [
default=3600, # One Hour,
help='The maximum time in seconds to wait for the completion '
'of a volume move operation after the cutover '
'was triggered.'), ]
'was triggered.'),
cfg.IntOpt('netapp_migration_cancel_timeout',
min=0,
default=3600, # One Hour,
help='The maximum time in seconds that migration cancel '
'waits for all migration operations be completely '
'aborted.'), ]

CONF = cfg.CONF
CONF.register_opts(netapp_proxy_opts)


+ 47
- 13
manila/tests/share/drivers/netapp/dataontap/cluster_mode/test_lib_base.py View File

@@ -4694,35 +4694,69 @@ class NetAppFileStorageLibraryTestCase(test.TestCase):
self.assertDictMatch(expected_progress, migration_progress)
mock_info_log.assert_called_once()

@ddt.data(utils.annotated('already_canceled', (True, )),
utils.annotated('not_canceled_yet', (False, )))
def test_migration_cancel(self, already_canceled):
@ddt.data({'state': 'failed'},
{'state': 'healthy'})
@ddt.unpack
def test_migration_cancel(self, state):
source_snapshots = mock.Mock()
snapshot_mappings = mock.Mock()
already_canceled = already_canceled[0]
mock_exception_log = self.mock_object(lib_base.LOG, 'exception')
self.library.configuration.netapp_migration_cancel_timeout = 15
mock_info_log = self.mock_object(lib_base.LOG, 'info')
vol_move_side_effect = (exception.NetAppException
if already_canceled else None)

self.mock_object(self.library, '_get_vserver',
mock.Mock(return_value=(fake.VSERVER1, mock.Mock())))
self.mock_object(self.library, '_get_backend_share_name',
mock.Mock(return_value=fake.SHARE_NAME))
self.mock_object(self.client, 'abort_volume_move')
self.mock_object(self.client, 'get_volume_move_status',
mock.Mock(side_effect=vol_move_side_effect))
mock.Mock(return_value={'state': state}))

retval = self.library.migration_cancel(
if state == 'failed':
retval = self.library.migration_cancel(
self.context, fake_share.fake_share_instance(),
fake_share.fake_share_instance(), source_snapshots,
snapshot_mappings, share_server=fake.SHARE_SERVER,
destination_share_server='dst_srv')

self.assertIsNone(retval)
mock_info_log.assert_called_once()
else:
self.assertRaises(
(exception.NetAppException),
self.library.migration_cancel, self.context,
fake_share.fake_share_instance(),
fake_share.fake_share_instance, source_snapshots,
snapshot_mappings, share_server=fake.SHARE_SERVER,
destination_share_server='dst_srv')

@ddt.data({'already_canceled': True, 'effect': exception.NetAppException},
{'already_canceled': False, 'effect':
(None, exception.NetAppException)})
@ddt.unpack
def test_migration_cancel_exception_volume_status(self, already_canceled,
effect):
source_snapshots = mock.Mock()
snapshot_mappings = mock.Mock()
self.library.configuration.netapp_migration_cancel_timeout = 1
mock_exception_log = self.mock_object(lib_base.LOG, 'exception')
mock_info_log = self.mock_object(lib_base.LOG, 'info')
self.mock_object(self.library, '_get_vserver',
mock.Mock(return_value=(fake.VSERVER1, mock.Mock())))
self.mock_object(self.library, '_get_backend_share_name',
mock.Mock(return_value=fake.SHARE_NAME))
self.mock_object(self.client, 'abort_volume_move')
self.mock_object(self.client, 'get_volume_move_status',
mock.Mock(side_effect=effect))
self.library.migration_cancel(
self.context, fake_share.fake_share_instance(),
fake_share.fake_share_instance(), source_snapshots,
snapshot_mappings, share_server=fake.SHARE_SERVER,
destination_share_server='dst_srv')

self.assertIsNone(retval)
if already_canceled:
mock_exception_log.assert_called_once()
else:
mock_exception_log.assert_called_once()
if not already_canceled:
mock_info_log.assert_called_once()

self.assertEqual(not already_canceled,
self.client.abort_volume_move.called)



+ 20
- 0
releasenotes/notes/bug-1688620-netapp-migration-cancelation-fb913131eb8eb82a.yaml View File

@@ -0,0 +1,20 @@
---
fixes:
- |
NetApp ONTAP share delete operation can fail sometimes when is triggered
immediately after migration cancelation on a overloaded NetApp backend.
Canceling an ongoing migration is an asynchronous operation on an
ONTAP storage system.
Now the NetApp driver checks if the asynchronous API has ended its
operation before reporting migration cancelation success. If the operation
of the asynchronous API did not end within the specified timeout, the
``migration cancel`` cancel operation will be considered unsuccessful.
To do so, a new configuration option ``netapp_migration_cancel_timeout``
has been added.
upgrade:
- |
The configuration option ``netapp_migration_cancel_timeout`` can be
specified in the NetApp backend section to redefine the amount of time
that the NetApp driver must attempt to wait on the asynchronous
operation to cancel an ongoing migration. This option is set to 3600
seconds by default, which is sufficient time in most cases.

Loading…
Cancel
Save