[NetApp] Fix falsely report migration cancelation success

NetApp ONTAP share delete operation can fail sometimes when is triggered
immediately after migration cancelation on a overloaded NetApp backend.
Migration cancelation invokes "abort_volume_move" which is an asynchronous
API. If share delete operation is requested immediately after call the
former API, it fails because the "abort_volume_move" is still in progress.
Now NetApp cDOT driver checks, for a period of time, if the
``volume-move-abort`` operation has ended before report migration
cancelation success.

Change-Id: I76e11fef27c9723f019cfdfdc6ea86878db78776
Closes-Bug: #1688620
This commit is contained in:
danielarthurt 2020-04-06 13:26:13 +00:00
parent 67fe72f563
commit 0ee4140823
4 changed files with 91 additions and 14 deletions

View File

@ -2512,6 +2512,8 @@ class NetAppCmodeFileStorageLibrary(object):
"""Abort an ongoing migration."""
vserver, vserver_client = self._get_vserver(share_server=share_server)
share_volume = self._get_backend_share_name(source_share['id'])
retries = (self.configuration.netapp_migration_cancel_timeout / 5 or
1)
try:
self._get_volume_move_status(source_share, share_server)
@ -2521,6 +2523,33 @@ class NetAppCmodeFileStorageLibrary(object):
self._client.abort_volume_move(share_volume, vserver)
@manila_utils.retry(exception.InUse, interval=5,
retries=retries, backoff_rate=1)
def wait_for_migration_cancel_complete():
move_status = self._get_volume_move_status(source_share,
share_server)
if move_status['state'] == 'failed':
return
else:
msg = "Migration cancelation isn't finished yet."
raise exception.InUse(message=msg)
try:
wait_for_migration_cancel_complete()
except exception.InUse:
move_status = self._get_volume_move_status(source_share,
share_server)
msg_args = {
'share_move_state': move_status['state']
}
msg = _("Migration cancelation was not successful. The share "
"migration state failed while transitioning from "
"%(share_move_state)s state to 'failed'. Retries "
"exhausted.") % msg_args
raise exception.NetAppException(message=msg)
except exception.NetAppException:
LOG.exception("Could not get volume move status.")
msg = ("Share volume move operation for share %(shr)s from host "
"%(src)s to %(dest)s was successfully aborted.")
msg_args = {

View File

@ -156,7 +156,13 @@ netapp_data_motion_opts = [
default=3600, # One Hour,
help='The maximum time in seconds to wait for the completion '
'of a volume clone split operation in order to start a '
'volume move.'), ]
'volume move.'),
cfg.IntOpt('netapp_migration_cancel_timeout',
min=0,
default=3600, # One Hour,
help='The maximum time in seconds that migration cancel '
'waits for all migration operations be completely '
'aborted.'), ]
CONF = cfg.CONF
CONF.register_opts(netapp_proxy_opts)

View File

@ -5210,24 +5210,24 @@ class NetAppFileStorageLibraryTestCase(test.TestCase):
self.assertDictMatch(expected_progress, migration_progress)
mock_info_log.assert_called_once()
@ddt.data(utils.annotated('already_canceled', (True, )),
utils.annotated('not_canceled_yet', (False, )))
def test_migration_cancel(self, already_canceled):
@ddt.data({'state': 'failed'},
{'state': 'healthy'})
@ddt.unpack
def test_migration_cancel(self, state):
source_snapshots = mock.Mock()
snapshot_mappings = mock.Mock()
already_canceled = already_canceled[0]
mock_exception_log = self.mock_object(lib_base.LOG, 'exception')
self.library.configuration.netapp_migration_cancel_timeout = 15
mock_info_log = self.mock_object(lib_base.LOG, 'info')
vol_move_side_effect = (exception.NetAppException
if already_canceled else None)
self.mock_object(self.library, '_get_vserver',
mock.Mock(return_value=(fake.VSERVER1, mock.Mock())))
self.mock_object(self.library, '_get_backend_share_name',
mock.Mock(return_value=fake.SHARE_NAME))
self.mock_object(self.client, 'abort_volume_move')
self.mock_object(self.client, 'get_volume_move_status',
mock.Mock(side_effect=vol_move_side_effect))
mock.Mock(return_value={'state': state}))
if state == 'failed':
retval = self.library.migration_cancel(
self.context, fake_share.fake_share_instance(),
fake_share.fake_share_instance(), source_snapshots,
@ -5235,10 +5235,44 @@ class NetAppFileStorageLibraryTestCase(test.TestCase):
destination_share_server='dst_srv')
self.assertIsNone(retval)
if already_canceled:
mock_exception_log.assert_called_once()
else:
mock_info_log.assert_called_once()
else:
self.assertRaises(
(exception.NetAppException),
self.library.migration_cancel, self.context,
fake_share.fake_share_instance(),
fake_share.fake_share_instance, source_snapshots,
snapshot_mappings, share_server=fake.SHARE_SERVER,
destination_share_server='dst_srv')
@ddt.data({'already_canceled': True, 'effect': exception.NetAppException},
{'already_canceled': False, 'effect':
(None, exception.NetAppException)})
@ddt.unpack
def test_migration_cancel_exception_volume_status(self, already_canceled,
effect):
source_snapshots = mock.Mock()
snapshot_mappings = mock.Mock()
self.library.configuration.netapp_migration_cancel_timeout = 1
mock_exception_log = self.mock_object(lib_base.LOG, 'exception')
mock_info_log = self.mock_object(lib_base.LOG, 'info')
self.mock_object(self.library, '_get_vserver',
mock.Mock(return_value=(fake.VSERVER1, mock.Mock())))
self.mock_object(self.library, '_get_backend_share_name',
mock.Mock(return_value=fake.SHARE_NAME))
self.mock_object(self.client, 'abort_volume_move')
self.mock_object(self.client, 'get_volume_move_status',
mock.Mock(side_effect=effect))
self.library.migration_cancel(
self.context, fake_share.fake_share_instance(),
fake_share.fake_share_instance(), source_snapshots,
snapshot_mappings, share_server=fake.SHARE_SERVER,
destination_share_server='dst_srv')
mock_exception_log.assert_called_once()
if not already_canceled:
mock_info_log.assert_called_once()
self.assertEqual(not already_canceled,
self.client.abort_volume_move.called)

View File

@ -0,0 +1,8 @@
---
fixes:
- |
NetApp cDOT driver with a very busy backend may report that the
``migration cancelation`` operation was successful even if wasn't. Now
during the ``migration cancelation`` operation the current state of the
migration task will be tracked and the operation will only report success
when the operation ends correctly.