Fix race condition in RemoteFS create_snapshot_online

There's a `while True` loop in create_snapshot_online and each
iteration can make the `cinder-volume` process sleep up to 10sec. In
the mean time, if Nova fails to create the snapshot, the snapshot
status could turn to "error" and someone could want to delete the
erroneous snapshot, which would make its status to be "deleting".

In that case the `while True` loop would never exit. Cinder consumes
100% of CPU and keeps logging "Status of snapshot XX is now deleting".

The patch fix this issue by exiting the `while True` loop if we detect
that the snapshot is to be deleted.

Closes-Bug: #1538496

Change-Id: I5de0e8479a552ce101cecd06a874a170e54d5c18
This commit is contained in:
Jordan Pittier
2016-02-17 19:05:49 +01:00
parent b77835b23a
commit 94b3288fe5
3 changed files with 48 additions and 3 deletions

View File

@@ -767,6 +767,11 @@ class RemoteFSException(VolumeDriverException):
message = _("Unknown RemoteFS exception")
class RemoteFSConcurrentRequest(RemoteFSException):
message = _("A concurrent, possibly contradictory, request "
"has been made.")
class RemoteFSNoSharesMounted(RemoteFSException):
message = _("No mounted shares found")

View File

@@ -40,6 +40,7 @@ class RemoteFsSnapDriverTestCase(test.TestCase):
_FAKE_SNAPSHOT = {'context': _FAKE_CONTEXT,
'id': _FAKE_SNAPSHOT_ID,
'volume': _FAKE_VOLUME,
'volume_id': _FAKE_VOLUME_ID,
'status': 'available',
'volume_size': 1}
_FAKE_SNAPSHOT_PATH = (_FAKE_VOLUME_PATH + '.' + _FAKE_SNAPSHOT_ID)
@@ -271,6 +272,38 @@ class RemoteFsSnapDriverTestCase(test.TestCase):
self._driver._create_snapshot,
fake_snapshot)
@mock.patch('cinder.db.snapshot_get')
@mock.patch('time.sleep')
def test_create_snapshot_online_with_concurrent_delete(
self, mock_sleep, mock_snapshot_get):
self._driver._nova = mock.Mock()
# Test what happens when progress is so slow that someone
# decides to delete the snapshot while the last known status is
# "creating".
mock_snapshot_get.side_effect = [
{'status': 'creating', 'progress': '42%'},
{'status': 'creating', 'progress': '45%'},
{'status': 'deleting'},
]
with mock.patch.object(self._driver, '_do_create_snapshot') as \
mock_do_create_snapshot:
self.assertRaises(exception.RemoteFSConcurrentRequest,
self._driver._create_snapshot_online,
self._FAKE_SNAPSHOT,
self._FAKE_VOLUME_NAME,
self._FAKE_SNAPSHOT_PATH)
mock_do_create_snapshot.assert_called_once_with(
self._FAKE_SNAPSHOT, self._FAKE_VOLUME_NAME,
self._FAKE_SNAPSHOT_PATH)
self.assertEqual([mock.call(1), mock.call(1)],
mock_sleep.call_args_list)
self.assertEqual(3, mock_snapshot_get.call_count)
mock_snapshot_get.assert_called_with(self._FAKE_SNAPSHOT['context'],
self._FAKE_SNAPSHOT['id'])
@mock.patch.object(utils, 'synchronized')
def _locked_volume_operation_test_helper(self, mock_synchronized, func,
expected_exception=False,

View File

@@ -1287,6 +1287,10 @@ class RemoteFSSnapDriver(RemoteFSDriver, driver.SnapshotVD):
while True:
s = db.snapshot_get(context, snapshot['id'])
LOG.debug('Status of snapshot %(id)s is now %(status)s',
{'id': snapshot['id'],
'status': s['status']})
if s['status'] == 'creating':
if s['progress'] == '90%':
# Nova tasks completed successfully
@@ -1300,9 +1304,12 @@ class RemoteFSSnapDriver(RemoteFSDriver, driver.SnapshotVD):
'while creating snapshot.')
raise exception.RemoteFSException(msg)
LOG.debug('Status of snapshot %(id)s is now %(status)s',
{'id': snapshot['id'],
'status': s['status']})
elif s['status'] == 'deleting' or s['status'] == 'error_deleting':
msg = _('Snapshot %(id)s has been asked to be deleted while '
'waiting for it to become available. Perhaps a '
'concurrent request was made.') % {'id':
snapshot['id']}
raise exception.RemoteFSConcurrentRequest(msg)
if 10 < seconds_elapsed <= 20:
increment = 2