RBD: Flattening of child volumes during deletion

This patch allows delete_volume and delete_snapshot calls
to fail less often when using RBD volume clones and snapshots.

RBD clone v2 support allows remove() to pass in situations
where it would previously fail, but it still fails with an
ImageBusy error in some situations.  For example:

 volume1
   -> snapshot s1 of volume 1
     -> volume2 cloned from snapshot 1
Deleting snapshot s1 would fail with ImageBusy.

This is fixed by using RBD flatten operations to break
dependencies between volumes/snapshots and other RBD volumes
or snapshots.

Delete now works as follows:
  1. Attempt RBD remove()
     This is the "fast path" for removing a simple volume
     that involves no extra overhead.
  2. If busy and the volume has child dependencies,
     flatten those dependencies with RBD flatten()
  3. Attempt RBD remove() again
     This will succeed in more cases than (1) would have.
  4. If remove() failed, use trash_move() instead to move
     the image to the trash instead.
     This allows Cinder deletion of a volume (volume1) to proceed
     in the scenario where volume2 was cloned from snapshot s1 of
     volume1, and snapshot s1 has been trashed and not fully
     deleted from the RBD backend.  (Snapshots in the trash
     namespace are no longer visible but are still in the
     dependency chain.)

This allows Cinder deletions to succeed in most scenarios where
they would previously fail.

In cases where a .clone_snap snapshot is present, we still do a
rename to .deleted instead of deleting/trashing the volume.  This
should be worked on further in a follow-up as it is likely not
necessary most of the time.

A new configuration option, rbd_concurrent_flatten_operations, was
introduced to limit how many flatten calls can be made at the same time.
This is to prevent overloading the backend. The default value is 3.

Co-Authored-By: Eric Harney <eharney@redhat.com>
Co-Authored-By: Sofia Enriquez <lsofia.enriquez@gmail.com>

Closes-Bug: #1969643
Change-Id: I009d0748fdc829ca0b4f99bc9b70dadd19717d04
This commit is contained in:
Eric Harney 2022-04-20 12:08:31 -04:00
parent 0634096238
commit 1a675c9aa1
3 changed files with 404 additions and 231 deletions

View File

@ -193,6 +193,7 @@ class RBDTestCase(test.TestCase):
cfg.rados_connection_interval = 5
cfg.backup_use_temp_snapshot = False
cfg.enable_deferred_deletion = False
cfg.rbd_concurrent_flatten_operations = 3
# Because the mocked conf doesn't actually have an underlying oslo conf
# it doesn't have the set_default method, so we use a fake one.
@ -792,15 +793,15 @@ class RBDTestCase(test.TestCase):
self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value,
self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name,
None)
(self.driver.rbd.Image.return_value
.list_snaps.assert_called_once_with())
self.mock_proxy.return_value.__enter__.return_value.\
list_snaps.assert_called_once_with()
client.__enter__.assert_called_once_with()
client.__exit__.assert_called_once_with(None, None, None)
mock_delete_backup_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value)
self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse(
self.driver.rbd.Image.return_value.unprotect_snap.called)
self.assertEqual(
@ -827,16 +828,16 @@ class RBDTestCase(test.TestCase):
self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value,
self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name,
None)
m_del_clone_parent_refs.assert_called_once()
(self.driver.rbd.Image.return_value
.list_snaps.assert_called_once_with())
m_del_clone_parent_refs.assert_not_called()
self.mock_proxy.return_value.__enter__.return_value.list_snaps.\
assert_called_once_with()
client.__enter__.assert_called_once_with()
client.__exit__.assert_called_once_with(None, None, None)
m_del_back_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value)
self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse(
self.driver.rbd.Image.return_value.unprotect_snap.called)
self.assertEqual(
@ -859,18 +860,17 @@ class RBDTestCase(test.TestCase):
drv.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with(
drv.rbd.Image.return_value,
self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name,
None)
drv.rbd.Image.return_value.list_snaps.assert_called_once_with()
client.__enter__.assert_called_once_with()
client.__exit__.assert_called_once_with(None, None, None)
mock_delete_backup_snaps.assert_called_once_with(
drv.rbd.Image.return_value)
self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse(
drv.rbd.Image.return_value.unprotect_snap.called)
self.assertEqual(
1, drv.rbd.RBD.return_value.trash_move.call_count)
0, drv.rbd.RBD.return_value.trash_move.call_count)
self.driver.rbd.RBD.return_value.remove.assert_not_called()
@common_mocks
@ -926,7 +926,7 @@ class RBDTestCase(test.TestCase):
drv.delete_volume(self.volume_a)
self.assertEqual(
1, drv.rbd.RBD.return_value.trash_move.call_count)
0, drv.rbd.RBD.return_value.trash_move.call_count)
@common_mocks
def test_deferred_deletion_w_deleted_parent(self):
@ -940,16 +940,18 @@ class RBDTestCase(test.TestCase):
drv.delete_volume(self.volume_a)
self.assertEqual(
2, drv.rbd.RBD.return_value.trash_move.call_count)
0, drv.rbd.RBD.return_value.trash_move.call_count)
@common_mocks
def test_delete_volume_not_found_at_open(self):
self.mock_rbd.Image.side_effect = self.mock_rbd.ImageNotFound
self.mock_proxy.side_effect = self.mock_rbd.ImageNotFound
self.assertIsNone(self.driver.delete_volume(self.volume_a))
with mock.patch.object(driver, 'RADOSClient') as client:
client = self.mock_client.return_value.__enter__.return_value
self.mock_rbd.Image.assert_called_once_with(client.ioctx,
self.volume_a.name)
self.mock_proxy.assert_called_once_with(self.driver,
self.volume_a.name,
ioctx=client.ioctx)
# Make sure the exception was raised
self.assertEqual([self.mock_rbd.ImageNotFound], RAISED_EXCEPTIONS)
@ -958,45 +960,58 @@ class RBDTestCase(test.TestCase):
self.mock_rbd.Image.return_value.list_snaps.return_value = []
self.mock_rbd.RBD.return_value.remove.side_effect = (
self.mock_rbd.ImageBusy)
mock_get_clone_info = self.mock_object(self.driver, '_get_clone_info')
mock_get_clone_info.return_value = (None, None, None)
self.mock_rbd.ImageBusy,
None)
mock_delete_backup_snaps = self.mock_object(self.driver,
'_delete_backup_snaps')
mock_rados_client = self.mock_object(driver, 'RADOSClient')
mock_flatten = self.mock_object(self.driver, '_flatten')
self.driver.delete_volume(self.volume_a)
with mock.patch.object(self.driver, '_get_clone_info') as \
mock_get_clone_info:
mock_get_clone_info.return_value = (None, None, None)
self.driver.rbd.Image.return_value.list_children.\
return_value = [('pool1', 'child1'),
('pool1', 'child2')]
self.mock_proxy.return_value.__enter__.return_value.list_children.\
return_value = [('pool1', 'child1'), ('pool1', 'child2')]
self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value,
self.volume_a.name,
None)
self.mock_rbd.Image.return_value.list_snaps.assert_called_once_with()
mock_rados_client.assert_called_once_with(self.driver)
mock_delete_backup_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value)
self.assertFalse(
self.mock_rbd.Image.return_value.unprotect_snap.called)
self.assertEqual(
1, self.mock_rbd.RBD.return_value.remove.call_count)
self.assertEqual(1, len(RAISED_EXCEPTIONS))
# Make sure the exception was raised
self.assertIn(self.mock_rbd.ImageBusy, RAISED_EXCEPTIONS)
mock_flatten.assert_has_calls(
[mock.call('pool1', 'child1'),
mock.call('pool1', 'child2')])
self.mock_rbd.RBD.return_value.trash_move.\
assert_called_once_with(
mock.ANY,
mock_get_clone_info.assert_called_once_with(
self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name,
0)
None)
self.mock_proxy.return_value.__enter__.return_value.list_snaps.\
assert_called_once_with()
mock_rados_client.assert_called_once_with(self.driver)
mock_delete_backup_snaps.assert_called_once_with(
self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse(
self.mock_rbd.Image.return_value.unprotect_snap.
called)
self.assertEqual(
2,
self.mock_rbd.RBD.return_value.remove.call_count)
self.assertEqual(1, len(RAISED_EXCEPTIONS))
# Make sure the exception was raised
self.assertIn(self.mock_rbd.ImageBusy,
RAISED_EXCEPTIONS)
self.mock_rbd.RBD.return_value.trash_move.assert_not_called()
@common_mocks
def test_delete_volume_has_snapshots(self):
self.mock_rbd.Image.return_value.list_snaps.return_value = []
self.mock_rbd.RBD.return_value.remove.side_effect = (
self.mock_rbd.ImageHasSnapshots)
self.mock_rbd.ImageHasSnapshots, # initial vol remove attempt
None # removal of child image
)
mock_get_clone_info = self.mock_object(self.driver,
'_get_clone_info',
return_value=(None,
@ -1005,18 +1020,70 @@ class RBDTestCase(test.TestCase):
m_del_backup_snaps = self.mock_object(self.driver,
'_delete_backup_snaps')
mock_try_remove_volume = self.mock_object(self.driver,
'_try_remove_volume',
return_value=True)
mock_rados_client = self.mock_object(driver, 'RADOSClient')
self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with(
self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name,
None)
mock_rados_client.assert_called_once_with(self.driver)
m_del_backup_snaps.assert_called_once_with(
self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse(
self.mock_rbd.Image.return_value.unprotect_snap.called)
self.assertEqual(
1, self.mock_rbd.RBD.return_value.remove.call_count)
self.assertEqual(1, len(RAISED_EXCEPTIONS))
# Make sure the exception was raised
self.assertIn(self.mock_rbd.ImageHasSnapshots,
RAISED_EXCEPTIONS)
self.mock_rbd.RBD.return_value.trash_move.assert_not_called()
mock_try_remove_volume.assert_called_once_with(mock.ANY,
self.volume_a.name)
@common_mocks
def test_delete_volume_has_snapshots_trash(self):
self.mock_rbd.Image.return_value.list_snaps.return_value = []
self.mock_rbd.RBD.return_value.remove.side_effect = (
self.mock_rbd.ImageHasSnapshots, # initial vol remove attempt
None # removal of child image
)
mock_get_clone_info = self.mock_object(self.driver,
'_get_clone_info',
return_value=(None,
None,
None))
m_del_backup_snaps = self.mock_object(self.driver,
'_delete_backup_snaps')
mock_try_remove_volume = self.mock_object(self.driver,
'_try_remove_volume',
return_value=False)
mock_trash_volume = self.mock_object(self.driver,
'_move_volume_to_trash')
with mock.patch.object(driver, 'RADOSClient') as mock_rados_client:
self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value,
self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name,
None)
(self.mock_rbd.Image.return_value.list_snaps
.assert_called_once_with())
self.mock_proxy.return_value.__enter__.return_value.list_snaps.\
assert_called_once_with()
mock_rados_client.assert_called_once_with(self.driver)
m_del_backup_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value)
self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse(
self.mock_rbd.Image.return_value.unprotect_snap.called)
self.assertEqual(
@ -1027,10 +1094,14 @@ class RBDTestCase(test.TestCase):
RAISED_EXCEPTIONS)
self.mock_rbd.RBD.return_value.trash_move.\
assert_called_once_with(
mock.ANY,
self.volume_a.name,
0)
assert_not_called()
mock_trash_volume.assert_called_once_with(mock.ANY,
self.volume_a.name,
0)
mock_try_remove_volume.assert_called_once_with(mock.ANY,
self.volume_a.name)
@common_mocks
def test_delete_volume_not_found(self):
@ -1046,15 +1117,20 @@ class RBDTestCase(test.TestCase):
mock_get_clone_info = self.mock_object(self.driver, '_get_clone_info')
mock_get_clone_info.return_value = (None, None, None)
mock_find_clone_snap = self.mock_object(self.driver,
'_find_clone_snap',
return_value=None)
self.assertIsNone(self.driver.delete_volume(self.volume_a))
image = self.mock_proxy.return_value.__enter__.return_value
mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value,
image,
self.volume_a.name,
None)
self.mock_rbd.Image.return_value.list_snaps.assert_called_once_with()
mock_find_clone_snap.assert_called_once_with(image)
mock_rados_client.assert_called_once_with(self.driver)
mock_delete_backup_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value)
mock_delete_backup_snaps.assert_called_once_with(image)
self.assertFalse(
self.mock_rbd.Image.return_value.unprotect_snap.called)
self.assertEqual(
@ -1083,21 +1159,22 @@ class RBDTestCase(test.TestCase):
return_value=(None,
None,
None))
self.mock_object(self.driver, '_find_clone_snap',
return_value=snapshots[2]['name'])
with mock.patch.object(self.driver, '_delete_backup_snaps') as \
mock_delete_backup_snaps:
self.driver.delete_volume(self.volume_a)
mock_get_clone_info.assert_called_once_with(
self.mock_rbd.Image.return_value,
self.mock_proxy.return_value.__enter__.return_value,
self.volume_a.name,
snapshots[2]['name'])
(self.driver.rbd.Image.return_value
.list_snaps.assert_called_once_with())
client.__enter__.assert_called_once_with()
client.__exit__.assert_called_once_with(None, None, None)
mock_delete_backup_snaps.assert_called_once_with(
self.mock_rbd.Image.return_value)
self.mock_proxy.return_value.__enter__.return_value)
self.assertFalse(
self.driver.rbd.Image.return_value.unprotect_snap.called)
self.assertEqual(
@ -1281,26 +1358,40 @@ class RBDTestCase(test.TestCase):
proxy.__enter__.return_value = proxy
proxy.unprotect_snap.side_effect = (
self.mock_rbd.ImageBusy,
None)
with mock.patch.object(self.driver, '_flatten_children') as \
mock_flatten_children:
self.driver.delete_snapshot(self.snapshot)
mock_flatten_children.assert_called_once_with(mock.ANY,
self.volume_a.name,
self.snapshot.name)
self.assertTrue(proxy.unprotect_snap.called)
self.assertTrue(proxy.remove_snap.called)
@common_mocks
@mock.patch.object(driver.RBDDriver, '_flatten')
@mock.patch('cinder.objects.Volume.get_by_id')
def test_delete_busy_snapshot_fail(self, volume_get_by_id, flatten_mock):
volume_get_by_id.return_value = self.volume_a
proxy = self.mock_proxy.return_value
proxy.__enter__.return_value = proxy
proxy.unprotect_snap.side_effect = (
self.mock_rbd.ImageBusy,
self.mock_rbd.ImageBusy,
self.mock_rbd.ImageBusy)
flatten_mock.side_effect = exception.SnapshotIsBusy(self.snapshot.name)
with mock.patch.object(self.driver, '_get_children_info') as \
mock_get_children_info:
mock_get_children_info.return_value = [('pool', 'volume2')]
self.assertRaises(exception.SnapshotIsBusy,
self.driver.delete_snapshot,
self.snapshot)
with mock.patch.object(driver, 'LOG') as \
mock_log:
self.assertRaises(exception.SnapshotIsBusy,
self.driver.delete_snapshot,
self.snapshot)
mock_get_children_info.assert_called_once_with(
proxy,
self.snapshot.name)
self.assertTrue(mock_log.info.called)
self.assertTrue(proxy.unprotect_snap.called)
self.assertFalse(proxy.remove_snap.called)
self.assertTrue(proxy.unprotect_snap.called)
self.assertFalse(proxy.remove_snap.called)
@common_mocks
@mock.patch('cinder.objects.Volume.get_by_id')
@ -1325,19 +1416,6 @@ class RBDTestCase(test.TestCase):
self.snapshot)
image.rollback_to_snap.assert_called_once_with(self.snapshot.name)
@common_mocks
def test_get_children_info(self):
volume = self.mock_proxy
volume.set_snap = mock.Mock()
volume.list_children = mock.Mock()
list_children = [('pool', 'volume2')]
volume.list_children.return_value = list_children
info = self.driver._get_children_info(volume,
self.snapshot['name'])
self.assertEqual(list_children, info)
@common_mocks
def test_get_clone_info(self):
volume = self.mock_rbd.Image()

View File

@ -1,4 +1,5 @@
# Copyright 2013 OpenStack Foundation
# Copyright 2022 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
@ -133,6 +134,9 @@ RBD_OPTS = [
cfg.IntOpt('deferred_deletion_purge_interval', default=60,
help='Number of seconds between runs of the periodic task '
'to purge volumes tagged for deletion.'),
cfg.IntOpt('rbd_concurrent_flatten_operations', default=3, min=0,
help='Number of flatten operations that will run '
'concurrently on this volume service.')
]
CONF = cfg.CONF
@ -356,6 +360,10 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
self.keyring_data: Optional[str] = None
self._set_keyring_attributes()
self._semaphore = utils.semaphore_factory(
limit=self.configuration.rbd_concurrent_flatten_operations,
concurrent_processes=1)
def _set_keyring_attributes(self) -> None:
# The rbd_keyring_conf option is not available for OpenStack usage
# for security reasons (OSSN-0085) and in OpenStack we use
@ -830,6 +838,25 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
'dst_size': volume.size})
self._resize(volume)
def _flatten_volume(
self,
image_name: str,
client: RADOSClient) -> None:
# Flatten destination volume
try:
with RBDVolumeProxy(self, image_name, client=client,
ioctx=client.ioctx) as dest_volume:
LOG.debug("flattening volume %s", image_name)
dest_volume.flatten()
except Exception as e:
msg = (_("Failed to flatten volume %(volume)s with "
"error: %(error)s.") %
{'volume': image_name,
'error': e})
LOG.exception(msg)
raise exception.VolumeBackendAPIException(data=msg)
def create_cloned_volume(
self,
volume: Volume,
@ -890,24 +917,12 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
# volumes are always flattened.
if (volume.use_quota and
depth >= self.configuration.rbd_max_clone_depth):
LOG.info("maximum clone depth (%d) has been reached - "
"flattening dest volume",
self.configuration.rbd_max_clone_depth)
# Flatten destination volume
try:
with RBDVolumeProxy(self, dest_name, client=client,
ioctx=client.ioctx) as dest_volume:
LOG.debug("flattening dest volume %s", dest_name)
dest_volume.flatten()
except Exception as e:
msg = (_("Failed to flatten volume %(volume)s with "
"error: %(error)s.") %
{'volume': dest_name,
'error': e})
LOG.exception(msg)
src_volume.close()
raise exception.VolumeBackendAPIException(data=msg)
self._flatten_volume(dest_name, client)
try:
# remove temporary snap
@ -1168,11 +1183,22 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
return volume_update
@utils.limit_operations
def _do_flatten(self, volume_name: str, pool: str) -> None:
LOG.debug('flattening %s/%s', pool, volume_name)
try:
with RBDVolumeProxy(self, volume_name, pool=pool) as vol:
vol.flatten()
LOG.debug('flattening of %s/%s has completed',
pool, volume_name)
except self.rbd.ImageNotFound:
LOG.debug('image %s not found during flatten', volume_name)
# do nothing
def _flatten(self, pool: str, volume_name: str) -> None:
LOG.debug('flattening %(pool)s/%(img)s',
dict(pool=pool, img=volume_name))
with RBDVolumeProxy(self, volume_name, pool) as vol:
vol.flatten()
image = pool + '/' + volume_name
LOG.debug('Queueing %s for flattening', image)
self._do_flatten(volume_name, pool)
def _get_stripe_unit(self, ioctx: 'rados.Ioctx', volume_name: str) -> int:
"""Return the correct stripe unit for a cloned volume.
@ -1310,23 +1336,6 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
return (None, None, None)
def _get_children_info(self,
volume: 'rbd.Image',
snap: Optional[str]) -> list[tuple]:
"""List children for the given snapshot of a volume(image).
Returns a list of (pool, image).
"""
children_list = []
if snap:
volume.set_snap(snap)
children_list = volume.list_children()
volume.set_snap(None)
return children_list
def _delete_clone_parent_refs(self,
client: RADOSClient,
parent_name: str,
@ -1369,96 +1378,161 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
g_parent_snap = typing.cast(str, g_parent_snap)
self._delete_clone_parent_refs(client, g_parent, g_parent_snap)
def delete_volume(self, volume: Volume) -> None:
"""Deletes a logical volume."""
volume_name = volume.name
with RADOSClient(self) as client:
def _flatten_children(self,
client_ioctx: 'rados.Ioctx',
volume_name: str,
snap_name: Optional[str] = None) -> None:
with RBDVolumeProxy(self,
volume_name,
ioctx=client_ioctx) as rbd_image:
if snap_name is not None:
rbd_image.set_snap(snap_name)
children_list = rbd_image.list_children()
for (pool, child_name) in children_list:
LOG.info('Image %(pool)s/%(image)s%(snap)s is dependent '
'on the image %(volume_name)s.',
{'pool': pool,
'image': child_name,
'volume_name': volume_name,
'snap': '@' + snap_name if snap_name else ''})
try:
rbd_image = self.rbd.Image(client.ioctx, volume_name)
except self.rbd.ImageNotFound:
LOG.info("volume %s no longer exists in backend",
volume_name)
return
self._flatten(pool, child_name)
except Exception as e:
LOG.error(e)
raise
clone_snap = None
parent = None
def _move_volume_to_trash(self,
client_ioctx: 'rados.Ioctx',
volume_name: str,
delay: int) -> None:
# trash_move() will succeed in some situations when a regular
# remove() call will fail due to image dependencies
LOG.debug("moving volume %s to trash", volume_name)
try:
self.RBDProxy().trash_move(client_ioctx,
volume_name,
delay)
except self.rbd.ImageBusy:
msg = _('ImageBusy error raised while trashing RBD '
'volume.')
LOG.warning(msg)
raise exception.VolumeIsBusy(msg, volume_name=volume_name)
# Ensure any backup snapshots are deleted
self._delete_backup_snaps(rbd_image)
# If the volume has non-clone snapshots this delete is expected to
# raise VolumeIsBusy so do so straight away.
def _try_remove_volume(self, client: 'rados', volume_name: str) -> bool:
# Try a couple of times to delete the volume, rather than
# stopping on the first error.
# In the event of simultaneous Cinder delete operations,
# this gives a window for other deletes of snapshots and images
# to complete, freeing dependencies which allow this remove to
# succeed.
@utils.retry((self.rbd.ImageBusy, self.rbd.ImageHasSnapshots),
self.configuration.rados_connection_interval,
self.configuration.rados_connection_retries)
def _do_try_remove_volume(self, client, volume_name: str) -> bool:
try:
snaps = rbd_image.list_snaps()
for snap in snaps:
if snap['name'].endswith('.clone_snap'):
LOG.debug("volume has clone snapshot(s)")
# We grab one of these and use it when fetching parent
# info in case the volume has been flattened.
clone_snap = snap['name']
break
LOG.debug('Trying to remove image %s', volume_name)
self.RBDProxy().remove(client.ioctx, volume_name)
return True
except (self.rbd.ImageHasSnapshots, self.rbd.ImageBusy):
with excutils.save_and_reraise_exception():
msg = _('deletion failed')
LOG.info(msg)
return False
return _do_try_remove_volume(self, client, volume_name)
@staticmethod
def _find_clone_snap(rbd_image: RBDVolumeProxy) -> Optional[str]:
snaps = rbd_image.list_snaps()
for snap in snaps:
if snap['name'].endswith('.clone_snap'):
LOG.debug("volume has clone snapshot(s)")
# We grab one of these and use it when fetching parent
# info in case the volume has been flattened.
clone_snap = snap['name']
return clone_snap
return None
def _delete_volume(self, volume: Volume, client: RADOSClient) -> None:
clone_snap = None
parent = None
parent_snap = None
try:
with RBDVolumeProxy(self,
volume.name,
ioctx=client.ioctx) as rbd_image:
# Ensure any backup snapshots are deleted
self._delete_backup_snaps(rbd_image)
clone_snap = self._find_clone_snap(rbd_image)
# Determine if this volume is itself a clone
_pool, parent, parent_snap = self._get_clone_info(rbd_image,
volume_name,
volume.name,
clone_snap)
finally:
rbd_image.close()
except self.rbd.ImageNotFound:
LOG.info("volume %s no longer exists in backend", volume.name)
return
@utils.retry(self.rbd.ImageBusy,
self.configuration.rados_connection_interval,
self.configuration.rados_connection_retries)
def _try_remove_volume(client: Any, volume_name: str) -> None:
if self.configuration.enable_deferred_deletion:
delay = self.configuration.deferred_deletion_delay
else:
try:
self.RBDProxy().remove(client.ioctx, volume_name)
return
except (self.rbd.ImageHasSnapshots, self.rbd.ImageBusy):
delay = 0
LOG.debug("moving volume %s to trash", volume_name)
# When using the RBD v2 clone api, deleting a volume
# that has a snapshot in the trash space raises a
# busy exception.
# In order to solve this, call the trash operation
# which should succeed when the volume has
# dependencies.
self.RBDProxy().trash_move(client.ioctx,
volume_name,
delay)
if clone_snap is not None:
# If the volume has copy-on-write clones, keep it as a silent
# volume which will be deleted when its snapshots and clones
# are deleted.
# TODO: only do this if it actually can't be deleted?
new_name = "%s.deleted" % (volume.name)
self.RBDProxy().rename(client.ioctx, volume.name, new_name)
return
if clone_snap is None:
LOG.debug("deleting rbd volume %s", volume_name)
try:
_try_remove_volume(client, volume_name)
except self.rbd.ImageBusy:
msg = (_("ImageBusy error raised while deleting rbd "
"volume. This may have been caused by a "
"connection from a client that has crashed and, "
"if so, may be resolved by retrying the delete "
"after 30 seconds has elapsed."))
LOG.warning(msg)
# Now raise this so that the volume stays available and the
# deletion can be retried.
raise exception.VolumeIsBusy(msg, volume_name=volume_name)
except self.rbd.ImageNotFound:
LOG.info("RBD volume %s not found, allowing delete "
"operation to proceed.", volume_name)
return
LOG.debug("deleting RBD volume %s", volume.name)
# If it is a clone, walk back up the parent chain deleting
# references.
if parent:
LOG.debug("volume is a clone so cleaning references")
parent_snap = typing.cast(str, parent_snap)
self._delete_clone_parent_refs(client, parent, parent_snap)
else:
# If the volume has copy-on-write clones we will not be able to
# delete it. Instead we will keep it as a silent volume which
# will be deleted when it's snapshot and clones are deleted.
new_name = "%s.deleted" % (volume_name)
self.RBDProxy().rename(client.ioctx, volume_name, new_name)
try:
self.RBDProxy().remove(client.ioctx, volume.name)
return # the fast path was successful
except (self.rbd.ImageHasSnapshots, self.rbd.ImageBusy):
self._flatten_children(client.ioctx, volume.name)
except self.rbd.ImageNotFound:
LOG.info("RBD volume %s not found, allowing delete "
"operation to proceed.", volume.name)
return
try:
if self._try_remove_volume(client, volume.name):
return
except self.rbd.ImageHasSnapshots:
# perform trash instead, which can succeed when snapshots exist
pass
except self.rbd.ImageBusy:
msg = _('ImageBusy error raised while deleting RBD volume')
raise exception.VolumeIsBusy(msg, volume_name=volume.name)
delay = 0
if self.configuration.enable_deferred_deletion:
delay = self.configuration.deferred_deletion_delay
# Since it failed to remove above, trash the volume here instead.
# This covers the scenario of an image unable to be deleted because
# a child snapshot of it has been trashed but not yet removed.
# That snapshot is not visible but is still in the dependency
# chain of RBD images.
self._move_volume_to_trash(client.ioctx, volume.name, delay)
# If it is a clone, walk back up the parent chain deleting
# references.
if parent:
LOG.debug("volume is a clone so cleaning references")
parent_snap = typing.cast(str, parent_snap)
self._delete_clone_parent_refs(client, parent, parent_snap)
def delete_volume(self, volume: Volume) -> None:
"""Deletes an RBD volume."""
with RADOSClient(self) as client:
self._delete_volume(volume, client)
def create_snapshot(self, snapshot: Snapshot) -> None:
"""Creates an rbd snapshot."""
@ -1472,38 +1546,42 @@ class RBDDriver(driver.CloneableImageVD, driver.MigrateVD,
volume_name = snapshot.volume_name
snap_name = snapshot.name
@utils.retry(self.rbd.ImageBusy,
self.configuration.rados_connection_interval,
self.configuration.rados_connection_retries)
def do_unprotect_snap(self, volume_name, snap_name):
try:
with RBDVolumeProxy(self, volume_name) as volume:
volume.unprotect_snap(snap_name)
except self.rbd.InvalidArgument:
LOG.info(
"InvalidArgument: Unable to unprotect snapshot %s.",
snap_name)
except self.rbd.ImageNotFound as e:
LOG.info("Snapshot %s does not exist in backend.",
snap_name)
raise e
except self.rbd.ImageBusy as e:
# flatten and then retry the operation
with RADOSClient(self) as client:
self._flatten_children(client.ioctx,
volume_name,
snap_name)
raise e
try:
do_unprotect_snap(self, volume_name, snap_name)
except self.rbd.ImageBusy:
raise exception.SnapshotIsBusy(snapshot_name=snap_name)
except self.rbd.ImageNotFound:
return
try:
with RBDVolumeProxy(self, volume_name) as volume:
try:
volume.unprotect_snap(snap_name)
except self.rbd.InvalidArgument:
LOG.info(
"InvalidArgument: Unable to unprotect snapshot %s.",
snap_name)
except self.rbd.ImageNotFound:
LOG.info("Snapshot %s does not exist in backend.",
snap_name)
return
except self.rbd.ImageBusy:
children_list = self._get_children_info(volume, snap_name)
if children_list:
for (pool, image) in children_list:
LOG.info('Image %(pool)s/%(image)s is dependent '
'on the snapshot %(snap)s.',
{'pool': pool,
'image': image,
'snap': snap_name})
raise exception.SnapshotIsBusy(snapshot_name=snap_name)
try:
volume.remove_snap(snap_name)
except self.rbd.ImageNotFound:
LOG.info("Snapshot %s does not exist in backend.",
snap_name)
volume.remove_snap(snap_name)
except self.rbd.ImageNotFound:
LOG.warning("Volume %s does not exist in backend.", volume_name)
LOG.info("Snapshot %s does not exist in backend.",
snap_name)
def snapshot_revert_use_temp_snapshot(self) -> bool:
"""Disable the use of a temporary snapshot on revert."""

View File

@ -0,0 +1,17 @@
---
upgrade:
- |
Cinder now uses the RBD trash functionality to handle some volume deletions.
Therefore, deployments must either a) enable scheduled RBD trash purging on
the RBD backend or b) enable the Cinder RBD driver's enable_deferred_deletion
option to have Cinder purge the RBD trash.
This adds the new configuration option 'rbd_concurrent_flatten_operations',
which limits how many RBD flattens the driver will run simultaneously.
This can be used to prevent flatten operations from consuming too much I/O
capacity on the Ceph cluster. It defaults to 3.
fixes:
- |
`Bug #1969643 <https://bugs.launchpad.net/cinder/+bug/1969643>`_:
The RBD driver can now delete volumes with other volumes cloned from it
(or its snapshots) in cases where deletion would previously fail. This
uses the RBD trash functionality.