XenAPI: Remove interrupted snapshots

Currently the VDI chain can grow very long when a snapshots happen at
the same time as nova-compute being terminated.

While we now clean up the instance state, the VDI chain is left in a bad
state, it has an extra snapshot that is no longer required.

This change improves that by looking at when we detect a failed
snapshot, we go back and tidy up the VDI chain.

Partial-Bug: #1331440

Change-Id: I9bae82048910d8c45bc2a4093064c1ac68f15750
This commit is contained in:
John Garbutt 2014-06-18 13:19:57 +01:00 committed by John Garbutt
parent cd641576fa
commit 64520ceeac
10 changed files with 210 additions and 19 deletions

View File

@ -855,6 +855,12 @@ class ComputeManager(manager.Manager):
LOG.debug("Instance in transitional state %s at start-up "
"clearing task state",
instance['task_state'], instance=instance)
try:
self._post_interrupted_snapshot_cleanup(context, instance)
except Exception:
# we don't want that an exception blocks the init_host
msg = _LE('Failed to cleanup snapshot.')
LOG.exception(msg, instance=instance)
instance.task_state = None
instance.save()
@ -2939,6 +2945,9 @@ class ComputeManager(manager.Manager):
msg = _("Image not found during snapshot")
LOG.warn(msg, instance=instance)
def _post_interrupted_snapshot_cleanup(self, context, instance):
self.driver.post_interrupted_snapshot_cleanup(context, instance)
@object_compat
@messaging.expected_exceptions(NotImplementedError)
def volume_snapshot_create(self, context, instance, volume_id,

View File

@ -484,10 +484,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
def _test_init_instance_cleans_image_states(self, instance):
with mock.patch.object(instance, 'save') as save:
self.compute._get_power_state = mock.Mock()
self.compute.driver.post_interrupted_snapshot_cleanup = mock.Mock()
instance.info_cache = None
instance.power_state = power_state.RUNNING
self.compute._init_instance(self.context, instance)
save.assert_called_once_with()
self.compute.driver.post_interrupted_snapshot_cleanup.\
assert_called_once_with(self.context, instance)
self.assertIsNone(instance.task_state)
def test_init_instance_cleans_image_state_pending_upload(self):

View File

@ -263,6 +263,12 @@ class _VirtDriverTestCase(_FakeDriverBackendTestCase):
self.connection.snapshot(self.ctxt, instance_ref, img_ref['id'],
lambda *args, **kwargs: None)
@catch_notimplementederror
def test_post_interrupted_snapshot_cleanup(self):
instance_ref, network_info = self._get_running_instance()
self.connection.post_interrupted_snapshot_cleanup(self.ctxt,
instance_ref)
@catch_notimplementederror
def test_reboot(self):
reboot_type = "SOFT"

View File

@ -15,6 +15,8 @@
import math
import mock
from nova.openstack.common import units
from nova.tests.virt import test_driver
from nova.tests.virt.xenapi import stubs
@ -27,6 +29,12 @@ class XenAPIDriverTestCase(stubs.XenAPITestBaseNoDB,
test_driver.DriverAPITestHelper):
"""Unit tests for Driver operations."""
def _get_driver(self):
stubs.stubout_session(self.stubs, stubs.FakeSessionForVMTests)
self.flags(connection_url='test_url',
connection_password='test_pass', group='xenserver')
return xenapi.XenAPIDriver(fake.FakeVirtAPI(), False)
def host_stats(self, refresh=True):
return {'host_memory_total': 3 * units.Mi,
'host_memory_free_computed': 2 * units.Mi,
@ -40,11 +48,7 @@ class XenAPIDriverTestCase(stubs.XenAPITestBaseNoDB,
'pci_passthrough_devices': ''}
def test_available_resource(self):
self.flags(connection_url='test_url',
connection_password='test_pass', group='xenserver')
stubs.stubout_session(self.stubs, stubs.FakeSessionForVMTests)
driver = xenapi.XenAPIDriver(fake.FakeVirtAPI(), False)
driver = self._get_driver()
driver._session.product_version = (6, 8, 2)
self.stubs.Set(driver, 'get_host_stats', self.host_stats)
@ -62,10 +66,7 @@ class XenAPIDriverTestCase(stubs.XenAPITestBaseNoDB,
self.assertEqual(1, resources['disk_available_least'])
def test_overhead(self):
self.flags(connection_url='test_url',
connection_password='test_pass', group='xenserver')
stubs.stubout_session(self.stubs, stubs.FakeSessionForVMTests)
driver = xenapi.XenAPIDriver(fake.FakeVirtAPI(), False)
driver = self._get_driver()
instance = {'memory_mb': 30720, 'vcpus': 4}
# expected memory overhead per:
@ -78,10 +79,7 @@ class XenAPIDriverTestCase(stubs.XenAPITestBaseNoDB,
self.assertEqual(expected, overhead['memory_mb'])
def test_set_bootable(self):
self.flags(connection_url='test_url', connection_password='test_pass',
group='xenserver')
stubs.stubout_session(self.stubs, stubs.FakeSessionForVMTests)
driver = xenapi.XenAPIDriver(fake.FakeVirtAPI(), False)
driver = self._get_driver()
self.mox.StubOutWithMock(driver._vmops, 'set_bootable')
driver._vmops.set_bootable('inst', True)
@ -89,9 +87,15 @@ class XenAPIDriverTestCase(stubs.XenAPITestBaseNoDB,
driver.set_bootable('inst', True)
def test_post_interrupted_snapshot_cleanup(self):
driver = self._get_driver()
fake_vmops_cleanup = mock.Mock()
driver._vmops.post_interrupted_snapshot_cleanup = fake_vmops_cleanup
driver.post_interrupted_snapshot_cleanup("context", "instance")
fake_vmops_cleanup.assert_called_once_with("context", "instance")
def test_public_api_signatures(self):
self.flags(connection_url='test_url', connection_password='test_pass',
group='xenserver')
stubs.stubout_session(self.stubs, stubs.FakeSessionForVMTests)
inst = xenapi.XenAPIDriver(fake.FakeVirtAPI(), False)
inst = self._get_driver()
self.assertPublicAPISignatures(inst)

View File

@ -2310,6 +2310,102 @@ class CreateVmRecordTestCase(VMUtilsTestBase):
self.assertIn(vm_ref, result_keys)
class ChildVHDsTestCase(test.NoDBTestCase):
all_vdis = [
("my-vdi-ref",
{"uuid": "my-uuid", "sm_config": {},
"is_a_snapshot": False, "other_config": {}}),
("non-parent",
{"uuid": "uuid-1", "sm_config": {},
"is_a_snapshot": False, "other_config": {}}),
("diff-parent",
{"uuid": "uuid-1", "sm_config": {"vhd-parent": "other-uuid"},
"is_a_snapshot": False, "other_config": {}}),
("child",
{"uuid": "uuid-child", "sm_config": {"vhd-parent": "my-uuid"},
"is_a_snapshot": False, "other_config": {}}),
("child-snap",
{"uuid": "uuid-child-snap", "sm_config": {"vhd-parent": "my-uuid"},
"is_a_snapshot": True, "other_config": {}}),
]
@mock.patch.object(vm_utils, '_get_all_vdis_in_sr')
def test_child_vhds_defaults(self, mock_get_all):
mock_get_all.return_value = self.all_vdis
result = vm_utils._child_vhds("session", "sr_ref", "my-uuid")
self.assertEqual(['uuid-child', 'uuid-child-snap'], result)
@mock.patch.object(vm_utils, '_get_all_vdis_in_sr')
def test_child_vhds_only_snapshots(self, mock_get_all):
mock_get_all.return_value = self.all_vdis
result = vm_utils._child_vhds("session", "sr_ref", "my-uuid",
old_snapshots_only=True)
self.assertEqual(['uuid-child-snap'], result)
def test_is_vdi_a_snapshot_works(self):
vdi_rec = {"is_a_snapshot": True,
"other_config": {}}
self.assertTrue(vm_utils._is_vdi_a_snapshot(vdi_rec))
def test_is_vdi_a_snapshot_base_images_false(self):
vdi_rec = {"is_a_snapshot": True,
"other_config": {"image-id": "fake"}}
self.assertFalse(vm_utils._is_vdi_a_snapshot(vdi_rec))
def test_is_vdi_a_snapshot_false_for_non_snapshot(self):
vdi_rec = {"is_a_snapshot": False,
"other_config": {}}
self.assertFalse(vm_utils._is_vdi_a_snapshot(vdi_rec))
class RemoveOldSnapshotsTestCase(test.NoDBTestCase):
@mock.patch.object(vm_utils, '_child_vhds')
@mock.patch.object(vm_utils, '_get_vhd_parent_uuid')
@mock.patch.object(vm_utils, 'get_vdi_for_vm_safely')
@mock.patch.object(vm_utils, 'safe_find_sr')
def test_get_snapshots_for_vm(self, mock_find, mock_get_vdi,
mock_parent, mock_child_vhds):
session = mock.Mock()
instance = {"uuid": "uuid"}
mock_find.return_value = "sr_ref"
mock_get_vdi.return_value = ("vm_vdi_ref", "vm_vdi_rec")
mock_parent.return_value = "parent_uuid"
mock_child_vhds.return_value = []
result = vm_utils._get_snapshots_for_vm(session, instance, "vm_ref")
self.assertEqual([], result)
mock_find.assert_called_once_with(session)
mock_get_vdi.assert_called_once_with(session, "vm_ref")
mock_parent.assert_called_once_with(session, "vm_vdi_ref")
mock_child_vhds.assert_called_once_with(session, "sr_ref",
"parent_uuid", old_snapshots_only=True)
@mock.patch.object(vm_utils, 'scan_default_sr')
@mock.patch.object(vm_utils, 'safe_destroy_vdis')
@mock.patch.object(vm_utils, '_get_snapshots_for_vm')
def test_remove_old_snapshots(self, mock_get, mock_destroy, mock_scan):
session = mock.Mock()
instance = {"uuid": "uuid"}
mock_get.return_value = ["vdi_uuid1", "vdi_uuid2"]
session.VDI.get_by_uuid.return_value = "vdi_ref"
vm_utils.remove_old_snapshots(session, instance, "vm_ref")
self.assertTrue(mock_scan.called)
session.VDI.get_by_uuid.assert_called_once_with("vdi_uuid1")
mock_destroy.assert_called_once_with(session, ["vdi_ref"])
mock_scan.assert_called_once_with(session)
class ResizeFunctionTestCase(test.NoDBTestCase):
def _call_get_resize_func_name(self, brand, version):
session = mock.Mock()

View File

@ -933,6 +933,18 @@ class ResizeVdisTestCase(VMOpsTestBase):
None, 5, 1000)
@mock.patch.object(vm_utils, 'remove_old_snapshots')
class CleanupFailedSnapshotTestCase(VMOpsTestBase):
def test_post_interrupted_snapshot_cleanup(self, mock_remove):
self.vmops._get_vm_opaque_ref = mock.Mock()
self.vmops._get_vm_opaque_ref.return_value = "vm_ref"
self.vmops.post_interrupted_snapshot_cleanup("context", "instance")
mock_remove.assert_called_once_with(self.vmops._session,
"instance", "vm_ref")
class LiveMigrateHelperTestCase(VMOpsTestBase):
def test_connect_block_device_volumes_none(self):
self.assertEqual({}, self.vmops.connect_block_device_volumes(None))

View File

@ -486,6 +486,14 @@ class ComputeDriver(object):
"""
raise NotImplementedError()
def post_interrupted_snapshot_cleanup(self, context, instance):
"""Cleans up any resources left after an interrupted snapshot.
:param context: security context
:param instance: nova.objects.instance.Instance
"""
pass
def finish_migration(self, context, migration, instance, disk_info,
network_info, image_meta, resize_instance,
block_device_info=None, power_on=True):

View File

@ -228,6 +228,10 @@ class XenAPIDriver(driver.ComputeDriver):
"""Create snapshot from a running VM instance."""
self._vmops.snapshot(context, instance, image_id, update_task_state)
def post_interrupted_snapshot_cleanup(self, context, instance):
"""Cleans up any resources left after a failed snapshot."""
self._vmops.post_interrupted_snapshot_cleanup(context, instance)
def reboot(self, context, instance, network_info, reboot_type,
block_device_info=None, bad_volumes_callback=None):
"""Reboot VM instance."""

View File

@ -728,6 +728,40 @@ def strip_base_mirror_from_vdis(session, vm_ref):
_try_strip_base_mirror_from_vdi(session, vdi_ref)
def _get_snapshots_for_vm(session, instance, vm_ref):
sr_ref = safe_find_sr(session)
vm_vdi_ref, vm_vdi_rec = get_vdi_for_vm_safely(session, vm_ref)
parent_uuid = _get_vhd_parent_uuid(session, vm_vdi_ref)
if not parent_uuid:
return []
return _child_vhds(session, sr_ref, parent_uuid, old_snapshots_only=True)
def remove_old_snapshots(session, instance, vm_ref):
"""See if there is an snapshot present that should be removed."""
LOG.debug("Starting remove_old_snapshots for VM", instance=instance)
snapshot_uuids = _get_snapshots_for_vm(session, instance, vm_ref)
number_of_snapshots = len(snapshot_uuids)
if number_of_snapshots <= 0:
LOG.debug("No snapshots to remove.", instance=instance)
return
if number_of_snapshots > 1:
LOG.debug("More snapshots than expected, only deleting one.",
instance=instance)
vdi_uuid = snapshot_uuids[0]
vdi_ref = session.VDI.get_by_uuid(vdi_uuid)
safe_destroy_vdis(session, [vdi_ref])
scan_default_sr(session)
# TODO(johnthetubaguy): we could look for older snapshots too
LOG.debug("Removed one old snapshot.", instance=instance)
@contextlib.contextmanager
def snapshot_attached_here(session, instance, vm_ref, label, userdevice='0',
post_snapshot_callback=None):
@ -2048,7 +2082,14 @@ def _walk_vdi_chain(session, vdi_uuid):
vdi_uuid = parent_uuid
def _child_vhds(session, sr_ref, vdi_uuid):
def _is_vdi_a_snapshot(vdi_rec):
"""Ensure VDI is a snapshot, and not cached image."""
is_a_snapshot = vdi_rec['is_a_snapshot']
image_id = vdi_rec['other_config'].get('image-id')
return is_a_snapshot and not image_id
def _child_vhds(session, sr_ref, vdi_uuid, old_snapshots_only=False):
"""Return the immediate children of a given VHD.
This is not recursive, only the immediate children are returned.
@ -2064,9 +2105,12 @@ def _child_vhds(session, sr_ref, vdi_uuid):
if parent_uuid != vdi_uuid:
continue
if old_snapshots_only and not _is_vdi_a_snapshot(rec):
continue
children.add(rec_uuid)
return children
return list(children)
def _count_parents_children(session, vdi_ref, sr_ref):

View File

@ -752,6 +752,11 @@ class VMOps(object):
LOG.debug("Finished snapshot and upload for VM",
instance=instance)
def post_interrupted_snapshot_cleanup(self, context, instance):
"""Cleans up any resources left after a failed snapshot."""
vm_ref = self._get_vm_opaque_ref(instance)
vm_utils.remove_old_snapshots(self._session, instance, vm_ref)
def _get_orig_vm_name_label(self, instance):
return instance['name'] + '-orig'