From 1a320f2a0e0918de6afcce5cf23b7de178ec3a49 Mon Sep 17 00:00:00 2001
From: Matthew Booth
Date: Wed, 15 Aug 2018 14:39:48 +0100
Subject: [PATCH] libvirt: Don't delete disks on shared storage during
 evacuate

When evacuating an instance between compute hosts on shared storage,
the rebuild operation calls spawn() on the destination compute.
spawn() currently assumes that it should clean up all resources on
failure, which results in user data being deleted in the evacuate
case.

This change modifies spawn() in the libvirt driver such that it only
cleans up resources it created.

Conflicts:
    nova/tests/unit/virt/libvirt/test_driver.py

NOTE(lyarwood): Conflicts due to I08cd4787ab4c9539574237e26ba5bf6d4246b32e
and I5a9e5532c46a5f7064441ae644125d21efe5fda1 not being present in
stable/train.

Co-Authored-By: Lee Yarwood
Closes-Bug: #1550919
Change-Id: I764481966c96a67d993da6e902dc9fc3ad29ee36
(cherry picked from commit 497360b0ea970f1e68912be8229ef8c3f5454e9e)
(cherry picked from commit 8b48ca672d9c0eb108c71b7f9f3f089d9ecf688a)
---
 nova/tests/functional/libvirt/base.py         |   3 +-
 .../tests/functional/libvirt/test_evacuate.py |  17 +-
 .../regressions/test_bug_1595962.py           |   3 +-
 nova/tests/unit/virt/libvirt/test_driver.py   |  66 ++++---
 nova/virt/libvirt/driver.py                   | 185 +++++++++++++-----
 5 files changed, 184 insertions(+), 90 deletions(-)
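NOTE(editor, not part of the diff): the heart of this change is that
cleanup decisions now flow from what spawn() actually created, rather
than from a single destroy_disks flag. Below is a standalone sketch of
the decision logic the new cleanup() wrapper implements (see the
driver.py hunks later in this patch). It is a paraphrase, not the real
method: migrate_data is modelled as a plain dict, and the
rbd_dir_workaround/images_type parameters stand in for the
CONF.workarounds.ensure_libvirt_rbd_instance_dir_cleanup and
CONF.libvirt.images_type options.

    def decide_cleanup(destroy_disks, migrate_data=None,
                       rbd_dir_workaround=False, images_type='qcow2'):
        """Return (cleanup_instance_dir, cleanup_instance_disks)."""
        if destroy_disks:
            # Destroying disks implies removing the instance directory too.
            return (True, True)
        cleanup_instance_dir = False
        if migrate_data and 'is_shared_block_storage' in migrate_data:
            # Shared *block* storage: the instance directory itself is not
            # shared, so it still has to be removed.
            cleanup_instance_dir = migrate_data['is_shared_block_storage']
        if not cleanup_instance_dir and rbd_dir_workaround:
            # Opt-in workaround: remove non-shared instance directories
            # after evacuate/revert-resize when all disks live in RBD.
            cleanup_instance_dir = (images_type == 'rbd')
        return (cleanup_instance_dir, False)

    assert decide_cleanup(True) == (True, True)
    assert decide_cleanup(False) == (False, False)
    assert decide_cleanup(False, {'is_shared_block_storage': True}) == (True, False)
    assert decide_cleanup(False, rbd_dir_workaround=True,
                          images_type='rbd') == (True, False)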
diff --git a/nova/tests/functional/libvirt/base.py b/nova/tests/functional/libvirt/base.py
index 2d3a74d1f181..25f6413a22b6 100644
--- a/nova/tests/functional/libvirt/base.py
+++ b/nova/tests/functional/libvirt/base.py
@@ -51,7 +51,8 @@ class ServersTestBase(base.ServersTestBase):
         self.useFixture(fakelibvirt.FakeLibvirtFixture())
 
         self.useFixture(fixtures.MockPatch(
-            'nova.virt.libvirt.LibvirtDriver._create_image'))
+            'nova.virt.libvirt.LibvirtDriver._create_image',
+            return_value=(False, False)))
         self.useFixture(fixtures.MockPatch(
             'nova.virt.libvirt.LibvirtDriver._get_local_gb_info',
             return_value={'total': 128, 'used': 44, 'free': 84}))
diff --git a/nova/tests/functional/libvirt/test_evacuate.py b/nova/tests/functional/libvirt/test_evacuate.py
index 605c0792f8ed..530f9f542de9 100644
--- a/nova/tests/functional/libvirt/test_evacuate.py
+++ b/nova/tests/functional/libvirt/test_evacuate.py
@@ -141,10 +141,9 @@ class _FileTest(object):
         source_root_disk = os.path.join(
             self.source_instance_path(server), disk)
 
-        # FIXME(mdbooth): We should not have deleted a shared disk
-        self.assertFalse(os.path.exists(source_root_disk),
-                         "Source root disk %s for server %s exists" %
-                         (source_root_disk, name))
+        self.assertTrue(os.path.exists(source_root_disk),
+                        "Source root disk %s for server %s does not exist" %
+                        (source_root_disk, name))
 
 
 class _FlatTest(_FileTest):
@@ -235,8 +234,7 @@ class _RbdTest(object):
         # Check that we created a root disk and haven't called _cleanup_rbd at
         # all
         self.assertIn("%s_%s" % (server['id'], disk), self.created)
-        # FIXME(mdbooth): we should not have deleted shared disks
-        self.assertGreater(self.mock_rbd_driver.cleanup_volumes.call_count, 0)
+        self.mock_rbd_driver.cleanup_volumes.assert_not_called()
 
     # We never want to cleanup rbd disks during evacuate, regardless of
     # instance shared storage
@@ -633,10 +631,9 @@ class _LibvirtEvacuateTest(integrated_helpers.InstanceHelperMixin):
             server = self._evacuate_with_failure(server, compute1)
 
         # Check that the instance directory still exists
-        # FIXME(mdbooth): the shared instance directory should still exist
-        self.assertFalse(os.path.exists(shared_instance_path),
-                         "Shared instance directory %s for server %s "
-                         "exists" % (shared_instance_path, name))
+        self.assertTrue(os.path.exists(shared_instance_path),
+                        "Shared instance directory %s for server %s "
+                        "does not exist" % (shared_instance_path, name))
 
         self.assert_disks_shared_instancedir(server)
 
diff --git a/nova/tests/functional/regressions/test_bug_1595962.py b/nova/tests/functional/regressions/test_bug_1595962.py
index daf13871b4f0..0180daee3e74 100644
--- a/nova/tests/functional/regressions/test_bug_1595962.py
+++ b/nova/tests/functional/regressions/test_bug_1595962.py
@@ -81,7 +81,8 @@ class TestSerialConsoleLiveMigrate(test.TestCase):
     @mock.patch('os.path.getsize', return_value=1024)
    @mock.patch('nova.conductor.tasks.live_migrate.LiveMigrationTask.'
                '_check_destination_is_not_source', return_value=False)
-    @mock.patch('nova.virt.libvirt.LibvirtDriver._create_image')
+    @mock.patch('nova.virt.libvirt.LibvirtDriver._create_image',
+                return_value=(False, False))
     @mock.patch('nova.virt.libvirt.LibvirtDriver._get_local_gb_info',
                 return_value={'total': 128,
                               'used': 44,
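NOTE(editor, not part of the diff): the fixture and decorator changes
above follow from _create_image's new return contract. spawn() now
unpacks a 2-tuple, (created_instance_dir, created_disks), so any test
double for _create_image must return one. A hedged sketch of the calling
convention (spawn_fragment is a hypothetical condensation of the real
spawn(), which takes many more arguments):

    from unittest import mock

    def spawn_fragment(driver, context, instance, disk_info):
        # spawn() records what _create_image actually created, so that a
        # later failure only rolls back those resources.
        created_instance_dir, created_disks = driver._create_image(
            context, instance, disk_info['mapping'])
        return created_instance_dir, created_disks

    driver = mock.Mock()
    # (False, False): pretend everything already existed, e.g. an
    # evacuation onto shared storage.
    driver._create_image.return_value = (False, False)
    assert spawn_fragment(driver, mock.sentinel.ctxt, mock.sentinel.inst,
                          {'mapping': {}}) == (False, False)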
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 116067106445..595fdc152602 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -6180,7 +6180,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
                               return_value=False),
             mock.patch.object(drvr, 'plug_vifs'),
             mock.patch.object(drvr, 'firewall_driver'),
-            mock.patch.object(drvr, 'cleanup')):
+            mock.patch.object(drvr, '_cleanup')):
             self.assertRaises(ValueError,
                               drvr._create_domain_and_network,
                               self.context,
@@ -13997,7 +13997,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
                 context, xml, instance, network_info,
                 block_device_info=None, power_on=True,
                 vifs_already_plugged=False, post_xml_callback=None,
-                destroy_disks_on_failure=False):
+                cleanup_instance_dir=False, cleanup_instance_disks=False):
             # The config disk should be created by this callback, so we need
             # to execute it.
             post_xml_callback()
@@ -18352,7 +18352,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
                 mock.patch.object(drvr, 'firewall_driver'),
                 mock.patch.object(drvr, '_create_domain',
                                   side_effect=exception.NovaException),
-                mock.patch.object(drvr, 'cleanup')):
+                mock.patch.object(drvr, '_cleanup')):
             self.assertRaises(exception.NovaException,
                               drvr._create_domain_and_network,
                               self.context,
@@ -18419,7 +18419,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         @mock.patch.object(drvr, 'plug_vifs')
         @mock.patch.object(drvr, 'firewall_driver')
         @mock.patch.object(drvr, '_create_domain')
-        @mock.patch.object(drvr, 'cleanup')
+        @mock.patch.object(drvr, '_cleanup')
         def test_create(cleanup, create, fw_driver, plug_vifs):
             domain = drvr._create_domain_and_network(self.context, 'xml',
                                                      instance, vifs,
@@ -18525,34 +18525,31 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         def the_test(mock_cleanup, mock_create, mock_fw, mock_plug):
             instance = objects.Instance(**self.test_instance)
             mock_create.side_effect = test.TestingException
-            self.assertRaises(test.TestingException,
-                              drvr._create_domain_and_network,
-                              self.context, 'xml', instance, [], None)
-            mock_cleanup.assert_called_once_with(self.context, instance,
-                                                 [], None, None, False)
-            # destroy_disks_on_failure=True, used only by spawn()
-            mock_cleanup.reset_mock()
-            self.assertRaises(test.TestingException,
-                              drvr._create_domain_and_network,
-                              self.context, 'xml', instance, [], None,
-                              destroy_disks_on_failure=True)
-            mock_cleanup.assert_called_once_with(self.context, instance,
-                                                 [], None, None, True)
-
+            self.assertRaises(
+                test.TestingException, drvr._create_domain_and_network,
+                self.context, 'xml', instance, [], None,
+                cleanup_instance_dir=mock.sentinel.cleanup_instance_dir,
+                cleanup_instance_disks=mock.sentinel.cleanup_instance_disks)
+            mock_cleanup.assert_called_once_with(
+                self.context, instance, [], None, None,
+                cleanup_instance_dir=mock.sentinel.cleanup_instance_dir,
+                cleanup_instance_disks=mock.sentinel.cleanup_instance_disks)
 
         the_test()
 
     def test_cleanup_failed_start_no_guest(self):
         drvr = libvirt_driver.LibvirtDriver(mock.MagicMock(), False)
-        with mock.patch.object(drvr, 'cleanup') as mock_cleanup:
-            drvr._cleanup_failed_start(None, None, None, None, None, False)
+        with mock.patch.object(drvr, '_cleanup') as mock_cleanup:
+            drvr._cleanup_failed_start(
+                None, None, None, None, None, False, False)
             self.assertTrue(mock_cleanup.called)
 
     def test_cleanup_failed_start_inactive_guest(self):
         drvr = libvirt_driver.LibvirtDriver(mock.MagicMock(), False)
         guest = mock.MagicMock()
         guest.is_active.return_value = False
-        with mock.patch.object(drvr, 'cleanup') as mock_cleanup:
-            drvr._cleanup_failed_start(None, None, None, None, guest, False)
+        with mock.patch.object(drvr, '_cleanup') as mock_cleanup:
+            drvr._cleanup_failed_start(
+                None, None, None, None, guest, False, False)
             self.assertTrue(mock_cleanup.called)
             self.assertFalse(guest.poweroff.called)
 
@@ -18560,8 +18557,9 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         drvr = libvirt_driver.LibvirtDriver(mock.MagicMock(), False)
         guest = mock.MagicMock()
         guest.is_active.return_value = True
-        with mock.patch.object(drvr, 'cleanup') as mock_cleanup:
-            drvr._cleanup_failed_start(None, None, None, None, guest, False)
+        with mock.patch.object(drvr, '_cleanup') as mock_cleanup:
+            drvr._cleanup_failed_start(
+                None, None, None, None, guest, False, False)
             self.assertTrue(mock_cleanup.called)
             self.assertTrue(guest.poweroff.called)
 
@@ -18570,10 +18568,10 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         drvr = libvirt_driver.LibvirtDriver(mock.MagicMock(), False)
         guest = mock.MagicMock()
         guest.is_active.return_value = True
         guest.poweroff.side_effect = test.TestingException
-        with mock.patch.object(drvr, 'cleanup') as mock_cleanup:
+        with mock.patch.object(drvr, '_cleanup') as mock_cleanup:
             self.assertRaises(test.TestingException,
                               drvr._cleanup_failed_start,
-                              None, None, None, None, guest, False)
+                              None, None, None, None, guest, False, False)
             self.assertTrue(mock_cleanup.called)
             self.assertTrue(guest.poweroff.called)
 
@@ -18582,12 +18580,16 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         guest = mock.MagicMock()
         guest.is_active.return_value = True
         guest.poweroff.side_effect = test.TestingException
-        with mock.patch.object(drvr, 'cleanup') as mock_cleanup:
-            self.assertRaises(test.TestingException,
-                              drvr._cleanup_failed_start,
-                              None, None, None, None, guest, True)
-            mock_cleanup.assert_called_once_with(None, None, network_info=None,
-                    block_device_info=None, destroy_disks=True)
+        with mock.patch.object(drvr, '_cleanup') as mock_cleanup:
+            self.assertRaises(
+                test.TestingException, drvr._cleanup_failed_start,
+                None, None, None, None, guest,
+                cleanup_instance_dir=mock.sentinel.cleanup_instance_dir,
+                cleanup_instance_disks=mock.sentinel.cleanup_instance_disks)
+            mock_cleanup.assert_called_once_with(
+                None, None, None, block_device_info=None, destroy_vifs=True,
+                cleanup_instance_dir=mock.sentinel.cleanup_instance_dir,
+                cleanup_instance_disks=mock.sentinel.cleanup_instance_disks)
             self.assertTrue(guest.poweroff.called)
 
     @mock.patch('os_brick.encryptors.get_encryption_metadata')
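NOTE(editor, not part of the diff): several of the updated tests above
use mock.sentinel to assert that the new kwargs are threaded through
verbatim, instead of asserting on literal True/False values. A minimal
standalone illustration of that pattern (Thing/outer/inner are
hypothetical names, not part of nova):

    from unittest import mock

    class Thing(object):
        def outer(self, flag=False):
            # outer() must forward flag to inner() untouched.
            self.inner(flag=flag)

        def inner(self, flag=False):
            pass

    thing = Thing()
    with mock.patch.object(thing, 'inner') as mock_inner:
        thing.outer(flag=mock.sentinel.flag)
        # Passes only if the exact sentinel object was forwarded, so any
        # accidental transformation of the flag fails the test.
        mock_inner.assert_called_once_with(flag=mock.sentinel.flag)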
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 727260b3d638..d38ce62d1ee1 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -1335,6 +1335,71 @@ class LibvirtDriver(driver.ComputeDriver):
     def cleanup(self, context, instance, network_info, block_device_info=None,
                 destroy_disks=True, migrate_data=None, destroy_vifs=True):
+        """Cleanup the instance from the host.
+
+        Identify if the instance disks and instance path should be removed
+        from the host before calling down into the _cleanup method for the
+        actual removal of resources from the host.
+
+        :param context: security context
+        :param instance: instance object for the instance being cleaned up
+        :param network_info: instance network information
+        :param block_device_info: optional instance block device information
+        :param destroy_disks: if local ephemeral disks should be destroyed
+        :param migrate_data: optional migrate_data object
+        :param destroy_vifs: if plugged vifs should be unplugged
+        """
+        cleanup_instance_dir = False
+        cleanup_instance_disks = False
+        # We assume destroy_disks means destroy instance directory and disks
+        if destroy_disks:
+            cleanup_instance_dir = True
+            cleanup_instance_disks = True
+        else:
+            # NOTE(mdbooth): I think the theory here was that if this is a
+            # migration with shared block storage then we need to delete the
+            # instance directory because that's not shared. I'm pretty sure
+            # this is wrong.
+            if migrate_data and 'is_shared_block_storage' in migrate_data:
+                cleanup_instance_dir = migrate_data.is_shared_block_storage
+
+            # NOTE(lyarwood): The following workaround allows operators to
+            # ensure that non-shared instance directories are removed after an
+            # evacuation or revert resize when using the shared RBD
+            # imagebackend. This workaround is not required when cleaning up
+            # migrations that provide migrate_data to this method as the
+            # existing is_shared_block_storage conditional will cause the
+            # instance directory to be removed.
+            if not cleanup_instance_dir:
+                if CONF.workarounds.ensure_libvirt_rbd_instance_dir_cleanup:
+                    cleanup_instance_dir = CONF.libvirt.images_type == 'rbd'
+
+        return self._cleanup(
+            context, instance, network_info,
+            block_device_info=block_device_info,
+            destroy_vifs=destroy_vifs,
+            cleanup_instance_dir=cleanup_instance_dir,
+            cleanup_instance_disks=cleanup_instance_disks)
+
+    def _cleanup(self, context, instance, network_info, block_device_info=None,
+                 destroy_vifs=True, cleanup_instance_dir=False,
+                 cleanup_instance_disks=False):
+        """Cleanup the domain and any attached resources from the host.
+
+        This method cleans up any pmem devices, unplugs VIFs, disconnects
+        attached volumes and undefines the instance domain within libvirt.
+        It also optionally removes the ephemeral disks and the instance
+        directory from the host depending on the cleanup_instance_dir|disks
+        kwargs provided.
+
+        :param context: security context
+        :param instance: instance object for the instance being cleaned up
+        :param network_info: instance network information
+        :param block_device_info: optional instance block device information
+        :param destroy_vifs: if plugged vifs should be unplugged
+        :param cleanup_instance_dir: If the instance dir should be removed
+        :param cleanup_instance_disks: If the instance disks should be removed
+        """
         # zero the data on backend pmem device
         vpmems = self._get_vpmems(instance)
         if vpmems:
@@ -1389,7 +1454,7 @@ class LibvirtDriver(driver.ComputeDriver):
                     self._disconnect_volume(context, connection_info, instance)
             except Exception as exc:
                 with excutils.save_and_reraise_exception() as ctxt:
-                    if destroy_disks:
+                    if cleanup_instance_disks:
                         # Don't block on Volume errors if we're trying to
                         # delete the instance as we may be partially created
                         # or deleted
@@ -1401,26 +1466,14 @@ class LibvirtDriver(driver.ComputeDriver):
                                   'exc': encodeutils.exception_to_unicode(exc)},
                                  instance=instance)
 
-        if destroy_disks:
+        if cleanup_instance_disks:
             # NOTE(haomai): destroy volumes if needed
             if CONF.libvirt.images_type == 'lvm':
                 self._cleanup_lvm(instance, block_device_info)
             if CONF.libvirt.images_type == 'rbd':
                 self._cleanup_rbd(instance)
 
-        is_shared_block_storage = False
-        if migrate_data and 'is_shared_block_storage' in migrate_data:
-            is_shared_block_storage = migrate_data.is_shared_block_storage
-        # NOTE(lyarwood): The following workaround allows operators to ensure
-        # that non-shared instance directories are removed after an evacuation
-        # or revert resize when using the shared RBD imagebackend. This
-        # workaround is not required when cleaning up migrations that provide
-        # migrate_data to this method as the existing is_shared_block_storage
-        # conditional will cause the instance directory to be removed.
-        if ((destroy_disks or is_shared_block_storage) or
-                (CONF.workarounds.ensure_libvirt_rbd_instance_dir_cleanup and
-                 CONF.libvirt.images_type == 'rbd')):
-
+        if cleanup_instance_dir:
             attempts = int(instance.system_metadata.get('clean_attempts',
                                                         '0'))
             success = self.delete_instance_files(instance)
@@ -3453,9 +3506,10 @@ class LibvirtDriver(driver.ComputeDriver):
             gen_confdrive = functools.partial(self._create_configdrive,
                                               context, instance,
                                               injection_info)
-        self._create_image(context, instance, disk_info['mapping'],
-                           injection_info=injection_info,
-                           block_device_info=block_device_info)
+        created_instance_dir, created_disks = self._create_image(
+            context, instance, disk_info['mapping'],
+            injection_info=injection_info,
+            block_device_info=block_device_info)
 
         # Required by Quobyte CI
         self._ensure_console_log_for_instance(instance)
@@ -3471,8 +3525,9 @@ class LibvirtDriver(driver.ComputeDriver):
             context, xml, instance, network_info,
             block_device_info=block_device_info,
             post_xml_callback=gen_confdrive,
-            destroy_disks_on_failure=True,
-            power_on=power_on)
+            power_on=power_on,
+            cleanup_instance_dir=created_instance_dir,
+            cleanup_instance_disks=created_disks)
         LOG.debug("Guest created on hypervisor", instance=instance)
 
         def _wait_for_boot():
@@ -3763,8 +3818,17 @@ class LibvirtDriver(driver.ComputeDriver):
         def raw(fname):
             return image(fname, image_type='raw')
 
+        created_instance_dir = True
+
         # ensure directories exist and are writable
-        fileutils.ensure_tree(libvirt_utils.get_instance_path(instance))
+        instance_dir = libvirt_utils.get_instance_path(instance)
+        if os.path.exists(instance_dir):
+            LOG.debug("Instance directory exists: not creating",
+                      instance=instance)
+            created_instance_dir = False
+        else:
+            LOG.debug("Creating instance directory", instance=instance)
+            fileutils.ensure_tree(libvirt_utils.get_instance_path(instance))
 
         LOG.info('Creating image', instance=instance)
 
@@ -3806,6 +3870,10 @@ class LibvirtDriver(driver.ComputeDriver):
                        'kernel_id': instance.kernel_id,
                        'ramdisk_id': instance.ramdisk_id}
 
+        # NOTE(mdbooth): kernel and ramdisk, if they are defined, are hardcoded
+        # to use raw, which means they will always be cleaned up with the
+        # instance directory. We must not consider them for created_disks,
+        # which may not be using the instance directory.
         if disk_images['kernel_id']:
             fname = imagecache.get_cache_fname(disk_images['kernel_id'])
             raw('kernel').cache(fetch_func=libvirt_utils.fetch_raw_image,
@@ -3825,10 +3893,9 @@ class LibvirtDriver(driver.ComputeDriver):
                 uid = pwd.getpwnam('root').pw_uid
                 nova.privsep.path.chown(image('disk').path, uid=uid)
 
-        self._create_and_inject_local_root(context, instance,
-                                           booted_from_volume, suffix,
-                                           disk_images, injection_info,
-                                           fallback_from_host)
+        created_disks = self._create_and_inject_local_root(
+            context, instance, booted_from_volume, suffix, disk_images,
+            injection_info, fallback_from_host)
 
         # Lookup the filesystem type if required
         os_type_with_default = nova.privsep.fs.get_fs_type_for_os_type(
@@ -3842,6 +3909,9 @@ class LibvirtDriver(driver.ComputeDriver):
         ephemeral_gb = instance.flavor.ephemeral_gb
         if 'disk.local' in disk_mapping:
             disk_image = image('disk.local')
+            # Short circuit the exists() tests if we already created a disk
+            created_disks = created_disks or not disk_image.exists()
+
             fn = functools.partial(self._create_ephemeral,
                                    fs_label='ephemeral0',
                                    os_type=instance.os_type,
@@ -3858,6 +3928,8 @@ class LibvirtDriver(driver.ComputeDriver):
         for idx, eph in enumerate(driver.block_device_info_get_ephemerals(
                 block_device_info)):
             disk_image = image(blockinfo.get_eph_disk(idx))
+            # Short circuit the exists() tests if we already created a disk
+            created_disks = created_disks or not disk_image.exists()
 
             specified_fs = eph.get('guest_format')
             if specified_fs and not self.is_supported_fs_format(specified_fs):
@@ -3880,15 +3952,25 @@ class LibvirtDriver(driver.ComputeDriver):
         if swap_mb > 0:
             size = swap_mb * units.Mi
-            image('disk.swap').cache(fetch_func=self._create_swap,
-                                     context=context,
-                                     filename="swap_%s" % swap_mb,
-                                     size=size,
-                                     swap_mb=swap_mb)
+            swap = image('disk.swap')
+            # Short circuit the exists() tests if we already created a disk
+            created_disks = created_disks or not swap.exists()
+            swap.cache(fetch_func=self._create_swap, context=context,
+                       filename="swap_%s" % swap_mb,
+                       size=size, swap_mb=swap_mb)
+
+        if created_disks:
+            LOG.debug('Created local disks', instance=instance)
+        else:
+            LOG.debug('Did not create local disks', instance=instance)
+
+        return (created_instance_dir, created_disks)
 
     def _create_and_inject_local_root(self, context, instance,
                                       booted_from_volume, suffix, disk_images,
                                       injection_info, fallback_from_host):
+        created_disks = False
+
         # File injection only if needed
         need_inject = (not configdrive.required_by(instance) and
                        injection_info is not None and
@@ -3906,6 +3988,8 @@ class LibvirtDriver(driver.ComputeDriver):
             backend = self.image_backend.by_name(instance, 'disk' + suffix,
                                                  CONF.libvirt.images_type)
+            created_disks = not backend.exists()
+
             if instance.task_state == task_states.RESIZE_FINISH:
                 backend.create_snap(libvirt_utils.RESIZE_SNAPSHOT_NAME)
             if backend.SUPPORTS_CLONE:
@@ -3928,6 +4012,8 @@ class LibvirtDriver(driver.ComputeDriver):
                 LOG.warning('File injection into a boot from volume '
                             'instance is not supported', instance=instance)
 
+        return created_disks
+
     def _create_configdrive(self, context, instance, injection_info,
                             rescue=False):
         # As this method being called right after the definition of a
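NOTE(editor, not part of the diff): the repeated
"created_disks = created_disks or not <image>.exists()" lines above
implement a sticky flag: once any backing disk is found missing (and
will therefore be created), created_disks stays True, and "or"
short-circuits the remaining exists() probes. A standalone sketch of the
pattern, with a hypothetical Disk class standing in for the image
backend:

    class Disk(object):
        def __init__(self, present):
            self._present = present

        def exists(self):
            return self._present

    def will_create_any(disks):
        created = False
        for disk in disks:
            # "or" short-circuits: once created is True, exists() is not
            # even called for the remaining disks.
            created = created or not disk.exists()
        return created

    assert will_create_any([Disk(True), Disk(True)]) is False
    assert will_create_any([Disk(True), Disk(False)]) is True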
@@ -6187,21 +6273,26 @@ class LibvirtDriver(driver.ComputeDriver):
                 for vif in network_info if vif.get('active', True) is False]
 
     def _cleanup_failed_start(self, context, instance, network_info,
-                              block_device_info, guest, destroy_disks):
+                              block_device_info, guest,
+                              cleanup_instance_dir=False,
+                              cleanup_instance_disks=False):
         try:
             if guest and guest.is_active():
                 guest.poweroff()
         finally:
-            self.cleanup(context, instance, network_info=network_info,
-                         block_device_info=block_device_info,
-                         destroy_disks=destroy_disks)
+            self._cleanup(context, instance, network_info,
+                          block_device_info=block_device_info,
+                          destroy_vifs=True,
+                          cleanup_instance_dir=cleanup_instance_dir,
+                          cleanup_instance_disks=cleanup_instance_disks)
 
     def _create_domain_and_network(self, context, xml, instance, network_info,
                                    block_device_info=None, power_on=True,
                                    vifs_already_plugged=False,
                                    post_xml_callback=None,
-                                   destroy_disks_on_failure=False,
-                                   external_events=None):
+                                   external_events=None,
+                                   cleanup_instance_dir=False,
+                                   cleanup_instance_disks=False):
         """Do required network setup and create domain."""
         timeout = CONF.vif_plugging_timeout
@@ -6236,9 +6327,10 @@ class LibvirtDriver(driver.ComputeDriver):
                 # Neutron reported failure and we didn't swallow it, so
                 # bail here
                 with excutils.save_and_reraise_exception():
-                    self._cleanup_failed_start(context, instance, network_info,
-                                               block_device_info, guest,
-                                               destroy_disks_on_failure)
+                    self._cleanup_failed_start(
+                        context, instance, network_info, block_device_info,
+                        guest, cleanup_instance_dir=cleanup_instance_dir,
+                        cleanup_instance_disks=cleanup_instance_disks)
             except eventlet.timeout.Timeout:
                 # We never heard from Neutron
                 LOG.warning('Timeout waiting for %(events)s for '
@@ -6249,18 +6341,19 @@ class LibvirtDriver(driver.ComputeDriver):
                             'task_state': instance.task_state},
                             instance=instance)
                 if CONF.vif_plugging_is_fatal:
-                    self._cleanup_failed_start(context, instance, network_info,
-                                               block_device_info, guest,
-                                               destroy_disks_on_failure)
+                    self._cleanup_failed_start(
+                        context, instance, network_info, block_device_info,
+                        guest, cleanup_instance_dir=cleanup_instance_dir,
+                        cleanup_instance_disks=cleanup_instance_disks)
                     raise exception.VirtualInterfaceCreateException()
         except Exception:
             # Any other error, be sure to clean up
             LOG.error('Failed to start libvirt guest', instance=instance)
             with excutils.save_and_reraise_exception():
-                self._cleanup_failed_start(context, instance, network_info,
-                                           block_device_info, guest,
-                                           destroy_disks_on_failure)
-
+                self._cleanup_failed_start(
+                    context, instance, network_info, block_device_info,
+                    guest, cleanup_instance_dir=cleanup_instance_dir,
+                    cleanup_instance_disks=cleanup_instance_disks)
         # Resume only if domain has been paused
         if pause:
             guest.resume()
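NOTE(editor, not part of the diff): putting the pieces together, the
failure path after this patch only rolls back what the current spawn()
call created. The sketch below is a hypothetical condensation of the
spawn() -> _create_domain_and_network() -> _cleanup_failed_start() flow;
the real methods take many more arguments and thread the flags through
the intermediate calls shown in the hunks above.

    def spawn_outline(driver, context, instance, disk_info):
        # Record what was actually created on this host.
        created_dir, created_disks = driver._create_image(
            context, instance, disk_info['mapping'])
        try:
            driver._create_domain(instance)
        except Exception:
            # Pre-existing (shared) directories and disks survive, because
            # the cleanup flags reflect only what this call created.
            driver._cleanup(context, instance, None,
                            cleanup_instance_dir=created_dir,
                            cleanup_instance_disks=created_disks)
            raise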