From 8b48ca672d9c0eb108c71b7f9f3f089d9ecf688a Mon Sep 17 00:00:00 2001 From: Matthew Booth Date: Wed, 15 Aug 2018 14:39:48 +0100 Subject: [PATCH] libvirt: Don't delete disks on shared storage during evacuate When evacuating an instance between compute hosts on shared storage, during the rebuild operation we call spawn() on the destination compute. spawn() currently assumes that it should cleanup all resources on failure, which results in user data being deleted in the evacuate case. This change modifies spawn in the libvirt driver such that it only cleans up resources it created. Co-Authored-By: Lee Yarwood Closes-Bug: #1550919 Change-Id: I764481966c96a67d993da6e902dc9fc3ad29ee36 (cherry picked from commit 497360b0ea970f1e68912be8229ef8c3f5454e9e) --- nova/tests/functional/libvirt/base.py | 3 +- .../tests/functional/libvirt/test_evacuate.py | 17 +- .../regressions/test_bug_1595962.py | 3 +- nova/tests/unit/virt/libvirt/test_driver.py | 68 +++---- nova/virt/libvirt/driver.py | 185 +++++++++++++----- 5 files changed, 185 insertions(+), 91 deletions(-) diff --git a/nova/tests/functional/libvirt/base.py b/nova/tests/functional/libvirt/base.py index 7b4435fdaa07..5ef3efb6ba55 100644 --- a/nova/tests/functional/libvirt/base.py +++ b/nova/tests/functional/libvirt/base.py @@ -51,7 +51,8 @@ class ServersTestBase(base.ServersTestBase): self.useFixture(fakelibvirt.FakeLibvirtFixture()) self.useFixture(fixtures.MockPatch( - 'nova.virt.libvirt.LibvirtDriver._create_image')) + 'nova.virt.libvirt.LibvirtDriver._create_image', + return_value=(False, False))) self.useFixture(fixtures.MockPatch( 'nova.virt.libvirt.LibvirtDriver._get_local_gb_info', return_value={'total': 128, 'used': 44, 'free': 84})) diff --git a/nova/tests/functional/libvirt/test_evacuate.py b/nova/tests/functional/libvirt/test_evacuate.py index 4dd173adb266..96fc8af3fbfb 100644 --- a/nova/tests/functional/libvirt/test_evacuate.py +++ b/nova/tests/functional/libvirt/test_evacuate.py @@ -141,10 +141,9 @@ class _FileTest(object): source_root_disk = os.path.join( self.source_instance_path(server), disk) - # FIXME(mdbooth): We should not have deleted a shared disk - self.assertFalse(os.path.exists(source_root_disk), - "Source root disk %s for server %s exists" % - (source_root_disk, name)) + self.assertTrue(os.path.exists(source_root_disk), + "Source root disk %s for server %s does not exist" % + (source_root_disk, name)) class _FlatTest(_FileTest): @@ -235,8 +234,7 @@ class _RbdTest(object): # Check that we created a root disk and haven't called _cleanup_rbd at # all self.assertIn("%s_%s" % (server['id'], disk), self.created) - # FIXME(mdbooth): we should not have deleted shared disks - self.assertGreater(self.mock_rbd_driver.cleanup_volumes.call_count, 0) + self.mock_rbd_driver.cleanup_volumes.assert_not_called() # We never want to cleanup rbd disks during evacuate, regardless of # instance shared storage @@ -633,10 +631,9 @@ class _LibvirtEvacuateTest(integrated_helpers.InstanceHelperMixin): server = self._evacuate_with_failure(server, compute1) # Check that the instance directory still exists - # FIXME(mdbooth): the shared instance directory should still exist - self.assertFalse(os.path.exists(shared_instance_path), - "Shared instance directory %s for server %s " - "exists" % (shared_instance_path, name)) + self.assertTrue(os.path.exists(shared_instance_path), + "Shared instance directory %s for server %s " + "does not exist" % (shared_instance_path, name)) self.assert_disks_shared_instancedir(server) diff --git a/nova/tests/functional/regressions/test_bug_1595962.py b/nova/tests/functional/regressions/test_bug_1595962.py index 2695870a442d..f124996544bb 100644 --- a/nova/tests/functional/regressions/test_bug_1595962.py +++ b/nova/tests/functional/regressions/test_bug_1595962.py @@ -82,7 +82,8 @@ class TestSerialConsoleLiveMigrate(test.TestCase): @mock.patch('os.path.getsize', return_value=1024) @mock.patch('nova.conductor.tasks.live_migrate.LiveMigrationTask.' '_check_destination_is_not_source', return_value=False) - @mock.patch('nova.virt.libvirt.LibvirtDriver._create_image') + @mock.patch('nova.virt.libvirt.LibvirtDriver._create_image', + return_value=(False, False)) @mock.patch('nova.virt.libvirt.LibvirtDriver._get_local_gb_info', return_value={'total': 128, 'used': 44, diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index aabe92931349..7b7ca6dd71f0 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -6259,7 +6259,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock.patch.object(drvr, '_is_booted_from_volume', return_value=False), mock.patch.object(drvr, 'plug_vifs'), - mock.patch.object(drvr, 'cleanup')): + mock.patch.object(drvr, '_cleanup')): self.assertRaises(ValueError, drvr._create_domain_and_network, self.context, @@ -14239,7 +14239,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, def _test_spawn_accels(self, accel_info, mock_get_disk_info): mock_get_disk_info.return_value = {'mapping': None} self.stub_out('nova.virt.libvirt.driver.LibvirtDriver.' - '_create_image', lambda *a, **kw: None) + '_create_image', lambda *a, **kw: (False, False)) self.stub_out('nova.virt.libvirt.driver.LibvirtDriver.' '_ensure_console_log_for_instance', lambda *a, **kw: None) @@ -14305,7 +14305,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, context, xml, instance, network_info, block_device_info=None, power_on=True, vifs_already_plugged=False, post_xml_callback=None, - destroy_disks_on_failure=False): + cleanup_instance_dir=False, cleanup_instance_disks=False): # The config disk should be created by this callback, so we need # to execute it. post_xml_callback() @@ -18646,7 +18646,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock.patch.object(drvr, 'plug_vifs'), mock.patch.object(drvr, '_create_domain', side_effect=exception.NovaException), - mock.patch.object(drvr, 'cleanup')): + mock.patch.object(drvr, '_cleanup')): self.assertRaises(exception.NovaException, drvr._create_domain_and_network, self.context, @@ -18711,7 +18711,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, @mock.patch.object(drvr, 'plug_vifs') @mock.patch.object(drvr, '_create_domain') - @mock.patch.object(drvr, 'cleanup') + @mock.patch.object(drvr, '_cleanup') def test_create(cleanup, create, plug_vifs): domain = drvr._create_domain_and_network(self.context, 'xml', instance, vifs, @@ -18793,34 +18793,31 @@ class LibvirtConnTestCase(test.NoDBTestCase, def the_test(mock_cleanup, mock_create, mock_plug): instance = objects.Instance(**self.test_instance) mock_create.side_effect = test.TestingException - self.assertRaises(test.TestingException, - drvr._create_domain_and_network, - self.context, 'xml', instance, [], None) - mock_cleanup.assert_called_once_with(self.context, instance, - [], None, None, False) - # destroy_disks_on_failure=True, used only by spawn() - mock_cleanup.reset_mock() - self.assertRaises(test.TestingException, - drvr._create_domain_and_network, - self.context, 'xml', instance, [], None, - destroy_disks_on_failure=True) - mock_cleanup.assert_called_once_with(self.context, instance, - [], None, None, True) - + self.assertRaises( + test.TestingException, drvr._create_domain_and_network, + self.context, 'xml', instance, [], None, + cleanup_instance_dir=mock.sentinel.cleanup_instance_dir, + cleanup_instance_disks=mock.sentinel.cleanup_instance_disks) + mock_cleanup.assert_called_once_with( + self.context, instance, [], None, None, + cleanup_instance_dir=mock.sentinel.cleanup_instance_dir, + cleanup_instance_disks=mock.sentinel.cleanup_instance_disks) the_test() def test_cleanup_failed_start_no_guest(self): drvr = libvirt_driver.LibvirtDriver(mock.MagicMock(), False) - with mock.patch.object(drvr, 'cleanup') as mock_cleanup: - drvr._cleanup_failed_start(None, None, None, None, None, False) + with mock.patch.object(drvr, '_cleanup') as mock_cleanup: + drvr._cleanup_failed_start( + None, None, None, None, None, False, False) self.assertTrue(mock_cleanup.called) def test_cleanup_failed_start_inactive_guest(self): drvr = libvirt_driver.LibvirtDriver(mock.MagicMock(), False) guest = mock.MagicMock() guest.is_active.return_value = False - with mock.patch.object(drvr, 'cleanup') as mock_cleanup: - drvr._cleanup_failed_start(None, None, None, None, guest, False) + with mock.patch.object(drvr, '_cleanup') as mock_cleanup: + drvr._cleanup_failed_start( + None, None, None, None, guest, False, False) self.assertTrue(mock_cleanup.called) self.assertFalse(guest.poweroff.called) @@ -18828,8 +18825,9 @@ class LibvirtConnTestCase(test.NoDBTestCase, drvr = libvirt_driver.LibvirtDriver(mock.MagicMock(), False) guest = mock.MagicMock() guest.is_active.return_value = True - with mock.patch.object(drvr, 'cleanup') as mock_cleanup: - drvr._cleanup_failed_start(None, None, None, None, guest, False) + with mock.patch.object(drvr, '_cleanup') as mock_cleanup: + drvr._cleanup_failed_start( + None, None, None, None, guest, False, False) self.assertTrue(mock_cleanup.called) self.assertTrue(guest.poweroff.called) @@ -18838,10 +18836,10 @@ class LibvirtConnTestCase(test.NoDBTestCase, guest = mock.MagicMock() guest.is_active.return_value = True guest.poweroff.side_effect = test.TestingException - with mock.patch.object(drvr, 'cleanup') as mock_cleanup: + with mock.patch.object(drvr, '_cleanup') as mock_cleanup: self.assertRaises(test.TestingException, drvr._cleanup_failed_start, - None, None, None, None, guest, False) + None, None, None, None, guest, False, False) self.assertTrue(mock_cleanup.called) self.assertTrue(guest.poweroff.called) @@ -18850,12 +18848,16 @@ class LibvirtConnTestCase(test.NoDBTestCase, guest = mock.MagicMock() guest.is_active.return_value = True guest.poweroff.side_effect = test.TestingException - with mock.patch.object(drvr, 'cleanup') as mock_cleanup: - self.assertRaises(test.TestingException, - drvr._cleanup_failed_start, - None, None, None, None, guest, True) - mock_cleanup.assert_called_once_with(None, None, network_info=None, - block_device_info=None, destroy_disks=True) + with mock.patch.object(drvr, '_cleanup') as mock_cleanup: + self.assertRaises( + test.TestingException, drvr._cleanup_failed_start, + None, None, None, None, guest, + cleanup_instance_dir=mock.sentinel.cleanup_instance_dir, + cleanup_instance_disks=mock.sentinel.cleanup_instance_disks) + mock_cleanup.assert_called_once_with( + None, None, None, block_device_info=None, destroy_vifs=True, + cleanup_instance_dir=mock.sentinel.cleanup_instance_dir, + cleanup_instance_disks=mock.sentinel.cleanup_instance_disks) self.assertTrue(guest.poweroff.called) @mock.patch('os_brick.encryptors.get_encryption_metadata') diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 2c718dd9af5a..49ef4673eaf3 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -1312,6 +1312,71 @@ class LibvirtDriver(driver.ComputeDriver): def cleanup(self, context, instance, network_info, block_device_info=None, destroy_disks=True, migrate_data=None, destroy_vifs=True): + """Cleanup the instance from the host. + + Identify if the instance disks and instance path should be removed + from the host before calling down into the _cleanup method for the + actual removal of resources from the host. + + :param context: security context + :param instance: instance object for the instance being cleaned up + :param network_info: instance network information + :param block_device_info: optional instance block device information + :param destroy_disks: if local ephemeral disks should be destroyed + :param migrate_data: optional migrate_data object + :param destroy_vifs: if plugged vifs should be unplugged + """ + cleanup_instance_dir = False + cleanup_instance_disks = False + # We assume destroy_disks means destroy instance directory and disks + if destroy_disks: + cleanup_instance_dir = True + cleanup_instance_disks = True + else: + # NOTE(mdbooth): I think the theory here was that if this is a + # migration with shared block storage then we need to delete the + # instance directory because that's not shared. I'm pretty sure + # this is wrong. + if migrate_data and 'is_shared_block_storage' in migrate_data: + cleanup_instance_dir = migrate_data.is_shared_block_storage + + # NOTE(lyarwood): The following workaround allows operators to + # ensure that non-shared instance directories are removed after an + # evacuation or revert resize when using the shared RBD + # imagebackend. This workaround is not required when cleaning up + # migrations that provide migrate_data to this method as the + # existing is_shared_block_storage conditional will cause the + # instance directory to be removed. + if not cleanup_instance_dir: + if CONF.workarounds.ensure_libvirt_rbd_instance_dir_cleanup: + cleanup_instance_dir = CONF.libvirt.images_type == 'rbd' + + return self._cleanup( + context, instance, network_info, + block_device_info=block_device_info, + destroy_vifs=destroy_vifs, + cleanup_instance_dir=cleanup_instance_dir, + cleanup_instance_disks=cleanup_instance_disks) + + def _cleanup(self, context, instance, network_info, block_device_info=None, + destroy_vifs=True, cleanup_instance_dir=False, + cleanup_instance_disks=False): + """Cleanup the domain and any attached resources from the host. + + This method cleans up any pmem devices, unplugs VIFs, disconnects + attached volumes and undefines the instance domain within libvirt. + It also optionally removes the ephemeral disks and the instance + directory from the host depending on the cleanup_instance_dir|disks + kwargs provided. + + :param context: security context + :param instance: instance object for the instance being cleaned up + :param network_info: instance network information + :param block_device_info: optional instance block device information + :param destroy_vifs: if plugged vifs should be unplugged + :param cleanup_instance_dir: If the instance dir should be removed + :param cleanup_instance_disks: If the instance disks should be removed + """ # zero the data on backend pmem device vpmems = self._get_vpmems(instance) if vpmems: @@ -1342,7 +1407,7 @@ class LibvirtDriver(driver.ComputeDriver): self._disconnect_volume(context, connection_info, instance) except Exception as exc: with excutils.save_and_reraise_exception() as ctxt: - if destroy_disks: + if cleanup_instance_disks: # Don't block on Volume errors if we're trying to # delete the instance as we may be partially created # or deleted @@ -1354,26 +1419,14 @@ class LibvirtDriver(driver.ComputeDriver): 'exc': encodeutils.exception_to_unicode(exc)}, instance=instance) - if destroy_disks: + if cleanup_instance_disks: # NOTE(haomai): destroy volumes if needed if CONF.libvirt.images_type == 'lvm': self._cleanup_lvm(instance, block_device_info) if CONF.libvirt.images_type == 'rbd': self._cleanup_rbd(instance) - is_shared_block_storage = False - if migrate_data and 'is_shared_block_storage' in migrate_data: - is_shared_block_storage = migrate_data.is_shared_block_storage - # NOTE(lyarwood): The following workaround allows operators to ensure - # that non-shared instance directories are removed after an evacuation - # or revert resize when using the shared RBD imagebackend. This - # workaround is not required when cleaning up migrations that provide - # migrate_data to this method as the existing is_shared_block_storage - # conditional will cause the instance directory to be removed. - if ((destroy_disks or is_shared_block_storage) or - (CONF.workarounds.ensure_libvirt_rbd_instance_dir_cleanup and - CONF.libvirt.images_type == 'rbd')): - + if cleanup_instance_dir: attempts = int(instance.system_metadata.get('clean_attempts', '0')) success = self.delete_instance_files(instance) @@ -3558,9 +3611,10 @@ class LibvirtDriver(driver.ComputeDriver): gen_confdrive = functools.partial(self._create_configdrive, context, instance, injection_info) - self._create_image(context, instance, disk_info['mapping'], - injection_info=injection_info, - block_device_info=block_device_info) + created_instance_dir, created_disks = self._create_image( + context, instance, disk_info['mapping'], + injection_info=injection_info, + block_device_info=block_device_info) # Required by Quobyte CI self._ensure_console_log_for_instance(instance) @@ -3576,8 +3630,9 @@ class LibvirtDriver(driver.ComputeDriver): context, xml, instance, network_info, block_device_info=block_device_info, post_xml_callback=gen_confdrive, - destroy_disks_on_failure=True, - power_on=power_on) + power_on=power_on, + cleanup_instance_dir=created_instance_dir, + cleanup_instance_disks=created_disks) LOG.debug("Guest created on hypervisor", instance=instance) def _wait_for_boot(): @@ -3868,8 +3923,17 @@ class LibvirtDriver(driver.ComputeDriver): def raw(fname): return image(fname, image_type='raw') + created_instance_dir = True + # ensure directories exist and are writable - fileutils.ensure_tree(libvirt_utils.get_instance_path(instance)) + instance_dir = libvirt_utils.get_instance_path(instance) + if os.path.exists(instance_dir): + LOG.debug("Instance directory exists: not creating", + instance=instance) + created_instance_dir = False + else: + LOG.debug("Creating instance directory", instance=instance) + fileutils.ensure_tree(libvirt_utils.get_instance_path(instance)) LOG.info('Creating image', instance=instance) @@ -3911,6 +3975,10 @@ class LibvirtDriver(driver.ComputeDriver): 'kernel_id': instance.kernel_id, 'ramdisk_id': instance.ramdisk_id} + # NOTE(mdbooth): kernel and ramdisk, if they are defined, are hardcoded + # to use raw, which means they will always be cleaned up with the + # instance directory. We must not consider them for created_disks, + # which may not be using the instance directory. if disk_images['kernel_id']: fname = imagecache.get_cache_fname(disk_images['kernel_id']) raw('kernel').cache(fetch_func=libvirt_utils.fetch_raw_image, @@ -3930,10 +3998,9 @@ class LibvirtDriver(driver.ComputeDriver): uid = pwd.getpwnam('root').pw_uid nova.privsep.path.chown(image('disk').path, uid=uid) - self._create_and_inject_local_root(context, instance, - booted_from_volume, suffix, - disk_images, injection_info, - fallback_from_host) + created_disks = self._create_and_inject_local_root( + context, instance, booted_from_volume, suffix, disk_images, + injection_info, fallback_from_host) # Lookup the filesystem type if required os_type_with_default = nova.privsep.fs.get_fs_type_for_os_type( @@ -3947,6 +4014,9 @@ class LibvirtDriver(driver.ComputeDriver): ephemeral_gb = instance.flavor.ephemeral_gb if 'disk.local' in disk_mapping: disk_image = image('disk.local') + # Short circuit the exists() tests if we already created a disk + created_disks = created_disks or not disk_image.exists() + fn = functools.partial(self._create_ephemeral, fs_label='ephemeral0', os_type=instance.os_type, @@ -3963,6 +4033,8 @@ class LibvirtDriver(driver.ComputeDriver): for idx, eph in enumerate(driver.block_device_info_get_ephemerals( block_device_info)): disk_image = image(blockinfo.get_eph_disk(idx)) + # Short circuit the exists() tests if we already created a disk + created_disks = created_disks or not disk_image.exists() specified_fs = eph.get('guest_format') if specified_fs and not self.is_supported_fs_format(specified_fs): @@ -3985,15 +4057,25 @@ class LibvirtDriver(driver.ComputeDriver): if swap_mb > 0: size = swap_mb * units.Mi - image('disk.swap').cache(fetch_func=self._create_swap, - context=context, - filename="swap_%s" % swap_mb, - size=size, - swap_mb=swap_mb) + swap = image('disk.swap') + # Short circuit the exists() tests if we already created a disk + created_disks = created_disks or not swap.exists() + swap.cache(fetch_func=self._create_swap, context=context, + filename="swap_%s" % swap_mb, + size=size, swap_mb=swap_mb) + + if created_disks: + LOG.debug('Created local disks', instance=instance) + else: + LOG.debug('Did not create local disks', instance=instance) + + return (created_instance_dir, created_disks) def _create_and_inject_local_root(self, context, instance, booted_from_volume, suffix, disk_images, injection_info, fallback_from_host): + created_disks = False + # File injection only if needed need_inject = (not configdrive.required_by(instance) and injection_info is not None and @@ -4011,6 +4093,8 @@ class LibvirtDriver(driver.ComputeDriver): backend = self.image_backend.by_name(instance, 'disk' + suffix, CONF.libvirt.images_type) + created_disks = not backend.exists() + if instance.task_state == task_states.RESIZE_FINISH: backend.create_snap(libvirt_utils.RESIZE_SNAPSHOT_NAME) if backend.SUPPORTS_CLONE: @@ -4048,6 +4132,8 @@ class LibvirtDriver(driver.ComputeDriver): LOG.warning('File injection into a boot from volume ' 'instance is not supported', instance=instance) + return created_disks + def _create_configdrive(self, context, instance, injection_info, rescue=False): # As this method being called right after the definition of a @@ -6370,21 +6456,26 @@ class LibvirtDriver(driver.ComputeDriver): for vif in network_info if vif.get('active', True) is False] def _cleanup_failed_start(self, context, instance, network_info, - block_device_info, guest, destroy_disks): + block_device_info, guest, + cleanup_instance_dir=False, + cleanup_instance_disks=False): try: if guest and guest.is_active(): guest.poweroff() finally: - self.cleanup(context, instance, network_info=network_info, - block_device_info=block_device_info, - destroy_disks=destroy_disks) + self._cleanup(context, instance, network_info, + block_device_info=block_device_info, + destroy_vifs=True, + cleanup_instance_dir=cleanup_instance_dir, + cleanup_instance_disks=cleanup_instance_disks) def _create_domain_and_network(self, context, xml, instance, network_info, block_device_info=None, power_on=True, vifs_already_plugged=False, post_xml_callback=None, - destroy_disks_on_failure=False, - external_events=None): + external_events=None, + cleanup_instance_dir=False, + cleanup_instance_disks=False): """Do required network setup and create domain.""" timeout = CONF.vif_plugging_timeout @@ -6412,9 +6503,10 @@ class LibvirtDriver(driver.ComputeDriver): # Neutron reported failure and we didn't swallow it, so # bail here with excutils.save_and_reraise_exception(): - self._cleanup_failed_start(context, instance, network_info, - block_device_info, guest, - destroy_disks_on_failure) + self._cleanup_failed_start( + context, instance, network_info, block_device_info, guest, + cleanup_instance_dir=cleanup_instance_dir, + cleanup_instance_disks=cleanup_instance_disks) except eventlet.timeout.Timeout: # We never heard from Neutron LOG.warning('Timeout waiting for %(events)s for ' @@ -6425,18 +6517,19 @@ class LibvirtDriver(driver.ComputeDriver): 'task_state': instance.task_state}, instance=instance) if CONF.vif_plugging_is_fatal: - self._cleanup_failed_start(context, instance, network_info, - block_device_info, guest, - destroy_disks_on_failure) + self._cleanup_failed_start( + context, instance, network_info, block_device_info, guest, + cleanup_instance_dir=cleanup_instance_dir, + cleanup_instance_disks=cleanup_instance_disks) raise exception.VirtualInterfaceCreateException() except Exception: # Any other error, be sure to clean up LOG.error('Failed to start libvirt guest', instance=instance) with excutils.save_and_reraise_exception(): - self._cleanup_failed_start(context, instance, network_info, - block_device_info, guest, - destroy_disks_on_failure) - + self._cleanup_failed_start( + context, instance, network_info, block_device_info, guest, + cleanup_instance_dir=cleanup_instance_dir, + cleanup_instance_disks=cleanup_instance_disks) # Resume only if domain has been paused if pause: guest.resume()