From 80191e6d828cf823ce3aa7c6176da5e531694900 Mon Sep 17 00:00:00 2001
From: Dan Smith
Date: Fri, 3 May 2019 13:46:23 -0700
Subject: [PATCH] Add a workaround config toggle to refuse ceph image upload

If a compute node is backed by ceph, and the image is not clone-able in
that same ceph, nova will try to download the image from glance and
upload it to ceph itself. This is nice in that it "just works", but it
also means we store that image in ceph in an extremely inefficient way.
In a glance multi-store case with multiple ceph clusters, the user is
currently required to make sure that the image they are going to use is
stored in a backend local to the compute node they land on. If they do
not (or cannot), nova will make this inefficient non-COW copy of the
image, which is likely not what the operator expects.

Per the discussion at the Denver PTG, this adds a workaround flag that
allows operators to direct nova *not* to do this and instead refuse to
boot the instance entirely.

Related-Bug: #1858877
Change-Id: I069b6b1d28eaf1eee5c7fb8d0fdef9c0c229a1bf
---
 nova/conf/workarounds.py                      | 24 ++++++++++
 nova/tests/unit/virt/libvirt/test_driver.py   | 46 +++++++++++++++++++
 nova/virt/libvirt/driver.py                   | 15 ++++++
 ...d_muli_ceph_download-4083decf501dba40.yaml | 19 ++++++++
 4 files changed, 104 insertions(+)
 create mode 100644 releasenotes/notes/avoid_muli_ceph_download-4083decf501dba40.yaml

diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py
index 42f7108777a7..d7cad708a785 100644
--- a/nova/conf/workarounds.py
+++ b/nova/conf/workarounds.py
@@ -246,6 +246,30 @@ candidates if necessary. This has a slight performance impact and is not
 necessary on new or upgraded deployments where the new configuration has been
 set on all hosts. By setting this option, the second lookup is disabled and
 the scheduler will only request ``PCPU``-based allocations.
+"""),
+    cfg.BoolOpt(
+        'never_download_image_if_on_rbd',
+        default=False,
+        help="""
+When booting from an image on a ceph-backed compute node, if the image does not
+already reside on the ceph cluster (as would be the case if glance is
+also using the same cluster), nova will download the image from glance and
+upload it to ceph itself. If using multiple ceph clusters, this may cause nova
+to unintentionally duplicate the image in a non-COW-able way in the local
+ceph deployment, wasting space.
+
+For more information, refer to the bug report:
+
+https://bugs.launchpad.net/nova/+bug/1858877
+
+Enabling this option will cause nova to *refuse* to boot an instance if it
+would require downloading the image from glance and uploading it to ceph
+itself.
+
+Related options:
+
+* ``compute_driver`` (libvirt)
+* ``[libvirt]/images_type`` (rbd)
 """),
 ]

diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 15a8cb1611f4..7060295c5b3e 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -21454,6 +21454,52 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
                                            None)
         self.assertFalse(mock_inject.called)
 
+    @mock.patch('nova.virt.libvirt.utils.fetch_image')
+    @mock.patch('nova.virt.libvirt.storage.rbd_utils.RBDDriver')
+    @mock.patch.object(imagebackend, 'IMAGE_API')
+    def test_create_fetch_image_ceph_workaround(self, mock_image, mock_rbd,
+                                                mock_fetch):
+        # Make sure the rbd clone will fail because the image is not cloneable
+        mock_rbd.is_cloneable.return_value = False
+        # Make sure the rbd code thinks the image does not already exist
+        mock_rbd.return_value.exists.return_value = False
+        # Make sure the rbd code says the image is small
+        mock_rbd.return_value.size.return_value = 128 * units.Mi
+        # Make sure IMAGE_API.get() returns a raw image
+        mock_image.get.return_value = {'locations': [], 'disk_format': 'raw'}
+
+        instance = self._create_instance()
+        disk_images = {'image_id': 'foo'}
+        self.flags(images_type='rbd', group='libvirt')
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+
+        def do_create():
+            # Reset the fetch mock and run our driver method so we can
+            # check for called-ness after each attempt
+            mock_fetch.reset_mock()
+            drvr._create_and_inject_local_root(self.context,
+                                               instance,
+                                               False,
+                                               '',
+                                               disk_images,
+                                               get_injection_info(),
+                                               None)
+
+        # Do an image create with rbd
+        do_create()
+        # Make sure it tried fetch, which implies that it tried and
+        # failed to clone.
+        mock_fetch.assert_called()
+
+        # Enable the workaround
+        self.flags(never_download_image_if_on_rbd=True,
+                   group='workarounds')
+        # Ensure that we raise the original ImageUnacceptable from the
+        # failed clone...
+        self.assertRaises(exception.ImageUnacceptable, do_create)
+        # ...and ensure that we did _not_ try to fetch
+        mock_fetch.assert_not_called()
+
     @mock.patch('nova.virt.netutils.get_injected_network_template')
     @mock.patch('nova.virt.disk.api.inject_data')
     @mock.patch.object(libvirt_driver.LibvirtDriver, "_conn")
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 297925771aa3..a6bd4d6fa72a 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -3867,9 +3867,24 @@ class LibvirtDriver(driver.ComputeDriver):
                 backend.create_snap(libvirt_utils.RESIZE_SNAPSHOT_NAME)
             if backend.SUPPORTS_CLONE:
                 def clone_fallback_to_fetch(*args, **kwargs):
+                    refuse_fetch = (
+                        CONF.libvirt.images_type == 'rbd' and
+                        CONF.workarounds.never_download_image_if_on_rbd)
                     try:
                         backend.clone(context, disk_images['image_id'])
                     except exception.ImageUnacceptable:
+                        if refuse_fetch:
+                            # Re-raise the exception from the failed
+                            # ceph clone. The compute manager expects
+                            # ImageUnacceptable as a possible result
+                            # of spawn(), from which this is called.
+                            with excutils.save_and_reraise_exception():
+                                LOG.warning(
+                                    'Image %s is not on my ceph and '
+                                    '[workarounds]/'
+                                    'never_download_image_if_on_rbd=True;'
+                                    ' refusing to fetch and upload.',
+                                    disk_images['image_id'])
                     libvirt_utils.fetch_image(*args, **kwargs)
                 fetch_func = clone_fallback_to_fetch
             else:
diff --git a/releasenotes/notes/avoid_muli_ceph_download-4083decf501dba40.yaml b/releasenotes/notes/avoid_muli_ceph_download-4083decf501dba40.yaml
new file mode 100644
index 000000000000..f79c2781196c
--- /dev/null
+++ b/releasenotes/notes/avoid_muli_ceph_download-4083decf501dba40.yaml
@@ -0,0 +1,19 @@
+---
+other:
+  - |
+    Nova now has a config option called
+    ``[workarounds]/never_download_image_if_on_rbd`` which helps to
+    avoid pathological storage behavior with multiple ceph clusters.
+    Currently, Nova does *not* support multiple ceph clusters
+    properly, but Glance can be configured with them. If an instance
+    is booted from an image residing in a ceph cluster other than the
+    one Nova knows about, it will silently download it from Glance and
+    re-upload the image to the local ceph privately for that
+    instance. Unlike the behavior you expect when configuring Nova and
+    Glance for ceph, Nova will continue to do this over and over for
+    the same image when subsequent instances are booted, consuming a
+    large amount of storage unexpectedly. The new workaround option
+    will cause Nova to refuse to do this download/upload behavior and
+    instead fail the instance boot. It is simply a stop-gap measure to
+    keep unsupported deployments with multiple ceph clusters from
+    silently consuming large amounts of disk space.
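For operators, the end result of this patch is a single boolean toggle. A
minimal nova.conf sketch showing the workaround enabled on a ceph-backed
compute node follows; the ``[workarounds]`` option and the
``[libvirt]/images_type`` prerequisite are taken from the patch above, while
the comments are illustrative:

    [libvirt]
    # The workaround only takes effect when the libvirt driver stores
    # instance disks in rbd (ceph).
    images_type = rbd

    [workarounds]
    # Fail the boot with ImageUnacceptable instead of downloading the
    # image from glance and re-uploading it to the local ceph cluster.
    never_download_image_if_on_rbd = True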