From 44254ca865515c2ecd91886f0100ada874a40abe Mon Sep 17 00:00:00 2001
From: melanie witt
Date: Tue, 27 Mar 2018 01:27:56 +0000
Subject: [PATCH] rbd: use MAX_AVAIL stat for reporting bytes available

Currently, the reporting of bytes available works well for recommended
Ceph deployments that run one OSD per disk [1]. However, for users who
are running multiple OSDs on a single disk, the current reporting will
reflect bytes available * number of replicas.

We can enhance the bytes available reporting method to accommodate
unrecommended Ceph deployments by using the MAX_AVAIL stat obtainable
via the 'ceph df' command. The MAX_AVAIL stat takes the number of
configured replicas into consideration and will reflect the correct
number of bytes available even when Ceph is deployed in a way the
documentation recommends against.

For most users, this change should make no difference. It will only
help users who are running unrecommended Ceph deployments.

[1] http://docs.ceph.com/docs/luminous/start/hardware-recommendations/#hard-disk-drives

Change-Id: I96faff6d3b9747514441d83c629fdd1cface1eb5
---
 .../unit/virt/libvirt/storage/test_rbd.py     | 64 +++++++++++++++++++
 nova/virt/libvirt/storage/rbd_utils.py        | 32 ++++++++--
 ...nhance-get-pool-info-14afc8eccab49dcf.yaml |  9 +++
 3 files changed, 99 insertions(+), 6 deletions(-)
 create mode 100644 releasenotes/notes/rbd-enhance-get-pool-info-14afc8eccab49dcf.yaml

diff --git a/nova/tests/unit/virt/libvirt/storage/test_rbd.py b/nova/tests/unit/virt/libvirt/storage/test_rbd.py
index f6629b79f183..f38830d0ee2a 100644
--- a/nova/tests/unit/virt/libvirt/storage/test_rbd.py
+++ b/nova/tests/unit/virt/libvirt/storage/test_rbd.py
@@ -13,6 +13,7 @@

 from eventlet import tpool
 import mock
+from oslo_serialization import jsonutils
 from oslo_utils.fixture import uuidsentinel as uuids

 from nova.compute import task_states
@@ -51,6 +52,53 @@ CEPH_MON_DUMP = """dumped monmap epoch 1
 """


+# max_avail stats are tweaked for testing
+CEPH_DF = """
+{
+    "stats": {
+        "total_bytes": 25757220864,
+        "total_used_bytes": 274190336,
+        "total_avail_bytes": 25483030528
+    },
+    "pools": [
+        {
+            "name": "images",
+            "id": 1,
+            "stats": {
+                "kb_used": 12419,
+                "bytes_used": 12716067,
+                "percent_used": 0.05,
+                "max_avail": 24195168123,
+                "objects": 6
+            }
+        },
+        {
+            "name": "rbd",
+            "id": 2,
+            "stats": {
+                "kb_used": 0,
+                "bytes_used": 0,
+                "percent_used": 0.00,
+                "max_avail": 24195168456,
+                "objects": 0
+            }
+        },
+        {
+            "name": "volumes",
+            "id": 3,
+            "stats": {
+                "kb_used": 0,
+                "bytes_used": 0,
+                "percent_used": 0.00,
+                "max_avail": 24195168789,
+                "objects": 0
+            }
+        }
+    ]
+}
+"""
+
+
 class FakeException(Exception):
     pass

@@ -557,3 +605,19 @@ class RbdTestCase(test.NoDBTestCase):
         proxy.list_snaps.return_value = [{'name': self.snap_name}, ]
         self.driver.rollback_to_snap(self.volume_name, self.snap_name)
         proxy.rollback_to_snap.assert_called_once_with(self.snap_name)
+
+    @mock.patch('oslo_concurrency.processutils.execute')
+    def test_get_pool_info(self, mock_execute):
+        mock_execute.return_value = (CEPH_DF, '')
+        ceph_df_json = jsonutils.loads(CEPH_DF)
+        expected = {'total': ceph_df_json['stats']['total_bytes'],
+                    'free': ceph_df_json['pools'][1]['stats']['max_avail'],
+                    'used': ceph_df_json['pools'][1]['stats']['bytes_used']}
+        self.assertDictEqual(expected, self.driver.get_pool_info())
+
+    @mock.patch('oslo_concurrency.processutils.execute')
+    def test_get_pool_info_not_found(self, mock_execute):
+        # Make the pool something other than self.rbd_pool so it won't be found
+        ceph_df_not_found = CEPH_DF.replace('rbd', 'vms')
+        mock_execute.return_value = (ceph_df_not_found, '')
+        self.assertRaises(exception.NotFound, self.driver.get_pool_info)
diff --git a/nova/virt/libvirt/storage/rbd_utils.py b/nova/virt/libvirt/storage/rbd_utils.py
index 133f72f410ec..208bc9d86489 100644
--- a/nova/virt/libvirt/storage/rbd_utils.py
+++ b/nova/virt/libvirt/storage/rbd_utils.py
@@ -30,7 +30,6 @@ from oslo_serialization import jsonutils
 from oslo_service import loopingcall
 from oslo_utils import encodeutils
 from oslo_utils import excutils
-from oslo_utils import units

 from nova import exception
 from nova.i18n import _
@@ -366,11 +365,32 @@ class RBDDriver(object):
             self._destroy_volume(client, volume)

     def get_pool_info(self):
-        with RADOSClient(self) as client:
-            stats = client.cluster.get_cluster_stats()
-            return {'total': stats['kb'] * units.Ki,
-                    'free': stats['kb_avail'] * units.Ki,
-                    'used': stats['kb_used'] * units.Ki}
+        # NOTE(melwitt): We're executing 'ceph df' here instead of calling
+        # the RADOSClient.get_cluster_stats python API because we need
+        # access to the MAX_AVAIL stat, which reports the available bytes
+        # taking replication into consideration. The global available stat
+        # from the RADOSClient.get_cluster_stats python API does not take
+        # replication size into consideration and will simply return the
+        # available storage per OSD, added together across all OSDs. The
+        # MAX_AVAIL stat will divide by the replication size when doing
+        # the calculation.
+        args = ['ceph', 'df', '--format=json'] + self.ceph_args()
+        out, _ = processutils.execute(*args)
+        stats = jsonutils.loads(out)
+
+        # Find the pool for which we are configured.
+        pool_stats = None
+        for pool in stats['pools']:
+            if pool['name'] == self.pool:
+                pool_stats = pool['stats']
+                break
+
+        if pool_stats is None:
+            raise exception.NotFound('Pool %s could not be found.' % self.pool)
+
+        return {'total': stats['stats']['total_bytes'],
+                'free': pool_stats['max_avail'],
+                'used': pool_stats['bytes_used']}

     def create_snap(self, volume, name, pool=None, protect=False):
         """Create a snapshot of an RBD volume.
diff --git a/releasenotes/notes/rbd-enhance-get-pool-info-14afc8eccab49dcf.yaml b/releasenotes/notes/rbd-enhance-get-pool-info-14afc8eccab49dcf.yaml
new file mode 100644
index 000000000000..4e4352e4bc06
--- /dev/null
+++ b/releasenotes/notes/rbd-enhance-get-pool-info-14afc8eccab49dcf.yaml
@@ -0,0 +1,9 @@
+---
+other:
+  - |
+    The reporting for bytes available for RBD has been enhanced to
+    accommodate `unrecommended
+    <http://docs.ceph.com/docs/luminous/start/hardware-recommendations/#hard-disk-drives>`_
+    Ceph deployments where multiple OSDs are running on a single disk. The
+    new reporting method takes the number of configured replicas into
+    consideration when reporting bytes available.
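
Reviewer note (not part of the patch): a minimal standalone sketch of the
arithmetic this change corrects, per the NOTE in get_pool_info above. The
replication size of 3, the pool name 'rbd', and the trimmed JSON shape are
assumptions for illustration; the numbers follow the CEPH_DF fixture style,
with max_avail set to total_avail_bytes divided by the assumed replica count.

    import json

    # Trimmed-down 'ceph df --format=json' output. The figures assume 3x
    # replication, so max_avail ~= total_avail_bytes / 3.
    CEPH_DF_SAMPLE = """
    {
        "stats": {"total_bytes": 25757220864,
                  "total_used_bytes": 274190336,
                  "total_avail_bytes": 25483030528},
        "pools": [{"name": "rbd",
                   "stats": {"bytes_used": 0, "max_avail": 8494343509}}]
    }
    """

    stats = json.loads(CEPH_DF_SAMPLE)

    # Old method: cluster-wide available bytes, which sums the free space
    # of every OSD and therefore counts the space each replica of the same
    # data will consume.
    old_free = stats['stats']['total_avail_bytes']

    # New method: the pool's MAX_AVAIL, which Ceph reports with the pool's
    # replication size already divided out.
    pool = next(p for p in stats['pools'] if p['name'] == 'rbd')
    new_free = pool['stats']['max_avail']

    print('old free: %d bytes' % old_free)  # 25483030528 (~3x over-report)
    print('new free: %d bytes' % new_free)  # 8494343509

The old figure answers "how many raw bytes are free across the cluster";
MAX_AVAIL answers "how many more bytes can this pool actually store", which
is the quantity the bytes-available reporting is meant to provide.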