rbd: use MAX_AVAIL stat for reporting bytes available
Currently, the reporting of bytes available works well for recommended Ceph deployments that run one OSD per disk [1]. However, for users who are running multiple OSDs on a single disk, the current reporting will reflect bytes available * number of replicas. We can enhance the bytes available reporting method to accommodate unrecommended Ceph deployments by using the MAX_AVAIL stat obtainable via the 'ceph df' command. The MAX_AVAIL stat takes the number of configured replicas into consideration and will reflect the correct number of bytes available even when Ceph is deployed in a way the documentation recommends against. For most users, this change should make no difference. It will only be a help for users who are running unrecommended Ceph deployments. [1] http://docs.ceph.com/docs/luminous/start/hardware-recommendations/#hard-disk-drives Change-Id: I96faff6d3b9747514441d83c629fdd1cface1eb5
This commit is contained in:
parent
155da8f71f
commit
44254ca865
@ -13,6 +13,7 @@
|
||||
|
||||
from eventlet import tpool
|
||||
import mock
|
||||
from oslo_serialization import jsonutils
|
||||
from oslo_utils.fixture import uuidsentinel as uuids
|
||||
|
||||
from nova.compute import task_states
|
||||
@ -51,6 +52,53 @@ CEPH_MON_DUMP = """dumped monmap epoch 1
|
||||
"""
|
||||
|
||||
|
||||
# max_avail stats are tweaked for testing
|
||||
CEPH_DF = """
|
||||
{
|
||||
"stats": {
|
||||
"total_bytes": 25757220864,
|
||||
"total_used_bytes": 274190336,
|
||||
"total_avail_bytes": 25483030528
|
||||
},
|
||||
"pools": [
|
||||
{
|
||||
"name": "images",
|
||||
"id": 1,
|
||||
"stats": {
|
||||
"kb_used": 12419,
|
||||
"bytes_used": 12716067,
|
||||
"percent_used": 0.05,
|
||||
"max_avail": 24195168123,
|
||||
"objects": 6
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "rbd",
|
||||
"id": 2,
|
||||
"stats": {
|
||||
"kb_used": 0,
|
||||
"bytes_used": 0,
|
||||
"percent_used": 0.00,
|
||||
"max_avail": 24195168456,
|
||||
"objects": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "volumes",
|
||||
"id": 3,
|
||||
"stats": {
|
||||
"kb_used": 0,
|
||||
"bytes_used": 0,
|
||||
"percent_used": 0.00,
|
||||
"max_avail": 24195168789,
|
||||
"objects": 0
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
class FakeException(Exception):
|
||||
pass
|
||||
|
||||
@ -557,3 +605,19 @@ class RbdTestCase(test.NoDBTestCase):
|
||||
proxy.list_snaps.return_value = [{'name': self.snap_name}, ]
|
||||
self.driver.rollback_to_snap(self.volume_name, self.snap_name)
|
||||
proxy.rollback_to_snap.assert_called_once_with(self.snap_name)
|
||||
|
||||
@mock.patch('oslo_concurrency.processutils.execute')
|
||||
def test_get_pool_info(self, mock_execute):
|
||||
mock_execute.return_value = (CEPH_DF, '')
|
||||
ceph_df_json = jsonutils.loads(CEPH_DF)
|
||||
expected = {'total': ceph_df_json['stats']['total_bytes'],
|
||||
'free': ceph_df_json['pools'][1]['stats']['max_avail'],
|
||||
'used': ceph_df_json['pools'][1]['stats']['bytes_used']}
|
||||
self.assertDictEqual(expected, self.driver.get_pool_info())
|
||||
|
||||
@mock.patch('oslo_concurrency.processutils.execute')
|
||||
def test_get_pool_info_not_found(self, mock_execute):
|
||||
# Make the pool something other than self.rbd_pool so it won't be found
|
||||
ceph_df_not_found = CEPH_DF.replace('rbd', 'vms')
|
||||
mock_execute.return_value = (ceph_df_not_found, '')
|
||||
self.assertRaises(exception.NotFound, self.driver.get_pool_info)
|
||||
|
@ -30,7 +30,6 @@ from oslo_serialization import jsonutils
|
||||
from oslo_service import loopingcall
|
||||
from oslo_utils import encodeutils
|
||||
from oslo_utils import excutils
|
||||
from oslo_utils import units
|
||||
|
||||
from nova import exception
|
||||
from nova.i18n import _
|
||||
@ -366,11 +365,32 @@ class RBDDriver(object):
|
||||
self._destroy_volume(client, volume)
|
||||
|
||||
def get_pool_info(self):
|
||||
with RADOSClient(self) as client:
|
||||
stats = client.cluster.get_cluster_stats()
|
||||
return {'total': stats['kb'] * units.Ki,
|
||||
'free': stats['kb_avail'] * units.Ki,
|
||||
'used': stats['kb_used'] * units.Ki}
|
||||
# NOTE(melwitt): We're executing 'ceph df' here instead of calling
|
||||
# the RADOSClient.get_cluster_stats python API because we need
|
||||
# access to the MAX_AVAIL stat, which reports the available bytes
|
||||
# taking replication into consideration. The global available stat
|
||||
# from the RADOSClient.get_cluster_stats python API does not take
|
||||
# replication size into consideration and will simply return the
|
||||
# available storage per OSD, added together across all OSDs. The
|
||||
# MAX_AVAIL stat will divide by the replication size when doing the
|
||||
# calculation.
|
||||
args = ['ceph', 'df', '--format=json'] + self.ceph_args()
|
||||
out, _ = processutils.execute(*args)
|
||||
stats = jsonutils.loads(out)
|
||||
|
||||
# Find the pool for which we are configured.
|
||||
pool_stats = None
|
||||
for pool in stats['pools']:
|
||||
if pool['name'] == self.pool:
|
||||
pool_stats = pool['stats']
|
||||
break
|
||||
|
||||
if pool_stats is None:
|
||||
raise exception.NotFound('Pool %s could not be found.' % self.pool)
|
||||
|
||||
return {'total': stats['stats']['total_bytes'],
|
||||
'free': pool_stats['max_avail'],
|
||||
'used': pool_stats['bytes_used']}
|
||||
|
||||
def create_snap(self, volume, name, pool=None, protect=False):
|
||||
"""Create a snapshot of an RBD volume.
|
||||
|
@ -0,0 +1,9 @@
|
||||
---
|
||||
other:
|
||||
- |
|
||||
The reporting for bytes available for RBD has been enhanced to accommodate
|
||||
`unrecommended
|
||||
<http://docs.ceph.com/docs/luminous/start/hardware-recommendations/#hard-disk-drives>`_
|
||||
Ceph deployments where multiple OSDs are running on a single disk. The new
|
||||
reporting method takes the number of configured replicas into consideration
|
||||
when reporting bytes available.
|
Loading…
Reference in New Issue
Block a user