Add configuration options for disk usage alerting thresholds

The Ceph cluster degrades to HEALTH_{WARN|ERR} when the following
default thresholds are breached:

mon data avail warn = 30
mon data avail crit = 5

- These thresholds can be conservative, and it may be desirable to
  change them (see the example below).
- A common scenario is running ceph-mon units in LXD containers, which
  report the disk usage of the underlying host. The host may have its
  own monitoring and its own thresholds, which can lead to duplicate
  or conflicting alerts.
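For example, assuming the application is deployed under the name
"ceph-mon" (an assumption, not part of this change), an operator could
relax the thresholds at runtime with purely illustrative values:

    juju config ceph-mon \
        monitor-data-available-warning=15 \
        monitor-data-available-critical=3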


Closes-Bug: #1890777
Change-Id: I13e35be71697b98b19260970bcf9812a43ef9369
Cornellius Metto  2021-05-12 10:44:31 +03:00
commit 320ddae827 (parent 54dce1d24c)
4 changed files with 28 additions and 0 deletions

@@ -103,6 +103,20 @@ options:
       A space-separated list of ceph mon hosts to use. This field is only used
       to migrate an existing cluster to a juju-managed solution and should
       otherwise be left unset.
+  monitor-data-available-warning:
+    type: int
+    default: 30
+    description: |
+      Raise HEALTH_WARN status when the filesystem that houses a monitor's data
+      store reports that its available capacity is less than or equal to this
+      percentage.
+  monitor-data-available-critical:
+    type: int
+    default: 5
+    description: |
+      Raise HEALTH_ERR status when the filesystem that houses a monitor's data
+      store reports that its available capacity is less than or equal to this
+      percentage.
   expected-osd-count:
     type: int
     default: 0
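The same options can also be set at deploy time. A minimal bundle
fragment sketch, assuming an application named "ceph-mon" and purely
illustrative values:

    applications:
      ceph-mon:
        charm: ceph-mon
        options:
          monitor-data-available-warning: 15
          monitor-data-available-critical: 3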

@@ -192,6 +192,8 @@ def get_ceph_context():
         'ceph_cluster_network': cluster_network,
         'loglevel': config('loglevel'),
         'dio': str(config('use-direct-io')).lower(),
+        'mon_data_avail_warn': int(config('monitor-data-available-warning')),
+        'mon_data_avail_crit': int(config('monitor-data-available-critical')),
     }
     if config('prefer-ipv6'):
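A minimal sketch of how these context values reach the monitor
configuration. The render() call and target path follow the usual
charm-helpers pattern and are assumptions for illustration, not part
of this change:

    from charmhelpers.core.templating import render

    # Sketch only: get_ceph_context() now carries the two threshold
    # values, so rendering the ceph.conf template picks them up.
    context = get_ceph_context()
    assert context['mon_data_avail_warn'] == 30  # charm default
    assert context['mon_data_avail_crit'] == 5   # charm default
    render('ceph.conf', '/etc/ceph/ceph.conf', context)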

@@ -60,6 +60,8 @@ keyring = /var/lib/ceph/mon/$cluster-$id/keyring
 mon pg warn max object skew = 0
 {% endif %}
+mon data avail warn = {{ mon_data_avail_warn }}
+mon data avail crit = {{ mon_data_avail_crit }}
 [mds]
 keyring = /var/lib/ceph/mds/$cluster-$id/keyring
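With the charm defaults left in place, the rendered [mon] section of
ceph.conf would therefore end up containing:

    mon data avail warn = 30
    mon data avail crit = 5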

@@ -43,6 +43,8 @@ CHARM_CONFIG = {'config-flags': '',
                 'use-direct-io': True,
                 'osd-format': 'ext4',
                 'monitor-hosts': '',
+                'monitor-data-available-warning': 30,
+                'monitor-data-available-critical': 5,
                 'prefer-ipv6': False,
                 'default-rbd-features': None,
                 'nagios_degraded_thresh': '1',
@@ -84,6 +86,8 @@ class CephHooksTestCase(test_utils.CharmTestCase):
                     'fsid': '1234',
                     'loglevel': 1,
                     'mon_hosts': '10.0.0.1 10.0.0.2',
+                    'mon_data_avail_warn': 30,
+                    'mon_data_avail_crit': 5,
                     'old_auth': False,
                     'public_addr': '10.0.0.1',
                     'use_syslog': 'true'}
@@ -114,6 +118,8 @@ class CephHooksTestCase(test_utils.CharmTestCase):
                     'fsid': '1234',
                     'loglevel': 1,
                     'mon_hosts': '10.0.0.1 10.0.0.2',
+                    'mon_data_avail_warn': 30,
+                    'mon_data_avail_crit': 5,
                     'old_auth': False,
                     'public_addr': '10.0.0.1',
                     'use_syslog': 'true',
@@ -145,6 +151,8 @@ class CephHooksTestCase(test_utils.CharmTestCase):
                     'fsid': '1234',
                     'loglevel': 1,
                     'mon_hosts': '10.0.0.1 10.0.0.2',
+                    'mon_data_avail_warn': 30,
+                    'mon_data_avail_crit': 5,
                     'old_auth': False,
                     'mon': {'mon sync max retries': 10},
                     'public_addr': '10.0.0.1',
@@ -178,6 +186,8 @@ class CephHooksTestCase(test_utils.CharmTestCase):
                     'fsid': '1234',
                     'loglevel': 1,
                     'mon_hosts': '10.0.0.1 10.0.0.2',
+                    'mon_data_avail_warn': 30,
+                    'mon_data_avail_crit': 5,
                     'old_auth': False,
                     'mon': {'mon sync max retries': 10},
                     'public_addr': '10.0.0.1',