Add configuration options for disk usage alerting thresholds

The Ceph cluster degrades to HEALTH_{WARN|ERR} when the following
default thresholds are breached:

mon data avail warn = 30
mon data avail crit = 5

- These thresholds can be conservative, and it may be desirable to
  change them (see the example below).
- A common scenario is running ceph-mon units in LXD containers, which
  report the disk usage of the underlying host. The host may have its
  own monitoring and its own thresholds, which can lead to duplicate
  or conflicting alerts.
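For example, assuming the application is deployed under the name
"ceph-mon" (an assumption, not part of this change), an operator could
relax the thresholds at runtime with purely illustrative values:

    juju config ceph-mon \
        monitor-data-available-warning=15 \
        monitor-data-available-critical=3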


Closes-Bug: #1890777
Change-Id: I13e35be71697b98b19260970bcf9812a43ef9369
Cornellius Metto  2021-05-12 10:44:31 +03:00
commit 320ddae827 (parent 54dce1d24c)
4 changed files with 28 additions and 0 deletions

@@ -103,6 +103,20 @@ options:
       A space-separated list of ceph mon hosts to use. This field is only used
       to migrate an existing cluster to a juju-managed solution and should
       otherwise be left unset.
+  monitor-data-available-warning:
+    type: int
+    default: 30
+    description: |
+      Raise HEALTH_WARN status when the filesystem that houses a monitor's data
+      store reports that its available capacity is less than or equal to this
+      percentage.
+  monitor-data-available-critical:
+    type: int
+    default: 5
+    description: |
+      Raise HEALTH_ERR status when the filesystem that houses a monitor's data
+      store reports that its available capacity is less than or equal to this
+      percentage.
   expected-osd-count:
     type: int
     default: 0
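The same options can also be set at deploy time. A minimal bundle
fragment sketch, assuming an application named "ceph-mon" and purely
illustrative values:

    applications:
      ceph-mon:
        charm: ceph-mon
        options:
          monitor-data-available-warning: 15
          monitor-data-available-critical: 3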

@@ -192,6 +192,8 @@ def get_ceph_context():
         'ceph_cluster_network': cluster_network,
         'loglevel': config('loglevel'),
         'dio': str(config('use-direct-io')).lower(),
+        'mon_data_avail_warn': int(config('monitor-data-available-warning')),
+        'mon_data_avail_crit': int(config('monitor-data-available-critical')),
     }
     if config('prefer-ipv6'):
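A minimal sketch of how these context values reach the monitor
configuration. The render() call and target path follow the usual
charm-helpers pattern and are assumptions for illustration, not part
of this change:

    from charmhelpers.core.templating import render

    # Sketch only: get_ceph_context() now carries the two threshold
    # values, so rendering the ceph.conf template picks them up.
    context = get_ceph_context()
    assert context['mon_data_avail_warn'] == 30  # charm default
    assert context['mon_data_avail_crit'] == 5   # charm default
    render('ceph.conf', '/etc/ceph/ceph.conf', context)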

@@ -60,6 +60,8 @@ keyring = /var/lib/ceph/mon/$cluster-$id/keyring
 mon pg warn max object skew = 0
 {% endif %}
+mon data avail warn = {{ mon_data_avail_warn }}
+mon data avail crit = {{ mon_data_avail_crit }}
 [mds]
 keyring = /var/lib/ceph/mds/$cluster-$id/keyring
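With the charm defaults left in place, the rendered [mon] section of
ceph.conf would therefore end up containing:

    mon data avail warn = 30
    mon data avail crit = 5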

@@ -43,6 +43,8 @@ CHARM_CONFIG = {'config-flags': '',
                 'use-direct-io': True,
                 'osd-format': 'ext4',
                 'monitor-hosts': '',
+                'monitor-data-available-warning': 30,
+                'monitor-data-available-critical': 5,
                 'prefer-ipv6': False,
                 'default-rbd-features': None,
                 'nagios_degraded_thresh': '1',
@@ -84,6 +86,8 @@ class CephHooksTestCase(test_utils.CharmTestCase):
                     'fsid': '1234',
                     'loglevel': 1,
                     'mon_hosts': '10.0.0.1 10.0.0.2',
+                    'mon_data_avail_warn': 30,
+                    'mon_data_avail_crit': 5,
                     'old_auth': False,
                     'public_addr': '10.0.0.1',
                     'use_syslog': 'true'}
@@ -114,6 +118,8 @@ class CephHooksTestCase(test_utils.CharmTestCase):
                     'fsid': '1234',
                     'loglevel': 1,
                     'mon_hosts': '10.0.0.1 10.0.0.2',
+                    'mon_data_avail_warn': 30,
+                    'mon_data_avail_crit': 5,
                     'old_auth': False,
                     'public_addr': '10.0.0.1',
                     'use_syslog': 'true',
@@ -145,6 +151,8 @@ class CephHooksTestCase(test_utils.CharmTestCase):
                     'fsid': '1234',
                     'loglevel': 1,
                     'mon_hosts': '10.0.0.1 10.0.0.2',
+                    'mon_data_avail_warn': 30,
+                    'mon_data_avail_crit': 5,
                     'old_auth': False,
                     'mon': {'mon sync max retries': 10},
                     'public_addr': '10.0.0.1',
@@ -178,6 +186,8 @@ class CephHooksTestCase(test_utils.CharmTestCase):
                     'fsid': '1234',
                     'loglevel': 1,
                     'mon_hosts': '10.0.0.1 10.0.0.2',
+                    'mon_data_avail_warn': 30,
+                    'mon_data_avail_crit': 5,
                     'old_auth': False,
                     'mon': {'mon sync max retries': 10},
                     'public_addr': '10.0.0.1',