Add nagios check for expected number of OSDs
This check does not require manually setting the number of expected OSDs. Initially, the charm sets the expected count (per host) to the number of OSDs present in the OSD tree. The count is updated (on a per-host basis) when the number of OSDs grows, but not when it shrinks. There is a charm action to reset the expected count using information from the OSD tree. Closes-Bug: #1952985 Change-Id: Ia6a060bf151908c1d4159e6bdffa7bfe1f0a7988
This commit is contained in:
parent
fd9104907e
commit
b8af44aefa
|
@ -443,3 +443,5 @@ delete-user:
|
|||
required: [username]
|
||||
pg-repair:
|
||||
description: "Repair inconsistent placement groups, if safe to do so."
|
||||
reset-osd-count-report:
|
||||
description: "Update report of osds present in osd tree. Used for monitoring."
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
reset_osd_count_report.py
|
|
@ -0,0 +1,28 @@
|
|||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright 2021 Canonical Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
|
||||
sys.path.append("hooks")
|
||||
from ceph_hooks import update_host_osd_count_report
|
||||
|
||||
|
||||
def reset_osd_count_report():
    """Action entry point: regenerate the host/OSD report in reset mode.

    Delegates to the hook helper with ``reset=True``.
    """
    update_host_osd_count_report(reset=True)


# Run the action when the file is executed as a script.
if __name__ == '__main__':
    reset_osd_count_report()
|
|
@ -0,0 +1,121 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright (C) 2021 Canonical
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
|
||||
EXIT_OK = 0
|
||||
EXIT_WARN = 1
|
||||
EXIT_CRIT = 2
|
||||
EXIT_UNKNOWN = 3
|
||||
EXIT_CODE_TEXT = ["OK", "WARN", "CRITICAL", "UNKNOWN"]
|
||||
|
||||
CURRENT_OSD_COUNT_FILE = "/var/lib/nagios/current-ceph-osd-count.json"
|
||||
|
||||
|
||||
class CriticalError(Exception):
    """This indicates a critical error."""


def check_file_freshness(filename, newer_than=3600):
    """Check a file exists, is readable and is newer than <n> seconds.

    :param filename: The filename to check
    :type filename: str
    :param newer_than: The file should be newer than n seconds, default 3600
    :type newer_than: int
    :raises CriticalError: If the file does not exist, is not readable, is
        older than <n> seconds, or has a modification time in the future
    """
    # First check the file exists and is readable
    if not os.path.exists(filename):
        raise CriticalError("%s: does not exist." % (filename))
    if not os.access(filename, os.R_OK):
        raise CriticalError("%s: is not readable." % (filename))

    # Then ensure the file is up-to-date enough. The file can still vanish
    # between the checks above and the stat call; report that as a critical
    # condition instead of leaking a raw OSError to the caller.
    try:
        mtime = os.stat(filename).st_mtime
    except OSError as err:
        raise CriticalError("%s: could not be inspected (%s)."
                            % (filename, err)) from err

    last_modified = time.time() - mtime
    if last_modified > newer_than:
        raise CriticalError("%s: was last modified on %s and is too old "
                            "(> %s seconds)."
                            % (filename, time.ctime(mtime), newer_than))
    if last_modified < 0:
        raise CriticalError("%s: was last modified on %s which is in the "
                            "future."
                            % (filename, time.ctime(mtime)))
|
||||
|
||||
|
||||
def check_ceph_osd_count(host_osd_count_report):
    """Compare the expected per-host OSDs with the current OSD tree.

    A host is only reported when it currently has fewer OSDs than the
    report expects; the same number of OSDs with different ids is accepted.
    A host that is expected but absent from the current tree is always
    reported as critical.

    :param host_osd_count_report: Path of the JSON report mapping each host
        to the list of OSD ids expected on it
    :type host_osd_count_report: str
    :return: The exit code (EXIT_OK or EXIT_CRIT) and the error messages
    :rtype: Tuple[int, List[str]]
    """
    with open(host_osd_count_report, "r") as f:
        expected_osd_map = json.load(f)

    current_osd_map = get_osd_tree()

    exit_code = EXIT_OK
    err_msgs = []
    for host, osd_list in expected_osd_map.items():
        if host not in current_osd_map:
            err_msgs.append("Missing host {}".format(host))
            # A vanished host is critical even if no OSD was expected on it.
            exit_code = EXIT_CRIT

        # Read-only lookup: the map returned by get_osd_tree is not modified.
        current_osds = current_osd_map.get(host, [])
        if len(osd_list) <= len(current_osds):
            continue

        present = set(current_osds)
        # dict.fromkeys drops duplicates while keeping the report's order, so
        # the message is stable between runs.
        missing_osds = [str(osd) for osd in dict.fromkeys(osd_list)
                        if osd not in present]
        if missing_osds:
            err_msgs.append("Missing osds on "
                            "{}: {}".format(host,
                                            ", ".join(missing_osds)))
            exit_code = EXIT_CRIT

    return (exit_code, err_msgs)
|
||||
|
||||
|
||||
def get_osd_tree():
    """Build the host -> children map from CURRENT_OSD_COUNT_FILE.

    The file must pass check_file_freshness before it is parsed. Only
    entries of the "nodes" list whose "type" is "host" are kept.

    :return: Map of host name to that node's "children" list (presumably
        the ids of the OSDs on the host)
    :rtype: Dict[str, list]
    :raises CriticalError: If the file is missing, unreadable or stale
    """
    check_file_freshness(CURRENT_OSD_COUNT_FILE)
    with open(CURRENT_OSD_COUNT_FILE, "r") as tree_file:
        osd_tree = json.load(tree_file)

    return {
        entry["name"]: entry["children"]
        for entry in osd_tree["nodes"]
        if entry["type"] == "host"
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # An uncaught exception ends the interpreter with exit status 1, which
    # Nagios reads as WARNING. Every failure path below therefore prints a
    # status line and exits with an explicit plugin return code.
    if len(sys.argv) < 2:
        print("UNKNOWN: usage: {} <host-osd-report>".format(sys.argv[0]))
        sys.exit(EXIT_UNKNOWN)

    host_osd_report = sys.argv[1]
    if not os.path.isfile(host_osd_report):
        print("UNKNOWN: report file missing: {}".format(host_osd_report))
        sys.exit(EXIT_UNKNOWN)

    try:
        (exit_code, err_msgs) = check_ceph_osd_count(host_osd_report)
    except CriticalError as err:
        # Raised when the collected OSD tree file is missing or stale.
        print("{}: {}".format(EXIT_CODE_TEXT[EXIT_CRIT], err))
        sys.exit(EXIT_CRIT)
    except (OSError, ValueError) as err:
        # Unreadable or malformed JSON input: the check cannot decide.
        print("{}: {}".format(EXIT_CODE_TEXT[EXIT_UNKNOWN], err))
        sys.exit(EXIT_UNKNOWN)

    print("{} {}".format(EXIT_CODE_TEXT[exit_code],
                         ", ".join(err_msgs)))
    sys.exit(exit_code)
|
|
@ -22,3 +22,12 @@ ceph status --format json >${TMP_FILE}
|
|||
chown root:nagios ${TMP_FILE}
|
||||
chmod 0640 ${TMP_FILE}
|
||||
mv ${TMP_FILE} ${DATA_FILE}
|
||||
|
||||
# Dump the current OSD tree as JSON for the nagios OSD count check.
DATA_FILE="${DATA_DIR}/current-ceph-osd-count.json"
# The temp file is created in the destination directory and renamed at the
# end, presumably so readers never see a partially written file.
TMP_FILE=$(mktemp -p "${DATA_DIR}")

ceph osd tree --format json > "${TMP_FILE}"

# Quote every expansion so the paths survive word splitting and globbing.
chown root:nagios "${TMP_FILE}"
chmod 0640 "${TMP_FILE}"
mv "${TMP_FILE}" "${DATA_FILE}"
|
||||
|
|
|
@ -20,6 +20,7 @@ import os
|
|||
import subprocess
|
||||
import sys
|
||||
import uuid
|
||||
import pathlib
|
||||
|
||||
sys.path.append('lib')
|
||||
import charms_ceph.utils as ceph
|
||||
|
@ -109,9 +110,11 @@ from charmhelpers.contrib.hardening.harden import harden
|
|||
hooks = Hooks()
|
||||
|
||||
NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
|
||||
NAGIOS_FILE_FOLDER = '/var/lib/nagios'
|
||||
SCRIPTS_DIR = '/usr/local/bin'
|
||||
STATUS_FILE = '/var/lib/nagios/cat-ceph-status.txt'
|
||||
STATUS_FILE = '{}/cat-ceph-status.txt'.format(NAGIOS_FILE_FOLDER)
|
||||
STATUS_CRONFILE = '/etc/cron.d/cat-ceph-health'
|
||||
HOST_OSD_COUNT_REPORT = '{}/host-osd-report.json'.format(NAGIOS_FILE_FOLDER)
|
||||
|
||||
|
||||
def check_for_upgrade():
|
||||
|
@ -215,6 +218,44 @@ def emit_cephconf():
|
|||
JOURNAL_ZAPPED = '/var/lib/ceph/journal_zapped'
|
||||
|
||||
|
||||
def update_host_osd_count_report(reset=False):
    """Refresh the report of expected OSDs per host. Used for monitoring.

    The stored expectation for a host is replaced by the current OSD list
    when the host is new or when it now has at least as many OSDs as
    expected. A host with fewer OSDs than expected, or one missing from the
    current tree, keeps its stored entry.

    :param reset: Start from an empty report instead of the stored one
    :type reset: bool
    """
    # Group [CrushLocation, ...] by host -> {<host>: [osdid, ...]}
    osds_by_host = {}
    for location in ceph.get_osd_tree('admin'):
        osds_by_host.setdefault(location.host, []).append(
            location.identifier)

    pathlib.Path(NAGIOS_FILE_FOLDER).mkdir(parents=True, exist_ok=True)
    if reset or not os.path.isfile(HOST_OSD_COUNT_REPORT):
        # Seed an empty report so the read below always has a file to load
        write_file(HOST_OSD_COUNT_REPORT, '{}')

    with open(HOST_OSD_COUNT_REPORT, "r") as report:
        expected = json.load(report)

    if osds_by_host == expected:
        return

    for host, current_osds in osds_by_host.items():
        # New host, grown osd list, or same size with different ids (maybe
        # an hdd swap): take the current list. A shrunken list is ignored.
        if (host not in expected or
                len(current_osds) >= len(expected[host])):
            expected[host] = current_osds

    write_file(HOST_OSD_COUNT_REPORT, json.dumps(expected))
|
||||
|
||||
|
||||
@hooks.hook('config-changed')
|
||||
@harden()
|
||||
def config_changed():
|
||||
|
@ -884,6 +925,9 @@ def osd_relation(relid=None, unit=None):
|
|||
for relid in relation_ids('dashboard'):
|
||||
dashboard_relation(relid)
|
||||
|
||||
if ready_for_service():
|
||||
update_host_osd_count_report()
|
||||
|
||||
else:
|
||||
log('mon cluster not in quorum - deferring fsid provision')
|
||||
|
||||
|
@ -1143,6 +1187,10 @@ def update_nrpe_config():
|
|||
'check_ceph_status.py'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_ceph_status.py'))
|
||||
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nagios',
|
||||
'check_ceph_osd_count.py'),
|
||||
os.path.join(NAGIOS_PLUGINS, 'check_ceph_osd_count.py'))
|
||||
|
||||
script = os.path.join(SCRIPTS_DIR, 'collect_ceph_status.sh')
|
||||
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files',
|
||||
'nagios', 'collect_ceph_status.sh'),
|
||||
|
@ -1168,6 +1216,14 @@ def update_nrpe_config():
|
|||
check_cmd=check_cmd
|
||||
)
|
||||
|
||||
check_cmd = 'check_ceph_osd_count.py {} '.format(
|
||||
HOST_OSD_COUNT_REPORT)
|
||||
nrpe_setup.add_check(
|
||||
shortname='ceph_osd_count',
|
||||
description='Check if osd count matches expected count',
|
||||
check_cmd=check_cmd
|
||||
)
|
||||
|
||||
if config('nagios_additional_checks'):
|
||||
additional_critical = config('nagios_additional_checks_critical')
|
||||
x = ast.literal_eval(config('nagios_additional_checks'))
|
||||
|
|
|
@ -0,0 +1,216 @@
|
|||
# Copyright 2021 Canonical Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
from unittest.mock import patch, mock_open
|
||||
from src.ceph_hooks import update_host_osd_count_report
|
||||
|
||||
os.sys.path.insert(1, os.path.join(sys.path[0], 'lib'))
|
||||
os.sys.path.insert(1, os.path.join(sys.path[0], 'files/nagios'))
|
||||
|
||||
import check_ceph_osd_count
|
||||
|
||||
from charms_ceph.utils import CrushLocation
|
||||
|
||||
|
||||
class CheckCephOsdCountTestCase(unittest.TestCase):
    """Tests for the OSD count nagios check and the charm report updater."""

    def _check_exit_code(self, current_osd_tree, expected_osd_tree):
        """Run the nagios check with both of its inputs stubbed.

        :param current_osd_tree: Value returned by the patched get_osd_tree
        :type current_osd_tree: dict
        :param expected_osd_tree: JSON text served as the report file
        :type expected_osd_tree: str
        :return: The exit code computed by check_ceph_osd_count
        :rtype: int
        """
        with patch("check_ceph_osd_count.get_osd_tree",
                   return_value=current_osd_tree), \
                patch("check_ceph_osd_count.open",
                      mock_open(read_data=expected_osd_tree)):
            # open() is patched, so the path is never touched on disk.
            (exit_code, _) = check_ceph_osd_count.check_ceph_osd_count(
                "host-osd-report.json")
        return exit_code

    def _dumped_report(self, new_osd_tree, stored_report):
        """Run the report updater and return the json.dumps mock.

        Patching by name inside the helper keeps each mock tied to its
        target instead of relying on decorator/argument ordering.

        :param new_osd_tree: CrushLocation list returned by get_osd_tree
        :type new_osd_tree: list
        :param stored_report: JSON text served as the existing report
        :type stored_report: str
        :return: The mock that replaced json.dumps during the update
        """
        with patch("json.dumps") as mock_json_dumps, \
                patch("src.ceph_hooks.write_file"), \
                patch("src.ceph_hooks.pathlib"), \
                patch("charms_ceph.utils.get_osd_tree",
                      return_value=new_osd_tree), \
                patch("src.ceph_hooks.open",
                      mock_open(read_data=stored_report)):
            update_host_osd_count_report()
        return mock_json_dumps

    def test_check_equal_ceph_osd_trees(self):
        """Check that if current and expected osd trees match return OK exit"""
        exit_code = self._check_exit_code({"host1": [0]},
                                          """{"host1": [0]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)

        # change osd order
        exit_code = self._check_exit_code({"host1": [0, 1]},
                                          """{"host1": [1, 0]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)

    def test_check_missing_expected_osd(self):
        """Check that missing expected osd returns appropriate exit code."""
        exit_code = self._check_exit_code({"host1": [0]},
                                          """{"host1": [0, 1]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_CRIT)

    def test_check_missing_expected_host(self):
        """Check that missing expected host returns appropriate exit code."""
        exit_code = self._check_exit_code(
            {"host1": [0]}, """{"host1": [0], "host2": [1]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_CRIT)

    def test_check_change_osd_ids(self):
        """Check that a change in osd ids (of same length) is OK."""
        exit_code = self._check_exit_code(
            {"host1": [1], "host2": [3]}, """{"host1": [0], "host2": [1]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)

    def test_osd_tree_current_gt_expected(self):
        """Check that more osds and hosts than expected still returns OK."""
        exit_code = self._check_exit_code(
            {"host1": [0, 1], "host2": [2]}, """{"host1": [0]}""")
        self.assertEqual(exit_code, check_ceph_osd_count.EXIT_OK)

    def test_update_report_fresh_tree(self):
        """Check that an empty expected tree triggers an update to expected."""
        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1"),
                        CrushLocation(1, "osd.1", osd="osd.1", host="host1")]
        mock_json_dumps = self._dumped_report(new_osd_tree, "{}")
        mock_json_dumps.assert_called_with({"host1": [0, 1]})

    def test_update_report_new_host(self):
        """Check that adding new host adds new host to expected tree."""
        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1"),
                        CrushLocation(1, "osd.1", osd="osd.1", host="host1"),
                        CrushLocation(2, "osd.2", osd="osd.2", host="host2")]
        mock_json_dumps = self._dumped_report(new_osd_tree,
                                              """{"host1": [0, 1]}""")
        mock_json_dumps.assert_called_with(
            {"host1": [0, 1], "host2": [2]})

    def test_update_report_missing_host(self):
        """Check that missing host is not removed from expected tree."""
        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1"),
                        CrushLocation(2, "osd.2", osd="osd.2", host="host1")]
        mock_json_dumps = self._dumped_report(
            new_osd_tree, """{"host1": [0], "host2": [1]}""")
        mock_json_dumps.assert_called_with(
            {"host1": [0, 2], "host2": [1]})

    def test_update_report_fewer_osds(self):
        """Check that the expected osds are kept when the osd list shrinks."""
        new_osd_tree = [CrushLocation(0, "osd.0", osd="osd.0", host="host1")]
        mock_json_dumps = self._dumped_report(new_osd_tree,
                                              """{"host1": [0, 1]}""")
        mock_json_dumps.assert_called_with(
            {"host1": [0, 1]})

    def test_update_report_diff_osd_ids(self):
        """Check that new osdid list (of same length) becomes new expected."""
        new_osd_tree = [CrushLocation(2, "osd.2", osd="osd.2", host="host1"),
                        CrushLocation(3, "osd.3", osd="osd.3", host="host1")]
        mock_json_dumps = self._dumped_report(new_osd_tree,
                                              """{"host1": [0, 1]}""")
        mock_json_dumps.assert_called_with(
            {"host1": [2, 3]})
|
Loading…
Reference in New Issue