Create NRPE check to verify ceph daemons versions
This NRPE check confirms whether the versions of the cluster daemons have diverged. WARN – any minor version has diverged; WARN – any version is 1 release behind the mon; CRIT – any version is 2 releases behind the mon; CRIT – any version is ahead of the mon. A juju action, 'get-versions-report', is also provided; it gives users a quick way to see the daemon versions running on the cluster hosts. Closes-Bug: #1943628 Change-Id: I41b5c8576dc9cf885fa813a93e6d51e8804eb9d8
This commit is contained in:
parent
b7c774cd8f
commit
dfbda68e1a
@ -4,6 +4,8 @@ resume-health:
|
||||
description: "Resume ceph health operations across the entire ceph cluster"
|
||||
get-health:
|
||||
description: "Output the current cluster health reported by `ceph health`"
|
||||
get-versions-report:
|
||||
description: "Outputs running daemon versions for all cluster members"
|
||||
create-cache-tier:
|
||||
description: "Create a new cache tier"
|
||||
params:
|
||||
|
@ -26,6 +26,11 @@ from charmhelpers.contrib.storage.linux.ceph import pool_set, \
|
||||
set_pool_quota, snapshot_pool, remove_pool_snapshot
|
||||
|
||||
|
||||
class CephReportError(Exception):
    """Critical error raised when a ceph report cannot be produced."""
|
||||
|
||||
|
||||
def list_pools():
|
||||
"""Return a list of all Ceph pools."""
|
||||
try:
|
||||
@ -35,6 +40,52 @@ def list_pools():
|
||||
action_fail(str(e))
|
||||
|
||||
|
||||
def get_versions_report():
    """
    Return a mapping of hosts and their related ceph daemon versions.

    Runs ``ceph node ls`` to list the OSD and MON daemons of every host,
    then asks each daemon for its version with
    ``ceph tell <type>.<id> version``.

    :returns: JSON document (indent=4) mapping each host name to the list
              of versions reported by its OSD and MON daemons
    :raises: CephReportError if the node list or the version of any daemon
             cannot be retrieved
    """
    def _daemon_version(daemon):
        # Query one daemon, named '<type>.<id>', for its version string.
        try:
            output = check_output(
                ['ceph', 'tell', daemon, 'version']).decode('UTF-8')
        except CalledProcessError as e:
            # Same handling for OSDs and MONs: flag the action as failed
            # and keep the CLI error as the cause of the report error.
            action_fail(str(e))
            raise CephReportError(
                "Getting {} version fail".format(daemon)) from e
        return json.loads(output)['version']

    try:
        output = check_output(['ceph', 'node', 'ls']).decode('UTF-8')
    except CalledProcessError as e:
        action_fail(str(e))
        raise CephReportError("Getting nodes list fail") from e
    nodes_list = json.loads(output)

    report = {}
    # OSDs are queried before MONs, so hosts are inserted in that order.
    for daemon_type in ('osd', 'mon'):
        for host, daemon_ids in nodes_list[daemon_type].items():
            versions = report.setdefault(host, [])
            for daemon_id in daemon_ids:
                versions.append(
                    _daemon_version("{}.{}".format(daemon_type, daemon_id)))

    return json.dumps(report, indent=4)
|
||||
|
||||
|
||||
def get_health():
|
||||
"""
|
||||
Returns the output of 'ceph health'.
|
||||
|
1
actions/get-versions-report
Symbolic link
1
actions/get-versions-report
Symbolic link
@ -0,0 +1 @@
|
||||
get_versions_report.py
|
26
actions/get_versions_report.py
Executable file
26
actions/get_versions_report.py
Executable file
@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright 2022 Canonical Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ceph_ops import get_versions_report, CephReportError
|
||||
from charmhelpers.core.hookenv import log, action_set, action_fail
|
||||
|
||||
if __name__ == '__main__':
    # Publish the report as the action result; when the report cannot be
    # built, log the error and mark the action as failed.
    try:
        versions_report = get_versions_report()
        action_set({'message': versions_report})
    except CephReportError as err:
        log(err)
        failure = "get versions report failed with message: {}".format(
            str(err))
        action_fail(failure)
|
@ -86,6 +86,32 @@ def get_ceph_version():
|
||||
return out_version
|
||||
|
||||
|
||||
def get_daemons_versions():
    """
    Uses CLI to get the ceph versions

    Runs ``ceph versions`` and parses the keys of the 'overall' section of
    its JSON output, e.g.
    ``ceph version 16.2.7 (<sha1>) pacific (stable)``.

    :returns: set containing tuple of integers,
              all the different versions encountered in the cluster
    :raises: UnknownError if the command fails or its output cannot be
             parsed
    """
    try:
        tree = subprocess.check_output(['ceph',
                                        'versions']).decode('UTF-8')
    except subprocess.CalledProcessError as e:
        raise UnknownError(
            "UNKNOWN: could not determine OSDs versions, error: {}".format(e))
    try:
        overall = json.loads(tree)['overall']
        # The third space-separated token of every key is the dotted
        # version number; turn it into a tuple of integers.
        return {
            tuple(int(part) for part in version.split(' ')[2].split('.'))
            for version in overall
        }
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # Malformed JSON, a missing 'overall' section or an unexpected
        # version string must be reported as UNKNOWN, not as a traceback.
        raise UnknownError(
            "UNKNOWN: could not parse ceph versions output, "
            "error: {}".format(e))
|
||||
|
||||
|
||||
def get_status_and_messages(status_data):
|
||||
"""
|
||||
Used to get general status of a Ceph cluster as well as a list of
|
||||
@ -135,6 +161,50 @@ def check_ceph_status(args):
|
||||
"""
|
||||
|
||||
status_critical = False
|
||||
# if it is just --check_daemons_versions_consistency,
|
||||
# deal with it and ignore overall health
|
||||
if args.check_daemons_versions_consistency:
|
||||
daemons_versions = get_daemons_versions()
|
||||
# we check that the osds have same versions
|
||||
num_of_versions = len(daemons_versions)
|
||||
if num_of_versions == 1:
|
||||
message_ok = "OK: All versions alligned"
|
||||
return message_ok
|
||||
else:
|
||||
# version diverged
|
||||
# we check if major release are the same
|
||||
# by parsing version number in the daemon_version set
|
||||
# and keeping major version number or coverting the minor
|
||||
# version number if major version is 0
|
||||
num_of_releases = set(map(lambda x: x[0], daemons_versions))
|
||||
if len(num_of_releases) == 1:
|
||||
msg = 'WARNING: Components minor versions diverged.'
|
||||
'Run get-versions-report to know more'
|
||||
raise WarnError(msg)
|
||||
else:
|
||||
# Releases diverged
|
||||
major, _minor, _patch = get_ceph_version()
|
||||
release_versions_diff = list(map(lambda x: major - x,
|
||||
num_of_releases))
|
||||
if max(release_versions_diff) >= 2:
|
||||
msg = "CRITICAL: A component is " \
|
||||
"{} version behind osd leader" \
|
||||
". Run get-versions-report to know more".format(
|
||||
max(release_versions_diff))
|
||||
raise CriticalError(msg)
|
||||
if min(release_versions_diff) <= -1:
|
||||
msg = "CRITICAL: A component is " \
|
||||
"{} version ahead osd leader" \
|
||||
". Run get-versions-report to know more".format(
|
||||
abs(min(release_versions_diff)))
|
||||
raise CriticalError(msg)
|
||||
if max(release_versions_diff) == 1:
|
||||
msg = "WARNING: A component is " \
|
||||
"{} version behind osd leader" \
|
||||
". Run get-versions-report to know more".format(
|
||||
max(release_versions_diff))
|
||||
raise WarnError(msg)
|
||||
|
||||
if args.status_file:
|
||||
check_file_freshness(args.status_file)
|
||||
with open(args.status_file) as f:
|
||||
@ -287,6 +357,11 @@ def parse_args(args):
|
||||
dest='check_num_osds', default=False,
|
||||
action='store_true',
|
||||
help="Check whether all OSDs are up and in")
|
||||
parser.add_argument('--check_daemons_versions_consistency',
|
||||
dest='check_daemons_versions_consistency',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Check all OSDs versions")
|
||||
|
||||
return parser.parse_args(args)
|
||||
|
||||
|
@ -1211,6 +1211,14 @@ def update_nrpe_config():
|
||||
description='Check whether all OSDs are up and in',
|
||||
check_cmd=check_cmd
|
||||
)
|
||||
if is_leader():
    # Spell the option out in full: the shortened
    # '--check_daemons_versions' is only accepted through argparse prefix
    # matching and stops working as soon as another option shares that
    # prefix.
    check_cmd = 'check_ceph_status.py -f {}' \
                ' --check_daemons_versions_consistency'.format(STATUS_FILE)
    nrpe_setup.add_check(
        shortname='ceph_daemons_versions',
        description='Check wheter all ceph daemons versions are alligned',
        check_cmd=check_cmd
    )
|
||||
nrpe_setup.write()
|
||||
|
||||
|
||||
|
35
unit_tests/ceph_ls_node.json
Normal file
35
unit_tests/ceph_ls_node.json
Normal file
@ -0,0 +1,35 @@
|
||||
{
|
||||
"mon": {
|
||||
"juju-c8b0a2-3-lxd-0": [
|
||||
"juju-c8b0a2-3-lxd-0"
|
||||
],
|
||||
"juju-c8b0a2-4-lxd-0": [
|
||||
"juju-c8b0a2-4-lxd-0"
|
||||
],
|
||||
"juju-c8b0a2-5-lxd-0": [
|
||||
"juju-c8b0a2-5-lxd-0"
|
||||
]
|
||||
},
|
||||
"osd": {
|
||||
"aware-bee": [
|
||||
1
|
||||
],
|
||||
"grand-ape": [
|
||||
0
|
||||
],
|
||||
"lucky-muskox": [
|
||||
2
|
||||
]
|
||||
},
|
||||
"mgr": {
|
||||
"juju-c8b0a2-3-lxd-0": [
|
||||
"juju-c8b0a2-3-lxd-0"
|
||||
],
|
||||
"juju-c8b0a2-4-lxd-0": [
|
||||
"juju-c8b0a2-4-lxd-0"
|
||||
],
|
||||
"juju-c8b0a2-5-lxd-0": [
|
||||
"juju-c8b0a2-5-lxd-0"
|
||||
]
|
||||
}
|
||||
}
|
15
unit_tests/ceph_versions_alligned.json
Normal file
15
unit_tests/ceph_versions_alligned.json
Normal file
@ -0,0 +1,15 @@
|
||||
{
|
||||
"mon": {
|
||||
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3
|
||||
},
|
||||
"mgr": {
|
||||
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3
|
||||
},
|
||||
"osd": {
|
||||
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 2
|
||||
},
|
||||
"mds": {},
|
||||
"overall": {
|
||||
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 8
|
||||
}
|
||||
}
|
19
unit_tests/ceph_versions_diverged.json
Normal file
19
unit_tests/ceph_versions_diverged.json
Normal file
@ -0,0 +1,19 @@
|
||||
{
|
||||
"mon": {
|
||||
"ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 1,
|
||||
"ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 2
|
||||
},
|
||||
"mgr": {
|
||||
"ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 3
|
||||
},
|
||||
"osd": {
|
||||
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3,
|
||||
"ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 2
|
||||
},
|
||||
"mds": {},
|
||||
"overall": {
|
||||
"ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 4,
|
||||
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3,
|
||||
"ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 4
|
||||
}
|
||||
}
|
@ -13,6 +13,7 @@
|
||||
import json
|
||||
import sys
|
||||
import unittest.mock as mock
|
||||
from subprocess import CalledProcessError
|
||||
|
||||
from test_utils import CharmTestCase
|
||||
|
||||
@ -53,6 +54,45 @@ class OpsTestCase(CharmTestCase):
|
||||
cmd = ['ceph', 'health']
|
||||
self.check_output.assert_called_once_with(cmd)
|
||||
|
||||
def test_get_version_report_ok(self):
    """The report lists one version per daemon for OSD and MON hosts."""
    version_reply = ('{'
                     '    "version": "16.2.7",'
                     '    "release": "pacific",'
                     '    "release_type": "stable"'
                     '}').encode('UTF-8')

    def _replies():
        # The first call returns the node list fixture; every later call
        # returns the same daemon version reply.
        with open('unit_tests/ceph_ls_node.json') as node_ls:
            first_reply = node_ls.read().encode('UTF-8')
        yield first_reply
        while True:
            yield version_reply

    self.check_output.side_effect = _replies()
    # Indentation matches json.dumps(..., indent=4).
    expected = ('{\n'
                '    "aware-bee": [\n'
                '        "16.2.7"\n'
                '    ],\n'
                '    "grand-ape": [\n'
                '        "16.2.7"\n'
                '    ],\n'
                '    "lucky-muskox": [\n'
                '        "16.2.7"\n'
                '    ],\n'
                '    "juju-c8b0a2-3-lxd-0": [\n'
                '        "16.2.7"\n'
                '    ],\n'
                '    "juju-c8b0a2-4-lxd-0": [\n'
                '        "16.2.7"\n'
                '    ],\n'
                '    "juju-c8b0a2-5-lxd-0": [\n'
                '        "16.2.7"\n'
                '    ]\n'
                '}')
    self.assertEqual(expected, actions.get_versions_report())
|
||||
|
||||
def test_get_version_report_fail(self):
    """A failing CLI call surfaces as CephReportError."""
    self.check_output.side_effect = CalledProcessError(1, 'ceph node ls')
    with self.assertRaises(actions.CephReportError):
        actions.get_versions_report()
|
||||
|
||||
@mock.patch('socket.gethostname')
|
||||
def test_get_quorum_status(self, mock_hostname):
|
||||
mock_hostname.return_value = 'mockhost'
|
||||
|
@ -17,6 +17,7 @@ import os
|
||||
import sys
|
||||
|
||||
from unittest.mock import patch
|
||||
from subprocess import CalledProcessError
|
||||
|
||||
# import the module we want to test
|
||||
os.sys.path.insert(1, os.path.join(sys.path[0], 'files/nagios'))
|
||||
@ -25,6 +26,90 @@ import check_ceph_status
|
||||
|
||||
@patch('subprocess.check_output')
|
||||
class NagiosTestCase(unittest.TestCase):
|
||||
def test_get_daemons_versions_alligned(self, mock_subprocess):
    """A cluster running one version yields a single-element set."""
    with open('unit_tests/ceph_versions_alligned.json', 'rb') as fixture:
        mock_subprocess.return_value = fixture.read()
    versions = check_ceph_status.get_daemons_versions()
    self.assertEqual(versions, {(16, 2, 7)})
|
||||
|
||||
def test_get_daemons_versions_diverged(self, mock_subprocess):
    """Every distinct version of the 'overall' section is returned."""
    with open('unit_tests/ceph_versions_diverged.json', 'rb') as fixture:
        mock_subprocess.return_value = fixture.read()
    versions = check_ceph_status.get_daemons_versions()
    expected = {(15, 2, 16), (16, 2, 7), (17, 2, 0)}
    self.assertEqual(versions, expected)
|
||||
|
||||
def test_get_daemons_versions_exeption(self, mock_subprocess):
    """A failing 'ceph versions' call is reported as UnknownError."""
    mock_subprocess.side_effect = CalledProcessError(1, 'ceph versions')
    with self.assertRaises(check_ceph_status.UnknownError):
        check_ceph_status.get_daemons_versions()
|
||||
|
||||
# All daemons on the same version
@patch('check_ceph_status.get_daemons_versions')
def test_versions_alligned(self, mock_daemons_versions, mock_subprocess):
    """One distinct daemon version produces the OK message."""
    mock_subprocess.return_value = (
        'ceph version 16.2.7 '
        '(dd0603118f56ab514f133c8d2e3adfc983942503)').encode('UTF-8')
    mock_daemons_versions.return_value = {(16, 2, 7)}
    cli = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli)
    result = check_ceph_status.check_ceph_status(args)
    self.assertRegex(result, r"^OK: All versions alligned$")
|
||||
|
||||
# Same major release, different minor versions
@patch('check_ceph_status.get_daemons_versions')
def test_min_versions_diverged(self, mock_daemons_versions,
                               mock_subprocess):
    """Versions differing only below the major number raise WarnError."""
    mock_subprocess.return_value = (
        'ceph version 16.2.7 '
        '(dd0603118f56ab514f133c8d2e3adfc983942503)').encode('UTF-8')
    mock_daemons_versions.return_value = {(16, 2, 7), (16, 1, 7)}
    cli = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli)
    with self.assertRaises(check_ceph_status.WarnError):
        check_ceph_status.check_ceph_status(args)
|
||||
|
||||
# A daemon one major release ahead of the mocked 16.2.7
@patch('check_ceph_status.get_daemons_versions')
def test_one_version_ahead(self, mock_daemons_versions, mock_subprocess):
    """A daemon on release 17 against 16.2.7 raises CriticalError."""
    mock_subprocess.return_value = (
        'ceph version 16.2.7 '
        '(dd0603118f56ab514f133c8d2e3adfc983942503)').encode('UTF-8')
    mock_daemons_versions.return_value = {(16, 2, 7), (17, 2, 0)}
    cli = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli)
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(args)
|
||||
|
||||
# A daemon two major releases ahead of the mocked 15.2.16
@patch('check_ceph_status.get_daemons_versions')
def test_two_version_ahead(self, mock_daemons_versions, mock_subprocess):
    """A daemon on release 17 against 15.2.16 raises CriticalError."""
    mock_subprocess.return_value = (
        'ceph version 15.2.16 '
        '(d46a73d6d0a67a79558054a3a5a72cb561724974)').encode('UTF-8')
    mock_daemons_versions.return_value = {(15, 2, 16), (17, 2, 0)}
    cli = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli)
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(args)
|
||||
|
||||
# A daemon one major release behind the mocked 16.2.7
@patch('check_ceph_status.get_daemons_versions')
def test_version_behind(self, mock_daemons_versions, mock_subprocess):
    """A daemon on release 15 against 16.2.7 raises WarnError."""
    mock_subprocess.return_value = (
        'ceph version 16.2.7 '
        '(dd0603118f56ab514f133c8d2e3adfc983942503)').encode('UTF-8')
    mock_daemons_versions.return_value = {(15, 2, 16), (16, 2, 7)}
    cli = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli)
    with self.assertRaises(check_ceph_status.WarnError):
        check_ceph_status.check_ceph_status(args)
|
||||
|
||||
# A daemon two major releases behind the mocked 17.2.0
@patch('check_ceph_status.get_daemons_versions')
def test_two_version_behind(self, mock_daemons_versions, mock_subprocess):
    """A daemon on release 15 against 17.2.0 raises CriticalError."""
    mock_subprocess.return_value = (
        'ceph version 17.2.0 '
        '(43e2e60a7559d3f46c9d53f1ca875fd499a1e35e)').encode('UTF-8')
    mock_daemons_versions.return_value = {(15, 2, 16), (17, 2, 0)}
    cli = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli)
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(args)
|
||||
|
||||
def test_get_ceph_version(self, mock_subprocess):
|
||||
mock_subprocess.return_value = 'ceph version 10.2.9 ' \
|
||||
|
Loading…
Reference in New Issue
Block a user