Create NRPE check to verify ceph daemons versions

This NRPE check confirms if the versions of cluster daemons are divergent.

WARN - any minor version diverged
WARN – any versions are 1 release behind the mon
CRIT – any versions are 2 releases behind the mon
CRIT – any versions are ahead of the mon

A juju action, 'get-versions-report', is also provided.
It gives users a quick way to see the daemon versions
running on the cluster hosts.

Closes-Bug: #1943628
Change-Id: I41b5c8576dc9cf885fa813a93e6d51e8804eb9d8
This commit is contained in:
Hicham El Gharbi 2022-07-19 12:18:06 +02:00
parent b7c774cd8f
commit dfbda68e1a
11 changed files with 357 additions and 0 deletions

View File

@ -4,6 +4,8 @@ resume-health:
description: "Resume ceph health operations across the entire ceph cluster"
get-health:
description: "Output the current cluster health reported by `ceph health`"
get-versions-report:
description: "Outputs running daemon versions for all cluster members"
create-cache-tier:
description: "Create a new cache tier"
params:

View File

@ -26,6 +26,11 @@ from charmhelpers.contrib.storage.linux.ceph import pool_set, \
set_pool_quota, snapshot_pool, remove_pool_snapshot
class CephReportError(Exception):
    """Critical error raised when a ceph report cannot be produced.

    Raised by get_versions_report() when a ceph command it depends on fails.
    """
def list_pools():
"""Return a list of all Ceph pools."""
try:
@ -35,6 +40,52 @@ def list_pools():
action_fail(str(e))
def _get_daemon_version(daemon_type, daemon_id):
    """Return the version reported by one daemon via ``ceph tell``.

    :param daemon_type: daemon kind used as the name prefix ('osd' or 'mon').
    :param daemon_id: identifier of the daemon as listed by ``ceph node ls``.
    :returns: value of the 'version' field of the daemon's JSON answer.
    :raises CephReportError: if the ``ceph tell`` command fails.
    """
    daemon = "{}.{}".format(daemon_type, daemon_id)
    try:
        output = check_output(['ceph', 'tell', daemon,
                               'version']).decode('UTF-8')
    except CalledProcessError as e:
        # Every failure path marks the action as failed, then raises with
        # the original error chained for easier debugging.
        action_fail(str(e))
        raise CephReportError(
            "Getting {} version fail".format(daemon)) from e
    return json.loads(output)['version']


def get_versions_report():
    """Return a JSON report of the ceph versions run by each cluster host.

    The hosts and their daemons are read from ``ceph node ls``; every OSD
    and MON listed there is then asked for its version.

    :returns: JSON document (4-space indent) mapping each host name to the
        list of versions reported by the OSD and MON daemons it runs.
    :raises CephReportError: if the node list or any daemon version cannot
        be retrieved.
    """
    try:
        output = check_output(['ceph', 'node', 'ls']).decode('UTF-8')
    except CalledProcessError as e:
        action_fail(str(e))
        raise CephReportError("Getting nodes list fail") from e
    nodes_list = json.loads(output)

    report = {}
    # OSD hosts are handled before MON hosts so the key order of the
    # resulting report stays the same as before.
    for daemon_type in ('osd', 'mon'):
        for host, daemons in nodes_list[daemon_type].items():
            versions = report.setdefault(host, [])
            for daemon_id in daemons:
                versions.append(_get_daemon_version(daemon_type, daemon_id))
    return json.dumps(report, indent=4)
def get_health():
"""
Returns the output of 'ceph health'.

1
actions/get-versions-report Symbolic link
View File

@ -0,0 +1 @@
get_versions_report.py

26
actions/get_versions_report.py Executable file
View File

@ -0,0 +1,26 @@
#!/usr/bin/env python3
#
# Copyright 2022 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ceph_ops import get_versions_report, CephReportError
from charmhelpers.core.hookenv import log, action_set, action_fail
if __name__ == '__main__':
    # Publish the versions report as the action's 'message' result; if the
    # report cannot be built, log the error and mark the action as failed.
    try:
        versions_report = get_versions_report()
        action_set({'message': versions_report})
    except CephReportError as err:
        log(err)
        failure = "get versions report failed with message: {}".format(
            str(err))
        action_fail(failure)

View File

@ -86,6 +86,32 @@ def get_ceph_version():
return out_version
def get_daemons_versions():
    """
    Collect every distinct ceph version seen in the cluster.

    Runs ``ceph versions`` and parses the keys of the 'overall' section of
    its JSON output.

    :returns: set of tuples of integers, one tuple per distinct version
    :raises: UnknownError
    """
    try:
        raw_versions = subprocess.check_output(['ceph', 'versions'])
    except subprocess.CalledProcessError as e:
        raise UnknownError(
            "UNKNOWN: could not determine OSDs versions, error: {}".format(e))
    overall = json.loads(raw_versions.decode('UTF-8'))['overall']

    versions = set()
    for description in overall:
        # keys look like "ceph version 16.2.7 (<sha>) pacific (stable)":
        # the third space-separated word is the dotted version number
        dotted_number = description.split(' ')[2]
        versions.add(tuple(int(part) for part in dotted_number.split('.')))
    return versions
def get_status_and_messages(status_data):
"""
Used to get general status of a Ceph cluster as well as a list of
@ -135,6 +161,50 @@ def check_ceph_status(args):
"""
status_critical = False
# if it is just --check_daemons_versions_consistency,
# deal with it and ignore overall health
if args.check_daemons_versions_consistency:
daemons_versions = get_daemons_versions()
# we check that the osds have same versions
num_of_versions = len(daemons_versions)
if num_of_versions == 1:
message_ok = "OK: All versions alligned"
return message_ok
else:
# version diverged
# we check if major release are the same
# by parsing version number in the daemon_version set
# and keeping major version number or coverting the minor
# version number if major version is 0
num_of_releases = set(map(lambda x: x[0], daemons_versions))
if len(num_of_releases) == 1:
msg = 'WARNING: Components minor versions diverged.'
'Run get-versions-report to know more'
raise WarnError(msg)
else:
# Releases diverged
major, _minor, _patch = get_ceph_version()
release_versions_diff = list(map(lambda x: major - x,
num_of_releases))
if max(release_versions_diff) >= 2:
msg = "CRITICAL: A component is " \
"{} version behind osd leader" \
". Run get-versions-report to know more".format(
max(release_versions_diff))
raise CriticalError(msg)
if min(release_versions_diff) <= -1:
msg = "CRITICAL: A component is " \
"{} version ahead osd leader" \
". Run get-versions-report to know more".format(
abs(min(release_versions_diff)))
raise CriticalError(msg)
if max(release_versions_diff) == 1:
msg = "WARNING: A component is " \
"{} version behind osd leader" \
". Run get-versions-report to know more".format(
max(release_versions_diff))
raise WarnError(msg)
if args.status_file:
check_file_freshness(args.status_file)
with open(args.status_file) as f:
@ -287,6 +357,11 @@ def parse_args(args):
dest='check_num_osds', default=False,
action='store_true',
help="Check whether all OSDs are up and in")
parser.add_argument('--check_daemons_versions_consistency',
dest='check_daemons_versions_consistency',
default=False,
action='store_true',
help="Check all OSDs versions")
return parser.parse_args(args)

View File

@ -1211,6 +1211,14 @@ def update_nrpe_config():
description='Check whether all OSDs are up and in',
check_cmd=check_cmd
)
if is_leader():
    # Register the daemons versions check on the leader unit only.
    # The option is spelled out in full, exactly as check_ceph_status.py
    # declares it, rather than relying on argparse prefix abbreviation.
    check_cmd = 'check_ceph_status.py -f {}' \
                ' --check_daemons_versions_consistency'.format(STATUS_FILE)
    nrpe_setup.add_check(
        shortname='ceph_daemons_versions',
        description='Check wheter all ceph daemons versions are alligned',
        check_cmd=check_cmd
    )
nrpe_setup.write()

View File

@ -0,0 +1,35 @@
{
"mon": {
"juju-c8b0a2-3-lxd-0": [
"juju-c8b0a2-3-lxd-0"
],
"juju-c8b0a2-4-lxd-0": [
"juju-c8b0a2-4-lxd-0"
],
"juju-c8b0a2-5-lxd-0": [
"juju-c8b0a2-5-lxd-0"
]
},
"osd": {
"aware-bee": [
1
],
"grand-ape": [
0
],
"lucky-muskox": [
2
]
},
"mgr": {
"juju-c8b0a2-3-lxd-0": [
"juju-c8b0a2-3-lxd-0"
],
"juju-c8b0a2-4-lxd-0": [
"juju-c8b0a2-4-lxd-0"
],
"juju-c8b0a2-5-lxd-0": [
"juju-c8b0a2-5-lxd-0"
]
}
}

View File

@ -0,0 +1,15 @@
{
"mon": {
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3
},
"mgr": {
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3
},
"osd": {
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 2
},
"mds": {},
"overall": {
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 8
}
}

View File

@ -0,0 +1,19 @@
{
"mon": {
"ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 1,
"ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 2
},
"mgr": {
"ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 3
},
"osd": {
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3,
"ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 2
},
"mds": {},
"overall": {
"ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 4,
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3,
"ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 4
}
}

View File

@ -13,6 +13,7 @@
import json
import sys
import unittest.mock as mock
from subprocess import CalledProcessError
from test_utils import CharmTestCase
@ -53,6 +54,45 @@ class OpsTestCase(CharmTestCase):
cmd = ['ceph', 'health']
self.check_output.assert_called_once_with(cmd)
def test_get_version_report_ok(self):
    """The report lists the version of every OSD and MON, keyed by host.

    The first mocked command output is the `ceph node ls` fixture; every
    later call returns the same 16.2.7 version payload.
    """
    def _call_rslt():
        with open('unit_tests/ceph_ls_node.json') as f:
            tree = f.read()
        yield tree.encode('UTF-8')
        while True:
            yield ('{'
                   '  "version": "16.2.7",'
                   '  "release": "pacific",'
                   '  "release_type": "stable"'
                   '}').encode('UTF-8')
    self.check_output.side_effect = _call_rslt()
    result = actions.get_versions_report()
    # get_versions_report() serialises with json.dumps(indent=4), so host
    # keys are indented by 4 spaces and their versions by 8.
    self.assertEqual('{\n'
                     '    "aware-bee": [\n'
                     '        "16.2.7"\n'
                     '    ],\n'
                     '    "grand-ape": [\n'
                     '        "16.2.7"\n'
                     '    ],\n'
                     '    "lucky-muskox": [\n'
                     '        "16.2.7"\n'
                     '    ],\n'
                     '    "juju-c8b0a2-3-lxd-0": [\n'
                     '        "16.2.7"\n'
                     '    ],\n'
                     '    "juju-c8b0a2-4-lxd-0": [\n'
                     '        "16.2.7"\n'
                     '    ],\n'
                     '    "juju-c8b0a2-5-lxd-0": [\n'
                     '        "16.2.7"\n'
                     '    ]\n'
                     '}', result)
def test_get_version_report_fail(self):
    """A failing ceph command surfaces as a CephReportError."""
    self.check_output.side_effect = CalledProcessError(1, 'ceph node ls')
    with self.assertRaises(actions.CephReportError):
        actions.get_versions_report()
@mock.patch('socket.gethostname')
def test_get_quorum_status(self, mock_hostname):
mock_hostname.return_value = 'mockhost'

View File

@ -17,6 +17,7 @@ import os
import sys
from unittest.mock import patch
from subprocess import CalledProcessError
# import the module we want to test
os.sys.path.insert(1, os.path.join(sys.path[0], 'files/nagios'))
@ -25,6 +26,90 @@ import check_ceph_status
@patch('subprocess.check_output')
class NagiosTestCase(unittest.TestCase):
def test_get_daemons_versions_alligned(self, mock_subprocess):
    """The single-version fixture yields a one-element set."""
    with open('unit_tests/ceph_versions_alligned.json', 'rb') as fixture:
        mock_subprocess.return_value = fixture.read()
    found_versions = check_ceph_status.get_daemons_versions()
    self.assertEqual(found_versions, {(16, 2, 7)})
def test_get_daemons_versions_diverged(self, mock_subprocess):
    """The mixed-version fixture yields one tuple per distinct version."""
    with open('unit_tests/ceph_versions_diverged.json', 'rb') as fixture:
        mock_subprocess.return_value = fixture.read()
    found_versions = check_ceph_status.get_daemons_versions()
    expected_versions = {(16, 2, 7), (17, 2, 0), (15, 2, 16)}
    self.assertEqual(found_versions, expected_versions)
def test_get_daemons_versions_exeption(self, mock_subprocess):
    """A failing `ceph versions` command is reported as UnknownError."""
    mock_subprocess.side_effect = CalledProcessError(1, 'ceph versions')
    with self.assertRaises(check_ceph_status.UnknownError):
        check_ceph_status.get_daemons_versions()
# Versions aligned
@patch('check_ceph_status.get_daemons_versions')
def test_versions_alligned(self, mock_daemons_versions, mock_subprocess):
    """A single cluster-wide version produces the OK message."""
    mock_subprocess.return_value = (
        b'ceph version 16.2.7 '
        b'(dd0603118f56ab514f133c8d2e3adfc983942503)')
    mock_daemons_versions.return_value = {(16, 2, 7)}
    cli_args = check_ceph_status.parse_args(
        ['--check_daemons_versions_consistency'])
    status_line = check_ceph_status.check_ceph_status(cli_args)
    self.assertRegex(status_line, r"^OK: All versions alligned$")
# Minor version diverged
@patch('check_ceph_status.get_daemons_versions')
def test_min_versions_diverged(self, mock_daemons_versions,
                               mock_subprocess):
    """Two versions of the same release (16.x) raise WarnError."""
    mock_subprocess.return_value = (
        b'ceph version 16.2.7 '
        b'(dd0603118f56ab514f133c8d2e3adfc983942503)')
    mock_daemons_versions.return_value = {(16, 2, 7), (16, 1, 7)}
    cli_args = check_ceph_status.parse_args(
        ['--check_daemons_versions_consistency'])
    with self.assertRaises(check_ceph_status.WarnError):
        check_ceph_status.check_ceph_status(cli_args)
# Major version ahead
@patch('check_ceph_status.get_daemons_versions')
def test_one_version_ahead(self, mock_daemons_versions, mock_subprocess):
    """Release 17 in the cluster, with mocked output 16.2.7, is critical."""
    mock_subprocess.return_value = (
        b'ceph version 16.2.7 '
        b'(dd0603118f56ab514f133c8d2e3adfc983942503)')
    mock_daemons_versions.return_value = {(16, 2, 7), (17, 2, 0)}
    cli_args = check_ceph_status.parse_args(
        ['--check_daemons_versions_consistency'])
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(cli_args)
# Two major versions ahead
@patch('check_ceph_status.get_daemons_versions')
def test_two_version_ahead(self, mock_daemons_versions, mock_subprocess):
    """Release 17 in the cluster, with mocked output 15.2.16, is critical."""
    mock_subprocess.return_value = (
        b'ceph version 15.2.16 '
        b'(d46a73d6d0a67a79558054a3a5a72cb561724974)')
    mock_daemons_versions.return_value = {(15, 2, 16), (17, 2, 0)}
    cli_args = check_ceph_status.parse_args(
        ['--check_daemons_versions_consistency'])
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(cli_args)
# Major version behind
@patch('check_ceph_status.get_daemons_versions')
def test_version_behind(self, mock_daemons_versions, mock_subprocess):
    """Release 15 in the cluster, with mocked output 16.2.7, is a warning."""
    mock_subprocess.return_value = (
        b'ceph version 16.2.7 '
        b'(dd0603118f56ab514f133c8d2e3adfc983942503)')
    mock_daemons_versions.return_value = {(15, 2, 16), (16, 2, 7)}
    cli_args = check_ceph_status.parse_args(
        ['--check_daemons_versions_consistency'])
    with self.assertRaises(check_ceph_status.WarnError):
        check_ceph_status.check_ceph_status(cli_args)
# Two major versions behind
@patch('check_ceph_status.get_daemons_versions')
def test_two_version_behind(self, mock_daemons_versions, mock_subprocess):
    """Release 15 in the cluster, with mocked output 17.2.0, is critical."""
    mock_subprocess.return_value = (
        b'ceph version 17.2.0 '
        b'(43e2e60a7559d3f46c9d53f1ca875fd499a1e35e)')
    mock_daemons_versions.return_value = {(15, 2, 16), (17, 2, 0)}
    cli_args = check_ceph_status.parse_args(
        ['--check_daemons_versions_consistency'])
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(cli_args)
def test_get_ceph_version(self, mock_subprocess):
mock_subprocess.return_value = 'ceph version 10.2.9 ' \