NRPE: Monitor threads connected to MySQL.

Add a NRPE check to monitor the number of threads connected
to the MySQL database, in proportion to the maximum number of connections.
For this check, a nagios user will be created. This user does not have any
permissions set, does not have access to any database and can only connect
from localhost.
Warning and Critical thresholds (in percentage) can be configured.

Add an action to reset nagios's password. This action could only be run
on the leader unit.

Closes-Bug: #1816759
Change-Id: Id35b0331322c2744a9f839b3eb153eed1bc53aac
This commit is contained in:
Robert Gildein 2020-11-04 18:02:57 +01:00 committed by Alex Kavanagh
parent 2962cc9a9a
commit 11ee1ef563
11 changed files with 296 additions and 6 deletions

View File

@ -38,6 +38,14 @@ to form its cluster. It is best practice to use this option as doing so ensures
that the charm will wait until the cluster is up before accepting relations
from other client applications.
#### `nrpe-threads-connected`
The `nrpe-threads-connected` option sets Warning and Critical thresholds (in percent)
for NRPE check to monitor the number of threads connecting to the MySQL.
If the `nrpe-external-master` relationship is set, a nagios user who does
not have permission and can only connect from localhost is created before
the check is created.
## Deployment
To deploy a single percona-cluster unit:

View File

@ -64,3 +64,8 @@ mysqldump:
description: |
Comma delimited database names to dump. If left unset, all databases
will be dumped.
generate-nagios-password:
description: |
Re-generate the password for the nagios user. This action can only be run on the leader unit,
which sets a new password and then the `leader-settings-changed` hook will be activate.

View File

@ -36,6 +36,7 @@ from charmhelpers.core.hookenv import (
from charmhelpers.core.host import (
CompareHostReleases,
lsb_release,
pwgen,
)
from charmhelpers.contrib.openstack.utils import (
@ -223,6 +224,16 @@ def mysqldump(args):
action_fail("mysqldump failed")
def generate_nagios_password(args):
"""Regenerate nagios password."""
if is_leader():
leader_set({"nagios-password": pwgen()})
percona_utils.set_nagios_user()
action_set({"output": "New password for nagios created successfully."})
else:
action_fail("This action should only take place on the leader unit.")
# A dictionary of all the defined actions to callables (which take
# parsed arguments).
ACTIONS = {"pause": pause, "resume": resume, "backup": backup,
@ -230,7 +241,8 @@ ACTIONS = {"pause": pause, "resume": resume, "backup": backup,
"bootstrap-pxc": bootstrap_pxc,
"notify-bootstrapped": notify_bootstrapped,
"set-pxc-strict-mode": set_pxc_strict_mode,
"mysqldump": mysqldump}
"mysqldump": mysqldump,
"generate-nagios-password": generate_nagios_password}
def main(args):

View File

@ -0,0 +1 @@
actions.py

View File

@ -358,3 +358,10 @@ options:
control message will be sent. The fc_limit defaults to 16 transactions.
This effectively means that this is as far as a given node can be behind
committing transactions from the cluster.
nrpe-threads-connected:
type: string
default: '80,90'
description: |
This configuration option represents the warning and critical percentages
that are used to check the number of threads connected to MySQL.
The value should be written as a string containing two numbers separated by commas.

View File

@ -34,6 +34,7 @@ from charmhelpers.core.hookenv import (
DEBUG,
INFO,
WARNING,
ERROR,
is_leader,
network_get_primary_address,
leader_get,
@ -57,6 +58,7 @@ from charmhelpers.fetch import (
apt_install,
add_source,
SourceConfigError,
filter_installed_packages,
)
from charmhelpers.contrib.peerstorage import (
peer_echo,
@ -151,6 +153,9 @@ from percona_utils import (
delete_replication_user,
list_replication_users,
check_mysql_connection,
set_nagios_user,
get_nrpe_threads_connected_thresholds,
MYSQL_NAGIOS_CREDENTIAL_FILE,
update_source,
ADD_APT_REPOSITORY_FAILED,
)
@ -1035,16 +1040,40 @@ def leader_elected():
'nrpe-external-master-relation-changed')
def update_nrpe_config():
# python-dbus is used by check_upstart_job
apt_install('python-dbus')
# nagios-plugins-contrib add pmp-check-mysql-status check
packages = filter_installed_packages(["python-dbus",
"nagios-plugins-contrib"])
apt_install(packages)
hostname = nrpe.get_nagios_hostname()
current_unit = nrpe.get_nagios_unit_name()
nrpe_setup = nrpe.NRPE(hostname=hostname)
nrpe.add_init_service_checks(nrpe_setup, ['mysql'], current_unit)
nrpe_setup.add_check(
shortname='mysql_proc',
description='Check MySQL process {%s}' % current_unit,
description='Check MySQL process {}'.format(current_unit),
check_cmd='check_procs -c 1:1 -C mysqld'
)
try:
warning_threads, critical_threads = \
get_nrpe_threads_connected_thresholds()
except ValueError as error:
log("failed to get thresholds from nrpe-threads-connected due: "
"{}".format(error), level=ERROR)
log("the default thresholds are used")
warning_threads, critical_threads = 80, 90
set_nagios_user()
nrpe_setup.add_check(
shortname='mysql_threads',
description='Check MySQL connected threads',
check_cmd='pmp-check-mysql-status --defaults-file {credential_file} '
'-x Threads_connected -o / -y max_connections -T pct '
'-w {warning} -c {critical}'.format(
credential_file=MYSQL_NAGIOS_CREDENTIAL_FILE,
warning=warning_threads,
critical=critical_threads)
)
nrpe_setup.write()

View File

@ -97,6 +97,7 @@ ADD_APT_REPOSITORY_FAILED = "add-apt-repository-failed"
# maintenance mode, but is currently not populated as the
# charm_check_function() checks whether the unit is working properly.
REQUIRED_INTERFACES = {}
MYSQL_NAGIOS_CREDENTIAL_FILE = "/etc/nagios/mysql-check.cnf"
class LeaderNoBootstrapUUIDError(Exception):
@ -1053,6 +1054,8 @@ root_password = partial(_get_password, 'root-password')
sst_password = partial(_get_password, 'sst-password')
nagios_password = partial(_get_password, 'nagios-password')
def pxc_installed():
'''Determine whether percona-xtradb-cluster is installed
@ -1744,3 +1747,94 @@ def update_source(source, key=None):
else:
apt_update() # run without retries
log("apt update after adding a new package source")
def get_nrpe_threads_connected_thresholds():
"""This function get and verifies threshold values.
The function confirms these conditions:
Thresholds for `nrpe-threads-connected` should be separable by `,` and
represented by an integer, where the warning threshold should be
in the range [0, 100) and the critical threshold in the range (0, 100).
At the same time, the warning threshold should be lower than
the critical one.
:returns: warning threshold, critical threshold
:rtype: Tuple[int, int]
:raises: ValueError
"""
values = config("nrpe-threads-connected").split(",")
if len(values) != 2:
raise ValueError("the wrong number of values was set for "
"the nrpe-threads-connected")
try:
warning_threshold, critical_threshold = int(values[0]), int(values[1])
except ValueError as error:
raise error
if not (0 <= warning_threshold < 100 and 0 < critical_threshold <= 100):
raise ValueError("the warning threshold must be in the range [0,100) "
"and the critical threshold must be in the range "
"(0,100]")
if warning_threshold >= critical_threshold:
raise ValueError("the warning threshold must be less than critical")
return warning_threshold, critical_threshold
def write_nagios_my_cnf():
"""Create a MySQL configuration file with nagios user credentials."""
context = {
"mysql_user": "nagios", "mysql_passwd": nagios_password(),
}
render("nagios-my.cnf", MYSQL_NAGIOS_CREDENTIAL_FILE, context,
owner="nagios", group="nagios", perms=0o640)
def create_nagios_user():
"""Create MySQL user for nagios to check the status of MySQL.
This user does not have any permissions set, does not have access
to any database and can only connect from localhost.
:raises: OperationalError
"""
m_helper = get_db_helper()
try:
m_helper.connect(password=m_helper.get_mysql_root_password())
except OperationalError:
log("Could not connect to db", level=ERROR)
raise
# NOTE (rgildein): This user has access to the SHOW GLOBAL STATUS
# statement, which does not require any privilege, only the ability to
# connect to the server.
nagios_exists = False
try:
# NOTE (rgildein): This method of creation nagios user should be
# replaced by the `MySQLHelper.create_user` method after
# bug #1905586 has been fixed and merged.
nagios_exists = m_helper.select(
"SELECT EXISTS(SELECT 1 FROM mysql.user WHERE user = 'nagios')"
)
m_helper.execute("CREATE USER 'nagios'@'localhost';")
except OperationalError as error:
if not nagios_exists:
log("Creating user nagios failed due: {}".format(error),
level="ERROR")
raise error
else:
log("User 'nagios'@'localhost' already exists.", level="WARNING")
# NOTE (rgildein): Update the user's password if it has changed.
m_helper.execute("ALTER USER 'nagios'@'localhost' IDENTIFIED BY "
"'{passwd}';".format(passwd=nagios_password()))
def set_nagios_user():
"""Create MySQL user and configuration file with user credentials."""
if is_leader():
create_nagios_user()
write_nagios_my_cnf()

3
templates/nagios-my.cnf Normal file
View File

@ -0,0 +1,3 @@
[client]
user="{{ mysql_user }}"
password="{{ mysql_passwd }}"

View File

@ -105,3 +105,35 @@ class MainTestCase(CharmTestCase):
with mock.patch.dict(actions.ACTIONS, {"foo": dummy_action}):
actions.main(["foo"])
self.assertEqual(dummy_calls, ["Action foo failed: uh oh"])
class NagiosTestCase(CharmTestCase):
def setUp(self):
super(NagiosTestCase, self).setUp(actions,
["action_set",
"action_fail",
"is_leader",
"leader_set",
"pwgen",
])
@patch.object(actions.percona_utils, "set_nagios_user")
def test_generate_nagios_password(self, mock_set_nagios_user):
"""Test regenerate new password for nagios user."""
self.is_leader.return_value = True
self.pwgen.return_value = "1234"
actions.generate_nagios_password([])
self.leader_set.assert_called_once_with({"nagios-password": "1234"})
mock_set_nagios_user.assert_called_once_with()
self.action_set.assert_called_once_with(
{"output": "New password for nagios created successfully."}
)
def test_generate_nagios_password_no_leader(self):
"""Test regenerate new password for nagios user at no leader unit."""
self.is_leader.return_value = False
actions.generate_nagios_password([])
self.action_fail.assert_called_once_with(
"This action should only take place on the leader unit."
)

View File

@ -266,15 +266,45 @@ class TestNRPERelation(CharmTestCase):
def setUp(self):
patch_targets_nrpe = TO_PATCH[:]
patch_targets_nrpe.remove("update_nrpe_config")
patch_targets_nrpe.append("nrpe")
patch_targets_nrpe.append("apt_install")
patch_targets_nrpe.extend(["nrpe",
"apt_install",
"config",
"set_nagios_user"])
CharmTestCase.setUp(self, hooks, patch_targets_nrpe)
def test_mysql_monitored(self):
@mock.patch("percona_utils.config")
@mock.patch("percona_utils.is_leader")
@mock.patch("percona_utils.leader_get")
def test_mysql_monitored(self,
mock_leader_get,
mock_is_leader,
mock_config):
"""The mysql service is monitored by Nagios."""
self.nrpe.get_nagios_unit_name.return_value = "nagios-0"
self.test_config.set("nrpe-threads-connected", "80,90")
mock_config.side_effect = self.test_config.get
mock_is_leader.return_value = True
mock_leader_get.return_value = "1234"
nrpe_setup = mock.MagicMock()
nrpe_setup.add_check = mock.MagicMock()
self.nrpe.NRPE.return_value = nrpe_setup
hooks.update_nrpe_config()
self.nrpe.add_init_service_checks.assert_called_once_with(
mock.ANY, ["mysql"], mock.ANY)
nrpe_setup.add_check.assert_has_calls([
mock.call(
shortname="mysql_proc",
description="Check MySQL process nagios-0",
check_cmd="check_procs -c 1:1 -C mysqld"),
mock.call(
shortname="mysql_threads",
description="Check MySQL connected threads",
check_cmd="pmp-check-mysql-status "
"--defaults-file /etc/nagios/mysql-check.cnf "
"-x Threads_connected -o / -y max_connections "
"-T pct -w 80 -c 90")
])
class TestMasterRelation(CharmTestCase):

View File

@ -17,6 +17,7 @@ class UtilsTests(CharmTestCase):
TO_PATCH = [
'config',
'kv',
'is_leader',
'leader_get',
'log',
'relation_ids',
@ -538,6 +539,74 @@ class UtilsTests(CharmTestCase):
mock_apt_update.assert_not_called()
@mock.patch.object(percona_utils, "get_db_helper")
@mock.patch.object(percona_utils, "write_nagios_my_cnf")
def test_create_nagios_user(self,
mock_create_nagios_mysql_credential,
mock_get_db_helper):
my_mock = mock.Mock()
self.leader_get.return_value = "1234"
self.is_leader.return_value = True
mock_get_db_helper.return_value = my_mock
percona_utils.create_nagios_user()
my_mock.select.assert_called_once_with(
"SELECT EXISTS(SELECT 1 FROM mysql.user WHERE user = 'nagios')"
)
my_mock.execute.assert_has_calls([
mock.call("CREATE USER 'nagios'@'localhost';"),
mock.call("ALTER USER 'nagios'@'localhost' IDENTIFIED BY '1234';"),
])
class OperationalError(Exception):
pass
percona_utils.OperationalError = OperationalError
def mysql_create_user(*args, **kwargs):
raise OperationalError()
my_mock.select.return_value = True
my_mock.execute.side_effect = mysql_create_user
with self.assertRaises(OperationalError):
percona_utils.create_nagios_user()
def test_get_nrpe_threads_connected_thresholds(self):
"""Test function for getting and verifying threshold values."""
self.config.return_value = "a,1,2"
with self.assertRaises(ValueError) as context:
percona_utils.get_nrpe_threads_connected_thresholds()
self.assertEqual(ValueError("the wrong number of values was set "
"for the nrpe-threads-connected"),
context.exception)
self.config.return_value = "a,1"
with self.assertRaises(ValueError) as context:
percona_utils.get_nrpe_threads_connected_thresholds()
self.assertEqual(
ValueError("invalid literal for int() with base 10: 'a'"),
context.exception)
self.config.return_value = "50,200"
with self.assertRaises(ValueError) as context:
percona_utils.get_nrpe_threads_connected_thresholds()
self.assertEqual(ValueError("the warning threshold must be in the "
"range [0,100) and the critical "
"threshold must be in the range "
"(0,100]"),
context.exception)
self.config.return_value = "90,60"
with self.assertRaises(ValueError) as context:
percona_utils.get_nrpe_threads_connected_thresholds()
self.assertEqual(ValueError("the warning threshold must be less "
"than critical"),
context.exception)
self.config.return_value = "80,90"
thresholds = percona_utils.get_nrpe_threads_connected_thresholds()
self.assertEqual(thresholds, (80, 90))
class UtilsTestsStatus(CharmTestCase):