Juju Charm - Ceph OSD
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

nrpe.py 16KB


  1. # Copyright 2014-2015 Canonical Limited.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Compatibility with the nrpe-external-master charm"""
  15. # Copyright 2012 Canonical Ltd.
  16. #
  17. # Authors:
  18. # Matthew Wedgwood <matthew.wedgwood@canonical.com>
  19. import subprocess
  20. import pwd
  21. import grp
  22. import os
  23. import glob
  24. import shutil
  25. import re
  26. import shlex
  27. import yaml
  28. from charmhelpers.core.hookenv import (
  29. config,
  30. hook_name,
  31. local_unit,
  32. log,
  33. relation_ids,
  34. relation_set,
  35. relations_of_type,
  36. )
  37. from charmhelpers.core.host import service
  38. from charmhelpers.core import host
  39. # This module adds compatibility with the nrpe-external-master and plain nrpe
  40. # subordinate charms. To use it in your charm:
  41. #
  42. # 1. Update metadata.yaml
  43. #
  44. #    provides:
  45. #      (...)
  46. #      nrpe-external-master:
  47. #        interface: nrpe-external-master
  48. #        scope: container
  49. #
  50. #    and/or
  51. #
  52. #    provides:
  53. #      (...)
  54. #      local-monitors:
  55. #        interface: local-monitors
  56. #        scope: container
  57. #
  58. # 2. Add the following to config.yaml
  59. #
  60. #    nagios_context:
  61. #      default: "juju"
  62. #      type: string
  63. #      description: |
  64. #        Used by the nrpe subordinate charms.
  65. #        A string that will be prepended to instance name to set the host name
  66. #        in nagios. So for instance the hostname would be something like:
  67. #            juju-myservice-0
  68. #        If you're running multiple environments with the same services in them
  69. #        this allows you to differentiate between them.
  70. #    nagios_servicegroups:
  71. #      default: ""
  72. #      type: string
  73. #      description: |
  74. #        A comma-separated list of nagios servicegroups.
  75. #        If left empty, the nagios_context will be used as the servicegroup.
  76. #
  77. # 3. Add custom checks (Nagios plugins) to files/nrpe-external-master
  78. #
  79. # 4. Update your hooks.py with something like this:
  80. #
  81. #    from charmsupport.nrpe import NRPE
  82. #    (...)
  83. #    def update_nrpe_config():
  84. #        nrpe_compat = NRPE()
  85. #        nrpe_compat.add_check(
  86. #            shortname = "myservice",
  87. #            description = "Check MyService",
  88. #            check_cmd = "check_http -w 2 -c 10 http://localhost"
  89. #            )
  90. #        nrpe_compat.add_check(
  91. #            "myservice_other",
  92. #            "Check for widget failures",
  93. #            check_cmd = "/srv/myapp/scripts/widget_check"
  94. #            )
  95. #        nrpe_compat.write()
  96. #
  97. #    def config_changed():
  98. #        (...)
  99. #        update_nrpe_config()
  100. #
  101. #    def nrpe_external_master_relation_changed():
  102. #        update_nrpe_config()
  103. #
  104. #    def local_monitors_relation_changed():
  105. #        update_nrpe_config()
  106. #
  107. # 4.a If your charm is a subordinate charm, set primary=False
  108. #
  109. #    from charmsupport.nrpe import NRPE
  110. #    (...)
  111. #    def update_nrpe_config():
  112. #        nrpe_compat = NRPE(primary=False)
  113. #
  114. # 5. ln -s hooks.py nrpe-external-master-relation-changed
  115. #    ln -s hooks.py local-monitors-relation-changed
  116. class CheckException(Exception):
  117. pass
  118. class Check(object):
  119. shortname_re = '[A-Za-z0-9-_.]+$'
  120. service_template = ("""
  121. #---------------------------------------------------
  122. # This file is Juju managed
  123. #---------------------------------------------------
  124. define service {{
  125. use active-service
  126. host_name {nagios_hostname}
  127. service_description {nagios_hostname}[{shortname}] """
  128. """{description}
  129. check_command check_nrpe!{command}
  130. servicegroups {nagios_servicegroup}
  131. }}
  132. """)
  133. def __init__(self, shortname, description, check_cmd):
  134. super(Check, self).__init__()
  135. # XXX: could be better to calculate this from the service name
  136. if not re.match(self.shortname_re, shortname):
  137. raise CheckException("shortname must match {}".format(
  138. Check.shortname_re))
  139. self.shortname = shortname
  140. self.command = "check_{}".format(shortname)
  141. # Note: a set of invalid characters is defined by the
  142. # Nagios server config
  143. # The default is: illegal_object_name_chars=`~!$%^&*"|'<>?,()=
  144. self.description = description
  145. self.check_cmd = self._locate_cmd(check_cmd)
  146. def _get_check_filename(self):
  147. return os.path.join(NRPE.nrpe_confdir, '{}.cfg'.format(self.command))
  148. def _get_service_filename(self, hostname):
  149. return os.path.join(NRPE.nagios_exportdir,
  150. 'service__{}_{}.cfg'.format(hostname, self.command))
  151. def _locate_cmd(self, check_cmd):
  152. search_path = (
  153. '/usr/lib/nagios/plugins',
  154. '/usr/local/lib/nagios/plugins',
  155. )
  156. parts = shlex.split(check_cmd)
  157. for path in search_path:
  158. if os.path.exists(os.path.join(path, parts[0])):
  159. command = os.path.join(path, parts[0])
  160. if len(parts) > 1:
  161. command += " " + " ".join(parts[1:])
  162. return command
  163. log('Check command not found: {}'.format(parts[0]))
  164. return ''
  165. def _remove_service_files(self):
  166. if not os.path.exists(NRPE.nagios_exportdir):
  167. return
  168. for f in os.listdir(NRPE.nagios_exportdir):
  169. if f.endswith('_{}.cfg'.format(self.command)):
  170. os.remove(os.path.join(NRPE.nagios_exportdir, f))
  171. def remove(self, hostname):
  172. nrpe_check_file = self._get_check_filename()
  173. if os.path.exists(nrpe_check_file):
  174. os.remove(nrpe_check_file)
  175. self._remove_service_files()
  176. def write(self, nagios_context, hostname, nagios_servicegroups):
  177. nrpe_check_file = self._get_check_filename()
  178. with open(nrpe_check_file, 'w') as nrpe_check_config:
  179. nrpe_check_config.write("# check {}\n".format(self.shortname))
  180. if nagios_servicegroups:
  181. nrpe_check_config.write(
  182. "# The following header was added automatically by juju\n")
  183. nrpe_check_config.write(
  184. "# Modifying it will affect nagios monitoring and alerting\n")
  185. nrpe_check_config.write(
  186. "# servicegroups: {}\n".format(nagios_servicegroups))
  187. nrpe_check_config.write("command[{}]={}\n".format(
  188. self.command, self.check_cmd))
  189. if not os.path.exists(NRPE.nagios_exportdir):
  190. log('Not writing service config as {} is not accessible'.format(
  191. NRPE.nagios_exportdir))
  192. else:
  193. self.write_service_config(nagios_context, hostname,
  194. nagios_servicegroups)
  195. def write_service_config(self, nagios_context, hostname,
  196. nagios_servicegroups):
  197. self._remove_service_files()
  198. templ_vars = {
  199. 'nagios_hostname': hostname,
  200. 'nagios_servicegroup': nagios_servicegroups,
  201. 'description': self.description,
  202. 'shortname': self.shortname,
  203. 'command': self.command,
  204. }
  205. nrpe_service_text = Check.service_template.format(**templ_vars)
  206. nrpe_service_file = self._get_service_filename(hostname)
  207. with open(nrpe_service_file, 'w') as nrpe_service_config:
  208. nrpe_service_config.write(str(nrpe_service_text))
  209. def run(self):
  210. subprocess.call(self.check_cmd)
  211. class NRPE(object):
  212. nagios_logdir = '/var/log/nagios'
  213. nagios_exportdir = '/var/lib/nagios/export'
  214. nrpe_confdir = '/etc/nagios/nrpe.d'
  215. homedir = '/var/lib/nagios' # home dir provided by nagios-nrpe-server
  216. def __init__(self, hostname=None, primary=True):
  217. super(NRPE, self).__init__()
  218. self.config = config()
  219. self.primary = primary
  220. self.nagios_context = self.config['nagios_context']
  221. if 'nagios_servicegroups' in self.config and self.config['nagios_servicegroups']:
  222. self.nagios_servicegroups = self.config['nagios_servicegroups']
  223. else:
  224. self.nagios_servicegroups = self.nagios_context
  225. self.unit_name = local_unit().replace('/', '-')
  226. if hostname:
  227. self.hostname = hostname
  228. else:
  229. nagios_hostname = get_nagios_hostname()
  230. if nagios_hostname:
  231. self.hostname = nagios_hostname
  232. else:
  233. self.hostname = "{}-{}".format(self.nagios_context, self.unit_name)
  234. self.checks = []
  235. # Iff in an nrpe-external-master relation hook, set primary status
  236. relation = relation_ids('nrpe-external-master')
  237. if relation:
  238. log("Setting charm primary status {}".format(primary))
  239. for rid in relation_ids('nrpe-external-master'):
  240. relation_set(relation_id=rid, relation_settings={'primary': self.primary})
  241. def add_check(self, *args, **kwargs):
  242. self.checks.append(Check(*args, **kwargs))
  243. def remove_check(self, *args, **kwargs):
  244. if kwargs.get('shortname') is None:
  245. raise ValueError('shortname of check must be specified')
  246. # Use sensible defaults if they're not specified - these are not
  247. # actually used during removal, but they're required for constructing
  248. # the Check object; check_disk is chosen because it's part of the
  249. # nagios-plugins-basic package.
  250. if kwargs.get('check_cmd') is None:
  251. kwargs['check_cmd'] = 'check_disk'
  252. if kwargs.get('description') is None:
  253. kwargs['description'] = ''
  254. check = Check(*args, **kwargs)
  255. check.remove(self.hostname)
  256. def write(self):
  257. try:
  258. nagios_uid = pwd.getpwnam('nagios').pw_uid
  259. nagios_gid = grp.getgrnam('nagios').gr_gid
  260. except Exception:
  261. log("Nagios user not set up, nrpe checks not updated")
  262. return
  263. if not os.path.exists(NRPE.nagios_logdir):
  264. os.mkdir(NRPE.nagios_logdir)
  265. os.chown(NRPE.nagios_logdir, nagios_uid, nagios_gid)
  266. nrpe_monitors = {}
  267. monitors = {"monitors": {"remote": {"nrpe": nrpe_monitors}}}
  268. for nrpecheck in self.checks:
  269. nrpecheck.write(self.nagios_context, self.hostname,
  270. self.nagios_servicegroups)
  271. nrpe_monitors[nrpecheck.shortname] = {
  272. "command": nrpecheck.command,
  273. }
  274. # update-status hooks are configured to firing every 5 minutes by
  275. # default. When nagios-nrpe-server is restarted, the nagios server
  276. # reports checks failing causing unneccessary alerts. Let's not restart
  277. # on update-status hooks.
  278. if not hook_name() == 'update-status':
  279. service('restart', 'nagios-nrpe-server')
  280. monitor_ids = relation_ids("local-monitors") + \
  281. relation_ids("nrpe-external-master")
  282. for rid in monitor_ids:
  283. relation_set(relation_id=rid, monitors=yaml.dump(monitors))
  284. def get_nagios_hostcontext(relation_name='nrpe-external-master'):
  285. """
  286. Query relation with nrpe subordinate, return the nagios_host_context
  287. :param str relation_name: Name of relation nrpe sub joined to
  288. """
  289. for rel in relations_of_type(relation_name):
  290. if 'nagios_host_context' in rel:
  291. return rel['nagios_host_context']
  292. def get_nagios_hostname(relation_name='nrpe-external-master'):
  293. """
  294. Query relation with nrpe subordinate, return the nagios_hostname
  295. :param str relation_name: Name of relation nrpe sub joined to
  296. """
  297. for rel in relations_of_type(relation_name):
  298. if 'nagios_hostname' in rel:
  299. return rel['nagios_hostname']
  300. def get_nagios_unit_name(relation_name='nrpe-external-master'):
  301. """
  302. Return the nagios unit name prepended with host_context if needed
  303. :param str relation_name: Name of relation nrpe sub joined to
  304. """
  305. host_context = get_nagios_hostcontext(relation_name)
  306. if host_context:
  307. unit = "%s:%s" % (host_context, local_unit())
  308. else:
  309. unit = local_unit()
  310. return unit
  311. def add_init_service_checks(nrpe, services, unit_name, immediate_check=True):
  312. """
  313. Add checks for each service in list
  314. :param NRPE nrpe: NRPE object to add check to
  315. :param list services: List of services to check
  316. :param str unit_name: Unit name to use in check description
  317. :param bool immediate_check: For sysv init, run the service check immediately
  318. """
  319. for svc in services:
  320. # Don't add a check for these services from neutron-gateway
  321. if svc in ['ext-port', 'os-charm-phy-nic-mtu']:
  322. next
  323. upstart_init = '/etc/init/%s.conf' % svc
  324. sysv_init = '/etc/init.d/%s' % svc
  325. if host.init_is_systemd():
  326. nrpe.add_check(
  327. shortname=svc,
  328. description='process check {%s}' % unit_name,
  329. check_cmd='check_systemd.py %s' % svc
  330. )
  331. elif os.path.exists(upstart_init):
  332. nrpe.add_check(
  333. shortname=svc,
  334. description='process check {%s}' % unit_name,
  335. check_cmd='check_upstart_job %s' % svc
  336. )
  337. elif os.path.exists(sysv_init):
  338. cronpath = '/etc/cron.d/nagios-service-check-%s' % svc
  339. checkpath = '%s/service-check-%s.txt' % (nrpe.homedir, svc)
  340. croncmd = (
  341. '/usr/local/lib/nagios/plugins/check_exit_status.pl '
  342. '-e -s /etc/init.d/%s status' % svc
  343. )
  344. cron_file = '*/5 * * * * root %s > %s\n' % (croncmd, checkpath)
  345. f = open(cronpath, 'w')
  346. f.write(cron_file)
  347. f.close()
  348. nrpe.add_check(
  349. shortname=svc,
  350. description='service check {%s}' % unit_name,
  351. check_cmd='check_status_file.py -f %s' % checkpath,
  352. )
  353. # if /var/lib/nagios doesn't exist open(checkpath, 'w') will fail
  354. # (LP: #1670223).
  355. if immediate_check and os.path.isdir(nrpe.homedir):
  356. f = open(checkpath, 'w')
  357. subprocess.call(
  358. croncmd.split(),
  359. stdout=f,
  360. stderr=subprocess.STDOUT
  361. )
  362. f.close()
  363. os.chmod(checkpath, 0o644)
  364. def copy_nrpe_checks(nrpe_files_dir=None):
  365. """
  366. Copy the nrpe checks into place
  367. """
  368. NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins'
  369. default_nrpe_files_dir = os.path.join(
  370. os.getenv('CHARM_DIR'),
  371. 'hooks',
  372. 'charmhelpers',
  373. 'contrib',
  374. 'openstack',
  375. 'files')
  376. if not nrpe_files_dir:
  377. nrpe_files_dir = default_nrpe_files_dir
  378. if not os.path.exists(NAGIOS_PLUGINS):
  379. os.makedirs(NAGIOS_PLUGINS)
  380. for fname in glob.glob(os.path.join(nrpe_files_dir, "check_*")):
  381. if os.path.isfile(fname):
  382. shutil.copy2(fname,
  383. os.path.join(NAGIOS_PLUGINS, os.path.basename(fname)))
  384. def add_haproxy_checks(nrpe, unit_name):
  385. """
  386. Add checks for each service in list
  387. :param NRPE nrpe: NRPE object to add check to
  388. :param str unit_name: Unit name to use in check description
  389. """
  390. nrpe.add_check(
  391. shortname='haproxy_servers',
  392. description='Check HAProxy {%s}' % unit_name,
  393. check_cmd='check_haproxy.sh')
  394. nrpe.add_check(
  395. shortname='haproxy_queue',
  396. description='Check HAProxy queue depth {%s}' % unit_name,
  397. check_cmd='check_haproxy_queue_depth.sh')