Juju Charm - Ceph OSD
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ceph_hooks.py 21KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638
  1. #!/usr/bin/env python3
  2. #
  3. # Copyright 2016 Canonical Ltd
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. import base64
  17. import json
  18. import glob
  19. import os
  20. import shutil
  21. import sys
  22. import socket
  23. import subprocess
  24. import netifaces
  25. sys.path.append('lib')
  26. import ceph.utils as ceph
  27. from charmhelpers.core import hookenv
  28. from charmhelpers.core.hookenv import (
  29. log,
  30. DEBUG,
  31. ERROR,
  32. INFO,
  33. config,
  34. relation_ids,
  35. related_units,
  36. relation_get,
  37. relation_set,
  38. Hooks,
  39. UnregisteredHookError,
  40. service_name,
  41. status_get,
  42. status_set,
  43. storage_get,
  44. storage_list,
  45. application_version_set,
  46. )
  47. from charmhelpers.core.host import (
  48. umount,
  49. mkdir,
  50. cmp_pkgrevno,
  51. service_reload,
  52. service_restart,
  53. add_to_updatedb_prunepath,
  54. restart_on_change,
  55. write_file,
  56. is_container,
  57. )
  58. from charmhelpers.fetch import (
  59. add_source,
  60. apt_install,
  61. apt_update,
  62. filter_installed_packages,
  63. get_upstream_version,
  64. )
  65. from charmhelpers.core.sysctl import create as create_sysctl
  66. from charmhelpers.contrib.openstack.context import (
  67. AppArmorContext,
  68. )
  69. from utils import (
  70. get_host_ip,
  71. get_networks,
  72. assert_charm_supports_ipv6,
  73. render_template,
  74. is_unit_paused_set,
  75. get_public_addr,
  76. get_cluster_addr,
  77. get_blacklist,
  78. get_journal_devices,
  79. )
  80. from charmhelpers.contrib.openstack.alternatives import install_alternative
  81. from charmhelpers.contrib.network.ip import (
  82. get_ipv6_addr,
  83. format_ipv6_addr,
  84. get_relation_ip,
  85. )
  86. from charmhelpers.contrib.storage.linux.ceph import (
  87. CephConfContext)
  88. from charmhelpers.contrib.storage.linux.utils import (
  89. is_device_mounted,
  90. )
  91. from charmhelpers.contrib.charmsupport import nrpe
  92. from charmhelpers.contrib.hardening.harden import harden
  93. import charmhelpers.contrib.openstack.vaultlocker as vaultlocker
# Hook dispatcher; hook handlers below register themselves via @hooks.hook().
hooks = Hooks()

# Root of ceph's on-disk state; added to the updatedb prune path in
# config_changed() so indexing does not touch OSD data.
STORAGE_MOUNT_PATH = '/var/lib/ceph'
def check_for_upgrade():
    """Check whether a ceph upgrade was requested via the 'source' config
    option and, when the requested path is valid, roll the local OSDs to
    the new version.

    No-ops when the unit has not bootstrapped yet (no upgrade keyring) or
    when the resolved old and new versions are identical and no prior
    upgrade is pending.
    """
    # Without the upgrade keyring the unit never bootstrapped; there is
    # nothing to upgrade.
    if not os.path.exists(ceph._upgrade_keyring):
        log("Ceph upgrade keyring not detected, skipping upgrade checks.")
        return

    c = hookenv.config()
    # Compare the ceph release implied by the previous 'source' value
    # against the one implied by the current value.
    old_version = ceph.resolve_ceph_version(c.previous('source') or
                                            'distro')
    log('old_version: {}'.format(old_version))
    new_version = ceph.resolve_ceph_version(hookenv.config('source') or
                                            'distro')
    log('new_version: {}'.format(new_version))

    # May be in a previous upgrade that was failed if the directories
    # still need an ownership update. Check this condition.
    resuming_upgrade = ceph.dirs_need_ownership_update('osd')

    if old_version == new_version and not resuming_upgrade:
        log("No new ceph version detected, skipping upgrade.", DEBUG)
        return

    # Proceed only along a supported single-step upgrade path, or when
    # resuming a previously interrupted upgrade.
    if (ceph.UPGRADE_PATHS.get(old_version) == new_version) or\
            resuming_upgrade:
        if old_version == new_version:
            log('Attempting to resume possibly failed upgrade.',
                INFO)
        else:
            log("{} to {} is a valid upgrade path. Proceeding.".format(
                old_version, new_version))

        # Render ceph.conf with upgrade-specific settings for the duration
        # of the roll, then restore the normal configuration.
        emit_cephconf(upgrading=True)
        ceph.roll_osd_cluster(new_version=new_version,
                              upgrade_key='osd-upgrade')
        emit_cephconf(upgrading=False)
    else:
        # Log a helpful error message
        log("Invalid upgrade path from {} to {}. "
            "Valid paths are: {}".format(old_version,
                                         new_version,
                                         ceph.pretty_print_upgrade_paths()))
  131. def tune_network_adapters():
  132. interfaces = netifaces.interfaces()
  133. for interface in interfaces:
  134. if interface == "lo":
  135. # Skip the loopback
  136. continue
  137. log("Looking up {} for possible sysctl tuning.".format(interface))
  138. ceph.tune_nic(interface)
  139. @restart_on_change({'/etc/apparmor.d/usr.bin.ceph-osd': ['apparmor']},
  140. restart_functions={'apparmor': service_reload})
  141. def copy_profile_into_place():
  142. """
  143. Copy the apparmor profiles included with the charm
  144. into the /etc/apparmor.d directory.
  145. """
  146. new_install = False
  147. apparmor_dir = os.path.join(os.sep,
  148. 'etc',
  149. 'apparmor.d')
  150. for x in glob.glob('files/apparmor/*'):
  151. if not os.path.exists(os.path.join(apparmor_dir,
  152. os.path.basename(x))):
  153. new_install = True
  154. shutil.copy(x, apparmor_dir)
  155. return new_install
class CephOsdAppArmorContext(AppArmorContext):
    """Apparmor context for ceph-osd binary"""

    def __init__(self):
        super(CephOsdAppArmorContext, self).__init__()
        # Name of the apparmor profile file this context manages.
        self.aa_profile = 'usr.bin.ceph-osd'

    def __call__(self):
        """Build the template context, adding the profile name.

        Returns the empty context unchanged when the parent class decided
        apparmor is not applicable (e.g. no profile mode configured).
        """
        super(CephOsdAppArmorContext, self).__call__()
        if not self.ctxt:
            return self.ctxt
        self._ctxt.update({'aa_profile': self.aa_profile})
        return self.ctxt
  167. def use_vaultlocker():
  168. """Determine whether vaultlocker should be used for OSD encryption
  169. :returns: whether vaultlocker should be used for key management
  170. :rtype: bool
  171. :raises: ValueError if vaultlocker is enable but ceph < 12.2.4"""
  172. if (config('osd-encrypt') and
  173. config('osd-encrypt-keymanager') == ceph.VAULT_KEY_MANAGER):
  174. if cmp_pkgrevno('ceph', '12.2.4') < 0:
  175. msg = ('vault usage only supported with ceph >= 12.2.4')
  176. status_set('blocked', msg)
  177. raise ValueError(msg)
  178. else:
  179. return True
  180. return False
  181. def install_apparmor_profile():
  182. """
  183. Install ceph apparmor profiles and configure
  184. based on current setting of 'aa-profile-mode'
  185. configuration option.
  186. """
  187. log('Installing apparmor profile for ceph-osd')
  188. new_install = copy_profile_into_place()
  189. if new_install or config().changed('aa-profile-mode'):
  190. aa_context = CephOsdAppArmorContext()
  191. aa_context.setup_aa_profile()
  192. service_reload('apparmor')
  193. if ceph.systemd():
  194. for osd_id in ceph.get_local_osd_ids():
  195. service_restart('ceph-osd@{}'.format(osd_id))
  196. else:
  197. service_restart('ceph-osd-all')
  198. def install_udev_rules():
  199. """
  200. Install and reload udev rules for ceph-volume LV
  201. permissions
  202. """
  203. if is_container():
  204. log('Skipping udev rule installation '
  205. 'as unit is in a container', level=DEBUG)
  206. return
  207. for x in glob.glob('files/udev/*'):
  208. shutil.copy(x, '/lib/udev/rules.d')
  209. subprocess.check_call(['udevadm', 'control',
  210. '--reload-rules'])
  211. @hooks.hook('install.real')
  212. @harden()
  213. def install():
  214. add_source(config('source'), config('key'))
  215. apt_update(fatal=True)
  216. apt_install(packages=ceph.determine_packages(), fatal=True)
  217. if config('autotune'):
  218. tune_network_adapters()
  219. install_udev_rules()
  220. def az_info():
  221. az_info = ""
  222. config_az = config("availability_zone")
  223. juju_az_info = os.environ.get('JUJU_AVAILABILITY_ZONE')
  224. if juju_az_info:
  225. # NOTE(jamespage): avoid conflicting key with root
  226. # of crush hierarchy
  227. if juju_az_info == 'default':
  228. juju_az_info = 'default-rack'
  229. az_info = "{} rack={}".format(az_info, juju_az_info)
  230. if config_az:
  231. # NOTE(jamespage): avoid conflicting key with root
  232. # of crush hierarchy
  233. if config_az == 'default':
  234. config_az = 'default-row'
  235. az_info = "{} row={}".format(az_info, config_az)
  236. if az_info != "":
  237. log("AZ Info: " + az_info)
  238. return az_info
  239. def use_short_objects():
  240. '''
  241. Determine whether OSD's should be configured with
  242. limited object name lengths.
  243. @return: boolean indicating whether OSD's should be limited
  244. '''
  245. if cmp_pkgrevno('ceph', "10.2.0") >= 0:
  246. if config('osd-format') in ('ext4'):
  247. return True
  248. devices = config('osd-devices')
  249. if not devices:
  250. return False
  251. for device in devices.split():
  252. if device and not device.startswith('/dev'):
  253. # TODO: determine format of directory based
  254. # OSD location
  255. return True
  256. return False
def get_ceph_context(upgrading=False):
    """Returns the current context dictionary for generating ceph.conf

    :param upgrading: bool - determines if the context is invoked as
                      part of an upgrade procedure. Setting this to true
                      causes settings useful during an upgrade to be
                      defined in the ceph.conf file
    """
    mon_hosts = get_mon_hosts()
    log('Monitor hosts are ' + repr(mon_hosts))

    networks = get_networks('ceph-public-network')
    public_network = ', '.join(networks)

    networks = get_networks('ceph-cluster-network')
    cluster_network = ', '.join(networks)

    cephcontext = {
        'auth_supported': get_auth(),
        'mon_hosts': ' '.join(mon_hosts),
        'fsid': get_fsid(),
        # Releases before 0.51 use the legacy auth option name.
        'old_auth': cmp_pkgrevno('ceph', "0.51") < 0,
        'crush_initial_weight': config('crush-initial-weight'),
        'osd_journal_size': config('osd-journal-size'),
        'osd_max_backfills': config('osd-max-backfills'),
        'osd_recovery_max_active': config('osd-recovery-max-active'),
        'use_syslog': str(config('use-syslog')).lower(),
        'ceph_public_network': public_network,
        'ceph_cluster_network': cluster_network,
        'loglevel': config('loglevel'),
        'dio': str(config('use-direct-io')).lower(),
        'short_object_len': use_short_objects(),
        'upgrade_in_progress': upgrading,
        'bluestore': config('bluestore'),
        # bluestore was still experimental before the 12.1.0 release.
        'bluestore_experimental': cmp_pkgrevno('ceph', '12.1.0') < 0,
        'bluestore_block_wal_size': config('bluestore-block-wal-size'),
        'bluestore_block_db_size': config('bluestore-block-db-size'),
    }

    if config('prefer-ipv6'):
        # Only pin addresses explicitly when no network was configured;
        # otherwise ceph derives them from the network definitions above.
        dynamic_ipv6_address = get_ipv6_addr()[0]
        if not public_network:
            cephcontext['public_addr'] = dynamic_ipv6_address
        if not cluster_network:
            cephcontext['cluster_addr'] = dynamic_ipv6_address
    else:
        cephcontext['public_addr'] = get_public_addr()
        cephcontext['cluster_addr'] = get_cluster_addr()

    if config('customize-failure-domain'):
        az = az_info()
        if az:
            cephcontext['crush_location'] = "root=default {} host={}" \
                .format(az, socket.gethostname())
        else:
            log(
                "Your Juju environment doesn't"
                "have support for Availability Zones"
            )

    # NOTE(dosaboy): these sections must correspond to what is supported in the
    #                config template.
    sections = ['global', 'osd']
    cephcontext.update(CephConfContext(permitted_sections=sections)())

    return cephcontext
def emit_cephconf(upgrading=False):
    """Render ceph.conf from the current context and register it via
    update-alternatives.

    :param upgrading: passed through to get_ceph_context() so upgrade
                      specific settings can be rendered
    """
    # Install ceph.conf as an alternative to support
    # co-existence with other charms that write this file
    charm_ceph_conf = "/var/lib/charm/{}/ceph.conf".format(service_name())
    mkdir(os.path.dirname(charm_ceph_conf), owner=ceph.ceph_user(),
          group=ceph.ceph_user())
    with open(charm_ceph_conf, 'w') as cephconf:
        context = get_ceph_context(upgrading)
        cephconf.write(render_template('ceph.conf', context))
    # Priority 90: the charm-rendered file becomes /etc/ceph/ceph.conf
    # unless a higher-priority alternative is registered.
    install_alternative('ceph.conf', '/etc/ceph/ceph.conf',
                        charm_ceph_conf, 90)
@hooks.hook('config-changed')
@harden()
def config_changed():
    """config-changed hook: install vaultlocker when required, handle
    requested upgrades, validate configuration and (re)prepare OSDs.
    """
    # Determine whether vaultlocker is required and install
    if use_vaultlocker():
        installed = len(filter_installed_packages(['vaultlocker'])) == 0
        if not installed:
            add_source('ppa:openstack-charmers/vaultlocker')
            apt_update(fatal=True)
            apt_install('vaultlocker', fatal=True)

    # Check if an upgrade was requested
    check_for_upgrade()

    # Pre-flight checks
    if config('osd-format') not in ceph.DISK_FORMATS:
        log('Invalid OSD disk format configuration specified', level=ERROR)
        sys.exit(1)

    if config('prefer-ipv6'):
        assert_charm_supports_ipv6()

    sysctl_dict = config('sysctl')
    if sysctl_dict:
        create_sysctl(sysctl_dict, '/etc/sysctl.d/50-ceph-osd-charm.conf')

    # Unmount the ephemeral mountpoint so its device can be used as an OSD.
    e_mountpoint = config('ephemeral-unmount')
    if e_mountpoint and ceph.filesystem_mounted(e_mountpoint):
        umount(e_mountpoint)

    prepare_disks_and_activate()
    install_apparmor_profile()
    add_to_updatedb_prunepath(STORAGE_MOUNT_PATH)
  353. @hooks.hook('storage.real')
  354. def prepare_disks_and_activate():
  355. # NOTE: vault/vaultlocker preflight check
  356. vault_kv = vaultlocker.VaultKVContext(vaultlocker.VAULTLOCKER_BACKEND)
  357. context = vault_kv()
  358. if use_vaultlocker() and not vault_kv.complete:
  359. log('Deferring OSD preparation as vault not ready',
  360. level=DEBUG)
  361. return
  362. elif use_vaultlocker() and vault_kv.complete:
  363. log('Vault ready, writing vaultlocker configuration',
  364. level=DEBUG)
  365. vaultlocker.write_vaultlocker_conf(context)
  366. osd_journal = get_journal_devices()
  367. if not osd_journal.isdisjoint(set(get_devices())):
  368. raise ValueError('`osd-journal` and `osd-devices` options must not'
  369. 'overlap.')
  370. log("got journal devs: {}".format(osd_journal), level=DEBUG)
  371. # pre-flight check of eligible device pristinity
  372. devices = get_devices()
  373. # filter osd-devices that are file system paths
  374. devices = [dev for dev in devices if dev.startswith('/dev')]
  375. # filter osd-devices that does not exist on this unit
  376. devices = [dev for dev in devices if os.path.exists(dev)]
  377. # filter osd-devices that are already mounted
  378. devices = [dev for dev in devices if not is_device_mounted(dev)]
  379. # filter osd-devices that are active bluestore devices
  380. devices = [dev for dev in devices
  381. if not ceph.is_active_bluestore_device(dev)]
  382. log('Checking for pristine devices: "{}"'.format(devices), level=DEBUG)
  383. if not all(ceph.is_pristine_disk(dev) for dev in devices):
  384. status_set('blocked',
  385. 'Non-pristine devices detected, consult '
  386. '`list-disks`, `zap-disk` and `blacklist-*` actions.')
  387. return
  388. if ceph.is_bootstrapped():
  389. log('ceph bootstrapped, rescanning disks')
  390. emit_cephconf()
  391. for dev in get_devices():
  392. ceph.osdize(dev, config('osd-format'),
  393. osd_journal,
  394. config('ignore-device-errors'),
  395. config('osd-encrypt'),
  396. config('bluestore'),
  397. config('osd-encrypt-keymanager'))
  398. # Make it fast!
  399. if config('autotune'):
  400. ceph.tune_dev(dev)
  401. ceph.start_osds(get_devices())
  402. def get_mon_hosts():
  403. hosts = []
  404. for relid in relation_ids('mon'):
  405. for unit in related_units(relid):
  406. addr = \
  407. relation_get('ceph-public-address',
  408. unit,
  409. relid) or get_host_ip(
  410. relation_get(
  411. 'private-address',
  412. unit,
  413. relid))
  414. if addr:
  415. hosts.append('{}:6789'.format(format_ipv6_addr(addr) or addr))
  416. return sorted(hosts)
def get_fsid():
    """Return the cluster fsid published by the mons, or None."""
    return get_conf('fsid')
def get_auth():
    """Return the auth scheme published by the mons, or None."""
    return get_conf('auth')
  421. def get_conf(name):
  422. for relid in relation_ids('mon'):
  423. for unit in related_units(relid):
  424. conf = relation_get(name,
  425. unit, relid)
  426. if conf:
  427. return conf
  428. return None
  429. def get_devices():
  430. devices = []
  431. if config('osd-devices'):
  432. for path in config('osd-devices').split(' '):
  433. path = path.strip()
  434. # Make sure its a device which is specified using an
  435. # absolute path so that the current working directory
  436. # or any relative path under this directory is not used
  437. if os.path.isabs(path):
  438. devices.append(os.path.realpath(path))
  439. # List storage instances for the 'osd-devices'
  440. # store declared for this charm too, and add
  441. # their block device paths to the list.
  442. storage_ids = storage_list('osd-devices')
  443. devices.extend((storage_get('location', s) for s in storage_ids))
  444. # Filter out any devices in the action managed unit-local device blacklist
  445. _blacklist = get_blacklist()
  446. return [device for device in devices if device not in _blacklist]
  447. @hooks.hook('mon-relation-changed',
  448. 'mon-relation-departed')
  449. def mon_relation():
  450. bootstrap_key = relation_get('osd_bootstrap_key')
  451. upgrade_key = relation_get('osd_upgrade_key')
  452. if get_fsid() and get_auth() and bootstrap_key:
  453. log('mon has provided conf- scanning disks')
  454. emit_cephconf()
  455. ceph.import_osd_bootstrap_key(bootstrap_key)
  456. ceph.import_osd_upgrade_key(upgrade_key)
  457. prepare_disks_and_activate()
  458. else:
  459. log('mon cluster has not yet provided conf')
  460. @hooks.hook('upgrade-charm.real')
  461. @harden()
  462. def upgrade_charm():
  463. if get_fsid() and get_auth():
  464. emit_cephconf()
  465. apt_install(packages=filter_installed_packages(ceph.determine_packages()),
  466. fatal=True)
  467. install_udev_rules()
  468. @hooks.hook('nrpe-external-master-relation-joined',
  469. 'nrpe-external-master-relation-changed')
  470. def update_nrpe_config():
  471. # python-dbus is used by check_upstart_job
  472. apt_install('python3-dbus')
  473. hostname = nrpe.get_nagios_hostname()
  474. current_unit = nrpe.get_nagios_unit_name()
  475. nrpe_setup = nrpe.NRPE(hostname=hostname)
  476. nrpe_setup.add_check(
  477. shortname='ceph-osd',
  478. description='process check {%s}' % current_unit,
  479. check_cmd=('/bin/cat /var/lib/ceph/osd/ceph-*/whoami |'
  480. 'xargs -I@ status ceph-osd id=@ && exit 0 || exit 2')
  481. )
  482. nrpe_setup.write()
  483. @hooks.hook('secrets-storage-relation-joined')
  484. def secrets_storage_joined(relation_id=None):
  485. relation_set(relation_id=relation_id,
  486. secret_backend='charm-vaultlocker',
  487. isolated=True,
  488. access_address=get_relation_ip('secrets-storage'),
  489. hostname=socket.gethostname())
  490. @hooks.hook('secrets-storage-relation-changed')
  491. def secrets_storage_changed():
  492. vault_ca = relation_get('vault_ca')
  493. if vault_ca:
  494. vault_ca = base64.decodestring(json.loads(vault_ca).encode())
  495. write_file('/usr/local/share/ca-certificates/vault-ca.crt',
  496. vault_ca, perms=0o644)
  497. subprocess.check_call(['update-ca-certificates', '--fresh'])
  498. prepare_disks_and_activate()
# Package whose version is reported as the application version in juju status.
VERSION_PACKAGE = 'ceph-common'
def assess_status():
    """Assess status of current unit.

    Sets the workload status in precedence order: paused > missing mon
    relation > incomplete mon relation > missing/incomplete vault relation
    (when vaultlocker is in use) > blocked on no devices > active.
    """
    # check to see if the unit is paused.
    application_version_set(get_upstream_version(VERSION_PACKAGE))
    if is_unit_paused_set():
        status_set('maintenance',
                   "Paused. Use 'resume' action to resume normal service.")
        return
    # Check for mon relation
    if len(relation_ids('mon')) < 1:
        status_set('blocked', 'Missing relation: monitor')
        return

    # Check for monitors with presented addresses
    # Check for bootstrap key presentation
    monitors = get_mon_hosts()
    if len(monitors) < 1 or not get_conf('osd_bootstrap_key'):
        status_set('waiting', 'Incomplete relation: monitor')
        return

    # Check for vault
    if use_vaultlocker():
        if not relation_ids('secrets-storage'):
            status_set('blocked', 'Missing relation: vault')
            return
        if not vaultlocker.vault_relation_complete():
            status_set('waiting', 'Incomplete relation: vault')
            return

    # Check for OSD device creation parity i.e. at least some devices
    # must have been presented and used for this charm to be operational
    (prev_status, prev_message) = status_get()
    running_osds = ceph.get_running_osds()
    # Preserve a 'Non-pristine devices' message set by
    # prepare_disks_and_activate(); it requires operator action to clear.
    if not prev_message.startswith('Non-pristine'):
        if not running_osds:
            status_set('blocked',
                       'No block devices detected using current configuration')
        else:
            status_set('active',
                       'Unit is ready ({} OSD)'.format(len(running_osds)))
  537. @hooks.hook('update-status')
  538. @harden()
  539. def update_status():
  540. log('Updating status.')
if __name__ == '__main__':
    try:
        hooks.execute(sys.argv)
    except UnregisteredHookError as e:
        # Not every hook has a handler registered; unknown hooks are a no-op.
        log('Unknown hook {} - skipping.'.format(e))
    # Re-assess workload status after every hook invocation.
    assess_status()