commit 1683ffaa8401fd0c255d5e6656a64aca62af74aa Author: James Page Date: Mon Oct 8 15:07:16 2012 +0100 Initial ceph-osd charm diff --git a/.bzrignore b/.bzrignore new file mode 100644 index 00000000..3a4edf69 --- /dev/null +++ b/.bzrignore @@ -0,0 +1 @@ +.project diff --git a/.project b/.project new file mode 100644 index 00000000..c5e385b7 --- /dev/null +++ b/.project @@ -0,0 +1,17 @@ + + + ceph-osd + + + + + + org.python.pydev.PyDevBuilder + + + + + + org.python.pydev.pythonNature + + diff --git a/.pydevproject b/.pydevproject new file mode 100644 index 00000000..bb30cc40 --- /dev/null +++ b/.pydevproject @@ -0,0 +1,8 @@ + + +python 2.7 +Default + +/ceph-osd/hooks + + diff --git a/README b/README new file mode 100644 index 00000000..a886a83e --- /dev/null +++ b/README @@ -0,0 +1,87 @@ +Overview +======== + +Ceph is a distributed storage and network file system designed to provide +excellent performance, reliability, and scalability. + +This charm deploys a Ceph cluster. + +Usage +===== + +The ceph charm has two pieces of mandatory configuration for which no defaults +are provided: + + fsid: + uuid specific to a ceph cluster used to ensure that different + clusters don't get mixed up - use `uuid` to generate one. + + monitor-secret: + a ceph generated key used by the daemons that manage the cluster + to control security. You can use the ceph-authtool command to + generate one: + + ceph-authtool /dev/stdout --name=mon. --gen-key + +These two pieces of configuration must NOT be changed post bootstrap; attempting +to do this will cause a reconfiguration error and new service units will not join +the existing ceph cluster. + +The charm also supports specification of the storage devices to use in the ceph +cluster. + + osd-devices: + A list of devices that the charm will attempt to detect, initialise and + activate as ceph storage. 
+ + This can be a superset of the actual storage devices presented to + each service unit and can be changed post ceph bootstrap using `juju set`. + +At a minimum you must provide a juju config file during initial deployment +with the fsid and monitor-secret options (contents of ceph.yaml below): + + ceph-brolin: + fsid: ecbb8960-0e21-11e2-b495-83a88f44db01 + monitor-secret: AQD1P2xQiKglDhAA4NGUF5j38Mhq56qwz+45wg== + osd-devices: /dev/vdb /dev/vdc /dev/vdd /dev/vde + +Specifying the osd-devices to use is also a good idea. + +Boot things up by using: + + juju deploy -n 3 --config ceph.yaml ceph-brolin + +By default the ceph cluster will not bootstrap until 3 service units have been +deployed and started; this is to ensure that a quorum is achieved prior to adding +storage devices. + +Technical Bootnotes +=================== + +This charm is currently deliberately inflexible and potentially destructive. +It is designed to deploy on exactly three machines. Each machine will run mon +and osd. + +This charm uses the new-style Ceph deployment as reverse-engineered from the +Chef cookbook at https://github.com/ceph/ceph-cookbooks, although we selected +a different strategy to form the monitor cluster. Since we don't know the +names *or* addresses of the machines in advance, we use the relation-joined +hook to wait for all three nodes to come up, and then write their addresses +to ceph.conf in the "mon host" parameter. After we initialize the monitor +cluster a quorum forms quickly, and OSD bringup proceeds. + +The osds use so-called "OSD hotplugging". ceph-disk-prepare is used to create +the filesystems with a special GPT partition type. udev is set up to mount +such filesystems and start the osd daemons as their storage becomes visible to +the system (or after "udevadm trigger"). + +The Chef cookbook above performs some extra steps to generate an OSD +bootstrapping key and propagate it to the other nodes in the cluster. 
Since +all OSDs run on nodes that also run mon, we don't need this and did not +implement it. + +The charm does not currently implement cephx and it is explicitly turned off in +the configuration generated for ceph. + +See http://ceph.com/docs/master/dev/mon-bootstrap/ for more information on Ceph +monitor cluster deployment strategies and pitfalls. diff --git a/TODO b/TODO new file mode 100644 index 00000000..46549b7a --- /dev/null +++ b/TODO @@ -0,0 +1,11 @@ +== Minor == + + * fix tunables (http://tracker.newdream.net/issues/2210) + * more than 192 PGs + * fixup data placement in crush to be host not osd driven + +== Public Charm == + + * cephx support + * rel: remote MON clients (+client keys for cephx) + * rel: RADOS gateway (+client key for cephx) diff --git a/config.yaml b/config.yaml new file mode 100644 index 00000000..06222338 --- /dev/null +++ b/config.yaml @@ -0,0 +1,37 @@ +options: + fsid: + type: string + description: | + fsid of the ceph cluster. To generate a suitable value use `uuid` + . + This configuration element is mandatory and the service will fail on + install if it is not provided. + osd-devices: + type: string + default: /dev/sdb /dev/sdc /dev/sdd /dev/sde + description: | + The devices to format and set up as osd volumes. + . + These devices are the range of devices that will be checked for and + used across all service units. + source: + type: string + default: ppa:ceph-ubuntu/dev + description: | + Optional configuration to support use of additional sources such as: + . + - ppa:myteam/ppa + - cloud:folsom-proposed + - http://my.archive.com/ubuntu main + . + The last option should be used in conjunction with the key configuration + option. + . + Note that a minimum ceph version of 0.48.2 is required for use with this + charm which is NOT provided by the packages in the main Ubuntu archive + for precise. 
+ key: + type: string + description: | + Key ID to import to the apt keyring to support use with arbitary source + configuration from outside of Launchpad archives or PPA's. diff --git a/copyright b/copyright new file mode 100644 index 00000000..bdfae0e0 --- /dev/null +++ b/copyright @@ -0,0 +1,15 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0 +Comment: The licensing of this charm is aligned to upstream ceph + as the ceph upstart integration is distributed as part of the charm. + +Files: * +Copyright: 2012, Canonical Ltd. +License: LGPL-2.1 + +Files: files/upstart/* +Copyright: 2004-2010 by Sage Weil +License: LGPL-2.1 + +License: LGPL-2.1 + On Debian GNU/Linux system you can find the complete text of the + LGPL-2.1 license in '/usr/share/common-licenses/LGPL-2.1' diff --git a/files/upstart/ceph-create-keys.conf b/files/upstart/ceph-create-keys.conf new file mode 100644 index 00000000..6fb45818 --- /dev/null +++ b/files/upstart/ceph-create-keys.conf @@ -0,0 +1,8 @@ +description "Create Ceph client.admin key when possible" + +start on started ceph-mon +stop on runlevel [!2345] + +task + +exec /usr/sbin/ceph-create-keys --cluster="${cluster:-ceph}" -i "${id:-$(hostname)}" diff --git a/files/upstart/ceph-hotplug.conf b/files/upstart/ceph-hotplug.conf new file mode 100644 index 00000000..70204529 --- /dev/null +++ b/files/upstart/ceph-hotplug.conf @@ -0,0 +1,11 @@ +description "Ceph hotplug" + +start on block-device-added \ + DEVTYPE=partition \ + ID_PART_ENTRY_TYPE=4fbd7e29-9d25-41b8-afd0-062c0ceff05d +stop on runlevel [!2345] + +task +instance $DEVNAME + +exec /usr/sbin/ceph-disk-activate --mount -- "$DEVNAME" diff --git a/files/upstart/ceph-mon-all-starter.conf b/files/upstart/ceph-mon-all-starter.conf new file mode 100644 index 00000000..f7188cb7 --- /dev/null +++ b/files/upstart/ceph-mon-all-starter.conf @@ -0,0 +1,20 @@ +description "Ceph MON (start all instances)" + +start on starting ceph-mon-all +stop on runlevel [!2345] + +task + 
+script + set -e + # TODO what's the valid charset for cluster names and mon ids? + find /var/lib/ceph/mon/ -mindepth 1 -maxdepth 1 -regextype posix-egrep -regex '.*/[a-z0-9]+-[a-z0-9._-]+' -printf '%P\n' \ + | while read f; do + if [ -e "/var/lib/ceph/mon/$f/done" ]; then + cluster="${f%%-*}" + id="${f#*-}" + + initctl emit ceph-mon cluster="$cluster" id="$id" + fi + done +end script diff --git a/files/upstart/ceph-mon-all.conf b/files/upstart/ceph-mon-all.conf new file mode 100644 index 00000000..006f2f20 --- /dev/null +++ b/files/upstart/ceph-mon-all.conf @@ -0,0 +1,4 @@ +description "Ceph monitor (all instances)" + +start on (local-filesystems and net-device-up IFACE!=lo) +stop on runlevel [!2345] diff --git a/files/upstart/ceph-mon.conf b/files/upstart/ceph-mon.conf new file mode 100644 index 00000000..2cf7bfa5 --- /dev/null +++ b/files/upstart/ceph-mon.conf @@ -0,0 +1,24 @@ +description "Ceph MON" + +start on ceph-mon +stop on runlevel [!2345] or stopping ceph-mon-all + +respawn +respawn limit 5 30 + +pre-start script + set -e + test -x /usr/bin/ceph-mon || { stop; exit 0; } + test -d "/var/lib/ceph/mon/${cluster:-ceph}-$id" || { stop; exit 0; } + + install -d -m0755 /var/run/ceph +end script + +instance ${cluster:-ceph}/$id +export cluster +export id + +# this breaks oneiric +#usage "cluster = name of cluster (defaults to 'ceph'); id = monitor instance id" + +exec /usr/bin/ceph-mon --cluster="${cluster:-ceph}" -i "$id" -f diff --git a/files/upstart/ceph-osd.conf b/files/upstart/ceph-osd.conf new file mode 100644 index 00000000..119ad000 --- /dev/null +++ b/files/upstart/ceph-osd.conf @@ -0,0 +1,37 @@ +description "Ceph OSD" + +start on ceph-osd +stop on runlevel [!2345] + +respawn +respawn limit 5 30 + +pre-start script + set -e + test -x /usr/bin/ceph-osd || { stop; exit 0; } + test -d "/var/lib/ceph/osd/${cluster:-ceph}-$id" || { stop; exit 0; } + + install -d -m0755 /var/run/ceph + + # update location in crush; put in some suitable defaults on the + # 
command line, ceph.conf can override what it wants + location="$(ceph-conf --cluster="${cluster:-ceph}" --name="osd.$id" --lookup osd_crush_location || :)" + weight="$(ceph-conf --cluster="$cluster" --name="osd.$id" --lookup osd_crush_weight || :)" + ceph \ + --cluster="${cluster:-ceph}" \ + --name="osd.$id" \ + --keyring="/var/lib/ceph/osd/${cluster:-ceph}-$id/keyring" \ + osd crush set \ + -- \ + "$id" "osd.$id" "${weight:-1}" \ + pool=default \ + host="$(hostname -s)" \ + $location \ + || : +end script + +instance ${cluster:-ceph}/$id +export cluster +export id + +exec /usr/bin/ceph-osd --cluster="${cluster:-ceph}" -i "$id" -f diff --git a/hooks/ceph.py b/hooks/ceph.py new file mode 100644 index 00000000..2a193d58 --- /dev/null +++ b/hooks/ceph.py @@ -0,0 +1,73 @@ + +# +# Copyright 2012 Canonical Ltd. +# +# Authors: +# James Page +# Paul Collins +# + +import json +import subprocess +import time +import utils +import os + +QUORUM = ['leader', 'peon'] + + +def is_quorum(): + asok = "/var/run/ceph/ceph-mon.{}.asok".format(utils.get_unit_hostname()) + cmd = [ + "ceph", + "--admin-daemon", + asok, + "mon_status" + ] + if os.path.exists(asok): + try: + result = json.loads(subprocess.check_output(cmd)) + except subprocess.CalledProcessError: + return False + except ValueError: + # Non JSON response from mon_status + return False + if result['state'] in QUORUM: + return True + else: + return False + else: + return False + + +def wait_for_quorum(): + while not is_quorum(): + time.sleep(3) + + +def add_bootstrap_hint(peer): + asok = "/var/run/ceph/ceph-mon.{}.asok".format(utils.get_unit_hostname()) + cmd = [ + "ceph", + "--admin-daemon", + asok, + "add_bootstrap_peer_hint", + peer + ] + if os.path.exists(asok): + # Ignore any errors for this call + subprocess.call(cmd) + + +def is_osd_disk(dev): + try: + info = subprocess.check_output(['sgdisk', '-i', '1', dev]) + info = info.split("\n") # IGNORE:E1103 + for line in info: + if line.startswith( + 'Partition GUID code: 
4FBD7E29-9D25-41B8-AFD0-062C0CEFF05D' + ): + return True + except subprocess.CalledProcessError: + pass + return False diff --git a/hooks/config-changed b/hooks/config-changed new file mode 120000 index 00000000..9416ca6a --- /dev/null +++ b/hooks/config-changed @@ -0,0 +1 @@ +hooks.py \ No newline at end of file diff --git a/hooks/hooks.py b/hooks/hooks.py new file mode 100755 index 00000000..97437a1d --- /dev/null +++ b/hooks/hooks.py @@ -0,0 +1,145 @@ +#!/usr/bin/python + +# +# Copyright 2012 Canonical Ltd. +# +# Authors: +# James Page +# + +import glob +import os +import subprocess +import shutil +import sys + +import ceph +import utils + + +def install_upstart_scripts(): + for x in glob.glob('files/upstart/*.conf'): + shutil.copy(x, '/etc/init/') + + +def install(): + utils.juju_log('INFO', 'Begin install hook.') + utils.configure_source() + utils.install('ceph', 'gdisk') + install_upstart_scripts() + utils.juju_log('INFO', 'End install hook.') + + +def emit_cephconf(): + cephcontext = { + 'mon_hosts': ' '.join(get_mon_hosts()), + 'fsid': get_fsid() + } + + with open('/etc/ceph/ceph.conf', 'w') as cephconf: + cephconf.write(utils.render_template('ceph.conf', cephcontext)) + + +def config_changed(): + utils.juju_log('INFO', 'Begin config-changed hook.') + + utils.juju_log('INFO', 'Monitor hosts are ' + repr(get_mon_hosts())) + + if get_fsid(): + utils.juju_log('INFO', 'cluster fsid detected, rescanning disks') + emit_cephconf() + for dev in utils.config_get('osd-devices').split(' '): + osdize(dev) + subprocess.call(['udevadm', 'trigger', + '--subsystem-match=block', '--action=add']) + + utils.juju_log('INFO', 'End config-changed hook.') + + +def get_mon_hosts(): + hosts = [] + hosts.append('{}:6789'.format(utils.get_host_ip())) + + for relid in utils.relation_ids('mon'): + for unit in utils.relation_list(relid): + hosts.append( + '{}:6789'.format(utils.get_host_ip( + utils.relation_get('private-address', + unit, relid))) + ) + + hosts.sort() + return hosts + + 
+def get_fsid(): + for relid in utils.relation_ids('mon'): + for unit in utils.relation_list(relid): + fsid = utils.relation_get('fsid', + unit, relid) + if fsid != "": + return fsid + return None + + +def osdize(dev): + # XXX hack for instances + subprocess.call(['umount', '/mnt']) + + if ceph.is_osd_disk(dev): + utils.juju_log('INFO', + 'Looks like {} is already an OSD, skipping.' + .format(dev)) + return + + if subprocess.call(['grep', '-wqs', dev + '1', '/proc/mounts']) == 0: + utils.juju_log('INFO', + 'Looks like {} is in use, skipping.'.format(dev)) + return + + if os.path.exists(dev): + subprocess.call(['ceph-disk-prepare', dev]) + + +def mon_relation(): + utils.juju_log('INFO', 'Begin mon-relation hook.') + + if get_fsid(): + utils.juju_log('INFO', 'mon has provided fsid - scanning disks') + emit_cephconf() + for dev in utils.config_get('osd-devices').split(' '): + osdize(dev) + subprocess.call(['udevadm', 'trigger', + '--subsystem-match=block', '--action=add']) + else: + utils.juju_log('INFO', + 'mon cluster has not yet provided fsid') + + utils.juju_log('INFO', 'End mon-relation hook.') + + +def upgrade_charm(): + utils.juju_log('INFO', 'Begin upgrade-charm hook.') + if get_fsid(): + emit_cephconf() + install_upstart_scripts() + utils.juju_log('INFO', 'End upgrade-charm hook.') + + +def start(): + # In case we're being redeployed to the same machines, try + # to make sure everything is running as soon as possible. 
+ subprocess.call(['udevadm', 'trigger', + '--subsystem-match=block', '--action=add']) + + +utils.do_hooks({ + 'config-changed': config_changed, + 'install': install, + 'mon-relation-departed': mon_relation, + 'mon-relation-changed': mon_relation, + 'start': start, + 'upgrade-charm': upgrade_charm, + }) + +sys.exit(0) diff --git a/hooks/install b/hooks/install new file mode 120000 index 00000000..9416ca6a --- /dev/null +++ b/hooks/install @@ -0,0 +1 @@ +hooks.py \ No newline at end of file diff --git a/hooks/mon-relation-changed b/hooks/mon-relation-changed new file mode 120000 index 00000000..9416ca6a --- /dev/null +++ b/hooks/mon-relation-changed @@ -0,0 +1 @@ +hooks.py \ No newline at end of file diff --git a/hooks/mon-relation-departed b/hooks/mon-relation-departed new file mode 120000 index 00000000..9416ca6a --- /dev/null +++ b/hooks/mon-relation-departed @@ -0,0 +1 @@ +hooks.py \ No newline at end of file diff --git a/hooks/start b/hooks/start new file mode 120000 index 00000000..9416ca6a --- /dev/null +++ b/hooks/start @@ -0,0 +1 @@ +hooks.py \ No newline at end of file diff --git a/hooks/stop b/hooks/stop new file mode 120000 index 00000000..9416ca6a --- /dev/null +++ b/hooks/stop @@ -0,0 +1 @@ +hooks.py \ No newline at end of file diff --git a/hooks/upgrade-charm b/hooks/upgrade-charm new file mode 120000 index 00000000..9416ca6a --- /dev/null +++ b/hooks/upgrade-charm @@ -0,0 +1 @@ +hooks.py \ No newline at end of file diff --git a/hooks/utils.py b/hooks/utils.py new file mode 100644 index 00000000..64c578e1 --- /dev/null +++ b/hooks/utils.py @@ -0,0 +1,163 @@ + +# +# Copyright 2012 Canonical Ltd. 
+# +# Authors: +# James Page +# Paul Collins +# + +import os +import subprocess +import socket +import sys + + +def do_hooks(hooks): + hook = os.path.basename(sys.argv[0]) + + try: + hooks[hook]() + except KeyError: + juju_log('INFO', + "This charm doesn't know how to handle '{}'.".format(hook)) + + +def install(*pkgs): + cmd = [ + 'apt-get', + '-y', + 'install' + ] + for pkg in pkgs: + cmd.append(pkg) + subprocess.check_call(cmd) + +TEMPLATES_DIR = 'templates' + +try: + import jinja2 +except ImportError: + install('python-jinja2') + import jinja2 + + +def render_template(template_name, context, template_dir=TEMPLATES_DIR): + templates = jinja2.Environment( + loader=jinja2.FileSystemLoader(template_dir) + ) + template = templates.get_template(template_name) + return template.render(context) + + +def configure_source(): + source = config_get('source') + if (source.startswith('ppa:') or + source.startswith('cloud:') or + source.startswith('http:')): + cmd = [ + 'add-apt-repository', + source + ] + subprocess.check_call(cmd) + if source.startswith('http:'): + key = config_get('key') + cmd = [ + 'apt-key', + 'import', + key + ] + subprocess.check_call(cmd) + cmd = [ + 'apt-get', + 'update' + ] + subprocess.check_call(cmd) + +# Protocols +TCP = 'TCP' +UDP = 'UDP' + + +def expose(port, protocol='TCP'): + cmd = [ + 'open-port', + '{}/{}'.format(port, protocol) + ] + subprocess.check_call(cmd) + + +def juju_log(severity, message): + cmd = [ + 'juju-log', + '--log-level', severity, + message + ] + subprocess.check_call(cmd) + + +def relation_ids(relation): + cmd = [ + 'relation-ids', + relation + ] + return subprocess.check_output(cmd).split() # IGNORE:E1103 + + +def relation_list(rid): + cmd = [ + 'relation-list', + '-r', rid, + ] + return subprocess.check_output(cmd).split() # IGNORE:E1103 + + +def relation_get(attribute, unit=None, rid=None): + cmd = [ + 'relation-get', + ] + if rid: + cmd.append('-r') + cmd.append(rid) + cmd.append(attribute) + if unit: + 
cmd.append(unit) + return subprocess.check_output(cmd).strip() # IGNORE:E1103 + + +def relation_set(**kwargs): + cmd = [ + 'relation-set' + ] + for k, v in kwargs.items(): + cmd.append('{}={}'.format(k, v)) + subprocess.check_call(cmd) + + +def unit_get(attribute): + cmd = [ + 'unit-get', + attribute + ] + return subprocess.check_output(cmd).strip() # IGNORE:E1103 + + +def config_get(attribute): + cmd = [ + 'config-get', + attribute + ] + return subprocess.check_output(cmd).strip() # IGNORE:E1103 + + +def get_unit_hostname(): + return socket.gethostname() + + +def get_host_ip(hostname=unit_get('private-address')): + cmd = [ + 'dig', + '+short', + hostname + ] + return subprocess.check_output(cmd).strip() # IGNORE:E1103 diff --git a/metadata.yaml b/metadata.yaml new file mode 100644 index 00000000..530f4142 --- /dev/null +++ b/metadata.yaml @@ -0,0 +1,12 @@ +name: ceph-osd +summary: Highly scalable distributed storage - OSD nodes +maintainer: James Page +description: | + Ceph is a distributed storage and network file system designed to provide + excellent performance, reliability, and scalability. + . + This charm provides the OSD personality for expanding storage nodes within + a ceph deployment. +requires: + mon: + interface: ceph-osd diff --git a/revision b/revision new file mode 100644 index 00000000..0cfbf088 --- /dev/null +++ b/revision @@ -0,0 +1 @@ +2 diff --git a/templates/ceph.conf b/templates/ceph.conf new file mode 100644 index 00000000..32103fb5 --- /dev/null +++ b/templates/ceph.conf @@ -0,0 +1,17 @@ +[global] + auth supported = none + keyring = /etc/ceph/$cluster.$name.keyring + mon host = {{ mon_hosts }} + fsid = {{ fsid }} + +[mon] + keyring = /var/lib/ceph/mon/$cluster-$id/keyring + +[mds] + keyring = /var/lib/ceph/mds/$cluster-$id/keyring + +[osd] + keyring = /var/lib/ceph/osd/$cluster-$id/keyring + osd journal size = 1000 + filestore xattr use omap = true +