charm-hacluster/hooks/hooks.py
james.page@ubuntu.com 511e30ef74 [james-page,r=gnuoy,r=*] Refactor hacluster charm
1) supports reconfiguration of cluster resources from principal charm

2) direct configuration of mcastport and bindiface via juju configuration

3) quorum policy based on expected size of cluster

    fewer than 3 nodes = ignore quorum loss
    3 or more nodes = stop on quorum loss

4) conditional restarting of corosync/pacemaker as required.

It's all just a bit nicer to use now!
2014-10-07 09:30:10 +01:00

#!/usr/bin/python
#
# Copyright 2012 Canonical Ltd.
#
# Authors:
# Andres Rodriguez <andres.rodriguez@canonical.com>
#

import ast
import shutil
import sys
import time
import os

from base64 import b64decode

import maas as MAAS
import pcmk
import hacluster
import socket
from charmhelpers.core.hookenv import (
    log,
    relation_get,
    related_units,
    relation_ids,
    relation_set,
    unit_get,
    config,
    Hooks, UnregisteredHookError,
    local_unit,
)
from charmhelpers.core.host import (
    service_stop,
    service_start,
    service_restart,
    service_running,
    write_file,
    mkdir,
    file_hash,
    lsb_release
)
from charmhelpers.fetch import (
    apt_install,
    apt_purge
)
from charmhelpers.contrib.hahelpers.cluster import (
    peer_units,
    oldest_peer
)

hooks = Hooks()

COROSYNC_CONF = '/etc/corosync/corosync.conf'
COROSYNC_DEFAULT = '/etc/default/corosync'
COROSYNC_AUTHKEY = '/etc/corosync/authkey'

COROSYNC_CONF_FILES = [
    COROSYNC_DEFAULT,
    COROSYNC_AUTHKEY,
    COROSYNC_CONF
]

PACKAGES = ['corosync', 'pacemaker', 'python-netaddr', 'ipmitool']
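

# python-netaddr backs the network address calculations in hacluster.py,
# and ipmitool provides IPMI power control used when fencing is enabled
# (an assumption from typical usage; neither is imported directly here).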


@hooks.hook()
def install():
    apt_install(PACKAGES, fatal=True)
    # NOTE(adam_g) rbd OCF only included with newer versions of
    # ceph-resource-agents. Bundle /w charm until we figure out a
    # better way to install it.
    mkdir('/usr/lib/ocf/resource.d/ceph')
    if not os.path.isfile('/usr/lib/ocf/resource.d/ceph/rbd'):
        shutil.copy('ocf/ceph/rbd', '/usr/lib/ocf/resource.d/ceph/rbd')
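

# get_corosync_conf() prefers local charm configuration and falls back to
# values published by the principal charm on the 'ha' relation. For
# illustration (hypothetical values), a complete context looks like:
#   {'corosync_bindnetaddr': '192.168.1.0',
#    'corosync_mcastport': '5405',
#    'corosync_mcastaddr': '226.94.1.1',
#    'ip_version': 'ipv4'}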
def get_corosync_conf():
    if config('prefer-ipv6'):
        ip_version = 'ipv6'
        bindnetaddr = hacluster.get_ipv6_network_address
    else:
        ip_version = 'ipv4'
        bindnetaddr = hacluster.get_network_address

    # NOTE(jamespage) use local charm configuration over any provided by
    # principal charm
    conf = {
        'corosync_bindnetaddr':
        bindnetaddr(config('corosync_bindiface')),
        'corosync_mcastport': config('corosync_mcastport'),
        'corosync_mcastaddr': config('corosync_mcastaddr'),
        'ip_version': ip_version,
    }

    if None not in conf.itervalues():
        return conf

    # Fallback to corosync configuration provided by the principal charm
    conf = {}
    for relid in relation_ids('ha'):
        for unit in related_units(relid):
            bindiface = relation_get('corosync_bindiface',
                                     unit, relid)
            conf = {
                'corosync_bindnetaddr': bindnetaddr(bindiface),
                'corosync_mcastport': relation_get('corosync_mcastport',
                                                   unit, relid),
                'corosync_mcastaddr': config('corosync_mcastaddr'),
                'ip_version': ip_version,
            }

            if config('prefer-ipv6'):
                local_unit_no = int(local_unit().split('/')[1])
                # nodeid should not be 0
                conf['nodeid'] = local_unit_no + 1
                conf['netmtu'] = config('netmtu')

            if None not in conf.itervalues():
                return conf

    missing = [k for k, v in conf.iteritems() if v is None]
    log('Missing required configuration: %s' % missing)
    return None


def emit_corosync_conf():
    corosync_conf_context = get_corosync_conf()
    if corosync_conf_context:
        write_file(path=COROSYNC_CONF,
                   content=render_template('corosync.conf',
                                           corosync_conf_context))
        return True
    else:
        return False


def emit_base_conf():
    corosync_default_context = {'corosync_enabled': 'yes'}
    write_file(path=COROSYNC_DEFAULT,
               content=render_template('corosync',
                                       corosync_default_context))

    corosync_key = config('corosync_key')
    if corosync_key:
        write_file(path=COROSYNC_AUTHKEY,
                   content=b64decode(corosync_key),
                   perms=0o400)
        return True
    else:
        return False


@hooks.hook()
def config_changed():
    if config('prefer-ipv6'):
        assert_charm_supports_ipv6()

    corosync_key = config('corosync_key')
    if not corosync_key:
        log('No Corosync key supplied, cannot proceed',
            level='CRITICAL')
        sys.exit(1)

    hacluster.enable_lsb_services('pacemaker')

    if configure_corosync():
        pcmk.wait_for_pcmk()
        configure_cluster_global()
        configure_monitor_host()
        configure_stonith()


@hooks.hook()
def upgrade_charm():
    install()
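

# Pacemaker runs on top of corosync's membership and messaging layer, so
# it is stopped before corosync is restarted and only started again once
# corosync is back up; the sleep below is an empirical settling delay,
# not a documented requirement.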
def restart_corosync():
    if service_running("pacemaker"):
        service_stop("pacemaker")

    service_restart("corosync")
    time.sleep(5)
    service_start("pacemaker")


def restart_corosync_on_change():
    '''Simple decorator to restart corosync if any of its config changes'''
    def wrap(f):
        def wrapped_f(*args):
            checksums = {}
            for path in COROSYNC_CONF_FILES:
                checksums[path] = file_hash(path)
            return_data = f(*args)
            # NOTE: this assumes that this call is always done around
            # configure_corosync, which returns true if configuration
            # files were actually generated
            if return_data:
                for path in COROSYNC_CONF_FILES:
                    if checksums[path] != file_hash(path):
                        restart_corosync()
                        break
            return return_data
        return wrapped_f
    return wrap


@restart_corosync_on_change()
def configure_corosync():
    log('Configuring and (maybe) restarting corosync')
    return emit_base_conf() and emit_corosync_conf()
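

# NOTE: configure_corosync() is wrapped by restart_corosync_on_change(),
# so corosync (and pacemaker) are only bounced when the hash of an
# emitted configuration file actually changes.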


def configure_monitor_host():
    '''Configure extra monitor host for better network failure detection'''
    log('Checking monitor host configuration')
    monitor_host = config('monitor_host')
    if monitor_host:
        if not pcmk.crm_opt_exists('ping'):
            log('Implementing monitor host'
                ' configuration (host: %s)' % monitor_host)
            monitor_interval = config('monitor_interval')
            cmd = 'crm -w -F configure primitive ping' \
                  ' ocf:pacemaker:ping params host_list="%s"' \
                  ' multiplier="100" op monitor interval="%s"' % \
                  (monitor_host, monitor_interval)
            pcmk.commit(cmd)
            cmd = 'crm -w -F configure clone cl_ping ping' \
                  ' meta interleave="true"'
            pcmk.commit(cmd)
        else:
            log('Reconfiguring monitor host'
                ' configuration (host: %s)' % monitor_host)
            cmd = 'crm -w -F resource param ping set host_list="%s"' % \
                  monitor_host
            pcmk.commit(cmd)
    else:
        if pcmk.crm_opt_exists('ping'):
            log('Disabling monitor host configuration')
            pcmk.commit('crm -w -F resource stop ping')
            pcmk.commit('crm -w -F configure delete ping')


def configure_cluster_global():
    '''Configure global cluster options'''
    log('Applying global cluster configuration')
    if int(config('cluster_count')) >= 3:
        # NOTE(jamespage) if 3 or more nodes, then quorum can be
        # managed effectively, so stop if quorum lost
        log('Configuring no-quorum-policy to stop')
        cmd = "crm configure property no-quorum-policy=stop"
    else:
        # NOTE(jamespage) if fewer than 3 nodes, quorum not possible
        # so ignore
        log('Configuring no-quorum-policy to ignore')
        cmd = "crm configure property no-quorum-policy=ignore"
    pcmk.commit(cmd)

    # resource-stickiness biases pacemaker towards leaving running
    # resources where they are instead of moving them back after a node
    # recovers
    cmd = 'crm configure rsc_defaults $id="rsc-options"' \
          ' resource-stickiness="100"'
    pcmk.commit(cmd)


def parse_data(relid, unit, key):
    '''Simple helper to ast parse relation data'''
    data = relation_get(key, unit, relid)
    if data:
        return ast.literal_eval(data)
    else:
        return {}
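

# The principal charm publishes python literals as strings on the 'ha'
# relation, e.g. (hypothetical values):
#   resources = "{'res_foo_vip': 'ocf:heartbeat:IPaddr2'}"
# parse_data() hands back the evaluated dict, or {} when the key is unset.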


@hooks.hook('ha-relation-joined',
            'ha-relation-changed',
            'hanode-relation-joined',
            'hanode-relation-changed')
def configure_principle_cluster_resources():
    # Check that we are related to a principal and that
    # it has already provided the required corosync configuration
    if not get_corosync_conf():
        log('Unable to configure corosync right now, deferring configuration')
        return
    else:
        if relation_ids('hanode'):
            log('Ready to form cluster - informing peers')
            relation_set(relation_id=relation_ids('hanode')[0],
                         ready=True)
        else:
            log('Ready to form cluster, but not related to peers just yet')
            return

    # Check that there are enough nodes in order to perform the
    # configuration of the HA cluster
    if (len(get_cluster_nodes()) <
            int(config('cluster_count'))):
        log('Not enough nodes in cluster, deferring configuration')
        return

    relids = relation_ids('ha')
    if len(relids) == 1:  # Should only ever be one of these
        # Obtain relation information
        relid = relids[0]
        units = related_units(relid)
        if len(units) < 1:
            log('No principal unit found, deferring configuration')
            return
        unit = units[0]
        log('Parsing cluster configuration'
            ' using rid: {}, unit: {}'.format(relid, unit))
        resources = parse_data(relid, unit, 'resources')
        delete_resources = parse_data(relid, unit, 'delete_resources')
        resource_params = parse_data(relid, unit, 'resource_params')
        groups = parse_data(relid, unit, 'groups')
        ms = parse_data(relid, unit, 'ms')
        orders = parse_data(relid, unit, 'orders')
        colocations = parse_data(relid, unit, 'colocations')
        clones = parse_data(relid, unit, 'clones')
        init_services = parse_data(relid, unit, 'init_services')
    else:
        log('Related to {} ha services'.format(len(relids)))
        return

    if True in [ra.startswith('ocf:openstack')
                for ra in resources.itervalues()]:
        apt_install('openstack-resource-agents')
    if True in [ra.startswith('ocf:ceph')
                for ra in resources.itervalues()]:
        apt_install('ceph-resource-agents')

    # NOTE: this should be removed in 15.04 cycle as corosync
    # configuration should be set directly on subordinate
    configure_corosync()
    pcmk.wait_for_pcmk()
    configure_cluster_global()
    configure_monitor_host()
    configure_stonith()

    # Only configure the cluster resources
    # from the oldest peer unit.
    if oldest_peer(peer_units()):
        log('Deleting Resources')
        log(str(delete_resources))
        for res_name in delete_resources:
            if pcmk.crm_opt_exists(res_name):
                log('Stopping and deleting resource %s' % res_name)
                if pcmk.crm_res_running(res_name):
                    pcmk.commit('crm -w -F resource stop %s' % res_name)
                pcmk.commit('crm -w -F configure delete %s' % res_name)

        log('Configuring Resources')
        log(str(resources))
        for res_name, res_type in resources.iteritems():
            # disable the service we are going to put in HA
            if res_type.split(':')[0] == "lsb":
                hacluster.disable_lsb_services(res_type.split(':')[1])
                if service_running(res_type.split(':')[1]):
                    service_stop(res_type.split(':')[1])
            elif (len(init_services) != 0 and
                  res_name in init_services and
                  init_services[res_name]):
                hacluster.disable_upstart_services(init_services[res_name])
                if service_running(init_services[res_name]):
                    service_stop(init_services[res_name])
            # Put the services in HA, if not already done so
            # if not pcmk.is_resource_present(res_name):
            if not pcmk.crm_opt_exists(res_name):
                if res_name not in resource_params:
                    cmd = 'crm -w -F configure primitive %s %s' % (res_name,
                                                                   res_type)
                else:
                    cmd = 'crm -w -F configure primitive %s %s %s' % \
                        (res_name,
                         res_type,
                         resource_params[res_name])
                pcmk.commit(cmd)
                log('%s' % cmd)
                if config('monitor_host'):
                    cmd = 'crm -F configure location Ping-%s %s rule' \
                          ' -inf: pingd lte 0' % (res_name, res_name)
                    pcmk.commit(cmd)

        log('Configuring Groups')
        log(str(groups))
        for grp_name, grp_params in groups.iteritems():
            if not pcmk.crm_opt_exists(grp_name):
                cmd = 'crm -w -F configure group %s %s' % (grp_name,
                                                           grp_params)
                pcmk.commit(cmd)
                log('%s' % cmd)

        log('Configuring Master/Slave (ms)')
        log(str(ms))
        for ms_name, ms_params in ms.iteritems():
            if not pcmk.crm_opt_exists(ms_name):
                cmd = 'crm -w -F configure ms %s %s' % (ms_name, ms_params)
                pcmk.commit(cmd)
                log('%s' % cmd)

        log('Configuring Orders')
        log(str(orders))
        for ord_name, ord_params in orders.iteritems():
            if not pcmk.crm_opt_exists(ord_name):
                cmd = 'crm -w -F configure order %s %s' % (ord_name,
                                                           ord_params)
                pcmk.commit(cmd)
                log('%s' % cmd)

        log('Configuring Colocations')
        log(str(colocations))
        for col_name, col_params in colocations.iteritems():
            if not pcmk.crm_opt_exists(col_name):
                cmd = 'crm -w -F configure colocation %s %s' % (col_name,
                                                                col_params)
                pcmk.commit(cmd)
                log('%s' % cmd)

        log('Configuring Clones')
        log(str(clones))
        for cln_name, cln_params in clones.iteritems():
            if not pcmk.crm_opt_exists(cln_name):
                cmd = 'crm -w -F configure clone %s %s' % (cln_name,
                                                           cln_params)
                pcmk.commit(cmd)
                log('%s' % cmd)

        for res_name, res_type in resources.iteritems():
            if len(init_services) != 0 and res_name in init_services:
                # Checks that the resources are running and started.
                # Ensure that clones are excluded as the resource is
                # not directly controllable (dealt with below)
                # Ensure that groups are cleaned up as a whole rather
                # than as individual resources.
                if (res_name not in clones.values() and
                        res_name not in groups.values() and
                        not pcmk.crm_res_running(res_name)):
                    # Just in case, cleanup the resources to ensure they get
                    # started in case they failed for some unrelated reason.
                    cmd = 'crm resource cleanup %s' % res_name
                    pcmk.commit(cmd)

        for cl_name in clones:
            # Always cleanup clones
            cmd = 'crm resource cleanup %s' % cl_name
            pcmk.commit(cmd)

        for grp_name in groups:
            # Always cleanup groups
            cmd = 'crm resource cleanup %s' % grp_name
            pcmk.commit(cmd)

    for rel_id in relation_ids('ha'):
        relation_set(relation_id=rel_id,
                     clustered="yes")
def configure_stonith():
    if config('stonith_enabled') not in ['true', 'True', True]:
        log('Disabling STONITH')
        cmd = "crm configure property stonith-enabled=false"
        pcmk.commit(cmd)
    else:
        log('Enabling STONITH for all nodes in cluster.')
        # configure stonith resources for all nodes in cluster.
        # note: this is totally provider dependent and requires
        # access to the MAAS API endpoint, using endpoint and credentials
        # set in config.
        url = config('maas_url')
        creds = config('maas_credentials')
        if None in [url, creds]:
            log('maas_url and maas_credentials must be set'
                ' in config to enable STONITH.')
            sys.exit(1)

        maas = MAAS.MAASHelper(url, creds)
        nodes = maas.list_nodes()
        if not nodes:
            log('Could not obtain node inventory from '
                'MAAS @ %s.' % url)
            sys.exit(1)

        cluster_nodes = pcmk.list_nodes()
        for node in cluster_nodes:
            rsc, constraint = pcmk.maas_stonith_primitive(nodes, node)
            if not rsc:
                log('Failed to determine STONITH primitive for node'
                    ' %s' % node)
                sys.exit(1)

            rsc_name = str(rsc).split(' ')[1]
            if not pcmk.is_resource_present(rsc_name):
                log('Creating new STONITH primitive %s.' % rsc_name)
                cmd = 'crm -F configure %s' % rsc
                pcmk.commit(cmd)
                if constraint:
                    cmd = 'crm -F configure %s' % constraint
                    pcmk.commit(cmd)
            else:
                log('STONITH primitive already exists for node.')

        cmd = "crm configure property stonith-enabled=true"
        pcmk.commit(cmd)


def get_cluster_nodes():
    hosts = []
    hosts.append(unit_get('private-address'))
    for relid in relation_ids('hanode'):
        for unit in related_units(relid):
            if relation_get('ready',
                            rid=relid,
                            unit=unit):
                hosts.append(relation_get('private-address',
                                          unit, relid))
    hosts.sort()
    return hosts
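

# NOTE: peers only count towards cluster_count once they have published
# ready=True on the hanode relation (see
# configure_principle_cluster_resources), so cluster formation waits for
# the expected number of usable peers.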


TEMPLATES_DIR = 'templates'

try:
    import jinja2
except ImportError:
    apt_install('python-jinja2', fatal=True)
    import jinja2


def render_template(template_name, context, template_dir=TEMPLATES_DIR):
    templates = jinja2.Environment(
        loader=jinja2.FileSystemLoader(template_dir)
    )
    template = templates.get_template(template_name)
    return template.render(context)
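

# For example, emit_corosync_conf() calls
# render_template('corosync.conf', get_corosync_conf()) to render
# templates/corosync.conf from the charm directory with the gathered
# context.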


@hooks.hook()
def stop():
    cmd = 'crm -w -F node delete %s' % socket.gethostname()
    pcmk.commit(cmd)
    apt_purge(['corosync', 'pacemaker'], fatal=True)


def assert_charm_supports_ipv6():
    """Check whether this charm is able to support IPv6."""
    if lsb_release()['DISTRIB_CODENAME'].lower() < "trusty":
        raise Exception("IPv6 is not supported in the charms for Ubuntu "
                        "versions less than Trusty 14.04")


if __name__ == '__main__':
    try:
        hooks.execute(sys.argv)
    except UnregisteredHookError as e:
        log('Unknown hook {} - skipping.'.format(e))