511e30ef74
1) Supports reconfiguration of cluster resources from the principal charm.
2) Supports direct configuration of mcastport and bindiface via juju configuration.
3) Sets the quorum policy based on the expected size of the cluster:
   fewer than 3 nodes = ignore quorum loss, 3 or more = stop on quorum loss.
4) Conditionally restarts corosync/pacemaker as required.

It's all just a bit nicer to use now!
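For example (assuming the subordinate is deployed as 'hacluster'; the option names come from this charm's config), the new options can be set directly:

    juju set hacluster corosync_bindiface=eth0 corosync_mcastport=5405 cluster_count=3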
554 lines
19 KiB
Python
Executable File
#!/usr/bin/python
#
# Copyright 2012 Canonical Ltd.
#
# Authors:
#  Andres Rodriguez <andres.rodriguez@canonical.com>
#

import ast
import os
import shutil
import socket
import sys
import time

from base64 import b64decode

import maas as MAAS
import pcmk
import hacluster

from charmhelpers.core.hookenv import (
    log,
    relation_get,
    related_units,
    relation_ids,
    relation_set,
    unit_get,
    config,
    Hooks, UnregisteredHookError,
    local_unit,
)

from charmhelpers.core.host import (
    service_stop,
    service_start,
    service_restart,
    service_running,
    write_file,
    mkdir,
    file_hash,
    lsb_release
)

from charmhelpers.fetch import (
    apt_install,
    apt_purge
)

from charmhelpers.contrib.hahelpers.cluster import (
    peer_units,
    oldest_peer
)

hooks = Hooks()

COROSYNC_CONF = '/etc/corosync/corosync.conf'
COROSYNC_DEFAULT = '/etc/default/corosync'
COROSYNC_AUTHKEY = '/etc/corosync/authkey'

COROSYNC_CONF_FILES = [
    COROSYNC_DEFAULT,
    COROSYNC_AUTHKEY,
    COROSYNC_CONF,
]

PACKAGES = ['corosync', 'pacemaker', 'python-netaddr', 'ipmitool']


@hooks.hook()
def install():
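    """Install the base packages and bundle the rbd OCF resource agent."""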
    apt_install(PACKAGES, fatal=True)
    # NOTE(adam_g) rbd OCF only included with newer versions of
    # ceph-resource-agents. Bundle /w charm until we figure out a
    # better way to install it.
    mkdir('/usr/lib/ocf/resource.d/ceph')
    if not os.path.isfile('/usr/lib/ocf/resource.d/ceph/rbd'):
        shutil.copy('ocf/ceph/rbd', '/usr/lib/ocf/resource.d/ceph/rbd')


def get_corosync_conf():
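    """Build the corosync configuration context.

    Local charm configuration takes precedence; otherwise fall back to
    values provided by the principal charm over the 'ha' relation.
    Returns None if any required value is still missing.
    """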
    if config('prefer-ipv6'):
        ip_version = 'ipv6'
        bindnetaddr = hacluster.get_ipv6_network_address
    else:
        ip_version = 'ipv4'
        bindnetaddr = hacluster.get_network_address

    # NOTE(jamespage) use local charm configuration over any provided by
    # principal charm
    conf = {
        'corosync_bindnetaddr':
        bindnetaddr(config('corosync_bindiface')),
        'corosync_mcastport': config('corosync_mcastport'),
        'corosync_mcastaddr': config('corosync_mcastaddr'),
        'ip_version': ip_version,
    }
    if None not in conf.itervalues():
        return conf

    conf = {}
    for relid in relation_ids('ha'):
        for unit in related_units(relid):
            bindiface = relation_get('corosync_bindiface',
                                     unit, relid)
            conf = {
                'corosync_bindnetaddr': bindnetaddr(bindiface),
                'corosync_mcastport': relation_get('corosync_mcastport',
                                                   unit, relid),
                'corosync_mcastaddr': config('corosync_mcastaddr'),
                'ip_version': ip_version,
            }

    if config('prefer-ipv6'):
        local_unit_no = int(local_unit().split('/')[1])
        # nodeid should not be 0
        conf['nodeid'] = local_unit_no + 1
        conf['netmtu'] = config('netmtu')

    if None not in conf.itervalues():
        return conf

    missing = [k for k, v in conf.iteritems() if v is None]
    log('Missing required configuration: %s' % missing)
    return None


def emit_corosync_conf():
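    """Write corosync.conf from the current context; True on success."""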
    corosync_conf_context = get_corosync_conf()
    if corosync_conf_context:
        write_file(path=COROSYNC_CONF,
                   content=render_template('corosync.conf',
                                           corosync_conf_context))
        return True
    else:
        return False


def emit_base_conf():
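    """Write the corosync defaults file and the cluster authkey.

    Returns True only if a corosync_key is present in charm config.
    """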
    corosync_default_context = {'corosync_enabled': 'yes'}
    write_file(path=COROSYNC_DEFAULT,
               content=render_template('corosync',
                                       corosync_default_context))

    corosync_key = config('corosync_key')
    if corosync_key:
        write_file(path=COROSYNC_AUTHKEY,
                   content=b64decode(corosync_key),
                   perms=0o400)
        return True
    else:
        return False


@hooks.hook()
def config_changed():
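    """Validate charm configuration and (re)configure the cluster."""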
    if config('prefer-ipv6'):
        assert_charm_supports_ipv6()

    corosync_key = config('corosync_key')
    if not corosync_key:
        log('No Corosync key supplied, cannot proceed',
            level='CRITICAL')
        sys.exit(1)

    hacluster.enable_lsb_services('pacemaker')

    if configure_corosync():
        pcmk.wait_for_pcmk()
        configure_cluster_global()
        configure_monitor_host()
        configure_stonith()


@hooks.hook()
def upgrade_charm():
    install()


def restart_corosync():
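    """Restart corosync, stopping pacemaker first and starting it again
    afterwards so the cluster stack comes up in the correct order."""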
if service_running("pacemaker"):
|
|
service_stop("pacemaker")
|
|
service_restart("corosync")
|
|
time.sleep(5)
|
|
service_start("pacemaker")
|
|
|
|
|
|
def restart_corosync_on_change():
    '''Simple decorator to restart corosync if any of its config changes.'''
    def wrap(f):
        def wrapped_f(*args):
            checksums = {}
            for path in COROSYNC_CONF_FILES:
                checksums[path] = file_hash(path)
            return_data = f(*args)
            # NOTE: this assumes that this call is always done around
            # configure_corosync, which returns True if configuration
            # files were actually generated
            if return_data:
                for path in COROSYNC_CONF_FILES:
                    if checksums[path] != file_hash(path):
                        restart_corosync()
                        break
            return return_data
        return wrapped_f
    return wrap


@restart_corosync_on_change()
def configure_corosync():
    log('Configuring and (maybe) restarting corosync')
    return emit_base_conf() and emit_corosync_conf()


def configure_monitor_host():
    '''Configure extra monitor host for better network failure detection.'''
    log('Checking monitor host configuration')
    monitor_host = config('monitor_host')
    if monitor_host:
        if not pcmk.crm_opt_exists('ping'):
            log('Implementing monitor host'
                ' configuration (host: %s)' % monitor_host)
            monitor_interval = config('monitor_interval')
            cmd = 'crm -w -F configure primitive ping' \
                  ' ocf:pacemaker:ping params host_list="%s"' \
                  ' multiplier="100" op monitor interval="%s"' % \
                  (monitor_host, monitor_interval)
            pcmk.commit(cmd)
            cmd = 'crm -w -F configure clone cl_ping ping' \
                  ' meta interleave="true"'
            pcmk.commit(cmd)
        else:
            log('Reconfiguring monitor host'
                ' configuration (host: %s)' % monitor_host)
            cmd = 'crm -w -F resource param ping set host_list="%s"' % \
                  monitor_host
            pcmk.commit(cmd)
    else:
        if pcmk.crm_opt_exists('ping'):
            log('Disabling monitor host configuration')
            pcmk.commit('crm -w -F resource stop ping')
            pcmk.commit('crm -w -F configure delete ping')


def configure_cluster_global():
    '''Configure global cluster options.'''
    log('Applying global cluster configuration')
    if int(config('cluster_count')) >= 3:
        # NOTE(jamespage) if 3 or more nodes, then quorum can be
        # managed effectively, so stop if quorum lost
        log('Configuring no-quorum-policy to stop')
        cmd = "crm configure property no-quorum-policy=stop"
    else:
        # NOTE(jamespage) if fewer than 3 nodes, quorum not possible
        # so ignore
        log('Configuring no-quorum-policy to ignore')
        cmd = "crm configure property no-quorum-policy=ignore"
    pcmk.commit(cmd)

    cmd = 'crm configure rsc_defaults $id="rsc-options"' \
          ' resource-stickiness="100"'
    pcmk.commit(cmd)


def parse_data(relid, unit, key):
    '''Simple helper to ast-parse structured relation data.'''
    data = relation_get(key, unit, relid)
    if data:
        return ast.literal_eval(data)
    else:
        return {}


@hooks.hook('ha-relation-joined',
            'ha-relation-changed',
            'hanode-relation-joined',
            'hanode-relation-changed')
def configure_principle_cluster_resources():
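    """Configure cluster resources on behalf of the principal charm.

    Resource definitions are received over the 'ha' relation; peers are
    coordinated over the 'hanode' relation. Configuration is deferred
    until corosync can be configured and enough peer units have joined.
    """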
    # Check that we are related to a principal and that
    # it has already provided the required corosync configuration
    if not get_corosync_conf():
        log('Unable to configure corosync right now, deferring configuration')
        return
    else:
        if relation_ids('hanode'):
            log('Ready to form cluster - informing peers')
            relation_set(relation_id=relation_ids('hanode')[0],
                         ready=True)
        else:
            log('Ready to form cluster, but not related to peers just yet')
            return

    # Check that there are enough nodes to perform the
    # configuration of the HA cluster
    if (len(get_cluster_nodes()) <
            int(config('cluster_count'))):
        log('Not enough nodes in cluster, deferring configuration')
        return

    relids = relation_ids('ha')
    if len(relids) == 1:  # Should only ever be one of these
        # Obtain relation information
        relid = relids[0]
        units = related_units(relid)
        if len(units) < 1:
            log('No principal unit found, deferring configuration')
            return

        unit = units[0]
        log('Parsing cluster configuration'
            ' using rid: {}, unit: {}'.format(relid, unit))
        resources = parse_data(relid, unit, 'resources')
        delete_resources = parse_data(relid, unit, 'delete_resources')
        resource_params = parse_data(relid, unit, 'resource_params')
        groups = parse_data(relid, unit, 'groups')
        ms = parse_data(relid, unit, 'ms')
        orders = parse_data(relid, unit, 'orders')
        colocations = parse_data(relid, unit, 'colocations')
        clones = parse_data(relid, unit, 'clones')
        init_services = parse_data(relid, unit, 'init_services')
    else:
        log('Related to {} ha services'.format(len(relids)))
        return

    if any(ra.startswith('ocf:openstack')
           for ra in resources.itervalues()):
        apt_install('openstack-resource-agents')
    if any(ra.startswith('ocf:ceph')
           for ra in resources.itervalues()):
        apt_install('ceph-resource-agents')

    # NOTE: this should be removed in 15.04 cycle as corosync
    # configuration should be set directly on subordinate
    configure_corosync()
    pcmk.wait_for_pcmk()
    configure_cluster_global()
    configure_monitor_host()
    configure_stonith()

    # Only configure the cluster resources
    # from the oldest peer unit.
    if oldest_peer(peer_units()):
        log('Deleting Resources')
        log(str(delete_resources))
        for res_name in delete_resources:
            if pcmk.crm_opt_exists(res_name):
                log('Stopping and deleting resource %s' % res_name)
                if pcmk.crm_res_running(res_name):
                    pcmk.commit('crm -w -F resource stop %s' % res_name)
                pcmk.commit('crm -w -F configure delete %s' % res_name)

        log('Configuring Resources')
        log(str(resources))
        for res_name, res_type in resources.iteritems():
            # disable the service we are going to put in HA
            if res_type.split(':')[0] == "lsb":
                hacluster.disable_lsb_services(res_type.split(':')[1])
                if service_running(res_type.split(':')[1]):
                    service_stop(res_type.split(':')[1])
            elif (len(init_services) != 0 and
                  res_name in init_services and
                  init_services[res_name]):
                hacluster.disable_upstart_services(init_services[res_name])
                if service_running(init_services[res_name]):
                    service_stop(init_services[res_name])
            # Put the services in HA, if not already done so
            # if not pcmk.is_resource_present(res_name):
            if not pcmk.crm_opt_exists(res_name):
                if res_name not in resource_params:
                    cmd = 'crm -w -F configure primitive %s %s' % (res_name,
                                                                   res_type)
                else:
                    cmd = 'crm -w -F configure primitive %s %s %s' % \
                        (res_name,
                         res_type,
                         resource_params[res_name])
                pcmk.commit(cmd)
                log('%s' % cmd)
                if config('monitor_host'):
                    cmd = 'crm -F configure location Ping-%s %s rule' \
                        ' -inf: pingd lte 0' % (res_name, res_name)
                    pcmk.commit(cmd)

        log('Configuring Groups')
        log(str(groups))
        for grp_name, grp_params in groups.iteritems():
            if not pcmk.crm_opt_exists(grp_name):
                cmd = 'crm -w -F configure group %s %s' % (grp_name,
                                                           grp_params)
                pcmk.commit(cmd)
                log('%s' % cmd)

        log('Configuring Master/Slave (ms)')
        log(str(ms))
        for ms_name, ms_params in ms.iteritems():
            if not pcmk.crm_opt_exists(ms_name):
                cmd = 'crm -w -F configure ms %s %s' % (ms_name, ms_params)
                pcmk.commit(cmd)
                log('%s' % cmd)

        log('Configuring Orders')
        log(str(orders))
        for ord_name, ord_params in orders.iteritems():
            if not pcmk.crm_opt_exists(ord_name):
                cmd = 'crm -w -F configure order %s %s' % (ord_name,
                                                           ord_params)
                pcmk.commit(cmd)
                log('%s' % cmd)

        log('Configuring Colocations')
        log(str(colocations))
        for col_name, col_params in colocations.iteritems():
            if not pcmk.crm_opt_exists(col_name):
                cmd = 'crm -w -F configure colocation %s %s' % (col_name,
                                                                col_params)
                pcmk.commit(cmd)
                log('%s' % cmd)

        log('Configuring Clones')
        log(str(clones))
        for cln_name, cln_params in clones.iteritems():
            if not pcmk.crm_opt_exists(cln_name):
                cmd = 'crm -w -F configure clone %s %s' % (cln_name,
                                                           cln_params)
                pcmk.commit(cmd)
                log('%s' % cmd)

        for res_name, res_type in resources.iteritems():
            if len(init_services) != 0 and res_name in init_services:
                # Check that the resources are running and started.
                # Ensure that clones are excluded as the resource is
                # not directly controllable (dealt with below).
                # Ensure that groups are cleaned up as a whole rather
                # than as individual resources.
                if (res_name not in clones.values() and
                        res_name not in groups.values() and
                        not pcmk.crm_res_running(res_name)):
                    # Just in case, cleanup the resources to ensure they get
                    # started in case they failed for some unrelated reason.
                    cmd = 'crm resource cleanup %s' % res_name
                    pcmk.commit(cmd)

        for cl_name in clones:
            # Always cleanup clones
            cmd = 'crm resource cleanup %s' % cl_name
            pcmk.commit(cmd)

        for grp_name in groups:
            # Always cleanup groups
            cmd = 'crm resource cleanup %s' % grp_name
            pcmk.commit(cmd)

    for rel_id in relation_ids('ha'):
        relation_set(relation_id=rel_id,
                     clustered="yes")


def configure_stonith():
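    """Configure STONITH resources for all nodes in the cluster.

    This is provider dependent; only MAAS is supported, via the
    maas_url and maas_credentials charm config options.
    """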
    if config('stonith_enabled') not in ['true', 'True', True]:
        log('Disabling STONITH')
        cmd = "crm configure property stonith-enabled=false"
        pcmk.commit(cmd)
    else:
        log('Enabling STONITH for all nodes in cluster.')
        # configure stonith resources for all nodes in cluster.
        # note: this is totally provider dependent and requires
        # access to the MAAS API endpoint, using endpoint and credentials
        # set in config.
        url = config('maas_url')
        creds = config('maas_credentials')
        if None in [url, creds]:
            log('maas_url and maas_credentials must be set'
                ' in config to enable STONITH.')
            sys.exit(1)

        maas = MAAS.MAASHelper(url, creds)
        nodes = maas.list_nodes()
        if not nodes:
            log('Could not obtain node inventory from '
                'MAAS @ %s.' % url)
            sys.exit(1)

        cluster_nodes = pcmk.list_nodes()
        for node in cluster_nodes:
            rsc, constraint = pcmk.maas_stonith_primitive(nodes, node)
            if not rsc:
                log('Failed to determine STONITH primitive for node'
                    ' %s' % node)
                sys.exit(1)

            rsc_name = str(rsc).split(' ')[1]
            if not pcmk.is_resource_present(rsc_name):
                log('Creating new STONITH primitive %s.' % rsc_name)
                cmd = 'crm -F configure %s' % rsc
                pcmk.commit(cmd)
                if constraint:
                    cmd = 'crm -F configure %s' % constraint
                    pcmk.commit(cmd)
            else:
                log('STONITH primitive already exists for node.')

        cmd = "crm configure property stonith-enabled=true"
        pcmk.commit(cmd)


def get_cluster_nodes():
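    """Return the sorted private addresses of this unit and all 'hanode'
    peers that have reported themselves ready."""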
    hosts = []
    hosts.append(unit_get('private-address'))
    for relid in relation_ids('hanode'):
        for unit in related_units(relid):
            if relation_get('ready',
                            rid=relid,
                            unit=unit):
                hosts.append(relation_get('private-address',
                                          unit, relid))

    hosts.sort()
    return hosts


TEMPLATES_DIR = 'templates'

try:
    import jinja2
except ImportError:
    apt_install('python-jinja2', fatal=True)
    import jinja2


def render_template(template_name, context, template_dir=TEMPLATES_DIR):
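    """Render a jinja2 template from the charm's templates directory."""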
    templates = jinja2.Environment(
        loader=jinja2.FileSystemLoader(template_dir)
    )
    template = templates.get_template(template_name)
    return template.render(context)


@hooks.hook()
def stop():
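    """Remove this node from the cluster and purge the HA packages."""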
    cmd = 'crm -w -F node delete %s' % socket.gethostname()
    pcmk.commit(cmd)
    apt_purge(['corosync', 'pacemaker'], fatal=True)


def assert_charm_supports_ipv6():
    """Check whether the charm is able to support IPv6."""
    if lsb_release()['DISTRIB_CODENAME'].lower() < "trusty":
        raise Exception("IPv6 is not supported in the charms for Ubuntu "
                        "versions less than Trusty 14.04")


if __name__ == '__main__':
    try:
        hooks.execute(sys.argv)
    except UnregisteredHookError as e:
        log('Unknown hook {} - skipping.'.format(e))