charm-hacluster/hooks/hooks.py
2016-03-23 14:47:13 +00:00

416 lines
14 KiB
Python
Executable File

#!/usr/bin/python
#
# Copyright 2015 Canonical Ltd.
#
import shutil
import os
import sys
import glob
import pcmk
import socket
from charmhelpers.core.hookenv import (
log,
DEBUG,
INFO,
related_units,
relation_ids,
relation_get,
relation_set,
config,
Hooks,
UnregisteredHookError,
local_unit,
status_set,
)
from charmhelpers.core.host import (
service_stop,
service_running,
mkdir,
)
from charmhelpers.fetch import (
apt_install,
apt_purge,
filter_installed_packages,
)
from charmhelpers.contrib.hahelpers.cluster import (
peer_units,
oldest_peer
)
from utils import (
get_corosync_conf,
assert_charm_supports_ipv6,
get_cluster_nodes,
parse_data,
configure_corosync,
configure_stonith,
configure_monitor_host,
configure_cluster_global,
enable_lsb_services,
disable_lsb_services,
disable_upstart_services,
get_ipv6_addr,
set_unit_status,
)
from charmhelpers.contrib.charmsupport import nrpe
from charmhelpers.contrib.network.ip import (
is_ipv6,
)
# Hook registry: the @hooks.hook(...) decorators in this file register their
# handlers on it, and the __main__ block dispatches through hooks.execute().
hooks = Hooks()
# Locations of the corosync files referenced by this charm.
COROSYNC_CONF = '/etc/corosync/corosync.conf'
COROSYNC_DEFAULT = '/etc/default/corosync'
COROSYNC_AUTHKEY = '/etc/corosync/authkey'
COROSYNC_CONF_FILES = [
    COROSYNC_DEFAULT,
    COROSYNC_AUTHKEY,
    COROSYNC_CONF,
]

# apt packages installed by install().  This used to be assigned twice in a
# row; only the later, longer list ever took effect, so it is the one kept.
PACKAGES = ['corosync', 'pacemaker', 'python-netaddr', 'ipmitool',
            'libnagios-plugin-perl']

# Values listed to the operator as valid for the 'corosync_transport' option.
SUPPORTED_TRANSPORTS = ['udp', 'udpu', 'multicast', 'unicast']
# Legacy option values and the transport name each one is translated to
# (see get_transport()).
DEPRECATED_TRANSPORT_VALUES = {"multicast": "udp", "unicast": "udpu"}
@hooks.hook()
def install():
    """Install the charm's apt packages and the bundled ceph rbd OCF agent.

    Only packages reported as missing by filter_installed_packages() are
    handed to apt; a failed installation is fatal.
    """
    # NOTE(dosaboy): we currently disallow upgrades due to bug #1382842. This
    # should be removed once the pacemaker package is fixed.
    status_set('maintenance', 'Installing apt packages')
    missing = filter_installed_packages(PACKAGES)
    apt_install(missing, fatal=True)

    # NOTE(adam_g) rbd OCF only included with newer versions of
    # ceph-resource-agents. Bundle /w charm until we figure out a
    # better way to install it.
    mkdir('/usr/lib/ocf/resource.d/ceph')
    rbd_agent = '/usr/lib/ocf/resource.d/ceph/rbd'
    if os.path.isfile(rbd_agent):
        # An agent is already in place; leave it untouched.
        return
    # Source path is relative - presumably to the charm directory the hook
    # runs from (TODO confirm).
    shutil.copy('ocf/ceph/rbd', rbd_agent)
def get_transport():
    """Return the corosync transport selected by 'corosync_transport'.

    Deprecated option values are translated through
    DEPRECATED_TRANSPORT_VALUES before validation.

    :returns: 'udp' or 'udpu'.
    :raises ValueError: if the (translated) value is neither of those; the
        unit status is set to 'blocked' with the same message first.
    """
    requested = config('corosync_transport')
    effective = DEPRECATED_TRANSPORT_VALUES.get(requested, requested)
    if effective in ('udp', 'udpu'):
        return effective

    # Report the value exactly as the operator configured it.
    msg = ("Unsupported corosync_transport type '%s' - supported "
           "types are: %s" % (requested, ', '.join(SUPPORTED_TRANSPORTS)))
    status_set('blocked', msg)
    raise ValueError(msg)
def ensure_ipv6_requirements(hanode_rid):
    """Make sure this unit advertises an IPv6 private-address.

    Reads the local unit's 'private-address' on the given relation and, when
    it is not an IPv6 address, replaces it with the result of
    get_ipv6_addr().

    :param hanode_rid: id of the hanode relation to inspect and update.
    """
    # hanode relation needs ipv6 private-address
    current = relation_get(rid=hanode_rid, unit=local_unit(),
                           attribute='private-address')
    log("Current private-address is %s" % (current))
    if is_ipv6(current):
        # Already compliant; nothing to publish.
        return

    replacement = get_ipv6_addr()
    log("New private-address is %s" % (replacement))
    # The key contains a dash, so it has to be passed via dict expansion.
    relation_set(relation_id=hanode_rid,
                 **{'private-address': replacement})
@hooks.hook()
def config_changed():
    """React to a charm configuration change.

    Validates the configuration, enables the pacemaker service, (re)writes
    the corosync configuration and refreshes the NRPE checks.

    :raises Exception: if the 'corosync_key' option is not set; the unit
        status is set to 'blocked' before raising.
    """
    if config('prefer-ipv6'):
        assert_charm_supports_ipv6()

    if not config('corosync_key'):
        msg = 'No Corosync key supplied, cannot proceed'
        status_set('blocked', msg)
        raise Exception(msg)

    enable_lsb_services('pacemaker')

    if config('prefer-ipv6'):
        for hanode_rid in relation_ids('hanode'):
            ensure_ipv6_requirements(hanode_rid)

    status_set('maintenance', "Setting up corosync")
    # The cluster-wide settings are only (re)applied when
    # configure_corosync() returns a truthy value.
    if configure_corosync():
        pcmk.wait_for_pcmk()
        configure_cluster_global()
        configure_monitor_host()
        configure_stonith()

    update_nrpe_config()
@hooks.hook()
def upgrade_charm():
    """Re-run the install steps and refresh the NRPE checks on upgrade."""
    # install() is safe to repeat: it only asks apt for packages that are
    # still missing and does not overwrite an existing rbd agent.
    install()
    update_nrpe_config()
@hooks.hook('hanode-relation-joined',
            'hanode-relation-changed')
def hanode_relation_changed():
    """Handle peer (hanode) relation events, then re-evaluate the cluster."""
    prefer_ipv6 = config('prefer-ipv6')
    if prefer_ipv6:
        # NOTE(review): a relation id of None presumably selects the relation
        # this hook is currently running for - confirm against charmhelpers.
        ensure_ipv6_requirements(None)

    ha_relation_changed()
def _configure_crm_objects(label, crm_type, objects):
    """Create each pacemaker object of one type that does not exist yet.

    :param label: plural, human-readable name used in the debug log line.
    :param crm_type: ``crm configure`` sub-command, e.g. ``group``.
    :param objects: dict mapping an object name to the parameters that are
        interpolated after it on the crm command line.
    """
    log('Configuring %s: %s' % (label, objects), level=DEBUG)
    for name, params in objects.items():
        if not pcmk.crm_opt_exists(name):
            cmd = 'crm -w -F configure %s %s %s' % (crm_type, name, params)
            pcmk.commit(cmd)
            log('%s' % cmd, level=DEBUG)


@hooks.hook('ha-relation-joined',
            'ha-relation-changed')
def ha_relation_changed():
    """Configure the pacemaker cluster from the data on the 'ha' relation.

    Configuration is deferred (the function returns without doing anything
    further) while any of these hold:

    * no corosync configuration is available yet;
    * there is no hanode (peer) relation yet;
    * fewer nodes than the 'cluster_count' option are in the cluster;
    * there is not exactly one 'ha' relation, or it has no remote unit.

    Otherwise corosync and the cluster-wide settings are applied on every
    unit, the crm resources are created/deleted only on the unit for which
    oldest_peer() is true, and finally ``clustered="yes"`` is set on every
    'ha' relation.
    """
    # Check that we are related to a principal and that
    # it has already provided the required corosync configuration
    if not get_corosync_conf():
        log('Unable to configure corosync right now, deferring configuration',
            level=INFO)
        return

    hanode_rids = relation_ids('hanode')
    if not hanode_rids:
        log('Ready to form cluster, but not related to peers just yet',
            level=INFO)
        return

    log('Ready to form cluster - informing peers', level=DEBUG)
    relation_set(relation_id=hanode_rids[0], ready=True)

    # Check that there's enough nodes in order to perform the
    # configuration of the HA cluster
    if len(get_cluster_nodes()) < int(config('cluster_count')):
        log('Not enough nodes in cluster, deferring configuration',
            level=INFO)
        return

    relids = relation_ids('ha')
    if len(relids) != 1:  # Should only ever be one of these
        log('Related to %s ha services' % (len(relids)), level=DEBUG)
        return

    # Obtain relation information
    relid = relids[0]
    units = related_units(relid)
    if not units:
        log('No principle unit found, deferring configuration',
            level=INFO)
        return

    unit = units[0]
    log('Parsing cluster configuration using rid: %s, unit: %s' %
        (relid, unit), level=DEBUG)
    resources = parse_data(relid, unit, 'resources')
    delete_resources = parse_data(relid, unit, 'delete_resources')
    resource_params = parse_data(relid, unit, 'resource_params')
    groups = parse_data(relid, unit, 'groups')
    ms = parse_data(relid, unit, 'ms')
    orders = parse_data(relid, unit, 'orders')
    colocations = parse_data(relid, unit, 'colocations')
    clones = parse_data(relid, unit, 'clones')
    locations = parse_data(relid, unit, 'locations')
    init_services = parse_data(relid, unit, 'init_services')

    # Pull in the extra resource-agent packages the requested resources need.
    if any(ra.startswith('ocf:openstack') for ra in resources.values()):
        apt_install('openstack-resource-agents')
    if any(ra.startswith('ocf:ceph') for ra in resources.values()):
        apt_install('ceph-resource-agents')

    # NOTE: this should be removed in 15.04 cycle as corosync
    # configuration should be set directly on subordinate
    configure_corosync()
    pcmk.wait_for_pcmk()
    configure_cluster_global()
    configure_monitor_host()
    configure_stonith()

    # Only configure the cluster resources
    # from the oldest peer unit.
    if oldest_peer(peer_units()):
        # The format string previously had no '%s', so the list of resources
        # was silently dropped from the log line.
        log('Deleting Resources: %s' % (delete_resources), level=DEBUG)
        for res_name in delete_resources:
            if pcmk.crm_opt_exists(res_name):
                log('Stopping and deleting resource %s' % res_name,
                    level=DEBUG)
                if pcmk.crm_res_running(res_name):
                    pcmk.commit('crm -w -F resource stop %s' % res_name)
                pcmk.commit('crm -w -F configure delete %s' % res_name)

        log('Configuring Resources: %s' % (resources), level=DEBUG)
        for res_name, res_type in resources.items():
            # disable the service we are going to put in HA
            if res_type.split(':')[0] == "lsb":
                lsb_service = res_type.split(':')[1]
                disable_lsb_services(lsb_service)
                if service_running(lsb_service):
                    service_stop(lsb_service)
            elif res_name in init_services and init_services[res_name]:
                init_service = init_services[res_name]
                disable_upstart_services(init_service)
                if service_running(init_service):
                    service_stop(init_service)

            # Put the services in HA, if not already done so
            if not pcmk.crm_opt_exists(res_name):
                if res_name not in resource_params:
                    cmd = 'crm -w -F configure primitive %s %s' % (res_name,
                                                                   res_type)
                else:
                    cmd = ('crm -w -F configure primitive %s %s %s' %
                           (res_name, res_type, resource_params[res_name]))

                pcmk.commit(cmd)
                log('%s' % cmd, level=DEBUG)
                if config('monitor_host'):
                    cmd = ('crm -F configure location Ping-%s %s rule '
                           '-inf: pingd lte 0' % (res_name, res_name))
                    pcmk.commit(cmd)

        # Same order as before: later object types may refer to earlier ones.
        for label, crm_type, objects in (
                ('Groups', 'group', groups),
                ('Master/Slave (ms)', 'ms', ms),
                ('Orders', 'order', orders),
                ('Colocations', 'colocation', colocations),
                ('Clones', 'clone', clones),
                ('Locations', 'location', locations)):
            _configure_crm_objects(label, crm_type, objects)

        for res_name in resources:
            if res_name in init_services:
                # Checks that the resources are running and started.
                # Ensure that clones are excluded as the resource is
                # not directly controllable (dealt with below)
                # Ensure that groups are cleaned up as a whole rather
                # than as individual resources.
                if (res_name not in clones.values() and
                        res_name not in groups.values() and
                        not pcmk.crm_res_running(res_name)):
                    # Just in case, cleanup the resources to ensure they get
                    # started in case they failed for some unrelated reason.
                    cmd = 'crm resource cleanup %s' % res_name
                    pcmk.commit(cmd)

        for cl_name in clones:
            # Always cleanup clones
            cmd = 'crm resource cleanup %s' % cl_name
            pcmk.commit(cmd)

        for grp_name in groups:
            # Always cleanup groups
            cmd = 'crm resource cleanup %s' % grp_name
            pcmk.commit(cmd)

    for rel_id in relation_ids('ha'):
        relation_set(relation_id=rel_id, clustered="yes")
@hooks.hook()
def stop():
    """Remove this node from the cluster and purge corosync/pacemaker."""
    # The node is identified to crm by this machine's hostname.
    this_node = socket.gethostname()
    pcmk.commit('crm -w -F node delete %s' % this_node)
    apt_purge(['corosync', 'pacemaker'], fatal=True)
@hooks.hook('nrpe-external-master-relation-joined',
            'nrpe-external-master-relation-changed')
def update_nrpe_config():
    """Install the charm's Nagios plugins and register the NRPE checks.

    Copies every regular file from $CHARM_DIR/files/nrpe into the Nagios
    plugin directory and from $CHARM_DIR/files/sudoers into /etc/sudoers.d,
    installs python-dbus, then writes four checks (corosync rings, crm
    status, corosync process, pacemakerd process).

    :raises KeyError: if CHARM_DIR is not set in the environment.
    """
    plugins_src = os.path.join(os.environ["CHARM_DIR"], "files",
                               "nrpe")
    plugins_dst = "/usr/local/lib/nagios/plugins"
    if not os.path.exists(plugins_dst):
        os.makedirs(plugins_dst)
    for path in glob.glob(os.path.join(plugins_src, "*")):
        if not os.path.isfile(path):
            continue
        shutil.copy2(path,
                     os.path.join(plugins_dst, os.path.basename(path)))

    sudoers_src = os.path.join(os.environ["CHARM_DIR"], "files",
                               "sudoers")
    sudoers_dst = "/etc/sudoers.d"
    for path in glob.glob(os.path.join(sudoers_src, "*")):
        if not os.path.isfile(path):
            continue
        shutil.copy2(path,
                     os.path.join(sudoers_dst, os.path.basename(path)))

    nagios_host = nrpe.get_nagios_hostname()
    unit_name = nrpe.get_nagios_unit_name()
    nrpe_setup = nrpe.NRPE(hostname=nagios_host)

    apt_install('python-dbus')

    # (shortname, description template, check command), registered in order:
    # the corosync/crm checks first, then the process checks.
    checks = (
        ('corosync_rings', 'Check Corosync rings {%s}',
         'check_corosync_rings'),
        ('crm_status', 'Check crm status {%s}',
         'check_crm'),
        ('corosync_proc', 'Check Corosync process {%s}',
         'check_procs -c 1:1 -C corosync'),
        ('pacemakerd_proc', 'Check Pacemakerd process {%s}',
         'check_procs -c 1:1 -C pacemakerd'),
    )
    for shortname, description, check_cmd in checks:
        nrpe_setup.add_check(
            shortname=shortname,
            description=description % unit_name,
            check_cmd=check_cmd)

    nrpe_setup.write()
if __name__ == '__main__':
    # Dispatch to the handler registered for the hook named on the command
    # line; a hook with no registered handler is logged and skipped.
    try:
        hooks.execute(sys.argv)
    except UnregisteredHookError as unknown_hook:
        log('Unknown hook {} - skipping.'.format(unknown_hook), level=DEBUG)
    # Runs after every hook, including skipped ones.
    set_unit_status()