Manage Apache and Nagios with Pacemaker

This change implements the OCF resource agents that manage the Apache
and Nagios services in the network namespace where the VIP address is
running. It configures the necessary Pacemaker resources and makes sure
that the Apache and Nagios services are always co-located with the VIP.

Change-Id: I524157498537fa4a652f2f59e267a0ceb12f8192
This commit is contained in:
Simon Pasquier 2016-01-07 17:16:15 +01:00
parent 008c8c786f
commit e57f7261d2
5 changed files with 724 additions and 2 deletions

View File

@ -0,0 +1,122 @@
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Install the Apache OCF resource agent shipped with the lma_infra_alerting
# module into /usr/lib/ocf/resource.d/fuel, executable and owned by root.
file { 'ocf-ns_apache':
  ensure => present,
  owner  => 'root',
  group  => 'root',
  mode   => '0755',
  path   => '/usr/lib/ocf/resource.d/fuel/ocf-ns_apache',
  source => 'puppet:///modules/lma_infra_alerting/ocf-ns_apache',
}
# Install the Nagios OCF resource agent shipped with the lma_infra_alerting
# module into /usr/lib/ocf/resource.d/fuel, executable and owned by root.
file { 'ocf-ns_nagios':
  ensure => present,
  owner  => 'root',
  group  => 'root',
  mode   => '0755',
  path   => '/usr/lib/ocf/resource.d/fuel/ocf-ns_nagios',
  source => 'puppet:///modules/lma_infra_alerting/ocf-ns_nagios',
}
# Apache and Nagios need to bind to the VIP address, so non-local binds are
# enabled. The sysctl is only written when its current value does not
# already match "1".
exec { 'net.ipv4.ip_nonlocal_bind':
  unless  => '/sbin/sysctl -n net.ipv4.ip_nonlocal_bind | /bin/grep 1',
  command => '/sbin/sysctl -w net.ipv4.ip_nonlocal_bind=1',
}
# Pacemaker primitive for Apache. It is driven by the ocf-ns_apache agent,
# runs in the infrastructure_alerting namespace and is health-checked through
# the server-status URL. The agent file and the sysctl must be in place first.
pacemaker_wrappers::service { 'apache2':
  require        => [
    File['ocf-ns_apache'],
    Exec['net.ipv4.ip_nonlocal_bind'],
  ],
  primitive_type => 'ocf-ns_apache',
  prefix         => false,
  use_handler    => false,
  parameters     => {
    'ns'         => 'infrastructure_alerting',
    'status_url' => 'http://localhost:8001/server-status',
  },
  metadata       => {
    'migration-threshold' => '3',
    'failure-timeout'     => '120',
  },
  operations     => {
    'monitor' => { 'interval' => '30', 'timeout' => '60' },
    'start'   => { 'timeout' => '60' },
    'stop'    => { 'timeout' => '60' },
  },
}
# Keep the apache2 primitive on the same node as the alerting VIP
# (INFINITY score). Declared only once the apache2 primitive exists.
cs_rsc_colocation { 'infrastructure_alerting_vip-with-apache2':
  ensure     => present,
  require    => Cs_resource['apache2'],
  score      => 'INFINITY',
  primitives => ['vip__infrastructure_alerting_mgmt_vip', 'apache2'],
}
# Make sure apache2 is running, but only after its colocation constraint
# with the VIP has been created.
service { 'apache2':
  require => Cs_rsc_colocation['infrastructure_alerting_vip-with-apache2'],
  ensure  => 'running',
}
# Pacemaker primitive for Nagios. It is driven by the ocf-ns_nagios agent and
# runs in the infrastructure_alerting namespace. The agent file and the
# sysctl must be in place first.
pacemaker_wrappers::service { 'nagios3':
  require        => [
    File['ocf-ns_nagios'],
    Exec['net.ipv4.ip_nonlocal_bind'],
  ],
  primitive_type => 'ocf-ns_nagios',
  prefix         => false,
  use_handler    => false,
  parameters     => {
    'ns' => 'infrastructure_alerting',
  },
  metadata       => {
    'migration-threshold' => '3',
    'failure-timeout'     => '120',
  },
  operations     => {
    'monitor' => { 'interval' => '30', 'timeout' => '60' },
    'start'   => { 'timeout' => '60' },
    'stop'    => { 'timeout' => '60' },
  },
}
# Keep the nagios3 primitive on the same node as the alerting VIP
# (INFINITY score). Declared only once the nagios3 primitive exists.
cs_rsc_colocation { 'infrastructure_alerting_vip-with-nagios':
  ensure     => present,
  require    => Cs_resource['nagios3'],
  score      => 'INFINITY',
  primitives => ['vip__infrastructure_alerting_mgmt_vip', 'nagios3'],
}
# Make sure nagios3 is running, but only after its own colocation constraint
# with the VIP has been created. The dependency previously pointed at the
# Apache colocation, so nothing ordered this service after the Nagios one.
service { 'nagios3':
  ensure  => 'running',
  require => Cs_rsc_colocation['infrastructure_alerting_vip-with-nagios'],
}

View File

@ -12,13 +12,17 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Location and base name of the plugin's Hiera override file.
# NOTE(review): both assignments appear twice in this view; this looks like
# old and new diff lines shown together -- confirm against the real file.
$hiera_dir = '/etc/hiera/plugins'
$plugin_name = 'lma_infrastructure_alerting'
$hiera_dir = '/etc/hiera/plugins'
$plugin_name = 'lma_infrastructure_alerting'
# Read the address of the infrastructure alerting management VIP from the
# 'network_metadata' Hiera key.
$network_metadata = hiera('network_metadata')
$alerting_vip = $network_metadata['vips']['infrastructure_alerting_mgmt_vip']['ipaddr']
# YAML document exposing the corosync role, the VIP address and the name of
# the namespace as Hiera keys.
$calculated_content = inline_template('
---
lma::corosync_roles:
- infrastructure_alerting
lma::infrastructure_alerting::vip: <%= @alerting_vip %>
lma::infrastructure_alerting::vip_ns: infrastructure_alerting
')
file { "${hiera_dir}/${plugin_name}.yaml":

View File

@ -0,0 +1,300 @@
#!/bin/bash
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# See usage() function below for more details ...
#
# OCF instance parameters:
# OCF_RESKEY_binary
# OCF_RESKEY_config
# OCF_RESKEY_ns
# OCF_RESKEY_status_url
#######################################################################
# Initialization:
: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
#######################################################################
# Fill in some defaults if no values are specified
SERVICE_NAME="Apache"
OCF_RESKEY_binary_default="/usr/sbin/apache2"
OCF_RESKEY_config_default="/etc/apache2/apache2.conf"
OCF_RESKEY_ns_default=
# NOTE(review): mod_status is usually served at /server-status (the Puppet
# manifest passes .../server-status); confirm whether this default is a typo.
OCF_RESKEY_status_url_default="http://localhost/status-server"
: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}"
: "${OCF_RESKEY_config=${OCF_RESKEY_config_default}}"
: "${OCF_RESKEY_ns=${OCF_RESKEY_ns_default}}"
: "${OCF_RESKEY_status_url=${OCF_RESKEY_status_url_default}}"
# Command prefixes used to run helpers (RUN_IN_NS) and the daemon (RUN)
# inside the configured network namespace. Callers expand them unquoted on
# purpose so that they word-split into a command plus arguments.
# When no namespace is configured both prefixes are empty; building
# "ip netns exec " with an empty name would make ip treat the next word
# (the command itself) as the namespace name.
if [ -z "${OCF_RESKEY_ns}" ]; then
  RUN_IN_NS=''
  RUN=''
else
  RUN_IN_NS="ip netns exec $OCF_RESKEY_ns "
  RUN="$RUN_IN_NS "
fi
APACHE_PID_FILE="/var/run/apache2/apache2.pid"
#######################################################################
# Print the list of supported actions and a one-line description of each
# on stdout.
usage() {
  printf '%s\n' \
    "usage: $0 (start|stop|validate-all|meta-data|status|monitor)" \
    "$0 manages the ${SERVICE_NAME} process as an HA resource" \
    "The 'start' operation starts the ${SERVICE_NAME}" \
    "The 'stop' operation stops the ${SERVICE_NAME}" \
    "The 'validate-all' operation reports whether the parameters are valid" \
    "The 'meta-data' operation reports this RA's meta-data information" \
    "The 'status' operation reports whether the ${SERVICE_NAME} is running" \
    "The 'monitor' operation reports whether the ${SERVICE_NAME} is running"
}
# Print the resource agent's meta-data XML on stdout. The document describes
# the four instance parameters (binary, config, ns, status_url) together with
# their defaults, and lists the supported actions with their timeouts.
# The heredoc is unquoted, so the ${..._default} references are expanded.
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="apache">
<version>1.0</version>
<longdesc lang="en">
Manages the Apache daemon in a network namespace as a Pacemaker Resource.
</longdesc>
<shortdesc lang="en">Manages Apache</shortdesc>
<parameters>
<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Path of the Apache binary file that will be run.
</longdesc>
<shortdesc lang="en">Apache binary file</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>
<parameter name="config" unique="0" required="0">
<longdesc lang="en">
Path to the Apache configuration file
</longdesc>
<shortdesc lang="en">Apache configuration</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>
<parameter name="ns" unique="0" required="0">
<longdesc lang="en">
Network namespace in which Apache will be run
</longdesc>
<shortdesc lang="en">Apache namespace</shortdesc>
<content type="string" default="${OCF_RESKEY_ns_default}" />
</parameter>
<parameter name="status_url" unique="0" required="0">
<longdesc lang="en">
The status URL for checking Apache
</longdesc>
<shortdesc lang="en">Apache status URL</shortdesc>
<content type="string" default="${OCF_RESKEY_status_url_default}" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="status" timeout="20" />
<action name="monitor" timeout="30" interval="20" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}
#######################################################################
# Functions invoked by resource manager actions
# Check that the agent can operate on this node.
#
# Verifies the binary (check_binary comes from ocf-shellfuncs; presumably it
# aborts when the binary is missing -- confirm), the configuration file and,
# when one is configured, the network namespace. Finally makes sure that the
# directory holding the PID file exists.
#
# The "ns" parameter is optional (empty default), so the namespace probe is
# skipped when it is empty instead of failing on "ip netns pids ''".
#
# Returns: 0 on success, $OCF_ERR_INSTALLED when the configuration file or
#          the configured namespace is missing.
service_validate() {
  local rc
  check_binary "$OCF_RESKEY_binary"
  if [[ ! -f $OCF_RESKEY_config ]]; then
    ocf_log err "Config $OCF_RESKEY_config doesn't exist"
    return "$OCF_ERR_INSTALLED"
  fi
  if [ -n "$OCF_RESKEY_ns" ]; then
    ip netns pids "$OCF_RESKEY_ns" > /dev/null 2>&1
    rc=$?
    if [ $rc -ne 0 ]; then
      ocf_log err "Namespace $OCF_RESKEY_ns doesn't exist or can't be accessed"
      return "$OCF_ERR_INSTALLED"
    fi
  fi
  # Best effort: a failure to create the directory is not reported here.
  mkdir -p "$(dirname "${APACHE_PID_FILE}")"
  true
}
# Report whether Apache is running.
#
# Returns:
#   $OCF_NOT_RUNNING  the PID file is missing, or the recorded process does
#                     not answer signal 0
#   $OCF_ERR_GENERIC  the PID file is empty, or the status URL is set and
#                     the reported HTTP code does not contain 200
#   $OCF_SUCCESS      otherwise
service_status() {
  local apache_pid

  if [ ! -f "$APACHE_PID_FILE" ]; then
    ocf_log info "Apache is not running"
    return "$OCF_NOT_RUNNING"
  fi

  apache_pid=$(cat "$APACHE_PID_FILE")
  if [ -z "${apache_pid}" ]; then
    ocf_log err "PID file ${APACHE_PID_FILE} is empty!"
    return "$OCF_ERR_GENERIC"
  fi

  # Signal 0 only probes for the existence of the process.
  if ! ocf_run -warn kill -s 0 "$apache_pid"; then
    ocf_log info "Old PID file found, but Apache process isn't running"
    return "$OCF_NOT_RUNNING"
  fi

  # Without a status URL, a live process is all that is checked.
  [ -n "${OCF_RESKEY_status_url}" ] || return "$OCF_SUCCESS"

  # RUN_IN_NS is expanded unquoted on purpose (command prefix).
  $RUN_IN_NS /usr/bin/curl -sL -w "%{http_code}" -XGET "${OCF_RESKEY_status_url}" -o /dev/null \
    | grep -q 200 || return "$OCF_ERR_GENERIC"

  return "$OCF_SUCCESS"
}
# The monitor action is the same check as status; its return code is passed
# through unchanged.
service_monitor() {
  service_status
}
# Start Apache and wait until the monitor reports success.
#
# Returns $OCF_SUCCESS when the service was already running or came up.
# Exits with $OCF_ERR_GENERIC as soon as the monitor reports anything other
# than "running" or "not running" while waiting. The wait loop itself has no
# upper bound.
service_start() {
  local state

  service_monitor
  state=$?
  if [ "$state" -eq "$OCF_SUCCESS" ]; then
    ocf_log info "${SERVICE_NAME} is already running"
    return "$OCF_SUCCESS"
  fi

  # This is required for Linux kernels >= 3.19. Previously the
  # net.ipv4.ip_nonlocal_bind setting was global to all namespaces but
  # starting with this version, it is per namespace.
  ocf_run ${RUN_IN_NS} /sbin/sysctl -w net.ipv4.ip_nonlocal_bind=1
  ocf_run ${RUN_IN_NS} ip link set up dev lo
  ocf_run ${RUN} bash -c ". /etc/apache2/envvars && ${OCF_RESKEY_binary} -k start"

  # Poll every 3 seconds until the server answers.
  service_monitor
  state=$?
  while [ "$state" -ne "$OCF_SUCCESS" ]; do
    if [ "$state" -ne "$OCF_NOT_RUNNING" ]; then
      ocf_log err "${SERVICE_NAME} start failed"
      exit "$OCF_ERR_GENERIC"
    fi
    sleep 3
    service_monitor
    state=$?
  done

  ocf_log info "${SERVICE_NAME} started"
  return "$OCF_SUCCESS"
}
# Stop Apache: send SIGTERM, wait for the monitor to report "not running",
# then fall back to SIGKILL. The PID file is removed at the end.
#
# The wait lasts shutdown_timeout one-second rounds: 15 by default, or the
# operation timeout (OCF_RESKEY_CRM_meta_timeout, divided by 1000) minus 5
# when that variable is set.
#
# Returns $OCF_SUCCESS; exits with $OCF_ERR_GENERIC if SIGTERM can't be sent.
service_stop() {
  local rc
  local daemon_pid

  service_monitor
  if [ $? -eq "$OCF_NOT_RUNNING" ]; then
    ocf_log info "${SERVICE_NAME} is already stopped"
    return "$OCF_SUCCESS"
  fi

  # Try SIGTERM
  daemon_pid=$(cat "$APACHE_PID_FILE")
  if ! ocf_run kill -s TERM "$daemon_pid"; then
    ocf_log err "${SERVICE_NAME} couldn't be stopped"
    exit "$OCF_ERR_GENERIC"
  fi

  # stop waiting
  shutdown_timeout=15
  [ -z "$OCF_RESKEY_CRM_meta_timeout" ] ||
    shutdown_timeout=$(( (OCF_RESKEY_CRM_meta_timeout / 1000) - 5 ))

  for (( count = 0; count < shutdown_timeout; count++ )); do
    service_monitor
    [ $? -ne "$OCF_NOT_RUNNING" ] || break
    sleep 1
    ocf_log debug "${SERVICE_NAME} still hasn't stopped yet. Waiting ..."
  done

  service_monitor
  rc=$?
  if [ "${rc}" -ne "${OCF_NOT_RUNNING}" ]; then
    # SIGTERM didn't help either, try SIGKILL
    ocf_log info "${SERVICE_NAME} failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL ..."
    ocf_run kill -s KILL "${daemon_pid}"
  fi

  ocf_log info "${SERVICE_NAME} stopped"
  rm -f "${APACHE_PID_FILE}"
  return "${OCF_SUCCESS}"
}
#######################################################################
# Informational actions are answered before any validation takes place.
if [ "$1" = "meta-data" ]; then
  meta_data
  exit "$OCF_SUCCESS"
elif [ "$1" = "usage" ] || [ "$1" = "help" ]; then
  usage
  exit "$OCF_SUCCESS"
fi

# Anything except meta-data and help must pass validation
service_validate || exit $?

# Run the requested action; its status becomes the script's exit status.
case "$1" in
  start)
    service_start
    ;;
  stop)
    service_stop
    ;;
  status)
    service_status
    ;;
  monitor)
    service_monitor
    ;;
  validate-all)
    # Validation already succeeded above.
    ;;
  *)
    usage
    exit "$OCF_ERR_UNIMPLEMENTED"
    ;;
esac

View File

@ -0,0 +1,288 @@
#!/bin/bash
# Copyright 2016 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# See usage() function below for more details ...
#
# OCF instance parameters:
# OCF_RESKEY_binary
# OCF_RESKEY_config
# OCF_RESKEY_ns
#######################################################################
# Initialization:
: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
#######################################################################
# Fill in some defaults if no values are specified
SERVICE_NAME="Nagios"
OCF_RESKEY_binary_default="/usr/sbin/nagios3"
OCF_RESKEY_config_default="/etc/nagios3/nagios.cfg"
OCF_RESKEY_ns_default=
: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}"
: "${OCF_RESKEY_config=${OCF_RESKEY_config_default}}"
: "${OCF_RESKEY_ns=${OCF_RESKEY_ns_default}}"
# Command prefixes used to run helpers (RUN_IN_NS) and the daemon (RUN)
# inside the configured network namespace. Callers expand them unquoted on
# purpose so that they word-split into a command plus arguments.
# When no namespace is configured both prefixes are empty; building
# "ip netns exec " with an empty name would make ip treat the next word
# (the command itself) as the namespace name.
if [ -z "${OCF_RESKEY_ns}" ]; then
  RUN_IN_NS=''
  RUN=''
else
  RUN_IN_NS="ip netns exec $OCF_RESKEY_ns "
  RUN="$RUN_IN_NS "
fi
NAGIOS_PID_FILE="/var/run/nagios3/nagios3.pid"
#######################################################################
# Print the list of supported actions and a one-line description of each
# on stdout.
usage() {
  printf '%s\n' \
    "usage: $0 (start|stop|validate-all|meta-data|status|monitor)" \
    "$0 manages the ${SERVICE_NAME} process as an HA resource" \
    "The 'start' operation starts the ${SERVICE_NAME}" \
    "The 'stop' operation stops the ${SERVICE_NAME}" \
    "The 'validate-all' operation reports whether the parameters are valid" \
    "The 'meta-data' operation reports this RA's meta-data information" \
    "The 'status' operation reports whether the ${SERVICE_NAME} is running" \
    "The 'monitor' operation reports whether the ${SERVICE_NAME} is running"
}
# Print the resource agent's meta-data XML on stdout. The document describes
# the three instance parameters (binary, config, ns) together with their
# defaults, and lists the supported actions with their timeouts.
# The heredoc is unquoted, so the ${..._default} references are expanded.
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="nagios">
<version>1.0</version>
<longdesc lang="en">
Manages the Nagios daemon in a network namespace as a Pacemaker Resource.
</longdesc>
<shortdesc lang="en">Manages Nagios</shortdesc>
<parameters>
<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Path of the Nagios binary file that will be run.
</longdesc>
<shortdesc lang="en">Nagios binary file</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>
<parameter name="config" unique="0" required="0">
<longdesc lang="en">
Path to the Nagios configuration file
</longdesc>
<shortdesc lang="en">Nagios configuration</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>
<parameter name="ns" unique="0" required="0">
<longdesc lang="en">
Network namespace in which Nagios will be run
</longdesc>
<shortdesc lang="en">Nagios namespace</shortdesc>
<content type="string" default="${OCF_RESKEY_ns_default}" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="status" timeout="20" />
<action name="monitor" timeout="30" interval="20" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}
#######################################################################
# Functions invoked by resource manager actions
# Check that the agent can operate on this node.
#
# Verifies the binary (check_binary comes from ocf-shellfuncs; presumably it
# aborts when the binary is missing -- confirm), the configuration file and,
# when one is configured, the network namespace. Finally makes sure that the
# directory holding the PID file exists.
#
# The "ns" parameter is optional (empty default), so the namespace probe is
# skipped when it is empty instead of failing on "ip netns pids ''".
#
# Returns: 0 on success, $OCF_ERR_INSTALLED when the configuration file or
#          the configured namespace is missing.
service_validate() {
  local rc
  check_binary "$OCF_RESKEY_binary"
  if [[ ! -f $OCF_RESKEY_config ]]; then
    ocf_log err "Config $OCF_RESKEY_config doesn't exist"
    return "$OCF_ERR_INSTALLED"
  fi
  if [ -n "$OCF_RESKEY_ns" ]; then
    ip netns pids "$OCF_RESKEY_ns" > /dev/null 2>&1
    rc=$?
    if [ $rc -ne 0 ]; then
      ocf_log err "Namespace $OCF_RESKEY_ns doesn't exist or can't be accessed"
      return "$OCF_ERR_INSTALLED"
    fi
  fi
  # Best effort: a failure to create the directory is not reported here.
  mkdir -p "$(dirname "${NAGIOS_PID_FILE}")"
  true
}
# Report whether Nagios is running, based on its PID file.
#
# Returns:
#   $OCF_NOT_RUNNING  the PID file is missing, or the recorded process does
#                     not answer signal 0
#   $OCF_ERR_GENERIC  the PID file is empty
#   $OCF_SUCCESS      otherwise
service_status() {
  local nagios_pid

  if [ ! -f "$NAGIOS_PID_FILE" ]; then
    ocf_log info "Nagios is not running"
    return "$OCF_NOT_RUNNING"
  fi

  nagios_pid=$(cat "$NAGIOS_PID_FILE")
  if [ -z "${nagios_pid}" ]; then
    ocf_log err "PID file ${NAGIOS_PID_FILE} is empty!"
    return "$OCF_ERR_GENERIC"
  fi

  # Signal 0 only probes for the existence of the process.
  if ! ocf_run -warn kill -s 0 "$nagios_pid"; then
    ocf_log info "Old PID file found, but Nagios process isn't running"
    return "$OCF_NOT_RUNNING"
  fi

  return "$OCF_SUCCESS"
}
# The monitor action is the same check as status; its return code is passed
# through unchanged.
service_monitor() {
  service_status
}
# Start Nagios and wait until the monitor reports success.
#
# The configuration is verified first (binary -v config); the daemon is then
# launched with -d through the RUN prefix. The binary and configuration
# paths are passed as single quoted arguments so that a path containing
# whitespace is not split into several words.
#
# Returns $OCF_SUCCESS when the service was already running or came up.
# Exits with $OCF_ERR_GENERIC when the configuration check fails, or as soon
# as the monitor reports anything other than "running" or "not running"
# while waiting. The wait loop itself has no upper bound.
service_start() {
  local rc
  service_monitor
  rc=$?
  if [ $rc -eq "$OCF_SUCCESS" ]; then
    ocf_log info "${SERVICE_NAME} is already running"
    return "$OCF_SUCCESS"
  fi
  # This is required for Linux kernels >= 3.19. Previously the
  # net.ipv4.ip_nonlocal_bind setting was global to all namespaces but
  # starting with this version, it is per namespace.
  # RUN_IN_NS and RUN are command prefixes and must stay unquoted.
  ocf_run ${RUN_IN_NS} /sbin/sysctl -w net.ipv4.ip_nonlocal_bind=1
  ocf_run ${RUN_IN_NS} ip link set up dev lo
  if ! ocf_run "${OCF_RESKEY_binary}" -v "${OCF_RESKEY_config}"; then
    # "err" is the level name used by every other message in this agent.
    ocf_log err "Nagios configuration is invalid"
    exit "$OCF_ERR_GENERIC"
  fi
  ocf_run ${RUN} "${OCF_RESKEY_binary}" -d "${OCF_RESKEY_config}"
  # Spin waiting for the server to come up
  while true; do
    service_monitor
    rc=$?
    [ $rc -eq "$OCF_SUCCESS" ] && break
    if [ $rc -ne "$OCF_NOT_RUNNING" ]; then
      ocf_log err "${SERVICE_NAME} start failed"
      exit "$OCF_ERR_GENERIC"
    fi
    sleep 3
  done
  ocf_log info "${SERVICE_NAME} started"
  return "$OCF_SUCCESS"
}
# Stop Nagios: send SIGTERM, wait for the monitor to report "not running",
# then fall back to SIGKILL. The PID file is removed at the end.
#
# The wait lasts shutdown_timeout one-second rounds: 15 by default, or the
# operation timeout (OCF_RESKEY_CRM_meta_timeout, divided by 1000) minus 5
# when that variable is set.
#
# Returns $OCF_SUCCESS; exits with $OCF_ERR_GENERIC if SIGTERM can't be sent.
service_stop() {
  local rc
  local daemon_pid

  service_monitor
  if [ $? -eq "$OCF_NOT_RUNNING" ]; then
    ocf_log info "${SERVICE_NAME} is already stopped"
    return "$OCF_SUCCESS"
  fi

  # Try SIGTERM
  daemon_pid=$(cat "$NAGIOS_PID_FILE")
  if ! ocf_run kill -s TERM "$daemon_pid"; then
    ocf_log err "${SERVICE_NAME} couldn't be stopped"
    exit "$OCF_ERR_GENERIC"
  fi

  # stop waiting
  shutdown_timeout=15
  [ -z "$OCF_RESKEY_CRM_meta_timeout" ] ||
    shutdown_timeout=$(( (OCF_RESKEY_CRM_meta_timeout / 1000) - 5 ))

  for (( count = 0; count < shutdown_timeout; count++ )); do
    service_monitor
    [ $? -ne "$OCF_NOT_RUNNING" ] || break
    sleep 1
    ocf_log debug "${SERVICE_NAME} still hasn't stopped yet. Waiting ..."
  done

  service_monitor
  rc=$?
  if [ "${rc}" -ne "${OCF_NOT_RUNNING}" ]; then
    # SIGTERM didn't help either, try SIGKILL
    ocf_log info "${SERVICE_NAME} failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL ..."
    ocf_run kill -s KILL "${daemon_pid}"
  fi

  ocf_log info "${SERVICE_NAME} stopped"
  rm -f "${NAGIOS_PID_FILE}"
  return "${OCF_SUCCESS}"
}
#######################################################################
# Informational actions are answered before any validation takes place.
if [ "$1" = "meta-data" ]; then
  meta_data
  exit "$OCF_SUCCESS"
elif [ "$1" = "usage" ] || [ "$1" = "help" ]; then
  usage
  exit "$OCF_SUCCESS"
fi

# Anything except meta-data and help must pass validation
service_validate || exit $?

# Run the requested action; its status becomes the script's exit status.
case "$1" in
  start)
    service_start
    ;;
  stop)
    service_stop
    ;;
  status)
    service_status
    ;;
  monitor)
    service_monitor
    ;;
  validate-all)
    # Validation already succeeded above.
    ;;
  *)
    usage
    exit "$OCF_ERR_UNIMPLEMENTED"
    ;;
esac

View File

@ -8,3 +8,11 @@
puppet_manifest: puppet/manifests/nagios.pp
puppet_modules: puppet/modules
timeout: 600
# Puppet task for nodes with the infrastructure_alerting role: applies
# puppet/manifests/ha_services.pp at post_deployment/8101 with a timeout of
# 600 (presumably seconds -- confirm).
- role: [infrastructure_alerting]
stage: post_deployment/8101
type: puppet
parameters:
puppet_manifest: puppet/manifests/ha_services.pp
puppet_modules: puppet/modules
timeout: 600