From e57f7261d2e50bac367bf3b8f6c8f66a256b95e9 Mon Sep 17 00:00:00 2001
From: Simon Pasquier
Date: Thu, 7 Jan 2016 17:16:15 +0100
Subject: [PATCH] Manage Apache and Nagios with Pacemaker

This change implements the OCF resource agents that manage the Apache
and Nagios services in the network namespace where the VIP address is
running. It configures the necessary Pacemaker resources and makes sure
that the Apache and Nagios services are always co-located with the VIP.

Change-Id: I524157498537fa4a652f2f59e267a0ceb12f8192
---
Reviewer notes (this section is ignored by git am):
- NOTE(review): the author e-mail, the usage() text, the meta_data()
  header and the XML markup of the meta-data were lost in the mangled
  copy of this patch. The usage text and XML below are reconstructed and
  must be confirmed against the original agents; the advertised action
  timeouts mirror the operations declared in ha_services.pp.
- NOTE(review): the "index" blob hashes of the new files predate the
  fixes applied in this revision.

 .../puppet/manifests/ha_services.pp           | 125 +++++++
 deployment_scripts/puppet/manifests/hiera.pp  |   8 +-
 .../lma_infra_alerting/files/ocf-ns_apache    | 316 ++++++++++++++++++
 .../lma_infra_alerting/files/ocf-ns_nagios    | 304 +++++++++++++++++
 tasks.yaml                                    |   8 +
 5 files changed, 759 insertions(+), 2 deletions(-)
 create mode 100644 deployment_scripts/puppet/manifests/ha_services.pp
 create mode 100755 deployment_scripts/puppet/modules/lma_infra_alerting/files/ocf-ns_apache
 create mode 100755 deployment_scripts/puppet/modules/lma_infra_alerting/files/ocf-ns_nagios

diff --git a/deployment_scripts/puppet/manifests/ha_services.pp b/deployment_scripts/puppet/manifests/ha_services.pp
new file mode 100644
index 0000000..9ebd919
--- /dev/null
+++ b/deployment_scripts/puppet/manifests/ha_services.pp
@@ -0,0 +1,125 @@
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Install the OCF resource agents referenced by the Pacemaker primitives
+# declared below.
+file { 'ocf-ns_apache':
+  ensure => present,
+  path   => '/usr/lib/ocf/resource.d/fuel/ocf-ns_apache',
+  source => 'puppet:///modules/lma_infra_alerting/ocf-ns_apache',
+  mode   => '0755',
+  owner  => 'root',
+  group  => 'root',
+}
+
+file { 'ocf-ns_nagios':
+  ensure => present,
+  path   => '/usr/lib/ocf/resource.d/fuel/ocf-ns_nagios',
+  source => 'puppet:///modules/lma_infra_alerting/ocf-ns_nagios',
+  mode   => '0755',
+  owner  => 'root',
+  group  => 'root',
+}
+
+# This is required so Apache and Nagios can bind to the VIP address
+exec { 'net.ipv4.ip_nonlocal_bind':
+  command => '/sbin/sysctl -w net.ipv4.ip_nonlocal_bind=1',
+  unless  => '/sbin/sysctl -n net.ipv4.ip_nonlocal_bind | /bin/grep 1',
+}
+
+# Apache2 resources for Pacemaker
+pacemaker_wrappers::service { 'apache2':
+  primitive_type => 'ocf-ns_apache',
+  parameters     => {
+    'ns'         => 'infrastructure_alerting',
+    'status_url' => 'http://localhost:8001/server-status',
+  },
+  metadata       => {
+    'migration-threshold' => '3',
+    'failure-timeout'     => '120',
+  },
+  operations     => {
+    'monitor' => {
+      'interval' => '30',
+      'timeout'  => '60'
+    },
+    'start'   => {
+      'timeout' => '60'
+    },
+    'stop'    => {
+      'timeout' => '60'
+    },
+  },
+  prefix         => false,
+  use_handler    => false,
+  require        => [File['ocf-ns_apache'], Exec['net.ipv4.ip_nonlocal_bind']],
+}
+
+cs_rsc_colocation { 'infrastructure_alerting_vip-with-apache2':
+  ensure     => present,
+  score      => 'INFINITY',
+  primitives => [
+    'vip__infrastructure_alerting_mgmt_vip',
+    'apache2'
+  ],
+  require    => Cs_resource['apache2'],
+}
+
+service { 'apache2':
+  ensure  => 'running',
+  require => Cs_rsc_colocation['infrastructure_alerting_vip-with-apache2'],
+}
+
+# Nagios resources for Pacemaker
+pacemaker_wrappers::service { 'nagios3':
+  primitive_type => 'ocf-ns_nagios',
+  parameters     => {
+    'ns' => 'infrastructure_alerting',
+  },
+  metadata       => {
+    'migration-threshold' => '3',
+    'failure-timeout'     => '120',
+  },
+  operations     => {
+    'monitor' => {
+      'interval' => '30',
+      'timeout'  => '60'
+    },
+    'start'   => {
+      'timeout' => '60'
+    },
+    'stop'    => {
+      'timeout' => '60'
+    },
+  },
+  prefix         => false,
+  use_handler    => false,
+  require        => [File['ocf-ns_nagios'], Exec['net.ipv4.ip_nonlocal_bind']],
+}
+
+cs_rsc_colocation { 'infrastructure_alerting_vip-with-nagios':
+  ensure     => present,
+  score      => 'INFINITY',
+  primitives => [
+    'vip__infrastructure_alerting_mgmt_vip',
+    'nagios3'
+  ],
+  require    => Cs_resource['nagios3'],
+}
+
+# Nagios must wait for its own colocation constraint, not the Apache one.
+service { 'nagios3':
+  ensure  => 'running',
+  require => Cs_rsc_colocation['infrastructure_alerting_vip-with-nagios'],
+}
diff --git a/deployment_scripts/puppet/manifests/hiera.pp b/deployment_scripts/puppet/manifests/hiera.pp
index 0412b2b..dd3ce46 100644
--- a/deployment_scripts/puppet/manifests/hiera.pp
+++ b/deployment_scripts/puppet/manifests/hiera.pp
@@ -12,13 +12,17 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 #
-$hiera_dir   = '/etc/hiera/plugins'
-$plugin_name = 'lma_infrastructure_alerting'
+$hiera_dir        = '/etc/hiera/plugins'
+$plugin_name      = 'lma_infrastructure_alerting'
+$network_metadata = hiera('network_metadata')
+$alerting_vip     = $network_metadata['vips']['infrastructure_alerting_mgmt_vip']['ipaddr']
 
 $calculated_content = inline_template('
 ---
 lma::corosync_roles:
   - infrastructure_alerting
+lma::infrastructure_alerting::vip: <%= @alerting_vip %>
+lma::infrastructure_alerting::vip_ns: infrastructure_alerting
 ')
 
 file { "${hiera_dir}/${plugin_name}.yaml":
diff --git a/deployment_scripts/puppet/modules/lma_infra_alerting/files/ocf-ns_apache b/deployment_scripts/puppet/modules/lma_infra_alerting/files/ocf-ns_apache
new file mode 100755
index 0000000..6bc2be5
--- /dev/null
+++ b/deployment_scripts/puppet/modules/lma_infra_alerting/files/ocf-ns_apache
@@ -0,0 +1,316 @@
+#!/bin/bash
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# See usage() function below for more details ...
+#
+# OCF instance parameters:
+#   OCF_RESKEY_binary
+#   OCF_RESKEY_config
+#   OCF_RESKEY_ns
+#   OCF_RESKEY_status_url
+#######################################################################
+# Initialization:
+
+: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
+. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
+
+#######################################################################
+
+# Fill in some defaults if no values are specified
+
+SERVICE_NAME="Apache"
+
+OCF_RESKEY_binary_default="/usr/sbin/apache2"
+OCF_RESKEY_config_default="/etc/apache2/apache2.conf"
+OCF_RESKEY_ns_default=
+OCF_RESKEY_status_url_default="http://localhost/server-status"
+
+: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
+: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
+: ${OCF_RESKEY_ns=${OCF_RESKEY_ns_default}}
+: ${OCF_RESKEY_status_url=${OCF_RESKEY_status_url_default}}
+
+# RUN_IN_NS always targets the namespace; RUN is empty when no namespace is
+# configured so that commands run in the default namespace instead.
+RUN_IN_NS="ip netns exec $OCF_RESKEY_ns "
+if [ -z "${OCF_RESKEY_ns}" ] ; then
+    RUN=''
+else
+    RUN="$RUN_IN_NS "
+fi
+APACHE_PID_FILE="/var/run/apache2/apache2.pid"
+
+#######################################################################
+
+usage() {
+    cat <<UEND
+        usage: $0 (start|stop|validate-all|meta-data|status|monitor)
+
+        $0 manages an ${SERVICE_NAME} process as an HA resource
+
+        The 'start' operation starts the service.
+        The 'stop' operation stops the service.
+        The 'validate-all' operation reports whether the parameters are valid
+        The 'meta-data' operation reports this RA's meta-data information
+        The 'status' operation reports whether the service is running
+        The 'monitor' operation reports whether the service seems to be working
+
+UEND
+}
+
+meta_data() {
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="ocf-ns_apache">
+<version>1.0</version>
+
+<longdesc lang="en">
+Manages the Apache daemon in a network namespace as a Pacemaker Resource.
+</longdesc>
+<shortdesc lang="en">Manages Apache</shortdesc>
+
+<parameters>
+<parameter name="binary" unique="0" required="0">
+<longdesc lang="en">
+Path of the Apache binary file that will be run.
+</longdesc>
+<shortdesc lang="en">Apache binary file</shortdesc>
+<content type="string" default="${OCF_RESKEY_binary_default}" />
+</parameter>
+
+<parameter name="config" unique="0" required="0">
+<longdesc lang="en">
+Path to the Apache configuration file
+</longdesc>
+<shortdesc lang="en">Apache configuration</shortdesc>
+<content type="string" default="${OCF_RESKEY_config_default}" />
+</parameter>
+
+<parameter name="ns" unique="0" required="0">
+<longdesc lang="en">
+Network namespace in which Apache will be run
+</longdesc>
+<shortdesc lang="en">Apache namespace</shortdesc>
+<content type="string" default="${OCF_RESKEY_ns_default}" />
+</parameter>
+
+<parameter name="status_url" unique="0" required="0">
+<longdesc lang="en">
+The status URL for checking Apache
+</longdesc>
+<shortdesc lang="en">Apache status URL</shortdesc>
+<content type="string" default="${OCF_RESKEY_status_url_default}" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start" timeout="60" />
+<action name="stop" timeout="60" />
+<action name="status" timeout="60" />
+<action name="monitor" depth="0" timeout="60" interval="30" />
+<action name="validate-all" timeout="5" />
+<action name="meta-data" timeout="5" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+# Functions invoked by resource manager actions
+
+# Check that the binary, configuration file and (when configured) the network
+# namespace are available, and make sure the PID file directory exists.
+service_validate() {
+    local rc
+
+    check_binary "$OCF_RESKEY_binary"
+
+    if [[ ! -f $OCF_RESKEY_config ]]; then
+        ocf_log err "Config $OCF_RESKEY_config doesn't exist"
+        return "$OCF_ERR_INSTALLED"
+    fi
+
+    # Only probe the namespace when one is configured: with an empty value
+    # the service runs in the default namespace (see RUN above).
+    if [ -n "${OCF_RESKEY_ns}" ]; then
+        ip netns pids "$OCF_RESKEY_ns" > /dev/null 2>&1
+        rc=$?
+        if [ $rc -ne 0 ]; then
+            ocf_log err "Namespace $OCF_RESKEY_ns doesn't exist or can't be accessed"
+            return "$OCF_ERR_INSTALLED"
+        fi
+    fi
+
+    mkdir -p "$(dirname "${APACHE_PID_FILE}")"
+
+    true
+}
+
+# Report whether Apache is running: the PID file must reference a live
+# process and, when a status URL is configured, that URL must answer 200.
+service_status() {
+    local rc
+    local pid
+
+    if [ ! -f "$APACHE_PID_FILE" ]; then
+        ocf_log info "Apache is not running"
+        return "$OCF_NOT_RUNNING"
+    else
+        pid=$(cat "$APACHE_PID_FILE")
+    fi
+
+    if [ -n "${pid}" ]; then
+        ocf_run -warn kill -s 0 "$pid"
+        rc=$?
+        if [ $rc -ne 0 ]; then
+            ocf_log info "Old PID file found, but Apache process isn't running"
+            return "$OCF_NOT_RUNNING"
+        fi
+    else
+        ocf_log err "PID file ${APACHE_PID_FILE} is empty!"
+        return "$OCF_ERR_GENERIC"
+    fi
+
+    if [ -n "${OCF_RESKEY_status_url}" ] ; then
+        if ! $RUN /usr/bin/curl -sL -w "%{http_code}" -XGET "${OCF_RESKEY_status_url}" -o /dev/null | grep -q 200; then
+            return "$OCF_ERR_GENERIC"
+        fi
+    fi
+
+    return "$OCF_SUCCESS"
+}
+
+service_monitor() {
+    local rc
+    service_status
+    rc=$?
+    return $rc
+}
+
+# Start Apache (inside the namespace when one is configured) and wait until
+# the monitor reports success.
+service_start() {
+    local rc
+
+    service_monitor
+    rc=$?
+    if [ $rc -eq "$OCF_SUCCESS" ]; then
+        ocf_log info "${SERVICE_NAME} is already running"
+        return "$OCF_SUCCESS"
+    fi
+
+    # This is required for Linux kernels >= 3.19. Previously the
+    # net.ipv4.ip_nonlocal_bind setting was global to all namespaces but
+    # starting with this version, it is per namespace.
+    if [ -n "${OCF_RESKEY_ns}" ]; then
+        ocf_run ${RUN_IN_NS} /sbin/sysctl -w net.ipv4.ip_nonlocal_bind=1
+        ocf_run ${RUN_IN_NS} ip link set up dev lo
+    fi
+
+    ocf_run ${RUN} bash -c ". /etc/apache2/envvars && ${OCF_RESKEY_binary} -k start"
+
+    # Spin waiting for the server to come up
+    while true; do
+        service_monitor
+        rc=$?
+        [ $rc -eq "$OCF_SUCCESS" ] && break
+        if [ $rc -ne "$OCF_NOT_RUNNING" ]; then
+            ocf_log err "${SERVICE_NAME} start failed"
+            exit "$OCF_ERR_GENERIC"
+        fi
+        sleep 3
+    done
+
+    ocf_log info "${SERVICE_NAME} started"
+    return "$OCF_SUCCESS"
+}
+
+# Stop Apache with SIGTERM and escalate to SIGKILL if it is still running
+# once the shutdown timeout has elapsed.
+service_stop() {
+    local rc
+    local pid
+
+    service_monitor
+    rc=$?
+    if [ $rc -eq "$OCF_NOT_RUNNING" ]; then
+        ocf_log info "${SERVICE_NAME} is already stopped"
+        return "$OCF_SUCCESS"
+    fi
+
+    # Try SIGTERM
+    pid=$(cat "$APACHE_PID_FILE")
+    ocf_run kill -s TERM "$pid"
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "${SERVICE_NAME} couldn't be stopped"
+        exit "$OCF_ERR_GENERIC"
+    fi
+
+    # stop waiting
+    local shutdown_timeout=15
+    if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
+        shutdown_timeout=$(( (OCF_RESKEY_CRM_meta_timeout/1000)-5 ))
+    fi
+    local count=0
+    while [ $count -lt $shutdown_timeout ]; do
+        service_monitor
+        rc=$?
+        if [ $rc -eq "$OCF_NOT_RUNNING" ]; then
+            break
+        fi
+        count=$(( count + 1))
+        sleep 1
+        ocf_log debug "${SERVICE_NAME} still hasn't stopped yet. Waiting ..."
+    done
+
+    service_monitor
+    rc=$?
+    if [ "${rc}" -ne "${OCF_NOT_RUNNING}" ]; then
+        # SIGTERM didn't help either, try SIGKILL
+        ocf_log info "${SERVICE_NAME} failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL ..."
+        ocf_run kill -s KILL "${pid}"
+    fi
+
+    ocf_log info "${SERVICE_NAME} stopped"
+    rm -f "${APACHE_PID_FILE}"
+
+    return "${OCF_SUCCESS}"
+}
+
+#######################################################################
+
+case "$1" in
+    meta-data) meta_data
+               exit "$OCF_SUCCESS";;
+    usage|help) usage
+                exit "$OCF_SUCCESS";;
+esac
+
+# Anything except meta-data and help must pass validation
+service_validate || exit $?
+
+# What kind of method was invoked?
+case "$1" in
+    start) service_start;;
+    stop) service_stop;;
+    status) service_status;;
+    monitor) service_monitor;;
+    validate-all) ;;
+    *) usage
+       exit "$OCF_ERR_UNIMPLEMENTED";;
+esac
diff --git a/deployment_scripts/puppet/modules/lma_infra_alerting/files/ocf-ns_nagios b/deployment_scripts/puppet/modules/lma_infra_alerting/files/ocf-ns_nagios
new file mode 100755
index 0000000..0ffcd67
--- /dev/null
+++ b/deployment_scripts/puppet/modules/lma_infra_alerting/files/ocf-ns_nagios
@@ -0,0 +1,304 @@
+#!/bin/bash
+# Copyright 2016 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# See usage() function below for more details ...
+#
+# OCF instance parameters:
+#   OCF_RESKEY_binary
+#   OCF_RESKEY_config
+#   OCF_RESKEY_ns
+#######################################################################
+# Initialization:
+
+: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
+. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
+
+#######################################################################
+
+# Fill in some defaults if no values are specified
+
+SERVICE_NAME="Nagios"
+
+OCF_RESKEY_binary_default="/usr/sbin/nagios3"
+OCF_RESKEY_config_default="/etc/nagios3/nagios.cfg"
+OCF_RESKEY_ns_default=
+
+: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
+: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
+: ${OCF_RESKEY_ns=${OCF_RESKEY_ns_default}}
+
+# RUN_IN_NS always targets the namespace; RUN is empty when no namespace is
+# configured so that commands run in the default namespace instead.
+RUN_IN_NS="ip netns exec $OCF_RESKEY_ns "
+if [ -z "${OCF_RESKEY_ns}" ] ; then
+    RUN=''
+else
+    RUN="$RUN_IN_NS "
+fi
+NAGIOS_PID_FILE="/var/run/nagios3/nagios3.pid"
+
+#######################################################################
+
+usage() {
+    cat <<UEND
+        usage: $0 (start|stop|validate-all|meta-data|status|monitor)
+
+        $0 manages an ${SERVICE_NAME} process as an HA resource
+
+        The 'start' operation starts the service.
+        The 'stop' operation stops the service.
+        The 'validate-all' operation reports whether the parameters are valid
+        The 'meta-data' operation reports this RA's meta-data information
+        The 'status' operation reports whether the service is running
+        The 'monitor' operation reports whether the service seems to be working
+
+UEND
+}
+
+meta_data() {
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="ocf-ns_nagios">
+<version>1.0</version>
+
+<longdesc lang="en">
+Manages the Nagios daemon in a network namespace as a Pacemaker Resource.
+</longdesc>
+<shortdesc lang="en">Manages Nagios</shortdesc>
+
+<parameters>
+<parameter name="binary" unique="0" required="0">
+<longdesc lang="en">
+Path of the Nagios binary file that will be run.
+</longdesc>
+<shortdesc lang="en">Nagios binary file</shortdesc>
+<content type="string" default="${OCF_RESKEY_binary_default}" />
+</parameter>
+
+<parameter name="config" unique="0" required="0">
+<longdesc lang="en">
+Path to the Nagios configuration file
+</longdesc>
+<shortdesc lang="en">Nagios configuration</shortdesc>
+<content type="string" default="${OCF_RESKEY_config_default}" />
+</parameter>
+
+<parameter name="ns" unique="0" required="0">
+<longdesc lang="en">
+Network namespace in which Nagios will be run
+</longdesc>
+<shortdesc lang="en">Nagios namespace</shortdesc>
+<content type="string" default="${OCF_RESKEY_ns_default}" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start" timeout="60" />
+<action name="stop" timeout="60" />
+<action name="status" timeout="60" />
+<action name="monitor" depth="0" timeout="60" interval="30" />
+<action name="validate-all" timeout="5" />
+<action name="meta-data" timeout="5" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+# Functions invoked by resource manager actions
+
+# Check that the binary, configuration file and (when configured) the network
+# namespace are available, and make sure the PID file directory exists.
+service_validate() {
+    local rc
+
+    check_binary "$OCF_RESKEY_binary"
+
+    if [[ ! -f $OCF_RESKEY_config ]]; then
+        ocf_log err "Config $OCF_RESKEY_config doesn't exist"
+        return "$OCF_ERR_INSTALLED"
+    fi
+
+    # Only probe the namespace when one is configured: with an empty value
+    # the service runs in the default namespace (see RUN above).
+    if [ -n "${OCF_RESKEY_ns}" ]; then
+        ip netns pids "$OCF_RESKEY_ns" > /dev/null 2>&1
+        rc=$?
+        if [ $rc -ne 0 ]; then
+            ocf_log err "Namespace $OCF_RESKEY_ns doesn't exist or can't be accessed"
+            return "$OCF_ERR_INSTALLED"
+        fi
+    fi
+
+    mkdir -p "$(dirname "${NAGIOS_PID_FILE}")"
+
+    true
+}
+
+# Report whether Nagios is running: the PID file must exist and reference a
+# process that can be signalled.
+service_status() {
+    local rc
+    local pid
+
+    if [ ! -f "$NAGIOS_PID_FILE" ]; then
+        ocf_log info "Nagios is not running"
+        return "$OCF_NOT_RUNNING"
+    else
+        pid=$(cat "$NAGIOS_PID_FILE")
+    fi
+
+    if [ -n "${pid}" ]; then
+        ocf_run -warn kill -s 0 "$pid"
+        rc=$?
+        if [ $rc -ne 0 ]; then
+            ocf_log info "Old PID file found, but Nagios process isn't running"
+            return "$OCF_NOT_RUNNING"
+        fi
+    else
+        ocf_log err "PID file ${NAGIOS_PID_FILE} is empty!"
+        return "$OCF_ERR_GENERIC"
+    fi
+
+    return "$OCF_SUCCESS"
+}
+
+service_monitor() {
+    local rc
+    service_status
+    rc=$?
+    return $rc
+}
+
+# Verify the configuration, start Nagios (inside the namespace when one is
+# configured) and wait until the monitor reports success.
+service_start() {
+    local rc
+
+    service_monitor
+    rc=$?
+    if [ $rc -eq "$OCF_SUCCESS" ]; then
+        ocf_log info "${SERVICE_NAME} is already running"
+        return "$OCF_SUCCESS"
+    fi
+
+    # This is required for Linux kernels >= 3.19. Previously the
+    # net.ipv4.ip_nonlocal_bind setting was global to all namespaces but
+    # starting with this version, it is per namespace.
+    if [ -n "${OCF_RESKEY_ns}" ]; then
+        ocf_run ${RUN_IN_NS} /sbin/sysctl -w net.ipv4.ip_nonlocal_bind=1
+        ocf_run ${RUN_IN_NS} ip link set up dev lo
+    fi
+
+    if ! ocf_run ${OCF_RESKEY_binary} -v ${OCF_RESKEY_config}; then
+        ocf_log err "Nagios configuration is invalid"
+        exit "$OCF_ERR_GENERIC"
+    fi
+
+    ocf_run ${RUN} ${OCF_RESKEY_binary} -d ${OCF_RESKEY_config}
+
+    # Spin waiting for the server to come up
+    while true; do
+        service_monitor
+        rc=$?
+        [ $rc -eq "$OCF_SUCCESS" ] && break
+        if [ $rc -ne "$OCF_NOT_RUNNING" ]; then
+            ocf_log err "${SERVICE_NAME} start failed"
+            exit "$OCF_ERR_GENERIC"
+        fi
+        sleep 3
+    done
+
+    ocf_log info "${SERVICE_NAME} started"
+    return "$OCF_SUCCESS"
+}
+
+# Stop Nagios with SIGTERM and escalate to SIGKILL if it is still running
+# once the shutdown timeout has elapsed.
+service_stop() {
+    local rc
+    local pid
+
+    service_monitor
+    rc=$?
+    if [ $rc -eq "$OCF_NOT_RUNNING" ]; then
+        ocf_log info "${SERVICE_NAME} is already stopped"
+        return "$OCF_SUCCESS"
+    fi
+
+    # Try SIGTERM
+    pid=$(cat "$NAGIOS_PID_FILE")
+    ocf_run kill -s TERM "$pid"
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "${SERVICE_NAME} couldn't be stopped"
+        exit "$OCF_ERR_GENERIC"
+    fi
+
+    # stop waiting
+    local shutdown_timeout=15
+    if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
+        shutdown_timeout=$(( (OCF_RESKEY_CRM_meta_timeout/1000)-5 ))
+    fi
+    local count=0
+    while [ $count -lt $shutdown_timeout ]; do
+        service_monitor
+        rc=$?
+        if [ $rc -eq "$OCF_NOT_RUNNING" ]; then
+            break
+        fi
+        count=$(( count + 1))
+        sleep 1
+        ocf_log debug "${SERVICE_NAME} still hasn't stopped yet. Waiting ..."
+    done
+
+    service_monitor
+    rc=$?
+    if [ "${rc}" -ne "${OCF_NOT_RUNNING}" ]; then
+        # SIGTERM didn't help either, try SIGKILL
+        ocf_log info "${SERVICE_NAME} failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL ..."
+        ocf_run kill -s KILL "${pid}"
+    fi
+
+    ocf_log info "${SERVICE_NAME} stopped"
+    rm -f "${NAGIOS_PID_FILE}"
+
+    return "${OCF_SUCCESS}"
+}
+
+#######################################################################
+
+case "$1" in
+    meta-data) meta_data
+               exit "$OCF_SUCCESS";;
+    usage|help) usage
+                exit "$OCF_SUCCESS";;
+esac
+
+# Anything except meta-data and help must pass validation
+service_validate || exit $?
+
+# What kind of method was invoked?
+case "$1" in
+    start) service_start;;
+    stop) service_stop;;
+    status) service_status;;
+    monitor) service_monitor;;
+    validate-all) ;;
+    *) usage
+       exit "$OCF_ERR_UNIMPLEMENTED";;
+esac
diff --git a/tasks.yaml b/tasks.yaml
index 06e8365..c909d69 100644
--- a/tasks.yaml
+++ b/tasks.yaml
@@ -8,3 +8,11 @@
     puppet_manifest: puppet/manifests/nagios.pp
     puppet_modules: puppet/modules
     timeout: 600
+
+- role: [infrastructure_alerting]
+  stage: post_deployment/8101
+  type: puppet
+  parameters:
+    puppet_manifest: puppet/manifests/ha_services.pp
+    puppet_modules: puppet/modules
+    timeout: 600