ebac150f8a
This change separates the processing of the logs/notifications and metric/alerting into 2 dedicated hekad processes, these services are named 'log_collector' and 'metric_collector'. Both services are managed by Pacemaker on controller nodes and by Upstart on other nodes. All metrics computed by log_collector (HTTP response times and creation time for instances and volumes) are sent directly to the metric_collector via TCP. Elasticsearch output (log_collector) uses full_action='block' and the TCP output uses full_action='drop'. All outputs of metric_collector (InfluxDB, HTTP and TCP) use full_action='drop'. The buffer size configurations are: * metric_collector: - influxdb-output buffer size is increased to 1Gb. - aggregator-output (tcp) buffer size is decreased to 256Mb (vs 1Gb). - nagios outputs (x3) buffer size are decreased to 1Mb. * log_collector: - elasticsearch-output buffer size is decreased to 256Mb (vs 1Gb). - tcp-output buffer size is set to 256Mb. Implements: blueprint separate-lma-collector-pipelines Fixes-bug: #1566748 Change-Id: Ieadb93b89f81e944e21cf8e5a65f4d683fd0ffb8
106 lines
1.9 KiB
Plaintext
106 lines
1.9 KiB
Plaintext
#!/bin/sh
|
|
#
|
|
# hekad <%= @service_name %>
|
|
#
|
|
# chkconfig: - 98 02
|
|
# description: Starts and stops a single heka instance on this system
|
|
### END INIT INFO
|
|
|
|
#
# Load the distribution's init helper library when it is installed.
# NOTE(review): success, failure and status are called further down but are
# not defined in this script; presumably this library supplies them.
#
[ ! -f /etc/rc.d/init.d/functions ] || . /etc/rc.d/init.d/functions
|
|
|
|
# Wrapper executable to launch and the service name; both values are filled
# in when this template is rendered.
exec="<%= @hekad_wrapper %>"
prog="<%= @service_name %>"

# Default PID file. It is assigned before the sysconfig file is read, so that
# file may replace it.
pidfile="/var/run/${prog}.pid"

# Optional per-service settings. The path is quoted so that the test and the
# dot command always receive exactly one operand, whatever the name contains.
if [ -e "/etc/sysconfig/$prog" ]; then
    . "/etc/sysconfig/$prog"
fi

# Computed after the sysconfig file is read, so it follows the value of prog
# that is in effect at that point.
lockfile="/var/lock/subsys/$prog"

HEKA_USER=root
|
|
|
|
start() {
|
|
[ -x $exec ] || exit 5
|
|
[ -f $CONF_FILE ] || exit 6
|
|
<% unless @_run_as_root -%>
|
|
touch <%= @log_file %>
|
|
chown <%= @heka_user %>:<%= @heka_user %> <%= @log_file %>
|
|
<% end -%>
|
|
echo -n $"Starting $prog: "
|
|
daemonize -p $pidfile -e <%= @log_file %> <%= @_run_as_root ? "" : "-u #{ @heka_user }" %> -l $lockfile $exec
|
|
retval=$?
|
|
[ $retval -eq 0 ] && success || failure
|
|
echo
|
|
[ $retval -eq 0 ] && touch $lockfile
|
|
return $retval
|
|
}
|
|
|
|
# Stop the service by signalling the children of the process whose PID is
# recorded in the PID file.
#
# Globals:  pidfile, lockfile, prog (read); retval (written)
# Returns:  0 when no PID is recorded (nothing to stop) or when pkill
#           signalled at least one process; otherwise the pkill exit status
stop() {
    local pid=

    echo -n $"Stopping $prog: "

    if [ -r "$pidfile" ]; then
        pid=$(cat "$pidfile" 2>/dev/null)
    fi

    # Without a recorded PID there is nothing to signal. Running pkill -P
    # with no operand would only print a usage error, so report the service
    # as stopped and clear any stale files instead.
    if [ -z "$pid" ]; then
        rm -f "$pidfile" "$lockfile"
        success
        echo
        return 0
    fi

    # -P selects processes by parent PID, i.e. the children of the recorded
    # process rather than that process itself.
    pkill -P "$pid"
    retval=$?

    # The PID file is removed whatever the outcome; the lock file only once
    # the processes were actually signalled.
    rm -f "$pidfile"
    if [ $retval -eq 0 ]; then
        rm -f "$lockfile"
        success
    else
        failure
    fi
    echo
    return $retval
}
|
|
|
|
# Full restart: run stop, then start. The status of stop is not examined;
# the function's status is that of start, the last command executed.
restart() {
    stop; start
}
|
|
|
|
# Reload is implemented as a full restart; the status of restart is passed
# back to the caller.
reload() {
    restart
    return $?
}
|
|
|
|
# Forced reload is also a plain restart; the status of restart is passed
# back to the caller.
force_reload() {
    restart
    return $?
}
|
|
|
|
# Report the state of the service by handing the PID file and the service
# name to the generic status helper.
#
# Globals:  pidfile, prog (read)
# Returns:  the exit status of the status helper
rh_status() {
    # Quoted so that the helper always receives exactly "-p", the PID file
    # and the service name, even if a value contains whitespace or is empty.
    status -p "$pidfile" "$prog"
}
|
|
|
|
# Silent variant of rh_status: both output streams are discarded and only
# the exit status is passed back to the caller.
rh_status_q() {
    rh_status >/dev/null 2>&1
    return $?
}
|
|
|
|
|
|
# Dispatch on the action given as the first argument. The script's exit
# status is the status of the last command run in the selected branch,
# unless the branch exits explicitly.
case "$1" in
    start)
        # Already running: nothing to do.
        if rh_status_q; then
            exit 0
        fi
        start
        ;;
    stop)
        # Not running: nothing to do.
        if ! rh_status_q; then
            exit 0
        fi
        stop
        ;;
    restart)
        restart
        ;;
    reload)
        # Exit with status 7 when the service is not running.
        if ! rh_status_q; then
            exit 7
        fi
        reload
        ;;
    force-reload)
        force_reload
        ;;
    status)
        rh_status
        ;;
    condrestart|try-restart)
        # Conditional restart: only acts on a running service.
        if ! rh_status_q; then
            exit 0
        fi
        restart
        ;;
    *)
        echo $"Usage: $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}"
        exit 2
        ;;
esac
exit $?
|