Add statsd metrics for ansible runs

Add some coarse-grained statsd tracking for the global ansible runs.
Adds a timer for each step, along with an overall timer.

This adds a single argument so that we only try to run stats when
running from the cron job (so if we're debugging by hand or something,
this doesn't trigger).  Graphite also needs to accept stats from
bridge.o.o.  The plan is to present this via a simple grafana
dashboard.

Change-Id: I299c0ab5dc3dea4841e560d8fb95b8f3e7df89f2
This commit is contained in:
Ian Wienand 2018-09-10 14:49:45 +10:00
parent 951b9c0bae
commit 97a3ab9bf3
3 changed files with 58 additions and 1 deletions

View File

@ -1,4 +1,7 @@
iptables_extra_allowed_hosts:
- hostname: bridge.openstack.org
port: 8125
protocol: udp
- hostname: git.openstack.org
port: 8125
protocol: udp

View File

@ -13,7 +13,7 @@
cron:
name: run_all.sh
state: present
job: 'flock -n /var/run/ansible/run_all.lock bash /opt/system-config/run_all.sh >> /var/log/ansible/run_all_cron.log 2>&1'
job: 'flock -n /var/run/ansible/run_all.lock bash /opt/system-config/run_all.sh -c >> /var/log/ansible/run_all_cron.log 2>&1'
minute: "{{ update_cron_interval.minute }}"
hour: "{{ update_cron_interval.hour }}"
day: "{{ update_cron_interval.day }}"

View File

@ -22,6 +22,41 @@ set -e
SYSTEM_CONFIG=/opt/system-config
ANSIBLE_PLAYBOOKS=$SYSTEM_CONFIG/playbooks
# We only send stats if running under cron
UNDER_CRON=0
while getopts ":c" arg; do
case $arg in
c)
UNDER_CRON=1
;;
esac
done
GLOBAL_START_TIME=$(date '+%s')
# Send a timer stat to statsd
# send_timer metric [start_time]
# * uses timer metric bridge.ansible.run_all.<$1>
# * time will be taken from last call of start_timer, or $2 if set
function send_timer {
# Only send stats under cron conditions
if [[ ${UNDER_CRON} != 1 ]]; then
return
fi
local current=$(date '+%s')
local name=$1
local start=${2-$_START_TIME}
local elapsed_ms=$(( (current - start) * 1000 ))
echo "bridge.ansible.run_all.${name}:${elapsed_ms}|ms" | nc -w 1 -u graphite.openstack.org 8125
}
# See send_timer
function start_timer {
_START_TIME=$(date '+%s')
}
echo "--- begin run @ $(date -Is) ---"
# It's possible for connectivity to a server or manifest application to break
@ -33,24 +68,43 @@ set +e
# stuck if they are oomkilled
# Clone system-config and install modules and roles
start_timer
timeout -k 2m 120m ansible-playbook ${ANSIBLE_PLAYBOOKS}/update-system-config.yaml
send_timer update_system_config
# Update the code on bridge
start_timer
timeout -k 2m 120m ansible-playbook ${ANSIBLE_PLAYBOOKS}/bridge.yaml
send_timer bridge
# Run the base playbook everywhere
start_timer
timeout -k 2m 120m ansible-playbook -f 50 ${ANSIBLE_PLAYBOOKS}/base.yaml
send_timer base
# Update the puppet version
start_timer
timeout -k 2m 120m ansible-playbook -f 50 ${ANSIBLE_PLAYBOOKS}/update_puppet_version.yaml
send_timer update_puppet_version
# Run the git/gerrit/zuul sequence, since it's important that they all work together
start_timer
timeout -k 2m 120m ansible-playbook -f 50 ${ANSIBLE_PLAYBOOKS}/remote_puppet_git.yaml
send_timer git
# Run AFS changes separately so we can make sure to only do one at a time
# (turns out quorum is nice to have)
start_timer
timeout -k 2m 120m ansible-playbook -f 1 ${ANSIBLE_PLAYBOOKS}/remote_puppet_afs.yaml
send_timer afs
# Run everything else. We do not care if the other things worked
start_timer
timeout -k 2m 120m ansible-playbook -f 50 ${ANSIBLE_PLAYBOOKS}/remote_puppet_else.yaml
send_timer else
# Send the combined time for everything
send_timer total $GLOBAL_START_TIME
echo "--- end run @ $(date -Is) ---"
echo