From 97a3ab9bf347903e3b9993e36f7dd113c6709f37 Mon Sep 17 00:00:00 2001
From: Ian Wienand
Date: Mon, 10 Sep 2018 14:49:45 +1000
Subject: [PATCH] Add statsd metrics for ansible runs

Add some coarse-grained statsd tracking for the global ansible runs.
Adds a timer for each step, along with an overall timer.

This adds a single argument so that we only try to run stats when
running from the cron job (so if we're debugging by hand or something,
this doesn't trigger). Graphite also needs to accept stats from
bridge.o.o.

The plan is to present this via a simple grafana dashboard.

Change-Id: I299c0ab5dc3dea4841e560d8fb95b8f3e7df89f2
---
 playbooks/group_vars/graphite.yaml           |  3 ++
 playbooks/roles/ansible-cron/tasks/main.yaml |  2 +-
 run_all.sh                                   | 54 ++++++++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/playbooks/group_vars/graphite.yaml b/playbooks/group_vars/graphite.yaml
index e8a5c57513..639d3988cf 100644
--- a/playbooks/group_vars/graphite.yaml
+++ b/playbooks/group_vars/graphite.yaml
@@ -1,4 +1,7 @@
 iptables_extra_allowed_hosts:
+  - hostname: bridge.openstack.org
+    port: 8125
+    protocol: udp
   - hostname: git.openstack.org
     port: 8125
     protocol: udp
diff --git a/playbooks/roles/ansible-cron/tasks/main.yaml b/playbooks/roles/ansible-cron/tasks/main.yaml
index c25dbea5f6..4e65e12b67 100644
--- a/playbooks/roles/ansible-cron/tasks/main.yaml
+++ b/playbooks/roles/ansible-cron/tasks/main.yaml
@@ -13,7 +13,7 @@
   cron:
     name: run_all.sh
     state: present
-    job: 'flock -n /var/run/ansible/run_all.lock bash /opt/system-config/run_all.sh >> /var/log/ansible/run_all_cron.log 2>&1'
+    job: 'flock -n /var/run/ansible/run_all.lock bash /opt/system-config/run_all.sh -c >> /var/log/ansible/run_all_cron.log 2>&1'
     minute: "{{ update_cron_interval.minute }}"
     hour: "{{ update_cron_interval.hour }}"
     day: "{{ update_cron_interval.day }}"
diff --git a/run_all.sh b/run_all.sh
index eadf347af0..2f4fc11598 100755
--- a/run_all.sh
+++ b/run_all.sh
@@ -22,6 +22,41 @@ set -e
 SYSTEM_CONFIG=/opt/system-config
 ANSIBLE_PLAYBOOKS=$SYSTEM_CONFIG/playbooks
 
+# We only send stats if running under cron
+UNDER_CRON=0
+
+while getopts ":c" arg; do
+    case $arg in
+        c)
+            UNDER_CRON=1
+            ;;
+    esac
+done
+
+GLOBAL_START_TIME=$(date '+%s')
+
+# Send a timer stat to statsd
+# send_timer metric [start_time]
+# * uses timer metric bridge.ansible.run_all.<$1>
+# * time will be taken from last call of start_timer, or $2 if set
+function send_timer {
+    # Only send stats under cron conditions
+    if [[ ${UNDER_CRON} != 1 ]]; then
+        return
+    fi
+
+    local current=$(date '+%s')
+    local name=$1
+    local start=${2-$_START_TIME}
+    local elapsed_ms=$(( (current - start) * 1000 ))
+
+    echo "bridge.ansible.run_all.${name}:${elapsed_ms}|ms" | nc -w 1 -u graphite.openstack.org 8125
+}
+# See send_timer
+function start_timer {
+    _START_TIME=$(date '+%s')
+}
+
 echo "--- begin run @ $(date -Is) ---"
 
 # It's possible for connectivity to a server or manifest application to break
@@ -33,24 +68,43 @@ set +e
 # stuck if they are oomkilled
 
 # Clone system-config and install modules and roles
+start_timer
 timeout -k 2m 120m ansible-playbook ${ANSIBLE_PLAYBOOKS}/update-system-config.yaml
+send_timer update_system_config
 
 # Update the code on bridge
+start_timer
 timeout -k 2m 120m ansible-playbook ${ANSIBLE_PLAYBOOKS}/bridge.yaml
+send_timer bridge
 
 # Run the base playbook everywhere
+start_timer
 timeout -k 2m 120m ansible-playbook -f 50 ${ANSIBLE_PLAYBOOKS}/base.yaml
+send_timer base
 
 # Update the puppet version
+start_timer
 timeout -k 2m 120m ansible-playbook -f 50 ${ANSIBLE_PLAYBOOKS}/update_puppet_version.yaml
+send_timer update_puppet_version
 
 # Run the git/gerrit/zuul sequence, since it's important that they all work together
+start_timer
 timeout -k 2m 120m ansible-playbook -f 50 ${ANSIBLE_PLAYBOOKS}/remote_puppet_git.yaml
+send_timer git
+
 # Run AFS changes separately so we can make sure to only do one at a time
 # (turns out quorum is nice to have)
+start_timer
 timeout -k 2m 120m ansible-playbook -f 1 ${ANSIBLE_PLAYBOOKS}/remote_puppet_afs.yaml
+send_timer afs
+
 # Run everything else. We do not care if the other things worked
+start_timer
 timeout -k 2m 120m ansible-playbook -f 50 ${ANSIBLE_PLAYBOOKS}/remote_puppet_else.yaml
+send_timer else
+
+# Send the combined time for everything
+send_timer total $GLOBAL_START_TIME
 
 echo "--- end run @ $(date -Is) ---"
 echo