From eded2fdbd1b55b470347b1210d0dd43ca32e54ea Mon Sep 17 00:00:00 2001
From: Bogdan Dobrelya
Date: Wed, 9 Aug 2017 17:29:46 +0200
Subject: [PATCH] Rework the getthelogs helper script for wget recursive

The getthelogs script now uses wget, with several downloads running in
parallel, to fetch a CI job's logs for local inspection. Update the
README to encourage contributors to search the downloaded logs for
known errors and to contribute the discovered search patterns as
elastic-recheck detection rules.

Change-Id: Ia48e45118776ca710ecd17421c5fb024ab436293
Signed-off-by: Bogdan Dobrelya
---
 README.rst         |   4 ++
 scripts/getthelogs | 112 ++++++++++++++++++++++++++-------------------
 2 files changed, 69 insertions(+), 47 deletions(-)

diff --git a/README.rst b/README.rst
index 8ca09c07b..481d0e2c7 100644
--- a/README.rst
+++ b/README.rst
@@ -17,6 +17,10 @@ Tools to help run CI jobs for TripleO. Includes things like:
 * Heat templates to help deploy and maintain test environment nodes using
   an undercloud.
 * Helper script(s) to generate CI status reports. (tox -ecireport -- -f)
+* Helper `getthelogs` script to download important job logs locally.
+  Then you may want to inspect the logs for known errors and contribute
+  discovered search patterns as
+  `elastic-recheck queries `_.
 
 OpenStack Infrastructure is deploying multiple jobs with different scenarios.
diff --git a/scripts/getthelogs b/scripts/getthelogs
index e35bf4a74..d17907451 100755
--- a/scripts/getthelogs
+++ b/scripts/getthelogs
@@ -1,54 +1,72 @@
 #!/bin/bash
+set -eu -o pipefail
 
-# Helper script for downloading tripleo-ci logs, it then prompts the users for
-# file they want to download, unzips them into a tmp directory and changes into
-# the tmp directory, while in the tmp directory run "getthelogs" with no params
-# to download any log files you hadn't previously downloaded
-# Run it like this
-# getthelogs http://logs.openstack.org/68/237568/5/check-tripleo/gate-tripleo-ci-f22-ha/dd8f61d/
-
-set -eu
-
-BASEURL=$1
-TDIR=${BASEURL//\//}
-
-# We do not clean up this directory, so data doesn't need to be downloaded a second time
-# if the script is rerun again with the same url
-TDIR=~/tmp/ci-${TDIR//\//-}
-
-mkdir -p $TDIR
-cd $TDIR
-
-echo $BASEURL > BASEURL
-
-function _getfile(){
-    URL=$1
-    BASENAME=$(basename $1)
-
-    if [[ $BASENAME =~ .*(tar|console).* && ! -e $BASENAME ]] ; then
-        read -p "Want $BASENAME? " X
" X - if [ "$X" = "y" ] ; then - curl -O $URL || curl -O ${URL}.gz - if [[ $BASENAME =~ .*(\.tar).* ]] ; then - mkdir ${BASENAME}_ - tar -xf $BASENAME -C ${BASENAME}_ - fi - fi - fi +function usage(){ + echo "Helper script for downloading tripleo-ci jobs logs" + echo + echo "Example:" + echo "getthelogs http://logs.openstack.org/00/123456/7/check/gate-tripleo-ci-foo/d3adbeef" + echo + echo "Downloads the logs and starts a shell from the logs root directory" } -FILES="$BASEURL/console.html" -for FILE in $(curl $BASEURL/logs/ 2> /dev/null | grep href | sed -e 's/.*href="\([^"]*\)".*/\1/g' ) ; do - FILES="$FILES $BASEURL/logs/$FILE" +function finish(){ + rc=${rc:-$?} + trap - EXIT + cd $TDIR/../ + echo "Download job exited ${rc}" + PS1="JOBLOGS ]\$ " bash --noprofile --norc +} + +function get_dirs(){ + local drop="\b(etc|ara|ara_oooq|docs|build|stackviz|sudoers.d|lib|config-data|extra)\b" + local directories="" + directories=$(curl -s "$1" 2> /dev/null | grep -E "\[DIR" | grep -vE "${drop}" | sed -e "s,.*href=\"\([^\"]*\)\".*,${1}\1,g") + if [ -n "$directories" ]; then + for d in $directories; do + directories="$directories $(get_dirs $d/)" + done + echo $directories + else + echo "" + fi + return 0 +} + +[[ "${1:--}" =~ ^\s+?- ]] && (usage; exit 1) +type -p wget 2>&1 >/dev/null || ( echo "Please install a wget tool!"; exit 127 ) +trap finish EXIT SIGINT SIGTERM + +WORKERS=6 +BASEURL=${1%/} +SC=$(dirname $BASEURL | grep -o \/ | wc -w) +if [[ ! $(basename $BASEURL) == 'logs' && SC -le 7 ]]; then + console=$BASEURL/console.html + BASEURL=${BASEURL}/logs +else + console='' +fi +TDIR=${BASEURL##*http://} +TDIR=/tmp/${TDIR} +mkdir -p $TDIR +cd /tmp + +echo "Target dir for download: $TDIR" +echo Will download logs from the following URLs: +list_to_get="$console $(get_dirs $BASEURL/)" +for d in $list_to_get; do + echo $d done -function getthelogs(){ - for FILE in $FILES ; do - _getfile $FILE - done -} -getthelogs +rm -f wget-jobs.txt +for d in $list_to_get; do + args="\"-nv -nc --no-use-server-timestamps \ + --accept-regex='\.txt\.gz$|console\.htm[l]?$|messages$' \ + --reject='index.html*' \ + --recursive -l 10 --domains logs.openstack.org --no-parent \ + -erobots=off --wait 0.25 ${d}\"" + echo "${args}" >> wget-jobs.txt +done -export FILES -export -f getthelogs _getfile -PS1="JOBLOGS ]\$ " bash +cat wget-jobs.txt | sed -n '{p;p}' | shuf > wget-jobs-shuf.txt +cat wget-jobs-shuf.txt | xargs -r -n1 -P ${WORKERS} -I{} sh -c "wget {}"