Rework the getthelogs helper script for wget recursive
The getthelogs script uses wget and parallel to fetch some of the CI jobs logs. Update the README to encourage contributors to improve the elastic-recheck detection rules with the help of that script and being smart folks. Change-Id: Ia48e45118776ca710ecd17421c5fb024ab436293 Signed-off-by: Bogdan Dobrelya <bdobreli@redhat.com>
This commit is contained in:
parent
6cf056cc68
commit
eded2fdbd1
@ -17,6 +17,10 @@ Tools to help run CI jobs for TripleO. Includes things like:
|
|||||||
* Heat templates to help deploy and maintain test environment nodes
|
* Heat templates to help deploy and maintain test environment nodes
|
||||||
using an undercloud.
|
using an undercloud.
|
||||||
* Helper script(s) to generate CI status reports. (tox -ecireport -- -f)
|
* Helper script(s) to generate CI status reports. (tox -ecireport -- -f)
|
||||||
|
* Helper `getthelogs` script to download important job logs locally.
|
||||||
|
Then you may want to inspect the logs for known errors and contribute
|
||||||
|
discovered search patterns to the
|
||||||
|
`elastic-recheck queries <https://git.openstack.org/cgit/openstack-infra/elastic-recheck/tree/queries>`_.
|
||||||
|
|
||||||
|
|
||||||
OpenStack Infrastructure is deploying multiple jobs with different scenarios.
|
OpenStack Infrastructure is deploying multiple jobs with different scenarios.
|
||||||
|
@ -1,54 +1,72 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
set -eu -o pipefail
|
||||||
|
|
||||||
# Helper script for downloading tripleo-ci logs, it then prompts the users for
|
# Print the help text for this helper to stdout.
function usage(){
    # A quoted heredoc keeps the message in one readable piece and
    # prevents any expansion inside the example URL.
    cat <<'HELP'
Helper script for downloading tripleo-ci jobs logs

Example:
getthelogs http://logs.openstack.org/00/123456/7/check/gate-tripleo-ci-foo/d3adbeef

Downloads the logs and starts a shell from the logs root directory
HELP
}
|
||||||
|
|
||||||
FILES="$BASEURL/console.html"
|
# EXIT/SIGINT/SIGTERM handler: report how the download ended, then drop
# the user into a throwaway interactive shell rooted just above the
# download directory so the fetched logs can be inspected immediately.
# Globals: TDIR (read) - target download directory set by the main body.
function finish(){
    # Preserve the exit status of the command that fired the trap,
    # unless a caller already recorded one in $rc.
    rc=${rc:-$?}
    # Reset the EXIT trap so this handler cannot re-enter itself.
    trap - EXIT
    # Quote $TDIR (paths built from URLs) and bail out with the original
    # status if the directory is somehow missing.
    cd "$TDIR/../" || exit "${rc}"
    echo "Download job exited ${rc}"
    # Clean shell (no rc files) with a marker prompt, as advertised by usage().
    PS1="JOBLOGS ]\$ " bash --noprofile --norc
}
|
||||||
|
|
||||||
|
# Recursively collect sub-directory URLs from an Apache-style HTML index.
# Arguments: $1 - index URL, expected to end with a trailing slash.
# Outputs:   whitespace-separated list of directory URLs on stdout
#            (an empty line when there are none).
# Returns:   always 0.
function get_dirs(){
    # Directories that are never interesting for CI log triage.
    local drop="\b(etc|ara|ara_oooq|docs|build|stackviz|sudoers.d|lib|config-data|extra)\b"
    local directories=""
    local d
    # Scrape the "[DIR" rows out of the index page and rewrite each href
    # into an absolute URL anchored at $1.
    directories=$(curl -s "$1" 2> /dev/null | grep -E "\[DIR" | grep -vE "${drop}" | sed -e "s,.*href=\"\([^\"]*\)\".*,${1}\1,g")
    if [[ -n "$directories" ]]; then
        # Word-splitting of $directories is intentional here: the list is
        # whitespace-separated and URLs contain no whitespace.
        for d in $directories; do
            directories="$directories $(get_dirs "$d/")"
        done
        # Unquoted on purpose: flattens embedded newlines to single spaces.
        echo $directories
    else
        echo ""
    fi
    return 0
}
|
||||||
|
|
||||||
|
# --- main ---

# No argument, or an argument starting with "-": print help and stop.
# A brace group (not a subshell) so that `exit 1` really leaves the script.
[[ "${1:--}" =~ ^\s+?- ]] && { usage; exit 1; }
# wget does the actual downloading; fail fast when it is missing.
# NB: redirections were previously in the wrong order (2>&1 >/dev/null),
# which leaked type's stderr to the terminal instead of silencing it.
type -p wget >/dev/null 2>&1 || { echo "Please install a wget tool!"; exit 127; }
trap finish EXIT SIGINT SIGTERM

# Number of parallel wget processes.
WORKERS=6
BASEURL=${1%/}
# Path depth of the parent URL, used to detect job-root URLs below.
SC=$(dirname "$BASEURL" | grep -o / | wc -w)
# A job-root URL (shallow, not already ending in "logs") gets a console.html
# plus its logs/ subtree; anything else is fetched as-is.
if [[ ! $(basename "$BASEURL") == 'logs' && "$SC" -le 7 ]]; then
  console=$BASEURL/console.html
  BASEURL=${BASEURL}/logs
else
  console=''
fi
# Mirror the remote path layout under /tmp. Stripping "*://" instead of
# "*http://" also covers https URLs.
TDIR=${BASEURL#*://}
TDIR=/tmp/${TDIR}
mkdir -p "$TDIR"
cd /tmp

echo "Target dir for download: $TDIR"
echo "Will download logs from the following URLs:"
list_to_get="$console $(get_dirs "$BASEURL/")"
# Word-splitting of $list_to_get is intentional (whitespace-separated URLs).
for d in $list_to_get; do
  echo "$d"
done

# Build one wget command line per directory. The single-quoted patterns are
# re-parsed by `sh -c` below, protecting the regex '|' characters from the
# shell. The old extra \"...\" wrapping is gone: it made sh hand wget all of
# its options as one single argument.
rm -f wget-jobs.txt
for d in $list_to_get; do
  args="-nv -nc --no-use-server-timestamps \
 --accept-regex='\.txt\.gz$|console\.htm[l]?$|messages$' \
 --reject='index.html*' \
 --recursive -l 10 --domains logs.openstack.org --no-parent \
 -erobots=off --wait 0.25 ${d}"
  echo "${args}" >> wget-jobs.txt
done

# Duplicate every job (with -nc the second pass is a cheap retry for files
# the first pass missed), shuffle to spread the load across directories,
# then fan out over $WORKERS parallel wget processes.
sed -n '{p;p}' wget-jobs.txt | shuf > wget-jobs-shuf.txt
xargs -r -n1 -P "${WORKERS}" -I{} sh -c "wget {}" < wget-jobs-shuf.txt
|
|
||||||
|
Loading…
Reference in New Issue
Block a user