#!/bin/bash -xxx
# Source: http://git.openstack.org/cgit/openstack-infra/tripleo-ci/tree/scripts/getthelogs
# (the upstream script is no longer maintained or cared about)
RETRY_LOCAL=${RETRY_LOCAL:-false}
set -eu -o pipefail
# Directory names to skip while crawling the index pages
DROP="\b(ara|ara_oooq|docs|build|stackviz|conf\.modules\.d|\.git)\b"
# File name patterns to exclude from the download list
NOT="log\.[0-9]|~lock~|md5"
function usage(){
  echo "Helper script for downloading tripleo/rdo/osp/os-infra CI job logs"
  echo "Downloads the logs and starts a shell from the logs root directory"
  echo
  echo "Example:"
  echo "getthelogs http://logs.openstack.org/00/123456/7/check/gate-tripleo-ci-foo/d3adbeef"
  echo
  echo "Example 2 - Rebuild from local cache"
  echo "(helpful to get the logs of another run of the same job - retaining the file structure):"
  echo "RETRY_LOCAL=/tmp/entries.123456789O getthelogs http://logs.openstack.org/00/123456/8/check/gate-tripleo-ci-foo/d3adbeef"
  echo
  echo "Repeat prepared download jobs stored in a local file"
  echo "RETRY_LOCAL=all getthelogs"
}
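# Number of parallel download workers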
WORKERS=6
SSO="${SSO:-null}"
COOK=""
if [ "$SSO" != "null" ] ; then
curl --fail --negotiate -u: -b cookies -c cookies -L "$SSO" &> /dev/null
COOK="-b cookies -Ok"
fi
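# getit <workers> <filter>: run the prepared curl jobs from /tmp/curl-jobs-shuf.txt
# in parallel, piping any curl output through the given filter (cat or zcat)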
function getit(){
  # Do not fail if something is missing / cannot be downloaded
  set +e
  cat /tmp/curl-jobs-shuf.txt | xargs -r -n1 -P ${1} -I{} sh -c "curl {} | ${2}"
  set -e
}
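# EXIT/SIGINT/SIGTERM handler: report where the processed URL list is stored and
# drop into an interactive shell rooted at the download directory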
function finish(){
  rc=${rc:-$?}
  echo "Processed URLs stored in $urls:"
  #cat $urls
  trap - EXIT
  cd $TDIR
  echo "Download job exited ${rc}"
  PS1="JOBLOGS ]\$ " bash --noprofile --norc
}
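# get_jobs: recursively walk the server's index pages starting at the given URL,
# collecting downloadable file URLs into $urls; with the "cached" flag it rebuilds
# the list from a previously recorded entries tree instead of crawling.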
# args: url, [parent_entry,] [cached?]
function get_jobs(){
  local lurls=$(mktemp "$fentries/urls.XXXXXXXXXX")
  local entries
  local e
  local paths
  if [ "${3:-}" ]; then
    entries=$2
    local prefix="${1%/logs/*}"
    local postfix="logs/${1##*/logs/}"
  else
    entries=$(mktemp "$fentries/entries.XXXXXXXXXX")
    [ "${2:-}" ] && echo $2 > $entries # add to tree of cached entries
    f=$(basename ${1%/})
    if [ "x$COOK" = "x" ]; then
      curl -sLk $COOK "$1" 2> /dev/null > "$f"
    else
      curl -sLk $COOK "${1%/}" 2> /dev/null
    fi
    cat "$f" | sed -r 's/></>\n</g' | $filter |\
      grep -E '\[DIR|href=\S+\/?<' | grep -vE "$DROP|href=\"?http" |\
      sed -e "s,.*href=[\"']\([^\"']*\)[\"'].*,${1}\1,g" |\
      awk -F'index' '{print $1}' | tee -a $entries
  fi
  grep -q '^http' $entries || return
  set +u
  while read -r d; do
    echo $d | grep -q '^http' || continue # ignore tree metadata
    echo $d | grep -Eq '\.\.\/|\/\/$' && continue
    echo "Processing URLs for $d"
    if echo $d | grep -q /$; then # list directories via recursion
      if [ "${3:-}" ]; then
        # TODO: test read back through the tree entries
        # allowing different URLs to prefix the known log files structure
        e=$(head -1 $entries)
        echo $e | grep -q '^http' && e=' ' # reached a local bottom
        paths=$(grep -rl $e $(dirname $entries))
        e="$e cached"
      else
        # grow the tree
        e=$entries
      fi
      cat $lurls | grep -q "${d%*/}/ " || get_jobs "${d%*/}/" $e
    else # also list files
      grep -q "$d" $urls && continue
      grep -q "$d" $lurls && continue
      cat $lurls | grep -q "$d " || echo "$d" >> $lurls
    fi
  done < <(cat -- $entries)
  set -u
  cat $lurls >> $urls
}
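# RETRY_LOCAL=all: skip crawling and just re-run the download jobs prepared by a previous run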
if [ "$RETRY_LOCAL" = "all" ]; then
getit $WORKERS cat
exit 0
fi
[[ "${1:--}" =~ ^\s+?- ]] && (usage; exit 1)
trap finish EXIT SIGINT SIGTERM
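# Temp file accumulating every URL selected for download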
urls=$(mktemp -t tmp.XXXXXXXXXX)
if [ "$RETRY_LOCAL" = "false" ]; then
fentries=$(mktemp -d -t entries-XXXXXXXXXX)
else
fentries=$RETRY_LOCAL
fi
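# Mirror the URL path layout under /tmp and download into it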
BASEURL=${1%/}
SC=$(dirname $BASEURL | grep -o \/ | wc -w)
TDIR=${BASEURL##*http://}
TDIR=${TDIR##*https://}
TDIR=/tmp/${TDIR}
mkdir -p "$TDIR"
cd "$TDIR"
echo "Target dir for download: $TDIR"
echo Will download logs from the following URLs:
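# Pick the download filter: zcat if the server gzip-encodes its responses, cat otherwise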
filter="cat"
if [ "x$COOK" = "x" ]; then
curl -sIk $COOK "$BASEURL/" | grep -qi 'content-encoding: gzip' && filter="zcat"
else
curl -sIk $COOK "$BASEURL"
f=$(basename "$BASEURL")
grep -qi 'content-encoding: gzip' "$f" && filter="zcat"
fi
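# Build the URL list: crawl the server, or replay the cached entries tree from RETRY_LOCAL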
if [ "$RETRY_LOCAL" = "false" ]; then
get_jobs "$BASEURL/"
else
# rebuild urls descending the tree (top is the oldest entry dir)
# TODO: it must traverse all paths to all bottoms...
get_jobs "$BASEURL/" $(ls -t1 "${fentries}/entries.*" | tail -1) cached
fi
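# Turn every collected URL into a curl job line: "-Lfk [cookies] <url> -o <local path>"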
rm -f /tmp/curl-jobs.txt
while read -r d; do
  echo $d | grep -q '^http' || continue
  echo $d | grep -Eq "$NOT" && continue
  mkdir -p $(dirname "${d##*logs/}") ||:
  echo "-Lfk ${COOK} ${d} -o $(dirname ${d##*logs/})/$(basename ${d##*logs/})" >> /tmp/curl-jobs.txt
done < <(cat -- $urls)
# Deduplicate and shuffle the prepared download jobs
sort -u /tmp/curl-jobs.txt | shuf > /tmp/curl-jobs-shuf.txt
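# First download pass, WORKERS parallel curl processes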
getit $WORKERS $filter
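# Second pass over the same job list; effectively retries anything that failed the first time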
set +e
cat /tmp/curl-jobs-shuf.txt | xargs -r -n1 -P ${WORKERS} -I{} sh -c "curl {} | $filter"
set -e
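# Post-processing: decompress whatever is still gzip-compressed on disk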
if [ "$filter" = "cat" ]; then
find . -type f -name "*.gz" | xargs -r -n1 gunzip
else
find . -type f -name "*.*" | xargs -r -n1 -I{} bash -c 'mv -f "{}" "{}_" && zcat "{}_" > "{}" && rm -f "{}_"'
fi