#!/bin/bash
#< Apache access_log parser to generate report detailing Google searchterms and referrers

# Version 0.1	20070302	Initial release
# Version 0.2	20070312	-H, -4 and -R options added
# Version 0.3   20070315	-a option added

# External programs used by this script. Each one is referenced through a
# variable holding an absolute path, so command lookup does not go through
# the caller's PATH. Adjust a value here if a tool lives elsewhere.

# Located in /bin
ECHO=/bin/echo

# Located in /usr/bin
AWK=/usr/bin/awk
BASENAME=/usr/bin/basename
EGREP=/usr/bin/egrep
GREP=/usr/bin/grep
HOST=/usr/bin/host
PERL=/usr/bin/perl
PRINTF=/usr/bin/printf
SED=/usr/bin/sed
SORT=/usr/bin/sort
TR=/usr/bin/tr
UNIQ=/usr/bin/uniq

# Horizontal rule (80 "=" characters) printed around each report section.
# Built with the printf builtin plus parameter expansion: pad an empty string
# to 80 spaces, then turn every space into "=". No external interpreter has
# to be started just to produce a constant string.
printf -v DIVIDER '%80s' ''
DIVIDER=${DIVIDER// /=}

# Log file to analyse; replaced by the argument of -f.
INPUT_FILE="access_log"

# Script name shown in the usage message: $0 with any leading directory
# components removed. Parameter expansion is used instead of handing an
# unquoted $0 to an external command, which would word-split when the script
# path contains whitespace.
THIS_PROG=${0##*/}

# Report selectors (0 = off, non-zero = on); set during option parsing.
ALL=0
FOUROFOUR=0
HITS=0
REFERRERS=0
RESOLVE=0
SEARCH=0

# Write an error message, prefixed with "Error: ", to standard error.
# Globals:   ECHO (read)
# Arguments: $@ - the message; several arguments are joined into one string
# Outputs:   one line on stderr, nothing on stdout
print_error() {
   # "$*" expands to a single word. "$@" embedded in a larger string would
   # instead split the message into separate arguments when more than one
   # is passed.
   ${ECHO} "Error: $*" >&2
}

# Print the command-line synopsis and option summary on standard error.
# Globals: ECHO (read), THIS_PROG (read)
# Returns: the status of the last line written
print_usage() {
   local -a help_lines=(
      "Usage: ${THIS_PROG} [-hR] [-a <num> | -r <num> -s <num> -H <num> -4 <num>] -f <input_file>"
      "       -f <input_file>   Specify input log file"
      "       -a <num>          Generate all available reports"
      "       -r <num>          Generate referrer report"
      "       -s <num>          Generate Google searchterm report"
      "       -H <num>          Hits by IP address"
      "       -R (with -H)      Attempt to resolve addresses (Slow...)"
      "       -4 <num>          404 Error Report"
      "       -h                Display this help message"
   )
   local help_line

   for help_line in "${help_lines[@]}"; do
      ${ECHO} "${help_line}"
   done >&2
}

# Write the horizontal rule stored in DIVIDER to standard output.
# Globals: ECHO (read), DIVIDER (read)
print_divider() { ${ECHO} "${DIVIDER}"; }

# Make sure INPUT_FILE can actually be read as a log file before any report
# is started. Exits the script with status 1 (after an error message) if the
# path does not exist, is a directory, or is not readable.
# Globals: INPUT_FILE (read)
check_input_file() {
   if [[ ! -e "${INPUT_FILE}" ]]; then
      print_error "Input file ${INPUT_FILE} does not exist"
      exit 1
   fi
   # Existence alone is not enough: a directory or an unreadable file would
   # only fail later, inside the report pipelines.
   if [[ -d "${INPUT_FILE}" ]]; then
      print_error "Input file ${INPUT_FILE} is a directory"
      exit 1
   fi
   if [[ ! -r "${INPUT_FILE}" ]]; then
      print_error "Input file ${INPUT_FILE} is not readable"
      exit 1
   fi
}

# Exit with status 1 (after an error message) unless $1 consists solely of
# one or more decimal digits.
# Arguments: $1 - value to validate
_require_integer() {
   # The regex is anchored to the whole string, so empty values and values
   # with embedded newlines are rejected as well.
   if [[ ! "$1" =~ ^[0-9]+$ ]]; then
      print_error "$1 not an integer"
      exit 1
   fi
}

# Validate the option arguments collected during option parsing.
# For every selected report, its result count must be an unsigned integer;
# -R is only accepted together with -H. Any violation prints an error and
# exits the script with status 1.
# Globals: REFERRERS, REFNUM, SEARCH, SEARCHNUM, HITS, HITNUM,
#          FOUROFOUR, FOUROFOURNUM, RESOLVE (all read)
check_args() {
   if [[ "${REFERRERS}" -ne 0 ]]; then
      _require_integer "${REFNUM}"
   fi
   if [[ "${SEARCH}" -ne 0 ]]; then
      _require_integer "${SEARCHNUM}"
   fi
   if [[ "${HITS}" -ne 0 ]]; then
      _require_integer "${HITNUM}"
   fi
   if [[ "${FOUROFOUR}" -ne 0 ]]; then
      _require_integer "${FOUROFOURNUM}"
   fi
   if [[ "${RESOLVE}" -ne 0 && "${HITS}" -eq 0 ]]; then
      print_error "-R only makes sense with -H"
      exit 1
   fi
}

# Record the reporting period in TIMESTAMPS as "<first>|<last>", where each
# value is the fourth whitespace-separated field of the first and last log
# line with its leading "[" removed.
# Globals: INPUT_FILE (read), TIMESTAMPS (written)
get_timestamps() {
   # INPUT_FILE is quoted so that paths containing whitespace or glob
   # characters reach sed as a single argument.
   TIMESTAMPS=$(
      ${SED} -n -e '1p' -e '$p' "${INPUT_FILE}" |
         ${AWK} '{print $4}' |
         ${SED} 's/\[//' |
         ${TR} '\n' '|' |
         ${SED} 's/|$//'
   )
}

# Print the reporting period line from the "<first>|<last>" pair in TIMESTAMPS.
# Globals: ECHO (read), TIMESTAMPS (read)
print_timestamps() {
  # Text before the first "|" is the start; text after the last "|" is the end.
  local period_start="${TIMESTAMPS%%|*}"
  local period_end="${TIMESTAMPS##*|}"

  ${ECHO} "Reporting period from [ ${period_start} ] to [ ${period_end} ]"
}

# Print the referrer report: the REFNUM most frequent values of log field 11,
# each preceded by its occurrence count, most frequent first.
# Globals: INPUT_FILE, REFNUM (read), plus the tool path variables
# Outputs: report on stdout, framed by divider lines
check_referrers() {
   print_divider
   ${ECHO} "Referrer Report (Top ${REFNUM} Results)"
   print_timestamps
   print_divider
   # Filters, in order: drop Google referrers (they are covered by the
   # separate search term report), drop empty referrers ("-"), drop
   # self-referrals from zazzybob.com, drop anything mentioning
   # search/translate/zbos, then strip the surrounding quote characters.
   # INPUT_FILE is quoted so a path with whitespace stays one argument.
   ${AWK} '{print $11}' "${INPUT_FILE}" |
      ${GREP} -v "google" |
      ${GREP} -v "^\"-\"$" |
      ${EGREP} -v "^\"http://(www\.)?zazzybob.com" |
      ${EGREP} -v "search|translate|zbos" |
      ${SED} -e 's/^.//' -e 's/.$//' |
      ${SORT} | ${UNIQ} -c | ${SORT} -k 1,1rn |
      ${SED} -n "1,${REFNUM} p"
   print_divider
}

# Print the Google search term report: the SEARCHNUM most frequent search
# terms extracted from Google referrers (log field 11), each preceded by its
# occurrence count, most frequent first.
# Globals: INPUT_FILE, SEARCHNUM (read), plus the tool path variables
# Outputs: report on stdout, framed by divider lines
check_search() {
   print_divider
   ${ECHO} "Google Search Term Report (Top ${SEARCHNUM} Results)"
   print_timestamps
   print_divider
   # The sed stages are kept separate for readability. In order they:
   #   1. strip the quotes surrounding the referrer,
   #   2. reduce the URL to the text following its q= parameter,
   #   3. turn "+" into spaces, cut the first "&..." remainder, and delete
   #      (not decode) percent-escapes of the form %XX.
   # INPUT_FILE is quoted so a path with whitespace stays one argument.
   ${GREP} -i "google.*search" "${INPUT_FILE}" |
      ${AWK} '{print $11}' |
      ${SED} -e 's/^\"//' -e 's/\"$//' |
      ${SED} -e 's!http://www.google\.[^/]*/search.*[^a]q=\([^=]*\).*$!\1!' |
      ${SED} -e 's/+/ /g' -e 's/&[^\ ]*//' -e 's/%[0-9A-F][0-9A-F]//g' |
      ${SORT} | ${UNIQ} -c | ${SORT} -k 1,1rn |
      ${SED} -n "1,${SEARCHNUM} p"
   print_divider
}

# Emit "<count> <address>" lines for the HITNUM most frequent values of log
# field 1, most frequent first.
_top_hits() {
   # INPUT_FILE is quoted so a path with whitespace stays one argument.
   ${AWK} '{print $1}' "${INPUT_FILE}" |
      ${SORT} | ${UNIQ} -c | ${SORT} -k 1,1rn | ${SED} -n "1,${HITNUM} p"
}

# Print the hits-by-address report. With RESOLVE set, every address is looked
# up with the host command and shown by name; an address whose lookup does
# not produce a name is shown as-is.
# Globals: INPUT_FILE, HITNUM, RESOLVE (read), plus the tool path variables
# Outputs: report on stdout, framed by divider lines
check_hits() {
   local count ipaddr lookup resolved

   print_divider
   ${ECHO} "Hits by IP Address (Top ${HITNUM} Results)"
   print_divider
   # A real if/else: with "A && { ...; } || { ...; }" the second group would
   # also run whenever the first one ended with a non-zero status.
   if (( RESOLVE )); then
      _top_hits |
         while read -r count ipaddr; do
            resolved=""
            # stdin is redirected so the lookup can never swallow the
            # remaining "<count> <address>" lines feeding this loop.
            if lookup=$( ${HOST} "${ipaddr}" </dev/null ); then
               # Last word of the first output line, minus a trailing dot.
               resolved=$( ${AWK} 'NR==1{print $NF}' <<< "${lookup}" | ${SED} 's/\.$//' )
            fi
            # Fall back to the raw address when the lookup command reported
            # failure, printed nothing usable, or answered NXDOMAIN.
            if [[ -z "${resolved}" || "${resolved}" == "3(NXDOMAIN)" ]]; then
               resolved="${ipaddr}"
            fi
            ${PRINTF} "%d\t%s\n" "${count}" "${resolved}"
         done
   else
      _top_hits
   fi
   print_divider
}

# Print the 404 report: the FOUROFOURNUM most frequent values of log field 7
# among lines whose field 9 equals 404, each preceded by its occurrence
# count, most frequent first.
# Globals: INPUT_FILE, FOUROFOURNUM (read), plus the tool path variables
# Outputs: report on stdout, framed by divider lines
check_four_o_fours() {
   print_divider
   ${ECHO} "404 Error Report (Top ${FOUROFOURNUM} Results)"
   print_divider
   # INPUT_FILE is quoted so a path with whitespace stays one argument.
   ${AWK} '$9 == "404" { print $7 }' "${INPUT_FILE}" |
      ${SORT} | ${UNIQ} -c | ${SORT} -k 1,1rn |
      ${SED} -n "1,${FOUROFOURNUM} p"
   print_divider
}

# Parse the command line. The leading ":" in the option string puts getopts
# into silent mode: a missing option argument is reported as ":" and an
# unknown option as "?", with the offending option letter in OPTARG.
while getopts ":hf:r:s:H:R4:a:" OPTION; do
   case "${OPTION}" in
      h)  print_usage
          exit 0 ;;
      f)  INPUT_FILE="${OPTARG}" ;;
      a)  ALL=1
          ALLNUM="${OPTARG}" ;;
      r)  REFERRERS=1
          REFNUM="${OPTARG}" ;;
      s)  SEARCH=1
          SEARCHNUM="${OPTARG}" ;;
      H)  HITS=1
          HITNUM="${OPTARG}" ;;
      R)  RESOLVE=1 ;;
      4)  FOUROFOUR=1
          FOUROFOURNUM="${OPTARG}" ;;
      :)  print_error "Option -${OPTARG} requires an argument"
          print_usage
          exit 1 ;;
      \?) print_error "Unknown option -${OPTARG}"
          print_usage
          exit 1 ;;
      *)  print_usage
          exit 1 ;;
   esac
done

shift $(( OPTIND - 1 ))

# Everything is passed via options; leftover positional arguments are an error.
# Exits are unconditional ("cmd; exit" rather than "cmd && exit") so the
# script still stops when the message itself could not be written.
if [[ "$#" -ne 0 ]]; then
   print_usage
   exit 1
fi

# -a selects every report and applies its count to all of them, overriding
# any individual counts given on the same command line.
if [[ "${ALL}" -ne 0 ]]; then
   FOUROFOUR=1
   HITS=1
   REFERRERS=1
   SEARCH=1
   FOUROFOURNUM=${ALLNUM}
   HITNUM=${ALLNUM}
   REFNUM=${ALLNUM}
   SEARCHNUM=${ALLNUM}
fi

if [[ "${REFERRERS}" -eq 0 && "${SEARCH}" -eq 0 && "${HITS}" -eq 0 && "${FOUROFOUR}" -eq 0 ]]; then
   print_error "At least one report type must be specified"
   exit 1
fi

# Validate first, then run the selected reports in a fixed order.
check_args
check_input_file
get_timestamps
if (( REFERRERS )); then
   check_referrers
fi
if (( SEARCH )); then
   check_search
fi
if (( HITS )); then
   check_hits
fi
if (( FOUROFOUR )); then
   check_four_o_fours
fi

exit 0