#!/bin/bash
#< Apache access_log parser to generate report detailing Google searchterms and referrers
# Version 0.1 20070302 Initial release
# Version 0.2 20070312 -H, -4 and -R options added
# Version 0.3 20070315 -a option added
# Absolute paths of the external tools used by this script. They are expanded
# unquoted at the call sites (e.g. ${AWK} ...).
AWK="/usr/bin/awk"
BASENAME="/usr/bin/basename"
ECHO="/bin/echo"
EGREP="/usr/bin/egrep"
GREP="/usr/bin/grep"
HOST="/usr/bin/host"
PERL="/usr/bin/perl"
PRINTF="/usr/bin/printf"
SED="/usr/bin/sed"
SORT="/usr/bin/sort"
TR="/usr/bin/tr"
UNIQ="/usr/bin/uniq"

# 80-character rule of '=' printed around each report section. Built with the
# shell's own printf (pad an empty string to 80 spaces, then swap the spaces
# for '=') so that start-up does not need to spawn Perl.
printf -v DIVIDER '%80s' ''
DIVIDER=${DIVIDER// /=}

# Default log file name; replaced by the argument of -f.
INPUT_FILE="access_log"

# Script name for the usage message. "$0" is quoted so that a path containing
# whitespace is handed to basename as a single argument.
THIS_PROG=$( ${BASENAME} "$0" )

# Report selection flags: 0 = off, 1 = requested on the command line.
ALL=0
FOUROFOUR=0
HITS=0
REFERRERS=0
RESOLVE=0
SEARCH=0
# Print an error message on stderr, prefixed with "Error: ".
# Arguments: $@ - the message; multiple arguments are joined with the first
#                 character of IFS (a space by default).
function print_error {
  # "$*" yields one word. "$@" embedded in a larger string would split the
  # message into several words at the argument boundaries.
  ${ECHO} "Error: $*" >&2
}
# Write the command synopsis and the per-option help lines to stderr.
# Globals: THIS_PROG (read) - program name shown in the synopsis
#          ECHO (read)      - command used to emit each line
print_usage() {
  local usage_line
  for usage_line in \
    "Usage: ${THIS_PROG} [-hR] [-a <num> | -r <num> -s <num> -H <num> -4 <num>] -f <input_file>" \
    " -f <input_file> Specify input log file" \
    " -a <num> Generate all available reports" \
    " -r <num> Generate referrer report" \
    " -s <num> Generate Google searchterm report" \
    " -H <num> Hits by IP address" \
    " -R (with -H) Attempt to resolve addresses (Slow...)" \
    " -4 <num> 404 Error Report" \
    " -h Display this help message"
  do
    ${ECHO} "${usage_line}"
  done >&2
}
# Emit the section rule stored in DIVIDER as one line on stdout.
print_divider() {
  ${ECHO} "${DIVIDER}"
}
# Verify that the path in INPUT_FILE can be used as report input.
# Globals: INPUT_FILE (read)
# Outputs: one error line on stderr (via print_error) when a check fails
# Exits:   status 1 if the path does not exist, is a directory, or cannot be
#          read; returns normally otherwise.
function check_input_file {
  if [[ ! -e "${INPUT_FILE}" ]]; then
    print_error "Input file ${INPUT_FILE} does not exist"
    exit 1
  fi
  # An existing path is not enough: a directory or an unreadable file would
  # only fail later, inside the report pipelines, with less helpful messages.
  if [[ -d "${INPUT_FILE}" ]]; then
    print_error "Input file ${INPUT_FILE} is a directory"
    exit 1
  fi
  if [[ ! -r "${INPUT_FILE}" ]]; then
    print_error "Input file ${INPUT_FILE} is not readable"
    exit 1
  fi
}
# Exit with status 1 unless the given value is a non-empty run of decimal
# digits.
# Arguments: $1 - value taken from the command line
function _require_integer {
  # The regex is matched against the whole string, so a value with an
  # embedded newline is rejected. A line-oriented grep would accept it as
  # soon as any single line was numeric, and these values are later
  # interpolated into sed scripts.
  if [[ ! "$1" =~ ^[0-9]+$ ]]; then
    print_error "$1 not an integer"
    exit 1
  fi
}

# Validate the option arguments collected during option parsing.
# Globals: REFERRERS, SEARCH, HITS, FOUROFOUR, RESOLVE (read)
#          REFNUM, SEARCHNUM, HITNUM, FOUROFOURNUM (read, only for enabled
#          reports)
# Exits:   status 1 with a message on stderr if the count of an enabled report
#          is not an integer, or if -R was given without -H.
function check_args {
  if [[ "${REFERRERS}" -ne 0 ]]; then
    _require_integer "${REFNUM}"
  fi
  if [[ "${SEARCH}" -ne 0 ]]; then
    _require_integer "${SEARCHNUM}"
  fi
  if [[ "${HITS}" -ne 0 ]]; then
    _require_integer "${HITNUM}"
  fi
  if [[ "${FOUROFOUR}" -ne 0 ]]; then
    _require_integer "${FOUROFOURNUM}"
  fi
  if [[ "${RESOLVE}" -ne 0 && "${HITS}" -eq 0 ]]; then
    print_error "-R only makes sense with -H"
    exit 1
  fi
}
# Capture the timestamps of the first and last log lines.
# Globals: INPUT_FILE (read)
#          TIMESTAMPS (written) - "first|last"; each part is the fourth
#          whitespace-separated field of its line with the first "[" removed.
#          A one-line file yields that line's value twice; an empty file
#          yields an empty string.
function get_timestamps {
  # INPUT_FILE is quoted so that a path containing whitespace reaches sed as
  # a single argument.
  TIMESTAMPS=$(
    ${SED} -n -e '1p' -e '$p' "${INPUT_FILE}" \
      | ${AWK} '{print $4}' \
      | ${SED} 's/\[//' \
      | ${TR} '\n' '|' \
      | ${SED} 's/|$//'
  )
}
# Print the "Reporting period" line from the "first|last" pair in TIMESTAMPS.
# The start is everything before the first '|', the end everything after the
# last '|'.
print_timestamps() {
  local period_start="${TIMESTAMPS%%|*}"
  local period_end="${TIMESTAMPS##*|}"
  ${ECHO} "Reporting period from [ ${period_start} ] to [ ${period_end} ]"
}
# Print the referrer report: the most frequent values of log field 11,
# excluding Google, empty ("-") and site-internal referrers.
# Globals: INPUT_FILE, REFNUM (read)
# Outputs: a header framed by dividers, then the top REFNUM
#          "count referrer" lines (uniq -c format), highest count first,
#          then a closing divider.
function check_referrers {
  print_divider
  ${ECHO} "Referrer Report (Top ${REFNUM} Results)"
  print_timestamps
  print_divider
  # Google is taken care of by a separate report.
  # The first sed strips the surrounding double quotes from each referrer.
  # INPUT_FILE is quoted so that paths containing whitespace work.
  ${AWK} '{print $11}' "${INPUT_FILE}" \
    | ${GREP} -v "google" \
    | ${GREP} -v "^\"-\"$" \
    | ${EGREP} -v "^\"http://(www\.)?zazzybob.com" \
    | ${EGREP} -v "search|translate|zbos" \
    | ${SED} -e 's/^.//' -e 's/.$//' \
    | ${SORT} | ${UNIQ} -c | ${SORT} -k 1,1rn \
    | ${SED} -n "1,${REFNUM} p"
  print_divider
}
# Print the Google search term report: the most frequent search phrases
# extracted from referrer URLs (log field 11).
# Globals: INPUT_FILE, SEARCHNUM (read)
# Outputs: a header framed by dividers, then the top SEARCHNUM
#          "count phrase" lines (uniq -c format), highest count first,
#          then a closing divider.
function check_search {
  print_divider
  ${ECHO} "Google Search Term Report (Top ${SEARCHNUM} Results)"
  print_timestamps
  print_divider
  # We could place all the sed invocations below into a single command, but
  # that would break the nice formatting of the script ;-)
  # Steps: keep lines mentioning a Google search, take the referrer field,
  # strip its quotes, reduce the URL to the text after "q=" (the "[^a]"
  # presumably keeps an "aq=" parameter from matching), turn '+' into
  # spaces, drop everything from the first '&' up to the next space or
  # backslash, and delete (not decode) upper-case %XX escapes.
  # INPUT_FILE is quoted so that paths containing whitespace work.
  ${GREP} -i "google.*search" "${INPUT_FILE}" \
    | ${AWK} '{print $11}' \
    | ${SED} -e 's/^\"//' -e 's/\"$//' \
    | ${SED} -e 's!http://www.google\.[^/]*/search.*[^a]q=\([^=]*\).*$!\1!' \
    | ${SED} -e 's/+/ /g' -e 's/&[^\ ]*//' -e 's/%[0-9A-F][0-9A-F]//g' \
    | ${SORT} | ${UNIQ} -c | ${SORT} -k 1,1rn \
    | ${SED} -n "1,${SEARCHNUM} p"
  print_divider
}
# List the HITNUM most frequent values of log field 1 (the client address)
# as "count address" lines in uniq -c format, highest count first.
# Globals: INPUT_FILE, HITNUM (read)
function _top_hit_addresses {
  ${AWK} '{print $1}' "${INPUT_FILE}" \
    | ${SORT} | ${UNIQ} -c | ${SORT} -k 1,1rn \
    | ${SED} -n "1,${HITNUM} p"
}

# Print the hits-by-address report.
# Globals: INPUT_FILE, HITNUM, RESOLVE (read)
# Outputs: a header framed by dividers, the report lines, a closing divider.
#          With RESOLVE unset/0 the lines are in uniq -c format. With RESOLVE
#          non-zero each line is "count<TAB>name", where name is the last
#          word of the first line printed by ${HOST} (trailing dot removed),
#          or the address itself when the lookup did not succeed.
function check_hits {
  local count ipaddr lookup resolved
  print_divider
  ${ECHO} "Hits by IP Address (Top ${HITNUM} Results)"
  print_divider
  # A real if/else: with "A && {...} || {...}" the second group would also
  # run whenever the first group ended with a non-zero status.
  if (( RESOLVE )); then
    _top_hit_addresses | while read -r count ipaddr; do
      resolved=""
      # Only trust the output when the lookup command reports success.
      # Otherwise messages such as a timeout notice would have their last
      # word printed as if it were a host name.
      if lookup=$( ${HOST} "${ipaddr}" ); then
        resolved=$( ${AWK} 'NR==1{print $NF}' <<<"${lookup}" | ${SED} 's/\.$//' )
      fi
      # Keep the explicit NXDOMAIN check in case the lookup tool reports
      # "not found" while still exiting with status 0.
      if [[ -z "${resolved}" || "${resolved}" = "3(NXDOMAIN)" ]]; then
        resolved="${ipaddr}"
      fi
      ${PRINTF} "%d\t%s\n" "${count}" "${resolved}"
    done
  else
    _top_hit_addresses
  fi
  print_divider
}
# Print the 404 report: the most frequently requested paths (log field 7)
# among the lines whose field 9 is "404".
# Globals: INPUT_FILE, FOUROFOURNUM (read)
# Outputs: a header framed by dividers, then the top FOUROFOURNUM
#          "count path" lines (uniq -c format), highest count first,
#          then a closing divider.
function check_four_o_fours {
  print_divider
  ${ECHO} "404 Error Report (Top ${FOUROFOURNUM} Results)"
  print_divider
  # INPUT_FILE is quoted so that paths containing whitespace work.
  ${AWK} '$9 == "404" { print $7 }' "${INPUT_FILE}" \
    | ${SORT} | ${UNIQ} -c | ${SORT} -k 1,1rn \
    | ${SED} -n "1,${FOUROFOURNUM} p"
  print_divider
}
# Parse the command line. The leading ':' in the option string selects
# getopts' silent error reporting: OPTION becomes ':' when an option is
# missing its argument and '?' for an unknown option, with the offending
# letter in OPTARG.
while getopts ":hf:r:s:H:R4:a:" OPTION; do
  case "${OPTION}" in
    h)
      print_usage
      exit 0
      ;;
    f)
      INPUT_FILE="${OPTARG}"
      ;;
    a)
      ALL=1
      ALLNUM="${OPTARG}"
      ;;
    r)
      REFERRERS=1
      REFNUM="${OPTARG}"
      ;;
    s)
      SEARCH=1
      SEARCHNUM="${OPTARG}"
      ;;
    H)
      HITS=1
      HITNUM="${OPTARG}"
      ;;
    R)
      RESOLVE=1
      ;;
    4)
      FOUROFOUR=1
      FOUROFOURNUM="${OPTARG}"
      ;;
    :)
      print_error "Option -${OPTARG} requires an argument"
      print_usage
      exit 1
      ;;
    *)
      print_error "Unknown option -${OPTARG}"
      print_usage
      exit 1
      ;;
  esac
done
shift $(( OPTIND - 1 ))

# No positional arguments are accepted.
if [[ "$#" -ne 0 ]]; then
  print_usage
  exit 1
fi

# -a switches on every report and applies its count to all of them,
# replacing any counts given with -r, -s, -H or -4.
if [[ "${ALL}" -ne 0 ]]; then
  FOUROFOUR=1
  HITS=1
  REFERRERS=1
  SEARCH=1
  FOUROFOURNUM=${ALLNUM}
  HITNUM=${ALLNUM}
  REFNUM=${ALLNUM}
  SEARCHNUM=${ALLNUM}
fi

# The exits below are separate statements rather than "msg && exit", so they
# happen even if writing the message fails.
if [[ "${REFERRERS}" -eq 0 && "${SEARCH}" -eq 0 && "${HITS}" -eq 0 && "${FOUROFOUR}" -eq 0 ]]; then
  print_error "At least one report type must be specified"
  exit 1
fi

check_args
check_input_file
get_timestamps

# Run the requested reports in a fixed order.
if (( REFERRERS )); then
  check_referrers
fi
if (( SEARCH )); then
  check_search
fi
if (( HITS )); then
  check_hits
fi
if (( FOUROFOUR )); then
  check_four_o_fours
fi
exit 0