#! /bin/sh
# This script returns information on search queries that lead to your site (i.e. what keywords were used)
# This script assumes the referer info is part of the access_log file in APACHE format.
#
# Put this script in ~/bin/ and run it every time you back up your log file (here named ~/logs/access_log.1)
# If the referer file is separate from the log file,
# you might want to join them both with the command "paste -d" " access.log referer.log >access_log.1"
#
# This program can also run on Windows with cygwin and activeperl
# For more explanations, look at http://www.gdargaud.net/Hack/Searches.html
date
# Markers that introduce the search terms in a search-engine URL, joined with
# \| so that the grep and sed calls using $Queries treat them as alternatives.
Queries='q=\|qry=\|query=\|search=\|ask=\|Term=\|Topic=\|pt=\|wf,'
Queries="${Queries}"'\|prev=\|back=\|str=\|Keywords=\|qkw=\|kwd=\|p=\|searchfor='
# Referers that must not be reported, also joined with \|.
# List your own website name and its aliases here, in addition to "-".
Avoid='"-"'
Avoid="${Avoid}"'\|gdargaud.net\|sung3\|rome.atmos'
# Lets make a list of queries coming from remote pages to use in various analysis
grep "$Queries" ~/logs/access_log.1 |
perl -p -e 's/%(..)/pack("c", hex($1))/eg' |
sed -e "s/.*GET \(.*\) HTTP.*\($Queries\)\([^&, ]*\).*/\3 -> \1/" -e "s/cache:[^+ ]*+//" |
perl -p -e 's/%(..)/pack("c", hex($1))/eg' |
sed -e 's/"//g' -e "s/+/ /g" |
tr -s " " " " |
sed -e "s/^ //" >/tmp/tmp$$
# You can reorder the following sections the way you like
echo
echo "++++++++ New referers +++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
# static_referers [logfile...]
#   Prints the referer (11th space-separated field) of every log line that
#   does not match $Queries, leaving out referers that contain "?" or that
#   match $Avoid.  Reads stdin when no file is given.
static_referers() {
  grep -v -- "$Queries" "$@" |
    cut -d" " -f11 |
    grep -v '?' |
    grep -v -- "$Avoid"
}
# The list of already-seen referers is kept gzipped between runs.  On the
# first run there is no archive yet: start from an empty list instead of
# letting gunzip and grep -f fail on the missing file.
if [ -f ~/bin/Referers.txt.gz ]; then
  gunzip ~/bin/Referers.txt.gz
fi
[ -f ~/bin/Referers.txt ] || touch ~/bin/Referers.txt
# Referers of this log that are not in the list yet, most frequent first
static_referers ~/logs/access_log.1 |
  grep -aivxFf ~/bin/Referers.txt |
  sort -bf | uniq -ci | sort -nrf
# Update the list of past referers
static_referers ~/logs/access_log.1 >> ~/bin/Referers.txt
# Drop entries of 1 to 5 characters and case-insensitive duplicates
grep -avE "^.{1,5}$" ~/bin/Referers.txt |
  sort -bf | uniq -i >/tmp/tmp$$_
mv /tmp/tmp$$_ ~/bin/Referers.txt
gzip ~/bin/Referers.txt
# popular_static_referers [logfile...]
#   Counts the referers (11th space-separated field) of the log lines that do
#   not match $Queries, skipping referers that contain "?" or match $Avoid,
#   and prints them most frequent first.  Reads stdin when no file is given.
#   $Avoid is quoted so that an alias containing a space or a glob character
#   stays a single grep pattern.
popular_static_referers() {
  grep -v -- "$Queries" "$@" |
    cut -d" " -f11 |
    grep -v '?' |
    grep -v -- "$Avoid" |
    sort -bf | uniq -ci | sort -nrf
}
echo
echo "++++++++ Most popular referers (static pages) ++++++++++++++++++++++++++++++++++"
popular_static_referers ~/logs/access_log.1
# popular_search_engines [logfile...]
#   For the log lines matching $Queries, takes the host part of the referer
#   (3rd "/"-separated piece of the 11th field) and prints each host with its
#   number of hits, most frequent first.  Reads stdin when no file is given.
#   The first sort folds case (-f): uniq -ci only merges ADJACENT lines, and
#   the former "sort -i" (ignore non-printing characters) could leave
#   "Google.com" and "google.com" apart.
popular_search_engines() {
  grep -- "$Queries" "$@" |
    cut -d" " -f11 | cut -d/ -f3 |
    sort -bf | uniq -ci | sort -nrf
}
echo
echo "++++++++ Most popular search engines +++++++++++++++++++++++++++++++++++++++++++"
popular_search_engines ~/logs/access_log.1
# popular_dynamic_referers [logfile...]
#   Counts the referers (11th space-separated field) that contain "?" among
#   the log lines that do not match $Queries, skipping those matching $Avoid,
#   and prints them most frequent first.  Reads stdin when no file is given.
#   The first sort folds case (-f) because uniq -ci only merges adjacent
#   lines; the former "sort -i" ignores non-printing characters, not case.
#   $Avoid is quoted so it always reaches grep as one pattern.
popular_dynamic_referers() {
  grep -v -- "$Queries" "$@" |
    cut -d" " -f11 |
    grep '?' |
    grep -v -- "$Avoid" |
    sort -bf | uniq -ci | sort -nrf
}
echo
echo "++++++++ Most popular referers (blogs, non search engines) +++++++++++++++++++++"
popular_dynamic_referers ~/logs/access_log.1
# Report: the pages reached through a search (the part after " -> " in the
# work file /tmp/tmp$$), counted case-insensitively, most frequent first.
printf '\n%s\n' "++++++++ Sort destinations +++++++++++++++++++++++++++++++++++++++++++++++++++++"
grep -a -e ' -> ' /tmp/tmp$$ \
  | sed 's/.* -> //' \
  | sort -b -f \
  | uniq -c -i \
  | sort -n -r -f
# Report: the search phrases (the part before " -> " in the work file
# /tmp/tmp$$), counted case-insensitively, most frequent first.
printf '\n%s\n' "++++++++ Search phrases ++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
grep -a -e ' -> ' /tmp/tmp$$ \
  | sed 's/ -> .*//' \
  | sort -b -f \
  | uniq -c -i \
  | sort -n -r -f
# The next two sections only print their title: their pipelines are disabled
# and kept below as comments so they can be switched back on.
printf '\n%s\n' "++++++++ Search phrases and their destinations +++++++++++++++++++++++++++++++++"
# sort -bf /tmp/tmp$$ | uniq -ci
printf '\n%s\n' "++++++++ Search keywords only ++++++++++++++++++++++++++++++++++++++++++++++++++"
# grep -a " -> " /tmp/tmp$$ |
# sed -e "s/ -> .*//" |
# tr "?" " " | tr " " "\n" |
# grep -ave "^.$" | grep -ave "^..$" |
# sort -bf | uniq -ci | sort -nrf
# Report: lines of the work file that were not turned into "phrase -> page".
printf '\n%s\n' "++++++++ Leftovers +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
grep -a -v -e ' -> ' /tmp/tmp$$
# search_phrases [pairfile...]
#   Prints the phrase part (text before " -> ") of every "phrase -> page"
#   line of the given files; lines without " -> " are skipped.
search_phrases() {
  grep -a " -> " "$@" |
    sed -e "s/ -> .*//"
}
# new_phrases pairfile knownfile
#   Prints the phrases of pairfile that are not listed in knownfile (whole
#   line, case-insensitive), each with its number of occurrences, most
#   frequent first.  The final sort is numeric (-n) like in the other
#   sections; it used to be "sort -rf", which compares the counts as text.
new_phrases() {
  search_phrases "$1" |
    grep -aivxFf "$2" |
    sort -bf | uniq -ci | sort -nrf
}
echo
echo "++++++++ New phrases +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
# The list of already-seen phrases is kept gzipped between runs.  On the
# first run there is no archive yet: start from an empty list instead of
# letting gunzip and grep -f fail on the missing file.
if [ -f ~/bin/PhraseList.txt.gz ]; then
  gunzip ~/bin/PhraseList.txt.gz
fi
[ -f ~/bin/PhraseList.txt ] || touch ~/bin/PhraseList.txt
new_phrases /tmp/tmp$$ ~/bin/PhraseList.txt
# Update the list of past searches
search_phrases /tmp/tmp$$ >> ~/bin/PhraseList.txt
# Drop entries of 1 to 3 characters and case-insensitive duplicates
grep -avE "^.{1,3}$" ~/bin/PhraseList.txt |
  sort -bf | uniq -i >/tmp/tmp$$_
mv /tmp/tmp$$_ ~/bin/PhraseList.txt
gzip ~/bin/PhraseList.txt
# cleanup_tmp
#   Removes the two work files of this run, /tmp/tmp$$ and /tmp/tmp$$_.
#   They are named explicitly: the former glob /tmp/tmp$$* also matched the
#   files of any process whose PID merely starts with ours (123 vs 1234).
cleanup_tmp() {
  rm -f -- "/tmp/tmp$$" "/tmp/tmp$$_"
}
cleanup_tmp
echo
echo "++++++++++++++++++++++++++++++++++++++++"
date