John Bokma Perl
freelance Perl programmer

Google Search Report

A handy overview of Google Search queries | 0 comments

The following Perl program is alpha; use it at your own risk. Moreover, if you use it while it is in the alpha stage, make sure to check this page often for updates. The program generates an HTML page with statistics if you provide it with an Apache HTTP server access_log. More information will follow, but here is the program:

#!/usr/bin/perl
#
# gsreport.pl - Google Search Report (ALPHA!)
#
#  Copyright, 2006 by John Bokma, http://johnbokma.com/
# License: The Artistic License
#
# $Id: gsreport.pl 1080 2008-09-30 18:27:07Z john $ 

use strict;
use warnings;

use Carp;
use Encode;
use List::Util 'first';
use HTML::Entities;
use URI::Escape;
use Getopt::Long;
use Time::Local;


# Start of the run; the elapsed time is reported in the page footer.
my $time = time();

# Defaults for the command line options (see the usage text).
my ( $steps, $mapping, $prefix ) = ( 14, 'log', '' );

# Inclusive YYYYMMDD date window; the defaults accept every date.
my ( $begin, $end ) = ( 0, 99991231 );


# Print the command line summary, including the current values of
# $steps and $mapping, to standard output and terminate the program.
sub print_usage_and_exit {

    my @usage = (
        'usage: gsreport.pl [OPTIONS] ACCESS_LOG',
        '',
        'options:',
        '',
        "    steps   - number of color steps, default $steps",
        "    mapping - log or lin, default $mapping",
        '    prefix  - prefix for paths (creates links), default none',
        '    begin   - start at YYYYMMDD',
        '    end     - stop at YYYYMMDD',
    );

    # The trailing empty element gives the last line its newline.
    print join( "\n", @usage, '' );

    exit;
}


# Parse the command line. GetOptions() returns false after it has warned
# about an unknown or malformed option; treat that as a usage error
# instead of producing a report with the defaults.
GetOptions(

    "steps=i"   => \$steps,
    "mapping=s" => \$mapping,
    "prefix=s"  => \$prefix,
    "begin=i"   => \$begin,
    "end=i"     => \$end,
) or print_usage_and_exit();

my $filename = shift;
defined $filename or print_usage_and_exit();

# Three-argument open: the mode is never taken from the file name, so a
# name such as "cmd |" or ">file" is opened as a plain file. "-" keeps
# the meaning it had with two-argument open: read standard input.
my $fh;
if ( $filename eq '-' ) {

    $fh = \*STDIN;
}
else {

    open $fh, '<', $filename
        or die "Can't open '$filename' for reading: $!";
}

# create a conversion table for Xxx month name to number
my %mon_number;
@mon_number{ qw( Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec ) }
    = 1..12;

# Collected hits, keyed as $stats{year}{month}{status}{path}, each holding
#   count   - number of matching requests
#   days    - array indexed by day of month (1..31) with per-day counts
#   queries - hash of normalised search query => number of occurrences
my %stats;
while ( my $line = <$fh> ) {

    # Only GET requests whose referrer is a Google URL carrying a "q"
    # parameter are of interest. The parameter name must directly follow
    # "?" or "&", otherwise "aq=" or "oq=" would be taken for the query.
    my ( $day, $mon_name, $year, $path, $status, $query ) = $line =~ m!

        \[(\d{2})/(\w{3})/(\d{4})(?::\d\d){3}.+?\]
        \s"GET\s(\S+)\sHTTP/\d\.\d"
        \s(\S+)
        \s\S+
        \s"https?://w{1,3}\.google\.
        (?:[a-z]{2}|com?\.[a-z]{2}|com)\.?/
        [^\"]*[?&]q=([^\"&]+)[^\"]*"

    !xi or next;

    # The pattern is case-insensitive, so normalise the month name before
    # the lookup, and skip three-letter words that are not a month.
    my $mm = $mon_number{ ucfirst lc $mon_name };
    defined $mm or next;

    # Zero-pad month and day so the value is always a full YYYYMMDD
    # number; an unpadded month would turn 5 Jan 2006 into 2006105.
    my $date = sprintf '%04d%02d%02d', $year, $mm, $day;

    $date < $begin and next;

    # Stopping at the first later date relies on the log being in
    # chronological order.
    $date > $end   and last;

    $day *= 1;  # convert to number, remove leading zero if present

    # "+" encodes a space in a query string; afterwards collapse runs of
    # white space and decode the UTF-8 bytes into characters.
    $query =~ s/\+/ /g;
    $query = join ' ' => split ' ', uri_unescape $query;
    $query = Encode::decode_utf8 $query;

    $stats{ $year }{ $mm }{ $status }{ $path }{ count }++;
    $stats{ $year }{ $mm }{ $status }{ $path }{ days }[ $day ]++;
    $stats{ $year }{ $mm }{ $status }{ $path }{ queries }{ $query }++;
}

close $fh or die "Can't close '$filename' after reading: $!";


# Reason phrases shown next to the status code in each table caption; a
# status that is not listed here is shown as a bare number.
my %status2description = (

    200 => 'OK',
    206 => 'Partial Content',
    301 => 'Moved Permanently',
    304 => 'Not Modified',
    403 => 'Forbidden',
    404 => 'Not Found',
);

print_html_start();

# One table per year / month / status combination, years and months in
# ascending numerical order.
for my $year ( sort { $a <=> $b } keys %stats ) {

    my $months = $stats{ $year };

    for my $month ( sort { $a <=> $b } keys %$months ) {

        my $all_status = $months->{ $month };

        for my $status ( sort keys %$all_status ) {

            my $paths = $all_status->{ $status };

            # Busiest paths first. Equal counts are ordered by path name
            # so that the row order does not depend on hash ordering and
            # two runs over the same log produce the same page.
            my @paths = sort {

                $paths->{ $b }{ count } <=> $paths->{ $a }{ count }
                    or $a cmp $b

            } keys %$paths;

            my $description = exists $status2description{ $status }
                ? " ($status2description{ $status })"
                : '';

            print "<table>\n";

            # The newline follows the closing tag whether or not a
            # description is present.
            printf "<caption>%d-%d status: %d%s</caption>\n",
                $month,
                $year,
                $status,
                $description;

            print '<tr class="header"><th class="left">Path</th>';
            print_day_header_cells( $month, $year );
            print '<th>Min</th><th>Max</th><th>Total</th>';
            print '<th>Top query (%)</th>';
            print "</tr>\n";

            for my $path ( @paths ) {

                print_row_start( $prefix, $path );

                print_day_and_min_and_max_cells(

                    days    => $paths->{ $path }{ days },
                    steps   => $steps,
                    mapping => $mapping,
                );

                print_count_and_top_query(

                    $paths->{ $path }{ count },
                    $paths->{ $path }{ queries },
                );

                print "</tr>\n";    # row end
            }

            print "</table>\n";
        }
    }
}

# The footer reports how long the whole run took.
print_html_end( time - $time );
exit;


# Print the start of the HTML page: doctype, <head> (with a link to the
# external stylesheet gsreport.css), the opening <body> tag and the page
# heading.
sub print_html_start {

    # One entry per output line.
    my @head = (
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"',
        ' "http://www.w3.org/TR/html4/strict.dtd">',
        '<html>',
        '<head>',
        '    <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">',
        '    <title>Google Search Report</title>',
        '    <link rel="stylesheet" type="text/css" href="gsreport.css">',
        '</head>',
        '<body>',
        '<h1>Google Search Report</h1>',
    );

    # The trailing empty element gives the last line its newline.
    print join( "\n", @head, '' );
}


# Print the page footer.
#
# Takes the number of seconds it took to generate the page; the value is
# inserted into the footer text as given.
sub print_html_end {

    my ( $elapsed ) = @_;

    print '<div class="footer">' . "\n"
        . '    <a href="http://johnbokma.com/perl/google-search-report.html">Google'
        . "\n"
        . "    Search Report</a>, written by John Bokma, took $elapsed seconds to\n"
        . "    generate this page.\n"
        . "</div>\n";
}


# Open a table row and print its first cell, which holds the request path.
#
# $prefix - when not the empty string, the path is wrapped in a link whose
#           target is the prefix followed by the path
# $path   - request path; HTML-escaped before printing
sub print_row_start {

    my ( $prefix, $path ) = @_;

    my $cell = encode_entities( $path );

    if ( $prefix ne '' ) {

        my $href = encode_entities( "$prefix$path" );
        $cell = '<a href="' . $href . '">' . $cell . '</a>';
    }

    print "<tr>\n<th>" . $cell . '</th>';
}


# Print the 31 day-number header cells for the given month and year.
#
# Cells that fall on weekday 0 or 6 get the class "weekend" and cells on
# weekday 3 get the class "wednesday", counting from the weekday that
# first_day_of_month() reports for day 1.
sub print_day_header_cells {

    my ( $month, $year ) = @_;

    my %class_for = (
        0 => 'weekend',
        3 => 'wednesday',
        6 => 'weekend',
    );

    my $first_weekday = first_day_of_month( $month, $year );

    for my $date ( 1..31 ) {

        my $class = $class_for{ ( $first_weekday + $date - 1 ) % 7 };

        print defined $class
            ? qq(<th class="$class">$date</th>)
            : "<th>$date</th>";
    }
}


# Print the 31 per-day cells of a row followed by the Min and Max cells.
#
# Named parameters:
#   days    - array reference indexed by day of month (1..31) holding the
#             hit count for that day, undefined for days without hits
#   steps   - number of color classes to spread the counts over
#   mapping - 'log' (default) or 'lin': how a count is mapped to a class
#
# A day with hits becomes <td class="colorN"> with N between 1 and steps;
# a day without hits becomes a blank cell. Croaks when 'days' or 'steps'
# is missing or when 'mapping' has an unsupported value.
sub print_day_and_min_and_max_cells {

    my %params = @_;

    my $days = $params{ days };
    croak "Parameter 'days' not given" unless $days;

    my $steps = $params{ steps };
    croak "Parameter 'steps' not given" unless $steps;

    my $mapping = $params{ mapping } || 'log';
    croak "Parameter 'mapping' has an unsupported value ($mapping)"
        unless $mapping eq 'log' or $mapping eq 'lin';

    # Counts for day 1..31, undefined where there were no hits.
    my @counts = map { $days->[ $_ ] } 1..31;

    # Seed min and max with the first defined value, then widen the range
    # with every day that has hits.
    my $min = first { defined $_ } @{ $days };
    my $max = $min;
    for my $count ( grep { defined $_ } @counts ) {

        $min = $count if $count < $min;
        $max = $count if $count > $max;
    }

    my $max_step = $steps - 1;

    # Pick the function that maps a count onto a color step.
    my $step;
    if ( $min == $max ) {

        # All counts are equal: everything gets the first color.
        $step = sub { 1 };
    }
    elsif ( $mapping eq 'log' ) {

        $step = sub {

            my $fraction = ( log( $_[ 0 ] ) - log( $min ) )
                / ( log( $max ) - log( $min ) );

            return 1 + int( $max_step * $fraction );
        };
    }
    else {

        $step = sub {

            return 1 + int(
                $max_step * ( $_[ 0 ] - $min ) / ( $max - $min )
            );
        };
    }

    my $cells = '';
    for my $count ( @counts ) {

        $cells .= defined $count
            ? '<td class="color' . $step->( $count ) . "\">$count</td>"
            : '<td> </td>';
    }

    print $cells;
    print "<td>$min</td><td>$max</td>\n";
}


# Print the Total cell and the "Top query (%)" cell of a row.
#
# $count   - total number of hits for the path
# $queries - hash reference of search query => number of occurrences
#
# The top query is the one with the highest number of occurrences; the
# percentage is its share of $count. The query text is HTML-escaped.
sub print_count_and_top_query {

    my ( $count, $queries ) = @_;

    # Queries that occur equally often are ordered by their text, so the
    # reported top query does not depend on hash ordering and two runs
    # over the same log print the same query.
    my ( $top_query ) = sort {

        $queries->{ $b } <=> $queries->{ $a }
            or $a cmp $b

    } keys %$queries;

    # Without any query, or without hits, there is no share to compute;
    # leave the cell empty instead of dividing by zero.
    if ( !defined $top_query or !$count ) {

        return printf '<td>%d</td><td></td>', $count || 0;
    }

    printf '<td>%d</td><td>%s (%.1f%%)</td>',
        $count,
        encode_entities( $top_query ),
        $queries->{ $top_query } / $count * 100;
}


# Return the weekday of the first day of the given month, as reported by
# gmtime: 0 is Sunday, 6 is Saturday.
#
# $month - month number, 1..12
# $year  - four-digit year
sub first_day_of_month {

    my ( $month, $year ) = @_;

    # timegm() counts months from 0; midnight UTC on day 1 of the month.
    my @first_of_month = gmtime timegm( 0, 0, 0, 1, $month - 1, $year );

    return $first_of_month[ 6 ];
}

The Google Search Report program generates an HTML page which requires an external stylesheet named gsreport.css. You can tweak this stylesheet and even add more color levels if required. Note that in the latter case you have to pass the number of levels via the steps command line option, or update the default in the Perl program.

Copy and paste the following code into your favorite editor and save it as gsreport.css.

/* gsreport.css - external stylesheet for gsreport.pl
 *
 * (c) Copyright, 2006 by John Bokma, http://johnbokma.com/
 * License: The Artistic License
 */
 
/* Base font for the whole report page. */
body { font-size: 11px; font-family: "Trebuchet MS" }

/* Page heading. */
h1 { font-size: 14px }

/* gsreport.pl prints one table per year/month/status; keep them apart. */
table { margin-bottom: 20px }

/* Table caption: month-year and HTTP status. */
caption { text-align: left; font-size: 12px; }

/* Header cells; also used for the path cell that starts each data row. */
th { text-align: left; font-weight: normal; padding: 2px }
/* Day header cells that gsreport.pl marks as Saturday or Sunday. */
th.weekend   { background-color: #999 }
/* Day header cells that gsreport.pl marks as Wednesday. */
th.wednesday { background-color: #ccc }
/* Cells of the header row (tr class="header"). */
tr.header th {

    font-weight: bold;
    border: solid 1px black;
    text-align: right
}
/* The "Path" header cell is the only left-aligned one in the header row. */
tr.header th.left { text-align: left }
 
/* Data cells: per-day counts, min, max, total and top query. */
td { text-align: right; padding: 1px; }
 
/* Color levels for the per-day count cells. gsreport.pl emits the classes
 * color1 up to colorN, where N is the value of its steps option (default
 * 14), so there has to be one rule for every step in use.
 * The last two levels do not set a text color.
 */
.color1  { background-color: #100; color: #fff }
.color2  { background-color: #200; color: #fff }
.color3  { background-color: #300; color: #fff }
.color4  { background-color: #400; color: #fff }
.color5  { background-color: #500; color: #fff }
.color6  { background-color: #600; color: #fff }
.color7  { background-color: #700; color: #fff }
.color8  { background-color: #800; color: #fff }
.color9  { background-color: #900; color: #fff }
.color10 { background-color: #a00; color: #fff }
.color11 { background-color: #b00; color: #fff }
.color12 { background-color: #c00; color: #fff }
.color13 { background-color: #d00 }
.color14 { background-color: #f44 }
Please post a comment | read 0 comments | RSS feed