The Perl program on this page is an example, for educational purposes only, of how to use the LWP::UserAgent, URI::Escape and Digest::MD5 modules to download a web page and extract information from it.
Prog is an interface to Google that lists the pages on a search engine result page (SERP) together with their PageRank.
Since the PageRank is encoded in an image, I calculate an MD5 digest of each PageRank image on the result page. The digest is used to look up the PageRank value in a table. If the images change in the future, the program reports the MD5 value and the link for each image it does not recognise, so you can update the table.
The program sorts the links in the result by PageRank. This is very useful if you use site:yoursite.com as a query. It also reports each link's position before sorting, the number of pages for each PageRank, and the total number of results.
I leave obtaining more than 100 results as an exercise to the reader.
# prog.pl - Page Rank search engine interface
#
# © Copyright, 2004-2005 By John Bokma, http://johnbokma.com/
#
# This script is for educational purposes only.
#
# $Id$
use strict;
use warnings;
use URI::Escape;
use LWP::UserAgent;
use Digest::MD5 'md5_hex';
# number of results/GET
my $RESULTS = 100;

# A query is mandatory: all command line arguments together form the query.
unless ( @ARGV ) {
    print "usage: prog.pl query\n";
    exit(1);
}

# Download the result page for the query. The arguments are joined with a
# single space and URI-escaped before they are put in the query string.
my $content = get_content(
    "http://www.seochat.com/" .
    "?go=1&option=com_seotools&tool=7&q=" .
    uri_escape( join( ' ', @ARGV ) ) .
    "&result_mode=relevance&num=$RESULTS"
);

# get the URLs for the images that show the PageRank
#
# Such a URL is recognised by the 32 hex digit "ch" parameter at its end.
# The capture uses [^"] instead of . so that a match can never start at an
# earlier, unrelated src="..." attribute on the same line and swallow
# everything in between.
my @pr_images = $content =~ /src="([^"]+?&ch=[0-9a-f]{32})"/g;
unless ( @pr_images ) {
    print "No result\n";
    exit;
}
# conversion table 'MD5 digest of image' to PageRank
#
# The digests are listed in PageRank order: the first one belongs to the
# image for PageRank 0, the last one to the image for PageRank 9. When the
# images change, replace the digest at the matching position.
my %img2pr;
{
    my $pagerank = 0;
    $img2pr{ $_ } = $pagerank++ for qw(
        320d74a17556069111f8fbf222ff721d
        4385f50bd06f1b6e4ce5fff9e7dfb65c
        46daa8b23af7a6ba14abcbbdd00a71dc
        92d2527b99b871227b14662a9533596b
        a0fc736733fec2be7000d2f18c3f79b9
        d69852318b5b5296bc0641d4bfc9b2d9
        786e052246aa5a065d7862c1e05135d9
        f59b46c3a9f61059003af2e10890ef83
        d92ed4ec0d7f30662caf2bd141c57067
        f45d3d5af97ea7e90046bb957ccc9194
    );
}
# Download every PageRank image and translate its MD5 digest to a PageRank.
#
# @result   - one hash (position, pagerank, url) per recognised image
# $position - 1-based position of the image in the unsorted result page
# %pr       - PageRank => number of pages found with that PageRank
my @result;
my $position = 1;
my %pr;
for my $src ( @pr_images ) {

    # The link is carried, URI-escaped, in the "url" parameter of the image
    # URL. If that parameter is missing, report the image URL itself instead
    # of passing an undefined value around (which would only yield
    # "uninitialized value" warnings and an empty url in the output).
    my ( $escaped ) = $src =~ /url=(.+?)&ch=[0-9a-f]{32}$/;
    my $url = defined $escaped ? uri_unescape( $escaped ) : $src;

    my $img = get_content( $src );
    my $md5 = md5_hex( $img );

    if ( exists $img2pr{ $md5 } ) {
        my $pagerank = $img2pr{ $md5 };
        push @result, {
            position => $position,
            pagerank => $pagerank,
            url      => $url,
        };
        $pr{ $pagerank }++;
    } else {
        # Unknown image: print what is needed to update the conversion table.
        print "Conversion table out of date\n",
              "    pos='$position'\n",
              "    url='$url'\n",
              "    md5='$md5'\n";
    }

    # Unrecognised images count as well, so positions keep matching the
    # order on the result page.
    $position++;
}
# Order the results: highest PageRank first; results with the same PageRank
# keep the order they had on the result page (lowest position first).
my @ranked = sort {
    $b->{pagerank} <=> $a->{pagerank}
        or
    $a->{position} <=> $b->{position}
} @result;

# One line per result: PageRank, position before sorting, link.
for my $entry ( @ranked ) {
    printf "%2d/10 - %3d - %s\n", @{ $entry }{ qw( pagerank position url ) };
}
print "\n";

# How many pages were found for each PageRank, highest PageRank first.
for my $pagerank ( sort { $b <=> $a } keys %pr ) {
    printf "%3d page(s) with a PageRank of %d\n", $pr{ $pagerank }, $pagerank;
}

print "\nNumber of results: ", scalar @ranked, "\n";
exit;
# get_content( $url )
#
# Does an HTTP GET for $url, with the user agent string set to
# 'Mozilla/5.0', and returns the content of the response.
# Dies with the URL followed by the response's status line if the request
# was not successful.
sub get_content {
    my $url = shift;

    my $ua = LWP::UserAgent->new;
    $ua->agent( 'Mozilla/5.0' );

    my $response = $ua->get( $url );
    unless ( $response->is_success ) {
        die "$url: ", $response->status_line;
    }

    return $response->content;
}