The following (beta) Perl program scans your document root (the directory containing the HTML files of your website) and automatically creates an RSS web feed for a given number of the most recently modified pages.
The title and description of each RSS feed entry are extracted from the HTML file by using the contents of the title element for the former, and the contents of the first paragraph element (p) for the latter.
# feed-builder.pl
#
# © Copyright, 2005 By John Bokma, http://johnbokma.com/
#
# $Id$
use strict;
use warnings;
use File::Find;
use XML::RSS;
use HTML::TreeBuilder;
use Getopt::Long;
# Command-line configuration. Only $items has a default; the other four
# values must be supplied by the user.
my $domain;         # --domain: domain name used to build the links
my $dir;            # --dir:    path to the document root
my $title;          # --title:  title of the feed
my $description;    # --desc:   description of the feed
my $items = 12;     # --items:  number of items in the feed

# GetOptions returns false on unknown options or malformed values (for
# example a non-integer --items); show_help() prints the usage and exits.
GetOptions(
    "dir=s"    => \$dir,
    "domain=s" => \$domain,
    "title=s"  => \$title,
    "desc=s"   => \$description,
    "items=i"  => \$items,
) or show_help();

# The four string options are mandatory.
show_help()
    unless defined $dir
    and defined $domain
    and defined $title
    and defined $description;

# "items=i" also accepts negative integers. A negative length handed to
# splice drops entries from the END of the list instead of limiting it to
# the first N, so the feed would silently contain almost every page.
# Treat a negative count as a usage error.
show_help() if $items < 0;
# Walk the given (web) directory and record, for every regular file whose
# name ends in .htm or .html, its full path and last-modification time.
my %file2time;
find(
    {
        wanted => sub {
            return unless -f;
            return unless /\.html?$/;

            # element 9 of the stat list is the modification time
            my $modified = ( stat )[9];
            $file2time{$File::Find::name} = $modified;
        },
    },
    $dir
);

# Newest first: compare the recorded modification times in descending order.
my @filenames =
    sort { $file2time{$b} <=> $file2time{$a} }
    keys %file2time;

# Only the first $items entries (the most recently modified) are kept.
@filenames = splice( @filenames, 0, $items );
# create the RSS file (version 1.0)
# (direct method call instead of the ambiguous indirect-object "new XML::RSS")
my $rss = XML::RSS->new( version => '1.0' );
$rss->channel(
    title       => $title,
    link        => "http://$domain/",
    description => $description,
);

# Length of the document-root prefix that has to be cut off each filename
# to obtain the URL path. A trailing separator on --dir (e.g. "web/") is
# not repeated in the names File::Find reports, so cutting off
# length($dir) characters would also swallow the slash that must start
# the URL path. Measure the prefix without trailing separators instead.
my $doc_root = $dir;
if ( $^O eq 'MSWin32' ) {
    $doc_root =~ s{[/\\]+\z}{};
}
else {
    $doc_root =~ s{/+\z}{};
}
my $doc_root_length = length $doc_root;

# add an item for each filename
for my $filename ( @filenames ) {

    my ( $item_title, $item_description ) =
        get_title_and_description( $filename );

    # Path of the page relative to the document root, always starting
    # with exactly one slash.
    my $path = substr $filename, $doc_root_length;
    $path =~ s{\A/*}{/};

    # Link to the directory rather than to its index page. The look-behind
    # makes sure only a file actually named index.htm(l) is stripped, not
    # every name that merely ends in it (e.g. "myindex.html").
    $path =~ s{(?<=/)index\.html?\z}{};

    $rss->add_item(
        title       => $item_title,
        link        => "http://$domain$path",
        description => $item_description,
        dc          => {
            date => format_date_time( $file2time{$filename} )
        }
    );
}

# output the result to STDOUT
print $rss->as_string;
# Prints the usage summary and terminates the program with exit status 1.
#
# The text is written to STDERR: the feed itself goes to STDOUT, which the
# usage line tells the user to redirect into the feed file, so a help
# message on STDOUT would end up in that file instead of on the terminal.
sub show_help {
    print STDERR <<HELP;
Usage: feed-builder [options] > index.rss
Options:
--dir path to the document root
--domain domain name
--title title of feed
--desc description of feed
--items number of items in feed
(default is 12)
Only --items is optional
HELP
    exit 1;
}
# Formats an epoch time for use in the RSS feed (dc:date).
#
# Takes the number of seconds since the epoch and returns the matching
# UTC date and time as "YYYY-MM-DDThh:mm:ssZ".
sub format_date_time {
    my ( $time ) = @_;

    # gmtime in list context: ( sec, min, hour, mday, mon, year, ... );
    # mon is zero-based and year is counted from 1900.
    my ( $sec, $min, $hour, $mday, $mon, $year ) = gmtime $time;

    # One conversion per argument: the seconds field was missing from the
    # format before, which dropped the seconds and triggered a
    # "Redundant argument in sprintf" warning on newer perls.
    return sprintf "%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday,
        $hour, $min, $sec;
}
# Extracts a title and a description from the given HTML file.
#
# Takes the name of the file and returns the list ( title, description ):
#   title       - text of the title element, or 'No title' without one
#   description - text of the first p element; without one, the title if
#                 a title element exists, otherwise 'No description'
sub get_title_and_description {
    my ( $filename ) = @_;

    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file( $filename );

    # title: contents of the title element, or the default
    my $title_node = $tree->look_down( _tag => 'title' );
    my $title      = 'No title';
    if ( defined $title_node ) {
        $title = $title_node->as_text;
    }

    # description: first paragraph, then title element, then the default
    my $paragraph_node = $tree->look_down( _tag => 'p' );
    my $description;
    if ( defined $paragraph_node ) {
        $description = $paragraph_node->as_text;
    }
    elsif ( defined $title_node ) {
        $description = $title;
    }
    else {
        $description = 'No description';
    }

    # release the parse tree explicitly to free its memory
    $tree->delete;

    return ( $title, $description );
}
Example of how to use the Perl program:
feed-builder.pl --dir web --domain johnbokma.com --title "John Bokma - freelance Perl Programmer" --desc "John Bokma, a freelance Perl programmer living in Mexico" --items 15 > web\index.rss
You probably have to install the XML::RSS module. If you are going to run this program locally with ActiveState Perl, just install XML::RSS as follows:
ppm install XML-RSS