#!/usr/local/bin/perl 

#-----------------------------------------------------------------------

=head1 NAME

Robot - a WWW traversal robot based on L<ModifiedWWW::Robot>

=head1 SYNOPSIS

Robot [-version -verbose restrict_1_url ... ] restrict_N_url url

=head1 DESCRIPTION

B<Robot> is a web robot that traverses the WWW, reports the pages and
links it finds, and records page titles in a
L<JumpingSpider::TitleTable>.  It is a modification of B<Poacher>, a
sample application of the ModifiedWWW::Robot module.

=head1 EXAMPLE USAGE

Run Robot on the ACM site, restricting it just to ACM. (It may be good
to put this in a Makefile, so we can just say 'make ACM' and it
will fire up on ACM.)

 perl Robot.pm \
    -verbose \
    -starturl http://www.acm.org/ \
    -spillfile '/nobu/curtis/ACMWORKFILE' \
    -maxPages 20000 \
    http://www.acm.org/ >> /nobu/curtis/acmscript

See also L<WWWGraph> and L<JumpingSpider::TitleTable>.

=cut

#-----------------------------------------------------------------------

use strict;
require 5.002;

use ModifiedWWW::Robot;
use Getopt::Long;
use IO::Pipe;
use English;

# We need the JumpingSpider TitleTable
use JumpingSpider;

# Per-page flag shared by the hook subroutines: reset to 1 by
# invoke_on_all_url(), overwritten with the result of
# JumpingSpider::TitleTable::addTitle() in process_contents(), and
# consulted by invoke_on_link() and follow_url_test(), which both
# return 0 while it is false.
my $goodPage = 1;

#-----------------------------------------------------------------------

=head1 OPTIONS

=over 4

=item -help

Display a short help message with a reminder of supported
command-line options.

=item -spillfile

File to use as a temporary workfile.

=item -version

Display the version of Robot.

=item -verbose

Enable verbose reporting as the Robot runs.

=item -robotverbose

Enable verbose reporting from ModifiedWWW::Robot as the Robot runs.

=back

=cut

#-----------------------------------------------------------------------

use vars qw($VERSION);
$VERSION = '0.005';

# Identity of the robot.  $BOTNAME and $EMAIL are handed to the robot
# constructor in Initialise(); $SHORTBOTNAME is not referenced by the
# code visible in this file.
my $BOTNAME      = 'Robot';
my $SHORTBOTNAME = 'LR';
my $EMAIL        = 'curtis@cs.jcu.edu.au';

# Flags filled in by ParseCommandLine().
my $SHOW_HELP    = 0;
my $SHOW_VERSION = 0;
my $VERBOSE      = 0;
my $ROBOTVERBOSE = 0;

# Settings passed through to the ModifiedWWW::Robot constructor.
my $TRAVERSAL = 'breadth';
my $MAXPAGES  = 1500;
my $STARTURL  = '';
my $SPILLFILE = 'WORKFILE';

# Run-time state shared between the subroutines below.
my $SiteRoot = '';     # optional argument for $Robot->run()
my $Robot;             # the robot object, created in Initialise()
my @Restricted;        # URL prefixes the robot is allowed to visit
my @Excluded;          # URL prefixes the robot must never follow

# Main sequence: read the options, build the robot, then start it.
# run() receives $SiteRoot only when one was supplied.
ParseCommandLine();
Initialise();
my @runArguments = $SiteRoot ? ($SiteRoot) : ();
$Robot->run(@runArguments);

#-----------------------------------------------------------------------
# Initialise() - build the traversal robot and register its callbacks
#
# Stores a new ModifiedWWW::Robot in the file-level $Robot, configured
# from the file-level settings, attaches this script's hook
# subroutines, and sets the robot's REQUEST_DELAY attribute to 0.
# Dies if the robot object cannot be created.
#-----------------------------------------------------------------------
sub Initialise
{
    # Kept as an ordered list (not a hash) so the constructor sees the
    # attributes in exactly this order.
    my @robotSettings = (
        'NAME'      => $BOTNAME,
        'VERSION'   => $VERSION,
        'EMAIL'     => $EMAIL,
        'TRAVERSAL' => $TRAVERSAL,
        'VERBOSE'   => $ROBOTVERBOSE,
        'SPILLFILE' => $SPILLFILE,
        'STARTURL'  => $STARTURL,
        'MAXPAGES'  => $MAXPAGES,
    );

    $Robot = ModifiedWWW::Robot->new(@robotSettings);
    unless (defined $Robot) {
        die "Failed to create robot, unable to continue.\n";
    }

    # Hook name => handler, registered in the order listed.
    my @hookTable = (
        [ 'follow-url-test',     \&follow_url_test   ],
        [ 'invoke-on-all-url',   \&invoke_on_all_url ],
        [ 'invoke-on-link',      \&invoke_on_link    ],
        [ 'invoke-on-contents',  \&process_contents  ],
        [ 'invoke-on-get-error', \&process_get_error ],
    );
    foreach my $entry (@hookTable) {
        my ($hookName, $handler) = @$entry;
        $Robot->addHook($hookName, $handler);
    }

    #$Robot->proxy(['http'], 'http://proxy.jcu.edu.au:8080/');

    $Robot->setAttribute('REQUEST_DELAY', 0);
}

#-----------------------------------------------------------------------
# invoke_on_all_url() - mark the page about to be handled as good
#
# Registered for the 'invoke-on-all-url' hook.  The arguments passed by
# the robot are not needed; the sub only sets the shared $goodPage flag
# back to 1 and returns that value.
#-----------------------------------------------------------------------
sub invoke_on_all_url {
  $goodPage = 1;
  return $goodPage;
}

#-----------------------------------------------------------------------
# invoke_on_link() - report a link found on a page
#
#   $robot      the robot object (unused here)
#   $hook_name  name of the hook being invoked (unused here)
#   $url        URL object of the page containing the link
#   $link       URL object of the link target
#   $text       text associated with the link
#
# Prints "LINK <page> <target> <text>" and returns 1 when the link is
# accepted.  Prints "RESTRICTED <page> <target> <text>" and returns 0
# when the target does not start with any prefix in @Restricted.
# Returns 0 silently when the current page is not good, when either
# URL is not http, or when the target has a non-HTML extension.
#-----------------------------------------------------------------------
sub invoke_on_link {
  my ($robot, $hook_name, $url, $link, $text) = @_;
  my $linkString = $link->as_string();

  # only want good guys
  return 0 unless $goodPage;

  # only link URLs
  return 0 if $url->scheme ne 'http';
  return 0 if $link->scheme ne 'http';

  # Check whether the link lies under one of the allowed prefixes.  The
  # prefix is quoted with \Q..\E so that characters such as '.', '?' and
  # '+' in a URL are compared literally rather than acting as regex
  # metacharacters (this matches how @Excluded is tested).
  my $insideAllowedArea = 0;
  foreach my $site (@Restricted) {
    if ($linkString =~ m!^\Q$site\E!) {
      $insideAllowedArea = 1;
      last;
      }
    }

  # We don't want it if it is outside every allowed prefix
  unless ($insideAllowedArea) {
    print "RESTRICTED " . $url->as_string() . " $linkString $text\n";
    return 0;
    }

  # only add HTML extensions or things that do not have a . in them
  # NOTE(review): the pattern is applied to the whole URL string, so a
  # URL with no path (e.g. "http://host.org") is seen as having the
  # extension "org" and is rejected - confirm this is intended.
  if ($linkString =~ m![^/.]+\.([^/.]+)$!) {
    if ($1 !~ /^s?[hH][tT][mM][lL]?$/) { return 0; }
    }

  print "LINK " . $url->as_string() . " $linkString $text\n";

  return 1;
}

#-----------------------------------------------------------------------
# follow_url_test() - tell the robot module whether it should follow a link
#
#   $robot      the robot object (unused here)
#   $hook_name  name of the hook being invoked (unused here)
#   $url        URL object of the candidate; must provide scheme() and
#               as_string()
#
#    return 1 if want to follow
#    return 0 if do not want to follow
#
# A URL is followed only when the current page is good, its scheme is
# http, it does not look like a non-HTML file, it is not under any
# prefix in @Excluded, and it starts with a prefix in @Restricted.
#-----------------------------------------------------------------------
sub follow_url_test {
  my ($robot, $hook_name, $url) = @_;
  my $urlString = $url->as_string();

  # only want good guys
  return 0 unless $goodPage;

  # only follow http links
  if ($url->scheme ne 'http') {
    #print "REJNOTHTTP " . $url->as_string() . "\n";
    return 0;
    }

  # Skip well-known non-HTML file types.  Matched against the string
  # form explicitly instead of relying on the object's stringification.
  return 0
        if $urlString =~ m!\.(gz|ps|gif|jpg|png|xbm|wav|mpg|zip|pdf|txt)[/]?$!;

  # if it is to a page that does not end in HTML, and only has word
  # chars (no periods), then ignore
  # NOTE(review): the pattern is applied to the whole URL string, so a
  # URL with no path (e.g. "http://host.org") is seen as having the
  # extension "org" and is rejected - confirm this is intended.
  if ($urlString =~ m![^/.]+\.([^/.]+)$!) {
    if ($1 !~ /^s?[hH][tT][mM][lL]?$/) {
      #print "REJNOHTML " . $url->as_string() . "\n";
      return 0;
      }
    }

  # Check whether URL is in an excluded area
  foreach my $site (@Excluded) {
    if ($urlString =~ /^\Q$site\E/) {
      #print "REJEXCLUDED " . $url->as_string() . "\n";
      return 0;
      }
    }

  # Check whether URL is within the restricted area.  The prefix is
  # quoted so that '.', '?', '+' etc. in it are matched literally, the
  # same way the excluded prefixes are handled above.
  foreach my $site (@Restricted) {
    if ($urlString =~ m!^\Q$site\E!) {
      return 1;
      }
    }

  # Not within a restricted area, don't follow it!
  #print "REJOUTSIDE " . $url->as_string() . "\n";
  return 0;
}

#------------------------------------------------------------------------
# process_get_error() - hook function invoked whenever a GET fails
#
#   $robot      the robot object (unused here)
#   $hook_name  name of the hook being invoked (unused here)
#   $url        the URL that could not be fetched
#   $response   response object; its code() value is reported
#
# Writes the URL and the response code to STDERR and returns the result
# of that print.
#------------------------------------------------------------------------
sub process_get_error {
  my ($robot, $hook_name, $url, $response) = @_;

  # Assemble the two-line report first, then emit it in one print.
  my $report = join('', "$url\n    error code ", $response->code, "\n");
  print STDERR $report;
}

#------------------------------------------------------------------------
# process_contents() - process the contents of a URL we've retrieved
#
#   $robot      the robot object (unused here)
#   $hook_name  name of the hook being invoked (unused here)
#   $url        URL object of the retrieved page
#   $response   response object; only its content_type is examined
#   $structure  hash reference whose 'links' entry is an array reference
#   $filename   unused here
#
# Non-HTML responses are skipped (returns 1).  Otherwise prints
# "PAGE <url> <number of links>", passes the page to the title table and
# stores the result in the shared $goodPage flag, which is also the
# return value.
#------------------------------------------------------------------------
sub process_contents {
  my ($robot, $hook_name, $url, $response, $structure, $filename) = @_;

  # ignore if it is not an HTML page
  unless ($response->content_type eq 'text/html') {
    return 1;
    }

  my $linkCount = scalar @{ $structure->{'links'} };
  print "PAGE " . $url->as_string() . " " . $linkCount . "\n";

  # add the title from this page to the title table; its result decides
  # whether the link hooks keep processing this page
  return $goodPage = JumpingSpider::TitleTable::addTitle($url->as_string(), $structure);
}

#------------------------------------------------------------------------
# ParseCommandLine() - read command line
#
# Fills the file-level option variables from the switches, handles
# -version and -help (both print and exit 0), and then sets:
#
#   $EMAIL       the -email value when given, otherwise $ENV{USER}
#   @Restricted  every argument left over after the switches
#   @Excluded    the whitespace-separated words of the -exclude value
#
# Dies when an unknown switch is given, or when neither -email nor the
# USER environment variable supplies a contact address.
#------------------------------------------------------------------------
sub ParseCommandLine {
  my $excluded;
  my $emailOption;      # kept separate so we can tell whether -email was given
  my @switches = (
    'email=s',      \$emailOption,
    'exclude=s',    \$excluded,
    'help',         \$SHOW_HELP,
    'verbose',      \$VERBOSE,
    'robotVerbose', \$ROBOTVERBOSE,
    'version',      \$SHOW_VERSION,
    'starturl=s',   \$STARTURL,
    'spillfile=s',  \$SPILLFILE,
    'siteroot=s',   \$SiteRoot,
    'maxPages=s',   \$MAXPAGES,
    );


  GetOptions(@switches) || die "use -help switch to display brief help\n";

  if ($SHOW_VERSION) {
    print "This is $BOTNAME, version $VERSION\n\n";
    exit 0;
    }

  if ($SHOW_HELP) {
    print <<EofHelp;

    $BOTNAME, v$VERSION - run the Robot

    Usage: $BOTNAME [options] restrict_1_url [restrict_2_url ... restrict_N_url]

        -help            : display this message
        -version         : display the version of $BOTNAME
        -verbose         : display verbose information as running
        -robotverbose    : display verbose information from the robot module
        -email ADDRESS   : contact address given to the robot
                           (default: the USER environment variable)
        -exclude "URLS"  : whitespace-separated URL prefixes never to follow
        -starturl URL    : starting URL
        -siteroot URL    : URL passed to the robot when it is started
        -spillfile FILE  : file to use as a temporary workfile
        -maxPages N      : page limit passed to the robot
        restrict_1_url   : the Robot can visit URLs with this prefix
          ....
        restrict_N_url   : the Robot can visit URLs with this prefix
EofHelp
    exit 0;
    }

  # An explicit -email wins; otherwise fall back to the user's login
  # name.  (Previously the -email value was always overwritten here.)
  if (defined $emailOption) {
    $EMAIL = $emailOption;
    }
  else {
    $EMAIL = $ENV{'USER'}
        || die "Please set your email address (use -email or set USER)\n";
    }

  #--------------------------------------------------------------------
  # Everything left on the command line at this point is a URL prefix
  # the robot is allowed to visit.
  #--------------------------------------------------------------------
  @Restricted = @ARGV;

  # split ' ' skips leading whitespace, so no empty prefix (which would
  # match - and therefore exclude - every URL) can end up in the list.
  @Excluded = defined $excluded ? split(' ', $excluded) : ();
}
