#!/usr/local/bin/perl -I../lib -w

#-----------------------------------------------------------------------

=head1 NAME

B<createInitialGraph> - Create the initial Jumping Spider graph from the results
file.  

=head1 SYNOPSIS

  createInitialGraph 
    [-help] | 
    [-version] | 
    [-verbose] 
    [-databaseName databaseName] 
    [-databaseMode GDBM | DBM | BSD] 
    resultsFile

=head1 DESCRIPTION

B<createInitialGraph> is a simple script to read the results of the 
robot search and create an initial WWWGraph based on that.  For
more on the WWWGraph see L<JumpingSpider::WWWGraph>.

=head1 EXAMPLE USAGE

Create the initial graph from the results of the ACM site run.
Run Robot on the ACM site, restricting it to just ACM.  (It may be good
to put this in a Makefile, so we can just say 'make ACM' and it
will fire up on ACM.)

 perl -I../lib createInitialGraph \
    -verbose \
    -databaseName databaseName \
    -databaseMode DBM \
    ../results/acmscript

See also L<JumpingSpider>.

=cut

#-----------------------------------------------------------------------

use strict;
require 5.002;

use Getopt::Long;
use IO::Pipe;
use English;
use JumpingSpider;

#-----------------------------------------------------------------------

=head1 OPTIONS

=over 4

=item -help

Display a short help message with a reminder of supported
command-line options.

=item -version

Display the version of createInitialGraph.

=item -verbose

Enable verbose reporting.

=item -databaseName databaseName

The name of the database; overrides the default name
in L<JumpingSpider::Constants>.

=item -databaseMode databaseMode

The mode of the database; overrides the default mode
in L<JumpingSpider::Constants>.

=back

=cut

#-----------------------------------------------------------------------

use vars qw($VERSION);

# $VERSION is the package variable declared by 'use vars' above.  It must
# not be re-declared with 'my': a lexical of the same name would shadow the
# package variable and leave $main::VERSION unset.
$VERSION          = '1.00';
my $SHOW_VERSION  = 0;                      # set by -version
my $VERBOSE       = 0;                      # set by -verbose
my $HELP          = 0;                      # set by -help
my $COMMAND_NAME  = 'createInitialGraph';   # prefix used in messages
my $InputFileName;                          # results file from the command line

#-----------------------------------------------------------------------
# Parse the command line
#-----------------------------------------------------------------------
ParseCommandLine();

print "$COMMAND_NAME: Initializing the WWW graph.\n" if $VERBOSE;
JumpingSpider::WWWGraph::init(JumpingSpider::Globals->new());

print "$COMMAND_NAME: Opening input file.\n" if $VERBOSE;
# The two-argument open and bareword handle are kept deliberately: this
# script declares 'require 5.002', which predates three-argument open.
open(INPUT, "< $InputFileName") ||
   die "$COMMAND_NAME: Could not open input file $InputFileName: $!\n";

print "$COMMAND_NAME: Processing input...\n" if $VERBOSE;
# Records of interest look like
#     ACTION  sourceURL  targetURL [anything else]
# Only LINK and RESTRICTED records contribute edges, and both are
# handled identically.
while (<INPUT>) {
  # Ignore lines that do not have the expected shape; without this check
  # the capture variables would be undefined and every such line would
  # produce "uninitialized value" warnings under -w.
  next unless /^(\w+)\s+(\S+)\s+(.*)$/;
  my ($action, $source, $rest) = ($1, $2, $3);
  next unless $action eq 'LINK' || $action eq 'RESTRICTED';

  # Only the first whitespace-delimited token after the source is the
  # target URL; anything following it is ignored.
  my ($target) = split(/\s+/, $rest);
  unless (defined $target) {
    # A record without a target would otherwise become an edge to "/".
    warn "$COMMAND_NAME: skipping $action record without a target " .
         "at line $.\n";
    next;
    }

  JumpingSpider::WWWGraph::addEdge(_addSlash($source), _addSlash($target));
  }

close INPUT;
print "$COMMAND_NAME: Handling unknowns...\n" if $VERBOSE;
JumpingSpider::WWWGraph::handleUnknowns();
JumpingSpider::WWWGraph::done();
print "$COMMAND_NAME: Clean exit.\n" if $VERBOSE;

#----------------------------------------------------------------------
# Add a slash to URLs that don't end in .html or something like that.
#----------------------------------------------------------------------
sub _addSlash {
  # Return the given URL with trailing whitespace removed and, unless it
  # already ends in '/' or its last path component contains an
  # .htm/.html/.shtm/.shtml extension (any letter case), with a '/'
  # appended.
  my ($url) = @_;

  # Trailing whitespace is never part of the URL.
  $url =~ s/\s+$//;

  # Already slash-terminated: nothing more to do.
  return $url if $url =~ m{/$};

  # Text after the last '/', or the whole string when there is no '/'.
  my $lastPart = substr($url, rindex($url, '/') + 1);

  # The extension test is intentionally unanchored, so something like
  # "page.html?x=1" is also left untouched.
  return $url if lc($lastPart) =~ /\.s?html?/;

  return $url . '/';
  }

#------------------------------------------------------------------------
# ParseCommandLine() - handle command line
#------------------------------------------------------------------------
sub ParseCommandLine {
  # Process @ARGV with Getopt::Long.
  #
  # Sets $HELP, $VERBOSE and $SHOW_VERSION, and stores -databaseMode /
  # -databaseName directly into the JumpingSpider::Constants variables.
  # -version and -help each print their message and exit 0.  The first
  # remaining argument is stored in $InputFileName.
  #
  # Dies on an unrecognised switch or when no results file is named.
  my @switches = (
    'databaseMode=s', \$JumpingSpider::Constants::databaseMode,
    'databaseName=s', \$JumpingSpider::Constants::databaseName,
    'help',           \$HELP,
    'verbose',        \$VERBOSE,
    'version',        \$SHOW_VERSION,
    );

  GetOptions(@switches) || die "use -help switch to display brief help\n";

  if ($SHOW_VERSION) {
    print "This is $COMMAND_NAME, version $VERSION\n";
    exit 0;
    }

  if ($HELP) {
    print <<HelpEnd;
    $COMMAND_NAME, v$VERSION - create the Jumping Spider WWW Graph from
                               Robot results

    Usage: $COMMAND_NAME
                         [-help] |
                         [-version] |
                         [-verbose]
                         [-databaseName databaseName]
                         [-databaseMode GDBM | DBM | BSD]
                         resultsFile

        -help            : display this message
        -version         : display the version and exit
        -verbose         : display verbose information as running
        -databaseName name : name of the database
        -databaseMode mode : mode for the database

HelpEnd
    exit 0;
    }

  # Test for definedness and length rather than truth, so that a results
  # file literally named "0" is accepted.
  $InputFileName = shift @ARGV;
  die "$COMMAND_NAME: Needs an input file name\n"
    unless defined $InputFileName && length $InputFileName;
  }
