#!/usr/local/bin/perl -I../lib -w
 
#-----------------------------------------------------------------------

=head1 NAME

B<createIndex> - Create the Jumping Spider index from the results
file.  This command creates the spider's index.

=head1 SYNOPSIS

  createIndex 
    [-help] |
    [-version] |
    [-verbose] 
    [-fastCreation ] 
    [-exclude excludeFileName] 
    [-databaseName databaseName] 
    [-databaseMode GDBM | DBM | BSD] 
    resultsFile

=head1 DESCRIPTION

B<createIndex> is a simple script to read the results of the robot search
and create an index that maps terms to URLs.

=head1 EXAMPLE USAGE

Create an index for the results of the ACM site run.
Run Robot on the ACM site, restricting it to ACM only. (It may be good
to put this in a Makefile, so we can just say 'make ACM' and it
will fire up on ACM.)

 perl -I../lib createIndex \
    -verbose \
    -fastCreation \
    -exclude excludedWords \
    -databaseName databaseName \
    -databaseMode DBM \
    ../results/acmscript

See also L<JumpingSpider>.

=cut

#-----------------------------------------------------------------------

use strict;
require 5.002;

use Getopt::Long;
use IO::Pipe;
use English;
use JumpingSpider;

#-----------------------------------------------------------------------

=head1 OPTIONS

=over 4

=item -help

Display a short help message with a reminder of supported
command-line options.

=item -version

Display the version of B<createIndex>.

=item -verbose

Enable verbose reporting.

=item -exclude fileName

Exclude the list of words in the indicated file from the index.

=item -fastCreation

Use an in-memory index during creation and then dump it to disk on
completion.

=item -databaseName databaseName

The name of the database; overrides the default name
in L<JumpingSpider::Constants>.

=item -databaseMode databaseMode

The mode of the database; overrides the default mode
in L<JumpingSpider::Constants>.

=back

=cut

#-----------------------------------------------------------------------

# $VERSION is a package global (declared with 'use vars'), so it must be
# assigned without 'my'; a lexical of the same name would shadow it and
# leave $main::VERSION unset.
use vars qw($VERSION);

$VERSION          = '1.00';
my $SHOW_VERSION  = 0;             # set by -version
my $VERBOSE       = 0;             # set by -verbose
my $HELP          = 0;             # set by -help
my $FAST_CREATION = 0;             # set by -fastCreation
my $EXCLUDE_FILE  = '';            # set by -exclude; '' means none given
my $COMMAND_NAME  = 'createIndex'; # prefix used in messages

#-----------------------------------------------------------------------
# This is a list of words to exclude from the index.  ParseCommandLine()
# adds the contents of the -exclude file to it.
#-----------------------------------------------------------------------
my %bad = (
            # Carriage return.  This was written "\M", which is not a Perl
            # escape: it warned under -w and produced the letter 'M'.
            # NOTE(review): assumes ^M was the intent -- confirm.
            "\cM" => 1,
            ''    => 1,
            ' '   => 1,
          );

# Results file to read; filled in by ParseCommandLine().
my $InputFileName;


#-----------------------------------------------------------------------
# Parse the command line
#-----------------------------------------------------------------------
ParseCommandLine();

#-----------------------------------------------------------------------
# In-memory temporary index, keyed by word; each value is the
# IdSetWithCount collected for that word.  It is filled while the input
# is read and written to the index table afterwards.  This is the
# -fastCreation mode, which keeps indexing fast for small indexes.
#-----------------------------------------------------------------------
my %indexHash = ();

# open the database so conversions can proceed
my $Globals = JumpingSpider::Globals->new();

#
# Grab the necessary table handler
#
print "$COMMAND_NAME: Opening database tables\n" if $VERBOSE;
my $indexTable = $Globals->{'indexTable'};

print "$COMMAND_NAME: Opening input file.\n" if $VERBOSE;
# Report $! as well, so the user can see why the open failed.
open(INPUT, "< $InputFileName") ||
   die "Could not open input file: $InputFileName: $!";

print "$COMMAND_NAME: Processing input...\n" if $VERBOSE;
# Process the input file looking for certain kinds of information.
# Every line of interest has the shape
#     ACTION  URL  REST-OF-LINE
# where ACTION selects how REST-OF-LINE is interpreted.
while (<INPUT>) {
  # Skip lines that do not have that shape.  The capture variables are
  # only updated by a successful match, so using them after a failed one
  # could re-process data left over from an earlier line.
  next unless /^(\w+)\s+([^\s]+)\s+(.*)$/;
  my ($action, $first, $second) = ($1, $2, $3);

  if ($action eq 'TITLE') {
    # The text after the URL is indexed under the Id of that URL.
    $first = _addSlash($first);
    $second =~ s/\s+$//;
    _addToIndex($second, Id::fromString($first));
    }
  elsif ($action eq 'LINK' || $action eq 'RESTRICTED') {
    # Both actions are handled identically: the remainder holds a second
    # URL followed by text, and the text is indexed under the Id of that
    # second URL (presumably the link target and its anchor text --
    # confirm against the Robot output format).  The first URL on the
    # line is not used for indexing.
    #
    # A line with no text after the second URL has nothing to index, so
    # it is skipped rather than picking up stale pre/post-match values.
    next unless $second =~ /^(\S+)\s+(.*)$/;
    my ($target, $third) = ($1, $2);
    $third =~ s/\s+$//;
    $target = _addSlash($target);
    _addToIndex($third, Id::fromString($target));
    }
  #elsif ($action eq 'HEADER') {
  #  $first = &_addSlash($first);
  #  $second =~ s/\s+$//;
  #  _addToIndex($second, Id::fromString($first));
  #  }
  }

close INPUT;

#
# Flush the in-memory index into the index table
#
print "$COMMAND_NAME: Dumping index to disk.\n" if $VERBOSE;
my ($key);
while (defined($key = each %indexHash)) {
  my $wordKey = StringCol->new($key);
  my $idSet   = $indexHash{$key};
  #print "Indexing " . $wordKey->image() . $idSet->image() . "\n";

  # If the table already has a tuple for this word, fold its id set into
  # ours before writing the combined set back.
  my $stored = $indexTable->retrieveTuple($wordKey);
  $idSet->unionSelf($stored->getValueAsIdSetWithCount()) if $stored;

  $indexTable->insertTuple(Tuple->new($wordKey, $idSet));
  }

#
# Release the database and finish
#
$Globals->close();
print "$COMMAND_NAME: Cleaned up and exiting.\n" if $VERBOSE;

#----------------------------------------------------------------------
#
# _addToIndex($words, $id)
#
# Record $id against every indexable word of the text $words in the
# in-memory index %indexHash.  The text is lower-cased (A-Z only) and
# split on runs of non-word characters; pieces listed in %bad are
# ignored.
#
#----------------------------------------------------------------------
sub _addToIndex {
  my ($words, $id) = @_;

  $words =~ tr/A-Z/a-z/;

  my ($term);
  foreach $term (split(/\W+/, $words)) {
    # Excluded words (including the empty piece that split produces when
    # the text starts with a separator) are not indexed.
    next if defined $bad{$term};

    # Fetch the id set for this word, creating it on first sight.
    my $idSet = $indexHash{$term};
    if (!defined $idSet) {
      $idSet = IdSetWithCount->new();
      $indexHash{$term} = $idSet;
      }

    $idSet->insert($id);
    }
}

#----------------------------------------------------------------------
# _addSlash($url)
#
# Return $url with trailing whitespace removed and a '/' appended,
# unless it already ends in '/' or its last path component contains an
# .htm/.html/.shtm/.shtml extension (in any letter case).
#----------------------------------------------------------------------
sub _addSlash {
  my ($url) = @_;

  $url =~ s/\s+$//;

  # Nothing to do when the slash is already there.
  return $url if $url =~ /\/$/;

  # Everything after the last '/' (the whole string if there is none).
  my ($leaf) = ($url =~ /([^\/]*)$/);

  # HTML documents are left as they are.
  return $url if $leaf =~ /\.[sS]?[hH][tT][mM][lL]?/;

  return $url . '/';
}

#------------------------------------------------------------------------
# ParseCommandLine() - handle command line
#
# Reads the switches from @ARGV with Getopt::Long into the file-level
# option variables (-databaseMode and -databaseName are stored directly
# in the JumpingSpider::Constants package variables).  Handles -version
# and -help by printing and exiting with status 0.  Loads the -exclude
# file, if any, into %bad, then takes the next remaining argument as
# $InputFileName.
#
# Dies on an unknown switch, an unreadable exclude file, a missing input
# file name, or when -fastCreation was not given.
#------------------------------------------------------------------------
sub ParseCommandLine {
  my @switches = (
    'databaseMode=s', \$JumpingSpider::Constants::databaseMode,
    'databaseName=s', \$JumpingSpider::Constants::databaseName,
    'exclude=s',      \$EXCLUDE_FILE,
    'help',           \$HELP,
    'verbose',        \$VERBOSE,
    'version',        \$SHOW_VERSION,
    'fastCreation',   \$FAST_CREATION,
    );

  GetOptions(@switches) || die "use -help switch to display brief help\n";

  if ($SHOW_VERSION) {
    print "This is $COMMAND_NAME, version $VERSION\n";
    exit 0;
    }

  if ($HELP) {
    print <<HelpEnd;
    $COMMAND_NAME, v$VERSION - create the Jumping Spider index from the
                               Robot results

    Usage: $COMMAND_NAME 
                         [-help] |
                         [-version] |
                         [-verbose]
                         [-fastCreation ]
                         [-exclude excludeFileName]
                         [-databaseName databaseName]
                         [-databaseMode GDBM | DBM | BSD]
                         resultsFile

        -help            : display this message
        -version         : display the version of this command
        -verbose         : display verbose information as running
        -fastCreation    : use an in-memory index during creation
        -exclude file    : exclude this list of words from the index
        -databaseName name : name of the database
        -databaseMode mode : mode for the database 

HelpEnd
    exit 0;
    }

  if ($EXCLUDE_FILE ne '') {
    # read the excluded file
    open(EXCLUDEFILE, "< $EXCLUDE_FILE") ||
      die "No exclude file: $EXCLUDE_FILE: $!\n";
    my ($line, $word);
    while (defined($line = <EXCLUDEFILE>)) {
      # _addToIndex() looks words up after lower-casing them, so the
      # exclusions must be stored lower-cased too.
      $line =~ tr/A-Z/a-z/;
      # Splitting on whitespace strips the line terminator; a key that
      # still carried its newline could never match an indexed word.
      foreach $word (split(' ', $line)) { $bad{$word} = 1; }
      }
    close(EXCLUDEFILE);
    }

  # Test for presence, not truth, so a file named "0" is accepted.
  die "$COMMAND_NAME: Needs an input file name " unless @ARGV;
  $InputFileName = shift @ARGV;

  die "Non-fastCreation currently unsupported\n" unless $FAST_CREATION;

}
