# devel/benchmarks/indexers/BenchmarkingIndexer.pm - lucy - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 use strict;
 use warnings;

 package BenchmarkingIndexer;

 use Carp;
 use Config;
 use File::Spec::Functions qw( catfile catdir );
 use POSIX qw( uname );

 # Constructor.  May be invoked on a class name or on an existing object
 # (in which case the object's class is reused).  Any key/value pairs passed
 # in override the defaults listed below.
 sub new {
     my ( $invocant, @overrides ) = @_;
     my $class = ref($invocant) ? ref($invocant) : $invocant;

     # Defaults come first so that caller-supplied pairs win.
     my %attributes = (
         docs              => undef,
         increment         => undef,
         store             => undef,
         engine            => undef,
         version           => undef,
         index_dir         => undef,
         corpus_dir        => 'extracted_corpus',
         article_filepaths => undef,
         @overrides,
     );
     return bless \%attributes, $class;
 }

 # Placeholder that always raises (with a stack trace via confess); a
 # subclass is expected to supply the real implementation.
 sub init_indexer {
     confess("abstract method");
 }
 # Placeholder that always raises (with a stack trace via confess); a
 # subclass is expected to supply the real implementation.
 sub build_index {
     confess("abstract method");
 }

 # Finish initialization that needs the corpus on disk: build the list of
 # article files, then fill in 'docs' and 'increment' if the caller did not
 # supply them.
 sub delayed_init {
     my ($self) = @_;

     # Scan the corpus and remember the resulting list on the object.
     my $paths = $self->build_file_list;
     $self->{article_filepaths} = $paths;

     # Default 'docs' to the number of article files found.
     if ( !defined $self->{docs} ) {
         $self->{docs} = scalar @$paths;
     }

     # Default 'increment' to one more than 'docs'.
     defined( $self->{increment} )
         or $self->{increment} = $self->{docs} + 1;
 }

 # Return a reference to a lexically sorted list of all article files from
 # all subdirs of $self->{corpus_dir}.
 #
 # Only entries of the corpus dir whose name matches /articles/ are scanned,
 # and within each of those only files named article<digits>.txt are kept.
 # The resulting array ref is also stored in $self->{article_filepaths}.
 # Raises (via confess) if any directory cannot be opened.
 sub build_file_list {
     my $self       = shift;
     my $corpus_dir = $self->{corpus_dir};

     # Lexical dirhandles, closed as soon as they have been read, instead of
     # package-global barewords that stayed open after the call.
     opendir( my $corpus_dh, $corpus_dir )
         or confess "Can't opendir '$corpus_dir': $!";
     my @article_dir_names = grep {/articles/} readdir $corpus_dh;
     closedir $corpus_dh;

     my @article_filepaths;
     for my $article_dir_name (@article_dir_names) {
         my $article_dir = catdir( $corpus_dir, $article_dir_name );
         opendir( my $article_dh, $article_dir )
             or confess "Can't opendir '$article_dir': $!";
         push @article_filepaths, map { catfile( $article_dir, $_ ) }
             grep {m/^article\d+\.txt$/} readdir $article_dh;
         closedir $article_dh;
     }

     # String sort: "article10.txt" sorts before "article2.txt".
     @article_filepaths = sort @article_filepaths;
     $self->{article_filepaths} = \@article_filepaths;
     return $self->{article_filepaths};
 }

 # Print a one-line summary for a single run.  Expects the named arguments
 # 'rep', 'secs' and 'count' after the invocant.
 sub print_interim_report {
     my $self   = shift;
     my %fields = @_;
     my ( $rep, $secs, $count ) = @fields{ 'rep', 'secs', 'count' };
     printf "%-3d  Secs: %.3f  Docs: %-4d\n", $rep, $secs, $count;
 }

 # Begin the report output with a 60-character horizontal rule.
 sub start_report {
     my $rule = '-' x 60;
     print $rule . "\n";
 }

 # Print out aggregate stats for a series of runs.
 #
 # $times is a reference to an array of per-run elapsed seconds.  Prints the
 # engine name/version taken from $self, the Perl version, thread support,
 # selected uname fields, the mean, and a truncated mean that ignores the
 # fastest and slowest quarter of the runs.  Raises (via confess) when no
 # timings are supplied rather than failing on a division by zero.
 sub print_final_report {
     my ( $self, $times ) = @_;
     confess "No timings to report" unless $times and @$times;

     # Sort numerically; the default string sort would put 10.2 before 9.8
     # and so discard the wrong runs from the truncated mean.
     my @sorted_times = sort { $a <=> $b } @$times;
     my $num_to_chop  = scalar(@sorted_times) >> 2;

     # Plain mean over every run.
     my $mean = 0;
     $mean += $_ for @sorted_times;
     $mean /= @sorted_times;

     # Discard fastest 25% and slowest 25% of runs.
     my @kept = @sorted_times[ $num_to_chop .. $#sorted_times - $num_to_chop ];
     my $num_kept      = @kept;
     my $num_discarded = @sorted_times - $num_kept;
     my $trunc_mean    = 0;
     $trunc_mean += $_ for @kept;
     $trunc_mean /= $num_kept;

     $mean       = sprintf( "%.3f", $mean );
     $trunc_mean = sprintf( "%.3f", $trunc_mean );

     # Get some info about the system.
     my $thread_support = $Config{usethreads} ? "yes" : "no";
     my @uname_info     = ( uname() )[ 0, 2, 4 ];

     # Built from a list rather than a heredoc so the output does not depend
     # on how this file happens to be indented.
     my $rule   = '-' x 60;
     my @report = (
         $rule,
         "$self->{engine} $self->{version}",
         "Perl $Config{version}",
         "Thread support: $thread_support",
         "@uname_info",
         "Mean: $mean secs",
         "Truncated mean ($num_kept kept, $num_discarded discarded): $trunc_mean secs",
         $rule,
     );
     print join( "\n", @report, '' );
 }

 package BenchSchema::WhiteSpaceTokenizer;
 use base qw( Lucy::Analysis::RegexTokenizer );

 # Construct the parent tokenizer with a fixed pattern, '\S+' (runs of
 # non-whitespace characters).  Arguments after the invocant are ignored.
 sub new {
     my $class = shift;
     return $class->SUPER::new( pattern => '\S+' );
 }

 package BenchSchema;
 use base qw( Lucy::Plan::Schema );
 use Lucy::Analysis::RegexTokenizer;

 # Create a schema via the parent constructor and register a single
 # full-text field, 'title', analyzed with the whitespace tokenizer.
 sub new {
     my $class      = shift;
     my $schema     = $class->SUPER::new;
     my $tokenizer  = BenchSchema::WhiteSpaceTokenizer->new;
     my $title_type = Lucy::Plan::FullTextType->new( analyzer => $tokenizer );
     $schema->spec_field( name => 'title', type => $title_type );
     return $schema;
 }

 package BenchmarkingIndexer::Lucy;
 use base qw( BenchmarkingIndexer );

 use Time::HiRes qw( gettimeofday );

 # Construct a Lucy-backed benchmarking indexer.  Arguments are forwarded to
 # the parent constructor; afterwards the schema, index directory, engine
 # name and engine version are filled in (index_dir, engine and version are
 # always overwritten here).
 sub new {
     my ( $class, @args ) = @_;
     my $self = $class->SUPER::new(@args);

     require Lucy;
     require Lucy::Index::Indexer;

     # The 'body' field is added at runtime so that the 'store' option can
     # decide whether it is stored and highlightable.
     my $store_flag = $self->{store} ? 1 : 0;
     my $schema     = BenchSchema->new;
     $self->{schema} = $schema;
     my $body_type = Lucy::Plan::FullTextType->new(
         analyzer      => BenchSchema::WhiteSpaceTokenizer->new,
         highlightable => $store_flag,
         stored        => $store_flag,
     );
     $schema->spec_field( name => 'body', type => $body_type );

     @{$self}{qw( index_dir engine version )}
         = ( 'lucy_index', 'Lucy', $Lucy::VERSION );

     return $self;
 }

 # Create a Lucy::Index::Indexer for this object's schema and index
 # directory.  $count is the number of docs processed so far: 'truncate' is
 # passed as 1 only when $count is 0, and as 0 otherwise.
 sub init_indexer {
     my $self  = shift;
     my $count = shift;

     my @indexer_args = (
         schema   => $self->{schema},
         index    => $self->{index_dir},
         truncate => ( $count == 0 ? 1 : 0 ),
         create   => 1,
     );
     return Lucy::Index::Indexer->new(@indexer_args);
 }

 # Build an index, stopping once $self->{docs} documents have been added.
 #
 # The article list is walked repeatedly until the requested number of docs
 # is reached, so files are re-indexed when 'docs' exceeds the corpus size.
 # After every $self->{increment} docs the current indexer is committed and
 # a fresh one is opened.  Returns ( $count, $secs ): the number of docs
 # added and the elapsed wall-clock seconds.  Dies if docs are requested but
 # there are no article files, or if an article file cannot be opened.
 sub build_index {
     my $self = shift;
     $self->delayed_init;
     my ( $max, $increment, $article_filepaths )
         = @{$self}{qw( docs increment article_filepaths )};

     # With an empty file list the inner loop never advances $count, so the
     # outer loop would never terminate.  Fail loudly instead.
     die "No article files available to index"
         if $max > 0 and !@$article_filepaths;

     # Start timer.
     my $start = gettimeofday();

     my $indexer = $self->init_indexer(0);

     my $count = 0;
     while ( $count < $max ) {
         for my $article_filepath (@$article_filepaths) {
             # The title is the first line, the body is the rest.
             open( my $article_fh, '<', $article_filepath )
                 or die "Can't open file '$article_filepath': $!";

             my %doc;
             $doc{title} = <$article_fh>;
             $doc{body} = do { local $/; <$article_fh> };
             close $article_fh;

             $indexer->add_doc( \%doc );

             # Bail if we've reached spec'd number of docs.
             $count++;
             last if $count >= $max;

             # Commit and reopen after each full increment.  $count is at
             # least 1 here, so no separate non-zero check is needed.
             if ( $count % $increment == 0 ) {
                 $indexer->commit;
                 undef $indexer;
                 $indexer = $self->init_indexer($count);
             }
         }
     }

     # Finish index.
     $indexer->optimize;
     $indexer->commit;

     # Return elapsed seconds.
     my $end  = gettimeofday();
     my $secs = $end - $start;
     return ( $count, $secs );
 }

 1;
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	use strict;
	use warnings;

	package BenchmarkingIndexer;

	use Carp;
	use Config;
	use File::Spec::Functions qw( catfile catdir );
	use POSIX qw( uname );

	# Constructor.  May be invoked on a class name or on an existing object
	# (in which case the object's class is reused).  Any key/value pairs
	# passed in override the defaults listed below.
	sub new {
	    my $either = shift;

	    # Logical-or fallback; the operator had been mangled into '\|\|',
	    # which is not valid Perl.
	    my $class = ref($either) || $either;
	    return bless {
	        docs              => undef,
	        increment         => undef,
	        store             => undef,
	        engine            => undef,
	        version           => undef,
	        index_dir         => undef,
	        corpus_dir        => 'extracted_corpus',
	        article_filepaths => undef,
	        @_,
	    }, $class;
	}

	# Placeholder that always raises (with a stack trace via confess); a
	# subclass is expected to supply the real implementation.
	sub init_indexer {
	    confess("abstract method");
	}
	# Placeholder that always raises (with a stack trace via confess); a
	# subclass is expected to supply the real implementation.
	sub build_index {
	    confess("abstract method");
	}

	# Finish initialization that needs the corpus on disk: build the list of
	# article files, then fill in 'docs' and 'increment' if the caller did
	# not supply them.
	sub delayed_init {
	    my ($self) = @_;

	    # Scan the corpus and remember the resulting list on the object.
	    my $paths = $self->build_file_list;
	    $self->{article_filepaths} = $paths;

	    # Default 'docs' to the number of article files found.
	    if ( !defined $self->{docs} ) {
	        $self->{docs} = scalar @$paths;
	    }

	    # Default 'increment' to one more than 'docs'.
	    defined( $self->{increment} )
	        or $self->{increment} = $self->{docs} + 1;
	}

	# Return a reference to a lexically sorted list of all article files from
	# all subdirs of $self->{corpus_dir}.
	#
	# Only entries of the corpus dir whose name matches /articles/ are
	# scanned, and within each of those only files named article<digits>.txt
	# are kept.  The resulting array ref is also stored in
	# $self->{article_filepaths}.  Raises (via confess) if any directory
	# cannot be opened.
	sub build_file_list {
	    my $self       = shift;
	    my $corpus_dir = $self->{corpus_dir};

	    # Lexical dirhandles, closed as soon as they have been read, instead
	    # of package-global barewords that stayed open after the call.
	    opendir( my $corpus_dh, $corpus_dir )
	        or confess "Can't opendir '$corpus_dir': $!";
	    my @article_dir_names = grep {/articles/} readdir $corpus_dh;
	    closedir $corpus_dh;

	    my @article_filepaths;
	    for my $article_dir_name (@article_dir_names) {
	        my $article_dir = catdir( $corpus_dir, $article_dir_name );
	        opendir( my $article_dh, $article_dir )
	            or confess "Can't opendir '$article_dir': $!";
	        push @article_filepaths, map { catfile( $article_dir, $_ ) }
	            grep {m/^article\d+\.txt$/} readdir $article_dh;
	        closedir $article_dh;
	    }

	    # String sort: "article10.txt" sorts before "article2.txt".
	    @article_filepaths = sort @article_filepaths;
	    $self->{article_filepaths} = \@article_filepaths;
	    return $self->{article_filepaths};
	}

	# Print a one-line summary for a single run.  Expects the named arguments
	# 'rep', 'secs' and 'count' after the invocant.
	sub print_interim_report {
	    my $self   = shift;
	    my %fields = @_;
	    my ( $rep, $secs, $count ) = @fields{ 'rep', 'secs', 'count' };
	    printf "%-3d Secs: %.3f Docs: %-4d\n", $rep, $secs, $count;
	}

	# Begin the report output with a 60-character horizontal rule.
	sub start_report {
	    my $rule = '-' x 60;
	    print $rule . "\n";
	}

	# Print out aggregate stats for a series of runs.
	#
	# $times is a reference to an array of per-run elapsed seconds.  Prints
	# the engine name/version taken from $self, the Perl version, thread
	# support, selected uname fields, the mean, and a truncated mean that
	# ignores the fastest and slowest quarter of the runs.  Raises (via
	# confess) when no timings are supplied rather than failing on a division
	# by zero.
	sub print_final_report {
	    my ( $self, $times ) = @_;
	    confess "No timings to report" unless $times and @$times;

	    # Sort numerically; the default string sort would put 10.2 before 9.8
	    # and so discard the wrong runs from the truncated mean.
	    my @sorted_times = sort { $a <=> $b } @$times;
	    my $num_to_chop  = scalar(@sorted_times) >> 2;

	    # Plain mean over every run.
	    my $mean = 0;
	    $mean += $_ for @sorted_times;
	    $mean /= @sorted_times;

	    # Discard fastest 25% and slowest 25% of runs.
	    my @kept
	        = @sorted_times[ $num_to_chop .. $#sorted_times - $num_to_chop ];
	    my $num_kept      = @kept;
	    my $num_discarded = @sorted_times - $num_kept;
	    my $trunc_mean    = 0;
	    $trunc_mean += $_ for @kept;
	    $trunc_mean /= $num_kept;

	    $mean       = sprintf( "%.3f", $mean );
	    $trunc_mean = sprintf( "%.3f", $trunc_mean );

	    # Get some info about the system.
	    my $thread_support = $Config{usethreads} ? "yes" : "no";
	    my @uname_info     = ( uname() )[ 0, 2, 4 ];

	    # Built from a list rather than a heredoc so the output does not
	    # depend on how this file happens to be indented.
	    my $rule   = '-' x 60;
	    my @report = (
	        $rule,
	        "$self->{engine} $self->{version}",
	        "Perl $Config{version}",
	        "Thread support: $thread_support",
	        "@uname_info",
	        "Mean: $mean secs",
	        "Truncated mean ($num_kept kept, $num_discarded discarded): $trunc_mean secs",
	        $rule,
	    );
	    print join( "\n", @report, '' );
	}

	package BenchSchema::WhiteSpaceTokenizer;
	use base qw( Lucy::Analysis::RegexTokenizer );

	# Construct the parent tokenizer with a fixed pattern, '\S+' (runs of
	# non-whitespace characters).  Arguments after the invocant are ignored.
	sub new {
	    my $class = shift;
	    return $class->SUPER::new( pattern => '\S+' );
	}

	package BenchSchema;
	use base qw( Lucy::Plan::Schema );
	use Lucy::Analysis::RegexTokenizer;

	# Create a schema via the parent constructor and register a single
	# full-text field, 'title', analyzed with the whitespace tokenizer.
	sub new {
	    my $class      = shift;
	    my $schema     = $class->SUPER::new;
	    my $tokenizer  = BenchSchema::WhiteSpaceTokenizer->new;
	    my $title_type = Lucy::Plan::FullTextType->new( analyzer => $tokenizer );
	    $schema->spec_field( name => 'title', type => $title_type );
	    return $schema;
	}

	package BenchmarkingIndexer::Lucy;
	use base qw( BenchmarkingIndexer );

	use Time::HiRes qw( gettimeofday );

	# Construct a Lucy-backed benchmarking indexer.  Arguments are forwarded
	# to the parent constructor; afterwards the schema, index directory,
	# engine name and engine version are filled in (index_dir, engine and
	# version are always overwritten here).
	sub new {
	    my ( $class, @args ) = @_;
	    my $self = $class->SUPER::new(@args);

	    require Lucy;
	    require Lucy::Index::Indexer;

	    # The 'body' field is added at runtime so that the 'store' option can
	    # decide whether it is stored and highlightable.
	    my $store_flag = $self->{store} ? 1 : 0;
	    my $schema     = BenchSchema->new;
	    $self->{schema} = $schema;
	    my $body_type = Lucy::Plan::FullTextType->new(
	        analyzer      => BenchSchema::WhiteSpaceTokenizer->new,
	        highlightable => $store_flag,
	        stored        => $store_flag,
	    );
	    $schema->spec_field( name => 'body', type => $body_type );

	    @{$self}{qw( index_dir engine version )}
	        = ( 'lucy_index', 'Lucy', $Lucy::VERSION );

	    return $self;
	}

	# Create a Lucy::Index::Indexer for this object's schema and index
	# directory.  $count is the number of docs processed so far: 'truncate'
	# is passed as 1 only when $count is 0, and as 0 otherwise.
	sub init_indexer {
	    my $self  = shift;
	    my $count = shift;

	    my @indexer_args = (
	        schema   => $self->{schema},
	        index    => $self->{index_dir},
	        truncate => ( $count == 0 ? 1 : 0 ),
	        create   => 1,
	    );
	    return Lucy::Index::Indexer->new(@indexer_args);
	}

	# Build an index, stopping once $self->{docs} documents have been added.
	#
	# The article list is walked repeatedly until the requested number of
	# docs is reached, so files are re-indexed when 'docs' exceeds the corpus
	# size.  After every $self->{increment} docs the current indexer is
	# committed and a fresh one is opened.  Returns ( $count, $secs ): the
	# number of docs added and the elapsed wall-clock seconds.  Dies if docs
	# are requested but there are no article files, or if an article file
	# cannot be opened.
	sub build_index {
	    my $self = shift;
	    $self->delayed_init;
	    my ( $max, $increment, $article_filepaths )
	        = @{$self}{qw( docs increment article_filepaths )};

	    # With an empty file list the inner loop never advances $count, so
	    # the outer loop would never terminate.  Fail loudly instead.
	    die "No article files available to index"
	        if $max > 0 and !@$article_filepaths;

	    # Start timer.
	    my $start = gettimeofday();

	    my $indexer = $self->init_indexer(0);

	    my $count = 0;
	    while ( $count < $max ) {
	        for my $article_filepath (@$article_filepaths) {
	            # The title is the first line, the body is the rest.
	            open( my $article_fh, '<', $article_filepath )
	                or die "Can't open file '$article_filepath': $!";

	            my %doc;
	            $doc{title} = <$article_fh>;
	            $doc{body} = do { local $/; <$article_fh> };
	            close $article_fh;

	            $indexer->add_doc( \%doc );

	            # Bail if we've reached spec'd number of docs.
	            $count++;
	            last if $count >= $max;

	            # Commit and reopen after each full increment.  $count is at
	            # least 1 here, so no separate non-zero check is needed.
	            if ( $count % $increment == 0 ) {
	                $indexer->commit;
	                undef $indexer;
	                $indexer = $self->init_indexer($count);
	            }
	        }
	    }

	    # Finish index.
	    $indexer->optimize;
	    $indexer->commit;

	    # Return elapsed seconds.
	    my $end  = gettimeofday();
	    my $secs = $end - $start;
	    return ( $count, $secs );
	}

	1;