blob: 53522b37bb5de03a0b401dab7cf2a5475830695b [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use strict;
use warnings;
package BenchmarkingIndexer;
use Carp;
use Config;
use File::Spec::Functions qw( catfile catdir );
use POSIX qw( uname );
# Constructor.  May be invoked on a class name or on an existing object, in
# which case the new object is blessed into that object's class.  Key/value
# pairs supplied by the caller are applied on top of the defaults listed
# below, so they win over any default with the same key.
sub new {
    my ( $invocant, @overrides ) = @_;
    my $class = ref($invocant) || $invocant;

    my $self = {
        docs              => undef,
        increment         => undef,
        store             => undef,
        engine            => undef,
        version           => undef,
        index_dir         => undef,
        corpus_dir        => 'extracted_corpus',
        article_filepaths => undef,
        @overrides,
    };
    return bless $self, $class;
}
# Abstract method: this base implementation only raises an error (with a
# stack trace), so concrete subclasses are expected to override it.
sub init_indexer {
    confess("abstract method");
}
# Abstract method: this base implementation only raises an error (with a
# stack trace), so concrete subclasses are expected to override it.
sub build_index {
    confess("abstract method");
}
# Fill in the settings that depend on the corpus contents.  The article file
# list is always rebuilt; `docs` and `increment` are only given values when
# the caller has not already supplied them:
#   docs      defaults to the number of article files found
#   increment defaults to docs + 1
sub delayed_init {
    my $self = shift;

    my $filepaths = $self->build_file_list;
    $self->{article_filepaths} = $filepaths;

    defined $self->{docs}
        or $self->{docs} = scalar @$filepaths;
    defined $self->{increment}
        or $self->{increment} = $self->{docs} + 1;
}
# Return a lexically sorted list of all article files from all subdirs.
#
# Scans $self->{corpus_dir} for entries whose name contains "articles", then
# collects every file named articleNNN.txt from each of them.  The resulting
# array reference is also stored in $self->{article_filepaths}.
#
# Confesses if the corpus directory or any matching subdirectory cannot be
# opened.
sub build_file_list {
    my $self       = shift;
    my $corpus_dir = $self->{corpus_dir};
    my @article_filepaths;

    # Lexical directory handles keep this sub from trampling package-global
    # barewords, and each handle is closed as soon as it has been read.
    opendir( my $corpus_dh, $corpus_dir )
        or confess "Can't opendir '$corpus_dir': $!";
    my @article_dir_names = grep {/articles/} readdir $corpus_dh;
    closedir $corpus_dh;

    for my $article_dir_name (@article_dir_names) {
        my $article_dir = catdir( $corpus_dir, $article_dir_name );
        opendir( my $article_dh, $article_dir )
            or confess "Can't opendir '$article_dir': $!";
        push @article_filepaths, map { catfile( $article_dir, $_ ) }
            grep {m/^article\d+\.txt$/} readdir $article_dh;
        closedir $article_dh;
    }

    @article_filepaths = sort @article_filepaths;
    return $self->{article_filepaths} = \@article_filepaths;
}
# Print a one-line summary of a single run.  Expects the named arguments
# `rep`, `secs` and `count`, which fill the repetition, seconds and document
# columns of the line respectively.
sub print_interim_report {
    my ( $self, %args ) = @_;
    my ( $rep, $secs, $count ) = @args{ 'rep', 'secs', 'count' };
    printf "%-3d Secs: %.3f Docs: %-4d\n", $rep, $secs, $count;
}
# Open the report by printing a horizontal rule of 60 dashes.
sub start_report {
    my $rule = '-' x 60;
    print "$rule\n";
}
# Print out aggregate stats.
#
# Takes a reference to an array of per-run timings and prints the plain mean,
# a truncated mean, the engine name and version held by the object, and some
# details about the perl and the host it is running on.
#
# Confesses if no timings are supplied, since neither mean is defined then.
sub print_final_report {
    my ( $self, $times ) = @_;
    confess "No timings to report" unless $times and @$times;

    # Timings are numbers, so they must be ordered numerically.  A default
    # (string) sort would place 10.2 ahead of 9.7 and the truncated mean
    # would then throw away the wrong runs.
    my @sorted_times = sort { $a <=> $b } @$times;

    # Discard fastest 25% and slowest 25% of runs.
    my $num_to_chop = @sorted_times >> 2;
    my @kept_times
        = @sorted_times[ $num_to_chop .. $#sorted_times - $num_to_chop ];
    my $num_kept      = @kept_times;
    my $num_discarded = @sorted_times - $num_kept;

    # Produce mean and truncated mean.
    my $mean = 0;
    $mean += $_ for @sorted_times;
    $mean /= @sorted_times;
    my $trunc_mean = 0;
    $trunc_mean += $_ for @kept_times;
    $trunc_mean /= $num_kept;

    $mean       = sprintf( "%.3f", $mean );
    $trunc_mean = sprintf( "%.3f", $trunc_mean );

    # Get some info about the system.
    my $thread_support = $Config{usethreads} ? "yes" : "no";
    my @uname_info     = ( uname() )[ 0, 2, 4 ];

    print <<END_REPORT;
------------------------------------------------------------
$self->{engine} $self->{version}
Perl $Config{version}
Thread support: $thread_support
@uname_info
Mean: $mean secs
Truncated mean ($num_kept kept, $num_discarded discarded): $trunc_mean secs
------------------------------------------------------------
END_REPORT
}
package BenchSchema::WhiteSpaceTokenizer;
use base qw( Lucy::Analysis::RegexTokenizer );
# Construct a tokenizer whose pattern treats every run of non-whitespace
# characters as one token.  Any arguments passed by the caller are ignored.
sub new {
    my $class = shift;
    return $class->SUPER::new( pattern => '\S+' );
}
package BenchSchema;
use base qw( Lucy::Plan::Schema );
use Lucy::Analysis::RegexTokenizer;
# Construct the schema and register its 'title' field as a full-text field
# analyzed with the whitespace tokenizer.
sub new {
    my $class  = shift;
    my $schema = $class->SUPER::new;

    my $tokenizer  = BenchSchema::WhiteSpaceTokenizer->new;
    my $title_type = Lucy::Plan::FullTextType->new( analyzer => $tokenizer );
    $schema->spec_field( name => 'title', type => $title_type );

    return $schema;
}
package BenchmarkingIndexer::Lucy;
use base qw( BenchmarkingIndexer );
use Time::HiRes qw( gettimeofday );
# Construct a Lucy-backed benchmarking indexer.  Arguments are handed to the
# parent constructor unchanged; afterwards the index directory, engine name
# and engine version are set for Lucy, replacing whatever was there.
sub new {
    my ( $class, @args ) = @_;
    my $self = $class->SUPER::new(@args);

    require Lucy;
    require Lucy::Index::Indexer;

    # Provide runtime flexibility: the 'body' field is only stored (and made
    # highlightable) when the `store` option is true.
    my $store_body = $self->{store} ? 1 : 0;

    my $schema = BenchSchema->new;
    $self->{schema} = $schema;

    my $tokenizer = BenchSchema::WhiteSpaceTokenizer->new;
    my $body_type = Lucy::Plan::FullTextType->new(
        analyzer      => $tokenizer,
        highlightable => $store_body,
        stored        => $store_body,
    );
    $schema->spec_field( name => 'body', type => $body_type );

    @{$self}{qw( index_dir engine version )}
        = ( 'lucy_index', 'Lucy', $Lucy::VERSION );

    return $self;
}
# Create a Lucy indexer for this object's schema and index directory.
# $count is the number of documents added so far; the `truncate` flag is set
# only when it is zero, i.e. for the very first indexer of a run.
sub init_indexer {
    my ( $self, $count ) = @_;
    my $is_first_indexer = ( $count == 0 );

    return Lucy::Index::Indexer->new(
        schema   => $self->{schema},
        index    => $self->{index_dir},
        truncate => ( $is_first_indexer ? 1 : 0 ),
        create   => 1,
    );
}
# Build an index and time how long it takes.
#
# Documents are added by walking the article file list, starting over from
# the top as often as necessary, until `docs` documents have been added.
# Each time the running total reaches a multiple of `increment`, the current
# indexer is committed and replaced with a fresh one.
#
# Returns a two-element list: ( number of docs added, elapsed seconds ).
# Confesses if documents are requested but there are no article files, and
# dies if an article file cannot be opened.
sub build_index {
    my $self = shift;
    $self->delayed_init;
    my ( $max, $increment, $article_filepaths )
        = @{$self}{qw( docs increment article_filepaths )};

    # With no files to read, the loop below could never advance $count and
    # would spin forever.
    confess "No article files available to index"
        if $max > 0 and !@$article_filepaths;

    # Start timer.
    my $start = gettimeofday();

    my $indexer = $self->init_indexer(0);
    my $count   = 0;
    while ( $count < $max ) {
        for my $article_filepath (@$article_filepaths) {

            # The title is the first line, the body is the rest.
            open( my $article_fh, '<', $article_filepath )
                or die "Can't open file '$article_filepath': $!";
            my %doc;
            $doc{title} = <$article_fh>;
            $doc{body} = do { local $/; <$article_fh> };
            $indexer->add_doc( \%doc );

            # Bail if we've reached spec'd number of docs.
            $count++;
            last if $count >= $max;

            # Commit the batch and carry on with a fresh indexer.
            if ( $count % $increment == 0 ) {
                $indexer->commit;
                undef $indexer;
                $indexer = $self->init_indexer($count);
            }
        }
    }

    # Finish index.
    $indexer->optimize;
    $indexer->commit;

    # Return elapsed seconds.
    my $end  = gettimeofday();
    my $secs = $end - $start;
    return ( $count, $secs );
}
1;