# blob: 7003dda9a6d12a3e0cb2a7f88fac2f2dba8e23af [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use strict;
use warnings;
package Lucy::Simple;
use Lucy;
# Module version.  The version string is run through a string eval so that
# $VERSION ends up holding the numeric value of the literal.
our $VERSION = '0.004001';
$VERSION = eval $VERSION;
use Carp;
use Scalar::Util qw( weaken reftype refaddr );
use Lucy::Plan::Schema;
use Lucy::Analysis::EasyAnalyzer;
use Lucy::Index::Indexer;
use Lucy::Search::IndexSearcher;
# Registry of live Lucy::Simple objects, keyed by refaddr.  new() stores a
# weakened reference here, DESTROY removes it, and the END block walks it to
# finish indexing for objects that still exist at interpreter shutdown.
my %obj_cache;
# Constructor.  Builds a Lucy::Simple object bound to one index location.
#
# Named parameters (both required):
#   path     - location of the index directory.
#   language - two-letter ISO language code; lower-cased before it is
#              checked against the supported list.
#
# May be invoked on a class name or on an existing object, in which case the
# object's class is reused.  Returns the new blessed object.
#
# Confesses if 'path' or 'language' is missing, if the language is not one of
# the supported codes, or if any unrecognized parameter is supplied.
sub new {
    my ( $either, %args ) = @_;
    my $path     = delete $args{path};
    my $language = delete $args{language};
    confess("Missing required parameter 'path'") unless defined $path;

    # Check for a missing language before calling lc(): lc(undef) emits an
    # "uninitialized value" warning and leads to the misleading message
    # "Invalid language: ''".
    confess("Missing required parameter 'language'")
        unless defined $language;
    $language = lc($language);

    # Anchor with \A and \z rather than ^ and $: '$' also matches just before
    # a trailing newline, which would let "en\n" through validation.
    confess("Invalid language: '$language'")
        unless $language =~ /\A(?:da|de|en|es|fi|fr|it|nl|no|pt|ru|sv)\z/;

    my @remaining = keys %args;
    confess("Invalid params: @remaining") if @remaining;

    # type, schema, indexer, searcher and hits start out undefined and are
    # filled in by other methods as they are needed.
    my $self = bless {
        type     => undef,
        schema   => undef,
        indexer  => undef,
        searcher => undef,
        hits     => undef,
        language => $language,
        path     => $path,
        },
        ref($either) || $either;

    # Cache the object for later clean-up.  The cached reference is weakened
    # so that the cache by itself does not keep the object alive.
    weaken( $obj_cache{ refaddr $self } = $self );
    return $self;
}
# Make sure $self->{indexer} is populated, creating it on first use.
#
# When no indexer is cached, a PolyReader is opened on $self->{path} to decide
# where the schema comes from:
#   * no segment readers - a new Schema is created, together with a
#     FullTextType whose EasyAnalyzer uses $self->{language};
#   * otherwise - the reader's schema is reused and the type of the first
#     field it lists becomes $self->{type}.
# The schema, the type and a new Indexer are then stored on the object.
sub _lazily_create_indexer {
    my ($self) = @_;
    if ( !defined $self->{indexer} ) {
        my $reader = Lucy::Index::PolyReader->open( index => $self->{path} );
        my $segment_count = scalar @{ $reader->seg_readers };

        my $schema;
        if ($segment_count) {

            # Existing index: reuse its schema and the first field's type.
            $schema = $reader->get_schema;
            my ($first_field) = @{ $schema->all_fields };
            $self->{type} = $schema->fetch_type($first_field);
        }
        else {

            # Empty index: start from a blank schema and a full-text type
            # built for the configured language.
            $schema = Lucy::Plan::Schema->new;
            my $analyzer = Lucy::Analysis::EasyAnalyzer->new(
                language => $self->{language} );
            $self->{type}
                = Lucy::Plan::FullTextType->new( analyzer => $analyzer );
        }

        $self->{schema}  = $schema;
        $self->{indexer} = Lucy::Index::Indexer->new(
            schema => $schema,
            index  => $self->{path},
        );
    }
}
# Add one document to the index.
#
# Takes a single hashref whose keys are field names and whose values are the
# field contents.  Every key is registered with the schema using the object's
# field type before the document is handed to the indexer.
#
# Croaks unless called with exactly one argument that is a hash reference.
sub add_doc {
    my ( $self, $hashref ) = @_;

    # reftype() returns undef for a non-reference; test definedness first so
    # that a bad argument produces only the croak below, not an additional
    # "uninitialized value" warning from the string comparison.
    my $ref_kind = reftype($hashref);
    croak("add_doc requires exactly one argument: a hashref")
        unless ( @_ == 2 and defined $ref_kind and $ref_kind eq 'HASH' );

    $self->_lazily_create_indexer;
    my $schema = $self->{schema};
    my $type   = $self->{type};

    # spec_field is called for every key on every document, so fields that
    # first appear in a later document are registered as well.
    $schema->spec_field( name => $_, type => $type ) for keys %$hashref;
    $self->{indexer}->add_doc($hashref);
}
# Commit any pending additions and drop the cached indexer and searcher.
#
# Does nothing when no indexer has been created, so it can be called
# unconditionally.
sub _finish_indexing {
    my ($simple) = @_;

    # No indexer means nothing was added; that is not an error.
    if ( defined $simple->{indexer} ) {
        $simple->{indexer}->commit;

        # Clearing both slots makes later calls build a fresh indexer and a
        # fresh searcher.
        $simple->{indexer}  = undef;
        $simple->{searcher} = undef;
    }
}
# Run a search and remember its hits for later iteration with next().
#
# All named parameters are passed through unchanged to the searcher's hits()
# method.  Pending additions are committed first, and a searcher is created
# for $self->{path} if none is cached.
#
# Returns the total_hits value reported by the resulting hits object.
sub search {
    my $self   = shift;
    my %params = @_;

    # Flush recent adds so the search sees them.
    $self->_finish_indexing;

    # Lazily create the searcher.
    unless ( defined $self->{searcher} ) {
        $self->{searcher}
            = Lucy::Search::IndexSearcher->new( index => $self->{path} );
    }

    my $hits = $self->{searcher}->hits(%params);
    $self->{hits} = $hits;
    return $hits->total_hits;
}
# Return the next hit from the most recent search.
#
# Returns nothing when no search results are pending.  When the stored hits
# object returns undef, it is discarded and nothing is returned.
sub next {
    my ($self) = @_;
    return if !defined $self->{hits};

    my $hit = $self->{hits}->next;
    return $hit if defined $hit;

    # Hits are exhausted: forget them so later calls return immediately.
    $self->{hits} = undef;
    return;
}
# Destructor: commit any pending additions, then remove this object's entry
# from the clean-up cache.
sub DESTROY {
    my ($self) = @_;
    $self->_finish_indexing;
    delete $obj_cache{ refaddr($self) };
}
END {
    # Finish indexing for any objects that still exist now, because by the
    # time global destruction runs their Indexer might no longer exist
    # (see bug #32689).
    for my $survivor ( values %obj_cache ) {
        $survivor->_finish_indexing;
    }
}
1;
__END__
__POD__
=head1 NAME
Lucy::Simple - Basic search engine.
=head1 SYNOPSIS
First, build an index of your documents.
my $index = Lucy::Simple->new(
path => '/path/to/index/',
language => 'en',
);
while ( my ( $title, $content ) = each %source_docs ) {
$index->add_doc({
title => $title,
content => $content,
});
}
Later, search the index.
my $total_hits = $index->search(
query => $query_string,
offset => 0,
num_wanted => 10,
);
print "Total hits: $total_hits\n";
while ( my $hit = $index->next ) {
print "$hit->{title}\n";
}
=head1 DESCRIPTION
Lucy::Simple is a stripped-down interface for the L<Apache Lucy|Lucy> search
engine library.
=head1 METHODS
=head2 new
my $lucy = Lucy::Simple->new(
path => '/path/to/index/',
language => 'en',
);
Create a Lucy::Simple object, which can be used for both indexing and
searching. Two hash-style parameters are required.
=over
=item *
B<path> - Where the index directory should be located. If no index is found
at the specified location, one will be created.
=item *
B<language> - The language of the documents in your collection, indicated
by a two-letter ISO code. 12 languages are supported:
|-----------------------|
| Language | ISO code |
|-----------------------|
| Danish | da |
| Dutch | nl |
| English | en |
| Finnish | fi |
| French | fr |
| German | de |
| Italian | it |
| Norwegian | no |
| Portuguese | pt |
| Spanish | es |
| Swedish | sv |
| Russian | ru |
|-----------------------|
=back
=head2 add_doc
$lucy->add_doc({
location => $url,
title => $title,
content => $content,
});
Add a document to the index. The document must be supplied as a hashref, with
field names as keys and content as values.
=head2 search
my $total_hits = $lucy->search(
query => $query_string, # required
offset => 40, # default 0
num_wanted => 20, # default 10
);
Search the index. Returns the total number of documents which match the
query. (This number is unlikely to match C<num_wanted>.)
=over
=item *
B<query> - A search query string.
=item *
B<offset> - The number of most-relevant hits to discard, typically used when
"paging" through hits N at a time. Setting offset to 20 and num_wanted to 10
retrieves hits 21-30, assuming that 30 hits can be found.
=item *
B<num_wanted> - The number of hits you would like to see after C<offset> is
taken into account.
=back
=head1 BUGS
Not thread-safe.
=cut