| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| =head1 NAME |
| |
| Lucy::Docs::Tutorial::Simple - Bare-bones search app. |
| |
| =head2 Setup |
| |
| Copy the text presentation of the US Constitution from the C<sample> directory |
| of the Apache Lucy distribution to the base level of your web server's |
| C<htdocs> directory. |
| |
| $ cp -R sample/us_constitution /usr/local/apache2/htdocs/ |
| |
| =head2 Indexing: indexer.pl |
| |
| Our first task will be to create an application called C<indexer.pl> which |
| builds a searchable "inverted index" from a collection of documents. |
| |
| After we specify some configuration variables and load all necessary |
| modules... |
| |
| #!/usr/local/bin/perl |
| use strict; |
| use warnings; |
| |
| # Configuration: the directory where the index will live, and the directory |
| # holding the sample documents copied during setup. |
| # (Change configuration variables as needed.) |
| my $path_to_index = '/path/to/index'; |
| my $uscon_source = '/usr/local/apache2/htdocs/us_constitution'; |
| |
| # Lucy::Simple supplies the indexing object used below; catfile() joins a |
| # directory and a file name into a path. |
| use Lucy::Simple; |
| use File::Spec::Functions qw( catfile ); |
| |
| ... we'll start by creating a Lucy::Simple object, telling it where we'd |
| like the index to be located and the language of the source material. |
| |
| # 'path' is the index location and 'language' is the language of the |
| # source material. |
| my $lucy = Lucy::Simple->new( |
| path => $path_to_index, |
| language => 'en', |
| ); |
| |
| Next, we'll add a subroutine which parses our sample documents. |
| |
| # Parse a file from our US Constitution collection and return a hashref with |
| # the fields title, content, and url. |
| # |
| # $filename is a file name inside $uscon_source. The text before the first |
| # line that starts with whitespace (a blank line counts) becomes the title; |
| # the text after that whitespace becomes the content. Dies if the file |
| # cannot be opened or does not have that layout. |
| sub parse_file { |
| my $filename = shift; |
| my $filepath = catfile( $uscon_source, $filename ); |
| open( my $fh, '<', $filepath ) or die "Can't open '$filepath': $!"; |
| my $text = do { local $/; <$fh> }; # slurp file content |
| $text =~ /\A(.+?)^\s+(.*)/ms |
| or die "Can't extract title/bodytext from '$filepath'"; |
| my $title = $1; |
| my $bodytext = $2; |
| return { |
| title => $title, |
| content => $bodytext, |
| url => "/us_constitution/$filename", |
| }; |
| } |
| |
| Add some elementary directory reading code... |
| |
| # Collect names of source files: directory entries whose name ends in |
| # ".txt". The pattern is anchored so that entries such as "foo.txt.bak" or |
| # "foo.txt~" are not picked up. |
| opendir( my $dh, $uscon_source ) |
| or die "Couldn't opendir '$uscon_source': $!"; |
| my @filenames = grep { /\.txt\z/ } readdir $dh; |
| closedir($dh) or die "Couldn't closedir '$uscon_source': $!"; |
| |
| ... and now we're ready for the meat of indexer.pl -- which occupies exactly |
| one line of code. |
| |
| # Parse each source file and hand the resulting hashref to the indexer. |
| for my $filename (@filenames) { |
| $lucy->add_doc( parse_file($filename) ); # ta-da! |
| } |
| |
| =head2 Search: search.cgi |
| |
| As with our indexing app, the bulk of the code in our search script won't be |
| Lucy-specific. |
| |
| The beginning is dedicated to CGI processing and configuration. |
| |
| #!/usr/local/bin/perl -T |
| use strict; |
| use warnings; |
| |
| # (Change configuration variables as needed.) |
| my $path_to_index = '/path/to/index'; |
| |
| use CGI; |
| use List::Util qw( max min ); |
| use POSIX qw( ceil ); |
| use Encode qw( decode ); |
| use Lucy::Simple; |
| |
| # Read the request parameters. 'q' is the user's query (empty string when |
| # absent) and 'offset' is the number of hits to skip (0 when absent). |
| my $cgi = CGI->new; |
| my $q = decode( "UTF-8", $cgi->param('q') || '' ); |
| my $offset = decode( "UTF-8", $cgi->param('offset') || 0 ); |
| |
| # 'offset' comes straight from the request and is later used in arithmetic |
| # and passed to search(), so accept plain digits only; anything else falls |
| # back to the first page. |
| $offset = ( $offset =~ /\A([0-9]+)\z/ ) ? $1 : 0; |
| my $page_size = 10; |
| |
| Once that's out of the way, we create our Lucy::Simple object and feed |
| it a query string. |
| |
| # Open the index at $path_to_index. |
| my $lucy = Lucy::Simple->new( |
| path => $path_to_index, |
| language => 'en', |
| ); |
| # Run the query. The return value is the total number of documents in the |
| # collection which matched; 'offset' and 'num_wanted' select which page of |
| # hits next() will return. |
| my $hit_count = $lucy->search( |
| query => $q, |
| offset => $offset, |
| num_wanted => $page_size, |
| ); |
| |
| The value returned by search() is the total number of documents in the |
| collection which matched the query. We'll show this hit count to the user, |
| and also use it in conjunction with the parameters C<offset> and C<num_wanted> |
| to break up results into "pages" of manageable size. |
| |
| Calling search() on our Simple object turns it into an iterator. Invoking |
| next() now returns hits one at a time as L<Lucy::Document::HitDoc> |
| objects, starting with the most relevant. |
| |
| # Create result list: one <p> per hit with a linked title, the score |
| # formatted to three decimal places, and the url shown as text. |
| # The url and title are stored field values, not markup, so they are |
| # HTML-escaped before being interpolated into the page. |
| my $report = ''; |
| while ( my $hit = $lucy->next ) { |
| my $score = sprintf( "%0.3f", $hit->get_score ); |
| my $url = CGI::escapeHTML( $hit->{url} ); |
| my $title = CGI::escapeHTML( $hit->{title} ); |
| $report .= qq| |
| <p> |
| <a href="$url"><strong>$title</strong></a> |
| <em>$score</em> |
| <br> |
| <span class="excerptURL">$url</span> |
| </p> |
| |; |
| } |
| |
| The rest of the script is just text wrangling. |
| |
| #---------------------------------------------------------------# |
| # No tutorial material below this point - just html generation. # |
| #---------------------------------------------------------------# |
| |
| # Generate paging links and hit count, print and exit. Both helpers are |
| # defined further down in this script. |
| my $paging_links = generate_paging_info( $q, $hit_count ); |
| blast_out_content( $q, $report, $paging_links ); |
| |
| # Create html fragment with links for paging through results n-at-a-time. |
| # |
| # Takes the user's query string and the total number of hits, and also |
| # reads the file-level $cgi, $offset and $page_size. |
| # |
| # Returns '' when the query is empty, a "No matches" paragraph when nothing |
| # matched, and otherwise a "Results X-Y of N" summary followed by "Prev", |
| # numbered and "Next" links. At most 9 pages before and 10 pages after the |
| # current page are linked. |
| sub generate_paging_info { |
| my ( $query_string, $total_hits ) = @_; |
| my $escaped_q = CGI::escapeHTML($query_string); |
| my $paging_info; |
| if ( !length $query_string ) { |
| # No query? No display. |
| $paging_info = ''; |
| } |
| elsif ( $total_hits == 0 ) { |
| # Alert the user that their search failed. |
| $paging_info |
| = qq|<p>No matches for <strong>$escaped_q</strong></p>|; |
| } |
| else { |
| # Calculate the nums for the first and last hit to display. |
| my $last_result = min( ( $offset + $page_size ), $total_hits ); |
| my $first_result = min( ( $offset + 1 ), $last_result ); |
| |
| # Display the result nums, start paging info. |
| $paging_info = qq| |
| <p> |
| Results <strong>$first_result-$last_result</strong> |
| of <strong>$total_hits</strong> |
| for <strong>$escaped_q</strong>. |
| </p> |
| <p> |
| Results Page: |
| |; |
| |
| # Calculate first and last hits pages to display / link to. |
| my $current_page = int( $first_result / $page_size ) + 1; |
| my $last_page = ceil( $total_hits / $page_size ); |
| my $first_page = max( 1, ( $current_page - 9 ) ); |
| $last_page = min( $last_page, ( $current_page + 10 ) ); |
| |
| # Create the url prefix shared by every paging link. Each link appends |
| # its own offset, so the links do not depend on what the incoming |
| # offset looked like. |
| my $href_base = $cgi->url( -relative => 1 ); |
| $href_base .= "?q=" . CGI::escape($query_string); |
| $href_base .= ";offset="; |
| |
| # Generate the "Prev" link. |
| if ( $current_page > 1 ) { |
| my $new_offset = ( $current_page - 2 ) * $page_size; |
| $paging_info |
| .= qq|<a href="${href_base}${new_offset}">&lt;= Prev</a>\n|; |
| } |
| |
| # Generate paging links; the current page is shown as plain text. |
| for my $page_num ( $first_page .. $last_page ) { |
| if ( $page_num == $current_page ) { |
| $paging_info .= qq|$page_num \n|; |
| } |
| else { |
| my $new_offset = ( $page_num - 1 ) * $page_size; |
| $paging_info |
| .= qq|<a href="${href_base}${new_offset}">$page_num</a>\n|; |
| } |
| } |
| |
| # Generate the "Next" link. |
| if ( $current_page != $last_page ) { |
| my $new_offset = $current_page * $page_size; |
| $paging_info |
| .= qq|<a href="${href_base}${new_offset}">Next =&gt;</a>\n|; |
| } |
| |
| # Close tag. |
| $paging_info .= "</p>\n"; |
| } |
| |
| return $paging_info; |
| } |
| |
| # Print content to output. |
| # |
| # Takes the user's query string, the html for the hit list and the html for |
| # the paging info, and prints a Content-type header followed by a complete |
| # html page to STDOUT. The query is HTML-escaped here; $hit_list and |
| # $paging_info are inserted into the page as-is. |
| sub blast_out_content { |
| my ( $query_string, $hit_list, $paging_info ) = @_; |
| my $escaped_q = CGI::escapeHTML($query_string); |
| # Encode output as UTF-8, matching the charset named in the header and in |
| # the meta tag below. |
| binmode( STDOUT, ":encoding(UTF-8)" ); |
| print qq|Content-type: text/html; charset=UTF-8\n\n|; |
| print qq| |
| <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" |
| "http://www.w3.org/TR/html4/loose.dtd"> |
| <html> |
| <head> |
| <meta http-equiv="Content-type" |
| content="text/html;charset=UTF-8"> |
| <link rel="stylesheet" type="text/css" |
| href="/us_constitution/uscon.css"> |
| <title>Lucy: $escaped_q</title> |
| </head> |
| |
| <body> |
| |
| <div id="navigation"> |
| <form id="usconSearch" action=""> |
| <strong> |
| Search the |
| <a href="/us_constitution/index.html">US Constitution</a>: |
| </strong> |
| <input type="text" name="q" id="q" value="$escaped_q"> |
| <input type="submit" value="=>"> |
| </form> |
| </div><!--navigation--> |
| |
| <div id="bodytext"> |
| |
| $hit_list |
| |
| $paging_info |
| |
| <p style="font-size: smaller; color: #666"> |
| <em> |
| Powered by <a href="http://incubator.apache.org/lucy/" |
| >Apache Lucy<small><sup>TM</sup></small></a> |
| </em> |
| </p> |
| </div><!--bodytext--> |
| |
| </body> |
| |
| </html> |
| |; |
| } |
| |
| =head2 OK... now what? |
| |
| Lucy::Simple is perfectly adequate for some tasks, but it's not very flexible. |
| Many people find that it doesn't do at least one or two things they can't live |
| without. |
| |
| In our next tutorial chapter, |
| L<BeyondSimple|Lucy::Docs::Tutorial::BeyondSimple>, we'll rewrite our |
| indexing and search scripts using the classes that Lucy::Simple hides |
| from view, opening up the possibilities for expansion; then, we'll spend the |
| rest of the tutorial chapters exploring these possibilities. |
| |
| |