perl/lib/Lucy/Docs/Tutorial/Simple.pod - lucy - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 =head1 NAME

 Lucy::Docs::Tutorial::Simple - Bare-bones search app.

 =head2 Setup

 Copy the text presentation of the US Constitution from the C<sample> directory
 of the Apache Lucy distribution to the base level of your web server's
 C<htdocs> directory.

     $ cp -R sample/us_constitution /usr/local/apache2/htdocs/

 =head2 Indexing: indexer.pl

 Our first task will be to create an application called C<indexer.pl> which
 builds a searchable "inverted index" from a collection of documents.

 After we specify some configuration variables and load all necessary
 modules...

     #!/usr/local/bin/perl
     use strict;
     use warnings;

     # (Change configuration variables as needed.)
     my $path_to_index = '/path/to/index';
     my $uscon_source  = '/usr/local/apache2/htdocs/us_constitution';

     use Lucy::Simple;
     use File::Spec::Functions qw( catfile );

 ... we'll start by creating a Lucy::Simple object, telling it where we'd
 like the index to be located and the language of the source material.

     my $lucy = Lucy::Simple->new(
         path     => $path_to_index,
         language => 'en',
     );

 Next, we'll add a subroutine which parses our sample documents.

     # Parse a file from our US Constitution collection and return a hashref with
     # the fields title, content, and url.
     #
     # Takes a bare file name, which is resolved relative to $uscon_source.
     # Dies if the file cannot be opened or if no title/body split is found.
     sub parse_file {
         my $filename = shift;
         my $filepath = catfile( $uscon_source, $filename );
         open( my $fh, '<', $filepath ) or die "Can't open '$filepath': $!";
         my $text = do { local $/; <$fh> };    # slurp file content

         # The title is the shortest leading run of text that ends at the
         # start of a line beginning with whitespace (a blank line counts);
         # everything after that whitespace is the body.
         $text =~ /\A(.+?)^\s+(.*)/ms
             or die "Can't extract title/bodytext from '$filepath'";
         my $title    = $1;
         my $bodytext = $2;

         # Only fields that are actually computed here are returned; there is
         # no category information available at this stage of the tutorial.
         return {
             title   => $title,
             content => $bodytext,
             url     => "/us_constitution/$filename",
         };
     }

 Add some elementary directory reading code...

     # Collect names of source files.  The pattern is anchored to the end of
     # the name so that only files ending in ".txt" are picked up (and not,
     # say, "foo.txt~" or "foo.txt.bak").
     opendir( my $dh, $uscon_source )
         or die "Couldn't opendir '$uscon_source': $!";
     my @filenames = grep { /\.txt\z/ } readdir $dh;
     closedir($dh) or die "Couldn't closedir '$uscon_source': $!";

 ... and now we're ready for the meat of indexer.pl -- which occupies exactly
 one line of code.

     foreach my $filename (@filenames) {
         my $doc = parse_file($filename);
         $lucy->add_doc($doc);  # ta-da!
     }

 =head2 Search: search.cgi

 As with our indexing app, the bulk of the code in our search script won't be
 Lucy-specific.

 The beginning is dedicated to CGI processing and configuration.

     #!/usr/local/bin/perl -T
     use strict;
     use warnings;

     # (Change configuration variables as needed.)
     my $path_to_index = '/path/to/index';

     use CGI;
     use List::Util qw( max min );
     use POSIX qw( ceil );
     use Encode qw( decode );
     use Lucy::Simple;

     my $cgi       = CGI->new;
     my $q         = decode( "UTF-8", $cgi->param('q') || '' );
     my $offset    = decode( "UTF-8", $cgi->param('offset') || 0 );
     my $page_size = 10;

     # The offset comes straight from the request and is later used in
     # arithmetic and handed to search(), so fall back to the first page
     # unless it is a plain non-negative integer.
     $offset = 0 unless $offset =~ /\A[0-9]+\z/;

 Once that's out of the way, we create our Lucy::Simple object and feed
 it a query string.

     my $lucy = Lucy::Simple->new(
         path     => $path_to_index,
         language => 'en',
     );
     my $hit_count = $lucy->search(
         query      => $q,
         offset     => $offset,
         num_wanted => $page_size,
     );

 The value returned by search() is the total number of documents in the
 collection which matched the query.  We'll show this hit count to the user,
 and also use it in conjunction with the parameters C<offset> and C<num_wanted>
 to break up results into "pages" of manageable size.

 Calling search() on our Simple object turns it into an iterator. Invoking
 next() now returns hits one at a time as L<Lucy::Document::HitDoc>
 objects, starting with the most relevant.

     # Create result list.  Stored field values are HTML-escaped before being
     # interpolated, so markup characters in a title or url cannot break the
     # generated page.
     my $report = '';
     while ( my $hit = $lucy->next ) {
         my $score = sprintf( "%0.3f", $hit->get_score );
         my $url   = CGI::escapeHTML( $hit->{url} );
         my $title = CGI::escapeHTML( $hit->{title} );
         $report .= qq|
             <p>
               <a href="$url"><strong>$title</strong></a>
               <em>$score</em>
               <br>
               <span class="excerptURL">$url</span>
             </p>
             |;
     }

 The rest of the script is just text wrangling.

     #---------------------------------------------------------------#
     # No tutorial material below this point - just html generation. #
     #---------------------------------------------------------------#

     # Generate paging links and hit count, print and exit.
     my $paging_links = generate_paging_info( $q, $hit_count );
     blast_out_content( $q, $report, $paging_links );

     # Create html fragment with links for paging through results n-at-a-time.
     #
     # Takes the raw query string and the total hit count; also reads the
     # file-level $cgi, $offset and $page_size.  Returns '' when the query is
     # empty, a "No matches" paragraph when there are no hits, and otherwise a
     # result-range summary followed by Prev / page-number / Next links.
     sub generate_paging_info {
         my ( $query_string, $total_hits ) = @_;
         my $escaped_q = CGI::escapeHTML($query_string);
         my $paging_info;
         if ( !length $query_string ) {
             # No query?  No display.
             $paging_info = '';
         }
         elsif ( $total_hits == 0 ) {
             # Alert the user that their search failed.
             $paging_info
                 = qq|<p>No matches for <strong>$escaped_q</strong></p>|;
         }
         else {
             # Calculate the nums for the first and last hit to display.
             my $last_result = min( ( $offset + $page_size ), $total_hits );
             my $first_result = min( ( $offset + 1 ), $last_result );

             # Display the result nums, start paging info.
             $paging_info = qq|
                 <p>
                     Results <strong>$first_result-$last_result</strong>
                     of <strong>$total_hits</strong>
                     for <strong>$escaped_q</strong>.
                 </p>
                 <p>
                     Results Page:
                 |;

             # Calculate first and last hits pages to display / link to.
             # At most 9 pages before and 10 pages after the current page
             # are linked.
             my $current_page = int( $first_result / $page_size ) + 1;
             my $last_page    = ceil( $total_hits / $page_size );
             my $first_page   = max( 1, ( $current_page - 9 ) );
             $last_page = min( $last_page, ( $current_page + 10 ) );

             # Create the url prefix shared by all paging links.  Each link
             # appends its own offset, so the links are correct no matter
             # what the incoming offset parameter looked like.
             my $href_base = $cgi->url( -relative => 1 );
             $href_base .= "?q=" . CGI::escape($query_string);
             $href_base .= ";offset=";

             # Generate the "Prev" link.
             if ( $current_page > 1 ) {
                 my $new_offset = ( $current_page - 2 ) * $page_size;
                 my $href       = $href_base . $new_offset;
                 $paging_info .= qq|<a href="$href">&lt;= Prev</a>\n|;
             }

             # Generate paging links.  The current page is shown as plain
             # text rather than as a link.
             for my $page_num ( $first_page .. $last_page ) {
                 if ( $page_num == $current_page ) {
                     $paging_info .= qq|$page_num \n|;
                 }
                 else {
                     my $new_offset = ( $page_num - 1 ) * $page_size;
                     my $href       = $href_base . $new_offset;
                     $paging_info .= qq|<a href="$href">$page_num</a>\n|;
                 }
             }

             # Generate the "Next" link.
             if ( $current_page != $last_page ) {
                 my $new_offset = $current_page * $page_size;
                 my $href       = $href_base . $new_offset;
                 $paging_info .= qq|<a href="$href">Next =&gt;</a>\n|;
             }

             # Close tag.
             $paging_info .= "</p>\n";
         }

         return $paging_info;
     }

     # Write the complete response -- HTTP header plus HTML page -- to STDOUT.
     #
     # Takes the raw query string, the pre-rendered hit list markup and the
     # pre-rendered paging markup.  Only the query string is HTML-escaped
     # here; the two markup fragments are inserted as given.
     sub blast_out_content {
         my $query_string = shift;
         my $hit_list     = shift;
         my $paging_info  = shift;
         my $escaped_q    = CGI::escapeHTML($query_string);

         # Encode output as UTF-8 to match the charset declared in the header.
         binmode( STDOUT, ":encoding(UTF-8)" );

         my $http_header = qq|Content-type: text/html; charset=UTF-8\n\n|;
         my $html_page   = qq|
     <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
         "http://www.w3.org/TR/html4/loose.dtd">
     <html>
     <head>
       <meta http-equiv="Content-type"
         content="text/html;charset=UTF-8">
       <link rel="stylesheet" type="text/css"
         href="/us_constitution/uscon.css">
       <title>Lucy: $escaped_q</title>
     </head>

     <body>

       <div id="navigation">
         <form id="usconSearch" action="">
           <strong>
             Search the
             <a href="/us_constitution/index.html">US Constitution</a>:
           </strong>
           <input type="text" name="q" id="q" value="$escaped_q">
           <input type="submit" value="=&gt;">
         </form>
       </div><!--navigation-->

       <div id="bodytext">

       $hit_list

       $paging_info

         <p style="font-size: smaller; color: #666">
           <em>
             Powered by <a href="http://incubator.apache.org/lucy/"
             >Apache Lucy<small><sup>TM</sup></small></a>
           </em>
         </p>
       </div><!--bodytext-->

     </body>

     </html>
     |;

         print $http_header;
         print $html_page;
     }

 =head2 OK... now what?

 Lucy::Simple is perfectly adequate for some tasks, but it's not very flexible.
 Many people find that it doesn't do at least one or two things they can't live
 without.

 In our next tutorial chapter,
 L<BeyondSimple|Lucy::Docs::Tutorial::BeyondSimple>, we'll rewrite our
 indexing and search scripts using the classes that Lucy::Simple hides
 from view, opening up the possibilities for expansion; then, we'll spend the
 rest of the tutorial chapters exploring these possibilities.
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	=head1 NAME

	Lucy::Docs::Tutorial::Simple - Bare-bones search app.

	=head2 Setup

	Copy the text presentation of the US Constitution from the C<sample> directory
	of the Apache Lucy distribution to the base level of your web server's
	C<htdocs> directory.

	$ cp -R sample/us_constitution /usr/local/apache2/htdocs/

	=head2 Indexing: indexer.pl

	Our first task will be to create an application called C<indexer.pl> which
	builds a searchable "inverted index" from a collection of documents.

	After we specify some configuration variables and load all necessary
	modules...

	#!/usr/local/bin/perl
	use strict;
	use warnings;

	# (Change configuration variables as needed.)
	my $path_to_index = '/path/to/index';
	my $uscon_source = '/usr/local/apache2/htdocs/us_constitution';

	use Lucy::Simple;
	use File::Spec::Functions qw( catfile );

	... we'll start by creating a Lucy::Simple object, telling it where we'd
	like the index to be located and the language of the source material.

	my $lucy = Lucy::Simple->new(
	path => $path_to_index,
	language => 'en',
	);

	Next, we'll add a subroutine which parses our sample documents.

	# Parse a file from our US Constitution collection and return a hashref with
	# the fields title, content, and url.
	#
	# Takes a bare file name, which is resolved relative to $uscon_source.
	# Dies if the file cannot be opened or if no title/body split is found.
	sub parse_file {
	    my $filename = shift;
	    my $filepath = catfile( $uscon_source, $filename );
	    open( my $fh, '<', $filepath ) or die "Can't open '$filepath': $!";
	    my $text = do { local $/; <$fh> };    # slurp file content

	    # The title is the shortest leading run of text that ends at the
	    # start of a line beginning with whitespace (a blank line counts);
	    # everything after that whitespace is the body.
	    $text =~ /\A(.+?)^\s+(.*)/ms
	        or die "Can't extract title/bodytext from '$filepath'";
	    my $title    = $1;
	    my $bodytext = $2;

	    # Only fields that are actually computed here are returned; there is
	    # no category information available at this stage of the tutorial.
	    return {
	        title   => $title,
	        content => $bodytext,
	        url     => "/us_constitution/$filename",
	    };
	}

	Add some elementary directory reading code...

	# Collect names of source files.  The pattern is anchored to the end of
	# the name so that only files ending in ".txt" are picked up (and not,
	# say, "foo.txt~" or "foo.txt.bak").
	opendir( my $dh, $uscon_source )
	    or die "Couldn't opendir '$uscon_source': $!";
	my @filenames = grep { /\.txt\z/ } readdir $dh;
	closedir($dh) or die "Couldn't closedir '$uscon_source': $!";

	... and now we're ready for the meat of indexer.pl -- which occupies exactly
	one line of code.

	foreach my $filename (@filenames) {
	my $doc = parse_file($filename);
	$lucy->add_doc($doc); # ta-da!
	}

	=head2 Search: search.cgi

	As with our indexing app, the bulk of the code in our search script won't be
	Lucy-specific.

	The beginning is dedicated to CGI processing and configuration.

	#!/usr/local/bin/perl -T
	use strict;
	use warnings;

	# (Change configuration variables as needed.)
	my $path_to_index = '/path/to/index';

	use CGI;
	use List::Util qw( max min );
	use POSIX qw( ceil );
	use Encode qw( decode );
	use Lucy::Simple;

	my $cgi       = CGI->new;
	my $q         = decode( "UTF-8", $cgi->param('q') || '' );
	my $offset    = decode( "UTF-8", $cgi->param('offset') || 0 );
	my $page_size = 10;

	# The offset comes straight from the request and is later used in
	# arithmetic and handed to search(), so fall back to the first page
	# unless it is a plain non-negative integer.
	$offset = 0 unless $offset =~ /\A[0-9]+\z/;

	Once that's out of the way, we create our Lucy::Simple object and feed
	it a query string.

	my $lucy = Lucy::Simple->new(
	path => $path_to_index,
	language => 'en',
	);
	my $hit_count = $lucy->search(
	query => $q,
	offset => $offset,
	num_wanted => $page_size,
	);

	The value returned by search() is the total number of documents in the
	collection which matched the query. We'll show this hit count to the user,
	and also use it in conjunction with the parameters C<offset> and C<num_wanted>
	to break up results into "pages" of manageable size.

	Calling search() on our Simple object turns it into an iterator. Invoking
	next() now returns hits one at a time as L<Lucy::Document::HitDoc>
	objects, starting with the most relevant.

	# Create result list.  Stored field values are HTML-escaped before being
	# interpolated, so markup characters in a title or url cannot break the
	# generated page.
	my $report = '';
	while ( my $hit = $lucy->next ) {
	    my $score = sprintf( "%0.3f", $hit->get_score );
	    my $url   = CGI::escapeHTML( $hit->{url} );
	    my $title = CGI::escapeHTML( $hit->{title} );
	    $report .= qq|
	        <p>
	          <a href="$url"><strong>$title</strong></a>
	          <em>$score</em>
	          <br>
	          <span class="excerptURL">$url</span>
	        </p>
	        |;
	}

	The rest of the script is just text wrangling.

	#---------------------------------------------------------------#
	# No tutorial material below this point - just html generation. #
	#---------------------------------------------------------------#

	# Generate paging links and hit count, print and exit.
	my $paging_links = generate_paging_info( $q, $hit_count );
	blast_out_content( $q, $report, $paging_links );

	# Create html fragment with links for paging through results n-at-a-time.
	#
	# Takes the raw query string and the total hit count; also reads the
	# file-level $cgi, $offset and $page_size.  Returns '' when the query is
	# empty, a "No matches" paragraph when there are no hits, and otherwise a
	# result-range summary followed by Prev / page-number / Next links.
	sub generate_paging_info {
	    my ( $query_string, $total_hits ) = @_;
	    my $escaped_q = CGI::escapeHTML($query_string);
	    my $paging_info;
	    if ( !length $query_string ) {
	        # No query?  No display.
	        $paging_info = '';
	    }
	    elsif ( $total_hits == 0 ) {
	        # Alert the user that their search failed.
	        $paging_info
	            = qq|<p>No matches for <strong>$escaped_q</strong></p>|;
	    }
	    else {
	        # Calculate the nums for the first and last hit to display.
	        my $last_result = min( ( $offset + $page_size ), $total_hits );
	        my $first_result = min( ( $offset + 1 ), $last_result );

	        # Display the result nums, start paging info.
	        $paging_info = qq|
	            <p>
	                Results <strong>$first_result-$last_result</strong>
	                of <strong>$total_hits</strong>
	                for <strong>$escaped_q</strong>.
	            </p>
	            <p>
	                Results Page:
	            |;

	        # Calculate first and last hits pages to display / link to.
	        # At most 9 pages before and 10 pages after the current page
	        # are linked.
	        my $current_page = int( $first_result / $page_size ) + 1;
	        my $last_page    = ceil( $total_hits / $page_size );
	        my $first_page   = max( 1, ( $current_page - 9 ) );
	        $last_page = min( $last_page, ( $current_page + 10 ) );

	        # Create the url prefix shared by all paging links.  Each link
	        # appends its own offset, so the links are correct no matter
	        # what the incoming offset parameter looked like.
	        my $href_base = $cgi->url( -relative => 1 );
	        $href_base .= "?q=" . CGI::escape($query_string);
	        $href_base .= ";offset=";

	        # Generate the "Prev" link.
	        if ( $current_page > 1 ) {
	            my $new_offset = ( $current_page - 2 ) * $page_size;
	            my $href       = $href_base . $new_offset;
	            $paging_info .= qq|<a href="$href">&lt;= Prev</a>\n|;
	        }

	        # Generate paging links.  The current page is shown as plain
	        # text rather than as a link.
	        for my $page_num ( $first_page .. $last_page ) {
	            if ( $page_num == $current_page ) {
	                $paging_info .= qq|$page_num \n|;
	            }
	            else {
	                my $new_offset = ( $page_num - 1 ) * $page_size;
	                my $href       = $href_base . $new_offset;
	                $paging_info .= qq|<a href="$href">$page_num</a>\n|;
	            }
	        }

	        # Generate the "Next" link.
	        if ( $current_page != $last_page ) {
	            my $new_offset = $current_page * $page_size;
	            my $href       = $href_base . $new_offset;
	            $paging_info .= qq|<a href="$href">Next =&gt;</a>\n|;
	        }

	        # Close tag.
	        $paging_info .= "</p>\n";
	    }

	    return $paging_info;
	}

	# Print content to output.
	#
	# Writes the HTTP header followed by the full HTML page to STDOUT.  Takes
	# the raw query string, the pre-rendered hit list markup and the
	# pre-rendered paging markup.  Only the query string is HTML-escaped
	# here; the two markup fragments are inserted as given.
	sub blast_out_content {
	    my ( $query_string, $hit_list, $paging_info ) = @_;
	    my $escaped_q = CGI::escapeHTML($query_string);

	    # Encode output as UTF-8 to match the charset declared in the header.
	    binmode( STDOUT, ":encoding(UTF-8)" );
	    print qq|Content-type: text/html; charset=UTF-8\n\n|;
	    print qq|
	<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
	"http://www.w3.org/TR/html4/loose.dtd">
	<html>
	<head>
	<meta http-equiv="Content-type"
	content="text/html;charset=UTF-8">
	<link rel="stylesheet" type="text/css"
	href="/us_constitution/uscon.css">
	<title>Lucy: $escaped_q</title>
	</head>

	<body>

	<div id="navigation">
	<form id="usconSearch" action="">
	<strong>
	Search the
	<a href="/us_constitution/index.html">US Constitution</a>:
	</strong>
	<input type="text" name="q" id="q" value="$escaped_q">
	<input type="submit" value="=&gt;">
	</form>
	</div><!--navigation-->

	<div id="bodytext">

	$hit_list

	$paging_info

	<p style="font-size: smaller; color: #666">
	<em>
	Powered by <a href="http://incubator.apache.org/lucy/"
	>Apache Lucy<small><sup>TM</sup></small></a>
	</em>
	</p>
	</div><!--bodytext-->

	</body>

	</html>
	|;
	}

	=head2 OK... now what?

	Lucy::Simple is perfectly adequate for some tasks, but it's not very flexible.
	Many people find that it doesn't do at least one or two things they can't live
	without.

	In our next tutorial chapter,
	L<BeyondSimple|Lucy::Docs::Tutorial::BeyondSimple>, we'll rewrite our
	indexing and search scripts using the classes that Lucy::Simple hides
	from view, opening up the possibilities for expansion; then, we'll spend the
	rest of the tutorial chapters exploring these possibilities.