 # perl/t/504-similarity.t - lucy - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 use strict;
 use warnings;

 package MySchema::LongTextField;
 use base 'Lucy::Plan::FullTextType';
 use LucyX::Index::LongFieldSim;

 # Field type derived from Lucy::Plan::FullTextType.  make_similarity
 # returns a newly constructed LucyX::Index::LongFieldSim object.
 sub make_similarity {
     return LucyX::Index::LongFieldSim->new;
 }

 package MySchema;
 use base 'Lucy::Plan::Schema';
 use Lucy::Analysis::RegexTokenizer;

 # Build a schema with two full-text fields that share one RegexTokenizer:
 #   title - plain Lucy::Plan::FullTextType
 #   body  - MySchema::LongTextField
 # All arguments are forwarded untouched to the parent constructor.
 # Returns the new schema object.
 sub new {
     my $class     = shift;
     my $schema    = $class->SUPER::new(@_);
     my $tokenizer = Lucy::Analysis::RegexTokenizer->new;

     my $title_type = Lucy::Plan::FullTextType->new( analyzer => $tokenizer );
     my $body_type  = MySchema::LongTextField->new( analyzer => $tokenizer );

     # Register the fields in a fixed order: title first, then body.
     my @field_specs = ( [ title => $title_type ], [ body => $body_type ] );
     for my $spec (@field_specs) {
         my ( $name, $type ) = @{$spec};
         $schema->spec_field( name => $name, type => $type );
     }

     return $schema;
 }

 package main;
 use Test::More tests => 9;
 use Lucy::Test;

 # Load the bytes module so that bytes::substr is available below, then
 # switch the pragma straight back off so that byte semantics are not
 # applied to the rest of this file.
 use bytes;
 no bytes;

 # A Similarity must compare equal to a copy rebuilt from its own dump.
 my $sim  = Lucy::Index::Similarity->new;
 my $twin = $sim->load( $sim->dump );
 ok( $sim->equals($twin), "Dump/Load" );

 # Going from 9 to 10 occurrences must raise tf() by less than 1.
 cmp_ok( $sim->tf(10) - $sim->tf(9), '<', 1, "TF is damped" );

 # With the same total_docs, the lower doc_freq must give the larger idf().
 my $rare_idf   = $sim->idf( doc_freq => 3,  total_docs => 100 );
 my $common_idf = $sim->idf( doc_freq => 50, total_docs => 100 );
 cmp_ok( $rare_idf, '>', $common_idf, 'Rarer terms have higher IDF' );

 # With the same max_overlap, the larger overlap must give the larger
 # coord().
 my $less_coordinated = $sim->coord( overlap => 2, max_overlap => 5 );
 my $more_coordinated = $sim->coord( overlap => 3, max_overlap => 5 );
 cmp_ok( $less_coordinated, '<', $more_coordinated,
     "greater overlap means bigger coord bonus" );

 # Spot-check decode_norm against five known byte => float pairs.
 # is_deeply takes ( got, expected, name ): the computed values go first so
 # that a failure report labels the two sides correctly.
 my @bytes  = ( 100,      110,     120, 130, 140 );
 my @floats = ( 0.015625, 0.09375, 0.5, 3.0, 16.0 );
 my @transformed = map { $sim->decode_norm($_) } @bytes;
 is_deeply( \@transformed, \@floats,
     "decode_norm more or less matches Java Lucene behavior" );

 # Round trip: every byte value 0..255 decoded and then re-encoded must
 # come back as the same byte value.
 @bytes       = 0 .. 255;
 @floats      = map { $sim->decode_norm($_) } @bytes;
 @transformed = map { $sim->encode_norm($_) } @floats;
 is_deeply( \@transformed, \@bytes,
     "encode_norm and decode_norm are complementary" );

 # Read the norm decoder as 256 consecutive 4-byte chunks, unpacking each
 # one with the native single-precision float template 'f'.  The values
 # must equal what decode_norm returned for the same indexes (still held
 # in @floats from the round-trip check above).
 my $norm_decoder = $sim->get_norm_decoder;
 @transformed = ();
 for ( 0 .. 255 ) {
     push @transformed,
         unpack( 'f', bytes::substr( $norm_decoder, $_ * 4, 4 ) );
 }
 is_deeply( \@transformed, \@floats,
     "using the norm_decoder produces desired results" );

 # Index two documents into a RAMFolder using MySchema.  The term 'spam'
 # occurs in both titles and in both bodies; the document titled 'spam'
 # has the shorter title and the shorter body.
 my $folder  = Lucy::Store::RAMFolder->new;
 my $indexer = Lucy::Index::Indexer->new(
     index  => $folder,
     schema => MySchema->new,
 );

 my %source_docs = (
     'spam'     => 'spam spam',
     'not spam' => 'not spam not even close to spam no spam here',
 );
 for my $title ( keys %source_docs ) {
     my %doc = (
         title => $title,
         body  => $source_docs{$title},
     );
     $indexer->add_doc( \%doc );
 }
 $indexer->commit;
 undef $indexer;

 my $searcher = Lucy::Search::IndexSearcher->new( index => $folder );

 # Searching the 'title' field: the top hit is expected to be the
 # document with the shorter title.
 my $title_query = Lucy::Search::TermQuery->new(
     field => 'title',
     term  => 'spam',
 );
 my $hits = $searcher->hits( query => $title_query );
 is( $hits->next->{'title'},
     'spam', "Default Similarity biased towards short fields" );

 # Searching the 'body' field (typed as MySchema::LongTextField): the top
 # hit is expected to be the document with the longer body.
 my $body_query = Lucy::Search::TermQuery->new(
     field => 'body',
     term  => 'spam',
 );
 $hits = $searcher->hits( query => $body_query );
 is( $hits->next->{'title'},
     'not spam', "LongFieldSim cancels short-field bias" );
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	use strict;
	use warnings;

	package MySchema::LongTextField;
	use base 'Lucy::Plan::FullTextType';
	use LucyX::Index::LongFieldSim;

	# Field type derived from Lucy::Plan::FullTextType.  make_similarity
	# returns a newly constructed LucyX::Index::LongFieldSim object.
	sub make_similarity {
	    return LucyX::Index::LongFieldSim->new;
	}

	package MySchema;
	use base 'Lucy::Plan::Schema';
	use Lucy::Analysis::RegexTokenizer;

	# Build a schema with two full-text fields that share one RegexTokenizer:
	#   title - plain Lucy::Plan::FullTextType
	#   body  - MySchema::LongTextField
	# All arguments are forwarded untouched to the parent constructor.
	# Returns the new schema object.
	sub new {
	    my $class     = shift;
	    my $schema    = $class->SUPER::new(@_);
	    my $tokenizer = Lucy::Analysis::RegexTokenizer->new;

	    my $title_type = Lucy::Plan::FullTextType->new( analyzer => $tokenizer );
	    my $body_type  = MySchema::LongTextField->new( analyzer => $tokenizer );

	    # Register the fields in a fixed order: title first, then body.
	    my @field_specs = ( [ title => $title_type ], [ body => $body_type ] );
	    for my $spec (@field_specs) {
	        my ( $name, $type ) = @{$spec};
	        $schema->spec_field( name => $name, type => $type );
	    }

	    return $schema;
	}

	package main;
	use Test::More tests => 9;
	use Lucy::Test;

	# Load the bytes module so that bytes::substr is available below, then
	# switch the pragma straight back off so that byte semantics are not
	# applied to the rest of this file.
	use bytes;
	no bytes;

	# A Similarity must compare equal to a copy rebuilt from its own dump.
	my $sim = Lucy::Index::Similarity->new;
	my $twin = $sim->load( $sim->dump );
	ok( $sim->equals($twin), "Dump/Load" );

	# Going from 9 to 10 occurrences must raise tf() by less than 1.
	cmp_ok( $sim->tf(10) - $sim->tf(9), '<', 1, "TF is damped" );

	# With the same total_docs, the lower doc_freq must give the larger idf().
	my $rare_idf = $sim->idf( doc_freq => 3, total_docs => 100 );
	my $common_idf = $sim->idf( doc_freq => 50, total_docs => 100 );
	cmp_ok( $rare_idf, '>', $common_idf, 'Rarer terms have higher IDF' );

	# With the same max_overlap, the larger overlap must give the larger
	# coord().
	my $less_coordinated = $sim->coord( overlap => 2, max_overlap => 5 );
	my $more_coordinated = $sim->coord( overlap => 3, max_overlap => 5 );
	cmp_ok( $less_coordinated, '<', $more_coordinated,
	    "greater overlap means bigger coord bonus" );

	# Spot-check decode_norm against five known byte => float pairs.
	# is_deeply takes ( got, expected, name ): the computed values go first so
	# that a failure report labels the two sides correctly.
	my @bytes = ( 100, 110, 120, 130, 140 );
	my @floats = ( 0.015625, 0.09375, 0.5, 3.0, 16.0 );
	my @transformed = map { $sim->decode_norm($_) } @bytes;
	is_deeply( \@transformed, \@floats,
	    "decode_norm more or less matches Java Lucene behavior" );

	# Round trip: every byte value 0..255 decoded and then re-encoded must
	# come back as the same byte value.
	@bytes = 0 .. 255;
	@floats = map { $sim->decode_norm($_) } @bytes;
	@transformed = map { $sim->encode_norm($_) } @floats;
	is_deeply( \@transformed, \@bytes,
	    "encode_norm and decode_norm are complementary" );

	# Read the norm decoder as 256 consecutive 4-byte chunks, unpacking each
	# one with the native single-precision float template 'f'.  The values
	# must equal what decode_norm returned for the same indexes (still held
	# in @floats from the round-trip check above).
	my $norm_decoder = $sim->get_norm_decoder;
	@transformed = ();
	for ( 0 .. 255 ) {
	    push @transformed,
	        unpack( 'f', bytes::substr( $norm_decoder, $_ * 4, 4 ) );
	}
	is_deeply( \@transformed, \@floats,
	    "using the norm_decoder produces desired results" );

	# Index two documents into a RAMFolder using MySchema.  The term 'spam'
	# occurs in both titles and in both bodies; the document titled 'spam'
	# has the shorter title and the shorter body.
	my $folder = Lucy::Store::RAMFolder->new;
	my $indexer = Lucy::Index::Indexer->new(
	    index => $folder,
	    schema => MySchema->new,
	);

	my %source_docs = (
	    'spam' => 'spam spam',
	    'not spam' => 'not spam not even close to spam no spam here',
	);
	for my $title ( keys %source_docs ) {
	    my %doc = (
	        title => $title,
	        body => $source_docs{$title},
	    );
	    $indexer->add_doc( \%doc );
	}
	$indexer->commit;
	undef $indexer;

	my $searcher = Lucy::Search::IndexSearcher->new( index => $folder );

	# Searching the 'title' field: the top hit is expected to be the
	# document with the shorter title.
	my $title_query = Lucy::Search::TermQuery->new(
	    field => 'title',
	    term => 'spam',
	);
	my $hits = $searcher->hits( query => $title_query );
	is( $hits->next->{'title'},
	    'spam', "Default Similarity biased towards short fields" );

	# Searching the 'body' field (typed as MySchema::LongTextField): the top
	# hit is expected to be the document with the longer body.
	my $body_query = Lucy::Search::TermQuery->new(
	    field => 'body',
	    term => 'spam',
	);
	$hits = $searcher->hits( query => $body_query );
	is( $hits->next->{'title'},
	    'not spam', "LongFieldSim cancels short-field bias" );