| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| use strict; |
| use warnings; |
| |
| package MySchema::LongTextField; |
| use base qw( Lucy::Plan::FullTextType ); |
| use LucyX::Index::LongFieldSim; |
| |
| sub make_similarity { LucyX::Index::LongFieldSim->new } |
| |
| package MySchema; |
| use base qw( Lucy::Plan::Schema ); |
| use Lucy::Analysis::RegexTokenizer; |
| |
| sub new { |
| my $self = shift->SUPER::new(@_); |
| my $analyzer = Lucy::Analysis::RegexTokenizer->new; |
| my $plain_type = Lucy::Plan::FullTextType->new( analyzer => $analyzer, ); |
| my $long_field_type |
| = MySchema::LongTextField->new( analyzer => $analyzer, ); |
| $self->spec_field( name => 'title', type => $plain_type ); |
| $self->spec_field( name => 'body', type => $long_field_type ); |
| return $self; |
| } |
| |
| package main; |
| use Test::More tests => 9; |
| use Lucy::Test; |
| use bytes; |
| no bytes; |
| |
| my $sim = Lucy::Index::Similarity->new; |
| my $twin = $sim->load( $sim->dump ); |
| ok( $sim->equals($twin), "Dump/Load" ); |
| |
| cmp_ok( $sim->tf(10) - $sim->tf(9), '<', 1, "TF is damped" ); |
| |
| my $rare_idf = $sim->idf( doc_freq => 3, total_docs => 100 ); |
| my $common_idf = $sim->idf( doc_freq => 50, total_docs => 100 ); |
| cmp_ok( $rare_idf, '>', $common_idf, 'Rarer terms have higher IDF' ); |
| |
| my $less_coordinated = $sim->coord( overlap => 2, max_overlap => 5 ); |
| my $more_coordinated = $sim->coord( overlap => 3, max_overlap => 5 ); |
| cmp_ok( $less_coordinated, '<', $more_coordinated, |
| "greater overlap means bigger coord bonus" ); |
| |
| my @bytes = ( 100, 110, 120, 130, 140 ); |
| my @floats = ( 0.015625, 0.09375, 0.5, 3.0, 16.0 ); |
| my @transformed = map { $sim->decode_norm($_) } @bytes; |
| is_deeply( \@floats, \@transformed, |
| "decode_norm more or less matches Java Lucene behavior" ); |
| |
| @bytes = 0 .. 255; |
| @floats = map { $sim->decode_norm($_) } @bytes; |
| @transformed = map { $sim->encode_norm($_) } @floats; |
| is_deeply( \@transformed, \@bytes, |
| "encode_norm and decode_norm are complementary" ); |
| |
| my $norm_decoder = $sim->get_norm_decoder; |
| @transformed = (); |
| for ( 0 .. 255 ) { |
| push @transformed, |
| unpack( 'f', bytes::substr( $norm_decoder, $_ * 4, 4 ) ); |
| } |
| is_deeply( \@transformed, \@floats, |
| "using the norm_decoder produces desired results" ); |
| |
| my $folder = Lucy::Store::RAMFolder->new; |
| my $indexer = Lucy::Index::Indexer->new( |
| index => $folder, |
| schema => MySchema->new, |
| ); |
| |
| my %source_docs = ( |
| 'spam' => 'spam spam', |
| 'not spam' => 'not spam not even close to spam no spam here', |
| ); |
| while ( my ( $title, $body ) = each %source_docs ) { |
| $indexer->add_doc( |
| { title => $title, |
| body => $body, |
| } |
| ); |
| } |
| $indexer->commit; |
| undef $indexer; |
| |
| my $searcher = Lucy::Search::IndexSearcher->new( index => $folder ); |
| |
| my $hits = $searcher->hits( |
| query => Lucy::Search::TermQuery->new( |
| field => 'title', |
| term => 'spam', |
| ) |
| ); |
| is( $hits->next->{'title'}, |
| 'spam', "Default Similarity biased towards short fields" ); |
| |
| $hits = $searcher->hits( |
| query => Lucy::Search::TermQuery->new( |
| field => 'body', |
| term => 'spam', |
| ) |
| ); |
| is( $hits->next->{'title'}, |
| 'not spam', "LongFieldSim cancels short-field bias" ); |