blob: ef8166610f51376a5ed09682f4a2043f217c36f1 [file] [log] [blame]
using System.Collections.Generic;
using Lucene.Net.Documents;
using Lucene.Net.Util;
namespace Lucene.Net.Search
{
using Lucene.Net.Support;
using NUnit.Framework;
using Directory = Lucene.Net.Store.Directory;
using Document = Documents.Document;
using Field = Field;
using IndexReader = Lucene.Net.Index.IndexReader;
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer;
using MockTokenizer = Lucene.Net.Analysis.MockTokenizer;
using MultiReader = Lucene.Net.Index.MultiReader;
using RandomIndexWriter = Lucene.Net.Index.RandomIndexWriter;
using Term = Lucene.Net.Index.Term;
/// <summary>
/// Tests <see cref="FuzzyQuery"/>.
/// </summary>
[TestFixture]
public class TestFuzzyQuery : LuceneTestCase
{
/// <summary>
/// Indexes seven five-letter terms and verifies which of them a <see cref="FuzzyQuery"/>
/// built with <see cref="FuzzyQuery.DefaultMaxEdits"/> matches, and in which rank order,
/// for several query terms and required prefix lengths.
/// </summary>
[Test]
public virtual void TestFuzziness()
{
    Directory directory = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, directory);
    foreach (string text in new string[] { "aaaaa", "aaaab", "aaabb", "aabbb", "abbbb", "bbbbb", "ddddd" })
    {
        AddDoc(text, writer);
    }

    IndexReader reader = writer.GetReader();
    IndexSearcher searcher = NewSearcher(reader);
    writer.Dispose();

    ScoreDoc[] hits;

    // Query term "aaaaa": entry i is the expected hit count when prefixLength == i.
    int[] hitCountByPrefixLength = { 3, 3, 3, 3, 2, 1, 1 };
    for (int prefixLength = 0; prefixLength < hitCountByPrefixLength.Length; prefixLength++)
    {
        hits = SearchFuzzyDefaultEdits(searcher, "field", "aaaaa", prefixLength);
        Assert.AreEqual(hitCountByPrefixLength[prefixLength], hits.Length, "prefixLength=" + prefixLength);
    }

    // test scoring
    hits = SearchFuzzyDefaultEdits(searcher, "field", "bbbbb", 0);
    AssertFieldValuesInRankOrder(searcher, hits, "3 documents should match",
        new string[] { "bbbbb", "abbbb", "aabbb" });

    // test pq size by supplying maxExpansions=2
    // this query would normally return 3 documents, because 3 terms match (see above):
    FuzzyQuery limitedExpansions = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.DefaultMaxEdits, 0, 2, false);
    hits = searcher.Search(limitedExpansions, null, 1000).ScoreDocs;
    AssertFieldValuesInRankOrder(searcher, hits, "only 2 documents should match",
        new string[] { "bbbbb", "abbbb" });

    // not similar enough:
    hits = SearchFuzzyDefaultEdits(searcher, "field", "xxxxx", 0);
    Assert.AreEqual(0, hits.Length);
    hits = SearchFuzzyDefaultEdits(searcher, "field", "aaccc", 0); // edit distance to "aaaaa" = 3
    Assert.AreEqual(0, hits.Length);

    // query identical to a word in the index;
    // default allows for up to two edits, so two neighbours match as well:
    AssertFuzzyRanking(searcher, "aaaaa", 0, "aaaaa", "aaaab", "aaabb");

    // query similar to a word in the index, without (0) and with (1..3) a required prefix:
    for (int prefixLength = 0; prefixLength <= 3; prefixLength++)
    {
        AssertFuzzyRanking(searcher, "aaaac", prefixLength, "aaaaa", "aaaab", "aaabb");
    }
    AssertFuzzyRanking(searcher, "aaaac", 4, "aaaaa", "aaaab");
    // prefix covers the whole query term, which is not in the index: no hits expected
    AssertFuzzyRanking(searcher, "aaaac", 5);

    // "ddddX" is one edit away from "ddddd", without (0) and with (1..4) a required prefix:
    for (int prefixLength = 0; prefixLength <= 4; prefixLength++)
    {
        AssertFuzzyRanking(searcher, "ddddX", prefixLength, "ddddd");
    }
    AssertFuzzyRanking(searcher, "ddddX", 5);

    // different field = no match:
    hits = SearchFuzzyDefaultEdits(searcher, "anotherfield", "ddddX", 0);
    Assert.AreEqual(0, hits.Length);

    reader.Dispose();
    directory.Dispose();
}

/// <summary>
/// Runs a <see cref="FuzzyQuery"/> with <see cref="FuzzyQuery.DefaultMaxEdits"/> for
/// <paramref name="text"/> in <paramref name="field"/> and returns up to 1000 hits.
/// </summary>
/// <param name="searcher">Searcher to run the query against.</param>
/// <param name="field">Name of the field to query.</param>
/// <param name="text">Query term text.</param>
/// <param name="prefixLength">Prefix length passed to the <see cref="FuzzyQuery"/> constructor.</param>
/// <returns>The hits in the order returned by the searcher.</returns>
private static ScoreDoc[] SearchFuzzyDefaultEdits(IndexSearcher searcher, string field, string text, int prefixLength)
{
    FuzzyQuery query = new FuzzyQuery(new Term(field, text), FuzzyQuery.DefaultMaxEdits, prefixLength);
    return searcher.Search(query, null, 1000).ScoreDocs;
}

/// <summary>
/// Searches "field" for <paramref name="text"/> and asserts that the stored values of the
/// hits equal <paramref name="expectedValues"/>, in rank order. Passing no expected
/// values asserts that the query has no hits.
/// </summary>
/// <param name="searcher">Searcher to run the query against.</param>
/// <param name="text">Query term text.</param>
/// <param name="prefixLength">Prefix length passed to the <see cref="FuzzyQuery"/> constructor.</param>
/// <param name="expectedValues">Expected stored "field" values, best hit first.</param>
private static void AssertFuzzyRanking(IndexSearcher searcher, string text, int prefixLength, params string[] expectedValues)
{
    ScoreDoc[] hits = SearchFuzzyDefaultEdits(searcher, "field", text, prefixLength);
    AssertFieldValuesInRankOrder(searcher, hits, "text=" + text + ", prefixLength=" + prefixLength, expectedValues);
}

/// <summary>
/// Asserts that <paramref name="hits"/> has exactly as many entries as
/// <paramref name="expectedValues"/> and that the stored "field" value of each hit equals
/// the expected value at the same position.
/// </summary>
/// <param name="searcher">Searcher used to load the stored documents.</param>
/// <param name="hits">Hits to verify, in rank order.</param>
/// <param name="context">Message attached to every assertion to identify the failing case.</param>
/// <param name="expectedValues">Expected stored "field" values, best hit first.</param>
private static void AssertFieldValuesInRankOrder(IndexSearcher searcher, ScoreDoc[] hits, string context, string[] expectedValues)
{
    Assert.AreEqual(expectedValues.Length, hits.Length, context);
    for (int i = 0; i < expectedValues.Length; i++)
    {
        string actual = searcher.Doc(hits[i].Doc).Get("field");
        Assert.AreEqual(expectedValues[i], actual, context);
    }
}
/// <summary>
/// Indexes a list of upper-case surnames through a <see cref="MockAnalyzer"/> configured
/// with <see cref="MockTokenizer.KEYWORD"/> and checks how many of them match a fuzzy
/// query for "WEBER" (max edits 2, prefix length 1).
/// </summary>
[Test]
public virtual void Test2()
{
    Directory dir = NewDirectory();
    MockAnalyzer keywordAnalyzer = new MockAnalyzer(Random, MockTokenizer.KEYWORD, false);
    RandomIndexWriter indexWriter = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir, keywordAnalyzer);

    string[] surnames =
    {
        "LANGE", "LUETH", "PIRSING", "RIEGEL", "TRZECZIAK", "WALKER",
        "WBR", "WE", "WEB", "WEBE", "WEBER", "WEBERE",
        "WEBREE", "WEBEREI", "WBRE", "WITTKOPF", "WOJNAROWSKI", "WRICKE"
    };
    foreach (string surname in surnames)
    {
        AddDoc(surname, indexWriter);
    }

    IndexReader indexReader = indexWriter.GetReader();
    IndexSearcher indexSearcher = NewSearcher(indexReader);
    indexWriter.Dispose();

    FuzzyQuery weberQuery = new FuzzyQuery(new Term("field", "WEBER"), 2, 1);
    //query.setRewriteMethod(FuzzyQuery.SCORING_BOOLEAN_QUERY_REWRITE);
    ScoreDoc[] matches = indexSearcher.Search(weberQuery, null, 1000).ScoreDocs;
    Assert.AreEqual(8, matches.Length);

    indexReader.Dispose();
    dir.Dispose();
}
/// <summary>
/// MultiTermQuery provides (via an attribute) information about which values
/// must be competitive to enter the priority queue.
/// <para/>
/// FuzzyQuery optimizes itself around this information; if the attribute
/// is not implemented correctly, there will be problems!
/// <para/>
/// The test searches two separately written indexes through one
/// <see cref="MultiReader"/> with maxExpansions = 2 and checks the total hit count.
/// </summary>
[Test]
public virtual void TestTieBreaker()
{
    Directory firstDirectory = NewDirectory();
    RandomIndexWriter firstWriter = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, firstDirectory);
    foreach (string text in new string[] { "a123456", "c123456", "d123456", "e123456" })
    {
        AddDoc(text, firstWriter);
    }

    Directory secondDirectory = NewDirectory();
    RandomIndexWriter secondWriter = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, secondDirectory);
    foreach (string text in new string[] { "a123456", "b123456", "b123456", "b123456", "c123456", "f123456" })
    {
        AddDoc(text, secondWriter);
    }

    IndexReader firstReader = firstWriter.GetReader();
    IndexReader secondReader = secondWriter.GetReader();
    MultiReader combinedReader = new MultiReader(firstReader, secondReader);
    IndexSearcher combinedSearcher = NewSearcher(combinedReader);

    FuzzyQuery tieQuery = new FuzzyQuery(new Term("field", "z123456"), 1, 0, 2, false);
    TopDocs topDocs = combinedSearcher.Search(tieQuery, 2);
    Assert.AreEqual(5, topDocs.TotalHits); // 5 docs, from the a and b's

    // Release everything in the same order as before: readers, writers, directories.
    combinedReader.Dispose();
    firstReader.Dispose();
    secondReader.Dispose();
    firstWriter.Dispose();
    secondWriter.Dispose();
    firstDirectory.Dispose();
    secondDirectory.Dispose();
}
/// <summary>
/// Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method: with it, both
/// "Lucene" documents are expected to rank ahead of the single "Lucenne" document.
/// </summary>
[Test]
public virtual void TestBoostOnlyRewrite()
{
    Directory dir = NewDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);
    foreach (string text in new string[] { "Lucene", "Lucene", "Lucenne" })
    {
        AddDoc(text, indexWriter);
    }

    IndexReader indexReader = indexWriter.GetReader();
    IndexSearcher indexSearcher = NewSearcher(indexReader);
    indexWriter.Dispose();

    FuzzyQuery fuzzy = new FuzzyQuery(new Term("field", "lucene"));
    fuzzy.MultiTermRewriteMethod = new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50);
    ScoreDoc[] ranked = indexSearcher.Search(fuzzy, null, 1000).ScoreDocs;
    Assert.AreEqual(3, ranked.Length);

    // normally, 'Lucenne' would be the first result as IDF will skew the score.
    string[] expectedOrder = { "Lucene", "Lucene", "Lucenne" };
    for (int rank = 0; rank < expectedOrder.Length; rank++)
    {
        Assert.AreEqual(expectedOrder[rank], indexReader.Document(ranked[rank].Doc).Get("field"));
    }

    indexReader.Dispose();
    dir.Dispose();
}
/// <summary>
/// Indexes a set of multi-word titles and checks that a <see cref="FuzzyQuery"/> for
/// "giga" with a max-edits argument of 0 returns exactly one hit, the "Giga byte" document.
/// </summary>
[Test]
public virtual void TestGiga()
{
    // The analyzer this test used to create here was never handed to the writer and
    // never disposed, so it has been removed.
    Directory index = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, index);

    // Note: no document consisting of just "Giga" is indexed.
    string[] titles =
    {
        "Lucene in Action",
        "Lucene for Dummies",
        "Giga byte",
        "ManagingGigabytesManagingGigabyte",
        "ManagingGigabytesManagingGigabytes",
        "The Art of Computer Science",
        "J. K. Rowling",
        "JK Rowling",
        "Joanne K Roling",
        "Bruce Willis",
        "Willis bruce",
        "Brute willis",
        "B. willis"
    };
    foreach (string title in titles)
    {
        AddDoc(title, w);
    }

    IndexReader r = w.GetReader();
    w.Dispose();

    // Second constructor argument is 0 edits; the lower-case query term presumably
    // relies on the writer's default analysis lower-casing "Giga" (not visible here).
    Query q = new FuzzyQuery(new Term("field", "giga"), 0);

    IndexSearcher searcher = NewSearcher(r);
    ScoreDoc[] hits = searcher.Search(q, 10).ScoreDocs;
    Assert.AreEqual(1, hits.Length);
    Assert.AreEqual("Giga byte", searcher.Doc(hits[0].Doc).Get("field"));

    r.Dispose();
    index.Dispose();
}
/// <summary>
/// Checks that two misspellings of "foobar" each find the "foobar" document when the
/// query is built with a max-edits argument of 2, and that constructing a
/// <see cref="FuzzyQuery"/> with a max-edits argument of 3 throws
/// <see cref="System.ArgumentException"/>.
/// </summary>
[Test]
public virtual void TestDistanceAsEditsSearching()
{
    Directory dir = NewDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);
    foreach (string text in new string[] { "foobar", "test", "working" })
    {
        AddDoc(text, indexWriter);
    }

    IndexReader indexReader = indexWriter.GetReader();
    IndexSearcher indexSearcher = NewSearcher(indexReader);
    indexWriter.Dispose();

    foreach (string misspelling in new string[] { "fouba", "foubara" })
    {
        FuzzyQuery fuzzy = new FuzzyQuery(new Term("field", misspelling), 2);
        ScoreDoc[] found = indexSearcher.Search(fuzzy, 10).ScoreDocs;
        Assert.AreEqual(1, found.Length);
        Assert.AreEqual("foobar", indexSearcher.Doc(found[0].Doc).Get("field"));
    }

    try
    {
        // The constructor itself must throw, so the instance is never used.
        new FuzzyQuery(new Term("field", "t"), 3);
        Assert.Fail();
    }
    catch (System.ArgumentException)
    {
        // expected
    }

    indexReader.Dispose();
    dir.Dispose();
}
/// <summary>
/// Adds one document to <paramref name="writer"/> whose only field, named "field",
/// holds <paramref name="text"/> and is stored.
/// </summary>
/// <param name="text">Value of the document's "field" field.</param>
/// <param name="writer">Writer that receives the new document.</param>
private void AddDoc(string text, RandomIndexWriter writer)
{
    Document document = new Document();
    Field storedText = NewTextField("field", text, Field.Store.YES);
    document.Add(storedText);
    writer.AddDocument(document);
}
}
}