// blob: 68be05d9dc3aa05efc55b9213a9e254325b7f51f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using NUnit.Framework;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using WhitespaceAnalyzer = Lucene.Net.Analysis.WhitespaceAnalyzer;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using IndexReader = Lucene.Net.Index.IndexReader;
using Term = Lucene.Net.Index.Term;
using RAMDirectory = Lucene.Net.Store.RAMDirectory;
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
using Directory = Lucene.Net.Store.Directory;
using MockRAMDirectory = Lucene.Net.Store.MockRAMDirectory;
using QueryParser = Lucene.Net.QueryParsers.QueryParser;
namespace Lucene.Net.Search
{
/// <summary>
/// Tests <see cref="FuzzyQuery"/>.
/// </summary>
[TestFixture]
public class TestFuzzyQuery:LuceneTestCase
{
/// <summary>
/// Runs fuzzy queries against a small index of five-character terms and checks
/// hit counts for increasing prefix arguments, the order of the returned hits,
/// the effect of lowering BooleanQuery.MaxClauseCount, and terms/fields that
/// must not match.
/// </summary>
[Test]
public virtual void TestFuzziness()
{
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true,
                                         IndexWriter.MaxFieldLength.LIMITED);
    AddDoc("aaaaa", writer);
    AddDoc("aaaab", writer);
    AddDoc("aaabb", writer);
    AddDoc("aabbb", writer);
    AddDoc("abbbb", writer);
    AddDoc("bbbbb", writer);
    AddDoc("ddddd", writer);
    writer.Optimize();
    writer.Close();

    IndexSearcher searcher = new IndexSearcher(directory, true);
    try
    {
        Assert.AreEqual(3, FuzzyHits(searcher, "field", "aaaaa", 0).Length);

        // same with prefix
        Assert.AreEqual(3, FuzzyHits(searcher, "field", "aaaaa", 1).Length);
        Assert.AreEqual(3, FuzzyHits(searcher, "field", "aaaaa", 2).Length);
        Assert.AreEqual(3, FuzzyHits(searcher, "field", "aaaaa", 3).Length);
        Assert.AreEqual(2, FuzzyHits(searcher, "field", "aaaaa", 4).Length);
        Assert.AreEqual(1, FuzzyHits(searcher, "field", "aaaaa", 5).Length);
        Assert.AreEqual(1, FuzzyHits(searcher, "field", "aaaaa", 6).Length);

        // test scoring
        ScoreDoc[] hits = FuzzyHits(searcher, "field", "bbbbb", 0);
        Assert.AreEqual(3, hits.Length, "3 documents should match");
        AssertHitFields(searcher, hits, "bbbbb", "abbbb", "aabbb");

        // test BooleanQuery.maxClauseCount
        int savedClauseCount = BooleanQuery.MaxClauseCount;
        try
        {
            BooleanQuery.MaxClauseCount = 2;
            // This query would normally return 3 documents, because 3 terms match (see above):
            hits = FuzzyHits(searcher, "field", "bbbbb", 0);
            Assert.AreEqual(2, hits.Length, "only 2 documents should match");
            AssertHitFields(searcher, hits, "bbbbb", "abbbb");
        }
        finally
        {
            // The limit is static state shared with other tests, so always restore it.
            BooleanQuery.MaxClauseCount = savedClauseCount;
        }

        // not similar enough:
        Assert.AreEqual(0, FuzzyHits(searcher, "field", "xxxxx", 0).Length);
        // edit distance to "aaaaa" = 3
        Assert.AreEqual(0, FuzzyHits(searcher, "field", "aaccc", 0).Length);

        // query identical to a word in the index
        // (default allows for up to two edits, hence "aaaab" and "aaabb"):
        AssertHitFields(searcher, FuzzyHits(searcher, "field", "aaaaa", 0), "aaaaa", "aaaab", "aaabb");

        // query similar to a word in the index:
        AssertHitFields(searcher, FuzzyHits(searcher, "field", "aaaac", 0), "aaaaa", "aaaab", "aaabb");

        // now with prefix
        AssertHitFields(searcher, FuzzyHits(searcher, "field", "aaaac", 1), "aaaaa", "aaaab", "aaabb");
        AssertHitFields(searcher, FuzzyHits(searcher, "field", "aaaac", 2), "aaaaa", "aaaab", "aaabb");
        AssertHitFields(searcher, FuzzyHits(searcher, "field", "aaaac", 3), "aaaaa", "aaaab", "aaabb");
        AssertHitFields(searcher, FuzzyHits(searcher, "field", "aaaac", 4), "aaaaa", "aaaab");
        Assert.AreEqual(0, FuzzyHits(searcher, "field", "aaaac", 5).Length);

        AssertHitFields(searcher, FuzzyHits(searcher, "field", "ddddX", 0), "ddddd");

        // now with prefix
        AssertHitFields(searcher, FuzzyHits(searcher, "field", "ddddX", 1), "ddddd");
        AssertHitFields(searcher, FuzzyHits(searcher, "field", "ddddX", 2), "ddddd");
        AssertHitFields(searcher, FuzzyHits(searcher, "field", "ddddX", 3), "ddddd");
        AssertHitFields(searcher, FuzzyHits(searcher, "field", "ddddX", 4), "ddddd");
        Assert.AreEqual(0, FuzzyHits(searcher, "field", "ddddX", 5).Length);

        // different field = no match:
        Assert.AreEqual(0, FuzzyHits(searcher, "anotherfield", "ddddX", 0).Length);
    }
    finally
    {
        // Release the searcher and directory even when an assertion above fails.
        searcher.Close();
        directory.Close();
    }
}

/// <summary>
/// Builds a FuzzyQuery for the given field and text using
/// FuzzyQuery.defaultMinSimilarity and the given prefix argument, runs it with
/// no filter and a limit of 1000, and returns the resulting score docs.
/// </summary>
private static ScoreDoc[] FuzzyHits(IndexSearcher searcher, string fieldName, string text, int prefixLength)
{
    FuzzyQuery query = new FuzzyQuery(new Term(fieldName, text), FuzzyQuery.defaultMinSimilarity, prefixLength);
    return searcher.Search(query, null, 1000).ScoreDocs;
}

/// <summary>
/// Asserts that the hits are exactly the documents whose stored "field" values
/// equal <paramref name="expectedValues"/>, in the same order.
/// </summary>
private static void AssertHitFields(IndexSearcher searcher, ScoreDoc[] hits, params string[] expectedValues)
{
    Assert.AreEqual(expectedValues.Length, hits.Length);
    for (int i = 0; i < expectedValues.Length; i++)
    {
        // Expected value first, so NUnit's failure message labels the values correctly.
        Assert.AreEqual(expectedValues[i], searcher.Doc(hits[i].Doc).Get("field"));
    }
}
/// <summary>
/// Runs fuzzy queries against an index holding the seven-character terms
/// "aaaaaaa" and "segment", checks which queries match for various prefix
/// arguments and a raised minimum similarity, and verifies that similarity
/// values of 1.1f and -0.1f are rejected with an ArgumentException.
/// </summary>
[Test]
public virtual void TestFuzzinessLong()
{
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    AddDoc("aaaaaaa", writer);
    AddDoc("segment", writer);
    writer.Optimize();
    writer.Close();

    IndexSearcher searcher = new IndexSearcher(directory, true);
    try
    {
        FuzzyQuery query;

        // not similar enough:
        query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
        ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);

        // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
        // in TestFuzziness so a bigger difference is allowed:
        query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 0);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(1, hits.Length);
        Assert.AreEqual("aaaaaaa", searcher.Doc(hits[0].Doc).Get("field"));

        // now with prefix
        query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 1);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(1, hits.Length);
        Assert.AreEqual("aaaaaaa", searcher.Doc(hits[0].Doc).Get("field"));

        query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 4);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(1, hits.Length);
        Assert.AreEqual("aaaaaaa", searcher.Doc(hits[0].Doc).Get("field"));

        query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 5);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);

        // no match, more than half of the characters are wrong:
        query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 0);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);

        // now with prefix
        query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 2);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);

        // "student" and "stellent" are indeed similar to "segment" by default:
        query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 0);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(1, hits.Length);
        query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 0);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(1, hits.Length);

        // now with prefix
        query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 1);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(1, hits.Length);
        query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 1);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(1, hits.Length);
        query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 2);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);
        query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 2);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);

        // "student" doesn't match anymore thanks to increased minimum similarity:
        query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);

        // Out-of-range similarity values must be rejected by the constructor.
        Assert.Throws<ArgumentException>(() => new FuzzyQuery(new Term("field", "student"), 1.1f),
                                         "Expected ArgumentException");
        Assert.Throws<ArgumentException>(() => new FuzzyQuery(new Term("field", "student"), -0.1f),
                                         "Expected ArgumentException");
    }
    finally
    {
        // Release the searcher and directory even when an assertion above fails.
        searcher.Close();
        directory.Close();
    }
}
/// <summary>
/// Runs fuzzy queries with a minimum similarity of 0.9f using query terms of
/// 7, 10, 11 and 12 characters against an index holding "12345678911" and
/// "segment", and checks the hit count for each.
/// </summary>
[Test]
public virtual void TestTokenLengthOpt()
{
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    AddDoc("12345678911", writer);
    AddDoc("segment", writer);
    writer.Optimize();
    writer.Close();

    IndexSearcher searcher = new IndexSearcher(directory, true);
    try
    {
        Query query;

        // term not over 10 chars, so optimization shortcuts
        query = new FuzzyQuery(new Term("field", "1234569"), 0.9f);
        ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);

        // 10 chars, so no optimization
        query = new FuzzyQuery(new Term("field", "1234567891"), 0.9f);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);

        // over 10 chars, so no optimization
        query = new FuzzyQuery(new Term("field", "12345678911"), 0.9f);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(1, hits.Length);

        // over 10 chars, no match
        query = new FuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
        hits = searcher.Search(query, null, 1000).ScoreDocs;
        Assert.AreEqual(0, hits.Length);
    }
    finally
    {
        // Close the searcher and directory, as the sibling tests in this fixture do.
        searcher.Close();
        directory.Close();
    }
}
/// <summary>
/// Indexes a handful of short titles with a StandardAnalyzer, parses the query
/// string "giga~0.9" with the QueryParser, and checks that the only hit is the
/// document whose stored field is "Giga byte".
/// </summary>
[Test]
public virtual void TestGiga()
{
    StandardAnalyzer analyzer = new StandardAnalyzer(Util.Version.LUCENE_CURRENT);
    Directory index = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(index, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    AddDoc("Lucene in Action", w);
    AddDoc("Lucene for Dummies", w);
    // addDoc("Giga", w);
    AddDoc("Giga byte", w);
    AddDoc("ManagingGigabytesManagingGigabyte", w);
    AddDoc("ManagingGigabytesManagingGigabytes", w);
    AddDoc("The Art of Computer Science", w);
    AddDoc("J. K. Rowling", w);
    AddDoc("JK Rowling", w);
    AddDoc("Joanne K Roling", w);
    AddDoc("Bruce Willis", w);
    AddDoc("Willis bruce", w);
    AddDoc("Brute willis", w);
    AddDoc("B. willis", w);

    // The reader is obtained from the writer before the writer is closed.
    IndexReader r = w.GetReader();
    w.Close();

    IndexSearcher searcher = new IndexSearcher(r);
    try
    {
        Query q = new QueryParser(Util.Version.LUCENE_CURRENT, "field", analyzer).Parse("giga~0.9");

        // search
        ScoreDoc[] hits = searcher.Search(q, 10).ScoreDocs;
        Assert.AreEqual(1, hits.Length);
        // Expected value first, so NUnit's failure message labels the values correctly.
        Assert.AreEqual("Giga byte", searcher.Doc(hits[0].Doc).Get("field"));
    }
    finally
    {
        // Release everything even when an assertion fails; the reader is closed
        // before the directory it reads from.
        searcher.Close();
        r.Close();
        index.Close();
    }
}
/// <summary>
/// Adds one document to <paramref name="writer"/> whose only field, named
/// "field", holds <paramref name="text"/> as a stored, analyzed value.
/// </summary>
private void AddDoc(System.String text, IndexWriter writer)
{
    Field textField = new Field("field", text, Field.Store.YES, Field.Index.ANALYZED);

    Document document = new Document();
    document.Add(textField);

    writer.AddDocument(document);
}
}
}