blob: d2dfcec2c5068d7ab85cfe7980aab054fd4ed7a8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import static org.hamcrest.CoreMatchers.containsString;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Supplier;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import com.carrotsearch.randomizedtesting.RandomizedTest;
/**
* Tests {@link FuzzyQuery}.
*
*/
public class TestFuzzyQuery extends LuceneTestCase {
public void testBasicPrefix() throws Exception {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
addDoc("abc", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
writer.close();
FuzzyQuery query = new FuzzyQuery(new Term("field", "abc"), FuzzyQuery.defaultMaxEdits, 1);
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals(1, hits.length);
reader.close();
directory.close();
}
public void testFuzziness() throws Exception {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
addDoc("aaaaa", writer);
addDoc("aaaab", writer);
addDoc("aaabb", writer);
addDoc("aabbb", writer);
addDoc("abbbb", writer);
addDoc("bbbbb", writer);
addDoc("ddddd", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
writer.close();
FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0);
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
// same with prefix
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 1);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 2);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 3);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 4);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(2, hits.length);
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 5);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(1, hits.length);
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 6);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(1, hits.length);
// test scoring
query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("3 documents should match", 3, hits.length);
List<String> order = Arrays.asList("bbbbb","abbbb","aabbb");
for (int i = 0; i < hits.length; i++) {
final String term = searcher.doc(hits[i].doc).get("field");
//System.out.println(hits[i].score);
assertEquals(order.get(i), term);
}
// test pq size by supplying maxExpansions=2
// This query would normally return 3 documents, because 3 terms match (see above):
query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0, 2, false);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("only 2 documents should match", 2, hits.length);
order = Arrays.asList("bbbbb","abbbb");
for (int i = 0; i < hits.length; i++) {
final String term = searcher.doc(hits[i].doc).get("field");
//System.out.println(hits[i].score);
assertEquals(order.get(i), term);
}
// not similar enough:
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(0, hits.length);
query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMaxEdits, 0); // edit distance to "aaaaa" = 3
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(0, hits.length);
// query identical to a word in the index:
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
// default allows for up to two edits:
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
// query similar to a word in the index:
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
// now with prefix
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 1);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 2);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 3);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 4);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(2, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 5);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(0, hits.length);
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
// now with prefix
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 1);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 2);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 3);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 4);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 5);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(0, hits.length);
// different field = no match:
query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals(0, hits.length);
reader.close();
directory.close();
}
public void test2() throws Exception {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
addDoc("LANGE", writer);
addDoc("LUETH", writer);
addDoc("PIRSING", writer);
addDoc("RIEGEL", writer);
addDoc("TRZECZIAK", writer);
addDoc("WALKER", writer);
addDoc("WBR", writer);
addDoc("WE", writer);
addDoc("WEB", writer);
addDoc("WEBE", writer);
addDoc("WEBER", writer);
addDoc("WEBERE", writer);
addDoc("WEBREE", writer);
addDoc("WEBEREI", writer);
addDoc("WBRE", writer);
addDoc("WITTKOPF", writer);
addDoc("WOJNAROWSKI", writer);
addDoc("WRICKE", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
writer.close();
FuzzyQuery query = new FuzzyQuery(new Term("field", "WEBER"), 2, 1);
//query.setRewriteMethod(FuzzyQuery.SCORING_BOOLEAN_QUERY_REWRITE);
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals(8, hits.length);
reader.close();
directory.close();
}
public void testSingleQueryExactMatchScoresHighest() throws Exception {
//See issue LUCENE-329 - IDF shouldn't wreck similarity ranking
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
addDoc("smith", writer);
addDoc("smith", writer);
addDoc("smith", writer);
addDoc("smith", writer);
addDoc("smith", writer);
addDoc("smith", writer);
addDoc("smythe", writer);
addDoc("smdssasd", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarity(new ClassicSimilarity()); //avoid randomisation of similarity algo by test framework
writer.close();
String searchTerms[] = { "smith", "smythe", "smdssasd" };
for (String searchTerm : searchTerms) {
FuzzyQuery query = new FuzzyQuery(new Term("field", searchTerm), 2, 1);
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
Document bestDoc = searcher.doc(hits[0].doc);
assertTrue(hits.length > 0);
String topMatch = bestDoc.get("field");
assertEquals(searchTerm, topMatch);
if (hits.length > 1) {
Document worstDoc = searcher.doc(hits[hits.length - 1].doc);
String worstMatch = worstDoc.get("field");
assertNotSame(searchTerm, worstMatch);
}
}
reader.close();
directory.close();
}
public void testMultipleQueriesIdfWorks() throws Exception {
// With issue LUCENE-329 - it could be argued a MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite
// is the solution as it disables IDF.
// However - IDF is still useful as in this case where there are multiple FuzzyQueries.
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
addDoc("michael smith", writer);
addDoc("michael lucero", writer);
addDoc("doug cutting", writer);
addDoc("doug cuttin", writer);
addDoc("michael wardle", writer);
addDoc("micheal vegas", writer);
addDoc("michael lydon", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarity(new ClassicSimilarity()); //avoid randomisation of similarity algo by test framework
writer.close();
BooleanQuery.Builder query = new BooleanQuery.Builder();
String commonSearchTerm = "michael";
FuzzyQuery commonQuery = new FuzzyQuery(new Term("field", commonSearchTerm), 2, 1);
query.add(commonQuery, Occur.SHOULD);
String rareSearchTerm = "cutting";
FuzzyQuery rareQuery = new FuzzyQuery(new Term("field", rareSearchTerm), 2, 1);
query.add(rareQuery, Occur.SHOULD);
ScoreDoc[] hits = searcher.search(query.build(), 1000).scoreDocs;
// Matches on the rare surname should be worth more than matches on the common forename
assertEquals(7, hits.length);
Document bestDoc = searcher.doc(hits[0].doc);
String topMatch = bestDoc.get("field");
assertTrue(topMatch.contains(rareSearchTerm));
Document runnerUpDoc = searcher.doc(hits[1].doc);
String runnerUpMatch = runnerUpDoc.get("field");
assertTrue(runnerUpMatch.contains("cuttin"));
Document worstDoc = searcher.doc(hits[hits.length - 1].doc);
String worstMatch = worstDoc.get("field");
assertTrue(worstMatch.contains("micheal")); //misspelling of common name
reader.close();
directory.close();
}
/**
* MultiTermQuery provides (via attribute) information about which values
* must be competitive to enter the priority queue.
*
* FuzzyQuery optimizes itself around this information, if the attribute
* is not implemented correctly, there will be problems!
*/
public void testTieBreaker() throws Exception {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
addDoc("a123456", writer);
addDoc("c123456", writer);
addDoc("d123456", writer);
addDoc("e123456", writer);
Directory directory2 = newDirectory();
RandomIndexWriter writer2 = new RandomIndexWriter(random(), directory2);
addDoc("a123456", writer2);
addDoc("b123456", writer2);
addDoc("b123456", writer2);
addDoc("b123456", writer2);
addDoc("c123456", writer2);
addDoc("f123456", writer2);
IndexReader ir1 = writer.getReader();
IndexReader ir2 = writer2.getReader();
MultiReader mr = new MultiReader(ir1, ir2);
IndexSearcher searcher = newSearcher(mr);
FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1, 0, 2, false);
TopDocs docs = searcher.search(fq, 2);
assertEquals(5, docs.totalHits.value); // 5 docs, from the a and b's
mr.close();
ir1.close();
ir2.close();
writer.close();
writer2.close();
directory.close();
directory2.close();
}
/** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */
public void testBoostOnlyRewrite() throws Exception {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
addDoc("Lucene", writer);
addDoc("Lucene", writer);
addDoc("Lucenne", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
writer.close();
FuzzyQuery query = new FuzzyQuery(new Term("field", "lucene"));
query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50));
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
// normally, 'Lucenne' would be the first result as IDF will skew the score.
assertEquals("Lucene", reader.document(hits[0].doc).get("field"));
assertEquals("Lucene", reader.document(hits[1].doc).get("field"));
assertEquals("Lucenne", reader.document(hits[2].doc).get("field"));
reader.close();
directory.close();
}
public void testGiga() throws Exception {
Directory index = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), index);
addDoc("Lucene in Action", w);
addDoc("Lucene for Dummies", w);
//addDoc("Giga", w);
addDoc("Giga byte", w);
addDoc("ManagingGigabytesManagingGigabyte", w);
addDoc("ManagingGigabytesManagingGigabytes", w);
addDoc("The Art of Computer Science", w);
addDoc("J. K. Rowling", w);
addDoc("JK Rowling", w);
addDoc("Joanne K Roling", w);
addDoc("Bruce Willis", w);
addDoc("Willis bruce", w);
addDoc("Brute willis", w);
addDoc("B. willis", w);
IndexReader r = w.getReader();
w.close();
Query q = new FuzzyQuery(new Term("field", "giga"), 0);
// 3. search
IndexSearcher searcher = newSearcher(r);
ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
assertEquals(1, hits.length);
assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field"));
r.close();
w.close();
index.close();
}
public void testDistanceAsEditsSearching() throws Exception {
Directory index = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), index);
addDoc("foobar", w);
addDoc("test", w);
addDoc("working", w);
IndexReader reader = w.getReader();
IndexSearcher searcher = newSearcher(reader);
w.close();
FuzzyQuery q = new FuzzyQuery(new Term("field", "fouba"), 2);
ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
assertEquals(1, hits.length);
assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));
q = new FuzzyQuery(new Term("field", "foubara"), 2);
hits = searcher.search(q, 10).scoreDocs;
assertEquals(1, hits.length);
assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));
expectThrows(IllegalArgumentException.class, () -> {
new FuzzyQuery(new Term("field", "t"), 3);
});
reader.close();
index.close();
}
public void testValidation() {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
new FuzzyQuery(new Term("field", "foo"), -1, 0, 1, false);
});
assertTrue(expected.getMessage().contains("maxEdits"));
expected = expectThrows(IllegalArgumentException.class, () -> {
new FuzzyQuery(new Term("field", "foo"), LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + 1, 0, 1, false);
});
assertTrue(expected.getMessage().contains("maxEdits must be between"));
expected = expectThrows(IllegalArgumentException.class, () -> {
new FuzzyQuery(new Term("field", "foo"), 1, -1, 1, false);
});
assertTrue(expected.getMessage().contains("prefixLength cannot be negative"));
expected = expectThrows(IllegalArgumentException.class, () -> {
new FuzzyQuery(new Term("field", "foo"), 1, 0, -1, false);
});
assertTrue(expected.getMessage().contains("maxExpansions must be positive"));
expected = expectThrows(IllegalArgumentException.class, () -> {
new FuzzyQuery(new Term("field", "foo"), 1, 0, -1, false);
});
assertTrue(expected.getMessage().contains("maxExpansions must be positive"));
}
private String randomRealisticMultiByteUnicode(int length) {
while (true) {
// There is 1 single-byte unicode block, and 194 multi-byte blocks
String value = RandomizedTest.randomRealisticUnicodeOfCodepointLength(length);
if (value.charAt(0) > Byte.MAX_VALUE) {
return value;
}
}
}
public void testErrorMessage() {
// 45 states per vector from Lev2TParametricDescription
final int length = (Operations.DEFAULT_DETERMINIZE_WORK_LIMIT / 5) + 10;
final String value = randomRealisticMultiByteUnicode(length);
FuzzyTermsEnum.FuzzyTermsException expected =
expectThrows(
FuzzyTermsEnum.FuzzyTermsException.class,
() -> {
new FuzzyAutomatonBuilder(value, 2, 0, true).buildMaxEditAutomaton();
});
assertThat(expected.getMessage(), containsString(value));
expected =
expectThrows(
FuzzyTermsEnum.FuzzyTermsException.class,
() -> new FuzzyAutomatonBuilder(value, 2, 0, true).buildAutomatonSet());
assertThat(expected.getMessage(), containsString(value));
}
private void addDoc(String text, RandomIndexWriter writer) throws IOException {
Document doc = new Document();
doc.add(newTextField("field", text, Field.Store.YES));
writer.addDocument(doc);
}
private String randomSimpleString(int digits) {
int termLength = TestUtil.nextInt(random(), 1, 8);
char[] chars = new char[termLength];
for(int i=0;i<termLength;i++) {
chars[i] = (char) ('a' + random().nextInt(digits));
}
return new String(chars);
}
@SuppressWarnings({"unchecked","rawtypes"})
public void testRandom() throws Exception {
int digits = TestUtil.nextInt(random(), 2, 3);
// underestimated total number of unique terms that randomSimpleString
// maybe generate, it assumes all terms have a length of 7
int vocabularySize = digits << 7;
int numTerms = Math.min(atLeast(100), vocabularySize);
Set<String> terms = new HashSet<>();
while (terms.size() < numTerms) {
terms.add(randomSimpleString(digits));
}
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
for(String term : terms) {
Document doc = new Document();
doc.add(new StringField("field", term, Field.Store.YES));
w.addDocument(doc);
}
DirectoryReader r = w.getReader();
w.close();
//System.out.println("TEST: reader=" + r);
IndexSearcher s = newSearcher(r);
int iters = atLeast(200);
for(int iter=0;iter<iters;iter++) {
String queryTerm = randomSimpleString(digits);
int prefixLength = random().nextInt(queryTerm.length());
String queryPrefix = queryTerm.substring(0, prefixLength);
// we don't look at scores here:
List<TermAndScore>[] expected = new List[3];
for(int ed=0;ed<3;ed++) {
expected[ed] = new ArrayList<TermAndScore>();
}
for(String term : terms) {
if (term.startsWith(queryPrefix) == false) {
continue;
}
int ed = getDistance(term, queryTerm);
float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
while (ed < 3) {
expected[ed].add(new TermAndScore(term, score));
ed++;
}
}
for(int ed=0;ed<3;ed++) {
Collections.sort(expected[ed]);
int queueSize = TestUtil.nextInt(random(), 1, terms.size());
/*
System.out.println("\nTEST: query=" + queryTerm + " ed=" + ed + " queueSize=" + queueSize + " vs expected match size=" + expected[ed].size() + " prefixLength=" + prefixLength);
for(TermAndScore ent : expected[ed]) {
System.out.println(" " + ent);
}
*/
FuzzyQuery query = new FuzzyQuery(new Term("field", queryTerm), ed, prefixLength, queueSize, true);
TopDocs hits = s.search(query, terms.size());
Set<String> actual = new HashSet<>();
for(ScoreDoc hit : hits.scoreDocs) {
Document doc = s.doc(hit.doc);
actual.add(doc.get("field"));
//System.out.println(" actual: " + doc.get("field") + " score=" + hit.score);
}
Set<String> expectedTop = new HashSet<>();
int limit = Math.min(queueSize, expected[ed].size());
for(int i=0;i<limit;i++) {
expectedTop.add(expected[ed].get(i).term);
}
if (actual.equals(expectedTop) == false) {
StringBuilder sb = new StringBuilder();
sb.append("FAILED: query=" + queryTerm + " ed=" + ed + " queueSize=" + queueSize + " vs expected match size=" + expected[ed].size() + " prefixLength=" + prefixLength + "\n");
boolean first = true;
for(String term : actual) {
if (expectedTop.contains(term) == false) {
if (first) {
sb.append(" these matched but shouldn't:\n");
first = false;
}
sb.append(" " + term + "\n");
}
}
first = true;
for(String term : expectedTop) {
if (actual.contains(term) == false) {
if (first) {
sb.append(" these did not match but should:\n");
first = false;
}
sb.append(" " + term + "\n");
}
}
throw new AssertionError(sb.toString());
}
}
}
IOUtils.close(r, dir);
}
private static class TermAndScore implements Comparable<TermAndScore> {
final String term;
final float score;
public TermAndScore(String term, float score) {
this.term = term;
this.score = score;
}
@Override
public int compareTo(TermAndScore other) {
// higher score sorts first, and if scores are tied, lower term sorts first
if (score > other.score) {
return -1;
} else if (score < other.score) {
return 1;
} else {
return term.compareTo(other.term);
}
}
@Override
public String toString() {
return term + " score=" + score;
}
}
// Poached from LuceneLevenshteinDistance.java (from suggest module): it supports transpositions (treats them as ed=1, not ed=2)
private static int getDistance(String target, String other) {
IntsRef targetPoints;
IntsRef otherPoints;
int n;
int d[][]; // cost array
// NOTE: if we cared, we could 3*m space instead of m*n space, similar to
// what LevenshteinDistance does, except cycling thru a ring of three
// horizontal cost arrays... but this comparator is never actually used by
// DirectSpellChecker, it's only used for merging results from multiple shards
// in "distributed spellcheck", and it's inefficient in other ways too...
// cheaper to do this up front once
targetPoints = toIntsRef(target);
otherPoints = toIntsRef(other);
n = targetPoints.length;
final int m = otherPoints.length;
d = new int[n+1][m+1];
if (n == 0 || m == 0) {
if (n == m) {
return 0;
}
else {
return Math.max(n, m);
}
}
// indexes into strings s and t
int i; // iterates through s
int j; // iterates through t
int t_j; // jth character of t
int cost; // cost
for (i = 0; i<=n; i++) {
d[i][0] = i;
}
for (j = 0; j<=m; j++) {
d[0][j] = j;
}
for (j = 1; j<=m; j++) {
t_j = otherPoints.ints[j-1];
for (i=1; i<=n; i++) {
cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost
d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
// transposition
if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
}
}
}
return d[n][m];
}
private static IntsRef toIntsRef(String s) {
IntsRef ref = new IntsRef(s.length()); // worst case
int utf16Len = s.length();
for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
}
return ref;
}
public void testVisitor() {
FuzzyQuery q = new FuzzyQuery(new Term("field", "blob"), 2);
AtomicBoolean visited = new AtomicBoolean(false);
q.visit(new QueryVisitor() {
@Override
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
visited.set(true);
ByteRunAutomaton a = automaton.get();
assertMatches(a, "blob");
assertMatches(a, "bolb");
assertMatches(a, "blobby");
assertNoMatches(a, "bolbby");
}
});
assertTrue(visited.get());
}
private static void assertMatches(ByteRunAutomaton automaton, String text) {
BytesRef b = new BytesRef(text);
assertTrue(automaton.run(b.bytes, b.offset, b.length));
}
private static void assertNoMatches(ByteRunAutomaton automaton, String text) {
BytesRef b = new BytesRef(text);
assertFalse(automaton.run(b.bytes, b.offset, b.length));
}
}