| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search; |
| |
| import static org.hamcrest.CoreMatchers.containsString; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Set; |
| import java.util.concurrent.atomic.AtomicBoolean; |
| import java.util.function.Supplier; |
| |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.StringField; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.MultiReader; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.BooleanClause.Occur; |
| import org.apache.lucene.search.similarities.ClassicSimilarity; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.automaton.ByteRunAutomaton; |
| import org.apache.lucene.util.automaton.LevenshteinAutomata; |
| import org.apache.lucene.util.automaton.Operations; |
| |
| import com.carrotsearch.randomizedtesting.RandomizedTest; |
| |
| /** |
| * Tests {@link FuzzyQuery}. |
| * |
| */ |
| public class TestFuzzyQuery extends LuceneTestCase { |
| |
| public void testBasicPrefix() throws Exception { |
| Directory directory = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), directory); |
| addDoc("abc", writer); |
| IndexReader reader = writer.getReader(); |
| IndexSearcher searcher = newSearcher(reader); |
| writer.close(); |
| |
| FuzzyQuery query = new FuzzyQuery(new Term("field", "abc"), FuzzyQuery.defaultMaxEdits, 1); |
| ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals(1, hits.length); |
| reader.close(); |
| directory.close(); |
| } |
| |
| public void testFuzziness() throws Exception { |
| Directory directory = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), directory); |
| addDoc("aaaaa", writer); |
| addDoc("aaaab", writer); |
| addDoc("aaabb", writer); |
| addDoc("aabbb", writer); |
| addDoc("abbbb", writer); |
| addDoc("bbbbb", writer); |
| addDoc("ddddd", writer); |
| |
| IndexReader reader = writer.getReader(); |
| IndexSearcher searcher = newSearcher(reader); |
| writer.close(); |
| |
| FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0); |
| ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals(3, hits.length); |
| |
    // same with prefix: any prefix length within the shared prefix "aaa"
    // leaves the result set unchanged
    for (int prefixLength = 1; prefixLength <= 3; prefixLength++) {
      query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, prefixLength);
      hits = searcher.search(query, 1000).scoreDocs;
      assertEquals(3, hits.length);
    }
    // a prefix of 4 excludes "aaabb"; prefixes of 5 and 6 leave only the exact term
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 4);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(2, hits.length);
    for (int prefixLength = 5; prefixLength <= 6; prefixLength++) {
      query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, prefixLength);
      hits = searcher.search(query, 1000).scoreDocs;
      assertEquals(1, hits.length);
    }
| |
| // test scoring |
| query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0); |
| hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals("3 documents should match", 3, hits.length); |
| List<String> order = Arrays.asList("bbbbb","abbbb","aabbb"); |
| for (int i = 0; i < hits.length; i++) { |
| final String term = searcher.doc(hits[i].doc).get("field"); |
| //System.out.println(hits[i].score); |
| assertEquals(order.get(i), term); |
| } |
| |
| // test pq size by supplying maxExpansions=2 |
| // This query would normally return 3 documents, because 3 terms match (see above): |
| query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0, 2, false); |
| hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals("only 2 documents should match", 2, hits.length); |
| order = Arrays.asList("bbbbb","abbbb"); |
| for (int i = 0; i < hits.length; i++) { |
| final String term = searcher.doc(hits[i].doc).get("field"); |
| //System.out.println(hits[i].score); |
| assertEquals(order.get(i), term); |
| } |
| |
| // not similar enough: |
| query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMaxEdits, 0); |
| hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMaxEdits, 0); // edit distance to "aaaaa" = 3 |
| hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| // query identical to a word in the index: |
| query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0); |
| hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals(3, hits.length); |
    assertEquals("aaaaa", searcher.doc(hits[0].doc).get("field"));
    // default allows for up to two edits:
    assertEquals("aaaab", searcher.doc(hits[1].doc).get("field"));
    assertEquals("aaabb", searcher.doc(hits[2].doc).get("field"));
| |
| // query similar to a word in the index: |
| query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 0); |
| hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals(3, hits.length); |
    assertEquals("aaaaa", searcher.doc(hits[0].doc).get("field"));
    assertEquals("aaaab", searcher.doc(hits[1].doc).get("field"));
    assertEquals("aaabb", searcher.doc(hits[2].doc).get("field"));
| |
    // now with prefix: any prefix length within the shared prefix "aaa"
    // leaves the result set and order unchanged
    for (int prefixLength = 1; prefixLength <= 3; prefixLength++) {
      query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, prefixLength);
      hits = searcher.search(query, 1000).scoreDocs;
      assertEquals(3, hits.length);
      assertEquals("aaaaa", searcher.doc(hits[0].doc).get("field"));
      assertEquals("aaaab", searcher.doc(hits[1].doc).get("field"));
      assertEquals("aaabb", searcher.doc(hits[2].doc).get("field"));
    }
    // a prefix of 4 ("aaaa") excludes "aaabb"; a prefix of 5 ("aaaac") matches nothing
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 4);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(2, hits.length);
    assertEquals("aaaaa", searcher.doc(hits[0].doc).get("field"));
    assertEquals("aaaab", searcher.doc(hits[1].doc).get("field"));
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 5);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(0, hits.length);
| |
| query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 0); |
| hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals(1, hits.length); |
    assertEquals("ddddd", searcher.doc(hits[0].doc).get("field"));
| |
    // now with prefix: "ddddd" shares the first four characters with "ddddX"
    for (int prefixLength = 1; prefixLength <= 4; prefixLength++) {
      query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, prefixLength);
      hits = searcher.search(query, 1000).scoreDocs;
      assertEquals(1, hits.length);
      assertEquals("ddddd", searcher.doc(hits[0].doc).get("field"));
    }
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 5);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(0, hits.length);
| |
| // different field = no match: |
| query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMaxEdits, 0); |
| hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| reader.close(); |
| directory.close(); |
| } |
| |
  /**
   * A fuzzy match on "WEBER" with maxEdits=2 and prefixLength=1 should hit the
   * eight indexed terms that are within two edits and keep the leading 'W'.
   */
  public void test2() throws Exception {
| Directory directory = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), directory, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); |
| addDoc("LANGE", writer); |
| addDoc("LUETH", writer); |
| addDoc("PIRSING", writer); |
| addDoc("RIEGEL", writer); |
| addDoc("TRZECZIAK", writer); |
| addDoc("WALKER", writer); |
| addDoc("WBR", writer); |
| addDoc("WE", writer); |
| addDoc("WEB", writer); |
| addDoc("WEBE", writer); |
| addDoc("WEBER", writer); |
| addDoc("WEBERE", writer); |
| addDoc("WEBREE", writer); |
| addDoc("WEBEREI", writer); |
| addDoc("WBRE", writer); |
| addDoc("WITTKOPF", writer); |
| addDoc("WOJNAROWSKI", writer); |
| addDoc("WRICKE", writer); |
| |
| IndexReader reader = writer.getReader(); |
| IndexSearcher searcher = newSearcher(reader); |
| writer.close(); |
| |
| FuzzyQuery query = new FuzzyQuery(new Term("field", "WEBER"), 2, 1); |
| //query.setRewriteMethod(FuzzyQuery.SCORING_BOOLEAN_QUERY_REWRITE); |
| ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals(8, hits.length); |
| |
| reader.close(); |
| directory.close(); |
| } |
| |
| public void testSingleQueryExactMatchScoresHighest() throws Exception { |
| //See issue LUCENE-329 - IDF shouldn't wreck similarity ranking |
| Directory directory = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), directory); |
| addDoc("smith", writer); |
| addDoc("smith", writer); |
| addDoc("smith", writer); |
| addDoc("smith", writer); |
| addDoc("smith", writer); |
| addDoc("smith", writer); |
| addDoc("smythe", writer); |
| addDoc("smdssasd", writer); |
| |
| IndexReader reader = writer.getReader(); |
| IndexSearcher searcher = newSearcher(reader); |
| searcher.setSimilarity(new ClassicSimilarity()); //avoid randomisation of similarity algo by test framework |
| writer.close(); |
    String[] searchTerms = { "smith", "smythe", "smdssasd" };
| for (String searchTerm : searchTerms) { |
| FuzzyQuery query = new FuzzyQuery(new Term("field", searchTerm), 2, 1); |
| ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; |
      assertTrue(hits.length > 0);
      Document bestDoc = searcher.doc(hits[0].doc);
| String topMatch = bestDoc.get("field"); |
| assertEquals(searchTerm, topMatch); |
| if (hits.length > 1) { |
| Document worstDoc = searcher.doc(hits[hits.length - 1].doc); |
| String worstMatch = worstDoc.get("field"); |
| assertNotSame(searchTerm, worstMatch); |
| } |
| } |
| reader.close(); |
| directory.close(); |
| } |
| |
| public void testMultipleQueriesIdfWorks() throws Exception { |
    // With issue LUCENE-329 it could be argued that MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite
    // is the solution, as it disables IDF.
    // However, IDF is still useful in cases like this one, where multiple FuzzyQueries are combined.
| Directory directory = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), directory); |
| |
| addDoc("michael smith", writer); |
| addDoc("michael lucero", writer); |
| addDoc("doug cutting", writer); |
| addDoc("doug cuttin", writer); |
| addDoc("michael wardle", writer); |
| addDoc("micheal vegas", writer); |
| addDoc("michael lydon", writer); |
| |
| IndexReader reader = writer.getReader(); |
| IndexSearcher searcher = newSearcher(reader); |
| searcher.setSimilarity(new ClassicSimilarity()); //avoid randomisation of similarity algo by test framework |
| |
| writer.close(); |
| |
| BooleanQuery.Builder query = new BooleanQuery.Builder(); |
| String commonSearchTerm = "michael"; |
| FuzzyQuery commonQuery = new FuzzyQuery(new Term("field", commonSearchTerm), 2, 1); |
| query.add(commonQuery, Occur.SHOULD); |
| |
| String rareSearchTerm = "cutting"; |
| FuzzyQuery rareQuery = new FuzzyQuery(new Term("field", rareSearchTerm), 2, 1); |
| query.add(rareQuery, Occur.SHOULD); |
| ScoreDoc[] hits = searcher.search(query.build(), 1000).scoreDocs; |
| |
| // Matches on the rare surname should be worth more than matches on the common forename |
| assertEquals(7, hits.length); |
| Document bestDoc = searcher.doc(hits[0].doc); |
| String topMatch = bestDoc.get("field"); |
| assertTrue(topMatch.contains(rareSearchTerm)); |
| |
| Document runnerUpDoc = searcher.doc(hits[1].doc); |
| String runnerUpMatch = runnerUpDoc.get("field"); |
| assertTrue(runnerUpMatch.contains("cuttin")); |
| |
| Document worstDoc = searcher.doc(hits[hits.length - 1].doc); |
| String worstMatch = worstDoc.get("field"); |
| assertTrue(worstMatch.contains("micheal")); //misspelling of common name |
| |
| reader.close(); |
| directory.close(); |
| } |
| |
| /** |
| * MultiTermQuery provides (via attribute) information about which values |
| * must be competitive to enter the priority queue. |
| * |
| * FuzzyQuery optimizes itself around this information, if the attribute |
| * is not implemented correctly, there will be problems! |
| */ |
| public void testTieBreaker() throws Exception { |
| Directory directory = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), directory); |
| addDoc("a123456", writer); |
| addDoc("c123456", writer); |
| addDoc("d123456", writer); |
| addDoc("e123456", writer); |
| |
| Directory directory2 = newDirectory(); |
| RandomIndexWriter writer2 = new RandomIndexWriter(random(), directory2); |
| addDoc("a123456", writer2); |
| addDoc("b123456", writer2); |
| addDoc("b123456", writer2); |
| addDoc("b123456", writer2); |
| addDoc("c123456", writer2); |
| addDoc("f123456", writer2); |
| |
| IndexReader ir1 = writer.getReader(); |
| IndexReader ir2 = writer2.getReader(); |
| |
| MultiReader mr = new MultiReader(ir1, ir2); |
| IndexSearcher searcher = newSearcher(mr); |
| FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1, 0, 2, false); |
| TopDocs docs = searcher.search(fq, 2); |
| assertEquals(5, docs.totalHits.value); // 5 docs, from the a and b's |
| mr.close(); |
| ir1.close(); |
| ir2.close(); |
| writer.close(); |
| writer2.close(); |
| directory.close(); |
| directory2.close(); |
| } |
| |
| /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */ |
| public void testBoostOnlyRewrite() throws Exception { |
| Directory directory = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), directory); |
| addDoc("Lucene", writer); |
| addDoc("Lucene", writer); |
| addDoc("Lucenne", writer); |
| |
| IndexReader reader = writer.getReader(); |
| IndexSearcher searcher = newSearcher(reader); |
| writer.close(); |
| |
| FuzzyQuery query = new FuzzyQuery(new Term("field", "lucene")); |
| query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50)); |
| ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals(3, hits.length); |
    // Normally 'Lucenne' would be the first result, as IDF would skew the score.
| assertEquals("Lucene", reader.document(hits[0].doc).get("field")); |
| assertEquals("Lucene", reader.document(hits[1].doc).get("field")); |
| assertEquals("Lucenne", reader.document(hits[2].doc).get("field")); |
| reader.close(); |
| directory.close(); |
| } |
| |
| public void testGiga() throws Exception { |
| |
| Directory index = newDirectory(); |
| RandomIndexWriter w = new RandomIndexWriter(random(), index); |
| |
| addDoc("Lucene in Action", w); |
| addDoc("Lucene for Dummies", w); |
| |
| //addDoc("Giga", w); |
| addDoc("Giga byte", w); |
| |
| addDoc("ManagingGigabytesManagingGigabyte", w); |
| addDoc("ManagingGigabytesManagingGigabytes", w); |
| |
| addDoc("The Art of Computer Science", w); |
| addDoc("J. K. Rowling", w); |
| addDoc("JK Rowling", w); |
| addDoc("Joanne K Roling", w); |
| addDoc("Bruce Willis", w); |
| addDoc("Willis bruce", w); |
| addDoc("Brute willis", w); |
| addDoc("B. willis", w); |
| IndexReader r = w.getReader(); |
| w.close(); |
| |
| Query q = new FuzzyQuery(new Term("field", "giga"), 0); |
| |
| IndexSearcher searcher = newSearcher(r); |
| ScoreDoc[] hits = searcher.search(q, 10).scoreDocs; |
| assertEquals(1, hits.length); |
| assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field")); |
| r.close(); |
| index.close(); |
| } |
| |
| public void testDistanceAsEditsSearching() throws Exception { |
| Directory index = newDirectory(); |
| RandomIndexWriter w = new RandomIndexWriter(random(), index); |
| addDoc("foobar", w); |
| addDoc("test", w); |
| addDoc("working", w); |
| IndexReader reader = w.getReader(); |
| IndexSearcher searcher = newSearcher(reader); |
| w.close(); |
| |
| FuzzyQuery q = new FuzzyQuery(new Term("field", "fouba"), 2); |
| ScoreDoc[] hits = searcher.search(q, 10).scoreDocs; |
| assertEquals(1, hits.length); |
| assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); |
| |
| q = new FuzzyQuery(new Term("field", "foubara"), 2); |
| hits = searcher.search(q, 10).scoreDocs; |
| assertEquals(1, hits.length); |
| assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); |
| |
| expectThrows(IllegalArgumentException.class, () -> { |
| new FuzzyQuery(new Term("field", "t"), 3); |
| }); |
| |
| reader.close(); |
| index.close(); |
| } |
| |
| public void testValidation() { |
| IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { |
| new FuzzyQuery(new Term("field", "foo"), -1, 0, 1, false); |
| }); |
| assertTrue(expected.getMessage().contains("maxEdits")); |
| |
| expected = expectThrows(IllegalArgumentException.class, () -> { |
| new FuzzyQuery(new Term("field", "foo"), LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + 1, 0, 1, false); |
| }); |
| assertTrue(expected.getMessage().contains("maxEdits must be between")); |
| |
| expected = expectThrows(IllegalArgumentException.class, () -> { |
| new FuzzyQuery(new Term("field", "foo"), 1, -1, 1, false); |
| }); |
| assertTrue(expected.getMessage().contains("prefixLength cannot be negative")); |
| |
| expected = expectThrows(IllegalArgumentException.class, () -> { |
| new FuzzyQuery(new Term("field", "foo"), 1, 0, -1, false); |
| }); |
| assertTrue(expected.getMessage().contains("maxExpansions must be positive")); |
| |
    // zero is rejected too: maxExpansions must be strictly positive
    expected = expectThrows(IllegalArgumentException.class, () -> {
      new FuzzyQuery(new Term("field", "foo"), 1, 0, 0, false);
    });
    assertTrue(expected.getMessage().contains("maxExpansions must be positive"));
| } |
| |
| private String randomRealisticMultiByteUnicode(int length) { |
| while (true) { |
| // There is 1 single-byte unicode block, and 194 multi-byte blocks |
| String value = RandomizedTest.randomRealisticUnicodeOfCodepointLength(length); |
| if (value.charAt(0) > Byte.MAX_VALUE) { |
| return value; |
| } |
| } |
| } |
| |
| public void testErrorMessage() { |
| // 45 states per vector from Lev2TParametricDescription |
| final int length = (Operations.DEFAULT_DETERMINIZE_WORK_LIMIT / 5) + 10; |
| final String value = randomRealisticMultiByteUnicode(length); |
| |
| FuzzyTermsEnum.FuzzyTermsException expected = |
| expectThrows( |
| FuzzyTermsEnum.FuzzyTermsException.class, |
| () -> { |
| new FuzzyAutomatonBuilder(value, 2, 0, true).buildMaxEditAutomaton(); |
| }); |
| assertThat(expected.getMessage(), containsString(value)); |
| |
| expected = |
| expectThrows( |
| FuzzyTermsEnum.FuzzyTermsException.class, |
| () -> new FuzzyAutomatonBuilder(value, 2, 0, true).buildAutomatonSet()); |
| assertThat(expected.getMessage(), containsString(value)); |
| } |
| |
| private void addDoc(String text, RandomIndexWriter writer) throws IOException { |
| Document doc = new Document(); |
| doc.add(newTextField("field", text, Field.Store.YES)); |
| writer.addDocument(doc); |
| } |
| |
  /** Returns a random string of 1 to 8 characters drawn from an alphabet of {@code digits} letters starting at 'a'. */
  private String randomSimpleString(int digits) {
| int termLength = TestUtil.nextInt(random(), 1, 8); |
| char[] chars = new char[termLength]; |
| for(int i=0;i<termLength;i++) { |
| chars[i] = (char) ('a' + random().nextInt(digits)); |
| } |
| return new String(chars); |
| } |
| |
| @SuppressWarnings({"unchecked","rawtypes"}) |
| public void testRandom() throws Exception { |
| int digits = TestUtil.nextInt(random(), 2, 3); |
    // Underestimate of the total number of unique terms that randomSimpleString
    // may generate; it assumes all terms have a length of 7.
| int vocabularySize = digits << 7; |
| int numTerms = Math.min(atLeast(100), vocabularySize); |
| Set<String> terms = new HashSet<>(); |
| while (terms.size() < numTerms) { |
| terms.add(randomSimpleString(digits)); |
| } |
| |
| Directory dir = newDirectory(); |
| RandomIndexWriter w = new RandomIndexWriter(random(), dir); |
| for(String term : terms) { |
| Document doc = new Document(); |
| doc.add(new StringField("field", term, Field.Store.YES)); |
| w.addDocument(doc); |
| } |
| DirectoryReader r = w.getReader(); |
| w.close(); |
| //System.out.println("TEST: reader=" + r); |
| IndexSearcher s = newSearcher(r); |
| int iters = atLeast(200); |
| for(int iter=0;iter<iters;iter++) { |
| String queryTerm = randomSimpleString(digits); |
| int prefixLength = random().nextInt(queryTerm.length()); |
| String queryPrefix = queryTerm.substring(0, prefixLength); |
| |
| // we don't look at scores here: |
| List<TermAndScore>[] expected = new List[3]; |
| for(int ed=0;ed<3;ed++) { |
| expected[ed] = new ArrayList<TermAndScore>(); |
| } |
| for(String term : terms) { |
| if (term.startsWith(queryPrefix) == false) { |
| continue; |
| } |
| int ed = getDistance(term, queryTerm); |
| float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length()); |
| while (ed < 3) { |
| expected[ed].add(new TermAndScore(term, score)); |
| ed++; |
| } |
| } |
| |
| for(int ed=0;ed<3;ed++) { |
| Collections.sort(expected[ed]); |
| int queueSize = TestUtil.nextInt(random(), 1, terms.size()); |
| /* |
| System.out.println("\nTEST: query=" + queryTerm + " ed=" + ed + " queueSize=" + queueSize + " vs expected match size=" + expected[ed].size() + " prefixLength=" + prefixLength); |
| for(TermAndScore ent : expected[ed]) { |
| System.out.println(" " + ent); |
| } |
| */ |
| FuzzyQuery query = new FuzzyQuery(new Term("field", queryTerm), ed, prefixLength, queueSize, true); |
| TopDocs hits = s.search(query, terms.size()); |
| Set<String> actual = new HashSet<>(); |
| for(ScoreDoc hit : hits.scoreDocs) { |
| Document doc = s.doc(hit.doc); |
| actual.add(doc.get("field")); |
| //System.out.println(" actual: " + doc.get("field") + " score=" + hit.score); |
| } |
| Set<String> expectedTop = new HashSet<>(); |
| int limit = Math.min(queueSize, expected[ed].size()); |
| for(int i=0;i<limit;i++) { |
| expectedTop.add(expected[ed].get(i).term); |
| } |
| |
| if (actual.equals(expectedTop) == false) { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("FAILED: query=" + queryTerm + " ed=" + ed + " queueSize=" + queueSize + " vs expected match size=" + expected[ed].size() + " prefixLength=" + prefixLength + "\n"); |
| |
| boolean first = true; |
| for(String term : actual) { |
| if (expectedTop.contains(term) == false) { |
| if (first) { |
| sb.append(" these matched but shouldn't:\n"); |
| first = false; |
| } |
| sb.append(" " + term + "\n"); |
| } |
| } |
| first = true; |
| for(String term : expectedTop) { |
| if (actual.contains(term) == false) { |
| if (first) { |
| sb.append(" these did not match but should:\n"); |
| first = false; |
| } |
| sb.append(" " + term + "\n"); |
| } |
| } |
| throw new AssertionError(sb.toString()); |
| } |
| } |
| } |
| |
| IOUtils.close(r, dir); |
| } |
| |
| private static class TermAndScore implements Comparable<TermAndScore> { |
| final String term; |
| final float score; |
| |
| public TermAndScore(String term, float score) { |
| this.term = term; |
| this.score = score; |
| } |
| |
| @Override |
| public int compareTo(TermAndScore other) { |
| // higher score sorts first, and if scores are tied, lower term sorts first |
| if (score > other.score) { |
| return -1; |
| } else if (score < other.score) { |
| return 1; |
| } else { |
| return term.compareTo(other.term); |
| } |
| } |
| |
| @Override |
| public String toString() { |
| return term + " score=" + score; |
| } |
| } |
| |
| // Poached from LuceneLevenshteinDistance.java (from suggest module): it supports transpositions (treats them as ed=1, not ed=2) |
| private static int getDistance(String target, String other) { |
| IntsRef targetPoints; |
| IntsRef otherPoints; |
| int n; |
    int[][] d; // cost array
| |
    // NOTE: if we cared, we could use 3*m space instead of m*n space, similar to
    // what LevenshteinDistance does, by cycling through a ring of three
    // horizontal cost arrays; but this is test-only code, so the simple
    // full-matrix form is fine.
| |
| // cheaper to do this up front once |
| targetPoints = toIntsRef(target); |
| otherPoints = toIntsRef(other); |
| n = targetPoints.length; |
| final int m = otherPoints.length; |
| d = new int[n+1][m+1]; |
| |
    if (n == 0 || m == 0) {
      // the distance to or from an empty string is the other string's length
      return Math.max(n, m);
    }
| |
| // indexes into strings s and t |
| int i; // iterates through s |
| int j; // iterates through t |
| |
| int t_j; // jth character of t |
| |
| int cost; // cost |
| |
| for (i = 0; i<=n; i++) { |
| d[i][0] = i; |
| } |
| |
| for (j = 0; j<=m; j++) { |
| d[0][j] = j; |
| } |
| |
| for (j = 1; j<=m; j++) { |
| t_j = otherPoints.ints[j-1]; |
| |
| for (i=1; i<=n; i++) { |
| cost = targetPoints.ints[i-1]==t_j ? 0 : 1; |
| // minimum of cell to the left+1, to the top+1, diagonally left and up +cost |
| d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost); |
| // transposition |
| if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) { |
| d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost); |
| } |
| } |
| } |
| |
| return d[n][m]; |
| } |
| |
| private static IntsRef toIntsRef(String s) { |
| IntsRef ref = new IntsRef(s.length()); // worst case |
| int utf16Len = s.length(); |
| for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) { |
| cp = ref.ints[ref.length++] = Character.codePointAt(s, i); |
| } |
| return ref; |
| } |
| |
| public void testVisitor() { |
| FuzzyQuery q = new FuzzyQuery(new Term("field", "blob"), 2); |
| AtomicBoolean visited = new AtomicBoolean(false); |
| q.visit(new QueryVisitor() { |
| @Override |
| public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) { |
| visited.set(true); |
| ByteRunAutomaton a = automaton.get(); |
| assertMatches(a, "blob"); |
| assertMatches(a, "bolb"); |
| assertMatches(a, "blobby"); |
| assertNoMatches(a, "bolbby"); |
| } |
| }); |
| assertTrue(visited.get()); |
| } |
| |
| private static void assertMatches(ByteRunAutomaton automaton, String text) { |
| BytesRef b = new BytesRef(text); |
| assertTrue(automaton.run(b.bytes, b.offset, b.length)); |
| } |
| |
| private static void assertNoMatches(ByteRunAutomaton automaton, String text) { |
| BytesRef b = new BytesRef(text); |
| assertFalse(automaton.run(b.bytes, b.offset, b.length)); |
| } |
| } |