| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.suggest.analyzing; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Set; |
| import java.util.TreeSet; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.CannedTokenStream; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.MockTokenFilter; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.TokenStreamToAutomaton; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.search.suggest.Input; |
| import org.apache.lucene.search.suggest.InputArrayIterator; |
| import org.apache.lucene.search.suggest.Lookup.LookupResult; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.automaton.Automaton; |
| import org.apache.lucene.util.automaton.FiniteStringsIterator; |
| import org.apache.lucene.util.fst.Util; |
| |
| public class FuzzySuggesterTest extends LuceneTestCase { |
| |
| public void testRandomEdits() throws IOException { |
| List<Input> keys = new ArrayList<>(); |
| int numTerms = atLeast(100); |
| for (int i = 0; i < numTerms; i++) { |
| keys.add(new Input("boo" + TestUtil.randomSimpleString(random()), 1 + random().nextInt(100))); |
| } |
| keys.add(new Input("foo bar boo far", 12)); |
| MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS, |
| 0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, FuzzySuggester.DEFAULT_UNICODE_AWARE); |
| suggester.build(new InputArrayIterator(keys)); |
| int numIters = atLeast(10); |
| for (int i = 0; i < numIters; i++) { |
| String addRandomEdit = addRandomEdit("foo bar boo", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX); |
| List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2); |
| assertEquals(addRandomEdit, 1, results.size()); |
| assertEquals("foo bar boo far", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| } |
| IOUtils.close(analyzer, tempDir); |
| } |
| |
| public void testNonLatinRandomEdits() throws IOException { |
| List<Input> keys = new ArrayList<>(); |
| int numTerms = atLeast(100); |
| for (int i = 0; i < numTerms; i++) { |
| keys.add(new Input("буу" + TestUtil.randomSimpleString(random()), 1 + random().nextInt(100))); |
| } |
| keys.add(new Input("фуу бар буу фар", 12)); |
| MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy",analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS, |
| 0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, true); |
| suggester.build(new InputArrayIterator(keys)); |
| int numIters = atLeast(10); |
| for (int i = 0; i < numIters; i++) { |
| String addRandomEdit = addRandomEdit("фуу бар буу", 0); |
| List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2); |
| assertEquals(addRandomEdit, 1, results.size()); |
| assertEquals("фуу бар буу фар", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| } |
| IOUtils.close(analyzer, tempDir); |
| } |
| |
| /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */ |
| public void testKeyword() throws Exception { |
| Input keys[] = new Input[] { |
| new Input("foo", 50), |
| new Input("bar", 10), |
| new Input("barbar", 12), |
| new Input("barbara", 6) |
| }; |
| |
| Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy",analyzer); |
| suggester.build(new InputArrayIterator(keys)); |
| |
| List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("bariar", random()), false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("barbar", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| |
| results = suggester.lookup(TestUtil.stringToCharSequence("barbr", random()), false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("barbar", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| |
| results = suggester.lookup(TestUtil.stringToCharSequence("barbara", random()), false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("barbara", results.get(0).key.toString()); |
| assertEquals(6, results.get(0).value, 0.01F); |
| |
| results = suggester.lookup(TestUtil.stringToCharSequence("barbar", random()), false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("barbar", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| assertEquals("barbara", results.get(1).key.toString()); |
| assertEquals(6, results.get(1).value, 0.01F); |
| |
| results = suggester.lookup(TestUtil.stringToCharSequence("barbaa", random()), false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("barbar", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| assertEquals("barbara", results.get(1).key.toString()); |
| assertEquals(6, results.get(1).value, 0.01F); |
| |
| // top N of 2, but only foo is available |
| results = suggester.lookup(TestUtil.stringToCharSequence("f", random()), false, 2); |
| assertEquals(1, results.size()); |
| assertEquals("foo", results.get(0).key.toString()); |
| assertEquals(50, results.get(0).value, 0.01F); |
| |
| // top N of 1 for 'bar': we return this even though |
| // barbar is higher because exactFirst is enabled: |
| results = suggester.lookup(TestUtil.stringToCharSequence("bar", random()), false, 1); |
| assertEquals(1, results.size()); |
| assertEquals("bar", results.get(0).key.toString()); |
| assertEquals(10, results.get(0).value, 0.01F); |
| |
| // top N Of 2 for 'b' |
| results = suggester.lookup(TestUtil.stringToCharSequence("b", random()), false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("barbar", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| assertEquals("bar", results.get(1).key.toString()); |
| assertEquals(10, results.get(1).value, 0.01F); |
| |
| // top N of 3 for 'ba' |
| results = suggester.lookup(TestUtil.stringToCharSequence("ba", random()), false, 3); |
| assertEquals(3, results.size()); |
| assertEquals("barbar", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| assertEquals("bar", results.get(1).key.toString()); |
| assertEquals(10, results.get(1).value, 0.01F); |
| assertEquals("barbara", results.get(2).key.toString()); |
| assertEquals(6, results.get(2).value, 0.01F); |
| |
| IOUtils.close(analyzer, tempDir); |
| } |
| |
| /** |
| * basic "standardanalyzer" test with stopword removal |
| */ |
| public void testStandard() throws Exception { |
| Input keys[] = new Input[] { |
| new Input("the ghost of christmas past", 50), |
| }; |
| |
| Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET); |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", standard, standard, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS, |
| FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, FuzzySuggester.DEFAULT_UNICODE_AWARE); |
| suggester.build(new InputArrayIterator(keys)); |
| |
| List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1); |
| assertEquals(1, results.size()); |
| assertEquals("the ghost of christmas past", results.get(0).key.toString()); |
| assertEquals(50, results.get(0).value, 0.01F); |
| |
| // omit the 'the' since it's a stopword, it's suggested anyway |
| results = suggester.lookup(TestUtil.stringToCharSequence("ghost of chris", random()), false, 1); |
| assertEquals(1, results.size()); |
| assertEquals("the ghost of christmas past", results.get(0).key.toString()); |
| assertEquals(50, results.get(0).value, 0.01F); |
| |
| // omit the 'the' and 'of' since they are stopwords, it's suggested anyway |
| results = suggester.lookup(TestUtil.stringToCharSequence("ghost chris", random()), false, 1); |
| assertEquals(1, results.size()); |
| assertEquals("the ghost of christmas past", results.get(0).key.toString()); |
| assertEquals(50, results.get(0).value, 0.01F); |
| |
| IOUtils.close(standard, tempDir); |
| } |
| |
| public void testNoSeps() throws Exception { |
| Input[] keys = new Input[] { |
| new Input("ab cd", 0), |
| new Input("abcd", 1), |
| }; |
| |
| int options = 0; |
| |
| Analyzer a = new MockAnalyzer(random()); |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy",a, a, options, 256, -1, true, 1, true, 1, 3, false); |
| suggester.build(new InputArrayIterator(keys)); |
| // TODO: would be nice if "ab " would allow the test to |
| // pass, and more generally if the analyzer can know |
| // that the user's current query has ended at a word, |
| // but, analyzers don't produce SEP tokens! |
| List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence("ab c", random()), false, 2); |
| assertEquals(2, r.size()); |
| |
| // With no PRESERVE_SEPS specified, "ab c" should also |
| // complete to "abcd", which has higher weight so should |
| // appear first: |
| assertEquals("abcd", r.get(0).key.toString()); |
| IOUtils.close(a, tempDir); |
| } |
| |
  /**
   * Lookup over a token graph with duplicated paths: each CannedTokenStream
   * below feeds one analyze call. Tokens with posInc=0 overlap the previous
   * position (like synonym output), and "hotspot" uses posLength &gt; 1 to span
   * multiple positions ("wifi network" / "wi fi network").
   */
  public void testGraphDups() throws Exception {

    final Analyzer analyzer = new AnalyzingSuggesterTest.MultiCannedAnalyzer(
        // analysis of "wifi network is slow" at build time:
        new CannedTokenStream(
            token("wifi", 1, 1),
            token("hotspot", 0, 2),
            token("network", 1, 1),
            token("is", 1, 1),
            token("slow", 1, 1)),
        // analysis of "wi fi network is fast" at build time:
        new CannedTokenStream(
            token("wi", 1, 1),
            token("hotspot", 0, 3),
            token("fi", 1, 1),
            token("network", 1, 1),
            token("is", 1, 1),
            token("fast", 1, 1)),
        // analysis of the "wifi network" query at lookup time:
        new CannedTokenStream(
            token("wifi", 1, 1),
            token("hotspot",0,2),
            token("network",1,1)));

    Input keys[] = new Input[] {
        new Input("wifi network is slow", 50),
        new Input("wi fi network is fast", 10),
    };
    Directory tempDir = getDirectory();
    FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", analyzer);
    suggester.build(new InputArrayIterator(keys));

    // Both surface forms share the "hotspot" graph path, so both must match,
    // ordered by weight:
    List<LookupResult> results = suggester.lookup("wifi network", false, 10);
    if (VERBOSE) {
      System.out.println("Results: " + results);
    }
    assertEquals(2, results.size());
    assertEquals("wifi network is slow", results.get(0).key);
    assertEquals(50, results.get(0).value);
    assertEquals("wi fi network is fast", results.get(1).key);
    assertEquals(10, results.get(1).value);
    IOUtils.close(tempDir, analyzer);
  }
| |
| public void testEmpty() throws Exception { |
| Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", analyzer); |
| suggester.build(new InputArrayIterator(new Input[0])); |
| |
| List<LookupResult> result = suggester.lookup("a", false, 20); |
| assertTrue(result.isEmpty()); |
| IOUtils.close(analyzer, tempDir); |
| } |
| |
| public void testInputPathRequired() throws Exception { |
| |
| // SynonymMap.Builder b = new SynonymMap.Builder(false); |
| // b.add(new CharsRef("ab"), new CharsRef("ba"), true); |
| // final SynonymMap map = b.build(); |
| |
| // The Analyzer below mimics the functionality of the SynonymAnalyzer |
| // using the above map, so that the suggest module does not need a dependency on the |
| // synonym module |
| |
| final Analyzer analyzer = new AnalyzingSuggesterTest.MultiCannedAnalyzer( |
| new CannedTokenStream( |
| token("ab", 1, 1), |
| token("ba", 0, 1), |
| token("xc", 1, 1)), |
| new CannedTokenStream( |
| token("ba", 1, 1), |
| token("xd", 1, 1)), |
| new CannedTokenStream( |
| token("ab", 1, 1), |
| token("ba", 0, 1), |
| token("x", 1, 1))); |
| |
| Input keys[] = new Input[] { |
| new Input("ab xc", 50), |
| new Input("ba xd", 50), |
| }; |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", analyzer); |
| suggester.build(new InputArrayIterator(keys)); |
| List<LookupResult> results = suggester.lookup("ab x", false, 1); |
| assertTrue(results.size() == 1); |
| IOUtils.close(analyzer, tempDir); |
| } |
| |
| private static Token token(String term, int posInc, int posLength) { |
| final Token t = new Token(term, 0, 0); |
| t.setPositionIncrement(posInc); |
| t.setPositionLength(posLength); |
| return t; |
| } |
| |
| /* |
| private void printTokens(final Analyzer analyzer, String input) throws IOException { |
| System.out.println("Tokens for " + input); |
| TokenStream ts = analyzer.tokenStream("", new StringReader(input)); |
| ts.reset(); |
| final TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); |
| final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); |
| final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class); |
| |
| while(ts.incrementToken()) { |
| termBytesAtt.fillBytesRef(); |
| System.out.println(String.format("%s,%s,%s", termBytesAtt.getBytesRef().utf8ToString(), posIncAtt.getPositionIncrement(), posLengthAtt.getPositionLength())); |
| } |
| ts.end(); |
| ts.close(); |
| } |
| */ |
| |
  private Analyzer getUnusualAnalyzer() {
    // Returns "a" for the first three analyze calls, then the two tokens
    // ["a","b"] on the fourth call, then "a" again for the last two calls.
    return new AnalyzingSuggesterTest.MultiCannedAnalyzer(
        new CannedTokenStream(token("a", 1, 1)),
        new CannedTokenStream(token("a", 1, 1)),
        new CannedTokenStream(token("a", 1, 1)),
        new CannedTokenStream(token("a", 1, 1), token("b", 1, 1)),
        new CannedTokenStream(token("a", 1, 1)),
        new CannedTokenStream(token("a", 1, 1)));
  }
| |
| public void testExactFirst() throws Exception { |
| |
| Analyzer a = getUnusualAnalyzer(); |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, true, 1, true, 1, 3, false); |
| suggester.build(new InputArrayIterator(new Input[] { |
| new Input("x y", 1), |
| new Input("x y z", 3), |
| new Input("x", 2), |
| new Input("z z z", 20), |
| })); |
| |
| //System.out.println("ALL: " + suggester.lookup("x y", false, 6)); |
| |
| for(int topN=1;topN<6;topN++) { |
| List<LookupResult> results = suggester.lookup("x y", false, topN); |
| //System.out.println("topN=" + topN + " " + results); |
| |
| assertEquals(Math.min(topN, 4), results.size()); |
| |
| assertEquals("x y", results.get(0).key); |
| assertEquals(1, results.get(0).value); |
| |
| if (topN > 1) { |
| assertEquals("z z z", results.get(1).key); |
| assertEquals(20, results.get(1).value); |
| |
| if (topN > 2) { |
| assertEquals("x y z", results.get(2).key); |
| assertEquals(3, results.get(2).value); |
| |
| if (topN > 3) { |
| assertEquals("x", results.get(3).key); |
| assertEquals(2, results.get(3).value); |
| } |
| } |
| } |
| } |
| IOUtils.close(a, tempDir); |
| } |
| |
| public void testNonExactFirst() throws Exception { |
| |
| Analyzer a = getUnusualAnalyzer(); |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, true, 1, true, 1, 3, false); |
| |
| suggester.build(new InputArrayIterator(new Input[] { |
| new Input("x y", 1), |
| new Input("x y z", 3), |
| new Input("x", 2), |
| new Input("z z z", 20), |
| })); |
| |
| for(int topN=1;topN<6;topN++) { |
| List<LookupResult> results = suggester.lookup("p", false, topN); |
| |
| assertEquals(Math.min(topN, 4), results.size()); |
| |
| assertEquals("z z z", results.get(0).key); |
| assertEquals(20, results.get(0).value); |
| |
| if (topN > 1) { |
| assertEquals("x y z", results.get(1).key); |
| assertEquals(3, results.get(1).value); |
| |
| if (topN > 2) { |
| assertEquals("x", results.get(2).key); |
| assertEquals(2, results.get(2).value); |
| |
| if (topN > 3) { |
| assertEquals("x y", results.get(3).key); |
| assertEquals(1, results.get(3).value); |
| } |
| } |
| } |
| } |
| IOUtils.close(a, tempDir); |
| } |
| |
  // Holds surface form separately:
  private static class TermFreqPayload2 implements Comparable<TermFreqPayload2> {
    public final String surfaceForm;
    public final String analyzedForm;
    public final long weight;

    public TermFreqPayload2(String surfaceForm, String analyzedForm, long weight) {
      this.surfaceForm = surfaceForm;
      this.analyzedForm = analyzedForm;
      this.weight = weight;
    }

    /**
     * Orders by analyzed form ascending, then by weight descending.
     * The {@code assert false} encodes the test's assumption that no two
     * entries share both analyzed form and weight; with assertions disabled a
     * full tie would return 0.
     */
    @Override
    public int compareTo(TermFreqPayload2 other) {
      int cmp = analyzedForm.compareTo(other.analyzedForm);
      if (cmp != 0) {
        return cmp;
      } else if (weight > other.weight) {
        return -1;
      } else if (weight < other.weight) {
        return 1;
      } else {
        assert false;
        return 0;
      }
    }
  }
| |
  /**
   * Returns true if {@code ch} is one of the first {@code numStopChars} letters
   * of the lowercase alphabet ('a', 'b', ...). Note any char below 'a' would
   * also match (negative difference); callers in this test only pass lowercase
   * simple-string chars — TODO confirm if reused elsewhere.
   */
  static boolean isStopChar(char ch, int numStopChars) {
    //System.out.println("IS? " + ch + ": " + (ch - 'a') + ": " + ((ch - 'a') < numStopChars));
    return (ch - 'a') < numStopChars;
  }
| |
  // Like StopFilter: drops single-char tokens that are "stop chars" (see
  // isStopChar), optionally preserving the position holes they leave behind.
  private static class TokenEater extends TokenFilter {
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final int numStopChars;
    private final boolean preserveHoles;
    // True until the first kept token is emitted; used to repair an illegal
    // posInc=0 on the very first token of the stream.
    private boolean first;

    public TokenEater(boolean preserveHoles, TokenStream in, int numStopChars) {
      super(in);
      this.preserveHoles = preserveHoles;
      this.numStopChars = numStopChars;
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      first = true;
    }

    @Override
    public final boolean incrementToken() throws IOException {
      int skippedPositions = 0;
      while (input.incrementToken()) {
        if (termAtt.length() != 1 || !isStopChar(termAtt.charAt(0), numStopChars)) {
          // Keep this token; fold the positions of any dropped tokens into its
          // position increment so the hole (if preserved) is visible downstream:
          int posInc = posIncrAtt.getPositionIncrement() + skippedPositions;
          if (first) {
            if (posInc == 0) {
              // first token having posinc=0 is illegal.
              posInc = 1;
            }
            first = false;
          }
          posIncrAtt.setPositionIncrement(posInc);
          //System.out.println("RETURN term=" + termAtt + " numStopChars=" + numStopChars);
          return true;
        }
        if (preserveHoles) {
          // Accumulate the dropped token's position so the next kept token's
          // posInc reflects the hole:
          skippedPositions += posIncrAtt.getPositionIncrement();
        }
      }

      return false;
    }
  }
| |
| private static class MockTokenEatingAnalyzer extends Analyzer { |
| private int numStopChars; |
| private boolean preserveHoles; |
| |
| public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) { |
| this.preserveHoles = preserveHoles; |
| this.numStopChars = numStopChars; |
| } |
| |
| @Override |
| public TokenStreamComponents createComponents(String fieldName) { |
| MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); |
| tokenizer.setEnableChecks(true); |
| TokenStream next; |
| if (numStopChars != 0) { |
| next = new TokenEater(preserveHoles, tokenizer, numStopChars); |
| } else { |
| next = tokenizer; |
| } |
| return new TokenStreamComponents(tokenizer, next); |
| } |
| } |
| |
  /**
   * Randomized end-to-end check: builds random multi-token keys, hand-computes
   * each key's "analyzed" form (mirroring MockTokenEatingAnalyzer's stop-char
   * removal and hole handling), then for every prefix compares the suggester's
   * results against a brute-force "slowCompletor" driven by the suggester's own
   * Levenshtein automaton.
   */
  @Slow
  public void testRandom() throws Exception {

    int numQueries = atLeast(20);

    // The "answer key": surface form + hand-analyzed form + weight per entry.
    final List<TermFreqPayload2> slowCompletor = new ArrayList<>();
    final TreeSet<String> allPrefixes = new TreeSet<>();
    final Set<String> seen = new HashSet<>();

    Input[] keys = new Input[numQueries];

    boolean preserveSep = random().nextBoolean();
    boolean unicodeAware = random().nextBoolean();

    final int numStopChars = random().nextInt(10);
    final boolean preserveHoles = random().nextBoolean();

    if (VERBOSE) {
      System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " ; unicodeAware=" + unicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
    }

    for (int i = 0; i < numQueries; i++) {
      int numTokens = TestUtil.nextInt(random(), 1, 4);
      String key;
      String analyzedKey;
      while(true) {
        key = "";
        analyzedKey = "";
        boolean lastRemoved = false;
        for(int token=0;token < numTokens;token++) {
          String s;
          while (true) {
            // TODO: would be nice to fix this slowCompletor/comparator to
            // use full range, but we might lose some coverage too...
            s = TestUtil.randomSimpleString(random());
            if (s.length() > 0) {
              if (token > 0) {
                key += " ";
              }
              // Mirror the analyzer: add a separator only when preserving seps
              // and the previous analyzed char is not already a separator:
              if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length())-1) != ' ' : analyzedKey.charAt(analyzedKey.length()-1) != ' ')) {
                analyzedKey += " ";
              }
              key += s;
              if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
                // Stop "word": the analyzer drops it, optionally leaving a
                // \u0000 hole marker:
                if (preserveSep && preserveHoles) {
                  analyzedKey += '\u0000';
                }
                lastRemoved = true;
              } else {
                analyzedKey += s;
                lastRemoved = false;
              }
              break;
            }
          }
        }

        // A trailing hole is not reported by the analyzer; strip it here too:
        analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", "");

        if (preserveSep && lastRemoved) {
          analyzedKey += " ";
        }

        // Don't add same surface form more than once:
        if (!seen.contains(key)) {
          seen.add(key);
          break;
        }
      }

      for (int j = 1; j < key.length(); j++) {
        allPrefixes.add(key.substring(0, j));
      }
      // we can probably do Integer.MAX_VALUE here, but why worry.
      int weight = random().nextInt(1<<24);
      keys[i] = new Input(key, weight);

      slowCompletor.add(new TermFreqPayload2(key, analyzedKey, weight));
    }

    if (VERBOSE) {
      // Don't just sort original list, to avoid VERBOSE
      // altering the test:
      List<TermFreqPayload2> sorted = new ArrayList<>(slowCompletor);
      Collections.sort(sorted);
      for(TermFreqPayload2 ent : sorted) {
        System.out.println(" surface='" + ent.surfaceForm + " analyzed='" + ent.analyzedForm + "' weight=" + ent.weight);
      }
    }

    Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
    Directory tempDir = getDirectory();
    FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy",a, a,
        preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, true, 1, false, 1, 3, unicodeAware);
    suggester.build(new InputArrayIterator(keys));

    for (String prefix : allPrefixes) {

      if (VERBOSE) {
        System.out.println("\nTEST: prefix=" + prefix);
      }

      final int topN = TestUtil.nextInt(random(), 1, 10);
      List<LookupResult> r = suggester.lookup(TestUtil.stringToCharSequence(prefix, random()), false, topN);

      // 2. go thru whole set to find suggestions:
      List<LookupResult> matches = new ArrayList<>();

      // "Analyze" the key, replicating the analyzer's stop-char behavior:
      String[] tokens = prefix.split(" ");
      StringBuilder builder = new StringBuilder();
      boolean lastRemoved = false;
      for(int i=0;i<tokens.length;i++) {
        String token = tokens[i];
        if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
          builder.append(' ');
        }

        if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) {
          if (preserveSep && preserveHoles) {
            builder.append("\u0000");
          }
          lastRemoved = true;
        } else {
          builder.append(token);
          lastRemoved = false;
        }
      }

      String analyzedKey = builder.toString();

      // Remove trailing sep/holes (TokenStream.end() does
      // not tell us any trailing holes, yet ... there is an
      // issue open for this):
      while (true) {
        String s = analyzedKey.replaceAll("(^| )\u0000$", "");
        s = s.replaceAll("\\s+$", "");
        if (s.equals(analyzedKey)) {
          break;
        }
        analyzedKey = s;
      }

      if (analyzedKey.length() == 0) {
        // Currently suggester can't suggest from the empty
        // string! You get no results, not all results...
        continue;
      }

      if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
        analyzedKey += " ";
      }

      if (VERBOSE) {
        System.out.println(" analyzed: " + analyzedKey);
      }
      TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();

      // NOTE: not great that we ask the suggester to give
      // us the "answer key" (ie maybe we have a bug in
      // suggester.toLevA ...) ... but testRandom2() fixes
      // this:
      Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
      assertTrue(automaton.isDeterministic());

      // TODO: could be faster... but it's slowCompletor for a reason
      BytesRefBuilder spare = new BytesRefBuilder();
      for (TermFreqPayload2 e : slowCompletor) {
        spare.copyChars(e.analyzedForm);
        // Each entry's analyzed form may expand to several finite strings
        // (holes/seps); accept the entry if the query automaton accepts any
        // prefix-path of any of them:
        FiniteStringsIterator finiteStrings =
            new FiniteStringsIterator(suggester.toAutomaton(spare.get(), tokenStreamToAutomaton));
        for (IntsRef string; (string = finiteStrings.next()) != null;) {
          int p = 0;
          BytesRef ref = Util.toBytesRef(string, spare);
          boolean added = false;
          for (int i = ref.offset; i < ref.length; i++) {
            int q = automaton.step(p, ref.bytes[i] & 0xff);
            if (q == -1) {
              break;
            } else if (automaton.isAccept(q)) {
              matches.add(new LookupResult(e.surfaceForm, e.weight));
              added = true;
              break;
            }
            p = q;
          }
          if (!added && automaton.isAccept(p)) {
            matches.add(new LookupResult(e.surfaceForm, e.weight));
          }
        }
      }

      assertTrue(numStopChars > 0 || matches.size() > 0);

      if (matches.size() > 1) {
        // Sort expected matches by descending weight, breaking ties by key,
        // matching the suggester's result order:
        Collections.sort(matches, new Comparator<LookupResult>() {
            @Override
            public int compare(LookupResult left, LookupResult right) {
              int cmp = Float.compare(right.value, left.value);
              if (cmp == 0) {
                return left.compareTo(right);
              } else {
                return cmp;
              }
            }
          });
      }

      if (matches.size() > topN) {
        matches = matches.subList(0, topN);
      }

      if (VERBOSE) {
        System.out.println(" expected:");
        for(LookupResult lr : matches) {
          System.out.println(" key=" + lr.key + " weight=" + lr.value);
        }

        System.out.println(" actual:");
        for(LookupResult lr : r) {
          System.out.println(" key=" + lr.key + " weight=" + lr.value);
        }
      }

      assertEquals(prefix + " " + topN, matches.size(), r.size());
      for(int hit=0;hit<r.size();hit++) {
        //System.out.println(" check hit " + hit);
        assertEquals(prefix + " " + topN, matches.get(hit).key.toString(), r.get(hit).key.toString());
        assertEquals(matches.get(hit).value, r.get(hit).value, 0f);
      }
    }
    IOUtils.close(a, tempDir);
  }
| |
| public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception { |
| Analyzer a = new MockAnalyzer(random()); |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", a, a, 0, 2, -1, true, 1, true, 1, 3, false); |
| |
| List<Input> keys = Arrays.asList(new Input[] { |
| new Input("a", 40), |
| new Input("a ", 50), |
| new Input(" a", 60), |
| }); |
| |
| Collections.shuffle(keys, random()); |
| suggester.build(new InputArrayIterator(keys)); |
| |
| List<LookupResult> results = suggester.lookup("a", false, 5); |
| assertEquals(2, results.size()); |
| assertEquals(" a", results.get(0).key); |
| assertEquals(60, results.get(0).value); |
| assertEquals("a ", results.get(1).key); |
| assertEquals(50, results.get(1).value); |
| IOUtils.close(a, tempDir); |
| } |
| |
| public void testEditSeps() throws Exception { |
| Analyzer a = new MockAnalyzer(random()); |
| Directory tempDir = getDirectory(); |
| FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, true, 2, true, 1, 3, false); |
| |
| List<Input> keys = Arrays.asList(new Input[] { |
| new Input("foo bar", 40), |
| new Input("foo bar baz", 50), |
| new Input("barbaz", 60), |
| new Input("barbazfoo", 10), |
| }); |
| |
| Collections.shuffle(keys, random()); |
| suggester.build(new InputArrayIterator(keys)); |
| |
| assertEquals("[foo bar baz/50, foo bar/40]", suggester.lookup("foobar", false, 5).toString()); |
| assertEquals("[foo bar baz/50]", suggester.lookup("foobarbaz", false, 5).toString()); |
| assertEquals("[barbaz/60, barbazfoo/10]", suggester.lookup("bar baz", false, 5).toString()); |
| assertEquals("[barbazfoo/10]", suggester.lookup("bar baz foo", false, 5).toString()); |
| IOUtils.close(a, tempDir); |
| } |
| |
  /**
   * Returns {@code string} with at most one random single edit (insert,
   * delete, or transpose) applied at some position at or after
   * {@code prefixLength}; may return the input unchanged when no edit
   * position is picked.
   */
  @SuppressWarnings("fallthrough")
  private static String addRandomEdit(String string, int prefixLength) {
    char[] input = string.toCharArray();
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < input.length; i++) {
      // Only edit past the non-fuzzy prefix, and never at the last char:
      if (i >= prefixLength && random().nextBoolean() && i < input.length-1) {
        switch(random().nextInt(4)) {
          case 3:
            if (i < input.length-1) {
              // Transpose input[i] and input[1+i]:
              builder.append(input[i+1]);
              builder.append(input[i]);
              for(int j=i+2;j<input.length;j++) {
                builder.append(input[j]);
              }
              return builder.toString();
            }
            // NOTE: fall through to delete:
          case 2:
            // Delete input[i]
            for (int j = i+1; j < input.length; j++) {
              builder.append(input[j]);
            }
            return builder.toString();
          case 1:
            // NOTE(review): this emits input[i+1] then input[i] and resumes at
            // i+2 — effectively another transposition of input[i]/input[i+1]
            // (not an insertion, despite the original "insert twice" comment).
            // Still a single edit, so the callers' expectations hold.
            if (i+1<input.length) {
              builder.append(input[i+1]);
              builder.append(input[i++]);
              i++;
            }
            for (int j = i; j < input.length; j++) {
              builder.append(input[j]);
            }
            return builder.toString();
          case 0:
            // Insert random byte.
            // NOTE: can only use ascii here so that, in
            // UTF8 byte space it's still a single
            // insertion:
            // bytes 0x1e and 0x1f are reserved
            int x = random().nextBoolean() ? random().nextInt(30) : 32 + random().nextInt(128 - 32);
            builder.append((char) x);
            for (int j = i; j < input.length; j++) {
              builder.append(input[j]);
            }
            return builder.toString();
        }
      }

      builder.append(input[i]);
    }

    return builder.toString();
  }
| |
| private String randomSimpleString(int maxLen) { |
| final int len = TestUtil.nextInt(random(), 1, maxLen); |
| final char[] chars = new char[len]; |
| for(int j=0;j<len;j++) { |
| chars[j] = (char) ('a' + random().nextInt(4)); |
| } |
| return new String(chars); |
| } |
| |
/**
 * Randomized end-to-end check: builds a FuzzySuggester over random 4-letter
 * terms with random weights, then verifies each random lookup against the
 * brute-force oracle {@link #slowFuzzyMatch}.
 */
public void testRandom2() throws Throwable {
final int NUM = atLeast(200);
final List<Input> answers = new ArrayList<>();
final Set<String> seen = new HashSet<>();
// Generate unique random terms, each with a random weight in [0, 1000):
for(int i=0;i<NUM;i++) {
final String s = randomSimpleString(8);
if (!seen.contains(s)) {
answers.add(new Input(s, random().nextInt(1000)));
seen.add(s);
}
}

// Sort by term only so the VERBOSE dump below is readable; the suggester
// itself receives a shuffled copy later.
Collections.sort(answers, new Comparator<Input>() {
@Override
public int compare(Input a, Input b) {
return a.term.compareTo(b.term);
}
});
if (VERBOSE) {
System.out.println("\nTEST: targets");
for(Input tf : answers) {
System.out.println(" " + tf.term.utf8ToString() + " freq=" + tf.v);
}
}

// KEYWORD tokenizer: each key is a single token, so no graph/sep effects.
Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
int maxEdits = random().nextBoolean() ? 1 : 2;
int prefixLen = random().nextInt(4);
boolean transpositions = random().nextBoolean();
// TODO: test graph analyzers
// TODO: test exactFirst / preserveSep permutations
Directory tempDir = getDirectory();
FuzzySuggester suggest = new FuzzySuggester(tempDir, "fuzzy", a, a, 0, 256, -1, true, maxEdits, transpositions, prefixLen, prefixLen, false);

if (VERBOSE) {
System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);
}

// Build order must not matter:
Collections.shuffle(answers, random());
suggest.build(new InputArrayIterator(answers.toArray(new Input[answers.size()])));

final int ITERS = atLeast(100);
for(int iter=0;iter<ITERS;iter++) {
final String frag = randomSimpleString(6);
if (VERBOSE) {
System.out.println("\nTEST: iter frag=" + frag);
}
// Oracle result (already sorted by cost-then-alpha):
final List<LookupResult> expected = slowFuzzyMatch(prefixLen, maxEdits, transpositions, answers, frag);
if (VERBOSE) {
System.out.println(" expected: " + expected.size());
for(LookupResult c : expected) {
System.out.println(" " + c);
}
}
final List<LookupResult> actual = suggest.lookup(frag, false, NUM);
if (VERBOSE) {
System.out.println(" actual: " + actual.size());
for(LookupResult c : actual) {
System.out.println(" " + c);
}
}

// Normalize the suggester's ordering before the element-wise comparison:
Collections.sort(actual, new CompareByCostThenAlpha());

// Compare pairwise up to the shorter list, then require equal sizes so a
// size mismatch still reports which keys did match first:
final int limit = Math.min(expected.size(), actual.size());
for(int ans=0;ans<limit;ans++) {
final LookupResult c0 = expected.get(ans);
final LookupResult c1 = actual.get(ans);
assertEquals("expected " + c0.key +
" but got " + c1.key,
0,
CHARSEQUENCE_COMPARATOR.compare(c0.key, c1.key));
assertEquals(c0.value, c1.value);
}
assertEquals(expected.size(), actual.size());
}
IOUtils.close(a, tempDir);
}
| |
| private List<LookupResult> slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List<Input> answers, String frag) { |
| final List<LookupResult> results = new ArrayList<>(); |
| final int fragLen = frag.length(); |
| for(Input tf : answers) { |
| //System.out.println(" check s=" + tf.term.utf8ToString()); |
| boolean prefixMatches = true; |
| for(int i=0;i<prefixLen;i++) { |
| if (i == fragLen) { |
| // Prefix still matches: |
| break; |
| } |
| if (i == tf.term.length || tf.term.bytes[i] != (byte) frag.charAt(i)) { |
| prefixMatches = false; |
| break; |
| } |
| } |
| //System.out.println(" prefixMatches=" + prefixMatches); |
| |
| if (prefixMatches) { |
| final int len = tf.term.length; |
| if (len >= fragLen-maxEdits) { |
| // OK it's possible: |
| //System.out.println(" possible"); |
| int d; |
| final String s = tf.term.utf8ToString(); |
| if (fragLen == prefixLen) { |
| d = 0; |
| } else if (false && len < fragLen) { |
| d = getDistance(frag, s, allowTransposition); |
| } else { |
| //System.out.println(" try loop"); |
| d = maxEdits + 1; |
| //for(int ed=-maxEdits;ed<=maxEdits;ed++) { |
| for(int ed=-maxEdits;ed<=maxEdits;ed++) { |
| if (s.length() < fragLen - ed) { |
| continue; |
| } |
| String check = s.substring(0, fragLen-ed); |
| d = getDistance(frag, check, allowTransposition); |
| //System.out.println(" sub check s=" + check + " d=" + d); |
| if (d <= maxEdits) { |
| break; |
| } |
| } |
| } |
| if (d <= maxEdits) { |
| results.add(new LookupResult(tf.term.utf8ToString(), tf.v)); |
| } |
| } |
| } |
| |
| Collections.sort(results, new CompareByCostThenAlpha()); |
| } |
| |
| return results; |
| } |
| |
| private static class CharSequenceComparator implements Comparator<CharSequence> { |
| |
| @Override |
| public int compare(CharSequence o1, CharSequence o2) { |
| final int l1 = o1.length(); |
| final int l2 = o2.length(); |
| |
| final int aStop = Math.min(l1, l2); |
| for (int i = 0; i < aStop; i++) { |
| int diff = o1.charAt(i) - o2.charAt(i); |
| if (diff != 0) { |
| return diff; |
| } |
| } |
| // One is a prefix of the other, or, they are equal: |
| return l1 - l2; |
| } |
| } |
| |
// Shared singleton: the comparator holds no state, so one instance suffices.
private static final Comparator<CharSequence> CHARSEQUENCE_COMPARATOR = new CharSequenceComparator();
| |
| public static class CompareByCostThenAlpha implements Comparator<LookupResult> { |
| @Override |
| public int compare(LookupResult a, LookupResult b) { |
| if (a.value > b.value) { |
| return -1; |
| } else if (a.value < b.value) { |
| return 1; |
| } else { |
| final int c = CHARSEQUENCE_COMPARATOR.compare(a.key, b.key); |
| assert c != 0: "term=" + a.key; |
| return c; |
| } |
| } |
| } |
| |
| // NOTE: copied from |
| // modules/suggest/src/java/org/apache/lucene/search/spell/LuceneLevenshteinDistance.java |
| // and tweaked to return the edit distance not the float |
| // lucene measure |
| |
| /* Finds unicode (code point) Levenstein (edit) distance |
| * between two strings, including transpositions. */ |
| public int getDistance(String target, String other, boolean allowTransposition) { |
| IntsRef targetPoints; |
| IntsRef otherPoints; |
| int n; |
| int d[][]; // cost array |
| |
| // NOTE: if we cared, we could 3*m space instead of m*n space, similar to |
| // what LevenshteinDistance does, except cycling thru a ring of three |
| // horizontal cost arrays... but this comparator is never actually used by |
| // DirectSpellChecker, it's only used for merging results from multiple shards |
| // in "distributed spellcheck", and it's inefficient in other ways too... |
| |
| // cheaper to do this up front once |
| targetPoints = toIntsRef(target); |
| otherPoints = toIntsRef(other); |
| n = targetPoints.length; |
| final int m = otherPoints.length; |
| d = new int[n+1][m+1]; |
| |
| if (n == 0 || m == 0) { |
| if (n == m) { |
| return 0; |
| } |
| else { |
| return Math.max(n, m); |
| } |
| } |
| |
| // indexes into strings s and t |
| int i; // iterates through s |
| int j; // iterates through t |
| |
| int t_j; // jth character of t |
| |
| int cost; // cost |
| |
| for (i = 0; i<=n; i++) { |
| d[i][0] = i; |
| } |
| |
| for (j = 0; j<=m; j++) { |
| d[0][j] = j; |
| } |
| |
| for (j = 1; j<=m; j++) { |
| t_j = otherPoints.ints[j-1]; |
| |
| for (i=1; i<=n; i++) { |
| cost = targetPoints.ints[i-1]==t_j ? 0 : 1; |
| // minimum of cell to the left+1, to the top+1, diagonally left and up +cost |
| d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost); |
| // transposition |
| if (allowTransposition && i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) { |
| d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost); |
| } |
| } |
| } |
| |
| return d[n][m]; |
| } |
| |
| private static IntsRef toIntsRef(String s) { |
| IntsRef ref = new IntsRef(s.length()); // worst case |
| int utf16Len = s.length(); |
| for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) { |
| cp = ref.ints[ref.length++] = Character.codePointAt(s, i); |
| } |
| return ref; |
| } |
| |
// Single indirection over newDirectory() so every test obtains its scratch
// Directory the same way; callers close it via IOUtils.close (see tests above).
private Directory getDirectory() {
return newDirectory();
}
| } |