| package org.apache.lucene.search.suggest.analyzing; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.FileOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.OutputStream; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Set; |
| import java.util.TreeSet; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.CannedBinaryTokenStream.BinaryToken; |
| import org.apache.lucene.analysis.CannedBinaryTokenStream; |
| import org.apache.lucene.analysis.CannedTokenStream; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.MockBytesAttributeFactory; |
| import org.apache.lucene.analysis.MockTokenFilter; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; |
| import org.apache.lucene.search.suggest.Lookup.LookupResult; |
| import org.apache.lucene.search.suggest.TermFreq; |
| import org.apache.lucene.search.suggest.TermFreqArrayIterator; |
| import org.apache.lucene.search.suggest.TermFreqPayload; |
| import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util._TestUtil; |
| |
| public class AnalyzingSuggesterTest extends LuceneTestCase { |
| |
| /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */ |
| public void testKeyword() throws Exception { |
| TermFreq keys[] = new TermFreq[] { |
| new TermFreq("foo", 50), |
| new TermFreq("bar", 10), |
| new TermFreq("barbar", 12), |
| new TermFreq("barbara", 6) |
| }; |
| |
| AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); |
| suggester.build(new TermFreqArrayIterator(keys)); |
| |
| // top N of 2, but only foo is available |
| List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2); |
| assertEquals(1, results.size()); |
| assertEquals("foo", results.get(0).key.toString()); |
| assertEquals(50, results.get(0).value, 0.01F); |
| |
| // top N of 1 for 'bar': we return this even though |
| // barbar is higher because exactFirst is enabled: |
| results = suggester.lookup(_TestUtil.stringToCharSequence("bar", random()), false, 1); |
| assertEquals(1, results.size()); |
| assertEquals("bar", results.get(0).key.toString()); |
| assertEquals(10, results.get(0).value, 0.01F); |
| |
| // top N Of 2 for 'b' |
| results = suggester.lookup(_TestUtil.stringToCharSequence("b", random()), false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("barbar", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| assertEquals("bar", results.get(1).key.toString()); |
| assertEquals(10, results.get(1).value, 0.01F); |
| |
| // top N of 3 for 'ba' |
| results = suggester.lookup(_TestUtil.stringToCharSequence("ba", random()), false, 3); |
| assertEquals(3, results.size()); |
| assertEquals("barbar", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| assertEquals("bar", results.get(1).key.toString()); |
| assertEquals(10, results.get(1).value, 0.01F); |
| assertEquals("barbara", results.get(2).key.toString()); |
| assertEquals(6, results.get(2).value, 0.01F); |
| } |
| |
| public void testKeywordWithPayloads() throws Exception { |
| TermFreqPayload keys[] = new TermFreqPayload[] { |
| new TermFreqPayload("foo", 50, new BytesRef("hello")), |
| new TermFreqPayload("bar", 10, new BytesRef("goodbye")), |
| new TermFreqPayload("barbar", 12, new BytesRef("thank you")), |
| new TermFreqPayload("barbara", 6, new BytesRef("for all the fish")) |
| }; |
| |
| AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); |
| suggester.build(new TermFreqPayloadArrayIterator(keys)); |
| |
| // top N of 2, but only foo is available |
| List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2); |
| assertEquals(1, results.size()); |
| assertEquals("foo", results.get(0).key.toString()); |
| assertEquals(50, results.get(0).value, 0.01F); |
| assertEquals(new BytesRef("hello"), results.get(0).payload); |
| |
| // top N of 1 for 'bar': we return this even though |
| // barbar is higher because exactFirst is enabled: |
| results = suggester.lookup(_TestUtil.stringToCharSequence("bar", random()), false, 1); |
| assertEquals(1, results.size()); |
| assertEquals("bar", results.get(0).key.toString()); |
| assertEquals(10, results.get(0).value, 0.01F); |
| assertEquals(new BytesRef("goodbye"), results.get(0).payload); |
| |
| // top N Of 2 for 'b' |
| results = suggester.lookup(_TestUtil.stringToCharSequence("b", random()), false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("barbar", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| assertEquals(new BytesRef("thank you"), results.get(0).payload); |
| assertEquals("bar", results.get(1).key.toString()); |
| assertEquals(10, results.get(1).value, 0.01F); |
| assertEquals(new BytesRef("goodbye"), results.get(1).payload); |
| |
| // top N of 3 for 'ba' |
| results = suggester.lookup(_TestUtil.stringToCharSequence("ba", random()), false, 3); |
| assertEquals(3, results.size()); |
| assertEquals("barbar", results.get(0).key.toString()); |
| assertEquals(12, results.get(0).value, 0.01F); |
| assertEquals(new BytesRef("thank you"), results.get(0).payload); |
| assertEquals("bar", results.get(1).key.toString()); |
| assertEquals(10, results.get(1).value, 0.01F); |
| assertEquals(new BytesRef("goodbye"), results.get(1).payload); |
| assertEquals("barbara", results.get(2).key.toString()); |
| assertEquals(6, results.get(2).value, 0.01F); |
| assertEquals(new BytesRef("for all the fish"), results.get(2).payload); |
| } |
| |
| // TODO: more tests |
| /** |
| * basic "standardanalyzer" test with stopword removal |
| */ |
| public void testStandard() throws Exception { |
| TermFreq keys[] = new TermFreq[] { |
| new TermFreq("the ghost of christmas past", 50), |
| }; |
| |
| Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(standard); |
| suggester.setPreservePositionIncrements(false); |
| suggester.build(new TermFreqArrayIterator(keys)); |
| |
| List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1); |
| assertEquals(1, results.size()); |
| assertEquals("the ghost of christmas past", results.get(0).key.toString()); |
| assertEquals(50, results.get(0).value, 0.01F); |
| |
| // omit the 'the' since its a stopword, its suggested anyway |
| results = suggester.lookup(_TestUtil.stringToCharSequence("ghost of chris", random()), false, 1); |
| assertEquals(1, results.size()); |
| assertEquals("the ghost of christmas past", results.get(0).key.toString()); |
| assertEquals(50, results.get(0).value, 0.01F); |
| |
| // omit the 'the' and 'of' since they are stopwords, its suggested anyway |
| results = suggester.lookup(_TestUtil.stringToCharSequence("ghost chris", random()), false, 1); |
| assertEquals(1, results.size()); |
| assertEquals("the ghost of christmas past", results.get(0).key.toString()); |
| assertEquals(50, results.get(0).value, 0.01F); |
| } |
| |
| public void testEmpty() throws Exception { |
| Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(standard); |
| suggester.build(new TermFreqArrayIterator(new TermFreq[0])); |
| |
| List<LookupResult> result = suggester.lookup("a", false, 20); |
| assertTrue(result.isEmpty()); |
| } |
| |
| public void testNoSeps() throws Exception { |
| TermFreq[] keys = new TermFreq[] { |
| new TermFreq("ab cd", 0), |
| new TermFreq("abcd", 1), |
| }; |
| |
| int options = 0; |
| |
| Analyzer a = new MockAnalyzer(random()); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, options, 256, -1); |
| suggester.build(new TermFreqArrayIterator(keys)); |
| // TODO: would be nice if "ab " would allow the test to |
| // pass, and more generally if the analyzer can know |
| // that the user's current query has ended at a word, |
| // but, analyzers don't produce SEP tokens! |
| List<LookupResult> r = suggester.lookup(_TestUtil.stringToCharSequence("ab c", random()), false, 2); |
| assertEquals(2, r.size()); |
| |
| // With no PRESERVE_SEPS specified, "ab c" should also |
| // complete to "abcd", which has higher weight so should |
| // appear first: |
| assertEquals("abcd", r.get(0).key.toString()); |
| } |
| |
| public void testGraphDups() throws Exception { |
| |
| final Analyzer analyzer = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| |
| return new TokenStreamComponents(tokenizer) { |
| int tokenStreamCounter = 0; |
| final TokenStream[] tokenStreams = new TokenStream[] { |
| new CannedTokenStream(new Token[] { |
| token("wifi",1,1), |
| token("hotspot",0,2), |
| token("network",1,1), |
| token("is",1,1), |
| token("slow",1,1) |
| }), |
| new CannedTokenStream(new Token[] { |
| token("wi",1,1), |
| token("hotspot",0,3), |
| token("fi",1,1), |
| token("network",1,1), |
| token("is",1,1), |
| token("fast",1,1) |
| |
| }), |
| new CannedTokenStream(new Token[] { |
| token("wifi",1,1), |
| token("hotspot",0,2), |
| token("network",1,1) |
| }), |
| }; |
| |
| @Override |
| public TokenStream getTokenStream() { |
| TokenStream result = tokenStreams[tokenStreamCounter]; |
| tokenStreamCounter++; |
| return result; |
| } |
| |
| @Override |
| protected void setReader(final Reader reader) throws IOException { |
| } |
| }; |
| } |
| }; |
| |
| TermFreq keys[] = new TermFreq[] { |
| new TermFreq("wifi network is slow", 50), |
| new TermFreq("wi fi network is fast", 10), |
| }; |
| //AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, AnalyzingSuggester.EXACT_FIRST, 256, -1); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer); |
| suggester.build(new TermFreqArrayIterator(keys)); |
| List<LookupResult> results = suggester.lookup("wifi network", false, 10); |
| if (VERBOSE) { |
| System.out.println("Results: " + results); |
| } |
| assertEquals(2, results.size()); |
| assertEquals("wifi network is slow", results.get(0).key); |
| assertEquals(50, results.get(0).value); |
| assertEquals("wi fi network is fast", results.get(1).key); |
| assertEquals(10, results.get(1).value); |
| } |
| |
| public void testInputPathRequired() throws Exception { |
| |
| // SynonymMap.Builder b = new SynonymMap.Builder(false); |
| // b.add(new CharsRef("ab"), new CharsRef("ba"), true); |
| // final SynonymMap map = b.build(); |
| |
| // The Analyzer below mimics the functionality of the SynonymAnalyzer |
| // using the above map, so that the suggest module does not need a dependency on the |
| // synonym module |
| |
| final Analyzer analyzer = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| |
| return new TokenStreamComponents(tokenizer) { |
| int tokenStreamCounter = 0; |
| final TokenStream[] tokenStreams = new TokenStream[] { |
| new CannedTokenStream(new Token[] { |
| token("ab",1,1), |
| token("ba",0,1), |
| token("xc",1,1) |
| }), |
| new CannedTokenStream(new Token[] { |
| token("ba",1,1), |
| token("xd",1,1) |
| }), |
| new CannedTokenStream(new Token[] { |
| token("ab",1,1), |
| token("ba",0,1), |
| token("x",1,1) |
| }) |
| }; |
| |
| @Override |
| public TokenStream getTokenStream() { |
| TokenStream result = tokenStreams[tokenStreamCounter]; |
| tokenStreamCounter++; |
| return result; |
| } |
| |
| @Override |
| protected void setReader(final Reader reader) throws IOException { |
| } |
| }; |
| } |
| }; |
| |
| TermFreq keys[] = new TermFreq[] { |
| new TermFreq("ab xc", 50), |
| new TermFreq("ba xd", 50), |
| }; |
| AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer); |
| suggester.build(new TermFreqArrayIterator(keys)); |
| List<LookupResult> results = suggester.lookup("ab x", false, 1); |
| assertTrue(results.size() == 1); |
| } |
| |
| private static Token token(String term, int posInc, int posLength) { |
| final Token t = new Token(term, 0, 0); |
| t.setPositionIncrement(posInc); |
| t.setPositionLength(posLength); |
| return t; |
| } |
| |
| private static BinaryToken token(BytesRef term) { |
| return new BinaryToken(term); |
| } |
| |
| /* |
| private void printTokens(final Analyzer analyzer, String input) throws IOException { |
| System.out.println("Tokens for " + input); |
| TokenStream ts = analyzer.tokenStream("", new StringReader(input)); |
| ts.reset(); |
| final TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); |
| final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); |
| final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class); |
| |
| while(ts.incrementToken()) { |
| termBytesAtt.fillBytesRef(); |
| System.out.println(String.format("%s,%s,%s", termBytesAtt.getBytesRef().utf8ToString(), posIncAtt.getPositionIncrement(), posLengthAtt.getPositionLength())); |
| } |
| ts.end(); |
| ts.close(); |
| } |
| */ |
| |
| private final Analyzer getUnusualAnalyzer() { |
| return new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| |
| return new TokenStreamComponents(tokenizer) { |
| |
| int count; |
| |
| @Override |
| public TokenStream getTokenStream() { |
| // 4th time we are called, return tokens a b, |
| // else just a: |
| if (count++ != 3) { |
| return new CannedTokenStream(new Token[] { |
| token("a", 1, 1), |
| }); |
| } else { |
| // After that "a b": |
| return new CannedTokenStream(new Token[] { |
| token("a", 1, 1), |
| token("b", 1, 1), |
| }); |
| } |
| } |
| |
| @Override |
| protected void setReader(final Reader reader) throws IOException { |
| } |
| }; |
| } |
| }; |
| } |
| |
| public void testExactFirst() throws Exception { |
| |
| Analyzer a = getUnusualAnalyzer(); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1); |
| suggester.build(new TermFreqArrayIterator(new TermFreq[] { |
| new TermFreq("x y", 1), |
| new TermFreq("x y z", 3), |
| new TermFreq("x", 2), |
| new TermFreq("z z z", 20), |
| })); |
| |
| //System.out.println("ALL: " + suggester.lookup("x y", false, 6)); |
| |
| for(int topN=1;topN<6;topN++) { |
| List<LookupResult> results = suggester.lookup("x y", false, topN); |
| //System.out.println("topN=" + topN + " " + results); |
| |
| assertEquals(Math.min(topN, 4), results.size()); |
| |
| assertEquals("x y", results.get(0).key); |
| assertEquals(1, results.get(0).value); |
| |
| if (topN > 1) { |
| assertEquals("z z z", results.get(1).key); |
| assertEquals(20, results.get(1).value); |
| |
| if (topN > 2) { |
| assertEquals("x y z", results.get(2).key); |
| assertEquals(3, results.get(2).value); |
| |
| if (topN > 3) { |
| assertEquals("x", results.get(3).key); |
| assertEquals(2, results.get(3).value); |
| } |
| } |
| } |
| } |
| } |
| |
| public void testNonExactFirst() throws Exception { |
| |
| Analyzer a = getUnusualAnalyzer(); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1); |
| |
| suggester.build(new TermFreqArrayIterator(new TermFreq[] { |
| new TermFreq("x y", 1), |
| new TermFreq("x y z", 3), |
| new TermFreq("x", 2), |
| new TermFreq("z z z", 20), |
| })); |
| |
| for(int topN=1;topN<6;topN++) { |
| List<LookupResult> results = suggester.lookup("p", false, topN); |
| |
| assertEquals(Math.min(topN, 4), results.size()); |
| |
| assertEquals("z z z", results.get(0).key); |
| assertEquals(20, results.get(0).value); |
| |
| if (topN > 1) { |
| assertEquals("x y z", results.get(1).key); |
| assertEquals(3, results.get(1).value); |
| |
| if (topN > 2) { |
| assertEquals("x", results.get(2).key); |
| assertEquals(2, results.get(2).value); |
| |
| if (topN > 3) { |
| assertEquals("x y", results.get(3).key); |
| assertEquals(1, results.get(3).value); |
| } |
| } |
| } |
| } |
| } |
| |
| // Holds surface form separately: |
| private static class TermFreq2 implements Comparable<TermFreq2> { |
| public final String surfaceForm; |
| public final String analyzedForm; |
| public final long weight; |
| public final BytesRef payload; |
| |
| public TermFreq2(String surfaceForm, String analyzedForm, long weight, BytesRef payload) { |
| this.surfaceForm = surfaceForm; |
| this.analyzedForm = analyzedForm; |
| this.weight = weight; |
| this.payload = payload; |
| } |
| |
| @Override |
| public int compareTo(TermFreq2 other) { |
| int cmp = analyzedForm.compareTo(other.analyzedForm); |
| if (cmp != 0) { |
| return cmp; |
| } else if (weight > other.weight) { |
| return -1; |
| } else if (weight < other.weight) { |
| return 1; |
| } else { |
| assert false; |
| return 0; |
| } |
| } |
| |
| @Override |
| public String toString() { |
| return surfaceForm + "/" + weight; |
| } |
| } |
| |
| static boolean isStopChar(char ch, int numStopChars) { |
| //System.out.println("IS? " + ch + ": " + (ch - 'a') + ": " + ((ch - 'a') < numStopChars)); |
| return (ch - 'a') < numStopChars; |
| } |
| |
| // Like StopFilter: |
| private static class TokenEater extends TokenFilter { |
| private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final int numStopChars; |
| private final boolean preserveHoles; |
| private boolean first; |
| |
| public TokenEater(boolean preserveHoles, TokenStream in, int numStopChars) { |
| super(in); |
| this.preserveHoles = preserveHoles; |
| this.numStopChars = numStopChars; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| first = true; |
| } |
| |
| @Override |
| public final boolean incrementToken() throws IOException { |
| int skippedPositions = 0; |
| while (input.incrementToken()) { |
| if (termAtt.length() != 1 || !isStopChar(termAtt.charAt(0), numStopChars)) { |
| int posInc = posIncrAtt.getPositionIncrement() + skippedPositions; |
| if (first) { |
| if (posInc == 0) { |
| // first token having posinc=0 is illegal. |
| posInc = 1; |
| } |
| first = false; |
| } |
| posIncrAtt.setPositionIncrement(posInc); |
| //System.out.println("RETURN term=" + termAtt + " numStopChars=" + numStopChars); |
| return true; |
| } |
| if (preserveHoles) { |
| skippedPositions += posIncrAtt.getPositionIncrement(); |
| } |
| } |
| |
| return false; |
| } |
| } |
| |
| private static class MockTokenEatingAnalyzer extends Analyzer { |
| private int numStopChars; |
| private boolean preserveHoles; |
| |
| private final MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); |
| |
| public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) { |
| this.preserveHoles = preserveHoles; |
| this.numStopChars = numStopChars; |
| } |
| |
| @Override |
| public TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| MockTokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); |
| tokenizer.setEnableChecks(true); |
| TokenStream next; |
| if (numStopChars != 0) { |
| next = new TokenEater(preserveHoles, tokenizer, numStopChars); |
| } else { |
| next = tokenizer; |
| } |
| return new TokenStreamComponents(tokenizer, next); |
| } |
| } |
| |
| private static char SEP = '\uFFFF'; |
| |
| public void testRandom() throws Exception { |
| |
| int numQueries = atLeast(1000); |
| |
| final List<TermFreq2> slowCompletor = new ArrayList<TermFreq2>(); |
| final TreeSet<String> allPrefixes = new TreeSet<String>(); |
| final Set<String> seen = new HashSet<String>(); |
| |
| boolean doPayloads = random().nextBoolean(); |
| |
| TermFreq[] keys = null; |
| TermFreqPayload[] payloadKeys = null; |
| if (doPayloads) { |
| payloadKeys = new TermFreqPayload[numQueries]; |
| } else { |
| keys = new TermFreq[numQueries]; |
| } |
| |
| boolean preserveSep = random().nextBoolean(); |
| |
| final int numStopChars = random().nextInt(10); |
| final boolean preserveHoles = random().nextBoolean(); |
| |
| if (VERBOSE) { |
| System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles); |
| } |
| |
| for (int i = 0; i < numQueries; i++) { |
| int numTokens = _TestUtil.nextInt(random(), 1, 4); |
| String key; |
| String analyzedKey; |
| while(true) { |
| key = ""; |
| analyzedKey = ""; |
| boolean lastRemoved = false; |
| for(int token=0;token < numTokens;token++) { |
| String s; |
| while (true) { |
| // TODO: would be nice to fix this slowCompletor/comparator to |
| // use full range, but we might lose some coverage too... |
| s = _TestUtil.randomSimpleString(random()); |
| if (s.length() > 0) { |
| if (token > 0) { |
| key += " "; |
| } |
| if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != SEP) { |
| analyzedKey += SEP; |
| } |
| key += s; |
| if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) { |
| lastRemoved = true; |
| if (preserveSep && preserveHoles) { |
| analyzedKey += SEP; |
| } |
| } else { |
| lastRemoved = false; |
| analyzedKey += s; |
| } |
| break; |
| } |
| } |
| } |
| |
| analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", ""); |
| |
| if (preserveSep && lastRemoved) { |
| analyzedKey += SEP; |
| } |
| |
| // Don't add same surface form more than once: |
| if (!seen.contains(key)) { |
| seen.add(key); |
| break; |
| } |
| } |
| |
| for (int j = 1; j < key.length(); j++) { |
| allPrefixes.add(key.substring(0, j)); |
| } |
| // we can probably do Integer.MAX_VALUE here, but why worry. |
| int weight = random().nextInt(1<<24); |
| BytesRef payload; |
| if (doPayloads) { |
| byte[] bytes = new byte[random().nextInt(10)]; |
| random().nextBytes(bytes); |
| payload = new BytesRef(bytes); |
| payloadKeys[i] = new TermFreqPayload(key, weight, payload); |
| } else { |
| keys[i] = new TermFreq(key, weight); |
| payload = null; |
| } |
| |
| slowCompletor.add(new TermFreq2(key, analyzedKey, weight, payload)); |
| } |
| |
| if (VERBOSE) { |
| // Don't just sort original list, to avoid VERBOSE |
| // altering the test: |
| List<TermFreq2> sorted = new ArrayList<TermFreq2>(slowCompletor); |
| Collections.sort(sorted); |
| for(TermFreq2 ent : sorted) { |
| System.out.println(" surface='" + ent.surfaceForm + "' analyzed='" + ent.analyzedForm + "' weight=" + ent.weight); |
| } |
| } |
| |
| Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, |
| preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1); |
| if (doPayloads) { |
| suggester.build(new TermFreqPayloadArrayIterator(payloadKeys)); |
| } else { |
| suggester.build(new TermFreqArrayIterator(keys)); |
| } |
| |
| for (String prefix : allPrefixes) { |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: prefix=" + prefix); |
| } |
| |
| final int topN = _TestUtil.nextInt(random(), 1, 10); |
| List<LookupResult> r = suggester.lookup(_TestUtil.stringToCharSequence(prefix, random()), false, topN); |
| |
| // 2. go thru whole set to find suggestions: |
| List<TermFreq2> matches = new ArrayList<TermFreq2>(); |
| |
| // "Analyze" the key: |
| String[] tokens = prefix.split(" "); |
| StringBuilder builder = new StringBuilder(); |
| boolean lastRemoved = false; |
| for(int i=0;i<tokens.length;i++) { |
| String token = tokens[i]; |
| if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(""+SEP)) { |
| builder.append(SEP); |
| } |
| |
| if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) { |
| if (preserveSep && preserveHoles) { |
| builder.append(SEP); |
| } |
| lastRemoved = true; |
| } else { |
| builder.append(token); |
| lastRemoved = false; |
| } |
| } |
| |
| String analyzedKey = builder.toString(); |
| |
| // Remove trailing sep/holes (TokenStream.end() does |
| // not tell us any trailing holes, yet ... there is an |
| // issue open for this): |
| while (true) { |
| String s = analyzedKey.replaceAll(SEP + "$", ""); |
| if (s.equals(analyzedKey)) { |
| break; |
| } |
| analyzedKey = s; |
| } |
| |
| if (analyzedKey.length() == 0) { |
| // Currently suggester can't suggest from the empty |
| // string! You get no results, not all results... |
| continue; |
| } |
| |
| if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) { |
| analyzedKey += SEP; |
| } |
| |
| if (VERBOSE) { |
| System.out.println(" analyzed: " + analyzedKey); |
| } |
| |
| // TODO: could be faster... but its slowCompletor for a reason |
| for (TermFreq2 e : slowCompletor) { |
| if (e.analyzedForm.startsWith(analyzedKey)) { |
| matches.add(e); |
| } |
| } |
| |
| assertTrue(numStopChars > 0 || matches.size() > 0); |
| |
| if (matches.size() > 1) { |
| Collections.sort(matches, new Comparator<TermFreq2>() { |
| @Override |
| public int compare(TermFreq2 left, TermFreq2 right) { |
| int cmp = Float.compare(right.weight, left.weight); |
| if (cmp == 0) { |
| return left.analyzedForm.compareTo(right.analyzedForm); |
| } else { |
| return cmp; |
| } |
| } |
| }); |
| } |
| |
| if (matches.size() > topN) { |
| matches = matches.subList(0, topN); |
| } |
| |
| if (VERBOSE) { |
| System.out.println(" expected:"); |
| for(TermFreq2 lr : matches) { |
| System.out.println(" key=" + lr.surfaceForm + " weight=" + lr.weight); |
| } |
| |
| System.out.println(" actual:"); |
| for(LookupResult lr : r) { |
| System.out.println(" key=" + lr.key + " weight=" + lr.value); |
| } |
| } |
| |
| assertEquals(matches.size(), r.size()); |
| |
| for(int hit=0;hit<r.size();hit++) { |
| //System.out.println(" check hit " + hit); |
| assertEquals(matches.get(hit).surfaceForm.toString(), r.get(hit).key.toString()); |
| assertEquals(matches.get(hit).weight, r.get(hit).value, 0f); |
| if (doPayloads) { |
| assertEquals(matches.get(hit).payload, r.get(hit).payload); |
| } |
| } |
| } |
| } |
| |
| public void testStolenBytes() throws Exception { |
| |
| // First time w/ preserveSep, second time without: |
| for(int i=0;i<2;i++) { |
| |
| final Analyzer analyzer = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| |
| // TokenStream stream = new SynonymFilter(tokenizer, map, true); |
| // return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream)); |
| return new TokenStreamComponents(tokenizer) { |
| int tokenStreamCounter = 0; |
| final TokenStream[] tokenStreams = new TokenStream[] { |
| new CannedBinaryTokenStream(new BinaryToken[] { |
| token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})), |
| }), |
| new CannedTokenStream(new Token[] { |
| token("a",1,1), |
| token("a",1,1) |
| }), |
| new CannedTokenStream(new Token[] { |
| token("a",1,1), |
| token("a",1,1) |
| }), |
| new CannedBinaryTokenStream(new BinaryToken[] { |
| token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})), |
| }) |
| }; |
| |
| @Override |
| public TokenStream getTokenStream() { |
| TokenStream result = tokenStreams[tokenStreamCounter]; |
| tokenStreamCounter++; |
| return result; |
| } |
| |
| @Override |
| protected void setReader(final Reader reader) throws IOException { |
| } |
| }; |
| } |
| }; |
| |
| TermFreq keys[] = new TermFreq[] { |
| new TermFreq("a a", 50), |
| new TermFreq("a b", 50), |
| }; |
| |
| AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, analyzer, AnalyzingSuggester.EXACT_FIRST | (i==0 ? AnalyzingSuggester.PRESERVE_SEP : 0), 256, -1); |
| suggester.build(new TermFreqArrayIterator(keys)); |
| List<LookupResult> results = suggester.lookup("a a", false, 5); |
| assertEquals(1, results.size()); |
| assertEquals("a b", results.get(0).key); |
| assertEquals(50, results.get(0).value); |
| |
| results = suggester.lookup("a a", false, 5); |
| assertEquals(1, results.size()); |
| assertEquals("a a", results.get(0).key); |
| assertEquals(50, results.get(0).value); |
| } |
| } |
| |
| public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception { |
| Analyzer a = new MockAnalyzer(random()); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 2, -1); |
| |
| List<TermFreq> keys = Arrays.asList(new TermFreq[] { |
| new TermFreq("a", 40), |
| new TermFreq("a ", 50), |
| new TermFreq(" a", 60), |
| }); |
| |
| Collections.shuffle(keys, random()); |
| suggester.build(new TermFreqArrayIterator(keys)); |
| |
| List<LookupResult> results = suggester.lookup("a", false, 5); |
| assertEquals(2, results.size()); |
| assertEquals(" a", results.get(0).key); |
| assertEquals(60, results.get(0).value); |
| assertEquals("a ", results.get(1).key); |
| assertEquals(50, results.get(1).value); |
| } |
| |
| public void testQueueExhaustion() throws Exception { |
| Analyzer a = new MockAnalyzer(random()); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.EXACT_FIRST, 256, -1); |
| |
| suggester.build(new TermFreqArrayIterator(new TermFreq[] { |
| new TermFreq("a", 2), |
| new TermFreq("a b c", 3), |
| new TermFreq("a c a", 1), |
| new TermFreq("a c b", 1), |
| })); |
| |
| suggester.lookup("a", false, 4); |
| } |
| |
| public void testExactFirstMissingResult() throws Exception { |
| |
| Analyzer a = new MockAnalyzer(random()); |
| |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.EXACT_FIRST, 256, -1); |
| |
| suggester.build(new TermFreqArrayIterator(new TermFreq[] { |
| new TermFreq("a", 5), |
| new TermFreq("a b", 3), |
| new TermFreq("a c", 4), |
| })); |
| |
| List<LookupResult> results = suggester.lookup("a", false, 3); |
| assertEquals(3, results.size()); |
| assertEquals("a", results.get(0).key); |
| assertEquals(5, results.get(0).value); |
| assertEquals("a c", results.get(1).key); |
| assertEquals(4, results.get(1).value); |
| assertEquals("a b", results.get(2).key); |
| assertEquals(3, results.get(2).value); |
| |
| // Try again after save/load: |
| File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest"); |
| tmpDir.mkdir(); |
| |
| File path = new File(tmpDir, "suggester"); |
| |
| OutputStream os = new FileOutputStream(path); |
| suggester.store(os); |
| os.close(); |
| |
| InputStream is = new FileInputStream(path); |
| suggester.load(is); |
| is.close(); |
| |
| results = suggester.lookup("a", false, 3); |
| assertEquals(3, results.size()); |
| assertEquals("a", results.get(0).key); |
| assertEquals(5, results.get(0).value); |
| assertEquals("a c", results.get(1).key); |
| assertEquals(4, results.get(1).value); |
| assertEquals("a b", results.get(2).key); |
| assertEquals(3, results.get(2).value); |
| } |
| |
| public void testDupSurfaceFormsMissingResults() throws Exception { |
| Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| |
| return new TokenStreamComponents(tokenizer) { |
| |
| @Override |
| public TokenStream getTokenStream() { |
| return new CannedTokenStream(new Token[] { |
| token("hairy", 1, 1), |
| token("smelly", 0, 1), |
| token("dog", 1, 1), |
| }); |
| } |
| |
| @Override |
| protected void setReader(final Reader reader) throws IOException { |
| } |
| }; |
| } |
| }; |
| |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1); |
| |
| suggester.build(new TermFreqArrayIterator(new TermFreq[] { |
| new TermFreq("hambone", 6), |
| new TermFreq("nellie", 5), |
| })); |
| |
| List<LookupResult> results = suggester.lookup("nellie", false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("hambone", results.get(0).key); |
| assertEquals(6, results.get(0).value); |
| assertEquals("nellie", results.get(1).key); |
| assertEquals(5, results.get(1).value); |
| |
| // Try again after save/load: |
| File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest"); |
| tmpDir.mkdir(); |
| |
| File path = new File(tmpDir, "suggester"); |
| |
| OutputStream os = new FileOutputStream(path); |
| suggester.store(os); |
| os.close(); |
| |
| InputStream is = new FileInputStream(path); |
| suggester.load(is); |
| is.close(); |
| |
| results = suggester.lookup("nellie", false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("hambone", results.get(0).key); |
| assertEquals(6, results.get(0).value); |
| assertEquals("nellie", results.get(1).key); |
| assertEquals(5, results.get(1).value); |
| } |
| |
| public void testDupSurfaceFormsMissingResults2() throws Exception { |
| Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| |
| return new TokenStreamComponents(tokenizer) { |
| |
| int count; |
| |
| @Override |
| public TokenStream getTokenStream() { |
| if (count == 0) { |
| count++; |
| return new CannedTokenStream(new Token[] { |
| token("p", 1, 1), |
| token("q", 1, 1), |
| token("r", 0, 1), |
| token("s", 0, 1), |
| }); |
| } else { |
| return new CannedTokenStream(new Token[] { |
| token("p", 1, 1), |
| }); |
| } |
| } |
| |
| @Override |
| protected void setReader(final Reader reader) throws IOException { |
| } |
| }; |
| } |
| }; |
| |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1); |
| |
| suggester.build(new TermFreqArrayIterator(new TermFreq[] { |
| new TermFreq("a", 6), |
| new TermFreq("b", 5), |
| })); |
| |
| List<LookupResult> results = suggester.lookup("a", false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("a", results.get(0).key); |
| assertEquals(6, results.get(0).value); |
| assertEquals("b", results.get(1).key); |
| assertEquals(5, results.get(1).value); |
| |
| // Try again after save/load: |
| File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest"); |
| tmpDir.mkdir(); |
| |
| File path = new File(tmpDir, "suggester"); |
| |
| OutputStream os = new FileOutputStream(path); |
| suggester.store(os); |
| os.close(); |
| |
| InputStream is = new FileInputStream(path); |
| suggester.load(is); |
| is.close(); |
| |
| results = suggester.lookup("a", false, 2); |
| assertEquals(2, results.size()); |
| assertEquals("a", results.get(0).key); |
| assertEquals(6, results.get(0).value); |
| assertEquals("b", results.get(1).key); |
| assertEquals(5, results.get(1).value); |
| } |
| |
| public void test0ByteKeys() throws Exception { |
| final Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| |
| return new TokenStreamComponents(tokenizer) { |
| int tokenStreamCounter = 0; |
| final TokenStream[] tokenStreams = new TokenStream[] { |
| new CannedBinaryTokenStream(new BinaryToken[] { |
| token(new BytesRef(new byte[] {0x0, 0x0, 0x0})), |
| }), |
| new CannedBinaryTokenStream(new BinaryToken[] { |
| token(new BytesRef(new byte[] {0x0, 0x0})), |
| }), |
| new CannedBinaryTokenStream(new BinaryToken[] { |
| token(new BytesRef(new byte[] {0x0, 0x0, 0x0})), |
| }), |
| new CannedBinaryTokenStream(new BinaryToken[] { |
| token(new BytesRef(new byte[] {0x0, 0x0})), |
| }), |
| }; |
| |
| @Override |
| public TokenStream getTokenStream() { |
| TokenStream result = tokenStreams[tokenStreamCounter]; |
| tokenStreamCounter++; |
| return result; |
| } |
| |
| @Override |
| protected void setReader(final Reader reader) throws IOException { |
| } |
| }; |
| } |
| }; |
| |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1); |
| |
| suggester.build(new TermFreqArrayIterator(new TermFreq[] { |
| new TermFreq("a a", 50), |
| new TermFreq("a b", 50), |
| })); |
| } |
| |
| public void testDupSurfaceFormsMissingResults3() throws Exception { |
| Analyzer a = new MockAnalyzer(random()); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1); |
| suggester.build(new TermFreqArrayIterator(new TermFreq[] { |
| new TermFreq("a a", 7), |
| new TermFreq("a a", 7), |
| new TermFreq("a c", 6), |
| new TermFreq("a c", 3), |
| new TermFreq("a b", 5), |
| })); |
| assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString()); |
| } |
| |
| public void testEndingSpace() throws Exception { |
| Analyzer a = new MockAnalyzer(random()); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1); |
| suggester.build(new TermFreqArrayIterator(new TermFreq[] { |
| new TermFreq("i love lucy", 7), |
| new TermFreq("isla de muerta", 8), |
| })); |
| assertEquals("[isla de muerta/8, i love lucy/7]", suggester.lookup("i", false, 3).toString()); |
| assertEquals("[i love lucy/7]", suggester.lookup("i ", false, 3).toString()); |
| } |
| } |