| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.suggest.fst; |
| |
| import java.nio.charset.StandardCharsets; |
| import java.util.*; |
| |
| import org.apache.lucene.search.suggest.*; |
| import org.apache.lucene.search.suggest.Lookup.LookupResult; |
| import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.*; |
| |
| /** |
| * Unit tests for {@link FSTCompletion}. |
| */ |
| public class FSTCompletionTest extends LuceneTestCase { |
| |
| public static Input tf(String t, int v) { |
| return new Input(t, v); |
| } |
| |
| private FSTCompletion completion; |
| private FSTCompletion completionAlphabetical; |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| |
| FSTCompletionBuilder builder = new FSTCompletionBuilder(); |
| for (Input tf : evalKeys()) { |
| builder.add(tf.term, (int) tf.v); |
| } |
| completion = builder.build(); |
| completionAlphabetical = new FSTCompletion(completion.getFST(), false, true); |
| } |
| |
| private Input[] evalKeys() { |
| final Input[] keys = new Input[] { |
| tf("one", 0), |
| tf("oneness", 1), |
| tf("onerous", 1), |
| tf("onesimus", 1), |
| tf("two", 1), |
| tf("twofold", 1), |
| tf("twonk", 1), |
| tf("thrive", 1), |
| tf("through", 1), |
| tf("threat", 1), |
| tf("three", 1), |
| tf("foundation", 1), |
| tf("fourblah", 1), |
| tf("fourteen", 1), |
| tf("four", 0), |
| tf("fourier", 0), |
| tf("fourty", 0), |
| tf("xo", 1), |
| }; |
| return keys; |
| } |
| |
| public void testExactMatchHighPriority() throws Exception { |
| assertMatchEquals(completion.lookup(stringToCharSequence("two"), 1), |
| "two/1.0"); |
| } |
| |
| public void testExactMatchLowPriority() throws Exception { |
| assertMatchEquals(completion.lookup(stringToCharSequence("one"), 2), |
| "one/0.0", |
| "oneness/1.0"); |
| } |
| |
| public void testExactMatchReordering() throws Exception { |
| // Check reordering of exact matches. |
| assertMatchEquals(completion.lookup(stringToCharSequence("four"), 4), |
| "four/0.0", |
| "fourblah/1.0", |
| "fourteen/1.0", |
| "fourier/0.0"); |
| } |
| |
| public void testRequestedCount() throws Exception { |
| // 'one' is promoted after collecting two higher ranking results. |
| assertMatchEquals(completion.lookup(stringToCharSequence("one"), 2), |
| "one/0.0", |
| "oneness/1.0"); |
| |
| // 'four' is collected in a bucket and then again as an exact match. |
| assertMatchEquals(completion.lookup(stringToCharSequence("four"), 2), |
| "four/0.0", |
| "fourblah/1.0"); |
| |
| // Check reordering of exact matches. |
| assertMatchEquals(completion.lookup(stringToCharSequence("four"), 4), |
| "four/0.0", |
| "fourblah/1.0", |
| "fourteen/1.0", |
| "fourier/0.0"); |
| |
| // 'one' is at the top after collecting all alphabetical results. |
| assertMatchEquals(completionAlphabetical.lookup(stringToCharSequence("one"), 2), |
| "one/0.0", |
| "oneness/1.0"); |
| |
| // 'one' is not promoted after collecting two higher ranking results. |
| FSTCompletion noPromotion = new FSTCompletion(completion.getFST(), true, false); |
| assertMatchEquals(noPromotion.lookup(stringToCharSequence("one"), 2), |
| "oneness/1.0", |
| "onerous/1.0"); |
| |
| // 'one' is at the top after collecting all alphabetical results. |
| assertMatchEquals(completionAlphabetical.lookup(stringToCharSequence("one"), 2), |
| "one/0.0", |
| "oneness/1.0"); |
| } |
| |
| public void testMiss() throws Exception { |
| assertMatchEquals(completion.lookup(stringToCharSequence("xyz"), 1)); |
| } |
| |
| public void testAlphabeticWithWeights() throws Exception { |
| assertEquals(0, completionAlphabetical.lookup(stringToCharSequence("xyz"), 1).size()); |
| } |
| |
| public void testFullMatchList() throws Exception { |
| assertMatchEquals(completion.lookup(stringToCharSequence("one"), Integer.MAX_VALUE), |
| "oneness/1.0", |
| "onerous/1.0", |
| "onesimus/1.0", |
| "one/0.0"); |
| } |
| |
| public void testThreeByte() throws Exception { |
| String key = new String(new byte[] { |
| (byte) 0xF0, (byte) 0xA4, (byte) 0xAD, (byte) 0xA2}, StandardCharsets.UTF_8); |
| FSTCompletionBuilder builder = new FSTCompletionBuilder(); |
| builder.add(new BytesRef(key), 0); |
| |
| FSTCompletion lookup = builder.build(); |
| List<Completion> result = lookup.lookup(stringToCharSequence(key), 1); |
| assertEquals(1, result.size()); |
| } |
| |
| public void testLargeInputConstantWeights() throws Exception { |
| Directory tempDir = getDirectory(); |
| FSTCompletionLookup lookup = new FSTCompletionLookup(tempDir, "fst", 10, true); |
| |
| Random r = random(); |
| List<Input> keys = new ArrayList<>(); |
| for (int i = 0; i < 5000; i++) { |
| keys.add(new Input(TestUtil.randomSimpleString(r), -1)); |
| } |
| |
| lookup.build(new InputArrayIterator(keys)); |
| |
| // All the weights were constant, so all returned buckets must be constant, whatever they |
| // are. |
| Long previous = null; |
| for (Input tf : keys) { |
| Long current = ((Number)lookup.get(TestUtil.bytesToCharSequence(tf.term, random()))).longValue(); |
| if (previous != null) { |
| assertEquals(previous, current); |
| } |
| previous = current; |
| } |
| tempDir.close(); |
| } |
| |
| @Slow |
| public void testMultilingualInput() throws Exception { |
| List<Input> input = LookupBenchmarkTest.readTop50KWiki(); |
| |
| Directory tempDir = getDirectory(); |
| FSTCompletionLookup lookup = new FSTCompletionLookup(tempDir, "fst"); |
| lookup.build(new InputArrayIterator(input)); |
| assertEquals(input.size(), lookup.getCount()); |
| for (Input tf : input) { |
| assertNotNull("Not found: " + tf.term.toString(), lookup.get(TestUtil.bytesToCharSequence(tf.term, random()))); |
| assertEquals(tf.term.utf8ToString(), lookup.lookup(TestUtil.bytesToCharSequence(tf.term, random()), true, 1).get(0).key.toString()); |
| } |
| |
| List<LookupResult> result = lookup.lookup(stringToCharSequence("wit"), true, 5); |
| assertEquals(5, result.size()); |
| assertTrue(result.get(0).key.toString().equals("wit")); // exact match. |
| assertTrue(result.get(1).key.toString().equals("with")); // highest count. |
| tempDir.close(); |
| } |
| |
| public void testEmptyInput() throws Exception { |
| completion = new FSTCompletionBuilder().build(); |
| assertMatchEquals(completion.lookup(stringToCharSequence(""), 10)); |
| } |
| |
| public void testRandom() throws Exception { |
| List<Input> freqs = new ArrayList<>(); |
| Random rnd = random(); |
| for (int i = 0; i < 2500 + rnd.nextInt(2500); i++) { |
| int weight = rnd.nextInt(100); |
| freqs.add(new Input("" + rnd.nextLong(), weight)); |
| } |
| |
| Directory tempDir = getDirectory(); |
| FSTCompletionLookup lookup = new FSTCompletionLookup(tempDir, "fst"); |
| lookup.build(new InputArrayIterator(freqs.toArray(new Input[freqs.size()]))); |
| |
| for (Input tf : freqs) { |
| final String term = tf.term.utf8ToString(); |
| for (int i = 1; i < term.length(); i++) { |
| String prefix = term.substring(0, i); |
| for (LookupResult lr : lookup.lookup(stringToCharSequence(prefix), true, 10)) { |
| assertTrue(lr.key.toString().startsWith(prefix)); |
| } |
| } |
| } |
| tempDir.close(); |
| } |
| |
| private CharSequence stringToCharSequence(String prefix) { |
| return TestUtil.stringToCharSequence(prefix, random()); |
| } |
| |
| private void assertMatchEquals(List<Completion> res, String... expected) { |
| String [] result = new String [res.size()]; |
| for (int i = 0; i < res.size(); i++) { |
| result[i] = res.get(i).toString(); |
| } |
| |
| if (!Arrays.equals(stripScore(expected), stripScore(result))) { |
| int colLen = Math.max(maxLen(expected), maxLen(result)); |
| |
| StringBuilder b = new StringBuilder(); |
| String format = "%" + colLen + "s " + "%" + colLen + "s\n"; |
| b.append(String.format(Locale.ROOT, format, "Expected", "Result")); |
| for (int i = 0; i < Math.max(result.length, expected.length); i++) { |
| b.append(String.format(Locale.ROOT, format, |
| i < expected.length ? expected[i] : "--", |
| i < result.length ? result[i] : "--")); |
| } |
| |
| System.err.println(b.toString()); |
| fail("Expected different output:\n" + b.toString()); |
| } |
| } |
| |
| private String[] stripScore(String[] expected) { |
| String [] result = new String [expected.length]; |
| for (int i = 0; i < result.length; i++) { |
| result[i] = expected[i].replaceAll("\\/[0-9\\.]+", ""); |
| } |
| return result; |
| } |
| |
| private int maxLen(String[] result) { |
| int len = 0; |
| for (String s : result) |
| len = Math.max(len, s.length()); |
| return len; |
| } |
| |
| private Directory getDirectory() { |
| return newDirectory(); |
| } |
| } |