| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util.fst; |
| |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.StringWriter; |
| import java.io.Writer; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Random; |
| import java.util.Set; |
| import java.util.TreeMap; |
| import java.util.TreeSet; |
| import java.util.concurrent.atomic.AtomicInteger; |
| |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.index.MultiTerms; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.FSDirectory; |
| import org.apache.lucene.store.IOContext; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.store.MockDirectoryWrapper; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.LineFileDocs; |
| import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.automaton.Automaton; |
| import org.apache.lucene.util.automaton.CompiledAutomaton; |
| import org.apache.lucene.util.automaton.RegExp; |
| import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; |
| import org.apache.lucene.util.fst.FST.Arc; |
| import org.apache.lucene.util.fst.FST.BytesReader; |
| import org.apache.lucene.util.fst.PairOutputs.Pair; |
| import org.apache.lucene.util.fst.Util.Result; |
| |
| import static org.apache.lucene.util.fst.FSTTester.getRandomString; |
| import static org.apache.lucene.util.fst.FSTTester.simpleRandomString; |
| import static org.apache.lucene.util.fst.FSTTester.toIntsRef; |
| |
| @SuppressCodecs({ "SimpleText", "Direct" }) |
| public class TestFSTs extends LuceneTestCase { |
| |
| private MockDirectoryWrapper dir; |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| dir = newMockDirectory(); |
| } |
| |
| @Override |
| public void tearDown() throws Exception { |
| // can be null if we force simpletext (funky, some kind of bug in test runner maybe) |
| if (dir != null) dir.close(); |
| super.tearDown(); |
| } |
| |
| public void testBasicFSA() throws IOException { |
| String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"}; |
| String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"}; |
| IntsRef[] terms = new IntsRef[strings.length]; |
| IntsRef[] terms2 = new IntsRef[strings2.length]; |
| for(int inputMode=0;inputMode<2;inputMode++) { |
| if (VERBOSE) { |
| System.out.println("TEST: inputMode=" + inputModeToString(inputMode)); |
| } |
| |
| for(int idx=0;idx<strings.length;idx++) { |
| terms[idx] = toIntsRef(strings[idx], inputMode); |
| } |
| for(int idx=0;idx<strings2.length;idx++) { |
| terms2[idx] = toIntsRef(strings2[idx], inputMode); |
| } |
| Arrays.sort(terms2); |
| |
| doTest(inputMode, terms); |
| |
| // Test pre-determined FST sizes to make sure we haven't lost minimality (at least on this trivial set of terms): |
| |
| // FSA |
| { |
| final Outputs<Object> outputs = NoOutputs.getSingleton(); |
| final Object NO_OUTPUT = outputs.getNoOutput(); |
| final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<>(terms2.length); |
| for(IntsRef term : terms2) { |
| pairs.add(new FSTTester.InputOutput<>(term, NO_OUTPUT)); |
| } |
| FSTTester<Object> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs, false); |
| FST<Object> fst = tester.doTest(0, 0, false); |
| assertNotNull(fst); |
| assertEquals(22, tester.nodeCount); |
| assertEquals(27, tester.arcCount); |
| } |
| |
| // FST ord pos int |
| { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<>(terms2.length); |
| for(int idx=0;idx<terms2.length;idx++) { |
| pairs.add(new FSTTester.InputOutput<>(terms2[idx], (long) idx)); |
| } |
| FSTTester<Long> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs, true); |
| final FST<Long> fst = tester.doTest(0, 0, false); |
| assertNotNull(fst); |
| assertEquals(22, tester.nodeCount); |
| assertEquals(27, tester.arcCount); |
| } |
| |
| // FST byte sequence ord |
| { |
| final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); |
| final BytesRef NO_OUTPUT = outputs.getNoOutput(); |
| final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<>(terms2.length); |
| for(int idx=0;idx<terms2.length;idx++) { |
| final BytesRef output = idx == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx)); |
| pairs.add(new FSTTester.InputOutput<>(terms2[idx], output)); |
| } |
| FSTTester<BytesRef> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs, false); |
| final FST<BytesRef> fst = tester.doTest(0, 0, false); |
| assertNotNull(fst); |
| assertEquals(24, tester.nodeCount); |
| assertEquals(30, tester.arcCount); |
| } |
| } |
| } |
| |
| // given set of terms, test the different outputs for them |
| private void doTest(int inputMode, IntsRef[] terms) throws IOException { |
| Arrays.sort(terms); |
| |
| // NoOutputs (simple FSA) |
| { |
| final Outputs<Object> outputs = NoOutputs.getSingleton(); |
| final Object NO_OUTPUT = outputs.getNoOutput(); |
| final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<>(terms.length); |
| for(IntsRef term : terms) { |
| pairs.add(new FSTTester.InputOutput<>(term, NO_OUTPUT)); |
| } |
| new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true); |
| } |
| |
| // PositiveIntOutput (ord) |
| { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<>(terms.length); |
| for(int idx=0;idx<terms.length;idx++) { |
| pairs.add(new FSTTester.InputOutput<>(terms[idx], (long) idx)); |
| } |
| new FSTTester<>(random(), dir, inputMode, pairs, outputs, true).doTest(true); |
| } |
| |
| // PositiveIntOutput (random monotonically increasing positive number) |
| { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<>(terms.length); |
| long lastOutput = 0; |
| for(int idx=0;idx<terms.length;idx++) { |
| final long value = lastOutput + TestUtil.nextInt(random(), 1, 1000); |
| lastOutput = value; |
| pairs.add(new FSTTester.InputOutput<>(terms[idx], value)); |
| } |
| new FSTTester<>(random(), dir, inputMode, pairs, outputs, true).doTest(true); |
| } |
| |
| // PositiveIntOutput (random positive number) |
| { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<>(terms.length); |
| for(int idx=0;idx<terms.length;idx++) { |
| pairs.add(new FSTTester.InputOutput<>(terms[idx], TestUtil.nextLong(random(), 0, Long.MAX_VALUE))); |
| } |
| new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true); |
| } |
| |
| // Pair<ord, (random monotonically increasing positive number> |
| { |
| final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(); |
| final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(); |
| final PairOutputs<Long,Long> outputs = new PairOutputs<>(o1, o2); |
| final List<FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>> pairs = new ArrayList<>(terms.length); |
| long lastOutput = 0; |
| for(int idx=0;idx<terms.length;idx++) { |
| final long value = lastOutput + TestUtil.nextInt(random(), 1, 1000); |
| lastOutput = value; |
| pairs.add(new FSTTester.InputOutput<>(terms[idx], |
| outputs.newPair((long) idx, value))); |
| } |
| new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true); |
| } |
| |
| // Sequence-of-bytes |
| { |
| final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); |
| final BytesRef NO_OUTPUT = outputs.getNoOutput(); |
| final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<>(terms.length); |
| for(int idx=0;idx<terms.length;idx++) { |
| final BytesRef output = random().nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx)); |
| pairs.add(new FSTTester.InputOutput<>(terms[idx], output)); |
| } |
| new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true); |
| } |
| |
| // Sequence-of-ints |
| { |
| final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); |
| final List<FSTTester.InputOutput<IntsRef>> pairs = new ArrayList<>(terms.length); |
| for(int idx=0;idx<terms.length;idx++) { |
| final String s = Integer.toString(idx); |
| final IntsRef output = new IntsRef(s.length()); |
| output.length = s.length(); |
| for(int idx2=0;idx2<output.length;idx2++) { |
| output.ints[idx2] = s.charAt(idx2); |
| } |
| pairs.add(new FSTTester.InputOutput<>(terms[idx], output)); |
| } |
| new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true); |
| } |
| |
| } |
| |
| |
| public void testRandomWords() throws IOException { |
| if (TEST_NIGHTLY) { |
| testRandomWords(1000, atLeast(2)); |
| } else { |
| testRandomWords(100, 1); |
| } |
| } |
| |
| String inputModeToString(int mode) { |
| if (mode == 0) { |
| return "utf8"; |
| } else { |
| return "utf32"; |
| } |
| } |
| |
| private void testRandomWords(int maxNumWords, int numIter) throws IOException { |
| Random random = new Random(random().nextLong()); |
| for(int iter=0;iter<numIter;iter++) { |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter " + iter); |
| } |
| for(int inputMode=0;inputMode<2;inputMode++) { |
| final int numWords = random.nextInt(maxNumWords+1); |
| Set<IntsRef> termsSet = new HashSet<>(); |
| IntsRef[] terms = new IntsRef[numWords]; |
| while(termsSet.size() < numWords) { |
| final String term = getRandomString(random); |
| termsSet.add(toIntsRef(term, inputMode)); |
| } |
| doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()])); |
| } |
| } |
| } |
| |
| @Nightly |
| public void testBigSet() throws IOException { |
| testRandomWords(TestUtil.nextInt(random(), 50000, 60000), 1); |
| } |
| |
| // Build FST for all unique terms in the test line docs |
| // file, up until a doc limit |
| @Slow |
| public void testRealTerms() throws Exception { |
| |
| final LineFileDocs docs = new LineFileDocs(random()); |
| final int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(50); |
| MockAnalyzer analyzer = new MockAnalyzer(random()); |
| analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH)); |
| |
| final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64); |
| final Path tempDir = createTempDir("fstlines"); |
| final Directory dir = newFSDirectory(tempDir); |
| final IndexWriter writer = new IndexWriter(dir, conf); |
| Document doc; |
| int docCount = 0; |
| while((doc = docs.nextDoc()) != null && docCount < numDocs) { |
| writer.addDocument(doc); |
| docCount++; |
| } |
| IndexReader r = DirectoryReader.open(writer); |
| writer.close(); |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| |
| Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); |
| |
| boolean storeOrd = random().nextBoolean(); |
| if (VERBOSE) { |
| if (storeOrd) { |
| System.out.println("FST stores ord"); |
| } else { |
| System.out.println("FST stores docFreq"); |
| } |
| } |
| Terms terms = MultiTerms.getTerms(r, "body"); |
| if (terms != null) { |
| final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); |
| final TermsEnum termsEnum = terms.iterator(); |
| if (VERBOSE) { |
| System.out.println("TEST: got termsEnum=" + termsEnum); |
| } |
| BytesRef term; |
| int ord = 0; |
| |
| Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton(); |
| final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null); |
| |
| while((term = termsEnum.next()) != null) { |
| BytesRef term2 = termsEnum2.next(); |
| assertNotNull(term2); |
| assertEquals(term, term2); |
| assertEquals(termsEnum.docFreq(), termsEnum2.docFreq()); |
| assertEquals(termsEnum.totalTermFreq(), termsEnum2.totalTermFreq()); |
| |
| if (ord == 0) { |
| try { |
| termsEnum.ord(); |
| } catch (UnsupportedOperationException uoe) { |
| if (VERBOSE) { |
| System.out.println("TEST: codec doesn't support ord; FST stores docFreq"); |
| } |
| storeOrd = false; |
| } |
| } |
| final int output; |
| if (storeOrd) { |
| output = ord; |
| } else { |
| output = termsEnum.docFreq(); |
| } |
| builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output); |
| ord++; |
| if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { |
| System.out.println(ord + " terms..."); |
| } |
| } |
| FST<Long> fst = builder.finish(); |
| if (VERBOSE) { |
| System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes"); |
| } |
| |
| if (ord > 0) { |
| final Random random = new Random(random().nextLong()); |
| // Now confirm BytesRefFSTEnum and TermsEnum act the |
| // same: |
| final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); |
| int num = atLeast(1000); |
| for(int iter=0;iter<num;iter++) { |
| final BytesRef randomTerm = new BytesRef(getRandomString(random)); |
| |
| if (VERBOSE) { |
| System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm); |
| } |
| |
| final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm); |
| final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm); |
| |
| if (seekResult == TermsEnum.SeekStatus.END) { |
| assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult); |
| } else { |
| assertSame(termsEnum, fstEnum, storeOrd); |
| for(int nextIter=0;nextIter<10;nextIter++) { |
| if (VERBOSE) { |
| System.out.println("TEST: next"); |
| if (storeOrd) { |
| System.out.println(" ord=" + termsEnum.ord()); |
| } |
| } |
| if (termsEnum.next() != null) { |
| if (VERBOSE) { |
| System.out.println(" term=" + termsEnum.term().utf8ToString()); |
| } |
| assertNotNull(fstEnum.next()); |
| assertSame(termsEnum, fstEnum, storeOrd); |
| } else { |
| if (VERBOSE) { |
| System.out.println(" end!"); |
| } |
| BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next(); |
| if (nextResult != null) { |
| System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output)); |
| fail(); |
| } |
| break; |
| } |
| } |
| } |
| } |
| |
| } |
| } |
| |
| r.close(); |
| dir.close(); |
| } |
| |
| private void assertSame(TermsEnum termsEnum, BytesRefFSTEnum<?> fstEnum, boolean storeOrd) throws Exception { |
| if (termsEnum.term() == null) { |
| assertNull(fstEnum.current()); |
| } else { |
| assertNotNull(fstEnum.current()); |
| assertEquals(termsEnum.term().utf8ToString() + " != " + fstEnum.current().input.utf8ToString(), termsEnum.term(), fstEnum.current().input); |
| if (storeOrd) { |
| // fst stored the ord |
| assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.ord(), ((Long) fstEnum.current().output).longValue()); |
| } else { |
| // fst stored the docFreq |
| assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); |
| } |
| } |
| } |
| |
| private static abstract class VisitTerms<T> { |
| private final Path dirOut; |
| private final Path wordsFileIn; |
| private int inputMode; |
| private final Outputs<T> outputs; |
| private final Builder<T> builder; |
| |
| public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) { |
| this.dirOut = dirOut; |
| this.wordsFileIn = wordsFileIn; |
| this.inputMode = inputMode; |
| this.outputs = outputs; |
| |
| builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15); |
| } |
| |
| protected abstract T getOutput(IntsRef input, int ord) throws IOException; |
| |
| public void run(int limit, boolean verify, boolean verifyByOutput) throws IOException { |
| |
| BufferedReader is = Files.newBufferedReader(wordsFileIn, StandardCharsets.UTF_8); |
| try { |
| final IntsRefBuilder intsRef = new IntsRefBuilder(); |
| long tStart = System.currentTimeMillis(); |
| int ord = 0; |
| while(true) { |
| String w = is.readLine(); |
| if (w == null) { |
| break; |
| } |
| toIntsRef(w, inputMode, intsRef); |
| builder.add(intsRef.get(), |
| getOutput(intsRef.get(), ord)); |
| |
| ord++; |
| if (ord % 500000 == 0) { |
| System.out.println( |
| String.format(Locale.ROOT, |
| "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord)); |
| } |
| if (ord >= limit) { |
| break; |
| } |
| } |
| |
| long tMid = System.currentTimeMillis(); |
| System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms"); |
| |
| assert builder.getTermCount() == ord; |
| FST<T> fst = builder.finish(); |
| long tEnd = System.currentTimeMillis(); |
| System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack"); |
| if (fst == null) { |
| System.out.println("FST was fully pruned!"); |
| System.exit(0); |
| } |
| |
| if (dirOut == null) { |
| return; |
| } |
| |
| System.out.println(ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs; tot size " + fst.ramBytesUsed()); |
| if (builder.getNodeCount() < 100) { |
| Writer w = Files.newBufferedWriter(Paths.get("out.dot"), StandardCharsets.UTF_8); |
| Util.toDot(fst, w, false, false); |
| w.close(); |
| System.out.println("Wrote FST to out.dot"); |
| } |
| |
| Directory dir = FSDirectory.open(dirOut); |
| IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT); |
| fst.save(out, out); |
| out.close(); |
| System.out.println("Saved FST to fst.bin."); |
| |
| if (!verify) { |
| return; |
| } |
| |
| /* |
| IndexInput in = dir.openInput("fst.bin", IOContext.DEFAULT); |
| fst = new FST<T>(in, outputs); |
| in.close(); |
| */ |
| |
| System.out.println("\nNow verify..."); |
| |
| while(true) { |
| for(int iter=0;iter<2;iter++) { |
| is.close(); |
| is = Files.newBufferedReader(wordsFileIn, StandardCharsets.UTF_8); |
| |
| ord = 0; |
| tStart = System.currentTimeMillis(); |
| while(true) { |
| String w = is.readLine(); |
| if (w == null) { |
| break; |
| } |
| toIntsRef(w, inputMode, intsRef); |
| if (iter == 0) { |
| T expected = getOutput(intsRef.get(), ord); |
| T actual = Util.get(fst, intsRef.get()); |
| if (actual == null) { |
| throw new RuntimeException("unexpected null output on input=" + w); |
| } |
| if (!actual.equals(expected)) { |
| throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); |
| } |
| } else { |
| // Get by output |
| final Long output = (Long) getOutput(intsRef.get(), ord); |
| @SuppressWarnings("unchecked") final IntsRef actual = Util.getByOutput((FST<Long>) fst, output.longValue()); |
| if (actual == null) { |
| throw new RuntimeException("unexpected null input from output=" + output); |
| } |
| if (!actual.equals(intsRef)) { |
| throw new RuntimeException("wrong input (got " + actual + " but expected " + intsRef + " from output=" + output); |
| } |
| } |
| |
| ord++; |
| if (ord % 500000 == 0) { |
| System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); |
| } |
| if (ord >= limit) { |
| break; |
| } |
| } |
| |
| double totSec = ((System.currentTimeMillis() - tStart)/1000.0); |
| System.out.println("Verify " + (iter == 1 ? "(by output) " : "") + "took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)"); |
| |
| if (!verifyByOutput) { |
| break; |
| } |
| } |
| |
| // NOTE: comment out to profile lookup... |
| break; |
| } |
| |
| } finally { |
| is.close(); |
| } |
| } |
| } |
| |
| // TODO: try experiment: reverse terms before |
| // compressing -- how much smaller? |
| |
| // TODO: can FST be used to index all internal substrings, |
| // mapping to term? |
| |
| // java -cp ../build/codecs/classes/java:../test-framework/lib/randomizedtesting-runner-*.jar:../build/core/classes/test:../build/core/classes/test-framework:../build/core/classes/java:../build/test-framework/classes/java:../test-framework/lib/junit-4.10.jar org.apache.lucene.util.fst.TestFSTs /xold/tmp/allTerms3.txt out |
| public static void main(String[] args) throws IOException { |
| int prune = 0; |
| int limit = Integer.MAX_VALUE; |
| int inputMode = 0; // utf8 |
| boolean storeOrds = false; |
| boolean storeDocFreqs = false; |
| boolean verify = true; |
| boolean noArcArrays = false; |
| Path wordsFileIn = null; |
| Path dirOut = null; |
| |
| int idx = 0; |
| while (idx < args.length) { |
| if (args[idx].equals("-prune")) { |
| prune = Integer.parseInt(args[1 + idx]); |
| idx++; |
| } else if (args[idx].equals("-limit")) { |
| limit = Integer.parseInt(args[1 + idx]); |
| idx++; |
| } else if (args[idx].equals("-utf8")) { |
| inputMode = 0; |
| } else if (args[idx].equals("-utf32")) { |
| inputMode = 1; |
| } else if (args[idx].equals("-docFreq")) { |
| storeDocFreqs = true; |
| } else if (args[idx].equals("-noArcArrays")) { |
| noArcArrays = true; |
| } else if (args[idx].equals("-ords")) { |
| storeOrds = true; |
| } else if (args[idx].equals("-noverify")) { |
| verify = false; |
| } else if (args[idx].startsWith("-")) { |
| System.err.println("Unrecognized option: " + args[idx]); |
| System.exit(-1); |
| } else { |
| if (wordsFileIn == null) { |
| wordsFileIn = Paths.get(args[idx]); |
| } else if (dirOut == null) { |
| dirOut = Paths.get(args[idx]); |
| } else { |
| System.err.println("Too many arguments, expected: input [output]"); |
| System.exit(-1); |
| } |
| } |
| idx++; |
| } |
| |
| if (wordsFileIn == null) { |
| System.err.println("No input file."); |
| System.exit(-1); |
| } |
| |
| // ord benefits from share, docFreqs don't: |
| |
| if (storeOrds && storeDocFreqs) { |
| // Store both ord & docFreq: |
| final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(); |
| final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(); |
| final PairOutputs<Long,Long> outputs = new PairOutputs<>(o1, o2); |
| new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { |
| Random rand; |
| @Override |
| public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) { |
| if (ord == 0) { |
| rand = new Random(17); |
| } |
| return outputs.newPair((long) ord, |
| (long) TestUtil.nextInt(rand, 1, 5000)); |
| } |
| }.run(limit, verify, false); |
| } else if (storeOrds) { |
| // Store only ords |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { |
| @Override |
| public Long getOutput(IntsRef input, int ord) { |
| return (long) ord; |
| } |
| }.run(limit, verify, true); |
| } else if (storeDocFreqs) { |
| // Store only docFreq |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { |
| Random rand; |
| @Override |
| public Long getOutput(IntsRef input, int ord) { |
| if (ord == 0) { |
| rand = new Random(17); |
| } |
| return (long) TestUtil.nextInt(rand, 1, 5000); |
| } |
| }.run(limit, verify, false); |
| } else { |
| // Store nothing |
| final NoOutputs outputs = NoOutputs.getSingleton(); |
| final Object NO_OUTPUT = outputs.getNoOutput(); |
| new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { |
| @Override |
| public Object getOutput(IntsRef input, int ord) { |
| return NO_OUTPUT; |
| } |
| }.run(limit, verify, false); |
| } |
| } |
| |
| public void testSingleString() throws Exception { |
| final Outputs<Object> outputs = NoOutputs.getSingleton(); |
| final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput()); |
| final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(b.finish()); |
| assertNull(fstEnum.seekFloor(new BytesRef("foo"))); |
| assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); |
| } |
| |
| |
| public void testDuplicateFSAString() throws Exception { |
| String str = "foobar"; |
| final Outputs<Object> outputs = NoOutputs.getSingleton(); |
| final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| IntsRefBuilder ints = new IntsRefBuilder(); |
| for(int i=0; i<10; i++) { |
| b.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput()); |
| } |
| FST<Object> fst = b.finish(); |
| |
| // count the input paths |
| int count = 0; |
| final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(fst); |
| while(fstEnum.next()!=null) { |
| count++; |
| } |
| assertEquals(1, count); |
| |
| assertNotNull(Util.get(fst, new BytesRef(str))); |
| assertNull(Util.get(fst, new BytesRef("foobaz"))); |
| } |
| |
| /* |
| public void testTrivial() throws Exception { |
| |
| // Get outputs -- passing true means FST will share |
| // (delta code) the outputs. This should result in |
| // smaller FST if the outputs grow monotonically. But |
| // if numbers are "random", false should give smaller |
| // final size: |
| final NoOutputs outputs = NoOutputs.getSingleton(); |
| |
| String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"}; |
| |
| final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1, |
| 0, 0, |
| true, |
| true, |
| Integer.MAX_VALUE, |
| outputs, |
| null, |
| true); |
| Arrays.sort(strings); |
| final IntsRef scratch = new IntsRef(); |
| for(String s : strings) { |
| builder.add(Util.toIntsRef(new BytesRef(s), scratch), outputs.getNoOutput()); |
| } |
| final FST<Object> fst = builder.finish(); |
| System.out.println("DOT before rewrite"); |
| Writer w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/before.dot")); |
| Util.toDot(fst, w, false, false); |
| w.close(); |
| |
| final FST<Object> rewrite = new FST<Object>(fst, 1, 100); |
| |
| System.out.println("DOT after rewrite"); |
| w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/after.dot")); |
| Util.toDot(rewrite, w, false, false); |
| w.close(); |
| } |
| */ |
| |
| public void testSimple() throws Exception { |
| |
| // Get outputs -- passing true means FST will share |
| // (delta code) the outputs. This should result in |
| // smaller FST if the outputs grow monotonically. But |
| // if numbers are "random", false should give smaller |
| // final size: |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| |
| // Build an FST mapping BytesRef -> Long |
| final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| final BytesRef a = new BytesRef("a"); |
| final BytesRef b = new BytesRef("b"); |
| final BytesRef c = new BytesRef("c"); |
| |
| builder.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L); |
| builder.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L); |
| builder.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L); |
| |
| final FST<Long> fst = builder.finish(); |
| |
| assertEquals(13824324872317238L, (long) Util.get(fst, c)); |
| assertEquals(42, (long) Util.get(fst, b)); |
| assertEquals(17, (long) Util.get(fst, a)); |
| |
| BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); |
| BytesRefFSTEnum.InputOutput<Long> seekResult; |
| seekResult = fstEnum.seekFloor(a); |
| assertNotNull(seekResult); |
| assertEquals(17, (long) seekResult.output); |
| |
| // goes to a |
| seekResult = fstEnum.seekFloor(new BytesRef("aa")); |
| assertNotNull(seekResult); |
| assertEquals(17, (long) seekResult.output); |
| |
| // goes to b |
| seekResult = fstEnum.seekCeil(new BytesRef("aa")); |
| assertNotNull(seekResult); |
| assertEquals(b, seekResult.input); |
| assertEquals(42, (long) seekResult.output); |
| |
| assertEquals(Util.toIntsRef(new BytesRef("c"), new IntsRefBuilder()), |
| Util.getByOutput(fst, 13824324872317238L)); |
| assertNull(Util.getByOutput(fst, 47)); |
| assertEquals(Util.toIntsRef(new BytesRef("b"), new IntsRefBuilder()), |
| Util.getByOutput(fst, 42)); |
| assertEquals(Util.toIntsRef(new BytesRef("a"), new IntsRefBuilder()), |
| Util.getByOutput(fst, 17)); |
| } |
| |
| public void testPrimaryKeys() throws Exception { |
| Directory dir = newDirectory(); |
| |
| for(int cycle=0;cycle<2;cycle++) { |
| if (VERBOSE) { |
| System.out.println("TEST: cycle=" + cycle); |
| } |
| RandomIndexWriter w = new RandomIndexWriter(random(), dir, |
| newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE)); |
| Document doc = new Document(); |
| Field idField = newStringField("id", "", Field.Store.NO); |
| doc.add(idField); |
| |
| final int NUM_IDS = atLeast(200); |
| //final int NUM_IDS = (int) (377 * (1.0+random.nextDouble())); |
| if (VERBOSE) { |
| System.out.println("TEST: NUM_IDS=" + NUM_IDS); |
| } |
| final Set<String> allIDs = new HashSet<>(); |
| for(int id=0;id<NUM_IDS;id++) { |
| String idString; |
| if (cycle == 0) { |
| // PKs are assigned sequentially |
| idString = String.format(Locale.ROOT, "%07d", id); |
| } else { |
| while(true) { |
| final String s = Long.toString(random().nextLong()); |
| if (!allIDs.contains(s)) { |
| idString = s; |
| break; |
| } |
| } |
| } |
| allIDs.add(idString); |
| idField.setStringValue(idString); |
| w.addDocument(doc); |
| } |
| |
| //w.forceMerge(1); |
| |
| // turn writer into reader: |
| final IndexReader r = w.getReader(); |
| final IndexSearcher s = newSearcher(r); |
| w.close(); |
| |
| final List<String> allIDsList = new ArrayList<>(allIDs); |
| final List<String> sortedAllIDsList = new ArrayList<>(allIDsList); |
| Collections.sort(sortedAllIDsList); |
| |
| // Sprinkle in some non-existent PKs: |
| Set<String> outOfBounds = new HashSet<>(); |
| for(int idx=0;idx<NUM_IDS/10;idx++) { |
| String idString; |
| if (cycle == 0) { |
| idString = String.format(Locale.ROOT, "%07d", (NUM_IDS + idx)); |
| } else { |
| while(true) { |
| idString = Long.toString(random().nextLong()); |
| if (!allIDs.contains(idString)) { |
| break; |
| } |
| } |
| } |
| outOfBounds.add(idString); |
| allIDsList.add(idString); |
| } |
| |
| // Verify w/ TermQuery |
| for(int iter=0;iter<2*NUM_IDS;iter++) { |
| final String id = allIDsList.get(random().nextInt(allIDsList.size())); |
| final boolean exists = !outOfBounds.contains(id); |
| if (VERBOSE) { |
| System.out.println("TEST: TermQuery " + (exists ? "" : "non-exist ") + " id=" + id); |
| } |
| assertEquals((exists ? "" : "non-exist ") + "id=" + id, exists ? 1 : 0, s.count(new TermQuery(new Term("id", id)))); |
| } |
| |
| // Verify w/ MultiTermsEnum |
| final TermsEnum termsEnum = MultiTerms.getTerms(r, "id").iterator(); |
| for(int iter=0;iter<2*NUM_IDS;iter++) { |
| final String id; |
| final String nextID; |
| final boolean exists; |
| |
| if (random().nextBoolean()) { |
| id = allIDsList.get(random().nextInt(allIDsList.size())); |
| exists = !outOfBounds.contains(id); |
| nextID = null; |
| if (VERBOSE) { |
| System.out.println("TEST: exactOnly " + (exists ? "" : "non-exist ") + "id=" + id); |
| } |
| } else { |
| // Pick ID between two IDs: |
| exists = false; |
| final int idv = random().nextInt(NUM_IDS-1); |
| if (cycle == 0) { |
| id = String.format(Locale.ROOT, "%07da", idv); |
| nextID = String.format(Locale.ROOT, "%07d", idv+1); |
| } else { |
| id = sortedAllIDsList.get(idv) + "a"; |
| nextID = sortedAllIDsList.get(idv+1); |
| } |
| if (VERBOSE) { |
| System.out.println("TEST: not exactOnly id=" + id + " nextID=" + nextID); |
| } |
| } |
| |
| final TermsEnum.SeekStatus status; |
| if (nextID == null) { |
| if (termsEnum.seekExact(new BytesRef(id))) { |
| status = TermsEnum.SeekStatus.FOUND; |
| } else { |
| status = TermsEnum.SeekStatus.NOT_FOUND; |
| } |
| } else { |
| status = termsEnum.seekCeil(new BytesRef(id)); |
| } |
| |
| if (nextID != null) { |
| assertEquals(TermsEnum.SeekStatus.NOT_FOUND, status); |
| assertEquals("expected=" + nextID + " actual=" + termsEnum.term().utf8ToString(), new BytesRef(nextID), termsEnum.term()); |
| } else if (!exists) { |
| assertTrue(status == TermsEnum.SeekStatus.NOT_FOUND || |
| status == TermsEnum.SeekStatus.END); |
| } else { |
| assertEquals(TermsEnum.SeekStatus.FOUND, status); |
| } |
| } |
| |
| r.close(); |
| } |
| dir.close(); |
| } |
| |
| public void testRandomTermLookup() throws Exception { |
| Directory dir = newDirectory(); |
| |
| RandomIndexWriter w = new RandomIndexWriter(random(), dir, |
| newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE)); |
| Document doc = new Document(); |
| Field f = newStringField("field", "", Field.Store.NO); |
| doc.add(f); |
| |
| final int NUM_TERMS = (int) (1000*RANDOM_MULTIPLIER * (1+random().nextDouble())); |
| if (VERBOSE) { |
| System.out.println("TEST: NUM_TERMS=" + NUM_TERMS); |
| } |
| |
| final Set<String> allTerms = new HashSet<>(); |
| while(allTerms.size() < NUM_TERMS) { |
| allTerms.add(simpleRandomString(random())); |
| } |
| |
| for(String term : allTerms) { |
| f.setStringValue(term); |
| w.addDocument(doc); |
| } |
| |
| // turn writer into reader: |
| if (VERBOSE) { |
| System.out.println("TEST: get reader"); |
| } |
| IndexReader r = w.getReader(); |
| if (VERBOSE) { |
| System.out.println("TEST: got reader=" + r); |
| } |
| IndexSearcher s = newSearcher(r); |
| w.close(); |
| |
| final List<String> allTermsList = new ArrayList<>(allTerms); |
| Collections.shuffle(allTermsList, random()); |
| |
| // verify exact lookup |
| for(String term : allTermsList) { |
| if (VERBOSE) { |
| System.out.println("TEST: term=" + term); |
| } |
| assertEquals("term=" + term, 1, s.count(new TermQuery(new Term("field", term)))); |
| } |
| |
| r.close(); |
| dir.close(); |
| } |
| |
| |
| /** |
| * Test state expansion (array format) on close-to-root states. Creates |
| * synthetic input that has one expanded state on each level. |
| * |
| * @see <a href="https://issues.apache.org/jira/browse/LUCENE-2933">LUCENE-2933</a> |
| */ |
| public void testExpandedCloseToRoot() throws Exception { |
| class SyntheticData { |
| FST<Object> compile(String[] lines) throws IOException { |
| final NoOutputs outputs = NoOutputs.getSingleton(); |
| final Object nothing = outputs.getNoOutput(); |
| final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| int line = 0; |
| final BytesRefBuilder term = new BytesRefBuilder(); |
| final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); |
| while (line < lines.length) { |
| String w = lines[line++]; |
| if (w == null) { |
| break; |
| } |
| term.copyChars(w); |
| b.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing); |
| } |
| |
| return b.finish(); |
| } |
| |
| void generate(ArrayList<String> out, StringBuilder b, char from, char to, |
| int depth) { |
| if (depth == 0 || from == to) { |
| String seq = b.toString() + "_" + out.size() + "_end"; |
| out.add(seq); |
| } else { |
| for (char c = from; c <= to; c++) { |
| b.append(c); |
| generate(out, b, from, c == to ? to : from, depth - 1); |
| b.deleteCharAt(b.length() - 1); |
| } |
| } |
| } |
| |
| public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth) |
| throws IOException { |
| if (FST.targetHasArcs(arc)) { |
| int childCount = 0; |
| BytesReader fstReader = fst.getBytesReader(); |
| for (arc = fst.readFirstTargetArc(arc, arc, fstReader);; |
| arc = fst.readNextArc(arc, fstReader), childCount++) |
| { |
| boolean expanded = fst.isExpandedTarget(arc, fstReader); |
| int children = verifyStateAndBelow(fst, new FST.Arc<>().copyFrom(arc), depth + 1); |
| |
| assertEquals( |
| (depth <= FST.FIXED_LENGTH_ARC_SHALLOW_DEPTH && |
| children >= FST.FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) || |
| children >= FST.FIXED_LENGTH_ARC_DEEP_NUM_ARCS, |
| expanded); |
| if (arc.isLast()) break; |
| } |
| |
| return childCount; |
| } |
| return 0; |
| } |
| } |
| |
| // Sanity check. |
| assertTrue(FST.FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS < FST.FIXED_LENGTH_ARC_DEEP_NUM_ARCS); |
| assertTrue(FST.FIXED_LENGTH_ARC_SHALLOW_DEPTH >= 0); |
| |
| SyntheticData s = new SyntheticData(); |
| |
| ArrayList<String> out = new ArrayList<>(); |
| StringBuilder b = new StringBuilder(); |
| s.generate(out, b, 'a', 'i', 10); |
| String[] input = out.toArray(new String[out.size()]); |
| Arrays.sort(input); |
| FST<Object> fst = s.compile(input); |
| FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<>()); |
| s.verifyStateAndBelow(fst, arc, 1); |
| } |
| |
| public void testFinalOutputOnEndState() throws Exception { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| |
| final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); |
| builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L); |
| builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L); |
| final FST<Long> fst = builder.finish(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); |
| StringWriter w = new StringWriter(); |
| Util.toDot(fst, w, false, false); |
| w.close(); |
| //System.out.println(w.toString()); |
| assertTrue(w.toString().indexOf("label=\"t/[7]\"") != -1); |
| } |
| |
| public void testInternalFinalState() throws Exception { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); |
| builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput()); |
| builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput()); |
| final FST<Long> fst = builder.finish(); |
| StringWriter w = new StringWriter(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); |
| Util.toDot(fst, w, false, false); |
| w.close(); |
| //System.out.println(w.toString()); |
| |
| // check for accept state at label t |
| assertTrue(w.toString().indexOf("[label=\"t\" style=\"bold\"") != -1); |
| // check for accept state at label n |
| assertTrue(w.toString().indexOf("[label=\"n\" style=\"bold\"") != -1); |
| } |
| |
| // Make sure raw FST can differentiate between final vs |
| // non-final end nodes |
| public void testNonFinalStopNode() throws Exception { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| final Long nothing = outputs.getNoOutput(); |
| final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| //final FST<Long> fst = new FST<>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, 15); |
| final FST<Long> fst = b.fst; |
| |
| final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<>(b, 0); |
| |
| // Add final stop node |
| { |
| final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0); |
| node.isFinal = true; |
| rootNode.addArc('a', node); |
| final Builder.CompiledNode frozen = new Builder.CompiledNode(); |
| frozen.node = fst.addNode(b, node); |
| rootNode.arcs[0].nextFinalOutput = 17L; |
| rootNode.arcs[0].isFinal = true; |
| rootNode.arcs[0].output = nothing; |
| rootNode.arcs[0].target = frozen; |
| } |
| |
| // Add non-final stop node |
| { |
| final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0); |
| rootNode.addArc('b', node); |
| final Builder.CompiledNode frozen = new Builder.CompiledNode(); |
| frozen.node = fst.addNode(b, node); |
| rootNode.arcs[1].nextFinalOutput = nothing; |
| rootNode.arcs[1].output = 42L; |
| rootNode.arcs[1].target = frozen; |
| } |
| |
| fst.finish(fst.addNode(b, rootNode)); |
| |
| StringWriter w = new StringWriter(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); |
| Util.toDot(fst, w, false, false); |
| w.close(); |
| |
| checkStopNodes(fst, outputs); |
| |
| // Make sure it still works after save/load: |
| Directory dir = newDirectory(); |
| IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT); |
| fst.save(out, out); |
| out.close(); |
| |
| IndexInput in = dir.openInput("fst", IOContext.DEFAULT); |
| final FST<Long> fst2 = new FST<>(in, in, outputs); |
| checkStopNodes(fst2, outputs); |
| in.close(); |
| dir.close(); |
| } |
| |
| private void checkStopNodes(FST<Long> fst, PositiveIntOutputs outputs) throws Exception { |
| final Long nothing = outputs.getNoOutput(); |
| FST.Arc<Long> startArc = fst.getFirstArc(new FST.Arc<Long>()); |
| assertEquals(nothing, startArc.output()); |
| assertEquals(nothing, startArc.nextFinalOutput()); |
| |
| FST.Arc<Long> arc = fst.readFirstTargetArc(startArc, new FST.Arc<Long>(), |
| fst.getBytesReader()); |
| assertEquals('a', arc.label()); |
| assertEquals(17, arc.nextFinalOutput().longValue()); |
| assertTrue(arc.isFinal()); |
| |
| arc = fst.readNextArc(arc, fst.getBytesReader()); |
| assertEquals('b', arc.label()); |
| assertFalse(arc.isFinal()); |
| assertEquals(42, arc.output().longValue()); |
| } |
| |
| static final Comparator<Long> minLongComparator = new Comparator<Long> () { |
| @Override |
| public int compare(Long left, Long right) { |
| return left.compareTo(right); |
| } |
| }; |
| |
| public void testShortestPaths() throws Exception { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| final IntsRefBuilder scratch = new IntsRefBuilder(); |
| builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L); |
| builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L); |
| builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L); |
| final FST<Long> fst = builder.finish(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); |
| //Util.toDot(fst, w, false, false); |
| //w.close(); |
| |
| Util.TopResults<Long> res = Util.shortestPaths(fst, |
| fst.getFirstArc(new FST.Arc<Long>()), |
| outputs.getNoOutput(), |
| minLongComparator, |
| 3, |
| true); |
| assertTrue(res.isComplete); |
| assertEquals(3, res.topN.size()); |
| assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), res.topN.get(0).input); |
| assertEquals(7L, res.topN.get(0).output.longValue()); |
| |
| assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), res.topN.get(1).input); |
| assertEquals(17L,res.topN.get(1).output.longValue()); |
| |
| assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), res.topN.get(2).input); |
| assertEquals(22L, res.topN.get(2).output.longValue()); |
| } |
| |
| public void testRejectNoLimits() throws IOException { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| final IntsRefBuilder scratch = new IntsRefBuilder(); |
| builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L); |
| builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L); |
| builder.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L); |
| builder.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L); |
| |
| builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L); |
| final FST<Long> fst = builder.finish(); |
| final AtomicInteger rejectCount = new AtomicInteger(); |
| Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, 2, 6, minLongComparator) { |
| @Override |
| protected boolean acceptResult(IntsRef input, Long output) { |
| boolean accept = output.intValue() == 7; |
| if (!accept) { |
| rejectCount.incrementAndGet(); |
| } |
| return accept; |
| } |
| }; |
| |
| searcher.addStartPaths(fst.getFirstArc(new FST.Arc<Long>()), outputs.getNoOutput(), true, new IntsRefBuilder()); |
| Util.TopResults<Long> res = searcher.search(); |
| assertEquals(rejectCount.get(), 4); |
| assertTrue(res.isComplete); // rejected(4) + topN(2) <= maxQueueSize(6) |
| |
| assertEquals(1, res.topN.size()); |
| assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), res.topN.get(0).input); |
| assertEquals(7L, res.topN.get(0).output.longValue()); |
| rejectCount.set(0); |
| searcher = new Util.TopNSearcher<Long>(fst, 2, 5, minLongComparator) { |
| @Override |
| protected boolean acceptResult(IntsRef input, Long output) { |
| boolean accept = output.intValue() == 7; |
| if (!accept) { |
| rejectCount.incrementAndGet(); |
| } |
| return accept; |
| } |
| }; |
| |
| searcher.addStartPaths(fst.getFirstArc(new FST.Arc<Long>()), outputs.getNoOutput(), true, new IntsRefBuilder()); |
| res = searcher.search(); |
| assertEquals(rejectCount.get(), 4); |
| assertFalse(res.isComplete); // rejected(4) + topN(2) > maxQueueSize(5) |
| } |
| |
| // compares just the weight side of the pair |
| static final Comparator<Pair<Long,Long>> minPairWeightComparator = new Comparator<Pair<Long,Long>> () { |
| @Override |
| public int compare(Pair<Long,Long> left, Pair<Long,Long> right) { |
| return left.output1.compareTo(right.output1); |
| } |
| }; |
| |
| /** like testShortestPaths, but uses pairoutputs so we have both a weight and an output */ |
| public void testShortestPathsWFST() throws Exception { |
| |
| PairOutputs<Long,Long> outputs = new PairOutputs<>( |
| PositiveIntOutputs.getSingleton(), // weight |
| PositiveIntOutputs.getSingleton() // output |
| ); |
| |
| final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| final IntsRefBuilder scratch = new IntsRefBuilder(); |
| builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L)); |
| builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L)); |
| builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L)); |
| final FST<Pair<Long,Long>> fst = builder.finish(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); |
| //Util.toDot(fst, w, false, false); |
| //w.close(); |
| |
| Util.TopResults<Pair<Long,Long>> res = Util.shortestPaths(fst, |
| fst.getFirstArc(new FST.Arc<Pair<Long,Long>>()), |
| outputs.getNoOutput(), |
| minPairWeightComparator, |
| 3, |
| true); |
| assertTrue(res.isComplete); |
| assertEquals(3, res.topN.size()); |
| |
| assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), res.topN.get(0).input); |
| assertEquals(7L, res.topN.get(0).output.output1.longValue()); // weight |
| assertEquals(36L, res.topN.get(0).output.output2.longValue()); // output |
| |
| assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), res.topN.get(1).input); |
| assertEquals(17L, res.topN.get(1).output.output1.longValue()); // weight |
| assertEquals(85L, res.topN.get(1).output.output2.longValue()); // output |
| |
| assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), res.topN.get(2).input); |
| assertEquals(22L, res.topN.get(2).output.output1.longValue()); // weight |
| assertEquals(57L, res.topN.get(2).output.output2.longValue()); // output |
| } |
| |
| public void testShortestPathsRandom() throws Exception { |
| final Random random = random(); |
| int numWords = atLeast(1000); |
| |
| final TreeMap<String,Long> slowCompletor = new TreeMap<>(); |
| final TreeSet<String> allPrefixes = new TreeSet<>(); |
| |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| final IntsRefBuilder scratch = new IntsRefBuilder(); |
| |
| for (int i = 0; i < numWords; i++) { |
| String s; |
| while (true) { |
| s = TestUtil.randomSimpleString(random); |
| if (!slowCompletor.containsKey(s)) { |
| break; |
| } |
| } |
| |
| for (int j = 1; j < s.length(); j++) { |
| allPrefixes.add(s.substring(0, j)); |
| } |
| int weight = TestUtil.nextInt(random, 1, 100); // weights 1..100 |
| slowCompletor.put(s, (long)weight); |
| } |
| |
| for (Map.Entry<String,Long> e : slowCompletor.entrySet()) { |
| //System.out.println("add: " + e); |
| builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue()); |
| } |
| |
| final FST<Long> fst = builder.finish(); |
| //System.out.println("SAVE out.dot"); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); |
| //Util.toDot(fst, w, false, false); |
| //w.close(); |
| |
| BytesReader reader = fst.getBytesReader(); |
| |
| //System.out.println("testing: " + allPrefixes.size() + " prefixes"); |
| for (String prefix : allPrefixes) { |
| // 1. run prefix against fst, then complete by value |
| //System.out.println("TEST: " + prefix); |
| |
| long prefixOutput = 0; |
| FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>()); |
| for(int idx=0;idx<prefix.length();idx++) { |
| if (fst.findTargetArc((int) prefix.charAt(idx), arc, arc, reader) == null) { |
| fail(); |
| } |
| prefixOutput += arc.output(); |
| } |
| |
| final int topN = TestUtil.nextInt(random, 1, 10); |
| |
| Util.TopResults<Long> r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minLongComparator, topN, true); |
| assertTrue(r.isComplete); |
| |
| // 2. go thru whole treemap (slowCompletor) and check it's actually the best suggestion |
| final List<Result<Long>> matches = new ArrayList<>(); |
| |
| // TODO: could be faster... but it's slowCompletor for a reason |
| for (Map.Entry<String,Long> e : slowCompletor.entrySet()) { |
| if (e.getKey().startsWith(prefix)) { |
| //System.out.println(" consider " + e.getKey()); |
| matches.add(new Result<>(Util.toIntsRef(new BytesRef(e.getKey().substring(prefix.length())), new IntsRefBuilder()), |
| e.getValue() - prefixOutput)); |
| } |
| } |
| |
| assertTrue(matches.size() > 0); |
| Collections.sort(matches, new TieBreakByInputComparator<>(minLongComparator)); |
| if (matches.size() > topN) { |
| matches.subList(topN, matches.size()).clear(); |
| } |
| |
| assertEquals(matches.size(), r.topN.size()); |
| |
| for(int hit=0;hit<r.topN.size();hit++) { |
| //System.out.println(" check hit " + hit); |
| assertEquals(matches.get(hit).input, r.topN.get(hit).input); |
| assertEquals(matches.get(hit).output, r.topN.get(hit).output); |
| } |
| } |
| } |
| |
| private static class TieBreakByInputComparator<T> implements Comparator<Result<T>> { |
| private final Comparator<T> comparator; |
| public TieBreakByInputComparator(Comparator<T> comparator) { |
| this.comparator = comparator; |
| } |
| |
| @Override |
| public int compare(Result<T> a, Result<T> b) { |
| int cmp = comparator.compare(a.output, b.output); |
| if (cmp == 0) { |
| return a.input.compareTo(b.input); |
| } else { |
| return cmp; |
| } |
| } |
| } |
| |
| // used by slowcompletor |
| static class TwoLongs { |
| long a; |
| long b; |
| |
| TwoLongs(long a, long b) { |
| this.a = a; |
| this.b = b; |
| } |
| } |
| |
| /** like testShortestPathsRandom, but uses pairoutputs so we have both a weight and an output */ |
| public void testShortestPathsWFSTRandom() throws Exception { |
| int numWords = atLeast(1000); |
| |
| final TreeMap<String,TwoLongs> slowCompletor = new TreeMap<>(); |
| final TreeSet<String> allPrefixes = new TreeSet<>(); |
| |
| PairOutputs<Long,Long> outputs = new PairOutputs<>( |
| PositiveIntOutputs.getSingleton(), // weight |
| PositiveIntOutputs.getSingleton() // output |
| ); |
| final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| final IntsRefBuilder scratch = new IntsRefBuilder(); |
| |
| Random random = random(); |
| for (int i = 0; i < numWords; i++) { |
| String s; |
| while (true) { |
| s = TestUtil.randomSimpleString(random); |
| if (!slowCompletor.containsKey(s)) { |
| break; |
| } |
| } |
| |
| for (int j = 1; j < s.length(); j++) { |
| allPrefixes.add(s.substring(0, j)); |
| } |
| int weight = TestUtil.nextInt(random, 1, 100); // weights 1..100 |
| int output = TestUtil.nextInt(random, 0, 500); // outputs 0..500 |
| slowCompletor.put(s, new TwoLongs(weight, output)); |
| } |
| |
| for (Map.Entry<String,TwoLongs> e : slowCompletor.entrySet()) { |
| //System.out.println("add: " + e); |
| long weight = e.getValue().a; |
| long output = e.getValue().b; |
| builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output)); |
| } |
| |
| final FST<Pair<Long,Long>> fst = builder.finish(); |
| //System.out.println("SAVE out.dot"); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); |
| //Util.toDot(fst, w, false, false); |
| //w.close(); |
| |
| BytesReader reader = fst.getBytesReader(); |
| |
| //System.out.println("testing: " + allPrefixes.size() + " prefixes"); |
| for (String prefix : allPrefixes) { |
| // 1. run prefix against fst, then complete by value |
| //System.out.println("TEST: " + prefix); |
| |
| Pair<Long,Long> prefixOutput = outputs.getNoOutput(); |
| FST.Arc<Pair<Long,Long>> arc = fst.getFirstArc(new FST.Arc<Pair<Long,Long>>()); |
| for(int idx=0;idx<prefix.length();idx++) { |
| if (fst.findTargetArc((int) prefix.charAt(idx), arc, arc, reader) == null) { |
| fail(); |
| } |
| prefixOutput = outputs.add(prefixOutput, arc.output()); |
| } |
| |
| final int topN = TestUtil.nextInt(random, 1, 10); |
| |
| Util.TopResults<Pair<Long,Long>> r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minPairWeightComparator, topN, true); |
| assertTrue(r.isComplete); |
| // 2. go thru whole treemap (slowCompletor) and check it's actually the best suggestion |
| final List<Result<Pair<Long,Long>>> matches = new ArrayList<>(); |
| |
| // TODO: could be faster... but it's slowCompletor for a reason |
| for (Map.Entry<String,TwoLongs> e : slowCompletor.entrySet()) { |
| if (e.getKey().startsWith(prefix)) { |
| //System.out.println(" consider " + e.getKey()); |
| matches.add(new Result<>(Util.toIntsRef(new BytesRef(e.getKey().substring(prefix.length())), new IntsRefBuilder()), |
| outputs.newPair(e.getValue().a - prefixOutput.output1, e.getValue().b - prefixOutput.output2))); |
| } |
| } |
| |
| assertTrue(matches.size() > 0); |
| Collections.sort(matches, new TieBreakByInputComparator<>(minPairWeightComparator)); |
| if (matches.size() > topN) { |
| matches.subList(topN, matches.size()).clear(); |
| } |
| |
| assertEquals(matches.size(), r.topN.size()); |
| |
| for(int hit=0;hit<r.topN.size();hit++) { |
| //System.out.println(" check hit " + hit); |
| assertEquals(matches.get(hit).input, r.topN.get(hit).input); |
| assertEquals(matches.get(hit).output, r.topN.get(hit).output); |
| } |
| } |
| } |
| |
| public void testLargeOutputsOnArrayArcs() throws Exception { |
| final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); |
| final Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| final byte[] bytes = new byte[300]; |
| final IntsRefBuilder input = new IntsRefBuilder(); |
| input.append(0); |
| final BytesRef output = new BytesRef(bytes); |
| for(int arc=0;arc<6;arc++) { |
| input.setIntAt(0, arc); |
| output.bytes[0] = (byte) arc; |
| builder.add(input.get(), BytesRef.deepCopyOf(output)); |
| } |
| |
| final FST<BytesRef> fst = builder.finish(); |
| for(int arc=0;arc<6;arc++) { |
| input.setIntAt(0, arc); |
| final BytesRef result = Util.get(fst, input.get()); |
| assertNotNull(result); |
| assertEquals(300, result.length); |
| assertEquals(result.bytes[result.offset], arc); |
| for(int byteIDX=1;byteIDX<result.length;byteIDX++) { |
| assertEquals(0, result.bytes[result.offset+byteIDX]); |
| } |
| } |
| } |
| |
| public void testIllegallyModifyRootArc() throws Exception { |
| assumeTrue("test relies on assertions", assertsAreEnabled); |
| |
| Set<BytesRef> terms = new HashSet<>(); |
| for(int i=0;i<100;i++) { |
| String prefix = Character.toString((char) ('a' + i)); |
| terms.add(new BytesRef(prefix)); |
| if (prefix.equals("m") == false) { |
| for(int j=0;j<20;j++) { |
| // Make a big enough FST that the root cache will be created: |
| String suffix = TestUtil.randomRealisticUnicodeString(random(), 10, 20); |
| terms.add(new BytesRef(prefix + suffix)); |
| } |
| } |
| } |
| |
| List<BytesRef> termsList = new ArrayList<>(terms); |
| Collections.sort(termsList); |
| |
| ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); |
| Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| IntsRefBuilder input = new IntsRefBuilder(); |
| for(BytesRef term : termsList) { |
| Util.toIntsRef(term, input); |
| builder.add(input.get(), term); |
| } |
| |
| FST<BytesRef> fst = builder.finish(); |
| |
| Arc<BytesRef> arc = new FST.Arc<>(); |
| fst.getFirstArc(arc); |
| FST.BytesReader reader = fst.getBytesReader(); |
| arc = fst.findTargetArc((int) 'm', arc, arc, reader); |
| assertNotNull(arc); |
| assertEquals(new BytesRef("m"), arc.output()); |
| |
| // NOTE: illegal: |
| arc.output().length = 0; |
| |
| fst.getFirstArc(arc); |
| try { |
| arc = fst.findTargetArc((int) 'm', arc, arc, reader); |
| } catch (AssertionError ae) { |
| // expected |
| } |
| } |
| |
| public void testSimpleDepth() throws Exception { |
| PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| BytesRef ab = new BytesRef("ab"); |
| BytesRef ac = new BytesRef("ac"); |
| BytesRef bd = new BytesRef("bd"); |
| |
| builder.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L); |
| builder.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L); |
| builder.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L); |
| |
| FST<Long> fst = builder.finish(); |
| |
| assertEquals(3, (long) Util.get(fst, ab)); |
| assertEquals(5, (long) Util.get(fst, ac)); |
| assertEquals(7, (long) Util.get(fst, bd)); |
| } |
| } |