| package org.apache.lucene.util.fst; |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.BufferedReader; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.FileOutputStream; |
| import java.io.IOException; |
| import java.io.InputStreamReader; |
| import java.io.OutputStreamWriter; |
| import java.io.Writer; |
| import java.util.*; |
| |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.TermEnum; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.FSDirectory; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.store.MockDirectoryWrapper; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.LineFileDocs; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.UnicodeUtil; |
| import org.apache.lucene.util._TestUtil; |
| import org.apache.lucene.util.fst.FST.Arc; |
| |
| public class TestFSTs extends LuceneTestCase { |
| |
| private MockDirectoryWrapper dir; |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| dir = newDirectory(); |
| dir.setPreventDoubleWrite(false); |
| } |
| |
| @Override |
| public void tearDown() throws Exception { |
| dir.close(); |
| super.tearDown(); |
| } |
| |
| private static BytesRef toBytesRef(IntsRef ir) { |
| BytesRef br = new BytesRef(ir.length); |
| for(int i=0;i<ir.length;i++) { |
| int x = ir.ints[ir.offset+i]; |
| assert x >= 0 && x <= 255; |
| br.bytes[i] = (byte) x; |
| } |
| br.length = ir.length; |
| return br; |
| } |
| |
| private static IntsRef toIntsRef(String s, int inputMode) { |
| return toIntsRef(s, inputMode, new IntsRef(10)); |
| } |
| |
| private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) { |
| if (inputMode == 0) { |
| // utf8 |
| return toIntsRef(new BytesRef(s), ir); |
| } else { |
| // utf32 |
| return toIntsRefUTF32(s, ir); |
| } |
| } |
| |
| private static IntsRef toIntsRefUTF32(String s, IntsRef ir) { |
| final int charLength = s.length(); |
| int charIdx = 0; |
| int intIdx = 0; |
| while(charIdx < charLength) { |
| if (intIdx == ir.ints.length) { |
| ir.grow(intIdx+1); |
| } |
| final int utf32 = s.codePointAt(charIdx); |
| ir.ints[intIdx] = utf32; |
| charIdx += Character.charCount(utf32); |
| intIdx++; |
| } |
| ir.length = intIdx; |
| return ir; |
| } |
| |
| private static IntsRef toIntsRef(BytesRef br, IntsRef ir) { |
| if (br.length > ir.ints.length) { |
| ir.grow(br.length); |
| } |
| for(int i=0;i<br.length;i++) { |
| ir.ints[i] = br.bytes[br.offset+i]&0xFF; |
| } |
| ir.length = br.length; |
| return ir; |
| } |
| |
| public void testBasicFSA() throws IOException { |
| String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"}; |
| String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"}; |
| IntsRef[] terms = new IntsRef[strings.length]; |
| IntsRef[] terms2 = new IntsRef[strings2.length]; |
| for(int inputMode=0;inputMode<2;inputMode++) { |
| if (VERBOSE) { |
| System.out.println("TEST: inputMode=" + inputModeToString(inputMode)); |
| } |
| |
| for(int idx=0;idx<strings.length;idx++) { |
| terms[idx] = toIntsRef(strings[idx], inputMode); |
| } |
| for(int idx=0;idx<strings2.length;idx++) { |
| terms2[idx] = toIntsRef(strings2[idx], inputMode); |
| } |
| Arrays.sort(terms2); |
| |
| doTest(inputMode, terms); |
| |
| // Test pre-determined FST sizes to make sure we haven't lost minimality (at least on this trivial set of terms): |
| |
| // FSA |
| { |
| final Outputs<Object> outputs = NoOutputs.getSingleton(); |
| final Object NO_OUTPUT = outputs.getNoOutput(); |
| final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms2.length); |
| for(IntsRef term : terms2) { |
| pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT)); |
| } |
| FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false); |
| assertNotNull(fst); |
| assertEquals(22, fst.getNodeCount()); |
| assertEquals(27, fst.getArcCount()); |
| } |
| |
| // FST ord pos int |
| { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); |
| final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length); |
| for(int idx=0;idx<terms2.length;idx++) { |
| pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx))); |
| } |
| final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false); |
| assertNotNull(fst); |
| assertEquals(22, fst.getNodeCount()); |
| assertEquals(27, fst.getArcCount()); |
| } |
| |
| // FST byte sequence ord |
| { |
| final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); |
| final BytesRef NO_OUTPUT = outputs.getNoOutput(); |
| final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<FSTTester.InputOutput<BytesRef>>(terms2.length); |
| for(int idx=0;idx<terms2.length;idx++) { |
| final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx)); |
| pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output)); |
| } |
| final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false); |
| assertNotNull(fst); |
| assertEquals(24, fst.getNodeCount()); |
| assertEquals(30, fst.getArcCount()); |
| } |
| } |
| } |
| |
| private static String simpleRandomString(Random r) { |
| final int end = r.nextInt(10); |
| if (end == 0) { |
| // allow 0 length |
| return ""; |
| } |
| final char[] buffer = new char[end]; |
| for (int i = 0; i < end; i++) { |
| buffer[i] = (char) _TestUtil.nextInt(r, 97, 102); |
| } |
| return new String(buffer, 0, end); |
| } |
| |
| // given set of terms, test the different outputs for them |
| private void doTest(int inputMode, IntsRef[] terms) throws IOException { |
| Arrays.sort(terms); |
| |
| // NoOutputs (simple FSA) |
| { |
| final Outputs<Object> outputs = NoOutputs.getSingleton(); |
| final Object NO_OUTPUT = outputs.getNoOutput(); |
| final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length); |
| for(IntsRef term : terms) { |
| pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT)); |
| } |
| new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(); |
| } |
| |
| // PositiveIntOutput (ord) |
| { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); |
| final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length); |
| for(int idx=0;idx<terms.length;idx++) { |
| pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx))); |
| } |
| new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(); |
| } |
| |
| // PositiveIntOutput (random monotonically increasing positive number) |
| { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); |
| final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length); |
| long lastOutput = 0; |
| for(int idx=0;idx<terms.length;idx++) { |
| final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000); |
| lastOutput = value; |
| pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value))); |
| } |
| new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(); |
| } |
| |
| // PositiveIntOutput (random positive number) |
| { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); |
| final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length); |
| for(int idx=0;idx<terms.length;idx++) { |
| pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE)); |
| } |
| new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(); |
| } |
| |
| // Pair<ord, (random monotonically increasing positive number> |
| { |
| final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(random.nextBoolean()); |
| final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(random.nextBoolean()); |
| final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2); |
| final List<FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>> pairs = new ArrayList<FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>>(terms.length); |
| long lastOutput = 0; |
| for(int idx=0;idx<terms.length;idx++) { |
| final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000); |
| lastOutput = value; |
| pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx], |
| outputs.get(o1.get(idx), |
| o2.get(value)))); |
| } |
| new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs).doTest(); |
| } |
| |
| // Sequence-of-bytes |
| { |
| final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); |
| final BytesRef NO_OUTPUT = outputs.getNoOutput(); |
| final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<FSTTester.InputOutput<BytesRef>>(terms.length); |
| for(int idx=0;idx<terms.length;idx++) { |
| final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx)); |
| pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output)); |
| } |
| new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(); |
| } |
| |
| // Sequence-of-ints |
| { |
| final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); |
| final List<FSTTester.InputOutput<IntsRef>> pairs = new ArrayList<FSTTester.InputOutput<IntsRef>>(terms.length); |
| for(int idx=0;idx<terms.length;idx++) { |
| final String s = Integer.toString(idx); |
| final IntsRef output = new IntsRef(s.length()); |
| output.length = s.length(); |
| for(int idx2=0;idx2<output.length;idx2++) { |
| output.ints[idx2] = s.charAt(idx2); |
| } |
| pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output)); |
| } |
| new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs).doTest(); |
| } |
| |
| // Up to two positive ints, shared, generally but not |
| // monotonically increasing |
| { |
| if (VERBOSE) { |
| System.out.println("TEST: now test UpToTwoPositiveIntOutputs"); |
| } |
| final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true); |
| final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length); |
| long lastOutput = 0; |
| for(int idx=0;idx<terms.length;idx++) { |
| // Sometimes go backwards |
| long value = lastOutput + _TestUtil.nextInt(random, -100, 1000); |
| while(value < 0) { |
| value = lastOutput + _TestUtil.nextInt(random, -100, 1000); |
| } |
| final Object output; |
| if (random.nextInt(5) == 3) { |
| long value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000); |
| while(value2 < 0) { |
| value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000); |
| } |
| output = outputs.get(value, value2); |
| } else { |
| output = outputs.get(value); |
| } |
| pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output)); |
| } |
| new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(); |
| } |
| } |
| |
| private static class FSTTester<T> { |
| |
| final Random random; |
| final List<InputOutput<T>> pairs; |
| final int inputMode; |
| final Outputs<T> outputs; |
| final Directory dir; |
| |
| public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs) { |
| this.random = random; |
| this.dir = dir; |
| this.inputMode = inputMode; |
| this.pairs = pairs; |
| this.outputs = outputs; |
| } |
| |
| private static class InputOutput<T> implements Comparable<InputOutput<T>> { |
| public final IntsRef input; |
| public final T output; |
| |
| public InputOutput(IntsRef input, T output) { |
| this.input = input; |
| this.output = output; |
| } |
| |
| public int compareTo(InputOutput<T> other) { |
| if (other instanceof InputOutput) { |
| return input.compareTo((other).input); |
| } else { |
| throw new IllegalArgumentException(); |
| } |
| } |
| } |
| |
| public void doTest() throws IOException { |
| // no pruning |
| doTest(0, 0, true); |
| |
| if (!(outputs instanceof UpToTwoPositiveIntOutputs)) { |
| // simple pruning |
| doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true); |
| |
| // leafy pruning |
| doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true); |
| } |
| } |
| |
| // runs the term, returning the output, or null if term |
| // isn't accepted. if prefixLength is non-null it must be |
| // length 1 int array; prefixLength[0] is set to the length |
| // of the term prefix that matches |
| private T run(FST<T> fst, IntsRef term, int[] prefixLength) throws IOException { |
| assert prefixLength == null || prefixLength.length == 1; |
| final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); |
| final T NO_OUTPUT = fst.outputs.getNoOutput(); |
| T output = NO_OUTPUT; |
| |
| for(int i=0;i<=term.length;i++) { |
| final int label; |
| if (i == term.length) { |
| label = FST.END_LABEL; |
| } else { |
| label = term.ints[term.offset+i]; |
| } |
| //System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); |
| if (fst.findTargetArc(label, arc, arc) == null) { |
| if (prefixLength != null) { |
| prefixLength[0] = i; |
| return output; |
| } else { |
| return null; |
| } |
| } |
| output = fst.outputs.add(output, arc.output); |
| } |
| |
| if (prefixLength != null) { |
| prefixLength[0] = term.length; |
| } |
| |
| return output; |
| } |
| |
| private T randomAcceptedWord(FST<T> fst, IntsRef in) throws IOException { |
| FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); |
| |
| final List<FST.Arc<T>> arcs = new ArrayList<FST.Arc<T>>(); |
| in.length = 0; |
| in.offset = 0; |
| final T NO_OUTPUT = fst.outputs.getNoOutput(); |
| T output = NO_OUTPUT; |
| |
| while(true) { |
| // read all arcs: |
| fst.readFirstTargetArc(arc, arc); |
| arcs.add(new FST.Arc<T>().copyFrom(arc)); |
| while(!arc.isLast()) { |
| fst.readNextArc(arc); |
| arcs.add(new FST.Arc<T>().copyFrom(arc)); |
| } |
| |
| // pick one |
| arc = arcs.get(random.nextInt(arcs.size())); |
| arcs.clear(); |
| |
| // accumulate output |
| output = fst.outputs.add(output, arc.output); |
| |
| // append label |
| if (arc.label == FST.END_LABEL) { |
| break; |
| } |
| |
| if (in.ints.length == in.length) { |
| in.grow(1+in.length); |
| } |
| in.ints[in.length++] = arc.label; |
| } |
| |
| return output; |
| } |
| |
| |
| FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException { |
| if (VERBOSE) { |
| System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2); |
| } |
| |
| final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, |
| prune1, prune2, |
| prune1==0 && prune2==0, |
| allowRandomSuffixSharing ? random.nextBoolean() : true, |
| allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, |
| outputs); |
| |
| for(InputOutput<T> pair : pairs) { |
| if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { |
| final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs; |
| final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output; |
| @SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder; |
| builderObject.add(pair.input, _outputs.get(twoLongs.first)); |
| builderObject.add(pair.input, _outputs.get(twoLongs.second)); |
| } else { |
| builder.add(pair.input, pair.output); |
| } |
| } |
| FST<T> fst = builder.finish(); |
| |
| if (random.nextBoolean() && fst != null) { |
| IndexOutput out = dir.createOutput("fst.bin"); |
| fst.save(out); |
| out.close(); |
| IndexInput in = dir.openInput("fst.bin"); |
| try { |
| fst = new FST<T>(in, outputs); |
| } finally { |
| in.close(); |
| dir.deleteFile("fst.bin"); |
| } |
| } |
| |
| if (VERBOSE && pairs.size() <= 20 && fst != null) { |
| Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); |
| Util.toDot(fst, w, false, false); |
| w.close(); |
| System.out.println("SAVED out.dot"); |
| } |
| |
| if (VERBOSE) { |
| if (fst == null) { |
| System.out.println(" fst has 0 nodes (fully pruned)"); |
| } else { |
| System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs"); |
| } |
| } |
| |
| if (prune1 == 0 && prune2 == 0) { |
| verifyUnPruned(inputMode, fst); |
| } else { |
| verifyPruned(inputMode, fst, prune1, prune2); |
| } |
| |
| return fst; |
| } |
| |
| // FST is complete |
| private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException { |
| |
| if (pairs.size() == 0) { |
| assertNull(fst); |
| return; |
| } |
| |
| if (VERBOSE) { |
| System.out.println("TEST: now verify " + pairs.size() + " terms"); |
| for(InputOutput<T> pair : pairs) { |
| assertNotNull(pair); |
| assertNotNull(pair.input); |
| assertNotNull(pair.output); |
| System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output)); |
| } |
| } |
| |
| assertNotNull(fst); |
| |
| // visit valid paris in order -- make sure all words |
| // are accepted, and FSTEnum's next() steps through |
| // them correctly |
| if (VERBOSE) { |
| System.out.println("TEST: check valid terms/next()"); |
| } |
| { |
| IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst); |
| for(InputOutput<T> pair : pairs) { |
| IntsRef term = pair.input; |
| if (VERBOSE) { |
| System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output)); |
| } |
| Object output = run(fst, term, null); |
| |
| assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output); |
| assertEquals(pair.output, output); |
| |
| // verify enum's next |
| IntsRefFSTEnum.InputOutput<T> t = fstEnum.next(); |
| assertNotNull(t); |
| assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input); |
| assertEquals(pair.output, t.output); |
| } |
| assertNull(fstEnum.next()); |
| } |
| |
| final Map<IntsRef,T> termsMap = new HashMap<IntsRef,T>(); |
| for(InputOutput<T> pair : pairs) { |
| termsMap.put(pair.input, pair.output); |
| } |
| |
| // find random matching word and make sure it's valid |
| if (VERBOSE) { |
| System.out.println("TEST: verify random accepted terms"); |
| } |
| final IntsRef scratch = new IntsRef(10); |
| int num = atLeast(500); |
| for(int iter=0;iter<num;iter++) { |
| T output = randomAcceptedWord(fst, scratch); |
| assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch)); |
| assertEquals(termsMap.get(scratch), output); |
| } |
| |
| // test IntsRefFSTEnum.seek: |
| if (VERBOSE) { |
| System.out.println("TEST: verify seek"); |
| } |
| IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst); |
| num = atLeast(100); |
| for(int iter=0;iter<num;iter++) { |
| if (VERBOSE) { |
| System.out.println("TEST: iter=" + iter); |
| } |
| if (random.nextBoolean()) { |
| // seek to term that doesn't exist: |
| while(true) { |
| final IntsRef term = toIntsRef(getRandomString(), inputMode); |
| int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null)); |
| if (pos < 0) { |
| pos = -(pos+1); |
| // ok doesn't exist |
| //System.out.println(" seek " + inputToString(inputMode, term)); |
| final IntsRefFSTEnum.InputOutput<T> seekResult; |
| if (random.nextBoolean()) { |
| if (VERBOSE) { |
| System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term)); |
| } |
| seekResult = fstEnum.seekFloor(term); |
| pos--; |
| } else { |
| if (VERBOSE) { |
| System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term)); |
| } |
| seekResult = fstEnum.seekCeil(term); |
| } |
| |
| if (pos != -1 && pos < pairs.size()) { |
| //System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output)); |
| assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult); |
| if (VERBOSE) { |
| System.out.println(" got " + inputToString(inputMode, seekResult.input)); |
| } |
| assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input); |
| assertEquals(pairs.get(pos).output, seekResult.output); |
| } else { |
| // seeked before start or beyond end |
| //System.out.println("seek=" + seekTerm); |
| assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult); |
| if (VERBOSE) { |
| System.out.println(" got null"); |
| } |
| } |
| |
| break; |
| } |
| } |
| } else { |
| // seek to term that does exist: |
| InputOutput<T> pair = pairs.get(random.nextInt(pairs.size())); |
| final IntsRefFSTEnum.InputOutput<T> seekResult; |
| if (random.nextBoolean()) { |
| if (VERBOSE) { |
| System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input)); |
| } |
| seekResult = fstEnum.seekFloor(pair.input); |
| } else { |
| if (VERBOSE) { |
| System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input)); |
| } |
| seekResult = fstEnum.seekCeil(pair.input); |
| } |
| assertNotNull(seekResult); |
| assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input); |
| assertEquals(pair.output, seekResult.output); |
| } |
| } |
| |
| if (VERBOSE) { |
| System.out.println("TEST: mixed next/seek"); |
| } |
| |
| // test mixed next/seek |
| num = atLeast(100); |
| for(int iter=0;iter<num;iter++) { |
| if (VERBOSE) { |
| System.out.println("TEST: iter " + iter); |
| } |
| // reset: |
| fstEnum = new IntsRefFSTEnum<T>(fst); |
| int upto = -1; |
| while(true) { |
| boolean isDone = false; |
| if (upto == pairs.size()-1 || random.nextBoolean()) { |
| // next |
| upto++; |
| if (VERBOSE) { |
| System.out.println(" do next"); |
| } |
| isDone = fstEnum.next() == null; |
| } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) { |
| int attempt = 0; |
| for(;attempt<10;attempt++) { |
| IntsRef term = toIntsRef(getRandomString(), inputMode); |
| if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) { |
| int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null)); |
| assert pos < 0; |
| upto = -(pos+1); |
| |
| if (random.nextBoolean()) { |
| upto--; |
| assertTrue(upto != -1); |
| if (VERBOSE) { |
| System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")"); |
| } |
| isDone = fstEnum.seekFloor(term) == null; |
| } else { |
| if (VERBOSE) { |
| System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")"); |
| } |
| isDone = fstEnum.seekCeil(term) == null; |
| } |
| |
| break; |
| } |
| } |
| if (attempt == 10) { |
| continue; |
| } |
| |
| } else { |
| final int inc = random.nextInt(pairs.size() - upto - 1); |
| upto += inc; |
| if (upto == -1) { |
| upto = 0; |
| } |
| |
| if (random.nextBoolean()) { |
| if (VERBOSE) { |
| System.out.println(" do advanceCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")"); |
| } |
| isDone = fstEnum.seekCeil(pairs.get(upto).input) == null; |
| } else { |
| if (VERBOSE) { |
| System.out.println(" do advanceFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")"); |
| } |
| isDone = fstEnum.seekFloor(pairs.get(upto).input) == null; |
| } |
| } |
| if (VERBOSE) { |
| if (!isDone) { |
| System.out.println(" got " + inputToString(inputMode, fstEnum.current().input)); |
| } else { |
| System.out.println(" got null"); |
| } |
| } |
| |
| if (upto == pairs.size()) { |
| assertTrue(isDone); |
| break; |
| } else { |
| assertFalse(isDone); |
| assertEquals(pairs.get(upto).input, fstEnum.current().input); |
| assertEquals(pairs.get(upto).output, fstEnum.current().output); |
| |
| /* |
| if (upto < pairs.size()-1) { |
| int tryCount = 0; |
| while(tryCount < 10) { |
| final IntsRef t = toIntsRef(getRandomString(), inputMode); |
| if (pairs.get(upto).input.compareTo(t) < 0) { |
| final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0; |
| if (VERBOSE) { |
| System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected); |
| } |
| assertEquals(expected, fstEnum.beforeNext(t)); |
| break; |
| } |
| tryCount++; |
| } |
| } |
| */ |
| } |
| } |
| } |
| } |
| |
| private static class CountMinOutput<T> { |
| int count; |
| T output; |
| T finalOutput; |
| boolean isLeaf = true; |
| boolean isFinal; |
| } |
| |
| // FST is pruned |
| private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException { |
| |
| if (VERBOSE) { |
| System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs); |
| for(InputOutput<T> pair : pairs) { |
| System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output)); |
| } |
| } |
| |
| // To validate the FST, we brute-force compute all prefixes |
| // in the terms, matched to their "common" outputs, prune that |
| // set according to the prune thresholds, then assert the FST |
| // matches that same set. |
| |
| // NOTE: Crazy RAM intensive!! |
| |
| //System.out.println("TEST: tally prefixes"); |
| |
| // build all prefixes |
| final Map<IntsRef,CountMinOutput<T>> prefixes = new HashMap<IntsRef,CountMinOutput<T>>(); |
| final IntsRef scratch = new IntsRef(10); |
| for(InputOutput<T> pair: pairs) { |
| scratch.copy(pair.input); |
| for(int idx=0;idx<=pair.input.length;idx++) { |
| scratch.length = idx; |
| CountMinOutput<T> cmo = prefixes.get(scratch); |
| if (cmo == null) { |
| cmo = new CountMinOutput<T>(); |
| cmo.count = 1; |
| cmo.output = pair.output; |
| prefixes.put(new IntsRef(scratch), cmo); |
| } else { |
| cmo.count++; |
| cmo.output = outputs.common(cmo.output, pair.output); |
| } |
| if (idx == pair.input.length) { |
| cmo.isFinal = true; |
| cmo.finalOutput = cmo.output; |
| } |
| } |
| } |
| |
| if (VERBOSE) { |
| System.out.println("TEST: now prune"); |
| } |
| |
| // prune 'em |
| final Iterator<Map.Entry<IntsRef,CountMinOutput<T>>> it = prefixes.entrySet().iterator(); |
| while(it.hasNext()) { |
| Map.Entry<IntsRef,CountMinOutput<T>> ent = it.next(); |
| final IntsRef prefix = ent.getKey(); |
| final CountMinOutput<T> cmo = ent.getValue(); |
| if (VERBOSE) { |
| System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal); |
| } |
| final boolean keep; |
| if (prune1 > 0) { |
| keep = cmo.count >= prune1; |
| } else { |
| assert prune2 > 0; |
| if (prune2 > 1 && cmo.count >= prune2) { |
| keep = true; |
| } else if (prefix.length > 0) { |
| // consult our parent |
| scratch.length = prefix.length-1; |
| System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length); |
| final CountMinOutput<T> cmo2 = prefixes.get(scratch); |
| //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count)); |
| keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1))); |
| } else if (cmo.count >= prune2) { |
| keep = true; |
| } else { |
| keep = false; |
| } |
| } |
| |
| if (!keep) { |
| it.remove(); |
| //System.out.println(" remove"); |
| } else { |
| // clear isLeaf for all ancestors |
| //System.out.println(" keep"); |
| scratch.copy(prefix); |
| scratch.length--; |
| while(scratch.length >= 0) { |
| final CountMinOutput<T> cmo2 = prefixes.get(scratch); |
| if (cmo2 != null) { |
| //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch)); |
| cmo2.isLeaf = false; |
| } |
| scratch.length--; |
| } |
| } |
| } |
| |
| //System.out.println("TEST: after prune"); |
| /* |
| for(Map.Entry<BytesRef,CountMinOutput> ent : prefixes.entrySet()) { |
| System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); |
| if (ent.getValue().isFinal) { |
| System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput)); |
| } |
| } |
| */ |
| |
| if (prefixes.size() <= 1) { |
| assertNull(fst); |
| return; |
| } |
| |
| assertNotNull(fst); |
| |
| // make sure FST only enums valid prefixes |
| if (VERBOSE) { |
| System.out.println("TEST: check pruned enum"); |
| } |
| IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst); |
| IntsRefFSTEnum.InputOutput<T> current; |
| while((current = fstEnum.next()) != null) { |
| if (VERBOSE) { |
| System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output)); |
| } |
| final CountMinOutput cmo = prefixes.get(current.input); |
| assertNotNull(cmo); |
| assertTrue(cmo.isLeaf || cmo.isFinal); |
| //if (cmo.isFinal && !cmo.isLeaf) { |
| if (cmo.isFinal) { |
| assertEquals(cmo.finalOutput, current.output); |
| } else { |
| assertEquals(cmo.output, current.output); |
| } |
| } |
| |
| // make sure all non-pruned prefixes are present in the FST |
| if (VERBOSE) { |
| System.out.println("TEST: verify all prefixes"); |
| } |
| final int[] stopNode = new int[1]; |
| for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) { |
| if (ent.getKey().length > 0) { |
| final CountMinOutput<T> cmo = ent.getValue(); |
| final T output = run(fst, ent.getKey(), stopNode); |
| if (VERBOSE) { |
| System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output)); |
| } |
| // if (cmo.isFinal && !cmo.isLeaf) { |
| if (cmo.isFinal) { |
| assertEquals(cmo.finalOutput, output); |
| } else { |
| assertEquals(cmo.output, output); |
| } |
| assertEquals(ent.getKey().length, stopNode[0]); |
| } |
| } |
| } |
| } |
| |
| public void testRandomWords() throws IOException { |
| testRandomWords(1000, atLeast(2)); |
| //testRandomWords(20, 100); |
| } |
| |
| private String inputModeToString(int mode) { |
| if (mode == 0) { |
| return "utf8"; |
| } else { |
| return "utf32"; |
| } |
| } |
| |
| private void testRandomWords(int maxNumWords, int numIter) throws IOException { |
| for(int iter=0;iter<numIter;iter++) { |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter " + iter); |
| } |
| for(int inputMode=0;inputMode<2;inputMode++) { |
| final int numWords = random.nextInt(maxNumWords+1); |
| Set<IntsRef> termsSet = new HashSet<IntsRef>(); |
| IntsRef[] terms = new IntsRef[numWords]; |
| while(termsSet.size() < numWords) { |
| final String term = getRandomString(); |
| termsSet.add(toIntsRef(term, inputMode)); |
| } |
| doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()])); |
| } |
| } |
| } |
| |
| static String getRandomString() { |
| final String term; |
| if (random.nextBoolean()) { |
| term = _TestUtil.randomRealisticUnicodeString(random); |
| } else { |
| // we want to mix in limited-alphabet symbols so |
| // we get more sharing of the nodes given how few |
| // terms we are testing... |
| term = simpleRandomString(random); |
| } |
| return term; |
| } |
| |
| @Nightly |
| public void testBigSet() throws IOException { |
| testRandomWords(_TestUtil.nextInt(random, 50000, 60000), 1); |
| } |
| |
| private static String inputToString(int inputMode, IntsRef term) { |
| return inputToString(inputMode, term, true); |
| } |
| |
| private static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) { |
| if (!isValidUnicode) { |
| return term.toString(); |
| } else if (inputMode == 0) { |
| // utf8 |
| return toBytesRef(term).utf8ToString() + " " + term; |
| } else { |
| // utf32 |
| return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term; |
| } |
| } |
| |
| private static IntsRef toIntsRef(String s) { |
| final int charCount = s.length(); |
| IntsRef ir = new IntsRef(charCount); |
| for(int charIDX=0;charIDX<charCount;charIDX++) { |
| ir.ints[charIDX] = s.charAt(charIDX); |
| } |
| ir.length = charCount; |
| return ir; |
| } |
| |
| private static String toString(IntsRef ints) { |
| char[] chars = new char[ints.length]; |
| for(int charIDX=0;charIDX<ints.length;charIDX++) { |
| final int ch = ints.ints[ints.offset+charIDX]; |
| assertTrue(ch >= 0 && ch < 65536); |
| chars[charIDX] = (char) ch; |
| } |
| return new String(chars); |
| } |
| |
| // Build FST for all unique terms in the test line docs |
| // file, up until a time limit |
| public void testRealTerms() throws Exception { |
| |
| /* |
| if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) { |
| // no |
| CodecProvider.getDefault().setDefaultFieldCodec("Standard"); |
| } |
| */ |
| |
| final LineFileDocs docs = new LineFileDocs(random); |
| final int RUN_TIME_MSEC = atLeast(500); |
| final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64); |
| final File tempDir = _TestUtil.getTempDir("fstlines"); |
| final MockDirectoryWrapper dir = newFSDirectory(tempDir); |
| final IndexWriter writer = new IndexWriter(dir, conf); |
| writer.setInfoStream(VERBOSE ? System.out : null); |
| final long stopTime = System.currentTimeMillis() + RUN_TIME_MSEC; |
| Document doc; |
| int docCount = 0; |
| while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) { |
| writer.addDocument(doc); |
| docCount++; |
| } |
| IndexReader r = IndexReader.open(writer, true); |
| writer.close(); |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); |
| Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, outputs); |
| |
| boolean storeOrd = false; |
| if (VERBOSE) { |
| if (storeOrd) { |
| System.out.println("FST stores ord"); |
| } else { |
| System.out.println("FST stores docFreq"); |
| } |
| } |
| TermEnum termEnum = r.terms(new Term("body", "")); |
| if (VERBOSE) { |
| System.out.println("TEST: got termEnum=" + termEnum); |
| } |
| int ord = 0; |
| while(true) { |
| final Term term = termEnum.term(); |
| if (term == null || !"body".equals(term.field())) { |
| break; |
| } |
| |
| // No ord in 3.x: |
| /* |
| if (ord == 0) { |
| try { |
| termsEnum.ord(); |
| } catch (UnsupportedOperationException uoe) { |
| if (VERBOSE) { |
| System.out.println("TEST: codec doesn't support ord; FST stores docFreq"); |
| } |
| storeOrd = false; |
| } |
| } |
| */ |
| final int output; |
| if (storeOrd) { |
| output = ord; |
| } else { |
| output = termEnum.docFreq(); |
| } |
| //System.out.println("ADD: " + term.text() + " ch[0]=" + (term.text().length() == 0 ? -1 : term.text().charAt(0))); |
| builder.add(toIntsRef(term.text()), outputs.get(output)); |
| ord++; |
| if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { |
| System.out.println(ord + " terms..."); |
| } |
| termEnum.next(); |
| } |
| final FST<Long> fst = builder.finish(); |
| if (VERBOSE) { |
| System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes"); |
| } |
| |
| if (ord > 0) { |
| // Now confirm BytesRefFSTEnum and TermEnum act the |
| // same: |
| final IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst); |
| int num = atLeast(1000); |
| for(int iter=0;iter<num;iter++) { |
| final String randomTerm = getRandomString(); |
| |
| if (VERBOSE) { |
| System.out.println("TEST: seek " + randomTerm + " ch[0]=" + (randomTerm.length() == 0 ? -1 : randomTerm.charAt(0))); |
| } |
| |
| termEnum = r.terms(new Term("body", randomTerm)); |
| final IntsRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(toIntsRef(randomTerm)); |
| |
| if (termEnum.term() == null || !"body".equals(termEnum.term().field())) { |
| assertNull("got " + (fstSeekResult == null ? "null" : toString(fstSeekResult.input) + " but expected null"), fstSeekResult); |
| } else { |
| assertSame(termEnum, fstEnum, storeOrd); |
| for(int nextIter=0;nextIter<10;nextIter++) { |
| if (VERBOSE) { |
| System.out.println("TEST: next"); |
| //if (storeOrd) { |
| //System.out.println(" ord=" + termEnum.ord()); |
| //} |
| } |
| termEnum.next(); |
| if (termEnum.term() != null && "body".equals(termEnum.term().field())) { |
| if (VERBOSE) { |
| System.out.println(" term=" + termEnum.term()); |
| } |
| assertNotNull(fstEnum.next()); |
| assertSame(termEnum, fstEnum, storeOrd); |
| } else { |
| if (VERBOSE) { |
| System.out.println(" end!"); |
| } |
| IntsRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next(); |
| if (nextResult != null) { |
| System.out.println("expected null but got: input=" + toString(nextResult.input) + " output=" + outputs.outputToString(nextResult.output)); |
| fail(); |
| } |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| r.close(); |
| dir.close(); |
| } |
| |
| private void assertSame(TermEnum termEnum, IntsRefFSTEnum fstEnum, boolean storeOrd) throws Exception { |
| if (termEnum.term() == null || !"body".equals(termEnum.term().field())) { |
| if (fstEnum.current() != null) { |
| fail("fstEnum.current().input=" + toString(fstEnum.current().input)); |
| } |
| } else { |
| assertNotNull(fstEnum.current()); |
| assertEquals(termEnum.term() + " != " + toString(fstEnum.current().input), termEnum.term().text(), toString(fstEnum.current().input)); |
| if (storeOrd) { |
| // fst stored the ord |
| // No ord in 3.x |
| // assertEquals(termEnum.ord(), ((Long) fstEnum.current().output).longValue()); |
| } else { |
| // fst stored the docFreq |
| assertEquals(termEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); |
| } |
| } |
| } |
| |
| private static abstract class VisitTerms<T> { |
| private final String dirOut; |
| private final String wordsFileIn; |
| private int inputMode; |
| private final Outputs<T> outputs; |
| private final Builder<T> builder; |
| |
| public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs) { |
| this.dirOut = dirOut; |
| this.wordsFileIn = wordsFileIn; |
| this.inputMode = inputMode; |
| this.outputs = outputs; |
| |
| builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs); |
| } |
| |
| protected abstract T getOutput(IntsRef input, int ord) throws IOException; |
| |
| public void run(int limit, boolean verify) throws IOException { |
| BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); |
| try { |
| final IntsRef intsRef = new IntsRef(10); |
| long tStart = System.currentTimeMillis(); |
| int ord = 0; |
| while(true) { |
| String w = is.readLine(); |
| if (w == null) { |
| break; |
| } |
| toIntsRef(w, inputMode, intsRef); |
| builder.add(intsRef, |
| getOutput(intsRef, ord)); |
| |
| ord++; |
| if (ord % 500000 == 0) { |
| System.out.println( |
| String.format(Locale.ENGLISH, |
| "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord)); |
| } |
| if (ord >= limit) { |
| break; |
| } |
| } |
| |
| assert builder.getTermCount() == ord; |
| final FST<T> fst = builder.finish(); |
| if (fst == null) { |
| System.out.println("FST was fully pruned!"); |
| System.exit(0); |
| } |
| |
| if (dirOut == null) |
| return; |
| |
| System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes()); |
| if (fst.getNodeCount() < 100) { |
| Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); |
| Util.toDot(fst, w, false, false); |
| w.close(); |
| System.out.println("Wrote FST to out.dot"); |
| } |
| |
| Directory dir = FSDirectory.open(new File(dirOut)); |
| IndexOutput out = dir.createOutput("fst.bin"); |
| fst.save(out); |
| out.close(); |
| |
| System.out.println("Saved FST to fst.bin."); |
| |
| if (!verify) { |
| return; |
| } |
| |
| System.out.println("\nNow verify..."); |
| |
| is.close(); |
| is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); |
| |
| ord = 0; |
| tStart = System.currentTimeMillis(); |
| while(true) { |
| String w = is.readLine(); |
| if (w == null) { |
| break; |
| } |
| toIntsRef(w, inputMode, intsRef); |
| T expected = getOutput(intsRef, ord); |
| T actual = Util.get(fst, intsRef); |
| if (actual == null) { |
| throw new RuntimeException("unexpected null output on input=" + w); |
| } |
| if (!actual.equals(expected)) { |
| throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); |
| } |
| |
| ord++; |
| if (ord % 500000 == 0) { |
| System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); |
| } |
| if (ord >= limit) { |
| break; |
| } |
| } |
| |
| double totSec = ((System.currentTimeMillis() - tStart)/1000.0); |
| System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)"); |
| |
| } finally { |
| is.close(); |
| } |
| } |
| } |
| |
| // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out |
| public static void main(String[] args) throws IOException { |
| int prune = 0; |
| int limit = Integer.MAX_VALUE; |
| int inputMode = 0; // utf8 |
| boolean storeOrds = false; |
| boolean storeDocFreqs = false; |
| boolean verify = true; |
| |
| String wordsFileIn = null; |
| String dirOut = null; |
| |
| int idx = 0; |
| while (idx < args.length) { |
| if (args[idx].equals("-prune")) { |
| prune = Integer.valueOf(args[1 + idx]); |
| idx++; |
| } else if (args[idx].equals("-limit")) { |
| limit = Integer.valueOf(args[1 + idx]); |
| idx++; |
| } else if (args[idx].equals("-utf8")) { |
| inputMode = 0; |
| } else if (args[idx].equals("-utf32")) { |
| inputMode = 1; |
| } else if (args[idx].equals("-docFreq")) { |
| storeDocFreqs = true; |
| } else if (args[idx].equals("-ords")) { |
| storeOrds = true; |
| } else if (args[idx].equals("-noverify")) { |
| verify = false; |
| } else if (args[idx].startsWith("-")) { |
| System.err.println("Unrecognized option: " + args[idx]); |
| System.exit(-1); |
| } else { |
| if (wordsFileIn == null) { |
| wordsFileIn = args[idx]; |
| } else if (dirOut == null) { |
| dirOut = args[idx]; |
| } else { |
| System.err.println("Too many arguments, expected: input [output]"); |
| System.exit(-1); |
| } |
| } |
| idx++; |
| } |
| |
| if (wordsFileIn == null) { |
| System.err.println("No input file."); |
| System.exit(-1); |
| } |
| |
| // ord benefits from share, docFreqs don't: |
| |
| if (storeOrds && storeDocFreqs) { |
| // Store both ord & docFreq: |
| final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true); |
| final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false); |
| final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2); |
| new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs) { |
| Random rand; |
| @Override |
| public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) { |
| if (ord == 0) { |
| rand = new Random(17); |
| } |
| return new PairOutputs.Pair<Long,Long>(o1.get(ord), |
| o2.get(_TestUtil.nextInt(rand, 1, 5000))); |
| } |
| }.run(limit, verify); |
| } else if (storeOrds) { |
| // Store only ords |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); |
| new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) { |
| @Override |
| public Long getOutput(IntsRef input, int ord) { |
| return outputs.get(ord); |
| } |
| }.run(limit, verify); |
| } else if (storeDocFreqs) { |
| // Store only docFreq |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false); |
| new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) { |
| Random rand; |
| @Override |
| public Long getOutput(IntsRef input, int ord) { |
| if (ord == 0) { |
| rand = new Random(17); |
| } |
| return outputs.get(_TestUtil.nextInt(rand, 1, 5000)); |
| } |
| }.run(limit, verify); |
| } else { |
| // Store nothing |
| final NoOutputs outputs = NoOutputs.getSingleton(); |
| final Object NO_OUTPUT = outputs.getNoOutput(); |
| new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs) { |
| @Override |
| public Object getOutput(IntsRef input, int ord) { |
| return NO_OUTPUT; |
| } |
| }.run(limit, verify); |
| } |
| } |
| |
| public void testSingleString() throws Exception { |
| final Outputs<Object> outputs = NoOutputs.getSingleton(); |
| final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs); |
| b.add(new BytesRef("foobar"), outputs.getNoOutput()); |
| final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish()); |
| assertNull(fstEnum.seekFloor(new BytesRef("foo"))); |
| assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); |
| } |
| |
| public void testSimple() throws Exception { |
| |
| // Get outputs -- passing true means FST will share |
| // (delta code) the outputs. This should result in |
| // smaller FST if the outputs grow monotonically. But |
| // if numbers are "random", false should give smaller |
| // final size: |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); |
| |
| // Build an FST mapping BytesRef -> Long |
| final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| final BytesRef a = new BytesRef("a"); |
| final BytesRef b = new BytesRef("b"); |
| final BytesRef c = new BytesRef("c"); |
| |
| builder.add(a, outputs.get(17)); |
| builder.add(b, outputs.get(42)); |
| builder.add(c, outputs.get(13824324872317238L)); |
| |
| final FST<Long> fst = builder.finish(); |
| |
| assertEquals(13824324872317238L, (long) Util.get(fst, c)); |
| assertEquals(42, (long) Util.get(fst, b)); |
| assertEquals(17, (long) Util.get(fst, a)); |
| |
| BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst); |
| BytesRefFSTEnum.InputOutput<Long> seekResult; |
| seekResult = fstEnum.seekFloor(a); |
| assertNotNull(seekResult); |
| assertEquals(17, (long) seekResult.output); |
| |
| // goes to a |
| seekResult = fstEnum.seekFloor(new BytesRef("aa")); |
| assertNotNull(seekResult); |
| assertEquals(17, (long) seekResult.output); |
| |
| // goes to b |
| seekResult = fstEnum.seekCeil(new BytesRef("aa")); |
| assertNotNull(seekResult); |
| assertEquals(b, seekResult.input); |
| assertEquals(42, (long) seekResult.output); |
| } |
| |
| /** |
| * Test state expansion (array format) on close-to-root states. Creates |
| * synthetic input that has one expanded state on each level. |
| * |
| * @see "https://issues.apache.org/jira/browse/LUCENE-2933" |
| */ |
| public void testExpandedCloseToRoot() throws Exception { |
| class SyntheticData { |
| FST<Object> compile(String[] lines) throws IOException { |
| final NoOutputs outputs = NoOutputs.getSingleton(); |
| final Object nothing = outputs.getNoOutput(); |
| final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| int line = 0; |
| final BytesRef term = new BytesRef(); |
| while (line < lines.length) { |
| String w = lines[line++]; |
| if (w == null) { |
| break; |
| } |
| term.copy(w); |
| b.add(term, nothing); |
| } |
| |
| return b.finish(); |
| } |
| |
| void generate(ArrayList<String> out, StringBuilder b, char from, char to, |
| int depth) { |
| if (depth == 0 || from == to) { |
| String seq = b.toString() + "_" + out.size() + "_end"; |
| out.add(seq); |
| } else { |
| for (char c = from; c <= to; c++) { |
| b.append(c); |
| generate(out, b, from, c == to ? to : from, depth - 1); |
| b.deleteCharAt(b.length() - 1); |
| } |
| } |
| } |
| |
| public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth) |
| throws IOException { |
| if (fst.targetHasArcs(arc)) { |
| int childCount = 0; |
| for (arc = fst.readFirstTargetArc(arc, arc);; |
| arc = fst.readNextArc(arc), childCount++) |
| { |
| boolean expanded = fst.isExpandedTarget(arc); |
| int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1); |
| |
| assertEquals( |
| expanded, |
| (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && |
| children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) || |
| children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP); |
| if (arc.isLast()) break; |
| } |
| |
| return childCount; |
| } |
| return 0; |
| } |
| } |
| |
| // Sanity check. |
| assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP); |
| assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0); |
| |
| SyntheticData s = new SyntheticData(); |
| |
| ArrayList<String> out = new ArrayList<String>(); |
| StringBuilder b = new StringBuilder(); |
| s.generate(out, b, 'a', 'i', 10); |
| String[] input = out.toArray(new String[out.size()]); |
| Arrays.sort(input); |
| FST<Object> fst = s.compile(input); |
| FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>()); |
| s.verifyStateAndBelow(fst, arc, 1); |
| } |
| |
| // Make sure raw FST can differentiate between final vs |
| // non-final end nodes |
| public void testNonFinalStopNodes() throws Exception { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); |
| final Long nothing = outputs.getNoOutput(); |
| final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0); |
| |
| // Add final stop node |
| { |
| final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0); |
| node.isFinal = true; |
| rootNode.addArc('a', node); |
| final Builder.CompiledNode frozen = new Builder.CompiledNode(); |
| frozen.address = fst.addNode(node); |
| rootNode.arcs[0].nextFinalOutput = outputs.get(17); |
| rootNode.arcs[0].isFinal = true; |
| rootNode.arcs[0].output = nothing; |
| rootNode.arcs[0].target = frozen; |
| } |
| |
| // Add non-final stop node |
| { |
| final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0); |
| rootNode.addArc('b', node); |
| final Builder.CompiledNode frozen = new Builder.CompiledNode(); |
| frozen.address = fst.addNode(node); |
| rootNode.arcs[1].nextFinalOutput = nothing; |
| rootNode.arcs[1].output = outputs.get(42); |
| rootNode.arcs[1].target = frozen; |
| } |
| |
| fst.finish(fst.addNode(rootNode)); |
| |
| checkStopNodes(fst, outputs); |
| |
| // Make sure it still works after save/load: |
| Directory dir = newDirectory(); |
| IndexOutput out = dir.createOutput("fst"); |
| fst.save(out); |
| out.close(); |
| |
| IndexInput in = dir.openInput("fst"); |
| final FST<Long> fst2 = new FST<Long>(in, outputs); |
| checkStopNodes(fst2, outputs); |
| in.close(); |
| dir.close(); |
| } |
| |
| private void checkStopNodes(FST<Long> fst, PositiveIntOutputs outputs) throws Exception { |
| final Long nothing = outputs.getNoOutput(); |
| FST.Arc<Long> startArc = fst.getFirstArc(new FST.Arc<Long>()); |
| assertEquals(nothing, startArc.output); |
| assertEquals(nothing, startArc.nextFinalOutput); |
| |
| FST.Arc<Long> arc = fst.readFirstTargetArc(startArc, new FST.Arc<Long>()); |
| assertEquals('a', arc.label); |
| assertEquals(17, arc.nextFinalOutput.longValue()); |
| assertTrue(arc.isFinal()); |
| |
| arc = fst.readNextArc(arc); |
| assertEquals('b', arc.label); |
| assertFalse(arc.isFinal()); |
| assertEquals(42, arc.output.longValue()); |
| } |
| } |