blob: 9994fd8243dc65ede0e7b2633b643bd8898b40bf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.Util.Result;
import static org.apache.lucene.util.fst.FSTTester.getRandomString;
import static org.apache.lucene.util.fst.FSTTester.simpleRandomString;
import static org.apache.lucene.util.fst.FSTTester.toIntsRef;
@SuppressCodecs({ "SimpleText", "Direct" })
public class TestFSTs extends LuceneTestCase {
private MockDirectoryWrapper dir;
@Override
public void setUp() throws Exception {
super.setUp();
dir = newMockDirectory();
}
@Override
public void tearDown() throws Exception {
// can be null if we force simpletext (funky, some kind of bug in test runner maybe)
if (dir != null) dir.close();
super.tearDown();
}
public void testBasicFSA() throws IOException {
String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"};
IntsRef[] terms = new IntsRef[strings.length];
IntsRef[] terms2 = new IntsRef[strings2.length];
for(int inputMode=0;inputMode<2;inputMode++) {
if (VERBOSE) {
System.out.println("TEST: inputMode=" + inputModeToString(inputMode));
}
for(int idx=0;idx<strings.length;idx++) {
terms[idx] = toIntsRef(strings[idx], inputMode);
}
for(int idx=0;idx<strings2.length;idx++) {
terms2[idx] = toIntsRef(strings2[idx], inputMode);
}
Arrays.sort(terms2);
doTest(inputMode, terms);
// Test pre-determined FST sizes to make sure we haven't lost minimality (at least on this trivial set of terms):
// FSA
{
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object NO_OUTPUT = outputs.getNoOutput();
final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<>(terms2.length);
for(IntsRef term : terms2) {
pairs.add(new FSTTester.InputOutput<>(term, NO_OUTPUT));
}
FSTTester<Object> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs, false);
FST<Object> fst = tester.doTest(0, 0, false);
assertNotNull(fst);
assertEquals(22, tester.nodeCount);
assertEquals(27, tester.arcCount);
}
// FST ord pos int
{
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<>(terms2.length);
for(int idx=0;idx<terms2.length;idx++) {
pairs.add(new FSTTester.InputOutput<>(terms2[idx], (long) idx));
}
FSTTester<Long> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs, true);
final FST<Long> fst = tester.doTest(0, 0, false);
assertNotNull(fst);
assertEquals(22, tester.nodeCount);
assertEquals(27, tester.arcCount);
}
// FST byte sequence ord
{
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final BytesRef NO_OUTPUT = outputs.getNoOutput();
final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<>(terms2.length);
for(int idx=0;idx<terms2.length;idx++) {
final BytesRef output = idx == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<>(terms2[idx], output));
}
FSTTester<BytesRef> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs, false);
final FST<BytesRef> fst = tester.doTest(0, 0, false);
assertNotNull(fst);
assertEquals(24, tester.nodeCount);
assertEquals(30, tester.arcCount);
}
}
}
// given set of terms, test the different outputs for them
private void doTest(int inputMode, IntsRef[] terms) throws IOException {
Arrays.sort(terms);
// NoOutputs (simple FSA)
{
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object NO_OUTPUT = outputs.getNoOutput();
final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<>(terms.length);
for(IntsRef term : terms) {
pairs.add(new FSTTester.InputOutput<>(term, NO_OUTPUT));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
}
// PositiveIntOutput (ord)
{
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<>(terms.length);
for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<>(terms[idx], (long) idx));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs, true).doTest(true);
}
// PositiveIntOutput (random monotonically increasing positive number)
{
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<>(terms.length);
long lastOutput = 0;
for(int idx=0;idx<terms.length;idx++) {
final long value = lastOutput + TestUtil.nextInt(random(), 1, 1000);
lastOutput = value;
pairs.add(new FSTTester.InputOutput<>(terms[idx], value));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs, true).doTest(true);
}
// PositiveIntOutput (random positive number)
{
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<>(terms.length);
for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<>(terms[idx], TestUtil.nextLong(random(), 0, Long.MAX_VALUE)));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
}
// Pair<ord, (random monotonically increasing positive number>
{
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton();
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton();
final PairOutputs<Long,Long> outputs = new PairOutputs<>(o1, o2);
final List<FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>> pairs = new ArrayList<>(terms.length);
long lastOutput = 0;
for(int idx=0;idx<terms.length;idx++) {
final long value = lastOutput + TestUtil.nextInt(random(), 1, 1000);
lastOutput = value;
pairs.add(new FSTTester.InputOutput<>(terms[idx],
outputs.newPair((long) idx, value)));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
}
// Sequence-of-bytes
{
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final BytesRef NO_OUTPUT = outputs.getNoOutput();
final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<>(terms.length);
for(int idx=0;idx<terms.length;idx++) {
final BytesRef output = random().nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<>(terms[idx], output));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
}
// Sequence-of-ints
{
final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
final List<FSTTester.InputOutput<IntsRef>> pairs = new ArrayList<>(terms.length);
for(int idx=0;idx<terms.length;idx++) {
final String s = Integer.toString(idx);
final IntsRef output = new IntsRef(s.length());
output.length = s.length();
for(int idx2=0;idx2<output.length;idx2++) {
output.ints[idx2] = s.charAt(idx2);
}
pairs.add(new FSTTester.InputOutput<>(terms[idx], output));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
}
}
public void testRandomWords() throws IOException {
if (TEST_NIGHTLY) {
testRandomWords(1000, atLeast(2));
} else {
testRandomWords(100, 1);
}
}
String inputModeToString(int mode) {
if (mode == 0) {
return "utf8";
} else {
return "utf32";
}
}
private void testRandomWords(int maxNumWords, int numIter) throws IOException {
Random random = new Random(random().nextLong());
for(int iter=0;iter<numIter;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter " + iter);
}
for(int inputMode=0;inputMode<2;inputMode++) {
final int numWords = random.nextInt(maxNumWords+1);
Set<IntsRef> termsSet = new HashSet<>();
IntsRef[] terms = new IntsRef[numWords];
while(termsSet.size() < numWords) {
final String term = getRandomString(random);
termsSet.add(toIntsRef(term, inputMode));
}
doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
}
}
}
@Nightly
public void testBigSet() throws IOException {
testRandomWords(TestUtil.nextInt(random(), 50000, 60000), 1);
}
// Build FST for all unique terms in the test line docs
// file, up until a doc limit
@Slow
public void testRealTerms() throws Exception {
final LineFileDocs docs = new LineFileDocs(random());
final int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(50);
MockAnalyzer analyzer = new MockAnalyzer(random());
analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
final Path tempDir = createTempDir("fstlines");
final Directory dir = newFSDirectory(tempDir);
final IndexWriter writer = new IndexWriter(dir, conf);
Document doc;
int docCount = 0;
while((doc = docs.nextDoc()) != null && docCount < numDocs) {
writer.addDocument(doc);
docCount++;
}
IndexReader r = DirectoryReader.open(writer);
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
boolean storeOrd = random().nextBoolean();
if (VERBOSE) {
if (storeOrd) {
System.out.println("FST stores ord");
} else {
System.out.println("FST stores docFreq");
}
}
Terms terms = MultiTerms.getTerms(r, "body");
if (terms != null) {
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
final TermsEnum termsEnum = terms.iterator();
if (VERBOSE) {
System.out.println("TEST: got termsEnum=" + termsEnum);
}
BytesRef term;
int ord = 0;
Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null);
while((term = termsEnum.next()) != null) {
BytesRef term2 = termsEnum2.next();
assertNotNull(term2);
assertEquals(term, term2);
assertEquals(termsEnum.docFreq(), termsEnum2.docFreq());
assertEquals(termsEnum.totalTermFreq(), termsEnum2.totalTermFreq());
if (ord == 0) {
try {
termsEnum.ord();
} catch (UnsupportedOperationException uoe) {
if (VERBOSE) {
System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
}
storeOrd = false;
}
}
final int output;
if (storeOrd) {
output = ord;
} else {
output = termsEnum.docFreq();
}
builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
ord++;
if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
System.out.println(ord + " terms...");
}
}
FST<Long> fst = builder.finish();
if (VERBOSE) {
System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
}
if (ord > 0) {
final Random random = new Random(random().nextLong());
// Now confirm BytesRefFSTEnum and TermsEnum act the
// same:
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
int num = atLeast(1000);
for(int iter=0;iter<num;iter++) {
final BytesRef randomTerm = new BytesRef(getRandomString(random));
if (VERBOSE) {
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
}
final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
if (seekResult == TermsEnum.SeekStatus.END) {
assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
} else {
assertSame(termsEnum, fstEnum, storeOrd);
for(int nextIter=0;nextIter<10;nextIter++) {
if (VERBOSE) {
System.out.println("TEST: next");
if (storeOrd) {
System.out.println(" ord=" + termsEnum.ord());
}
}
if (termsEnum.next() != null) {
if (VERBOSE) {
System.out.println(" term=" + termsEnum.term().utf8ToString());
}
assertNotNull(fstEnum.next());
assertSame(termsEnum, fstEnum, storeOrd);
} else {
if (VERBOSE) {
System.out.println(" end!");
}
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
if (nextResult != null) {
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
fail();
}
break;
}
}
}
}
}
}
r.close();
dir.close();
}
private void assertSame(TermsEnum termsEnum, BytesRefFSTEnum<?> fstEnum, boolean storeOrd) throws Exception {
if (termsEnum.term() == null) {
assertNull(fstEnum.current());
} else {
assertNotNull(fstEnum.current());
assertEquals(termsEnum.term().utf8ToString() + " != " + fstEnum.current().input.utf8ToString(), termsEnum.term(), fstEnum.current().input);
if (storeOrd) {
// fst stored the ord
assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.ord(), ((Long) fstEnum.current().output).longValue());
} else {
// fst stored the docFreq
assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue()));
}
}
}
private static abstract class VisitTerms<T> {
private final Path dirOut;
private final Path wordsFileIn;
private int inputMode;
private final Outputs<T> outputs;
private final Builder<T> builder;
public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) {
this.dirOut = dirOut;
this.wordsFileIn = wordsFileIn;
this.inputMode = inputMode;
this.outputs = outputs;
builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15);
}
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
public void run(int limit, boolean verify, boolean verifyByOutput) throws IOException {
BufferedReader is = Files.newBufferedReader(wordsFileIn, StandardCharsets.UTF_8);
try {
final IntsRefBuilder intsRef = new IntsRefBuilder();
long tStart = System.currentTimeMillis();
int ord = 0;
while(true) {
String w = is.readLine();
if (w == null) {
break;
}
toIntsRef(w, inputMode, intsRef);
builder.add(intsRef.get(),
getOutput(intsRef.get(), ord));
ord++;
if (ord % 500000 == 0) {
System.out.println(
String.format(Locale.ROOT,
"%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord));
}
if (ord >= limit) {
break;
}
}
long tMid = System.currentTimeMillis();
System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms");
assert builder.getTermCount() == ord;
FST<T> fst = builder.finish();
long tEnd = System.currentTimeMillis();
System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack");
if (fst == null) {
System.out.println("FST was fully pruned!");
System.exit(0);
}
if (dirOut == null) {
return;
}
System.out.println(ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs; tot size " + fst.ramBytesUsed());
if (builder.getNodeCount() < 100) {
Writer w = Files.newBufferedWriter(Paths.get("out.dot"), StandardCharsets.UTF_8);
Util.toDot(fst, w, false, false);
w.close();
System.out.println("Wrote FST to out.dot");
}
Directory dir = FSDirectory.open(dirOut);
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
fst.save(out, out);
out.close();
System.out.println("Saved FST to fst.bin.");
if (!verify) {
return;
}
/*
IndexInput in = dir.openInput("fst.bin", IOContext.DEFAULT);
fst = new FST<T>(in, outputs);
in.close();
*/
System.out.println("\nNow verify...");
while(true) {
for(int iter=0;iter<2;iter++) {
is.close();
is = Files.newBufferedReader(wordsFileIn, StandardCharsets.UTF_8);
ord = 0;
tStart = System.currentTimeMillis();
while(true) {
String w = is.readLine();
if (w == null) {
break;
}
toIntsRef(w, inputMode, intsRef);
if (iter == 0) {
T expected = getOutput(intsRef.get(), ord);
T actual = Util.get(fst, intsRef.get());
if (actual == null) {
throw new RuntimeException("unexpected null output on input=" + w);
}
if (!actual.equals(expected)) {
throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
}
} else {
// Get by output
final Long output = (Long) getOutput(intsRef.get(), ord);
@SuppressWarnings("unchecked") final IntsRef actual = Util.getByOutput((FST<Long>) fst, output.longValue());
if (actual == null) {
throw new RuntimeException("unexpected null input from output=" + output);
}
if (!actual.equals(intsRef)) {
throw new RuntimeException("wrong input (got " + actual + " but expected " + intsRef + " from output=" + output);
}
}
ord++;
if (ord % 500000 == 0) {
System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
}
if (ord >= limit) {
break;
}
}
double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
System.out.println("Verify " + (iter == 1 ? "(by output) " : "") + "took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
if (!verifyByOutput) {
break;
}
}
// NOTE: comment out to profile lookup...
break;
}
} finally {
is.close();
}
}
}
// TODO: try experiment: reverse terms before
// compressing -- how much smaller?
// TODO: can FST be used to index all internal substrings,
// mapping to term?
// java -cp ../build/codecs/classes/java:../test-framework/lib/randomizedtesting-runner-*.jar:../build/core/classes/test:../build/core/classes/test-framework:../build/core/classes/java:../build/test-framework/classes/java:../test-framework/lib/junit-4.10.jar org.apache.lucene.util.fst.TestFSTs /xold/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException {
int prune = 0;
int limit = Integer.MAX_VALUE;
int inputMode = 0; // utf8
boolean storeOrds = false;
boolean storeDocFreqs = false;
boolean verify = true;
boolean noArcArrays = false;
Path wordsFileIn = null;
Path dirOut = null;
int idx = 0;
while (idx < args.length) {
if (args[idx].equals("-prune")) {
prune = Integer.parseInt(args[1 + idx]);
idx++;
} else if (args[idx].equals("-limit")) {
limit = Integer.parseInt(args[1 + idx]);
idx++;
} else if (args[idx].equals("-utf8")) {
inputMode = 0;
} else if (args[idx].equals("-utf32")) {
inputMode = 1;
} else if (args[idx].equals("-docFreq")) {
storeDocFreqs = true;
} else if (args[idx].equals("-noArcArrays")) {
noArcArrays = true;
} else if (args[idx].equals("-ords")) {
storeOrds = true;
} else if (args[idx].equals("-noverify")) {
verify = false;
} else if (args[idx].startsWith("-")) {
System.err.println("Unrecognized option: " + args[idx]);
System.exit(-1);
} else {
if (wordsFileIn == null) {
wordsFileIn = Paths.get(args[idx]);
} else if (dirOut == null) {
dirOut = Paths.get(args[idx]);
} else {
System.err.println("Too many arguments, expected: input [output]");
System.exit(-1);
}
}
idx++;
}
if (wordsFileIn == null) {
System.err.println("No input file.");
System.exit(-1);
}
// ord benefits from share, docFreqs don't:
if (storeOrds && storeDocFreqs) {
// Store both ord & docFreq:
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton();
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton();
final PairOutputs<Long,Long> outputs = new PairOutputs<>(o1, o2);
new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
Random rand;
@Override
public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
if (ord == 0) {
rand = new Random(17);
}
return outputs.newPair((long) ord,
(long) TestUtil.nextInt(rand, 1, 5000));
}
}.run(limit, verify, false);
} else if (storeOrds) {
// Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
@Override
public Long getOutput(IntsRef input, int ord) {
return (long) ord;
}
}.run(limit, verify, true);
} else if (storeDocFreqs) {
// Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
Random rand;
@Override
public Long getOutput(IntsRef input, int ord) {
if (ord == 0) {
rand = new Random(17);
}
return (long) TestUtil.nextInt(rand, 1, 5000);
}
}.run(limit, verify, false);
} else {
// Store nothing
final NoOutputs outputs = NoOutputs.getSingleton();
final Object NO_OUTPUT = outputs.getNoOutput();
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
@Override
public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT;
}
}.run(limit, verify, false);
}
}
public void testSingleString() throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRefBuilder()), outputs.getNoOutput());
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(b.finish());
assertNull(fstEnum.seekFloor(new BytesRef("foo")));
assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
}
public void testDuplicateFSAString() throws Exception {
String str = "foobar";
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder ints = new IntsRefBuilder();
for(int i=0; i<10; i++) {
b.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput());
}
FST<Object> fst = b.finish();
// count the input paths
int count = 0;
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(fst);
while(fstEnum.next()!=null) {
count++;
}
assertEquals(1, count);
assertNotNull(Util.get(fst, new BytesRef(str)));
assertNull(Util.get(fst, new BytesRef("foobaz")));
}
/*
public void testTrivial() throws Exception {
// Get outputs -- passing true means FST will share
// (delta code) the outputs. This should result in
// smaller FST if the outputs grow monotonically. But
// if numbers are "random", false should give smaller
// final size:
final NoOutputs outputs = NoOutputs.getSingleton();
String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1,
0, 0,
true,
true,
Integer.MAX_VALUE,
outputs,
null,
true);
Arrays.sort(strings);
final IntsRef scratch = new IntsRef();
for(String s : strings) {
builder.add(Util.toIntsRef(new BytesRef(s), scratch), outputs.getNoOutput());
}
final FST<Object> fst = builder.finish();
System.out.println("DOT before rewrite");
Writer w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/before.dot"));
Util.toDot(fst, w, false, false);
w.close();
final FST<Object> rewrite = new FST<Object>(fst, 1, 100);
System.out.println("DOT after rewrite");
w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/after.dot"));
Util.toDot(rewrite, w, false, false);
w.close();
}
*/
public void testSimple() throws Exception {
// Get outputs -- passing true means FST will share
// (delta code) the outputs. This should result in
// smaller FST if the outputs grow monotonically. But
// if numbers are "random", false should give smaller
// final size:
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
// Build an FST mapping BytesRef -> Long
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final BytesRef a = new BytesRef("a");
final BytesRef b = new BytesRef("b");
final BytesRef c = new BytesRef("c");
builder.add(Util.toIntsRef(a, new IntsRefBuilder()), 17L);
builder.add(Util.toIntsRef(b, new IntsRefBuilder()), 42L);
builder.add(Util.toIntsRef(c, new IntsRefBuilder()), 13824324872317238L);
final FST<Long> fst = builder.finish();
assertEquals(13824324872317238L, (long) Util.get(fst, c));
assertEquals(42, (long) Util.get(fst, b));
assertEquals(17, (long) Util.get(fst, a));
BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
BytesRefFSTEnum.InputOutput<Long> seekResult;
seekResult = fstEnum.seekFloor(a);
assertNotNull(seekResult);
assertEquals(17, (long) seekResult.output);
// goes to a
seekResult = fstEnum.seekFloor(new BytesRef("aa"));
assertNotNull(seekResult);
assertEquals(17, (long) seekResult.output);
// goes to b
seekResult = fstEnum.seekCeil(new BytesRef("aa"));
assertNotNull(seekResult);
assertEquals(b, seekResult.input);
assertEquals(42, (long) seekResult.output);
assertEquals(Util.toIntsRef(new BytesRef("c"), new IntsRefBuilder()),
Util.getByOutput(fst, 13824324872317238L));
assertNull(Util.getByOutput(fst, 47));
assertEquals(Util.toIntsRef(new BytesRef("b"), new IntsRefBuilder()),
Util.getByOutput(fst, 42));
assertEquals(Util.toIntsRef(new BytesRef("a"), new IntsRefBuilder()),
Util.getByOutput(fst, 17));
}
public void testPrimaryKeys() throws Exception {
Directory dir = newDirectory();
for(int cycle=0;cycle<2;cycle++) {
if (VERBOSE) {
System.out.println("TEST: cycle=" + cycle);
}
RandomIndexWriter w = new RandomIndexWriter(random(), dir,
newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
Document doc = new Document();
Field idField = newStringField("id", "", Field.Store.NO);
doc.add(idField);
final int NUM_IDS = atLeast(200);
//final int NUM_IDS = (int) (377 * (1.0+random.nextDouble()));
if (VERBOSE) {
System.out.println("TEST: NUM_IDS=" + NUM_IDS);
}
final Set<String> allIDs = new HashSet<>();
for(int id=0;id<NUM_IDS;id++) {
String idString;
if (cycle == 0) {
// PKs are assigned sequentially
idString = String.format(Locale.ROOT, "%07d", id);
} else {
while(true) {
final String s = Long.toString(random().nextLong());
if (!allIDs.contains(s)) {
idString = s;
break;
}
}
}
allIDs.add(idString);
idField.setStringValue(idString);
w.addDocument(doc);
}
//w.forceMerge(1);
// turn writer into reader:
final IndexReader r = w.getReader();
final IndexSearcher s = newSearcher(r);
w.close();
final List<String> allIDsList = new ArrayList<>(allIDs);
final List<String> sortedAllIDsList = new ArrayList<>(allIDsList);
Collections.sort(sortedAllIDsList);
// Sprinkle in some non-existent PKs:
Set<String> outOfBounds = new HashSet<>();
for(int idx=0;idx<NUM_IDS/10;idx++) {
String idString;
if (cycle == 0) {
idString = String.format(Locale.ROOT, "%07d", (NUM_IDS + idx));
} else {
while(true) {
idString = Long.toString(random().nextLong());
if (!allIDs.contains(idString)) {
break;
}
}
}
outOfBounds.add(idString);
allIDsList.add(idString);
}
// Verify w/ TermQuery
for(int iter=0;iter<2*NUM_IDS;iter++) {
final String id = allIDsList.get(random().nextInt(allIDsList.size()));
final boolean exists = !outOfBounds.contains(id);
if (VERBOSE) {
System.out.println("TEST: TermQuery " + (exists ? "" : "non-exist ") + " id=" + id);
}
assertEquals((exists ? "" : "non-exist ") + "id=" + id, exists ? 1 : 0, s.count(new TermQuery(new Term("id", id))));
}
// Verify w/ MultiTermsEnum
final TermsEnum termsEnum = MultiTerms.getTerms(r, "id").iterator();
for(int iter=0;iter<2*NUM_IDS;iter++) {
final String id;
final String nextID;
final boolean exists;
if (random().nextBoolean()) {
id = allIDsList.get(random().nextInt(allIDsList.size()));
exists = !outOfBounds.contains(id);
nextID = null;
if (VERBOSE) {
System.out.println("TEST: exactOnly " + (exists ? "" : "non-exist ") + "id=" + id);
}
} else {
// Pick ID between two IDs:
exists = false;
final int idv = random().nextInt(NUM_IDS-1);
if (cycle == 0) {
id = String.format(Locale.ROOT, "%07da", idv);
nextID = String.format(Locale.ROOT, "%07d", idv+1);
} else {
id = sortedAllIDsList.get(idv) + "a";
nextID = sortedAllIDsList.get(idv+1);
}
if (VERBOSE) {
System.out.println("TEST: not exactOnly id=" + id + " nextID=" + nextID);
}
}
final TermsEnum.SeekStatus status;
if (nextID == null) {
if (termsEnum.seekExact(new BytesRef(id))) {
status = TermsEnum.SeekStatus.FOUND;
} else {
status = TermsEnum.SeekStatus.NOT_FOUND;
}
} else {
status = termsEnum.seekCeil(new BytesRef(id));
}
if (nextID != null) {
assertEquals(TermsEnum.SeekStatus.NOT_FOUND, status);
assertEquals("expected=" + nextID + " actual=" + termsEnum.term().utf8ToString(), new BytesRef(nextID), termsEnum.term());
} else if (!exists) {
assertTrue(status == TermsEnum.SeekStatus.NOT_FOUND ||
status == TermsEnum.SeekStatus.END);
} else {
assertEquals(TermsEnum.SeekStatus.FOUND, status);
}
}
r.close();
}
dir.close();
}
public void testRandomTermLookup() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir,
newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
Document doc = new Document();
Field f = newStringField("field", "", Field.Store.NO);
doc.add(f);
final int NUM_TERMS = (int) (1000*RANDOM_MULTIPLIER * (1+random().nextDouble()));
if (VERBOSE) {
System.out.println("TEST: NUM_TERMS=" + NUM_TERMS);
}
final Set<String> allTerms = new HashSet<>();
while(allTerms.size() < NUM_TERMS) {
allTerms.add(simpleRandomString(random()));
}
for(String term : allTerms) {
f.setStringValue(term);
w.addDocument(doc);
}
// turn writer into reader:
if (VERBOSE) {
System.out.println("TEST: get reader");
}
IndexReader r = w.getReader();
if (VERBOSE) {
System.out.println("TEST: got reader=" + r);
}
IndexSearcher s = newSearcher(r);
w.close();
final List<String> allTermsList = new ArrayList<>(allTerms);
Collections.shuffle(allTermsList, random());
// verify exact lookup
for(String term : allTermsList) {
if (VERBOSE) {
System.out.println("TEST: term=" + term);
}
assertEquals("term=" + term, 1, s.count(new TermQuery(new Term("field", term))));
}
r.close();
dir.close();
}
/**
* Test state expansion (array format) on close-to-root states. Creates
* synthetic input that has one expanded state on each level.
*
* @see <a href="https://issues.apache.org/jira/browse/LUCENE-2933">LUCENE-2933</a>
*/
public void testExpandedCloseToRoot() throws Exception {
class SyntheticData {
FST<Object> compile(String[] lines) throws IOException {
final NoOutputs outputs = NoOutputs.getSingleton();
final Object nothing = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
int line = 0;
final BytesRefBuilder term = new BytesRefBuilder();
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
while (line < lines.length) {
String w = lines[line++];
if (w == null) {
break;
}
term.copyChars(w);
b.add(Util.toIntsRef(term.get(), scratchIntsRef), nothing);
}
return b.finish();
}
void generate(ArrayList<String> out, StringBuilder b, char from, char to,
int depth) {
if (depth == 0 || from == to) {
String seq = b.toString() + "_" + out.size() + "_end";
out.add(seq);
} else {
for (char c = from; c <= to; c++) {
b.append(c);
generate(out, b, from, c == to ? to : from, depth - 1);
b.deleteCharAt(b.length() - 1);
}
}
}
public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth)
throws IOException {
if (FST.targetHasArcs(arc)) {
int childCount = 0;
BytesReader fstReader = fst.getBytesReader();
for (arc = fst.readFirstTargetArc(arc, arc, fstReader);;
arc = fst.readNextArc(arc, fstReader), childCount++)
{
boolean expanded = fst.isExpandedTarget(arc, fstReader);
int children = verifyStateAndBelow(fst, new FST.Arc<>().copyFrom(arc), depth + 1);
assertEquals(
(depth <= FST.FIXED_LENGTH_ARC_SHALLOW_DEPTH &&
children >= FST.FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
children >= FST.FIXED_LENGTH_ARC_DEEP_NUM_ARCS,
expanded);
if (arc.isLast()) break;
}
return childCount;
}
return 0;
}
}
// Sanity check.
assertTrue(FST.FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS < FST.FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
assertTrue(FST.FIXED_LENGTH_ARC_SHALLOW_DEPTH >= 0);
SyntheticData s = new SyntheticData();
ArrayList<String> out = new ArrayList<>();
StringBuilder b = new StringBuilder();
s.generate(out, b, 'a', 'i', 10);
String[] input = out.toArray(new String[out.size()]);
Arrays.sort(input);
FST<Object> fst = s.compile(input);
FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<>());
s.verifyStateAndBelow(fst, arc, 1);
}
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
final FST<Long> fst = builder.finish();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false);
w.close();
//System.out.println(w.toString());
assertTrue(w.toString().indexOf("label=\"t/[7]\"") != -1);
}
public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
final FST<Long> fst = builder.finish();
StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
Util.toDot(fst, w, false, false);
w.close();
//System.out.println(w.toString());
// check for accept state at label t
assertTrue(w.toString().indexOf("[label=\"t\" style=\"bold\"") != -1);
// check for accept state at label n
assertTrue(w.toString().indexOf("[label=\"n\" style=\"bold\"") != -1);
}
// Make sure raw FST can differentiate between final vs
// non-final end nodes
public void testNonFinalStopNode() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Long nothing = outputs.getNoOutput();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
//final FST<Long> fst = new FST<>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT, 15);
final FST<Long> fst = b.fst;
final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<>(b, 0);
// Add final stop node
{
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0);
node.isFinal = true;
rootNode.addArc('a', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode();
frozen.node = fst.addNode(b, node);
rootNode.arcs[0].nextFinalOutput = 17L;
rootNode.arcs[0].isFinal = true;
rootNode.arcs[0].output = nothing;
rootNode.arcs[0].target = frozen;
}
// Add non-final stop node
{
final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<>(b, 0);
rootNode.addArc('b', node);
final Builder.CompiledNode frozen = new Builder.CompiledNode();
frozen.node = fst.addNode(b, node);
rootNode.arcs[1].nextFinalOutput = nothing;
rootNode.arcs[1].output = 42L;
rootNode.arcs[1].target = frozen;
}
fst.finish(fst.addNode(b, rootNode));
StringWriter w = new StringWriter();
//Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
Util.toDot(fst, w, false, false);
w.close();
checkStopNodes(fst, outputs);
// Make sure it still works after save/load:
Directory dir = newDirectory();
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
final FST<Long> fst2 = new FST<>(in, in, outputs);
checkStopNodes(fst2, outputs);
in.close();
dir.close();
}
private void checkStopNodes(FST<Long> fst, PositiveIntOutputs outputs) throws Exception {
final Long nothing = outputs.getNoOutput();
FST.Arc<Long> startArc = fst.getFirstArc(new FST.Arc<Long>());
assertEquals(nothing, startArc.output());
assertEquals(nothing, startArc.nextFinalOutput());
FST.Arc<Long> arc = fst.readFirstTargetArc(startArc, new FST.Arc<Long>(),
fst.getBytesReader());
assertEquals('a', arc.label());
assertEquals(17, arc.nextFinalOutput().longValue());
assertTrue(arc.isFinal());
arc = fst.readNextArc(arc, fst.getBytesReader());
assertEquals('b', arc.label());
assertFalse(arc.isFinal());
assertEquals(42, arc.output().longValue());
}
static final Comparator<Long> minLongComparator = new Comparator<Long> () {
@Override
public int compare(Long left, Long right) {
return left.compareTo(right);
}
};
public void testShortestPaths() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = builder.finish();
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
Util.TopResults<Long> res = Util.shortestPaths(fst,
fst.getFirstArc(new FST.Arc<Long>()),
outputs.getNoOutput(),
minLongComparator,
3,
true);
assertTrue(res.isComplete);
assertEquals(3, res.topN.size());
assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), res.topN.get(0).input);
assertEquals(7L, res.topN.get(0).output.longValue());
assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), res.topN.get(1).input);
assertEquals(17L,res.topN.get(1).output.longValue());
assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), res.topN.get(2).input);
assertEquals(22L, res.topN.get(2).output.longValue());
}
public void testRejectNoLimits() throws IOException {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), 22L);
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), 7L);
builder.add(Util.toIntsRef(new BytesRef("adcd"), scratch), 17L);
builder.add(Util.toIntsRef(new BytesRef("adcde"), scratch), 17L);
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), 17L);
final FST<Long> fst = builder.finish();
final AtomicInteger rejectCount = new AtomicInteger();
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, 2, 6, minLongComparator) {
@Override
protected boolean acceptResult(IntsRef input, Long output) {
boolean accept = output.intValue() == 7;
if (!accept) {
rejectCount.incrementAndGet();
}
return accept;
}
};
searcher.addStartPaths(fst.getFirstArc(new FST.Arc<Long>()), outputs.getNoOutput(), true, new IntsRefBuilder());
Util.TopResults<Long> res = searcher.search();
assertEquals(rejectCount.get(), 4);
assertTrue(res.isComplete); // rejected(4) + topN(2) <= maxQueueSize(6)
assertEquals(1, res.topN.size());
assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), res.topN.get(0).input);
assertEquals(7L, res.topN.get(0).output.longValue());
rejectCount.set(0);
searcher = new Util.TopNSearcher<Long>(fst, 2, 5, minLongComparator) {
@Override
protected boolean acceptResult(IntsRef input, Long output) {
boolean accept = output.intValue() == 7;
if (!accept) {
rejectCount.incrementAndGet();
}
return accept;
}
};
searcher.addStartPaths(fst.getFirstArc(new FST.Arc<Long>()), outputs.getNoOutput(), true, new IntsRefBuilder());
res = searcher.search();
assertEquals(rejectCount.get(), 4);
assertFalse(res.isComplete); // rejected(4) + topN(2) > maxQueueSize(5)
}
// compares just the weight side of the pair
static final Comparator<Pair<Long,Long>> minPairWeightComparator = new Comparator<Pair<Long,Long>> () {
@Override
public int compare(Pair<Long,Long> left, Pair<Long,Long> right) {
return left.output1.compareTo(right.output1);
}
};
/** like testShortestPaths, but uses pairoutputs so we have both a weight and an output */
public void testShortestPathsWFST() throws Exception {
PairOutputs<Long,Long> outputs = new PairOutputs<>(
PositiveIntOutputs.getSingleton(), // weight
PositiveIntOutputs.getSingleton() // output
);
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L));
builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L));
builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L));
final FST<Pair<Long,Long>> fst = builder.finish();
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
Util.TopResults<Pair<Long,Long>> res = Util.shortestPaths(fst,
fst.getFirstArc(new FST.Arc<Pair<Long,Long>>()),
outputs.getNoOutput(),
minPairWeightComparator,
3,
true);
assertTrue(res.isComplete);
assertEquals(3, res.topN.size());
assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), res.topN.get(0).input);
assertEquals(7L, res.topN.get(0).output.output1.longValue()); // weight
assertEquals(36L, res.topN.get(0).output.output2.longValue()); // output
assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), res.topN.get(1).input);
assertEquals(17L, res.topN.get(1).output.output1.longValue()); // weight
assertEquals(85L, res.topN.get(1).output.output2.longValue()); // output
assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), res.topN.get(2).input);
assertEquals(22L, res.topN.get(2).output.output1.longValue()); // weight
assertEquals(57L, res.topN.get(2).output.output2.longValue()); // output
}
public void testShortestPathsRandom() throws Exception {
final Random random = random();
int numWords = atLeast(1000);
final TreeMap<String,Long> slowCompletor = new TreeMap<>();
final TreeSet<String> allPrefixes = new TreeSet<>();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
for (int i = 0; i < numWords; i++) {
String s;
while (true) {
s = TestUtil.randomSimpleString(random);
if (!slowCompletor.containsKey(s)) {
break;
}
}
for (int j = 1; j < s.length(); j++) {
allPrefixes.add(s.substring(0, j));
}
int weight = TestUtil.nextInt(random, 1, 100); // weights 1..100
slowCompletor.put(s, (long)weight);
}
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
//System.out.println("add: " + e);
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), e.getValue());
}
final FST<Long> fst = builder.finish();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
BytesReader reader = fst.getBytesReader();
//System.out.println("testing: " + allPrefixes.size() + " prefixes");
for (String prefix : allPrefixes) {
// 1. run prefix against fst, then complete by value
//System.out.println("TEST: " + prefix);
long prefixOutput = 0;
FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>());
for(int idx=0;idx<prefix.length();idx++) {
if (fst.findTargetArc((int) prefix.charAt(idx), arc, arc, reader) == null) {
fail();
}
prefixOutput += arc.output();
}
final int topN = TestUtil.nextInt(random, 1, 10);
Util.TopResults<Long> r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minLongComparator, topN, true);
assertTrue(r.isComplete);
// 2. go thru whole treemap (slowCompletor) and check it's actually the best suggestion
final List<Result<Long>> matches = new ArrayList<>();
// TODO: could be faster... but it's slowCompletor for a reason
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
if (e.getKey().startsWith(prefix)) {
//System.out.println(" consider " + e.getKey());
matches.add(new Result<>(Util.toIntsRef(new BytesRef(e.getKey().substring(prefix.length())), new IntsRefBuilder()),
e.getValue() - prefixOutput));
}
}
assertTrue(matches.size() > 0);
Collections.sort(matches, new TieBreakByInputComparator<>(minLongComparator));
if (matches.size() > topN) {
matches.subList(topN, matches.size()).clear();
}
assertEquals(matches.size(), r.topN.size());
for(int hit=0;hit<r.topN.size();hit++) {
//System.out.println(" check hit " + hit);
assertEquals(matches.get(hit).input, r.topN.get(hit).input);
assertEquals(matches.get(hit).output, r.topN.get(hit).output);
}
}
}
private static class TieBreakByInputComparator<T> implements Comparator<Result<T>> {
private final Comparator<T> comparator;
public TieBreakByInputComparator(Comparator<T> comparator) {
this.comparator = comparator;
}
@Override
public int compare(Result<T> a, Result<T> b) {
int cmp = comparator.compare(a.output, b.output);
if (cmp == 0) {
return a.input.compareTo(b.input);
} else {
return cmp;
}
}
}
// used by slowcompletor
static class TwoLongs {
long a;
long b;
TwoLongs(long a, long b) {
this.a = a;
this.b = b;
}
}
/** like testShortestPathsRandom, but uses pairoutputs so we have both a weight and an output */
public void testShortestPathsWFSTRandom() throws Exception {
int numWords = atLeast(1000);
final TreeMap<String,TwoLongs> slowCompletor = new TreeMap<>();
final TreeSet<String> allPrefixes = new TreeSet<>();
PairOutputs<Long,Long> outputs = new PairOutputs<>(
PositiveIntOutputs.getSingleton(), // weight
PositiveIntOutputs.getSingleton() // output
);
final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
Random random = random();
for (int i = 0; i < numWords; i++) {
String s;
while (true) {
s = TestUtil.randomSimpleString(random);
if (!slowCompletor.containsKey(s)) {
break;
}
}
for (int j = 1; j < s.length(); j++) {
allPrefixes.add(s.substring(0, j));
}
int weight = TestUtil.nextInt(random, 1, 100); // weights 1..100
int output = TestUtil.nextInt(random, 0, 500); // outputs 0..500
slowCompletor.put(s, new TwoLongs(weight, output));
}
for (Map.Entry<String,TwoLongs> e : slowCompletor.entrySet()) {
//System.out.println("add: " + e);
long weight = e.getValue().a;
long output = e.getValue().b;
builder.add(Util.toIntsRef(new BytesRef(e.getKey()), scratch), outputs.newPair(weight, output));
}
final FST<Pair<Long,Long>> fst = builder.finish();
//System.out.println("SAVE out.dot");
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
//Util.toDot(fst, w, false, false);
//w.close();
BytesReader reader = fst.getBytesReader();
//System.out.println("testing: " + allPrefixes.size() + " prefixes");
for (String prefix : allPrefixes) {
// 1. run prefix against fst, then complete by value
//System.out.println("TEST: " + prefix);
Pair<Long,Long> prefixOutput = outputs.getNoOutput();
FST.Arc<Pair<Long,Long>> arc = fst.getFirstArc(new FST.Arc<Pair<Long,Long>>());
for(int idx=0;idx<prefix.length();idx++) {
if (fst.findTargetArc((int) prefix.charAt(idx), arc, arc, reader) == null) {
fail();
}
prefixOutput = outputs.add(prefixOutput, arc.output());
}
final int topN = TestUtil.nextInt(random, 1, 10);
Util.TopResults<Pair<Long,Long>> r = Util.shortestPaths(fst, arc, fst.outputs.getNoOutput(), minPairWeightComparator, topN, true);
assertTrue(r.isComplete);
// 2. go thru whole treemap (slowCompletor) and check it's actually the best suggestion
final List<Result<Pair<Long,Long>>> matches = new ArrayList<>();
// TODO: could be faster... but it's slowCompletor for a reason
for (Map.Entry<String,TwoLongs> e : slowCompletor.entrySet()) {
if (e.getKey().startsWith(prefix)) {
//System.out.println(" consider " + e.getKey());
matches.add(new Result<>(Util.toIntsRef(new BytesRef(e.getKey().substring(prefix.length())), new IntsRefBuilder()),
outputs.newPair(e.getValue().a - prefixOutput.output1, e.getValue().b - prefixOutput.output2)));
}
}
assertTrue(matches.size() > 0);
Collections.sort(matches, new TieBreakByInputComparator<>(minPairWeightComparator));
if (matches.size() > topN) {
matches.subList(topN, matches.size()).clear();
}
assertEquals(matches.size(), r.topN.size());
for(int hit=0;hit<r.topN.size();hit++) {
//System.out.println(" check hit " + hit);
assertEquals(matches.get(hit).input, r.topN.get(hit).input);
assertEquals(matches.get(hit).output, r.topN.get(hit).output);
}
}
}
public void testLargeOutputsOnArrayArcs() throws Exception {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
final byte[] bytes = new byte[300];
final IntsRefBuilder input = new IntsRefBuilder();
input.append(0);
final BytesRef output = new BytesRef(bytes);
for(int arc=0;arc<6;arc++) {
input.setIntAt(0, arc);
output.bytes[0] = (byte) arc;
builder.add(input.get(), BytesRef.deepCopyOf(output));
}
final FST<BytesRef> fst = builder.finish();
for(int arc=0;arc<6;arc++) {
input.setIntAt(0, arc);
final BytesRef result = Util.get(fst, input.get());
assertNotNull(result);
assertEquals(300, result.length);
assertEquals(result.bytes[result.offset], arc);
for(int byteIDX=1;byteIDX<result.length;byteIDX++) {
assertEquals(0, result.bytes[result.offset+byteIDX]);
}
}
}
public void testIllegallyModifyRootArc() throws Exception {
assumeTrue("test relies on assertions", assertsAreEnabled);
Set<BytesRef> terms = new HashSet<>();
for(int i=0;i<100;i++) {
String prefix = Character.toString((char) ('a' + i));
terms.add(new BytesRef(prefix));
if (prefix.equals("m") == false) {
for(int j=0;j<20;j++) {
// Make a big enough FST that the root cache will be created:
String suffix = TestUtil.randomRealisticUnicodeString(random(), 10, 20);
terms.add(new BytesRef(prefix + suffix));
}
}
}
List<BytesRef> termsList = new ArrayList<>(terms);
Collections.sort(termsList);
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRefBuilder input = new IntsRefBuilder();
for(BytesRef term : termsList) {
Util.toIntsRef(term, input);
builder.add(input.get(), term);
}
FST<BytesRef> fst = builder.finish();
Arc<BytesRef> arc = new FST.Arc<>();
fst.getFirstArc(arc);
FST.BytesReader reader = fst.getBytesReader();
arc = fst.findTargetArc((int) 'm', arc, arc, reader);
assertNotNull(arc);
assertEquals(new BytesRef("m"), arc.output());
// NOTE: illegal:
arc.output().length = 0;
fst.getFirstArc(arc);
try {
arc = fst.findTargetArc((int) 'm', arc, arc, reader);
} catch (AssertionError ae) {
// expected
}
}
public void testSimpleDepth() throws Exception {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
BytesRef ab = new BytesRef("ab");
BytesRef ac = new BytesRef("ac");
BytesRef bd = new BytesRef("bd");
builder.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
builder.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
builder.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
FST<Long> fst = builder.finish();
assertEquals(3, (long) Util.get(fst, ab));
assertEquals(5, (long) Util.get(fst, ac));
assertEquals(7, (long) Util.get(fst, bd));
}
}