blob: 8c9c309609874e1e994c637efe6729589e1ac89b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import java.util.Arrays;
import java.util.Random;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TimeUnits;
import org.junit.Ignore;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
@Ignore("Requires tons of heap to run (30 GB hits OOME but 35 GB passes after ~4.5 hours)")
@TimeoutSuite(millis = 100 * TimeUnits.HOUR)
public class Test2BFST extends LuceneTestCase {
private static long LIMIT = 3L*1024*1024*1024;
public void test() throws Exception {
assumeWorkingMMapOnWindows();
int[] ints = new int[7];
IntsRef input = new IntsRef(ints, 0, ints.length);
long seed = random().nextLong();
Directory dir = new MMapDirectory(createTempDir("2BFST"));
for(int iter=0;iter<1;iter++) {
// Build FST w/ NoOutputs and stop when nodeCount > 2.2B
{
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
Outputs<Object> outputs = NoOutputs.getSingleton();
Object NO_OUTPUT = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15);
int count = 0;
Random r = new Random(seed);
int[] ints2 = new int[200];
IntsRef input2 = new IntsRef(ints2, 0, ints2.length);
while(true) {
//System.out.println("add: " + input + " -> " + output);
for(int i=10;i<ints2.length;i++) {
ints2[i] = r.nextInt(256);
}
b.add(input2, NO_OUTPUT);
count++;
if (count % 100000 == 0) {
System.out.println(count + ": " + b.fstRamBytesUsed() + " bytes; " + b.getNodeCount() + " nodes");
}
if (b.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
break;
}
nextInput(r, ints2);
}
FST<Object> fst = b.finish();
for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
Arrays.fill(ints2, 0);
r = new Random(seed);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
}
for(int j=10;j<ints2.length;j++) {
ints2[j] = r.nextInt(256);
}
assertEquals(NO_OUTPUT, Util.get(fst, input2));
nextInput(r, ints2);
}
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<>(fst);
Arrays.fill(ints2, 0);
r = new Random(seed);
int upto = 0;
while(true) {
IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
if (pair == null) {
break;
}
for(int j=10;j<ints2.length;j++) {
ints2[j] = r.nextInt(256);
}
assertEquals(input2, pair.input);
assertEquals(NO_OUTPUT, pair.output);
upto++;
nextInput(r, ints2);
}
assertEquals(count, upto);
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<>(in, in, outputs);
in.close();
} else {
dir.deleteFile("fst");
}
}
}
// Build FST w/ ByteSequenceOutputs and stop when FST
// size = 3GB
{
System.out.println("\nTEST: 3 GB size; outputs=bytes");
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15);
byte[] outputBytes = new byte[20];
BytesRef output = new BytesRef(outputBytes);
Arrays.fill(ints, 0);
int count = 0;
Random r = new Random(seed);
while(true) {
r.nextBytes(outputBytes);
//System.out.println("add: " + input + " -> " + output);
b.add(input, BytesRef.deepCopyOf(output));
count++;
if (count % 10000 == 0) {
long size = b.fstRamBytesUsed();
if (count % 1000000 == 0) {
System.out.println(count + "...: " + size + " bytes");
}
if (size > LIMIT) {
break;
}
}
nextInput(r, ints);
}
FST<BytesRef> fst = b.finish();
for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
r = new Random(seed);
Arrays.fill(ints, 0);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
}
r.nextBytes(outputBytes);
assertEquals(output, Util.get(fst, input));
nextInput(r, ints);
}
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<>(fst);
Arrays.fill(ints, 0);
r = new Random(seed);
int upto = 0;
while(true) {
IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
if (pair == null) {
break;
}
assertEquals(input, pair.input);
r.nextBytes(outputBytes);
assertEquals(output, pair.output);
upto++;
nextInput(r, ints);
}
assertEquals(count, upto);
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<>(in, in, outputs);
in.close();
} else {
dir.deleteFile("fst");
}
}
}
// Build FST w/ PositiveIntOutputs and stop when FST
// size = 3GB
{
System.out.println("\nTEST: 3 GB size; outputs=long");
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
true, 15);
long output = 1;
Arrays.fill(ints, 0);
int count = 0;
Random r = new Random(seed);
while(true) {
//System.out.println("add: " + input + " -> " + output);
b.add(input, output);
output += 1+r.nextInt(10);
count++;
if (count % 10000 == 0) {
long size = b.fstRamBytesUsed();
if (count % 1000000 == 0) {
System.out.println(count + "...: " + size + " bytes");
}
if (size > LIMIT) {
break;
}
}
nextInput(r, ints);
}
FST<Long> fst = b.finish();
for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.ramBytesUsed() + "; nodeCount=" + b.getNodeCount() + "; arcCount=" + b.getArcCount() + "]");
Arrays.fill(ints, 0);
output = 1;
r = new Random(seed);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
}
// forward lookup:
assertEquals(output, Util.get(fst, input).longValue());
// reverse lookup:
assertEquals(input, Util.getByOutput(fst, output));
output += 1 + r.nextInt(10);
nextInput(r, ints);
}
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
Arrays.fill(ints, 0);
r = new Random(seed);
int upto = 0;
output = 1;
while(true) {
IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
if (pair == null) {
break;
}
assertEquals(input, pair.input);
assertEquals(output, pair.output.longValue());
output += 1 + r.nextInt(10);
upto++;
nextInput(r, ints);
}
assertEquals(count, upto);
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<>(in, in, outputs);
in.close();
} else {
dir.deleteFile("fst");
}
}
}
}
dir.close();
}
private void nextInput(Random r, int[] ints) {
int downTo = 6;
while(downTo >= 0) {
// Must add random amounts (and not just 1) because
// otherwise FST outsmarts us and remains tiny:
ints[downTo] += 1+r.nextInt(10);
if (ints[downTo] < 256) {
break;
} else {
ints[downTo] = 0;
downTo--;
}
}
}
}