| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util.fst; |
| |
| import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; |
| import java.util.Arrays; |
| import java.util.Random; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.IOContext; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.store.MMapDirectory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TimeUnits; |
| import org.junit.Ignore; |
| |
| @Ignore("Requires tons of heap to run (30 GB hits OOME but 35 GB passes after ~4.5 hours)") |
| @TimeoutSuite(millis = 100 * TimeUnits.HOUR) |
| public class Test2BFST extends LuceneTestCase { |
| |
| private static long LIMIT = 3L * 1024 * 1024 * 1024; |
| |
| public void test() throws Exception { |
| assumeWorkingMMapOnWindows(); |
| |
| int[] ints = new int[7]; |
| IntsRef input = new IntsRef(ints, 0, ints.length); |
| long seed = random().nextLong(); |
| |
| Directory dir = new MMapDirectory(createTempDir("2BFST")); |
| |
| for (int iter = 0; iter < 1; iter++) { |
| // Build FST w/ NoOutputs and stop when nodeCount > 2.2B |
| { |
| System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS"); |
| Outputs<Object> outputs = NoOutputs.getSingleton(); |
| Object NO_OUTPUT = outputs.getNoOutput(); |
| final FSTCompiler<Object> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| int count = 0; |
| Random r = new Random(seed); |
| int[] ints2 = new int[200]; |
| IntsRef input2 = new IntsRef(ints2, 0, ints2.length); |
| while (true) { |
| // System.out.println("add: " + input + " -> " + output); |
| for (int i = 10; i < ints2.length; i++) { |
| ints2[i] = r.nextInt(256); |
| } |
| fstCompiler.add(input2, NO_OUTPUT); |
| count++; |
| if (count % 100000 == 0) { |
| System.out.println( |
| count |
| + ": " |
| + fstCompiler.fstRamBytesUsed() |
| + " bytes; " |
| + fstCompiler.getNodeCount() |
| + " nodes"); |
| } |
| if (fstCompiler.getNodeCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) { |
| break; |
| } |
| nextInput(r, ints2); |
| } |
| |
| FST<Object> fst = fstCompiler.compile(); |
| |
| for (int verify = 0; verify < 2; verify++) { |
| System.out.println( |
| "\nTEST: now verify [fst size=" |
| + fst.ramBytesUsed() |
| + "; nodeCount=" |
| + fstCompiler.getNodeCount() |
| + "; arcCount=" |
| + fstCompiler.getArcCount() |
| + "]"); |
| |
| Arrays.fill(ints2, 0); |
| r = new Random(seed); |
| |
| for (int i = 0; i < count; i++) { |
| if (i % 1000000 == 0) { |
| System.out.println(i + "...: "); |
| } |
| for (int j = 10; j < ints2.length; j++) { |
| ints2[j] = r.nextInt(256); |
| } |
| assertEquals(NO_OUTPUT, Util.get(fst, input2)); |
| nextInput(r, ints2); |
| } |
| |
| System.out.println("\nTEST: enum all input/outputs"); |
| IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<>(fst); |
| |
| Arrays.fill(ints2, 0); |
| r = new Random(seed); |
| int upto = 0; |
| while (true) { |
| IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next(); |
| if (pair == null) { |
| break; |
| } |
| for (int j = 10; j < ints2.length; j++) { |
| ints2[j] = r.nextInt(256); |
| } |
| assertEquals(input2, pair.input); |
| assertEquals(NO_OUTPUT, pair.output); |
| upto++; |
| nextInput(r, ints2); |
| } |
| assertEquals(count, upto); |
| |
| if (verify == 0) { |
| System.out.println("\nTEST: save/load FST and re-verify"); |
| IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT); |
| fst.save(out, out); |
| out.close(); |
| IndexInput in = dir.openInput("fst", IOContext.DEFAULT); |
| fst = new FST<>(in, in, outputs); |
| in.close(); |
| } else { |
| dir.deleteFile("fst"); |
| } |
| } |
| } |
| |
| // Build FST w/ ByteSequenceOutputs and stop when FST |
| // size = 3GB |
| { |
| System.out.println("\nTEST: 3 GB size; outputs=bytes"); |
| Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton(); |
| final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| byte[] outputBytes = new byte[20]; |
| BytesRef output = new BytesRef(outputBytes); |
| Arrays.fill(ints, 0); |
| int count = 0; |
| Random r = new Random(seed); |
| while (true) { |
| r.nextBytes(outputBytes); |
| // System.out.println("add: " + input + " -> " + output); |
| fstCompiler.add(input, BytesRef.deepCopyOf(output)); |
| count++; |
| if (count % 10000 == 0) { |
| long size = fstCompiler.fstRamBytesUsed(); |
| if (count % 1000000 == 0) { |
| System.out.println(count + "...: " + size + " bytes"); |
| } |
| if (size > LIMIT) { |
| break; |
| } |
| } |
| nextInput(r, ints); |
| } |
| |
| FST<BytesRef> fst = fstCompiler.compile(); |
| for (int verify = 0; verify < 2; verify++) { |
| |
| System.out.println( |
| "\nTEST: now verify [fst size=" |
| + fst.ramBytesUsed() |
| + "; nodeCount=" |
| + fstCompiler.getNodeCount() |
| + "; arcCount=" |
| + fstCompiler.getArcCount() |
| + "]"); |
| |
| r = new Random(seed); |
| Arrays.fill(ints, 0); |
| |
| for (int i = 0; i < count; i++) { |
| if (i % 1000000 == 0) { |
| System.out.println(i + "...: "); |
| } |
| r.nextBytes(outputBytes); |
| assertEquals(output, Util.get(fst, input)); |
| nextInput(r, ints); |
| } |
| |
| System.out.println("\nTEST: enum all input/outputs"); |
| IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<>(fst); |
| |
| Arrays.fill(ints, 0); |
| r = new Random(seed); |
| int upto = 0; |
| while (true) { |
| IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next(); |
| if (pair == null) { |
| break; |
| } |
| assertEquals(input, pair.input); |
| r.nextBytes(outputBytes); |
| assertEquals(output, pair.output); |
| upto++; |
| nextInput(r, ints); |
| } |
| assertEquals(count, upto); |
| |
| if (verify == 0) { |
| System.out.println("\nTEST: save/load FST and re-verify"); |
| IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT); |
| fst.save(out, out); |
| out.close(); |
| IndexInput in = dir.openInput("fst", IOContext.DEFAULT); |
| fst = new FST<>(in, in, outputs); |
| in.close(); |
| } else { |
| dir.deleteFile("fst"); |
| } |
| } |
| } |
| |
| // Build FST w/ PositiveIntOutputs and stop when FST |
| // size = 3GB |
| { |
| System.out.println("\nTEST: 3 GB size; outputs=long"); |
| Outputs<Long> outputs = PositiveIntOutputs.getSingleton(); |
| final FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| long output = 1; |
| |
| Arrays.fill(ints, 0); |
| int count = 0; |
| Random r = new Random(seed); |
| while (true) { |
| // System.out.println("add: " + input + " -> " + output); |
| fstCompiler.add(input, output); |
| output += 1 + r.nextInt(10); |
| count++; |
| if (count % 10000 == 0) { |
| long size = fstCompiler.fstRamBytesUsed(); |
| if (count % 1000000 == 0) { |
| System.out.println(count + "...: " + size + " bytes"); |
| } |
| if (size > LIMIT) { |
| break; |
| } |
| } |
| nextInput(r, ints); |
| } |
| |
| FST<Long> fst = fstCompiler.compile(); |
| |
| for (int verify = 0; verify < 2; verify++) { |
| |
| System.out.println( |
| "\nTEST: now verify [fst size=" |
| + fst.ramBytesUsed() |
| + "; nodeCount=" |
| + fstCompiler.getNodeCount() |
| + "; arcCount=" |
| + fstCompiler.getArcCount() |
| + "]"); |
| |
| Arrays.fill(ints, 0); |
| |
| output = 1; |
| r = new Random(seed); |
| for (int i = 0; i < count; i++) { |
| if (i % 1000000 == 0) { |
| System.out.println(i + "...: "); |
| } |
| |
| // forward lookup: |
| assertEquals(output, Util.get(fst, input).longValue()); |
| // reverse lookup: |
| @SuppressWarnings("deprecation") |
| IntsRef inputResult = Util.getByOutput(fst, output); |
| assertEquals(input, inputResult); |
| output += 1 + r.nextInt(10); |
| nextInput(r, ints); |
| } |
| |
| System.out.println("\nTEST: enum all input/outputs"); |
| IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst); |
| |
| Arrays.fill(ints, 0); |
| r = new Random(seed); |
| int upto = 0; |
| output = 1; |
| while (true) { |
| IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next(); |
| if (pair == null) { |
| break; |
| } |
| assertEquals(input, pair.input); |
| assertEquals(output, pair.output.longValue()); |
| output += 1 + r.nextInt(10); |
| upto++; |
| nextInput(r, ints); |
| } |
| assertEquals(count, upto); |
| |
| if (verify == 0) { |
| System.out.println("\nTEST: save/load FST and re-verify"); |
| IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT); |
| fst.save(out, out); |
| out.close(); |
| IndexInput in = dir.openInput("fst", IOContext.DEFAULT); |
| fst = new FST<>(in, in, outputs); |
| in.close(); |
| } else { |
| dir.deleteFile("fst"); |
| } |
| } |
| } |
| } |
| dir.close(); |
| } |
| |
| private void nextInput(Random r, int[] ints) { |
| int downTo = 6; |
| while (downTo >= 0) { |
| // Must add random amounts (and not just 1) because |
| // otherwise FST outsmarts us and remains tiny: |
| ints[downTo] += 1 + r.nextInt(10); |
| if (ints[downTo] < 256) { |
| break; |
| } else { |
| ints[downTo] = 0; |
| downTo--; |
| } |
| } |
| } |
| } |