diff --git a/lucene/analysis/kuromoji/ivy.xml b/lucene/analysis/kuromoji/ivy.xml
index 10eba4e..eb08509 100644
--- a/lucene/analysis/kuromoji/ivy.xml
+++ b/lucene/analysis/kuromoji/ivy.xml
@@ -27,7 +27,7 @@
<dependencies>
<dependency org="mecab" name="mecab-ipadic" rev="${/mecab/mecab-ipadic}" conf="ipadic">
- <artifact name="ipadic" type=".tar.gz" url="http://mecab.googlecode.com/files/mecab-ipadic-2.7.0-20070801.tar.gz"/>
+ <artifact name="ipadic" type=".tar.gz" url="http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz"/>
</dependency>
<dependency org="mecab" name="mecab-naist-jdic" rev="${/mecab/mecab-naist-jdic}" conf="naist">
<artifact name="mecab-naist-jdic" type=".tar.gz" url="http://sourceforge.jp/frs/redir.php?m=iij&amp;f=/naist-jdic/53500/mecab-naist-jdic-0.6.3b-20111013.tar.gz"/>
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
index 8935809..6cfad72 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
index 54382ed..6ad8a68 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
+++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
@@ -28,7 +28,6 @@ import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
-import org.apache.lucene.util.BitUtil;
public final class ConnectionCostsWriter {
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
index 61d6f27..1b8abbb 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
@@ -33,12 +33,10 @@ import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
-import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
-import org.apache.lucene.util.packed.PackedInts;
import com.ibm.icu.text.Normalizer2;
@@ -133,7 +131,7 @@ public class TokenInfoDictionaryBuilder {
System.out.println(" encode...");
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
- Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, PackedInts.DEFAULT, true, 15);
+ Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = -1; // first ord will be 0
String lastValue = null;
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
index fb682fd..b16bb15 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
@@ -48,7 +48,6 @@ import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
-import org.apache.lucene.util.packed.PackedInts;
/*
TODO:
@@ -363,8 +362,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
- FST_OUTPUTS, false,
- PackedInts.COMPACT, true, 15);
+ FST_OUTPUTS, true, 15);
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
index 1427dec..2f71765 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
@@ -81,9 +81,6 @@ import org.apache.lucene.util.packed.PackedInts;
// loads itself in ram?
public final class MemoryPostingsFormat extends PostingsFormat {
- private final boolean doPackFST;
- private final float acceptableOverheadRatio;
-
public MemoryPostingsFormat() {
this(false, PackedInts.DEFAULT);
}
@@ -97,13 +94,11 @@ public final class MemoryPostingsFormat extends PostingsFormat {
*/
public MemoryPostingsFormat(boolean doPackFST, float acceptableOverheadRatio) {
super("Memory");
- this.doPackFST = doPackFST;
- this.acceptableOverheadRatio = acceptableOverheadRatio;
}
@Override
public String toString() {
- return "PostingsFormat(name=" + getName() + " doPackFST= " + doPackFST + ")";
+ return "PostingsFormat(name=" + getName() + ")";
}
private final static class TermsWriter {
@@ -111,16 +106,12 @@ public final class MemoryPostingsFormat extends PostingsFormat {
private final FieldInfo field;
private final Builder<BytesRef> builder;
private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
- private final boolean doPackFST;
- private final float acceptableOverheadRatio;
private int termCount;
- public TermsWriter(IndexOutput out, FieldInfo field, boolean doPackFST, float acceptableOverheadRatio) {
+ public TermsWriter(IndexOutput out, FieldInfo field) {
this.out = out;
this.field = field;
- this.doPackFST = doPackFST;
- this.acceptableOverheadRatio = acceptableOverheadRatio;
- builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doPackFST, acceptableOverheadRatio, true, 15);
+ builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
}
private class PostingsWriter {
@@ -307,8 +298,7 @@ public final class MemoryPostingsFormat extends PostingsFormat {
TermsEnum termsEnum = terms.iterator();
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
- TermsWriter termsWriter = new TermsWriter(out, fieldInfo,
- doPackFST, acceptableOverheadRatio);
+ TermsWriter termsWriter = new TermsWriter(out, fieldInfo);
FixedBitSet docsSeen = new FixedBitSet(state.segmentInfo.maxDoc());
long sumTotalTermFreq = 0;
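
One consequence of the MemoryPostingsFormat hunks above: the (boolean, float) constructor keeps its signature, but the arguments are no longer retained, so both constructors now behave identically and toString() no longer reports doPackFST. A small sketch of the observable effect (the holder class and main method are illustrative, not part of the patch):

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.util.packed.PackedInts;

class MemoryPostingsFormatSketch { // illustrative holder class, not part of the patch
  public static void main(String[] args) {
    PostingsFormat defaults = new MemoryPostingsFormat();
    PostingsFormat withPackingArgs = new MemoryPostingsFormat(true, PackedInts.COMPACT);
    // After this patch both lines are expected to print
    // "PostingsFormat(name=Memory)", since the packing arguments are ignored.
    System.out.println(defaults);
    System.out.println(withPackingArgs);
  }
}
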
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
index a4a150b..bdacc22 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
@@ -48,7 +48,6 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
-import org.apache.lucene.util.packed.PackedInts;
/*
TODO:
@@ -456,8 +455,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
- outputs, false,
- PackedInts.COMPACT, true, 15);
+ outputs, true, 15);
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
index c5ab849..428edd3 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
@@ -23,7 +23,6 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
-import org.apache.lucene.util.packed.PackedInts;
// TODO: could we somehow stream an FST to disk while we
// build it?
@@ -69,10 +68,6 @@ public class Builder<T> {
private final int shareMaxTailLength;
private final IntsRefBuilder lastInput = new IntsRefBuilder();
-
- // for packing
- private final boolean doPackFST;
- private final float acceptableOverheadRatio;
// NOTE: cutting this over to ArrayList instead loses ~6%
// in build performance on 9.8M Wikipedia terms; so we
@@ -99,11 +94,10 @@ public class Builder<T> {
/**
* Instantiates an FST/FSA builder without any pruning. A shortcut
* to {@link #Builder(FST.INPUT_TYPE, int, int, boolean,
- * boolean, int, Outputs, boolean, float,
- * boolean, int)} with pruning options turned off.
+ * boolean, int, Outputs, boolean, int)} with pruning options turned off.
*/
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
- this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, false, PackedInts.COMPACT, true, 15);
+ this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
}
/**
@@ -143,11 +137,6 @@ public class Builder<T> {
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object.
*
- * @param doPackFST Pass true to create a packed FST.
- *
- * @param acceptableOverheadRatio How to trade speed for space when building the FST. This option
- * is only relevant when doPackFST is true. @see PackedInts#getMutable(int, int, float)
- *
* @param allowArrayArcs Pass false to disable the array arc optimization
* while building the FST; this will make the resulting
* FST smaller but slower to traverse.
@@ -159,16 +148,13 @@ public class Builder<T> {
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
- boolean doPackFST, float acceptableOverheadRatio, boolean allowArrayArcs,
- int bytesPageBits) {
+ boolean allowArrayArcs, int bytesPageBits) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
- this.doPackFST = doPackFST;
- this.acceptableOverheadRatio = acceptableOverheadRatio;
this.allowArrayArcs = allowArrayArcs;
- fst = new FST<>(inputType, outputs, doPackFST, acceptableOverheadRatio, bytesPageBits);
+ fst = new FST<>(inputType, outputs, bytesPageBits);
bytes = fst.bytes;
assert bytes != null;
if (doShareSuffix) {
@@ -496,11 +482,7 @@ public class Builder<T> {
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
fst.finish(compileNode(root, lastInput.length()).node);
- if (doPackFST) {
- return fst.pack(this, 3, Math.max(10, (int) (getNodeCount()/4)), acceptableOverheadRatio);
- } else {
- return fst;
- }
+ return fst;
}
private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
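
For call sites, the net effect of the Builder.java change is that the long-form constructor loses the doPackFST and acceptableOverheadRatio parameters, exactly as the updated hunks in the other files show. A minimal sketch of the migration, assuming only identifiers visible in this patch (the holder class is illustrative):

import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs;
import org.apache.lucene.util.fst.Outputs;

class BuilderMigrationSketch { // illustrative holder class, not part of the patch
  static Builder<Object> newBuilder() {
    Outputs<Object> outputs = NoOutputs.getSingleton();
    // Before: new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE,
    //                       outputs, false, PackedInts.COMPACT, true, 15);
    // After: the two packing arguments are simply dropped.
    return new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
        Integer.MAX_VALUE, outputs, true, 15);
  }
}
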
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index 4a0a3a9..5ea6dab 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -24,13 +24,9 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
@@ -38,13 +34,9 @@ import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.Accountable;
-import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Constants;
-import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.packed.GrowableWriter;
-import org.apache.lucene.util.packed.PackedInts;
// TODO: break this into WritableFST and ReadOnlyFST.. then
// we can have subclasses of ReadOnlyFST to handle the
@@ -90,14 +82,6 @@ public final class FST<T> implements Accountable {
static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
- // Arcs are stored as fixed-size (per entry) array, so
- // that we can find an arc using binary search. We do
- // this when number of arcs is > NUM_ARCS_ARRAY:
-
- // If set, the target node is delta coded vs current
- // position:
- private static final int BIT_TARGET_DELTA = 1 << 6;
-
// We use this as a marker (because this one flag is
// illegal by itself ...):
private static final byte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT;
@@ -137,7 +121,9 @@ public final class FST<T> implements Accountable {
/** Don't store arcWithOutputCount anymore */
private static final int VERSION_NO_NODE_ARC_COUNTS = 5;
- private static final int VERSION_CURRENT = VERSION_NO_NODE_ARC_COUNTS;
+ private static final int VERSION_PACKED_REMOVED = 6;
+
+ private static final int VERSION_CURRENT = VERSION_PACKED_REMOVED;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
@@ -168,9 +154,6 @@ public final class FST<T> implements Accountable {
public final Outputs<T> outputs;
- private final boolean packed;
- private PackedInts.Reader nodeRefToAddress;
-
private Arc<T> cachedRootArcs[];
/** Represents a single arc. */
@@ -273,18 +256,11 @@ public final class FST<T> implements Accountable {
return (flags & bit) != 0;
}
- private GrowableWriter nodeAddress;
-
- // TODO: we could be smarter here, and prune periodically
- // as we go; high in-count nodes will "usually" become
- // clear early on:
- private GrowableWriter inCounts;
-
private final int version;
// make a new empty FST, for building; Builder invokes
// this ctor
- FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, int bytesPageBits) {
+ FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) {
this.inputType = inputType;
this.outputs = outputs;
version = VERSION_CURRENT;
@@ -293,17 +269,8 @@ public final class FST<T> implements Accountable {
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
bytes.writeByte((byte) 0);
- if (willPackFST) {
- nodeAddress = new GrowableWriter(15, 8, acceptableOverheadRatio);
- inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio);
- } else {
- nodeAddress = null;
- inCounts = null;
- }
emptyOutput = null;
- packed = false;
- nodeRefToAddress = null;
}
public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
@@ -324,8 +291,12 @@ public final class FST<T> implements Accountable {
// NOTE: only reads most recent format; we don't have
// back-compat promise for FSTs (they are experimental):
- version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_NO_NODE_ARC_COUNTS);
- packed = in.readByte() == 1;
+ version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_CURRENT);
+ if (version < VERSION_PACKED_REMOVED) {
+ if (in.readByte() == 1) {
+ throw new CorruptIndexException("Cannot read packed FSTs anymore", in);
+ }
+ }
if (in.readByte() == 1) {
// accepts empty string
// 1 KB blocks:
@@ -334,17 +305,12 @@ public final class FST<T> implements Accountable {
emptyBytes.copyBytes(in, numBytes);
// De-serialize empty-string output:
- BytesReader reader;
- if (packed) {
- reader = emptyBytes.getForwardReader();
- } else {
- reader = emptyBytes.getReverseReader();
- // NoOutputs uses 0 bytes when writing its output,
- // so we have to check here else BytesStore gets
- // angry:
- if (numBytes > 0) {
- reader.setPosition(numBytes-1);
- }
+ BytesReader reader = emptyBytes.getReverseReader();
+ // NoOutputs uses 0 bytes when writing its output,
+ // so we have to check here else BytesStore gets
+ // angry:
+ if (numBytes > 0) {
+ reader.setPosition(numBytes-1);
}
emptyOutput = outputs.readFinalOutput(reader);
} else {
@@ -364,11 +330,6 @@ public final class FST<T> implements Accountable {
default:
throw new IllegalStateException("invalid input type " + t);
}
- if (packed) {
- nodeRefToAddress = PackedInts.getReader(in);
- } else {
- nodeRefToAddress = null;
- }
startNode = in.readVLong();
if (version < VERSION_NO_NODE_ARC_COUNTS) {
in.readVLong();
@@ -424,31 +385,13 @@ public final class FST<T> implements Accountable {
} else {
size += bytes.ramBytesUsed();
}
- if (packed) {
- size += nodeRefToAddress.ramBytesUsed();
- } else if (nodeAddress != null) {
- size += nodeAddress.ramBytesUsed();
- size += inCounts.ramBytesUsed();
- }
size += cachedArcsBytesUsed;
return size;
}
@Override
- public Collection<Accountable> getChildResources() {
- List<Accountable> resources = new ArrayList<>();
- if (packed) {
- resources.add(Accountables.namedAccountable("node ref to address", nodeRefToAddress));
- } else if (nodeAddress != null) {
- resources.add(Accountables.namedAccountable("node addresses", nodeAddress));
- resources.add(Accountables.namedAccountable("in counts", inCounts));
- }
- return resources;
- }
-
- @Override
public String toString() {
- return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs + ",packed=" + packed;
+ return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs;
}
void finish(long newStartNode) throws IOException {
@@ -463,16 +406,6 @@ public final class FST<T> implements Accountable {
bytes.finish();
cacheRootArcs();
}
-
- private long getNodeAddress(long node) {
- if (nodeAddress != null) {
- // Deref
- return nodeAddress.get((int) node);
- } else {
- // Straight
- return node;
- }
- }
// Optionally caches first 128 labels
@SuppressWarnings({"rawtypes","unchecked"})
@@ -527,18 +460,7 @@ public final class FST<T> implements Accountable {
if (startNode == -1) {
throw new IllegalStateException("call finish first");
}
- if (nodeAddress != null) {
- throw new IllegalStateException("cannot save an FST pre-packed FST; it must first be packed");
- }
- if (packed && !(nodeRefToAddress instanceof PackedInts.Mutable)) {
- throw new IllegalStateException("cannot save a FST which has been loaded from disk ");
- }
CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT);
- if (packed) {
- out.writeByte((byte) 1);
- } else {
- out.writeByte((byte) 0);
- }
// TODO: really we should encode this as an arc, arriving
// to the root node, instead of special casing here:
if (emptyOutput != null) {
@@ -552,16 +474,14 @@ public final class FST<T> implements Accountable {
byte[] emptyOutputBytes = new byte[(int) ros.getFilePointer()];
ros.writeTo(emptyOutputBytes, 0);
- if (!packed) {
- // reverse
- final int stopAt = emptyOutputBytes.length/2;
- int upto = 0;
- while(upto < stopAt) {
- final byte b = emptyOutputBytes[upto];
- emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1];
- emptyOutputBytes[emptyOutputBytes.length-upto-1] = b;
- upto++;
- }
+ // reverse
+ final int stopAt = emptyOutputBytes.length/2;
+ int upto = 0;
+ while(upto < stopAt) {
+ final byte b = emptyOutputBytes[upto];
+ emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1];
+ emptyOutputBytes[emptyOutputBytes.length-upto-1] = b;
+ upto++;
}
out.writeVInt(emptyOutputBytes.length);
out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length);
@@ -577,9 +497,6 @@ public final class FST<T> implements Accountable {
t = 2;
}
out.writeByte(t);
- if (packed) {
- ((PackedInts.Mutable) nodeRefToAddress).save(out);
- }
out.writeVLong(startNode);
if (bytes != null) {
long numBytes = bytes.getPosition();
@@ -705,8 +622,6 @@ public final class FST<T> implements Accountable {
if (!targetHasArcs) {
flags += BIT_STOP_NODE;
- } else if (inCounts != null) {
- inCounts.set((int) target.node, inCounts.get((int) target.node) + 1);
}
if (arc.output != NO_OUTPUT) {
@@ -810,30 +725,8 @@ public final class FST<T> implements Accountable {
builder.bytes.reverse(startAddress, thisNodeAddress);
- // PackedInts uses int as the index, so we cannot handle
- // > 2.1B nodes when packing:
- if (nodeAddress != null && builder.nodeCount == Integer.MAX_VALUE) {
- throw new IllegalStateException("cannot create a packed FST with more than 2.1 billion nodes");
- }
-
builder.nodeCount++;
- final long node;
- if (nodeAddress != null) {
-
- // Nodes are addressed by 1+ord:
- if ((int) builder.nodeCount == nodeAddress.size()) {
- nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue()));
- inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue()));
- }
- nodeAddress.set((int) builder.nodeCount, thisNodeAddress);
- // System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress);
- node = builder.nodeCount;
- } else {
- node = thisNodeAddress;
- }
-
- //System.out.println(" ret node=" + node + " address=" + thisNodeAddress + " nodeAddress=" + nodeAddress);
- return node;
+ return thisNodeAddress;
}
/** Fills virtual 'start' arc, ie, an empty incoming arc to
@@ -876,13 +769,13 @@ public final class FST<T> implements Accountable {
arc.flags = BIT_LAST_ARC;
return arc;
} else {
- in.setPosition(getNodeAddress(follow.target));
+ in.setPosition(follow.target);
arc.node = follow.target;
final byte b = in.readByte();
if (b == ARCS_AS_FIXED_ARRAY) {
// array: jump straight to end
arc.numArcs = in.readVInt();
- if (packed || version >= VERSION_VINT_TARGET) {
+ if (version >= VERSION_VINT_TARGET) {
arc.bytesPerArc = in.readVInt();
} else {
arc.bytesPerArc = in.readInt();
@@ -906,8 +799,6 @@ public final class FST<T> implements Accountable {
}
if (arc.flag(BIT_STOP_NODE)) {
} else if (arc.flag(BIT_TARGET_NEXT)) {
- } else if (packed) {
- in.readVLong();
} else {
readUnpackedNodeTarget(in);
}
@@ -964,7 +855,7 @@ public final class FST<T> implements Accountable {
}
public Arc<T> readFirstRealTargetArc(long node, Arc<T> arc, final BytesReader in) throws IOException {
- final long address = getNodeAddress(node);
+ final long address = node;
in.setPosition(address);
//System.out.println(" readFirstRealTargtArc address="
//+ address);
@@ -975,7 +866,7 @@ public final class FST<T> implements Accountable {
//System.out.println(" fixedArray");
// this is first arc in a fixed-array
arc.numArcs = in.readVInt();
- if (packed || version >= VERSION_VINT_TARGET) {
+ if (version >= VERSION_VINT_TARGET) {
arc.bytesPerArc = in.readVInt();
} else {
arc.bytesPerArc = in.readInt();
@@ -1002,7 +893,7 @@ public final class FST<T> implements Accountable {
if (!targetHasArcs(follow)) {
return false;
} else {
- in.setPosition(getNodeAddress(follow.target));
+ in.setPosition(follow.target);
return in.readByte() == ARCS_AS_FIXED_ARRAY;
}
}
@@ -1029,7 +920,7 @@ public final class FST<T> implements Accountable {
//System.out.println(" nextArc fake " +
//arc.nextArc);
- long pos = getNodeAddress(arc.nextArc);
+ long pos = arc.nextArc;
in.setPosition(pos);
final byte b = in.readByte();
@@ -1038,7 +929,7 @@ public final class FST<T> implements Accountable {
in.readVInt();
// Skip bytesPerArc:
- if (packed || version >= VERSION_VINT_TARGET) {
+ if (version >= VERSION_VINT_TARGET) {
in.readVInt();
} else {
in.readInt();
@@ -1107,41 +998,18 @@ public final class FST<T> implements Accountable {
arc.nextArc = in.getPosition();
// TODO: would be nice to make this lazy -- maybe
// caller doesn't need the target and is scanning arcs...
- if (nodeAddress == null) {
- if (!arc.flag(BIT_LAST_ARC)) {
- if (arc.bytesPerArc == 0) {
- // must scan
- seekToNextNode(in);
- } else {
- in.setPosition(arc.posArcsStart);
- in.skipBytes(arc.bytesPerArc * arc.numArcs);
- }
- }
- arc.target = in.getPosition();
- } else {
- arc.target = arc.node - 1;
- assert arc.target > 0;
- }
- } else {
- if (packed) {
- final long pos = in.getPosition();
- final long code = in.readVLong();
- if (arc.flag(BIT_TARGET_DELTA)) {
- // Address is delta-coded from current address:
- arc.target = pos + code;
- //System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target);
- } else if (code < nodeRefToAddress.size()) {
- // Deref
- arc.target = nodeRefToAddress.get((int) code);
- //System.out.println(" deref code=" + code + " target=" + arc.target);
+ if (!arc.flag(BIT_LAST_ARC)) {
+ if (arc.bytesPerArc == 0) {
+ // must scan
+ seekToNextNode(in);
} else {
- // Absolute
- arc.target = code;
- //System.out.println(" abs code=" + code);
+ in.setPosition(arc.posArcsStart);
+ in.skipBytes(arc.bytesPerArc * arc.numArcs);
}
- } else {
- arc.target = readUnpackedNodeTarget(in);
}
+ arc.target = in.getPosition();
+ } else {
+ arc.target = readUnpackedNodeTarget(in);
arc.nextArc = in.getPosition();
}
return arc;
@@ -1228,7 +1096,7 @@ public final class FST<T> implements Accountable {
return null;
}
- in.setPosition(getNodeAddress(follow.target));
+ in.setPosition(follow.target);
arc.node = follow.target;
@@ -1237,7 +1105,7 @@ public final class FST<T> implements Accountable {
if (in.readByte() == ARCS_AS_FIXED_ARRAY) {
// Arcs are full array; do binary search:
arc.numArcs = in.readVInt();
- if (packed || version >= VERSION_VINT_TARGET) {
+ if (version >= VERSION_VINT_TARGET) {
arc.bytesPerArc = in.readVInt();
} else {
arc.bytesPerArc = in.readInt();
@@ -1303,11 +1171,7 @@ public final class FST<T> implements Accountable {
}
if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) {
- if (packed) {
- in.readVLong();
- } else {
- readUnpackedNodeTarget(in);
- }
+ readUnpackedNodeTarget(in);
}
if (flag(flags, BIT_LAST_ARC)) {
@@ -1340,18 +1204,10 @@ public final class FST<T> implements Accountable {
/** Returns a {@link BytesReader} for this FST, positioned at
* position 0. */
public BytesReader getBytesReader() {
- if (packed) {
- if (bytesArray != null) {
- return new ForwardBytesReader(bytesArray);
- } else {
- return bytes.getForwardReader();
- }
+ if (bytesArray != null) {
+ return new ReverseBytesReader(bytesArray);
} else {
- if (bytesArray != null) {
- return new ReverseBytesReader(bytesArray);
- } else {
- return bytes.getReverseReader();
- }
+ return bytes.getReverseReader();
}
}
@@ -1476,395 +1332,4 @@ public final class FST<T> implements Accountable {
}
*/
- // Creates a packed FST
- private FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) {
- version = VERSION_CURRENT;
- packed = true;
- this.inputType = inputType;
- bytesArray = null;
- bytes = new BytesStore(bytesPageBits);
- this.outputs = outputs;
- }
-
- /** Expert: creates an FST by packing this one. This
- * process requires substantial additional RAM (currently
- * up to ~8 bytes per node depending on
- * <code>acceptableOverheadRatio</code>), but then should
- * produce a smaller FST.
- *
- * <p>The implementation of this method uses ideas from
- * <a target="_blank" href="http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf">Smaller Representation of Finite State Automata</a>,
- * which describes techniques to reduce the size of a FST.
- * However, this is not a strict implementation of the
- * algorithms described in this paper.
- */
- FST<T> pack(Builder<T> builder, int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException {
-
- // NOTE: maxDerefNodes is intentionally int: we cannot
- // support > 2.1B deref nodes
-
- // TODO: other things to try
- // - renumber the nodes to get more next / better locality?
- // - allow multiple input labels on an arc, so
- // singular chain of inputs can take one arc (on
- // wikipedia terms this could save another ~6%)
- // - in the ord case, the output '1' is presumably
- // very common (after NO_OUTPUT)... maybe use a bit
- // for it..?
- // - use spare bits in flags.... for top few labels /
- // outputs / targets
-
- if (nodeAddress == null) {
- throw new IllegalArgumentException("this FST was not built with willPackFST=true");
- }
-
- T NO_OUTPUT = outputs.getNoOutput();
-
- Arc<T> arc = new Arc<>();
-
- final BytesReader r = getBytesReader();
-
- final int topN = Math.min(maxDerefNodes, inCounts.size());
-
- // Find top nodes with highest number of incoming arcs:
- NodeQueue q = new NodeQueue(topN);
-
- // TODO: we could use more RAM efficient selection algo here...
- NodeAndInCount bottom = null;
- for(int node=0; node<inCounts.size(); node++) {
- if (inCounts.get(node) >= minInCountDeref) {
- if (bottom == null) {
- q.add(new NodeAndInCount(node, (int) inCounts.get(node)));
- if (q.size() == topN) {
- bottom = q.top();
- }
- } else if (inCounts.get(node) > bottom.count) {
- q.insertWithOverflow(new NodeAndInCount(node, (int) inCounts.get(node)));
- }
- }
- }
-
- // Free up RAM:
- inCounts = null;
-
- final Map<Integer,Integer> topNodeMap = new HashMap<>();
- for(int downTo=q.size()-1;downTo>=0;downTo--) {
- NodeAndInCount n = q.pop();
- topNodeMap.put(n.node, downTo);
- //System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo);
- }
-
- // +1 because node ords start at 1 (0 is reserved as stop node):
- final GrowableWriter newNodeAddress = new GrowableWriter(
- PackedInts.bitsRequired(builder.bytes.getPosition()), (int) (1 + builder.nodeCount), acceptableOverheadRatio);
-
- // Fill initial coarse guess:
- for(int node=1;node<=builder.nodeCount;node++) {
- newNodeAddress.set(node, 1 + builder.bytes.getPosition() - nodeAddress.get(node));
- }
-
- int absCount;
- int deltaCount;
- int topCount;
- int nextCount;
-
- FST<T> fst;
-
- // Iterate until we converge:
- while(true) {
-
- //System.out.println("\nITER");
- boolean changed = false;
-
- // for assert:
- boolean negDelta = false;
-
- fst = new FST<>(inputType, outputs, builder.bytes.getBlockBits());
-
- final BytesStore writer = fst.bytes;
-
- // Skip 0 byte since 0 is reserved target:
- writer.writeByte((byte) 0);
-
- absCount = deltaCount = topCount = nextCount = 0;
-
- int changedCount = 0;
-
- long addressError = 0;
-
- //int totWasted = 0;
-
- // Since we re-reverse the bytes, we now write the
- // nodes backwards, so that BIT_TARGET_NEXT is
- // unchanged:
- for(int node=(int) builder.nodeCount;node>=1;node--) {
- final long address = writer.getPosition();
-
- //System.out.println(" node: " + node + " address=" + address);
- if (address != newNodeAddress.get(node)) {
- addressError = address - newNodeAddress.get(node);
- //System.out.println(" change: " + (address - newNodeAddress[node]));
- changed = true;
- newNodeAddress.set(node, address);
- changedCount++;
- }
-
- int nodeArcCount = 0;
- int bytesPerArc = 0;
-
- boolean retry = false;
-
- // for assert:
- boolean anyNegDelta = false;
-
- // Retry loop: possibly iterate more than once, if
- // this is an array'd node and bytesPerArc changes:
- writeNode:
- while(true) { // retry writing this node
-
- //System.out.println(" cycle: retry");
- readFirstRealTargetArc(node, arc, r);
-
- final boolean useArcArray = arc.bytesPerArc != 0;
- if (useArcArray) {
- // Write false first arc:
- if (bytesPerArc == 0) {
- bytesPerArc = arc.bytesPerArc;
- }
- writer.writeByte(ARCS_AS_FIXED_ARRAY);
- writer.writeVInt(arc.numArcs);
- writer.writeVInt(bytesPerArc);
- //System.out.println("node " + node + ": " + arc.numArcs + " arcs");
- }
-
- int maxBytesPerArc = 0;
- //int wasted = 0;
- while(true) { // iterate over all arcs for this node
- //System.out.println(" cycle next arc");
-
- final long arcStartPos = writer.getPosition();
- nodeArcCount++;
-
- byte flags = 0;
-
- if (arc.isLast()) {
- flags += BIT_LAST_ARC;
- }
- /*
- if (!useArcArray && nodeUpto < nodes.length-1 && arc.target == nodes[nodeUpto+1]) {
- flags += BIT_TARGET_NEXT;
- }
- */
- if (!useArcArray && node != 1 && arc.target == node-1) {
- flags += BIT_TARGET_NEXT;
- if (!retry) {
- nextCount++;
- }
- }
- if (arc.isFinal()) {
- flags += BIT_FINAL_ARC;
- if (arc.nextFinalOutput != NO_OUTPUT) {
- flags += BIT_ARC_HAS_FINAL_OUTPUT;
- }
- } else {
- assert arc.nextFinalOutput == NO_OUTPUT;
- }
- if (!targetHasArcs(arc)) {
- flags += BIT_STOP_NODE;
- }
-
- if (arc.output != NO_OUTPUT) {
- flags += BIT_ARC_HAS_OUTPUT;
- }
-
- final long absPtr;
- final boolean doWriteTarget = targetHasArcs(arc) && (flags & BIT_TARGET_NEXT) == 0;
- if (doWriteTarget) {
-
- final Integer ptr = topNodeMap.get(arc.target);
- if (ptr != null) {
- absPtr = ptr;
- } else {
- absPtr = topNodeMap.size() + newNodeAddress.get((int) arc.target) + addressError;
- }
-
- long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition() - 2;
- if (delta < 0) {
- //System.out.println("neg: " + delta);
- anyNegDelta = true;
- delta = 0;
- }
-
- if (delta < absPtr) {
- flags |= BIT_TARGET_DELTA;
- }
- } else {
- absPtr = 0;
- }
-
- assert flags != ARCS_AS_FIXED_ARRAY;
- writer.writeByte(flags);
-
- fst.writeLabel(writer, arc.label);
-
- if (arc.output != NO_OUTPUT) {
- outputs.write(arc.output, writer);
- }
- if (arc.nextFinalOutput != NO_OUTPUT) {
- outputs.writeFinalOutput(arc.nextFinalOutput, writer);
- }
-
- if (doWriteTarget) {
-
- long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition();
- if (delta < 0) {
- anyNegDelta = true;
- //System.out.println("neg: " + delta);
- delta = 0;
- }
-
- if (flag(flags, BIT_TARGET_DELTA)) {
- //System.out.println(" delta");
- writer.writeVLong(delta);
- if (!retry) {
- deltaCount++;
- }
- } else {
- /*
- if (ptr != null) {
- System.out.println(" deref");
- } else {
- System.out.println(" abs");
- }
- */
- writer.writeVLong(absPtr);
- if (!retry) {
- if (absPtr >= topNodeMap.size()) {
- absCount++;
- } else {
- topCount++;
- }
- }
- }
- }
-
- if (useArcArray) {
- final int arcBytes = (int) (writer.getPosition() - arcStartPos);
- //System.out.println(" " + arcBytes + " bytes");
- maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes);
- // NOTE: this may in fact go "backwards", if
- // somehow (rarely, possibly never) we use
- // more bytesPerArc in this rewrite than the
- // incoming FST did... but in this case we
- // will retry (below) so it's OK to ovewrite
- // bytes:
- //wasted += bytesPerArc - arcBytes;
- writer.skipBytes((int) (arcStartPos + bytesPerArc - writer.getPosition()));
- }
-
- if (arc.isLast()) {
- break;
- }
-
- readNextRealArc(arc, r);
- }
-
- if (useArcArray) {
- if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc)) {
- // converged
- //System.out.println(" bba=" + bytesPerArc + " wasted=" + wasted);
- //totWasted += wasted;
- break;
- }
- } else {
- break;
- }
-
- //System.out.println(" retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc);
-
- // Retry:
- bytesPerArc = maxBytesPerArc;
- writer.truncate(address);
- nodeArcCount = 0;
- retry = true;
- anyNegDelta = false;
- }
-
- negDelta |= anyNegDelta;
- }
-
- if (!changed) {
- // We don't renumber the nodes (just reverse their
- // order) so nodes should only point forward to
- // other nodes because we only produce acyclic FSTs
- // w/ nodes only pointing "forwards":
- assert !negDelta;
- //System.out.println("TOT wasted=" + totWasted);
- // Converged!
- break;
- }
- }
-
- long maxAddress = 0;
- for (long key : topNodeMap.keySet()) {
- maxAddress = Math.max(maxAddress, newNodeAddress.get((int) key));
- }
-
- PackedInts.Mutable nodeRefToAddressIn = PackedInts.getMutable(topNodeMap.size(),
- PackedInts.bitsRequired(maxAddress), acceptableOverheadRatio);
- for(Map.Entry<Integer,Integer> ent : topNodeMap.entrySet()) {
- nodeRefToAddressIn.set(ent.getValue(), newNodeAddress.get(ent.getKey()));
- }
- fst.nodeRefToAddress = nodeRefToAddressIn;
-
- fst.startNode = newNodeAddress.get((int) startNode);
- //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode);
-
- if (emptyOutput != null) {
- fst.setEmptyOutput(emptyOutput);
- }
-
- fst.bytes.finish();
- fst.cacheRootArcs();
-
- //final int size = fst.sizeInBytes();
- //System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount);
-
- return fst;
- }
-
- private static class NodeAndInCount implements Comparable<NodeAndInCount> {
- final int node;
- final int count;
-
- public NodeAndInCount(int node, int count) {
- this.node = node;
- this.count = count;
- }
-
- @Override
- public int compareTo(NodeAndInCount other) {
- if (count > other.count) {
- return 1;
- } else if (count < other.count) {
- return -1;
- } else {
- // Tie-break: smaller node compares as greater than
- return other.node - node;
- }
- }
- }
-
- private static class NodeQueue extends PriorityQueue<NodeAndInCount> {
- public NodeQueue(int topN) {
- super(topN, false);
- }
-
- @Override
- public boolean lessThan(NodeAndInCount a, NodeAndInCount b) {
- final int cmp = a.compareTo(b);
- assert cmp != 0;
- return cmp < 0;
- }
- }
}
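
On the read path, the FST.java change bumps the on-disk format to VERSION_PACKED_REMOVED and turns a previously packed FST into a hard CorruptIndexException at load time, while older unpacked FSTs remain readable. A hedged sketch of what a caller sees, assuming the existing FST(DataInput, Outputs) reading constructor (not shown in this patch):

import java.io.IOException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

class FstLoadSketch { // illustrative holder class, not part of the patch
  static FST<Long> load(DataInput in) throws IOException {
    try {
      // Unpacked FSTs written by older versions (>= VERSION_PACKED) still load.
      return new FST<>(in, PositiveIntOutputs.getSingleton());
    } catch (CorruptIndexException e) {
      // Reached when the header predates VERSION_PACKED_REMOVED and the stored
      // "packed" byte is 1: packed FSTs can no longer be read.
      throw e;
    }
  }
}
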
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java b/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java
index 41426f9..d984586 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java
@@ -24,7 +24,6 @@
* <li>Fast and low memory overhead construction of the minimal FST
* (but inputs must be provided in sorted order)</li>
* <li>Low object overhead and quick deserialization (byte[] representation)</li>
- * <li>Optional two-pass compression: {@link org.apache.lucene.util.fst.FST#pack FST.pack()}</li>
* <li>{@link org.apache.lucene.util.fst.Util#getByOutput Lookup-by-output} when the
* outputs are in sorted order (e.g., ordinals or file pointers)</li>
* <li>Pluggable {@link org.apache.lucene.util.fst.Outputs Outputs} representation</li>
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
index bdec65c..a02bf8a 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
@@ -29,7 +29,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TimeUnits;
-import org.apache.lucene.util.packed.PackedInts;
import org.junit.Ignore;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
@@ -47,16 +46,14 @@ public class Test2BFST extends LuceneTestCase {
Directory dir = new MMapDirectory(createTempDir("2BFST"));
- for(int doPackIter=0;doPackIter<2;doPackIter++) {
- boolean doPack = doPackIter == 1;
-
+ for(int iter=0;iter<1;iter++) {
// Build FST w/ NoOutputs and stop when nodeCount > 2.2B
- if (!doPack) {
+ {
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
Outputs<Object> outputs = NoOutputs.getSingleton();
Object NO_OUTPUT = outputs.getNoOutput();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
- doPack, PackedInts.COMPACT, true, 15);
+ true, 15);
int count = 0;
Random r = new Random(seed);
@@ -135,10 +132,10 @@ public class Test2BFST extends LuceneTestCase {
// Build FST w/ ByteSequenceOutputs and stop when FST
// size = 3GB
{
- System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
+ System.out.println("\nTEST: 3 GB size; outputs=bytes");
Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton();
final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
- doPack, PackedInts.COMPACT, true, 15);
+ true, 15);
byte[] outputBytes = new byte[20];
BytesRef output = new BytesRef(outputBytes);
@@ -212,10 +209,10 @@ public class Test2BFST extends LuceneTestCase {
// Build FST w/ PositiveIntOutputs and stop when FST
// size = 3GB
{
- System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
+ System.out.println("\nTEST: 3 GB size; outputs=long");
Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
- doPack, PackedInts.COMPACT, true, 15);
+ true, 15);
long output = 1;
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
index 39b3282..6b218cf 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
@@ -76,7 +76,6 @@ import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.Util.Result;
-import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.util.fst.FSTTester.getRandomString;
import static org.apache.lucene.util.fst.FSTTester.simpleRandomString;
@@ -328,9 +327,7 @@ public class TestFSTs extends LuceneTestCase {
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- final boolean doRewrite = random().nextBoolean();
-
- Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doRewrite, PackedInts.DEFAULT, true, 15);
+ Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
boolean storeOrd = random().nextBoolean();
if (VERBOSE) {
@@ -464,16 +461,14 @@ public class TestFSTs extends LuceneTestCase {
private int inputMode;
private final Outputs<T> outputs;
private final Builder<T> builder;
- private final boolean doPack;
- public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean doPack, boolean noArcArrays) {
+ public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) {
this.dirOut = dirOut;
this.wordsFileIn = wordsFileIn;
this.inputMode = inputMode;
this.outputs = outputs;
- this.doPack = doPack;
- builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, doPack, PackedInts.DEFAULT, !noArcArrays, 15);
+ builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15);
}
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@@ -622,7 +617,6 @@ public class TestFSTs extends LuceneTestCase {
boolean storeOrds = false;
boolean storeDocFreqs = false;
boolean verify = true;
- boolean doPack = false;
boolean noArcArrays = false;
Path wordsFileIn = null;
Path dirOut = null;
@@ -647,8 +641,6 @@ public class TestFSTs extends LuceneTestCase {
storeOrds = true;
} else if (args[idx].equals("-noverify")) {
verify = false;
- } else if (args[idx].equals("-pack")) {
- doPack = true;
} else if (args[idx].startsWith("-")) {
System.err.println("Unrecognized option: " + args[idx]);
System.exit(-1);
@@ -677,7 +669,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton();
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton();
final PairOutputs<Long,Long> outputs = new PairOutputs<>(o1, o2);
- new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
+ new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
Random rand;
@Override
public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
@@ -691,7 +683,7 @@ public class TestFSTs extends LuceneTestCase {
} else if (storeOrds) {
// Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
+ new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
@Override
public Long getOutput(IntsRef input, int ord) {
return (long) ord;
@@ -700,7 +692,7 @@ public class TestFSTs extends LuceneTestCase {
} else if (storeDocFreqs) {
// Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
+ new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
Random rand;
@Override
public Long getOutput(IntsRef input, int ord) {
@@ -714,7 +706,7 @@ public class TestFSTs extends LuceneTestCase {
// Store nothing
final NoOutputs outputs = NoOutputs.getSingleton();
final Object NO_OUTPUT = outputs.getNoOutput();
- new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) {
+ new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
@Override
public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT;
@@ -1118,7 +1110,7 @@ public class TestFSTs extends LuceneTestCase {
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, random().nextBoolean(), PackedInts.DEFAULT, true, 15);
+ final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
final FST<Long> fst = builder.finish();
@@ -1132,8 +1124,7 @@ public class TestFSTs extends LuceneTestCase {
public void testInternalFinalState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
- final boolean willRewrite = random().nextBoolean();
- final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, willRewrite, PackedInts.DEFAULT, true, 15);
+ final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
final FST<Long> fst = builder.finish();
diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
index b49ea79..d83b915 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java
@@ -50,7 +50,6 @@ import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
-import org.apache.lucene.util.packed.PackedInts;
/*
TODO:
@@ -354,8 +353,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
- FST_OUTPUTS, false,
- PackedInts.COMPACT, true, 15);
+ FST_OUTPUTS, true, 15);
//if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
//}
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java
index 3706724..3d20412 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java
@@ -26,7 +26,6 @@ import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.*;
-import org.apache.lucene.util.packed.PackedInts;
/**
* Finite state automata based implementation of "autocomplete" functionality.
@@ -237,8 +236,7 @@ public class FSTCompletionBuilder {
final Object empty = outputs.getNoOutput();
final Builder<Object> builder = new Builder<>(
FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
- shareMaxTailLength, outputs, false,
- PackedInts.DEFAULT, true, 15);
+ shareMaxTailLength, outputs, true, 15);
BytesRefBuilder scratch = new BytesRefBuilder();
BytesRef entry;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java
index 11b1325..8e6a4ea 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java
@@ -40,7 +40,6 @@ import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.packed.PackedInts;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@@ -273,25 +272,14 @@ public class FSTTester<T> {
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
}
- final boolean willRewrite = random.nextBoolean();
-
final Builder<T> builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs,
- willRewrite,
- PackedInts.DEFAULT,
true,
15);
- if (LuceneTestCase.VERBOSE) {
- if (willRewrite) {
- System.out.println("TEST: packed FST");
- } else {
- System.out.println("TEST: non-packed FST");
- }
- }
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof List) {
@@ -306,7 +294,7 @@ public class FSTTester<T> {
}
FST<T> fst = builder.finish();
- if (random.nextBoolean() && fst != null && !willRewrite) {
+ if (random.nextBoolean() && fst != null) {
IOContext context = LuceneTestCase.newIOContext(random);
IndexOutput out = dir.createOutput("fst.bin", context);
fst.save(out);