| diff --git a/lucene/analysis/kuromoji/ivy.xml b/lucene/analysis/kuromoji/ivy.xml |
| index 10eba4e..eb08509 100644 |
| --- a/lucene/analysis/kuromoji/ivy.xml |
| +++ b/lucene/analysis/kuromoji/ivy.xml |
| @@ -27,7 +27,7 @@ |
| |
| <dependencies> |
| <dependency org="mecab" name="mecab-ipadic" rev="${/mecab/mecab-ipadic}" conf="ipadic"> |
| - <artifact name="ipadic" type=".tar.gz" url="http://mecab.googlecode.com/files/mecab-ipadic-2.7.0-20070801.tar.gz"/> |
| + <artifact name="ipadic" type=".tar.gz" url="http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz"/> |
| </dependency> |
| <dependency org="mecab" name="mecab-naist-jdic" rev="${/mecab/mecab-naist-jdic}" conf="naist"> |
| <artifact name="mecab-naist-jdic" type=".tar.gz" url="http://sourceforge.jp/frs/redir.php?m=iij&f=/naist-jdic/53500/mecab-naist-jdic-0.6.3b-20111013.tar.gz"/> |
| diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat |
| index 8935809..6cfad72 100644 |
| Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ |
| diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java |
| index 54382ed..6ad8a68 100644 |
| --- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java |
| +++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java |
| @@ -28,7 +28,6 @@ import org.apache.lucene.analysis.ja.dict.ConnectionCosts; |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.store.OutputStreamDataOutput; |
| -import org.apache.lucene.util.BitUtil; |
| |
| public final class ConnectionCostsWriter { |
| |
| diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java |
| index 61d6f27..1b8abbb 100644 |
| --- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java |
| +++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java |
| @@ -33,12 +33,10 @@ import java.util.Comparator; |
| import java.util.List; |
| |
| import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat; |
| -import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.fst.Builder; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.PositiveIntOutputs; |
| -import org.apache.lucene.util.packed.PackedInts; |
| |
| import com.ibm.icu.text.Normalizer2; |
| |
| @@ -133,7 +131,7 @@ public class TokenInfoDictionaryBuilder { |
| System.out.println(" encode..."); |
| |
| PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); |
| - Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, PackedInts.DEFAULT, true, 15); |
| + Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15); |
| IntsRefBuilder scratch = new IntsRefBuilder(); |
| long ord = -1; // first ord will be 0 |
| String lastValue = null; |
| diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java |
| index fb682fd..b16bb15 100644 |
| --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java |
| +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java |
| @@ -48,7 +48,6 @@ import org.apache.lucene.util.fst.Builder; |
| import org.apache.lucene.util.fst.BytesRefFSTEnum; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.Util; |
| -import org.apache.lucene.util.packed.PackedInts; |
| |
| /* |
| TODO: |
| @@ -363,8 +362,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { |
| |
| final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, |
| 0, 0, true, false, Integer.MAX_VALUE, |
| - FST_OUTPUTS, false, |
| - PackedInts.COMPACT, true, 15); |
| + FST_OUTPUTS, true, 15); |
| //if (DEBUG) { |
| // System.out.println(" compile index for prefix=" + prefix); |
| //} |
| diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java |
| index 1427dec..2f71765 100644 |
| --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java |
| +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java |
| @@ -81,9 +81,6 @@ import org.apache.lucene.util.packed.PackedInts; |
| // loads itself in ram? |
| public final class MemoryPostingsFormat extends PostingsFormat { |
| |
| - private final boolean doPackFST; |
| - private final float acceptableOverheadRatio; |
| - |
| public MemoryPostingsFormat() { |
| this(false, PackedInts.DEFAULT); |
| } |
| @@ -97,13 +94,11 @@ public final class MemoryPostingsFormat extends PostingsFormat { |
| */ |
| public MemoryPostingsFormat(boolean doPackFST, float acceptableOverheadRatio) { |
| super("Memory"); |
| - this.doPackFST = doPackFST; |
| - this.acceptableOverheadRatio = acceptableOverheadRatio; |
| } |
| |
| @Override |
| public String toString() { |
| - return "PostingsFormat(name=" + getName() + " doPackFST= " + doPackFST + ")"; |
| + return "PostingsFormat(name=" + getName() + ")"; |
| } |
| |
| private final static class TermsWriter { |
| @@ -111,16 +106,12 @@ public final class MemoryPostingsFormat extends PostingsFormat { |
| private final FieldInfo field; |
| private final Builder<BytesRef> builder; |
| private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); |
| - private final boolean doPackFST; |
| - private final float acceptableOverheadRatio; |
| private int termCount; |
| |
| - public TermsWriter(IndexOutput out, FieldInfo field, boolean doPackFST, float acceptableOverheadRatio) { |
| + public TermsWriter(IndexOutput out, FieldInfo field) { |
| this.out = out; |
| this.field = field; |
| - this.doPackFST = doPackFST; |
| - this.acceptableOverheadRatio = acceptableOverheadRatio; |
| - builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doPackFST, acceptableOverheadRatio, true, 15); |
| + builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); |
| } |
| |
| private class PostingsWriter { |
| @@ -307,8 +298,7 @@ public final class MemoryPostingsFormat extends PostingsFormat { |
| TermsEnum termsEnum = terms.iterator(); |
| |
| FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); |
| - TermsWriter termsWriter = new TermsWriter(out, fieldInfo, |
| - doPackFST, acceptableOverheadRatio); |
| + TermsWriter termsWriter = new TermsWriter(out, fieldInfo); |
| |
| FixedBitSet docsSeen = new FixedBitSet(state.segmentInfo.maxDoc()); |
| long sumTotalTermFreq = 0; |
| diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java |
| index a4a150b..bdacc22 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java |
| +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java |
| @@ -48,7 +48,6 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs; |
| import org.apache.lucene.util.fst.BytesRefFSTEnum; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.Util; |
| -import org.apache.lucene.util.packed.PackedInts; |
| |
| /* |
| TODO: |
| @@ -456,8 +455,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { |
| final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); |
| final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, |
| 0, 0, true, false, Integer.MAX_VALUE, |
| - outputs, false, |
| - PackedInts.COMPACT, true, 15); |
| + outputs, true, 15); |
| //if (DEBUG) { |
| // System.out.println(" compile index for prefix=" + prefix); |
| //} |
| diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java |
| index c5ab849..428edd3 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java |
| @@ -23,7 +23,6 @@ import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc |
| -import org.apache.lucene.util.packed.PackedInts; |
| |
| // TODO: could we somehow stream an FST to disk while we |
| // build it? |
| @@ -69,10 +68,6 @@ public class Builder<T> { |
| private final int shareMaxTailLength; |
| |
| private final IntsRefBuilder lastInput = new IntsRefBuilder(); |
| - |
| - // for packing |
| - private final boolean doPackFST; |
| - private final float acceptableOverheadRatio; |
| |
| // NOTE: cutting this over to ArrayList instead loses ~6% |
| // in build performance on 9.8M Wikipedia terms; so we |
| @@ -99,11 +94,10 @@ public class Builder<T> { |
| /** |
| * Instantiates an FST/FSA builder without any pruning. A shortcut |
| * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, |
| - * boolean, int, Outputs, boolean, float, |
| - * boolean, int)} with pruning options turned off. |
| + * boolean, int, Outputs, boolean, int)} with pruning options turned off. |
| */ |
| public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) { |
| - this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, false, PackedInts.COMPACT, true, 15); |
| + this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); |
| } |
| |
| /** |
| @@ -143,11 +137,6 @@ public class Builder<T> { |
| * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the |
| * singleton output object. |
| * |
| - * @param doPackFST Pass true to create a packed FST. |
| - * |
| - * @param acceptableOverheadRatio How to trade speed for space when building the FST. This option |
| - * is only relevant when doPackFST is true. @see PackedInts#getMutable(int, int, float) |
| - * |
| * @param allowArrayArcs Pass false to disable the array arc optimization |
| * while building the FST; this will make the resulting |
| * FST smaller but slower to traverse. |
| @@ -159,16 +148,13 @@ public class Builder<T> { |
| */ |
| public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, |
| boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs, |
| - boolean doPackFST, float acceptableOverheadRatio, boolean allowArrayArcs, |
| - int bytesPageBits) { |
| + boolean allowArrayArcs, int bytesPageBits) { |
| this.minSuffixCount1 = minSuffixCount1; |
| this.minSuffixCount2 = minSuffixCount2; |
| this.doShareNonSingletonNodes = doShareNonSingletonNodes; |
| this.shareMaxTailLength = shareMaxTailLength; |
| - this.doPackFST = doPackFST; |
| - this.acceptableOverheadRatio = acceptableOverheadRatio; |
| this.allowArrayArcs = allowArrayArcs; |
| - fst = new FST<>(inputType, outputs, doPackFST, acceptableOverheadRatio, bytesPageBits); |
| + fst = new FST<>(inputType, outputs, bytesPageBits); |
| bytes = fst.bytes; |
| assert bytes != null; |
| if (doShareSuffix) { |
| @@ -496,11 +482,7 @@ public class Builder<T> { |
| //if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output); |
| fst.finish(compileNode(root, lastInput.length()).node); |
| |
| - if (doPackFST) { |
| - return fst.pack(this, 3, Math.max(10, (int) (getNodeCount()/4)), acceptableOverheadRatio); |
| - } else { |
| - return fst; |
| - } |
| + return fst; |
| } |
| |
| private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException { |
| diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java |
| index 4a0a3a9..5ea6dab 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java |
| @@ -24,13 +24,9 @@ import java.io.InputStream; |
| import java.io.OutputStream; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| -import java.util.ArrayList; |
| -import java.util.Collection; |
| -import java.util.HashMap; |
| -import java.util.List; |
| -import java.util.Map; |
| |
| import org.apache.lucene.codecs.CodecUtil; |
| +import org.apache.lucene.index.CorruptIndexException; |
| import org.apache.lucene.store.ByteArrayDataOutput; |
| import org.apache.lucene.store.DataInput; |
| import org.apache.lucene.store.DataOutput; |
| @@ -38,13 +34,9 @@ import org.apache.lucene.store.InputStreamDataInput; |
| import org.apache.lucene.store.OutputStreamDataOutput; |
| import org.apache.lucene.store.RAMOutputStream; |
| import org.apache.lucene.util.Accountable; |
| -import org.apache.lucene.util.Accountables; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.Constants; |
| -import org.apache.lucene.util.PriorityQueue; |
| import org.apache.lucene.util.RamUsageEstimator; |
| -import org.apache.lucene.util.packed.GrowableWriter; |
| -import org.apache.lucene.util.packed.PackedInts; |
| |
| // TODO: break this into WritableFST and ReadOnlyFST.. then |
| // we can have subclasses of ReadOnlyFST to handle the |
| @@ -90,14 +82,6 @@ public final class FST<T> implements Accountable { |
| |
| static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; |
| |
| - // Arcs are stored as fixed-size (per entry) array, so |
| - // that we can find an arc using binary search. We do |
| - // this when number of arcs is > NUM_ARCS_ARRAY: |
| - |
| - // If set, the target node is delta coded vs current |
| - // position: |
| - private static final int BIT_TARGET_DELTA = 1 << 6; |
| - |
| // We use this as a marker (because this one flag is |
| // illegal by itself ...): |
| private static final byte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT; |
| @@ -137,7 +121,9 @@ public final class FST<T> implements Accountable { |
| /** Don't store arcWithOutputCount anymore */ |
| private static final int VERSION_NO_NODE_ARC_COUNTS = 5; |
| |
| - private static final int VERSION_CURRENT = VERSION_NO_NODE_ARC_COUNTS; |
| + private static final int VERSION_PACKED_REMOVED = 6; |
| + |
| + private static final int VERSION_CURRENT = VERSION_PACKED_REMOVED; |
| |
| // Never serialized; just used to represent the virtual |
| // final node w/ no arcs: |
| @@ -168,9 +154,6 @@ public final class FST<T> implements Accountable { |
| |
| public final Outputs<T> outputs; |
| |
| - private final boolean packed; |
| - private PackedInts.Reader nodeRefToAddress; |
| - |
| private Arc<T> cachedRootArcs[]; |
| |
| /** Represents a single arc. */ |
| @@ -273,18 +256,11 @@ public final class FST<T> implements Accountable { |
| return (flags & bit) != 0; |
| } |
| |
| - private GrowableWriter nodeAddress; |
| - |
| - // TODO: we could be smarter here, and prune periodically |
| - // as we go; high in-count nodes will "usually" become |
| - // clear early on: |
| - private GrowableWriter inCounts; |
| - |
| private final int version; |
| |
| // make a new empty FST, for building; Builder invokes |
| // this ctor |
| - FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, int bytesPageBits) { |
| + FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) { |
| this.inputType = inputType; |
| this.outputs = outputs; |
| version = VERSION_CURRENT; |
| @@ -293,17 +269,8 @@ public final class FST<T> implements Accountable { |
| // pad: ensure no node gets address 0 which is reserved to mean |
| // the stop state w/ no arcs |
| bytes.writeByte((byte) 0); |
| - if (willPackFST) { |
| - nodeAddress = new GrowableWriter(15, 8, acceptableOverheadRatio); |
| - inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio); |
| - } else { |
| - nodeAddress = null; |
| - inCounts = null; |
| - } |
| |
| emptyOutput = null; |
| - packed = false; |
| - nodeRefToAddress = null; |
| } |
| |
| public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28; |
| @@ -324,8 +291,12 @@ public final class FST<T> implements Accountable { |
| |
| // NOTE: only reads most recent format; we don't have |
| // back-compat promise for FSTs (they are experimental): |
| - version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_NO_NODE_ARC_COUNTS); |
| - packed = in.readByte() == 1; |
| + version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_CURRENT); |
| + if (version < VERSION_PACKED_REMOVED) { |
| + if (in.readByte() == 1) { |
| + throw new CorruptIndexException("Cannot read packed FSTs anymore", in); |
| + } |
| + } |
| if (in.readByte() == 1) { |
| // accepts empty string |
| // 1 KB blocks: |
| @@ -334,17 +305,12 @@ public final class FST<T> implements Accountable { |
| emptyBytes.copyBytes(in, numBytes); |
| |
| // De-serialize empty-string output: |
| - BytesReader reader; |
| - if (packed) { |
| - reader = emptyBytes.getForwardReader(); |
| - } else { |
| - reader = emptyBytes.getReverseReader(); |
| - // NoOutputs uses 0 bytes when writing its output, |
| - // so we have to check here else BytesStore gets |
| - // angry: |
| - if (numBytes > 0) { |
| - reader.setPosition(numBytes-1); |
| - } |
| + BytesReader reader = emptyBytes.getReverseReader(); |
| + // NoOutputs uses 0 bytes when writing its output, |
| + // so we have to check here else BytesStore gets |
| + // angry: |
| + if (numBytes > 0) { |
| + reader.setPosition(numBytes-1); |
| } |
| emptyOutput = outputs.readFinalOutput(reader); |
| } else { |
| @@ -364,11 +330,6 @@ public final class FST<T> implements Accountable { |
| default: |
| throw new IllegalStateException("invalid input type " + t); |
| } |
| - if (packed) { |
| - nodeRefToAddress = PackedInts.getReader(in); |
| - } else { |
| - nodeRefToAddress = null; |
| - } |
| startNode = in.readVLong(); |
| if (version < VERSION_NO_NODE_ARC_COUNTS) { |
| in.readVLong(); |
| @@ -424,31 +385,13 @@ public final class FST<T> implements Accountable { |
| } else { |
| size += bytes.ramBytesUsed(); |
| } |
| - if (packed) { |
| - size += nodeRefToAddress.ramBytesUsed(); |
| - } else if (nodeAddress != null) { |
| - size += nodeAddress.ramBytesUsed(); |
| - size += inCounts.ramBytesUsed(); |
| - } |
| size += cachedArcsBytesUsed; |
| return size; |
| } |
| |
| @Override |
| - public Collection<Accountable> getChildResources() { |
| - List<Accountable> resources = new ArrayList<>(); |
| - if (packed) { |
| - resources.add(Accountables.namedAccountable("node ref to address", nodeRefToAddress)); |
| - } else if (nodeAddress != null) { |
| - resources.add(Accountables.namedAccountable("node addresses", nodeAddress)); |
| - resources.add(Accountables.namedAccountable("in counts", inCounts)); |
| - } |
| - return resources; |
| - } |
| - |
| - @Override |
| public String toString() { |
| - return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs + ",packed=" + packed; |
| + return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs; |
| } |
| |
| void finish(long newStartNode) throws IOException { |
| @@ -463,16 +406,6 @@ public final class FST<T> implements Accountable { |
| bytes.finish(); |
| cacheRootArcs(); |
| } |
| - |
| - private long getNodeAddress(long node) { |
| - if (nodeAddress != null) { |
| - // Deref |
| - return nodeAddress.get((int) node); |
| - } else { |
| - // Straight |
| - return node; |
| - } |
| - } |
| |
| // Optionally caches first 128 labels |
| @SuppressWarnings({"rawtypes","unchecked"}) |
| @@ -527,18 +460,7 @@ public final class FST<T> implements Accountable { |
| if (startNode == -1) { |
| throw new IllegalStateException("call finish first"); |
| } |
| - if (nodeAddress != null) { |
| - throw new IllegalStateException("cannot save an FST pre-packed FST; it must first be packed"); |
| - } |
| - if (packed && !(nodeRefToAddress instanceof PackedInts.Mutable)) { |
| - throw new IllegalStateException("cannot save a FST which has been loaded from disk "); |
| - } |
| CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); |
| - if (packed) { |
| - out.writeByte((byte) 1); |
| - } else { |
| - out.writeByte((byte) 0); |
| - } |
| // TODO: really we should encode this as an arc, arriving |
| // to the root node, instead of special casing here: |
| if (emptyOutput != null) { |
| @@ -552,16 +474,14 @@ public final class FST<T> implements Accountable { |
| byte[] emptyOutputBytes = new byte[(int) ros.getFilePointer()]; |
| ros.writeTo(emptyOutputBytes, 0); |
| |
| - if (!packed) { |
| - // reverse |
| - final int stopAt = emptyOutputBytes.length/2; |
| - int upto = 0; |
| - while(upto < stopAt) { |
| - final byte b = emptyOutputBytes[upto]; |
| - emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1]; |
| - emptyOutputBytes[emptyOutputBytes.length-upto-1] = b; |
| - upto++; |
| - } |
| + // reverse |
| + final int stopAt = emptyOutputBytes.length/2; |
| + int upto = 0; |
| + while(upto < stopAt) { |
| + final byte b = emptyOutputBytes[upto]; |
| + emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1]; |
| + emptyOutputBytes[emptyOutputBytes.length-upto-1] = b; |
| + upto++; |
| } |
| out.writeVInt(emptyOutputBytes.length); |
| out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length); |
| @@ -577,9 +497,6 @@ public final class FST<T> implements Accountable { |
| t = 2; |
| } |
| out.writeByte(t); |
| - if (packed) { |
| - ((PackedInts.Mutable) nodeRefToAddress).save(out); |
| - } |
| out.writeVLong(startNode); |
| if (bytes != null) { |
| long numBytes = bytes.getPosition(); |
| @@ -705,8 +622,6 @@ public final class FST<T> implements Accountable { |
| |
| if (!targetHasArcs) { |
| flags += BIT_STOP_NODE; |
| - } else if (inCounts != null) { |
| - inCounts.set((int) target.node, inCounts.get((int) target.node) + 1); |
| } |
| |
| if (arc.output != NO_OUTPUT) { |
| @@ -810,30 +725,8 @@ public final class FST<T> implements Accountable { |
| |
| builder.bytes.reverse(startAddress, thisNodeAddress); |
| |
| - // PackedInts uses int as the index, so we cannot handle |
| - // > 2.1B nodes when packing: |
| - if (nodeAddress != null && builder.nodeCount == Integer.MAX_VALUE) { |
| - throw new IllegalStateException("cannot create a packed FST with more than 2.1 billion nodes"); |
| - } |
| - |
| builder.nodeCount++; |
| - final long node; |
| - if (nodeAddress != null) { |
| - |
| - // Nodes are addressed by 1+ord: |
| - if ((int) builder.nodeCount == nodeAddress.size()) { |
| - nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue())); |
| - inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue())); |
| - } |
| - nodeAddress.set((int) builder.nodeCount, thisNodeAddress); |
| - // System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress); |
| - node = builder.nodeCount; |
| - } else { |
| - node = thisNodeAddress; |
| - } |
| - |
| - //System.out.println(" ret node=" + node + " address=" + thisNodeAddress + " nodeAddress=" + nodeAddress); |
| - return node; |
| + return thisNodeAddress; |
| } |
| |
| /** Fills virtual 'start' arc, ie, an empty incoming arc to |
| @@ -876,13 +769,13 @@ public final class FST<T> implements Accountable { |
| arc.flags = BIT_LAST_ARC; |
| return arc; |
| } else { |
| - in.setPosition(getNodeAddress(follow.target)); |
| + in.setPosition(follow.target); |
| arc.node = follow.target; |
| final byte b = in.readByte(); |
| if (b == ARCS_AS_FIXED_ARRAY) { |
| // array: jump straight to end |
| arc.numArcs = in.readVInt(); |
| - if (packed || version >= VERSION_VINT_TARGET) { |
| + if (version >= VERSION_VINT_TARGET) { |
| arc.bytesPerArc = in.readVInt(); |
| } else { |
| arc.bytesPerArc = in.readInt(); |
| @@ -906,8 +799,6 @@ public final class FST<T> implements Accountable { |
| } |
| if (arc.flag(BIT_STOP_NODE)) { |
| } else if (arc.flag(BIT_TARGET_NEXT)) { |
| - } else if (packed) { |
| - in.readVLong(); |
| } else { |
| readUnpackedNodeTarget(in); |
| } |
| @@ -964,7 +855,7 @@ public final class FST<T> implements Accountable { |
| } |
| |
| public Arc<T> readFirstRealTargetArc(long node, Arc<T> arc, final BytesReader in) throws IOException { |
| - final long address = getNodeAddress(node); |
| + final long address = node; |
| in.setPosition(address); |
| //System.out.println(" readFirstRealTargtArc address=" |
| //+ address); |
| @@ -975,7 +866,7 @@ public final class FST<T> implements Accountable { |
| //System.out.println(" fixedArray"); |
| // this is first arc in a fixed-array |
| arc.numArcs = in.readVInt(); |
| - if (packed || version >= VERSION_VINT_TARGET) { |
| + if (version >= VERSION_VINT_TARGET) { |
| arc.bytesPerArc = in.readVInt(); |
| } else { |
| arc.bytesPerArc = in.readInt(); |
| @@ -1002,7 +893,7 @@ public final class FST<T> implements Accountable { |
| if (!targetHasArcs(follow)) { |
| return false; |
| } else { |
| - in.setPosition(getNodeAddress(follow.target)); |
| + in.setPosition(follow.target); |
| return in.readByte() == ARCS_AS_FIXED_ARRAY; |
| } |
| } |
| @@ -1029,7 +920,7 @@ public final class FST<T> implements Accountable { |
| //System.out.println(" nextArc fake " + |
| //arc.nextArc); |
| |
| - long pos = getNodeAddress(arc.nextArc); |
| + long pos = arc.nextArc; |
| in.setPosition(pos); |
| |
| final byte b = in.readByte(); |
| @@ -1038,7 +929,7 @@ public final class FST<T> implements Accountable { |
| in.readVInt(); |
| |
| // Skip bytesPerArc: |
| - if (packed || version >= VERSION_VINT_TARGET) { |
| + if (version >= VERSION_VINT_TARGET) { |
| in.readVInt(); |
| } else { |
| in.readInt(); |
| @@ -1107,41 +998,18 @@ public final class FST<T> implements Accountable { |
| arc.nextArc = in.getPosition(); |
| // TODO: would be nice to make this lazy -- maybe |
| // caller doesn't need the target and is scanning arcs... |
| - if (nodeAddress == null) { |
| - if (!arc.flag(BIT_LAST_ARC)) { |
| - if (arc.bytesPerArc == 0) { |
| - // must scan |
| - seekToNextNode(in); |
| - } else { |
| - in.setPosition(arc.posArcsStart); |
| - in.skipBytes(arc.bytesPerArc * arc.numArcs); |
| - } |
| - } |
| - arc.target = in.getPosition(); |
| - } else { |
| - arc.target = arc.node - 1; |
| - assert arc.target > 0; |
| - } |
| - } else { |
| - if (packed) { |
| - final long pos = in.getPosition(); |
| - final long code = in.readVLong(); |
| - if (arc.flag(BIT_TARGET_DELTA)) { |
| - // Address is delta-coded from current address: |
| - arc.target = pos + code; |
| - //System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target); |
| - } else if (code < nodeRefToAddress.size()) { |
| - // Deref |
| - arc.target = nodeRefToAddress.get((int) code); |
| - //System.out.println(" deref code=" + code + " target=" + arc.target); |
| + if (!arc.flag(BIT_LAST_ARC)) { |
| + if (arc.bytesPerArc == 0) { |
| + // must scan |
| + seekToNextNode(in); |
| } else { |
| - // Absolute |
| - arc.target = code; |
| - //System.out.println(" abs code=" + code); |
| + in.setPosition(arc.posArcsStart); |
| + in.skipBytes(arc.bytesPerArc * arc.numArcs); |
| } |
| - } else { |
| - arc.target = readUnpackedNodeTarget(in); |
| } |
| + arc.target = in.getPosition(); |
| + } else { |
| + arc.target = readUnpackedNodeTarget(in); |
| arc.nextArc = in.getPosition(); |
| } |
| return arc; |
| @@ -1228,7 +1096,7 @@ public final class FST<T> implements Accountable { |
| return null; |
| } |
| |
| - in.setPosition(getNodeAddress(follow.target)); |
| + in.setPosition(follow.target); |
| |
| arc.node = follow.target; |
| |
| @@ -1237,7 +1105,7 @@ public final class FST<T> implements Accountable { |
| if (in.readByte() == ARCS_AS_FIXED_ARRAY) { |
| // Arcs are full array; do binary search: |
| arc.numArcs = in.readVInt(); |
| - if (packed || version >= VERSION_VINT_TARGET) { |
| + if (version >= VERSION_VINT_TARGET) { |
| arc.bytesPerArc = in.readVInt(); |
| } else { |
| arc.bytesPerArc = in.readInt(); |
| @@ -1303,11 +1171,7 @@ public final class FST<T> implements Accountable { |
| } |
| |
| if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { |
| - if (packed) { |
| - in.readVLong(); |
| - } else { |
| - readUnpackedNodeTarget(in); |
| - } |
| + readUnpackedNodeTarget(in); |
| } |
| |
| if (flag(flags, BIT_LAST_ARC)) { |
| @@ -1340,18 +1204,10 @@ public final class FST<T> implements Accountable { |
| /** Returns a {@link BytesReader} for this FST, positioned at |
| * position 0. */ |
| public BytesReader getBytesReader() { |
| - if (packed) { |
| - if (bytesArray != null) { |
| - return new ForwardBytesReader(bytesArray); |
| - } else { |
| - return bytes.getForwardReader(); |
| - } |
| + if (bytesArray != null) { |
| + return new ReverseBytesReader(bytesArray); |
| } else { |
| - if (bytesArray != null) { |
| - return new ReverseBytesReader(bytesArray); |
| - } else { |
| - return bytes.getReverseReader(); |
| - } |
| + return bytes.getReverseReader(); |
| } |
| } |
| |
| @@ -1476,395 +1332,4 @@ public final class FST<T> implements Accountable { |
| } |
| */ |
| |
| - // Creates a packed FST |
| - private FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) { |
| - version = VERSION_CURRENT; |
| - packed = true; |
| - this.inputType = inputType; |
| - bytesArray = null; |
| - bytes = new BytesStore(bytesPageBits); |
| - this.outputs = outputs; |
| - } |
| - |
| - /** Expert: creates an FST by packing this one. This |
| - * process requires substantial additional RAM (currently |
| - * up to ~8 bytes per node depending on |
| - * <code>acceptableOverheadRatio</code>), but then should |
| - * produce a smaller FST. |
| - * |
| - * <p>The implementation of this method uses ideas from |
| - * <a target="_blank" href="http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf">Smaller Representation of Finite State Automata</a>, |
| - * which describes techniques to reduce the size of a FST. |
| - * However, this is not a strict implementation of the |
| - * algorithms described in this paper. |
| - */ |
| - FST<T> pack(Builder<T> builder, int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException { |
| - |
| - // NOTE: maxDerefNodes is intentionally int: we cannot |
| - // support > 2.1B deref nodes |
| - |
| - // TODO: other things to try |
| - // - renumber the nodes to get more next / better locality? |
| - // - allow multiple input labels on an arc, so |
| - // singular chain of inputs can take one arc (on |
| - // wikipedia terms this could save another ~6%) |
| - // - in the ord case, the output '1' is presumably |
| - // very common (after NO_OUTPUT)... maybe use a bit |
| - // for it..? |
| - // - use spare bits in flags.... for top few labels / |
| - // outputs / targets |
| - |
| - if (nodeAddress == null) { |
| - throw new IllegalArgumentException("this FST was not built with willPackFST=true"); |
| - } |
| - |
| - T NO_OUTPUT = outputs.getNoOutput(); |
| - |
| - Arc<T> arc = new Arc<>(); |
| - |
| - final BytesReader r = getBytesReader(); |
| - |
| - final int topN = Math.min(maxDerefNodes, inCounts.size()); |
| - |
| - // Find top nodes with highest number of incoming arcs: |
| - NodeQueue q = new NodeQueue(topN); |
| - |
| - // TODO: we could use more RAM efficient selection algo here... |
| - NodeAndInCount bottom = null; |
| - for(int node=0; node<inCounts.size(); node++) { |
| - if (inCounts.get(node) >= minInCountDeref) { |
| - if (bottom == null) { |
| - q.add(new NodeAndInCount(node, (int) inCounts.get(node))); |
| - if (q.size() == topN) { |
| - bottom = q.top(); |
| - } |
| - } else if (inCounts.get(node) > bottom.count) { |
| - q.insertWithOverflow(new NodeAndInCount(node, (int) inCounts.get(node))); |
| - } |
| - } |
| - } |
| - |
| - // Free up RAM: |
| - inCounts = null; |
| - |
| - final Map<Integer,Integer> topNodeMap = new HashMap<>(); |
| - for(int downTo=q.size()-1;downTo>=0;downTo--) { |
| - NodeAndInCount n = q.pop(); |
| - topNodeMap.put(n.node, downTo); |
| - //System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo); |
| - } |
| - |
| - // +1 because node ords start at 1 (0 is reserved as stop node): |
| - final GrowableWriter newNodeAddress = new GrowableWriter( |
| - PackedInts.bitsRequired(builder.bytes.getPosition()), (int) (1 + builder.nodeCount), acceptableOverheadRatio); |
| - |
| - // Fill initial coarse guess: |
| - for(int node=1;node<=builder.nodeCount;node++) { |
| - newNodeAddress.set(node, 1 + builder.bytes.getPosition() - nodeAddress.get(node)); |
| - } |
| - |
| - int absCount; |
| - int deltaCount; |
| - int topCount; |
| - int nextCount; |
| - |
| - FST<T> fst; |
| - |
| - // Iterate until we converge: |
| - while(true) { |
| - |
| - //System.out.println("\nITER"); |
| - boolean changed = false; |
| - |
| - // for assert: |
| - boolean negDelta = false; |
| - |
| - fst = new FST<>(inputType, outputs, builder.bytes.getBlockBits()); |
| - |
| - final BytesStore writer = fst.bytes; |
| - |
| - // Skip 0 byte since 0 is reserved target: |
| - writer.writeByte((byte) 0); |
| - |
| - absCount = deltaCount = topCount = nextCount = 0; |
| - |
| - int changedCount = 0; |
| - |
| - long addressError = 0; |
| - |
| - //int totWasted = 0; |
| - |
| - // Since we re-reverse the bytes, we now write the |
| - // nodes backwards, so that BIT_TARGET_NEXT is |
| - // unchanged: |
| - for(int node=(int) builder.nodeCount;node>=1;node--) { |
| - final long address = writer.getPosition(); |
| - |
| - //System.out.println(" node: " + node + " address=" + address); |
| - if (address != newNodeAddress.get(node)) { |
| - addressError = address - newNodeAddress.get(node); |
| - //System.out.println(" change: " + (address - newNodeAddress[node])); |
| - changed = true; |
| - newNodeAddress.set(node, address); |
| - changedCount++; |
| - } |
| - |
| - int nodeArcCount = 0; |
| - int bytesPerArc = 0; |
| - |
| - boolean retry = false; |
| - |
| - // for assert: |
| - boolean anyNegDelta = false; |
| - |
| - // Retry loop: possibly iterate more than once, if |
| - // this is an array'd node and bytesPerArc changes: |
| - writeNode: |
| - while(true) { // retry writing this node |
| - |
| - //System.out.println(" cycle: retry"); |
| - readFirstRealTargetArc(node, arc, r); |
| - |
| - final boolean useArcArray = arc.bytesPerArc != 0; |
| - if (useArcArray) { |
| - // Write false first arc: |
| - if (bytesPerArc == 0) { |
| - bytesPerArc = arc.bytesPerArc; |
| - } |
| - writer.writeByte(ARCS_AS_FIXED_ARRAY); |
| - writer.writeVInt(arc.numArcs); |
| - writer.writeVInt(bytesPerArc); |
| - //System.out.println("node " + node + ": " + arc.numArcs + " arcs"); |
| - } |
| - |
| - int maxBytesPerArc = 0; |
| - //int wasted = 0; |
| - while(true) { // iterate over all arcs for this node |
| - //System.out.println(" cycle next arc"); |
| - |
| - final long arcStartPos = writer.getPosition(); |
| - nodeArcCount++; |
| - |
| - byte flags = 0; |
| - |
| - if (arc.isLast()) { |
| - flags += BIT_LAST_ARC; |
| - } |
| - /* |
| - if (!useArcArray && nodeUpto < nodes.length-1 && arc.target == nodes[nodeUpto+1]) { |
| - flags += BIT_TARGET_NEXT; |
| - } |
| - */ |
| - if (!useArcArray && node != 1 && arc.target == node-1) { |
| - flags += BIT_TARGET_NEXT; |
| - if (!retry) { |
| - nextCount++; |
| - } |
| - } |
| - if (arc.isFinal()) { |
| - flags += BIT_FINAL_ARC; |
| - if (arc.nextFinalOutput != NO_OUTPUT) { |
| - flags += BIT_ARC_HAS_FINAL_OUTPUT; |
| - } |
| - } else { |
| - assert arc.nextFinalOutput == NO_OUTPUT; |
| - } |
| - if (!targetHasArcs(arc)) { |
| - flags += BIT_STOP_NODE; |
| - } |
| - |
| - if (arc.output != NO_OUTPUT) { |
| - flags += BIT_ARC_HAS_OUTPUT; |
| - } |
| - |
| - final long absPtr; |
| - final boolean doWriteTarget = targetHasArcs(arc) && (flags & BIT_TARGET_NEXT) == 0; |
| - if (doWriteTarget) { |
| - |
| - final Integer ptr = topNodeMap.get(arc.target); |
| - if (ptr != null) { |
| - absPtr = ptr; |
| - } else { |
| - absPtr = topNodeMap.size() + newNodeAddress.get((int) arc.target) + addressError; |
| - } |
| - |
| - long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition() - 2; |
| - if (delta < 0) { |
| - //System.out.println("neg: " + delta); |
| - anyNegDelta = true; |
| - delta = 0; |
| - } |
| - |
| - if (delta < absPtr) { |
| - flags |= BIT_TARGET_DELTA; |
| - } |
| - } else { |
| - absPtr = 0; |
| - } |
| - |
| - assert flags != ARCS_AS_FIXED_ARRAY; |
| - writer.writeByte(flags); |
| - |
| - fst.writeLabel(writer, arc.label); |
| - |
| - if (arc.output != NO_OUTPUT) { |
| - outputs.write(arc.output, writer); |
| - } |
| - if (arc.nextFinalOutput != NO_OUTPUT) { |
| - outputs.writeFinalOutput(arc.nextFinalOutput, writer); |
| - } |
| - |
| - if (doWriteTarget) { |
| - |
| - long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition(); |
| - if (delta < 0) { |
| - anyNegDelta = true; |
| - //System.out.println("neg: " + delta); |
| - delta = 0; |
| - } |
| - |
| - if (flag(flags, BIT_TARGET_DELTA)) { |
| - //System.out.println(" delta"); |
| - writer.writeVLong(delta); |
| - if (!retry) { |
| - deltaCount++; |
| - } |
| - } else { |
| - /* |
| - if (ptr != null) { |
| - System.out.println(" deref"); |
| - } else { |
| - System.out.println(" abs"); |
| - } |
| - */ |
| - writer.writeVLong(absPtr); |
| - if (!retry) { |
| - if (absPtr >= topNodeMap.size()) { |
| - absCount++; |
| - } else { |
| - topCount++; |
| - } |
| - } |
| - } |
| - } |
| - |
| - if (useArcArray) { |
| - final int arcBytes = (int) (writer.getPosition() - arcStartPos); |
| - //System.out.println(" " + arcBytes + " bytes"); |
| - maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes); |
| - // NOTE: this may in fact go "backwards", if |
| - // somehow (rarely, possibly never) we use |
| - // more bytesPerArc in this rewrite than the |
| - // incoming FST did... but in this case we |
| - // will retry (below) so it's OK to ovewrite |
| - // bytes: |
| - //wasted += bytesPerArc - arcBytes; |
| - writer.skipBytes((int) (arcStartPos + bytesPerArc - writer.getPosition())); |
| - } |
| - |
| - if (arc.isLast()) { |
| - break; |
| - } |
| - |
| - readNextRealArc(arc, r); |
| - } |
| - |
| - if (useArcArray) { |
| - if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc)) { |
| - // converged |
| - //System.out.println(" bba=" + bytesPerArc + " wasted=" + wasted); |
| - //totWasted += wasted; |
| - break; |
| - } |
| - } else { |
| - break; |
| - } |
| - |
| - //System.out.println(" retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc); |
| - |
| - // Retry: |
| - bytesPerArc = maxBytesPerArc; |
| - writer.truncate(address); |
| - nodeArcCount = 0; |
| - retry = true; |
| - anyNegDelta = false; |
| - } |
| - |
| - negDelta |= anyNegDelta; |
| - } |
| - |
| - if (!changed) { |
| - // We don't renumber the nodes (just reverse their |
| - // order) so nodes should only point forward to |
| - // other nodes because we only produce acyclic FSTs |
| - // w/ nodes only pointing "forwards": |
| - assert !negDelta; |
| - //System.out.println("TOT wasted=" + totWasted); |
| - // Converged! |
| - break; |
| - } |
| - } |
| - |
| - long maxAddress = 0; |
| - for (long key : topNodeMap.keySet()) { |
| - maxAddress = Math.max(maxAddress, newNodeAddress.get((int) key)); |
| - } |
| - |
| - PackedInts.Mutable nodeRefToAddressIn = PackedInts.getMutable(topNodeMap.size(), |
| - PackedInts.bitsRequired(maxAddress), acceptableOverheadRatio); |
| - for(Map.Entry<Integer,Integer> ent : topNodeMap.entrySet()) { |
| - nodeRefToAddressIn.set(ent.getValue(), newNodeAddress.get(ent.getKey())); |
| - } |
| - fst.nodeRefToAddress = nodeRefToAddressIn; |
| - |
| - fst.startNode = newNodeAddress.get((int) startNode); |
| - //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode); |
| - |
| - if (emptyOutput != null) { |
| - fst.setEmptyOutput(emptyOutput); |
| - } |
| - |
| - fst.bytes.finish(); |
| - fst.cacheRootArcs(); |
| - |
| - //final int size = fst.sizeInBytes(); |
| - //System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount); |
| - |
| - return fst; |
| - } |
| - |
| - private static class NodeAndInCount implements Comparable<NodeAndInCount> { |
| - final int node; |
| - final int count; |
| - |
| - public NodeAndInCount(int node, int count) { |
| - this.node = node; |
| - this.count = count; |
| - } |
| - |
| - @Override |
| - public int compareTo(NodeAndInCount other) { |
| - if (count > other.count) { |
| - return 1; |
| - } else if (count < other.count) { |
| - return -1; |
| - } else { |
| - // Tie-break: smaller node compares as greater than |
| - return other.node - node; |
| - } |
| - } |
| - } |
| - |
| - private static class NodeQueue extends PriorityQueue<NodeAndInCount> { |
| - public NodeQueue(int topN) { |
| - super(topN, false); |
| - } |
| - |
| - @Override |
| - public boolean lessThan(NodeAndInCount a, NodeAndInCount b) { |
| - final int cmp = a.compareTo(b); |
| - assert cmp != 0; |
| - return cmp < 0; |
| - } |
| - } |
| } |
| diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java b/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java |
| index 41426f9..d984586 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java |
| @@ -24,7 +24,6 @@ |
| * <li>Fast and low memory overhead construction of the minimal FST |
| * (but inputs must be provided in sorted order)</li> |
| * <li>Low object overhead and quick deserialization (byte[] representation)</li> |
| - * <li>Optional two-pass compression: {@link org.apache.lucene.util.fst.FST#pack FST.pack()}</li> |
| * <li>{@link org.apache.lucene.util.fst.Util#getByOutput Lookup-by-output} when the |
| * outputs are in sorted order (e.g., ordinals or file pointers)</li> |
| * <li>Pluggable {@link org.apache.lucene.util.fst.Outputs Outputs} representation</li> |
| diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java |
| index bdec65c..a02bf8a 100644 |
| --- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java |
| +++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java |
| @@ -29,7 +29,6 @@ import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TimeUnits; |
| -import org.apache.lucene.util.packed.PackedInts; |
| import org.junit.Ignore; |
| |
| import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; |
| @@ -47,16 +46,14 @@ public class Test2BFST extends LuceneTestCase { |
| |
| Directory dir = new MMapDirectory(createTempDir("2BFST")); |
| |
| - for(int doPackIter=0;doPackIter<2;doPackIter++) { |
| - boolean doPack = doPackIter == 1; |
| - |
| + for(int iter=0;iter<1;iter++) { |
| // Build FST w/ NoOutputs and stop when nodeCount > 2.2B |
| - if (!doPack) { |
| + { |
| System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS"); |
| Outputs<Object> outputs = NoOutputs.getSingleton(); |
| Object NO_OUTPUT = outputs.getNoOutput(); |
| final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, |
| - doPack, PackedInts.COMPACT, true, 15); |
| + true, 15); |
| |
| int count = 0; |
| Random r = new Random(seed); |
| @@ -135,10 +132,10 @@ public class Test2BFST extends LuceneTestCase { |
| // Build FST w/ ByteSequenceOutputs and stop when FST |
| // size = 3GB |
| { |
| - System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes"); |
| + System.out.println("\nTEST: 3 GB size; outputs=bytes"); |
| Outputs<BytesRef> outputs = ByteSequenceOutputs.getSingleton(); |
| final Builder<BytesRef> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, |
| - doPack, PackedInts.COMPACT, true, 15); |
| + true, 15); |
| |
| byte[] outputBytes = new byte[20]; |
| BytesRef output = new BytesRef(outputBytes); |
| @@ -212,10 +209,10 @@ public class Test2BFST extends LuceneTestCase { |
| // Build FST w/ PositiveIntOutputs and stop when FST |
| // size = 3GB |
| { |
| - System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long"); |
| + System.out.println("\nTEST: 3 GB size; outputs=long"); |
| Outputs<Long> outputs = PositiveIntOutputs.getSingleton(); |
| final Builder<Long> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, |
| - doPack, PackedInts.COMPACT, true, 15); |
| + true, 15); |
| |
| long output = 1; |
| |
| diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java |
| index 39b3282..6b218cf 100644 |
| --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java |
| +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java |
| @@ -76,7 +76,6 @@ import org.apache.lucene.util.fst.FST.Arc; |
| import org.apache.lucene.util.fst.FST.BytesReader; |
| import org.apache.lucene.util.fst.PairOutputs.Pair; |
| import org.apache.lucene.util.fst.Util.Result; |
| -import org.apache.lucene.util.packed.PackedInts; |
| |
| import static org.apache.lucene.util.fst.FSTTester.getRandomString; |
| import static org.apache.lucene.util.fst.FSTTester.simpleRandomString; |
| @@ -328,9 +327,7 @@ public class TestFSTs extends LuceneTestCase { |
| writer.close(); |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| |
| - final boolean doRewrite = random().nextBoolean(); |
| - |
| - Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doRewrite, PackedInts.DEFAULT, true, 15); |
| + Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); |
| |
| boolean storeOrd = random().nextBoolean(); |
| if (VERBOSE) { |
| @@ -464,16 +461,14 @@ public class TestFSTs extends LuceneTestCase { |
| private int inputMode; |
| private final Outputs<T> outputs; |
| private final Builder<T> builder; |
| - private final boolean doPack; |
| |
| - public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean doPack, boolean noArcArrays) { |
| + public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs<T> outputs, boolean noArcArrays) { |
| this.dirOut = dirOut; |
| this.wordsFileIn = wordsFileIn; |
| this.inputMode = inputMode; |
| this.outputs = outputs; |
| - this.doPack = doPack; |
| |
| - builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, doPack, PackedInts.DEFAULT, !noArcArrays, 15); |
| + builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15); |
| } |
| |
| protected abstract T getOutput(IntsRef input, int ord) throws IOException; |
| @@ -622,7 +617,6 @@ public class TestFSTs extends LuceneTestCase { |
| boolean storeOrds = false; |
| boolean storeDocFreqs = false; |
| boolean verify = true; |
| - boolean doPack = false; |
| boolean noArcArrays = false; |
| Path wordsFileIn = null; |
| Path dirOut = null; |
| @@ -647,8 +641,6 @@ public class TestFSTs extends LuceneTestCase { |
| storeOrds = true; |
| } else if (args[idx].equals("-noverify")) { |
| verify = false; |
| - } else if (args[idx].equals("-pack")) { |
| - doPack = true; |
| } else if (args[idx].startsWith("-")) { |
| System.err.println("Unrecognized option: " + args[idx]); |
| System.exit(-1); |
| @@ -677,7 +669,7 @@ public class TestFSTs extends LuceneTestCase { |
| final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(); |
| final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(); |
| final PairOutputs<Long,Long> outputs = new PairOutputs<>(o1, o2); |
| - new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { |
| + new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { |
| Random rand; |
| @Override |
| public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) { |
| @@ -691,7 +683,7 @@ public class TestFSTs extends LuceneTestCase { |
| } else if (storeOrds) { |
| // Store only ords |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| - new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { |
| + new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { |
| @Override |
| public Long getOutput(IntsRef input, int ord) { |
| return (long) ord; |
| @@ -700,7 +692,7 @@ public class TestFSTs extends LuceneTestCase { |
| } else if (storeDocFreqs) { |
| // Store only docFreq |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| - new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { |
| + new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { |
| Random rand; |
| @Override |
| public Long getOutput(IntsRef input, int ord) { |
| @@ -714,7 +706,7 @@ public class TestFSTs extends LuceneTestCase { |
| // Store nothing |
| final NoOutputs outputs = NoOutputs.getSingleton(); |
| final Object NO_OUTPUT = outputs.getNoOutput(); |
| - new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { |
| + new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { |
| @Override |
| public Object getOutput(IntsRef input, int ord) { |
| return NO_OUTPUT; |
| @@ -1118,7 +1110,7 @@ public class TestFSTs extends LuceneTestCase { |
| public void testFinalOutputOnEndState() throws Exception { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| |
| - final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, random().nextBoolean(), PackedInts.DEFAULT, true, 15); |
| + final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); |
| builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L); |
| builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L); |
| final FST<Long> fst = builder.finish(); |
| @@ -1132,8 +1124,7 @@ public class TestFSTs extends LuceneTestCase { |
| |
| public void testInternalFinalState() throws Exception { |
| final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); |
| - final boolean willRewrite = random().nextBoolean(); |
| - final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, willRewrite, PackedInts.DEFAULT, true, 15); |
| + final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); |
| builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput()); |
| builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput()); |
| final FST<Long> fst = builder.finish(); |
| diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java |
| index b49ea79..d83b915 100644 |
| --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java |
| +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java |
| @@ -50,7 +50,6 @@ import org.apache.lucene.util.fst.PairOutputs.Pair; |
| import org.apache.lucene.util.fst.PairOutputs; |
| import org.apache.lucene.util.fst.PositiveIntOutputs; |
| import org.apache.lucene.util.fst.Util; |
| -import org.apache.lucene.util.packed.PackedInts; |
| |
| /* |
| TODO: |
| @@ -354,8 +353,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { |
| |
| final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, |
| 0, 0, true, false, Integer.MAX_VALUE, |
| - FST_OUTPUTS, false, |
| - PackedInts.COMPACT, true, 15); |
| + FST_OUTPUTS, true, 15); |
| //if (DEBUG) { |
| // System.out.println(" compile index for prefix=" + prefix); |
| //} |
| diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java |
| index 3706724..3d20412 100644 |
| --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java |
| +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java |
| @@ -26,7 +26,6 @@ import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.BytesRefIterator; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.fst.*; |
| -import org.apache.lucene.util.packed.PackedInts; |
| |
| /** |
| * Finite state automata based implementation of "autocomplete" functionality. |
| @@ -237,8 +236,7 @@ public class FSTCompletionBuilder { |
| final Object empty = outputs.getNoOutput(); |
| final Builder<Object> builder = new Builder<>( |
| FST.INPUT_TYPE.BYTE1, 0, 0, true, true, |
| - shareMaxTailLength, outputs, false, |
| - PackedInts.DEFAULT, true, 15); |
| + shareMaxTailLength, outputs, true, 15); |
| |
| BytesRefBuilder scratch = new BytesRefBuilder(); |
| BytesRef entry; |
| diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java |
| index 11b1325..8e6a4ea 100644 |
| --- a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java |
| +++ b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java |
| @@ -40,7 +40,6 @@ import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.UnicodeUtil; |
| -import org.apache.lucene.util.packed.PackedInts; |
| |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertFalse; |
| @@ -273,25 +272,14 @@ public class FSTTester<T> { |
| System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2); |
| } |
| |
| - final boolean willRewrite = random.nextBoolean(); |
| - |
| final Builder<T> builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, |
| prune1, prune2, |
| prune1==0 && prune2==0, |
| allowRandomSuffixSharing ? random.nextBoolean() : true, |
| allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, |
| outputs, |
| - willRewrite, |
| - PackedInts.DEFAULT, |
| true, |
| 15); |
| - if (LuceneTestCase.VERBOSE) { |
| - if (willRewrite) { |
| - System.out.println("TEST: packed FST"); |
| - } else { |
| - System.out.println("TEST: non-packed FST"); |
| - } |
| - } |
| |
| for(InputOutput<T> pair : pairs) { |
| if (pair.output instanceof List) { |
| @@ -306,7 +294,7 @@ public class FSTTester<T> { |
| } |
| FST<T> fst = builder.finish(); |
| |
| - if (random.nextBoolean() && fst != null && !willRewrite) { |
| + if (random.nextBoolean() && fst != null) { |
| IOContext context = LuceneTestCase.newIOContext(random); |
| IndexOutput out = dir.createOutput("fst.bin", context); |
| fst.save(out); |