| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util.fst; |
| |
| |
| import java.io.BufferedInputStream; |
| import java.io.BufferedOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.OutputStream; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.index.CorruptIndexException; |
| import org.apache.lucene.store.DataInput; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.store.InputStreamDataInput; |
| import org.apache.lucene.store.OutputStreamDataOutput; |
| import org.apache.lucene.store.RAMOutputStream; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.Constants; |
| import org.apache.lucene.util.RamUsageEstimator; |
| |
| import static org.apache.lucene.util.fst.FST.Arc.BitTable; |
| |
| // TODO: break this into WritableFST and ReadOnlyFST.. then |
| // we can have subclasses of ReadOnlyFST to handle the |
| // different byte[] level encodings (packed or |
| // not)... and things like nodeCount, arcCount are read only |
| |
| // TODO: if FST is pure prefix trie we can do a more compact |
| // job, ie, once we are at a 'suffix only', just store the |
| // completion labels as a string not as a series of arcs. |
| |
| // NOTE: while the FST is able to represent a non-final |
| // dead-end state (NON_FINAL_END_NODE=0), the layers above |
| // (FSTEnum, Util) have problems with this!! |
| |
| /** Represents an finite state machine (FST), using a |
| * compact byte[] format. |
| * <p> The format is similar to what's used by Morfologik |
| * (https://github.com/morfologik/morfologik-stemming). |
| * |
| * <p> See the {@link org.apache.lucene.util.fst package |
| * documentation} for some simple examples. |
| * |
| * @lucene.experimental |
| */ |
| public final class FST<T> implements Accountable { |
| |
| /** Specifies allowed range of each int input label for |
| * this FST. */ |
| public enum INPUT_TYPE {BYTE1, BYTE2, BYTE4} |
| |
| private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FST.class); |
| private static final long ARC_SHALLOW_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Arc.class); |
| |
| private static final int BIT_FINAL_ARC = 1 << 0; |
| static final int BIT_LAST_ARC = 1 << 1; |
| static final int BIT_TARGET_NEXT = 1 << 2; |
| |
| // TODO: we can free up a bit if we can nuke this: |
| private static final int BIT_STOP_NODE = 1 << 3; |
| |
| /** This flag is set if the arc has an output. */ |
| public static final int BIT_ARC_HAS_OUTPUT = 1 << 4; |
| |
| private static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; |
| |
| /** Value of the arc flags to declare a node with fixed length arcs |
| * designed for binary search. */ |
| // We use this as a marker because this one flag is illegal by itself. |
| public static final byte ARCS_FOR_BINARY_SEARCH = BIT_ARC_HAS_FINAL_OUTPUT; |
| |
| /** Value of the arc flags to declare a node with fixed length arcs |
| * and bit table designed for direct addressing. */ |
| static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6; |
| |
| /** |
| * @see #shouldExpandNodeWithFixedLengthArcs |
| */ |
| static final int FIXED_LENGTH_ARC_SHALLOW_DEPTH = 3; // 0 => only root node. |
| |
| /** |
| * @see #shouldExpandNodeWithFixedLengthArcs |
| */ |
| static final int FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS = 5; |
| |
| /** |
| * @see #shouldExpandNodeWithFixedLengthArcs |
| */ |
| static final int FIXED_LENGTH_ARC_DEEP_NUM_ARCS = 10; |
| |
| /** |
| * Maximum oversizing factor allowed for direct addressing compared to binary search when expansion |
| * credits allow the oversizing. This factor prevents expansions that are obviously too costly even |
| * if there are sufficient credits. |
| * |
| * @see #shouldExpandNodeWithDirectAddressing |
| */ |
| private static final float DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR = 1.66f; |
| |
| // Increment version to change it |
| private static final String FILE_FORMAT_NAME = "FST"; |
| private static final int VERSION_START = 6; |
| private static final int VERSION_CURRENT = 7; |
| |
| // Never serialized; just used to represent the virtual |
| // final node w/ no arcs: |
| private static final long FINAL_END_NODE = -1; |
| |
| // Never serialized; just used to represent the virtual |
| // non-final node w/ no arcs: |
| private static final long NON_FINAL_END_NODE = 0; |
| |
| /** If arc has this label then that arc is final/accepted */ |
| public static final int END_LABEL = -1; |
| |
| final INPUT_TYPE inputType; |
| |
| // if non-null, this FST accepts the empty string and |
| // produces this output |
| T emptyOutput; |
| |
| /** A {@link BytesStore}, used during building, or during reading when |
| * the FST is very large (more than 1 GB). If the FST is less than 1 |
| * GB then bytesArray is set instead. */ |
| final BytesStore bytes; |
| |
| private final FSTStore fstStore; |
| |
| private long startNode = -1; |
| |
| public final Outputs<T> outputs; |
| |
| /** Represents a single arc. */ |
| public static final class Arc<T> { |
| |
| //*** Arc fields. |
| |
| private int label; |
| |
| private T output; |
| |
| private long target; |
| |
| private byte flags; |
| |
| private T nextFinalOutput; |
| |
| private long nextArc; |
| |
| private byte nodeFlags; |
| |
| //*** Fields for arcs belonging to a node with fixed length arcs. |
| // So only valid when bytesPerArc != 0. |
| // nodeFlags == ARCS_FOR_BINARY_SEARCH || nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. |
| |
| private int bytesPerArc; |
| |
| private long posArcsStart; |
| |
| private int arcIdx; |
| |
| private int numArcs; |
| |
| //*** Fields for a direct addressing node. nodeFlags == ARCS_FOR_DIRECT_ADDRESSING. |
| |
| /** Start position in the {@link FST.BytesReader} of the presence bits for a direct addressing node, aka the bit-table */ |
| private long bitTableStart; |
| |
| /** First label of a direct addressing node. */ |
| private int firstLabel; |
| |
| /** |
| * Index of the current label of a direct addressing node. While {@link #arcIdx} is the current index in the label |
| * range, {@link #presenceIndex} is its corresponding index in the list of actually present labels. It is equal |
| * to the number of bits set before the bit at {@link #arcIdx} in the bit-table. This field is a cache to avoid |
| * to count bits set repeatedly when iterating the next arcs. |
| */ |
| private int presenceIndex; |
| |
| /** Returns this */ |
| public Arc<T> copyFrom(Arc<T> other) { |
| label = other.label(); |
| target = other.target(); |
| flags = other.flags(); |
| output = other.output(); |
| nextFinalOutput = other.nextFinalOutput(); |
| nextArc = other.nextArc(); |
| nodeFlags = other.nodeFlags(); |
| bytesPerArc = other.bytesPerArc(); |
| |
| // Fields for arcs belonging to a node with fixed length arcs. |
| // We could avoid copying them if bytesPerArc() == 0 (this was the case with previous code, and the current code |
| // still supports that), but it may actually help external uses of FST to have consistent arc state, and debugging |
| // is easier. |
| posArcsStart = other.posArcsStart(); |
| arcIdx = other.arcIdx(); |
| numArcs = other.numArcs(); |
| bitTableStart = other.bitTableStart; |
| firstLabel = other.firstLabel(); |
| presenceIndex = other.presenceIndex; |
| |
| return this; |
| } |
| |
| boolean flag(int flag) { |
| return FST.flag(flags, flag); |
| } |
| |
| public boolean isLast() { |
| return flag(BIT_LAST_ARC); |
| } |
| |
| public boolean isFinal() { |
| return flag(BIT_FINAL_ARC); |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder b = new StringBuilder(); |
| b.append(" target=").append(target()); |
| b.append(" label=0x").append(Integer.toHexString(label())); |
| if (flag(BIT_FINAL_ARC)) { |
| b.append(" final"); |
| } |
| if (flag(BIT_LAST_ARC)) { |
| b.append(" last"); |
| } |
| if (flag(BIT_TARGET_NEXT)) { |
| b.append(" targetNext"); |
| } |
| if (flag(BIT_STOP_NODE)) { |
| b.append(" stop"); |
| } |
| if (flag(BIT_ARC_HAS_OUTPUT)) { |
| b.append(" output=").append(output()); |
| } |
| if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) { |
| b.append(" nextFinalOutput=").append(nextFinalOutput()); |
| } |
| if (bytesPerArc() != 0) { |
| b.append(" arcArray(idx=").append(arcIdx()).append(" of ").append(numArcs()).append(")") |
| .append("(").append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs").append(")"); |
| } |
| return b.toString(); |
| } |
| |
| public int label() { |
| return label; |
| } |
| |
| public T output() { |
| return output; |
| } |
| |
| /** Ord/address to target node. */ |
| public long target() { |
| return target; |
| } |
| |
| public byte flags() { |
| return flags; |
| } |
| |
| public T nextFinalOutput() { |
| return nextFinalOutput; |
| } |
| |
| /** Address (into the byte[]) of the next arc - only for list of variable length arc. |
| * Or ord/address to the next node if label == {@link #END_LABEL}. */ |
| long nextArc() { |
| return nextArc; |
| } |
| |
| /** Where we are in the array; only valid if bytesPerArc != 0. */ |
| public int arcIdx() { |
| return arcIdx; |
| } |
| |
| /** Node header flags. Only meaningful to check if the value is either |
| * {@link #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} |
| * (other value when bytesPerArc == 0). */ |
| public byte nodeFlags() { |
| return nodeFlags; |
| } |
| |
| /** Where the first arc in the array starts; only valid if bytesPerArc != 0 */ |
| public long posArcsStart() { |
| return posArcsStart; |
| } |
| |
| /** Non-zero if this arc is part of a node with fixed length arcs, which means all |
| * arcs for the node are encoded with a fixed number of bytes so |
| * that we binary search or direct address. We do when there are enough |
| * arcs leaving one node. It wastes some bytes but gives faster lookups. */ |
| public int bytesPerArc() { |
| return bytesPerArc; |
| } |
| |
| /** How many arcs; only valid if bytesPerArc != 0 (fixed length arcs). |
| * For a node designed for binary search this is the array size. |
| * For a node designed for direct addressing, this is the label range. */ |
| public int numArcs() { |
| return numArcs; |
| } |
| |
| /** First label of a direct addressing node. |
| * Only valid if nodeFlags == {@link #ARCS_FOR_DIRECT_ADDRESSING}. */ |
| int firstLabel() { |
| return firstLabel; |
| } |
| |
| /** |
| * Helper methods to read the bit-table of a direct addressing node. |
| * Only valid for {@link Arc} with {@link Arc#nodeFlags()} == {@code ARCS_FOR_DIRECT_ADDRESSING}. |
| */ |
| static class BitTable { |
| |
| /** See {@link BitTableUtil#isBitSet(int, FST.BytesReader)}. */ |
| static boolean isBitSet(int bitIndex, Arc<?> arc, FST.BytesReader in) throws IOException { |
| assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; |
| in.setPosition(arc.bitTableStart); |
| return BitTableUtil.isBitSet(bitIndex, in); |
| } |
| |
| /** |
| * See {@link BitTableUtil#countBits(int, FST.BytesReader)}. |
| * The count of bit set is the number of arcs of a direct addressing node. |
| */ |
| static int countBits(Arc<?> arc, FST.BytesReader in) throws IOException { |
| assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; |
| in.setPosition(arc.bitTableStart); |
| return BitTableUtil.countBits(getNumPresenceBytes(arc.numArcs()), in); |
| } |
| |
| /** See {@link BitTableUtil#countBitsUpTo(int, FST.BytesReader)}. */ |
| static int countBitsUpTo(int bitIndex, Arc<?> arc, FST.BytesReader in) throws IOException { |
| assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; |
| in.setPosition(arc.bitTableStart); |
| return BitTableUtil.countBitsUpTo(bitIndex, in); |
| } |
| |
| /** See {@link BitTableUtil#nextBitSet(int, int, FST.BytesReader)}. */ |
| static int nextBitSet(int bitIndex, Arc<?> arc, FST.BytesReader in) throws IOException { |
| assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; |
| in.setPosition(arc.bitTableStart); |
| return BitTableUtil.nextBitSet(bitIndex, getNumPresenceBytes(arc.numArcs()), in); |
| } |
| |
| /** See {@link BitTableUtil#previousBitSet(int, FST.BytesReader)}. */ |
| static int previousBitSet(int bitIndex, Arc<?> arc, FST.BytesReader in) throws IOException { |
| assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; |
| in.setPosition(arc.bitTableStart); |
| return BitTableUtil.previousBitSet(bitIndex, in); |
| } |
| |
| /** |
| * Asserts the bit-table of the provided {@link Arc} is valid. |
| */ |
| static boolean assertIsValid(Arc<?> arc, FST.BytesReader in) throws IOException { |
| assert arc.bytesPerArc() > 0; |
| assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; |
| // First bit must be set. |
| assert isBitSet(0, arc, in); |
| // Last bit must be set. |
| assert isBitSet(arc.numArcs() - 1, arc, in); |
| // No bit set after the last arc. |
| assert nextBitSet(arc.numArcs() - 1, arc, in) == -1; |
| return true; |
| } |
| } |
| } |
| |
| private static boolean flag(int flags, int bit) { |
| return (flags & bit) != 0; |
| } |
| |
| // make a new empty FST, for building; Builder invokes this |
| FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) { |
| this.inputType = inputType; |
| this.outputs = outputs; |
| fstStore = null; |
| bytes = new BytesStore(bytesPageBits); |
| // pad: ensure no node gets address 0 which is reserved to mean |
| // the stop state w/ no arcs |
| bytes.writeByte((byte) 0); |
| emptyOutput = null; |
| } |
| |
| private static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28; |
| |
| /** Load a previously saved FST. */ |
| public FST(DataInput metaIn, DataInput in, Outputs<T> outputs) throws IOException { |
| this(metaIn, in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS)); |
| } |
| |
| /** Load a previously saved FST; maxBlockBits allows you to |
| * control the size of the byte[] pages used to hold the FST bytes. */ |
| public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore) throws IOException { |
| bytes = null; |
| this.fstStore = fstStore; |
| this.outputs = outputs; |
| |
| // NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have |
| // back-compat promise for FSTs (they are experimental), but we are sometimes able to offer it |
| CodecUtil.checkHeader(metaIn, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT); |
| if (metaIn.readByte() == 1) { |
| // accepts empty string |
| // 1 KB blocks: |
| BytesStore emptyBytes = new BytesStore(10); |
| int numBytes = metaIn.readVInt(); |
| emptyBytes.copyBytes(metaIn, numBytes); |
| |
| // De-serialize empty-string output: |
| BytesReader reader = emptyBytes.getReverseReader(); |
| // NoOutputs uses 0 bytes when writing its output, |
| // so we have to check here else BytesStore gets |
| // angry: |
| if (numBytes > 0) { |
| reader.setPosition(numBytes-1); |
| } |
| emptyOutput = outputs.readFinalOutput(reader); |
| } else { |
| emptyOutput = null; |
| } |
| final byte t = metaIn.readByte(); |
| switch(t) { |
| case 0: |
| inputType = INPUT_TYPE.BYTE1; |
| break; |
| case 1: |
| inputType = INPUT_TYPE.BYTE2; |
| break; |
| case 2: |
| inputType = INPUT_TYPE.BYTE4; |
| break; |
| default: |
| throw new CorruptIndexException("invalid input type " + t, in); |
| } |
| startNode = metaIn.readVLong(); |
| |
| long numBytes = metaIn.readVLong(); |
| this.fstStore.init(in, numBytes); |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| long size = BASE_RAM_BYTES_USED; |
| if (this.fstStore != null) { |
| size += this.fstStore.ramBytesUsed(); |
| } else { |
| size += bytes.ramBytesUsed(); |
| } |
| |
| return size; |
| } |
| |
| @Override |
| public String toString() { |
| return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs; |
| } |
| |
| void finish(long newStartNode) throws IOException { |
| assert newStartNode <= bytes.getPosition(); |
| if (startNode != -1) { |
| throw new IllegalStateException("already finished"); |
| } |
| if (newStartNode == FINAL_END_NODE && emptyOutput != null) { |
| newStartNode = 0; |
| } |
| startNode = newStartNode; |
| bytes.finish(); |
| } |
| |
| public T getEmptyOutput() { |
| return emptyOutput; |
| } |
| |
| void setEmptyOutput(T v) { |
| if (emptyOutput != null) { |
| emptyOutput = outputs.merge(emptyOutput, v); |
| } else { |
| emptyOutput = v; |
| } |
| } |
| |
| public void save(DataOutput metaOut, DataOutput out) throws IOException { |
| if (startNode == -1) { |
| throw new IllegalStateException("call finish first"); |
| } |
| CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT); |
| // TODO: really we should encode this as an arc, arriving |
| // to the root node, instead of special casing here: |
| if (emptyOutput != null) { |
| // Accepts empty string |
| metaOut.writeByte((byte) 1); |
| |
| // Serialize empty-string output: |
| RAMOutputStream ros = new RAMOutputStream(); |
| outputs.writeFinalOutput(emptyOutput, ros); |
| |
| byte[] emptyOutputBytes = new byte[(int) ros.getFilePointer()]; |
| ros.writeTo(emptyOutputBytes, 0); |
| int emptyLen = emptyOutputBytes.length; |
| |
| // reverse |
| final int stopAt = emptyLen / 2; |
| int upto = 0; |
| while(upto < stopAt) { |
| final byte b = emptyOutputBytes[upto]; |
| emptyOutputBytes[upto] = emptyOutputBytes[emptyLen - upto - 1]; |
| emptyOutputBytes[emptyLen - upto - 1] = b; |
| upto++; |
| } |
| metaOut.writeVInt(emptyLen); |
| metaOut.writeBytes(emptyOutputBytes, 0, emptyLen); |
| } else { |
| metaOut.writeByte((byte) 0); |
| } |
| final byte t; |
| if (inputType == INPUT_TYPE.BYTE1) { |
| t = 0; |
| } else if (inputType == INPUT_TYPE.BYTE2) { |
| t = 1; |
| } else { |
| t = 2; |
| } |
| metaOut.writeByte(t); |
| metaOut.writeVLong(startNode); |
| if (bytes != null) { |
| long numBytes = bytes.getPosition(); |
| metaOut.writeVLong(numBytes); |
| bytes.writeTo(out); |
| } else { |
| assert fstStore != null; |
| fstStore.writeTo(out); |
| } |
| } |
| |
| /** |
| * Writes an automaton to a file. |
| */ |
| public void save(final Path path) throws IOException { |
| try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) { |
| DataOutput out = new OutputStreamDataOutput(os); |
| save(out, out); |
| } |
| } |
| |
| /** |
| * Reads an automaton from a file. |
| */ |
| public static <T> FST<T> read(Path path, Outputs<T> outputs) throws IOException { |
| try (InputStream is = Files.newInputStream(path)) { |
| DataInput in = new InputStreamDataInput(new BufferedInputStream(is)); |
| return new FST<>(in, in, outputs); |
| } |
| } |
| |
| private void writeLabel(DataOutput out, int v) throws IOException { |
| assert v >= 0: "v=" + v; |
| if (inputType == INPUT_TYPE.BYTE1) { |
| assert v <= 255: "v=" + v; |
| out.writeByte((byte) v); |
| } else if (inputType == INPUT_TYPE.BYTE2) { |
| assert v <= 65535: "v=" + v; |
| out.writeShort((short) v); |
| } else { |
| out.writeVInt(v); |
| } |
| } |
| |
| /** Reads one BYTE1/2/4 label from the provided {@link DataInput}. */ |
| public int readLabel(DataInput in) throws IOException { |
| final int v; |
| if (inputType == INPUT_TYPE.BYTE1) { |
| // Unsigned byte: |
| v = in.readByte() & 0xFF; |
| } else if (inputType == INPUT_TYPE.BYTE2) { |
| // Unsigned short: |
| v = in.readShort() & 0xFFFF; |
| } else { |
| v = in.readVInt(); |
| } |
| return v; |
| } |
| |
| /** returns true if the node at this address has any |
| * outgoing arcs */ |
| public static<T> boolean targetHasArcs(Arc<T> arc) { |
| return arc.target() > 0; |
| } |
| |
| // serializes new node by appending its bytes to the end |
| // of the current byte[] |
| long addNode(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException { |
| T NO_OUTPUT = outputs.getNoOutput(); |
| |
| //System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); |
| if (nodeIn.numArcs == 0) { |
| if (nodeIn.isFinal) { |
| return FINAL_END_NODE; |
| } else { |
| return NON_FINAL_END_NODE; |
| } |
| } |
| final long startAddress = builder.bytes.getPosition(); |
| //System.out.println(" startAddr=" + startAddress); |
| |
| final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(builder, nodeIn); |
| if (doFixedLengthArcs) { |
| //System.out.println(" fixed length arcs"); |
| if (builder.numBytesPerArc.length < nodeIn.numArcs) { |
| builder.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)]; |
| builder.numLabelBytesPerArc = new int[builder.numBytesPerArc.length]; |
| } |
| } |
| |
| builder.arcCount += nodeIn.numArcs; |
| |
| final int lastArc = nodeIn.numArcs-1; |
| |
| long lastArcStart = builder.bytes.getPosition(); |
| int maxBytesPerArc = 0; |
| int maxBytesPerArcWithoutLabel = 0; |
| for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) { |
| final Builder.Arc<T> arc = nodeIn.arcs[arcIdx]; |
| final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; |
| int flags = 0; |
| //System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node); |
| |
| if (arcIdx == lastArc) { |
| flags += BIT_LAST_ARC; |
| } |
| |
| if (builder.lastFrozenNode == target.node && !doFixedLengthArcs) { |
| // TODO: for better perf (but more RAM used) we |
| // could avoid this except when arc is "near" the |
| // last arc: |
| flags += BIT_TARGET_NEXT; |
| } |
| |
| if (arc.isFinal) { |
| flags += BIT_FINAL_ARC; |
| if (arc.nextFinalOutput != NO_OUTPUT) { |
| flags += BIT_ARC_HAS_FINAL_OUTPUT; |
| } |
| } else { |
| assert arc.nextFinalOutput == NO_OUTPUT; |
| } |
| |
| boolean targetHasArcs = target.node > 0; |
| |
| if (!targetHasArcs) { |
| flags += BIT_STOP_NODE; |
| } |
| |
| if (arc.output != NO_OUTPUT) { |
| flags += BIT_ARC_HAS_OUTPUT; |
| } |
| |
| builder.bytes.writeByte((byte) flags); |
| long labelStart = builder.bytes.getPosition(); |
| writeLabel(builder.bytes, arc.label); |
| int numLabelBytes = (int) (builder.bytes.getPosition() - labelStart); |
| |
| // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output)); |
| |
| if (arc.output != NO_OUTPUT) { |
| outputs.write(arc.output, builder.bytes); |
| //System.out.println(" write output"); |
| } |
| |
| if (arc.nextFinalOutput != NO_OUTPUT) { |
| //System.out.println(" write final output"); |
| outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes); |
| } |
| |
| if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { |
| assert target.node > 0; |
| //System.out.println(" write target"); |
| builder.bytes.writeVLong(target.node); |
| } |
| |
| // just write the arcs "like normal" on first pass, but record how many bytes each one took |
| // and max byte size: |
| if (doFixedLengthArcs) { |
| int numArcBytes = (int) (builder.bytes.getPosition() - lastArcStart); |
| builder.numBytesPerArc[arcIdx] = numArcBytes; |
| builder.numLabelBytesPerArc[arcIdx] = numLabelBytes; |
| lastArcStart = builder.bytes.getPosition(); |
| maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes); |
| maxBytesPerArcWithoutLabel = Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes); |
| //System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes); |
| } |
| } |
| |
| // TODO: try to avoid wasteful cases: disable doFixedLengthArcs in that case |
| /* |
| * |
| * LUCENE-4682: what is a fair heuristic here? |
| * It could involve some of these: |
| * 1. how "busy" the node is: nodeIn.inputCount relative to frontier[0].inputCount? |
| * 2. how much binSearch saves over scan: nodeIn.numArcs |
| * 3. waste: numBytes vs numBytesExpanded |
| * |
| * the one below just looks at #3 |
| if (doFixedLengthArcs) { |
| // rough heuristic: make this 1.25 "waste factor" a parameter to the phd ctor???? |
| int numBytes = lastArcStart - startAddress; |
| int numBytesExpanded = maxBytesPerArc * nodeIn.numArcs; |
| if (numBytesExpanded > numBytes*1.25) { |
| doFixedLengthArcs = false; |
| } |
| } |
| */ |
| |
| if (doFixedLengthArcs) { |
| assert maxBytesPerArc > 0; |
| // 2nd pass just "expands" all arcs to take up a fixed byte size |
| |
| int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1; |
| assert labelRange > 0; |
| if (shouldExpandNodeWithDirectAddressing(builder, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) { |
| writeNodeForDirectAddressing(builder, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange); |
| builder.directAddressingNodeCount++; |
| } else { |
| writeNodeForBinarySearch(builder, nodeIn, startAddress, maxBytesPerArc); |
| builder.binarySearchNodeCount++; |
| } |
| } |
| |
| final long thisNodeAddress = builder.bytes.getPosition()-1; |
| builder.bytes.reverse(startAddress, thisNodeAddress); |
| builder.nodeCount++; |
| return thisNodeAddress; |
| } |
| |
| /** |
| * Returns whether the given node should be expanded with fixed length arcs. |
| * Nodes will be expanded depending on their depth (distance from the root node) and their number |
| * of arcs. |
| * <p> |
| * Nodes with fixed length arcs use more space, because they encode all arcs with a fixed number |
| * of bytes, but they allow either binary search or direct addressing on the arcs (instead of linear |
| * scan) on lookup by arc label. |
| */ |
| private boolean shouldExpandNodeWithFixedLengthArcs(Builder<T> builder, Builder.UnCompiledNode<T> node) { |
| return builder.allowFixedLengthArcs && |
| ((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) || |
| node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS); |
| } |
| |
| /** |
| * Returns whether the given node should be expanded with direct addressing instead of binary search. |
| * <p> |
| * Prefer direct addressing for performance if it does not oversize binary search byte size too much, |
| * so that the arcs can be directly addressed by label. |
| * |
| * @see Builder#getDirectAddressingMaxOversizingFactor() |
| */ |
| private boolean shouldExpandNodeWithDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, |
| int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange) { |
| // Anticipate precisely the size of the encodings. |
| int sizeForBinarySearch = numBytesPerArc * nodeIn.numArcs; |
| int sizeForDirectAddressing = getNumPresenceBytes(labelRange) + builder.numLabelBytesPerArc[0] |
| + maxBytesPerArcWithoutLabel * nodeIn.numArcs; |
| |
| // Determine the allowed oversize compared to binary search. |
| // This is defined by a parameter of FST Builder (default 1: no oversize). |
| int allowedOversize = (int) (sizeForBinarySearch * builder.getDirectAddressingMaxOversizingFactor()); |
| int expansionCost = sizeForDirectAddressing - allowedOversize; |
| |
| // Select direct addressing if either: |
| // - Direct addressing size is smaller than binary search. |
| // In this case, increment the credit by the reduced size (to use it later). |
| // - Direct addressing size is larger than binary search, but the positive credit allows the oversizing. |
| // In this case, decrement the credit by the oversize. |
| // In addition, do not try to oversize to a clearly too large node size |
| // (this is the DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR parameter). |
| if (expansionCost <= 0 || (builder.directAddressingExpansionCredit >= expansionCost |
| && sizeForDirectAddressing <= allowedOversize * DIRECT_ADDRESSING_MAX_OVERSIZE_WITH_CREDIT_FACTOR)) { |
| builder.directAddressingExpansionCredit -= expansionCost; |
| return true; |
| } |
| return false; |
| } |
| |
| private void writeNodeForBinarySearch(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc) { |
| // Build the header in a buffer. |
| // It is a false/special arc which is in fact a node header with node flags followed by node metadata. |
| builder.fixedLengthArcsBuffer |
| .resetPosition() |
| .writeByte(ARCS_FOR_BINARY_SEARCH) |
| .writeVInt(nodeIn.numArcs) |
| .writeVInt(maxBytesPerArc); |
| int headerLen = builder.fixedLengthArcsBuffer.getPosition(); |
| |
| // Expand the arcs in place, backwards. |
| long srcPos = builder.bytes.getPosition(); |
| long destPos = startAddress + headerLen + nodeIn.numArcs * maxBytesPerArc; |
| assert destPos >= srcPos; |
| if (destPos > srcPos) { |
| builder.bytes.skipBytes((int) (destPos - srcPos)); |
| for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { |
| destPos -= maxBytesPerArc; |
| int arcLen = builder.numBytesPerArc[arcIdx]; |
| srcPos -= arcLen; |
| if (srcPos != destPos) { |
| assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs; |
| builder.bytes.copyBytes(srcPos, destPos, arcLen); |
| } |
| } |
| } |
| |
| // Write the header. |
| builder.bytes.writeBytes(startAddress, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen); |
| } |
| |
| private void writeNodeForDirectAddressing(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArcWithoutLabel, int labelRange) { |
| // Expand the arcs backwards in a buffer because we remove the labels. |
| // So the obtained arcs might occupy less space. This is the reason why this |
| // whole method is more complex. |
| // Drop the label bytes since we can infer the label based on the arc index, |
| // the presence bits, and the first label. Keep the first label. |
| int headerMaxLen = 11; |
| int numPresenceBytes = getNumPresenceBytes(labelRange); |
| long srcPos = builder.bytes.getPosition(); |
| int totalArcBytes = builder.numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel; |
| int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes; |
| byte[] buffer = builder.fixedLengthArcsBuffer.ensureCapacity(bufferOffset).getBytes(); |
| // Copy the arcs to the buffer, dropping all labels except first one. |
| for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { |
| bufferOffset -= maxBytesPerArcWithoutLabel; |
| int srcArcLen = builder.numBytesPerArc[arcIdx]; |
| srcPos -= srcArcLen; |
| int labelLen = builder.numLabelBytesPerArc[arcIdx]; |
| // Copy the flags. |
| builder.bytes.copyBytes(srcPos, buffer, bufferOffset, 1); |
| // Skip the label, copy the remaining. |
| int remainingArcLen = srcArcLen - 1 - labelLen; |
| if (remainingArcLen != 0) { |
| builder.bytes.copyBytes(srcPos + 1 + labelLen, buffer, bufferOffset + 1, remainingArcLen); |
| } |
| if (arcIdx == 0) { |
| // Copy the label of the first arc only. |
| bufferOffset -= labelLen; |
| builder.bytes.copyBytes(srcPos + 1, buffer, bufferOffset, labelLen); |
| } |
| } |
| assert bufferOffset == headerMaxLen + numPresenceBytes; |
| |
| // Build the header in the buffer. |
| // It is a false/special arc which is in fact a node header with node flags followed by node metadata. |
| builder.fixedLengthArcsBuffer |
| .resetPosition() |
| .writeByte(ARCS_FOR_DIRECT_ADDRESSING) |
| .writeVInt(labelRange) // labelRange instead of numArcs. |
| .writeVInt(maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc. |
| int headerLen = builder.fixedLengthArcsBuffer.getPosition(); |
| |
| // Prepare the builder byte store. Enlarge or truncate if needed. |
| long nodeEnd = startAddress + headerLen + numPresenceBytes + totalArcBytes; |
| long currentPosition = builder.bytes.getPosition(); |
| if (nodeEnd >= currentPosition) { |
| builder.bytes.skipBytes((int) (nodeEnd - currentPosition)); |
| } else { |
| builder.bytes.truncate(nodeEnd); |
| } |
| assert builder.bytes.getPosition() == nodeEnd; |
| |
| // Write the header. |
| long writeOffset = startAddress; |
| builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen); |
| writeOffset += headerLen; |
| |
| // Write the presence bits |
| writePresenceBits(builder, nodeIn, writeOffset, numPresenceBytes); |
| writeOffset += numPresenceBytes; |
| |
| // Write the first label and the arcs. |
| builder.bytes.writeBytes(writeOffset, builder.fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); |
| } |
| |
| private void writePresenceBits(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long dest, int numPresenceBytes) { |
| long bytePos = dest; |
| byte presenceBits = 1; // The first arc is always present. |
| int presenceIndex = 0; |
| int previousLabel = nodeIn.arcs[0].label; |
| for (int arcIdx = 1; arcIdx < nodeIn.numArcs; arcIdx++) { |
| int label = nodeIn.arcs[arcIdx].label; |
| assert label > previousLabel; |
| presenceIndex += label - previousLabel; |
| while (presenceIndex >= Byte.SIZE) { |
| builder.bytes.writeByte(bytePos++, presenceBits); |
| presenceBits = 0; |
| presenceIndex -= Byte.SIZE; |
| } |
| // Set the bit at presenceIndex to flag that the corresponding arc is present. |
| presenceBits |= 1 << presenceIndex; |
| previousLabel = label; |
| } |
| assert presenceIndex == (nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label) % 8; |
| assert presenceBits != 0; // The last byte is not 0. |
| assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. |
| builder.bytes.writeByte(bytePos++, presenceBits); |
| assert bytePos - dest == numPresenceBytes; |
| } |
| |
| /** Gets the number of bytes required to flag the presence of each arc in the given label range, one bit per arc. */ |
| private static int getNumPresenceBytes(int labelRange) { |
| assert labelRange >= 0; |
| return (labelRange + 7) >> 3; |
| } |
| |
| /** |
| * Reads the presence bits of a direct-addressing node. |
| * Actually we don't read them here, we just keep the pointer to the bit-table start and we skip them. |
| */ |
| private void readPresenceBytes(Arc<T> arc, BytesReader in) throws IOException { |
| assert arc.bytesPerArc() > 0; |
| assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; |
| arc.bitTableStart = in.getPosition(); |
| in.skipBytes(getNumPresenceBytes(arc.numArcs())); |
| } |
| |
| /** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */ |
| public Arc<T> getFirstArc(Arc<T> arc) { |
| T NO_OUTPUT = outputs.getNoOutput(); |
| |
| if (emptyOutput != null) { |
| arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; |
| arc.nextFinalOutput = emptyOutput; |
| if (emptyOutput != NO_OUTPUT) { |
| arc.flags = (byte) (arc.flags() | BIT_ARC_HAS_FINAL_OUTPUT); |
| } |
| } else { |
| arc.flags = BIT_LAST_ARC; |
| arc.nextFinalOutput = NO_OUTPUT; |
| } |
| arc.output = NO_OUTPUT; |
| |
| // If there are no nodes, ie, the FST only accepts the |
| // empty string, then startNode is 0 |
| arc.target = startNode; |
| return arc; |
| } |
| |
| /** Follows the <code>follow</code> arc and reads the last |
| * arc of its target; this changes the provided |
| * <code>arc</code> (2nd arg) in-place and returns it. |
| * |
| * @return Returns the second argument |
| * (<code>arc</code>). */ |
| Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException { |
| //System.out.println("readLast"); |
| if (!targetHasArcs(follow)) { |
| //System.out.println(" end node"); |
| assert follow.isFinal(); |
| arc.label = END_LABEL; |
| arc.target = FINAL_END_NODE; |
| arc.output = follow.nextFinalOutput(); |
| arc.flags = BIT_LAST_ARC; |
| arc.nodeFlags = arc.flags; |
| return arc; |
| } else { |
| in.setPosition(follow.target()); |
| byte flags = arc.nodeFlags = in.readByte(); |
| if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { |
| // Special arc which is actually a node header for fixed length arcs. |
| // Jump straight to end to find the last arc. |
| arc.numArcs = in.readVInt(); |
| arc.bytesPerArc = in.readVInt(); |
| //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); |
| if (flags == ARCS_FOR_DIRECT_ADDRESSING) { |
| readPresenceBytes(arc, in); |
| arc.firstLabel = readLabel(in); |
| arc.posArcsStart = in.getPosition(); |
| readLastArcByDirectAddressing(arc, in); |
| } else { |
| arc.arcIdx = arc.numArcs() - 2; |
| arc.posArcsStart = in.getPosition(); |
| readNextRealArc(arc, in); |
| } |
| } else { |
| arc.flags = flags; |
| // non-array: linear scan |
| arc.bytesPerArc = 0; |
| //System.out.println(" scan"); |
| while(!arc.isLast()) { |
| // skip this arc: |
| readLabel(in); |
| if (arc.flag(BIT_ARC_HAS_OUTPUT)) { |
| outputs.skipOutput(in); |
| } |
| if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { |
| outputs.skipFinalOutput(in); |
| } |
| if (arc.flag(BIT_STOP_NODE)) { |
| } else if (arc.flag(BIT_TARGET_NEXT)) { |
| } else { |
| readUnpackedNodeTarget(in); |
| } |
| arc.flags = in.readByte(); |
| } |
| // Undo the byte flags we read: |
| in.skipBytes(-1); |
| arc.nextArc = in.getPosition(); |
| readNextRealArc(arc, in); |
| } |
| assert arc.isLast(); |
| return arc; |
| } |
| } |
| |
| private long readUnpackedNodeTarget(BytesReader in) throws IOException { |
| return in.readVLong(); |
| } |
| |
| /** |
| * Follow the <code>follow</code> arc and read the first arc of its target; |
| * this changes the provided <code>arc</code> (2nd arg) in-place and returns |
| * it. |
| * |
| * @return Returns the second argument (<code>arc</code>). |
| */ |
| public Arc<T> readFirstTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException { |
| //int pos = address; |
| //System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + follow.isFinal()); |
| if (follow.isFinal()) { |
| // Insert "fake" final first arc: |
| arc.label = END_LABEL; |
| arc.output = follow.nextFinalOutput(); |
| arc.flags = BIT_FINAL_ARC; |
| if (follow.target() <= 0) { |
| arc.flags |= BIT_LAST_ARC; |
| } else { |
| // NOTE: nextArc is a node (not an address!) in this case: |
| arc.nextArc = follow.target(); |
| } |
| arc.target = FINAL_END_NODE; |
| arc.nodeFlags = arc.flags; |
| //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output)); |
| return arc; |
| } else { |
| return readFirstRealTargetArc(follow.target(), arc, in); |
| } |
| } |
| |
| public Arc<T> readFirstRealTargetArc(long nodeAddress, Arc<T> arc, final BytesReader in) throws IOException { |
| in.setPosition(nodeAddress); |
| //System.out.println(" flags=" + arc.flags); |
| |
| byte flags = arc.nodeFlags = in.readByte(); |
| if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { |
| //System.out.println(" fixed length arc"); |
| // Special arc which is actually a node header for fixed length arcs. |
| arc.numArcs = in.readVInt(); |
| arc.bytesPerArc = in.readVInt(); |
| arc.arcIdx = -1; |
| if (flags == ARCS_FOR_DIRECT_ADDRESSING) { |
| readPresenceBytes(arc, in); |
| arc.firstLabel = readLabel(in); |
| arc.presenceIndex = -1; |
| } |
| arc.posArcsStart = in.getPosition(); |
| //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); |
| } else { |
| arc.nextArc = nodeAddress; |
| arc.bytesPerArc = 0; |
| } |
| |
| return readNextRealArc(arc, in); |
| } |
| |
| /** |
| * Returns whether <code>arc</code>'s target points to a node in expanded format (fixed length arcs). |
| */ |
| boolean isExpandedTarget(Arc<T> follow, BytesReader in) throws IOException { |
| if (!targetHasArcs(follow)) { |
| return false; |
| } else { |
| in.setPosition(follow.target()); |
| byte flags = in.readByte(); |
| return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING; |
| } |
| } |
| |
| /** In-place read; returns the arc. */ |
| public Arc<T> readNextArc(Arc<T> arc, BytesReader in) throws IOException { |
| if (arc.label() == END_LABEL) { |
| // This was a fake inserted "final" arc |
| if (arc.nextArc() <= 0) { |
| throw new IllegalArgumentException("cannot readNextArc when arc.isLast()=true"); |
| } |
| return readFirstRealTargetArc(arc.nextArc(), arc, in); |
| } else { |
| return readNextRealArc(arc, in); |
| } |
| } |
| |
| /** Peeks at next arc's label; does not alter arc. Do |
| * not call this if arc.isLast()! */ |
| int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException { |
| assert !arc.isLast(); |
| |
| if (arc.label() == END_LABEL) { |
| //System.out.println(" nextArc fake " + arc.nextArc); |
| // Next arc is the first arc of a node. |
| // Position to read the first arc label. |
| |
| in.setPosition(arc.nextArc()); |
| byte flags = in.readByte(); |
| if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { |
| //System.out.println(" nextArc fixed length arc"); |
| // Special arc which is actually a node header for fixed length arcs. |
| int numArcs = in.readVInt(); |
| in.readVInt(); // Skip bytesPerArc. |
| if (flags == ARCS_FOR_BINARY_SEARCH) { |
| in.readByte(); // Skip arc flags. |
| } else { |
| in.skipBytes(getNumPresenceBytes(numArcs)); |
| } |
| } |
| } else { |
| if (arc.bytesPerArc() != 0) { |
| //System.out.println(" nextArc real array"); |
| // Arcs have fixed length. |
| if (arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH) { |
| // Point to next arc, -1 to skip arc flags. |
| in.setPosition(arc.posArcsStart() - (1 + arc.arcIdx()) * arc.bytesPerArc() - 1); |
| } else { |
| assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING; |
| // Direct addressing node. The label is not stored but rather inferred |
| // based on first label and arc index in the range. |
| assert BitTable.assertIsValid(arc, in); |
| assert BitTable.isBitSet(arc.arcIdx(), arc, in); |
| int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); |
| assert nextIndex != -1; |
| return arc.firstLabel() + nextIndex; |
| } |
| } else { |
| // Arcs have variable length. |
| //System.out.println(" nextArc real list"); |
| // Position to next arc, -1 to skip flags. |
| in.setPosition(arc.nextArc() - 1); |
| } |
| } |
| return readLabel(in); |
| } |
| |
| public Arc<T> readArcByIndex(Arc<T> arc, final BytesReader in, int idx) throws IOException { |
| assert arc.bytesPerArc() > 0; |
| assert arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH; |
| assert idx >= 0 && idx < arc.numArcs(); |
| in.setPosition(arc.posArcsStart() - idx * arc.bytesPerArc()); |
| arc.arcIdx = idx; |
| arc.flags = in.readByte(); |
| return readArc(arc, in); |
| } |
| |
| /** |
| * Reads a present direct addressing node arc, with the provided index in the label range. |
| * |
| * @param rangeIndex The index of the arc in the label range. It must be present. |
| * The real arc offset is computed based on the presence bits of |
| * the direct addressing node. |
| */ |
| public Arc<T> readArcByDirectAddressing(Arc<T> arc, final BytesReader in, int rangeIndex) throws IOException { |
| assert BitTable.assertIsValid(arc, in); |
| assert rangeIndex >= 0 && rangeIndex < arc.numArcs(); |
| assert BitTable.isBitSet(rangeIndex, arc, in); |
| int presenceIndex = BitTable.countBitsUpTo(rangeIndex, arc, in); |
| return readArcByDirectAddressing(arc, in, rangeIndex, presenceIndex); |
| } |
| |
| /** |
| * Reads a present direct addressing node arc, with the provided index in the label range and its corresponding |
| * presence index (which is the count of presence bits before it). |
| */ |
| private Arc<T> readArcByDirectAddressing(Arc<T> arc, final BytesReader in, int rangeIndex, int presenceIndex) throws IOException { |
| in.setPosition(arc.posArcsStart() - presenceIndex * arc.bytesPerArc()); |
| arc.arcIdx = rangeIndex; |
| arc.presenceIndex = presenceIndex; |
| arc.flags = in.readByte(); |
| return readArc(arc, in); |
| } |
| |
| /** |
| * Reads the last arc of a direct addressing node. |
| * This method is equivalent to call {@link #readArcByDirectAddressing(Arc, BytesReader, int)} with {@code rangeIndex} |
| * equal to {@code arc.numArcs() - 1}, but it is faster. |
| */ |
| public Arc<T> readLastArcByDirectAddressing(Arc<T> arc, final BytesReader in) throws IOException { |
| assert BitTable.assertIsValid(arc, in); |
| int presenceIndex = BitTable.countBits(arc, in) - 1; |
| return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex); |
| } |
| |
| /** Never returns null, but you should never call this if |
| * arc.isLast() is true. */ |
| public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException { |
| |
| // TODO: can't assert this because we call from readFirstArc |
| // assert !flag(arc.flags, BIT_LAST_ARC); |
| |
| switch (arc.nodeFlags()) { |
| |
| case ARCS_FOR_BINARY_SEARCH: |
| assert arc.bytesPerArc() > 0; |
| arc.arcIdx++; |
| assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs(); |
| in.setPosition(arc.posArcsStart() - arc.arcIdx() * arc.bytesPerArc()); |
| arc.flags = in.readByte(); |
| break; |
| |
| case ARCS_FOR_DIRECT_ADDRESSING: |
| assert BitTable.assertIsValid(arc, in); |
| assert arc.arcIdx() == -1 || BitTable.isBitSet(arc.arcIdx(), arc, in); |
| int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); |
| return readArcByDirectAddressing(arc, in, nextIndex, arc.presenceIndex + 1); |
| |
| default: |
| // Variable length arcs - linear search. |
| assert arc.bytesPerArc() == 0; |
| in.setPosition(arc.nextArc()); |
| arc.flags = in.readByte(); |
| } |
| return readArc(arc, in); |
| } |
| |
| /** |
| * Reads an arc. |
| * <br>Precondition: The arc flags byte has already been read and set; |
| * the given BytesReader is positioned just after the arc flags byte. |
| */ |
| private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException { |
| if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) { |
| arc.label = arc.firstLabel() + arc.arcIdx(); |
| } else { |
| arc.label = readLabel(in); |
| } |
| |
| if (arc.flag(BIT_ARC_HAS_OUTPUT)) { |
| arc.output = outputs.read(in); |
| } else { |
| arc.output = outputs.getNoOutput(); |
| } |
| |
| if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { |
| arc.nextFinalOutput = outputs.readFinalOutput(in); |
| } else { |
| arc.nextFinalOutput = outputs.getNoOutput(); |
| } |
| |
| if (arc.flag(BIT_STOP_NODE)) { |
| if (arc.flag(BIT_FINAL_ARC)) { |
| arc.target = FINAL_END_NODE; |
| } else { |
| arc.target = NON_FINAL_END_NODE; |
| } |
| arc.nextArc = in.getPosition(); // Only useful for list. |
| } else if (arc.flag(BIT_TARGET_NEXT)) { |
| arc.nextArc = in.getPosition(); // Only useful for list. |
| // TODO: would be nice to make this lazy -- maybe |
| // caller doesn't need the target and is scanning arcs... |
| if (!arc.flag(BIT_LAST_ARC)) { |
| if (arc.bytesPerArc() == 0) { |
| // must scan |
| seekToNextNode(in); |
| } else { |
| int numArcs = arc.nodeFlags == ARCS_FOR_DIRECT_ADDRESSING ? BitTable.countBits(arc, in) : arc.numArcs(); |
| in.setPosition(arc.posArcsStart() - arc.bytesPerArc() * numArcs); |
| } |
| } |
| arc.target = in.getPosition(); |
| } else { |
| arc.target = readUnpackedNodeTarget(in); |
| arc.nextArc = in.getPosition(); // Only useful for list. |
| } |
| return arc; |
| } |
| |
| static <T> Arc<T> readEndArc(Arc<T> follow, Arc<T> arc) { |
| if (follow.isFinal()) { |
| if (follow.target() <= 0) { |
| arc.flags = FST.BIT_LAST_ARC; |
| } else { |
| arc.flags = 0; |
| // NOTE: nextArc is a node (not an address!) in this case: |
| arc.nextArc = follow.target(); |
| } |
| arc.output = follow.nextFinalOutput(); |
| arc.label = FST.END_LABEL; |
| return arc; |
| } else { |
| return null; |
| } |
| } |
| |
| // TODO: could we somehow [partially] tableize arc lookups |
| // like automaton? |
| |
| /** Finds an arc leaving the incoming arc, replacing the arc in place. |
| * This returns null if the arc was not found, else the incoming arc. */ |
| public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException { |
| |
| if (labelToMatch == END_LABEL) { |
| if (follow.isFinal()) { |
| if (follow.target() <= 0) { |
| arc.flags = BIT_LAST_ARC; |
| } else { |
| arc.flags = 0; |
| // NOTE: nextArc is a node (not an address!) in this case: |
| arc.nextArc = follow.target(); |
| } |
| arc.output = follow.nextFinalOutput(); |
| arc.label = END_LABEL; |
| arc.nodeFlags = arc.flags; |
| return arc; |
| } else { |
| return null; |
| } |
| } |
| |
| if (!targetHasArcs(follow)) { |
| return null; |
| } |
| |
| in.setPosition(follow.target()); |
| |
| // System.out.println("fta label=" + (char) labelToMatch); |
| |
| byte flags = arc.nodeFlags = in.readByte(); |
| if (flags == ARCS_FOR_DIRECT_ADDRESSING) { |
| arc.numArcs = in.readVInt(); // This is in fact the label range. |
| arc.bytesPerArc = in.readVInt(); |
| readPresenceBytes(arc, in); |
| arc.firstLabel = readLabel(in); |
| arc.posArcsStart = in.getPosition(); |
| |
| int arcIndex = labelToMatch - arc.firstLabel(); |
| if (arcIndex < 0 || arcIndex >= arc.numArcs()) { |
| return null; // Before or after label range. |
| } else if (!BitTable.isBitSet(arcIndex, arc, in)) { |
| return null; // Arc missing in the range. |
| } |
| return readArcByDirectAddressing(arc, in, arcIndex); |
| } else if (flags == ARCS_FOR_BINARY_SEARCH) { |
| arc.numArcs = in.readVInt(); |
| arc.bytesPerArc = in.readVInt(); |
| arc.posArcsStart = in.getPosition(); |
| |
| // Array is sparse; do binary search: |
| int low = 0; |
| int high = arc.numArcs() - 1; |
| while (low <= high) { |
| //System.out.println(" cycle"); |
| int mid = (low + high) >>> 1; |
| // +1 to skip over flags |
| in.setPosition(arc.posArcsStart() - (arc.bytesPerArc() * mid + 1)); |
| int midLabel = readLabel(in); |
| final int cmp = midLabel - labelToMatch; |
| if (cmp < 0) { |
| low = mid + 1; |
| } else if (cmp > 0) { |
| high = mid - 1; |
| } else { |
| arc.arcIdx = mid - 1; |
| //System.out.println(" found!"); |
| return readNextRealArc(arc, in); |
| } |
| } |
| return null; |
| } |
| |
| // Linear scan |
| readFirstRealTargetArc(follow.target(), arc, in); |
| |
| while(true) { |
| //System.out.println(" non-bs cycle"); |
| // TODO: we should fix this code to not have to create |
| // object for the output of every arc we scan... only |
| // for the matching arc, if found |
| if (arc.label() == labelToMatch) { |
| //System.out.println(" found!"); |
| return arc; |
| } else if (arc.label() > labelToMatch) { |
| return null; |
| } else if (arc.isLast()) { |
| return null; |
| } else { |
| readNextRealArc(arc, in); |
| } |
| } |
| } |
| |
| private void seekToNextNode(BytesReader in) throws IOException { |
| |
| while(true) { |
| |
| final int flags = in.readByte(); |
| readLabel(in); |
| |
| if (flag(flags, BIT_ARC_HAS_OUTPUT)) { |
| outputs.skipOutput(in); |
| } |
| |
| if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { |
| outputs.skipFinalOutput(in); |
| } |
| |
| if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { |
| readUnpackedNodeTarget(in); |
| } |
| |
| if (flag(flags, BIT_LAST_ARC)) { |
| return; |
| } |
| } |
| } |
| |
| /** Returns a {@link BytesReader} for this FST, positioned at |
| * position 0. */ |
| public BytesReader getBytesReader() { |
| if (this.fstStore != null) { |
| return this.fstStore.getReverseBytesReader(); |
| } else { |
| return bytes.getReverseReader(); |
| } |
| } |
| |
| /** Reads bytes stored in an FST. */ |
| public static abstract class BytesReader extends DataInput { |
| /** Get current read position. */ |
| public abstract long getPosition(); |
| |
| /** Set current read position. */ |
| public abstract void setPosition(long pos); |
| |
| /** Returns true if this reader uses reversed bytes |
| * under-the-hood. */ |
| public abstract boolean reversed(); |
| } |
| } |