blob: a35c4d7afb3465654d1990d12d63d44829fb2a9a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
// TODO: could we somehow stream an FST to disk while we
// build it?
/**
* Builds a minimal FST (maps an IntsRef term to an arbitrary
* output) from pre-sorted terms with outputs. The FST
* becomes an FSA if you use NoOutputs. The FST is written
* on-the-fly into a compact serialized format byte array, which can
* be saved to / loaded from a Directory or used directly
* for traversal. The FST is always finite (no cycles).
*
* <p>NOTE: The algorithm is described at
* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p>
*
* <p>The parameterized type T is the output type. See the
* subclasses of {@link Outputs}.
*
* <p>FSTs larger than 2.1GB are now possible (as of Lucene
* 4.2). FSTs containing more than 2.1B nodes are also now
* possible, however they cannot be packed.
*
* @lucene.experimental
*/
public class Builder<T> {
/**
* Default oversizing factor used to decide whether to encode a node with direct addressing or binary search.
* Default is 1: ensure no oversizing on average.
* <p>
* This factor does not determine whether to encode a node with a list of variable length arcs or with
* fixed length arcs. It only determines the effective encoding of a node that is already known to be
* encoded with fixed length arcs.
* See {@code FST.shouldExpandNodeWithFixedLengthArcs()}
* and {@code FST.shouldExpandNodeWithDirectAddressing()}.
* <p>
* For English words we measured 217K nodes, only 3.27% nodes are encoded with fixed length arcs,
* and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%.
* <p>
* For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs,
* and 14% of them with direct encoding. Overall FST memory reduced by 0.8%.
* <p>
* Use {@code TestFstDirectAddressing.main()}
* and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()}
* to evaluate a change.
*
* @see #setDirectAddressingMaxOversizingFactor
*/
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f;
private final NodeHash<T> dedupHash;
final FST<T> fst;
private final T NO_OUTPUT;
// private static final boolean DEBUG = true;
// simplistic pruning: we prune node (and all following
// nodes) if less than this number of terms go through it:
private final int minSuffixCount1;
// better pruning: we prune node (and all following
// nodes) if the prior node has less than this number of
// terms go through it:
private final int minSuffixCount2;
private final boolean doShareNonSingletonNodes;
private final int shareMaxTailLength;
private final IntsRefBuilder lastInput = new IntsRefBuilder();
// NOTE: cutting this over to ArrayList instead loses ~6%
// in build performance on 9.8M Wikipedia terms; so we
// left this as an array:
// current "frontier"
private UnCompiledNode<T>[] frontier;
// Used for the BIT_TARGET_NEXT optimization (whereby
// instead of storing the address of the target node for
// a given arc, we mark a single bit noting that the next
// node in the byte[] is the target node):
long lastFrozenNode;
// Reused temporarily while building the FST:
int[] numBytesPerArc = new int[4];
int[] numLabelBytesPerArc = new int[numBytesPerArc.length];
final FixedLengthArcsBuffer fixedLengthArcsBuffer = new FixedLengthArcsBuffer();
long arcCount;
long nodeCount;
long binarySearchNodeCount;
long directAddressingNodeCount;
boolean allowFixedLengthArcs;
float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
long directAddressingExpansionCredit;
BytesStore bytes;
/**
* Instantiates an FST/FSA builder without any pruning. A shortcut to {@link
* #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int)} with
* pruning options turned off.
*/
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
}
/**
* Instantiates an FST/FSA builder with all the possible tuning and construction
* tweaks. Read parameter documentation carefully.
*
* @param inputType
* The input type (transition labels). Can be anything from {@link INPUT_TYPE}
* enumeration. Shorter types will consume less memory. Strings (character sequences) are
* represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
*
* @param minSuffixCount1
* If pruning the input graph during construction, this threshold is used for telling
* if a node is kept or pruned. If transition_count(node) &gt;= minSuffixCount1, the node
* is kept.
*
* @param minSuffixCount2
* (Note: only Mike McCandless knows what this one is really doing...)
*
* @param doShareSuffix
* If <code>true</code>, the shared suffixes will be compacted into unique paths.
* This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
* <code>false</code> creates a single suffix path for all input sequences. This will result in a larger
* FST, but requires substantially less memory and CPU during building.
*
* @param doShareNonSingletonNodes
* Only used if doShareSuffix is true. Set this to
* true to ensure FST is fully minimal, at cost of more
* CPU and more RAM during building.
*
* @param shareMaxTailLength
* Only used if doShareSuffix is true. Set this to
* Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
* CPU and more RAM during building.
*
* @param outputs The output type for each input sequence. Applies only if building an FST. For
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object.
*
* @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or
* direct addressing) while building the FST; this will make the resulting FST smaller but slower to
* traverse.
*
* @param bytesPageBits How many bits wide to make each
* byte[] block in the BytesStore; if you know the FST
* will be large then make this larger. For example 15
* bits = 32768 byte pages.
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
boolean allowFixedLengthArcs, int bytesPageBits) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
this.allowFixedLengthArcs = allowFixedLengthArcs;
fst = new FST<>(inputType, outputs, bytesPageBits);
bytes = fst.bytes;
assert bytes != null;
if (doShareSuffix) {
dedupHash = new NodeHash<>(fst, bytes.getReverseReader(false));
} else {
dedupHash = null;
}
NO_OUTPUT = outputs.getNoOutput();
@SuppressWarnings({"rawtypes","unchecked"}) final UnCompiledNode<T>[] f =
(UnCompiledNode<T>[]) new UnCompiledNode[10];
frontier = f;
for(int idx=0;idx<frontier.length;idx++) {
frontier[idx] = new UnCompiledNode<>(this, idx);
}
}
/**
* Overrides the default the maximum oversizing of fixed array allowed to enable direct addressing
* of arcs instead of binary search.
* <p>
* Setting this factor to a negative value (e.g. -1) effectively disables direct addressing,
* only binary search nodes will be created.
*
* @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR
*/
public Builder<T> setDirectAddressingMaxOversizingFactor(float factor) {
directAddressingMaxOversizingFactor = factor;
return this;
}
/**
* @see #setDirectAddressingMaxOversizingFactor(float)
*/
public float getDirectAddressingMaxOversizingFactor() {
return directAddressingMaxOversizingFactor;
}
public long getTermCount() {
return frontier[0].inputCount;
}
public long getNodeCount() {
// 1+ in order to count the -1 implicit final node
return 1+nodeCount;
}
public long getArcCount() {
return arcCount;
}
public long getMappedStateCount() {
return dedupHash == null ? 0 : nodeCount;
}
private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throws IOException {
final long node;
long bytesPosStart = bytes.getPosition();
if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) {
if (nodeIn.numArcs == 0) {
node = fst.addNode(this, nodeIn);
lastFrozenNode = node;
} else {
node = dedupHash.add(this, nodeIn);
}
} else {
node = fst.addNode(this, nodeIn);
}
assert node != -2;
long bytesPosEnd = bytes.getPosition();
if (bytesPosEnd != bytesPosStart) {
// The FST added a new node:
assert bytesPosEnd > bytesPosStart;
lastFrozenNode = node;
}
nodeIn.clear();
final CompiledNode fn = new CompiledNode();
fn.node = node;
return fn;
}
private void freezeTail(int prefixLenPlus1) throws IOException {
//System.out.println(" compileTail " + prefixLenPlus1);
final int downTo = Math.max(1, prefixLenPlus1);
for(int idx=lastInput.length(); idx >= downTo; idx--) {
boolean doPrune = false;
boolean doCompile = false;
final UnCompiledNode<T> node = frontier[idx];
final UnCompiledNode<T> parent = frontier[idx-1];
if (node.inputCount < minSuffixCount1) {
doPrune = true;
doCompile = true;
} else if (idx > prefixLenPlus1) {
// prune if parent's inputCount is less than suffixMinCount2
if (parent.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && parent.inputCount == 1 && idx > 1)) {
// my parent, about to be compiled, doesn't make the cut, so
// I'm definitely pruned
// if minSuffixCount2 is 1, we keep only up
// until the 'distinguished edge', ie we keep only the
// 'divergent' part of the FST. if my parent, about to be
// compiled, has inputCount 1 then we are already past the
// distinguished edge. NOTE: this only works if
// the FST outputs are not "compressible" (simple
// ords ARE compressible).
doPrune = true;
} else {
// my parent, about to be compiled, does make the cut, so
// I'm definitely not pruned
doPrune = false;
}
doCompile = true;
} else {
// if pruning is disabled (count is 0) we can always
// compile current node
doCompile = minSuffixCount2 == 0;
}
//System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune);
if (node.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && node.inputCount == 1 && idx > 1)) {
// drop all arcs
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
@SuppressWarnings({"rawtypes","unchecked"}) final UnCompiledNode<T> target =
(UnCompiledNode<T>) node.arcs[arcIdx].target;
target.clear();
}
node.numArcs = 0;
}
if (doPrune) {
// this node doesn't make it -- deref it
node.clear();
parent.deleteLast(lastInput.intAt(idx-1), node);
} else {
if (minSuffixCount2 != 0) {
compileAllTargets(node, lastInput.length()-idx);
}
final T nextFinalOutput = node.output;
// We "fake" the node as being final if it has no
// outgoing arcs; in theory we could leave it
// as non-final (the FST can represent this), but
// FSTEnum, Util, etc., have trouble w/ non-final
// dead-end states:
final boolean isFinal = node.isFinal || node.numArcs == 0;
if (doCompile) {
// this node makes it and we now compile it. first,
// compile any targets that were previously
// undecided:
parent.replaceLast(lastInput.intAt(idx-1),
compileNode(node, 1+lastInput.length()-idx),
nextFinalOutput,
isFinal);
} else {
// replaceLast just to install
// nextFinalOutput/isFinal onto the arc
parent.replaceLast(lastInput.intAt(idx-1),
node,
nextFinalOutput,
isFinal);
// this node will stay in play for now, since we are
// undecided on whether to prune it. later, it
// will be either compiled or pruned, so we must
// allocate a new node:
frontier[idx] = new UnCompiledNode<>(this, idx);
}
}
}
}
// for debugging
/*
private String toString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
return b.toString();
}
}
*/
/** Add the next input/output pair. The provided input
* must be sorted after the previous one according to
* {@link IntsRef#compareTo}. It's also OK to add the same
* input twice in a row with different outputs, as long
* as {@link Outputs} implements the {@link Outputs#merge}
* method. Note that input is fully consumed after this
* method is returned (so caller is free to reuse), but
* output is not. So if your outputs are changeable (eg
* {@link ByteSequenceOutputs} or {@link
* IntSequenceOutputs}) then you cannot reuse across
* calls. */
public void add(IntsRef input, T output) throws IOException {
/*
if (DEBUG) {
BytesRef b = new BytesRef(input.length);
for(int x=0;x<input.length;x++) {
b.bytes[x] = (byte) input.ints[x];
}
b.length = input.length;
if (output == NO_OUTPUT) {
System.out.println("\nFST ADD: input=" + toString(b) + " " + b);
} else {
System.out.println("\nFST ADD: input=" + toString(b) + " " + b + " output=" + fst.outputs.outputToString(output));
}
}
*/
// De-dup NO_OUTPUT since it must be a singleton:
if (output.equals(NO_OUTPUT)) {
output = NO_OUTPUT;
}
assert lastInput.length() == 0 || input.compareTo(lastInput.get()) >= 0: "inputs are added out of order lastInput=" + lastInput.get() + " vs input=" + input;
assert validOutput(output);
//System.out.println("\nadd: " + input);
if (input.length == 0) {
// empty input: only allowed as first input. we have
// to special case this because the packed FST
// format cannot represent the empty input since
// 'finalness' is stored on the incoming arc, not on
// the node
frontier[0].inputCount++;
frontier[0].isFinal = true;
fst.setEmptyOutput(output);
return;
}
// compare shared prefix length
int pos1 = 0;
int pos2 = input.offset;
final int pos1Stop = Math.min(lastInput.length(), input.length);
while(true) {
frontier[pos1].inputCount++;
//System.out.println(" incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" + frontier[pos1]);
if (pos1 >= pos1Stop || lastInput.intAt(pos1) != input.ints[pos2]) {
break;
}
pos1++;
pos2++;
}
final int prefixLenPlus1 = pos1+1;
if (frontier.length < input.length+1) {
final UnCompiledNode<T>[] next = ArrayUtil.grow(frontier, input.length+1);
for(int idx=frontier.length;idx<next.length;idx++) {
next[idx] = new UnCompiledNode<>(this, idx);
}
frontier = next;
}
// minimize/compile states from previous input's
// orphan'd suffix
freezeTail(prefixLenPlus1);
// init tail states for current input
for(int idx=prefixLenPlus1;idx<=input.length;idx++) {
frontier[idx-1].addArc(input.ints[input.offset + idx - 1],
frontier[idx]);
frontier[idx].inputCount++;
}
final UnCompiledNode<T> lastNode = frontier[input.length];
if (lastInput.length() != input.length || prefixLenPlus1 != input.length + 1) {
lastNode.isFinal = true;
lastNode.output = NO_OUTPUT;
}
// push conflicting outputs forward, only as far as
// needed
for(int idx=1;idx<prefixLenPlus1;idx++) {
final UnCompiledNode<T> node = frontier[idx];
final UnCompiledNode<T> parentNode = frontier[idx-1];
final T lastOutput = parentNode.getLastOutput(input.ints[input.offset + idx - 1]);
assert validOutput(lastOutput);
final T commonOutputPrefix;
final T wordSuffix;
if (lastOutput != NO_OUTPUT) {
commonOutputPrefix = fst.outputs.common(output, lastOutput);
assert validOutput(commonOutputPrefix);
wordSuffix = fst.outputs.subtract(lastOutput, commonOutputPrefix);
assert validOutput(wordSuffix);
parentNode.setLastOutput(input.ints[input.offset + idx - 1], commonOutputPrefix);
node.prependOutput(wordSuffix);
} else {
commonOutputPrefix = wordSuffix = NO_OUTPUT;
}
output = fst.outputs.subtract(output, commonOutputPrefix);
assert validOutput(output);
}
if (lastInput.length() == input.length && prefixLenPlus1 == 1+input.length) {
// same input more than 1 time in a row, mapping to
// multiple outputs
lastNode.output = fst.outputs.merge(lastNode.output, output);
} else {
// this new arc is private to this new input; set its
// arc output to the leftover output:
frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output);
}
// save last input
lastInput.copyInts(input);
//System.out.println(" count[0]=" + frontier[0].inputCount);
}
private boolean validOutput(T output) {
return output == NO_OUTPUT || !output.equals(NO_OUTPUT);
}
/** Returns final FST. NOTE: this will return null if
* nothing is accepted by the FST. */
public FST<T> finish() throws IOException {
final UnCompiledNode<T> root = frontier[0];
// minimize nodes in the last word's suffix
freezeTail(0);
if (root.inputCount < minSuffixCount1 || root.inputCount < minSuffixCount2 || root.numArcs == 0) {
if (fst.emptyOutput == null) {
return null;
} else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) {
// empty string got pruned
return null;
}
} else {
if (minSuffixCount2 != 0) {
compileAllTargets(root, lastInput.length());
}
}
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
fst.finish(compileNode(root, lastInput.length()).node);
return fst;
}
private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Arc<T> arc = node.arcs[arcIdx];
if (!arc.target.isCompiled()) {
// not yet compiled
@SuppressWarnings({"rawtypes","unchecked"}) final UnCompiledNode<T> n = (UnCompiledNode<T>) arc.target;
if (n.numArcs == 0) {
//System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label);
arc.isFinal = n.isFinal = true;
}
arc.target = compileNode(n, tailLength-1);
}
}
}
/** Expert: holds a pending (seen but not yet serialized) arc. */
public static class Arc<T> {
public int label; // really an "unsigned" byte
public Node target;
public boolean isFinal;
public T output;
public T nextFinalOutput;
}
// NOTE: not many instances of Node or CompiledNode are in
// memory while the FST is being built; it's only the
// current "frontier":
static interface Node {
boolean isCompiled();
}
public long fstRamBytesUsed() {
return fst.ramBytesUsed();
}
static final class CompiledNode implements Node {
long node;
@Override
public boolean isCompiled() {
return true;
}
}
/** Expert: holds a pending (seen but not yet serialized) Node. */
public static final class UnCompiledNode<T> implements Node {
final Builder<T> owner;
public int numArcs;
public Arc<T>[] arcs;
// TODO: instead of recording isFinal/output on the
// node, maybe we should use -1 arc to mean "end" (like
// we do when reading the FST). Would simplify much
// code here...
public T output;
public boolean isFinal;
public long inputCount;
/** This node's depth, starting from the automaton root. */
public final int depth;
/**
* @param depth
* The node's depth starting from the automaton root. Needed for
* LUCENE-2934 (node expansion based on conditions other than the
* fanout size).
*/
@SuppressWarnings({"rawtypes","unchecked"})
public UnCompiledNode(Builder<T> owner, int depth) {
this.owner = owner;
arcs = (Arc<T>[]) new Arc[1];
arcs[0] = new Arc<>();
output = owner.NO_OUTPUT;
this.depth = depth;
}
@Override
public boolean isCompiled() {
return false;
}
public void clear() {
numArcs = 0;
isFinal = false;
output = owner.NO_OUTPUT;
inputCount = 0;
// We don't clear the depth here because it never changes
// for nodes on the frontier (even when reused).
}
public T getLastOutput(int labelToMatch) {
assert numArcs > 0;
assert arcs[numArcs-1].label == labelToMatch;
return arcs[numArcs-1].output;
}
public void addArc(int label, Node target) {
assert label >= 0;
assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[numArcs-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs;
if (numArcs == arcs.length) {
final Arc<T>[] newArcs = ArrayUtil.grow(arcs, numArcs+1);
for(int arcIdx=numArcs;arcIdx<newArcs.length;arcIdx++) {
newArcs[arcIdx] = new Arc<>();
}
arcs = newArcs;
}
final Arc<T> arc = arcs[numArcs++];
arc.label = label;
arc.target = target;
arc.output = arc.nextFinalOutput = owner.NO_OUTPUT;
arc.isFinal = false;
}
public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1];
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
arc.target = target;
//assert target.node != -2;
arc.nextFinalOutput = nextFinalOutput;
arc.isFinal = isFinal;
}
public void deleteLast(int label, Node target) {
assert numArcs > 0;
assert label == arcs[numArcs-1].label;
assert target == arcs[numArcs-1].target;
numArcs--;
}
public void setLastOutput(int labelToMatch, T newOutput) {
assert owner.validOutput(newOutput);
assert numArcs > 0;
final Arc<T> arc = arcs[numArcs-1];
assert arc.label == labelToMatch;
arc.output = newOutput;
}
// pushes an output prefix forward onto all arcs
public void prependOutput(T outputPrefix) {
assert owner.validOutput(outputPrefix);
for(int arcIdx=0;arcIdx<numArcs;arcIdx++) {
arcs[arcIdx].output = owner.fst.outputs.add(outputPrefix, arcs[arcIdx].output);
assert owner.validOutput(arcs[arcIdx].output);
}
if (isFinal) {
output = owner.fst.outputs.add(outputPrefix, output);
assert owner.validOutput(output);
}
}
}
/**
* Reusable buffer for building nodes with fixed length arcs (binary search or direct addressing).
*/
static class FixedLengthArcsBuffer {
// Initial capacity is the max length required for the header of a node with fixed length arcs:
// header(byte) + numArcs(vint) + numBytes(vint)
private byte[] bytes = new byte[11];
private final ByteArrayDataOutput bado = new ByteArrayDataOutput(bytes);
/** Ensures the capacity of the internal byte array. Enlarges it if needed. */
FixedLengthArcsBuffer ensureCapacity(int capacity) {
if (bytes.length < capacity) {
bytes = new byte[ArrayUtil.oversize(capacity, Byte.BYTES)];
bado.reset(bytes);
}
return this;
}
FixedLengthArcsBuffer resetPosition() {
bado.reset(bytes);
return this;
}
FixedLengthArcsBuffer writeByte(byte b) {
bado.writeByte(b);
return this;
}
FixedLengthArcsBuffer writeVInt(int i) {
try {
bado.writeVInt(i);
} catch (IOException e) { // Never thrown.
throw new RuntimeException(e);
}
return this;
}
int getPosition() {
return bado.getPosition();
}
/** Gets the internal byte array. */
byte[] getBytes() {
return bytes;
}
}
}