blob: 0e61f895c112d5503ab40fe97298feb560bb8ec8 [file] [log] [blame]
diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java
index 3225fdf..5127862 100644
--- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java
+++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java
@@ -83,7 +83,7 @@ public class Builder<T> {
@SuppressWarnings("unchecked") final UnCompiledNode<T>[] f = (UnCompiledNode<T>[]) new UnCompiledNode[10];
frontier = f;
for(int idx=0;idx<frontier.length;idx++) {
- frontier[idx] = new UnCompiledNode<T>(this);
+ frontier[idx] = new UnCompiledNode<T>(this, idx);
}
}
@@ -201,7 +201,7 @@ public class Builder<T> {
// undecided on whether to prune it. later, it
// will be either compiled or pruned, so we must
// allocate a new node:
- frontier[idx] = new UnCompiledNode<T>(this);
+ frontier[idx] = new UnCompiledNode<T>(this, idx);
}
}
}
@@ -292,7 +292,7 @@ public class Builder<T> {
new UnCompiledNode[ArrayUtil.oversize(input.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(frontier, 0, next, 0, frontier.length);
for(int idx=frontier.length;idx<next.length;idx++) {
- next[idx] = new UnCompiledNode<T>(this);
+ next[idx] = new UnCompiledNode<T>(this, idx);
}
frontier = next;
}
@@ -424,12 +424,22 @@ public class Builder<T> {
boolean isFinal;
int inputCount;
+ /** This node's depth, starting from the automaton root. */
+ final int depth;
+
+ /**
+ * @param depth
+ * The node's depth starting from the automaton root. Needed for
+ * LUCENE-2934 (node expansion based on conditions other than the
+ * fanout size).
+ */
@SuppressWarnings("unchecked")
- public UnCompiledNode(Builder<T> owner) {
+ public UnCompiledNode(Builder<T> owner, int depth) {
this.owner = owner;
arcs = (Arc<T>[]) new Arc[1];
arcs[0] = new Arc<T>();
output = owner.NO_OUTPUT;
+ this.depth = depth;
}
public boolean isCompiled() {
@@ -441,6 +451,9 @@ public class Builder<T> {
isFinal = false;
output = owner.NO_OUTPUT;
inputCount = 0;
+
+ // We don't clear the depth here because it never changes
+ // for nodes on the frontier (even when reused).
}
public T getLastOutput(int labelToMatch) {
diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java
index 0b366b4..60dc55c 100644
--- a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java
+++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java
@@ -25,6 +25,7 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode;
/** Represents an FST using a compact byte[] format.
* <p> The format is similar to what's used by Morfologik
@@ -47,11 +48,21 @@ public class FST<T> {
// this when number of arcs is > NUM_ARCS_ARRAY:
private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6;
- // If the node has >= this number of arcs, the arcs are
- // stored as a fixed array. Fixed array consumes more RAM
- // but enables binary search on the arcs (instead of
- // linear scan) on lookup by arc label:
- private final static int NUM_ARCS_FIXED_ARRAY = 10;
+ /**
+ * @see #shouldExpand(UnCompiledNode)
+ */
+ final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node.
+
+ /**
+ * @see #shouldExpand(UnCompiledNode)
+ */
+ final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5;
+
+ /**
+ * @see #shouldExpand(UnCompiledNode)
+ */
+ final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10;
+
private int[] bytesPerArc = new int[0];
// Increment version to change it
@@ -315,7 +326,7 @@ public class FST<T> {
int startAddress = writer.posWrite;
//System.out.println(" startAddr=" + startAddress);
- final boolean doFixedArray = node.numArcs >= NUM_ARCS_FIXED_ARRAY;
+ final boolean doFixedArray = shouldExpand(node);
final int fixedArrayStart;
if (doFixedArray) {
if (bytesPerArc.length < node.numArcs) {
@@ -518,6 +529,23 @@ public class FST<T> {
return readNextArc(arc);
}
+ /**
+ * Checks if <code>arc</code>'s target state is in expanded (or vector) format.
+ *
+ * @return Returns <code>true</code> if <code>arc</code> points to a state in an
+ * expanded array format.
+ */
+ boolean isExpandedTarget(Arc<T> follow) throws IOException {
+ if (follow.isFinal()) {
+ return false;
+ } else {
+ final BytesReader in = getBytesReader(follow.target);
+ final byte b = in.readByte();
+
+ return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0;
+ }
+ }
+
/** In-place read; returns the arc. */
public Arc<T> readNextArc(Arc<T> arc) throws IOException {
if (arc.label == -1) {
@@ -712,6 +740,26 @@ public class FST<T> {
public int getArcWithOutputCount() {
return arcWithOutputCount;
}
+
+ /**
+ * Nodes will be expanded if their depth (distance from the root node) is
+ * &lt;= this value and their number of arcs is &gt;=
+ * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}.
+ *
+ * <p>
+ * Fixed array consumes more RAM but enables binary search on the arcs
+ * (instead of a linear scan) on lookup by arc label.
+ *
+ * @return <code>true</code> if <code>node</code> should be stored in an
+ * expanded (array) form.
+ *
+ * @see #FIXED_ARRAY_NUM_ARCS_DEEP
+ * @see Builder.UnCompiledNode#depth
+ */
+ private boolean shouldExpand(UnCompiledNode<T> node) {
+ return (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
+ node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP;
+ }
// Non-static: writes to FST's byte[]
class BytesWriter extends DataOutput {
diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java
index a10c4bc..6699ac6 100644
--- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java
+++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java
@@ -189,6 +189,8 @@ public final class Util {
*/
public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates)
throws IOException {
+ final String expandedNodeColor = "blue";
+
// This is the start arc in the automaton (from the epsilon state to the first state
// with outgoing transitions.
final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
@@ -219,7 +221,9 @@ public final class Util {
}
emitDotState(out, "initial", "point", "white", "");
- emitDotState(out, Integer.toString(startArc.target), stateShape, null, "");
+ emitDotState(out, Integer.toString(startArc.target), stateShape,
+ fst.isExpandedTarget(startArc) ? expandedNodeColor : null,
+ "");
out.write(" initial -> " + startArc.target + "\n");
final T NO_OUTPUT = fst.outputs.getNoOutput();
@@ -243,7 +247,9 @@ public final class Util {
while (true) {
// Emit the unseen state and add it to the queue for the next level.
if (arc.target >= 0 && !seen.get(arc.target)) {
- emitDotState(out, Integer.toString(arc.target), stateShape, null,
+ final boolean isExpanded = fst.isExpandedTarget(arc);
+ emitDotState(out, Integer.toString(arc.target), stateShape,
+ isExpanded ? expandedNodeColor : null,
labelStates ? Integer.toString(arc.target) : "");
seen.set(arc.target);
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
@@ -285,10 +291,10 @@ public final class Util {
}
sameLevelStates.clear();
}
-
+
// Emit terminating state (always there anyway).
out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
- out.write(" {rank=sink; -1 } ");
+ out.write(" {rank=sink; -1 }\n");
out.write("}\n");
out.flush();
diff --git a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
index f979481..5f6c589 100644
--- a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
+++ b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
@@ -56,6 +56,7 @@ import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.automaton.fst.FST.Arc;
public class TestFSTs extends LuceneTestCase {
@@ -1322,4 +1323,85 @@ public class TestFSTs extends LuceneTestCase {
assertEquals(b, seekResult.input);
assertEquals(42, (long) seekResult.output);
}
+
+ /**
+ * Test state expansion (array format) on close-to-root states. Creates
+ * synthetic input that has one expanded state on each level.
+ *
+ * @see "https://issues.apache.org/jira/browse/LUCENE-2933"
+ */
+ public void testExpandedCloseToRoot() throws Exception {
+ class SyntheticData {
+ FST<Object> compile(String[] lines) throws IOException {
+ final NoOutputs outputs = NoOutputs.getSingleton();
+ final Object nothing = outputs.getNoOutput();
+ final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+
+ int line = 0;
+ final BytesRef term = new BytesRef();
+ while (line < lines.length) {
+ String w = lines[line++];
+ if (w == null) {
+ break;
+ }
+ term.copy(w);
+ b.add(term, nothing);
+ }
+
+ return b.finish();
+ }
+
+ void generate(ArrayList<String> out, StringBuilder b, char from, char to,
+ int depth) {
+ if (depth == 0 || from == to) {
+ String seq = b.toString() + "_" + out.size() + "_end";
+ out.add(seq);
+ } else {
+ for (char c = from; c <= to; c++) {
+ b.append(c);
+ generate(out, b, from, c == to ? to : from, depth - 1);
+ b.deleteCharAt(b.length() - 1);
+ }
+ }
+ }
+
+ public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth)
+ throws IOException {
+ if (fst.targetHasArcs(arc)) {
+ int childCount = 0;
+ for (arc = fst.readFirstTargetArc(arc, arc);;
+ arc = fst.readNextArc(arc), childCount++)
+ {
+ boolean expanded = fst.isExpandedTarget(arc);
+ int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1);
+
+ assertEquals(
+ expanded,
+ (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE &&
+ children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
+ children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+ if (arc.isLast()) break;
+ }
+
+ return childCount;
+ }
+ return 0;
+ }
+ }
+
+ // Sanity check.
+ assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+ assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
+
+ SyntheticData s = new SyntheticData();
+
+ ArrayList<String> out = new ArrayList<String>();
+ StringBuilder b = new StringBuilder();
+ s.generate(out, b, 'a', 'i', 10);
+ String[] input = out.toArray(new String[out.size()]);
+ Arrays.sort(input);
+ FST<Object> fst = s.compile(input);
+ FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>());
+ s.verifyStateAndBelow(fst, arc, 1);
+ }
}