| diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java |
| index 3225fdf..5127862 100644 |
| --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java |
| +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java |
| @@ -83,7 +83,7 @@ public class Builder<T> { |
| @SuppressWarnings("unchecked") final UnCompiledNode<T>[] f = (UnCompiledNode<T>[]) new UnCompiledNode[10]; |
| frontier = f; |
| for(int idx=0;idx<frontier.length;idx++) { |
| - frontier[idx] = new UnCompiledNode<T>(this); |
| + frontier[idx] = new UnCompiledNode<T>(this, idx); |
| } |
| } |
| |
| @@ -201,7 +201,7 @@ public class Builder<T> { |
| // undecided on whether to prune it. later, it |
| // will be either compiled or pruned, so we must |
| // allocate a new node: |
| - frontier[idx] = new UnCompiledNode<T>(this); |
| + frontier[idx] = new UnCompiledNode<T>(this, idx); |
| } |
| } |
| } |
| @@ -292,7 +292,7 @@ public class Builder<T> { |
| new UnCompiledNode[ArrayUtil.oversize(input.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; |
| System.arraycopy(frontier, 0, next, 0, frontier.length); |
| for(int idx=frontier.length;idx<next.length;idx++) { |
| - next[idx] = new UnCompiledNode<T>(this); |
| + next[idx] = new UnCompiledNode<T>(this, idx); |
| } |
| frontier = next; |
| } |
| @@ -424,12 +424,22 @@ public class Builder<T> { |
| boolean isFinal; |
| int inputCount; |
| |
| + /** This node's depth, starting from the automaton root. */ |
| + final int depth; |
| + |
| + /** |
| + * @param depth |
| + * The node's depth starting from the automaton root. Needed for |
| + * LUCENE-2934 (node expansion based on conditions other than the |
| + * fanout size). |
| + */ |
| @SuppressWarnings("unchecked") |
| - public UnCompiledNode(Builder<T> owner) { |
| + public UnCompiledNode(Builder<T> owner, int depth) { |
| this.owner = owner; |
| arcs = (Arc<T>[]) new Arc[1]; |
| arcs[0] = new Arc<T>(); |
| output = owner.NO_OUTPUT; |
| + this.depth = depth; |
| } |
| |
| public boolean isCompiled() { |
| @@ -441,6 +451,9 @@ public class Builder<T> { |
| isFinal = false; |
| output = owner.NO_OUTPUT; |
| inputCount = 0; |
| + |
| + // We don't clear the depth here because it never changes |
| + // for nodes on the frontier (even when reused). |
| } |
| |
| public T getLastOutput(int labelToMatch) { |
| diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java |
| index 0b366b4..60dc55c 100644 |
| --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java |
| +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java |
| @@ -25,6 +25,7 @@ import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.CodecUtil; |
| +import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode; |
| |
| /** Represents an FST using a compact byte[] format. |
| * <p> The format is similar to what's used by Morfologik |
| @@ -47,11 +48,21 @@ public class FST<T> { |
| // this when number of arcs is > NUM_ARCS_ARRAY: |
| private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6; |
| |
| - // If the node has >= this number of arcs, the arcs are |
| - // stored as a fixed array. Fixed array consumes more RAM |
| - // but enables binary search on the arcs (instead of |
| - // linear scan) on lookup by arc label: |
| - private final static int NUM_ARCS_FIXED_ARRAY = 10; |
| + /** |
| + * @see #shouldExpand(UnCompiledNode) |
| + */ |
| + final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node. |
| + |
| + /** |
| + * @see #shouldExpand(UnCompiledNode) |
| + */ |
| + final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5; |
| + |
| + /** |
| + * @see #shouldExpand(UnCompiledNode) |
| + */ |
| + final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10; |
| + |
| private int[] bytesPerArc = new int[0]; |
| |
| // Increment version to change it |
| @@ -315,7 +326,7 @@ public class FST<T> { |
| int startAddress = writer.posWrite; |
| //System.out.println(" startAddr=" + startAddress); |
| |
| - final boolean doFixedArray = node.numArcs >= NUM_ARCS_FIXED_ARRAY; |
| + final boolean doFixedArray = shouldExpand(node); |
| final int fixedArrayStart; |
| if (doFixedArray) { |
| if (bytesPerArc.length < node.numArcs) { |
| @@ -518,6 +529,23 @@ public class FST<T> { |
| return readNextArc(arc); |
| } |
| |
| + /** |
| + * Checks if <code>arc</code>'s target state is in expanded (or vector) format. |
| + * |
| + * @return Returns <code>true</code> if <code>arc</code> points to a state in an |
| + * expanded array format. |
| + */ |
| + boolean isExpandedTarget(Arc<T> follow) throws IOException { |
| + if (follow.isFinal()) { |
| + return false; |
| + } else { |
| + final BytesReader in = getBytesReader(follow.target); |
| + final byte b = in.readByte(); |
| + |
| + return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0; |
| + } |
| + } |
| + |
| /** In-place read; returns the arc. */ |
| public Arc<T> readNextArc(Arc<T> arc) throws IOException { |
| if (arc.label == -1) { |
| @@ -712,6 +740,26 @@ public class FST<T> { |
| public int getArcWithOutputCount() { |
| return arcWithOutputCount; |
| } |
| + |
| + /** |
| + * Nodes will be expanded if their depth (distance from the root node) is |
| + * <= this value and their number of arcs is >= |
| + * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}. |
| + * |
| + * <p> |
| + * Fixed array consumes more RAM but enables binary search on the arcs |
| + * (instead of a linear scan) on lookup by arc label. |
| + * |
| + * @return <code>true</code> if <code>node</code> should be stored in an |
| + * expanded (array) form. |
| + * |
| + * @see #FIXED_ARRAY_NUM_ARCS_DEEP |
| + * @see Builder.UnCompiledNode#depth |
| + */ |
| + private boolean shouldExpand(UnCompiledNode<T> node) { |
| + return (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) || |
| + node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP; |
| + } |
| |
| // Non-static: writes to FST's byte[] |
| class BytesWriter extends DataOutput { |
| diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java |
| index a10c4bc..6699ac6 100644 |
| --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java |
| +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java |
| @@ -189,6 +189,8 @@ public final class Util { |
| */ |
| public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates) |
| throws IOException { |
| + final String expandedNodeColor = "blue"; |
| + |
| // This is the start arc in the automaton (from the epsilon state to the first state |
| // with outgoing transitions. |
| final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>()); |
| @@ -219,7 +221,9 @@ public final class Util { |
| } |
| |
| emitDotState(out, "initial", "point", "white", ""); |
| - emitDotState(out, Integer.toString(startArc.target), stateShape, null, ""); |
| + emitDotState(out, Integer.toString(startArc.target), stateShape, |
| + fst.isExpandedTarget(startArc) ? expandedNodeColor : null, |
| + ""); |
| out.write(" initial -> " + startArc.target + "\n"); |
| |
| final T NO_OUTPUT = fst.outputs.getNoOutput(); |
| @@ -243,7 +247,9 @@ public final class Util { |
| while (true) { |
| // Emit the unseen state and add it to the queue for the next level. |
| if (arc.target >= 0 && !seen.get(arc.target)) { |
| - emitDotState(out, Integer.toString(arc.target), stateShape, null, |
| + final boolean isExpanded = fst.isExpandedTarget(arc); |
| + emitDotState(out, Integer.toString(arc.target), stateShape, |
| + isExpanded ? expandedNodeColor : null, |
| labelStates ? Integer.toString(arc.target) : ""); |
| seen.set(arc.target); |
| nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc)); |
| @@ -285,10 +291,10 @@ public final class Util { |
| } |
| sameLevelStates.clear(); |
| } |
| - |
| + |
| // Emit terminating state (always there anyway). |
| out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n"); |
| - out.write(" {rank=sink; -1 } "); |
| + out.write(" {rank=sink; -1 }\n"); |
| |
| out.write("}\n"); |
| out.flush(); |
| diff --git a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java |
| index f979481..5f6c589 100644 |
| --- a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java |
| +++ b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java |
| @@ -56,6 +56,7 @@ import org.apache.lucene.util.LineFileDocs; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.UnicodeUtil; |
| import org.apache.lucene.util._TestUtil; |
| +import org.apache.lucene.util.automaton.fst.FST.Arc; |
| |
| public class TestFSTs extends LuceneTestCase { |
| |
| @@ -1322,4 +1323,85 @@ public class TestFSTs extends LuceneTestCase { |
| assertEquals(b, seekResult.input); |
| assertEquals(42, (long) seekResult.output); |
| } |
| + |
| + /** |
| + * Test state expansion (array format) on close-to-root states. Creates |
| + * synthetic input that has one expanded state on each level. |
| + * |
| + * @see "https://issues.apache.org/jira/browse/LUCENE-2933" |
| + */ |
| + public void testExpandedCloseToRoot() throws Exception { |
| + class SyntheticData { |
| + FST<Object> compile(String[] lines) throws IOException { |
| + final NoOutputs outputs = NoOutputs.getSingleton(); |
| + final Object nothing = outputs.getNoOutput(); |
| + final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); |
| + |
| + int line = 0; |
| + final BytesRef term = new BytesRef(); |
| + while (line < lines.length) { |
| + String w = lines[line++]; |
| + if (w == null) { |
| + break; |
| + } |
| + term.copy(w); |
| + b.add(term, nothing); |
| + } |
| + |
| + return b.finish(); |
| + } |
| + |
| + void generate(ArrayList<String> out, StringBuilder b, char from, char to, |
| + int depth) { |
| + if (depth == 0 || from == to) { |
| + String seq = b.toString() + "_" + out.size() + "_end"; |
| + out.add(seq); |
| + } else { |
| + for (char c = from; c <= to; c++) { |
| + b.append(c); |
| + generate(out, b, from, c == to ? to : from, depth - 1); |
| + b.deleteCharAt(b.length() - 1); |
| + } |
| + } |
| + } |
| + |
| + public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth) |
| + throws IOException { |
| + if (fst.targetHasArcs(arc)) { |
| + int childCount = 0; |
| + for (arc = fst.readFirstTargetArc(arc, arc);; |
| + arc = fst.readNextArc(arc), childCount++) |
| + { |
| + boolean expanded = fst.isExpandedTarget(arc); |
| + int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1); |
| + |
| + assertEquals( |
| + expanded, |
| + (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && |
| + children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) || |
| + children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP); |
| + if (arc.isLast()) break; |
| + } |
| + |
| + return childCount; |
| + } |
| + return 0; |
| + } |
| + } |
| + |
| + // Sanity check. |
| + assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP); |
| + assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0); |
| + |
| + SyntheticData s = new SyntheticData(); |
| + |
| + ArrayList<String> out = new ArrayList<String>(); |
| + StringBuilder b = new StringBuilder(); |
| + s.generate(out, b, 'a', 'i', 10); |
| + String[] input = out.toArray(new String[out.size()]); |
| + Arrays.sort(input); |
| + FST<Object> fst = s.compile(input); |
| + FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>()); |
| + s.verifyStateAndBelow(fst, arc, 1); |
| + } |
| } |