LUCENE-5752: cutover to FixedBitSet to mark the accept states; improve javadocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5752@1603489 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
index cb9c996..3603aba 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
@@ -24,6 +24,7 @@
import java.util.Set;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.Sorter;
@@ -71,7 +72,7 @@
/** Holds toState, min, max for each transition. */
private int[] transitions = new int[6];
- private final Set<Integer> acceptStates = new HashSet<Integer>();
+ private FixedBitSet isAccept = new FixedBitSet(4);
/** True if no state has two transitions leaving with the same label. */
private boolean deterministic = true;
@@ -86,18 +87,23 @@
int state = nextState/2;
states[nextState] = -1;
nextState += 2;
+ if (state >= isAccept.length()) {
+ FixedBitSet newBits = new FixedBitSet(ArrayUtil.oversize(state+1, 1));
+ newBits.or(isAccept);
+ isAccept = newBits;
+ }
return state;
}
/** Set or clear this state as an accept state. */
- public void setAccept(int state, boolean isAccept) {
+ public void setAccept(int state, boolean accept) {
if (state >= getNumStates()) {
throw new IllegalArgumentException("state=" + state + " is out of bounds (numStates=" + getNumStates() + ")");
}
- if (isAccept) {
- acceptStates.add(state);
+ if (accept) {
+ isAccept.set(state);
} else {
- acceptStates.remove(state);
+ isAccept.clear(state);
}
}
@@ -119,14 +125,14 @@
return transitions;
}
- /** Returns accept states. */
- public Set<Integer> getAcceptStates() {
- return acceptStates;
+ /** Returns accept states. If the bit is set then that state is an accept state. */
+ FixedBitSet getAcceptStates() {
+ return isAccept;
}
/** Returns true if this state is an accept state. */
public boolean isAccept(int state) {
- return acceptStates.contains(state);
+ return isAccept.get(state);
}
/** Add a new transition with min = max = label. */
@@ -195,12 +201,19 @@
if (states[nextState+i] != -1) {
states[nextState+i] += nextTransition;
}
- int state = i/2;
}
nextState += other.nextState;
-
- for(int s : other.getAcceptStates()) {
- setAccept(stateOffset+s, true);
+ if (isAccept.length() < nextState/2) {
+ FixedBitSet newBits = new FixedBitSet(ArrayUtil.oversize(nextState/2, 1));
+ newBits.or(isAccept);
+ isAccept = newBits;
+ }
+ int otherNumStates = other.getNumStates();
+ FixedBitSet otherAcceptStates = other.getAcceptStates();
+ int state = 0;
+ while (state < otherNumStates && (state = otherAcceptStates.nextSetBit(state)) != -1) {
+ setAccept(stateOffset + state, true);
+ state++;
}
// Bulk copy and then fixup dest for each transition:
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
index ffb1aa8..6d2ecac 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
@@ -42,6 +42,7 @@
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
@@ -239,7 +240,7 @@
b = concatenate(as);
}
- Set<Integer> prevAcceptStates = new HashSet<>(b.getAcceptStates());
+ Set<Integer> prevAcceptStates = toSet(b, 0);
for(int i=min;i<max;i++) {
int numStates = b.getNumStates();
@@ -247,16 +248,26 @@
for(int s : prevAcceptStates) {
b.addEpsilon(s, numStates);
}
- prevAcceptStates.clear();
- for(int s : a.getAcceptStates()) {
- prevAcceptStates.add(numStates+s);
- }
+ prevAcceptStates = toSet(a, numStates);
}
b.finishState();
return b;
}
+
+ private static Set<Integer> toSet(Automaton a, int offset) {
+ int numStates = a.getNumStates();
+ FixedBitSet isAccept = a.getAcceptStates();
+ Set<Integer> result = new HashSet<Integer>();
+ int upto = 0;
+ while (upto < numStates && (upto = isAccept.nextSetBit(upto)) != -1) {
+ result.add(offset+upto);
+ upto++;
+ }
+
+ return result;
+ }
/**
* Returns a (deterministic) automaton that accepts the complement of the
@@ -913,13 +924,16 @@
LinkedList<Integer> workList = new LinkedList<>();
BitSet live = new BitSet(numStates);
- for (int s : a.getAcceptStates()) {
+ FixedBitSet acceptBits = a.getAcceptStates();
+ int s = 0;
+ while (s < numStates && (s = acceptBits.nextSetBit(s)) != -1) {
live.set(s);
workList.add(s);
+ s++;
}
while (workList.isEmpty() == false) {
- int s = workList.removeFirst();
+ s = workList.removeFirst();
int count = a2.initTransition(s, t);
for(int i=0;i<count;i++) {
a2.getNextTransition(t);
@@ -971,6 +985,7 @@
assert hasDeadStates(result) == false;
return result;
}
+
/**
* Finds the largest entry whose value is less than or equal to c, or 0 if
* there is no such entry.
@@ -988,7 +1003,8 @@
}
/**
- * Returns true if the language of this automaton is finite.
+ * Returns true if the language of this automaton is finite. The
+ * automaton must not have any dead states.
*/
public static boolean isFinite(Automaton a) {
if (a.getNumStates() == 0) {
@@ -998,7 +1014,7 @@
}
/**
- * Checks whether there is a loop containing s. (This is sufficient since
+ * Checks whether there is a loop containing state. (This is sufficient since
* there are never transitions to dead states.)
*/
// TODO: not great that this is recursive... in theory a
@@ -1144,12 +1160,14 @@
Automaton result = builder.finish();
- for(int s : a.getAcceptStates()) {
- assert s < numStates;
+ int s = 0;
+ FixedBitSet acceptStates = a.getAcceptStates();
+ while (s < numStates && (s = acceptStates.nextSetBit(s)) != -1) {
result.addEpsilon(0, s+1);
if (initialStates != null) {
initialStates.add(s+1);
}
+ s++;
}
result.finishState();