blob: b41abedfc51d4ef4b71e7c2b3de9d3251911c97f [file] [log] [blame]
/*
* dk.brics.automaton
*
* Copyright (c) 2001-2009 Anders Moeller
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.apache.lucene.util.automaton;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.hppc.BitMixer;
/**
* Automata operations.
*
* @lucene.experimental
*/
final public class Operations {
/**
* Default maximum effort that {@link Operations#determinize} should spend before giving up and
* throwing {@link TooComplexToDeterminizeException}.
*/
public static final int DEFAULT_DETERMINIZE_WORK_LIMIT = 10000;
/**
* Maximum level of recursion allowed in recursive operations.
*/
public static final int MAX_RECURSION_LEVEL = 1000;
private Operations() {}
/**
* Returns an automaton that accepts the concatenation of the languages of the
* given automata.
* <p>
* Complexity: linear in total number of states.
*/
static public Automaton concatenate(Automaton a1, Automaton a2) {
return concatenate(Arrays.asList(a1, a2));
}
/**
* Returns an automaton that accepts the concatenation of the languages of the
* given automata.
* <p>
* Complexity: linear in total number of states.
*/
static public Automaton concatenate(List<Automaton> l) {
Automaton result = new Automaton();
// First pass: create all states
for(Automaton a : l) {
if (a.getNumStates() == 0) {
result.finishState();
return result;
}
int numStates = a.getNumStates();
for(int s=0;s<numStates;s++) {
result.createState();
}
}
// Second pass: add transitions, carefully linking accept
// states of A to init state of next A:
int stateOffset = 0;
Transition t = new Transition();
for(int i=0;i<l.size();i++) {
Automaton a = l.get(i);
int numStates = a.getNumStates();
Automaton nextA = (i == l.size()-1) ? null : l.get(i+1);
for(int s=0;s<numStates;s++) {
int numTransitions = a.initTransition(s, t);
for(int j=0;j<numTransitions;j++) {
a.getNextTransition(t);
result.addTransition(stateOffset + s, stateOffset + t.dest, t.min, t.max);
}
if (a.isAccept(s)) {
Automaton followA = nextA;
int followOffset = stateOffset;
int upto = i+1;
while (true) {
if (followA != null) {
// Adds a "virtual" epsilon transition:
numTransitions = followA.initTransition(0, t);
for(int j=0;j<numTransitions;j++) {
followA.getNextTransition(t);
result.addTransition(stateOffset + s, followOffset + numStates + t.dest, t.min, t.max);
}
if (followA.isAccept(0)) {
// Keep chaining if followA accepts empty string
followOffset += followA.getNumStates();
followA = (upto == l.size()-1) ? null : l.get(upto+1);
upto++;
} else {
break;
}
} else {
result.setAccept(stateOffset + s, true);
break;
}
}
}
}
stateOffset += numStates;
}
if (result.getNumStates() == 0) {
result.createState();
}
result.finishState();
return result;
}
/**
* Returns an automaton that accepts the union of the empty string and the
* language of the given automaton. This may create a dead state.
* <p>
* Complexity: linear in number of states.
*/
static public Automaton optional(Automaton a) {
Automaton result = new Automaton();
result.createState();
result.setAccept(0, true);
if (a.getNumStates() > 0) {
result.copy(a);
result.addEpsilon(0, 1);
}
result.finishState();
return result;
}
/**
* Returns an automaton that accepts the Kleene star (zero or more
* concatenated repetitions) of the language of the given automaton. Never
* modifies the input automaton language.
* <p>
* Complexity: linear in number of states.
*/
static public Automaton repeat(Automaton a) {
if (a.getNumStates() == 0) {
// Repeating the empty automata will still only accept the empty automata.
return a;
}
Automaton.Builder builder = new Automaton.Builder();
builder.createState();
builder.setAccept(0, true);
builder.copy(a);
Transition t = new Transition();
int count = a.initTransition(0, t);
for(int i=0;i<count;i++) {
a.getNextTransition(t);
builder.addTransition(0, t.dest+1, t.min, t.max);
}
int numStates = a.getNumStates();
for(int s=0;s<numStates;s++) {
if (a.isAccept(s)) {
count = a.initTransition(0, t);
for(int i=0;i<count;i++) {
a.getNextTransition(t);
builder.addTransition(s+1, t.dest+1, t.min, t.max);
}
}
}
return builder.finish();
}
/**
* Returns an automaton that accepts <code>min</code> or more concatenated
* repetitions of the language of the given automaton.
* <p>
* Complexity: linear in number of states and in <code>min</code>.
*/
static public Automaton repeat(Automaton a, int count) {
if (count == 0) {
return repeat(a);
}
List<Automaton> as = new ArrayList<>();
while (count-- > 0) {
as.add(a);
}
as.add(repeat(a));
return concatenate(as);
}
/**
* Returns an automaton that accepts between <code>min</code> and
* <code>max</code> (including both) concatenated repetitions of the language
* of the given automaton.
* <p>
* Complexity: linear in number of states and in <code>min</code> and
* <code>max</code>.
*/
static public Automaton repeat(Automaton a, int min, int max) {
if (min > max) {
return Automata.makeEmpty();
}
Automaton b;
if (min == 0) {
b = Automata.makeEmptyString();
} else if (min == 1) {
b = new Automaton();
b.copy(a);
} else {
List<Automaton> as = new ArrayList<>();
for(int i=0;i<min;i++) {
as.add(a);
}
b = concatenate(as);
}
Set<Integer> prevAcceptStates = toSet(b, 0);
Automaton.Builder builder = new Automaton.Builder();
builder.copy(b);
for(int i=min;i<max;i++) {
int numStates = builder.getNumStates();
builder.copy(a);
for(int s : prevAcceptStates) {
builder.addEpsilon(s, numStates);
}
prevAcceptStates = toSet(a, numStates);
}
return builder.finish();
}
private static Set<Integer> toSet(Automaton a, int offset) {
int numStates = a.getNumStates();
BitSet isAccept = a.getAcceptStates();
Set<Integer> result = new HashSet<Integer>();
int upto = 0;
while (upto < numStates && (upto = isAccept.nextSetBit(upto)) != -1) {
result.add(offset+upto);
upto++;
}
return result;
}
/**
* Returns a (deterministic) automaton that accepts the complement of the
* language of the given automaton.
* <p>
* Complexity: linear in number of states if already deterministic and
* exponential otherwise.
* @param determinizeWorkLimit maximum effort to spend determinizing the automaton. Set higher to
* allow more complex queries and lower to prevent memory exhaustion. {@link
* #DEFAULT_DETERMINIZE_WORK_LIMIT} is a good starting default.
*/
static public Automaton complement(Automaton a, int determinizeWorkLimit) {
a = totalize(determinize(a, determinizeWorkLimit));
int numStates = a.getNumStates();
for (int p=0;p<numStates;p++) {
a.setAccept(p, !a.isAccept(p));
}
return removeDeadStates(a);
}
/**
* Returns a (deterministic) automaton that accepts the intersection of the
* language of <code>a1</code> and the complement of the language of
* <code>a2</code>. As a side-effect, the automata may be determinized, if not
* already deterministic.
* <p>
* Complexity: quadratic in number of states if a2 already deterministic and
* exponential in number of a2's states otherwise.
*
* @param a1 the initial automaton
* @param a2 the automaton to subtract
* @param determinizeWorkLimit maximum effort to spend determinizing the automaton. Set higher to
* allow more complex queries and lower to prevent memory exhaustion. {@link
* #DEFAULT_DETERMINIZE_WORK_LIMIT} is a good starting default.
*/
static public Automaton minus(Automaton a1, Automaton a2, int determinizeWorkLimit) {
if (Operations.isEmpty(a1) || a1 == a2) {
return Automata.makeEmpty();
}
if (Operations.isEmpty(a2)) {
return a1;
}
return intersection(a1, complement(a2, determinizeWorkLimit));
}
/**
* Returns an automaton that accepts the intersection of the languages of the
* given automata. Never modifies the input automata languages.
* <p>
* Complexity: quadratic in number of states.
*/
static public Automaton intersection(Automaton a1, Automaton a2) {
if (a1 == a2) {
return a1;
}
if (a1.getNumStates() == 0) {
return a1;
}
if (a2.getNumStates() == 0) {
return a2;
}
Transition[][] transitions1 = a1.getSortedTransitions();
Transition[][] transitions2 = a2.getSortedTransitions();
Automaton c = new Automaton();
c.createState();
ArrayDeque<StatePair> worklist = new ArrayDeque<>();
HashMap<StatePair,StatePair> newstates = new HashMap<>();
StatePair p = new StatePair(0, 0, 0);
worklist.add(p);
newstates.put(p, p);
while (worklist.size() > 0) {
p = worklist.removeFirst();
c.setAccept(p.s, a1.isAccept(p.s1) && a2.isAccept(p.s2));
Transition[] t1 = transitions1[p.s1];
Transition[] t2 = transitions2[p.s2];
for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
while (b2 < t2.length && t2[b2].max < t1[n1].min)
b2++;
for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++)
if (t2[n2].max >= t1[n1].min) {
StatePair q = new StatePair(t1[n1].dest, t2[n2].dest);
StatePair r = newstates.get(q);
if (r == null) {
q.s = c.createState();
worklist.add(q);
newstates.put(q, q);
r = q;
}
int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
c.addTransition(p.s, r.s, min, max);
}
}
}
c.finishState();
return removeDeadStates(c);
}
/** Returns true if these two automata accept exactly the
* same language. This is a costly computation! Both automata
* must be determinized and have no dead states! */
public static boolean sameLanguage(Automaton a1, Automaton a2) {
if (a1 == a2) {
return true;
}
return subsetOf(a2, a1) && subsetOf(a1, a2);
}
// TODO: move to test-framework?
/** Returns true if this automaton has any states that cannot
* be reached from the initial state or cannot reach an accept state.
* Cost is O(numTransitions+numStates). */
public static boolean hasDeadStates(Automaton a) {
BitSet liveStates = getLiveStates(a);
int numLive = liveStates.cardinality();
int numStates = a.getNumStates();
assert numLive <= numStates: "numLive=" + numLive + " numStates=" + numStates + " " + liveStates;
return numLive < numStates;
}
// TODO: move to test-framework?
/** Returns true if there are dead states reachable from an initial state. */
public static boolean hasDeadStatesFromInitial(Automaton a) {
BitSet reachableFromInitial = getLiveStatesFromInitial(a);
BitSet reachableFromAccept = getLiveStatesToAccept(a);
reachableFromInitial.andNot(reachableFromAccept);
return reachableFromInitial.isEmpty() == false;
}
// TODO: move to test-framework?
/** Returns true if there are dead states that reach an accept state. */
public static boolean hasDeadStatesToAccept(Automaton a) {
BitSet reachableFromInitial = getLiveStatesFromInitial(a);
BitSet reachableFromAccept = getLiveStatesToAccept(a);
reachableFromAccept.andNot(reachableFromInitial);
return reachableFromAccept.isEmpty() == false;
}
/**
* Returns true if the language of <code>a1</code> is a subset of the language
* of <code>a2</code>. Both automata must be determinized and must have no dead
* states.
* <p>
* Complexity: quadratic in number of states.
*/
public static boolean subsetOf(Automaton a1, Automaton a2) {
if (a1.isDeterministic() == false) {
throw new IllegalArgumentException("a1 must be deterministic");
}
if (a2.isDeterministic() == false) {
throw new IllegalArgumentException("a2 must be deterministic");
}
assert hasDeadStatesFromInitial(a1) == false;
assert hasDeadStatesFromInitial(a2) == false;
if (a1.getNumStates() == 0) {
// Empty language is alwyas a subset of any other language
return true;
} else if (a2.getNumStates() == 0) {
return isEmpty(a1);
}
// TODO: cutover to iterators instead
Transition[][] transitions1 = a1.getSortedTransitions();
Transition[][] transitions2 = a2.getSortedTransitions();
ArrayDeque<StatePair> worklist = new ArrayDeque<>();
HashSet<StatePair> visited = new HashSet<>();
StatePair p = new StatePair(0, 0);
worklist.add(p);
visited.add(p);
while (worklist.size() > 0) {
p = worklist.removeFirst();
if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) {
return false;
}
Transition[] t1 = transitions1[p.s1];
Transition[] t2 = transitions2[p.s2];
for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
while (b2 < t2.length && t2[b2].max < t1[n1].min) {
b2++;
}
int min1 = t1[n1].min, max1 = t1[n1].max;
for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
if (t2[n2].min > min1) {
return false;
}
if (t2[n2].max < Character.MAX_CODE_POINT) {
min1 = t2[n2].max + 1;
} else {
min1 = Character.MAX_CODE_POINT;
max1 = Character.MIN_CODE_POINT;
}
StatePair q = new StatePair(t1[n1].dest, t2[n2].dest);
if (!visited.contains(q)) {
worklist.add(q);
visited.add(q);
}
}
if (min1 <= max1) {
return false;
}
}
}
return true;
}
/**
* Returns an automaton that accepts the union of the languages of the given
* automata.
* <p>
* Complexity: linear in number of states.
*/
public static Automaton union(Automaton a1, Automaton a2) {
return union(Arrays.asList(a1, a2));
}
/**
* Returns an automaton that accepts the union of the languages of the given
* automata.
* <p>
* Complexity: linear in number of states.
*/
public static Automaton union(Collection<Automaton> l) {
Automaton result = new Automaton();
// Create initial state:
result.createState();
// Copy over all automata
for(Automaton a : l) {
result.copy(a);
}
// Add epsilon transition from new initial state
int stateOffset = 1;
for(Automaton a : l) {
if (a.getNumStates() == 0) {
continue;
}
result.addEpsilon(0, stateOffset);
stateOffset += a.getNumStates();
}
result.finishState();
return removeDeadStates(result);
}
// Simple custom ArrayList<Transition>
private final static class TransitionList {
// dest, min, max
int[] transitions = new int[3];
int next;
public void add(Transition t) {
if (transitions.length < next+3) {
transitions = ArrayUtil.grow(transitions, next+3);
}
transitions[next] = t.dest;
transitions[next+1] = t.min;
transitions[next+2] = t.max;
next += 3;
}
}
// Holds all transitions that start on this int point, or
// end at this point-1
private final static class PointTransitions implements Comparable<PointTransitions> {
int point;
final TransitionList ends = new TransitionList();
final TransitionList starts = new TransitionList();
@Override
public int compareTo(PointTransitions other) {
return point - other.point;
}
public void reset(int point) {
this.point = point;
ends.next = 0;
starts.next = 0;
}
@Override
public boolean equals(Object other) {
return ((PointTransitions) other).point == point;
}
@Override
public int hashCode() {
return point;
}
}
private final static class PointTransitionSet {
int count;
PointTransitions[] points = new PointTransitions[5];
private final static int HASHMAP_CUTOVER = 30;
private final HashMap<Integer,PointTransitions> map = new HashMap<>();
private boolean useHash = false;
private PointTransitions next(int point) {
// 1st time we are seeing this point
if (count == points.length) {
final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(points, 0, newArray, 0, count);
points = newArray;
}
PointTransitions points0 = points[count];
if (points0 == null) {
points0 = points[count] = new PointTransitions();
}
points0.reset(point);
count++;
return points0;
}
private PointTransitions find(int point) {
if (useHash) {
final Integer pi = point;
PointTransitions p = map.get(pi);
if (p == null) {
p = next(point);
map.put(pi, p);
}
return p;
} else {
for(int i=0;i<count;i++) {
if (points[i].point == point) {
return points[i];
}
}
final PointTransitions p = next(point);
if (count == HASHMAP_CUTOVER) {
// switch to HashMap on the fly
assert map.size() == 0;
for(int i=0;i<count;i++) {
map.put(points[i].point, points[i]);
}
useHash = true;
}
return p;
}
}
public void reset() {
if (useHash) {
map.clear();
useHash = false;
}
count = 0;
}
public void sort() {
// Tim sort performs well on already sorted arrays:
if (count > 1) ArrayUtil.timSort(points, 0, count);
}
public void add(Transition t) {
find(t.min).starts.add(t);
find(1+t.max).ends.add(t);
}
@Override
public String toString() {
StringBuilder s = new StringBuilder();
for(int i=0;i<count;i++) {
if (i > 0) {
s.append(' ');
}
s.append(points[i].point).append(':').append(points[i].starts.next/3).append(',').append(points[i].ends.next/3);
}
return s.toString();
}
}
/**
* Determinizes the given automaton.
* <p>
* Worst case complexity: exponential in number of states.
* @param workLimit Maximum amount of "work" that the powerset construction will spend before
* throwing {@link TooComplexToDeterminizeException}. Higher numbers allow this operation to
* consume more memory and CPU but allow more complex automatons. Use {@link
* #DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't otherwise know what to
* specify.
* @throws TooComplexToDeterminizeException if determinizing requires more than {@code workLimit}
* "effort"
*/
public static Automaton determinize(Automaton a, int workLimit) {
if (a.isDeterministic()) {
// Already determinized
return a;
}
if (a.getNumStates() <= 1) {
// Already determinized
return a;
}
// subset construction
Automaton.Builder b = new Automaton.Builder();
//System.out.println("DET:");
//a.writeDot("/l/la/lucene/core/detin.dot");
// Same initial values and state will always have the same hashCode
FrozenIntSet initialset = new FrozenIntSet(new int[] { 0 }, BitMixer.mix(0) + 1, 0);
// Create state 0:
b.createState();
ArrayDeque<FrozenIntSet> worklist = new ArrayDeque<>();
Map<IntSet,Integer> newstate = new HashMap<>();
worklist.add(initialset);
b.setAccept(0, a.isAccept(0));
newstate.put(initialset, 0);
// like Set<Integer,PointTransitions>
final PointTransitionSet points = new PointTransitionSet();
// like HashMap<Integer,Integer>, maps state to its count
final StateSet statesSet = new StateSet(5);
Transition t = new Transition();
long effortSpent = 0;
// LUCENE-9981: approximate conversion from what used to be a limit on number of states, to
// maximum "effort":
long effortLimit = workLimit * (long) 10;
while (worklist.size() > 0) {
// TODO (LUCENE-9983): these int sets really do not need to be sorted, and we are paying
// a high (unecessary) price for that! really we just need a low-overhead Map<int,int>
// that implements equals/hash based only on the keys (ignores the values). fixing this
// might be a bigspeedup for determinizing complex automata
FrozenIntSet s = worklist.removeFirst();
// LUCENE-9981: we more carefully aggregate the net work this automaton is costing us, instead
// of (overly simplistically) counting number
// of determinized states:
effortSpent += s.values.length;
if (effortSpent >= effortLimit) {
throw new TooComplexToDeterminizeException(a, workLimit);
}
// Collate all outgoing transitions by min/1+max:
for(int i=0;i<s.values.length;i++) {
final int s0 = s.values[i];
int numTransitions = a.getNumTransitions(s0);
a.initTransition(s0, t);
for(int j=0;j<numTransitions;j++) {
a.getNextTransition(t);
points.add(t);
}
}
if (points.count == 0) {
// No outgoing transitions -- skip it
continue;
}
points.sort();
int lastPoint = -1;
int accCount = 0;
final int r = s.state;
for(int i=0;i<points.count;i++) {
final int point = points.points[i].point;
if (statesSet.size() > 0) {
assert lastPoint != -1;
Integer q = newstate.get(statesSet);
if (q == null) {
q = b.createState();
final FrozenIntSet p = statesSet.freeze(q);
//System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount);
worklist.add(p);
b.setAccept(q, accCount > 0);
newstate.put(p, q);
} else {
assert (accCount > 0 ? true:false) == b.isAccept(q): "accCount=" + accCount + " vs existing accept=" +
b.isAccept(q) + " states=" + statesSet;
}
// System.out.println(" add trans src=" + r + " dest=" + q + " min=" + lastPoint + " max=" + (point-1));
b.addTransition(r, q, lastPoint, point-1);
}
// process transitions that end on this point
// (closes an overlapping interval)
int[] transitions = points.points[i].ends.transitions;
int limit = points.points[i].ends.next;
for(int j=0;j<limit;j+=3) {
int dest = transitions[j];
statesSet.decr(dest);
accCount -= a.isAccept(dest) ? 1:0;
}
points.points[i].ends.next = 0;
// process transitions that start on this point
// (opens a new interval)
transitions = points.points[i].starts.transitions;
limit = points.points[i].starts.next;
for(int j=0;j<limit;j+=3) {
int dest = transitions[j];
statesSet.incr(dest);
accCount += a.isAccept(dest) ? 1:0;
}
lastPoint = point;
points.points[i].starts.next = 0;
}
points.reset();
assert statesSet.size() == 0: "size=" + statesSet.size();
}
Automaton result = b.finish();
assert result.isDeterministic();
return result;
}
/**
* Returns true if the given automaton accepts no strings.
*/
public static boolean isEmpty(Automaton a) {
if (a.getNumStates() == 0) {
// Common case: no states
return true;
}
if (a.isAccept(0) == false && a.getNumTransitions(0) == 0) {
// Common case: just one initial state
return true;
}
if (a.isAccept(0) == true) {
// Apparently common case: it accepts the damned empty string
return false;
}
ArrayDeque<Integer> workList = new ArrayDeque<>();
BitSet seen = new BitSet(a.getNumStates());
workList.add(0);
seen.set(0);
Transition t = new Transition();
while (workList.isEmpty() == false) {
int state = workList.removeFirst();
if (a.isAccept(state)) {
return false;
}
int count = a.initTransition(state, t);
for(int i=0;i<count;i++) {
a.getNextTransition(t);
if (seen.get(t.dest) == false) {
workList.add(t.dest);
seen.set(t.dest);
}
}
}
return true;
}
/**
* Returns true if the given automaton accepts all strings. The automaton must be minimized.
*/
public static boolean isTotal(Automaton a) {
return isTotal(a, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
}
/**
* Returns true if the given automaton accepts all strings for the specified min/max
* range of the alphabet. The automaton must be minimized.
*/
public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) {
if (a.isAccept(0) && a.getNumTransitions(0) == 1) {
Transition t = new Transition();
a.getTransition(0, 0, t);
return t.dest == 0
&& t.min == minAlphabet
&& t.max == maxAlphabet;
}
return false;
}
/**
* Returns true if the given string is accepted by the automaton. The input must be deterministic.
* <p>
* Complexity: linear in the length of the string.
* <p>
* <b>Note:</b> for full performance, use the {@link RunAutomaton} class.
*/
public static boolean run(Automaton a, String s) {
assert a.isDeterministic();
int state = 0;
for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
int nextState = a.step(state, cp = s.codePointAt(i));
if (nextState == -1) {
return false;
}
state = nextState;
}
return a.isAccept(state);
}
/**
* Returns true if the given string (expressed as unicode codepoints) is accepted by the automaton. The input must be deterministic.
* <p>
* Complexity: linear in the length of the string.
* <p>
* <b>Note:</b> for full performance, use the {@link RunAutomaton} class.
*/
public static boolean run(Automaton a, IntsRef s) {
assert a.isDeterministic();
int state = 0;
for (int i=0;i<s.length;i++) {
int nextState = a.step(state, s.ints[s.offset+i]);
if (nextState == -1) {
return false;
}
state = nextState;
}
return a.isAccept(state);
}
/**
* Returns the set of live states. A state is "live" if an accept state is
* reachable from it and if it is reachable from the initial state.
*/
private static BitSet getLiveStates(Automaton a) {
BitSet live = getLiveStatesFromInitial(a);
live.and(getLiveStatesToAccept(a));
return live;
}
/** Returns bitset marking states reachable from the initial state. */
private static BitSet getLiveStatesFromInitial(Automaton a) {
int numStates = a.getNumStates();
BitSet live = new BitSet(numStates);
if (numStates == 0) {
return live;
}
ArrayDeque<Integer> workList = new ArrayDeque<>();
live.set(0);
workList.add(0);
Transition t = new Transition();
while (workList.isEmpty() == false) {
int s = workList.removeFirst();
int count = a.initTransition(s, t);
for(int i=0;i<count;i++) {
a.getNextTransition(t);
if (live.get(t.dest) == false) {
live.set(t.dest);
workList.add(t.dest);
}
}
}
return live;
}
/** Returns bitset marking states that can reach an accept state. */
private static BitSet getLiveStatesToAccept(Automaton a) {
Automaton.Builder builder = new Automaton.Builder();
// NOTE: not quite the same thing as what SpecialOperations.reverse does:
Transition t = new Transition();
int numStates = a.getNumStates();
for(int s=0;s<numStates;s++) {
builder.createState();
}
for(int s=0;s<numStates;s++) {
int count = a.initTransition(s, t);
for(int i=0;i<count;i++) {
a.getNextTransition(t);
builder.addTransition(t.dest, s, t.min, t.max);
}
}
Automaton a2 = builder.finish();
ArrayDeque<Integer> workList = new ArrayDeque<>();
BitSet live = new BitSet(numStates);
BitSet acceptBits = a.getAcceptStates();
int s = 0;
while (s < numStates && (s = acceptBits.nextSetBit(s)) != -1) {
live.set(s);
workList.add(s);
s++;
}
while (workList.isEmpty() == false) {
s = workList.removeFirst();
int count = a2.initTransition(s, t);
for(int i=0;i<count;i++) {
a2.getNextTransition(t);
if (live.get(t.dest) == false) {
live.set(t.dest);
workList.add(t.dest);
}
}
}
return live;
}
/**
* Removes transitions to dead states (a state is "dead" if it is not
* reachable from the initial state or no accept state is reachable from it.)
*/
public static Automaton removeDeadStates(Automaton a) {
int numStates = a.getNumStates();
BitSet liveSet = getLiveStates(a);
int[] map = new int[numStates];
Automaton result = new Automaton();
//System.out.println("liveSet: " + liveSet + " numStates=" + numStates);
for(int i=0;i<numStates;i++) {
if (liveSet.get(i)) {
map[i] = result.createState();
result.setAccept(map[i], a.isAccept(i));
}
}
Transition t = new Transition();
for (int i=0;i<numStates;i++) {
if (liveSet.get(i)) {
int numTransitions = a.initTransition(i, t);
// filter out transitions to dead states:
for(int j=0;j<numTransitions;j++) {
a.getNextTransition(t);
if (liveSet.get(t.dest)) {
result.addTransition(map[i], map[t.dest], t.min, t.max);
}
}
}
}
result.finishState();
assert hasDeadStates(result) == false;
return result;
}
/**
* Returns true if the language of this automaton is finite. The
* automaton must not have any dead states.
*/
public static boolean isFinite(Automaton a) {
if (a.getNumStates() == 0) {
return true;
}
return isFinite(new Transition(), a, 0, new BitSet(a.getNumStates()), new BitSet(a.getNumStates()), 0);
}
/**
* Checks whether there is a loop containing state. (This is sufficient since
* there are never transitions to dead states.)
*/
// TODO: not great that this is recursive... in theory a
// large automata could exceed java's stack so the maximum level of recursion is bounded to 1000
private static boolean isFinite(Transition scratch, Automaton a, int state, BitSet path, BitSet visited, int level) {
if (level > MAX_RECURSION_LEVEL) {
throw new IllegalArgumentException("input automaton is too large: " + level);
}
path.set(state);
int numTransitions = a.initTransition(state, scratch);
for(int t=0;t<numTransitions;t++) {
a.getTransition(state, t, scratch);
if (path.get(scratch.dest) || (!visited.get(scratch.dest) && !isFinite(scratch, a, scratch.dest, path, visited, level+1))) {
return false;
}
}
path.clear(state);
visited.set(state);
return true;
}
/**
* Returns the longest string that is a prefix of all accepted strings and
* visits each state at most once. The automaton must not have dead states.
* If this automaton has already been converted to UTF-8 (e.g. using
* {@link UTF32ToUTF8}) then you should use {@link #getCommonPrefixBytesRef} instead.
*
* @throws IllegalArgumentException if the automaton has dead states reachable from the initial
* state.
* @return common prefix, which can be an empty (length 0) String (never null)
*/
public static String getCommonPrefix(Automaton a) {
if (hasDeadStatesFromInitial(a)) {
throw new IllegalArgumentException("input automaton has dead states");
}
if (isEmpty(a)) {
return "";
}
StringBuilder builder = new StringBuilder();
Transition scratch = new Transition();
FixedBitSet visited = new FixedBitSet(a.getNumStates());
FixedBitSet current = new FixedBitSet(a.getNumStates());
FixedBitSet next = new FixedBitSet(a.getNumStates());
current.set(0); // start with initial state
algorithm:
while (true) {
int label = -1;
// do a pass, stepping all current paths forward once
for (int state = current.nextSetBit(0);
state != DocIdSetIterator.NO_MORE_DOCS;
state =
state + 1 >= current.length()
? DocIdSetIterator.NO_MORE_DOCS
: current.nextSetBit(state + 1)) {
visited.set(state);
// if it is an accept state, we are done
if (a.isAccept(state)) {
break algorithm;
}
for (int transition = 0; transition < a.getNumTransitions(state); transition++) {
a.getTransition(state, transition, scratch);
if (label == -1) {
label = scratch.min;
}
// either a range of labels, or label that doesn't match all the other paths this round
if (scratch.min != scratch.max || scratch.min != label) {
break algorithm;
}
// mark target state for next iteration
next.set(scratch.dest);
}
}
assert label != -1 : "we should not get here since we checked no dead-end states up front!?";
// add the label to the prefix
builder.appendCodePoint(label);
// swap "current" with "next", clear "next"
FixedBitSet tmp = current;
current = next;
next = tmp;
next.clear(0, next.length());
}
return builder.toString();
}
/**
* Returns the longest BytesRef that is a prefix of all accepted strings and
* visits each state at most once.
*
* @return common prefix, which can be an empty (length 0) BytesRef (never null), and might
* possibly include a UTF-8 fragment of a full Unicode character
*/
public static BytesRef getCommonPrefixBytesRef(Automaton a) {
String prefix = getCommonPrefix(a);
BytesRefBuilder builder = new BytesRefBuilder();
for (int i = 0; i < prefix.length(); i++) {
char ch = prefix.charAt(i);
if (ch > 255) {
throw new IllegalStateException("automaton is not binary");
}
builder.append((byte) ch);
}
return builder.get();
}
/** If this automaton accepts a single input, return it. Else, return null.
* The automaton must be deterministic. */
public static IntsRef getSingleton(Automaton a) {
if (a.isDeterministic() == false) {
throw new IllegalArgumentException("input automaton must be deterministic");
}
IntsRefBuilder builder = new IntsRefBuilder();
HashSet<Integer> visited = new HashSet<>();
int s = 0;
Transition t = new Transition();
while (true) {
visited.add(s);
if (a.isAccept(s) == false) {
if (a.getNumTransitions(s) == 1) {
a.getTransition(s, 0, t);
if (t.min == t.max && !visited.contains(t.dest)) {
builder.append(t.min);
s = t.dest;
continue;
}
}
} else if (a.getNumTransitions(s) == 0) {
return builder.get();
}
// Automaton accepts more than one string:
return null;
}
}
/**
* Returns the longest BytesRef that is a suffix of all accepted strings.
* Worst case complexity: quadratic with the number of states+transitions.
*
* @return common suffix, which can be an empty (length 0) BytesRef (never null)
*/
public static BytesRef getCommonSuffixBytesRef(Automaton a) {
// reverse the language of the automaton, then reverse its common prefix.
Automaton r = removeDeadStates(reverse(a));
BytesRef ref = getCommonPrefixBytesRef(r);
reverseBytes(ref);
return ref;
}
private static void reverseBytes(BytesRef ref) {
if (ref.length <= 1) return;
int num = ref.length >> 1;
for (int i = ref.offset; i < ( ref.offset + num ); i++) {
byte b = ref.bytes[i];
ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1];
ref.bytes[ref.offset * 2 + ref.length - i - 1] = b;
}
}
/** Returns an automaton accepting the reverse language. */
public static Automaton reverse(Automaton a) {
return reverse(a, null);
}
/** Reverses the automaton, returning the new initial states. */
static Automaton reverse(Automaton a, Set<Integer> initialStates) {
if (Operations.isEmpty(a)) {
return new Automaton();
}
int numStates = a.getNumStates();
// Build a new automaton with all edges reversed
Automaton.Builder builder = new Automaton.Builder();
// Initial node; we'll add epsilon transitions in the end:
builder.createState();
for(int s=0;s<numStates;s++) {
builder.createState();
}
// Old initial state becomes new accept state:
builder.setAccept(1, true);
Transition t = new Transition();
for (int s=0;s<numStates;s++) {
int numTransitions = a.getNumTransitions(s);
a.initTransition(s, t);
for(int i=0;i<numTransitions;i++) {
a.getNextTransition(t);
builder.addTransition(t.dest+1, s+1, t.min, t.max);
}
}
Automaton result = builder.finish();
int s = 0;
BitSet acceptStates = a.getAcceptStates();
while (s < numStates && (s = acceptStates.nextSetBit(s)) != -1) {
result.addEpsilon(0, s+1);
if (initialStates != null) {
initialStates.add(s+1);
}
s++;
}
result.finishState();
return result;
}
/** Returns a new automaton accepting the same language with added
* transitions to a dead state so that from every state and every label
* there is a transition. */
static Automaton totalize(Automaton a) {
Automaton result = new Automaton();
int numStates = a.getNumStates();
for(int i=0;i<numStates;i++) {
result.createState();
result.setAccept(i, a.isAccept(i));
}
int deadState = result.createState();
result.addTransition(deadState, deadState, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
Transition t = new Transition();
for(int i=0;i<numStates;i++) {
int maxi = Character.MIN_CODE_POINT;
int count = a.initTransition(i, t);
for(int j=0;j<count;j++) {
a.getNextTransition(t);
result.addTransition(i, t.dest, t.min, t.max);
if (t.min > maxi) {
result.addTransition(i, deadState, maxi, t.min-1);
}
if (t.max + 1 > maxi) {
maxi = t.max + 1;
}
}
if (maxi <= Character.MAX_CODE_POINT) {
result.addTransition(i, deadState, maxi, Character.MAX_CODE_POINT);
}
}
result.finishState();
return result;
}
/** Returns the topological sort of all states reachable from
* the initial state. Behavior is undefined if this
* automaton has cycles. CPU cost is O(numTransitions),
* and the implementation is recursive so an automaton
* matching long strings may exhaust the java stack. */
public static int[] topoSortStates(Automaton a) {
if (a.getNumStates() == 0) {
return new int[0];
}
int numStates = a.getNumStates();
int[] states = new int[numStates];
final BitSet visited = new BitSet(numStates);
int upto = topoSortStatesRecurse(a, visited, states, 0, 0, 0);
if (upto < states.length) {
// There were dead states
int[] newStates = new int[upto];
System.arraycopy(states, 0, newStates, 0, upto);
states = newStates;
}
// Reverse the order:
for(int i=0;i<states.length/2;i++) {
int s = states[i];
states[i] = states[states.length-1-i];
states[states.length-1-i] = s;
}
return states;
}
// TODO: not great that this is recursive... in theory a
// large automata could exceed java's stack so the maximum level of recursion is bounded to 1000
private static int topoSortStatesRecurse(Automaton a, BitSet visited, int[] states, int upto, int state, int level) {
if (level > MAX_RECURSION_LEVEL) {
throw new IllegalArgumentException("input automaton is too large: " + level);
}
Transition t = new Transition();
int count = a.initTransition(state, t);
for (int i=0;i<count;i++) {
a.getNextTransition(t);
if (!visited.get(t.dest)) {
visited.set(t.dest);
upto = topoSortStatesRecurse(a, visited, states, upto, t.dest, level+1);
}
}
states[upto] = state;
upto++;
return upto;
}
}