blob: e3f26e460324f46243a72213d16f5fb271c26c06 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.automaton;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
/**
* Utilities for testing automata.
* <p>
* Capable of generating random regular expressions,
* and automata, and also provides a number of very
* basic unoptimized implementations (*slow) for testing.
*/
public class AutomatonTestUtil {
/**
* Default maximum number of states that {@link Operations#determinize} should create.
*/
public static final int DEFAULT_MAX_DETERMINIZED_STATES = 1000000;
/** Returns random string, including full unicode range. */
public static String randomRegexp(Random r) {
while (true) {
String regexp = randomRegexpString(r);
// we will also generate some undefined unicode queries
if (!UnicodeUtil.validUTF16String(regexp))
continue;
try {
new RegExp(regexp, RegExp.NONE);
return regexp;
} catch (Exception e) {}
}
}
private static String randomRegexpString(Random r) {
final int end = r.nextInt(20);
if (end == 0) {
// allow 0 length
return "";
}
final char[] buffer = new char[end];
for (int i = 0; i < end; i++) {
int t = r.nextInt(15);
if (0 == t && i < end - 1) {
// Make a surrogate pair
// High surrogate
buffer[i++] = (char) TestUtil.nextInt(r, 0xd800, 0xdbff);
// Low surrogate
buffer[i] = (char) TestUtil.nextInt(r, 0xdc00, 0xdfff);
}
else if (t <= 1) buffer[i] = (char) r.nextInt(0x80);
else if (2 == t) buffer[i] = (char) TestUtil.nextInt(r, 0x80, 0x800);
else if (3 == t) buffer[i] = (char) TestUtil.nextInt(r, 0x800, 0xd7ff);
else if (4 == t) buffer[i] = (char) TestUtil.nextInt(r, 0xe000, 0xffff);
else if (5 == t) buffer[i] = '.';
else if (6 == t) buffer[i] = '?';
else if (7 == t) buffer[i] = '*';
else if (8 == t) buffer[i] = '+';
else if (9 == t) buffer[i] = '(';
else if (10 == t) buffer[i] = ')';
else if (11 == t) buffer[i] = '-';
else if (12 == t) buffer[i] = '[';
else if (13 == t) buffer[i] = ']';
else if (14 == t) buffer[i] = '|';
}
return new String(buffer, 0, end);
}
/** picks a random int code point, avoiding surrogates;
* throws IllegalArgumentException if this transition only
* accepts surrogates */
private static int getRandomCodePoint(final Random r, int min, int max) {
final int code;
if (max < UnicodeUtil.UNI_SUR_HIGH_START ||
min > UnicodeUtil.UNI_SUR_HIGH_END) {
// easy: entire range is before or after surrogates
code = min+r.nextInt(max-min+1);
} else if (min >= UnicodeUtil.UNI_SUR_HIGH_START) {
if (max > UnicodeUtil.UNI_SUR_LOW_END) {
// after surrogates
code = 1+UnicodeUtil.UNI_SUR_LOW_END+r.nextInt(max-UnicodeUtil.UNI_SUR_LOW_END);
} else {
throw new IllegalArgumentException("transition accepts only surrogates: min=" + min + " max=" + max);
}
} else if (max <= UnicodeUtil.UNI_SUR_LOW_END) {
if (min < UnicodeUtil.UNI_SUR_HIGH_START) {
// before surrogates
code = min + r.nextInt(UnicodeUtil.UNI_SUR_HIGH_START - min);
} else {
throw new IllegalArgumentException("transition accepts only surrogates: min=" + min + " max=" + max);
}
} else {
// range includes all surrogates
int gap1 = UnicodeUtil.UNI_SUR_HIGH_START - min;
int gap2 = max - UnicodeUtil.UNI_SUR_LOW_END;
int c = r.nextInt(gap1+gap2);
if (c < gap1) {
code = min + c;
} else {
code = UnicodeUtil.UNI_SUR_LOW_END + c - gap1 + 1;
}
}
assert code >= min && code <= max && (code < UnicodeUtil.UNI_SUR_HIGH_START || code > UnicodeUtil.UNI_SUR_LOW_END):
"code=" + code + " min=" + min + " max=" + max;
return code;
}
/**
* Lets you retrieve random strings accepted
* by an Automaton.
* <p>
* Once created, call {@link #getRandomAcceptedString(Random)}
* to get a new string (in UTF-32 codepoints).
*/
public static class RandomAcceptedStrings {
private final Map<Transition,Boolean> leadsToAccept;
private final Automaton a;
private final Transition[][] transitions;
private static class ArrivingTransition {
final int from;
final Transition t;
public ArrivingTransition(int from, Transition t) {
this.from = from;
this.t = t;
}
}
public RandomAcceptedStrings(Automaton a) {
this.a = a;
if (a.getNumStates() == 0) {
throw new IllegalArgumentException("this automaton accepts nothing");
}
this.transitions = a.getSortedTransitions();
leadsToAccept = new HashMap<>();
final Map<Integer,List<ArrivingTransition>> allArriving = new HashMap<>();
final LinkedList<Integer> q = new LinkedList<>();
final Set<Integer> seen = new HashSet<>();
// reverse map the transitions, so we can quickly look
// up all arriving transitions to a given state
int numStates = a.getNumStates();
for(int s=0;s<numStates;s++) {
for(Transition t : transitions[s]) {
List<ArrivingTransition> tl = allArriving.get(t.dest);
if (tl == null) {
tl = new ArrayList<>();
allArriving.put(t.dest, tl);
}
tl.add(new ArrivingTransition(s, t));
}
if (a.isAccept(s)) {
q.add(s);
seen.add(s);
}
}
// Breadth-first search, from accept states,
// backwards:
while (q.isEmpty() == false) {
final int s = q.removeFirst();
List<ArrivingTransition> arriving = allArriving.get(s);
if (arriving != null) {
for(ArrivingTransition at : arriving) {
final int from = at.from;
if (!seen.contains(from)) {
q.add(from);
seen.add(from);
leadsToAccept.put(at.t, Boolean.TRUE);
}
}
}
}
}
public int[] getRandomAcceptedString(Random r) {
int[] codePoints = new int[0];
int codepointCount = 0;
int s = 0;
while(true) {
if (a.isAccept(s)) {
if (a.getNumTransitions(s) == 0) {
// stop now
break;
} else {
if (r.nextBoolean()) {
break;
}
}
}
if (a.getNumTransitions(s) == 0) {
throw new RuntimeException("this automaton has dead states");
}
boolean cheat = r.nextBoolean();
final Transition t;
if (cheat) {
// pick a transition that we know is the fastest
// path to an accept state
List<Transition> toAccept = new ArrayList<>();
for(Transition t0 : transitions[s]) {
if (leadsToAccept.containsKey(t0)) {
toAccept.add(t0);
}
}
if (toAccept.size() == 0) {
// this is OK -- it means we jumped into a cycle
t = transitions[s][r.nextInt(transitions[s].length)];
} else {
t = toAccept.get(r.nextInt(toAccept.size()));
}
} else {
t = transitions[s][r.nextInt(transitions[s].length)];
}
codePoints = ArrayUtil.grow(codePoints, codepointCount + 1);
codePoints[codepointCount++] = getRandomCodePoint(r, t.min, t.max);
s = t.dest;
}
return ArrayUtil.copyOfSubArray(codePoints, 0, codepointCount);
}
}
private static Automaton randomSingleAutomaton(Random random) {
while (true) {
try {
Automaton a1 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton();
if (random.nextBoolean()) {
a1 = Operations.complement(a1, DEFAULT_MAX_DETERMINIZED_STATES);
}
return a1;
} catch (TooComplexToDeterminizeException tctde) {
// This can (rarely) happen if the random regexp is too hard; just try again...
}
}
}
/** return a random NFA/DFA for testing */
public static Automaton randomAutomaton(Random random) {
// get two random Automata from regexps
Automaton a1 = randomSingleAutomaton(random);
Automaton a2 = randomSingleAutomaton(random);
// combine them in random ways
switch (random.nextInt(4)) {
case 0: return Operations.concatenate(a1, a2);
case 1: return Operations.union(a1, a2);
case 2: return Operations.intersection(a1, a2);
default: return Operations.minus(a1, a2, DEFAULT_MAX_DETERMINIZED_STATES);
}
}
/**
* below are original, unoptimized implementations of DFA operations for testing.
* These are from brics automaton, full license (BSD) below:
*/
/*
* dk.brics.automaton
*
* Copyright (c) 2001-2009 Anders Moeller
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Simple, original brics implementation of Brzozowski minimize()
*/
public static Automaton minimizeSimple(Automaton a) {
Set<Integer> initialSet = new HashSet<Integer>();
a = determinizeSimple(Operations.reverse(a, initialSet), initialSet);
initialSet.clear();
a = determinizeSimple(Operations.reverse(a, initialSet), initialSet);
return a;
}
/**
* Simple, original brics implementation of determinize()
*/
public static Automaton determinizeSimple(Automaton a) {
Set<Integer> initialset = new HashSet<>();
initialset.add(0);
return determinizeSimple(a, initialset);
}
/**
* Simple, original brics implementation of determinize()
* Determinizes the given automaton using the given set of initial states.
*/
public static Automaton determinizeSimple(Automaton a, Set<Integer> initialset) {
if (a.getNumStates() == 0) {
return a;
}
int[] points = a.getStartPoints();
// subset construction
Map<Set<Integer>, Set<Integer>> sets = new HashMap<>();
LinkedList<Set<Integer>> worklist = new LinkedList<>();
Map<Set<Integer>, Integer> newstate = new HashMap<>();
sets.put(initialset, initialset);
worklist.add(initialset);
Automaton.Builder result = new Automaton.Builder();
result.createState();
newstate.put(initialset, 0);
Transition t = new Transition();
while (worklist.size() > 0) {
Set<Integer> s = worklist.removeFirst();
int r = newstate.get(s);
for (int q : s) {
if (a.isAccept(q)) {
result.setAccept(r, true);
break;
}
}
for (int n = 0; n < points.length; n++) {
Set<Integer> p = new HashSet<>();
for (int q : s) {
int count = a.initTransition(q, t);
for(int i=0;i<count;i++) {
a.getNextTransition(t);
if (t.min <= points[n] && points[n] <= t.max) {
p.add(t.dest);
}
}
}
if (!sets.containsKey(p)) {
sets.put(p, p);
worklist.add(p);
newstate.put(p, result.createState());
}
int q = newstate.get(p);
int min = points[n];
int max;
if (n + 1 < points.length) {
max = points[n + 1] - 1;
} else {
max = Character.MAX_CODE_POINT;
}
result.addTransition(r, q, min, max);
}
}
return Operations.removeDeadStates(result.finish());
}
/**
* Simple, original implementation of getFiniteStrings.
*
* <p>Returns the set of accepted strings, assuming that at most
* <code>limit</code> strings are accepted. If more than <code>limit</code>
* strings are accepted, the first limit strings found are returned. If <code>limit</code>&lt;0, then
* the limit is infinite.
*
* <p>This implementation is recursive: it uses one stack
* frame for each digit in the returned strings (ie, max
* is the max length returned string).
*/
public static Set<IntsRef> getFiniteStringsRecursive(Automaton a, int limit) {
HashSet<IntsRef> strings = new HashSet<>();
if (!getFiniteStrings(a, 0, new HashSet<Integer>(), strings, new IntsRefBuilder(), limit)) {
return strings;
}
return strings;
}
/**
* Returns the strings that can be produced from the given state, or
* false if more than <code>limit</code> strings are found.
* <code>limit</code>&lt;0 means "infinite".
*/
private static boolean getFiniteStrings(Automaton a, int s, HashSet<Integer> pathstates,
HashSet<IntsRef> strings, IntsRefBuilder path, int limit) {
pathstates.add(s);
Transition t = new Transition();
int count = a.initTransition(s, t);
for (int i=0;i<count;i++) {
a.getNextTransition(t);
if (pathstates.contains(t.dest)) {
return false;
}
for (int n = t.min; n <= t.max; n++) {
path.append(n);
if (a.isAccept(t.dest)) {
strings.add(path.toIntsRef());
if (limit >= 0 && strings.size() > limit) {
return false;
}
}
if (!getFiniteStrings(a, t.dest, pathstates, strings, path, limit)) {
return false;
}
path.setLength(path.length() - 1);
}
}
pathstates.remove(s);
return true;
}
/**
* Returns true if the language of this automaton is finite.
* <p>
* WARNING: this method is slow, it will blow up if the automaton is large.
* this is only used to test the correctness of our faster implementation.
*/
public static boolean isFiniteSlow(Automaton a) {
if (a.getNumStates() == 0) {
return true;
}
return isFiniteSlow(a, 0, new HashSet<Integer>());
}
/**
* Checks whether there is a loop containing s. (This is sufficient since
* there are never transitions to dead states.)
*/
// TODO: not great that this is recursive... in theory a
// large automata could exceed java's stack
private static boolean isFiniteSlow(Automaton a, int s, HashSet<Integer> path) {
path.add(s);
Transition t = new Transition();
int count = a.initTransition(s, t);
for (int i=0;i<count;i++) {
a.getNextTransition(t);
if (path.contains(t.dest) || !isFiniteSlow(a, t.dest, path)) {
return false;
}
}
path.remove(s);
return true;
}
/**
* Checks that an automaton has no detached states that are unreachable
* from the initial state.
*/
public static void assertNoDetachedStates(Automaton a) {
Automaton a2 = Operations.removeDeadStates(a);
assert a.getNumStates() == a2.getNumStates() : "automaton has " + (a.getNumStates() - a2.getNumStates()) + " detached states";
}
/** Returns true if the automaton is deterministic. */
public static boolean isDeterministicSlow(Automaton a) {
Transition t = new Transition();
int numStates = a.getNumStates();
for(int s=0;s<numStates;s++) {
int count = a.initTransition(s, t);
int lastMax = -1;
for(int i=0;i<count;i++) {
a.getNextTransition(t);
if (t.min <= lastMax) {
assert a.isDeterministic() == false;
return false;
}
lastMax = t.max;
}
}
assert a.isDeterministic() == true;
return true;
}
}