| /* |
| * dk.brics.automaton |
| * |
| * Copyright (c) 2001-2009 Anders Moeller |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * 3. The name of the author may not be used to endorse or promote products |
| * derived from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
| * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
| * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| package org.apache.lucene.util.automaton; |
| |
| import java.util.*; |
| |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.StringHelper; |
| |
| /** |
| * Construction of basic automata. |
| * |
| * @lucene.experimental |
| */ |
| final public class Automata { |
| |
| private Automata() {} |
| |
| /** |
| * Returns a new (deterministic) automaton with the empty language. |
| */ |
| public static Automaton makeEmpty() { |
| Automaton a = new Automaton(); |
| a.finishState(); |
| return a; |
| } |
| |
| /** |
| * Returns a new (deterministic) automaton that accepts only the empty string. |
| */ |
| public static Automaton makeEmptyString() { |
| Automaton a = new Automaton(); |
| a.createState(); |
| a.setAccept(0, true); |
| return a; |
| } |
| |
| /** |
| * Returns a new (deterministic) automaton that accepts all strings. |
| */ |
| public static Automaton makeAnyString() { |
| Automaton a = new Automaton(); |
| int s = a.createState(); |
| a.setAccept(s, true); |
| a.addTransition(s, s, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); |
| a.finishState(); |
| return a; |
| } |
| |
| /** |
| * Returns a new (deterministic) automaton that accepts all binary terms. |
| */ |
| public static Automaton makeAnyBinary() { |
| Automaton a = new Automaton(); |
| int s = a.createState(); |
| a.setAccept(s, true); |
| a.addTransition(s, s, 0, 255); |
| a.finishState(); |
| return a; |
| } |
| |
| /** |
| * Returns a new (deterministic) automaton that accepts all binary terms except |
| * the empty string. |
| */ |
| public static Automaton makeNonEmptyBinary() { |
| Automaton a = new Automaton(); |
| int s1 = a.createState(); |
| int s2 = a.createState(); |
| a.setAccept(s2, true); |
| a.addTransition(s1, s2, 0, 255); |
| a.addTransition(s2, s2, 0, 255); |
| a.finishState(); |
| return a; |
| } |
| |
| /** |
| * Returns a new (deterministic) automaton that accepts any single codepoint. |
| */ |
| public static Automaton makeAnyChar() { |
| return makeCharRange(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); |
| } |
| |
| /** Accept any single character starting from the specified state, returning the new state */ |
| public static int appendAnyChar(Automaton a, int state) { |
| int newState = a.createState(); |
| a.addTransition(state, newState, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); |
| return newState; |
| } |
| |
| /** |
| * Returns a new (deterministic) automaton that accepts a single codepoint of |
| * the given value. |
| */ |
| public static Automaton makeChar(int c) { |
| return makeCharRange(c, c); |
| } |
| |
| /** Appends the specified character to the specified state, returning a new state. */ |
| public static int appendChar(Automaton a, int state, int c) { |
| int newState = a.createState(); |
| a.addTransition(state, newState, c, c); |
| return newState; |
| } |
| |
| /** |
| * Returns a new (deterministic) automaton that accepts a single codepoint whose |
| * value is in the given interval (including both end points). |
| */ |
| public static Automaton makeCharRange(int min, int max) { |
| if (min > max) { |
| return makeEmpty(); |
| } |
| Automaton a = new Automaton(); |
| int s1 = a.createState(); |
| int s2 = a.createState(); |
| a.setAccept(s2, true); |
| a.addTransition(s1, s2, min, max); |
| a.finishState(); |
| return a; |
| } |
| |
| /** |
| * Constructs sub-automaton corresponding to decimal numbers of length |
| * x.substring(n).length(). |
| */ |
| private static int anyOfRightLength(Automaton.Builder builder, String x, int n) { |
| int s = builder.createState(); |
| if (x.length() == n) { |
| builder.setAccept(s, true); |
| } else { |
| builder.addTransition(s, anyOfRightLength(builder, x, n + 1), '0', '9'); |
| } |
| return s; |
| } |
| |
| /** |
| * Constructs sub-automaton corresponding to decimal numbers of value at least |
| * x.substring(n) and length x.substring(n).length(). |
| */ |
| private static int atLeast(Automaton.Builder builder, String x, int n, Collection<Integer> initials, |
| boolean zeros) { |
| int s = builder.createState(); |
| if (x.length() == n) { |
| builder.setAccept(s, true); |
| } else { |
| if (zeros) { |
| initials.add(s); |
| } |
| char c = x.charAt(n); |
| builder.addTransition(s, atLeast(builder, x, n + 1, initials, zeros && c == '0'), c); |
| if (c < '9') { |
| builder.addTransition(s, anyOfRightLength(builder, x, n + 1), (char) (c + 1), '9'); |
| } |
| } |
| return s; |
| } |
| |
| /** |
| * Constructs sub-automaton corresponding to decimal numbers of value at most |
| * x.substring(n) and length x.substring(n).length(). |
| */ |
| private static int atMost(Automaton.Builder builder, String x, int n) { |
| int s = builder.createState(); |
| if (x.length() == n) { |
| builder.setAccept(s, true); |
| } else { |
| char c = x.charAt(n); |
| builder.addTransition(s, atMost(builder, x, (char) n + 1), c); |
| if (c > '0') { |
| builder.addTransition(s, anyOfRightLength(builder, x, n + 1), '0', (char) (c - 1)); |
| } |
| } |
| return s; |
| } |
| |
| /** |
| * Constructs sub-automaton corresponding to decimal numbers of value between |
| * x.substring(n) and y.substring(n) and of length x.substring(n).length() |
| * (which must be equal to y.substring(n).length()). |
| */ |
| private static int between(Automaton.Builder builder, |
| String x, String y, int n, |
| Collection<Integer> initials, boolean zeros) { |
| int s = builder.createState(); |
| if (x.length() == n) { |
| builder.setAccept(s, true); |
| } else { |
| if (zeros) { |
| initials.add(s); |
| } |
| char cx = x.charAt(n); |
| char cy = y.charAt(n); |
| if (cx == cy) { |
| builder.addTransition(s, between(builder, x, y, n + 1, initials, zeros && cx == '0'), cx); |
| } else { // cx<cy |
| builder.addTransition(s, atLeast(builder, x, n + 1, initials, zeros && cx == '0'), cx); |
| builder.addTransition(s, atMost(builder, y, n + 1), cy); |
| if (cx + 1 < cy) { |
| builder.addTransition(s, anyOfRightLength(builder, x, n+1), (char) (cx + 1), (char) (cy - 1)); |
| } |
| } |
| } |
| |
| return s; |
| } |
| |
| private static boolean suffixIsZeros(BytesRef br, int len) { |
| for(int i=len;i<br.length;i++) { |
| if (br.bytes[br.offset+i] != 0) { |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| /** Creates a new deterministic, minimal automaton accepting |
| * all binary terms in the specified interval. Note that unlike |
| * {@link #makeDecimalInterval}, the returned automaton is infinite, |
| * because terms behave like floating point numbers leading with |
| * a decimal point. However, in the special case where min == max, |
| * and both are inclusive, the automata will be finite and accept |
| * exactly one term. */ |
| public static Automaton makeBinaryInterval(BytesRef min, boolean minInclusive, BytesRef max, boolean maxInclusive) { |
| |
| if (min == null && minInclusive == false) { |
| throw new IllegalArgumentException("minInclusive must be true when min is null (open ended)"); |
| } |
| |
| if (max == null && maxInclusive == false) { |
| throw new IllegalArgumentException("maxInclusive must be true when max is null (open ended)"); |
| } |
| |
| if (min == null) { |
| min = new BytesRef(); |
| minInclusive = true; |
| } |
| |
| int cmp; |
| if (max != null) { |
| cmp = min.compareTo(max); |
| } else { |
| cmp = -1; |
| if (min.length == 0) { |
| if (minInclusive) { |
| return makeAnyBinary(); |
| } else { |
| return makeNonEmptyBinary(); |
| } |
| } |
| } |
| |
| if (cmp == 0) { |
| if (minInclusive == false || maxInclusive == false) { |
| return makeEmpty(); |
| } else { |
| return makeBinary(min); |
| } |
| } else if (cmp > 0) { |
| // max < min |
| return makeEmpty(); |
| } |
| |
| if (max != null && |
| StringHelper.startsWith(max, min) && |
| suffixIsZeros(max, min.length)) { |
| |
| // Finite case: no sink state! |
| |
| int maxLength = max.length; |
| |
| // the == case was handled above |
| assert maxLength > min.length; |
| |
| // bar -> bar\0+ |
| if (maxInclusive == false) { |
| maxLength--; |
| } |
| |
| if (maxLength == min.length) { |
| if (minInclusive == false) { |
| return makeEmpty(); |
| } else { |
| return makeBinary(min); |
| } |
| } |
| |
| Automaton a = new Automaton(); |
| int lastState = a.createState(); |
| for (int i=0;i<min.length;i++) { |
| int state = a.createState(); |
| int label = min.bytes[min.offset+i] & 0xff; |
| a.addTransition(lastState, state, label); |
| lastState = state; |
| } |
| |
| if (minInclusive) { |
| a.setAccept(lastState, true); |
| } |
| |
| for(int i=min.length;i<maxLength;i++) { |
| int state = a.createState(); |
| a.addTransition(lastState, state, 0); |
| a.setAccept(state, true); |
| lastState = state; |
| } |
| a.finishState(); |
| return a; |
| } |
| |
| Automaton a = new Automaton(); |
| int startState = a.createState(); |
| |
| int sinkState = a.createState(); |
| a.setAccept(sinkState, true); |
| |
| // This state accepts all suffixes: |
| a.addTransition(sinkState, sinkState, 0, 255); |
| |
| boolean equalPrefix = true; |
| int lastState = startState; |
| int firstMaxState = -1; |
| int sharedPrefixLength = 0; |
| for(int i=0;i<min.length;i++) { |
| int minLabel = min.bytes[min.offset+i] & 0xff; |
| |
| int maxLabel; |
| if (max != null && equalPrefix && i < max.length) { |
| maxLabel = max.bytes[max.offset+i] & 0xff; |
| } else { |
| maxLabel = -1; |
| } |
| |
| int nextState; |
| if (minInclusive && i == min.length-1 && (equalPrefix == false || minLabel != maxLabel)) { |
| nextState = sinkState; |
| } else { |
| nextState = a.createState(); |
| } |
| |
| if (equalPrefix) { |
| |
| if (minLabel == maxLabel) { |
| // Still in shared prefix |
| a.addTransition(lastState, nextState, minLabel); |
| } else if (max == null) { |
| equalPrefix = false; |
| sharedPrefixLength = 0; |
| a.addTransition(lastState, sinkState, minLabel+1, 0xff); |
| a.addTransition(lastState, nextState, minLabel); |
| } else { |
| // This is the first point where min & max diverge: |
| assert maxLabel > minLabel; |
| |
| a.addTransition(lastState, nextState, minLabel); |
| |
| if (maxLabel > minLabel + 1) { |
| a.addTransition(lastState, sinkState, minLabel+1, maxLabel-1); |
| } |
| |
| // Now fork off path for max: |
| if (maxInclusive || i < max.length-1) { |
| firstMaxState = a.createState(); |
| if (i < max.length-1) { |
| a.setAccept(firstMaxState, true); |
| } |
| a.addTransition(lastState, firstMaxState, maxLabel); |
| } |
| equalPrefix = false; |
| sharedPrefixLength = i; |
| } |
| } else { |
| // OK, already diverged: |
| a.addTransition(lastState, nextState, minLabel); |
| if (minLabel < 255) { |
| a.addTransition(lastState, sinkState, minLabel+1, 255); |
| } |
| } |
| lastState = nextState; |
| } |
| |
| // Accept any suffix appended to the min term: |
| if (equalPrefix == false && lastState != sinkState && lastState != startState) { |
| a.addTransition(lastState, sinkState, 0, 255); |
| } |
| |
| if (minInclusive) { |
| // Accept exactly the min term: |
| a.setAccept(lastState, true); |
| } |
| |
| if (max != null) { |
| |
| // Now do max: |
| if (firstMaxState == -1) { |
| // Min was a full prefix of max |
| sharedPrefixLength = min.length; |
| } else { |
| lastState = firstMaxState; |
| sharedPrefixLength++; |
| } |
| for(int i=sharedPrefixLength;i<max.length;i++) { |
| int maxLabel = max.bytes[max.offset+i]&0xff; |
| if (maxLabel > 0) { |
| a.addTransition(lastState, sinkState, 0, maxLabel-1); |
| } |
| if (maxInclusive || i < max.length-1) { |
| int nextState = a.createState(); |
| if (i < max.length-1) { |
| a.setAccept(nextState, true); |
| } |
| a.addTransition(lastState, nextState, maxLabel); |
| lastState = nextState; |
| } |
| } |
| |
| if (maxInclusive) { |
| a.setAccept(lastState, true); |
| } |
| } |
| |
| a.finishState(); |
| |
| assert a.isDeterministic(): a.toDot(); |
| |
| return a; |
| } |
| |
| /** |
| * Returns a new automaton that accepts strings representing decimal (base 10) |
| * non-negative integers in the given interval. |
| * |
| * @param min minimal value of interval |
| * @param max maximal value of interval (both end points are included in the |
| * interval) |
| * @param digits if > 0, use fixed number of digits (strings must be prefixed |
| * by 0's to obtain the right length) - otherwise, the number of |
| * digits is not fixed (any number of leading 0s is accepted) |
| * @exception IllegalArgumentException if min > max or if numbers in the |
| * interval cannot be expressed with the given fixed number of |
| * digits |
| */ |
| public static Automaton makeDecimalInterval(int min, int max, int digits) |
| throws IllegalArgumentException { |
| String x = Integer.toString(min); |
| String y = Integer.toString(max); |
| if (min > max || (digits > 0 && y.length() > digits)) { |
| throw new IllegalArgumentException(); |
| } |
| int d; |
| if (digits > 0) d = digits; |
| else d = y.length(); |
| StringBuilder bx = new StringBuilder(); |
| for (int i = x.length(); i < d; i++) { |
| bx.append('0'); |
| } |
| bx.append(x); |
| x = bx.toString(); |
| StringBuilder by = new StringBuilder(); |
| for (int i = y.length(); i < d; i++) { |
| by.append('0'); |
| } |
| by.append(y); |
| y = by.toString(); |
| |
| Automaton.Builder builder = new Automaton.Builder(); |
| |
| if (digits <= 0) { |
| // Reserve the "real" initial state: |
| builder.createState(); |
| } |
| |
| Collection<Integer> initials = new ArrayList<>(); |
| |
| between(builder, x, y, 0, initials, digits <= 0); |
| |
| Automaton a1 = builder.finish(); |
| |
| if (digits <= 0) { |
| a1.addTransition(0, 0, '0'); |
| for (int p : initials) { |
| a1.addEpsilon(0, p); |
| } |
| a1.finishState(); |
| } |
| |
| return a1; |
| } |
| |
| /** |
| * Returns a new (deterministic) automaton that accepts the single given |
| * string. |
| */ |
| public static Automaton makeString(String s) { |
| Automaton a = new Automaton(); |
| int lastState = a.createState(); |
| for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) { |
| int state = a.createState(); |
| cp = s.codePointAt(i); |
| a.addTransition(lastState, state, cp); |
| lastState = state; |
| } |
| |
| a.setAccept(lastState, true); |
| a.finishState(); |
| |
| assert a.isDeterministic(); |
| assert Operations.hasDeadStates(a) == false; |
| |
| return a; |
| } |
| |
| /** |
| * Returns a new (deterministic) automaton that accepts the single given |
| * binary term. |
| */ |
| public static Automaton makeBinary(BytesRef term) { |
| Automaton a = new Automaton(); |
| int lastState = a.createState(); |
| for (int i=0;i<term.length;i++) { |
| int state = a.createState(); |
| int label = term.bytes[term.offset+i] & 0xff; |
| a.addTransition(lastState, state, label); |
| lastState = state; |
| } |
| |
| a.setAccept(lastState, true); |
| a.finishState(); |
| |
| assert a.isDeterministic(); |
| assert Operations.hasDeadStates(a) == false; |
| |
| return a; |
| } |
| |
| /** |
| * Returns a new (deterministic) automaton that accepts the single given |
| * string from the specified unicode code points. |
| */ |
| public static Automaton makeString(int[] word, int offset, int length) { |
| Automaton a = new Automaton(); |
| a.createState(); |
| int s = 0; |
| for (int i = offset; i < offset+length; i++) { |
| int s2 = a.createState(); |
| a.addTransition(s, s2, word[i]); |
| s = s2; |
| } |
| a.setAccept(s, true); |
| a.finishState(); |
| |
| return a; |
| } |
| |
| /** |
| * Returns a new (deterministic and minimal) automaton that accepts the union |
| * of the given collection of {@link BytesRef}s representing UTF-8 encoded |
| * strings. |
| * |
| * @param utf8Strings |
| * The input strings, UTF-8 encoded. The collection must be in sorted |
| * order. |
| * |
| * @return An {@link Automaton} accepting all input strings. The resulting |
| * automaton is codepoint based (full unicode codepoints on |
| * transitions). |
| */ |
| public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) { |
| if (utf8Strings.isEmpty()) { |
| return makeEmpty(); |
| } else { |
| return DaciukMihovAutomatonBuilder.build(utf8Strings); |
| } |
| } |
| } |