| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util.automaton; |
| |
| |
| import java.util.Iterator; |
| import java.util.SortedSet; |
| import java.util.TreeSet; |
| |
| import org.apache.lucene.util.UnicodeUtil; |
| |
| /** |
| * Class to construct DFAs that match a word within some edit distance. |
| * <p> |
| * Implements the algorithm described in: |
| * Schulz and Mihov: Fast String Correction with Levenshtein Automata |
| * @lucene.experimental |
| */ |
| public class LevenshteinAutomata { |
| /** Maximum edit distance this class can generate an automaton for. |
| * @lucene.internal */ |
| public static final int MAXIMUM_SUPPORTED_DISTANCE = 2; |
| /* input word */ |
| final int word[]; |
| /* the automata alphabet. */ |
| final int alphabet[]; |
| /* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */ |
| final int alphaMax; |
| |
| /* the ranges outside of alphabet */ |
| final int rangeLower[]; |
| final int rangeUpper[]; |
| int numRanges = 0; |
| |
| ParametricDescription descriptions[]; |
| |
| /** |
| * Create a new LevenshteinAutomata for some input String. |
| * Optionally count transpositions as a primitive edit. |
| */ |
| public LevenshteinAutomata(String input, boolean withTranspositions) { |
| this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions); |
| } |
| |
| /** |
| * Expert: specify a custom maximum possible symbol |
| * (alphaMax); default is Character.MAX_CODE_POINT. |
| */ |
| public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) { |
| this.word = word; |
| this.alphaMax = alphaMax; |
| |
| // calculate the alphabet |
| SortedSet<Integer> set = new TreeSet<>(); |
| for (int i = 0; i < word.length; i++) { |
| int v = word[i]; |
| if (v > alphaMax) { |
| throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word"); |
| } |
| set.add(v); |
| } |
| alphabet = new int[set.size()]; |
| Iterator<Integer> iterator = set.iterator(); |
| for (int i = 0; i < alphabet.length; i++) |
| alphabet[i] = iterator.next(); |
| |
| rangeLower = new int[alphabet.length + 2]; |
| rangeUpper = new int[alphabet.length + 2]; |
| // calculate the unicode range intervals that exclude the alphabet |
| // these are the ranges for all unicode characters not in the alphabet |
| int lower = 0; |
| for (int i = 0; i < alphabet.length; i++) { |
| int higher = alphabet[i]; |
| if (higher > lower) { |
| rangeLower[numRanges] = lower; |
| rangeUpper[numRanges] = higher - 1; |
| numRanges++; |
| } |
| lower = higher + 1; |
| } |
| /* add the final endpoint */ |
| if (lower <= alphaMax) { |
| rangeLower[numRanges] = lower; |
| rangeUpper[numRanges] = alphaMax; |
| numRanges++; |
| } |
| |
| descriptions = new ParametricDescription[] { |
| null, /* for n=0, we do not need to go through the trouble */ |
| withTranspositions ? new Lev1TParametricDescription(word.length) : new Lev1ParametricDescription(word.length), |
| withTranspositions ? new Lev2TParametricDescription(word.length) : new Lev2ParametricDescription(word.length), |
| }; |
| } |
| |
| private static int[] codePoints(String input) { |
| int length = Character.codePointCount(input, 0, input.length()); |
| int word[] = new int[length]; |
| for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) { |
| word[j++] = cp = input.codePointAt(i); |
| } |
| return word; |
| } |
| |
| /** |
| * Compute a DFA that accepts all strings within an edit distance of <code>n</code>. |
| * <p> |
| * All automata have the following properties: |
| * <ul> |
| * <li>They are deterministic (DFA). |
| * <li>There are no transitions to dead states. |
| * <li>They are not minimal (some transitions could be combined). |
| * </ul> |
| */ |
| public Automaton toAutomaton(int n) { |
| return toAutomaton(n, ""); |
| } |
| |
| /** |
| * Compute a DFA that accepts all strings within an edit distance of <code>n</code>, |
| * matching the specified exact prefix. |
| * <p> |
| * All automata have the following properties: |
| * <ul> |
| * <li>They are deterministic (DFA). |
| * <li>There are no transitions to dead states. |
| * <li>They are not minimal (some transitions could be combined). |
| * </ul> |
| */ |
| public Automaton toAutomaton(int n, String prefix) { |
| assert prefix != null; |
| if (n == 0) { |
| return Automata.makeString(prefix + UnicodeUtil.newString(word, 0, word.length)); |
| } |
| |
| if (n >= descriptions.length) |
| return null; |
| |
| final int range = 2*n+1; |
| ParametricDescription description = descriptions[n]; |
| // the number of states is based on the length of the word and n |
| final int numStates = description.size(); |
| final int numTransitions = numStates * Math.min(1 + 2 * n, alphabet.length); |
| final int prefixStates = prefix != null ? prefix.codePointCount(0, prefix.length()) : 0; |
| |
| final Automaton a = new Automaton(numStates + prefixStates, numTransitions); |
| int lastState; |
| if (prefix != null) { |
| // Insert prefix |
| lastState = a.createState(); |
| for (int i = 0, cp = 0; i < prefix.length(); i += Character.charCount(cp)) { |
| int state = a.createState(); |
| cp = prefix.codePointAt(i); |
| a.addTransition(lastState, state, cp, cp); |
| lastState = state; |
| } |
| } else { |
| lastState = a.createState(); |
| } |
| |
| int stateOffset = lastState; |
| a.setAccept(lastState, description.isAccept(0)); |
| |
| // create all states, and mark as accept states if appropriate |
| for (int i = 1; i < numStates; i++) { |
| int state = a.createState(); |
| a.setAccept(state, description.isAccept(i)); |
| } |
| |
| // TODO: this creates bogus states/transitions (states are final, have self loops, and can't be reached from an init state) |
| |
| // create transitions from state to state |
| for (int k = 0; k < numStates; k++) { |
| final int xpos = description.getPosition(k); |
| if (xpos < 0) |
| continue; |
| final int end = xpos + Math.min(word.length - xpos, range); |
| |
| for (int x = 0; x < alphabet.length; x++) { |
| final int ch = alphabet[x]; |
| // get the characteristic vector at this position wrt ch |
| final int cvec = getVector(ch, xpos, end); |
| int dest = description.transition(k, xpos, cvec); |
| if (dest >= 0) { |
| a.addTransition(stateOffset+k, stateOffset+dest, ch); |
| } |
| } |
| // add transitions for all other chars in unicode |
| // by definition, their characteristic vectors are always 0, |
| // because they do not exist in the input string. |
| int dest = description.transition(k, xpos, 0); // by definition |
| if (dest >= 0) { |
| for (int r = 0; r < numRanges; r++) { |
| a.addTransition(stateOffset+k, stateOffset+dest, rangeLower[r], rangeUpper[r]); |
| } |
| } |
| } |
| |
| a.finishState(); |
| assert a.isDeterministic(); |
| return a; |
| } |
| |
| /** |
| * Get the characteristic vector <code>X(x, V)</code> |
| * where V is <code>substring(pos, end)</code> |
| */ |
| int getVector(int x, int pos, int end) { |
| int vector = 0; |
| for (int i = pos; i < end; i++) { |
| vector <<= 1; |
| if (word[i] == x) |
| vector |= 1; |
| } |
| return vector; |
| } |
| |
| /** |
| * A ParametricDescription describes the structure of a Levenshtein DFA for some degree n. |
| * <p> |
| * There are four components of a parametric description, all parameterized on the length |
| * of the word <code>w</code>: |
| * <ol> |
| * <li>The number of states: {@link #size()} |
| * <li>The set of final states: {@link #isAccept(int)} |
| * <li>The transition function: {@link #transition(int, int, int)} |
| * <li>Minimal boundary function: {@link #getPosition(int)} |
| * </ol> |
| */ |
| static abstract class ParametricDescription { |
| protected final int w; |
| protected final int n; |
| private final int[] minErrors; |
| |
| ParametricDescription(int w, int n, int[] minErrors) { |
| this.w = w; |
| this.n = n; |
| this.minErrors = minErrors; |
| } |
| |
| /** |
| * Return the number of states needed to compute a Levenshtein DFA |
| */ |
| int size() { |
| return minErrors.length * (w+1); |
| }; |
| |
| /** |
| * Returns true if the <code>state</code> in any Levenshtein DFA is an accept state (final state). |
| */ |
| boolean isAccept(int absState) { |
| // decode absState -> state, offset |
| int state = absState/(w+1); |
| int offset = absState%(w+1); |
| assert offset >= 0; |
| return w - offset + minErrors[state] <= n; |
| } |
| |
| /** |
| * Returns the position in the input word for a given <code>state</code>. |
| * This is the minimal boundary for the state. |
| */ |
| int getPosition(int absState) { |
| return absState % (w+1); |
| } |
| |
| /** |
| * Returns the state number for a transition from the given <code>state</code>, |
| * assuming <code>position</code> and characteristic vector <code>vector</code> |
| */ |
| abstract int transition(int state, int position, int vector); |
| |
| private final static long[] MASKS = new long[] {0x1,0x3,0x7,0xf, |
| 0x1f,0x3f,0x7f,0xff, |
| 0x1ff,0x3ff,0x7ff,0xfff, |
| 0x1fff,0x3fff,0x7fff,0xffff, |
| 0x1ffff,0x3ffff,0x7ffff,0xfffff, |
| 0x1fffff,0x3fffff,0x7fffff,0xffffff, |
| 0x1ffffff,0x3ffffff,0x7ffffff,0xfffffff, |
| 0x1fffffff,0x3fffffff,0x7fffffffL,0xffffffffL, |
| 0x1ffffffffL,0x3ffffffffL,0x7ffffffffL,0xfffffffffL, |
| 0x1fffffffffL,0x3fffffffffL,0x7fffffffffL,0xffffffffffL, |
| 0x1ffffffffffL,0x3ffffffffffL,0x7ffffffffffL,0xfffffffffffL, |
| 0x1fffffffffffL,0x3fffffffffffL,0x7fffffffffffL,0xffffffffffffL, |
| 0x1ffffffffffffL,0x3ffffffffffffL,0x7ffffffffffffL,0xfffffffffffffL, |
| 0x1fffffffffffffL,0x3fffffffffffffL,0x7fffffffffffffL,0xffffffffffffffL, |
| 0x1ffffffffffffffL,0x3ffffffffffffffL,0x7ffffffffffffffL,0xfffffffffffffffL, |
| 0x1fffffffffffffffL,0x3fffffffffffffffL,0x7fffffffffffffffL}; |
| |
| protected int unpack(long[] data, int index, int bitsPerValue) { |
| final long bitLoc = bitsPerValue * index; |
| final int dataLoc = (int) (bitLoc >> 6); |
| final int bitStart = (int) (bitLoc & 63); |
| //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue); |
| if (bitStart + bitsPerValue <= 64) { |
| // not split |
| return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]); |
| } else { |
| // split |
| final int part = 64-bitStart; |
| return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) + |
| ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part)); |
| } |
| } |
| } |
| } |