blob: 44902dcdadd1ef39fa7916da3206ea7360a3acdf [file] [log] [blame]
using J2N;
using Lucene.Net.Diagnostics;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Util.Automaton
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Class to construct DFAs that match a word within some edit distance.
/// <para/>
/// Implements the algorithm described in:
/// Schulz and Mihov: Fast String Correction with Levenshtein Automata
/// <para/>
/// @lucene.experimental
/// </summary>
public class LevenshteinAutomata
{
/// <summary>
/// @lucene.internal </summary>
public const int MAXIMUM_SUPPORTED_DISTANCE = 2;
/* input word */
internal readonly int[] word;
/* the automata alphabet. */
internal readonly int[] alphabet;
/* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
internal readonly int alphaMax;
/* the ranges outside of alphabet */
internal readonly int[] rangeLower;
internal readonly int[] rangeUpper;
internal int numRanges = 0;
internal ParametricDescription[] descriptions;
/// <summary>
/// Create a new <see cref="LevenshteinAutomata"/> for some <paramref name="input"/> string.
/// Optionally count transpositions as a primitive edit.
/// </summary>
public LevenshteinAutomata(string input, bool withTranspositions)
: this(CodePoints(input), Character.MaxCodePoint, withTranspositions)
{
}
/// <summary>
/// Expert: specify a custom maximum possible symbol
/// (alphaMax); default is <see cref="Character.MaxCodePoint"/>.
/// </summary>
public LevenshteinAutomata(int[] word, int alphaMax, bool withTranspositions)
{
this.word = word;
this.alphaMax = alphaMax;
// calculate the alphabet
ISet<int> set = new JCG.SortedSet<int>();
for (int i = 0; i < word.Length; i++)
{
int v = word[i];
if (v > alphaMax)
{
throw new ArgumentException("alphaMax exceeded by symbol " + v + " in word");
}
set.Add(v);
}
alphabet = new int[set.Count];
using (IEnumerator<int> iterator = set.GetEnumerator())
{
for (int i = 0; i < alphabet.Length; i++)
{
iterator.MoveNext();
alphabet[i] = iterator.Current;
}
}
rangeLower = new int[alphabet.Length + 2];
rangeUpper = new int[alphabet.Length + 2];
// calculate the unicode range intervals that exclude the alphabet
// these are the ranges for all unicode characters not in the alphabet
int lower = 0;
for (int i = 0; i < alphabet.Length; i++)
{
int higher = alphabet[i];
if (higher > lower)
{
rangeLower[numRanges] = lower;
rangeUpper[numRanges] = higher - 1;
numRanges++;
}
lower = higher + 1;
}
/* add the final endpoint */
if (lower <= alphaMax)
{
rangeLower[numRanges] = lower;
rangeUpper[numRanges] = alphaMax;
numRanges++;
}
descriptions = new ParametricDescription[] {
null,
withTranspositions ? (ParametricDescription)new Lev1TParametricDescription(word.Length) : new Lev1ParametricDescription(word.Length),
withTranspositions ? (ParametricDescription)new Lev2TParametricDescription(word.Length) : new Lev2ParametricDescription(word.Length)
};
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int[] CodePoints(string input)
{
int length = Character.CodePointCount(input, 0, input.Length);
int[] word = new int[length];
int cp; // LUCENENET: Removed unnecessary assignment
for (int i = 0, j = 0; i < input.Length; i += Character.CharCount(cp))
{
word[j++] = cp = Character.CodePointAt(input, i);
}
return word;
}
/// <summary>
/// Compute a DFA that accepts all strings within an edit distance of <paramref name="n"/>.
/// <para>
/// All automata have the following properties:
/// <list type="bullet">
/// <item><description>They are deterministic (DFA).</description></item>
/// <item><description>There are no transitions to dead states.</description></item>
/// <item><description>They are not minimal (some transitions could be combined).</description></item>
/// </list>
/// </para>
/// </summary>
public virtual Automaton ToAutomaton(int n)
{
if (n == 0)
{
return BasicAutomata.MakeString(word, 0, word.Length);
}
if (n >= descriptions.Length)
{
return null;
}
int range = 2 * n + 1;
ParametricDescription description = descriptions[n];
// the number of states is based on the length of the word and n
State[] states = new State[description.Count];
// create all states, and mark as accept states if appropriate
for (int i = 0; i < states.Length; i++)
{
states[i] = new State
{
number = i,
accept = description.IsAccept(i)
};
}
// create transitions from state to state
for (int k = 0; k < states.Length; k++)
{
int xpos = description.GetPosition(k);
if (xpos < 0)
{
continue;
}
int end = xpos + Math.Min(word.Length - xpos, range);
for (int x = 0; x < alphabet.Length; x++)
{
int ch = alphabet[x];
// get the characteristic vector at this position wrt ch
int cvec = GetVector(ch, xpos, end);
int dest = description.Transition(k, xpos, cvec);
if (dest >= 0)
{
states[k].AddTransition(new Transition(ch, states[dest]));
}
}
// add transitions for all other chars in unicode
// by definition, their characteristic vectors are always 0,
// because they do not exist in the input string.
int dest_ = description.Transition(k, xpos, 0); // by definition
if (dest_ >= 0)
{
for (int r = 0; r < numRanges; r++)
{
states[k].AddTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest_]));
}
}
}
Automaton a = new Automaton(states[0])
{
deterministic = true
};
// we create some useless unconnected states, and its a net-win overall to remove these,
// as well as to combine any adjacent transitions (it makes later algorithms more efficient).
// so, while we could set our numberedStates here, its actually best not to, and instead to
// force a traversal in reduce, pruning the unconnected states while we combine adjacent transitions.
//a.setNumberedStates(states);
a.Reduce();
// we need not trim transitions to dead states, as they are not created.
//a.restoreInvariant();
return a;
}
/// <summary>
/// Get the characteristic vector <c>X(x, V)</c>
/// where V is <c>Substring(pos, end - pos)</c>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal virtual int GetVector(int x, int pos, int end)
{
int vector = 0;
for (int i = pos; i < end; i++)
{
vector <<= 1;
if (word[i] == x)
{
vector |= 1;
}
}
return vector;
}
/// <summary>
/// A <see cref="ParametricDescription"/> describes the structure of a Levenshtein DFA for some degree <c>n</c>.
/// <para/>
/// There are four components of a parametric description, all parameterized on the length
/// of the word <c>w</c>:
/// <list type="number">
/// <item><description>The number of states: <see cref="Count"/></description></item>
/// <item><description>The set of final states: <see cref="IsAccept(int)"/></description></item>
/// <item><description>The transition function: <see cref="Transition(int, int, int)"/></description></item>
/// <item><description>Minimal boundary function: <see cref="GetPosition(int)"/></description></item>
/// </list>
/// </summary>
internal abstract class ParametricDescription
{
protected readonly int m_w;
protected readonly int m_n;
private readonly int[] minErrors;
internal ParametricDescription(int w, int n, int[] minErrors)
{
this.m_w = w;
this.m_n = n;
this.minErrors = minErrors;
}
/// <summary>
/// Return the number of states needed to compute a Levenshtein DFA.
/// <para/>
/// NOTE: This was size() in Lucene.
/// </summary>
internal virtual int Count => minErrors.Length * (m_w + 1);
/// <summary>
/// Returns <c>true</c> if the <c>state</c> in any Levenshtein DFA is an accept state (final state).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal virtual bool IsAccept(int absState)
{
// decode absState -> state, offset
int state = absState / (m_w + 1);
int offset = absState % (m_w + 1);
if (Debugging.AssertsEnabled) Debugging.Assert(offset >= 0);
return m_w - offset + minErrors[state] <= m_n;
}
/// <summary>
/// Returns the position in the input word for a given <c>state</c>.
/// this is the minimal boundary for the state.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal virtual int GetPosition(int absState)
{
return absState % (m_w + 1);
}
/// <summary>
/// Returns the state number for a transition from the given <paramref name="state"/>,
/// assuming <paramref name="position"/> and characteristic vector <paramref name="vector"/>.
/// </summary>
internal abstract int Transition(int state, int position, int vector);
private static readonly long[] MASKS = new long[] {
0x1, 0x3, 0x7, 0xf,
0x1f, 0x3f, 0x7f, 0xff,
0x1ff, 0x3ff, 0x7ff, 0xfff,
0x1fff, 0x3fff, 0x7fff, 0xffff,
0x1ffff, 0x3ffff, 0x7ffff, 0xfffff,
0x1fffff, 0x3fffff, 0x7fffff, 0xffffff,
0x1ffffff, 0x3ffffff, 0x7ffffff, 0xfffffff,
0x1fffffff, 0x3fffffff, 0x7fffffffL, 0xffffffffL,
0x1ffffffffL, 0x3ffffffffL, 0x7ffffffffL, 0xfffffffffL,
0x1fffffffffL, 0x3fffffffffL, 0x7fffffffffL, 0xffffffffffL,
0x1ffffffffffL, 0x3ffffffffffL, 0x7ffffffffffL, 0xfffffffffffL,
0x1fffffffffffL, 0x3fffffffffffL, 0x7fffffffffffL, 0xffffffffffffL,
0x1ffffffffffffL, 0x3ffffffffffffL, 0x7ffffffffffffL, 0xfffffffffffffL,
0x1fffffffffffffL, 0x3fffffffffffffL, 0x7fffffffffffffL, 0xffffffffffffffL,
0x1ffffffffffffffL, 0x3ffffffffffffffL, 0x7ffffffffffffffL, 0xfffffffffffffffL,
0x1fffffffffffffffL, 0x3fffffffffffffffL, 0x7fffffffffffffffL
};
protected internal virtual int Unpack(long[] data, int index, int bitsPerValue)
{
long bitLoc = bitsPerValue * index;
int dataLoc = (int)(bitLoc >> 6);
int bitStart = (int)(bitLoc & 63);
if (bitStart + bitsPerValue <= 64)
{
// not split
return (int)((data[dataLoc] >> bitStart) & MASKS[bitsPerValue - 1]);
}
else
{
// split
int part = 64 - bitStart;
return (int)(((data[dataLoc] >> bitStart) & MASKS[part - 1]) + ((data[1 + dataLoc] & MASKS[bitsPerValue - part - 1]) << part));
}
}
}
}
}