src/Lucene.Net/Util/Automaton/LevenshteinAutomata.cs - lucenenet - Git at Google

 using J2N;
 using Lucene.Net.Diagnostics;
 using System;
 using System.Collections.Generic;
 using System.Runtime.CompilerServices;
 using JCG = J2N.Collections.Generic;

 namespace Lucene.Net.Util.Automaton
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Class to construct DFAs that match a word within some edit distance.
     /// <para/>
     /// Implements the algorithm described in:
     /// Schulz and Mihov: Fast String Correction with Levenshtein Automata
     /// <para/>
     /// @lucene.experimental
     /// </summary>
     public class LevenshteinAutomata
     {
         /// <summary>
         /// @lucene.internal </summary>
         public const int MAXIMUM_SUPPORTED_DISTANCE = 2;

         /* input word */
         internal readonly int[] word;
         /* the automata alphabet. */
         internal readonly int[] alphabet;
         /* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
         internal readonly int alphaMax;

         /* the ranges outside of alphabet */
         internal readonly int[] rangeLower;
         internal readonly int[] rangeUpper;
         internal int numRanges = 0;

         internal ParametricDescription[] descriptions;

         /// <summary>
         /// Create a new <see cref="LevenshteinAutomata"/> for some <paramref name="input"/> string.
         /// Optionally count transpositions as a primitive edit.
         /// </summary>
         public LevenshteinAutomata(string input, bool withTranspositions)
             : this(CodePoints(input), Character.MaxCodePoint, withTranspositions)
         {
         }

         /// <summary>
         /// Expert: specify a custom maximum possible symbol
         /// (alphaMax); default is <see cref="Character.MaxCodePoint"/>.
         /// </summary>
         public LevenshteinAutomata(int[] word, int alphaMax, bool withTranspositions)
         {
             this.word = word;
             this.alphaMax = alphaMax;

             // calculate the alphabet
             ISet<int> set = new JCG.SortedSet<int>();
             for (int i = 0; i < word.Length; i++)
             {
                 int v = word[i];
                 if (v > alphaMax)
                 {
                     throw new ArgumentException("alphaMax exceeded by symbol " + v + " in word");
                 }
                 set.Add(v);
             }
             alphabet = new int[set.Count];
             using (IEnumerator<int> iterator = set.GetEnumerator())
             {
                 for (int i = 0; i < alphabet.Length; i++)
                 {
                     iterator.MoveNext();
                     alphabet[i] = iterator.Current;
                 }
             }

             rangeLower = new int[alphabet.Length + 2];
             rangeUpper = new int[alphabet.Length + 2];
             // calculate the unicode range intervals that exclude the alphabet
             // these are the ranges for all unicode characters not in the alphabet
             int lower = 0;
             for (int i = 0; i < alphabet.Length; i++)
             {
                 int higher = alphabet[i];
                 if (higher > lower)
                 {
                     rangeLower[numRanges] = lower;
                     rangeUpper[numRanges] = higher - 1;
                     numRanges++;
                 }
                 lower = higher + 1;
             }
             /* add the final endpoint */
             if (lower <= alphaMax)
             {
                 rangeLower[numRanges] = lower;
                 rangeUpper[numRanges] = alphaMax;
                 numRanges++;
             }

             descriptions = new ParametricDescription[] {
                 null,
                 withTranspositions ? (ParametricDescription)new Lev1TParametricDescription(word.Length) : new Lev1ParametricDescription(word.Length),
                 withTranspositions ? (ParametricDescription)new Lev2TParametricDescription(word.Length) : new Lev2ParametricDescription(word.Length)
             };
         }

         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static int[] CodePoints(string input)
         {
             int length = Character.CodePointCount(input, 0, input.Length);
             int[] word = new int[length];
             int cp; // LUCENENET: Removed unnecessary assignment
             for (int i = 0, j = 0; i < input.Length; i += Character.CharCount(cp))
             {
                 word[j++] = cp = Character.CodePointAt(input, i);
             }
             return word;
         }

         /// <summary>
         /// Compute a DFA that accepts all strings within an edit distance of <paramref name="n"/>.
         /// <para>
         /// All automata have the following properties:
         /// <list type="bullet">
         ///     <item><description>They are deterministic (DFA).</description></item>
         ///     <item><description>There are no transitions to dead states.</description></item>
         ///     <item><description>They are not minimal (some transitions could be combined).</description></item>
         /// </list>
         /// </para>
         /// </summary>
         public virtual Automaton ToAutomaton(int n)
         {
             if (n == 0)
             {
                 return BasicAutomata.MakeString(word, 0, word.Length);
             }

             if (n >= descriptions.Length)
             {
                 return null;
             }

             int range = 2 * n + 1;
             ParametricDescription description = descriptions[n];
             // the number of states is based on the length of the word and n
             State[] states = new State[description.Count];
             // create all states, and mark as accept states if appropriate
             for (int i = 0; i < states.Length; i++)
             {
                 states[i] = new State
                 {
                     number = i,
                     accept = description.IsAccept(i)
                 };
             }
             // create transitions from state to state
             for (int k = 0; k < states.Length; k++)
             {
                 int xpos = description.GetPosition(k);
                 if (xpos < 0)
                 {
                     continue;
                 }
                 int end = xpos + Math.Min(word.Length - xpos, range);

                 for (int x = 0; x < alphabet.Length; x++)
                 {
                     int ch = alphabet[x];
                     // get the characteristic vector at this position wrt ch
                     int cvec = GetVector(ch, xpos, end);
                     int dest = description.Transition(k, xpos, cvec);
                     if (dest >= 0)
                     {
                         states[k].AddTransition(new Transition(ch, states[dest]));
                     }
                 }
                 // add transitions for all other chars in unicode
                 // by definition, their characteristic vectors are always 0,
                 // because they do not exist in the input string.
                 int dest_ = description.Transition(k, xpos, 0); // by definition
                 if (dest_ >= 0)
                 {
                     for (int r = 0; r < numRanges; r++)
                     {
                         states[k].AddTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest_]));
                     }
                 }
             }

             Automaton a = new Automaton(states[0])
             {
                 deterministic = true
             };
             // we create some useless unconnected states, and its a net-win overall to remove these,
             // as well as to combine any adjacent transitions (it makes later algorithms more efficient).
             // so, while we could set our numberedStates here, its actually best not to, and instead to
             // force a traversal in reduce, pruning the unconnected states while we combine adjacent transitions.
             //a.setNumberedStates(states);
             a.Reduce();
             // we need not trim transitions to dead states, as they are not created.
             //a.restoreInvariant();
             return a;
         }

         /// <summary>
         /// Get the characteristic vector <c>X(x, V)</c>
         /// where V is <c>Substring(pos, end - pos)</c>.
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal virtual int GetVector(int x, int pos, int end)
         {
             int vector = 0;
             for (int i = pos; i < end; i++)
             {
                 vector <<= 1;
                 if (word[i] == x)
                 {
                     vector |= 1;
                 }
             }
             return vector;
         }

         /// <summary>
         /// A <see cref="ParametricDescription"/> describes the structure of a Levenshtein DFA for some degree <c>n</c>.
         /// <para/>
         /// There are four components of a parametric description, all parameterized on the length
         /// of the word <c>w</c>:
         /// <list type="number">
         ///     <item><description>The number of states: <see cref="Count"/></description></item>
         ///     <item><description>The set of final states: <see cref="IsAccept(int)"/></description></item>
         ///     <item><description>The transition function: <see cref="Transition(int, int, int)"/></description></item>
         ///     <item><description>Minimal boundary function: <see cref="GetPosition(int)"/></description></item>
         /// </list>
         /// </summary>
         internal abstract class ParametricDescription
         {
             protected readonly int m_w;
             protected readonly int m_n;
             private readonly int[] minErrors;

             internal ParametricDescription(int w, int n, int[] minErrors)
             {
                 this.m_w = w;
                 this.m_n = n;
                 this.minErrors = minErrors;
             }

             /// <summary>
             /// Return the number of states needed to compute a Levenshtein DFA.
             /// <para/>
             /// NOTE: This was size() in Lucene.
             /// </summary>
             internal virtual int Count => minErrors.Length * (m_w + 1);

             /// <summary>
             /// Returns <c>true</c> if the <c>state</c> in any Levenshtein DFA is an accept state (final state).
             /// </summary>
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             internal virtual bool IsAccept(int absState)
             {
                 // decode absState -> state, offset
                 int state = absState / (m_w + 1);
                 int offset = absState % (m_w + 1);
                 if (Debugging.AssertsEnabled) Debugging.Assert(offset >= 0);
                 return m_w - offset + minErrors[state] <= m_n;
             }

             /// <summary>
             /// Returns the position in the input word for a given <c>state</c>.
             /// this is the minimal boundary for the state.
             /// </summary>
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             internal virtual int GetPosition(int absState)
             {
                 return absState % (m_w + 1);
             }

             /// <summary>
             /// Returns the state number for a transition from the given <paramref name="state"/>,
             /// assuming <paramref name="position"/> and characteristic vector <paramref name="vector"/>.
             /// </summary>
             internal abstract int Transition(int state, int position, int vector);

             private static readonly long[] MASKS = new long[] {
                 0x1, 0x3, 0x7, 0xf,
                 0x1f, 0x3f, 0x7f, 0xff,
                 0x1ff, 0x3ff, 0x7ff, 0xfff,
                 0x1fff, 0x3fff, 0x7fff, 0xffff,
                 0x1ffff, 0x3ffff, 0x7ffff, 0xfffff,
                 0x1fffff, 0x3fffff, 0x7fffff, 0xffffff,
                 0x1ffffff, 0x3ffffff, 0x7ffffff, 0xfffffff,
                 0x1fffffff, 0x3fffffff, 0x7fffffffL, 0xffffffffL,
                 0x1ffffffffL, 0x3ffffffffL, 0x7ffffffffL, 0xfffffffffL,
                 0x1fffffffffL, 0x3fffffffffL, 0x7fffffffffL, 0xffffffffffL,
                 0x1ffffffffffL, 0x3ffffffffffL, 0x7ffffffffffL, 0xfffffffffffL,
                 0x1fffffffffffL, 0x3fffffffffffL, 0x7fffffffffffL, 0xffffffffffffL,
                 0x1ffffffffffffL, 0x3ffffffffffffL, 0x7ffffffffffffL, 0xfffffffffffffL,
                 0x1fffffffffffffL, 0x3fffffffffffffL, 0x7fffffffffffffL, 0xffffffffffffffL,
                 0x1ffffffffffffffL, 0x3ffffffffffffffL, 0x7ffffffffffffffL, 0xfffffffffffffffL,
                 0x1fffffffffffffffL, 0x3fffffffffffffffL, 0x7fffffffffffffffL
             };

             protected internal virtual int Unpack(long[] data, int index, int bitsPerValue)
             {
                 long bitLoc = bitsPerValue * index;
                 int dataLoc = (int)(bitLoc >> 6);
                 int bitStart = (int)(bitLoc & 63);
                 if (bitStart + bitsPerValue <= 64)
                 {
                     // not split
                     return (int)((data[dataLoc] >> bitStart) & MASKS[bitsPerValue - 1]);
                 }
                 else
                 {
                     // split
                     int part = 64 - bitStart;
                     return (int)(((data[dataLoc] >> bitStart) & MASKS[part - 1]) + ((data[1 + dataLoc] & MASKS[bitsPerValue - part - 1]) << part));
                 }
             }
         }
     }
 }
	using J2N;
	using Lucene.Net.Diagnostics;
	using System;
	using System.Collections.Generic;
	using System.Runtime.CompilerServices;
	using JCG = J2N.Collections.Generic;

	namespace Lucene.Net.Util.Automaton
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Class to construct DFAs that match a word within some edit distance.
	/// <para/>
	/// Implements the algorithm described in:
	/// Schulz and Mihov: Fast String Correction with Levenshtein Automata
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	public class LevenshteinAutomata
	{
	/// <summary>
	/// @lucene.internal </summary>
	public const int MAXIMUM_SUPPORTED_DISTANCE = 2;

	/* input word */
	internal readonly int[] word;
	/* the automata alphabet. */
	internal readonly int[] alphabet;
	/* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
	internal readonly int alphaMax;

	/* the ranges outside of alphabet */
	internal readonly int[] rangeLower;
	internal readonly int[] rangeUpper;
	internal int numRanges = 0;

	internal ParametricDescription[] descriptions;

	/// <summary>
	/// Create a new <see cref="LevenshteinAutomata"/> for some <paramref name="input"/> string.
	/// Optionally count transpositions as a primitive edit.
	/// </summary>
	public LevenshteinAutomata(string input, bool withTranspositions)
	: this(CodePoints(input), Character.MaxCodePoint, withTranspositions)
	{
	}

	/// <summary>
	/// Expert: specify a custom maximum possible symbol
	/// (alphaMax); default is <see cref="Character.MaxCodePoint"/>.
	/// </summary>
	public LevenshteinAutomata(int[] word, int alphaMax, bool withTranspositions)
	{
	this.word = word;
	this.alphaMax = alphaMax;

	// calculate the alphabet
	ISet<int> set = new JCG.SortedSet<int>();
	for (int i = 0; i < word.Length; i++)
	{
	int v = word[i];
	if (v > alphaMax)
	{
	throw new ArgumentException("alphaMax exceeded by symbol " + v + " in word");
	}
	set.Add(v);
	}
	alphabet = new int[set.Count];
	using (IEnumerator<int> iterator = set.GetEnumerator())
	{
	for (int i = 0; i < alphabet.Length; i++)
	{
	iterator.MoveNext();
	alphabet[i] = iterator.Current;
	}
	}

	rangeLower = new int[alphabet.Length + 2];
	rangeUpper = new int[alphabet.Length + 2];
	// calculate the unicode range intervals that exclude the alphabet
	// these are the ranges for all unicode characters not in the alphabet
	int lower = 0;
	for (int i = 0; i < alphabet.Length; i++)
	{
	int higher = alphabet[i];
	if (higher > lower)
	{
	rangeLower[numRanges] = lower;
	rangeUpper[numRanges] = higher - 1;
	numRanges++;
	}
	lower = higher + 1;
	}
	/* add the final endpoint */
	if (lower <= alphaMax)
	{
	rangeLower[numRanges] = lower;
	rangeUpper[numRanges] = alphaMax;
	numRanges++;
	}

	descriptions = new ParametricDescription[] {
	null,
	withTranspositions ? (ParametricDescription)new Lev1TParametricDescription(word.Length) : new Lev1ParametricDescription(word.Length),
	withTranspositions ? (ParametricDescription)new Lev2TParametricDescription(word.Length) : new Lev2ParametricDescription(word.Length)
	};
	}

	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	private static int[] CodePoints(string input)
	{
	int length = Character.CodePointCount(input, 0, input.Length);
	int[] word = new int[length];
	int cp; // LUCENENET: Removed unnecessary assignment
	for (int i = 0, j = 0; i < input.Length; i += Character.CharCount(cp))
	{
	word[j++] = cp = Character.CodePointAt(input, i);
	}
	return word;
	}

	/// <summary>
	/// Compute a DFA that accepts all strings within an edit distance of <paramref name="n"/>.
	/// <para>
	/// All automata have the following properties:
	/// <list type="bullet">
	/// <item><description>They are deterministic (DFA).</description></item>
	/// <item><description>There are no transitions to dead states.</description></item>
	/// <item><description>They are not minimal (some transitions could be combined).</description></item>
	/// </list>
	/// </para>
	/// </summary>
	public virtual Automaton ToAutomaton(int n)
	{
	if (n == 0)
	{
	return BasicAutomata.MakeString(word, 0, word.Length);
	}

	if (n >= descriptions.Length)
	{
	return null;
	}

	int range = 2 * n + 1;
	ParametricDescription description = descriptions[n];
	// the number of states is based on the length of the word and n
	State[] states = new State[description.Count];
	// create all states, and mark as accept states if appropriate
	for (int i = 0; i < states.Length; i++)
	{
	states[i] = new State
	{
	number = i,
	accept = description.IsAccept(i)
	};
	}
	// create transitions from state to state
	for (int k = 0; k < states.Length; k++)
	{
	int xpos = description.GetPosition(k);
	if (xpos < 0)
	{
	continue;
	}
	int end = xpos + Math.Min(word.Length - xpos, range);

	for (int x = 0; x < alphabet.Length; x++)
	{
	int ch = alphabet[x];
	// get the characteristic vector at this position wrt ch
	int cvec = GetVector(ch, xpos, end);
	int dest = description.Transition(k, xpos, cvec);
	if (dest >= 0)
	{
	states[k].AddTransition(new Transition(ch, states[dest]));
	}
	}
	// add transitions for all other chars in unicode
	// by definition, their characteristic vectors are always 0,
	// because they do not exist in the input string.
	int dest_ = description.Transition(k, xpos, 0); // by definition
	if (dest_ >= 0)
	{
	for (int r = 0; r < numRanges; r++)
	{
	states[k].AddTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest_]));
	}
	}
	}

	Automaton a = new Automaton(states[0])
	{
	deterministic = true
	};
	// we create some useless unconnected states, and its a net-win overall to remove these,
	// as well as to combine any adjacent transitions (it makes later algorithms more efficient).
	// so, while we could set our numberedStates here, its actually best not to, and instead to
	// force a traversal in reduce, pruning the unconnected states while we combine adjacent transitions.
	//a.setNumberedStates(states);
	a.Reduce();
	// we need not trim transitions to dead states, as they are not created.
	//a.restoreInvariant();
	return a;
	}

	/// <summary>
	/// Get the characteristic vector <c>X(x, V)</c>
	/// where V is <c>Substring(pos, end - pos)</c>.
	/// </summary>
	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	internal virtual int GetVector(int x, int pos, int end)
	{
	int vector = 0;
	for (int i = pos; i < end; i++)
	{
	vector <<= 1;
	if (word[i] == x)
	{
	vector \|= 1;
	}
	}
	return vector;
	}

	/// <summary>
	/// A <see cref="ParametricDescription"/> describes the structure of a Levenshtein DFA for some degree <c>n</c>.
	/// <para/>
	/// There are four components of a parametric description, all parameterized on the length
	/// of the word <c>w</c>:
	/// <list type="number">
	/// <item><description>The number of states: <see cref="Count"/></description></item>
	/// <item><description>The set of final states: <see cref="IsAccept(int)"/></description></item>
	/// <item><description>The transition function: <see cref="Transition(int, int, int)"/></description></item>
	/// <item><description>Minimal boundary function: <see cref="GetPosition(int)"/></description></item>
	/// </list>
	/// </summary>
	internal abstract class ParametricDescription
	{
	protected readonly int m_w;
	protected readonly int m_n;
	private readonly int[] minErrors;

	internal ParametricDescription(int w, int n, int[] minErrors)
	{
	this.m_w = w;
	this.m_n = n;
	this.minErrors = minErrors;
	}

	/// <summary>
	/// Return the number of states needed to compute a Levenshtein DFA.
	/// <para/>
	/// NOTE: This was size() in Lucene.
	/// </summary>
	internal virtual int Count => minErrors.Length * (m_w + 1);

	/// <summary>
	/// Returns <c>true</c> if the <c>state</c> in any Levenshtein DFA is an accept state (final state).
	/// </summary>
	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	internal virtual bool IsAccept(int absState)
	{
	// decode absState -> state, offset
	int state = absState / (m_w + 1);
	int offset = absState % (m_w + 1);
	if (Debugging.AssertsEnabled) Debugging.Assert(offset >= 0);
	return m_w - offset + minErrors[state] <= m_n;
	}

	/// <summary>
	/// Returns the position in the input word for a given <c>state</c>.
	/// this is the minimal boundary for the state.
	/// </summary>
	[MethodImpl(MethodImplOptions.AggressiveInlining)]
	internal virtual int GetPosition(int absState)
	{
	return absState % (m_w + 1);
	}

	/// <summary>
	/// Returns the state number for a transition from the given <paramref name="state"/>,
	/// assuming <paramref name="position"/> and characteristic vector <paramref name="vector"/>.
	/// </summary>
	internal abstract int Transition(int state, int position, int vector);

	private static readonly long[] MASKS = new long[] {
	0x1, 0x3, 0x7, 0xf,
	0x1f, 0x3f, 0x7f, 0xff,
	0x1ff, 0x3ff, 0x7ff, 0xfff,
	0x1fff, 0x3fff, 0x7fff, 0xffff,
	0x1ffff, 0x3ffff, 0x7ffff, 0xfffff,
	0x1fffff, 0x3fffff, 0x7fffff, 0xffffff,
	0x1ffffff, 0x3ffffff, 0x7ffffff, 0xfffffff,
	0x1fffffff, 0x3fffffff, 0x7fffffffL, 0xffffffffL,
	0x1ffffffffL, 0x3ffffffffL, 0x7ffffffffL, 0xfffffffffL,
	0x1fffffffffL, 0x3fffffffffL, 0x7fffffffffL, 0xffffffffffL,
	0x1ffffffffffL, 0x3ffffffffffL, 0x7ffffffffffL, 0xfffffffffffL,
	0x1fffffffffffL, 0x3fffffffffffL, 0x7fffffffffffL, 0xffffffffffffL,
	0x1ffffffffffffL, 0x3ffffffffffffL, 0x7ffffffffffffL, 0xfffffffffffffL,
	0x1fffffffffffffL, 0x3fffffffffffffL, 0x7fffffffffffffL, 0xffffffffffffffL,
	0x1ffffffffffffffL, 0x3ffffffffffffffL, 0x7ffffffffffffffL, 0xfffffffffffffffL,
	0x1fffffffffffffffL, 0x3fffffffffffffffL, 0x7fffffffffffffffL
	};

	protected internal virtual int Unpack(long[] data, int index, int bitsPerValue)
	{
	long bitLoc = bitsPerValue * index;
	int dataLoc = (int)(bitLoc >> 6);
	int bitStart = (int)(bitLoc & 63);
	if (bitStart + bitsPerValue <= 64)
	{
	// not split
	return (int)((data[dataLoc] >> bitStart) & MASKS[bitsPerValue - 1]);
	}
	else
	{
	// split
	int part = 64 - bitStart;
	return (int)(((data[dataLoc] >> bitStart) & MASKS[part - 1]) + ((data[1 + dataLoc] & MASKS[bitsPerValue - part - 1]) << part));
	}
	}
	}
	}
	}