blob: e95abbab144b5fab88ac3571e7420f9eb9aa0ab6 [file] [log] [blame]
using J2N;
using J2N.Runtime.CompilerServices;
using J2N.Text;
using Lucene.Net.Diagnostics;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using Arrays = Lucene.Net.Support.Arrays;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Util.Automaton
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Builds a minimal, deterministic <see cref="Automaton"/> that accepts a set of
/// strings. The algorithm requires sorted input data, but is very fast
/// (nearly linear with the input size).
/// </summary>
/// <seealso cref="Build(ICollection{BytesRef})"/>
/// <seealso cref="BasicAutomata.MakeStringUnion(ICollection{BytesRef})"/>
internal sealed class DaciukMihovAutomatonBuilder
{
/// <summary>
/// DFSA state with <see cref="char"/> labels on transitions.
/// </summary>
public sealed class State // LUCENENET NOTE: Made public because it is returned from a public member
{
/// <summary>
/// An empty set of labels. </summary>
private static readonly int[] NO_LABELS = Arrays.Empty<int>();
/// <summary>
/// An empty set of states. </summary>
private static readonly State[] NO_STATES = Arrays.Empty<State>();
/// <summary>
/// Labels of outgoing transitions. Indexed identically to <see cref="states"/>.
/// Labels must be sorted lexicographically.
/// </summary>
internal int[] labels = NO_LABELS;
/// <summary>
/// States reachable from outgoing transitions. Indexed identically to
/// <see cref="labels"/>.
/// </summary>
internal State[] states = NO_STATES;
/// <summary>
/// <c>true</c> if this state corresponds to the end of at least one
/// input sequence.
/// </summary>
internal bool is_final;
/// <summary>
/// Returns the target state of a transition leaving this state and labeled
/// with <paramref name="label"/>. If no such transition exists, returns
/// <c>null</c>.
/// </summary>
internal State GetState(int label)
{
int index = Array.BinarySearch(labels, label);
return index >= 0 ? states[index] : null;
}
/// <summary>
/// Two states are equal if:
/// <list type="bullet">
/// <item><description>They have an identical number of outgoing transitions, labeled with
/// the same labels.</description></item>
/// <item><description>Corresponding outgoing transitions lead to the same states (to states
/// with an identical right-language).</description></item>
/// </list>
/// </summary>
public override bool Equals(object obj)
{
State other = (State)obj;
return is_final == other.is_final && Arrays.Equals(this.labels, other.labels) && ReferenceEquals(this.states, other.states);
}
/// <summary>
/// Compute the hash code of the <i>current</i> status of this state.
/// </summary>
public override int GetHashCode()
{
int hash = is_final ? 1 : 0;
hash ^= hash * 31 + this.labels.Length;
foreach (int c in this.labels)
{
hash ^= hash * 31 + c;
}
/*
* Compare the right-language of this state using reference-identity of
* outgoing states. this is possible because states are interned (stored
* in registry) and traversed in post-order, so any outgoing transitions
* are already interned.
*/
foreach (State s in this.states)
{
hash ^= s.GetHashCode();
}
return hash;
}
/// <summary>
/// Return <c>true</c> if this state has any children (outgoing
/// transitions).
/// </summary>
internal bool HasChildren => labels.Length > 0;
/// <summary>
/// Create a new outgoing transition labeled <paramref name="label"/> and return
/// the newly created target state for this transition.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal State NewState(int label)
{
if (Debugging.AssertsEnabled) Debugging.Assert(Array.BinarySearch(labels, label) < 0, "State already has transition labeled: {0}", label);
labels = Arrays.CopyOf(labels, labels.Length + 1);
states = Arrays.CopyOf(states, states.Length + 1);
labels[labels.Length - 1] = label;
return states[states.Length - 1] = new State();
}
/// <summary>
/// Return the most recent transitions's target state.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal State LastChild() // LUCENENET NOTE: Kept this a method because there is another overload
{
if (Debugging.AssertsEnabled) Debugging.Assert(HasChildren, "No outgoing transitions.");
return states[states.Length - 1];
}
/// <summary>
/// Return the associated state if the most recent transition is labeled with
/// <paramref name="label"/>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal State LastChild(int label)
{
int index = labels.Length - 1;
State s = null;
if (index >= 0 && labels[index] == label)
{
s = states[index];
}
if (Debugging.AssertsEnabled) Debugging.Assert(s == GetState(label));
return s;
}
/// <summary>
/// Replace the last added outgoing transition's target state with the given
/// <paramref name="state"/>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void ReplaceLastChild(State state)
{
if (Debugging.AssertsEnabled) Debugging.Assert(HasChildren, "No outgoing transitions.");
states[states.Length - 1] = state;
}
/// <summary>
/// Compare two lists of objects for reference-equality.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ReferenceEquals(object[] a1, object[] a2)
{
if (a1.Length != a2.Length)
{
return false;
}
for (int i = 0; i < a1.Length; i++)
{
if (a1[i] != a2[i])
{
return false;
}
}
return true;
}
}
/// <summary>
/// A "registry" for state interning.
/// </summary>
private Dictionary<State, State> stateRegistry = new Dictionary<State, State>();
/// <summary>
/// Root automaton state.
/// </summary>
private readonly State root = new State();
/// <summary>
/// Previous sequence added to the automaton in <see cref="Add(CharsRef)"/>.
/// </summary>
private CharsRef previous;
/// <summary>
/// A comparer used for enforcing sorted UTF8 order, used in assertions only.
/// </summary>
private static readonly IComparer<CharsRef> comparer =
#pragma warning disable 612, 618
CharsRef.UTF16SortedAsUTF8Comparer;
#pragma warning restore 612, 618
/// <summary>
/// Add another character sequence to this automaton. The sequence must be
/// lexicographically larger or equal compared to any previous sequences added
/// to this automaton (the input must be sorted).
/// </summary>
public void Add(CharsRef current)
{
if (Debugging.AssertsEnabled)
{
Debugging.Assert(stateRegistry != null, "Automaton already built.");
Debugging.Assert(previous == null || comparer.Compare(previous, current) <= 0, "Input must be in sorted UTF-8 order: {0} >= {1}", previous, current);
Debugging.Assert(SetPrevious(current));
}
// Descend in the automaton (find matching prefix).
int pos = 0, max = current.Length;
State next, state = root;
while (pos < max && (next = state.LastChild(Character.CodePointAt(current, pos))) != null)
{
state = next;
// todo, optimize me
pos += Character.CharCount(Character.CodePointAt(current, pos));
}
if (state.HasChildren)
{
ReplaceOrRegister(state);
}
AddSuffix(state, current, pos);
}
/// <summary>
/// Finalize the automaton and return the root state. No more strings can be
/// added to the builder after this call.
/// </summary>
/// <returns> Root automaton state. </returns>
public State Complete()
{
if (this.stateRegistry == null)
{
throw new InvalidOperationException();
}
if (root.HasChildren)
{
ReplaceOrRegister(root);
}
stateRegistry = null;
return root;
}
/// <summary>
/// Internal recursive traversal for conversion.
/// </summary>
/// <param name="s"></param>
/// <param name="visited">Must use a dictionary with <see cref="IdentityEqualityComparer{State}.Default"/> passed into its constructor.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Util.Automaton.State Convert(State s, IDictionary<State, Util.Automaton.State> visited)
{
if (visited.TryGetValue(s, out Util.Automaton.State converted) && converted != null)
{
return converted;
}
converted = new Util.Automaton.State
{
Accept = s.is_final
};
visited[s] = converted;
int i = 0;
int[] labels = s.labels;
foreach (DaciukMihovAutomatonBuilder.State target in s.states)
{
converted.AddTransition(new Transition(labels[i++], Convert(target, visited)));
}
return converted;
}
/// <summary>
/// Build a minimal, deterministic automaton from a sorted list of <see cref="BytesRef"/> representing
/// strings in UTF-8. These strings must be binary-sorted.
/// </summary>
public static Automaton Build(ICollection<BytesRef> input)
{
DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
CharsRef scratch = new CharsRef();
foreach (BytesRef b in input)
{
UnicodeUtil.UTF8toUTF16(b, scratch);
builder.Add(scratch);
}
return new Automaton
{
initial = Convert(builder.Complete(), new JCG.Dictionary<State, Lucene.Net.Util.Automaton.State>(IdentityEqualityComparer<State>.Default)),
deterministic = true
};
}
/// <summary>
/// Copy <paramref name="current"/> into an internal buffer.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool SetPrevious(CharsRef current)
{
// don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277
// still, called only from assert
previous = CharsRef.DeepCopyOf(current);
return true;
}
/// <summary>
/// Replace last child of <paramref name="state"/> with an already registered state
/// or stateRegistry the last child state.
/// </summary>
private void ReplaceOrRegister(State state)
{
State child = state.LastChild();
if (child.HasChildren)
{
ReplaceOrRegister(child);
}
if (stateRegistry.TryGetValue(child, out State registered))
{
state.ReplaceLastChild(registered);
}
else
{
stateRegistry[child] = child;
}
}
/// <summary>
/// Add a suffix of <paramref name="current"/> starting at <paramref name="fromIndex"/>
/// (inclusive) to state <paramref name="state"/>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void AddSuffix(State state, ICharSequence current, int fromIndex) // LUCENENET: CA1822: Mark members as static
{
int len = current.Length;
while (fromIndex < len)
{
int cp = Character.CodePointAt(current, fromIndex);
state = state.NewState(cp);
fromIndex += Character.CharCount(cp);
}
state.is_final = true;
}
}
}