blob: 9ecd071fcdb2c84992f75ee340c331e3a6394070 [file] [log] [blame]
using J2N;
using Lucene.Net.Index;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Search
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using Attribute = Lucene.Net.Util.Attribute;
using AttributeSource = Lucene.Net.Util.AttributeSource;
using Automaton = Lucene.Net.Util.Automaton.Automaton;
using BasicAutomata = Lucene.Net.Util.Automaton.BasicAutomata;
using BasicOperations = Lucene.Net.Util.Automaton.BasicOperations;
using IBits = Lucene.Net.Util.IBits;
using ByteRunAutomaton = Lucene.Net.Util.Automaton.ByteRunAutomaton;
using BytesRef = Lucene.Net.Util.BytesRef;
using CompiledAutomaton = Lucene.Net.Util.Automaton.CompiledAutomaton;
using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum;
using DocsEnum = Lucene.Net.Index.DocsEnum;
using FilteredTermsEnum = Lucene.Net.Index.FilteredTermsEnum;
using LevenshteinAutomata = Lucene.Net.Util.Automaton.LevenshteinAutomata;
using Term = Lucene.Net.Index.Term;
using Terms = Lucene.Net.Index.Terms;
using TermsEnum = Lucene.Net.Index.TermsEnum;
using TermState = Lucene.Net.Index.TermState;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
/// <summary>
/// Subclass of <see cref="TermsEnum"/> for enumerating all terms that are similar
/// to the specified filter term.
///
/// <para>Term enumerations are always ordered by
/// <see cref="Comparer"/>. Each term in the enumeration is
/// greater than all that precede it.</para>
/// </summary>
public class FuzzyTermsEnum : TermsEnum
{
private void InitializeInstanceFields()
{
boostAtt = Attributes.AddAttribute<IBoostAttribute>();
}
private TermsEnum actualEnum;
private IBoostAttribute actualBoostAtt;
private IBoostAttribute boostAtt;
private readonly IMaxNonCompetitiveBoostAttribute maxBoostAtt;
private readonly ILevenshteinAutomataAttribute dfaAtt;
private float bottom;
private BytesRef bottomTerm;
// TODO: chicken-and-egg
private readonly IComparer<BytesRef> termComparer = BytesRef.UTF8SortedAsUnicodeComparer;
protected readonly float m_minSimilarity;
protected readonly float m_scaleFactor;
protected readonly int m_termLength;
protected int m_maxEdits;
protected readonly bool m_raw;
protected readonly Terms m_terms;
private readonly Term term;
protected readonly int[] m_termText;
protected readonly int m_realPrefixLength;
private readonly bool transpositions;
/// <summary>
/// Constructor for enumeration of all terms from specified <c>reader</c> which share a prefix of
/// length <paramref name="prefixLength"/> with <paramref name="term"/> and which have a fuzzy similarity &gt;
/// <paramref name="minSimilarity"/>.
/// <para/>
/// After calling the constructor the enumeration is already pointing to the first
/// valid term if such a term exists.
/// </summary>
/// <param name="terms"> Delivers terms. </param>
/// <param name="atts"> <see cref="AttributeSource"/> created by the rewrite method of <see cref="MultiTermQuery"/>
/// thats contains information about competitive boosts during rewrite. It is also used
/// to cache DFAs between segment transitions. </param>
/// <param name="term"> Pattern term. </param>
/// <param name="minSimilarity"> Minimum required similarity for terms from the reader. Pass an integer value
/// representing edit distance. Passing a fraction is deprecated. </param>
/// <param name="prefixLength"> Length of required common prefix. Default value is 0. </param>
/// <param name="transpositions"> Transpositions </param>
/// <exception cref="System.IO.IOException"> if there is a low-level IO error </exception>
public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, float minSimilarity, int prefixLength, bool transpositions)
{
InitializeInstanceFields();
if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity)
{
throw new ArgumentException("fractional edit distances are not allowed");
}
if (minSimilarity < 0.0f)
{
throw new ArgumentException("minimumSimilarity cannot be less than 0");
}
if (prefixLength < 0)
{
throw new ArgumentException("prefixLength cannot be less than 0");
}
this.m_terms = terms;
this.term = term;
// convert the string into a utf32 int[] representation for fast comparisons
string utf16 = term.Text();
this.m_termText = new int[utf16.CodePointCount(0, utf16.Length)];
for (int cp, i = 0, j = 0; i < utf16.Length; i += Character.CharCount(cp))
{
m_termText[j++] = cp = utf16.CodePointAt(i);
}
this.m_termLength = m_termText.Length;
this.dfaAtt = atts.AddAttribute<ILevenshteinAutomataAttribute>();
//The prefix could be longer than the word.
//It's kind of silly though. It means we must match the entire word.
this.m_realPrefixLength = prefixLength > m_termLength ? m_termLength : prefixLength;
// if minSimilarity >= 1, we treat it as number of edits
if (minSimilarity >= 1f)
{
this.m_minSimilarity = 0; // just driven by number of edits
m_maxEdits = (int)minSimilarity;
m_raw = true;
}
else
{
this.m_minSimilarity = minSimilarity;
// calculate the maximum k edits for this similarity
m_maxEdits = InitialMaxDistance(this.m_minSimilarity, m_termLength);
m_raw = false;
}
if (transpositions && m_maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
{
throw new System.NotSupportedException("with transpositions enabled, distances > " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " are not supported ");
}
this.transpositions = transpositions;
this.m_scaleFactor = 1.0f / (1.0f - this.m_minSimilarity);
this.maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
bottom = maxBoostAtt.MaxNonCompetitiveBoost;
bottomTerm = maxBoostAtt.CompetitiveTerm;
BottomChanged(null, true);
}
/// <summary>
/// Return an automata-based enum for matching up to <paramref name="editDistance"/> from
/// <paramref name="lastTerm"/>, if possible
/// </summary>
protected virtual TermsEnum GetAutomatonEnum(int editDistance, BytesRef lastTerm)
{
IList<CompiledAutomaton> runAutomata = InitAutomata(editDistance);
if (editDistance < runAutomata.Count)
{
//if (BlockTreeTermsWriter.DEBUG) System.out.println("FuzzyTE.getAEnum: ed=" + editDistance + " lastTerm=" + (lastTerm==null ? "null" : lastTerm.utf8ToString()));
CompiledAutomaton compiled = runAutomata[editDistance];
return new AutomatonFuzzyTermsEnum(this, m_terms.Intersect(compiled, lastTerm == null ? null : compiled.Floor(lastTerm, new BytesRef())), runAutomata.SubList(0, editDistance + 1).ToArray(/*new CompiledAutomaton[editDistance + 1]*/));
}
else
{
return null;
}
}
/// <summary>
/// Initialize levenshtein DFAs up to maxDistance, if possible </summary>
private IList<CompiledAutomaton> InitAutomata(int maxDistance)
{
IList<CompiledAutomaton> runAutomata = dfaAtt.Automata;
//System.out.println("cached automata size: " + runAutomata.size());
if (runAutomata.Count <= maxDistance && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
{
LevenshteinAutomata builder = new LevenshteinAutomata(UnicodeUtil.NewString(m_termText, m_realPrefixLength, m_termText.Length - m_realPrefixLength), transpositions);
for (int i = runAutomata.Count; i <= maxDistance; i++)
{
Automaton a = builder.ToAutomaton(i);
//System.out.println("compute automaton n=" + i);
// constant prefix
if (m_realPrefixLength > 0)
{
Automaton prefix = BasicAutomata.MakeString(UnicodeUtil.NewString(m_termText, 0, m_realPrefixLength));
a = BasicOperations.Concatenate(prefix, a);
}
runAutomata.Add(new CompiledAutomaton(a, true, false));
}
}
return runAutomata;
}
/// <summary>
/// Swap in a new actual enum to proxy to </summary>
protected virtual void SetEnum(TermsEnum actualEnum)
{
this.actualEnum = actualEnum;
this.actualBoostAtt = actualEnum.Attributes.AddAttribute<IBoostAttribute>();
}
/// <summary>
/// Fired when the max non-competitive boost has changed. This is the hook to
/// swap in a smarter actualEnum
/// </summary>
private void BottomChanged(BytesRef lastTerm, bool init)
{
int oldMaxEdits = m_maxEdits;
// true if the last term encountered is lexicographically equal or after the bottom term in the PQ
bool termAfter = bottomTerm == null || (lastTerm != null && termComparer.Compare(lastTerm, bottomTerm) >= 0);
// as long as the max non-competitive boost is >= the max boost
// for some edit distance, keep dropping the max edit distance.
while (m_maxEdits > 0 && (termAfter ? bottom >= CalculateMaxBoost(m_maxEdits) : bottom > CalculateMaxBoost(m_maxEdits)))
{
m_maxEdits--;
}
if (oldMaxEdits != m_maxEdits || init) // the maximum n has changed
{
MaxEditDistanceChanged(lastTerm, m_maxEdits, init);
}
}
protected virtual void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init)
{
TermsEnum newEnum = GetAutomatonEnum(maxEdits, lastTerm);
// instead of assert, we do a hard check in case someone uses our enum directly
// assert newEnum != null;
if (newEnum == null)
{
Debug.Assert(maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
throw new System.ArgumentException("maxEdits cannot be > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE");
}
SetEnum(newEnum);
}
// for some raw min similarity and input term length, the maximum # of edits
private int InitialMaxDistance(float minimumSimilarity, int termLen)
{
return (int)((1D - minimumSimilarity) * termLen);
}
// for some number of edits, the maximum possible scaled boost
private float CalculateMaxBoost(int nEdits)
{
float similarity = 1.0f - ((float)nEdits / (float)(m_termLength));
return (similarity - m_minSimilarity) * m_scaleFactor;
}
private BytesRef queuedBottom = null;
public override BytesRef Next()
{
if (queuedBottom != null)
{
BottomChanged(queuedBottom, false);
queuedBottom = null;
}
BytesRef term = actualEnum.Next();
boostAtt.Boost = actualBoostAtt.Boost;
float bottom = maxBoostAtt.MaxNonCompetitiveBoost;
BytesRef bottomTerm = maxBoostAtt.CompetitiveTerm;
if (term != null && (bottom != this.bottom || bottomTerm != this.bottomTerm))
{
this.bottom = bottom;
this.bottomTerm = bottomTerm;
// clone the term before potentially doing something with it
// this is a rare but wonderful occurrence anyway
queuedBottom = BytesRef.DeepCopyOf(term);
}
return term;
}
// proxy all other enum calls to the actual enum
public override int DocFreq
{
get { return actualEnum.DocFreq; }
}
public override long TotalTermFreq
{
get { return actualEnum.TotalTermFreq; }
}
public override DocsEnum Docs(IBits liveDocs, DocsEnum reuse, DocsFlags flags)
{
return actualEnum.Docs(liveDocs, reuse, flags);
}
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags)
{
return actualEnum.DocsAndPositions(liveDocs, reuse, flags);
}
public override void SeekExact(BytesRef term, TermState state)
{
actualEnum.SeekExact(term, state);
}
public override TermState GetTermState()
{
return actualEnum.GetTermState();
}
public override IComparer<BytesRef> Comparer => actualEnum.Comparer;
public override long Ord => actualEnum.Ord;
public override bool SeekExact(BytesRef text)
{
return actualEnum.SeekExact(text);
}
public override SeekStatus SeekCeil(BytesRef text)
{
return actualEnum.SeekCeil(text);
}
public override void SeekExact(long ord)
{
actualEnum.SeekExact(ord);
}
public override BytesRef Term => actualEnum.Term;
/// <summary>
/// Implement fuzzy enumeration with <see cref="Terms.Intersect(CompiledAutomaton, BytesRef)"/>.
/// <para/>
/// This is the fastest method as opposed to LinearFuzzyTermsEnum:
/// as enumeration is logarithmic to the number of terms (instead of linear)
/// and comparison is linear to length of the term (rather than quadratic)
/// </summary>
private class AutomatonFuzzyTermsEnum : FilteredTermsEnum
{
internal virtual void InitializeInstanceFields()
{
boostAtt = Attributes.AddAttribute<IBoostAttribute>();
}
private readonly FuzzyTermsEnum outerInstance;
private readonly ByteRunAutomaton[] matchers;
private readonly BytesRef termRef;
private IBoostAttribute boostAtt;
public AutomatonFuzzyTermsEnum(FuzzyTermsEnum outerInstance, TermsEnum tenum, CompiledAutomaton[] compiled)
: base(tenum, false)
{
this.outerInstance = outerInstance;
InitializeInstanceFields();
this.matchers = new ByteRunAutomaton[compiled.Length];
for (int i = 0; i < compiled.Length; i++)
{
this.matchers[i] = compiled[i].RunAutomaton;
}
termRef = new BytesRef(outerInstance.term.Text());
}
/// <summary>
/// Finds the smallest Lev(n) DFA that accepts the term. </summary>
protected override AcceptStatus Accept(BytesRef term)
{
//System.out.println("AFTE.accept term=" + term);
int ed = matchers.Length - 1;
// we are wrapping either an intersect() TermsEnum or an AutomatonTermsENum,
// so we know the outer DFA always matches.
// now compute exact edit distance
while (ed > 0)
{
if (Matches(term, ed - 1))
{
ed--;
}
else
{
break;
}
}
//System.out.println("CHECK term=" + term.utf8ToString() + " ed=" + ed);
// scale to a boost and return (if similarity > minSimilarity)
if (ed == 0) // exact match
{
boostAtt.Boost = 1.0F;
//System.out.println(" yes");
return AcceptStatus.YES;
}
else
{
int codePointCount = UnicodeUtil.CodePointCount(term);
float similarity = 1.0f - ((float)ed / (float)(Math.Min(codePointCount, outerInstance.m_termLength)));
if (similarity > outerInstance.m_minSimilarity)
{
boostAtt.Boost = (similarity - outerInstance.m_minSimilarity) * outerInstance.m_scaleFactor;
//System.out.println(" yes");
return AcceptStatus.YES;
}
else
{
return AcceptStatus.NO;
}
}
}
/// <summary>
/// Returns <c>true</c> if <paramref name="term"/> is within <paramref name="k"/> edits of the query term </summary>
internal bool Matches(BytesRef term, int k)
{
return k == 0 ? term.Equals(termRef) : matchers[k].Run(term.Bytes, term.Offset, term.Length);
}
}
/// <summary>
/// @lucene.internal </summary>
public virtual float MinSimilarity => m_minSimilarity;
/// <summary>
/// @lucene.internal </summary>
public virtual float ScaleFactor => m_scaleFactor;
/// <summary>
/// Reuses compiled automata across different segments,
/// because they are independent of the index
/// <para/>
/// @lucene.internal
/// </summary>
public interface ILevenshteinAutomataAttribute : IAttribute
{
IList<CompiledAutomaton> Automata { get; }
}
/// <summary>
/// Stores compiled automata as a list (indexed by edit distance)
/// <para/>
/// @lucene.internal
/// </summary>
public sealed class LevenshteinAutomataAttribute : Attribute, ILevenshteinAutomataAttribute
{
// LUCENENET NOTE: Must use JCG.List for Equals and GetHashCode()
private readonly IList<CompiledAutomaton> automata = new JCG.List<CompiledAutomaton>();
public IList<CompiledAutomaton> Automata
{
get { return automata; }
}
public override void Clear()
{
automata.Clear();
}
public override int GetHashCode()
{
return automata.GetHashCode();
}
public override bool Equals(object other)
{
if (this == other)
{
return true;
}
if (!(other is LevenshteinAutomataAttribute))
{
return false;
}
return automata.Equals(((LevenshteinAutomataAttribute)other).automata);
}
public override void CopyTo(IAttribute target)
{
IList<CompiledAutomaton> targetAutomata = ((LevenshteinAutomataAttribute)target).Automata;
targetAutomata.Clear();
targetAutomata.AddRange(automata);
}
}
}
}