blob: a216941aa9629eea769640427e29e41cb0a90ce1 [file] [log] [blame]
using J2N;
using Lucene.Net.Analysis;
using Lucene.Net.Util;
using Lucene.Net.Util.Automaton;
using Lucene.Net.Util.Fst;
using System;
using System.Collections.Generic;
namespace Lucene.Net.Search.Suggest.Analyzing
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Implements a fuzzy <see cref="AnalyzingSuggester"/>. The similarity measurement is
/// based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
/// you can explicitly choose classic Levenshtein by passing <c>false</c>
/// for the <see cref="transpositions"/> parameter.
/// <para>
/// At most, this query will match terms up to <see cref="LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE"/>
/// edits. Higher distances are not supported. Note that the
/// fuzzy distance is measured in "byte space" on the bytes
/// returned by the <see cref="TokenStream"/>'s <see cref="Analysis.TokenAttributes.ITermToBytesRefAttribute"/>,
/// usually UTF8. By default
/// the analyzed bytes must be at least 3 <see cref="DEFAULT_MIN_FUZZY_LENGTH"/>
/// bytes before any edits are
/// considered. Furthermore, the first 1 <see cref="DEFAULT_NON_FUZZY_PREFIX"/>
/// byte is not allowed to be
/// edited. We allow up to 1 <see cref="DEFAULT_MAX_EDITS"/>
/// edit.
/// If <see cref="unicodeAware"/> parameter in the constructor is set to true, maxEdits,
/// minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code
/// points (actual letters) instead of bytes.
///
/// </para>
/// <para>
/// NOTE: This suggester does not boost suggestions that
/// required no edits over suggestions that did require
/// edits. This is a known limitation.
///
/// </para>
/// <para>
/// Note: complex query analyzers can have a significant impact on the lookup
/// performance. It's recommended to not use analyzers that drop or inject terms
/// like synonyms to keep the complexity of the prefix intersection low for good
/// lookup performance. At index time, complex analyzers can safely be used.
/// </para>
///
/// @lucene.experimental
/// </summary>
public sealed class FuzzySuggester : AnalyzingSuggester
{
private readonly int maxEdits;
private readonly bool transpositions;
private readonly int nonFuzzyPrefix;
private readonly int minFuzzyLength;
private readonly bool unicodeAware;
/// <summary>
/// Measure <see cref="maxEdits"/>, <see cref="minFuzzyLength"/>,
/// <see cref="transpositions"/>, and <see cref="nonFuzzyPrefix"/>
/// parameters in Unicode code points (actual letters)
/// instead of bytes.
/// </summary>
public const bool DEFAULT_UNICODE_AWARE = false;
/// <summary>
/// The default minimum length of the key passed to <see cref="Lookup"/>
/// before any edits are allowed.
/// </summary>
public const int DEFAULT_MIN_FUZZY_LENGTH = 3;
/// <summary>
/// The default prefix length where edits are not allowed.
/// </summary>
public const int DEFAULT_NON_FUZZY_PREFIX = 1;
/// <summary>
/// The default maximum number of edits for fuzzy
/// suggestions.
/// </summary>
public const int DEFAULT_MAX_EDITS = 1;
/// <summary>
/// The default transposition value passed to <see cref="LevenshteinAutomata"/>
/// </summary>
public const bool DEFAULT_TRANSPOSITIONS = true;
/// <summary>
/// Creates a <see cref="FuzzySuggester"/> instance initialized with default values.
/// </summary>
/// <param name="analyzer"> The <see cref="Analyzer"/> used for this suggester. </param>
public FuzzySuggester(Analyzer analyzer)
: this(analyzer, analyzer)
{
}
/// <summary>
/// Creates a <see cref="FuzzySuggester"/> instance with an index &amp; a query analyzer initialized with default values.
/// </summary>
/// <param name="indexAnalyzer">
/// <see cref="Analyzer"/> that will be used for analyzing suggestions while building the index. </param>
/// <param name="queryAnalyzer">
/// <see cref="Analyzer"/> that will be used for analyzing query text during lookup </param>
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer)
: this(indexAnalyzer, queryAnalyzer, SuggesterOptions.EXACT_FIRST | SuggesterOptions.PRESERVE_SEP, 256, -1, true,
DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, DEFAULT_NON_FUZZY_PREFIX,
DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE)
{
}
/// <summary>
/// Creates a <see cref="FuzzySuggester"/> instance.
/// </summary>
/// <param name="indexAnalyzer"> The <see cref="Analyzer"/> that will be used for
/// analyzing suggestions while building the index. </param>
/// <param name="queryAnalyzer"> The <see cref="Analyzer"/> that will be used for
/// analyzing query text during lookup </param>
/// <param name="options"> see <see cref="SuggesterOptions.EXACT_FIRST"/>, <see cref="SuggesterOptions.PRESERVE_SEP"/> </param>
/// <param name="maxSurfaceFormsPerAnalyzedForm"> Maximum number of
/// surface forms to keep for a single analyzed form.
/// When there are too many surface forms we discard the
/// lowest weighted ones. </param>
/// <param name="maxGraphExpansions"> Maximum number of graph paths
/// to expand from the analyzed form. Set this to -1 for
/// no limit. </param>
/// <param name="preservePositionIncrements"> Whether position holes should appear in the automaton </param>
/// <param name="maxEdits"> must be &gt;= 0 and &lt;= <see cref="LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE"/>. </param>
/// <param name="transpositions"> <c>true</c> if transpositions should be treated as a primitive
/// edit operation. If this is false, comparisons will implement the classic
/// Levenshtein algorithm. </param>
/// <param name="nonFuzzyPrefix"> length of common (non-fuzzy) prefix (see default <see cref="DEFAULT_NON_FUZZY_PREFIX"/> </param>
/// <param name="minFuzzyLength"> minimum length of lookup key before any edits are allowed (see default <see cref="DEFAULT_MIN_FUZZY_LENGTH"/>) </param>
/// <param name="unicodeAware"> operate Unicode code points instead of bytes. </param>
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, SuggesterOptions options,
int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, bool preservePositionIncrements,
int maxEdits, bool transpositions, int nonFuzzyPrefix, int minFuzzyLength, bool unicodeAware)
: base(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements)
{
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
{
throw new System.ArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
}
if (nonFuzzyPrefix < 0)
{
throw new System.ArgumentException("nonFuzzyPrefix must not be >= 0 (got " + nonFuzzyPrefix + ")");
}
if (minFuzzyLength < 0)
{
throw new System.ArgumentException("minFuzzyLength must not be >= 0 (got " + minFuzzyLength + ")");
}
this.maxEdits = maxEdits;
this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength;
this.unicodeAware = unicodeAware;
}
protected internal override IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> GetFullPrefixPaths(
IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths,
Automaton lookupAutomaton,
FST<PairOutputs<long?, BytesRef>.Pair> fst)
{
// TODO: right now there's no penalty for fuzzy/edits,
// ie a completion whose prefix matched exactly what the
// user typed gets no boost over completions that
// required an edit, which get no boost over completions
// requiring two edits. I suspect a multiplicative
// factor is appropriate (eg, say a fuzzy match must be at
// least 2X better weight than the non-fuzzy match to
// "compete") ... in which case I think the wFST needs
// to be log weights or something ...
Automaton levA = ConvertAutomaton(ToLevenshteinAutomata(lookupAutomaton));
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
w.write(levA.toDot());
w.close();
System.out.println("Wrote LevA to out.dot");
*/
return FSTUtil.IntersectPrefixPaths(levA, fst);
}
protected internal override Automaton ConvertAutomaton(Automaton a)
{
if (unicodeAware)
{
Automaton utf8automaton = (new UTF32ToUTF8()).Convert(a);
BasicOperations.Determinize(utf8automaton);
return utf8automaton;
}
else
{
return a;
}
}
internal override TokenStreamToAutomaton GetTokenStreamToAutomaton()
{
var tsta = base.GetTokenStreamToAutomaton();
tsta.UnicodeArcs = unicodeAware;
return tsta;
}
internal Automaton ToLevenshteinAutomata(Automaton automaton)
{
var @ref = SpecialOperations.GetFiniteStrings(automaton, -1);
Automaton[] subs = new Automaton[@ref.Count];
int upto = 0;
foreach (Int32sRef path in @ref)
{
if (path.Length <= nonFuzzyPrefix || path.Length < minFuzzyLength)
{
subs[upto] = BasicAutomata.MakeString(path.Int32s, path.Offset, path.Length);
upto++;
}
else
{
Automaton prefix = BasicAutomata.MakeString(path.Int32s, path.Offset, nonFuzzyPrefix);
int[] ints = new int[path.Length - nonFuzzyPrefix];
Array.Copy(path.Int32s, path.Offset + nonFuzzyPrefix, ints, 0, ints.Length);
// TODO: maybe add alphaMin to LevenshteinAutomata,
// and pass 1 instead of 0? We probably don't want
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MaxCodePoint : 255, transpositions);
Automaton levAutomaton = lev.ToAutomaton(maxEdits);
Automaton combined = BasicOperations.Concatenate(prefix, levAutomaton);
combined.IsDeterministic = true; // its like the special case in concatenate itself, except we cloneExpanded already
subs[upto] = combined;
upto++;
}
}
if (subs.Length == 0)
{
// automaton is empty, there is no accepted paths through it
return BasicAutomata.MakeEmpty(); // matches nothing
}
else if (subs.Length == 1)
{
// no synonyms or anything: just a single path through the tokenstream
return subs[0];
}
else
{
// multiple paths: this is really scary! is it slow?
// maybe we should not do this and throw UOE?
Automaton a = BasicOperations.Union(subs);
// TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
BasicOperations.Determinize(a);
return a;
}
}
}
}