blob: 181d24e52ebe32dd62fce705ecf34cf1c1b9bfed [file] [log] [blame]
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Lucene.Net.Index;
using Lucene.Net.Util;
using Lucene.Net.Util.Automaton;
namespace Lucene.Net.Search.Spell
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Simple automaton-based spellchecker.
/// <para>
/// Candidates are presented directly from the term dictionary, based on
/// Levenshtein distance. This is an alternative to <seealso cref="SpellChecker"/>
/// if you are using an edit-distance-like metric such as Levenshtein
/// or <seealso cref="JaroWinklerDistance"/>.
/// </para>
/// <para>
/// A practical benefit of this spellchecker is that it requires no additional
/// datastructures (neither in RAM nor on disk) to do its work.
///
/// </para>
/// </summary>
/// <seealso cref= LevenshteinAutomata </seealso>
/// <seealso cref= FuzzyTermsEnum
///
/// @lucene.experimental </seealso>
public class DirectSpellChecker
{
/// <summary>
/// The default StringDistance, Damerau-Levenshtein distance implemented internally
/// via <seealso cref="LevenshteinAutomata"/>.
/// <para>
/// Note: this is the fastest distance metric, because Damerau-Levenshtein is used
/// to draw candidates from the term dictionary: this just re-uses the scoring.
/// </para>
/// </summary>
public static readonly StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();
/// <summary>
/// maximum edit distance for candidate terms </summary>
private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
/// <summary>
/// minimum prefix for candidate terms </summary>
private int minPrefix = 1;
/// <summary>
/// maximum number of top-N inspections per suggestion </summary>
private int maxInspections = 5;
/// <summary>
/// minimum accuracy for a term to match </summary>
private float accuracy = SpellChecker.DEFAULT_ACCURACY;
/// <summary>
/// value in [0..1] (or absolute number >=1) representing the minimum
/// number of documents (of the total) where a term should appear.
/// </summary>
private float thresholdFrequency = 0f;
/// <summary>
/// minimum length of a query word to return suggestions </summary>
private int minQueryLength = 4;
/// <summary>
/// value in [0..1] (or absolute number >=1) representing the maximum
/// number of documents (of the total) a query term can appear in to
/// be corrected.
/// </summary>
private float maxQueryFrequency = 0.01f;
/// <summary>
/// true if the spellchecker should lowercase terms </summary>
private bool lowerCaseTerms = true;
/// <summary>
/// the comparator to use </summary>
private IComparer<SuggestWord> comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
/// <summary>
/// the string distance to use </summary>
private StringDistance distance = INTERNAL_LEVENSHTEIN;
/// <summary>
/// Creates a DirectSpellChecker with default configuration values </summary>
public DirectSpellChecker()
{
}
/// <summary>
/// Get the maximum number of Levenshtein edit-distances to draw
/// candidate terms from.
/// </summary>
public virtual int MaxEdits
{
get
{
return maxEdits;
}
set
{
if (value < 1 || value > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
{
throw new NotSupportedException("Invalid maxEdits");
}
this.maxEdits = value;
}
}
/// <summary>
/// Get the minimal number of characters that must match exactly
/// </summary>
public virtual int MinPrefix
{
get
{
return minPrefix;
}
set
{
this.minPrefix = value;
}
}
/// <summary>
/// Get the maximum number of top-N inspections per suggestion
/// </summary>
public virtual int MaxInspections
{
get
{
return maxInspections;
}
set
{
this.maxInspections = value;
}
}
/// <summary>
/// Get the minimal accuracy from the StringDistance for a match
/// </summary>
public virtual float Accuracy
{
get
{
return accuracy;
}
set
{
this.accuracy = value;
}
}
/// <summary>
/// Get the minimal threshold of documents a term must appear for a match
/// </summary>
public virtual float ThresholdFrequency
{
get
{
return thresholdFrequency;
}
set
{
if (value >= 1f && value != (int)value)
{
throw new System.ArgumentException("Fractional absolute document frequencies are not allowed");
}
this.thresholdFrequency = value;
}
}
/// <summary>
/// Get the minimum length of a query term needed to return suggestions </summary>
public virtual int MinQueryLength
{
get
{
return minQueryLength;
}
set
{
this.minQueryLength = value;
}
}
/// <summary>
/// Get the maximum threshold of documents a query term can appear in order
/// to provide suggestions.
/// </summary>
public virtual float MaxQueryFrequency
{
get
{
return maxQueryFrequency;
}
set
{
if (value >= 1f && value != (int)value)
{
throw new System.ArgumentException("Fractional absolute document frequencies are not allowed");
}
this.maxQueryFrequency = value;
}
}
/// <summary>
/// true if the spellchecker should lowercase terms </summary>
public virtual bool LowerCaseTerms
{
get
{
return lowerCaseTerms;
}
set
{
this.lowerCaseTerms = value;
}
}
/// <summary>
/// Get the current comparator in use.
/// </summary>
public virtual IComparer<SuggestWord> Comparator
{
get
{
return comparator;
}
set
{
this.comparator = value;
}
}
/// <summary>
/// Get the string distance metric in use.
/// </summary>
public virtual StringDistance Distance
{
get
{
return distance;
}
set
{
this.distance = value;
}
}
/// <summary>
/// Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode)
/// suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)}
/// </summary>
public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir)
{
return SuggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
}
/// <summary>
/// Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode, float)
/// suggestSimilar(term, numSug, ir, suggestMode, this.accuracy)}
///
/// </summary>
public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode)
{
return SuggestSimilar(term, numSug, ir, suggestMode, this.accuracy);
}
/// <summary>
/// Suggest similar words.
///
/// <para>Unlike <seealso cref="SpellChecker"/>, the similarity used to fetch the most
/// relevant terms is an edit distance, therefore typically a low value
/// for numSug will work very well.
///
/// </para>
/// </summary>
/// <param name="term"> Term you want to spell check on </param>
/// <param name="numSug"> the maximum number of suggested words </param>
/// <param name="ir"> IndexReader to find terms from </param>
/// <param name="suggestMode"> specifies when to return suggested words </param>
/// <param name="accuracy"> return only suggested words that match with this similarity </param>
/// <returns> sorted list of the suggested words according to the comparator </returns>
/// <exception cref="IOException"> If there is a low-level I/O error. </exception>
public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy)
{
CharsRef spare = new CharsRef();
string text = term.Text();
if (minQueryLength > 0 && text.CodePointCount(0, text.Length) < minQueryLength)
{
return new SuggestWord[0];
}
if (lowerCaseTerms)
{
term = new Term(term.Field(), text.ToLower(Locale.ROOT));
}
int docfreq = ir.DocFreq(term);
if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0)
{
return new SuggestWord[0];
}
int maxDoc = ir.MaxDoc();
if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency)
{
return new SuggestWord[0];
}
else if (docfreq > (int)Math.Ceiling(maxQueryFrequency * (float)maxDoc))
{
return new SuggestWord[0];
}
if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR)
{
docfreq = 0;
}
if (thresholdFrequency >= 1f)
{
docfreq = Math.Max(docfreq, (int)thresholdFrequency);
}
else if (thresholdFrequency > 0f)
{
docfreq = Math.Max(docfreq, (int)(thresholdFrequency * (float)maxDoc) - 1);
}
ICollection<ScoreTerm> terms = null;
int inspections = numSug * maxInspections;
// try ed=1 first, in case we get lucky
terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
if (maxEdits > 1 && terms.Count < inspections)
{
var moreTerms = new HashSet<ScoreTerm>();
moreTerms.AddAll(terms);
moreTerms.AddAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
terms = moreTerms;
}
// create the suggestword response, sort it, and trim it to size.
var suggestions = new SuggestWord[terms.Count];
int index = suggestions.Length - 1;
foreach (ScoreTerm s in terms)
{
SuggestWord suggestion = new SuggestWord();
if (s.termAsString == null)
{
UnicodeUtil.UTF8toUTF16(s.term, spare);
s.termAsString = spare.ToString();
}
suggestion.@string = s.termAsString;
suggestion.score = s.score;
suggestion.freq = s.docfreq;
suggestions[index--] = suggestion;
}
ArrayUtil.TimSort(suggestions, Collections.ReverseOrder(comparator));
if (numSug < suggestions.Length)
{
SuggestWord[] trimmed = new SuggestWord[numSug];
Array.Copy(suggestions, 0, trimmed, 0, numSug);
suggestions = trimmed;
}
return suggestions;
}
/// <summary>
/// Provide spelling corrections based on several parameters.
/// </summary>
/// <param name="term"> The term to suggest spelling corrections for </param>
/// <param name="numSug"> The maximum number of spelling corrections </param>
/// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
/// <param name="docfreq"> The minimum document frequency a potential suggestion need to have in order to be included </param>
/// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
/// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param>
/// <param name="spare"> a chars scratch </param>
/// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns>
/// <exception cref="IOException"> If I/O related errors occur </exception>
protected internal virtual ICollection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, CharsRef spare)
{
var atts = new AttributeSource();
MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<MaxNonCompetitiveBoostAttribute>();
Terms terms = MultiFields.GetTerms(ir, term.Field());
if (terms == null)
{
return Enumerable.Empty<ScoreDoc>();
}
FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true);
var stQueue = new PriorityQueue<ScoreTerm>();
BytesRef queryTerm = new BytesRef(term.Text());
BytesRef candidateTerm;
ScoreTerm st = new ScoreTerm();
BoostAttribute boostAtt = e.Attributes().AddAttribute<BoostAttribute>();
while ((candidateTerm = e.Next()) != null)
{
float boost = boostAtt.Boost;
// ignore uncompetitive hits
if (stQueue.Size() >= numSug && boost <= stQueue.Peek().boost)
{
continue;
}
// ignore exact match of the same term
if (queryTerm.BytesEquals(candidateTerm))
{
continue;
}
int df = e.DocFreq();
// check docFreq if required
if (df <= docfreq)
{
continue;
}
float score;
string termAsString;
if (distance == INTERNAL_LEVENSHTEIN)
{
// delay creating strings until the end
termAsString = null;
// undo FuzzyTermsEnum's scale factor for a real scaled lev score
score = boost / e.ScaleFactor + e.MinSimilarity;
}
else
{
UnicodeUtil.UTF8toUTF16(candidateTerm, spare);
termAsString = spare.ToString();
score = distance.GetDistance(term.Text(), termAsString);
}
if (score < accuracy)
{
continue;
}
// add new entry in PQ
st.term = BytesRef.DeepCopyOf(candidateTerm);
st.boost = boost;
st.docfreq = df;
st.termAsString = termAsString;
st.score = score;
stQueue.Offer(st);
// possibly drop entries from queue
st = (stQueue.Size() > numSug) ? stQueue.Poll() : new ScoreTerm();
maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Size() >= numSug) ? stQueue.Peek().boost : float.NegativeInfinity;
}
return stQueue;
}
/// <summary>
/// Holds a spelling correction for internal usage inside <seealso cref="DirectSpellChecker"/>.
/// </summary>
protected internal class ScoreTerm : IComparable<ScoreTerm>
{
/// <summary>
/// The actual spellcheck correction.
/// </summary>
public BytesRef term;
/// <summary>
/// The boost representing the similarity from the FuzzyTermsEnum (internal similarity score)
/// </summary>
public float boost;
/// <summary>
/// The df of the spellcheck correction.
/// </summary>
public int docfreq;
/// <summary>
/// The spellcheck correction represented as string, can be <code>null</code>.
/// </summary>
public string termAsString;
/// <summary>
/// The similarity score.
/// </summary>
public float score;
/// <summary>
/// Constructor.
/// </summary>
public ScoreTerm()
{
}
public virtual int CompareTo(ScoreTerm other)
{
if (term.BytesEquals(other.term))
{
return 0; // consistent with equals
}
if (this.boost == other.boost)
{
return other.term.CompareTo(this.term);
}
else
{
return this.boost.CompareTo(other.boost);
}
}
public override int GetHashCode()
{
const int prime = 31;
int result = 1;
result = prime * result + ((term == null) ? 0 : term.GetHashCode());
return result;
}
public override bool Equals(object obj)
{
if (this == obj)
{
return true;
}
if (obj == null)
{
return false;
}
if (this.GetType() != obj.GetType())
{
return false;
}
ScoreTerm other = (ScoreTerm)obj;
if (term == null)
{
if (other.term != null)
{
return false;
}
}
else if (!term.BytesEquals(other.term))
{
return false;
}
return true;
}
}
}
}