| using System; |
| using System.Collections.Generic; |
| using System.IO; |
| using System.Linq; |
| using Lucene.Net.Index; |
| using Lucene.Net.Util; |
| using Lucene.Net.Util.Automaton; |
| |
| namespace Lucene.Net.Search.Spell |
| { |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| /// <summary> |
| /// Simple automaton-based spellchecker. |
| /// <para> |
| /// Candidates are presented directly from the term dictionary, based on |
| /// Levenshtein distance. This is an alternative to <seealso cref="SpellChecker"/> |
| /// if you are using an edit-distance-like metric such as Levenshtein |
| /// or <seealso cref="JaroWinklerDistance"/>. |
| /// </para> |
| /// <para> |
| /// A practical benefit of this spellchecker is that it requires no additional |
| /// datastructures (neither in RAM nor on disk) to do its work. |
| /// |
| /// </para> |
| /// </summary> |
| /// <seealso cref= LevenshteinAutomata </seealso> |
| /// <seealso cref= FuzzyTermsEnum |
| /// |
| /// @lucene.experimental </seealso> |
| public class DirectSpellChecker |
| { |
| /// <summary> |
| /// The default StringDistance, Damerau-Levenshtein distance implemented internally |
| /// via <seealso cref="LevenshteinAutomata"/>. |
| /// <para> |
| /// Note: this is the fastest distance metric, because Damerau-Levenshtein is used |
| /// to draw candidates from the term dictionary: this just re-uses the scoring. |
| /// </para> |
| /// </summary> |
| public static readonly StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance(); |
| |
| /// <summary> |
| /// maximum edit distance for candidate terms </summary> |
| private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; |
| /// <summary> |
| /// minimum prefix for candidate terms </summary> |
| private int minPrefix = 1; |
| /// <summary> |
| /// maximum number of top-N inspections per suggestion </summary> |
| private int maxInspections = 5; |
| /// <summary> |
| /// minimum accuracy for a term to match </summary> |
| private float accuracy = SpellChecker.DEFAULT_ACCURACY; |
| /// <summary> |
| /// value in [0..1] (or absolute number >=1) representing the minimum |
| /// number of documents (of the total) where a term should appear. |
| /// </summary> |
| private float thresholdFrequency = 0f; |
| /// <summary> |
| /// minimum length of a query word to return suggestions </summary> |
| private int minQueryLength = 4; |
| /// <summary> |
| /// value in [0..1] (or absolute number >=1) representing the maximum |
| /// number of documents (of the total) a query term can appear in to |
| /// be corrected. |
| /// </summary> |
| private float maxQueryFrequency = 0.01f; |
| /// <summary> |
| /// true if the spellchecker should lowercase terms </summary> |
| private bool lowerCaseTerms = true; |
| /// <summary> |
| /// the comparator to use </summary> |
| private IComparer<SuggestWord> comparator = SuggestWordQueue.DEFAULT_COMPARATOR; |
| /// <summary> |
| /// the string distance to use </summary> |
| private StringDistance distance = INTERNAL_LEVENSHTEIN; |
| |
| /// <summary> |
| /// Creates a DirectSpellChecker with default configuration values </summary> |
| public DirectSpellChecker() |
| { |
| } |
| |
| /// <summary> |
| /// Get the maximum number of Levenshtein edit-distances to draw |
| /// candidate terms from. |
| /// </summary> |
| public virtual int MaxEdits |
| { |
| get |
| { |
| return maxEdits; |
| } |
| set |
| { |
| if (value < 1 || value > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) |
| { |
| throw new NotSupportedException("Invalid maxEdits"); |
| } |
| this.maxEdits = value; |
| } |
| } |
| |
| |
| /// <summary> |
| /// Get the minimal number of characters that must match exactly |
| /// </summary> |
| public virtual int MinPrefix |
| { |
| get |
| { |
| return minPrefix; |
| } |
| set |
| { |
| this.minPrefix = value; |
| } |
| } |
| |
| |
| /// <summary> |
| /// Get the maximum number of top-N inspections per suggestion |
| /// </summary> |
| public virtual int MaxInspections |
| { |
| get |
| { |
| return maxInspections; |
| } |
| set |
| { |
| this.maxInspections = value; |
| } |
| } |
| |
| |
| /// <summary> |
| /// Get the minimal accuracy from the StringDistance for a match |
| /// </summary> |
| public virtual float Accuracy |
| { |
| get |
| { |
| return accuracy; |
| } |
| set |
| { |
| this.accuracy = value; |
| } |
| } |
| |
| |
| /// <summary> |
| /// Get the minimal threshold of documents a term must appear for a match |
| /// </summary> |
| public virtual float ThresholdFrequency |
| { |
| get |
| { |
| return thresholdFrequency; |
| } |
| set |
| { |
| if (value >= 1f && value != (int)value) |
| { |
| throw new System.ArgumentException("Fractional absolute document frequencies are not allowed"); |
| } |
| this.thresholdFrequency = value; |
| } |
| } |
| |
| |
| /// <summary> |
| /// Get the minimum length of a query term needed to return suggestions </summary> |
| public virtual int MinQueryLength |
| { |
| get |
| { |
| return minQueryLength; |
| } |
| set |
| { |
| this.minQueryLength = value; |
| } |
| } |
| |
| |
| /// <summary> |
| /// Get the maximum threshold of documents a query term can appear in order |
| /// to provide suggestions. |
| /// </summary> |
| public virtual float MaxQueryFrequency |
| { |
| get |
| { |
| return maxQueryFrequency; |
| } |
| set |
| { |
| if (value >= 1f && value != (int)value) |
| { |
| throw new System.ArgumentException("Fractional absolute document frequencies are not allowed"); |
| } |
| this.maxQueryFrequency = value; |
| } |
| } |
| |
| |
| /// <summary> |
| /// true if the spellchecker should lowercase terms </summary> |
| public virtual bool LowerCaseTerms |
| { |
| get |
| { |
| return lowerCaseTerms; |
| } |
| set |
| { |
| this.lowerCaseTerms = value; |
| } |
| } |
| |
| |
| /// <summary> |
| /// Get the current comparator in use. |
| /// </summary> |
| public virtual IComparer<SuggestWord> Comparator |
| { |
| get |
| { |
| return comparator; |
| } |
| set |
| { |
| this.comparator = value; |
| } |
| } |
| |
| |
| /// <summary> |
| /// Get the string distance metric in use. |
| /// </summary> |
| public virtual StringDistance Distance |
| { |
| get |
| { |
| return distance; |
| } |
| set |
| { |
| this.distance = value; |
| } |
| } |
| |
| |
| /// <summary> |
| /// Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode) |
| /// suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)} |
| /// </summary> |
| public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir) |
| { |
| return SuggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); |
| } |
| |
| /// <summary> |
| /// Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode, float) |
| /// suggestSimilar(term, numSug, ir, suggestMode, this.accuracy)} |
| /// |
| /// </summary> |
| public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode) |
| { |
| return SuggestSimilar(term, numSug, ir, suggestMode, this.accuracy); |
| } |
| |
| /// <summary> |
| /// Suggest similar words. |
| /// |
| /// <para>Unlike <seealso cref="SpellChecker"/>, the similarity used to fetch the most |
| /// relevant terms is an edit distance, therefore typically a low value |
| /// for numSug will work very well. |
| /// |
| /// </para> |
| /// </summary> |
| /// <param name="term"> Term you want to spell check on </param> |
| /// <param name="numSug"> the maximum number of suggested words </param> |
| /// <param name="ir"> IndexReader to find terms from </param> |
| /// <param name="suggestMode"> specifies when to return suggested words </param> |
| /// <param name="accuracy"> return only suggested words that match with this similarity </param> |
| /// <returns> sorted list of the suggested words according to the comparator </returns> |
| /// <exception cref="IOException"> If there is a low-level I/O error. </exception> |
| public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy) |
| { |
| CharsRef spare = new CharsRef(); |
| string text = term.Text(); |
| if (minQueryLength > 0 && text.CodePointCount(0, text.Length) < minQueryLength) |
| { |
| return new SuggestWord[0]; |
| } |
| |
| if (lowerCaseTerms) |
| { |
| term = new Term(term.Field(), text.ToLower(Locale.ROOT)); |
| } |
| |
| int docfreq = ir.DocFreq(term); |
| |
| if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) |
| { |
| return new SuggestWord[0]; |
| } |
| |
| int maxDoc = ir.MaxDoc(); |
| |
| if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) |
| { |
| return new SuggestWord[0]; |
| } |
| else if (docfreq > (int)Math.Ceiling(maxQueryFrequency * (float)maxDoc)) |
| { |
| return new SuggestWord[0]; |
| } |
| |
| if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR) |
| { |
| docfreq = 0; |
| } |
| |
| if (thresholdFrequency >= 1f) |
| { |
| docfreq = Math.Max(docfreq, (int)thresholdFrequency); |
| } |
| else if (thresholdFrequency > 0f) |
| { |
| docfreq = Math.Max(docfreq, (int)(thresholdFrequency * (float)maxDoc) - 1); |
| } |
| |
| ICollection<ScoreTerm> terms = null; |
| int inspections = numSug * maxInspections; |
| |
| // try ed=1 first, in case we get lucky |
| terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare); |
| if (maxEdits > 1 && terms.Count < inspections) |
| { |
| var moreTerms = new HashSet<ScoreTerm>(); |
| moreTerms.AddAll(terms); |
| moreTerms.AddAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare)); |
| terms = moreTerms; |
| } |
| |
| // create the suggestword response, sort it, and trim it to size. |
| |
| var suggestions = new SuggestWord[terms.Count]; |
| int index = suggestions.Length - 1; |
| foreach (ScoreTerm s in terms) |
| { |
| SuggestWord suggestion = new SuggestWord(); |
| if (s.termAsString == null) |
| { |
| UnicodeUtil.UTF8toUTF16(s.term, spare); |
| s.termAsString = spare.ToString(); |
| } |
| suggestion.@string = s.termAsString; |
| suggestion.score = s.score; |
| suggestion.freq = s.docfreq; |
| suggestions[index--] = suggestion; |
| } |
| |
| ArrayUtil.TimSort(suggestions, Collections.ReverseOrder(comparator)); |
| if (numSug < suggestions.Length) |
| { |
| SuggestWord[] trimmed = new SuggestWord[numSug]; |
| Array.Copy(suggestions, 0, trimmed, 0, numSug); |
| suggestions = trimmed; |
| } |
| return suggestions; |
| } |
| |
| /// <summary> |
| /// Provide spelling corrections based on several parameters. |
| /// </summary> |
| /// <param name="term"> The term to suggest spelling corrections for </param> |
| /// <param name="numSug"> The maximum number of spelling corrections </param> |
| /// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param> |
| /// <param name="docfreq"> The minimum document frequency a potential suggestion need to have in order to be included </param> |
| /// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param> |
| /// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param> |
| /// <param name="spare"> a chars scratch </param> |
| /// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns> |
| /// <exception cref="IOException"> If I/O related errors occur </exception> |
| protected internal virtual ICollection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, CharsRef spare) |
| { |
| |
| var atts = new AttributeSource(); |
| MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<MaxNonCompetitiveBoostAttribute>(); |
| Terms terms = MultiFields.GetTerms(ir, term.Field()); |
| if (terms == null) |
| { |
| return Enumerable.Empty<ScoreDoc>(); |
| } |
| FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true); |
| |
| var stQueue = new PriorityQueue<ScoreTerm>(); |
| |
| BytesRef queryTerm = new BytesRef(term.Text()); |
| BytesRef candidateTerm; |
| ScoreTerm st = new ScoreTerm(); |
| BoostAttribute boostAtt = e.Attributes().AddAttribute<BoostAttribute>(); |
| while ((candidateTerm = e.Next()) != null) |
| { |
| float boost = boostAtt.Boost; |
| // ignore uncompetitive hits |
| if (stQueue.Size() >= numSug && boost <= stQueue.Peek().boost) |
| { |
| continue; |
| } |
| |
| // ignore exact match of the same term |
| if (queryTerm.BytesEquals(candidateTerm)) |
| { |
| continue; |
| } |
| |
| int df = e.DocFreq(); |
| |
| // check docFreq if required |
| if (df <= docfreq) |
| { |
| continue; |
| } |
| |
| float score; |
| string termAsString; |
| if (distance == INTERNAL_LEVENSHTEIN) |
| { |
| // delay creating strings until the end |
| termAsString = null; |
| // undo FuzzyTermsEnum's scale factor for a real scaled lev score |
| score = boost / e.ScaleFactor + e.MinSimilarity; |
| } |
| else |
| { |
| UnicodeUtil.UTF8toUTF16(candidateTerm, spare); |
| termAsString = spare.ToString(); |
| score = distance.GetDistance(term.Text(), termAsString); |
| } |
| |
| if (score < accuracy) |
| { |
| continue; |
| } |
| |
| // add new entry in PQ |
| st.term = BytesRef.DeepCopyOf(candidateTerm); |
| st.boost = boost; |
| st.docfreq = df; |
| st.termAsString = termAsString; |
| st.score = score; |
| stQueue.Offer(st); |
| // possibly drop entries from queue |
| st = (stQueue.Size() > numSug) ? stQueue.Poll() : new ScoreTerm(); |
| maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Size() >= numSug) ? stQueue.Peek().boost : float.NegativeInfinity; |
| } |
| |
| return stQueue; |
| } |
| |
| /// <summary> |
| /// Holds a spelling correction for internal usage inside <seealso cref="DirectSpellChecker"/>. |
| /// </summary> |
| protected internal class ScoreTerm : IComparable<ScoreTerm> |
| { |
| |
| /// <summary> |
| /// The actual spellcheck correction. |
| /// </summary> |
| public BytesRef term; |
| |
| /// <summary> |
| /// The boost representing the similarity from the FuzzyTermsEnum (internal similarity score) |
| /// </summary> |
| public float boost; |
| |
| /// <summary> |
| /// The df of the spellcheck correction. |
| /// </summary> |
| public int docfreq; |
| |
| /// <summary> |
| /// The spellcheck correction represented as string, can be <code>null</code>. |
| /// </summary> |
| public string termAsString; |
| |
| /// <summary> |
| /// The similarity score. |
| /// </summary> |
| public float score; |
| |
| /// <summary> |
| /// Constructor. |
| /// </summary> |
| public ScoreTerm() |
| { |
| } |
| |
| public virtual int CompareTo(ScoreTerm other) |
| { |
| if (term.BytesEquals(other.term)) |
| { |
| return 0; // consistent with equals |
| } |
| if (this.boost == other.boost) |
| { |
| return other.term.CompareTo(this.term); |
| } |
| else |
| { |
| return this.boost.CompareTo(other.boost); |
| } |
| } |
| |
| public override int GetHashCode() |
| { |
| const int prime = 31; |
| int result = 1; |
| result = prime * result + ((term == null) ? 0 : term.GetHashCode()); |
| return result; |
| } |
| |
| public override bool Equals(object obj) |
| { |
| if (this == obj) |
| { |
| return true; |
| } |
| if (obj == null) |
| { |
| return false; |
| } |
| if (this.GetType() != obj.GetType()) |
| { |
| return false; |
| } |
| ScoreTerm other = (ScoreTerm)obj; |
| if (term == null) |
| { |
| if (other.term != null) |
| { |
| return false; |
| } |
| } |
| else if (!term.BytesEquals(other.term)) |
| { |
| return false; |
| } |
| return true; |
| } |
| } |
| } |
| } |