blob: 3800596afd85bdf498a8305117083728c069f493 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.spell;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Locale;
import java.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
/**
* Simple automaton-based spellchecker.
* <p>
* Candidates are presented directly from the term dictionary, based on
* Levenshtein distance. This is an alternative to {@link SpellChecker}
* if you are using an edit-distance-like metric such as Levenshtein
* or {@link JaroWinklerDistance}.
* <p>
* A practical benefit of this spellchecker is that it requires no additional
* datastructures (neither in RAM nor on disk) to do its work.
*
* @see LevenshteinAutomata
* @see FuzzyTermsEnum
*
* @lucene.experimental
*/
public class DirectSpellChecker {
/** The default StringDistance, Damerau-Levenshtein distance implemented internally
* via {@link LevenshteinAutomata}.
* <p>
* Note: this is the fastest distance metric, because Damerau-Levenshtein is used
* to draw candidates from the term dictionary: this just re-uses the scoring.
*/
public static final StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();
/** maximum edit distance for candidate terms */
private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
/** minimum prefix for candidate terms */
private int minPrefix = 1;
/** maximum number of top-N inspections per suggestion */
private int maxInspections = 5;
/** minimum accuracy for a term to match */
private float accuracy = SpellChecker.DEFAULT_ACCURACY;
/** value in [0..1] (or absolute number &gt;= 1) representing the minimum
* number of documents (of the total) where a term should appear. */
private float thresholdFrequency = 0f;
/** minimum length of a query word to return suggestions */
private int minQueryLength = 4;
/** maximum length of a query word to return suggestions */
private int maxQueryLength = Integer.MAX_VALUE;
/** value in [0..1] (or absolute number &gt;= 1) representing the maximum
* number of documents (of the total) a query term can appear in to
* be corrected. */
private float maxQueryFrequency = 0.01f;
/** true if the spellchecker should lowercase terms */
private boolean lowerCaseTerms = true;
/** the comparator to use */
private Comparator<SuggestWord> comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
/** the string distance to use */
private StringDistance distance = INTERNAL_LEVENSHTEIN;
/** Creates a DirectSpellChecker with default configuration values */
public DirectSpellChecker() {}
/** Get the maximum number of Levenshtein edit-distances to draw
* candidate terms from. */
public int getMaxEdits() {
return maxEdits;
}
/** Sets the maximum number of Levenshtein edit-distances to draw
* candidate terms from. This value can be 1 or 2. The default is 2.
* <p>
* Note: a large number of spelling errors occur with an edit distance
* of 1, by setting this value to 1 you can increase both performance
* and precision at the cost of recall.
*/
public void setMaxEdits(int maxEdits) {
if (maxEdits < 1 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
throw new UnsupportedOperationException("Invalid maxEdits");
this.maxEdits = maxEdits;
}
/**
* Get the minimal number of characters that must match exactly
*/
public int getMinPrefix() {
return minPrefix;
}
/**
* Sets the minimal number of initial characters (default: 1)
* that must match exactly.
* <p>
* This can improve both performance and accuracy of results,
* as misspellings are commonly not the first character.
*/
public void setMinPrefix(int minPrefix) {
this.minPrefix = minPrefix;
}
/**
* Get the maximum number of top-N inspections per suggestion
*/
public int getMaxInspections() {
return maxInspections;
}
/**
* Set the maximum number of top-N inspections (default: 5) per suggestion.
* <p>
* Increasing this number can improve the accuracy of results, at the cost
* of performance.
*/
public void setMaxInspections(int maxInspections) {
this.maxInspections = maxInspections;
}
/**
* Get the minimal accuracy from the StringDistance for a match
*/
public float getAccuracy() {
return accuracy;
}
/**
* Set the minimal accuracy required (default: 0.5f) from a StringDistance
* for a suggestion match.
*/
public void setAccuracy(float accuracy) {
this.accuracy = accuracy;
}
/**
* Get the minimal threshold of documents a term must appear for a match
*/
public float getThresholdFrequency() {
return thresholdFrequency;
}
/**
* Set the minimal threshold of documents a term must appear for a match.
* <p>
* This can improve quality by only suggesting high-frequency terms. Note that
* very high values might decrease performance slightly, by forcing the spellchecker
* to draw more candidates from the term dictionary, but a practical value such
* as <code>1</code> can be very useful towards improving quality.
* <p>
* This can be specified as a relative percentage of documents such as 0.5f,
* or it can be specified as an absolute whole document frequency, such as 4f.
* Absolute document frequencies may not be fractional.
*/
public void setThresholdFrequency(float thresholdFrequency) {
if (thresholdFrequency >= 1f && thresholdFrequency != (int) thresholdFrequency)
throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed");
this.thresholdFrequency = thresholdFrequency;
}
/** Get the minimum length of a query term needed to return suggestions */
public int getMinQueryLength() {
return minQueryLength;
}
/**
* Set the minimum length of a query term (default: 4) needed to return suggestions.
* <p>
* Very short query terms will often cause only bad suggestions with any distance
* metric.
*/
public void setMinQueryLength(int minQueryLength) {
if (minQueryLength > this.maxQueryLength)
throw new IllegalArgumentException("minQueryLength must not be greater than maxQueryLength");
this.minQueryLength = minQueryLength;
}
/** Get the maximum length of a query term to return suggestions */
public int getMaxQueryLength() {
return maxQueryLength;
}
/**
* Set the maximum length of a query term to return suggestions.
* <p>
* Long queries can be expensive to process and/or trigger exceptions.
*/
public void setMaxQueryLength(int maxQueryLength) {
if (maxQueryLength < this.minQueryLength)
throw new IllegalArgumentException("maxQueryLength must not be smaller than minQueryLength");
this.maxQueryLength = maxQueryLength;
}
/**
* Get the maximum threshold of documents a query term can appear in order
* to provide suggestions.
*/
public float getMaxQueryFrequency() {
return maxQueryFrequency;
}
/**
* Set the maximum threshold (default: 0.01f) of documents a query term can
* appear in order to provide suggestions.
* <p>
* Very high-frequency terms are typically spelled correctly. Additionally,
* this can increase performance as it will do no work for the common case
* of correctly-spelled input terms.
* <p>
* This can be specified as a relative percentage of documents such as 0.5f,
* or it can be specified as an absolute whole document frequency, such as 4f.
* Absolute document frequencies may not be fractional.
*/
public void setMaxQueryFrequency(float maxQueryFrequency) {
if (maxQueryFrequency >= 1f && maxQueryFrequency != (int) maxQueryFrequency)
throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed");
this.maxQueryFrequency = maxQueryFrequency;
}
/** true if the spellchecker should lowercase terms */
public boolean getLowerCaseTerms() {
return lowerCaseTerms;
}
/**
* True if the spellchecker should lowercase terms (default: true)
* <p>
* This is a convenience method, if your index field has more complicated
* analysis (such as StandardTokenizer removing punctuation), it's probably
* better to turn this off, and instead run your query terms through your
* Analyzer first.
* <p>
* If this option is not on, case differences count as an edit!
*/
public void setLowerCaseTerms(boolean lowerCaseTerms) {
this.lowerCaseTerms = lowerCaseTerms;
}
/**
* Get the current comparator in use.
*/
public Comparator<SuggestWord> getComparator() {
return comparator;
}
/**
* Set the comparator for sorting suggestions.
* The default is {@link SuggestWordQueue#DEFAULT_COMPARATOR}
*/
public void setComparator(Comparator<SuggestWord> comparator) {
this.comparator = comparator;
}
/**
* Get the string distance metric in use.
*/
public StringDistance getDistance() {
return distance;
}
/**
* Set the string distance metric.
* The default is {@link #INTERNAL_LEVENSHTEIN}
* <p>
* Note: because this spellchecker draws its candidates from the term
* dictionary using Damerau-Levenshtein, it works best with an edit-distance-like
* string metric. If you use a different metric than the default,
* you might want to consider increasing {@link #setMaxInspections(int)}
* to draw more candidates for your metric to rank.
*/
public void setDistance(StringDistance distance) {
this.distance = distance;
}
/**
* Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode)
* suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)}
*/
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir)
throws IOException {
return suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
}
/**
* Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode, float)
* suggestSimilar(term, numSug, ir, suggestMode, this.accuracy)}
*
*/
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
SuggestMode suggestMode) throws IOException {
return suggestSimilar(term, numSug, ir, suggestMode, this.accuracy);
}
/**
* Suggest similar words.
*
* <p>Unlike {@link SpellChecker}, the similarity used to fetch the most
* relevant terms is an edit distance, therefore typically a low value
* for numSug will work very well.
*
* @param term Term you want to spell check on
* @param numSug the maximum number of suggested words
* @param ir IndexReader to find terms from
* @param suggestMode specifies when to return suggested words
* @param accuracy return only suggested words that match with this similarity
* @return sorted list of the suggested words according to the comparator
* @throws IOException If there is a low-level I/O error.
*/
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
SuggestMode suggestMode, float accuracy) throws IOException {
final CharsRefBuilder spare = new CharsRefBuilder();
String text = term.text();
int textLength = text.codePointCount(0, text.length());
if (textLength < minQueryLength || textLength > maxQueryLength)
return new SuggestWord[0];
if (lowerCaseTerms) {
term = new Term(term.field(), text.toLowerCase(Locale.ROOT));
}
int docfreq = ir.docFreq(term);
if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) {
return new SuggestWord[0];
}
int maxDoc = ir.maxDoc();
if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) {
return new SuggestWord[0];
} else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) {
return new SuggestWord[0];
}
if (suggestMode!=SuggestMode.SUGGEST_MORE_POPULAR) docfreq = 0;
if (thresholdFrequency >= 1f) {
docfreq = Math.max(docfreq, (int) thresholdFrequency);
} else if (thresholdFrequency > 0f) {
docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1);
}
Collection<ScoreTerm> terms = null;
int inspections = numSug * maxInspections;
// try ed=1 first, in case we get lucky
terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
if (maxEdits > 1 && terms.size() < inspections) {
HashSet<ScoreTerm> moreTerms = new HashSet<>();
moreTerms.addAll(terms);
moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
terms = moreTerms;
}
// create the suggestword response, sort it, and trim it to size.
SuggestWord suggestions[] = new SuggestWord[terms.size()];
int index = suggestions.length - 1;
for (ScoreTerm s : terms) {
SuggestWord suggestion = new SuggestWord();
if (s.termAsString == null) {
spare.copyUTF8Bytes(s.term);
s.termAsString = spare.toString();
}
suggestion.string = s.termAsString;
suggestion.score = s.score;
suggestion.freq = s.docfreq;
suggestions[index--] = suggestion;
}
ArrayUtil.timSort(suggestions, Collections.reverseOrder(comparator));
if (numSug < suggestions.length) {
SuggestWord trimmed[] = new SuggestWord[numSug];
System.arraycopy(suggestions, 0, trimmed, 0, numSug);
suggestions = trimmed;
}
return suggestions;
}
/**
* Provide spelling corrections based on several parameters.
*
* @param term The term to suggest spelling corrections for
* @param numSug The maximum number of spelling corrections
* @param ir The index reader to fetch the candidate spelling corrections from
* @param docfreq The minimum document frequency a potential suggestion need to have in order to be included
* @param editDistance The maximum edit distance candidates are allowed to have
* @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included
* @param spare a chars scratch
* @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order.
* @throws IOException If I/O related errors occur
*/
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
float accuracy, final CharsRefBuilder spare) throws IOException {
Terms terms = MultiTerms.getTerms(ir, term.field());
if (terms == null) {
return Collections.emptyList();
}
FuzzyTermsEnum e = new FuzzyTermsEnum(terms, term, editDistance, Math.max(minPrefix, editDistance - 1), true);
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
BytesRef queryTerm = new BytesRef(term.text());
BytesRef candidateTerm;
ScoreTerm st = new ScoreTerm();
while ((candidateTerm = e.next()) != null) {
// For FuzzyQuery, boost is the score:
float score = e.getBoost();
// ignore uncompetitive hits
if (stQueue.size() >= numSug && score <= stQueue.peek().boost) {
continue;
}
// ignore exact match of the same term
if (queryTerm.bytesEquals(candidateTerm)) {
continue;
}
int df = e.docFreq();
// check docFreq if required
if (df <= docfreq) {
continue;
}
final String termAsString;
if (distance == INTERNAL_LEVENSHTEIN) {
// delay creating strings until the end
termAsString = null;
} else {
spare.copyUTF8Bytes(candidateTerm);
termAsString = spare.toString();
score = distance.getDistance(term.text(), termAsString);
}
if (score < accuracy) {
continue;
}
// add new entry in PQ
st.term = BytesRef.deepCopyOf(candidateTerm);
st.boost = score;
st.docfreq = df;
st.termAsString = termAsString;
st.score = score;
stQueue.offer(st);
// possibly drop entries from queue
st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
e.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
}
return stQueue;
}
/**
* Holds a spelling correction for internal usage inside {@link DirectSpellChecker}.
*/
protected static class ScoreTerm implements Comparable<ScoreTerm> {
/**
* The actual spellcheck correction.
*/
public BytesRef term;
/**
* The boost representing the similarity from the FuzzyTermsEnum (internal similarity score)
*/
public float boost;
/**
* The df of the spellcheck correction.
*/
public int docfreq;
/**
* The spellcheck correction represented as string, can be <code>null</code>.
*/
public String termAsString;
/**
* The similarity score.
*/
public float score;
/**
* Constructor.
*/
public ScoreTerm() {
}
@Override
public int compareTo(ScoreTerm other) {
if (term.bytesEquals(other.term))
return 0; // consistent with equals
if (this.boost == other.boost)
return other.term.compareTo(this.term);
else
return Float.compare(this.boost, other.boost);
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((term == null) ? 0 : term.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
ScoreTerm other = (ScoreTerm) obj;
if (term == null) {
if (other.term != null) return false;
} else if (!term.bytesEquals(other.term)) return false;
return true;
}
}
}