| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.spell; |
| |
| import java.io.IOException; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashSet; |
| import java.util.Locale; |
| import java.util.PriorityQueue; |
| |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.MultiTerms; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.search.FuzzyTermsEnum; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CharsRefBuilder; |
| import org.apache.lucene.util.automaton.LevenshteinAutomata; |
| |
| /** |
| * Simple automaton-based spellchecker. |
| * <p> |
| * Candidates are presented directly from the term dictionary, based on |
| * Levenshtein distance. This is an alternative to {@link SpellChecker} |
| * if you are using an edit-distance-like metric such as Levenshtein |
| * or {@link JaroWinklerDistance}. |
| * <p> |
| * A practical benefit of this spellchecker is that it requires no additional |
| * datastructures (neither in RAM nor on disk) to do its work. |
| * |
| * @see LevenshteinAutomata |
| * @see FuzzyTermsEnum |
| * |
| * @lucene.experimental |
| */ |
| public class DirectSpellChecker { |
| /** The default StringDistance, Damerau-Levenshtein distance implemented internally |
| * via {@link LevenshteinAutomata}. |
| * <p> |
| * Note: this is the fastest distance metric, because Damerau-Levenshtein is used |
| * to draw candidates from the term dictionary: this just re-uses the scoring. |
| */ |
| public static final StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance(); |
| |
| /** maximum edit distance for candidate terms */ |
| private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; |
| /** minimum prefix for candidate terms */ |
| private int minPrefix = 1; |
| /** maximum number of top-N inspections per suggestion */ |
| private int maxInspections = 5; |
| /** minimum accuracy for a term to match */ |
| private float accuracy = SpellChecker.DEFAULT_ACCURACY; |
| /** value in [0..1] (or absolute number >= 1) representing the minimum |
| * number of documents (of the total) where a term should appear. */ |
| private float thresholdFrequency = 0f; |
| /** minimum length of a query word to return suggestions */ |
| private int minQueryLength = 4; |
| /** maximum length of a query word to return suggestions */ |
| private int maxQueryLength = Integer.MAX_VALUE; |
| /** value in [0..1] (or absolute number >= 1) representing the maximum |
| * number of documents (of the total) a query term can appear in to |
| * be corrected. */ |
| private float maxQueryFrequency = 0.01f; |
| /** true if the spellchecker should lowercase terms */ |
| private boolean lowerCaseTerms = true; |
| /** the comparator to use */ |
| private Comparator<SuggestWord> comparator = SuggestWordQueue.DEFAULT_COMPARATOR; |
| /** the string distance to use */ |
| private StringDistance distance = INTERNAL_LEVENSHTEIN; |
| |
| /** Creates a DirectSpellChecker with default configuration values */ |
| public DirectSpellChecker() {} |
| |
| /** Get the maximum number of Levenshtein edit-distances to draw |
| * candidate terms from. */ |
| public int getMaxEdits() { |
| return maxEdits; |
| } |
| |
| /** Sets the maximum number of Levenshtein edit-distances to draw |
| * candidate terms from. This value can be 1 or 2. The default is 2. |
| * <p> |
| * Note: a large number of spelling errors occur with an edit distance |
| * of 1, by setting this value to 1 you can increase both performance |
| * and precision at the cost of recall. |
| */ |
| public void setMaxEdits(int maxEdits) { |
| if (maxEdits < 1 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) |
| throw new UnsupportedOperationException("Invalid maxEdits"); |
| this.maxEdits = maxEdits; |
| } |
| |
| /** |
| * Get the minimal number of characters that must match exactly |
| */ |
| public int getMinPrefix() { |
| return minPrefix; |
| } |
| |
| /** |
| * Sets the minimal number of initial characters (default: 1) |
| * that must match exactly. |
| * <p> |
| * This can improve both performance and accuracy of results, |
| * as misspellings are commonly not the first character. |
| */ |
| public void setMinPrefix(int minPrefix) { |
| this.minPrefix = minPrefix; |
| } |
| |
| /** |
| * Get the maximum number of top-N inspections per suggestion |
| */ |
| public int getMaxInspections() { |
| return maxInspections; |
| } |
| |
| /** |
| * Set the maximum number of top-N inspections (default: 5) per suggestion. |
| * <p> |
| * Increasing this number can improve the accuracy of results, at the cost |
| * of performance. |
| */ |
| public void setMaxInspections(int maxInspections) { |
| this.maxInspections = maxInspections; |
| } |
| |
| /** |
| * Get the minimal accuracy from the StringDistance for a match |
| */ |
| public float getAccuracy() { |
| return accuracy; |
| } |
| |
| /** |
| * Set the minimal accuracy required (default: 0.5f) from a StringDistance |
| * for a suggestion match. |
| */ |
| public void setAccuracy(float accuracy) { |
| this.accuracy = accuracy; |
| } |
| |
| /** |
| * Get the minimal threshold of documents a term must appear for a match |
| */ |
| public float getThresholdFrequency() { |
| return thresholdFrequency; |
| } |
| |
| /** |
| * Set the minimal threshold of documents a term must appear for a match. |
| * <p> |
| * This can improve quality by only suggesting high-frequency terms. Note that |
| * very high values might decrease performance slightly, by forcing the spellchecker |
| * to draw more candidates from the term dictionary, but a practical value such |
| * as <code>1</code> can be very useful towards improving quality. |
| * <p> |
| * This can be specified as a relative percentage of documents such as 0.5f, |
| * or it can be specified as an absolute whole document frequency, such as 4f. |
| * Absolute document frequencies may not be fractional. |
| */ |
| public void setThresholdFrequency(float thresholdFrequency) { |
| if (thresholdFrequency >= 1f && thresholdFrequency != (int) thresholdFrequency) |
| throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed"); |
| this.thresholdFrequency = thresholdFrequency; |
| } |
| |
| /** Get the minimum length of a query term needed to return suggestions */ |
| public int getMinQueryLength() { |
| return minQueryLength; |
| } |
| |
| /** |
| * Set the minimum length of a query term (default: 4) needed to return suggestions. |
| * <p> |
| * Very short query terms will often cause only bad suggestions with any distance |
| * metric. |
| */ |
| public void setMinQueryLength(int minQueryLength) { |
| if (minQueryLength > this.maxQueryLength) |
| throw new IllegalArgumentException("minQueryLength must not be greater than maxQueryLength"); |
| this.minQueryLength = minQueryLength; |
| } |
| |
| /** Get the maximum length of a query term to return suggestions */ |
| public int getMaxQueryLength() { |
| return maxQueryLength; |
| } |
| |
| /** |
| * Set the maximum length of a query term to return suggestions. |
| * <p> |
| * Long queries can be expensive to process and/or trigger exceptions. |
| */ |
| public void setMaxQueryLength(int maxQueryLength) { |
| if (maxQueryLength < this.minQueryLength) |
| throw new IllegalArgumentException("maxQueryLength must not be smaller than minQueryLength"); |
| this.maxQueryLength = maxQueryLength; |
| } |
| |
| /** |
| * Get the maximum threshold of documents a query term can appear in order |
| * to provide suggestions. |
| */ |
| public float getMaxQueryFrequency() { |
| return maxQueryFrequency; |
| } |
| |
| /** |
| * Set the maximum threshold (default: 0.01f) of documents a query term can |
| * appear in order to provide suggestions. |
| * <p> |
| * Very high-frequency terms are typically spelled correctly. Additionally, |
| * this can increase performance as it will do no work for the common case |
| * of correctly-spelled input terms. |
| * <p> |
| * This can be specified as a relative percentage of documents such as 0.5f, |
| * or it can be specified as an absolute whole document frequency, such as 4f. |
| * Absolute document frequencies may not be fractional. |
| */ |
| public void setMaxQueryFrequency(float maxQueryFrequency) { |
| if (maxQueryFrequency >= 1f && maxQueryFrequency != (int) maxQueryFrequency) |
| throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed"); |
| this.maxQueryFrequency = maxQueryFrequency; |
| } |
| |
| /** true if the spellchecker should lowercase terms */ |
| public boolean getLowerCaseTerms() { |
| return lowerCaseTerms; |
| } |
| |
| /** |
| * True if the spellchecker should lowercase terms (default: true) |
| * <p> |
| * This is a convenience method, if your index field has more complicated |
| * analysis (such as StandardTokenizer removing punctuation), it's probably |
| * better to turn this off, and instead run your query terms through your |
| * Analyzer first. |
| * <p> |
| * If this option is not on, case differences count as an edit! |
| */ |
| public void setLowerCaseTerms(boolean lowerCaseTerms) { |
| this.lowerCaseTerms = lowerCaseTerms; |
| } |
| |
| /** |
| * Get the current comparator in use. |
| */ |
| public Comparator<SuggestWord> getComparator() { |
| return comparator; |
| } |
| |
| /** |
| * Set the comparator for sorting suggestions. |
| * The default is {@link SuggestWordQueue#DEFAULT_COMPARATOR} |
| */ |
| public void setComparator(Comparator<SuggestWord> comparator) { |
| this.comparator = comparator; |
| } |
| |
| /** |
| * Get the string distance metric in use. |
| */ |
| public StringDistance getDistance() { |
| return distance; |
| } |
| |
| /** |
| * Set the string distance metric. |
| * The default is {@link #INTERNAL_LEVENSHTEIN} |
| * <p> |
| * Note: because this spellchecker draws its candidates from the term |
| * dictionary using Damerau-Levenshtein, it works best with an edit-distance-like |
| * string metric. If you use a different metric than the default, |
| * you might want to consider increasing {@link #setMaxInspections(int)} |
| * to draw more candidates for your metric to rank. |
| */ |
| public void setDistance(StringDistance distance) { |
| this.distance = distance; |
| } |
| |
| /** |
| * Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode) |
| * suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)} |
| */ |
| public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir) |
| throws IOException { |
| return suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); |
| } |
| |
| /** |
| * Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode, float) |
| * suggestSimilar(term, numSug, ir, suggestMode, this.accuracy)} |
| * |
| */ |
| public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, |
| SuggestMode suggestMode) throws IOException { |
| return suggestSimilar(term, numSug, ir, suggestMode, this.accuracy); |
| } |
| |
| /** |
| * Suggest similar words. |
| * |
| * <p>Unlike {@link SpellChecker}, the similarity used to fetch the most |
| * relevant terms is an edit distance, therefore typically a low value |
| * for numSug will work very well. |
| * |
| * @param term Term you want to spell check on |
| * @param numSug the maximum number of suggested words |
| * @param ir IndexReader to find terms from |
| * @param suggestMode specifies when to return suggested words |
| * @param accuracy return only suggested words that match with this similarity |
| * @return sorted list of the suggested words according to the comparator |
| * @throws IOException If there is a low-level I/O error. |
| */ |
| public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, |
| SuggestMode suggestMode, float accuracy) throws IOException { |
| final CharsRefBuilder spare = new CharsRefBuilder(); |
| String text = term.text(); |
| |
| int textLength = text.codePointCount(0, text.length()); |
| if (textLength < minQueryLength || textLength > maxQueryLength) |
| return new SuggestWord[0]; |
| |
| if (lowerCaseTerms) { |
| term = new Term(term.field(), text.toLowerCase(Locale.ROOT)); |
| } |
| |
| int docfreq = ir.docFreq(term); |
| |
| if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) { |
| return new SuggestWord[0]; |
| } |
| |
| int maxDoc = ir.maxDoc(); |
| |
| if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) { |
| return new SuggestWord[0]; |
| } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) { |
| return new SuggestWord[0]; |
| } |
| |
| if (suggestMode!=SuggestMode.SUGGEST_MORE_POPULAR) docfreq = 0; |
| |
| if (thresholdFrequency >= 1f) { |
| docfreq = Math.max(docfreq, (int) thresholdFrequency); |
| } else if (thresholdFrequency > 0f) { |
| docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1); |
| } |
| |
| Collection<ScoreTerm> terms = null; |
| int inspections = numSug * maxInspections; |
| |
| // try ed=1 first, in case we get lucky |
| terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare); |
| if (maxEdits > 1 && terms.size() < inspections) { |
| HashSet<ScoreTerm> moreTerms = new HashSet<>(); |
| moreTerms.addAll(terms); |
| moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare)); |
| terms = moreTerms; |
| } |
| |
| // create the suggestword response, sort it, and trim it to size. |
| |
| SuggestWord suggestions[] = new SuggestWord[terms.size()]; |
| int index = suggestions.length - 1; |
| for (ScoreTerm s : terms) { |
| SuggestWord suggestion = new SuggestWord(); |
| if (s.termAsString == null) { |
| spare.copyUTF8Bytes(s.term); |
| s.termAsString = spare.toString(); |
| } |
| suggestion.string = s.termAsString; |
| suggestion.score = s.score; |
| suggestion.freq = s.docfreq; |
| suggestions[index--] = suggestion; |
| } |
| |
| ArrayUtil.timSort(suggestions, Collections.reverseOrder(comparator)); |
| if (numSug < suggestions.length) { |
| SuggestWord trimmed[] = new SuggestWord[numSug]; |
| System.arraycopy(suggestions, 0, trimmed, 0, numSug); |
| suggestions = trimmed; |
| } |
| return suggestions; |
| } |
| |
| /** |
| * Provide spelling corrections based on several parameters. |
| * |
| * @param term The term to suggest spelling corrections for |
| * @param numSug The maximum number of spelling corrections |
| * @param ir The index reader to fetch the candidate spelling corrections from |
| * @param docfreq The minimum document frequency a potential suggestion need to have in order to be included |
| * @param editDistance The maximum edit distance candidates are allowed to have |
| * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included |
| * @param spare a chars scratch |
| * @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. |
| * @throws IOException If I/O related errors occur |
| */ |
| protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, |
| float accuracy, final CharsRefBuilder spare) throws IOException { |
| |
| Terms terms = MultiTerms.getTerms(ir, term.field()); |
| if (terms == null) { |
| return Collections.emptyList(); |
| } |
| FuzzyTermsEnum e = new FuzzyTermsEnum(terms, term, editDistance, Math.max(minPrefix, editDistance - 1), true); |
| final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>(); |
| |
| BytesRef queryTerm = new BytesRef(term.text()); |
| BytesRef candidateTerm; |
| ScoreTerm st = new ScoreTerm(); |
| while ((candidateTerm = e.next()) != null) { |
| // For FuzzyQuery, boost is the score: |
| float score = e.getBoost(); |
| // ignore uncompetitive hits |
| if (stQueue.size() >= numSug && score <= stQueue.peek().boost) { |
| continue; |
| } |
| |
| // ignore exact match of the same term |
| if (queryTerm.bytesEquals(candidateTerm)) { |
| continue; |
| } |
| |
| int df = e.docFreq(); |
| |
| // check docFreq if required |
| if (df <= docfreq) { |
| continue; |
| } |
| |
| final String termAsString; |
| if (distance == INTERNAL_LEVENSHTEIN) { |
| // delay creating strings until the end |
| termAsString = null; |
| } else { |
| spare.copyUTF8Bytes(candidateTerm); |
| termAsString = spare.toString(); |
| score = distance.getDistance(term.text(), termAsString); |
| } |
| |
| if (score < accuracy) { |
| continue; |
| } |
| |
| // add new entry in PQ |
| st.term = BytesRef.deepCopyOf(candidateTerm); |
| st.boost = score; |
| st.docfreq = df; |
| st.termAsString = termAsString; |
| st.score = score; |
| stQueue.offer(st); |
| // possibly drop entries from queue |
| st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm(); |
| e.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); |
| } |
| |
| return stQueue; |
| } |
| |
| /** |
| * Holds a spelling correction for internal usage inside {@link DirectSpellChecker}. |
| */ |
| protected static class ScoreTerm implements Comparable<ScoreTerm> { |
| |
| /** |
| * The actual spellcheck correction. |
| */ |
| public BytesRef term; |
| |
| /** |
| * The boost representing the similarity from the FuzzyTermsEnum (internal similarity score) |
| */ |
| public float boost; |
| |
| /** |
| * The df of the spellcheck correction. |
| */ |
| public int docfreq; |
| |
| /** |
| * The spellcheck correction represented as string, can be <code>null</code>. |
| */ |
| public String termAsString; |
| |
| /** |
| * The similarity score. |
| */ |
| public float score; |
| |
| /** |
| * Constructor. |
| */ |
| public ScoreTerm() { |
| } |
| |
| @Override |
| public int compareTo(ScoreTerm other) { |
| if (term.bytesEquals(other.term)) |
| return 0; // consistent with equals |
| if (this.boost == other.boost) |
| return other.term.compareTo(this.term); |
| else |
| return Float.compare(this.boost, other.boost); |
| } |
| |
| @Override |
| public int hashCode() { |
| final int prime = 31; |
| int result = 1; |
| result = prime * result + ((term == null) ? 0 : term.hashCode()); |
| return result; |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (this == obj) return true; |
| if (obj == null) return false; |
| if (getClass() != obj.getClass()) return false; |
| ScoreTerm other = (ScoreTerm) obj; |
| if (term == null) { |
| if (other.term != null) return false; |
| } else if (!term.bytesEquals(other.term)) return false; |
| return true; |
| } |
| } |
| } |