lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.spell;

 import java.io.IOException;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.PriorityQueue;

 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiTerms;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.search.FuzzyTermsEnum;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;

 /**
  * Simple automaton-based spellchecker.
  * <p>
  * Candidates are presented directly from the term dictionary, based on
  * Levenshtein distance. This is an alternative to {@link SpellChecker}
  * if you are using an edit-distance-like metric such as Levenshtein
  * or {@link JaroWinklerDistance}.
  * <p>
  * A practical benefit of this spellchecker is that it requires no additional
  * datastructures (neither in RAM nor on disk) to do its work.
  *
  * @see LevenshteinAutomata
  * @see FuzzyTermsEnum
  *
  * @lucene.experimental
  */
 public class DirectSpellChecker {
   /** The default StringDistance, Damerau-Levenshtein distance implemented internally
    *  via {@link LevenshteinAutomata}.
    *  <p>
    *  Note: this is the fastest distance metric, because Damerau-Levenshtein is used
    *  to draw candidates from the term dictionary: this just re-uses the scoring.
    */
   public static final StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();

   /** maximum edit distance for candidate terms */
   private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
   /** minimum prefix for candidate terms */
   private int minPrefix = 1;
   /** maximum number of top-N inspections per suggestion */
   private int maxInspections = 5;
   /** minimum accuracy for a term to match */
   private float accuracy = SpellChecker.DEFAULT_ACCURACY;
   /** value in [0..1] (or absolute number &gt;= 1) representing the minimum
     * number of documents (of the total) where a term should appear. */
   private float thresholdFrequency = 0f;
   /** minimum length of a query word to return suggestions */
   private int minQueryLength = 4;
   /** maximum length of a query word to return suggestions */
   private int maxQueryLength = Integer.MAX_VALUE;
   /** value in [0..1] (or absolute number &gt;= 1) representing the maximum
    *  number of documents (of the total) a query term can appear in to
    *  be corrected. */
   private float maxQueryFrequency = 0.01f;
   /** true if the spellchecker should lowercase terms */
   private boolean lowerCaseTerms = true;
   /** the comparator to use */
   private Comparator<SuggestWord> comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
   /** the string distance to use */
   private StringDistance distance = INTERNAL_LEVENSHTEIN;

   /** Creates a DirectSpellChecker with default configuration values */
   public DirectSpellChecker() {}

   /** Get the maximum number of Levenshtein edit-distances to draw
    *  candidate terms from. */
   public int getMaxEdits() {
     return maxEdits;
   }

   /** Sets the maximum number of Levenshtein edit-distances to draw
    *  candidate terms from. This value can be 1 or 2. The default is 2.
    *  <p>
    *  Note: a large number of spelling errors occur with an edit distance
    *  of 1, by setting this value to 1 you can increase both performance
    *  and precision at the cost of recall.
    */
   public void setMaxEdits(int maxEdits) {
     if (maxEdits < 1 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
       throw new UnsupportedOperationException("Invalid maxEdits");
     this.maxEdits = maxEdits;
   }

   /**
    * Get the minimal number of characters that must match exactly
    */
   public int getMinPrefix() {
     return minPrefix;
   }

   /**
    * Sets the minimal number of initial characters (default: 1)
    * that must match exactly.
    * <p>
    * This can improve both performance and accuracy of results,
    * as misspellings are commonly not the first character.
    */
   public void setMinPrefix(int minPrefix) {
     this.minPrefix = minPrefix;
   }

   /**
    * Get the maximum number of top-N inspections per suggestion
    */
   public int getMaxInspections() {
     return maxInspections;
   }

   /**
    * Set the maximum number of top-N inspections (default: 5) per suggestion.
    * <p>
    * Increasing this number can improve the accuracy of results, at the cost
    * of performance.
    */
   public void setMaxInspections(int maxInspections) {
     this.maxInspections = maxInspections;
   }

   /**
    * Get the minimal accuracy from the StringDistance for a match
    */
   public float getAccuracy() {
     return accuracy;
   }

   /**
    * Set the minimal accuracy required (default: 0.5f) from a StringDistance
    * for a suggestion match.
    */
   public void setAccuracy(float accuracy) {
     this.accuracy = accuracy;
   }

   /**
    * Get the minimal threshold of documents a term must appear for a match
    */
   public float getThresholdFrequency() {
     return thresholdFrequency;
   }

   /**
    * Set the minimal threshold of documents a term must appear for a match.
    * <p>
    * This can improve quality by only suggesting high-frequency terms. Note that
    * very high values might decrease performance slightly, by forcing the spellchecker
    * to draw more candidates from the term dictionary, but a practical value such
    * as <code>1</code> can be very useful towards improving quality.
    * <p>
    * This can be specified as a relative percentage of documents such as 0.5f,
    * or it can be specified as an absolute whole document frequency, such as 4f.
    * Absolute document frequencies may not be fractional.
    */
   public void setThresholdFrequency(float thresholdFrequency) {
     if (thresholdFrequency >= 1f && thresholdFrequency != (int) thresholdFrequency)
       throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed");
     this.thresholdFrequency = thresholdFrequency;
   }

   /** Get the minimum length of a query term needed to return suggestions */
   public int getMinQueryLength() {
     return minQueryLength;
   }

   /**
    * Set the minimum length of a query term (default: 4) needed to return suggestions.
    * <p>
    * Very short query terms will often cause only bad suggestions with any distance
    * metric.
    */
   public void setMinQueryLength(int minQueryLength) {
     if (minQueryLength > this.maxQueryLength)
       throw new IllegalArgumentException("minQueryLength must not be greater than maxQueryLength");
     this.minQueryLength = minQueryLength;
   }

   /** Get the maximum length of a query term to return suggestions */
   public int getMaxQueryLength() {
     return maxQueryLength;
   }

   /**
    * Set the maximum length of a query term to return suggestions.
    * <p>
    * Long queries can be expensive to process and/or trigger exceptions.
    */
   public void setMaxQueryLength(int maxQueryLength) {
     if (maxQueryLength < this.minQueryLength)
       throw new IllegalArgumentException("maxQueryLength must not be smaller than minQueryLength");
     this.maxQueryLength = maxQueryLength;
   }

   /**
    * Get the maximum threshold of documents a query term can appear in order
    * to provide suggestions.
    */
   public float getMaxQueryFrequency() {
     return maxQueryFrequency;
   }

   /**
    * Set the maximum threshold (default: 0.01f) of documents a query term can
    * appear in order to provide suggestions.
    * <p>
    * Very high-frequency terms are typically spelled correctly. Additionally,
    * this can increase performance as it will do no work for the common case
    * of correctly-spelled input terms.
    * <p>
    * This can be specified as a relative percentage of documents such as 0.5f,
    * or it can be specified as an absolute whole document frequency, such as 4f.
    * Absolute document frequencies may not be fractional.
    */
   public void setMaxQueryFrequency(float maxQueryFrequency) {
     if (maxQueryFrequency >= 1f && maxQueryFrequency != (int) maxQueryFrequency)
       throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed");
     this.maxQueryFrequency = maxQueryFrequency;
   }

   /** true if the spellchecker should lowercase terms */
   public boolean getLowerCaseTerms() {
     return lowerCaseTerms;
   }

   /**
    * True if the spellchecker should lowercase terms (default: true)
    * <p>
    * This is a convenience method, if your index field has more complicated
    * analysis (such as StandardTokenizer removing punctuation), it's probably
    * better to turn this off, and instead run your query terms through your
    * Analyzer first.
    * <p>
    * If this option is not on, case differences count as an edit!
    */
   public void setLowerCaseTerms(boolean lowerCaseTerms) {
     this.lowerCaseTerms = lowerCaseTerms;
   }

   /**
    * Get the current comparator in use.
    */
   public Comparator<SuggestWord> getComparator() {
     return comparator;
   }

   /**
    * Set the comparator for sorting suggestions.
    * The default is {@link SuggestWordQueue#DEFAULT_COMPARATOR}
    */
   public void setComparator(Comparator<SuggestWord> comparator) {
     this.comparator = comparator;
   }

   /**
    * Get the string distance metric in use.
    */
   public StringDistance getDistance() {
     return distance;
   }

   /**
    * Set the string distance metric.
    * The default is {@link #INTERNAL_LEVENSHTEIN}
    * <p>
    * Note: because this spellchecker draws its candidates from the term
    * dictionary using Damerau-Levenshtein, it works best with an edit-distance-like
    * string metric. If you use a different metric than the default,
    * you might want to consider increasing {@link #setMaxInspections(int)}
    * to draw more candidates for your metric to rank.
    */
   public void setDistance(StringDistance distance) {
     this.distance = distance;
   }

   /**
    * Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode)
    *       suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)}
    */
   public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir)
      throws IOException {
     return suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
   }

   /**
    * Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode, float)
    *       suggestSimilar(term, numSug, ir, suggestMode, this.accuracy)}
    *
    */
   public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
       SuggestMode suggestMode) throws IOException {
     return suggestSimilar(term, numSug, ir, suggestMode, this.accuracy);
   }

   /**
    * Suggest similar words.
    *
    * <p>Unlike {@link SpellChecker}, the similarity used to fetch the most
    * relevant terms is an edit distance, therefore typically a low value
    * for numSug will work very well.
    *
    * @param term Term you want to spell check on
    * @param numSug the maximum number of suggested words
    * @param ir IndexReader to find terms from
    * @param suggestMode specifies when to return suggested words
    * @param accuracy return only suggested words that match with this similarity
    * @return sorted list of the suggested words according to the comparator
    * @throws IOException If there is a low-level I/O error.
    */
   public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
       SuggestMode suggestMode, float accuracy) throws IOException {
     final CharsRefBuilder spare = new CharsRefBuilder();
     String text = term.text();

     int textLength = text.codePointCount(0, text.length());
     if (textLength < minQueryLength || textLength > maxQueryLength)
       return new SuggestWord[0];

     if (lowerCaseTerms) {
       term = new Term(term.field(), text.toLowerCase(Locale.ROOT));
     }

     int docfreq = ir.docFreq(term);

     if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) {
       return new SuggestWord[0];
     }

     int maxDoc = ir.maxDoc();

     if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) {
       return new SuggestWord[0];
     } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) {
       return new SuggestWord[0];
     }

     if (suggestMode!=SuggestMode.SUGGEST_MORE_POPULAR) docfreq = 0;

     if (thresholdFrequency >= 1f) {
       docfreq = Math.max(docfreq, (int) thresholdFrequency);
     } else if (thresholdFrequency > 0f) {
       docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1);
     }

     Collection<ScoreTerm> terms = null;
     int inspections = numSug * maxInspections;

     // try ed=1 first, in case we get lucky
     terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
     if (maxEdits > 1 && terms.size() < inspections) {
       HashSet<ScoreTerm> moreTerms = new HashSet<>();
       moreTerms.addAll(terms);
       moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
       terms = moreTerms;
     }

     // create the suggestword response, sort it, and trim it to size.

     SuggestWord suggestions[] = new SuggestWord[terms.size()];
     int index = suggestions.length - 1;
     for (ScoreTerm s : terms) {
       SuggestWord suggestion = new SuggestWord();
       if (s.termAsString == null) {
         spare.copyUTF8Bytes(s.term);
         s.termAsString = spare.toString();
       }
       suggestion.string = s.termAsString;
       suggestion.score = s.score;
       suggestion.freq = s.docfreq;
       suggestions[index--] = suggestion;
     }

     ArrayUtil.timSort(suggestions, Collections.reverseOrder(comparator));
     if (numSug < suggestions.length) {
       SuggestWord trimmed[] = new SuggestWord[numSug];
       System.arraycopy(suggestions, 0, trimmed, 0, numSug);
       suggestions = trimmed;
     }
     return suggestions;
   }

   /**
    * Provide spelling corrections based on several parameters.
    *
    * @param term The term to suggest spelling corrections for
    * @param numSug The maximum number of spelling corrections
    * @param ir The index reader to fetch the candidate spelling corrections from
    * @param docfreq The minimum document frequency a potential suggestion need to have in order to be included
    * @param editDistance The maximum edit distance candidates are allowed to have
    * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included
    * @param spare a chars scratch
    * @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order.
    * @throws IOException If I/O related errors occur
    */
   protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
                                                  float accuracy, final CharsRefBuilder spare) throws IOException {

     Terms terms = MultiTerms.getTerms(ir, term.field());
     if (terms == null) {
       return Collections.emptyList();
     }
     FuzzyTermsEnum e = new FuzzyTermsEnum(terms, term, editDistance, Math.max(minPrefix, editDistance - 1), true);
     final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();

     BytesRef queryTerm = new BytesRef(term.text());
     BytesRef candidateTerm;
     ScoreTerm st = new ScoreTerm();
     while ((candidateTerm = e.next()) != null) {
       // For FuzzyQuery, boost is the score:
       float score = e.getBoost();
       // ignore uncompetitive hits
       if (stQueue.size() >= numSug && score <= stQueue.peek().boost) {
         continue;
       }

       // ignore exact match of the same term
       if (queryTerm.bytesEquals(candidateTerm)) {
         continue;
       }

       int df = e.docFreq();

       // check docFreq if required
       if (df <= docfreq) {
         continue;
       }

       final String termAsString;
       if (distance == INTERNAL_LEVENSHTEIN) {
         // delay creating strings until the end
         termAsString = null;
       } else {
         spare.copyUTF8Bytes(candidateTerm);
         termAsString = spare.toString();
         score = distance.getDistance(term.text(), termAsString);
       }

       if (score < accuracy) {
         continue;
       }

       // add new entry in PQ
       st.term = BytesRef.deepCopyOf(candidateTerm);
       st.boost = score;
       st.docfreq = df;
       st.termAsString = termAsString;
       st.score = score;
       stQueue.offer(st);
       // possibly drop entries from queue
       st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
       e.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
     }

     return stQueue;
   }

   /**
    * Holds a spelling correction for internal usage inside {@link DirectSpellChecker}.
    */
   protected static class ScoreTerm implements Comparable<ScoreTerm> {

     /**
      * The actual spellcheck correction.
      */
     public BytesRef term;

     /**
      * The boost representing the similarity from the FuzzyTermsEnum (internal similarity score)
      */
     public float boost;

     /**
      * The df of the spellcheck correction.
      */
     public int docfreq;

     /**
      * The spellcheck correction represented as string, can be <code>null</code>.
      */
     public String termAsString;

     /**
      * The similarity score.
      */
     public float score;

     /**
      * Constructor.
      */
     public ScoreTerm() {
     }

     @Override
     public int compareTo(ScoreTerm other) {
       if (term.bytesEquals(other.term))
         return 0; // consistent with equals
       if (this.boost == other.boost)
         return other.term.compareTo(this.term);
       else
         return Float.compare(this.boost, other.boost);
     }

     @Override
     public int hashCode() {
       final int prime = 31;
       int result = 1;
       result = prime * result + ((term == null) ? 0 : term.hashCode());
       return result;
     }

     @Override
     public boolean equals(Object obj) {
       if (this == obj) return true;
       if (obj == null) return false;
       if (getClass() != obj.getClass()) return false;
       ScoreTerm other = (ScoreTerm) obj;
       if (term == null) {
         if (other.term != null) return false;
       } else if (!term.bytesEquals(other.term)) return false;
       return true;
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.spell;

	import java.io.IOException;
	import java.util.Collection;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.HashSet;
	import java.util.Locale;
	import java.util.PriorityQueue;

	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.MultiTerms;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.index.Terms;
	import org.apache.lucene.search.FuzzyTermsEnum;
	import org.apache.lucene.util.ArrayUtil;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.CharsRefBuilder;
	import org.apache.lucene.util.automaton.LevenshteinAutomata;

	/**
	* Simple automaton-based spellchecker.
	* <p>
	* Candidates are presented directly from the term dictionary, based on
	* Levenshtein distance. This is an alternative to {@link SpellChecker}
	* if you are using an edit-distance-like metric such as Levenshtein
	* or {@link JaroWinklerDistance}.
	* <p>
	* A practical benefit of this spellchecker is that it requires no additional
	* datastructures (neither in RAM nor on disk) to do its work.
	*
	* @see LevenshteinAutomata
	* @see FuzzyTermsEnum
	*
	* @lucene.experimental
	*/
	public class DirectSpellChecker {
	/** The default StringDistance, Damerau-Levenshtein distance implemented internally
	* via {@link LevenshteinAutomata}.
	* <p>
	* Note: this is the fastest distance metric, because Damerau-Levenshtein is used
	* to draw candidates from the term dictionary: this just re-uses the scoring.
	*/
	public static final StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();

	/** maximum edit distance for candidate terms */
	private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
	/** minimum prefix for candidate terms */
	private int minPrefix = 1;
	/** maximum number of top-N inspections per suggestion */
	private int maxInspections = 5;
	/** minimum accuracy for a term to match */
	private float accuracy = SpellChecker.DEFAULT_ACCURACY;
	/** value in [0..1] (or absolute number >= 1) representing the minimum
	* number of documents (of the total) where a term should appear. */
	private float thresholdFrequency = 0f;
	/** minimum length of a query word to return suggestions */
	private int minQueryLength = 4;
	/** maximum length of a query word to return suggestions */
	private int maxQueryLength = Integer.MAX_VALUE;
	/** value in [0..1] (or absolute number >= 1) representing the maximum
	* number of documents (of the total) a query term can appear in to
	* be corrected. */
	private float maxQueryFrequency = 0.01f;
	/** true if the spellchecker should lowercase terms */
	private boolean lowerCaseTerms = true;
	/** the comparator to use */
	private Comparator<SuggestWord> comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
	/** the string distance to use */
	private StringDistance distance = INTERNAL_LEVENSHTEIN;

	/** Creates a DirectSpellChecker with default configuration values */
	public DirectSpellChecker() {}

	/** Get the maximum number of Levenshtein edit-distances to draw
	* candidate terms from. */
	public int getMaxEdits() {
	return maxEdits;
	}

	/** Sets the maximum number of Levenshtein edit-distances to draw
	* candidate terms from. This value can be 1 or 2. The default is 2.
	* <p>
	* Note: a large number of spelling errors occur with an edit distance
	* of 1, by setting this value to 1 you can increase both performance
	* and precision at the cost of recall.
	*/
	public void setMaxEdits(int maxEdits) {
	if (maxEdits < 1 \|\| maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
	throw new UnsupportedOperationException("Invalid maxEdits");
	this.maxEdits = maxEdits;
	}

	/**
	* Get the minimal number of characters that must match exactly
	*/
	public int getMinPrefix() {
	return minPrefix;
	}

	/**
	* Sets the minimal number of initial characters (default: 1)
	* that must match exactly.
	* <p>
	* This can improve both performance and accuracy of results,
	* as misspellings are commonly not the first character.
	*/
	public void setMinPrefix(int minPrefix) {
	this.minPrefix = minPrefix;
	}

	/**
	* Get the maximum number of top-N inspections per suggestion
	*/
	public int getMaxInspections() {
	return maxInspections;
	}

	/**
	* Set the maximum number of top-N inspections (default: 5) per suggestion.
	* <p>
	* Increasing this number can improve the accuracy of results, at the cost
	* of performance.
	*/
	public void setMaxInspections(int maxInspections) {
	this.maxInspections = maxInspections;
	}

	/**
	* Get the minimal accuracy from the StringDistance for a match
	*/
	public float getAccuracy() {
	return accuracy;
	}

	/**
	* Set the minimal accuracy required (default: 0.5f) from a StringDistance
	* for a suggestion match.
	*/
	public void setAccuracy(float accuracy) {
	this.accuracy = accuracy;
	}

	/**
	* Get the minimal threshold of documents a term must appear for a match
	*/
	public float getThresholdFrequency() {
	return thresholdFrequency;
	}

	/**
	* Set the minimal threshold of documents a term must appear for a match.
	* <p>
	* This can improve quality by only suggesting high-frequency terms. Note that
	* very high values might decrease performance slightly, by forcing the spellchecker
	* to draw more candidates from the term dictionary, but a practical value such
	* as <code>1</code> can be very useful towards improving quality.
	* <p>
	* This can be specified as a relative percentage of documents such as 0.5f,
	* or it can be specified as an absolute whole document frequency, such as 4f.
	* Absolute document frequencies may not be fractional.
	*/
	public void setThresholdFrequency(float thresholdFrequency) {
	if (thresholdFrequency >= 1f && thresholdFrequency != (int) thresholdFrequency)
	throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed");
	this.thresholdFrequency = thresholdFrequency;
	}

	/** Get the minimum length of a query term needed to return suggestions */
	public int getMinQueryLength() {
	return minQueryLength;
	}

	/**
	* Set the minimum length of a query term (default: 4) needed to return suggestions.
	* <p>
	* Very short query terms will often cause only bad suggestions with any distance
	* metric.
	*/
	public void setMinQueryLength(int minQueryLength) {
	if (minQueryLength > this.maxQueryLength)
	throw new IllegalArgumentException("minQueryLength must not be greater than maxQueryLength");
	this.minQueryLength = minQueryLength;
	}

	/** Get the maximum length of a query term to return suggestions */
	public int getMaxQueryLength() {
	return maxQueryLength;
	}

	/**
	* Set the maximum length of a query term to return suggestions.
	* <p>
	* Long queries can be expensive to process and/or trigger exceptions.
	*/
	public void setMaxQueryLength(int maxQueryLength) {
	if (maxQueryLength < this.minQueryLength)
	throw new IllegalArgumentException("maxQueryLength must not be smaller than minQueryLength");
	this.maxQueryLength = maxQueryLength;
	}

	/**
	* Get the maximum threshold of documents a query term can appear in order
	* to provide suggestions.
	*/
	public float getMaxQueryFrequency() {
	return maxQueryFrequency;
	}

	/**
	* Set the maximum threshold (default: 0.01f) of documents a query term can
	* appear in order to provide suggestions.
	* <p>
	* Very high-frequency terms are typically spelled correctly. Additionally,
	* this can increase performance as it will do no work for the common case
	* of correctly-spelled input terms.
	* <p>
	* This can be specified as a relative percentage of documents such as 0.5f,
	* or it can be specified as an absolute whole document frequency, such as 4f.
	* Absolute document frequencies may not be fractional.
	*/
	public void setMaxQueryFrequency(float maxQueryFrequency) {
	if (maxQueryFrequency >= 1f && maxQueryFrequency != (int) maxQueryFrequency)
	throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed");
	this.maxQueryFrequency = maxQueryFrequency;
	}

	/** true if the spellchecker should lowercase terms */
	public boolean getLowerCaseTerms() {
	return lowerCaseTerms;
	}

	/**
	* True if the spellchecker should lowercase terms (default: true)
	* <p>
	* This is a convenience method, if your index field has more complicated
	* analysis (such as StandardTokenizer removing punctuation), it's probably
	* better to turn this off, and instead run your query terms through your
	* Analyzer first.
	* <p>
	* If this option is not on, case differences count as an edit!
	*/
	public void setLowerCaseTerms(boolean lowerCaseTerms) {
	this.lowerCaseTerms = lowerCaseTerms;
	}

	/**
	* Get the current comparator in use.
	*/
	public Comparator<SuggestWord> getComparator() {
	return comparator;
	}

	/**
	* Set the comparator for sorting suggestions.
	* The default is {@link SuggestWordQueue#DEFAULT_COMPARATOR}
	*/
	public void setComparator(Comparator<SuggestWord> comparator) {
	this.comparator = comparator;
	}

	/**
	* Get the string distance metric in use.
	*/
	public StringDistance getDistance() {
	return distance;
	}

	/**
	* Set the string distance metric.
	* The default is {@link #INTERNAL_LEVENSHTEIN}
	* <p>
	* Note: because this spellchecker draws its candidates from the term
	* dictionary using Damerau-Levenshtein, it works best with an edit-distance-like
	* string metric. If you use a different metric than the default,
	* you might want to consider increasing {@link #setMaxInspections(int)}
	* to draw more candidates for your metric to rank.
	*/
	public void setDistance(StringDistance distance) {
	this.distance = distance;
	}

	/**
	* Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode)
	* suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)}
	*/
	public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir)
	throws IOException {
	return suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
	}

	/**
	* Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode, float)
	* suggestSimilar(term, numSug, ir, suggestMode, this.accuracy)}
	*
	*/
	public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
	SuggestMode suggestMode) throws IOException {
	return suggestSimilar(term, numSug, ir, suggestMode, this.accuracy);
	}

	/**
	* Suggest similar words.
	*
	* <p>Unlike {@link SpellChecker}, the similarity used to fetch the most
	* relevant terms is an edit distance, therefore typically a low value
	* for numSug will work very well.
	*
	* @param term Term you want to spell check on
	* @param numSug the maximum number of suggested words
	* @param ir IndexReader to find terms from
	* @param suggestMode specifies when to return suggested words
	* @param accuracy return only suggested words that match with this similarity
	* @return sorted list of the suggested words according to the comparator
	* @throws IOException If there is a low-level I/O error.
	*/
	public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
	SuggestMode suggestMode, float accuracy) throws IOException {
	final CharsRefBuilder spare = new CharsRefBuilder();
	String text = term.text();

	int textLength = text.codePointCount(0, text.length());
	if (textLength < minQueryLength \|\| textLength > maxQueryLength)
	return new SuggestWord[0];

	if (lowerCaseTerms) {
	term = new Term(term.field(), text.toLowerCase(Locale.ROOT));
	}

	int docfreq = ir.docFreq(term);

	if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) {
	return new SuggestWord[0];
	}

	int maxDoc = ir.maxDoc();

	if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) {
	return new SuggestWord[0];
	} else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) {
	return new SuggestWord[0];
	}

	if (suggestMode!=SuggestMode.SUGGEST_MORE_POPULAR) docfreq = 0;

	if (thresholdFrequency >= 1f) {
	docfreq = Math.max(docfreq, (int) thresholdFrequency);
	} else if (thresholdFrequency > 0f) {
	docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1);
	}

	Collection<ScoreTerm> terms = null;
	int inspections = numSug * maxInspections;

	// try ed=1 first, in case we get lucky
	terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
	if (maxEdits > 1 && terms.size() < inspections) {
	HashSet<ScoreTerm> moreTerms = new HashSet<>();
	moreTerms.addAll(terms);
	moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
	terms = moreTerms;
	}

	// create the suggestword response, sort it, and trim it to size.

	SuggestWord suggestions[] = new SuggestWord[terms.size()];
	int index = suggestions.length - 1;
	for (ScoreTerm s : terms) {
	SuggestWord suggestion = new SuggestWord();
	if (s.termAsString == null) {
	spare.copyUTF8Bytes(s.term);
	s.termAsString = spare.toString();
	}
	suggestion.string = s.termAsString;
	suggestion.score = s.score;
	suggestion.freq = s.docfreq;
	suggestions[index--] = suggestion;
	}

	ArrayUtil.timSort(suggestions, Collections.reverseOrder(comparator));
	if (numSug < suggestions.length) {
	SuggestWord trimmed[] = new SuggestWord[numSug];
	System.arraycopy(suggestions, 0, trimmed, 0, numSug);
	suggestions = trimmed;
	}
	return suggestions;
	}

	/**
	* Provide spelling corrections based on several parameters.
	*
	* @param term The term to suggest spelling corrections for
	* @param numSug The maximum number of spelling corrections
	* @param ir The index reader to fetch the candidate spelling corrections from
	* @param docfreq The minimum document frequency a potential suggestion need to have in order to be included
	* @param editDistance The maximum edit distance candidates are allowed to have
	* @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included
	* @param spare a chars scratch
	* @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order.
	* @throws IOException If I/O related errors occur
	*/
	protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
	float accuracy, final CharsRefBuilder spare) throws IOException {

	Terms terms = MultiTerms.getTerms(ir, term.field());
	if (terms == null) {
	return Collections.emptyList();
	}
	FuzzyTermsEnum e = new FuzzyTermsEnum(terms, term, editDistance, Math.max(minPrefix, editDistance - 1), true);
	final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();

	BytesRef queryTerm = new BytesRef(term.text());
	BytesRef candidateTerm;
	ScoreTerm st = new ScoreTerm();
	while ((candidateTerm = e.next()) != null) {
	// For FuzzyQuery, boost is the score:
	float score = e.getBoost();
	// ignore uncompetitive hits
	if (stQueue.size() >= numSug && score <= stQueue.peek().boost) {
	continue;
	}

	// ignore exact match of the same term
	if (queryTerm.bytesEquals(candidateTerm)) {
	continue;
	}

	int df = e.docFreq();

	// check docFreq if required
	if (df <= docfreq) {
	continue;
	}

	final String termAsString;
	if (distance == INTERNAL_LEVENSHTEIN) {
	// delay creating strings until the end
	termAsString = null;
	} else {
	spare.copyUTF8Bytes(candidateTerm);
	termAsString = spare.toString();
	score = distance.getDistance(term.text(), termAsString);
	}

	if (score < accuracy) {
	continue;
	}

	// add new entry in PQ
	st.term = BytesRef.deepCopyOf(candidateTerm);
	st.boost = score;
	st.docfreq = df;
	st.termAsString = termAsString;
	st.score = score;
	stQueue.offer(st);
	// possibly drop entries from queue
	st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
	e.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
	}

	return stQueue;
	}

	/**
	* Holds a spelling correction for internal usage inside {@link DirectSpellChecker}.
	*/
	protected static class ScoreTerm implements Comparable<ScoreTerm> {

	/**
	* The actual spellcheck correction.
	*/
	public BytesRef term;

	/**
	* The boost representing the similarity from the FuzzyTermsEnum (internal similarity score)
	*/
	public float boost;

	/**
	* The df of the spellcheck correction.
	*/
	public int docfreq;

	/**
	* The spellcheck correction represented as string, can be <code>null</code>.
	*/
	public String termAsString;

	/**
	* The similarity score.
	*/
	public float score;

	/**
	* Constructor.
	*/
	public ScoreTerm() {
	}

	@Override
	public int compareTo(ScoreTerm other) {
	if (term.bytesEquals(other.term))
	return 0; // consistent with equals
	if (this.boost == other.boost)
	return other.term.compareTo(this.term);
	else
	return Float.compare(this.boost, other.boost);
	}

	@Override
	public int hashCode() {
	final int prime = 31;
	int result = 1;
	result = prime * result + ((term == null) ? 0 : term.hashCode());
	return result;
	}

	@Override
	public boolean equals(Object obj) {
	if (this == obj) return true;
	if (obj == null) return false;
	if (getClass() != obj.getClass()) return false;
	ScoreTerm other = (ScoreTerm) obj;
	if (term == null) {
	if (other.term != null) return false;
	} else if (!term.bytesEquals(other.term)) return false;
	return true;
	}
	}
	}