solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.solr.spelling;

 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;

 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.spell.DirectSpellChecker;
 import org.apache.lucene.search.spell.StringDistance;
 import org.apache.lucene.search.spell.SuggestWord;
 import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
 import org.apache.lucene.search.spell.SuggestWordQueue;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Spellchecker implementation that uses {@link DirectSpellChecker}
  * <p>
  * Requires no auxiliary index or data structure.
  * <p>
  * Supported options:
  * <ul>
  *   <li>field: Used as the source of terms.
  *   <li>distanceMeasure: Sets {@link DirectSpellChecker#setDistance(StringDistance)}.
  *       Note: to set the default {@link DirectSpellChecker#INTERNAL_LEVENSHTEIN}, use "internal".
  *   <li>accuracy: Sets {@link DirectSpellChecker#setAccuracy(float)}.
  *   <li>maxEdits: Sets {@link DirectSpellChecker#setMaxEdits(int)}.
  *   <li>minPrefix: Sets {@link DirectSpellChecker#setMinPrefix(int)}.
  *   <li>maxInspections: Sets {@link DirectSpellChecker#setMaxInspections(int)}.
  *   <li>comparatorClass: Sets {@link DirectSpellChecker#setComparator(Comparator)}.
  *       Note: score-then-frequency can be specified as "score" and frequency-then-score
  *       can be specified as "freq".
  *   <li>thresholdTokenFrequency: sets {@link DirectSpellChecker#setThresholdFrequency(float)}.
  *   <li>minQueryLength: sets {@link DirectSpellChecker#setMinQueryLength(int)}.
  *   <li>maxQueryLength: sets {@link DirectSpellChecker#setMaxQueryLength(int)}.
  *   <li>maxQueryFrequency: sets {@link DirectSpellChecker#setMaxQueryFrequency(float)}.
  * </ul>
  * @see DirectSpellChecker
  */
 public class DirectSolrSpellChecker extends SolrSpellChecker {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

   // configuration params shared with other spellcheckers
   public static final String COMPARATOR_CLASS = AbstractLuceneSpellChecker.COMPARATOR_CLASS;
   public static final String SCORE_COMP = AbstractLuceneSpellChecker.SCORE_COMP;
   public static final String FREQ_COMP = AbstractLuceneSpellChecker.FREQ_COMP;
   public static final String STRING_DISTANCE = AbstractLuceneSpellChecker.STRING_DISTANCE;
   public static final String ACCURACY = AbstractLuceneSpellChecker.ACCURACY;
   public static final String THRESHOLD_TOKEN_FREQUENCY = IndexBasedSpellChecker.THRESHOLD_TOKEN_FREQUENCY;

   public static final String INTERNAL_DISTANCE = "internal";
   public static final float DEFAULT_ACCURACY = 0.5f;
   public static final float DEFAULT_THRESHOLD_TOKEN_FREQUENCY = 0.0f;

   public static final String MAXEDITS = "maxEdits";
   public static final int DEFAULT_MAXEDITS = 2;

   // params specific to this implementation
   public static final String MINPREFIX = "minPrefix";
   public static final int DEFAULT_MINPREFIX = 1;

   public static final String MAXINSPECTIONS = "maxInspections";
   public static final int DEFAULT_MAXINSPECTIONS = 5;

   public static final String MINQUERYLENGTH = "minQueryLength";
   public static final int DEFAULT_MINQUERYLENGTH = 4;

   public static final String MAXQUERYLENGTH = "maxQueryLength";
   public static final int DEFAULT_MAXQUERYLENGTH = Integer.MAX_VALUE;

   public static final String MAXQUERYFREQUENCY = "maxQueryFrequency";
   public static final float DEFAULT_MAXQUERYFREQUENCY = 0.01f;

   private DirectSpellChecker checker = new DirectSpellChecker();

   @Override
   @SuppressWarnings({"unchecked"})
   public String init(@SuppressWarnings({"rawtypes"})NamedList config, SolrCore core) {

     SolrParams params = config.toSolrParams();

     log.info("init: {}", config);
     String name = super.init(config, core);

     Comparator<SuggestWord> comp = SuggestWordQueue.DEFAULT_COMPARATOR;
     String compClass = (String) config.get(COMPARATOR_CLASS);
     if (compClass != null) {
       if (compClass.equalsIgnoreCase(SCORE_COMP))
         comp = SuggestWordQueue.DEFAULT_COMPARATOR;
       else if (compClass.equalsIgnoreCase(FREQ_COMP))
         comp = new SuggestWordFrequencyComparator();
       else //must be a FQCN
         comp = (Comparator<SuggestWord>) core.getResourceLoader().newInstance(compClass, Comparator.class);
     }

     StringDistance sd = DirectSpellChecker.INTERNAL_LEVENSHTEIN;
     String distClass = (String) config.get(STRING_DISTANCE);
     if (distClass != null && !distClass.equalsIgnoreCase(INTERNAL_DISTANCE))
       sd = core.getResourceLoader().newInstance(distClass, StringDistance.class);

     float minAccuracy = DEFAULT_ACCURACY;
     Float accuracy = params.getFloat(ACCURACY);
     if (accuracy != null)
       minAccuracy = accuracy;

     int maxEdits = DEFAULT_MAXEDITS;
     Integer edits = params.getInt(MAXEDITS);
     if (edits != null)
       maxEdits = edits;

     int minPrefix = DEFAULT_MINPREFIX;
     Integer prefix = params.getInt(MINPREFIX);
     if (prefix != null)
       minPrefix = prefix;

     int maxInspections = DEFAULT_MAXINSPECTIONS;
     Integer inspections = params.getInt(MAXINSPECTIONS);
     if (inspections != null)
       maxInspections = inspections;

     float minThreshold = DEFAULT_THRESHOLD_TOKEN_FREQUENCY;
     Float threshold = params.getFloat(THRESHOLD_TOKEN_FREQUENCY);
     if (threshold != null)
       minThreshold = threshold;

     int minQueryLength = DEFAULT_MINQUERYLENGTH;
     Integer queryLength = params.getInt(MINQUERYLENGTH);
     if (queryLength != null)
       minQueryLength = queryLength;

     int maxQueryLength = DEFAULT_MAXQUERYLENGTH;
     Integer overriddenMaxQueryLength = params.getInt(MAXQUERYLENGTH);
     if (overriddenMaxQueryLength != null)
       maxQueryLength = overriddenMaxQueryLength;

     float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY;
     Float queryFreq = params.getFloat(MAXQUERYFREQUENCY);
     if (queryFreq != null)
       maxQueryFrequency = queryFreq;

     checker.setComparator(comp);
     checker.setDistance(sd);
     checker.setMaxEdits(maxEdits);
     checker.setMinPrefix(minPrefix);
     checker.setAccuracy(minAccuracy);
     checker.setThresholdFrequency(minThreshold);
     checker.setMaxInspections(maxInspections);
     checker.setMinQueryLength(minQueryLength);
     checker.setMaxQueryLength(maxQueryLength);
     checker.setMaxQueryFrequency(maxQueryFrequency);
     checker.setLowerCaseTerms(false);

     return name;
   }

   @Override
   public void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException {}

   @Override
   public void build(SolrCore core, SolrIndexSearcher searcher) throws IOException {}

   @Override
   public SpellingResult getSuggestions(SpellingOptions options)
       throws IOException {
     log.debug("getSuggestions: {}", options.tokens);

     SpellingResult result = new SpellingResult();
     float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy;

     for (Token token : options.tokens) {
       String tokenText = token.toString();
       Term term = new Term(field, tokenText);
       int freq = options.reader.docFreq(term);
       int count = (options.alternativeTermCount > 0 && freq > 0) ? options.alternativeTermCount: options.count;
       SuggestWord[] suggestions = checker.suggestSimilar(term, count,options.reader, options.suggestMode, accuracy);
       result.addFrequency(token, freq);

       // If considering alternatives to "correctly-spelled" terms, then add the
       // original as a viable suggestion.
       if (options.alternativeTermCount > 0 && freq > 0) {
         boolean foundOriginal = false;
         SuggestWord[] suggestionsWithOrig = new SuggestWord[suggestions.length + 1];
         for (int i = 0; i < suggestions.length; i++) {
           if (suggestions[i].string.equals(tokenText)) {
             foundOriginal = true;
             break;
           }
           suggestionsWithOrig[i + 1] = suggestions[i];
         }
         if (!foundOriginal) {
           SuggestWord orig = new SuggestWord();
           orig.freq = freq;
           orig.string = tokenText;
           suggestionsWithOrig[0] = orig;
           suggestions = suggestionsWithOrig;
         }
       }
       if(suggestions.length==0 && freq==0) {
         List<String> empty = Collections.emptyList();
         result.add(token, empty);
       } else {
         for (SuggestWord suggestion : suggestions) {
           result.add(token, suggestion.string, suggestion.freq);
         }
       }
     }
     return result;
   }

   @Override
   public float getAccuracy() {
     return checker.getAccuracy();
   }
   @Override
   public StringDistance getStringDistance() {
     return checker.getDistance();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.solr.spelling;

	import java.io.IOException;
	import java.lang.invoke.MethodHandles;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.List;

	import org.apache.lucene.index.Term;
	import org.apache.lucene.search.spell.DirectSpellChecker;
	import org.apache.lucene.search.spell.StringDistance;
	import org.apache.lucene.search.spell.SuggestWord;
	import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
	import org.apache.lucene.search.spell.SuggestWordQueue;
	import org.apache.solr.common.params.SolrParams;
	import org.apache.solr.common.util.NamedList;
	import org.apache.solr.core.SolrCore;
	import org.apache.solr.search.SolrIndexSearcher;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	* Spellchecker implementation that uses {@link DirectSpellChecker}
	* <p>
	* Requires no auxiliary index or data structure.
	* <p>
	* Supported options:
	* <ul>
	* <li>field: Used as the source of terms.
	* <li>distanceMeasure: Sets {@link DirectSpellChecker#setDistance(StringDistance)}.
	* Note: to set the default {@link DirectSpellChecker#INTERNAL_LEVENSHTEIN}, use "internal".
	* <li>accuracy: Sets {@link DirectSpellChecker#setAccuracy(float)}.
	* <li>maxEdits: Sets {@link DirectSpellChecker#setMaxEdits(int)}.
	* <li>minPrefix: Sets {@link DirectSpellChecker#setMinPrefix(int)}.
	* <li>maxInspections: Sets {@link DirectSpellChecker#setMaxInspections(int)}.
	* <li>comparatorClass: Sets {@link DirectSpellChecker#setComparator(Comparator)}.
	* Note: score-then-frequency can be specified as "score" and frequency-then-score
	* can be specified as "freq".
	* <li>thresholdTokenFrequency: sets {@link DirectSpellChecker#setThresholdFrequency(float)}.
	* <li>minQueryLength: sets {@link DirectSpellChecker#setMinQueryLength(int)}.
	* <li>maxQueryLength: sets {@link DirectSpellChecker#setMaxQueryLength(int)}.
	* <li>maxQueryFrequency: sets {@link DirectSpellChecker#setMaxQueryFrequency(float)}.
	* </ul>
	* @see DirectSpellChecker
	*/
	public class DirectSolrSpellChecker extends SolrSpellChecker {
	private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

	// configuration params shared with other spellcheckers
	public static final String COMPARATOR_CLASS = AbstractLuceneSpellChecker.COMPARATOR_CLASS;
	public static final String SCORE_COMP = AbstractLuceneSpellChecker.SCORE_COMP;
	public static final String FREQ_COMP = AbstractLuceneSpellChecker.FREQ_COMP;
	public static final String STRING_DISTANCE = AbstractLuceneSpellChecker.STRING_DISTANCE;
	public static final String ACCURACY = AbstractLuceneSpellChecker.ACCURACY;
	public static final String THRESHOLD_TOKEN_FREQUENCY = IndexBasedSpellChecker.THRESHOLD_TOKEN_FREQUENCY;

	public static final String INTERNAL_DISTANCE = "internal";
	public static final float DEFAULT_ACCURACY = 0.5f;
	public static final float DEFAULT_THRESHOLD_TOKEN_FREQUENCY = 0.0f;

	public static final String MAXEDITS = "maxEdits";
	public static final int DEFAULT_MAXEDITS = 2;

	// params specific to this implementation
	public static final String MINPREFIX = "minPrefix";
	public static final int DEFAULT_MINPREFIX = 1;

	public static final String MAXINSPECTIONS = "maxInspections";
	public static final int DEFAULT_MAXINSPECTIONS = 5;

	public static final String MINQUERYLENGTH = "minQueryLength";
	public static final int DEFAULT_MINQUERYLENGTH = 4;

	public static final String MAXQUERYLENGTH = "maxQueryLength";
	public static final int DEFAULT_MAXQUERYLENGTH = Integer.MAX_VALUE;

	public static final String MAXQUERYFREQUENCY = "maxQueryFrequency";
	public static final float DEFAULT_MAXQUERYFREQUENCY = 0.01f;

	private DirectSpellChecker checker = new DirectSpellChecker();

	@Override
	@SuppressWarnings({"unchecked"})
	public String init(@SuppressWarnings({"rawtypes"})NamedList config, SolrCore core) {

	SolrParams params = config.toSolrParams();

	log.info("init: {}", config);
	String name = super.init(config, core);

	Comparator<SuggestWord> comp = SuggestWordQueue.DEFAULT_COMPARATOR;
	String compClass = (String) config.get(COMPARATOR_CLASS);
	if (compClass != null) {
	if (compClass.equalsIgnoreCase(SCORE_COMP))
	comp = SuggestWordQueue.DEFAULT_COMPARATOR;
	else if (compClass.equalsIgnoreCase(FREQ_COMP))
	comp = new SuggestWordFrequencyComparator();
	else //must be a FQCN
	comp = (Comparator<SuggestWord>) core.getResourceLoader().newInstance(compClass, Comparator.class);
	}

	StringDistance sd = DirectSpellChecker.INTERNAL_LEVENSHTEIN;
	String distClass = (String) config.get(STRING_DISTANCE);
	if (distClass != null && !distClass.equalsIgnoreCase(INTERNAL_DISTANCE))
	sd = core.getResourceLoader().newInstance(distClass, StringDistance.class);

	float minAccuracy = DEFAULT_ACCURACY;
	Float accuracy = params.getFloat(ACCURACY);
	if (accuracy != null)
	minAccuracy = accuracy;

	int maxEdits = DEFAULT_MAXEDITS;
	Integer edits = params.getInt(MAXEDITS);
	if (edits != null)
	maxEdits = edits;

	int minPrefix = DEFAULT_MINPREFIX;
	Integer prefix = params.getInt(MINPREFIX);
	if (prefix != null)
	minPrefix = prefix;

	int maxInspections = DEFAULT_MAXINSPECTIONS;
	Integer inspections = params.getInt(MAXINSPECTIONS);
	if (inspections != null)
	maxInspections = inspections;

	float minThreshold = DEFAULT_THRESHOLD_TOKEN_FREQUENCY;
	Float threshold = params.getFloat(THRESHOLD_TOKEN_FREQUENCY);
	if (threshold != null)
	minThreshold = threshold;

	int minQueryLength = DEFAULT_MINQUERYLENGTH;
	Integer queryLength = params.getInt(MINQUERYLENGTH);
	if (queryLength != null)
	minQueryLength = queryLength;

	int maxQueryLength = DEFAULT_MAXQUERYLENGTH;
	Integer overriddenMaxQueryLength = params.getInt(MAXQUERYLENGTH);
	if (overriddenMaxQueryLength != null)
	maxQueryLength = overriddenMaxQueryLength;

	float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY;
	Float queryFreq = params.getFloat(MAXQUERYFREQUENCY);
	if (queryFreq != null)
	maxQueryFrequency = queryFreq;

	checker.setComparator(comp);
	checker.setDistance(sd);
	checker.setMaxEdits(maxEdits);
	checker.setMinPrefix(minPrefix);
	checker.setAccuracy(minAccuracy);
	checker.setThresholdFrequency(minThreshold);
	checker.setMaxInspections(maxInspections);
	checker.setMinQueryLength(minQueryLength);
	checker.setMaxQueryLength(maxQueryLength);
	checker.setMaxQueryFrequency(maxQueryFrequency);
	checker.setLowerCaseTerms(false);

	return name;
	}

	@Override
	public void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException {}

	@Override
	public void build(SolrCore core, SolrIndexSearcher searcher) throws IOException {}

	@Override
	public SpellingResult getSuggestions(SpellingOptions options)
	throws IOException {
	log.debug("getSuggestions: {}", options.tokens);

	SpellingResult result = new SpellingResult();
	float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy;

	for (Token token : options.tokens) {
	String tokenText = token.toString();
	Term term = new Term(field, tokenText);
	int freq = options.reader.docFreq(term);
	int count = (options.alternativeTermCount > 0 && freq > 0) ? options.alternativeTermCount: options.count;
	SuggestWord[] suggestions = checker.suggestSimilar(term, count,options.reader, options.suggestMode, accuracy);
	result.addFrequency(token, freq);

	// If considering alternatives to "correctly-spelled" terms, then add the
	// original as a viable suggestion.
	if (options.alternativeTermCount > 0 && freq > 0) {
	boolean foundOriginal = false;
	SuggestWord[] suggestionsWithOrig = new SuggestWord[suggestions.length + 1];
	for (int i = 0; i < suggestions.length; i++) {
	if (suggestions[i].string.equals(tokenText)) {
	foundOriginal = true;
	break;
	}
	suggestionsWithOrig[i + 1] = suggestions[i];
	}
	if (!foundOriginal) {
	SuggestWord orig = new SuggestWord();
	orig.freq = freq;
	orig.string = tokenText;
	suggestionsWithOrig[0] = orig;
	suggestions = suggestionsWithOrig;
	}
	}
	if(suggestions.length==0 && freq==0) {
	List<String> empty = Collections.emptyList();
	result.add(token, empty);
	} else {
	for (SuggestWord suggestion : suggestions) {
	result.add(token, suggestion.string, suggestion.freq);
	}
	}
	}
	return result;
	}

	@Override
	public float getAccuracy() {
	return checker.getAccuracy();
	}
	@Override
	public StringDistance getStringDistance() {
	return checker.getDistance();
	}
	}