blob: 95cf0ea7b50a8c8b645448c5a907cf7e10e4a9b4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Spellchecker implementation that uses {@link DirectSpellChecker}
* <p>
* Requires no auxiliary index or data structure.
* <p>
* Supported options:
* <ul>
* <li>field: Used as the source of terms.
* <li>distanceMeasure: Sets {@link DirectSpellChecker#setDistance(StringDistance)}.
* Note: to set the default {@link DirectSpellChecker#INTERNAL_LEVENSHTEIN}, use "internal".
* <li>accuracy: Sets {@link DirectSpellChecker#setAccuracy(float)}.
* <li>maxEdits: Sets {@link DirectSpellChecker#setMaxEdits(int)}.
* <li>minPrefix: Sets {@link DirectSpellChecker#setMinPrefix(int)}.
* <li>maxInspections: Sets {@link DirectSpellChecker#setMaxInspections(int)}.
* <li>comparatorClass: Sets {@link DirectSpellChecker#setComparator(Comparator)}.
* Note: score-then-frequency can be specified as "score" and frequency-then-score
* can be specified as "freq".
* <li>thresholdTokenFrequency: sets {@link DirectSpellChecker#setThresholdFrequency(float)}.
* <li>minQueryLength: sets {@link DirectSpellChecker#setMinQueryLength(int)}.
* <li>maxQueryLength: sets {@link DirectSpellChecker#setMaxQueryLength(int)}.
* <li>maxQueryFrequency: sets {@link DirectSpellChecker#setMaxQueryFrequency(float)}.
* </ul>
* @see DirectSpellChecker
*/
public class DirectSolrSpellChecker extends SolrSpellChecker {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
// configuration params shared with other spellcheckers
public static final String COMPARATOR_CLASS = AbstractLuceneSpellChecker.COMPARATOR_CLASS;
public static final String SCORE_COMP = AbstractLuceneSpellChecker.SCORE_COMP;
public static final String FREQ_COMP = AbstractLuceneSpellChecker.FREQ_COMP;
public static final String STRING_DISTANCE = AbstractLuceneSpellChecker.STRING_DISTANCE;
public static final String ACCURACY = AbstractLuceneSpellChecker.ACCURACY;
public static final String THRESHOLD_TOKEN_FREQUENCY = IndexBasedSpellChecker.THRESHOLD_TOKEN_FREQUENCY;
public static final String INTERNAL_DISTANCE = "internal";
public static final float DEFAULT_ACCURACY = 0.5f;
public static final float DEFAULT_THRESHOLD_TOKEN_FREQUENCY = 0.0f;
public static final String MAXEDITS = "maxEdits";
public static final int DEFAULT_MAXEDITS = 2;
// params specific to this implementation
public static final String MINPREFIX = "minPrefix";
public static final int DEFAULT_MINPREFIX = 1;
public static final String MAXINSPECTIONS = "maxInspections";
public static final int DEFAULT_MAXINSPECTIONS = 5;
public static final String MINQUERYLENGTH = "minQueryLength";
public static final int DEFAULT_MINQUERYLENGTH = 4;
public static final String MAXQUERYLENGTH = "maxQueryLength";
public static final int DEFAULT_MAXQUERYLENGTH = Integer.MAX_VALUE;
public static final String MAXQUERYFREQUENCY = "maxQueryFrequency";
public static final float DEFAULT_MAXQUERYFREQUENCY = 0.01f;
private DirectSpellChecker checker = new DirectSpellChecker();
@Override
@SuppressWarnings({"unchecked"})
public String init(@SuppressWarnings({"rawtypes"})NamedList config, SolrCore core) {
SolrParams params = config.toSolrParams();
log.info("init: {}", config);
String name = super.init(config, core);
Comparator<SuggestWord> comp = SuggestWordQueue.DEFAULT_COMPARATOR;
String compClass = (String) config.get(COMPARATOR_CLASS);
if (compClass != null) {
if (compClass.equalsIgnoreCase(SCORE_COMP))
comp = SuggestWordQueue.DEFAULT_COMPARATOR;
else if (compClass.equalsIgnoreCase(FREQ_COMP))
comp = new SuggestWordFrequencyComparator();
else //must be a FQCN
comp = (Comparator<SuggestWord>) core.getResourceLoader().newInstance(compClass, Comparator.class);
}
StringDistance sd = DirectSpellChecker.INTERNAL_LEVENSHTEIN;
String distClass = (String) config.get(STRING_DISTANCE);
if (distClass != null && !distClass.equalsIgnoreCase(INTERNAL_DISTANCE))
sd = core.getResourceLoader().newInstance(distClass, StringDistance.class);
float minAccuracy = DEFAULT_ACCURACY;
Float accuracy = params.getFloat(ACCURACY);
if (accuracy != null)
minAccuracy = accuracy;
int maxEdits = DEFAULT_MAXEDITS;
Integer edits = params.getInt(MAXEDITS);
if (edits != null)
maxEdits = edits;
int minPrefix = DEFAULT_MINPREFIX;
Integer prefix = params.getInt(MINPREFIX);
if (prefix != null)
minPrefix = prefix;
int maxInspections = DEFAULT_MAXINSPECTIONS;
Integer inspections = params.getInt(MAXINSPECTIONS);
if (inspections != null)
maxInspections = inspections;
float minThreshold = DEFAULT_THRESHOLD_TOKEN_FREQUENCY;
Float threshold = params.getFloat(THRESHOLD_TOKEN_FREQUENCY);
if (threshold != null)
minThreshold = threshold;
int minQueryLength = DEFAULT_MINQUERYLENGTH;
Integer queryLength = params.getInt(MINQUERYLENGTH);
if (queryLength != null)
minQueryLength = queryLength;
int maxQueryLength = DEFAULT_MAXQUERYLENGTH;
Integer overriddenMaxQueryLength = params.getInt(MAXQUERYLENGTH);
if (overriddenMaxQueryLength != null)
maxQueryLength = overriddenMaxQueryLength;
float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY;
Float queryFreq = params.getFloat(MAXQUERYFREQUENCY);
if (queryFreq != null)
maxQueryFrequency = queryFreq;
checker.setComparator(comp);
checker.setDistance(sd);
checker.setMaxEdits(maxEdits);
checker.setMinPrefix(minPrefix);
checker.setAccuracy(minAccuracy);
checker.setThresholdFrequency(minThreshold);
checker.setMaxInspections(maxInspections);
checker.setMinQueryLength(minQueryLength);
checker.setMaxQueryLength(maxQueryLength);
checker.setMaxQueryFrequency(maxQueryFrequency);
checker.setLowerCaseTerms(false);
return name;
}
@Override
public void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException {}
@Override
public void build(SolrCore core, SolrIndexSearcher searcher) throws IOException {}
@Override
public SpellingResult getSuggestions(SpellingOptions options)
throws IOException {
log.debug("getSuggestions: {}", options.tokens);
SpellingResult result = new SpellingResult();
float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy;
for (Token token : options.tokens) {
String tokenText = token.toString();
Term term = new Term(field, tokenText);
int freq = options.reader.docFreq(term);
int count = (options.alternativeTermCount > 0 && freq > 0) ? options.alternativeTermCount: options.count;
SuggestWord[] suggestions = checker.suggestSimilar(term, count,options.reader, options.suggestMode, accuracy);
result.addFrequency(token, freq);
// If considering alternatives to "correctly-spelled" terms, then add the
// original as a viable suggestion.
if (options.alternativeTermCount > 0 && freq > 0) {
boolean foundOriginal = false;
SuggestWord[] suggestionsWithOrig = new SuggestWord[suggestions.length + 1];
for (int i = 0; i < suggestions.length; i++) {
if (suggestions[i].string.equals(tokenText)) {
foundOriginal = true;
break;
}
suggestionsWithOrig[i + 1] = suggestions[i];
}
if (!foundOriginal) {
SuggestWord orig = new SuggestWord();
orig.freq = freq;
orig.string = tokenText;
suggestionsWithOrig[0] = orig;
suggestions = suggestionsWithOrig;
}
}
if(suggestions.length==0 && freq==0) {
List<String> empty = Collections.emptyList();
result.add(token, empty);
} else {
for (SuggestWord suggestion : suggestions) {
result.add(token, suggestion.string, suggestion.freq);
}
}
}
return result;
}
@Override
public float getAccuracy() {
return checker.getAccuracy();
}
@Override
public StringDistance getStringDistance() {
return checker.getDistance();
}
}