| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.spelling; |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.core.WhitespaceAnalyzer; |
| import org.apache.lucene.search.spell.LevenshteinDistance; |
| import org.apache.lucene.search.spell.StringDistance; |
| import org.apache.lucene.search.spell.SuggestWord; |
| import org.apache.lucene.search.spell.SuggestWordQueue; |
| import org.apache.solr.client.solrj.response.SpellCheckResponse; |
| import org.apache.solr.common.params.SpellingParams; |
| import org.apache.solr.common.util.NamedList; |
| import org.apache.solr.core.SolrCore; |
| import org.apache.solr.handler.component.SpellCheckMergeData; |
| import org.apache.solr.schema.FieldType; |
| import org.apache.solr.schema.IndexSchema; |
| import org.apache.solr.search.SolrIndexSearcher; |
| |
| |
| /** |
| * <p> |
| * Refer to <a href="http://wiki.apache.org/solr/SpellCheckComponent">SpellCheckComponent</a> |
| * for more details. |
| * </p> |
| * |
| * @since solr 1.3 |
| */ |
| public abstract class SolrSpellChecker { |
| public static final String DICTIONARY_NAME = "name"; |
| public static final String DEFAULT_DICTIONARY_NAME = "default"; |
| public static final String FIELD = "field"; |
| public static final String FIELD_TYPE = "fieldType"; |
| /** Dictionary name */ |
| protected String name; |
| protected Analyzer analyzer; |
| protected String field; |
| protected String fieldTypeName; |
| |
| public String init(@SuppressWarnings({"rawtypes"})NamedList config, SolrCore core) { |
| name = (String) config.get(DICTIONARY_NAME); |
| if (name == null) { |
| name = DEFAULT_DICTIONARY_NAME; |
| } |
| field = (String)config.get(FIELD); |
| IndexSchema schema = core.getLatestSchema(); |
| if (field != null && schema.getFieldTypeNoEx(field) != null) { |
| analyzer = schema.getFieldType(field).getQueryAnalyzer(); |
| } |
| fieldTypeName = (String) config.get(FIELD_TYPE); |
| if (schema.getFieldTypes().containsKey(fieldTypeName)) { |
| FieldType fieldType = schema.getFieldTypes().get(fieldTypeName); |
| analyzer = fieldType.getQueryAnalyzer(); |
| } |
| if (analyzer == null) { |
| analyzer = new WhitespaceAnalyzer(); |
| } |
| return name; |
| } |
| /** |
| * Integrate spelling suggestions from the various shards in a distributed environment. |
| */ |
| public SpellingResult mergeSuggestions(SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) { |
| float min = 0.5f; |
| try { |
| min = getAccuracy(); |
| } catch(UnsupportedOperationException uoe) { |
| //just use .5 as a default |
| } |
| |
| StringDistance sd = null; |
| try { |
| sd = getStringDistance() == null ? new LevenshteinDistance() : getStringDistance(); |
| } catch(UnsupportedOperationException uoe) { |
| sd = new LevenshteinDistance(); |
| } |
| |
| SpellingResult result = new SpellingResult(); |
| for (Map.Entry<String, HashSet<String>> entry : mergeData.origVsSuggested.entrySet()) { |
| String original = entry.getKey(); |
| |
| //Only use this suggestion if all shards reported it as misspelled, |
| //unless it was not a term original to the user's query |
| //(WordBreakSolrSpellChecker can add new terms to the response, and we want to keep these) |
| Integer numShards = mergeData.origVsShards.get(original); |
| if(numShards<mergeData.totalNumberShardResponses && mergeData.isOriginalToQuery(original)) { |
| continue; |
| } |
| |
| HashSet<String> suggested = entry.getValue(); |
| SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); |
| for (String suggestion : suggested) { |
| SuggestWord sug = mergeData.suggestedVsWord.get(suggestion); |
| sug.score = sd.getDistance(original, sug.string); |
| if (sug.score < min) continue; |
| sugQueue.insertWithOverflow(sug); |
| if (sugQueue.size() == numSug) { |
| // if queue full, maintain the minScore score |
| min = sugQueue.top().score; |
| } |
| } |
| |
| // create token |
| SpellCheckResponse.Suggestion suggestion = mergeData.origVsSuggestion.get(original); |
| Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset()); |
| |
| // get top 'count' suggestions out of 'sugQueue.size()' candidates |
| SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())]; |
| // skip the first sugQueue.size() - count elements |
| for (int k=0; k < sugQueue.size() - count; k++) sugQueue.pop(); |
| // now collect the top 'count' responses |
| for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) { |
| suggestions[k] = sugQueue.pop(); |
| } |
| |
| if (extendedResults) { |
| Integer o = mergeData.origVsFreq.get(original); |
| if (o != null) result.addFrequency(token, o); |
| for (SuggestWord word : suggestions) |
| result.add(token, word.string, word.freq); |
| } else { |
| List<String> words = new ArrayList<>(sugQueue.size()); |
| for (SuggestWord word : suggestions) words.add(word.string); |
| result.add(token, words); |
| } |
| } |
| return result; |
| } |
| |
| public Analyzer getQueryAnalyzer() { |
| return analyzer; |
| } |
| |
| public String getDictionaryName() { |
| return name; |
| } |
| |
| /** |
| * Reloads the index. Useful if an external process is responsible for building the spell checker. |
| * |
| * @throws IOException If there is a low-level I/O error. |
| */ |
| public abstract void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException; |
| |
| /** |
| * (re)Builds the spelling index. May be a NOOP if the implementation doesn't require building, or can't be rebuilt. |
| */ |
| public abstract void build(SolrCore core, SolrIndexSearcher searcher) throws IOException; |
| |
| /** |
| * Get the value of {@link SpellingParams#SPELLCHECK_ACCURACY} if supported. |
| * Otherwise throws UnsupportedOperationException. |
| */ |
| protected float getAccuracy() { |
| throw new UnsupportedOperationException(); |
| } |
| |
| /** |
| * Get the distance implementation used by this spellchecker, or NULL if not applicable. |
| */ |
| protected StringDistance getStringDistance() { |
| throw new UnsupportedOperationException(); |
| } |
| |
| |
| /** |
| * Get suggestions for the given query. Tokenizes the query using a field appropriate Analyzer. |
| * The {@link SpellingResult#getSuggestions()} suggestions must be ordered by best suggestion first. |
| * |
| * @param options The {@link SpellingOptions} to use |
| * @return The {@link SpellingResult} suggestions |
| * @throws IOException if there is an error producing suggestions |
| */ |
| public abstract SpellingResult getSuggestions(SpellingOptions options) throws IOException; |
| |
| public boolean isSuggestionsMayOverlap() { |
| return false; |
| } |
| } |