| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.spelling; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.regex.Pattern; |
| |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.spell.CombineSuggestion; |
| import org.apache.lucene.search.spell.SuggestWord; |
| import org.apache.lucene.search.spell.WordBreakSpellChecker; |
| import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod; |
| import org.apache.solr.common.util.NamedList; |
| import org.apache.solr.core.SolrCore; |
| import org.apache.solr.search.SolrIndexSearcher; |
| |
| /** |
| * <p> |
| * A spellchecker that breaks and combines words. |
| * </p> |
| * <p> |
| * This will not combine adjacent tokens that do not have |
| * the same required status (prohibited, required, optional). |
| * However, this feature depends on incoming term flags |
| * being properly set. ({@link QueryConverter#PROHIBITED_TERM_FLAG}, |
| * {@link QueryConverter#REQUIRED_TERM_FLAG}, |
| * {@link QueryConverter#TERM_IN_BOOLEAN_QUERY_FLAG}, and |
| * {@link QueryConverter#TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG} ) |
| * This feature breaks completely if the upstream analyzer or query |
| * converter sets flags with the same values but different meanings. |
| * The default query converter (if not using "spellcheck.q") |
| * is {@link SpellingQueryConverter}, which properly sets these flags. |
| * </p> |
| */ |
| public class WordBreakSolrSpellChecker extends SolrSpellChecker { |
| /** |
| * <p> |
| * Try to combine multiple words into one? [true|false] |
| * </p> |
| */ |
| public static final String PARAM_COMBINE_WORDS = "combineWords"; |
| /** |
| * <p> |
| * Try to break words into multiples? [true|false] |
| * </p> |
| */ |
| public static final String PARAM_BREAK_WORDS = "breakWords"; |
| /** |
| * See {@link WordBreakSpellChecker#setMaxChanges} |
| */ |
| public static final String PARAM_MAX_CHANGES = "maxChanges"; |
| /** |
| * See {@link WordBreakSpellChecker#setMaxCombineWordLength} |
| */ |
| public static final String PARAM_MAX_COMBINE_WORD_LENGTH = "maxCombinedLength"; |
| /** |
| * See {@link WordBreakSpellChecker#setMinBreakWordLength} |
| */ |
| public static final String PARAM_MIN_BREAK_WORD_LENGTH = "minBreakLength"; |
| /** |
| * See {@link BreakSuggestionTieBreaker} for options. |
| */ |
| public static final String PARAM_BREAK_SUGGESTION_TIE_BREAKER = "breakSugestionTieBreaker"; |
| /** |
| * See {@link WordBreakSpellChecker#setMaxEvaluations} |
| */ |
| public static final String PARAM_MAX_EVALUATIONS = "maxEvaluations"; |
| /** |
| * See {@link WordBreakSpellChecker#setMinSuggestionFrequency} |
| */ |
| public static final String PARAM_MIN_SUGGESTION_FREQUENCY = "minSuggestionFreq"; |
| |
/**
 * <p>
 * Values accepted by the "breakSugestionTieBreaker" parameter
 * (matched case-insensitively). The default is {@link #MAX_FREQ}.
 * </p>
 */
public enum BreakSuggestionTieBreaker {
  /** See {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}. */
  MAX_FREQ,

  /** See {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_SUMMED_FREQUENCY}. */
  SUM_FREQ
}
| |
| private WordBreakSpellChecker wbsp = null; |
| private boolean combineWords = false; |
| private boolean breakWords = false; |
| private BreakSuggestionSortMethod sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY; |
| private static final Pattern spacePattern = Pattern.compile("\\s+"); |
| |
| @Override |
| public String init(@SuppressWarnings("rawtypes") NamedList config, |
| SolrCore core) { |
| String name = super.init(config, core); |
| combineWords = boolParam(config, PARAM_COMBINE_WORDS); |
| breakWords = boolParam(config, PARAM_BREAK_WORDS); |
| wbsp = new WordBreakSpellChecker(); |
| String bstb = strParam(config, PARAM_BREAK_SUGGESTION_TIE_BREAKER); |
| if (bstb != null) { |
| bstb = bstb.toUpperCase(Locale.ROOT); |
| if (bstb.equals(BreakSuggestionTieBreaker.SUM_FREQ.name())) { |
| sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_SUMMED_FREQUENCY; |
| } else if (bstb.equals(BreakSuggestionTieBreaker.MAX_FREQ.name())) { |
| sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY; |
| } else { |
| throw new IllegalArgumentException("Invalid value for parameter " |
| + PARAM_BREAK_SUGGESTION_TIE_BREAKER + " : " + bstb); |
| } |
| } else { |
| sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY; |
| } |
| int mc = intParam(config, PARAM_MAX_CHANGES); |
| if (mc > 0) { |
| wbsp.setMaxChanges(mc); |
| } |
| int mcl = intParam(config, PARAM_MAX_COMBINE_WORD_LENGTH); |
| if (mcl > 0) { |
| wbsp.setMaxCombineWordLength(mcl); |
| } |
| int mbwl = intParam(config, PARAM_MIN_BREAK_WORD_LENGTH); |
| if (mbwl > 0) { |
| wbsp.setMinBreakWordLength(mbwl); |
| } |
| int me = intParam(config, PARAM_MAX_EVALUATIONS); |
| if (me > 0) { |
| wbsp.setMaxEvaluations(me); |
| } |
| int msf = intParam(config, PARAM_MIN_SUGGESTION_FREQUENCY); |
| if (msf > 0) { |
| wbsp.setMinSuggestionFrequency(msf); |
| } |
| return name; |
| } |
| |
| private String strParam(@SuppressWarnings("rawtypes") NamedList config, |
| String paramName) { |
| Object o = config.get(paramName); |
| return o == null ? null : o.toString(); |
| } |
| |
| private boolean boolParam(@SuppressWarnings("rawtypes") NamedList config, |
| String paramName) { |
| String s = strParam(config, paramName); |
| if ("true".equalsIgnoreCase(s) || "on".equalsIgnoreCase(s)) { |
| return true; |
| } |
| return false; |
| } |
| |
| private int intParam(@SuppressWarnings("rawtypes") NamedList config, |
| String paramName) { |
| Object o = config.get(paramName); |
| if (o == null) { |
| return 0; |
| } |
| try { |
| return Integer.parseInt(o.toString()); |
| } catch (NumberFormatException nfe) { |
| throw new IllegalArgumentException("Invalid integer for parameter " |
| + paramName + " : " + o); |
| } |
| } |
| |
| @Override |
| public SpellingResult getSuggestions(SpellingOptions options) |
| throws IOException { |
| IndexReader ir = options.reader; |
| int numSuggestions = options.count; |
| |
| StringBuilder sb = new StringBuilder(); |
| Token[] tokenArr = options.tokens.toArray(new Token[options.tokens.size()]); |
| List<Token> tokenArrWithSeparators = new ArrayList<>(options.tokens.size() + 2); |
| List<Term> termArr = new ArrayList<>(options.tokens.size() + 2); |
| List<ResultEntry> breakSuggestionList = new ArrayList<>(); |
| List<ResultEntry> noBreakSuggestionList = new ArrayList<>(); |
| boolean lastOneProhibited = false; |
| boolean lastOneRequired = false; |
| boolean lastOneprocedesNewBooleanOp = false; |
| for (int i = 0; i < tokenArr.length; i++) { |
| boolean prohibited = |
| (tokenArr[i].getFlags() & QueryConverter.PROHIBITED_TERM_FLAG) == |
| QueryConverter.PROHIBITED_TERM_FLAG; |
| boolean required = |
| (tokenArr[i].getFlags() & QueryConverter.REQUIRED_TERM_FLAG) == |
| QueryConverter.REQUIRED_TERM_FLAG; |
| boolean procedesNewBooleanOp = |
| (tokenArr[i].getFlags() & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG) == |
| QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG; |
| if (i > 0 |
| && (prohibited != lastOneProhibited || required != lastOneRequired || lastOneprocedesNewBooleanOp)) { |
| termArr.add(WordBreakSpellChecker.SEPARATOR_TERM); |
| tokenArrWithSeparators.add(null); |
| } |
| lastOneProhibited = prohibited; |
| lastOneRequired = required; |
| lastOneprocedesNewBooleanOp = procedesNewBooleanOp; |
| |
| Term thisTerm = new Term(field, tokenArr[i].toString()); |
| termArr.add(thisTerm); |
| tokenArrWithSeparators.add(tokenArr[i]); |
| if (breakWords) { |
| SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm, |
| numSuggestions, ir, options.suggestMode, sortMethod); |
| if(breakSuggestions.length==0) { |
| noBreakSuggestionList.add(new ResultEntry(tokenArr[i], null, 0)); |
| } |
| for (SuggestWord[] breakSuggestion : breakSuggestions) { |
| sb.delete(0, sb.length()); |
| boolean firstOne = true; |
| int freq = 0; |
| for (SuggestWord word : breakSuggestion) { |
| if (!firstOne) { |
| sb.append(" "); |
| } |
| firstOne = false; |
| sb.append(word.string); |
| if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) { |
| freq = Math.max(freq, word.freq); |
| } else { |
| freq += word.freq; |
| } |
| } |
| breakSuggestionList.add(new ResultEntry(tokenArr[i], sb.toString(), |
| freq)); |
| } |
| } |
| } |
| breakSuggestionList.addAll(noBreakSuggestionList); |
| |
| List<ResultEntry> combineSuggestionList = Collections.emptyList(); |
| CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations( |
| termArr.toArray(new Term[termArr.size()]), numSuggestions, ir, options.suggestMode); |
| if (combineWords) { |
| combineSuggestionList = new ArrayList<>( |
| combineSuggestions.length); |
| for (CombineSuggestion cs : combineSuggestions) { |
| int firstTermIndex = cs.originalTermIndexes[0]; |
| int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1]; |
| sb.delete(0, sb.length()); |
| for (int i = firstTermIndex; i <= lastTermIndex; i++) { |
| if (i > firstTermIndex) { |
| sb.append(" "); |
| } |
| sb.append(tokenArrWithSeparators.get(i).toString()); |
| } |
| Token token = new Token(sb.toString(), tokenArrWithSeparators.get(firstTermIndex) |
| .startOffset(), tokenArrWithSeparators.get(lastTermIndex).endOffset()); |
| combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string, |
| cs.suggestion.freq)); |
| } |
| } |
| |
| // Interleave the two lists of suggestions into one SpellingResult |
| SpellingResult result = new SpellingResult(); |
| Iterator<ResultEntry> breakIter = breakSuggestionList.iterator(); |
| Iterator<ResultEntry> combineIter = combineSuggestionList.iterator(); |
| ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null; |
| ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null; |
| int breakCount = 0; |
| int combineCount = 0; |
| while (lastBreak != null || lastCombine != null) { |
| if (lastBreak == null) { |
| addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq); |
| lastCombine = null; |
| } else if (lastCombine == null) { |
| addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq); |
| lastBreak = null; |
| } else if (lastBreak.freq < lastCombine.freq) { |
| addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq); |
| lastCombine = null; |
| } else if (lastCombine.freq < lastBreak.freq) { |
| addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq); |
| lastBreak = null; |
| } else if (breakCount >= combineCount) { //TODO: Should reverse >= to < ??S |
| addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq); |
| lastCombine = null; |
| } else { |
| addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq); |
| lastBreak = null; |
| } |
| if (lastBreak == null && breakIter.hasNext()) { |
| lastBreak = breakIter.next(); |
| breakCount++; |
| } |
| if (lastCombine == null && combineIter.hasNext()) { |
| lastCombine = combineIter.next(); |
| combineCount++; |
| } |
| } |
| return result; |
| } |
| private void addToResult(SpellingResult result, Token token, int tokenFrequency, String suggestion, int suggestionFrequency) { |
| if(suggestion==null) { |
| result.add(token, Collections.<String>emptyList()); |
| result.addFrequency(token, tokenFrequency); |
| } else { |
| result.add(token, suggestion, suggestionFrequency); |
| result.addFrequency(token, tokenFrequency); |
| } |
| } |
| |
| private int getCombineFrequency(IndexReader ir, Token token) throws IOException { |
| String[] words = spacePattern.split(token.toString()); |
| int result = 0; |
| if(sortMethod==BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) { |
| for(String word : words) { |
| result = Math.max(result, ir.docFreq(new Term(field, word))); |
| } |
| } else { |
| for(String word : words) { |
| result += ir.docFreq(new Term(field, word)); |
| } |
| } |
| return result; |
| } |
| |
| @Override |
| public void build(SolrCore core, SolrIndexSearcher searcher) { |
| /* no-op */ |
| } |
| |
| @Override |
| public void reload(SolrCore core, SolrIndexSearcher searcher) |
| throws IOException { |
| /* no-op */ |
| } |
| |
| @Override |
| public boolean isSuggestionsMayOverlap() { |
| return true; |
| } |
| } |