| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.suggest.document; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Set; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.ScoreMode; |
| import org.apache.lucene.search.Weight; |
| import org.apache.lucene.search.suggest.BitsProducer; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.UnicodeUtil; |
| import org.apache.lucene.util.automaton.Automata; |
| import org.apache.lucene.util.automaton.Automaton; |
| import org.apache.lucene.util.automaton.FiniteStringsIterator; |
| import org.apache.lucene.util.automaton.LevenshteinAutomata; |
| import org.apache.lucene.util.automaton.Operations; |
| import org.apache.lucene.util.automaton.UTF32ToUTF8; |
| |
| /** |
| * A {@link CompletionQuery} that match documents containing terms within an edit distance of the |
| * specified prefix. |
| * |
| * <p>This query boost documents relative to how similar the indexed terms are to the provided |
| * prefix. |
| * |
| * <p>Example usage of querying an analyzed prefix within an edit distance of 1 of 'subg' against a |
| * field 'suggest_field' is as follows: |
| * |
| * <pre class="prettyprint"> |
| * CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "subg")); |
| * </pre> |
| * |
| * @lucene.experimental |
| */ |
| public class FuzzyCompletionQuery extends PrefixCompletionQuery { |
| |
| /** |
| * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix parameters in Unicode code |
| * points (actual letters) instead of bytes. |
| */ |
| public static final boolean DEFAULT_UNICODE_AWARE = false; |
| |
| /** The default minimum length of the key before any edits are allowed. */ |
| public static final int DEFAULT_MIN_FUZZY_LENGTH = 3; |
| |
| /** The default prefix length where edits are not allowed. */ |
| public static final int DEFAULT_NON_FUZZY_PREFIX = 1; |
| |
| /** The default maximum number of edits for fuzzy suggestions. */ |
| public static final int DEFAULT_MAX_EDITS = 1; |
| |
| /** The default transposition value passed to {@link LevenshteinAutomata} */ |
| public static final boolean DEFAULT_TRANSPOSITIONS = true; |
| |
| private final int maxEdits; |
| private final boolean transpositions; |
| private final int nonFuzzyPrefix; |
| private final int minFuzzyLength; |
| private final boolean unicodeAware; |
| private final int maxDeterminizedStates; |
| |
| /** |
| * Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer)} with no |
| * filter |
| */ |
| public FuzzyCompletionQuery(Analyzer analyzer, Term term) { |
| this(analyzer, term, null); |
| } |
| |
| /** |
| * Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer, int, |
| * boolean, int, int, boolean, int)} with defaults for <code>maxEdits</code>, <code>transpositions |
| * </code>, <code>nonFuzzyPrefix</code>, <code>minFuzzyLength</code>, <code>unicodeAware</code> |
| * and <code>maxDeterminizedStates</code> See {@link #DEFAULT_MAX_EDITS}, {@link |
| * #DEFAULT_TRANSPOSITIONS}, {@link #DEFAULT_NON_FUZZY_PREFIX}, {@link #DEFAULT_MIN_FUZZY_LENGTH}, |
| * {@link #DEFAULT_UNICODE_AWARE} and {@link Operations#DEFAULT_MAX_DETERMINIZED_STATES} for |
| * defaults |
| */ |
| public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) { |
| this( |
| analyzer, |
| term, |
| filter, |
| DEFAULT_MAX_EDITS, |
| DEFAULT_TRANSPOSITIONS, |
| DEFAULT_NON_FUZZY_PREFIX, |
| DEFAULT_MIN_FUZZY_LENGTH, |
| DEFAULT_UNICODE_AWARE, |
| Operations.DEFAULT_MAX_DETERMINIZED_STATES); |
| } |
| |
| /** |
| * Constructs an analyzed fuzzy prefix completion query |
| * |
| * @param analyzer used to analyze the provided {@link Term#text()} |
| * @param term query is run against {@link Term#field()} and {@link Term#text()} is analyzed with |
| * <code>analyzer</code> |
| * @param filter used to query on a sub set of documents |
| * @param maxEdits maximum number of acceptable edits |
| * @param transpositions value passed to {@link LevenshteinAutomata} |
| * @param nonFuzzyPrefix prefix length where edits are not allowed |
| * @param minFuzzyLength minimum prefix length before any edits are allowed |
| * @param unicodeAware treat prefix as unicode rather than bytes |
| * @param maxDeterminizedStates maximum automaton states allowed for {@link LevenshteinAutomata} |
| */ |
| public FuzzyCompletionQuery( |
| Analyzer analyzer, |
| Term term, |
| BitsProducer filter, |
| int maxEdits, |
| boolean transpositions, |
| int nonFuzzyPrefix, |
| int minFuzzyLength, |
| boolean unicodeAware, |
| int maxDeterminizedStates) { |
| super(analyzer, term, filter); |
| this.maxEdits = maxEdits; |
| this.transpositions = transpositions; |
| this.nonFuzzyPrefix = nonFuzzyPrefix; |
| this.minFuzzyLength = minFuzzyLength; |
| this.unicodeAware = unicodeAware; |
| this.maxDeterminizedStates = maxDeterminizedStates; |
| } |
| |
| @Override |
| public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) |
| throws IOException { |
| final Automaton originalAutomata; |
| try (CompletionTokenStream stream = |
| (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) { |
| originalAutomata = stream.toAutomaton(unicodeAware); |
| } |
| Set<IntsRef> refs = new HashSet<>(); |
| Automaton automaton = toLevenshteinAutomata(originalAutomata, refs); |
| if (unicodeAware) { |
| Automaton utf8automaton = new UTF32ToUTF8().convert(automaton); |
| utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates); |
| automaton = utf8automaton; |
| } |
| // TODO Accumulating all refs is bad, because the resulting set may be very big. |
| // TODO Better iterate over automaton again inside FuzzyCompletionWeight? |
| return new FuzzyCompletionWeight(this, automaton, refs); |
| } |
| |
| private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) { |
| List<Automaton> subs = new ArrayList<>(); |
| FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton); |
| for (IntsRef string; (string = finiteStrings.next()) != null; ) { |
| refs.add(IntsRef.deepCopyOf(string)); |
| |
| if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) { |
| subs.add(Automata.makeString(string.ints, string.offset, string.length)); |
| } else { |
| int ints[] = new int[string.length - nonFuzzyPrefix]; |
| System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length); |
| // TODO: maybe add alphaMin to LevenshteinAutomata, |
| // and pass 1 instead of 0? We probably don't want |
| // to allow the trailing dedup bytes to be |
| // edited... but then 0 byte is "in general" allowed |
| // on input (but not in UTF8). |
| LevenshteinAutomata lev = |
| new LevenshteinAutomata( |
| ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions); |
| subs.add( |
| lev.toAutomaton( |
| maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix))); |
| } |
| } |
| |
| if (subs.isEmpty()) { |
| // automaton is empty, there is no accepted paths through it |
| return Automata.makeEmpty(); // matches nothing |
| } else if (subs.size() == 1) { |
| // no synonyms or anything: just a single path through the tokenstream |
| return subs.get(0); |
| } else { |
| // multiple paths: this is really scary! is it slow? |
| // maybe we should not do this and throw UOE? |
| Automaton a = Operations.union(subs); |
| // TODO: we could call toLevenshteinAutomata() before det? |
| // this only happens if you have multiple paths anyway (e.g. synonyms) |
| return Operations.determinize(a, maxDeterminizedStates); |
| } |
| } |
| |
| /** Get the maximum edit distance for fuzzy matches */ |
| public int getMaxEdits() { |
| return maxEdits; |
| } |
| |
| /** Return whether transpositions count as a single edit */ |
| public boolean isTranspositions() { |
| return transpositions; |
| } |
| |
| /** Get the length of a prefix where no edits are permitted */ |
| public int getNonFuzzyPrefix() { |
| return nonFuzzyPrefix; |
| } |
| |
| /** Get the minimum length of a term considered for matching */ |
| public int getMinFuzzyLength() { |
| return minFuzzyLength; |
| } |
| |
| /** Return true if lengths are measured in unicode code-points rather than bytes */ |
| public boolean isUnicodeAware() { |
| return unicodeAware; |
| } |
| |
| /** Get the maximum number of determinized states permitted */ |
| public int getMaxDeterminizedStates() { |
| return maxDeterminizedStates; |
| } |
| |
| @Override |
| public String toString(String field) { |
| StringBuilder buffer = new StringBuilder(); |
| if (!getField().equals(field)) { |
| buffer.append(getField()); |
| buffer.append(":"); |
| } |
| buffer.append(getTerm().text()); |
| buffer.append('*'); |
| buffer.append('~'); |
| buffer.append(Integer.toString(maxEdits)); |
| if (getFilter() != null) { |
| buffer.append(","); |
| buffer.append("filter"); |
| buffer.append(getFilter().toString()); |
| } |
| return buffer.toString(); |
| } |
| |
| private static class FuzzyCompletionWeight extends CompletionWeight { |
| private final Set<IntsRef> refs; |
| int currentBoost = 0; |
| |
| public FuzzyCompletionWeight(CompletionQuery query, Automaton automaton, Set<IntsRef> refs) |
| throws IOException { |
| super(query, automaton); |
| this.refs = refs; |
| } |
| |
| @Override |
| protected void setNextMatch(IntsRef pathPrefix) { |
| // NOTE: the last letter of the matched prefix for the exact |
| // match never makes it through here |
| // so an exact match and a match with only a edit at the |
| // end is boosted the same |
| int maxCount = 0; |
| for (IntsRef ref : refs) { |
| int minLength = Math.min(ref.length, pathPrefix.length); |
| int count = 0; |
| for (int i = 0; i < minLength; i++) { |
| if (ref.ints[i + ref.offset] == pathPrefix.ints[i + pathPrefix.offset]) { |
| count++; |
| } else { |
| break; |
| } |
| } |
| maxCount = Math.max(maxCount, count); |
| } |
| currentBoost = maxCount; |
| } |
| |
| @Override |
| protected float boost() { |
| return currentBoost; |
| } |
| } |
| } |