/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search.suggest.document;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.suggest.BitsProducer;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

/**
* A {@link CompletionQuery} that matches documents containing terms
* within an edit distance of the specified prefix.
* <p>
* This query boosts documents relative to how similar their indexed terms are to the
* provided prefix.
* <p>
* Example usage: querying an analyzed prefix within an edit distance of 1
* of 'subg' against the field 'suggest_field':
*
* <pre class="prettyprint">
* CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "subg"));
* </pre>
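* <p>
* To customize fuzziness, use the full constructor. For example (the values below are
* illustrative, not recommendations), to allow up to 2 edits counted in Unicode code
* points, with no edits permitted in the first character:
*
* <pre class="prettyprint">
* CompletionQuery query = new FuzzyCompletionQuery(analyzer,
*     new Term("suggest_field", "subg"), null,
*     2,      // maxEdits
*     true,   // transpositions
*     1,      // nonFuzzyPrefix
*     3,      // minFuzzyLength
*     true,   // unicodeAware
*     Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
* </pre>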
*
* @lucene.experimental
*/
public class FuzzyCompletionQuery extends PrefixCompletionQuery {
/**
* When true, the maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
* parameters are measured in Unicode code points (actual letters)
* instead of bytes.
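* For example, when enabled, a two-byte UTF-8 character such as 'é'
* counts as a single letter rather than two bytes.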
*/
public static final boolean DEFAULT_UNICODE_AWARE = false;
/**
* The default minimum length of the key before any edits are allowed.
*/
public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;
/**
* The default prefix length where edits are not allowed.
*/
public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
/**
* The default maximum number of edits for fuzzy
* suggestions.
*/
public static final int DEFAULT_MAX_EDITS = 1;
/**
* The default transposition value passed to {@link LevenshteinAutomata}
*/
public static final boolean DEFAULT_TRANSPOSITIONS = true;
private final int maxEdits;
private final boolean transpositions;
private final int nonFuzzyPrefix;
private final int minFuzzyLength;
private final boolean unicodeAware;
private final int determinizeWorkLimit;
/**
* Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer)}
* with no filter
*/
public FuzzyCompletionQuery(Analyzer analyzer, Term term) {
this(analyzer, term, null);
}
/**
* Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer,
* int, boolean, int, int, boolean, int)}
* with defaults for <code>maxEdits</code>, <code>transpositions</code>,
* <code>nonFuzzyPrefix</code>, <code>minFuzzyLength</code>,
* <code>unicodeAware</code> and <code>determinizeWorkLimit</code>
*
* See {@link #DEFAULT_MAX_EDITS}, {@link #DEFAULT_TRANSPOSITIONS},
* {@link #DEFAULT_NON_FUZZY_PREFIX}, {@link #DEFAULT_MIN_FUZZY_LENGTH},
* {@link #DEFAULT_UNICODE_AWARE} and {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT}
* for defaults
*/
public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) {
this(analyzer, term, filter, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, DEFAULT_NON_FUZZY_PREFIX,
DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
);
}
/**
* Constructs an analyzed fuzzy prefix completion query
*
* @param analyzer used to analyze the provided {@link Term#text()}
* @param term query is run against {@link Term#field()} and {@link Term#text()}
* is analyzed with <code>analyzer</code>
* @param filter used to query on a subset of documents
* @param maxEdits maximum number of acceptable edits
* @param transpositions value passed to {@link LevenshteinAutomata}
* @param nonFuzzyPrefix prefix length where edits are not allowed
* @param minFuzzyLength minimum prefix length before any edits are allowed
* @param unicodeAware treat the prefix as Unicode code points rather than bytes
* @param determinizeWorkLimit maximum effort allowed to determinize the {@link
* LevenshteinAutomata}
*/
public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter, int maxEdits,
boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength,
boolean unicodeAware, int determinizeWorkLimit) {
super(analyzer, term, filter);
this.maxEdits = maxEdits;
this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength;
this.unicodeAware = unicodeAware;
this.determinizeWorkLimit = determinizeWorkLimit;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
final Automaton originalAutomata;
try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
originalAutomata = stream.toAutomaton(unicodeAware);
}
Set<IntsRef> refs = new HashSet<>();
Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
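// The suggester's FST operates on UTF-8 bytes, so an automaton built over
// Unicode code points must be converted back to bytes before intersection.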
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
utf8automaton = Operations.determinize(utf8automaton, determinizeWorkLimit);
automaton = utf8automaton;
}
// TODO Accumulating all refs is bad, because the resulting set may be very big.
// TODO Better iterate over automaton again inside FuzzyCompletionWeight?
return new FuzzyCompletionWeight(this, automaton, refs);
}
private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) {
List<Automaton> subs = new ArrayList<>();
FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
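// Enumerate every accepted path through the analyzed automaton (multiple
// paths arise from e.g. synonyms) and build a Levenshtein automaton per path.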
for (IntsRef string; (string = finiteStrings.next()) != null;) {
refs.add(IntsRef.deepCopyOf(string));
if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
subs.add(Automata.makeString(string.ints, string.offset, string.length));
} else {
int[] ints = new int[string.length - nonFuzzyPrefix];
System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
// TODO: maybe add alphaMin to LevenshteinAutomata,
// and pass 1 instead of 0? We probably don't want
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints,
unicodeAware ? Character.MAX_CODE_POINT : 255,
transpositions);
subs.add(lev.toAutomaton(maxEdits,
UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
}
}
if (subs.isEmpty()) {
// automaton is empty, there are no accepted paths through it
return Automata.makeEmpty(); // matches nothing
} else if (subs.size() == 1) {
// no synonyms or anything: just a single path through the tokenstream
return subs.get(0);
} else {
// multiple paths: this is really scary! is it slow?
// maybe we should not do this and throw UOE?
Automaton a = Operations.union(subs);
// TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
return Operations.determinize(a, determinizeWorkLimit);
}
}
/**
* Get the maximum edit distance for fuzzy matches
*/
public int getMaxEdits() {
return maxEdits;
}
/**
* Return whether transpositions count as a single edit
*/
public boolean isTranspositions() {
return transpositions;
}
/**
* Get the length of a prefix where no edits are permitted
*/
public int getNonFuzzyPrefix() {
return nonFuzzyPrefix;
}
/**
* Get the minimum length of a term considered for matching
*/
public int getMinFuzzyLength() {
return minFuzzyLength;
}
/**
* Return true if lengths are measured in unicode code-points rather than bytes
*/
public boolean isUnicodeAware() {
return unicodeAware;
}
/** Get the maximum effort to use determinizing */
public int getDeterminizeWorkLimit() {
return determinizeWorkLimit;
}
@Override
public String toString(String field) {
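// Renders e.g. "subg*~1", prefixed with "suggest_field:" when the field
// differs from the given one; any filter is appended after a comma.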
StringBuilder buffer = new StringBuilder();
if (!getField().equals(field)) {
buffer.append(getField());
buffer.append(":");
}
buffer.append(getTerm().text());
buffer.append('*');
buffer.append('~');
buffer.append(Integer.toString(maxEdits));
if (getFilter() != null) {
buffer.append(",");
buffer.append("filter");
buffer.append(getFilter().toString());
}
return buffer.toString();
}
private static class FuzzyCompletionWeight extends CompletionWeight {
private final Set<IntsRef> refs;
int currentBoost = 0;
public FuzzyCompletionWeight(CompletionQuery query, Automaton automaton, Set<IntsRef> refs) throws IOException {
super(query, automaton);
this.refs = refs;
}
@Override
protected void setNextMatch(IntsRef pathPrefix) {
// NOTE: the last letter of the matched prefix for the exact
// match never makes it through here,
// so an exact match and a match with only an edit at the
// end are boosted the same
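// The boost is the length of the longest common prefix between the matched
// path and any analyzed form of the query prefix.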
int maxCount = 0;
for (IntsRef ref : refs) {
int minLength = Math.min(ref.length, pathPrefix.length);
int count = 0;
for (int i = 0; i < minLength; i++) {
if (ref.ints[i + ref.offset] == pathPrefix.ints[i + pathPrefix.offset]) {
count++;
} else {
break;
}
}
maxCount = Math.max(maxCount, count);
}
currentBoost = maxCount;
}
@Override
protected float boost() {
return currentBoost;
}
}
}