| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search; |
| |
| |
| import java.io.IOException; |
| import java.util.Objects; |
| |
| import org.apache.lucene.index.SingleTermsEnum; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.util.AttributeSource; |
| import org.apache.lucene.util.automaton.CompiledAutomaton; |
| import org.apache.lucene.util.automaton.LevenshteinAutomata; |
| |
| /** Implements the fuzzy search query. The similarity measurement |
| * is based on the Damerau-Levenshtein (optimal string alignment) algorithm, |
| * though you can explicitly choose classic Levenshtein by passing <code>false</code> |
| * to the <code>transpositions</code> parameter. |
| * |
| * <p>This query uses {@link MultiTermQuery.TopTermsBlendedFreqScoringRewrite} |
| * as default. So terms will be collected and scored according to their |
| * edit distance. Only the top terms are used for building the {@link BooleanQuery}. |
| * It is not recommended to change the rewrite mode for fuzzy queries. |
| * |
| * <p>At most, this query will match terms up to |
| * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} edits. |
| * Higher distances (especially with transpositions enabled), are generally not useful and |
| * will match a significant amount of the term dictionary. If you really want this, consider |
| * using an n-gram indexing technique (such as the SpellChecker in the |
| * <a href="{@docRoot}/../suggest/overview-summary.html">suggest module</a>) instead. |
| * |
| * <p>NOTE: terms of length 1 or 2 will sometimes not match because of how the scaled |
| * distance between two terms is computed. For a term to match, the edit distance between |
| * the terms must be less than the minimum length term (either the input term, or |
| * the candidate term). For example, FuzzyQuery on term "abcd" with maxEdits=2 will |
| * not match an indexed term "ab", and FuzzyQuery on term "a" with maxEdits=2 will not |
| * match an indexed term "abc". |
| */ |
| public class FuzzyQuery extends MultiTermQuery { |
| |
| public final static int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; |
| public final static int defaultPrefixLength = 0; |
| public final static int defaultMaxExpansions = 50; |
| public final static boolean defaultTranspositions = true; |
| |
| private final int maxEdits; |
| private final int maxExpansions; |
| private final boolean transpositions; |
| private final int prefixLength; |
| private final Term term; |
| |
| /** |
| * Create a new FuzzyQuery that will match terms with an edit distance |
| * of at most <code>maxEdits</code> to <code>term</code>. |
| * If a <code>prefixLength</code> > 0 is specified, a common prefix |
| * of that length is also required. |
| * |
| * @param term the term to search for |
| * @param maxEdits must be {@code >= 0} and {@code <=} {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}. |
| * @param prefixLength length of common (non-fuzzy) prefix |
| * @param maxExpansions the maximum number of terms to match. If this number is |
| * greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten, |
| * then the maxClauseCount will be used instead. |
| * @param transpositions true if transpositions should be treated as a primitive |
| * edit operation. If this is false, comparisons will implement the classic |
| * Levenshtein algorithm. |
| */ |
| public FuzzyQuery(Term term, int maxEdits, int prefixLength, int maxExpansions, boolean transpositions) { |
| super(term.field()); |
| |
| if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { |
| throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); |
| } |
| if (prefixLength < 0) { |
| throw new IllegalArgumentException("prefixLength cannot be negative."); |
| } |
| if (maxExpansions <= 0) { |
| throw new IllegalArgumentException("maxExpansions must be positive."); |
| } |
| |
| this.term = term; |
| this.maxEdits = maxEdits; |
| this.prefixLength = prefixLength; |
| this.transpositions = transpositions; |
| this.maxExpansions = maxExpansions; |
| setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions)); |
| } |
| |
| /** |
| * Calls {@link #FuzzyQuery(Term, int, int, int, boolean) |
| * FuzzyQuery(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions)}. |
| */ |
| public FuzzyQuery(Term term, int maxEdits, int prefixLength) { |
| this(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions); |
| } |
| |
| /** |
| * Calls {@link #FuzzyQuery(Term, int, int) FuzzyQuery(term, maxEdits, defaultPrefixLength)}. |
| */ |
| public FuzzyQuery(Term term, int maxEdits) { |
| this(term, maxEdits, defaultPrefixLength); |
| } |
| |
| /** |
| * Calls {@link #FuzzyQuery(Term, int) FuzzyQuery(term, defaultMaxEdits)}. |
| */ |
| public FuzzyQuery(Term term) { |
| this(term, defaultMaxEdits); |
| } |
| |
| /** |
| * @return the maximum number of edit distances allowed for this query to match. |
| */ |
| public int getMaxEdits() { |
| return maxEdits; |
| } |
| |
| /** |
| * Returns the non-fuzzy prefix length. This is the number of characters at the start |
| * of a term that must be identical (not fuzzy) to the query term if the query |
| * is to match that term. |
| */ |
| public int getPrefixLength() { |
| return prefixLength; |
| } |
| |
| /** |
| * Returns true if transpositions should be treated as a primitive edit operation. |
| * If this is false, comparisons will implement the classic Levenshtein algorithm. |
| */ |
| public boolean getTranspositions() { |
| return transpositions; |
| } |
| |
| /** |
| * Returns the compiled automata used to match terms |
| */ |
| public CompiledAutomaton getAutomata() { |
| FuzzyAutomatonBuilder builder = new FuzzyAutomatonBuilder(term.text(), maxEdits, prefixLength, transpositions); |
| return builder.buildMaxEditAutomaton(); |
| } |
| |
| @Override |
| public void visit(QueryVisitor visitor) { |
| if (visitor.acceptField(field)) { |
| visitor.consumeTermsMatching(this, term.field(), () -> getAutomata().runAutomaton); |
| } |
| } |
| |
| @Override |
| protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { |
| if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact |
| return new SingleTermsEnum(terms.iterator(), term.bytes()); |
| } |
| return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions); |
| } |
| |
| /** |
| * Returns the pattern term. |
| */ |
| public Term getTerm() { |
| return term; |
| } |
| |
| @Override |
| public String toString(String field) { |
| final StringBuilder buffer = new StringBuilder(); |
| if (!term.field().equals(field)) { |
| buffer.append(term.field()); |
| buffer.append(":"); |
| } |
| buffer.append(term.text()); |
| buffer.append('~'); |
| buffer.append(maxEdits); |
| return buffer.toString(); |
| } |
| |
| @Override |
| public int hashCode() { |
| final int prime = 31; |
| int result = super.hashCode(); |
| result = prime * result + maxEdits; |
| result = prime * result + prefixLength; |
| result = prime * result + maxExpansions; |
| result = prime * result + (transpositions ? 0 : 1); |
| result = prime * result + ((term == null) ? 0 : term.hashCode()); |
| return result; |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (this == obj) |
| return true; |
| if (!super.equals(obj)) |
| return false; |
| if (getClass() != obj.getClass()) |
| return false; |
| FuzzyQuery other = (FuzzyQuery) obj; |
| return Objects.equals(maxEdits, other.maxEdits) && Objects.equals(prefixLength, other.prefixLength) |
| && Objects.equals(maxExpansions, other.maxExpansions) && Objects.equals(transpositions, other.transpositions) |
| && Objects.equals(term, other.term); |
| } |
| |
| /** |
| * @deprecated pass integer edit distances instead. |
| */ |
| @Deprecated |
| public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; |
| |
| /** |
| * Helper function to convert from deprecated "minimumSimilarity" fractions |
| * to raw edit distances. |
| * |
| * @param minimumSimilarity scaled similarity |
| * @param termLen length (in unicode codepoints) of the term. |
| * @return equivalent number of maxEdits |
| * @deprecated pass integer edit distances instead. |
| */ |
| @Deprecated |
| public static int floatToEdits(float minimumSimilarity, int termLen) { |
| if (minimumSimilarity >= 1f) { |
| return (int) Math.min(minimumSimilarity, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); |
| } else if (minimumSimilarity == 0.0f) { |
| return 0; // 0 means exact, not infinite # of edits! |
| } else { |
| return Math.min((int) ((1D-minimumSimilarity) * termLen), |
| LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); |
| } |
| } |
| |
| } |