lucene/suggest/src/java/org/apache/lucene/search/suggest/document/FuzzyCompletionQuery.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.suggest.document;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.suggest.BitsProducer;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.FiniteStringsIterator;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
 import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.UTF32ToUTF8;

 /**
  * A {@link CompletionQuery} that match documents containing terms within an edit distance of the
  * specified prefix.
  *
  * <p>This query boost documents relative to how similar the indexed terms are to the provided
  * prefix.
  *
  * <p>Example usage of querying an analyzed prefix within an edit distance of 1 of 'subg' against a
  * field 'suggest_field' is as follows:
  *
  * <pre class="prettyprint">
  *  CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "subg"));
  * </pre>
  *
  * @lucene.experimental
  */
 public class FuzzyCompletionQuery extends PrefixCompletionQuery {

   /**
    * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix parameters in Unicode code
    * points (actual letters) instead of bytes.
    */
   public static final boolean DEFAULT_UNICODE_AWARE = false;

   /** The default minimum length of the key before any edits are allowed. */
   public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;

   /** The default prefix length where edits are not allowed. */
   public static final int DEFAULT_NON_FUZZY_PREFIX = 1;

   /** The default maximum number of edits for fuzzy suggestions. */
   public static final int DEFAULT_MAX_EDITS = 1;

   /** The default transposition value passed to {@link LevenshteinAutomata} */
   public static final boolean DEFAULT_TRANSPOSITIONS = true;

   private final int maxEdits;
   private final boolean transpositions;
   private final int nonFuzzyPrefix;
   private final int minFuzzyLength;
   private final boolean unicodeAware;
   private final int maxDeterminizedStates;

   /**
    * Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer)} with no
    * filter
    */
   public FuzzyCompletionQuery(Analyzer analyzer, Term term) {
     this(analyzer, term, null);
   }

   /**
    * Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer, int,
    * boolean, int, int, boolean, int)} with defaults for <code>maxEdits</code>, <code>transpositions
    * </code>, <code>nonFuzzyPrefix</code>, <code>minFuzzyLength</code>, <code>unicodeAware</code>
    * and <code>maxDeterminizedStates</code> See {@link #DEFAULT_MAX_EDITS}, {@link
    * #DEFAULT_TRANSPOSITIONS}, {@link #DEFAULT_NON_FUZZY_PREFIX}, {@link #DEFAULT_MIN_FUZZY_LENGTH},
    * {@link #DEFAULT_UNICODE_AWARE} and {@link Operations#DEFAULT_MAX_DETERMINIZED_STATES} for
    * defaults
    */
   public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) {
     this(
         analyzer,
         term,
         filter,
         DEFAULT_MAX_EDITS,
         DEFAULT_TRANSPOSITIONS,
         DEFAULT_NON_FUZZY_PREFIX,
         DEFAULT_MIN_FUZZY_LENGTH,
         DEFAULT_UNICODE_AWARE,
         Operations.DEFAULT_MAX_DETERMINIZED_STATES);
   }

   /**
    * Constructs an analyzed fuzzy prefix completion query
    *
    * @param analyzer used to analyze the provided {@link Term#text()}
    * @param term query is run against {@link Term#field()} and {@link Term#text()} is analyzed with
    *     <code>analyzer</code>
    * @param filter used to query on a sub set of documents
    * @param maxEdits maximum number of acceptable edits
    * @param transpositions value passed to {@link LevenshteinAutomata}
    * @param nonFuzzyPrefix prefix length where edits are not allowed
    * @param minFuzzyLength minimum prefix length before any edits are allowed
    * @param unicodeAware treat prefix as unicode rather than bytes
    * @param maxDeterminizedStates maximum automaton states allowed for {@link LevenshteinAutomata}
    */
   public FuzzyCompletionQuery(
       Analyzer analyzer,
       Term term,
       BitsProducer filter,
       int maxEdits,
       boolean transpositions,
       int nonFuzzyPrefix,
       int minFuzzyLength,
       boolean unicodeAware,
       int maxDeterminizedStates) {
     super(analyzer, term, filter);
     this.maxEdits = maxEdits;
     this.transpositions = transpositions;
     this.nonFuzzyPrefix = nonFuzzyPrefix;
     this.minFuzzyLength = minFuzzyLength;
     this.unicodeAware = unicodeAware;
     this.maxDeterminizedStates = maxDeterminizedStates;
   }

   @Override
   public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
       throws IOException {
     final Automaton originalAutomata;
     try (CompletionTokenStream stream =
         (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
       originalAutomata = stream.toAutomaton(unicodeAware);
     }
     Set<IntsRef> refs = new HashSet<>();
     Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
     if (unicodeAware) {
       Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
       utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
       automaton = utf8automaton;
     }
     // TODO Accumulating all refs is bad, because the resulting set may be very big.
     // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
     return new FuzzyCompletionWeight(this, automaton, refs);
   }

   private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) {
     List<Automaton> subs = new ArrayList<>();
     FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
     for (IntsRef string; (string = finiteStrings.next()) != null; ) {
       refs.add(IntsRef.deepCopyOf(string));

       if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
         subs.add(Automata.makeString(string.ints, string.offset, string.length));
       } else {
         int ints[] = new int[string.length - nonFuzzyPrefix];
         System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
         // TODO: maybe add alphaMin to LevenshteinAutomata,
         // and pass 1 instead of 0?  We probably don't want
         // to allow the trailing dedup bytes to be
         // edited... but then 0 byte is "in general" allowed
         // on input (but not in UTF8).
         LevenshteinAutomata lev =
             new LevenshteinAutomata(
                 ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
         subs.add(
             lev.toAutomaton(
                 maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
       }
     }

     if (subs.isEmpty()) {
       // automaton is empty, there is no accepted paths through it
       return Automata.makeEmpty(); // matches nothing
     } else if (subs.size() == 1) {
       // no synonyms or anything: just a single path through the tokenstream
       return subs.get(0);
     } else {
       // multiple paths: this is really scary! is it slow?
       // maybe we should not do this and throw UOE?
       Automaton a = Operations.union(subs);
       // TODO: we could call toLevenshteinAutomata() before det?
       // this only happens if you have multiple paths anyway (e.g. synonyms)
       return Operations.determinize(a, maxDeterminizedStates);
     }
   }

   /** Get the maximum edit distance for fuzzy matches */
   public int getMaxEdits() {
     return maxEdits;
   }

   /** Return whether transpositions count as a single edit */
   public boolean isTranspositions() {
     return transpositions;
   }

   /** Get the length of a prefix where no edits are permitted */
   public int getNonFuzzyPrefix() {
     return nonFuzzyPrefix;
   }

   /** Get the minimum length of a term considered for matching */
   public int getMinFuzzyLength() {
     return minFuzzyLength;
   }

   /** Return true if lengths are measured in unicode code-points rather than bytes */
   public boolean isUnicodeAware() {
     return unicodeAware;
   }

   /** Get the maximum number of determinized states permitted */
   public int getMaxDeterminizedStates() {
     return maxDeterminizedStates;
   }

   @Override
   public String toString(String field) {
     StringBuilder buffer = new StringBuilder();
     if (!getField().equals(field)) {
       buffer.append(getField());
       buffer.append(":");
     }
     buffer.append(getTerm().text());
     buffer.append('*');
     buffer.append('~');
     buffer.append(Integer.toString(maxEdits));
     if (getFilter() != null) {
       buffer.append(",");
       buffer.append("filter");
       buffer.append(getFilter().toString());
     }
     return buffer.toString();
   }

   private static class FuzzyCompletionWeight extends CompletionWeight {
     private final Set<IntsRef> refs;
     int currentBoost = 0;

     public FuzzyCompletionWeight(CompletionQuery query, Automaton automaton, Set<IntsRef> refs)
         throws IOException {
       super(query, automaton);
       this.refs = refs;
     }

     @Override
     protected void setNextMatch(IntsRef pathPrefix) {
       // NOTE: the last letter of the matched prefix for the exact
       // match never makes it through here
       // so an exact match and a match with only a edit at the
       // end is boosted the same
       int maxCount = 0;
       for (IntsRef ref : refs) {
         int minLength = Math.min(ref.length, pathPrefix.length);
         int count = 0;
         for (int i = 0; i < minLength; i++) {
           if (ref.ints[i + ref.offset] == pathPrefix.ints[i + pathPrefix.offset]) {
             count++;
           } else {
             break;
           }
         }
         maxCount = Math.max(maxCount, count);
       }
       currentBoost = maxCount;
     }

     @Override
     protected float boost() {
       return currentBoost;
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.suggest.document;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Set;
	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.ScoreMode;
	import org.apache.lucene.search.Weight;
	import org.apache.lucene.search.suggest.BitsProducer;
	import org.apache.lucene.util.IntsRef;
	import org.apache.lucene.util.UnicodeUtil;
	import org.apache.lucene.util.automaton.Automata;
	import org.apache.lucene.util.automaton.Automaton;
	import org.apache.lucene.util.automaton.FiniteStringsIterator;
	import org.apache.lucene.util.automaton.LevenshteinAutomata;
	import org.apache.lucene.util.automaton.Operations;
	import org.apache.lucene.util.automaton.UTF32ToUTF8;

	/**
	* A {@link CompletionQuery} that match documents containing terms within an edit distance of the
	* specified prefix.
	*
	* <p>This query boost documents relative to how similar the indexed terms are to the provided
	* prefix.
	*
	* <p>Example usage of querying an analyzed prefix within an edit distance of 1 of 'subg' against a
	* field 'suggest_field' is as follows:
	*
	* <pre class="prettyprint">
	* CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "subg"));
	* </pre>
	*
	* @lucene.experimental
	*/
	public class FuzzyCompletionQuery extends PrefixCompletionQuery {

	/**
	* Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix parameters in Unicode code
	* points (actual letters) instead of bytes.
	*/
	public static final boolean DEFAULT_UNICODE_AWARE = false;

	/** The default minimum length of the key before any edits are allowed. */
	public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;

	/** The default prefix length where edits are not allowed. */
	public static final int DEFAULT_NON_FUZZY_PREFIX = 1;

	/** The default maximum number of edits for fuzzy suggestions. */
	public static final int DEFAULT_MAX_EDITS = 1;

	/** The default transposition value passed to {@link LevenshteinAutomata} */
	public static final boolean DEFAULT_TRANSPOSITIONS = true;

	private final int maxEdits;
	private final boolean transpositions;
	private final int nonFuzzyPrefix;
	private final int minFuzzyLength;
	private final boolean unicodeAware;
	private final int maxDeterminizedStates;

	/**
	* Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer)} with no
	* filter
	*/
	public FuzzyCompletionQuery(Analyzer analyzer, Term term) {
	this(analyzer, term, null);
	}

	/**
	* Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer, int,
	* boolean, int, int, boolean, int)} with defaults for <code>maxEdits</code>, <code>transpositions
	* </code>, <code>nonFuzzyPrefix</code>, <code>minFuzzyLength</code>, <code>unicodeAware</code>
	* and <code>maxDeterminizedStates</code> See {@link #DEFAULT_MAX_EDITS}, {@link
	* #DEFAULT_TRANSPOSITIONS}, {@link #DEFAULT_NON_FUZZY_PREFIX}, {@link #DEFAULT_MIN_FUZZY_LENGTH},
	* {@link #DEFAULT_UNICODE_AWARE} and {@link Operations#DEFAULT_MAX_DETERMINIZED_STATES} for
	* defaults
	*/
	public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) {
	this(
	analyzer,
	term,
	filter,
	DEFAULT_MAX_EDITS,
	DEFAULT_TRANSPOSITIONS,
	DEFAULT_NON_FUZZY_PREFIX,
	DEFAULT_MIN_FUZZY_LENGTH,
	DEFAULT_UNICODE_AWARE,
	Operations.DEFAULT_MAX_DETERMINIZED_STATES);
	}

	/**
	* Constructs an analyzed fuzzy prefix completion query
	*
	* @param analyzer used to analyze the provided {@link Term#text()}
	* @param term query is run against {@link Term#field()} and {@link Term#text()} is analyzed with
	* <code>analyzer</code>
	* @param filter used to query on a sub set of documents
	* @param maxEdits maximum number of acceptable edits
	* @param transpositions value passed to {@link LevenshteinAutomata}
	* @param nonFuzzyPrefix prefix length where edits are not allowed
	* @param minFuzzyLength minimum prefix length before any edits are allowed
	* @param unicodeAware treat prefix as unicode rather than bytes
	* @param maxDeterminizedStates maximum automaton states allowed for {@link LevenshteinAutomata}
	*/
	public FuzzyCompletionQuery(
	Analyzer analyzer,
	Term term,
	BitsProducer filter,
	int maxEdits,
	boolean transpositions,
	int nonFuzzyPrefix,
	int minFuzzyLength,
	boolean unicodeAware,
	int maxDeterminizedStates) {
	super(analyzer, term, filter);
	this.maxEdits = maxEdits;
	this.transpositions = transpositions;
	this.nonFuzzyPrefix = nonFuzzyPrefix;
	this.minFuzzyLength = minFuzzyLength;
	this.unicodeAware = unicodeAware;
	this.maxDeterminizedStates = maxDeterminizedStates;
	}

	@Override
	public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
	throws IOException {
	final Automaton originalAutomata;
	try (CompletionTokenStream stream =
	(CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
	originalAutomata = stream.toAutomaton(unicodeAware);
	}
	Set<IntsRef> refs = new HashSet<>();
	Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
	if (unicodeAware) {
	Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
	utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
	automaton = utf8automaton;
	}
	// TODO Accumulating all refs is bad, because the resulting set may be very big.
	// TODO Better iterate over automaton again inside FuzzyCompletionWeight?
	return new FuzzyCompletionWeight(this, automaton, refs);
	}

	private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) {
	List<Automaton> subs = new ArrayList<>();
	FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
	for (IntsRef string; (string = finiteStrings.next()) != null; ) {
	refs.add(IntsRef.deepCopyOf(string));

	if (string.length <= nonFuzzyPrefix \|\| string.length < minFuzzyLength) {
	subs.add(Automata.makeString(string.ints, string.offset, string.length));
	} else {
	int ints[] = new int[string.length - nonFuzzyPrefix];
	System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
	// TODO: maybe add alphaMin to LevenshteinAutomata,
	// and pass 1 instead of 0? We probably don't want
	// to allow the trailing dedup bytes to be
	// edited... but then 0 byte is "in general" allowed
	// on input (but not in UTF8).
	LevenshteinAutomata lev =
	new LevenshteinAutomata(
	ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
	subs.add(
	lev.toAutomaton(
	maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
	}
	}

	if (subs.isEmpty()) {
	// automaton is empty, there is no accepted paths through it
	return Automata.makeEmpty(); // matches nothing
	} else if (subs.size() == 1) {
	// no synonyms or anything: just a single path through the tokenstream
	return subs.get(0);
	} else {
	// multiple paths: this is really scary! is it slow?
	// maybe we should not do this and throw UOE?
	Automaton a = Operations.union(subs);
	// TODO: we could call toLevenshteinAutomata() before det?
	// this only happens if you have multiple paths anyway (e.g. synonyms)
	return Operations.determinize(a, maxDeterminizedStates);
	}
	}

	/** Get the maximum edit distance for fuzzy matches */
	public int getMaxEdits() {
	return maxEdits;
	}

	/** Return whether transpositions count as a single edit */
	public boolean isTranspositions() {
	return transpositions;
	}

	/** Get the length of a prefix where no edits are permitted */
	public int getNonFuzzyPrefix() {
	return nonFuzzyPrefix;
	}

	/** Get the minimum length of a term considered for matching */
	public int getMinFuzzyLength() {
	return minFuzzyLength;
	}

	/** Return true if lengths are measured in unicode code-points rather than bytes */
	public boolean isUnicodeAware() {
	return unicodeAware;
	}

	/** Get the maximum number of determinized states permitted */
	public int getMaxDeterminizedStates() {
	return maxDeterminizedStates;
	}

	@Override
	public String toString(String field) {
	StringBuilder buffer = new StringBuilder();
	if (!getField().equals(field)) {
	buffer.append(getField());
	buffer.append(":");
	}
	buffer.append(getTerm().text());
	buffer.append('*');
	buffer.append('~');
	buffer.append(Integer.toString(maxEdits));
	if (getFilter() != null) {
	buffer.append(",");
	buffer.append("filter");
	buffer.append(getFilter().toString());
	}
	return buffer.toString();
	}

	private static class FuzzyCompletionWeight extends CompletionWeight {
	private final Set<IntsRef> refs;
	int currentBoost = 0;

	public FuzzyCompletionWeight(CompletionQuery query, Automaton automaton, Set<IntsRef> refs)
	throws IOException {
	super(query, automaton);
	this.refs = refs;
	}

	@Override
	protected void setNextMatch(IntsRef pathPrefix) {
	// NOTE: the last letter of the matched prefix for the exact
	// match never makes it through here
	// so an exact match and a match with only a edit at the
	// end is boosted the same
	int maxCount = 0;
	for (IntsRef ref : refs) {
	int minLength = Math.min(ref.length, pathPrefix.length);
	int count = 0;
	for (int i = 0; i < minLength; i++) {
	if (ref.ints[i + ref.offset] == pathPrefix.ints[i + pathPrefix.offset]) {
	count++;
	} else {
	break;
	}
	}
	maxCount = Math.max(maxCount, count);
	}
	currentBoost = maxCount;
	}

	@Override
	protected float boost() {
	return currentBoost;
	}
	}
	}