/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.search.suggest.document;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.suggest.BitsProducer;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

/**
* A {@link CompletionQuery} that matches documents containing terms
* within an edit distance of the specified prefix.
* <p>
* This query boosts documents relative to how similar their indexed terms are to the
* provided prefix.
* <p>
* Example usage: querying an analyzed prefix within an edit distance of 1
* of 'subg' against the field 'suggest_field':
*
* <pre class="prettyprint">
* CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "subg"));
* </pre>
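* <p>
* To customize fuzziness, use the full constructor. For example (the values below are
* illustrative, not recommendations), to allow up to 2 edits counted in Unicode code
* points, with no edits permitted in the first character:
*
* <pre class="prettyprint">
* CompletionQuery query = new FuzzyCompletionQuery(analyzer,
*     new Term("suggest_field", "subg"), null,
*     2,      // maxEdits
*     true,   // transpositions
*     1,      // nonFuzzyPrefix
*     3,      // minFuzzyLength
*     true,   // unicodeAware
*     Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
* </pre>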
*
* @lucene.experimental
*/
public class FuzzyCompletionQuery extends PrefixCompletionQuery {
/**
* When true, the maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
* parameters are measured in Unicode code points (actual letters)
* instead of bytes.
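* For example, when enabled, a two-byte UTF-8 character such as 'é'
* counts as a single letter rather than two bytes.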
*/
public static final boolean DEFAULT_UNICODE_AWARE = false;
/**
* The default minimum length of the key before any edits are allowed.
*/
public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;
/**
* The default prefix length where edits are not allowed.
*/
public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
/**
* The default maximum number of edits for fuzzy
* suggestions.
*/
public static final int DEFAULT_MAX_EDITS = 1;
/**
* The default transposition value passed to {@link LevenshteinAutomata}
*/
public static final boolean DEFAULT_TRANSPOSITIONS = true;
private final int maxEdits;
private final boolean transpositions;
private final int nonFuzzyPrefix;
private final int minFuzzyLength;
private final boolean unicodeAware;
private final int determinizeWorkLimit;
/**
* Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer)}
* with no filter
*/
public FuzzyCompletionQuery(Analyzer analyzer, Term term) {
this(analyzer, term, null);
}
/**
* Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer,
* int, boolean, int, int, boolean, int)}
* with defaults for <code>maxEdits</code>, <code>transpositions</code>,
* <code>nonFuzzyPrefix</code>, <code>minFuzzyLength</code>,
* <code>unicodeAware</code> and <code>determinizeWorkLimit</code>
*
* See {@link #DEFAULT_MAX_EDITS}, {@link #DEFAULT_TRANSPOSITIONS},
* {@link #DEFAULT_NON_FUZZY_PREFIX}, {@link #DEFAULT_MIN_FUZZY_LENGTH},
* {@link #DEFAULT_UNICODE_AWARE} and {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT}
* for defaults
*/
public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) {
this(analyzer, term, filter, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, DEFAULT_NON_FUZZY_PREFIX,
DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT
);
}
/**
* Constructs an analyzed fuzzy prefix completion query
*
* @param analyzer used to analyze the provided {@link Term#text()}
* @param term query is run against {@link Term#field()} and {@link Term#text()}
* is analyzed with <code>analyzer</code>
* @param filter used to query on a subset of documents
* @param maxEdits maximum number of acceptable edits
* @param transpositions value passed to {@link LevenshteinAutomata}
* @param nonFuzzyPrefix prefix length where edits are not allowed
* @param minFuzzyLength minimum prefix length before any edits are allowed
* @param unicodeAware treat the prefix as Unicode code points rather than bytes
* @param determinizeWorkLimit maximum effort allowed to determinize the {@link
* LevenshteinAutomata}
*/
public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter, int maxEdits,
boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength,
boolean unicodeAware, int determinizeWorkLimit) {
super(analyzer, term, filter);
this.maxEdits = maxEdits;
this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength;
this.unicodeAware = unicodeAware;
this.determinizeWorkLimit = determinizeWorkLimit;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
final Automaton originalAutomata;
try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
originalAutomata = stream.toAutomaton(unicodeAware);
}
Set<IntsRef> refs = new HashSet<>();
Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
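// The suggester's FST operates on UTF-8 bytes, so an automaton built over
// Unicode code points must be converted back to bytes before intersection.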
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
utf8automaton = Operations.determinize(utf8automaton, determinizeWorkLimit);
automaton = utf8automaton;
}
// TODO Accumulating all refs is bad, because the resulting set may be very big.
// TODO Better iterate over automaton again inside FuzzyCompletionWeight?
return new FuzzyCompletionWeight(this, automaton, refs);
}
private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) {
List<Automaton> subs = new ArrayList<>();
FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
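// Enumerate every accepted path through the analyzed automaton (multiple
// paths arise from e.g. synonyms) and build a Levenshtein automaton per path.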
for (IntsRef string; (string = finiteStrings.next()) != null;) {
refs.add(IntsRef.deepCopyOf(string));
if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
subs.add(Automata.makeString(string.ints, string.offset, string.length));
} else {
int[] ints = new int[string.length - nonFuzzyPrefix];
System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
// TODO: maybe add alphaMin to LevenshteinAutomata,
// and pass 1 instead of 0? We probably don't want
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints,
unicodeAware ? Character.MAX_CODE_POINT : 255,
transpositions);
subs.add(lev.toAutomaton(maxEdits,
UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
}
}
if (subs.isEmpty()) {
// automaton is empty, there are no accepted paths through it
return Automata.makeEmpty(); // matches nothing
} else if (subs.size() == 1) {
// no synonyms or anything: just a single path through the tokenstream
return subs.get(0);
} else {
// multiple paths: this is really scary! is it slow?
// maybe we should not do this and throw UOE?
Automaton a = Operations.union(subs);
// TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
return Operations.determinize(a, determinizeWorkLimit);
}
}
/**
* Get the maximum edit distance for fuzzy matches
*/
public int getMaxEdits() {
return maxEdits;
}
/**
* Return whether transpositions count as a single edit
*/
public boolean isTranspositions() {
return transpositions;
}
/**
* Get the length of a prefix where no edits are permitted
*/
public int getNonFuzzyPrefix() {
return nonFuzzyPrefix;
}
/**
* Get the minimum length of a term considered for matching
*/
public int getMinFuzzyLength() {
return minFuzzyLength;
}
/**
* Return true if lengths are measured in unicode code-points rather than bytes
*/
public boolean isUnicodeAware() {
return unicodeAware;
}
/** Get the maximum effort to use determinizing */
public int getDeterminizeWorkLimit() {
return determinizeWorkLimit;
}
@Override
public String toString(String field) {
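// Renders e.g. "subg*~1", prefixed with "suggest_field:" when the field
// differs from the given one; any filter is appended after a comma.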
StringBuilder buffer = new StringBuilder();
if (!getField().equals(field)) {
buffer.append(getField());
buffer.append(":");
}
buffer.append(getTerm().text());
buffer.append('*');
buffer.append('~');
buffer.append(Integer.toString(maxEdits));
if (getFilter() != null) {
buffer.append(",");
buffer.append("filter");
buffer.append(getFilter().toString());
}
return buffer.toString();
}
private static class FuzzyCompletionWeight extends CompletionWeight {
private final Set<IntsRef> refs;
int currentBoost = 0;
public FuzzyCompletionWeight(CompletionQuery query, Automaton automaton, Set<IntsRef> refs) throws IOException {
super(query, automaton);
this.refs = refs;
}
@Override
protected void setNextMatch(IntsRef pathPrefix) {
// NOTE: the last letter of the matched prefix for the exact
// match never makes it through here,
// so an exact match and a match with only an edit at the
// end are boosted the same
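// The boost is the length of the longest common prefix between the matched
// path and any analyzed form of the query prefix.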
int maxCount = 0;
for (IntsRef ref : refs) {
int minLength = Math.min(ref.length, pathPrefix.length);
int count = 0;
for (int i = 0; i < minLength; i++) {
if (ref.ints[i + ref.offset] == pathPrefix.ints[i + pathPrefix.offset]) {
count++;
} else {
break;
}
}
maxCount = Math.max(maxCount, count);
}
currentBoost = maxCount;
}
@Override
protected float boost() {
return currentBoost;
}
}
}