lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.suggest.fst;

 import java.io.Closeable;
 import java.io.IOException;
 import java.util.Comparator;

 import org.apache.lucene.search.suggest.InMemorySorter;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.*;

 /**
  * Finite state automata based implementation of "autocomplete" functionality.
  *
  * <h2>Implementation details</h2>
  *
  * <p>
  * The construction step in {@link #finalize()} works as follows:
  * <ul>
  * <li>A set of input terms and their buckets is given.</li>
  * <li>All terms in the input are prefixed with a synthetic pseudo-character
  * (code) of the weight bucket the term fell into. For example a term
  * <code>abc</code> with a discretized weight equal '1' would become
  * <code>1abc</code>.</li>
  * <li>The terms are then sorted by their raw value of UTF-8 character values
  * (including the synthetic bucket code in front).</li>
  * <li>A finite state automaton ({@link FST}) is constructed from the input. The
  * root node has arcs labeled with all possible weights. We cache all these
  * arcs, highest-weight first.</li>
  * </ul>
  *
  * <p>
  * At runtime, in {@link FSTCompletion#lookup(CharSequence, int)},
  * the automaton is utilized as follows:
  * <ul>
  * <li>For each possible term weight encoded in the automaton (cached arcs from
  * the root above), starting with the highest one, we descend along the path of
  * the input key. If the key is not a prefix of a sequence in the automaton
  * (path ends prematurely), we exit immediately -- no completions.</li>
  * <li>Otherwise, we have found an internal automaton node that ends the key.
  * <b>The entire subautomaton (all paths) starting from this node form the key's
  * completions.</b> We start the traversal of this subautomaton. Every time we
  * reach a final state (arc), we add a single suggestion to the list of results
  * (the weight of this suggestion is constant and equal to the root path we
  * started from). The tricky part is that because automaton edges are sorted and
  * we scan depth-first, we can terminate the entire procedure as soon as we
  * collect enough suggestions the user requested.</li>
  * <li>In case the number of suggestions collected in the step above is still
  * insufficient, we proceed to the next (smaller) weight leaving the root node
  * and repeat the same algorithm again.</li>
  * </ul>
  *
  * <h2>Runtime behavior and performance characteristic</h2>
  *
  * The algorithm described above is optimized for finding suggestions to short
  * prefixes in a top-weights-first order. This is probably the most common use
  * case: it allows presenting suggestions early and sorts them by the global
  * frequency (and then alphabetically).
  *
  * <p>
  * If there is an exact match in the automaton, it is returned first on the
  * results list (even with by-weight sorting).
  *
  * <p>
  * Note that the maximum lookup time for <b>any prefix</b> is the time of
  * descending to the subtree, plus traversal of the subtree up to the number of
  * requested suggestions (because they are already presorted by weight on the
  * root level and alphabetically at any node level).
  *
  * <p>
  * To order alphabetically only (no ordering by priorities), use identical term
  * weights for all terms. Alphabetical suggestions are returned even if
  * non-constant weights are used, but the algorithm for doing this is
  * suboptimal.
  *
  * <p>
  * "alphabetically" in any of the documentation above indicates UTF-8
  * representation order, nothing else.
  *
  * <p>
  * <b>NOTE</b>: the FST file format is experimental and subject to suddenly
  * change, requiring you to rebuild the FST suggest index.
  *
  * @see FSTCompletion
  * @lucene.experimental
  */
 public class FSTCompletionBuilder {
   /**
    * Default number of buckets.
    */
   public static final int DEFAULT_BUCKETS = 10;

   /**
    * The number of separate buckets for weights (discretization). The more
    * buckets, the more fine-grained term weights (priorities) can be assigned.
    * The speed of lookup will not decrease for prefixes which have
    * highly-weighted completions (because these are filled-in first), but will
    * decrease significantly for low-weighted terms (but these should be
    * infrequent, so it is all right).
    *
    * <p>
    * The number of buckets must be within [1, 255] range.
    */
   private final int buckets;

   /**
    * Finite state automaton encoding all the lookup terms. See class notes for
    * details.
    */
   FST<Object> automaton;

   /**
    * FST construction require re-sorting the input. This is the class that
    * collects all the input entries, their weights and then provides sorted
    * order.
    */
   private final BytesRefSorter sorter;

   /**
    * Scratch buffer for {@link #add(BytesRef, int)}.
    */
   private final BytesRefBuilder scratch = new BytesRefBuilder();

   /**
    * Max tail sharing length.
    */
   private final int shareMaxTailLength;

   /**
    * Creates an {@link FSTCompletion} with default options: 10 buckets, exact match
    * promoted to first position and {@link InMemorySorter} with a comparator obtained from
    * {@link Comparator#naturalOrder()}.
    */
   public FSTCompletionBuilder() {
     this(DEFAULT_BUCKETS, new InMemorySorter(Comparator.naturalOrder()), Integer.MAX_VALUE);
   }

   /**
    * Creates an FSTCompletion with the specified options.
    * @param buckets
    *          The number of buckets for weight discretization. Buckets are used
    *          in {@link #add(BytesRef, int)} and must be smaller than the number
    *          given here.
    *
    * @param sorter
    *          {@link BytesRefSorter} used for re-sorting input for the automaton.
    *          For large inputs, use on-disk sorting implementations. The sorter
    *          is closed automatically in {@link #build()} if it implements
    *          {@link Closeable}.
    *
    * @param shareMaxTailLength
    *          Max shared suffix sharing length.
    *
    *          See the description of this parameter in {@link Builder}'s constructor.
    *          In general, for very large inputs you'll want to construct a non-minimal
    *          automaton which will be larger, but the construction will take far less ram.
    *          For minimal automata, set it to {@link Integer#MAX_VALUE}.
    */
   public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, int shareMaxTailLength) {
     if (buckets < 1 || buckets > 255) {
       throw new IllegalArgumentException("Buckets must be >= 1 and <= 255: "
           + buckets);
     }

     if (sorter == null) throw new IllegalArgumentException(
         "BytesRefSorter must not be null.");

     this.sorter = sorter;
     this.buckets = buckets;
     this.shareMaxTailLength = shareMaxTailLength;
   }

   /**
    * Appends a single suggestion and its weight to the internal buffers.
    *
    * @param utf8
    *          The suggestion (utf8 representation) to be added. The content is
    *          copied and the object can be reused.
    * @param bucket
    *          The bucket to place this suggestion in. Must be non-negative and
    *          smaller than the number of buckets passed in the constructor.
    *          Higher numbers indicate suggestions that should be presented
    *          before suggestions placed in smaller buckets.
    */
   public void add(BytesRef utf8, int bucket) throws IOException {
     if (bucket < 0 || bucket >= buckets) {
       throw new IllegalArgumentException(
           "Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
     }

     scratch.grow(utf8.length + 10);
     scratch.clear();
     scratch.append((byte) bucket);
     scratch.append(utf8);
     sorter.add(scratch.get());
   }

   /**
    * Builds the final automaton from a list of added entries. This method may
    * take a longer while as it needs to build the automaton.
    */
   public FSTCompletion build() throws IOException {
     this.automaton = buildAutomaton(sorter);

     if (sorter instanceof Closeable) {
       ((Closeable) sorter).close();
     }

     return new FSTCompletion(automaton);
   }

   /**
    * Builds the final automaton from a list of entries.
    */
   private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
     // Build the automaton.
     final Outputs<Object> outputs = NoOutputs.getSingleton();
     final Object empty = outputs.getNoOutput();
     final Builder<Object> builder = new Builder<>(
         FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
         shareMaxTailLength, outputs, true, 15);

     BytesRefBuilder scratch = new BytesRefBuilder();
     BytesRef entry;
     final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
     int count = 0;
     BytesRefIterator iter = sorter.iterator();
     while((entry = iter.next()) != null) {
       count++;
       if (scratch.get().compareTo(entry) != 0) {
         builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
         scratch.copyBytes(entry);
       }
     }

     return count == 0 ? null : builder.finish();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.suggest.fst;

	import java.io.Closeable;
	import java.io.IOException;
	import java.util.Comparator;

	import org.apache.lucene.search.suggest.InMemorySorter;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.BytesRefBuilder;
	import org.apache.lucene.util.BytesRefIterator;
	import org.apache.lucene.util.IntsRefBuilder;
	import org.apache.lucene.util.fst.*;

	/**
	* Finite state automata based implementation of "autocomplete" functionality.
	*
	* <h2>Implementation details</h2>
	*
	* <p>
	* The construction step in {@link #finalize()} works as follows:
	* <ul>
	* <li>A set of input terms and their buckets is given.</li>
	* <li>All terms in the input are prefixed with a synthetic pseudo-character
	* (code) of the weight bucket the term fell into. For example a term
	* <code>abc</code> with a discretized weight equal '1' would become
	* <code>1abc</code>.</li>
	* <li>The terms are then sorted by their raw value of UTF-8 character values
	* (including the synthetic bucket code in front).</li>
	* <li>A finite state automaton ({@link FST}) is constructed from the input. The
	* root node has arcs labeled with all possible weights. We cache all these
	* arcs, highest-weight first.</li>
	* </ul>
	*
	* <p>
	* At runtime, in {@link FSTCompletion#lookup(CharSequence, int)},
	* the automaton is utilized as follows:
	* <ul>
	* <li>For each possible term weight encoded in the automaton (cached arcs from
	* the root above), starting with the highest one, we descend along the path of
	* the input key. If the key is not a prefix of a sequence in the automaton
	* (path ends prematurely), we exit immediately -- no completions.</li>
	* <li>Otherwise, we have found an internal automaton node that ends the key.
	* <b>The entire subautomaton (all paths) starting from this node form the key's
	* completions.</b> We start the traversal of this subautomaton. Every time we
	* reach a final state (arc), we add a single suggestion to the list of results
	* (the weight of this suggestion is constant and equal to the root path we
	* started from). The tricky part is that because automaton edges are sorted and
	* we scan depth-first, we can terminate the entire procedure as soon as we
	* collect enough suggestions the user requested.</li>
	* <li>In case the number of suggestions collected in the step above is still
	* insufficient, we proceed to the next (smaller) weight leaving the root node
	* and repeat the same algorithm again.</li>
	* </ul>
	*
	* <h2>Runtime behavior and performance characteristic</h2>
	*
	* The algorithm described above is optimized for finding suggestions to short
	* prefixes in a top-weights-first order. This is probably the most common use
	* case: it allows presenting suggestions early and sorts them by the global
	* frequency (and then alphabetically).
	*
	* <p>
	* If there is an exact match in the automaton, it is returned first on the
	* results list (even with by-weight sorting).
	*
	* <p>
	* Note that the maximum lookup time for <b>any prefix</b> is the time of
	* descending to the subtree, plus traversal of the subtree up to the number of
	* requested suggestions (because they are already presorted by weight on the
	* root level and alphabetically at any node level).
	*
	* <p>
	* To order alphabetically only (no ordering by priorities), use identical term
	* weights for all terms. Alphabetical suggestions are returned even if
	* non-constant weights are used, but the algorithm for doing this is
	* suboptimal.
	*
	* <p>
	* "alphabetically" in any of the documentation above indicates UTF-8
	* representation order, nothing else.
	*
	* <p>
	* <b>NOTE</b>: the FST file format is experimental and subject to suddenly
	* change, requiring you to rebuild the FST suggest index.
	*
	* @see FSTCompletion
	* @lucene.experimental
	*/
	public class FSTCompletionBuilder {
	/**
	* Default number of buckets.
	*/
	public static final int DEFAULT_BUCKETS = 10;

	/**
	* The number of separate buckets for weights (discretization). The more
	* buckets, the more fine-grained term weights (priorities) can be assigned.
	* The speed of lookup will not decrease for prefixes which have
	* highly-weighted completions (because these are filled-in first), but will
	* decrease significantly for low-weighted terms (but these should be
	* infrequent, so it is all right).
	*
	* <p>
	* The number of buckets must be within [1, 255] range.
	*/
	private final int buckets;

	/**
	* Finite state automaton encoding all the lookup terms. See class notes for
	* details.
	*/
	FST<Object> automaton;

	/**
	* FST construction require re-sorting the input. This is the class that
	* collects all the input entries, their weights and then provides sorted
	* order.
	*/
	private final BytesRefSorter sorter;

	/**
	* Scratch buffer for {@link #add(BytesRef, int)}.
	*/
	private final BytesRefBuilder scratch = new BytesRefBuilder();

	/**
	* Max tail sharing length.
	*/
	private final int shareMaxTailLength;

	/**
	* Creates an {@link FSTCompletion} with default options: 10 buckets, exact match
	* promoted to first position and {@link InMemorySorter} with a comparator obtained from
	* {@link Comparator#naturalOrder()}.
	*/
	public FSTCompletionBuilder() {
	this(DEFAULT_BUCKETS, new InMemorySorter(Comparator.naturalOrder()), Integer.MAX_VALUE);
	}

	/**
	* Creates an FSTCompletion with the specified options.
	* @param buckets
	* The number of buckets for weight discretization. Buckets are used
	* in {@link #add(BytesRef, int)} and must be smaller than the number
	* given here.
	*
	* @param sorter
	* {@link BytesRefSorter} used for re-sorting input for the automaton.
	* For large inputs, use on-disk sorting implementations. The sorter
	* is closed automatically in {@link #build()} if it implements
	* {@link Closeable}.
	*
	* @param shareMaxTailLength
	* Max shared suffix sharing length.
	*
	* See the description of this parameter in {@link Builder}'s constructor.
	* In general, for very large inputs you'll want to construct a non-minimal
	* automaton which will be larger, but the construction will take far less ram.
	* For minimal automata, set it to {@link Integer#MAX_VALUE}.
	*/
	public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, int shareMaxTailLength) {
	if (buckets < 1 \|\| buckets > 255) {
	throw new IllegalArgumentException("Buckets must be >= 1 and <= 255: "
	+ buckets);
	}

	if (sorter == null) throw new IllegalArgumentException(
	"BytesRefSorter must not be null.");

	this.sorter = sorter;
	this.buckets = buckets;
	this.shareMaxTailLength = shareMaxTailLength;
	}

	/**
	* Appends a single suggestion and its weight to the internal buffers.
	*
	* @param utf8
	* The suggestion (utf8 representation) to be added. The content is
	* copied and the object can be reused.
	* @param bucket
	* The bucket to place this suggestion in. Must be non-negative and
	* smaller than the number of buckets passed in the constructor.
	* Higher numbers indicate suggestions that should be presented
	* before suggestions placed in smaller buckets.
	*/
	public void add(BytesRef utf8, int bucket) throws IOException {
	if (bucket < 0 \|\| bucket >= buckets) {
	throw new IllegalArgumentException(
	"Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
	}

	scratch.grow(utf8.length + 10);
	scratch.clear();
	scratch.append((byte) bucket);
	scratch.append(utf8);
	sorter.add(scratch.get());
	}

	/**
	* Builds the final automaton from a list of added entries. This method may
	* take a longer while as it needs to build the automaton.
	*/
	public FSTCompletion build() throws IOException {
	this.automaton = buildAutomaton(sorter);

	if (sorter instanceof Closeable) {
	((Closeable) sorter).close();
	}

	return new FSTCompletion(automaton);
	}

	/**
	* Builds the final automaton from a list of entries.
	*/
	private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
	// Build the automaton.
	final Outputs<Object> outputs = NoOutputs.getSingleton();
	final Object empty = outputs.getNoOutput();
	final Builder<Object> builder = new Builder<>(
	FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
	shareMaxTailLength, outputs, true, 15);

	BytesRefBuilder scratch = new BytesRefBuilder();
	BytesRef entry;
	final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
	int count = 0;
	BytesRefIterator iter = sorter.iterator();
	while((entry = iter.next()) != null) {
	count++;
	if (scratch.get().compareTo(entry) != 0) {
	builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
	scratch.copyBytes(entry);
	}
	}

	return count == 0 ? null : builder.finish();
	}
	}