lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.suggest.fst;

 import java.io.Closeable;
 import java.io.IOException;
 import java.util.Comparator;
 import org.apache.lucene.search.suggest.InMemorySorter;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.*;

 /**
  * Finite state automata based implementation of "autocomplete" functionality.
  *
  * <h2>Implementation details</h2>
  *
  * <p>The construction step in {@link #finalize()} works as follows:
  *
  * <ul>
  *   <li>A set of input terms and their buckets is given.
  *   <li>All terms in the input are prefixed with a synthetic pseudo-character (code) of the weight
  *       bucket the term fell into. For example a term <code>abc</code> with a discretized weight
  *       equal '1' would become <code>1abc</code>.
  *   <li>The terms are then sorted by their raw value of UTF-8 character values (including the
  *       synthetic bucket code in front).
  *   <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
  *       arcs labeled with all possible weights. We cache all these arcs, highest-weight first.
  * </ul>
  *
  * <p>At runtime, in {@link FSTCompletion#lookup(CharSequence, int)}, the automaton is utilized as
  * follows:
  *
  * <ul>
  *   <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
  *       starting with the highest one, we descend along the path of the input key. If the key is
  *       not a prefix of a sequence in the automaton (path ends prematurely), we exit immediately --
  *       no completions.
  *   <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
  *       subautomaton (all paths) starting from this node form the key's completions.</b> We start
  *       the traversal of this subautomaton. Every time we reach a final state (arc), we add a
  *       single suggestion to the list of results (the weight of this suggestion is constant and
  *       equal to the root path we started from). The tricky part is that because automaton edges
  *       are sorted and we scan depth-first, we can terminate the entire procedure as soon as we
  *       collect enough suggestions the user requested.
  *   <li>In case the number of suggestions collected in the step above is still insufficient, we
  *       proceed to the next (smaller) weight leaving the root node and repeat the same algorithm
  *       again.
  * </ul>
  *
  * <h2>Runtime behavior and performance characteristic</h2>
  *
  * The algorithm described above is optimized for finding suggestions to short prefixes in a
  * top-weights-first order. This is probably the most common use case: it allows presenting
  * suggestions early and sorts them by the global frequency (and then alphabetically).
  *
  * <p>If there is an exact match in the automaton, it is returned first on the results list (even
  * with by-weight sorting).
  *
  * <p>Note that the maximum lookup time for <b>any prefix</b> is the time of descending to the
  * subtree, plus traversal of the subtree up to the number of requested suggestions (because they
  * are already presorted by weight on the root level and alphabetically at any node level).
  *
  * <p>To order alphabetically only (no ordering by priorities), use identical term weights for all
  * terms. Alphabetical suggestions are returned even if non-constant weights are used, but the
  * algorithm for doing this is suboptimal.
  *
  * <p>"alphabetically" in any of the documentation above indicates UTF-8 representation order,
  * nothing else.
  *
  * <p><b>NOTE</b>: the FST file format is experimental and subject to suddenly change, requiring you
  * to rebuild the FST suggest index.
  *
  * @see FSTCompletion
  * @lucene.experimental
  */
 public class FSTCompletionBuilder {
   /** Default number of buckets. */
   public static final int DEFAULT_BUCKETS = 10;

   /**
    * The number of separate buckets for weights (discretization). The more buckets, the more
    * fine-grained term weights (priorities) can be assigned. The speed of lookup will not decrease
    * for prefixes which have highly-weighted completions (because these are filled-in first), but
    * will decrease significantly for low-weighted terms (but these should be infrequent, so it is
    * all right).
    *
    * <p>The number of buckets must be within [1, 255] range.
    */
   private final int buckets;

   /** Finite state automaton encoding all the lookup terms. See class notes for details. */
   FST<Object> automaton;

   /**
    * FST construction require re-sorting the input. This is the class that collects all the input
    * entries, their weights and then provides sorted order.
    */
   private final BytesRefSorter sorter;

   /** Scratch buffer for {@link #add(BytesRef, int)}. */
   private final BytesRefBuilder scratch = new BytesRefBuilder();

   /** Max tail sharing length. */
   private final int shareMaxTailLength;

   /**
    * Creates an {@link FSTCompletion} with default options: 10 buckets, exact match promoted to
    * first position and {@link InMemorySorter} with a comparator obtained from {@link
    * Comparator#naturalOrder()}.
    */
   public FSTCompletionBuilder() {
     this(DEFAULT_BUCKETS, new InMemorySorter(Comparator.naturalOrder()), Integer.MAX_VALUE);
   }

   /**
    * Creates an FSTCompletion with the specified options.
    *
    * @param buckets The number of buckets for weight discretization. Buckets are used in {@link
    *     #add(BytesRef, int)} and must be smaller than the number given here.
    * @param sorter {@link BytesRefSorter} used for re-sorting input for the automaton. For large
    *     inputs, use on-disk sorting implementations. The sorter is closed automatically in {@link
    *     #build()} if it implements {@link Closeable}.
    * @param shareMaxTailLength Max shared suffix sharing length.
    *     <p>See the description of this parameter in {@link
    *     org.apache.lucene.util.fst.FSTCompiler.Builder}. In general, for very large inputs you'll
    *     want to construct a non-minimal automaton which will be larger, but the construction will
    *     take far less ram. For minimal automata, set it to {@link Integer#MAX_VALUE}.
    */
   public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, int shareMaxTailLength) {
     if (buckets < 1 || buckets > 255) {
       throw new IllegalArgumentException("Buckets must be >= 1 and <= 255: " + buckets);
     }

     if (sorter == null) {
       throw new IllegalArgumentException("BytesRefSorter must not be null.");
     }

     this.sorter = sorter;
     this.buckets = buckets;
     this.shareMaxTailLength = shareMaxTailLength;
   }

   /**
    * Appends a single suggestion and its weight to the internal buffers.
    *
    * @param utf8 The suggestion (utf8 representation) to be added. The content is copied and the
    *     object can be reused.
    * @param bucket The bucket to place this suggestion in. Must be non-negative and smaller than the
    *     number of buckets passed in the constructor. Higher numbers indicate suggestions that
    *     should be presented before suggestions placed in smaller buckets.
    */
   public void add(BytesRef utf8, int bucket) throws IOException {
     if (bucket < 0 || bucket >= buckets) {
       throw new IllegalArgumentException(
           "Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
     }

     scratch.grow(utf8.length + 10);
     scratch.clear();
     scratch.append((byte) bucket);
     scratch.append(utf8);
     sorter.add(scratch.get());
   }

   /**
    * Builds the final automaton from a list of added entries. This method may take a longer while as
    * it needs to build the automaton.
    */
   public FSTCompletion build() throws IOException {
     this.automaton = buildAutomaton(sorter);

     if (sorter instanceof Closeable) {
       ((Closeable) sorter).close();
     }

     return new FSTCompletion(automaton);
   }

   /** Builds the final automaton from a list of entries. */
   private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
     // Build the automaton.
     final Outputs<Object> outputs = NoOutputs.getSingleton();
     final Object empty = outputs.getNoOutput();
     final FSTCompiler<Object> fstCompiler =
         new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
             .shareMaxTailLength(shareMaxTailLength)
             .build();

     BytesRefBuilder scratch = new BytesRefBuilder();
     BytesRef entry;
     final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
     int count = 0;
     BytesRefIterator iter = sorter.iterator();
     while ((entry = iter.next()) != null) {
       count++;
       if (scratch.get().compareTo(entry) != 0) {
         fstCompiler.add(Util.toIntsRef(entry, scratchIntsRef), empty);
         scratch.copyBytes(entry);
       }
     }

     return count == 0 ? null : fstCompiler.compile();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.suggest.fst;

	import java.io.Closeable;
	import java.io.IOException;
	import java.util.Comparator;
	import org.apache.lucene.search.suggest.InMemorySorter;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.BytesRefBuilder;
	import org.apache.lucene.util.BytesRefIterator;
	import org.apache.lucene.util.IntsRefBuilder;
	import org.apache.lucene.util.fst.*;

	/**
	* Finite state automata based implementation of "autocomplete" functionality.
	*
	* <h2>Implementation details</h2>
	*
	* <p>The construction step in {@link #finalize()} works as follows:
	*
	* <ul>
	* <li>A set of input terms and their buckets is given.
	* <li>All terms in the input are prefixed with a synthetic pseudo-character (code) of the weight
	* bucket the term fell into. For example a term <code>abc</code> with a discretized weight
	* equal '1' would become <code>1abc</code>.
	* <li>The terms are then sorted by their raw value of UTF-8 character values (including the
	* synthetic bucket code in front).
	* <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
	* arcs labeled with all possible weights. We cache all these arcs, highest-weight first.
	* </ul>
	*
	* <p>At runtime, in {@link FSTCompletion#lookup(CharSequence, int)}, the automaton is utilized as
	* follows:
	*
	* <ul>
	* <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
	* starting with the highest one, we descend along the path of the input key. If the key is
	* not a prefix of a sequence in the automaton (path ends prematurely), we exit immediately --
	* no completions.
	* <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
	* subautomaton (all paths) starting from this node form the key's completions.</b> We start
	* the traversal of this subautomaton. Every time we reach a final state (arc), we add a
	* single suggestion to the list of results (the weight of this suggestion is constant and
	* equal to the root path we started from). The tricky part is that because automaton edges
	* are sorted and we scan depth-first, we can terminate the entire procedure as soon as we
	* collect enough suggestions the user requested.
	* <li>In case the number of suggestions collected in the step above is still insufficient, we
	* proceed to the next (smaller) weight leaving the root node and repeat the same algorithm
	* again.
	* </ul>
	*
	* <h2>Runtime behavior and performance characteristic</h2>
	*
	* The algorithm described above is optimized for finding suggestions to short prefixes in a
	* top-weights-first order. This is probably the most common use case: it allows presenting
	* suggestions early and sorts them by the global frequency (and then alphabetically).
	*
	* <p>If there is an exact match in the automaton, it is returned first on the results list (even
	* with by-weight sorting).
	*
	* <p>Note that the maximum lookup time for <b>any prefix</b> is the time of descending to the
	* subtree, plus traversal of the subtree up to the number of requested suggestions (because they
	* are already presorted by weight on the root level and alphabetically at any node level).
	*
	* <p>To order alphabetically only (no ordering by priorities), use identical term weights for all
	* terms. Alphabetical suggestions are returned even if non-constant weights are used, but the
	* algorithm for doing this is suboptimal.
	*
	* <p>"alphabetically" in any of the documentation above indicates UTF-8 representation order,
	* nothing else.
	*
	* <p><b>NOTE</b>: the FST file format is experimental and subject to suddenly change, requiring you
	* to rebuild the FST suggest index.
	*
	* @see FSTCompletion
	* @lucene.experimental
	*/
	public class FSTCompletionBuilder {
	/** Default number of buckets. */
	public static final int DEFAULT_BUCKETS = 10;

	/**
	* The number of separate buckets for weights (discretization). The more buckets, the more
	* fine-grained term weights (priorities) can be assigned. The speed of lookup will not decrease
	* for prefixes which have highly-weighted completions (because these are filled-in first), but
	* will decrease significantly for low-weighted terms (but these should be infrequent, so it is
	* all right).
	*
	* <p>The number of buckets must be within [1, 255] range.
	*/
	private final int buckets;

	/** Finite state automaton encoding all the lookup terms. See class notes for details. */
	FST<Object> automaton;

	/**
	* FST construction require re-sorting the input. This is the class that collects all the input
	* entries, their weights and then provides sorted order.
	*/
	private final BytesRefSorter sorter;

	/** Scratch buffer for {@link #add(BytesRef, int)}. */
	private final BytesRefBuilder scratch = new BytesRefBuilder();

	/** Max tail sharing length. */
	private final int shareMaxTailLength;

	/**
	* Creates an {@link FSTCompletion} with default options: 10 buckets, exact match promoted to
	* first position and {@link InMemorySorter} with a comparator obtained from {@link
	* Comparator#naturalOrder()}.
	*/
	public FSTCompletionBuilder() {
	this(DEFAULT_BUCKETS, new InMemorySorter(Comparator.naturalOrder()), Integer.MAX_VALUE);
	}

	/**
	* Creates an FSTCompletion with the specified options.
	*
	* @param buckets The number of buckets for weight discretization. Buckets are used in {@link
	* #add(BytesRef, int)} and must be smaller than the number given here.
	* @param sorter {@link BytesRefSorter} used for re-sorting input for the automaton. For large
	* inputs, use on-disk sorting implementations. The sorter is closed automatically in {@link
	* #build()} if it implements {@link Closeable}.
	* @param shareMaxTailLength Max shared suffix sharing length.
	* <p>See the description of this parameter in {@link
	* org.apache.lucene.util.fst.FSTCompiler.Builder}. In general, for very large inputs you'll
	* want to construct a non-minimal automaton which will be larger, but the construction will
	* take far less ram. For minimal automata, set it to {@link Integer#MAX_VALUE}.
	*/
	public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, int shareMaxTailLength) {
	if (buckets < 1 \|\| buckets > 255) {
	throw new IllegalArgumentException("Buckets must be >= 1 and <= 255: " + buckets);
	}

	if (sorter == null) {
	throw new IllegalArgumentException("BytesRefSorter must not be null.");
	}

	this.sorter = sorter;
	this.buckets = buckets;
	this.shareMaxTailLength = shareMaxTailLength;
	}

	/**
	* Appends a single suggestion and its weight to the internal buffers.
	*
	* @param utf8 The suggestion (utf8 representation) to be added. The content is copied and the
	* object can be reused.
	* @param bucket The bucket to place this suggestion in. Must be non-negative and smaller than the
	* number of buckets passed in the constructor. Higher numbers indicate suggestions that
	* should be presented before suggestions placed in smaller buckets.
	*/
	public void add(BytesRef utf8, int bucket) throws IOException {
	if (bucket < 0 \|\| bucket >= buckets) {
	throw new IllegalArgumentException(
	"Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
	}

	scratch.grow(utf8.length + 10);
	scratch.clear();
	scratch.append((byte) bucket);
	scratch.append(utf8);
	sorter.add(scratch.get());
	}

	/**
	* Builds the final automaton from a list of added entries. This method may take a longer while as
	* it needs to build the automaton.
	*/
	public FSTCompletion build() throws IOException {
	this.automaton = buildAutomaton(sorter);

	if (sorter instanceof Closeable) {
	((Closeable) sorter).close();
	}

	return new FSTCompletion(automaton);
	}

	/** Builds the final automaton from a list of entries. */
	private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
	// Build the automaton.
	final Outputs<Object> outputs = NoOutputs.getSingleton();
	final Object empty = outputs.getNoOutput();
	final FSTCompiler<Object> fstCompiler =
	new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
	.shareMaxTailLength(shareMaxTailLength)
	.build();

	BytesRefBuilder scratch = new BytesRefBuilder();
	BytesRef entry;
	final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
	int count = 0;
	BytesRefIterator iter = sorter.iterator();
	while ((entry = iter.next()) != null) {
	count++;
	if (scratch.get().compareTo(entry) != 0) {
	fstCompiler.add(Util.toIntsRef(entry, scratchIntsRef), empty);
	scratch.copyBytes(entry);
	}
	}

	return count == 0 ? null : fstCompiler.compile();
	}
	}