| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.suggest.fst; |
| |
| import java.io.IOException; |
| import java.util.*; |
| |
| import org.apache.lucene.util.*; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.FST.Arc; |
| |
| /** |
| * Finite state automata based implementation of "autocomplete" functionality. |
| * |
| * @see FSTCompletionBuilder |
| * @lucene.experimental |
| */ |
| |
| // TODO: we could store exact weights as outputs from the FST (int4 encoded |
| // floats). This would provide exact outputs from this method and to some |
| // degree allowed post-sorting on a more fine-grained weight. |
| |
| // TODO: support for Analyzers (infix suggestions, synonyms?) |
| |
| public class FSTCompletion { |
| /** |
| * A single completion for a given key. |
| */ |
| public static final class Completion implements Comparable<Completion> { |
| /** UTF-8 bytes of the suggestion */ |
| public final BytesRef utf8; |
| /** source bucket (weight) of the suggestion */ |
| public final int bucket; |
| |
| Completion(BytesRef key, int bucket) { |
| this.utf8 = BytesRef.deepCopyOf(key); |
| this.bucket = bucket; |
| } |
| |
| @Override |
| public String toString() { |
| return utf8.utf8ToString() + "/" + bucket; |
| } |
| |
| /** @see BytesRef#compareTo(BytesRef) */ |
| @Override |
| public int compareTo(Completion o) { |
| return this.utf8.compareTo(o.utf8); |
| } |
| } |
| |
| /** |
| * Default number of buckets. |
| */ |
| public static final int DEFAULT_BUCKETS = 10; |
| |
| /** |
| * An empty result. Keep this an {@link ArrayList} to keep all the returned |
| * lists of single type (monomorphic calls). |
| */ |
| private static final ArrayList<Completion> EMPTY_RESULT = new ArrayList<>(); |
| |
| /** |
| * Finite state automaton encoding all the lookup terms. See class notes for |
| * details. |
| */ |
| private final FST<Object> automaton; |
| |
| /** |
| * An array of arcs leaving the root automaton state and encoding weights of |
| * all completions in their sub-trees. |
| */ |
| private final Arc<Object>[] rootArcs; |
| |
| /** |
| * @see #FSTCompletion(FST, boolean, boolean) |
| */ |
| private boolean exactFirst; |
| |
| /** |
| * @see #FSTCompletion(FST, boolean, boolean) |
| */ |
| private boolean higherWeightsFirst; |
| |
| /** |
| * Constructs an FSTCompletion, specifying higherWeightsFirst and exactFirst. |
| * @param automaton |
| * Automaton with completions. See {@link FSTCompletionBuilder}. |
| * @param higherWeightsFirst |
| * Return most popular suggestions first. This is the default |
| * behavior for this implementation. Setting it to <code>false</code> |
| * has no effect (use constant term weights to sort alphabetically |
| * only). |
| * @param exactFirst |
| * Find and push an exact match to the first position of the result |
| * list if found. |
| */ |
| @SuppressWarnings({"unchecked","rawtypes"}) |
| public FSTCompletion(FST<Object> automaton, boolean higherWeightsFirst, boolean exactFirst) { |
| this.automaton = automaton; |
| if (automaton != null) { |
| this.rootArcs = cacheRootArcs(automaton); |
| } else { |
| this.rootArcs = new Arc[0]; |
| } |
| this.higherWeightsFirst = higherWeightsFirst; |
| this.exactFirst = exactFirst; |
| } |
| |
| /** |
| * Defaults to higher weights first and exact first. |
| * @see #FSTCompletion(FST, boolean, boolean) |
| */ |
| public FSTCompletion(FST<Object> automaton) { |
| this(automaton, true, true); |
| } |
| |
| /** |
| * Cache the root node's output arcs starting with completions with the |
| * highest weights. |
| */ |
| @SuppressWarnings({"unchecked","rawtypes"}) |
| private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) { |
| try { |
| List<Arc<Object>> rootArcs = new ArrayList<>(); |
| Arc<Object> arc = automaton.getFirstArc(new Arc<>()); |
| FST.BytesReader fstReader = automaton.getBytesReader(); |
| automaton.readFirstTargetArc(arc, arc, fstReader); |
| while (true) { |
| rootArcs.add(new Arc<>().copyFrom(arc)); |
| if (arc.isLast()) break; |
| automaton.readNextArc(arc, fstReader); |
| } |
| |
| Collections.reverse(rootArcs); // we want highest weights first. |
| return rootArcs.toArray(new Arc[rootArcs.size()]); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| /** |
| * Returns the first exact match by traversing root arcs, starting from the |
| * arc <code>rootArcIndex</code>. |
| * |
| * @param rootArcIndex |
| * The first root arc index in {@link #rootArcs} to consider when |
| * matching. |
| * |
| * @param utf8 |
| * The sequence of utf8 bytes to follow. |
| * |
| * @return Returns the bucket number of the match or <code>-1</code> if no |
| * match was found. |
| */ |
| private int getExactMatchStartingFromRootArc( |
| int rootArcIndex, BytesRef utf8) { |
| // Get the UTF-8 bytes representation of the input key. |
| try { |
| final FST.Arc<Object> scratch = new FST.Arc<>(); |
| FST.BytesReader fstReader = automaton.getBytesReader(); |
| for (; rootArcIndex < rootArcs.length; rootArcIndex++) { |
| final FST.Arc<Object> rootArc = rootArcs[rootArcIndex]; |
| final FST.Arc<Object> arc = scratch.copyFrom(rootArc); |
| |
| // Descend into the automaton using the key as prefix. |
| if (descendWithPrefix(arc, utf8)) { |
| automaton.readFirstTargetArc(arc, arc, fstReader); |
| if (arc.label() == FST.END_LABEL) { |
| // Normalize prefix-encoded weight. |
| return rootArc.label(); |
| } |
| } |
| } |
| } catch (IOException e) { |
| // Should never happen, but anyway. |
| throw new RuntimeException(e); |
| } |
| |
| // No match. |
| return -1; |
| } |
| |
| /** |
| * Lookup suggestions to <code>key</code>. |
| * |
| * @param key |
| * The prefix to which suggestions should be sought. |
| * @param num |
| * At most this number of suggestions will be returned. |
| * @return Returns the suggestions, sorted by their approximated weight first |
| * (decreasing) and then alphabetically (UTF-8 codepoint order). |
| */ |
| public List<Completion> lookup(CharSequence key, int num) { |
| if (key.length() == 0 || automaton == null) { |
| return EMPTY_RESULT; |
| } |
| |
| try { |
| BytesRef keyUtf8 = new BytesRef(key); |
| if (!higherWeightsFirst && rootArcs.length > 1) { |
| // We could emit a warning here (?). An optimal strategy for |
| // alphabetically sorted |
| // suggestions would be to add them with a constant weight -- this saves |
| // unnecessary |
| // traversals and sorting. |
| return lookupSortedAlphabetically(keyUtf8, num); |
| } else { |
| return lookupSortedByWeight(keyUtf8, num, false); |
| } |
| } catch (IOException e) { |
| // Should never happen, but anyway. |
| throw new RuntimeException(e); |
| } |
| } |
| |
| /** |
| * Lookup suggestions sorted alphabetically <b>if weights are not |
| * constant</b>. This is a workaround: in general, use constant weights for |
| * alphabetically sorted result. |
| */ |
| private List<Completion> lookupSortedAlphabetically(BytesRef key, int num) |
| throws IOException { |
| // Greedily get num results from each weight branch. |
| List<Completion> res = lookupSortedByWeight(key, num, true); |
| |
| // Sort and trim. |
| Collections.sort(res); |
| if (res.size() > num) { |
| res = res.subList(0, num); |
| } |
| return res; |
| } |
| |
| /** |
| * Lookup suggestions sorted by weight (descending order). |
| * |
| * @param collectAll |
| * If <code>true</code>, the routine terminates immediately when |
| * <code>num</code> suggestions have been collected. If |
| * <code>false</code>, it will collect suggestions from all weight |
| * arcs (needed for {@link #lookupSortedAlphabetically}. |
| */ |
| private ArrayList<Completion> lookupSortedByWeight(BytesRef key, |
| int num, boolean collectAll) throws IOException { |
| // Don't overallocate the results buffers. This also serves the purpose of |
| // allowing the user of this class to request all matches using Integer.MAX_VALUE as |
| // the number of results. |
| final ArrayList<Completion> res = new ArrayList<>(Math.min(10, num)); |
| |
| final BytesRef output = BytesRef.deepCopyOf(key); |
| for (int i = 0; i < rootArcs.length; i++) { |
| final FST.Arc<Object> rootArc = rootArcs[i]; |
| final FST.Arc<Object> arc = new FST.Arc<>().copyFrom(rootArc); |
| |
| // Descend into the automaton using the key as prefix. |
| if (descendWithPrefix(arc, key)) { |
| // A subgraph starting from the current node has the completions |
| // of the key prefix. The arc we're at is the last key's byte, |
| // so we will collect it too. |
| output.length = key.length - 1; |
| if (collect(res, num, rootArc.label(), output, arc) && !collectAll) { |
| // We have enough suggestions to return immediately. Keep on looking |
| // for an |
| // exact match, if requested. |
| if (exactFirst) { |
| if (!checkExistingAndReorder(res, key)) { |
| int exactMatchBucket = getExactMatchStartingFromRootArc(i, key); |
| if (exactMatchBucket != -1) { |
| // Insert as the first result and truncate at num. |
| while (res.size() >= num) { |
| res.remove(res.size() - 1); |
| } |
| res.add(0, new Completion(key, exactMatchBucket)); |
| } |
| } |
| } |
| break; |
| } |
| } |
| } |
| return res; |
| } |
| |
| /** |
| * Checks if the list of |
| * {@link org.apache.lucene.search.suggest.Lookup.LookupResult}s already has a |
| * <code>key</code>. If so, reorders that |
| * {@link org.apache.lucene.search.suggest.Lookup.LookupResult} to the first |
| * position. |
| * |
| * @return Returns <code>true<code> if and only if <code>list</code> contained |
| * <code>key</code>. |
| */ |
| private boolean checkExistingAndReorder(ArrayList<Completion> list, BytesRef key) { |
| // We assume list does not have duplicates (because of how the FST is created). |
| for (int i = list.size(); --i >= 0;) { |
| if (key.equals(list.get(i).utf8)) { |
| // Key found. Unless already at i==0, remove it and push up front so |
| // that the ordering |
| // remains identical with the exception of the exact match. |
| list.add(0, list.remove(i)); |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * Descend along the path starting at <code>arc</code> and going through bytes |
| * in the argument. |
| * |
| * @param arc |
| * The starting arc. This argument is modified in-place. |
| * @param utf8 |
| * The term to descend along. |
| * @return If <code>true</code>, <code>arc</code> will be set to the arc |
| * matching last byte of <code>term</code>. <code>false</code> is |
| * returned if no such prefix exists. |
| */ |
| private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8) |
| throws IOException { |
| final int max = utf8.offset + utf8.length; |
| // Cannot save as instance var since multiple threads |
| // can use FSTCompletion at once... |
| final FST.BytesReader fstReader = automaton.getBytesReader(); |
| for (int i = utf8.offset; i < max; i++) { |
| if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc, fstReader) == null) { |
| // No matching prefixes, return an empty result. |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Recursive collect lookup results from the automaton subgraph starting at |
| * <code>arc</code>. |
| * |
| * @param num |
| * Maximum number of results needed (early termination). |
| */ |
| private boolean collect(List<Completion> res, int num, int bucket, |
| BytesRef output, Arc<Object> arc) throws IOException { |
| if (output.length == output.bytes.length) { |
| output.bytes = ArrayUtil.grow(output.bytes); |
| } |
| assert output.offset == 0; |
| output.bytes[output.length++] = (byte) arc.label(); |
| FST.BytesReader fstReader = automaton.getBytesReader(); |
| automaton.readFirstTargetArc(arc, arc, fstReader); |
| while (true) { |
| if (arc.label() == FST.END_LABEL) { |
| res.add(new Completion(output, bucket)); |
| if (res.size() >= num) return true; |
| } else { |
| int save = output.length; |
| if (collect(res, num, bucket, output, new Arc<>().copyFrom(arc))) { |
| return true; |
| } |
| output.length = save; |
| } |
| |
| if (arc.isLast()) { |
| break; |
| } |
| automaton.readNextArc(arc, fstReader); |
| } |
| return false; |
| } |
| |
| /** |
| * Returns the bucket count (discretization thresholds). |
| */ |
| public int getBucketCount() { |
| return rootArcs.length; |
| } |
| |
| /** |
| * Returns the bucket assigned to a given key (if found) or <code>-1</code> if |
| * no exact match exists. |
| */ |
| public int getBucket(CharSequence key) { |
| return getExactMatchStartingFromRootArc(0, new BytesRef(key)); |
| } |
| |
| /** |
| * Returns the internal automaton. |
| */ |
| public FST<Object> getFST() { |
| return automaton; |
| } |
| } |