| package org.apache.lucene.search.suggest.fst; |
| |
| import java.io.BufferedInputStream; |
| import java.io.BufferedOutputStream; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.FileOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.OutputStream; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.List; |
| |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.fst.Builder; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.FST.Arc; |
| import org.apache.lucene.util.fst.NoOutputs; |
| import org.apache.lucene.util.fst.Outputs; |
| |
| import org.apache.lucene.search.suggest.Lookup; |
| import org.apache.lucene.search.suggest.tst.TSTLookup; |
| import org.apache.lucene.search.spell.TermFreqIterator; |
| import org.apache.lucene.store.InputStreamDataInput; |
| import org.apache.lucene.store.OutputStreamDataOutput; |
| |
| /** |
| * Finite state automata based implementation of {@link Lookup} query |
| * suggestion/ autocomplete interface. |
| * |
| * <h2>Implementation details</h2> |
| * |
| * <p>The construction step in {@link #build(TermFreqIterator)} works as follows: |
| * <ul> |
| * <li>A set of input terms (String) and weights (float) is given.</li> |
| * <li>The range of weights is determined and then all weights are discretized into a fixed set |
| * of values ({@link #buckets}). |
| * Note that this means that minor changes in weights may be lost during automaton construction. |
| * In general, this is not a big problem because the "priorities" of completions can be split |
| * into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal). |
| * If you need exact, fine-grained weights, use {@link TSTLookup} instead.<li> |
| * <li>All terms in the input are preprended with a synthetic pseudo-character being the weight |
| * of that term. For example a term <code>abc</code> with a discretized weight equal '1' would |
| * become <code>1abc</code>.</li> |
| * <li>The terms are sorted by their raw value of utf16 character values (including the synthetic |
| * term in front).</li> |
| * <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has |
| * arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li> |
| * </ul> |
| * |
| * <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows: |
| * <ul> |
| * <li>For each possible term weight encoded in the automaton (cached arcs from the root above), |
| * starting with the highest one, we descend along the path of the input key. If the key is not |
| * a prefix of a sequence in the automaton (path ends prematurely), we exit immediately. |
| * No completions. |
| * <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire |
| * subautomaton (all paths) starting from this node form the key's completions.</b> We start |
| * the traversal of this subautomaton. Every time we reach a final state (arc), we add a single |
| * suggestion to the list of results (the weight of this suggestion is constant and equal to the |
| * root path we started from). The tricky part is that because automaton edges are sorted and |
| * we scan depth-first, we can terminate the entire procedure as soon as we collect enough |
| * suggestions the user requested. |
| * <li>In case the number of suggestions collected in the step above is still insufficient, |
| * we proceed to the next (smaller) weight leaving the root node and repeat the same |
| * algorithm again. |
| * </li> |
| * </ul> |
| * |
| * <h2>Runtime behavior and performance characteristic</h2> |
| * |
| * <p>The algorithm described above is optimized for finding suggestions to short prefixes |
| * in a top-weights-first order. This is probably the most common use case: it allows |
| * presenting suggestions early and sorts them by the global frequency (and then alphabetically). |
| * |
| * <p>If there is an exact match in the automaton, it is returned first on the results |
| * list (even with by-weight sorting). |
| * |
| * <p>Note that the maximum lookup time for <b>any prefix</b> |
| * is the time of descending to the subtree, plus traversal of the subtree up to the number |
| * of requested suggestions (because they are already presorted by weight on the root level |
| * and alphabetically at any node level). |
| * |
| * <p>To order alphabetically only (no ordering by priorities), use identical term weights |
| * for all terms. Alphabetical suggestions are returned even if non-constant weights are |
| * used, but the algorithm for doing this is suboptimal. |
| * |
| * <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order, |
| * nothing else. |
| */ |
| public class FSTLookup extends Lookup { |
| |
| public FSTLookup() { |
| this(10, true); |
| } |
| |
| public FSTLookup(int buckets, boolean exactMatchFirst) { |
| this.buckets = buckets; |
| this.exactMatchFirst = exactMatchFirst; |
| } |
| |
| /** A structure for a single entry (for sorting/ preprocessing). */ |
| private static class Entry { |
| char [] term; |
| float weight; |
| |
| public Entry(char [] term, float freq) { |
| this.term = term; |
| this.weight = freq; |
| } |
| } |
| |
| /** Serialized automaton file name (storage). */ |
| public static final String FILENAME = "fst.dat"; |
| |
| /** An empty result. */ |
| private static final List<LookupResult> EMPTY_RESULT = Collections.emptyList(); |
| |
| /** |
| * The number of separate buckets for weights (discretization). The more buckets, |
| * the more fine-grained term weights (priorities) can be assigned. The speed of lookup |
| * will not decrease for prefixes which have highly-weighted completions (because these |
| * are filled-in first), but will decrease significantly for low-weighted terms (but |
| * these should be infrequent, so it is all right). |
| * |
| * <p>The number of buckets must be within [1, 255] range. |
| */ |
| private final int buckets; |
| |
| /** |
| * If <code>true</code>, exact suggestions are returned first, even if they are prefixes |
| * of other strings in the automaton (possibly with larger weights). |
| */ |
| private final boolean exactMatchFirst; |
| |
| /** |
| * Finite state automaton encoding all the lookup terms. See class |
| * notes for details. |
| */ |
| private FST<Object> automaton; |
| |
| /** |
| * An array of arcs leaving the root automaton state and encoding weights of all |
| * completions in their sub-trees. |
| */ |
| private Arc<Object> [] rootArcs; |
| |
| /* */ |
| @Override |
| public void build(TermFreqIterator tfit) throws IOException { |
| // Buffer the input because we will need it twice: for calculating |
| // weights distribution and for the actual automata building. |
| List<Entry> entries = new ArrayList<Entry>(); |
| while (tfit.hasNext()) { |
| String term = tfit.next(); |
| char [] termChars = new char [term.length() + 1]; // add padding for weight. |
| for (int i = 0; i < term.length(); i++) |
| termChars[i + 1] = term.charAt(i); |
| entries.add(new Entry(termChars, tfit.freq())); |
| } |
| |
| // Distribute weights into at most N buckets. This is a form of discretization to |
| // limit the number of possible weights so that they can be efficiently encoded in the |
| // automaton. |
| // |
| // It is assumed the distribution of weights is _linear_ so proportional division |
| // of [min, max] range will be enough here. Other approaches could be to sort |
| // weights and divide into proportional ranges. |
| if (entries.size() > 0) { |
| redistributeWeightsProportionalMinMax(entries, buckets); |
| encodeWeightPrefix(entries); |
| } |
| |
| // Build the automaton (includes input sorting) and cache root arcs in order from the highest, |
| // to the lowest weight. |
| this.automaton = buildAutomaton(entries); |
| cacheRootArcs(); |
| } |
| |
| /** |
| * Cache the root node's output arcs starting with completions with the highest weights. |
| */ |
| @SuppressWarnings("unchecked") |
| private void cacheRootArcs() throws IOException { |
| if (automaton != null) { |
| List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>(); |
| Arc<Object> arc = automaton.getFirstArc(new Arc<Object>()); |
| automaton.readFirstTargetArc(arc, arc); |
| while (true) { |
| rootArcs.add(new Arc<Object>().copyFrom(arc)); |
| if (arc.isLast()) |
| break; |
| automaton.readNextArc(arc); |
| } |
| |
| Collections.reverse(rootArcs); // we want highest weights first. |
| this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]); |
| } |
| } |
| |
| /** |
| * Not implemented. |
| */ |
| @Override |
| public boolean add(String key, Object value) { |
| // This implementation does not support ad-hoc additions (all input |
| // must be sorted for the builder). |
| return false; |
| } |
| |
| /** |
| * Get the (approximated) weight of a single key (if there is a perfect match |
| * for it in the automaton). |
| * |
| * @return Returns the approximated weight of the input key or <code>null</code> |
| * if not found. |
| */ |
| @Override |
| public Float get(String key) { |
| return getExactMatchStartingFromRootArc(0, key); |
| } |
| |
| /** |
| * Returns the first exact match by traversing root arcs, starting from |
| * the arc <code>i</code>. |
| * |
| * @param i The first root arc index in {@link #rootArcs} to consider when |
| * matching. |
| */ |
| private Float getExactMatchStartingFromRootArc(int i, String key) { |
| // Get the UTF-8 bytes representation of the input key. |
| try { |
| final FST.Arc<Object> scratch = new FST.Arc<Object>(); |
| for (; i < rootArcs.length; i++) { |
| final FST.Arc<Object> rootArc = rootArcs[i]; |
| final FST.Arc<Object> arc = scratch.copyFrom(rootArc); |
| |
| // Descend into the automaton using the key as prefix. |
| if (descendWithPrefix(arc, key)) { |
| automaton.readFirstTargetArc(arc, arc); |
| if (arc.label == FST.END_LABEL) { |
| // Prefix-encoded weight. |
| return rootArc.label / (float) buckets; |
| } |
| } |
| } |
| } catch (IOException e) { |
| // Should never happen, but anyway. |
| throw new RuntimeException(e); |
| } |
| |
| return null; |
| } |
| |
| /** |
| * Lookup autocomplete suggestions to <code>key</code>. |
| * |
| * @param key The prefix to which suggestions should be sought. |
| * @param onlyMorePopular Return most popular suggestions first. This is the default |
| * behavior for this implementation. Setting it to <code>false</code> has no effect (use |
| * constant term weights to sort alphabetically only). |
| * @param num At most this number of suggestions will be returned. |
| * @return Returns the suggestions, sorted by their approximated weight first (decreasing) |
| * and then alphabetically (utf16 codepoint order). |
| */ |
| @Override |
| public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) { |
| if (key.length() == 0 || automaton == null) { |
| // Keep the result an ArrayList to keep calls monomorphic. |
| return EMPTY_RESULT; |
| } |
| |
| try { |
| if (!onlyMorePopular && rootArcs.length > 1) { |
| // We could emit a warning here (?). An optimal strategy for alphabetically sorted |
| // suggestions would be to add them with a constant weight -- this saves unnecessary |
| // traversals and sorting. |
| return lookupSortedAlphabetically(key, num); |
| } else { |
| return lookupSortedByWeight(key, num, true); |
| } |
| } catch (IOException e) { |
| // Should never happen, but anyway. |
| throw new RuntimeException(e); |
| } |
| } |
| |
| /** |
| * Lookup suggestions sorted alphabetically <b>if weights are not constant</b>. This |
| * is a workaround: in general, use constant weights for alphabetically sorted result. |
| */ |
| private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException { |
| // Greedily get num results from each weight branch. |
| List<LookupResult> res = lookupSortedByWeight(key, num, false); |
| |
| // Sort and trim. |
| Collections.sort(res, new Comparator<LookupResult>() { |
| // not till java6 @Override |
| public int compare(LookupResult o1, LookupResult o2) { |
| return o1.key.compareTo(o2.key); |
| } |
| }); |
| if (res.size() > num) { |
| res = res.subList(0, num); |
| } |
| return res; |
| } |
| |
| /** |
| * Lookup suggestions sorted by weight (descending order). |
| * |
| * @param greedy If <code>true</code>, the routine terminates immediately when <code>num</code> |
| * suggestions have been collected. If <code>false</code>, it will collect suggestions from |
| * all weight arcs (needed for {@link #lookupSortedAlphabetically}. |
| */ |
| private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean greedy) throws IOException { |
| final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num)); |
| final StringBuilder output = new StringBuilder(key); |
| final int matchLength = key.length() - 1; |
| |
| for (int i = 0; i < rootArcs.length; i++) { |
| final FST.Arc<Object> rootArc = rootArcs[i]; |
| final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc); |
| |
| // Descend into the automaton using the key as prefix. |
| if (descendWithPrefix(arc, key)) { |
| // Prefix-encoded weight. |
| final float weight = rootArc.label / (float) buckets; |
| |
| // A subgraph starting from the current node has the completions |
| // of the key prefix. The arc we're at is the last key's byte, |
| // so we will collect it too. |
| output.setLength(matchLength); |
| if (collect(res, num, weight, output, arc) && greedy) { |
| // We have enough suggestion to return immediately. Keep on looking for an |
| // exact match, if requested. |
| if (exactMatchFirst) { |
| Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key); |
| if (exactMatchWeight != null) { |
| res.add(0, new LookupResult(key, exactMatchWeight)); |
| while (res.size() > num) { |
| res.remove(res.size() - 1); |
| } |
| } |
| } |
| break; |
| } |
| } |
| } |
| return res; |
| } |
| |
| /** |
| * Descend along the path starting at <code>arc</code> and going through |
| * bytes in <code>utf8</code> argument. |
| * |
| * @param arc The starting arc. This argument is modified in-place. |
| * @param term The term to descend with. |
| * @return If <code>true</code>, <code>arc</code> will be set to the arc matching |
| * last byte of <code>utf8</code>. <code>false</code> is returned if no such |
| * prefix <code>utf8</code> exists. |
| */ |
| private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException { |
| final int max = term.length(); |
| |
| for (int i = 0; i < max; i++) { |
| if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) { |
| // No matching prefixes, return an empty result. |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| /** |
| * Recursive collect lookup results from the automaton subgraph starting at <code>arc</code>. |
| * |
| * @param num Maximum number of results needed (early termination). |
| * @param weight Weight of all results found during this collection. |
| */ |
| private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException { |
| output.append((char) arc.label); |
| |
| automaton.readFirstTargetArc(arc, arc); |
| while (true) { |
| if (arc.label == FST.END_LABEL) { |
| res.add(new LookupResult(output.toString(), weight)); |
| if (res.size() >= num) |
| return true; |
| } else { |
| int save = output.length(); |
| if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) { |
| return true; |
| } |
| output.setLength(save); |
| } |
| |
| if (arc.isLast()) { |
| break; |
| } |
| automaton.readNextArc(arc); |
| } |
| return false; |
| } |
| |
| /** |
| * Builds the final automaton from a list of entries. |
| */ |
| private FST<Object> buildAutomaton(List<Entry> entries) throws IOException { |
| if (entries.size() == 0) |
| return null; |
| |
| // Sort by utf16 (raw char value) |
| final Comparator<Entry> comp = new Comparator<Entry>() { |
| public int compare(Entry o1, Entry o2) { |
| char [] ch1 = o1.term; |
| char [] ch2 = o2.term; |
| int len1 = ch1.length; |
| int len2 = ch2.length; |
| |
| int max = Math.min(len1, len2); |
| for (int i = 0; i < max; i++) { |
| int v = ch1[i] - ch2[i]; |
| if (v != 0) return v; |
| } |
| return len1 - len2; |
| } |
| }; |
| Collections.sort(entries, comp); |
| |
| // Avoid duplicated identical entries, if possible. This is required because |
| // it breaks automaton construction otherwise. |
| int len = entries.size(); |
| int j = 0; |
| for (int i = 1; i < len; i++) { |
| if (comp.compare(entries.get(j), entries.get(i)) != 0) { |
| entries.set(++j, entries.get(i)); |
| } |
| } |
| entries = entries.subList(0, j + 1); |
| |
| // Build the automaton. |
| final Outputs<Object> outputs = NoOutputs.getSingleton(); |
| final Object empty = outputs.getNoOutput(); |
| final Builder<Object> builder = |
| new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs); |
| final IntsRef scratchIntsRef = new IntsRef(10); |
| for (Entry e : entries) { |
| final int termLength = scratchIntsRef.length = e.term.length; |
| |
| scratchIntsRef.grow(termLength); |
| final int [] ints = scratchIntsRef.ints; |
| final char [] chars = e.term; |
| for (int i = termLength; --i >= 0;) { |
| ints[i] = chars[i]; |
| } |
| builder.add(scratchIntsRef, empty); |
| } |
| return builder.finish(); |
| } |
| |
| /** |
| * Prepends the entry's weight to each entry, encoded as a single byte, so that the |
| * root automaton node fans out to all possible priorities, starting with the arc that has |
| * the highest weights. |
| */ |
| private void encodeWeightPrefix(List<Entry> entries) { |
| for (Entry e : entries) { |
| int weight = (int) e.weight; |
| assert (weight >= 0 && weight <= buckets) : |
| "Weight out of range: " + weight + " [" + buckets + "]"; |
| |
| // There should be a single empty char reserved in front for the weight. |
| e.term[0] = (char) weight; |
| } |
| } |
| |
| /** |
| * Split [min, max] range into buckets, reassigning weights. Entries' weights are |
| * remapped to [0, buckets] range (so, buckets + 1 buckets, actually). |
| */ |
| private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) { |
| float min = entries.get(0).weight; |
| float max = min; |
| for (Entry e : entries) { |
| min = Math.min(e.weight, min); |
| max = Math.max(e.weight, max); |
| } |
| |
| final float range = max - min; |
| for (Entry e : entries) { |
| e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor() |
| } |
| } |
| |
| /** |
| * Deserialization from disk. |
| */ |
| @Override |
| public synchronized boolean load(File storeDir) throws IOException { |
| File data = new File(storeDir, FILENAME); |
| if (!data.exists() || !data.canRead()) { |
| return false; |
| } |
| |
| InputStream is = new BufferedInputStream(new FileInputStream(data)); |
| try { |
| this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton()); |
| cacheRootArcs(); |
| } finally { |
| IOUtils.close(is); |
| } |
| return true; |
| } |
| |
| /** |
| * Serialization to disk. |
| */ |
| @Override |
| public synchronized boolean store(File storeDir) throws IOException { |
| if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { |
| return false; |
| } |
| |
| if (this.automaton == null) |
| return false; |
| |
| File data = new File(storeDir, FILENAME); |
| OutputStream os = new BufferedOutputStream(new FileOutputStream(data)); |
| try { |
| this.automaton.save(new OutputStreamDataOutput(os)); |
| } finally { |
| IOUtils.close(os); |
| } |
| |
| return true; |
| } |
| } |