package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.Sort;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util.MinResult;
import org.apache.lucene.util.fst.Util;
/**
 * Suggester that first analyzes the surface form, adds the
 * analyzed form to a weighted FST, and then does the same
 * thing at lookup time.  This means lookup is based on the
 * analyzed form while suggestions are still the surface
 * form(s).
 *
 * <p>
 * This can result in powerful suggester functionality.  For
 * example, if you use an analyzer that removes stop words,
 * then the partial text "ghost chr..." could see the
 * suggestion "The Ghost of Christmas Past".  If
 * SynonymFilter is used to map wifi and wireless network to
 * hotspot, then the partial text "wirele..." could suggest
 * "wifi router".  Token normalization like stemmers, accent
 * removal, etc., would allow suggestions to ignore such
 * variations.
 *
 * <p>
 * There are some limitations:
 * <ul>
 *
 *   <li> A lookup from a query like "net" in English won't
 *        be any different from "net " (i.e., the user added a
 *        trailing space), because analyzers don't reflect
 *        when they've seen a token separator and when they
 *        haven't.
 *
 *   <li> If you're using {@code StopFilter}, and the user will
 *        type "fast apple" but so far has only typed "fast a",
 *        then, because the analyzer doesn't convey whether it
 *        has seen a token separator after the "a",
 *        {@code StopFilter} will remove that "a", causing
 *        far more matches than you'd expect.
 *
 *   <li> Lookups with the empty string return no results
 *        instead of all results.
 * </ul>
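 *
 * <p>
 * A minimal usage sketch ({@code analyzer} is any
 * {@link Analyzer}, and {@code iterator} is some
 * {@code TermFreqIterator} over your weighted entries,
 * both created elsewhere):
 *
 * <pre class="prettyprint">
 *   AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer);
 *   suggester.build(iterator);
 *   List&lt;LookupResult&gt; results = suggester.lookup("ghost chr", false, 5);
 * </pre>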
 *
 * @lucene.experimental
 */
public class AnalyzingSuggester extends Lookup {

  /**
   * FST&lt;Weight,Surface&gt;:
   *  input is the analyzed form, with a null byte between terms
   *  weights are encoded as costs: (Integer.MAX_VALUE-weight)
   *  surface is the original, unanalyzed form.
   */
  private FST<Pair<Long,BytesRef>> fst = null;

  /**
   * Analyzer that will be used for analyzing suggestions at
   * index time.
   */
  private final Analyzer indexAnalyzer;

  /**
   * Analyzer that will be used for analyzing suggestions at
   * query time.
   */
  private final Analyzer queryAnalyzer;

  /**
   * True if exact match suggestions should always be returned first.
   */
  private final boolean exactFirst;

  /**
   * True if separator between tokens should be preserved.
   */
  private final boolean preserveSep;

  /** Include this flag in the options parameter to {@link
   *  #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} to always
   *  return the exact match first, regardless of score.  This
   *  has no performance impact but could result in
   *  low-quality suggestions. */
  public static final int EXACT_FIRST = 1;

  /** Include this flag in the options parameter to {@link
   *  #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} to preserve
   *  token separators when matching. */
  public static final int PRESERVE_SEP = 2;

  /** Represents the separation between tokens, if
   *  PRESERVE_SEP was specified */
  private static final int SEP_LABEL = 0xff;

  /** Marks end of the analyzed input and start of dedup
   *  byte. */
  private static final int END_BYTE = 0x0;
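
  // With PRESERVE_SEP, a complete FST input for one entry
  // therefore looks like (sketch):
  //
  //   token1 SEP_LABEL token2 SEP_LABEL ... tokenN END_BYTE dedupByte
  //
  // without PRESERVE_SEP, the token bytes are concatenated
  // directly before END_BYTE and the dedup byte.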

  /** Maximum number of dup surface forms (different surface
   *  forms for the same analyzed form). */
  private final int maxSurfaceFormsPerAnalyzedForm;

  /** Maximum graph paths to index for a single analyzed
   *  surface form.  This only matters if your analyzer
   *  makes lots of alternate paths (e.g. contains
   *  SynonymFilter). */
  private final int maxGraphExpansions;

  /** Highest number of analyzed paths we saw for any single
   *  input surface form.  For analyzers that never create
   *  graphs this will always be 1. */
  private int maxAnalyzedPathsForOneInput;

  /**
   * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
   * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
   * PRESERVE_SEP, 256, -1)}
   */
  public AnalyzingSuggester(Analyzer analyzer) {
    this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1);
  }

  /**
   * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
   * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |
   * PRESERVE_SEP, 256, -1)}
   */
  public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1);
  }

  /**
   * Creates a new suggester.
   *
   * @param indexAnalyzer Analyzer that will be used for
   *   analyzing suggestions while building the index.
   * @param queryAnalyzer Analyzer that will be used for
   *   analyzing query text during lookup
   * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
   * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
   *   surface forms to keep for a single analyzed form.
   *   When there are too many surface forms we discard the
   *   lowest weighted ones.
   * @param maxGraphExpansions Maximum number of graph paths
   *   to expand from the analyzed form.  Set this to -1 for
   *   no limit.
   */
  public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions) {
    this.indexAnalyzer = indexAnalyzer;
    this.queryAnalyzer = queryAnalyzer;
    if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
      throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
    }
    this.exactFirst = (options & EXACT_FIRST) != 0;
    this.preserveSep = (options & PRESERVE_SEP) != 0;

    // NOTE: this is just an implementation limitation; if
    // somehow this is a problem we could fix it by using
    // more than one byte to disambiguate ... but 256 seems
    // like it should be way more than enough.
    if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
      throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and <= 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
    }
    this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;

    if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
      throw new IllegalArgumentException("maxGraphExpansions must be -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
    }
    this.maxGraphExpansions = maxGraphExpansions;
  }

  /** Returns byte size of the underlying FST. */
  public long sizeInBytes() {
    return fst == null ? 0 : fst.sizeInBytes();
  }

  private void copyDestTransitions(State from, State to, List<Transition> transitions) {
    if (to.isAccept()) {
      from.setAccept(true);
    }
    for(Transition t : to.getTransitions()) {
      transitions.add(t);
    }
  }

  // Replaces SEP transitions with epsilon, or remaps them to
  // SEP_LABEL if we were asked to preserve separators:
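  //
  // For example, without preserveSep the analyzed forms of
  // "foo bar" and "foobar" collapse to the same byte path
  // (foobar); with preserveSep a SEP_LABEL byte is kept
  // between the tokens, so the two stay distinct.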
  private void replaceSep(Automaton a) {

    State[] states = a.getNumberedStates();

    // Go in reverse topo sort so we know we only have to
    // make one pass:
    for(int stateNumber=states.length-1;stateNumber>=0;stateNumber--) {
      final State state = states[stateNumber];
      List<Transition> newTransitions = new ArrayList<Transition>();
      for(Transition t : state.getTransitions()) {
        assert t.getMin() == t.getMax();
        if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
          if (preserveSep) {
            // Remap to SEP_LABEL:
            newTransitions.add(new Transition(SEP_LABEL, t.getDest()));
          } else {
            copyDestTransitions(state, t.getDest(), newTransitions);
            a.setDeterministic(false);
          }
        } else if (t.getMin() == TokenStreamToAutomaton.HOLE) {

          // Just remove the hole: there will then be two
          // SEP tokens next to each other, which will only
          // match another hole at search time.  Note that
          // it will also match an empty-string token ... if
          // that's somehow a problem we can always map HOLE
          // to a dedicated byte (and escape it in the
          // input).
          copyDestTransitions(state, t.getDest(), newTransitions);
          a.setDeterministic(false);
        } else {
          newTransitions.add(t);
        }
      }
      state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()]));
    }
  }

  /** Just escapes the 0xff byte (which we steal for SEP). */
  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {

    final BytesRef spare = new BytesRef();

    @Override
    protected BytesRef changeToken(BytesRef in) {
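      // Escape by doubling: e.g. token bytes
      // [0x61, 0xff, 0x62] become [0x61, 0xff, 0xff, 0x62],
      // so a literal 0xff inside a token cannot be confused
      // with SEP_LABEL: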
      int upto = 0;
      for(int i=0;i<in.length;i++) {
        byte b = in.bytes[in.offset+i];
        if (b == (byte) 0xff) {
          // Make sure there is room for both the escape byte
          // and the byte itself (growing only when the buffer
          // is exactly full could leave just one free slot):
          if (spare.bytes.length < upto+2) {
            spare.grow(upto+2);
          }
          spare.bytes[upto++] = (byte) 0xff;
          spare.bytes[upto++] = b;
        } else {
          if (spare.bytes.length < upto+1) {
            spare.grow(upto+1);
          }
          spare.bytes[upto++] = b;
        }
      }
      spare.offset = 0;
      spare.length = upto;
      return spare;
    }
  }

  TokenStreamToAutomaton getTokenStreamToAutomaton() {
    if (preserveSep) {
      return new EscapingTokenStreamToAutomaton();
    } else {
      // When we're not preserving sep, we don't steal the
      // 0xff byte, so we don't need to do any escaping:
      return new TokenStreamToAutomaton();
    }
  }

  @Override
  public void build(TermFreqIterator iterator) throws IOException {
    String prefix = getClass().getSimpleName();
    File directory = Sort.defaultTempDir();
    File tempInput = File.createTempFile(prefix, ".input", directory);
    File tempSorted = File.createTempFile(prefix, ".sorted", directory);

    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
    Sort.ByteSequencesReader reader = null;
    BytesRef scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();

    // analyzed sequence + 2 separator bytes + weight (int) + surface + analyzedLength (short)
    boolean success = false;
    byte buffer[] = new byte[8];
    try {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef surfaceForm;
      while ((surfaceForm = iterator.next()) != null) {
        Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);

        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());

        for (IntsRef path : paths) {

          Util.toBytesRef(path, scratch);

          // length of the analyzed text (FST input)
          short analyzedLength = (short) scratch.length;
          // compute the required length:
          // analyzed sequence + 2 (separator) + 4 (weight) + surface + 2 (analyzedLength)
          int requiredLength = analyzedLength + 2 + 4 + surfaceForm.length + 2;
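
          // Each record written below therefore has the
          // layout (sketch):
          //
          //   [analyzed bytes][0x00][0x00][weight: 4 bytes, big-endian][surface bytes][analyzedLength: 2 bytes]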

          buffer = ArrayUtil.grow(buffer, requiredLength);

          output.reset(buffer);
          output.writeBytes(scratch.bytes, scratch.offset, scratch.length);
          output.writeByte((byte)0); // separator: not used, just for sort order
          output.writeByte((byte)0); // separator: not used, just for sort order

          // NOTE: important that writeInt is big-endian,
          // because this means we sort secondarily by
          // cost ascending (= weight descending) so that
          // when we discard too many surface forms for a
          // single analyzed form we are discarding the
          // lowest-weight ones:
          output.writeInt(encodeWeight(iterator.weight()));

          output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
          output.writeShort(analyzedLength);
          writer.write(buffer, 0, output.getPosition());
        }
      }
      writer.close();

      // Sort all input/output pairs (required by FST.Builder):
      new Sort().sort(tempInput, tempSorted);
      reader = new Sort.ByteSequencesReader(tempSorted);

      PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
      Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);

      // Build FST:
      BytesRef previous = null;
      BytesRef analyzed = new BytesRef();
      BytesRef surface = new BytesRef();
      IntsRef scratchInts = new IntsRef();
      ByteArrayDataInput input = new ByteArrayDataInput();

      int dedup = 0;
      while (reader.read(scratch)) {
        input.reset(scratch.bytes, scratch.offset, scratch.length);
        input.setPosition(input.length()-2);
        short analyzedLength = input.readShort();

        analyzed.bytes = scratch.bytes;
        analyzed.offset = scratch.offset;
        analyzed.length = analyzedLength;
        input.setPosition(analyzedLength + 2); // analyzed sequence + the two separator bytes
        long cost = input.readInt();

        surface.bytes = scratch.bytes;
        surface.offset = input.getPosition();
        surface.length = input.length() - input.getPosition() - 2;

        if (previous == null) {
          previous = new BytesRef();
          previous.copyBytes(analyzed);
        } else if (analyzed.equals(previous)) {
          dedup++;
          if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
            // More than maxSurfaceFormsPerAnalyzedForm
            // dups: skip the rest:
            continue;
          }
        } else {
          dedup = 0;
          previous.copyBytes(analyzed);
        }

        analyzed.grow(analyzed.length+2);

        // TODO: I think we can avoid the extra 2 bytes when
        // there is no dup (dedup==0), but we'd have to fix
        // the exactFirst logic ... which would be sort of
        // hairy because we'd need to special case the two
        // (dup/not dup)...

        // NOTE: must be byte 0 so we sort before whatever
        // is next
        analyzed.bytes[analyzed.length] = 0;
        analyzed.bytes[analyzed.length+1] = (byte) dedup;
        analyzed.length += 2;
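
        // E.g. the first surface form for analyzed form
        // "foo" gets FST input foo 0x00 0x00, the second
        // foo 0x00 0x01, and so on, keeping every FST
        // input unique.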

        Util.toIntsRef(analyzed, scratchInts);
        //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
        builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
      }
      fst = builder.finish();

      //Util.dotToFile(fst, "/tmp/suggest.dot");

      success = true;
    } finally {
      if (success) {
        IOUtils.close(reader, writer);
      } else {
        IOUtils.closeWhileHandlingException(reader, writer);
      }

      tempInput.delete();
      tempSorted.delete();
    }
  }

  @Override
  public boolean store(OutputStream output) throws IOException {
    DataOutput dataOut = new OutputStreamDataOutput(output);
    try {
      fst.save(dataOut);
      dataOut.writeVInt(maxAnalyzedPathsForOneInput);
    } finally {
      IOUtils.close(output);
    }
    return true;
  }

  @Override
  public boolean load(InputStream input) throws IOException {
    DataInput dataIn = new InputStreamDataInput(input);
    try {
      this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
      maxAnalyzedPathsForOneInput = dataIn.readVInt();
    } finally {
      IOUtils.close(input);
    }
    return true;
  }

  @Override
  public List<LookupResult> lookup(final CharSequence key, boolean onlyMorePopular, int num) {
    assert num > 0;

    if (onlyMorePopular) {
      throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }

    //System.out.println("lookup key=" + key + " num=" + num);
    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRef spare = new CharsRef();

      //System.out.println(" now intersect exactFirst=" + exactFirst);

      // Intersect automaton w/ suggest wFST and get all
      // prefix starting nodes & their outputs:
      //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

      //System.out.println(" prefixPaths: " + prefixPaths.size());

      BytesReader bytesReader = fst.getBytesReader(0);

      FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>();

      final List<LookupResult> results = new ArrayList<LookupResult>();

      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);

      if (exactFirst) {

        int count = 0;
        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
            // This node has END_BYTE arc leaving, meaning it's an
            // "exact" match:
            count++;
          }
        }

        // Searcher just to find the single exact match, if
        // present:
        Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
        searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);

        // NOTE: we could almost get away with only using
        // the first start node.  The only catch is if
        // maxSurfaceFormsPerAnalyzedForm had kicked in and
        // pruned our exact match from one of these nodes
        // ...:
        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
            // This node has END_BYTE arc leaving, meaning it's an
            // "exact" match:
            searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
          }
        }

        MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();

        // NOTE: this is rather inefficient: we enumerate
        // every matching "exactly the same analyzed form"
        // path, and then do a linear scan to see if one of
        // these exactly matches the input.  It should be
        // possible (though hairy) to do something similar
        // to getByOutput, since the surface form is encoded
        // into the FST output, so we could more efficiently
        // hone in on the exact surface-form match.  Still, I
        // suspect very little time is spent in this linear
        // search: it's bounded by how many prefix start
        // nodes we have and the
        // maxSurfaceFormsPerAnalyzedForm:
        for(MinResult<Pair<Long,BytesRef>> completion : completions) {
          if (utf8Key.bytesEquals(completion.output.output2)) {
            spare.grow(completion.output.output2.length);
            UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
            results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
            break;
          }
        }

        if (results.size() == num) {
          // That was quick:
          return results;
        }
      }

      Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
      searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
                                                            num - results.size(),
                                                            num * maxAnalyzedPathsForOneInput,
                                                            weightComparator) {
        private final Set<BytesRef> seen = new HashSet<BytesRef>();

        @Override
        protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {

          // Dedup: when the input analyzes to a graph we
          // can get duplicate surface forms:
          if (seen.contains(output.output2)) {
            return false;
          }
          seen.add(output.output2);

          if (!exactFirst) {
            return true;
          } else {
            // In exactFirst mode, don't accept any paths
            // matching the surface form since that will
            // create duplicate results:
            if (utf8Key.bytesEquals(output.output2)) {
              // We found exact match, which means we should
              // have already found it in the first search:
              assert results.size() == 1;
              return false;
            } else {
              return true;
            }
          }
        }
      };

      prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

      for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
        searcher.addStartPaths(path.fstNode, path.output, true, path.input);
      }

      MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();

      for(MinResult<Pair<Long,BytesRef>> completion : completions) {
        spare.grow(completion.output.output2.length);
        UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
        LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));

        // TODO: for fuzzy case would be nice to return
        // how many edits were required

        //System.out.println(" result=" + result);
        results.add(result);

        if (results.size() == num) {
          // In the exactFirst=true case the search may
          // produce one extra path
          break;
        }
      }

      return results;
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
  }

  /** Returns all prefix paths to initialize the search. */
  protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                       Automaton lookupAutomaton,
                                                                       FST<Pair<Long,BytesRef>> fst)
    throws IOException {
    return prefixPaths;
  }

  final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
    // Analyze surface form:
    TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));

    // Create corresponding automaton: labels are bytes
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    Automaton automaton = ts2a.toAutomaton(ts);
    ts.end();
    ts.close();

    replaceSep(automaton);

    assert SpecialOperations.isFinite(automaton);

    // Get all paths from the automaton (there can be
    // more than one path, eg if the analyzer created a
    // graph using SynFilter or WDF):
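    //
    // For example, if a SynonymFilter maps "wifi" to both
    // "wifi" and "wireless network", then "wifi router"
    // yields two finite strings (paths) here.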

    // TODO: we could walk & add simultaneously, so we
    // don't have to alloc [possibly biggish]
    // intermediate HashSet in RAM:
    return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
  }

  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
    Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
    ts.end();
    ts.close();

    // TODO: we could use the end offset to "guess"
    // whether the final token was a partial token; this
    // would only be a heuristic ... but maybe an OK one.
    // This way we could eg differentiate "net" from "net ",
    // which we can't today...

    replaceSep(automaton);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert
    BasicOperations.determinize(automaton);
    return automaton;
  }

  /**
   * Returns the weight associated with an input string, or
   * null if it does not exist.  This suggester does not
   * support this operation: it always throws
   * {@link UnsupportedOperationException}.
   */
  public Object get(CharSequence key) {
    throw new UnsupportedOperationException();
  }

  /** cost -> weight */
  private static int decodeWeight(long encoded) {
    return (int)(Integer.MAX_VALUE - encoded);
  }

  /** weight -> cost */
  private static int encodeWeight(long value) {
    if (value < 0 || value > Integer.MAX_VALUE) {
      throw new UnsupportedOperationException("cannot encode value: " + value);
    }
    return Integer.MAX_VALUE - (int)value;
  }
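
  // E.g. weight 3 encodes to cost Integer.MAX_VALUE - 3 =
  // 2147483644 and weight 10 to 2147483637, so ordering by
  // cost ascending visits higher weights first.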

  static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>>() {
    @Override
    public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) {
      return left.output1.compareTo(right.output1);
    }
  };
}