Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
<+>package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// TODO
//   - test w/ syns
//   - add pruning of low-freq ngrams?

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;

//import java.io.PrintWriter;

/**
 * Builds an ngram model from the text sent to {@link
 * #build} and predicts based on the last grams-1 tokens in
 * the request sent to {@link #lookup}.  This tries to
 * handle the "long tail" of suggestions for when the
 * incoming query is a never-before-seen query string.
 *
 * <p>Likely this suggester would only be used as a
 * fallback, when the primary suggester fails to find
 * any suggestions.
 *
 * <p>Note that the weight for each suggestion is unused,
 * and the suggestions are the analyzed forms (so your
 * analysis process should normally be very "light").
 *
 * <p>This uses the stupid backoff language model to smooth
 * scores across ngram models; see
 * "Large language models in machine translation",
 * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.76.1126
 * for details.
 *
 * <p>From {@link #lookup}, the key of each result is the
 * ngram token; the value is Long.MAX_VALUE * score (fixed
 * point, cast to long).  Divide by Long.MAX_VALUE to get
 * the score back, which ranges from 0.0 to 1.0.
 *
 * onlyMorePopular is unused.
 *
 * @lucene.experimental
 */
public class FreeTextSuggester extends Lookup {

  /** Codec name used in the header for the saved model. */
  public final static String CODEC_NAME = "freetextsuggest";

  /** Initial version of the saved model file format. */
  public final static int VERSION_START = 0;

  /** Current version of the saved model file format. */
  public final static int VERSION_CURRENT = VERSION_START;

  /** By default we use a bigram model. */
  public static final int DEFAULT_GRAMS = 2;

  // In general this could vary with gram, but the
  // original paper seems to use this constant:
  /** The constant used for backoff smoothing; during
   * lookup, this means that if a given trigram did not
   * occur, and we backoff to the bigram, the overall score
   * will be 0.4 times what the bigram model would have
   * assigned. */
  public final static double ALPHA = 0.4;

  /** Holds 1gram, 2gram, 3gram models as a single FST. */
  private FST<Long> fst;

  /**
   * Analyzer that will be used for analyzing suggestions at
   * index time.
   */
  private final Analyzer indexAnalyzer;

  private long totTokens;

  /**
   * Analyzer that will be used for analyzing suggestions at
   * query time.
   */
  private final Analyzer queryAnalyzer;

  // 2 = bigram, 3 = trigram
  private final int grams;

  private final byte separator;

  /** Number of entries the lookup was built with */
  private long count = 0;

  /** The default character used to join multiple tokens
   * into a single ngram token.  The input tokens produced
   * by the analyzer must not contain this character. */
  public static final byte DEFAULT_SEPARATOR = 0x1e;

  /** Instantiate, using the provided analyzer for both
   * indexing and lookup, using a bigram model by default. */
  public FreeTextSuggester(Analyzer analyzer) {
    this(analyzer, analyzer, DEFAULT_GRAMS);
  }

  /** Instantiate, using the provided indexing and lookup
   * analyzers, using a bigram model by default. */
  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(indexAnalyzer, queryAnalyzer, DEFAULT_GRAMS);
  }

  /** Instantiate, using the provided indexing and lookup
   * analyzers, with the specified model (2
   * = bigram, 3 = trigram, etc.). */
  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams) {
    this(indexAnalyzer, queryAnalyzer, grams, DEFAULT_SEPARATOR);
  }

  /** Instantiate, using the provided indexing and lookup
   * analyzers, and the specified model (2 = bigram, 3 =
   * trigram, etc.).  The separator is passed to {@link
   * ShingleFilter#setTokenSeparator} to join multiple
   * tokens into a single ngram token; it must be an ASCII
   * (7-bit-clean) byte.  No input tokens should have this
   * byte, otherwise {@code IllegalArgumentException} is
   * thrown. */
  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams, byte separator) {
    this.grams = grams;
    this.indexAnalyzer = addShingles(indexAnalyzer);
    this.queryAnalyzer = addShingles(queryAnalyzer);
    if (grams < 1) {
      throw new IllegalArgumentException("grams must be >= 1");
    }
    if ((separator & 0x80) != 0) {
      throw new IllegalArgumentException("separator must be simple ascii character");
    }
    this.separator = separator;
  }

  /** Returns byte size of the underlying FST. */
  @Override
  public long ramBytesUsed() {
    if (fst == null) {
      return 0;
    }
    return fst.ramBytesUsed();
  }

  private static class AnalyzingComparator implements Comparator<BytesRef> {

    private final ByteArrayDataInput readerA = new ByteArrayDataInput();
    private final ByteArrayDataInput readerB = new ByteArrayDataInput();
    private final BytesRef scratchA = new BytesRef();
    private final BytesRef scratchB = new BytesRef();

    @Override
    public int compare(BytesRef a, BytesRef b) {
      readerA.reset(a.bytes, a.offset, a.length);
      readerB.reset(b.bytes, b.offset, b.length);

      // By token:
      scratchA.length = readerA.readShort();
      scratchA.bytes = a.bytes;
      scratchA.offset = readerA.getPosition();

      scratchB.bytes = b.bytes;
      scratchB.length = readerB.readShort();
      scratchB.offset = readerB.getPosition();

      int cmp = scratchA.compareTo(scratchB);
      if (cmp != 0) {
        return cmp;
      }
      readerA.skipBytes(scratchA.length);
      readerB.skipBytes(scratchB.length);

      // By length (smaller surface forms sorted first):
      cmp = a.length - b.length;
      if (cmp != 0) {
        return cmp;
      }

      // By surface form:
      scratchA.offset = readerA.getPosition();
      scratchA.length = a.length - scratchA.offset;
      scratchB.offset = readerB.getPosition();
      scratchB.length = b.length - scratchB.offset;

      return scratchA.compareTo(scratchB);
    }
  }

  private Analyzer addShingles(final Analyzer other) {
    if (grams == 1) {
      return other;
    } else {
      // TODO: use ShingleAnalyzerWrapper?
      // Tack on ShingleFilter to the end, to generate token ngrams:
      return new AnalyzerWrapper(other.getReuseStrategy()) {
        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
          return other;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
          ShingleFilter shingles = new ShingleFilter(components.getTokenStream(), 2, grams);
          shingles.setTokenSeparator(Character.toString((char) separator));
          return new TokenStreamComponents(components.getTokenizer(), shingles);
        }
      };
    }
  }

  @Override
  public void build(InputIterator iterator) throws IOException {
    build(iterator, IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
  }

  /** Build the suggest index, using up to the specified
   * amount of temporary RAM while building.  Note that
   * the weights for the suggestions are ignored. */
  public void build(InputIterator iterator, double ramBufferSizeMB) throws IOException {
    if (iterator.hasPayloads()) {
      throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    String prefix = getClass().getSimpleName();
    File directory = OfflineSorter.defaultTempDir();
    // TODO: messy ... java7 has Files.createTempDirectory
    // ... but 4.x is java6:
    File tempIndexPath = null;
    Random random = new Random();
    while (true) {
      tempIndexPath = new File(directory, prefix + ".index." + random.nextInt(Integer.MAX_VALUE));
      if (tempIndexPath.mkdir()) {
        break;
      }
    }

    Directory dir = FSDirectory.open(tempIndexPath);

    IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(ramBufferSizeMB);
    IndexWriter writer = new IndexWriter(dir, iwc);

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: if only we had IndexOptions.TERMS_ONLY...
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    ft.setOmitNorms(true);
    ft.freeze();

    Document doc = new Document();
    Field field = new Field("body", "", ft);
    doc.add(field);

    totTokens = 0;
    IndexReader reader = null;

    boolean success = false;
    count = 0;
    try {
      while (true) {
        BytesRef surfaceForm = iterator.next();
        if (surfaceForm == null) {
          break;
        }
        field.setStringValue(surfaceForm.utf8ToString());
        writer.addDocument(doc);
        count++;
      }
      reader = DirectoryReader.open(writer, false);

      Terms terms = MultiFields.getTerms(reader, "body");
      if (terms == null) {
        throw new IllegalArgumentException("need at least one suggestion");
      }

      // Move all ngrams into an FST:
      TermsEnum termsEnum = terms.iterator(null);

      Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
      Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

      IntsRef scratchInts = new IntsRef();
      while (true) {
        BytesRef term = termsEnum.next();
        if (term == null) {
          break;
        }
        int ngramCount = countGrams(term);
        if (ngramCount > grams) {
          throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
        }
        if (ngramCount == 1) {
          totTokens += termsEnum.totalTermFreq();
        }

        builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
      }

      fst = builder.finish();
      if (fst == null) {
        throw new IllegalArgumentException("need at least one suggestion");
      }
      //System.out.println("FST: " + fst.getNodeCount() + " nodes");

      /*
      PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
      Util.toDot(fst, pw, true, true);
      pw.close();
      */

      // Writer was only temporary, to count up bigrams,
      // which we transferred to the FST, so now we
      // rollback:
      writer.rollback();
      success = true;
    } finally {
      try {
        if (success) {
          IOUtils.close(reader);
        } else {
          IOUtils.closeWhileHandlingException(reader, writer);
        }
      } finally {
        for(String file : dir.listAll()) {
          File path = new File(tempIndexPath, file);
          if (path.delete() == false) {
            throw new IllegalStateException("failed to remove " + path);
          }
        }

        if (tempIndexPath.delete() == false) {
          throw new IllegalStateException("failed to remove " + tempIndexPath);
        }

        dir.close();
      }
    }
  }

  @Override
  public boolean store(DataOutput output) throws IOException {
    CodecUtil.writeHeader(output, CODEC_NAME, VERSION_CURRENT);
    output.writeVLong(count);
    output.writeByte(separator);
    output.writeVInt(grams);
    output.writeVLong(totTokens);
    fst.save(output);
    return true;
  }

  @Override
  public boolean load(DataInput input) throws IOException {
    CodecUtil.checkHeader(input, CODEC_NAME, VERSION_START, VERSION_START);
    count = input.readVLong();
    byte separatorOrig = input.readByte();
    if (separatorOrig != separator) {
      throw new IllegalStateException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig);
    }
    int gramsOrig = input.readVInt();
    if (gramsOrig != grams) {
      throw new IllegalStateException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig);
    }
    totTokens = input.readVLong();

    fst = new FST<>(input, PositiveIntOutputs.getSingleton());

    return true;
  }

  @Override
  public List<LookupResult> lookup(final CharSequence key, /* ignored */ boolean onlyMorePopular, int num) {
    return lookup(key, null, onlyMorePopular, num);
  }

  /** Lookup, without any context. */
  public List<LookupResult> lookup(final CharSequence key, int num) {
    return lookup(key, null, true, num);
  }

  @Override
  public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, /* ignored */ boolean onlyMorePopular, int num) {
    try {
      return lookup(key, contexts, num);
    } catch (IOException ioe) {
      // bogus:
      throw new RuntimeException(ioe);
    }
  }

  @Override
  public long getCount() {
    return count;
  }

  private int countGrams(BytesRef token) {
    int count = 1;
    for(int i=0;i<token.length;i++) {
      if (token.bytes[token.offset + i] == separator) {
        count++;
      }
    }

    return count;
  }

  /** Retrieve suggestions. */
  public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num) throws IOException {
    if (contexts != null) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
      TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();

      BytesRef[] lastTokens = new BytesRef[grams];
      //System.out.println("lookup: key='" + key + "'");

      // Run full analysis, but save only the
      // last 1gram, last 2gram, etc.:
      BytesRef tokenBytes = termBytesAtt.getBytesRef();
      int maxEndOffset = -1;
      boolean sawRealToken = false;
      while(ts.incrementToken()) {
        termBytesAtt.fillBytesRef();
        sawRealToken |= tokenBytes.length > 0;
        // TODO: this is somewhat iffy; today, ShingleFilter
        // sets posLen to the gram count; maybe we should make
        // a separate dedicated att for this?
        int gramCount = posLenAtt.getPositionLength();

        assert gramCount <= grams;

        // Safety: make sure the recalculated count "agrees":
        if (countGrams(tokenBytes) != gramCount) {
          throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
        }
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
        lastTokens[gramCount-1] = BytesRef.deepCopyOf(tokenBytes);
      }
      ts.end();

      if (!sawRealToken) {
        throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
      }

      // Carefully fill last tokens with _ tokens;
      // ShingleFilter apparently won't emit "only hole"
      // tokens:
      int endPosInc = posIncAtt.getPositionIncrement();

      // Note this will also be true if input is the empty
      // string (in which case we saw no tokens and
      // maxEndOffset is still -1), which in fact works out OK
      // because we fill the unigram with an empty BytesRef
      // below:
      boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
      //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

      if (lastTokenEnded) {
        //System.out.println("  lastTokenEnded");
        // If user hit space after the last token, then
        // "upgrade" all tokens.  This way "foo " will suggest
        // all bigrams starting w/ foo, and not any unigrams
        // starting with "foo":
        for(int i=grams-1;i>0;i--) {
          BytesRef token = lastTokens[i-1];
          if (token == null) {
            continue;
          }
          token.grow(token.length+1);
          token.bytes[token.length] = separator;
          token.length++;
          lastTokens[i] = token;
        }
        lastTokens[0] = new BytesRef();
      }

      Arc<Long> arc = new Arc<>();

      BytesReader bytesReader = fst.getBytesReader();

      // Try highest order models first, and if they return
      // results, return that; else, fallback:
      double backoff = 1.0;

      List<LookupResult> results = new ArrayList<>(num);

      // We only add a given suffix once, from the highest
      // order model that saw it; for subsequent lower order
      // models we skip it:
      final Set<BytesRef> seen = new HashSet<>();

      for(int gram=grams-1;gram>=0;gram--) {
        BytesRef token = lastTokens[gram];
        // Don't make unigram predictions from empty string:
        if (token == null || (token.length == 0 && key.length() > 0)) {
          // Input didn't have enough tokens:
          //System.out.println("  gram=" + gram + ": skip: not enough input");
          continue;
        }

        if (endPosInc > 0 && gram <= endPosInc) {
          // Skip hole-only predictions; in theory we
          // shouldn't have to do this, but we'd need to fix
          // ShingleFilter to produce only-hole tokens:
          //System.out.println("  break: only holes now");
          break;
        }

        //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

        // TODO: we could add fuzziness here
        // match the prefix portion exactly
        //Pair<Long,BytesRef> prefixOutput = null;
        Long prefixOutput = null;
        try {
          prefixOutput = lookupPrefix(fst, bytesReader, token, arc);
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
        //System.out.println("  prefixOutput=" + prefixOutput);

        if (prefixOutput == null) {
          // This model never saw this prefix, e.g. the
          // trigram model never saw context "purple mushroom"
          backoff *= ALPHA;
          continue;
        }

        // TODO: we could do this division at build time, and
        // bake it into the FST?

        // Denominator for computing scores from current
        // model's predictions:
        long contextCount = totTokens;

        BytesRef lastTokenFragment = null;

        for(int i=token.length-1;i>=0;i--) {
          if (token.bytes[token.offset+i] == separator) {
            BytesRef context = new BytesRef(token.bytes, token.offset, i);
            Long output = Util.get(fst, Util.toIntsRef(context, new IntsRef()));
            assert output != null;
            contextCount = decodeWeight(output);
            lastTokenFragment = new BytesRef(token.bytes, token.offset + i + 1, token.length - i - 1);
            break;
          }
        }

        final BytesRef finalLastToken;

        if (lastTokenFragment == null) {
          finalLastToken = BytesRef.deepCopyOf(token);
        } else {
          finalLastToken = BytesRef.deepCopyOf(lastTokenFragment);
        }
        assert finalLastToken.offset == 0;

        CharsRef spare = new CharsRef();

        // complete top-N
        TopResults<Long> completions = null;
        try {

          // Because we store multiple models in one FST
          // (1gram, 2gram, 3gram), we must restrict the
          // search so that it only considers the current
          // model.  For highest order model, this is not
          // necessary since all completions in the FST
          // must be from this model, but for lower order
          // models we have to filter out the higher order
          // ones:

          // Must do num+seen.size() for queue depth because we may
          // reject up to seen.size() paths in acceptResult():
          Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num+seen.size(), weightComparator) {

            BytesRef scratchBytes = new BytesRef();

            @Override
            protected void addIfCompetitive(Util.FSTPath<Long> path) {
              if (path.arc.label != separator) {
                //System.out.println("    keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                super.addIfCompetitive(path);
              } else {
                //System.out.println("    prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
              }
            }

            @Override
            protected boolean acceptResult(IntsRef input, Long output) {
              Util.toBytesRef(input, scratchBytes);
              finalLastToken.grow(finalLastToken.length + scratchBytes.length);
              int lenSav = finalLastToken.length;
              finalLastToken.append(scratchBytes);
              //System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
              boolean ret = seen.contains(finalLastToken) == false;

              finalLastToken.length = lenSav;
              return ret;
            }
          };

          // since this search is initialized with a single start node
          // it is okay to start with an empty input path here
          searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());

          completions = searcher.search();
          assert completions.isComplete;
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }

        int prefixLength = token.length;

        BytesRef suffix = new BytesRef(8);
        //System.out.println("    " + completions.length + " completions");

        nextCompletion:
        for (Result<Long> completion : completions) {
          token.length = prefixLength;
          // append suffix
          Util.toBytesRef(completion.input, suffix);
          token.append(suffix);

          //System.out.println("    completion " + token.utf8ToString());

          // Skip this path if a higher-order model already
          // saw/predicted its last token:
          BytesRef lastToken = token;
          for(int i=token.length-1;i>=0;i--) {
            if (token.bytes[token.offset+i] == separator) {
              assert token.length-i-1 > 0;
              lastToken = new BytesRef(token.bytes, token.offset+i+1, token.length-i-1);
              break;
            }
          }
          if (seen.contains(lastToken)) {
            //System.out.println("      skip dup " + lastToken.utf8ToString());
            continue nextCompletion;
          }
          seen.add(BytesRef.deepCopyOf(lastToken));
          spare.grow(token.length);
          UnicodeUtil.UTF8toUTF16(token, spare);
          LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
          results.add(result);
          assert results.size() == seen.size();
          //System.out.println("  add result=" + result);
        }
        backoff *= ALPHA;
      }

      Collections.sort(results, new Comparator<LookupResult>() {
        @Override
        public int compare(LookupResult a, LookupResult b) {
          if (a.value > b.value) {
            return -1;
          } else if (a.value < b.value) {
            return 1;
          } else {
            // Tie break by UTF16 sort order:
            return ((String) a.key).compareTo((String) b.key);
          }
        }
      });

      if (results.size() > num) {
        results.subList(num, results.size()).clear();
      }

      return results;
    }
  }

  /** weight -> cost */
  private long encodeWeight(long ngramCount) {
    return Long.MAX_VALUE - ngramCount;
  }

  /** cost -> weight */
  //private long decodeWeight(Pair<Long,BytesRef> output) {
  private long decodeWeight(Long output) {
    assert output != null;
    return (int)(Long.MAX_VALUE - output);
  }

  // NOTE: copied from WFSTCompletionLookup & tweaked
  private Long lookupPrefix(FST<Long> fst, FST.BytesReader bytesReader,
                            BytesRef scratch, Arc<Long> arc) throws /*Bogus*/IOException {

    Long output = fst.outputs.getNoOutput();

    fst.getFirstArc(arc);

    byte[] bytes = scratch.bytes;
    int pos = scratch.offset;
    int end = pos + scratch.length;
    while (pos < end) {
      if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) {
        return null;
      } else {
        output = fst.outputs.add(output, arc.output);
      }
    }

    return output;
  }

  static final Comparator<Long> weightComparator = new Comparator<Long>() {
    @Override
    public int compare(Long left, Long right) {
      return left.compareTo(right);
    }
  };

  /**
   * Returns the weight associated with an input string,
   * or null if it does not exist.
   */
  public Object get(CharSequence key) {
    throw new UnsupportedOperationException();
  }
}
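As a reading aid for the class Javadoc above (not part of the patch itself): a minimal sketch of how a caller recovers the 0.0..1.0 stupid-backoff score from the fixed-point LookupResult.value. The method name and the already-built suggester parameter are illustrative assumptions, not code from this file.

  // Illustrative sketch; assumes "suggester" was already built via build():
  static void printSuggestions(FreeTextSuggester suggester) {
    for (Lookup.LookupResult result : suggester.lookup("foo b", 10)) {
      // value is (long) (Long.MAX_VALUE * score), where score already
      // includes one factor of ALPHA per backoff to a lower-order model:
      double score = (double) result.value / Long.MAX_VALUE;
      System.out.println(result.key + " score=" + score);
    }
  }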
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java (revision 1617256)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java (revision )
@@ -68,6 +68,7 @@
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashSet;
@@ -293,17 +294,7 @@
     }
 
     String prefix = getClass().getSimpleName();
-    File directory = OfflineSorter.defaultTempDir();
-    // TODO: messy ... java7 has Files.createTempDirectory
-    // ... but 4.x is java6:
-    File tempIndexPath = null;
-    Random random = new Random();
-    while (true) {
-      tempIndexPath = new File(directory, prefix + ".index." + random.nextInt(Integer.MAX_VALUE));
-      if (tempIndexPath.mkdir()) {
-        break;
-      }
-    }
+    File tempIndexPath = Files.createTempDirectory(prefix + ".index.").toFile();
 
     Directory dir = FSDirectory.open(tempIndexPath);
 
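For context on the change above, a minimal sketch of the java.nio.file API the patch adopts (the helper name is illustrative, not from the patch): Files.createTempDirectory creates a fresh, uniquely named directory under the default temp location (java.io.tmpdir) and throws IOException on failure, so the retry-on-mkdir() loop is no longer needed. It does not delete the directory automatically; build() still removes the files and the directory in its finally block.

  import java.io.File;
  import java.io.IOException;
  import java.nio.file.Files;

  class TempDirSketch {
    // Illustrative helper: one call replaces the removed while/mkdir loop.
    static File createTempIndexDir(String prefix) throws IOException {
      // Produces e.g. <java.io.tmpdir>/FreeTextSuggester.index.<unique suffix>,
      // with the unique suffix chosen by the JDK:
      return Files.createTempDirectory(prefix + ".index.").toFile();
    }
  }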