Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
Subsystem: com.intellij.openapi.diff.impl.patch.BaseRevisionTextPatchEP
<+>package org.apache.lucene.search.suggest.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// TODO
//   - test w/ syns
//   - add pruning of low-freq ngrams?

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;

//import java.io.PrintWriter;

/**
 * Builds an ngram model from the text sent to {@link
 * #build} and predicts based on the last grams-1 tokens in
 * the request sent to {@link #lookup}.  This tries to
 * handle the "long tail" of suggestions for when the
 * incoming query is a never-before-seen query string.
 *
 * <p>Likely this suggester would only be used as a
 * fallback, when the primary suggester fails to find
 * any suggestions.
 *
 * <p>Note that the weight for each suggestion is unused,
 * and the suggestions are the analyzed forms (so your
 * analysis process should normally be very "light").
 *
 * <p>This uses the stupid backoff language model to smooth
 * scores across ngram models; see
 * "Large language models in machine translation",
 * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.76.1126
 * for details.
 *
 * <p>From {@link #lookup}, the key of each result is the
 * ngram token; the value is Long.MAX_VALUE * score (fixed
 * point, cast to long).  Divide by Long.MAX_VALUE to get
 * the score back, which ranges from 0.0 to 1.0.
 *
 * onlyMorePopular is unused.
 *
 * @lucene.experimental
 */
public class FreeTextSuggester extends Lookup {

  /** Codec name used in the header for the saved model. */
  public final static String CODEC_NAME = "freetextsuggest";

  /** Initial version of the saved model file format. */
  public final static int VERSION_START = 0;

  /** Current version of the saved model file format. */
  public final static int VERSION_CURRENT = VERSION_START;

  /** By default we use a bigram model. */
  public static final int DEFAULT_GRAMS = 2;

  // In general this could vary with gram, but the
  // original paper seems to use this constant:
  /** The constant used for backoff smoothing; during
   * lookup, this means that if a given trigram did not
   * occur, and we backoff to the bigram, the overall score
   * will be 0.4 times what the bigram model would have
   * assigned. */
  public final static double ALPHA = 0.4;

  /** Holds 1gram, 2gram, 3gram models as a single FST. */
  private FST<Long> fst;

  /**
   * Analyzer that will be used for analyzing suggestions at
   * index time.
   */
  private final Analyzer indexAnalyzer;

  private long totTokens;

  /**
   * Analyzer that will be used for analyzing suggestions at
   * query time.
   */
  private final Analyzer queryAnalyzer;

  // 2 = bigram, 3 = trigram
  private final int grams;

  private final byte separator;

  /** Number of entries the lookup was built with */
  private long count = 0;

  /** The default character used to join multiple tokens
   * into a single ngram token.  The input tokens produced
   * by the analyzer must not contain this character. */
  public static final byte DEFAULT_SEPARATOR = 0x1e;

  /** Instantiate, using the provided analyzer for both
   * indexing and lookup, using a bigram model by default. */
  public FreeTextSuggester(Analyzer analyzer) {
    this(analyzer, analyzer, DEFAULT_GRAMS);
  }

  /** Instantiate, using the provided indexing and lookup
   * analyzers, using a bigram model by default. */
  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(indexAnalyzer, queryAnalyzer, DEFAULT_GRAMS);
  }

  /** Instantiate, using the provided indexing and lookup
   * analyzers, with the specified model (2
   * = bigram, 3 = trigram, etc.). */
  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams) {
    this(indexAnalyzer, queryAnalyzer, grams, DEFAULT_SEPARATOR);
  }

  /** Instantiate, using the provided indexing and lookup
   * analyzers, and the specified model (2 = bigram, 3 =
   * trigram, etc.).  The separator is passed to {@link
   * ShingleFilter#setTokenSeparator} to join multiple
   * tokens into a single ngram token; it must be an ASCII
   * (7-bit-clean) byte.  No input tokens should have this
   * byte, otherwise {@code IllegalArgumentException} is
   * thrown. */
  public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams, byte separator) {
    this.grams = grams;
    this.indexAnalyzer = addShingles(indexAnalyzer);
    this.queryAnalyzer = addShingles(queryAnalyzer);
    if (grams < 1) {
      throw new IllegalArgumentException("grams must be >= 1");
    }
    if ((separator & 0x80) != 0) {
      throw new IllegalArgumentException("separator must be simple ascii character");
    }
    this.separator = separator;
  }

  /** Returns byte size of the underlying FST. */
  @Override
  public long ramBytesUsed() {
    if (fst == null) {
      return 0;
    }
    return fst.ramBytesUsed();
  }

  private static class AnalyzingComparator implements Comparator<BytesRef> {

    private final ByteArrayDataInput readerA = new ByteArrayDataInput();
    private final ByteArrayDataInput readerB = new ByteArrayDataInput();
    private final BytesRef scratchA = new BytesRef();
    private final BytesRef scratchB = new BytesRef();

    @Override
    public int compare(BytesRef a, BytesRef b) {
      readerA.reset(a.bytes, a.offset, a.length);
      readerB.reset(b.bytes, b.offset, b.length);

      // By token:
      scratchA.length = readerA.readShort();
      scratchA.bytes = a.bytes;
      scratchA.offset = readerA.getPosition();

      scratchB.bytes = b.bytes;
      scratchB.length = readerB.readShort();
      scratchB.offset = readerB.getPosition();

      int cmp = scratchA.compareTo(scratchB);
      if (cmp != 0) {
        return cmp;
      }
      readerA.skipBytes(scratchA.length);
      readerB.skipBytes(scratchB.length);

      // By length (smaller surface forms sorted first):
      cmp = a.length - b.length;
      if (cmp != 0) {
        return cmp;
      }

      // By surface form:
      scratchA.offset = readerA.getPosition();
      scratchA.length = a.length - scratchA.offset;
      scratchB.offset = readerB.getPosition();
      scratchB.length = b.length - scratchB.offset;

      return scratchA.compareTo(scratchB);
    }
  }

  private Analyzer addShingles(final Analyzer other) {
    if (grams == 1) {
      return other;
    } else {
      // TODO: use ShingleAnalyzerWrapper?
      // Tack on ShingleFilter to the end, to generate token ngrams:
      return new AnalyzerWrapper(other.getReuseStrategy()) {
        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
          return other;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
          ShingleFilter shingles = new ShingleFilter(components.getTokenStream(), 2, grams);
          shingles.setTokenSeparator(Character.toString((char) separator));
          return new TokenStreamComponents(components.getTokenizer(), shingles);
        }
      };
    }
  }

  @Override
  public void build(InputIterator iterator) throws IOException {
    build(iterator, IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
  }

  /** Build the suggest index, using up to the specified
   * amount of temporary RAM while building.  Note that
   * the weights for the suggestions are ignored. */
  public void build(InputIterator iterator, double ramBufferSizeMB) throws IOException {
    if (iterator.hasPayloads()) {
      throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    String prefix = getClass().getSimpleName();
    File directory = OfflineSorter.defaultTempDir();
    // TODO: messy ... java7 has Files.createTempDirectory
    // ... but 4.x is java6:
    File tempIndexPath = null;
    Random random = new Random();
    while (true) {
      tempIndexPath = new File(directory, prefix + ".index." + random.nextInt(Integer.MAX_VALUE));
      if (tempIndexPath.mkdir()) {
        break;
      }
    }

    Directory dir = FSDirectory.open(tempIndexPath);

    IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(ramBufferSizeMB);
    IndexWriter writer = new IndexWriter(dir, iwc);

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // TODO: if only we had IndexOptions.TERMS_ONLY...
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    ft.setOmitNorms(true);
    ft.freeze();

    Document doc = new Document();
    Field field = new Field("body", "", ft);
    doc.add(field);

    totTokens = 0;
    IndexReader reader = null;

    boolean success = false;
    count = 0;
    try {
      while (true) {
        BytesRef surfaceForm = iterator.next();
        if (surfaceForm == null) {
          break;
        }
        field.setStringValue(surfaceForm.utf8ToString());
        writer.addDocument(doc);
        count++;
      }
      reader = DirectoryReader.open(writer, false);

      Terms terms = MultiFields.getTerms(reader, "body");
      if (terms == null) {
        throw new IllegalArgumentException("need at least one suggestion");
      }

      // Move all ngrams into an FST:
      TermsEnum termsEnum = terms.iterator(null);

      Outputs<Long> outputs = PositiveIntOutputs.getSingleton();
      Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

      IntsRef scratchInts = new IntsRef();
      while (true) {
        BytesRef term = termsEnum.next();
        if (term == null) {
          break;
        }
        int ngramCount = countGrams(term);
        if (ngramCount > grams) {
          throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
        }
        if (ngramCount == 1) {
          totTokens += termsEnum.totalTermFreq();
        }

        builder.add(Util.toIntsRef(term, scratchInts), encodeWeight(termsEnum.totalTermFreq()));
      }

      fst = builder.finish();
      if (fst == null) {
        throw new IllegalArgumentException("need at least one suggestion");
      }
      //System.out.println("FST: " + fst.getNodeCount() + " nodes");

      /*
      PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
      Util.toDot(fst, pw, true, true);
      pw.close();
      */

      // Writer was only temporary, to count up bigrams,
      // which we transferred to the FST, so now we
      // rollback:
      writer.rollback();
      success = true;
    } finally {
      try {
        if (success) {
          IOUtils.close(reader);
        } else {
          IOUtils.closeWhileHandlingException(reader, writer);
        }
      } finally {
        for(String file : dir.listAll()) {
          File path = new File(tempIndexPath, file);
          if (path.delete() == false) {
            throw new IllegalStateException("failed to remove " + path);
          }
        }

        if (tempIndexPath.delete() == false) {
          throw new IllegalStateException("failed to remove " + tempIndexPath);
        }

        dir.close();
      }
    }
  }

  @Override
  public boolean store(DataOutput output) throws IOException {
    CodecUtil.writeHeader(output, CODEC_NAME, VERSION_CURRENT);
    output.writeVLong(count);
    output.writeByte(separator);
    output.writeVInt(grams);
    output.writeVLong(totTokens);
    fst.save(output);
    return true;
  }

  @Override
  public boolean load(DataInput input) throws IOException {
    CodecUtil.checkHeader(input, CODEC_NAME, VERSION_START, VERSION_START);
    count = input.readVLong();
    byte separatorOrig = input.readByte();
    if (separatorOrig != separator) {
      throw new IllegalStateException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig);
    }
    int gramsOrig = input.readVInt();
    if (gramsOrig != grams) {
      throw new IllegalStateException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig);
    }
    totTokens = input.readVLong();

    fst = new FST<>(input, PositiveIntOutputs.getSingleton());

    return true;
  }

  @Override
  public List<LookupResult> lookup(final CharSequence key, /* ignored */ boolean onlyMorePopular, int num) {
    return lookup(key, null, onlyMorePopular, num);
  }

  /** Lookup, without any context. */
  public List<LookupResult> lookup(final CharSequence key, int num) {
    return lookup(key, null, true, num);
  }

  @Override
  public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, /* ignored */ boolean onlyMorePopular, int num) {
    try {
      return lookup(key, contexts, num);
    } catch (IOException ioe) {
      // bogus:
      throw new RuntimeException(ioe);
    }
  }

  @Override
  public long getCount() {
    return count;
  }

  private int countGrams(BytesRef token) {
    int count = 1;
    for(int i=0;i<token.length;i++) {
      if (token.bytes[token.offset + i] == separator) {
        count++;
      }
    }

    return count;
  }

  /** Retrieve suggestions. */
  public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num) throws IOException {
    if (contexts != null) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
      TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();

      BytesRef[] lastTokens = new BytesRef[grams];
      //System.out.println("lookup: key='" + key + "'");

      // Run full analysis, but save only the
      // last 1gram, last 2gram, etc.:
      BytesRef tokenBytes = termBytesAtt.getBytesRef();
      int maxEndOffset = -1;
      boolean sawRealToken = false;
      while(ts.incrementToken()) {
        termBytesAtt.fillBytesRef();
        sawRealToken |= tokenBytes.length > 0;
        // TODO: this is somewhat iffy; today, ShingleFilter
        // sets posLen to the gram count; maybe we should make
        // a separate dedicated att for this?
        int gramCount = posLenAtt.getPositionLength();

        assert gramCount <= grams;

        // Safety: make sure the recalculated count "agrees":
        if (countGrams(tokenBytes) != gramCount) {
          throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
        }
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
        lastTokens[gramCount-1] = BytesRef.deepCopyOf(tokenBytes);
      }
      ts.end();

      if (!sawRealToken) {
        throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
      }

      // Carefully fill last tokens with _ tokens;
      // ShingleFilter apparently won't emit "only hole"
      // tokens:
      int endPosInc = posIncAtt.getPositionIncrement();

      // Note this will also be true if input is the empty
      // string (in which case we saw no tokens and
      // maxEndOffset is still -1), which in fact works out OK
      // because we fill the unigram with an empty BytesRef
      // below:
      boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
      //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

      if (lastTokenEnded) {
        //System.out.println("  lastTokenEnded");
        // If user hit space after the last token, then
        // "upgrade" all tokens.  This way "foo " will suggest
        // all bigrams starting w/ foo, and not any unigrams
        // starting with "foo":
        for(int i=grams-1;i>0;i--) {
          BytesRef token = lastTokens[i-1];
          if (token == null) {
            continue;
          }
          token.grow(token.length+1);
          token.bytes[token.length] = separator;
          token.length++;
          lastTokens[i] = token;
        }
        lastTokens[0] = new BytesRef();
      }

      Arc<Long> arc = new Arc<>();

      BytesReader bytesReader = fst.getBytesReader();

      // Try highest order models first, and if they return
      // results, return that; else, fallback:
      double backoff = 1.0;

      List<LookupResult> results = new ArrayList<>(num);

      // We only add a given suffix once, from the highest
      // order model that saw it; for subsequent lower order
      // models we skip it:
      final Set<BytesRef> seen = new HashSet<>();

      for(int gram=grams-1;gram>=0;gram--) {
        BytesRef token = lastTokens[gram];
        // Don't make unigram predictions from empty string:
        if (token == null || (token.length == 0 && key.length() > 0)) {
          // Input didn't have enough tokens:
          //System.out.println("  gram=" + gram + ": skip: not enough input");
          continue;
        }

        if (endPosInc > 0 && gram <= endPosInc) {
          // Skip hole-only predictions; in theory we
          // shouldn't have to do this, but we'd need to fix
          // ShingleFilter to produce only-hole tokens:
          //System.out.println("  break: only holes now");
          break;
        }

        //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

        // TODO: we could add fuzziness here
        // match the prefix portion exactly
        //Pair<Long,BytesRef> prefixOutput = null;
        Long prefixOutput = null;
        try {
          prefixOutput = lookupPrefix(fst, bytesReader, token, arc);
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
        //System.out.println("  prefixOutput=" + prefixOutput);

        if (prefixOutput == null) {
          // This model never saw this prefix, e.g. the
          // trigram model never saw context "purple mushroom"
          backoff *= ALPHA;
          continue;
        }

        // TODO: we could do this division at build time, and
        // bake it into the FST?

        // Denominator for computing scores from current
        // model's predictions:
        long contextCount = totTokens;

        BytesRef lastTokenFragment = null;

        for(int i=token.length-1;i>=0;i--) {
          if (token.bytes[token.offset+i] == separator) {
            BytesRef context = new BytesRef(token.bytes, token.offset, i);
            Long output = Util.get(fst, Util.toIntsRef(context, new IntsRef()));
            assert output != null;
            contextCount = decodeWeight(output);
            lastTokenFragment = new BytesRef(token.bytes, token.offset + i + 1, token.length - i - 1);
            break;
          }
        }

        final BytesRef finalLastToken;

        if (lastTokenFragment == null) {
          finalLastToken = BytesRef.deepCopyOf(token);
        } else {
          finalLastToken = BytesRef.deepCopyOf(lastTokenFragment);
        }
        assert finalLastToken.offset == 0;

        CharsRef spare = new CharsRef();

        // complete top-N
        TopResults<Long> completions = null;
        try {

          // Because we store multiple models in one FST
          // (1gram, 2gram, 3gram), we must restrict the
          // search so that it only considers the current
          // model.  For highest order model, this is not
          // necessary since all completions in the FST
          // must be from this model, but for lower order
          // models we have to filter out the higher order
          // ones:

          // Must do num+seen.size() for queue depth because we may
          // reject up to seen.size() paths in acceptResult():
          Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num+seen.size(), weightComparator) {

            BytesRef scratchBytes = new BytesRef();

            @Override
            protected void addIfCompetitive(Util.FSTPath<Long> path) {
              if (path.arc.label != separator) {
                //System.out.println("    keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                super.addIfCompetitive(path);
              } else {
                //System.out.println("    prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
              }
            }

            @Override
            protected boolean acceptResult(IntsRef input, Long output) {
              Util.toBytesRef(input, scratchBytes);
              finalLastToken.grow(finalLastToken.length + scratchBytes.length);
              int lenSav = finalLastToken.length;
              finalLastToken.append(scratchBytes);
              //System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
              boolean ret = seen.contains(finalLastToken) == false;

              finalLastToken.length = lenSav;
              return ret;
            }
          };

          // since this search is initialized with a single start node
          // it is okay to start with an empty input path here
          searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());

          completions = searcher.search();
          assert completions.isComplete;
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }

        int prefixLength = token.length;

        BytesRef suffix = new BytesRef(8);
        //System.out.println("    " + completions.length + " completions");

        nextCompletion:
        for (Result<Long> completion : completions) {
          token.length = prefixLength;
          // append suffix
          Util.toBytesRef(completion.input, suffix);
          token.append(suffix);

          //System.out.println("    completion " + token.utf8ToString());

          // Skip this path if a higher-order model already
          // saw/predicted its last token:
          BytesRef lastToken = token;
          for(int i=token.length-1;i>=0;i--) {
            if (token.bytes[token.offset+i] == separator) {
              assert token.length-i-1 > 0;
              lastToken = new BytesRef(token.bytes, token.offset+i+1, token.length-i-1);
              break;
            }
          }
          if (seen.contains(lastToken)) {
            //System.out.println("      skip dup " + lastToken.utf8ToString());
            continue nextCompletion;
          }
          seen.add(BytesRef.deepCopyOf(lastToken));
          spare.grow(token.length);
          UnicodeUtil.UTF8toUTF16(token, spare);
          LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
          results.add(result);
          assert results.size() == seen.size();
          //System.out.println("  add result=" + result);
        }
        backoff *= ALPHA;
      }

      Collections.sort(results, new Comparator<LookupResult>() {
        @Override
        public int compare(LookupResult a, LookupResult b) {
          if (a.value > b.value) {
            return -1;
          } else if (a.value < b.value) {
            return 1;
          } else {
            // Tie break by UTF16 sort order:
            return ((String) a.key).compareTo((String) b.key);
          }
        }
      });

      if (results.size() > num) {
        results.subList(num, results.size()).clear();
      }

      return results;
    }
  }

  /** weight -> cost */
  private long encodeWeight(long ngramCount) {
    return Long.MAX_VALUE - ngramCount;
  }

  /** cost -> weight */
  //private long decodeWeight(Pair<Long,BytesRef> output) {
  private long decodeWeight(Long output) {
    assert output != null;
    return (int)(Long.MAX_VALUE - output);
  }

  // NOTE: copied from WFSTCompletionLookup & tweaked
  private Long lookupPrefix(FST<Long> fst, FST.BytesReader bytesReader,
                            BytesRef scratch, Arc<Long> arc) throws /*Bogus*/IOException {

    Long output = fst.outputs.getNoOutput();

    fst.getFirstArc(arc);

    byte[] bytes = scratch.bytes;
    int pos = scratch.offset;
    int end = pos + scratch.length;
    while (pos < end) {
      if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) {
        return null;
      } else {
        output = fst.outputs.add(output, arc.output);
      }
    }

    return output;
  }

  static final Comparator<Long> weightComparator = new Comparator<Long>() {
    @Override
    public int compare(Long left, Long right) {
      return left.compareTo(right);
    }
  };

  /**
   * Returns the weight associated with an input string,
   * or null if it does not exist.
   */
  public Object get(CharSequence key) {
    throw new UnsupportedOperationException();
  }
}
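As a reading aid for the class Javadoc above (not part of the patch itself): a minimal sketch of how a caller recovers the 0.0..1.0 stupid-backoff score from the fixed-point LookupResult.value. The method name and the already-built suggester parameter are illustrative assumptions, not code from this file.

  // Illustrative sketch; assumes "suggester" was already built via build():
  static void printSuggestions(FreeTextSuggester suggester) {
    for (Lookup.LookupResult result : suggester.lookup("foo b", 10)) {
      // value is (long) (Long.MAX_VALUE * score), where score already
      // includes one factor of ALPHA per backoff to a lower-order model:
      double score = (double) result.value / Long.MAX_VALUE;
      System.out.println(result.key + " score=" + score);
    }
  }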
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java (revision 1617256)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java (revision )
@@ -68,6 +68,7 @@
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashSet;
@@ -293,17 +294,7 @@
     }
 
     String prefix = getClass().getSimpleName();
-    File directory = OfflineSorter.defaultTempDir();
-    // TODO: messy ... java7 has Files.createTempDirectory
-    // ... but 4.x is java6:
-    File tempIndexPath = null;
-    Random random = new Random();
-    while (true) {
-      tempIndexPath = new File(directory, prefix + ".index." + random.nextInt(Integer.MAX_VALUE));
-      if (tempIndexPath.mkdir()) {
-        break;
-      }
-    }
+    File tempIndexPath = Files.createTempDirectory(prefix + ".index.").toFile();
 
     Directory dir = FSDirectory.open(tempIndexPath);
 
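For context on the change above, a minimal sketch of the java.nio.file API the patch adopts (the helper name is illustrative, not from the patch): Files.createTempDirectory creates a fresh, uniquely named directory under the default temp location (java.io.tmpdir) and throws IOException on failure, so the retry-on-mkdir() loop is no longer needed. It does not delete the directory automatically; build() still removes the files and the directory in its finally block.

  import java.io.File;
  import java.io.IOException;
  import java.nio.file.Files;

  class TempDirSketch {
    // Illustrative helper: one call replaces the removed while/mkdir loop.
    static File createTempIndexDir(String prefix) throws IOException {
      // Produces e.g. <java.io.tmpdir>/FreeTextSuggester.index.<unique suffix>,
      // with the unique suffix chosen by the JDK:
      return Files.createTempDirectory(prefix + ".index.").toFile();
    }
  }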