Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 1645984)
+++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision )
@@ -17,6 +17,8 @@
* limitations under the License.
*/
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
@@ -28,10 +30,17 @@
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockPayloadAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -44,20 +53,43 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
-import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.FilteredQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.NumericRangeQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeFilter;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
import org.apache.lucene.search.join.BitDocIdSetCachingWrapperFilter;
import org.apache.lucene.search.join.BitDocIdSetFilter;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
-import org.apache.lucene.search.spans.*;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanPayloadCheckQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@@ -1891,7 +1923,7 @@
reader.close();
}
- /** If we have term vectors, we can highlight based on payloads */
+ /** We can highlight based on payloads. Since Lucene 5 this is supported both via term vectors and via MemoryIndex. */
public void testPayloadQuery() throws IOException, InvalidTokenOffsetsException {
final String text = "random words and words";//"words" at positions 1 & 4
@@ -1900,7 +1932,7 @@
writer.deleteAll();
Document doc = new Document();
- doc.add(new Field(FIELD_NAME, text, FIELD_TYPE_TV));
+ doc.add(new Field(FIELD_NAME, text, fieldType));
writer.addDocument(doc);
writer.commit();
}
@@ -1908,12 +1940,17 @@
Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "words")),
Collections.singleton("pos: 1".getBytes("UTF-8")));//just match the first "word" occurrence
IndexSearcher searcher = newSearcher(reader);
- Scorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME);
+ QueryScorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME);
+ scorer.setUsePayloads(true);
Highlighter h = new Highlighter(scorer);
TopDocs hits = searcher.search(query, null, 10);
assertEquals(1, hits.scoreDocs.length);
TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), 0, FIELD_NAME, analyzer);
+ if (random().nextBoolean()) {
+ stream.reset();
+ stream = new CachingTokenFilter(stream);//conceals detection of TokenStreamFromTermVector
+ }
String result = h.getBestFragment(stream, text);
assertEquals("random <B>words</B> and words", result);//only highlight first "word"
}
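
For reference, turning on payload-sensitive highlighting with the API exercised above would look roughly like this sketch (reader, docId, analyzer and text are assumed to already be in scope; the field name "contents" is illustrative):

Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term("contents", "words")),
    Collections.singleton("pos: 1".getBytes(StandardCharsets.UTF_8)));
QueryScorer scorer = new QueryScorer(query, reader, "contents");
scorer.setUsePayloads(true); // index payloads into the highlighter's internal MemoryIndex
Highlighter highlighter = new Highlighter(scorer);
TokenStream stream = TokenSources.getAnyTokenStream(reader, docId, "contents", analyzer);
String fragment = highlighter.getBestFragment(stream, text); // only payload-matching occurrences are wrapped in <B>
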
Index: lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstRAMDir.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstRAMDir.java (revision 1645984)
+++ lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndexAgainstRAMDir.java (revision )
@@ -68,8 +68,8 @@
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LineFileDocs;
@@ -116,7 +116,7 @@
* runs random tests, up to ITERATIONS times.
*/
public void testRandomQueries() throws Exception {
- MemoryIndex index = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex index = randomMemoryIndex();
for (int i = 0; i < ITERATIONS; i++) {
assertAgainstRAMDirectory(index);
}
@@ -148,7 +148,8 @@
Directory ramdir = new RAMDirectory();
Analyzer analyzer = randomAnalyzer();
IndexWriter writer = new IndexWriter(ramdir,
- new IndexWriterConfig(analyzer).setCodec(TestUtil.alwaysPostingsFormat(TestUtil.getDefaultPostingsFormat())));
+ new IndexWriterConfig(analyzer).setCodec(
+ TestUtil.alwaysPostingsFormat(TestUtil.getDefaultPostingsFormat())));
Document doc = new Document();
Field field1 = newTextField("foo", fooField.toString(), Field.Store.NO);
Field field2 = newTextField("term", termField.toString(), Field.Store.NO);
@@ -209,7 +210,11 @@
assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset());
assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset());
}
+
+ if (iwTerms.hasPayloads()) {
+ assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload());
- }
+ }
+ }
}
@@ -311,7 +316,7 @@
public void testDocsEnumStart() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
- MemoryIndex memory = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex memory = new MemoryIndex(random().nextBoolean(), false, random().nextInt(50) * 1024 * 1024);
memory.addField("foo", "bar", analyzer);
LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
DocsEnum disi = TestUtil.docs(random(), reader, "foo", new BytesRef("bar"), null, null, DocsEnum.FLAG_NONE);
@@ -336,11 +341,15 @@
return new ByteBlockPool.DirectAllocator();
}
}
-
+
+ private MemoryIndex randomMemoryIndex() {
+ return new MemoryIndex(random().nextBoolean(), random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ }
+
public void testDocsAndPositionsEnumStart() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
int numIters = atLeast(3);
- MemoryIndex memory = new MemoryIndex(true, random().nextInt(50) * 1024 * 1024);
+ MemoryIndex memory = new MemoryIndex(true, false, random().nextInt(50) * 1024 * 1024);
for (int i = 0; i < numIters; i++) { // check reuse
memory.addField("foo", "bar", analyzer);
LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
@@ -370,7 +379,7 @@
RegexpQuery regex = new RegexpQuery(new Term("field", "worl."));
SpanQuery wrappedquery = new SpanMultiTermQueryWrapper<>(regex);
- MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex mindex = randomMemoryIndex();
mindex.addField("field", new MockAnalyzer(random()).tokenStream("field", "hello there"));
// This throws an NPE
@@ -382,7 +391,7 @@
RegexpQuery regex = new RegexpQuery(new Term("field", "worl."));
SpanQuery wrappedquery = new SpanOrQuery(new SpanMultiTermQueryWrapper<>(regex));
- MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex mindex = randomMemoryIndex();
mindex.addField("field", new MockAnalyzer(random()).tokenStream("field", "hello there"));
// This passes though
@@ -390,7 +399,7 @@
}
public void testSameFieldAddedMultipleTimes() throws IOException {
- MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex mindex = randomMemoryIndex();
MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
mindex.addField("field", "the quick brown fox", mockAnalyzer);
mindex.addField("field", "jumps over the", mockAnalyzer);
@@ -409,8 +418,8 @@
assertTrue("posGap" + mockAnalyzer.getPositionIncrementGap("field") , mindex.search(query) > 0.0001);
}
- public void testNonExistingsField() throws IOException {
- MemoryIndex mindex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ public void testNonExistentField() throws IOException {
+ MemoryIndex mindex = randomMemoryIndex();
MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
mindex.addField("field", "the quick brown fox", mockAnalyzer);
LeafReader reader = (LeafReader) mindex.createSearcher().getIndexReader();
@@ -420,11 +429,11 @@
assertNull(reader.termPositionsEnum(new Term("not-in-index", "foo")));
assertNull(reader.terms("not-in-index"));
}
-
+
public void testDuellMemIndex() throws IOException {
LineFileDocs lineFileDocs = new LineFileDocs(random());
int numDocs = atLeast(10);
- MemoryIndex memory = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
+ MemoryIndex memory = randomMemoryIndex();
for (int i = 0; i < numDocs; i++) {
Directory dir = newDirectory();
MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
@@ -535,7 +544,7 @@
assertThat("Position test failed" + failDesc, memPos, equalTo(pos));
assertThat("Start offset test failed" + failDesc, memDocsPosEnum.startOffset(), equalTo(docsPosEnum.startOffset()));
assertThat("End offset test failed" + failDesc, memDocsPosEnum.endOffset(), equalTo(docsPosEnum.endOffset()));
- assertThat("Missing payload test failed" + failDesc, docsPosEnum.getPayload(), equalTo(null));
+ assertThat("Missing payload test failed" + failDesc, docsPosEnum.getPayload(), equalTo(docsPosEnum.getPayload()));
}
}
assertNull("Still some tokens not processed", memTermEnum.next());
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision 1645984)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (revision )
@@ -21,18 +21,24 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefArray;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;
/**
* TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
* want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
- * because you know the term vector has payloads. This TokenStream supports an efficient {@link #reset()}, so there's
+ * because you know the term vector has payloads, since the first call to incrementToken() checks whether you asked
+ * for them and, if you didn't, won't fetch them. This TokenStream supports an efficient {@link #reset()}, so there's
* no need to wrap with a caching impl.
* <p />
* The implementation will create an array of tokens indexed by token position. As long as there aren't massive jumps
@@ -47,6 +53,11 @@
//TODO add a maxStartOffset filter, which highlighters will find handy
+ //This attribute factory uses less memory when captureState() is called.
+ public static final AttributeFactory ATTRIBUTE_FACTORY =
+ AttributeFactory.getStaticImplementation(
+ AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, PackedTokenAttributeImpl.class);
+
private final Terms vector;
private final CharTermAttribute termAttribute;
@@ -56,11 +67,15 @@
private OffsetAttribute offsetAttribute;//maybe null
private PayloadAttribute payloadAttribute;//maybe null
+ private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
+ private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
private TokenLL firstToken = null; // the head of a linked-list
private TokenLL incrementToken = null;
+ private boolean initialized = false;//lazy
+
/**
* Constructor.
*
@@ -68,6 +83,8 @@
* creating the TokenStream. Must have positions and/or offsets.
*/
public TokenStreamFromTermVector(Terms vector) throws IOException {
+ super(ATTRIBUTE_FACTORY);
+ assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
}
@@ -81,20 +98,20 @@
@Override
public void reset() throws IOException {
- if (firstToken == null) {//just the first time
- init();
- }
incrementToken = null;
super.reset();
}
- //We initialize in reset() because we can see which attributes the consumer wants, particularly payloads
+ //We delay initialization so we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
+ assert !initialized;
if (vector.hasOffsets()) {
offsetAttribute = addAttribute(OffsetAttribute.class);
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
payloadAttribute = getAttribute(PayloadAttribute.class);
+ payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
+ spareBytesRefBuilder = new BytesRefBuilder();
}
// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
@@ -132,14 +149,9 @@
}
if (payloadAttribute != null) {
- // Must make a deep copy of the returned payload,
- // since D&PEnum API is allowed to re-use on every
- // call:
final BytesRef payload = dpEnum.getPayload();
- if (payload != null) {
- token.payload = BytesRef.deepCopyOf(payload);//TODO share a ByteBlockPool & re-use BytesRef
+ token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
- }
+ }
- }
//Add token to an array indexed by position
if (positionedTokens.length <= pos) {
@@ -198,6 +210,8 @@
prevTokenPos = pos;
prevToken = token;
}
+
+ initialized = true;
}
private TokenLL[] initTokensArray() throws IOException {
@@ -216,8 +230,12 @@
}
@Override
- public boolean incrementToken() {
+ public boolean incrementToken() throws IOException {
if (incrementToken == null) {
+ if (!initialized) {
+ init();
+ assert initialized;
+ }
incrementToken = firstToken;
if (incrementToken == null) {
return false;
@@ -234,8 +252,12 @@
offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
}
if (payloadAttribute != null) {
- payloadAttribute.setPayload(incrementToken.payload);
+ if (incrementToken.payloadIndex == -1) {
+ payloadAttribute.setPayload(null);
+ } else {
+ payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
- }
+ }
+ }
return true;
}
@@ -245,7 +267,7 @@
int positionIncrement;
int startOffset;
int endOffset;
- BytesRef payload;
+ int payloadIndex;
TokenLL next;
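
As the updated class javadoc notes, a consumer that wants payloads must add the attribute before the first incrementToken(); a minimal sketch, assuming vectorTerms is a term vector Terms instance with positions and/or offsets:

TokenStream ts = new TokenStreamFromTermVector(vectorTerms);
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); // ask before the first incrementToken()
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  BytesRef payload = payloadAtt.getPayload(); // null at positions that had no payload
  // ... consume termAtt and payload ...
}
ts.end();
ts.close();
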
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 1645984)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision )
@@ -83,9 +83,9 @@
private boolean cachedTokenStream;
private boolean wrapToCaching = true;
private int maxDocCharsToAnalyze;
+ private boolean usePayloads = false;
private LeafReader internalReader = null;
-
public WeightedSpanTermExtractor() {
}
@@ -384,7 +384,7 @@
// Use MemoryIndex (index/invert this tokenStream now)
if (internalReader == null) {
- final MemoryIndex indexer = new MemoryIndex(true);
+ final MemoryIndex indexer = new MemoryIndex(true, usePayloads);//offsets and payloads
if (cacheIt) {
assert !cachedTokenStream;
tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
@@ -652,8 +652,16 @@
public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
this.expandMultiTermQuery = expandMultiTermQuery;
+ }
+
+ public boolean isUsePayloads() {
+ return usePayloads;
+ }
+
+ public void setUsePayloads(boolean usePayloads) {
+ this.usePayloads = usePayloads;
}
-
+
public boolean isCachedTokenStream() {
return cachedTokenStream;
}
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 1645984)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision )
@@ -54,6 +54,7 @@
private boolean skipInitExtractor;
private boolean wrapToCaching = true;
private int maxCharsToAnalyze;
+ private boolean usePayloads = false;
/**
* @param query Query to use for highlighting
@@ -213,6 +214,7 @@
qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
qse.setExpandMultiTermQuery(expandMultiTermQuery);
qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
+ qse.setUsePayloads(usePayloads);
if (reader == null) {
this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
tokenStream, field);
@@ -258,8 +260,20 @@
*/
public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
this.expandMultiTermQuery = expandMultiTermQuery;
+ }
+
+ /**
+ * Whether or not we should capture payloads in {@link MemoryIndex} at each position so that queries can access them.
+ * This does not apply to term vector based TokenStreams, which support payloads only when the term vector has them.
+ */
+ public boolean isUsePayloads() {
+ return usePayloads;
+ }
+
+ public void setUsePayloads(boolean usePayloads) {
+ this.usePayloads = usePayloads;
}
-
+
/**
* By default, {@link TokenStream}s that are not of the type
* {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
Index: lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 1645984)
+++ lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision )
@@ -29,6 +29,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.BinaryDocValues;
@@ -60,6 +61,8 @@
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefArray;
+import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.Counter;
@@ -187,17 +190,19 @@
*/
public class MemoryIndex {
+ private static final boolean DEBUG = false;
+
/** info for each field: Map&lt;String fieldName, Info field&gt; */
private final SortedMap<String,Info> fields = new TreeMap<>();
private final boolean storeOffsets;
+ private final boolean storePayloads;
-
+
- private static final boolean DEBUG = false;
-
private final ByteBlockPool byteBlockPool;
private final IntBlockPool intBlockPool;
// private final IntBlockPool.SliceReader postingsReader;
private final IntBlockPool.SliceWriter postingsWriter;
+ private final BytesRefArray payloadsBytesRefs;//non null only when storePayloads
private Counter bytesUsed;
@@ -206,7 +211,7 @@
private Similarity normSimilarity = IndexSearcher.getDefaultSimilarity();
/**
- * Constructs an empty instance.
+ * Constructs an empty instance that will not store offsets or payloads.
*/
public MemoryIndex() {
this(false);
@@ -215,25 +220,36 @@
/**
* Constructs an empty instance that can optionally store the start and end
* character offset of each token term in the text. This can be useful for
- * highlighting of hit locations with the Lucene highlighter package.
- * Protected until the highlighter package matures, so that this can actually
- * be meaningfully integrated.
+ * highlighting of hit locations with the Lucene highlighter package. But
+ * it will not store payloads; use another constructor for that.
*
* @param storeOffsets
* whether or not to store the start and end character offset of
* each token term in the text
*/
public MemoryIndex(boolean storeOffsets) {
- this(storeOffsets, 0);
+ this(storeOffsets, false);
}
-
+
/**
+ * Constructs an empty instance with the option of storing offsets and payloads.
+ *
+ * @param storeOffsets store term offsets at each position
+ * @param storePayloads store term payloads at each position
+ */
+ public MemoryIndex(boolean storeOffsets, boolean storePayloads) {
+ this(storeOffsets, storePayloads, 0);
+ }
+
+ /**
* Expert: This constructor accepts an upper limit for the number of bytes that should be reused if this instance is {@link #reset()}.
* @param storeOffsets <code>true</code> if offsets should be stored
+ * @param storePayloads <code>true</code> if payloads should be stored
* @param maxReusedBytes the number of bytes that should remain in the internal memory pools after {@link #reset()} is called
*/
- MemoryIndex(boolean storeOffsets, long maxReusedBytes) {
+ MemoryIndex(boolean storeOffsets, boolean storePayloads, long maxReusedBytes) {
this.storeOffsets = storeOffsets;
+ this.storePayloads = storePayloads;
this.bytesUsed = Counter.newCounter();
final int maxBufferedByteBlocks = (int)((maxReusedBytes/2) / ByteBlockPool.BYTE_BLOCK_SIZE );
final int maxBufferedIntBlocks = (int) ((maxReusedBytes - (maxBufferedByteBlocks*ByteBlockPool.BYTE_BLOCK_SIZE))/(IntBlockPool.INT_BLOCK_SIZE * RamUsageEstimator.NUM_BYTES_INT));
@@ -241,6 +257,7 @@
byteBlockPool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, maxBufferedByteBlocks, bytesUsed));
intBlockPool = new IntBlockPool(new RecyclingIntBlockAllocator(IntBlockPool.INT_BLOCK_SIZE, maxBufferedIntBlocks, bytesUsed));
postingsWriter = new SliceWriter(intBlockPool);
+ payloadsBytesRefs = storePayloads ? new BytesRefArray(bytesUsed) : null;
}
/**
@@ -381,8 +398,8 @@
*
* @param fieldName
* a name to be associated with the text
- * @param stream
- * the token stream to retrieve tokens from.
+ * @param tokenStream
+ * the token stream to retrieve tokens from. It's guaranteed to be closed no matter what.
* @param boost
* the boost factor for hits for this field
* @param positionIncrementGap
@@ -391,16 +408,17 @@
* the offset gap if fields with the same name are added more than once
* @see org.apache.lucene.document.Field#setBoost(float)
*/
- public void addField(String fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap) {
- try {
+ public void addField(String fieldName, TokenStream tokenStream, float boost, int positionIncrementGap,
+ int offsetGap) {
+ try (TokenStream stream = tokenStream) {
if (frozen)
throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen");
if (fieldName == null)
throw new IllegalArgumentException("fieldName must not be null");
if (stream == null)
- throw new IllegalArgumentException("token stream must not be null");
+ throw new IllegalArgumentException("token stream must not be null");
if (boost <= 0.0f)
- throw new IllegalArgumentException("boost factor must be greater than 0.0");
+ throw new IllegalArgumentException("boost factor must be greater than 0.0");
int numTokens = 0;
int numOverlapTokens = 0;
int pos = -1;
@@ -421,8 +439,9 @@
sliceArray = info.sliceArray;
sumTotalTermFreq = info.sumTotalTermFreq;
} else {
- fieldInfo = new FieldInfo(fieldName, fields.size(), false, false, false,
- this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
+ fieldInfo = new FieldInfo(fieldName, fields.size(), false, false, this.storePayloads,
+ this.storeOffsets
+ ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
DocValuesType.NONE, -1, null);
sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
@@ -431,6 +450,7 @@
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
+ PayloadAttribute payloadAtt = storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
BytesRef ref = termAtt.getBytesRef();
stream.reset();
@@ -451,13 +471,16 @@
}
sliceArray.freq[ord]++;
sumTotalTermFreq++;
- if (!storeOffsets) {
- postingsWriter.writeInt(pos);
+ postingsWriter.writeInt(pos);
- } else {
- postingsWriter.writeInt(pos);
+ if (storeOffsets) {
postingsWriter.writeInt(offsetAtt.startOffset() + offset);
postingsWriter.writeInt(offsetAtt.endOffset() + offset);
}
+ if (storePayloads) {
+ final BytesRef payload = payloadAtt.getPayload();
+ int pIndex = payload == null ? -1 : payloadsBytesRefs.append(payload);
+ postingsWriter.writeInt(pIndex);
+ }
sliceArray.end[ord] = postingsWriter.getCurrentOffset();
}
stream.end();
@@ -466,18 +489,10 @@
if (numTokens > 0) {
fields.put(fieldName, new Info(fieldInfo, terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.endOffset() + offset, sumTotalTermFreq));
}
- } catch (Exception e) { // can never happen
+ } catch (IOException e) {
throw new RuntimeException(e);
- } finally {
- try {
- if (stream != null) {
- stream.close();
- }
+ }
- } catch (IOException e2) {
- throw new RuntimeException(e2);
- }
+ }
- }
- }
/**
* Set the Similarity to be used for calculating field norms
@@ -861,7 +876,7 @@
@Override
public boolean hasPayloads() {
- return false;
+ return storePayloads;
}
};
}
@@ -1043,17 +1058,20 @@
}
private class MemoryDocsAndPositionsEnum extends DocsAndPositionsEnum {
+ private final SliceReader sliceReader;
private int posUpto; // for assert
private boolean hasNext;
private Bits liveDocs;
private int doc = -1;
- private SliceReader sliceReader;
private int freq;
private int startOffset;
private int endOffset;
+ private int payloadIndex;
+ private final BytesRefBuilder payloadBuilder;//only non-null when storePayloads
-
+
public MemoryDocsAndPositionsEnum() {
this.sliceReader = new SliceReader(intBlockPool);
+ this.payloadBuilder = storePayloads ? new BytesRefBuilder() : null;
}
public DocsAndPositionsEnum reset(Bits liveDocs, int start, int end, int freq) {
@@ -1096,15 +1114,16 @@
public int nextPosition() {
assert posUpto++ < freq;
assert !sliceReader.endOfSlice() : " stores offsets : " + startOffset;
- if (storeOffsets) {
- int pos = sliceReader.readInt();
+ int pos = sliceReader.readInt();
+ if (storeOffsets) {
startOffset = sliceReader.readInt();
endOffset = sliceReader.readInt();
- return pos;
- } else {
- return sliceReader.readInt();
}
+ if (storePayloads) {
+ payloadIndex = sliceReader.readInt();
- }
+ }
+ return pos;
+ }
@Override
public int startOffset() {
@@ -1118,8 +1137,11 @@
@Override
public BytesRef getPayload() {
+ if (payloadBuilder == null || payloadIndex == -1) {
- return null;
- }
+ return null;
+ }
+ return payloadsBytesRefs.get(payloadBuilder, payloadIndex);
+ }
@Override
public long cost() {
@@ -1178,6 +1200,9 @@
this.normSimilarity = IndexSearcher.getDefaultSimilarity();
byteBlockPool.reset(false, false); // no need to 0-fill the buffers
intBlockPool.reset(true, false); // here must 0-fill since we use slices
+ if (payloadsBytesRefs != null) {
+ payloadsBytesRefs.clear();
+ }
this.frozen = false;
}
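
Payloads stored by MemoryIndex this way come back through the normal postings API; a rough sketch, reusing the mi instance from the earlier sketch:

LeafReader reader = (LeafReader) mi.createSearcher().getIndexReader();
DocsAndPositionsEnum dpe = reader.termPositionsEnum(new Term("field", "fox"));
if (dpe != null && dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
  for (int i = 0; i < dpe.freq(); i++) {
    dpe.nextPosition();
    BytesRef payload = dpe.getPayload(); // non-null only when storePayloads was true and a payload was indexed
  }
}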